diff --git a/diag_manager/diag_data.F90 b/diag_manager/diag_data.F90
index 306c1d453f..3c38664b2a 100644
--- a/diag_manager/diag_data.F90
+++ b/diag_manager/diag_data.F90
@@ -335,7 +335,9 @@ MODULE diag_data_mod
     class(*), allocatable         :: att_value(:) !< Value of the attribute
     character(len=:), allocatable :: att_name     !< Name of the attribute
     contains
+#ifndef __NVCOMPILER
       procedure :: add => fms_add_attribute
+#endif
       procedure :: write_metadata
   end type fmsDiagAttribute_type
 ! Include variable "version" to be written to log file.
@@ -558,6 +560,7 @@ function get_base_second() &
     res = base_second
   end function get_base_second
 
+#ifndef __NVCOMPILER
   !> @brief Adds an attribute to the attribute type
   subroutine fms_add_attribute(this, att_name, att_value)
     class(fmsDiagAttribute_type), intent(inout) :: this         !< Diag attribute type
@@ -589,6 +592,7 @@ subroutine fms_add_attribute(this, att_name, att_value)
       end select
     end select
   end subroutine fms_add_attribute
+#endif
 
   !> @brief gets the type of a variable
   !> @return the type of the variable (r4,r8,i4,i8,string)
diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc
index de08b89e56..696075dd74 100644
--- a/mpp/include/group_update_pack.inc
+++ b/mpp/include/group_update_pack.inc
@@ -17,64 +17,95 @@
 !***********************************************************************
 
 if( group%k_loop_inside ) then
-!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) &
+! nvfortran + cray pointers imposes some restrictions on the loops below:
+!   * the compiler cannot privatise OpenMP cray pointers in offloaded loops. Hence, inner loops
+!     must be ported rather than the whole outer loop.
+!   * the more verbose form of openmp offload loops must be used. Would prefer "target teams loop".
+!   * default(shared) must be used otherwise loops hang or segfault. Would prefer "default(none)".
+#ifndef __NVCOMPILER_OPENMP_GPU
+!$OMP parallel do default(shared) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) &
 !$OMP                          private(buffer_pos,pos,m,is,ie,js,je,rotation, &
-!$OMP                                  ptr_field, ptr_fieldx, ptr_fieldy,n,k)
+!$OMP                                  ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx)
+#endif
   do n = 1, npack
      buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos
      pos  = buffer_pos
-     is = group%pack_is(n); ie = group%pack_ie(n)
-     js = group%pack_js(n); je = group%pack_je(n)
+     is = group%pack_is(n); ie = group%pack_ie(n); ni = ie-is+1
+     js = group%pack_js(n); je = group%pack_je(n); nj = je-js+1
      rotation = group%pack_rotation(n)
      if( group%pack_type(n) == FIELD_S ) then
         select case( rotation )
         case(ZERO)
            do l=1, group%nscalar  ! loop over number of fields
               ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+              !$omp   default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) &
+              !$omp   map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni))
+#endif
               do k = 1, ksize
                  do j = js, je
                     do i = is, ie
-                       pos = pos + 1
-                       buffer(pos) = field(i,j,k)
+                       idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
+                       buffer(idx) = field(i,j,k)
                     end do
                  end do
               enddo
+              pos = pos + ksize*nj*ni
            enddo
         case( MINUS_NINETY )
            do l=1,group%nscalar  ! loop over number of fields
               ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+              !$omp   default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) &
+              !$omp   map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni))
+#endif
               do k = 1, ksize
                  do i = is, ie
                     do j = je, js, -1
-                       pos = pos + 1
-                       buffer(pos) = field(i,j,k)
+                       idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1
+                       buffer(idx) = field(i,j,k)
                     end do
                  end do
               end do
+              pos = pos + ksize*nj*ni
            end do
         case( NINETY )
            do l=1,group%nscalar  ! loop over number of fields
               ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+              !$omp   default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) &
+              !$omp   map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni))
+#endif
               do k = 1, ksize
                  do i = ie, is, -1
                     do j = js, je
-                       pos = pos + 1
-                       buffer(pos) = field(i,j,k)
+                       idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1
+                       buffer(idx) = field(i,j,k)
                     end do
                  end do
               end do
+              pos = pos + ksize*nj*ni
            end do
         case( ONE_HUNDRED_EIGHTY )
            do l=1,group%nscalar  ! loop over number of fields
               ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+              !$omp   default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) &
+              !$omp   map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni))
+#endif
               do k = 1, ksize
                  do j = je, js, -1
                     do i = ie, is, -1
-                       pos = pos + 1
-                       buffer(pos) = field(i,j,k)
+                       idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1
+                       buffer(idx) = field(i,j,k)
                     end do
                  end do
               end do
+              pos = pos + ksize*nj*ni
            end do
         end select
      else if( group%pack_type(n) == FIELD_X  ) then
@@ -82,77 +113,115 @@ if( group%k_loop_inside ) then
         case(ZERO)
            do l=1, nvector  ! loop over number of fields
               ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+              !$omp   default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) &
+              !$omp   map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni))
+#endif
               do k = 1, ksize
                  do j = js, je
                     do i = is, ie
-                       pos = pos + 1
-                       buffer(pos) = fieldx(i,j,k)
+                       idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
+                       buffer(idx) = fieldx(i,j,k)
                     end do
                  end do
               end do
+              pos = pos + ksize*nj*ni
            end do
         case( MINUS_NINETY )
            if( BTEST(group%flags_v,SCALAR_BIT) ) then
               do l=1,nvector  ! loop over number of fields
                  ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                 !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+                 !$omp   default(shared) private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) &
+                 !$omp   map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni))
+#endif
                  do k = 1, ksize
                     do i = is, ie
                        do j = je, js, -1
-                          pos = pos + 1
-                          buffer(pos) = fieldy(i,j,k)
+                        idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1
+                        buffer(idx) = fieldy(i,j,k)
                        end do
                     end do
                  end do
+                 pos = pos + ksize*nj*ni
               end do
            else
               do l=1,nvector  ! loop over number of fields
                  ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                 !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+                 !$omp   default(shared) private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) &
+                 !$omp   map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni))
+#endif
                  do k = 1, ksize
                     do i = is, ie
                        do j = je, js, -1
-                          pos = pos + 1
-                          buffer(pos) = -fieldy(i,j,k)
+                          idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1
+                          buffer(idx) = -fieldy(i,j,k)
                        end do
                     end do
                  end do
+                 pos = pos + ksize*nj*ni
               end do
            end if
         case( NINETY )
            do l=1, nvector  ! loop over number of fields
               ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+              !$omp   default(shared) private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) &
+              !$omp   map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni))
+#endif
               do k = 1, ksize
                  do i = ie, is, -1
                     do j = js, je
-                       pos = pos + 1
-                       buffer(pos) = fieldy(i,j,k)
+                     !   pos = pos + 1
+                     !   buffer(pos) = fieldy(i,j,k)
+                       idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1
+                       buffer(idx) = fieldy(i,j,k)
                     end do
                  end do
               end do
+              pos = pos + ksize*nj*ni
            end do
         case( ONE_HUNDRED_EIGHTY )
            if( BTEST(group%flags_v,SCALAR_BIT) ) then
               do l=1,nvector  ! loop over number of fields
                  ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                 !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+                 !$omp   default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) &
+                 !$omp   map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni))
+#endif
                  do k = 1, ksize
                     do j = je, js, -1
                        do i = ie, is, -1
-                          pos = pos + 1
-                          buffer(pos) =  fieldx(i,j,k)
+                          idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1
+                          buffer(idx) = fieldx(i,j,k)
                        end do
                     end do
                  end do
+                 pos = pos + ksize*nj*ni
               end do
            else
               do l=1,nvector  ! loop over number of fields
                  ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                 !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+                 !$omp   default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) &
+                 !$omp   map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni))
+#endif
                  do k = 1, ksize
                     do j = je, js, -1
                        do i = ie, is, -1
-                          pos = pos + 1
-                          buffer(pos) =  -fieldx(i,j,k)
+                          idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1
+                          buffer(idx) = -fieldx(i,j,k)
                        end do
                     end do
                  end do
+                 pos = pos + ksize*nj*ni
               end do
            end if
         end select ! select case( rotation(n) )
@@ -161,135 +230,197 @@ if( group%k_loop_inside ) then
         case(ZERO)
            do l=1, nvector  ! loop over number of fields
               ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+              !$omp   default(shared) private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) &
+              !$omp   map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni))
+#endif
               do k = 1, ksize
                  do j = js, je
                     do i = is, ie
-                       pos = pos + 1
-                       buffer(pos) = fieldy(i,j,k)
+                       idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
+                       buffer(idx) = fieldy(i,j,k)
                     end do
                  end do
               end do
+              pos = pos + ksize*nj*ni
            end do
         case( MINUS_NINETY )
            do l=1,nvector  ! loop over number of fields
               ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+              !$omp   default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) &
+              !$omp   map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni))
+#endif
               do k = 1, ksize
                  do i = is, ie
                     do j = je, js, -1
-                       pos = pos + 1
-                       buffer(pos) = fieldx(i,j,k)
+                     idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1
+                     buffer(idx) = fieldx(i,j,k)
                     end do
                  end do
               end do
+              pos = pos + ksize*nj*ni
            end do
         case( NINETY )
            if( BTEST(group%flags_v,SCALAR_BIT) ) then
               do l=1, nvector  ! loop over number of fields
                  ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                 !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+                 !$omp   default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) &
+                 !$omp   map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni))
+#endif
                  do k = 1, ksize
                     do i = ie, is, -1
                        do j = js, je
-                          pos = pos + 1
-                          buffer(pos) = fieldx(i,j,k)
+                          idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1
+                          buffer(idx) = fieldx(i,j,k)
                        end do
                     end do
                  end do
+                 pos = pos + ksize*nj*ni
               end do
            else
               do l=1,nvector  ! loop over number of fields
                  ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                 !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+                 !$omp   default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) &
+                 !$omp   map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni))
+#endif
                  do k = 1, ksize
                     do i = ie, is, -1
                        do j = js, je
-                          pos = pos + 1
-                          buffer(pos) = -fieldx(i,j,k)
+                          idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1
+                          buffer(idx) = -fieldx(i,j,k)
                        end do
                     end do
                  end do
+                 pos = pos + ksize*nj*ni
               end do
            end if
         case( ONE_HUNDRED_EIGHTY )
            if( BTEST(group%flags_v,SCALAR_BIT) ) then
               do l=1,nvector  ! loop over number of fields
                  ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                 !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+                 !$omp   default(shared) private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) &
+                 !$omp   map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni))
+#endif
                  do k = 1, ksize
                     do j = je, js, -1
                        do i = ie, is, -1
-                          pos = pos + 1
-                          buffer(pos) =  fieldy(i,j,k)
+                          idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1
+                          buffer(idx) = fieldy(i,j,k)
                        end do
                     end do
                  end do
+                 pos = pos + ksize*nj*ni
               end do
            else
               do l=1,nvector  ! loop over number of fields
                  ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                 !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+                 !$omp   default(shared) private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) &
+                 !$omp   map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni))
+#endif
                  do k = 1, ksize
                     do j = je, js, -1
                        do i = ie, is, -1
-                          pos = pos + 1
-                          buffer(pos) =  -fieldy(i,j,k)
+                          idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1
+                          buffer(idx) = -fieldy(i,j,k)
                        end do
                     end do
                  end do
+                 pos = pos + ksize*nj*ni
               end do
            end if
         end select ! select case( rotation(n) )
      endif
   enddo
 else
-!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) &
+#ifndef __NVCOMPILER_OPENMP_GPU
+!$OMP parallel do default(shared) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) &
 !$OMP                          private(buffer_pos,pos,m,is,ie,js,je,rotation, &
-!$OMP                                  ptr_field, ptr_fieldx, ptr_fieldy,n,k)
+!$OMP                                  ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx)
+#endif
   do nk = 1, npack*ksize
      n = (nk-1)/ksize + 1
      k = mod((nk-1), ksize) + 1
      buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos
      pos  = buffer_pos + (k-1)*group%pack_size(n)
-     is = group%pack_is(n); ie = group%pack_ie(n)
-     js = group%pack_js(n); je = group%pack_je(n)
+     is = group%pack_is(n); ie = group%pack_ie(n); ni = ie-is+1
+     js = group%pack_js(n); je = group%pack_je(n); nj = je-js+1
      rotation = group%pack_rotation(n)
      if( group%pack_type(n) == FIELD_S ) then
         select case( rotation )
         case(ZERO)
            do l=1, group%nscalar  ! loop over number of fields
               ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+              !$omp   default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) &
+              !$omp   map(to: field(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni))
+#endif
               do j = js, je
                  do i = is, ie
-                    pos = pos + 1
-                    buffer(pos) = field(i,j,k)
+                    idx = pos + (j-js)*ni + (i-is) + 1
+                    buffer(idx) = field(i,j,k)
                  end do
               end do
+              pos = pos + nj*ni
            enddo
         case( MINUS_NINETY )
            do l=1,group%nscalar  ! loop over number of fields
               ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+              !$omp   default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) &
+              !$omp   map(to: field(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni))
+#endif
               do i = is, ie
                  do j = je, js, -1
-                    pos = pos + 1
-                    buffer(pos) = field(i,j,k)
+                    idx = pos + (i-is)*nj + (je-j) + 1
+                    buffer(idx) = field(i,j,k)
                  end do
               end do
+              pos = pos + nj*ni
            end do
         case( NINETY )
            do l=1,group%nscalar  ! loop over number of fields
               ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+              !$omp   default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) &
+              !$omp   map(to: field(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni))
+#endif
               do i = ie, is, -1
                  do j = js, je
-                    pos = pos + 1
-                    buffer(pos) = field(i,j,k)
+                    idx = pos + (ie-i)*nj + (j-js) + 1
+                    buffer(idx) = field(i,j,k)
                  end do
               end do
+              pos = pos + nj*ni
            end do
         case( ONE_HUNDRED_EIGHTY )
            do l=1,group%nscalar  ! loop over number of fields
               ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+              !$omp   default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) &
+              !$omp   map(to: field(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni))
+#endif
               do j = je, js, -1
                  do i = ie, is, -1
-                    pos = pos + 1
-                    buffer(pos) = field(i,j,k)
+                    idx = pos + (je-j)*ni + (ie-i) + 1
+                    buffer(idx) = field(i,j,k)
                  end do
               end do
+              pos = pos + nj*ni
            end do
         end select
      else if( group%pack_type(n) == FIELD_X  ) then
@@ -297,65 +428,101 @@ else
         case(ZERO)
            do l=1, nvector  ! loop over number of fields
               ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+              !$omp   default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) &
+              !$omp   map(to: fieldx(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni))
+#endif
               do j = js, je
                  do i = is, ie
-                    pos = pos + 1
-                    buffer(pos) = fieldx(i,j,k)
+                    idx = pos + (j-js)*ni + (i-is) + 1
+                    buffer(idx) = fieldx(i,j,k)
                  end do
               end do
+              pos = pos + nj*ni
            end do
         case( MINUS_NINETY )
            if( BTEST(group%flags_v,SCALAR_BIT) ) then
               do l=1,nvector  ! loop over number of fields
                  ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                 !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+                 !$omp   default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) &
+                 !$omp   map(to: fieldy(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni))
+#endif
                  do i = is, ie
                     do j = je, js, -1
-                       pos = pos + 1
-                       buffer(pos) = fieldy(i,j,k)
+                       idx = pos + (i-is)*nj + (je-j) + 1
+                       buffer(idx) = fieldy(i,j,k)
                     end do
                  end do
+                 pos = pos + nj*ni
               end do
            else
               do l=1,nvector  ! loop over number of fields
                  ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                 !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+                 !$omp   default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) &
+                 !$omp   map(to: fieldy(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni))
+#endif
                  do i = is, ie
                     do j = je, js, -1
-                       pos = pos + 1
-                       buffer(pos) = -fieldy(i,j,k)
+                       idx = pos + (i-is)*nj + (je-j) + 1
+                       buffer(idx) = -fieldy(i,j,k)
                     end do
                  end do
+                 pos = pos + nj*ni
               end do
            end if
         case( NINETY )
            do l=1, nvector  ! loop over number of fields
               ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+              !$omp   default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) &
+              !$omp   map(to: fieldy(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni))
+#endif
               do i = ie, is, -1
                  do j = js, je
-                    pos = pos + 1
-                    buffer(pos) = fieldy(i,j,k)
+                    idx = pos + (ie-i)*nj + (j-js) + 1
+                    buffer(idx) = fieldy(i,j,k)
                  end do
               end do
+              pos = pos + nj*ni
            end do
         case( ONE_HUNDRED_EIGHTY )
            if( BTEST(group%flags_v,SCALAR_BIT) ) then
               do l=1,nvector  ! loop over number of fields
                  ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                 !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+                 !$omp   default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) &
+                 !$omp   map(to: fieldx(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni))
+#endif
                  do j = je, js, -1
                     do i = ie, is, -1
-                       pos = pos + 1
-                       buffer(pos) =  fieldx(i,j,k)
+                       idx = pos + (je-j)*ni + (ie-i) + 1
+                       buffer(idx) = fieldx(i,j,k)
                     end do
                  end do
+                 pos = pos + nj*ni
               end do
            else
               do l=1,nvector  ! loop over number of fields
                  ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                 !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+                 !$omp   default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) &
+                 !$omp   map(to: fieldx(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni))
+#endif
                  do j = je, js, -1
                     do i = ie, is, -1
-                       pos = pos + 1
-                       buffer(pos) =  -fieldx(i,j,k)
+                       idx = pos + (je-j)*ni + (ie-i) + 1
+                       buffer(idx) = -fieldx(i,j,k)
                     end do
                  end do
+                 pos = pos + nj*ni
               end do
            end if
         end select ! select case( rotation(n) )
@@ -364,65 +531,101 @@ else
         case(ZERO)
            do l=1, nvector  ! loop over number of fields
               ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+              !$omp   default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) &
+              !$omp   map(to: fieldy(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni))
+#endif
               do j = js, je
                  do i = is, ie
-                    pos = pos + 1
-                    buffer(pos) = fieldy(i,j,k)
+                    idx = pos + (j-js)*ni + (i-is) + 1
+                    buffer(idx) = fieldy(i,j,k)
                  end do
               end do
+              pos = pos + nj*ni
            end do
         case( MINUS_NINETY )
            do l=1,nvector  ! loop over number of fields
               ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+              !$omp   default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) &
+              !$omp   map(to: fieldx(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni))
+#endif
               do i = is, ie
                  do j = je, js, -1
-                    pos = pos + 1
-                    buffer(pos) = fieldx(i,j,k)
+                    idx = pos + (i-is)*nj + (je-j) + 1
+                    buffer(idx) = fieldx(i,j,k)
                  end do
               end do
+              pos = pos + nj*ni
            end do
         case( NINETY )
            if( BTEST(group%flags_v,SCALAR_BIT) ) then
               do l=1, nvector  ! loop over number of fields
                  ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                 !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+                 !$omp   default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) &
+                 !$omp   map(to: fieldx(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni))
+#endif
                  do i = ie, is, -1
                     do j = js, je
-                       pos = pos + 1
-                       buffer(pos) = fieldx(i,j,k)
+                       idx = pos + (ie-i)*nj + (j-js) + 1
+                       buffer(idx) = fieldx(i,j,k)
                     end do
                  end do
+                 pos = pos + nj*ni
               end do
            else
               do l=1,nvector  ! loop over number of fields
                  ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                 !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+                 !$omp   default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) &
+                 !$omp   map(to: fieldx(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni))
+#endif
                  do i = ie, is, -1
                     do j = js, je
-                       pos = pos + 1
-                       buffer(pos) = -fieldx(i,j,k)
+                       idx = pos + (ie-i)*nj + (j-js) + 1
+                       buffer(idx) = -fieldx(i,j,k)
                     end do
                  end do
+                 pos = pos + nj*ni
               end do
            end if
         case( ONE_HUNDRED_EIGHTY )
            if( BTEST(group%flags_v,SCALAR_BIT) ) then
               do l=1,nvector  ! loop over number of fields
                  ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                 !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+                 !$omp   default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) &
+                 !$omp   map(to: fieldy(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni))
+#endif
                  do j = je, js, -1
                     do i = ie, is, -1
-                       pos = pos + 1
-                       buffer(pos) =  fieldy(i,j,k)
+                       idx = pos + (je-j)*ni + (ie-i) + 1
+                       buffer(idx) = fieldy(i,j,k)
                     end do
                  end do
+                 pos = pos + nj*ni
               end do
            else
               do l=1,nvector  ! loop over number of fields
                  ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                 !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+                 !$omp   default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) &
+                 !$omp   map(to: fieldy(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni))
+#endif
                  do j = je, js, -1
                     do i = ie, is, -1
-                       pos = pos + 1
-                       buffer(pos) =  -fieldy(i,j,k)
+                       idx = pos + (je-j)*ni + (ie-i) + 1
+                       buffer(idx) = -fieldy(i,j,k)
                     end do
                  end do
+                 pos = pos + nj*ni
               end do
            end if
         end select ! select case( rotation(n) )
diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc
index 49fb2555ce..d8386a2fce 100644
--- a/mpp/include/group_update_unpack.inc
+++ b/mpp/include/group_update_unpack.inc
@@ -17,92 +17,146 @@
 !***********************************************************************
 
 if( group%k_loop_inside ) then
-!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
+! nvfortran + cray pointers imposes some restrictions on the loops below:
+!   * the compiler cannot privatise OpenMP cray pointers in offloaded loops. Hence, inner loops
+!     must be ported rather than the whole outer loop.
+!   * the more verbose form of openmp offload loops must be used. Would prefer "target teams loop".
+!   * default(shared) must be used otherwise loops hang or segfault. Would prefer "default(none)".
+#ifndef __NVCOMPILER_OPENMP_GPU
+!$OMP parallel do default(shared) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
 !$OMP                          private(buffer_pos,pos,m,is, ie, js, je,rotation, &
-!$OMP                                  ptr_field, ptr_fieldx, ptr_fieldy, n,k )
+!$OMP                                  ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx)
+#endif
   do n = nunpack, 1, -1
      buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos
      pos = buffer_pos
-     is = group%unpack_is(n); ie = group%unpack_ie(n)
-     js = group%unpack_js(n); je = group%unpack_je(n)
+     is = group%unpack_is(n); ie = group%unpack_ie(n); ni = ie-is+1
+     js = group%unpack_js(n); je = group%unpack_je(n); nj = je-js+1
      if( group%unpack_type(n) == FIELD_S ) then
         do l=1,nscalar  ! loop over number of fields
            ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+           !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+           !$omp   default(shared) &
+           !$omp   private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) &
+           !$omp   map(to: buffer(pos+1:pos+ksize*nj*ni)) &
+           !$omp   map(from: field(is:ie,js:je,1:ksize))
+#endif
            do k = 1, ksize
               do j = js, je
                  do i = is, ie
-                    pos = pos + 1
-                    field(i,j,k) = buffer(pos)
+                    idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
+                    field(i,j,k) = buffer(idx)
                  end do
               end do
            end do
+           pos = pos + ksize*nj*ni
         end do
      else if( group%unpack_type(n) == FIELD_X ) then
         do l=1,nvector  ! loop over number of fields
            ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+           !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+           !$omp   default(shared) &
+           !$omp   private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) &
+           !$omp   map(to: buffer(pos+1:pos+ksize*nj*ni)) &
+           !$omp   map(from: fieldx(is:ie,js:je,1:ksize))
+#endif
            do k = 1, ksize
               do j = js, je
                  do i = is, ie
-                    pos = pos + 1
-                    fieldx(i,j,k) = buffer(pos)
+                    idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
+                    fieldx(i,j,k) = buffer(idx)
                  end do
               end do
            end do
+           pos = pos + ksize*nj*ni
         end do
      else if( group%unpack_type(n) == FIELD_Y ) then
         do l=1,nvector  ! loop over number of fields
            ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+           !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
+           !$omp   default(shared) &
+           !$omp   private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldy,ptr) &
+           !$omp   map(to: buffer(pos+1:pos+ksize*nj*ni)) &
+           !$omp   map(from: fieldy(is:ie,js:je,1:ksize))
+#endif
            do k = 1, ksize
               do j = js, je
                  do i = is, ie
-                    pos = pos + 1
-                    fieldy(i,j,k) = buffer(pos)
+                    idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
+                    fieldy(i,j,k) = buffer(idx)
                  end do
               end do
            end do
+           pos = pos + ksize*nj*ni
         end do
      endif
   enddo
 else
-!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
+#ifndef __NVCOMPILER_OPENMP_GPU
+!$OMP parallel do default(shared) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
 !$OMP                          private(buffer_pos,pos,m,is, ie, js, je,rotation, &
-!$OMP                                  ptr_field, ptr_fieldx, ptr_fieldy,n,k)
+!$OMP                                  ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx)
+#endif
   do nk = nunpack*ksize, 1, -1
      n = (nk-1)/ksize + 1
      k = mod((nk-1), ksize) + 1
      buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos
      pos = buffer_pos + (k-1)*group%unpack_size(n)
-     is = group%unpack_is(n); ie = group%unpack_ie(n)
-     js = group%unpack_js(n); je = group%unpack_je(n)
+     is = group%unpack_is(n); ie = group%unpack_ie(n); ni = ie-is+1
+     js = group%unpack_js(n); je = group%unpack_je(n); nj = je-js+1
      if( group%unpack_type(n) == FIELD_S ) then
         do l=1,nscalar  ! loop over number of fields
            ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+           !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+           !$omp   default(shared) &
+           !$omp   private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) &
+           !$omp   map(to: buffer(pos+1:pos+nj*ni)) map(from: field(is:ie,js:je,k))
+#endif
            do j = js, je
               do i = is, ie
-                 pos = pos + 1
-                 field(i,j,k) = buffer(pos)
+                 idx = pos + (j-js)*ni + (i-is) + 1
+                 field(i,j,k) = buffer(idx)
               end do
            end do
+           pos = pos + ni*nj
         end do
      else if( group%unpack_type(n) == FIELD_X ) then
         do l=1,nvector  ! loop over number of fields
            ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+           !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+           !$omp   default(shared) &
+           !$omp   private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) &
+           !$omp   map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldx(is:ie,js:je,k))
+#endif
            do j = js, je
               do i = is, ie
-                 pos = pos + 1
-                 fieldx(i,j,k) = buffer(pos)
+                 idx = pos + (j-js)*ni + (i-is) + 1
+                 fieldx(i,j,k) = buffer(idx)
               end do
            end do
+           pos = pos + ni*nj
         end do
      else if( group%unpack_type(n) == FIELD_Y ) then
         do l=1,nvector  ! loop over number of fields
            ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+           !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
+           !$omp   default(shared) &
+           !$omp   private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) &
+           !$omp   map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldy(is:ie,js:je,k))
+#endif
            do j = js, je
               do i = is, ie
-                 pos = pos + 1
-                 fieldy(i,j,k) = buffer(pos)
+               idx = pos + (j-js)*ni + (i-is) + 1
+               fieldy(i,j,k) = buffer(idx)
               end do
            end do
+           pos = pos + ni*nj
         end do
      endif
   enddo
diff --git a/mpp/include/mpp_comm_mpi.inc b/mpp/include/mpp_comm_mpi.inc
index e7b5ccb76a..42e22b2598 100644
--- a/mpp/include/mpp_comm_mpi.inc
+++ b/mpp/include/mpp_comm_mpi.inc
@@ -30,6 +30,7 @@
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   !> @brief Initialize the @ref mpp_mod module. Must be called before any usage.
   subroutine mpp_init_f08( flags, localcomm, test_level, alt_input_nml_path )
+  !$ use omp_lib
   integer, optional, intent(in) :: flags !< Flags for debug output, can be MPP_VERBOSE or MPP_DEBUG
   type(mpi_comm), optional, intent(in) :: localcomm !< MPI communicator to use. Only relevant if MPI has already
                                                !! been initialized by an external call to mpi_init.
@@ -60,6 +61,14 @@
   call MPI_COMM_RANK( mpp_comm_private, pe,   error )
   call MPI_COMM_SIZE( mpp_comm_private, npes, error )
 
+  ! set default device to enable multi GPU parallelism
+  ! calls to both OpenACC and OpenMP runtimes are needed
+  ! because we use both do-concurrent and openmp
+  ! if you remove either, the code will run multiple
+  ! ranks on a _single_ GPU. Be careful out there!
+  !$ call omp_set_default_device(pe)
+  !$acc set device_num(pe)
+
   module_is_initialized = .TRUE.
   if (present(test_level)) then
     t_level = test_level
diff --git a/mpp/include/mpp_domains_reduce.inc b/mpp/include/mpp_domains_reduce.inc
index 826b2f66ac..874d0217b7 100644
--- a/mpp/include/mpp_domains_reduce.inc
+++ b/mpp/include/mpp_domains_reduce.inc
@@ -905,6 +905,8 @@
 #undef MPP_TYPE_INIT_VALUE
 
 !****************************************************
+#ifndef __NVCOMPILER
+
 #undef MPP_GLOBAL_FIELD_
 #define MPP_GLOBAL_FIELD_ mpp_global_field_r8
 #undef MPP_TYPE_
@@ -972,6 +974,127 @@
 #undef DEFAULT_VALUE_
 #define DEFAULT_VALUE_ .false._l4_kind
 #include <mpp_global_field.fh>
+
+
+!! if __NVCOMPILER is defined, use compatibility version
+#else
+
+#undef MPP_TYPE_
+#define MPP_TYPE_ real(r8_kind)
+#undef DEFAULT_VALUE_
+#define DEFAULT_VALUE_ 0._r8_kind
+#undef MPP_GLOBAL_FIELD_2D_
+#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_r8_2d
+#undef MPP_GLOBAL_FIELD_3D_
+#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_r8_3d
+#undef MPP_GLOBAL_FIELD_4D_
+#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_r8_4d
+#undef MPP_GLOBAL_FIELD_5D_
+#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_r8_5d
+#include <mpp_global_field_compat.fh>
+
+#undef MPP_TYPE_
+#define MPP_TYPE_ integer(i8_kind)
+#undef DEFAULT_VALUE_
+#define DEFAULT_VALUE_ 0_i8_kind
+#undef MPP_GLOBAL_FIELD_2D_
+#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_i8_2d
+#undef MPP_GLOBAL_FIELD_3D_
+#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_i8_3d
+#undef MPP_GLOBAL_FIELD_4D_
+#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_i8_4d
+#undef MPP_GLOBAL_FIELD_5D_
+#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_i8_5d
+#include <mpp_global_field_compat.fh>
+
+#ifdef OVERLOAD_C8
+#undef MPP_TYPE_
+#define MPP_TYPE_ complex(c8_kind)
+#undef DEFAULT_VALUE_
+#define DEFAULT_VALUE_ (0._r8_kind,0._r8_kind)
+#undef MPP_GLOBAL_FIELD_2D_
+#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_c8_2d
+#undef MPP_GLOBAL_FIELD_3D_
+#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_c8_3d
+#undef MPP_GLOBAL_FIELD_4D_
+#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_c8_4d
+#undef MPP_GLOBAL_FIELD_5D_
+#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_c8_5d
+#include <mpp_global_field_compat.fh>
+#endif
+
+#ifdef OVERLOAD_C4
+#undef MPP_TYPE_
+#define MPP_TYPE_ complex(c4_kind)
+#undef DEFAULT_VALUE_
+#define DEFAULT_VALUE_ (0._r4_kind,0._r4_kind)
+#undef MPP_GLOBAL_FIELD_2D_
+#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_c4_2d
+#undef MPP_GLOBAL_FIELD_3D_
+#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_c4_3d
+#undef MPP_GLOBAL_FIELD_4D_
+#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_c4_4d
+#undef MPP_GLOBAL_FIELD_5D_
+#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_c4_5d
+#include <mpp_global_field_compat.fh>
+#endif
+
+#undef MPP_TYPE_
+#define MPP_TYPE_ logical(l8_kind)
+#undef DEFAULT_VALUE_
+#define DEFAULT_VALUE_ .false._l8_kind
+#undef MPP_GLOBAL_FIELD_2D_
+#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_l8_2d
+#undef MPP_GLOBAL_FIELD_3D_
+#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_l8_3d
+#undef MPP_GLOBAL_FIELD_4D_
+#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_l8_4d
+#undef MPP_GLOBAL_FIELD_5D_
+#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_l8_5d
+#include <mpp_global_field_compat.fh>
+
+#undef MPP_TYPE_
+#define MPP_TYPE_ real(r4_kind)
+#undef DEFAULT_VALUE_
+#define DEFAULT_VALUE_ 0._r4_kind
+#undef MPP_GLOBAL_FIELD_2D_
+#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_r4_2d
+#undef MPP_GLOBAL_FIELD_3D_
+#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_r4_3d
+#undef MPP_GLOBAL_FIELD_4D_
+#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_r4_4d
+#undef MPP_GLOBAL_FIELD_5D_
+#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_r4_5d
+#include <mpp_global_field_compat.fh>
+
+#undef MPP_TYPE_
+#define MPP_TYPE_ integer(i4_kind)
+#undef DEFAULT_VALUE_
+#define DEFAULT_VALUE_ 0_i4_kind
+#undef MPP_GLOBAL_FIELD_2D_
+#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_i4_2d
+#undef MPP_GLOBAL_FIELD_3D_
+#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_i4_3d
+#undef MPP_GLOBAL_FIELD_4D_
+#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_i4_4d
+#undef MPP_GLOBAL_FIELD_5D_
+#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_i4_5d
+#include <mpp_global_field_compat.fh>
+
+#undef MPP_TYPE_
+#define MPP_TYPE_ logical(l4_kind)
+#undef DEFAULT_VALUE_
+#define DEFAULT_VALUE_ .false._l4_kind
+#undef MPP_GLOBAL_FIELD_2D_
+#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_l4_2d
+#undef MPP_GLOBAL_FIELD_3D_
+#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_l4_3d
+#undef MPP_GLOBAL_FIELD_4D_
+#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_l4_4d
+#undef MPP_GLOBAL_FIELD_5D_
+#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_l4_5d
+#include <mpp_global_field_compat.fh>
+#endif
 !****************************************************
 #undef MPP_DO_GLOBAL_FIELD_3D_AD_
 #define MPP_DO_GLOBAL_FIELD_3D_AD_ mpp_do_global_field2D_r8_3d_ad
diff --git a/mpp/include/mpp_global_field_compat.fh b/mpp/include/mpp_global_field_compat.fh
new file mode 100644
index 0000000000..a6a4bbc23d
--- /dev/null
+++ b/mpp/include/mpp_global_field_compat.fh
@@ -0,0 +1,1038 @@
+!***********************************************************************
+!*                             Apache License 2.0
+!*
+!* This file is part of the GFDL Flexible Modeling System (FMS).
+!*
+!* Licensed under the Apache License, Version 2.0 (the "License");
+!* you may not use this file except in compliance with the License.
+!* You may obtain a copy of the License at
+!*
+!*     http://www.apache.org/licenses/LICENSE-2.0
+!*
+!* FMS is distributed in the hope that it will be useful, but WITHOUT
+!* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied;
+!* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+!* PARTICULAR PURPOSE. See the License for the specific language
+!* governing permissions and limitations under the License.
+!***********************************************************************
+!> @addtogroup mpp_domains_mod
+!> {@
+    !> get a global field from a local field
+    !! local field may be on compute OR data domain
+
+      !> Gets a global field from a local field
+      !! local field may be on compute OR data domain
+    subroutine MPP_GLOBAL_FIELD_2D_( domain, local, global, flags, position, tile_count, default_data, xdim, ydim)
+      type(domain2D), intent(in)    :: domain
+      MPP_TYPE_, intent(in)         ::  local(:,:)
+      MPP_TYPE_, intent(out)        :: global(:,:)
+      integer, intent(in), optional :: flags
+      integer, intent(in), optional :: position
+      integer, intent(in), optional :: tile_count
+      MPP_TYPE_, intent(in), optional :: default_data
+      integer, intent(in), optional :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical
+                                                  !! Fortran-based code, xdim=1 and ydim=2.
+
+      integer :: i, j, k, m, n, nd, num_words, lpos, rpos, ioff, joff, from_pe, root_pe, tile_id
+      integer :: ke, isc, iec, jsc, jec, is, ie, js, je, num_word_me
+      integer :: ipos, jpos, n_per_gridpoint_local, n_per_gridpoint_global
+      logical :: xonly, yonly, root_only, global_on_this_pe
+      integer :: ix, iy
+      MPP_TYPE_, pointer, dimension(:) :: clocal, cremote
+      integer :: stackuse
+      character(len=8) :: text
+      type(c_ptr) :: stack_cptr !< Workaround for GFortran bug
+
+      integer :: tile, ishift, jshift, ipos0, jpos0
+      integer :: size_clocal, size_cremote
+
+      tile = 1
+      if(present(tile_count)) tile = tile_count
+
+      call mpp_get_domain_shift(domain, ishift, jshift, position)
+
+      ipos0 = -domain%x(tile)%global%begin + 1
+      jpos0 = -domain%y(tile)%global%begin + 1
+
+      if (present(xdim)) then
+        ix = xdim
+      else
+        ix = 1
+      endif
+
+      if (present(ydim)) then
+        iy = ydim
+      else
+        iy = 2
+      endif
+
+      n_per_gridpoint_local = size(local) / (size(local,ix) * size(local,iy))
+      n_per_gridpoint_global = size(global) / (size(global,ix) * size(global,iy))
+      size_clocal = (domain%x(1)%compute%size+ishift) * (domain%y(1)%compute%size+jshift) * n_per_gridpoint_local
+      size_cremote = (domain%x(1)%compute%max_size+ishift) * (domain%y(1)%compute%max_size+jshift) * &
+                     n_per_gridpoint_local
+
+      stackuse = size_clocal + size_cremote
+      if( stackuse.GT.mpp_domains_stack_size )then
+          write( text, '(i8)' )stackuse
+          call mpp_error( FATAL, &
+               'MPP_DO_GLOBAL_FIELD user stack overflow: call mpp_domains_set_stack_size('//trim(text)// &
+               & ') from all PEs.' )
+      end if
+      mpp_domains_stack_hwm = max( mpp_domains_stack_hwm, stackuse )
+
+      stack_cptr = c_loc(mpp_domains_stack(1))
+      call c_f_pointer(stack_cptr, clocal, [size_clocal])
+
+      stack_cptr = c_loc(mpp_domains_stack(1+size_clocal))
+      call c_f_pointer(stack_cptr, cremote, [size_cremote])
+
+      if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_GLOBAL_FIELD: must first call mpp_domains_init.' )
+
+      xonly = .FALSE.
+      yonly = .FALSE.
+      root_only = .FALSE.
+      if( PRESENT(flags) ) then
+         xonly = BTEST(flags,EAST)
+         yonly = BTEST(flags,SOUTH)
+         if( .NOT.xonly .AND. .NOT.yonly )call mpp_error( WARNING,  &
+                   'MPP_GLOBAL_FIELD: you must have flags=XUPDATE, YUPDATE or XUPDATE+YUPDATE' )
+         if(xonly .AND. yonly) then
+            xonly = .false.; yonly = .false.
+         endif
+         root_only = BTEST(flags, ROOT_GLOBAL)
+         if( (xonly .or. yonly) .AND. root_only ) then
+            call mpp_error( WARNING, 'MPP_GLOBAL_FIELD: flags = XUPDATE+GLOBAL_ROOT_ONLY or ' // &
+                 'flags = YUPDATE+GLOBAL_ROOT_ONLY is not supported, will ignore GLOBAL_ROOT_ONLY' )
+            root_only = .FALSE.
+         endif
+      endif
+
+      global_on_this_pe =  .NOT. root_only .OR. domain%pe == domain%tile_root_pe
+      ipos = 0; jpos = 0
+      if(global_on_this_pe ) then
+         if(n_per_gridpoint_local.ne.n_per_gridpoint_global) then
+           call mpp_error(FATAL, 'MPP_GLOBAL_FIELD: mismatch of global and local dimension sizes')
+         endif
+         if( size(global,ix).NE.(domain%x(tile)%global%size+ishift) .OR. &
+             size(global,iy).NE.(domain%y(tile)%global%size+jshift))then
+            if(xonly) then
+               if(size(global,ix).NE.(domain%x(tile)%global%size+ishift) .OR. &
+                   size(global,iy).NE.(domain%y(tile)%compute%size+jshift)) &
+                  call mpp_error( FATAL, &
+                                 &  'MPP_GLOBAL_FIELD: incoming arrays do not match domain for xonly global field.' )
+               jpos = -domain%y(tile)%compute%begin + 1
+            else if(yonly) then
+               if(size(global,ix).NE.(domain%x(tile)%compute%size+ishift) .OR. &
+                   size(global,iy).NE.(domain%y(tile)%global%size+jshift)) &
+                  call mpp_error( FATAL, &
+                                 &  'MPP_GLOBAL_FIELD: incoming arrays do not match domain for yonly global field.' )
+               ipos = -domain%x(tile)%compute%begin + 1
+            else
+               call mpp_error( FATAL, 'MPP_GLOBAL_FIELD: incoming arrays do not match domain.' )
+            endif
+         endif
+      endif
+
+      if( size(local,ix).EQ.(domain%x(tile)%compute%size+ishift) .AND. &
+          size(local,iy).EQ.(domain%y(tile)%compute%size+jshift) )then
+         !local is on compute domain
+         ioff = -domain%x(tile)%compute%begin + 1
+         joff = -domain%y(tile)%compute%begin + 1
+      else if( size(local,ix).EQ.(domain%x(tile)%memory%size+ishift) .AND. &
+               size(local,iy).EQ.(domain%y(tile)%memory%size+jshift) )then
+         !local is on data domain
+         ioff = -domain%x(tile)%domain_data%begin + 1
+         joff = -domain%y(tile)%domain_data%begin + 1
+      else
+         call mpp_error( FATAL, &
+                       & 'MPP_GLOBAL_FIELD_: incoming field array must match either compute domain or memory domain.')
+      end if
+
+      !ke  = size(local,3)
+      ke  = n_per_gridpoint_local
+      isc = domain%x(tile)%compute%begin; iec = domain%x(tile)%compute%end+ishift
+      jsc = domain%y(tile)%compute%begin; jec = domain%y(tile)%compute%end+jshift
+
+      num_word_me = (iec-isc+1)*(jec-jsc+1)*n_per_gridpoint_local
+
+! make contiguous array from compute domain
+      m = 0
+      if(global_on_this_pe) then
+         !z1l: initialize global = 0 to support mask domain
+         if(present(default_data)) then
+           call arr_init(global, default_data)
+         else
+           call arr_init(global, DEFAULT_VALUE_)
+         endif
+
+        call arr2vec(local, clocal, ix, iy, ioff+isc, ioff+iec, joff+jsc, joff+jec)
+
+        ! Fill local domain directly
+        call vec2arr(clocal, global, ix, iy, ipos0+ipos+isc, ipos0+ipos+iec, jpos0+jpos+jsc, jpos0+jpos+jec)
+      else
+        call arr2vec(local, clocal, ix, iy, ioff+isc, ioff+iec, joff+jsc, joff+jec)
+      endif
+
+! if there is more than one tile on this pe, then no decomposition for all tiles on this pe, so we can just return
+      if(size(domain%x(:))>1) then
+         !--- the following is needed to avoid deadlock.
+         if( tile == size(domain%x(:)) ) call mpp_sync_self( )
+         return
+      end if
+
+      root_pe = mpp_root_pe()
+
+!fill off-domains (note loops begin at an offset of 1)
+      if( xonly )then
+          nd = size(domain%x(1)%list(:))
+          do n = 1,nd-1
+             lpos = mod(domain%x(1)%pos+nd-n,nd)
+             rpos = mod(domain%x(1)%pos   +n,nd)
+             from_pe = domain%x(1)%list(rpos)%pe
+             rpos = from_pe - root_pe ! for concurrent run, root_pe may not be 0.
+             if (from_pe == NULL_PE) then
+                num_words = 0
+             else
+                num_words = (domain%list(rpos)%x(1)%compute%size+ishift) &
+                       * (domain%list(rpos)%y(1)%compute%size+jshift) * ke
+             endif
+           ! Force use of scalar, integer ptr interface
+             call mpp_transmit( put_data=clocal(1), plen=num_word_me, to_pe=domain%x(1)%list(lpos)%pe, &
+                                get_data=cremote(1), glen=num_words, from_pe=from_pe )
+             m = 0
+             if (from_pe /= NULL_PE) then
+                is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift
+                call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+jpos+jsc, jpos0+jpos+jec)
+             endif
+             call mpp_sync_self()  !-ensure MPI_ISEND is done.
+          end do
+      else if( yonly )then
+          nd = size(domain%y(1)%list(:))
+          do n = 1,nd-1
+             lpos = mod(domain%y(1)%pos+nd-n,nd)
+             rpos = mod(domain%y(1)%pos   +n,nd)
+             from_pe = domain%y(1)%list(rpos)%pe
+             rpos = from_pe - root_pe
+             if (from_pe == NULL_PE) then
+                num_words = 0
+             else
+                num_words = (domain%list(rpos)%x(1)%compute%size+ishift) &
+                       * (domain%list(rpos)%y(1)%compute%size+jshift) * ke
+             endif
+           ! Force use of scalar, integer pointer interface
+             call mpp_transmit( put_data=clocal(1), plen=num_word_me, to_pe=domain%y(1)%list(lpos)%pe, &
+                                get_data=cremote(1), glen=num_words, from_pe=from_pe )
+             m = 0
+             if (from_pe /= NULL_PE) then
+               js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift
+               call vec2arr(cremote, global, ix, iy, ipos0+ipos+isc, ipos0+ipos+iec, jpos0+js, jpos0+je)
+             endif
+             call mpp_sync_self()  !-ensure MPI_ISEND is done.
+          end do
+      else
+         tile_id = domain%tile_id(1)
+         nd = size(domain%list(:))
+         if(root_only) then
+            if(domain%pe .NE. domain%tile_root_pe) then
+               call mpp_send( clocal(1), plen=num_word_me, to_pe=domain%tile_root_pe, tag=COMM_TAG_1 )
+            else
+               do n = 1,nd-1
+                  rpos = mod(domain%pos+n,nd)
+                  if( domain%list(rpos)%tile_id(1) .NE. tile_id ) cycle
+                  num_words = (domain%list(rpos)%x(1)%compute%size+ishift) * &
+                            & (domain%list(rpos)%y(1)%compute%size+jshift) * ke
+                  call mpp_recv(cremote(1), glen=num_words, from_pe=domain%list(rpos)%pe, tag=COMM_TAG_1 )
+                  m = 0
+                  is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift
+                  js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift
+
+                  call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+js, jpos0+je)
+               end do
+            endif
+         else
+            do n = 1,nd-1
+               lpos = mod(domain%pos+nd-n,nd)
+               if( domain%list(lpos)%tile_id(1).NE. tile_id ) cycle ! global field only within tile
+               call mpp_send( clocal(1), plen=num_word_me, to_pe=domain%list(lpos)%pe, tag=COMM_TAG_2 )
+            end do
+            do n = 1,nd-1
+               rpos = mod(domain%pos+n,nd)
+               if( domain%list(rpos)%tile_id(1) .NE. tile_id ) cycle ! global field only within tile
+               num_words = (domain%list(rpos)%x(1)%compute%size+ishift) * &
+                         & (domain%list(rpos)%y(1)%compute%size+jshift) * ke
+               call mpp_recv( cremote(1), glen=num_words, from_pe=domain%list(rpos)%pe, tag=COMM_TAG_2 )
+               m = 0
+               is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift
+               js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift
+
+               call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+js, jpos0+je)
+            end do
+         endif
+      end if
+
+      call mpp_sync_self
+    end subroutine MPP_GLOBAL_FIELD_2D_
+
+      !> Gets a global field from a local field
+      !! local field may be on compute OR data domain
+    subroutine MPP_GLOBAL_FIELD_3D_( domain, local, global, flags, position, tile_count, default_data, xdim, ydim)
+      type(domain2D), intent(in)    :: domain
+      MPP_TYPE_, intent(in)         ::  local(:,:,:)
+      MPP_TYPE_, intent(out)        :: global(:,:,:)
+      integer, intent(in), optional :: flags
+      integer, intent(in), optional :: position
+      integer, intent(in), optional :: tile_count
+      MPP_TYPE_, intent(in), optional :: default_data
+      integer, intent(in), optional :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical
+                                                  !! Fortran-based code, xdim=1 and ydim=2.
+
+      integer :: i, j, k, m, n, nd, num_words, lpos, rpos, ioff, joff, from_pe, root_pe, tile_id
+      integer :: ke, isc, iec, jsc, jec, is, ie, js, je, num_word_me
+      integer :: ipos, jpos, n_per_gridpoint_local, n_per_gridpoint_global
+      logical :: xonly, yonly, root_only, global_on_this_pe
+      integer :: ix, iy
+      MPP_TYPE_, pointer, dimension(:) :: clocal, cremote
+      integer :: stackuse
+      character(len=8) :: text
+      type(c_ptr) :: stack_cptr !< Workaround for GFortran bug
+
+      integer :: tile, ishift, jshift, ipos0, jpos0
+      integer :: size_clocal, size_cremote
+
+      tile = 1
+      if(present(tile_count)) tile = tile_count
+
+      call mpp_get_domain_shift(domain, ishift, jshift, position)
+
+      ipos0 = -domain%x(tile)%global%begin + 1
+      jpos0 = -domain%y(tile)%global%begin + 1
+
+      if (present(xdim)) then
+        ix = xdim
+      else
+        ix = 1
+      endif
+
+      if (present(ydim)) then
+        iy = ydim
+      else
+        iy = 2
+      endif
+
+      n_per_gridpoint_local = size(local) / (size(local,ix) * size(local,iy))
+      n_per_gridpoint_global = size(global) / (size(global,ix) * size(global,iy))
+      size_clocal = (domain%x(1)%compute%size+ishift) * (domain%y(1)%compute%size+jshift) * n_per_gridpoint_local
+      size_cremote = (domain%x(1)%compute%max_size+ishift) * (domain%y(1)%compute%max_size+jshift) * &
+                     n_per_gridpoint_local
+
+      stackuse = size_clocal + size_cremote
+      if( stackuse.GT.mpp_domains_stack_size )then
+          write( text, '(i8)' )stackuse
+          call mpp_error( FATAL, &
+               'MPP_DO_GLOBAL_FIELD user stack overflow: call mpp_domains_set_stack_size('//trim(text)// &
+               & ') from all PEs.' )
+      end if
+      mpp_domains_stack_hwm = max( mpp_domains_stack_hwm, stackuse )
+
+      stack_cptr = c_loc(mpp_domains_stack(1))
+      call c_f_pointer(stack_cptr, clocal, [size_clocal])
+
+      stack_cptr = c_loc(mpp_domains_stack(1+size_clocal))
+      call c_f_pointer(stack_cptr, cremote, [size_cremote])
+
+      if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_GLOBAL_FIELD: must first call mpp_domains_init.' )
+
+      xonly = .FALSE.
+      yonly = .FALSE.
+      root_only = .FALSE.
+      if( PRESENT(flags) ) then
+         xonly = BTEST(flags,EAST)
+         yonly = BTEST(flags,SOUTH)
+         if( .NOT.xonly .AND. .NOT.yonly )call mpp_error( WARNING,  &
+                   'MPP_GLOBAL_FIELD: you must have flags=XUPDATE, YUPDATE or XUPDATE+YUPDATE' )
+         if(xonly .AND. yonly) then
+            xonly = .false.; yonly = .false.
+         endif
+         root_only = BTEST(flags, ROOT_GLOBAL)
+         if( (xonly .or. yonly) .AND. root_only ) then
+            call mpp_error( WARNING, 'MPP_GLOBAL_FIELD: flags = XUPDATE+GLOBAL_ROOT_ONLY or ' // &
+                 'flags = YUPDATE+GLOBAL_ROOT_ONLY is not supported, will ignore GLOBAL_ROOT_ONLY' )
+            root_only = .FALSE.
+         endif
+      endif
+
+      global_on_this_pe =  .NOT. root_only .OR. domain%pe == domain%tile_root_pe
+      ipos = 0; jpos = 0
+      if(global_on_this_pe ) then
+         if(n_per_gridpoint_local.ne.n_per_gridpoint_global) then
+           call mpp_error(FATAL, 'MPP_GLOBAL_FIELD: mismatch of global and local dimension sizes')
+         endif
+         if( size(global,ix).NE.(domain%x(tile)%global%size+ishift) .OR. &
+             size(global,iy).NE.(domain%y(tile)%global%size+jshift))then
+            if(xonly) then
+               if(size(global,ix).NE.(domain%x(tile)%global%size+ishift) .OR. &
+                   size(global,iy).NE.(domain%y(tile)%compute%size+jshift)) &
+                  call mpp_error( FATAL, &
+                                 &  'MPP_GLOBAL_FIELD: incoming arrays do not match domain for xonly global field.' )
+               jpos = -domain%y(tile)%compute%begin + 1
+            else if(yonly) then
+               if(size(global,ix).NE.(domain%x(tile)%compute%size+ishift) .OR. &
+                   size(global,iy).NE.(domain%y(tile)%global%size+jshift)) &
+                  call mpp_error( FATAL, &
+                                 &  'MPP_GLOBAL_FIELD: incoming arrays do not match domain for yonly global field.' )
+               ipos = -domain%x(tile)%compute%begin + 1
+            else
+               call mpp_error( FATAL, 'MPP_GLOBAL_FIELD: incoming arrays do not match domain.' )
+            endif
+         endif
+      endif
+
+      if( size(local,ix).EQ.(domain%x(tile)%compute%size+ishift) .AND. &
+          size(local,iy).EQ.(domain%y(tile)%compute%size+jshift) )then
+         !local is on compute domain
+         ioff = -domain%x(tile)%compute%begin + 1
+         joff = -domain%y(tile)%compute%begin + 1
+      else if( size(local,ix).EQ.(domain%x(tile)%memory%size+ishift) .AND. &
+               size(local,iy).EQ.(domain%y(tile)%memory%size+jshift) )then
+         !local is on data domain
+         ioff = -domain%x(tile)%domain_data%begin + 1
+         joff = -domain%y(tile)%domain_data%begin + 1
+      else
+         call mpp_error( FATAL, &
+                       & 'MPP_GLOBAL_FIELD_: incoming field array must match either compute domain or memory domain.')
+      end if
+
+      !ke  = size(local,3)
+      ke  = n_per_gridpoint_local
+      isc = domain%x(tile)%compute%begin; iec = domain%x(tile)%compute%end+ishift
+      jsc = domain%y(tile)%compute%begin; jec = domain%y(tile)%compute%end+jshift
+
+      num_word_me = (iec-isc+1)*(jec-jsc+1)*n_per_gridpoint_local
+
+! make contiguous array from compute domain
+      m = 0
+      if(global_on_this_pe) then
+         !z1l: initialize global = 0 to support mask domain
+         if(present(default_data)) then
+           call arr_init(global, default_data)
+         else
+           call arr_init(global, DEFAULT_VALUE_)
+         endif
+
+        call arr2vec(local, clocal, ix, iy, ioff+isc, ioff+iec, joff+jsc, joff+jec)
+
+        ! Fill local domain directly
+        call vec2arr(clocal, global, ix, iy, ipos0+ipos+isc, ipos0+ipos+iec, jpos0+jpos+jsc, jpos0+jpos+jec)
+      else
+        call arr2vec(local, clocal, ix, iy, ioff+isc, ioff+iec, joff+jsc, joff+jec)
+      endif
+
+! if there is more than one tile on this pe, then no decomposition for all tiles on this pe, so we can just return
+      if(size(domain%x(:))>1) then
+         !--- the following is needed to avoid deadlock.
+         if( tile == size(domain%x(:)) ) call mpp_sync_self( )
+         return
+      end if
+
+      root_pe = mpp_root_pe()
+
+!fill off-domains (note loops begin at an offset of 1)
+      if( xonly )then
+          nd = size(domain%x(1)%list(:))
+          do n = 1,nd-1
+             lpos = mod(domain%x(1)%pos+nd-n,nd)
+             rpos = mod(domain%x(1)%pos   +n,nd)
+             from_pe = domain%x(1)%list(rpos)%pe
+             rpos = from_pe - root_pe ! for concurrent run, root_pe may not be 0.
+             if (from_pe == NULL_PE) then
+                num_words = 0
+             else
+                num_words = (domain%list(rpos)%x(1)%compute%size+ishift) &
+                       * (domain%list(rpos)%y(1)%compute%size+jshift) * ke
+             endif
+           ! Force use of scalar, integer ptr interface
+             call mpp_transmit( put_data=clocal(1), plen=num_word_me, to_pe=domain%x(1)%list(lpos)%pe, &
+                                get_data=cremote(1), glen=num_words, from_pe=from_pe )
+             m = 0
+             if (from_pe /= NULL_PE) then
+                is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift
+                call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+jpos+jsc, jpos0+jpos+jec)
+             endif
+             call mpp_sync_self()  !-ensure MPI_ISEND is done.
+          end do
+      else if( yonly )then
+          nd = size(domain%y(1)%list(:))
+          do n = 1,nd-1
+             lpos = mod(domain%y(1)%pos+nd-n,nd)
+             rpos = mod(domain%y(1)%pos   +n,nd)
+             from_pe = domain%y(1)%list(rpos)%pe
+             rpos = from_pe - root_pe
+             if (from_pe == NULL_PE) then
+                num_words = 0
+             else
+                num_words = (domain%list(rpos)%x(1)%compute%size+ishift) &
+                       * (domain%list(rpos)%y(1)%compute%size+jshift) * ke
+             endif
+           ! Force use of scalar, integer pointer interface
+             call mpp_transmit( put_data=clocal(1), plen=num_word_me, to_pe=domain%y(1)%list(lpos)%pe, &
+                                get_data=cremote(1), glen=num_words, from_pe=from_pe )
+             m = 0
+             if (from_pe /= NULL_PE) then
+               js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift
+               call vec2arr(cremote, global, ix, iy, ipos0+ipos+isc, ipos0+ipos+iec, jpos0+js, jpos0+je)
+             endif
+             call mpp_sync_self()  !-ensure MPI_ISEND is done.
+          end do
+      else
+         tile_id = domain%tile_id(1)
+         nd = size(domain%list(:))
+         if(root_only) then
+            if(domain%pe .NE. domain%tile_root_pe) then
+               call mpp_send( clocal(1), plen=num_word_me, to_pe=domain%tile_root_pe, tag=COMM_TAG_1 )
+            else
+               do n = 1,nd-1
+                  rpos = mod(domain%pos+n,nd)
+                  if( domain%list(rpos)%tile_id(1) .NE. tile_id ) cycle
+                  num_words = (domain%list(rpos)%x(1)%compute%size+ishift) * &
+                            & (domain%list(rpos)%y(1)%compute%size+jshift) * ke
+                  call mpp_recv(cremote(1), glen=num_words, from_pe=domain%list(rpos)%pe, tag=COMM_TAG_1 )
+                  m = 0
+                  is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift
+                  js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift
+
+                  call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+js, jpos0+je)
+               end do
+            endif
+         else
+            do n = 1,nd-1
+               lpos = mod(domain%pos+nd-n,nd)
+               if( domain%list(lpos)%tile_id(1).NE. tile_id ) cycle ! global field only within tile
+               call mpp_send( clocal(1), plen=num_word_me, to_pe=domain%list(lpos)%pe, tag=COMM_TAG_2 )
+            end do
+            do n = 1,nd-1
+               rpos = mod(domain%pos+n,nd)
+               if( domain%list(rpos)%tile_id(1) .NE. tile_id ) cycle ! global field only within tile
+               num_words = (domain%list(rpos)%x(1)%compute%size+ishift) * &
+                         & (domain%list(rpos)%y(1)%compute%size+jshift) * ke
+               call mpp_recv( cremote(1), glen=num_words, from_pe=domain%list(rpos)%pe, tag=COMM_TAG_2 )
+               m = 0
+               is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift
+               js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift
+
+               call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+js, jpos0+je)
+            end do
+         endif
+      end if
+
+      call mpp_sync_self
+    end subroutine MPP_GLOBAL_FIELD_3D_
+
+      !> Gets a global field from a local field
+      !! local field may be on compute OR data domain
+    subroutine MPP_GLOBAL_FIELD_4D_( domain, local, global, flags, position, tile_count, default_data, xdim, ydim)
+      type(domain2D), intent(in)    :: domain
+      MPP_TYPE_, intent(in)         ::  local(:,:,:,:)
+      MPP_TYPE_, intent(out)        :: global(:,:,:,:)
+      integer, intent(in), optional :: flags
+      integer, intent(in), optional :: position
+      integer, intent(in), optional :: tile_count
+      MPP_TYPE_, intent(in), optional :: default_data
+      integer, intent(in), optional :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical
+                                                  !! Fortran-based code, xdim=1 and ydim=2.
+
+      integer :: i, j, k, m, n, nd, num_words, lpos, rpos, ioff, joff, from_pe, root_pe, tile_id
+      integer :: ke, isc, iec, jsc, jec, is, ie, js, je, num_word_me
+      integer :: ipos, jpos, n_per_gridpoint_local, n_per_gridpoint_global
+      logical :: xonly, yonly, root_only, global_on_this_pe
+      integer :: ix, iy
+      MPP_TYPE_, pointer, dimension(:) :: clocal, cremote
+      integer :: stackuse
+      character(len=8) :: text
+      type(c_ptr) :: stack_cptr !< Workaround for GFortran bug
+
+      integer :: tile, ishift, jshift, ipos0, jpos0
+      integer :: size_clocal, size_cremote
+
+      tile = 1
+      if(present(tile_count)) tile = tile_count
+
+      call mpp_get_domain_shift(domain, ishift, jshift, position)
+
+      ipos0 = -domain%x(tile)%global%begin + 1
+      jpos0 = -domain%y(tile)%global%begin + 1
+
+      if (present(xdim)) then
+        ix = xdim
+      else
+        ix = 1
+      endif
+
+      if (present(ydim)) then
+        iy = ydim
+      else
+        iy = 2
+      endif
+
+      n_per_gridpoint_local = size(local) / (size(local,ix) * size(local,iy))
+      n_per_gridpoint_global = size(global) / (size(global,ix) * size(global,iy))
+      size_clocal = (domain%x(1)%compute%size+ishift) * (domain%y(1)%compute%size+jshift) * n_per_gridpoint_local
+      size_cremote = (domain%x(1)%compute%max_size+ishift) * (domain%y(1)%compute%max_size+jshift) * &
+                     n_per_gridpoint_local
+
+      stackuse = size_clocal + size_cremote
+      if( stackuse.GT.mpp_domains_stack_size )then
+          write( text, '(i8)' )stackuse
+          call mpp_error( FATAL, &
+               'MPP_DO_GLOBAL_FIELD user stack overflow: call mpp_domains_set_stack_size('//trim(text)// &
+               & ') from all PEs.' )
+      end if
+      mpp_domains_stack_hwm = max( mpp_domains_stack_hwm, stackuse )
+
+      stack_cptr = c_loc(mpp_domains_stack(1))
+      call c_f_pointer(stack_cptr, clocal, [size_clocal])
+
+      stack_cptr = c_loc(mpp_domains_stack(1+size_clocal))
+      call c_f_pointer(stack_cptr, cremote, [size_cremote])
+
+      if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_GLOBAL_FIELD: must first call mpp_domains_init.' )
+
+      xonly = .FALSE.
+      yonly = .FALSE.
+      root_only = .FALSE.
+      if( PRESENT(flags) ) then
+         xonly = BTEST(flags,EAST)
+         yonly = BTEST(flags,SOUTH)
+         if( .NOT.xonly .AND. .NOT.yonly )call mpp_error( WARNING,  &
+                   'MPP_GLOBAL_FIELD: you must have flags=XUPDATE, YUPDATE or XUPDATE+YUPDATE' )
+         if(xonly .AND. yonly) then
+            xonly = .false.; yonly = .false.
+         endif
+         root_only = BTEST(flags, ROOT_GLOBAL)
+         if( (xonly .or. yonly) .AND. root_only ) then
+            call mpp_error( WARNING, 'MPP_GLOBAL_FIELD: flags = XUPDATE+GLOBAL_ROOT_ONLY or ' // &
+                 'flags = YUPDATE+GLOBAL_ROOT_ONLY is not supported, will ignore GLOBAL_ROOT_ONLY' )
+            root_only = .FALSE.
+         endif
+      endif
+
+      global_on_this_pe =  .NOT. root_only .OR. domain%pe == domain%tile_root_pe
+      ipos = 0; jpos = 0
+      if(global_on_this_pe ) then
+         if(n_per_gridpoint_local.ne.n_per_gridpoint_global) then
+           call mpp_error(FATAL, 'MPP_GLOBAL_FIELD: mismatch of global and local dimension sizes')
+         endif
+         if( size(global,ix).NE.(domain%x(tile)%global%size+ishift) .OR. &
+             size(global,iy).NE.(domain%y(tile)%global%size+jshift))then
+            if(xonly) then
+               if(size(global,ix).NE.(domain%x(tile)%global%size+ishift) .OR. &
+                   size(global,iy).NE.(domain%y(tile)%compute%size+jshift)) &
+                  call mpp_error( FATAL, &
+                                 &  'MPP_GLOBAL_FIELD: incoming arrays do not match domain for xonly global field.' )
+               jpos = -domain%y(tile)%compute%begin + 1
+            else if(yonly) then
+               if(size(global,ix).NE.(domain%x(tile)%compute%size+ishift) .OR. &
+                   size(global,iy).NE.(domain%y(tile)%global%size+jshift)) &
+                  call mpp_error( FATAL, &
+                                 &  'MPP_GLOBAL_FIELD: incoming arrays do not match domain for yonly global field.' )
+               ipos = -domain%x(tile)%compute%begin + 1
+            else
+               call mpp_error( FATAL, 'MPP_GLOBAL_FIELD: incoming arrays do not match domain.' )
+            endif
+         endif
+      endif
+
+      if( size(local,ix).EQ.(domain%x(tile)%compute%size+ishift) .AND. &
+          size(local,iy).EQ.(domain%y(tile)%compute%size+jshift) )then
+         !local is on compute domain
+         ioff = -domain%x(tile)%compute%begin + 1
+         joff = -domain%y(tile)%compute%begin + 1
+      else if( size(local,ix).EQ.(domain%x(tile)%memory%size+ishift) .AND. &
+               size(local,iy).EQ.(domain%y(tile)%memory%size+jshift) )then
+         !local is on data domain
+         ioff = -domain%x(tile)%domain_data%begin + 1
+         joff = -domain%y(tile)%domain_data%begin + 1
+      else
+         call mpp_error( FATAL, &
+                       & 'MPP_GLOBAL_FIELD_: incoming field array must match either compute domain or memory domain.')
+      end if
+
+      !ke  = size(local,3)
+      ke  = n_per_gridpoint_local
+      isc = domain%x(tile)%compute%begin; iec = domain%x(tile)%compute%end+ishift
+      jsc = domain%y(tile)%compute%begin; jec = domain%y(tile)%compute%end+jshift
+
+      num_word_me = (iec-isc+1)*(jec-jsc+1)*n_per_gridpoint_local
+
+! make contiguous array from compute domain
+      m = 0
+      if(global_on_this_pe) then
+         !z1l: initialize global = 0 to support mask domain
+         if(present(default_data)) then
+           call arr_init(global, default_data)
+         else
+           call arr_init(global, DEFAULT_VALUE_)
+         endif
+
+        call arr2vec(local, clocal, ix, iy, ioff+isc, ioff+iec, joff+jsc, joff+jec)
+
+        ! Fill local domain directly
+        call vec2arr(clocal, global, ix, iy, ipos0+ipos+isc, ipos0+ipos+iec, jpos0+jpos+jsc, jpos0+jpos+jec)
+      else
+        call arr2vec(local, clocal, ix, iy, ioff+isc, ioff+iec, joff+jsc, joff+jec)
+      endif
+
+! if there is more than one tile on this pe, then no decomposition for all tiles on this pe, so we can just return
+      if(size(domain%x(:))>1) then
+         !--- the following is needed to avoid deadlock.
+         if( tile == size(domain%x(:)) ) call mpp_sync_self( )
+         return
+      end if
+
+      root_pe = mpp_root_pe()
+
+!fill off-domains (note loops begin at an offset of 1)
+      if( xonly )then
+          nd = size(domain%x(1)%list(:))
+          do n = 1,nd-1
+             lpos = mod(domain%x(1)%pos+nd-n,nd)
+             rpos = mod(domain%x(1)%pos   +n,nd)
+             from_pe = domain%x(1)%list(rpos)%pe
+             rpos = from_pe - root_pe ! for concurrent run, root_pe may not be 0.
+             if (from_pe == NULL_PE) then
+                num_words = 0
+             else
+                num_words = (domain%list(rpos)%x(1)%compute%size+ishift) &
+                       * (domain%list(rpos)%y(1)%compute%size+jshift) * ke
+             endif
+           ! Force use of scalar, integer ptr interface
+             call mpp_transmit( put_data=clocal(1), plen=num_word_me, to_pe=domain%x(1)%list(lpos)%pe, &
+                                get_data=cremote(1), glen=num_words, from_pe=from_pe )
+             m = 0
+             if (from_pe /= NULL_PE) then
+                is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift
+                call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+jpos+jsc, jpos0+jpos+jec)
+             endif
+             call mpp_sync_self()  !-ensure MPI_ISEND is done.
+          end do
+      else if( yonly )then
+          nd = size(domain%y(1)%list(:))
+          do n = 1,nd-1
+             lpos = mod(domain%y(1)%pos+nd-n,nd)
+             rpos = mod(domain%y(1)%pos   +n,nd)
+             from_pe = domain%y(1)%list(rpos)%pe
+             rpos = from_pe - root_pe
+             if (from_pe == NULL_PE) then
+                num_words = 0
+             else
+                num_words = (domain%list(rpos)%x(1)%compute%size+ishift) &
+                       * (domain%list(rpos)%y(1)%compute%size+jshift) * ke
+             endif
+           ! Force use of scalar, integer pointer interface
+             call mpp_transmit( put_data=clocal(1), plen=num_word_me, to_pe=domain%y(1)%list(lpos)%pe, &
+                                get_data=cremote(1), glen=num_words, from_pe=from_pe )
+             m = 0
+             if (from_pe /= NULL_PE) then
+               js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift
+               call vec2arr(cremote, global, ix, iy, ipos0+ipos+isc, ipos0+ipos+iec, jpos0+js, jpos0+je)
+             endif
+             call mpp_sync_self()  !-ensure MPI_ISEND is done.
+          end do
+      else
+         tile_id = domain%tile_id(1)
+         nd = size(domain%list(:))
+         if(root_only) then
+            if(domain%pe .NE. domain%tile_root_pe) then
+               call mpp_send( clocal(1), plen=num_word_me, to_pe=domain%tile_root_pe, tag=COMM_TAG_1 )
+            else
+               do n = 1,nd-1
+                  rpos = mod(domain%pos+n,nd)
+                  if( domain%list(rpos)%tile_id(1) .NE. tile_id ) cycle
+                  num_words = (domain%list(rpos)%x(1)%compute%size+ishift) * &
+                            & (domain%list(rpos)%y(1)%compute%size+jshift) * ke
+                  call mpp_recv(cremote(1), glen=num_words, from_pe=domain%list(rpos)%pe, tag=COMM_TAG_1 )
+                  m = 0
+                  is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift
+                  js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift
+
+                  call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+js, jpos0+je)
+               end do
+            endif
+         else
+            do n = 1,nd-1
+               lpos = mod(domain%pos+nd-n,nd)
+               if( domain%list(lpos)%tile_id(1).NE. tile_id ) cycle ! global field only within tile
+               call mpp_send( clocal(1), plen=num_word_me, to_pe=domain%list(lpos)%pe, tag=COMM_TAG_2 )
+            end do
+            do n = 1,nd-1
+               rpos = mod(domain%pos+n,nd)
+               if( domain%list(rpos)%tile_id(1) .NE. tile_id ) cycle ! global field only within tile
+               num_words = (domain%list(rpos)%x(1)%compute%size+ishift) * &
+                         & (domain%list(rpos)%y(1)%compute%size+jshift) * ke
+               call mpp_recv( cremote(1), glen=num_words, from_pe=domain%list(rpos)%pe, tag=COMM_TAG_2 )
+               m = 0
+               is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift
+               js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift
+
+               call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+js, jpos0+je)
+            end do
+         endif
+      end if
+
+      call mpp_sync_self
+    end subroutine MPP_GLOBAL_FIELD_4D_
+
+      !> Gets a global field from a local field
+      !! local field may be on compute OR data domain
+    subroutine MPP_GLOBAL_FIELD_5D_( domain, local, global, flags, position, tile_count, default_data, xdim, ydim)
+      type(domain2D), intent(in)    :: domain
+      MPP_TYPE_, intent(in)         ::  local(:,:,:,:,:)
+      MPP_TYPE_, intent(out)        :: global(:,:,:,:,:)
+      integer, intent(in), optional :: flags
+      integer, intent(in), optional :: position
+      integer, intent(in), optional :: tile_count
+      MPP_TYPE_, intent(in), optional :: default_data
+      integer, intent(in), optional :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical
+                                                  !! Fortran-based code, xdim=1 and ydim=2.
+
+      integer :: i, j, k, m, n, nd, num_words, lpos, rpos, ioff, joff, from_pe, root_pe, tile_id
+      integer :: ke, isc, iec, jsc, jec, is, ie, js, je, num_word_me
+      integer :: ipos, jpos, n_per_gridpoint_local, n_per_gridpoint_global
+      logical :: xonly, yonly, root_only, global_on_this_pe
+      integer :: ix, iy
+      MPP_TYPE_, pointer, dimension(:) :: clocal, cremote
+      integer :: stackuse
+      character(len=8) :: text
+      type(c_ptr) :: stack_cptr !< Workaround for GFortran bug
+
+      integer :: tile, ishift, jshift, ipos0, jpos0
+      integer :: size_clocal, size_cremote
+
+      tile = 1
+      if(present(tile_count)) tile = tile_count
+
+      call mpp_get_domain_shift(domain, ishift, jshift, position)
+
+      ipos0 = -domain%x(tile)%global%begin + 1
+      jpos0 = -domain%y(tile)%global%begin + 1
+
+      if (present(xdim)) then
+        ix = xdim
+      else
+        ix = 1
+      endif
+
+      if (present(ydim)) then
+        iy = ydim
+      else
+        iy = 2
+      endif
+
+      n_per_gridpoint_local = size(local) / (size(local,ix) * size(local,iy))
+      n_per_gridpoint_global = size(global) / (size(global,ix) * size(global,iy))
+      size_clocal = (domain%x(1)%compute%size+ishift) * (domain%y(1)%compute%size+jshift) * n_per_gridpoint_local
+      size_cremote = (domain%x(1)%compute%max_size+ishift) * (domain%y(1)%compute%max_size+jshift) * &
+                     n_per_gridpoint_local
+
+      stackuse = size_clocal + size_cremote
+      if( stackuse.GT.mpp_domains_stack_size )then
+          write( text, '(i8)' )stackuse
+          call mpp_error( FATAL, &
+               'MPP_DO_GLOBAL_FIELD user stack overflow: call mpp_domains_set_stack_size('//trim(text)// &
+               & ') from all PEs.' )
+      end if
+      mpp_domains_stack_hwm = max( mpp_domains_stack_hwm, stackuse )
+
+      stack_cptr = c_loc(mpp_domains_stack(1))
+      call c_f_pointer(stack_cptr, clocal, [size_clocal])
+
+      stack_cptr = c_loc(mpp_domains_stack(1+size_clocal))
+      call c_f_pointer(stack_cptr, cremote, [size_cremote])
+
+      if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_GLOBAL_FIELD: must first call mpp_domains_init.' )
+
+      xonly = .FALSE.
+      yonly = .FALSE.
+      root_only = .FALSE.
+      if( PRESENT(flags) ) then
+         xonly = BTEST(flags,EAST)
+         yonly = BTEST(flags,SOUTH)
+         if( .NOT.xonly .AND. .NOT.yonly )call mpp_error( WARNING,  &
+                   'MPP_GLOBAL_FIELD: you must have flags=XUPDATE, YUPDATE or XUPDATE+YUPDATE' )
+         if(xonly .AND. yonly) then
+            xonly = .false.; yonly = .false.
+         endif
+         root_only = BTEST(flags, ROOT_GLOBAL)
+         if( (xonly .or. yonly) .AND. root_only ) then
+            call mpp_error( WARNING, 'MPP_GLOBAL_FIELD: flags = XUPDATE+GLOBAL_ROOT_ONLY or ' // &
+                 'flags = YUPDATE+GLOBAL_ROOT_ONLY is not supported, will ignore GLOBAL_ROOT_ONLY' )
+            root_only = .FALSE.
+         endif
+      endif
+
+      global_on_this_pe =  .NOT. root_only .OR. domain%pe == domain%tile_root_pe
+      ipos = 0; jpos = 0
+      if(global_on_this_pe ) then
+         if(n_per_gridpoint_local.ne.n_per_gridpoint_global) then
+           call mpp_error(FATAL, 'MPP_GLOBAL_FIELD: mismatch of global and local dimension sizes')
+         endif
+         if( size(global,ix).NE.(domain%x(tile)%global%size+ishift) .OR. &
+             size(global,iy).NE.(domain%y(tile)%global%size+jshift))then
+            if(xonly) then
+               if(size(global,ix).NE.(domain%x(tile)%global%size+ishift) .OR. &
+                   size(global,iy).NE.(domain%y(tile)%compute%size+jshift)) &
+                  call mpp_error( FATAL, &
+                                 &  'MPP_GLOBAL_FIELD: incoming arrays do not match domain for xonly global field.' )
+               jpos = -domain%y(tile)%compute%begin + 1
+            else if(yonly) then
+               if(size(global,ix).NE.(domain%x(tile)%compute%size+ishift) .OR. &
+                   size(global,iy).NE.(domain%y(tile)%global%size+jshift)) &
+                  call mpp_error( FATAL, &
+                                 &  'MPP_GLOBAL_FIELD: incoming arrays do not match domain for yonly global field.' )
+               ipos = -domain%x(tile)%compute%begin + 1
+            else
+               call mpp_error( FATAL, 'MPP_GLOBAL_FIELD: incoming arrays do not match domain.' )
+            endif
+         endif
+      endif
+
+      if( size(local,ix).EQ.(domain%x(tile)%compute%size+ishift) .AND. &
+          size(local,iy).EQ.(domain%y(tile)%compute%size+jshift) )then
+         !local is on compute domain
+         ioff = -domain%x(tile)%compute%begin + 1
+         joff = -domain%y(tile)%compute%begin + 1
+      else if( size(local,ix).EQ.(domain%x(tile)%memory%size+ishift) .AND. &
+               size(local,iy).EQ.(domain%y(tile)%memory%size+jshift) )then
+         !local is on data domain
+         ioff = -domain%x(tile)%domain_data%begin + 1
+         joff = -domain%y(tile)%domain_data%begin + 1
+      else
+         call mpp_error( FATAL, &
+                       & 'MPP_GLOBAL_FIELD_: incoming field array must match either compute domain or memory domain.')
+      end if
+
+      !ke  = size(local,3)
+      ke  = n_per_gridpoint_local
+      isc = domain%x(tile)%compute%begin; iec = domain%x(tile)%compute%end+ishift
+      jsc = domain%y(tile)%compute%begin; jec = domain%y(tile)%compute%end+jshift
+
+      num_word_me = (iec-isc+1)*(jec-jsc+1)*n_per_gridpoint_local
+
+! make contiguous array from compute domain
+      m = 0
+      if(global_on_this_pe) then
+         !z1l: initialize global = 0 to support mask domain
+         if(present(default_data)) then
+           call arr_init(global, default_data)
+         else
+           call arr_init(global, DEFAULT_VALUE_)
+         endif
+
+        call arr2vec(local, clocal, ix, iy, ioff+isc, ioff+iec, joff+jsc, joff+jec)
+
+        ! Fill local domain directly
+        call vec2arr(clocal, global, ix, iy, ipos0+ipos+isc, ipos0+ipos+iec, jpos0+jpos+jsc, jpos0+jpos+jec)
+      else
+        call arr2vec(local, clocal, ix, iy, ioff+isc, ioff+iec, joff+jsc, joff+jec)
+      endif
+
+! if there is more than one tile on this pe, then no decomposition for all tiles on this pe, so we can just return
+      if(size(domain%x(:))>1) then
+         !--- the following is needed to avoid deadlock.
+         if( tile == size(domain%x(:)) ) call mpp_sync_self( )
+         return
+      end if
+
+      root_pe = mpp_root_pe()
+
+!fill off-domains (note loops begin at an offset of 1)
+      if( xonly )then
+          nd = size(domain%x(1)%list(:))
+          do n = 1,nd-1
+             lpos = mod(domain%x(1)%pos+nd-n,nd)
+             rpos = mod(domain%x(1)%pos   +n,nd)
+             from_pe = domain%x(1)%list(rpos)%pe
+             rpos = from_pe - root_pe ! for concurrent run, root_pe may not be 0.
+             if (from_pe == NULL_PE) then
+                num_words = 0
+             else
+                num_words = (domain%list(rpos)%x(1)%compute%size+ishift) &
+                       * (domain%list(rpos)%y(1)%compute%size+jshift) * ke
+             endif
+           ! Force use of scalar, integer ptr interface
+             call mpp_transmit( put_data=clocal(1), plen=num_word_me, to_pe=domain%x(1)%list(lpos)%pe, &
+                                get_data=cremote(1), glen=num_words, from_pe=from_pe )
+             m = 0
+             if (from_pe /= NULL_PE) then
+                is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift
+                call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+jpos+jsc, jpos0+jpos+jec)
+             endif
+             call mpp_sync_self()  !-ensure MPI_ISEND is done.
+          end do
+      else if( yonly )then
+          nd = size(domain%y(1)%list(:))
+          do n = 1,nd-1
+             lpos = mod(domain%y(1)%pos+nd-n,nd)
+             rpos = mod(domain%y(1)%pos   +n,nd)
+             from_pe = domain%y(1)%list(rpos)%pe
+             rpos = from_pe - root_pe
+             if (from_pe == NULL_PE) then
+                num_words = 0
+             else
+                num_words = (domain%list(rpos)%x(1)%compute%size+ishift) &
+                       * (domain%list(rpos)%y(1)%compute%size+jshift) * ke
+             endif
+           ! Force use of scalar, integer pointer interface
+             call mpp_transmit( put_data=clocal(1), plen=num_word_me, to_pe=domain%y(1)%list(lpos)%pe, &
+                                get_data=cremote(1), glen=num_words, from_pe=from_pe )
+             m = 0
+             if (from_pe /= NULL_PE) then
+               js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift
+               call vec2arr(cremote, global, ix, iy, ipos0+ipos+isc, ipos0+ipos+iec, jpos0+js, jpos0+je)
+             endif
+             call mpp_sync_self()  !-ensure MPI_ISEND is done.
+          end do
+      else
+         tile_id = domain%tile_id(1)
+         nd = size(domain%list(:))
+         if(root_only) then
+            if(domain%pe .NE. domain%tile_root_pe) then
+               call mpp_send( clocal(1), plen=num_word_me, to_pe=domain%tile_root_pe, tag=COMM_TAG_1 )
+            else
+               do n = 1,nd-1
+                  rpos = mod(domain%pos+n,nd)
+                  if( domain%list(rpos)%tile_id(1) .NE. tile_id ) cycle
+                  num_words = (domain%list(rpos)%x(1)%compute%size+ishift) * &
+                            & (domain%list(rpos)%y(1)%compute%size+jshift) * ke
+                  call mpp_recv(cremote(1), glen=num_words, from_pe=domain%list(rpos)%pe, tag=COMM_TAG_1 )
+                  m = 0
+                  is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift
+                  js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift
+
+                  call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+js, jpos0+je)
+               end do
+            endif
+         else
+            do n = 1,nd-1
+               lpos = mod(domain%pos+nd-n,nd)
+               if( domain%list(lpos)%tile_id(1).NE. tile_id ) cycle ! global field only within tile
+               call mpp_send( clocal(1), plen=num_word_me, to_pe=domain%list(lpos)%pe, tag=COMM_TAG_2 )
+            end do
+            do n = 1,nd-1
+               rpos = mod(domain%pos+n,nd)
+               if( domain%list(rpos)%tile_id(1) .NE. tile_id ) cycle ! global field only within tile
+               num_words = (domain%list(rpos)%x(1)%compute%size+ishift) * &
+                         & (domain%list(rpos)%y(1)%compute%size+jshift) * ke
+               call mpp_recv( cremote(1), glen=num_words, from_pe=domain%list(rpos)%pe, tag=COMM_TAG_2 )
+               m = 0
+               is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift
+               js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift
+
+               call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+js, jpos0+je)
+            end do
+         endif
+      end if
+
+      call mpp_sync_self
+    end subroutine MPP_GLOBAL_FIELD_5D_
+!> @}
diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh
index 8a10b1eb13..9275ce635b 100644
--- a/mpp/include/mpp_group_update.fh
+++ b/mpp/include/mpp_group_update.fh
@@ -424,20 +424,22 @@ subroutine MPP_CREATE_GROUP_UPDATE_4D_V_( group, fieldx, fieldy, domain, flags,
 end subroutine MPP_CREATE_GROUP_UPDATE_4D_V_
 
 
-subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
+subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload)
   type(mpp_group_update_type), intent(inout) :: group
   type(domain2D),              intent(inout) :: domain
   MPP_TYPE_,                   intent(in)    :: d_type
+  logical, optional, intent(in) :: omp_offload
 
   integer   :: nscalar, nvector, nlist
   logical   :: recv_y(8)
   integer   :: nsend, nrecv, flags_v
   integer   :: msgsize
-  integer   :: from_pe, to_pe, buffer_pos, pos
+  integer   :: from_pe, to_pe, buffer_pos, pos, idx
   integer   :: ksize, is, ie, js, je
-  integer   :: n, l, m, i, j, k, buffer_start_pos, nk
+  integer   :: n, l, m, i, j, k, buffer_start_pos, ni, nj, nk
   integer   :: shift, gridtype, midpoint
   integer   :: npack, nunpack, rotation, isd
+  logical   :: use_device_ptr
 
   MPP_TYPE_ :: buffer(mpp_domains_stack_size)
   MPP_TYPE_ :: field (group%is_s:group%ie_s,group%js_s:group%je_s, group%ksize_s)
@@ -453,6 +455,9 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
   nlist   = size(domain%list(:))
   gridtype = group%gridtype
 
+  use_device_ptr = .false.
+  if (present(omp_offload)) use_device_ptr = omp_offload
+
   !--- ksize_s must equal ksize_v
   if(nvector > 0 .AND. nscalar > 0) then
      if(group%ksize_s .NE. group%ksize_v) then
@@ -481,13 +486,16 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
 
   !---pre-post receive.
   call mpp_clock_begin(group_recv_clock)
+#ifdef __NVCOMPILER_OPENMP_GPU
+  !$omp target enter data map(alloc: buffer) if(use_device_ptr)
+#endif
   do m = 1, nrecv
      msgsize = group%recv_size(m)
      from_pe = group%from_pe(m)
      if( msgsize .GT. 0 )then
         buffer_pos = group%buffer_pos_recv(m)
         call mpp_recv( buffer(buffer_pos+1), glen=msgsize, from_pe=from_pe, block=.false., &
-             tag=COMM_TAG_1)
+             tag=COMM_TAG_1, omp_offload=omp_offload)
      end if
   end do
 
@@ -500,7 +508,19 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
   call mpp_clock_begin(group_pack_clock)
   !pack the data
   buffer_start_pos = 0
+  ! below switch runs OpenMP offloaded packing if ompoffload is .true. and compiled with
+  ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined
+  if (use_device_ptr) then
+#include <group_update_pack.inc>
+  else
+#ifdef __NVCOMPILER_OPENMP_GPU
+#undef __NVCOMPILER_OPENMP_GPU
+#include <group_update_pack.inc>
+#define __NVCOMPILER_OPENMP_GPU
+#else
 #include <group_update_pack.inc>
+#endif
+  endif
   call mpp_clock_end(group_pack_clock)
 
   call mpp_clock_begin(group_send_clock)
@@ -509,7 +529,7 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
      if( msgsize .GT. 0 )then
         buffer_pos = group%buffer_pos_send(n)
         to_pe = group%to_pe(n)
-        call mpp_send( buffer(buffer_pos+1), plen=msgsize, to_pe=to_pe, tag=COMM_TAG_1)
+        call mpp_send( buffer(buffer_pos+1), plen=msgsize, to_pe=to_pe, tag=COMM_TAG_1, omp_offload=omp_offload)
      endif
   enddo
   call mpp_clock_end(group_send_clock)
@@ -523,10 +543,24 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
   !---unpack the buffer
   nunpack = group%nunpack
   call mpp_clock_begin(group_unpk_clock)
+  ! below switch runs OpenMP offloaded unpacking if ompoffload is .true. and compiled with
+  ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined
+  if (use_device_ptr) then
+#include <group_update_unpack.inc>
+  else
+#ifdef __NVCOMPILER_OPENMP_GPU
+#undef __NVCOMPILER_OPENMP_GPU
+#include <group_update_unpack.inc>
+#define __NVCOMPILER_OPENMP_GPU
+  !$omp target exit data map(release: buffer) if(use_device_ptr)
+#else
 #include <group_update_unpack.inc>
+#endif
+  endif
   call mpp_clock_end(group_unpk_clock)
 
   ! ---northern boundary fold
+  ! below omp offload loops have maps excluded as gcc doesn't support cray pointers in maps
   shift = 0
   if(domain%symmetry) shift = 1
   if( nvector >0 .AND. BTEST(domain%fold,NORTH) .AND. (.NOT.BTEST(flags_v,SCALAR_BIT)) )then
@@ -538,15 +572,25 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
            j  = domain%y(1)%global%end+shift
            is = domain%x(1)%global%begin; ie = domain%x(1)%global%end+shift
            if( .NOT. domain%symmetry ) is = is - 1
+           isd = domain%x(1)%domain_data%begin
            do i = is ,ie, midpoint
-              if( domain%x(1)%domain_data%begin.LE.i .AND. i.LE. domain%x(1)%domain_data%end+shift )then
+              if( isd.LE.i .AND. i.LE. domain%x(1)%domain_data%end+shift )then
                  do l=1,nvector
                     ptr_fieldx = group%addrs_x(l)
                     ptr_fieldy = group%addrs_y(l)
-                    do k = 1,ksize
-                       fieldx(i,j,k) = 0.
-                       fieldy(i,j,k) = 0.
-                    end do
+                    if (use_device_ptr) then
+                       !$omp target teams distribute parallel do default(shared) &
+                       !$omp   shared(ksize, ptr_fieldx, ptr_fieldy)
+                       do k = 1,ksize
+                          fieldx(i,j,k) = 0.
+                          fieldy(i,j,k) = 0.
+                       end do
+                    else
+                       do k = 1,ksize
+                          fieldx(i,j,k) = 0.
+                          fieldy(i,j,k) = 0.
+                       end do
+                    end if
                  end do
               end if
            end do
@@ -563,19 +607,31 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
               else
                  is = domain%x(1)%global%begin - 1
               end if
-              if( is.GT.domain%x(1)%domain_data%begin )then
+              isd = domain%x(1)%domain_data%begin
+              if( is.GT.isd )then
 
-                 if( 2*is-domain%x(1)%domain_data%begin.GT.domain%x(1)%domain_data%end+shift ) &
+                 if( 2*is-isd.GT.domain%x(1)%domain_data%end+shift ) &
                       call mpp_error( FATAL, 'MPP_DO_UPDATE_V: folded-north BGRID_NE west edge ubound error.' )
                  do l=1,nvector
                     ptr_fieldx = group%addrs_x(l)
                     ptr_fieldy = group%addrs_y(l)
-                    do k = 1,ksize
-                       do i = domain%x(1)%domain_data%begin,is-1
-                          fieldx(i,j,k) = fieldx(2*is-i,j,k)
-                          fieldy(i,j,k) = fieldy(2*is-i,j,k)
+                    if (use_device_ptr) then
+                       !$omp target teams distribute parallel do collapse(2) default(shared) &
+                       !$omp   shared(ksize, isd, is, ptr_fieldx, ptr_fieldy)
+                       do k = 1,ksize
+                          do i = isd,is-1
+                             fieldx(i,j,k) = fieldx(2*is-i,j,k)
+                             fieldy(i,j,k) = fieldy(2*is-i,j,k)
+                          end do
                        end do
-                    end do
+                    else
+                       do k = 1,ksize
+                          do i = isd,is-1
+                             fieldx(i,j,k) = fieldx(2*is-i,j,k)
+                             fieldy(i,j,k) = fieldy(2*is-i,j,k)
+                          end do
+                       end do
+                    endif
                  end do
               end if
            case(CGRID_NE)
@@ -586,11 +642,21 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
                       call mpp_error( FATAL, 'MPP_DO_UPDATE_V: folded-north CGRID_NE west edge ubound error.' )
                  do l=1,nvector
                     ptr_fieldy = group%addrs_y(l)
-                    do k = 1,ksize
-                       do i = isd,is-1
-                          fieldy(i,j,k) = fieldy(2*is-i-1,j,k)
+                    if (use_device_ptr) then
+                       !$omp target teams distribute parallel do collapse(2) default(shared) &
+                       !$omp   shared(ksize, isd, is, ptr_fieldy)
+                       do k = 1,ksize
+                          do i = isd,is-1
+                             fieldy(i,j,k) = fieldy(2*is-i-1,j,k)
+                          end do
                        end do
-                    end do
+                    else
+                       do k = 1,ksize
+                          do i = isd,is-1
+                             fieldy(i,j,k) = fieldy(2*is-i-1,j,k)
+                          end do
+                       end do
+                    end if
                  end do
               end if
            end select
@@ -607,21 +673,42 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
               do l=1,nvector
                  ptr_fieldx = group%addrs_x(l)
                  ptr_fieldy = group%addrs_y(l)
-                 do k = 1,ksize
-                    do i = is,ie
-                       fieldx(i,j,k) = -fieldx(i,j,k)
-                       fieldy(i,j,k) = -fieldy(i,j,k)
+                 if (use_device_ptr) then
+                    !$omp target teams distribute parallel do collapse(2) default(shared) &
+                    !$omp   shared(ksize, is, ie, ptr_fieldx, ptr_fieldy)
+                    do k = 1,ksize
+                       do i = is,ie
+                          fieldx(i,j,k) = -fieldx(i,j,k)
+                          fieldy(i,j,k) = -fieldy(i,j,k)
+                       end do
                     end do
-                 end do
+                 else
+                    do k = 1,ksize
+                       do i = is,ie
+                          fieldx(i,j,k) = -fieldx(i,j,k)
+                          fieldy(i,j,k) = -fieldy(i,j,k)
+                       end do
+                    end do
+                 end if
               end do
            case(CGRID_NE)
               do l=1,nvector
                  ptr_fieldy = group%addrs_y(l)
-                 do k = 1,ksize
-                    do i = is, ie
-                       fieldy(i,j,k) = -fieldy(i,j,k)
+                 if (use_device_ptr) then
+                    !$omp target teams distribute parallel do collapse(2) default(shared) &
+                    !$omp   shared(ksize, is, ie, ptr_fieldy)
+                    do k = 1,ksize
+                       do i = is, ie
+                          fieldy(i,j,k) = -fieldy(i,j,k)
+                       end do
                     end do
-                 end do
+                 else
+                    do k = 1,ksize
+                       do i = is, ie
+                          fieldy(i,j,k) = -fieldy(i,j,k)
+                       end do
+                    end do
+                 end if
               end do
            end select
         end if
@@ -649,10 +736,11 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer)
   integer   :: nscalar, nvector
   integer   :: nsend, nrecv, flags_v
   integer   :: msgsize, npack, rotation
-  integer   :: from_pe, to_pe, buffer_pos, pos
+  integer   :: from_pe, to_pe, buffer_pos, pos, idx
   integer   :: ksize, is, ie, js, je
-  integer   :: n, l, m, i, j, k, buffer_start_pos, nk
+  integer   :: n, l, m, i, j, k, buffer_start_pos, ni, nj, nk
   logical   :: reuse_buf_pos
+  logical, parameter :: use_device_ptr = .false. ! placeholder
   character(len=8)            :: text
 
   MPP_TYPE_ :: buffer(size(mpp_domains_stack_nonblock(:)))
@@ -729,7 +817,19 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer)
   call mpp_clock_begin(nonblock_group_pack_clock)
   npack = group%npack
   buffer_start_pos = group%buffer_start_pos
+  ! below switch runs OpenMP offloaded packing if ompoffload is .true. and compiled with
+  ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined
+  if (use_device_ptr) then
+#include <group_update_pack.inc>
+  else
+#ifdef __NVCOMPILER_OPENMP_GPU
+#undef __NVCOMPILER_OPENMP_GPU
+#include <group_update_pack.inc>
+#define __NVCOMPILER_OPENMP_GPU
+#else
 #include <group_update_pack.inc>
+#endif
+  endif
   call mpp_clock_end(nonblock_group_pack_clock)
 
   call mpp_clock_begin(nonblock_group_send_clock)
@@ -752,11 +852,12 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type)
   MPP_TYPE_,                   intent(in)    :: d_type
 
   integer   :: nsend, nrecv, nscalar, nvector
-  integer   :: k, buffer_pos, pos, m, n, l
+  integer   :: k, buffer_pos, pos, m, n, l, idx
   integer   :: is, ie, js, je, ksize, i, j
   integer   :: shift, gridtype, midpoint, flags_v
-  integer   :: nunpack, rotation, buffer_start_pos, nk, isd
+  integer   :: nunpack, rotation, buffer_start_pos, ni, nj, nk, isd
   logical   :: recv_y(8)
+  logical, parameter :: use_device_ptr = .false. ! placeholder
   MPP_TYPE_ :: buffer(size(mpp_domains_stack_nonblock(:)))
   MPP_TYPE_ :: field (group%is_s:group%ie_s,group%js_s:group%je_s, group%ksize_s)
   MPP_TYPE_ :: fieldx(group%is_x:group%ie_x,group%js_x:group%je_x, group%ksize_v)
@@ -797,7 +898,20 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type)
 
   call mpp_clock_begin(nonblock_group_unpk_clock)
   buffer_start_pos = group%buffer_start_pos
+  ! below switch runs OpenMP offloaded unpacking if ompoffload is .true. and compiled with
+  ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined
+  if (use_device_ptr) then
 #include <group_update_unpack.inc>
+  else
+#ifdef __NVCOMPILER_OPENMP_GPU
+#undef __NVCOMPILER_OPENMP_GPU
+#include <group_update_unpack.inc>
+#define __NVCOMPILER_OPENMP_GPU
+  !$omp target exit data map(release: buffer) if(use_device_ptr)
+#else
+#include <group_update_unpack.inc>
+#endif
+  endif
   call mpp_clock_end(nonblock_group_unpk_clock)
 
   ! ---northern boundary fold
diff --git a/mpp/include/mpp_pack.inc b/mpp/include/mpp_pack.inc
index 1e8b9164d6..70e1c52e6b 100644
--- a/mpp/include/mpp_pack.inc
+++ b/mpp/include/mpp_pack.inc
@@ -1,3 +1,5 @@
+#ifndef __NVCOMPILER
+
 #undef MPP_TYPE_
 #define MPP_TYPE_ real(r8_kind)
 #undef ARR2VEC_
@@ -81,3 +83,251 @@
 #undef ARR_INIT_
 #define ARR_INIT_ arr_init_l4
 #include <mpp_pack.fh>
+
+#else
+
+#undef MPP_TYPE_
+#define MPP_TYPE_ real(r8_kind)
+#undef ARR2VEC_2D
+#define ARR2VEC_2D arr2vec_r8_2d
+#undef ARR2VEC_3D
+#define ARR2VEC_3D arr2vec_r8_3d
+#undef ARR2VEC_4D
+#define ARR2VEC_4D arr2vec_r8_4d
+#undef ARR2VEC_5D
+#define ARR2VEC_5D arr2vec_r8_5d
+#undef VEC2ARR_2D
+#define VEC2ARR_2D vec2arr_r8_2d
+#undef VEC2ARR_3D
+#define VEC2ARR_3D vec2arr_r8_3d
+#undef VEC2ARR_4D
+#define VEC2ARR_4D vec2arr_r8_4d
+#undef VEC2ARR_5D
+#define VEC2ARR_5D vec2arr_r8_5d
+#undef ARR_INIT_1D
+#define ARR_INIT_1D arr_init_r8_1d
+#undef ARR_INIT_2D
+#define ARR_INIT_2D arr_init_r8_2d
+#undef ARR_INIT_3D
+#define ARR_INIT_3D arr_init_r8_3d
+#undef ARR_INIT_4D
+#define ARR_INIT_4D arr_init_r8_4d
+#undef ARR_INIT_5D
+#define ARR_INIT_5D arr_init_r8_5d
+#include <mpp_pack_compat.fh>
+
+#ifdef OVERLOAD_C8
+#undef MPP_TYPE_
+#define MPP_TYPE_ complex(c8_kind)
+#undef ARR2VEC_2D
+#define ARR2VEC_2D arr2vec_c8_2d
+#undef ARR2VEC_3D
+#define ARR2VEC_3D arr2vec_c8_3d
+#undef ARR2VEC_4D
+#define ARR2VEC_4D arr2vec_c8_4d
+#undef ARR2VEC_5D
+#define ARR2VEC_5D arr2vec_c8_5d
+#undef VEC2ARR_2D
+#define VEC2ARR_2D vec2arr_c8_2d
+#undef VEC2ARR_3D
+#define VEC2ARR_3D vec2arr_c8_3d
+#undef VEC2ARR_4D
+#define VEC2ARR_4D vec2arr_c8_4d
+#undef VEC2ARR_5D
+#define VEC2ARR_5D vec2arr_c8_5d
+#undef ARR_INIT_1D
+#define ARR_INIT_1D arr_init_c8_1d
+#undef ARR_INIT_2D
+#define ARR_INIT_2D arr_init_c8_2d
+#undef ARR_INIT_3D
+#define ARR_INIT_3D arr_init_c8_3d
+#undef ARR_INIT_4D
+#define ARR_INIT_4D arr_init_c8_4d
+#undef ARR_INIT_5D
+#define ARR_INIT_5D arr_init_c8_5d
+#include <mpp_pack_compat.fh>
+#endif
+
+#undef MPP_TYPE_
+#define MPP_TYPE_ integer(i8_kind)
+#undef ARR2VEC_2D
+#define ARR2VEC_2D arr2vec_i8_2d
+#undef ARR2VEC_3D
+#define ARR2VEC_3D arr2vec_i8_3d
+#undef ARR2VEC_4D
+#define ARR2VEC_4D arr2vec_i8_4d
+#undef ARR2VEC_5D
+#define ARR2VEC_5D arr2vec_i8_5d
+#undef VEC2ARR_2D
+#define VEC2ARR_2D vec2arr_i8_2d
+#undef VEC2ARR_3D
+#define VEC2ARR_3D vec2arr_i8_3d
+#undef VEC2ARR_4D
+#define VEC2ARR_4D vec2arr_i8_4d
+#undef VEC2ARR_5D
+#define VEC2ARR_5D vec2arr_i8_5d
+#undef ARR_INIT_1D
+#define ARR_INIT_1D arr_init_i8_1d
+#undef ARR_INIT_2D
+#define ARR_INIT_2D arr_init_i8_2d
+#undef ARR_INIT_3D
+#define ARR_INIT_3D arr_init_i8_3d
+#undef ARR_INIT_4D
+#define ARR_INIT_4D arr_init_i8_4d
+#undef ARR_INIT_5D
+#define ARR_INIT_5D arr_init_i8_5d
+#include <mpp_pack_compat.fh>
+
+#undef MPP_TYPE_
+#define MPP_TYPE_ logical(l8_kind)
+#undef ARR2VEC_2D
+#define ARR2VEC_2D arr2vec_l8_2d
+#undef ARR2VEC_3D
+#define ARR2VEC_3D arr2vec_l8_3d
+#undef ARR2VEC_4D
+#define ARR2VEC_4D arr2vec_l8_4d
+#undef ARR2VEC_5D
+#define ARR2VEC_5D arr2vec_l8_5d
+#undef VEC2ARR_2D
+#define VEC2ARR_2D vec2arr_l8_2d
+#undef VEC2ARR_3D
+#define VEC2ARR_3D vec2arr_l8_3d
+#undef VEC2ARR_4D
+#define VEC2ARR_4D vec2arr_l8_4d
+#undef VEC2ARR_5D
+#define VEC2ARR_5D vec2arr_l8_5d
+#undef ARR_INIT_1D
+#define ARR_INIT_1D arr_init_l8_1d
+#undef ARR_INIT_2D
+#define ARR_INIT_2D arr_init_l8_2d
+#undef ARR_INIT_3D
+#define ARR_INIT_3D arr_init_l8_3d
+#undef ARR_INIT_4D
+#define ARR_INIT_4D arr_init_l8_4d
+#undef ARR_INIT_5D
+#define ARR_INIT_5D arr_init_l8_5d
+#include <mpp_pack_compat.fh>
+
+#undef MPP_TYPE_
+#define MPP_TYPE_ real(r4_kind)
+#undef ARR2VEC_2D
+#define ARR2VEC_2D arr2vec_r4_2d
+#undef ARR2VEC_3D
+#define ARR2VEC_3D arr2vec_r4_3d
+#undef ARR2VEC_4D
+#define ARR2VEC_4D arr2vec_r4_4d
+#undef ARR2VEC_5D
+#define ARR2VEC_5D arr2vec_r4_5d
+#undef VEC2ARR_2D
+#define VEC2ARR_2D vec2arr_r4_2d
+#undef VEC2ARR_3D
+#define VEC2ARR_3D vec2arr_r4_3d
+#undef VEC2ARR_4D
+#define VEC2ARR_4D vec2arr_r4_4d
+#undef VEC2ARR_5D
+#define VEC2ARR_5D vec2arr_r4_5d
+#undef ARR_INIT_1D
+#define ARR_INIT_1D arr_init_r4_1d
+#undef ARR_INIT_2D
+#define ARR_INIT_2D arr_init_r4_2d
+#undef ARR_INIT_3D
+#define ARR_INIT_3D arr_init_r4_3d
+#undef ARR_INIT_4D
+#define ARR_INIT_4D arr_init_r4_4d
+#undef ARR_INIT_5D
+#define ARR_INIT_5D arr_init_r4_5d
+#include <mpp_pack_compat.fh>
+
+#ifdef OVERLOAD_C4
+#undef MPP_TYPE_
+#define MPP_TYPE_ complex(c4_kind)
+#undef ARR2VEC_2D
+#define ARR2VEC_2D arr2vec_c4_2d
+#undef ARR2VEC_3D
+#define ARR2VEC_3D arr2vec_c4_3d
+#undef ARR2VEC_4D
+#define ARR2VEC_4D arr2vec_c4_4d
+#undef ARR2VEC_5D
+#define ARR2VEC_5D arr2vec_c4_5d
+#undef VEC2ARR_2D
+#define VEC2ARR_2D vec2arr_c4_2d
+#undef VEC2ARR_3D
+#define VEC2ARR_3D vec2arr_c4_3d
+#undef VEC2ARR_4D
+#define VEC2ARR_4D vec2arr_c4_4d
+#undef VEC2ARR_5D
+#define VEC2ARR_5D vec2arr_c4_5d
+#undef ARR_INIT_1D
+#define ARR_INIT_1D arr_init_c4_1d
+#undef ARR_INIT_2D
+#define ARR_INIT_2D arr_init_c4_2d
+#undef ARR_INIT_3D
+#define ARR_INIT_3D arr_init_c4_3d
+#undef ARR_INIT_4D
+#define ARR_INIT_4D arr_init_c4_4d
+#undef ARR_INIT_5D
+#define ARR_INIT_5D arr_init_c4_5d
+#include <mpp_pack_compat.fh>
+#endif
+
+#undef MPP_TYPE_
+#define MPP_TYPE_ integer(i4_kind)
+#undef ARR2VEC_2D
+#define ARR2VEC_2D arr2vec_i4_2d
+#undef ARR2VEC_3D
+#define ARR2VEC_3D arr2vec_i4_3d
+#undef ARR2VEC_4D
+#define ARR2VEC_4D arr2vec_i4_4d
+#undef ARR2VEC_5D
+#define ARR2VEC_5D arr2vec_i4_5d
+#undef VEC2ARR_2D
+#define VEC2ARR_2D vec2arr_i4_2d
+#undef VEC2ARR_3D
+#define VEC2ARR_3D vec2arr_i4_3d
+#undef VEC2ARR_4D
+#define VEC2ARR_4D vec2arr_i4_4d
+#undef VEC2ARR_5D
+#define VEC2ARR_5D vec2arr_i4_5d
+#undef ARR_INIT_1D
+#define ARR_INIT_1D arr_init_i4_1d
+#undef ARR_INIT_2D
+#define ARR_INIT_2D arr_init_i4_2d
+#undef ARR_INIT_3D
+#define ARR_INIT_3D arr_init_i4_3d
+#undef ARR_INIT_4D
+#define ARR_INIT_4D arr_init_i4_4d
+#undef ARR_INIT_5D
+#define ARR_INIT_5D arr_init_i4_5d
+#include <mpp_pack_compat.fh>
+
+#undef MPP_TYPE_
+#define MPP_TYPE_ logical(l4_kind)
+#undef ARR2VEC_2D
+#define ARR2VEC_2D arr2vec_l4_2d
+#undef ARR2VEC_3D
+#define ARR2VEC_3D arr2vec_l4_3d
+#undef ARR2VEC_4D
+#define ARR2VEC_4D arr2vec_l4_4d
+#undef ARR2VEC_5D
+#define ARR2VEC_5D arr2vec_l4_5d
+#undef VEC2ARR_2D
+#define VEC2ARR_2D vec2arr_l4_2d
+#undef VEC2ARR_3D
+#define VEC2ARR_3D vec2arr_l4_3d
+#undef VEC2ARR_4D
+#define VEC2ARR_4D vec2arr_l4_4d
+#undef VEC2ARR_5D
+#define VEC2ARR_5D vec2arr_l4_5d
+#undef ARR_INIT_1D
+#define ARR_INIT_1D arr_init_l4_1d
+#undef ARR_INIT_2D
+#define ARR_INIT_2D arr_init_l4_2d
+#undef ARR_INIT_3D
+#define ARR_INIT_3D arr_init_l4_3d
+#undef ARR_INIT_4D
+#define ARR_INIT_4D arr_init_l4_4d
+#undef ARR_INIT_5D
+#define ARR_INIT_5D arr_init_l4_5d
+#include <mpp_pack_compat.fh>
+
+#endif
diff --git a/mpp/include/mpp_pack_compat.fh b/mpp/include/mpp_pack_compat.fh
new file mode 100644
index 0000000000..77c32505f3
--- /dev/null
+++ b/mpp/include/mpp_pack_compat.fh
@@ -0,0 +1,385 @@
+!***********************************************************************
+!*                             Apache License 2.0
+!*
+!* This file is part of the GFDL Flexible Modeling System (FMS).
+!*
+!* Licensed under the Apache License, Version 2.0 (the "License");
+!* you may not use this file except in compliance with the License.
+!* You may obtain a copy of the License at
+!*
+!*     http://www.apache.org/licenses/LICENSE-2.0
+!*
+!* FMS is distributed in the hope that it will be useful, but WITHOUT
+!* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied;
+!* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+!* PARTICULAR PURPOSE. See the License for the specific language
+!* governing permissions and limitations under the License.
+!***********************************************************************
+
+      !> @brief This file provides duplicate subroutines for the mpp_pack.fh, specifically for compatibility with
+      !! Fortran compilers that do not support the 2018 standard, specfically deferred rank arrays and select rank
+      !! constructs.
+
+      !> @brief Pack a 2D domain-decomposed array into a contiguous 1D array. This is
+      !!        typically used to prepare a contiguous array which will be transmitted via MPI.
+      subroutine ARR2VEC_2D (arr, vec, xdim, ydim, is, ie, js, je)
+        MPP_TYPE_, intent(in) :: arr(:,:) !< The array to be packed
+        MPP_TYPE_, intent(out) :: vec(:) !< The vector to copy the data into
+        integer, intent(in) :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical Fortran-based
+                                          !! code, xdim=1 and ydim=2.
+        integer, intent(in) :: is, ie, js, je !< Starting and ending indices of the x and y dimensions, expressed
+                                              !! in terms of the 1-based indices seen within ARR2VEC_2D. These indices
+                                              !! may be different from the indices used in the calling code! If `arr`
+                                              !! is defined over the data domain, e.g. dimension(isd:ied,jsd:jed),
+                                              !! and you wish to pack only the compute domain, the following values
+                                              !! should be passed:
+                                              !! is = isc - isd + 1
+                                              !! ie = iec - isd + 1
+                                              !! js = jsc - jsd + 1
+                                              !! je = jec - jsd + 1
+        integer, allocatable, dimension(:) :: lb, ub !< These are 1 and shape(arr) respectively for all dimensions
+                                                     !! except the domain-decomposed dimensions, for which they are
+                                                     !! set according to the is,ie,js,je options.
+        integer :: ndims, n
+        integer :: i1, i2, i3, i4, i5
+
+        ndims = 2
+        allocate (lb(ndims), ub(ndims))
+
+        lb = 1
+        ub = shape(arr)
+
+        lb(xdim) = is
+        lb(ydim) = js
+
+        ub(xdim) = ie
+        ub(ydim) = je
+
+        n = product(ub - lb + 1)
+
+        vec(1:n) = reshape(arr(lb(1):ub(1), lb(2):ub(2)), [n])
+      end subroutine ARR2VEC_2D
+
+      !> @brief Pack a 3D domain-decomposed array into a contiguous 1D array. This is
+      !!        typically used to prepare a contiguous array which will be transmitted via MPI.
+      subroutine ARR2VEC_3D (arr, vec, xdim, ydim, is, ie, js, je)
+        MPP_TYPE_, intent(in) :: arr(:,:,:) !< The array to be packed
+        MPP_TYPE_, intent(out) :: vec(:) !< The vector to copy the data into
+        integer, intent(in) :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical Fortran-based
+                                          !! code, xdim=1 and ydim=2.
+        integer, intent(in) :: is, ie, js, je !< Starting and ending indices of the x and y dimensions, expressed
+                                              !! in terms of the 1-based indices seen within ARR2VEC_3D. These indices
+                                              !! may be different from the indices used in the calling code! If `arr`
+                                              !! is defined over the data domain, e.g. dimension(isd:ied,jsd:jed,...),
+                                              !! and you wish to pack only the compute domain, the following values
+                                              !! should be passed:
+                                              !! is = isc - isd + 1
+                                              !! ie = iec - isd + 1
+                                              !! js = jsc - jsd + 1
+                                              !! je = jec - jsd + 1
+        integer, allocatable, dimension(:) :: lb, ub !< These are 1 and shape(arr) respectively for all dimensions
+                                                     !! except the domain-decomposed dimensions, for which they are
+                                                     !! set according to the is,ie,js,je options.
+        integer :: ndims, n
+        integer :: i1, i2, i3, i4, i5
+
+        ndims = 3
+        allocate (lb(ndims), ub(ndims))
+
+        lb = 1
+        ub = shape(arr)
+
+        lb(xdim) = is
+        lb(ydim) = js
+
+        ub(xdim) = ie
+        ub(ydim) = je
+
+        n = product(ub - lb + 1)
+
+        vec(1:n) = reshape(arr(lb(1):ub(1), lb(2):ub(2), lb(3):ub(3)), [n])
+      end subroutine ARR2VEC_3D
+
+      !> @brief Pack a 4D domain-decomposed array into a contiguous 1D array. This is
+      !!        typically used to prepare a contiguous array which will be transmitted via MPI.
+      subroutine ARR2VEC_4D (arr, vec, xdim, ydim, is, ie, js, je)
+        MPP_TYPE_, intent(in) :: arr(:,:,:,:) !< The array to be packed
+        MPP_TYPE_, intent(out) :: vec(:) !< The vector to copy the data into
+        integer, intent(in) :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical Fortran-based
+                                          !! code, xdim=1 and ydim=2.
+        integer, intent(in) :: is, ie, js, je !< Starting and ending indices of the x and y dimensions, expressed
+                                              !! in terms of the 1-based indices seen within ARR2VEC_4D. These indices
+                                              !! may be different from the indices used in the calling code! If `arr`
+                                              !! is defined over the data domain, e.g. dimension(isd:ied,jsd:jed,...),
+                                              !! and you wish to pack only the compute domain, the following values
+                                              !! should be passed:
+                                              !! is = isc - isd + 1
+                                              !! ie = iec - isd + 1
+                                              !! js = jsc - jsd + 1
+                                              !! je = jec - jsd + 1
+        integer, allocatable, dimension(:) :: lb, ub !< These are 1 and shape(arr) respectively for all dimensions
+                                                     !! except the domain-decomposed dimensions, for which they are
+                                                     !! set according to the is,ie,js,je options.
+        integer :: ndims, n
+        integer :: i1, i2, i3, i4, i5
+
+        ndims = 4
+        allocate (lb(ndims), ub(ndims))
+
+        lb = 1
+        ub = shape(arr)
+
+        lb(xdim) = is
+        lb(ydim) = js
+
+        ub(xdim) = ie
+        ub(ydim) = je
+
+        n = product(ub - lb + 1)
+
+        vec(1:n) = reshape(arr(lb(1):ub(1), lb(2):ub(2), lb(3):ub(3), lb(4):ub(4)), [n])
+      end subroutine ARR2VEC_4D
+
+      !> @brief Pack a 5D domain-decomposed array into a contiguous 1D array. This is
+      !!        typically used to prepare a contiguous array which will be transmitted via MPI.
+      subroutine ARR2VEC_5D (arr, vec, xdim, ydim, is, ie, js, je)
+        MPP_TYPE_, intent(in) :: arr(:,:,:,:,:) !< The array to be packed
+        MPP_TYPE_, intent(out) :: vec(:) !< The vector to copy the data into
+        integer, intent(in) :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical Fortran-based
+                                          !! code, xdim=1 and ydim=2.
+        integer, intent(in) :: is, ie, js, je !< Starting and ending indices of the x and y dimensions, expressed
+                                              !! in terms of the 1-based indices seen within ARR2VEC_5D. These indices
+                                              !! may be different from the indices used in the calling code! If `arr`
+                                              !! is defined over the data domain, e.g. dimension(isd:ied,jsd:jed,...),
+                                              !! and you wish to pack only the compute domain, the following values
+                                              !! should be passed:
+                                              !! is = isc - isd + 1
+                                              !! ie = iec - isd + 1
+                                              !! js = jsc - jsd + 1
+                                              !! je = jec - jsd + 1
+        integer, allocatable, dimension(:) :: lb, ub !< These are 1 and shape(arr) respectively for all dimensions
+                                                     !! except the domain-decomposed dimensions, for which they are
+                                                     !! set according to the is,ie,js,je options.
+        integer :: ndims, n
+        integer :: i1, i2, i3, i4, i5
+
+        ndims = 5
+        allocate (lb(ndims), ub(ndims))
+
+        lb = 1
+        ub = shape(arr)
+
+        lb(xdim) = is
+        lb(ydim) = js
+
+        ub(xdim) = ie
+        ub(ydim) = je
+
+        n = product(ub - lb + 1)
+
+        vec(1:n) = reshape(arr(lb(1):ub(1), lb(2):ub(2), lb(3):ub(3), lb(4):ub(4), lb(5):ub(5)), [n])
+      end subroutine ARR2VEC_5D
+
+      !> @brief Unpack a contiguous 1D array into a 2D array. This is typically used to transform
+      !!        data obtained from an MPI call into the format that is expected by the calling code.
+      subroutine VEC2ARR_2D (vec, arr, xdim, ydim, is, ie, js, je)
+        MPP_TYPE_, intent(in) :: vec(:) !< The 1D array to be unpacked
+        MPP_TYPE_, intent(out) :: arr(:,:) !< The multi-dimensional array to copy the data into
+        integer, intent(in) :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical Fortran-based
+                                          !! code, xdim=1 and ydim=2.
+        integer, intent(in) :: is, ie, js, je !< Starting and ending indices of the x and y dimensions, expressed
+                                              !! in terms of the 1-based indices seen within VEC2ARR_2D. These indices
+                                              !! may be different from the indices used in the calling code! If `arr`
+                                              !! is defined over the data domain, e.g. dimension(isd:ied,jsd:jed),
+                                              !! and you wish to unpack only the compute domain, the following values
+                                              !! should be passed:
+                                              !! is = isc - isd + 1
+                                              !! ie = iec - isd + 1
+                                              !! js = jsc - jsd + 1
+                                              !! je = jec - jsd + 1
+        integer, allocatable, dimension(:) :: lb, ub !< These are 1 and shape(arr) respectively for all dimensions
+                                                     !! except the domain-decomposed dimensions, for which they are
+                                                     !! set according to the is,ie,js,je options.
+        integer :: ndims, n
+        integer :: i1, i2, i3, i4, i5
+
+        ndims = 2
+        allocate (lb(ndims), ub(ndims))
+
+        lb = 1
+        ub = shape(arr)
+
+        lb(xdim) = is
+        lb(ydim) = js
+
+        ub(xdim) = ie
+        ub(ydim) = je
+
+        associate (s => ub - lb + 1)
+          n = product(s)
+          arr(lb(1):ub(1), lb(2):ub(2)) = reshape(vec(1:n), s(1:2))
+        end associate
+      end subroutine VEC2ARR_2D
+
+      !> @brief Unpack a contiguous 1D array into a 3D array. This is typically used to transform
+      !!        data obtained from an MPI call into the format that is expected by the calling code.
+      subroutine VEC2ARR_3D (vec, arr, xdim, ydim, is, ie, js, je)
+        MPP_TYPE_, intent(in) :: vec(:) !< The 1D array to be unpacked
+        MPP_TYPE_, intent(out) :: arr(:,:,:) !< The multi-dimensional array to copy the data into
+        integer, intent(in) :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical Fortran-based
+                                          !! code, xdim=1 and ydim=2.
+        integer, intent(in) :: is, ie, js, je !< Starting and ending indices of the x and y dimensions, expressed
+                                              !! in terms of the 1-based indices seen within VEC2ARR_3D. These indices
+                                              !! may be different from the indices used in the calling code! If `arr`
+                                              !! is defined over the data domain, e.g. dimension(isd:ied,jsd:jed,...),
+                                              !! and you wish to unpack only the compute domain, the following values
+                                              !! should be passed:
+                                              !! is = isc - isd + 1
+                                              !! ie = iec - isd + 1
+                                              !! js = jsc - jsd + 1
+                                              !! je = jec - jsd + 1
+        integer, allocatable, dimension(:) :: lb, ub !< These are 1 and shape(arr) respectively for all dimensions
+                                                     !! except the domain-decomposed dimensions, for which they are
+                                                     !! set according to the is,ie,js,je options.
+        integer :: ndims, n
+        integer :: i1, i2, i3, i4, i5
+
+        ndims = 3
+        allocate (lb(ndims), ub(ndims))
+
+        lb = 1
+        ub = shape(arr)
+
+        lb(xdim) = is
+        lb(ydim) = js
+
+        ub(xdim) = ie
+        ub(ydim) = je
+
+        associate (s => ub - lb + 1)
+          n = product(s)
+          arr(lb(1):ub(1), lb(2):ub(2), lb(3):ub(3)) = reshape(vec(1:n), s(1:3))
+        end associate
+      end subroutine VEC2ARR_3D
+
+      !> @brief Unpack a contiguous 1D array into a 4D array. This is typically used to transform
+      !!        data obtained from an MPI call into the format that is expected by the calling code.
+      subroutine VEC2ARR_4D (vec, arr, xdim, ydim, is, ie, js, je)
+        MPP_TYPE_, intent(in) :: vec(:) !< The 1D array to be unpacked
+        MPP_TYPE_, intent(out) :: arr(:,:,:,:) !< The multi-dimensional array to copy the data into
+        integer, intent(in) :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical Fortran-based
+                                          !! code, xdim=1 and ydim=2.
+        integer, intent(in) :: is, ie, js, je !< Starting and ending indices of the x and y dimensions, expressed
+                                              !! in terms of the 1-based indices seen within VEC2ARR_4D. These indices
+                                              !! may be different from the indices used in the calling code! If `arr`
+                                              !! is defined over the data domain, e.g. dimension(isd:ied,jsd:jed,...),
+                                              !! and you wish to unpack only the compute domain, the following values
+                                              !! should be passed:
+                                              !! is = isc - isd + 1
+                                              !! ie = iec - isd + 1
+                                              !! js = jsc - jsd + 1
+                                              !! je = jec - jsd + 1
+        integer, allocatable, dimension(:) :: lb, ub !< These are 1 and shape(arr) respectively for all dimensions
+                                                     !! except the domain-decomposed dimensions, for which they are
+                                                     !! set according to the is,ie,js,je options.
+        integer :: ndims, n
+        integer :: i1, i2, i3, i4, i5
+
+        ndims = 4
+        allocate (lb(ndims), ub(ndims))
+
+        lb = 1
+        ub = shape(arr)
+
+        lb(xdim) = is
+        lb(ydim) = js
+
+        ub(xdim) = ie
+        ub(ydim) = je
+
+        associate (s => ub - lb + 1)
+          n = product(s)
+          arr(lb(1):ub(1), lb(2):ub(2), lb(3):ub(3), lb(4):ub(4)) = reshape(vec(1:n), s(1:4))
+        end associate
+      end subroutine VEC2ARR_4D
+
+      !> @brief Unpack a contiguous 1D array into a 5D array. This is typically used to transform
+      !!        data obtained from an MPI call into the format that is expected by the calling code.
+      subroutine VEC2ARR_5D (vec, arr, xdim, ydim, is, ie, js, je)
+        MPP_TYPE_, intent(in) :: vec(:) !< The 1D array to be unpacked
+        MPP_TYPE_, intent(out) :: arr(:,:,:,:,:) !< The multi-dimensional array to copy the data into
+        integer, intent(in) :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical Fortran-based
+                                          !! code, xdim=1 and ydim=2.
+        integer, intent(in) :: is, ie, js, je !< Starting and ending indices of the x and y dimensions, expressed
+                                              !! in terms of the 1-based indices seen within VEC2ARR_5D. These indices
+                                              !! may be different from the indices used in the calling code! If `arr`
+                                              !! is defined over the data domain, e.g. dimension(isd:ied,jsd:jed,...),
+                                              !! and you wish to unpack only the compute domain, the following values
+                                              !! should be passed:
+                                              !! is = isc - isd + 1
+                                              !! ie = iec - isd + 1
+                                              !! js = jsc - jsd + 1
+                                              !! je = jec - jsd + 1
+        integer, allocatable, dimension(:) :: lb, ub !< These are 1 and shape(arr) respectively for all dimensions
+                                                     !! except the domain-decomposed dimensions, for which they are
+                                                     !! set according to the is,ie,js,je options.
+        integer :: ndims, n
+        integer :: i1, i2, i3, i4, i5
+
+        ndims = 5
+        allocate (lb(ndims), ub(ndims))
+
+        lb = 1
+        ub = shape(arr)
+
+        lb(xdim) = is
+        lb(ydim) = js
+
+        ub(xdim) = ie
+        ub(ydim) = je
+
+        associate (s => ub - lb + 1)
+          n = product(s)
+          arr(lb(1):ub(1), lb(2):ub(2), lb(3):ub(3), lb(4):ub(4), lb(5):ub(5)) = reshape(vec(1:n), s(1:5))
+        end associate
+      end subroutine VEC2ARR_5D
+
+      !> @brief Initialize a 1D array to `val`.
+      subroutine ARR_INIT_1D (arr, val)
+        MPP_TYPE_, dimension(:), intent(out) :: arr !< The array to be initialized
+        MPP_TYPE_, intent(in) :: val !< The value to initialize to
+
+        arr = val
+      end subroutine ARR_INIT_1D
+
+      !> @brief Initialize a 2D array to `val`.
+      subroutine ARR_INIT_2D (arr, val)
+        MPP_TYPE_, dimension(:,:), intent(out) :: arr !< The array to be initialized
+        MPP_TYPE_, intent(in) :: val !< The value to initialize to
+
+        arr = val
+      end subroutine ARR_INIT_2D
+
+      !> @brief Initialize a 3D array to `val`.
+      subroutine ARR_INIT_3D (arr, val)
+        MPP_TYPE_, dimension(:,:,:), intent(out) :: arr !< The array to be initialized
+        MPP_TYPE_, intent(in) :: val !< The value to initialize to
+
+        arr = val
+      end subroutine ARR_INIT_3D
+
+      !> @brief Initialize a 4D array to `val`.
+      subroutine ARR_INIT_4D (arr, val)
+        MPP_TYPE_, dimension(:,:,:,:), intent(out) :: arr !< The array to be initialized
+        MPP_TYPE_, intent(in) :: val !< The value to initialize to
+
+        arr = val
+      end subroutine ARR_INIT_4D
+
+      !> @brief Initialize a 5D array to `val`.
+      subroutine ARR_INIT_5D (arr, val)
+        MPP_TYPE_, dimension(:,:,:,:,:), intent(out) :: arr !< The array to be initialized
+        MPP_TYPE_, intent(in) :: val !< The value to initialize to
+
+        arr = val
+      end subroutine ARR_INIT_5D
\ No newline at end of file
diff --git a/mpp/include/mpp_transmit.inc b/mpp/include/mpp_transmit.inc
index b745e71da0..3b55a5b339 100644
--- a/mpp/include/mpp_transmit.inc
+++ b/mpp/include/mpp_transmit.inc
@@ -171,7 +171,7 @@
       call mpp_transmit( put_data, put_len, to_pe, dummy, 1, NULL_PE, tag=tag, send_request=request )
     end subroutine MPP_SEND_
 
-    subroutine MPP_RECV_SCALAR_( get_data, from_pe, glen, block, tag, request )
+    subroutine MPP_RECV_SCALAR_( get_data, from_pe, glen, block, tag, request, omp_offload )
 !a mpp_transmit with null arguments on the put side
       integer, intent(in) :: from_pe
       MPP_TYPE_, intent(out) :: get_data
@@ -180,6 +180,7 @@
       type(mpi_request), intent(out), optional :: request
 
       integer, optional, intent(in) :: glen
+      logical, optional, intent(in) :: omp_offload
       integer                       :: get_len
       MPP_TYPE_ :: get_data1D(1)
       MPP_TYPE_ :: dummy(1)
@@ -189,17 +190,19 @@
 
       ptr = LOC(get_data)
       get_len=1; if(PRESENT(glen))get_len=glen
-      call mpp_transmit( dummy, 1, NULL_PE, get_data1D, get_len, from_pe, block, tag, recv_request=request )
+      call mpp_transmit( dummy, 1, NULL_PE, get_data1D, get_len, from_pe, &
+          block, tag, recv_request=request, omp_offload=omp_offload )
 
     end subroutine MPP_RECV_SCALAR_
 
-    subroutine MPP_SEND_SCALAR_( put_data, to_pe, plen, tag, request)
+    subroutine MPP_SEND_SCALAR_( put_data, to_pe, plen, tag, request, omp_offload)
 !a mpp_transmit with null arguments on the get side
       integer, intent(in) :: to_pe
       MPP_TYPE_, intent(in) :: put_data
       integer, optional, intent(in) :: plen
       integer, intent(in), optional :: tag
       type(mpi_request), intent(out), optional :: request
+      logical, optional, intent(in) :: omp_offload
       integer                       :: put_len
       MPP_TYPE_ :: put_data1D(1)
       MPP_TYPE_ :: dummy(1)
@@ -207,7 +210,8 @@
       pointer( ptr, put_data1D )
       ptr = LOC(put_data)
       put_len=1; if(PRESENT(plen))put_len=plen
-      call mpp_transmit( put_data1D, put_len, to_pe, dummy, 1, NULL_PE, tag = tag, send_request=request )
+      call mpp_transmit( put_data1D, put_len, to_pe, dummy, 1, NULL_PE, &
+          tag=tag, send_request=request, omp_offload=omp_offload )
 
     end subroutine MPP_SEND_SCALAR_
 
@@ -220,7 +224,8 @@
       type(mpi_request), intent(out), optional :: request
 
       MPP_TYPE_ :: dummy(1,1)
-      call mpp_transmit( dummy, 1, NULL_PE, get_data, get_len, from_pe, block, tag, recv_request=request )
+      call mpp_transmit( dummy, 1, NULL_PE, get_data, get_len, from_pe, &
+          block, tag, recv_request=request )
     end subroutine MPP_RECV_2D_
 
     subroutine MPP_SEND_2D_( put_data, put_len, to_pe, tag, request )
diff --git a/mpp/include/mpp_transmit_mpi.fh b/mpp/include/mpp_transmit_mpi.fh
index d649a984de..d7b28880d5 100644
--- a/mpp/include/mpp_transmit_mpi.fh
+++ b/mpp/include/mpp_transmit_mpi.fh
@@ -36,7 +36,7 @@
 !!(avoiding f90 rank conformance check)
 !!caller is responsible for completion checks (mpp_sync_self) before and after
     subroutine MPP_TRANSMIT_( put_data, put_len, to_pe, get_data, get_len, from_pe, block, tag, recv_request, &
-                            &  send_request )
+                            &  send_request, omp_offload )
 
       integer, intent(in) :: put_len, to_pe, get_len, from_pe
       MPP_TYPE_, intent(in)  :: put_data(*)
@@ -44,14 +44,19 @@
       logical, intent(in),  optional :: block
       integer, intent(in),  optional :: tag
       type(mpi_request), intent(out), optional :: recv_request, send_request
+      logical, intent(in),  optional :: omp_offload
       logical                       :: block_comm
       integer                       :: i
       integer                       :: comm_tag
       integer                       :: rsize
+      logical                       :: use_device_ptr
 
       if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_TRANSMIT: You must first call mpp_init.' )
       if( to_pe.EQ.NULL_PE .AND. from_pe.EQ.NULL_PE )return
 
+      use_device_ptr = .false.
+      if (present(omp_offload)) use_device_ptr = omp_offload
+
       block_comm = .true.
       if(PRESENT(block)) block_comm = block
 
@@ -81,8 +86,15 @@
              if( cur_send_request > max_request ) &
              call mpp_error(FATAL, &
                   & "MPP_TRANSMIT: cur_send_request is greater than max_request, increase mpp_nml request_multiply")
-             call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, &
-                             request_send(cur_send_request), error)
+             if (use_device_ptr) then
+                 !$omp target data use_device_addr(put_data)
+                 call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, &
+                                 request_send(cur_send_request), error)
+                 !$omp end target data
+             else
+                 call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, &
+                                 request_send(cur_send_request), error)
+             endif
           endif
           if (debug .and. (current_clock.NE.0)) call increment_current_clock(EVENT_SEND, put_len*MPP_TYPE_BYTELEN_)
       else if (to_pe.EQ.ALL_PES) then !this is a broadcast from from_pe
@@ -129,8 +141,15 @@
                 if( cur_recv_request > max_request ) &
                 call mpp_error(FATAL, &
                      "MPP_TRANSMIT: cur_recv_request is greater than max_request, increase mpp_nml request_multiply")
-                call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, &
-                     request_recv(cur_recv_request), error )
+                if (use_device_ptr) then
+                    !$omp target data use_device_addr(get_data)
+                    call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, &
+                                    request_recv(cur_recv_request), error )
+                    !$omp end target data
+                else
+                    call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, &
+                                    request_recv(cur_recv_request), error )
+                endif
                 size_recv(cur_recv_request) = get_len
                 type_recv(cur_recv_request) = MPI_TYPE_
              endif
diff --git a/mpp/include/mpp_transmit_nocomm.fh b/mpp/include/mpp_transmit_nocomm.fh
index 5db86f978d..732155280e 100644
--- a/mpp/include/mpp_transmit_nocomm.fh
+++ b/mpp/include/mpp_transmit_nocomm.fh
@@ -35,7 +35,7 @@
 !!words from an array of any rank to be passed (avoiding f90 rank conformance check)
 !!caller is responsible for completion checks (mpp_sync_self) before and after
     subroutine MPP_TRANSMIT_( put_data, put_len, to_pe, get_data, get_len, from_pe, block, tag, recv_request, &
-                            &  send_request )
+                            &  send_request, omp_offload )
 
       integer, intent(in) :: put_len, to_pe, get_len, from_pe
       MPP_TYPE_, intent(in)  :: put_data(*)
@@ -43,6 +43,8 @@
       logical, intent(in), optional :: block
       integer, intent(in), optional :: tag
       type(mpi_request), intent(out), optional :: recv_request, send_request
+      logical, intent(in), optional :: omp_offload
+        ! NOTE: omp_offload is unused in this function
 
       integer :: i, outunit
       MPP_TYPE_, allocatable, save :: local_data(:) !local copy used by non-parallel code (no SHMEM or MPI)
@@ -54,6 +56,7 @@
       if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_TRANSMIT: You must first call mpp_init.' )
       if( to_pe.EQ.NULL_PE .AND. from_pe.EQ.NULL_PE )return
 
+
       outunit = stdout()
       if( debug )then
           call SYSTEM_CLOCK(tick)
diff --git a/mpp/include/mpp_util_mpi.inc b/mpp/include/mpp_util_mpi.inc
index f3b18518d1..a30e5f1a22 100644
--- a/mpp/include/mpp_util_mpi.inc
+++ b/mpp/include/mpp_util_mpi.inc
@@ -54,7 +54,11 @@ subroutine mpp_error_basic( errortype, errormsg )
      text_errortype = 'WARNING: non-existent errortype (must be NOTE|WARNING|FATAL)'
   end select
 
-  if( npes.GT.1 ) write( text,'(a,i6)' ) trim(text_errortype) // ' from PE', pe   !this is the mpp part
+  if( npes.GT.1 ) then
+     write( text,'(a,i6)' ) trim(text_errortype) // ' from PE', pe   !this is the mpp part
+  else
+     text = text_errortype
+  endif
   if( PRESENT(errormsg) )text = trim(text) // ': ' // trim(errormsg)
 !$OMP CRITICAL (MPP_ERROR_CRITICAL)
   select case( errortype )
diff --git a/mpp/mpp_domains.F90 b/mpp/mpp_domains.F90
index da9a1ff848..ca4c8c61e2 100644
--- a/mpp/mpp_domains.F90
+++ b/mpp/mpp_domains.F90
@@ -1785,6 +1785,8 @@ module mpp_domains_mod
 !! @endcode
 !> @ingroup mpp_domains_mod
   interface mpp_global_field
+
+#ifndef __NVCOMPILER
      module procedure mpp_global_field_r8
 #ifdef OVERLOAD_C8
      module procedure mpp_global_field_c8
@@ -1797,6 +1799,45 @@ module mpp_domains_mod
 #endif
      module procedure mpp_global_field_i4
      module procedure mpp_global_field_l4
+!! rank specific interfaces are needed for NVHPC because it does not support deferred-rank arrays
+#else
+     module procedure mpp_global_field_r8_2d
+     module procedure mpp_global_field_r8_3d
+     module procedure mpp_global_field_r8_4d
+     module procedure mpp_global_field_r8_5d
+#ifdef OVERLOAD_C8
+     module procedure mpp_global_field_c8_2d
+     module procedure mpp_global_field_c8_3d
+     module procedure mpp_global_field_c8_4d
+     module procedure mpp_global_field_c8_5d
+#endif
+     module procedure mpp_global_field_i8_2d
+     module procedure mpp_global_field_i8_3d
+     module procedure mpp_global_field_i8_4d
+     module procedure mpp_global_field_i8_5d
+     module procedure mpp_global_field_l8_2d
+     module procedure mpp_global_field_l8_3d
+     module procedure mpp_global_field_l8_4d
+     module procedure mpp_global_field_l8_5d
+     module procedure mpp_global_field_r4_2d
+     module procedure mpp_global_field_r4_3d
+     module procedure mpp_global_field_r4_4d
+     module procedure mpp_global_field_r4_5d
+#ifdef OVERLOAD_C4
+     module procedure mpp_global_field_c4_2d
+     module procedure mpp_global_field_c4_3d
+     module procedure mpp_global_field_c4_4d
+     module procedure mpp_global_field_c4_5d
+#endif
+     module procedure mpp_global_field_i4_2d
+     module procedure mpp_global_field_i4_3d
+     module procedure mpp_global_field_i4_4d
+     module procedure mpp_global_field_i4_5d
+     module procedure mpp_global_field_l4_2d
+     module procedure mpp_global_field_l4_3d
+     module procedure mpp_global_field_l4_4d
+     module procedure mpp_global_field_l4_5d
+#endif
   end interface
 
 !> @ingroup mpp_domains_mod
@@ -2297,6 +2338,7 @@ module mpp_domains_mod
   !> Private interface to pack an array into a vector
   !> @ingroup mpp_domains_mod
   interface arr2vec
+#ifndef __NVCOMPILER
      module procedure arr2vec_r8
 #ifdef OVERLOAD_C8
      module procedure arr2vec_c8
@@ -2309,11 +2351,38 @@ module mpp_domains_mod
 #endif
      module procedure arr2vec_i4
      module procedure arr2vec_l4
+#else
+     module procedure arr2vec_r8_2d
+     module procedure arr2vec_r8_3d
+     module procedure arr2vec_r8_4d
+     module procedure arr2vec_r8_5d
+     module procedure arr2vec_i8_2d
+     module procedure arr2vec_i8_3d
+     module procedure arr2vec_i8_4d
+     module procedure arr2vec_i8_5d
+     module procedure arr2vec_l8_2d
+     module procedure arr2vec_l8_3d
+     module procedure arr2vec_l8_4d
+     module procedure arr2vec_l8_5d
+     module procedure arr2vec_r4_2d
+     module procedure arr2vec_r4_3d
+     module procedure arr2vec_r4_4d
+     module procedure arr2vec_r4_5d
+     module procedure arr2vec_i4_2d
+     module procedure arr2vec_i4_3d
+     module procedure arr2vec_i4_4d
+     module procedure arr2vec_i4_5d
+     module procedure arr2vec_l4_2d
+     module procedure arr2vec_l4_3d
+     module procedure arr2vec_l4_4d
+     module procedure arr2vec_l4_5d
+#endif
   end interface
 
   !> Private interface to unpack a vector into an array
   !> @ingroup mpp_domains_mod
   interface vec2arr
+#ifndef __NVCOMPILER
      module procedure vec2arr_r8
 #ifdef OVERLOAD_C8
      module procedure vec2arr_c8
@@ -2326,11 +2395,38 @@ module mpp_domains_mod
 #endif
      module procedure vec2arr_i4
      module procedure vec2arr_l4
+#else
+     module procedure vec2arr_r8_2d
+     module procedure vec2arr_r8_3d
+     module procedure vec2arr_r8_4d
+     module procedure vec2arr_r8_5d
+     module procedure vec2arr_i8_2d
+     module procedure vec2arr_i8_3d
+     module procedure vec2arr_i8_4d
+     module procedure vec2arr_i8_5d
+     module procedure vec2arr_l8_2d
+     module procedure vec2arr_l8_3d
+     module procedure vec2arr_l8_4d
+     module procedure vec2arr_l8_5d
+     module procedure vec2arr_r4_2d
+     module procedure vec2arr_r4_3d
+     module procedure vec2arr_r4_4d
+     module procedure vec2arr_r4_5d
+     module procedure vec2arr_i4_2d
+     module procedure vec2arr_i4_3d
+     module procedure vec2arr_i4_4d
+     module procedure vec2arr_i4_5d
+     module procedure vec2arr_l4_2d
+     module procedure vec2arr_l4_3d
+     module procedure vec2arr_l4_4d
+     module procedure vec2arr_l4_5d
+#endif
   end interface
 
   !> Private interface to initialize an assumed-rank array
   !> @ingroup mpp_domains_mod
   interface arr_init
+#ifndef __NVCOMPILER
      module procedure arr_init_r8
 #ifdef OVERLOAD_C8
      module procedure arr_init_c8
@@ -2343,6 +2439,32 @@ module mpp_domains_mod
 #endif
      module procedure arr_init_i4
      module procedure arr_init_l4
+#else
+     module procedure arr_init_r8_2d
+     module procedure arr_init_r8_3d
+     module procedure arr_init_r8_4d
+     module procedure arr_init_r8_5d
+     module procedure arr_init_i8_2d
+     module procedure arr_init_i8_3d
+     module procedure arr_init_i8_4d
+     module procedure arr_init_i8_5d
+     module procedure arr_init_l8_2d
+     module procedure arr_init_l8_3d
+     module procedure arr_init_l8_4d
+     module procedure arr_init_l8_5d
+     module procedure arr_init_r4_2d
+     module procedure arr_init_r4_3d
+     module procedure arr_init_r4_4d
+     module procedure arr_init_r4_5d
+     module procedure arr_init_i4_2d
+     module procedure arr_init_i4_3d
+     module procedure arr_init_i4_4d
+     module procedure arr_init_i4_5d
+     module procedure arr_init_l4_2d
+     module procedure arr_init_l4_3d
+     module procedure arr_init_l4_4d
+     module procedure arr_init_l4_5d
+#endif
   end interface
 
   ! Include variable "version" to be written to log file.
@@ -2357,8 +2479,8 @@ module mpp_domains_mod
 #include <mpp_domains_comm.inc>
 #include <mpp_domains_define.inc>
 #include <mpp_domains_misc.inc>
+#include <mpp_pack.inc>
 #include <mpp_domains_reduce.inc>
 #include <mpp_unstruct_domain.inc>
-#include <mpp_pack.inc>
 
 end module mpp_domains_mod
diff --git a/test_fms/mpp/test_mpp_domains.F90 b/test_fms/mpp/test_mpp_domains.F90
index e681173944..0ef0abe929 100644
--- a/test_fms/mpp/test_mpp_domains.F90
+++ b/test_fms/mpp/test_mpp_domains.F90
@@ -82,6 +82,7 @@ program test_mpp_domains
   logical :: test_edge_update = .false.
   logical :: test_nonsym_edge = .false.
   logical :: test_group = .false.
+  logical :: test_group_offload = .false.
   logical :: test_cubic_grid_redistribute = .false.
   logical :: check_parallel = .FALSE.
   logical :: test_get_nbr = .FALSE.
@@ -113,7 +114,7 @@ program test_mpp_domains
                                layout_ensemble, nthreads, test_boundary, layout_tripolar, &
                                test_group, test_global_sum, test_subset, test_nonsym_edge, &
                                test_halosize_performance, test_adjoint, wide_halo, &
-                               test_unstruct
+                               test_unstruct, test_group_offload
   integer :: i, j, k, n, p
   integer :: layout(2)
   integer :: id
@@ -236,6 +237,13 @@ program test_mpp_domains
      if (mpp_pe() == mpp_root_pe())  print *, '--------------------> Finished testing group_update <-------------------'
   endif
 
+  if( test_group_offload) then
+      if (mpp_pe() == mpp_root_pe())  print *, '--------------------> Calling test_group <-------------------'
+     call test_group_update( 'Folded-north', use_omp_offload=test_group_offload )
+     call test_group_update( 'Cubic-Grid', use_omp_offload=test_group_offload )
+      if (mpp_pe() == mpp_root_pe())  print *, '--------------------> Finished test_group <-------------------'
+  endif
+
   if( test_interface ) then
       if (mpp_pe() == mpp_root_pe())  print *, '--------------------> Calling test_interface  <-------------------'
       call test_modify_domain()
@@ -2486,6 +2494,607 @@ end subroutine test_mpp_global_sum
 #undef FMS_TEST_KIND_
 #undef TEST_GROUP_UPDATE_
 
+  !###############################################################
+  subroutine test_group_update( type, use_omp_offload )
+    character(len=*), intent(in) :: type
+    logical, intent(in), optional :: use_omp_offload
+
+    type(domain2D) :: domain
+    integer        :: num_contact, ntiles, npes_per_tile
+    integer        :: i, j, k, l, n, shift
+    integer        :: isc, iec, jsc, jec, isd, ied, jsd, jed
+    integer        :: ism, iem, jsm, jem
+
+    integer, allocatable, dimension(:)       :: pe_start, pe_end, tile1, tile2
+    integer, allocatable, dimension(:)       :: istart1, iend1, jstart1, jend1
+    integer, allocatable, dimension(:)       :: istart2, iend2, jstart2, jend2
+    integer, allocatable, dimension(:,:)     :: layout2D, global_indices
+    real,    allocatable, dimension(:,:,:,:) :: x1, y1, x2, y2
+    real,    allocatable, dimension(:,:,:,:) :: a1, a2
+    real,    allocatable, dimension(:,:,:)   :: base
+    integer            :: id1, id2, id3
+    logical            :: folded_north
+    logical            :: cubic_grid
+    character(len=3)   :: text
+    integer            :: nx_save, ny_save
+    type(mpp_group_update_type) :: group_update
+    type(mpp_group_update_type), allocatable :: update_list(:)
+    logical :: do_omp_offload
+
+    folded_north       = .false.
+    cubic_grid         = .false.
+    do_omp_offload     = .false.
+    if (present(use_omp_offload)) do_omp_offload = use_omp_offload
+
+    nx_save = nx
+    ny_save = ny
+    !--- check the type
+    select case(type)
+    case ( 'Folded-north' )
+       ntiles = 1
+       shift = 0
+       num_contact = 2
+       folded_north = .true.
+       npes_per_tile = npes
+       if(layout_tripolar(1)*layout_tripolar(2) == npes ) then
+          layout = layout_tripolar
+       else
+          call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout )
+       endif
+    case ( 'Cubic-Grid' )
+       if( nx_cubic == 0 ) then
+          call mpp_error(NOTE,'test_group_update: for Cubic_grid mosaic, nx_cubic is zero, '//&
+               'No test is done for Cubic-Grid mosaic. ' )
+          return
+       endif
+       if( nx_cubic .NE. ny_cubic ) then
+          call mpp_error(NOTE,'test_group_update: for Cubic_grid mosaic, nx_cubic does not equal ny_cubic, '//&
+               'No test is done for Cubic-Grid mosaic. ' )
+          return
+       endif
+       shift = 1
+       nx = nx_cubic
+       ny = ny_cubic
+       ntiles = 6
+       num_contact = 12
+       cubic_grid = .true.
+       if( mod(npes, ntiles) == 0 ) then
+          npes_per_tile = npes/ntiles
+          write(outunit,*)'NOTE from update_domains_performance ==> For Mosaic "', trim(type), &
+               '", each tile will be distributed over ', npes_per_tile, ' processors.'
+       else
+          call mpp_error(NOTE,'test_group_update: npes should be multiple of ntiles No test is done for '//trim(type))
+          return
+       endif
+       if(layout_cubic(1)*layout_cubic(2) == npes_per_tile) then
+          layout = layout_cubic
+       else
+          call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout )
+       endif
+    case default
+       call mpp_error(FATAL, 'test_group_update: no such test: '//type)
+    end select
+
+    allocate(layout2D(2,ntiles), global_indices(4,ntiles), pe_start(ntiles), pe_end(ntiles) )
+    do n = 1, ntiles
+       pe_start(n) = (n-1)*npes_per_tile
+       pe_end(n)   = n*npes_per_tile-1
+    end do
+
+    do n = 1, ntiles
+       global_indices(:,n) = (/1,nx,1,ny/)
+       layout2D(:,n)         = layout
+    end do
+
+    allocate(tile1(num_contact), tile2(num_contact) )
+    allocate(istart1(num_contact), iend1(num_contact), jstart1(num_contact), jend1(num_contact) )
+    allocate(istart2(num_contact), iend2(num_contact), jstart2(num_contact), jend2(num_contact) )
+
+    !--- define domain
+    if(folded_north) then
+       !--- Contact line 1, between tile 1 (EAST) and tile 1 (WEST)  --- cyclic
+       tile1(1) = 1; tile2(1) = 1
+       istart1(1) = nx; iend1(1) = nx; jstart1(1) = 1;  jend1(1) = ny
+       istart2(1) = 1;  iend2(1) = 1;  jstart2(1) = 1;  jend2(1) = ny
+       !--- Contact line 2, between tile 1 (NORTH) and tile 1 (NORTH)  --- folded-north-edge
+       tile1(2) = 1; tile2(2) = 1
+       istart1(2) = 1;  iend1(2) = nx/2;   jstart1(2) = ny;  jend1(2) = ny
+       istart2(2) = nx; iend2(2) = nx/2+1; jstart2(2) = ny;  jend2(2) = ny
+       call mpp_define_mosaic(global_indices, layout2D, domain, ntiles, num_contact, tile1, tile2, &
+            istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2,      &
+            pe_start, pe_end, whalo=whalo, ehalo=ehalo, shalo=shalo, nhalo=nhalo, &
+            name = type, symmetry = .false.  )
+    else if( cubic_grid ) then
+       call define_cubic_mosaic(type, domain, (/nx,nx,nx,nx,nx,nx/), (/ny,ny,ny,ny,ny,ny/), &
+            global_indices, layout2D, pe_start, pe_end )
+    endif
+
+    !--- setup data
+    call mpp_get_compute_domain( domain, isc, iec, jsc, jec )
+    call mpp_get_data_domain   ( domain, isd, ied, jsd, jed )
+    call mpp_get_memory_domain   ( domain, ism, iem, jsm, jem )
+
+    if(num_fields<1) then
+       call mpp_error(FATAL, "test_mpp_domains: num_fields must be a positive integer")
+    endif
+
+    allocate(update_list(num_fields))
+
+    id1 = mpp_clock_id( type//' group 2D', flags=MPP_CLOCK_SYNC )
+    id2 = mpp_clock_id( type//' non-group 2D', flags=MPP_CLOCK_SYNC )
+    id3 = mpp_clock_id( type//' non-block group 2D', flags=MPP_CLOCK_SYNC )
+
+    allocate( a1(ism:iem,      jsm:jem,       nz, num_fields) )
+    allocate( x1(ism:iem+shift,jsm:jem,       nz, num_fields) )
+    allocate( y1(ism:iem,      jsm:jem+shift, nz, num_fields) )
+    allocate( a2(ism:iem,      jsm:jem,       nz, num_fields) )
+    allocate( x2(ism:iem+shift,jsm:jem,       nz, num_fields) )
+    allocate( y2(ism:iem,      jsm:jem+shift, nz, num_fields) )
+    allocate( base(isc:iec+shift,jsc:jec+shift,nz) )
+    a1 = 0; x1 = 0; y1 = 0
+    ! allocate a1, x1, y1 on GPU (GPU x1, y1 only used for CGRID test)
+    !$omp target enter data if(do_omp_offload) map(to: a1, x1, y1)
+
+    base = 0
+    do k = 1,nz
+       do j = jsc, jec+shift
+          do i = isc, iec+shift
+             base(i,j,k) = k + i*1e-3 + j*1e-6
+          end do
+       end do
+    end do
+
+    !--- Test for partial direction update
+    do l =1, num_fields
+       call mpp_create_group_update(group_update, a1(:,:,:,l), domain, flags=WUPDATE+SUPDATE)
+    end do
+
+    ! initialize a1 values on GPU only to check GPU data is being transferred
+    !$omp target teams loop if(do_omp_offload) default(shared) &
+    !$omp   shared(num_fields, nz, jsc, jec, isc, iec, a1, base) map(tofrom: a1) map(to: base)
+    do l = 1, num_fields
+       do k=1,nz
+          do j=jsc,jec
+             do i=isc,iec
+                a1(i,j,k,l) = base(i,j,k) + l*1e3
+             enddo
+          enddo
+          do i=isc-1,iec+1
+             a1(i,jsc-1,k,l) = 999;
+             a1(i,jec+1,k,l) = 999;
+          enddo
+          do j=jsc,jec
+             a1(isc-1,j,k,l) = 999
+             a1(iec+1,j,k,l) = 999
+          enddo
+       enddo
+    enddo
+    !$omp target teams loop collapse(4) if(do_omp_offload) default(shared) &
+    !$omp   shared(num_fields, nz, jsm, jem, ism, iem, a1, a2) map(to: a1) map(from: a2)
+    do l=1,num_fields ; do k=1,nz ; do j=jsm,jem ; do i=ism,iem
+      a2(i,j,k,l) = a1(i,j,k,l)
+    enddo ; enddo ; enddo ; enddo
+    call mpp_do_group_update(group_update, domain, a1(isc,jsc,1,1), omp_offload=do_omp_offload)
+
+    do l = 1, num_fields
+       call mpp_update_domains( a2(:,:,:,l), domain, flags=WUPDATE+SUPDATE, complete=l==num_fields )
+    enddo
+
+    ! copy a1 back to host
+    !$omp target update if(do_omp_offload) from(a1)
+    do l = 1, num_fields
+       write(text, '(i3.3)') l
+       call compare_checksums(a1(isd:ied,jsd:jed,:,l),a2(isd:ied,jsd:jed,:,l),type//' CENTER South West '//text)
+    enddo
+
+    call mpp_clear_group_update(group_update)
+
+    !--- Test for DGRID update
+    if(type == 'Cubic-Grid' ) then
+       x1 = 0; y1 = 0
+       do l =1, num_fields
+          call mpp_create_group_update(group_update, x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=DGRID_NE)
+       end do
+
+       do l = 1, num_fields
+          y1(isc:iec+shift,jsc:jec,      :,l) = base(isc:iec+shift,jsc:jec,      :) + l*1e3 + 1e6
+          x1(isc:iec,      jsc:jec+shift,:,l) = base(isc:iec,      jsc:jec+shift,:) + l*1e3 + 2*1e6
+       enddo
+       x2 = x1; y2 = y1
+       call mpp_start_group_update(group_update, domain, x1(isc,jsc,1,1))
+       call mpp_complete_group_update(group_update, domain, x1(isc,jsc,1,1))
+
+       do l = 1, num_fields
+          call mpp_update_domains( x2(:,:,:,l), y2(:,:,:,l), domain, gridtype=DGRID_NE, complete=l==num_fields )
+       enddo
+
+    !--- compare checksum
+       do l = 1, num_fields
+          write(text, '(i3.3)') l
+          call compare_checksums(x1(isd:ied+shift,jsd:jed, :,l),x2(isd:ied+shift,jsd:jed, :,l),type//' DGRID X'//text)
+          call compare_checksums(y1(isd:ied, jsd:jed+shift,:,l),y2(isd:ied, jsd:jed+shift,:,l),type//' DGRID Y'//text)
+       enddo
+
+       call mpp_clear_group_update(group_update)
+    endif
+    !--- Test for CGRID
+    a1 = 0; x1 = 0; y1 = 0
+    do l =1, num_fields
+       call mpp_create_group_update(group_update, a1(:,:,:,l), domain)
+       call mpp_create_group_update(group_update, x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=CGRID_NE)
+    end do
+
+    do n = 1, num_iter
+       a1 = 0; x1 = 0; y1 = 0
+       !$omp target update if(do_omp_offload) to(a1, x1, y1)
+       do l = 1, num_fields
+          !$omp target teams loop collapse(3) if(do_omp_offload) default(shared) &
+          !$omp   shared(nz, jsc, jec, isc, iec, a1, base, l) map(to: base) map(tofrom: a1)
+          do k=1,nz
+             do j=jsc,jec
+                do i=isc,iec
+                   a1(i,j,k,l) = base(i,j,k) + l*1e3
+                enddo
+             enddo
+          enddo
+          !$omp target teams loop collapse(3) if(do_omp_offload) default(shared) &
+          !$omp   shared(nz, jsc, jec, shift, isc, iec, x1, base, l) map(to: base) map(tofrom: x1)
+          do k=1,nz
+             do j=jsc,jec
+                do i=isc,iec+shift
+                   x1(i,j,k,l) = base(i,j,k) + l*1e3 + 1e6
+                enddo
+             enddo
+          enddo
+          !$omp target teams loop collapse(3) if(do_omp_offload) default(shared) &
+          !$omp   shared(nz, jsc, jec, shift, isc, iec, y1, base, l) map(to: base) map(tofrom: y1)
+          do k=1,nz
+             do j=jsc,jec+shift
+                do i=isc,iec
+                   y1(i,j,k,l) = base(i,j,k) + l*1e3 + 2*1e6
+                enddo
+             enddo
+          enddo
+       enddo
+       !$omp target teams loop collapse(4) if(do_omp_offload) default(shared) &
+       !$omp   shared(num_fields, nz, jsm, jem, ism, iem, a1, a2) map(from: a2) map(to: a1)
+       do l=1,num_fields ; do k=1,nz ; do j=jsm,jem ; do i=ism,iem
+          a2(i,j,k,l) = a1(i,j,k,l)
+       enddo ; enddo ; enddo ; enddo
+       !$omp target teams loop collapse(4) if(do_omp_offload) default(shared) &
+       !$omp   shared(num_fields, nz, jsm, jem, ism, iem, shift, x1, x2) map(from: x2) map(to: x1)
+       do l=1,num_fields ; do k=1,nz ; do j=jsm,jem ; do i=ism,iem+shift
+          x2(i,j,k,l) = x1(i,j,k,l)
+       enddo ; enddo ; enddo ; enddo
+       !$omp target teams loop collapse(4) if(do_omp_offload) default(shared) &
+       !$omp   shared(num_fields, nz, jsm, jem, shift, ism, iem, y1, y2) map(from: y2) map(to: y1)
+       do l=1,num_fields ; do k=1,nz ; do j=jsm,jem+shift ; do i=ism,iem
+          y2(i,j,k,l) = y1(i,j,k,l)
+       enddo ; enddo ; enddo ; enddo
+
+       call mpp_clock_begin(id1)
+       call mpp_do_group_update(group_update, domain, a1(isc,jsc,1,1), omp_offload=do_omp_offload)
+       call mpp_clock_end  (id1)
+
+       call mpp_clock_begin(id2)
+       do l = 1, num_fields
+          call mpp_update_domains( a2(:,:,:,l), domain, complete=l==num_fields )
+       enddo
+       do l = 1, num_fields
+          call mpp_update_domains( x2(:,:,:,l), y2(:,:,:,l), domain, gridtype=CGRID_NE, complete=l==num_fields )
+       enddo
+       call mpp_clock_end(id2)
+
+       !--- compare checksum
+       if( n == num_iter ) then
+       !$omp target update if(do_omp_offload) from(a1, x1, y1)
+       do l = 1, num_fields
+          write(text, '(i3.3)') l
+          call compare_checksums(a1(isd:ied, jsd:jed, :,l),a2(isd:ied, jsd:jed, :,l),type//' CENTER '//text)
+          call compare_checksums(x1(isd:ied+shift,jsd:jed, :,l),x2(isd:ied+shift,jsd:jed, :,l),type//' CGRID X'//text)
+          call compare_checksums(y1(isd:ied, jsd:jed+shift,:,l),y2(isd:ied, jsd:jed+shift,:,l),type//' CGRID Y'//text)
+       enddo
+       endif
+       a1 = 0; x1 = 0; y1 = 0
+       do l = 1, num_fields
+          a1(isc:iec,      jsc:jec,      :,l) = base(isc:iec,      jsc:jec,      :) + l*1e3
+          x1(isc:iec+shift,jsc:jec,      :,l) = base(isc:iec+shift,jsc:jec,      :) + l*1e3 + 1e6
+          y1(isc:iec,      jsc:jec+shift,:,l) = base(isc:iec,      jsc:jec+shift,:) + l*1e3 + 2*1e6
+       enddo
+       call mpp_clock_begin(id3)
+       call mpp_start_group_update(group_update, domain, a1(isc,jsc,1,1))
+       call mpp_complete_group_update(group_update, domain, a1(isc,jsc,1,1))
+       call mpp_clock_end  (id3)
+       !--- compare checksum
+       if( n == num_iter ) then
+       do l = 1, num_fields
+          write(text, '(i3.3)') l
+          call compare_checksums(a1(isd:ied,      jsd:jed,      :,l),a2(isd:ied,      jsd:jed,      :,l), &
+                                 type//' nonblock CENTER '//text)
+          call compare_checksums(x1(isd:ied+shift,jsd:jed,      :,l),x2(isd:ied+shift,jsd:jed,      :,l), &
+                                 type//' nonblock CGRID X'//text)
+          call compare_checksums(y1(isd:ied,      jsd:jed+shift,:,l),y2(isd:ied,      jsd:jed+shift,:,l), &
+                                 type//' nonblock CGRID Y'//text)
+       enddo
+       endif
+    enddo
+
+    call mpp_clear_group_update(group_update)
+
+    !--- The following is to test overlapping start and complete
+    if( num_fields > 1 ) then
+       do l =1, num_fields
+          call mpp_create_group_update(update_list(l), a1(:,:,:,l), domain)
+          call mpp_create_group_update(update_list(l), x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=CGRID_NE)
+       end do
+
+       do n = 1, num_iter
+
+          a1 = 0; x1 = 0; y1 = 0
+          do l = 1, num_fields
+             a1(isc:iec,      jsc:jec,      :,l) = base(isc:iec,      jsc:jec,      :) + l*1e3
+             x1(isc:iec+shift,jsc:jec,      :,l) = base(isc:iec+shift,jsc:jec,      :) + l*1e3 + 1e6
+             y1(isc:iec,      jsc:jec+shift,:,l) = base(isc:iec,      jsc:jec+shift,:) + l*1e3 + 2*1e6
+          enddo
+          do l = 1, num_fields-1
+             call mpp_start_group_update(update_list(l), domain, a1(isc,jsc,1,1))
+          enddo
+
+          call mpp_complete_group_update(update_list(1), domain, a1(isc,jsc,1,1))
+          call mpp_start_group_update(update_list(num_fields), domain, a1(isc,jsc,1,1))
+          do l = 2, num_fields
+             call mpp_complete_group_update(update_list(l), domain, a1(isc,jsc,1,1))
+          enddo
+          !--- compare checksum
+          if( n == num_iter ) then
+          do l = 1, num_fields
+             write(text, '(i3.3)') l
+             call compare_checksums(a1(isd:ied,      jsd:jed,      :,l),a2(isd:ied,      jsd:jed,      :,l), &
+                                    type//' multiple nonblock CENTER '//text)
+             call compare_checksums(x1(isd:ied+shift,jsd:jed,      :,l),x2(isd:ied+shift,jsd:jed,      :,l), &
+                                    type//' multiple nonblock CGRID X'//text)
+             call compare_checksums(y1(isd:ied,      jsd:jed+shift,:,l),y2(isd:ied,      jsd:jed+shift,:,l), &
+                                    type//' multiple nonblock CGRID Y'//text)
+          enddo
+          endif
+       enddo
+    endif
+
+    do l =1, num_fields
+      call mpp_clear_group_update(update_list(l))
+    enddo
+    deallocate(update_list)
+
+    !--- test scalar 4-D variable
+    call mpp_create_group_update(group_update, a1(:,:,:,:), domain)
+
+    a1 = 0; x1 = 0; y1 = 0
+    !$omp target update if(do_omp_offload) to(a1)
+    !$omp target teams loop collapse(4) if(do_omp_offload) default(shared) &
+    !$omp   shared(num_fields, nz, jsc, jec, isc, iec, a1, base) map(to: base) map(from: a1)
+    do l = 1, num_fields
+       do k=1,nz
+          do j=jsc,jec
+             do i=isc,iec
+                a1(i,j,k,l) = base(i,j,k) + l*1e3
+             enddo
+          enddo
+       enddo
+    enddo
+    !$omp target teams loop collapse(4) if(do_omp_offload) default(shared) &
+    !$omp   shared(num_fields, nz, jsm, jem, ism, iem, a1, a2) map(from: a2) map(to: a1)
+    do l=1,num_fields ; do k=1,nz ; do j=jsm,jem ; do i=ism,iem
+       a2(i,j,k,l) = a1(i,j,k,l)
+    enddo ; enddo ; enddo ; enddo
+    x2 = x1; y2 = y1
+    call mpp_clock_begin(id1)
+    call mpp_do_group_update(group_update, domain, a1(isc,jsc,1,1), omp_offload=do_omp_offload)
+    call mpp_clock_end  (id1)
+
+    call mpp_clock_begin(id2)
+    call mpp_update_domains( a2(:,:,:,:), domain )
+    call mpp_clock_end(id2)
+
+    !--- compare checksum
+    !$omp target update if(do_omp_offload) from(a1)
+    do l = 1, num_fields
+       write(text, '(i3.3)') l
+       call compare_checksums(a1(isd:ied, jsd:jed, :,l),a2(isd:ied, jsd:jed, :,l),type//' 4D CENTER '//text)
+    enddo
+
+    a1 = 0
+    do l = 1, num_fields
+       a1(isc:iec,      jsc:jec,      :,l) = base(isc:iec,      jsc:jec,      :) + l*1e3
+    enddo
+    call mpp_clock_begin(id3)
+    call mpp_start_group_update(group_update, domain, a1(isc,jsc,1,1))
+    call mpp_complete_group_update(group_update, domain, a1(isc,jsc,1,1))
+    call mpp_clock_end  (id3)
+
+    !--- compare checksum
+    do l = 1, num_fields
+       write(text, '(i3.3)') l
+       call compare_checksums(a1(isd:ied, jsd:jed, :,l),a2(isd:ied, jsd:jed, :,l), &
+                              type//' nonblock 4D CENTER '//text)
+    enddo
+
+    !$omp target exit data if(do_omp_offload) map(release: a1, x1, y1)
+
+    !--- test for BGRID.
+    deallocate(a1, x1, y1)
+    deallocate(a2, x2, y2)
+    call mpp_clear_group_update(group_update)
+
+    allocate( a1(ism:iem+shift,jsm:jem+shift, nz, num_fields) )
+    allocate( x1(ism:iem+shift,jsm:jem+shift, nz, num_fields) )
+    allocate( y1(ism:iem+shift,jsm:jem+shift, nz, num_fields) )
+    allocate( a2(ism:iem+shift,jsm:jem+shift, nz, num_fields) )
+    allocate( x2(ism:iem+shift,jsm:jem+shift, nz, num_fields) )
+    allocate( y2(ism:iem+shift,jsm:jem+shift, nz, num_fields) )
+
+    !$omp target enter data map(alloc: a1, x1, y1) if(do_omp_offload)
+
+    do l =1, num_fields
+       call mpp_create_group_update(group_update, a1(:,:,:,l), domain, position=CORNER)
+       call mpp_create_group_update(group_update, x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=BGRID_NE)
+    end do
+
+    do n = 1, num_iter
+       a1 = 0; x1 = 0; y1 = 0
+       !$omp target update if(do_omp_offload) to(a1, x1, y1)
+       do l = 1, num_fields
+          !$omp target teams loop collapse(3) if(do_omp_offload) default(shared) &
+          !$omp   shared(nz, base, a1, x1, y1, l, jsc, jec, shift, isc, iec) &
+          !$omp   map(to: base) map(tofrom: a1, x1, y1)
+          do k=1,nz
+             do j=jsc,jec+shift
+                do i=isc,iec+shift
+                   a1(i,j,k,l) = base(i,j,k) + l*1e3
+                   x1(i,j,k,l) = base(i,j,k) + l*1e3 + 1e6
+                   y1(i,j,k,l) = base(i,j,k) + l*1e3 + 2*1e6
+                enddo
+             enddo
+          enddo
+       enddo
+       !$omp target teams loop collapse(4) if(do_omp_offload) default(shared) &
+       !$omp   shared(num_fields, nz, jsm, jem, shift, ism, iem, a1, x1, y1, a2, x2, y2) &
+       !$omp   map(from: a2, x2, y2) map(to: a1, x1, y1)
+       do l=1,num_fields ; do k=1,nz ; do j=jsm,jem+shift ; do i=ism,iem+shift
+          a2(i,j,k,l) = a1(i,j,k,l)
+          x2(i,j,k,l) = x1(i,j,k,l)
+          y2(i,j,k,l) = y1(i,j,k,l)
+       enddo ; enddo ; enddo ; enddo
+       call mpp_clock_begin(id1)
+       call mpp_do_group_update(group_update, domain, a1(isc,jsc,1,1), omp_offload=do_omp_offload)
+       call mpp_clock_end  (id1)
+
+       call mpp_clock_begin(id2)
+       do l = 1, num_fields
+          call mpp_update_domains( a2(:,:,:,l), domain, position=CORNER, complete=l==num_fields )
+       enddo
+       do l = 1, num_fields
+          call mpp_update_domains( x2(:,:,:,l), y2(:,:,:,l), domain, gridtype=BGRID_NE, complete=l==num_fields )
+       enddo
+       call mpp_clock_end(id2)
+
+       !--- compare checksum
+       if( n == num_iter ) then
+       !$omp target update if(do_omp_offload) from(a1,x1,y1)
+       do l = 1, num_fields
+          write(text, '(i3.3)') l
+          call compare_checksums(a1(isd:ied+shift,jsd:jed+shift,:,l),a2(isd:ied+shift,jsd:jed+shift,:,l),type//&
+                               & ' CORNER '// text)
+          call compare_checksums(x1(isd:ied+shift,jsd:jed+shift,:,l),x2(isd:ied+shift,jsd:jed+shift,:,l),type//&
+                               & ' BGRID X'// text)
+          call compare_checksums(y1(isd:ied+shift,jsd:jed+shift,:,l),y2(isd:ied+shift,jsd:jed+shift,:,l),type//&
+                               & ' BGRID Y'// text)
+       enddo
+       endif
+
+       a1 = 0; x1 = 0; y1 = 0
+       do l = 1, num_fields
+          a1(isc:iec+shift,jsc:jec+shift,:,l) = base(isc:iec+shift,jsc:jec+shift,:) + l*1e3
+          x1(isc:iec+shift,jsc:jec+shift,:,l) = base(isc:iec+shift,jsc:jec+shift,:) + l*1e3 + 1e6
+          y1(isc:iec+shift,jsc:jec+shift,:,l) = base(isc:iec+shift,jsc:jec+shift,:) + l*1e3 + 2*1e6
+       enddo
+       call mpp_clock_begin(id3)
+       call mpp_start_group_update(group_update, domain, a1(isc,jsc,1,1))
+       call mpp_complete_group_update(group_update, domain, a1(isc,jsc,1,1))
+       call mpp_clock_end  (id3)
+       !--- compare checksum
+       if( n == num_iter ) then
+       do l = 1, num_fields
+          write(text, '(i3.3)') l
+          call compare_checksums(a1(isd:ied+shift,jsd:jed+shift,:,l),a2(isd:ied+shift,jsd:jed+shift,:,l), &
+                                 type//' nonblockCORNER '//text)
+          call compare_checksums(x1(isd:ied+shift,jsd:jed+shift,:,l),x2(isd:ied+shift,jsd:jed+shift,:,l), &
+                                 type//' nonblock BGRID X'//text)
+          call compare_checksums(y1(isd:ied+shift,jsd:jed+shift,:,l),y2(isd:ied+shift,jsd:jed+shift,:,l), &
+                                 type//' nonblock BGRID Y'//text)
+       enddo
+       endif
+    enddo
+    !$omp target exit data if(do_omp_offload) map(release: a1,x1,y1)
+    call mpp_clear_group_update(group_update)
+
+    !-----------------------------------------------------------------------------
+    !                   test for AGRID vector and scalar pair
+    !-----------------------------------------------------------------------------
+    deallocate(x1, y1)
+    deallocate(x2, y2)
+
+    allocate( x1(ism:iem,jsm:jem, nz, num_fields) )
+    allocate( y1(ism:iem,jsm:jem, nz, num_fields) )
+    allocate( x2(ism:iem,jsm:jem, nz, num_fields) )
+    allocate( y2(ism:iem,jsm:jem, nz, num_fields) )
+
+    x1 = 0; y1 = 0
+    do l = 1, num_fields
+       x1(isc:iec,jsc:jec,:,l) = base(isc:iec,jsc:jec,:) + l*1e3 + 1e6
+       y1(isc:iec,jsc:jec,:,l) = base(isc:iec,jsc:jec,:) + l*1e3 + 2*1e6
+    enddo
+    x2 = x1; y2 = y1
+
+    do l =1, num_fields
+       call mpp_create_group_update(group_update, x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=AGRID)
+    end do
+
+    do l = 1, num_fields
+       call mpp_update_domains( x2(:,:,:,l), y2(:,:,:,l), domain, gridtype=AGRID, complete=l==num_fields )
+    enddo
+
+    call mpp_start_group_update(group_update, domain, a1(isc,jsc,1,1))
+    call mpp_complete_group_update(group_update, domain, a1(isc,jsc,1,1))
+
+    !--- compare checksum
+    do l = 1, num_fields
+       write(text, '(i3.3)') l
+       call compare_checksums(x1(isd:ied,jsd:jed,:,l),x2(isd:ied,jsd:jed,:,l),type//' AGRID X'//text)
+       call compare_checksums(y1(isd:ied,jsd:jed,:,l),y2(isd:ied,jsd:jed,:,l),type//' AGRID Y'//text)
+    enddo
+
+    call mpp_clear_group_update(group_update)
+
+    x1 = 0; y1 = 0
+    do l = 1, num_fields
+       x1(isc:iec,jsc:jec,:,l) = base(isc:iec,jsc:jec,:) + l*1e3 + 1e6
+       y1(isc:iec,jsc:jec,:,l) = base(isc:iec,jsc:jec,:) + l*1e3 + 2*1e6
+    enddo
+    x2 = x1; y2 = y1
+
+    do l =1, num_fields
+       call mpp_create_group_update(group_update, x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=AGRID, flags=SCALAR_PAIR)
+    end do
+
+    do l = 1, num_fields
+       call mpp_update_domains( x2(:,:,:,l), y2(:,:,:,l), domain, gridtype=AGRID, flags=SCALAR_PAIR, &
+                              &  complete=l==num_fields)
+    enddo
+
+    call mpp_start_group_update(group_update, domain, x1(isc,jsc,1,1))
+    call mpp_complete_group_update(group_update, domain, x1(isc,jsc,1,1))
+
+    !--- compare checksum
+    do l = 1, num_fields
+       write(text, '(i3.3)') l
+       call compare_checksums(x1(isd:ied,jsd:jed,:,l),x2(isd:ied,jsd:jed,:,l),type//' AGRID SCALAR_PAIR X'//text)
+       call compare_checksums(y1(isd:ied,jsd:jed,:,l),y2(isd:ied,jsd:jed,:,l),type//' AGRID SCALAR_PAIR Y'//text)
+    enddo
+
+    call mpp_clear_group_update(group_update)
+
+    deallocate(pe_start, pe_end, tile1, tile2)
+    deallocate(istart1, iend1, jstart1, jend1)
+    deallocate(istart2, iend2, jstart2, jend2)
+    deallocate(layout2D, global_indices)
+
+    deallocate(a1, x1, y1)
+    deallocate(a2, x2, y2)
+    deallocate(base)
+    call mpp_deallocate_domain(domain)
+
+end subroutine test_group_update
+
 
   !###############################################################
   !--- This will test scalar and CGRID performance between halo=1 and halo=3
diff --git a/test_fms/mpp/test_mpp_domains2.sh b/test_fms/mpp/test_mpp_domains2.sh
index a9b26f2cd3..f23762be88 100755
--- a/test_fms/mpp/test_mpp_domains2.sh
+++ b/test_fms/mpp/test_mpp_domains2.sh
@@ -28,7 +28,7 @@
 . ../test-lib.sh
 
 # TODO: Enable this test once generalized indices work is complete
-SKIP_TESTS="test_mpp_domains2.12"
+SKIP_TESTS="test_mpp_domains2.15"
 
 # TODO edge update, fails on non-blocking with gnu
 #SKIP_TESTS="$SKIP_TESTS $(basename $0 .sh).6"
@@ -79,6 +79,7 @@ test_nonsym_edge = .false.
 test_halosize_performance = .false.
 test_adjoint = .false.
 wide_halo = .false.
+test_group_offload = .false.
 /
 _EOF
 
@@ -128,10 +129,6 @@ sed "s/test_unstruct = .false./test_unstruct = .true./" input_base.nml > input.n
 test_expect_success "unstruct" '
     mpirun -n 2 ../test_mpp_domains
 '
-sed "s/test_group = .false./test_group = .true./" input_base.nml > input.nml
-test_expect_success "group" '
-    mpirun -n 2 ../test_mpp_domains
-'
 sed "s/test_interface = .false./test_interface = .true./" input_base.nml > input.nml
 test_expect_success "interface" '
     mpirun -n 6 ../test_mpp_domains
@@ -144,5 +141,14 @@ sed "s/test_get_nbr = .false./test_get_nbr = .true./" input_base.nml > input.nml
 test_expect_success "get nbr" '
     mpirun -n 8 ../test_mpp_domains
 '
+sed "s/test_group = .false./test_group = .true./" input_base.nml > input.nml
+test_expect_success "group" '
+    mpirun -n 2 ../test_mpp_domains
+'
+# do the group update again, but with openmp offload flag
+sed "s/test_group_offload = .false./test_group_offload = .true./" input_base.nml > input.nml
+test_expect_success "group update with OpenMP offload" '
+    mpirun -n 2 ../test_mpp_domains
+'
 
 test_done