diff --git a/mpp/include/mpp_do_redistribute.fh b/mpp/include/mpp_do_redistribute.fh
index 5acf4bb314..475af25669 100644
--- a/mpp/include/mpp_do_redistribute.fh
+++ b/mpp/include/mpp_do_redistribute.fh
@@ -17,97 +17,384 @@
 !***********************************************************************
 !> @addtogroup mpp_domains_mod
 !> @{
-    subroutine MPP_DO_REDISTRIBUTE_3D_( f_in, f_out, d_comm, d_type )
-      integer(i8_kind), intent(in)         :: f_in(:), f_out(:)
-      type(DomainCommunicator2D), intent(in) :: d_comm
-      MPP_TYPE_, intent(in)                  :: d_type
-      MPP_TYPE_ :: field_in(d_comm%domain_in%x(1)%domain_data%begin:d_comm%domain_in%x(1)%domain_data%end, &
-                            d_comm%domain_in%y(1)%domain_data%begin:d_comm%domain_in%y(1)%domain_data%end,d_comm%ke)
-      pointer( ptr_field_in, field_in)
-      MPP_TYPE_ :: field_out(d_comm%domain_out%x(1)%domain_data%begin:d_comm%domain_out%x(1)%domain_data%end, &
-                             d_comm%domain_out%y(1)%domain_data%begin:d_comm%domain_out%y(1)%domain_data%end,d_comm%ke)
-      pointer( ptr_field_out, field_out)
-      type(domain2D), pointer :: domain_in, domain_out
-      integer :: i, j, k, l, n, l_size
-      integer :: is, ie, js, je
-      integer :: ke
-      integer :: list, pos, msgsize
-      integer :: to_pe, from_pe
-      MPP_TYPE_ :: buffer(size(mpp_domains_stack(:)))
-      pointer( ptr, buffer )
-      integer :: buffer_pos, wordlen, errunit
-
-!fix ke
-      errunit = stderr()
-      l_size = size(f_out(:))  ! equal to size(f_in(:))
-      ke = d_comm%ke
-      domain_in =>d_comm%domain_in; domain_out =>d_comm%domain_out
-
-      buffer_pos = 0
-      ptr = LOC(mpp_domains_stack)
-      wordlen = size(TRANSFER(buffer(1),mpp_domains_stack))
-
-!pre-post recv
-      n = d_comm%Rlist_size
-      do list = 0,n-1
-         if( .NOT. d_comm%R_do_buf(list) )cycle
-         from_pe = d_comm%cfrom_pe(list)
-         msgsize = d_comm%R_msize(list)*l_size
-         call mpp_recv( buffer(buffer_pos+1), glen=msgsize, from_pe=from_pe, block=.FALSE., tag=COMM_TAG_1 )
-         buffer_pos = buffer_pos + msgsize
+subroutine MPP_DO_REDISTRIBUTE_3D_( f_in, f_out, d_comm, d_type, axis_to_storage_in, &
+                                  & nchunks, chunk_shape_in, chunk_shape_out )
+
+  use iso_c_binding, only : c_ptr, c_null_ptr, c_f_pointer, c_intptr_t
+
+  integer(i8_kind), intent(in) :: f_in(:), f_out(:)
+  type(DomainCommunicator2D), intent(in) :: d_comm
+  MPP_TYPE_, intent(in) :: d_type
+
+  integer, intent(in), optional :: axis_to_storage_in(3)
+
+  ! One entry per public redistribute field.
+  !
+  ! 2D/3D field:
+  !   nchunks(field) = 1
+  !
+  ! 4D field:
+  !   nchunks(field) = size(field,4)
+  !
+  ! 5D field:
+  !   nchunks(field) = size(field,4) * size(field,5)
+  integer, intent(in) :: nchunks(:)
+
+  ! Storage shape of one contiguous 3D chunk for each public field.
+  !
+  ! First dimension is the storage-dimension selector.
+  ! Second dimension is the public field-list selector.
+  integer, intent(in) :: chunk_shape_in(:,:)
+  integer, intent(in) :: chunk_shape_out(:,:)
+
+  integer :: axis_to_storage(3)
+
+  type(c_ptr) :: p_in, p_out
+
+  MPP_TYPE_, pointer :: field_in(:,:,:)
+  MPP_TYPE_, pointer :: field_out(:,:,:)
+
+  integer :: shape3(3)
+
+  ! Logical-axis bounds:
+  !   1 = x
+  !   2 = y
+  !   3 = z
+  integer :: logical_lo(3), logical_hi(3)
+
+  ! Bounds reordered into storage-dimension order.
+  integer :: storage_lo(3), storage_hi(3)
+
+  integer(c_intptr_t) :: base_addr
+  integer(c_intptr_t) :: chunk_addr
+  integer(c_intptr_t) :: chunk_offset
+  integer(c_intptr_t) :: chunk_elements
+  integer(c_intptr_t) :: element_bytes
+
+  type(domain2D), pointer :: domain_in, domain_out
+
+  integer :: nfields
+  integer :: total_chunks
+
+  integer :: field
+  integer :: chunk
+  integer :: list
+  integer :: n
+
+  integer :: s1, s2, s3
+  integer :: is, ie, js, je
+  integer :: ke
+
+  integer :: pos
+  integer :: msgsize
+  integer :: buffer_pos
+
+  integer :: to_pe
+  integer :: from_pe
+
+  integer :: errunit
+
+  MPP_TYPE_ :: buffer(size(mpp_domains_stack(:)))
+  pointer(ptr, buffer)
+
+  !--------------------------------------------------------------------
+  ! Validate axis permutation and field metadata.
+  !--------------------------------------------------------------------
+  axis_to_storage = (/1,2,3/)
+  if (present(axis_to_storage_in)) axis_to_storage = axis_to_storage_in
+
+  if (any(axis_to_storage < 1) .OR. any(axis_to_storage > 3) .OR. &
+      axis_to_storage(1) == axis_to_storage(2) .OR.               &
+      axis_to_storage(1) == axis_to_storage(3) .OR.               &
+      axis_to_storage(2) == axis_to_storage(3)) then
+    call mpp_error(FATAL, &
+         'MPP_DO_REDISTRIBUTE_3D_: invalid axis_to_storage permutation')
+  endif
+
+  nfields = size(f_in)
+
+  if (size(f_out) /= nfields) then
+    call mpp_error(FATAL, &
+         'MPP_DO_REDISTRIBUTE_3D_: input/output field-list sizes differ')
+  endif
+
+  if (size(nchunks) /= nfields) then
+    call mpp_error(FATAL, &
+         'MPP_DO_REDISTRIBUTE_3D_: invalid nchunks size')
+  endif
+
+  if (size(chunk_shape_in,1) /= 3 .OR.  &
+      size(chunk_shape_out,1) /= 3 .OR. &
+      size(chunk_shape_in,2) /= nfields .OR. &
+      size(chunk_shape_out,2) /= nfields) then
+    call mpp_error(FATAL, &
+         'MPP_DO_REDISTRIBUTE_3D_: invalid chunk-shape metadata')
+  endif
+
+  if (any(nchunks < 1)) then
+    call mpp_error(FATAL, &
+         'MPP_DO_REDISTRIBUTE_3D_: chunk counts must be positive')
+  endif
+
+  if (any(chunk_shape_in < 1) .OR. any(chunk_shape_out < 1)) then
+    call mpp_error(FATAL, &
+         'MPP_DO_REDISTRIBUTE_3D_: chunk extents must be positive')
+  endif
+
+  total_chunks = sum(nchunks)
+
+  !--------------------------------------------------------------------
+  ! General setup.
+  !--------------------------------------------------------------------
+  errunit = stderr()
+
+  ke = d_comm%ke
+
+  domain_in  => d_comm%domain_in
+  domain_out => d_comm%domain_out
+
+  ptr = LOC(mpp_domains_stack)
+
+  ! LOC addresses are byte addresses on the supported targets.
+  ! STORAGE_SIZE returns the number of bits per scalar element.
+  element_bytes = int(storage_size(d_type) / 8, kind=c_intptr_t)
+
+  if (element_bytes < 1_c_intptr_t) then
+    call mpp_error(FATAL, &
+         'MPP_DO_REDISTRIBUTE_3D_: invalid element size')
+  endif
+
+  !--------------------------------------------------------------------
+  ! Pre-post receives.
+  !
+  ! R_msize contains the x*y*z message size for one logical 3D chunk.
+  !--------------------------------------------------------------------
+  buffer_pos = 0
+  n = d_comm%Rlist_size
+
+  do list = 0, n-1
+
+    if (.NOT. d_comm%R_do_buf(list)) cycle
+
+    from_pe = d_comm%cfrom_pe(list)
+    msgsize = d_comm%R_msize(list) * total_chunks
+
+    call mpp_recv( buffer(buffer_pos+1), glen=msgsize, &
+                 & from_pe=from_pe, block=.FALSE., tag=COMM_TAG_1 )
+
+    buffer_pos = buffer_pos + msgsize
+
+  enddo
+
+  !--------------------------------------------------------------------
+  ! Pack and send.
+  !--------------------------------------------------------------------
+  n = d_comm%Slist_size
+
+  do list = 0, n-1
+
+    if (.NOT. d_comm%S_do_buf(list)) cycle
+
+    to_pe = d_comm%cto_pe(list)
+
+    is = d_comm%sendis(1,list)
+    ie = d_comm%sendie(1,list)
+    js = d_comm%sendjs(1,list)
+    je = d_comm%sendje(1,list)
+
+    ! Logical bounds within the local input field.
+    logical_lo(1) = is - domain_in%x(1)%domain_data%begin + 1
+    logical_hi(1) = ie - domain_in%x(1)%domain_data%begin + 1
+
+    logical_lo(2) = js - domain_in%y(1)%domain_data%begin + 1
+    logical_hi(2) = je - domain_in%y(1)%domain_data%begin + 1
+
+    logical_lo(3) = 1
+    logical_hi(3) = ke
+
+    ! Map logical-axis bounds onto the storage dimensions.
+    storage_lo = 0
+    storage_hi = 0
+
+    storage_lo(axis_to_storage(1)) = logical_lo(1)
+    storage_hi(axis_to_storage(1)) = logical_hi(1)
+
+    storage_lo(axis_to_storage(2)) = logical_lo(2)
+    storage_hi(axis_to_storage(2)) = logical_hi(2)
+
+    storage_lo(axis_to_storage(3)) = logical_lo(3)
+    storage_hi(axis_to_storage(3)) = logical_hi(3)
+
+    pos = buffer_pos
+
+    do field = 1, nfields
+
+      shape3 = chunk_shape_in(:,field)
+
+      ! Each input chunk must represent the same logical x/y/z shape
+      ! used to initialize this communicator.
+      if (shape3(axis_to_storage(1)) /= d_comm%isize_in .OR. &
+          shape3(axis_to_storage(2)) /= d_comm%jsize_in .OR. &
+          shape3(axis_to_storage(3)) /= ke) then
+        call mpp_error(FATAL, &
+             'MPP_DO_REDISTRIBUTE_3D_: incompatible input chunk shape')
+      endif
+
+      chunk_elements = int(shape3(1), kind=c_intptr_t) * &
+                       int(shape3(2), kind=c_intptr_t) * &
+                       int(shape3(3), kind=c_intptr_t)
+
+      base_addr = int(f_in(field), kind=c_intptr_t)
+
+      do chunk = 1, nchunks(field)
+
+        ! Each chunk is one contiguous 3D slab.
+        chunk_offset = int(chunk-1, kind=c_intptr_t) * &
+                       chunk_elements * element_bytes
+
+        chunk_addr = base_addr + chunk_offset
+
+        p_in = transfer(chunk_addr, c_null_ptr)
+
+        call c_f_pointer(p_in, field_in, shape3)
+
+        ! Traverse in Fortran column-major storage order:
+        ! storage dim 1 varies fastest.
+        do s3 = storage_lo(3), storage_hi(3)
+          do s2 = storage_lo(2), storage_hi(2)
+            do s1 = storage_lo(1), storage_hi(1)
+
+              pos = pos + 1
+              buffer(pos) = field_in(s1,s2,s3)
+
+            enddo
+          enddo
+        enddo
+
+      enddo
+    enddo
+
+    if (debug) then
+      write(errunit,*) 'PE', pe, ' to PE ', to_pe, &
+                       'is,ie,js,je=', is, ie, js, je
+    endif
+
+    msgsize = pos - buffer_pos
+
+    if (msgsize /= d_comm%S_msize(list) * total_chunks) then
+      call mpp_error(FATAL, &
+           'MPP_DO_REDISTRIBUTE_3D_: packed send size mismatch')
+    endif
+
+    call mpp_send( buffer(buffer_pos+1), plen=msgsize, &
+                 & to_pe=to_pe, tag=COMM_TAG_1 )
+
+    buffer_pos = pos
+
+  enddo
+
+  call mpp_sync_self(check=EVENT_RECV)
+
+  !--------------------------------------------------------------------
+  ! Unpack receive buffers.
+  !--------------------------------------------------------------------
+  buffer_pos = 0
+  n = d_comm%Rlist_size
+
+  do list = 0, n-1
+
+    if (.NOT. d_comm%R_do_buf(list)) cycle
+
+    from_pe = d_comm%cfrom_pe(list)
+
+    is = d_comm%recvis(1,list)
+    ie = d_comm%recvie(1,list)
+    js = d_comm%recvjs(1,list)
+    je = d_comm%recvje(1,list)
+
+    if (debug) then
+      write(errunit,*) 'PE', pe, ' from PE ', from_pe, &
+                       'is,ie,js,je=', is, ie, js, je
+    endif
+
+    ! Logical bounds within the local output field.
+    logical_lo(1) = is - domain_out%x(1)%domain_data%begin + 1
+    logical_hi(1) = ie - domain_out%x(1)%domain_data%begin + 1
+
+    logical_lo(2) = js - domain_out%y(1)%domain_data%begin + 1
+    logical_hi(2) = je - domain_out%y(1)%domain_data%begin + 1
+
+    logical_lo(3) = 1
+    logical_hi(3) = ke
+
+    storage_lo = 0
+    storage_hi = 0
+
+    storage_lo(axis_to_storage(1)) = logical_lo(1)
+    storage_hi(axis_to_storage(1)) = logical_hi(1)
+
+    storage_lo(axis_to_storage(2)) = logical_lo(2)
+    storage_hi(axis_to_storage(2)) = logical_hi(2)
+
+    storage_lo(axis_to_storage(3)) = logical_lo(3)
+    storage_hi(axis_to_storage(3)) = logical_hi(3)
+
+    pos = buffer_pos
+
+    do field = 1, nfields
+
+      shape3 = chunk_shape_out(:,field)
+
+      ! Input and output horizontal extents may differ, but the output
+      ! chunk must match the output communicator decomposition.
+      if (shape3(axis_to_storage(1)) /= d_comm%isize_out .OR. &
+          shape3(axis_to_storage(2)) /= d_comm%jsize_out .OR. &
+          shape3(axis_to_storage(3)) /= ke) then
+        call mpp_error(FATAL, &
+             'MPP_DO_REDISTRIBUTE_3D_: incompatible output chunk shape')
+      endif
+
+      chunk_elements = int(shape3(1), kind=c_intptr_t) * &
+                       int(shape3(2), kind=c_intptr_t) * &
+                       int(shape3(3), kind=c_intptr_t)
+
+      base_addr = int(f_out(field), kind=c_intptr_t)
+
+      do chunk = 1, nchunks(field)
+
+        chunk_offset = int(chunk-1, kind=c_intptr_t) * &
+                       chunk_elements * element_bytes
+
+        chunk_addr = base_addr + chunk_offset
+
+        p_out = transfer(chunk_addr, c_null_ptr)
+
+        call c_f_pointer(p_out, field_out, shape3)
+
+        do s3 = storage_lo(3), storage_hi(3)
+          do s2 = storage_lo(2), storage_hi(2)
+            do s1 = storage_lo(1), storage_hi(1)
+
+              pos = pos + 1
+              field_out(s1,s2,s3) = buffer(pos)
+
+            enddo
+          enddo
+        enddo
+
       enddo
+    enddo
+
+    if (pos - buffer_pos /= d_comm%R_msize(list) * total_chunks) then
+      call mpp_error(FATAL, &
+           'MPP_DO_REDISTRIBUTE_3D_: unpacked receive size mismatch')
+    endif
+
+    buffer_pos = pos
+
+  enddo
+
+  call mpp_sync_self()
 
-!send
-      n = d_comm%Slist_size
-      do list = 0,n-1
-         if( .NOT. d_comm%S_do_buf(list) )cycle
-         to_pe = d_comm%cto_pe(list)
-         is=d_comm%sendis(1,list); ie=d_comm%sendie(1,list)
-         js=d_comm%sendjs(1,list); je=d_comm%sendje(1,list)
-         pos = buffer_pos
-         do l=1,l_size  ! loop over number of fields
-           ptr_field_in = f_in(l)
-           do k = 1,ke
-              do j = js,je
-                 do i = is,ie
-                    pos = pos+1
-                    buffer(pos) = field_in(i,j,k)
-                 end do
-              end do
-           end do
-         end do
-         if( debug )write( errunit,* )'PE', pe, ' to PE ', to_pe, 'is,ie,js,je=', is, ie, js, je
-         msgsize = pos - buffer_pos
-         call mpp_send( buffer(buffer_pos+1), plen=msgsize, to_pe=to_pe, tag=COMM_TAG_1  )
-         buffer_pos = pos
-      end do
-
-      call mpp_sync_self(check=EVENT_RECV)
-
-!unpack buffer
-      buffer_pos = 0
-      n = d_comm%Rlist_size
-      do list = 0,n-1
-         if( .NOT. d_comm%R_do_buf(list) )cycle
-         from_pe = d_comm%cfrom_pe(list)
-         is=d_comm%recvis(1,list); ie=d_comm%recvie(1,list)
-         js=d_comm%recvjs(1,list); je=d_comm%recvje(1,list)
-         if( debug )write( errunit,* )'PE', pe, ' from PE ', from_pe, 'is,ie,js,je=', is, ie, js, je
-         pos = buffer_pos
-         do l=1,l_size  ! loop over number of in/out fields
-           ptr_field_out = f_out(l)
-           do k = 1,ke
-              do j = js,je
-                 do i = is,ie
-                    pos = pos+1
-                    field_out(i,j,k) = buffer(pos)
-                 end do
-              end do
-           end do
-         end do
-         buffer_pos = pos
-      end do
-
-      call mpp_sync_self()
-    end subroutine MPP_DO_REDISTRIBUTE_3D_
+end subroutine MPP_DO_REDISTRIBUTE_3D_
 !> @}
diff --git a/mpp/include/mpp_domains_misc.inc b/mpp/include/mpp_domains_misc.inc
index fe13a75867..ff07d527ca 100644
--- a/mpp/include/mpp_domains_misc.inc
+++ b/mpp/include/mpp_domains_misc.inc
@@ -988,6 +988,8 @@ end subroutine init_nonblock_type
 #undef MPP_UPDATE_DOMAINS_5D_V_
 #define MPP_UPDATE_DOMAINS_5D_V_ mpp_update_domain2D_r8_5Dv
 #endif
+#undef MPP_REDISTRIBUTE_REGISTER_
+#define MPP_REDISTRIBUTE_REGISTER_ mpp_redistribute_register_r8
 #undef MPP_REDISTRIBUTE_2D_
 #define MPP_REDISTRIBUTE_2D_ mpp_redistribute_r8_2D
 #undef MPP_REDISTRIBUTE_3D_
@@ -1010,6 +1012,8 @@ end subroutine init_nonblock_type
 #define MPP_UPDATE_DOMAINS_4D_ mpp_update_domain2D_c8_4D
 #undef MPP_UPDATE_DOMAINS_5D_
 #define MPP_UPDATE_DOMAINS_5D_ mpp_update_domain2D_c8_5D
+#undef MPP_REDISTRIBUTE_REGISTER_
+#define MPP_REDISTRIBUTE_REGISTER_ mpp_redistribute_register_c8
 #undef MPP_REDISTRIBUTE_2D_
 #define MPP_REDISTRIBUTE_2D_ mpp_redistribute_c8_2D
 #undef MPP_REDISTRIBUTE_3D_
@@ -1031,6 +1035,8 @@ end subroutine init_nonblock_type
 #define MPP_UPDATE_DOMAINS_4D_ mpp_update_domain2D_i8_4D
 #undef MPP_UPDATE_DOMAINS_5D_
 #define MPP_UPDATE_DOMAINS_5D_ mpp_update_domain2D_i8_5D
+#undef MPP_REDISTRIBUTE_REGISTER_
+#define MPP_REDISTRIBUTE_REGISTER_ mpp_redistribute_register_i8
 #undef MPP_REDISTRIBUTE_2D_
 #define MPP_REDISTRIBUTE_2D_ mpp_redistribute_i8_2D
 #undef MPP_REDISTRIBUTE_3D_
@@ -1062,6 +1068,8 @@ end subroutine init_nonblock_type
 #define MPP_UPDATE_DOMAINS_4D_V_ mpp_update_domain2D_r4_4Dv
 #undef MPP_UPDATE_DOMAINS_5D_V_
 #define MPP_UPDATE_DOMAINS_5D_V_ mpp_update_domain2D_r4_5Dv
+#undef MPP_REDISTRIBUTE_REGISTER_
+#define MPP_REDISTRIBUTE_REGISTER_ mpp_redistribute_register_r4
 #undef MPP_REDISTRIBUTE_2D_
 #define MPP_REDISTRIBUTE_2D_ mpp_redistribute_r4_2D
 #undef MPP_REDISTRIBUTE_3D_
@@ -1085,6 +1093,8 @@ end subroutine init_nonblock_type
 #define MPP_UPDATE_DOMAINS_4D_ mpp_update_domain2D_c4_4D
 #undef MPP_UPDATE_DOMAINS_5D_
 #define MPP_UPDATE_DOMAINS_5D_ mpp_update_domain2D_c4_5D
+#undef MPP_REDISTRIBUTE_REGISTER_
+#define MPP_REDISTRIBUTE_REGISTER_ mpp_redistribute_register_c4
 #undef MPP_REDISTRIBUTE_2D_
 #define MPP_REDISTRIBUTE_2D_ mpp_redistribute_c4_2D
 #undef MPP_REDISTRIBUTE_3D_
@@ -1106,6 +1116,8 @@ end subroutine init_nonblock_type
 #define MPP_UPDATE_DOMAINS_4D_ mpp_update_domain2D_i4_4D
 #undef MPP_UPDATE_DOMAINS_5D_
 #define MPP_UPDATE_DOMAINS_5D_ mpp_update_domain2D_i4_5D
+#undef MPP_REDISTRIBUTE_REGISTER_
+#define MPP_REDISTRIBUTE_REGISTER_ mpp_redistribute_register_i4
 #undef MPP_REDISTRIBUTE_2D_
 #define MPP_REDISTRIBUTE_2D_ mpp_redistribute_i4_2D
 #undef MPP_REDISTRIBUTE_3D_
diff --git a/mpp/include/mpp_update_domains2D.fh b/mpp/include/mpp_update_domains2D.fh
index b9b0a95e09..ebd90c6f7e 100644
--- a/mpp/include/mpp_update_domains2D.fh
+++ b/mpp/include/mpp_update_domains2D.fh
@@ -216,179 +216,422 @@
       return
     end subroutine MPP_UPDATE_DOMAINS_5D_
 
+    subroutine MPP_REDISTRIBUTE_REGISTER_( domain_in, addr_in, shape_in, domain_out, addr_out, shape_out, &
+                                         & nchunks, complete, free, list_size, dc_handle, position, axis_to_storage_in)
+      type(domain2D), intent(in) :: domain_in, domain_out
+      integer(i8_kind), intent(in) :: addr_in, addr_out
+      integer, intent(in) :: shape_in(3), shape_out(3)
+      integer, intent(in) :: nchunks
+
+      logical, intent(in), optional :: complete, free
+      integer, intent(in), optional :: list_size
+      integer, intent(in), optional :: position
+      integer, intent(in) :: axis_to_storage_in(3)
+
+      type(DomainCommunicator2D), pointer, optional :: dc_handle
+      type(DomainCommunicator2D), pointer, save :: d_comm => NULL()
+
+      integer(i8_kind), save :: l_addrs_in(MAX_DOMAIN_FIELDS)  = -9999
+      integer(i8_kind), save :: l_addrs_out(MAX_DOMAIN_FIELDS) = -9999
+
+      integer, save :: l_nchunks(MAX_DOMAIN_FIELDS) = 1
+      integer, save :: l_shape_in(3,MAX_DOMAIN_FIELDS)  = 0
+      integer, save :: l_shape_out(3,MAX_DOMAIN_FIELDS) = 0
+
+      integer, save :: l_size = 0
+
+      integer, save :: isize_in  = 0
+      integer, save :: jsize_in  = 0
+      integer, save :: ke_in     = 0
+      integer, save :: isize_out = 0
+      integer, save :: jsize_out = 0
+      integer, save :: ke_out    = 0
+
+      integer, save :: saved_axis_to_storage(3) = (/1,2,3/)
+
+      integer :: axis_to_storage(3)
+      integer :: xdim, ydim, zdim
+      integer :: ke, lsize
+
+      logical :: do_redist
+      logical :: free_comm
+      logical :: set_mismatch
+
+      character(len=16) :: text
+      MPP_TYPE_ :: d_type
+
+      axis_to_storage = axis_to_storage_in
+
+      xdim = axis_to_storage(1)
+      ydim = axis_to_storage(2)
+      zdim = axis_to_storage(3)
+
+      if (nchunks < 1) then
+        call mpp_error(FATAL, 'MPP_REDISTRIBUTE_REGISTER_3D_: nchunks must be positive')
+      endif
+
+      if (present(position)) then
+        if (position /= CENTER) then
+          call mpp_error(FATAL, 'MPP_REDISTRIBUTE_REGISTER_3D_: only position=CENTER is implemented')
+        endif
+      endif
+
+      do_redist = .true.
+      if (present(complete)) do_redist = complete
+
+      free_comm = .false.
+      if (present(free)) free_comm = free
+
+      if (free_comm) then
+
+        if (addr_out > 0) then
+          ke = shape_out(zdim)
+        else
+          ke = shape_in(zdim)
+        endif
+
+        lsize = 1
+        if (present(list_size)) lsize = list_size
+
+        call mpp_redistribute_free_comm( domain_in,  addr_in, domain_out, addr_out, ke, lsize )
+
+        return
+      endif
+
+      l_size = l_size + 1
+
+      if (l_size > MAX_DOMAIN_FIELDS) then
+        write(text,'(i0)') MAX_DOMAIN_FIELDS
+        call mpp_error(FATAL, 'MPP_REDISTRIBUTE_REGISTER_3D_: MAX_DOMAIN_FIELDS=' // trim(text) // ' exceeded')
+      endif
+
+      l_addrs_in(l_size)  = addr_in
+      l_addrs_out(l_size) = addr_out
+
+      l_nchunks(l_size)    = nchunks
+      l_shape_in(:,l_size) = shape_in
+      l_shape_out(:,l_size) = shape_out
+
+      if (l_size == 1) then
+
+        saved_axis_to_storage = axis_to_storage
+
+        if (addr_in > 0) then
+          isize_in = shape_in(xdim)
+          jsize_in = shape_in(ydim)
+          ke_in    = shape_in(zdim)
+        endif
+
+        if (addr_out > 0) then
+          isize_out = shape_out(xdim)
+          jsize_out = shape_out(ydim)
+          ke_out    = shape_out(zdim)
+        endif
+
+      else
+
+        set_mismatch = any(axis_to_storage /= saved_axis_to_storage)
+
+        set_mismatch = set_mismatch .OR. (addr_in == 0 .AND. l_addrs_in(l_size-1) /= 0)
+        set_mismatch = set_mismatch .OR. (addr_in > 0 .AND. l_addrs_in(l_size-1) == 0)
+        set_mismatch = set_mismatch .OR. (addr_out == 0 .AND. l_addrs_out(l_size-1) /= 0)
+        set_mismatch = set_mismatch .OR. (addr_out > 0 .AND. l_addrs_out(l_size-1) == 0)
+
+        if (addr_in > 0) then
+          set_mismatch = set_mismatch .OR. (isize_in /= shape_in(xdim))
+          set_mismatch = set_mismatch .OR. (jsize_in /= shape_in(ydim))
+          set_mismatch = set_mismatch .OR. (ke_in /= shape_in(zdim))
+        endif
+
+        if (addr_out > 0) then
+          set_mismatch = set_mismatch .OR. (isize_out /= shape_out(xdim))
+          set_mismatch = set_mismatch .OR. (jsize_out /= shape_out(ydim))
+          set_mismatch = set_mismatch .OR. (ke_out /= shape_out(zdim))
+        endif
+
+        if (set_mismatch) then
+          write(text,'(i0)') l_size
+          call mpp_error(FATAL, 'MPP_REDISTRIBUTE_REGISTER_3D_: incompatible grouped field at count ' // trim(text))
+        endif
+
+      endif
+
+      if (.not. do_redist) return
+
+      if (present(dc_handle)) d_comm => dc_handle
+
+      if (.not. associated(d_comm)) then
+
+         d_comm => mpp_redistribute_init_comm(domain_in,  l_addrs_in(1:l_size), domain_out, l_addrs_out(1:l_size), &
+                                  isize_in,  jsize_in,  ke_in, isize_out, jsize_out, ke_out )
+
+        if (present(dc_handle)) dc_handle => d_comm
+
+      endif
+
+      call mpp_do_redistribute(l_addrs_in(1:l_size), l_addrs_out(1:l_size), d_comm, d_type,            &
+             &  axis_to_storage_in = axis_to_storage, nchunks = l_nchunks(1:l_size),                   &
+             &  chunk_shape_in = l_shape_in(:,1:l_size), chunk_shape_out = l_shape_out(:,1:l_size) )
+
+      !--------------------------------------------------------------------
+      ! Reset grouped state.
+      !--------------------------------------------------------------------
+      l_size      = 0
+      l_addrs_in  = -9999
+      l_addrs_out = -9999
+
+      l_nchunks   = 1
+      l_shape_in  = 0
+      l_shape_out = 0
+
+      isize_in  = 0
+      jsize_in  = 0
+      ke_in     = 0
+
+      isize_out = 0
+      jsize_out = 0
+      ke_out    = 0
+
+      saved_axis_to_storage = (/1,2,3/)
+      d_comm => NULL()
+
+    end subroutine MPP_REDISTRIBUTE_REGISTER_
+
     subroutine MPP_REDISTRIBUTE_2D_( domain_in, field_in, domain_out, field_out, complete, free, list_size, &
-                                   &  dc_handle, position )
+                                   & dc_handle, position, axis_to_storage_in )
       type(domain2D), intent(in) :: domain_in, domain_out
-      MPP_TYPE_, intent(in)  :: field_in (:,:)
-      MPP_TYPE_, intent(out) :: field_out(:,:)
+      MPP_TYPE_, intent(in)     :: field_in(:,:)
+      MPP_TYPE_, intent(out)    :: field_out(:,:)
+
       logical, intent(in), optional :: complete, free
       integer, intent(in), optional :: list_size
       integer, intent(in), optional :: position
-      MPP_TYPE_ :: field3D_in (size(field_in, 1),size(field_in, 2),1)
-      MPP_TYPE_ :: field3D_out(size(field_out,1),size(field_out,2),1)
-      type(DomainCommunicator2D),pointer,optional :: dc_handle
-      pointer( ptr_in,  field3D_in  )
-      pointer( ptr_out, field3D_out )
-
-      field_out = 0
-      ptr_in = 0
-      ptr_out = 0
-      if(domain_in%initialized) ptr_in  = LOC(field_in )
-      if(domain_out%initialized) ptr_out = LOC(field_out)
-      call mpp_redistribute( domain_in, field3D_in, domain_out, field3D_out, complete, free, list_size, &
-                           &  dc_handle, position )
+      integer, intent(in), optional :: axis_to_storage_in(2)
+
+      type(DomainCommunicator2D), pointer, optional :: dc_handle
+
+      integer(i8_kind) :: addr_in, addr_out
+      integer :: shape_in(3), shape_out(3)
+      integer :: axis_to_storage(3)
+      integer :: nchunks
+
+      ! Logical x/y may occupy storage dims 1 and 2 in either order.
+      ! The third dimension is an internal singleton carried dimension.
+      axis_to_storage = (/1,2,3/)
+      if (present(axis_to_storage_in)) then
+        if (any(axis_to_storage_in < 1) .OR. any(axis_to_storage_in > 2) .OR. &
+            axis_to_storage_in(1) == axis_to_storage_in(2)) then
+          call mpp_error(FATAL, &
+               'MPP_REDISTRIBUTE_2D_: invalid axis_to_storage permutation')
+        endif
+
+        axis_to_storage(1:2) = axis_to_storage_in
+      endif
 
-      return
-    end subroutine MPP_REDISTRIBUTE_2D_
+      nchunks = 1
+
+      addr_in  = 0
+      addr_out = 0
 
+      if (domain_in%initialized)  addr_in  = LOC(field_in)
+      if (domain_out%initialized) addr_out = LOC(field_out)
+
+      shape_in = (/ size(field_in,1),  &
+                  & size(field_in,2),  &
+                  & 1 /)
+
+      shape_out = (/ size(field_out,1),  &
+                   & size(field_out,2),  &
+                   & 1 /)
+
+      if (domain_out%initialized) field_out = 0
+
+      call MPP_REDISTRIBUTE_REGISTER_( domain_in = domain_in, addr_in = addr_in, shape_in = shape_in,       &
+                                     & domain_out = domain_out, addr_out = addr_out, shape_out = shape_out, &
+                                     & nchunks = nchunks, complete = complete, free = free,                 &
+                                     & list_size = list_size, dc_handle = dc_handle, position = position,   &
+                                     & axis_to_storage_in = axis_to_storage )
+
+    end subroutine MPP_REDISTRIBUTE_2D_
 
     subroutine MPP_REDISTRIBUTE_3D_( domain_in, field_in, domain_out, field_out, complete, free, list_size, &
-                                   &  dc_handle, position )
+                                   & dc_handle, position, axis_to_storage_in )
       type(domain2D), intent(in) :: domain_in, domain_out
-      MPP_TYPE_, intent(in)  :: field_in (:,:,:)
-      MPP_TYPE_, intent(out) :: field_out(:,:,:)
+      MPP_TYPE_, intent(in)     :: field_in(:,:,:)
+      MPP_TYPE_, intent(out)    :: field_out(:,:,:)
+
       logical, intent(in), optional :: complete, free
       integer, intent(in), optional :: list_size
       integer, intent(in), optional :: position
-      type(DomainCommunicator2D),pointer,optional :: dc_handle
-      type(DomainCommunicator2D),pointer,save :: d_comm =>NULL()
-      logical                       :: do_redist,free_comm
-      integer                       :: lsize
-      integer(i8_kind),dimension(MAX_DOMAIN_FIELDS),save :: l_addrs_in=-9999, l_addrs_out=-9999
-      integer, save :: isize_in=0,jsize_in=0,ke_in=0,l_size=0
-      integer, save :: isize_out=0,jsize_out=0,ke_out=0
-      logical       :: set_mismatch
-      integer       :: ke
-      character(len=2) :: text
-      MPP_TYPE_ :: d_type
-      integer(i8_kind) :: floc_in, floc_out
+      integer, intent(in), optional :: axis_to_storage_in(3)
 
-      floc_in = 0
-      floc_out = 0
-      if(domain_in%initialized) floc_in = LOC(field_in)
-      if(domain_out%initialized) floc_out = LOC(field_out)
+      type(DomainCommunicator2D), pointer, optional :: dc_handle
 
-      if(present(position)) then
-         if(position .NE. CENTER) call mpp_error( FATAL,  &
-              'MPP_REDISTRIBUTE_3Dold_: only position = CENTER is implemented, contact author')
-      endif
+      integer(i8_kind) :: addr_in, addr_out
+      integer :: shape_in(3), shape_out(3)
+      integer :: axis_to_storage(3)
+      integer :: nchunks
 
-      do_redist=.true.; if(PRESENT(complete))do_redist=complete
-      free_comm=.false.; if(PRESENT(free))free_comm=free
-      if(free_comm)then
-         l_addrs_in(1) = floc_in; l_addrs_out(1) = floc_out
-         if(l_addrs_out(1)>0)then
-            ke = size(field_out,3)
-         else
-            ke = size(field_in,3)
-         end if
-         lsize=1; if(PRESENT(list_size))lsize=list_size
-         call mpp_redistribute_free_comm(domain_in,l_addrs_in(1),domain_out,l_addrs_out(1),ke,lsize)
-      else
-         l_size = l_size+1
-         if(l_size > MAX_DOMAIN_FIELDS)then
-            write( text,'(i2)' ) MAX_DOMAIN_FIELDS
-            call mpp_error(FATAL,'MPP_REDISTRIBUTE_3D: MAX_DOMAIN_FIELDS='//text//' exceeded for group redistribute.' )
-         end if
-         l_addrs_in(l_size) = floc_in; l_addrs_out(l_size) = floc_out
-         if(l_size == 1)then
-            if(l_addrs_in(l_size) > 0)then
-               isize_in=size(field_in,1); jsize_in=size(field_in,2); ke_in = size(field_in,3)
-            end if
-            if(l_addrs_out(l_size) > 0)then
-               isize_out=size(field_out,1); jsize_out=size(field_out,2); ke_out = size(field_out,3)
-            endif
-         else
-            set_mismatch = .false.
-            set_mismatch = l_addrs_in(l_size) == 0 .AND. l_addrs_in(l_size-1) /= 0
-            set_mismatch = set_mismatch .OR. (l_addrs_in(l_size) > 0 .AND. l_addrs_in(l_size-1) == 0)
-            set_mismatch = set_mismatch .OR. (l_addrs_out(l_size) == 0 .AND. l_addrs_out(l_size-1) /= 0)
-            set_mismatch = set_mismatch .OR. (l_addrs_out(l_size) > 0 .AND. l_addrs_out(l_size-1) == 0)
-            if(l_addrs_in(l_size) > 0)then
-               set_mismatch = set_mismatch .OR. (isize_in /= size(field_in,1))
-               set_mismatch = set_mismatch .OR. (jsize_in /= size(field_in,2))
-               set_mismatch = set_mismatch .OR. (ke_in /= size(field_in,3))
-            endif
-            if(l_addrs_out(l_size) > 0)then
-               set_mismatch = set_mismatch .OR. (isize_out /= size(field_out,1))
-               set_mismatch = set_mismatch .OR. (jsize_out /= size(field_out,2))
-               set_mismatch = set_mismatch .OR. (ke_out /= size(field_out,3))
-            endif
-            if(set_mismatch)then
-               write( text,'(i2)' ) l_size
-               call mpp_error(FATAL,'MPP_REDISTRIBUTE_3D: Incompatible field at count '// &
-                              & text//' for group redistribute.' )
-            endif
-         endif
-         if(do_redist)then
-            if(PRESENT(dc_handle))d_comm =>dc_handle  ! User has kept pointer to d_comm
-            if(.not.ASSOCIATED(d_comm))then  ! d_comm needs initialization or lookup
-               d_comm =>mpp_redistribute_init_comm(domain_in,l_addrs_in(1:l_size),domain_out,l_addrs_out(1:l_size), &
-                    isize_in,jsize_in,ke_in,isize_out,jsize_out,ke_out)
-               if(PRESENT(dc_handle))dc_handle =>d_comm  ! User wants to keep pointer to d_comm
-            endif
-            call mpp_do_redistribute( l_addrs_in(1:l_size), l_addrs_out(1:l_size), d_comm, d_type )
-            l_size=0; l_addrs_in=-9999; l_addrs_out=-9999
-            isize_in=0;  jsize_in=0;  ke_in=0
-            isize_out=0; jsize_out=0; ke_out=0
-            d_comm =>NULL()
-         endif
+      axis_to_storage = (/1,2,3/)
+      if (present(axis_to_storage_in)) axis_to_storage = axis_to_storage_in
+
+      if (any(axis_to_storage < 1) .OR. any(axis_to_storage > 3) .OR. &
+          axis_to_storage(1) == axis_to_storage(2) .OR.               &
+          axis_to_storage(1) == axis_to_storage(3) .OR.               &
+          axis_to_storage(2) == axis_to_storage(3)) then
+        call mpp_error(FATAL, 'MPP_REDISTRIBUTE_3D_: invalid axis_to_storage permutation')
       endif
 
-    end subroutine MPP_REDISTRIBUTE_3D_
+      nchunks = 1
+
+      addr_in  = 0
+      addr_out = 0
+
+      if (domain_in%initialized)  addr_in  = LOC(field_in)
+      if (domain_out%initialized) addr_out = LOC(field_out)
 
+      shape_in = (/ size(field_in,1),  &
+                  & size(field_in,2),  &
+                  & size(field_in,3) /)
+
+      shape_out = (/ size(field_out,1),  &
+                   & size(field_out,2),  &
+                   & size(field_out,3) /)
+
+      if (domain_out%initialized) field_out = 0
+
+      call MPP_REDISTRIBUTE_REGISTER_( domain_in = domain_in, addr_in = addr_in, shape_in = shape_in,           &
+                                     & domain_out = domain_out, addr_out = addr_out, shape_out = shape_out,     &
+                                     & nchunks = nchunks, complete = complete, free = free,                     &
+                                     & list_size = list_size, dc_handle = dc_handle, position = position,       &
+                                     & axis_to_storage_in = axis_to_storage )
+
+    end subroutine MPP_REDISTRIBUTE_3D_
 
     subroutine MPP_REDISTRIBUTE_4D_( domain_in, field_in, domain_out, field_out, complete, free, list_size, &
-                                   &  dc_handle, position )
+                                   & dc_handle, position, axis_to_storage_in )
       type(domain2D), intent(in) :: domain_in, domain_out
-      MPP_TYPE_, intent(in)  :: field_in (:,:,:,:)
-      MPP_TYPE_, intent(out) :: field_out(:,:,:,:)
+      MPP_TYPE_, intent(in)     :: field_in(:,:,:,:)
+      MPP_TYPE_, intent(out)    :: field_out(:,:,:,:)
+
       logical, intent(in), optional :: complete, free
       integer, intent(in), optional :: list_size
       integer, intent(in), optional :: position
-      MPP_TYPE_ :: field3D_in (size(field_in, 1),size(field_in, 2),size(field_in ,3)*size(field_in ,4))
-      MPP_TYPE_ :: field3D_out(size(field_out,1),size(field_out,2),size(field_out,3)*size(field_out,4))
-      type(DomainCommunicator2D),pointer,optional :: dc_handle
-      pointer( ptr_in,  field3D_in  )
-      pointer( ptr_out, field3D_out )
-
-      field_out = 0
-      ptr_in = 0
-      ptr_out = 0
-      if(domain_in%initialized) ptr_in  = LOC(field_in )
-      if(domain_out%initialized) ptr_out = LOC(field_out)
-      call mpp_redistribute( domain_in, field3D_in, domain_out, field3D_out, complete, free, list_size, &
-                           &  dc_handle, position  )
+      integer, intent(in), optional :: axis_to_storage_in(3)
+
+      type(DomainCommunicator2D), pointer, optional :: dc_handle
+
+      integer(i8_kind) :: addr_in, addr_out
+      integer :: shape_in(3), shape_out(3)
+      integer :: axis_to_storage(3)
+      integer :: nchunks
+
+      axis_to_storage = (/1,2,3/)
+      if (present(axis_to_storage_in)) axis_to_storage = axis_to_storage_in
+
+      nchunks = size(field_in,4)
+
+      if (nchunks /= size(field_out,4)) then
+        call mpp_error(FATAL, 'MPP_REDISTRIBUTE_4D_: input and output fourth extents differ')
+      endif
+
+      if (nchunks < 1) then
+        call mpp_error(FATAL, 'MPP_REDISTRIBUTE_4D_: fourth dimension must be nonempty')
+      endif
+
+      addr_in  = 0
+      addr_out = 0
+
+      if (domain_in%initialized)  addr_in  = LOC(field_in)
+      if (domain_out%initialized) addr_out = LOC(field_out)
+
+      ! Each fixed-fourth-index section field(:,:,:,l) is one contiguous
+      !  3D chunk. Logical x/y/z may be mapped to any of these three
+      !  storage dimensions through axis_to_storage.
+      shape_in = (/ size(field_in,1),  &
+                  & size(field_in,2),  &
+                  & size(field_in,3) /)
+
+      shape_out = (/ size(field_out,1),  &
+                   & size(field_out,2),  &
+                   & size(field_out,3) /)
+
+      if (domain_out%initialized) field_out = 0
+
+      call MPP_REDISTRIBUTE_REGISTER_( domain_in = domain_in, addr_in = addr_in, shape_in = shape_in,        &
+                                     & domain_out = domain_out, addr_out = addr_out, shape_out = shape_out,  &
+                                     & nchunks = nchunks, complete = complete, free = free,                  &
+                                     & list_size = list_size, dc_handle = dc_handle, position = position,    &
+                                     & axis_to_storage_in  = axis_to_storage )
 
-      return
     end subroutine MPP_REDISTRIBUTE_4D_
 
     subroutine MPP_REDISTRIBUTE_5D_( domain_in, field_in, domain_out, field_out, complete, free, list_size, &
-                                   &  dc_handle, position )
+                                   & dc_handle, position, axis_to_storage_in )
       type(domain2D), intent(in) :: domain_in, domain_out
-      MPP_TYPE_, intent(in)  :: field_in (:,:,:,:,:)
-      MPP_TYPE_, intent(out) :: field_out(:,:,:,:,:)
+      MPP_TYPE_, intent(in)     :: field_in(:,:,:,:,:)
+      MPP_TYPE_, intent(out)    :: field_out(:,:,:,:,:)
+
       logical, intent(in), optional :: complete, free
       integer, intent(in), optional :: list_size
       integer, intent(in), optional :: position
-      MPP_TYPE_ :: field3D_in (size(field_in, 1),size(field_in, 2), &
-                             & size(field_in ,3)*size(field_in,4)*size(field_in ,5))
-      MPP_TYPE_ :: field3D_out(size(field_out,1),size(field_out,2), &
-                             & size(field_out,3)*size(field_out,4)*size(field_out,5))
-
-      type(DomainCommunicator2D),pointer,optional :: dc_handle
-      pointer( ptr_in,  field3D_in  )
-      pointer( ptr_out, field3D_out )
-
-      field_out = 0
-      ptr_in = 0
-      ptr_out = 0
-      if(domain_in%initialized) ptr_in  = LOC(field_in )
-      if(domain_out%initialized) ptr_out = LOC(field_out)
-      call mpp_redistribute( domain_in, field3D_in, domain_out, field3D_out, complete, free, list_size, &
-                           &  dc_handle, position  )
+      integer, intent(in), optional :: axis_to_storage_in(3)
+
+      type(DomainCommunicator2D), pointer, optional :: dc_handle
+
+      integer(i8_kind) :: addr_in, addr_out
+      integer :: shape_in(3), shape_out(3)
+      integer :: axis_to_storage(3)
+      integer :: nchunks
+
+      axis_to_storage = (/1,2,3/)
+      if (present(axis_to_storage_in)) axis_to_storage = axis_to_storage_in
+
+      if (any(axis_to_storage < 1) .OR. any(axis_to_storage > 3) .OR. &
+          axis_to_storage(1) == axis_to_storage(2) .OR.               &
+          axis_to_storage(1) == axis_to_storage(3) .OR.               &
+          axis_to_storage(2) == axis_to_storage(3)) then
+        call mpp_error(FATAL, 'MPP_REDISTRIBUTE_5D_: invalid axis_to_storage permutation')
+      endif
+
+      if (size(field_in,4) /= size(field_out,4)) then
+        call mpp_error(FATAL, 'MPP_REDISTRIBUTE_5D_: input and output fourth extents differ')
+      endif
+
+      if (size(field_in,5) /= size(field_out,5)) then
+        call mpp_error(FATAL, 'MPP_REDISTRIBUTE_5D_: input and output fifth extents differ')
+      endif
+
+      if (size(field_in,4) < 1 .OR. size(field_in,5) < 1) then
+        call mpp_error(FATAL, 'MPP_REDISTRIBUTE_5D_: fourth and fifth dimensions must be nonempty')
+      endif
+
+      nchunks = size(field_in,4) * size(field_in,5)
+
+      addr_in  = 0
+      addr_out = 0
+
+      if (domain_in%initialized)  addr_in  = LOC(field_in)
+      if (domain_out%initialized) addr_out = LOC(field_out)
+
+      ! Each fixed-(fourth,fifth)-index section field(:,:,:,l,m)
+      ! is one contiguous 3D chunk.
+      shape_in = (/ size(field_in,1),  &
+                  & size(field_in,2),  &
+                  & size(field_in,3) /)
+
+      shape_out = (/ size(field_out,1),  &
+                   & size(field_out,2),  &
+                   & size(field_out,3) /)
+
+      if (domain_out%initialized) field_out = 0
+
+      call MPP_REDISTRIBUTE_REGISTER_( domain_in = domain_in, addr_in = addr_in, shape_in = shape_in,           &
+                                     & domain_out = domain_out, addr_out = addr_out, shape_out = shape_out,     &
+                                     & nchunks = nchunks, complete = complete, free = free,                     &
+                                     & list_size = list_size, dc_handle = dc_handle, position = position,       &
+                                     & axis_to_storage_in = axis_to_storage )
 
-      return
     end subroutine MPP_REDISTRIBUTE_5D_
 
 #ifdef VECTOR_FIELD_
diff --git a/test_fms/mpp/test_mpp_domains.F90 b/test_fms/mpp/test_mpp_domains.F90
index e681173944..c6cbaebd8f 100644
--- a/test_fms/mpp/test_mpp_domains.F90
+++ b/test_fms/mpp/test_mpp_domains.F90
@@ -83,6 +83,7 @@ program test_mpp_domains
   logical :: test_nonsym_edge = .false.
   logical :: test_group = .false.
   logical :: test_cubic_grid_redistribute = .false.
+  logical :: test_cubic_grid_redistribute_generalized_indices = .false.
   logical :: check_parallel = .FALSE.
   logical :: test_get_nbr = .FALSE.
   logical :: test_boundary = .false.
@@ -109,7 +110,8 @@ program test_mpp_domains
                                warn_level, wide_halo_x, wide_halo_y, nx_cubic, ny_cubic, &
                                test_performance, test_interface, num_fields, do_sleep, num_iter, &
                                mix_2D_3D, test_get_nbr, test_edge_update, &
-                               test_cubic_grid_redistribute, ensemble_size, layout_cubic, &
+                               test_cubic_grid_redistribute, test_cubic_grid_redistribute_generalized_indices, &
+                               ensemble_size, layout_cubic, &
                                layout_ensemble, nthreads, test_boundary, layout_tripolar, &
                                test_group, test_global_sum, test_subset, test_nonsym_edge, &
                                test_halosize_performance, test_adjoint, wide_halo, &
@@ -202,6 +204,16 @@ program test_mpp_domains
          &  '--------------------> Finished cubic_grid_redistribute <-------------------'
   endif
 
+  if( test_cubic_grid_redistribute_generalized_indices ) then
+      if (mpp_pe() == mpp_root_pe())  print *, &
+         &  '--------------------> Calling cubic_grid_redistribute_generalized_indices <-------------------'
+      if (mpp_pe() == mpp_root_pe())  print *, &
+         &  'Storage Layout = (3,1,2)'
+     call cubic_grid_redistribute((/3,1,2/))
+      if (mpp_pe() == mpp_root_pe())  print *, &
+         &  '--------------------> Finished cubic_grid_redistribute_generalized_indices <-------------------'
+  endif
+
   if(test_boundary) then
       if (mpp_pe() == mpp_root_pe())  print *, '--------------------> Calling test_boundary <-------------------'
       call test_get_boundary('torus')
@@ -598,7 +610,8 @@ end subroutine test_redistribute
 
   !#######################################################################
 
-  subroutine cubic_grid_redistribute
+  subroutine cubic_grid_redistribute(axis_to_storage_in)
+     integer, intent(in), optional :: axis_to_storage_in(3)
 
      integer              :: npes, npes_per_ensemble, npes_per_tile
      integer              :: ensemble_id, tile_id, ensemble_tile_id
@@ -607,6 +620,8 @@ subroutine cubic_grid_redistribute
      integer              :: isd_ens, ied_ens, jsd_ens, jed_ens
      integer              :: isc, iec, jsc, jec
      integer              :: isd, ied, jsd, jed
+     integer              :: sc(3), ec(3), sc_ens(3), ec_ens(3)
+     integer              :: sd(3), ed(3), sd_ens(3), ed_ens(3)
      integer, allocatable :: my_ensemble_pelist(:), pe_start(:), pe_end(:)
      integer, allocatable :: global_indices(:,:), layout2D(:,:)
      real,    allocatable :: x(:,:,:,:), x_ens(:,:,:), y(:,:,:)
@@ -614,6 +629,34 @@ subroutine cubic_grid_redistribute
      type(domain2D)       :: domain
      type(domain2D), allocatable :: domain_ensemble(:)
      character(len=128)   :: mesg
+     integer              :: axis_to_storage(3), storage_to_axis(3)
+     integer              :: i_dim, j_dim, k_dim
+
+     !-------------------------------------------------------------------
+     ! axis_to_storage maps logical axis order to storage layout
+     ! axis_to_storage = (3,1,2) maps
+     !          x -> 3rd dim
+     !          y -> 1st dim
+     !          z -> 2nd dim
+     ! This map is passed on to MPP_REDISTRIBUTE to select storage
+     !    dim by logical axis desired
+     ! The inverse map is defined by storage_to_axis(axis_to_storage)=(1,2,3)
+     !    maps storage layout to logical axis order and is used in this
+     !    routine to select the correct logical start/stop indices when
+     !    allocating storage arrays.
+     !-------------------------------------------------------------------
+     axis_to_storage = (/1,2,3/)
+     if (present(axis_to_storage_in)) axis_to_storage = axis_to_storage_in
+
+     ! Inverse Map
+     storage_to_axis(axis_to_storage(1))=1
+     storage_to_axis(axis_to_storage(2))=2
+     storage_to_axis(axis_to_storage(3))=3
+
+     ! Selectors for logical indices by storage dim
+     i_dim = storage_to_axis(1)
+     j_dim = storage_to_axis(2)
+     k_dim = storage_to_axis(3)
 
      ! --- set up pelist
      npes = mpp_npes()
@@ -698,44 +741,55 @@ subroutine cubic_grid_redistribute
      call mpp_get_data_domain( domain, isd, ied, jsd, jed)
      call mpp_get_compute_domain( domain, isc, iec, jsc, jec)
 
-     allocate(x_ens(isd_ens:ied_ens, jsd_ens:jed_ens, nz))
-     allocate(x(isd:ied, jsd:jed, nz, ensemble_size))
-     allocate(y(isd:ied, jsd:jed, nz))
+     sd_ens = (/isd_ens, jsd_ens, 1/)
+     ed_ens = (/ied_ens, jed_ens, nz/)
+     allocate(x_ens(sd_ens(i_dim):ed_ens(i_dim), sd_ens(j_dim):ed_ens(j_dim), sd_ens(k_dim):ed_ens(k_dim) ))
+
+     sd = (/isd, jsd, 1/)
+     ed = (/ied, jed, nz/)
+     allocate(x(sd(i_dim):ed(i_dim), sd(j_dim):ed(j_dim), sd(k_dim):ed(k_dim), ensemble_size))
+     allocate(y(sd(i_dim):ed(i_dim), sd(j_dim):ed(j_dim), sd(k_dim):ed(k_dim) ))
+     print *, 'shape x:', shape(x)
 
+     sc_ens = (/isc_ens, jsc_ens, 1/)
+     ec_ens = (/iec_ens, jec_ens, nz/)
      x = 0
-     do k = 1, nz
-        do j = jsc_ens, jec_ens
-           do i = isc_ens, iec_ens
+     do k = sc_ens(k_dim), ec_ens(k_dim)
+        do j = sc_ens(j_dim), ec_ens(j_dim)
+           do i = sc_ens(i_dim), ec_ens(i_dim)
               x_ens(i,j,k) = ensemble_id *1e6 + ensemble_tile_id*1e3 + i + j * 1.e-3 + k * 1.e-6
            enddo
         enddo
      enddo
 
+     sc = (/isc, jsc, 1/)
+     ec = (/iec, jec, nz/)
      do n = 1, ensemble_size
         x = 0
-        call mpp_redistribute( domain_ensemble(n), x_ens, domain, x(:,:,:,n) )
+        call mpp_redistribute( domain_ensemble(n), x_ens, domain, x(:,:,:,n), axis_to_storage_in=axis_to_storage )
         y = 0
-        do k = 1, nz
-           do j = jsc, jec
-              do i = isc, iec
+        do k = sc(k_dim), ec(k_dim)
+           do j = sc(j_dim), ec(j_dim)
+              do i = sc(i_dim), ec(i_dim)
                  y(i,j,k) = n *1e6 + tile_id*1e3 + i + j * 1.e-3 + k * 1.e-6
               enddo
            enddo
         enddo
        write(mesg,'(a,i4)') "cubic_grid redistribute from ensemble", n
-        call compare_checksums( x(isc:iec,jsc:jec,:,n), y(isc:iec,jsc:jec,:), trim(mesg) )
+        call compare_checksums( x(sc(i_dim):ec(i_dim),sc(j_dim):ec(j_dim),sc(k_dim):ec(k_dim),n),            &
+                              & y(sc(i_dim):ec(i_dim),sc(j_dim):ec(j_dim),sc(k_dim):ec(k_dim)), trim(mesg) )
      enddo
 
      ! redistribute data to each ensemble.
      deallocate(x,y,x_ens)
-     allocate(x(isd:ied, jsd:jed, nz, ensemble_size))
-     allocate(x_ens(isd_ens:ied_ens, jsd_ens:jed_ens, nz))
-     allocate(y(isd_ens:ied_ens, jsd_ens:jed_ens, nz))
+     allocate(x(sd(i_dim):ed(i_dim), sd(j_dim):ed(j_dim), sd(k_dim):ed(k_dim), ensemble_size))
+     allocate(x_ens(sd_ens(i_dim):ed_ens(i_dim), sd_ens(j_dim):ed_ens(j_dim), sd_ens(k_dim):ed_ens(k_dim) ))
+     allocate(y(sd_ens(i_dim):ed_ens(i_dim), sd_ens(j_dim):ed_ens(j_dim), sd_ens(k_dim):ed_ens(k_dim) ))
 
      y = 0
-     do k = 1, nz
-        do j = jsc, jec
-           do i = isc, iec
+     do k = sc(k_dim), ec(k_dim)
+        do j = sc(j_dim), ec(j_dim)
+           do i = sc(i_dim), ec(i_dim)
               x(i,j,k,:) = i + j * 1.e-3 + k * 1.e-6
            enddo
         enddo
@@ -743,20 +797,21 @@ subroutine cubic_grid_redistribute
 
      do n = 1, ensemble_size
         x_ens = 0
-        call mpp_redistribute(domain, x(:,:,:,n), domain_ensemble(n), x_ens)
+        call mpp_redistribute(domain, x(:,:,:,n), domain_ensemble(n), x_ens, axis_to_storage_in=axis_to_storage)
         y = 0
         if( ensemble_id == n ) then
-           do k = 1, nz
-              do j = jsc_ens, jec_ens
-                 do i = isc_ens, iec_ens
+           do k = sc_ens(k_dim), ec_ens(k_dim)
+              do j = sc_ens(j_dim), ec_ens(j_dim)
+                 do i = sc_ens(i_dim), ec_ens(i_dim)
                     y(i,j,k) = i + j * 1.e-3 + k * 1.e-6
                  enddo
               enddo
            enddo
         endif
         write(mesg,'(a,i4)') "cubic_grid redistribute to ensemble", n
-        call compare_checksums( x_ens(isc_ens:iec_ens,jsc_ens:jec_ens,:), y(isc_ens:iec_ens,jsc_ens:jec_ens,:), &
-                              &  trim(mesg) )
+  call compare_checksums( x_ens(sc_ens(i_dim):ec_ens(i_dim),sc_ens(j_dim):ec_ens(j_dim),sc_ens(k_dim):ec_ens(k_dim)), &
+                        & y(sc_ens(i_dim):ec_ens(i_dim),sc_ens(j_dim):ec_ens(j_dim),sc_ens(k_dim):ec_ens(k_dim)),     &
+                        & trim(mesg) )
      enddo
 
      deallocate(x, y, x_ens)
diff --git a/test_fms/mpp/test_mpp_domains2.sh b/test_fms/mpp/test_mpp_domains2.sh
index a9b26f2cd3..a87189640c 100755
--- a/test_fms/mpp/test_mpp_domains2.sh
+++ b/test_fms/mpp/test_mpp_domains2.sh
@@ -28,7 +28,7 @@
 . ../test-lib.sh
 
 # TODO: Enable this test once generalized indices work is complete
-SKIP_TESTS="test_mpp_domains2.12"
+SKIP_TESTS="test_mpp_domains2.13"
 
 # TODO edge update, fails on non-blocking with gnu
 #SKIP_TESTS="$SKIP_TESTS $(basename $0 .sh).6"
@@ -66,6 +66,7 @@ mix_2D_3D = .false.
 test_get_nbr = .false.
 test_edge_update = .false.
 test_cubic_grid_redistribute = .false.
+test_cubic_grid_redistribute_generalized_indices = .false.
 ensemble_size = 1
 layout_cubic = 0,0
 layout_ensemble = 0,0
@@ -116,6 +117,10 @@ sed "s/test_cubic_grid_redistribute = .false./test_cubic_grid_redistribute = .tr
 test_expect_success "cubic grid redistribute" '
     mpirun -n 6 ../test_mpp_domains
 '
+sed "s/test_cubic_grid_redistribute_generalized_indices = .false./test_cubic_grid_redistribute_generalized_indices = .true./" input_base.nml > input.nml
+test_expect_success "cubic grid redistribute_generalized_indices" '
+    mpirun -n 6 ../test_mpp_domains
+'
 sed "s/test_boundary = .false./test_boundary = .true./" input_base.nml > input.nml
 test_expect_success "boundary" '
     mpirun -n 6 ../test_mpp_domains