diff --git a/diag_manager/diag_data.F90 b/diag_manager/diag_data.F90 index 306c1d453f..3c38664b2a 100644 --- a/diag_manager/diag_data.F90 +++ b/diag_manager/diag_data.F90 @@ -335,7 +335,9 @@ MODULE diag_data_mod class(*), allocatable :: att_value(:) !< Value of the attribute character(len=:), allocatable :: att_name !< Name of the attribute contains +#ifndef __NVCOMPILER procedure :: add => fms_add_attribute +#endif procedure :: write_metadata end type fmsDiagAttribute_type ! Include variable "version" to be written to log file. @@ -558,6 +560,7 @@ function get_base_second() & res = base_second end function get_base_second +#ifndef __NVCOMPILER !> @brief Adds an attribute to the attribute type subroutine fms_add_attribute(this, att_name, att_value) class(fmsDiagAttribute_type), intent(inout) :: this !< Diag attribute type @@ -589,6 +592,7 @@ subroutine fms_add_attribute(this, att_name, att_value) end select end select end subroutine fms_add_attribute +#endif !> @brief gets the type of a variable !> @return the type of the variable (r4,r8,i4,i8,string) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index de08b89e56..696075dd74 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -17,64 +17,95 @@ !*********************************************************************** if( group%k_loop_inside ) then -!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & +! nvfortran + cray pointers imposes some restrictions on the loops below: +! * the compiler cannot privatise OpenMP cray pointers in offloaded loops. Hence, inner loops +! must be ported rather than the whole outer loop. +! * the more verbose form of openmp offload loops must be used. Would prefer "target teams loop". +! * default(shared) must be used otherwise loops hang or segfault. Would prefer "default(none)". +#ifndef __NVCOMPILER_OPENMP_GPU +!$OMP parallel do default(shared) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k) +!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) +#endif do n = 1, npack buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos pos = buffer_pos - is = group%pack_is(n); ie = group%pack_ie(n) - js = group%pack_js(n); je = group%pack_je(n) + is = group%pack_is(n); ie = group%pack_ie(n); ni = ie-is+1 + js = group%pack_js(n); je = group%pack_je(n); nj = je-js+1 rotation = group%pack_rotation(n) if( group%pack_type(n) == FIELD_S ) then select case( rotation ) case(ZERO) do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & + !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) +#endif do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 + buffer(idx) = field(i,j,k) end do end do enddo + pos = pos + ksize*nj*ni enddo case( MINUS_NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & + !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) +#endif do k = 1, ksize do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1 + buffer(idx) = field(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & + !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) +#endif do k = 1, ksize do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1 + buffer(idx) = field(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( ONE_HUNDRED_EIGHTY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & + !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = field(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end select else if( group%pack_type(n) == FIELD_X ) then @@ -82,77 +113,115 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) +#endif do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 + buffer(idx) = fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( MINUS_NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) +#endif do k = 1, ksize do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1 + buffer(idx) = fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) +#endif do k = 1, ksize do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = -fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1 + buffer(idx) = -fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end if case( NINETY ) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) +#endif do k = 1, ksize do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + ! pos = pos + 1 + ! buffer(pos) = fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1 + buffer(idx) = fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( ONE_HUNDRED_EIGHTY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = -fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = -fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end if end select ! select case( rotation(n) ) @@ -161,135 +230,197 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) +#endif do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 + buffer(idx) = fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( MINUS_NINETY ) do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) +#endif do k = 1, ksize do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1 + buffer(idx) = fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) +#endif do k = 1, ksize do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1 + buffer(idx) = fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) +#endif do k = 1, ksize do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = -fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1 + buffer(idx) = -fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end if case( ONE_HUNDRED_EIGHTY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = -fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = -fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end if end select ! select case( rotation(n) ) endif enddo else -!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & +#ifndef __NVCOMPILER_OPENMP_GPU +!$OMP parallel do default(shared) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k) +!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) +#endif do nk = 1, npack*ksize n = (nk-1)/ksize + 1 k = mod((nk-1), ksize) + 1 buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos pos = buffer_pos + (k-1)*group%pack_size(n) - is = group%pack_is(n); ie = group%pack_ie(n) - js = group%pack_js(n); je = group%pack_je(n) + is = group%pack_is(n); ie = group%pack_ie(n); ni = ie-is+1 + js = group%pack_js(n); je = group%pack_je(n); nj = je-js+1 rotation = group%pack_rotation(n) if( group%pack_type(n) == FIELD_S ) then select case( rotation ) case(ZERO) do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & + !$omp map(to: field(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni)) +#endif do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (j-js)*ni + (i-is) + 1 + buffer(idx) = field(i,j,k) end do end do + pos = pos + nj*ni enddo case( MINUS_NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & + !$omp map(to: field(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni)) +#endif do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (i-is)*nj + (je-j) + 1 + buffer(idx) = field(i,j,k) end do end do + pos = pos + nj*ni end do case( NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & + !$omp map(to: field(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni)) +#endif do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (ie-i)*nj + (j-js) + 1 + buffer(idx) = field(i,j,k) end do end do + pos = pos + nj*ni end do case( ONE_HUNDRED_EIGHTY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & + !$omp map(to: field(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni)) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = field(i,j,k) end do end do + pos = pos + nj*ni end do end select else if( group%pack_type(n) == FIELD_X ) then @@ -297,65 +428,101 @@ else case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni)) +#endif do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (j-js)*ni + (i-is) + 1 + buffer(idx) = fieldx(i,j,k) end do end do + pos = pos + nj*ni end do case( MINUS_NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni)) +#endif do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (i-is)*nj + (je-j) + 1 + buffer(idx) = fieldy(i,j,k) end do end do + pos = pos + nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni)) +#endif do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = -fieldy(i,j,k) + idx = pos + (i-is)*nj + (je-j) + 1 + buffer(idx) = -fieldy(i,j,k) end do end do + pos = pos + nj*ni end do end if case( NINETY ) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni)) +#endif do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (ie-i)*nj + (j-js) + 1 + buffer(idx) = fieldy(i,j,k) end do end do + pos = pos + nj*ni end do case( ONE_HUNDRED_EIGHTY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni)) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = fieldx(i,j,k) end do end do + pos = pos + nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni)) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = -fieldx(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = -fieldx(i,j,k) end do end do + pos = pos + nj*ni end do end if end select ! select case( rotation(n) ) @@ -364,65 +531,101 @@ else case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni)) +#endif do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (j-js)*ni + (i-is) + 1 + buffer(idx) = fieldy(i,j,k) end do end do + pos = pos + nj*ni end do case( MINUS_NINETY ) do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni)) +#endif do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (i-is)*nj + (je-j) + 1 + buffer(idx) = fieldx(i,j,k) end do end do + pos = pos + nj*ni end do case( NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni)) +#endif do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (ie-i)*nj + (j-js) + 1 + buffer(idx) = fieldx(i,j,k) end do end do + pos = pos + nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni)) +#endif do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = -fieldx(i,j,k) + idx = pos + (ie-i)*nj + (j-js) + 1 + buffer(idx) = -fieldx(i,j,k) end do end do + pos = pos + nj*ni end do end if case( ONE_HUNDRED_EIGHTY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni)) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = fieldy(i,j,k) end do end do + pos = pos + nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(is:ie, js:je, k)) map(from: buffer(pos+1:pos+nj*ni)) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = -fieldy(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = -fieldy(i,j,k) end do end do + pos = pos + nj*ni end do end if end select ! select case( rotation(n) ) diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index 49fb2555ce..d8386a2fce 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -17,92 +17,146 @@ !*********************************************************************** if( group%k_loop_inside ) then -!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & +! nvfortran + cray pointers imposes some restrictions on the loops below: +! * the compiler cannot privatise OpenMP cray pointers in offloaded loops. Hence, inner loops +! must be ported rather than the whole outer loop. +! * the more verbose form of openmp offload loops must be used. Would prefer "target teams loop". +! * default(shared) must be used otherwise loops hang or segfault. Would prefer "default(none)". +#ifndef __NVCOMPILER_OPENMP_GPU +!$OMP parallel do default(shared) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k ) +!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx) +#endif do n = nunpack, 1, -1 buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos pos = buffer_pos - is = group%unpack_is(n); ie = group%unpack_ie(n) - js = group%unpack_js(n); je = group%unpack_je(n) + is = group%unpack_is(n); ie = group%unpack_ie(n); ni = ie-is+1 + js = group%unpack_js(n); je = group%unpack_je(n); nj = je-js+1 if( group%unpack_type(n) == FIELD_S ) then do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & + !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp map(from: field(is:ie,js:je,1:ksize)) +#endif do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - field(i,j,k) = buffer(pos) + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 + field(i,j,k) = buffer(idx) end do end do end do + pos = pos + ksize*nj*ni end do else if( group%unpack_type(n) == FIELD_X ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & + !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp map(from: fieldx(is:ie,js:je,1:ksize)) +#endif do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - fieldx(i,j,k) = buffer(pos) + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 + fieldx(i,j,k) = buffer(idx) end do end do end do + pos = pos + ksize*nj*ni end do else if( group%unpack_type(n) == FIELD_Y ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) & + !$omp default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldy,ptr) & + !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp map(from: fieldy(is:ie,js:je,1:ksize)) +#endif do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - fieldy(i,j,k) = buffer(pos) + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 + fieldy(i,j,k) = buffer(idx) end do end do end do + pos = pos + ksize*nj*ni end do endif enddo else -!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & +#ifndef __NVCOMPILER_OPENMP_GPU +!$OMP parallel do default(shared) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k) +!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) +#endif do nk = nunpack*ksize, 1, -1 n = (nk-1)/ksize + 1 k = mod((nk-1), ksize) + 1 buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos pos = buffer_pos + (k-1)*group%unpack_size(n) - is = group%unpack_is(n); ie = group%unpack_ie(n) - js = group%unpack_js(n); je = group%unpack_je(n) + is = group%unpack_is(n); ie = group%unpack_ie(n); ni = ie-is+1 + js = group%unpack_js(n); je = group%unpack_je(n); nj = je-js+1 if( group%unpack_type(n) == FIELD_S ) then do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & + !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: field(is:ie,js:je,k)) +#endif do j = js, je do i = is, ie - pos = pos + 1 - field(i,j,k) = buffer(pos) + idx = pos + (j-js)*ni + (i-is) + 1 + field(i,j,k) = buffer(idx) end do end do + pos = pos + ni*nj end do else if( group%unpack_type(n) == FIELD_X ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & + !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldx(is:ie,js:je,k)) +#endif do j = js, je do i = is, ie - pos = pos + 1 - fieldx(i,j,k) = buffer(pos) + idx = pos + (j-js)*ni + (i-is) + 1 + fieldx(i,j,k) = buffer(idx) end do end do + pos = pos + ni*nj end do else if( group%unpack_type(n) == FIELD_Y ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) if(use_device_ptr) & + !$omp default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & + !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldy(is:ie,js:je,k)) +#endif do j = js, je do i = is, ie - pos = pos + 1 - fieldy(i,j,k) = buffer(pos) + idx = pos + (j-js)*ni + (i-is) + 1 + fieldy(i,j,k) = buffer(idx) end do end do + pos = pos + ni*nj end do endif enddo diff --git a/mpp/include/mpp_comm_mpi.inc b/mpp/include/mpp_comm_mpi.inc index e7b5ccb76a..42e22b2598 100644 --- a/mpp/include/mpp_comm_mpi.inc +++ b/mpp/include/mpp_comm_mpi.inc @@ -30,6 +30,7 @@ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !> @brief Initialize the @ref mpp_mod module. Must be called before any usage. subroutine mpp_init_f08( flags, localcomm, test_level, alt_input_nml_path ) + !$ use omp_lib integer, optional, intent(in) :: flags !< Flags for debug output, can be MPP_VERBOSE or MPP_DEBUG type(mpi_comm), optional, intent(in) :: localcomm !< MPI communicator to use. Only relevant if MPI has already !! been initialized by an external call to mpi_init. @@ -60,6 +61,14 @@ call MPI_COMM_RANK( mpp_comm_private, pe, error ) call MPI_COMM_SIZE( mpp_comm_private, npes, error ) + ! set default device to enable multi GPU parallelism + ! calls to both OpenACC and OpenMP runtimes are needed + ! because we use both do-concurrent and openmp + ! if you remove either, the code will run multiple + ! ranks on a _single_ GPU. Be careful out there! + !$ call omp_set_default_device(pe) + !$acc set device_num(pe) + module_is_initialized = .TRUE. if (present(test_level)) then t_level = test_level diff --git a/mpp/include/mpp_domains_reduce.inc b/mpp/include/mpp_domains_reduce.inc index 826b2f66ac..874d0217b7 100644 --- a/mpp/include/mpp_domains_reduce.inc +++ b/mpp/include/mpp_domains_reduce.inc @@ -905,6 +905,8 @@ #undef MPP_TYPE_INIT_VALUE !**************************************************** +#ifndef __NVCOMPILER + #undef MPP_GLOBAL_FIELD_ #define MPP_GLOBAL_FIELD_ mpp_global_field_r8 #undef MPP_TYPE_ @@ -972,6 +974,127 @@ #undef DEFAULT_VALUE_ #define DEFAULT_VALUE_ .false._l4_kind #include + + +!! if __NVCOMPILER is defined, use compatibility version +#else + +#undef MPP_TYPE_ +#define MPP_TYPE_ real(r8_kind) +#undef DEFAULT_VALUE_ +#define DEFAULT_VALUE_ 0._r8_kind +#undef MPP_GLOBAL_FIELD_2D_ +#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_r8_2d +#undef MPP_GLOBAL_FIELD_3D_ +#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_r8_3d +#undef MPP_GLOBAL_FIELD_4D_ +#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_r8_4d +#undef MPP_GLOBAL_FIELD_5D_ +#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_r8_5d +#include + +#undef MPP_TYPE_ +#define MPP_TYPE_ integer(i8_kind) +#undef DEFAULT_VALUE_ +#define DEFAULT_VALUE_ 0_i8_kind +#undef MPP_GLOBAL_FIELD_2D_ +#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_i8_2d +#undef MPP_GLOBAL_FIELD_3D_ +#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_i8_3d +#undef MPP_GLOBAL_FIELD_4D_ +#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_i8_4d +#undef MPP_GLOBAL_FIELD_5D_ +#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_i8_5d +#include + +#ifdef OVERLOAD_C8 +#undef MPP_TYPE_ +#define MPP_TYPE_ complex(c8_kind) +#undef DEFAULT_VALUE_ +#define DEFAULT_VALUE_ (0._r8_kind,0._r8_kind) +#undef MPP_GLOBAL_FIELD_2D_ +#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_c8_2d +#undef MPP_GLOBAL_FIELD_3D_ +#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_c8_3d +#undef MPP_GLOBAL_FIELD_4D_ +#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_c8_4d +#undef MPP_GLOBAL_FIELD_5D_ +#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_c8_5d +#include +#endif + +#ifdef OVERLOAD_C4 +#undef MPP_TYPE_ +#define MPP_TYPE_ complex(c4_kind) +#undef DEFAULT_VALUE_ +#define DEFAULT_VALUE_ (0._r4_kind,0._r4_kind) +#undef MPP_GLOBAL_FIELD_2D_ +#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_c4_2d +#undef MPP_GLOBAL_FIELD_3D_ +#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_c4_3d +#undef MPP_GLOBAL_FIELD_4D_ +#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_c4_4d +#undef MPP_GLOBAL_FIELD_5D_ +#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_c4_5d +#include +#endif + +#undef MPP_TYPE_ +#define MPP_TYPE_ logical(l8_kind) +#undef DEFAULT_VALUE_ +#define DEFAULT_VALUE_ .false._l8_kind +#undef MPP_GLOBAL_FIELD_2D_ +#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_l8_2d +#undef MPP_GLOBAL_FIELD_3D_ +#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_l8_3d +#undef MPP_GLOBAL_FIELD_4D_ +#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_l8_4d +#undef MPP_GLOBAL_FIELD_5D_ +#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_l8_5d +#include + +#undef MPP_TYPE_ +#define MPP_TYPE_ real(r4_kind) +#undef DEFAULT_VALUE_ +#define DEFAULT_VALUE_ 0._r4_kind +#undef MPP_GLOBAL_FIELD_2D_ +#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_r4_2d +#undef MPP_GLOBAL_FIELD_3D_ +#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_r4_3d +#undef MPP_GLOBAL_FIELD_4D_ +#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_r4_4d +#undef MPP_GLOBAL_FIELD_5D_ +#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_r4_5d +#include + +#undef MPP_TYPE_ +#define MPP_TYPE_ integer(i4_kind) +#undef DEFAULT_VALUE_ +#define DEFAULT_VALUE_ 0_i4_kind +#undef MPP_GLOBAL_FIELD_2D_ +#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_i4_2d +#undef MPP_GLOBAL_FIELD_3D_ +#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_i4_3d +#undef MPP_GLOBAL_FIELD_4D_ +#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_i4_4d +#undef MPP_GLOBAL_FIELD_5D_ +#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_i4_5d +#include + +#undef MPP_TYPE_ +#define MPP_TYPE_ logical(l4_kind) +#undef DEFAULT_VALUE_ +#define DEFAULT_VALUE_ .false._l4_kind +#undef MPP_GLOBAL_FIELD_2D_ +#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_l4_2d +#undef MPP_GLOBAL_FIELD_3D_ +#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_l4_3d +#undef MPP_GLOBAL_FIELD_4D_ +#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_l4_4d +#undef MPP_GLOBAL_FIELD_5D_ +#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_l4_5d +#include +#endif !**************************************************** #undef MPP_DO_GLOBAL_FIELD_3D_AD_ #define MPP_DO_GLOBAL_FIELD_3D_AD_ mpp_do_global_field2D_r8_3d_ad diff --git a/mpp/include/mpp_global_field_compat.fh b/mpp/include/mpp_global_field_compat.fh new file mode 100644 index 0000000000..a6a4bbc23d --- /dev/null +++ b/mpp/include/mpp_global_field_compat.fh @@ -0,0 +1,1038 @@ +!*********************************************************************** +!* Apache License 2.0 +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* Licensed under the Apache License, Version 2.0 (the "License"); +!* you may not use this file except in compliance with the License. +!* You may obtain a copy of the License at +!* +!* http://www.apache.org/licenses/LICENSE-2.0 +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied; +!* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +!* PARTICULAR PURPOSE. See the License for the specific language +!* governing permissions and limitations under the License. +!*********************************************************************** +!> @addtogroup mpp_domains_mod +!> {@ + !> get a global field from a local field + !! local field may be on compute OR data domain + + !> Gets a global field from a local field + !! local field may be on compute OR data domain + subroutine MPP_GLOBAL_FIELD_2D_( domain, local, global, flags, position, tile_count, default_data, xdim, ydim) + type(domain2D), intent(in) :: domain + MPP_TYPE_, intent(in) :: local(:,:) + MPP_TYPE_, intent(out) :: global(:,:) + integer, intent(in), optional :: flags + integer, intent(in), optional :: position + integer, intent(in), optional :: tile_count + MPP_TYPE_, intent(in), optional :: default_data + integer, intent(in), optional :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical + !! Fortran-based code, xdim=1 and ydim=2. + + integer :: i, j, k, m, n, nd, num_words, lpos, rpos, ioff, joff, from_pe, root_pe, tile_id + integer :: ke, isc, iec, jsc, jec, is, ie, js, je, num_word_me + integer :: ipos, jpos, n_per_gridpoint_local, n_per_gridpoint_global + logical :: xonly, yonly, root_only, global_on_this_pe + integer :: ix, iy + MPP_TYPE_, pointer, dimension(:) :: clocal, cremote + integer :: stackuse + character(len=8) :: text + type(c_ptr) :: stack_cptr !< Workaround for GFortran bug + + integer :: tile, ishift, jshift, ipos0, jpos0 + integer :: size_clocal, size_cremote + + tile = 1 + if(present(tile_count)) tile = tile_count + + call mpp_get_domain_shift(domain, ishift, jshift, position) + + ipos0 = -domain%x(tile)%global%begin + 1 + jpos0 = -domain%y(tile)%global%begin + 1 + + if (present(xdim)) then + ix = xdim + else + ix = 1 + endif + + if (present(ydim)) then + iy = ydim + else + iy = 2 + endif + + n_per_gridpoint_local = size(local) / (size(local,ix) * size(local,iy)) + n_per_gridpoint_global = size(global) / (size(global,ix) * size(global,iy)) + size_clocal = (domain%x(1)%compute%size+ishift) * (domain%y(1)%compute%size+jshift) * n_per_gridpoint_local + size_cremote = (domain%x(1)%compute%max_size+ishift) * (domain%y(1)%compute%max_size+jshift) * & + n_per_gridpoint_local + + stackuse = size_clocal + size_cremote + if( stackuse.GT.mpp_domains_stack_size )then + write( text, '(i8)' )stackuse + call mpp_error( FATAL, & + 'MPP_DO_GLOBAL_FIELD user stack overflow: call mpp_domains_set_stack_size('//trim(text)// & + & ') from all PEs.' ) + end if + mpp_domains_stack_hwm = max( mpp_domains_stack_hwm, stackuse ) + + stack_cptr = c_loc(mpp_domains_stack(1)) + call c_f_pointer(stack_cptr, clocal, [size_clocal]) + + stack_cptr = c_loc(mpp_domains_stack(1+size_clocal)) + call c_f_pointer(stack_cptr, cremote, [size_cremote]) + + if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_GLOBAL_FIELD: must first call mpp_domains_init.' ) + + xonly = .FALSE. + yonly = .FALSE. + root_only = .FALSE. + if( PRESENT(flags) ) then + xonly = BTEST(flags,EAST) + yonly = BTEST(flags,SOUTH) + if( .NOT.xonly .AND. .NOT.yonly )call mpp_error( WARNING, & + 'MPP_GLOBAL_FIELD: you must have flags=XUPDATE, YUPDATE or XUPDATE+YUPDATE' ) + if(xonly .AND. yonly) then + xonly = .false.; yonly = .false. + endif + root_only = BTEST(flags, ROOT_GLOBAL) + if( (xonly .or. yonly) .AND. root_only ) then + call mpp_error( WARNING, 'MPP_GLOBAL_FIELD: flags = XUPDATE+GLOBAL_ROOT_ONLY or ' // & + 'flags = YUPDATE+GLOBAL_ROOT_ONLY is not supported, will ignore GLOBAL_ROOT_ONLY' ) + root_only = .FALSE. + endif + endif + + global_on_this_pe = .NOT. root_only .OR. domain%pe == domain%tile_root_pe + ipos = 0; jpos = 0 + if(global_on_this_pe ) then + if(n_per_gridpoint_local.ne.n_per_gridpoint_global) then + call mpp_error(FATAL, 'MPP_GLOBAL_FIELD: mismatch of global and local dimension sizes') + endif + if( size(global,ix).NE.(domain%x(tile)%global%size+ishift) .OR. & + size(global,iy).NE.(domain%y(tile)%global%size+jshift))then + if(xonly) then + if(size(global,ix).NE.(domain%x(tile)%global%size+ishift) .OR. & + size(global,iy).NE.(domain%y(tile)%compute%size+jshift)) & + call mpp_error( FATAL, & + & 'MPP_GLOBAL_FIELD: incoming arrays do not match domain for xonly global field.' ) + jpos = -domain%y(tile)%compute%begin + 1 + else if(yonly) then + if(size(global,ix).NE.(domain%x(tile)%compute%size+ishift) .OR. & + size(global,iy).NE.(domain%y(tile)%global%size+jshift)) & + call mpp_error( FATAL, & + & 'MPP_GLOBAL_FIELD: incoming arrays do not match domain for yonly global field.' ) + ipos = -domain%x(tile)%compute%begin + 1 + else + call mpp_error( FATAL, 'MPP_GLOBAL_FIELD: incoming arrays do not match domain.' ) + endif + endif + endif + + if( size(local,ix).EQ.(domain%x(tile)%compute%size+ishift) .AND. & + size(local,iy).EQ.(domain%y(tile)%compute%size+jshift) )then + !local is on compute domain + ioff = -domain%x(tile)%compute%begin + 1 + joff = -domain%y(tile)%compute%begin + 1 + else if( size(local,ix).EQ.(domain%x(tile)%memory%size+ishift) .AND. & + size(local,iy).EQ.(domain%y(tile)%memory%size+jshift) )then + !local is on data domain + ioff = -domain%x(tile)%domain_data%begin + 1 + joff = -domain%y(tile)%domain_data%begin + 1 + else + call mpp_error( FATAL, & + & 'MPP_GLOBAL_FIELD_: incoming field array must match either compute domain or memory domain.') + end if + + !ke = size(local,3) + ke = n_per_gridpoint_local + isc = domain%x(tile)%compute%begin; iec = domain%x(tile)%compute%end+ishift + jsc = domain%y(tile)%compute%begin; jec = domain%y(tile)%compute%end+jshift + + num_word_me = (iec-isc+1)*(jec-jsc+1)*n_per_gridpoint_local + +! make contiguous array from compute domain + m = 0 + if(global_on_this_pe) then + !z1l: initialize global = 0 to support mask domain + if(present(default_data)) then + call arr_init(global, default_data) + else + call arr_init(global, DEFAULT_VALUE_) + endif + + call arr2vec(local, clocal, ix, iy, ioff+isc, ioff+iec, joff+jsc, joff+jec) + + ! Fill local domain directly + call vec2arr(clocal, global, ix, iy, ipos0+ipos+isc, ipos0+ipos+iec, jpos0+jpos+jsc, jpos0+jpos+jec) + else + call arr2vec(local, clocal, ix, iy, ioff+isc, ioff+iec, joff+jsc, joff+jec) + endif + +! if there is more than one tile on this pe, then no decomposition for all tiles on this pe, so we can just return + if(size(domain%x(:))>1) then + !--- the following is needed to avoid deadlock. + if( tile == size(domain%x(:)) ) call mpp_sync_self( ) + return + end if + + root_pe = mpp_root_pe() + +!fill off-domains (note loops begin at an offset of 1) + if( xonly )then + nd = size(domain%x(1)%list(:)) + do n = 1,nd-1 + lpos = mod(domain%x(1)%pos+nd-n,nd) + rpos = mod(domain%x(1)%pos +n,nd) + from_pe = domain%x(1)%list(rpos)%pe + rpos = from_pe - root_pe ! for concurrent run, root_pe may not be 0. + if (from_pe == NULL_PE) then + num_words = 0 + else + num_words = (domain%list(rpos)%x(1)%compute%size+ishift) & + * (domain%list(rpos)%y(1)%compute%size+jshift) * ke + endif + ! Force use of scalar, integer ptr interface + call mpp_transmit( put_data=clocal(1), plen=num_word_me, to_pe=domain%x(1)%list(lpos)%pe, & + get_data=cremote(1), glen=num_words, from_pe=from_pe ) + m = 0 + if (from_pe /= NULL_PE) then + is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift + call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+jpos+jsc, jpos0+jpos+jec) + endif + call mpp_sync_self() !-ensure MPI_ISEND is done. + end do + else if( yonly )then + nd = size(domain%y(1)%list(:)) + do n = 1,nd-1 + lpos = mod(domain%y(1)%pos+nd-n,nd) + rpos = mod(domain%y(1)%pos +n,nd) + from_pe = domain%y(1)%list(rpos)%pe + rpos = from_pe - root_pe + if (from_pe == NULL_PE) then + num_words = 0 + else + num_words = (domain%list(rpos)%x(1)%compute%size+ishift) & + * (domain%list(rpos)%y(1)%compute%size+jshift) * ke + endif + ! Force use of scalar, integer pointer interface + call mpp_transmit( put_data=clocal(1), plen=num_word_me, to_pe=domain%y(1)%list(lpos)%pe, & + get_data=cremote(1), glen=num_words, from_pe=from_pe ) + m = 0 + if (from_pe /= NULL_PE) then + js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift + call vec2arr(cremote, global, ix, iy, ipos0+ipos+isc, ipos0+ipos+iec, jpos0+js, jpos0+je) + endif + call mpp_sync_self() !-ensure MPI_ISEND is done. + end do + else + tile_id = domain%tile_id(1) + nd = size(domain%list(:)) + if(root_only) then + if(domain%pe .NE. domain%tile_root_pe) then + call mpp_send( clocal(1), plen=num_word_me, to_pe=domain%tile_root_pe, tag=COMM_TAG_1 ) + else + do n = 1,nd-1 + rpos = mod(domain%pos+n,nd) + if( domain%list(rpos)%tile_id(1) .NE. tile_id ) cycle + num_words = (domain%list(rpos)%x(1)%compute%size+ishift) * & + & (domain%list(rpos)%y(1)%compute%size+jshift) * ke + call mpp_recv(cremote(1), glen=num_words, from_pe=domain%list(rpos)%pe, tag=COMM_TAG_1 ) + m = 0 + is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift + js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift + + call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+js, jpos0+je) + end do + endif + else + do n = 1,nd-1 + lpos = mod(domain%pos+nd-n,nd) + if( domain%list(lpos)%tile_id(1).NE. tile_id ) cycle ! global field only within tile + call mpp_send( clocal(1), plen=num_word_me, to_pe=domain%list(lpos)%pe, tag=COMM_TAG_2 ) + end do + do n = 1,nd-1 + rpos = mod(domain%pos+n,nd) + if( domain%list(rpos)%tile_id(1) .NE. tile_id ) cycle ! global field only within tile + num_words = (domain%list(rpos)%x(1)%compute%size+ishift) * & + & (domain%list(rpos)%y(1)%compute%size+jshift) * ke + call mpp_recv( cremote(1), glen=num_words, from_pe=domain%list(rpos)%pe, tag=COMM_TAG_2 ) + m = 0 + is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift + js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift + + call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+js, jpos0+je) + end do + endif + end if + + call mpp_sync_self + end subroutine MPP_GLOBAL_FIELD_2D_ + + !> Gets a global field from a local field + !! local field may be on compute OR data domain + subroutine MPP_GLOBAL_FIELD_3D_( domain, local, global, flags, position, tile_count, default_data, xdim, ydim) + type(domain2D), intent(in) :: domain + MPP_TYPE_, intent(in) :: local(:,:,:) + MPP_TYPE_, intent(out) :: global(:,:,:) + integer, intent(in), optional :: flags + integer, intent(in), optional :: position + integer, intent(in), optional :: tile_count + MPP_TYPE_, intent(in), optional :: default_data + integer, intent(in), optional :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical + !! Fortran-based code, xdim=1 and ydim=2. + + integer :: i, j, k, m, n, nd, num_words, lpos, rpos, ioff, joff, from_pe, root_pe, tile_id + integer :: ke, isc, iec, jsc, jec, is, ie, js, je, num_word_me + integer :: ipos, jpos, n_per_gridpoint_local, n_per_gridpoint_global + logical :: xonly, yonly, root_only, global_on_this_pe + integer :: ix, iy + MPP_TYPE_, pointer, dimension(:) :: clocal, cremote + integer :: stackuse + character(len=8) :: text + type(c_ptr) :: stack_cptr !< Workaround for GFortran bug + + integer :: tile, ishift, jshift, ipos0, jpos0 + integer :: size_clocal, size_cremote + + tile = 1 + if(present(tile_count)) tile = tile_count + + call mpp_get_domain_shift(domain, ishift, jshift, position) + + ipos0 = -domain%x(tile)%global%begin + 1 + jpos0 = -domain%y(tile)%global%begin + 1 + + if (present(xdim)) then + ix = xdim + else + ix = 1 + endif + + if (present(ydim)) then + iy = ydim + else + iy = 2 + endif + + n_per_gridpoint_local = size(local) / (size(local,ix) * size(local,iy)) + n_per_gridpoint_global = size(global) / (size(global,ix) * size(global,iy)) + size_clocal = (domain%x(1)%compute%size+ishift) * (domain%y(1)%compute%size+jshift) * n_per_gridpoint_local + size_cremote = (domain%x(1)%compute%max_size+ishift) * (domain%y(1)%compute%max_size+jshift) * & + n_per_gridpoint_local + + stackuse = size_clocal + size_cremote + if( stackuse.GT.mpp_domains_stack_size )then + write( text, '(i8)' )stackuse + call mpp_error( FATAL, & + 'MPP_DO_GLOBAL_FIELD user stack overflow: call mpp_domains_set_stack_size('//trim(text)// & + & ') from all PEs.' ) + end if + mpp_domains_stack_hwm = max( mpp_domains_stack_hwm, stackuse ) + + stack_cptr = c_loc(mpp_domains_stack(1)) + call c_f_pointer(stack_cptr, clocal, [size_clocal]) + + stack_cptr = c_loc(mpp_domains_stack(1+size_clocal)) + call c_f_pointer(stack_cptr, cremote, [size_cremote]) + + if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_GLOBAL_FIELD: must first call mpp_domains_init.' ) + + xonly = .FALSE. + yonly = .FALSE. + root_only = .FALSE. + if( PRESENT(flags) ) then + xonly = BTEST(flags,EAST) + yonly = BTEST(flags,SOUTH) + if( .NOT.xonly .AND. .NOT.yonly )call mpp_error( WARNING, & + 'MPP_GLOBAL_FIELD: you must have flags=XUPDATE, YUPDATE or XUPDATE+YUPDATE' ) + if(xonly .AND. yonly) then + xonly = .false.; yonly = .false. + endif + root_only = BTEST(flags, ROOT_GLOBAL) + if( (xonly .or. yonly) .AND. root_only ) then + call mpp_error( WARNING, 'MPP_GLOBAL_FIELD: flags = XUPDATE+GLOBAL_ROOT_ONLY or ' // & + 'flags = YUPDATE+GLOBAL_ROOT_ONLY is not supported, will ignore GLOBAL_ROOT_ONLY' ) + root_only = .FALSE. + endif + endif + + global_on_this_pe = .NOT. root_only .OR. domain%pe == domain%tile_root_pe + ipos = 0; jpos = 0 + if(global_on_this_pe ) then + if(n_per_gridpoint_local.ne.n_per_gridpoint_global) then + call mpp_error(FATAL, 'MPP_GLOBAL_FIELD: mismatch of global and local dimension sizes') + endif + if( size(global,ix).NE.(domain%x(tile)%global%size+ishift) .OR. & + size(global,iy).NE.(domain%y(tile)%global%size+jshift))then + if(xonly) then + if(size(global,ix).NE.(domain%x(tile)%global%size+ishift) .OR. & + size(global,iy).NE.(domain%y(tile)%compute%size+jshift)) & + call mpp_error( FATAL, & + & 'MPP_GLOBAL_FIELD: incoming arrays do not match domain for xonly global field.' ) + jpos = -domain%y(tile)%compute%begin + 1 + else if(yonly) then + if(size(global,ix).NE.(domain%x(tile)%compute%size+ishift) .OR. & + size(global,iy).NE.(domain%y(tile)%global%size+jshift)) & + call mpp_error( FATAL, & + & 'MPP_GLOBAL_FIELD: incoming arrays do not match domain for yonly global field.' ) + ipos = -domain%x(tile)%compute%begin + 1 + else + call mpp_error( FATAL, 'MPP_GLOBAL_FIELD: incoming arrays do not match domain.' ) + endif + endif + endif + + if( size(local,ix).EQ.(domain%x(tile)%compute%size+ishift) .AND. & + size(local,iy).EQ.(domain%y(tile)%compute%size+jshift) )then + !local is on compute domain + ioff = -domain%x(tile)%compute%begin + 1 + joff = -domain%y(tile)%compute%begin + 1 + else if( size(local,ix).EQ.(domain%x(tile)%memory%size+ishift) .AND. & + size(local,iy).EQ.(domain%y(tile)%memory%size+jshift) )then + !local is on data domain + ioff = -domain%x(tile)%domain_data%begin + 1 + joff = -domain%y(tile)%domain_data%begin + 1 + else + call mpp_error( FATAL, & + & 'MPP_GLOBAL_FIELD_: incoming field array must match either compute domain or memory domain.') + end if + + !ke = size(local,3) + ke = n_per_gridpoint_local + isc = domain%x(tile)%compute%begin; iec = domain%x(tile)%compute%end+ishift + jsc = domain%y(tile)%compute%begin; jec = domain%y(tile)%compute%end+jshift + + num_word_me = (iec-isc+1)*(jec-jsc+1)*n_per_gridpoint_local + +! make contiguous array from compute domain + m = 0 + if(global_on_this_pe) then + !z1l: initialize global = 0 to support mask domain + if(present(default_data)) then + call arr_init(global, default_data) + else + call arr_init(global, DEFAULT_VALUE_) + endif + + call arr2vec(local, clocal, ix, iy, ioff+isc, ioff+iec, joff+jsc, joff+jec) + + ! Fill local domain directly + call vec2arr(clocal, global, ix, iy, ipos0+ipos+isc, ipos0+ipos+iec, jpos0+jpos+jsc, jpos0+jpos+jec) + else + call arr2vec(local, clocal, ix, iy, ioff+isc, ioff+iec, joff+jsc, joff+jec) + endif + +! if there is more than one tile on this pe, then no decomposition for all tiles on this pe, so we can just return + if(size(domain%x(:))>1) then + !--- the following is needed to avoid deadlock. + if( tile == size(domain%x(:)) ) call mpp_sync_self( ) + return + end if + + root_pe = mpp_root_pe() + +!fill off-domains (note loops begin at an offset of 1) + if( xonly )then + nd = size(domain%x(1)%list(:)) + do n = 1,nd-1 + lpos = mod(domain%x(1)%pos+nd-n,nd) + rpos = mod(domain%x(1)%pos +n,nd) + from_pe = domain%x(1)%list(rpos)%pe + rpos = from_pe - root_pe ! for concurrent run, root_pe may not be 0. + if (from_pe == NULL_PE) then + num_words = 0 + else + num_words = (domain%list(rpos)%x(1)%compute%size+ishift) & + * (domain%list(rpos)%y(1)%compute%size+jshift) * ke + endif + ! Force use of scalar, integer ptr interface + call mpp_transmit( put_data=clocal(1), plen=num_word_me, to_pe=domain%x(1)%list(lpos)%pe, & + get_data=cremote(1), glen=num_words, from_pe=from_pe ) + m = 0 + if (from_pe /= NULL_PE) then + is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift + call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+jpos+jsc, jpos0+jpos+jec) + endif + call mpp_sync_self() !-ensure MPI_ISEND is done. + end do + else if( yonly )then + nd = size(domain%y(1)%list(:)) + do n = 1,nd-1 + lpos = mod(domain%y(1)%pos+nd-n,nd) + rpos = mod(domain%y(1)%pos +n,nd) + from_pe = domain%y(1)%list(rpos)%pe + rpos = from_pe - root_pe + if (from_pe == NULL_PE) then + num_words = 0 + else + num_words = (domain%list(rpos)%x(1)%compute%size+ishift) & + * (domain%list(rpos)%y(1)%compute%size+jshift) * ke + endif + ! Force use of scalar, integer pointer interface + call mpp_transmit( put_data=clocal(1), plen=num_word_me, to_pe=domain%y(1)%list(lpos)%pe, & + get_data=cremote(1), glen=num_words, from_pe=from_pe ) + m = 0 + if (from_pe /= NULL_PE) then + js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift + call vec2arr(cremote, global, ix, iy, ipos0+ipos+isc, ipos0+ipos+iec, jpos0+js, jpos0+je) + endif + call mpp_sync_self() !-ensure MPI_ISEND is done. + end do + else + tile_id = domain%tile_id(1) + nd = size(domain%list(:)) + if(root_only) then + if(domain%pe .NE. domain%tile_root_pe) then + call mpp_send( clocal(1), plen=num_word_me, to_pe=domain%tile_root_pe, tag=COMM_TAG_1 ) + else + do n = 1,nd-1 + rpos = mod(domain%pos+n,nd) + if( domain%list(rpos)%tile_id(1) .NE. tile_id ) cycle + num_words = (domain%list(rpos)%x(1)%compute%size+ishift) * & + & (domain%list(rpos)%y(1)%compute%size+jshift) * ke + call mpp_recv(cremote(1), glen=num_words, from_pe=domain%list(rpos)%pe, tag=COMM_TAG_1 ) + m = 0 + is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift + js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift + + call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+js, jpos0+je) + end do + endif + else + do n = 1,nd-1 + lpos = mod(domain%pos+nd-n,nd) + if( domain%list(lpos)%tile_id(1).NE. tile_id ) cycle ! global field only within tile + call mpp_send( clocal(1), plen=num_word_me, to_pe=domain%list(lpos)%pe, tag=COMM_TAG_2 ) + end do + do n = 1,nd-1 + rpos = mod(domain%pos+n,nd) + if( domain%list(rpos)%tile_id(1) .NE. tile_id ) cycle ! global field only within tile + num_words = (domain%list(rpos)%x(1)%compute%size+ishift) * & + & (domain%list(rpos)%y(1)%compute%size+jshift) * ke + call mpp_recv( cremote(1), glen=num_words, from_pe=domain%list(rpos)%pe, tag=COMM_TAG_2 ) + m = 0 + is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift + js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift + + call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+js, jpos0+je) + end do + endif + end if + + call mpp_sync_self + end subroutine MPP_GLOBAL_FIELD_3D_ + + !> Gets a global field from a local field + !! local field may be on compute OR data domain + subroutine MPP_GLOBAL_FIELD_4D_( domain, local, global, flags, position, tile_count, default_data, xdim, ydim) + type(domain2D), intent(in) :: domain + MPP_TYPE_, intent(in) :: local(:,:,:,:) + MPP_TYPE_, intent(out) :: global(:,:,:,:) + integer, intent(in), optional :: flags + integer, intent(in), optional :: position + integer, intent(in), optional :: tile_count + MPP_TYPE_, intent(in), optional :: default_data + integer, intent(in), optional :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical + !! Fortran-based code, xdim=1 and ydim=2. + + integer :: i, j, k, m, n, nd, num_words, lpos, rpos, ioff, joff, from_pe, root_pe, tile_id + integer :: ke, isc, iec, jsc, jec, is, ie, js, je, num_word_me + integer :: ipos, jpos, n_per_gridpoint_local, n_per_gridpoint_global + logical :: xonly, yonly, root_only, global_on_this_pe + integer :: ix, iy + MPP_TYPE_, pointer, dimension(:) :: clocal, cremote + integer :: stackuse + character(len=8) :: text + type(c_ptr) :: stack_cptr !< Workaround for GFortran bug + + integer :: tile, ishift, jshift, ipos0, jpos0 + integer :: size_clocal, size_cremote + + tile = 1 + if(present(tile_count)) tile = tile_count + + call mpp_get_domain_shift(domain, ishift, jshift, position) + + ipos0 = -domain%x(tile)%global%begin + 1 + jpos0 = -domain%y(tile)%global%begin + 1 + + if (present(xdim)) then + ix = xdim + else + ix = 1 + endif + + if (present(ydim)) then + iy = ydim + else + iy = 2 + endif + + n_per_gridpoint_local = size(local) / (size(local,ix) * size(local,iy)) + n_per_gridpoint_global = size(global) / (size(global,ix) * size(global,iy)) + size_clocal = (domain%x(1)%compute%size+ishift) * (domain%y(1)%compute%size+jshift) * n_per_gridpoint_local + size_cremote = (domain%x(1)%compute%max_size+ishift) * (domain%y(1)%compute%max_size+jshift) * & + n_per_gridpoint_local + + stackuse = size_clocal + size_cremote + if( stackuse.GT.mpp_domains_stack_size )then + write( text, '(i8)' )stackuse + call mpp_error( FATAL, & + 'MPP_DO_GLOBAL_FIELD user stack overflow: call mpp_domains_set_stack_size('//trim(text)// & + & ') from all PEs.' ) + end if + mpp_domains_stack_hwm = max( mpp_domains_stack_hwm, stackuse ) + + stack_cptr = c_loc(mpp_domains_stack(1)) + call c_f_pointer(stack_cptr, clocal, [size_clocal]) + + stack_cptr = c_loc(mpp_domains_stack(1+size_clocal)) + call c_f_pointer(stack_cptr, cremote, [size_cremote]) + + if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_GLOBAL_FIELD: must first call mpp_domains_init.' ) + + xonly = .FALSE. + yonly = .FALSE. + root_only = .FALSE. + if( PRESENT(flags) ) then + xonly = BTEST(flags,EAST) + yonly = BTEST(flags,SOUTH) + if( .NOT.xonly .AND. .NOT.yonly )call mpp_error( WARNING, & + 'MPP_GLOBAL_FIELD: you must have flags=XUPDATE, YUPDATE or XUPDATE+YUPDATE' ) + if(xonly .AND. yonly) then + xonly = .false.; yonly = .false. + endif + root_only = BTEST(flags, ROOT_GLOBAL) + if( (xonly .or. yonly) .AND. root_only ) then + call mpp_error( WARNING, 'MPP_GLOBAL_FIELD: flags = XUPDATE+GLOBAL_ROOT_ONLY or ' // & + 'flags = YUPDATE+GLOBAL_ROOT_ONLY is not supported, will ignore GLOBAL_ROOT_ONLY' ) + root_only = .FALSE. + endif + endif + + global_on_this_pe = .NOT. root_only .OR. domain%pe == domain%tile_root_pe + ipos = 0; jpos = 0 + if(global_on_this_pe ) then + if(n_per_gridpoint_local.ne.n_per_gridpoint_global) then + call mpp_error(FATAL, 'MPP_GLOBAL_FIELD: mismatch of global and local dimension sizes') + endif + if( size(global,ix).NE.(domain%x(tile)%global%size+ishift) .OR. & + size(global,iy).NE.(domain%y(tile)%global%size+jshift))then + if(xonly) then + if(size(global,ix).NE.(domain%x(tile)%global%size+ishift) .OR. & + size(global,iy).NE.(domain%y(tile)%compute%size+jshift)) & + call mpp_error( FATAL, & + & 'MPP_GLOBAL_FIELD: incoming arrays do not match domain for xonly global field.' ) + jpos = -domain%y(tile)%compute%begin + 1 + else if(yonly) then + if(size(global,ix).NE.(domain%x(tile)%compute%size+ishift) .OR. & + size(global,iy).NE.(domain%y(tile)%global%size+jshift)) & + call mpp_error( FATAL, & + & 'MPP_GLOBAL_FIELD: incoming arrays do not match domain for yonly global field.' ) + ipos = -domain%x(tile)%compute%begin + 1 + else + call mpp_error( FATAL, 'MPP_GLOBAL_FIELD: incoming arrays do not match domain.' ) + endif + endif + endif + + if( size(local,ix).EQ.(domain%x(tile)%compute%size+ishift) .AND. & + size(local,iy).EQ.(domain%y(tile)%compute%size+jshift) )then + !local is on compute domain + ioff = -domain%x(tile)%compute%begin + 1 + joff = -domain%y(tile)%compute%begin + 1 + else if( size(local,ix).EQ.(domain%x(tile)%memory%size+ishift) .AND. & + size(local,iy).EQ.(domain%y(tile)%memory%size+jshift) )then + !local is on data domain + ioff = -domain%x(tile)%domain_data%begin + 1 + joff = -domain%y(tile)%domain_data%begin + 1 + else + call mpp_error( FATAL, & + & 'MPP_GLOBAL_FIELD_: incoming field array must match either compute domain or memory domain.') + end if + + !ke = size(local,3) + ke = n_per_gridpoint_local + isc = domain%x(tile)%compute%begin; iec = domain%x(tile)%compute%end+ishift + jsc = domain%y(tile)%compute%begin; jec = domain%y(tile)%compute%end+jshift + + num_word_me = (iec-isc+1)*(jec-jsc+1)*n_per_gridpoint_local + +! make contiguous array from compute domain + m = 0 + if(global_on_this_pe) then + !z1l: initialize global = 0 to support mask domain + if(present(default_data)) then + call arr_init(global, default_data) + else + call arr_init(global, DEFAULT_VALUE_) + endif + + call arr2vec(local, clocal, ix, iy, ioff+isc, ioff+iec, joff+jsc, joff+jec) + + ! Fill local domain directly + call vec2arr(clocal, global, ix, iy, ipos0+ipos+isc, ipos0+ipos+iec, jpos0+jpos+jsc, jpos0+jpos+jec) + else + call arr2vec(local, clocal, ix, iy, ioff+isc, ioff+iec, joff+jsc, joff+jec) + endif + +! if there is more than one tile on this pe, then no decomposition for all tiles on this pe, so we can just return + if(size(domain%x(:))>1) then + !--- the following is needed to avoid deadlock. + if( tile == size(domain%x(:)) ) call mpp_sync_self( ) + return + end if + + root_pe = mpp_root_pe() + +!fill off-domains (note loops begin at an offset of 1) + if( xonly )then + nd = size(domain%x(1)%list(:)) + do n = 1,nd-1 + lpos = mod(domain%x(1)%pos+nd-n,nd) + rpos = mod(domain%x(1)%pos +n,nd) + from_pe = domain%x(1)%list(rpos)%pe + rpos = from_pe - root_pe ! for concurrent run, root_pe may not be 0. + if (from_pe == NULL_PE) then + num_words = 0 + else + num_words = (domain%list(rpos)%x(1)%compute%size+ishift) & + * (domain%list(rpos)%y(1)%compute%size+jshift) * ke + endif + ! Force use of scalar, integer ptr interface + call mpp_transmit( put_data=clocal(1), plen=num_word_me, to_pe=domain%x(1)%list(lpos)%pe, & + get_data=cremote(1), glen=num_words, from_pe=from_pe ) + m = 0 + if (from_pe /= NULL_PE) then + is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift + call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+jpos+jsc, jpos0+jpos+jec) + endif + call mpp_sync_self() !-ensure MPI_ISEND is done. + end do + else if( yonly )then + nd = size(domain%y(1)%list(:)) + do n = 1,nd-1 + lpos = mod(domain%y(1)%pos+nd-n,nd) + rpos = mod(domain%y(1)%pos +n,nd) + from_pe = domain%y(1)%list(rpos)%pe + rpos = from_pe - root_pe + if (from_pe == NULL_PE) then + num_words = 0 + else + num_words = (domain%list(rpos)%x(1)%compute%size+ishift) & + * (domain%list(rpos)%y(1)%compute%size+jshift) * ke + endif + ! Force use of scalar, integer pointer interface + call mpp_transmit( put_data=clocal(1), plen=num_word_me, to_pe=domain%y(1)%list(lpos)%pe, & + get_data=cremote(1), glen=num_words, from_pe=from_pe ) + m = 0 + if (from_pe /= NULL_PE) then + js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift + call vec2arr(cremote, global, ix, iy, ipos0+ipos+isc, ipos0+ipos+iec, jpos0+js, jpos0+je) + endif + call mpp_sync_self() !-ensure MPI_ISEND is done. + end do + else + tile_id = domain%tile_id(1) + nd = size(domain%list(:)) + if(root_only) then + if(domain%pe .NE. domain%tile_root_pe) then + call mpp_send( clocal(1), plen=num_word_me, to_pe=domain%tile_root_pe, tag=COMM_TAG_1 ) + else + do n = 1,nd-1 + rpos = mod(domain%pos+n,nd) + if( domain%list(rpos)%tile_id(1) .NE. tile_id ) cycle + num_words = (domain%list(rpos)%x(1)%compute%size+ishift) * & + & (domain%list(rpos)%y(1)%compute%size+jshift) * ke + call mpp_recv(cremote(1), glen=num_words, from_pe=domain%list(rpos)%pe, tag=COMM_TAG_1 ) + m = 0 + is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift + js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift + + call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+js, jpos0+je) + end do + endif + else + do n = 1,nd-1 + lpos = mod(domain%pos+nd-n,nd) + if( domain%list(lpos)%tile_id(1).NE. tile_id ) cycle ! global field only within tile + call mpp_send( clocal(1), plen=num_word_me, to_pe=domain%list(lpos)%pe, tag=COMM_TAG_2 ) + end do + do n = 1,nd-1 + rpos = mod(domain%pos+n,nd) + if( domain%list(rpos)%tile_id(1) .NE. tile_id ) cycle ! global field only within tile + num_words = (domain%list(rpos)%x(1)%compute%size+ishift) * & + & (domain%list(rpos)%y(1)%compute%size+jshift) * ke + call mpp_recv( cremote(1), glen=num_words, from_pe=domain%list(rpos)%pe, tag=COMM_TAG_2 ) + m = 0 + is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift + js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift + + call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+js, jpos0+je) + end do + endif + end if + + call mpp_sync_self + end subroutine MPP_GLOBAL_FIELD_4D_ + + !> Gets a global field from a local field + !! local field may be on compute OR data domain + subroutine MPP_GLOBAL_FIELD_5D_( domain, local, global, flags, position, tile_count, default_data, xdim, ydim) + type(domain2D), intent(in) :: domain + MPP_TYPE_, intent(in) :: local(:,:,:,:,:) + MPP_TYPE_, intent(out) :: global(:,:,:,:,:) + integer, intent(in), optional :: flags + integer, intent(in), optional :: position + integer, intent(in), optional :: tile_count + MPP_TYPE_, intent(in), optional :: default_data + integer, intent(in), optional :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical + !! Fortran-based code, xdim=1 and ydim=2. + + integer :: i, j, k, m, n, nd, num_words, lpos, rpos, ioff, joff, from_pe, root_pe, tile_id + integer :: ke, isc, iec, jsc, jec, is, ie, js, je, num_word_me + integer :: ipos, jpos, n_per_gridpoint_local, n_per_gridpoint_global + logical :: xonly, yonly, root_only, global_on_this_pe + integer :: ix, iy + MPP_TYPE_, pointer, dimension(:) :: clocal, cremote + integer :: stackuse + character(len=8) :: text + type(c_ptr) :: stack_cptr !< Workaround for GFortran bug + + integer :: tile, ishift, jshift, ipos0, jpos0 + integer :: size_clocal, size_cremote + + tile = 1 + if(present(tile_count)) tile = tile_count + + call mpp_get_domain_shift(domain, ishift, jshift, position) + + ipos0 = -domain%x(tile)%global%begin + 1 + jpos0 = -domain%y(tile)%global%begin + 1 + + if (present(xdim)) then + ix = xdim + else + ix = 1 + endif + + if (present(ydim)) then + iy = ydim + else + iy = 2 + endif + + n_per_gridpoint_local = size(local) / (size(local,ix) * size(local,iy)) + n_per_gridpoint_global = size(global) / (size(global,ix) * size(global,iy)) + size_clocal = (domain%x(1)%compute%size+ishift) * (domain%y(1)%compute%size+jshift) * n_per_gridpoint_local + size_cremote = (domain%x(1)%compute%max_size+ishift) * (domain%y(1)%compute%max_size+jshift) * & + n_per_gridpoint_local + + stackuse = size_clocal + size_cremote + if( stackuse.GT.mpp_domains_stack_size )then + write( text, '(i8)' )stackuse + call mpp_error( FATAL, & + 'MPP_DO_GLOBAL_FIELD user stack overflow: call mpp_domains_set_stack_size('//trim(text)// & + & ') from all PEs.' ) + end if + mpp_domains_stack_hwm = max( mpp_domains_stack_hwm, stackuse ) + + stack_cptr = c_loc(mpp_domains_stack(1)) + call c_f_pointer(stack_cptr, clocal, [size_clocal]) + + stack_cptr = c_loc(mpp_domains_stack(1+size_clocal)) + call c_f_pointer(stack_cptr, cremote, [size_cremote]) + + if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_GLOBAL_FIELD: must first call mpp_domains_init.' ) + + xonly = .FALSE. + yonly = .FALSE. + root_only = .FALSE. + if( PRESENT(flags) ) then + xonly = BTEST(flags,EAST) + yonly = BTEST(flags,SOUTH) + if( .NOT.xonly .AND. .NOT.yonly )call mpp_error( WARNING, & + 'MPP_GLOBAL_FIELD: you must have flags=XUPDATE, YUPDATE or XUPDATE+YUPDATE' ) + if(xonly .AND. yonly) then + xonly = .false.; yonly = .false. + endif + root_only = BTEST(flags, ROOT_GLOBAL) + if( (xonly .or. yonly) .AND. root_only ) then + call mpp_error( WARNING, 'MPP_GLOBAL_FIELD: flags = XUPDATE+GLOBAL_ROOT_ONLY or ' // & + 'flags = YUPDATE+GLOBAL_ROOT_ONLY is not supported, will ignore GLOBAL_ROOT_ONLY' ) + root_only = .FALSE. + endif + endif + + global_on_this_pe = .NOT. root_only .OR. domain%pe == domain%tile_root_pe + ipos = 0; jpos = 0 + if(global_on_this_pe ) then + if(n_per_gridpoint_local.ne.n_per_gridpoint_global) then + call mpp_error(FATAL, 'MPP_GLOBAL_FIELD: mismatch of global and local dimension sizes') + endif + if( size(global,ix).NE.(domain%x(tile)%global%size+ishift) .OR. & + size(global,iy).NE.(domain%y(tile)%global%size+jshift))then + if(xonly) then + if(size(global,ix).NE.(domain%x(tile)%global%size+ishift) .OR. & + size(global,iy).NE.(domain%y(tile)%compute%size+jshift)) & + call mpp_error( FATAL, & + & 'MPP_GLOBAL_FIELD: incoming arrays do not match domain for xonly global field.' ) + jpos = -domain%y(tile)%compute%begin + 1 + else if(yonly) then + if(size(global,ix).NE.(domain%x(tile)%compute%size+ishift) .OR. & + size(global,iy).NE.(domain%y(tile)%global%size+jshift)) & + call mpp_error( FATAL, & + & 'MPP_GLOBAL_FIELD: incoming arrays do not match domain for yonly global field.' ) + ipos = -domain%x(tile)%compute%begin + 1 + else + call mpp_error( FATAL, 'MPP_GLOBAL_FIELD: incoming arrays do not match domain.' ) + endif + endif + endif + + if( size(local,ix).EQ.(domain%x(tile)%compute%size+ishift) .AND. & + size(local,iy).EQ.(domain%y(tile)%compute%size+jshift) )then + !local is on compute domain + ioff = -domain%x(tile)%compute%begin + 1 + joff = -domain%y(tile)%compute%begin + 1 + else if( size(local,ix).EQ.(domain%x(tile)%memory%size+ishift) .AND. & + size(local,iy).EQ.(domain%y(tile)%memory%size+jshift) )then + !local is on data domain + ioff = -domain%x(tile)%domain_data%begin + 1 + joff = -domain%y(tile)%domain_data%begin + 1 + else + call mpp_error( FATAL, & + & 'MPP_GLOBAL_FIELD_: incoming field array must match either compute domain or memory domain.') + end if + + !ke = size(local,3) + ke = n_per_gridpoint_local + isc = domain%x(tile)%compute%begin; iec = domain%x(tile)%compute%end+ishift + jsc = domain%y(tile)%compute%begin; jec = domain%y(tile)%compute%end+jshift + + num_word_me = (iec-isc+1)*(jec-jsc+1)*n_per_gridpoint_local + +! make contiguous array from compute domain + m = 0 + if(global_on_this_pe) then + !z1l: initialize global = 0 to support mask domain + if(present(default_data)) then + call arr_init(global, default_data) + else + call arr_init(global, DEFAULT_VALUE_) + endif + + call arr2vec(local, clocal, ix, iy, ioff+isc, ioff+iec, joff+jsc, joff+jec) + + ! Fill local domain directly + call vec2arr(clocal, global, ix, iy, ipos0+ipos+isc, ipos0+ipos+iec, jpos0+jpos+jsc, jpos0+jpos+jec) + else + call arr2vec(local, clocal, ix, iy, ioff+isc, ioff+iec, joff+jsc, joff+jec) + endif + +! if there is more than one tile on this pe, then no decomposition for all tiles on this pe, so we can just return + if(size(domain%x(:))>1) then + !--- the following is needed to avoid deadlock. + if( tile == size(domain%x(:)) ) call mpp_sync_self( ) + return + end if + + root_pe = mpp_root_pe() + +!fill off-domains (note loops begin at an offset of 1) + if( xonly )then + nd = size(domain%x(1)%list(:)) + do n = 1,nd-1 + lpos = mod(domain%x(1)%pos+nd-n,nd) + rpos = mod(domain%x(1)%pos +n,nd) + from_pe = domain%x(1)%list(rpos)%pe + rpos = from_pe - root_pe ! for concurrent run, root_pe may not be 0. + if (from_pe == NULL_PE) then + num_words = 0 + else + num_words = (domain%list(rpos)%x(1)%compute%size+ishift) & + * (domain%list(rpos)%y(1)%compute%size+jshift) * ke + endif + ! Force use of scalar, integer ptr interface + call mpp_transmit( put_data=clocal(1), plen=num_word_me, to_pe=domain%x(1)%list(lpos)%pe, & + get_data=cremote(1), glen=num_words, from_pe=from_pe ) + m = 0 + if (from_pe /= NULL_PE) then + is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift + call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+jpos+jsc, jpos0+jpos+jec) + endif + call mpp_sync_self() !-ensure MPI_ISEND is done. + end do + else if( yonly )then + nd = size(domain%y(1)%list(:)) + do n = 1,nd-1 + lpos = mod(domain%y(1)%pos+nd-n,nd) + rpos = mod(domain%y(1)%pos +n,nd) + from_pe = domain%y(1)%list(rpos)%pe + rpos = from_pe - root_pe + if (from_pe == NULL_PE) then + num_words = 0 + else + num_words = (domain%list(rpos)%x(1)%compute%size+ishift) & + * (domain%list(rpos)%y(1)%compute%size+jshift) * ke + endif + ! Force use of scalar, integer pointer interface + call mpp_transmit( put_data=clocal(1), plen=num_word_me, to_pe=domain%y(1)%list(lpos)%pe, & + get_data=cremote(1), glen=num_words, from_pe=from_pe ) + m = 0 + if (from_pe /= NULL_PE) then + js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift + call vec2arr(cremote, global, ix, iy, ipos0+ipos+isc, ipos0+ipos+iec, jpos0+js, jpos0+je) + endif + call mpp_sync_self() !-ensure MPI_ISEND is done. + end do + else + tile_id = domain%tile_id(1) + nd = size(domain%list(:)) + if(root_only) then + if(domain%pe .NE. domain%tile_root_pe) then + call mpp_send( clocal(1), plen=num_word_me, to_pe=domain%tile_root_pe, tag=COMM_TAG_1 ) + else + do n = 1,nd-1 + rpos = mod(domain%pos+n,nd) + if( domain%list(rpos)%tile_id(1) .NE. tile_id ) cycle + num_words = (domain%list(rpos)%x(1)%compute%size+ishift) * & + & (domain%list(rpos)%y(1)%compute%size+jshift) * ke + call mpp_recv(cremote(1), glen=num_words, from_pe=domain%list(rpos)%pe, tag=COMM_TAG_1 ) + m = 0 + is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift + js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift + + call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+js, jpos0+je) + end do + endif + else + do n = 1,nd-1 + lpos = mod(domain%pos+nd-n,nd) + if( domain%list(lpos)%tile_id(1).NE. tile_id ) cycle ! global field only within tile + call mpp_send( clocal(1), plen=num_word_me, to_pe=domain%list(lpos)%pe, tag=COMM_TAG_2 ) + end do + do n = 1,nd-1 + rpos = mod(domain%pos+n,nd) + if( domain%list(rpos)%tile_id(1) .NE. tile_id ) cycle ! global field only within tile + num_words = (domain%list(rpos)%x(1)%compute%size+ishift) * & + & (domain%list(rpos)%y(1)%compute%size+jshift) * ke + call mpp_recv( cremote(1), glen=num_words, from_pe=domain%list(rpos)%pe, tag=COMM_TAG_2 ) + m = 0 + is = domain%list(rpos)%x(1)%compute%begin; ie = domain%list(rpos)%x(1)%compute%end+ishift + js = domain%list(rpos)%y(1)%compute%begin; je = domain%list(rpos)%y(1)%compute%end+jshift + + call vec2arr(cremote, global, ix, iy, ipos0+is, ipos0+ie, jpos0+js, jpos0+je) + end do + endif + end if + + call mpp_sync_self + end subroutine MPP_GLOBAL_FIELD_5D_ +!> @} diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh index 8a10b1eb13..9275ce635b 100644 --- a/mpp/include/mpp_group_update.fh +++ b/mpp/include/mpp_group_update.fh @@ -424,20 +424,22 @@ subroutine MPP_CREATE_GROUP_UPDATE_4D_V_( group, fieldx, fieldy, domain, flags, end subroutine MPP_CREATE_GROUP_UPDATE_4D_V_ -subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) +subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload) type(mpp_group_update_type), intent(inout) :: group type(domain2D), intent(inout) :: domain MPP_TYPE_, intent(in) :: d_type + logical, optional, intent(in) :: omp_offload integer :: nscalar, nvector, nlist logical :: recv_y(8) integer :: nsend, nrecv, flags_v integer :: msgsize - integer :: from_pe, to_pe, buffer_pos, pos + integer :: from_pe, to_pe, buffer_pos, pos, idx integer :: ksize, is, ie, js, je - integer :: n, l, m, i, j, k, buffer_start_pos, nk + integer :: n, l, m, i, j, k, buffer_start_pos, ni, nj, nk integer :: shift, gridtype, midpoint integer :: npack, nunpack, rotation, isd + logical :: use_device_ptr MPP_TYPE_ :: buffer(mpp_domains_stack_size) MPP_TYPE_ :: field (group%is_s:group%ie_s,group%js_s:group%je_s, group%ksize_s) @@ -453,6 +455,9 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) nlist = size(domain%list(:)) gridtype = group%gridtype + use_device_ptr = .false. + if (present(omp_offload)) use_device_ptr = omp_offload + !--- ksize_s must equal ksize_v if(nvector > 0 .AND. nscalar > 0) then if(group%ksize_s .NE. group%ksize_v) then @@ -481,13 +486,16 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) !---pre-post receive. call mpp_clock_begin(group_recv_clock) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target enter data map(alloc: buffer) if(use_device_ptr) +#endif do m = 1, nrecv msgsize = group%recv_size(m) from_pe = group%from_pe(m) if( msgsize .GT. 0 )then buffer_pos = group%buffer_pos_recv(m) call mpp_recv( buffer(buffer_pos+1), glen=msgsize, from_pe=from_pe, block=.false., & - tag=COMM_TAG_1) + tag=COMM_TAG_1, omp_offload=omp_offload) end if end do @@ -500,7 +508,19 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) call mpp_clock_begin(group_pack_clock) !pack the data buffer_start_pos = 0 + ! below switch runs OpenMP offloaded packing if ompoffload is .true. and compiled with + ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined + if (use_device_ptr) then +#include + else +#ifdef __NVCOMPILER_OPENMP_GPU +#undef __NVCOMPILER_OPENMP_GPU +#include +#define __NVCOMPILER_OPENMP_GPU +#else #include +#endif + endif call mpp_clock_end(group_pack_clock) call mpp_clock_begin(group_send_clock) @@ -509,7 +529,7 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) if( msgsize .GT. 0 )then buffer_pos = group%buffer_pos_send(n) to_pe = group%to_pe(n) - call mpp_send( buffer(buffer_pos+1), plen=msgsize, to_pe=to_pe, tag=COMM_TAG_1) + call mpp_send( buffer(buffer_pos+1), plen=msgsize, to_pe=to_pe, tag=COMM_TAG_1, omp_offload=omp_offload) endif enddo call mpp_clock_end(group_send_clock) @@ -523,10 +543,24 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) !---unpack the buffer nunpack = group%nunpack call mpp_clock_begin(group_unpk_clock) + ! below switch runs OpenMP offloaded unpacking if ompoffload is .true. and compiled with + ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined + if (use_device_ptr) then +#include + else +#ifdef __NVCOMPILER_OPENMP_GPU +#undef __NVCOMPILER_OPENMP_GPU +#include +#define __NVCOMPILER_OPENMP_GPU + !$omp target exit data map(release: buffer) if(use_device_ptr) +#else #include +#endif + endif call mpp_clock_end(group_unpk_clock) ! ---northern boundary fold + ! below omp offload loops have maps excluded as gcc doesn't support cray pointers in maps shift = 0 if(domain%symmetry) shift = 1 if( nvector >0 .AND. BTEST(domain%fold,NORTH) .AND. (.NOT.BTEST(flags_v,SCALAR_BIT)) )then @@ -538,15 +572,25 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) j = domain%y(1)%global%end+shift is = domain%x(1)%global%begin; ie = domain%x(1)%global%end+shift if( .NOT. domain%symmetry ) is = is - 1 + isd = domain%x(1)%domain_data%begin do i = is ,ie, midpoint - if( domain%x(1)%domain_data%begin.LE.i .AND. i.LE. domain%x(1)%domain_data%end+shift )then + if( isd.LE.i .AND. i.LE. domain%x(1)%domain_data%end+shift )then do l=1,nvector ptr_fieldx = group%addrs_x(l) ptr_fieldy = group%addrs_y(l) - do k = 1,ksize - fieldx(i,j,k) = 0. - fieldy(i,j,k) = 0. - end do + if (use_device_ptr) then + !$omp target teams distribute parallel do default(shared) & + !$omp shared(ksize, ptr_fieldx, ptr_fieldy) + do k = 1,ksize + fieldx(i,j,k) = 0. + fieldy(i,j,k) = 0. + end do + else + do k = 1,ksize + fieldx(i,j,k) = 0. + fieldy(i,j,k) = 0. + end do + end if end do end if end do @@ -563,19 +607,31 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) else is = domain%x(1)%global%begin - 1 end if - if( is.GT.domain%x(1)%domain_data%begin )then + isd = domain%x(1)%domain_data%begin + if( is.GT.isd )then - if( 2*is-domain%x(1)%domain_data%begin.GT.domain%x(1)%domain_data%end+shift ) & + if( 2*is-isd.GT.domain%x(1)%domain_data%end+shift ) & call mpp_error( FATAL, 'MPP_DO_UPDATE_V: folded-north BGRID_NE west edge ubound error.' ) do l=1,nvector ptr_fieldx = group%addrs_x(l) ptr_fieldy = group%addrs_y(l) - do k = 1,ksize - do i = domain%x(1)%domain_data%begin,is-1 - fieldx(i,j,k) = fieldx(2*is-i,j,k) - fieldy(i,j,k) = fieldy(2*is-i,j,k) + if (use_device_ptr) then + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp shared(ksize, isd, is, ptr_fieldx, ptr_fieldy) + do k = 1,ksize + do i = isd,is-1 + fieldx(i,j,k) = fieldx(2*is-i,j,k) + fieldy(i,j,k) = fieldy(2*is-i,j,k) + end do end do - end do + else + do k = 1,ksize + do i = isd,is-1 + fieldx(i,j,k) = fieldx(2*is-i,j,k) + fieldy(i,j,k) = fieldy(2*is-i,j,k) + end do + end do + endif end do end if case(CGRID_NE) @@ -586,11 +642,21 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) call mpp_error( FATAL, 'MPP_DO_UPDATE_V: folded-north CGRID_NE west edge ubound error.' ) do l=1,nvector ptr_fieldy = group%addrs_y(l) - do k = 1,ksize - do i = isd,is-1 - fieldy(i,j,k) = fieldy(2*is-i-1,j,k) + if (use_device_ptr) then + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp shared(ksize, isd, is, ptr_fieldy) + do k = 1,ksize + do i = isd,is-1 + fieldy(i,j,k) = fieldy(2*is-i-1,j,k) + end do end do - end do + else + do k = 1,ksize + do i = isd,is-1 + fieldy(i,j,k) = fieldy(2*is-i-1,j,k) + end do + end do + end if end do end if end select @@ -607,21 +673,42 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) do l=1,nvector ptr_fieldx = group%addrs_x(l) ptr_fieldy = group%addrs_y(l) - do k = 1,ksize - do i = is,ie - fieldx(i,j,k) = -fieldx(i,j,k) - fieldy(i,j,k) = -fieldy(i,j,k) + if (use_device_ptr) then + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp shared(ksize, is, ie, ptr_fieldx, ptr_fieldy) + do k = 1,ksize + do i = is,ie + fieldx(i,j,k) = -fieldx(i,j,k) + fieldy(i,j,k) = -fieldy(i,j,k) + end do end do - end do + else + do k = 1,ksize + do i = is,ie + fieldx(i,j,k) = -fieldx(i,j,k) + fieldy(i,j,k) = -fieldy(i,j,k) + end do + end do + end if end do case(CGRID_NE) do l=1,nvector ptr_fieldy = group%addrs_y(l) - do k = 1,ksize - do i = is, ie - fieldy(i,j,k) = -fieldy(i,j,k) + if (use_device_ptr) then + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp shared(ksize, is, ie, ptr_fieldy) + do k = 1,ksize + do i = is, ie + fieldy(i,j,k) = -fieldy(i,j,k) + end do end do - end do + else + do k = 1,ksize + do i = is, ie + fieldy(i,j,k) = -fieldy(i,j,k) + end do + end do + end if end do end select end if @@ -649,10 +736,11 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer) integer :: nscalar, nvector integer :: nsend, nrecv, flags_v integer :: msgsize, npack, rotation - integer :: from_pe, to_pe, buffer_pos, pos + integer :: from_pe, to_pe, buffer_pos, pos, idx integer :: ksize, is, ie, js, je - integer :: n, l, m, i, j, k, buffer_start_pos, nk + integer :: n, l, m, i, j, k, buffer_start_pos, ni, nj, nk logical :: reuse_buf_pos + logical, parameter :: use_device_ptr = .false. ! placeholder character(len=8) :: text MPP_TYPE_ :: buffer(size(mpp_domains_stack_nonblock(:))) @@ -729,7 +817,19 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer) call mpp_clock_begin(nonblock_group_pack_clock) npack = group%npack buffer_start_pos = group%buffer_start_pos + ! below switch runs OpenMP offloaded packing if ompoffload is .true. and compiled with + ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined + if (use_device_ptr) then +#include + else +#ifdef __NVCOMPILER_OPENMP_GPU +#undef __NVCOMPILER_OPENMP_GPU +#include +#define __NVCOMPILER_OPENMP_GPU +#else #include +#endif + endif call mpp_clock_end(nonblock_group_pack_clock) call mpp_clock_begin(nonblock_group_send_clock) @@ -752,11 +852,12 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type) MPP_TYPE_, intent(in) :: d_type integer :: nsend, nrecv, nscalar, nvector - integer :: k, buffer_pos, pos, m, n, l + integer :: k, buffer_pos, pos, m, n, l, idx integer :: is, ie, js, je, ksize, i, j integer :: shift, gridtype, midpoint, flags_v - integer :: nunpack, rotation, buffer_start_pos, nk, isd + integer :: nunpack, rotation, buffer_start_pos, ni, nj, nk, isd logical :: recv_y(8) + logical, parameter :: use_device_ptr = .false. ! placeholder MPP_TYPE_ :: buffer(size(mpp_domains_stack_nonblock(:))) MPP_TYPE_ :: field (group%is_s:group%ie_s,group%js_s:group%je_s, group%ksize_s) MPP_TYPE_ :: fieldx(group%is_x:group%ie_x,group%js_x:group%je_x, group%ksize_v) @@ -797,7 +898,20 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type) call mpp_clock_begin(nonblock_group_unpk_clock) buffer_start_pos = group%buffer_start_pos + ! below switch runs OpenMP offloaded unpacking if ompoffload is .true. and compiled with + ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined + if (use_device_ptr) then #include + else +#ifdef __NVCOMPILER_OPENMP_GPU +#undef __NVCOMPILER_OPENMP_GPU +#include +#define __NVCOMPILER_OPENMP_GPU + !$omp target exit data map(release: buffer) if(use_device_ptr) +#else +#include +#endif + endif call mpp_clock_end(nonblock_group_unpk_clock) ! ---northern boundary fold diff --git a/mpp/include/mpp_pack.inc b/mpp/include/mpp_pack.inc index 1e8b9164d6..70e1c52e6b 100644 --- a/mpp/include/mpp_pack.inc +++ b/mpp/include/mpp_pack.inc @@ -1,3 +1,5 @@ +#ifndef __NVCOMPILER + #undef MPP_TYPE_ #define MPP_TYPE_ real(r8_kind) #undef ARR2VEC_ @@ -81,3 +83,251 @@ #undef ARR_INIT_ #define ARR_INIT_ arr_init_l4 #include + +#else + +#undef MPP_TYPE_ +#define MPP_TYPE_ real(r8_kind) +#undef ARR2VEC_2D +#define ARR2VEC_2D arr2vec_r8_2d +#undef ARR2VEC_3D +#define ARR2VEC_3D arr2vec_r8_3d +#undef ARR2VEC_4D +#define ARR2VEC_4D arr2vec_r8_4d +#undef ARR2VEC_5D +#define ARR2VEC_5D arr2vec_r8_5d +#undef VEC2ARR_2D +#define VEC2ARR_2D vec2arr_r8_2d +#undef VEC2ARR_3D +#define VEC2ARR_3D vec2arr_r8_3d +#undef VEC2ARR_4D +#define VEC2ARR_4D vec2arr_r8_4d +#undef VEC2ARR_5D +#define VEC2ARR_5D vec2arr_r8_5d +#undef ARR_INIT_1D +#define ARR_INIT_1D arr_init_r8_1d +#undef ARR_INIT_2D +#define ARR_INIT_2D arr_init_r8_2d +#undef ARR_INIT_3D +#define ARR_INIT_3D arr_init_r8_3d +#undef ARR_INIT_4D +#define ARR_INIT_4D arr_init_r8_4d +#undef ARR_INIT_5D +#define ARR_INIT_5D arr_init_r8_5d +#include + +#ifdef OVERLOAD_C8 +#undef MPP_TYPE_ +#define MPP_TYPE_ complex(c8_kind) +#undef ARR2VEC_2D +#define ARR2VEC_2D arr2vec_c8_2d +#undef ARR2VEC_3D +#define ARR2VEC_3D arr2vec_c8_3d +#undef ARR2VEC_4D +#define ARR2VEC_4D arr2vec_c8_4d +#undef ARR2VEC_5D +#define ARR2VEC_5D arr2vec_c8_5d +#undef VEC2ARR_2D +#define VEC2ARR_2D vec2arr_c8_2d +#undef VEC2ARR_3D +#define VEC2ARR_3D vec2arr_c8_3d +#undef VEC2ARR_4D +#define VEC2ARR_4D vec2arr_c8_4d +#undef VEC2ARR_5D +#define VEC2ARR_5D vec2arr_c8_5d +#undef ARR_INIT_1D +#define ARR_INIT_1D arr_init_c8_1d +#undef ARR_INIT_2D +#define ARR_INIT_2D arr_init_c8_2d +#undef ARR_INIT_3D +#define ARR_INIT_3D arr_init_c8_3d +#undef ARR_INIT_4D +#define ARR_INIT_4D arr_init_c8_4d +#undef ARR_INIT_5D +#define ARR_INIT_5D arr_init_c8_5d +#include +#endif + +#undef MPP_TYPE_ +#define MPP_TYPE_ integer(i8_kind) +#undef ARR2VEC_2D +#define ARR2VEC_2D arr2vec_i8_2d +#undef ARR2VEC_3D +#define ARR2VEC_3D arr2vec_i8_3d +#undef ARR2VEC_4D +#define ARR2VEC_4D arr2vec_i8_4d +#undef ARR2VEC_5D +#define ARR2VEC_5D arr2vec_i8_5d +#undef VEC2ARR_2D +#define VEC2ARR_2D vec2arr_i8_2d +#undef VEC2ARR_3D +#define VEC2ARR_3D vec2arr_i8_3d +#undef VEC2ARR_4D +#define VEC2ARR_4D vec2arr_i8_4d +#undef VEC2ARR_5D +#define VEC2ARR_5D vec2arr_i8_5d +#undef ARR_INIT_1D +#define ARR_INIT_1D arr_init_i8_1d +#undef ARR_INIT_2D +#define ARR_INIT_2D arr_init_i8_2d +#undef ARR_INIT_3D +#define ARR_INIT_3D arr_init_i8_3d +#undef ARR_INIT_4D +#define ARR_INIT_4D arr_init_i8_4d +#undef ARR_INIT_5D +#define ARR_INIT_5D arr_init_i8_5d +#include + +#undef MPP_TYPE_ +#define MPP_TYPE_ logical(l8_kind) +#undef ARR2VEC_2D +#define ARR2VEC_2D arr2vec_l8_2d +#undef ARR2VEC_3D +#define ARR2VEC_3D arr2vec_l8_3d +#undef ARR2VEC_4D +#define ARR2VEC_4D arr2vec_l8_4d +#undef ARR2VEC_5D +#define ARR2VEC_5D arr2vec_l8_5d +#undef VEC2ARR_2D +#define VEC2ARR_2D vec2arr_l8_2d +#undef VEC2ARR_3D +#define VEC2ARR_3D vec2arr_l8_3d +#undef VEC2ARR_4D +#define VEC2ARR_4D vec2arr_l8_4d +#undef VEC2ARR_5D +#define VEC2ARR_5D vec2arr_l8_5d +#undef ARR_INIT_1D +#define ARR_INIT_1D arr_init_l8_1d +#undef ARR_INIT_2D +#define ARR_INIT_2D arr_init_l8_2d +#undef ARR_INIT_3D +#define ARR_INIT_3D arr_init_l8_3d +#undef ARR_INIT_4D +#define ARR_INIT_4D arr_init_l8_4d +#undef ARR_INIT_5D +#define ARR_INIT_5D arr_init_l8_5d +#include + +#undef MPP_TYPE_ +#define MPP_TYPE_ real(r4_kind) +#undef ARR2VEC_2D +#define ARR2VEC_2D arr2vec_r4_2d +#undef ARR2VEC_3D +#define ARR2VEC_3D arr2vec_r4_3d +#undef ARR2VEC_4D +#define ARR2VEC_4D arr2vec_r4_4d +#undef ARR2VEC_5D +#define ARR2VEC_5D arr2vec_r4_5d +#undef VEC2ARR_2D +#define VEC2ARR_2D vec2arr_r4_2d +#undef VEC2ARR_3D +#define VEC2ARR_3D vec2arr_r4_3d +#undef VEC2ARR_4D +#define VEC2ARR_4D vec2arr_r4_4d +#undef VEC2ARR_5D +#define VEC2ARR_5D vec2arr_r4_5d +#undef ARR_INIT_1D +#define ARR_INIT_1D arr_init_r4_1d +#undef ARR_INIT_2D +#define ARR_INIT_2D arr_init_r4_2d +#undef ARR_INIT_3D +#define ARR_INIT_3D arr_init_r4_3d +#undef ARR_INIT_4D +#define ARR_INIT_4D arr_init_r4_4d +#undef ARR_INIT_5D +#define ARR_INIT_5D arr_init_r4_5d +#include + +#ifdef OVERLOAD_C4 +#undef MPP_TYPE_ +#define MPP_TYPE_ complex(c4_kind) +#undef ARR2VEC_2D +#define ARR2VEC_2D arr2vec_c4_2d +#undef ARR2VEC_3D +#define ARR2VEC_3D arr2vec_c4_3d +#undef ARR2VEC_4D +#define ARR2VEC_4D arr2vec_c4_4d +#undef ARR2VEC_5D +#define ARR2VEC_5D arr2vec_c4_5d +#undef VEC2ARR_2D +#define VEC2ARR_2D vec2arr_c4_2d +#undef VEC2ARR_3D +#define VEC2ARR_3D vec2arr_c4_3d +#undef VEC2ARR_4D +#define VEC2ARR_4D vec2arr_c4_4d +#undef VEC2ARR_5D +#define VEC2ARR_5D vec2arr_c4_5d +#undef ARR_INIT_1D +#define ARR_INIT_1D arr_init_c4_1d +#undef ARR_INIT_2D +#define ARR_INIT_2D arr_init_c4_2d +#undef ARR_INIT_3D +#define ARR_INIT_3D arr_init_c4_3d +#undef ARR_INIT_4D +#define ARR_INIT_4D arr_init_c4_4d +#undef ARR_INIT_5D +#define ARR_INIT_5D arr_init_c4_5d +#include +#endif + +#undef MPP_TYPE_ +#define MPP_TYPE_ integer(i4_kind) +#undef ARR2VEC_2D +#define ARR2VEC_2D arr2vec_i4_2d +#undef ARR2VEC_3D +#define ARR2VEC_3D arr2vec_i4_3d +#undef ARR2VEC_4D +#define ARR2VEC_4D arr2vec_i4_4d +#undef ARR2VEC_5D +#define ARR2VEC_5D arr2vec_i4_5d +#undef VEC2ARR_2D +#define VEC2ARR_2D vec2arr_i4_2d +#undef VEC2ARR_3D +#define VEC2ARR_3D vec2arr_i4_3d +#undef VEC2ARR_4D +#define VEC2ARR_4D vec2arr_i4_4d +#undef VEC2ARR_5D +#define VEC2ARR_5D vec2arr_i4_5d +#undef ARR_INIT_1D +#define ARR_INIT_1D arr_init_i4_1d +#undef ARR_INIT_2D +#define ARR_INIT_2D arr_init_i4_2d +#undef ARR_INIT_3D +#define ARR_INIT_3D arr_init_i4_3d +#undef ARR_INIT_4D +#define ARR_INIT_4D arr_init_i4_4d +#undef ARR_INIT_5D +#define ARR_INIT_5D arr_init_i4_5d +#include + +#undef MPP_TYPE_ +#define MPP_TYPE_ logical(l4_kind) +#undef ARR2VEC_2D +#define ARR2VEC_2D arr2vec_l4_2d +#undef ARR2VEC_3D +#define ARR2VEC_3D arr2vec_l4_3d +#undef ARR2VEC_4D +#define ARR2VEC_4D arr2vec_l4_4d +#undef ARR2VEC_5D +#define ARR2VEC_5D arr2vec_l4_5d +#undef VEC2ARR_2D +#define VEC2ARR_2D vec2arr_l4_2d +#undef VEC2ARR_3D +#define VEC2ARR_3D vec2arr_l4_3d +#undef VEC2ARR_4D +#define VEC2ARR_4D vec2arr_l4_4d +#undef VEC2ARR_5D +#define VEC2ARR_5D vec2arr_l4_5d +#undef ARR_INIT_1D +#define ARR_INIT_1D arr_init_l4_1d +#undef ARR_INIT_2D +#define ARR_INIT_2D arr_init_l4_2d +#undef ARR_INIT_3D +#define ARR_INIT_3D arr_init_l4_3d +#undef ARR_INIT_4D +#define ARR_INIT_4D arr_init_l4_4d +#undef ARR_INIT_5D +#define ARR_INIT_5D arr_init_l4_5d +#include + +#endif diff --git a/mpp/include/mpp_pack_compat.fh b/mpp/include/mpp_pack_compat.fh new file mode 100644 index 0000000000..77c32505f3 --- /dev/null +++ b/mpp/include/mpp_pack_compat.fh @@ -0,0 +1,385 @@ +!*********************************************************************** +!* Apache License 2.0 +!* +!* This file is part of the GFDL Flexible Modeling System (FMS). +!* +!* Licensed under the Apache License, Version 2.0 (the "License"); +!* you may not use this file except in compliance with the License. +!* You may obtain a copy of the License at +!* +!* http://www.apache.org/licenses/LICENSE-2.0 +!* +!* FMS is distributed in the hope that it will be useful, but WITHOUT +!* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied; +!* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +!* PARTICULAR PURPOSE. See the License for the specific language +!* governing permissions and limitations under the License. +!*********************************************************************** + + !> @brief This file provides duplicate subroutines for the mpp_pack.fh, specifically for compatibility with + !! Fortran compilers that do not support the 2018 standard, specfically deferred rank arrays and select rank + !! constructs. + + !> @brief Pack a 2D domain-decomposed array into a contiguous 1D array. This is + !! typically used to prepare a contiguous array which will be transmitted via MPI. + subroutine ARR2VEC_2D (arr, vec, xdim, ydim, is, ie, js, je) + MPP_TYPE_, intent(in) :: arr(:,:) !< The array to be packed + MPP_TYPE_, intent(out) :: vec(:) !< The vector to copy the data into + integer, intent(in) :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical Fortran-based + !! code, xdim=1 and ydim=2. + integer, intent(in) :: is, ie, js, je !< Starting and ending indices of the x and y dimensions, expressed + !! in terms of the 1-based indices seen within ARR2VEC_2D. These indices + !! may be different from the indices used in the calling code! If `arr` + !! is defined over the data domain, e.g. dimension(isd:ied,jsd:jed), + !! and you wish to pack only the compute domain, the following values + !! should be passed: + !! is = isc - isd + 1 + !! ie = iec - isd + 1 + !! js = jsc - jsd + 1 + !! je = jec - jsd + 1 + integer, allocatable, dimension(:) :: lb, ub !< These are 1 and shape(arr) respectively for all dimensions + !! except the domain-decomposed dimensions, for which they are + !! set according to the is,ie,js,je options. + integer :: ndims, n + integer :: i1, i2, i3, i4, i5 + + ndims = 2 + allocate (lb(ndims), ub(ndims)) + + lb = 1 + ub = shape(arr) + + lb(xdim) = is + lb(ydim) = js + + ub(xdim) = ie + ub(ydim) = je + + n = product(ub - lb + 1) + + vec(1:n) = reshape(arr(lb(1):ub(1), lb(2):ub(2)), [n]) + end subroutine ARR2VEC_2D + + !> @brief Pack a 3D domain-decomposed array into a contiguous 1D array. This is + !! typically used to prepare a contiguous array which will be transmitted via MPI. + subroutine ARR2VEC_3D (arr, vec, xdim, ydim, is, ie, js, je) + MPP_TYPE_, intent(in) :: arr(:,:,:) !< The array to be packed + MPP_TYPE_, intent(out) :: vec(:) !< The vector to copy the data into + integer, intent(in) :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical Fortran-based + !! code, xdim=1 and ydim=2. + integer, intent(in) :: is, ie, js, je !< Starting and ending indices of the x and y dimensions, expressed + !! in terms of the 1-based indices seen within ARR2VEC_3D. These indices + !! may be different from the indices used in the calling code! If `arr` + !! is defined over the data domain, e.g. dimension(isd:ied,jsd:jed,...), + !! and you wish to pack only the compute domain, the following values + !! should be passed: + !! is = isc - isd + 1 + !! ie = iec - isd + 1 + !! js = jsc - jsd + 1 + !! je = jec - jsd + 1 + integer, allocatable, dimension(:) :: lb, ub !< These are 1 and shape(arr) respectively for all dimensions + !! except the domain-decomposed dimensions, for which they are + !! set according to the is,ie,js,je options. + integer :: ndims, n + integer :: i1, i2, i3, i4, i5 + + ndims = 3 + allocate (lb(ndims), ub(ndims)) + + lb = 1 + ub = shape(arr) + + lb(xdim) = is + lb(ydim) = js + + ub(xdim) = ie + ub(ydim) = je + + n = product(ub - lb + 1) + + vec(1:n) = reshape(arr(lb(1):ub(1), lb(2):ub(2), lb(3):ub(3)), [n]) + end subroutine ARR2VEC_3D + + !> @brief Pack a 4D domain-decomposed array into a contiguous 1D array. This is + !! typically used to prepare a contiguous array which will be transmitted via MPI. + subroutine ARR2VEC_4D (arr, vec, xdim, ydim, is, ie, js, je) + MPP_TYPE_, intent(in) :: arr(:,:,:,:) !< The array to be packed + MPP_TYPE_, intent(out) :: vec(:) !< The vector to copy the data into + integer, intent(in) :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical Fortran-based + !! code, xdim=1 and ydim=2. + integer, intent(in) :: is, ie, js, je !< Starting and ending indices of the x and y dimensions, expressed + !! in terms of the 1-based indices seen within ARR2VEC_4D. These indices + !! may be different from the indices used in the calling code! If `arr` + !! is defined over the data domain, e.g. dimension(isd:ied,jsd:jed,...), + !! and you wish to pack only the compute domain, the following values + !! should be passed: + !! is = isc - isd + 1 + !! ie = iec - isd + 1 + !! js = jsc - jsd + 1 + !! je = jec - jsd + 1 + integer, allocatable, dimension(:) :: lb, ub !< These are 1 and shape(arr) respectively for all dimensions + !! except the domain-decomposed dimensions, for which they are + !! set according to the is,ie,js,je options. + integer :: ndims, n + integer :: i1, i2, i3, i4, i5 + + ndims = 4 + allocate (lb(ndims), ub(ndims)) + + lb = 1 + ub = shape(arr) + + lb(xdim) = is + lb(ydim) = js + + ub(xdim) = ie + ub(ydim) = je + + n = product(ub - lb + 1) + + vec(1:n) = reshape(arr(lb(1):ub(1), lb(2):ub(2), lb(3):ub(3), lb(4):ub(4)), [n]) + end subroutine ARR2VEC_4D + + !> @brief Pack a 5D domain-decomposed array into a contiguous 1D array. This is + !! typically used to prepare a contiguous array which will be transmitted via MPI. + subroutine ARR2VEC_5D (arr, vec, xdim, ydim, is, ie, js, je) + MPP_TYPE_, intent(in) :: arr(:,:,:,:,:) !< The array to be packed + MPP_TYPE_, intent(out) :: vec(:) !< The vector to copy the data into + integer, intent(in) :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical Fortran-based + !! code, xdim=1 and ydim=2. + integer, intent(in) :: is, ie, js, je !< Starting and ending indices of the x and y dimensions, expressed + !! in terms of the 1-based indices seen within ARR2VEC_5D. These indices + !! may be different from the indices used in the calling code! If `arr` + !! is defined over the data domain, e.g. dimension(isd:ied,jsd:jed,...), + !! and you wish to pack only the compute domain, the following values + !! should be passed: + !! is = isc - isd + 1 + !! ie = iec - isd + 1 + !! js = jsc - jsd + 1 + !! je = jec - jsd + 1 + integer, allocatable, dimension(:) :: lb, ub !< These are 1 and shape(arr) respectively for all dimensions + !! except the domain-decomposed dimensions, for which they are + !! set according to the is,ie,js,je options. + integer :: ndims, n + integer :: i1, i2, i3, i4, i5 + + ndims = 5 + allocate (lb(ndims), ub(ndims)) + + lb = 1 + ub = shape(arr) + + lb(xdim) = is + lb(ydim) = js + + ub(xdim) = ie + ub(ydim) = je + + n = product(ub - lb + 1) + + vec(1:n) = reshape(arr(lb(1):ub(1), lb(2):ub(2), lb(3):ub(3), lb(4):ub(4), lb(5):ub(5)), [n]) + end subroutine ARR2VEC_5D + + !> @brief Unpack a contiguous 1D array into a 2D array. This is typically used to transform + !! data obtained from an MPI call into the format that is expected by the calling code. + subroutine VEC2ARR_2D (vec, arr, xdim, ydim, is, ie, js, je) + MPP_TYPE_, intent(in) :: vec(:) !< The 1D array to be unpacked + MPP_TYPE_, intent(out) :: arr(:,:) !< The multi-dimensional array to copy the data into + integer, intent(in) :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical Fortran-based + !! code, xdim=1 and ydim=2. + integer, intent(in) :: is, ie, js, je !< Starting and ending indices of the x and y dimensions, expressed + !! in terms of the 1-based indices seen within VEC2ARR_2D. These indices + !! may be different from the indices used in the calling code! If `arr` + !! is defined over the data domain, e.g. dimension(isd:ied,jsd:jed), + !! and you wish to unpack only the compute domain, the following values + !! should be passed: + !! is = isc - isd + 1 + !! ie = iec - isd + 1 + !! js = jsc - jsd + 1 + !! je = jec - jsd + 1 + integer, allocatable, dimension(:) :: lb, ub !< These are 1 and shape(arr) respectively for all dimensions + !! except the domain-decomposed dimensions, for which they are + !! set according to the is,ie,js,je options. + integer :: ndims, n + integer :: i1, i2, i3, i4, i5 + + ndims = 2 + allocate (lb(ndims), ub(ndims)) + + lb = 1 + ub = shape(arr) + + lb(xdim) = is + lb(ydim) = js + + ub(xdim) = ie + ub(ydim) = je + + associate (s => ub - lb + 1) + n = product(s) + arr(lb(1):ub(1), lb(2):ub(2)) = reshape(vec(1:n), s(1:2)) + end associate + end subroutine VEC2ARR_2D + + !> @brief Unpack a contiguous 1D array into a 3D array. This is typically used to transform + !! data obtained from an MPI call into the format that is expected by the calling code. + subroutine VEC2ARR_3D (vec, arr, xdim, ydim, is, ie, js, je) + MPP_TYPE_, intent(in) :: vec(:) !< The 1D array to be unpacked + MPP_TYPE_, intent(out) :: arr(:,:,:) !< The multi-dimensional array to copy the data into + integer, intent(in) :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical Fortran-based + !! code, xdim=1 and ydim=2. + integer, intent(in) :: is, ie, js, je !< Starting and ending indices of the x and y dimensions, expressed + !! in terms of the 1-based indices seen within VEC2ARR_3D. These indices + !! may be different from the indices used in the calling code! If `arr` + !! is defined over the data domain, e.g. dimension(isd:ied,jsd:jed,...), + !! and you wish to unpack only the compute domain, the following values + !! should be passed: + !! is = isc - isd + 1 + !! ie = iec - isd + 1 + !! js = jsc - jsd + 1 + !! je = jec - jsd + 1 + integer, allocatable, dimension(:) :: lb, ub !< These are 1 and shape(arr) respectively for all dimensions + !! except the domain-decomposed dimensions, for which they are + !! set according to the is,ie,js,je options. + integer :: ndims, n + integer :: i1, i2, i3, i4, i5 + + ndims = 3 + allocate (lb(ndims), ub(ndims)) + + lb = 1 + ub = shape(arr) + + lb(xdim) = is + lb(ydim) = js + + ub(xdim) = ie + ub(ydim) = je + + associate (s => ub - lb + 1) + n = product(s) + arr(lb(1):ub(1), lb(2):ub(2), lb(3):ub(3)) = reshape(vec(1:n), s(1:3)) + end associate + end subroutine VEC2ARR_3D + + !> @brief Unpack a contiguous 1D array into a 4D array. This is typically used to transform + !! data obtained from an MPI call into the format that is expected by the calling code. + subroutine VEC2ARR_4D (vec, arr, xdim, ydim, is, ie, js, je) + MPP_TYPE_, intent(in) :: vec(:) !< The 1D array to be unpacked + MPP_TYPE_, intent(out) :: arr(:,:,:,:) !< The multi-dimensional array to copy the data into + integer, intent(in) :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical Fortran-based + !! code, xdim=1 and ydim=2. + integer, intent(in) :: is, ie, js, je !< Starting and ending indices of the x and y dimensions, expressed + !! in terms of the 1-based indices seen within VEC2ARR_4D. These indices + !! may be different from the indices used in the calling code! If `arr` + !! is defined over the data domain, e.g. dimension(isd:ied,jsd:jed,...), + !! and you wish to unpack only the compute domain, the following values + !! should be passed: + !! is = isc - isd + 1 + !! ie = iec - isd + 1 + !! js = jsc - jsd + 1 + !! je = jec - jsd + 1 + integer, allocatable, dimension(:) :: lb, ub !< These are 1 and shape(arr) respectively for all dimensions + !! except the domain-decomposed dimensions, for which they are + !! set according to the is,ie,js,je options. + integer :: ndims, n + integer :: i1, i2, i3, i4, i5 + + ndims = 4 + allocate (lb(ndims), ub(ndims)) + + lb = 1 + ub = shape(arr) + + lb(xdim) = is + lb(ydim) = js + + ub(xdim) = ie + ub(ydim) = je + + associate (s => ub - lb + 1) + n = product(s) + arr(lb(1):ub(1), lb(2):ub(2), lb(3):ub(3), lb(4):ub(4)) = reshape(vec(1:n), s(1:4)) + end associate + end subroutine VEC2ARR_4D + + !> @brief Unpack a contiguous 1D array into a 5D array. This is typically used to transform + !! data obtained from an MPI call into the format that is expected by the calling code. + subroutine VEC2ARR_5D (vec, arr, xdim, ydim, is, ie, js, je) + MPP_TYPE_, intent(in) :: vec(:) !< The 1D array to be unpacked + MPP_TYPE_, intent(out) :: arr(:,:,:,:,:) !< The multi-dimensional array to copy the data into + integer, intent(in) :: xdim, ydim !< Indices of the domain-decomposed dimensions. In typical Fortran-based + !! code, xdim=1 and ydim=2. + integer, intent(in) :: is, ie, js, je !< Starting and ending indices of the x and y dimensions, expressed + !! in terms of the 1-based indices seen within VEC2ARR_5D. These indices + !! may be different from the indices used in the calling code! If `arr` + !! is defined over the data domain, e.g. dimension(isd:ied,jsd:jed,...), + !! and you wish to unpack only the compute domain, the following values + !! should be passed: + !! is = isc - isd + 1 + !! ie = iec - isd + 1 + !! js = jsc - jsd + 1 + !! je = jec - jsd + 1 + integer, allocatable, dimension(:) :: lb, ub !< These are 1 and shape(arr) respectively for all dimensions + !! except the domain-decomposed dimensions, for which they are + !! set according to the is,ie,js,je options. + integer :: ndims, n + integer :: i1, i2, i3, i4, i5 + + ndims = 5 + allocate (lb(ndims), ub(ndims)) + + lb = 1 + ub = shape(arr) + + lb(xdim) = is + lb(ydim) = js + + ub(xdim) = ie + ub(ydim) = je + + associate (s => ub - lb + 1) + n = product(s) + arr(lb(1):ub(1), lb(2):ub(2), lb(3):ub(3), lb(4):ub(4), lb(5):ub(5)) = reshape(vec(1:n), s(1:5)) + end associate + end subroutine VEC2ARR_5D + + !> @brief Initialize a 1D array to `val`. + subroutine ARR_INIT_1D (arr, val) + MPP_TYPE_, dimension(:), intent(out) :: arr !< The array to be initialized + MPP_TYPE_, intent(in) :: val !< The value to initialize to + + arr = val + end subroutine ARR_INIT_1D + + !> @brief Initialize a 2D array to `val`. + subroutine ARR_INIT_2D (arr, val) + MPP_TYPE_, dimension(:,:), intent(out) :: arr !< The array to be initialized + MPP_TYPE_, intent(in) :: val !< The value to initialize to + + arr = val + end subroutine ARR_INIT_2D + + !> @brief Initialize a 3D array to `val`. + subroutine ARR_INIT_3D (arr, val) + MPP_TYPE_, dimension(:,:,:), intent(out) :: arr !< The array to be initialized + MPP_TYPE_, intent(in) :: val !< The value to initialize to + + arr = val + end subroutine ARR_INIT_3D + + !> @brief Initialize a 4D array to `val`. + subroutine ARR_INIT_4D (arr, val) + MPP_TYPE_, dimension(:,:,:,:), intent(out) :: arr !< The array to be initialized + MPP_TYPE_, intent(in) :: val !< The value to initialize to + + arr = val + end subroutine ARR_INIT_4D + + !> @brief Initialize a 5D array to `val`. + subroutine ARR_INIT_5D (arr, val) + MPP_TYPE_, dimension(:,:,:,:,:), intent(out) :: arr !< The array to be initialized + MPP_TYPE_, intent(in) :: val !< The value to initialize to + + arr = val + end subroutine ARR_INIT_5D \ No newline at end of file diff --git a/mpp/include/mpp_transmit.inc b/mpp/include/mpp_transmit.inc index b745e71da0..3b55a5b339 100644 --- a/mpp/include/mpp_transmit.inc +++ b/mpp/include/mpp_transmit.inc @@ -171,7 +171,7 @@ call mpp_transmit( put_data, put_len, to_pe, dummy, 1, NULL_PE, tag=tag, send_request=request ) end subroutine MPP_SEND_ - subroutine MPP_RECV_SCALAR_( get_data, from_pe, glen, block, tag, request ) + subroutine MPP_RECV_SCALAR_( get_data, from_pe, glen, block, tag, request, omp_offload ) !a mpp_transmit with null arguments on the put side integer, intent(in) :: from_pe MPP_TYPE_, intent(out) :: get_data @@ -180,6 +180,7 @@ type(mpi_request), intent(out), optional :: request integer, optional, intent(in) :: glen + logical, optional, intent(in) :: omp_offload integer :: get_len MPP_TYPE_ :: get_data1D(1) MPP_TYPE_ :: dummy(1) @@ -189,17 +190,19 @@ ptr = LOC(get_data) get_len=1; if(PRESENT(glen))get_len=glen - call mpp_transmit( dummy, 1, NULL_PE, get_data1D, get_len, from_pe, block, tag, recv_request=request ) + call mpp_transmit( dummy, 1, NULL_PE, get_data1D, get_len, from_pe, & + block, tag, recv_request=request, omp_offload=omp_offload ) end subroutine MPP_RECV_SCALAR_ - subroutine MPP_SEND_SCALAR_( put_data, to_pe, plen, tag, request) + subroutine MPP_SEND_SCALAR_( put_data, to_pe, plen, tag, request, omp_offload) !a mpp_transmit with null arguments on the get side integer, intent(in) :: to_pe MPP_TYPE_, intent(in) :: put_data integer, optional, intent(in) :: plen integer, intent(in), optional :: tag type(mpi_request), intent(out), optional :: request + logical, optional, intent(in) :: omp_offload integer :: put_len MPP_TYPE_ :: put_data1D(1) MPP_TYPE_ :: dummy(1) @@ -207,7 +210,8 @@ pointer( ptr, put_data1D ) ptr = LOC(put_data) put_len=1; if(PRESENT(plen))put_len=plen - call mpp_transmit( put_data1D, put_len, to_pe, dummy, 1, NULL_PE, tag = tag, send_request=request ) + call mpp_transmit( put_data1D, put_len, to_pe, dummy, 1, NULL_PE, & + tag=tag, send_request=request, omp_offload=omp_offload ) end subroutine MPP_SEND_SCALAR_ @@ -220,7 +224,8 @@ type(mpi_request), intent(out), optional :: request MPP_TYPE_ :: dummy(1,1) - call mpp_transmit( dummy, 1, NULL_PE, get_data, get_len, from_pe, block, tag, recv_request=request ) + call mpp_transmit( dummy, 1, NULL_PE, get_data, get_len, from_pe, & + block, tag, recv_request=request ) end subroutine MPP_RECV_2D_ subroutine MPP_SEND_2D_( put_data, put_len, to_pe, tag, request ) diff --git a/mpp/include/mpp_transmit_mpi.fh b/mpp/include/mpp_transmit_mpi.fh index d649a984de..d7b28880d5 100644 --- a/mpp/include/mpp_transmit_mpi.fh +++ b/mpp/include/mpp_transmit_mpi.fh @@ -36,7 +36,7 @@ !!(avoiding f90 rank conformance check) !!caller is responsible for completion checks (mpp_sync_self) before and after subroutine MPP_TRANSMIT_( put_data, put_len, to_pe, get_data, get_len, from_pe, block, tag, recv_request, & - & send_request ) + & send_request, omp_offload ) integer, intent(in) :: put_len, to_pe, get_len, from_pe MPP_TYPE_, intent(in) :: put_data(*) @@ -44,14 +44,19 @@ logical, intent(in), optional :: block integer, intent(in), optional :: tag type(mpi_request), intent(out), optional :: recv_request, send_request + logical, intent(in), optional :: omp_offload logical :: block_comm integer :: i integer :: comm_tag integer :: rsize + logical :: use_device_ptr if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_TRANSMIT: You must first call mpp_init.' ) if( to_pe.EQ.NULL_PE .AND. from_pe.EQ.NULL_PE )return + use_device_ptr = .false. + if (present(omp_offload)) use_device_ptr = omp_offload + block_comm = .true. if(PRESENT(block)) block_comm = block @@ -81,8 +86,15 @@ if( cur_send_request > max_request ) & call mpp_error(FATAL, & & "MPP_TRANSMIT: cur_send_request is greater than max_request, increase mpp_nml request_multiply") - call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, & - request_send(cur_send_request), error) + if (use_device_ptr) then + !$omp target data use_device_addr(put_data) + call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, & + request_send(cur_send_request), error) + !$omp end target data + else + call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, & + request_send(cur_send_request), error) + endif endif if (debug .and. (current_clock.NE.0)) call increment_current_clock(EVENT_SEND, put_len*MPP_TYPE_BYTELEN_) else if (to_pe.EQ.ALL_PES) then !this is a broadcast from from_pe @@ -129,8 +141,15 @@ if( cur_recv_request > max_request ) & call mpp_error(FATAL, & "MPP_TRANSMIT: cur_recv_request is greater than max_request, increase mpp_nml request_multiply") - call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, & - request_recv(cur_recv_request), error ) + if (use_device_ptr) then + !$omp target data use_device_addr(get_data) + call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, & + request_recv(cur_recv_request), error ) + !$omp end target data + else + call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, & + request_recv(cur_recv_request), error ) + endif size_recv(cur_recv_request) = get_len type_recv(cur_recv_request) = MPI_TYPE_ endif diff --git a/mpp/include/mpp_transmit_nocomm.fh b/mpp/include/mpp_transmit_nocomm.fh index 5db86f978d..732155280e 100644 --- a/mpp/include/mpp_transmit_nocomm.fh +++ b/mpp/include/mpp_transmit_nocomm.fh @@ -35,7 +35,7 @@ !!words from an array of any rank to be passed (avoiding f90 rank conformance check) !!caller is responsible for completion checks (mpp_sync_self) before and after subroutine MPP_TRANSMIT_( put_data, put_len, to_pe, get_data, get_len, from_pe, block, tag, recv_request, & - & send_request ) + & send_request, omp_offload ) integer, intent(in) :: put_len, to_pe, get_len, from_pe MPP_TYPE_, intent(in) :: put_data(*) @@ -43,6 +43,8 @@ logical, intent(in), optional :: block integer, intent(in), optional :: tag type(mpi_request), intent(out), optional :: recv_request, send_request + logical, intent(in), optional :: omp_offload + ! NOTE: omp_offload is unused in this function integer :: i, outunit MPP_TYPE_, allocatable, save :: local_data(:) !local copy used by non-parallel code (no SHMEM or MPI) @@ -54,6 +56,7 @@ if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_TRANSMIT: You must first call mpp_init.' ) if( to_pe.EQ.NULL_PE .AND. from_pe.EQ.NULL_PE )return + outunit = stdout() if( debug )then call SYSTEM_CLOCK(tick) diff --git a/mpp/include/mpp_util_mpi.inc b/mpp/include/mpp_util_mpi.inc index f3b18518d1..a30e5f1a22 100644 --- a/mpp/include/mpp_util_mpi.inc +++ b/mpp/include/mpp_util_mpi.inc @@ -54,7 +54,11 @@ subroutine mpp_error_basic( errortype, errormsg ) text_errortype = 'WARNING: non-existent errortype (must be NOTE|WARNING|FATAL)' end select - if( npes.GT.1 ) write( text,'(a,i6)' ) trim(text_errortype) // ' from PE', pe !this is the mpp part + if( npes.GT.1 ) then + write( text,'(a,i6)' ) trim(text_errortype) // ' from PE', pe !this is the mpp part + else + text = text_errortype + endif if( PRESENT(errormsg) )text = trim(text) // ': ' // trim(errormsg) !$OMP CRITICAL (MPP_ERROR_CRITICAL) select case( errortype ) diff --git a/mpp/mpp_domains.F90 b/mpp/mpp_domains.F90 index da9a1ff848..ca4c8c61e2 100644 --- a/mpp/mpp_domains.F90 +++ b/mpp/mpp_domains.F90 @@ -1785,6 +1785,8 @@ module mpp_domains_mod !! @endcode !> @ingroup mpp_domains_mod interface mpp_global_field + +#ifndef __NVCOMPILER module procedure mpp_global_field_r8 #ifdef OVERLOAD_C8 module procedure mpp_global_field_c8 @@ -1797,6 +1799,45 @@ module mpp_domains_mod #endif module procedure mpp_global_field_i4 module procedure mpp_global_field_l4 +!! rank specific interfaces are needed for NVHPC because it does not support deferred-rank arrays +#else + module procedure mpp_global_field_r8_2d + module procedure mpp_global_field_r8_3d + module procedure mpp_global_field_r8_4d + module procedure mpp_global_field_r8_5d +#ifdef OVERLOAD_C8 + module procedure mpp_global_field_c8_2d + module procedure mpp_global_field_c8_3d + module procedure mpp_global_field_c8_4d + module procedure mpp_global_field_c8_5d +#endif + module procedure mpp_global_field_i8_2d + module procedure mpp_global_field_i8_3d + module procedure mpp_global_field_i8_4d + module procedure mpp_global_field_i8_5d + module procedure mpp_global_field_l8_2d + module procedure mpp_global_field_l8_3d + module procedure mpp_global_field_l8_4d + module procedure mpp_global_field_l8_5d + module procedure mpp_global_field_r4_2d + module procedure mpp_global_field_r4_3d + module procedure mpp_global_field_r4_4d + module procedure mpp_global_field_r4_5d +#ifdef OVERLOAD_C4 + module procedure mpp_global_field_c4_2d + module procedure mpp_global_field_c4_3d + module procedure mpp_global_field_c4_4d + module procedure mpp_global_field_c4_5d +#endif + module procedure mpp_global_field_i4_2d + module procedure mpp_global_field_i4_3d + module procedure mpp_global_field_i4_4d + module procedure mpp_global_field_i4_5d + module procedure mpp_global_field_l4_2d + module procedure mpp_global_field_l4_3d + module procedure mpp_global_field_l4_4d + module procedure mpp_global_field_l4_5d +#endif end interface !> @ingroup mpp_domains_mod @@ -2297,6 +2338,7 @@ module mpp_domains_mod !> Private interface to pack an array into a vector !> @ingroup mpp_domains_mod interface arr2vec +#ifndef __NVCOMPILER module procedure arr2vec_r8 #ifdef OVERLOAD_C8 module procedure arr2vec_c8 @@ -2309,11 +2351,38 @@ module mpp_domains_mod #endif module procedure arr2vec_i4 module procedure arr2vec_l4 +#else + module procedure arr2vec_r8_2d + module procedure arr2vec_r8_3d + module procedure arr2vec_r8_4d + module procedure arr2vec_r8_5d + module procedure arr2vec_i8_2d + module procedure arr2vec_i8_3d + module procedure arr2vec_i8_4d + module procedure arr2vec_i8_5d + module procedure arr2vec_l8_2d + module procedure arr2vec_l8_3d + module procedure arr2vec_l8_4d + module procedure arr2vec_l8_5d + module procedure arr2vec_r4_2d + module procedure arr2vec_r4_3d + module procedure arr2vec_r4_4d + module procedure arr2vec_r4_5d + module procedure arr2vec_i4_2d + module procedure arr2vec_i4_3d + module procedure arr2vec_i4_4d + module procedure arr2vec_i4_5d + module procedure arr2vec_l4_2d + module procedure arr2vec_l4_3d + module procedure arr2vec_l4_4d + module procedure arr2vec_l4_5d +#endif end interface !> Private interface to unpack a vector into an array !> @ingroup mpp_domains_mod interface vec2arr +#ifndef __NVCOMPILER module procedure vec2arr_r8 #ifdef OVERLOAD_C8 module procedure vec2arr_c8 @@ -2326,11 +2395,38 @@ module mpp_domains_mod #endif module procedure vec2arr_i4 module procedure vec2arr_l4 +#else + module procedure vec2arr_r8_2d + module procedure vec2arr_r8_3d + module procedure vec2arr_r8_4d + module procedure vec2arr_r8_5d + module procedure vec2arr_i8_2d + module procedure vec2arr_i8_3d + module procedure vec2arr_i8_4d + module procedure vec2arr_i8_5d + module procedure vec2arr_l8_2d + module procedure vec2arr_l8_3d + module procedure vec2arr_l8_4d + module procedure vec2arr_l8_5d + module procedure vec2arr_r4_2d + module procedure vec2arr_r4_3d + module procedure vec2arr_r4_4d + module procedure vec2arr_r4_5d + module procedure vec2arr_i4_2d + module procedure vec2arr_i4_3d + module procedure vec2arr_i4_4d + module procedure vec2arr_i4_5d + module procedure vec2arr_l4_2d + module procedure vec2arr_l4_3d + module procedure vec2arr_l4_4d + module procedure vec2arr_l4_5d +#endif end interface !> Private interface to initialize an assumed-rank array !> @ingroup mpp_domains_mod interface arr_init +#ifndef __NVCOMPILER module procedure arr_init_r8 #ifdef OVERLOAD_C8 module procedure arr_init_c8 @@ -2343,6 +2439,32 @@ module mpp_domains_mod #endif module procedure arr_init_i4 module procedure arr_init_l4 +#else + module procedure arr_init_r8_2d + module procedure arr_init_r8_3d + module procedure arr_init_r8_4d + module procedure arr_init_r8_5d + module procedure arr_init_i8_2d + module procedure arr_init_i8_3d + module procedure arr_init_i8_4d + module procedure arr_init_i8_5d + module procedure arr_init_l8_2d + module procedure arr_init_l8_3d + module procedure arr_init_l8_4d + module procedure arr_init_l8_5d + module procedure arr_init_r4_2d + module procedure arr_init_r4_3d + module procedure arr_init_r4_4d + module procedure arr_init_r4_5d + module procedure arr_init_i4_2d + module procedure arr_init_i4_3d + module procedure arr_init_i4_4d + module procedure arr_init_i4_5d + module procedure arr_init_l4_2d + module procedure arr_init_l4_3d + module procedure arr_init_l4_4d + module procedure arr_init_l4_5d +#endif end interface ! Include variable "version" to be written to log file. @@ -2357,8 +2479,8 @@ module mpp_domains_mod #include #include #include +#include #include #include -#include end module mpp_domains_mod diff --git a/test_fms/mpp/test_mpp_domains.F90 b/test_fms/mpp/test_mpp_domains.F90 index e681173944..0ef0abe929 100644 --- a/test_fms/mpp/test_mpp_domains.F90 +++ b/test_fms/mpp/test_mpp_domains.F90 @@ -82,6 +82,7 @@ program test_mpp_domains logical :: test_edge_update = .false. logical :: test_nonsym_edge = .false. logical :: test_group = .false. + logical :: test_group_offload = .false. logical :: test_cubic_grid_redistribute = .false. logical :: check_parallel = .FALSE. logical :: test_get_nbr = .FALSE. @@ -113,7 +114,7 @@ program test_mpp_domains layout_ensemble, nthreads, test_boundary, layout_tripolar, & test_group, test_global_sum, test_subset, test_nonsym_edge, & test_halosize_performance, test_adjoint, wide_halo, & - test_unstruct + test_unstruct, test_group_offload integer :: i, j, k, n, p integer :: layout(2) integer :: id @@ -236,6 +237,13 @@ program test_mpp_domains if (mpp_pe() == mpp_root_pe()) print *, '--------------------> Finished testing group_update <-------------------' endif + if( test_group_offload) then + if (mpp_pe() == mpp_root_pe()) print *, '--------------------> Calling test_group <-------------------' + call test_group_update( 'Folded-north', use_omp_offload=test_group_offload ) + call test_group_update( 'Cubic-Grid', use_omp_offload=test_group_offload ) + if (mpp_pe() == mpp_root_pe()) print *, '--------------------> Finished test_group <-------------------' + endif + if( test_interface ) then if (mpp_pe() == mpp_root_pe()) print *, '--------------------> Calling test_interface <-------------------' call test_modify_domain() @@ -2486,6 +2494,607 @@ end subroutine test_mpp_global_sum #undef FMS_TEST_KIND_ #undef TEST_GROUP_UPDATE_ + !############################################################### + subroutine test_group_update( type, use_omp_offload ) + character(len=*), intent(in) :: type + logical, intent(in), optional :: use_omp_offload + + type(domain2D) :: domain + integer :: num_contact, ntiles, npes_per_tile + integer :: i, j, k, l, n, shift + integer :: isc, iec, jsc, jec, isd, ied, jsd, jed + integer :: ism, iem, jsm, jem + + integer, allocatable, dimension(:) :: pe_start, pe_end, tile1, tile2 + integer, allocatable, dimension(:) :: istart1, iend1, jstart1, jend1 + integer, allocatable, dimension(:) :: istart2, iend2, jstart2, jend2 + integer, allocatable, dimension(:,:) :: layout2D, global_indices + real, allocatable, dimension(:,:,:,:) :: x1, y1, x2, y2 + real, allocatable, dimension(:,:,:,:) :: a1, a2 + real, allocatable, dimension(:,:,:) :: base + integer :: id1, id2, id3 + logical :: folded_north + logical :: cubic_grid + character(len=3) :: text + integer :: nx_save, ny_save + type(mpp_group_update_type) :: group_update + type(mpp_group_update_type), allocatable :: update_list(:) + logical :: do_omp_offload + + folded_north = .false. + cubic_grid = .false. + do_omp_offload = .false. + if (present(use_omp_offload)) do_omp_offload = use_omp_offload + + nx_save = nx + ny_save = ny + !--- check the type + select case(type) + case ( 'Folded-north' ) + ntiles = 1 + shift = 0 + num_contact = 2 + folded_north = .true. + npes_per_tile = npes + if(layout_tripolar(1)*layout_tripolar(2) == npes ) then + layout = layout_tripolar + else + call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout ) + endif + case ( 'Cubic-Grid' ) + if( nx_cubic == 0 ) then + call mpp_error(NOTE,'test_group_update: for Cubic_grid mosaic, nx_cubic is zero, '//& + 'No test is done for Cubic-Grid mosaic. ' ) + return + endif + if( nx_cubic .NE. ny_cubic ) then + call mpp_error(NOTE,'test_group_update: for Cubic_grid mosaic, nx_cubic does not equal ny_cubic, '//& + 'No test is done for Cubic-Grid mosaic. ' ) + return + endif + shift = 1 + nx = nx_cubic + ny = ny_cubic + ntiles = 6 + num_contact = 12 + cubic_grid = .true. + if( mod(npes, ntiles) == 0 ) then + npes_per_tile = npes/ntiles + write(outunit,*)'NOTE from update_domains_performance ==> For Mosaic "', trim(type), & + '", each tile will be distributed over ', npes_per_tile, ' processors.' + else + call mpp_error(NOTE,'test_group_update: npes should be multiple of ntiles No test is done for '//trim(type)) + return + endif + if(layout_cubic(1)*layout_cubic(2) == npes_per_tile) then + layout = layout_cubic + else + call mpp_define_layout( (/1,nx,1,ny/), npes_per_tile, layout ) + endif + case default + call mpp_error(FATAL, 'test_group_update: no such test: '//type) + end select + + allocate(layout2D(2,ntiles), global_indices(4,ntiles), pe_start(ntiles), pe_end(ntiles) ) + do n = 1, ntiles + pe_start(n) = (n-1)*npes_per_tile + pe_end(n) = n*npes_per_tile-1 + end do + + do n = 1, ntiles + global_indices(:,n) = (/1,nx,1,ny/) + layout2D(:,n) = layout + end do + + allocate(tile1(num_contact), tile2(num_contact) ) + allocate(istart1(num_contact), iend1(num_contact), jstart1(num_contact), jend1(num_contact) ) + allocate(istart2(num_contact), iend2(num_contact), jstart2(num_contact), jend2(num_contact) ) + + !--- define domain + if(folded_north) then + !--- Contact line 1, between tile 1 (EAST) and tile 1 (WEST) --- cyclic + tile1(1) = 1; tile2(1) = 1 + istart1(1) = nx; iend1(1) = nx; jstart1(1) = 1; jend1(1) = ny + istart2(1) = 1; iend2(1) = 1; jstart2(1) = 1; jend2(1) = ny + !--- Contact line 2, between tile 1 (NORTH) and tile 1 (NORTH) --- folded-north-edge + tile1(2) = 1; tile2(2) = 1 + istart1(2) = 1; iend1(2) = nx/2; jstart1(2) = ny; jend1(2) = ny + istart2(2) = nx; iend2(2) = nx/2+1; jstart2(2) = ny; jend2(2) = ny + call mpp_define_mosaic(global_indices, layout2D, domain, ntiles, num_contact, tile1, tile2, & + istart1, iend1, jstart1, jend1, istart2, iend2, jstart2, jend2, & + pe_start, pe_end, whalo=whalo, ehalo=ehalo, shalo=shalo, nhalo=nhalo, & + name = type, symmetry = .false. ) + else if( cubic_grid ) then + call define_cubic_mosaic(type, domain, (/nx,nx,nx,nx,nx,nx/), (/ny,ny,ny,ny,ny,ny/), & + global_indices, layout2D, pe_start, pe_end ) + endif + + !--- setup data + call mpp_get_compute_domain( domain, isc, iec, jsc, jec ) + call mpp_get_data_domain ( domain, isd, ied, jsd, jed ) + call mpp_get_memory_domain ( domain, ism, iem, jsm, jem ) + + if(num_fields<1) then + call mpp_error(FATAL, "test_mpp_domains: num_fields must be a positive integer") + endif + + allocate(update_list(num_fields)) + + id1 = mpp_clock_id( type//' group 2D', flags=MPP_CLOCK_SYNC ) + id2 = mpp_clock_id( type//' non-group 2D', flags=MPP_CLOCK_SYNC ) + id3 = mpp_clock_id( type//' non-block group 2D', flags=MPP_CLOCK_SYNC ) + + allocate( a1(ism:iem, jsm:jem, nz, num_fields) ) + allocate( x1(ism:iem+shift,jsm:jem, nz, num_fields) ) + allocate( y1(ism:iem, jsm:jem+shift, nz, num_fields) ) + allocate( a2(ism:iem, jsm:jem, nz, num_fields) ) + allocate( x2(ism:iem+shift,jsm:jem, nz, num_fields) ) + allocate( y2(ism:iem, jsm:jem+shift, nz, num_fields) ) + allocate( base(isc:iec+shift,jsc:jec+shift,nz) ) + a1 = 0; x1 = 0; y1 = 0 + ! allocate a1, x1, y1 on GPU (GPU x1, y1 only used for CGRID test) + !$omp target enter data if(do_omp_offload) map(to: a1, x1, y1) + + base = 0 + do k = 1,nz + do j = jsc, jec+shift + do i = isc, iec+shift + base(i,j,k) = k + i*1e-3 + j*1e-6 + end do + end do + end do + + !--- Test for partial direction update + do l =1, num_fields + call mpp_create_group_update(group_update, a1(:,:,:,l), domain, flags=WUPDATE+SUPDATE) + end do + + ! initialize a1 values on GPU only to check GPU data is being transferred + !$omp target teams loop if(do_omp_offload) default(shared) & + !$omp shared(num_fields, nz, jsc, jec, isc, iec, a1, base) map(tofrom: a1) map(to: base) + do l = 1, num_fields + do k=1,nz + do j=jsc,jec + do i=isc,iec + a1(i,j,k,l) = base(i,j,k) + l*1e3 + enddo + enddo + do i=isc-1,iec+1 + a1(i,jsc-1,k,l) = 999; + a1(i,jec+1,k,l) = 999; + enddo + do j=jsc,jec + a1(isc-1,j,k,l) = 999 + a1(iec+1,j,k,l) = 999 + enddo + enddo + enddo + !$omp target teams loop collapse(4) if(do_omp_offload) default(shared) & + !$omp shared(num_fields, nz, jsm, jem, ism, iem, a1, a2) map(to: a1) map(from: a2) + do l=1,num_fields ; do k=1,nz ; do j=jsm,jem ; do i=ism,iem + a2(i,j,k,l) = a1(i,j,k,l) + enddo ; enddo ; enddo ; enddo + call mpp_do_group_update(group_update, domain, a1(isc,jsc,1,1), omp_offload=do_omp_offload) + + do l = 1, num_fields + call mpp_update_domains( a2(:,:,:,l), domain, flags=WUPDATE+SUPDATE, complete=l==num_fields ) + enddo + + ! copy a1 back to host + !$omp target update if(do_omp_offload) from(a1) + do l = 1, num_fields + write(text, '(i3.3)') l + call compare_checksums(a1(isd:ied,jsd:jed,:,l),a2(isd:ied,jsd:jed,:,l),type//' CENTER South West '//text) + enddo + + call mpp_clear_group_update(group_update) + + !--- Test for DGRID update + if(type == 'Cubic-Grid' ) then + x1 = 0; y1 = 0 + do l =1, num_fields + call mpp_create_group_update(group_update, x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=DGRID_NE) + end do + + do l = 1, num_fields + y1(isc:iec+shift,jsc:jec, :,l) = base(isc:iec+shift,jsc:jec, :) + l*1e3 + 1e6 + x1(isc:iec, jsc:jec+shift,:,l) = base(isc:iec, jsc:jec+shift,:) + l*1e3 + 2*1e6 + enddo + x2 = x1; y2 = y1 + call mpp_start_group_update(group_update, domain, x1(isc,jsc,1,1)) + call mpp_complete_group_update(group_update, domain, x1(isc,jsc,1,1)) + + do l = 1, num_fields + call mpp_update_domains( x2(:,:,:,l), y2(:,:,:,l), domain, gridtype=DGRID_NE, complete=l==num_fields ) + enddo + + !--- compare checksum + do l = 1, num_fields + write(text, '(i3.3)') l + call compare_checksums(x1(isd:ied+shift,jsd:jed, :,l),x2(isd:ied+shift,jsd:jed, :,l),type//' DGRID X'//text) + call compare_checksums(y1(isd:ied, jsd:jed+shift,:,l),y2(isd:ied, jsd:jed+shift,:,l),type//' DGRID Y'//text) + enddo + + call mpp_clear_group_update(group_update) + endif + !--- Test for CGRID + a1 = 0; x1 = 0; y1 = 0 + do l =1, num_fields + call mpp_create_group_update(group_update, a1(:,:,:,l), domain) + call mpp_create_group_update(group_update, x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=CGRID_NE) + end do + + do n = 1, num_iter + a1 = 0; x1 = 0; y1 = 0 + !$omp target update if(do_omp_offload) to(a1, x1, y1) + do l = 1, num_fields + !$omp target teams loop collapse(3) if(do_omp_offload) default(shared) & + !$omp shared(nz, jsc, jec, isc, iec, a1, base, l) map(to: base) map(tofrom: a1) + do k=1,nz + do j=jsc,jec + do i=isc,iec + a1(i,j,k,l) = base(i,j,k) + l*1e3 + enddo + enddo + enddo + !$omp target teams loop collapse(3) if(do_omp_offload) default(shared) & + !$omp shared(nz, jsc, jec, shift, isc, iec, x1, base, l) map(to: base) map(tofrom: x1) + do k=1,nz + do j=jsc,jec + do i=isc,iec+shift + x1(i,j,k,l) = base(i,j,k) + l*1e3 + 1e6 + enddo + enddo + enddo + !$omp target teams loop collapse(3) if(do_omp_offload) default(shared) & + !$omp shared(nz, jsc, jec, shift, isc, iec, y1, base, l) map(to: base) map(tofrom: y1) + do k=1,nz + do j=jsc,jec+shift + do i=isc,iec + y1(i,j,k,l) = base(i,j,k) + l*1e3 + 2*1e6 + enddo + enddo + enddo + enddo + !$omp target teams loop collapse(4) if(do_omp_offload) default(shared) & + !$omp shared(num_fields, nz, jsm, jem, ism, iem, a1, a2) map(from: a2) map(to: a1) + do l=1,num_fields ; do k=1,nz ; do j=jsm,jem ; do i=ism,iem + a2(i,j,k,l) = a1(i,j,k,l) + enddo ; enddo ; enddo ; enddo + !$omp target teams loop collapse(4) if(do_omp_offload) default(shared) & + !$omp shared(num_fields, nz, jsm, jem, ism, iem, shift, x1, x2) map(from: x2) map(to: x1) + do l=1,num_fields ; do k=1,nz ; do j=jsm,jem ; do i=ism,iem+shift + x2(i,j,k,l) = x1(i,j,k,l) + enddo ; enddo ; enddo ; enddo + !$omp target teams loop collapse(4) if(do_omp_offload) default(shared) & + !$omp shared(num_fields, nz, jsm, jem, shift, ism, iem, y1, y2) map(from: y2) map(to: y1) + do l=1,num_fields ; do k=1,nz ; do j=jsm,jem+shift ; do i=ism,iem + y2(i,j,k,l) = y1(i,j,k,l) + enddo ; enddo ; enddo ; enddo + + call mpp_clock_begin(id1) + call mpp_do_group_update(group_update, domain, a1(isc,jsc,1,1), omp_offload=do_omp_offload) + call mpp_clock_end (id1) + + call mpp_clock_begin(id2) + do l = 1, num_fields + call mpp_update_domains( a2(:,:,:,l), domain, complete=l==num_fields ) + enddo + do l = 1, num_fields + call mpp_update_domains( x2(:,:,:,l), y2(:,:,:,l), domain, gridtype=CGRID_NE, complete=l==num_fields ) + enddo + call mpp_clock_end(id2) + + !--- compare checksum + if( n == num_iter ) then + !$omp target update if(do_omp_offload) from(a1, x1, y1) + do l = 1, num_fields + write(text, '(i3.3)') l + call compare_checksums(a1(isd:ied, jsd:jed, :,l),a2(isd:ied, jsd:jed, :,l),type//' CENTER '//text) + call compare_checksums(x1(isd:ied+shift,jsd:jed, :,l),x2(isd:ied+shift,jsd:jed, :,l),type//' CGRID X'//text) + call compare_checksums(y1(isd:ied, jsd:jed+shift,:,l),y2(isd:ied, jsd:jed+shift,:,l),type//' CGRID Y'//text) + enddo + endif + a1 = 0; x1 = 0; y1 = 0 + do l = 1, num_fields + a1(isc:iec, jsc:jec, :,l) = base(isc:iec, jsc:jec, :) + l*1e3 + x1(isc:iec+shift,jsc:jec, :,l) = base(isc:iec+shift,jsc:jec, :) + l*1e3 + 1e6 + y1(isc:iec, jsc:jec+shift,:,l) = base(isc:iec, jsc:jec+shift,:) + l*1e3 + 2*1e6 + enddo + call mpp_clock_begin(id3) + call mpp_start_group_update(group_update, domain, a1(isc,jsc,1,1)) + call mpp_complete_group_update(group_update, domain, a1(isc,jsc,1,1)) + call mpp_clock_end (id3) + !--- compare checksum + if( n == num_iter ) then + do l = 1, num_fields + write(text, '(i3.3)') l + call compare_checksums(a1(isd:ied, jsd:jed, :,l),a2(isd:ied, jsd:jed, :,l), & + type//' nonblock CENTER '//text) + call compare_checksums(x1(isd:ied+shift,jsd:jed, :,l),x2(isd:ied+shift,jsd:jed, :,l), & + type//' nonblock CGRID X'//text) + call compare_checksums(y1(isd:ied, jsd:jed+shift,:,l),y2(isd:ied, jsd:jed+shift,:,l), & + type//' nonblock CGRID Y'//text) + enddo + endif + enddo + + call mpp_clear_group_update(group_update) + + !--- The following is to test overlapping start and complete + if( num_fields > 1 ) then + do l =1, num_fields + call mpp_create_group_update(update_list(l), a1(:,:,:,l), domain) + call mpp_create_group_update(update_list(l), x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=CGRID_NE) + end do + + do n = 1, num_iter + + a1 = 0; x1 = 0; y1 = 0 + do l = 1, num_fields + a1(isc:iec, jsc:jec, :,l) = base(isc:iec, jsc:jec, :) + l*1e3 + x1(isc:iec+shift,jsc:jec, :,l) = base(isc:iec+shift,jsc:jec, :) + l*1e3 + 1e6 + y1(isc:iec, jsc:jec+shift,:,l) = base(isc:iec, jsc:jec+shift,:) + l*1e3 + 2*1e6 + enddo + do l = 1, num_fields-1 + call mpp_start_group_update(update_list(l), domain, a1(isc,jsc,1,1)) + enddo + + call mpp_complete_group_update(update_list(1), domain, a1(isc,jsc,1,1)) + call mpp_start_group_update(update_list(num_fields), domain, a1(isc,jsc,1,1)) + do l = 2, num_fields + call mpp_complete_group_update(update_list(l), domain, a1(isc,jsc,1,1)) + enddo + !--- compare checksum + if( n == num_iter ) then + do l = 1, num_fields + write(text, '(i3.3)') l + call compare_checksums(a1(isd:ied, jsd:jed, :,l),a2(isd:ied, jsd:jed, :,l), & + type//' multiple nonblock CENTER '//text) + call compare_checksums(x1(isd:ied+shift,jsd:jed, :,l),x2(isd:ied+shift,jsd:jed, :,l), & + type//' multiple nonblock CGRID X'//text) + call compare_checksums(y1(isd:ied, jsd:jed+shift,:,l),y2(isd:ied, jsd:jed+shift,:,l), & + type//' multiple nonblock CGRID Y'//text) + enddo + endif + enddo + endif + + do l =1, num_fields + call mpp_clear_group_update(update_list(l)) + enddo + deallocate(update_list) + + !--- test scalar 4-D variable + call mpp_create_group_update(group_update, a1(:,:,:,:), domain) + + a1 = 0; x1 = 0; y1 = 0 + !$omp target update if(do_omp_offload) to(a1) + !$omp target teams loop collapse(4) if(do_omp_offload) default(shared) & + !$omp shared(num_fields, nz, jsc, jec, isc, iec, a1, base) map(to: base) map(from: a1) + do l = 1, num_fields + do k=1,nz + do j=jsc,jec + do i=isc,iec + a1(i,j,k,l) = base(i,j,k) + l*1e3 + enddo + enddo + enddo + enddo + !$omp target teams loop collapse(4) if(do_omp_offload) default(shared) & + !$omp shared(num_fields, nz, jsm, jem, ism, iem, a1, a2) map(from: a2) map(to: a1) + do l=1,num_fields ; do k=1,nz ; do j=jsm,jem ; do i=ism,iem + a2(i,j,k,l) = a1(i,j,k,l) + enddo ; enddo ; enddo ; enddo + x2 = x1; y2 = y1 + call mpp_clock_begin(id1) + call mpp_do_group_update(group_update, domain, a1(isc,jsc,1,1), omp_offload=do_omp_offload) + call mpp_clock_end (id1) + + call mpp_clock_begin(id2) + call mpp_update_domains( a2(:,:,:,:), domain ) + call mpp_clock_end(id2) + + !--- compare checksum + !$omp target update if(do_omp_offload) from(a1) + do l = 1, num_fields + write(text, '(i3.3)') l + call compare_checksums(a1(isd:ied, jsd:jed, :,l),a2(isd:ied, jsd:jed, :,l),type//' 4D CENTER '//text) + enddo + + a1 = 0 + do l = 1, num_fields + a1(isc:iec, jsc:jec, :,l) = base(isc:iec, jsc:jec, :) + l*1e3 + enddo + call mpp_clock_begin(id3) + call mpp_start_group_update(group_update, domain, a1(isc,jsc,1,1)) + call mpp_complete_group_update(group_update, domain, a1(isc,jsc,1,1)) + call mpp_clock_end (id3) + + !--- compare checksum + do l = 1, num_fields + write(text, '(i3.3)') l + call compare_checksums(a1(isd:ied, jsd:jed, :,l),a2(isd:ied, jsd:jed, :,l), & + type//' nonblock 4D CENTER '//text) + enddo + + !$omp target exit data if(do_omp_offload) map(release: a1, x1, y1) + + !--- test for BGRID. + deallocate(a1, x1, y1) + deallocate(a2, x2, y2) + call mpp_clear_group_update(group_update) + + allocate( a1(ism:iem+shift,jsm:jem+shift, nz, num_fields) ) + allocate( x1(ism:iem+shift,jsm:jem+shift, nz, num_fields) ) + allocate( y1(ism:iem+shift,jsm:jem+shift, nz, num_fields) ) + allocate( a2(ism:iem+shift,jsm:jem+shift, nz, num_fields) ) + allocate( x2(ism:iem+shift,jsm:jem+shift, nz, num_fields) ) + allocate( y2(ism:iem+shift,jsm:jem+shift, nz, num_fields) ) + + !$omp target enter data map(alloc: a1, x1, y1) if(do_omp_offload) + + do l =1, num_fields + call mpp_create_group_update(group_update, a1(:,:,:,l), domain, position=CORNER) + call mpp_create_group_update(group_update, x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=BGRID_NE) + end do + + do n = 1, num_iter + a1 = 0; x1 = 0; y1 = 0 + !$omp target update if(do_omp_offload) to(a1, x1, y1) + do l = 1, num_fields + !$omp target teams loop collapse(3) if(do_omp_offload) default(shared) & + !$omp shared(nz, base, a1, x1, y1, l, jsc, jec, shift, isc, iec) & + !$omp map(to: base) map(tofrom: a1, x1, y1) + do k=1,nz + do j=jsc,jec+shift + do i=isc,iec+shift + a1(i,j,k,l) = base(i,j,k) + l*1e3 + x1(i,j,k,l) = base(i,j,k) + l*1e3 + 1e6 + y1(i,j,k,l) = base(i,j,k) + l*1e3 + 2*1e6 + enddo + enddo + enddo + enddo + !$omp target teams loop collapse(4) if(do_omp_offload) default(shared) & + !$omp shared(num_fields, nz, jsm, jem, shift, ism, iem, a1, x1, y1, a2, x2, y2) & + !$omp map(from: a2, x2, y2) map(to: a1, x1, y1) + do l=1,num_fields ; do k=1,nz ; do j=jsm,jem+shift ; do i=ism,iem+shift + a2(i,j,k,l) = a1(i,j,k,l) + x2(i,j,k,l) = x1(i,j,k,l) + y2(i,j,k,l) = y1(i,j,k,l) + enddo ; enddo ; enddo ; enddo + call mpp_clock_begin(id1) + call mpp_do_group_update(group_update, domain, a1(isc,jsc,1,1), omp_offload=do_omp_offload) + call mpp_clock_end (id1) + + call mpp_clock_begin(id2) + do l = 1, num_fields + call mpp_update_domains( a2(:,:,:,l), domain, position=CORNER, complete=l==num_fields ) + enddo + do l = 1, num_fields + call mpp_update_domains( x2(:,:,:,l), y2(:,:,:,l), domain, gridtype=BGRID_NE, complete=l==num_fields ) + enddo + call mpp_clock_end(id2) + + !--- compare checksum + if( n == num_iter ) then + !$omp target update if(do_omp_offload) from(a1,x1,y1) + do l = 1, num_fields + write(text, '(i3.3)') l + call compare_checksums(a1(isd:ied+shift,jsd:jed+shift,:,l),a2(isd:ied+shift,jsd:jed+shift,:,l),type//& + & ' CORNER '// text) + call compare_checksums(x1(isd:ied+shift,jsd:jed+shift,:,l),x2(isd:ied+shift,jsd:jed+shift,:,l),type//& + & ' BGRID X'// text) + call compare_checksums(y1(isd:ied+shift,jsd:jed+shift,:,l),y2(isd:ied+shift,jsd:jed+shift,:,l),type//& + & ' BGRID Y'// text) + enddo + endif + + a1 = 0; x1 = 0; y1 = 0 + do l = 1, num_fields + a1(isc:iec+shift,jsc:jec+shift,:,l) = base(isc:iec+shift,jsc:jec+shift,:) + l*1e3 + x1(isc:iec+shift,jsc:jec+shift,:,l) = base(isc:iec+shift,jsc:jec+shift,:) + l*1e3 + 1e6 + y1(isc:iec+shift,jsc:jec+shift,:,l) = base(isc:iec+shift,jsc:jec+shift,:) + l*1e3 + 2*1e6 + enddo + call mpp_clock_begin(id3) + call mpp_start_group_update(group_update, domain, a1(isc,jsc,1,1)) + call mpp_complete_group_update(group_update, domain, a1(isc,jsc,1,1)) + call mpp_clock_end (id3) + !--- compare checksum + if( n == num_iter ) then + do l = 1, num_fields + write(text, '(i3.3)') l + call compare_checksums(a1(isd:ied+shift,jsd:jed+shift,:,l),a2(isd:ied+shift,jsd:jed+shift,:,l), & + type//' nonblockCORNER '//text) + call compare_checksums(x1(isd:ied+shift,jsd:jed+shift,:,l),x2(isd:ied+shift,jsd:jed+shift,:,l), & + type//' nonblock BGRID X'//text) + call compare_checksums(y1(isd:ied+shift,jsd:jed+shift,:,l),y2(isd:ied+shift,jsd:jed+shift,:,l), & + type//' nonblock BGRID Y'//text) + enddo + endif + enddo + !$omp target exit data if(do_omp_offload) map(release: a1,x1,y1) + call mpp_clear_group_update(group_update) + + !----------------------------------------------------------------------------- + ! test for AGRID vector and scalar pair + !----------------------------------------------------------------------------- + deallocate(x1, y1) + deallocate(x2, y2) + + allocate( x1(ism:iem,jsm:jem, nz, num_fields) ) + allocate( y1(ism:iem,jsm:jem, nz, num_fields) ) + allocate( x2(ism:iem,jsm:jem, nz, num_fields) ) + allocate( y2(ism:iem,jsm:jem, nz, num_fields) ) + + x1 = 0; y1 = 0 + do l = 1, num_fields + x1(isc:iec,jsc:jec,:,l) = base(isc:iec,jsc:jec,:) + l*1e3 + 1e6 + y1(isc:iec,jsc:jec,:,l) = base(isc:iec,jsc:jec,:) + l*1e3 + 2*1e6 + enddo + x2 = x1; y2 = y1 + + do l =1, num_fields + call mpp_create_group_update(group_update, x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=AGRID) + end do + + do l = 1, num_fields + call mpp_update_domains( x2(:,:,:,l), y2(:,:,:,l), domain, gridtype=AGRID, complete=l==num_fields ) + enddo + + call mpp_start_group_update(group_update, domain, a1(isc,jsc,1,1)) + call mpp_complete_group_update(group_update, domain, a1(isc,jsc,1,1)) + + !--- compare checksum + do l = 1, num_fields + write(text, '(i3.3)') l + call compare_checksums(x1(isd:ied,jsd:jed,:,l),x2(isd:ied,jsd:jed,:,l),type//' AGRID X'//text) + call compare_checksums(y1(isd:ied,jsd:jed,:,l),y2(isd:ied,jsd:jed,:,l),type//' AGRID Y'//text) + enddo + + call mpp_clear_group_update(group_update) + + x1 = 0; y1 = 0 + do l = 1, num_fields + x1(isc:iec,jsc:jec,:,l) = base(isc:iec,jsc:jec,:) + l*1e3 + 1e6 + y1(isc:iec,jsc:jec,:,l) = base(isc:iec,jsc:jec,:) + l*1e3 + 2*1e6 + enddo + x2 = x1; y2 = y1 + + do l =1, num_fields + call mpp_create_group_update(group_update, x1(:,:,:,l), y1(:,:,:,l), domain, gridtype=AGRID, flags=SCALAR_PAIR) + end do + + do l = 1, num_fields + call mpp_update_domains( x2(:,:,:,l), y2(:,:,:,l), domain, gridtype=AGRID, flags=SCALAR_PAIR, & + & complete=l==num_fields) + enddo + + call mpp_start_group_update(group_update, domain, x1(isc,jsc,1,1)) + call mpp_complete_group_update(group_update, domain, x1(isc,jsc,1,1)) + + !--- compare checksum + do l = 1, num_fields + write(text, '(i3.3)') l + call compare_checksums(x1(isd:ied,jsd:jed,:,l),x2(isd:ied,jsd:jed,:,l),type//' AGRID SCALAR_PAIR X'//text) + call compare_checksums(y1(isd:ied,jsd:jed,:,l),y2(isd:ied,jsd:jed,:,l),type//' AGRID SCALAR_PAIR Y'//text) + enddo + + call mpp_clear_group_update(group_update) + + deallocate(pe_start, pe_end, tile1, tile2) + deallocate(istart1, iend1, jstart1, jend1) + deallocate(istart2, iend2, jstart2, jend2) + deallocate(layout2D, global_indices) + + deallocate(a1, x1, y1) + deallocate(a2, x2, y2) + deallocate(base) + call mpp_deallocate_domain(domain) + +end subroutine test_group_update + !############################################################### !--- This will test scalar and CGRID performance between halo=1 and halo=3 diff --git a/test_fms/mpp/test_mpp_domains2.sh b/test_fms/mpp/test_mpp_domains2.sh index a9b26f2cd3..f23762be88 100755 --- a/test_fms/mpp/test_mpp_domains2.sh +++ b/test_fms/mpp/test_mpp_domains2.sh @@ -28,7 +28,7 @@ . ../test-lib.sh # TODO: Enable this test once generalized indices work is complete -SKIP_TESTS="test_mpp_domains2.12" +SKIP_TESTS="test_mpp_domains2.15" # TODO edge update, fails on non-blocking with gnu #SKIP_TESTS="$SKIP_TESTS $(basename $0 .sh).6" @@ -79,6 +79,7 @@ test_nonsym_edge = .false. test_halosize_performance = .false. test_adjoint = .false. wide_halo = .false. +test_group_offload = .false. / _EOF @@ -128,10 +129,6 @@ sed "s/test_unstruct = .false./test_unstruct = .true./" input_base.nml > input.n test_expect_success "unstruct" ' mpirun -n 2 ../test_mpp_domains ' -sed "s/test_group = .false./test_group = .true./" input_base.nml > input.nml -test_expect_success "group" ' - mpirun -n 2 ../test_mpp_domains -' sed "s/test_interface = .false./test_interface = .true./" input_base.nml > input.nml test_expect_success "interface" ' mpirun -n 6 ../test_mpp_domains @@ -144,5 +141,14 @@ sed "s/test_get_nbr = .false./test_get_nbr = .true./" input_base.nml > input.nml test_expect_success "get nbr" ' mpirun -n 8 ../test_mpp_domains ' +sed "s/test_group = .false./test_group = .true./" input_base.nml > input.nml +test_expect_success "group" ' + mpirun -n 2 ../test_mpp_domains +' +# do the group update again, but with openmp offload flag +sed "s/test_group_offload = .false./test_group_offload = .true./" input_base.nml > input.nml +test_expect_success "group update with OpenMP offload" ' + mpirun -n 2 ../test_mpp_domains +' test_done