pic_mpi.f90 Source File


Files dependent on this one

pic_mpi_lib.F90 (direct); darrays.f90, darrays_core.f90, darrays_dlb.f90, darrays_types.f90 (via pic_mpi_lib.F90)

Source Code

module pic_mpi_f08
   !! Modern MPI wrapper module using mpi_f08 interface
   !!
   !! This module provides a high-level object-oriented interface to MPI
   !! using the modern mpi_f08 bindings. It wraps MPI communicators and
   !! requests into derived types with type-bound procedures.
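   !!
   !! A minimal usage sketch (illustrative only; assumes at least two ranks,
   !! and the peer ranks, tags, and payload value are placeholders):
   !!
   !!     use pic_mpi_f08
   !!     type(comm_t) :: world
   !!     integer :: value
   !!     call pic_mpi_init()
   !!     world = comm_world()
   !!     if (world%leader()) then
   !!        call send(world, 42, dest=1, tag=0)
   !!     else if (world%rank() == 1) then
   !!        call recv(world, value, source=0, tag=0)
   !!     end if
   !!     call world%finalize()
   !!     call pic_mpi_finalize()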
   use pic_types, only: int32, int64, sp, dp
   use mpi_f08, only: MPI_Comm, MPI_Status, MPI_Request, MPI_COMM_NULL, MPI_COMM_WORLD, &
                      MPI_COMM_TYPE_SHARED, MPI_INFO_NULL, MPI_UNDEFINED, &
                      MPI_INTEGER, MPI_INTEGER8, MPI_STATUS_IGNORE, MPI_REQUEST_NULL, MPI_Comm_rank, &
                      MPI_Comm_size, MPI_Comm_dup, MPI_Barrier, &
                      MPI_Comm_split_type, MPI_Comm_split, MPI_Send, MPI_Recv, &
                      MPI_Isend, MPI_Irecv, MPI_Wait, MPI_Waitall, MPI_Test, &
                      MPI_Probe, MPI_Get_count, MPI_Iprobe, MPI_Comm_free, &
                      MPI_Abort, MPI_Allgather, MPI_Get_processor_name, &
                      MPI_Bcast, MPI_Init, MPI_Init_thread, MPI_Query_thread, MPI_Finalize, &
                      MPI_THREAD_SINGLE, MPI_THREAD_FUNNELED, MPI_THREAD_SERIALIZED, MPI_THREAD_MULTIPLE, &
                      MPI_ANY_SOURCE, MPI_ANY_TAG, &
                      MPI_MAX_PROCESSOR_NAME, MPI_LOGICAL, &
                      operator(==), operator(/=), MPI_DOUBLE_PRECISION, MPI_REAL, &
                      MPI_Win, MPI_Op, MPI_WIN_NULL, MPI_Win_create, MPI_Win_create_dynamic, &
                      MPI_Win_free, MPI_Win_fence, MPI_Win_lock, MPI_Win_unlock, &
                      MPI_Win_lock_all, MPI_Win_unlock_all, MPI_Win_flush, MPI_Win_flush_all, &
                      MPI_Rget, MPI_Rput, MPI_Win_allocate, &
                      MPI_Get, MPI_Put, MPI_Accumulate, MPI_Fetch_and_op, &
                      MPI_Allreduce, MPI_ADDRESS_KIND, MPI_LOCK_SHARED, MPI_SUM, MPI_MIN, MPI_MAX, MPI_IN_PLACE
   implicit none
   private

   public :: comm_t, comm_world, comm_null
   public :: send, recv, isend, irecv
   public :: comm_isend_real_dp_array_n, comm_irecv_real_dp_array_n  ! Direct export for host_data blocks (nvhpc bug workaround)
   public :: comm_isend_real_sp_array_n, comm_irecv_real_sp_array_n  ! Single precision equivalents
   public :: comm_recv_real_dp_array_n, comm_recv_real_sp_array_n    ! Fixed-size blocking recv variants
   public :: comm_recv_integer_array_n, comm_recv_integer64_array_n
   ! 2D fixed-count variants — send/recv contiguous (m, n) arrays without
   ! the dim-prefix protocol used by the 2D allocatable overloads.
   public :: comm_send_real_dp_array_2d_n, comm_recv_real_dp_array_2d_n
   public :: comm_isend_real_dp_array_2d_n
   public :: comm_send_integer_array_2d_n, comm_recv_integer_array_2d_n
   public :: request_t, wait, waitall, test
   public :: iprobe, probe, abort_comm, allgather, get_processor_name, bcast
   public :: pic_mpi_init, pic_mpi_finalize, pic_mpi_query_thread_level
   public :: win_t, win_create, win_create_dynamic, win_allocate
   public :: allreduce

   ! Export MPI types and constants needed by applications
   public :: MPI_Status, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_MAX_PROCESSOR_NAME, MPI_Request
   public :: MPI_THREAD_SINGLE, MPI_THREAD_FUNNELED, MPI_THREAD_SERIALIZED, MPI_THREAD_MULTIPLE
   public :: MPI_SUM, MPI_MIN, MPI_MAX

   type :: request_t
   !! Wraps MPI_Request to provide object-oriented interface for
   !! non-blocking communication operations (isend, irecv)
      private
      type(MPI_Request) :: m_request = MPI_REQUEST_NULL !! Internal MPI request handle
      logical :: is_valid = .false. !! Validity flag
   contains
      procedure :: is_null => request_is_null !! Check if request is null
      procedure :: get => request_get !! Get underlying MPI_Request
      procedure :: free => request_free !! Free the request
   end type request_t
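
   ! Illustrative non-blocking exchange pattern built on request_t (sketch
   ! only: comm, dest, tag and the buffer extent are placeholders, and the
   ! single-request form of wait() is assumed):
   !
   !    type(request_t) :: req
   !    real(dp) :: buf(100)
   !    call isend(comm, buf, dest, tag, req)
   !    ! ... overlap independent computation here ...
   !    call wait(req)   ! buf may be reused once wait returns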

   type :: win_t
      !! MPI-3 Window type for one-sided communication (RMA)
      !!
      !! Wraps MPI_Win to provide object-oriented interface for
      !! Remote Memory Access (RMA) operations needed for DDI
      private
      type(MPI_Win) :: m_win = MPI_WIN_NULL
      logical :: is_valid = .false.
   contains
      procedure :: is_null => win_is_null
      procedure :: get_handle => win_get_handle
      procedure :: fence => win_fence
      procedure :: lock => win_lock
      procedure :: unlock => win_unlock
      procedure :: lock_all => win_lock_all
      procedure :: unlock_all => win_unlock_all
      procedure :: flush => win_flush
      procedure :: flush_all => win_flush_all
      ! Double precision (dp) RMA operations
      procedure :: get_dp => win_get_dp
      procedure :: put_dp => win_put_dp
      procedure :: rget_dp => win_rget_dp
      procedure :: rput_dp => win_rput_dp
      procedure :: accumulate_dp => win_accumulate_dp
      ! Single precision (sp) RMA operations
      procedure :: get_sp => win_get_sp
      procedure :: put_sp => win_put_sp
      procedure :: rget_sp => win_rget_sp
      procedure :: rput_sp => win_rput_sp
      procedure :: accumulate_sp => win_accumulate_sp
      ! Integer32 RMA operations
      procedure :: get_i32 => win_get_i32
      procedure :: put_i32 => win_put_i32
      procedure :: rget_i32 => win_rget_i32
      procedure :: rput_i32 => win_rput_i32
      procedure :: accumulate_i32 => win_accumulate_i32
      ! Integer64 RMA operations
      procedure :: get_i64 => win_get_i64
      procedure :: put_i64 => win_put_i64
      procedure :: rget_i64 => win_rget_i64
      procedure :: rput_i64 => win_rput_i64
      procedure :: accumulate_i64 => win_accumulate_i64
      procedure :: fetch_and_add_i64 => win_fetch_and_add_i64
      procedure :: finalize => win_finalize
   end type win_t

   type :: comm_t
      !! MPI communicator wrapper type
      !!
      !! Provides object-oriented interface to MPI communicators with
      !! type-bound procedures for common operations. Automatically caches
      !! rank and size information for efficient access.
      private
      type(MPI_Comm) :: m_comm = MPI_COMM_NULL !! Internal MPI communicator
      integer(int32) :: m_rank = -1 !! Cached rank in this communicator
      integer(int32) :: m_size = -1 !! Cached size of this communicator
      logical :: is_valid = .false. !! Validity flag
   contains
      procedure :: rank => comm_rank !! Get rank in communicator
      procedure :: size => m_size_func !! Get size of communicator
      procedure :: leader => comm_leader !! Check if this rank is leader (rank 0)
      procedure :: is_null => comm_is_null !! Check if communicator is null
      procedure :: get => comm_get !! Get underlying MPI_Comm

      procedure :: barrier => comm_barrier !! Synchronization barrier

      procedure :: split => comm_split_shared !! Split into shared memory communicators
      procedure :: split_by => comm_split_by_color !! Split communicator by color
      procedure :: discard_leader => comm_discard_leader !! Create communicator without leader
      procedure :: discard_to => comm_discard_to !! Create communicator with first N ranks
      procedure :: duplicate => comm_duplicate !! Duplicate communicator

      procedure :: finalize => comm_finalize !! Free communicator resources
   end type comm_t
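
   ! Sketch of communicator management with comm_t (illustrative; the color
   ! expression is just an example):
   !
   !    type(comm_t) :: world, node, pair
   !    world = comm_world()
   !    node  = world%split()                         ! ranks sharing a node
   !    pair  = world%split_by(mod(world%rank(), 2))  ! split by rank parity
   !    call pair%finalize(); call node%finalize(); call world%finalize()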

   interface comm_world
      module procedure create_world_comm
   end interface

   interface comm_null
      module procedure create_null_comm
   end interface

   interface send
      module procedure :: comm_send_integer
      module procedure :: comm_send_integer_array
      module procedure :: comm_send_integer64
      module procedure :: comm_send_integer64_array
      module procedure :: comm_send_real_dp
      module procedure :: comm_send_real_dp_array
      module procedure :: comm_send_real_dp_array_2d
      module procedure :: comm_send_real_dp_array_2d_n
      module procedure :: comm_send_integer_array_2d_n
      module procedure :: comm_send_real_sp
      module procedure :: comm_send_real_sp_array
      module procedure :: comm_send_real_sp_array_2d
      module procedure :: comm_send_logical
   end interface send

   interface recv
      module procedure :: comm_recv_integer
      module procedure :: comm_recv_integer_array
      module procedure :: comm_recv_integer64
      module procedure :: comm_recv_integer64_array
      module procedure :: comm_recv_real_dp
      module procedure :: comm_recv_real_dp_array
      module procedure :: comm_recv_real_dp_array_2d
      module procedure :: comm_recv_real_sp
      module procedure :: comm_recv_real_sp_array
      module procedure :: comm_recv_real_sp_array_2d
      module procedure :: comm_recv_logical
      ! Fixed-size receive variants (count is positional, no probe-then-allocate).
      ! Use these when receiving into a pre-allocated buffer.
      module procedure :: comm_recv_integer_array_n
      module procedure :: comm_recv_integer64_array_n
      module procedure :: comm_recv_real_dp_array_n
      module procedure :: comm_recv_real_sp_array_n
      module procedure :: comm_recv_real_dp_array_2d_n
      module procedure :: comm_recv_integer_array_2d_n
   end interface recv

   interface iprobe
      module procedure :: comm_iprobe
   end interface iprobe

   interface probe
      module procedure :: comm_probe
   end interface probe

   interface allgather
      module procedure :: comm_allgather_integer
   end interface allgather

   interface bcast
      module procedure :: comm_bcast_integer
      module procedure :: comm_bcast_integer64
      module procedure :: comm_bcast_real_dp
      module procedure :: comm_bcast_real_dp_array
      module procedure :: comm_bcast_real_sp
      module procedure :: comm_bcast_real_sp_array
   end interface bcast

   interface isend
      module procedure :: comm_isend_integer
      module procedure :: comm_isend_integer_array
      module procedure :: comm_isend_integer64
      module procedure :: comm_isend_integer64_array
      module procedure :: comm_isend_real_dp
      module procedure :: comm_isend_real_dp_array
      module procedure :: comm_isend_real_dp_array_2d
      module procedure :: comm_isend_real_dp_array_2d_n
      module procedure :: comm_isend_real_sp
      module procedure :: comm_isend_real_sp_array
      module procedure :: comm_isend_real_sp_array_2d
      module procedure :: comm_isend_logical
   end interface isend

   interface irecv
      module procedure :: comm_irecv_integer
      module procedure :: comm_irecv_integer_array
      module procedure :: comm_irecv_integer64
      module procedure :: comm_irecv_integer64_array
      module procedure :: comm_irecv_real_dp
      module procedure :: comm_irecv_real_dp_array
      module procedure :: comm_irecv_real_dp_array_2d
      module procedure :: comm_irecv_real_sp
      module procedure :: comm_irecv_real_sp_array
      module procedure :: comm_irecv_real_sp_array_2d
      module procedure :: comm_irecv_logical
   end interface irecv

   interface wait
      module procedure :: request_wait
   end interface wait

   interface waitall
      module procedure :: request_waitall
   end interface waitall

   interface test
      module procedure :: request_test
   end interface test

   interface win_create
      module procedure create_win_dp_array
   end interface win_create

   interface win_create_dynamic
      module procedure create_win_dynamic
   end interface win_create_dynamic

   interface win_allocate
      module procedure create_win_allocate_dp_1d
      module procedure create_win_allocate_dp_2d
      module procedure create_win_allocate_sp_1d
      module procedure create_win_allocate_i32_1d
      module procedure create_win_allocate_i64_1d
   end interface win_allocate

   interface allreduce
      module procedure :: allreduce_dp
      module procedure :: allreduce_dp_array
      module procedure :: allreduce_sp
      module procedure :: allreduce_sp_array
      module procedure :: allreduce_i32
      module procedure :: allreduce_i32_array
      module procedure :: allreduce_dp_to
      module procedure :: allreduce_dp_array_to
      module procedure :: allreduce_sp_to
      module procedure :: allreduce_sp_array_to
   end interface allreduce

contains

   function create_comm_from_mpi(mpi_comm_in) result(comm)
   !! Internal helper function that wraps an MPI_Comm into a comm_t object
   !! and caches rank and size information
      type(MPI_Comm), intent(in) :: mpi_comm_in
      type(comm_t) :: comm
      integer(int32) :: ierr

      comm%m_comm = mpi_comm_in
      if (mpi_comm_in /= MPI_COMM_NULL) then
         call MPI_Comm_rank(comm%m_comm, comm%m_rank, ierr)
         call MPI_Comm_size(comm%m_comm, comm%m_size, ierr)
         comm%is_valid = .true.
      else
         comm%is_valid = .false.
      end if

   end function create_comm_from_mpi

   function create_world_comm() result(comm)
   !! Creates a new communicator that duplicates MPI_COMM_WORLD.
   !! This is the standard way to obtain a communicator for application use.
      type(comm_t) :: comm
      type(MPI_Comm) :: dup_comm
      integer(int32) :: ierr

      call MPI_Comm_dup(MPI_COMM_WORLD, dup_comm, ierr)
      comm = create_comm_from_mpi(dup_comm)

   end function create_world_comm

   function create_null_comm() result(comm)
   !! Creates an invalid/null communicator object that can be used
   !! for initialization or to represent absence of a communicator.
      type(comm_t) :: comm

      ! Explicitly initialize to null/invalid state
      comm%m_comm = MPI_COMM_NULL
      comm%m_rank = -1
      comm%m_size = -1
      comm%is_valid = .false.
   end function create_null_comm

   pure function comm_rank(this) result(rank)
   !!
   !! Returns the 0-indexed rank of the calling process
      class(comm_t), intent(in) :: this
      integer :: rank
      rank = this%m_rank
   end function comm_rank

   pure function m_size_func(this) result(size)
   !! Returns the number of processes in the communicator
      class(comm_t), intent(in) :: this
      integer :: size
      size = this%m_size
   end function m_size_func

   pure function comm_leader(this) result(is_leader)
   !! Returns true if the calling process has rank 0
      class(comm_t), intent(in) :: this
      logical :: is_leader
      is_leader = (this%m_rank == 0)
   end function comm_leader

   pure function comm_is_null(this) result(is_null)
      class(comm_t), intent(in) :: this
      logical :: is_null
      is_null = .not. this%is_valid
   end function comm_is_null

   function comm_get(this) result(mpi_comm_out)
      class(comm_t), intent(in) :: this
      type(MPI_Comm) :: mpi_comm_out

      if (.not. this%is_valid) then
         error stop "Cannot get MPI_Comm from null Comm"
      end if
      mpi_comm_out = this%m_comm
   end function comm_get

   subroutine comm_barrier(this)
   !! Blocks until all processes in the communicator have called barrier
      class(comm_t), intent(in) :: this
      integer(int32) :: ierr
      call MPI_Barrier(this%m_comm, ierr)
   end subroutine comm_barrier

   function comm_split_shared(this) result(new_comm)
   !! Creates a new communicator containing only processes that share
   !! memory with each other (typically processes on the same node)
      class(comm_t), intent(in) :: this
      type(comm_t) :: new_comm
      type(MPI_Comm) :: mpi_comm_new
      integer(int32) :: ierr

      if (.not. this%is_valid) then
         new_comm = comm_null()
         return
      end if

      call MPI_Comm_split_type(this%get(), MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, mpi_comm_new, ierr)
      new_comm = create_comm_from_mpi(mpi_comm_new)
   end function comm_split_shared

   function comm_split_by_color(this, color) result(new_comm)
   !! Partitions the communicator into disjoint subgroups based on color.
   !! Processes with the same color end up in the same new communicator.
      class(comm_t), intent(in) :: this
      integer, intent(in) :: color
      type(comm_t) :: new_comm
      type(MPI_Comm) :: mpi_comm_new
      integer(int32) :: ierr

      if (.not. this%is_valid) then
         new_comm = comm_null()
         return
      end if

      call MPI_Comm_split(this%m_comm, color, 0, mpi_comm_new, ierr)
      new_comm = create_comm_from_mpi(mpi_comm_new)
   end function comm_split_by_color

   function comm_discard_leader(this) result(new_comm)
      class(comm_t), intent(in) :: this
      type(comm_t) :: new_comm
      integer :: color

      if (.not. this%is_valid) then
         new_comm = comm_null()
         return
      end if

      if (this%rank() == 0) then
         color = MPI_UNDEFINED
      else
         color = 0
      end if
      new_comm = this%split_by(color)
   end function comm_discard_leader

   function comm_discard_to(this, num_ranks) result(new_comm)
      class(comm_t), intent(in) :: this
      integer, intent(in) :: num_ranks
      type(comm_t) :: new_comm
      integer :: color

      if (.not. this%is_valid) then
         new_comm = comm_null()
         return
      end if

      if (this%rank() < num_ranks) then
         color = 0
      else
         color = MPI_UNDEFINED
      end if
      new_comm = this%split_by(color)
   end function comm_discard_to

   function comm_duplicate(this) result(new_comm)
      class(comm_t), intent(in) :: this
      type(comm_t) :: new_comm
      type(MPI_Comm) :: mpi_comm_new
      integer(int32) :: ierr

      if (.not. this%is_valid) then
         new_comm = comm_null()
         return
      end if

      call MPI_Comm_dup(this%m_comm, mpi_comm_new, ierr)
      new_comm = create_comm_from_mpi(mpi_comm_new)
   end function comm_duplicate

   subroutine comm_send_integer(comm, data, dest, tag)
   !! Blocking send of an integer to specified destination
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, 1, MPI_INTEGER, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_integer

   subroutine comm_send_integer_array(comm, data, dest, tag)
   !! Blocking send of an integer array to specified destination
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: data(:)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, size(data), MPI_INTEGER, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_integer_array

   subroutine comm_send_integer64(comm, data, dest, tag)
   !! Blocking send of an integer64 to specified destination
      type(comm_t), intent(in) :: comm
      integer(int64), intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, 1, MPI_INTEGER8, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_integer64

   subroutine comm_send_integer64_array(comm, data, dest, tag)
   !! Blocking send of an integer64 array to specified destination
      type(comm_t), intent(in) :: comm
      integer(int64), intent(in) :: data(:)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, size(data), MPI_INTEGER8, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_integer64_array

   subroutine comm_send_real_dp(comm, data, dest, tag)
      !! Blocking send of a single double precision real to specified destination
      type(comm_t), intent(in) :: comm
      real(dp), intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, 1, MPI_DOUBLE_PRECISION, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_real_dp

   subroutine comm_send_real_dp_array(comm, data, dest, tag)
      !! Blocking send of a double precision real array to specified destination
      type(comm_t), intent(in) :: comm
      real(dp), intent(in) :: data(:)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, size(data), MPI_DOUBLE_PRECISION, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_real_dp_array

   subroutine comm_send_real_dp_array_2d(comm, data, dest, tag)
   !! Blocking send of a 2D double precision real array to specified destination
      type(comm_t), intent(in) :: comm
      real(dp), intent(in) :: data(:, :)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr, dim1, dim2

      ! Send dimensions first
      dim1 = size(data, 1)
      dim2 = size(data, 2)
      call MPI_Send(dim1, 1, MPI_INTEGER, dest, tag, comm%m_comm, ierr)
      call MPI_Send(dim2, 1, MPI_INTEGER, dest, tag, comm%m_comm, ierr)

      ! Send data
      call MPI_Send(data, size(data), MPI_DOUBLE_PRECISION, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_real_dp_array_2d

   subroutine comm_send_real_dp_array_2d_n(comm, data, count, dest, tag)
      !! Blocking send of a contiguous 2D double-precision array using
      !! an explicit count.  Unlike `comm_send_real_dp_array_2d`, this
      !! variant does NOT prefix the message with the dimensions —
      !! caller and receiver agree on the shape via the protocol.  The
      !! receiver uses `comm_recv_real_dp_array_2d_n` (or any of the
      !! `_array_n` overloads) with the same `count`.
      type(comm_t), intent(in) :: comm
      real(dp), intent(in) :: data(:, :)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, count, MPI_DOUBLE_PRECISION, dest, tag, &
                    comm%m_comm, ierr)
   end subroutine comm_send_real_dp_array_2d_n
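
   ! Illustrative pairing of the explicit-count 2D variants (m, n, dest,
   ! source and tag are placeholders; both ranks must already agree on the
   ! shape, since no dimension messages are exchanged):
   !
   !    real(dp) :: block(m, n)
   !    ! sender:   call send(comm, block, m*n, dest, tag)
   !    ! receiver: call recv(comm, block, m*n, source, tag)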

   subroutine comm_send_integer_array_2d_n(comm, data, count, dest, tag)
      !! Blocking send of a contiguous 2D int32 array using an explicit
      !! count, no dim-prefix.  Pairs with `comm_recv_integer_array_2d_n`.
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: data(:, :)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, count, MPI_INTEGER, dest, tag, &
                    comm%m_comm, ierr)
   end subroutine comm_send_integer_array_2d_n

   subroutine comm_send_real_sp(comm, data, dest, tag)
      !! Blocking send of a single single-precision real to specified destination
      type(comm_t), intent(in) :: comm
      real(sp), intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, 1, MPI_REAL, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_real_sp

   subroutine comm_send_real_sp_array(comm, data, dest, tag)
      !! Blocking send of a single-precision real array to specified destination
      type(comm_t), intent(in) :: comm
      real(sp), intent(in) :: data(:)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, size(data), MPI_REAL, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_real_sp_array

   subroutine comm_send_real_sp_array_2d(comm, data, dest, tag)
   !! Blocking send of a 2D single-precision real array to specified destination
      type(comm_t), intent(in) :: comm
      real(sp), intent(in) :: data(:, :)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr, dim1, dim2

      ! Send dimensions first
      dim1 = size(data, 1)
      dim2 = size(data, 2)
      call MPI_Send(dim1, 1, MPI_INTEGER, dest, tag, comm%m_comm, ierr)
      call MPI_Send(dim2, 1, MPI_INTEGER, dest, tag, comm%m_comm, ierr)

      ! Send data
      call MPI_Send(data, size(data), MPI_REAL, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_real_sp_array_2d

   subroutine comm_send_logical(comm, data, dest, tag)
   !! Blocking send of a logical value to specified destination
      type(comm_t), intent(in) :: comm
      logical, intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, 1, MPI_LOGICAL, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_logical

   subroutine comm_recv_integer(comm, data, source, tag, status)
   !! Blocking receive of an integer from specified source.
   !! Use MPI_ANY_SOURCE or MPI_ANY_TAG for wildcards.
      type(comm_t), intent(in) :: comm
      integer(int32), intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr
      type(MPI_Status), intent(out), optional :: status

      if (present(status)) then
         call MPI_Recv(data, 1, MPI_INTEGER, source, tag, comm%m_comm, status, ierr)
      else
         call MPI_Recv(data, 1, MPI_INTEGER, source, tag, comm%m_comm, MPI_STATUS_IGNORE, ierr)
      end if
   end subroutine comm_recv_integer

   subroutine comm_recv_integer_array(comm, data, source, tag, status)
   !! Blocking receive of an integer array from specified source.
      type(comm_t), intent(in) :: comm
      integer(int32), allocatable, intent(out) :: data(:)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out) :: status
      integer(int32) :: count
      integer(int32) :: ierr

      ! First probe to get message size
      call MPI_Probe(source, tag, comm%m_comm, status, ierr)
      call MPI_Get_count(status, MPI_INTEGER, count, ierr)

      ! Allocate and receive
      allocate (data(count))
      call MPI_Recv(data, count, MPI_INTEGER, source, tag, comm%m_comm, MPI_STATUS_IGNORE, ierr)
   end subroutine comm_recv_integer_array

   subroutine comm_recv_integer64(comm, data, source, tag, status)
   !! Blocking receive of an integer64 from specified source.
   !! Use MPI_ANY_SOURCE or MPI_ANY_TAG for wildcards.
      type(comm_t), intent(in) :: comm
      integer(int64), intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr
      type(MPI_Status), intent(out), optional :: status

      if (present(status)) then
         call MPI_Recv(data, 1, MPI_INTEGER8, source, tag, comm%m_comm, status, ierr)
      else
         call MPI_Recv(data, 1, MPI_INTEGER8, source, tag, comm%m_comm, MPI_STATUS_IGNORE, ierr)
      end if
   end subroutine comm_recv_integer64

   subroutine comm_recv_integer64_array(comm, data, source, tag, status)
   !! Blocking receive of an integer64 array from specified source.
   !! Array is automatically allocated to the correct size.
      type(comm_t), intent(in) :: comm
      integer(int64), allocatable, intent(out) :: data(:)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out) :: status
      integer(int32) :: count
      integer(int32) :: ierr

      ! First probe to get message size
      call MPI_Probe(source, tag, comm%m_comm, status, ierr)
      call MPI_Get_count(status, MPI_INTEGER8, count, ierr)

      ! Allocate and receive
      allocate (data(count))
      call MPI_Recv(data, count, MPI_INTEGER8, source, tag, comm%m_comm, MPI_STATUS_IGNORE, ierr)
   end subroutine comm_recv_integer64_array

   subroutine comm_recv_real_dp(comm, data, source, tag, status)
      !! Blocking receive of a single double precision real from specified source.
      type(comm_t), intent(in) :: comm
      real(dp), intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr
      type(MPI_Status), intent(out), optional :: status

      if (present(status)) then
         call MPI_Recv(data, 1, MPI_DOUBLE_PRECISION, source, tag, comm%m_comm, status, ierr)
      else
         call MPI_Recv(data, 1, MPI_DOUBLE_PRECISION, source, tag, comm%m_comm, MPI_STATUS_IGNORE, ierr)
      end if
   end subroutine comm_recv_real_dp

   subroutine comm_recv_real_dp_array(comm, data, source, tag, status)
      !! Blocking receive of a double precision real array from specified source.
      type(comm_t), intent(in) :: comm
      real(dp), allocatable, intent(out) :: data(:)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out) :: status
      integer(int32) :: count
      integer(int32) :: ierr

      ! First probe to get message size
      call MPI_Probe(source, tag, comm%m_comm, status, ierr)
      call MPI_Get_count(status, MPI_DOUBLE_PRECISION, count, ierr)

      ! Allocate and receive
      allocate (data(count))
      call MPI_Recv(data, count, MPI_DOUBLE_PRECISION, source, tag, comm%m_comm, MPI_STATUS_IGNORE, ierr)
   end subroutine comm_recv_real_dp_array

   subroutine comm_recv_real_dp_array_2d(comm, data, source, tag, status)
   !! Blocking receive of a 2D allocatable double precision real array
      type(comm_t), intent(in) :: comm
      real(dp), intent(inout), allocatable :: data(:, :)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out) :: status
      integer(int32) :: ierr, count, dim1, dim2

      ! Receive dimensions first
      call MPI_Recv(dim1, 1, MPI_INTEGER, source, tag, comm%m_comm, MPI_STATUS_IGNORE, ierr)
      call MPI_Recv(dim2, 1, MPI_INTEGER, source, tag, comm%m_comm, MPI_STATUS_IGNORE, ierr)

      ! Allocate array with received dimensions
      if (.not. allocated(data)) then
         allocate (data(dim1, dim2))
      end if

      ! Receive data
      count = dim1*dim2
      call MPI_Recv(data, count, MPI_DOUBLE_PRECISION, source, tag, comm%m_comm, status, ierr)
   end subroutine comm_recv_real_dp_array_2d

   subroutine comm_recv_real_dp_array_2d_n(comm, data, count, source, tag, status)
      !! Blocking receive of a contiguous 2D double-precision array
      !! with explicit `count`.  No dim-prefix protocol — the caller
      !! has already shaped `data`.  Mirrors `comm_send_real_dp_array_2d_n`.
      type(comm_t), intent(in)    :: comm
      real(dp), intent(out)   :: data(:, :)
      integer(int32), intent(in)    :: count
      integer(int32), intent(in)    :: source
      integer(int32), intent(in)    :: tag
      type(MPI_Status), intent(out), optional :: status
      integer(int32) :: ierr

      if (present(status)) then
         call MPI_Recv(data, count, MPI_DOUBLE_PRECISION, source, tag, &
                       comm%m_comm, status, ierr)
      else
         call MPI_Recv(data, count, MPI_DOUBLE_PRECISION, source, tag, &
                       comm%m_comm, MPI_STATUS_IGNORE, ierr)
      end if
   end subroutine comm_recv_real_dp_array_2d_n

   subroutine comm_recv_integer_array_2d_n(comm, data, count, source, tag, status)
      !! Blocking recv of a contiguous 2D int32 array with explicit count.
      type(comm_t), intent(in)    :: comm
      integer(int32), intent(out)   :: data(:, :)
      integer(int32), intent(in)    :: count
      integer(int32), intent(in)    :: source
      integer(int32), intent(in)    :: tag
      type(MPI_Status), intent(out), optional :: status
      integer(int32) :: ierr

      if (present(status)) then
         call MPI_Recv(data, count, MPI_INTEGER, source, tag, &
                       comm%m_comm, status, ierr)
      else
         call MPI_Recv(data, count, MPI_INTEGER, source, tag, &
                       comm%m_comm, MPI_STATUS_IGNORE, ierr)
      end if
   end subroutine comm_recv_integer_array_2d_n

   subroutine comm_recv_real_sp(comm, data, source, tag, status)
      !! Blocking receive of a single single-precision real from specified source.
      type(comm_t), intent(in) :: comm
      real(sp), intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr
      type(MPI_Status), intent(out), optional :: status

      if (present(status)) then
         call MPI_Recv(data, 1, MPI_REAL, source, tag, comm%m_comm, status, ierr)
      else
         call MPI_Recv(data, 1, MPI_REAL, source, tag, comm%m_comm, MPI_STATUS_IGNORE, ierr)
      end if
   end subroutine comm_recv_real_sp

   subroutine comm_recv_real_sp_array(comm, data, source, tag, status)
      !! Blocking receive of a single-precision real array from specified source.
      type(comm_t), intent(in) :: comm
      real(sp), allocatable, intent(out) :: data(:)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out) :: status
      integer(int32) :: count
      integer(int32) :: ierr

      ! First probe to get message size
      call MPI_Probe(source, tag, comm%m_comm, status, ierr)
      call MPI_Get_count(status, MPI_REAL, count, ierr)

      ! Allocate and receive
      allocate (data(count))
      call MPI_Recv(data, count, MPI_REAL, source, tag, comm%m_comm, MPI_STATUS_IGNORE, ierr)
   end subroutine comm_recv_real_sp_array

   subroutine comm_recv_real_sp_array_2d(comm, data, source, tag, status)
   !! Blocking receive of a 2D allocatable single-precision real array
      type(comm_t), intent(in) :: comm
      real(sp), intent(inout), allocatable :: data(:, :)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out) :: status
      integer(int32) :: ierr, count, dim1, dim2

      ! Receive dimensions first
      call MPI_Recv(dim1, 1, MPI_INTEGER, source, tag, comm%m_comm, MPI_STATUS_IGNORE, ierr)
      call MPI_Recv(dim2, 1, MPI_INTEGER, source, tag, comm%m_comm, MPI_STATUS_IGNORE, ierr)

      ! Allocate array with received dimensions
      if (.not. allocated(data)) then
         allocate (data(dim1, dim2))
      end if

      ! Receive data
      count = dim1*dim2
      call MPI_Recv(data, count, MPI_REAL, source, tag, comm%m_comm, status, ierr)
   end subroutine comm_recv_real_sp_array_2d

   subroutine comm_recv_logical(comm, data, source, tag, status)
   !! Blocking receive of a logical value from specified source
      type(comm_t), intent(in) :: comm
      logical, intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out), optional :: status
      integer(int32) :: ierr

      if (present(status)) then
         call MPI_Recv(data, 1, MPI_LOGICAL, source, tag, comm%m_comm, status, ierr)
      else
         call MPI_Recv(data, 1, MPI_LOGICAL, source, tag, comm%m_comm, MPI_STATUS_IGNORE, ierr)
      end if
   end subroutine comm_recv_logical

   ! ========================================================================
   ! Fixed-size blocking receive variants
   !
   ! Mirror the existing `_array_n` shape used by the isend / irecv
   ! interfaces.  Take an explicit `count` instead of probing for size,
   ! so callers can receive into a pre-allocated buffer.  Useful for
   ! the I/O server pattern where buffer shapes are known up front.
   ! ========================================================================
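
   ! Illustrative fixed-size receive into a pre-allocated buffer (nmax,
   ! source and tag are protocol-defined placeholders):
   !
   !    real(dp) :: buf(nmax)
   !    call recv(comm, buf, nmax, source, tag)   ! resolves to comm_recv_real_dp_array_n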

   subroutine comm_recv_real_dp_array_n(comm, data, count, source, tag, status)
      !! Blocking receive into a pre-allocated double-precision array.
      type(comm_t), intent(in) :: comm
      real(dp), intent(out) :: data(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out), optional :: status
      integer(int32) :: ierr

      if (present(status)) then
         call MPI_Recv(data, count, MPI_DOUBLE_PRECISION, source, tag, &
                       comm%m_comm, status, ierr)
      else
         call MPI_Recv(data, count, MPI_DOUBLE_PRECISION, source, tag, &
                       comm%m_comm, MPI_STATUS_IGNORE, ierr)
      end if
   end subroutine comm_recv_real_dp_array_n

   subroutine comm_recv_real_sp_array_n(comm, data, count, source, tag, status)
      !! Blocking receive into a pre-allocated single-precision array.
      type(comm_t), intent(in) :: comm
      real(sp), intent(out) :: data(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out), optional :: status
      integer(int32) :: ierr

      if (present(status)) then
         call MPI_Recv(data, count, MPI_REAL, source, tag, &
                       comm%m_comm, status, ierr)
      else
         call MPI_Recv(data, count, MPI_REAL, source, tag, &
                       comm%m_comm, MPI_STATUS_IGNORE, ierr)
      end if
   end subroutine comm_recv_real_sp_array_n

   subroutine comm_recv_integer_array_n(comm, data, count, source, tag, status)
      !! Blocking receive into a pre-allocated int32 array.
      type(comm_t), intent(in) :: comm
      integer(int32), intent(out) :: data(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out), optional :: status
      integer(int32) :: ierr

      if (present(status)) then
         call MPI_Recv(data, count, MPI_INTEGER, source, tag, &
                       comm%m_comm, status, ierr)
      else
         call MPI_Recv(data, count, MPI_INTEGER, source, tag, &
                       comm%m_comm, MPI_STATUS_IGNORE, ierr)
      end if
   end subroutine comm_recv_integer_array_n

   subroutine comm_recv_integer64_array_n(comm, data, count, source, tag, status)
      !! Blocking receive into a pre-allocated int64 array.
      type(comm_t), intent(in) :: comm
      integer(int64), intent(out) :: data(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out), optional :: status
      integer(int32) :: ierr

      if (present(status)) then
         call MPI_Recv(data, count, MPI_INTEGER8, source, tag, &
                       comm%m_comm, status, ierr)
      else
         call MPI_Recv(data, count, MPI_INTEGER8, source, tag, &
                       comm%m_comm, MPI_STATUS_IGNORE, ierr)
      end if
   end subroutine comm_recv_integer64_array_n

   subroutine comm_iprobe(comm, source, tag, message_pending, status)
      !! Non-blocking probe for incoming messages
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      logical, intent(out) :: message_pending
      type(MPI_Status), intent(out) :: status
      integer(int32) :: ierr

      call MPI_Iprobe(source, tag, comm%m_comm, message_pending, status, ierr)
   end subroutine comm_iprobe

   subroutine comm_probe(comm, source, tag, status)
      !! Blocking probe for incoming messages.  Returns once a message
      !! matching `(source, tag)` is queued at the receiver — caller
      !! reads `status%MPI_SOURCE` and `status%MPI_TAG` to decide who's
      !! talking and what kind of message it is, then issues the
      !! matching `recv`.  Use `MPI_ANY_SOURCE` / `MPI_ANY_TAG` to
      !! dispatch on whichever rank speaks first.
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out) :: status
      integer(int32) :: ierr

      call MPI_Probe(source, tag, comm%m_comm, status, ierr)
   end subroutine comm_probe
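
   ! Illustrative probe-then-dispatch pattern; TAG_WORK, work_buf and nwork
   ! are placeholders for application-defined names:
   !
   !    type(MPI_Status) :: st
   !    call probe(comm, MPI_ANY_SOURCE, MPI_ANY_TAG, st)
   !    if (st%MPI_TAG == TAG_WORK) then
   !       call recv(comm, work_buf, nwork, st%MPI_SOURCE, st%MPI_TAG)
   !    end if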

   subroutine comm_finalize(this)
      !! Frees the MPI communicator resources
      class(comm_t), intent(inout) :: this
      integer(int32) :: ierr

      if (this%is_valid .and. this%m_comm /= MPI_COMM_NULL) then
         call MPI_Comm_free(this%m_comm, ierr)
         this%is_valid = .false.
         this%m_comm = MPI_COMM_NULL
      end if
   end subroutine comm_finalize

   subroutine abort_comm(comm, errorcode)
      !! Aborts all processes in the communicator with the given error code
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: errorcode
      integer(int32) :: ierr

      call MPI_Abort(comm%m_comm, errorcode, ierr)
   end subroutine abort_comm

   subroutine comm_allgather_integer(comm, sendbuf, recvbuf)
      !! Gathers integer values from all processes in the communicator
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: sendbuf
      integer(int32), intent(out) :: recvbuf(:)
      integer(int32) :: ierr

      call MPI_Allgather(sendbuf, 1, MPI_INTEGER, recvbuf, 1, MPI_INTEGER, comm%m_comm, ierr)
   end subroutine comm_allgather_integer

   subroutine comm_bcast_integer(comm, buffer, count, root)
      !! Broadcasts integer data from root process to all processes in communicator
      type(comm_t), intent(in) :: comm
      integer(int32), intent(inout) :: buffer
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: root
      integer(int32) :: ierr

      call MPI_Bcast(buffer, count, MPI_INTEGER, root, comm%m_comm, ierr)
   end subroutine comm_bcast_integer

   subroutine comm_bcast_integer64(comm, buffer, count, root)
      !! Broadcasts integer64 data from root process to all processes in communicator
      type(comm_t), intent(in) :: comm
      integer(int64), intent(inout) :: buffer
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: root
      integer(int32) :: ierr

      call MPI_Bcast(buffer, count, MPI_INTEGER8, root, comm%m_comm, ierr)
   end subroutine comm_bcast_integer64

   subroutine comm_bcast_real_dp(comm, buffer, count, root)
      !! Broadcasts double precision data from root process to all processes in communicator
      type(comm_t), intent(in) :: comm
      real(dp), intent(inout) :: buffer
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: root
      integer(int32) :: ierr

      call MPI_Bcast(buffer, count, MPI_DOUBLE_PRECISION, root, comm%m_comm, ierr)
   end subroutine comm_bcast_real_dp

   subroutine comm_bcast_real_dp_array(comm, buffer, count, root)
      !! Broadcasts double precision array from root process to all processes in communicator
      type(comm_t), intent(in) :: comm
      real(dp), intent(inout) :: buffer(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: root
      integer(int32) :: ierr

      call MPI_Bcast(buffer, count, MPI_DOUBLE_PRECISION, root, comm%m_comm, ierr)
   end subroutine comm_bcast_real_dp_array

   subroutine comm_bcast_real_sp(comm, buffer, count, root)
      !! Broadcasts single-precision data from root process to all processes in communicator
      type(comm_t), intent(in) :: comm
      real(sp), intent(inout) :: buffer
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: root
      integer(int32) :: ierr

      call MPI_Bcast(buffer, count, MPI_REAL, root, comm%m_comm, ierr)
   end subroutine comm_bcast_real_sp

   subroutine comm_bcast_real_sp_array(comm, buffer, count, root)
      !! Broadcasts single-precision array from root process to all processes in communicator
      type(comm_t), intent(in) :: comm
      real(sp), intent(inout) :: buffer(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: root
      integer(int32) :: ierr

      call MPI_Bcast(buffer, count, MPI_REAL, root, comm%m_comm, ierr)
   end subroutine comm_bcast_real_sp_array

   subroutine get_processor_name(name, namelen)
      !! Retrieves the name of the processor
      character(len=*), intent(inout) :: name
      integer(int32), intent(out) :: namelen
      integer(int32) :: ierr

      call MPI_Get_processor_name(name, namelen, ierr)
   end subroutine get_processor_name

   subroutine pic_mpi_init(requested_thread_level, provided_thread_level)
      !! Initialize MPI environment with optional threading support
      !!
      !! If no thread level is requested, uses MPI_THREAD_FUNNELED by default
      !! to allow OpenMP threading in compute_mbe_energy and similar functions.
      !!
      !! Thread levels:
      !!   MPI_THREAD_SINGLE: No threading support
      !!   MPI_THREAD_FUNNELED: Only main thread makes MPI calls (good for OpenMP)
      !!   MPI_THREAD_SERIALIZED: Multiple threads can make MPI calls, but not simultaneously
      !!   MPI_THREAD_MULTIPLE: Full thread safety
      integer(int32), intent(in), optional :: requested_thread_level
      integer(int32), intent(out), optional :: provided_thread_level
      integer(int32) :: ierr, requested, provided

      ! Default to FUNNELED for OpenMP compatibility
      if (present(requested_thread_level)) then
         requested = requested_thread_level
      else
         requested = MPI_THREAD_FUNNELED
      end if

      call MPI_Init_thread(requested, provided, ierr)

      ! Return the provided level if requested
      if (present(provided_thread_level)) then
         provided_thread_level = provided
      end if

      ! Warn if we didn't get what we asked for
      if (provided < requested .and. requested /= MPI_THREAD_SINGLE) then
         if (requested == MPI_THREAD_FUNNELED) then
            write (*, '(a)') "Warning: MPI_THREAD_FUNNELED requested but not provided."
            write (*, '(a)') "OpenMP threading in compute functions may not work correctly."
         else if (requested == MPI_THREAD_SERIALIZED) then
            write (*, '(a)') "Warning: MPI_THREAD_SERIALIZED requested but not provided."
         else if (requested == MPI_THREAD_MULTIPLE) then
            write (*, '(a)') "Warning: MPI_THREAD_MULTIPLE requested but not provided."
         end if
      end if
   end subroutine pic_mpi_init
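
   ! Illustrative initialization requesting full thread support and checking
   ! what the MPI library actually provided:
   !
   !    integer :: provided
   !    call pic_mpi_init(MPI_THREAD_MULTIPLE, provided)
   !    if (provided < MPI_THREAD_MULTIPLE) then
   !       ! fall back to funneled-style MPI usage
   !    end if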

   function pic_mpi_query_thread_level() result(thread_level)
      !! Query the current MPI thread support level
      use mpi_f08, only: MPI_Query_thread
      integer(int32) :: thread_level
      integer(int32) :: ierr

      call MPI_Query_thread(thread_level, ierr)
   end function pic_mpi_query_thread_level

   subroutine pic_mpi_finalize(ierr)
      !! Finalize MPI environment
      integer(int32), optional, intent(out) :: ierr
      integer(int32) :: ierr_local
      call MPI_Finalize(ierr_local)
      if (present(ierr)) ierr = ierr_local
   end subroutine pic_mpi_finalize

   ! ========================================================================
   ! Request type methods
   ! ========================================================================

   pure function request_is_null(this) result(is_null)
      !! Checks if the request is null/invalid
      class(request_t), intent(in) :: this
      logical :: is_null
      is_null = .not. this%is_valid
   end function request_is_null

   function request_get(this) result(mpi_request_out)
      !! Retrieves the underlying MPI_Request handle
      class(request_t), intent(in) :: this
      type(MPI_Request) :: mpi_request_out

      if (.not. this%is_valid) then
         error stop "Cannot get MPI_Request from null request"
      end if
      mpi_request_out = this%m_request
   end function request_get

   subroutine request_free(this)
      !! Frees the MPI request resources
      class(request_t), intent(inout) :: this
      this%m_request = MPI_REQUEST_NULL
      this%is_valid = .false.
   end subroutine request_free

   ! ========================================================================
   ! Non-blocking send operations
   ! ========================================================================

   subroutine comm_isend_integer(comm, data, dest, tag, request)
   !! Initiates a non-blocking send operation. The request must be
   !! waited on using wait() or test() before the buffer can be reused.
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, 1, MPI_INTEGER, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_integer

   subroutine comm_isend_integer_array(comm, data, dest, tag, request)
   !! Initiates a non-blocking send operation. The request must be
   !! waited on using wait() or test() before the buffer can be reused.
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: data(:)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, size(data), MPI_INTEGER, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_integer_array

   subroutine comm_isend_integer64(comm, data, dest, tag, request)
   !! Initiates a non-blocking send operation. The request must be
   !! waited on using wait() or test() before the buffer can be reused.
      type(comm_t), intent(in) :: comm
      integer(int64), intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, 1, MPI_INTEGER8, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_integer64

   subroutine comm_isend_integer64_array(comm, data, dest, tag, request)
   !! Initiates a non-blocking send operation. The request must be
   !! waited on using wait() or test() before the buffer can be reused.
      type(comm_t), intent(in) :: comm
      integer(int64), intent(in) :: data(:)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, size(data), MPI_INTEGER8, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_integer64_array

   subroutine comm_isend_real_dp(comm, data, dest, tag, request)
   !! Initiates a non-blocking send operation. The request must be
   !! waited on using wait() or test() before the buffer can be reused.
      type(comm_t), intent(in) :: comm
      real(dp), intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, 1, MPI_DOUBLE_PRECISION, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_real_dp

   subroutine comm_isend_real_dp_array(comm, data, dest, tag, request)
      !! Initiates a non-blocking send operation. The request must be
      !! waited on using wait() or test() before the buffer can be reused.
      type(comm_t), intent(in) :: comm
      real(dp), intent(in) :: data(:)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, size(data), MPI_DOUBLE_PRECISION, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_real_dp_array

   subroutine comm_isend_real_dp_array_2d(comm, data, dest, tag, request)
   !! Non-blocking send of a 2D double precision real array
      type(comm_t), intent(in) :: comm
      real(dp), intent(in) :: data(:, :)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr, dim1, dim2

      ! Send dimensions first (blocking - simple approach)
      dim1 = size(data, 1)
      dim2 = size(data, 2)
      call MPI_Send(dim1, 1, MPI_INTEGER, dest, tag, comm%m_comm, ierr)
      call MPI_Send(dim2, 1, MPI_INTEGER, dest, tag, comm%m_comm, ierr)

      ! Send data (non-blocking)
      call MPI_Isend(data, size(data), MPI_DOUBLE_PRECISION, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_real_dp_array_2d

   subroutine comm_isend_real_dp_array_2d_n(comm, data, count, dest, tag, request)
      !! Non-blocking send of a contiguous 2D double-precision array
      !! with explicit count — no dim-prefix protocol.  Pairs with
      !! `comm_recv_real_dp_array_2d_n` on the receive side.  The
      !! caller must keep `data` valid until `wait`/`waitall` on the
      !! returned request completes.
      type(comm_t), intent(in)  :: comm
      real(dp), intent(in)  :: data(:, :)
      integer(int32), intent(in)  :: count
      integer(int32), intent(in)  :: dest
      integer(int32), intent(in)  :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, count, MPI_DOUBLE_PRECISION, dest, tag, &
                     comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_real_dp_array_2d_n

   subroutine comm_isend_logical(comm, data, dest, tag, request)
   !! Non-blocking send of a logical value
      type(comm_t), intent(in) :: comm
      logical, intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, 1, MPI_LOGICAL, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_logical

   subroutine comm_isend_real_dp_array_n(comm, data, count, dest, tag, request)
      !! Non-blocking send with explicit count (for device pointers in host_data blocks)
      type(comm_t), intent(in) :: comm
      real(dp) :: data(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, count, MPI_DOUBLE_PRECISION, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_real_dp_array_n

   subroutine comm_isend_real_sp(comm, data, dest, tag, request)
   !! Non-blocking send of a single single-precision real
      type(comm_t), intent(in) :: comm
      real(sp), intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, 1, MPI_REAL, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_real_sp

   subroutine comm_isend_real_sp_array(comm, data, dest, tag, request)
      !! Non-blocking send of a single-precision real array
      type(comm_t), intent(in) :: comm
      real(sp), intent(in) :: data(:)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, size(data), MPI_REAL, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_real_sp_array

   subroutine comm_isend_real_sp_array_2d(comm, data, dest, tag, request)
   !! Non-blocking send of a 2D single-precision real array
      type(comm_t), intent(in) :: comm
      real(sp), intent(in) :: data(:, :)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr, dim1, dim2

      ! Send dimensions first (blocking - simple approach)
      dim1 = size(data, 1)
      dim2 = size(data, 2)
      call MPI_Send(dim1, 1, MPI_INTEGER, dest, tag, comm%m_comm, ierr)
      call MPI_Send(dim2, 1, MPI_INTEGER, dest, tag, comm%m_comm, ierr)

      ! Send data (non-blocking)
      call MPI_Isend(data, size(data), MPI_REAL, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_real_sp_array_2d

   subroutine comm_isend_real_sp_array_n(comm, data, count, dest, tag, request)
      !! Non-blocking send with explicit count for single-precision (for device pointers in host_data blocks)
      type(comm_t), intent(in) :: comm
      real(sp) :: data(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, count, MPI_REAL, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_real_sp_array_n

   ! ========================================================================
   ! Non-blocking receive operations
   ! ========================================================================

   subroutine comm_irecv_integer(comm, data, source, tag, request)
      !! Initiates a non-blocking receive operation. The request must be
      !! waited on using wait() or test() before the buffer can be used.
      type(comm_t), intent(in) :: comm
      integer(int32), intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, 1, MPI_INTEGER, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_integer

   subroutine comm_irecv_integer_array(comm, data, source, tag, request)
      !! Initiates a non-blocking receive operation. The request must be
      !! waited on using wait() or test() before the buffer can be used.
      type(comm_t), intent(in) :: comm
      integer(int32), intent(out) :: data(:)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, size(data), MPI_INTEGER, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_integer_array

   subroutine comm_irecv_integer64(comm, data, source, tag, request)
   !! Initiates a non-blocking receive operation. The request must be
   !! waited on using wait() or test() before the buffer can be used.
      type(comm_t), intent(in) :: comm
      integer(int64), intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, 1, MPI_INTEGER8, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_integer64

   subroutine comm_irecv_integer64_array(comm, data, source, tag, request)
   !! Initiates a non-blocking receive operation. The request must be
   !! waited on using wait() or test() before the buffer can be used.
      type(comm_t), intent(in) :: comm
      integer(int64), intent(out) :: data(:)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, size(data), MPI_INTEGER8, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_integer64_array

   subroutine comm_irecv_real_dp(comm, data, source, tag, request)
      !! Initiates a non-blocking receive of a scalar double precision real.
      !! The request must be waited on using wait() or test() before the
      !! buffer can be used.
      type(comm_t), intent(in) :: comm
      real(dp), intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, 1, MPI_DOUBLE_PRECISION, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_real_dp

   subroutine comm_irecv_real_dp_array(comm, data, source, tag, request)
      !! Initiates a non-blocking receive of a double precision real array.
      !! The request must be waited on using wait() or test() before the
      !! buffer can be used.
      type(comm_t), intent(in) :: comm
      real(dp), intent(out) :: data(:)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, size(data), MPI_DOUBLE_PRECISION, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_real_dp_array

   subroutine comm_irecv_real_dp_array_2d(comm, data, source, tag, request)
   !! Non-blocking receive of a 2D allocatable double precision real array
      type(comm_t), intent(in) :: comm
      real(dp), intent(inout), allocatable :: data(:, :)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr, dim1, dim2

      ! Receive dimensions first (blocking - needed to allocate)
      call MPI_Recv(dim1, 1, MPI_INTEGER, source, tag, comm%m_comm, MPI_STATUS_IGNORE, ierr)
      call MPI_Recv(dim2, 1, MPI_INTEGER, source, tag, comm%m_comm, MPI_STATUS_IGNORE, ierr)

      ! (Re)allocate so the buffer matches the received dimensions;
      ! receiving dim1*dim2 elements into a differently sized buffer would be unsafe
      if (allocated(data)) then
         if (size(data, 1) /= dim1 .or. size(data, 2) /= dim2) deallocate (data)
      end if
      if (.not. allocated(data)) then
         allocate (data(dim1, dim2))
      end if

      ! Receive data (non-blocking)
      call MPI_Irecv(data, dim1*dim2, MPI_DOUBLE_PRECISION, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_real_dp_array_2d

   subroutine comm_irecv_logical(comm, data, source, tag, request)
   !! Non-blocking receive of a logical value
      type(comm_t), intent(in) :: comm
      logical, intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, 1, MPI_LOGICAL, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_logical

   subroutine comm_irecv_real_dp_array_n(comm, data, count, source, tag, request)
      !! Non-blocking receive with explicit count (for device pointers in host_data blocks)
      type(comm_t), intent(in) :: comm
      real(dp) :: data(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, count, MPI_DOUBLE_PRECISION, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_real_dp_array_n

   subroutine comm_irecv_real_sp(comm, data, source, tag, request)
      !! Non-blocking receive of a scalar single-precision real
      type(comm_t), intent(in) :: comm
      real(sp), intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, 1, MPI_REAL, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_real_sp

   subroutine comm_irecv_real_sp_array(comm, data, source, tag, request)
      !! Non-blocking receive of a single-precision real array
      type(comm_t), intent(in) :: comm
      real(sp), intent(out) :: data(:)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, size(data), MPI_REAL, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_real_sp_array

   subroutine comm_irecv_real_sp_array_2d(comm, data, source, tag, request)
   !! Non-blocking receive of a 2D allocatable single-precision real array
      type(comm_t), intent(in) :: comm
      real(sp), intent(inout), allocatable :: data(:, :)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr, dim1, dim2

      ! Receive dimensions first (blocking - needed to allocate)
      call MPI_Recv(dim1, 1, MPI_INTEGER, source, tag, comm%m_comm, MPI_STATUS_IGNORE, ierr)
      call MPI_Recv(dim2, 1, MPI_INTEGER, source, tag, comm%m_comm, MPI_STATUS_IGNORE, ierr)

      ! (Re)allocate so the buffer matches the received dimensions;
      ! receiving dim1*dim2 elements into a differently sized buffer would be unsafe
      if (allocated(data)) then
         if (size(data, 1) /= dim1 .or. size(data, 2) /= dim2) deallocate (data)
      end if
      if (.not. allocated(data)) then
         allocate (data(dim1, dim2))
      end if

      ! Receive data (non-blocking)
      call MPI_Irecv(data, dim1*dim2, MPI_REAL, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_real_sp_array_2d
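
   ! Usage sketch (comment only): the 2D allocatable overloads form a matched
   ! pair. The sender transmits the two dimensions with blocking sends and the
   ! data with a non-blocking send; the receiver reads the dimensions,
   ! allocates if needed, and posts the data receive. The specific procedure
   ! names are used here for clarity; ranks, tags, and the wait() completion
   ! call are illustrative.
   !
   !    ! sending rank
   !    real(sp), allocatable :: a(:, :)
   !    call comm_isend_real_sp_array_2d(comm, a, dest, tag, req)
   !    call req%wait()
   !
   !    ! receiving rank (a may be unallocated on entry)
   !    call comm_irecv_real_sp_array_2d(comm, a, source, tag, req)
   !    call req%wait()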

   subroutine comm_irecv_real_sp_array_n(comm, data, count, source, tag, request)
      !! Non-blocking receive with explicit count for single-precision (for device pointers in host_data blocks)
      type(comm_t), intent(in) :: comm
      real(sp) :: data(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, count, MPI_REAL, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_real_sp_array_n

   ! ========================================================================
   ! Request completion operations
   ! ========================================================================

   subroutine request_wait(request, status)
   !! Blocks until the operation associated with the request completes.
   !! The request is freed after completion.
      type(request_t), intent(inout) :: request
      type(MPI_Status), intent(out), optional :: status
      integer(int32) :: ierr

      if (.not. request%is_valid) then
         error stop "Cannot wait on null request"
      end if

      if (present(status)) then
         call MPI_Wait(request%m_request, status, ierr)
      else
         call MPI_Wait(request%m_request, MPI_STATUS_IGNORE, ierr)
      end if

      call request%free()
   end subroutine request_wait

   subroutine request_waitall(requests, statuses)
   !! Blocks until all operations in the request array complete.
   !! All requests are freed after completion.
      type(request_t), intent(inout) :: requests(:)
      type(MPI_Status), intent(out), optional :: statuses(:)
      type(MPI_Request), allocatable :: mpi_requests(:)
      type(MPI_Status), allocatable :: temp_statuses(:)
      integer(int32) :: i, count, ierr

      count = size(requests)
      allocate (mpi_requests(count))

      do i = 1, count
         if (requests(i)%is_valid) then
            mpi_requests(i) = requests(i)%m_request
         else
            mpi_requests(i) = MPI_REQUEST_NULL
         end if
      end do

      if (present(statuses)) then
         call MPI_Waitall(count, mpi_requests, statuses, ierr)
      else
         allocate (temp_statuses(count))
         call MPI_Waitall(count, mpi_requests, temp_statuses, ierr)
      end if

      do i = 1, count
         call requests(i)%free()
      end do
   end subroutine request_waitall

   subroutine request_test(request, flag, status)
   !! Non-blocking check for completion. Sets flag to .true. and frees the
   !! request if the operation has completed; otherwise the request stays active.
      type(request_t), intent(inout) :: request
      logical, intent(out) :: flag
      type(MPI_Status), intent(out), optional :: status
      integer(int32) :: ierr

      if (.not. request%is_valid) then
         flag = .true.
         return
      end if

      if (present(status)) then
         call MPI_Test(request%m_request, flag, status, ierr)
      else
         call MPI_Test(request%m_request, flag, MPI_STATUS_IGNORE, ierr)
      end if

      if (flag) then
         call request%free()
      end if
   end subroutine request_test
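
   ! Usage sketch (comment only): the usual post/overlap/complete pattern with
   ! the request helpers above. The isend/irecv generics and the wait()/test()
   ! bindings are assumed to resolve to the specific procedures in this module;
   ! the buffers and the overlapped work are placeholders.
   !
   !    type(request_t) :: req_r, req_s
   !    call irecv(comm, inbuf, source, tag, req_r)
   !    call isend(comm, outbuf, dest, tag, req_s)
   !    ! ... computation that touches neither inbuf nor outbuf ...
   !    call req_r%wait()   ! inbuf is now safe to read
   !    call req_s%wait()   ! outbuf may now be reused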

   ! ========================================================================
   ! Window creation
   ! ========================================================================

   function create_win_dp_array(comm, base, win_size) result(win)
      !! Create MPI window for RMA operations
      !!
      !! Creates a window exposing local memory to remote RMA operations.
      !! Used for DDI distributed arrays.
      type(comm_t), intent(in) :: comm
      real(dp), target :: base(:)
      integer(MPI_ADDRESS_KIND), intent(in) :: win_size
      type(win_t) :: win
      integer(int32) :: ierr
      integer(int32) :: disp_unit

      disp_unit = int(storage_size(base(1))/8_int32, int32)
      call MPI_Win_create(base, win_size, disp_unit, &
                          MPI_INFO_NULL, comm%get(), win%m_win, ierr)
      win%is_valid = .true.
   end function create_win_dp_array

   function create_win_dynamic(comm) result(win)
      !! Create dynamic MPI window
      !!
      !! For windows where memory will be attached later.
      !! Useful for load balancing counters.
      type(comm_t), intent(in) :: comm
      type(win_t) :: win
      integer(int32) :: ierr

      call MPI_Win_create_dynamic(MPI_INFO_NULL, comm%get(), win%m_win, ierr)
      win%is_valid = .true.
   end function create_win_dynamic

   subroutine create_win_allocate_dp_1d(comm, length, baseptr, win)
      use iso_c_binding, only: c_ptr, c_f_pointer
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: length
      real(dp), pointer, intent(out) :: baseptr(:)
      type(win_t), intent(out) :: win
      type(c_ptr) :: cptr
      integer(MPI_ADDRESS_KIND) :: win_size
      integer(int32) :: disp_unit, ierr

      win_size = int(length, MPI_ADDRESS_KIND)*int(storage_size(1.0_dp)/8_int32, MPI_ADDRESS_KIND)
      disp_unit = int(storage_size(1.0_dp)/8_int32, int32)

      call MPI_Win_allocate(win_size, disp_unit, MPI_INFO_NULL, &
                            comm%get(), cptr, win%m_win, ierr)
      call c_f_pointer(cptr, baseptr, [length])
      win%is_valid = .true.
   end subroutine create_win_allocate_dp_1d

   !! Allocate window memory and create window in one call (2D array)
   !!
   !! This is more efficient than separate allocation + win_create.
   !! The baseptr is associated with the allocated memory.
   !! Memory is freed when window is finalized.
   subroutine create_win_allocate_dp_2d(comm, dim1, dim2, baseptr, win)
      use iso_c_binding, only: c_ptr, c_f_pointer
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: dim1, dim2
      real(dp), pointer, intent(out) :: baseptr(:, :)
      type(win_t), intent(out) :: win
      type(c_ptr) :: cptr
      integer(MPI_ADDRESS_KIND) :: win_size
      integer(int32) :: disp_unit, ierr

      ! Compute the byte count in MPI_ADDRESS_KIND so dim1*dim2 cannot overflow int32
      win_size = int(dim1, MPI_ADDRESS_KIND)*int(dim2, MPI_ADDRESS_KIND) &
                 *int(storage_size(1.0_dp)/8, MPI_ADDRESS_KIND)
      disp_unit = int(storage_size(1.0_dp)/8_int32, int32)

      call MPI_Win_allocate(win_size, disp_unit, MPI_INFO_NULL, &
                            comm%get(), cptr, win%m_win, ierr)
      call c_f_pointer(cptr, baseptr, [dim1, dim2])
      win%is_valid = .true.
   end subroutine create_win_allocate_dp_2d
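
   ! Usage sketch (comment only): allocating window memory and exposing it in
   ! one call, then working through the returned Fortran pointer. The specific
   ! procedure name is used directly; the fence and finalize binding names are
   ! assumed from the win_t procedures further below.
   !
   !    real(dp), pointer :: patch(:, :)
   !    type(win_t) :: win
   !    call create_win_allocate_dp_2d(comm, nrows, ncols, patch, win)
   !    patch = 0.0_dp        ! initialize the locally exposed memory
   !    call win%fence()      ! open an active-target epoch
   !    ! ... remote Get/Put/Accumulate operations ...
   !    call win%fence()      ! close the epoch
   !    call win%finalize()   ! frees both the window and its memory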

   !! Allocate window memory for integer64 array
   !!
   !! Used for atomic counters in dynamic load balancing.
   subroutine create_win_allocate_i64_1d(comm, length, baseptr, win)
      use iso_c_binding, only: c_ptr, c_f_pointer
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: length
      integer(int64), pointer, intent(out) :: baseptr(:)
      type(win_t), intent(out) :: win
      type(c_ptr) :: cptr
      integer(MPI_ADDRESS_KIND) :: win_size
      integer(int32) :: disp_unit, ierr

      win_size = int(length, MPI_ADDRESS_KIND)*int(storage_size(1_int64)/8_int32, MPI_ADDRESS_KIND)
      disp_unit = int(storage_size(1_int64)/8_int32, int32)

      call MPI_Win_allocate(win_size, disp_unit, MPI_INFO_NULL, &
                            comm%get(), cptr, win%m_win, ierr)
      call c_f_pointer(cptr, baseptr, [length])
      win%is_valid = .true.
   end subroutine create_win_allocate_i64_1d

   !! Allocate window memory for single precision array
   subroutine create_win_allocate_sp_1d(comm, length, baseptr, win)
      use iso_c_binding, only: c_ptr, c_f_pointer
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: length
      real(sp), pointer, intent(out) :: baseptr(:)
      type(win_t), intent(out) :: win
      type(c_ptr) :: cptr
      integer(MPI_ADDRESS_KIND) :: win_size
      integer(int32) :: disp_unit, ierr

      win_size = int(length, MPI_ADDRESS_KIND)*int(storage_size(1.0_sp)/8_int32, MPI_ADDRESS_KIND)
      disp_unit = int(storage_size(1.0_sp)/8_int32, int32)

      call MPI_Win_allocate(win_size, disp_unit, MPI_INFO_NULL, &
                            comm%get(), cptr, win%m_win, ierr)
      call c_f_pointer(cptr, baseptr, [length])
      win%is_valid = .true.
   end subroutine create_win_allocate_sp_1d

   !! Allocate window memory for integer32 array
   subroutine create_win_allocate_i32_1d(comm, length, baseptr, win)
      use iso_c_binding, only: c_ptr, c_f_pointer
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: length
      integer(int32), pointer, intent(out) :: baseptr(:)
      type(win_t), intent(out) :: win
      type(c_ptr) :: cptr
      integer(MPI_ADDRESS_KIND) :: win_size
      integer(int32) :: disp_unit, ierr

      win_size = int(length, MPI_ADDRESS_KIND)*int(storage_size(1_int32)/8_int32, MPI_ADDRESS_KIND)
      disp_unit = int(storage_size(1_int32)/8_int32, int32)

      call MPI_Win_allocate(win_size, disp_unit, MPI_INFO_NULL, &
                            comm%get(), cptr, win%m_win, ierr)
      call c_f_pointer(cptr, baseptr, [length])
      win%is_valid = .true.
   end subroutine create_win_allocate_i32_1d

   ! ========================================================================
   ! Window query methods
   ! ========================================================================

   pure function win_is_null(this) result(is_null)
      !! Returns .true. if the window has not been created or has already been freed
      class(win_t), intent(in) :: this
      logical :: is_null
      is_null = .not. this%is_valid
   end function win_is_null

   function win_get_handle(this) result(mpi_win_out)
      !! Returns the underlying MPI_Win handle; stops with an error on a null window
      class(win_t), intent(in) :: this
      type(MPI_Win) :: mpi_win_out

      if (.not. this%is_valid) then
         error stop "Cannot get MPI_Win from null window"
      end if
      mpi_win_out = this%m_win
   end function win_get_handle

   ! ========================================================================
   ! Synchronization
   ! ========================================================================

   !! Fence synchronization for active target RMA
   !!
   !! Completes all pending RMA operations.
   !! Use before/after Get/Put/Accumulate operations.
   subroutine win_fence(this, assert)
      class(win_t), intent(in) :: this
      integer(int32), intent(in), optional :: assert
      integer(int32) :: ierr, assert_val

      if (present(assert)) then
         assert_val = assert
      else
         assert_val = 0_int32
      end if

      call MPI_Win_fence(assert_val, this%m_win, ierr)
   end subroutine win_fence
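
   ! Usage sketch (comment only): a fence-delimited active-target epoch. The
   ! fence/put binding names are assumed from the specific procedures in this
   ! module; disp is an element displacement of kind MPI_ADDRESS_KIND.
   !
   !    call win%fence()                               ! open the epoch
   !    call win%put(target_rank, disp, n, local_buf)  ! queue the transfer
   !    call win%fence()                               ! close it; data is now visible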

   !! Lock window for passive target RMA
   !!
   !! Begins RMA access epoch for specified target rank.
   !! Must be paired with unlock.
   subroutine win_lock(this, rank, lock_type)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: rank
      integer(int32), intent(in), optional :: lock_type
      integer(int32) :: ierr, ltype

      if (present(lock_type)) then
         ltype = lock_type
      else
         ltype = MPI_LOCK_SHARED
      end if

      call MPI_Win_lock(ltype, rank, 0_int32, this%m_win, ierr)
   end subroutine win_lock

   !! Unlock window for passive target RMA
   subroutine win_unlock(this, rank)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: rank
      integer(int32) :: ierr

      call MPI_Win_unlock(rank, this%m_win, ierr)
   end subroutine win_unlock

   !! Lock window on all ranks for passive target RMA
   !!
   !! Begins RMA access epoch for all ranks simultaneously.
   !! More efficient than individual locks when accessing multiple ranks.
   !! Must be paired with unlock_all.
   subroutine win_lock_all(this, assert)
      class(win_t), intent(in) :: this
      integer(int32), intent(in), optional :: assert
      integer(int32) :: ierr, assert_val

      if (present(assert)) then
         assert_val = assert
      else
         assert_val = 0_int32
      end if

      call MPI_Win_lock_all(assert_val, this%m_win, ierr)
   end subroutine win_lock_all

   !! Unlock window on all ranks for passive target RMA
   subroutine win_unlock_all(this)
      class(win_t), intent(in) :: this
      integer(int32) :: ierr

      call MPI_Win_unlock_all(this%m_win, ierr)
   end subroutine win_unlock_all

   !! Flush pending RMA operations to a specific rank
   !!
   !! Ensures all RMA operations to target rank have completed.
   !! Use within lock_all/unlock_all epoch.
   subroutine win_flush(this, rank)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: rank
      integer(int32) :: ierr

      call MPI_Win_flush(rank, this%m_win, ierr)
   end subroutine win_flush

   !! Flush pending RMA operations to all ranks
   !!
   !! Ensures all RMA operations to all ranks have completed.
   !! Use within lock_all/unlock_all epoch.
   subroutine win_flush_all(this)
      class(win_t), intent(in) :: this
      integer(int32) :: ierr

      call MPI_Win_flush_all(this%m_win, ierr)
   end subroutine win_flush_all
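
   ! Usage sketch (comment only): a passive-target epoch over all ranks with a
   ! per-target flush so results can be consumed before the epoch ends. The
   ! lock_all/get/flush/unlock_all binding names are assumed from the specific
   ! procedures above.
   !
   !    call win%lock_all()
   !    call win%get(owner, disp, n, local_buf)   ! not complete until flushed
   !    call win%flush(owner)                     ! local_buf is now valid
   !    ! ... use local_buf, possibly issue further operations ...
   !    call win%unlock_all()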

   ! ========================================================================
   ! RMA Get/Put/Accumulate operations
   ! ========================================================================

   !! Get data from remote window
   !!
   !! Retrieves data from target rank's window into local buffer.
   !! Must be called between fence or lock/unlock pairs.
   subroutine win_get_dp(this, target_rank, target_disp, count, buffer)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      real(dp), intent(out) :: buffer(*)
      integer(int32) :: ierr

      call MPI_Get(buffer, count, MPI_DOUBLE_PRECISION, &
                   target_rank, target_disp, count, MPI_DOUBLE_PRECISION, &
                   this%m_win, ierr)
   end subroutine win_get_dp

   !! Put data to remote window
   !!
   !! Sends data from local buffer to target rank's window.
   !! Must be called between fence or lock/unlock pairs.
   subroutine win_put_dp(this, target_rank, target_disp, count, buffer)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      real(dp), intent(in) :: buffer(*)
      integer(int32) :: ierr

      call MPI_Put(buffer, count, MPI_DOUBLE_PRECISION, &
                   target_rank, target_disp, count, MPI_DOUBLE_PRECISION, &
                   this%m_win, ierr)
   end subroutine win_put_dp

   !! Non-blocking get data from remote window
   !!
   !! Retrieves data from target rank's window into local buffer.
   !! Returns a request handle for later completion via wait().
   !! Must be called between lock_all/unlock_all pairs.
   subroutine win_rget_dp(this, target_rank, target_disp, count, buffer, request)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      real(dp), intent(out) :: buffer(*)
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Rget(buffer, count, MPI_DOUBLE_PRECISION, &
                    target_rank, target_disp, count, MPI_DOUBLE_PRECISION, &
                    this%m_win, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine win_rget_dp

   !! Non-blocking put data to remote window
   !!
   !! Sends data from local buffer to target rank's window.
   !! Returns a request handle for later completion via wait().
   !! Must be called between lock_all/unlock_all pairs.
   subroutine win_rput_dp(this, target_rank, target_disp, count, buffer, request)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      real(dp), intent(in) :: buffer(*)
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Rput(buffer, count, MPI_DOUBLE_PRECISION, &
                    target_rank, target_disp, count, MPI_DOUBLE_PRECISION, &
                    this%m_win, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine win_rput_dp

   !! Accumulate data to remote window
   !!
   !! Atomically adds local buffer to target rank's window.
   !! Critical for DDI_ACC (Fock matrix accumulation).
   subroutine win_accumulate_dp(this, target_rank, target_disp, count, buffer, op)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      real(dp), intent(in) :: buffer(*)
      type(MPI_Op), intent(in), optional :: op
      type(MPI_Op) :: mpi_op
      integer(int32) :: ierr

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Accumulate(buffer, count, MPI_DOUBLE_PRECISION, &
                          target_rank, target_disp, count, MPI_DOUBLE_PRECISION, &
                          mpi_op, this%m_win, ierr)
   end subroutine win_accumulate_dp
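
   ! Usage sketch (comment only): a DDI_ACC-style remote accumulation under a
   ! shared lock on the owning rank. The lock/accumulate/unlock binding names
   ! are assumed; when op is omitted the contribution is added with MPI_SUM.
   !
   !    call win%lock(owner)                          ! MPI_LOCK_SHARED by default
   !    call win%accumulate(owner, disp, n, contrib)  ! contrib added into the window
   !    call win%unlock(owner)                        ! completes the accumulate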

   ! ========================================================================
   ! Single precision (sp) RMA operations
   ! ========================================================================

   subroutine win_get_sp(this, target_rank, target_disp, count, buffer)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      real(sp), intent(out) :: buffer(*)
      integer(int32) :: ierr

      call MPI_Get(buffer, count, MPI_REAL, &
                   target_rank, target_disp, count, MPI_REAL, &
                   this%m_win, ierr)
   end subroutine win_get_sp

   subroutine win_put_sp(this, target_rank, target_disp, count, buffer)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      real(sp), intent(in) :: buffer(*)
      integer(int32) :: ierr

      call MPI_Put(buffer, count, MPI_REAL, &
                   target_rank, target_disp, count, MPI_REAL, &
                   this%m_win, ierr)
   end subroutine win_put_sp

   subroutine win_rget_sp(this, target_rank, target_disp, count, buffer, request)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      real(sp), intent(out) :: buffer(*)
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Rget(buffer, count, MPI_REAL, &
                    target_rank, target_disp, count, MPI_REAL, &
                    this%m_win, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine win_rget_sp

   subroutine win_rput_sp(this, target_rank, target_disp, count, buffer, request)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      real(sp), intent(in) :: buffer(*)
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Rput(buffer, count, MPI_REAL, &
                    target_rank, target_disp, count, MPI_REAL, &
                    this%m_win, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine win_rput_sp

   subroutine win_accumulate_sp(this, target_rank, target_disp, count, buffer, op)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      real(sp), intent(in) :: buffer(*)
      type(MPI_Op), intent(in), optional :: op
      type(MPI_Op) :: mpi_op
      integer(int32) :: ierr

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Accumulate(buffer, count, MPI_REAL, &
                          target_rank, target_disp, count, MPI_REAL, &
                          mpi_op, this%m_win, ierr)
   end subroutine win_accumulate_sp

   ! ========================================================================
   ! Integer32 (i32) RMA operations
   ! ========================================================================

   subroutine win_get_i32(this, target_rank, target_disp, count, buffer)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      integer(int32), intent(out) :: buffer(*)
      integer(int32) :: ierr

      call MPI_Get(buffer, count, MPI_INTEGER, &
                   target_rank, target_disp, count, MPI_INTEGER, &
                   this%m_win, ierr)
   end subroutine win_get_i32

   subroutine win_put_i32(this, target_rank, target_disp, count, buffer)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: buffer(*)
      integer(int32) :: ierr

      call MPI_Put(buffer, count, MPI_INTEGER, &
                   target_rank, target_disp, count, MPI_INTEGER, &
                   this%m_win, ierr)
   end subroutine win_put_i32

   subroutine win_rget_i32(this, target_rank, target_disp, count, buffer, request)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      integer(int32), intent(out) :: buffer(*)
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Rget(buffer, count, MPI_INTEGER, &
                    target_rank, target_disp, count, MPI_INTEGER, &
                    this%m_win, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine win_rget_i32

   subroutine win_rput_i32(this, target_rank, target_disp, count, buffer, request)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: buffer(*)
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Rput(buffer, count, MPI_INTEGER, &
                    target_rank, target_disp, count, MPI_INTEGER, &
                    this%m_win, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine win_rput_i32

   subroutine win_accumulate_i32(this, target_rank, target_disp, count, buffer, op)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: buffer(*)
      type(MPI_Op), intent(in), optional :: op
      type(MPI_Op) :: mpi_op
      integer(int32) :: ierr

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Accumulate(buffer, count, MPI_INTEGER, &
                          target_rank, target_disp, count, MPI_INTEGER, &
                          mpi_op, this%m_win, ierr)
   end subroutine win_accumulate_i32

   ! ========================================================================
   ! Integer64 (i64) RMA operations
   ! ========================================================================

   subroutine win_get_i64(this, target_rank, target_disp, count, buffer)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      integer(int64), intent(out) :: buffer(*)
      integer(int32) :: ierr

      call MPI_Get(buffer, count, MPI_INTEGER8, &
                   target_rank, target_disp, count, MPI_INTEGER8, &
                   this%m_win, ierr)
   end subroutine win_get_i64

   subroutine win_put_i64(this, target_rank, target_disp, count, buffer)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      integer(int64), intent(in) :: buffer(*)
      integer(int32) :: ierr

      call MPI_Put(buffer, count, MPI_INTEGER8, &
                   target_rank, target_disp, count, MPI_INTEGER8, &
                   this%m_win, ierr)
   end subroutine win_put_i64

   subroutine win_rget_i64(this, target_rank, target_disp, count, buffer, request)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      integer(int64), intent(out) :: buffer(*)
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Rget(buffer, count, MPI_INTEGER8, &
                    target_rank, target_disp, count, MPI_INTEGER8, &
                    this%m_win, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine win_rget_i64

   subroutine win_rput_i64(this, target_rank, target_disp, count, buffer, request)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      integer(int64), intent(in) :: buffer(*)
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Rput(buffer, count, MPI_INTEGER8, &
                    target_rank, target_disp, count, MPI_INTEGER8, &
                    this%m_win, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine win_rput_i64

   subroutine win_accumulate_i64(this, target_rank, target_disp, count, buffer, op)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int32), intent(in) :: count
      integer(int64), intent(in) :: buffer(*)
      type(MPI_Op), intent(in), optional :: op
      type(MPI_Op) :: mpi_op
      integer(int32) :: ierr

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Accumulate(buffer, count, MPI_INTEGER8, &
                          target_rank, target_disp, count, MPI_INTEGER8, &
                          mpi_op, this%m_win, ierr)
   end subroutine win_accumulate_i64

   !! Atomic fetch-and-add for load balancing
   !!
   !! Atomically increments remote counter and returns old value.
   !! Used for DDI_DLBNEXT (dynamic load balancing).
   subroutine win_fetch_and_add_i64(this, target_rank, target_disp, value, result)
      class(win_t), intent(in) :: this
      integer(int32), intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int64), intent(in) :: value
      integer(int64), intent(out) :: result
      integer(int32) :: ierr

      call MPI_Fetch_and_op(value, result, MPI_INTEGER8, &
                            target_rank, target_disp, MPI_SUM, this%m_win, ierr)
   end subroutine win_fetch_and_add_i64
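
   ! Usage sketch (comment only): a DDI_DLBNEXT-style shared task counter held
   ! on rank 0. Each rank atomically claims the next task index; the
   ! lock/fetch_and_add/unlock binding names are assumed from the specific
   ! procedures above.
   !
   !    integer(int64) :: next_task
   !    call win%lock(0)
   !    call win%fetch_and_add(0, 0_MPI_ADDRESS_KIND, 1_int64, next_task)
   !    call win%unlock(0)
   !    ! next_task holds the counter value from before the increment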

   ! ========================================================================
   ! Window cleanup
   ! ========================================================================

   subroutine win_finalize(this)
      class(win_t), intent(inout) :: this
      integer(int32) :: ierr

      if (this%is_valid .and. this%m_win /= MPI_WIN_NULL) then
         call MPI_Win_free(this%m_win, ierr)
         this%is_valid = .false.
         this%m_win = MPI_WIN_NULL
      end if
   end subroutine win_finalize

   ! ========================================================================
   ! Allreduce operations (for DDI_GSUMF/GSUMI)
   ! ========================================================================

   !! Allreduce for scalar double precision
   !!
   !! In-place global reduction. Replaces DDI_GSUMF for scalars.
   subroutine allreduce_dp(comm, buffer, op)
      type(comm_t), intent(in) :: comm
      real(dp), intent(inout) :: buffer
      type(MPI_Op), intent(in), optional :: op
      type(MPI_Op) :: mpi_op
      integer :: ierr

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(MPI_IN_PLACE, buffer, 1, MPI_DOUBLE_PRECISION, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_dp

   !! Allreduce for double precision array
   !!
   !! In-place global reduction. Replaces DDI_GSUMF for arrays.
   !! This is THE most-called DDI function (1,301 calls in GAMESS).
   subroutine allreduce_dp_array(comm, buffer, count, op)
      type(comm_t), intent(in) :: comm
      real(dp), intent(inout) :: buffer(:)
      integer, intent(in), optional :: count
      type(MPI_Op), intent(in), optional :: op
      type(MPI_Op) :: mpi_op
      integer :: ierr, n

      if (present(count)) then
         n = count
      else
         n = size(buffer)
      end if

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(MPI_IN_PLACE, buffer, n, MPI_DOUBLE_PRECISION, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_dp_array
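
   ! Usage sketch (comment only): the in-place global sum that replaces
   ! DDI_GSUMF. Every rank contributes its partial values and all ranks end up
   ! with the summed result in the same buffer; the specific procedure name is
   ! used here regardless of whatever public generic wraps it.
   !
   !    real(dp) :: partial(nbuf)
   !    ! ... each rank fills partial with its local contributions ...
   !    call allreduce_dp_array(comm, partial)   ! partial now holds the global sum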

   !! Allreduce for scalar integer
   !!
   !! In-place global reduction. Replaces DDI_GSUMI for scalars.
   subroutine allreduce_i32(comm, buffer, op)
      type(comm_t), intent(in) :: comm
      integer(int32), intent(inout) :: buffer
      type(MPI_Op), intent(in), optional :: op
      type(MPI_Op) :: mpi_op
      integer :: ierr

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(MPI_IN_PLACE, buffer, 1, MPI_INTEGER, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_i32

   !! Allreduce for integer array
   !!
   !! In-place global reduction. Replaces DDI_GSUMI for arrays.
   subroutine allreduce_i32_array(comm, buffer, count, op)
      type(comm_t), intent(in) :: comm
      integer(int32), intent(inout) :: buffer(:)
      integer, intent(in), optional :: count
      type(MPI_Op), intent(in), optional :: op
      type(MPI_Op) :: mpi_op
      integer :: ierr, n

      if (present(count)) then
         n = count
      else
         n = size(buffer)
      end if

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(MPI_IN_PLACE, buffer, n, MPI_INTEGER, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_i32_array

   !! Non-in-place allreduce for scalar double precision
   !!
   !! Reduces sendbuf and stores result in recvbuf.
   !! Useful for timestep reduction where local value must be preserved.
   subroutine allreduce_dp_to(comm, sendbuf, recvbuf, op)
      type(comm_t), intent(in) :: comm
      real(dp), intent(in) :: sendbuf
      real(dp), intent(out) :: recvbuf
      type(MPI_Op), intent(in), optional :: op
      type(MPI_Op) :: mpi_op
      integer :: ierr

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(sendbuf, recvbuf, 1, MPI_DOUBLE_PRECISION, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_dp_to
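
   ! Usage sketch (comment only): a global minimum that preserves the local
   ! value, e.g. for timestep reduction. MPI_MIN comes from mpi_f08 and must
   ! be passed explicitly because MPI_SUM is the default.
   !
   !    real(dp) :: dt_local, dt_global
   !    call allreduce_dp_to(comm, dt_local, dt_global, op=MPI_MIN)
   !    ! dt_local is unchanged; dt_global is the minimum over all ranks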

   !! Non-in-place allreduce for double precision array
   !!
   !! Reduces sendbuf and stores result in recvbuf.
   subroutine allreduce_dp_array_to(comm, sendbuf, recvbuf, count, op)
      type(comm_t), intent(in) :: comm
      real(dp), intent(in) :: sendbuf(:)
      real(dp), intent(out) :: recvbuf(:)
      integer, intent(in), optional :: count
      type(MPI_Op), intent(in), optional :: op
      type(MPI_Op) :: mpi_op
      integer :: ierr, n

      if (present(count)) then
         n = count
      else
         n = size(sendbuf)
      end if

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(sendbuf, recvbuf, n, MPI_DOUBLE_PRECISION, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_dp_array_to

   !! Allreduce for single-precision scalar
   subroutine allreduce_sp(comm, buffer, op)
      type(comm_t), intent(in) :: comm
      real(sp), intent(inout) :: buffer
      type(MPI_Op), intent(in), optional :: op
      type(MPI_Op) :: mpi_op
      integer :: ierr

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(MPI_IN_PLACE, buffer, 1, MPI_REAL, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_sp

   !! Allreduce for single-precision array
   subroutine allreduce_sp_array(comm, buffer, count, op)
      type(comm_t), intent(in) :: comm
      real(sp), intent(inout) :: buffer(:)
      integer, intent(in), optional :: count
      type(MPI_Op), intent(in), optional :: op
      type(MPI_Op) :: mpi_op
      integer :: ierr, n

      if (present(count)) then
         n = count
      else
         n = size(buffer)
      end if

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(MPI_IN_PLACE, buffer, n, MPI_REAL, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_sp_array

   !! Non-in-place allreduce for scalar single-precision
   subroutine allreduce_sp_to(comm, sendbuf, recvbuf, op)
      type(comm_t), intent(in) :: comm
      real(sp), intent(in) :: sendbuf
      real(sp), intent(out) :: recvbuf
      type(MPI_Op), intent(in), optional :: op
      type(MPI_Op) :: mpi_op
      integer :: ierr

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(sendbuf, recvbuf, 1, MPI_REAL, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_sp_to

   !! Non-in-place allreduce for single-precision array
   subroutine allreduce_sp_array_to(comm, sendbuf, recvbuf, count, op)
      type(comm_t), intent(in) :: comm
      real(sp), intent(in) :: sendbuf(:)
      real(sp), intent(out) :: recvbuf(:)
      integer, intent(in), optional :: count
      type(MPI_Op), intent(in), optional :: op
      type(MPI_Op) :: mpi_op
      integer :: ierr, n

      if (present(count)) then
         n = count
      else
         n = size(sendbuf)
      end if

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(sendbuf, recvbuf, n, MPI_REAL, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_sp_array_to

end module pic_mpi_f08