module pic_mpi
!! Legacy MPI wrapper module using the traditional MPI interface
!!
!! This module provides a high-level object-oriented interface to MPI
!! using the legacy MPI bindings for compatibility with older MPI implementations.
!! It provides the same API as pic_mpi_f08 but uses integer-based MPI handles.
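!!
!! Minimal usage sketch (assumes the program is launched under an MPI runtime):
!!
!!     use pic_mpi, only: comm_t, comm_world, pic_mpi_init, pic_mpi_finalize
!!     type(comm_t) :: world
!!
!!     call pic_mpi_init()            ! defaults to MPI_THREAD_FUNNELED
!!     world = comm_world()           ! duplicate of MPI_COMM_WORLD
!!     if (world%leader()) print *, "ranks:", world%size()
!!     call world%barrier()
!!     call world%finalize()
!!     call pic_mpi_finalize()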
   use pic_types, only: int32, dp, int64, sp
   use mpi, only: MPI_COMM_NULL, MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, &
                  MPI_INFO_NULL, MPI_UNDEFINED, MPI_INTEGER, MPI_INTEGER8, MPI_STATUS_SIZE, &
                  MPI_REQUEST_NULL, MPI_Comm_rank, MPI_Comm_size, MPI_Comm_dup, MPI_Barrier, &
                  MPI_Comm_split_type, MPI_Comm_split, MPI_Send, MPI_Recv, &
                  MPI_Isend, MPI_Irecv, MPI_Wait, MPI_Waitall, MPI_Test, &
                  MPI_Probe, MPI_Get_count, MPI_Iprobe, MPI_Comm_free, &
                  MPI_Abort, MPI_Allgather, MPI_Get_processor_name, MPI_DOUBLE_PRECISION, &
                  MPI_Bcast, MPI_Init, MPI_Init_thread, MPI_Query_thread, MPI_Finalize, MPI_LOGICAL, &
                  MPI_THREAD_SINGLE, MPI_THREAD_FUNNELED, MPI_THREAD_SERIALIZED, MPI_THREAD_MULTIPLE, &
                  MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_SOURCE, MPI_TAG, MPI_ERROR, MPI_MAX_PROCESSOR_NAME, &
                  MPI_WIN_NULL, MPI_Win_create, MPI_Win_create_dynamic, MPI_Win_free, &
                  MPI_Win_fence, MPI_Win_lock, MPI_Win_unlock, MPI_Get, MPI_Put, &
                  MPI_Win_lock_all, MPI_Win_unlock_all, MPI_Win_flush, MPI_Win_flush_all, &
                  MPI_Rget, MPI_Rput, MPI_Win_allocate, &
                  MPI_Accumulate, MPI_Fetch_and_op, MPI_Allreduce, MPI_ADDRESS_KIND, &
                  MPI_LOCK_SHARED, MPI_SUM, MPI_MIN, MPI_MAX, MPI_IN_PLACE, MPI_REAL
   implicit none
   private

   public :: comm_t, comm_world, comm_null
   public :: send, recv, isend, irecv
   public :: comm_isend_real_dp_array_n, comm_irecv_real_dp_array_n  ! Direct export for host_data blocks (nvhpc bug workaround)
   public :: comm_isend_real_sp_array_n, comm_irecv_real_sp_array_n  ! Single precision equivalents
   public :: comm_recv_real_dp_array_n, comm_recv_real_sp_array_n    ! Fixed-size blocking recv variants
   public :: comm_recv_integer_array_n, comm_recv_integer64_array_n
   public :: comm_send_real_dp_array_2d_n, comm_recv_real_dp_array_2d_n
   public :: comm_isend_real_dp_array_2d_n
   public :: comm_send_integer_array_2d_n, comm_recv_integer_array_2d_n
   public :: request_t, wait, waitall, test
   public :: iprobe, probe, abort_comm, allgather, get_processor_name, bcast
   public :: pic_mpi_init, pic_mpi_finalize, pic_mpi_query_thread_level
   public :: win_t, win_create, win_create_dynamic
#ifndef MPICH_PERLMUTTER
   public :: win_allocate
#endif
   public :: allreduce

   ! Export MPI constants needed by applications
   public :: MPI_Status, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_MAX_PROCESSOR_NAME
   public :: MPI_THREAD_SINGLE, MPI_THREAD_FUNNELED, MPI_THREAD_SERIALIZED, MPI_THREAD_MULTIPLE
   public :: MPI_SUM, MPI_MIN, MPI_MAX

   type :: MPI_Status
   !! MPI_Status wrapper type for legacy MPI compatibility
   !!
   !! This type mimics the mpi_f08 MPI_Status type interface,
   !! providing a consistent API between legacy and modern MPI versions
      integer :: MPI_SOURCE = 0 !! Source rank of received message
      integer :: MPI_TAG = 0 !! Tag of received message
      integer :: MPI_ERROR = 0 !! Error code
      integer :: internal(MPI_STATUS_SIZE) = 0 !! Raw copy of the legacy status array (implementation-defined layout)
   end type MPI_Status

   type :: request_t
   !! Request type for non-blocking MPI operations
   !!
   !! Wraps MPI request handles to provide object-oriented interface for
   !! non-blocking communication operations (isend, irecv)
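   !!
   !! Usage sketch (`buf`, `peer`, and `tag` are placeholder names):
   !!
   !!     type(request_t) :: req
   !!     call isend(comm, buf, peer, tag, req)
   !!     ! ... overlap computation with the transfer ...
   !!     call wait(req)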
      private
      integer :: m_request = MPI_REQUEST_NULL !! Internal MPI request handle (integer)
      logical :: is_valid = .false. !! Validity flag
   contains
      procedure :: is_null => request_is_null !! Check if request is null
      procedure :: get => request_get !! Get underlying MPI request handle
      procedure :: free => request_free !! Free the request
   end type request_t

   type :: win_t
   !! MPI-3 window type for one-sided communication (RMA), legacy version
   !!
   !! Wraps MPI window handles to provide an object-oriented interface for the
   !! Remote Memory Access (RMA) operations needed by DDI
      private
      integer :: m_win = MPI_WIN_NULL
      logical :: is_valid = .false.
   contains
      procedure :: is_null => win_is_null
      procedure :: get_handle => win_get_handle
      procedure :: fence => win_fence
      procedure :: lock => win_lock
      procedure :: unlock => win_unlock
      procedure :: lock_all => win_lock_all
      procedure :: unlock_all => win_unlock_all
      procedure :: flush => win_flush
      procedure :: flush_all => win_flush_all
      procedure :: get_dp => win_get_dp
      procedure :: put_dp => win_put_dp
      procedure :: rget_dp => win_rget_dp
      procedure :: rput_dp => win_rput_dp
      procedure :: accumulate_dp => win_accumulate_dp
      procedure :: get_sp => win_get_sp
      procedure :: put_sp => win_put_sp
      procedure :: rget_sp => win_rget_sp
      procedure :: rput_sp => win_rput_sp
      procedure :: accumulate_sp => win_accumulate_sp
      procedure :: get_i32 => win_get_i32
      procedure :: put_i32 => win_put_i32
      procedure :: rget_i32 => win_rget_i32
      procedure :: rput_i32 => win_rput_i32
      procedure :: accumulate_i32 => win_accumulate_i32
      procedure :: get_i64 => win_get_i64
      procedure :: put_i64 => win_put_i64
      procedure :: rget_i64 => win_rget_i64
      procedure :: rput_i64 => win_rput_i64
      procedure :: accumulate_i64 => win_accumulate_i64
      procedure :: fetch_and_add_i64 => win_fetch_and_add_i64
      procedure :: finalize => win_finalize
   end type win_t

   type :: comm_t
   !! MPI communicator wrapper type for legacy MPI
   !!
   !! Provides object-oriented interface to MPI communicators with
   !! type-bound procedures for common operations. Uses integer handles
   !! for compatibility with legacy MPI implementations.
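   !!
   !! Sketch of carving out a per-node (shared-memory) communicator;
   !! names are illustrative:
   !!
   !!     type(comm_t) :: world, node
   !!     world = comm_world()
   !!     node = world%split()             ! ranks sharing a node
   !!     if (node%leader()) then
   !!        ! one rank per node does the node-local work here
   !!     end if
   !!     call node%finalize()
   !!     call world%finalize()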
      private
      integer :: m_comm = MPI_COMM_NULL !! Internal MPI communicator (integer handle)
      integer(int32) :: m_rank = -1_int32 !! Cached rank in this communicator
      integer(int32) :: m_size = -1_int32 !! Cached size of this communicator
      logical :: is_valid = .false. !! Validity flag
   contains
      procedure :: rank => comm_rank !! Get rank in communicator
      procedure :: size => m_size_func !! Get size of communicator
      procedure :: leader => comm_leader !! Check if this rank is leader (rank 0)
      procedure :: is_null => comm_is_null !! Check if communicator is null
      procedure :: get => comm_get !! Get underlying MPI communicator handle

      procedure :: barrier => comm_barrier !! Synchronization barrier

      procedure :: split => comm_split_shared !! Split into shared memory communicators
      procedure :: split_by => comm_split_by_color !! Split communicator by color
      procedure :: discard_leader => comm_discard_leader !! Create communicator without leader
      procedure :: discard_to => comm_discard_to !! Create communicator with first N ranks
      procedure :: duplicate => comm_duplicate !! Duplicate communicator

      procedure :: finalize => comm_finalize !! Free communicator resources
   end type comm_t

   interface comm_world
      module procedure create_world_comm
   end interface

   interface comm_null
      module procedure create_null_comm
   end interface

   interface send
      module procedure :: comm_send_integer
      module procedure :: comm_send_integer_array
      module procedure :: comm_send_integer64
      module procedure :: comm_send_integer64_array
      module procedure :: comm_send_real_dp
      module procedure :: comm_send_real_dp_array
      module procedure :: comm_send_real_dp_array_2d
      module procedure :: comm_send_real_dp_array_2d_n
      module procedure :: comm_send_integer_array_2d_n
      module procedure :: comm_send_real_sp
      module procedure :: comm_send_real_sp_array
      module procedure :: comm_send_real_sp_array_2d
      module procedure :: comm_send_logical
   end interface send

   interface recv
      module procedure :: comm_recv_integer
      module procedure :: comm_recv_integer_array
      module procedure :: comm_recv_integer64
      module procedure :: comm_recv_integer64_array
      module procedure :: comm_recv_real_dp
      module procedure :: comm_recv_real_dp_array
      module procedure :: comm_recv_real_dp_array_2d
      module procedure :: comm_recv_real_sp
      module procedure :: comm_recv_real_sp_array
      module procedure :: comm_recv_real_sp_array_2d
      module procedure :: comm_recv_logical
      ! Fixed-size receive variants (count is positional, no probe-then-allocate);
      ! see the usage sketch after this interface block.
      module procedure :: comm_recv_integer_array_n
      module procedure :: comm_recv_integer64_array_n
      module procedure :: comm_recv_real_dp_array_n
      module procedure :: comm_recv_real_sp_array_n
      module procedure :: comm_recv_real_dp_array_2d_n
      module procedure :: comm_recv_integer_array_2d_n
   end interface recv
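
   ! Usage sketch contrasting the two recv flavours (`n`, `a`, and `b` are
   ! placeholders): the allocatable overloads probe first and allocate the
   ! buffer to the incoming message size, while the *_n overloads receive a
   ! caller-supplied count into a pre-allocated buffer.
   !
   !     real(dp), allocatable :: a(:)
   !     real(dp) :: b(n)
   !     call recv(comm, a, source, tag, status)   ! probe, allocate, receive
   !     call recv(comm, b, n, source, tag)        ! fixed-size receive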

   interface iprobe
      module procedure :: comm_iprobe
   end interface iprobe

   interface probe
      module procedure :: comm_probe
   end interface probe

   interface allgather
      module procedure :: comm_allgather_integer
   end interface allgather

   interface bcast
      module procedure :: comm_bcast_integer
      module procedure :: comm_bcast_integer64
      module procedure :: comm_bcast_real_dp
      module procedure :: comm_bcast_real_dp_array
      module procedure :: comm_bcast_real_sp
      module procedure :: comm_bcast_real_sp_array
   end interface bcast

   interface isend
      module procedure :: comm_isend_integer
      module procedure :: comm_isend_integer_array
      module procedure :: comm_isend_integer64
      module procedure :: comm_isend_integer64_array
      module procedure :: comm_isend_real_dp
      module procedure :: comm_isend_real_dp_array
      module procedure :: comm_isend_real_dp_array_2d
      module procedure :: comm_isend_real_dp_array_2d_n
      module procedure :: comm_isend_real_sp
      module procedure :: comm_isend_real_sp_array
      module procedure :: comm_isend_real_sp_array_2d
      module procedure :: comm_isend_logical
   end interface isend

   interface irecv
      module procedure :: comm_irecv_integer
      module procedure :: comm_irecv_integer_array
      module procedure :: comm_irecv_integer64
      module procedure :: comm_irecv_integer64_array
      module procedure :: comm_irecv_real_dp
      module procedure :: comm_irecv_real_dp_array
      module procedure :: comm_irecv_real_dp_array_2d
      module procedure :: comm_irecv_real_sp
      module procedure :: comm_irecv_real_sp_array
      module procedure :: comm_irecv_real_sp_array_2d
      module procedure :: comm_irecv_logical
   end interface irecv

   interface wait
      module procedure :: request_wait
   end interface wait

   interface waitall
      module procedure :: request_waitall
   end interface waitall

   interface test
      module procedure :: request_test
   end interface test

   interface win_create
      module procedure create_win_dp_array
   end interface win_create

   interface win_create_dynamic
      module procedure create_win_dynamic
   end interface win_create_dynamic

#ifndef MPICH_PERLMUTTER
   interface win_allocate
      module procedure create_win_allocate_dp_1d
      module procedure create_win_allocate_dp_2d
      module procedure create_win_allocate_sp_1d
      module procedure create_win_allocate_i32_1d
      module procedure create_win_allocate_i64_1d
   end interface win_allocate
#endif

   interface allreduce
      module procedure :: allreduce_dp
      module procedure :: allreduce_dp_array
      module procedure :: allreduce_sp
      module procedure :: allreduce_sp_array
      module procedure :: allreduce_i32
      module procedure :: allreduce_i32_array
      module procedure :: allreduce_dp_to
      module procedure :: allreduce_dp_array_to
      module procedure :: allreduce_sp_to
      module procedure :: allreduce_sp_array_to
   end interface allreduce

contains

   pure function status_array_to_type(status_array) result(status_type)
   !! Convert a legacy integer status array to the MPI_Status wrapper type
   !!
   !! Internal helper; the named fields are extracted through the MPI_SOURCE,
   !! MPI_TAG, and MPI_ERROR index constants because the layout of the legacy
   !! status array is implementation-defined.
      integer, intent(in) :: status_array(MPI_STATUS_SIZE)
      type(MPI_Status) :: status_type

      status_type%internal = status_array
      status_type%MPI_SOURCE = status_array(MPI_SOURCE)
      status_type%MPI_TAG = status_array(MPI_TAG)
      status_type%MPI_ERROR = status_array(MPI_ERROR)
   end function status_array_to_type

   ! Helper function to convert an MPI_Status type back to a legacy integer array
   pure function status_type_to_array(status_type) result(status_array)
      type(MPI_Status), intent(in) :: status_type
      integer :: status_array(MPI_STATUS_SIZE)

      status_array = status_type%internal
      status_array(MPI_SOURCE) = status_type%MPI_SOURCE
      status_array(MPI_TAG) = status_type%MPI_TAG
      status_array(MPI_ERROR) = status_type%MPI_ERROR
   end function status_type_to_array

   function create_comm_from_mpi(mpi_comm_in) result(comm)
      integer, intent(in) :: mpi_comm_in
      type(comm_t) :: comm
      integer(int32) :: ierr

      comm%m_comm = mpi_comm_in
      if (mpi_comm_in /= MPI_COMM_NULL) then
         call MPI_Comm_rank(comm%m_comm, comm%m_rank, ierr)
         call MPI_Comm_size(comm%m_comm, comm%m_size, ierr)
         comm%is_valid = .true.
      else
         comm%is_valid = .false.
      end if

   end function create_comm_from_mpi

   function create_world_comm() result(comm)
      type(comm_t) :: comm
      integer :: dup_comm
      integer(int32) :: ierr

      call MPI_Comm_dup(MPI_COMM_WORLD, dup_comm, ierr)
      comm = create_comm_from_mpi(dup_comm)

   end function create_world_comm

   function create_null_comm() result(comm)
      type(comm_t) :: comm

      ! Explicitly initialize to null/invalid state
      comm%m_comm = MPI_COMM_NULL
      comm%m_rank = -1_int32
      comm%m_size = -1_int32
      comm%is_valid = .false.
   end function create_null_comm

   pure function comm_rank(this) result(rank)
      class(comm_t), intent(in) :: this
      integer :: rank
      rank = this%m_rank
   end function comm_rank

   pure function m_size_func(this) result(size)
      class(comm_t), intent(in) :: this
      integer :: size
      size = this%m_size
   end function m_size_func

   pure function comm_leader(this) result(is_leader)
      class(comm_t), intent(in) :: this
      logical :: is_leader
      is_leader = (this%m_rank == 0_int32)
   end function comm_leader

   pure function comm_is_null(this) result(is_null)
      class(comm_t), intent(in) :: this
      logical :: is_null
      is_null = .not. this%is_valid
   end function comm_is_null

   function comm_get(this) result(mpi_comm_out)
      class(comm_t), intent(in) :: this
      integer :: mpi_comm_out

      if (.not. this%is_valid) then
         error stop "Cannot get MPI_Comm from null Comm"
      end if
      mpi_comm_out = this%m_comm
   end function comm_get

   subroutine comm_barrier(this)
      class(comm_t), intent(in) :: this
      integer(int32) :: ierr
      call MPI_Barrier(this%m_comm, ierr)
   end subroutine comm_barrier

   function comm_split_shared(this) result(new_comm)
      class(comm_t), intent(in) :: this
      type(comm_t) :: new_comm
      integer :: mpi_comm_new
      integer(int32) :: ierr

      if (.not. this%is_valid) then
         new_comm = comm_null()
         return
      end if

      call MPI_Comm_split_type(this%get(), MPI_COMM_TYPE_SHARED, 0_int32, MPI_INFO_NULL, mpi_comm_new, ierr)
      new_comm = create_comm_from_mpi(mpi_comm_new)
   end function comm_split_shared

   function comm_split_by_color(this, color) result(new_comm)
      class(comm_t), intent(in) :: this
      integer, intent(in) :: color
      type(comm_t) :: new_comm
      integer :: mpi_comm_new
      integer(int32) :: ierr

      if (.not. this%is_valid) then
         new_comm = comm_null()
         return
      end if

      call MPI_Comm_split(this%m_comm, color, 0_int32, mpi_comm_new, ierr)
      new_comm = create_comm_from_mpi(mpi_comm_new)
   end function comm_split_by_color

   function comm_discard_leader(this) result(new_comm)
      class(comm_t), intent(in) :: this
      type(comm_t) :: new_comm
      integer(int32) :: color

      if (.not. this%is_valid) then
         new_comm = comm_null()
         return
      end if

      if (this%rank() == 0_int32) then
         color = MPI_UNDEFINED
      else
         color = 0_int32
      end if
      new_comm = this%split_by(color)
   end function comm_discard_leader

   function comm_discard_to(this, num_ranks) result(new_comm)
      class(comm_t), intent(in) :: this
      integer, intent(in) :: num_ranks
      type(comm_t) :: new_comm
      integer(int32) :: color

      if (.not. this%is_valid) then
         new_comm = comm_null()
         return
      end if

      if (this%rank() < num_ranks) then
         color = 0_int32
      else
         color = MPI_UNDEFINED
      end if
      new_comm = this%split_by(color)
   end function comm_discard_to

   function comm_duplicate(this) result(new_comm)
      class(comm_t), intent(in) :: this
      type(comm_t) :: new_comm
      integer :: mpi_comm_new
      integer(int32) :: ierr

      if (.not. this%is_valid) then
         new_comm = comm_null()
         return
      end if

      call MPI_Comm_dup(this%m_comm, mpi_comm_new, ierr)
      new_comm = create_comm_from_mpi(mpi_comm_new)
   end function comm_duplicate

   subroutine comm_send_integer(comm, data, dest, tag)
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, 1_int32, MPI_INTEGER, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_integer

   subroutine comm_send_integer_array(comm, data, dest, tag)
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: data(:)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, size(data), MPI_INTEGER, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_integer_array

   subroutine comm_send_integer64(comm, data, dest, tag)
   !! Blocking send of an integer64 to specified destination
      type(comm_t), intent(in) :: comm
      integer(int64), intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, 1_int32, MPI_INTEGER8, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_integer64

   subroutine comm_send_integer64_array(comm, data, dest, tag)
   !! Blocking send of an integer64 array to specified destination
      type(comm_t), intent(in) :: comm
      integer(int64), intent(in) :: data(:)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, size(data), MPI_INTEGER8, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_integer64_array

   subroutine comm_send_real_dp(comm, data, dest, tag)
      type(comm_t), intent(in) :: comm
      real(dp), intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, 1_int32, MPI_DOUBLE_PRECISION, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_real_dp

   subroutine comm_send_real_dp_array(comm, data, dest, tag)
      type(comm_t), intent(in) :: comm
      real(dp), intent(in) :: data(:)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, size(data), MPI_DOUBLE_PRECISION, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_real_dp_array

   subroutine comm_send_real_dp_array_2d(comm, data, dest, tag)
      type(comm_t), intent(in) :: comm
      real(dp), intent(in) :: data(:, :)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr, dim1, dim2

      ! Send dimensions first
      dim1 = size(data, 1)
      dim2 = size(data, 2)
      call MPI_Send(dim1, 1_int32, MPI_INTEGER, dest, tag, comm%m_comm, ierr)
      call MPI_Send(dim2, 1_int32, MPI_INTEGER, dest, tag, comm%m_comm, ierr)

      ! Send data
      call MPI_Send(data, size(data), MPI_DOUBLE_PRECISION, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_real_dp_array_2d

   subroutine comm_send_real_dp_array_2d_n(comm, data, count, dest, tag)
      !! Blocking send of a contiguous 2D dp array using an explicit
      !! count.  No dim-prefix protocol — caller and receiver agree on
      !! shape externally.
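      !!
      !! Pairing sketch (the receiver pre-sizes `buf` and passes the same count):
      !!
      !!     call send(comm, a, size(a), dest, tag)      ! explicit-count 2D send
      !!     call recv(comm, buf, size(buf), src, tag)   ! matching _n receive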
      type(comm_t), intent(in) :: comm
      real(dp), intent(in) :: data(:, :)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, count, MPI_DOUBLE_PRECISION, dest, tag, &
                    comm%m_comm, ierr)
   end subroutine comm_send_real_dp_array_2d_n

   subroutine comm_send_integer_array_2d_n(comm, data, count, dest, tag)
      !! Blocking send of a contiguous 2D int32 array with explicit count.
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: data(:, :)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, count, MPI_INTEGER, dest, tag, &
                    comm%m_comm, ierr)
   end subroutine comm_send_integer_array_2d_n

   subroutine comm_send_real_sp(comm, data, dest, tag)
      type(comm_t), intent(in) :: comm
      real(sp), intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, 1_int32, MPI_REAL, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_real_sp

   subroutine comm_send_real_sp_array(comm, data, dest, tag)
      type(comm_t), intent(in) :: comm
      real(sp), intent(in) :: data(:)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, size(data), MPI_REAL, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_real_sp_array

   subroutine comm_send_real_sp_array_2d(comm, data, dest, tag)
      type(comm_t), intent(in) :: comm
      real(sp), intent(in) :: data(:, :)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr, dim1, dim2

      ! Send dimensions first
      dim1 = size(data, 1)
      dim2 = size(data, 2)
      call MPI_Send(dim1, 1_int32, MPI_INTEGER, dest, tag, comm%m_comm, ierr)
      call MPI_Send(dim2, 1_int32, MPI_INTEGER, dest, tag, comm%m_comm, ierr)

      ! Send data
      call MPI_Send(data, size(data), MPI_REAL, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_real_sp_array_2d

   subroutine comm_send_logical(comm, data, dest, tag)
      type(comm_t), intent(in) :: comm
      logical, intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      integer(int32) :: ierr

      call MPI_Send(data, 1_int32, MPI_LOGICAL, dest, tag, comm%m_comm, ierr)
   end subroutine comm_send_logical

   subroutine comm_recv_integer(comm, data, source, tag, status)
      type(comm_t), intent(in) :: comm
      integer(int32), intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out), optional :: status
      integer(int32) :: ierr
      integer :: stat(MPI_STATUS_SIZE)

      call MPI_Recv(data, 1_int32, MPI_INTEGER, source, tag, comm%m_comm, stat, ierr)
      if (present(status)) status = status_array_to_type(stat)
   end subroutine comm_recv_integer

   subroutine comm_recv_integer_array(comm, data, source, tag, status)
      type(comm_t), intent(in) :: comm
      integer(int32), allocatable, intent(out) :: data(:)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out) :: status
      integer(int32) :: count
      integer(int32) :: ierr
      integer :: stat(MPI_STATUS_SIZE)

      ! First probe to get message size
      call MPI_Probe(source, tag, comm%m_comm, stat, ierr)
      call MPI_Get_count(stat, MPI_INTEGER, count, ierr)

      ! Allocate and receive
      allocate (data(count))
      call MPI_Recv(data, count, MPI_INTEGER, source, tag, comm%m_comm, stat, ierr)

      ! Convert status
      status = status_array_to_type(stat)
   end subroutine comm_recv_integer_array

   subroutine comm_recv_integer64(comm, data, source, tag, status)
   !! Blocking receive of an integer64 from specified source
      type(comm_t), intent(in) :: comm
      integer(int64), intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out), optional :: status
      integer(int32) :: ierr
      integer :: stat(MPI_STATUS_SIZE)

      call MPI_Recv(data, 1_int32, MPI_INTEGER8, source, tag, comm%m_comm, stat, ierr)
      if (present(status)) status = status_array_to_type(stat)
   end subroutine comm_recv_integer64

   subroutine comm_recv_integer64_array(comm, data, source, tag, status)
   !! Blocking receive of an integer64 array from specified source
      type(comm_t), intent(in) :: comm
      integer(int64), allocatable, intent(out) :: data(:)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out) :: status
      integer(int32) :: count
      integer(int32) :: ierr
      integer :: stat(MPI_STATUS_SIZE)

      ! First probe to get message size
      call MPI_Probe(source, tag, comm%m_comm, stat, ierr)
      call MPI_Get_count(stat, MPI_INTEGER8, count, ierr)

      ! Allocate and receive
      allocate (data(count))
      call MPI_Recv(data, count, MPI_INTEGER8, source, tag, comm%m_comm, stat, ierr)

      ! Convert status
      status = status_array_to_type(stat)
   end subroutine comm_recv_integer64_array

   subroutine comm_recv_real_dp(comm, data, source, tag, status)
      type(comm_t), intent(in) :: comm
      real(dp), intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out), optional :: status
      integer(int32) :: ierr
      integer :: stat(MPI_STATUS_SIZE)

      call MPI_Recv(data, 1_int32, MPI_DOUBLE_PRECISION, source, tag, comm%m_comm, stat, ierr)
      if (present(status)) status = status_array_to_type(stat)
   end subroutine comm_recv_real_dp

   subroutine comm_recv_real_dp_array(comm, data, source, tag, status)
      type(comm_t), intent(in) :: comm
      real(dp), allocatable, intent(out) :: data(:)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out) :: status
      integer(int32) :: count
      integer(int32) :: ierr
      integer :: stat(MPI_STATUS_SIZE)

      ! First probe to get message size
      call MPI_Probe(source, tag, comm%m_comm, stat, ierr)
      call MPI_Get_count(stat, MPI_DOUBLE_PRECISION, count, ierr)

      ! Allocate and receive
      allocate (data(count))
      call MPI_Recv(data, count, MPI_DOUBLE_PRECISION, source, tag, comm%m_comm, stat, ierr)

      ! Convert status
      status = status_array_to_type(stat)
   end subroutine comm_recv_real_dp_array

   subroutine comm_recv_real_dp_array_2d(comm, data, source, tag, status)
      !! Receive a 2D real(dp) array; the dimensions are received first and the
      !! array is allocated to match if it is not already allocated
      type(comm_t), intent(in) :: comm
      real(dp), intent(inout), allocatable :: data(:, :)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out) :: status
      integer(int32) :: ierr, count, dim1, dim2
      integer :: stat(MPI_STATUS_SIZE)

      ! Receive dimensions first
      call MPI_Recv(dim1, 1_int32, MPI_INTEGER, source, tag, comm%m_comm, stat, ierr)
      call MPI_Recv(dim2, 1_int32, MPI_INTEGER, source, tag, comm%m_comm, stat, ierr)

      ! Allocate array with received dimensions
      if (.not. allocated(data)) then
         allocate (data(dim1, dim2))
      end if

      ! Receive data
      count = dim1*dim2
      call MPI_Recv(data, count, MPI_DOUBLE_PRECISION, source, tag, comm%m_comm, stat, ierr)

      ! Convert status
      status = status_array_to_type(stat)
   end subroutine comm_recv_real_dp_array_2d

   subroutine comm_recv_real_dp_array_2d_n(comm, data, count, source, tag, status)
      !! Blocking recv of a contiguous 2D dp array with explicit count.
      type(comm_t), intent(in)    :: comm
      real(dp), intent(out)   :: data(:, :)
      integer(int32), intent(in)    :: count
      integer(int32), intent(in)    :: source
      integer(int32), intent(in)    :: tag
      type(MPI_Status), intent(out), optional :: status
      integer :: stat(MPI_STATUS_SIZE)
      integer(int32) :: ierr

      call MPI_Recv(data, count, MPI_DOUBLE_PRECISION, source, tag, &
                    comm%m_comm, stat, ierr)
      if (present(status)) status = status_array_to_type(stat)
   end subroutine comm_recv_real_dp_array_2d_n

   subroutine comm_recv_integer_array_2d_n(comm, data, count, source, tag, status)
      !! Blocking recv of a contiguous 2D int32 array with explicit count.
      type(comm_t), intent(in)    :: comm
      integer(int32), intent(out)   :: data(:, :)
      integer(int32), intent(in)    :: count
      integer(int32), intent(in)    :: source
      integer(int32), intent(in)    :: tag
      type(MPI_Status), intent(out), optional :: status
      integer :: stat(MPI_STATUS_SIZE)
      integer(int32) :: ierr

      call MPI_Recv(data, count, MPI_INTEGER, source, tag, &
                    comm%m_comm, stat, ierr)
      if (present(status)) status = status_array_to_type(stat)
   end subroutine comm_recv_integer_array_2d_n

   subroutine comm_recv_real_sp(comm, data, source, tag, status)
      type(comm_t), intent(in) :: comm
      real(sp), intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out), optional :: status
      integer(int32) :: ierr
      integer :: stat(MPI_STATUS_SIZE)

      call MPI_Recv(data, 1_int32, MPI_REAL, source, tag, comm%m_comm, stat, ierr)
      if (present(status)) status = status_array_to_type(stat)
   end subroutine comm_recv_real_sp

   subroutine comm_recv_real_sp_array(comm, data, source, tag, status)
      type(comm_t), intent(in) :: comm
      real(sp), allocatable, intent(out) :: data(:)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out) :: status
      integer(int32) :: count
      integer(int32) :: ierr
      integer :: stat(MPI_STATUS_SIZE)

      ! First probe to get message size
      call MPI_Probe(source, tag, comm%m_comm, stat, ierr)
      call MPI_Get_count(stat, MPI_REAL, count, ierr)

      ! Allocate and receive
      allocate (data(count))
      call MPI_Recv(data, count, MPI_REAL, source, tag, comm%m_comm, stat, ierr)

      ! Convert status
      status = status_array_to_type(stat)
   end subroutine comm_recv_real_sp_array

   subroutine comm_recv_real_sp_array_2d(comm, data, source, tag, status)
      !! Receive a 2D real(sp) array; the dimensions are received first and the
      !! array is allocated to match if it is not already allocated
      type(comm_t), intent(in) :: comm
      real(sp), intent(inout), allocatable :: data(:, :)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out) :: status
      integer(int32) :: ierr, count, dim1, dim2
      integer :: stat(MPI_STATUS_SIZE)

      ! Receive dimensions first
      call MPI_Recv(dim1, 1_int32, MPI_INTEGER, source, tag, comm%m_comm, stat, ierr)
      call MPI_Recv(dim2, 1_int32, MPI_INTEGER, source, tag, comm%m_comm, stat, ierr)

      ! Allocate array with received dimensions
      if (.not. allocated(data)) then
         allocate (data(dim1, dim2))
      end if

      ! Receive data
      count = dim1*dim2
      call MPI_Recv(data, count, MPI_REAL, source, tag, comm%m_comm, stat, ierr)

      ! Convert status
      status = status_array_to_type(stat)
   end subroutine comm_recv_real_sp_array_2d

   subroutine comm_recv_logical(comm, data, source, tag, status)
      type(comm_t), intent(in) :: comm
      logical, intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out), optional :: status
      integer :: stat(MPI_STATUS_SIZE)
      integer(int32) :: ierr

      call MPI_Recv(data, 1_int32, MPI_LOGICAL, source, tag, comm%m_comm, stat, ierr)
      if (present(status)) status = status_array_to_type(stat)
   end subroutine comm_recv_logical

   ! ========================================================================
   ! Fixed-size blocking receive variants (legacy backend)
   ! ========================================================================

   subroutine comm_recv_real_dp_array_n(comm, data, count, source, tag, status)
      !! Blocking receive into a pre-allocated double-precision array.
      type(comm_t), intent(in) :: comm
      real(dp), intent(out) :: data(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out), optional :: status
      integer :: stat(MPI_STATUS_SIZE)
      integer(int32) :: ierr

      call MPI_Recv(data, count, MPI_DOUBLE_PRECISION, source, tag, &
                    comm%m_comm, stat, ierr)
      if (present(status)) status = status_array_to_type(stat)
   end subroutine comm_recv_real_dp_array_n

   subroutine comm_recv_real_sp_array_n(comm, data, count, source, tag, status)
      !! Blocking receive into a pre-allocated single-precision array.
      type(comm_t), intent(in) :: comm
      real(sp), intent(out) :: data(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out), optional :: status
      integer :: stat(MPI_STATUS_SIZE)
      integer(int32) :: ierr

      call MPI_Recv(data, count, MPI_REAL, source, tag, &
                    comm%m_comm, stat, ierr)
      if (present(status)) status = status_array_to_type(stat)
   end subroutine comm_recv_real_sp_array_n

   subroutine comm_recv_integer_array_n(comm, data, count, source, tag, status)
      !! Blocking receive into a pre-allocated int32 array.
      type(comm_t), intent(in) :: comm
      integer(int32), intent(out) :: data(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out), optional :: status
      integer :: stat(MPI_STATUS_SIZE)
      integer(int32) :: ierr

      call MPI_Recv(data, count, MPI_INTEGER, source, tag, &
                    comm%m_comm, stat, ierr)
      if (present(status)) status = status_array_to_type(stat)
   end subroutine comm_recv_integer_array_n

   subroutine comm_recv_integer64_array_n(comm, data, count, source, tag, status)
      !! Blocking receive into a pre-allocated int64 array.
      type(comm_t), intent(in) :: comm
      integer(int64), intent(out) :: data(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out), optional :: status
      integer :: stat(MPI_STATUS_SIZE)
      integer(int32) :: ierr

      call MPI_Recv(data, count, MPI_INTEGER8, source, tag, &
                    comm%m_comm, stat, ierr)
      if (present(status)) status = status_array_to_type(stat)
   end subroutine comm_recv_integer64_array_n

   subroutine comm_iprobe(comm, source, tag, message_pending, status)
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      logical, intent(out) :: message_pending
      type(MPI_Status), intent(out) :: status
      integer(int32) :: ierr
      integer :: status_array(MPI_STATUS_SIZE)

      call MPI_Iprobe(source, tag, comm%m_comm, message_pending, status_array, ierr)
      ! Convert legacy status array to MPI_Status type
      status = status_array_to_type(status_array)
   end subroutine comm_iprobe

   subroutine comm_probe(comm, source, tag, status)
      !! Blocking probe for incoming messages — caller reads
      !! `status%MPI_SOURCE` / `status%MPI_TAG` to dispatch the matching
      !! `recv`.  Mirrors the f08 backend's `comm_probe`.
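      !!
      !! Dispatch sketch (`buf` is a placeholder; any-source/any-tag probing):
      !!
      !!     call probe(comm, MPI_ANY_SOURCE, MPI_ANY_TAG, st)
      !!     call recv(comm, buf, st%MPI_SOURCE, st%MPI_TAG)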
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(MPI_Status), intent(out) :: status
      integer(int32) :: ierr
      integer :: status_array(MPI_STATUS_SIZE)

      call MPI_Probe(source, tag, comm%m_comm, status_array, ierr)
      status = status_array_to_type(status_array)
   end subroutine comm_probe

   subroutine comm_finalize(this)
      class(comm_t), intent(inout) :: this
      integer(int32) :: ierr

      if (this%is_valid .and. this%m_comm /= MPI_COMM_NULL) then
         call MPI_Comm_free(this%m_comm, ierr)
         this%is_valid = .false.
         this%m_comm = MPI_COMM_NULL
      end if
   end subroutine comm_finalize

   subroutine abort_comm(comm, errorcode)
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: errorcode
      integer(int32) :: ierr

      call MPI_Abort(comm%m_comm, errorcode, ierr)
   end subroutine abort_comm

   subroutine comm_allgather_integer(comm, sendbuf, recvbuf)
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: sendbuf
      integer(int32), intent(out) :: recvbuf(:)
      integer(int32) :: ierr

      call MPI_Allgather(sendbuf, 1_int32, MPI_INTEGER, recvbuf, 1_int32, MPI_INTEGER, comm%m_comm, ierr)
   end subroutine comm_allgather_integer

   subroutine comm_bcast_integer(comm, buffer, count, root)
      type(comm_t), intent(in) :: comm
      integer(int32), intent(inout) :: buffer
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: root
      integer(int32) :: ierr

      call MPI_Bcast(buffer, count, MPI_INTEGER, root, comm%m_comm, ierr)
   end subroutine comm_bcast_integer

   subroutine comm_bcast_integer64(comm, buffer, count, root)
   !! Broadcasts integer64 data from root process to all processes in communicator
      type(comm_t), intent(in) :: comm
      integer(int64), intent(inout) :: buffer
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: root
      integer(int32) :: ierr

      call MPI_Bcast(buffer, count, MPI_INTEGER8, root, comm%m_comm, ierr)
   end subroutine comm_bcast_integer64

   subroutine comm_bcast_real_dp(comm, buffer, count, root)
      !! Broadcasts double precision data from root process to all processes in communicator
      type(comm_t), intent(in) :: comm
      real(dp), intent(inout) :: buffer
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: root
      integer(int32) :: ierr

      call MPI_Bcast(buffer, count, MPI_DOUBLE_PRECISION, root, comm%m_comm, ierr)
   end subroutine comm_bcast_real_dp

   subroutine comm_bcast_real_dp_array(comm, buffer, count, root)
      !! Broadcasts double precision array from root process to all processes in communicator
      type(comm_t), intent(in) :: comm
      real(dp), intent(inout) :: buffer(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: root
      integer(int32) :: ierr

      call MPI_Bcast(buffer, count, MPI_DOUBLE_PRECISION, root, comm%m_comm, ierr)
   end subroutine comm_bcast_real_dp_array

   subroutine comm_bcast_real_sp(comm, buffer, count, root)
      !! Broadcasts single-precision data from root process to all processes in communicator
      type(comm_t), intent(in) :: comm
      real(sp), intent(inout) :: buffer
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: root
      integer(int32) :: ierr

      call MPI_Bcast(buffer, count, MPI_REAL, root, comm%m_comm, ierr)
   end subroutine comm_bcast_real_sp

   subroutine comm_bcast_real_sp_array(comm, buffer, count, root)
      !! Broadcasts single-precision array from root process to all processes in communicator
      type(comm_t), intent(in) :: comm
      real(sp), intent(inout) :: buffer(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: root
      integer(int32) :: ierr

      call MPI_Bcast(buffer, count, MPI_REAL, root, comm%m_comm, ierr)
   end subroutine comm_bcast_real_sp_array

   subroutine get_processor_name(name, namelen)
      character(len=*), intent(inout) :: name
      integer(int32), intent(out) :: namelen
      integer(int32) :: ierr

      call MPI_Get_processor_name(name, namelen, ierr)
   end subroutine get_processor_name

   subroutine pic_mpi_init(requested_thread_level, provided_thread_level)
      !! Initialize MPI environment with optional threading support
      !!
      !! If no thread level is requested, uses MPI_THREAD_FUNNELED by default
      !! to allow OpenMP threading in compute_mbe_energy and similar functions.
      !!
      !! Thread levels:
      !!   MPI_THREAD_SINGLE: No threading support
      !!   MPI_THREAD_FUNNELED: Only main thread makes MPI calls (good for OpenMP)
      !!   MPI_THREAD_SERIALIZED: Multiple threads can make MPI calls, but not simultaneously
      !!   MPI_THREAD_MULTIPLE: Full thread safety
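      !!
      !! Example (sketch): request full thread support and verify what was granted:
      !!
      !!     integer(int32) :: provided
      !!     call pic_mpi_init(MPI_THREAD_MULTIPLE, provided)
      !!     if (provided < MPI_THREAD_MULTIPLE) error stop "thread support unavailable"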
      integer(int32), intent(in), optional :: requested_thread_level
      integer(int32), intent(out), optional :: provided_thread_level
      integer(int32) :: ierr, requested, provided

      ! Default to FUNNELED for OpenMP compatibility
      if (present(requested_thread_level)) then
         requested = requested_thread_level
      else
         requested = MPI_THREAD_FUNNELED
      end if

      call MPI_Init_thread(requested, provided, ierr)

      ! Return the provided level if requested
      if (present(provided_thread_level)) then
         provided_thread_level = provided
      end if

      ! Warn if we didn't get what we asked for
      if (provided < requested .and. requested /= MPI_THREAD_SINGLE) then
         if (requested == MPI_THREAD_FUNNELED) then
            write (*, '(a)') "Warning: MPI_THREAD_FUNNELED requested but not provided."
            write (*, '(a)') "OpenMP threading in compute functions may not work correctly."
         else if (requested == MPI_THREAD_SERIALIZED) then
            write (*, '(a)') "Warning: MPI_THREAD_SERIALIZED requested but not provided."
         else if (requested == MPI_THREAD_MULTIPLE) then
            write (*, '(a)') "Warning: MPI_THREAD_MULTIPLE requested but not provided."
         end if
      end if
   end subroutine pic_mpi_init

   function pic_mpi_query_thread_level() result(thread_level)
      !! Query the current MPI thread support level
      integer(int32) :: thread_level
      integer(int32) :: ierr

      call MPI_Query_thread(thread_level, ierr)
   end function pic_mpi_query_thread_level

   subroutine pic_mpi_finalize(ierr)
      !! Finalize MPI environment
      integer(int32), optional, intent(out) :: ierr
      integer(int32) :: ierr_local
      call MPI_Finalize(ierr_local)
      if (present(ierr)) ierr = ierr_local
   end subroutine pic_mpi_finalize

   ! ========================================================================
   ! Request type methods
   ! ========================================================================

   pure function request_is_null(this) result(is_null)
      class(request_t), intent(in) :: this
      logical :: is_null
      is_null = .not. this%is_valid
   end function request_is_null

   function request_get(this) result(mpi_request_out)
      class(request_t), intent(in) :: this
      integer :: mpi_request_out

      if (.not. this%is_valid) then
         error stop "Cannot get MPI_Request from null request"
      end if
      mpi_request_out = this%m_request
   end function request_get

   subroutine request_free(this)
      class(request_t), intent(inout) :: this
      this%m_request = MPI_REQUEST_NULL
      this%is_valid = .false.
   end subroutine request_free

   ! ========================================================================
   ! Non-blocking send operations
   ! ========================================================================

   subroutine comm_isend_integer(comm, data, dest, tag, request)
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, 1_int32, MPI_INTEGER, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_integer

   subroutine comm_isend_integer_array(comm, data, dest, tag, request)
      type(comm_t), intent(in) :: comm
      integer(int32), intent(in) :: data(:)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, size(data), MPI_INTEGER, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_integer_array

   subroutine comm_isend_integer64(comm, data, dest, tag, request)
   !! Initiates a non-blocking send operation of an integer64
      type(comm_t), intent(in) :: comm
      integer(int64), intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, 1_int32, MPI_INTEGER8, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_integer64

   subroutine comm_isend_integer64_array(comm, data, dest, tag, request)
   !! Initiates a non-blocking send operation of an integer64 array
      type(comm_t), intent(in) :: comm
      integer(int64), intent(in) :: data(:)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, size(data), MPI_INTEGER8, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_integer64_array

   subroutine comm_isend_real_dp(comm, data, dest, tag, request)
      type(comm_t), intent(in) :: comm
      real(dp), intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, 1_int32, MPI_DOUBLE_PRECISION, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_real_dp

   subroutine comm_isend_real_dp_array(comm, data, dest, tag, request)
      type(comm_t), intent(in) :: comm
      real(dp), intent(in) :: data(:)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, size(data), MPI_DOUBLE_PRECISION, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_real_dp_array

   subroutine comm_isend_real_dp_array_2d(comm, data, dest, tag, request)
      type(comm_t), intent(in) :: comm
      real(dp), intent(in) :: data(:, :)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr, dim1, dim2

      ! Send dimensions first (blocking - simple approach)
      dim1 = size(data, 1)
      dim2 = size(data, 2)
      call MPI_Send(dim1, 1_int32, MPI_INTEGER, dest, tag, comm%m_comm, ierr)
      call MPI_Send(dim2, 1_int32, MPI_INTEGER, dest, tag, comm%m_comm, ierr)

      ! Send data (non-blocking)
      call MPI_Isend(data, size(data), MPI_DOUBLE_PRECISION, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_real_dp_array_2d

   subroutine comm_isend_real_dp_array_2d_n(comm, data, count, dest, tag, request)
      !! Non-blocking send of a contiguous 2D dp array with explicit
      !! count.  No dim-prefix protocol; pairs with
      !! `comm_recv_real_dp_array_2d_n` on the receive side.
      type(comm_t), intent(in)  :: comm
      real(dp), intent(in)  :: data(:, :)
      integer(int32), intent(in)  :: count
      integer(int32), intent(in)  :: dest
      integer(int32), intent(in)  :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, count, MPI_DOUBLE_PRECISION, dest, tag, &
                     comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_real_dp_array_2d_n

   subroutine comm_isend_logical(comm, data, dest, tag, request)
      type(comm_t), intent(in) :: comm
      logical, intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, 1_int32, MPI_LOGICAL, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_logical

   subroutine comm_isend_real_dp_array_n(comm, data, count, dest, tag, request)
      !! Non-blocking send with explicit count (for device pointers in host_data blocks)
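      !!
      !! A minimal OpenACC sketch (assumes `buf` is device-resident and `n`, `req`
      !! are set up by the caller):
      !!
      !!     !$acc host_data use_device(buf)
      !!     call comm_isend_real_dp_array_n(comm, buf, n, dest, tag, req)
      !!     !$acc end host_data
      !!     call wait(req)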
      type(comm_t), intent(in) :: comm
      real(dp) :: data(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, count, MPI_DOUBLE_PRECISION, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_real_dp_array_n

   subroutine comm_isend_real_sp(comm, data, dest, tag, request)
      type(comm_t), intent(in) :: comm
      real(sp), intent(in) :: data
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, 1_int32, MPI_REAL, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_real_sp

   subroutine comm_isend_real_sp_array(comm, data, dest, tag, request)
      type(comm_t), intent(in) :: comm
      real(sp), intent(in) :: data(:)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, size(data), MPI_REAL, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_real_sp_array

   subroutine comm_isend_real_sp_array_2d(comm, data, dest, tag, request)
      type(comm_t), intent(in) :: comm
      real(sp), intent(in) :: data(:, :)
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr, dim1, dim2

      ! Send dimensions first (blocking - simple approach)
      dim1 = size(data, 1)
      dim2 = size(data, 2)
      call MPI_Send(dim1, 1_int32, MPI_INTEGER, dest, tag, comm%m_comm, ierr)
      call MPI_Send(dim2, 1_int32, MPI_INTEGER, dest, tag, comm%m_comm, ierr)

      ! Send data (non-blocking)
      call MPI_Isend(data, size(data), MPI_REAL, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_real_sp_array_2d

   subroutine comm_isend_real_sp_array_n(comm, data, count, dest, tag, request)
      !! Non-blocking send with explicit count for single-precision (for device pointers in host_data blocks)
      type(comm_t), intent(in) :: comm
      real(sp) :: data(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: dest
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Isend(data, count, MPI_REAL, dest, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_isend_real_sp_array_n

   ! ========================================================================
   ! Non-blocking receive operations
   ! ========================================================================

   subroutine comm_irecv_integer(comm, data, source, tag, request)
      type(comm_t), intent(in) :: comm
      integer(int32), intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, 1_int32, MPI_INTEGER, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_integer

   subroutine comm_irecv_integer_array(comm, data, source, tag, request)
      type(comm_t), intent(in) :: comm
      integer(int32), intent(out) :: data(:)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, size(data), MPI_INTEGER, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_integer_array

   subroutine comm_irecv_integer64(comm, data, source, tag, request)
   !! Initiates a non-blocking receive operation of an integer64
      type(comm_t), intent(in) :: comm
      integer(int64), intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, 1_int32, MPI_INTEGER8, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_integer64

   subroutine comm_irecv_integer64_array(comm, data, source, tag, request)
   !! Initiates a non-blocking receive operation of an integer64 array
      type(comm_t), intent(in) :: comm
      integer(int64), intent(out) :: data(:)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, size(data), MPI_INTEGER8, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_integer64_array

   subroutine comm_irecv_real_dp(comm, data, source, tag, request)
      type(comm_t), intent(in) :: comm
      real(dp), intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, 1_int32, MPI_DOUBLE_PRECISION, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_real_dp

   subroutine comm_irecv_real_dp_array(comm, data, source, tag, request)
      type(comm_t), intent(in) :: comm
      real(dp), intent(out) :: data(:)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, size(data), MPI_DOUBLE_PRECISION, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_real_dp_array

   subroutine comm_irecv_real_dp_array_2d(comm, data, source, tag, request)
      type(comm_t), intent(in) :: comm
      real(dp), intent(inout), allocatable :: data(:, :)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr, dim1, dim2
      integer :: stat(MPI_STATUS_SIZE)

      ! Receive dimensions first (blocking - needed to allocate)
      call MPI_Recv(dim1, 1_int32, MPI_INTEGER, source, tag, comm%m_comm, stat, ierr)
      call MPI_Recv(dim2, 1_int32, MPI_INTEGER, source, tag, comm%m_comm, stat, ierr)

      ! (Re)allocate the array to match the received dimensions
      if (allocated(data)) then
         if (size(data, 1) /= dim1 .or. size(data, 2) /= dim2) then
            deallocate (data)
            allocate (data(dim1, dim2))
         end if
      else
         allocate (data(dim1, dim2))
      end if

      ! Receive data (non-blocking)
      call MPI_Irecv(data, dim1*dim2, MPI_DOUBLE_PRECISION, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_real_dp_array_2d
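
   ! Note on the wire protocol: the matching sender must transmit the two extents
   ! before the payload, all on the same tag. A minimal sender-side sketch
   ! (illustrative only; assumes the peer uses plain blocking sends):
   !    call MPI_Send(size(a, 1), 1, MPI_INTEGER, dest, tag, comm%get(), ierr)
   !    call MPI_Send(size(a, 2), 1, MPI_INTEGER, dest, tag, comm%get(), ierr)
   !    call MPI_Send(a, size(a), MPI_DOUBLE_PRECISION, dest, tag, comm%get(), ierr)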

   subroutine comm_irecv_logical(comm, data, source, tag, request)
      type(comm_t), intent(in) :: comm
      logical, intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, 1_int32, MPI_LOGICAL, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_logical

   subroutine comm_irecv_real_dp_array_n(comm, data, count, source, tag, request)
      !! Non-blocking receive with explicit count (for device pointers in host_data blocks)
      type(comm_t), intent(in) :: comm
      real(dp) :: data(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, count, MPI_DOUBLE_PRECISION, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_real_dp_array_n
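
   ! Usage sketch for the explicit-count variant (illustrative; assumes an OpenACC
   ! build with a device-resident buffer buf, a CUDA-aware MPI, and the public
   ! wait generic):
   !    !$acc host_data use_device(buf)
   !    call comm_irecv_real_dp_array_n(comm, buf, n, source, tag, req)
   !    !$acc end host_data
   !    call wait(req)   ! block until the message has landed in buf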

   subroutine comm_irecv_real_sp(comm, data, source, tag, request)
      type(comm_t), intent(in) :: comm
      real(sp), intent(out) :: data
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, 1_int32, MPI_REAL, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_real_sp

   subroutine comm_irecv_real_sp_array(comm, data, source, tag, request)
      type(comm_t), intent(in) :: comm
      real(sp), intent(out) :: data(:)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, size(data), MPI_REAL, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_real_sp_array

   subroutine comm_irecv_real_sp_array_2d(comm, data, source, tag, request)
      type(comm_t), intent(in) :: comm
      real(sp), intent(inout), allocatable :: data(:, :)
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr, dim1, dim2
      integer :: stat(MPI_STATUS_SIZE)

      ! Receive dimensions first (blocking - needed to allocate)
      call MPI_Recv(dim1, 1_int32, MPI_INTEGER, source, tag, comm%m_comm, stat, ierr)
      call MPI_Recv(dim2, 1_int32, MPI_INTEGER, source, tag, comm%m_comm, stat, ierr)

      ! (Re)allocate the array to match the received dimensions
      if (allocated(data)) then
         if (size(data, 1) /= dim1 .or. size(data, 2) /= dim2) then
            deallocate (data)
            allocate (data(dim1, dim2))
         end if
      else
         allocate (data(dim1, dim2))
      end if

      ! Receive data (non-blocking)
      call MPI_Irecv(data, dim1*dim2, MPI_REAL, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_real_sp_array_2d

   subroutine comm_irecv_real_sp_array_n(comm, data, count, source, tag, request)
      !! Single-precision non-blocking receive with explicit count (for device pointers in host_data blocks)
      type(comm_t), intent(in) :: comm
      real(sp) :: data(:)
      integer(int32), intent(in) :: count
      integer(int32), intent(in) :: source
      integer(int32), intent(in) :: tag
      type(request_t), intent(out) :: request
      integer(int32) :: ierr

      call MPI_Irecv(data, count, MPI_REAL, source, tag, comm%m_comm, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine comm_irecv_real_sp_array_n

   ! ========================================================================
   ! Request completion operations
   ! ========================================================================

   subroutine request_wait(request, status)
      type(request_t), intent(inout) :: request
      type(MPI_Status), intent(out), optional :: status
      integer(int32) :: ierr
      integer :: stat(MPI_STATUS_SIZE)

      if (.not. request%is_valid) then
         error stop "Cannot wait on null request"
      end if

      call MPI_Wait(request%m_request, stat, ierr)

      if (present(status)) then
         status = status_array_to_type(stat)
      end if

      call request%free()
   end subroutine request_wait

   subroutine request_waitall(requests, statuses)
      type(request_t), intent(inout) :: requests(:)
      type(MPI_Status), intent(out), optional :: statuses(:)
      integer, allocatable :: mpi_requests(:)
      integer, allocatable :: stat_array(:, :)
      integer(int32) :: i, count, ierr

      count = size(requests)
      allocate (mpi_requests(count))

      do i = 1, count
         if (requests(i)%is_valid) then
            mpi_requests(i) = requests(i)%m_request
         else
            mpi_requests(i) = MPI_REQUEST_NULL
         end if
      end do

      allocate (stat_array(MPI_STATUS_SIZE, count))
      call MPI_Waitall(count, mpi_requests, stat_array, ierr)

      if (present(statuses)) then
         do i = 1, count
            statuses(i) = status_array_to_type(stat_array(:, i))
         end do
      end if

      do i = 1, count
         call requests(i)%free()
      end do
   end subroutine request_waitall

   subroutine request_test(request, flag, status)
      type(request_t), intent(inout) :: request
      logical, intent(out) :: flag
      type(MPI_Status), intent(out), optional :: status
      integer(int32) :: ierr
      integer :: stat(MPI_STATUS_SIZE)

      if (.not. request%is_valid) then
         flag = .true.
         return
      end if

      call MPI_Test(request%m_request, flag, stat, ierr)

      if (present(status)) then
         status = status_array_to_type(stat)
      end if

      if (flag) then
         call request%free()
      end if
   end subroutine request_test
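
   ! Typical completion pattern (a minimal sketch; buffers, ranks, and tags are
   ! illustrative, and the isend/irecv/waitall generics are assumed to resolve to
   ! the specifics above):
   !    type(request_t) :: reqs(2)
   !    call irecv(comm, recv_buf, source, tag, reqs(1))
   !    call isend(comm, send_buf, dest, tag, reqs(2))
   !    call waitall(reqs)   ! completes both transfers and frees the requests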

   ! ========================================================================
   ! Window creation
   ! ========================================================================

   !! Create MPI window for RMA operations
   !!
   !! Creates a window exposing local memory to remote RMA operations.
   !! Used for DDI distributed arrays.
   function create_win_dp_array(comm, base, win_size) result(win)
      type(comm_t), intent(in) :: comm
      real(dp), target, asynchronous :: base(:)
      integer(MPI_ADDRESS_KIND), intent(in) :: win_size
      type(win_t) :: win
      integer :: ierr
      integer :: disp_unit

      disp_unit = int(storage_size(1.0_dp)/8_int32)   ! bytes per element (safe even for a zero-sized base)
      call MPI_Win_create(base, win_size, disp_unit, &
                          MPI_INFO_NULL, comm%get(), win%m_win, ierr)
      win%is_valid = .true.
   end function create_win_dp_array
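
   ! Active-target usage sketch (illustrative; assumes this specific is reachable
   ! through the public win_create generic and that buf is the locally exposed array):
   !    win_size = int(size(buf), MPI_ADDRESS_KIND)*int(storage_size(1.0_dp)/8, MPI_ADDRESS_KIND)
   !    win = win_create(comm, buf, win_size)
   !    call win_fence(win)                                        ! open exposure epoch
   !    call win_put_dp(win, target_rank, 0_MPI_ADDRESS_KIND, n, origin_buf)
   !    call win_fence(win)                                        ! complete pending RMA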

   !! Create dynamic MPI window
   !!
   !! For windows where memory will be attached later.
   !! Useful for load balancing counters.
   function create_win_dynamic(comm) result(win)
      type(comm_t), intent(in) :: comm
      type(win_t) :: win
      integer :: ierr

      call MPI_Win_create_dynamic(MPI_INFO_NULL, comm%get(), win%m_win, ierr)
      win%is_valid = .true.
   end function create_win_dynamic

#ifndef MPICH_PERLMUTTER
   !! Allocate window memory and create window in one call (1D array)
   subroutine create_win_allocate_dp_1d(comm, length, baseptr, win)
      use iso_c_binding, only: c_ptr, c_f_pointer
      type(comm_t), intent(in) :: comm
      integer, intent(in) :: length
      real(dp), pointer, intent(out) :: baseptr(:)
      type(win_t), intent(out) :: win
      type(c_ptr) :: cptr
      integer(MPI_ADDRESS_KIND) :: win_size
      integer :: disp_unit, ierr

      win_size = int(length, MPI_ADDRESS_KIND)*int(storage_size(1.0_dp)/8_int32, MPI_ADDRESS_KIND)
      disp_unit = int(storage_size(1.0_dp)/8_int32)

      call MPI_Win_allocate(win_size, disp_unit, MPI_INFO_NULL, &
                            comm%get(), cptr, win%m_win, ierr)
      call c_f_pointer(cptr, baseptr, [length])
      win%is_valid = .true.
   end subroutine create_win_allocate_dp_1d
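
   ! Usage sketch (illustrative; assumes this specific is reachable through the public
   ! win_allocate generic). MPI owns the memory and baseptr simply aliases it:
   !    real(dp), pointer :: buf(:)
   !    type(win_t) :: win
   !    call win_allocate(comm, 1000, buf, win)
   !    buf = 0.0_dp   ! initialize the window-backed memory before exposing it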

   !! Allocate window memory and create window in one call (2D array)
   subroutine create_win_allocate_dp_2d(comm, dim1, dim2, baseptr, win)
      use iso_c_binding, only: c_ptr, c_f_pointer
      type(comm_t), intent(in) :: comm
      integer, intent(in) :: dim1, dim2
      real(dp), pointer, intent(out) :: baseptr(:, :)
      type(win_t), intent(out) :: win
      type(c_ptr) :: cptr
      integer(MPI_ADDRESS_KIND) :: win_size
      integer :: disp_unit, ierr
      integer(MPI_ADDRESS_KIND) :: total_size

      ! Compute the element count in address-sized integers to avoid overflow for large windows
      total_size = int(dim1, MPI_ADDRESS_KIND)*int(dim2, MPI_ADDRESS_KIND)
      win_size = total_size*int(storage_size(1.0_dp)/8_int32, MPI_ADDRESS_KIND)
      disp_unit = int(storage_size(1.0_dp)/8_int32)

      call MPI_Win_allocate(win_size, disp_unit, MPI_INFO_NULL, &
                            comm%get(), cptr, win%m_win, ierr)
      call c_f_pointer(cptr, baseptr, [dim1, dim2])
      win%is_valid = .true.
   end subroutine create_win_allocate_dp_2d

   !! Allocate window memory for single precision 1D array
   subroutine create_win_allocate_sp_1d(comm, length, baseptr, win)
      use iso_c_binding, only: c_ptr, c_f_pointer
      type(comm_t), intent(in) :: comm
      integer, intent(in) :: length
      real(sp), pointer, intent(out) :: baseptr(:)
      type(win_t), intent(out) :: win
      type(c_ptr) :: cptr
      integer(MPI_ADDRESS_KIND) :: win_size
      integer :: disp_unit, ierr

      win_size = int(length, MPI_ADDRESS_KIND)*int(storage_size(1.0_sp)/8_int32, MPI_ADDRESS_KIND)
      disp_unit = int(storage_size(1.0_sp)/8_int32)

      call MPI_Win_allocate(win_size, disp_unit, MPI_INFO_NULL, &
                            comm%get(), cptr, win%m_win, ierr)
      call c_f_pointer(cptr, baseptr, [length])
      win%is_valid = .true.
   end subroutine create_win_allocate_sp_1d

   !! Allocate window memory for int32 1D array
   subroutine create_win_allocate_i32_1d(comm, length, baseptr, win)
      use iso_c_binding, only: c_ptr, c_f_pointer
      type(comm_t), intent(in) :: comm
      integer, intent(in) :: length
      integer(int32), pointer, intent(out) :: baseptr(:)
      type(win_t), intent(out) :: win
      type(c_ptr) :: cptr
      integer(MPI_ADDRESS_KIND) :: win_size
      integer :: disp_unit, ierr

      win_size = int(length, MPI_ADDRESS_KIND)*int(storage_size(1_int32)/8_int32, MPI_ADDRESS_KIND)
      disp_unit = int(storage_size(1_int32)/8_int32)

      call MPI_Win_allocate(win_size, disp_unit, MPI_INFO_NULL, &
                            comm%get(), cptr, win%m_win, ierr)
      call c_f_pointer(cptr, baseptr, [length])
      win%is_valid = .true.
   end subroutine create_win_allocate_i32_1d

   !! Allocate window memory for int64 1D array
   subroutine create_win_allocate_i64_1d(comm, length, baseptr, win)
      use iso_c_binding, only: c_ptr, c_f_pointer
      type(comm_t), intent(in) :: comm
      integer, intent(in) :: length
      integer(int64), pointer, intent(out) :: baseptr(:)
      type(win_t), intent(out) :: win
      type(c_ptr) :: cptr
      integer(MPI_ADDRESS_KIND) :: win_size
      integer :: disp_unit, ierr

      win_size = int(length, MPI_ADDRESS_KIND)*int(storage_size(1_int64)/8_int32, MPI_ADDRESS_KIND)
      disp_unit = int(storage_size(1_int64)/8_int32)

      call MPI_Win_allocate(win_size, disp_unit, MPI_INFO_NULL, &
                            comm%get(), cptr, win%m_win, ierr)
      call c_f_pointer(cptr, baseptr, [length])
      win%is_valid = .true.
   end subroutine create_win_allocate_i64_1d
#endif

   ! ========================================================================
   ! Window query methods
   ! ========================================================================

   pure function win_is_null(this) result(is_null)
      class(win_t), intent(in) :: this
      logical :: is_null
      is_null = .not. this%is_valid
   end function win_is_null

   function win_get_handle(this) result(mpi_win_out)
      class(win_t), intent(in) :: this
      integer :: mpi_win_out

      if (.not. this%is_valid) then
         error stop "Cannot get MPI_Win from null window"
      end if
      mpi_win_out = this%m_win
   end function win_get_handle

   ! ========================================================================
   ! Synchronization
   ! ========================================================================

   !! Fence synchronization for active target RMA
   !!
   !! Completes all pending RMA operations.
   !! Use before/after Get/Put/Accumulate operations.
   subroutine win_fence(this, assert)
      class(win_t), intent(in) :: this
      integer, intent(in), optional :: assert
      integer :: ierr, assert_val

      if (present(assert)) then
         assert_val = assert
      else
         assert_val = 0_int32
      end if

      call MPI_Win_fence(assert_val, this%m_win, ierr)
   end subroutine win_fence

   !! Lock window for passive target RMA
   !!
   !! Begins RMA access epoch for specified target rank.
   !! Must be paired with unlock.
   subroutine win_lock(this, rank, lock_type)
      class(win_t), intent(in) :: this
      integer, intent(in) :: rank
      integer, intent(in), optional :: lock_type
      integer :: ierr, ltype

      if (present(lock_type)) then
         ltype = lock_type
      else
         ltype = MPI_LOCK_SHARED
      end if

      call MPI_Win_lock(ltype, rank, 0_int32, this%m_win, ierr)
   end subroutine win_lock

   !! Unlock window for passive target RMA
   subroutine win_unlock(this, rank)
      class(win_t), intent(in) :: this
      integer, intent(in) :: rank
      integer :: ierr

      call MPI_Win_unlock(rank, this%m_win, ierr)
   end subroutine win_unlock

   !! Lock window on all ranks for passive target RMA
   subroutine win_lock_all(this, assert)
      class(win_t), intent(in) :: this
      integer, intent(in), optional :: assert
      integer :: ierr, assert_val

      if (present(assert)) then
         assert_val = assert
      else
         assert_val = 0_int32
      end if

      call MPI_Win_lock_all(assert_val, this%m_win, ierr)
   end subroutine win_lock_all

   !! Unlock window on all ranks for passive target RMA
   subroutine win_unlock_all(this)
      class(win_t), intent(in) :: this
      integer :: ierr

      call MPI_Win_unlock_all(this%m_win, ierr)
   end subroutine win_unlock_all

   !! Flush pending RMA operations to a specific rank
   subroutine win_flush(this, rank)
      class(win_t), intent(in) :: this
      integer, intent(in) :: rank
      integer :: ierr

      call MPI_Win_flush(rank, this%m_win, ierr)
   end subroutine win_flush

   !! Flush pending RMA operations to all ranks
   subroutine win_flush_all(this)
      class(win_t), intent(in) :: this
      integer :: ierr

      call MPI_Win_flush_all(this%m_win, ierr)
   end subroutine win_flush_all
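
   ! Passive-target epoch sketch (illustrative): open one lock_all epoch covering every
   ! rank, issue RMA calls, and flush per target before reusing the local buffer.
   !    call win_lock_all(win)
   !    call win_put_dp(win, r, disp, n, buf)
   !    call win_flush(win, r)      ! the put is now complete at rank r
   !    call win_unlock_all(win)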

   ! ========================================================================
   ! RMA Get/Put/Accumulate operations
   ! ========================================================================

   !! Get data from remote window
   !!
   !! Retrieves data from target rank's window into local buffer.
   !! Must be called between fence or lock/unlock pairs.
   subroutine win_get_dp(this, target_rank, target_disp, count, buffer)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      real(dp), intent(out) :: buffer(*)
      integer :: ierr

      call MPI_Get(buffer, count, MPI_DOUBLE_PRECISION, &
                   target_rank, target_disp, count, MPI_DOUBLE_PRECISION, &
                   this%m_win, ierr)
   end subroutine win_get_dp

   !! Put data to remote window
   !!
   !! Sends data from local buffer to target rank's window.
   !! Must be called between fence or lock/unlock pairs.
   subroutine win_put_dp(this, target_rank, target_disp, count, buffer)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      real(dp), intent(in) :: buffer(*)
      integer :: ierr

      call MPI_Put(buffer, count, MPI_DOUBLE_PRECISION, &
                   target_rank, target_disp, count, MPI_DOUBLE_PRECISION, &
                   this%m_win, ierr)
   end subroutine win_put_dp

   !! Non-blocking get data from remote window
   subroutine win_rget_dp(this, target_rank, target_disp, count, buffer, request)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      real(dp), intent(out) :: buffer(*)
      type(request_t), intent(out) :: request
      integer :: ierr

      call MPI_Rget(buffer, count, MPI_DOUBLE_PRECISION, &
                    target_rank, target_disp, count, MPI_DOUBLE_PRECISION, &
                    this%m_win, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine win_rget_dp

   !! Non-blocking put data to remote window
   subroutine win_rput_dp(this, target_rank, target_disp, count, buffer, request)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      real(dp), intent(in) :: buffer(*)
      type(request_t), intent(out) :: request
      integer :: ierr

      call MPI_Rput(buffer, count, MPI_DOUBLE_PRECISION, &
                    target_rank, target_disp, count, MPI_DOUBLE_PRECISION, &
                    this%m_win, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine win_rput_dp

   !! Accumulate data to remote window
   !!
   !! Atomically adds local buffer to target rank's window.
   !! Critical for DDI_ACC (Fock matrix accumulation).
   subroutine win_accumulate_dp(this, target_rank, target_disp, count, buffer, op)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      real(dp), intent(in) :: buffer(*)
      integer, intent(in), optional :: op
      integer :: mpi_op
      integer :: ierr

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Accumulate(buffer, count, MPI_DOUBLE_PRECISION, &
                          target_rank, target_disp, count, MPI_DOUBLE_PRECISION, &
                          mpi_op, this%m_win, ierr)
   end subroutine win_accumulate_dp
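
   ! Accumulation sketch for DDI_ACC-style updates (illustrative):
   !    call win_lock(win, target_rank)                             ! shared lock by default
   !    call win_accumulate_dp(win, target_rank, disp, n, contrib)  ! op defaults to MPI_SUM
   !    call win_unlock(win, target_rank)                           ! completes the accumulate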

   ! ========================================================================
   ! RMA operations for single precision (sp)
   ! ========================================================================

   subroutine win_get_sp(this, target_rank, target_disp, count, buffer)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      real(sp), intent(out) :: buffer(*)
      integer :: ierr

      call MPI_Get(buffer, count, MPI_REAL, &
                   target_rank, target_disp, count, MPI_REAL, &
                   this%m_win, ierr)
   end subroutine win_get_sp

   subroutine win_put_sp(this, target_rank, target_disp, count, buffer)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      real(sp), intent(in) :: buffer(*)
      integer :: ierr

      call MPI_Put(buffer, count, MPI_REAL, &
                   target_rank, target_disp, count, MPI_REAL, &
                   this%m_win, ierr)
   end subroutine win_put_sp

   subroutine win_rget_sp(this, target_rank, target_disp, count, buffer, request)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      real(sp), intent(out) :: buffer(*)
      type(request_t), intent(out) :: request
      integer :: ierr

      call MPI_Rget(buffer, count, MPI_REAL, &
                    target_rank, target_disp, count, MPI_REAL, &
                    this%m_win, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine win_rget_sp

   subroutine win_rput_sp(this, target_rank, target_disp, count, buffer, request)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      real(sp), intent(in) :: buffer(*)
      type(request_t), intent(out) :: request
      integer :: ierr

      call MPI_Rput(buffer, count, MPI_REAL, &
                    target_rank, target_disp, count, MPI_REAL, &
                    this%m_win, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine win_rput_sp

   subroutine win_accumulate_sp(this, target_rank, target_disp, count, buffer, op)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      real(sp), intent(in) :: buffer(*)
      integer, intent(in), optional :: op
      integer :: mpi_op
      integer :: ierr

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Accumulate(buffer, count, MPI_REAL, &
                          target_rank, target_disp, count, MPI_REAL, &
                          mpi_op, this%m_win, ierr)
   end subroutine win_accumulate_sp

   ! ========================================================================
   ! RMA operations for int32
   ! ========================================================================

   subroutine win_get_i32(this, target_rank, target_disp, count, buffer)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      integer(int32), intent(out) :: buffer(*)
      integer :: ierr

      call MPI_Get(buffer, count, MPI_INTEGER, &
                   target_rank, target_disp, count, MPI_INTEGER, &
                   this%m_win, ierr)
   end subroutine win_get_i32

   subroutine win_put_i32(this, target_rank, target_disp, count, buffer)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      integer(int32), intent(in) :: buffer(*)
      integer :: ierr

      call MPI_Put(buffer, count, MPI_INTEGER, &
                   target_rank, target_disp, count, MPI_INTEGER, &
                   this%m_win, ierr)
   end subroutine win_put_i32

   subroutine win_rget_i32(this, target_rank, target_disp, count, buffer, request)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      integer(int32), intent(out) :: buffer(*)
      type(request_t), intent(out) :: request
      integer :: ierr

      call MPI_Rget(buffer, count, MPI_INTEGER, &
                    target_rank, target_disp, count, MPI_INTEGER, &
                    this%m_win, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine win_rget_i32

   subroutine win_rput_i32(this, target_rank, target_disp, count, buffer, request)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      integer(int32), intent(in) :: buffer(*)
      type(request_t), intent(out) :: request
      integer :: ierr

      call MPI_Rput(buffer, count, MPI_INTEGER, &
                    target_rank, target_disp, count, MPI_INTEGER, &
                    this%m_win, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine win_rput_i32

   subroutine win_accumulate_i32(this, target_rank, target_disp, count, buffer, op)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      integer(int32), intent(in) :: buffer(*)
      integer, intent(in), optional :: op
      integer :: mpi_op
      integer :: ierr

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Accumulate(buffer, count, MPI_INTEGER, &
                          target_rank, target_disp, count, MPI_INTEGER, &
                          mpi_op, this%m_win, ierr)
   end subroutine win_accumulate_i32

   ! ========================================================================
   ! RMA operations for int64
   ! ========================================================================

   subroutine win_get_i64(this, target_rank, target_disp, count, buffer)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      integer(int64), intent(out) :: buffer(*)
      integer :: ierr

      call MPI_Get(buffer, count, MPI_INTEGER8, &
                   target_rank, target_disp, count, MPI_INTEGER8, &
                   this%m_win, ierr)
   end subroutine win_get_i64

   subroutine win_put_i64(this, target_rank, target_disp, count, buffer)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      integer(int64), intent(in) :: buffer(*)
      integer :: ierr

      call MPI_Put(buffer, count, MPI_INTEGER8, &
                   target_rank, target_disp, count, MPI_INTEGER8, &
                   this%m_win, ierr)
   end subroutine win_put_i64

   subroutine win_rget_i64(this, target_rank, target_disp, count, buffer, request)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      integer(int64), intent(out) :: buffer(*)
      type(request_t), intent(out) :: request
      integer :: ierr

      call MPI_Rget(buffer, count, MPI_INTEGER8, &
                    target_rank, target_disp, count, MPI_INTEGER8, &
                    this%m_win, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine win_rget_i64

   subroutine win_rput_i64(this, target_rank, target_disp, count, buffer, request)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      integer(int64), intent(in) :: buffer(*)
      type(request_t), intent(out) :: request
      integer :: ierr

      call MPI_Rput(buffer, count, MPI_INTEGER8, &
                    target_rank, target_disp, count, MPI_INTEGER8, &
                    this%m_win, request%m_request, ierr)
      request%is_valid = .true.
   end subroutine win_rput_i64

   subroutine win_accumulate_i64(this, target_rank, target_disp, count, buffer, op)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer, intent(in) :: count
      integer(int64), intent(in) :: buffer(*)
      integer, intent(in), optional :: op
      integer :: mpi_op
      integer :: ierr

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Accumulate(buffer, count, MPI_INTEGER8, &
                          target_rank, target_disp, count, MPI_INTEGER8, &
                          mpi_op, this%m_win, ierr)
   end subroutine win_accumulate_i64

   !! Atomic fetch-and-add for load balancing
   !!
   !! Atomically adds value to a remote counter and returns the counter's previous value.
   !! Used for DDI_DLBNEXT (dynamic load balancing).
   subroutine win_fetch_and_add_i64(this, target_rank, target_disp, value, result)
      class(win_t), intent(in) :: this
      integer, intent(in) :: target_rank
      integer(MPI_ADDRESS_KIND), intent(in) :: target_disp
      integer(int64), intent(in) :: value
      integer(int64), intent(out) :: result
      integer :: ierr

      call MPI_Fetch_and_op(value, result, MPI_INTEGER8, &
                            target_rank, target_disp, MPI_SUM, this%m_win, ierr)
   end subroutine win_fetch_and_add_i64
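
   ! Dynamic load-balancing sketch (illustrative): each rank atomically claims the next
   ! task index from a counter hosted in rank 0's window.
   !    call win_lock(win, 0)
   !    call win_fetch_and_add_i64(win, 0, 0_MPI_ADDRESS_KIND, 1_int64, my_task)
   !    call win_unlock(win, 0)
   !    ! my_task now holds the counter value before this rank's increment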

   ! ========================================================================
   ! Window cleanup
   ! ========================================================================

   subroutine win_finalize(this)
      class(win_t), intent(inout) :: this
      integer :: ierr

      if (this%is_valid .and. this%m_win /= MPI_WIN_NULL) then
         call MPI_Win_free(this%m_win, ierr)
         this%is_valid = .false.
         this%m_win = MPI_WIN_NULL
      end if
   end subroutine win_finalize

   ! ========================================================================
   ! Allreduce operations (for DDI_GSUMF/GSUMI)
   ! ========================================================================

   !! Allreduce for scalar double precision
   !!
   !! In-place global reduction. Replaces DDI_GSUMF for scalars.
   subroutine allreduce_dp(comm, buffer, op)
      type(comm_t), intent(in) :: comm
      real(dp), intent(inout) :: buffer
      integer, intent(in), optional :: op
      integer :: mpi_op
      integer :: ierr

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(MPI_IN_PLACE, buffer, 1_int32, MPI_DOUBLE_PRECISION, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_dp

   !! Allreduce for double precision array
   !!
   !! In-place global reduction. Replaces DDI_GSUMF for arrays.
   !! This is the most frequently called DDI routine (1,301 calls in GAMESS).
   subroutine allreduce_dp_array(comm, buffer, count, op)
      type(comm_t), intent(in) :: comm
      real(dp), intent(inout) :: buffer(:)
      integer, intent(in), optional :: count
      integer, intent(in), optional :: op
      integer :: mpi_op
      integer :: ierr, n

      if (present(count)) then
         n = count
      else
         n = size(buffer)
      end if

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(MPI_IN_PLACE, buffer, n, MPI_DOUBLE_PRECISION, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_dp_array
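
   ! Usage sketch (illustrative; assumes the public allreduce generic resolves to the
   ! specifics in this section):
   !    call allreduce(comm, fock)                ! in-place global sum of fock(:)
   !    call allreduce(comm, emax, op=MPI_MAX)    ! scalar maximum instead of the default sum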

   !! Allreduce for scalar integer
   !!
   !! In-place global reduction. Replaces DDI_GSUMI for scalars.
   subroutine allreduce_i32(comm, buffer, op)
      type(comm_t), intent(in) :: comm
      integer(int32), intent(inout) :: buffer
      integer, intent(in), optional :: op
      integer :: mpi_op
      integer :: ierr

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(MPI_IN_PLACE, buffer, 1_int32, MPI_INTEGER, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_i32

   !! Allreduce for integer array
   !!
   !! In-place global reduction. Replaces DDI_GSUMI for arrays.
   subroutine allreduce_i32_array(comm, buffer, count, op)
      type(comm_t), intent(in) :: comm
      integer(int32), intent(inout) :: buffer(:)
      integer, intent(in), optional :: count
      integer, intent(in), optional :: op
      integer :: mpi_op
      integer :: ierr, n

      if (present(count)) then
         n = count
      else
         n = size(buffer)
      end if

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(MPI_IN_PLACE, buffer, n, MPI_INTEGER, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_i32_array

   !! Non-in-place allreduce for scalar double precision
   !!
   !! Reduces sendbuf and stores result in recvbuf.
   !! Useful for timestep reductions where the local value must be preserved.
   subroutine allreduce_dp_to(comm, sendbuf, recvbuf, op)
      type(comm_t), intent(in) :: comm
      real(dp), intent(in) :: sendbuf
      real(dp), intent(out) :: recvbuf
      integer, intent(in), optional :: op
      integer :: mpi_op
      integer :: ierr

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(sendbuf, recvbuf, 1_int32, MPI_DOUBLE_PRECISION, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_dp_to
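
   ! Usage sketch (illustrative): reduce into a separate result so the local value survives.
   !    call allreduce(comm, local_dt, global_dt, op=MPI_MIN)   ! global minimum timestep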

   !! Non-in-place allreduce for double precision array
   !!
   !! Reduces sendbuf and stores result in recvbuf.
   subroutine allreduce_dp_array_to(comm, sendbuf, recvbuf, count, op)
      type(comm_t), intent(in) :: comm
      real(dp), intent(in) :: sendbuf(:)
      real(dp), intent(out) :: recvbuf(:)
      integer, intent(in), optional :: count
      integer, intent(in), optional :: op
      integer :: mpi_op
      integer :: ierr, n

      if (present(count)) then
         n = count
      else
         n = size(sendbuf)
      end if

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(sendbuf, recvbuf, n, MPI_DOUBLE_PRECISION, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_dp_array_to

   !! Allreduce for single-precision scalar
   subroutine allreduce_sp(comm, buffer, op)
      type(comm_t), intent(in) :: comm
      real(sp), intent(inout) :: buffer
      integer, intent(in), optional :: op
      integer :: mpi_op
      integer :: ierr

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(MPI_IN_PLACE, buffer, 1_int32, MPI_REAL, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_sp

   !! Allreduce for single-precision array
   subroutine allreduce_sp_array(comm, buffer, count, op)
      type(comm_t), intent(in) :: comm
      real(sp), intent(inout) :: buffer(:)
      integer, intent(in), optional :: count
      integer, intent(in), optional :: op
      integer :: mpi_op
      integer :: ierr, n

      if (present(count)) then
         n = count
      else
         n = size(buffer)
      end if

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(MPI_IN_PLACE, buffer, n, MPI_REAL, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_sp_array

   !! Non-in-place allreduce for scalar single-precision
   subroutine allreduce_sp_to(comm, sendbuf, recvbuf, op)
      type(comm_t), intent(in) :: comm
      real(sp), intent(in) :: sendbuf
      real(sp), intent(out) :: recvbuf
      integer, intent(in), optional :: op
      integer :: mpi_op
      integer :: ierr

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(sendbuf, recvbuf, 1_int32, MPI_REAL, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_sp_to

   !! Non-in-place allreduce for single-precision array
   subroutine allreduce_sp_array_to(comm, sendbuf, recvbuf, count, op)
      type(comm_t), intent(in) :: comm
      real(sp), intent(in) :: sendbuf(:)
      real(sp), intent(out) :: recvbuf(:)
      integer, intent(in), optional :: count
      integer, intent(in), optional :: op
      integer :: mpi_op
      integer :: ierr, n

      if (present(count)) then
         n = count
      else
         n = size(sendbuf)
      end if

      if (present(op)) then
         mpi_op = op
      else
         mpi_op = MPI_SUM
      end if

      call MPI_Allreduce(sendbuf, recvbuf, n, MPI_REAL, &
                         mpi_op, comm%get(), ierr)
   end subroutine allreduce_sp_array_to

end module pic_mpi
