311                         std::array<std::int64_t, 2> shape,
 
  312                         std::int64_t rank_offset)
 
  314  assert(rank_offset >= 0 or x.empty());
 
  315  using T = 
typename std::remove_reference_t<typename U::value_type>;
 
  319  assert(x.size() % shape[1] == 0);
 
  320  const std::int32_t shape0_local = x.size() / shape[1];
 
  322  spdlog::debug(
"Sending data to post offices (distribute_to_postoffice)");
 
  325  std::vector<int> row_to_dest(shape0_local);
 
  326  for (std::int32_t i = 0; i < shape0_local; ++i)
 
  329    row_to_dest[i] = dest;
 
  334  std::vector<std::array<std::int32_t, 2>> dest_to_index;
 
  335  dest_to_index.reserve(shape0_local);
 
  336  for (std::int32_t i = 0; i < shape0_local; ++i)
 
  338    std::size_t idx = i + rank_offset;
 
  340      dest_to_index.push_back({dest, i});
 
  342  std::ranges::sort(dest_to_index);
 
  346  std::vector<int> dest;
 
  347  std::vector<std::int32_t> num_items_per_dest,
 
  348      pos_to_neigh_rank(shape0_local, -1);
 
  350    auto it = dest_to_index.begin();
 
  351    while (it != dest_to_index.end())
 
  353      const int neigh_rank = dest.size();
 
  356      dest.push_back((*it)[0]);
 
  360          = std::find_if(it, dest_to_index.end(),
 
  361                         [r = dest.back()](
auto& idx) { return idx[0] != r; });
 
  364      num_items_per_dest.push_back(std::distance(it, it1));
 
  367      for (
auto e = it; e != it1; ++e)
 
  368        pos_to_neigh_rank[(*e)[1]] = neigh_rank;
 
  378      "Number of neighbourhood source ranks in distribute_to_postoffice: {}",
 
  379      static_cast<int>(src.size()));
 
  383  int err = MPI_Dist_graph_create_adjacent(
 
  384      comm, src.size(), src.data(), MPI_UNWEIGHTED, dest.size(), dest.data(),
 
  385      MPI_UNWEIGHTED, MPI_INFO_NULL, 
false, &neigh_comm);
 
  389  std::vector<std::int32_t> send_disp = {0};
 
  390  std::partial_sum(num_items_per_dest.begin(), num_items_per_dest.end(),
 
  391                   std::back_inserter(send_disp));
 
  394  std::vector<T> send_buffer_data(shape[1] * send_disp.back());
 
  395  std::vector<std::int64_t> send_buffer_index(send_disp.back());
 
  397    std::vector<std::int32_t> send_offsets = send_disp;
 
  398    for (std::int32_t i = 0; i < shape0_local; ++i)
 
  400      if (
int neigh_dest = pos_to_neigh_rank[i]; neigh_dest != -1)
 
  402        std::size_t pos = send_offsets[neigh_dest];
 
  403        send_buffer_index[pos] = i + rank_offset;
 
  404        std::copy_n(std::next(x.begin(), i * shape[1]), shape[1],
 
  405                    std::next(send_buffer_data.begin(), shape[1] * pos));
 
  406        ++send_offsets[neigh_dest];
 
  413  std::vector<int> num_items_recv(src.size());
 
  414  num_items_per_dest.reserve(1);
 
  415  num_items_recv.reserve(1);
 
  416  err = MPI_Neighbor_alltoall(num_items_per_dest.data(), 1, MPI_INT,
 
  417                              num_items_recv.data(), 1, MPI_INT, neigh_comm);
 
  421  std::vector<std::int32_t> recv_disp(num_items_recv.size() + 1, 0);
 
  422  std::partial_sum(num_items_recv.begin(), num_items_recv.end(),
 
  423                   std::next(recv_disp.begin()));
 
  426  std::vector<std::int64_t> recv_buffer_index(recv_disp.back());
 
  427  err = MPI_Neighbor_alltoallv(
 
  428      send_buffer_index.data(), num_items_per_dest.data(), send_disp.data(),
 
  429      MPI_INT64_T, recv_buffer_index.data(), num_items_recv.data(),
 
  430      recv_disp.data(), MPI_INT64_T, neigh_comm);
 
  434  MPI_Datatype compound_type;
 
  436  MPI_Type_commit(&compound_type);
 
  437  std::vector<T> recv_buffer_data(shape[1] * recv_disp.back());
 
  438  err = MPI_Neighbor_alltoallv(
 
  439      send_buffer_data.data(), num_items_per_dest.data(), send_disp.data(),
 
  440      compound_type, recv_buffer_data.data(), num_items_recv.data(),
 
  441      recv_disp.data(), compound_type, neigh_comm);
 
  443  err = MPI_Type_free(&compound_type);
 
  445  err = MPI_Comm_free(&neigh_comm);
 
  448  spdlog::debug(
"Completed send data to post offices.");
 
  452  std::vector<std::int32_t> index_local(recv_buffer_index.size());
 
  453  std::ranges::transform(recv_buffer_index, index_local.begin(),
 
  454                         [r0](
auto idx) { return idx - r0; });
 
  456  return {index_local, recv_buffer_data};
 
 
  462                           const U& x, std::array<std::int64_t, 2> shape,
 
  463                           std::int64_t rank_offset)
 
  465  assert(rank_offset >= 0 or x.empty());
 
  466  using T = 
typename std::remove_reference_t<typename U::value_type>;
 
  469  assert(shape[1] > 0);
 
  473  assert(x.size() % shape[1] == 0);
 
  474  const std::int64_t shape0_local = x.size() / shape[1];
 
  481      comm, x, {shape[0], shape[1]}, rank_offset);
 
  482  assert(post_indices.size() == post_x.size() / shape[1]);
 
  488  std::vector<std::tuple<int, std::int64_t, std::int32_t>> src_to_index;
 
  489  for (std::size_t i = 0; i < indices.size(); ++i)
 
  491    std::size_t idx = indices[i];
 
  493      src_to_index.push_back({src, idx, i});
 
  495  std::ranges::sort(src_to_index);
 
  499  std::vector<std::int32_t> num_items_per_src;
 
  500  std::vector<int> src;
 
  502    auto it = src_to_index.begin();
 
  503    while (it != src_to_index.end())
 
  505      src.push_back(std::get<0>(*it));
 
  507          = std::find_if(it, src_to_index.end(), [r = src.back()](
auto& idx)
 
  508                         { return std::get<0>(idx) != r; });
 
  509      num_items_per_src.push_back(std::distance(it, it1));
 
  516  const std::vector<int> dest
 
  519      "Neighbourhood destination ranks from post office in " 
  520      "distribute_data (rank, num dests, num dests/mpi_size): {}, {}, {}",
 
  521      rank, 
static_cast<int>(dest.size()),
 
  522      static_cast<double>(dest.size()) / 
size);
 
  526  MPI_Comm neigh_comm0;
 
  527  int err = MPI_Dist_graph_create_adjacent(
 
  528      comm, dest.size(), dest.data(), MPI_UNWEIGHTED, src.size(), src.data(),
 
  529      MPI_UNWEIGHTED, MPI_INFO_NULL, 
false, &neigh_comm0);
 
  533  std::vector<int> num_items_recv(dest.size());
 
  534  num_items_per_src.reserve(1);
 
  535  num_items_recv.reserve(1);
 
  536  err = MPI_Neighbor_alltoall(num_items_per_src.data(), 1, MPI_INT,
 
  537                              num_items_recv.data(), 1, MPI_INT, neigh_comm0);
 
  541  std::vector<std::int32_t> send_disp = {0};
 
  542  std::partial_sum(num_items_per_src.begin(), num_items_per_src.end(),
 
  543                   std::back_inserter(send_disp));
 
  544  std::vector<std::int32_t> recv_disp = {0};
 
  545  std::partial_sum(num_items_recv.begin(), num_items_recv.end(),
 
  546                   std::back_inserter(recv_disp));
 
  550  assert(send_disp.back() == (
int)src_to_index.size());
 
  551  std::vector<std::int64_t> send_buffer_index(src_to_index.size());
 
  552  std::ranges::transform(src_to_index, send_buffer_index.begin(),
 
  553                         [](
auto x) { return std::get<1>(x); });
 
  556  std::vector<std::int64_t> recv_buffer_index(recv_disp.back());
 
  557  err = MPI_Neighbor_alltoallv(
 
  558      send_buffer_index.data(), num_items_per_src.data(), send_disp.data(),
 
  559      MPI_INT64_T, recv_buffer_index.data(), num_items_recv.data(),
 
  560      recv_disp.data(), MPI_INT64_T, neigh_comm0);
 
  563  err = MPI_Comm_free(&neigh_comm0);
 
  572  const std::array<std::int64_t, 2> postoffice_range
 
  574  std::vector<std::int32_t> post_indices_map(
 
  575      postoffice_range[1] - postoffice_range[0], -1);
 
  576  for (std::size_t i = 0; i < post_indices.size(); ++i)
 
  578    assert(post_indices[i] < (
int)post_indices_map.size());
 
  579    post_indices_map[post_indices[i]] = i;
 
  583  std::vector<T> send_buffer_data(shape[1] * recv_disp.back());
 
  584  for (std::size_t p = 0; p < recv_disp.size() - 1; ++p)
 
  586    int offset = recv_disp[p];
 
  587    for (std::int32_t i = recv_disp[p]; i < recv_disp[p + 1]; ++i)
 
  589      std::int64_t index = recv_buffer_index[i];
 
  590      if (index >= rank_offset and index < (rank_offset + shape0_local))
 
  593        std::int32_t local_index = index - rank_offset;
 
  594        std::copy_n(std::next(x.begin(), shape[1] * local_index), shape[1],
 
  595                    std::next(send_buffer_data.begin(), shape[1] * offset));
 
  600        auto local_index = index - postoffice_range[0];
 
  601        std::int32_t pos = post_indices_map[local_index];
 
  603        std::copy_n(std::next(post_x.begin(), shape[1] * pos), shape[1],
 
  604                    std::next(send_buffer_data.begin(), shape[1] * offset));
 
  611  err = MPI_Dist_graph_create_adjacent(
 
  612      comm, src.size(), src.data(), MPI_UNWEIGHTED, dest.size(), dest.data(),
 
  613      MPI_UNWEIGHTED, MPI_INFO_NULL, 
false, &neigh_comm0);
 
  616  MPI_Datatype compound_type0;
 
  618  MPI_Type_commit(&compound_type0);
 
  620  std::vector<T> recv_buffer_data(shape[1] * send_disp.back());
 
  621  err = MPI_Neighbor_alltoallv(
 
  622      send_buffer_data.data(), num_items_recv.data(), recv_disp.data(),
 
  623      compound_type0, recv_buffer_data.data(), num_items_per_src.data(),
 
  624      send_disp.data(), compound_type0, neigh_comm0);
 
  627  err = MPI_Type_free(&compound_type0);
 
  629  err = MPI_Comm_free(&neigh_comm0);
 
  632  std::vector<std::int32_t> index_pos_to_buffer(indices.size(), -1);
 
  633  for (std::size_t i = 0; i < src_to_index.size(); ++i)
 
  634    index_pos_to_buffer[std::get<2>(src_to_index[i])] = i;
 
  637  std::vector<T> x_new(shape[1] * indices.size());
 
  638  for (std::size_t i = 0; i < indices.size(); ++i)
 
  640    const std::int64_t index = indices[i];
 
  641    if (index >= rank_offset and index < (rank_offset + shape0_local))
 
  644      auto local_index = index - rank_offset;
 
  645      std::copy_n(std::next(x.begin(), shape[1] * local_index), shape[1],
 
  646                  std::next(x_new.begin(), shape[1] * i));
 
  654        auto local_index = index - postoffice_range[0];
 
  655        std::int32_t pos = post_indices_map[local_index];
 
  657        std::copy_n(std::next(post_x.begin(), shape[1] * pos), shape[1],
 
  658                    std::next(x_new.begin(), shape[1] * i));
 
  663        std::int32_t pos = index_pos_to_buffer[i];
 
  665        std::copy_n(std::next(recv_buffer_data.begin(), shape[1] * pos),
 
  666                    shape[1], std::next(x_new.begin(), shape[1] * i));