409    : _index_maps({p.index_map(0),
 
  410                   std::make_shared<common::IndexMap>(p.column_index_map())}),
 
  411      _block_mode(mode), _bs({p.block_size(0), p.block_size(1)}),
 
  412      _data(p.num_nonzeros() * _bs[0] * _bs[1], 0),
 
  413      _cols(p.graph().first.begin(), p.graph().first.end()),
 
  414      _row_ptr(p.graph().second.begin(), p.graph().second.end()),
 
  420    for (
int i = 0; i < 2; ++i)
 
  422      const auto im = _index_maps[i];
 
  423      const int size_local = im->size_local() * _bs[i];
 
  424      std::span ghost_i = im->ghosts();
 
  425      std::vector<std::int64_t> ghosts;
 
  426      const std::vector<int> ghost_owner_i(im->owners().begin(),
 
  428      std::vector<int> src_rank;
 
  429      for (std::size_t j = 0; j < ghost_i.size(); ++j)
 
  431        for (
int k = 0; k < _bs[i]; ++k)
 
  433          ghosts.push_back(ghost_i[j] * _bs[i] + k);
 
  434          src_rank.push_back(ghost_owner_i[j]);
 
  437      const std::array<std::vector<int>, 2> src_dest0
 
  438          = {std::vector(_index_maps[i]->src().begin(),
 
  439                         _index_maps[i]->src().end()),
 
  440             std::vector(_index_maps[i]->dest().begin(),
 
  441                         _index_maps[i]->dest().end())};
 
  442      _index_maps[i] = std::make_shared<common::IndexMap>(
 
  443          _index_maps[i]->comm(), size_local, src_dest0, ghosts, src_rank);
 
  448    column_container_type new_cols;
 
  449    new_cols.reserve(_data.size());
 
  450    rowptr_container_type new_row_ptr = {0};
 
  451    new_row_ptr.reserve(_row_ptr.size() * _bs[0]);
 
  452    std::span<const std::int32_t> num_diag_nnz = p.off_diagonal_offsets();
 
  453    for (std::size_t i = 0; i < _row_ptr.size() - 1; ++i)
 
  456      for (
int q0 = 0; q0 < _bs[0]; ++q0)
 
  458        _off_diagonal_offset.push_back(new_row_ptr.back()
 
  459                                       + num_diag_nnz[i] * _bs[1]);
 
  460        for (
auto j = _row_ptr[i]; j < _row_ptr[i + 1]; ++j)
 
  462          for (
int q1 = 0; q1 < _bs[1]; ++q1)
 
  463            new_cols.push_back(_cols[j] * _bs[1] + q1);
 
  465        new_row_ptr.push_back(new_cols.size());
 
  469    _row_ptr = new_row_ptr;
 
  476    std::span<const std::int32_t> num_diag_nnz = p.off_diagonal_offsets();
 
  477    _off_diagonal_offset.reserve(num_diag_nnz.size());
 
  478    std::ranges::transform(num_diag_nnz, _row_ptr,
 
  479                           std::back_inserter(_off_diagonal_offset),
 
  484  const std::array local_size
 
  485      = {_index_maps[0]->size_local(), _index_maps[1]->size_local()};
 
  487      = {_index_maps[0]->local_range(), _index_maps[1]->local_range()};
 
  488  std::span ghosts1 = _index_maps[1]->ghosts();
 
  490  std::span ghosts0 = _index_maps[0]->ghosts();
 
  491  std::span src_ranks = _index_maps[0]->src();
 
  492  std::span dest_ranks = _index_maps[0]->dest();
 
  496  MPI_Dist_graph_create_adjacent(_index_maps[0]->comm(), dest_ranks.size(),
 
  497                                 dest_ranks.data(), MPI_UNWEIGHTED,
 
  498                                 src_ranks.size(), src_ranks.data(),
 
  499                                 MPI_UNWEIGHTED, MPI_INFO_NULL, 
false, &comm);
 
  504  _ghost_row_to_rank.reserve(_index_maps[0]->owners().
size());
 
  505  for (
int r : _index_maps[0]->owners())
 
  507    auto it = std::ranges::lower_bound(src_ranks, r);
 
  508    assert(it != src_ranks.end() and *it == r);
 
  509    int pos = std::distance(src_ranks.begin(), it);
 
  510    _ghost_row_to_rank.push_back(pos);
 
  514  std::vector<std::int32_t> data_per_proc(src_ranks.size(), 0);
 
  515  for (std::size_t i = 0; i < _ghost_row_to_rank.size(); ++i)
 
  517    assert(_ghost_row_to_rank[i] < data_per_proc.size());
 
  518    std::size_t pos = local_size[0] + i;
 
  519    data_per_proc[_ghost_row_to_rank[i]] += _row_ptr[pos + 1] - _row_ptr[pos];
 
  523  _val_send_disp.resize(src_ranks.size() + 1, 0);
 
  524  std::partial_sum(data_per_proc.begin(), data_per_proc.end(),
 
  525                   std::next(_val_send_disp.begin()));
 
  528  std::vector<std::int64_t> ghost_index_data(2 * _val_send_disp.back());
 
  530    std::vector<int> insert_pos = _val_send_disp;
 
  531    for (std::size_t i = 0; i < _ghost_row_to_rank.size(); ++i)
 
  533      const int rank = _ghost_row_to_rank[i];
 
  534      int row_id = local_size[0] + i;
 
  535      for (
int j = _row_ptr[row_id]; j < _row_ptr[row_id + 1]; ++j)
 
  538        const std::int32_t idx_pos = 2 * insert_pos[
rank];
 
  541        ghost_index_data[idx_pos] = ghosts0[i];
 
  542        if (std::int32_t col_local = _cols[j]; col_local < local_size[1])
 
  543          ghost_index_data[idx_pos + 1] = col_local + local_range[1][0];
 
  545          ghost_index_data[idx_pos + 1] = ghosts1[col_local - local_size[1]];
 
  547        insert_pos[
rank] += 1;
 
  553  std::vector<std::int64_t> ghost_index_array;
 
  554  std::vector<int> recv_disp;
 
  556    std::vector<int> send_sizes;
 
  557    std::ranges::transform(data_per_proc, std::back_inserter(send_sizes),
 
  558                           [](
auto x) { 
return 2 * x; });
 
  560    std::vector<int> recv_sizes(dest_ranks.size());
 
  561    send_sizes.reserve(1);
 
  562    recv_sizes.reserve(1);
 
  563    MPI_Neighbor_alltoall(send_sizes.data(), 1, MPI_INT, recv_sizes.data(), 1,
 
  564                          MPI_INT, _comm.comm());
 
  567    std::vector<int> send_disp = {0};
 
  568    std::partial_sum(send_sizes.begin(), send_sizes.end(),
 
  569                     std::back_inserter(send_disp));
 
  571    std::partial_sum(recv_sizes.begin(), recv_sizes.end(),
 
  572                     std::back_inserter(recv_disp));
 
  574    ghost_index_array.resize(recv_disp.back());
 
  575    MPI_Neighbor_alltoallv(ghost_index_data.data(), send_sizes.data(),
 
  576                           send_disp.data(), MPI_INT64_T,
 
  577                           ghost_index_array.data(), recv_sizes.data(),
 
  578                           recv_disp.data(), MPI_INT64_T, _comm.comm());
 
  583  _val_recv_disp.resize(recv_disp.size());
 
  584  const int bs2 = _bs[0] * _bs[1];
 
  585  std::ranges::transform(recv_disp, _val_recv_disp.begin(),
 
  586                         [&bs2](
auto d) { return bs2 * d / 2; });
 
  587  std::ranges::transform(_val_send_disp, _val_send_disp.begin(),
 
  588                         [&bs2](
auto d) { return d * bs2; });
 
  591  std::vector<std::pair<std::int64_t, std::int32_t>> global_to_local;
 
  592  global_to_local.reserve(ghosts1.size());
 
  593  for (std::int64_t idx : ghosts1)
 
  594    global_to_local.push_back({idx, global_to_local.size() + local_size[1]});
 
  595  std::ranges::sort(global_to_local);
 
  599  for (std::size_t i = 0; i < ghost_index_array.size(); i += 2)
 
  602    const std::int32_t local_row = ghost_index_array[i] - 
local_range[0][0];
 
  603    assert(local_row >= 0 and local_row < local_size[0]);
 
  606    std::int32_t local_col = ghost_index_array[i + 1] - 
local_range[1][0];
 
  607    if (local_col < 0 or local_col >= local_size[1])
 
  609      auto it = std::ranges::lower_bound(
 
  610          global_to_local, std::pair(ghost_index_array[i + 1], -1),
 
  611          [](
auto& a, 
auto& b) { 
return a.first < b.first; });
 
  612      assert(it != global_to_local.end()
 
  613             and it->first == ghost_index_array[i + 1]);
 
  614      local_col = it->second;
 
  616    auto cit0 = std::next(_cols.begin(), _row_ptr[local_row]);
 
  617    auto cit1 = std::next(_cols.begin(), _row_ptr[local_row + 1]);
 
  620    auto cit = std::lower_bound(cit0, cit1, local_col);
 
  622    assert(*cit == local_col);
 
  623    std::size_t d = std::distance(_cols.begin(), cit);
 
  624    _unpack_pos.push_back(d);
 
 
  653  const std::int32_t local_size0 = _index_maps[0]->size_local();
 
  654  const std::int32_t num_ghosts0 = _index_maps[0]->num_ghosts();
 
  655  const int bs2 = _bs[0] * _bs[1];
 
  658  std::vector<int> insert_pos = _val_send_disp;
 
  659  _ghost_value_data.resize(_val_send_disp.back());
 
  660  for (
int i = 0; i < num_ghosts0; ++i)
 
  662    const int rank = _ghost_row_to_rank[i];
 
  666    const std::int32_t val_pos = insert_pos[rank];
 
  667    std::copy(std::next(_data.data(), _row_ptr[local_size0 + i] * bs2),
 
  668              std::next(_data.data(), _row_ptr[local_size0 + i + 1] * bs2),
 
  669              std::next(_ghost_value_data.begin(), val_pos));
 
  671        += bs2 * (_row_ptr[local_size0 + i + 1] - _row_ptr[local_size0 + i]);
 
  674  _ghost_value_data_in.resize(_val_recv_disp.back());
 
  677  std::vector<int> val_send_count(_val_send_disp.size() - 1);
 
  678  std::adjacent_difference(std::next(_val_send_disp.begin()),
 
  679                           _val_send_disp.end(), val_send_count.begin());
 
  681  std::vector<int> val_recv_count(_val_recv_disp.size() - 1);
 
  682  std::adjacent_difference(std::next(_val_recv_disp.begin()),
 
  683                           _val_recv_disp.end(), val_recv_count.begin());
 
  685  int status = MPI_Ineighbor_alltoallv(
 
  686      _ghost_value_data.data(), val_send_count.data(), _val_send_disp.data(),
 
  688      val_recv_count.data(), _val_recv_disp.data(),
 
  690  assert(status == MPI_SUCCESS);