133 if constexpr (BS0 != -1 or BS1 != -1)
135 if (bs[0] != BS0 or bs[1] != BS1)
136 throw std::runtime_error(
"Invalid template parameters/block size");
142 const std::int32_t n_row = row_map_A->size_local();
143 const std::int32_t n_col = col_map_A->size_local();
148 std::span<const int> src = col_map_A->src();
149 std::span<const int> dest = col_map_A->dest();
159 MPI_Comm comm = row_map_A->comm();
161 if (comm_size == 1 or (src.empty() && dest.empty()))
164 auto at_col_map = std::make_shared<dolfinx::common::IndexMap>(comm, n_row);
165 auto at_row_map = std::make_shared<dolfinx::common::IndexMap>(comm, n_col);
167 auto [at_cols, at_rp, at_vals] = impl::local_transpose<T, BS0, BS1>(A);
170 std::vector<std::int32_t> off_diag_offsets(n_col);
171 for (
int j = 0; j < n_col; ++j)
172 off_diag_offsets[j] = at_rp[j + 1] - at_rp[j];
175 at_rp, off_diag_offsets, {bs[1], bs[0]}};
177 std::ranges::copy(at_vals, At.
values().begin());
183 auto cols_A = A.
cols();
186 std::span<const std::int64_t> ghost_col_globals = col_map_A->ghosts();
190 auto rank_to_nbr = [&src](
int r) ->
int
192 auto it = std::lower_bound(src.begin(), src.end(), r);
193 assert(it != src.end() && *it == r);
194 return static_cast<int>(std::distance(src.begin(), it));
196 std::vector<int> ghost_col_owners(col_map_A->owners().size());
197 std::transform(col_map_A->owners().begin(), col_map_A->owners().end(),
198 ghost_col_owners.begin(), rank_to_nbr);
203 MPI_Dist_graph_create_adjacent(
204 comm,
static_cast<int>(dest.size()), dest.data(), MPI_UNWEIGHTED,
205 static_cast<int>(src.size()), src.data(), MPI_UNWEIGHTED, MPI_INFO_NULL,
211 std::vector<int> send_count(src.size(), 0);
212 for (std::int32_t i = 0; i < n_row; ++i)
214 for (std::int64_t k = offset_row_A[i]; k < row_ptr_A[i + 1]; ++k)
216 const std::int32_t j = cols_A[k] - n_col;
217 ++send_count[ghost_col_owners[j]];
221 std::vector<int> recv_count(dest.size(), 0);
222 MPI_Neighbor_alltoall(send_count.data(), 1, MPI_INT, recv_count.data(), 1,
223 MPI_INT, neigh_comm);
225 std::vector<int> send_disp(src.size() + 1, 0);
226 std::partial_sum(send_count.begin(), send_count.end(),
227 std::next(send_disp.begin()));
228 std::vector<int> recv_disp(dest.size() + 1, 0);
229 std::partial_sum(recv_count.begin(), recv_count.end(),
230 std::next(recv_disp.begin()));
231 const int total_send = send_disp.back();
232 const int total_recv = recv_disp.back();
238 auto [row_start, row_end] = row_map_A->local_range();
244 std::vector<std::int64_t> send_row_gidx(total_send);
245 std::vector<std::int64_t> send_col_gidx(total_send);
246 std::vector<T> send_vals(total_send * bs[0] * bs[1]);
248 int nbs = bs[0] * bs[1];
249 std::vector<int> pos = send_disp;
250 for (std::int32_t i = 0; i < n_row; ++i)
252 const std::int64_t global_i = row_start + i;
253 for (std::int64_t k = offset_row_A[i]; k < row_ptr_A[i + 1]; ++k)
255 const std::int32_t j = cols_A[k] - n_col;
256 const int sidx = ghost_col_owners[j];
257 const int p = pos[sidx]++;
258 send_row_gidx[p] = global_i;
259 send_col_gidx[p] = ghost_col_globals[j];
260 std::copy_n(std::next(vals_A.begin(), k * nbs), nbs,
261 std::next(send_vals.begin(), p * nbs));
269 std::vector<std::int64_t> recv_row_gidx(total_recv);
270 std::vector<std::int64_t> recv_col_gidx(total_recv);
271 std::vector<T> recv_vals(total_recv * bs[0] * bs[1]);
273 MPI_Request data_reqs[3];
275 if (bs[0] * bs[1] == 1)
280 MPI_Type_commit(&mpi_T);
283 MPI_Ineighbor_alltoallv(send_row_gidx.data(), send_count.data(),
284 send_disp.data(), MPI_INT64_T, recv_row_gidx.data(),
285 recv_count.data(), recv_disp.data(), MPI_INT64_T,
286 neigh_comm, &data_reqs[0]);
287 MPI_Ineighbor_alltoallv(send_col_gidx.data(), send_count.data(),
288 send_disp.data(), MPI_INT64_T, recv_col_gidx.data(),
289 recv_count.data(), recv_disp.data(), MPI_INT64_T,
290 neigh_comm, &data_reqs[1]);
291 MPI_Ineighbor_alltoallv(send_vals.data(), send_count.data(), send_disp.data(),
292 mpi_T, recv_vals.data(), recv_count.data(),
293 recv_disp.data(), mpi_T, neigh_comm, &data_reqs[2]);
296 auto [at0_cols, at0_rp, at0_vals] = impl::local_transpose<T, BS0, BS1>(A);
301 MPI_Waitall(3, data_reqs, MPI_STATUSES_IGNORE);
302 if (bs[0] * bs[1] != 1)
303 MPI_Type_free(&mpi_T);
304 MPI_Comm_free(&neigh_comm);
309 auto [col_start, col_end] = col_map_A->local_range();
312 std::vector<std::int64_t> new_row_count(n_col);
313 std::adjacent_difference(std::next(at0_rp.begin()), at0_rp.end(),
314 new_row_count.begin());
315 for (std::int64_t c : recv_col_gidx)
317 const std::int32_t local_col =
static_cast<std::int32_t
>(c - col_start);
318 assert(local_col >= 0 && local_col < n_col);
319 ++new_row_count[local_col];
321 std::vector<std::int64_t> at_row_ptr(new_row_count.size() + 1, 0);
322 std::partial_sum(new_row_count.begin(), new_row_count.end(),
323 std::next(at_row_ptr.begin()));
326 std::vector<std::int64_t> ghost_row_globals;
327 std::vector<int> ghost_row_owners;
330 std::vector<std::pair<std::int64_t, int>> ghost_row_pairs;
331 ghost_row_pairs.reserve(recv_disp.back());
332 for (std::size_t p = 0; p < dest.size(); ++p)
334 for (
int k = recv_disp[p]; k < recv_disp[p + 1]; ++k)
336 const std::int64_t gidx = recv_row_gidx[k];
337 assert(gidx < row_start || gidx >= row_end);
338 ghost_row_pairs.emplace_back(gidx,
static_cast<int>(dest[p]));
345 std::sort(ghost_row_pairs.begin(), ghost_row_pairs.end());
346 ghost_row_pairs.erase(
347 std::unique(ghost_row_pairs.begin(), ghost_row_pairs.end()),
348 ghost_row_pairs.end());
350 ghost_row_globals.reserve(ghost_row_pairs.size());
351 ghost_row_owners.reserve(ghost_row_pairs.size());
352 for (
auto& [g, o] : ghost_row_pairs)
354 ghost_row_globals.push_back(g);
355 ghost_row_owners.push_back(o);
359 auto at_col_map = std::make_shared<dolfinx::common::IndexMap>(
360 comm, n_row, ghost_row_globals, ghost_row_owners);
363 std::vector<std::int32_t> new_col_local(recv_disp.back());
364 at_col_map->global_to_local(recv_row_gidx, new_col_local);
367 std::vector<std::int32_t> at_cols(at_row_ptr.back());
368 std::vector<T> at_vals(at_row_ptr.back() * bs[0] * bs[1]);
369 for (std::int32_t i = 0; i < n_col; ++i)
371 std::copy(std::next(at0_cols.begin(), at0_rp[i]),
372 std::next(at0_cols.begin(), at0_rp[i + 1]),
373 std::next(at_cols.begin(), at_row_ptr[i]));
374 std::copy(std::next(at0_vals.begin(), at0_rp[i] * bs[0] * bs[1]),
375 std::next(at0_vals.begin(), at0_rp[i + 1] * bs[0] * bs[1]),
376 std::next(at_vals.begin(), at_row_ptr[i] * bs[0] * bs[1]));
382 std::vector<std::int32_t> cursor(n_col);
383 std::vector<std::int32_t> off_diag_offsets(n_col);
384 for (
int j = 0; j < n_col; ++j)
386 off_diag_offsets[j] = at0_rp[j + 1] - at0_rp[j];
387 cursor[j] =
static_cast<std::int32_t
>(at_row_ptr[j]) + off_diag_offsets[j];
390 for (std::size_t p = 0; p < dest.size(); ++p)
392 for (
int k = recv_disp[p]; k < recv_disp[p + 1]; ++k)
394 const std::int32_t local_col
395 =
static_cast<std::int32_t
>(recv_col_gidx[k] - col_start);
396 assert(local_col >= 0 && local_col < n_col);
397 const int pos = cursor[local_col]++;
398 at_cols[pos] = new_col_local[k];
400 if constexpr (BS0 < 0 or BS1 < 0)
402 for (
int k0 = 0; k0 < bs[0]; ++k0)
403 for (
int k1 = 0; k1 < bs[1]; ++k1)
404 at_vals[pos * bs[0] * bs[1] + k1 * bs[0] + k0]
405 = recv_vals[k * bs[0] * bs[1] + k0 * bs[1] + k1];
409 for (
int k0 = 0; k0 < BS0; ++k0)
410 for (
int k1 = 0; k1 < BS1; ++k1)
411 at_vals[pos * BS0 * BS1 + k1 * BS0 + k0]
412 = recv_vals[k * BS0 * BS1 + k0 * BS1 + k1];
421 auto at_row_map = std::make_shared<dolfinx::common::IndexMap>(comm, n_col);
424 at_row_ptr, off_diag_offsets, {bs[1], bs[0]}};
427 std::ranges::copy(at_vals, At.
values().begin());
431 spdlog::info(
"Transpose: ({} x {}) -> ({} x {}), nnz {}",
432 row_map_A->size_global(), col_map_A->size_global(),
433 col_map_A->size_global(), row_map_A->size_global(),