Note: this is documentation for an old release. View the latest documentation at docs.fenicsproject.org/dolfinx/v0.9.0/cpp/doxygen/d0/d3b/MPI_8h_source.html
DOLFINx 0.6.0
DOLFINx C++ interface
MPI.h
1// Copyright (C) 2007-2014 Magnus Vikstrøm and Garth N. Wells
2//
3// This file is part of DOLFINx (https://www.fenicsproject.org)
4//
5// SPDX-License-Identifier: LGPL-3.0-or-later
6
7#pragma once
8
9#include <algorithm>
10#include <array>
11#include <cassert>
12#include <complex>
13#include <cstdint>
14#include <dolfinx/common/Timer.h>
15#include <dolfinx/common/log.h>
16#include <dolfinx/graph/AdjacencyList.h>
17#include <numeric>
18#include <set>
19#include <span>
20#include <tuple>
21#include <type_traits>
22#include <utility>
23#include <vector>
24
25#define MPICH_IGNORE_CXX_SEEK 1
26#include <mpi.h>
27
29namespace dolfinx::MPI
30{
31
33enum class tag : int
34{
35 consensus_pcx,
36 consensus_pex
37};
38
41class Comm
42{
43public:
45 explicit Comm(MPI_Comm comm, bool duplicate = true);
46
48 Comm(const Comm& comm) noexcept;
49
51 Comm(Comm&& comm) noexcept;
52
53 // Disable copy assignment operator
54 Comm& operator=(const Comm& comm) = delete;
55
57 Comm& operator=(Comm&& comm) noexcept;
58
60 ~Comm();
61
63 MPI_Comm comm() const noexcept;
64
65private:
66 // MPI communicator
67 MPI_Comm _comm;
68};
69
71int rank(MPI_Comm comm);
72
75int size(MPI_Comm comm);
76
81void check_error(MPI_Comm comm, int code);
82
90constexpr std::array<std::int64_t, 2> local_range(int rank, std::int64_t N,
91 int size)
92{
93 assert(rank >= 0);
94 assert(N >= 0);
95 assert(size > 0);
96
97 // Compute number of items per rank and remainder
98 const std::int64_t n = N / size;
99 const std::int64_t r = N % size;
100
101 // Compute local range
102 if (rank < r)
103 return {rank * (n + 1), rank * (n + 1) + n + 1};
104 else
105 return {rank * n + r, rank * n + r + n};
106}
107
114constexpr int index_owner(int size, std::size_t index, std::size_t N)
115{
116 assert(index < N);
117
118 // Compute number of items per rank and remainder
119 const std::size_t n = N / size;
120 const std::size_t r = N % size;
121
122 if (index < r * (n + 1))
123 {
124 // First r ranks own n + 1 indices
125 return index / (n + 1);
126 }
127 else
128 {
129 // Remaining ranks own n indices
130 return r + (index - r * (n + 1)) / n;
131 }
132}
133
157std::vector<int> compute_graph_edges_pcx(MPI_Comm comm,
158 std::span<const int> edges);
159
184std::vector<int> compute_graph_edges_nbx(MPI_Comm comm,
185 std::span<const int> edges);
186
206template <typename T>
207std::pair<std::vector<std::int32_t>, std::vector<T>>
208distribute_to_postoffice(MPI_Comm comm, std::span<const T> x,
209 std::array<std::int64_t, 2> shape,
210 std::int64_t rank_offset);
211
233template <typename T>
234std::vector<T> distribute_from_postoffice(MPI_Comm comm,
235 std::span<const std::int64_t> indices,
236 std::span<const T> x,
237 std::array<std::int64_t, 2> shape,
238 std::int64_t rank_offset);
239
263template <typename T>
264std::vector<T> distribute_data(MPI_Comm comm,
265 std::span<const std::int64_t> indices,
266 std::span<const T> x, int shape1);
267
268template <typename T>
269struct dependent_false : std::false_type
270{
271};
272
274template <typename T>
275constexpr MPI_Datatype mpi_type()
276{
277 if constexpr (std::is_same_v<T, float>)
278 return MPI_FLOAT;
279 else if constexpr (std::is_same_v<T, double>)
280 return MPI_DOUBLE;
281 else if constexpr (std::is_same_v<T, std::complex<double>>)
282 return MPI_C_DOUBLE_COMPLEX;
283 else if constexpr (std::is_same_v<T, std::complex<float>>)
284 return MPI_C_FLOAT_COMPLEX;
285 else if constexpr (std::is_same_v<T, short int>)
286 return MPI_SHORT;
287 else if constexpr (std::is_same_v<T, int>)
288 return MPI_INT;
289 else if constexpr (std::is_same_v<T, unsigned int>)
290 return MPI_UNSIGNED;
291 else if constexpr (std::is_same_v<T, long int>)
292 return MPI_LONG;
293 else if constexpr (std::is_same_v<T, unsigned long>)
294 return MPI_UNSIGNED_LONG;
295 else if constexpr (std::is_same_v<T, long long>)
296 return MPI_LONG_LONG;
297 else if constexpr (std::is_same_v<T, unsigned long long>)
298 return MPI_UNSIGNED_LONG_LONG;
299 else if constexpr (std::is_same_v<T, bool>)
300 return MPI_C_BOOL;
301 else if constexpr (std::is_same_v<T, std::int8_t>)
302 return MPI_INT8_T;
303 else
304 // Issue compile time error
305 static_assert(!std::is_same_v<T, T>);
306}
307
308//---------------------------------------------------------------------------
309template <typename T>
310std::pair<std::vector<std::int32_t>, std::vector<T>>
311distribute_to_postoffice(MPI_Comm comm, std::span<const T> x,
312 std::array<std::int64_t, 2> shape,
313 std::int64_t rank_offset)
314{
315 const int size = dolfinx::MPI::size(comm);
316 const int rank = dolfinx::MPI::rank(comm);
317 assert(x.size() % shape[1] == 0);
318 const std::int32_t shape0_local = x.size() / shape[1];
319
320 LOG(2) << "Sending data to post offices (distribute_to_postoffice)";
321
322 // Post office ranks will receive data from this rank
323 std::vector<int> row_to_dest(shape0_local);
324 for (std::int32_t i = 0; i < shape0_local; ++i)
325 {
326 int dest = MPI::index_owner(size, i + rank_offset, shape[0]);
327 row_to_dest[i] = dest;
328 }
329
330 // Build list of (dest, positions) for each row that doesn't belong to
331 // this rank, then sort
332 std::vector<std::array<std::int32_t, 2>> dest_to_index;
333 dest_to_index.reserve(shape0_local);
334 for (std::int32_t i = 0; i < shape0_local; ++i)
335 {
336 std::size_t idx = i + rank_offset;
337 if (int dest = MPI::index_owner(size, idx, shape[0]); dest != rank)
338 dest_to_index.push_back({dest, i});
339 }
340 std::sort(dest_to_index.begin(), dest_to_index.end());
341
342 // Build list of neighbour dest ranks and count number of items (rows
343 // of x) to send to each dest post office (by neighbourhood rank)
344 std::vector<int> dest;
345 std::vector<std::int32_t> num_items_per_dest,
346 pos_to_neigh_rank(shape0_local, -1);
347 {
348 auto it = dest_to_index.begin();
349 while (it != dest_to_index.end())
350 {
351 const int neigh_rank = dest.size();
352
353 // Store global rank
354 dest.push_back((*it)[0]);
355
356 // Find iterator to next global rank
357 auto it1
358 = std::find_if(it, dest_to_index.end(),
359 [r = dest.back()](auto& idx) { return idx[0] != r; });
360
361 // Store number of items for current rank
362 num_items_per_dest.push_back(std::distance(it, it1));
363
364 // Map from local x index to local destination rank
365 for (auto e = it; e != it1; ++e)
366 pos_to_neigh_rank[(*e)[1]] = neigh_rank;
367
368 // Advance iterator
369 it = it1;
370 }
371 }
372
373 // Determine source ranks
374 const std::vector<int> src = MPI::compute_graph_edges_nbx(comm, dest);
375 LOG(INFO)
376 << "Number of neighbourhood source ranks in distribute_to_postoffice: "
377 << src.size();
378
379 // Create neighbourhood communicator for sending data to post offices
380 MPI_Comm neigh_comm;
381 int err = MPI_Dist_graph_create_adjacent(
382 comm, src.size(), src.data(), MPI_UNWEIGHTED, dest.size(), dest.data(),
383 MPI_UNWEIGHTED, MPI_INFO_NULL, false, &neigh_comm);
384 dolfinx::MPI::check_error(comm, err);
385
386 // Compute send displacements
387 std::vector<std::int32_t> send_disp = {0};
388 std::partial_sum(num_items_per_dest.begin(), num_items_per_dest.end(),
389 std::back_inserter(send_disp));
390
391 // Pack send buffers
392 std::vector<T> send_buffer_data(shape[1] * send_disp.back());
393 std::vector<std::int64_t> send_buffer_index(send_disp.back());
394 {
395 std::vector<std::int32_t> send_offsets = send_disp;
396 for (std::int32_t i = 0; i < shape0_local; ++i)
397 {
398 if (int neigh_dest = pos_to_neigh_rank[i]; neigh_dest != -1)
399 {
400 std::size_t pos = send_offsets[neigh_dest];
401 send_buffer_index[pos] = i + rank_offset;
402 std::copy_n(std::next(x.begin(), i * shape[1]), shape[1],
403 std::next(send_buffer_data.begin(), shape[1] * pos));
404 ++send_offsets[neigh_dest];
405 }
406 }
407 }
408
409 // Send to each post office (destination) the number of items that I
410 // will be sending to it
411 std::vector<int> num_items_recv(src.size());
412 num_items_per_dest.reserve(1);
413 num_items_recv.reserve(1);
414 err = MPI_Neighbor_alltoall(num_items_per_dest.data(), 1, MPI_INT,
415 num_items_recv.data(), 1, MPI_INT, neigh_comm);
416 dolfinx::MPI::check_error(comm, err);
417
418 // Prepare receive displacement and buffers
419 std::vector<std::int32_t> recv_disp(num_items_recv.size() + 1, 0);
420 std::partial_sum(num_items_recv.begin(), num_items_recv.end(),
421 std::next(recv_disp.begin()));
422
423 // Send/receive global indices
424 std::vector<std::int64_t> recv_buffer_index(recv_disp.back());
425 err = MPI_Neighbor_alltoallv(
426 send_buffer_index.data(), num_items_per_dest.data(), send_disp.data(),
427 MPI_INT64_T, recv_buffer_index.data(), num_items_recv.data(),
428 recv_disp.data(), MPI_INT64_T, neigh_comm);
429 dolfinx::MPI::check_error(comm, err);
430
431 // Send/receive data (x)
432 MPI_Datatype compound_type;
433 MPI_Type_contiguous(shape[1], dolfinx::MPI::mpi_type<T>(), &compound_type);
434 MPI_Type_commit(&compound_type);
435 std::vector<T> recv_buffer_data(shape[1] * recv_disp.back());
436 err = MPI_Neighbor_alltoallv(
437 send_buffer_data.data(), num_items_per_dest.data(), send_disp.data(),
438 compound_type, recv_buffer_data.data(), num_items_recv.data(),
439 recv_disp.data(), compound_type, neigh_comm);
440 dolfinx::MPI::check_error(comm, err);
441 err = MPI_Type_free(&compound_type);
442 dolfinx::MPI::check_error(comm, err);
443 err = MPI_Comm_free(&neigh_comm);
444 dolfinx::MPI::check_error(comm, err);
445
446 LOG(2) << "Completed send data to post offices.";
447
448 // Convert to local indices
449 const std::int64_t r0 = MPI::local_range(rank, shape[0], size)[0];
450 std::vector<std::int32_t> index_local(recv_buffer_index.size());
451 std::transform(recv_buffer_index.cbegin(), recv_buffer_index.cend(),
452 index_local.begin(), [r0](auto idx) { return idx - r0; });
453
454 return {index_local, recv_buffer_data};
455}
456//---------------------------------------------------------------------------
457template <typename T>
458std::vector<T> distribute_from_postoffice(MPI_Comm comm,
459 std::span<const std::int64_t> indices,
460 std::span<const T> x,
461 std::array<std::int64_t, 2> shape,
462 std::int64_t rank_offset)
463{
464 common::Timer timer("Distribute row-wise data (scalable)");
465 assert(shape[1] > 0);
466
467 const int size = dolfinx::MPI::size(comm);
468 const int rank = dolfinx::MPI::rank(comm);
469 assert(x.size() % shape[1] == 0);
470 const std::int64_t shape0_local = x.size() / shape[1];
471
472 // 0. Send x data to/from post offices
473
474 // Send/receive x data to post office ranks (only for rows that need
475 // to be communicated)
476 auto [post_indices, post_x] = MPI::distribute_to_postoffice(
477 comm, x, {shape[0], shape[1]}, rank_offset);
478 assert(post_indices.size() == post_x.size() / shape[1]);
479
480 // 1. Send request to post office ranks for data
481
482 // Build list of (src rank, global index, index position) for each
483 // entry in 'indices' that doesn't belong to this rank, then sort
484 std::vector<std::tuple<int, std::int64_t, std::int32_t>> src_to_index;
485 for (std::size_t i = 0; i < indices.size(); ++i)
486 {
487 std::size_t idx = indices[i];
488 if (int src = MPI::index_owner(size, idx, shape[0]); src != rank)
489 src_to_index.push_back({src, idx, i});
490 }
491 std::sort(src_to_index.begin(), src_to_index.end());
492
493 // Build list of neighbour src ranks and count number of items (rows
494 // of x) to receive from each src post office (by neighbourhood rank)
495 std::vector<std::int32_t> num_items_per_src;
496 std::vector<int> src;
497 {
498 auto it = src_to_index.begin();
499 while (it != src_to_index.end())
500 {
501 src.push_back(std::get<0>(*it));
502 auto it1 = std::find_if(it, src_to_index.end(),
503 [r = src.back()](auto& idx)
504 { return std::get<0>(idx) != r; });
505 num_items_per_src.push_back(std::distance(it, it1));
506 it = it1;
507 }
508 }
509
510 // Determine 'delivery' destination ranks (ranks that want data from
511 // me)
512 const std::vector<int> dest
513 = dolfinx::MPI::compute_graph_edges_nbx(comm, src);
514 LOG(INFO) << "Neighbourhood destination ranks from post office in "
515 "distribute_data (rank, num dests, num dests/mpi_size): "
516 << rank << ", " << dest.size() << ", "
517 << static_cast<double>(dest.size()) / size;
518
519 // Create neighbourhood communicator for sending requests to post office
520 // ranks (src), and for receiving requests made to my post office (dest)
521 MPI_Comm neigh_comm0;
522 int err = MPI_Dist_graph_create_adjacent(
523 comm, dest.size(), dest.data(), MPI_UNWEIGHTED, src.size(), src.data(),
524 MPI_UNWEIGHTED, MPI_INFO_NULL, false, &neigh_comm0);
525 dolfinx::MPI::check_error(comm, err);
526
527 // Communicate number of requests to each source
528 std::vector<int> num_items_recv(dest.size());
529 num_items_per_src.reserve(1);
530 num_items_recv.reserve(1);
531 err = MPI_Neighbor_alltoall(num_items_per_src.data(), 1, MPI_INT,
532 num_items_recv.data(), 1, MPI_INT, neigh_comm0);
533 dolfinx::MPI::check_error(comm, err);
534
535 // Prepare send/receive displacements
536 std::vector<std::int32_t> send_disp = {0};
537 std::partial_sum(num_items_per_src.begin(), num_items_per_src.end(),
538 std::back_inserter(send_disp));
539 std::vector<std::int32_t> recv_disp = {0};
540 std::partial_sum(num_items_recv.begin(), num_items_recv.end(),
541 std::back_inserter(recv_disp));
542
543 // Pack my requested indices (global) in send buffer ready to send to
544 // post offices
545 assert(send_disp.back() == (int)src_to_index.size());
546 std::vector<std::int64_t> send_buffer_index(src_to_index.size());
547 std::transform(src_to_index.cbegin(), src_to_index.cend(),
548 send_buffer_index.begin(),
549 [](auto& x) { return std::get<1>(x); });
550
551 // Prepare the receive buffer
552 std::vector<std::int64_t> recv_buffer_index(recv_disp.back());
553 err = MPI_Neighbor_alltoallv(
554 send_buffer_index.data(), num_items_per_src.data(), send_disp.data(),
555 MPI_INT64_T, recv_buffer_index.data(), num_items_recv.data(),
556 recv_disp.data(), MPI_INT64_T, neigh_comm0);
557 dolfinx::MPI::check_error(comm, err);
558
559 err = MPI_Comm_free(&neigh_comm0);
560 dolfinx::MPI::check_error(comm, err);
561
562 // 2. Send data (rows of x) back to requesting ranks (transpose of the
563 // preceding communication pattern operation)
564
565 // Build map from local index to post_indices position. Set to -1 for
566 // data that was already on this rank and was therefore not
567 // sent/received via a post office.
568 const std::array<std::int64_t, 2> postoffice_range
569 = MPI::local_range(rank, shape[0], size);
570 std::vector<std::int32_t> post_indices_map(
571 postoffice_range[1] - postoffice_range[0], -1);
572 for (std::size_t i = 0; i < post_indices.size(); ++i)
573 {
574 assert(post_indices[i] < (int)post_indices_map.size());
575 post_indices_map[post_indices[i]] = i;
576 }
577
578 // Build send buffer
579 std::vector<T> send_buffer_data(shape[1] * recv_disp.back());
580 for (std::size_t p = 0; p < recv_disp.size() - 1; ++p)
581 {
582 int offset = recv_disp[p];
583 for (std::int32_t i = recv_disp[p]; i < recv_disp[p + 1]; ++i)
584 {
585 std::int64_t index = recv_buffer_index[i];
586 if (index >= rank_offset and index < (rank_offset + shape0_local))
587 {
588 // I already had this index before any communication
589 std::int32_t local_index = index - rank_offset;
590 std::copy_n(std::next(x.begin(), shape[1] * local_index), shape[1],
591 std::next(send_buffer_data.begin(), shape[1] * offset));
592 }
593 else
594 {
595 // Take from my 'post bag'
596 auto local_index = index - postoffice_range[0];
597 std::int32_t pos = post_indices_map[local_index];
598 assert(pos != -1);
599 std::copy_n(std::next(post_x.begin(), shape[1] * pos), shape[1],
600 std::next(send_buffer_data.begin(), shape[1] * offset));
601 }
602
603 ++offset;
604 }
605 }
606
607 err = MPI_Dist_graph_create_adjacent(
608 comm, src.size(), src.data(), MPI_UNWEIGHTED, dest.size(), dest.data(),
609 MPI_UNWEIGHTED, MPI_INFO_NULL, false, &neigh_comm0);
610 dolfinx::MPI::check_error(comm, err);
611
612 MPI_Datatype compound_type0;
613 MPI_Type_contiguous(shape[1], dolfinx::MPI::mpi_type<T>(), &compound_type0);
614 MPI_Type_commit(&compound_type0);
615
616 std::vector<T> recv_buffer_data(shape[1] * send_disp.back());
617 err = MPI_Neighbor_alltoallv(
618 send_buffer_data.data(), num_items_recv.data(), recv_disp.data(),
619 compound_type0, recv_buffer_data.data(), num_items_per_src.data(),
620 send_disp.data(), compound_type0, neigh_comm0);
621 dolfinx::MPI::check_error(comm, err);
622
623 err = MPI_Type_free(&compound_type0);
624 dolfinx::MPI::check_error(comm, err);
625 err = MPI_Comm_free(&neigh_comm0);
626 dolfinx::MPI::check_error(comm, err);
627
628 std::vector<std::int32_t> index_pos_to_buffer(indices.size(), -1);
629 for (std::size_t i = 0; i < src_to_index.size(); ++i)
630 index_pos_to_buffer[std::get<2>(src_to_index[i])] = i;
631
632 // Assemble the data to return
633 std::vector<T> x_new(shape[1] * indices.size());
634 for (std::size_t i = 0; i < indices.size(); ++i)
635 {
636 const std::int64_t index = indices[i];
637 if (index >= rank_offset and index < (rank_offset + shape0_local))
638 {
639 // Had data from the start in x
640 auto local_index = index - rank_offset;
641 std::copy_n(std::next(x.begin(), shape[1] * local_index), shape[1],
642 std::next(x_new.begin(), shape[1] * i));
643 }
644 else
645 {
646 if (int src = MPI::index_owner(size, index, shape[0]); src == rank)
647 {
648 // In my post office bag
649 auto local_index = index - postoffice_range[0];
650 std::int32_t pos = post_indices_map[local_index];
651 assert(pos != -1);
652 std::copy_n(std::next(post_x.begin(), shape[1] * pos), shape[1],
653 std::next(x_new.begin(), shape[1] * i));
654 }
655 else
656 {
657 // In my received post
658 std::int32_t pos = index_pos_to_buffer[i];
659 assert(pos != -1);
660 std::copy_n(std::next(recv_buffer_data.begin(), shape[1] * pos),
661 shape[1], std::next(x_new.begin(), shape[1] * i));
662 }
663 }
664 }
665
666 return x_new;
667}
668//---------------------------------------------------------------------------
669template <typename T>
670std::vector<T> distribute_data(MPI_Comm comm,
671 std::span<const std::int64_t> indices,
672 std::span<const T> x, int shape1)
673{
674 assert(shape1 > 0);
675 assert(x.size() % shape1 == 0);
676 const std::int64_t shape0_local = x.size() / shape1;
677
678 std::int64_t shape0(0), rank_offset(0);
679 int err
680 = MPI_Allreduce(&shape0_local, &shape0, 1, MPI_INT64_T, MPI_SUM, comm);
681 dolfinx::MPI::check_error(comm, err);
682 err = MPI_Exscan(&shape0_local, &rank_offset, 1, MPI_INT64_T, MPI_SUM, comm);
683 dolfinx::MPI::check_error(comm, err);
684
685 return distribute_from_postoffice(comm, indices, x, {shape0, shape1},
686 rank_offset);
687}
688//---------------------------------------------------------------------------
689
690} // namespace dolfinx::MPI
Comm
A duplicated MPI communicator that manages the lifetime of the wrapped communicator.
Definition: MPI.h:42
~Comm()
Destructor (frees wrapped communicator)
Definition: MPI.cpp:35
MPI_Comm comm() const noexcept
Return the underlying MPI_Comm object.
Definition: MPI.cpp:61
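
As an illustration of the Comm wrapper, the following minimal sketch (assuming the program initialises and finalises MPI itself, as shown) duplicates MPI_COMM_WORLD and relies on ~Comm() to free the duplicate:

#include <dolfinx/common/MPI.h>
#include <iostream>
#include <mpi.h>

int main(int argc, char* argv[])
{
  MPI_Init(&argc, &argv);
  {
    // Duplicate MPI_COMM_WORLD (duplicate = true is the default); the
    // duplicated communicator is freed when 'comm' goes out of scope
    dolfinx::MPI::Comm comm(MPI_COMM_WORLD);
    std::cout << "Rank " << dolfinx::MPI::rank(comm.comm()) << " of "
              << dolfinx::MPI::size(comm.comm()) << "\n";
  }
  MPI_Finalize();
  return 0;
}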
Timer
A timer can be used for timing tasks.
Definition: Timer.h:31
dolfinx::MPI
MPI support functionality.
Definition: MPI.h:30
std::pair< std::vector< std::int32_t >, std::vector< T > > distribute_to_postoffice(MPI_Comm comm, std::span< const T > x, std::array< std::int64_t, 2 > shape, std::int64_t rank_offset)
Distribute row data to 'post office' ranks.
Definition: MPI.h:311
std::vector< int > compute_graph_edges_nbx(MPI_Comm comm, std::span< const int > edges)
Determine incoming graph edges using the NBX consensus algorithm.
Definition: MPI.cpp:163
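
A minimal sketch of how compute_graph_edges_nbx can be used: each rank declares only the ranks it will send to, and the NBX algorithm discovers the matching source ranks. The ring pattern here is chosen purely for illustration.

#include <dolfinx/common/MPI.h>
#include <iostream>
#include <mpi.h>
#include <vector>

int main(int argc, char* argv[])
{
  MPI_Init(&argc, &argv);
  const int rank = dolfinx::MPI::rank(MPI_COMM_WORLD);
  const int size = dolfinx::MPI::size(MPI_COMM_WORLD);

  // This rank sends to its right-hand neighbour only
  std::vector<int> dest = {(rank + 1) % size};

  // NBX determines the incoming edges; each rank should discover its
  // left-hand neighbour, (rank - 1 + size) % size
  std::vector<int> src
      = dolfinx::MPI::compute_graph_edges_nbx(MPI_COMM_WORLD, dest);
  std::cout << "Rank " << rank << " receives from " << src.size()
            << " rank(s)\n";

  MPI_Finalize();
  return 0;
}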
std::vector< T > distribute_from_postoffice(MPI_Comm comm, std::span< const std::int64_t > indices, std::span< const T > x, std::array< std::int64_t, 2 > shape, std::int64_t rank_offset)
Distribute rows of a rectangular data array from post office ranks to ranks where they are required.
Definition: MPI.h:458
constexpr int index_owner(int size, std::size_t index, std::size_t N)
Return which rank owns index in global range [0, N - 1] (inverse of MPI::local_range).
Definition: MPI.h:114
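
For illustration, a compile-time check of the ownership rule with N = 10 indices over size = 4 ranks: the first two ranks own three indices each and the last two own two each.

#include <dolfinx/common/MPI.h>

static_assert(dolfinx::MPI::index_owner(4, 0, 10) == 0);
static_assert(dolfinx::MPI::index_owner(4, 5, 10) == 1);
static_assert(dolfinx::MPI::index_owner(4, 7, 10) == 2);
static_assert(dolfinx::MPI::index_owner(4, 9, 10) == 3);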
std::vector< T > distribute_data(MPI_Comm comm, std::span< const std::int64_t > indices, std::span< const T > x, int shape1)
Distribute rows of a rectangular data array to ranks where they are required (scalable version).
Definition: MPI.h:670
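
A minimal usage sketch of distribute_data, with values chosen only for illustration: each rank holds two rows of a row-major array with three columns and requests the rows owned by the next rank; the global number of rows and the rank offset are computed internally.

#include <cstdint>
#include <dolfinx/common/MPI.h>
#include <mpi.h>
#include <numeric>
#include <vector>

int main(int argc, char* argv[])
{
  MPI_Init(&argc, &argv);
  const int rank = dolfinx::MPI::rank(MPI_COMM_WORLD);
  const int size = dolfinx::MPI::size(MPI_COMM_WORLD);

  // Two local rows, three columns, stored row-major
  const int shape1 = 3;
  std::vector<double> x(2 * shape1);
  std::iota(x.begin(), x.end(), 6.0 * rank);

  // Request the two rows owned by the next rank (global row indices)
  const std::int64_t row0 = 2 * ((rank + 1) % size);
  std::vector<std::int64_t> indices = {row0, row0 + 1};

  // x_new has shape (indices.size(), shape1) and holds the requested rows
  std::vector<double> x_new = dolfinx::MPI::distribute_data<double>(
      MPI_COMM_WORLD, indices, x, shape1);

  MPI_Finalize();
  return 0;
}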
std::vector< int > compute_graph_edges_pcx(MPI_Comm comm, std::span< const int > edges)
Determine incoming graph edges using the PCX consensus algorithm.
Definition: MPI.cpp:96
void check_error(MPI_Comm comm, int code)
Check MPI error code. If the error code is not equal to MPI_SUCCESS, then std::abort is called.
Definition: MPI.cpp:79
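
A small sketch of the intended usage pattern for check_error, wrapping a raw MPI call and checking its return code (the helper name synchronise is made up for this example):

#include <dolfinx/common/MPI.h>
#include <mpi.h>

// Abort if the MPI call did not return MPI_SUCCESS
void synchronise(MPI_Comm comm)
{
  int err = MPI_Barrier(comm);
  dolfinx::MPI::check_error(comm, err);
}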
int size(MPI_Comm comm)
Return size of the group (number of processes) associated with the communicator.
Definition: MPI.cpp:71
int rank(MPI_Comm comm)
Return process rank for the communicator.
Definition: MPI.cpp:63
constexpr std::array< std::int64_t, 2 > local_range(int rank, std::int64_t N, int size)
Return local range for the calling process, partitioning the global [0, N - 1] range across all ranks...
Definition: MPI.h:90
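
For illustration, partitioning the range [0, 10) across 4 ranks yields [0, 3), [3, 6), [6, 8) and [8, 10); the first N % size ranks receive one extra index.

#include <dolfinx/common/MPI.h>

static_assert(dolfinx::MPI::local_range(0, 10, 4)[1] == 3);
static_assert(dolfinx::MPI::local_range(2, 10, 4)[0] == 6);
static_assert(dolfinx::MPI::local_range(3, 10, 4)[1] == 10);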
constexpr MPI_Datatype mpi_type()
MPI Type.
Definition: MPI.h:275
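
A sketch of how mpi_type can be used to pick the MPI datatype matching a template parameter (the helper global_sum is made up for this example):

#include <dolfinx/common/MPI.h>
#include <mpi.h>

// Sum a value of type T over all ranks, selecting the MPI datatype at
// compile time from the C++ type
template <typename T>
T global_sum(MPI_Comm comm, T local)
{
  T global = 0;
  MPI_Allreduce(&local, &global, 1, dolfinx::MPI::mpi_type<T>(), MPI_SUM,
                comm);
  return global;
}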
tag
MPI communication tags.
Definition: MPI.h:34
dependent_false
Definition: MPI.h:270