DOLFINx  0.5.1
DOLFINx C++ interface
MPI.h
1 // Copyright (C) 2007-2014 Magnus Vikstrøm and Garth N. Wells
2 //
3 // This file is part of DOLFINx (https://www.fenicsproject.org)
4 //
5 // SPDX-License-Identifier: LGPL-3.0-or-later
6 
7 #pragma once
8 
9 #include <array>
10 #include <cassert>
11 #include <complex>
12 #include <cstdint>
13 #include <dolfinx/common/Timer.h>
14 #include <dolfinx/common/log.h>
15 #include <dolfinx/graph/AdjacencyList.h>
16 #include <numeric>
17 #include <set>
18 #include <span>
19 #include <tuple>
20 #include <type_traits>
21 #include <utility>
22 #include <vector>
23 
24 #define MPICH_IGNORE_CXX_SEEK 1
25 #include <mpi.h>
26 
28 namespace dolfinx::MPI
29 {
30 
32 enum class tag : int
33 {
34  consensus_pcx,
35  consensus_pex
36 };
37 
40 class Comm
41 {
42 public:
44  explicit Comm(MPI_Comm comm, bool duplicate = true);
45 
47  Comm(const Comm& comm) noexcept;
48 
50  Comm(Comm&& comm) noexcept;
51 
52  // Disable copy assignment operator
53  Comm& operator=(const Comm& comm) = delete;
54 
56  Comm& operator=(Comm&& comm) noexcept;
57 
59  ~Comm();
60 
62  MPI_Comm comm() const noexcept;
63 
64 private:
65  // MPI communicator
66  MPI_Comm _comm;
67 };
68 
70 int rank(MPI_Comm comm);
71 
74 int size(MPI_Comm comm);
75 
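The Comm wrapper together with rank() and size() covers the usual communicator boilerplate. A minimal usage sketch (the helper name `report` is illustrative; MPI is assumed to be initialised by the caller):

#include <dolfinx/common/MPI.h>
#include <iostream>

void report(MPI_Comm comm)
{
  // Duplicate the communicator; the duplicate is freed when `c` is destroyed
  dolfinx::MPI::Comm c(comm);
  std::cout << "rank " << dolfinx::MPI::rank(c.comm()) << " of "
            << dolfinx::MPI::size(c.comm()) << "\n";
}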
83 constexpr std::array<std::int64_t, 2> local_range(int rank, std::int64_t N,
84  int size)
85 {
86  assert(rank >= 0);
87  assert(N >= 0);
88  assert(size > 0);
89 
90  // Compute number of items per rank and remainder
91  const std::int64_t n = N / size;
92  const std::int64_t r = N % size;
93 
94  // Compute local range
95  if (rank < r)
96  return {rank * (n + 1), rank * (n + 1) + n + 1};
97  else
98  return {rank * n + r, rank * n + r + n};
99 }
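As a worked example, partitioning N = 10 items over three ranks leaves a remainder of one item, which goes to rank 0, so the ranges are [0,4), [4,7) and [7,10). Since the function is constexpr, this can be checked at compile time (a sketch assuming a C++20 compiler):

#include <dolfinx/common/MPI.h>

static_assert(dolfinx::MPI::local_range(0, 10, 3)
              == std::array<std::int64_t, 2>{0, 4});
static_assert(dolfinx::MPI::local_range(1, 10, 3)
              == std::array<std::int64_t, 2>{4, 7});
static_assert(dolfinx::MPI::local_range(2, 10, 3)
              == std::array<std::int64_t, 2>{7, 10});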
100 
107 constexpr int index_owner(int size, std::size_t index, std::size_t N)
108 {
109  assert(index < N);
110 
111  // Compute number of items per rank and remainder
112  const std::size_t n = N / size;
113  const std::size_t r = N % size;
114 
115  if (index < r * (n + 1))
116  {
117  // First r ranks own n + 1 indices
118  return index / (n + 1);
119  }
120  else
121  {
122  // Remaining ranks own n indices
123  return r + (index - r * (n + 1)) / n;
124  }
125 }
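index_owner is the inverse map of local_range: continuing the N = 10, three-rank example above, indices 0-3 are owned by rank 0, 4-6 by rank 1 and 7-9 by rank 2. Illustrative compile-time checks:

#include <dolfinx/common/MPI.h>

static_assert(dolfinx::MPI::index_owner(3, 3, 10) == 0);
static_assert(dolfinx::MPI::index_owner(3, 4, 10) == 1);
static_assert(dolfinx::MPI::index_owner(3, 9, 10) == 2);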
126 
150 std::vector<int> compute_graph_edges_pcx(MPI_Comm comm,
151  const std::span<const int>& edges);
152 
177 std::vector<int> compute_graph_edges_nbx(MPI_Comm comm,
178  const std::span<const int>& edges);
179 
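Both consensus routines solve the same discovery problem: each rank knows only the ranks it will send to (its out-edges) and needs to learn which ranks will send to it. A sketch using a hypothetical ring pattern, where rank r sends to rank (r + 1) % size (the function name and pattern are illustrative):

#include <dolfinx/common/MPI.h>
#include <vector>

std::vector<int> ring_sources(MPI_Comm comm)
{
  const int rank = dolfinx::MPI::rank(comm);
  const int size = dolfinx::MPI::size(comm);
  std::vector<int> dest = {(rank + 1) % size};
  // For a ring this should return {(rank - 1 + size) % size}
  return dolfinx::MPI::compute_graph_edges_nbx(comm, dest);
}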
199 template <typename T>
200 std::pair<std::vector<std::int32_t>, std::vector<T>>
201 distribute_to_postoffice(MPI_Comm comm, const std::span<const T>& x,
202  std::array<std::int64_t, 2> shape,
203  std::int64_t rank_offset);
204 
226 template <typename T>
227 std::vector<T> distribute_from_postoffice(
228  MPI_Comm comm, const std::span<const std::int64_t>& indices,
229  const std::span<const T>& x, std::array<std::int64_t, 2> shape,
230  std::int64_t rank_offset);
231 
255 template <typename T>
256 std::vector<T> distribute_data(MPI_Comm comm,
257  const std::span<const std::int64_t>& indices,
258  const std::span<const T>& x, int shape1);
259 
260 template <typename T>
261 struct dependent_false : std::false_type
262 {
263 };
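dependent_false is the usual helper for raising a compile-time error only when a template is instantiated with an unsupported type, since the asserted condition must depend on T. A minimal sketch (the function name is illustrative):

#include <dolfinx/common/MPI.h>
#include <type_traits>

template <typename T>
void process()
{
  if constexpr (std::is_arithmetic_v<T>)
  {
    // handle arithmetic types
  }
  else
    static_assert(dolfinx::MPI::dependent_false<T>::value, "Unsupported type");
}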
264 
266 template <typename T>
267 constexpr MPI_Datatype mpi_type()
268 {
269  if constexpr (std::is_same_v<T, float>)
270  return MPI_FLOAT;
271  else if constexpr (std::is_same_v<T, double>)
272  return MPI_DOUBLE;
273  else if constexpr (std::is_same_v<T, std::complex<double>>)
274  return MPI_C_DOUBLE_COMPLEX;
275  else if constexpr (std::is_same_v<T, std::complex<float>>)
276  return MPI_C_FLOAT_COMPLEX;
277  else if constexpr (std::is_same_v<T, short int>)
278  return MPI_SHORT;
279  else if constexpr (std::is_same_v<T, int>)
280  return MPI_INT;
281  else if constexpr (std::is_same_v<T, unsigned int>)
282  return MPI_UNSIGNED;
283  else if constexpr (std::is_same_v<T, long int>)
284  return MPI_LONG;
285  else if constexpr (std::is_same_v<T, unsigned long>)
286  return MPI_UNSIGNED_LONG;
287  else if constexpr (std::is_same_v<T, long long>)
288  return MPI_LONG_LONG;
289  else if constexpr (std::is_same_v<T, unsigned long long>)
290  return MPI_UNSIGNED_LONG_LONG;
291  else if constexpr (std::is_same_v<T, bool>)
292  return MPI_C_BOOL;
293  else if constexpr (std::is_same_v<T, std::int8_t>)
294  return MPI_INT8_T;
295  else
296  // Issue compile time error
297  static_assert(!std::is_same_v<T, T>);
298 }
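mpi_type<T>() lets templated code pick the matching MPI datatype from the C++ type. A small sketch for a numeric T (the reduction helper is illustrative):

#include <dolfinx/common/MPI.h>

template <typename T>
T global_sum(MPI_Comm comm, T local)
{
  T global = 0;
  MPI_Allreduce(&local, &global, 1, dolfinx::MPI::mpi_type<T>(), MPI_SUM,
                comm);
  return global;
}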
299 
300 //---------------------------------------------------------------------------
301 template <typename T>
302 std::pair<std::vector<std::int32_t>, std::vector<T>>
303 distribute_to_postoffice(MPI_Comm comm, const std::span<const T>& x,
304  std::array<std::int64_t, 2> shape,
305  std::int64_t rank_offset)
306 {
307  const int size = dolfinx::MPI::size(comm);
308  const int rank = dolfinx::MPI::rank(comm);
309  assert(x.size() % shape[1] == 0);
310  const std::int32_t shape0_local = x.size() / shape[1];
311 
312  LOG(2) << "Sending data to post offices (distribute_to_postoffice)";
313 
314  // Post office ranks will receive data from this rank
315  std::vector<int> row_to_dest(shape0_local);
316  for (std::int32_t i = 0; i < shape0_local; ++i)
317  {
318  int dest = MPI::index_owner(size, i + rank_offset, shape[0]);
319  row_to_dest[i] = dest;
320  }
321 
322  // Build list of (dest, positions) for each row that doesn't belong to
323  // this rank, then sort
324  std::vector<std::array<std::int32_t, 2>> dest_to_index;
325  dest_to_index.reserve(shape0_local);
326  for (std::int32_t i = 0; i < shape0_local; ++i)
327  {
328  std::size_t idx = i + rank_offset;
329  if (int dest = MPI::index_owner(size, idx, shape[0]); dest != rank)
330  dest_to_index.push_back({dest, i});
331  }
332  std::sort(dest_to_index.begin(), dest_to_index.end());
333 
334  // Build list of neighbour dest ranks and count the number of items
335  // (rows of x) to send to each dest post office (by neighbourhood rank)
336  std::vector<int> dest;
337  std::vector<std::int32_t> num_items_per_dest,
338  pos_to_neigh_rank(shape0_local, -1);
339  {
340  auto it = dest_to_index.begin();
341  while (it != dest_to_index.end())
342  {
343  const int neigh_rank = dest.size();
344 
345  // Store global rank
346  dest.push_back((*it)[0]);
347 
348  // Find iterator to next global rank
349  auto it1
350  = std::find_if(it, dest_to_index.end(),
351  [r = dest.back()](auto& idx) { return idx[0] != r; });
352 
353  // Store number of items for current rank
354  num_items_per_dest.push_back(std::distance(it, it1));
355 
356  // Map from local x index to local destination rank
357  for (auto e = it; e != it1; ++e)
358  pos_to_neigh_rank[(*e)[1]] = neigh_rank;
359 
360  // Advance iterator
361  it = it1;
362  }
363  }
364 
365  // Determine source ranks
366  const std::vector<int> src = MPI::compute_graph_edges_nbx(comm, dest);
367  LOG(INFO)
368  << "Number of neighbourhood source ranks in distribute_to_postoffice: "
369  << src.size();
370 
371  // Create neighbourhood communicator for sending data to post offices
372  MPI_Comm neigh_comm;
373  MPI_Dist_graph_create_adjacent(comm, src.size(), src.data(), MPI_UNWEIGHTED,
374  dest.size(), dest.data(), MPI_UNWEIGHTED,
375  MPI_INFO_NULL, false, &neigh_comm);
376 
377  // Compute send displacements
378  std::vector<std::int32_t> send_disp = {0};
379  std::partial_sum(num_items_per_dest.begin(), num_items_per_dest.end(),
380  std::back_inserter(send_disp));
381 
382  // Pack send buffers
383  std::vector<T> send_buffer_data(shape[1] * send_disp.back());
384  std::vector<std::int64_t> send_buffer_index(send_disp.back());
385  {
386  std::vector<std::int32_t> send_offsets = send_disp;
387  for (std::int32_t i = 0; i < shape0_local; ++i)
388  {
389  if (int neigh_dest = pos_to_neigh_rank[i]; neigh_dest != -1)
390  {
391  std::size_t pos = send_offsets[neigh_dest];
392  send_buffer_index[pos] = i + rank_offset;
393  std::copy_n(std::next(x.begin(), i * shape[1]), shape[1],
394  std::next(send_buffer_data.begin(), shape[1] * pos));
395  ++send_offsets[neigh_dest];
396  }
397  }
398  }
399 
400  // Send to each post office (destination) the number of items I will
401  // be sending
402  std::vector<int> num_items_recv(src.size());
403  num_items_per_dest.reserve(1);
404  num_items_recv.reserve(1);
405  MPI_Neighbor_alltoall(num_items_per_dest.data(), 1, MPI_INT,
406  num_items_recv.data(), 1, MPI_INT, neigh_comm);
407 
408  // Prepare receive displacement and buffers
409  std::vector<std::int32_t> recv_disp(num_items_recv.size() + 1, 0);
410  std::partial_sum(num_items_recv.begin(), num_items_recv.end(),
411  std::next(recv_disp.begin()));
412 
413  // Send/receive global indices
414  std::vector<std::int64_t> recv_buffer_index(recv_disp.back());
415  MPI_Neighbor_alltoallv(send_buffer_index.data(), num_items_per_dest.data(),
416  send_disp.data(), MPI_INT64_T,
417  recv_buffer_index.data(), num_items_recv.data(),
418  recv_disp.data(), MPI_INT64_T, neigh_comm);
419 
420  // Send/receive data (x)
421  MPI_Datatype compound_type;
422  MPI_Type_contiguous(shape[1], dolfinx::MPI::mpi_type<T>(), &compound_type);
423  MPI_Type_commit(&compound_type);
424  std::vector<T> recv_buffer_data(shape[1] * recv_disp.back());
425  MPI_Neighbor_alltoallv(send_buffer_data.data(), num_items_per_dest.data(),
426  send_disp.data(), compound_type,
427  recv_buffer_data.data(), num_items_recv.data(),
428  recv_disp.data(), compound_type, neigh_comm);
429  MPI_Type_free(&compound_type);
430  MPI_Comm_free(&neigh_comm);
431 
432  LOG(2) << "Completed sending data to post offices.";
433 
434  // Convert to local indices
435  const std::int64_t r0 = MPI::local_range(rank, shape[0], size)[0];
436  std::vector<std::int32_t> index_local(recv_buffer_index.size());
437  std::transform(recv_buffer_index.cbegin(), recv_buffer_index.cend(),
438  index_local.begin(), [r0](auto idx) { return idx - r0; });
439 
440  return {index_local, recv_buffer_data};
441 }
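A hedged usage sketch: each rank holds shape0_local rows of a global row-major (shape[0] x shape[1]) array, and rows that do not fall in this rank's local_range are forwarded to their owning 'post office' rank. The wrapper name is illustrative; distribute_data below performs the same global-shape bookkeeping internally.

#include <dolfinx/common/MPI.h>
#include <span>
#include <utility>
#include <vector>

std::pair<std::vector<std::int32_t>, std::vector<double>>
rows_to_postoffice(MPI_Comm comm, const std::vector<double>& x, int shape1)
{
  // Compute the global number of rows and this rank's row offset
  const std::int64_t shape0_local = x.size() / shape1;
  std::int64_t shape0(0), rank_offset(0);
  MPI_Allreduce(&shape0_local, &shape0, 1, MPI_INT64_T, MPI_SUM, comm);
  MPI_Exscan(&shape0_local, &rank_offset, 1, MPI_INT64_T, MPI_SUM, comm);

  // Rows already owned by this rank's post office are not communicated
  return dolfinx::MPI::distribute_to_postoffice(
      comm, std::span<const double>(x), {shape0, shape1}, rank_offset);
}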
442 //---------------------------------------------------------------------------
443 template <typename T>
444 std::vector<T> distribute_from_postoffice(
445  MPI_Comm comm, const std::span<const std::int64_t>& indices,
446  const std::span<const T>& x, std::array<std::int64_t, 2> shape,
447  std::int64_t rank_offset)
448 {
449  common::Timer timer("Distribute row-wise data (scalable)");
450  assert(shape[1] > 0);
451 
452  const int size = dolfinx::MPI::size(comm);
453  const int rank = dolfinx::MPI::rank(comm);
454  assert(x.size() % shape[1] == 0);
455  const std::int64_t shape0_local = x.size() / shape[1];
456 
457  // 0. Send x data to/from post offices
458 
459  // Send/receive x data to post office (only for rows that need to be
460  // communicated)
461  auto [post_indices, post_x] = MPI::distribute_to_postoffice(
462  comm, x, {shape[0], shape[1]}, rank_offset);
463  assert(post_indices.size() == post_x.size() / shape[1]);
464 
465  // 1. Send request to post office ranks for data
466 
467  // Build list of (src, global index, index position) for each entry
468  // in 'indices' that doesn't belong to this rank, then sort
469  std::vector<std::tuple<int, std::int64_t, std::int32_t>> src_to_index;
470  for (std::size_t i = 0; i < indices.size(); ++i)
471  {
472  std::size_t idx = indices[i];
473  if (int src = MPI::index_owner(size, idx, shape[0]); src != rank)
474  src_to_index.push_back({src, idx, i});
475  }
476  std::sort(src_to_index.begin(), src_to_index.end());
477 
478  // Build list of neighbour src ranks and count the number of items
479  // (rows of x) to receive from each src post office (by neighbourhood rank)
480  std::vector<std::int32_t> num_items_per_src;
481  std::vector<int> src;
482  {
483  auto it = src_to_index.begin();
484  while (it != src_to_index.end())
485  {
486  src.push_back(std::get<0>(*it));
487  auto it1 = std::find_if(it, src_to_index.end(),
488  [r = src.back()](auto& idx)
489  { return std::get<0>(idx) != r; });
490  num_items_per_src.push_back(std::distance(it, it1));
491  it = it1;
492  }
493  }
494 
495  // Determine 'delivery' destination ranks (ranks that want data from
496  // me)
497  const std::vector<int> dest
498  = MPI::compute_graph_edges_nbx(comm, src);
499  LOG(INFO) << "Neighbourhood destination ranks from post office in "
500  "distribute_data (rank, num dests, num dests/mpi_size): "
501  << rank << ", " << dest.size() << ", "
502  << static_cast<double>(dest.size()) / size;
503 
504  // Create neighbourhood communicator for sending requests to post
505  // offices (src), and receiving requests directed at my post office (dest)
506  MPI_Comm neigh_comm0;
507  MPI_Dist_graph_create_adjacent(comm, dest.size(), dest.data(), MPI_UNWEIGHTED,
508  src.size(), src.data(), MPI_UNWEIGHTED,
509  MPI_INFO_NULL, false, &neigh_comm0);
510 
511  // Communicate number of requests to each source
512  std::vector<int> num_items_recv(dest.size());
513  num_items_per_src.reserve(1);
514  num_items_recv.reserve(1);
515  MPI_Neighbor_alltoall(num_items_per_src.data(), 1, MPI_INT,
516  num_items_recv.data(), 1, MPI_INT, neigh_comm0);
517 
518  // Prepare send/receive displacements
519  std::vector<std::int32_t> send_disp = {0};
520  std::partial_sum(num_items_per_src.begin(), num_items_per_src.end(),
521  std::back_inserter(send_disp));
522  std::vector<std::int32_t> recv_disp = {0};
523  std::partial_sum(num_items_recv.begin(), num_items_recv.end(),
524  std::back_inserter(recv_disp));
525 
526  // Pack my requested indices (global) in send buffer ready to send to
527  // post offices
528  assert(send_disp.back() == (int)src_to_index.size());
529  std::vector<std::int64_t> send_buffer_index(src_to_index.size());
530  std::transform(src_to_index.cbegin(), src_to_index.cend(),
531  send_buffer_index.begin(),
532  [](auto& x) { return std::get<1>(x); });
533 
534  // Prepare the receive buffer
535  std::vector<std::int64_t> recv_buffer_index(recv_disp.back());
536  MPI_Neighbor_alltoallv(send_buffer_index.data(), num_items_per_src.data(),
537  send_disp.data(), MPI_INT64_T,
538  recv_buffer_index.data(), num_items_recv.data(),
539  recv_disp.data(), MPI_INT64_T, neigh_comm0);
540 
541  MPI_Comm_free(&neigh_comm0);
542 
543  // 2. Send data (rows of x) back to requesting ranks (transpose of
544  // the preceding communication pattern)
545 
546  // Build map from local index to post_indices position. Set to -1 for
547  // data that was already on this rank and was therefore not
548  // sent/received via a post office.
549  const std::array<std::int64_t, 2> postoffice_range
550  = MPI::local_range(rank, shape[0], size);
551  std::vector<std::int32_t> post_indices_map(
552  postoffice_range[1] - postoffice_range[0], -1);
553  for (std::size_t i = 0; i < post_indices.size(); ++i)
554  {
555  assert(post_indices[i] < (int)post_indices_map.size());
556  post_indices_map[post_indices[i]] = i;
557  }
558 
559  // Build send buffer
560  std::vector<T> send_buffer_data(shape[1] * recv_disp.back());
561  for (std::size_t p = 0; p < recv_disp.size() - 1; ++p)
562  {
563  int offset = recv_disp[p];
564  for (std::int32_t i = recv_disp[p]; i < recv_disp[p + 1]; ++i)
565  {
566  std::int64_t index = recv_buffer_index[i];
567  if (index >= rank_offset and index < (rank_offset + shape0_local))
568  {
569  // I already had this index before any communication
570  std::int32_t local_index = index - rank_offset;
571  std::copy_n(std::next(x.begin(), shape[1] * local_index), shape[1],
572  std::next(send_buffer_data.begin(), shape[1] * offset));
573  }
574  else
575  {
576  // Take from my 'post bag'
577  auto local_index = index - postoffice_range[0];
578  std::int32_t pos = post_indices_map[local_index];
579  assert(pos != -1);
580  std::copy_n(std::next(post_x.begin(), shape[1] * pos), shape[1],
581  std::next(send_buffer_data.begin(), shape[1] * offset));
582  }
583 
584  ++offset;
585  }
586  }
587 
588  MPI_Dist_graph_create_adjacent(comm, src.size(), src.data(), MPI_UNWEIGHTED,
589  dest.size(), dest.data(), MPI_UNWEIGHTED,
590  MPI_INFO_NULL, false, &neigh_comm0);
591 
592  MPI_Datatype compound_type0;
593  MPI_Type_contiguous(shape[1], dolfinx::MPI::mpi_type<T>(), &compound_type0);
594  MPI_Type_commit(&compound_type0);
595 
596  std::vector<T> recv_buffer_data(shape[1] * send_disp.back());
597  MPI_Neighbor_alltoallv(send_buffer_data.data(), num_items_recv.data(),
598  recv_disp.data(), compound_type0,
599  recv_buffer_data.data(), num_items_per_src.data(),
600  send_disp.data(), compound_type0, neigh_comm0);
601 
602  MPI_Type_free(&compound_type0);
603  MPI_Comm_free(&neigh_comm0);
604 
605  std::vector<std::int32_t> index_pos_to_buffer(indices.size(), -1);
606  for (std::size_t i = 0; i < src_to_index.size(); ++i)
607  index_pos_to_buffer[std::get<2>(src_to_index[i])] = i;
608 
609  // Extract data to return
610  std::vector<T> x_new(shape[1] * indices.size());
611  for (std::size_t i = 0; i < indices.size(); ++i)
612  {
613  const std::int64_t index = indices[i];
614  if (index >= rank_offset and index < (rank_offset + shape0_local))
615  {
616  // Had data from the start in x
617  auto local_index = index - rank_offset;
618  std::copy_n(std::next(x.begin(), shape[1] * local_index), shape[1],
619  std::next(x_new.begin(), shape[1] * i));
620  }
621  else
622  {
623  if (int src = MPI::index_owner(size, index, shape[0]); src == rank)
624  {
625  // In my post office bag
626  auto local_index = index - postoffice_range[0];
627  std::int32_t pos = post_indices_map[local_index];
628  assert(pos != -1);
629  std::copy_n(std::next(post_x.begin(), shape[1] * pos), shape[1],
630  std::next(x_new.begin(), shape[1] * i));
631  }
632  else
633  {
634  // In my received post
635  std::int32_t pos = index_pos_to_buffer[i];
636  assert(pos != -1);
637  std::copy_n(std::next(recv_buffer_data.begin(), shape[1] * pos),
638  shape[1], std::next(x_new.begin(), shape[1] * i));
639  }
640  }
641  }
642 
643  return x_new;
644 }
645 //---------------------------------------------------------------------------
646 template <typename T>
647 std::vector<T> distribute_data(MPI_Comm comm,
648  const std::span<const std::int64_t>& indices,
649  const std::span<const T>& x, int shape1)
650 {
651  assert(shape1 > 0);
652  assert(x.size() % shape1 == 0);
653  const std::int64_t shape0_local = x.size() / shape1;
654 
655  std::int64_t shape0(0), rank_offset(0);
656  MPI_Allreduce(&shape0_local, &shape0, 1, MPI_INT64_T, MPI_SUM, comm);
657  MPI_Exscan(&shape0_local, &rank_offset, 1, MPI_INT64_T, MPI_SUM, comm);
658 
659  return distribute_from_postoffice(comm, indices, x, {shape0, shape1},
660  rank_offset);
661 }
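A usage sketch for the user-facing entry point: every rank requests an arbitrary set of global row indices and receives those rows back, packed row-major with shape (indices.size(), shape1). The wrapper name and data layout assumptions are illustrative:

#include <dolfinx/common/MPI.h>
#include <span>
#include <vector>

std::vector<double> gather_rows(MPI_Comm comm,
                                const std::vector<std::int64_t>& indices,
                                const std::vector<double>& x_local, int shape1)
{
  // Returned vector holds indices.size() rows, each of length shape1
  return dolfinx::MPI::distribute_data<double>(
      comm, std::span<const std::int64_t>(indices),
      std::span<const double>(x_local), shape1);
}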
662 //---------------------------------------------------------------------------
663 
664 } // namespace dolfinx::MPI
A duplicate MPI communicator that manages the lifetime of the communicator.
Definition: MPI.h:41
Comm(MPI_Comm comm, bool duplicate=true)
Duplicate communicator and wrap duplicate.
Definition: MPI.cpp:13
~Comm()
Destructor (frees wrapped communicator)
Definition: MPI.cpp:40
MPI_Comm comm() const noexcept
Return the underlying MPI_Comm object.
Definition: MPI.cpp:74
A timer can be used for timing tasks.
Definition: Timer.h:31
MPI support functionality.
Definition: MPI.h:29
std::pair< std::vector< std::int32_t >, std::vector< T > > distribute_to_postoffice(MPI_Comm comm, const std::span< const T > &x, std::array< std::int64_t, 2 > shape, std::int64_t rank_offset)
Distribute row data to 'post office' ranks.
Definition: MPI.h:303
std::vector< T > distribute_from_postoffice(MPI_Comm comm, const std::span< const std::int64_t > &indices, const std::span< const T > &x, std::array< std::int64_t, 2 > shape, std::int64_t rank_offset)
Distribute rows of a rectangular data array from post office ranks to ranks where they are required.
Definition: MPI.h:444
std::vector< int > compute_graph_edges_nbx(MPI_Comm comm, const std::span< const int > &edges)
Determine incoming graph edges using the NBX consensus algorithm.
Definition: MPI.cpp:152
constexpr int index_owner(int size, std::size_t index, std::size_t N)
Return which rank owns index in global range [0, N - 1] (inverse of MPI::local_range).
Definition: MPI.h:107
std::vector< int > compute_graph_edges_pcx(MPI_Comm comm, const std::span< const int > &edges)
Determine incoming graph edges using the PCX consensus algorithm.
Definition: MPI.cpp:92
std::vector< T > distribute_data(MPI_Comm comm, const std::span< const std::int64_t > &indices, const std::span< const T > &x, int shape1)
Distribute rows of a rectangular data array to ranks where they are required (scalable version).
Definition: MPI.h:647
int size(MPI_Comm comm)
Return size of the group (number of processes) associated with the communicator.
Definition: MPI.cpp:84
int rank(MPI_Comm comm)
Return process rank for the communicator.
Definition: MPI.cpp:76
constexpr MPI_Datatype mpi_type()
MPI Type.
Definition: MPI.h:267
constexpr std::array< std::int64_t, 2 > local_range(int rank, std::int64_t N, int size)
Return local range for the calling process, partitioning the global [0, N - 1] range across all ranks...
Definition: MPI.h:83
tag
MPI communication tags.
Definition: MPI.h:33
Definition: MPI.h:262