Copyright (C) 2024 Jack S. Hale and Garth N. Wells
This file is part of DOLFINx (https://www.fenicsproject.org)
SPDX-License-Identifier:    LGPL-3.0-or-later

Custom cell kernel assembly

This demo shows various methods to define custom cell kernels in C++ and have them assembled into DOLFINx linear algebra data structures.

#include <basix/finite-element.h>
#include <basix/mdspan.hpp>
#include <basix/quadrature.h>
#include <cmath>
#include <concepts>
#include <dolfinx.h>
#include <dolfinx/la/MatrixCSR.h>
#include <dolfinx/la/SparsityPattern.h>
#include <functional>
#include <stdint.h>
#include <utility>
#include <vector>

using namespace dolfinx;

template <typename T, std::size_t ndim>
using mdspand_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
    T, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, ndim>>;
template <typename T, std::size_t n0, std::size_t n1>
using mdspan2_t
    = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<T,
                                             std::extents<std::size_t, n0, n1>>;

/// @brief Compute the P1 element mass matrix on the reference cell.
/// @tparam T Scalar type.
/// @param phi Basis functions.
/// @param w Integration weights.
/// @return Element reference matrix (row-major storage).
template <typename T>
std::array<T, 9> A_ref(mdspand_t<const T, 4> phi, std::span<const T> w)
{
  std::array<T, 9> A_b{};
  mdspan2_t<T, 3, 3> A(A_b.data());
  for (std::size_t k = 0; k < phi.extent(1); ++k)   // quadrature point
    for (std::size_t i = 0; i < A.extent(0); ++i)   // row i
      for (std::size_t j = 0; j < A.extent(1); ++j) // column j
        A(i, j) += w[k] * phi(0, k, i, 0) * phi(0, k, j, 0);
  return A_b;
}

/// @brief Compute the P1 RHS vector for f=1 on the reference cell.
/// @tparam T Scalar type.
/// @param phi Basis functions.
/// @param w Integration weights.
/// @return RHS reference vector.
template <typename T>
std::array<T, 3> b_ref(mdspand_t<const T, 4> phi, std::span<const T> w)
{
  std::array<T, 3> b{};
  for (std::size_t k = 0; k < phi.extent(1); ++k) // quadrature point
    for (std::size_t i = 0; i < b.size(); ++i)    // row i
      b[i] += w[k] * phi(0, k, i, 0);
  return b;
}

/// @brief Assemble a matrix operator using a `std::function` kernel
/// function.
/// @tparam T Scalar type.
/// @param V Function space.
/// @param kernel Element kernel to execute.
/// @param cells Cells to execute the kernel over.
/// @return Frobenius norm squared of the matrix.
template <std::floating_point T>
double assemble_matrix0(std::shared_ptr<fem::FunctionSpace<T>> V, auto kernel,
                        std::span<const std::int32_t> cells)
{
  // Kernel data (ID, kernel function, cell indices to execute over)
  std::vector kernel_data{
      fem::integral_data<T>(-1, kernel, cells, std::vector<int>{})};

  // Associate kernel with cells (as opposed to facets, etc)
  std::map integrals{std::pair{fem::IntegralType::cell, kernel_data}};

  fem::Form<T> a({V, V}, integrals, {}, {}, false, {}, V->mesh());
  auto dofmap = V->dofmap();
  auto sp = la::SparsityPattern(
      V->mesh()->comm(), {dofmap->index_map, dofmap->index_map},
      {dofmap->index_map_bs(), dofmap->index_map_bs()});
  fem::sparsitybuild::cells(sp, {cells, cells}, {*dofmap, *dofmap});
  sp.finalize();
  la::MatrixCSR<T> A(sp);
  common::Timer timer("Assembler0 std::function (matrix)");
  assemble_matrix(A.mat_add_values(), a, {});
  A.scatter_rev();
  return A.squared_norm();
}

/// @brief Assemble a RHS vector using a `std::function` kernel
/// function.
/// @tparam T Scalar type.
/// @param V Function space.
/// @param kernel Element kernel to execute.
/// @param cells Cells to execute the kernel over.
/// @return l2 norm squared of the vector.
template <std::floating_point T>
double assemble_vector0(std::shared_ptr<fem::FunctionSpace<T>> V, auto kernel,
                        std::span<const std::int32_t> cells)
{
  auto mesh = V->mesh();
  std::vector kernal_data{
      fem::integral_data<T>(-1, kernel, cells, std::vector<int>{})};
  std::map integrals{std::pair{fem::IntegralType::cell, kernal_data}};
  fem::Form<T> L({V}, integrals, {}, {}, false, {}, mesh);
  auto dofmap = V->dofmap();
  la::Vector<T> b(dofmap->index_map, 1);
  common::Timer timer("Assembler0 std::function (vector)");
  fem::assemble_vector(b.mutable_array(), L);
  b.scatter_rev(std::plus<T>());
  return la::squared_norm(b);
}

/// @brief Assemble a matrix operator using a lambda kernel function.
///
/// The lambda function can be inlined in the assembly code, which can
/// be important for performance for lightweight kernels.
///
/// @tparam T Scalar type.
/// @param g mesh geometry.
/// @param dofmap dofmap.
/// @param kernel Element kernel to execute.
/// @param cells Cells to execute the kernel over.
/// @return Frobenius norm squared of the matrix.
template <std::floating_point T>
double assemble_matrix1(const mesh::Geometry<T>& g, const fem::DofMap& dofmap,
                        auto kernel, std::span<const std::int32_t> cells)
{
  auto sp = la::SparsityPattern(dofmap.index_map->comm(),
                                {dofmap.index_map, dofmap.index_map},
                                {dofmap.index_map_bs(), dofmap.index_map_bs()});
  fem::sparsitybuild::cells(sp, {cells, cells}, {dofmap, dofmap});
  sp.finalize();
  la::MatrixCSR<T> A(sp);
  auto ident = [](auto, auto, auto, auto) {}; // DOF permutation not required
  common::Timer timer("Assembler1 lambda (matrix)");
  fem::impl::assemble_cells(A.mat_add_values(), g.dofmap(), g.x(), cells,
                            {dofmap.map(), 1, cells}, ident,
                            {dofmap.map(), 1, cells}, ident, {}, {}, kernel,
                            std::span<const T>(), 0, {}, {}, {});
  A.scatter_rev();
  return A.squared_norm();
}

/// @brief Assemble a RHS vector using using a lambda kernel function.
///
/// The lambda function can be inlined in the assembly code, which can
/// be important for performance for lightweight kernels.
///
/// @tparam T Scalar type.
/// @param g mesh geometry.
/// @param dofmap dofmap.
/// @param kernel Element kernel to execute.
/// @param cells Cells to execute the kernel over.
/// @return l2 norm squared of the vector.
template <std::floating_point T>
double assemble_vector1(const mesh::Geometry<T>& g, const fem::DofMap& dofmap,
                        auto kernel, const std::vector<std::int32_t>& cells)
{
  la::Vector<T> b(dofmap.index_map, 1);
  common::Timer timer("Assembler1 lambda (vector)");
  fem::impl::assemble_cells<T, 1>(
      [](auto, auto, auto, auto) {}, b.mutable_array(), g.dofmap(), g.x(),
      cells, {dofmap.map(), 1, cells}, kernel, {}, {}, 0, {});
  b.scatter_rev(std::plus<T>());
  return la::squared_norm(b);
}

/// @brief Assemble P1 mass matrix and a RHS vector using element kernel
/// approaches.
///
/// Function demonstrates how hand-coded element kernels can be executed
/// in assembly over cells.
///
/// @tparam T Scalar type.
/// @param comm MPI communicator to assembler over.
template <std::floating_point T>
void assemble(MPI_Comm comm)
{
  // Create mesh
  auto mesh = std::make_shared<mesh::Mesh<T>>(mesh::create_rectangle<T>(
      comm, {{{0, 0}, {1, 1}}}, {516, 116}, mesh::CellType::triangle));

  // Create Basix P1 Lagrange element. This will be used to construct
  // basis functions inside the custom cell kernel.
  constexpr int order = 1;
  basix::FiniteElement e = basix::create_element<T>(
      basix::element::family::P,
      mesh::cell_type_to_basix_type(mesh::CellType::triangle), order,
      basix::element::lagrange_variant::unset,
      basix::element::dpc_variant::unset, false);

  // Construct quadrature rule
  constexpr int max_degree = 2 * order;
  auto quadrature_type = basix::quadrature::get_default_rule(
      basix::cell::type::triangle, max_degree);
  auto [X_b, weights] = basix::quadrature::make_quadrature<T>(
      quadrature_type, basix::cell::type::triangle,
      basix::polyset::type::standard, max_degree);
  mdspand_t<const T, 2> X(X_b.data(), weights.size(), 2);

  // Create a scalar function space
  auto V = std::make_shared<fem::FunctionSpace<T>>(
      fem::create_functionspace(mesh, e));

  // Build list of cells to assembler over (all cells owned by this
  // rank)
  std::int32_t size_local
      = mesh->topology()->index_map(mesh->topology()->dim())->size_local();
  std::vector<std::int32_t> cells(size_local);
  std::iota(cells.begin(), cells.end(), 0);

  // Tabulate basis functions at quadrature points
  auto e_shape = e.tabulate_shape(0, weights.size());
  std::size_t length
      = std::accumulate(e_shape.begin(), e_shape.end(), 1, std::multiplies<>{});
  std::vector<T> phi_b(length);
  mdspand_t<T, 4> phi(phi_b.data(), e_shape);
  e.tabulate(0, X, phi);

  // Utility function to compute det(J) for an affine triangle cell
  // (geometry is 3D)
  auto detJ = [](mdspan2_t<const T, 3, 3> x)
  {
    return std::abs((x(0, 0) - x(1, 0)) * (x(2, 1) - x(1, 1))
                    - (x(0, 1) - x(1, 1)) * (x(2, 0) - x(1, 0)));
  };

  // Finite element mass matrix kernel function
  std::array<T, 9> A_hat_b = A_ref<T>(phi, weights);
  auto kernel_a
      = [A_hat = mdspan2_t<T, 3, 3>(A_hat_b.data()),
         detJ](T* A, const T*, const T*, const T* x, const int*, const uint8_t*)
  {
    T scale = detJ(mdspan2_t<const T, 3, 3>(x));
    mdspan2_t<T, 3, 3> _A(A);
    for (std::size_t i = 0; i < A_hat.extent(0); ++i)
      for (std::size_t j = 0; j < A_hat.extent(1); ++j)
        _A(i, j) = scale * A_hat(i, j);
  };

  // Finite element RHS (f=1) kernel function
  auto kernel_L
      = [b_hat = b_ref<T>(phi, weights),
         detJ](T* b, const T*, const T*, const T* x, const int*, const uint8_t*)
  {
    T scale = detJ(mdspan2_t<const T, 3, 3>(x));
    for (std::size_t i = 0; i < 3; ++i)
      b[i] = scale * b_hat[i];
  };

  // Assemble matrix and vector using std::function kernel
  assemble_matrix0<T>(V, kernel_a, cells);
  assemble_vector0<T>(V, kernel_L, cells);

  // Assemble matrix and vector using lambda kernel. This version
  // supports efficient inlining of the kernel in the assembler. This
  // can give a significant performance improvement for lightweight
  // kernels.
  assemble_matrix1<T>(mesh->geometry(), *V->dofmap(), kernel_a, cells);
  assemble_vector1<T>(mesh->geometry(), *V->dofmap(), kernel_L, cells);

  list_timings(comm, {TimingType::wall});
}

int main(int argc, char* argv[])
{
  MPI_Init(&argc, &argv);
  dolfinx::init_logging(argc, argv);
  assemble<float>(MPI_COMM_WORLD);
  assemble<double>(MPI_COMM_WORLD);
  MPI_Finalize();
  return 0;
}