// epsilon_divdiv_kerngen_v03_teams_precomp.hpp
// NOTE(review): this listing was captured from a Doxygen-rendered source page.
// The viewer chrome ("Loading... / Searching... / No Matches" and the
// "Go to the documentation of this file" link) is an extraction artifact, and
// several hyperlinked declaration lines are missing from the listing below.
1#pragma once
2
3#include "../../../quadrature/quadrature.hpp"
5#include "dense/vec.hpp"
9#include "impl/Kokkos_Profiling.hpp"
10#include "linalg/operator.hpp"
13#include "linalg/vector.hpp"
14#include "linalg/vector_q1.hpp"
15#include "util/timer.hpp"
16
18
24
25template < typename ScalarT, int VecDim = 3 >
27{
28 public:
31 using ScalarType = ScalarT;
32 static constexpr int LocalMatrixDim = 18;
35
36 private:
37 LocalMatrixStorage local_matrix_storage_;
38
40
45 BoundaryConditions bcs_;
46
47 bool treat_boundary_;
48 bool diagonal_;
49
50 linalg::OperatorApplyMode operator_apply_mode_;
51 linalg::OperatorCommunicationMode operator_communication_mode_;
52 linalg::OperatorStoredMatrixMode operator_stored_matrix_mode_;
53
56
59
60 // Quadrature points.
61 const int num_quad_points = quadrature::quad_felippa_1x1_num_quad_points;
62
65
66 // subdomain, x, y, 2 wedges, quad points, 9 components in the matrix
67 Kokkos::View< ScalarT*** [2][1][3][3], grid::Layout > g1_;
68
69 // subdomain, x, y, 2 wedges, quad points
70 Kokkos::View< ScalarT*** [2][1], grid::Layout > g2_;
71
72 int local_subdomains_;
73 int hex_lat_;
74 int hex_rad_;
75 int lat_refinement_level_;
76 int block_size_;
77 int blocks_per_column_;
78 int blocks_;
79
80 public:
// Constructor. NOTE(review): the constructor name line (Doxygen line 86) and
// several parameter lines (domain, grid, radii, mask, k, apply/stored-matrix
// mode defaults) were lost in extraction; the initializer list below shows
// which members they feed. g1_/g2_ are sized per lateral cell (extent - 1
// cells per node dimension) and filled by precompute().
 87 BoundaryConditions bcs,
 88 bool diagonal,
 90 linalg::OperatorCommunicationMode operator_communication_mode =
 93 : domain_( domain )
 94 , grid_( grid )
 95 , radii_( radii )
 96 , mask_( mask )
 97 , k_( k )
 98 , treat_boundary_( true )
 99 , diagonal_( diagonal )
 100 , operator_apply_mode_( operator_apply_mode )
 101 , operator_communication_mode_( operator_communication_mode )
 102 , operator_stored_matrix_mode_( operator_stored_matrix_mode )
 103 , g1_( "g1", grid.extent( 0 ), grid.extent( 1 ) - 1, grid.extent( 2 ) - 1 )
 104 , g2_( "g2", grid.extent( 0 ), grid.extent( 1 ) - 1, grid.extent( 2 ) - 1 )
 105 // TODO: we can reuse the send and recv buffers and pass in from the outside somehow
 106 , send_buffers_( domain )
 107 , recv_buffers_( domain )
 108 {
 109 bcs_[0] = bcs[0];
 110 bcs_[1] = bcs[1];
 113 local_subdomains_ = domain_.subdomains().size();
 // Cells per subdomain: nodes-per-side minus one, laterally and radially.
 114 hex_lat_ = domain_.domain_info().subdomain_num_nodes_per_side_laterally() - 1;
 115 hex_rad_ = domain_.domain_info().subdomain_num_nodes_radially() - 1;
 116 lat_refinement_level_ = domain_.domain_info().diamond_lateral_refinement_level();
 // Team launch geometry: one thread per radial cell, capped at 128 threads per
 // team; blocks_per_column_ teams cover one radial column, blocks_ teams total.
 117 const int threads_per_column = hex_rad_;
 118 block_size_ = std::min( 128, threads_per_column );
 119 blocks_per_column_ = ( threads_per_column + block_size_ - 1 ) / block_size_;
 120 blocks_ = local_subdomains_ * hex_lat_ * hex_lat_ * blocks_per_column_;
 121
 // Fill the radially-constant Jacobian factors (g1_) and determinants (g2_).
 122 precompute();
 123 }
124
// Setter for the apply and communication modes used by apply_impl().
// NOTE(review): the function name line (Doxygen line 125) was lost in
// extraction; from the body this is a plain two-member setter.
 126 const linalg::OperatorApplyMode operator_apply_mode,
 127 const linalg::OperatorCommunicationMode operator_communication_mode )
 128 {
 129 operator_apply_mode_ = operator_apply_mode;
 130 operator_communication_mode_ = operator_communication_mode;
 131 }
132
133 /// @brief S/Getter for diagonal member
134 void set_diagonal( bool v ) { diagonal_ = v; }
135
136 /// @brief Getter for coefficient
138
139 /// @brief Getter for domain member
140 const grid::shell::DistributedDomain& get_domain() const { return domain_; }
141
142 /// @brief Getter for radii member
144
145 /// @brief Getter for grid member
147
 148 /// @brief Queries whether the given boundary flag is set in the mask at the given cell.
 149 KOKKOS_INLINE_FUNCTION
 // NOTE(review): the signature line (Doxygen line 150: return type, function
 // name, and the trailing `flag` parameter used below) was lost in extraction;
 // from the body this is a boolean flag test on mask_.
 151 const int local_subdomain_id,
 152 const int x_cell,
 153 const int y_cell,
 154 const int r_cell,
 156 {
 157 return util::has_flag( mask_( local_subdomain_id, x_cell, y_cell, r_cell ), flag );
 158 }
159
 160 /// @brief Allocates memory for the local matrices when a stored-matrix mode is enabled.
 // NOTE(review): the signature line (Doxygen line 161) and the storage-allocation
 // call on line 171 were lost in extraction; the visible arguments show the
 // storage is sized from domain_, the mode, level_range, and GCAElements.
 162 linalg::OperatorStoredMatrixMode operator_stored_matrix_mode,
 163 std::optional< int > level_range,
 164 std::optional< grid::Grid4DDataScalar< ScalarType > > GCAElements )
 165 {
 166 operator_stored_matrix_mode_ = operator_stored_matrix_mode;
 167
 168 // allocate storage if necessary
 169 if ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off )
 170 {
 172 domain_, operator_stored_matrix_mode_, level_range, GCAElements );
 173 }
 174 }
175
176 linalg::OperatorStoredMatrixMode get_stored_matrix_mode() { return operator_stored_matrix_mode_; }
177
 178 /// @brief Set the local matrix stored in the operator
 /// Asserts that a stored-matrix mode is active, then forwards to the storage.
 179 KOKKOS_INLINE_FUNCTION
 // NOTE(review): the signature line (Doxygen line 180) and the `mat` parameter
 // line (186) were lost in extraction; the body forwards all arguments to
 // local_matrix_storage_.set_matrix().
 181 const int local_subdomain_id,
 182 const int x_cell,
 183 const int y_cell,
 184 const int r_cell,
 185 const int wedge,
 187 {
 188 KOKKOS_ASSERT( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off );
 189 local_matrix_storage_.set_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge, mat );
 190 }
191
 192 /// @brief Retrieves the local matrix.
 193 /// If there are stored local matrices, the desired local matrix is loaded and returned;
 194 /// if not, the local matrix is assembled on-the-fly.
 195 KOKKOS_INLINE_FUNCTION
 // NOTE(review): the signature line (Doxygen line 196: return type and function
 // name) was lost in extraction.
 197 const int local_subdomain_id,
 198 const int x_cell,
 199 const int y_cell,
 200 const int r_cell,
 201 const int wedge ) const
 202 {
 203 // request from storage
 204 if ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off )
 205 {
 // Hard failure (device-side abort) when storage is active but the matrix is missing.
 206 if ( !local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge ) )
 207 {
 208 Kokkos::abort( "No matrix found at that spatial index." );
 209 }
 210 return local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge );
 211 }
 212 else
 213 {
 214 return assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge );
 215 }
 216 }
217
 /// @brief Applies the operator: dst (+)= A * src, launched as a Kokkos team kernel.
 /// In Replace mode dst is zeroed first; in CommunicateAdditively mode the
 /// halo contributions in dst are exchanged after the kernel.
 /// @throws std::runtime_error if src/dst or src/grid extents do not match.
 218 void apply_impl( const SrcVectorType& src, DstVectorType& dst )
 219 {
 220 util::Timer timer_apply( "epsilon_divdiv_apply" );
 221
 222 if ( operator_apply_mode_ == linalg::OperatorApplyMode::Replace )
 223 {
 224 assign( dst, 0 );
 225 }
 226
 // Cache raw grid-data views as members so operator()(Team) can access them.
 227 src_ = src.grid_data();
 228 dst_ = dst.grid_data();
 229
 230 if ( src_.extent( 0 ) != dst_.extent( 0 ) || src_.extent( 1 ) != dst_.extent( 1 ) ||
 231 src_.extent( 2 ) != dst_.extent( 2 ) || src_.extent( 3 ) != dst_.extent( 3 ) )
 232 {
 233 throw std::runtime_error( "EpsilonDivDiv: src/dst mismatch" );
 234 }
 235
 // NOTE(review): this second check compares src against the grid, but reuses
 // the "src/dst mismatch" message — consider "src/grid mismatch" here.
 236 if ( src_.extent( 1 ) != grid_.extent( 1 ) || src_.extent( 2 ) != grid_.extent( 2 ) )
 237 {
 238 throw std::runtime_error( "EpsilonDivDiv: src/dst mismatch" );
 239 }
 240
 241 util::Timer timer_kernel( "epsilon_divdiv_kernel" );
 // NOTE(review): num_cells appears unused in the visible code below
 // (the team policy is sized from blocks_/block_size_ instead).
 242 const auto num_cells =
 243 src_.extent( 0 ) * ( src_.extent( 1 ) - 1 ) * ( src_.extent( 2 ) - 1 ) * ( src_.extent( 3 ) - 1 );
 244
 // Per-team scratch: g1 (2 wedges x 1 quad point x 3x3) + g2 (2 wedges x 1 quad point).
 245 constexpr int size_g1 = num_wedges_per_hex_cell * 1 * 9 * sizeof( ScalarT );
 246 constexpr int size_g2 = num_wedges_per_hex_cell * 1 * sizeof( ScalarT );
 247 constexpr int shmem_size = size_g1 + size_g2;
 248 Kokkos::TeamPolicy<> policy =
 249 Kokkos::TeamPolicy<>( blocks_, block_size_ ).set_scratch_size( 0, Kokkos::PerTeam( shmem_size ));
 250
 251
 252 Kokkos::parallel_for( "matvec", policy, *this );
 253
 254 //Kokkos::parallel_for( "matvec", grid::shell::local_domain_md_range_policy_cells( domain_ ), *this );
 255 Kokkos::fence();
 256 timer_kernel.stop();
 257
 258 if ( operator_communication_mode_ == linalg::OperatorCommunicationMode::CommunicateAdditively )
 259 {
 260 util::Timer timer_comm( "epsilon_divdiv_comm" );
 261
 // NOTE(review): the name of the additive-communication routine (Doxygen
 // line 262) was lost in extraction; only its argument list remains.
 263 domain_, dst_, send_buffers_, recv_buffers_ );
 265 }
 266 }
267
 // Precompute body. NOTE(review): the signature line (Doxygen line 268,
 // presumably `void precompute()`) was lost in extraction. Launches the
 // lateral-cell kernel (operator()(int,int,int)) that fills g1_ and g2_ with
 // the radially-constant Jacobian factors and determinants.
 269 {
 270 Kokkos::parallel_for( "precompute", grid::shell::local_domain_md_range_policy_cells_lateral( domain_ ), *this );
 271 }
272
 /// @brief Precompute kernel body: for one lateral cell, stores the lateral
 /// Jacobian determinant (g2_) and inverse-transposed lateral Jacobian (g1_)
 /// for both wedges at every quadrature point.
 273 KOKKOS_INLINE_FUNCTION void operator()( const int local_subdomain_id, const int x_cell, const int y_cell ) const
 274 {
 275 // Gather surface points for each wedge.
 // NOTE(review): the declaration of `wedge_phy_surf` (Doxygen line 276) was
 // lost in extraction; it is the output buffer filled by the call below.
 277 wedge_surface_physical_coords( wedge_phy_surf, grid_, local_subdomain_id, x_cell, y_cell );
 278
 279 for ( int wedge = 0; wedge < num_wedges_per_hex_cell; ++wedge )
 280 {
 281 const auto p1_phy = wedge_phy_surf[wedge][0];
 282 const auto p2_phy = wedge_phy_surf[wedge][1];
 283 const auto p3_phy = wedge_phy_surf[wedge][2];
 284 for ( int q = 0; q < num_quad_points; ++q )
 285 {
 286 const auto quad_point = quad_points[q];
 // Lateral Jacobian at this quadrature point; |det| is the area factor.
 287 const auto b = jac_lat( p1_phy, p2_phy, p3_phy, quad_point( 0 ), quad_point( 1 ) );
 288 const ScalarT det_b = b.det();
 289 const ScalarT g2 = Kokkos::abs( det_b );
 290 g2_( local_subdomain_id, x_cell, y_cell, wedge, q ) = g2;
 291 const auto b_inv_t = b.inv_transposed( det_b );
 292 for ( int d1 = 0; d1 < 3; ++d1 )
 293 {
 294 for ( int d2 = 0; d2 < 3; ++d2 )
 295 {
 296 g1_( local_subdomain_id, x_cell, y_cell, wedge, q, d1, d2 ) = b_inv_t( d1, d2 );
 297 }
 298 }
 299 }
 300 }
 301 }
302
 303 KOKKOS_INLINE_FUNCTION
 // Decomposes a flat global cell index into (subdomain, x, y, r) coordinates.
 // NOTE(review): the function name line (Doxygen line 304) was lost in
 // extraction. Also suspicious: `num_cells_r` is never used, and both y_cell
 // and r_cell are derived from num_cells_x — this looks correct only when the
 // flat index is laid out as ((subdomain, x, y, r)) with the innermost stride
 // being num_cells_x == num_cells_r; verify the intended layout against callers.
 305 int global_cell_idx,
 306 int cells_per_subdomain,
 307 int num_cells_x,
 308 int num_cells_y,
 309 int num_cells_r,
 310 int& local_subdomain_id,
 311 int& x_cell,
 312 int& y_cell,
 313 int& r_cell ) const
 314 {
 315 local_subdomain_id = global_cell_idx / cells_per_subdomain;
 316 int rem = global_cell_idx % cells_per_subdomain;
 317 x_cell = rem / ( num_cells_x * num_cells_y );
 318 rem = rem % ( num_cells_x * num_cells_y );
 319 y_cell = rem / num_cells_x;
 320 r_cell = rem % num_cells_x;
 321 }
322
 323 using Team = Kokkos::TeamPolicy<>::member_type;
 /// @brief Team kernel body: one team per (subdomain, lateral cell, radial block);
 /// one thread per radial cell in the block. Either applies stored/assembled
 /// local matrices, or runs the generated fused local matvec using the
 /// precomputed lateral Jacobian factors (g1_/g2_) staged in team scratch.
 /// Results are accumulated into dst_ with atomic adds (neighbouring cells
 /// share nodes).
 324 KOKKOS_INLINE_FUNCTION void operator()( const Team& team ) const
 325 {
 326 int local_subdomain_id, x_cell, y_cell, r_cell;
 327
 // Decode league rank into (subdomain, x, y, radial block); the bit tricks
 // assume hex_lat_ == 2^lat_refinement_level_ (power of two).
 328 {
 329 const int league_rank = team.league_rank();
 330 int tmp = league_rank;
 331 const int r_block_index = tmp % blocks_per_column_;
 332 tmp /= blocks_per_column_;
 333 y_cell = tmp & ( hex_lat_ - 1 ); // league_rank % hex_lat_
 334 tmp = tmp >> lat_refinement_level_; // league_rank / hex_lat_
 335 x_cell = tmp & ( hex_lat_ - 1 ); // tmp % hex_lat_
 336 local_subdomain_id = tmp >> lat_refinement_level_; // tmp / hex_lat_
 337
 338 const int thread_index = team.team_rank();
 339 const int block_size = team.team_size();
 // NOTE(review): no visible guard for r_cell >= hex_rad_ — when hex_rad_ is
 // not a multiple of block_size_ (possible once hex_rad_ > 128), threads of
 // the last radial block would index out of range. Confirm a guard exists in
 // the stripped lines or that hex_rad_ is always a multiple of block_size_.
 340 r_cell = r_block_index * block_size + thread_index;
 341 }
 342
 343 // If we have stored lmatrices, use them.
 344 // It's the user's responsibility to write meaningful matrices via set_lmatrix()
 345 // We probably never want to assemble lmatrices with DCA and store,
 346 // so GCA should be the actor storing matrices.
 347 // use stored matrices (at least on some elements)
 348 if ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off )
 349 {
 // NOTE(review): the declaration of the per-wedge local matrices `A[2]`
 // (Doxygen line 350) was lost in extraction.
 351
 352 if ( operator_stored_matrix_mode_ == linalg::OperatorStoredMatrixMode::Full )
 353 {
 354 A[0] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
 355 A[1] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
 356 }
 357 else if ( operator_stored_matrix_mode_ == linalg::OperatorStoredMatrixMode::Selective )
 358 {
 359 if ( local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 ) &&
 360 local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 ) )
 361 {
 362 A[0] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
 363 A[1] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
 364 }
 365 else
 366 {
 367 // Kokkos::abort("Matrix not found.");
 368 A[0] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
 369 A[1] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
 370 }
 371 }
 372 // BCs are applied by GCA ... to be discussed for free-slip
 373
 374 if ( diagonal_ )
 375 {
 376 A[0] = A[0].diagonal();
 377 A[1] = A[1].diagonal();
 378 }
 379
 // Gather per-wedge source DoFs for each vector component into the local
 // vectors. NOTE(review): the declarations of src/src_d and the gather-call
 // name (Doxygen lines 380, 383-384) were lost in extraction.
 381 for ( int dimj = 0; dimj < 3; dimj++ )
 382 {
 385 src_d, local_subdomain_id, x_cell, y_cell, r_cell, dimj, src_ );
 386
 387 for ( int wedge = 0; wedge < num_wedges_per_hex_cell; wedge++ )
 388 {
 389 for ( int i = 0; i < num_nodes_per_wedge; i++ )
 390 {
 391 src[wedge]( dimj * num_nodes_per_wedge + i ) = src_d[wedge]( i );
 392 }
 393 }
 394 }
 395
 // Local matvec per wedge (dst declaration, Doxygen line 396, stripped).
 397
 398 dst[0] = A[0] * src[0];
 399 dst[1] = A[1] * src[1];
 400
 // Scatter per-component slices back into dst_. NOTE(review): the dst_d
 // declaration and the scatter-call name (lines 402-403, 407) were stripped.
 401 for ( int dimi = 0; dimi < 3; dimi++ )
 402 {
 404 dst_d[0] = dst[0].template slice< 6 >( dimi * num_nodes_per_wedge );
 405 dst_d[1] = dst[1].template slice< 6 >( dimi * num_nodes_per_wedge );
 406
 408 dst_, local_subdomain_id, x_cell, y_cell, r_cell, dimi, dst_d );
 409 }
 410 }
 411 else
 412 {
 // On-the-fly fused local matvec (generated code). Gather the four surface
 // nodes of the quad cell and split them into the two triangular wedges.
 413 double wedge_surf_phy_coords[2][3][3];
 414 double quad_surface_coords[2][2][3];
 415 quad_surface_coords[0][0][0] = grid_( local_subdomain_id, x_cell, y_cell, 0 );
 416 quad_surface_coords[0][0][1] = grid_( local_subdomain_id, x_cell, y_cell, 1 );
 417 quad_surface_coords[0][0][2] = grid_( local_subdomain_id, x_cell, y_cell, 2 );
 418 quad_surface_coords[0][1][0] = grid_( local_subdomain_id, x_cell, y_cell + 1, 0 );
 419 quad_surface_coords[0][1][1] = grid_( local_subdomain_id, x_cell, y_cell + 1, 1 );
 420 quad_surface_coords[0][1][2] = grid_( local_subdomain_id, x_cell, y_cell + 1, 2 );
 421 quad_surface_coords[1][0][0] = grid_( local_subdomain_id, x_cell + 1, y_cell, 0 );
 422 quad_surface_coords[1][0][1] = grid_( local_subdomain_id, x_cell + 1, y_cell, 1 );
 423 quad_surface_coords[1][0][2] = grid_( local_subdomain_id, x_cell + 1, y_cell, 2 );
 424 quad_surface_coords[1][1][0] = grid_( local_subdomain_id, x_cell + 1, y_cell + 1, 0 );
 425 quad_surface_coords[1][1][1] = grid_( local_subdomain_id, x_cell + 1, y_cell + 1, 1 );
 426 quad_surface_coords[1][1][2] = grid_( local_subdomain_id, x_cell + 1, y_cell + 1, 2 );
 427 wedge_surf_phy_coords[0][0][0] = quad_surface_coords[0][0][0];
 428 wedge_surf_phy_coords[0][0][1] = quad_surface_coords[0][0][1];
 429 wedge_surf_phy_coords[0][0][2] = quad_surface_coords[0][0][2];
 430 wedge_surf_phy_coords[0][1][0] = quad_surface_coords[1][0][0];
 431 wedge_surf_phy_coords[0][1][1] = quad_surface_coords[1][0][1];
 432 wedge_surf_phy_coords[0][1][2] = quad_surface_coords[1][0][2];
 433 wedge_surf_phy_coords[0][2][0] = quad_surface_coords[0][1][0];
 434 wedge_surf_phy_coords[0][2][1] = quad_surface_coords[0][1][1];
 435 wedge_surf_phy_coords[0][2][2] = quad_surface_coords[0][1][2];
 436 wedge_surf_phy_coords[1][0][0] = quad_surface_coords[1][1][0];
 437 wedge_surf_phy_coords[1][0][1] = quad_surface_coords[1][1][1];
 438 wedge_surf_phy_coords[1][0][2] = quad_surface_coords[1][1][2];
 439 wedge_surf_phy_coords[1][1][0] = quad_surface_coords[0][1][0];
 440 wedge_surf_phy_coords[1][1][1] = quad_surface_coords[0][1][1];
 441 wedge_surf_phy_coords[1][1][2] = quad_surface_coords[0][1][2];
 442 wedge_surf_phy_coords[1][2][0] = quad_surface_coords[1][0][0];
 443 wedge_surf_phy_coords[1][2][1] = quad_surface_coords[1][0][1];
 444 wedge_surf_phy_coords[1][2][2] = quad_surface_coords[1][0][2];
 // Inner/outer radii of this radial cell.
 445 double r_0 = radii_( local_subdomain_id, r_cell );
 446 double r_1 = radii_( local_subdomain_id, r_cell + 1 );
 // Gather the 6 nodal source values per wedge and component.
 447 double src_local_hex[3][2][6];
 448 int dim;
 449 for ( dim = 0; dim < 3; dim += 1 )
 450 {
 451 src_local_hex[dim][0][0] = src_( local_subdomain_id, x_cell, y_cell, r_cell, dim );
 452 src_local_hex[dim][0][1] = src_( local_subdomain_id, x_cell + 1, y_cell, r_cell, dim );
 453 src_local_hex[dim][0][2] = src_( local_subdomain_id, x_cell, y_cell + 1, r_cell, dim );
 454 src_local_hex[dim][0][3] = src_( local_subdomain_id, x_cell, y_cell, r_cell + 1, dim );
 455 src_local_hex[dim][0][4] = src_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1, dim );
 456 src_local_hex[dim][0][5] = src_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1, dim );
 457 src_local_hex[dim][1][0] = src_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell, dim );
 458 src_local_hex[dim][1][1] = src_( local_subdomain_id, x_cell, y_cell + 1, r_cell, dim );
 459 src_local_hex[dim][1][2] = src_( local_subdomain_id, x_cell + 1, y_cell, r_cell, dim );
 460 src_local_hex[dim][1][3] = src_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell + 1, dim );
 461 src_local_hex[dim][1][4] = src_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1, dim );
 462 src_local_hex[dim][1][5] = src_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1, dim );
 463 };
 // Gather the nodal coefficient values per wedge.
 464 double k_local_hex[2][6];
 465 k_local_hex[0][0] = k_( local_subdomain_id, x_cell, y_cell, r_cell );
 466 k_local_hex[0][1] = k_( local_subdomain_id, x_cell + 1, y_cell, r_cell );
 467 k_local_hex[0][2] = k_( local_subdomain_id, x_cell, y_cell + 1, r_cell );
 468 k_local_hex[0][3] = k_( local_subdomain_id, x_cell, y_cell, r_cell + 1 );
 469 k_local_hex[0][4] = k_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1 );
 470 k_local_hex[0][5] = k_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1 );
 471 k_local_hex[1][0] = k_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell );
 472 k_local_hex[1][1] = k_( local_subdomain_id, x_cell, y_cell + 1, r_cell );
 473 k_local_hex[1][2] = k_( local_subdomain_id, x_cell + 1, y_cell, r_cell );
 474 k_local_hex[1][3] = k_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell + 1 );
 475 k_local_hex[1][4] = k_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1 );
 476 k_local_hex[1][5] = k_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1 );
 // Single-point Felippa quadrature rule (barycentric midpoint, weight 1).
 477 double qp_array[1][3];
 478 double qw_array[1];
 479 qp_array[0][0] = 0.33333333333333331;
 480 qp_array[0][1] = 0.33333333333333331;
 481 qp_array[0][2] = 0.0;
 482 qw_array[0] = 1.0;
 // Dirichlet-style boundary handling: when treating boundaries (and not in
 // diagonal mode), skip the 3 nodes on the CMB / surface side of the wedge.
 483 int at_cmb_boundary = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
 484 int at_surface_boundary = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
 485 int cmb_shift = ( ( treat_boundary_ && diagonal_ == false && at_cmb_boundary != 0 ) ? ( 3 ) : ( 0 ) );
 // NOTE(review): max_rad appears unused below — possibly a leftover from a
 // removed radial bounds check.
 486 int max_rad = radii_.extent( 1 ) - 1;
 487 int surface_shift =
 488 ( ( treat_boundary_ && diagonal_ == false && at_surface_boundary != 0 ) ? ( 3 ) : ( 0 ) );
 489 double dst_array[3][2][6] = { 0 };
 // Radial derivative of the reference-to-physical map (constant per cell).
 490 double grad_r = -1.0 / 2.0 * r_0 + ( 1.0 / 2.0 ) * r_1;
 491 double grad_r_inv = 1.0 / grad_r;
 492 int w = 0;
 493
 494 constexpr int N_q = 1;
 495 constexpr int size_g1 = num_wedges_per_hex_cell * N_q * 9 * sizeof( ScalarT );
 496 constexpr int size_g2 = num_wedges_per_hex_cell * N_q * sizeof( ScalarT );
 497
 // Team scratch buffers for the precomputed lateral Jacobian data; all
 // threads of a team share the same lateral cell, so one copy suffices.
 498 ScalarT* shmem_g1 = (ScalarT*) team.team_shmem().get_shmem( size_g1 );
 499 ScalarT* shmem_g2 = (ScalarT*) team.team_shmem().get_shmem( size_g2 );
 500
 // Thread 0 stages g1_/g2_ into scratch; the barrier publishes it to the team.
 501 if ( team.team_rank() == 0 )
 502 {
 503 for ( int wedge = 0; wedge < num_wedges_per_hex_cell; ++wedge )
 504 {
 505 for ( int q = 0; q < N_q; ++q )
 506 {
 507 shmem_g2[wedge * N_q + q] = g2_( local_subdomain_id, x_cell, y_cell, wedge, q );
 508 for ( int d1 = 0; d1 < 3; ++d1 )
 509 {
 510 for ( int d2 = 0; d2 < 3; ++d2 )
 511 {
 512 shmem_g1[wedge * N_q * 9 + q * 9 + 3 * d1 + d2] =
 513 g1_( local_subdomain_id, x_cell, y_cell, wedge, q, d1, d2 );
 514 }
 515 }
 516 }
 517 }
 518 }
 519 team.team_barrier();
 520
 521 /* Apply local matrix for both wedges and accumulated for all quadrature points. */;
 522 for ( w = 0; w < 2; w += 1 )
 523 {
 524 int q = 0;
 525 for ( q = 0; q < 1; q += 1 )
 526 {
 527 /* Coefficient evaluation on current wedge w */;
 528 double tmpcse_k_eval_0 = ( 1.0 / 2.0 ) * qp_array[q][2];
 529 double tmpcse_k_eval_1 = 1.0 / 2.0 - tmpcse_k_eval_0;
 530 double tmpcse_k_eval_2 = tmpcse_k_eval_0 + 1.0 / 2.0;
 531 double tmpcse_k_eval_3 = -qp_array[q][0] - qp_array[q][1] + 1;
 532 double k_eval = tmpcse_k_eval_1 * tmpcse_k_eval_3 * k_local_hex[w][0] +
 533 tmpcse_k_eval_1 * k_local_hex[w][1] * qp_array[q][0] +
 534 tmpcse_k_eval_1 * k_local_hex[w][2] * qp_array[q][1] +
 535 tmpcse_k_eval_2 * tmpcse_k_eval_3 * k_local_hex[w][3] +
 536 tmpcse_k_eval_2 * k_local_hex[w][4] * qp_array[q][0] +
 537 tmpcse_k_eval_2 * k_local_hex[w][5] * qp_array[q][1];
 538 /*
 539 Load the radially constant parts of the Jacobial from storage,
 540 scale with radial parts of the current element. */
 541 ;
 542 double J_invT[3][3] = { 0 };
 543 double r_inv = 1.0 / ( r_0 + ( -1.0 / 2.0 * r_0 + ( 1.0 / 2.0 ) * r_1 ) * ( qp_array[q][2] + 1 ) );
 544 double factors[3] = { r_inv, r_inv, grad_r_inv };
 545 int d1;
 546 int d2;
 547 for ( d1 = 0; d1 < 3; d1 += 1 )
 548 {
 549 for ( d2 = 0; d2 < 3; d2 += 1 )
 550 {
 551 J_invT[d1][d2] = factors[d2] * shmem_g1[w * N_q * 9 + q * 9 + 3 * d1 + d2];
 552 };
 553 };
 554 double g2 = shmem_g2[w * N_q + q];
 555 /* Computation of the gradient of the scalar shape functions belonging to each DoF.
 556 In the Eps-component-loops, we insert the gradient at the entry of the
 557 vectorial gradient matrix corresponding to the Eps-component. */
 558 ;
 559
 560 double scalar_grad[6][3] = { 0 };
 561 double tmpcse_grad_i_0 = ( 1.0 / 2.0 ) * qp_array[q][2];
 562 double tmpcse_grad_i_1 = tmpcse_grad_i_0 - 1.0 / 2.0;
 563 double tmpcse_grad_i_2 = ( 1.0 / 2.0 ) * qp_array[q][0];
 564 double tmpcse_grad_i_3 = ( 1.0 / 2.0 ) * qp_array[q][1];
 565 double tmpcse_grad_i_4 = tmpcse_grad_i_2 + tmpcse_grad_i_3 - 1.0 / 2.0;
 566 double tmpcse_grad_i_5 = tmpcse_grad_i_2 * J_invT[0][2];
 567 double tmpcse_grad_i_6 = -tmpcse_grad_i_1;
 568 double tmpcse_grad_i_7 = tmpcse_grad_i_2 * J_invT[1][2];
 569 double tmpcse_grad_i_8 = tmpcse_grad_i_2 * J_invT[2][2];
 570 double tmpcse_grad_i_9 = tmpcse_grad_i_3 * J_invT[0][2];
 571 double tmpcse_grad_i_10 = tmpcse_grad_i_3 * J_invT[1][2];
 572 double tmpcse_grad_i_11 = tmpcse_grad_i_3 * J_invT[2][2];
 573 double tmpcse_grad_i_12 = tmpcse_grad_i_0 + 1.0 / 2.0;
 574 double tmpcse_grad_i_13 = -tmpcse_grad_i_12;
 575 double tmpcse_grad_i_14 = -tmpcse_grad_i_4;
 576 scalar_grad[0][0] = tmpcse_grad_i_1 * J_invT[0][0] + tmpcse_grad_i_1 * J_invT[0][1] +
 577 tmpcse_grad_i_4 * J_invT[0][2];
 578 scalar_grad[0][1] = tmpcse_grad_i_1 * J_invT[1][0] + tmpcse_grad_i_1 * J_invT[1][1] +
 579 tmpcse_grad_i_4 * J_invT[1][2];
 580 scalar_grad[0][2] = tmpcse_grad_i_1 * J_invT[2][0] + tmpcse_grad_i_1 * J_invT[2][1] +
 581 tmpcse_grad_i_4 * J_invT[2][2];
 582 scalar_grad[1][0] = -tmpcse_grad_i_5 + tmpcse_grad_i_6 * J_invT[0][0];
 583 scalar_grad[1][1] = tmpcse_grad_i_6 * J_invT[1][0] - tmpcse_grad_i_7;
 584 scalar_grad[1][2] = tmpcse_grad_i_6 * J_invT[2][0] - tmpcse_grad_i_8;
 585 scalar_grad[2][0] = tmpcse_grad_i_6 * J_invT[0][1] - tmpcse_grad_i_9;
 586 scalar_grad[2][1] = -tmpcse_grad_i_10 + tmpcse_grad_i_6 * J_invT[1][1];
 587 scalar_grad[2][2] = -tmpcse_grad_i_11 + tmpcse_grad_i_6 * J_invT[2][1];
 588 scalar_grad[3][0] = tmpcse_grad_i_13 * J_invT[0][0] + tmpcse_grad_i_13 * J_invT[0][1] +
 589 tmpcse_grad_i_14 * J_invT[0][2];
 590 scalar_grad[3][1] = tmpcse_grad_i_13 * J_invT[1][0] + tmpcse_grad_i_13 * J_invT[1][1] +
 591 tmpcse_grad_i_14 * J_invT[1][2];
 592 scalar_grad[3][2] = tmpcse_grad_i_13 * J_invT[2][0] + tmpcse_grad_i_13 * J_invT[2][1] +
 593 tmpcse_grad_i_14 * J_invT[2][2];
 594 scalar_grad[4][0] = tmpcse_grad_i_12 * J_invT[0][0] + tmpcse_grad_i_5;
 595 scalar_grad[4][1] = tmpcse_grad_i_12 * J_invT[1][0] + tmpcse_grad_i_7;
 596 scalar_grad[4][2] = tmpcse_grad_i_12 * J_invT[2][0] + tmpcse_grad_i_8;
 597 scalar_grad[5][0] = tmpcse_grad_i_12 * J_invT[0][1] + tmpcse_grad_i_9;
 598 scalar_grad[5][1] = tmpcse_grad_i_10 + tmpcse_grad_i_12 * J_invT[1][1];
 599 scalar_grad[5][2] = tmpcse_grad_i_11 + tmpcse_grad_i_12 * J_invT[2][1];
 600 int dimj;
 601
 602 double grad_u[3][3] = { 0 };
 603 double div_u = 0.0;
 604 int node_idx;
 605 /* In the following, we exploit the outer-product-structure of the local MV both in
 606 the components of the Epsilon operators and in the local DoFs. */
 607 ;
 608 /* Loop to assemble the trial gradient. */;
 609 for ( dimj = 0; dimj < 3; dimj += 1 )
 610 {
 611 if ( diagonal_ == false )
 612 {
 // Boundary nodes (cmb_shift/surface_shift) are excluded from the trial sum.
 613 for ( node_idx = cmb_shift; node_idx < 6 - surface_shift; node_idx += 1 )
 614 {
 615 double E_grad_trial[3][3] = { 0 };
 616 E_grad_trial[0][dimj] = scalar_grad[node_idx][0];
 617 E_grad_trial[1][dimj] = scalar_grad[node_idx][1];
 618 E_grad_trial[2][dimj] = scalar_grad[node_idx][2];
 619 double tmpcse_symgrad_trial_0 = 0.5 * E_grad_trial[0][1] + 0.5 * E_grad_trial[1][0];
 620 double tmpcse_symgrad_trial_1 = 0.5 * E_grad_trial[0][2] + 0.5 * E_grad_trial[2][0];
 621 double tmpcse_symgrad_trial_2 = 0.5 * E_grad_trial[1][2] + 0.5 * E_grad_trial[2][1];
 622 grad_u[0][0] =
 623 1.0 * E_grad_trial[0][0] * src_local_hex[dimj][w][node_idx] + grad_u[0][0];
 624 grad_u[0][1] = tmpcse_symgrad_trial_0 * src_local_hex[dimj][w][node_idx] + grad_u[0][1];
 625 grad_u[0][2] = tmpcse_symgrad_trial_1 * src_local_hex[dimj][w][node_idx] + grad_u[0][2];
 626 grad_u[1][0] = tmpcse_symgrad_trial_0 * src_local_hex[dimj][w][node_idx] + grad_u[1][0];
 627 grad_u[1][1] =
 628 1.0 * E_grad_trial[1][1] * src_local_hex[dimj][w][node_idx] + grad_u[1][1];
 629 grad_u[1][2] = tmpcse_symgrad_trial_2 * src_local_hex[dimj][w][node_idx] + grad_u[1][2];
 630 grad_u[2][0] = tmpcse_symgrad_trial_1 * src_local_hex[dimj][w][node_idx] + grad_u[2][0];
 631 grad_u[2][1] = tmpcse_symgrad_trial_2 * src_local_hex[dimj][w][node_idx] + grad_u[2][1];
 632 grad_u[2][2] =
 633 1.0 * E_grad_trial[2][2] * src_local_hex[dimj][w][node_idx] + grad_u[2][2];
 634 div_u = div_u + E_grad_trial[dimj][dimj] * src_local_hex[dimj][w][node_idx];
 635 };
 636 };
 637 };
 638 int dimi;
 639 /* Loop to pair the assembled trial gradient with the test gradients. */;
 640 for ( dimi = 0; dimi < 3; dimi += 1 )
 641 {
 642 if ( diagonal_ == false )
 643 {
 644 for ( node_idx = cmb_shift; node_idx < 6 - surface_shift; node_idx += 1 )
 645 {
 646 double E_grad_test[3][3] = { 0 };
 647 E_grad_test[0][dimi] = scalar_grad[node_idx][0];
 648 E_grad_test[1][dimi] = scalar_grad[node_idx][1];
 649 E_grad_test[2][dimi] = scalar_grad[node_idx][2];
 650 double tmpcse_symgrad_test_0 = 0.5 * E_grad_test[0][1] + 0.5 * E_grad_test[1][0];
 651 double tmpcse_symgrad_test_1 = 0.5 * E_grad_test[0][2] + 0.5 * E_grad_test[2][0];
 652 double tmpcse_symgrad_test_2 = 0.5 * E_grad_test[1][2] + 0.5 * E_grad_test[2][1];
 653 double tmpcse_pairing_0 = 2 * tmpcse_symgrad_test_0;
 654 double tmpcse_pairing_1 = 2 * tmpcse_symgrad_test_1;
 655 double tmpcse_pairing_2 = 2 * tmpcse_symgrad_test_2;
 // NOTE(review): `/ pow( r_inv, 2 )` multiplies by r^2; a plain `r * r`
 // (or 1.0/(r_inv*r_inv)) would avoid the transcendental pow() on device.
 656 dst_array[dimi][w][node_idx] =
 657 g2 * grad_r * k_eval *
 658 ( -0.66666666666666663 * div_u * E_grad_test[dimi][dimi] +
 659 tmpcse_pairing_0 * grad_u[0][1] + tmpcse_pairing_0 * grad_u[1][0] +
 660 tmpcse_pairing_1 * grad_u[0][2] + tmpcse_pairing_1 * grad_u[2][0] +
 661 tmpcse_pairing_2 * grad_u[1][2] + tmpcse_pairing_2 * grad_u[2][1] +
 662 2.0 * E_grad_test[0][0] * grad_u[0][0] +
 663 2.0 * E_grad_test[1][1] * grad_u[1][1] +
 664 2.0 * E_grad_test[2][2] * grad_u[2][2] ) *
 665 qw_array[q] / pow( r_inv, 2 ) +
 666 dst_array[dimi][w][node_idx];
 667 };
 668 };
 669 };
 670 int dim_diagBC;
 671 /* Loop to apply BCs or only the diagonal of the operator. */;
 672 for ( dim_diagBC = 0; dim_diagBC < 3; dim_diagBC += 1 )
 673 {
 674 if ( diagonal_ || treat_boundary_ && ( at_cmb_boundary != 0 || at_surface_boundary != 0 ) )
 675 {
 675 // presumably the shifts are swapped on purpose here so that exactly the
 // boundary rows excluded above receive their diagonal entry — TODO confirm.
 676 int node_idx;
 677 for ( node_idx = surface_shift; node_idx < 6 - cmb_shift; node_idx += 1 )
 678 {
 679 double E_grad_test[3][3] = { 0 };
 680 E_grad_test[0][dim_diagBC] = scalar_grad[node_idx][0];
 681 E_grad_test[1][dim_diagBC] = scalar_grad[node_idx][1];
 682 E_grad_test[2][dim_diagBC] = scalar_grad[node_idx][2];
 683
 684 double grad_u_diag[3][3] = { 0 };
 685 double tmpcse_symgrad_test_0 = 0.5 * E_grad_test[0][1] + 0.5 * E_grad_test[1][0];
 686 double tmpcse_symgrad_test_1 = 0.5 * E_grad_test[0][2] + 0.5 * E_grad_test[2][0];
 687 double tmpcse_symgrad_test_2 = 0.5 * E_grad_test[1][2] + 0.5 * E_grad_test[2][1];
 688 grad_u_diag[0][0] = 1.0 * E_grad_test[0][0] * src_local_hex[dim_diagBC][w][node_idx];
 689 grad_u_diag[0][1] = tmpcse_symgrad_test_0 * src_local_hex[dim_diagBC][w][node_idx];
 690 grad_u_diag[0][2] = tmpcse_symgrad_test_1 * src_local_hex[dim_diagBC][w][node_idx];
 691 grad_u_diag[1][0] = tmpcse_symgrad_test_0 * src_local_hex[dim_diagBC][w][node_idx];
 692 grad_u_diag[1][1] = 1.0 * E_grad_test[1][1] * src_local_hex[dim_diagBC][w][node_idx];
 693 grad_u_diag[1][2] = tmpcse_symgrad_test_2 * src_local_hex[dim_diagBC][w][node_idx];
 694 grad_u_diag[2][0] = tmpcse_symgrad_test_1 * src_local_hex[dim_diagBC][w][node_idx];
 695 grad_u_diag[2][1] = tmpcse_symgrad_test_2 * src_local_hex[dim_diagBC][w][node_idx];
 696 grad_u_diag[2][2] = 1.0 * E_grad_test[2][2] * src_local_hex[dim_diagBC][w][node_idx];
 697 double tmpcse_pairing_0 = 4 * src_local_hex[dim_diagBC][w][node_idx];
 698 double tmpcse_pairing_1 = 2.0 * src_local_hex[dim_diagBC][w][node_idx];
 699 dst_array[dim_diagBC][w][node_idx] =
 700 g2 * grad_r * k_eval *
 701 ( tmpcse_pairing_0 * pow( tmpcse_symgrad_test_0, 2 ) +
 702 tmpcse_pairing_0 * pow( tmpcse_symgrad_test_1, 2 ) +
 703 tmpcse_pairing_0 * pow( tmpcse_symgrad_test_2, 2 ) +
 704 tmpcse_pairing_1 * pow( E_grad_test[0][0], 2 ) +
 705 tmpcse_pairing_1 * pow( E_grad_test[1][1], 2 ) +
 706 tmpcse_pairing_1 * pow( E_grad_test[2][2], 2 ) -
 707 0.66666666666666663 * pow( E_grad_test[dim_diagBC][dim_diagBC], 2 ) *
 708 src_local_hex[dim_diagBC][w][node_idx] ) *
 709 qw_array[q] / pow( r_inv, 2 ) +
 710 dst_array[dim_diagBC][w][node_idx];
 711 };
 712 };
 713 };
 714 };
 715 };
 // Scatter: wedge-local node contributions are merged per shared grid node
 // and accumulated atomically (neighbouring cells/wedges write the same nodes).
 716 int dim_add;
 717 for ( dim_add = 0; dim_add < 3; dim_add += 1 )
 718 {
 719 Kokkos::atomic_add(
 720 &dst_( local_subdomain_id, x_cell, y_cell, r_cell, dim_add ), dst_array[dim_add][0][0] );
 721 Kokkos::atomic_add(
 722 &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell, dim_add ),
 723 dst_array[dim_add][0][1] + dst_array[dim_add][1][2] );
 724 Kokkos::atomic_add(
 725 &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell, dim_add ),
 726 dst_array[dim_add][0][2] + dst_array[dim_add][1][1] );
 727 Kokkos::atomic_add(
 728 &dst_( local_subdomain_id, x_cell, y_cell, r_cell + 1, dim_add ), dst_array[dim_add][0][3] );
 729 Kokkos::atomic_add(
 730 &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1, dim_add ),
 731 dst_array[dim_add][0][4] + dst_array[dim_add][1][5] );
 732 Kokkos::atomic_add(
 733 &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1, dim_add ),
 734 dst_array[dim_add][0][5] + dst_array[dim_add][1][4] );
 735 Kokkos::atomic_add(
 736 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell, dim_add ), dst_array[dim_add][1][0] );
 737 Kokkos::atomic_add(
 738 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell + 1, dim_add ),
 739 dst_array[dim_add][1][3] );
 740 };
 741 }
 742 }
743
744 /// @brief: For both trial and test space this function sets up a vector:
745 /// each vector element holds the symmetric gradient (a 3x3 matrix) of the shape function of the corresponding dof
746 /// (if dimi == dimj, these are the same and we are on the diagonal of the vectorial diffusion operator)
747 /// Additionally, we compute the scalar factor for the numerical integral comp: determinant of the jacobian,
748 /// evaluation of the coefficient k on the element and the quadrature weight of the current quad-point.
749
750 /// The idea of this function is that the two vectors can be:
751 /// - accumulated to the result of the local matvec with 2 * num_nodes_per_wedge complexity
752 /// by scaling the dot product of the trial vec and local src dofs with each element of the test vec
753 /// (and adding to the dst dofs, this is the fused local matvec).
754 /// - propagated to the local matrix by an outer product of the two vectors
755 /// (without applying it to dofs). This is e.g. required to assemble the finest grid local
756 /// matrix on-the-fly during GCA/Galerkin coarsening.
757
758 ///
 759 KOKKOS_INLINE_FUNCTION void assemble_trial_test_vecs(
 760 const int wedge,
 761 const dense::Vec< ScalarType, VecDim >& quad_point,
 762 const ScalarType quad_weight,
 763 const ScalarT r_1,
 764 const ScalarT r_2,
 765 dense::Vec< ScalarT, 3 > ( *wedge_phy_surf )[3],
 766 const dense::Vec< ScalarT, 6 >* k_local_hex,
 767 const int dimi,
 768 const int dimj,
 // NOTE(review): two output parameters (Doxygen lines 769-770, the
 // `sym_grad_i` / `sym_grad_j` arrays written below) were lost in extraction.
 771 ScalarType& jdet_keval_quadweight ) const
 772 {
 // Full 3D Jacobian of the wedge at this quadrature point; its inverse
 // transpose maps reference shape-function gradients to physical space.
 773 dense::Mat< ScalarType, VecDim, VecDim > J = jac( wedge_phy_surf[wedge], r_1, r_2, quad_point );
 774 const auto det = J.det();
 775 const auto abs_det = Kokkos::abs( det );
 776 const dense::Mat< ScalarType, VecDim, VecDim > J_inv_transposed = J.inv_transposed( det );
 777
 778 // dot of coeff dofs and element-local shape functions to evaluate the coefficent on the current element
 779 ScalarType k_eval = 0.0;
 780 for ( int k = 0; k < num_nodes_per_wedge; k++ )
 781 {
 782 k_eval += shape( k, quad_point ) * k_local_hex[wedge]( k );
 783 }
 784
 // Symmetric gradients of each scalar shape function, inserted at vector
 // components dimi (test) and dimj (trial).
 785 for ( int k = 0; k < num_nodes_per_wedge; k++ )
 786 {
 787 sym_grad_i[k] = symmetric_grad( J_inv_transposed, quad_point, k, dimi );
 788 sym_grad_j[k] = symmetric_grad( J_inv_transposed, quad_point, k, dimj );
 789 }
 // Combined scalar integration factor: quad weight * coefficient * |det J|.
 790 jdet_keval_quadweight = quad_weight * k_eval * abs_det;
 791 }
792
793 /// @brief assemble the local matrix and return it for a given element, wedge, and vectorial component
794 /// (determined by dimi, dimj)
795 KOKKOS_INLINE_FUNCTION
797 const int local_subdomain_id,
798 const int x_cell,
799 const int y_cell,
800 const int r_cell,
801 const int wedge ) const
802 {
803 // Gather surface points for each wedge.
804 // TODO gather this for only 1 wedge
806 wedge_surface_physical_coords( wedge_phy_surf, grid_, local_subdomain_id, x_cell, y_cell );
807
808 // Gather wedge radii.
809 const ScalarT r_1 = radii_( local_subdomain_id, r_cell );
810 const ScalarT r_2 = radii_( local_subdomain_id, r_cell + 1 );
811
813 extract_local_wedge_scalar_coefficients( k_local_hex, local_subdomain_id, x_cell, y_cell, r_cell, k_ );
814
815 // Compute the local element matrix.
817 for ( int dimi = 0; dimi < 3; ++dimi )
818 {
819 for ( int dimj = 0; dimj < 3; ++dimj )
820 {
821 // spatial dimensions: quadrature points and wedge
822 for ( int q = 0; q < num_quad_points; q++ )
823 {
826 ScalarType jdet_keval_quadweight = 0;
828 wedge,
829 quad_points[q],
830 quad_weights[q],
831 r_1,
832 r_2,
833 wedge_phy_surf,
834 k_local_hex,
835 dimi,
836 dimj,
837 sym_grad_i,
838 sym_grad_j,
839 jdet_keval_quadweight );
840
841 // propagate on local matrix by outer product of test and trial vecs
842 for ( int i = 0; i < num_nodes_per_wedge; i++ )
843 {
844 for ( int j = 0; j < num_nodes_per_wedge; j++ )
845 {
846 A( i + dimi * num_nodes_per_wedge, j + dimj * num_nodes_per_wedge ) +=
847 jdet_keval_quadweight *
848 ( 2 * sym_grad_j[j].double_contract( sym_grad_i[i] ) -
849 2.0 / 3.0 * sym_grad_j[j]( dimj, dimj ) * sym_grad_i[i]( dimi, dimi ) );
850 // for the div, we just extract the component from the gradient vector
851 }
852 }
853 }
854 }
855 }
856
857 if ( treat_boundary_ )
858 {
860 boundary_mask.fill( 1.0 );
861
862 for ( int dimi = 0; dimi < 3; ++dimi )
863 {
864 for ( int dimj = 0; dimj < 3; ++dimj )
865 {
866 if ( r_cell == 0 )
867 {
868 // Inner boundary (CMB).
869 for ( int i = 0; i < 6; i++ )
870 {
871 for ( int j = 0; j < 6; j++ )
872 {
873 // on diagonal components of the vectorial diffusion operator, we exclude the diagonal entries from elimination
874 if ( ( dimi == dimj && i != j && ( i < 3 || j < 3 ) ) or
875 ( dimi != dimj && ( i < 3 || j < 3 ) ) )
876 {
877 boundary_mask( i + dimi * num_nodes_per_wedge, j + dimj * num_nodes_per_wedge ) =
878 0.0;
879 }
880 }
881 }
882 }
883
884 if ( r_cell + 1 == radii_.extent( 1 ) - 1 )
885 {
886 // Outer boundary (surface).
887 for ( int i = 0; i < 6; i++ )
888 {
889 for ( int j = 0; j < 6; j++ )
890 {
891 // on diagonal components of the vectorial diffusion operator, we exclude the diagonal entries from elimination
892 if ( ( dimi == dimj && i != j && ( i >= 3 || j >= 3 ) ) or
893 ( dimi != dimj && ( i >= 3 || j >= 3 ) ) )
894 {
895 boundary_mask( i + dimi * num_nodes_per_wedge, j + dimj * num_nodes_per_wedge ) =
896 0.0;
897 }
898 }
899 }
900 }
901 }
902 }
903 A.hadamard_product( boundary_mask );
904 }
905
906 return A;
907 }
908
909 // executes the fused local matvec on an element, given the assembled trial and test vectors
910 KOKKOS_INLINE_FUNCTION void fused_local_mv(
911 ScalarType src_local_hex[8],
912 ScalarType dst_local_hex[8],
913 const int wedge,
914 const ScalarType jdet_keval_quadweight,
917 const int dimi,
918 const int dimj,
919 int r_cell ) const
920 {
921 constexpr int offset_x[2][6] = { { 0, 1, 0, 0, 1, 0 }, { 1, 0, 1, 1, 0, 1 } };
922 constexpr int offset_y[2][6] = { { 0, 0, 1, 0, 0, 1 }, { 1, 1, 0, 1, 1, 0 } };
923 constexpr int offset_r[2][6] = { { 0, 0, 0, 1, 1, 1 }, { 0, 0, 0, 1, 1, 1 } };
924
926 ScalarType divu = 0.0;
927 grad_u.fill( 0.0 );
928
929 const bool at_cmb = r_cell == 0;
930 const bool at_surface = r_cell + 1 == radii_.extent( 1 ) - 1;
931 int cmb_shift = 0;
932 int surface_shift = 0;
933
934 // Compute nabla u at this quadrature point.
935 if ( !diagonal_ )
936 {
937 if ( treat_boundary_ && at_cmb )
938 {
939 // at the core-mantle boundary, we exclude dofs that are lower-indexed than the dof on the boundary
940 cmb_shift = 3;
941 }
942 else if ( treat_boundary_ && at_surface )
943 {
944 // at the surface boundary, we exclude dofs that are higher-indexed than the dof on the boundary
945 surface_shift = 3;
946 }
947
948 // accumulate the element-local gradient/divergence of the trial function (loop over columns of local matrix/local dofs)
949 // by dot of trial vec and src dofs
950 for ( int i = 0 + cmb_shift; i < num_nodes_per_wedge - surface_shift; i++ )
951 {
952 grad_u =
953 grad_u +
954 sym_grad_i[i] * src_local_hex[4 * offset_r[wedge][i] + 2 * offset_y[wedge][i] + offset_x[wedge][i]];
955 divu += sym_grad_i[i]( dimi, dimi ) *
956 src_local_hex[4 * offset_r[wedge][i] + 2 * offset_y[wedge][i] + offset_x[wedge][i]];
957 }
958
959 // Add the test function contributions.
960 // for each row of the local matrix (test-functions):
961 // multiply trial part (fully assembled for the current element from loop above) with test part corresponding to the current row/dof
962 // += due to contributions from other elements
963 for ( int j = 0 + cmb_shift; j < num_nodes_per_wedge - surface_shift; j++ )
964 {
965 dst_local_hex[4 * offset_r[wedge][j] + 2 * offset_y[wedge][j] + offset_x[wedge][j]] +=
966 jdet_keval_quadweight * ( 2 * ( sym_grad_j[j] ).double_contract( grad_u ) -
967 2.0 / 3.0 * sym_grad_j[j]( dimj, dimj ) * divu );
968 // for the div, we just extract the component from the gradient vector
969 }
970 }
971
972 // Dirichlet DoFs are only to be eliminated on diagonal blocks of epsilon
973 if ( diagonal_ || ( dimi == dimj && ( treat_boundary_ && ( at_cmb || at_surface ) ) ) )
974 {
975 // for the diagonal elements at the boundary, we switch the shifts
976 for ( int i = 0 + surface_shift; i < num_nodes_per_wedge - cmb_shift; i++ )
977 {
978 const auto grad_u_diag =
979 sym_grad_i[i] * src_local_hex[4 * offset_r[wedge][i] + 2 * offset_y[wedge][i] + offset_x[wedge][i]];
980 const auto div_u_diag =
981 sym_grad_i[i]( dimi, dimi ) *
982 src_local_hex[4 * offset_r[wedge][i] + 2 * offset_y[wedge][i] + offset_x[wedge][i]];
983
984 dst_local_hex[4 * offset_r[wedge][i] + 2 * offset_y[wedge][i] + offset_x[wedge][i]] +=
985 jdet_keval_quadweight * ( 2 * ( sym_grad_j[i] ).double_contract( grad_u_diag ) -
986 2.0 / 3.0 * sym_grad_j[i]( dimj, dimj ) * div_u_diag );
987 }
988 }
989 }
990};
991
994
995} // namespace terra::fe::wedge::operators::shell::epsdivdiv_history
Send and receive buffers for all process-local subdomain boundaries.
Definition communication.hpp:56
grid::Grid2DDataScalar< ScalarT > get_radii() const
Getter for radii member.
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:143
grid::Grid3DDataVec< ScalarT, 3 > get_grid()
Getter for grid member.
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:146
void operator()(const int local_subdomain_id, const int x_cell, const int y_cell) const
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:273
terra::grid::Grid4DDataMatrices< ScalarType, LocalMatrixDim, LocalMatrixDim, 2 > Grid4DDataLocalMatrices
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:33
void assemble_trial_test_vecs(const int wedge, const dense::Vec< ScalarType, VecDim > &quad_point, const ScalarType quad_weight, const ScalarT r_1, const ScalarT r_2, dense::Vec< ScalarT, 3 >(*wedge_phy_surf)[3], const dense::Vec< ScalarT, 6 > *k_local_hex, const int dimi, const int dimj, dense::Mat< ScalarType, VecDim, VecDim > *sym_grad_i, dense::Mat< ScalarType, VecDim, VecDim > *sym_grad_j, ScalarType &jdet_keval_quadweight) const
For both trial and test space this function sets up a vector: each vector element holds the symmetr...
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:759
void set_diagonal(bool v)
Setter/getter for the diagonal member.
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:134
void map_league_idx_to_indices(int global_cell_idx, int cells_per_subdomain, int num_cells_x, int num_cells_y, int num_cells_r, int &local_subdomain_id, int &x_cell, int &y_cell, int &r_cell) const
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:304
void precompute()
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:268
const grid::shell::DistributedDomain & get_domain() const
Getter for domain member.
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:140
void apply_impl(const SrcVectorType &src, DstVectorType &dst)
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:218
bool has_flag(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, grid::shell::ShellBoundaryFlag flag) const
Getter for mask member.
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:150
const grid::Grid4DDataScalar< ScalarType > & k_grid_data()
Getter for coefficient.
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:137
Kokkos::TeamPolicy<>::member_type Team
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:323
void set_local_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge, const dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > &mat) const
Set the local matrix stored in the operator.
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:180
void fused_local_mv(ScalarType src_local_hex[8], ScalarType dst_local_hex[8], const int wedge, const ScalarType jdet_keval_quadweight, dense::Mat< ScalarType, 3, 3 > *sym_grad_i, dense::Mat< ScalarType, 3, 3 > *sym_grad_j, const int dimi, const int dimj, int r_cell) const
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:910
void operator()(const Team &team) const
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:324
ScalarT ScalarType
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:31
void set_stored_matrix_mode(linalg::OperatorStoredMatrixMode operator_stored_matrix_mode, std::optional< int > level_range, std::optional< grid::Grid4DDataScalar< ScalarType > > GCAElements)
allocates memory for the local matrices
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:161
void set_operator_apply_and_communication_modes(const linalg::OperatorApplyMode operator_apply_mode, const linalg::OperatorCommunicationMode operator_communication_mode)
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:125
static constexpr int LocalMatrixDim
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:32
dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > assemble_local_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
assemble the local matrix and return it for a given element, wedge, and vectorial component (determin...
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:796
EpsilonDivDivKerngenV03TeamsPrecomp(const grid::shell::DistributedDomain &domain, const grid::Grid3DDataVec< ScalarT, 3 > &grid, const grid::Grid2DDataScalar< ScalarT > &radii, const grid::Grid4DDataScalar< grid::shell::ShellBoundaryFlag > &mask, const grid::Grid4DDataScalar< ScalarT > &k, BoundaryConditions bcs, bool diagonal, linalg::OperatorApplyMode operator_apply_mode=linalg::OperatorApplyMode::Replace, linalg::OperatorCommunicationMode operator_communication_mode=linalg::OperatorCommunicationMode::CommunicateAdditively, linalg::OperatorStoredMatrixMode operator_stored_matrix_mode=linalg::OperatorStoredMatrixMode::Off)
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:81
dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > get_local_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
Retrieves the local matrix: if there are stored local matrices, the desired local matrix is loaded and r...
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:196
linalg::OperatorStoredMatrixMode get_stored_matrix_mode()
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:176
Parallel data structure organizing the thick spherical shell metadata for distributed (MPI parallel) ...
Definition spherical_shell.hpp:2518
const std::map< SubdomainInfo, std::tuple< LocalSubdomainIdx, SubdomainNeighborhood > > & subdomains() const
Definition spherical_shell.hpp:2650
const DomainInfo & domain_info() const
Returns a const reference.
Definition spherical_shell.hpp:2647
int diamond_lateral_refinement_level() const
Definition spherical_shell.hpp:843
int subdomain_num_nodes_radially() const
Equivalent to calling subdomain_num_nodes_radially( subdomain_refinement_level() )
Definition spherical_shell.hpp:861
int subdomain_num_nodes_per_side_laterally() const
Equivalent to calling subdomain_num_nodes_per_side_laterally( subdomain_refinement_level() )
Definition spherical_shell.hpp:852
Static assertion: VectorQ1Scalar satisfies VectorLike concept.
Definition vector_q1.hpp:168
const grid::Grid4DDataVec< ScalarType, VecDim > & grid_data() const
Get const reference to grid data.
Definition vector_q1.hpp:288
bool has_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
Checks for presence of a local matrix for a certain element.
Definition local_matrix_storage.hpp:223
dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > get_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
Retrieves the local matrix: if there are stored local matrices, the desired local matrix is loaded and r...
Definition local_matrix_storage.hpp:175
void set_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge, dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > mat) const
Set the local matrix stored in the operator.
Definition local_matrix_storage.hpp:118
Timer supporting RAII scope or manual stop.
Definition timer.hpp:342
void stop()
Stop the timer and record elapsed time.
Definition timer.hpp:364
Concept for types that can be used as Galerkin coarse-grid operators in a multigrid hierarchy....
Definition operator.hpp:81
void unpack_and_reduce_local_subdomain_boundaries(const grid::shell::DistributedDomain &domain, const GridDataType &data, SubdomainNeighborhoodSendRecvBuffer< typename GridDataType::value_type, grid::grid_data_vec_dim< GridDataType >() > &boundary_recv_buffers, CommunicationReduction reduction=CommunicationReduction::SUM)
Unpacks and reduces local subdomain boundaries.
Definition communication.hpp:672
void pack_send_and_recv_local_subdomain_boundaries(const grid::shell::DistributedDomain &domain, const GridDataType &data, SubdomainNeighborhoodSendRecvBuffer< typename GridDataType::value_type, grid::grid_data_vec_dim< GridDataType >() > &boundary_send_buffers, SubdomainNeighborhoodSendRecvBuffer< typename GridDataType::value_type, grid::grid_data_vec_dim< GridDataType >() > &boundary_recv_buffers)
Packs, sends and recvs local subdomain boundaries using two sets of buffers.
Definition communication.hpp:242
Definition epsilon_divdiv_kerngen_v01_initial.hpp:16
constexpr void quad_felippa_1x1_quad_points(dense::Vec< T, 3 >(&quad_points)[quad_felippa_1x1_num_quad_points])
Definition wedge/quadrature/quadrature.hpp:36
constexpr void quad_felippa_1x1_quad_weights(T(&quad_weights)[quad_felippa_1x1_num_quad_points])
Definition wedge/quadrature/quadrature.hpp:43
constexpr int quad_felippa_1x1_num_quad_points
Definition wedge/quadrature/quadrature.hpp:32
constexpr int num_nodes_per_wedge_surface
Definition kernel_helpers.hpp:6
constexpr dense::Mat< T, 3, 3 > symmetric_grad(const dense::Mat< T, 3, 3 > &J_inv_transposed, const dense::Vec< T, 3 > &quad_point, const int dof, const int dim)
Returns the symmetric gradient of the shape function of a dof at a quadrature point.
Definition integrands.hpp:685
void wedge_surface_physical_coords(dense::Vec< T, 3 >(&wedge_surf_phy_coords)[num_wedges_per_hex_cell][num_nodes_per_wedge_surface], const grid::Grid3DDataVec< T, 3 > &lateral_grid, const int local_subdomain_id, const int x_cell, const int y_cell)
Extracts the (unit sphere) surface vertex coords of the two wedges of a hex cell.
Definition kernel_helpers.hpp:26
void atomically_add_local_wedge_vector_coefficients(const grid::Grid4DDataVec< T, VecDim > &global_coefficients, const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int d, const dense::Vec< T, 6 > local_coefficients[2])
Performs an atomic add of the two local wedge coefficient vectors of a hex cell into the global coeff...
Definition kernel_helpers.hpp:465
constexpr int num_wedges_per_hex_cell
Definition kernel_helpers.hpp:5
void extract_local_wedge_scalar_coefficients(dense::Vec< T, 6 >(&local_coefficients)[2], const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const grid::Grid4DDataScalar< T > &global_coefficients)
Extracts the local vector coefficients for the two wedges of a hex cell from the global coefficient v...
Definition kernel_helpers.hpp:306
void extract_local_wedge_vector_coefficients(dense::Vec< T, 6 >(&local_coefficients)[2], const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int d, const grid::Grid4DDataVec< T, VecDim > &global_coefficients)
Extracts the local vector coefficients for the two wedges of a hex cell from the global coefficient v...
Definition kernel_helpers.hpp:356
constexpr int num_nodes_per_wedge
Definition kernel_helpers.hpp:7
constexpr dense::Mat< T, 3, 3 > jac_lat(const dense::Vec< T, 3 > &p1_phy, const dense::Vec< T, 3 > &p2_phy, const dense::Vec< T, 3 > &p3_phy, const T xi, const T eta)
Definition integrands.hpp:634
constexpr T shape(const int node_idx, const T xi, const T eta, const T zeta)
(Tensor-product) Shape function.
Definition integrands.hpp:146
constexpr dense::Mat< T, 3, 3 > jac(const dense::Vec< T, 3 > &p1_phy, const dense::Vec< T, 3 > &p2_phy, const dense::Vec< T, 3 > &p3_phy, const T r_1, const T r_2, const T xi, const T eta, const T zeta)
Definition integrands.hpp:657
BoundaryConditionMapping[2] BoundaryConditions
Definition shell/bit_masks.hpp:37
ShellBoundaryFlag
FlagLike that indicates boundary types for the thick spherical shell.
Definition shell/bit_masks.hpp:12
Kokkos::MDRangePolicy< Kokkos::Rank< 3 > > local_domain_md_range_policy_cells_lateral(const DistributedDomain &distributed_domain)
Definition spherical_shell.hpp:2729
BoundaryConditionFlag
FlagLike that indicates the type of boundary condition
Definition shell/bit_masks.hpp:25
Kokkos::View< dense::Mat< ScalarType, Rows, Cols > ****[NumMatrices], Layout > Grid4DDataMatrices
Definition grid_types.hpp:173
Kokkos::View< ScalarType ***[VecDim], Layout > Grid3DDataVec
Definition grid_types.hpp:42
Kokkos::View< ScalarType ****, Layout > Grid4DDataScalar
Definition grid_types.hpp:27
Kokkos::View< ScalarType **, Layout > Grid2DDataScalar
Definition grid_types.hpp:21
OperatorApplyMode
Modes for applying an operator to a vector.
Definition operator.hpp:30
@ Replace
Overwrite the destination vector.
OperatorStoredMatrixMode
Modes for applying stored matrices.
Definition operator.hpp:47
@ Selective
Use stored matrices on selected, marked elements only, assemble on all others.
@ Full
Use stored matrices on all elements.
@ Off
Do not use stored matrices.
OperatorCommunicationMode
Modes for communication during operator application.
Definition operator.hpp:40
@ CommunicateAdditively
Communicate and add results.
constexpr bool has_flag(E mask_value, E flag) noexcept
Checks if a bitmask value contains a specific flag.
Definition bit_masking.hpp:43
Definition mat.hpp:10
void fill(const T value)
Definition mat.hpp:201
constexpr Mat diagonal() const
Definition mat.hpp:377
Mat & hadamard_product(const Mat &mat)
Definition mat.hpp:213
T double_contract(const Mat &mat)
Definition mat.hpp:226
SoA (Structure-of-Arrays) 4D vector grid data.
Definition grid_types.hpp:51
auto extent(int i) const
Definition grid_types.hpp:75