66 using Team = Kokkos::TeamPolicy<>::member_type;
78 BoundaryConditions bcs_;
99 int local_subdomains_;
102 int lat_refinement_level_;
123 enum class KernelPath
126 FastDirichletNeumann,
129 KernelPath kernel_path_ = KernelPath::FastDirichletNeumann;
140 void update_kernel_path_flag_host_only()
142 const BoundaryConditionFlag cmb_bc = get_boundary_condition_flag( bcs_, CMB );
143 const BoundaryConditionFlag surface_bc = get_boundary_condition_flag( bcs_, SURFACE );
145 const bool has_freeslip = ( cmb_bc == FREESLIP ) || ( surface_bc == FREESLIP );
148 if ( has_stored_matrices )
149 kernel_path_ = KernelPath::Slow;
150 else if ( has_freeslip )
151 kernel_path_ = KernelPath::FastFreeslip;
153 kernel_path_ = KernelPath::FastDirichletNeumann;
163 BoundaryConditions bcs,
174 , diagonal_( diagonal )
175 , operator_apply_mode_( operator_apply_mode )
176 , operator_communication_mode_( operator_communication_mode )
177 , operator_stored_matrix_mode_( operator_stored_matrix_mode )
178 , recv_buffers_( domain )
179 , comm_plan_( domain )
188 local_subdomains_ = domain_.
subdomains().size();
196 r_tile_block_ = r_tile_ * r_passes_;
198 lat_tiles_ = ( hex_lat_ + lat_tile_ - 1 ) / lat_tile_;
199 r_tiles_ = ( hex_rad_ + r_tile_block_ - 1 ) / r_tile_block_;
201 team_size_ = lat_tile_ * lat_tile_ * r_tile_;
202 blocks_ = local_subdomains_ * lat_tiles_ * lat_tiles_ * r_tiles_;
204 r_min_ = domain_info.
radii()[0];
205 r_max_ = domain_info.
radii()[domain_info.
radii().size() - 1];
208 update_kernel_path_flag_host_only();
210 util::logroot <<
"[EpsilonDivDiv] tile size (x,y,r)=(" << lat_tile_ <<
"," << lat_tile_ <<
"," << r_tile_
211 <<
"), r_passes=" << r_passes_ << std::endl;
212 util::logroot <<
"[EpsilonDivDiv] number of tiles (x,y,r)=(" << lat_tiles_ <<
"," << lat_tiles_ <<
","
213 << r_tiles_ <<
"), team_size=" << team_size_ <<
", blocks=" << blocks_ << std::endl;
214 const char* path_name = ( kernel_path_ == KernelPath::Slow ) ?
"slow" :
215 ( kernel_path_ == KernelPath::FastFreeslip ) ?
"fast-freeslip" :
216 "fast-dirichlet-neumann";
217 util::logroot <<
"[EpsilonDivDiv] kernel path = " << path_name << std::endl;
224 operator_apply_mode_ = operator_apply_mode;
225 operator_communication_mode_ = operator_communication_mode;
236 update_kernel_path_flag_host_only();
244 KOKKOS_INLINE_FUNCTION
246 const int local_subdomain_id,
252 return util::has_flag( mask_( local_subdomain_id, x_cell, y_cell, r_cell ), flag );
260 operator_stored_matrix_mode_ = operator_stored_matrix_mode;
265 domain_, operator_stored_matrix_mode_, level_range, GCAElements );
269 update_kernel_path_flag_host_only();
271 util::logroot <<
"[EpsilonDivDiv] (set_stored_matrix_mode) kernel path = "
272 << (( kernel_path_ == KernelPath::Slow ) ?
"slow" :
273 ( kernel_path_ == KernelPath::FastFreeslip ) ?
"fast-freeslip" :
274 "fast-dirichlet-neumann")
280 KOKKOS_INLINE_FUNCTION
282 const int local_subdomain_id,
290 local_matrix_storage_.
set_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge, mat );
293 KOKKOS_INLINE_FUNCTION
295 const int local_subdomain_id,
299 const int wedge )
const
303 if ( !local_matrix_storage_.
has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge ) )
305 Kokkos::abort(
"No matrix found at that spatial index." );
307 return local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge );
337 util::Timer timer_kernel(
"epsilon_divdiv_kernel" );
338 Kokkos::TeamPolicy<> policy( blocks_, team_size_ );
341 if ( kernel_path_ != KernelPath::Slow )
343 policy.set_scratch_size( 0, Kokkos::PerTeam(
team_shmem_size( team_size_ ) ) );
347 if ( kernel_path_ == KernelPath::Slow )
349 Kokkos::parallel_for(
350 "epsilon_divdiv_apply_kernel_slow", policy, KOKKOS_CLASS_LAMBDA(
const Team& team ) {
351 this->run_team_slow( team );
354 else if ( kernel_path_ == KernelPath::FastFreeslip )
358 Kokkos::parallel_for(
359 "epsilon_divdiv_apply_kernel_fast_fs_diag",
361 KOKKOS_CLASS_LAMBDA(
const Team& team ) {
362 this->
template run_team_fast_freeslip< true >( team );
367 Kokkos::parallel_for(
368 "epsilon_divdiv_apply_kernel_fast_fs_matvec",
370 KOKKOS_CLASS_LAMBDA(
const Team& team ) {
371 this->
template run_team_fast_freeslip< false >( team );
377 Kokkos::TeamPolicy< Kokkos::LaunchBounds< 128, 5 > > dn_policy( blocks_, team_size_ );
381 Kokkos::parallel_for(
382 "epsilon_divdiv_apply_kernel_fast_dn_diag",
384 KOKKOS_CLASS_LAMBDA(
const Team& team ) {
385 this->
template run_team_fast_dirichlet_neumann< true >( team );
390 Kokkos::parallel_for(
391 "epsilon_divdiv_apply_kernel_fast_dn_matvec",
393 KOKKOS_CLASS_LAMBDA(
const Team& team ) {
394 this->
template run_team_fast_dirichlet_neumann< false >( team );
419 KOKKOS_INLINE_FUNCTION
433 E00 = E11 = E22 = sym01 = sym02 = sym12 = gdd = 0.0;
464 KOKKOS_INLINE_FUNCTION
467 const int nlev = r_tile_block_ + 1;
468 const int n = lat_tile_ + 1;
469 const int nxy = n * n;
472 const size_t nscalars =
473 size_t( nxy ) * 3 + size_t( nxy ) * 3 + size_t( nxy ) * 3 * nlev + size_t( nxy ) * nlev + size_t( nlev ) + 1;
478 KOKKOS_INLINE_FUNCTION
481 const int nlev = r_tile_block_ + 1;
482 const int n = lat_tile_ + 1;
483 const int nxy = n * n;
486 const size_t nscalars =
487 size_t( nxy ) * 3 + size_t( nxy ) * 3 * nlev + size_t( nxy ) * nlev + size_t( nlev );
499 KOKKOS_INLINE_FUNCTION
500 void decode_team_indices(
502 int& local_subdomain_id,
513 int tmp = team.league_rank();
515 const int r_tile_id = tmp % r_tiles_;
518 const int lat_y_id = tmp % lat_tiles_;
521 const int lat_x_id = tmp % lat_tiles_;
524 local_subdomain_id = tmp;
526 x0 = lat_x_id * lat_tile_;
527 y0 = lat_y_id * lat_tile_;
528 r0 = r_tile_id * r_tile_block_;
530 const int tid = team.team_rank();
532 tx = ( tid / r_tile_ ) % lat_tile_;
533 ty = tid / ( r_tile_ * lat_tile_ );
555 KOKKOS_INLINE_FUNCTION
556 void run_team_slow(
const Team& team )
const
558 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
559 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
564 for (
int pass = 0; pass < r_passes_; ++pass )
566 const int r_cell_pass = r0 + pass * r_tile_ + tr;
567 if ( r_cell_pass >= hex_rad_ )
570 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell_pass, CMB );
571 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell_pass + 1, SURFACE );
574 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell_pass, at_cmb, at_surface );
584 template <
bool Diagonal >
585 KOKKOS_INLINE_FUNCTION
586 void run_team_fast_dirichlet_neumann(
const Team& team )
const
588 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
589 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
594 operator_fast_dirichlet_neumann_path< Diagonal >(
595 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
604 template <
bool Diagonal >
605 KOKKOS_INLINE_FUNCTION
606 void run_team_fast_freeslip(
const Team& team )
const
608 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
609 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
614 operator_fast_freeslip_path< Diagonal >(
615 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
619 KOKKOS_INLINE_FUNCTION
620 void operator_slow_path(
622 const int local_subdomain_id,
633 const bool at_surface )
const
643 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ || r_cell >= hex_rad_ )
650 A[0] = local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
651 A[1] = local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
655 if ( local_matrix_storage_.
has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 ) &&
656 local_matrix_storage_.
has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 ) )
658 A[0] = local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
659 A[1] = local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
674 for (
int dimj = 0; dimj < 3; dimj++ )
688 dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > boundary_mask;
689 boundary_mask.fill( 1.0 );
691 bool freeslip_reorder =
false;
694 if ( at_cmb || at_surface )
699 if ( bcf == DIRICHLET )
701 for (
int dimi = 0; dimi < 3; ++dimi )
703 for (
int dimj = 0; dimj < 3; ++dimj )
709 if ( ( at_cmb && ( ( dimi == dimj && i != j && ( i < 3 || j < 3 ) ) ||
710 ( dimi != dimj && ( i < 3 || j < 3 ) ) ) ) ||
711 ( at_surface && ( ( dimi == dimj && i != j && ( i >= 3 || j >= 3 ) ) ||
712 ( dimi != dimj && ( i >= 3 || j >= 3 ) ) ) ) )
722 else if ( bcf == FREESLIP )
724 freeslip_reorder =
true;
727 for (
int wedge = 0; wedge < 2; ++wedge )
729 for (
int dimi = 0; dimi < 3; ++dimi )
733 for (
int dimj = 0; dimj < 3; ++dimj )
737 A_tmp[wedge]( node_idxi * 3 + dimi, node_idxj * 3 + dimj ) = A[wedge](
747 constexpr int layer_hex_offset_x[2][3] = { { 0, 1, 0 }, { 1, 0, 1 } };
748 constexpr int layer_hex_offset_y[2][3] = { { 0, 0, 1 }, { 1, 1, 0 } };
750 for (
int wedge = 0; wedge < 2; ++wedge )
754 R[wedge]( i, i ) = 1.0;
757 for (
int boundary_node_idx = 0; boundary_node_idx < 3; boundary_node_idx++ )
761 x_cell + layer_hex_offset_x[wedge][boundary_node_idx],
762 y_cell + layer_hex_offset_y[wedge][boundary_node_idx],
763 r_cell + ( at_cmb ? 0 : 1 ),
769 const int offset_in_R = at_cmb ? 0 : 9;
770 for (
int dimi = 0; dimi < 3; ++dimi )
772 for (
int dimj = 0; dimj < 3; ++dimj )
775 offset_in_R + boundary_node_idx * 3 + dimi,
776 offset_in_R + boundary_node_idx * 3 + dimj ) = R_i( dimi, dimj );
781 A[wedge] = R[wedge] * A_tmp[wedge] * R[wedge].
transposed();
783 auto src_tmp = R[wedge] * src[wedge];
784 for (
int i = 0; i < 18; ++i )
786 src[wedge]( i ) = src_tmp( i );
789 const int node_start = at_surface ? 3 : 0;
790 const int node_end = at_surface ? 6 : 3;
792 for (
int node_idx = node_start; node_idx < node_end; node_idx++ )
794 const int idx = node_idx * 3;
795 for (
int k = 0; k < 18; ++k )
799 boundary_mask( idx, k ) = 0.0;
800 boundary_mask( k, idx ) = 0.0;
820 dst[0] = A[0] * src[0];
821 dst[1] = A[1] * src[1];
823 if ( freeslip_reorder )
827 dst_tmp[1] = R[1].transposed() * dst[1];
829 for (
int i = 0; i < 18; ++i )
831 dst[0]( i ) = dst_tmp[0]( i );
832 dst[1]( i ) = dst_tmp[1]( i );
839 for (
int dimi = 0; dimi < 3; dimi++ )
846 dst_, local_subdomain_id, x_cell, y_cell, r_cell, dimi, dst_d );
851 template <
bool Diagonal >
852 KOKKOS_INLINE_FUNCTION
853 void operator_fast_dirichlet_neumann_path(
855 const int local_subdomain_id,
863 const int y_cell )
const
865 const int nlev = r_tile_block_ + 1;
866 const int nxy = ( lat_tile_ + 1 ) * ( lat_tile_ + 1 );
869 reinterpret_cast< double*
>( team.team_shmem().get_shmem(
team_shmem_size_dn( team.team_size() ) ) );
871 using ScratchCoords =
872 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
873 using ScratchSrc = Kokkos::
874 View< double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
876 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
878 ScratchCoords coords_sh( shmem, nxy, 3 );
881 ScratchSrc src_sh( shmem, nxy, 3, nlev );
882 shmem += nxy * 3 * nlev;
884 ScratchK k_sh( shmem, nxy, nlev );
888 Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >(
891 auto node_id = [&](
int nx,
int ny ) ->
int {
return nx + ( lat_tile_ + 1 ) * ny; };
893 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&](
int n ) {
894 const int dxn = n % ( lat_tile_ + 1 );
895 const int dyn = n / ( lat_tile_ + 1 );
896 const int xi = x0 + dxn;
897 const int yi = y0 + dyn;
899 if ( xi <= hex_lat_ && yi <= hex_lat_ )
901 coords_sh( n, 0 ) = grid_( local_subdomain_id, xi, yi, 0 );
902 coords_sh( n, 1 ) = grid_( local_subdomain_id, xi, yi, 1 );
903 coords_sh( n, 2 ) = grid_( local_subdomain_id, xi, yi, 2 );
907 coords_sh( n, 0 ) = coords_sh( n, 1 ) = coords_sh( n, 2 ) = 0.0;
911 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&](
int lvl ) {
912 const int rr = r0 + lvl;
913 r_sh( lvl ) = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
916 const int total_pairs = nxy * nlev;
917 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total_pairs ), [&](
int t ) {
918 const int node = t / nlev;
919 const int lvl = t - node * nlev;
921 const int dxn = node % ( lat_tile_ + 1 );
922 const int dyn = node / ( lat_tile_ + 1 );
924 const int xi = x0 + dxn;
925 const int yi = y0 + dyn;
926 const int rr = r0 + lvl;
928 if ( xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_ )
930 k_sh( node, lvl ) = k_( local_subdomain_id, xi, yi, rr );
931 src_sh( node, 0, lvl ) = src_( local_subdomain_id, xi, yi, rr, 0 );
932 src_sh( node, 1, lvl ) = src_( local_subdomain_id, xi, yi, rr, 1 );
933 src_sh( node, 2, lvl ) = src_( local_subdomain_id, xi, yi, rr, 2 );
937 k_sh( node, lvl ) = 0.0;
938 src_sh( node, 0, lvl ) = src_sh( node, 1, lvl ) = src_sh( node, 2, lvl ) = 0.0;
944 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ )
947 constexpr double ONE_THIRD = 1.0 / 3.0;
948 constexpr double ONE_SIXTH = 1.0 / 6.0;
950 static constexpr double dN_ref[6][3] = {
951 { -0.5, -0.5, -ONE_SIXTH },
952 { 0.5, 0.0, -ONE_SIXTH },
953 { 0.0, 0.5, -ONE_SIXTH },
954 { -0.5, -0.5, ONE_SIXTH },
955 { 0.5, 0.0, ONE_SIXTH },
956 { 0.0, 0.5, ONE_SIXTH } };
958 static constexpr int WEDGE_NODE_OFF[2][6][3] = {
959 { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
960 { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };
962 const int n00 = node_id( tx, ty );
963 const int n01 = node_id( tx, ty + 1 );
964 const int n10 = node_id( tx + 1, ty );
965 const int n11 = node_id( tx + 1, ty + 1 );
967 for (
int pass = 0; pass < r_passes_; ++pass )
969 const int lvl0 = pass * r_tile_ + tr;
970 const int r_cell = r0 + lvl0;
972 if ( r_cell >= hex_rad_ )
975 const double r_0 = r_sh( lvl0 );
976 const double r_1 = r_sh( lvl0 + 1 );
978 const bool at_cmb =
has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
979 const bool at_surface =
has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
981 const bool at_boundary = at_cmb || at_surface;
982 bool treat_boundary_dirichlet =
false;
989 const int cmb_shift = ( ( at_boundary && treat_boundary_dirichlet && ( !Diagonal ) && at_cmb ) ? 3 : 0 );
991 ( ( at_boundary && treat_boundary_dirichlet && ( !Diagonal ) && at_surface ) ? 3 : 0 );
993 for (
int w = 0; w < 2; ++w )
995 const int v0 = w == 0 ? n00 : n11;
996 const int v1 = w == 0 ? n10 : n01;
997 const int v2 = w == 0 ? n01 : n10;
1001 for (
int node = 0; node < 6; ++node )
1003 const int nid = node_id( tx + WEDGE_NODE_OFF[w][node][0], ty + WEDGE_NODE_OFF[w][node][1] );
1004 k_sum += k_sh( nid, lvl0 + WEDGE_NODE_OFF[w][node][2] );
1006 const double k_eval = ONE_SIXTH * k_sum;
1013 double gu10 = 0.0, gu11 = 0.0;
1014 double gu20 = 0.0, gu21 = 0.0, gu22 = 0.0;
1017 const double half_dr = 0.5 * ( r_1 - r_0 );
1018 const double r_mid = 0.5 * ( r_0 + r_1 );
1020 const double J_0_0 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v1, 0 ) );
1021 const double J_0_1 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v2, 0 ) );
1022 const double J_0_2 =
1023 half_dr * ( ONE_THIRD * ( coords_sh( v0, 0 ) + coords_sh( v1, 0 ) + coords_sh( v2, 0 ) ) );
1025 const double J_1_0 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v1, 1 ) );
1026 const double J_1_1 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v2, 1 ) );
1027 const double J_1_2 =
1028 half_dr * ( ONE_THIRD * ( coords_sh( v0, 1 ) + coords_sh( v1, 1 ) + coords_sh( v2, 1 ) ) );
1030 const double J_2_0 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v1, 2 ) );
1031 const double J_2_1 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v2, 2 ) );
1032 const double J_2_2 =
1033 half_dr * ( ONE_THIRD * ( coords_sh( v0, 2 ) + coords_sh( v1, 2 ) + coords_sh( v2, 2 ) ) );
1035 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 -
1036 J_0_1 * J_1_0 * J_2_2 + J_0_1 * J_1_2 * J_2_0 +
1037 J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
1039 kwJ =
k_eval * Kokkos::abs( J_det );
1041 const double inv_det = 1.0 /
J_det;
1043 const double i00 = inv_det * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
1044 const double i01 = inv_det * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
1045 const double i02 = inv_det * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
1046 const double i10 = inv_det * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
1047 const double i11 = inv_det * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
1048 const double i12 = inv_det * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
1049 const double i20 = inv_det * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
1050 const double i21 = inv_det * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
1051 const double i22 = inv_det * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
1058 const double gx = dN_ref[n][0];
1059 const double gy = dN_ref[n][1];
1060 const double gz = dN_ref[n][2];
1061 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1062 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1063 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1065 const int ddx = WEDGE_NODE_OFF[w][n][0];
1066 const int ddy = WEDGE_NODE_OFF[w][n][1];
1067 const int ddr = WEDGE_NODE_OFF[w][n][2];
1068 const int nid = node_id( tx + ddx, ty + ddy );
1069 const int lvl = lvl0 + ddr;
1071 const double s0 = src_sh( nid, 0, lvl );
1072 const double s1 = src_sh( nid, 1, lvl );
1073 const double s2 = src_sh( nid, 2, lvl );
1078 gu10 += 0.5 * (
g1 * s0 + g0 * s1 );
1079 gu20 += 0.5 * (
g2 * s0 + g0 * s2 );
1080 gu21 += 0.5 * (
g2 * s1 +
g1 * s2 );
1089 const double half_dr = 0.5 * ( r_1 - r_0 );
1090 const double r_mid = 0.5 * ( r_0 + r_1 );
1092 const double J_0_0 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v1, 0 ) );
1093 const double J_0_1 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v2, 0 ) );
1094 const double J_0_2 =
1095 half_dr * ( ONE_THIRD * ( coords_sh( v0, 0 ) + coords_sh( v1, 0 ) + coords_sh( v2, 0 ) ) );
1097 const double J_1_0 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v1, 1 ) );
1098 const double J_1_1 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v2, 1 ) );
1099 const double J_1_2 =
1100 half_dr * ( ONE_THIRD * ( coords_sh( v0, 1 ) + coords_sh( v1, 1 ) + coords_sh( v2, 1 ) ) );
1102 const double J_2_0 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v1, 2 ) );
1103 const double J_2_1 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v2, 2 ) );
1104 const double J_2_2 =
1105 half_dr * ( ONE_THIRD * ( coords_sh( v0, 2 ) + coords_sh( v1, 2 ) + coords_sh( v2, 2 ) ) );
1107 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 -
1108 J_0_1 * J_1_0 * J_2_2 + J_0_1 * J_1_2 * J_2_0 +
1109 J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
1111 const double inv_det = 1.0 /
J_det;
1113 const double i00 = inv_det * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
1114 const double i01 = inv_det * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
1115 const double i02 = inv_det * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
1116 const double i10 = inv_det * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
1117 const double i11 = inv_det * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
1118 const double i12 = inv_det * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
1119 const double i20 = inv_det * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
1120 const double i21 = inv_det * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
1121 const double i22 = inv_det * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
1125 constexpr double NEG_TWO_THIRDS = -0.66666666666666663;
1129 const double gx = dN_ref[n][0];
1130 const double gy = dN_ref[n][1];
1131 const double gz = dN_ref[n][2];
1132 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1133 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1134 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1136 const int ddx = WEDGE_NODE_OFF[w][n][0];
1137 const int ddy = WEDGE_NODE_OFF[w][n][1];
1138 const int ddr = WEDGE_NODE_OFF[w][n][2];
1140 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 0 ),
1141 kwJ * ( 2.0 * ( g0 * gu00 + g1 * gu10 + g2 * gu20 ) +
1142 NEG_TWO_THIRDS * g0 * div_u ) );
1144 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 1 ),
1145 kwJ * ( 2.0 * ( g0 * gu10 + g1 * gu11 + g2 * gu21 ) +
1146 NEG_TWO_THIRDS * g1 * div_u ) );
1148 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 2 ),
1149 kwJ * ( 2.0 * ( g0 * gu20 + g1 * gu21 + g2 * gu22 ) +
1150 NEG_TWO_THIRDS * g2 * div_u ) );
1154 if ( Diagonal || ( treat_boundary_dirichlet && at_boundary ) )
1157 for (
int n = surface_shift; n < 6 -
cmb_shift; ++n )
1159 const double gx = dN_ref[n][0];
1160 const double gy = dN_ref[n][1];
1161 const double gz = dN_ref[n][2];
1162 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1163 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1164 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1165 const double gg = g0 * g0 +
g1 *
g1 +
g2 *
g2;
1167 const int nid = node_id( tx + WEDGE_NODE_OFF[w][n][0], ty + WEDGE_NODE_OFF[w][n][1] );
1168 const int lvl = lvl0 + WEDGE_NODE_OFF[w][n][2];
1170 const double sv0 = src_sh( nid, 0, lvl );
1171 const double sv1 = src_sh( nid, 1, lvl );
1172 const double sv2 = src_sh( nid, 2, lvl );
1174 const int ddx = WEDGE_NODE_OFF[w][n][0];
1175 const int ddy = WEDGE_NODE_OFF[w][n][1];
1176 const int ddr = WEDGE_NODE_OFF[w][n][2];
1178 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 0 ),
1179 kwJ * sv0 * ( gg + ONE_THIRD * g0 * g0 ) );
1181 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 1 ),
1182 kwJ * sv1 * ( gg + ONE_THIRD * g1 * g1 ) );
1184 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 2 ),
1185 kwJ * sv2 * ( gg + ONE_THIRD * g2 * g2 ) );
1195 KOKKOS_INLINE_FUNCTION
1196 void normalize3(
double& x,
double& y,
double& z )
const
1198 const double n2 = x * x + y * y + z * z;
1201 const double invn = 1.0 / Kokkos::sqrt( n2 );
1208 KOKKOS_INLINE_FUNCTION
1209 void project_tangential_inplace(
1217 const double dot = nx * ux + ny * uy + nz * uz;
1223 template <
bool Diagonal >
1224 KOKKOS_INLINE_FUNCTION
1225 void operator_fast_freeslip_path(
1227 const int local_subdomain_id,
1235 const int y_cell )
const
1237 const int nlev = r_tile_block_ + 1;
1238 const int nxy = ( lat_tile_ + 1 ) * ( lat_tile_ + 1 );
1241 reinterpret_cast< double*
>( team.team_shmem().get_shmem(
team_shmem_size( team.team_size() ) ) );
1243 using ScratchCoords =
1244 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1245 using ScratchSrc = Kokkos::
1246 View< double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1248 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1250 ScratchCoords coords_sh( shmem, nxy, 3 );
1254 ScratchCoords normals_sh( shmem, nxy, 3 );
1257 ScratchSrc src_sh( shmem, nxy, 3, nlev );
1258 shmem += nxy * 3 * nlev;
1260 ScratchK k_sh( shmem, nxy, nlev );
1261 shmem += nxy * nlev;
1264 Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >(
1267 auto node_id = [&](
int nx,
int ny ) ->
int {
return nx + ( lat_tile_ + 1 ) * ny; };
1270 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&](
int n ) {
1271 const int dxn = n % ( lat_tile_ + 1 );
1272 const int dyn = n / ( lat_tile_ + 1 );
1273 const int xi = x0 + dxn;
1274 const int yi = y0 + dyn;
1276 if ( xi <= hex_lat_ && yi <= hex_lat_ )
1278 const double cx = grid_( local_subdomain_id, xi, yi, 0 );
1279 const double cy = grid_( local_subdomain_id, xi, yi, 1 );
1280 const double cz = grid_( local_subdomain_id, xi, yi, 2 );
1281 coords_sh( n, 0 ) = cx;
1282 coords_sh( n, 1 ) = cy;
1283 coords_sh( n, 2 ) = cz;
1285 const double n2 = cx * cx + cy * cy + cz * cz;
1288 const double invn = 1.0 / Kokkos::sqrt( n2 );
1289 normals_sh( n, 0 ) = cx * invn;
1290 normals_sh( n, 1 ) = cy * invn;
1291 normals_sh( n, 2 ) = cz * invn;
1295 normals_sh( n, 0 ) = normals_sh( n, 1 ) = normals_sh( n, 2 ) = 0.0;
1300 coords_sh( n, 0 ) = coords_sh( n, 1 ) = coords_sh( n, 2 ) = 0.0;
1301 normals_sh( n, 0 ) = normals_sh( n, 1 ) = normals_sh( n, 2 ) = 0.0;
1305 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&](
int lvl ) {
1306 const int rr = r0 + lvl;
1307 r_sh( lvl ) = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
1310 const int total_pairs = nxy * nlev;
1311 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total_pairs ), [&](
int t ) {
1312 const int node = t / nlev;
1313 const int lvl = t - node * nlev;
1315 const int dxn = node % ( lat_tile_ + 1 );
1316 const int dyn = node / ( lat_tile_ + 1 );
1318 const int xi = x0 + dxn;
1319 const int yi = y0 + dyn;
1320 const int rr = r0 + lvl;
1322 if ( xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_ )
1324 k_sh( node, lvl ) = k_( local_subdomain_id, xi, yi, rr );
1325 src_sh( node, 0, lvl ) = src_( local_subdomain_id, xi, yi, rr, 0 );
1326 src_sh( node, 1, lvl ) = src_( local_subdomain_id, xi, yi, rr, 1 );
1327 src_sh( node, 2, lvl ) = src_( local_subdomain_id, xi, yi, rr, 2 );
1331 k_sh( node, lvl ) = 0.0;
1332 src_sh( node, 0, lvl ) = src_sh( node, 1, lvl ) = src_sh( node, 2, lvl ) = 0.0;
1336 team.team_barrier();
1338 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ )
1341 for (
int pass = 0; pass < r_passes_; ++pass )
1343 const int lvl0 = pass * r_tile_ + tr;
1344 const int r_cell = r0 + lvl0;
1346 if ( r_cell >= hex_rad_ )
1349 const double r_0 = r_sh( lvl0 );
1350 const double r_1 = r_sh( lvl0 + 1 );
1352 const bool at_cmb =
has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
1353 const bool at_surface =
has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
1358 const bool cmb_freeslip = at_cmb && ( cmb_bc ==
FREESLIP );
1359 const bool surf_freeslip = at_surface && ( surface_bc ==
FREESLIP );
1360 const bool cmb_dirichlet = at_cmb && ( cmb_bc ==
DIRICHLET );
1361 const bool surface_dirichlet = at_surface && ( surface_bc ==
DIRICHLET );
1363 const int cmb_shift = ( ( cmb_dirichlet && ( !Diagonal ) ) ? 3 : 0 );
1364 const int surface_shift = ( ( surface_dirichlet && ( !Diagonal ) ) ? 3 : 0 );
1366 static constexpr int WEDGE_NODE_OFF[2][6][3] = {
1367 { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
1368 { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };
1370 static constexpr int WEDGE_TO_UNIQUE[2][6] = {
1371 { 0, 1, 2, 3, 4, 5 },
1372 { 6, 2, 1, 7, 5, 4 }
1375 constexpr double ONE_THIRD = 1.0 / 3.0;
1376 constexpr double ONE_SIXTH = 1.0 / 6.0;
1377 constexpr double NEG_TWO_THIRDS = -0.66666666666666663;
1379 static constexpr double dN_ref[6][3] = {
1380 { -0.5, -0.5, -ONE_SIXTH },
1381 { 0.5, 0.0, -ONE_SIXTH },
1382 { 0.0, 0.5, -ONE_SIXTH },
1383 { -0.5, -0.5, ONE_SIXTH },
1384 { 0.5, 0.0, ONE_SIXTH },
1385 { 0.0, 0.5, ONE_SIXTH } };
1387 const int n00 = node_id( tx, ty );
1388 const int n01 = node_id( tx, ty + 1 );
1389 const int n10 = node_id( tx + 1, ty );
1390 const int n11 = node_id( tx + 1, ty + 1 );
1393 const int corner_node[4] = { n00, n10, n01, n11 };
1395 static constexpr int CMB_CORNER_TO_UNIQUE[4] = { 0, 1, 2, 6 };
1396 static constexpr int SURF_CORNER_TO_UNIQUE[4] = { 3, 4, 5, 7 };
1398 double dst8[3][8] = { { 0.0 } };
1402 double Ann_acc_cmb[4] = {};
1403 double Ann_acc_surf[4] = {};
1405 for (
int w = 0; w < 2; ++w )
1407 const int v0 = w == 0 ? n00 : n11;
1408 const int v1 = w == 0 ? n10 : n01;
1409 const int v2 = w == 0 ? n01 : n10;
1413 for (
int node = 0; node < 6; ++node )
1415 const int ddx = WEDGE_NODE_OFF[w][node][0];
1416 const int ddy = WEDGE_NODE_OFF[w][node][1];
1417 const int ddr = WEDGE_NODE_OFF[w][node][2];
1419 const int nid = node_id( tx + ddx, ty + ddy );
1420 const int lvl = lvl0 + ddr;
1421 k_sum += k_sh( nid, lvl );
1423 const double k_eval = ONE_SIXTH * k_sum;
1426 double i00, i01, i02;
1427 double i10, i11, i12;
1428 double i20, i21, i22;
1431 const double half_dr = 0.5 * ( r_1 - r_0 );
1432 const double r_mid = 0.5 * ( r_0 + r_1 );
1434 const double J_0_0 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v1, 0 ) );
1435 const double J_0_1 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v2, 0 ) );
1436 const double J_0_2 = half_dr * ( ONE_THIRD * ( coords_sh( v0, 0 ) + coords_sh( v1, 0 ) + coords_sh( v2, 0 ) ) );
1438 const double J_1_0 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v1, 1 ) );
1439 const double J_1_1 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v2, 1 ) );
1440 const double J_1_2 = half_dr * ( ONE_THIRD * ( coords_sh( v0, 1 ) + coords_sh( v1, 1 ) + coords_sh( v2, 1 ) ) );
1442 const double J_2_0 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v1, 2 ) );
1443 const double J_2_1 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v2, 2 ) );
1444 const double J_2_2 = half_dr * ( ONE_THIRD * ( coords_sh( v0, 2 ) + coords_sh( v1, 2 ) + coords_sh( v2, 2 ) ) );
1446 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 - J_0_1 * J_1_0 * J_2_2 +
1447 J_0_1 * J_1_2 * J_2_0 + J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
1449 const double invJ = 1.0 /
J_det;
1451 i00 = invJ * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
1452 i01 = invJ * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
1453 i02 = invJ * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
1455 i10 = invJ * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
1456 i11 = invJ * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
1457 i12 = invJ * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
1459 i20 = invJ * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
1460 i21 = invJ * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
1461 i22 = invJ * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
1463 wJ = Kokkos::abs( J_det );
1466 const double kwJ =
k_eval * wJ;
1469 static constexpr int CMB_NODE_TO_CORNER[2][3] = { { 0, 1, 2 }, { 3, 2, 1 } };
1472 double gu10 = 0.0, gu11 = 0.0;
1473 double gu20 = 0.0, gu21 = 0.0, gu22 = 0.0;
1483 const double gx = dN_ref[n][0];
1484 const double gy = dN_ref[n][1];
1485 const double gz = dN_ref[n][2];
1486 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1487 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1488 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1490 const int nid = node_id( tx + WEDGE_NODE_OFF[w][n][0], ty + WEDGE_NODE_OFF[w][n][1] );
1491 const int lvl = lvl0 + WEDGE_NODE_OFF[w][n][2];
1493 double s0 = src_sh( nid, 0, lvl );
1494 double s1 = src_sh( nid, 1, lvl );
1495 double s2 = src_sh( nid, 2, lvl );
1498 if ( cmb_freeslip && n < 3 )
1500 const double nx = normals_sh( nid, 0 );
1501 const double ny = normals_sh( nid, 1 );
1502 const double nz = normals_sh( nid, 2 );
1503 const double dot = nx * s0 + ny * s1 + nz * s2;
1508 if ( surf_freeslip && n >= 3 )
1510 const double nx = normals_sh( nid, 0 );
1511 const double ny = normals_sh( nid, 1 );
1512 const double nz = normals_sh( nid, 2 );
1513 const double dot = nx * s0 + ny * s1 + nz * s2;
1522 gu10 += 0.5 * (
g1 * s0 + g0 * s1 );
1523 gu20 += 0.5 * (
g2 * s0 + g0 * s2 );
1524 gu21 += 0.5 * (
g2 * s1 +
g1 * s2 );
1533 const double gx = dN_ref[n][0];
1534 const double gy = dN_ref[n][1];
1535 const double gz = dN_ref[n][2];
1536 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1537 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1538 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1540 const int uid = WEDGE_TO_UNIQUE[w][n];
1542 kwJ * ( 2.0 * ( g0 * gu00 +
g1 * gu10 +
g2 * gu20 ) + NEG_TWO_THIRDS * g0 * div_u );
1544 kwJ * ( 2.0 * ( g0 * gu10 +
g1 * gu11 +
g2 * gu21 ) + NEG_TWO_THIRDS * g1 * div_u );
1546 kwJ * ( 2.0 * ( g0 * gu20 +
g1 * gu21 +
g2 * gu22 ) + NEG_TWO_THIRDS * g2 * div_u );
1549 if ( cmb_freeslip && n < 3 )
1551 const int corner = CMB_NODE_TO_CORNER[w][n];
1552 const int cn = corner_node[corner];
1553 const double nxu = normals_sh( cn, 0 );
1554 const double nyu = normals_sh( cn, 1 );
1555 const double nzu = normals_sh( cn, 2 );
1556 const double gg = g0 * g0 +
g1 *
g1 +
g2 *
g2;
1557 const double ng = nxu * g0 + nyu *
g1 + nzu *
g2;
1558 Ann_acc_cmb[corner] += kwJ * ( gg + ONE_THIRD * ng * ng );
1560 if ( surf_freeslip && n >= 3 )
1562 const int corner = CMB_NODE_TO_CORNER[w][n - 3];
1563 const int cn = corner_node[corner];
1564 const double nxu = normals_sh( cn, 0 );
1565 const double nyu = normals_sh( cn, 1 );
1566 const double nzu = normals_sh( cn, 2 );
1567 const double gg = g0 * g0 +
g1 *
g1 +
g2 *
g2;
1568 const double ng = nxu * g0 + nyu *
g1 + nzu *
g2;
1569 Ann_acc_surf[corner] += kwJ * ( gg + ONE_THIRD * ng * ng );
1575 if ( Diagonal || cmb_dirichlet || surface_dirichlet )
1579 for (
int n = surface_shift; n < 6 -
cmb_shift; ++n )
1581 if ( Diagonal && cmb_freeslip && n < 3 )
1583 if ( Diagonal && surf_freeslip && n >= 3 )
1586 const double gx = dN_ref[n][0];
1587 const double gy = dN_ref[n][1];
1588 const double gz = dN_ref[n][2];
1589 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1590 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1591 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1592 const double gg = g0 * g0 +
g1 *
g1 +
g2 *
g2;
1594 const int uid = WEDGE_TO_UNIQUE[w][n];
1595 const int nid = node_id( tx + WEDGE_NODE_OFF[w][n][0], ty + WEDGE_NODE_OFF[w][n][1] );
1596 const int lvl = lvl0 + WEDGE_NODE_OFF[w][n][2];
1597 const double s0 = src_sh( nid, 0, lvl );
1598 const double s1 = src_sh( nid, 1, lvl );
1599 const double s2 = src_sh( nid, 2, lvl );
1601 dst8[0][uid] += kwJ * s0 * ( gg + ONE_THIRD * g0 * g0 );
1602 dst8[1][uid] += kwJ * s1 * ( gg + ONE_THIRD *
g1 *
g1 );
1603 dst8[2][uid] += kwJ * s2 * ( gg + ONE_THIRD *
g2 *
g2 );
1610 static constexpr int FS_CORNER_MAP[2][3] = { { 0, 1, 2 }, { 3, 2, 1 } };
1612 auto apply_rotated_diag =
1613 [&](
const int ni,
const int node_idx,
const int src_lvl ) {
1614 const int corner = FS_CORNER_MAP[w][ni];
1615 const int cn = corner_node[corner];
1616 const double nxu = normals_sh( cn, 0 );
1617 const double nyu = normals_sh( cn, 1 );
1618 const double nzu = normals_sh( cn, 2 );
1619 const int u = WEDGE_TO_UNIQUE[w][node_idx];
1621 const double gx = dN_ref[node_idx][0];
1622 const double gy = dN_ref[node_idx][1];
1623 const double gz = dN_ref[node_idx][2];
1625 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1626 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1627 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1628 const double gg_loc = g0 * g0 +
g1 *
g1 +
g2 *
g2;
1630 dense::Vec< double, 3 > n_vec;
1634 const auto R_rot = trafo_mat_cartesian_to_normal_tangential< double >( n_vec );
1638 const double s0 = src_sh( cn, 0, src_lvl );
1639 const double s1 = src_sh( cn, 1, src_lvl );
1640 const double s2 = src_sh( cn, 2, src_lvl );
1642 const double Rg0 = R_rot( 0, 0 ) * g0 + R_rot( 0, 1 ) *
g1 + R_rot( 0, 2 ) *
g2;
1643 const double Rg1 = R_rot( 1, 0 ) * g0 + R_rot( 1, 1 ) *
g1 + R_rot( 1, 2 ) *
g2;
1644 const double Rg2 = R_rot( 2, 0 ) * g0 + R_rot( 2, 1 ) *
g1 + R_rot( 2, 2 ) *
g2;
1645 const double Rs0 = R_rot( 0, 0 ) * s0 + R_rot( 0, 1 ) * s1 + R_rot( 0, 2 ) * s2;
1646 const double Rs1 = R_rot( 1, 0 ) * s0 + R_rot( 1, 1 ) * s1 + R_rot( 1, 2 ) * s2;
1647 const double Rs2 = R_rot( 2, 0 ) * s0 + R_rot( 2, 1 ) * s1 + R_rot( 2, 2 ) * s2;
1649 const double v0 = kwJ * ( gg_loc + ONE_THIRD * Rg0 * Rg0 ) * Rs0;
1650 const double v1 = kwJ * ( gg_loc + ONE_THIRD * Rg1 * Rg1 ) * Rs1;
1651 const double v2 = kwJ * ( gg_loc + ONE_THIRD * Rg2 * Rg2 ) * Rs2;
1653 dst8[0][u] += R_rot( 0, 0 ) * v0 + R_rot( 1, 0 ) * v1 + R_rot( 2, 0 ) * v2;
1654 dst8[1][u] += R_rot( 0, 1 ) * v0 + R_rot( 1, 1 ) * v1 + R_rot( 2, 1 ) * v2;
1655 dst8[2][u] += R_rot( 0, 2 ) * v0 + R_rot( 1, 2 ) * v1 + R_rot( 2, 2 ) * v2;
1660 for (
int ni = 0; ni < 3; ++ni )
1661 apply_rotated_diag( ni, ni, lvl0 );
1663 if ( surf_freeslip )
1665 for (
int ni = 0; ni < 3; ++ni )
1666 apply_rotated_diag( ni, ni + 3, lvl0 + 1 );
1673 if ( !Diagonal && cmb_freeslip )
1675 for (
int c = 0; c < 4; ++c )
1677 const double nx = normals_sh( corner_node[c], 0 );
1678 const double ny = normals_sh( corner_node[c], 1 );
1679 const double nz = normals_sh( corner_node[c], 2 );
1680 const int u = CMB_CORNER_TO_UNIQUE[c];
1681 const double dot = nx * dst8[0][u] + ny * dst8[1][u] + nz * dst8[2][u];
1682 dst8[0][u] -=
dot * nx;
1683 dst8[1][u] -=
dot * ny;
1684 dst8[2][u] -=
dot * nz;
1687 if ( !Diagonal && surf_freeslip )
1689 for (
int c = 0; c < 4; ++c )
1691 const double nx = normals_sh( corner_node[c], 0 );
1692 const double ny = normals_sh( corner_node[c], 1 );
1693 const double nz = normals_sh( corner_node[c], 2 );
1694 const int u = SURF_CORNER_TO_UNIQUE[c];
1695 const double dot = nx * dst8[0][u] + ny * dst8[1][u] + nz * dst8[2][u];
1696 dst8[0][u] -=
dot * nx;
1697 dst8[1][u] -=
dot * ny;
1698 dst8[2][u] -=
dot * nz;
1704 if ( !Diagonal && cmb_freeslip )
1706 for (
int c = 0; c < 4; ++c )
1708 const int cn = corner_node[c];
1709 const double nx = normals_sh( cn, 0 );
1710 const double ny = normals_sh( cn, 1 );
1711 const double nz = normals_sh( cn, 2 );
1712 const double os0 = src_sh( cn, 0, lvl0 );
1713 const double os1 = src_sh( cn, 1, lvl0 );
1714 const double os2 = src_sh( cn, 2, lvl0 );
1715 const double u_n_val = nx * os0 + ny * os1 + nz * os2;
1716 const double corr = Ann_acc_cmb[c] * u_n_val;
1717 const int u = CMB_CORNER_TO_UNIQUE[c];
1718 dst8[0][u] += corr * nx;
1719 dst8[1][u] += corr * ny;
1720 dst8[2][u] += corr * nz;
1723 if ( !Diagonal && surf_freeslip )
1725 for (
int c = 0; c < 4; ++c )
1727 const int cn = corner_node[c];
1728 const double nx = normals_sh( cn, 0 );
1729 const double ny = normals_sh( cn, 1 );
1730 const double nz = normals_sh( cn, 2 );
1731 const double os0 = src_sh( cn, 0, lvl0 + 1 );
1732 const double os1 = src_sh( cn, 1, lvl0 + 1 );
1733 const double os2 = src_sh( cn, 2, lvl0 + 1 );
1734 const double u_n_val = nx * os0 + ny * os1 + nz * os2;
1735 const double corr = Ann_acc_surf[c] * u_n_val;
1736 const int u = SURF_CORNER_TO_UNIQUE[c];
1737 dst8[0][u] += corr * nx;
1738 dst8[1][u] += corr * ny;
1739 dst8[2][u] += corr * nz;
1745 for (
int dim_add = 0; dim_add < 3; ++dim_add )
1747 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell, dim_add ), dst8[dim_add][0] );
1748 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell, dim_add ), dst8[dim_add][1] );
1749 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell, dim_add ), dst8[dim_add][2] );
1750 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell + 1, dim_add ), dst8[dim_add][3] );
1752 &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1, dim_add ), dst8[dim_add][4] );
1754 &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][5] );
1756 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell, dim_add ), dst8[dim_add][6] );
1758 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][7] );
1774 KOKKOS_INLINE_FUNCTION
1777 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
1778 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
1780 if ( tr >= r_tile_ )
1783 if ( kernel_path_ == KernelPath::Slow )
1785 for (
int pass = 0; pass < r_passes_; ++pass )
1787 const int r_cell_pass = r0 + pass * r_tile_ + tr;
1788 if ( r_cell_pass >= hex_rad_ )
1790 const bool at_cmb =
has_flag( local_subdomain_id, x_cell, y_cell, r_cell_pass, CMB );
1791 const bool at_surface =
has_flag( local_subdomain_id, x_cell, y_cell, r_cell_pass + 1, SURFACE );
1793 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell_pass, at_cmb, at_surface );
1796 else if ( kernel_path_ == KernelPath::FastFreeslip )
1799 operator_fast_freeslip_path< true >(
1800 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
1802 operator_fast_freeslip_path< false >(
1803 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
1808 operator_fast_dirichlet_neumann_path< true >(
1809 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
1811 operator_fast_dirichlet_neumann_path< false >(
1812 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
1831 const auto det = J.det();
1832 const auto abs_det = Kokkos::abs( det );
1838 k_eval +=
shape( k, quad_point ) * k_local_hex[wedge]( k );
1843 sym_grad_i[k] =
symmetric_grad( J_inv_transposed, quad_point, k, dimi );
1844 sym_grad_j[k] =
symmetric_grad( J_inv_transposed, quad_point, k, dimj );
1847 jdet_keval_quadweight = quad_weight * k_eval * abs_det;
1853 KOKKOS_INLINE_FUNCTION
1855 const int local_subdomain_id,
1859 const int wedge )
const
1864 const ScalarT r_1 = radii_( local_subdomain_id, r_cell );
1865 const ScalarT r_2 = radii_( local_subdomain_id, r_cell + 1 );
1872 for (
int dimi = 0; dimi < 3; ++dimi )
1874 for (
int dimj = 0; dimj < 3; ++dimj )
1876 for (
int q = 0; q < num_quad_points; q++ )
1894 jdet_keval_quadweight );
1901 jdet_keval_quadweight *
1903 2.0 / 3.0 * sym_grad_j[j]( dimj, dimj ) * sym_grad_i[i]( dimi, dimi ) );