66 using Team = Kokkos::TeamPolicy<>::member_type;
78 BoundaryConditions bcs_;
99 int local_subdomains_;
102 int lat_refinement_level_;
121 enum class KernelPath
124 FastDirichletNeumann,
127 KernelPath kernel_path_ = KernelPath::FastDirichletNeumann;
138 void update_kernel_path_flag_host_only()
140 const BoundaryConditionFlag cmb_bc = get_boundary_condition_flag( bcs_, CMB );
141 const BoundaryConditionFlag surface_bc = get_boundary_condition_flag( bcs_, SURFACE );
143 const bool has_freeslip = ( cmb_bc == FREESLIP ) || ( surface_bc == FREESLIP );
146 if ( has_stored_matrices )
147 kernel_path_ = KernelPath::Slow;
148 else if ( has_freeslip )
149 kernel_path_ = KernelPath::FastFreeslip;
151 kernel_path_ = KernelPath::FastDirichletNeumann;
161 BoundaryConditions bcs,
172 , diagonal_( diagonal )
173 , operator_apply_mode_( operator_apply_mode )
174 , operator_communication_mode_( operator_communication_mode )
175 , operator_stored_matrix_mode_( operator_stored_matrix_mode )
176 , recv_buffers_( domain )
177 , comm_plan_( domain )
186 local_subdomains_ = domain_.
subdomains().size();
194 lat_tiles_ = ( hex_lat_ + lat_tile_ - 1 ) / lat_tile_;
195 r_tiles_ = ( hex_rad_ + r_tile_ - 1 ) / r_tile_;
197 team_size_ = lat_tile_ * lat_tile_ * r_tile_;
198 blocks_ = local_subdomains_ * lat_tiles_ * lat_tiles_ * r_tiles_;
200 r_min_ = domain_info.
radii()[0];
201 r_max_ = domain_info.
radii()[domain_info.
radii().size() - 1];
204 update_kernel_path_flag_host_only();
206 util::logroot <<
"[EpsilonDivDiv] tile size (x,y,r)=(" << lat_tile_ <<
"," << lat_tile_ <<
"," << r_tile_ <<
")"
208 util::logroot <<
"[EpsilonDivDiv] number of tiles (x,y,r)=(" << lat_tiles_ <<
"," << lat_tiles_ <<
","
209 << r_tiles_ <<
"), team_size=" << team_size_ <<
", blocks=" << blocks_ << std::endl;
210 const char* path_name = ( kernel_path_ == KernelPath::Slow ) ?
"slow" :
211 ( kernel_path_ == KernelPath::FastFreeslip ) ?
"fast-freeslip" :
212 "fast-dirichlet-neumann";
213 util::logroot <<
"[EpsilonDivDiv] kernel path = " << path_name << std::endl;
220 operator_apply_mode_ = operator_apply_mode;
221 operator_communication_mode_ = operator_communication_mode;
232 update_kernel_path_flag_host_only();
240 KOKKOS_INLINE_FUNCTION
242 const int local_subdomain_id,
248 return util::has_flag( mask_( local_subdomain_id, x_cell, y_cell, r_cell ), flag );
256 operator_stored_matrix_mode_ = operator_stored_matrix_mode;
261 domain_, operator_stored_matrix_mode_, level_range, GCAElements );
265 update_kernel_path_flag_host_only();
267 util::logroot <<
"[EpsilonDivDiv] (set_stored_matrix_mode) kernel path = "
268 << (( kernel_path_ == KernelPath::Slow ) ?
"slow" :
269 ( kernel_path_ == KernelPath::FastFreeslip ) ?
"fast-freeslip" :
270 "fast-dirichlet-neumann")
276 KOKKOS_INLINE_FUNCTION
278 const int local_subdomain_id,
286 local_matrix_storage_.
set_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge, mat );
289 KOKKOS_INLINE_FUNCTION
291 const int local_subdomain_id,
295 const int wedge )
const
299 if ( !local_matrix_storage_.
has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge ) )
301 Kokkos::abort(
"No matrix found at that spatial index." );
303 return local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge );
333 util::Timer timer_kernel(
"epsilon_divdiv_kernel" );
334 Kokkos::TeamPolicy<> policy( blocks_, team_size_ );
337 if ( kernel_path_ != KernelPath::Slow )
339 policy.set_scratch_size( 0, Kokkos::PerTeam(
team_shmem_size( team_size_ ) ) );
343 if ( kernel_path_ == KernelPath::Slow )
345 Kokkos::parallel_for(
346 "epsilon_divdiv_apply_kernel_slow", policy, KOKKOS_CLASS_LAMBDA(
const Team& team ) {
347 this->run_team_slow( team );
350 else if ( kernel_path_ == KernelPath::FastFreeslip )
352 Kokkos::parallel_for(
353 "epsilon_divdiv_apply_kernel_fast_freeslip", policy, KOKKOS_CLASS_LAMBDA(
const Team& team ) {
354 this->run_team_fast_freeslip( team );
359 Kokkos::parallel_for(
360 "epsilon_divdiv_apply_kernel_fast_dirichlet_neumann",
362 KOKKOS_CLASS_LAMBDA(
const Team& team ) { this->run_team_fast_dirichlet_neumann( team ); } );
385 KOKKOS_INLINE_FUNCTION
399 E00 = E11 = E22 = sym01 = sym02 = sym12 = gdd = 0.0;
430 KOKKOS_INLINE_FUNCTION
433 const int nlev = r_tile_ + 1;
434 const int n = lat_tile_ + 1;
435 const int nxy = n * n;
437 const size_t nscalars =
438 size_t( nxy ) * 3 + size_t( nxy ) * 3 * nlev + size_t( nxy ) * nlev + size_t( nlev ) + 1;
450 KOKKOS_INLINE_FUNCTION
451 void decode_team_indices(
453 int& local_subdomain_id,
464 int tmp = team.league_rank();
466 const int r_tile_id = tmp % r_tiles_;
469 const int lat_y_id = tmp % lat_tiles_;
472 const int lat_x_id = tmp % lat_tiles_;
475 local_subdomain_id = tmp;
477 x0 = lat_x_id * lat_tile_;
478 y0 = lat_y_id * lat_tile_;
479 r0 = r_tile_id * r_tile_;
481 const int tid = team.team_rank();
482 tx = tid % lat_tile_;
483 ty = ( tid / lat_tile_ ) % lat_tile_;
484 tr = tid / ( lat_tile_ * lat_tile_ );
506 KOKKOS_INLINE_FUNCTION
507 void run_team_slow(
const Team& team )
const
509 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
510 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
515 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
516 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
519 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
525 KOKKOS_INLINE_FUNCTION
526 void run_team_fast_dirichlet_neumann(
const Team& team )
const
528 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
529 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
534 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
535 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
537 operator_fast_dirichlet_neumann_path(
538 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
544 KOKKOS_INLINE_FUNCTION
545 void run_team_fast_freeslip(
const Team& team )
const
547 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
548 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
553 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
554 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
556 operator_fast_freeslip_path(
557 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
561 KOKKOS_INLINE_FUNCTION
562 void operator_slow_path(
564 const int local_subdomain_id,
575 const bool at_surface )
const
585 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ || r_cell >= hex_rad_ )
592 A[0] = local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
593 A[1] = local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
597 if ( local_matrix_storage_.
has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 ) &&
598 local_matrix_storage_.
has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 ) )
600 A[0] = local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
601 A[1] = local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
616 for (
int dimj = 0; dimj < 3; dimj++ )
630 dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > boundary_mask;
631 boundary_mask.fill( 1.0 );
633 bool freeslip_reorder =
false;
636 if ( at_cmb || at_surface )
641 if ( bcf == DIRICHLET )
643 for (
int dimi = 0; dimi < 3; ++dimi )
645 for (
int dimj = 0; dimj < 3; ++dimj )
651 if ( ( at_cmb && ( ( dimi == dimj && i != j && ( i < 3 || j < 3 ) ) ||
652 ( dimi != dimj && ( i < 3 || j < 3 ) ) ) ) ||
653 ( at_surface && ( ( dimi == dimj && i != j && ( i >= 3 || j >= 3 ) ) ||
654 ( dimi != dimj && ( i >= 3 || j >= 3 ) ) ) ) )
664 else if ( bcf == FREESLIP )
666 freeslip_reorder =
true;
669 for (
int wedge = 0; wedge < 2; ++wedge )
671 for (
int dimi = 0; dimi < 3; ++dimi )
675 for (
int dimj = 0; dimj < 3; ++dimj )
679 A_tmp[wedge]( node_idxi * 3 + dimi, node_idxj * 3 + dimj ) = A[wedge](
689 constexpr int layer_hex_offset_x[2][3] = { { 0, 1, 0 }, { 1, 0, 1 } };
690 constexpr int layer_hex_offset_y[2][3] = { { 0, 0, 1 }, { 1, 1, 0 } };
692 for (
int wedge = 0; wedge < 2; ++wedge )
696 R[wedge]( i, i ) = 1.0;
699 for (
int boundary_node_idx = 0; boundary_node_idx < 3; boundary_node_idx++ )
703 x_cell + layer_hex_offset_x[wedge][boundary_node_idx],
704 y_cell + layer_hex_offset_y[wedge][boundary_node_idx],
705 r_cell + ( at_cmb ? 0 : 1 ),
711 const int offset_in_R = at_cmb ? 0 : 9;
712 for (
int dimi = 0; dimi < 3; ++dimi )
714 for (
int dimj = 0; dimj < 3; ++dimj )
717 offset_in_R + boundary_node_idx * 3 + dimi,
718 offset_in_R + boundary_node_idx * 3 + dimj ) = R_i( dimi, dimj );
723 A[wedge] = R[wedge] * A_tmp[wedge] * R[wedge].
transposed();
725 auto src_tmp = R[wedge] * src[wedge];
726 for (
int i = 0; i < 18; ++i )
728 src[wedge]( i ) = src_tmp( i );
731 const int node_start = at_surface ? 3 : 0;
732 const int node_end = at_surface ? 6 : 3;
734 for (
int node_idx = node_start; node_idx < node_end; node_idx++ )
736 const int idx = node_idx * 3;
737 for (
int k = 0; k < 18; ++k )
741 boundary_mask( idx, k ) = 0.0;
742 boundary_mask( k, idx ) = 0.0;
762 dst[0] = A[0] * src[0];
763 dst[1] = A[1] * src[1];
765 if ( freeslip_reorder )
769 dst_tmp[1] = R[1].transposed() * dst[1];
771 for (
int i = 0; i < 18; ++i )
773 dst[0]( i ) = dst_tmp[0]( i );
774 dst[1]( i ) = dst_tmp[1]( i );
781 for (
int dimi = 0; dimi < 3; dimi++ )
788 dst_, local_subdomain_id, x_cell, y_cell, r_cell, dimi, dst_d );
793 KOKKOS_INLINE_FUNCTION
794 void operator_fast_dirichlet_neumann_path(
796 const int local_subdomain_id,
807 const bool at_surface )
const
809 const int nlev = r_tile_ + 1;
810 const int nxy = ( lat_tile_ + 1 ) * ( lat_tile_ + 1 );
813 reinterpret_cast< double*
>( team.team_shmem().get_shmem(
team_shmem_size( team.team_size() ) ) );
815 using ScratchCoords =
816 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
817 using ScratchSrc = Kokkos::
818 View< double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
820 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
822 ScratchCoords coords_sh( shmem, nxy, 3 );
825 ScratchSrc src_sh( shmem, nxy, 3, nlev );
826 shmem += nxy * 3 * nlev;
828 ScratchK k_sh( shmem, nxy, nlev );
832 Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >(
835 auto node_id = [&](
int nx,
int ny ) ->
int {
return nx + ( lat_tile_ + 1 ) * ny; };
837 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&](
int n ) {
838 const int dxn = n % ( lat_tile_ + 1 );
839 const int dyn = n / ( lat_tile_ + 1 );
840 const int xi = x0 + dxn;
841 const int yi = y0 + dyn;
843 if ( xi <= hex_lat_ && yi <= hex_lat_ )
845 coords_sh( n, 0 ) = grid_( local_subdomain_id, xi, yi, 0 );
846 coords_sh( n, 1 ) = grid_( local_subdomain_id, xi, yi, 1 );
847 coords_sh( n, 2 ) = grid_( local_subdomain_id, xi, yi, 2 );
851 coords_sh( n, 0 ) = coords_sh( n, 1 ) = coords_sh( n, 2 ) = 0.0;
855 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&](
int lvl ) {
856 const int rr = r0 + lvl;
857 r_sh( lvl ) = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
860 const int total_pairs = nxy * nlev;
861 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total_pairs ), [&](
int t ) {
862 const int lvl = t / nxy;
863 const int node = t - lvl * nxy;
865 const int dxn = node % ( lat_tile_ + 1 );
866 const int dyn = node / ( lat_tile_ + 1 );
868 const int xi = x0 + dxn;
869 const int yi = y0 + dyn;
870 const int rr = r0 + lvl;
872 if ( xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_ )
874 k_sh( node, lvl ) = k_( local_subdomain_id, xi, yi, rr );
875 src_sh( node, 0, lvl ) = src_( local_subdomain_id, xi, yi, rr, 0 );
876 src_sh( node, 1, lvl ) = src_( local_subdomain_id, xi, yi, rr, 1 );
877 src_sh( node, 2, lvl ) = src_( local_subdomain_id, xi, yi, rr, 2 );
881 k_sh( node, lvl ) = 0.0;
882 src_sh( node, 0, lvl ) = src_sh( node, 1, lvl ) = src_sh( node, 2, lvl ) = 0.0;
888 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ || r_cell >= hex_rad_ )
892 const double r_0 = r_sh( lvl0 );
893 const double r_1 = r_sh( lvl0 + 1 );
895 const bool at_boundary = at_cmb || at_surface;
896 bool treat_boundary_dirichlet =
false;
903 const int cmb_shift = ( ( at_boundary && treat_boundary_dirichlet && ( !diagonal_ ) && at_cmb ) ? 3 : 0 );
905 ( ( at_boundary && treat_boundary_dirichlet && ( !diagonal_ ) && at_surface ) ? 3 : 0 );
907 static constexpr int WEDGE_NODE_OFF[2][6][3] = {
908 { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
909 { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };
911 static constexpr int WEDGE_TO_UNIQUE[2][6] = {
912 { 0, 1, 2, 3, 4, 5 },
916 constexpr double ONE_THIRD = 1.0 / 3.0;
917 constexpr double ONE_SIXTH = 1.0 / 6.0;
918 constexpr double NEG_TWO_THIRDS = -0.66666666666666663;
920 static constexpr double dN_ref[6][3] = {
921 { -0.5, -0.5, -ONE_SIXTH },
922 { 0.5, 0.0, -ONE_SIXTH },
923 { 0.0, 0.5, -ONE_SIXTH },
924 { -0.5, -0.5, ONE_SIXTH },
925 { 0.5, 0.0, ONE_SIXTH },
926 { 0.0, 0.5, ONE_SIXTH } };
928 const int n00 = node_id( tx, ty );
929 const int n01 = node_id( tx, ty + 1 );
930 const int n10 = node_id( tx + 1, ty );
931 const int n11 = node_id( tx + 1, ty + 1 );
935 ws[0][0][0] = coords_sh( n00, 0 );
936 ws[0][0][1] = coords_sh( n00, 1 );
937 ws[0][0][2] = coords_sh( n00, 2 );
938 ws[0][1][0] = coords_sh( n10, 0 );
939 ws[0][1][1] = coords_sh( n10, 1 );
940 ws[0][1][2] = coords_sh( n10, 2 );
941 ws[0][2][0] = coords_sh( n01, 0 );
942 ws[0][2][1] = coords_sh( n01, 1 );
943 ws[0][2][2] = coords_sh( n01, 2 );
945 ws[1][0][0] = coords_sh( n11, 0 );
946 ws[1][0][1] = coords_sh( n11, 1 );
947 ws[1][0][2] = coords_sh( n11, 2 );
948 ws[1][1][0] = coords_sh( n01, 0 );
949 ws[1][1][1] = coords_sh( n01, 1 );
950 ws[1][1][2] = coords_sh( n01, 2 );
951 ws[1][2][0] = coords_sh( n10, 0 );
952 ws[1][2][1] = coords_sh( n10, 1 );
953 ws[1][2][2] = coords_sh( n10, 2 );
955 double dst8[3][8] = { 0.0 };
957 for (
int w = 0; w < 2; ++w )
961 for (
int node = 0; node < 6; ++node )
963 const int ddx = WEDGE_NODE_OFF[w][node][0];
964 const int ddy = WEDGE_NODE_OFF[w][node][1];
965 const int ddr = WEDGE_NODE_OFF[w][node][2];
967 const int nid = node_id( tx + ddx, ty + ddy );
968 const int lvl = lvl0 + ddr;
970 k_sum += k_sh( nid, lvl );
972 const double k_eval = ONE_SIXTH * k_sum;
975 double i00, i01, i02;
976 double i10, i11, i12;
977 double i20, i21, i22;
980 const double half_dr = 0.5 * ( r_1 - r_0 );
981 const double r_mid = 0.5 * ( r_0 + r_1 );
983 const double J_0_0 = r_mid * ( -ws[w][0][0] + ws[w][1][0] );
984 const double J_0_1 = r_mid * ( -ws[w][0][0] + ws[w][2][0] );
985 const double J_0_2 = half_dr * ( ONE_THIRD * ( ws[w][0][0] + ws[w][1][0] + ws[w][2][0] ) );
987 const double J_1_0 = r_mid * ( -ws[w][0][1] + ws[w][1][1] );
988 const double J_1_1 = r_mid * ( -ws[w][0][1] + ws[w][2][1] );
989 const double J_1_2 = half_dr * ( ONE_THIRD * ( ws[w][0][1] + ws[w][1][1] + ws[w][2][1] ) );
991 const double J_2_0 = r_mid * ( -ws[w][0][2] + ws[w][1][2] );
992 const double J_2_1 = r_mid * ( -ws[w][0][2] + ws[w][2][2] );
993 const double J_2_2 = half_dr * ( ONE_THIRD * ( ws[w][0][2] + ws[w][1][2] + ws[w][2][2] ) );
995 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 - J_0_1 * J_1_0 * J_2_2 +
996 J_0_1 * J_1_2 * J_2_0 + J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
998 const double invJ = 1.0 /
J_det;
1000 i00 = invJ * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
1001 i01 = invJ * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
1002 i02 = invJ * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
1003 i10 = invJ * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
1004 i11 = invJ * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
1005 i12 = invJ * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
1006 i20 = invJ * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
1007 i21 = invJ * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
1008 i22 = invJ * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
1010 wJ = Kokkos::abs( J_det );
1013 const double kwJ =
k_eval * wJ;
1016 double gu10 = 0.0, gu11 = 0.0;
1017 double gu20 = 0.0, gu21 = 0.0, gu22 = 0.0;
1022 for (
int dimj = 0; dimj < 3; ++dimj )
1025 for (
int node_idx = cmb_shift; node_idx < 6 -
surface_shift; ++node_idx )
1027 const double gx = dN_ref[node_idx][0];
1028 const double gy = dN_ref[node_idx][1];
1029 const double gz = dN_ref[node_idx][2];
1031 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1032 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1033 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1035 double E00, E11, E22, sym01, sym02, sym12, gdd;
1036 column_grad_to_sym( dimj, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );
1038 const int ddx = WEDGE_NODE_OFF[w][node_idx][0];
1039 const int ddy = WEDGE_NODE_OFF[w][node_idx][1];
1040 const int ddr = WEDGE_NODE_OFF[w][node_idx][2];
1042 const int nid = node_id( tx + ddx, ty + ddy );
1043 const int lvl = lvl0 + ddr;
1045 const double s = src_sh( nid, dimj, lvl );
1057 for (
int dimi = 0; dimi < 3; ++dimi )
1060 for (
int node_idx = cmb_shift; node_idx < 6 -
surface_shift; ++node_idx )
1062 const double gx = dN_ref[node_idx][0];
1063 const double gy = dN_ref[node_idx][1];
1064 const double gz = dN_ref[node_idx][2];
1066 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1067 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1068 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1070 double E00, E11, E22, sym01, sym02, sym12, gdd;
1071 column_grad_to_sym( dimi, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );
1073 const int u = WEDGE_TO_UNIQUE[w][node_idx];
1076 kwJ * ( NEG_TWO_THIRDS *
div_u * gdd + 4.0 * sym01 * gu10 + 4.0 * sym02 * gu20 +
1077 4.0 * sym12 * gu21 + 2.0 * E00 * gu00 + 2.0 * E11 * gu11 + 2.0 * E22 * gu22 );
1082 if ( diagonal_ || ( treat_boundary_dirichlet && at_boundary ) )
1084 for (
int dim_diagBC = 0; dim_diagBC < 3; ++dim_diagBC )
1087 for (
int node_idx = surface_shift; node_idx < 6 -
cmb_shift; ++node_idx )
1089 const double gx = dN_ref[node_idx][0];
1090 const double gy = dN_ref[node_idx][1];
1091 const double gz = dN_ref[node_idx][2];
1093 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1094 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1095 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1097 double E00, E11, E22, sym01, sym02, sym12, gdd;
1098 column_grad_to_sym( dim_diagBC, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );
1100 const int ddx = WEDGE_NODE_OFF[w][node_idx][0];
1101 const int ddy = WEDGE_NODE_OFF[w][node_idx][1];
1102 const int ddr = WEDGE_NODE_OFF[w][node_idx][2];
1104 const int nid = node_id( tx + ddx, ty + ddy );
1105 const int lvl = lvl0 + ddr;
1107 const double s = src_sh( nid, dim_diagBC, lvl );
1108 const int u = WEDGE_TO_UNIQUE[w][node_idx];
1110 dst8[dim_diagBC][u] += kwJ * ( 4.0 * s * ( sym01 * sym01 + sym02 * sym02 + sym12 * sym12 ) +
1111 2.0 * s * ( E00 * E00 + E11 * E11 + E22 * E22 ) +
1112 NEG_TWO_THIRDS * ( gdd * gdd ) * s );
1118 for (
int dim_add = 0; dim_add < 3; ++dim_add )
1120 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell, dim_add ), dst8[dim_add][0] );
1121 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell, dim_add ), dst8[dim_add][1] );
1122 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell, dim_add ), dst8[dim_add][2] );
1123 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell + 1, dim_add ), dst8[dim_add][3] );
1125 &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1, dim_add ), dst8[dim_add][4] );
1127 &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][5] );
1129 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell, dim_add ), dst8[dim_add][6] );
1131 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][7] );
1136 KOKKOS_INLINE_FUNCTION
1137 void normalize3(
double& x,
double& y,
double& z )
const
1139 const double n2 = x * x + y * y + z * z;
1142 const double invn = 1.0 / Kokkos::sqrt( n2 );
1149 KOKKOS_INLINE_FUNCTION
1150 void project_tangential_inplace(
1158 const double dot = nx * ux + ny * uy + nz * uz;
1164 KOKKOS_INLINE_FUNCTION
1165 void operator_fast_freeslip_path(
1167 const int local_subdomain_id,
1178 const bool at_surface )
const
1180 const int nlev = r_tile_ + 1;
1181 const int nxy = ( lat_tile_ + 1 ) * ( lat_tile_ + 1 );
1184 reinterpret_cast< double*
>( team.team_shmem().get_shmem(
team_shmem_size( team.team_size() ) ) );
1186 using ScratchCoords =
1187 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1188 using ScratchSrc = Kokkos::
1189 View< double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1191 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1193 ScratchCoords coords_sh( shmem, nxy, 3 );
1196 ScratchSrc src_sh( shmem, nxy, 3, nlev );
1197 shmem += nxy * 3 * nlev;
1199 ScratchK k_sh( shmem, nxy, nlev );
1200 shmem += nxy * nlev;
1203 Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >(
1206 auto node_id = [&](
int nx,
int ny ) ->
int {
return nx + ( lat_tile_ + 1 ) * ny; };
1208 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&](
int n ) {
1209 const int dxn = n % ( lat_tile_ + 1 );
1210 const int dyn = n / ( lat_tile_ + 1 );
1211 const int xi = x0 + dxn;
1212 const int yi = y0 + dyn;
1214 if ( xi <= hex_lat_ && yi <= hex_lat_ )
1216 coords_sh( n, 0 ) = grid_( local_subdomain_id, xi, yi, 0 );
1217 coords_sh( n, 1 ) = grid_( local_subdomain_id, xi, yi, 1 );
1218 coords_sh( n, 2 ) = grid_( local_subdomain_id, xi, yi, 2 );
1222 coords_sh( n, 0 ) = coords_sh( n, 1 ) = coords_sh( n, 2 ) = 0.0;
1226 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&](
int lvl ) {
1227 const int rr = r0 + lvl;
1228 r_sh( lvl ) = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
1231 const int total_pairs = nxy * nlev;
1232 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total_pairs ), [&](
int t ) {
1233 const int lvl = t / nxy;
1234 const int node = t - lvl * nxy;
1236 const int dxn = node % ( lat_tile_ + 1 );
1237 const int dyn = node / ( lat_tile_ + 1 );
1239 const int xi = x0 + dxn;
1240 const int yi = y0 + dyn;
1241 const int rr = r0 + lvl;
1243 if ( xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_ )
1245 k_sh( node, lvl ) = k_( local_subdomain_id, xi, yi, rr );
1246 src_sh( node, 0, lvl ) = src_( local_subdomain_id, xi, yi, rr, 0 );
1247 src_sh( node, 1, lvl ) = src_( local_subdomain_id, xi, yi, rr, 1 );
1248 src_sh( node, 2, lvl ) = src_( local_subdomain_id, xi, yi, rr, 2 );
1252 k_sh( node, lvl ) = 0.0;
1253 src_sh( node, 0, lvl ) = src_sh( node, 1, lvl ) = src_sh( node, 2, lvl ) = 0.0;
1257 team.team_barrier();
1259 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ || r_cell >= hex_rad_ )
1262 const int lvl0 = tr;
1263 const double r_0 = r_sh( lvl0 );
1264 const double r_1 = r_sh( lvl0 + 1 );
1269 const bool cmb_freeslip = at_cmb && ( cmb_bc ==
FREESLIP );
1270 const bool surf_freeslip = at_surface && ( surface_bc ==
FREESLIP );
1271 const bool cmb_dirichlet = at_cmb && ( cmb_bc ==
DIRICHLET );
1272 const bool surface_dirichlet = at_surface && ( surface_bc ==
DIRICHLET );
1274 const int cmb_shift = ( ( cmb_dirichlet && ( !diagonal_ ) ) ? 3 : 0 );
1275 const int surface_shift = ( ( surface_dirichlet && ( !diagonal_ ) ) ? 3 : 0 );
1277 static constexpr int WEDGE_NODE_OFF[2][6][3] = {
1278 { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
1279 { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };
1281 static constexpr int WEDGE_TO_UNIQUE[2][6] = {
1282 { 0, 1, 2, 3, 4, 5 },
1283 { 6, 2, 1, 7, 5, 4 }
1286 constexpr double ONE_THIRD = 1.0 / 3.0;
1287 constexpr double ONE_SIXTH = 1.0 / 6.0;
1288 constexpr double NEG_TWO_THIRDS = -0.66666666666666663;
1290 static constexpr double dN_ref[6][3] = {
1291 { -0.5, -0.5, -ONE_SIXTH },
1292 { 0.5, 0.0, -ONE_SIXTH },
1293 { 0.0, 0.5, -ONE_SIXTH },
1294 { -0.5, -0.5, ONE_SIXTH },
1295 { 0.5, 0.0, ONE_SIXTH },
1296 { 0.0, 0.5, ONE_SIXTH } };
1298 const int n00 = node_id( tx, ty );
1299 const int n01 = node_id( tx, ty + 1 );
1300 const int n10 = node_id( tx + 1, ty );
1301 const int n11 = node_id( tx + 1, ty + 1 );
1305 ws[0][0][0] = coords_sh( n00, 0 );
1306 ws[0][0][1] = coords_sh( n00, 1 );
1307 ws[0][0][2] = coords_sh( n00, 2 );
1308 ws[0][1][0] = coords_sh( n10, 0 );
1309 ws[0][1][1] = coords_sh( n10, 1 );
1310 ws[0][1][2] = coords_sh( n10, 2 );
1311 ws[0][2][0] = coords_sh( n01, 0 );
1312 ws[0][2][1] = coords_sh( n01, 1 );
1313 ws[0][2][2] = coords_sh( n01, 2 );
1315 ws[1][0][0] = coords_sh( n11, 0 );
1316 ws[1][0][1] = coords_sh( n11, 1 );
1317 ws[1][0][2] = coords_sh( n11, 2 );
1318 ws[1][1][0] = coords_sh( n01, 0 );
1319 ws[1][1][1] = coords_sh( n01, 1 );
1320 ws[1][1][2] = coords_sh( n01, 2 );
1321 ws[1][2][0] = coords_sh( n10, 0 );
1322 ws[1][2][1] = coords_sh( n10, 1 );
1323 ws[1][2][2] = coords_sh( n10, 2 );
1327 src8[0][0] = src_sh( n00, 0, lvl0 );
1328 src8[1][0] = src_sh( n00, 1, lvl0 );
1329 src8[2][0] = src_sh( n00, 2, lvl0 );
1330 src8[0][1] = src_sh( n10, 0, lvl0 );
1331 src8[1][1] = src_sh( n10, 1, lvl0 );
1332 src8[2][1] = src_sh( n10, 2, lvl0 );
1333 src8[0][2] = src_sh( n01, 0, lvl0 );
1334 src8[1][2] = src_sh( n01, 1, lvl0 );
1335 src8[2][2] = src_sh( n01, 2, lvl0 );
1336 src8[0][6] = src_sh( n11, 0, lvl0 );
1337 src8[1][6] = src_sh( n11, 1, lvl0 );
1338 src8[2][6] = src_sh( n11, 2, lvl0 );
1340 src8[0][3] = src_sh( n00, 0, lvl0 + 1 );
1341 src8[1][3] = src_sh( n00, 1, lvl0 + 1 );
1342 src8[2][3] = src_sh( n00, 2, lvl0 + 1 );
1343 src8[0][4] = src_sh( n10, 0, lvl0 + 1 );
1344 src8[1][4] = src_sh( n10, 1, lvl0 + 1 );
1345 src8[2][4] = src_sh( n10, 2, lvl0 + 1 );
1346 src8[0][5] = src_sh( n01, 0, lvl0 + 1 );
1347 src8[1][5] = src_sh( n01, 1, lvl0 + 1 );
1348 src8[2][5] = src_sh( n01, 2, lvl0 + 1 );
1349 src8[0][7] = src_sh( n11, 0, lvl0 + 1 );
1350 src8[1][7] = src_sh( n11, 1, lvl0 + 1 );
1351 src8[2][7] = src_sh( n11, 2, lvl0 + 1 );
1353 double nx00 = coords_sh( n00, 0 ), ny00 = coords_sh( n00, 1 ), nz00 = coords_sh( n00, 2 );
1354 double nx10 = coords_sh( n10, 0 ), ny10 = coords_sh( n10, 1 ), nz10 = coords_sh( n10, 2 );
1355 double nx01 = coords_sh( n01, 0 ), ny01 = coords_sh( n01, 1 ), nz01 = coords_sh( n01, 2 );
1356 double nx11 = coords_sh( n11, 0 ), ny11 = coords_sh( n11, 1 ), nz11 = coords_sh( n11, 2 );
1358 normalize3( nx00, ny00, nz00 );
1359 normalize3( nx10, ny10, nz10 );
1360 normalize3( nx01, ny01, nz01 );
1361 normalize3( nx11, ny11, nz11 );
1370 double u_n_cmb[4] = {};
1371 double u_n_surf[4] = {};
1374 u_n_cmb[0] = nx00 * src8[0][0] + ny00 * src8[1][0] + nz00 * src8[2][0];
1375 u_n_cmb[1] = nx10 * src8[0][1] + ny10 * src8[1][1] + nz10 * src8[2][1];
1376 u_n_cmb[2] = nx01 * src8[0][2] + ny01 * src8[1][2] + nz01 * src8[2][2];
1377 u_n_cmb[3] = nx11 * src8[0][6] + ny11 * src8[1][6] + nz11 * src8[2][6];
1379 if ( surf_freeslip )
1381 u_n_surf[0] = nx00 * src8[0][3] + ny00 * src8[1][3] + nz00 * src8[2][3];
1382 u_n_surf[1] = nx10 * src8[0][4] + ny10 * src8[1][4] + nz10 * src8[2][4];
1383 u_n_surf[2] = nx01 * src8[0][5] + ny01 * src8[1][5] + nz01 * src8[2][5];
1384 u_n_surf[3] = nx11 * src8[0][7] + ny11 * src8[1][7] + nz11 * src8[2][7];
1389 project_tangential_inplace( nx00, ny00, nz00, src8[0][0], src8[1][0], src8[2][0] );
1390 project_tangential_inplace( nx10, ny10, nz10, src8[0][1], src8[1][1], src8[2][1] );
1391 project_tangential_inplace( nx01, ny01, nz01, src8[0][2], src8[1][2], src8[2][2] );
1392 project_tangential_inplace( nx11, ny11, nz11, src8[0][6], src8[1][6], src8[2][6] );
1394 if ( surf_freeslip )
1396 project_tangential_inplace( nx00, ny00, nz00, src8[0][3], src8[1][3], src8[2][3] );
1397 project_tangential_inplace( nx10, ny10, nz10, src8[0][4], src8[1][4], src8[2][4] );
1398 project_tangential_inplace( nx01, ny01, nz01, src8[0][5], src8[1][5], src8[2][5] );
1399 project_tangential_inplace( nx11, ny11, nz11, src8[0][7], src8[1][7], src8[2][7] );
1402 double dst8[3][8] = { { 0.0 } };
1403 double normal_corr[3][8] = {};
1407 for (
int w = 0; w < 2; ++w )
1411 for (
int node = 0; node < 6; ++node )
1413 const int ddx = WEDGE_NODE_OFF[w][node][0];
1414 const int ddy = WEDGE_NODE_OFF[w][node][1];
1415 const int ddr = WEDGE_NODE_OFF[w][node][2];
1417 const int nid = node_id( tx + ddx, ty + ddy );
1418 const int lvl = lvl0 + ddr;
1419 k_sum += k_sh( nid, lvl );
1421 const double k_eval = ONE_SIXTH * k_sum;
1424 double i00, i01, i02;
1425 double i10, i11, i12;
1426 double i20, i21, i22;
1429 const double half_dr = 0.5 * ( r_1 - r_0 );
1430 const double r_mid = 0.5 * ( r_0 + r_1 );
1432 const double J_0_0 = r_mid * ( -ws[w][0][0] + ws[w][1][0] );
1433 const double J_0_1 = r_mid * ( -ws[w][0][0] + ws[w][2][0] );
1434 const double J_0_2 = half_dr * ( ONE_THIRD * ( ws[w][0][0] + ws[w][1][0] + ws[w][2][0] ) );
1436 const double J_1_0 = r_mid * ( -ws[w][0][1] + ws[w][1][1] );
1437 const double J_1_1 = r_mid * ( -ws[w][0][1] + ws[w][2][1] );
1438 const double J_1_2 = half_dr * ( ONE_THIRD * ( ws[w][0][1] + ws[w][1][1] + ws[w][2][1] ) );
1440 const double J_2_0 = r_mid * ( -ws[w][0][2] + ws[w][1][2] );
1441 const double J_2_1 = r_mid * ( -ws[w][0][2] + ws[w][2][2] );
1442 const double J_2_2 = half_dr * ( ONE_THIRD * ( ws[w][0][2] + ws[w][1][2] + ws[w][2][2] ) );
1444 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 - J_0_1 * J_1_0 * J_2_2 +
1445 J_0_1 * J_1_2 * J_2_0 + J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
1447 const double invJ = 1.0 /
J_det;
1449 i00 = invJ * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
1450 i01 = invJ * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
1451 i02 = invJ * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
1453 i10 = invJ * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
1454 i11 = invJ * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
1455 i12 = invJ * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
1457 i20 = invJ * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
1458 i21 = invJ * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
1459 i22 = invJ * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
1461 wJ = Kokkos::abs( J_det );
1464 const double kwJ =
k_eval * wJ;
1475 if ( !diagonal_ && ( cmb_freeslip || surf_freeslip ) )
1477 static constexpr int CMB_NODE_TO_CORNER[2][3] = { { 0, 1, 2 }, { 3, 2, 1 } };
1478 const double cn[4][3] = { { nx00, ny00, nz00 },
1479 { nx10, ny10, nz10 },
1480 { nx01, ny01, nz01 },
1481 { nx11, ny11, nz11 } };
1484 for (
int ni = 0; ni < 3; ++ni )
1486 const int corner = CMB_NODE_TO_CORNER[w][ni];
1487 const double nxu = cn[corner][0], nyu = cn[corner][1], nzu = cn[corner][2];
1489 const double gx = dN_ref[ni][0];
1490 const double gy = dN_ref[ni][1];
1491 const double gz = dN_ref[ni][2];
1492 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1493 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1494 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1495 const double gg = g0 * g0 +
g1 *
g1 +
g2 *
g2;
1496 const double ng = nxu * g0 + nyu *
g1 + nzu *
g2;
1497 const double Ann = kwJ * ( gg + ONE_THIRD * ng * ng );
1498 const double c = Ann * u_n_cmb[corner];
1499 const int u = WEDGE_TO_UNIQUE[w][ni];
1501 normal_corr[0][u] += c * nxu;
1502 normal_corr[1][u] += c * nyu;
1503 normal_corr[2][u] += c * nzu;
1506 if ( surf_freeslip )
1508 for (
int ni = 0; ni < 3; ++ni )
1510 const int corner = CMB_NODE_TO_CORNER[w][ni];
1511 const double nxu = cn[corner][0], nyu = cn[corner][1], nzu = cn[corner][2];
1513 const double gx = dN_ref[ni + 3][0];
1514 const double gy = dN_ref[ni + 3][1];
1515 const double gz = dN_ref[ni + 3][2];
1516 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1517 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1518 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1519 const double gg = g0 * g0 +
g1 *
g1 +
g2 *
g2;
1520 const double ng = nxu * g0 + nyu *
g1 + nzu *
g2;
1521 const double Ann = kwJ * ( gg + ONE_THIRD * ng * ng );
1522 const double c = Ann * u_n_surf[corner];
1523 const int u = WEDGE_TO_UNIQUE[w][ni + 3];
1525 normal_corr[0][u] += c * nxu;
1526 normal_corr[1][u] += c * nyu;
1527 normal_corr[2][u] += c * nzu;
1533 double gu10 = 0.0, gu11 = 0.0;
1534 double gu20 = 0.0, gu21 = 0.0, gu22 = 0.0;
1539 for (
int dimj = 0; dimj < 3; ++dimj )
1542 for (
int node_idx = cmb_shift; node_idx < 6 -
surface_shift; ++node_idx )
1544 const double gx = dN_ref[node_idx][0];
1545 const double gy = dN_ref[node_idx][1];
1546 const double gz = dN_ref[node_idx][2];
1548 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1549 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1550 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1552 double E00, E11, E22, sym01, sym02, sym12, gdd;
1553 column_grad_to_sym( dimj, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );
1555 const int u = WEDGE_TO_UNIQUE[w][node_idx];
1556 const double s = src8[dimj][u];
1568 for (
int dimi = 0; dimi < 3; ++dimi )
1571 for (
int node_idx = cmb_shift; node_idx < 6 -
surface_shift; ++node_idx )
1573 const double gx = dN_ref[node_idx][0];
1574 const double gy = dN_ref[node_idx][1];
1575 const double gz = dN_ref[node_idx][2];
1577 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1578 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1579 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1581 double E00, E11, E22, sym01, sym02, sym12, gdd;
1582 column_grad_to_sym( dimi, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );
1584 const int u = WEDGE_TO_UNIQUE[w][node_idx];
1587 kwJ * ( NEG_TWO_THIRDS *
div_u * gdd + 4.0 * sym01 * gu10 + 4.0 * sym02 * gu20 +
1588 4.0 * sym12 * gu21 + 2.0 * E00 * gu00 + 2.0 * E11 * gu11 + 2.0 * E22 * gu22 );
1594 if ( diagonal_ || cmb_dirichlet || surface_dirichlet )
1596 for (
int dim_diagBC = 0; dim_diagBC < 3; ++dim_diagBC )
1598 for (
int node_idx = surface_shift; node_idx < 6 -
cmb_shift; ++node_idx )
1602 if ( diagonal_ && cmb_freeslip && node_idx < 3 )
1604 if ( diagonal_ && surf_freeslip && node_idx >= 3 )
1607 const double gx = dN_ref[node_idx][0];
1608 const double gy = dN_ref[node_idx][1];
1609 const double gz = dN_ref[node_idx][2];
1611 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1612 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1613 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1615 double E00, E11, E22, sym01, sym02, sym12, gdd;
1616 column_grad_to_sym( dim_diagBC, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );
1618 const int u = WEDGE_TO_UNIQUE[w][node_idx];
1619 const double s = src8[dim_diagBC][u];
1621 dst8[dim_diagBC][u] += kwJ * ( 4.0 * s * ( sym01 * sym01 + sym02 * sym02 + sym12 * sym12 ) +
1622 2.0 * s * ( E00 * E00 + E11 * E11 + E22 * E22 ) +
1623 NEG_TWO_THIRDS * ( gdd * gdd ) * s );
1634 static constexpr int FS_CORNER_MAP[2][3] = { { 0, 1, 2 }, { 3, 2, 1 } };
1635 const double ncoords[4][3] = { { nx00, ny00, nz00 },
1636 { nx10, ny10, nz10 },
1637 { nx01, ny01, nz01 },
1638 { nx11, ny11, nz11 } };
1640 auto apply_rotated_diag =
1641 [&](
const int ni,
const int node_idx,
const double* u_n_arr ) {
1642 const int corner = FS_CORNER_MAP[w][ni];
1643 const double nxu = ncoords[corner][0];
1644 const double nyu = ncoords[corner][1];
1645 const double nzu = ncoords[corner][2];
1646 const int u = WEDGE_TO_UNIQUE[w][node_idx];
1648 const double gx = dN_ref[node_idx][0];
1649 const double gy = dN_ref[node_idx][1];
1650 const double gz = dN_ref[node_idx][2];
1652 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1653 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1654 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1655 const double gg_loc = g0 * g0 +
g1 *
g1 +
g2 *
g2;
1657 dense::Vec< double, 3 > n_vec;
1661 const auto R_rot = trafo_mat_cartesian_to_normal_tangential< double >( n_vec );
1665 const double s0 = src8[0][u] + u_n_arr[corner] * nxu;
1666 const double s1 = src8[1][u] + u_n_arr[corner] * nyu;
1667 const double s2 = src8[2][u] + u_n_arr[corner] * nzu;
1670 const double Rg0 = R_rot( 0, 0 ) * g0 + R_rot( 0, 1 ) *
g1 + R_rot( 0, 2 ) *
g2;
1671 const double Rg1 = R_rot( 1, 0 ) * g0 + R_rot( 1, 1 ) *
g1 + R_rot( 1, 2 ) *
g2;
1672 const double Rg2 = R_rot( 2, 0 ) * g0 + R_rot( 2, 1 ) *
g1 + R_rot( 2, 2 ) *
g2;
1673 const double Rs0 = R_rot( 0, 0 ) * s0 + R_rot( 0, 1 ) * s1 + R_rot( 0, 2 ) * s2;
1674 const double Rs1 = R_rot( 1, 0 ) * s0 + R_rot( 1, 1 ) * s1 + R_rot( 1, 2 ) * s2;
1675 const double Rs2 = R_rot( 2, 0 ) * s0 + R_rot( 2, 1 ) * s1 + R_rot( 2, 2 ) * s2;
1679 const double v0 = kwJ * ( gg_loc + ONE_THIRD * Rg0 * Rg0 ) * Rs0;
1680 const double v1 = kwJ * ( gg_loc + ONE_THIRD * Rg1 * Rg1 ) * Rs1;
1681 const double v2 = kwJ * ( gg_loc + ONE_THIRD * Rg2 * Rg2 ) * Rs2;
1684 dst8[0][u] += R_rot( 0, 0 ) * v0 + R_rot( 1, 0 ) * v1 + R_rot( 2, 0 ) * v2;
1685 dst8[1][u] += R_rot( 0, 1 ) * v0 + R_rot( 1, 1 ) * v1 + R_rot( 2, 1 ) * v2;
1686 dst8[2][u] += R_rot( 0, 2 ) * v0 + R_rot( 1, 2 ) * v1 + R_rot( 2, 2 ) * v2;
1691 for (
int ni = 0; ni < 3; ++ni )
1692 apply_rotated_diag( ni, ni, u_n_cmb );
1694 if ( surf_freeslip )
1696 for (
int ni = 0; ni < 3; ++ni )
1697 apply_rotated_diag( ni, ni + 3, u_n_surf );
1706 if ( !diagonal_ && cmb_freeslip )
1708 project_tangential_inplace( nx00, ny00, nz00, dst8[0][0], dst8[1][0], dst8[2][0] );
1709 project_tangential_inplace( nx10, ny10, nz10, dst8[0][1], dst8[1][1], dst8[2][1] );
1710 project_tangential_inplace( nx01, ny01, nz01, dst8[0][2], dst8[1][2], dst8[2][2] );
1711 project_tangential_inplace( nx11, ny11, nz11, dst8[0][6], dst8[1][6], dst8[2][6] );
1713 if ( !diagonal_ && surf_freeslip )
1715 project_tangential_inplace( nx00, ny00, nz00, dst8[0][3], dst8[1][3], dst8[2][3] );
1716 project_tangential_inplace( nx10, ny10, nz10, dst8[0][4], dst8[1][4], dst8[2][4] );
1717 project_tangential_inplace( nx01, ny01, nz01, dst8[0][5], dst8[1][5], dst8[2][5] );
1718 project_tangential_inplace( nx11, ny11, nz11, dst8[0][7], dst8[1][7], dst8[2][7] );
1724 if ( !diagonal_ && ( cmb_freeslip || surf_freeslip ) )
1726 for (
int dim = 0;
dim < 3; ++
dim )
1727 for (
int u = 0; u < 8; ++u )
1728 dst8[dim][u] += normal_corr[dim][u];
1731 for (
int dim_add = 0; dim_add < 3; ++dim_add )
1733 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell, dim_add ), dst8[dim_add][0] );
1734 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell, dim_add ), dst8[dim_add][1] );
1735 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell, dim_add ), dst8[dim_add][2] );
1736 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell + 1, dim_add ), dst8[dim_add][3] );
1738 &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1, dim_add ), dst8[dim_add][4] );
1740 &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][5] );
1742 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell, dim_add ), dst8[dim_add][6] );
1744 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][7] );
1758 KOKKOS_INLINE_FUNCTION
1761 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
1762 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
1764 if ( tr >= r_tile_ )
1767 const bool at_cmb =
has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
1768 const bool at_surface =
has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
1770 if ( kernel_path_ == KernelPath::Slow )
1773 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
1775 else if ( kernel_path_ == KernelPath::FastFreeslip )
1777 operator_fast_freeslip_path(
1778 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
1782 operator_fast_dirichlet_neumann_path(
1783 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
1802 const auto det = J.det();
1803 const auto abs_det = Kokkos::abs( det );
1809 k_eval +=
shape( k, quad_point ) * k_local_hex[wedge]( k );
1814 sym_grad_i[k] =
symmetric_grad( J_inv_transposed, quad_point, k, dimi );
1815 sym_grad_j[k] =
symmetric_grad( J_inv_transposed, quad_point, k, dimj );
1818 jdet_keval_quadweight = quad_weight * k_eval * abs_det;
1824 KOKKOS_INLINE_FUNCTION
1826 const int local_subdomain_id,
1830 const int wedge )
const
1835 const ScalarT r_1 = radii_( local_subdomain_id, r_cell );
1836 const ScalarT r_2 = radii_( local_subdomain_id, r_cell + 1 );
1843 for (
int dimi = 0; dimi < 3; ++dimi )
1845 for (
int dimj = 0; dimj < 3; ++dimj )
1847 for (
int q = 0; q < num_quad_points; q++ )
1865 jdet_keval_quadweight );
1872 jdet_keval_quadweight *
1874 2.0 / 3.0 * sym_grad_j[j]( dimj, dimj ) * sym_grad_i[i]( dimi, dimi ) );