60 using Team = Kokkos::TeamPolicy<>::member_type;
72 BoundaryConditions bcs_;
93 int local_subdomains_;
96 int lat_refinement_level_;
118 bool use_slow_path_ =
false;
130 void update_kernel_path_flag_host_only()
132 const BoundaryConditionFlag cmb_bc = get_boundary_condition_flag( bcs_, CMB );
133 const BoundaryConditionFlag surface_bc = get_boundary_condition_flag( bcs_, SURFACE );
135 const bool has_freeslip_bc = ( cmb_bc == FREESLIP ) || ( surface_bc == FREESLIP );
138 use_slow_path_ = has_freeslip_bc || has_stored_matrices;
148 BoundaryConditions bcs,
159 , diagonal_( diagonal )
160 , operator_apply_mode_( operator_apply_mode )
161 , operator_communication_mode_( operator_communication_mode )
162 , operator_stored_matrix_mode_( operator_stored_matrix_mode )
163 , recv_buffers_( domain )
164 , comm_plan_( domain )
173 local_subdomains_ = domain_.
subdomains().size();
182 lat_tiles_ = ( hex_lat_ + lat_tile_ - 1 ) / lat_tile_;
183 r_tiles_ = ( hex_rad_ + r_tile_ - 1 ) / r_tile_;
185 team_size_ = lat_tile_ * lat_tile_ * r_tile_;
186 blocks_ = local_subdomains_ * lat_tiles_ * lat_tiles_ * r_tiles_;
188 r_min_ = domain_info.
radii()[0];
189 r_max_ = domain_info.
radii()[domain_info.
radii().size() - 1];
191 update_kernel_path_flag_host_only();
193 util::logroot <<
"[EpsilonDivDiv] tile size (x,y,r)=(" << lat_tile_ <<
"," << lat_tile_ <<
"," << r_tile_ <<
")"
195 util::logroot <<
"[EpsilonDivDiv] number of tiles (x,y,r)=(" << lat_tiles_ <<
"," << lat_tiles_ <<
","
196 << r_tiles_ <<
"), team_size=" << team_size_ <<
", blocks=" << blocks_ << std::endl;
197 util::logroot <<
"[EpsilonDivDiv] kernel path = " << ( use_slow_path_ ?
"slow" :
"fast" ) << std::endl;
204 operator_apply_mode_ = operator_apply_mode;
205 operator_communication_mode_ = operator_communication_mode;
215 update_kernel_path_flag_host_only();
224 KOKKOS_INLINE_FUNCTION
226 const int local_subdomain_id,
232 return util::has_flag( mask_( local_subdomain_id, x_cell, y_cell, r_cell ), flag );
246 operator_stored_matrix_mode_ = operator_stored_matrix_mode;
251 domain_, operator_stored_matrix_mode_, level_range, GCAElements );
254 update_kernel_path_flag_host_only();
260 KOKKOS_INLINE_FUNCTION
262 const int local_subdomain_id,
270 local_matrix_storage_.
set_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge, mat );
276 KOKKOS_INLINE_FUNCTION
278 const int local_subdomain_id,
282 const int wedge )
const
286 if ( !local_matrix_storage_.
has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge ) )
288 Kokkos::abort(
"No matrix found at that spatial index." );
290 return local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge );
315 util::Timer timer_kernel(
"epsilon_divdiv_kernel" );
316 Kokkos::TeamPolicy<> policy( blocks_, team_size_ );
319 if ( !use_slow_path_ )
321 policy.set_scratch_size( 0, Kokkos::PerTeam(
team_shmem_size( team_size_ ) ) );
325 if ( use_slow_path_ )
327 Kokkos::parallel_for(
330 KOKKOS_CLASS_LAMBDA(
const Team& team ) {
331 this->operator_slow_kernel( team );
336 Kokkos::parallel_for(
339 KOKKOS_CLASS_LAMBDA(
const Team& team ) {
340 this->operator_fast_kernel( team );
363 KOKKOS_INLINE_FUNCTION
377 E00 = E11 = E22 = sym01 = sym02 = sym12 = gdd = 0.0;
410 KOKKOS_INLINE_FUNCTION
413 const int nlev = r_tile_ + 1;
414 const int n = lat_tile_ + 1;
415 const int nxy = n * n;
417 const size_t nscalars =
419 size_t( nxy ) * 3 * nlev +
420 size_t( nxy ) * nlev +
435 KOKKOS_INLINE_FUNCTION
436 void decode_team_indices(
438 int& local_subdomain_id,
449 int tmp = team.league_rank();
451 const int r_tile_id = tmp % r_tiles_;
454 const int lat_y_id = tmp % lat_tiles_;
457 const int lat_x_id = tmp % lat_tiles_;
460 local_subdomain_id = tmp;
462 x0 = lat_x_id * lat_tile_;
463 y0 = lat_y_id * lat_tile_;
464 r0 = r_tile_id * r_tile_;
466 const int tid = team.team_rank();
467 tx = tid % lat_tile_;
468 ty = ( tid / lat_tile_ ) % lat_tile_;
469 tr = tid / ( lat_tile_ * lat_tile_ );
481 KOKKOS_INLINE_FUNCTION
482 void operator_slow_kernel(
const Team& team )
const
484 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
485 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
490 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
491 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
494 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
502 KOKKOS_INLINE_FUNCTION
503 void operator_fast_kernel(
const Team& team )
const
505 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
506 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
511 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
512 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
515 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
534 KOKKOS_INLINE_FUNCTION
535 void operator_slow_path(
537 const int local_subdomain_id,
548 const bool at_surface )
const
559 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ || r_cell >= hex_rad_ )
567 A[0] = local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
568 A[1] = local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
572 if ( local_matrix_storage_.
has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 ) &&
573 local_matrix_storage_.
has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 ) )
575 A[0] = local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
576 A[1] = local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
592 for (
int dimj = 0; dimj < 3; dimj++ )
608 dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > boundary_mask;
609 boundary_mask.fill( 1.0 );
611 bool freeslip_reorder =
false;
614 if ( at_cmb || at_surface )
619 if ( bcf == DIRICHLET )
622 for (
int dimi = 0; dimi < 3; ++dimi )
624 for (
int dimj = 0; dimj < 3; ++dimj )
630 if ( ( at_cmb && ( ( dimi == dimj && i != j && ( i < 3 || j < 3 ) ) ||
631 ( dimi != dimj && ( i < 3 || j < 3 ) ) ) ) ||
632 ( at_surface && ( ( dimi == dimj && i != j && ( i >= 3 || j >= 3 ) ) ||
633 ( dimi != dimj && ( i >= 3 || j >= 3 ) ) ) ) )
644 else if ( bcf == FREESLIP )
649 freeslip_reorder =
true;
653 for (
int wedge = 0; wedge < 2; ++wedge )
655 for (
int dimi = 0; dimi < 3; ++dimi )
659 for (
int dimj = 0; dimj < 3; ++dimj )
663 A_tmp[wedge]( node_idxi * 3 + dimi, node_idxj * 3 + dimj ) = A[wedge](
674 constexpr int layer_hex_offset_x[2][3] = { { 0, 1, 0 }, { 1, 0, 1 } };
675 constexpr int layer_hex_offset_y[2][3] = { { 0, 0, 1 }, { 1, 1, 0 } };
677 for (
int wedge = 0; wedge < 2; ++wedge )
680 for (
int i = 0; i <
LocalMatrixDim; ++i ) { R[wedge]( i, i ) = 1.0; }
683 for (
int boundary_node_idx = 0; boundary_node_idx < 3; boundary_node_idx++ )
687 x_cell + layer_hex_offset_x[wedge][boundary_node_idx],
688 y_cell + layer_hex_offset_y[wedge][boundary_node_idx],
689 r_cell + ( at_cmb ? 0 : 1 ),
695 const int offset_in_R = at_cmb ? 0 : 9;
696 for (
int dimi = 0; dimi < 3; ++dimi )
698 for (
int dimj = 0; dimj < 3; ++dimj )
701 offset_in_R + boundary_node_idx * 3 + dimi,
702 offset_in_R + boundary_node_idx * 3 + dimj ) = R_i( dimi, dimj );
708 A[wedge] = R[wedge] * A_tmp[wedge] * R[wedge].
transposed();
710 auto src_tmp = R[wedge] * src[wedge];
711 for (
int i = 0; i < 18; ++i ) { src[wedge]( i ) = src_tmp( i ); }
714 const int node_start = at_surface ? 3 : 0;
715 const int node_end = at_surface ? 6 : 3;
717 for (
int node_idx = node_start; node_idx < node_end; node_idx++ )
719 const int idx = node_idx * 3;
720 for (
int k = 0; k < 18; ++k )
724 boundary_mask( idx, k ) = 0.0;
725 boundary_mask( k, idx ) = 0.0;
731 else if ( bcf == NEUMANN )
751 dst[0] = A[0] * src[0];
752 dst[1] = A[1] * src[1];
755 if ( freeslip_reorder )
759 dst_tmp[1] = R[1].transposed() * dst[1];
761 for (
int i = 0; i < 18; ++i )
763 dst[0]( i ) = dst_tmp[0]( i );
764 dst[1]( i ) = dst_tmp[1]( i );
772 for (
int dimi = 0; dimi < 3; dimi++ )
779 dst_, local_subdomain_id, x_cell, y_cell, r_cell, dimi, dst_d );
800 KOKKOS_INLINE_FUNCTION
801 void operator_fast_path(
803 const int local_subdomain_id,
814 const bool at_surface )
const
817 const int nlev = r_tile_ + 1;
818 const int nxy = ( lat_tile_ + 1 ) * ( lat_tile_ + 1 );
822 double* shmem =
reinterpret_cast< double*
>(
825 using ScratchCoords =
826 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
828 Kokkos::View< double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
830 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
832 ScratchCoords coords_sh( shmem, nxy, 3 );
835 ScratchSrc src_sh( shmem, nxy, 3, nlev );
836 shmem += nxy * 3 * nlev;
838 ScratchK k_sh( shmem, nxy, nlev );
842 Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >(
845 auto node_id = [&](
int nx,
int ny ) ->
int {
return nx + ( lat_tile_ + 1 ) * ny; };
849 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&](
int n ) {
850 const int dxn = n % ( lat_tile_ + 1 );
851 const int dyn = n / ( lat_tile_ + 1 );
852 const int xi = x0 + dxn;
853 const int yi = y0 + dyn;
855 if ( xi <= hex_lat_ && yi <= hex_lat_ )
857 coords_sh( n, 0 ) = grid_( local_subdomain_id, xi, yi, 0 );
858 coords_sh( n, 1 ) = grid_( local_subdomain_id, xi, yi, 1 );
859 coords_sh( n, 2 ) = grid_( local_subdomain_id, xi, yi, 2 );
863 coords_sh( n, 0 ) = coords_sh( n, 1 ) = coords_sh( n, 2 ) = 0.0;
868 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&](
int lvl ) {
869 const int rr = r0 + lvl;
870 r_sh( lvl ) = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
874 const int total_pairs = nxy * nlev;
875 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total_pairs ), [&](
int t ) {
876 const int lvl = t / nxy;
877 const int node = t - lvl * nxy;
879 const int dxn = node % ( lat_tile_ + 1 );
880 const int dyn = node / ( lat_tile_ + 1 );
882 const int xi = x0 + dxn;
883 const int yi = y0 + dyn;
884 const int rr = r0 + lvl;
886 if ( xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_ )
888 k_sh( node, lvl ) = k_( local_subdomain_id, xi, yi, rr );
889 src_sh( node, 0, lvl ) = src_( local_subdomain_id, xi, yi, rr, 0 );
890 src_sh( node, 1, lvl ) = src_( local_subdomain_id, xi, yi, rr, 1 );
891 src_sh( node, 2, lvl ) = src_( local_subdomain_id, xi, yi, rr, 2 );
895 k_sh( node, lvl ) = 0.0;
896 src_sh( node, 0, lvl ) = src_sh( node, 1, lvl ) = src_sh( node, 2, lvl ) = 0.0;
903 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ || r_cell >= hex_rad_ )
907 const double r_0 = r_sh( lvl0 );
908 const double r_1 = r_sh( lvl0 + 1 );
912 const bool at_boundary = at_cmb || at_surface;
913 bool treat_boundary_dirichlet =
false;
922 const int cmb_shift = ( ( at_boundary && treat_boundary_dirichlet && ( !diagonal_ ) && at_cmb ) ? 3 : 0 );
924 ( ( at_boundary && treat_boundary_dirichlet && ( !diagonal_ ) && at_surface ) ? 3 : 0 );
927 static constexpr int WEDGE_NODE_OFF[2][6][3] = {
928 { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
929 { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };
932 static constexpr int WEDGE_TO_UNIQUE[2][6] = {
933 { 0, 1, 2, 3, 4, 5 },
937 constexpr double ONE_THIRD = 1.0 / 3.0;
938 constexpr double ONE_SIXTH = 1.0 / 6.0;
939 constexpr double NEG_TWO_THIRDS = -0.66666666666666663;
942 static constexpr double dN_ref[6][3] = {
943 { -0.5, -0.5, -ONE_SIXTH },
944 { 0.5, 0.0, -ONE_SIXTH },
945 { 0.0, 0.5, -ONE_SIXTH },
946 { -0.5, -0.5, ONE_SIXTH },
947 { 0.5, 0.0, ONE_SIXTH },
948 { 0.0, 0.5, ONE_SIXTH } };
951 const int n00 = node_id( tx, ty );
952 const int n01 = node_id( tx, ty + 1 );
953 const int n10 = node_id( tx + 1, ty );
954 const int n11 = node_id( tx + 1, ty + 1 );
960 ws[0][0][0] = coords_sh( n00, 0 );
961 ws[0][0][1] = coords_sh( n00, 1 );
962 ws[0][0][2] = coords_sh( n00, 2 );
963 ws[0][1][0] = coords_sh( n10, 0 );
964 ws[0][1][1] = coords_sh( n10, 1 );
965 ws[0][1][2] = coords_sh( n10, 2 );
966 ws[0][2][0] = coords_sh( n01, 0 );
967 ws[0][2][1] = coords_sh( n01, 1 );
968 ws[0][2][2] = coords_sh( n01, 2 );
971 ws[1][0][0] = coords_sh( n11, 0 );
972 ws[1][0][1] = coords_sh( n11, 1 );
973 ws[1][0][2] = coords_sh( n11, 2 );
974 ws[1][1][0] = coords_sh( n01, 0 );
975 ws[1][1][1] = coords_sh( n01, 1 );
976 ws[1][1][2] = coords_sh( n01, 2 );
977 ws[1][2][0] = coords_sh( n10, 0 );
978 ws[1][2][1] = coords_sh( n10, 1 );
979 ws[1][2][2] = coords_sh( n10, 2 );
983 double dst8[3][8] = { 0.0 };
985 for (
int w = 0; w < 2; ++w )
990 for (
int node = 0; node < 6; ++node )
992 const int ddx = WEDGE_NODE_OFF[w][node][0];
993 const int ddy = WEDGE_NODE_OFF[w][node][1];
994 const int ddr = WEDGE_NODE_OFF[w][node][2];
996 const int nid = node_id( tx + ddx, ty + ddy );
997 const int lvl = lvl0 + ddr;
999 k_sum += k_sh( nid, lvl );
1001 const double k_eval = ONE_SIXTH * k_sum;
1005 double i00, i01, i02;
1006 double i10, i11, i12;
1007 double i20, i21, i22;
1010 const double half_dr = 0.5 * ( r_1 - r_0 );
1011 const double r_mid = 0.5 * ( r_0 + r_1 );
1013 const double J_0_0 = r_mid * ( -ws[w][0][0] + ws[w][1][0] );
1014 const double J_0_1 = r_mid * ( -ws[w][0][0] + ws[w][2][0] );
1015 const double J_0_2 = half_dr * ( ONE_THIRD * ( ws[w][0][0] + ws[w][1][0] + ws[w][2][0] ) );
1017 const double J_1_0 = r_mid * ( -ws[w][0][1] + ws[w][1][1] );
1018 const double J_1_1 = r_mid * ( -ws[w][0][1] + ws[w][2][1] );
1019 const double J_1_2 = half_dr * ( ONE_THIRD * ( ws[w][0][1] + ws[w][1][1] + ws[w][2][1] ) );
1021 const double J_2_0 = r_mid * ( -ws[w][0][2] + ws[w][1][2] );
1022 const double J_2_1 = r_mid * ( -ws[w][0][2] + ws[w][2][2] );
1023 const double J_2_2 = half_dr * ( ONE_THIRD * ( ws[w][0][2] + ws[w][1][2] + ws[w][2][2] ) );
1025 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 - J_0_1 * J_1_0 * J_2_2 +
1026 J_0_1 * J_1_2 * J_2_0 + J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
1028 const double invJ = 1.0 /
J_det;
1030 i00 = invJ * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
1031 i01 = invJ * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
1032 i02 = invJ * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
1033 i10 = invJ * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
1034 i11 = invJ * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
1035 i12 = invJ * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
1036 i20 = invJ * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
1037 i21 = invJ * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
1038 i22 = invJ * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
1040 wJ = Kokkos::abs( J_det );
1043 const double kwJ =
k_eval * wJ;
1049 double gu10 = 0.0, gu11 = 0.0;
1050 double gu20 = 0.0, gu21 = 0.0, gu22 = 0.0;
1056 for (
int dimj = 0; dimj < 3; ++dimj )
1059 for (
int node_idx = cmb_shift; node_idx < 6 -
surface_shift; ++node_idx )
1061 const double gx = dN_ref[node_idx][0];
1062 const double gy = dN_ref[node_idx][1];
1063 const double gz = dN_ref[node_idx][2];
1065 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1066 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1067 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1069 double E00, E11, E22, sym01, sym02, sym12, gdd;
1070 column_grad_to_sym( dimj, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );
1072 const int ddx = WEDGE_NODE_OFF[w][node_idx][0];
1073 const int ddy = WEDGE_NODE_OFF[w][node_idx][1];
1074 const int ddr = WEDGE_NODE_OFF[w][node_idx][2];
1076 const int nid = node_id( tx + ddx, ty + ddy );
1077 const int lvl = lvl0 + ddr;
1079 const double s = src_sh( nid, dimj, lvl );
1092 for (
int dimi = 0; dimi < 3; ++dimi )
1095 for (
int node_idx = cmb_shift; node_idx < 6 -
surface_shift; ++node_idx )
1097 const double gx = dN_ref[node_idx][0];
1098 const double gy = dN_ref[node_idx][1];
1099 const double gz = dN_ref[node_idx][2];
1101 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1102 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1103 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1105 double E00, E11, E22, sym01, sym02, sym12, gdd;
1106 column_grad_to_sym( dimi, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );
1108 const int u = WEDGE_TO_UNIQUE[w][node_idx];
1110 dst8[dimi][u] += kwJ * ( NEG_TWO_THIRDS *
div_u * gdd +
1111 4.0 * sym01 * gu10 +
1112 4.0 * sym02 * gu20 +
1113 4.0 * sym12 * gu21 +
1122 if ( diagonal_ || ( treat_boundary_dirichlet && at_boundary ) )
1124 for (
int dim_diagBC = 0; dim_diagBC < 3; ++dim_diagBC )
1127 for (
int node_idx = surface_shift; node_idx < 6 -
cmb_shift; ++node_idx )
1129 const double gx = dN_ref[node_idx][0];
1130 const double gy = dN_ref[node_idx][1];
1131 const double gz = dN_ref[node_idx][2];
1133 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1134 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1135 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1137 double E00, E11, E22, sym01, sym02, sym12, gdd;
1138 column_grad_to_sym( dim_diagBC, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );
1140 const int ddx = WEDGE_NODE_OFF[w][node_idx][0];
1141 const int ddy = WEDGE_NODE_OFF[w][node_idx][1];
1142 const int ddr = WEDGE_NODE_OFF[w][node_idx][2];
1144 const int nid = node_id( tx + ddx, ty + ddy );
1145 const int lvl = lvl0 + ddr;
1147 const double s = src_sh( nid, dim_diagBC, lvl );
1148 const int u = WEDGE_TO_UNIQUE[w][node_idx];
1150 dst8[dim_diagBC][u] +=
1151 kwJ * ( 4.0 * s * ( sym01 * sym01 + sym02 * sym02 + sym12 * sym12 ) +
1152 2.0 * s * ( E00 * E00 + E11 * E11 + E22 * E22 ) +
1153 NEG_TWO_THIRDS * ( gdd * gdd ) * s );
1160 for (
int dim_add = 0; dim_add < 3; ++dim_add )
1162 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell, dim_add ), dst8[dim_add][0] );
1163 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell, dim_add ), dst8[dim_add][1] );
1164 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell, dim_add ), dst8[dim_add][2] );
1165 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell + 1, dim_add ), dst8[dim_add][3] );
1166 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1, dim_add ), dst8[dim_add][4] );
1167 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][5] );
1168 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell, dim_add ), dst8[dim_add][6] );
1169 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][7] );
1180 KOKKOS_INLINE_FUNCTION
1183 if ( use_slow_path_ ) { operator_slow_kernel( team ); }
1184 else { operator_fast_kernel( team ); }
1218 const auto det = J.det();
1219 const auto abs_det = Kokkos::abs( det );
1225 k_eval +=
shape( k, quad_point ) * k_local_hex[wedge]( k );
1230 sym_grad_i[k] =
symmetric_grad( J_inv_transposed, quad_point, k, dimi );
1231 sym_grad_j[k] =
symmetric_grad( J_inv_transposed, quad_point, k, dimj );
1234 jdet_keval_quadweight = quad_weight * k_eval * abs_det;
1242 KOKKOS_INLINE_FUNCTION
1244 const int local_subdomain_id,
1248 const int wedge )
const
1253 const ScalarT r_1 = radii_( local_subdomain_id, r_cell );
1254 const ScalarT r_2 = radii_( local_subdomain_id, r_cell + 1 );
1261 for (
int dimi = 0; dimi < 3; ++dimi )
1263 for (
int dimj = 0; dimj < 3; ++dimj )
1265 for (
int q = 0; q < num_quad_points; q++ )
1283 jdet_keval_quadweight );
1290 jdet_keval_quadweight *
1292 2.0 / 3.0 * sym_grad_j[j]( dimj, dimj ) * sym_grad_i[i]( dimi, dimi ) );