71 using Team = Kokkos::TeamPolicy<>::member_type;
83 BoundaryConditions bcs_;
104 int local_subdomains_;
107 int lat_refinement_level_;
128 enum class KernelPath
131 FastDirichletNeumann,
134 KernelPath kernel_path_ = KernelPath::FastDirichletNeumann;
137 bool penalty_active_ =
false;
138 ScalarT penalty_epsilon_ = 1.0;
151 void update_kernel_path_flag_host_only()
154 if constexpr ( std::is_same_v< Kokkos::DefaultExecutionSpace, Kokkos::Serial > )
156 kernel_path_ = KernelPath::Slow;
160 const BoundaryConditionFlag cmb_bc = get_boundary_condition_flag( bcs_, CMB );
161 const BoundaryConditionFlag surface_bc = get_boundary_condition_flag( bcs_, SURFACE );
163 const bool has_freeslip = ( cmb_bc == FREESLIP ) || ( surface_bc == FREESLIP );
166 if ( has_stored_matrices )
167 kernel_path_ = KernelPath::Slow;
168 else if ( has_freeslip )
169 kernel_path_ = KernelPath::FastFreeslip;
171 kernel_path_ = KernelPath::FastDirichletNeumann;
181 BoundaryConditions bcs,
192 , diagonal_( diagonal )
193 , operator_apply_mode_( operator_apply_mode )
194 , operator_communication_mode_( operator_communication_mode )
195 , operator_stored_matrix_mode_( operator_stored_matrix_mode )
196 , recv_buffers_( domain )
197 , comm_plan_( domain )
206 local_subdomains_ = domain_.
subdomains().size();
212 if constexpr ( std::is_same_v< Kokkos::DefaultExecutionSpace, Kokkos::Serial > )
218#ifdef KOKKOS_ENABLE_OPENMP
221 else if constexpr ( std::is_same_v< Kokkos::DefaultExecutionSpace, Kokkos::OpenMP > )
223 const int max_team = std::min(
224 Kokkos::OpenMP().concurrency(),
225 static_cast< int >( Kokkos::Impl::HostThreadTeamData::max_team_members ) );
226 if ( max_team >= 64 )
232 else if ( max_team >= 16 )
252 r_tile_block_ = r_tile_ * r_passes_;
254 lat_tiles_ = ( hex_lat_ + lat_tile_ - 1 ) / lat_tile_;
255 r_tiles_ = ( hex_rad_ + r_tile_block_ - 1 ) / r_tile_block_;
257 team_size_ = lat_tile_ * lat_tile_ * r_tile_;
258 blocks_ = local_subdomains_ * lat_tiles_ * lat_tiles_ * r_tiles_;
260 r_min_ = domain_info.
radii()[0];
261 r_max_ = domain_info.
radii()[domain_info.
radii().size() - 1];
264 update_kernel_path_flag_host_only();
266 util::logroot <<
"[EpsilonDivDiv] tile size (x,y,r)=(" << lat_tile_ <<
"," << lat_tile_ <<
"," << r_tile_
267 <<
"), r_passes=" << r_passes_ << std::endl;
268 util::logroot <<
"[EpsilonDivDiv] number of tiles (x,y,r)=(" << lat_tiles_ <<
"," << lat_tiles_ <<
","
269 << r_tiles_ <<
"), team_size=" << team_size_ <<
", blocks=" << blocks_ << std::endl;
270 const char* path_name = ( kernel_path_ == KernelPath::Slow ) ?
"slow" :
271 ( kernel_path_ == KernelPath::FastFreeslip ) ?
"fast-freeslip" :
272 "fast-dirichlet-neumann";
273 util::logroot <<
"[EpsilonDivDiv] kernel path = " << path_name << std::endl;
289 const BoundaryConditionFlag cmb_bc = get_boundary_condition_flag( bcs_, CMB );
290 const BoundaryConditionFlag surface_bc = get_boundary_condition_flag( bcs_, SURFACE );
292 penalty_active_ = ( cmb_bc == FREESLIP ) && ( surface_bc == FREESLIP );
293 if ( !penalty_active_ )
296 util::logroot <<
"[EpsilonDivDiv] Initializing fs/fs null-space penalty (epsilon=" << penalty_epsilon_ <<
")"
305 auto grid_local = grid_;
306 auto radii_local = radii_;
307 auto mask_local = mask_;
310 for (
int axis = 0; axis < 3; ++axis )
313 "penalty_null_mode_" + std::to_string( axis ), nsub, nlat, nlat, nrad );
315 auto nm = null_modes_[axis];
317 Kokkos::parallel_for(
318 "penalty_fill_mode_" + std::to_string( axis ),
319 Kokkos::MDRangePolicy< Kokkos::Rank< 4 > >( { 0, 0, 0, 0 }, { nsub, nlat, nlat, nrad } ),
320 KOKKOS_LAMBDA(
int s,
int x,
int y,
int r ) {
324 const int a1 = ( axis + 1 ) % 3;
325 const int a2 = ( axis + 2 ) % 3;
326 for (
int d = 0; d < VecDim; ++d )
327 nm( s, x, y, r, d ) = 0.0;
328 nm( s, x, y, r, a1 ) = c( a2 );
329 nm( s, x, y, r, a2 ) = -c( a1 );
336 static_cast< uint8_t
>( CMB ) |
static_cast< uint8_t
>( SURFACE ) );
338 Kokkos::parallel_for(
339 "penalty_fs_enforce_" + std::to_string( axis ),
340 Kokkos::MDRangePolicy< Kokkos::Rank< 4 > >( { 0, 0, 0, 0 }, { nsub, nlat, nlat, nrad } ),
341 KOKKOS_LAMBDA(
int s,
int x,
int y,
int r ) {
342 if ( !
util::has_flag( mask_local( s, x, y, r ), freeslip_boundary_mask ) )
346 for (
int d = 0; d < 3; ++d )
347 v( d ) = nm( s, x, y, r, d );
350 for (
int d = 0; d < 3; ++d )
351 normal( d ) = grid_local( s, x, y, d );
354 const auto R = trafo_mat_cartesian_to_normal_tangential( normal );
361 const auto Rt = R.transposed();
362 const auto cart = Rt * nt;
364 for (
int d = 0; d < 3; ++d )
365 nm( s, x, y, r, d ) = cart( d );
371 for (
int i = 0; i < 3; ++i )
374 for (
int j = 0; j < i; ++j )
379 auto ni = null_modes_[i];
380 auto nj = null_modes_[j];
381 Kokkos::parallel_for(
382 "penalty_gs_subtract",
383 Kokkos::MDRangePolicy< Kokkos::Rank< 4 > >( { 0, 0, 0, 0 }, { nsub, nlat, nlat, nrad } ),
384 KOKKOS_LAMBDA(
int s,
int x,
int y,
int r ) {
385 for (
int d = 0; d < VecDim; ++d )
386 ni( s, x, y, r, d ) -= dot_ij * nj( s, x, y, r, d );
394 ScalarType inv_norm = 1.0 / Kokkos::sqrt( norm_sq );
396 auto ni = null_modes_[i];
397 Kokkos::parallel_for(
398 "penalty_gs_normalize",
399 Kokkos::MDRangePolicy< Kokkos::Rank< 4 > >( { 0, 0, 0, 0 }, { nsub, nlat, nlat, nrad } ),
400 KOKKOS_LAMBDA(
int s,
int x,
int y,
int r ) {
401 for (
int d = 0; d < VecDim; ++d )
402 ni( s, x, y, r, d ) *= inv_norm;
407 util::logroot <<
"[EpsilonDivDiv] Null-space penalty initialized (3 modes)" << std::endl;
414 operator_apply_mode_ = operator_apply_mode;
415 operator_communication_mode_ = operator_communication_mode;
426 update_kernel_path_flag_host_only();
434 KOKKOS_INLINE_FUNCTION
436 const int local_subdomain_id,
441 {
return util::has_flag( mask_( local_subdomain_id, x_cell, y_cell, r_cell ), flag ); }
448 operator_stored_matrix_mode_ = operator_stored_matrix_mode;
453 domain_, operator_stored_matrix_mode_, level_range, GCAElements );
457 update_kernel_path_flag_host_only();
459 util::logroot <<
"[EpsilonDivDiv] (set_stored_matrix_mode) kernel path = "
460 << ( ( kernel_path_ == KernelPath::Slow ) ?
"slow" :
461 ( kernel_path_ == KernelPath::FastFreeslip ) ?
"fast-freeslip" :
462 "fast-dirichlet-neumann" )
468 KOKKOS_INLINE_FUNCTION
470 const int local_subdomain_id,
478 local_matrix_storage_.
set_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge, mat );
481 KOKKOS_INLINE_FUNCTION
483 const int local_subdomain_id,
487 const int wedge )
const
491 if ( !local_matrix_storage_.
has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge ) )
493 Kokkos::abort(
"No matrix found at that spatial index." );
495 return local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge );
525 util::Timer timer_kernel(
"epsilon_divdiv_kernel" );
526 Kokkos::TeamPolicy<> policy( blocks_, team_size_ );
529 if ( kernel_path_ != KernelPath::Slow )
531 policy.set_scratch_size( 0, Kokkos::PerTeam(
team_shmem_size( team_size_ ) ) );
535 if ( kernel_path_ == KernelPath::Slow )
537 Kokkos::parallel_for(
538 "epsilon_divdiv_apply_kernel_slow", policy, KOKKOS_CLASS_LAMBDA(
const Team& team ) {
539 this->run_team_slow( team );
542 else if ( kernel_path_ == KernelPath::FastFreeslip )
546 Kokkos::parallel_for(
547 "epsilon_divdiv_apply_kernel_fast_fs_diag", policy, KOKKOS_CLASS_LAMBDA(
const Team& team ) {
548 this->
template run_team_fast_freeslip< true >( team );
553 Kokkos::parallel_for(
554 "epsilon_divdiv_apply_kernel_fast_fs_matvec", policy, KOKKOS_CLASS_LAMBDA(
const Team& team ) {
555 this->
template run_team_fast_freeslip< false >( team );
561 Kokkos::TeamPolicy< Kokkos::LaunchBounds< 128, 5 > > dn_policy( blocks_, team_size_ );
565 Kokkos::parallel_for(
566 "epsilon_divdiv_apply_kernel_fast_dn_diag", dn_policy, KOKKOS_CLASS_LAMBDA(
const Team& team ) {
567 this->
template run_team_fast_dirichlet_neumann< true >( team );
572 Kokkos::parallel_for(
573 "epsilon_divdiv_apply_kernel_fast_dn_matvec", dn_policy, KOKKOS_CLASS_LAMBDA(
const Team& team ) {
574 this->
template run_team_fast_dirichlet_neumann< false >( team );
583 if ( penalty_active_ && !diagonal_ )
585 for (
int i = 0; i < 3; ++i )
591 auto nm = null_modes_[i];
592 auto dst_local = dst_;
593 Kokkos::parallel_for(
595 Kokkos::MDRangePolicy< Kokkos::Rank< 4 > >(
597 { dst_local.extent( 0 ),
598 dst_local.extent( 1 ),
599 dst_local.extent( 2 ),
600 dst_local.extent( 3 ) } ),
601 KOKKOS_LAMBDA(
int s,
int x,
int y,
int r ) {
602 for (
int d = 0; d < VecDim; ++d )
603 dst_local( s, x, y, r, d ) += alpha * nm( s, x, y, r, d );
606 Kokkos::fence(
"penalty" );
626 KOKKOS_INLINE_FUNCTION
640 E00 = E11 = E22 = sym01 = sym02 = sym12 = gdd = 0.0;
671 KOKKOS_INLINE_FUNCTION
674 const int nlev = r_tile_block_ + 1;
675 const int n = lat_tile_ + 1;
676 const int nxy = n * n;
679 const size_t nscalars = size_t( nxy ) * 3 + size_t( nxy ) * 3 + size_t( nxy ) * 3 * nlev +
680 size_t( nxy ) * nlev + size_t( nlev ) + 1;
685 KOKKOS_INLINE_FUNCTION
688 const int nlev = r_tile_block_ + 1;
689 const int n = lat_tile_ + 1;
690 const int nxy = n * n;
693 const size_t nscalars = size_t( nxy ) * 3 + size_t( nxy ) * 3 * nlev + size_t( nxy ) * nlev + size_t( nlev );
705 KOKKOS_INLINE_FUNCTION
706 void decode_team_indices(
708 int& local_subdomain_id,
719 int tmp = team.league_rank();
721 const int r_tile_id = tmp % r_tiles_;
724 const int lat_y_id = tmp % lat_tiles_;
727 const int lat_x_id = tmp % lat_tiles_;
730 local_subdomain_id = tmp;
732 x0 = lat_x_id * lat_tile_;
733 y0 = lat_y_id * lat_tile_;
734 r0 = r_tile_id * r_tile_block_;
736 const int tid = team.team_rank();
738 tx = ( tid / r_tile_ ) % lat_tile_;
739 ty = tid / ( r_tile_ * lat_tile_ );
761 KOKKOS_INLINE_FUNCTION
762 void run_team_slow(
const Team& team )
const
764 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
765 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
770 for (
int pass = 0; pass < r_passes_; ++pass )
772 const int r_cell_pass = r0 + pass * r_tile_ + tr;
773 if ( r_cell_pass >= hex_rad_ )
776 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell_pass, CMB );
777 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell_pass + 1, SURFACE );
780 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell_pass, at_cmb, at_surface );
790 template <
bool Diagonal >
791 KOKKOS_INLINE_FUNCTION
void run_team_fast_dirichlet_neumann(
const Team& team )
const
793 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
794 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
799 operator_fast_dirichlet_neumann_path< Diagonal >(
800 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
809 template <
bool Diagonal >
810 KOKKOS_INLINE_FUNCTION
void run_team_fast_freeslip(
const Team& team )
const
812 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
813 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
818 operator_fast_freeslip_path< Diagonal >( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
822 KOKKOS_INLINE_FUNCTION
823 void operator_slow_path(
825 const int local_subdomain_id,
836 const bool at_surface )
const
846 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ || r_cell >= hex_rad_ )
853 A[0] = local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
854 A[1] = local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
858 if ( local_matrix_storage_.
has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 ) &&
859 local_matrix_storage_.
has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 ) )
861 A[0] = local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
862 A[1] = local_matrix_storage_.
get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
877 for (
int dimj = 0; dimj < 3; dimj++ )
891 dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > boundary_mask;
892 boundary_mask.fill( 1.0 );
894 bool freeslip_reorder =
false;
897 if ( at_cmb || at_surface )
902 if ( bcf == DIRICHLET )
904 for (
int dimi = 0; dimi < 3; ++dimi )
906 for (
int dimj = 0; dimj < 3; ++dimj )
912 if ( ( at_cmb && ( ( dimi == dimj && i != j && ( i < 3 || j < 3 ) ) ||
913 ( dimi != dimj && ( i < 3 || j < 3 ) ) ) ) ||
914 ( at_surface && ( ( dimi == dimj && i != j && ( i >= 3 || j >= 3 ) ) ||
915 ( dimi != dimj && ( i >= 3 || j >= 3 ) ) ) ) )
925 else if ( bcf == FREESLIP )
927 freeslip_reorder =
true;
930 for (
int wedge = 0; wedge < 2; ++wedge )
932 for (
int dimi = 0; dimi < 3; ++dimi )
936 for (
int dimj = 0; dimj < 3; ++dimj )
940 A_tmp[wedge]( node_idxi * 3 + dimi, node_idxj * 3 + dimj ) = A[wedge](
950 constexpr int layer_hex_offset_x[2][3] = { { 0, 1, 0 }, { 1, 0, 1 } };
951 constexpr int layer_hex_offset_y[2][3] = { { 0, 0, 1 }, { 1, 1, 0 } };
953 for (
int wedge = 0; wedge < 2; ++wedge )
957 R[wedge]( i, i ) = 1.0;
960 for (
int boundary_node_idx = 0; boundary_node_idx < 3; boundary_node_idx++ )
964 x_cell + layer_hex_offset_x[wedge][boundary_node_idx],
965 y_cell + layer_hex_offset_y[wedge][boundary_node_idx],
966 r_cell + ( at_cmb ? 0 : 1 ),
972 const int offset_in_R = at_cmb ? 0 : 9;
973 for (
int dimi = 0; dimi < 3; ++dimi )
975 for (
int dimj = 0; dimj < 3; ++dimj )
978 offset_in_R + boundary_node_idx * 3 + dimi,
979 offset_in_R + boundary_node_idx * 3 + dimj ) = R_i( dimi, dimj );
984 A[wedge] = R[wedge] * A_tmp[wedge] * R[wedge].transposed();
986 auto src_tmp = R[wedge] * src[wedge];
987 for (
int i = 0; i < 18; ++i )
989 src[wedge]( i ) = src_tmp( i );
992 const int node_start = at_surface ? 3 : 0;
993 const int node_end = at_surface ? 6 : 3;
995 for (
int node_idx = node_start; node_idx < node_end; node_idx++ )
997 const int idx = node_idx * 3;
998 for (
int k = 0; k < 18; ++k )
1002 boundary_mask( idx, k ) = 0.0;
1003 boundary_mask( k, idx ) = 0.0;
1013 A[wedge].hadamard_product( boundary_mask );
1018 A[0] = A[0].diagonal();
1019 A[1] = A[1].diagonal();
1023 dst[0] = A[0] * src[0];
1024 dst[1] = A[1] * src[1];
1026 if ( freeslip_reorder )
1029 dst_tmp[0] = R[0].transposed() * dst[0];
1030 dst_tmp[1] = R[1].transposed() * dst[1];
1032 for (
int i = 0; i < 18; ++i )
1034 dst[0]( i ) = dst_tmp[0]( i );
1035 dst[1]( i ) = dst_tmp[1]( i );
1042 for (
int dimi = 0; dimi < 3; dimi++ )
1049 dst_, local_subdomain_id, x_cell, y_cell, r_cell, dimi, dst_d );
1054 template <
bool Diagonal >
1055 KOKKOS_INLINE_FUNCTION
void operator_fast_dirichlet_neumann_path(
1057 const int local_subdomain_id,
1065 const int y_cell )
const
1067 const int nlev = r_tile_block_ + 1;
1068 const int nxy = ( lat_tile_ + 1 ) * ( lat_tile_ + 1 );
1071 reinterpret_cast< double*
>( team.team_shmem().get_shmem(
team_shmem_size_dn( team.team_size() ) ) );
1073 using ScratchCoords =
1074 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1075 using ScratchSrc = Kokkos::
1076 View< double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1078 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1080 ScratchCoords coords_sh( shmem, nxy, 3 );
1083 ScratchSrc src_sh( shmem, nxy, 3, nlev );
1084 shmem += nxy * 3 * nlev;
1086 ScratchK k_sh( shmem, nxy, nlev );
1087 shmem += nxy * nlev;
1090 Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >(
1093 auto node_id = [&](
int nx,
int ny ) ->
int {
return nx + ( lat_tile_ + 1 ) * ny; };
1095 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&](
int n ) {
1096 const int dxn = n % ( lat_tile_ + 1 );
1097 const int dyn = n / ( lat_tile_ + 1 );
1098 const int xi = x0 + dxn;
1099 const int yi = y0 + dyn;
1101 if ( xi <= hex_lat_ && yi <= hex_lat_ )
1103 coords_sh( n, 0 ) = grid_( local_subdomain_id, xi, yi, 0 );
1104 coords_sh( n, 1 ) = grid_( local_subdomain_id, xi, yi, 1 );
1105 coords_sh( n, 2 ) = grid_( local_subdomain_id, xi, yi, 2 );
1109 coords_sh( n, 0 ) = coords_sh( n, 1 ) = coords_sh( n, 2 ) = 0.0;
1113 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&](
int lvl ) {
1114 const int rr = r0 + lvl;
1115 r_sh( lvl ) = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
1118 const int total_pairs = nxy * nlev;
1119 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total_pairs ), [&](
int t ) {
1120 const int node = t / nlev;
1121 const int lvl = t - node * nlev;
1123 const int dxn = node % ( lat_tile_ + 1 );
1124 const int dyn = node / ( lat_tile_ + 1 );
1126 const int xi = x0 + dxn;
1127 const int yi = y0 + dyn;
1128 const int rr = r0 + lvl;
1130 if ( xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_ )
1132 k_sh( node, lvl ) = k_( local_subdomain_id, xi, yi, rr );
1133 src_sh( node, 0, lvl ) = src_( local_subdomain_id, xi, yi, rr, 0 );
1134 src_sh( node, 1, lvl ) = src_( local_subdomain_id, xi, yi, rr, 1 );
1135 src_sh( node, 2, lvl ) = src_( local_subdomain_id, xi, yi, rr, 2 );
1139 k_sh( node, lvl ) = 0.0;
1140 src_sh( node, 0, lvl ) = src_sh( node, 1, lvl ) = src_sh( node, 2, lvl ) = 0.0;
1144 team.team_barrier();
1146 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ )
1149 constexpr double ONE_THIRD = 1.0 / 3.0;
1150 constexpr double ONE_SIXTH = 1.0 / 6.0;
1152 static constexpr double dN_ref[6][3] = {
1153 { -0.5, -0.5, -ONE_SIXTH },
1154 { 0.5, 0.0, -ONE_SIXTH },
1155 { 0.0, 0.5, -ONE_SIXTH },
1156 { -0.5, -0.5, ONE_SIXTH },
1157 { 0.5, 0.0, ONE_SIXTH },
1158 { 0.0, 0.5, ONE_SIXTH } };
1160 static constexpr int WEDGE_NODE_OFF[2][6][3] = {
1161 { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
1162 { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };
1164 const int n00 = node_id( tx, ty );
1165 const int n01 = node_id( tx, ty + 1 );
1166 const int n10 = node_id( tx + 1, ty );
1167 const int n11 = node_id( tx + 1, ty + 1 );
1169 for (
int pass = 0; pass < r_passes_; ++pass )
1171 const int lvl0 = pass * r_tile_ + tr;
1172 const int r_cell = r0 + lvl0;
1174 if ( r_cell >= hex_rad_ )
1177 const double r_0 = r_sh( lvl0 );
1178 const double r_1 = r_sh( lvl0 + 1 );
1180 const bool at_cmb =
has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
1181 const bool at_surface =
has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
1183 const bool at_boundary = at_cmb || at_surface;
1184 bool treat_boundary_dirichlet =
false;
1191 const int cmb_shift = ( ( at_boundary && treat_boundary_dirichlet && ( !Diagonal ) && at_cmb ) ? 3 : 0 );
1193 ( ( at_boundary && treat_boundary_dirichlet && ( !Diagonal ) && at_surface ) ? 3 : 0 );
1195 for (
int w = 0; w < 2; ++w )
1197 const int v0 = w == 0 ? n00 : n11;
1198 const int v1 = w == 0 ? n10 : n01;
1199 const int v2 = w == 0 ? n01 : n10;
1203 for (
int node = 0; node < 6; ++node )
1205 const int nid = node_id( tx + WEDGE_NODE_OFF[w][node][0], ty + WEDGE_NODE_OFF[w][node][1] );
1206 k_sum += k_sh( nid, lvl0 + WEDGE_NODE_OFF[w][node][2] );
1208 const double k_eval = ONE_SIXTH * k_sum;
1215 double gu10 = 0.0, gu11 = 0.0;
1216 double gu20 = 0.0, gu21 = 0.0, gu22 = 0.0;
1219 const double half_dr = 0.5 * ( r_1 - r_0 );
1220 const double r_mid = 0.5 * ( r_0 + r_1 );
1222 const double J_0_0 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v1, 0 ) );
1223 const double J_0_1 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v2, 0 ) );
1224 const double J_0_2 =
1225 half_dr * ( ONE_THIRD * ( coords_sh( v0, 0 ) + coords_sh( v1, 0 ) + coords_sh( v2, 0 ) ) );
1227 const double J_1_0 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v1, 1 ) );
1228 const double J_1_1 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v2, 1 ) );
1229 const double J_1_2 =
1230 half_dr * ( ONE_THIRD * ( coords_sh( v0, 1 ) + coords_sh( v1, 1 ) + coords_sh( v2, 1 ) ) );
1232 const double J_2_0 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v1, 2 ) );
1233 const double J_2_1 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v2, 2 ) );
1234 const double J_2_2 =
1235 half_dr * ( ONE_THIRD * ( coords_sh( v0, 2 ) + coords_sh( v1, 2 ) + coords_sh( v2, 2 ) ) );
1237 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 - J_0_1 * J_1_0 * J_2_2 +
1238 J_0_1 * J_1_2 * J_2_0 + J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
1240 kwJ =
k_eval * Kokkos::abs( J_det );
1242 const double inv_det = 1.0 /
J_det;
1244 const double i00 = inv_det * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
1245 const double i01 = inv_det * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
1246 const double i02 = inv_det * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
1247 const double i10 = inv_det * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
1248 const double i11 = inv_det * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
1249 const double i12 = inv_det * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
1250 const double i20 = inv_det * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
1251 const double i21 = inv_det * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
1252 const double i22 = inv_det * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
1259 const double gx = dN_ref[n][0];
1260 const double gy = dN_ref[n][1];
1261 const double gz = dN_ref[n][2];
1262 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1263 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1264 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1266 const int ddx = WEDGE_NODE_OFF[w][n][0];
1267 const int ddy = WEDGE_NODE_OFF[w][n][1];
1268 const int ddr = WEDGE_NODE_OFF[w][n][2];
1269 const int nid = node_id( tx + ddx, ty + ddy );
1270 const int lvl = lvl0 + ddr;
1272 const double s0 = src_sh( nid, 0, lvl );
1273 const double s1 = src_sh( nid, 1, lvl );
1274 const double s2 = src_sh( nid, 2, lvl );
1279 gu10 += 0.5 * (
g1 * s0 + g0 * s1 );
1280 gu20 += 0.5 * (
g2 * s0 + g0 * s2 );
1281 gu21 += 0.5 * (
g2 * s1 +
g1 * s2 );
1290 const double half_dr = 0.5 * ( r_1 - r_0 );
1291 const double r_mid = 0.5 * ( r_0 + r_1 );
1293 const double J_0_0 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v1, 0 ) );
1294 const double J_0_1 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v2, 0 ) );
1295 const double J_0_2 =
1296 half_dr * ( ONE_THIRD * ( coords_sh( v0, 0 ) + coords_sh( v1, 0 ) + coords_sh( v2, 0 ) ) );
1298 const double J_1_0 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v1, 1 ) );
1299 const double J_1_1 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v2, 1 ) );
1300 const double J_1_2 =
1301 half_dr * ( ONE_THIRD * ( coords_sh( v0, 1 ) + coords_sh( v1, 1 ) + coords_sh( v2, 1 ) ) );
1303 const double J_2_0 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v1, 2 ) );
1304 const double J_2_1 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v2, 2 ) );
1305 const double J_2_2 =
1306 half_dr * ( ONE_THIRD * ( coords_sh( v0, 2 ) + coords_sh( v1, 2 ) + coords_sh( v2, 2 ) ) );
1308 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 - J_0_1 * J_1_0 * J_2_2 +
1309 J_0_1 * J_1_2 * J_2_0 + J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
1311 const double inv_det = 1.0 /
J_det;
1313 const double i00 = inv_det * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
1314 const double i01 = inv_det * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
1315 const double i02 = inv_det * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
1316 const double i10 = inv_det * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
1317 const double i11 = inv_det * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
1318 const double i12 = inv_det * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
1319 const double i20 = inv_det * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
1320 const double i21 = inv_det * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
1321 const double i22 = inv_det * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
1325 constexpr double NEG_TWO_THIRDS = -0.66666666666666663;
1329 const double gx = dN_ref[n][0];
1330 const double gy = dN_ref[n][1];
1331 const double gz = dN_ref[n][2];
1332 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1333 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1334 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1336 const int ddx = WEDGE_NODE_OFF[w][n][0];
1337 const int ddy = WEDGE_NODE_OFF[w][n][1];
1338 const int ddr = WEDGE_NODE_OFF[w][n][2];
1340 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 0 ),
1341 kwJ * ( 2.0 * ( g0 * gu00 + g1 * gu10 + g2 * gu20 ) + NEG_TWO_THIRDS * g0 * div_u ) );
1343 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 1 ),
1344 kwJ * ( 2.0 * ( g0 * gu10 + g1 * gu11 + g2 * gu21 ) + NEG_TWO_THIRDS * g1 * div_u ) );
1346 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 2 ),
1347 kwJ * ( 2.0 * ( g0 * gu20 + g1 * gu21 + g2 * gu22 ) + NEG_TWO_THIRDS * g2 * div_u ) );
1351 if ( Diagonal || ( treat_boundary_dirichlet && at_boundary ) )
1354 for (
int n = surface_shift; n < 6 -
cmb_shift; ++n )
1356 const double gx = dN_ref[n][0];
1357 const double gy = dN_ref[n][1];
1358 const double gz = dN_ref[n][2];
1359 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1360 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1361 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1362 const double gg = g0 * g0 +
g1 *
g1 +
g2 *
g2;
1364 const int nid = node_id( tx + WEDGE_NODE_OFF[w][n][0], ty + WEDGE_NODE_OFF[w][n][1] );
1365 const int lvl = lvl0 + WEDGE_NODE_OFF[w][n][2];
1367 const double sv0 = src_sh( nid, 0, lvl );
1368 const double sv1 = src_sh( nid, 1, lvl );
1369 const double sv2 = src_sh( nid, 2, lvl );
1371 const int ddx = WEDGE_NODE_OFF[w][n][0];
1372 const int ddy = WEDGE_NODE_OFF[w][n][1];
1373 const int ddr = WEDGE_NODE_OFF[w][n][2];
1375 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 0 ),
1376 kwJ * sv0 * ( gg + ONE_THIRD * g0 * g0 ) );
1378 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 1 ),
1379 kwJ * sv1 * ( gg + ONE_THIRD * g1 * g1 ) );
1381 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 2 ),
1382 kwJ * sv2 * ( gg + ONE_THIRD * g2 * g2 ) );
1392 KOKKOS_INLINE_FUNCTION
1393 void normalize3(
double& x,
double& y,
double& z )
const
1395 const double n2 = x * x + y * y + z * z;
1398 const double invn = 1.0 / Kokkos::sqrt( n2 );
1405 KOKKOS_INLINE_FUNCTION
1406 void project_tangential_inplace(
1414 const double dot = nx * ux + ny * uy + nz * uz;
1420 template <
bool Diagonal >
1421 KOKKOS_INLINE_FUNCTION
void operator_fast_freeslip_path(
1423 const int local_subdomain_id,
1431 const int y_cell )
const
1433 const int nlev = r_tile_block_ + 1;
1434 const int nxy = ( lat_tile_ + 1 ) * ( lat_tile_ + 1 );
1437 reinterpret_cast< double*
>( team.team_shmem().get_shmem(
team_shmem_size( team.team_size() ) ) );
1439 using ScratchCoords =
1440 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1441 using ScratchSrc = Kokkos::
1442 View< double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1444 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1446 ScratchCoords coords_sh( shmem, nxy, 3 );
1450 ScratchCoords normals_sh( shmem, nxy, 3 );
1453 ScratchSrc src_sh( shmem, nxy, 3, nlev );
1454 shmem += nxy * 3 * nlev;
1456 ScratchK k_sh( shmem, nxy, nlev );
1457 shmem += nxy * nlev;
1460 Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >(
1463 auto node_id = [&](
int nx,
int ny ) ->
int {
return nx + ( lat_tile_ + 1 ) * ny; };
1466 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&](
int n ) {
1467 const int dxn = n % ( lat_tile_ + 1 );
1468 const int dyn = n / ( lat_tile_ + 1 );
1469 const int xi = x0 + dxn;
1470 const int yi = y0 + dyn;
1472 if ( xi <= hex_lat_ && yi <= hex_lat_ )
1474 const double cx = grid_( local_subdomain_id, xi, yi, 0 );
1475 const double cy = grid_( local_subdomain_id, xi, yi, 1 );
1476 const double cz = grid_( local_subdomain_id, xi, yi, 2 );
1477 coords_sh( n, 0 ) = cx;
1478 coords_sh( n, 1 ) = cy;
1479 coords_sh( n, 2 ) = cz;
1481 const double n2 = cx * cx + cy * cy + cz * cz;
1484 const double invn = 1.0 / Kokkos::sqrt( n2 );
1485 normals_sh( n, 0 ) = cx * invn;
1486 normals_sh( n, 1 ) = cy * invn;
1487 normals_sh( n, 2 ) = cz * invn;
1491 normals_sh( n, 0 ) = normals_sh( n, 1 ) = normals_sh( n, 2 ) = 0.0;
1496 coords_sh( n, 0 ) = coords_sh( n, 1 ) = coords_sh( n, 2 ) = 0.0;
1497 normals_sh( n, 0 ) = normals_sh( n, 1 ) = normals_sh( n, 2 ) = 0.0;
1501 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&](
int lvl ) {
1502 const int rr = r0 + lvl;
1503 r_sh( lvl ) = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
1506 const int total_pairs = nxy * nlev;
1507 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total_pairs ), [&](
int t ) {
1508 const int node = t / nlev;
1509 const int lvl = t - node * nlev;
1511 const int dxn = node % ( lat_tile_ + 1 );
1512 const int dyn = node / ( lat_tile_ + 1 );
1514 const int xi = x0 + dxn;
1515 const int yi = y0 + dyn;
1516 const int rr = r0 + lvl;
1518 if ( xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_ )
1520 k_sh( node, lvl ) = k_( local_subdomain_id, xi, yi, rr );
1521 src_sh( node, 0, lvl ) = src_( local_subdomain_id, xi, yi, rr, 0 );
1522 src_sh( node, 1, lvl ) = src_( local_subdomain_id, xi, yi, rr, 1 );
1523 src_sh( node, 2, lvl ) = src_( local_subdomain_id, xi, yi, rr, 2 );
1527 k_sh( node, lvl ) = 0.0;
1528 src_sh( node, 0, lvl ) = src_sh( node, 1, lvl ) = src_sh( node, 2, lvl ) = 0.0;
1532 team.team_barrier();
1534 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ )
1537 for (
int pass = 0; pass < r_passes_; ++pass )
1539 const int lvl0 = pass * r_tile_ + tr;
1540 const int r_cell = r0 + lvl0;
1542 if ( r_cell >= hex_rad_ )
1545 const double r_0 = r_sh( lvl0 );
1546 const double r_1 = r_sh( lvl0 + 1 );
1548 const bool at_cmb =
has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
1549 const bool at_surface =
has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
1554 const bool cmb_freeslip = at_cmb && ( cmb_bc ==
FREESLIP );
1555 const bool surf_freeslip = at_surface && ( surface_bc ==
FREESLIP );
1556 const bool cmb_dirichlet = at_cmb && ( cmb_bc ==
DIRICHLET );
1557 const bool surface_dirichlet = at_surface && ( surface_bc ==
DIRICHLET );
1559 const int cmb_shift = ( ( cmb_dirichlet && ( !Diagonal ) ) ? 3 : 0 );
1560 const int surface_shift = ( ( surface_dirichlet && ( !Diagonal ) ) ? 3 : 0 );
1562 static constexpr int WEDGE_NODE_OFF[2][6][3] = {
1563 { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
1564 { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };
1566 static constexpr int WEDGE_TO_UNIQUE[2][6] = { { 0, 1, 2, 3, 4, 5 }, { 6, 2, 1, 7, 5, 4 } };
1568 constexpr double ONE_THIRD = 1.0 / 3.0;
1569 constexpr double ONE_SIXTH = 1.0 / 6.0;
1570 constexpr double NEG_TWO_THIRDS = -0.66666666666666663;
1572 static constexpr double dN_ref[6][3] = {
1573 { -0.5, -0.5, -ONE_SIXTH },
1574 { 0.5, 0.0, -ONE_SIXTH },
1575 { 0.0, 0.5, -ONE_SIXTH },
1576 { -0.5, -0.5, ONE_SIXTH },
1577 { 0.5, 0.0, ONE_SIXTH },
1578 { 0.0, 0.5, ONE_SIXTH } };
1580 const int n00 = node_id( tx, ty );
1581 const int n01 = node_id( tx, ty + 1 );
1582 const int n10 = node_id( tx + 1, ty );
1583 const int n11 = node_id( tx + 1, ty + 1 );
1586 const int corner_node[4] = { n00, n10, n01, n11 };
1588 static constexpr int CMB_CORNER_TO_UNIQUE[4] = { 0, 1, 2, 6 };
1589 static constexpr int SURF_CORNER_TO_UNIQUE[4] = { 3, 4, 5, 7 };
1591 double dst8[3][8] = { { 0.0 } };
1595 double Ann_acc_cmb[4] = {};
1596 double Ann_acc_surf[4] = {};
1598 for (
int w = 0; w < 2; ++w )
1600 const int v0 = w == 0 ? n00 : n11;
1601 const int v1 = w == 0 ? n10 : n01;
1602 const int v2 = w == 0 ? n01 : n10;
1606 for (
int node = 0; node < 6; ++node )
1608 const int ddx = WEDGE_NODE_OFF[w][node][0];
1609 const int ddy = WEDGE_NODE_OFF[w][node][1];
1610 const int ddr = WEDGE_NODE_OFF[w][node][2];
1612 const int nid = node_id( tx + ddx, ty + ddy );
1613 const int lvl = lvl0 + ddr;
1614 k_sum += k_sh( nid, lvl );
1616 const double k_eval = ONE_SIXTH * k_sum;
1619 double i00, i01, i02;
1620 double i10, i11, i12;
1621 double i20, i21, i22;
1624 const double half_dr = 0.5 * ( r_1 - r_0 );
1625 const double r_mid = 0.5 * ( r_0 + r_1 );
1627 const double J_0_0 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v1, 0 ) );
1628 const double J_0_1 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v2, 0 ) );
1629 const double J_0_2 =
1630 half_dr * ( ONE_THIRD * ( coords_sh( v0, 0 ) + coords_sh( v1, 0 ) + coords_sh( v2, 0 ) ) );
1632 const double J_1_0 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v1, 1 ) );
1633 const double J_1_1 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v2, 1 ) );
1634 const double J_1_2 =
1635 half_dr * ( ONE_THIRD * ( coords_sh( v0, 1 ) + coords_sh( v1, 1 ) + coords_sh( v2, 1 ) ) );
1637 const double J_2_0 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v1, 2 ) );
1638 const double J_2_1 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v2, 2 ) );
1639 const double J_2_2 =
1640 half_dr * ( ONE_THIRD * ( coords_sh( v0, 2 ) + coords_sh( v1, 2 ) + coords_sh( v2, 2 ) ) );
1642 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 - J_0_1 * J_1_0 * J_2_2 +
1643 J_0_1 * J_1_2 * J_2_0 + J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
1645 const double invJ = 1.0 /
J_det;
1647 i00 = invJ * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
1648 i01 = invJ * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
1649 i02 = invJ * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
1651 i10 = invJ * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
1652 i11 = invJ * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
1653 i12 = invJ * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
1655 i20 = invJ * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
1656 i21 = invJ * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
1657 i22 = invJ * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
1659 wJ = Kokkos::abs( J_det );
1662 const double kwJ =
k_eval * wJ;
1665 static constexpr int CMB_NODE_TO_CORNER[2][3] = { { 0, 1, 2 }, { 3, 2, 1 } };
1668 double gu10 = 0.0, gu11 = 0.0;
1669 double gu20 = 0.0, gu21 = 0.0, gu22 = 0.0;
1679 const double gx = dN_ref[n][0];
1680 const double gy = dN_ref[n][1];
1681 const double gz = dN_ref[n][2];
1682 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1683 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1684 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1686 const int nid = node_id( tx + WEDGE_NODE_OFF[w][n][0], ty + WEDGE_NODE_OFF[w][n][1] );
1687 const int lvl = lvl0 + WEDGE_NODE_OFF[w][n][2];
1689 double s0 = src_sh( nid, 0, lvl );
1690 double s1 = src_sh( nid, 1, lvl );
1691 double s2 = src_sh( nid, 2, lvl );
1694 if ( cmb_freeslip && n < 3 )
1696 const double nx = normals_sh( nid, 0 );
1697 const double ny = normals_sh( nid, 1 );
1698 const double nz = normals_sh( nid, 2 );
1699 const double dot = nx * s0 + ny * s1 + nz * s2;
1704 if ( surf_freeslip && n >= 3 )
1706 const double nx = normals_sh( nid, 0 );
1707 const double ny = normals_sh( nid, 1 );
1708 const double nz = normals_sh( nid, 2 );
1709 const double dot = nx * s0 + ny * s1 + nz * s2;
1718 gu10 += 0.5 * (
g1 * s0 + g0 * s1 );
1719 gu20 += 0.5 * (
g2 * s0 + g0 * s2 );
1720 gu21 += 0.5 * (
g2 * s1 +
g1 * s2 );
1729 const double gx = dN_ref[n][0];
1730 const double gy = dN_ref[n][1];
1731 const double gz = dN_ref[n][2];
1732 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1733 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1734 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1736 const int uid = WEDGE_TO_UNIQUE[w][n];
1738 kwJ * ( 2.0 * ( g0 * gu00 +
g1 * gu10 +
g2 * gu20 ) + NEG_TWO_THIRDS * g0 * div_u );
1740 kwJ * ( 2.0 * ( g0 * gu10 +
g1 * gu11 +
g2 * gu21 ) + NEG_TWO_THIRDS * g1 * div_u );
1742 kwJ * ( 2.0 * ( g0 * gu20 +
g1 * gu21 +
g2 * gu22 ) + NEG_TWO_THIRDS * g2 * div_u );
1745 if ( cmb_freeslip && n < 3 )
1747 const int corner = CMB_NODE_TO_CORNER[w][n];
1748 const int cn = corner_node[corner];
1749 const double nxu = normals_sh( cn, 0 );
1750 const double nyu = normals_sh( cn, 1 );
1751 const double nzu = normals_sh( cn, 2 );
1752 const double gg = g0 * g0 +
g1 *
g1 +
g2 *
g2;
1753 const double ng = nxu * g0 + nyu *
g1 + nzu *
g2;
1754 Ann_acc_cmb[corner] += kwJ * ( gg + ONE_THIRD * ng * ng );
1756 if ( surf_freeslip && n >= 3 )
1758 const int corner = CMB_NODE_TO_CORNER[w][n - 3];
1759 const int cn = corner_node[corner];
1760 const double nxu = normals_sh( cn, 0 );
1761 const double nyu = normals_sh( cn, 1 );
1762 const double nzu = normals_sh( cn, 2 );
1763 const double gg = g0 * g0 +
g1 *
g1 +
g2 *
g2;
1764 const double ng = nxu * g0 + nyu *
g1 + nzu *
g2;
1765 Ann_acc_surf[corner] += kwJ * ( gg + ONE_THIRD * ng * ng );
1771 if ( Diagonal || cmb_dirichlet || surface_dirichlet )
1775 for (
int n = surface_shift; n < 6 -
cmb_shift; ++n )
1777 if ( Diagonal && cmb_freeslip && n < 3 )
1779 if ( Diagonal && surf_freeslip && n >= 3 )
1782 const double gx = dN_ref[n][0];
1783 const double gy = dN_ref[n][1];
1784 const double gz = dN_ref[n][2];
1785 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1786 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1787 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1788 const double gg = g0 * g0 +
g1 *
g1 +
g2 *
g2;
1790 const int uid = WEDGE_TO_UNIQUE[w][n];
1791 const int nid = node_id( tx + WEDGE_NODE_OFF[w][n][0], ty + WEDGE_NODE_OFF[w][n][1] );
1792 const int lvl = lvl0 + WEDGE_NODE_OFF[w][n][2];
1793 const double s0 = src_sh( nid, 0, lvl );
1794 const double s1 = src_sh( nid, 1, lvl );
1795 const double s2 = src_sh( nid, 2, lvl );
1797 dst8[0][uid] += kwJ * s0 * ( gg + ONE_THIRD * g0 * g0 );
1798 dst8[1][uid] += kwJ * s1 * ( gg + ONE_THIRD *
g1 *
g1 );
1799 dst8[2][uid] += kwJ * s2 * ( gg + ONE_THIRD *
g2 *
g2 );
1806 static constexpr int FS_CORNER_MAP[2][3] = { { 0, 1, 2 }, { 3, 2, 1 } };
1808 auto apply_rotated_diag = [&](
const int ni,
const int node_idx,
const int src_lvl ) {
1809 const int corner = FS_CORNER_MAP[w][ni];
1810 const int cn = corner_node[corner];
1811 const double nxu = normals_sh( cn, 0 );
1812 const double nyu = normals_sh( cn, 1 );
1813 const double nzu = normals_sh( cn, 2 );
1814 const int u = WEDGE_TO_UNIQUE[w][node_idx];
1816 const double gx = dN_ref[node_idx][0];
1817 const double gy = dN_ref[node_idx][1];
1818 const double gz = dN_ref[node_idx][2];
1820 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1821 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1822 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1823 const double gg_loc = g0 * g0 +
g1 *
g1 +
g2 *
g2;
1825 dense::Vec< double, 3 > n_vec;
1829 const auto R_rot = trafo_mat_cartesian_to_normal_tangential< double >( n_vec );
1833 const double s0 = src_sh( cn, 0, src_lvl );
1834 const double s1 = src_sh( cn, 1, src_lvl );
1835 const double s2 = src_sh( cn, 2, src_lvl );
1837 const double Rg0 = R_rot( 0, 0 ) * g0 + R_rot( 0, 1 ) *
g1 + R_rot( 0, 2 ) *
g2;
1838 const double Rg1 = R_rot( 1, 0 ) * g0 + R_rot( 1, 1 ) *
g1 + R_rot( 1, 2 ) *
g2;
1839 const double Rg2 = R_rot( 2, 0 ) * g0 + R_rot( 2, 1 ) *
g1 + R_rot( 2, 2 ) *
g2;
1840 const double Rs0 = R_rot( 0, 0 ) * s0 + R_rot( 0, 1 ) * s1 + R_rot( 0, 2 ) * s2;
1841 const double Rs1 = R_rot( 1, 0 ) * s0 + R_rot( 1, 1 ) * s1 + R_rot( 1, 2 ) * s2;
1842 const double Rs2 = R_rot( 2, 0 ) * s0 + R_rot( 2, 1 ) * s1 + R_rot( 2, 2 ) * s2;
1844 const double v0 = kwJ * ( gg_loc + ONE_THIRD * Rg0 * Rg0 ) * Rs0;
1845 const double v1 = kwJ * ( gg_loc + ONE_THIRD * Rg1 * Rg1 ) * Rs1;
1846 const double v2 = kwJ * ( gg_loc + ONE_THIRD * Rg2 * Rg2 ) * Rs2;
1848 dst8[0][u] += R_rot( 0, 0 ) * v0 + R_rot( 1, 0 ) * v1 + R_rot( 2, 0 ) * v2;
1849 dst8[1][u] += R_rot( 0, 1 ) * v0 + R_rot( 1, 1 ) * v1 + R_rot( 2, 1 ) * v2;
1850 dst8[2][u] += R_rot( 0, 2 ) * v0 + R_rot( 1, 2 ) * v1 + R_rot( 2, 2 ) * v2;
1855 for (
int ni = 0; ni < 3; ++ni )
1856 apply_rotated_diag( ni, ni, lvl0 );
1858 if ( surf_freeslip )
1860 for (
int ni = 0; ni < 3; ++ni )
1861 apply_rotated_diag( ni, ni + 3, lvl0 + 1 );
1868 if ( !Diagonal && cmb_freeslip )
1870 for (
int c = 0; c < 4; ++c )
1872 const double nx = normals_sh( corner_node[c], 0 );
1873 const double ny = normals_sh( corner_node[c], 1 );
1874 const double nz = normals_sh( corner_node[c], 2 );
1875 const int u = CMB_CORNER_TO_UNIQUE[c];
1876 const double dot = nx * dst8[0][u] + ny * dst8[1][u] + nz * dst8[2][u];
1877 dst8[0][u] -=
dot * nx;
1878 dst8[1][u] -=
dot * ny;
1879 dst8[2][u] -=
dot * nz;
1882 if ( !Diagonal && surf_freeslip )
1884 for (
int c = 0; c < 4; ++c )
1886 const double nx = normals_sh( corner_node[c], 0 );
1887 const double ny = normals_sh( corner_node[c], 1 );
1888 const double nz = normals_sh( corner_node[c], 2 );
1889 const int u = SURF_CORNER_TO_UNIQUE[c];
1890 const double dot = nx * dst8[0][u] + ny * dst8[1][u] + nz * dst8[2][u];
1891 dst8[0][u] -=
dot * nx;
1892 dst8[1][u] -=
dot * ny;
1893 dst8[2][u] -=
dot * nz;
1899 if ( !Diagonal && cmb_freeslip )
1901 for (
int c = 0; c < 4; ++c )
1903 const int cn = corner_node[c];
1904 const double nx = normals_sh( cn, 0 );
1905 const double ny = normals_sh( cn, 1 );
1906 const double nz = normals_sh( cn, 2 );
1907 const double os0 = src_sh( cn, 0, lvl0 );
1908 const double os1 = src_sh( cn, 1, lvl0 );
1909 const double os2 = src_sh( cn, 2, lvl0 );
1910 const double u_n_val = nx * os0 + ny * os1 + nz * os2;
1911 const double corr = Ann_acc_cmb[c] * u_n_val;
1912 const int u = CMB_CORNER_TO_UNIQUE[c];
1913 dst8[0][u] += corr * nx;
1914 dst8[1][u] += corr * ny;
1915 dst8[2][u] += corr * nz;
1918 if ( !Diagonal && surf_freeslip )
1920 for (
int c = 0; c < 4; ++c )
1922 const int cn = corner_node[c];
1923 const double nx = normals_sh( cn, 0 );
1924 const double ny = normals_sh( cn, 1 );
1925 const double nz = normals_sh( cn, 2 );
1926 const double os0 = src_sh( cn, 0, lvl0 + 1 );
1927 const double os1 = src_sh( cn, 1, lvl0 + 1 );
1928 const double os2 = src_sh( cn, 2, lvl0 + 1 );
1929 const double u_n_val = nx * os0 + ny * os1 + nz * os2;
1930 const double corr = Ann_acc_surf[c] * u_n_val;
1931 const int u = SURF_CORNER_TO_UNIQUE[c];
1932 dst8[0][u] += corr * nx;
1933 dst8[1][u] += corr * ny;
1934 dst8[2][u] += corr * nz;
1940 for (
int dim_add = 0; dim_add < 3; ++dim_add )
1942 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell, dim_add ), dst8[dim_add][0] );
1944 &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell, dim_add ), dst8[dim_add][1] );
1946 &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell, dim_add ), dst8[dim_add][2] );
1948 &dst_( local_subdomain_id, x_cell, y_cell, r_cell + 1, dim_add ), dst8[dim_add][3] );
1950 &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1, dim_add ), dst8[dim_add][4] );
1952 &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][5] );
1954 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell, dim_add ), dst8[dim_add][6] );
1956 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][7] );
1972 KOKKOS_INLINE_FUNCTION
1975 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
1976 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
1978 if ( tr >= r_tile_ )
1981 if ( kernel_path_ == KernelPath::Slow )
1983 for (
int pass = 0; pass < r_passes_; ++pass )
1985 const int r_cell_pass = r0 + pass * r_tile_ + tr;
1986 if ( r_cell_pass >= hex_rad_ )
1988 const bool at_cmb =
has_flag( local_subdomain_id, x_cell, y_cell, r_cell_pass, CMB );
1989 const bool at_surface =
has_flag( local_subdomain_id, x_cell, y_cell, r_cell_pass + 1, SURFACE );
1991 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell_pass, at_cmb, at_surface );
1994 else if ( kernel_path_ == KernelPath::FastFreeslip )
1997 operator_fast_freeslip_path< true >( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
1999 operator_fast_freeslip_path< false >(
2000 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
2005 operator_fast_dirichlet_neumann_path< true >(
2006 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
2008 operator_fast_dirichlet_neumann_path< false >(
2009 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
2028 const auto det = J.det();
2029 const auto abs_det = Kokkos::abs( det );
2035 k_eval +=
shape( k, quad_point ) * k_local_hex[wedge]( k );
2040 sym_grad_i[k] =
symmetric_grad( J_inv_transposed, quad_point, k, dimi );
2041 sym_grad_j[k] =
symmetric_grad( J_inv_transposed, quad_point, k, dimj );
2044 jdet_keval_quadweight = quad_weight * k_eval * abs_det;
2050 KOKKOS_INLINE_FUNCTION
2052 const int local_subdomain_id,
2056 const int wedge )
const
2061 const ScalarT r_1 = radii_( local_subdomain_id, r_cell );
2062 const ScalarT r_2 = radii_( local_subdomain_id, r_cell + 1 );
2069 for (
int dimi = 0; dimi < 3; ++dimi )
2071 for (
int dimj = 0; dimj < 3; ++dimj )
2073 for (
int q = 0; q < num_quad_points; q++ )
2091 jdet_keval_quadweight );
2098 jdet_keval_quadweight *
2100 2.0 / 3.0 * sym_grad_j[j]( dimj, dimj ) * sym_grad_i[i]( dimi, dimi ) );