79 using Team = Kokkos::TeamPolicy<>::member_type;
101 ScalarT mass_scaling_;
114 int local_subdomains_;
128 void update_kernel_path_flag_host_only()
130 if constexpr ( std::is_same_v< Kokkos::DefaultExecutionSpace, Kokkos::Serial > )
145 const ScalarT diffusivity,
148 bool diagonal =
false,
149 ScalarT mass_scaling = 1.0,
150 bool lumped_mass =
false,
157 , boundary_mask_( boundary_mask )
158 , velocity_( velocity )
159 , diffusivity_( diffusivity )
161 , treat_boundary_( treat_boundary )
162 , diagonal_( diagonal )
163 , mass_scaling_( mass_scaling )
164 , lumped_mass_( lumped_mass )
165 , operator_apply_mode_( operator_apply_mode )
166 , operator_communication_mode_( operator_communication_mode )
167 , recv_buffers_( domain )
168 , comm_plan_( domain )
171 local_subdomains_ = domain_.
subdomains().size();
175 if constexpr ( std::is_same_v< Kokkos::DefaultExecutionSpace, Kokkos::Serial > )
177 lat_tile_ = 1; r_tile_ = 1; r_passes_ = 1;
179#ifdef KOKKOS_ENABLE_OPENMP
180 else if constexpr ( std::is_same_v< Kokkos::DefaultExecutionSpace, Kokkos::OpenMP > )
182 const int max_team = std::min( Kokkos::OpenMP().concurrency(),
183 static_cast< int >( Kokkos::Impl::HostThreadTeamData::max_team_members ) );
184 if ( max_team >= 64 ) { lat_tile_ = 4; r_tile_ = 4; r_passes_ = 4; }
185 else if ( max_team >= 16 ) { lat_tile_ = 4; r_tile_ = 1; r_passes_ = 16; }
186 else { lat_tile_ = 1; r_tile_ = 1; r_passes_ = 1; }
191 lat_tile_ = 4; r_tile_ = 8; r_passes_ = 2;
193 r_tile_block_ = r_tile_ * r_passes_;
194 lat_tiles_ = ( hex_lat_ + lat_tile_ - 1 ) / lat_tile_;
195 r_tiles_ = ( hex_rad_ + r_tile_block_ - 1 ) / r_tile_block_;
196 team_size_ = lat_tile_ * lat_tile_ * r_tile_;
197 blocks_ = local_subdomains_ * lat_tiles_ * lat_tiles_ * r_tiles_;
199 update_kernel_path_flag_host_only();
201 util::logroot <<
"[SUPGKerngen] tile=(" << lat_tile_ <<
"," << lat_tile_ <<
"," << r_tile_
202 <<
"), r_passes=" << r_passes_ <<
", team=" << team_size_ <<
", blocks=" << blocks_
204 <<
", lumped=" << lumped_mass_ <<
", diagonal=" << diagonal_
205 <<
", treat_boundary=" << treat_boundary_ << std::endl;
208 ScalarT&
dt() {
return dt_; }
209 const ScalarT&
dt()
const {
return dt_; }
227 Kokkos::TeamPolicy<> policy( blocks_, team_size_ );
229 policy.set_scratch_size( 0, Kokkos::PerTeam(
team_shmem_size( team_size_ ) ) );
233 Kokkos::parallel_for(
"ad_supg_apply_kernel_slow", policy,
234 KOKKOS_CLASS_LAMBDA(
const Team& team ) { this->run_team_slow( team ); } );
238 Kokkos::TeamPolicy< Kokkos::LaunchBounds< 128, 5 > > fast_policy( blocks_, team_size_ );
239 fast_policy.set_scratch_size( 0, Kokkos::PerTeam(
team_shmem_size( team_size_ ) ) );
244 if ( treat_boundary_ && lumped_mass_ )
245 Kokkos::parallel_for(
"ad_supg_fast_D_L_TB", fast_policy,
246 KOKKOS_CLASS_LAMBDA(
const Team& t ) { this->
template run_team_fast< true, true, true >( t ); } );
247 else if ( treat_boundary_ )
248 Kokkos::parallel_for(
"ad_supg_fast_D_TB", fast_policy,
249 KOKKOS_CLASS_LAMBDA(
const Team& t ) { this->
template run_team_fast< false, true, true >( t ); } );
250 else if ( lumped_mass_ )
251 Kokkos::parallel_for(
"ad_supg_fast_D_L", fast_policy,
252 KOKKOS_CLASS_LAMBDA(
const Team& t ) { this->
template run_team_fast< true, true, false >( t ); } );
254 Kokkos::parallel_for(
"ad_supg_fast_D", fast_policy,
255 KOKKOS_CLASS_LAMBDA(
const Team& t ) { this->
template run_team_fast< false, true, false >( t ); } );
259 if ( treat_boundary_ && lumped_mass_ )
260 Kokkos::parallel_for(
"ad_supg_fast_L_TB", fast_policy,
261 KOKKOS_CLASS_LAMBDA(
const Team& t ) { this->
template run_team_fast< true, false, true >( t ); } );
262 else if ( treat_boundary_ )
263 Kokkos::parallel_for(
"ad_supg_fast_TB", fast_policy,
264 KOKKOS_CLASS_LAMBDA(
const Team& t ) { this->
template run_team_fast< false, false, true >( t ); } );
265 else if ( lumped_mass_ )
266 Kokkos::parallel_for(
"ad_supg_fast_L", fast_policy,
267 KOKKOS_CLASS_LAMBDA(
const Team& t ) { this->
template run_team_fast< true, false, false >( t ); } );
269 Kokkos::parallel_for(
"ad_supg_fast", fast_policy,
270 KOKKOS_CLASS_LAMBDA(
const Team& t ) { this->
template run_team_fast< false, false, false >( t ); } );
284 KOKKOS_INLINE_FUNCTION
287 const int nlev = r_tile_block_ + 1;
288 const int n = lat_tile_ + 1;
289 const int nxy = n * n;
291 const size_t nscalars = size_t( nxy ) * 3 + size_t( nlev ) + size_t( nxy ) * nlev + size_t( nxy ) * 3 * nlev;
296 KOKKOS_INLINE_FUNCTION
297 void decode_team_indices(
299 int& local_subdomain_id,
300 int& x0,
int& y0,
int& r0,
301 int& tx,
int& ty,
int& tr,
302 int& x_cell,
int& y_cell,
int& r_cell )
const
304 int tmp = team.league_rank();
305 const int r_tile_id = tmp % r_tiles_;
307 const int lat_y_id = tmp % lat_tiles_;
309 const int lat_x_id = tmp % lat_tiles_;
311 local_subdomain_id = tmp;
313 x0 = lat_x_id * lat_tile_;
314 y0 = lat_y_id * lat_tile_;
315 r0 = r_tile_id * r_tile_block_;
317 const int tid = team.team_rank();
319 tx = ( tid / r_tile_ ) % lat_tile_;
320 ty = tid / ( r_tile_ * lat_tile_ );
331 KOKKOS_INLINE_FUNCTION
332 void run_team_slow(
const Team& team )
const
334 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
335 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
340 for (
int pass = 0; pass < r_passes_; ++pass )
342 const int r_cell_pass = r0 + pass * r_tile_ + tr;
343 if ( r_cell_pass >= hex_rad_ )
345 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ )
348 process_cell_legacy( local_subdomain_id, x_cell, y_cell, r_cell_pass );
353 KOKKOS_INLINE_FUNCTION
354 void process_cell_legacy(
int s,
int x_cell,
int y_cell,
int r_cell )
const
359 const ScalarT r_1 = radii_( s, r_cell );
360 const ScalarT r_2 = radii_( s, r_cell + 1 );
364 ScalarT quad_weights[num_quad_points];
370 for (
int d = 0; d < VelocityVecDim; ++d )
374 for (
int q = 0; q < num_quad_points; ++q )
377 const auto shape_i =
shape( i, quad_points[q] );
378 for (
int d = 0; d < VelocityVecDim; ++d )
379 vel_interp[wedge][q]( d ) += vel_coeffs[d][wedge]( i ) * shape_i;
383 const auto h = r_2 - r_1;
386 ScalarT tau_accum = 0.0, waccum = 0.0;
387 for (
int q = 0; q < num_quad_points; ++q )
389 const auto& uq = vel_interp[wedge][q];
390 const ScalarT vel_norm_q = uq.
norm();
391 const ScalarT tau_q = supg_tau< ScalarT >( vel_norm_q, diffusivity_, h, 1e-08 );
392 tau_accum += tau_q * quad_weights[q];
393 waccum += quad_weights[q];
395 streamline_diffusivity[wedge] = ( waccum > 0.0 ) ? ( tau_accum / waccum ) : 0.0;
401 for (
int q = 0; q < num_quad_points; ++q )
403 const auto w = quad_weights[q];
406 const auto J =
jac( wedge_phy_surf[wedge], r_1, r_2, quad_points[q] );
407 const auto det = Kokkos::abs(
J.det() );
408 const auto J_inv_transposed =
J.inv().transposed();
409 const auto vel = vel_interp[wedge][q];
413 const auto shape_i =
shape( i, quad_points[q] );
414 const auto grad_i = J_inv_transposed *
grad_shape( i, quad_points[q] );
417 const auto shape_j =
shape( j, quad_points[q] );
418 const auto grad_j = J_inv_transposed *
grad_shape( j, quad_points[q] );
420 const auto mass = shape_i * shape_j;
421 const auto diffusion = diffusivity_ * grad_i.dot( grad_j );
422 const auto advection = vel.dot( grad_j ) * shape_i;
423 const auto streamline = streamline_diffusivity[wedge] * vel.dot( grad_j ) * vel.dot( grad_i );
425 M[wedge]( i, j ) += w * mass_scaling_ * mass * det;
426 A[wedge]( i, j ) += w * dt_ * ( diffusion + advection + streamline ) * det;
434 dense::Vec< ScalarT, 6 > ones;
440 if ( treat_boundary_ )
449 dense::Mat< ScalarT, 6, 6 > boundary_mask;
450 boundary_mask.fill( 1.0 );
451 if ( at_cmb_boundary )
452 for (
int i = 0; i < 6; ++i )
453 for (
int j = 0; j < 6; ++j )
454 if ( i != j && ( i < 3 || j < 3 ) )
455 boundary_mask( i, j ) = 0.0;
456 if ( at_surface_boundary )
457 for (
int i = 0; i < 6; ++i )
458 for (
int j = 0; j < 6; ++j )
459 if ( i != j && ( i >= 3 || j >= 3 ) )
460 boundary_mask( i, j ) = 0.0;
461 M[wedge].hadamard_product( boundary_mask );
462 A[wedge].hadamard_product( boundary_mask );
468 M[0] = M[0].diagonal();
469 M[1] = M[1].diagonal();
470 A[0] = A[0].diagonal();
471 A[1] = A[1].diagonal();
478 dst[0] = ( M[0] + A[0] ) * src[0];
479 dst[1] = ( M[1] + A[1] ) * src[1];
489 template <
bool LumpedMass,
bool Diagonal,
bool TreatBoundary >
490 KOKKOS_INLINE_FUNCTION
491 void run_team_fast(
const Team& team )
const
494 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
496 const int nlev = r_tile_block_ + 1;
497 const int n = lat_tile_ + 1;
498 const int nxy = n * n;
501 reinterpret_cast< double*
>( team.team_shmem().get_shmem(
team_shmem_size( team.team_size() ) ) );
503 using ScratchCoords =
504 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
506 Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
508 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
510 Kokkos::View< double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
512 ScratchCoords coords_sh( shmem, nxy, 3 );
514 ScratchR r_sh( shmem, nlev );
516 ScratchT T_sh( shmem, nxy, nlev );
518 ScratchVel vel_sh( shmem, nxy, 3, nlev );
520 auto node_id = [&](
int nx,
int ny ) ->
int {
return nx + n * ny; };
523 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&](
int id ) {
524 const int dxn =
id % n;
525 const int dyn =
id / n;
526 const int xi = x0 + dxn;
527 const int yi = y0 + dyn;
528 if ( xi <= hex_lat_ && yi <= hex_lat_ )
530 coords_sh(
id, 0 ) = grid_( local_subdomain_id, xi, yi, 0 );
531 coords_sh(
id, 1 ) = grid_( local_subdomain_id, xi, yi, 1 );
532 coords_sh(
id, 2 ) = grid_( local_subdomain_id, xi, yi, 2 );
536 coords_sh(
id, 0 ) = coords_sh(
id, 1 ) = coords_sh(
id, 2 ) = 0.0;
539 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&](
int lvl ) {
540 const int rr = r0 + lvl;
541 r_sh( lvl ) = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
543 const int total = nxy * nlev;
544 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total ), [&](
int t ) {
545 const int node = t / nlev;
546 const int lvl = t - node * nlev;
547 const int dxn = node % n;
548 const int dyn = node / n;
549 const int xi = x0 + dxn;
550 const int yi = y0 + dyn;
551 const int rr = r0 + lvl;
552 if ( xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_ )
554 T_sh( node, lvl ) = src_( local_subdomain_id, xi, yi, rr );
555 vel_sh( node, 0, lvl ) = vel_grid_( local_subdomain_id, xi, yi, rr, 0 );
556 vel_sh( node, 1, lvl ) = vel_grid_( local_subdomain_id, xi, yi, rr, 1 );
557 vel_sh( node, 2, lvl ) = vel_grid_( local_subdomain_id, xi, yi, rr, 2 );
561 T_sh( node, lvl ) = 0.0;
562 vel_sh( node, 0, lvl ) = vel_sh( node, 1, lvl ) = vel_sh( node, 2, lvl ) = 0.0;
570 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ )
574 constexpr int WEDGE_NODE_OFF[2][6][3] = {
575 { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
576 { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };
579 constexpr int NQ = 6;
580 constexpr double QUAD_W = 1.0 / 6.0;
583 constexpr double ONE_OVER_SQRT3 = 0.5773502691896257;
586 constexpr double BARY[NQ][3] = {
587 { 1.0/6.0, 2.0/3.0, 1.0/6.0 },
588 { 1.0/6.0, 1.0/6.0, 2.0/3.0 },
589 { 2.0/3.0, 1.0/6.0, 1.0/6.0 },
590 { 1.0/6.0, 2.0/3.0, 1.0/6.0 },
591 { 1.0/6.0, 1.0/6.0, 2.0/3.0 },
592 { 2.0/3.0, 1.0/6.0, 1.0/6.0 }
597 constexpr double ZETA_Q[NQ] = {
598 -ONE_OVER_SQRT3, -ONE_OVER_SQRT3, -ONE_OVER_SQRT3,
599 +ONE_OVER_SQRT3, +ONE_OVER_SQRT3, +ONE_OVER_SQRT3 };
607 auto phi = [&](
int j,
int q ) ->
double {
608 const double sl = BARY[q][j % 3];
609 const double sr = ( j < 3 ) ? 0.5 * ( 1.0 - ZETA_Q[q] ) : 0.5 * ( 1.0 + ZETA_Q[q] );
619 auto gref = [&](
int j,
int q,
int d ) ->
double {
620 const double sr = ( j < 3 ) ? 0.5 * ( 1.0 - ZETA_Q[q] ) : 0.5 * ( 1.0 + ZETA_Q[q] );
621 const double grad_rad = ( j < 3 ) ? -0.5 : 0.5;
622 const int jmod = j % 3;
623 const double glat_xi = ( jmod == 0 ) ? -1.0 : ( jmod == 1 ) ? 1.0 : 0.0;
624 const double glat_eta = ( jmod == 0 ) ? -1.0 : ( jmod == 1 ) ? 0.0 : 1.0;
625 const double sl = BARY[q][jmod];
626 if ( d == 0 )
return glat_xi * sr;
627 if ( d == 1 )
return glat_eta * sr;
628 return sl * grad_rad;
632 const int n00 = node_id( tx, ty );
633 const int n01 = node_id( tx, ty + 1 );
634 const int n10 = node_id( tx + 1, ty );
635 const int n11 = node_id( tx + 1, ty + 1 );
638 for (
int pass = 0; pass < r_passes_; ++pass )
640 const int lvl0 = pass * r_tile_ + tr;
641 const int r_cell_abs = r0 + lvl0;
642 if ( r_cell_abs >= hex_rad_ )
645 const double r_lower = r_sh( lvl0 );
646 const double r_upper = r_sh( lvl0 + 1 );
647 const double half_dr = 0.5 * ( r_upper - r_lower );
648 const double r_mid = 0.5 * ( r_lower + r_upper );
649 const double h_cell = r_upper - r_lower;
652 bool at_cmb =
false, at_surface =
false;
653 if constexpr ( TreatBoundary )
655 at_cmb =
util::has_flag( boundary_mask_( local_subdomain_id, x_cell, y_cell, r_cell_abs ),
657 at_surface =
util::has_flag( boundary_mask_( local_subdomain_id, x_cell, y_cell, r_cell_abs + 1 ),
665 auto boundary_j_col = [&](
int j ) ->
bool {
666 if constexpr ( !TreatBoundary )
return false;
667 return ( at_cmb && j < 3 ) || ( at_surface && j >= 3 );
673 const int v0 = ( w == 0 ? n00 : n11 );
674 const int v1 = ( w == 0 ? n10 : n01 );
675 const int v2 = ( w == 0 ? n01 : n10 );
678 const double P0[3] = { coords_sh( v0, 0 ), coords_sh( v0, 1 ), coords_sh( v0, 2 ) };
679 const double P1[3] = { coords_sh( v1, 0 ), coords_sh( v1, 1 ), coords_sh( v1, 2 ) };
680 const double P2[3] = { coords_sh( v2, 0 ), coords_sh( v2, 1 ), coords_sh( v2, 2 ) };
683 const double dP1_P0[3] = {
P1[0] -
P0[0],
P1[1] -
P0[1],
P1[2] -
P0[2] };
684 const double dP2_P0[3] = { P2[0] -
P0[0], P2[1] -
P0[1], P2[2] -
P0[2] };
688 double vel_q[NQ][3] = { { 0.0 } };
690 double tau_sum = 0.0;
692 for (
int q = 0; q < NQ; ++q )
694 double ux = 0.0, uy = 0.0, uz = 0.0;
698 const int ddx = WEDGE_NODE_OFF[w][j][0];
699 const int ddy = WEDGE_NODE_OFF[w][j][1];
700 const int ddr = WEDGE_NODE_OFF[w][j][2];
701 const int nid = node_id( tx + ddx, ty + ddy );
702 const int lvl = lvl0 + ddr;
703 const double p = phi( j, q );
704 ux += p * vel_sh( nid, 0, lvl );
705 uy += p * vel_sh( nid, 1, lvl );
706 uz += p * vel_sh( nid, 2, lvl );
712 const double vn = Kokkos::sqrt( ux * ux + uy * uy + uz * uz );
713 const double tauq = supg_tau< double >( vn,
double( diffusivity_ ), h_cell, 1e-08 );
714 tau_sum += tauq * QUAD_W;
718 const double tau_wedge = ( w_sum > 0.0 ) ? ( tau_sum / w_sum ) : 0.0;
726 for (
int q = 0; q < NQ; ++q )
732 const double r_at_q = r_mid + ZETA_Q[q] * half_dr;
733 const double b0 = BARY[q][0], b1 = BARY[q][1], b2 = BARY[q][2];
735 const double J00 = r_at_q * dP1_P0[0];
736 const double J10 = r_at_q * dP1_P0[1];
737 const double J20 = r_at_q * dP1_P0[2];
738 const double J01 = r_at_q * dP2_P0[0];
739 const double J11 = r_at_q * dP2_P0[1];
740 const double J21 = r_at_q * dP2_P0[2];
741 const double J02 = half_dr * ( b0 *
P0[0] + b1 *
P1[0] + b2 * P2[0] );
742 const double J12 = half_dr * ( b0 *
P0[1] + b1 *
P1[1] + b2 * P2[1] );
743 const double J22 = half_dr * ( b0 *
P0[2] + b1 *
P1[2] + b2 * P2[2] );
745 const double J_det = J00 * J11 * J22 - J00 * J12 * J21
746 - J01 * J10 * J22 + J01 * J12 * J20
747 + J02 * J10 * J21 - J02 * J11 * J20;
748 const double abs_det = Kokkos::abs( J_det );
749 const double inv_det = 1.0 /
J_det;
752 const double i00 = inv_det * ( J11 * J22 - J12 * J21 );
753 const double i01 = inv_det * ( -J10 * J22 + J12 * J20 );
754 const double i02 = inv_det * ( J10 * J21 - J11 * J20 );
755 const double i10 = inv_det * ( -J01 * J22 + J02 * J21 );
756 const double i11 = inv_det * ( J00 * J22 - J02 * J20 );
757 const double i12 = inv_det * ( -J00 * J21 + J01 * J20 );
758 const double i20 = inv_det * ( J01 * J12 - J02 * J11 );
759 const double i21 = inv_det * ( -J00 * J12 + J02 * J10 );
760 const double i22 = inv_det * ( J00 * J11 - J01 * J10 );
762 const double wdet = QUAD_W * abs_det;
766 double gT0 = 0.0, gT1 = 0.0, gT2 = 0.0;
770 if constexpr ( TreatBoundary )
771 if ( boundary_j_col( j ) )
774 const int ddx = WEDGE_NODE_OFF[w][j][0];
775 const int ddy = WEDGE_NODE_OFF[w][j][1];
776 const int ddr = WEDGE_NODE_OFF[w][j][2];
777 const int nid = node_id( tx + ddx, ty + ddy );
778 const int lvl = lvl0 + ddr;
779 const double Tj = T_sh( nid, lvl );
781 const double pj = phi( j, q );
784 const double gx = gref( j, q, 0 );
785 const double gy = gref( j, q, 1 );
786 const double gz = gref( j, q, 2 );
787 const double g0 = i00 * gx + i01 * gy + i02 * gz;
788 const double g1 = i10 * gx + i11 * gy + i12 * gz;
789 const double g2 = i20 * gx + i21 * gy + i22 * gz;
795 const double ux = vel_q[q][0], uy = vel_q[q][1], uz = vel_q[q][2];
796 const double u_dot_gT = ux * gT0 + uy * gT1 + uz * gT2;
800 const double mass_in_scalar = LumpedMass ? 0.0 : ( double( mass_scaling_ ) * T_hat );
801 const double A_scalar = mass_in_scalar + double( dt_ ) * u_dot_gT;
802 const double dkappa = double( dt_ ) * double( diffusivity_ );
803 const double dtau = double( dt_ ) * tau_wedge * u_dot_gT;
804 const double Bx = dkappa * gT0 + dtau * ux;
805 const double By = dkappa * gT1 + dtau * uy;
806 const double Bz = dkappa * gT2 + dtau * uz;
812 const double pi = phi( i, q );
813 const double gxi = gref( i, q, 0 );
814 const double gyi = gref( i, q, 1 );
815 const double gzi = gref( i, q, 2 );
816 const double gi0 = i00 * gxi + i01 * gyi + i02 * gzi;
817 const double gi1 = i10 * gxi + i11 * gyi + i12 * gzi;
818 const double gi2 = i20 * gxi + i21 * gyi + i22 * gzi;
821 bool is_boundary_i =
false;
822 if constexpr ( TreatBoundary )
824 is_boundary_i = ( at_cmb && i < 3 ) || ( at_surface && i >= 3 );
833 auto add_diagonal_contribution = [&]() {
834 const int ddx = WEDGE_NODE_OFF[w][i][0];
835 const int ddy = WEDGE_NODE_OFF[w][i][1];
836 const int ddr = WEDGE_NODE_OFF[w][i][2];
837 const int nid = node_id( tx + ddx, ty + ddy );
838 const int lvl = lvl0 + ddr;
839 const double Ti = T_sh( nid, lvl );
841 const double u_dot_gi = ux * gi0 + uy * gi1 + uz * gi2;
842 const double diff_ii = double( diffusivity_ ) * ( gi0 * gi0 + gi1 * gi1 + gi2 * gi2 );
843 const double adv_ii = pi * u_dot_gi;
844 const double supg_ii = tau_wedge * u_dot_gi * u_dot_gi;
845 const double A_ii = diff_ii + adv_ii + supg_ii;
847 if constexpr ( LumpedMass )
850 lumped_Mii[i] += wdet * double( mass_scaling_ ) * pi;
851 dst_acc[i] += wdet * double( dt_ ) * A_ii * Ti;
855 const double M_ii_q = double( mass_scaling_ ) * pi * pi;
856 dst_acc[i] += wdet * ( M_ii_q + double( dt_ ) * A_ii ) * Ti;
862 add_diagonal_contribution();
866 if constexpr ( Diagonal )
868 add_diagonal_contribution();
873 const double contrib = wdet * ( pi * A_scalar + gi0 * Bx + gi1 * By + gi2 * Bz );
874 dst_acc[i] += contrib;
875 if constexpr ( LumpedMass )
878 lumped_Mii[i] += wdet * double( mass_scaling_ ) * pi;
886 if constexpr ( LumpedMass )
894 const int ddx = WEDGE_NODE_OFF[w][i][0];
895 const int ddy = WEDGE_NODE_OFF[w][i][1];
896 const int ddr = WEDGE_NODE_OFF[w][i][2];
897 const int nid = node_id( tx + ddx, ty + ddy );
898 const int lvl = lvl0 + ddr;
899 const double Ti = T_sh( nid, lvl );
900 dst_acc[i] += lumped_Mii[i] * Ti;
910 const int rc = r_cell_abs;
914 const int ddx = WEDGE_NODE_OFF[w][i][0];
915 const int ddy = WEDGE_NODE_OFF[w][i][1];
916 const int ddr = WEDGE_NODE_OFF[w][i][2];
917 Kokkos::atomic_add( &dst_( local_subdomain_id, xc + ddx, yc + ddy, rc + ddr ), dst_acc[i] );