#pragma once

/// @file unsteady_advection_diffusion_supg_kerngen.hpp
/// @brief Team-based matrix-free unsteady advection-diffusion with SUPG stabilization,
/// with the fused-arithmetic optimizations transferred from EpsilonDivDivKerngen
/// and DivergenceKerngen/GradientKerngen.
///
/// The math is identical to `UnsteadyAdvectionDiffusionSUPG`, but the 6×6 local mass
/// matrix M and operator matrix A (advection + diffusion + streamline diffusion)
/// are never materialised. The standard assembled application
///
///     dst_i = Σⱼ (M_ij + dt·A_ij) · src_j
///
/// with
///
///     M_ij = Σ_q w_q |det J_q| · m · φ_i(q) · φ_j(q)
///     A_ij = Σ_q w_q |det J_q| · [ κ · ∇φ_i(q)·∇φ_j(q)
///                                  + φ_i(q) · u(q)·∇φ_j(q)
///                                  + τ · (u(q)·∇φ_i(q)) · (u(q)·∇φ_j(q)) ]
///
/// collapses, once the j-sum is pulled inside the quadrature sum, to
///
///     dst_i = Σ_q w_q |det J_q| · { φ_i(q) · A_scalar(q) + ∇φ_i(q) · B_vec(q) }
///
/// where, for each quadrature point,
///
///     T̂(q)        = Σⱼ φ_j(q) · T_j
///     ∇T(q)        = Σⱼ ∇φ_j(q) · T_j
///     u(q)         = Σⱼ φ_j(q) · u_j
///     A_scalar(q)  = m · T̂(q) + dt · (u(q)·∇T(q))
///     B_vec(q)     = dt·κ·∇T(q) + dt·τ·(u(q)·∇T(q)) · u(q)
///
/// τ is kept **exact**: it is volume-averaged over all 6 quadrature points (matching
/// the legacy code) via a pre-pass that reuses the `u(q)` values.
///
/// Dirichlet boundary handling:
///   * **column elimination** — when accumulating T̂(q) and ∇T(q), boundary-node
///     T_j contributions are skipped.
///   * **row elimination** — for a boundary node i, only the diagonal term
///     survives; a separate inline diagonal computation handles this.
///
/// Lumped mass: the mass term is replaced by the row-sum diagonal M_ii · T_i, where
///
///     M_ii = m · Σ_q w_q |det J_q| · φ_i(q)
///
/// (because Σⱼ φ_j ≡ 1 for a partition-of-unity basis).
///
/// Diagonal mode: computes only the diagonal of (M + dt·A).
///
/// Structural optimisations transferred:
///   - Kokkos::TeamPolicy with backend-aware tiling, (4,4,8) × r_passes=2 on CUDA.
///   - Host-side KernelPath dispatch (Slow / Fast) plus `template<bool LumpedMass,
///     bool Diagonal, bool TreatBoundary>` so the compiler dead-code-eliminates
///     unused branches.
///   - LaunchBounds<128, 5>.
///   - Shared-memory staging of coords, radii, T, and velocity.
///   - `ShellBoundaryCommPlan` for halo exchange.
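///
/// Usage sketch: construction takes the same argument list as the legacy operator;
/// `domain`, `grid`, `radii`, `boundary_mask`, `velocity`, `T_old` and `T_new`
/// are placeholder objects assumed to be set up elsewhere.
///
/// @code
/// UnsteadyAdvectionDiffusionSUPGKerngen< double > op(
///     domain, grid, radii, boundary_mask, velocity,
///     /* diffusivity */ 1e-6, /* dt */ 1e-3, /* treat_boundary */ true );
/// op.apply_impl( T_old, T_new ); // T_new = (M + dt·A) · T_old
/// @endcode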

#include "../../quadrature/quadrature.hpp"
#include "fe/wedge/operators/shell/unsteady_advection_diffusion_supg.hpp" // for supg_tau + legacy comparison
#include "dense/vec.hpp"
#include "linalg/operator.hpp"
#include "linalg/vector.hpp"
#include "linalg/vector_q1.hpp"
#include "util/timer.hpp"

namespace terra::fe::wedge::operators::shell
{

// NB: `supg_tau` is defined in the legacy header included above and is reused from
// there. Re-declaring it here would collide with that definition, so this file
// relies on the legacy header being included first.
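//
// For orientation only, a widely used definition of the SUPG stabilization
// parameter is
//
//     Pe = |u| h / (2 κ),                      (element Péclet number)
//     τ  = h / (2 |u|) · ( coth(Pe) − 1/Pe ),  (1D optimal upwind parameter)
//
// regularised as |u| → 0 (the 1e-08 epsilon passed to supg_tau below). This is an
// assumption for orientation, not necessarily the exact formula the legacy
// `supg_tau` implements; that header is authoritative.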

template < typename ScalarT, int VelocityVecDim = 3 >
class UnsteadyAdvectionDiffusionSUPGKerngen
{
  public:
    using SrcVectorType = linalg::VectorQ1Scalar< ScalarT >;
    using DstVectorType = linalg::VectorQ1Scalar< ScalarT >;
    using ScalarType = ScalarT;
    using Team = Kokkos::TeamPolicy<>::member_type;

    enum class KernelPath
    {
        Slow,
        Fast,
    };

  private:
    const grid::shell::DistributedDomain& domain_;

    grid::Grid3DDataVec< ScalarT, 3 > grid_;
    grid::Grid2DDataScalar< ScalarT > radii_;
    grid::Grid4DDataScalar< grid::shell::ShellBoundaryFlag > boundary_mask_;
    linalg::VectorQ1Vec< ScalarT, VelocityVecDim > velocity_;

    ScalarT diffusivity_;
    ScalarT dt_;

    bool treat_boundary_;
    bool diagonal_;
    ScalarT mass_scaling_;
    bool lumped_mass_;

    linalg::OperatorApplyMode operator_apply_mode_;
    linalg::OperatorCommunicationMode operator_communication_mode_;

    terra::communication::shell::SubdomainNeighborhoodSendRecvBuffer< ScalarT, 1 > recv_buffers_;
    terra::communication::shell::ShellBoundaryCommPlan< grid::Grid4DDataScalar< ScalarT > > comm_plan_;

    grid::Grid4DDataScalar< ScalarT > src_;
    grid::Grid4DDataScalar< ScalarT > dst_;
    grid::Grid4DDataVec< ScalarT, VelocityVecDim > vel_grid_;

    int local_subdomains_;
    int hex_lat_;
    int hex_rad_;
    int lat_tile_;
    int r_tile_;
    int r_passes_;
    int r_tile_block_;
    int lat_tiles_;
    int r_tiles_;
    int team_size_;
    int blocks_;

    KernelPath kernel_path_ = KernelPath::Fast;

    void update_kernel_path_flag_host_only()
    {
        if constexpr ( std::is_same_v< Kokkos::DefaultExecutionSpace, Kokkos::Serial > )
        {
            kernel_path_ = KernelPath::Slow;
            return;
        }
        kernel_path_ = KernelPath::Fast;
    }

  public:
    UnsteadyAdvectionDiffusionSUPGKerngen(
        const grid::shell::DistributedDomain& domain,
        const grid::Grid3DDataVec< ScalarT, 3 >& grid,
        const grid::Grid2DDataScalar< ScalarT >& radii,
        const grid::Grid4DDataScalar< grid::shell::ShellBoundaryFlag >& boundary_mask,
        const linalg::VectorQ1Vec< ScalarT, VelocityVecDim >& velocity,
        const ScalarT diffusivity,
        const ScalarT dt,
        bool treat_boundary,
        bool diagonal = false,
        ScalarT mass_scaling = 1.0,
        bool lumped_mass = false,
        linalg::OperatorApplyMode operator_apply_mode = linalg::OperatorApplyMode::Replace,
        linalg::OperatorCommunicationMode operator_communication_mode =
            linalg::OperatorCommunicationMode::CommunicateAdditively )
    : domain_( domain )
    , grid_( grid )
    , radii_( radii )
    , boundary_mask_( boundary_mask )
    , velocity_( velocity )
    , diffusivity_( diffusivity )
    , dt_( dt )
    , treat_boundary_( treat_boundary )
    , diagonal_( diagonal )
    , mass_scaling_( mass_scaling )
    , lumped_mass_( lumped_mass )
    , operator_apply_mode_( operator_apply_mode )
    , operator_communication_mode_( operator_communication_mode )
    , recv_buffers_( domain )
    , comm_plan_( domain )
    {
        const grid::shell::DomainInfo& domain_info = domain_.domain_info();
        local_subdomains_ = domain_.subdomains().size();
        hex_lat_ = domain_info.subdomain_num_nodes_per_side_laterally() - 1;
        hex_rad_ = domain_info.subdomain_num_nodes_radially() - 1;

        if constexpr ( std::is_same_v< Kokkos::DefaultExecutionSpace, Kokkos::Serial > )
        {
            lat_tile_ = 1; r_tile_ = 1; r_passes_ = 1;
        }
#ifdef KOKKOS_ENABLE_OPENMP
        else if constexpr ( std::is_same_v< Kokkos::DefaultExecutionSpace, Kokkos::OpenMP > )
        {
            const int max_team = std::min( Kokkos::OpenMP().concurrency(),
                static_cast< int >( Kokkos::Impl::HostThreadTeamData::max_team_members ) );
            if ( max_team >= 64 )      { lat_tile_ = 4; r_tile_ = 4; r_passes_ = 4; }
            else if ( max_team >= 16 ) { lat_tile_ = 4; r_tile_ = 1; r_passes_ = 16; }
            else                       { lat_tile_ = 1; r_tile_ = 1; r_passes_ = 1; }
        }
#endif
        else
        {
            lat_tile_ = 4; r_tile_ = 8; r_passes_ = 2;
        }
        r_tile_block_ = r_tile_ * r_passes_;
        lat_tiles_ = ( hex_lat_ + lat_tile_ - 1 ) / lat_tile_;
        r_tiles_ = ( hex_rad_ + r_tile_block_ - 1 ) / r_tile_block_;
        team_size_ = lat_tile_ * lat_tile_ * r_tile_;
        blocks_ = local_subdomains_ * lat_tiles_ * lat_tiles_ * r_tiles_;

        update_kernel_path_flag_host_only();

        util::logroot << "[SUPGKerngen] tile=(" << lat_tile_ << "," << lat_tile_ << "," << r_tile_
                      << "), r_passes=" << r_passes_ << ", team=" << team_size_ << ", blocks=" << blocks_
                      << ", path=" << path_name()
                      << ", lumped=" << lumped_mass_ << ", diagonal=" << diagonal_
                      << ", treat_boundary=" << treat_boundary_ << std::endl;
    }
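
    // Worked example (assumed sizes; CUDA backend): with hex_lat_ = 16 and
    // hex_rad_ = 64, the (4,4,8) tile and r_passes_ = 2 give
    //   r_tile_block_ = 8 * 2         = 16
    //   lat_tiles_    = ceil(16 / 4)  = 4
    //   r_tiles_      = ceil(64 / 16) = 4
    //   team_size_    = 4 * 4 * 8     = 128  (consistent with LaunchBounds<128, 5>)
    //   blocks_       = S * 4 * 4 * 4 = 64·S teams for S local subdomains.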
207
208 ScalarT& dt() { return dt_; }
209 const ScalarT& dt() const { return dt_; }
210
211 const char* path_name() const { return kernel_path_ == KernelPath::Slow ? "slow" : "fast"; }
212 KernelPath kernel_path() const { return kernel_path_; }
213 void force_slow_path() { kernel_path_ = KernelPath::Slow; }
214

    void apply_impl( const SrcVectorType& src, DstVectorType& dst )
    {
        util::Timer timer_apply( "ad_supg_apply" );

        if ( operator_apply_mode_ == linalg::OperatorApplyMode::Replace )
            assign( dst, 0 );

        src_ = src.grid_data();
        dst_ = dst.grid_data();
        vel_grid_ = velocity_.grid_data();

        util::Timer timer_kernel( "ad_supg_kernel" );
        Kokkos::TeamPolicy<> policy( blocks_, team_size_ );
        if ( kernel_path_ == KernelPath::Fast )
            policy.set_scratch_size( 0, Kokkos::PerTeam( team_shmem_size( team_size_ ) ) );

        if ( kernel_path_ == KernelPath::Slow )
        {
            Kokkos::parallel_for( "ad_supg_apply_kernel_slow", policy,
                KOKKOS_CLASS_LAMBDA( const Team& team ) { this->run_team_slow( team ); } );
        }
        else
        {
            Kokkos::TeamPolicy< Kokkos::LaunchBounds< 128, 5 > > fast_policy( blocks_, team_size_ );
            fast_policy.set_scratch_size( 0, Kokkos::PerTeam( team_shmem_size( team_size_ ) ) );

            // 2x2x2 = 8 template variants; host-side branch on runtime flags.
            if ( diagonal_ )
            {
                if ( treat_boundary_ && lumped_mass_ )
                    Kokkos::parallel_for( "ad_supg_fast_D_L_TB", fast_policy,
                        KOKKOS_CLASS_LAMBDA( const Team& t ) { this->template run_team_fast< true, true, true >( t ); } );
                else if ( treat_boundary_ )
                    Kokkos::parallel_for( "ad_supg_fast_D_TB", fast_policy,
                        KOKKOS_CLASS_LAMBDA( const Team& t ) { this->template run_team_fast< false, true, true >( t ); } );
                else if ( lumped_mass_ )
                    Kokkos::parallel_for( "ad_supg_fast_D_L", fast_policy,
                        KOKKOS_CLASS_LAMBDA( const Team& t ) { this->template run_team_fast< true, true, false >( t ); } );
                else
                    Kokkos::parallel_for( "ad_supg_fast_D", fast_policy,
                        KOKKOS_CLASS_LAMBDA( const Team& t ) { this->template run_team_fast< false, true, false >( t ); } );
            }
            else
            {
                if ( treat_boundary_ && lumped_mass_ )
                    Kokkos::parallel_for( "ad_supg_fast_L_TB", fast_policy,
                        KOKKOS_CLASS_LAMBDA( const Team& t ) { this->template run_team_fast< true, false, true >( t ); } );
                else if ( treat_boundary_ )
                    Kokkos::parallel_for( "ad_supg_fast_TB", fast_policy,
                        KOKKOS_CLASS_LAMBDA( const Team& t ) { this->template run_team_fast< false, false, true >( t ); } );
                else if ( lumped_mass_ )
                    Kokkos::parallel_for( "ad_supg_fast_L", fast_policy,
                        KOKKOS_CLASS_LAMBDA( const Team& t ) { this->template run_team_fast< true, false, false >( t ); } );
                else
                    Kokkos::parallel_for( "ad_supg_fast", fast_policy,
                        KOKKOS_CLASS_LAMBDA( const Team& t ) { this->template run_team_fast< false, false, false >( t ); } );
            }
        }

        Kokkos::fence();
        timer_kernel.stop();

        if ( operator_communication_mode_ == linalg::OperatorCommunicationMode::CommunicateAdditively )
        {
            util::Timer timer_comm( "ad_supg_comm" );
            terra::communication::shell::send_recv_with_plan( comm_plan_, dst_, recv_buffers_ );
        }
    }

    KOKKOS_INLINE_FUNCTION
    size_t team_shmem_size( const int /*ts*/ ) const
    {
        const int nlev = r_tile_block_ + 1;
        const int n = lat_tile_ + 1;
        const int nxy = n * n;
        // coords_sh(nxy,3) + r_sh(nlev) + T_sh(nxy, nlev) + vel_sh(nxy, 3, nlev)
        const size_t nscalars = size_t( nxy ) * 3 + size_t( nlev ) + size_t( nxy ) * nlev + size_t( nxy ) * 3 * nlev;
        // The fast path stages all scratch data as double (see run_team_fast), so the
        // allocation is sized in doubles; sizing by ScalarType would under-allocate
        // for single precision.
        return sizeof( double ) * nscalars;
    }
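
    // Worked example (CUDA tile, double precision): lat_tile_ = 4 and
    // r_tile_block_ = 16 give n = 5, nxy = 25, nlev = 17, hence
    //   nscalars = 25*3 + 17 + 25*17 + 25*3*17 = 75 + 17 + 425 + 1275 = 1792
    // scalars, i.e. 1792 * 8 B = 14336 B = 14 KiB of per-team scratch.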

  private:
    KOKKOS_INLINE_FUNCTION
    void decode_team_indices(
        const Team& team,
        int& local_subdomain_id,
        int& x0, int& y0, int& r0,
        int& tx, int& ty, int& tr,
        int& x_cell, int& y_cell, int& r_cell ) const
    {
        int tmp = team.league_rank();
        const int r_tile_id = tmp % r_tiles_;
        tmp /= r_tiles_;
        const int lat_y_id = tmp % lat_tiles_;
        tmp /= lat_tiles_;
        const int lat_x_id = tmp % lat_tiles_;
        tmp /= lat_tiles_;
        local_subdomain_id = tmp;

        x0 = lat_x_id * lat_tile_;
        y0 = lat_y_id * lat_tile_;
        r0 = r_tile_id * r_tile_block_;

        const int tid = team.team_rank();
        tr = tid % r_tile_;
        tx = ( tid / r_tile_ ) % lat_tile_;
        ty = tid / ( r_tile_ * lat_tile_ );

        x_cell = x0 + tx;
        y_cell = y0 + ty;
        r_cell = r0 + tr;
    }
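
    // Example decode (assuming lat_tiles_ = 4, r_tiles_ = 4, lat_tile_ = 4,
    // r_tile_ = 8, r_tile_block_ = 16): league_rank 57 unpacks to r_tile_id = 1,
    // lat_y_id = 2, lat_x_id = 3, local_subdomain_id = 0, so x0 = 12, y0 = 8,
    // r0 = 16; team_rank 37 gives tr = 5, tx = 0, ty = 1, so this thread owns
    // cell (x, y, r) = (12, 9, 21) of subdomain 0 in the first radial pass.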

    // ----------------------------------------------------------
    // Slow (reference) path — team wrapper that calls the legacy per-cell kernel.
    // Bit-identical to UnsteadyAdvectionDiffusionSUPG::operator().
    // ----------------------------------------------------------
    KOKKOS_INLINE_FUNCTION
    void run_team_slow( const Team& team ) const
    {
        int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
        decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );

        if ( tr >= r_tile_ )
            return;

        for ( int pass = 0; pass < r_passes_; ++pass )
        {
            const int r_cell_pass = r0 + pass * r_tile_ + tr;
            if ( r_cell_pass >= hex_rad_ )
                break;
            if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ )
                continue;

            process_cell_legacy( local_subdomain_id, x_cell, y_cell, r_cell_pass );
        }
    }

    // Per-cell legacy math (replica of UnsteadyAdvectionDiffusionSUPG::operator()).
    KOKKOS_INLINE_FUNCTION
    void process_cell_legacy( int s, int x_cell, int y_cell, int r_cell ) const
    {
        dense::Vec< ScalarT, 3 > wedge_phy_surf[num_wedges_per_hex_cell][num_nodes_per_wedge_surface];
        wedge_surface_physical_coords( wedge_phy_surf, grid_, s, x_cell, y_cell );

        const ScalarT r_1 = radii_( s, r_cell );
        const ScalarT r_2 = radii_( s, r_cell + 1 );

        constexpr auto num_quad_points = quadrature::quad_felippa_3x2_num_quad_points;
        dense::Vec< ScalarT, 3 > quad_points[num_quad_points];
        ScalarT quad_weights[num_quad_points];
        quadrature::quad_felippa_3x2_quad_points( quad_points );
        quadrature::quad_felippa_3x2_quad_weights( quad_weights );

        dense::Vec< ScalarT, VelocityVecDim > vel_interp[num_wedges_per_hex_cell][num_quad_points] = {};
        dense::Vec< ScalarT, 6 > vel_coeffs[VelocityVecDim][num_wedges_per_hex_cell];
        for ( int d = 0; d < VelocityVecDim; ++d )
            extract_local_wedge_vector_coefficients( vel_coeffs[d], s, x_cell, y_cell, r_cell, d, vel_grid_ );

        for ( int wedge = 0; wedge < num_wedges_per_hex_cell; ++wedge )
            for ( int q = 0; q < num_quad_points; ++q )
                for ( int i = 0; i < num_nodes_per_wedge; ++i )
                {
                    const auto shape_i = shape( i, quad_points[q] );
                    for ( int d = 0; d < VelocityVecDim; ++d )
                        vel_interp[wedge][q]( d ) += vel_coeffs[d][wedge]( i ) * shape_i;
                }

        ScalarT streamline_diffusivity[num_wedges_per_hex_cell];
        const auto h = r_2 - r_1;
        for ( int wedge = 0; wedge < num_wedges_per_hex_cell; ++wedge )
        {
            ScalarT tau_accum = 0.0, waccum = 0.0;
            for ( int q = 0; q < num_quad_points; ++q )
            {
                const auto& uq = vel_interp[wedge][q];
                const ScalarT vel_norm_q = uq.norm();
                const ScalarT tau_q = supg_tau< ScalarT >( vel_norm_q, diffusivity_, h, 1e-08 );
                tau_accum += tau_q * quad_weights[q];
                waccum += quad_weights[q];
            }
            streamline_diffusivity[wedge] = ( waccum > 0.0 ) ? ( tau_accum / waccum ) : 0.0;
        }

        dense::Mat< ScalarT, 6, 6 > A[num_wedges_per_hex_cell] = {};
        dense::Mat< ScalarT, 6, 6 > M[num_wedges_per_hex_cell] = {};

        for ( int q = 0; q < num_quad_points; ++q )
        {
            const auto w = quad_weights[q];
            for ( int wedge = 0; wedge < num_wedges_per_hex_cell; ++wedge )
            {
                const auto J = jac( wedge_phy_surf[wedge], r_1, r_2, quad_points[q] );
                const auto det = Kokkos::abs( J.det() );
                const auto J_inv_transposed = J.inv().transposed();
                const auto vel = vel_interp[wedge][q];

                for ( int i = 0; i < num_nodes_per_wedge; ++i )
                {
                    const auto shape_i = shape( i, quad_points[q] );
                    const auto grad_i = J_inv_transposed * grad_shape( i, quad_points[q] );
                    for ( int j = 0; j < num_nodes_per_wedge; ++j )
                    {
                        const auto shape_j = shape( j, quad_points[q] );
                        const auto grad_j = J_inv_transposed * grad_shape( j, quad_points[q] );

                        const auto mass = shape_i * shape_j;
                        const auto diffusion = diffusivity_ * grad_i.dot( grad_j );
                        const auto advection = vel.dot( grad_j ) * shape_i;
                        const auto streamline = streamline_diffusivity[wedge] * vel.dot( grad_j ) * vel.dot( grad_i );

                        M[wedge]( i, j ) += w * mass_scaling_ * mass * det;
                        A[wedge]( i, j ) += w * dt_ * ( diffusion + advection + streamline ) * det;
                    }
                }
            }
        }

        if ( lumped_mass_ )
        {
            dense::Vec< ScalarT, 6 > ones;
            ones.fill( 1.0 );
            M[0] = dense::Mat< ScalarT, 6, 6 >::diagonal_from_vec( M[0] * ones );
            M[1] = dense::Mat< ScalarT, 6, 6 >::diagonal_from_vec( M[1] * ones );
        }

        if ( treat_boundary_ )
        {
            const bool at_cmb_boundary = util::has_flag(
                boundary_mask_( s, x_cell, y_cell, r_cell ), grid::shell::ShellBoundaryFlag::CMB );
            const bool at_surface_boundary = util::has_flag(
                boundary_mask_( s, x_cell, y_cell, r_cell + 1 ), grid::shell::ShellBoundaryFlag::SURFACE );

            for ( int wedge = 0; wedge < num_wedges_per_hex_cell; ++wedge )
            {
                dense::Mat< ScalarT, 6, 6 > boundary_mask;
                boundary_mask.fill( 1.0 );
                if ( at_cmb_boundary )
                    for ( int i = 0; i < 6; ++i )
                        for ( int j = 0; j < 6; ++j )
                            if ( i != j && ( i < 3 || j < 3 ) )
                                boundary_mask( i, j ) = 0.0;
                if ( at_surface_boundary )
                    for ( int i = 0; i < 6; ++i )
                        for ( int j = 0; j < 6; ++j )
                            if ( i != j && ( i >= 3 || j >= 3 ) )
                                boundary_mask( i, j ) = 0.0;
                M[wedge].hadamard_product( boundary_mask );
                A[wedge].hadamard_product( boundary_mask );
            }
        }

        if ( diagonal_ )
        {
            M[0] = M[0].diagonal();
            M[1] = M[1].diagonal();
            A[0] = A[0].diagonal();
            A[1] = A[1].diagonal();
        }

        dense::Vec< ScalarT, 6 > src[num_wedges_per_hex_cell];
        extract_local_wedge_scalar_coefficients( src, s, x_cell, y_cell, r_cell, src_ );

        dense::Vec< ScalarT, 6 > dst[num_wedges_per_hex_cell];
        dst[0] = ( M[0] + A[0] ) * src[0];
        dst[1] = ( M[1] + A[1] ) * src[1];

        atomically_add_local_wedge_scalar_coefficients( dst_, s, x_cell, y_cell, r_cell, dst );
    }

    // ----------------------------------------------------------
    // Fast path: fused arithmetic, shmem-cached inputs.
    // Templated on LumpedMass, Diagonal, TreatBoundary so the compiler
    // dead-eliminates inapplicable branches.
    // ----------------------------------------------------------
    template < bool LumpedMass, bool Diagonal, bool TreatBoundary >
    KOKKOS_INLINE_FUNCTION
    void run_team_fast( const Team& team ) const
    {
        int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
        decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );

        const int nlev = r_tile_block_ + 1;
        const int n = lat_tile_ + 1;
        const int nxy = n * n;

        double* shmem =
            reinterpret_cast< double* >( team.team_shmem().get_shmem( team_shmem_size( team.team_size() ) ) );

        using ScratchCoords =
            Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
        using ScratchR =
            Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
        using ScratchT =
            Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
        using ScratchVel =
            Kokkos::View< double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;

        ScratchCoords coords_sh( shmem, nxy, 3 );
        shmem += nxy * 3;
        ScratchR r_sh( shmem, nlev );
        shmem += nlev;
        ScratchT T_sh( shmem, nxy, nlev );
        shmem += nxy * nlev;
        ScratchVel vel_sh( shmem, nxy, 3, nlev );

        auto node_id = [&]( int nx, int ny ) -> int { return nx + n * ny; };

        // Preload coords, radii, T, velocity.
        Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&]( int id ) {
            const int dxn = id % n;
            const int dyn = id / n;
            const int xi = x0 + dxn;
            const int yi = y0 + dyn;
            if ( xi <= hex_lat_ && yi <= hex_lat_ )
            {
                coords_sh( id, 0 ) = grid_( local_subdomain_id, xi, yi, 0 );
                coords_sh( id, 1 ) = grid_( local_subdomain_id, xi, yi, 1 );
                coords_sh( id, 2 ) = grid_( local_subdomain_id, xi, yi, 2 );
            }
            else
            {
                coords_sh( id, 0 ) = coords_sh( id, 1 ) = coords_sh( id, 2 ) = 0.0;
            }
        } );
        Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&]( int lvl ) {
            const int rr = r0 + lvl;
            r_sh( lvl ) = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
        } );
        const int total = nxy * nlev;
        Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total ), [&]( int t ) {
            const int node = t / nlev;
            const int lvl = t - node * nlev;
            const int dxn = node % n;
            const int dyn = node / n;
            const int xi = x0 + dxn;
            const int yi = y0 + dyn;
            const int rr = r0 + lvl;
            if ( xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_ )
            {
                T_sh( node, lvl ) = src_( local_subdomain_id, xi, yi, rr );
                vel_sh( node, 0, lvl ) = vel_grid_( local_subdomain_id, xi, yi, rr, 0 );
                vel_sh( node, 1, lvl ) = vel_grid_( local_subdomain_id, xi, yi, rr, 1 );
                vel_sh( node, 2, lvl ) = vel_grid_( local_subdomain_id, xi, yi, rr, 2 );
            }
            else
            {
                T_sh( node, lvl ) = 0.0;
                vel_sh( node, 0, lvl ) = vel_sh( node, 1, lvl ) = vel_sh( node, 2, lvl ) = 0.0;
            }
        } );

        team.team_barrier();

        if ( tr >= r_tile_ )
            return;
        if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ )
            return;

        // Wedge-node offset table.
        constexpr int WEDGE_NODE_OFF[2][6][3] = {
            { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
            { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };
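
        // The two wedges split each lateral quad along its diagonal; the offset
        // table above encodes (assumed orientation, matching v0/v1/v2 below):
        //
        //     n01 ---- n11      wedge 0: surface triangle (n00, n10, n01),
        //      |  w1  / |                extruded radially (ddr = 0 lower,
        //      |    /   |                                   ddr = 1 upper)
        //      |  /  w0 |      wedge 1: surface triangle (n11, n01, n10),
        //     n00 ---- n10              extruded the same way.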

        // Felippa 3x2 quadrature points & weights, hoisted for compile-time propagation.
        constexpr int NQ = 6;
        constexpr double QUAD_W = 1.0 / 6.0;
        // (xi, eta, zeta):
        // points 0..2 share zeta = -1/sqrt(3); points 3..5 share zeta = +1/sqrt(3).
        constexpr double ONE_OVER_SQRT3 = 0.5773502691896257;
        // Lateral barycentric coordinates per quadrature point: (1 - xi - eta, xi, eta).
        constexpr double BARY[NQ][3] = {
            { 1.0 / 6.0, 2.0 / 3.0, 1.0 / 6.0 }, // q=0: xi=2/3, eta=1/6
            { 1.0 / 6.0, 1.0 / 6.0, 2.0 / 3.0 }, // q=1: xi=1/6, eta=2/3
            { 2.0 / 3.0, 1.0 / 6.0, 1.0 / 6.0 }, // q=2: xi=1/6, eta=1/6
            { 1.0 / 6.0, 2.0 / 3.0, 1.0 / 6.0 }, // q=3: same lateral point as q=0
            { 1.0 / 6.0, 1.0 / 6.0, 2.0 / 3.0 }, // q=4: same lateral point as q=1
            { 2.0 / 3.0, 1.0 / 6.0, 1.0 / 6.0 }  // q=5: same lateral point as q=2
        };
        // Zeta half-width factor: grad_shape_rad returns ±0.5; shape_rad(j, zeta) = (1 ∓ zeta)/2.
        //   j <  3 (lower layer): shape_rad = (1 - zeta)/2
        //   j >= 3 (upper layer): shape_rad = (1 + zeta)/2
        constexpr double ZETA_Q[NQ] = {
            -ONE_OVER_SQRT3, -ONE_OVER_SQRT3, -ONE_OVER_SQRT3,
            +ONE_OVER_SQRT3, +ONE_OVER_SQRT3, +ONE_OVER_SQRT3 };

        // PHI[j][q] = shape_lat(j, xi, eta) * shape_rad(j, zeta)
        //   shape_lat(j=0|3, xi, eta) = 1 - xi - eta = BARY[q][0]
        //   shape_lat(j=1|4, ...)     = xi           = BARY[q][1]
        //   shape_lat(j=2|5, ...)     = eta          = BARY[q][2]
        //   shape_rad(j<3,  zeta)     = 0.5 (1 - zeta)
        //   shape_rad(j>=3, zeta)     = 0.5 (1 + zeta)
        auto phi = [&]( int j, int q ) -> double {
            const double sl = BARY[q][j % 3];
            const double sr = ( j < 3 ) ? 0.5 * ( 1.0 - ZETA_Q[q] ) : 0.5 * ( 1.0 + ZETA_Q[q] );
            return sl * sr;
        };
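
        // Sanity check: for every q,
        //   Σⱼ phi(j, q) = (BARY[q][0] + BARY[q][1] + BARY[q][2])
        //                  · ( 0.5·(1 − ζ_q) + 0.5·(1 + ζ_q) ) = 1 · 1 = 1,
        // i.e. the basis is a partition of unity, which is exactly the property
        // the lumped-mass row-sum diagonal relies on.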
        // grad_shape_lat_xi(j=0|3)  = -1, (1|4) = +1, (2|5) = 0
        // grad_shape_lat_eta(j=0|3) = -1, (1|4) = 0,  (2|5) = +1
        // grad_shape_rad(j<3) = -0.5, (j>=3) = +0.5
        // grad_shape(j, xi, eta, zeta)
        //   = ( grad_lat_xi(j)  * shape_rad(j, zeta),
        //       grad_lat_eta(j) * shape_rad(j, zeta),
        //       shape_lat(j, xi, eta) * grad_shape_rad(j) )
        auto gref = [&]( int j, int q, int d ) -> double {
            const double sr = ( j < 3 ) ? 0.5 * ( 1.0 - ZETA_Q[q] ) : 0.5 * ( 1.0 + ZETA_Q[q] );
            const double grad_rad = ( j < 3 ) ? -0.5 : 0.5;
            const int jmod = j % 3;
            const double glat_xi = ( jmod == 0 ) ? -1.0 : ( jmod == 1 ) ? 1.0 : 0.0;
            const double glat_eta = ( jmod == 0 ) ? -1.0 : ( jmod == 1 ) ? 0.0 : 1.0;
            const double sl = BARY[q][jmod];
            if ( d == 0 ) return glat_xi * sr;
            if ( d == 1 ) return glat_eta * sr;
            return sl * grad_rad;
        };

        // Node neighbours for the Jacobian surface-coord gather (lateral only).
        const int n00 = node_id( tx, ty );
        const int n01 = node_id( tx, ty + 1 );
        const int n10 = node_id( tx + 1, ty );
        const int n11 = node_id( tx + 1, ty + 1 );

        // Per-pass radial loop.
        for ( int pass = 0; pass < r_passes_; ++pass )
        {
            const int lvl0 = pass * r_tile_ + tr;
            const int r_cell_abs = r0 + lvl0;
            if ( r_cell_abs >= hex_rad_ )
                break;

            const double r_lower = r_sh( lvl0 );
            const double r_upper = r_sh( lvl0 + 1 );
            const double half_dr = 0.5 * ( r_upper - r_lower );
            const double r_mid = 0.5 * ( r_lower + r_upper );
            const double h_cell = r_upper - r_lower;

            // Boundary flags (determine Dirichlet row/col elimination masks).
            bool at_cmb = false, at_surface = false;
            if constexpr ( TreatBoundary )
            {
                at_cmb = util::has_flag( boundary_mask_( local_subdomain_id, x_cell, y_cell, r_cell_abs ),
                                         grid::shell::ShellBoundaryFlag::CMB );
                at_surface = util::has_flag( boundary_mask_( local_subdomain_id, x_cell, y_cell, r_cell_abs + 1 ),
                                             grid::shell::ShellBoundaryFlag::SURFACE );
            }

            // Dirichlet treatment helper: node j is a "boundary node" if
            //   at_cmb     AND ddr_j == 0 (inner layer)  -> j in {0,1,2}
            //   at_surface AND ddr_j == 1 (outer layer)  -> j in {3,4,5}
            // We keep the diagonal (j == i), so row elimination is handled in the scatter branch.
            auto boundary_j_col = [&]( int j ) -> bool {
                if constexpr ( !TreatBoundary ) return false;
                return ( at_cmb && j < 3 ) || ( at_surface && j >= 3 );
            };

            for ( int w = 0; w < num_wedges_per_hex_cell; ++w )
            {
                // Wedge surface vertices in the lateral plane (shmem lookup).
                const int v0 = ( w == 0 ? n00 : n11 );
                const int v1 = ( w == 0 ? n10 : n01 );
                const int v2 = ( w == 0 ? n01 : n10 );

                // Surface coords P0, P1, P2 (3 doubles each).
                const double P0[3] = { coords_sh( v0, 0 ), coords_sh( v0, 1 ), coords_sh( v0, 2 ) };
                const double P1[3] = { coords_sh( v1, 0 ), coords_sh( v1, 1 ), coords_sh( v1, 2 ) };
                const double P2[3] = { coords_sh( v2, 0 ), coords_sh( v2, 1 ), coords_sh( v2, 2 ) };

                // Differences, hoisted (used at every quadrature point).
                const double dP1_P0[3] = { P1[0] - P0[0], P1[1] - P0[1], P1[2] - P0[2] };
                const double dP2_P0[3] = { P2[0] - P0[0], P2[1] - P0[1], P2[2] - P0[2] };

                // Pre-pass: interpolate the velocity at each quad point and compute tau_wedge.
                // vel_q[q][d] is cached for reuse in the main pass and the boundary-diagonal pass.
                double vel_q[NQ][3] = { { 0.0 } };
                {
                    double tau_sum = 0.0;
                    double w_sum = 0.0;
                    for ( int q = 0; q < NQ; ++q )
                    {
                        double ux = 0.0, uy = 0.0, uz = 0.0;
#pragma unroll
                        for ( int j = 0; j < num_nodes_per_wedge; ++j )
                        {
                            const int ddx = WEDGE_NODE_OFF[w][j][0];
                            const int ddy = WEDGE_NODE_OFF[w][j][1];
                            const int ddr = WEDGE_NODE_OFF[w][j][2];
                            const int nid = node_id( tx + ddx, ty + ddy );
                            const int lvl = lvl0 + ddr;
                            const double p = phi( j, q );
                            ux += p * vel_sh( nid, 0, lvl );
                            uy += p * vel_sh( nid, 1, lvl );
                            uz += p * vel_sh( nid, 2, lvl );
                        }
                        vel_q[q][0] = ux;
                        vel_q[q][1] = uy;
                        vel_q[q][2] = uz;

                        const double vn = Kokkos::sqrt( ux * ux + uy * uy + uz * uz );
                        const double tauq = supg_tau< double >( vn, double( diffusivity_ ), h_cell, 1e-08 );
                        tau_sum += tauq * QUAD_W;
                        w_sum += QUAD_W;
                    }
                    // Volume-averaged tau (unchanged from the legacy code).
                    const double tau_wedge = ( w_sum > 0.0 ) ? ( tau_sum / w_sum ) : 0.0;

                    // Per-i accumulators for the 6 output nodes of this wedge.
                    double dst_acc[num_nodes_per_wedge] = { 0.0 };
                    // Lumped-mass diagonal accumulator (per output node i); used only if LumpedMass.
                    double lumped_Mii[num_nodes_per_wedge] = { 0.0 };

                    // Main fused loop over quad points.
                    for ( int q = 0; q < NQ; ++q )
                    {
                        // Assemble J at this quad point:
                        //   J_*_0 = r(zeta_q) * dP1_P0
                        //   J_*_1 = r(zeta_q) * dP2_P0
                        //   J_*_2 = half_dr * (BARY[q][0]*P0 + BARY[q][1]*P1 + BARY[q][2]*P2)
                        const double r_at_q = r_mid + ZETA_Q[q] * half_dr;
                        const double b0 = BARY[q][0], b1 = BARY[q][1], b2 = BARY[q][2];

                        const double J00 = r_at_q * dP1_P0[0];
                        const double J10 = r_at_q * dP1_P0[1];
                        const double J20 = r_at_q * dP1_P0[2];
                        const double J01 = r_at_q * dP2_P0[0];
                        const double J11 = r_at_q * dP2_P0[1];
                        const double J21 = r_at_q * dP2_P0[2];
                        const double J02 = half_dr * ( b0 * P0[0] + b1 * P1[0] + b2 * P2[0] );
                        const double J12 = half_dr * ( b0 * P0[1] + b1 * P1[1] + b2 * P2[1] );
                        const double J22 = half_dr * ( b0 * P0[2] + b1 * P1[2] + b2 * P2[2] );

                        const double J_det = J00 * J11 * J22 - J00 * J12 * J21
                                           - J01 * J10 * J22 + J01 * J12 * J20
                                           + J02 * J10 * J21 - J02 * J11 * J20;
                        const double abs_det = Kokkos::abs( J_det );
                        const double inv_det = 1.0 / J_det;

                        // invJ_T entries (row d of J^{-T} dotted with ∇_ref gives physical gradient component d).
                        const double i00 = inv_det * ( J11 * J22 - J12 * J21 );
                        const double i01 = inv_det * ( -J10 * J22 + J12 * J20 );
                        const double i02 = inv_det * ( J10 * J21 - J11 * J20 );
                        const double i10 = inv_det * ( -J01 * J22 + J02 * J21 );
                        const double i11 = inv_det * ( J00 * J22 - J02 * J20 );
                        const double i12 = inv_det * ( -J00 * J21 + J01 * J20 );
                        const double i20 = inv_det * ( J01 * J12 - J02 * J11 );
                        const double i21 = inv_det * ( -J00 * J12 + J02 * J10 );
                        const double i22 = inv_det * ( J00 * J11 - J01 * J10 );

                        const double wdet = QUAD_W * abs_det;

                        // Gather: T̂(q), ∇T(q). Column elimination at Dirichlet boundary nodes.
                        double T_hat = 0.0;
                        double gT0 = 0.0, gT1 = 0.0, gT2 = 0.0;
#pragma unroll
                        for ( int j = 0; j < num_nodes_per_wedge; ++j )
                        {
                            if constexpr ( TreatBoundary )
                                if ( boundary_j_col( j ) )
                                    continue;

                            const int ddx = WEDGE_NODE_OFF[w][j][0];
                            const int ddy = WEDGE_NODE_OFF[w][j][1];
                            const int ddr = WEDGE_NODE_OFF[w][j][2];
                            const int nid = node_id( tx + ddx, ty + ddy );
                            const int lvl = lvl0 + ddr;
                            const double Tj = T_sh( nid, lvl );

                            const double pj = phi( j, q );
                            T_hat += pj * Tj;

                            const double gx = gref( j, q, 0 );
                            const double gy = gref( j, q, 1 );
                            const double gz = gref( j, q, 2 );
                            const double g0 = i00 * gx + i01 * gy + i02 * gz;
                            const double g1 = i10 * gx + i11 * gy + i12 * gz;
                            const double g2 = i20 * gx + i21 * gy + i22 * gz;
                            gT0 += g0 * Tj;
                            gT1 += g1 * Tj;
                            gT2 += g2 * Tj;
                        }

                        const double ux = vel_q[q][0], uy = vel_q[q][1], uz = vel_q[q][2];
                        const double u_dot_gT = ux * gT0 + uy * gT1 + uz * gT2;

                        // A_scalar and B_vec, with LumpedMass handling.
                        // For LumpedMass: exclude the mass term from A_scalar (handled in the diagonal accumulator below).
                        const double mass_in_scalar = LumpedMass ? 0.0 : ( double( mass_scaling_ ) * T_hat );
                        const double A_scalar = mass_in_scalar + double( dt_ ) * u_dot_gT;
                        const double dkappa = double( dt_ ) * double( diffusivity_ );
                        const double dtau = double( dt_ ) * tau_wedge * u_dot_gT;
                        const double Bx = dkappa * gT0 + dtau * ux;
                        const double By = dkappa * gT1 + dtau * uy;
                        const double Bz = dkappa * gT2 + dtau * uz;

                        // Scatter-accumulate into each output node's dst_acc.
#pragma unroll
                        for ( int i = 0; i < num_nodes_per_wedge; ++i )
                        {
                            const double pi = phi( i, q );
                            const double gxi = gref( i, q, 0 );
                            const double gyi = gref( i, q, 1 );
                            const double gzi = gref( i, q, 2 );
                            const double gi0 = i00 * gxi + i01 * gyi + i02 * gzi;
                            const double gi1 = i10 * gxi + i11 * gyi + i12 * gzi;
                            const double gi2 = i20 * gxi + i21 * gyi + i22 * gzi;

                            // Boundary row: only the i == j diagonal term contributes.
                            bool is_boundary_i = false;
                            if constexpr ( TreatBoundary )
                            {
                                is_boundary_i = ( at_cmb && i < 3 ) || ( at_surface && i >= 3 );
                            }

                            // Diagonal accumulators (used when is_boundary_i or Diagonal == true).
                            // When LumpedMass is true, the mass contribution uses the row-sum form
                            //   M_ii_lumped = m · Σ_q wdet · φ_i(q)          (since Σⱼ φ_j ≡ 1)
                            // When LumpedMass is false, the mass diagonal is the unlumped one:
                            //   M_ii        = m · Σ_q wdet · φ_i(q) · φ_i(q)
                            // A's diagonal (diff + adv + supg) is identical for lumped and unlumped.
                            auto add_diagonal_contribution = [&]() {
                                const int ddx = WEDGE_NODE_OFF[w][i][0];
                                const int ddy = WEDGE_NODE_OFF[w][i][1];
                                const int ddr = WEDGE_NODE_OFF[w][i][2];
                                const int nid = node_id( tx + ddx, ty + ddy );
                                const int lvl = lvl0 + ddr;
                                const double Ti = T_sh( nid, lvl );

                                const double u_dot_gi = ux * gi0 + uy * gi1 + uz * gi2;
                                const double diff_ii = double( diffusivity_ ) * ( gi0 * gi0 + gi1 * gi1 + gi2 * gi2 );
                                const double adv_ii = pi * u_dot_gi;
                                const double supg_ii = tau_wedge * u_dot_gi * u_dot_gi;
                                const double A_ii = diff_ii + adv_ii + supg_ii;

                                if constexpr ( LumpedMass )
                                {
                                    // Row-sum lumped mass (partition of unity).
                                    lumped_Mii[i] += wdet * double( mass_scaling_ ) * pi;
                                    dst_acc[i] += wdet * double( dt_ ) * A_ii * Ti;
                                }
                                else
                                {
                                    const double M_ii_q = double( mass_scaling_ ) * pi * pi;
                                    dst_acc[i] += wdet * ( M_ii_q + double( dt_ ) * A_ii ) * Ti;
                                }
                            };

                            if ( is_boundary_i )
                            {
                                add_diagonal_contribution();
                            }
                            else
                            {
                                if constexpr ( Diagonal )
                                {
                                    add_diagonal_contribution();
                                }
                                else
                                {
                                    // Standard fused interior contribution.
                                    const double contrib = wdet * ( pi * A_scalar + gi0 * Bx + gi1 * By + gi2 * Bz );
                                    dst_acc[i] += contrib;
                                    if constexpr ( LumpedMass )
                                    {
                                        // Row-sum lumped mass (partition of unity).
                                        lumped_Mii[i] += wdet * double( mass_scaling_ ) * pi;
                                    }
                                }
                            }
                        }
                    } // end quad loop

                    // Apply lumped mass: dst_i += M_ii_lumped · T_i
                    if constexpr ( LumpedMass )
                    {
#pragma unroll
                        for ( int i = 0; i < num_nodes_per_wedge; ++i )
                        {
                            // For interior and boundary nodes alike, lumped_Mii[i] holds the
                            // row-sum diagonal Σ_q wdet · m · φ_i(q) (both accumulation sites
                            // above use the same form); it multiplies T_i here.
                            const int ddx = WEDGE_NODE_OFF[w][i][0];
                            const int ddy = WEDGE_NODE_OFF[w][i][1];
                            const int ddr = WEDGE_NODE_OFF[w][i][2];
                            const int nid = node_id( tx + ddx, ty + ddy );
                            const int lvl = lvl0 + ddr;
                            const double Ti = T_sh( nid, lvl );
                            dst_acc[i] += lumped_Mii[i] * Ti;
                        }
                    }

                    // Final atomic scatter to global dst_.
                    // The shared helper atomically_add_local_wedge_scalar_coefficients expects a
                    // [2][6] buffer holding both wedges at once; since this kernel accumulates
                    // one wedge at a time, it emits the atomics inline instead of materialising
                    // that buffer.
                    const int xc = x_cell;
                    const int yc = y_cell;
                    const int rc = r_cell_abs;
#pragma unroll
                    for ( int i = 0; i < num_nodes_per_wedge; ++i )
                    {
                        const int ddx = WEDGE_NODE_OFF[w][i][0];
                        const int ddy = WEDGE_NODE_OFF[w][i][1];
                        const int ddr = WEDGE_NODE_OFF[w][i][2];
                        Kokkos::atomic_add( &dst_( local_subdomain_id, xc + ddx, yc + ddy, rc + ddr ), dst_acc[i] );
                    }
                }
            } // end wedge loop
        } // end pass loop
    }
};

static_assert( linalg::OperatorLike< UnsteadyAdvectionDiffusionSUPGKerngen< double > > );
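
// Consistency-check sketch: the slow path replicates the legacy per-cell kernel,
// so the fused fast path can be validated against it on identical inputs.
// `T_in`, `out_fast` and `out_slow` are placeholder vectors; compare with a
// tolerance, because the fused path reassociates floating-point sums.
//
//   UnsteadyAdvectionDiffusionSUPGKerngen< double > op( /* as in the usage sketch */ );
//   op.apply_impl( T_in, out_fast );  // default path: KernelPath::Fast
//   op.force_slow_path();
//   op.apply_impl( T_in, out_slow );  // legacy reference math
//   // expect || out_fast - out_slow || / || out_slow || at machine-epsilon level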

} // namespace terra::fe::wedge::operators::shell