Loading...
Searching...
No Matches
gradient_kerngen.hpp
Go to the documentation of this file.
1#pragma once
2
3/// @file gradient_kerngen.hpp
4/// @brief Team-based matrix-free Gradient operator for the spherical shell,
5/// with the same fused-arithmetic optimisations as those applied in
6/// `DivergenceKerngen` / `EpsilonDivDivKerngen`.
7///
8/// Structure mirrors `DivergenceKerngen` — but Gradient is the transpose of
9/// Divergence:
10/// - src: scalar pressure on the coarse grid (Grid4DDataScalar)
11/// - dst: vec3 velocity on the fine grid (Grid4DDataVec<..,3>)
12///
13/// Fused arithmetic derivation. The original element-matrix form is
14/// A[wedge](d·6+i, j) = -qw · |det J| · (J⁻ᵀ ∇N_i)_d · shape_coarse_j
15/// so
16/// dst_{d,i} = Σⱼ A(d·6+i, j) · p_j
17/// = -qw · |det J| · (J⁻ᵀ ∇N_i)_d · Σⱼ shape_coarse_j · p_j
18/// = -qw · |det J| · (J⁻ᵀ ∇N_i)_d · p_interp,
19/// where p_interp is the interpolated coarse pressure at the quadrature point.
20/// We never materialise A.
21///
22/// Per wedge:
23/// 1. Compute J, |det J|, J⁻¹.
24/// 2. Compute p_interp = Σⱼ shape_coarse_j(q) · p_j (6 muls).
25/// 3. prefactor = -qw · |det J| · p_interp.
26/// 4. For each fine node i ∈ 0..5:
27/// g = J⁻ᵀ · dN_ref_i (9 FMAs)
28/// contribution_d = prefactor · g_d (3 muls)
29/// Dirichlet: skip boundary-node scatter.
30/// Freeslip: project contribution onto tangent plane at boundary nodes.
31/// atomic_add into dst_(s, x+ddx, y+ddy, r+ddr, d) for d=0,1,2.
32///
33/// The freeslip projection uses the fact that on a spherical shell the outward
34/// normal at a lateral node equals the unit-sphere coord already cached in
35/// coords_sh — same trick as DivergenceKerngen, applied to the output side here.
36
37#include "../../quadrature/quadrature.hpp"
40#include "dense/vec.hpp"
44#include "linalg/operator.hpp"
46#include "linalg/vector.hpp"
47#include "linalg/vector_q1.hpp"
48#include "util/timer.hpp"
49
51
62
63template < typename ScalarT >
65{
66 public:
69 using ScalarType = ScalarT;
70 using Team = Kokkos::TeamPolicy<>::member_type;
71
72 enum class KernelPath
73 {
74 Slow,
77 };
78
79 private:
81 grid::shell::DistributedDomain domain_coarse_;
82
86
87 BoundaryConditions bcs_;
88
89 linalg::OperatorApplyMode operator_apply_mode_;
90 linalg::OperatorCommunicationMode operator_communication_mode_;
91
94
97
98 int local_subdomains_;
99 int hex_lat_;
100 int hex_rad_;
101 int lat_tile_;
102 int r_tile_;
103 int r_passes_;
104 int r_tile_block_;
105 int lat_tiles_;
106 int r_tiles_;
107 int team_size_;
108 int blocks_;
109
111
112 void update_kernel_path_flag_host_only()
113 {
114 if constexpr ( std::is_same_v< Kokkos::DefaultExecutionSpace, Kokkos::Serial > )
115 {
116 kernel_path_ = KernelPath::Slow;
117 return;
118 }
119 const BoundaryConditionFlag cmb_bc = get_boundary_condition_flag( bcs_, CMB );
120 const BoundaryConditionFlag surface_bc = get_boundary_condition_flag( bcs_, SURFACE );
121 const bool has_freeslip = ( cmb_bc == FREESLIP ) || ( surface_bc == FREESLIP );
122 kernel_path_ = has_freeslip ? KernelPath::FastFreeslip : KernelPath::FastDirichletNeumann;
123 }
124
125 public:
/// Constructor.
/// NOTE(review): this generated listing elides several signature lines (126,
/// 131, 133, 135). Per the member index, the full signature takes the fine and
/// coarse distributed domains, the fine lateral grid coords, the fine radii,
/// the fine boundary mask, the boundary conditions, and the operator apply /
/// communication modes (the last two defaulted to Replace /
/// CommunicateAdditively).
127 const grid::shell::DistributedDomain& domain_fine,
128 const grid::shell::DistributedDomain& domain_coarse,
129 const grid::Grid3DDataVec< ScalarT, 3 >& grid_fine,
130 const grid::Grid2DDataScalar< ScalarT >& radii_fine,
132 BoundaryConditions bcs,
134 linalg::OperatorCommunicationMode operator_communication_mode =
136 : domain_fine_( domain_fine )
137 , domain_coarse_( domain_coarse )
138 , grid_fine_( grid_fine )
139 , radii_( radii_fine )
140 , boundary_mask_fine_( boundary_mask_fine )
141 , operator_apply_mode_( operator_apply_mode )
142 , operator_communication_mode_( operator_communication_mode )
143 , recv_buffers_( domain_fine )
144 , comm_plan_( domain_fine )
145 {
146 bcs_[0] = bcs[0];
147 bcs_[1] = bcs[1];
148
// Derive cell counts from the fine domain: nodes-per-side minus one.
149 const grid::shell::DomainInfo& domain_info = domain_fine_.domain_info();
150 local_subdomains_ = domain_fine_.subdomains().size();
151 hex_lat_ = domain_info.subdomain_num_nodes_per_side_laterally() - 1;
152 hex_rad_ = domain_info.subdomain_num_nodes_radially() - 1;
153
// Tiling heuristics per backend: Serial gets 1x1x1; OpenMP scales the team
// shape with the available concurrency; any other (device) backend uses a
// fixed 4x4x8 tile with 2 radial passes.
154 if constexpr ( std::is_same_v< Kokkos::DefaultExecutionSpace, Kokkos::Serial > )
155 {
156 lat_tile_ = 1; r_tile_ = 1; r_passes_ = 1;
157 }
158#ifdef KOKKOS_ENABLE_OPENMP
159 else if constexpr ( std::is_same_v< Kokkos::DefaultExecutionSpace, Kokkos::OpenMP > )
160 {
161 const int max_team = std::min( Kokkos::OpenMP().concurrency(),
162 static_cast< int >( Kokkos::Impl::HostThreadTeamData::max_team_members ) );
163 if ( max_team >= 64 ) { lat_tile_ = 4; r_tile_ = 4; r_passes_ = 4; }
164 else if ( max_team >= 16 ) { lat_tile_ = 4; r_tile_ = 1; r_passes_ = 16; }
165 else { lat_tile_ = 1; r_tile_ = 1; r_passes_ = 1; }
166 }
167#endif
168 else
169 {
170 lat_tile_ = 4; r_tile_ = 8; r_passes_ = 2;
171 }
// Launch geometry: ceil-divide the cell counts by the tile extents.
172 r_tile_block_ = r_tile_ * r_passes_;
173 lat_tiles_ = ( hex_lat_ + lat_tile_ - 1 ) / lat_tile_;
174 r_tiles_ = ( hex_rad_ + r_tile_block_ - 1 ) / r_tile_block_;
175 team_size_ = lat_tile_ * lat_tile_ * r_tile_;
176 blocks_ = local_subdomains_ * lat_tiles_ * lat_tiles_ * r_tiles_;
177
178 update_kernel_path_flag_host_only();
179
180 util::logroot << "[GradientKerngen] tile=(" << lat_tile_ << "," << lat_tile_ << "," << r_tile_
181 << "), r_passes=" << r_passes_ << ", team=" << team_size_ << ", blocks=" << blocks_
182 << ", path=" << path_name() << std::endl;
183 }
184
185 const char* path_name() const
186 {
187 switch ( kernel_path_ )
188 {
189 case KernelPath::Slow: return "slow";
190 case KernelPath::FastFreeslip: return "fast-freeslip";
191 default: return "fast-dirichlet-neumann";
192 }
193 }
194
/// Currently selected kernel path (see update_kernel_path_flag_host_only()).
195 KernelPath kernel_path() const { return kernel_path_; }
196
/// Forces the reference (slow) path, e.g. for validation or debugging runs.
197 void force_slow_path() { kernel_path_ = KernelPath::Slow; }
198
/// Reconfigures how apply_impl() writes its result and whether it
/// communicates additively afterwards.
/// NOTE(review): the opening signature line (199) is elided in this generated
/// listing; see the member index for the full declaration.
200 const linalg::OperatorApplyMode operator_apply_mode,
201 const linalg::OperatorCommunicationMode operator_communication_mode )
202 {
203 operator_apply_mode_ = operator_apply_mode;
204 operator_communication_mode_ = operator_communication_mode;
205 }
206
207 void apply_impl( const SrcVectorType& src, DstVectorType& dst )
208 {
209 util::Timer timer_apply( "gradient_apply" );
210
211 if ( operator_apply_mode_ == linalg::OperatorApplyMode::Replace )
212 {
213 assign( dst, 0 );
214 }
215
216 src_ = src.grid_data();
217 dst_ = dst.grid_data();
218
219 util::Timer timer_kernel( "gradient_kernel" );
220 Kokkos::TeamPolicy<> policy( blocks_, team_size_ );
221 if ( kernel_path_ != KernelPath::Slow )
222 {
223 policy.set_scratch_size( 0, Kokkos::PerTeam( team_shmem_size( team_size_ ) ) );
224 }
225
226 if ( kernel_path_ == KernelPath::Slow )
227 {
228 Kokkos::parallel_for(
229 "gradient_apply_kernel_slow", policy, KOKKOS_CLASS_LAMBDA( const Team& team ) {
230 this->run_team_slow( team );
231 } );
232 }
233 else if ( kernel_path_ == KernelPath::FastFreeslip )
234 {
235 Kokkos::parallel_for(
236 "gradient_apply_kernel_fast_fs", policy, KOKKOS_CLASS_LAMBDA( const Team& team ) {
237 this->template run_team_fast< /*Freeslip=*/true >( team );
238 } );
239 }
240 else
241 {
242 Kokkos::TeamPolicy< Kokkos::LaunchBounds< 128, 5 > > dn_policy( blocks_, team_size_ );
243 dn_policy.set_scratch_size( 0, Kokkos::PerTeam( team_shmem_size( team_size_ ) ) );
244 Kokkos::parallel_for(
245 "gradient_apply_kernel_fast_dn", dn_policy, KOKKOS_CLASS_LAMBDA( const Team& team ) {
246 this->template run_team_fast< /*Freeslip=*/false >( team );
247 } );
248 }
249
250 Kokkos::fence();
251 timer_kernel.stop();
252
253 if ( operator_communication_mode_ == linalg::OperatorCommunicationMode::CommunicateAdditively )
254 {
255 util::Timer timer_comm( "gradient_comm" );
256 terra::communication::shell::send_recv_with_plan( comm_plan_, dst_, recv_buffers_ );
257 }
258 }
259
260 KOKKOS_INLINE_FUNCTION
261 size_t team_shmem_size( const int /*ts*/ ) const
262 {
263 const int nlev = r_tile_block_ + 1;
264 const int n = lat_tile_ + 1;
265 const int nxy = n * n;
266 const int n_c = ( lat_tile_ / 2 ) + 1;
267 const int nxy_c = n_c * n_c;
268 const int nlev_c = ( r_tile_block_ / 2 ) + 1;
269 // coords_sh(nxy,3) + r_sh(nlev) + p_sh(nxy_c, nlev_c)
270 const size_t nscalars = size_t( nxy ) * 3 + size_t( nlev ) + size_t( nxy_c ) * nlev_c;
271 return sizeof( ScalarType ) * nscalars;
272 }
273
274 private:
275 KOKKOS_INLINE_FUNCTION
276 void decode_team_indices(
277 const Team& team,
278 int& local_subdomain_id,
279 int& x0, int& y0, int& r0,
280 int& tx, int& ty, int& tr,
281 int& x_cell, int& y_cell, int& r_cell ) const
282 {
283 int tmp = team.league_rank();
284 const int r_tile_id = tmp % r_tiles_;
285 tmp /= r_tiles_;
286 const int lat_y_id = tmp % lat_tiles_;
287 tmp /= lat_tiles_;
288 const int lat_x_id = tmp % lat_tiles_;
289 tmp /= lat_tiles_;
290 local_subdomain_id = tmp;
291
292 x0 = lat_x_id * lat_tile_;
293 y0 = lat_y_id * lat_tile_;
294 r0 = r_tile_id * r_tile_block_;
295
296 const int tid = team.team_rank();
297 tr = tid % r_tile_;
298 tx = ( tid / r_tile_ ) % lat_tile_;
299 ty = tid / ( r_tile_ * lat_tile_ );
300
301 x_cell = x0 + tx;
302 y_cell = y0 + ty;
303 r_cell = r0 + tr;
304 }
305
// Convenience: does the fine boundary mask at (s, x, y, r) carry `flag`?
306 KOKKOS_INLINE_FUNCTION
307 bool has_flag( int s, int x, int y, int r, grid::shell::ShellBoundaryFlag flag ) const
308 {
309 return util::has_flag( boundary_mask_fine_( s, x, y, r ), flag );
310 }
311
312 // ----------------------------------------------------------
313 // Slow (reference) path — team wrapper around the legacy per-cell logic.
314 // Kept math-identical to Gradient (gradient.hpp).
315 // ----------------------------------------------------------
316 KOKKOS_INLINE_FUNCTION
317 void run_team_slow( const Team& team ) const
318 {
319 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
320 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
321
322 if ( tr >= r_tile_ )
323 return;
324
325 for ( int pass = 0; pass < r_passes_; ++pass )
326 {
327 const int r_cell_pass = r0 + pass * r_tile_ + tr;
328 if ( r_cell_pass >= hex_rad_ )
329 break;
330 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ )
331 continue;
332
333 process_cell_legacy( local_subdomain_id, x_cell, y_cell, r_cell_pass );
334 }
335 }
336
337 // Per-cell legacy Gradient kernel (same math as gradient.hpp), used by the slow path.
338 KOKKOS_INLINE_FUNCTION
339 void process_cell_legacy( int s, int x_cell, int y_cell, int r_cell ) const
340 {
// NOTE(review): this generated listing elides original lines 341 (the
// wedge_phy_surf coordinate-array declaration) and 348-350 (quadrature
// point/weight initialisation filling quad_points / quad_weights).
342 wedge_surface_physical_coords( wedge_phy_surf, grid_fine_, s, x_cell, y_cell );
343
344 const ScalarT r_1 = radii_( s, r_cell );
345 const ScalarT r_2 = radii_( s, r_cell + 1 );
346
347 dense::Vec< ScalarT, 3 > quad_points[quadrature::quad_felippa_1x1_num_quad_points];
351
352 constexpr int num_quad_points = quadrature::quad_felippa_1x1_num_quad_points;
353 const int fine_radial_wedge_index = r_cell % 2;
354
// Assemble the full 18x6 element matrix per wedge: A(d*6+i, j) accumulates
// -qw * |det J| * (J^-T grad N_i)_d * shape_coarse_j over quadrature points.
355 dense::Mat< ScalarT, 18, 6 > A[num_wedges_per_hex_cell] = {};
356
357 for ( int q = 0; q < num_quad_points; q++ )
358 {
359 for ( int wedge = 0; wedge < num_wedges_per_hex_cell; wedge++ )
360 {
361 const int fine_lateral_wedge_index = fine_lateral_wedge_idx( x_cell, y_cell, wedge );
362 const auto J = jac( wedge_phy_surf[wedge], r_1, r_2, quad_points[q] );
363 const auto det = Kokkos::abs( J.det() );
364 const auto J_inv_transposed = J.inv().transposed();
365
366 for ( int i = 0; i < num_nodes_per_wedge; i++ )
367 {
368 const auto grad_i = grad_shape( i, quad_points[q] );
369 for ( int j = 0; j < num_nodes_per_wedge; j++ )
370 {
371 const auto shape_j =
372 shape_coarse( j, fine_radial_wedge_index, fine_lateral_wedge_index, quad_points[q] );
373 for ( int d = 0; d < 3; d++ )
374 {
375 A[wedge]( d * 6 + i, j ) +=
376 quad_weights[q] * ( -( ( J_inv_transposed * grad_i )( d ) * shape_j ) * det );
377 }
378 }
379 }
380 }
381 }
382
// Gather the 6 coarse pressure coefficients per wedge (coarse cell = fine cell / 2).
383 dense::Vec< ScalarT, 6 > src[num_wedges_per_hex_cell];
384 extract_local_wedge_scalar_coefficients( src, s, x_cell / 2, y_cell / 2, r_cell / 2, src_ );
385
386 bool at_cmb = util::has_flag( boundary_mask_fine_( s, x_cell, y_cell, r_cell ), CMB );
387 bool at_surface = util::has_flag( boundary_mask_fine_( s, x_cell, y_cell, r_cell + 1 ), SURFACE );
388
// boundary_mask starts as all-ones; rows for constrained boundary DoFs get zeroed below.
389 dense::Mat< ScalarT, 18, 6 > boundary_mask;
390 boundary_mask.fill( 1.0 );
391
392 dense::Mat< ScalarT, 18, 18 > R[num_wedges_per_hex_cell];
393 bool freeslip_reorder = false;
394
395 if ( at_cmb || at_surface )
396 {
397 ShellBoundaryFlag sbf = at_cmb ? CMB : SURFACE;
// NOTE(review): line 398 — presumably the lookup of `bcf` via
// get_boundary_condition_flag( bcs_, sbf ) — is elided in this listing.
399
400 if ( bcf == DIRICHLET )
401 {
// Dirichlet: zero the rows of the boundary-layer nodes (i<3 at CMB, i>=3 at surface).
402 for ( int dimi = 0; dimi < 3; ++dimi )
403 for ( int i = 0; i < num_nodes_per_wedge; i++ )
404 for ( int j = 0; j < num_nodes_per_wedge; j++ )
405 if ( ( at_cmb && i < 3 ) || ( at_surface && i >= 3 ) )
406 boundary_mask( dimi * num_nodes_per_wedge + i, j ) = 0.0;
407 }
408 else if ( bcf == FREESLIP )
409 {
// Freeslip: rotate the element matrix into per-node normal-tangential frames
// (A_tmp reorders DoFs from dim-major to node-major first), then zero the
// normal components at the boundary nodes via the mask.
410 freeslip_reorder = true;
411 dense::Mat< ScalarT, 18, 6 > A_tmp[num_wedges_per_hex_cell] = { 0 };
412
413 for ( int wedge = 0; wedge < 2; ++wedge )
414 for ( int node_idxi = 0; node_idxi < num_nodes_per_wedge; ++node_idxi )
415 for ( int dimi = 0; dimi < 3; ++dimi )
416 for ( int node_idxj = 0; node_idxj < num_nodes_per_wedge; ++node_idxj )
417 A_tmp[wedge]( node_idxi * 3 + dimi, node_idxj ) =
418 A[wedge]( node_idxi + dimi * num_nodes_per_wedge, node_idxj );
419
420 constexpr int layer_hex_offset_x[2][3] = { { 0, 1, 0 }, { 1, 0, 1 } };
421 constexpr int layer_hex_offset_y[2][3] = { { 0, 0, 1 }, { 1, 1, 0 } };
422
423 for ( int wedge = 0; wedge < 2; ++wedge )
424 {
425 for ( int i = 0; i < 18; ++i )
426 R[wedge]( i, i ) = 1.0;
427
428 for ( int bn = 0; bn < 3; ++bn )
429 {
// On the shell, the outward normal at a node is its unit-sphere coordinate.
430 dense::Vec< double, 3 > normal = grid::shell::coords(
431 s,
432 x_cell + layer_hex_offset_x[wedge][bn],
433 y_cell + layer_hex_offset_y[wedge][bn],
434 r_cell + ( at_cmb ? 0 : 1 ),
435 grid_fine_,
436 radii_ );
437
438 auto R_i = trafo_mat_cartesian_to_normal_tangential( normal );
439 const int offset_in_R = at_cmb ? 0 : 9;
440 for ( int dimi = 0; dimi < 3; ++dimi )
441 for ( int dimj = 0; dimj < 3; ++dimj )
442 R[wedge]( offset_in_R + bn * 3 + dimi, offset_in_R + bn * 3 + dimj ) = R_i( dimi, dimj );
443 }
444
445 A[wedge] = R[wedge] * A_tmp[wedge];
446
// Zero the normal-direction row (first of each node's 3) at the boundary-layer nodes.
447 const int node_start = at_surface ? 3 : 0;
448 const int node_end = at_surface ? 6 : 3;
449 for ( int node_idx = node_start; node_idx < node_end; ++node_idx )
450 {
451 const int idx = node_idx * 3;
452 for ( int k = 0; k < 6; ++k )
453 boundary_mask( idx, k ) = 0.0;
454 }
455 }
456 }
457 else if ( bcf == NEUMANN )
458 {
459 // No mask modification — matches legacy Gradient.
460 }
461 }
462
// Apply the boundary mask, then the local matvec dst = A * p per wedge.
463 for ( int wedge = 0; wedge < num_wedges_per_hex_cell; wedge++ )
464 A[wedge].hadamard_product( boundary_mask );
465
466 dense::Vec< ScalarT, 18 > dst_loc[num_wedges_per_hex_cell];
467 dst_loc[0] = A[0] * src[0];
468 dst_loc[1] = A[1] * src[1];
469
// Freeslip: rotate the contributions back to Cartesian (R^T).
470 if ( freeslip_reorder )
471 {
472 dense::Vec< ScalarT, 18 > dst_tmp[num_wedges_per_hex_cell];
473 dst_tmp[0] = R[0].transposed() * dst_loc[0];
474 dst_tmp[1] = R[1].transposed() * dst_loc[1];
475 for ( int i = 0; i < 18; ++i )
476 {
477 dst_loc[0]( i ) = dst_tmp[0]( i );
478 dst_loc[1]( i ) = dst_tmp[1]( i );
479 }
// NOTE(review): lines 480-481 — presumably the DoF reordering back to
// dim-major layout (cf. reorder_local_dofs) — are elided in this listing.
482 }
483
// Scatter per dimension. NOTE(review): line 489 — presumably the opening of
// the atomically_add_local_wedge_vector_coefficients( call — is elided here.
484 for ( int d = 0; d < 3; d++ )
485 {
486 dense::Vec< ScalarT, 6 > dst_d[num_wedges_per_hex_cell];
487 dst_d[0] = dst_loc[0].template slice< 6 >( d * 6 );
488 dst_d[1] = dst_loc[1].template slice< 6 >( d * 6 );
490 dst_, s, x_cell, y_cell, r_cell, d, dst_d );
491 }
492 }
493
494 // ----------------------------------------------------------
495 // Fast fused-arithmetic path. Templated on Freeslip so the projection code
496 // dead-eliminates on the Dirichlet/Neumann path.
497 // ----------------------------------------------------------
498 template < bool Freeslip >
499 KOKKOS_INLINE_FUNCTION
500 void run_team_fast( const Team& team ) const
501 {
// Stage tile coordinates, radii and coarse pressure into team scratch, then
// apply the fused per-wedge gradient kernel (derivation in the file header).
502 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
503 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
504
505 const int nlev = r_tile_block_ + 1;
506 const int n = lat_tile_ + 1;
507 const int nxy = n * n;
508 const int n_c = ( lat_tile_ / 2 ) + 1;
509 const int nxy_c = n_c * n_c;
510 const int nlev_c = ( r_tile_block_ / 2 ) + 1;
511
512 double* shmem =
513 reinterpret_cast< double* >( team.team_shmem().get_shmem( team_shmem_size( team.team_size() ) ) );
514
515 using ScratchCoords =
516 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
517 using ScratchR =
518 Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
519 using ScratchP =
520 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
521
// Carve the single scratch allocation into the three unmanaged views.
522 ScratchCoords coords_sh( shmem, nxy, 3 );
523 shmem += nxy * 3;
524 ScratchR r_sh( shmem, nlev );
525 shmem += nlev;
526 ScratchP p_sh( shmem, nxy_c, nlev_c ); // coarse pressure
527
528 auto node_id = [&]( int nx, int ny ) -> int { return nx + n * ny; };
529 auto node_id_c = [&]( int nx, int ny ) -> int { return nx + n_c * ny; };
530
531 // Coarse-grid starting point for the tile: since x0, y0 are multiples of
532 // lat_tile_ (≥2 if fast path active), x0/2 / y0/2 are integer coarse indices.
533 const int x0_c = x0 / 2;
534 const int y0_c = y0 / 2;
535 const int r0_c = r0 / 2;
536
537 // Preload lat coords.
538 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&]( int id ) {
539 const int dxn = id % n;
540 const int dyn = id / n;
541 const int xi = x0 + dxn;
542 const int yi = y0 + dyn;
543 if ( xi <= hex_lat_ && yi <= hex_lat_ )
544 {
545 coords_sh( id, 0 ) = grid_fine_( local_subdomain_id, xi, yi, 0 );
546 coords_sh( id, 1 ) = grid_fine_( local_subdomain_id, xi, yi, 1 );
547 coords_sh( id, 2 ) = grid_fine_( local_subdomain_id, xi, yi, 2 );
548 }
549 else
550 {
551 coords_sh( id, 0 ) = coords_sh( id, 1 ) = coords_sh( id, 2 ) = 0.0;
552 }
553 } );
554
555 // Preload radii.
556 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&]( int lvl ) {
557 const int rr = r0 + lvl;
558 r_sh( lvl ) = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
559 } );
560
561 // Preload coarse pressure.
562 const int total_coarse = nxy_c * nlev_c;
563 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total_coarse ), [&]( int t ) {
564 const int node_c = t / nlev_c;
565 const int lvl_c = t - node_c * nlev_c;
566 const int dxn = node_c % n_c;
567 const int dyn = node_c / n_c;
568 const int xi_c = x0_c + dxn;
569 const int yi_c = y0_c + dyn;
570 const int rr_c = r0_c + lvl_c;
571 // Coarse index bounds: coarse hex_lat ≈ hex_lat/2 nodes. We just bounds-check using the
572 // src_ extents (guaranteed sized for the coarse domain by the caller).
573 if ( xi_c < static_cast< int >( src_.extent( 1 ) ) &&
574 yi_c < static_cast< int >( src_.extent( 2 ) ) &&
575 rr_c < static_cast< int >( src_.extent( 3 ) ) )
576 {
577 p_sh( node_c, lvl_c ) = src_( local_subdomain_id, xi_c, yi_c, rr_c );
578 }
579 else
580 {
581 p_sh( node_c, lvl_c ) = 0.0;
582 }
583 } );
584
// All threads must see the fully-staged scratch before computing.
585 team.team_barrier();
586
587 if ( tr >= r_tile_ )
588 return;
589 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ )
590 return;
591
// NOTE(review): lines 593-595 — presumably the quadrature point/weight
// initialisation filling quad_points / quad_weights — are elided in this
// generated listing.
592 dense::Vec< ScalarT, 3 > quad_points[quadrature::quad_felippa_1x1_num_quad_points];
596 const ScalarT qw = quad_weights[0];
597
598 constexpr int WEDGE_NODE_OFF[2][6][3] = {
599 { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
600 { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };
601
602 const int n00 = node_id( tx, ty );
603 const int n01 = node_id( tx, ty + 1 );
604 const int n10 = node_id( tx + 1, ty );
605 const int n11 = node_id( tx + 1, ty + 1 );
606
607 for ( int pass = 0; pass < r_passes_; ++pass )
608 {
609 const int lvl0 = pass * r_tile_ + tr;
610 const int r_cell_abs = r0 + lvl0;
611 if ( r_cell_abs >= hex_rad_ )
612 break;
613
614 const ScalarT r_1 = r_sh( lvl0 );
615 const ScalarT r_2 = r_sh( lvl0 + 1 );
616
617 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell_abs, CMB );
618 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell_abs + 1, SURFACE );
619
620 bool treat_dirichlet = false;
621 bool treat_freeslip = false;
622 if ( at_cmb || at_surface )
623 {
624 const ShellBoundaryFlag sbf = at_cmb ? CMB : SURFACE;
625 const BoundaryConditionFlag bcf = get_boundary_condition_flag( bcs_, sbf );
626 treat_dirichlet = ( bcf == DIRICHLET );
627 if constexpr ( Freeslip )
628 treat_freeslip = ( bcf == FREESLIP );
629 }
630 const int boundary_ddr = at_cmb ? 0 : 1;
631
632 const int fine_rad_wedge = r_cell_abs % 2;
633
634 // Coarse indices for this fine cell within the tile's coarse slab.
635 const int cxc_in_tile = ( x_cell - x0 ) / 2; // 0..(lat_tile/2-1)
636 const int cyc_in_tile = ( y_cell - y0 ) / 2;
637 const int crc_in_tile = lvl0 / 2; // 0..(r_tile_block/2-1)
638
639 for ( int w = 0; w < num_wedges_per_hex_cell; ++w )
640 {
641 const int v0 = ( w == 0 ? n00 : n11 );
642 const int v1 = ( w == 0 ? n10 : n01 );
643 const int v2 = ( w == 0 ? n01 : n10 );
644
// Closed-form Jacobian of the wedge map at the single quadrature point.
645 constexpr double ONE_THIRD = 1.0 / 3.0;
646 const double half_dr = 0.5 * ( r_2 - r_1 );
647 const double r_mid = 0.5 * ( r_1 + r_2 );
648
649 const double J_0_0 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v1, 0 ) );
650 const double J_0_1 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v2, 0 ) );
651 const double J_0_2 = half_dr * ( ONE_THIRD * ( coords_sh( v0, 0 ) + coords_sh( v1, 0 ) + coords_sh( v2, 0 ) ) );
652 const double J_1_0 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v1, 1 ) );
653 const double J_1_1 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v2, 1 ) );
654 const double J_1_2 = half_dr * ( ONE_THIRD * ( coords_sh( v0, 1 ) + coords_sh( v1, 1 ) + coords_sh( v2, 1 ) ) );
655 const double J_2_0 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v1, 2 ) );
656 const double J_2_1 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v2, 2 ) );
657 const double J_2_2 = half_dr * ( ONE_THIRD * ( coords_sh( v0, 2 ) + coords_sh( v1, 2 ) + coords_sh( v2, 2 ) ) );
658
659 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1
660 - J_0_1 * J_1_0 * J_2_2 + J_0_1 * J_1_2 * J_2_0
661 + J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
662 const double abs_det = Kokkos::abs( J_det );
663 const double inv_det = 1.0 / J_det;
664
// Entries of J^-1 via the adjugate (used below as J^-T acting on dN_ref).
665 const double i00 = inv_det * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
666 const double i01 = inv_det * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
667 const double i02 = inv_det * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
668 const double i10 = inv_det * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
669 const double i11 = inv_det * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
670 const double i12 = inv_det * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
671 const double i20 = inv_det * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
672 const double i21 = inv_det * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
673 const double i22 = inv_det * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
674
675 constexpr double ONE_SIXTH = 1.0 / 6.0;
676 static constexpr double dN_ref[6][3] = {
677 { -0.5, -0.5, -ONE_SIXTH },
678 { 0.5, 0.0, -ONE_SIXTH },
679 { 0.0, 0.5, -ONE_SIXTH },
680 { -0.5, -0.5, ONE_SIXTH },
681 { 0.5, 0.0, ONE_SIXTH },
682 { 0.0, 0.5, ONE_SIXTH } };
683
684 const int fine_lat_wedge = fine_lateral_wedge_idx( x_cell, y_cell, w );
685
686 // --- Interpolate coarse pressure at the quadrature point. ---
687 // The 6 coarse nodes of this fine cell live at the 8 corners of one coarse hex
688 // (wedge layout via WEDGE_NODE_OFF on the coarse grid).
689 double p_interp = 0.0;
690#pragma unroll
691 for ( int j = 0; j < num_nodes_per_wedge; ++j )
692 {
693 const int cddx = WEDGE_NODE_OFF[w][j][0];
694 const int cddy = WEDGE_NODE_OFF[w][j][1];
695 const int cddr = WEDGE_NODE_OFF[w][j][2];
696 const int nidc = node_id_c( cxc_in_tile + cddx, cyc_in_tile + cddy );
697 const int lvlc = crc_in_tile + cddr;
698 const double pj = p_sh( nidc, lvlc );
699 const double sj = shape_coarse( j, fine_rad_wedge, fine_lat_wedge, quad_points[0] );
700 p_interp += sj * pj;
701 }
702
703 const double prefactor = -qw * abs_det * p_interp;
704
705 // --- Scatter contributions to 6 fine nodes (3 dims each). ---
706 const int xc_base = x_cell;
707 const int yc_base = y_cell;
708 const int rc_base = r_cell_abs;
709
710#pragma unroll
711 for ( int i = 0; i < num_nodes_per_wedge; ++i )
712 {
713 const int ddx = WEDGE_NODE_OFF[w][i][0];
714 const int ddy = WEDGE_NODE_OFF[w][i][1];
715 const int ddr = WEDGE_NODE_OFF[w][i][2];
716
717 const bool is_boundary_node = ( at_cmb || at_surface ) && ( ddr == boundary_ddr );
718
719 // Dirichlet: no scatter at boundary nodes.
720 if ( treat_dirichlet && is_boundary_node )
721 continue;
722
723 // Physical gradient of fine basis function i.
724 const double gx = dN_ref[i][0];
725 const double gy = dN_ref[i][1];
726 const double gz = dN_ref[i][2];
727 const double g0 = i00 * gx + i01 * gy + i02 * gz;
728 const double g1 = i10 * gx + i11 * gy + i12 * gz;
729 const double g2 = i20 * gx + i21 * gy + i22 * gz;
730
731 double c0 = prefactor * g0;
732 double c1 = prefactor * g1;
733 double c2 = prefactor * g2;
734
735 // Freeslip: project the 3-component contribution onto the tangent plane at boundary nodes.
736 if constexpr ( Freeslip )
737 {
738 if ( treat_freeslip && is_boundary_node )
739 {
740 const int nid = node_id( tx + ddx, ty + ddy );
741 const double nx = coords_sh( nid, 0 );
742 const double ny = coords_sh( nid, 1 );
743 const double nz = coords_sh( nid, 2 );
744 const double cn = c0 * nx + c1 * ny + c2 * nz;
745 c0 -= cn * nx;
746 c1 -= cn * ny;
747 c2 -= cn * nz;
748 }
749 }
750
751 Kokkos::atomic_add( &dst_( local_subdomain_id, xc_base + ddx, yc_base + ddy, rc_base + ddr, 0 ), c0 );
752 Kokkos::atomic_add( &dst_( local_subdomain_id, xc_base + ddx, yc_base + ddy, rc_base + ddr, 1 ), c1 );
753 Kokkos::atomic_add( &dst_( local_subdomain_id, xc_base + ddx, yc_base + ddy, rc_base + ddr, 2 ), c2 );
754 }
755 }
756 }
757 }
758};
759
760static_assert( linalg::OperatorLike< GradientKerngen< double > > );
761
762} // namespace terra::fe::wedge::operators::shell
Definition communication_plan.hpp:33
Definition gradient_kerngen.hpp:65
KernelPath
Definition gradient_kerngen.hpp:73
KernelPath kernel_path() const
Definition gradient_kerngen.hpp:195
void apply_impl(const SrcVectorType &src, DstVectorType &dst)
Definition gradient_kerngen.hpp:207
ScalarT ScalarType
Definition gradient_kerngen.hpp:69
size_t team_shmem_size(const int) const
Definition gradient_kerngen.hpp:261
void set_operator_apply_and_communication_modes(const linalg::OperatorApplyMode operator_apply_mode, const linalg::OperatorCommunicationMode operator_communication_mode)
Definition gradient_kerngen.hpp:199
GradientKerngen(const grid::shell::DistributedDomain &domain_fine, const grid::shell::DistributedDomain &domain_coarse, const grid::Grid3DDataVec< ScalarT, 3 > &grid_fine, const grid::Grid2DDataScalar< ScalarT > &radii_fine, const grid::Grid4DDataScalar< grid::shell::ShellBoundaryFlag > &boundary_mask_fine, BoundaryConditions bcs, linalg::OperatorApplyMode operator_apply_mode=linalg::OperatorApplyMode::Replace, linalg::OperatorCommunicationMode operator_communication_mode=linalg::OperatorCommunicationMode::CommunicateAdditively)
Definition gradient_kerngen.hpp:126
void force_slow_path()
Definition gradient_kerngen.hpp:197
const char * path_name() const
Definition gradient_kerngen.hpp:185
Kokkos::TeamPolicy<>::member_type Team
Definition gradient_kerngen.hpp:70
Parallel data structure organizing the thick spherical shell metadata for distributed (MPI parallel) ...
Definition spherical_shell.hpp:2518
const std::map< SubdomainInfo, std::tuple< LocalSubdomainIdx, SubdomainNeighborhood > > & subdomains() const
Definition spherical_shell.hpp:2650
const DomainInfo & domain_info() const
Returns a const reference.
Definition spherical_shell.hpp:2647
Information about the thick spherical shell mesh.
Definition spherical_shell.hpp:780
int subdomain_num_nodes_radially() const
Equivalent to calling subdomain_num_nodes_radially( subdomain_refinement_level() )
Definition spherical_shell.hpp:861
int subdomain_num_nodes_per_side_laterally() const
Equivalent to calling subdomain_num_nodes_per_side_laterally( subdomain_refinement_level() )
Definition spherical_shell.hpp:852
Q1 scalar finite element vector on a distributed shell grid.
Definition vector_q1.hpp:21
const grid::Grid4DDataScalar< ScalarType > & grid_data() const
Get const reference to grid data.
Definition vector_q1.hpp:139
const grid::Grid4DDataVec< ScalarType, VecDim > & grid_data() const
Get const reference to grid data.
Definition vector_q1.hpp:288
Timer supporting RAII scope or manual stop.
Definition timer.hpp:342
void stop()
Stop the timer and record elapsed time.
Definition timer.hpp:364
J
Definition EpsilonDivDiv_kernel_gen.py:199
r_cell
Definition EpsilonDivDiv_kernel_gen.py:23
qw
Definition EpsilonDivDiv_kernel_gen.py:166
x_cell
Definition EpsilonDivDiv_kernel_gen.py:23
g2
Definition EpsilonDivDiv_kernel_gen.py:267
local_subdomain_id
Definition EpsilonDivDiv_kernel_gen.py:23
g1
Definition EpsilonDivDiv_kernel_gen.py:266
J_det
Definition EpsilonDivDiv_kernel_gen.py:216
y_cell
Definition EpsilonDivDiv_kernel_gen.py:23
constexpr int idx(const int loop_idx, const int size, const grid::BoundaryPosition position, const grid::BoundaryDirection direction)
Definition buffer_copy_kernels.hpp:73
void send_recv_with_plan(const ShellBoundaryCommPlan< GridDataType > &plan, const GridDataType &data, SubdomainNeighborhoodSendRecvBuffer< typename GridDataType::value_type, grid::grid_data_vec_dim< GridDataType >() > &recv_buffers, CommunicationReduction reduction=CommunicationReduction::SUM)
Definition communication_plan.hpp:652
Definition boundary_mass.hpp:14
constexpr void quad_felippa_1x1_quad_points(dense::Vec< T, 3 >(&quad_points)[quad_felippa_1x1_num_quad_points])
Definition wedge/quadrature/quadrature.hpp:36
constexpr void quad_felippa_1x1_quad_weights(T(&quad_weights)[quad_felippa_1x1_num_quad_points])
Definition wedge/quadrature/quadrature.hpp:43
constexpr int quad_felippa_1x1_num_quad_points
Definition wedge/quadrature/quadrature.hpp:32
constexpr int num_nodes_per_wedge_surface
Definition kernel_helpers.hpp:6
constexpr int fine_lateral_wedge_idx(const int x_cell_fine, const int y_cell_fine, const int wedge_idx_fine)
Returns the lateral wedge index with respect to a coarse grid wedge from the fine wedge indices.
Definition kernel_helpers.hpp:601
void wedge_surface_physical_coords(dense::Vec< T, 3 >(&wedge_surf_phy_coords)[num_wedges_per_hex_cell][num_nodes_per_wedge_surface], const grid::Grid3DDataVec< T, 3 > &lateral_grid, const int local_subdomain_id, const int x_cell, const int y_cell)
Extracts the (unit sphere) surface vertex coords of the two wedges of a hex cell.
Definition kernel_helpers.hpp:26
constexpr void reorder_local_dofs(const DoFOrdering doo_from, const DoFOrdering doo_to, dense::Vec< ScalarT, 18 > &dofs)
Definition kernel_helpers.hpp:619
void atomically_add_local_wedge_vector_coefficients(const grid::Grid4DDataVec< T, VecDim > &global_coefficients, const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int d, const dense::Vec< T, 6 > local_coefficients[2])
Performs an atomic add of the two local wedge coefficient vectors of a hex cell into the global coeff...
Definition kernel_helpers.hpp:465
constexpr T shape_coarse(const int coarse_node_idx, const int fine_radial_wedge_idx, const int fine_lateral_wedge_idx, const T xi_fine, const T eta_fine, const T zeta_fine)
Definition integrands.hpp:373
constexpr int num_wedges_per_hex_cell
Definition kernel_helpers.hpp:5
void extract_local_wedge_scalar_coefficients(dense::Vec< T, 6 >(&local_coefficients)[2], const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const grid::Grid4DDataScalar< T > &global_coefficients)
Extracts the local vector coefficients for the two wedges of a hex cell from the global coefficient v...
Definition kernel_helpers.hpp:306
constexpr int num_nodes_per_wedge
Definition kernel_helpers.hpp:7
constexpr dense::Vec< T, 3 > grad_shape(const int node_idx, const T xi, const T eta, const T zeta)
Gradient of the full shape function:
Definition integrands.hpp:228
constexpr dense::Mat< T, 3, 3 > jac(const dense::Vec< T, 3 > &p1_phy, const dense::Vec< T, 3 > &p2_phy, const dense::Vec< T, 3 > &p3_phy, const T r_1, const T r_2, const T xi, const T eta, const T zeta)
Definition integrands.hpp:657
dense::Vec< typename CoordsShellType::value_type, 3 > coords(const int subdomain, const int x, const int y, const int r, const CoordsShellType &coords_shell, const CoordsRadiiType &coords_radii)
Definition spherical_shell.hpp:2871
BoundaryConditionMapping[2] BoundaryConditions
Definition shell/bit_masks.hpp:37
ShellBoundaryFlag
FlagLike that indicates boundary types for the thick spherical shell.
Definition shell/bit_masks.hpp:12
BoundaryConditionFlag get_boundary_condition_flag(const BoundaryConditions bcs, ShellBoundaryFlag sbf)
Retrieve the boundary condition flag that is associated with a location in the shell e....
Definition shell/bit_masks.hpp:42
BoundaryConditionFlag
FlagLike that indicates the type of boundary condition
Definition shell/bit_masks.hpp:25
Kokkos::View< ScalarType ***[VecDim], Layout > Grid3DDataVec
Definition grid_types.hpp:42
Kokkos::View< ScalarType ****, Layout > Grid4DDataScalar
Definition grid_types.hpp:27
Kokkos::View< ScalarType **, Layout > Grid2DDataScalar
Definition grid_types.hpp:21
dense::Mat< ScalarType, 3, 3 > trafo_mat_cartesian_to_normal_tangential(const dense::Vec< ScalarType, 3 > &n_input)
Constructs a robust orthonormal transformation matrix from Cartesian to (normal–tangential–tangential...
Definition local_basis_trafo_normal_tangential.hpp:36
OperatorApplyMode
Modes for applying an operator to a vector.
Definition operator.hpp:30
@ Replace
Overwrite the destination vector.
OperatorCommunicationMode
Modes for communication during operator application.
Definition operator.hpp:40
@ CommunicateAdditively
Communicate and add results.
constexpr bool has_flag(E mask_value, E flag) noexcept
Checks if a bitmask value contains a specific flag.
Definition bit_masking.hpp:43
detail::PrefixCout logroot([]() { return detail::log_prefix();})
std::ostream subclass that just logs on root and adds a timestamp for each line.