Loading...
Searching...
No Matches
epsilon_divdiv_kerngen.hpp
Go to the documentation of this file.
1#pragma once
2
3#include "../../quadrature/quadrature.hpp"
6#include "dense/vec.hpp"
10#include "impl/Kokkos_Profiling.hpp"
11#ifdef KOKKOS_ENABLE_OPENMP
12#include "impl/Kokkos_HostThreadTeam.hpp"
13#endif
14#include "grid/bit_masks.hpp"
16#include "linalg/operator.hpp"
19#include "linalg/vector.hpp"
20#include "linalg/vector_q1.hpp"
21#include "util/timer.hpp"
22
24
35
36/**
37 * @brief Matrix-free / matrix-based epsilon-div-div operator on wedge elements in a spherical shell.
38 *
39 * This class supports three execution paths:
40 *
41 * 1) Slow path (local matrix path)
42 * - Used if local matrices are stored (full or selective)
43 * - Reuses assembled/stored 18x18 local matrices
44 *
45 * 2) Fast path: Dirichlet/Neumann
46 * - Matrix-free
47 * - Handles Dirichlet by skipping constrained face-node couplings (and diagonal treatment if requested)
48 * - Neumann is naturally included
49 *
50 * 3) Fast path: Free-slip
51 * - Matrix-free
52 * - Applies trial and test-side tangential projection on free-slip boundaries
53 * - Preserves behavior consistent with the slow path for iterative solves
54 *
55 * IMPORTANT DESIGN CHANGE:
56 * ------------------------
57 * The path decision is made on the HOST and cached in `kernel_path_`.
58 * `apply_impl()` dispatches to a different kernel launch per path.
59 * This avoids a runtime branch inside the hot device kernel.
60 */
61template < typename ScalarT, int VecDim = 3 >
63{
64 public:
67 using ScalarType = ScalarT;
68 static constexpr int LocalMatrixDim = 18;
71 using Team = Kokkos::TeamPolicy<>::member_type;
72
73 private:
74 // Optional element-local matrix storage (GCA/coarsening/explicit local-matrix mode)
75 LocalMatrixStorage local_matrix_storage_;
76
77 // Domain and geometry / coefficients
79 grid::Grid3DDataVec< ScalarT, 3 > grid_; ///< Lateral shell geometry (unit sphere coords)
80 grid::Grid2DDataScalar< ScalarT > radii_; ///< Radial coordinates per local subdomain
81 grid::Grid4DDataScalar< ScalarType > k_; ///< Scalar coefficient field
82 grid::Grid4DDataScalar< grid::shell::ShellBoundaryFlag > mask_; ///< Boundary flags per cell/node
83 BoundaryConditions bcs_; ///< CMB and SURFACE boundary conditions
84
85 bool diagonal_; ///< If true, apply diagonal-only action
86
87 linalg::OperatorApplyMode operator_apply_mode_;
88 linalg::OperatorCommunicationMode operator_communication_mode_;
89 linalg::OperatorStoredMatrixMode operator_stored_matrix_mode_;
90
93
94 // Device views captured by kernels
97
98 // Quadrature (Felippa 1x1 on wedge)
99 const int num_quad_points = quadrature::quad_felippa_1x1_num_quad_points;
102
103 // Local subdomain extents (in cells)
104 int local_subdomains_;
105 int hex_lat_;
106 int hex_rad_;
107 int lat_refinement_level_;
108
109 // Team-policy tiling (one team handles a slab lat_tile x lat_tile x r_tile cells)
110 int lat_tile_;
111 int r_tile_;
112 int r_passes_;
113 int r_tile_block_;
114 int lat_tiles_;
115 int r_tiles_;
116 int team_size_;
117 int blocks_;
118
119 ScalarT r_max_;
120 ScalarT r_min_;
121
122 /**
123 * @brief Kernel path selected on host.
124 *
125 * This value is computed whenever BCs or stored-matrix mode change and is then
126 * used by `apply_impl()` to select which kernel launch to issue.
127 */
128 enum class KernelPath
129 {
130 Slow,
131 FastDirichletNeumann,
132 FastFreeslip,
133 };
134 KernelPath kernel_path_ = KernelPath::FastDirichletNeumann;
135
136 // Rotation null-space penalty (active only for fs/fs)
137 bool penalty_active_ = false;
138 ScalarT penalty_epsilon_ = 1.0;
141
142 private:
143 /**
144 * @brief Recompute the kernel path on host.
145 *
146 * Rules:
147 * - Any stored local matrix mode => slow path
148 * - Else if any free-slip BC on CMB or SURFACE => fast free-slip path
149 * - Else => fast Dirichlet/Neumann path
150 */
151 void update_kernel_path_flag_host_only()
152 {
153 // Serial backend: always use slow path (fast paths require shared memory / larger teams).
154 if constexpr ( std::is_same_v< Kokkos::DefaultExecutionSpace, Kokkos::Serial > )
155 {
156 kernel_path_ = KernelPath::Slow;
157 return;
158 }
159
160 const BoundaryConditionFlag cmb_bc = get_boundary_condition_flag( bcs_, CMB );
161 const BoundaryConditionFlag surface_bc = get_boundary_condition_flag( bcs_, SURFACE );
162
163 const bool has_freeslip = ( cmb_bc == FREESLIP ) || ( surface_bc == FREESLIP );
164 const bool has_stored_matrices = ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off );
165
166 if ( has_stored_matrices )
167 kernel_path_ = KernelPath::Slow;
168 else if ( has_freeslip )
169 kernel_path_ = KernelPath::FastFreeslip;
170 else
171 kernel_path_ = KernelPath::FastDirichletNeumann;
172 }
173
174 public:
176 const grid::shell::DistributedDomain& domain,
181 BoundaryConditions bcs,
182 bool diagonal,
184 linalg::OperatorCommunicationMode operator_communication_mode =
187 : domain_( domain )
188 , grid_( grid )
189 , radii_( radii )
190 , k_( k )
191 , mask_( mask )
192 , diagonal_( diagonal )
193 , operator_apply_mode_( operator_apply_mode )
194 , operator_communication_mode_( operator_communication_mode )
195 , operator_stored_matrix_mode_( operator_stored_matrix_mode )
196 , recv_buffers_( domain )
197 , comm_plan_( domain )
198 {
199 bcs_[0] = bcs[0];
200 bcs_[1] = bcs[1];
201
204
205 const grid::shell::DomainInfo& domain_info = domain_.domain_info();
206 local_subdomains_ = domain_.subdomains().size();
207 hex_lat_ = domain_info.subdomain_num_nodes_per_side_laterally() - 1;
208 hex_rad_ = domain_info.subdomain_num_nodes_radially() - 1;
209 lat_refinement_level_ = domain_info.diamond_lateral_refinement_level();
210
211 // On Serial backend, team_size must be 1 => use 1x1x1 tiles.
212 if constexpr ( std::is_same_v< Kokkos::DefaultExecutionSpace, Kokkos::Serial > )
213 {
214 lat_tile_ = 1;
215 r_tile_ = 1;
216 r_passes_ = 1;
217 }
218#ifdef KOKKOS_ENABLE_OPENMP
219 // OpenMP host teams are capped at min(num_threads, 64) by Kokkos.
220 // Choose tile sizes so that team_size = lat^2 * r_tile fits.
221 else if constexpr ( std::is_same_v< Kokkos::DefaultExecutionSpace, Kokkos::OpenMP > )
222 {
223 const int max_team = std::min(
224 Kokkos::OpenMP().concurrency(),
225 static_cast< int >( Kokkos::Impl::HostThreadTeamData::max_team_members ) );
226 if ( max_team >= 64 )
227 {
228 lat_tile_ = 4;
229 r_tile_ = 4;
230 r_passes_ = 4; // team_size = 64
231 }
232 else if ( max_team >= 16 )
233 {
234 lat_tile_ = 4;
235 r_tile_ = 1;
236 r_passes_ = 16; // team_size = 16
237 }
238 else
239 {
240 lat_tile_ = 1;
241 r_tile_ = 1;
242 r_passes_ = 1; // team_size = 1 (serial-like)
243 }
244 }
245#endif
246 else
247 {
248 lat_tile_ = 4;
249 r_tile_ = 8;
250 r_passes_ = 2;
251 }
252 r_tile_block_ = r_tile_ * r_passes_;
253
254 lat_tiles_ = ( hex_lat_ + lat_tile_ - 1 ) / lat_tile_;
255 r_tiles_ = ( hex_rad_ + r_tile_block_ - 1 ) / r_tile_block_;
256
257 team_size_ = lat_tile_ * lat_tile_ * r_tile_;
258 blocks_ = local_subdomains_ * lat_tiles_ * lat_tiles_ * r_tiles_;
259
260 r_min_ = domain_info.radii()[0];
261 r_max_ = domain_info.radii()[domain_info.radii().size() - 1];
262
263 // Host-side path selection (no in-kernel path branching)
264 update_kernel_path_flag_host_only();
265#if 0
266 util::logroot << "[EpsilonDivDiv] tile size (x,y,r)=(" << lat_tile_ << "," << lat_tile_ << "," << r_tile_
267 << "), r_passes=" << r_passes_ << std::endl;
268 util::logroot << "[EpsilonDivDiv] number of tiles (x,y,r)=(" << lat_tiles_ << "," << lat_tiles_ << ","
269 << r_tiles_ << "), team_size=" << team_size_ << ", blocks=" << blocks_ << std::endl;
270 const char* path_name = ( kernel_path_ == KernelPath::Slow ) ? "slow" :
271 ( kernel_path_ == KernelPath::FastFreeslip ) ? "fast-freeslip" :
272 "fast-dirichlet-neumann";
273 util::logroot << "[EpsilonDivDiv] kernel path = " << path_name << std::endl;
274#endif
275 init_penalty();
276 }
277
278 void set_penalty_epsilon( ScalarT eps ) { penalty_epsilon_ = eps; }
279
280 /**
281 * @brief Initialize penalty for fs/fs null-space regularization.
282 *
283 * Builds 3 orthonormal rotation-mode vectors (ê_i × r) with free-slip
284 * enforcement, then Gram-Schmidt orthonormalizes them.
285 * Only called when both CMB and SURFACE have FREESLIP BCs.
286 */
288 {
289 const BoundaryConditionFlag cmb_bc = get_boundary_condition_flag( bcs_, CMB );
290 const BoundaryConditionFlag surface_bc = get_boundary_condition_flag( bcs_, SURFACE );
291
292 penalty_active_ = ( cmb_bc == FREESLIP ) && ( surface_bc == FREESLIP );
293 if ( !penalty_active_ )
294 return;
295
296 util::logroot << "[EpsilonDivDiv] Initializing fs/fs null-space penalty (epsilon=" << penalty_epsilon_ << ")"
297 << std::endl;
298
299 ownership_mask_ = grid::setup_node_ownership_mask_data( domain_ );
300
301 const int nsub = domain_.subdomains().size();
302 const int nlat = domain_.domain_info().subdomain_num_nodes_per_side_laterally();
303 const int nrad = domain_.domain_info().subdomain_num_nodes_radially();
304
305 auto grid_local = grid_;
306 auto radii_local = radii_;
307 auto mask_local = mask_;
308
309 // Build 3 rotation modes: axis i => ê_i × r
310 for ( int axis = 0; axis < 3; ++axis )
311 {
313 "penalty_null_mode_" + std::to_string( axis ), nsub, nlat, nlat, nrad );
314
315 auto nm = null_modes_[axis];
316
317 Kokkos::parallel_for(
318 "penalty_fill_mode_" + std::to_string( axis ),
319 Kokkos::MDRangePolicy< Kokkos::Rank< 4 > >( { 0, 0, 0, 0 }, { nsub, nlat, nlat, nrad } ),
320 KOKKOS_LAMBDA( int s, int x, int y, int r ) {
321 const auto c = grid::shell::coords( s, x, y, r, grid_local, radii_local );
322 // ê_axis × r: cyclic cross product
323 // axis=0: (0, z, -y), axis=1: (-z, 0, x), axis=2: (y, -x, 0)
324 const int a1 = ( axis + 1 ) % 3;
325 const int a2 = ( axis + 2 ) % 3;
326 for ( int d = 0; d < VecDim; ++d )
327 nm( s, x, y, r, d ) = 0.0;
328 nm( s, x, y, r, a1 ) = c( a2 );
329 nm( s, x, y, r, a2 ) = -c( a1 );
330 } );
331 Kokkos::fence();
332
333 // Free-slip enforce: transform to n-t, zero normal, transform back
334 // Do this for both CMB and SURFACE boundaries
335 auto freeslip_boundary_mask = static_cast< grid::shell::ShellBoundaryFlag >(
336 static_cast< uint8_t >( CMB ) | static_cast< uint8_t >( SURFACE ) );
337
338 Kokkos::parallel_for(
339 "penalty_fs_enforce_" + std::to_string( axis ),
340 Kokkos::MDRangePolicy< Kokkos::Rank< 4 > >( { 0, 0, 0, 0 }, { nsub, nlat, nlat, nrad } ),
341 KOKKOS_LAMBDA( int s, int x, int y, int r ) {
342 if ( !util::has_flag( mask_local( s, x, y, r ), freeslip_boundary_mask ) )
343 return;
344
346 for ( int d = 0; d < 3; ++d )
347 v( d ) = nm( s, x, y, r, d );
348
350 for ( int d = 0; d < 3; ++d )
351 normal( d ) = grid_local( s, x, y, d );
352
353 // R * v -> normal-tangential
354 const auto R = trafo_mat_cartesian_to_normal_tangential( normal );
355 auto nt = R * v;
356
357 // Zero normal component
358 nt( 0 ) = 0.0;
359
360 // R^T * nt -> back to Cartesian
361 const auto Rt = R.transposed();
362 const auto cart = Rt * nt;
363
364 for ( int d = 0; d < 3; ++d )
365 nm( s, x, y, r, d ) = cart( d );
366 } );
367 Kokkos::fence();
368 }
369
370 // Gram-Schmidt orthonormalization
371 for ( int i = 0; i < 3; ++i )
372 {
373 // Subtract projections of previous modes
374 for ( int j = 0; j < i; ++j )
375 {
377 null_modes_[i], null_modes_[j], ownership_mask_, grid::NodeOwnershipFlag::OWNED );
378
379 auto ni = null_modes_[i];
380 auto nj = null_modes_[j];
381 Kokkos::parallel_for(
382 "penalty_gs_subtract",
383 Kokkos::MDRangePolicy< Kokkos::Rank< 4 > >( { 0, 0, 0, 0 }, { nsub, nlat, nlat, nrad } ),
384 KOKKOS_LAMBDA( int s, int x, int y, int r ) {
385 for ( int d = 0; d < VecDim; ++d )
386 ni( s, x, y, r, d ) -= dot_ij * nj( s, x, y, r, d );
387 } );
388 Kokkos::fence();
389 }
390
391 // Normalize
393 null_modes_[i], null_modes_[i], ownership_mask_, grid::NodeOwnershipFlag::OWNED );
394 ScalarType inv_norm = 1.0 / Kokkos::sqrt( norm_sq );
395
396 auto ni = null_modes_[i];
397 Kokkos::parallel_for(
398 "penalty_gs_normalize",
399 Kokkos::MDRangePolicy< Kokkos::Rank< 4 > >( { 0, 0, 0, 0 }, { nsub, nlat, nlat, nrad } ),
400 KOKKOS_LAMBDA( int s, int x, int y, int r ) {
401 for ( int d = 0; d < VecDim; ++d )
402 ni( s, x, y, r, d ) *= inv_norm;
403 } );
404 Kokkos::fence();
405 }
406
407 util::logroot << "[EpsilonDivDiv] Null-space penalty initialized (3 modes)" << std::endl;
408 }
409
411 const linalg::OperatorApplyMode operator_apply_mode,
412 const linalg::OperatorCommunicationMode operator_communication_mode )
413 {
414 operator_apply_mode_ = operator_apply_mode;
415 operator_communication_mode_ = operator_communication_mode;
416 }
417
418 void set_diagonal( bool v ) { diagonal_ = v; }
419
420 void set_boundary_conditions( BoundaryConditions bcs )
421 {
422 bcs_[0] = bcs[0];
423 bcs_[1] = bcs[1];
424
425 // Recompute host-side dispatch mode whenever BCs change
426 update_kernel_path_flag_host_only();
427 }
428
430 const grid::shell::DistributedDomain& get_domain() const { return domain_; }
433
434 KOKKOS_INLINE_FUNCTION
436 const int local_subdomain_id,
437 const int x_cell,
438 const int y_cell,
439 const int r_cell,
441 { return util::has_flag( mask_( local_subdomain_id, x_cell, y_cell, r_cell ), flag ); }
442
444 linalg::OperatorStoredMatrixMode operator_stored_matrix_mode,
445 int level_range,
447 {
448 operator_stored_matrix_mode_ = operator_stored_matrix_mode;
449
450 if ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off )
451 {
453 domain_, operator_stored_matrix_mode_, level_range, GCAElements );
454 }
455
456 // Recompute host-side dispatch mode whenever storage mode changes
457 update_kernel_path_flag_host_only();
458
459 util::logroot << "[EpsilonDivDiv] (set_stored_matrix_mode) kernel path = "
460 << ( ( kernel_path_ == KernelPath::Slow ) ? "slow" :
461 ( kernel_path_ == KernelPath::FastFreeslip ) ? "fast-freeslip" :
462 "fast-dirichlet-neumann" )
463 << std::endl;
464 }
465
466 linalg::OperatorStoredMatrixMode get_stored_matrix_mode() { return operator_stored_matrix_mode_; }
467
468 KOKKOS_INLINE_FUNCTION
470 const int local_subdomain_id,
471 const int x_cell,
472 const int y_cell,
473 const int r_cell,
474 const int wedge,
476 {
477 KOKKOS_ASSERT( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off );
478 local_matrix_storage_.set_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge, mat );
479 }
480
481 KOKKOS_INLINE_FUNCTION
483 const int local_subdomain_id,
484 const int x_cell,
485 const int y_cell,
486 const int r_cell,
487 const int wedge ) const
488 {
489 if ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off )
490 {
491 if ( !local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge ) )
492 {
493 Kokkos::abort( "No matrix found at that spatial index." );
494 }
495 return local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge );
496 }
497 return assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge );
498 }
499
500 /**
501 * @brief Apply the operator: dst <- Op(src) (or additive, depending on mode).
502 *
503 * Host-side responsibilities:
504 * - Handle replace/add mode initialization
505 * - Cache src/dst views for kernel capture
506 * - Build team policy
507 * - Dispatch to exactly one kernel variant based on `kernel_path_`
508 * - Optional halo communication accumulation
509 *
510 * This is where the path decision now lives (host), so device code does not branch
511 * on `kernel_path_` in the hot path.
512 */
513 void apply_impl( const SrcVectorType& src, DstVectorType& dst )
514 {
515 util::Timer timer_apply( "epsilon_divdiv_apply" );
516
517 if ( operator_apply_mode_ == linalg::OperatorApplyMode::Replace )
518 {
519 assign( dst, 0 );
520 }
521
522 dst_ = dst.grid_data();
523 src_ = src.grid_data();
524
525 util::Timer timer_kernel( "epsilon_divdiv_kernel" );
526 Kokkos::TeamPolicy<> policy( blocks_, team_size_ );
527
528 // Fast paths use shared-memory slab staging
529 if ( kernel_path_ != KernelPath::Slow )
530 {
531 policy.set_scratch_size( 0, Kokkos::PerTeam( team_shmem_size( team_size_ ) ) );
532 }
533
534 // Host-side dispatch => no per-team path branching in device code
535 if ( kernel_path_ == KernelPath::Slow )
536 {
537 Kokkos::parallel_for(
538 "epsilon_divdiv_apply_kernel_slow", policy, KOKKOS_CLASS_LAMBDA( const Team& team ) {
539 this->run_team_slow( team );
540 } );
541 }
542 else if ( kernel_path_ == KernelPath::FastFreeslip )
543 {
544 if ( diagonal_ )
545 {
546 Kokkos::parallel_for(
547 "epsilon_divdiv_apply_kernel_fast_fs_diag", policy, KOKKOS_CLASS_LAMBDA( const Team& team ) {
548 this->template run_team_fast_freeslip< true >( team );
549 } );
550 }
551 else
552 {
553 Kokkos::parallel_for(
554 "epsilon_divdiv_apply_kernel_fast_fs_matvec", policy, KOKKOS_CLASS_LAMBDA( const Team& team ) {
555 this->template run_team_fast_freeslip< false >( team );
556 } );
557 }
558 }
559 else
560 {
561 Kokkos::TeamPolicy< Kokkos::LaunchBounds< 128, 5 > > dn_policy( blocks_, team_size_ );
562 dn_policy.set_scratch_size( 0, Kokkos::PerTeam( team_shmem_size_dn( team_size_ ) ) );
563 if ( diagonal_ )
564 {
565 Kokkos::parallel_for(
566 "epsilon_divdiv_apply_kernel_fast_dn_diag", dn_policy, KOKKOS_CLASS_LAMBDA( const Team& team ) {
567 this->template run_team_fast_dirichlet_neumann< true >( team );
568 } );
569 }
570 else
571 {
572 Kokkos::parallel_for(
573 "epsilon_divdiv_apply_kernel_fast_dn_matvec", dn_policy, KOKKOS_CLASS_LAMBDA( const Team& team ) {
574 this->template run_team_fast_dirichlet_neumann< false >( team );
575 } );
576 }
577 }
578
579 Kokkos::fence();
580 timer_kernel.stop();
581
582 // Null-space penalty for fs/fs: dst += epsilon * sum_i (n_i^T src) n_i
583 if ( penalty_active_ && !diagonal_ )
584 {
585 for ( int i = 0; i < 3; ++i )
586 {
587 ScalarType alpha =
588 penalty_epsilon_ * kernels::common::masked_dot_product(
589 null_modes_[i], src_, ownership_mask_, grid::NodeOwnershipFlag::OWNED );
590
591 auto nm = null_modes_[i];
592 auto dst_local = dst_;
593 Kokkos::parallel_for(
594 "penalty_axpy",
595 Kokkos::MDRangePolicy< Kokkos::Rank< 4 > >(
596 { 0, 0, 0, 0 },
597 { dst_local.extent( 0 ),
598 dst_local.extent( 1 ),
599 dst_local.extent( 2 ),
600 dst_local.extent( 3 ) } ),
601 KOKKOS_LAMBDA( int s, int x, int y, int r ) {
602 for ( int d = 0; d < VecDim; ++d )
603 dst_local( s, x, y, r, d ) += alpha * nm( s, x, y, r, d );
604 } );
605 }
606 Kokkos::fence( "penalty" );
607 }
608
609 if ( operator_communication_mode_ == linalg::OperatorCommunicationMode::CommunicateAdditively )
610 {
611 util::Timer timer_comm( "epsilon_divdiv_comm" );
612 terra::communication::shell::send_recv_with_plan( comm_plan_, dst_, recv_buffers_ );
613 }
614 }
615
616 /**
617 * @brief Convert one gradient column into symmetric-gradient and div contributions.
618 *
619 * Given grad(phi e_dim), this helper computes:
620 * - diagonal entries (E00,E11,E22)
621 * - off-diagonal symmetric entries (sym01,sym02,sym12)
622 * - divergence contribution `gdd`
623 *
624 * The fast kernels use this repeatedly in fused operator application.
625 */
626 KOKKOS_INLINE_FUNCTION
628 const int dim,
629 const double g0,
630 const double g1,
631 const double g2,
632 double& E00,
633 double& E11,
634 double& E22,
635 double& sym01,
636 double& sym02,
637 double& sym12,
638 double& gdd ) const
639 {
640 E00 = E11 = E22 = sym01 = sym02 = sym12 = gdd = 0.0;
641
642 switch ( dim )
643 {
644 case 0:
645 E00 = g0;
646 gdd = g0;
647 sym01 = 0.5 * g1;
648 sym02 = 0.5 * g2;
649 break;
650 case 1:
651 E11 = g1;
652 gdd = g1;
653 sym01 = 0.5 * g0;
654 sym12 = 0.5 * g2;
655 break;
656 default:
657 E22 = g2;
658 gdd = g2;
659 sym02 = 0.5 * g0;
660 sym12 = 0.5 * g1;
661 break;
662 }
663 }
664
665 /**
666 * @brief Team scratch memory size for fast paths.
667 *
668 * Layout per team:
669 * [coords_sh | src_sh | k_sh | r_sh]
670 */
671 KOKKOS_INLINE_FUNCTION
672 size_t team_shmem_size( const int ts ) const
673 {
674 const int nlev = r_tile_block_ + 1;
675 const int n = lat_tile_ + 1;
676 const int nxy = n * n;
677
678 // coords_sh(nxy,3) + normals_sh(nxy,3) + src_sh(nxy,3,nlev) + k_sh(nxy,nlev) + r_sh(nlev) + 1
679 const size_t nscalars = size_t( nxy ) * 3 + size_t( nxy ) * 3 + size_t( nxy ) * 3 * nlev +
680 size_t( nxy ) * nlev + size_t( nlev ) + 1;
681
682 return sizeof( ScalarType ) * nscalars;
683 }
684
685 KOKKOS_INLINE_FUNCTION
686 size_t team_shmem_size_dn( const int /* ts */ ) const
687 {
688 const int nlev = r_tile_block_ + 1;
689 const int n = lat_tile_ + 1;
690 const int nxy = n * n;
691
692 // coords_sh(nxy,3) + src_sh(nxy,3,nlev) + k_sh(nxy,nlev) + r_sh(nlev)
693 const size_t nscalars = size_t( nxy ) * 3 + size_t( nxy ) * 3 * nlev + size_t( nxy ) * nlev + size_t( nlev );
694
695 return sizeof( ScalarType ) * nscalars;
696 }
697
698 private:
699 /**
700 * @brief Decode a team league rank / team rank into:
701 * - tile origin (x0,y0,r0)
702 * - intra-tile thread coordinates (tx,ty,tr)
703 * - target cell (x_cell,y_cell,r_cell)
704 */
705 KOKKOS_INLINE_FUNCTION
706 void decode_team_indices(
707 const Team& team,
708 int& local_subdomain_id,
709 int& x0,
710 int& y0,
711 int& r0,
712 int& tx,
713 int& ty,
714 int& tr,
715 int& x_cell,
716 int& y_cell,
717 int& r_cell ) const
718 {
719 int tmp = team.league_rank();
720
721 const int r_tile_id = tmp % r_tiles_;
722 tmp /= r_tiles_;
723
724 const int lat_y_id = tmp % lat_tiles_;
725 tmp /= lat_tiles_;
726
727 const int lat_x_id = tmp % lat_tiles_;
728 tmp /= lat_tiles_;
729
730 local_subdomain_id = tmp;
731
732 x0 = lat_x_id * lat_tile_;
733 y0 = lat_y_id * lat_tile_;
734 r0 = r_tile_id * r_tile_block_;
735
736 const int tid = team.team_rank();
737 tr = tid % r_tile_;
738 tx = ( tid / r_tile_ ) % lat_tile_;
739 ty = tid / ( r_tile_ * lat_tile_ );
740
741 x_cell = x0 + tx;
742 y_cell = y0 + ty;
743 r_cell = r0 + tr;
744 }
745
746 // -------------------------------------------------------------------------
747 // Path-specific team entry points (NO path branching)
748 // These are the functions called directly by host-dispatched kernel launches.
749 // -------------------------------------------------------------------------
750
751 /**
752 * @brief Team entry for slow (local-matrix) path.
753 *
754 * This wrapper performs only:
755 * - team index decoding
756 * - boundary flag queries
757 * - forwarding to operator_slow_path()
758 *
759 * No dispatch branching happens here.
760 */
761 KOKKOS_INLINE_FUNCTION
762 void run_team_slow( const Team& team ) const
763 {
764 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
765 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
766
767 if ( tr >= r_tile_ )
768 return;
769
770 for ( int pass = 0; pass < r_passes_; ++pass )
771 {
772 const int r_cell_pass = r0 + pass * r_tile_ + tr;
773 if ( r_cell_pass >= hex_rad_ )
774 break;
775
776 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell_pass, CMB );
777 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell_pass + 1, SURFACE );
778
779 operator_slow_path(
780 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell_pass, at_cmb, at_surface );
781 }
782 }
783
784 /**
785 * @brief Team entry for fast Dirichlet/Neumann matrix-free path.
786 *
787 * Templated on Diagonal so the compiler can dead-code-eliminate the
788 * unused matvec or diagonal-only path, reducing register pressure.
789 */
790 template < bool Diagonal >
791 KOKKOS_INLINE_FUNCTION void run_team_fast_dirichlet_neumann( const Team& team ) const
792 {
793 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
794 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
795
796 if ( tr >= r_tile_ )
797 return;
798
799 operator_fast_dirichlet_neumann_path< Diagonal >(
800 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
801 }
802
803 /**
804 * @brief Team entry for fast free-slip matrix-free path.
805 *
806 * Templated on Diagonal so the compiler can dead-code-eliminate the
807 * unused matvec or diagonal-only path, reducing register pressure.
808 */
809 template < bool Diagonal >
810 KOKKOS_INLINE_FUNCTION void run_team_fast_freeslip( const Team& team ) const
811 {
812 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
813 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
814
815 if ( tr >= r_tile_ )
816 return;
817
818 operator_fast_freeslip_path< Diagonal >( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
819 }
820
821 // ===================== SLOW PATH =====================
822 KOKKOS_INLINE_FUNCTION
823 void operator_slow_path(
824 const Team& team,
825 const int local_subdomain_id,
826 const int x0,
827 const int y0,
828 const int r0,
829 const int tx,
830 const int ty,
831 const int tr,
832 const int x_cell,
833 const int y_cell,
834 const int r_cell,
835 const bool at_cmb,
836 const bool at_surface ) const
837 {
838 (void) team;
839 (void) x0;
840 (void) y0;
841 (void) r0;
842 (void) tx;
843 (void) ty;
844 (void) tr;
845
846 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ || r_cell >= hex_rad_ )
847 return;
848
849 dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > A[num_wedges_per_hex_cell] = {};
850
851 if ( operator_stored_matrix_mode_ == linalg::OperatorStoredMatrixMode::Full )
852 {
853 A[0] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
854 A[1] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
855 }
856 else if ( operator_stored_matrix_mode_ == linalg::OperatorStoredMatrixMode::Selective )
857 {
858 if ( local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 ) &&
859 local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 ) )
860 {
861 A[0] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
862 A[1] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
863 }
864 else
865 {
866 A[0] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
867 A[1] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
868 }
869 }
870 else
871 {
872 A[0] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
873 A[1] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
874 }
875
876 dense::Vec< ScalarT, 18 > src[num_wedges_per_hex_cell];
877 for ( int dimj = 0; dimj < 3; dimj++ )
878 {
879 dense::Vec< ScalarT, 6 > src_d[num_wedges_per_hex_cell];
880 extract_local_wedge_vector_coefficients( src_d, local_subdomain_id, x_cell, y_cell, r_cell, dimj, src_ );
881
882 for ( int wedge = 0; wedge < num_wedges_per_hex_cell; wedge++ )
883 {
884 for ( int i = 0; i < num_nodes_per_wedge; i++ )
885 {
886 src[wedge]( dimj * num_nodes_per_wedge + i ) = src_d[wedge]( i );
887 }
888 }
889 }
890
891 dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > boundary_mask;
892 boundary_mask.fill( 1.0 );
893
894 bool freeslip_reorder = false;
895 dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > R[num_wedges_per_hex_cell];
896
897 if ( at_cmb || at_surface )
898 {
899 ShellBoundaryFlag sbf = at_cmb ? CMB : SURFACE;
901
902 if ( bcf == DIRICHLET )
903 {
904 for ( int dimi = 0; dimi < 3; ++dimi )
905 {
906 for ( int dimj = 0; dimj < 3; ++dimj )
907 {
908 for ( int i = 0; i < num_nodes_per_wedge; i++ )
909 {
910 for ( int j = 0; j < num_nodes_per_wedge; j++ )
911 {
912 if ( ( at_cmb && ( ( dimi == dimj && i != j && ( i < 3 || j < 3 ) ) ||
913 ( dimi != dimj && ( i < 3 || j < 3 ) ) ) ) ||
914 ( at_surface && ( ( dimi == dimj && i != j && ( i >= 3 || j >= 3 ) ) ||
915 ( dimi != dimj && ( i >= 3 || j >= 3 ) ) ) ) )
916 {
917 boundary_mask( i + dimi * num_nodes_per_wedge, j + dimj * num_nodes_per_wedge ) =
918 0.0;
919 }
920 }
921 }
922 }
923 }
924 }
925 else if ( bcf == FREESLIP )
926 {
927 freeslip_reorder = true;
928 dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > A_tmp[num_wedges_per_hex_cell] = {};
929
930 for ( int wedge = 0; wedge < 2; ++wedge )
931 {
932 for ( int dimi = 0; dimi < 3; ++dimi )
933 {
934 for ( int node_idxi = 0; node_idxi < num_nodes_per_wedge; node_idxi++ )
935 {
936 for ( int dimj = 0; dimj < 3; ++dimj )
937 {
938 for ( int node_idxj = 0; node_idxj < num_nodes_per_wedge; node_idxj++ )
939 {
940 A_tmp[wedge]( node_idxi * 3 + dimi, node_idxj * 3 + dimj ) = A[wedge](
941 node_idxi + dimi * num_nodes_per_wedge,
942 node_idxj + dimj * num_nodes_per_wedge );
943 }
944 }
945 }
946 }
948 }
949
950 constexpr int layer_hex_offset_x[2][3] = { { 0, 1, 0 }, { 1, 0, 1 } };
951 constexpr int layer_hex_offset_y[2][3] = { { 0, 0, 1 }, { 1, 1, 0 } };
952
953 for ( int wedge = 0; wedge < 2; ++wedge )
954 {
955 for ( int i = 0; i < LocalMatrixDim; ++i )
956 {
957 R[wedge]( i, i ) = 1.0;
958 }
959
960 for ( int boundary_node_idx = 0; boundary_node_idx < 3; boundary_node_idx++ )
961 {
962 dense::Vec< double, 3 > normal = grid::shell::coords(
963 local_subdomain_id,
964 x_cell + layer_hex_offset_x[wedge][boundary_node_idx],
965 y_cell + layer_hex_offset_y[wedge][boundary_node_idx],
966 r_cell + ( at_cmb ? 0 : 1 ),
967 grid_,
968 radii_ );
969
970 auto R_i = trafo_mat_cartesian_to_normal_tangential( normal );
971
972 const int offset_in_R = at_cmb ? 0 : 9;
973 for ( int dimi = 0; dimi < 3; ++dimi )
974 {
975 for ( int dimj = 0; dimj < 3; ++dimj )
976 {
977 R[wedge](
978 offset_in_R + boundary_node_idx * 3 + dimi,
979 offset_in_R + boundary_node_idx * 3 + dimj ) = R_i( dimi, dimj );
980 }
981 }
982 }
983
984 A[wedge] = R[wedge] * A_tmp[wedge] * R[wedge].transposed();
985
986 auto src_tmp = R[wedge] * src[wedge];
987 for ( int i = 0; i < 18; ++i )
988 {
989 src[wedge]( i ) = src_tmp( i );
990 }
991
992 const int node_start = at_surface ? 3 : 0;
993 const int node_end = at_surface ? 6 : 3;
994
995 for ( int node_idx = node_start; node_idx < node_end; node_idx++ )
996 {
997 const int idx = node_idx * 3;
998 for ( int k = 0; k < 18; ++k )
999 {
1000 if ( k != idx )
1001 {
1002 boundary_mask( idx, k ) = 0.0;
1003 boundary_mask( k, idx ) = 0.0;
1004 }
1005 }
1006 }
1007 }
1008 }
1009 }
1010
1011 for ( int wedge = 0; wedge < num_wedges_per_hex_cell; wedge++ )
1012 {
1013 A[wedge].hadamard_product( boundary_mask );
1014 }
1015
1016 if ( diagonal_ )
1017 {
1018 A[0] = A[0].diagonal();
1019 A[1] = A[1].diagonal();
1020 }
1021
1022 dense::Vec< ScalarT, LocalMatrixDim > dst[num_wedges_per_hex_cell];
1023 dst[0] = A[0] * src[0];
1024 dst[1] = A[1] * src[1];
1025
1026 if ( freeslip_reorder )
1027 {
1028 dense::Vec< ScalarT, LocalMatrixDim > dst_tmp[num_wedges_per_hex_cell];
1029 dst_tmp[0] = R[0].transposed() * dst[0];
1030 dst_tmp[1] = R[1].transposed() * dst[1];
1031
1032 for ( int i = 0; i < 18; ++i )
1033 {
1034 dst[0]( i ) = dst_tmp[0]( i );
1035 dst[1]( i ) = dst_tmp[1]( i );
1036 }
1037
1040 }
1041
1042 for ( int dimi = 0; dimi < 3; dimi++ )
1043 {
1044 dense::Vec< ScalarT, 6 > dst_d[num_wedges_per_hex_cell];
1045 dst_d[0] = dst[0].template slice< 6 >( dimi * num_nodes_per_wedge );
1046 dst_d[1] = dst[1].template slice< 6 >( dimi * num_nodes_per_wedge );
1047
1049 dst_, local_subdomain_id, x_cell, y_cell, r_cell, dimi, dst_d );
1050 }
1051 }
1052
1053 // ===================== FAST DIRICHLET/NEUMANN PATH =====================
1054 template < bool Diagonal >
1055 KOKKOS_INLINE_FUNCTION void operator_fast_dirichlet_neumann_path(
1056 const Team& team,
1057 const int local_subdomain_id,
1058 const int x0,
1059 const int y0,
1060 const int r0,
1061 const int tx,
1062 const int ty,
1063 const int tr,
1064 const int x_cell,
1065 const int y_cell ) const
1066 {
1067 const int nlev = r_tile_block_ + 1;
1068 const int nxy = ( lat_tile_ + 1 ) * ( lat_tile_ + 1 );
1069
1070 double* shmem =
1071 reinterpret_cast< double* >( team.team_shmem().get_shmem( team_shmem_size_dn( team.team_size() ) ) );
1072
1073 using ScratchCoords =
1074 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1075 using ScratchSrc = Kokkos::
1076 View< double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1077 using ScratchK =
1078 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1079
1080 ScratchCoords coords_sh( shmem, nxy, 3 );
1081 shmem += nxy * 3;
1082
1083 ScratchSrc src_sh( shmem, nxy, 3, nlev );
1084 shmem += nxy * 3 * nlev;
1085
1086 ScratchK k_sh( shmem, nxy, nlev );
1087 shmem += nxy * nlev;
1088
1089 auto r_sh =
1090 Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >(
1091 shmem, nlev );
1092
1093 auto node_id = [&]( int nx, int ny ) -> int { return nx + ( lat_tile_ + 1 ) * ny; };
1094
1095 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&]( int n ) {
1096 const int dxn = n % ( lat_tile_ + 1 );
1097 const int dyn = n / ( lat_tile_ + 1 );
1098 const int xi = x0 + dxn;
1099 const int yi = y0 + dyn;
1100
1101 if ( xi <= hex_lat_ && yi <= hex_lat_ )
1102 {
1103 coords_sh( n, 0 ) = grid_( local_subdomain_id, xi, yi, 0 );
1104 coords_sh( n, 1 ) = grid_( local_subdomain_id, xi, yi, 1 );
1105 coords_sh( n, 2 ) = grid_( local_subdomain_id, xi, yi, 2 );
1106 }
1107 else
1108 {
1109 coords_sh( n, 0 ) = coords_sh( n, 1 ) = coords_sh( n, 2 ) = 0.0;
1110 }
1111 } );
1112
1113 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&]( int lvl ) {
1114 const int rr = r0 + lvl;
1115 r_sh( lvl ) = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
1116 } );
1117
1118 const int total_pairs = nxy * nlev;
1119 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total_pairs ), [&]( int t ) {
1120 const int node = t / nlev;
1121 const int lvl = t - node * nlev;
1122
1123 const int dxn = node % ( lat_tile_ + 1 );
1124 const int dyn = node / ( lat_tile_ + 1 );
1125
1126 const int xi = x0 + dxn;
1127 const int yi = y0 + dyn;
1128 const int rr = r0 + lvl;
1129
1130 if ( xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_ )
1131 {
1132 k_sh( node, lvl ) = k_( local_subdomain_id, xi, yi, rr );
1133 src_sh( node, 0, lvl ) = src_( local_subdomain_id, xi, yi, rr, 0 );
1134 src_sh( node, 1, lvl ) = src_( local_subdomain_id, xi, yi, rr, 1 );
1135 src_sh( node, 2, lvl ) = src_( local_subdomain_id, xi, yi, rr, 2 );
1136 }
1137 else
1138 {
1139 k_sh( node, lvl ) = 0.0;
1140 src_sh( node, 0, lvl ) = src_sh( node, 1, lvl ) = src_sh( node, 2, lvl ) = 0.0;
1141 }
1142 } );
1143
1144 team.team_barrier();
1145
1146 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ )
1147 return;
1148
1149 constexpr double ONE_THIRD = 1.0 / 3.0;
1150 constexpr double ONE_SIXTH = 1.0 / 6.0;
1151
1152 static constexpr double dN_ref[6][3] = {
1153 { -0.5, -0.5, -ONE_SIXTH },
1154 { 0.5, 0.0, -ONE_SIXTH },
1155 { 0.0, 0.5, -ONE_SIXTH },
1156 { -0.5, -0.5, ONE_SIXTH },
1157 { 0.5, 0.0, ONE_SIXTH },
1158 { 0.0, 0.5, ONE_SIXTH } };
1159
1160 static constexpr int WEDGE_NODE_OFF[2][6][3] = {
1161 { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
1162 { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };
1163
1164 const int n00 = node_id( tx, ty );
1165 const int n01 = node_id( tx, ty + 1 );
1166 const int n10 = node_id( tx + 1, ty );
1167 const int n11 = node_id( tx + 1, ty + 1 );
1168
1169 for ( int pass = 0; pass < r_passes_; ++pass )
1170 {
1171 const int lvl0 = pass * r_tile_ + tr;
1172 const int r_cell = r0 + lvl0;
1173
1174 if ( r_cell >= hex_rad_ )
1175 break;
1176
1177 const double r_0 = r_sh( lvl0 );
1178 const double r_1 = r_sh( lvl0 + 1 );
1179
1180 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
1181 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
1182
1183 const bool at_boundary = at_cmb || at_surface;
1184 bool treat_boundary_dirichlet = false;
1185 if ( at_boundary )
1186 {
1187 const ShellBoundaryFlag sbf = at_cmb ? CMB : SURFACE;
1188 treat_boundary_dirichlet = ( get_boundary_condition_flag( bcs_, sbf ) == DIRICHLET );
1189 }
1190
1191 const int cmb_shift = ( ( at_boundary && treat_boundary_dirichlet && ( !Diagonal ) && at_cmb ) ? 3 : 0 );
1192 const int surface_shift =
1193 ( ( at_boundary && treat_boundary_dirichlet && ( !Diagonal ) && at_surface ) ? 3 : 0 );
1194
1195 for ( int w = 0; w < 2; ++w )
1196 {
1197 const int v0 = w == 0 ? n00 : n11;
1198 const int v1 = w == 0 ? n10 : n01;
1199 const int v2 = w == 0 ? n01 : n10;
1200
1201 double k_sum = 0.0;
1202#pragma unroll
1203 for ( int node = 0; node < 6; ++node )
1204 {
1205 const int nid = node_id( tx + WEDGE_NODE_OFF[w][node][0], ty + WEDGE_NODE_OFF[w][node][1] );
1206 k_sum += k_sh( nid, lvl0 + WEDGE_NODE_OFF[w][node][2] );
1207 }
1208 const double k_eval = ONE_SIXTH * k_sum;
1209
1210 double kwJ;
1211
1212 // ==== Phase 1: Jacobian + Gather (gu tensor) ====
1213 // invJ lives only in this scope so the compiler can reclaim its registers.
1214 double gu00 = 0.0;
1215 double gu10 = 0.0, gu11 = 0.0;
1216 double gu20 = 0.0, gu21 = 0.0, gu22 = 0.0;
1217 double div_u = 0.0;
1218 {
1219 const double half_dr = 0.5 * ( r_1 - r_0 );
1220 const double r_mid = 0.5 * ( r_0 + r_1 );
1221
1222 const double J_0_0 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v1, 0 ) );
1223 const double J_0_1 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v2, 0 ) );
1224 const double J_0_2 =
1225 half_dr * ( ONE_THIRD * ( coords_sh( v0, 0 ) + coords_sh( v1, 0 ) + coords_sh( v2, 0 ) ) );
1226
1227 const double J_1_0 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v1, 1 ) );
1228 const double J_1_1 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v2, 1 ) );
1229 const double J_1_2 =
1230 half_dr * ( ONE_THIRD * ( coords_sh( v0, 1 ) + coords_sh( v1, 1 ) + coords_sh( v2, 1 ) ) );
1231
1232 const double J_2_0 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v1, 2 ) );
1233 const double J_2_1 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v2, 2 ) );
1234 const double J_2_2 =
1235 half_dr * ( ONE_THIRD * ( coords_sh( v0, 2 ) + coords_sh( v1, 2 ) + coords_sh( v2, 2 ) ) );
1236
1237 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 - J_0_1 * J_1_0 * J_2_2 +
1238 J_0_1 * J_1_2 * J_2_0 + J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
1239
1240 kwJ = k_eval * Kokkos::abs( J_det );
1241
1242 const double inv_det = 1.0 / J_det;
1243
1244 const double i00 = inv_det * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
1245 const double i01 = inv_det * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
1246 const double i02 = inv_det * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
1247 const double i10 = inv_det * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
1248 const double i11 = inv_det * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
1249 const double i12 = inv_det * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
1250 const double i20 = inv_det * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
1251 const double i21 = inv_det * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
1252 const double i22 = inv_det * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
1253
1254 if ( !Diagonal )
1255 {
1256#pragma unroll
1257 for ( int n = cmb_shift; n < 6 - surface_shift; ++n )
1258 {
1259 const double gx = dN_ref[n][0];
1260 const double gy = dN_ref[n][1];
1261 const double gz = dN_ref[n][2];
1262 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1263 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1264 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1265
1266 const int ddx = WEDGE_NODE_OFF[w][n][0];
1267 const int ddy = WEDGE_NODE_OFF[w][n][1];
1268 const int ddr = WEDGE_NODE_OFF[w][n][2];
1269 const int nid = node_id( tx + ddx, ty + ddy );
1270 const int lvl = lvl0 + ddr;
1271
1272 const double s0 = src_sh( nid, 0, lvl );
1273 const double s1 = src_sh( nid, 1, lvl );
1274 const double s2 = src_sh( nid, 2, lvl );
1275
1276 gu00 += g0 * s0;
1277 gu11 += g1 * s1;
1278 gu22 += g2 * s2;
1279 gu10 += 0.5 * ( g1 * s0 + g0 * s1 );
1280 gu20 += 0.5 * ( g2 * s0 + g0 * s2 );
1281 gu21 += 0.5 * ( g2 * s1 + g1 * s2 );
1282 div_u += g0 * s0 + g1 * s1 + g2 * s2;
1283 }
1284 }
1285 }
1286 // invJ (i00..i22) is now out of scope — registers can be reclaimed.
1287
1288 // ==== Phase 2: Recompute Jacobian + Scatter ====
1289 {
1290 const double half_dr = 0.5 * ( r_1 - r_0 );
1291 const double r_mid = 0.5 * ( r_0 + r_1 );
1292
1293 const double J_0_0 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v1, 0 ) );
1294 const double J_0_1 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v2, 0 ) );
1295 const double J_0_2 =
1296 half_dr * ( ONE_THIRD * ( coords_sh( v0, 0 ) + coords_sh( v1, 0 ) + coords_sh( v2, 0 ) ) );
1297
1298 const double J_1_0 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v1, 1 ) );
1299 const double J_1_1 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v2, 1 ) );
1300 const double J_1_2 =
1301 half_dr * ( ONE_THIRD * ( coords_sh( v0, 1 ) + coords_sh( v1, 1 ) + coords_sh( v2, 1 ) ) );
1302
1303 const double J_2_0 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v1, 2 ) );
1304 const double J_2_1 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v2, 2 ) );
1305 const double J_2_2 =
1306 half_dr * ( ONE_THIRD * ( coords_sh( v0, 2 ) + coords_sh( v1, 2 ) + coords_sh( v2, 2 ) ) );
1307
1308 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 - J_0_1 * J_1_0 * J_2_2 +
1309 J_0_1 * J_1_2 * J_2_0 + J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
1310
1311 const double inv_det = 1.0 / J_det;
1312
1313 const double i00 = inv_det * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
1314 const double i01 = inv_det * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
1315 const double i02 = inv_det * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
1316 const double i10 = inv_det * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
1317 const double i11 = inv_det * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
1318 const double i12 = inv_det * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
1319 const double i20 = inv_det * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
1320 const double i21 = inv_det * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
1321 const double i22 = inv_det * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
1322
1323 if ( !Diagonal )
1324 {
1325 constexpr double NEG_TWO_THIRDS = -0.66666666666666663;
1326#pragma unroll
1327 for ( int n = cmb_shift; n < 6 - surface_shift; ++n )
1328 {
1329 const double gx = dN_ref[n][0];
1330 const double gy = dN_ref[n][1];
1331 const double gz = dN_ref[n][2];
1332 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1333 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1334 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1335
1336 const int ddx = WEDGE_NODE_OFF[w][n][0];
1337 const int ddy = WEDGE_NODE_OFF[w][n][1];
1338 const int ddr = WEDGE_NODE_OFF[w][n][2];
1339 Kokkos::atomic_add(
1340 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 0 ),
1341 kwJ * ( 2.0 * ( g0 * gu00 + g1 * gu10 + g2 * gu20 ) + NEG_TWO_THIRDS * g0 * div_u ) );
1342 Kokkos::atomic_add(
1343 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 1 ),
1344 kwJ * ( 2.0 * ( g0 * gu10 + g1 * gu11 + g2 * gu21 ) + NEG_TWO_THIRDS * g1 * div_u ) );
1345 Kokkos::atomic_add(
1346 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 2 ),
1347 kwJ * ( 2.0 * ( g0 * gu20 + g1 * gu21 + g2 * gu22 ) + NEG_TWO_THIRDS * g2 * div_u ) );
1348 }
1349 }
1350
1351 if ( Diagonal || ( treat_boundary_dirichlet && at_boundary ) )
1352 {
1353#pragma unroll
1354 for ( int n = surface_shift; n < 6 - cmb_shift; ++n )
1355 {
1356 const double gx = dN_ref[n][0];
1357 const double gy = dN_ref[n][1];
1358 const double gz = dN_ref[n][2];
1359 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1360 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1361 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1362 const double gg = g0 * g0 + g1 * g1 + g2 * g2;
1363
1364 const int nid = node_id( tx + WEDGE_NODE_OFF[w][n][0], ty + WEDGE_NODE_OFF[w][n][1] );
1365 const int lvl = lvl0 + WEDGE_NODE_OFF[w][n][2];
1366
1367 const double sv0 = src_sh( nid, 0, lvl );
1368 const double sv1 = src_sh( nid, 1, lvl );
1369 const double sv2 = src_sh( nid, 2, lvl );
1370
1371 const int ddx = WEDGE_NODE_OFF[w][n][0];
1372 const int ddy = WEDGE_NODE_OFF[w][n][1];
1373 const int ddr = WEDGE_NODE_OFF[w][n][2];
1374 Kokkos::atomic_add(
1375 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 0 ),
1376 kwJ * sv0 * ( gg + ONE_THIRD * g0 * g0 ) );
1377 Kokkos::atomic_add(
1378 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 1 ),
1379 kwJ * sv1 * ( gg + ONE_THIRD * g1 * g1 ) );
1380 Kokkos::atomic_add(
1381 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 2 ),
1382 kwJ * sv2 * ( gg + ONE_THIRD * g2 * g2 ) );
1383 }
1384 }
1385 }
1386
1387 } // end wedge loop
1388 }
1389 }
1390
1391 // ===================== FAST FREESLIP PATH =====================
1392 KOKKOS_INLINE_FUNCTION
1393 void normalize3( double& x, double& y, double& z ) const
1394 {
1395 const double n2 = x * x + y * y + z * z;
1396 if ( n2 > 0.0 )
1397 {
1398 const double invn = 1.0 / Kokkos::sqrt( n2 );
1399 x *= invn;
1400 y *= invn;
1401 z *= invn;
1402 }
1403 }
1404
1405 KOKKOS_INLINE_FUNCTION
1406 void project_tangential_inplace(
1407 const double nx,
1408 const double ny,
1409 const double nz,
1410 double& ux,
1411 double& uy,
1412 double& uz ) const
1413 {
1414 const double dot = nx * ux + ny * uy + nz * uz;
1415 ux -= dot * nx;
1416 uy -= dot * ny;
1417 uz -= dot * nz;
1418 }
1419
1420 template < bool Diagonal >
1421 KOKKOS_INLINE_FUNCTION void operator_fast_freeslip_path(
1422 const Team& team,
1423 const int local_subdomain_id,
1424 const int x0,
1425 const int y0,
1426 const int r0,
1427 const int tx,
1428 const int ty,
1429 const int tr,
1430 const int x_cell,
1431 const int y_cell ) const
1432 {
1433 const int nlev = r_tile_block_ + 1;
1434 const int nxy = ( lat_tile_ + 1 ) * ( lat_tile_ + 1 );
1435
1436 double* shmem =
1437 reinterpret_cast< double* >( team.team_shmem().get_shmem( team_shmem_size( team.team_size() ) ) );
1438
1439 using ScratchCoords =
1440 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1441 using ScratchSrc = Kokkos::
1442 View< double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1443 using ScratchK =
1444 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1445
1446 ScratchCoords coords_sh( shmem, nxy, 3 );
1447 shmem += nxy * 3;
1448
1449 // Normalized normals in shared memory — avoids 12 persistent register-doubles.
1450 ScratchCoords normals_sh( shmem, nxy, 3 );
1451 shmem += nxy * 3;
1452
1453 ScratchSrc src_sh( shmem, nxy, 3, nlev );
1454 shmem += nxy * 3 * nlev;
1455
1456 ScratchK k_sh( shmem, nxy, nlev );
1457 shmem += nxy * nlev;
1458
1459 auto r_sh =
1460 Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >(
1461 shmem, nlev );
1462
1463 auto node_id = [&]( int nx, int ny ) -> int { return nx + ( lat_tile_ + 1 ) * ny; };
1464
1465 // Preload coords + compute normalized normals cooperatively.
1466 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&]( int n ) {
1467 const int dxn = n % ( lat_tile_ + 1 );
1468 const int dyn = n / ( lat_tile_ + 1 );
1469 const int xi = x0 + dxn;
1470 const int yi = y0 + dyn;
1471
1472 if ( xi <= hex_lat_ && yi <= hex_lat_ )
1473 {
1474 const double cx = grid_( local_subdomain_id, xi, yi, 0 );
1475 const double cy = grid_( local_subdomain_id, xi, yi, 1 );
1476 const double cz = grid_( local_subdomain_id, xi, yi, 2 );
1477 coords_sh( n, 0 ) = cx;
1478 coords_sh( n, 1 ) = cy;
1479 coords_sh( n, 2 ) = cz;
1480
1481 const double n2 = cx * cx + cy * cy + cz * cz;
1482 if ( n2 > 0.0 )
1483 {
1484 const double invn = 1.0 / Kokkos::sqrt( n2 );
1485 normals_sh( n, 0 ) = cx * invn;
1486 normals_sh( n, 1 ) = cy * invn;
1487 normals_sh( n, 2 ) = cz * invn;
1488 }
1489 else
1490 {
1491 normals_sh( n, 0 ) = normals_sh( n, 1 ) = normals_sh( n, 2 ) = 0.0;
1492 }
1493 }
1494 else
1495 {
1496 coords_sh( n, 0 ) = coords_sh( n, 1 ) = coords_sh( n, 2 ) = 0.0;
1497 normals_sh( n, 0 ) = normals_sh( n, 1 ) = normals_sh( n, 2 ) = 0.0;
1498 }
1499 } );
1500
1501 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&]( int lvl ) {
1502 const int rr = r0 + lvl;
1503 r_sh( lvl ) = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
1504 } );
1505
1506 const int total_pairs = nxy * nlev;
1507 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total_pairs ), [&]( int t ) {
1508 const int node = t / nlev;
1509 const int lvl = t - node * nlev;
1510
1511 const int dxn = node % ( lat_tile_ + 1 );
1512 const int dyn = node / ( lat_tile_ + 1 );
1513
1514 const int xi = x0 + dxn;
1515 const int yi = y0 + dyn;
1516 const int rr = r0 + lvl;
1517
1518 if ( xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_ )
1519 {
1520 k_sh( node, lvl ) = k_( local_subdomain_id, xi, yi, rr );
1521 src_sh( node, 0, lvl ) = src_( local_subdomain_id, xi, yi, rr, 0 );
1522 src_sh( node, 1, lvl ) = src_( local_subdomain_id, xi, yi, rr, 1 );
1523 src_sh( node, 2, lvl ) = src_( local_subdomain_id, xi, yi, rr, 2 );
1524 }
1525 else
1526 {
1527 k_sh( node, lvl ) = 0.0;
1528 src_sh( node, 0, lvl ) = src_sh( node, 1, lvl ) = src_sh( node, 2, lvl ) = 0.0;
1529 }
1530 } );
1531
1532 team.team_barrier();
1533
1534 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ )
1535 return;
1536
1537 for ( int pass = 0; pass < r_passes_; ++pass )
1538 {
1539 const int lvl0 = pass * r_tile_ + tr;
1540 const int r_cell = r0 + lvl0;
1541
1542 if ( r_cell >= hex_rad_ )
1543 break;
1544
1545 const double r_0 = r_sh( lvl0 );
1546 const double r_1 = r_sh( lvl0 + 1 );
1547
1548 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
1549 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
1550
1551 const BoundaryConditionFlag cmb_bc = get_boundary_condition_flag( bcs_, CMB );
1552 const BoundaryConditionFlag surface_bc = get_boundary_condition_flag( bcs_, SURFACE );
1553
1554 const bool cmb_freeslip = at_cmb && ( cmb_bc == FREESLIP );
1555 const bool surf_freeslip = at_surface && ( surface_bc == FREESLIP );
1556 const bool cmb_dirichlet = at_cmb && ( cmb_bc == DIRICHLET );
1557 const bool surface_dirichlet = at_surface && ( surface_bc == DIRICHLET );
1558
1559 const int cmb_shift = ( ( cmb_dirichlet && ( !Diagonal ) ) ? 3 : 0 );
1560 const int surface_shift = ( ( surface_dirichlet && ( !Diagonal ) ) ? 3 : 0 );
1561
1562 static constexpr int WEDGE_NODE_OFF[2][6][3] = {
1563 { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
1564 { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };
1565
1566 static constexpr int WEDGE_TO_UNIQUE[2][6] = { { 0, 1, 2, 3, 4, 5 }, { 6, 2, 1, 7, 5, 4 } };
1567
1568 constexpr double ONE_THIRD = 1.0 / 3.0;
1569 constexpr double ONE_SIXTH = 1.0 / 6.0;
1570 constexpr double NEG_TWO_THIRDS = -0.66666666666666663;
1571
1572 static constexpr double dN_ref[6][3] = {
1573 { -0.5, -0.5, -ONE_SIXTH },
1574 { 0.5, 0.0, -ONE_SIXTH },
1575 { 0.0, 0.5, -ONE_SIXTH },
1576 { -0.5, -0.5, ONE_SIXTH },
1577 { 0.5, 0.0, ONE_SIXTH },
1578 { 0.0, 0.5, ONE_SIXTH } };
1579
1580 const int n00 = node_id( tx, ty );
1581 const int n01 = node_id( tx, ty + 1 );
1582 const int n10 = node_id( tx + 1, ty );
1583 const int n11 = node_id( tx + 1, ty + 1 );
1584
1585 // Corner-to-shared-memory-node mapping: 0→n00, 1→n10, 2→n01, 3→n11.
1586 const int corner_node[4] = { n00, n10, n01, n11 };
1587 // Corner-to-unique-node for CMB (r=0) and surface (r=1) layers.
1588 static constexpr int CMB_CORNER_TO_UNIQUE[4] = { 0, 1, 2, 6 };
1589 static constexpr int SURF_CORNER_TO_UNIQUE[4] = { 3, 4, 5, 7 };
1590
1591 double dst8[3][8] = { { 0.0 } };
1592
1593 // Scalar Ann accumulators per corner. Accumulated inside the test-side loop
1594 // (merged from the former separate Ann loop to reuse already-computed gradients).
1595 double Ann_acc_cmb[4] = {};
1596 double Ann_acc_surf[4] = {};
1597
1598 for ( int w = 0; w < 2; ++w )
1599 {
1600 const int v0 = w == 0 ? n00 : n11;
1601 const int v1 = w == 0 ? n10 : n01;
1602 const int v2 = w == 0 ? n01 : n10;
1603
1604 double k_sum = 0.0;
1605#pragma unroll
1606 for ( int node = 0; node < 6; ++node )
1607 {
1608 const int ddx = WEDGE_NODE_OFF[w][node][0];
1609 const int ddy = WEDGE_NODE_OFF[w][node][1];
1610 const int ddr = WEDGE_NODE_OFF[w][node][2];
1611
1612 const int nid = node_id( tx + ddx, ty + ddy );
1613 const int lvl = lvl0 + ddr;
1614 k_sum += k_sh( nid, lvl );
1615 }
1616 const double k_eval = ONE_SIXTH * k_sum;
1617
1618 double wJ = 0.0;
1619 double i00, i01, i02;
1620 double i10, i11, i12;
1621 double i20, i21, i22;
1622
1623 {
1624 const double half_dr = 0.5 * ( r_1 - r_0 );
1625 const double r_mid = 0.5 * ( r_0 + r_1 );
1626
1627 const double J_0_0 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v1, 0 ) );
1628 const double J_0_1 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v2, 0 ) );
1629 const double J_0_2 =
1630 half_dr * ( ONE_THIRD * ( coords_sh( v0, 0 ) + coords_sh( v1, 0 ) + coords_sh( v2, 0 ) ) );
1631
1632 const double J_1_0 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v1, 1 ) );
1633 const double J_1_1 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v2, 1 ) );
1634 const double J_1_2 =
1635 half_dr * ( ONE_THIRD * ( coords_sh( v0, 1 ) + coords_sh( v1, 1 ) + coords_sh( v2, 1 ) ) );
1636
1637 const double J_2_0 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v1, 2 ) );
1638 const double J_2_1 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v2, 2 ) );
1639 const double J_2_2 =
1640 half_dr * ( ONE_THIRD * ( coords_sh( v0, 2 ) + coords_sh( v1, 2 ) + coords_sh( v2, 2 ) ) );
1641
1642 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 - J_0_1 * J_1_0 * J_2_2 +
1643 J_0_1 * J_1_2 * J_2_0 + J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
1644
1645 const double invJ = 1.0 / J_det;
1646
1647 i00 = invJ * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
1648 i01 = invJ * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
1649 i02 = invJ * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
1650
1651 i10 = invJ * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
1652 i11 = invJ * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
1653 i12 = invJ * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
1654
1655 i20 = invJ * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
1656 i21 = invJ * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
1657 i22 = invJ * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
1658
1659 wJ = Kokkos::abs( J_det );
1660 }
1661
1662 const double kwJ = k_eval * wJ;
1663
1664 // ---- Fused trial + test side with merged Ann accumulation ----
1665 static constexpr int CMB_NODE_TO_CORNER[2][3] = { { 0, 1, 2 }, { 3, 2, 1 } };
1666
1667 double gu00 = 0.0;
1668 double gu10 = 0.0, gu11 = 0.0;
1669 double gu20 = 0.0, gu21 = 0.0, gu22 = 0.0;
1670 double div_u = 0.0;
1671
1672 if ( !Diagonal )
1673 {
1674 // Trial side: accumulate symmetric gradient of u (fused dim loops).
1675 // Read directly from shared memory with inline tangential projection for freeslip nodes.
1676#pragma unroll
1677 for ( int n = cmb_shift; n < 6 - surface_shift; ++n )
1678 {
1679 const double gx = dN_ref[n][0];
1680 const double gy = dN_ref[n][1];
1681 const double gz = dN_ref[n][2];
1682 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1683 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1684 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1685
1686 const int nid = node_id( tx + WEDGE_NODE_OFF[w][n][0], ty + WEDGE_NODE_OFF[w][n][1] );
1687 const int lvl = lvl0 + WEDGE_NODE_OFF[w][n][2];
1688
1689 double s0 = src_sh( nid, 0, lvl );
1690 double s1 = src_sh( nid, 1, lvl );
1691 double s2 = src_sh( nid, 2, lvl );
1692
1693 // Inline tangential projection for freeslip boundary nodes.
1694 if ( cmb_freeslip && n < 3 )
1695 {
1696 const double nx = normals_sh( nid, 0 );
1697 const double ny = normals_sh( nid, 1 );
1698 const double nz = normals_sh( nid, 2 );
1699 const double dot = nx * s0 + ny * s1 + nz * s2;
1700 s0 -= dot * nx;
1701 s1 -= dot * ny;
1702 s2 -= dot * nz;
1703 }
1704 if ( surf_freeslip && n >= 3 )
1705 {
1706 const double nx = normals_sh( nid, 0 );
1707 const double ny = normals_sh( nid, 1 );
1708 const double nz = normals_sh( nid, 2 );
1709 const double dot = nx * s0 + ny * s1 + nz * s2;
1710 s0 -= dot * nx;
1711 s1 -= dot * ny;
1712 s2 -= dot * nz;
1713 }
1714
1715 gu00 += g0 * s0;
1716 gu11 += g1 * s1;
1717 gu22 += g2 * s2;
1718 gu10 += 0.5 * ( g1 * s0 + g0 * s1 );
1719 gu20 += 0.5 * ( g2 * s0 + g0 * s2 );
1720 gu21 += 0.5 * ( g2 * s1 + g1 * s2 );
1721 div_u += g0 * s0 + g1 * s1 + g2 * s2;
1722 }
1723
1724 // Test side + merged Ann accumulation.
1725 // Ann uses the same gradient already computed — no separate loop needed.
1726#pragma unroll
1727 for ( int n = cmb_shift; n < 6 - surface_shift; ++n )
1728 {
1729 const double gx = dN_ref[n][0];
1730 const double gy = dN_ref[n][1];
1731 const double gz = dN_ref[n][2];
1732 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1733 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1734 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1735
1736 const int uid = WEDGE_TO_UNIQUE[w][n];
1737 dst8[0][uid] +=
1738 kwJ * ( 2.0 * ( g0 * gu00 + g1 * gu10 + g2 * gu20 ) + NEG_TWO_THIRDS * g0 * div_u );
1739 dst8[1][uid] +=
1740 kwJ * ( 2.0 * ( g0 * gu10 + g1 * gu11 + g2 * gu21 ) + NEG_TWO_THIRDS * g1 * div_u );
1741 dst8[2][uid] +=
1742 kwJ * ( 2.0 * ( g0 * gu20 + g1 * gu21 + g2 * gu22 ) + NEG_TWO_THIRDS * g2 * div_u );
1743
1744 // Accumulate Ann for freeslip CMB nodes (n < 3) and surface nodes (n >= 3).
1745 if ( cmb_freeslip && n < 3 )
1746 {
1747 const int corner = CMB_NODE_TO_CORNER[w][n];
1748 const int cn = corner_node[corner];
1749 const double nxu = normals_sh( cn, 0 );
1750 const double nyu = normals_sh( cn, 1 );
1751 const double nzu = normals_sh( cn, 2 );
1752 const double gg = g0 * g0 + g1 * g1 + g2 * g2;
1753 const double ng = nxu * g0 + nyu * g1 + nzu * g2;
1754 Ann_acc_cmb[corner] += kwJ * ( gg + ONE_THIRD * ng * ng );
1755 }
1756 if ( surf_freeslip && n >= 3 )
1757 {
1758 const int corner = CMB_NODE_TO_CORNER[w][n - 3];
1759 const int cn = corner_node[corner];
1760 const double nxu = normals_sh( cn, 0 );
1761 const double nyu = normals_sh( cn, 1 );
1762 const double nzu = normals_sh( cn, 2 );
1763 const double gg = g0 * g0 + g1 * g1 + g2 * g2;
1764 const double ng = nxu * g0 + nyu * g1 + nzu * g2;
1765 Ann_acc_surf[corner] += kwJ * ( gg + ONE_THIRD * ng * ng );
1766 }
1767 }
1768 }
1769
1770 // ---- Diagonal / Dirichlet boundary handling (fused) ----
1771 if ( Diagonal || cmb_dirichlet || surface_dirichlet )
1772 {
1773 // Fused diagonal: kwJ * s_d * (|g|^2 + (1/3) * g_d^2)
1774#pragma unroll
1775 for ( int n = surface_shift; n < 6 - cmb_shift; ++n )
1776 {
1777 if ( Diagonal && cmb_freeslip && n < 3 )
1778 continue;
1779 if ( Diagonal && surf_freeslip && n >= 3 )
1780 continue;
1781
1782 const double gx = dN_ref[n][0];
1783 const double gy = dN_ref[n][1];
1784 const double gz = dN_ref[n][2];
1785 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1786 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1787 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1788 const double gg = g0 * g0 + g1 * g1 + g2 * g2;
1789
1790 const int uid = WEDGE_TO_UNIQUE[w][n];
1791 const int nid = node_id( tx + WEDGE_NODE_OFF[w][n][0], ty + WEDGE_NODE_OFF[w][n][1] );
1792 const int lvl = lvl0 + WEDGE_NODE_OFF[w][n][2];
1793 const double s0 = src_sh( nid, 0, lvl );
1794 const double s1 = src_sh( nid, 1, lvl );
1795 const double s2 = src_sh( nid, 2, lvl );
1796
1797 dst8[0][uid] += kwJ * s0 * ( gg + ONE_THIRD * g0 * g0 );
1798 dst8[1][uid] += kwJ * s1 * ( gg + ONE_THIRD * g1 * g1 );
1799 dst8[2][uid] += kwJ * s2 * ( gg + ONE_THIRD * g2 * g2 );
1800 }
1801
1802 // For free-slip boundary nodes in diagonal mode: compute R^T diag(R A_3x3 R^T) R src.
1803 // Normals loaded from shared memory, u_n recomputed from src_sh (original, unprojected).
1804 if ( Diagonal )
1805 {
1806 static constexpr int FS_CORNER_MAP[2][3] = { { 0, 1, 2 }, { 3, 2, 1 } };
1807
1808 auto apply_rotated_diag = [&]( const int ni, const int node_idx, const int src_lvl ) {
1809 const int corner = FS_CORNER_MAP[w][ni];
1810 const int cn = corner_node[corner];
1811 const double nxu = normals_sh( cn, 0 );
1812 const double nyu = normals_sh( cn, 1 );
1813 const double nzu = normals_sh( cn, 2 );
1814 const int u = WEDGE_TO_UNIQUE[w][node_idx];
1815
1816 const double gx = dN_ref[node_idx][0];
1817 const double gy = dN_ref[node_idx][1];
1818 const double gz = dN_ref[node_idx][2];
1819
1820 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1821 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1822 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1823 const double gg_loc = g0 * g0 + g1 * g1 + g2 * g2;
1824
1825 dense::Vec< double, 3 > n_vec;
1826 n_vec( 0 ) = nxu;
1827 n_vec( 1 ) = nyu;
1828 n_vec( 2 ) = nzu;
1829 const auto R_rot = trafo_mat_cartesian_to_normal_tangential< double >( n_vec );
1830
1831 // Read original (unprojected) src directly from shared memory.
1832 // (Previously: projected src8 + u_n*n = original, now simplified.)
1833 const double s0 = src_sh( cn, 0, src_lvl );
1834 const double s1 = src_sh( cn, 1, src_lvl );
1835 const double s2 = src_sh( cn, 2, src_lvl );
1836
1837 const double Rg0 = R_rot( 0, 0 ) * g0 + R_rot( 0, 1 ) * g1 + R_rot( 0, 2 ) * g2;
1838 const double Rg1 = R_rot( 1, 0 ) * g0 + R_rot( 1, 1 ) * g1 + R_rot( 1, 2 ) * g2;
1839 const double Rg2 = R_rot( 2, 0 ) * g0 + R_rot( 2, 1 ) * g1 + R_rot( 2, 2 ) * g2;
1840 const double Rs0 = R_rot( 0, 0 ) * s0 + R_rot( 0, 1 ) * s1 + R_rot( 0, 2 ) * s2;
1841 const double Rs1 = R_rot( 1, 0 ) * s0 + R_rot( 1, 1 ) * s1 + R_rot( 1, 2 ) * s2;
1842 const double Rs2 = R_rot( 2, 0 ) * s0 + R_rot( 2, 1 ) * s1 + R_rot( 2, 2 ) * s2;
1843
1844 const double v0 = kwJ * ( gg_loc + ONE_THIRD * Rg0 * Rg0 ) * Rs0;
1845 const double v1 = kwJ * ( gg_loc + ONE_THIRD * Rg1 * Rg1 ) * Rs1;
1846 const double v2 = kwJ * ( gg_loc + ONE_THIRD * Rg2 * Rg2 ) * Rs2;
1847
1848 dst8[0][u] += R_rot( 0, 0 ) * v0 + R_rot( 1, 0 ) * v1 + R_rot( 2, 0 ) * v2;
1849 dst8[1][u] += R_rot( 0, 1 ) * v0 + R_rot( 1, 1 ) * v1 + R_rot( 2, 1 ) * v2;
1850 dst8[2][u] += R_rot( 0, 2 ) * v0 + R_rot( 1, 2 ) * v1 + R_rot( 2, 2 ) * v2;
1851 };
1852
1853 if ( cmb_freeslip )
1854 {
1855 for ( int ni = 0; ni < 3; ++ni )
1856 apply_rotated_diag( ni, ni, lvl0 );
1857 }
1858 if ( surf_freeslip )
1859 {
1860 for ( int ni = 0; ni < 3; ++ni )
1861 apply_rotated_diag( ni, ni + 3, lvl0 + 1 );
1862 }
1863 }
1864 }
1865 }
1866
1867 // Test-side projection for free-slip (P A P) — normals loaded from shared memory.
1868 if ( !Diagonal && cmb_freeslip )
1869 {
1870 for ( int c = 0; c < 4; ++c )
1871 {
1872 const double nx = normals_sh( corner_node[c], 0 );
1873 const double ny = normals_sh( corner_node[c], 1 );
1874 const double nz = normals_sh( corner_node[c], 2 );
1875 const int u = CMB_CORNER_TO_UNIQUE[c];
1876 const double dot = nx * dst8[0][u] + ny * dst8[1][u] + nz * dst8[2][u];
1877 dst8[0][u] -= dot * nx;
1878 dst8[1][u] -= dot * ny;
1879 dst8[2][u] -= dot * nz;
1880 }
1881 }
1882 if ( !Diagonal && surf_freeslip )
1883 {
1884 for ( int c = 0; c < 4; ++c )
1885 {
1886 const double nx = normals_sh( corner_node[c], 0 );
1887 const double ny = normals_sh( corner_node[c], 1 );
1888 const double nz = normals_sh( corner_node[c], 2 );
1889 const int u = SURF_CORNER_TO_UNIQUE[c];
1890 const double dot = nx * dst8[0][u] + ny * dst8[1][u] + nz * dst8[2][u];
1891 dst8[0][u] -= dot * nx;
1892 dst8[1][u] -= dot * ny;
1893 dst8[2][u] -= dot * nz;
1894 }
1895 }
1896
1897 // Add back normal correction: Ann_acc[c] * u_n[c] * n_c.
1898 // u_n recomputed from original (unprojected) src in shared memory.
1899 if ( !Diagonal && cmb_freeslip )
1900 {
1901 for ( int c = 0; c < 4; ++c )
1902 {
1903 const int cn = corner_node[c];
1904 const double nx = normals_sh( cn, 0 );
1905 const double ny = normals_sh( cn, 1 );
1906 const double nz = normals_sh( cn, 2 );
1907 const double os0 = src_sh( cn, 0, lvl0 );
1908 const double os1 = src_sh( cn, 1, lvl0 );
1909 const double os2 = src_sh( cn, 2, lvl0 );
1910 const double u_n_val = nx * os0 + ny * os1 + nz * os2;
1911 const double corr = Ann_acc_cmb[c] * u_n_val;
1912 const int u = CMB_CORNER_TO_UNIQUE[c];
1913 dst8[0][u] += corr * nx;
1914 dst8[1][u] += corr * ny;
1915 dst8[2][u] += corr * nz;
1916 }
1917 }
1918 if ( !Diagonal && surf_freeslip )
1919 {
1920 for ( int c = 0; c < 4; ++c )
1921 {
1922 const int cn = corner_node[c];
1923 const double nx = normals_sh( cn, 0 );
1924 const double ny = normals_sh( cn, 1 );
1925 const double nz = normals_sh( cn, 2 );
1926 const double os0 = src_sh( cn, 0, lvl0 + 1 );
1927 const double os1 = src_sh( cn, 1, lvl0 + 1 );
1928 const double os2 = src_sh( cn, 2, lvl0 + 1 );
1929 const double u_n_val = nx * os0 + ny * os1 + nz * os2;
1930 const double corr = Ann_acc_surf[c] * u_n_val;
1931 const int u = SURF_CORNER_TO_UNIQUE[c];
1932 dst8[0][u] += corr * nx;
1933 dst8[1][u] += corr * ny;
1934 dst8[2][u] += corr * nz;
1935 }
1936 }
1937
1938 // Scatter accumulated hex-cell contributions to global memory.
1939#pragma unroll
1940 for ( int dim_add = 0; dim_add < 3; ++dim_add )
1941 {
1942 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell, dim_add ), dst8[dim_add][0] );
1943 Kokkos::atomic_add(
1944 &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell, dim_add ), dst8[dim_add][1] );
1945 Kokkos::atomic_add(
1946 &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell, dim_add ), dst8[dim_add][2] );
1947 Kokkos::atomic_add(
1948 &dst_( local_subdomain_id, x_cell, y_cell, r_cell + 1, dim_add ), dst8[dim_add][3] );
1949 Kokkos::atomic_add(
1950 &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1, dim_add ), dst8[dim_add][4] );
1951 Kokkos::atomic_add(
1952 &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][5] );
1953 Kokkos::atomic_add(
1954 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell, dim_add ), dst8[dim_add][6] );
1955 Kokkos::atomic_add(
1956 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][7] );
1957 }
1958
1959 } // end r_passes loop
1960 }
1961
1962 public:
1963 /**
1964 * @brief Legacy generic team operator.
1965 *
1966 * Kept for compatibility/debugging, but no longer used by apply_impl().
1967 * The host now dispatches directly to path-specific kernels.
1968 *
1969 * This function still works, but it reintroduces a branch on `kernel_path_`
1970 * and should therefore be avoided in performance-critical use.
1971 */
1972 KOKKOS_INLINE_FUNCTION
1973 void operator()( const Team& team ) const
1974 {
1975 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
1976 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
1977
1978 if ( tr >= r_tile_ )
1979 return;
1980
1981 if ( kernel_path_ == KernelPath::Slow )
1982 {
1983 for ( int pass = 0; pass < r_passes_; ++pass )
1984 {
1985 const int r_cell_pass = r0 + pass * r_tile_ + tr;
1986 if ( r_cell_pass >= hex_rad_ )
1987 break;
1988 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell_pass, CMB );
1989 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell_pass + 1, SURFACE );
1990 operator_slow_path(
1991 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell_pass, at_cmb, at_surface );
1992 }
1993 }
1994 else if ( kernel_path_ == KernelPath::FastFreeslip )
1995 {
1996 if ( diagonal_ )
1997 operator_fast_freeslip_path< true >( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
1998 else
1999 operator_fast_freeslip_path< false >(
2000 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
2001 }
2002 else
2003 {
2004 if ( diagonal_ )
2005 operator_fast_dirichlet_neumann_path< true >(
2006 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
2007 else
2008 operator_fast_dirichlet_neumann_path< false >(
2009 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
2010 }
2011 }
2012
2013 KOKKOS_INLINE_FUNCTION void assemble_trial_test_vecs(
2014 const int wedge,
2015 const dense::Vec< ScalarType, VecDim >& quad_point,
2016 const ScalarType quad_weight,
2017 const ScalarT r_1,
2018 const ScalarT r_2,
2019 dense::Vec< ScalarT, 3 > ( *wedge_phy_surf )[3],
2020 const dense::Vec< ScalarT, 6 >* k_local_hex,
2021 const int dimi,
2022 const int dimj,
2025 ScalarType& jdet_keval_quadweight ) const
2026 {
2027 dense::Mat< ScalarType, VecDim, VecDim > J = jac( wedge_phy_surf[wedge], r_1, r_2, quad_point );
2028 const auto det = J.det();
2029 const auto abs_det = Kokkos::abs( det );
2030 const dense::Mat< ScalarType, VecDim, VecDim > J_inv_transposed = J.inv_transposed( det );
2031
2032 ScalarType k_eval = 0.0;
2033 for ( int k = 0; k < num_nodes_per_wedge; k++ )
2034 {
2035 k_eval += shape( k, quad_point ) * k_local_hex[wedge]( k );
2036 }
2037
2038 for ( int k = 0; k < num_nodes_per_wedge; k++ )
2039 {
2040 sym_grad_i[k] = symmetric_grad( J_inv_transposed, quad_point, k, dimi );
2041 sym_grad_j[k] = symmetric_grad( J_inv_transposed, quad_point, k, dimj );
2042 }
2043
2044 jdet_keval_quadweight = quad_weight * k_eval * abs_det;
2045 }
2046
2047 /**
2048 * @brief Assemble one wedge-local 18x18 matrix (slow path / on-demand assembly).
2049 */
2050 KOKKOS_INLINE_FUNCTION
2052 const int local_subdomain_id,
2053 const int x_cell,
2054 const int y_cell,
2055 const int r_cell,
2056 const int wedge ) const
2057 {
2059 wedge_surface_physical_coords( wedge_phy_surf, grid_, local_subdomain_id, x_cell, y_cell );
2060
2061 const ScalarT r_1 = radii_( local_subdomain_id, r_cell );
2062 const ScalarT r_2 = radii_( local_subdomain_id, r_cell + 1 );
2063
2065 extract_local_wedge_scalar_coefficients( k_local_hex, local_subdomain_id, x_cell, y_cell, r_cell, k_ );
2066
2068
2069 for ( int dimi = 0; dimi < 3; ++dimi )
2070 {
2071 for ( int dimj = 0; dimj < 3; ++dimj )
2072 {
2073 for ( int q = 0; q < num_quad_points; q++ )
2074 {
2077 ScalarType jdet_keval_quadweight = 0;
2078
2080 wedge,
2081 quad_points[q],
2082 quad_weights[q],
2083 r_1,
2084 r_2,
2085 wedge_phy_surf,
2086 k_local_hex,
2087 dimi,
2088 dimj,
2089 sym_grad_i,
2090 sym_grad_j,
2091 jdet_keval_quadweight );
2092
2093 for ( int i = 0; i < num_nodes_per_wedge; i++ )
2094 {
2095 for ( int j = 0; j < num_nodes_per_wedge; j++ )
2096 {
2097 A( i + dimi * num_nodes_per_wedge, j + dimj * num_nodes_per_wedge ) +=
2098 jdet_keval_quadweight *
2099 ( 2 * sym_grad_j[j].double_contract( sym_grad_i[i] ) -
2100 2.0 / 3.0 * sym_grad_j[j]( dimj, dimj ) * sym_grad_i[i]( dimi, dimi ) );
2101 }
2102 }
2103 }
2104 }
2105 }
2106
2107 return A;
2108 }
2109};
2110
2113
2114} // namespace terra::fe::wedge::operators::shell
Definition communication_plan.hpp:33
Send and receive buffers for all process-local subdomain boundaries.
Definition communication.hpp:56
Matrix-free / matrix-based epsilon-div-div operator on wedge elements in a spherical shell.
Definition epsilon_divdiv_kerngen.hpp:63
const grid::shell::DistributedDomain & get_domain() const
Definition epsilon_divdiv_kerngen.hpp:430
void set_boundary_conditions(BoundaryConditions bcs)
Definition epsilon_divdiv_kerngen.hpp:420
linalg::OperatorStoredMatrixMode get_stored_matrix_mode()
Definition epsilon_divdiv_kerngen.hpp:466
void set_local_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge, const dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > &mat) const
Definition epsilon_divdiv_kerngen.hpp:469
void column_grad_to_sym(const int dim, const double g0, const double g1, const double g2, double &E00, double &E11, double &E22, double &sym01, double &sym02, double &sym12, double &gdd) const
Convert one gradient column into symmetric-gradient and div contributions.
Definition epsilon_divdiv_kerngen.hpp:627
dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > assemble_local_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
Assemble one wedge-local 18x18 matrix (slow path / on-demand assembly).
Definition epsilon_divdiv_kerngen.hpp:2051
void apply_impl(const SrcVectorType &src, DstVectorType &dst)
Apply the operator: dst <- Op(src) (or additive, depending on mode).
Definition epsilon_divdiv_kerngen.hpp:513
void init_penalty()
Initialize penalty for fs/fs null-space regularization.
Definition epsilon_divdiv_kerngen.hpp:287
grid::Grid2DDataScalar< ScalarT > get_radii() const
Definition epsilon_divdiv_kerngen.hpp:431
ScalarT ScalarType
Definition epsilon_divdiv_kerngen.hpp:67
void set_operator_apply_and_communication_modes(const linalg::OperatorApplyMode operator_apply_mode, const linalg::OperatorCommunicationMode operator_communication_mode)
Definition epsilon_divdiv_kerngen.hpp:410
void set_diagonal(bool v)
Definition epsilon_divdiv_kerngen.hpp:418
terra::grid::Grid4DDataMatrices< ScalarType, LocalMatrixDim, LocalMatrixDim, 2 > Grid4DDataLocalMatrices
Definition epsilon_divdiv_kerngen.hpp:69
void assemble_trial_test_vecs(const int wedge, const dense::Vec< ScalarType, VecDim > &quad_point, const ScalarType quad_weight, const ScalarT r_1, const ScalarT r_2, dense::Vec< ScalarT, 3 >(*wedge_phy_surf)[3], const dense::Vec< ScalarT, 6 > *k_local_hex, const int dimi, const int dimj, dense::Mat< ScalarType, VecDim, VecDim > *sym_grad_i, dense::Mat< ScalarType, VecDim, VecDim > *sym_grad_j, ScalarType &jdet_keval_quadweight) const
Definition epsilon_divdiv_kerngen.hpp:2013
Kokkos::TeamPolicy<>::member_type Team
Definition epsilon_divdiv_kerngen.hpp:71
size_t team_shmem_size_dn(const int) const
Definition epsilon_divdiv_kerngen.hpp:686
static constexpr int LocalMatrixDim
Definition epsilon_divdiv_kerngen.hpp:68
void set_penalty_epsilon(ScalarT eps)
Definition epsilon_divdiv_kerngen.hpp:278
dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > get_local_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
Definition epsilon_divdiv_kerngen.hpp:482
void set_stored_matrix_mode(linalg::OperatorStoredMatrixMode operator_stored_matrix_mode, int level_range, grid::Grid4DDataScalar< ScalarType > GCAElements)
Definition epsilon_divdiv_kerngen.hpp:443
size_t team_shmem_size(const int ts) const
Team scratch memory size for fast paths.
Definition epsilon_divdiv_kerngen.hpp:672
bool has_flag(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, grid::shell::ShellBoundaryFlag flag) const
Definition epsilon_divdiv_kerngen.hpp:435
const grid::Grid4DDataScalar< ScalarType > & k_grid_data()
Definition epsilon_divdiv_kerngen.hpp:429
grid::Grid3DDataVec< ScalarT, 3 > get_grid()
Definition epsilon_divdiv_kerngen.hpp:432
EpsilonDivDivKerngen(const grid::shell::DistributedDomain &domain, const grid::Grid3DDataVec< ScalarT, 3 > &grid, const grid::Grid2DDataScalar< ScalarT > &radii, const grid::Grid4DDataScalar< grid::shell::ShellBoundaryFlag > &mask, const grid::Grid4DDataScalar< ScalarT > &k, BoundaryConditions bcs, bool diagonal, linalg::OperatorApplyMode operator_apply_mode=linalg::OperatorApplyMode::Replace, linalg::OperatorCommunicationMode operator_communication_mode=linalg::OperatorCommunicationMode::CommunicateAdditively, linalg::OperatorStoredMatrixMode operator_stored_matrix_mode=linalg::OperatorStoredMatrixMode::Off)
Definition epsilon_divdiv_kerngen.hpp:175
void operator()(const Team &team) const
Legacy generic team operator.
Definition epsilon_divdiv_kerngen.hpp:1973
Parallel data structure organizing the thick spherical shell metadata for distributed (MPI parallel) ...
Definition spherical_shell.hpp:2518
const std::map< SubdomainInfo, std::tuple< LocalSubdomainIdx, SubdomainNeighborhood > > & subdomains() const
Definition spherical_shell.hpp:2650
const DomainInfo & domain_info() const
Returns a const reference.
Definition spherical_shell.hpp:2647
Information about the thick spherical shell mesh.
Definition spherical_shell.hpp:780
const std::vector< double > & radii() const
Definition spherical_shell.hpp:845
int diamond_lateral_refinement_level() const
Definition spherical_shell.hpp:843
int subdomain_num_nodes_radially() const
Equivalent to calling subdomain_num_nodes_radially( subdomain_refinement_level() )
Definition spherical_shell.hpp:861
int subdomain_num_nodes_per_side_laterally() const
Equivalent to calling subdomain_num_nodes_per_side_laterally( subdomain_refinement_level() )
Definition spherical_shell.hpp:852
Static assertion: VectorQ1Scalar satisfies VectorLike concept.
Definition vector_q1.hpp:168
const grid::Grid4DDataVec< ScalarType, VecDim > & grid_data() const
Get const reference to grid data.
Definition vector_q1.hpp:288
bool has_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
Checks for presence of a local matrix for a certain element.
Definition local_matrix_storage.hpp:223
dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > get_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
Retrives the local matrix if there is stored local matrices, the desired local matrix is loaded and r...
Definition local_matrix_storage.hpp:175
void set_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge, dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > mat) const
Set the local matrix stored in the operator.
Definition local_matrix_storage.hpp:118
Timer supporting RAII scope or manual stop.
Definition timer.hpp:342
void stop()
Stop the timer and record elapsed time.
Definition timer.hpp:364
Concept for types that can be used as Galerkin coarse-grid operators in a multigrid hierarchy....
Definition operator.hpp:81
k_eval
Definition EpsilonDivDiv_kernel_gen.py:175
r_cell
Definition EpsilonDivDiv_kernel_gen.py:23
g2
Definition EpsilonDivDiv_kernel_gen.py:267
div_u
Definition EpsilonDivDiv_kernel_gen.py:359
cmb_shift
Definition EpsilonDivDiv_kernel_gen.py:98
g1
Definition EpsilonDivDiv_kernel_gen.py:266
J_det
Definition EpsilonDivDiv_kernel_gen.py:216
surface_shift
Definition EpsilonDivDiv_kernel_gen.py:98
constexpr int idx(const int loop_idx, const int size, const grid::BoundaryPosition position, const grid::BoundaryDirection direction)
Definition buffer_copy_kernels.hpp:73
void send_recv_with_plan(const ShellBoundaryCommPlan< GridDataType > &plan, const GridDataType &data, SubdomainNeighborhoodSendRecvBuffer< typename GridDataType::value_type, grid::grid_data_vec_dim< GridDataType >() > &recv_buffers, CommunicationReduction reduction=CommunicationReduction::SUM)
Definition communication_plan.hpp:652
Definition boundary_mass.hpp:14
constexpr void quad_felippa_1x1_quad_points(dense::Vec< T, 3 >(&quad_points)[quad_felippa_1x1_num_quad_points])
Definition wedge/quadrature/quadrature.hpp:36
constexpr void quad_felippa_1x1_quad_weights(T(&quad_weights)[quad_felippa_1x1_num_quad_points])
Definition wedge/quadrature/quadrature.hpp:43
constexpr int quad_felippa_1x1_num_quad_points
Definition wedge/quadrature/quadrature.hpp:32
constexpr int num_nodes_per_wedge_surface
Definition kernel_helpers.hpp:6
constexpr dense::Mat< T, 3, 3 > symmetric_grad(const dense::Mat< T, 3, 3 > &J_inv_transposed, const dense::Vec< T, 3 > &quad_point, const int dof, const int dim)
Returns the symmetric gradient of the shape function of a dof at a quadrature point.
Definition integrands.hpp:685
void wedge_surface_physical_coords(dense::Vec< T, 3 >(&wedge_surf_phy_coords)[num_wedges_per_hex_cell][num_nodes_per_wedge_surface], const grid::Grid3DDataVec< T, 3 > &lateral_grid, const int local_subdomain_id, const int x_cell, const int y_cell)
Extracts the (unit sphere) surface vertex coords of the two wedges of a hex cell.
Definition kernel_helpers.hpp:26
constexpr void reorder_local_dofs(const DoFOrdering doo_from, const DoFOrdering doo_to, dense::Vec< ScalarT, 18 > &dofs)
Definition kernel_helpers.hpp:619
void atomically_add_local_wedge_vector_coefficients(const grid::Grid4DDataVec< T, VecDim > &global_coefficients, const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int d, const dense::Vec< T, 6 > local_coefficients[2])
Performs an atomic add of the two local wedge coefficient vectors of a hex cell into the global coeff...
Definition kernel_helpers.hpp:465
constexpr int num_wedges_per_hex_cell
Definition kernel_helpers.hpp:5
void extract_local_wedge_scalar_coefficients(dense::Vec< T, 6 >(&local_coefficients)[2], const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const grid::Grid4DDataScalar< T > &global_coefficients)
Extracts the local vector coefficients for the two wedges of a hex cell from the global coefficient v...
Definition kernel_helpers.hpp:306
void extract_local_wedge_vector_coefficients(dense::Vec< T, 6 >(&local_coefficients)[2], const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int d, const grid::Grid4DDataVec< T, VecDim > &global_coefficients)
Extracts the local vector coefficients for the two wedges of a hex cell from the global coefficient v...
Definition kernel_helpers.hpp:356
constexpr int num_nodes_per_wedge
Definition kernel_helpers.hpp:7
constexpr T shape(const int node_idx, const T xi, const T eta, const T zeta)
(Tensor-product) Shape function.
Definition integrands.hpp:146
constexpr dense::Mat< T, 3, 3 > jac(const dense::Vec< T, 3 > &p1_phy, const dense::Vec< T, 3 > &p2_phy, const dense::Vec< T, 3 > &p3_phy, const T r_1, const T r_2, const T xi, const T eta, const T zeta)
Definition integrands.hpp:657
dense::Vec< typename CoordsShellType::value_type, 3 > coords(const int subdomain, const int x, const int y, const int r, const CoordsShellType &coords_shell, const CoordsRadiiType &coords_radii)
Definition spherical_shell.hpp:2871
BoundaryConditionMapping[2] BoundaryConditions
Definition shell/bit_masks.hpp:37
ShellBoundaryFlag
FlagLike that indicates boundary types for the thick spherical shell.
Definition shell/bit_masks.hpp:12
BoundaryConditionFlag get_boundary_condition_flag(const BoundaryConditions bcs, ShellBoundaryFlag sbf)
Retrieve the boundary condition flag that is associated with a location in the shell e....
Definition shell/bit_masks.hpp:42
BoundaryConditionFlag
FlagLike that indicates the type of boundary condition
Definition shell/bit_masks.hpp:25
Kokkos::View< dense::Mat< ScalarType, Rows, Cols > ****[NumMatrices], Layout > Grid4DDataMatrices
Definition grid_types.hpp:173
Kokkos::View< ScalarType ***[VecDim], Layout > Grid3DDataVec
Definition grid_types.hpp:42
Grid4DDataScalar< NodeOwnershipFlag > setup_node_ownership_mask_data(const shell::DistributedDomain &domain)
Set up mask data for a distributed shell domain. The mask encodes ownership information for each grid...
Definition bit_masks.hpp:29
Kokkos::View< ScalarType ****, Layout > Grid4DDataScalar
Definition grid_types.hpp:27
Kokkos::View< ScalarType **, Layout > Grid2DDataScalar
Definition grid_types.hpp:21
ScalarType masked_dot_product(const grid::Grid4DDataScalar< ScalarType > &x, const grid::Grid4DDataScalar< ScalarType > &y, const grid::Grid4DDataScalar< FlagType > &mask, const FlagType &mask_value, MPI_Comm comm=MPI_COMM_WORLD)
Definition grid_operations.hpp:631
dense::Mat< ScalarType, 3, 3 > trafo_mat_cartesian_to_normal_tangential(const dense::Vec< ScalarType, 3 > &n_input)
Constructs a robust orthonormal transformation matrix from Cartesian to (normal–tangential–tangential...
Definition local_basis_trafo_normal_tangential.hpp:36
ScalarOf< Vector > dot(const Vector &y, const Vector &x)
Compute the dot product of two vectors. Implements: .
Definition vector.hpp:118
OperatorApplyMode
Modes for applying an operator to a vector.
Definition operator.hpp:30
@ Replace
Overwrite the destination vector.
OperatorStoredMatrixMode
Modes for applying stored matrices.
Definition operator.hpp:47
@ Selective
Use stored matrices on selected, marked elements only, assemble on all others.
@ Full
Use stored matrices on all elements.
@ Off
Do not use stored matrices.
OperatorCommunicationMode
Modes for communication during operator application.
Definition operator.hpp:40
@ CommunicateAdditively
Communicate and add results.
constexpr bool has_flag(E mask_value, E flag) noexcept
Checks if a bitmask value contains a specific flag.
Definition bit_masking.hpp:43
detail::PrefixCout logroot([]() { return detail::log_prefix();})
std::ostream subclass that just logs on root and adds a timestamp for each line.
Definition mat.hpp:10
T double_contract(const Mat &mat)
Definition mat.hpp:226
SoA (Structure-of-Arrays) 4D vector grid data.
Definition grid_types.hpp:51