epsilon_divdiv_kerngen_v10_seq_rpasses.hpp
1#pragma once
2
3#include "../../../quadrature/quadrature.hpp"
6#include "dense/vec.hpp"
10#include "impl/Kokkos_Profiling.hpp"
11#include "linalg/operator.hpp"
14#include "linalg/vector.hpp"
15#include "linalg/vector_q1.hpp"
16#include "util/timer.hpp"
17
19
30
31/**
32 * @brief Matrix-free / matrix-based epsilon-div-div operator on wedge elements in a spherical shell.
33 *
34 * This class supports three execution paths:
35 *
36 * 1) Slow path (local matrix path)
37 * - Used if local matrices are stored (full or selective)
38 * - Reuses assembled/stored 18x18 local matrices
39 *
40 * 2) Fast path: Dirichlet/Neumann
41 * - Matrix-free
 42 * - Handles Dirichlet by skipping constrained face-node couplings; constrained rows receive a diagonal-only contribution
43 * - Neumann is naturally included
44 *
45 * 3) Fast path: Free-slip
46 * - Matrix-free
 47 * - Applies trial- and test-side tangential projections on free-slip boundaries
48 * - Preserves behavior consistent with the slow path for iterative solves
49 *
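 * A sketch of the bilinear form the kernels below realize (read off the fast-path
 * accumulation; k denotes the scalar coefficient field k_):
 *
 *   a(u,v) = \int_Omega k * ( 2 * eps(u):eps(v) - (2/3) * (div u)(div v) ) dx,
 *   with eps(u) = 0.5 * ( grad u + (grad u)^T ).
 *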
50 * IMPORTANT DESIGN CHANGE:
51 * ------------------------
52 * The path decision is made on the HOST and cached in `kernel_path_`.
53 * `apply_impl()` dispatches to a different kernel launch per path.
54 * This avoids a runtime branch inside the hot device kernel.
55 */
56template < typename ScalarT, int VecDim = 3 >
58{
59 public:
62 using ScalarType = ScalarT;
63 static constexpr int LocalMatrixDim = 18;
66 using Team = Kokkos::TeamPolicy<>::member_type;
67
68 private:
69 // Optional element-local matrix storage (GCA/coarsening/explicit local-matrix mode)
70 LocalMatrixStorage local_matrix_storage_;
71
72 // Domain and geometry / coefficients
74 grid::Grid3DDataVec< ScalarT, 3 > grid_; ///< Lateral shell geometry (unit sphere coords)
75 grid::Grid2DDataScalar< ScalarT > radii_; ///< Radial coordinates per local subdomain
76 grid::Grid4DDataScalar< ScalarType > k_; ///< Scalar coefficient field
77 grid::Grid4DDataScalar< grid::shell::ShellBoundaryFlag > mask_; ///< Boundary flags per cell/node
78 BoundaryConditions bcs_; ///< CMB and SURFACE boundary conditions
79
80 bool diagonal_; ///< If true, apply diagonal-only action
81
82 linalg::OperatorApplyMode operator_apply_mode_;
83 linalg::OperatorCommunicationMode operator_communication_mode_;
84 linalg::OperatorStoredMatrixMode operator_stored_matrix_mode_;
85
88
89 // Device views captured by kernels
92
93 // Quadrature (Felippa 1x1 on wedge)
94 const int num_quad_points = quadrature::quad_felippa_1x1_num_quad_points;
97
98 // Local subdomain extents (in cells)
99 int local_subdomains_;
100 int hex_lat_;
101 int hex_rad_;
102 int lat_refinement_level_;
103
 104 // Team-policy tiling (one team handles a slab of lat_tile x lat_tile x r_tile cells per radial pass)
105 int lat_tile_;
106 int r_tile_;
107 int r_passes_;
108 int r_tile_block_;
109 int lat_tiles_;
110 int r_tiles_;
111 int team_size_;
112 int blocks_;
113
114 ScalarT r_max_;
115 ScalarT r_min_;
116
117 /**
118 * @brief Kernel path selected on host.
119 *
120 * This value is computed whenever BCs or stored-matrix mode change and is then
121 * used by `apply_impl()` to select which kernel launch to issue.
122 */
123 enum class KernelPath
124 {
125 Slow,
126 FastDirichletNeumann,
127 FastFreeslip,
128 };
129 KernelPath kernel_path_ = KernelPath::FastDirichletNeumann;
130
131 private:
132 /**
133 * @brief Recompute the kernel path on host.
134 *
135 * Rules:
136 * - Any stored local matrix mode => slow path
137 * - Else if any free-slip BC on CMB or SURFACE => fast free-slip path
138 * - Else => fast Dirichlet/Neumann path
139 */
140 void update_kernel_path_flag_host_only()
141 {
142 const BoundaryConditionFlag cmb_bc = get_boundary_condition_flag( bcs_, CMB );
143 const BoundaryConditionFlag surface_bc = get_boundary_condition_flag( bcs_, SURFACE );
144
145 const bool has_freeslip = ( cmb_bc == FREESLIP ) || ( surface_bc == FREESLIP );
146 const bool has_stored_matrices = ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off );
147
148 if ( has_stored_matrices )
149 kernel_path_ = KernelPath::Slow;
150 else if ( has_freeslip )
151 kernel_path_ = KernelPath::FastFreeslip;
152 else
153 kernel_path_ = KernelPath::FastDirichletNeumann;
154 }
155
156 public:
158 const grid::shell::DistributedDomain& domain,
163 BoundaryConditions bcs,
164 bool diagonal,
166 linalg::OperatorCommunicationMode operator_communication_mode =
169 : domain_( domain )
170 , grid_( grid )
171 , radii_( radii )
172 , k_( k )
173 , mask_( mask )
174 , diagonal_( diagonal )
175 , operator_apply_mode_( operator_apply_mode )
176 , operator_communication_mode_( operator_communication_mode )
177 , operator_stored_matrix_mode_( operator_stored_matrix_mode )
178 , recv_buffers_( domain )
179 , comm_plan_( domain )
180 {
181 bcs_[0] = bcs[0];
182 bcs_[1] = bcs[1];
183
186
187 const grid::shell::DomainInfo& domain_info = domain_.domain_info();
188 local_subdomains_ = domain_.subdomains().size();
189 hex_lat_ = domain_info.subdomain_num_nodes_per_side_laterally() - 1;
190 hex_rad_ = domain_info.subdomain_num_nodes_radially() - 1;
191 lat_refinement_level_ = domain_info.diamond_lateral_refinement_level();
192
193 lat_tile_ = 4;
194 r_tile_ = 8;
195 r_passes_ = 2;
196 r_tile_block_ = r_tile_ * r_passes_;
197
198 lat_tiles_ = ( hex_lat_ + lat_tile_ - 1 ) / lat_tile_;
199 r_tiles_ = ( hex_rad_ + r_tile_block_ - 1 ) / r_tile_block_;
200
201 team_size_ = lat_tile_ * lat_tile_ * r_tile_;
202 blocks_ = local_subdomains_ * lat_tiles_ * lat_tiles_ * r_tiles_;
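        // Worked example (illustrative numbers, not taken from this file): with the defaults above
        // (lat_tile_ = 4, r_tile_ = 8, r_passes_ = 2 => r_tile_block_ = 16, team_size_ = 128) and a
        // hypothetical 32x32x32-cell subdomain, lat_tiles_ = (32 + 3) / 4 = 8,
        // r_tiles_ = (32 + 15) / 16 = 2, and blocks_ = 8 * 8 * 2 = 128 per local subdomain.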
203
204 r_min_ = domain_info.radii()[0];
205 r_max_ = domain_info.radii()[domain_info.radii().size() - 1];
206
207 // Host-side path selection (no in-kernel path branching)
208 update_kernel_path_flag_host_only();
209
210 util::logroot << "[EpsilonDivDiv] tile size (x,y,r)=(" << lat_tile_ << "," << lat_tile_ << "," << r_tile_
211 << "), r_passes=" << r_passes_ << std::endl;
212 util::logroot << "[EpsilonDivDiv] number of tiles (x,y,r)=(" << lat_tiles_ << "," << lat_tiles_ << ","
213 << r_tiles_ << "), team_size=" << team_size_ << ", blocks=" << blocks_ << std::endl;
214 const char* path_name = ( kernel_path_ == KernelPath::Slow ) ? "slow" :
215 ( kernel_path_ == KernelPath::FastFreeslip ) ? "fast-freeslip" :
216 "fast-dirichlet-neumann";
217 util::logroot << "[EpsilonDivDiv] kernel path = " << path_name << std::endl;
218 }
219
221 const linalg::OperatorApplyMode operator_apply_mode,
222 const linalg::OperatorCommunicationMode operator_communication_mode )
223 {
224 operator_apply_mode_ = operator_apply_mode;
225 operator_communication_mode_ = operator_communication_mode;
226 }
227
228 void set_diagonal( bool v ) { diagonal_ = v; }
229
230 void set_boundary_conditions( BoundaryConditions bcs )
231 {
232 bcs_[0] = bcs[0];
233 bcs_[1] = bcs[1];
234
235 // Recompute host-side dispatch mode whenever BCs change
236 update_kernel_path_flag_host_only();
237 }
238
240 const grid::shell::DistributedDomain& get_domain() const { return domain_; }
243
244 KOKKOS_INLINE_FUNCTION
246 const int local_subdomain_id,
247 const int x_cell,
248 const int y_cell,
249 const int r_cell,
251 {
252 return util::has_flag( mask_( local_subdomain_id, x_cell, y_cell, r_cell ), flag );
253 }
254
256 linalg::OperatorStoredMatrixMode operator_stored_matrix_mode,
257 int level_range,
259 {
260 operator_stored_matrix_mode_ = operator_stored_matrix_mode;
261
262 if ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off )
263 {
265 domain_, operator_stored_matrix_mode_, level_range, GCAElements );
266 }
267
268 // Recompute host-side dispatch mode whenever storage mode changes
269 update_kernel_path_flag_host_only();
270
271 util::logroot << "[EpsilonDivDiv] (set_stored_matrix_mode) kernel path = "
272 << (( kernel_path_ == KernelPath::Slow ) ? "slow" :
273 ( kernel_path_ == KernelPath::FastFreeslip ) ? "fast-freeslip" :
274 "fast-dirichlet-neumann")
275 << std::endl;
276 }
277
278 linalg::OperatorStoredMatrixMode get_stored_matrix_mode() { return operator_stored_matrix_mode_; }
279
280 KOKKOS_INLINE_FUNCTION
282 const int local_subdomain_id,
283 const int x_cell,
284 const int y_cell,
285 const int r_cell,
286 const int wedge,
288 {
289 KOKKOS_ASSERT( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off );
290 local_matrix_storage_.set_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge, mat );
291 }
292
293 KOKKOS_INLINE_FUNCTION
295 const int local_subdomain_id,
296 const int x_cell,
297 const int y_cell,
298 const int r_cell,
299 const int wedge ) const
300 {
301 if ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off )
302 {
303 if ( !local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge ) )
304 {
305 Kokkos::abort( "No matrix found at that spatial index." );
306 }
307 return local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge );
308 }
309 return assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge );
310 }
311
312 /**
313 * @brief Apply the operator: dst <- Op(src) (or additive, depending on mode).
314 *
315 * Host-side responsibilities:
316 * - Handle replace/add mode initialization
317 * - Cache src/dst views for kernel capture
318 * - Build team policy
319 * - Dispatch to exactly one kernel variant based on `kernel_path_`
320 * - Optional halo communication accumulation
321 *
 322 * The path dispatch happens here on the host, so device code does not branch
 323 * on `kernel_path_` in the hot path.
324 */
325 void apply_impl( const SrcVectorType& src, DstVectorType& dst )
326 {
327 util::Timer timer_apply( "epsilon_divdiv_apply" );
328
329 if ( operator_apply_mode_ == linalg::OperatorApplyMode::Replace )
330 {
331 assign( dst, 0 );
332 }
333
334 dst_ = dst.grid_data();
335 src_ = src.grid_data();
336
337 util::Timer timer_kernel( "epsilon_divdiv_kernel" );
338 Kokkos::TeamPolicy<> policy( blocks_, team_size_ );
339
340 // Fast paths use shared-memory slab staging
341 if ( kernel_path_ != KernelPath::Slow )
342 {
343 policy.set_scratch_size( 0, Kokkos::PerTeam( team_shmem_size( team_size_ ) ) );
344 }
345
346 // Host-side dispatch => no per-team path branching in device code
347 if ( kernel_path_ == KernelPath::Slow )
348 {
349 Kokkos::parallel_for(
350 "epsilon_divdiv_apply_kernel_slow", policy, KOKKOS_CLASS_LAMBDA( const Team& team ) {
351 this->run_team_slow( team );
352 } );
353 }
354 else if ( kernel_path_ == KernelPath::FastFreeslip )
355 {
356 if ( diagonal_ )
357 {
358 Kokkos::parallel_for(
359 "epsilon_divdiv_apply_kernel_fast_fs_diag",
360 policy,
361 KOKKOS_CLASS_LAMBDA( const Team& team ) {
362 this->template run_team_fast_freeslip< true >( team );
363 } );
364 }
365 else
366 {
367 Kokkos::parallel_for(
368 "epsilon_divdiv_apply_kernel_fast_fs_matvec",
369 policy,
370 KOKKOS_CLASS_LAMBDA( const Team& team ) {
371 this->template run_team_fast_freeslip< false >( team );
372 } );
373 }
374 }
375 else
376 {
377 Kokkos::TeamPolicy< Kokkos::LaunchBounds< 128, 5 > > dn_policy( blocks_, team_size_ );
378 dn_policy.set_scratch_size( 0, Kokkos::PerTeam( team_shmem_size_dn( team_size_ ) ) );
379 if ( diagonal_ )
380 {
381 Kokkos::parallel_for(
382 "epsilon_divdiv_apply_kernel_fast_dn_diag",
383 dn_policy,
384 KOKKOS_CLASS_LAMBDA( const Team& team ) {
385 this->template run_team_fast_dirichlet_neumann< true >( team );
386 } );
387 }
388 else
389 {
390 Kokkos::parallel_for(
391 "epsilon_divdiv_apply_kernel_fast_dn_matvec",
392 dn_policy,
393 KOKKOS_CLASS_LAMBDA( const Team& team ) {
394 this->template run_team_fast_dirichlet_neumann< false >( team );
395 } );
396 }
397 }
398
399 Kokkos::fence();
400 timer_kernel.stop();
401
402 if ( operator_communication_mode_ == linalg::OperatorCommunicationMode::CommunicateAdditively )
403 {
404 util::Timer timer_comm( "epsilon_divdiv_comm" );
405 terra::communication::shell::send_recv_with_plan( comm_plan_, dst_, recv_buffers_ );
406 }
407 }
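    // Hedged usage sketch (variable names are placeholders; the calling solver code is not part
    // of this file):
    //   op.set_boundary_conditions( bcs ); // re-selects the kernel path on the host
    //   op.set_diagonal( false );
    //   op.apply_impl( src, dst );         // issues exactly one path-specific kernel launch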
408
409 /**
410 * @brief Convert one gradient column into symmetric-gradient and div contributions.
411 *
412 * Given grad(phi e_dim), this helper computes:
413 * - diagonal entries (E00,E11,E22)
414 * - off-diagonal symmetric entries (sym01,sym02,sym12)
415 * - divergence contribution `gdd`
416 *
417 * The fast kernels use this repeatedly in fused operator application.
418 */
419 KOKKOS_INLINE_FUNCTION
421 const int dim,
422 const double g0,
423 const double g1,
424 const double g2,
425 double& E00,
426 double& E11,
427 double& E22,
428 double& sym01,
429 double& sym02,
430 double& sym12,
431 double& gdd ) const
432 {
433 E00 = E11 = E22 = sym01 = sym02 = sym12 = gdd = 0.0;
434
435 switch ( dim )
436 {
437 case 0:
438 E00 = g0;
439 gdd = g0;
440 sym01 = 0.5 * g1;
441 sym02 = 0.5 * g2;
442 break;
443 case 1:
444 E11 = g1;
445 gdd = g1;
446 sym01 = 0.5 * g0;
447 sym12 = 0.5 * g2;
448 break;
449 default:
450 E22 = g2;
451 gdd = g2;
452 sym02 = 0.5 * g0;
453 sym12 = 0.5 * g1;
454 break;
455 }
456 }
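    // Worked example (follows from the switch above): for dim == 0 and gradient g = (g0, g1, g2),
    //   eps( phi * e_0 ) = [ g0    g1/2  g2/2 ]
    //                      [ g1/2  0     0    ]
    //                      [ g2/2  0     0    ],
    // so E00 = g0, sym01 = 0.5 * g1, sym02 = 0.5 * g2, and the divergence contribution gdd = g0.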
457
458 /**
 459 * @brief Team scratch memory size for the fast free-slip path.
 460 *
 461 * Layout per team:
 462 * [coords_sh | normals_sh | src_sh | k_sh | r_sh]
463 */
464 KOKKOS_INLINE_FUNCTION
465 size_t team_shmem_size( const int ts ) const
466 {
467 const int nlev = r_tile_block_ + 1;
468 const int n = lat_tile_ + 1;
469 const int nxy = n * n;
470
471 // coords_sh(nxy,3) + normals_sh(nxy,3) + src_sh(nxy,3,nlev) + k_sh(nxy,nlev) + r_sh(nlev) + 1
472 const size_t nscalars =
473 size_t( nxy ) * 3 + size_t( nxy ) * 3 + size_t( nxy ) * 3 * nlev + size_t( nxy ) * nlev + size_t( nlev ) + 1;
474
475 return sizeof( ScalarType ) * nscalars;
476 }
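    // Worked example (assuming ScalarType is double, as the fast kernels do): with the defaults
    // lat_tile_ = 4 and r_tile_block_ = 16 we get n = 5, nxy = 25, nlev = 17, hence
    // nscalars = 75 + 75 + 1275 + 425 + 17 + 1 = 1868 doubles, i.e. roughly 14.6 KiB per team.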
477
478 KOKKOS_INLINE_FUNCTION
479 size_t team_shmem_size_dn( const int /* ts */ ) const
480 {
481 const int nlev = r_tile_block_ + 1;
482 const int n = lat_tile_ + 1;
483 const int nxy = n * n;
484
485 // coords_sh(nxy,3) + src_sh(nxy,3,nlev) + k_sh(nxy,nlev) + r_sh(nlev)
486 const size_t nscalars =
487 size_t( nxy ) * 3 + size_t( nxy ) * 3 * nlev + size_t( nxy ) * nlev + size_t( nlev );
488
489 return sizeof( ScalarType ) * nscalars;
490 }
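    // Same example for this Dirichlet/Neumann variant (no normals_sh): nscalars = 75 + 1275 + 425
    // + 17 = 1792 doubles, i.e. 14 KiB per team in double precision.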
491
492 private:
493 /**
494 * @brief Decode a team league rank / team rank into:
495 * - tile origin (x0,y0,r0)
496 * - intra-tile thread coordinates (tx,ty,tr)
497 * - target cell (x_cell,y_cell,r_cell)
498 */
499 KOKKOS_INLINE_FUNCTION
500 void decode_team_indices(
501 const Team& team,
502 int& local_subdomain_id,
503 int& x0,
504 int& y0,
505 int& r0,
506 int& tx,
507 int& ty,
508 int& tr,
509 int& x_cell,
510 int& y_cell,
511 int& r_cell ) const
512 {
513 int tmp = team.league_rank();
514
515 const int r_tile_id = tmp % r_tiles_;
516 tmp /= r_tiles_;
517
518 const int lat_y_id = tmp % lat_tiles_;
519 tmp /= lat_tiles_;
520
521 const int lat_x_id = tmp % lat_tiles_;
522 tmp /= lat_tiles_;
523
524 local_subdomain_id = tmp;
525
526 x0 = lat_x_id * lat_tile_;
527 y0 = lat_y_id * lat_tile_;
528 r0 = r_tile_id * r_tile_block_;
529
530 const int tid = team.team_rank();
531 tr = tid % r_tile_;
532 tx = ( tid / r_tile_ ) % lat_tile_;
533 ty = tid / ( r_tile_ * lat_tile_ );
534
535 x_cell = x0 + tx;
536 y_cell = y0 + ty;
537 r_cell = r0 + tr;
538 }
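    // Worked example (illustrative): with r_tile_ = 8 and lat_tile_ = 4, team_rank 37 decodes to
    // tr = 37 % 8 = 5, tx = ( 37 / 8 ) % 4 = 0, ty = 37 / 32 = 1, so the thread handles the cell
    // at ( x0 + 0, y0 + 1, r0 + 5 ) of its tile.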
539
540 // -------------------------------------------------------------------------
541 // Path-specific team entry points (NO path branching)
542 // These are the functions called directly by host-dispatched kernel launches.
543 // -------------------------------------------------------------------------
544
545 /**
546 * @brief Team entry for slow (local-matrix) path.
547 *
548 * This wrapper performs only:
549 * - team index decoding
550 * - boundary flag queries
551 * - forwarding to operator_slow_path()
552 *
553 * No dispatch branching happens here.
554 */
555 KOKKOS_INLINE_FUNCTION
556 void run_team_slow( const Team& team ) const
557 {
558 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
559 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
560
561 if ( tr >= r_tile_ )
562 return;
563
564 for ( int pass = 0; pass < r_passes_; ++pass )
565 {
566 const int r_cell_pass = r0 + pass * r_tile_ + tr;
567 if ( r_cell_pass >= hex_rad_ )
568 break;
569
570 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell_pass, CMB );
571 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell_pass + 1, SURFACE );
572
573 operator_slow_path(
574 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell_pass, at_cmb, at_surface );
575 }
576 }
577
578 /**
579 * @brief Team entry for fast Dirichlet/Neumann matrix-free path.
580 *
581 * Templated on Diagonal so the compiler can dead-code-eliminate the
582 * unused matvec or diagonal-only path, reducing register pressure.
583 */
584 template < bool Diagonal >
585 KOKKOS_INLINE_FUNCTION
586 void run_team_fast_dirichlet_neumann( const Team& team ) const
587 {
588 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
589 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
590
591 if ( tr >= r_tile_ )
592 return;
593
594 operator_fast_dirichlet_neumann_path< Diagonal >(
595 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
596 }
597
598 /**
599 * @brief Team entry for fast free-slip matrix-free path.
600 *
601 * Templated on Diagonal so the compiler can dead-code-eliminate the
602 * unused matvec or diagonal-only path, reducing register pressure.
603 */
604 template < bool Diagonal >
605 KOKKOS_INLINE_FUNCTION
606 void run_team_fast_freeslip( const Team& team ) const
607 {
608 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
609 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
610
611 if ( tr >= r_tile_ )
612 return;
613
614 operator_fast_freeslip_path< Diagonal >(
615 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
616 }
617
618 // ===================== SLOW PATH =====================
619 KOKKOS_INLINE_FUNCTION
620 void operator_slow_path(
621 const Team& team,
622 const int local_subdomain_id,
623 const int x0,
624 const int y0,
625 const int r0,
626 const int tx,
627 const int ty,
628 const int tr,
629 const int x_cell,
630 const int y_cell,
631 const int r_cell,
632 const bool at_cmb,
633 const bool at_surface ) const
634 {
635 (void) team;
636 (void) x0;
637 (void) y0;
638 (void) r0;
639 (void) tx;
640 (void) ty;
641 (void) tr;
642
643 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ || r_cell >= hex_rad_ )
644 return;
645
 647        dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > A[num_wedges_per_hex_cell];
648 if ( operator_stored_matrix_mode_ == linalg::OperatorStoredMatrixMode::Full )
649 {
650 A[0] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
651 A[1] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
652 }
653 else if ( operator_stored_matrix_mode_ == linalg::OperatorStoredMatrixMode::Selective )
654 {
655 if ( local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 ) &&
656 local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 ) )
657 {
658 A[0] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
659 A[1] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
660 }
661 else
662 {
663 A[0] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
664 A[1] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
665 }
666 }
667 else
668 {
669 A[0] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
670 A[1] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
671 }
672
673 dense::Vec< ScalarT, 18 > src[num_wedges_per_hex_cell];
674 for ( int dimj = 0; dimj < 3; dimj++ )
675 {
676 dense::Vec< ScalarT, 6 > src_d[num_wedges_per_hex_cell];
677 extract_local_wedge_vector_coefficients( src_d, local_subdomain_id, x_cell, y_cell, r_cell, dimj, src_ );
678
679 for ( int wedge = 0; wedge < num_wedges_per_hex_cell; wedge++ )
680 {
681 for ( int i = 0; i < num_nodes_per_wedge; i++ )
682 {
683 src[wedge]( dimj * num_nodes_per_wedge + i ) = src_d[wedge]( i );
684 }
685 }
686 }
687
688 dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > boundary_mask;
689 boundary_mask.fill( 1.0 );
690
691 bool freeslip_reorder = false;
692 dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > R[num_wedges_per_hex_cell];
693
694 if ( at_cmb || at_surface )
695 {
696 ShellBoundaryFlag sbf = at_cmb ? CMB : SURFACE;
 698            const BoundaryConditionFlag bcf = get_boundary_condition_flag( bcs_, sbf );
699 if ( bcf == DIRICHLET )
700 {
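                // Zero every coupling that involves a constrained boundary node (bottom nodes i, j < 3
                // at the CMB, top nodes i, j >= 3 at the surface), keeping only the diagonal entry of
                // each constrained degree of freedom.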
701 for ( int dimi = 0; dimi < 3; ++dimi )
702 {
703 for ( int dimj = 0; dimj < 3; ++dimj )
704 {
705 for ( int i = 0; i < num_nodes_per_wedge; i++ )
706 {
707 for ( int j = 0; j < num_nodes_per_wedge; j++ )
708 {
709 if ( ( at_cmb && ( ( dimi == dimj && i != j && ( i < 3 || j < 3 ) ) ||
710 ( dimi != dimj && ( i < 3 || j < 3 ) ) ) ) ||
711 ( at_surface && ( ( dimi == dimj && i != j && ( i >= 3 || j >= 3 ) ) ||
712 ( dimi != dimj && ( i >= 3 || j >= 3 ) ) ) ) )
713 {
714 boundary_mask( i + dimi * num_nodes_per_wedge, j + dimj * num_nodes_per_wedge ) =
715 0.0;
716 }
717 }
718 }
719 }
720 }
721 }
722 else if ( bcf == FREESLIP )
723 {
724 freeslip_reorder = true;
725 dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > A_tmp[num_wedges_per_hex_cell] = { 0 };
726
727 for ( int wedge = 0; wedge < 2; ++wedge )
728 {
729 for ( int dimi = 0; dimi < 3; ++dimi )
730 {
731 for ( int node_idxi = 0; node_idxi < num_nodes_per_wedge; node_idxi++ )
732 {
733 for ( int dimj = 0; dimj < 3; ++dimj )
734 {
735 for ( int node_idxj = 0; node_idxj < num_nodes_per_wedge; node_idxj++ )
736 {
737 A_tmp[wedge]( node_idxi * 3 + dimi, node_idxj * 3 + dimj ) = A[wedge](
738 node_idxi + dimi * num_nodes_per_wedge,
739 node_idxj + dimj * num_nodes_per_wedge );
740 }
741 }
742 }
743 }
745 }
746
747 constexpr int layer_hex_offset_x[2][3] = { { 0, 1, 0 }, { 1, 0, 1 } };
748 constexpr int layer_hex_offset_y[2][3] = { { 0, 0, 1 }, { 1, 1, 0 } };
749
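                // Free-slip: build per-wedge rotations R that are identity except for 3x3 blocks rotating
                // the boundary-layer nodal dofs from Cartesian to normal/tangential coordinates; A and src
                // are transformed with R, masked, applied, and the result rotated back with R^T below.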
750 for ( int wedge = 0; wedge < 2; ++wedge )
751 {
752 for ( int i = 0; i < LocalMatrixDim; ++i )
753 {
754 R[wedge]( i, i ) = 1.0;
755 }
756
757 for ( int boundary_node_idx = 0; boundary_node_idx < 3; boundary_node_idx++ )
758 {
759 dense::Vec< double, 3 > normal = grid::shell::coords(
760 local_subdomain_id,
761 x_cell + layer_hex_offset_x[wedge][boundary_node_idx],
762 y_cell + layer_hex_offset_y[wedge][boundary_node_idx],
763 r_cell + ( at_cmb ? 0 : 1 ),
764 grid_,
765 radii_ );
766
767 auto R_i = trafo_mat_cartesian_to_normal_tangential( normal );
768
769 const int offset_in_R = at_cmb ? 0 : 9;
770 for ( int dimi = 0; dimi < 3; ++dimi )
771 {
772 for ( int dimj = 0; dimj < 3; ++dimj )
773 {
774 R[wedge](
775 offset_in_R + boundary_node_idx * 3 + dimi,
776 offset_in_R + boundary_node_idx * 3 + dimj ) = R_i( dimi, dimj );
777 }
778 }
779 }
780
781 A[wedge] = R[wedge] * A_tmp[wedge] * R[wedge].transposed();
782
783 auto src_tmp = R[wedge] * src[wedge];
784 for ( int i = 0; i < 18; ++i )
785 {
786 src[wedge]( i ) = src_tmp( i );
787 }
788
789 const int node_start = at_surface ? 3 : 0;
790 const int node_end = at_surface ? 6 : 3;
791
792 for ( int node_idx = node_start; node_idx < node_end; node_idx++ )
793 {
794 const int idx = node_idx * 3;
795 for ( int k = 0; k < 18; ++k )
796 {
797 if ( k != idx )
798 {
799 boundary_mask( idx, k ) = 0.0;
800 boundary_mask( k, idx ) = 0.0;
801 }
802 }
803 }
804 }
805 }
806 }
807
808 for ( int wedge = 0; wedge < num_wedges_per_hex_cell; wedge++ )
809 {
810 A[wedge].hadamard_product( boundary_mask );
811 }
812
813 if ( diagonal_ )
814 {
815 A[0] = A[0].diagonal();
816 A[1] = A[1].diagonal();
817 }
818
819 dense::Vec< ScalarT, LocalMatrixDim > dst[num_wedges_per_hex_cell];
820 dst[0] = A[0] * src[0];
821 dst[1] = A[1] * src[1];
822
823 if ( freeslip_reorder )
824 {
825 dense::Vec< ScalarT, LocalMatrixDim > dst_tmp[num_wedges_per_hex_cell];
826 dst_tmp[0] = R[0].transposed() * dst[0];
827 dst_tmp[1] = R[1].transposed() * dst[1];
828
829 for ( int i = 0; i < 18; ++i )
830 {
831 dst[0]( i ) = dst_tmp[0]( i );
832 dst[1]( i ) = dst_tmp[1]( i );
833 }
834
837 }
838
839 for ( int dimi = 0; dimi < 3; dimi++ )
840 {
841 dense::Vec< ScalarT, 6 > dst_d[num_wedges_per_hex_cell];
842 dst_d[0] = dst[0].template slice< 6 >( dimi * num_nodes_per_wedge );
843 dst_d[1] = dst[1].template slice< 6 >( dimi * num_nodes_per_wedge );
844
846 dst_, local_subdomain_id, x_cell, y_cell, r_cell, dimi, dst_d );
847 }
848 }
849
850 // ===================== FAST DIRICHLET/NEUMANN PATH =====================
851 template < bool Diagonal >
852 KOKKOS_INLINE_FUNCTION
853 void operator_fast_dirichlet_neumann_path(
854 const Team& team,
855 const int local_subdomain_id,
856 const int x0,
857 const int y0,
858 const int r0,
859 const int tx,
860 const int ty,
861 const int tr,
862 const int x_cell,
863 const int y_cell ) const
864 {
865 const int nlev = r_tile_block_ + 1;
866 const int nxy = ( lat_tile_ + 1 ) * ( lat_tile_ + 1 );
867
868 double* shmem =
869 reinterpret_cast< double* >( team.team_shmem().get_shmem( team_shmem_size_dn( team.team_size() ) ) );
870
871 using ScratchCoords =
872 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
873 using ScratchSrc = Kokkos::
874 View< double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
875 using ScratchK =
876 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
877
878 ScratchCoords coords_sh( shmem, nxy, 3 );
879 shmem += nxy * 3;
880
881 ScratchSrc src_sh( shmem, nxy, 3, nlev );
882 shmem += nxy * 3 * nlev;
883
884 ScratchK k_sh( shmem, nxy, nlev );
885 shmem += nxy * nlev;
886
887 auto r_sh =
888 Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >(
889 shmem, nlev );
890
891 auto node_id = [&]( int nx, int ny ) -> int { return nx + ( lat_tile_ + 1 ) * ny; };
892
893 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&]( int n ) {
894 const int dxn = n % ( lat_tile_ + 1 );
895 const int dyn = n / ( lat_tile_ + 1 );
896 const int xi = x0 + dxn;
897 const int yi = y0 + dyn;
898
899 if ( xi <= hex_lat_ && yi <= hex_lat_ )
900 {
901 coords_sh( n, 0 ) = grid_( local_subdomain_id, xi, yi, 0 );
902 coords_sh( n, 1 ) = grid_( local_subdomain_id, xi, yi, 1 );
903 coords_sh( n, 2 ) = grid_( local_subdomain_id, xi, yi, 2 );
904 }
905 else
906 {
907 coords_sh( n, 0 ) = coords_sh( n, 1 ) = coords_sh( n, 2 ) = 0.0;
908 }
909 } );
910
911 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&]( int lvl ) {
912 const int rr = r0 + lvl;
913 r_sh( lvl ) = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
914 } );
915
916 const int total_pairs = nxy * nlev;
917 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total_pairs ), [&]( int t ) {
918 const int node = t / nlev;
919 const int lvl = t - node * nlev;
920
921 const int dxn = node % ( lat_tile_ + 1 );
922 const int dyn = node / ( lat_tile_ + 1 );
923
924 const int xi = x0 + dxn;
925 const int yi = y0 + dyn;
926 const int rr = r0 + lvl;
927
928 if ( xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_ )
929 {
930 k_sh( node, lvl ) = k_( local_subdomain_id, xi, yi, rr );
931 src_sh( node, 0, lvl ) = src_( local_subdomain_id, xi, yi, rr, 0 );
932 src_sh( node, 1, lvl ) = src_( local_subdomain_id, xi, yi, rr, 1 );
933 src_sh( node, 2, lvl ) = src_( local_subdomain_id, xi, yi, rr, 2 );
934 }
935 else
936 {
937 k_sh( node, lvl ) = 0.0;
938 src_sh( node, 0, lvl ) = src_sh( node, 1, lvl ) = src_sh( node, 2, lvl ) = 0.0;
939 }
940 } );
941
942 team.team_barrier();
943
944 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ )
945 return;
946
947 constexpr double ONE_THIRD = 1.0 / 3.0;
948 constexpr double ONE_SIXTH = 1.0 / 6.0;
949
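        // dN_ref holds the reference-element gradients of the six linear wedge shape functions at
        // the single quadrature point (triangle centroid, zeta = 0); e.g. the radial entry -1/6 is
        // the centroid value 1/3 times the radial derivative -1/2.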
950 static constexpr double dN_ref[6][3] = {
951 { -0.5, -0.5, -ONE_SIXTH },
952 { 0.5, 0.0, -ONE_SIXTH },
953 { 0.0, 0.5, -ONE_SIXTH },
954 { -0.5, -0.5, ONE_SIXTH },
955 { 0.5, 0.0, ONE_SIXTH },
956 { 0.0, 0.5, ONE_SIXTH } };
957
958 static constexpr int WEDGE_NODE_OFF[2][6][3] = {
959 { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
960 { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };
961
962 const int n00 = node_id( tx, ty );
963 const int n01 = node_id( tx, ty + 1 );
964 const int n10 = node_id( tx + 1, ty );
965 const int n11 = node_id( tx + 1, ty + 1 );
966
967 for ( int pass = 0; pass < r_passes_; ++pass )
968 {
969 const int lvl0 = pass * r_tile_ + tr;
970 const int r_cell = r0 + lvl0;
971
972 if ( r_cell >= hex_rad_ )
973 break;
974
975 const double r_0 = r_sh( lvl0 );
976 const double r_1 = r_sh( lvl0 + 1 );
977
978 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
979 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
980
981 const bool at_boundary = at_cmb || at_surface;
982 bool treat_boundary_dirichlet = false;
983 if ( at_boundary )
984 {
985 const ShellBoundaryFlag sbf = at_cmb ? CMB : SURFACE;
986 treat_boundary_dirichlet = ( get_boundary_condition_flag( bcs_, sbf ) == DIRICHLET );
987 }
988
989 const int cmb_shift = ( ( at_boundary && treat_boundary_dirichlet && ( !Diagonal ) && at_cmb ) ? 3 : 0 );
990 const int surface_shift =
991 ( ( at_boundary && treat_boundary_dirichlet && ( !Diagonal ) && at_surface ) ? 3 : 0 );
992
993 for ( int w = 0; w < 2; ++w )
994 {
995 const int v0 = w == 0 ? n00 : n11;
996 const int v1 = w == 0 ? n10 : n01;
997 const int v2 = w == 0 ? n01 : n10;
998
999 double k_sum = 0.0;
1000#pragma unroll
1001 for ( int node = 0; node < 6; ++node )
1002 {
1003 const int nid = node_id( tx + WEDGE_NODE_OFF[w][node][0], ty + WEDGE_NODE_OFF[w][node][1] );
1004 k_sum += k_sh( nid, lvl0 + WEDGE_NODE_OFF[w][node][2] );
1005 }
1006 const double k_eval = ONE_SIXTH * k_sum;
1007
1008 double kwJ;
1009
1010 // ==== Phase 1: Jacobian + Gather (gu tensor) ====
1011 // invJ lives only in this scope so the compiler can reclaim its registers.
1012 double gu00 = 0.0;
1013 double gu10 = 0.0, gu11 = 0.0;
1014 double gu20 = 0.0, gu21 = 0.0, gu22 = 0.0;
1015 double div_u = 0.0;
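                // gu00..gu22 accumulate the symmetric gradient eps(u) at the quadrature point
                // (gu10 = eps_01, gu20 = eps_02, gu21 = eps_12); div_u accumulates its trace.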
1016 {
1017 const double half_dr = 0.5 * ( r_1 - r_0 );
1018 const double r_mid = 0.5 * ( r_0 + r_1 );
1019
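                    // J is the Jacobian of the reference-to-physical map x(xi, eta, zeta) = r(zeta) * c(xi, eta)
                    // at the quadrature point: the lateral columns are tangent differences scaled by r_mid, the
                    // radial column is the lateral centroid direction scaled by half_dr.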
1020 const double J_0_0 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v1, 0 ) );
1021 const double J_0_1 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v2, 0 ) );
1022 const double J_0_2 =
1023 half_dr * ( ONE_THIRD * ( coords_sh( v0, 0 ) + coords_sh( v1, 0 ) + coords_sh( v2, 0 ) ) );
1024
1025 const double J_1_0 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v1, 1 ) );
1026 const double J_1_1 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v2, 1 ) );
1027 const double J_1_2 =
1028 half_dr * ( ONE_THIRD * ( coords_sh( v0, 1 ) + coords_sh( v1, 1 ) + coords_sh( v2, 1 ) ) );
1029
1030 const double J_2_0 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v1, 2 ) );
1031 const double J_2_1 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v2, 2 ) );
1032 const double J_2_2 =
1033 half_dr * ( ONE_THIRD * ( coords_sh( v0, 2 ) + coords_sh( v1, 2 ) + coords_sh( v2, 2 ) ) );
1034
1035 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 -
1036 J_0_1 * J_1_0 * J_2_2 + J_0_1 * J_1_2 * J_2_0 +
1037 J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
1038
1039 kwJ = k_eval * Kokkos::abs( J_det );
1040
1041 const double inv_det = 1.0 / J_det;
1042
1043 const double i00 = inv_det * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
1044 const double i01 = inv_det * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
1045 const double i02 = inv_det * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
1046 const double i10 = inv_det * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
1047 const double i11 = inv_det * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
1048 const double i12 = inv_det * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
1049 const double i20 = inv_det * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
1050 const double i21 = inv_det * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
1051 const double i22 = inv_det * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
1052
1053 if ( !Diagonal )
1054 {
1055#pragma unroll
1056 for ( int n = cmb_shift; n < 6 - surface_shift; ++n )
1057 {
1058 const double gx = dN_ref[n][0];
1059 const double gy = dN_ref[n][1];
1060 const double gz = dN_ref[n][2];
1061 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1062 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1063 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1064
1065 const int ddx = WEDGE_NODE_OFF[w][n][0];
1066 const int ddy = WEDGE_NODE_OFF[w][n][1];
1067 const int ddr = WEDGE_NODE_OFF[w][n][2];
1068 const int nid = node_id( tx + ddx, ty + ddy );
1069 const int lvl = lvl0 + ddr;
1070
1071 const double s0 = src_sh( nid, 0, lvl );
1072 const double s1 = src_sh( nid, 1, lvl );
1073 const double s2 = src_sh( nid, 2, lvl );
1074
1075 gu00 += g0 * s0;
1076 gu11 += g1 * s1;
1077 gu22 += g2 * s2;
1078 gu10 += 0.5 * ( g1 * s0 + g0 * s1 );
1079 gu20 += 0.5 * ( g2 * s0 + g0 * s2 );
1080 gu21 += 0.5 * ( g2 * s1 + g1 * s2 );
1081 div_u += g0 * s0 + g1 * s1 + g2 * s2;
1082 }
1083 }
1084 }
1085 // invJ (i00..i22) is now out of scope — registers can be reclaimed.
1086
1087 // ==== Phase 2: Recompute Jacobian + Scatter ====
1088 {
1089 const double half_dr = 0.5 * ( r_1 - r_0 );
1090 const double r_mid = 0.5 * ( r_0 + r_1 );
1091
1092 const double J_0_0 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v1, 0 ) );
1093 const double J_0_1 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v2, 0 ) );
1094 const double J_0_2 =
1095 half_dr * ( ONE_THIRD * ( coords_sh( v0, 0 ) + coords_sh( v1, 0 ) + coords_sh( v2, 0 ) ) );
1096
1097 const double J_1_0 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v1, 1 ) );
1098 const double J_1_1 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v2, 1 ) );
1099 const double J_1_2 =
1100 half_dr * ( ONE_THIRD * ( coords_sh( v0, 1 ) + coords_sh( v1, 1 ) + coords_sh( v2, 1 ) ) );
1101
1102 const double J_2_0 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v1, 2 ) );
1103 const double J_2_1 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v2, 2 ) );
1104 const double J_2_2 =
1105 half_dr * ( ONE_THIRD * ( coords_sh( v0, 2 ) + coords_sh( v1, 2 ) + coords_sh( v2, 2 ) ) );
1106
1107 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 -
1108 J_0_1 * J_1_0 * J_2_2 + J_0_1 * J_1_2 * J_2_0 +
1109 J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
1110
1111 const double inv_det = 1.0 / J_det;
1112
1113 const double i00 = inv_det * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
1114 const double i01 = inv_det * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
1115 const double i02 = inv_det * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
1116 const double i10 = inv_det * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
1117 const double i11 = inv_det * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
1118 const double i12 = inv_det * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
1119 const double i20 = inv_det * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
1120 const double i21 = inv_det * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
1121 const double i22 = inv_det * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
1122
1123 if ( !Diagonal )
1124 {
1125 constexpr double NEG_TWO_THIRDS = -0.66666666666666663;
1126#pragma unroll
1127 for ( int n = cmb_shift; n < 6 - surface_shift; ++n )
1128 {
1129 const double gx = dN_ref[n][0];
1130 const double gy = dN_ref[n][1];
1131 const double gz = dN_ref[n][2];
1132 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1133 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1134 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1135
1136 const int ddx = WEDGE_NODE_OFF[w][n][0];
1137 const int ddy = WEDGE_NODE_OFF[w][n][1];
1138 const int ddr = WEDGE_NODE_OFF[w][n][2];
1139 Kokkos::atomic_add(
1140 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 0 ),
1141 kwJ * ( 2.0 * ( g0 * gu00 + g1 * gu10 + g2 * gu20 ) +
1142 NEG_TWO_THIRDS * g0 * div_u ) );
1143 Kokkos::atomic_add(
1144 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 1 ),
1145 kwJ * ( 2.0 * ( g0 * gu10 + g1 * gu11 + g2 * gu21 ) +
1146 NEG_TWO_THIRDS * g1 * div_u ) );
1147 Kokkos::atomic_add(
1148 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 2 ),
1149 kwJ * ( 2.0 * ( g0 * gu20 + g1 * gu21 + g2 * gu22 ) +
1150 NEG_TWO_THIRDS * g2 * div_u ) );
1151 }
1152 }
1153
1154 if ( Diagonal || ( treat_boundary_dirichlet && at_boundary ) )
1155 {
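                    // Diagonal-only contributions: in Diagonal mode this covers all six nodes (the coupled loop
                    // above is compiled out); with Dirichlet boundaries in matvec mode the reversed shifts make
                    // it visit exactly the constrained nodes skipped above. The entry reduces to
                    // k * w * |J| * ( |g|^2 + (1/3) * g_d^2 ) per direction d, since for v = phi * e_d one has
                    // 2 * eps(v):eps(v) = |g|^2 + g_d^2 and -(2/3) * (div v)^2 = -(2/3) * g_d^2.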
1156#pragma unroll
1157 for ( int n = surface_shift; n < 6 - cmb_shift; ++n )
1158 {
1159 const double gx = dN_ref[n][0];
1160 const double gy = dN_ref[n][1];
1161 const double gz = dN_ref[n][2];
1162 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1163 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1164 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1165 const double gg = g0 * g0 + g1 * g1 + g2 * g2;
1166
1167 const int nid = node_id( tx + WEDGE_NODE_OFF[w][n][0], ty + WEDGE_NODE_OFF[w][n][1] );
1168 const int lvl = lvl0 + WEDGE_NODE_OFF[w][n][2];
1169
1170 const double sv0 = src_sh( nid, 0, lvl );
1171 const double sv1 = src_sh( nid, 1, lvl );
1172 const double sv2 = src_sh( nid, 2, lvl );
1173
1174 const int ddx = WEDGE_NODE_OFF[w][n][0];
1175 const int ddy = WEDGE_NODE_OFF[w][n][1];
1176 const int ddr = WEDGE_NODE_OFF[w][n][2];
1177 Kokkos::atomic_add(
1178 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 0 ),
1179 kwJ * sv0 * ( gg + ONE_THIRD * g0 * g0 ) );
1180 Kokkos::atomic_add(
1181 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 1 ),
1182 kwJ * sv1 * ( gg + ONE_THIRD * g1 * g1 ) );
1183 Kokkos::atomic_add(
1184 &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, 2 ),
1185 kwJ * sv2 * ( gg + ONE_THIRD * g2 * g2 ) );
1186 }
1187 }
1188 }
1189
1190 } // end wedge loop
1191 }
1192 }
1193
1194 // ===================== FAST FREESLIP PATH =====================
1195 KOKKOS_INLINE_FUNCTION
1196 void normalize3( double& x, double& y, double& z ) const
1197 {
1198 const double n2 = x * x + y * y + z * z;
1199 if ( n2 > 0.0 )
1200 {
1201 const double invn = 1.0 / Kokkos::sqrt( n2 );
1202 x *= invn;
1203 y *= invn;
1204 z *= invn;
1205 }
1206 }
1207
1208 KOKKOS_INLINE_FUNCTION
1209 void project_tangential_inplace(
1210 const double nx,
1211 const double ny,
1212 const double nz,
1213 double& ux,
1214 double& uy,
1215 double& uz ) const
1216 {
1217 const double dot = nx * ux + ny * uy + nz * uz;
1218 ux -= dot * nx;
1219 uy -= dot * ny;
1220 uz -= dot * nz;
1221 }
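    // Tangential projection: u <- ( I - n n^T ) u for a unit normal n, i.e. u minus its normal
    // component ( n . u ) n.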
1222
1223 template < bool Diagonal >
1224 KOKKOS_INLINE_FUNCTION
1225 void operator_fast_freeslip_path(
1226 const Team& team,
1227 const int local_subdomain_id,
1228 const int x0,
1229 const int y0,
1230 const int r0,
1231 const int tx,
1232 const int ty,
1233 const int tr,
1234 const int x_cell,
1235 const int y_cell ) const
1236 {
1237 const int nlev = r_tile_block_ + 1;
1238 const int nxy = ( lat_tile_ + 1 ) * ( lat_tile_ + 1 );
1239
1240 double* shmem =
1241 reinterpret_cast< double* >( team.team_shmem().get_shmem( team_shmem_size( team.team_size() ) ) );
1242
1243 using ScratchCoords =
1244 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1245 using ScratchSrc = Kokkos::
1246 View< double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1247 using ScratchK =
1248 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
1249
1250 ScratchCoords coords_sh( shmem, nxy, 3 );
1251 shmem += nxy * 3;
1252
1253 // Normalized normals in shared memory — avoids 12 persistent register-doubles.
1254 ScratchCoords normals_sh( shmem, nxy, 3 );
1255 shmem += nxy * 3;
1256
1257 ScratchSrc src_sh( shmem, nxy, 3, nlev );
1258 shmem += nxy * 3 * nlev;
1259
1260 ScratchK k_sh( shmem, nxy, nlev );
1261 shmem += nxy * nlev;
1262
1263 auto r_sh =
1264 Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >(
1265 shmem, nlev );
1266
1267 auto node_id = [&]( int nx, int ny ) -> int { return nx + ( lat_tile_ + 1 ) * ny; };
1268
1269 // Preload coords + compute normalized normals cooperatively.
1270 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&]( int n ) {
1271 const int dxn = n % ( lat_tile_ + 1 );
1272 const int dyn = n / ( lat_tile_ + 1 );
1273 const int xi = x0 + dxn;
1274 const int yi = y0 + dyn;
1275
1276 if ( xi <= hex_lat_ && yi <= hex_lat_ )
1277 {
1278 const double cx = grid_( local_subdomain_id, xi, yi, 0 );
1279 const double cy = grid_( local_subdomain_id, xi, yi, 1 );
1280 const double cz = grid_( local_subdomain_id, xi, yi, 2 );
1281 coords_sh( n, 0 ) = cx;
1282 coords_sh( n, 1 ) = cy;
1283 coords_sh( n, 2 ) = cz;
1284
1285 const double n2 = cx * cx + cy * cy + cz * cz;
1286 if ( n2 > 0.0 )
1287 {
1288 const double invn = 1.0 / Kokkos::sqrt( n2 );
1289 normals_sh( n, 0 ) = cx * invn;
1290 normals_sh( n, 1 ) = cy * invn;
1291 normals_sh( n, 2 ) = cz * invn;
1292 }
1293 else
1294 {
1295 normals_sh( n, 0 ) = normals_sh( n, 1 ) = normals_sh( n, 2 ) = 0.0;
1296 }
1297 }
1298 else
1299 {
1300 coords_sh( n, 0 ) = coords_sh( n, 1 ) = coords_sh( n, 2 ) = 0.0;
1301 normals_sh( n, 0 ) = normals_sh( n, 1 ) = normals_sh( n, 2 ) = 0.0;
1302 }
1303 } );
1304
1305 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&]( int lvl ) {
1306 const int rr = r0 + lvl;
1307 r_sh( lvl ) = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
1308 } );
1309
1310 const int total_pairs = nxy * nlev;
1311 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total_pairs ), [&]( int t ) {
1312 const int node = t / nlev;
1313 const int lvl = t - node * nlev;
1314
1315 const int dxn = node % ( lat_tile_ + 1 );
1316 const int dyn = node / ( lat_tile_ + 1 );
1317
1318 const int xi = x0 + dxn;
1319 const int yi = y0 + dyn;
1320 const int rr = r0 + lvl;
1321
1322 if ( xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_ )
1323 {
1324 k_sh( node, lvl ) = k_( local_subdomain_id, xi, yi, rr );
1325 src_sh( node, 0, lvl ) = src_( local_subdomain_id, xi, yi, rr, 0 );
1326 src_sh( node, 1, lvl ) = src_( local_subdomain_id, xi, yi, rr, 1 );
1327 src_sh( node, 2, lvl ) = src_( local_subdomain_id, xi, yi, rr, 2 );
1328 }
1329 else
1330 {
1331 k_sh( node, lvl ) = 0.0;
1332 src_sh( node, 0, lvl ) = src_sh( node, 1, lvl ) = src_sh( node, 2, lvl ) = 0.0;
1333 }
1334 } );
1335
1336 team.team_barrier();
1337
1338 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ )
1339 return;
1340
1341 for ( int pass = 0; pass < r_passes_; ++pass )
1342 {
1343 const int lvl0 = pass * r_tile_ + tr;
1344 const int r_cell = r0 + lvl0;
1345
1346 if ( r_cell >= hex_rad_ )
1347 break;
1348
1349 const double r_0 = r_sh( lvl0 );
1350 const double r_1 = r_sh( lvl0 + 1 );
1351
1352 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
1353 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
1354
1355 const BoundaryConditionFlag cmb_bc = get_boundary_condition_flag( bcs_, CMB );
1356 const BoundaryConditionFlag surface_bc = get_boundary_condition_flag( bcs_, SURFACE );
1357
1358 const bool cmb_freeslip = at_cmb && ( cmb_bc == FREESLIP );
1359 const bool surf_freeslip = at_surface && ( surface_bc == FREESLIP );
1360 const bool cmb_dirichlet = at_cmb && ( cmb_bc == DIRICHLET );
1361 const bool surface_dirichlet = at_surface && ( surface_bc == DIRICHLET );
1362
1363 const int cmb_shift = ( ( cmb_dirichlet && ( !Diagonal ) ) ? 3 : 0 );
1364 const int surface_shift = ( ( surface_dirichlet && ( !Diagonal ) ) ? 3 : 0 );
1365
1366 static constexpr int WEDGE_NODE_OFF[2][6][3] = {
1367 { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
1368 { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };
1369
1370 static constexpr int WEDGE_TO_UNIQUE[2][6] = {
1371 { 0, 1, 2, 3, 4, 5 },
1372 { 6, 2, 1, 7, 5, 4 }
1373 };
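            // WEDGE_TO_UNIQUE maps ( wedge, local node ) to one of the 8 unique hex-cell nodes used by
            // dst8 and the scatter at the end of the pass: 0..5 are the wedge-0 nodes,
            // 6 = ( x+1, y+1, r ), 7 = ( x+1, y+1, r+1 ).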
1374
1375 constexpr double ONE_THIRD = 1.0 / 3.0;
1376 constexpr double ONE_SIXTH = 1.0 / 6.0;
1377 constexpr double NEG_TWO_THIRDS = -0.66666666666666663;
1378
1379 static constexpr double dN_ref[6][3] = {
1380 { -0.5, -0.5, -ONE_SIXTH },
1381 { 0.5, 0.0, -ONE_SIXTH },
1382 { 0.0, 0.5, -ONE_SIXTH },
1383 { -0.5, -0.5, ONE_SIXTH },
1384 { 0.5, 0.0, ONE_SIXTH },
1385 { 0.0, 0.5, ONE_SIXTH } };
1386
1387 const int n00 = node_id( tx, ty );
1388 const int n01 = node_id( tx, ty + 1 );
1389 const int n10 = node_id( tx + 1, ty );
1390 const int n11 = node_id( tx + 1, ty + 1 );
1391
1392 // Corner-to-shared-memory-node mapping: 0→n00, 1→n10, 2→n01, 3→n11.
1393 const int corner_node[4] = { n00, n10, n01, n11 };
1394 // Corner-to-unique-node for CMB (r=0) and surface (r=1) layers.
1395 static constexpr int CMB_CORNER_TO_UNIQUE[4] = { 0, 1, 2, 6 };
1396 static constexpr int SURF_CORNER_TO_UNIQUE[4] = { 3, 4, 5, 7 };
1397
1398 double dst8[3][8] = { { 0.0 } };
1399
1400 // Scalar Ann accumulators per corner. Accumulated inside the test-side loop
1401 // (merged from the former separate Ann loop to reuse already-computed gradients).
1402 double Ann_acc_cmb[4] = {};
1403 double Ann_acc_surf[4] = {};
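            // Hedged reading of the free-slip treatment below: the result is effectively
            // P A P u + sum_c Ann_c ( n_c . u_c ) n_c with P = I - n n^T, so tangential components see
            // the projected operator while each boundary node keeps its normal-normal diagonal entry,
            // mirroring the slow path's masking that preserves the diagonal.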
1404
1405 for ( int w = 0; w < 2; ++w )
1406 {
1407 const int v0 = w == 0 ? n00 : n11;
1408 const int v1 = w == 0 ? n10 : n01;
1409 const int v2 = w == 0 ? n01 : n10;
1410
1411 double k_sum = 0.0;
1412#pragma unroll
1413 for ( int node = 0; node < 6; ++node )
1414 {
1415 const int ddx = WEDGE_NODE_OFF[w][node][0];
1416 const int ddy = WEDGE_NODE_OFF[w][node][1];
1417 const int ddr = WEDGE_NODE_OFF[w][node][2];
1418
1419 const int nid = node_id( tx + ddx, ty + ddy );
1420 const int lvl = lvl0 + ddr;
1421 k_sum += k_sh( nid, lvl );
1422 }
1423 const double k_eval = ONE_SIXTH * k_sum;
1424
1425 double wJ = 0.0;
1426 double i00, i01, i02;
1427 double i10, i11, i12;
1428 double i20, i21, i22;
1429
1430 {
1431 const double half_dr = 0.5 * ( r_1 - r_0 );
1432 const double r_mid = 0.5 * ( r_0 + r_1 );
1433
1434 const double J_0_0 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v1, 0 ) );
1435 const double J_0_1 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v2, 0 ) );
1436 const double J_0_2 = half_dr * ( ONE_THIRD * ( coords_sh( v0, 0 ) + coords_sh( v1, 0 ) + coords_sh( v2, 0 ) ) );
1437
1438 const double J_1_0 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v1, 1 ) );
1439 const double J_1_1 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v2, 1 ) );
1440 const double J_1_2 = half_dr * ( ONE_THIRD * ( coords_sh( v0, 1 ) + coords_sh( v1, 1 ) + coords_sh( v2, 1 ) ) );
1441
1442 const double J_2_0 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v1, 2 ) );
1443 const double J_2_1 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v2, 2 ) );
1444 const double J_2_2 = half_dr * ( ONE_THIRD * ( coords_sh( v0, 2 ) + coords_sh( v1, 2 ) + coords_sh( v2, 2 ) ) );
1445
1446 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 - J_0_1 * J_1_0 * J_2_2 +
1447 J_0_1 * J_1_2 * J_2_0 + J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
1448
1449 const double invJ = 1.0 / J_det;
1450
1451 i00 = invJ * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
1452 i01 = invJ * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
1453 i02 = invJ * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
1454
1455 i10 = invJ * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
1456 i11 = invJ * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
1457 i12 = invJ * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
1458
1459 i20 = invJ * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
1460 i21 = invJ * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
1461 i22 = invJ * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
1462
1463 wJ = Kokkos::abs( J_det );
1464 }
1465
1466 const double kwJ = k_eval * wJ;
1467
1468 // ---- Fused trial + test side with merged Ann accumulation ----
1469 static constexpr int CMB_NODE_TO_CORNER[2][3] = { { 0, 1, 2 }, { 3, 2, 1 } };
1470
1471 double gu00 = 0.0;
1472 double gu10 = 0.0, gu11 = 0.0;
1473 double gu20 = 0.0, gu21 = 0.0, gu22 = 0.0;
1474 double div_u = 0.0;
1475
1476 if ( !Diagonal )
1477 {
1478 // Trial side: accumulate symmetric gradient of u (fused dim loops).
1479 // Read directly from shared memory with inline tangential projection for freeslip nodes.
1480#pragma unroll
1481 for ( int n = cmb_shift; n < 6 - surface_shift; ++n )
1482 {
1483 const double gx = dN_ref[n][0];
1484 const double gy = dN_ref[n][1];
1485 const double gz = dN_ref[n][2];
1486 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1487 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1488 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1489
1490 const int nid = node_id( tx + WEDGE_NODE_OFF[w][n][0], ty + WEDGE_NODE_OFF[w][n][1] );
1491 const int lvl = lvl0 + WEDGE_NODE_OFF[w][n][2];
1492
1493 double s0 = src_sh( nid, 0, lvl );
1494 double s1 = src_sh( nid, 1, lvl );
1495 double s2 = src_sh( nid, 2, lvl );
1496
1497 // Inline tangential projection for freeslip boundary nodes.
1498 if ( cmb_freeslip && n < 3 )
1499 {
1500 const double nx = normals_sh( nid, 0 );
1501 const double ny = normals_sh( nid, 1 );
1502 const double nz = normals_sh( nid, 2 );
1503 const double dot = nx * s0 + ny * s1 + nz * s2;
1504 s0 -= dot * nx;
1505 s1 -= dot * ny;
1506 s2 -= dot * nz;
1507 }
1508 if ( surf_freeslip && n >= 3 )
1509 {
1510 const double nx = normals_sh( nid, 0 );
1511 const double ny = normals_sh( nid, 1 );
1512 const double nz = normals_sh( nid, 2 );
1513 const double dot = nx * s0 + ny * s1 + nz * s2;
1514 s0 -= dot * nx;
1515 s1 -= dot * ny;
1516 s2 -= dot * nz;
1517 }
1518
1519 gu00 += g0 * s0;
1520 gu11 += g1 * s1;
1521 gu22 += g2 * s2;
1522 gu10 += 0.5 * ( g1 * s0 + g0 * s1 );
1523 gu20 += 0.5 * ( g2 * s0 + g0 * s2 );
1524 gu21 += 0.5 * ( g2 * s1 + g1 * s2 );
1525 div_u += g0 * s0 + g1 * s1 + g2 * s2;
1526 }
1527
1528 // Test side + merged Ann accumulation.
1529 // Ann uses the same gradient already computed — no separate loop needed.
1530#pragma unroll
1531 for ( int n = cmb_shift; n < 6 - surface_shift; ++n )
1532 {
1533 const double gx = dN_ref[n][0];
1534 const double gy = dN_ref[n][1];
1535 const double gz = dN_ref[n][2];
1536 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1537 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1538 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1539
1540 const int uid = WEDGE_TO_UNIQUE[w][n];
1541 dst8[0][uid] +=
1542 kwJ * ( 2.0 * ( g0 * gu00 + g1 * gu10 + g2 * gu20 ) + NEG_TWO_THIRDS * g0 * div_u );
1543 dst8[1][uid] +=
1544 kwJ * ( 2.0 * ( g0 * gu10 + g1 * gu11 + g2 * gu21 ) + NEG_TWO_THIRDS * g1 * div_u );
1545 dst8[2][uid] +=
1546 kwJ * ( 2.0 * ( g0 * gu20 + g1 * gu21 + g2 * gu22 ) + NEG_TWO_THIRDS * g2 * div_u );
1547
1548 // Accumulate Ann for freeslip CMB nodes (n < 3) and surface nodes (n >= 3).
1549 if ( cmb_freeslip && n < 3 )
1550 {
1551 const int corner = CMB_NODE_TO_CORNER[w][n];
1552 const int cn = corner_node[corner];
1553 const double nxu = normals_sh( cn, 0 );
1554 const double nyu = normals_sh( cn, 1 );
1555 const double nzu = normals_sh( cn, 2 );
1556 const double gg = g0 * g0 + g1 * g1 + g2 * g2;
1557 const double ng = nxu * g0 + nyu * g1 + nzu * g2;
1558 Ann_acc_cmb[corner] += kwJ * ( gg + ONE_THIRD * ng * ng );
1559 }
1560 if ( surf_freeslip && n >= 3 )
1561 {
1562 const int corner = CMB_NODE_TO_CORNER[w][n - 3];
1563 const int cn = corner_node[corner];
1564 const double nxu = normals_sh( cn, 0 );
1565 const double nyu = normals_sh( cn, 1 );
1566 const double nzu = normals_sh( cn, 2 );
1567 const double gg = g0 * g0 + g1 * g1 + g2 * g2;
1568 const double ng = nxu * g0 + nyu * g1 + nzu * g2;
1569 Ann_acc_surf[corner] += kwJ * ( gg + ONE_THIRD * ng * ng );
1570 }
1571 }
1572 }
1573
1574 // ---- Diagonal / Dirichlet boundary handling (fused) ----
1575 if ( Diagonal || cmb_dirichlet || surface_dirichlet )
1576 {
1577 // Fused diagonal: kwJ * s_d * (|g|^2 + (1/3) * g_d^2)
1578#pragma unroll
1579 for ( int n = surface_shift; n < 6 - cmb_shift; ++n )
1580 {
1581 if ( Diagonal && cmb_freeslip && n < 3 )
1582 continue;
1583 if ( Diagonal && surf_freeslip && n >= 3 )
1584 continue;
1585
1586 const double gx = dN_ref[n][0];
1587 const double gy = dN_ref[n][1];
1588 const double gz = dN_ref[n][2];
1589 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1590 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1591 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1592 const double gg = g0 * g0 + g1 * g1 + g2 * g2;
1593
1594 const int uid = WEDGE_TO_UNIQUE[w][n];
1595 const int nid = node_id( tx + WEDGE_NODE_OFF[w][n][0], ty + WEDGE_NODE_OFF[w][n][1] );
1596 const int lvl = lvl0 + WEDGE_NODE_OFF[w][n][2];
1597 const double s0 = src_sh( nid, 0, lvl );
1598 const double s1 = src_sh( nid, 1, lvl );
1599 const double s2 = src_sh( nid, 2, lvl );
1600
1601 dst8[0][uid] += kwJ * s0 * ( gg + ONE_THIRD * g0 * g0 );
1602 dst8[1][uid] += kwJ * s1 * ( gg + ONE_THIRD * g1 * g1 );
1603 dst8[2][uid] += kwJ * s2 * ( gg + ONE_THIRD * g2 * g2 );
1604 }
1605
1606 // For free-slip boundary nodes in diagonal mode: compute R^T diag(R A_3x3 R^T) R src.
1607 // Normals loaded from shared memory, u_n recomputed from src_sh (original, unprojected).
1608 if ( Diagonal )
1609 {
1610 static constexpr int FS_CORNER_MAP[2][3] = { { 0, 1, 2 }, { 3, 2, 1 } };
1611
1612 auto apply_rotated_diag =
1613 [&]( const int ni, const int node_idx, const int src_lvl ) {
1614 const int corner = FS_CORNER_MAP[w][ni];
1615 const int cn = corner_node[corner];
1616 const double nxu = normals_sh( cn, 0 );
1617 const double nyu = normals_sh( cn, 1 );
1618 const double nzu = normals_sh( cn, 2 );
1619 const int u = WEDGE_TO_UNIQUE[w][node_idx];
1620
1621 const double gx = dN_ref[node_idx][0];
1622 const double gy = dN_ref[node_idx][1];
1623 const double gz = dN_ref[node_idx][2];
1624
1625 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1626 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1627 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1628 const double gg_loc = g0 * g0 + g1 * g1 + g2 * g2;
1629
1630 dense::Vec< double, 3 > n_vec;
1631 n_vec( 0 ) = nxu;
1632 n_vec( 1 ) = nyu;
1633 n_vec( 2 ) = nzu;
1634 const auto R_rot = trafo_mat_cartesian_to_normal_tangential< double >( n_vec );
1635
1636 // Read original (unprojected) src directly from shared memory.
1637 // (Previously: projected src8 + u_n*n = original, now simplified.)
1638 const double s0 = src_sh( cn, 0, src_lvl );
1639 const double s1 = src_sh( cn, 1, src_lvl );
1640 const double s2 = src_sh( cn, 2, src_lvl );
1641
1642 const double Rg0 = R_rot( 0, 0 ) * g0 + R_rot( 0, 1 ) * g1 + R_rot( 0, 2 ) * g2;
1643 const double Rg1 = R_rot( 1, 0 ) * g0 + R_rot( 1, 1 ) * g1 + R_rot( 1, 2 ) * g2;
1644 const double Rg2 = R_rot( 2, 0 ) * g0 + R_rot( 2, 1 ) * g1 + R_rot( 2, 2 ) * g2;
1645 const double Rs0 = R_rot( 0, 0 ) * s0 + R_rot( 0, 1 ) * s1 + R_rot( 0, 2 ) * s2;
1646 const double Rs1 = R_rot( 1, 0 ) * s0 + R_rot( 1, 1 ) * s1 + R_rot( 1, 2 ) * s2;
1647 const double Rs2 = R_rot( 2, 0 ) * s0 + R_rot( 2, 1 ) * s1 + R_rot( 2, 2 ) * s2;
1648
1649 const double v0 = kwJ * ( gg_loc + ONE_THIRD * Rg0 * Rg0 ) * Rs0;
1650 const double v1 = kwJ * ( gg_loc + ONE_THIRD * Rg1 * Rg1 ) * Rs1;
1651 const double v2 = kwJ * ( gg_loc + ONE_THIRD * Rg2 * Rg2 ) * Rs2;
1652
1653 dst8[0][u] += R_rot( 0, 0 ) * v0 + R_rot( 1, 0 ) * v1 + R_rot( 2, 0 ) * v2;
1654 dst8[1][u] += R_rot( 0, 1 ) * v0 + R_rot( 1, 1 ) * v1 + R_rot( 2, 1 ) * v2;
1655 dst8[2][u] += R_rot( 0, 2 ) * v0 + R_rot( 1, 2 ) * v1 + R_rot( 2, 2 ) * v2;
1656 };
1657
1658 if ( cmb_freeslip )
1659 {
1660 for ( int ni = 0; ni < 3; ++ni )
1661 apply_rotated_diag( ni, ni, lvl0 );
1662 }
1663 if ( surf_freeslip )
1664 {
1665 for ( int ni = 0; ni < 3; ++ni )
1666 apply_rotated_diag( ni, ni + 3, lvl0 + 1 );
1667 }
1668 }
1669 }
1670 }
1671
1672 // Test-side projection for free-slip (P A P) — normals loaded from shared memory.
1673 if ( !Diagonal && cmb_freeslip )
1674 {
1675 for ( int c = 0; c < 4; ++c )
1676 {
1677 const double nx = normals_sh( corner_node[c], 0 );
1678 const double ny = normals_sh( corner_node[c], 1 );
1679 const double nz = normals_sh( corner_node[c], 2 );
1680 const int u = CMB_CORNER_TO_UNIQUE[c];
1681 const double dot = nx * dst8[0][u] + ny * dst8[1][u] + nz * dst8[2][u];
1682 dst8[0][u] -= dot * nx;
1683 dst8[1][u] -= dot * ny;
1684 dst8[2][u] -= dot * nz;
1685 }
1686 }
1687 if ( !Diagonal && surf_freeslip )
1688 {
1689 for ( int c = 0; c < 4; ++c )
1690 {
1691 const double nx = normals_sh( corner_node[c], 0 );
1692 const double ny = normals_sh( corner_node[c], 1 );
1693 const double nz = normals_sh( corner_node[c], 2 );
1694 const int u = SURF_CORNER_TO_UNIQUE[c];
1695 const double dot = nx * dst8[0][u] + ny * dst8[1][u] + nz * dst8[2][u];
1696 dst8[0][u] -= dot * nx;
1697 dst8[1][u] -= dot * ny;
1698 dst8[2][u] -= dot * nz;
1699 }
1700 }
1701
1702 // Add back normal correction: Ann_acc[c] * u_n[c] * n_c.
1703 // u_n recomputed from original (unprojected) src in shared memory.
1704 if ( !Diagonal && cmb_freeslip )
1705 {
1706 for ( int c = 0; c < 4; ++c )
1707 {
1708 const int cn = corner_node[c];
1709 const double nx = normals_sh( cn, 0 );
1710 const double ny = normals_sh( cn, 1 );
1711 const double nz = normals_sh( cn, 2 );
1712 const double os0 = src_sh( cn, 0, lvl0 );
1713 const double os1 = src_sh( cn, 1, lvl0 );
1714 const double os2 = src_sh( cn, 2, lvl0 );
1715 const double u_n_val = nx * os0 + ny * os1 + nz * os2;
1716 const double corr = Ann_acc_cmb[c] * u_n_val;
1717 const int u = CMB_CORNER_TO_UNIQUE[c];
1718 dst8[0][u] += corr * nx;
1719 dst8[1][u] += corr * ny;
1720 dst8[2][u] += corr * nz;
1721 }
1722 }
1723 if ( !Diagonal && surf_freeslip )
1724 {
1725 for ( int c = 0; c < 4; ++c )
1726 {
1727 const int cn = corner_node[c];
1728 const double nx = normals_sh( cn, 0 );
1729 const double ny = normals_sh( cn, 1 );
1730 const double nz = normals_sh( cn, 2 );
1731 const double os0 = src_sh( cn, 0, lvl0 + 1 );
1732 const double os1 = src_sh( cn, 1, lvl0 + 1 );
1733 const double os2 = src_sh( cn, 2, lvl0 + 1 );
1734 const double u_n_val = nx * os0 + ny * os1 + nz * os2;
1735 const double corr = Ann_acc_surf[c] * u_n_val;
1736 const int u = SURF_CORNER_TO_UNIQUE[c];
1737 dst8[0][u] += corr * nx;
1738 dst8[1][u] += corr * ny;
1739 dst8[2][u] += corr * nz;
1740 }
1741 }
1742
1743 // Scatter accumulated hex-cell contributions to global memory.
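// Annotation (not original source): atomic adds are required because neighbouring hex
// cells, processed by other threads/teams, contribute to the same global nodes; the
// kernel assembles additively into dst_.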
1744#pragma unroll
1745 for ( int dim_add = 0; dim_add < 3; ++dim_add )
1746 {
1747 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell, dim_add ), dst8[dim_add][0] );
1748 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell, dim_add ), dst8[dim_add][1] );
1749 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell, dim_add ), dst8[dim_add][2] );
1750 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell + 1, dim_add ), dst8[dim_add][3] );
1751 Kokkos::atomic_add(
1752 &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1, dim_add ), dst8[dim_add][4] );
1753 Kokkos::atomic_add(
1754 &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][5] );
1755 Kokkos::atomic_add(
1756 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell, dim_add ), dst8[dim_add][6] );
1757 Kokkos::atomic_add(
1758 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][7] );
1759 }
1760
1761 } // end r_passes loop
1762 }
1763
1764 public:
1765 /**
1766 * @brief Legacy generic team operator.
1767 *
1768 * Kept for compatibility/debugging, but no longer used by apply_impl().
1769 * The host now dispatches directly to path-specific kernels.
1770 *
1771 * This function still works, but it reintroduces a branch on `kernel_path_`
1772 * and should therefore be avoided in performance-critical use.
1773 */
1774 KOKKOS_INLINE_FUNCTION
1775 void operator()( const Team& team ) const
1776 {
1777 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
1778 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
1779
1780 if ( tr >= r_tile_ )
1781 return;
1782
1783 if ( kernel_path_ == KernelPath::Slow )
1784 {
1785 for ( int pass = 0; pass < r_passes_; ++pass )
1786 {
1787 const int r_cell_pass = r0 + pass * r_tile_ + tr;
1788 if ( r_cell_pass >= hex_rad_ )
1789 break;
1790 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell_pass, CMB );
1791 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell_pass + 1, SURFACE );
1792 operator_slow_path(
1793 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell_pass, at_cmb, at_surface );
1794 }
1795 }
1796 else if ( kernel_path_ == KernelPath::FastFreeslip )
1797 {
1798 if ( diagonal_ )
1799 operator_fast_freeslip_path< true >(
1800 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
1801 else
1802 operator_fast_freeslip_path< false >(
1803 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
1804 }
1805 else
1806 {
1807 if ( diagonal_ )
1808 operator_fast_dirichlet_neumann_path< true >(
1809 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
1810 else
1811 operator_fast_dirichlet_neumann_path< false >(
1812 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell );
1813 }
1814 }
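
// Illustrative host-side dispatch sketch (annotation, not original source; apply_impl()
// is defined earlier in this class and is not reproduced here). The cached kernel_path_
// selects one of three launches, each of which only ever calls the matching
// operator_*_path member, so the hot kernel body carries no per-team branch. With
// hypothetical per-path functor wrappers this could look like:
//
//   const Kokkos::TeamPolicy<> policy( blocks_, team_size_ );
//   switch ( kernel_path_ )
//   {
//   case KernelPath::Slow:
//       Kokkos::parallel_for( "eps_divdiv_slow", policy, SlowPathFunctor{ *this } );
//       break;
//   case KernelPath::FastFreeslip:
//       Kokkos::parallel_for( "eps_divdiv_freeslip", policy, FreeslipPathFunctor{ *this } );
//       break;
//   default:
//       Kokkos::parallel_for( "eps_divdiv_dn", policy, DirichletNeumannPathFunctor{ *this } );
//   }
//
// SlowPathFunctor, FreeslipPathFunctor and DirichletNeumannPathFunctor are placeholder
// names for thin wrappers around operator_slow_path(), operator_fast_freeslip_path<>()
// and operator_fast_dirichlet_neumann_path<>().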
1815
1816 KOKKOS_INLINE_FUNCTION void assemble_trial_test_vecs(
1817 const int wedge,
1818 const dense::Vec< ScalarType, VecDim >& quad_point,
1819 const ScalarType quad_weight,
1820 const ScalarT r_1,
1821 const ScalarT r_2,
1822 dense::Vec< ScalarT, 3 > ( *wedge_phy_surf )[3],
1823 const dense::Vec< ScalarT, 6 >* k_local_hex,
1824 const int dimi,
1825 const int dimj,
1826 dense::Mat< ScalarType, VecDim, VecDim >* sym_grad_i,
1827 dense::Mat< ScalarType, VecDim, VecDim >* sym_grad_j,
1828 ScalarType& jdet_keval_quadweight ) const
1829 {
1830 dense::Mat< ScalarType, VecDim, VecDim > J = jac( wedge_phy_surf[wedge], r_1, r_2, quad_point );
1831 const auto det = J.det();
1832 const auto abs_det = Kokkos::abs( det );
1833 const dense::Mat< ScalarType, VecDim, VecDim > J_inv_transposed = J.inv_transposed( det );
1834
1835 ScalarType k_eval = 0.0;
1836 for ( int k = 0; k < num_nodes_per_wedge; k++ )
1837 {
1838 k_eval += shape( k, quad_point ) * k_local_hex[wedge]( k );
1839 }
1840
1841 for ( int k = 0; k < num_nodes_per_wedge; k++ )
1842 {
1843 sym_grad_i[k] = symmetric_grad( J_inv_transposed, quad_point, k, dimi );
1844 sym_grad_j[k] = symmetric_grad( J_inv_transposed, quad_point, k, dimj );
1845 }
1846
1847 jdet_keval_quadweight = quad_weight * k_eval * abs_det;
1848 }
1849
1850 /**
1851 * @brief Assemble one wedge-local 18x18 matrix (slow path / on-demand assembly).
1852 */
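// Annotation (not original source): the quadrature loop below integrates, per wedge and
// per (dimi, dimj) block, the bilinear form
//   a(u, v) = integral of k * ( 2 * eps(u) : eps(v) - (2/3) * div(u) * div(v) ) dx,
// with eps the symmetric gradient. This matches the integrand
//   2 * sym_grad_j : sym_grad_i - 2/3 * sym_grad_j(dimj,dimj) * sym_grad_i(dimi,dimi)
// accumulated into the 18x18 local matrix A.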
1853 KOKKOS_INLINE_FUNCTION
1854 dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > assemble_local_matrix(
1855 const int local_subdomain_id,
1856 const int x_cell,
1857 const int y_cell,
1858 const int r_cell,
1859 const int wedge ) const
1860 {
1861 dense::Vec< ScalarT, 3 > wedge_phy_surf[num_wedges_per_hex_cell][num_nodes_per_wedge_surface];
1862 wedge_surface_physical_coords( wedge_phy_surf, grid_, local_subdomain_id, x_cell, y_cell );
1863
1864 const ScalarT r_1 = radii_( local_subdomain_id, r_cell );
1865 const ScalarT r_2 = radii_( local_subdomain_id, r_cell + 1 );
1866
1867 dense::Vec< ScalarT, 6 > k_local_hex[2];
1868 extract_local_wedge_scalar_coefficients( k_local_hex, local_subdomain_id, x_cell, y_cell, r_cell, k_ );
1869
1870 dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > A; // wedge-local 18x18 matrix accumulated below
1871
1872 for ( int dimi = 0; dimi < 3; ++dimi )
1873 {
1874 for ( int dimj = 0; dimj < 3; ++dimj )
1875 {
1876 for ( int q = 0; q < num_quad_points; q++ )
1877 {
1878 dense::Mat< ScalarType, VecDim, VecDim > sym_grad_i[num_nodes_per_wedge];
1879 dense::Mat< ScalarType, VecDim, VecDim > sym_grad_j[num_nodes_per_wedge];
1880 ScalarType jdet_keval_quadweight = 0;
1881
1882 assemble_trial_test_vecs(
1883 wedge,
1884 quad_points[q],
1885 quad_weights[q],
1886 r_1,
1887 r_2,
1888 wedge_phy_surf,
1889 k_local_hex,
1890 dimi,
1891 dimj,
1892 sym_grad_i,
1893 sym_grad_j,
1894 jdet_keval_quadweight );
1895
1896 for ( int i = 0; i < num_nodes_per_wedge; i++ )
1897 {
1898 for ( int j = 0; j < num_nodes_per_wedge; j++ )
1899 {
1900 A( i + dimi * num_nodes_per_wedge, j + dimj * num_nodes_per_wedge ) +=
1901 jdet_keval_quadweight *
1902 ( 2 * sym_grad_j[j].double_contract( sym_grad_i[i] ) -
1903 2.0 / 3.0 * sym_grad_j[j]( dimj, dimj ) * sym_grad_i[i]( dimi, dimi ) );
1904 }
1905 }
1906 }
1907 }
1908 }
1909
1910 return A;
1911 }
1912};
1913
1916
1917} // namespace terra::fe::wedge::operators::shell::epsdivdiv_history
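
// Usage sketch (annotation, not original source; argument values are hypothetical, the
// constructor and member signatures are the ones declared in this class):
//
//   using Op = terra::fe::wedge::operators::shell::epsdivdiv_history::
//       EpsilonDivDivKerngenV10SeqRpasses< double >;
//
//   Op op( domain, grid, radii, mask, k, bcs, /*diagonal=*/false );
//   op.set_boundary_conditions( bcs );
//   op.apply_impl( src, dst ); // dst <- Op(src), or additive depending on the apply mode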