Loading...
Searching...
No Matches
epsilon_divdiv_kerngen.hpp
Go to the documentation of this file.
1#pragma once
2
3#include "../../quadrature/quadrature.hpp"
6#include "dense/vec.hpp"
10#include "impl/Kokkos_Profiling.hpp"
11#include "linalg/operator.hpp"
14#include "linalg/vector.hpp"
15#include "linalg/vector_q1.hpp"
16#include "util/timer.hpp"
17
19
30
31/**
32 * @brief Matrix-free / matrix-based epsilon-div-div operator on wedge elements in a spherical shell.
33 *
34 * This class supports three execution paths:
35 *
36 * 1) Slow path (local matrix path)
37 * - Used if local matrices are stored (full or selective)
38 * - Reuses assembled/stored 18x18 local matrices
39 *
40 * 2) Fast path: Dirichlet/Neumann
41 * - Matrix-free
42 * - Handles Dirichlet by skipping constrained face-node couplings (and diagonal treatment if requested)
43 * - Neumann is naturally included
44 *
45 * 3) Fast path: Free-slip
46 * - Matrix-free
47 * - Applies trial and test-side tangential projection on free-slip boundaries
48 * - Preserves behavior consistent with the slow path for iterative solves
49 *
50 * IMPORTANT DESIGN CHANGE:
51 * ------------------------
52 * The path decision is made on the HOST and cached in `kernel_path_`.
53 * `apply_impl()` dispatches to a different kernel launch per path.
54 * This avoids a runtime branch inside the hot device kernel.
55 */
56template < typename ScalarT, int VecDim = 3 >
// NOTE(review): the class-name line (doxygen line 57) is lost in this
// extraction; the log messages below identify the operator as
// "[EpsilonDivDiv]" — confirm the exact class name against the original header.
58{
59 public:
// Scalar type used for all fields and element-local matrices.
62 using ScalarType = ScalarT;
// 18 = num_nodes_per_wedge (6) x 3 velocity components (see the slow path's
// dimj * num_nodes_per_wedge + i indexing).
63 static constexpr int LocalMatrixDim = 18;
// Kokkos team handle type used by every team-level kernel below.
66 using Team = Kokkos::TeamPolicy<>::member_type;
67
68 private:
69 // Optional element-local matrix storage (GCA/coarsening/explicit local-matrix mode)
70 LocalMatrixStorage local_matrix_storage_;
71
72 // Domain and geometry / coefficients
// NOTE(review): the declaration of domain_ (doxygen line 73, initialized in
// the constructor and used via domain_.domain_info()) is missing here.
74 grid::Grid3DDataVec< ScalarT, 3 > grid_; ///< Lateral shell geometry (unit sphere coords)
75 grid::Grid2DDataScalar< ScalarT > radii_; ///< Radial coordinates per local subdomain
76 grid::Grid4DDataScalar< ScalarType > k_; ///< Scalar coefficient field
77 grid::Grid4DDataScalar< grid::shell::ShellBoundaryFlag > mask_; ///< Boundary flags per cell/node
78 BoundaryConditions bcs_; ///< CMB and SURFACE boundary conditions
79
80 bool diagonal_; ///< If true, apply diagonal-only action
81
82 linalg::OperatorApplyMode operator_apply_mode_;
83 linalg::OperatorCommunicationMode operator_communication_mode_;
84 linalg::OperatorStoredMatrixMode operator_stored_matrix_mode_;
85
// NOTE(review): doxygen lines 86-88 are missing — presumably the declarations
// of recv_buffers_ and comm_plan_ (both initialized in the constructor and
// used by the halo exchange in apply_impl()). Confirm against the original.
88
89 // Device views captured by kernels
// NOTE(review): doxygen lines 90-92 are missing — presumably the src_/dst_
// grid-data views assigned in apply_impl() and read by the device kernels.
92
93 // Quadrature (Felippa 1x1 on wedge)
94 const int num_quad_points = quadrature::quad_felippa_1x1_num_quad_points;
// NOTE(review): doxygen lines 95-97 (likely the quadrature point/weight
// members) are missing from this extraction.
97
98 // Local subdomain extents (in cells)
99 int local_subdomains_;
100 int hex_lat_;
101 int hex_rad_;
102 int lat_refinement_level_;
103
104 // Team-policy tiling (one team handles a slab lat_tile x lat_tile x r_tile cells)
105 int lat_tile_;
106 int r_tile_;
107 int lat_tiles_;
108 int r_tiles_;
109 int team_size_;
110 int blocks_;
111
// Radial extent of the shell, taken from DomainInfo in the constructor.
112 ScalarT r_max_;
113 ScalarT r_min_;
114
115 /**
116 * @brief Kernel path selected on host.
117 *
118 * This value is computed whenever BCs or stored-matrix mode change and is then
119 * used by `apply_impl()` to select which kernel launch to issue.
120 */
121 enum class KernelPath
122 {
123 Slow,
124 FastDirichletNeumann,
125 FastFreeslip,
126 };
127 KernelPath kernel_path_ = KernelPath::FastDirichletNeumann;
128
129 private:
130 /**
131 * @brief Recompute the kernel path on host.
132 *
133 * Rules:
134 * - Any stored local matrix mode => slow path
135 * - Else if any free-slip BC on CMB or SURFACE => fast free-slip path
136 * - Else => fast Dirichlet/Neumann path
137 */
138 void update_kernel_path_flag_host_only()
139 {
140 const BoundaryConditionFlag cmb_bc = get_boundary_condition_flag( bcs_, CMB );
141 const BoundaryConditionFlag surface_bc = get_boundary_condition_flag( bcs_, SURFACE );
142
143 const bool has_freeslip = ( cmb_bc == FREESLIP ) || ( surface_bc == FREESLIP );
144 const bool has_stored_matrices = ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off );
145
146 if ( has_stored_matrices )
147 kernel_path_ = KernelPath::Slow;
148 else if ( has_freeslip )
149 kernel_path_ = KernelPath::FastFreeslip;
150 else
151 kernel_path_ = KernelPath::FastDirichletNeumann;
152 }
153
154 public:
// NOTE(review): operator constructor. Several signature lines (doxygen lines
// 155, 157-160, 163, 165-166) are lost in this extraction — the missing
// parameters presumably include grid, radii, k, mask, operator_apply_mode and
// operator_stored_matrix_mode (all consumed in the initializer list below);
// confirm against the original header.
156 const grid::shell::DistributedDomain& domain,
161 BoundaryConditions bcs,
162 bool diagonal,
164 linalg::OperatorCommunicationMode operator_communication_mode =
167 : domain_( domain )
168 , grid_( grid )
169 , radii_( radii )
170 , k_( k )
171 , mask_( mask )
172 , diagonal_( diagonal )
173 , operator_apply_mode_( operator_apply_mode )
174 , operator_communication_mode_( operator_communication_mode )
175 , operator_stored_matrix_mode_( operator_stored_matrix_mode )
176 , recv_buffers_( domain )
177 , comm_plan_( domain )
178 {
// Copy both boundary-condition entries (index 0/1: the two shell boundaries).
179 bcs_[0] = bcs[0];
180 bcs_[1] = bcs[1];
181
// NOTE(review): doxygen lines 182-184 are missing here — content unknown.
184
// Derive cell extents from node counts (nodes - 1 cells per direction).
185 const grid::shell::DomainInfo& domain_info = domain_.domain_info();
186 local_subdomains_ = domain_.subdomains().size();
187 hex_lat_ = domain_info.subdomain_num_nodes_per_side_laterally() - 1;
188 hex_rad_ = domain_info.subdomain_num_nodes_radially() - 1;
189 lat_refinement_level_ = domain_info.diamond_lateral_refinement_level();
190
// Fixed tiling: each team handles a 4x4 lateral by 8 radial cell slab.
191 lat_tile_ = 4;
192 r_tile_ = 8;
193
// Ceil-divide the subdomain extents into tiles.
194 lat_tiles_ = ( hex_lat_ + lat_tile_ - 1 ) / lat_tile_;
195 r_tiles_ = ( hex_rad_ + r_tile_ - 1 ) / r_tile_;
196
197 team_size_ = lat_tile_ * lat_tile_ * r_tile_;
198 blocks_ = local_subdomains_ * lat_tiles_ * lat_tiles_ * r_tiles_;
199
// Shell radial extent: first and last entries of the radii array.
200 r_min_ = domain_info.radii()[0];
201 r_max_ = domain_info.radii()[domain_info.radii().size() - 1];
202
203 // Host-side path selection (no in-kernel path branching)
204 update_kernel_path_flag_host_only();
205
206 util::logroot << "[EpsilonDivDiv] tile size (x,y,r)=(" << lat_tile_ << "," << lat_tile_ << "," << r_tile_ << ")"
207 << std::endl;
208 util::logroot << "[EpsilonDivDiv] number of tiles (x,y,r)=(" << lat_tiles_ << "," << lat_tiles_ << ","
209 << r_tiles_ << "), team_size=" << team_size_ << ", blocks=" << blocks_ << std::endl;
210 const char* path_name = ( kernel_path_ == KernelPath::Slow ) ? "slow" :
211 ( kernel_path_ == KernelPath::FastFreeslip ) ? "fast-freeslip" :
212 "fast-dirichlet-neumann";
213 util::logroot << "[EpsilonDivDiv] kernel path = " << path_name << std::endl;
214 }
215
// NOTE(review): mode setter — the function-name line (doxygen line 216) is
// missing from this extraction; presumably something like
// "void set_operator_modes(". Confirm against the original header.
// Unlike set_boundary_conditions()/set_stored_matrix_mode(), this does not
// refresh kernel_path_: the path depends only on BCs and stored-matrix mode.
217 const linalg::OperatorApplyMode operator_apply_mode,
218 const linalg::OperatorCommunicationMode operator_communication_mode )
219 {
220 operator_apply_mode_ = operator_apply_mode;
221 operator_communication_mode_ = operator_communication_mode;
222 }
223
224 void set_diagonal( bool v ) { diagonal_ = v; }
225
226 void set_boundary_conditions( BoundaryConditions bcs )
227 {
228 bcs_[0] = bcs[0];
229 bcs_[1] = bcs[1];
230
231 // Recompute host-side dispatch mode whenever BCs change
232 update_kernel_path_flag_host_only();
233 }
234
/// @brief Read-only access to the distributed domain this operator was built on.
236 const grid::shell::DistributedDomain& get_domain() const { return domain_; }
// NOTE(review): doxygen lines 237-239 are missing from this extraction.
239
240 KOKKOS_INLINE_FUNCTION
// NOTE(review): the return-type/name line (doxygen line 241) and the final
// parameter line (246, presumably "const ShellBoundaryFlag flag ) const") are
// missing here; the body tests `flag` against the cell's boundary mask, so
// this is presumably "bool has_flag( ... )". Confirm against the original.
242 const int local_subdomain_id,
243 const int x_cell,
244 const int y_cell,
245 const int r_cell,
247 {
// True iff `flag` is set in the boundary-mask entry for this index.
248 return util::has_flag( mask_( local_subdomain_id, x_cell, y_cell, r_cell ), flag );
249 }
250
// NOTE(review): stored-matrix-mode setter — the name line (doxygen line 251)
// and one parameter line (254) are missing from this extraction.
// get_stored_matrix_mode() below returns the value assigned here.
252 linalg::OperatorStoredMatrixMode operator_stored_matrix_mode,
253 int level_range,
255 {
256 operator_stored_matrix_mode_ = operator_stored_matrix_mode;
257
// Set up element-local matrix storage whenever any storage mode is enabled.
258 if ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off )
259 {
// NOTE(review): doxygen line 260 is missing — presumably the start of the
// storage construction, e.g. "local_matrix_storage_ = LocalMatrixStorage(".
261 domain_, operator_stored_matrix_mode_, level_range, GCAElements );
262 }
263
264 // Recompute host-side dispatch mode whenever storage mode changes
265 update_kernel_path_flag_host_only();
266
267 util::logroot << "[EpsilonDivDiv] (set_stored_matrix_mode) kernel path = "
268 << (( kernel_path_ == KernelPath::Slow ) ? "slow" :
269 ( kernel_path_ == KernelPath::FastFreeslip ) ? "fast-freeslip" :
270 "fast-dirichlet-neumann")
271 << std::endl;
272 }
273
274 linalg::OperatorStoredMatrixMode get_stored_matrix_mode() { return operator_stored_matrix_mode_; }
275
276 KOKKOS_INLINE_FUNCTION
// NOTE(review): the name line (doxygen line 277) and the matrix parameter
// line (283) are missing; this stores an element-local matrix `mat` for the
// given wedge — presumably "void set_matrix( ..., const dense::Mat< ... >& mat )".
278 const int local_subdomain_id,
279 const int x_cell,
280 const int y_cell,
281 const int r_cell,
282 const int wedge,
284 {
// Storing matrices is only legal when a stored-matrix mode is active.
285 KOKKOS_ASSERT( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off );
286 local_matrix_storage_.set_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge, mat );
287 }
288
289 KOKKOS_INLINE_FUNCTION
// NOTE(review): the return-type/name line (doxygen line 290) is missing; this
// returns the element-local matrix for one wedge — the stored one when storage
// is active (aborting if absent), otherwise assembled on the fly.
291 const int local_subdomain_id,
292 const int x_cell,
293 const int y_cell,
294 const int r_cell,
295 const int wedge ) const
296 {
297 if ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off )
298 {
// With storage enabled, a missing matrix at this index is a hard error.
299 if ( !local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge ) )
300 {
301 Kokkos::abort( "No matrix found at that spatial index." );
302 }
303 return local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge );
304 }
// No storage: assemble the wedge matrix from geometry/coefficients on demand.
305 return assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge );
306 }
307
308 /**
309 * @brief Apply the operator: dst <- Op(src) (or additive, depending on mode).
310 *
311 * Host-side responsibilities:
312 * - Handle replace/add mode initialization
313 * - Cache src/dst views for kernel capture
314 * - Build team policy
315 * - Dispatch to exactly one kernel variant based on `kernel_path_`
316 * - Optional halo communication accumulation
317 *
318 * This is where the path decision now lives (host), so device code does not branch
319 * on `kernel_path_` in the hot path.
320 */
321 void apply_impl( const SrcVectorType& src, DstVectorType& dst )
322 {
323 util::Timer timer_apply( "epsilon_divdiv_apply" );
324
325 if ( operator_apply_mode_ == linalg::OperatorApplyMode::Replace )
326 {
327 assign( dst, 0 );
328 }
329
330 dst_ = dst.grid_data();
331 src_ = src.grid_data();
332
333 util::Timer timer_kernel( "epsilon_divdiv_kernel" );
334 Kokkos::TeamPolicy<> policy( blocks_, team_size_ );
335
336 // Fast paths use shared-memory slab staging
337 if ( kernel_path_ != KernelPath::Slow )
338 {
339 policy.set_scratch_size( 0, Kokkos::PerTeam( team_shmem_size( team_size_ ) ) );
340 }
341
342 // Host-side dispatch => no per-team path branching in device code
343 if ( kernel_path_ == KernelPath::Slow )
344 {
345 Kokkos::parallel_for(
346 "epsilon_divdiv_apply_kernel_slow", policy, KOKKOS_CLASS_LAMBDA( const Team& team ) {
347 this->run_team_slow( team );
348 } );
349 }
350 else if ( kernel_path_ == KernelPath::FastFreeslip )
351 {
352 Kokkos::parallel_for(
353 "epsilon_divdiv_apply_kernel_fast_freeslip", policy, KOKKOS_CLASS_LAMBDA( const Team& team ) {
354 this->run_team_fast_freeslip( team );
355 } );
356 }
357 else
358 {
359 Kokkos::parallel_for(
360 "epsilon_divdiv_apply_kernel_fast_dirichlet_neumann",
361 policy,
362 KOKKOS_CLASS_LAMBDA( const Team& team ) { this->run_team_fast_dirichlet_neumann( team ); } );
363 }
364
365 Kokkos::fence();
366 timer_kernel.stop();
367
368 if ( operator_communication_mode_ == linalg::OperatorCommunicationMode::CommunicateAdditively )
369 {
370 util::Timer timer_comm( "epsilon_divdiv_comm" );
371 terra::communication::shell::send_recv_with_plan( comm_plan_, dst_, recv_buffers_ );
372 }
373 }
374
375 /**
376 * @brief Convert one gradient column into symmetric-gradient and div contributions.
377 *
378 * Given grad(phi e_dim), this helper computes:
379 * - diagonal entries (E00,E11,E22)
380 * - off-diagonal symmetric entries (sym01,sym02,sym12)
381 * - divergence contribution `gdd`
382 *
383 * The fast kernels use this repeatedly in fused operator application.
384 */
385 KOKKOS_INLINE_FUNCTION
387 const int dim,
388 const double g0,
389 const double g1,
390 const double g2,
391 double& E00,
392 double& E11,
393 double& E22,
394 double& sym01,
395 double& sym02,
396 double& sym12,
397 double& gdd ) const
398 {
399 E00 = E11 = E22 = sym01 = sym02 = sym12 = gdd = 0.0;
400
401 switch ( dim )
402 {
403 case 0:
404 E00 = g0;
405 gdd = g0;
406 sym01 = 0.5 * g1;
407 sym02 = 0.5 * g2;
408 break;
409 case 1:
410 E11 = g1;
411 gdd = g1;
412 sym01 = 0.5 * g0;
413 sym12 = 0.5 * g2;
414 break;
415 default:
416 E22 = g2;
417 gdd = g2;
418 sym02 = 0.5 * g0;
419 sym12 = 0.5 * g1;
420 break;
421 }
422 }
423
424 /**
425 * @brief Team scratch memory size for fast paths.
426 *
427 * Layout per team:
428 * [coords_sh | src_sh | k_sh | r_sh | padding]
429 */
430 KOKKOS_INLINE_FUNCTION
431 size_t team_shmem_size( const int /*ts*/ ) const
432 {
433 const int nlev = r_tile_ + 1;
434 const int n = lat_tile_ + 1;
435 const int nxy = n * n;
436
437 const size_t nscalars =
438 size_t( nxy ) * 3 + size_t( nxy ) * 3 * nlev + size_t( nxy ) * nlev + size_t( nlev ) + 1;
439
440 return sizeof( ScalarType ) * nscalars;
441 }
442
443 private:
444 /**
445 * @brief Decode a team league rank / team rank into:
446 * - tile origin (x0,y0,r0)
447 * - intra-tile thread coordinates (tx,ty,tr)
448 * - target cell (x_cell,y_cell,r_cell)
449 */
450 KOKKOS_INLINE_FUNCTION
451 void decode_team_indices(
452 const Team& team,
453 int& local_subdomain_id,
454 int& x0,
455 int& y0,
456 int& r0,
457 int& tx,
458 int& ty,
459 int& tr,
460 int& x_cell,
461 int& y_cell,
462 int& r_cell ) const
463 {
464 int tmp = team.league_rank();
465
466 const int r_tile_id = tmp % r_tiles_;
467 tmp /= r_tiles_;
468
469 const int lat_y_id = tmp % lat_tiles_;
470 tmp /= lat_tiles_;
471
472 const int lat_x_id = tmp % lat_tiles_;
473 tmp /= lat_tiles_;
474
475 local_subdomain_id = tmp;
476
477 x0 = lat_x_id * lat_tile_;
478 y0 = lat_y_id * lat_tile_;
479 r0 = r_tile_id * r_tile_;
480
481 const int tid = team.team_rank();
482 tx = tid % lat_tile_;
483 ty = ( tid / lat_tile_ ) % lat_tile_;
484 tr = tid / ( lat_tile_ * lat_tile_ );
485
486 x_cell = x0 + tx;
487 y_cell = y0 + ty;
488 r_cell = r0 + tr;
489 }
490
491 // -------------------------------------------------------------------------
492 // Path-specific team entry points (NO path branching)
493 // These are the functions called directly by host-dispatched kernel launches.
494 // -------------------------------------------------------------------------
495
496 /**
497 * @brief Team entry for slow (local-matrix) path.
498 *
499 * This wrapper performs only:
500 * - team index decoding
501 * - boundary flag queries
502 * - forwarding to operator_slow_path()
503 *
504 * No dispatch branching happens here.
505 */
506 KOKKOS_INLINE_FUNCTION
507 void run_team_slow( const Team& team ) const
508 {
509 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
510 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
511
512 if ( tr >= r_tile_ )
513 return;
514
515 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
516 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
517
518 operator_slow_path(
519 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
520 }
521
522 /**
523 * @brief Team entry for fast Dirichlet/Neumann matrix-free path.
524 */
525 KOKKOS_INLINE_FUNCTION
526 void run_team_fast_dirichlet_neumann( const Team& team ) const
527 {
528 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
529 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
530
531 if ( tr >= r_tile_ )
532 return;
533
534 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
535 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
536
537 operator_fast_dirichlet_neumann_path(
538 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
539 }
540
541 /**
542 * @brief Team entry for fast free-slip matrix-free path.
543 */
544 KOKKOS_INLINE_FUNCTION
545 void run_team_fast_freeslip( const Team& team ) const
546 {
547 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
548 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
549
550 if ( tr >= r_tile_ )
551 return;
552
553 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
554 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
555
556 operator_fast_freeslip_path(
557 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
558 }
559
560 // ===================== SLOW PATH =====================
// Applies the operator for one hex cell (two wedges) via explicit 18x18
// element-local matrices: stored, selectively stored, or assembled on the fly.
561 KOKKOS_INLINE_FUNCTION
562 void operator_slow_path(
563 const Team& team,
564 const int local_subdomain_id,
565 const int x0,
566 const int y0,
567 const int r0,
568 const int tx,
569 const int ty,
570 const int tr,
571 const int x_cell,
572 const int y_cell,
573 const int r_cell,
574 const bool at_cmb,
575 const bool at_surface ) const
576 {
// Tile/team coordinates are unused on this path; silence warnings.
577 (void) team;
578 (void) x0;
579 (void) y0;
580 (void) r0;
581 (void) tx;
582 (void) ty;
583 (void) tr;
584
// Cells padded beyond the subdomain extents do nothing.
585 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ || r_cell >= hex_rad_ )
586 return;
587
// NOTE(review): doxygen line 588 is missing — presumably the declaration of
// the per-wedge local-matrix array, e.g.
// "dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > A[num_wedges_per_hex_cell];".
589
// Fetch or assemble the two wedge matrices depending on the storage mode.
590 if ( operator_stored_matrix_mode_ == linalg::OperatorStoredMatrixMode::Full )
591 {
592 A[0] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
593 A[1] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
594 }
595 else if ( operator_stored_matrix_mode_ == linalg::OperatorStoredMatrixMode::Selective )
596 {
// Selective mode: use stored matrices only when both wedges are available.
597 if ( local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 ) &&
598 local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 ) )
599 {
600 A[0] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
601 A[1] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
602 }
603 else
604 {
605 A[0] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
606 A[1] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
607 }
608 }
609 else
610 {
611 A[0] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
612 A[1] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
613 }
614
// Gather the 18 local src coefficients per wedge, in component-major layout
// (index = dimj * num_nodes_per_wedge + node).
615 dense::Vec< ScalarT, 18 > src[num_wedges_per_hex_cell];
616 for ( int dimj = 0; dimj < 3; dimj++ )
617 {
618 dense::Vec< ScalarT, 6 > src_d[num_wedges_per_hex_cell];
619 extract_local_wedge_vector_coefficients( src_d, local_subdomain_id, x_cell, y_cell, r_cell, dimj, src_ );
620
621 for ( int wedge = 0; wedge < num_wedges_per_hex_cell; wedge++ )
622 {
623 for ( int i = 0; i < num_nodes_per_wedge; i++ )
624 {
625 src[wedge]( dimj * num_nodes_per_wedge + i ) = src_d[wedge]( i );
626 }
627 }
628 }
629
// Multiplicative mask applied entrywise to A; starts all-ones (no masking).
630 dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > boundary_mask;
631 boundary_mask.fill( 1.0 );
632
633 bool freeslip_reorder = false;
634 dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > R[num_wedges_per_hex_cell];
635
636 if ( at_cmb || at_surface )
637 {
638 ShellBoundaryFlag sbf = at_cmb ? CMB : SURFACE;
// NOTE(review): doxygen line 639 is missing — presumably
// "const BoundaryConditionFlag bcf = get_boundary_condition_flag( bcs_, sbf );"
// (bcf is used immediately below).
640
641 if ( bcf == DIRICHLET )
642 {
// Dirichlet: zero all couplings that involve a boundary-layer node except the
// per-node, per-component diagonal entries (dimi == dimj and i == j survive).
// Nodes 0-2 are the lower (CMB-side) face, nodes 3-5 the upper (SURFACE-side).
643 for ( int dimi = 0; dimi < 3; ++dimi )
644 {
645 for ( int dimj = 0; dimj < 3; ++dimj )
646 {
647 for ( int i = 0; i < num_nodes_per_wedge; i++ )
648 {
649 for ( int j = 0; j < num_nodes_per_wedge; j++ )
650 {
651 if ( ( at_cmb && ( ( dimi == dimj && i != j && ( i < 3 || j < 3 ) ) ||
652 ( dimi != dimj && ( i < 3 || j < 3 ) ) ) ) ||
653 ( at_surface && ( ( dimi == dimj && i != j && ( i >= 3 || j >= 3 ) ) ||
654 ( dimi != dimj && ( i >= 3 || j >= 3 ) ) ) ) )
655 {
656 boundary_mask( i + dimi * num_nodes_per_wedge, j + dimj * num_nodes_per_wedge ) =
657 0.0;
658 }
659 }
660 }
661 }
662 }
663 }
664 else if ( bcf == FREESLIP )
665 {
665 freeslip_reorder = true;
// Reorder A into node-major interleaved layout (index = node * 3 + dim) so
// that per-node 3x3 rotation blocks can be applied.
667 dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > A_tmp[num_wedges_per_hex_cell] = { 0 };
668
669 for ( int wedge = 0; wedge < 2; ++wedge )
670 {
671 for ( int dimi = 0; dimi < 3; ++dimi )
672 {
673 for ( int node_idxi = 0; node_idxi < num_nodes_per_wedge; node_idxi++ )
674 {
675 for ( int dimj = 0; dimj < 3; ++dimj )
676 {
677 for ( int node_idxj = 0; node_idxj < num_nodes_per_wedge; node_idxj++ )
678 {
679 A_tmp[wedge]( node_idxi * 3 + dimi, node_idxj * 3 + dimj ) = A[wedge](
680 node_idxi + dimi * num_nodes_per_wedge,
681 node_idxj + dimj * num_nodes_per_wedge );
682 }
683 }
684 }
685 }
// NOTE(review): doxygen line 686 is missing here (content unknown).
687 }
688
// Per-wedge lateral node offsets of the three boundary-face nodes.
689 constexpr int layer_hex_offset_x[2][3] = { { 0, 1, 0 }, { 1, 0, 1 } };
690 constexpr int layer_hex_offset_y[2][3] = { { 0, 0, 1 }, { 1, 1, 0 } };
691
692 for ( int wedge = 0; wedge < 2; ++wedge )
693 {
// R starts as identity; rotated 3x3 blocks are inserted for boundary nodes.
694 for ( int i = 0; i < LocalMatrixDim; ++i )
695 {
696 R[wedge]( i, i ) = 1.0;
697 }
698
699 for ( int boundary_node_idx = 0; boundary_node_idx < 3; boundary_node_idx++ )
700 {
// The outward direction at a shell boundary node is its position vector.
701 dense::Vec< double, 3 > normal = grid::shell::coords(
702 local_subdomain_id,
703 x_cell + layer_hex_offset_x[wedge][boundary_node_idx],
704 y_cell + layer_hex_offset_y[wedge][boundary_node_idx],
705 r_cell + ( at_cmb ? 0 : 1 ),
706 grid_,
707 radii_ );
708
709 auto R_i = trafo_mat_cartesian_to_normal_tangential( normal );
710
// Boundary nodes occupy rows/cols 0-8 (CMB, lower face) or 9-17 (SURFACE).
711 const int offset_in_R = at_cmb ? 0 : 9;
712 for ( int dimi = 0; dimi < 3; ++dimi )
713 {
714 for ( int dimj = 0; dimj < 3; ++dimj )
715 {
716 R[wedge](
717 offset_in_R + boundary_node_idx * 3 + dimi,
718 offset_in_R + boundary_node_idx * 3 + dimj ) = R_i( dimi, dimj );
719 }
720 }
721 }
722
// Rotate the (node-major) matrix into the normal-tangential frame.
723 A[wedge] = R[wedge] * A_tmp[wedge] * R[wedge].transposed();
724
// NOTE(review): src is filled above in component-major layout while R acts on
// node-major indices — verify against the original source (the missing lines
// 777-778 below may hold a layout conversion) that this is intentional.
725 auto src_tmp = R[wedge] * src[wedge];
726 for ( int i = 0; i < 18; ++i )
727 {
728 src[wedge]( i ) = src_tmp( i );
729 }
730
// Decouple the rotated normal component (index node*3) of each boundary node.
731 const int node_start = at_surface ? 3 : 0;
732 const int node_end = at_surface ? 6 : 3;
733
734 for ( int node_idx = node_start; node_idx < node_end; node_idx++ )
735 {
736 const int idx = node_idx * 3;
737 for ( int k = 0; k < 18; ++k )
738 {
739 if ( k != idx )
740 {
741 boundary_mask( idx, k ) = 0.0;
742 boundary_mask( k, idx ) = 0.0;
743 }
744 }
745 }
746 }
747 }
748 }
749
// Apply the boundary mask entrywise to both wedge matrices.
750 for ( int wedge = 0; wedge < num_wedges_per_hex_cell; wedge++ )
751 {
752 A[wedge].hadamard_product( boundary_mask );
753 }
754
// Diagonal-only mode keeps just the matrix diagonal before applying.
755 if ( diagonal_ )
756 {
757 A[0] = A[0].diagonal();
758 A[1] = A[1].diagonal();
759 }
760
761 dense::Vec< ScalarT, LocalMatrixDim > dst[num_wedges_per_hex_cell];
762 dst[0] = A[0] * src[0];
763 dst[1] = A[1] * src[1];
764
// For free-slip, rotate the result back from the normal-tangential frame.
765 if ( freeslip_reorder )
766 {
767 dense::Vec< ScalarT, LocalMatrixDim > dst_tmp[num_wedges_per_hex_cell];
768 dst_tmp[0] = R[0].transposed() * dst[0];
769 dst_tmp[1] = R[1].transposed() * dst[1];
770
771 for ( int i = 0; i < 18; ++i )
772 {
773 dst[0]( i ) = dst_tmp[0]( i );
774 dst[1]( i ) = dst_tmp[1]( i );
775 }
776
// NOTE(review): doxygen lines 777-778 are missing here (content unknown).
779 }
780
// Scatter the per-component slices of the wedge results back into dst_.
781 for ( int dimi = 0; dimi < 3; dimi++ )
782 {
783 dense::Vec< ScalarT, 6 > dst_d[num_wedges_per_hex_cell];
784 dst_d[0] = dst[0].template slice< 6 >( dimi * num_nodes_per_wedge );
785 dst_d[1] = dst[1].template slice< 6 >( dimi * num_nodes_per_wedge );
786
// NOTE(review): doxygen line 787 (the call's name line — presumably the
// counterpart of extract_local_wedge_vector_coefficients, e.g.
// "add_local_wedge_vector_coefficients(") is missing here.
788 dst_, local_subdomain_id, x_cell, y_cell, r_cell, dimi, dst_d );
789 }
790 }
791
792 // ===================== FAST DIRICHLET/NEUMANN PATH =====================
793 KOKKOS_INLINE_FUNCTION
794 void operator_fast_dirichlet_neumann_path(
795 const Team& team,
796 const int local_subdomain_id,
797 const int x0,
798 const int y0,
799 const int r0,
800 const int tx,
801 const int ty,
802 const int tr,
803 const int x_cell,
804 const int y_cell,
805 const int r_cell,
806 const bool at_cmb,
807 const bool at_surface ) const
808 {
809 const int nlev = r_tile_ + 1;
810 const int nxy = ( lat_tile_ + 1 ) * ( lat_tile_ + 1 );
811
812 double* shmem =
813 reinterpret_cast< double* >( team.team_shmem().get_shmem( team_shmem_size( team.team_size() ) ) );
814
815 using ScratchCoords =
816 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
817 using ScratchSrc = Kokkos::
818 View< double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
819 using ScratchK =
820 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
821
822 ScratchCoords coords_sh( shmem, nxy, 3 );
823 shmem += nxy * 3;
824
825 ScratchSrc src_sh( shmem, nxy, 3, nlev );
826 shmem += nxy * 3 * nlev;
827
828 ScratchK k_sh( shmem, nxy, nlev );
829 shmem += nxy * nlev;
830
831 auto r_sh =
832 Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >(
833 shmem, nlev );
834
835 auto node_id = [&]( int nx, int ny ) -> int { return nx + ( lat_tile_ + 1 ) * ny; };
836
837 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&]( int n ) {
838 const int dxn = n % ( lat_tile_ + 1 );
839 const int dyn = n / ( lat_tile_ + 1 );
840 const int xi = x0 + dxn;
841 const int yi = y0 + dyn;
842
843 if ( xi <= hex_lat_ && yi <= hex_lat_ )
844 {
845 coords_sh( n, 0 ) = grid_( local_subdomain_id, xi, yi, 0 );
846 coords_sh( n, 1 ) = grid_( local_subdomain_id, xi, yi, 1 );
847 coords_sh( n, 2 ) = grid_( local_subdomain_id, xi, yi, 2 );
848 }
849 else
850 {
851 coords_sh( n, 0 ) = coords_sh( n, 1 ) = coords_sh( n, 2 ) = 0.0;
852 }
853 } );
854
855 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&]( int lvl ) {
856 const int rr = r0 + lvl;
857 r_sh( lvl ) = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
858 } );
859
860 const int total_pairs = nxy * nlev;
861 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total_pairs ), [&]( int t ) {
862 const int lvl = t / nxy;
863 const int node = t - lvl * nxy;
864
865 const int dxn = node % ( lat_tile_ + 1 );
866 const int dyn = node / ( lat_tile_ + 1 );
867
868 const int xi = x0 + dxn;
869 const int yi = y0 + dyn;
870 const int rr = r0 + lvl;
871
872 if ( xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_ )
873 {
874 k_sh( node, lvl ) = k_( local_subdomain_id, xi, yi, rr );
875 src_sh( node, 0, lvl ) = src_( local_subdomain_id, xi, yi, rr, 0 );
876 src_sh( node, 1, lvl ) = src_( local_subdomain_id, xi, yi, rr, 1 );
877 src_sh( node, 2, lvl ) = src_( local_subdomain_id, xi, yi, rr, 2 );
878 }
879 else
880 {
881 k_sh( node, lvl ) = 0.0;
882 src_sh( node, 0, lvl ) = src_sh( node, 1, lvl ) = src_sh( node, 2, lvl ) = 0.0;
883 }
884 } );
885
886 team.team_barrier();
887
888 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ || r_cell >= hex_rad_ )
889 return;
890
891 const int lvl0 = tr;
892 const double r_0 = r_sh( lvl0 );
893 const double r_1 = r_sh( lvl0 + 1 );
894
895 const bool at_boundary = at_cmb || at_surface;
896 bool treat_boundary_dirichlet = false;
897 if ( at_boundary )
898 {
899 const ShellBoundaryFlag sbf = at_cmb ? CMB : SURFACE;
900 treat_boundary_dirichlet = ( get_boundary_condition_flag( bcs_, sbf ) == DIRICHLET );
901 }
902
903 const int cmb_shift = ( ( at_boundary && treat_boundary_dirichlet && ( !diagonal_ ) && at_cmb ) ? 3 : 0 );
904 const int surface_shift =
905 ( ( at_boundary && treat_boundary_dirichlet && ( !diagonal_ ) && at_surface ) ? 3 : 0 );
906
907 static constexpr int WEDGE_NODE_OFF[2][6][3] = {
908 { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
909 { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };
910
911 static constexpr int WEDGE_TO_UNIQUE[2][6] = {
912 { 0, 1, 2, 3, 4, 5 },
913 { 6, 2, 1, 7, 5, 4 }
914 };
915
916 constexpr double ONE_THIRD = 1.0 / 3.0;
917 constexpr double ONE_SIXTH = 1.0 / 6.0;
918 constexpr double NEG_TWO_THIRDS = -0.66666666666666663;
919
920 static constexpr double dN_ref[6][3] = {
921 { -0.5, -0.5, -ONE_SIXTH },
922 { 0.5, 0.0, -ONE_SIXTH },
923 { 0.0, 0.5, -ONE_SIXTH },
924 { -0.5, -0.5, ONE_SIXTH },
925 { 0.5, 0.0, ONE_SIXTH },
926 { 0.0, 0.5, ONE_SIXTH } };
927
928 const int n00 = node_id( tx, ty );
929 const int n01 = node_id( tx, ty + 1 );
930 const int n10 = node_id( tx + 1, ty );
931 const int n11 = node_id( tx + 1, ty + 1 );
932
933 double ws[2][3][3];
934
935 ws[0][0][0] = coords_sh( n00, 0 );
936 ws[0][0][1] = coords_sh( n00, 1 );
937 ws[0][0][2] = coords_sh( n00, 2 );
938 ws[0][1][0] = coords_sh( n10, 0 );
939 ws[0][1][1] = coords_sh( n10, 1 );
940 ws[0][1][2] = coords_sh( n10, 2 );
941 ws[0][2][0] = coords_sh( n01, 0 );
942 ws[0][2][1] = coords_sh( n01, 1 );
943 ws[0][2][2] = coords_sh( n01, 2 );
944
945 ws[1][0][0] = coords_sh( n11, 0 );
946 ws[1][0][1] = coords_sh( n11, 1 );
947 ws[1][0][2] = coords_sh( n11, 2 );
948 ws[1][1][0] = coords_sh( n01, 0 );
949 ws[1][1][1] = coords_sh( n01, 1 );
950 ws[1][1][2] = coords_sh( n01, 2 );
951 ws[1][2][0] = coords_sh( n10, 0 );
952 ws[1][2][1] = coords_sh( n10, 1 );
953 ws[1][2][2] = coords_sh( n10, 2 );
954
955 double dst8[3][8] = { 0.0 };
956
957 for ( int w = 0; w < 2; ++w )
958 {
959 double k_sum = 0.0;
960#pragma unroll
961 for ( int node = 0; node < 6; ++node )
962 {
963 const int ddx = WEDGE_NODE_OFF[w][node][0];
964 const int ddy = WEDGE_NODE_OFF[w][node][1];
965 const int ddr = WEDGE_NODE_OFF[w][node][2];
966
967 const int nid = node_id( tx + ddx, ty + ddy );
968 const int lvl = lvl0 + ddr;
969
970 k_sum += k_sh( nid, lvl );
971 }
972 const double k_eval = ONE_SIXTH * k_sum;
973
974 double wJ = 0.0;
975 double i00, i01, i02;
976 double i10, i11, i12;
977 double i20, i21, i22;
978
979 {
980 const double half_dr = 0.5 * ( r_1 - r_0 );
981 const double r_mid = 0.5 * ( r_0 + r_1 );
982
983 const double J_0_0 = r_mid * ( -ws[w][0][0] + ws[w][1][0] );
984 const double J_0_1 = r_mid * ( -ws[w][0][0] + ws[w][2][0] );
985 const double J_0_2 = half_dr * ( ONE_THIRD * ( ws[w][0][0] + ws[w][1][0] + ws[w][2][0] ) );
986
987 const double J_1_0 = r_mid * ( -ws[w][0][1] + ws[w][1][1] );
988 const double J_1_1 = r_mid * ( -ws[w][0][1] + ws[w][2][1] );
989 const double J_1_2 = half_dr * ( ONE_THIRD * ( ws[w][0][1] + ws[w][1][1] + ws[w][2][1] ) );
990
991 const double J_2_0 = r_mid * ( -ws[w][0][2] + ws[w][1][2] );
992 const double J_2_1 = r_mid * ( -ws[w][0][2] + ws[w][2][2] );
993 const double J_2_2 = half_dr * ( ONE_THIRD * ( ws[w][0][2] + ws[w][1][2] + ws[w][2][2] ) );
994
995 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 - J_0_1 * J_1_0 * J_2_2 +
996 J_0_1 * J_1_2 * J_2_0 + J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
997
998 const double invJ = 1.0 / J_det;
999
1000 i00 = invJ * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
1001 i01 = invJ * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
1002 i02 = invJ * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
1003 i10 = invJ * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
1004 i11 = invJ * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
1005 i12 = invJ * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
1006 i20 = invJ * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
1007 i21 = invJ * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
1008 i22 = invJ * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
1009
1010 wJ = Kokkos::abs( J_det );
1011 }
1012
1013 const double kwJ = k_eval * wJ;
1014
1015 double gu00 = 0.0;
1016 double gu10 = 0.0, gu11 = 0.0;
1017 double gu20 = 0.0, gu21 = 0.0, gu22 = 0.0;
1018 double div_u = 0.0;
1019
1020 if ( !diagonal_ )
1021 {
1022 for ( int dimj = 0; dimj < 3; ++dimj )
1023 {
1024#pragma unroll
1025 for ( int node_idx = cmb_shift; node_idx < 6 - surface_shift; ++node_idx )
1026 {
1027 const double gx = dN_ref[node_idx][0];
1028 const double gy = dN_ref[node_idx][1];
1029 const double gz = dN_ref[node_idx][2];
1030
1031 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1032 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1033 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1034
1035 double E00, E11, E22, sym01, sym02, sym12, gdd;
1036 column_grad_to_sym( dimj, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );
1037
1038 const int ddx = WEDGE_NODE_OFF[w][node_idx][0];
1039 const int ddy = WEDGE_NODE_OFF[w][node_idx][1];
1040 const int ddr = WEDGE_NODE_OFF[w][node_idx][2];
1041
1042 const int nid = node_id( tx + ddx, ty + ddy );
1043 const int lvl = lvl0 + ddr;
1044
1045 const double s = src_sh( nid, dimj, lvl );
1046
1047 gu00 += E00 * s;
1048 gu10 += sym01 * s;
1049 gu11 += E11 * s;
1050 gu20 += sym02 * s;
1051 gu21 += sym12 * s;
1052 gu22 += E22 * s;
1053 div_u += gdd * s;
1054 }
1055 }
1056
1057 for ( int dimi = 0; dimi < 3; ++dimi )
1058 {
1059#pragma unroll
1060 for ( int node_idx = cmb_shift; node_idx < 6 - surface_shift; ++node_idx )
1061 {
1062 const double gx = dN_ref[node_idx][0];
1063 const double gy = dN_ref[node_idx][1];
1064 const double gz = dN_ref[node_idx][2];
1065
1066 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1067 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1068 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1069
1070 double E00, E11, E22, sym01, sym02, sym12, gdd;
1071 column_grad_to_sym( dimi, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );
1072
1073 const int u = WEDGE_TO_UNIQUE[w][node_idx];
1074
1075 dst8[dimi][u] +=
1076 kwJ * ( NEG_TWO_THIRDS * div_u * gdd + 4.0 * sym01 * gu10 + 4.0 * sym02 * gu20 +
1077 4.0 * sym12 * gu21 + 2.0 * E00 * gu00 + 2.0 * E11 * gu11 + 2.0 * E22 * gu22 );
1078 }
1079 }
1080 }
1081
1082 if ( diagonal_ || ( treat_boundary_dirichlet && at_boundary ) )
1083 {
1084 for ( int dim_diagBC = 0; dim_diagBC < 3; ++dim_diagBC )
1085 {
1086#pragma unroll
1087 for ( int node_idx = surface_shift; node_idx < 6 - cmb_shift; ++node_idx )
1088 {
1089 const double gx = dN_ref[node_idx][0];
1090 const double gy = dN_ref[node_idx][1];
1091 const double gz = dN_ref[node_idx][2];
1092
1093 const double g0 = i00 * gx + i01 * gy + i02 * gz;
1094 const double g1 = i10 * gx + i11 * gy + i12 * gz;
1095 const double g2 = i20 * gx + i21 * gy + i22 * gz;
1096
1097 double E00, E11, E22, sym01, sym02, sym12, gdd;
1098 column_grad_to_sym( dim_diagBC, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );
1099
1100 const int ddx = WEDGE_NODE_OFF[w][node_idx][0];
1101 const int ddy = WEDGE_NODE_OFF[w][node_idx][1];
1102 const int ddr = WEDGE_NODE_OFF[w][node_idx][2];
1103
1104 const int nid = node_id( tx + ddx, ty + ddy );
1105 const int lvl = lvl0 + ddr;
1106
1107 const double s = src_sh( nid, dim_diagBC, lvl );
1108 const int u = WEDGE_TO_UNIQUE[w][node_idx];
1109
1110 dst8[dim_diagBC][u] += kwJ * ( 4.0 * s * ( sym01 * sym01 + sym02 * sym02 + sym12 * sym12 ) +
1111 2.0 * s * ( E00 * E00 + E11 * E11 + E22 * E22 ) +
1112 NEG_TWO_THIRDS * ( gdd * gdd ) * s );
1113 }
1114 }
1115 }
1116 }
1117
1118 for ( int dim_add = 0; dim_add < 3; ++dim_add )
1119 {
1120 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell, dim_add ), dst8[dim_add][0] );
1121 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell, dim_add ), dst8[dim_add][1] );
1122 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell, dim_add ), dst8[dim_add][2] );
1123 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell + 1, dim_add ), dst8[dim_add][3] );
1124 Kokkos::atomic_add(
1125 &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1, dim_add ), dst8[dim_add][4] );
1126 Kokkos::atomic_add(
1127 &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][5] );
1128 Kokkos::atomic_add(
1129 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell, dim_add ), dst8[dim_add][6] );
1130 Kokkos::atomic_add(
1131 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][7] );
1132 }
1133 }
1134
1135 // ===================== FAST FREESLIP PATH =====================
1136 KOKKOS_INLINE_FUNCTION
1137 void normalize3( double& x, double& y, double& z ) const
1138 {
1139 const double n2 = x * x + y * y + z * z;
1140 if ( n2 > 0.0 )
1141 {
1142 const double invn = 1.0 / Kokkos::sqrt( n2 );
1143 x *= invn;
1144 y *= invn;
1145 z *= invn;
1146 }
1147 }
1148
1149 KOKKOS_INLINE_FUNCTION
1150 void project_tangential_inplace(
1151 const double nx,
1152 const double ny,
1153 const double nz,
1154 double& ux,
1155 double& uy,
1156 double& uz ) const
1157 {
1158 const double dot = nx * ux + ny * uy + nz * uz;
1159 ux -= dot * nx;
1160 uy -= dot * ny;
1161 uz -= dot * nz;
1162 }
1163
    /**
     * @brief Matrix-free operator application for one (cell, thread) work item on the
     *        fast free-slip path.
     *
     * Processes the hex cell (x_cell, y_cell, r_cell) of the given local subdomain,
     * split into two wedge elements, and atomically accumulates the result into dst_.
     *
     * Stages:
     *  1. Team-cooperative staging of node coordinates, radii, coefficient k and src
     *     into scratch memory (out-of-tile entries zero-filled).
     *  2. Trial-side tangential projection of src at free-slip boundary nodes; the
     *     normal components are saved first for the later corrections.
     *  3. Per-wedge accumulation of the epsilon-div-div action into the 8 per-cell
     *     corner values dst8.
     *  4. Boundary treatment: Dirichlet/diagonal handling, rotated-frame diagonal for
     *     free-slip nodes in diagonal mode, test-side projection plus normal-normal
     *     correction otherwise.
     *  5. Atomic scatter of dst8 into the global dst_ field.
     */
    KOKKOS_INLINE_FUNCTION
    void operator_fast_freeslip_path(
        const Team& team,
        const int local_subdomain_id,
        const int x0,
        const int y0,
        const int r0,
        const int tx,
        const int ty,
        const int tr,
        const int x_cell,
        const int y_cell,
        const int r_cell,
        const bool at_cmb,
        const bool at_surface ) const
    {
        // Tile extents: radial levels and lateral nodes staged per team.
        const int nlev = r_tile_ + 1;
        const int nxy = ( lat_tile_ + 1 ) * ( lat_tile_ + 1 );

        double* shmem =
            reinterpret_cast< double* >( team.team_shmem().get_shmem( team_shmem_size( team.team_size() ) ) );

        using ScratchCoords =
            Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
        using ScratchSrc = Kokkos::
            View< double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
        using ScratchK =
            Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;

        // Carve the raw scratch allocation into unmanaged views; the pointer is
        // advanced past each view's extent.
        ScratchCoords coords_sh( shmem, nxy, 3 );
        shmem += nxy * 3;

        ScratchSrc src_sh( shmem, nxy, 3, nlev );
        shmem += nxy * 3 * nlev;

        ScratchK k_sh( shmem, nxy, nlev );
        shmem += nxy * nlev;

        auto r_sh =
            Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >(
                shmem, nlev );

        // Flatten a (tile-local) lateral node coordinate into a scratch row index.
        auto node_id = [&]( int nx, int ny ) -> int { return nx + ( lat_tile_ + 1 ) * ny; };

        // Stage lateral unit-sphere coordinates; out-of-range nodes are zeroed.
        Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&]( int n ) {
            const int dxn = n % ( lat_tile_ + 1 );
            const int dyn = n / ( lat_tile_ + 1 );
            const int xi = x0 + dxn;
            const int yi = y0 + dyn;

            if ( xi <= hex_lat_ && yi <= hex_lat_ )
            {
                coords_sh( n, 0 ) = grid_( local_subdomain_id, xi, yi, 0 );
                coords_sh( n, 1 ) = grid_( local_subdomain_id, xi, yi, 1 );
                coords_sh( n, 2 ) = grid_( local_subdomain_id, xi, yi, 2 );
            }
            else
            {
                coords_sh( n, 0 ) = coords_sh( n, 1 ) = coords_sh( n, 2 ) = 0.0;
            }
        } );

        // Stage radii for the tile's radial levels.
        Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&]( int lvl ) {
            const int rr = r0 + lvl;
            r_sh( lvl ) = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
        } );

        // Stage coefficient k and the 3 src components for every (node, level) pair.
        const int total_pairs = nxy * nlev;
        Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total_pairs ), [&]( int t ) {
            const int lvl = t / nxy;
            const int node = t - lvl * nxy;

            const int dxn = node % ( lat_tile_ + 1 );
            const int dyn = node / ( lat_tile_ + 1 );

            const int xi = x0 + dxn;
            const int yi = y0 + dyn;
            const int rr = r0 + lvl;

            if ( xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_ )
            {
                k_sh( node, lvl ) = k_( local_subdomain_id, xi, yi, rr );
                src_sh( node, 0, lvl ) = src_( local_subdomain_id, xi, yi, rr, 0 );
                src_sh( node, 1, lvl ) = src_( local_subdomain_id, xi, yi, rr, 1 );
                src_sh( node, 2, lvl ) = src_( local_subdomain_id, xi, yi, rr, 2 );
            }
            else
            {
                k_sh( node, lvl ) = 0.0;
                src_sh( node, 0, lvl ) = src_sh( node, 1, lvl ) = src_sh( node, 2, lvl ) = 0.0;
            }
        } );

        // All threads must see the fully staged scratch data before compute.
        team.team_barrier();

        // Threads whose cell lies outside the subdomain only helped with staging.
        if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ || r_cell >= hex_rad_ )
            return;

        const int lvl0 = tr;
        const double r_0 = r_sh( lvl0 );
        const double r_1 = r_sh( lvl0 + 1 );

        const BoundaryConditionFlag cmb_bc = get_boundary_condition_flag( bcs_, CMB );
        const BoundaryConditionFlag surface_bc = get_boundary_condition_flag( bcs_, SURFACE );

        const bool cmb_freeslip = at_cmb && ( cmb_bc == FREESLIP );
        const bool surf_freeslip = at_surface && ( surface_bc == FREESLIP );
        const bool cmb_dirichlet = at_cmb && ( cmb_bc == DIRICHLET );
        const bool surface_dirichlet = at_surface && ( surface_bc == DIRICHLET );

        // Dirichlet handling skips the 3 constrained wedge nodes on the respective
        // boundary (node indices 0..2 bottom, 3..5 top) in the non-diagonal apply.
        const int cmb_shift = ( ( cmb_dirichlet && ( !diagonal_ ) ) ? 3 : 0 );
        const int surface_shift = ( ( surface_dirichlet && ( !diagonal_ ) ) ? 3 : 0 );

        // Per-wedge node offsets (dx, dy, dr) within the hex cell.
        static constexpr int WEDGE_NODE_OFF[2][6][3] = {
            { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
            { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };

        // Map wedge-local node index to the unique hex-corner slot (0..7).
        static constexpr int WEDGE_TO_UNIQUE[2][6] = {
            { 0, 1, 2, 3, 4, 5 },
            { 6, 2, 1, 7, 5, 4 }
        };

        constexpr double ONE_THIRD = 1.0 / 3.0;
        constexpr double ONE_SIXTH = 1.0 / 6.0;
        constexpr double NEG_TWO_THIRDS = -0.66666666666666663;

        // Reference-element shape-function gradients for the 6 wedge nodes.
        static constexpr double dN_ref[6][3] = {
            { -0.5, -0.5, -ONE_SIXTH },
            { 0.5, 0.0, -ONE_SIXTH },
            { 0.0, 0.5, -ONE_SIXTH },
            { -0.5, -0.5, ONE_SIXTH },
            { 0.5, 0.0, ONE_SIXTH },
            { 0.0, 0.5, ONE_SIXTH } };

        const int n00 = node_id( tx, ty );
        const int n01 = node_id( tx, ty + 1 );
        const int n10 = node_id( tx + 1, ty );
        const int n11 = node_id( tx + 1, ty + 1 );

        // Lateral surface coordinates of the 3 base nodes of each of the two wedges.
        double ws[2][3][3];

        ws[0][0][0] = coords_sh( n00, 0 );
        ws[0][0][1] = coords_sh( n00, 1 );
        ws[0][0][2] = coords_sh( n00, 2 );
        ws[0][1][0] = coords_sh( n10, 0 );
        ws[0][1][1] = coords_sh( n10, 1 );
        ws[0][1][2] = coords_sh( n10, 2 );
        ws[0][2][0] = coords_sh( n01, 0 );
        ws[0][2][1] = coords_sh( n01, 1 );
        ws[0][2][2] = coords_sh( n01, 2 );

        ws[1][0][0] = coords_sh( n11, 0 );
        ws[1][0][1] = coords_sh( n11, 1 );
        ws[1][0][2] = coords_sh( n11, 2 );
        ws[1][1][0] = coords_sh( n01, 0 );
        ws[1][1][1] = coords_sh( n01, 1 );
        ws[1][1][2] = coords_sh( n01, 2 );
        ws[1][2][0] = coords_sh( n10, 0 );
        ws[1][2][1] = coords_sh( n10, 1 );
        ws[1][2][2] = coords_sh( n10, 2 );

        // src values at the 8 hex corners, indexed [component][unique corner slot].
        double src8[3][8];

        src8[0][0] = src_sh( n00, 0, lvl0 );
        src8[1][0] = src_sh( n00, 1, lvl0 );
        src8[2][0] = src_sh( n00, 2, lvl0 );
        src8[0][1] = src_sh( n10, 0, lvl0 );
        src8[1][1] = src_sh( n10, 1, lvl0 );
        src8[2][1] = src_sh( n10, 2, lvl0 );
        src8[0][2] = src_sh( n01, 0, lvl0 );
        src8[1][2] = src_sh( n01, 1, lvl0 );
        src8[2][2] = src_sh( n01, 2, lvl0 );
        src8[0][6] = src_sh( n11, 0, lvl0 );
        src8[1][6] = src_sh( n11, 1, lvl0 );
        src8[2][6] = src_sh( n11, 2, lvl0 );

        src8[0][3] = src_sh( n00, 0, lvl0 + 1 );
        src8[1][3] = src_sh( n00, 1, lvl0 + 1 );
        src8[2][3] = src_sh( n00, 2, lvl0 + 1 );
        src8[0][4] = src_sh( n10, 0, lvl0 + 1 );
        src8[1][4] = src_sh( n10, 1, lvl0 + 1 );
        src8[2][4] = src_sh( n10, 2, lvl0 + 1 );
        src8[0][5] = src_sh( n01, 0, lvl0 + 1 );
        src8[1][5] = src_sh( n01, 1, lvl0 + 1 );
        src8[2][5] = src_sh( n01, 2, lvl0 + 1 );
        src8[0][7] = src_sh( n11, 0, lvl0 + 1 );
        src8[1][7] = src_sh( n11, 1, lvl0 + 1 );
        src8[2][7] = src_sh( n11, 2, lvl0 + 1 );

        // Outward unit normals at the 4 lateral corners: the unit-sphere coordinates
        // themselves, normalized (radial direction in a spherical shell).
        double nx00 = coords_sh( n00, 0 ), ny00 = coords_sh( n00, 1 ), nz00 = coords_sh( n00, 2 );
        double nx10 = coords_sh( n10, 0 ), ny10 = coords_sh( n10, 1 ), nz10 = coords_sh( n10, 2 );
        double nx01 = coords_sh( n01, 0 ), ny01 = coords_sh( n01, 1 ), nz01 = coords_sh( n01, 2 );
        double nx11 = coords_sh( n11, 0 ), ny11 = coords_sh( n11, 1 ), nz11 = coords_sh( n11, 2 );

        normalize3( nx00, ny00, nz00 );
        normalize3( nx10, ny10, nz10 );
        normalize3( nx01, ny01, nz01 );
        normalize3( nx11, ny11, nz11 );

        // Save normal components of src at free-slip boundary nodes before trial-side projection.
        // Used in two places:
        //   - non-diagonal mode: to restore A_nn * u_n * n after the test-side projection
        //     (normal_corr mechanism).
        //   - diagonal mode: to reconstruct the unprojected src inside apply_rotated_diag so
        //     that R^T diag(RAR^T) R src can be evaluated.
        // Corner indexing: 0=n00, 1=n10, 2=n01, 3=n11.
        double u_n_cmb[4] = {};
        double u_n_surf[4] = {};
        if ( cmb_freeslip )
        {
            u_n_cmb[0] = nx00 * src8[0][0] + ny00 * src8[1][0] + nz00 * src8[2][0];
            u_n_cmb[1] = nx10 * src8[0][1] + ny10 * src8[1][1] + nz10 * src8[2][1];
            u_n_cmb[2] = nx01 * src8[0][2] + ny01 * src8[1][2] + nz01 * src8[2][2];
            u_n_cmb[3] = nx11 * src8[0][6] + ny11 * src8[1][6] + nz11 * src8[2][6];
        }
        if ( surf_freeslip )
        {
            u_n_surf[0] = nx00 * src8[0][3] + ny00 * src8[1][3] + nz00 * src8[2][3];
            u_n_surf[1] = nx10 * src8[0][4] + ny10 * src8[1][4] + nz10 * src8[2][4];
            u_n_surf[2] = nx01 * src8[0][5] + ny01 * src8[1][5] + nz01 * src8[2][5];
            u_n_surf[3] = nx11 * src8[0][7] + ny11 * src8[1][7] + nz11 * src8[2][7];
        }

        // Trial-side projection: remove normal components at free-slip boundary nodes.
        if ( cmb_freeslip )
        {
            project_tangential_inplace( nx00, ny00, nz00, src8[0][0], src8[1][0], src8[2][0] );
            project_tangential_inplace( nx10, ny10, nz10, src8[0][1], src8[1][1], src8[2][1] );
            project_tangential_inplace( nx01, ny01, nz01, src8[0][2], src8[1][2], src8[2][2] );
            project_tangential_inplace( nx11, ny11, nz11, src8[0][6], src8[1][6], src8[2][6] );
        }
        if ( surf_freeslip )
        {
            project_tangential_inplace( nx00, ny00, nz00, src8[0][3], src8[1][3], src8[2][3] );
            project_tangential_inplace( nx10, ny10, nz10, src8[0][4], src8[1][4], src8[2][4] );
            project_tangential_inplace( nx01, ny01, nz01, src8[0][5], src8[1][5], src8[2][5] );
            project_tangential_inplace( nx11, ny11, nz11, src8[0][7], src8[1][7], src8[2][7] );
        }

        // Per-corner accumulators for the operator result and the normal-normal correction.
        double dst8[3][8] = { { 0.0 } };
        double normal_corr[3][8] = {};



        for ( int w = 0; w < 2; ++w )
        {
            // Coefficient k evaluated as the arithmetic mean of the 6 wedge node values.
            double k_sum = 0.0;
#pragma unroll
            for ( int node = 0; node < 6; ++node )
            {
                const int ddx = WEDGE_NODE_OFF[w][node][0];
                const int ddy = WEDGE_NODE_OFF[w][node][1];
                const int ddr = WEDGE_NODE_OFF[w][node][2];

                const int nid = node_id( tx + ddx, ty + ddy );
                const int lvl = lvl0 + ddr;
                k_sum += k_sh( nid, lvl );
            }
            const double k_eval = ONE_SIXTH * k_sum;

            double wJ = 0.0;
            double i00, i01, i02;
            double i10, i11, i12;
            double i20, i21, i22;

            // Jacobian of the wedge mapping evaluated once per wedge (single-point rule);
            // i** is the inverse Jacobian, wJ its absolute determinant.
            {
                const double half_dr = 0.5 * ( r_1 - r_0 );
                const double r_mid = 0.5 * ( r_0 + r_1 );

                const double J_0_0 = r_mid * ( -ws[w][0][0] + ws[w][1][0] );
                const double J_0_1 = r_mid * ( -ws[w][0][0] + ws[w][2][0] );
                const double J_0_2 = half_dr * ( ONE_THIRD * ( ws[w][0][0] + ws[w][1][0] + ws[w][2][0] ) );

                const double J_1_0 = r_mid * ( -ws[w][0][1] + ws[w][1][1] );
                const double J_1_1 = r_mid * ( -ws[w][0][1] + ws[w][2][1] );
                const double J_1_2 = half_dr * ( ONE_THIRD * ( ws[w][0][1] + ws[w][1][1] + ws[w][2][1] ) );

                const double J_2_0 = r_mid * ( -ws[w][0][2] + ws[w][1][2] );
                const double J_2_1 = r_mid * ( -ws[w][0][2] + ws[w][2][2] );
                const double J_2_2 = half_dr * ( ONE_THIRD * ( ws[w][0][2] + ws[w][1][2] + ws[w][2][2] ) );

                const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 - J_0_1 * J_1_0 * J_2_2 +
                                     J_0_1 * J_1_2 * J_2_0 + J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;

                const double invJ = 1.0 / J_det;

                i00 = invJ * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
                i01 = invJ * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
                i02 = invJ * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );

                i10 = invJ * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
                i11 = invJ * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
                i12 = invJ * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );

                i20 = invJ * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
                i21 = invJ * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
                i22 = invJ * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );

                wJ = Kokkos::abs( J_det );
            }

            const double kwJ = k_eval * wJ;

            // Accumulate normal-normal diagonal correction for free-slip boundary nodes.
            // For each boundary node u with outward unit normal n_u and shape-function physical
            // gradient g_u = J^{-T} dN_ref[node_idx]:
            //   A_nn_u += kwJ * ( |g_u|^2 + (1/3) * (n_u · g_u)^2 )
            // This equals n_u^T · A_3x3_self_u · n_u — the (0,0) entry of R_u A R_u^T in the
            // normal-tangential frame — which the slow path preserves on its diagonal.
            // Map: (w, in-boundary index ni in 0..2) -> corner index {0=n00,1=n10,2=n01,3=n11}
            // normal_corr is only needed for the non-diagonal operator apply (P A P + A_nn u_n n).
            // In diagonal mode the rotated-frame diagonal is used directly (see below).
            if ( !diagonal_ && ( cmb_freeslip || surf_freeslip ) )
            {
                static constexpr int CMB_NODE_TO_CORNER[2][3] = { { 0, 1, 2 }, { 3, 2, 1 } };
                const double cn[4][3] = { { nx00, ny00, nz00 },
                                          { nx10, ny10, nz10 },
                                          { nx01, ny01, nz01 },
                                          { nx11, ny11, nz11 } };
                if ( cmb_freeslip )
                {
                    for ( int ni = 0; ni < 3; ++ni )
                    {
                        const int corner = CMB_NODE_TO_CORNER[w][ni];
                        const double nxu = cn[corner][0], nyu = cn[corner][1], nzu = cn[corner][2];

                        const double gx = dN_ref[ni][0];
                        const double gy = dN_ref[ni][1];
                        const double gz = dN_ref[ni][2];
                        const double g0 = i00 * gx + i01 * gy + i02 * gz;
                        const double g1 = i10 * gx + i11 * gy + i12 * gz;
                        const double g2 = i20 * gx + i21 * gy + i22 * gz;
                        const double gg = g0 * g0 + g1 * g1 + g2 * g2;
                        const double ng = nxu * g0 + nyu * g1 + nzu * g2;
                        const double Ann = kwJ * ( gg + ONE_THIRD * ng * ng );
                        const double c = Ann * u_n_cmb[corner];
                        const int u = WEDGE_TO_UNIQUE[w][ni];

                        normal_corr[0][u] += c * nxu;
                        normal_corr[1][u] += c * nyu;
                        normal_corr[2][u] += c * nzu;
                    }
                }
                if ( surf_freeslip )
                {
                    for ( int ni = 0; ni < 3; ++ni )
                    {
                        const int corner = CMB_NODE_TO_CORNER[w][ni];
                        const double nxu = cn[corner][0], nyu = cn[corner][1], nzu = cn[corner][2];

                        // Surface nodes are the top wedge nodes (indices 3..5).
                        const double gx = dN_ref[ni + 3][0];
                        const double gy = dN_ref[ni + 3][1];
                        const double gz = dN_ref[ni + 3][2];
                        const double g0 = i00 * gx + i01 * gy + i02 * gz;
                        const double g1 = i10 * gx + i11 * gy + i12 * gz;
                        const double g2 = i20 * gx + i21 * gy + i22 * gz;
                        const double gg = g0 * g0 + g1 * g1 + g2 * g2;
                        const double ng = nxu * g0 + nyu * g1 + nzu * g2;
                        const double Ann = kwJ * ( gg + ONE_THIRD * ng * ng );
                        const double c = Ann * u_n_surf[corner];
                        const int u = WEDGE_TO_UNIQUE[w][ni + 3];

                        normal_corr[0][u] += c * nxu;
                        normal_corr[1][u] += c * nyu;
                        normal_corr[2][u] += c * nzu;
                    }
                }
            }

            // Symmetric-gradient and divergence accumulators of u over this wedge.
            double gu00 = 0.0;
            double gu10 = 0.0, gu11 = 0.0;
            double gu20 = 0.0, gu21 = 0.0, gu22 = 0.0;
            double div_u = 0.0;

            if ( !diagonal_ )
            {
                // First pass: accumulate eps(u) and div(u) from the (projected) src.
                for ( int dimj = 0; dimj < 3; ++dimj )
                {
#pragma unroll
                    for ( int node_idx = cmb_shift; node_idx < 6 - surface_shift; ++node_idx )
                    {
                        const double gx = dN_ref[node_idx][0];
                        const double gy = dN_ref[node_idx][1];
                        const double gz = dN_ref[node_idx][2];

                        const double g0 = i00 * gx + i01 * gy + i02 * gz;
                        const double g1 = i10 * gx + i11 * gy + i12 * gz;
                        const double g2 = i20 * gx + i21 * gy + i22 * gz;

                        double E00, E11, E22, sym01, sym02, sym12, gdd;
                        column_grad_to_sym( dimj, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );

                        const int u = WEDGE_TO_UNIQUE[w][node_idx];
                        const double s = src8[dimj][u];

                        gu00 += E00 * s;
                        gu10 += sym01 * s;
                        gu11 += E11 * s;
                        gu20 += sym02 * s;
                        gu21 += sym12 * s;
                        gu22 += E22 * s;
                        div_u += gdd * s;
                    }
                }

                // Second pass: test against each shape function / component and accumulate
                // 2 eps(u):eps(v) - 2/3 div(u) div(v), scaled by k * |det J|.
                for ( int dimi = 0; dimi < 3; ++dimi )
                {
#pragma unroll
                    for ( int node_idx = cmb_shift; node_idx < 6 - surface_shift; ++node_idx )
                    {
                        const double gx = dN_ref[node_idx][0];
                        const double gy = dN_ref[node_idx][1];
                        const double gz = dN_ref[node_idx][2];

                        const double g0 = i00 * gx + i01 * gy + i02 * gz;
                        const double g1 = i10 * gx + i11 * gy + i12 * gz;
                        const double g2 = i20 * gx + i21 * gy + i22 * gz;

                        double E00, E11, E22, sym01, sym02, sym12, gdd;
                        column_grad_to_sym( dimi, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );

                        const int u = WEDGE_TO_UNIQUE[w][node_idx];

                        dst8[dimi][u] +=
                            kwJ * ( NEG_TWO_THIRDS * div_u * gdd + 4.0 * sym01 * gu10 + 4.0 * sym02 * gu20 +
                                    4.0 * sym12 * gu21 + 2.0 * E00 * gu00 + 2.0 * E11 * gu11 + 2.0 * E22 * gu22 );
                    }
                }
            }


            if ( diagonal_ || cmb_dirichlet || surface_dirichlet )
            {
                for ( int dim_diagBC = 0; dim_diagBC < 3; ++dim_diagBC )
                {
                    for ( int node_idx = surface_shift; node_idx < 6 - cmb_shift; ++node_idx )
                    {
                        // In diagonal mode, free-slip boundary nodes are handled below with the
                        // rotated-frame diagonal (R^T diag(RAR^T) R) to match the slow path.
                        if ( diagonal_ && cmb_freeslip && node_idx < 3 )
                            continue;
                        if ( diagonal_ && surf_freeslip && node_idx >= 3 )
                            continue;

                        const double gx = dN_ref[node_idx][0];
                        const double gy = dN_ref[node_idx][1];
                        const double gz = dN_ref[node_idx][2];

                        const double g0 = i00 * gx + i01 * gy + i02 * gz;
                        const double g1 = i10 * gx + i11 * gy + i12 * gz;
                        const double g2 = i20 * gx + i21 * gy + i22 * gz;

                        double E00, E11, E22, sym01, sym02, sym12, gdd;
                        column_grad_to_sym( dim_diagBC, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );

                        const int u = WEDGE_TO_UNIQUE[w][node_idx];
                        const double s = src8[dim_diagBC][u];

                        // Cartesian diagonal entry (self-coupling of this node/component).
                        dst8[dim_diagBC][u] += kwJ * ( 4.0 * s * ( sym01 * sym01 + sym02 * sym02 + sym12 * sym12 ) +
                                                       2.0 * s * ( E00 * E00 + E11 * E11 + E22 * E22 ) +
                                                       NEG_TWO_THIRDS * ( gdd * gdd ) * s );
                    }
                }

                // For free-slip boundary nodes in diagonal mode: compute R^T diag(R A_3x3 R^T) R src.
                // The slow path takes the diagonal of the 18x18 matrix *after* rotating to the
                // normal-tangential frame, giving diag entries kwJ*(gg + (r_alpha . g)^2 / 3) for
                // alpha in {n, t1, t2}. The Cartesian diagonal used above differs from this by a
                // rotation, so it must not be used for free-slip boundary nodes.
                if ( diagonal_ )
                {
                    static constexpr int FS_CORNER_MAP[2][3] = { { 0, 1, 2 }, { 3, 2, 1 } };
                    const double ncoords[4][3] = { { nx00, ny00, nz00 },
                                                   { nx10, ny10, nz10 },
                                                   { nx01, ny01, nz01 },
                                                   { nx11, ny11, nz11 } };

                    auto apply_rotated_diag =
                        [&]( const int ni, const int node_idx, const double* u_n_arr ) {
                            const int corner = FS_CORNER_MAP[w][ni];
                            const double nxu = ncoords[corner][0];
                            const double nyu = ncoords[corner][1];
                            const double nzu = ncoords[corner][2];
                            const int u = WEDGE_TO_UNIQUE[w][node_idx];

                            const double gx = dN_ref[node_idx][0];
                            const double gy = dN_ref[node_idx][1];
                            const double gz = dN_ref[node_idx][2];

                            const double g0 = i00 * gx + i01 * gy + i02 * gz;
                            const double g1 = i10 * gx + i11 * gy + i12 * gz;
                            const double g2 = i20 * gx + i21 * gy + i22 * gz;
                            const double gg_loc = g0 * g0 + g1 * g1 + g2 * g2;

                            dense::Vec< double, 3 > n_vec;
                            n_vec( 0 ) = nxu;
                            n_vec( 1 ) = nyu;
                            n_vec( 2 ) = nzu;
                            const auto R_rot = trafo_mat_cartesian_to_normal_tangential< double >( n_vec );

                            // Reconstruct full (unprojected) src at this node.
                            // src8 was tangentially projected; u_n_arr holds the saved normal component.
                            const double s0 = src8[0][u] + u_n_arr[corner] * nxu;
                            const double s1 = src8[1][u] + u_n_arr[corner] * nyu;
                            const double s2 = src8[2][u] + u_n_arr[corner] * nzu;

                            // Physical gradient and src in normal-tangential frame
                            const double Rg0 = R_rot( 0, 0 ) * g0 + R_rot( 0, 1 ) * g1 + R_rot( 0, 2 ) * g2;
                            const double Rg1 = R_rot( 1, 0 ) * g0 + R_rot( 1, 1 ) * g1 + R_rot( 1, 2 ) * g2;
                            const double Rg2 = R_rot( 2, 0 ) * g0 + R_rot( 2, 1 ) * g1 + R_rot( 2, 2 ) * g2;
                            const double Rs0 = R_rot( 0, 0 ) * s0 + R_rot( 0, 1 ) * s1 + R_rot( 0, 2 ) * s2;
                            const double Rs1 = R_rot( 1, 0 ) * s0 + R_rot( 1, 1 ) * s1 + R_rot( 1, 2 ) * s2;
                            const double Rs2 = R_rot( 2, 0 ) * s0 + R_rot( 2, 1 ) * s1 + R_rot( 2, 2 ) * s2;

                            // Rotated-frame diagonal applied to Rsrc:
                            //   diag_alpha = kwJ * (gg_loc + (1/3) * Rg_alpha^2)
                            const double v0 = kwJ * ( gg_loc + ONE_THIRD * Rg0 * Rg0 ) * Rs0;
                            const double v1 = kwJ * ( gg_loc + ONE_THIRD * Rg1 * Rg1 ) * Rs1;
                            const double v2 = kwJ * ( gg_loc + ONE_THIRD * Rg2 * Rg2 ) * Rs2;

                            // Transform back to Cartesian: R^T [v0, v1, v2]
                            dst8[0][u] += R_rot( 0, 0 ) * v0 + R_rot( 1, 0 ) * v1 + R_rot( 2, 0 ) * v2;
                            dst8[1][u] += R_rot( 0, 1 ) * v0 + R_rot( 1, 1 ) * v1 + R_rot( 2, 1 ) * v2;
                            dst8[2][u] += R_rot( 0, 2 ) * v0 + R_rot( 1, 2 ) * v1 + R_rot( 2, 2 ) * v2;
                        };

                    if ( cmb_freeslip )
                    {
                        for ( int ni = 0; ni < 3; ++ni )
                            apply_rotated_diag( ni, ni, u_n_cmb );
                    }
                    if ( surf_freeslip )
                    {
                        for ( int ni = 0; ni < 3; ++ni )
                            apply_rotated_diag( ni, ni + 3, u_n_surf );
                    }
                }
            }
        }

        // Test-side projection for free-slip (P A P).
        // In diagonal mode the rotated-frame diagonal is used, which already encodes the correct
        // normal/tangential split — no post-projection is needed or wanted.
        if ( !diagonal_ && cmb_freeslip )
        {
            project_tangential_inplace( nx00, ny00, nz00, dst8[0][0], dst8[1][0], dst8[2][0] );
            project_tangential_inplace( nx10, ny10, nz10, dst8[0][1], dst8[1][1], dst8[2][1] );
            project_tangential_inplace( nx01, ny01, nz01, dst8[0][2], dst8[1][2], dst8[2][2] );
            project_tangential_inplace( nx11, ny11, nz11, dst8[0][6], dst8[1][6], dst8[2][6] );
        }
        if ( !diagonal_ && surf_freeslip )
        {
            project_tangential_inplace( nx00, ny00, nz00, dst8[0][3], dst8[1][3], dst8[2][3] );
            project_tangential_inplace( nx10, ny10, nz10, dst8[0][4], dst8[1][4], dst8[2][4] );
            project_tangential_inplace( nx01, ny01, nz01, dst8[0][5], dst8[1][5], dst8[2][5] );
            project_tangential_inplace( nx11, ny11, nz11, dst8[0][7], dst8[1][7], dst8[2][7] );
        }

        // Add back the normal-normal diagonal contribution removed by the test-side projection.
        // This makes the fast path equivalent to the slow path for arbitrary input vectors.
        // Not needed in diagonal mode: the rotated-frame diagonal already includes A_nn * u_n.
        if ( !diagonal_ && ( cmb_freeslip || surf_freeslip ) )
        {
            for ( int dim = 0; dim < 3; ++dim )
                for ( int u = 0; u < 8; ++u )
                    dst8[dim][u] += normal_corr[dim][u];
        }

        // Scatter the 8 corner results; atomics are required since neighboring cells
        // (other teams/threads) accumulate into the same nodes.
        for ( int dim_add = 0; dim_add < 3; ++dim_add )
        {
            Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell, dim_add ), dst8[dim_add][0] );
            Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell, dim_add ), dst8[dim_add][1] );
            Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell, dim_add ), dst8[dim_add][2] );
            Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell + 1, dim_add ), dst8[dim_add][3] );
            Kokkos::atomic_add(
                &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1, dim_add ), dst8[dim_add][4] );
            Kokkos::atomic_add(
                &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][5] );
            Kokkos::atomic_add(
                &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell, dim_add ), dst8[dim_add][6] );
            Kokkos::atomic_add(
                &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][7] );
        }
    }
1747
1748 public:
1749 /**
1750 * @brief Legacy generic team operator.
1751 *
1752 * Kept for compatibility/debugging, but no longer used by apply_impl().
1753 * The host now dispatches directly to path-specific kernels.
1754 *
1755 * This function still works, but it reintroduces a branch on `kernel_path_`
1756 * and should therefore be avoided in performance-critical use.
1757 */
1758 KOKKOS_INLINE_FUNCTION
1759 void operator()( const Team& team ) const
1760 {
1761 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
1762 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
1763
1764 if ( tr >= r_tile_ )
1765 return;
1766
1767 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
1768 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
1769
1770 if ( kernel_path_ == KernelPath::Slow )
1771 {
1772 operator_slow_path(
1773 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
1774 }
1775 else if ( kernel_path_ == KernelPath::FastFreeslip )
1776 {
1777 operator_fast_freeslip_path(
1778 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
1779 }
1780 else
1781 {
1782 operator_fast_dirichlet_neumann_path(
1783 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
1784 }
1785 }
1786
1787 KOKKOS_INLINE_FUNCTION void assemble_trial_test_vecs(
1788 const int wedge,
1789 const dense::Vec< ScalarType, VecDim >& quad_point,
1790 const ScalarType quad_weight,
1791 const ScalarT r_1,
1792 const ScalarT r_2,
1793 dense::Vec< ScalarT, 3 > ( *wedge_phy_surf )[3],
1794 const dense::Vec< ScalarT, 6 >* k_local_hex,
1795 const int dimi,
1796 const int dimj,
1799 ScalarType& jdet_keval_quadweight ) const
1800 {
1801 dense::Mat< ScalarType, VecDim, VecDim > J = jac( wedge_phy_surf[wedge], r_1, r_2, quad_point );
1802 const auto det = J.det();
1803 const auto abs_det = Kokkos::abs( det );
1804 const dense::Mat< ScalarType, VecDim, VecDim > J_inv_transposed = J.inv_transposed( det );
1805
1806 ScalarType k_eval = 0.0;
1807 for ( int k = 0; k < num_nodes_per_wedge; k++ )
1808 {
1809 k_eval += shape( k, quad_point ) * k_local_hex[wedge]( k );
1810 }
1811
1812 for ( int k = 0; k < num_nodes_per_wedge; k++ )
1813 {
1814 sym_grad_i[k] = symmetric_grad( J_inv_transposed, quad_point, k, dimi );
1815 sym_grad_j[k] = symmetric_grad( J_inv_transposed, quad_point, k, dimj );
1816 }
1817
1818 jdet_keval_quadweight = quad_weight * k_eval * abs_det;
1819 }
1820
1821 /**
1822 * @brief Assemble one wedge-local 18x18 matrix (slow path / on-demand assembly).
1823 */
1824 KOKKOS_INLINE_FUNCTION
1826 const int local_subdomain_id,
1827 const int x_cell,
1828 const int y_cell,
1829 const int r_cell,
1830 const int wedge ) const
1831 {
1833 wedge_surface_physical_coords( wedge_phy_surf, grid_, local_subdomain_id, x_cell, y_cell );
1834
1835 const ScalarT r_1 = radii_( local_subdomain_id, r_cell );
1836 const ScalarT r_2 = radii_( local_subdomain_id, r_cell + 1 );
1837
1839 extract_local_wedge_scalar_coefficients( k_local_hex, local_subdomain_id, x_cell, y_cell, r_cell, k_ );
1840
1842
1843 for ( int dimi = 0; dimi < 3; ++dimi )
1844 {
1845 for ( int dimj = 0; dimj < 3; ++dimj )
1846 {
1847 for ( int q = 0; q < num_quad_points; q++ )
1848 {
1851 ScalarType jdet_keval_quadweight = 0;
1852
1854 wedge,
1855 quad_points[q],
1856 quad_weights[q],
1857 r_1,
1858 r_2,
1859 wedge_phy_surf,
1860 k_local_hex,
1861 dimi,
1862 dimj,
1863 sym_grad_i,
1864 sym_grad_j,
1865 jdet_keval_quadweight );
1866
1867 for ( int i = 0; i < num_nodes_per_wedge; i++ )
1868 {
1869 for ( int j = 0; j < num_nodes_per_wedge; j++ )
1870 {
1871 A( i + dimi * num_nodes_per_wedge, j + dimj * num_nodes_per_wedge ) +=
1872 jdet_keval_quadweight *
1873 ( 2 * sym_grad_j[j].double_contract( sym_grad_i[i] ) -
1874 2.0 / 3.0 * sym_grad_j[j]( dimj, dimj ) * sym_grad_i[i]( dimi, dimi ) );
1875 }
1876 }
1877 }
1878 }
1879 }
1880
1881 return A;
1882 }
1883};
1884
1887
1888} // namespace terra::fe::wedge::operators::shell
Definition communication_plan.hpp:33
Send and receive buffers for all process-local subdomain boundaries.
Definition communication.hpp:56
Matrix-free / matrix-based epsilon-div-div operator on wedge elements in a spherical shell.
Definition epsilon_divdiv_kerngen.hpp:58
const grid::shell::DistributedDomain & get_domain() const
Definition epsilon_divdiv_kerngen.hpp:236
size_t team_shmem_size(const int) const
Team scratch memory size for fast paths.
Definition epsilon_divdiv_kerngen.hpp:431
void set_boundary_conditions(BoundaryConditions bcs)
Definition epsilon_divdiv_kerngen.hpp:226
linalg::OperatorStoredMatrixMode get_stored_matrix_mode()
Definition epsilon_divdiv_kerngen.hpp:274
void set_local_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge, const dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > &mat) const
Definition epsilon_divdiv_kerngen.hpp:277
void column_grad_to_sym(const int dim, const double g0, const double g1, const double g2, double &E00, double &E11, double &E22, double &sym01, double &sym02, double &sym12, double &gdd) const
Convert one gradient column into symmetric-gradient and div contributions.
Definition epsilon_divdiv_kerngen.hpp:386
dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > assemble_local_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
Assemble one wedge-local 18x18 matrix (slow path / on-demand assembly).
Definition epsilon_divdiv_kerngen.hpp:1825
void apply_impl(const SrcVectorType &src, DstVectorType &dst)
Apply the operator: dst <- Op(src) (or additive, depending on mode).
Definition epsilon_divdiv_kerngen.hpp:321
grid::Grid2DDataScalar< ScalarT > get_radii() const
Definition epsilon_divdiv_kerngen.hpp:237
ScalarT ScalarType
Definition epsilon_divdiv_kerngen.hpp:62
void set_operator_apply_and_communication_modes(const linalg::OperatorApplyMode operator_apply_mode, const linalg::OperatorCommunicationMode operator_communication_mode)
Definition epsilon_divdiv_kerngen.hpp:216
void set_diagonal(bool v)
Definition epsilon_divdiv_kerngen.hpp:224
terra::grid::Grid4DDataMatrices< ScalarType, LocalMatrixDim, LocalMatrixDim, 2 > Grid4DDataLocalMatrices
Definition epsilon_divdiv_kerngen.hpp:64
void assemble_trial_test_vecs(const int wedge, const dense::Vec< ScalarType, VecDim > &quad_point, const ScalarType quad_weight, const ScalarT r_1, const ScalarT r_2, dense::Vec< ScalarT, 3 >(*wedge_phy_surf)[3], const dense::Vec< ScalarT, 6 > *k_local_hex, const int dimi, const int dimj, dense::Mat< ScalarType, VecDim, VecDim > *sym_grad_i, dense::Mat< ScalarType, VecDim, VecDim > *sym_grad_j, ScalarType &jdet_keval_quadweight) const
Definition epsilon_divdiv_kerngen.hpp:1787
Kokkos::TeamPolicy<>::member_type Team
Definition epsilon_divdiv_kerngen.hpp:66
static constexpr int LocalMatrixDim
Definition epsilon_divdiv_kerngen.hpp:63
dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > get_local_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
Definition epsilon_divdiv_kerngen.hpp:290
void set_stored_matrix_mode(linalg::OperatorStoredMatrixMode operator_stored_matrix_mode, int level_range, grid::Grid4DDataScalar< ScalarType > GCAElements)
Definition epsilon_divdiv_kerngen.hpp:251
bool has_flag(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, grid::shell::ShellBoundaryFlag flag) const
Definition epsilon_divdiv_kerngen.hpp:241
const grid::Grid4DDataScalar< ScalarType > & k_grid_data()
Definition epsilon_divdiv_kerngen.hpp:235
grid::Grid3DDataVec< ScalarT, 3 > get_grid()
Definition epsilon_divdiv_kerngen.hpp:238
EpsilonDivDivKerngen(const grid::shell::DistributedDomain &domain, const grid::Grid3DDataVec< ScalarT, 3 > &grid, const grid::Grid2DDataScalar< ScalarT > &radii, const grid::Grid4DDataScalar< grid::shell::ShellBoundaryFlag > &mask, const grid::Grid4DDataScalar< ScalarT > &k, BoundaryConditions bcs, bool diagonal, linalg::OperatorApplyMode operator_apply_mode=linalg::OperatorApplyMode::Replace, linalg::OperatorCommunicationMode operator_communication_mode=linalg::OperatorCommunicationMode::CommunicateAdditively, linalg::OperatorStoredMatrixMode operator_stored_matrix_mode=linalg::OperatorStoredMatrixMode::Off)
Definition epsilon_divdiv_kerngen.hpp:155
void operator()(const Team &team) const
Legacy generic team operator.
Definition epsilon_divdiv_kerngen.hpp:1759
Parallel data structure organizing the thick spherical shell metadata for distributed (MPI parallel) ...
Definition spherical_shell.hpp:2498
const std::map< SubdomainInfo, std::tuple< LocalSubdomainIdx, SubdomainNeighborhood > > & subdomains() const
Definition spherical_shell.hpp:2580
const DomainInfo & domain_info() const
Returns a const reference.
Definition spherical_shell.hpp:2577
Information about the thick spherical shell mesh.
Definition spherical_shell.hpp:780
const std::vector< double > & radii() const
Definition spherical_shell.hpp:845
int diamond_lateral_refinement_level() const
Definition spherical_shell.hpp:843
int subdomain_num_nodes_radially() const
Equivalent to calling subdomain_num_nodes_radially( subdomain_refinement_level() )
Definition spherical_shell.hpp:861
int subdomain_num_nodes_per_side_laterally() const
Equivalent to calling subdomain_num_nodes_per_side_laterally( subdomain_refinement_level() )
Definition spherical_shell.hpp:852
Static assertion: VectorQ1Scalar satisfies VectorLike concept.
Definition vector_q1.hpp:162
const grid::Grid4DDataVec< ScalarType, VecDim > & grid_data() const
Get const reference to grid data.
Definition vector_q1.hpp:280
bool has_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
Checks for presence of a local matrix for a certain element.
Definition local_matrix_storage.hpp:223
dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > get_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
Retrives the local matrix if there is stored local matrices, the desired local matrix is loaded and r...
Definition local_matrix_storage.hpp:175
void set_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge, dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > mat) const
Set the local matrix stored in the operator.
Definition local_matrix_storage.hpp:118
Timer supporting RAII scope or manual stop.
Definition timer.hpp:270
void stop()
Stop the timer and record elapsed time.
Definition timer.hpp:289
Concept for types that can be used as Galerkin coarse-grid operators in a multigrid hierarchy....
Definition operator.hpp:81
int dim
Definition EpsilonDivDiv_kernel_gen.py:66
k_eval
Definition EpsilonDivDiv_kernel_gen.py:175
g2
Definition EpsilonDivDiv_kernel_gen.py:267
div_u
Definition EpsilonDivDiv_kernel_gen.py:359
cmb_shift
Definition EpsilonDivDiv_kernel_gen.py:98
g1
Definition EpsilonDivDiv_kernel_gen.py:266
J_det
Definition EpsilonDivDiv_kernel_gen.py:216
surface_shift
Definition EpsilonDivDiv_kernel_gen.py:98
constexpr int idx(const int loop_idx, const int size, const grid::BoundaryPosition position, const grid::BoundaryDirection direction)
Definition buffer_copy_kernels.hpp:73
void send_recv_with_plan(const ShellBoundaryCommPlan< GridDataType > &plan, const GridDataType &data, SubdomainNeighborhoodSendRecvBuffer< typename GridDataType::value_type, grid::grid_data_vec_dim< GridDataType >() > &recv_buffers, CommunicationReduction reduction=CommunicationReduction::SUM)
Definition communication_plan.hpp:630
Definition boundary_mass.hpp:14
constexpr void quad_felippa_1x1_quad_points(dense::Vec< T, 3 >(&quad_points)[quad_felippa_1x1_num_quad_points])
Definition wedge/quadrature/quadrature.hpp:36
constexpr void quad_felippa_1x1_quad_weights(T(&quad_weights)[quad_felippa_1x1_num_quad_points])
Definition wedge/quadrature/quadrature.hpp:43
constexpr int quad_felippa_1x1_num_quad_points
Definition wedge/quadrature/quadrature.hpp:32
constexpr int num_nodes_per_wedge_surface
Definition kernel_helpers.hpp:6
constexpr dense::Mat< T, 3, 3 > symmetric_grad(const dense::Mat< T, 3, 3 > &J_inv_transposed, const dense::Vec< T, 3 > &quad_point, const int dof, const int dim)
Returns the symmetric gradient of the shape function of a dof at a quadrature point.
Definition integrands.hpp:671
void wedge_surface_physical_coords(dense::Vec< T, 3 >(&wedge_surf_phy_coords)[num_wedges_per_hex_cell][num_nodes_per_wedge_surface], const grid::Grid3DDataVec< T, 3 > &lateral_grid, const int local_subdomain_id, const int x_cell, const int y_cell)
Extracts the (unit sphere) surface vertex coords of the two wedges of a hex cell.
Definition kernel_helpers.hpp:26
constexpr void reorder_local_dofs(const DoFOrdering doo_from, const DoFOrdering doo_to, dense::Vec< ScalarT, 18 > &dofs)
Definition kernel_helpers.hpp:619
void atomically_add_local_wedge_vector_coefficients(const grid::Grid4DDataVec< T, VecDim > &global_coefficients, const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int d, const dense::Vec< T, 6 > local_coefficients[2])
Performs an atomic add of the two local wedge coefficient vectors of a hex cell into the global coeff...
Definition kernel_helpers.hpp:465
constexpr int num_wedges_per_hex_cell
Definition kernel_helpers.hpp:5
void extract_local_wedge_scalar_coefficients(dense::Vec< T, 6 >(&local_coefficients)[2], const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const grid::Grid4DDataScalar< T > &global_coefficients)
Extracts the local vector coefficients for the two wedges of a hex cell from the global coefficient v...
Definition kernel_helpers.hpp:306
void extract_local_wedge_vector_coefficients(dense::Vec< T, 6 >(&local_coefficients)[2], const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int d, const grid::Grid4DDataVec< T, VecDim > &global_coefficients)
Extracts the local vector coefficients for the two wedges of a hex cell from the global coefficient v...
Definition kernel_helpers.hpp:356
constexpr int num_nodes_per_wedge
Definition kernel_helpers.hpp:7
constexpr T shape(const int node_idx, const T xi, const T eta, const T zeta)
(Tensor-product) Shape function.
Definition integrands.hpp:146
constexpr dense::Mat< T, 3, 3 > jac(const dense::Vec< T, 3 > &p1_phy, const dense::Vec< T, 3 > &p2_phy, const dense::Vec< T, 3 > &p3_phy, const T r_1, const T r_2, const T xi, const T eta, const T zeta)
Definition integrands.hpp:643
dense::Vec< typename CoordsShellType::value_type, 3 > coords(const int subdomain, const int x, const int y, const int r, const CoordsShellType &coords_shell, const CoordsRadiiType &coords_radii)
Definition spherical_shell.hpp:2789
BoundaryConditionMapping[2] BoundaryConditions
Definition shell/bit_masks.hpp:37
ShellBoundaryFlag
FlagLike that indicates boundary types for the thick spherical shell.
Definition shell/bit_masks.hpp:12
BoundaryConditionFlag get_boundary_condition_flag(const BoundaryConditions bcs, ShellBoundaryFlag sbf)
Retrieve the boundary condition flag that is associated with a location in the shell e....
Definition shell/bit_masks.hpp:42
BoundaryConditionFlag
FlagLike that indicates the type of boundary condition
Definition shell/bit_masks.hpp:25
Kokkos::View< dense::Mat< ScalarType, Rows, Cols > ****[NumMatrices], Layout > Grid4DDataMatrices
Definition grid_types.hpp:46
Kokkos::View< ScalarType ***[VecDim], Layout > Grid3DDataVec
Definition grid_types.hpp:40
Kokkos::View< ScalarType ****[VecDim], Layout > Grid4DDataVec
Definition grid_types.hpp:43
Kokkos::View< ScalarType ****, Layout > Grid4DDataScalar
Definition grid_types.hpp:25
Kokkos::View< ScalarType **, Layout > Grid2DDataScalar
Definition grid_types.hpp:19
dense::Mat< ScalarType, 3, 3 > trafo_mat_cartesian_to_normal_tangential(const dense::Vec< ScalarType, 3 > &n_input)
Constructs a robust orthonormal transformation matrix from Cartesian to (normal–tangential–tangential...
Definition local_basis_trafo_normal_tangential.hpp:36
ScalarOf< Vector > dot(const Vector &y, const Vector &x)
Compute the dot product of two vectors. Implements: .
Definition vector.hpp:118
OperatorApplyMode
Modes for applying an operator to a vector.
Definition operator.hpp:30
@ Replace
Overwrite the destination vector.
OperatorStoredMatrixMode
Modes for applying stored matrices.
Definition operator.hpp:47
@ Selective
Use stored matrices on selected, marked elements only, assemble on all others.
@ Full
Use stored matrices on all elements.
@ Off
Do not use stored matrices.
OperatorCommunicationMode
Modes for communication during operator application.
Definition operator.hpp:40
@ CommunicateAdditively
Communicate and add results.
constexpr bool has_flag(E mask_value, E flag) noexcept
Checks if a bitmask value contains a specific flag.
Definition bit_masking.hpp:43
detail::PrefixCout logroot([]() { return detail::log_prefix();})
std::ostream subclass that just logs on root and adds a timestamp for each line.
Definition mat.hpp:10
constexpr Mat diagonal() const
Definition mat.hpp:377
constexpr Mat< T, Cols, Rows > transposed() const
Definition mat.hpp:187
Mat & hadamard_product(const Mat &mat)
Definition mat.hpp:213
T double_contract(const Mat &mat)
Definition mat.hpp:226