// epsilon_divdiv_kerngen_v08_scalar_coalesced.hpp
#pragma once

#include "../../../quadrature/quadrature.hpp"
#include "dense/vec.hpp"
#include "impl/Kokkos_Profiling.hpp"
#include "linalg/operator.hpp"
#include "linalg/vector.hpp"
#include "linalg/vector_q1.hpp"
#include "util/timer.hpp"

// (Additional includes and namespace/using declarations were elided in extraction.)

/**
 * @brief Matrix-free / matrix-based epsilon-div-div operator on wedge elements in a spherical shell.
 *
 * This class supports three execution paths:
 *
 * 1) Slow path (local matrix path)
 *    - Used if local matrices are stored (full or selective)
 *    - Reuses assembled/stored 18x18 local matrices
 *
 * 2) Fast path: Dirichlet/Neumann
 *    - Matrix-free
 *    - Handles Dirichlet by skipping constrained face-node couplings (and diagonal treatment if requested)
 *    - Neumann is naturally included
 *
 * 3) Fast path: Free-slip
 *    - Matrix-free
 *    - Applies trial- and test-side tangential projections on free-slip boundaries
 *    - Preserves behavior consistent with the slow path for iterative solves
 *
 * IMPORTANT DESIGN CHANGE:
 * ------------------------
 * The path decision is made on the HOST and cached in `kernel_path_`.
 * `apply_impl()` dispatches to a different kernel launch per path.
 * This avoids a runtime branch inside the hot device kernel.
 */
template < typename ScalarT, int VecDim = 3 >
class EpsilonDivDiv // NOTE: the class-declaration line was elided in extraction; this name is a
                    // placeholder inferred from the "[EpsilonDivDiv]" log tags below.
{
 public:
    using ScalarType = ScalarT;
    // (Alias lines elided in extraction here; among them are the SrcVectorType/DstVectorType
    // aliases used by apply_impl() below.)
    static constexpr int LocalMatrixDim = 18;
    using Team = Kokkos::TeamPolicy<>::member_type;

 private:
    // Optional element-local matrix storage (GCA/coarsening/explicit local-matrix mode)
    LocalMatrixStorage local_matrix_storage_;

    // Domain and geometry / coefficients
    const grid::shell::DistributedDomain& domain_; // declaration reconstructed (line elided in extraction); type taken from get_domain()
    grid::Grid3DDataVec< ScalarT, 3 > grid_;                        ///< Lateral shell geometry (unit sphere coords)
    grid::Grid2DDataScalar< ScalarT > radii_;                       ///< Radial coordinates per local subdomain
    grid::Grid4DDataScalar< ScalarType > k_;                        ///< Scalar coefficient field
    grid::Grid4DDataScalar< grid::shell::ShellBoundaryFlag > mask_; ///< Boundary flags per cell/node
    BoundaryConditions bcs_; ///< CMB and SURFACE boundary conditions

    bool diagonal_; ///< If true, apply diagonal-only action

    linalg::OperatorApplyMode operator_apply_mode_;
    linalg::OperatorCommunicationMode operator_communication_mode_;
    linalg::OperatorStoredMatrixMode operator_stored_matrix_mode_;

    // (Communication-member declarations elided in extraction; the constructor initializes
    // recv_buffers_( domain ) and comm_plan_( domain ).)

    // Device views captured by kernels
    // (src_/dst_ grid-data view declarations elided in extraction; assigned in apply_impl().)

    // Quadrature (Felippa 1x1 on wedge)
    const int num_quad_points = quadrature::quad_felippa_1x1_num_quad_points;
    // (Quadrature point/weight declarations elided in extraction.)

    // Local subdomain extents (in cells)
    int local_subdomains_;
    int hex_lat_;
    int hex_rad_;
    int lat_refinement_level_;

    // Team-policy tiling (one team handles a slab of lat_tile_ x lat_tile_ x r_tile_ cells)
    int lat_tile_;
    int r_tile_;
    int lat_tiles_;
    int r_tiles_;
    int team_size_;
    int blocks_;

    ScalarT r_max_;
    ScalarT r_min_;

    /**
     * @brief Kernel path selected on host.
     *
     * This value is computed whenever BCs or stored-matrix mode change and is then
     * used by `apply_impl()` to select which kernel launch to issue.
     */
    enum class KernelPath
    {
        Slow,
        FastDirichletNeumann,
        FastFreeslip,
    };
    KernelPath kernel_path_ = KernelPath::FastDirichletNeumann;

 private:
    /**
     * @brief Recompute the kernel path on host.
     *
     * Rules:
     * - Any stored local matrix mode => slow path
     * - Else if any free-slip BC on CMB or SURFACE => fast free-slip path
     * - Else => fast Dirichlet/Neumann path
     */
    void update_kernel_path_flag_host_only()
    {
        const BoundaryConditionFlag cmb_bc     = get_boundary_condition_flag( bcs_, CMB );
        const BoundaryConditionFlag surface_bc = get_boundary_condition_flag( bcs_, SURFACE );

        const bool has_freeslip        = ( cmb_bc == FREESLIP ) || ( surface_bc == FREESLIP );
        const bool has_stored_matrices = ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off );

        if ( has_stored_matrices )
            kernel_path_ = KernelPath::Slow;
        else if ( has_freeslip )
            kernel_path_ = KernelPath::FastFreeslip;
        else
            kernel_path_ = KernelPath::FastDirichletNeumann;
    }

 public:
    // NOTE: the constructor's declaration line and several parameter lines were elided in
    // extraction; the name and the parameters marked "inferred" below are reconstructed
    // from the member-initializer list.
    EpsilonDivDiv(
        const grid::shell::DistributedDomain& domain,
        grid::Grid3DDataVec< ScalarT, 3 > grid,                        // inferred
        grid::Grid2DDataScalar< ScalarT > radii,                       // inferred
        grid::Grid4DDataScalar< ScalarType > k,                        // inferred
        grid::Grid4DDataScalar< grid::shell::ShellBoundaryFlag > mask, // inferred
        BoundaryConditions bcs,
        bool diagonal,
        linalg::OperatorApplyMode operator_apply_mode,                 // inferred
        linalg::OperatorCommunicationMode operator_communication_mode, // a default value was present here but was elided in extraction
        linalg::OperatorStoredMatrixMode operator_stored_matrix_mode ) // inferred
    : domain_( domain )
    , grid_( grid )
    , radii_( radii )
    , k_( k )
    , mask_( mask )
    , diagonal_( diagonal )
    , operator_apply_mode_( operator_apply_mode )
    , operator_communication_mode_( operator_communication_mode )
    , operator_stored_matrix_mode_( operator_stored_matrix_mode )
    , recv_buffers_( domain )
    , comm_plan_( domain )
    {
        bcs_[0] = bcs[0];
        bcs_[1] = bcs[1];

        const grid::shell::DomainInfo& domain_info = domain_.domain_info();
        local_subdomains_ = domain_.subdomains().size();
        hex_lat_ = domain_info.subdomain_num_nodes_per_side_laterally() - 1;
        hex_rad_ = domain_info.subdomain_num_nodes_radially() - 1;
        lat_refinement_level_ = domain_info.diamond_lateral_refinement_level();

        lat_tile_ = 4;
        r_tile_   = 16;

        lat_tiles_ = ( hex_lat_ + lat_tile_ - 1 ) / lat_tile_;
        r_tiles_   = ( hex_rad_ + r_tile_ - 1 ) / r_tile_;

        team_size_ = lat_tile_ * lat_tile_ * r_tile_;
        blocks_    = local_subdomains_ * lat_tiles_ * lat_tiles_ * r_tiles_;
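        // With the defaults above this gives team_size_ = 4 * 4 * 16 = 256 threads per team.
        // The tile counts are ceil-divisions, so partial tiles at subdomain edges are still
        // covered; threads that land outside the subdomain bail out inside the kernels.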

        r_min_ = domain_info.radii()[0];
        r_max_ = domain_info.radii()[domain_info.radii().size() - 1];

        // Host-side path selection (no in-kernel path branching)
        update_kernel_path_flag_host_only();

        util::logroot << "[EpsilonDivDiv] tile size (x,y,r)=(" << lat_tile_ << "," << lat_tile_ << "," << r_tile_ << ")"
                      << std::endl;
        util::logroot << "[EpsilonDivDiv] number of tiles (x,y,r)=(" << lat_tiles_ << "," << lat_tiles_ << ","
                      << r_tiles_ << "), team_size=" << team_size_ << ", blocks=" << blocks_ << std::endl;
        const char* path_name = ( kernel_path_ == KernelPath::Slow )         ? "slow" :
                                ( kernel_path_ == KernelPath::FastFreeslip ) ? "fast-freeslip" :
                                                                               "fast-dirichlet-neumann";
        util::logroot << "[EpsilonDivDiv] kernel path = " << path_name << std::endl;
    }

    // NOTE: the declaration line of this setter (return type and name) was elided in
    // extraction; `set_operator_modes` is a placeholder name.
    void set_operator_modes(
        const linalg::OperatorApplyMode operator_apply_mode,
        const linalg::OperatorCommunicationMode operator_communication_mode )
    {
        operator_apply_mode_ = operator_apply_mode;
        operator_communication_mode_ = operator_communication_mode;
    }

    void set_diagonal( bool v ) { diagonal_ = v; }

    void set_boundary_conditions( BoundaryConditions bcs )
    {
        bcs_[0] = bcs[0];
        bcs_[1] = bcs[1];

        // Recompute host-side dispatch mode whenever BCs change
        update_kernel_path_flag_host_only();
    }
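    // Note: a BC change (e.g. switching a boundary to FREESLIP) takes effect on the next
    // apply_impl(), which then launches the kernel for the newly cached path.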

    const grid::shell::DistributedDomain& get_domain() const { return domain_; }

    KOKKOS_INLINE_FUNCTION
    bool has_flag( // declaration line elided in extraction; name and return type taken from the call sites
        const int local_subdomain_id,
        const int x_cell,
        const int y_cell,
        const int r_cell,
        const grid::shell::ShellBoundaryFlag flag ) const // parameter line reconstructed (elided in extraction)
    {
        return util::has_flag( mask_( local_subdomain_id, x_cell, y_cell, r_cell ), flag );
    }

    // NOTE: the declaration line was elided in extraction; the name is taken from the log
    // message below, and the type of the GCAElements parameter was also lost (a template
    // parameter stands in for it here).
    template < typename GCAElementsT >
    void set_stored_matrix_mode(
        linalg::OperatorStoredMatrixMode operator_stored_matrix_mode,
        int level_range,
        const GCAElementsT& GCAElements )
    {
        operator_stored_matrix_mode_ = operator_stored_matrix_mode;

        if ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off )
        {
            // Call line reconstructed (elided in extraction): (re)allocate the local-matrix storage.
            local_matrix_storage_ = LocalMatrixStorage(
                domain_, operator_stored_matrix_mode_, level_range, GCAElements );
        }

        // Recompute host-side dispatch mode whenever storage mode changes
        update_kernel_path_flag_host_only();

        util::logroot << "[EpsilonDivDiv] (set_stored_matrix_mode) kernel path = "
                      << ( ( kernel_path_ == KernelPath::Slow )         ? "slow" :
                           ( kernel_path_ == KernelPath::FastFreeslip ) ? "fast-freeslip" :
                                                                          "fast-dirichlet-neumann" )
                      << std::endl;
    }

    linalg::OperatorStoredMatrixMode get_stored_matrix_mode() { return operator_stored_matrix_mode_; }

    KOKKOS_INLINE_FUNCTION
    void set_matrix( // declaration line elided in extraction; name taken from the storage call below
        const int local_subdomain_id,
        const int x_cell,
        const int y_cell,
        const int r_cell,
        const int wedge,
        const dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim >& mat ) // parameter reconstructed (line elided)
    {
        KOKKOS_ASSERT( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off );
        local_matrix_storage_.set_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge, mat );
    }

    KOKKOS_INLINE_FUNCTION
    dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > get_matrix( // name/return type inferred; declaration line elided in extraction
        const int local_subdomain_id,
        const int x_cell,
        const int y_cell,
        const int r_cell,
        const int wedge ) const
    {
        if ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off )
        {
            if ( !local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge ) )
            {
                Kokkos::abort( "No matrix found at that spatial index." );
            }
            return local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge );
        }
        return assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge );
    }

    /**
     * @brief Apply the operator: dst <- Op(src) (or additive, depending on mode).
     *
     * Host-side responsibilities:
     * - Handle replace/add mode initialization
     * - Cache src/dst views for kernel capture
     * - Build team policy
     * - Dispatch to exactly one kernel variant based on `kernel_path_`
     * - Optional halo communication accumulation
     *
     * This is where the path decision now lives (host), so device code does not branch
     * on `kernel_path_` in the hot path.
     */
    void apply_impl( const SrcVectorType& src, DstVectorType& dst )
    {
        util::Timer timer_apply( "epsilon_divdiv_apply" );

        if ( operator_apply_mode_ == linalg::OperatorApplyMode::Replace )
        {
            assign( dst, 0 );
        }

        dst_ = dst.grid_data();
        src_ = src.grid_data();

        util::Timer timer_kernel( "epsilon_divdiv_kernel" );
        Kokkos::TeamPolicy<> policy( blocks_, team_size_ );

        // Fast paths use shared-memory slab staging
        if ( kernel_path_ != KernelPath::Slow )
        {
            policy.set_scratch_size( 0, Kokkos::PerTeam( team_shmem_size( team_size_ ) ) );
        }

        // Host-side dispatch => no per-team path branching in device code
        if ( kernel_path_ == KernelPath::Slow )
        {
            Kokkos::parallel_for(
                "epsilon_divdiv_apply_kernel_slow", policy, KOKKOS_CLASS_LAMBDA( const Team& team ) {
                    this->run_team_slow( team );
                } );
        }
        else if ( kernel_path_ == KernelPath::FastFreeslip )
        {
            Kokkos::parallel_for(
                "epsilon_divdiv_apply_kernel_fast_freeslip", policy, KOKKOS_CLASS_LAMBDA( const Team& team ) {
                    this->run_team_fast_freeslip( team );
                } );
        }
        else
        {
            // Launch bounds: 256 threads/block, min 3 blocks/SM to reduce register pressure.
            // (The LaunchBounds template argument was lost in extraction; restored here to
            // match this comment.)
            Kokkos::TeamPolicy< Kokkos::LaunchBounds< 256, 3 > > dn_policy( blocks_, team_size_ );
            dn_policy.set_scratch_size( 0, Kokkos::PerTeam( team_shmem_size( team_size_ ) ) );
            Kokkos::parallel_for(
                "epsilon_divdiv_apply_kernel_fast_dirichlet_neumann",
                dn_policy,
                KOKKOS_CLASS_LAMBDA( const Team& team ) { this->run_team_fast_dirichlet_neumann( team ); } );
        }

        Kokkos::fence();
        timer_kernel.stop();

        if ( operator_communication_mode_ == linalg::OperatorCommunicationMode::CommunicateAdditively )
        {
            util::Timer timer_comm( "epsilon_divdiv_comm" );
            terra::communication::shell::send_recv_with_plan( comm_plan_, dst_, recv_buffers_ );
        }
    }
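    // Usage sketch (hedged; SrcVectorType/DstVectorType are whatever the elided aliases name):
    //
    //   op.set_boundary_conditions( bcs ); // recomputes kernel_path_ on the host
    //   op.apply_impl( u, Au );            // exactly one kernel launch for the cached path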

    /**
     * @brief Convert one gradient column into symmetric-gradient and div contributions.
     *
     * Given grad(phi e_dim), this helper computes:
     * - diagonal entries (E00,E11,E22)
     * - off-diagonal symmetric entries (sym01,sym02,sym12)
     * - divergence contribution `gdd`
     *
     * The fast kernels use this repeatedly in fused operator application.
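     *
     * Example (dim == 0): for grad(phi e_0) = (g0, g1, g2),
     *   eps = [ g0    g1/2  g2/2 ]
     *         [ g1/2  0     0    ]
     *         [ g2/2  0     0    ],   divergence contribution gdd = g0,
     * i.e. E00 = g0, sym01 = 0.5*g1, sym02 = 0.5*g2, and E11 = E22 = sym12 = 0.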
     */
    KOKKOS_INLINE_FUNCTION
    void column_grad_to_sym( // declaration line elided in extraction; name taken from the call sites
        const int dim,
        const double g0,
        const double g1,
        const double g2,
        double& E00,
        double& E11,
        double& E22,
        double& sym01,
        double& sym02,
        double& sym12,
        double& gdd ) const
    {
        E00 = E11 = E22 = sym01 = sym02 = sym12 = gdd = 0.0;

        switch ( dim )
        {
        case 0:
            E00   = g0;
            gdd   = g0;
            sym01 = 0.5 * g1;
            sym02 = 0.5 * g2;
            break;
        case 1:
            E11   = g1;
            gdd   = g1;
            sym01 = 0.5 * g0;
            sym12 = 0.5 * g2;
            break;
        default:
            E22   = g2;
            gdd   = g2;
            sym02 = 0.5 * g0;
            sym12 = 0.5 * g1;
            break;
        }
    }

    /**
     * @brief Team scratch memory size for fast paths.
     *
     * Layout per team:
     * [coords_sh | src_sh | k_sh | r_sh | padding]
     */
    KOKKOS_INLINE_FUNCTION
    size_t team_shmem_size( const int ts ) const
    {
        const int nlev = r_tile_ + 1;
        const int n    = lat_tile_ + 1;
        const int nxy  = n * n;

        const size_t nscalars =
            size_t( nxy ) * 3 + size_t( nxy ) * 3 * nlev + size_t( nxy ) * nlev + size_t( nlev ) + 1;
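        // Example (with the constructor defaults lat_tile_ = 4, r_tile_ = 16):
        //   n = 5, nxy = 25, nlev = 17
        //   nscalars = 75 + 1275 + 425 + 17 + 1 = 1793
        //   => 1793 * 8 bytes = 14344 bytes (~14 KiB) of team scratch for ScalarType = double.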

        return sizeof( ScalarType ) * nscalars;
    }

 private:
    /**
     * @brief Decode a team league rank / team rank into:
     *        - tile origin (x0,y0,r0)
     *        - intra-tile thread coordinates (tx,ty,tr)
     *        - target cell (x_cell,y_cell,r_cell)
     */
    KOKKOS_INLINE_FUNCTION
    void decode_team_indices(
        const Team& team,
        int& local_subdomain_id,
        int& x0,
        int& y0,
        int& r0,
        int& tx,
        int& ty,
        int& tr,
        int& x_cell,
        int& y_cell,
        int& r_cell ) const
    {
        int tmp = team.league_rank();

        const int r_tile_id = tmp % r_tiles_;
        tmp /= r_tiles_;

        const int lat_y_id = tmp % lat_tiles_;
        tmp /= lat_tiles_;

        const int lat_x_id = tmp % lat_tiles_;
        tmp /= lat_tiles_;

        local_subdomain_id = tmp;

        x0 = lat_x_id * lat_tile_;
        y0 = lat_y_id * lat_tile_;
        r0 = r_tile_id * r_tile_;

        const int tid = team.team_rank();
        tr = tid % r_tile_;
        tx = ( tid / r_tile_ ) % lat_tile_;
        ty = tid / ( r_tile_ * lat_tile_ );
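        // Example (lat_tile_ = 4, r_tile_ = 16, team_size_ = 256):
        //   tid = 77  =>  tr = 77 % 16 = 13, tx = (77 / 16) % 4 = 0, ty = 77 / 64 = 1.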

        x_cell = x0 + tx;
        y_cell = y0 + ty;
        r_cell = r0 + tr;
    }

    // -------------------------------------------------------------------------
    // Path-specific team entry points (NO path branching)
    // These are the functions called directly by host-dispatched kernel launches.
    // -------------------------------------------------------------------------

    /**
     * @brief Team entry for slow (local-matrix) path.
     *
     * This wrapper performs only:
     * - team index decoding
     * - boundary flag queries
     * - forwarding to operator_slow_path()
     *
     * No dispatch branching happens here.
     */
    KOKKOS_INLINE_FUNCTION
    void run_team_slow( const Team& team ) const
    {
        int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
        decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );

        if ( tr >= r_tile_ )
            return;

        const bool at_cmb     = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
        const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );

        operator_slow_path(
            team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
    }

    /**
     * @brief Team entry for fast Dirichlet/Neumann matrix-free path.
     */
    KOKKOS_INLINE_FUNCTION
    void run_team_fast_dirichlet_neumann( const Team& team ) const
    {
        int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
        decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );

        if ( tr >= r_tile_ )
            return;

        const bool at_cmb     = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
        const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );

        operator_fast_dirichlet_neumann_path(
            team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
    }

    /**
     * @brief Team entry for fast free-slip matrix-free path.
     */
    KOKKOS_INLINE_FUNCTION
    void run_team_fast_freeslip( const Team& team ) const
    {
        int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
        decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );

        if ( tr >= r_tile_ )
            return;

        const bool at_cmb     = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
        const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );

        operator_fast_freeslip_path(
            team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
    }

    // ===================== SLOW PATH =====================
    KOKKOS_INLINE_FUNCTION
    void operator_slow_path(
        const Team& team,
        const int local_subdomain_id,
        const int x0,
        const int y0,
        const int r0,
        const int tx,
        const int ty,
        const int tr,
        const int x_cell,
        const int y_cell,
        const int r_cell,
        const bool at_cmb,
        const bool at_surface ) const
    {
        (void) team;
        (void) x0;
        (void) y0;
        (void) r0;
        (void) tx;
        (void) ty;
        (void) tr;

        if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ || r_cell >= hex_rad_ )
            return;

        // Declaration reconstructed (original line elided in extraction): the two 18x18
        // per-wedge matrices applied below.
        dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > A[num_wedges_per_hex_cell];

        if ( operator_stored_matrix_mode_ == linalg::OperatorStoredMatrixMode::Full )
        {
            A[0] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
            A[1] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
        }
        else if ( operator_stored_matrix_mode_ == linalg::OperatorStoredMatrixMode::Selective )
        {
            if ( local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 ) &&
                 local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 ) )
            {
                A[0] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
                A[1] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
            }
            else
            {
                A[0] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
                A[1] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
            }
        }
        else
        {
            A[0] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
            A[1] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
        }

        dense::Vec< ScalarT, 18 > src[num_wedges_per_hex_cell];
        for ( int dimj = 0; dimj < 3; dimj++ )
        {
            dense::Vec< ScalarT, 6 > src_d[num_wedges_per_hex_cell];
            extract_local_wedge_vector_coefficients( src_d, local_subdomain_id, x_cell, y_cell, r_cell, dimj, src_ );

            for ( int wedge = 0; wedge < num_wedges_per_hex_cell; wedge++ )
            {
                for ( int i = 0; i < num_nodes_per_wedge; i++ )
                {
                    src[wedge]( dimj * num_nodes_per_wedge + i ) = src_d[wedge]( i );
                }
            }
        }

        dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > boundary_mask;
        boundary_mask.fill( 1.0 );

        bool freeslip_reorder = false;
        dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > R[num_wedges_per_hex_cell];

        if ( at_cmb || at_surface )
        {
            ShellBoundaryFlag sbf = at_cmb ? CMB : SURFACE;
            // Reconstructed (original line elided in extraction): look up the BC type on that boundary.
            const BoundaryConditionFlag bcf = get_boundary_condition_flag( bcs_, sbf );

            if ( bcf == DIRICHLET )
            {
                for ( int dimi = 0; dimi < 3; ++dimi )
                {
                    for ( int dimj = 0; dimj < 3; ++dimj )
                    {
                        for ( int i = 0; i < num_nodes_per_wedge; i++ )
                        {
                            for ( int j = 0; j < num_nodes_per_wedge; j++ )
                            {
                                if ( ( at_cmb && ( ( dimi == dimj && i != j && ( i < 3 || j < 3 ) ) ||
                                                   ( dimi != dimj && ( i < 3 || j < 3 ) ) ) ) ||
                                     ( at_surface && ( ( dimi == dimj && i != j && ( i >= 3 || j >= 3 ) ) ||
                                                       ( dimi != dimj && ( i >= 3 || j >= 3 ) ) ) ) )
                                {
                                    boundary_mask( i + dimi * num_nodes_per_wedge, j + dimj * num_nodes_per_wedge ) =
                                        0.0;
                                }
                            }
                        }
                    }
                }
            }
            else if ( bcf == FREESLIP )
            {
                freeslip_reorder = true;
                dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > A_tmp[num_wedges_per_hex_cell] = { 0 };

                for ( int wedge = 0; wedge < 2; ++wedge )
                {
                    for ( int dimi = 0; dimi < 3; ++dimi )
                    {
                        for ( int node_idxi = 0; node_idxi < num_nodes_per_wedge; node_idxi++ )
                        {
                            for ( int dimj = 0; dimj < 3; ++dimj )
                            {
                                for ( int node_idxj = 0; node_idxj < num_nodes_per_wedge; node_idxj++ )
                                {
                                    A_tmp[wedge]( node_idxi * 3 + dimi, node_idxj * 3 + dimj ) = A[wedge](
                                        node_idxi + dimi * num_nodes_per_wedge,
                                        node_idxj + dimj * num_nodes_per_wedge );
                                }
                            }
                        }
                    }
                }

                constexpr int layer_hex_offset_x[2][3] = { { 0, 1, 0 }, { 1, 0, 1 } };
                constexpr int layer_hex_offset_y[2][3] = { { 0, 0, 1 }, { 1, 1, 0 } };

                for ( int wedge = 0; wedge < 2; ++wedge )
                {
                    for ( int i = 0; i < LocalMatrixDim; ++i )
                    {
                        R[wedge]( i, i ) = 1.0;
                    }

                    for ( int boundary_node_idx = 0; boundary_node_idx < 3; boundary_node_idx++ )
                    {
                        dense::Vec< double, 3 > normal = grid::shell::coords(
                            local_subdomain_id,
                            x_cell + layer_hex_offset_x[wedge][boundary_node_idx],
                            y_cell + layer_hex_offset_y[wedge][boundary_node_idx],
                            r_cell + ( at_cmb ? 0 : 1 ),
                            grid_,
                            radii_ );

                        auto R_i = trafo_mat_cartesian_to_normal_tangential( normal );

                        const int offset_in_R = at_cmb ? 0 : 9;
                        for ( int dimi = 0; dimi < 3; ++dimi )
                        {
                            for ( int dimj = 0; dimj < 3; ++dimj )
                            {
                                R[wedge](
                                    offset_in_R + boundary_node_idx * 3 + dimi,
                                    offset_in_R + boundary_node_idx * 3 + dimj ) = R_i( dimi, dimj );
                            }
                        }
                    }

                    A[wedge] = R[wedge] * A_tmp[wedge] * R[wedge].transposed();

                    auto src_tmp = R[wedge] * src[wedge];
                    for ( int i = 0; i < 18; ++i )
                    {
                        src[wedge]( i ) = src_tmp( i );
                    }

                    const int node_start = at_surface ? 3 : 0;
                    const int node_end   = at_surface ? 6 : 3;

                    for ( int node_idx = node_start; node_idx < node_end; node_idx++ )
                    {
                        const int idx = node_idx * 3;
                        for ( int k = 0; k < 18; ++k )
                        {
                            if ( k != idx )
                            {
                                boundary_mask( idx, k ) = 0.0;
                                boundary_mask( k, idx ) = 0.0;
                            }
                        }
                    }
                }
            }
        }

        for ( int wedge = 0; wedge < num_wedges_per_hex_cell; wedge++ )
        {
            A[wedge].hadamard_product( boundary_mask );
        }

        if ( diagonal_ )
        {
            A[0] = A[0].diagonal();
            A[1] = A[1].diagonal();
        }

        dense::Vec< ScalarT, LocalMatrixDim > dst[num_wedges_per_hex_cell];
        dst[0] = A[0] * src[0];
        dst[1] = A[1] * src[1];

        if ( freeslip_reorder )
        {
            dense::Vec< ScalarT, LocalMatrixDim > dst_tmp[num_wedges_per_hex_cell];
            dst_tmp[0] = R[0].transposed() * dst[0];
            dst_tmp[1] = R[1].transposed() * dst[1];

            for ( int i = 0; i < 18; ++i )
            {
                dst[0]( i ) = dst_tmp[0]( i );
                dst[1]( i ) = dst_tmp[1]( i );
            }
        }

        for ( int dimi = 0; dimi < 3; dimi++ )
        {
            dense::Vec< ScalarT, 6 > dst_d[num_wedges_per_hex_cell];
            dst_d[0] = dst[0].template slice< 6 >( dimi * num_nodes_per_wedge );
            dst_d[1] = dst[1].template slice< 6 >( dimi * num_nodes_per_wedge );

            // Call line reconstructed (elided in extraction); this hypothetical name is the
            // scatter counterpart of extract_local_wedge_vector_coefficients above.
            add_local_wedge_vector_coefficients(
                dst_, local_subdomain_id, x_cell, y_cell, r_cell, dimi, dst_d );
        }
    }

    // ===================== FAST DIRICHLET/NEUMANN PATH =====================
    KOKKOS_INLINE_FUNCTION
    void operator_fast_dirichlet_neumann_path(
        const Team& team,
        const int local_subdomain_id,
        const int x0,
        const int y0,
        const int r0,
        const int tx,
        const int ty,
        const int tr,
        const int x_cell,
        const int y_cell,
        const int r_cell,
        const bool at_cmb,
        const bool at_surface ) const
    {
        const int nlev = r_tile_ + 1;
        const int nxy  = ( lat_tile_ + 1 ) * ( lat_tile_ + 1 );

        double* shmem =
            reinterpret_cast< double* >( team.team_shmem().get_shmem( team_shmem_size( team.team_size() ) ) );

        using ScratchCoords =
            Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
        using ScratchSrc = Kokkos::
            View< double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
        using ScratchK =
            Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;

        ScratchCoords coords_sh( shmem, nxy, 3 );
        shmem += nxy * 3;

        ScratchSrc src_sh( shmem, nxy, 3, nlev );
        shmem += nxy * 3 * nlev;

        ScratchK k_sh( shmem, nxy, nlev );
        shmem += nxy * nlev;

        auto r_sh =
            Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >(
                shmem, nlev );

        auto node_id = [&]( int nx, int ny ) -> int { return nx + ( lat_tile_ + 1 ) * ny; };

        Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&]( int n ) {
            const int dxn = n % ( lat_tile_ + 1 );
            const int dyn = n / ( lat_tile_ + 1 );
            const int xi  = x0 + dxn;
            const int yi  = y0 + dyn;

            if ( xi <= hex_lat_ && yi <= hex_lat_ )
            {
                coords_sh( n, 0 ) = grid_( local_subdomain_id, xi, yi, 0 );
                coords_sh( n, 1 ) = grid_( local_subdomain_id, xi, yi, 1 );
                coords_sh( n, 2 ) = grid_( local_subdomain_id, xi, yi, 2 );
            }
            else
            {
                coords_sh( n, 0 ) = coords_sh( n, 1 ) = coords_sh( n, 2 ) = 0.0;
            }
        } );

        Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&]( int lvl ) {
            const int rr = r0 + lvl;
            r_sh( lvl )  = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
        } );

        const int total_pairs = nxy * nlev;
        Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total_pairs ), [&]( int t ) {
            const int node = t / nlev;
            const int lvl  = t - node * nlev;

            const int dxn = node % ( lat_tile_ + 1 );
            const int dyn = node / ( lat_tile_ + 1 );

            const int xi = x0 + dxn;
            const int yi = y0 + dyn;
            const int rr = r0 + lvl;

            if ( xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_ )
            {
                k_sh( node, lvl )      = k_( local_subdomain_id, xi, yi, rr );
                src_sh( node, 0, lvl ) = src_( local_subdomain_id, xi, yi, rr, 0 );
                src_sh( node, 1, lvl ) = src_( local_subdomain_id, xi, yi, rr, 1 );
                src_sh( node, 2, lvl ) = src_( local_subdomain_id, xi, yi, rr, 2 );
            }
            else
            {
                k_sh( node, lvl )      = 0.0;
                src_sh( node, 0, lvl ) = src_sh( node, 1, lvl ) = src_sh( node, 2, lvl ) = 0.0;
            }
        } );

        team.team_barrier();

        if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ || r_cell >= hex_rad_ )
            return;

        const int lvl0   = tr;
        const double r_0 = r_sh( lvl0 );
        const double r_1 = r_sh( lvl0 + 1 );

        const bool at_boundary        = at_cmb || at_surface;
        bool treat_boundary_dirichlet = false;
        if ( at_boundary )
        {
            const ShellBoundaryFlag sbf = at_cmb ? CMB : SURFACE;
            treat_boundary_dirichlet    = ( get_boundary_condition_flag( bcs_, sbf ) == DIRICHLET );
        }

        const int cmb_shift = ( ( at_boundary && treat_boundary_dirichlet && ( !diagonal_ ) && at_cmb ) ? 3 : 0 );
        const int surface_shift =
            ( ( at_boundary && treat_boundary_dirichlet && ( !diagonal_ ) && at_surface ) ? 3 : 0 );
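        // With Dirichlet at a boundary (and !diagonal_), the shift removes that face's three
        // nodes from the coupled trial/test loops below; the diagonal-only block further down
        // runs over exactly the complementary range [surface_shift, 6 - cmb_shift), so
        // constrained rows/columns keep only their diagonal entry, matching the slow path's
        // boundary_mask.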

        static constexpr int WEDGE_NODE_OFF[2][6][3] = {
            { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
            { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };

        static constexpr int WEDGE_TO_UNIQUE[2][6] = {
            { 0, 1, 2, 3, 4, 5 },
            { 6, 2, 1, 7, 5, 4 }
        };
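        // WEDGE_NODE_OFF[w][n] is the (dx, dy, dr) offset of wedge w's node n within the hex
        // cell (two wedges triangulate the lateral quad). WEDGE_TO_UNIQUE maps wedge-local
        // nodes onto the cell's 8 unique nodes; the shared lateral edge nodes (unique ids
        // 1, 2, 4, 5) receive contributions from both wedges.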

        constexpr double ONE_THIRD      = 1.0 / 3.0;
        constexpr double ONE_SIXTH      = 1.0 / 6.0;
        constexpr double NEG_TWO_THIRDS = -2.0 / 3.0; // same double value as the literal -0.66666666666666663

        static constexpr double dN_ref[6][3] = {
            { -0.5, -0.5, -ONE_SIXTH },
            { 0.5, 0.0, -ONE_SIXTH },
            { 0.0, 0.5, -ONE_SIXTH },
            { -0.5, -0.5, ONE_SIXTH },
            { 0.5, 0.0, ONE_SIXTH },
            { 0.0, 0.5, ONE_SIXTH } };
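        // dN_ref[n] = reference-space gradient of wedge shape function n at the single
        // Felippa 1x1 quadrature point (triangle centroid, mid-height); with one quadrature
        // point these are compile-time constants.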

        const int n00 = node_id( tx, ty );
        const int n01 = node_id( tx, ty + 1 );
        const int n10 = node_id( tx + 1, ty );
        const int n11 = node_id( tx + 1, ty + 1 );

        for ( int w = 0; w < 2; ++w )
        {
            const int v0 = w == 0 ? n00 : n11;
            const int v1 = w == 0 ? n10 : n01;
            const int v2 = w == 0 ? n01 : n10;

            double k_sum = 0.0;
#pragma unroll
            for ( int node = 0; node < 6; ++node )
            {
                const int ddx = WEDGE_NODE_OFF[w][node][0];
                const int ddy = WEDGE_NODE_OFF[w][node][1];
                const int ddr = WEDGE_NODE_OFF[w][node][2];

                const int nid = node_id( tx + ddx, ty + ddy );
                const int lvl = lvl0 + ddr;

                k_sum += k_sh( nid, lvl );
            }
            const double k_eval = ONE_SIXTH * k_sum;

            double wJ = 0.0;
            double i00, i01, i02;
            double i10, i11, i12;
            double i20, i21, i22;

            {
                const double half_dr = 0.5 * ( r_1 - r_0 );
                const double r_mid   = 0.5 * ( r_0 + r_1 );

                const double J_0_0 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v1, 0 ) );
                const double J_0_1 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v2, 0 ) );
                const double J_0_2 = half_dr * ( ONE_THIRD * ( coords_sh( v0, 0 ) + coords_sh( v1, 0 ) + coords_sh( v2, 0 ) ) );

                const double J_1_0 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v1, 1 ) );
                const double J_1_1 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v2, 1 ) );
                const double J_1_2 = half_dr * ( ONE_THIRD * ( coords_sh( v0, 1 ) + coords_sh( v1, 1 ) + coords_sh( v2, 1 ) ) );

                const double J_2_0 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v1, 2 ) );
                const double J_2_1 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v2, 2 ) );
                const double J_2_2 = half_dr * ( ONE_THIRD * ( coords_sh( v0, 2 ) + coords_sh( v1, 2 ) + coords_sh( v2, 2 ) ) );

                const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 - J_0_1 * J_1_0 * J_2_2 +
                                     J_0_1 * J_1_2 * J_2_0 + J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;

                const double invJ = 1.0 / J_det;
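
                // Inverse transpose via cofactors: i_ab = cofactor_ab(J) / det(J), i.e. the
                // matrix (i_ab) equals J^{-T}. Applying it to dN_ref gives physical gradients;
                // wJ = |det J| is the volume weight of the single quadrature point.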
982
983 i00 = invJ * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
984 i01 = invJ * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
985 i02 = invJ * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
986 i10 = invJ * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
987 i11 = invJ * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
988 i12 = invJ * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
989 i20 = invJ * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
990 i21 = invJ * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
991 i22 = invJ * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
992
993 wJ = Kokkos::abs( J_det );
994 }

            const double kwJ = k_eval * wJ;

            // Per-wedge output accumulator (scatter after each wedge to reduce register pressure)
            double dst_w[3][6] = { 0.0 };

            double gu00 = 0.0;
            double gu10 = 0.0, gu11 = 0.0;
            double gu20 = 0.0, gu21 = 0.0, gu22 = 0.0;
            double div_u = 0.0;

            if ( !diagonal_ )
            {
                // Trial side: accumulate symmetric gradient of u (fused dim loops).
                // Physical gradients computed inline to avoid local memory spills.
#pragma unroll
                for ( int n = cmb_shift; n < 6 - surface_shift; ++n )
                {
                    const double gx = dN_ref[n][0];
                    const double gy = dN_ref[n][1];
                    const double gz = dN_ref[n][2];
                    const double g0 = i00 * gx + i01 * gy + i02 * gz;
                    const double g1 = i10 * gx + i11 * gy + i12 * gz;
                    const double g2 = i20 * gx + i21 * gy + i22 * gz;

                    const int ddx = WEDGE_NODE_OFF[w][n][0];
                    const int ddy = WEDGE_NODE_OFF[w][n][1];
                    const int ddr = WEDGE_NODE_OFF[w][n][2];
                    const int nid = node_id( tx + ddx, ty + ddy );
                    const int lvl = lvl0 + ddr;

                    const double s0 = src_sh( nid, 0, lvl );
                    const double s1 = src_sh( nid, 1, lvl );
                    const double s2 = src_sh( nid, 2, lvl );

                    gu00 += g0 * s0;
                    gu11 += g1 * s1;
                    gu22 += g2 * s2;
                    gu10 += 0.5 * ( g1 * s0 + g0 * s1 );
                    gu20 += 0.5 * ( g2 * s0 + g0 * s2 );
                    gu21 += 0.5 * ( g2 * s1 + g1 * s2 );
                    div_u += g0 * s0 + g1 * s1 + g2 * s2;
                }

                // Test side: compute dst for all 3 dims at once (fused).
                // 2*epsilon(v):epsilon(u) - (2/3)*div(v)*div(u)
#pragma unroll
                for ( int n = cmb_shift; n < 6 - surface_shift; ++n )
                {
                    const double gx = dN_ref[n][0];
                    const double gy = dN_ref[n][1];
                    const double gz = dN_ref[n][2];
                    const double g0 = i00 * gx + i01 * gy + i02 * gz;
                    const double g1 = i10 * gx + i11 * gy + i12 * gz;
                    const double g2 = i20 * gx + i21 * gy + i22 * gz;

                    dst_w[0][n] +=
                        kwJ * ( 2.0 * ( g0 * gu00 + g1 * gu10 + g2 * gu20 ) + NEG_TWO_THIRDS * g0 * div_u );
                    dst_w[1][n] +=
                        kwJ * ( 2.0 * ( g0 * gu10 + g1 * gu11 + g2 * gu21 ) + NEG_TWO_THIRDS * g1 * div_u );
                    dst_w[2][n] +=
                        kwJ * ( 2.0 * ( g0 * gu20 + g1 * gu21 + g2 * gu22 ) + NEG_TWO_THIRDS * g2 * div_u );
                }
            }

            if ( diagonal_ || ( treat_boundary_dirichlet && at_boundary ) )
            {
                // Diagonal action: kwJ * s_d * (|g|^2 + (1/3) * g_d^2)
#pragma unroll
                for ( int n = surface_shift; n < 6 - cmb_shift; ++n )
                {
                    const double gx = dN_ref[n][0];
                    const double gy = dN_ref[n][1];
                    const double gz = dN_ref[n][2];
                    const double g0 = i00 * gx + i01 * gy + i02 * gz;
                    const double g1 = i10 * gx + i11 * gy + i12 * gz;
                    const double g2 = i20 * gx + i21 * gy + i22 * gz;
                    const double gg = g0 * g0 + g1 * g1 + g2 * g2;

                    const int ddx = WEDGE_NODE_OFF[w][n][0];
                    const int ddy = WEDGE_NODE_OFF[w][n][1];
                    const int ddr = WEDGE_NODE_OFF[w][n][2];
                    const int nid = node_id( tx + ddx, ty + ddy );
                    const int lvl = lvl0 + ddr;

                    const double s0 = src_sh( nid, 0, lvl );
                    const double s1 = src_sh( nid, 1, lvl );
                    const double s2 = src_sh( nid, 2, lvl );
                    dst_w[0][n] += kwJ * s0 * ( gg + ONE_THIRD * g0 * g0 );
                    dst_w[1][n] += kwJ * s1 * ( gg + ONE_THIRD * g1 * g1 );
                    dst_w[2][n] += kwJ * s2 * ( gg + ONE_THIRD * g2 * g2 );
                }
            }

            // Scatter this wedge's contributions immediately
#pragma unroll
            for ( int n = 0; n < 6; ++n )
            {
                const int ddx = WEDGE_NODE_OFF[w][n][0];
                const int ddy = WEDGE_NODE_OFF[w][n][1];
                const int ddr = WEDGE_NODE_OFF[w][n][2];
                for ( int dim_add = 0; dim_add < 3; ++dim_add )
                {
                    Kokkos::atomic_add(
                        &dst_( local_subdomain_id, x_cell + ddx, y_cell + ddy, r_cell + ddr, dim_add ),
                        dst_w[dim_add][n] );
                }
            }
        }
    }

    // ===================== FAST FREESLIP PATH =====================
    KOKKOS_INLINE_FUNCTION
    void normalize3( double& x, double& y, double& z ) const
    {
        const double n2 = x * x + y * y + z * z;
        if ( n2 > 0.0 )
        {
            const double invn = 1.0 / Kokkos::sqrt( n2 );
            x *= invn;
            y *= invn;
            z *= invn;
        }
    }

    KOKKOS_INLINE_FUNCTION
    void project_tangential_inplace(
        const double nx,
        const double ny,
        const double nz,
        double& ux,
        double& uy,
        double& uz ) const
    {
        const double dot = nx * ux + ny * uy + nz * uz;
        ux -= dot * nx;
        uy -= dot * ny;
        uz -= dot * nz;
    }
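    // For a unit normal n, this applies the tangential projector P = I - n n^T in place:
    // u <- u - (n . u) n, removing the normal component of u while leaving the tangential
    // part unchanged.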

    KOKKOS_INLINE_FUNCTION
    void operator_fast_freeslip_path(
        const Team& team,
        const int local_subdomain_id,
        const int x0,
        const int y0,
        const int r0,
        const int tx,
        const int ty,
        const int tr,
        const int x_cell,
        const int y_cell,
        const int r_cell,
        const bool at_cmb,
        const bool at_surface ) const
    {
        const int nlev = r_tile_ + 1;
        const int nxy  = ( lat_tile_ + 1 ) * ( lat_tile_ + 1 );

        double* shmem =
            reinterpret_cast< double* >( team.team_shmem().get_shmem( team_shmem_size( team.team_size() ) ) );

        using ScratchCoords =
            Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
        using ScratchSrc = Kokkos::
            View< double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
        using ScratchK =
            Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;

        ScratchCoords coords_sh( shmem, nxy, 3 );
        shmem += nxy * 3;

        ScratchSrc src_sh( shmem, nxy, 3, nlev );
        shmem += nxy * 3 * nlev;

        ScratchK k_sh( shmem, nxy, nlev );
        shmem += nxy * nlev;

        auto r_sh =
            Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >(
                shmem, nlev );

        auto node_id = [&]( int nx, int ny ) -> int { return nx + ( lat_tile_ + 1 ) * ny; };

        Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&]( int n ) {
            const int dxn = n % ( lat_tile_ + 1 );
            const int dyn = n / ( lat_tile_ + 1 );
            const int xi  = x0 + dxn;
            const int yi  = y0 + dyn;

            if ( xi <= hex_lat_ && yi <= hex_lat_ )
            {
                coords_sh( n, 0 ) = grid_( local_subdomain_id, xi, yi, 0 );
                coords_sh( n, 1 ) = grid_( local_subdomain_id, xi, yi, 1 );
                coords_sh( n, 2 ) = grid_( local_subdomain_id, xi, yi, 2 );
            }
            else
            {
                coords_sh( n, 0 ) = coords_sh( n, 1 ) = coords_sh( n, 2 ) = 0.0;
            }
        } );

        Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&]( int lvl ) {
            const int rr = r0 + lvl;
            r_sh( lvl )  = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
        } );

        const int total_pairs = nxy * nlev;
        Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total_pairs ), [&]( int t ) {
            const int node = t / nlev;
            const int lvl  = t - node * nlev;

            const int dxn = node % ( lat_tile_ + 1 );
            const int dyn = node / ( lat_tile_ + 1 );

            const int xi = x0 + dxn;
            const int yi = y0 + dyn;
            const int rr = r0 + lvl;

            if ( xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_ )
            {
                k_sh( node, lvl )      = k_( local_subdomain_id, xi, yi, rr );
                src_sh( node, 0, lvl ) = src_( local_subdomain_id, xi, yi, rr, 0 );
                src_sh( node, 1, lvl ) = src_( local_subdomain_id, xi, yi, rr, 1 );
                src_sh( node, 2, lvl ) = src_( local_subdomain_id, xi, yi, rr, 2 );
            }
            else
            {
                k_sh( node, lvl )      = 0.0;
                src_sh( node, 0, lvl ) = src_sh( node, 1, lvl ) = src_sh( node, 2, lvl ) = 0.0;
            }
        } );

        team.team_barrier();

        if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ || r_cell >= hex_rad_ )
            return;

        const int lvl0   = tr;
        const double r_0 = r_sh( lvl0 );
        const double r_1 = r_sh( lvl0 + 1 );

        const BoundaryConditionFlag cmb_bc     = get_boundary_condition_flag( bcs_, CMB );
        const BoundaryConditionFlag surface_bc = get_boundary_condition_flag( bcs_, SURFACE );

        const bool cmb_freeslip      = at_cmb && ( cmb_bc == FREESLIP );
        const bool surf_freeslip     = at_surface && ( surface_bc == FREESLIP );
        const bool cmb_dirichlet     = at_cmb && ( cmb_bc == DIRICHLET );
        const bool surface_dirichlet = at_surface && ( surface_bc == DIRICHLET );

        const int cmb_shift     = ( ( cmb_dirichlet && ( !diagonal_ ) ) ? 3 : 0 );
        const int surface_shift = ( ( surface_dirichlet && ( !diagonal_ ) ) ? 3 : 0 );
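        // Same complementary-range scheme as the Dirichlet/Neumann path: Dirichlet faces are
        // dropped from the coupled loops via these shifts and receive a diagonal-only
        // treatment further down.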

        static constexpr int WEDGE_NODE_OFF[2][6][3] = {
            { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
            { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };

        static constexpr int WEDGE_TO_UNIQUE[2][6] = {
            { 0, 1, 2, 3, 4, 5 },
            { 6, 2, 1, 7, 5, 4 }
        };

        constexpr double ONE_THIRD      = 1.0 / 3.0;
        constexpr double ONE_SIXTH      = 1.0 / 6.0;
        constexpr double NEG_TWO_THIRDS = -2.0 / 3.0; // same double value as the literal -0.66666666666666663

        static constexpr double dN_ref[6][3] = {
            { -0.5, -0.5, -ONE_SIXTH },
            { 0.5, 0.0, -ONE_SIXTH },
            { 0.0, 0.5, -ONE_SIXTH },
            { -0.5, -0.5, ONE_SIXTH },
            { 0.5, 0.0, ONE_SIXTH },
            { 0.0, 0.5, ONE_SIXTH } };

        const int n00 = node_id( tx, ty );
        const int n01 = node_id( tx, ty + 1 );
        const int n10 = node_id( tx + 1, ty );
        const int n11 = node_id( tx + 1, ty + 1 );

        double src8[3][8];
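        // Unique per-cell node indexing used by src8/dst8 (matches WEDGE_TO_UNIQUE):
        //   lower level lvl0:     0 = n00, 1 = n10, 2 = n01, 6 = n11
        //   upper level lvl0 + 1: 3 = n00, 4 = n10, 5 = n01, 7 = n11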

        src8[0][0] = src_sh( n00, 0, lvl0 );
        src8[1][0] = src_sh( n00, 1, lvl0 );
        src8[2][0] = src_sh( n00, 2, lvl0 );
        src8[0][1] = src_sh( n10, 0, lvl0 );
        src8[1][1] = src_sh( n10, 1, lvl0 );
        src8[2][1] = src_sh( n10, 2, lvl0 );
        src8[0][2] = src_sh( n01, 0, lvl0 );
        src8[1][2] = src_sh( n01, 1, lvl0 );
        src8[2][2] = src_sh( n01, 2, lvl0 );
        src8[0][6] = src_sh( n11, 0, lvl0 );
        src8[1][6] = src_sh( n11, 1, lvl0 );
        src8[2][6] = src_sh( n11, 2, lvl0 );

        src8[0][3] = src_sh( n00, 0, lvl0 + 1 );
        src8[1][3] = src_sh( n00, 1, lvl0 + 1 );
        src8[2][3] = src_sh( n00, 2, lvl0 + 1 );
        src8[0][4] = src_sh( n10, 0, lvl0 + 1 );
        src8[1][4] = src_sh( n10, 1, lvl0 + 1 );
        src8[2][4] = src_sh( n10, 2, lvl0 + 1 );
        src8[0][5] = src_sh( n01, 0, lvl0 + 1 );
        src8[1][5] = src_sh( n01, 1, lvl0 + 1 );
        src8[2][5] = src_sh( n01, 2, lvl0 + 1 );
        src8[0][7] = src_sh( n11, 0, lvl0 + 1 );
        src8[1][7] = src_sh( n11, 1, lvl0 + 1 );
        src8[2][7] = src_sh( n11, 2, lvl0 + 1 );

        double nx00 = coords_sh( n00, 0 ), ny00 = coords_sh( n00, 1 ), nz00 = coords_sh( n00, 2 );
        double nx10 = coords_sh( n10, 0 ), ny10 = coords_sh( n10, 1 ), nz10 = coords_sh( n10, 2 );
        double nx01 = coords_sh( n01, 0 ), ny01 = coords_sh( n01, 1 ), nz01 = coords_sh( n01, 2 );
        double nx11 = coords_sh( n11, 0 ), ny11 = coords_sh( n11, 1 ), nz11 = coords_sh( n11, 2 );

        normalize3( nx00, ny00, nz00 );
        normalize3( nx10, ny10, nz10 );
        normalize3( nx01, ny01, nz01 );
        normalize3( nx11, ny11, nz11 );

        // Save normal components of src at free-slip boundary nodes before trial-side projection.
        // Used in two places:
        // - non-diagonal mode: to restore A_nn * u_n * n after the test-side projection
        //   (normal_corr mechanism).
        // - diagonal mode: to reconstruct the unprojected src inside apply_rotated_diag so
        //   that R^T diag(RAR^T) R src can be evaluated.
        // Corner indexing: 0=n00, 1=n10, 2=n01, 3=n11.
        double u_n_cmb[4]  = {};
        double u_n_surf[4] = {};
        if ( cmb_freeslip )
        {
            u_n_cmb[0] = nx00 * src8[0][0] + ny00 * src8[1][0] + nz00 * src8[2][0];
            u_n_cmb[1] = nx10 * src8[0][1] + ny10 * src8[1][1] + nz10 * src8[2][1];
            u_n_cmb[2] = nx01 * src8[0][2] + ny01 * src8[1][2] + nz01 * src8[2][2];
            u_n_cmb[3] = nx11 * src8[0][6] + ny11 * src8[1][6] + nz11 * src8[2][6];
        }
        if ( surf_freeslip )
        {
            u_n_surf[0] = nx00 * src8[0][3] + ny00 * src8[1][3] + nz00 * src8[2][3];
            u_n_surf[1] = nx10 * src8[0][4] + ny10 * src8[1][4] + nz10 * src8[2][4];
            u_n_surf[2] = nx01 * src8[0][5] + ny01 * src8[1][5] + nz01 * src8[2][5];
            u_n_surf[3] = nx11 * src8[0][7] + ny11 * src8[1][7] + nz11 * src8[2][7];
        }

        if ( cmb_freeslip )
        {
            project_tangential_inplace( nx00, ny00, nz00, src8[0][0], src8[1][0], src8[2][0] );
            project_tangential_inplace( nx10, ny10, nz10, src8[0][1], src8[1][1], src8[2][1] );
            project_tangential_inplace( nx01, ny01, nz01, src8[0][2], src8[1][2], src8[2][2] );
            project_tangential_inplace( nx11, ny11, nz11, src8[0][6], src8[1][6], src8[2][6] );
        }
        if ( surf_freeslip )
        {
            project_tangential_inplace( nx00, ny00, nz00, src8[0][3], src8[1][3], src8[2][3] );
            project_tangential_inplace( nx10, ny10, nz10, src8[0][4], src8[1][4], src8[2][4] );
            project_tangential_inplace( nx01, ny01, nz01, src8[0][5], src8[1][5], src8[2][5] );
            project_tangential_inplace( nx11, ny11, nz11, src8[0][7], src8[1][7], src8[2][7] );
        }

        double dst8[3][8]        = { { 0.0 } };
        double normal_corr[3][8] = {};

        for ( int w = 0; w < 2; ++w )
        {
            const int v0 = w == 0 ? n00 : n11;
            const int v1 = w == 0 ? n10 : n01;
            const int v2 = w == 0 ? n01 : n10;

            double k_sum = 0.0;
#pragma unroll
            for ( int node = 0; node < 6; ++node )
            {
                const int ddx = WEDGE_NODE_OFF[w][node][0];
                const int ddy = WEDGE_NODE_OFF[w][node][1];
                const int ddr = WEDGE_NODE_OFF[w][node][2];

                const int nid = node_id( tx + ddx, ty + ddy );
                const int lvl = lvl0 + ddr;
                k_sum += k_sh( nid, lvl );
            }
            const double k_eval = ONE_SIXTH * k_sum;

            double wJ = 0.0;
            double i00, i01, i02;
            double i10, i11, i12;
            double i20, i21, i22;

            {
                const double half_dr = 0.5 * ( r_1 - r_0 );
                const double r_mid   = 0.5 * ( r_0 + r_1 );

                const double J_0_0 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v1, 0 ) );
                const double J_0_1 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v2, 0 ) );
                const double J_0_2 = half_dr * ( ONE_THIRD * ( coords_sh( v0, 0 ) + coords_sh( v1, 0 ) + coords_sh( v2, 0 ) ) );

                const double J_1_0 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v1, 1 ) );
                const double J_1_1 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v2, 1 ) );
                const double J_1_2 = half_dr * ( ONE_THIRD * ( coords_sh( v0, 1 ) + coords_sh( v1, 1 ) + coords_sh( v2, 1 ) ) );

                const double J_2_0 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v1, 2 ) );
                const double J_2_1 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v2, 2 ) );
                const double J_2_2 = half_dr * ( ONE_THIRD * ( coords_sh( v0, 2 ) + coords_sh( v1, 2 ) + coords_sh( v2, 2 ) ) );

                const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 - J_0_1 * J_1_0 * J_2_2 +
                                     J_0_1 * J_1_2 * J_2_0 + J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;

                const double invJ = 1.0 / J_det;

                i00 = invJ * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
                i01 = invJ * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
                i02 = invJ * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );

                i10 = invJ * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
                i11 = invJ * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
                i12 = invJ * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );

                i20 = invJ * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
                i21 = invJ * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
                i22 = invJ * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );

                wJ = Kokkos::abs( J_det );
            }

            const double kwJ = k_eval * wJ;

            // Accumulate normal-normal diagonal correction for free-slip boundary nodes.
            // For each boundary node u with outward unit normal n_u and shape-function physical
            // gradient g_u = J^{-T} dN_ref[node_idx]:
            //   A_nn_u += kwJ * ( |g_u|^2 + (1/3) * (n_u . g_u)^2 )
            // This equals n_u^T * A_3x3_self_u * n_u -- the (0,0) entry of R_u A R_u^T in the
            // normal-tangential frame -- which the slow path preserves on its diagonal.
            // Map: (w, in-boundary index ni in 0..2) -> corner index {0=n00,1=n10,2=n01,3=n11}.
            // normal_corr is only needed for the non-diagonal operator apply (P A P + A_nn u_n n).
            // In diagonal mode the rotated-frame diagonal is used directly (see below).
            if ( !diagonal_ && ( cmb_freeslip || surf_freeslip ) )
            {
                static constexpr int CMB_NODE_TO_CORNER[2][3] = { { 0, 1, 2 }, { 3, 2, 1 } };
                const double cn[4][3] = { { nx00, ny00, nz00 },
                                          { nx10, ny10, nz10 },
                                          { nx01, ny01, nz01 },
                                          { nx11, ny11, nz11 } };
                if ( cmb_freeslip )
                {
                    for ( int ni = 0; ni < 3; ++ni )
                    {
                        const int corner = CMB_NODE_TO_CORNER[w][ni];
                        const double nxu = cn[corner][0], nyu = cn[corner][1], nzu = cn[corner][2];

                        const double gx  = dN_ref[ni][0];
                        const double gy  = dN_ref[ni][1];
                        const double gz  = dN_ref[ni][2];
                        const double g0  = i00 * gx + i01 * gy + i02 * gz;
                        const double g1  = i10 * gx + i11 * gy + i12 * gz;
                        const double g2  = i20 * gx + i21 * gy + i22 * gz;
                        const double gg  = g0 * g0 + g1 * g1 + g2 * g2;
                        const double ng  = nxu * g0 + nyu * g1 + nzu * g2;
                        const double Ann = kwJ * ( gg + ONE_THIRD * ng * ng );
                        const double c   = Ann * u_n_cmb[corner];
                        const int u      = WEDGE_TO_UNIQUE[w][ni];

                        normal_corr[0][u] += c * nxu;
                        normal_corr[1][u] += c * nyu;
                        normal_corr[2][u] += c * nzu;
                    }
                }
                if ( surf_freeslip )
                {
                    for ( int ni = 0; ni < 3; ++ni )
                    {
                        const int corner = CMB_NODE_TO_CORNER[w][ni];
                        const double nxu = cn[corner][0], nyu = cn[corner][1], nzu = cn[corner][2];

                        const double gx  = dN_ref[ni + 3][0];
                        const double gy  = dN_ref[ni + 3][1];
                        const double gz  = dN_ref[ni + 3][2];
                        const double g0  = i00 * gx + i01 * gy + i02 * gz;
                        const double g1  = i10 * gx + i11 * gy + i12 * gz;
                        const double g2  = i20 * gx + i21 * gy + i22 * gz;
                        const double gg  = g0 * g0 + g1 * g1 + g2 * g2;
                        const double ng  = nxu * g0 + nyu * g1 + nzu * g2;
                        const double Ann = kwJ * ( gg + ONE_THIRD * ng * ng );
                        const double c   = Ann * u_n_surf[corner];
                        const int u      = WEDGE_TO_UNIQUE[w][ni + 3];

                        normal_corr[0][u] += c * nxu;
                        normal_corr[1][u] += c * nyu;
                        normal_corr[2][u] += c * nzu;
                    }
                }
            }

            double gu00 = 0.0;
            double gu10 = 0.0, gu11 = 0.0;
            double gu20 = 0.0, gu21 = 0.0, gu22 = 0.0;
            double div_u = 0.0;

            if ( !diagonal_ )
            {
                for ( int dimj = 0; dimj < 3; ++dimj )
                {
#pragma unroll
                    for ( int node_idx = cmb_shift; node_idx < 6 - surface_shift; ++node_idx )
                    {
                        const double gx = dN_ref[node_idx][0];
                        const double gy = dN_ref[node_idx][1];
                        const double gz = dN_ref[node_idx][2];

                        const double g0 = i00 * gx + i01 * gy + i02 * gz;
                        const double g1 = i10 * gx + i11 * gy + i12 * gz;
                        const double g2 = i20 * gx + i21 * gy + i22 * gz;

                        double E00, E11, E22, sym01, sym02, sym12, gdd;
                        column_grad_to_sym( dimj, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );

                        const int u    = WEDGE_TO_UNIQUE[w][node_idx];
                        const double s = src8[dimj][u];

                        gu00 += E00 * s;
                        gu10 += sym01 * s;
                        gu11 += E11 * s;
                        gu20 += sym02 * s;
                        gu21 += sym12 * s;
                        gu22 += E22 * s;
                        div_u += gdd * s;
                    }
                }

                for ( int dimi = 0; dimi < 3; ++dimi )
                {
#pragma unroll
                    for ( int node_idx = cmb_shift; node_idx < 6 - surface_shift; ++node_idx )
                    {
                        const double gx = dN_ref[node_idx][0];
                        const double gy = dN_ref[node_idx][1];
                        const double gz = dN_ref[node_idx][2];

                        const double g0 = i00 * gx + i01 * gy + i02 * gz;
                        const double g1 = i10 * gx + i11 * gy + i12 * gz;
                        const double g2 = i20 * gx + i21 * gy + i22 * gz;

                        double E00, E11, E22, sym01, sym02, sym12, gdd;
                        column_grad_to_sym( dimi, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );

                        const int u = WEDGE_TO_UNIQUE[w][node_idx];

                        dst8[dimi][u] +=
                            kwJ * ( NEG_TWO_THIRDS * div_u * gdd + 4.0 * sym01 * gu10 + 4.0 * sym02 * gu20 +
                                    4.0 * sym12 * gu21 + 2.0 * E00 * gu00 + 2.0 * E11 * gu11 + 2.0 * E22 * gu22 );
                    }
                }
            }

            if ( diagonal_ || cmb_dirichlet || surface_dirichlet )
            {
                for ( int dim_diagBC = 0; dim_diagBC < 3; ++dim_diagBC )
                {
                    for ( int node_idx = surface_shift; node_idx < 6 - cmb_shift; ++node_idx )
                    {
                        // In diagonal mode, free-slip boundary nodes are handled below with the
                        // rotated-frame diagonal (R^T diag(RAR^T) R) to match the slow path.
                        if ( diagonal_ && cmb_freeslip && node_idx < 3 )
                            continue;
                        if ( diagonal_ && surf_freeslip && node_idx >= 3 )
                            continue;

                        const double gx = dN_ref[node_idx][0];
                        const double gy = dN_ref[node_idx][1];
                        const double gz = dN_ref[node_idx][2];

                        const double g0 = i00 * gx + i01 * gy + i02 * gz;
                        const double g1 = i10 * gx + i11 * gy + i12 * gz;
                        const double g2 = i20 * gx + i21 * gy + i22 * gz;

                        double E00, E11, E22, sym01, sym02, sym12, gdd;
                        column_grad_to_sym( dim_diagBC, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );

                        const int u    = WEDGE_TO_UNIQUE[w][node_idx];
                        const double s = src8[dim_diagBC][u];

                        dst8[dim_diagBC][u] += kwJ * ( 4.0 * s * ( sym01 * sym01 + sym02 * sym02 + sym12 * sym12 ) +
                                                       2.0 * s * ( E00 * E00 + E11 * E11 + E22 * E22 ) +
                                                       NEG_TWO_THIRDS * ( gdd * gdd ) * s );
                    }
                }

                // For free-slip boundary nodes in diagonal mode: compute R^T diag(R A_3x3 R^T) R src.
                // The slow path takes the diagonal of the 18x18 matrix *after* rotating to the
                // normal-tangential frame, giving diag entries kwJ*(gg + (r_alpha . g)^2 / 3) for
                // alpha in {n, t1, t2}. The Cartesian diagonal used above differs from this by a
                // rotation, so it must not be used for free-slip boundary nodes.
                if ( diagonal_ )
                {
                    static constexpr int FS_CORNER_MAP[2][3] = { { 0, 1, 2 }, { 3, 2, 1 } };
                    const double ncoords[4][3] = { { nx00, ny00, nz00 },
                                                   { nx10, ny10, nz10 },
                                                   { nx01, ny01, nz01 },
                                                   { nx11, ny11, nz11 } };

                    auto apply_rotated_diag =
                        [&]( const int ni, const int node_idx, const double* u_n_arr ) {
                            const int corner = FS_CORNER_MAP[w][ni];
                            const double nxu = ncoords[corner][0];
                            const double nyu = ncoords[corner][1];
                            const double nzu = ncoords[corner][2];
                            const int u      = WEDGE_TO_UNIQUE[w][node_idx];

                            const double gx = dN_ref[node_idx][0];
                            const double gy = dN_ref[node_idx][1];
                            const double gz = dN_ref[node_idx][2];

                            const double g0     = i00 * gx + i01 * gy + i02 * gz;
                            const double g1     = i10 * gx + i11 * gy + i12 * gz;
                            const double g2     = i20 * gx + i21 * gy + i22 * gz;
                            const double gg_loc = g0 * g0 + g1 * g1 + g2 * g2;

                            dense::Vec< double, 3 > n_vec;
                            n_vec( 0 ) = nxu;
                            n_vec( 1 ) = nyu;
                            n_vec( 2 ) = nzu;
                            const auto R_rot = trafo_mat_cartesian_to_normal_tangential< double >( n_vec );

                            // Reconstruct full (unprojected) src at this node.
                            // src8 was tangentially projected; u_n_arr holds the saved normal component.
                            const double s0 = src8[0][u] + u_n_arr[corner] * nxu;
                            const double s1 = src8[1][u] + u_n_arr[corner] * nyu;
                            const double s2 = src8[2][u] + u_n_arr[corner] * nzu;

                            // Physical gradient and src in normal-tangential frame
                            const double Rg0 = R_rot( 0, 0 ) * g0 + R_rot( 0, 1 ) * g1 + R_rot( 0, 2 ) * g2;
                            const double Rg1 = R_rot( 1, 0 ) * g0 + R_rot( 1, 1 ) * g1 + R_rot( 1, 2 ) * g2;
                            const double Rg2 = R_rot( 2, 0 ) * g0 + R_rot( 2, 1 ) * g1 + R_rot( 2, 2 ) * g2;
                            const double Rs0 = R_rot( 0, 0 ) * s0 + R_rot( 0, 1 ) * s1 + R_rot( 0, 2 ) * s2;
                            const double Rs1 = R_rot( 1, 0 ) * s0 + R_rot( 1, 1 ) * s1 + R_rot( 1, 2 ) * s2;
                            const double Rs2 = R_rot( 2, 0 ) * s0 + R_rot( 2, 1 ) * s1 + R_rot( 2, 2 ) * s2;

                            // Rotated-frame diagonal applied to Rsrc:
                            // diag_alpha = kwJ * (gg_loc + (1/3) * Rg_alpha^2)
                            const double v0 = kwJ * ( gg_loc + ONE_THIRD * Rg0 * Rg0 ) * Rs0;
                            const double v1 = kwJ * ( gg_loc + ONE_THIRD * Rg1 * Rg1 ) * Rs1;
                            const double v2 = kwJ * ( gg_loc + ONE_THIRD * Rg2 * Rg2 ) * Rs2;

                            // Transform back to Cartesian: R^T [v0, v1, v2]
                            dst8[0][u] += R_rot( 0, 0 ) * v0 + R_rot( 1, 0 ) * v1 + R_rot( 2, 0 ) * v2;
                            dst8[1][u] += R_rot( 0, 1 ) * v0 + R_rot( 1, 1 ) * v1 + R_rot( 2, 1 ) * v2;
                            dst8[2][u] += R_rot( 0, 2 ) * v0 + R_rot( 1, 2 ) * v1 + R_rot( 2, 2 ) * v2;
                        };

                    if ( cmb_freeslip )
                    {
                        for ( int ni = 0; ni < 3; ++ni )
                            apply_rotated_diag( ni, ni, u_n_cmb );
                    }
                    if ( surf_freeslip )
                    {
                        for ( int ni = 0; ni < 3; ++ni )
                            apply_rotated_diag( ni, ni + 3, u_n_surf );
                    }
                }
            }
1655 }
1656
1657 // Test-side projection for free-slip (P A P).
1658 // In diagonal mode the rotated-frame diagonal is used, which already encodes the correct
1659 // normal/tangential split — no post-projection is needed or wanted.
1660 if ( !diagonal_ && cmb_freeslip )
1661 {
1662 project_tangential_inplace( nx00, ny00, nz00, dst8[0][0], dst8[1][0], dst8[2][0] );
1663 project_tangential_inplace( nx10, ny10, nz10, dst8[0][1], dst8[1][1], dst8[2][1] );
1664 project_tangential_inplace( nx01, ny01, nz01, dst8[0][2], dst8[1][2], dst8[2][2] );
1665 project_tangential_inplace( nx11, ny11, nz11, dst8[0][6], dst8[1][6], dst8[2][6] );
1666 }
1667 if ( !diagonal_ && surf_freeslip )
1668 {
1669 project_tangential_inplace( nx00, ny00, nz00, dst8[0][3], dst8[1][3], dst8[2][3] );
1670 project_tangential_inplace( nx10, ny10, nz10, dst8[0][4], dst8[1][4], dst8[2][4] );
1671 project_tangential_inplace( nx01, ny01, nz01, dst8[0][5], dst8[1][5], dst8[2][5] );
1672 project_tangential_inplace( nx11, ny11, nz11, dst8[0][7], dst8[1][7], dst8[2][7] );
1673 }
1674
1675 // Add back the normal-normal diagonal contribution removed by the test-side projection.
1676 // This makes the fast path equivalent to the slow path for arbitrary input vectors.
1677 // Not needed in diagonal mode: the rotated-frame diagonal already includes A_nn * u_n.
1678 if ( !diagonal_ && ( cmb_freeslip || surf_freeslip ) )
1679 {
1680 for ( int dim = 0; dim < 3; ++dim )
1681 for ( int u = 0; u < 8; ++u )
1682 dst8[dim][u] += normal_corr[dim][u];
1683 }
1684
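 // Scatter dst8 into the hex cell's eight nodes (atomically, since neighboring cells
 // accumulate into shared nodes). Corner ordering used by the adds below:
 //   0:(x,y,r)     1:(x+1,y,r)     2:(x,y+1,r)     3:(x,y,r+1)
 //   4:(x+1,y,r+1) 5:(x,y+1,r+1)   6:(x+1,y+1,r)   7:(x+1,y+1,r+1)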
1685 for ( int dim_add = 0; dim_add < 3; ++dim_add )
1686 {
1687 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell, dim_add ), dst8[dim_add][0] );
1688 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell, dim_add ), dst8[dim_add][1] );
1689 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell, dim_add ), dst8[dim_add][2] );
1690 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell + 1, dim_add ), dst8[dim_add][3] );
1691 Kokkos::atomic_add(
1692 &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1, dim_add ), dst8[dim_add][4] );
1693 Kokkos::atomic_add(
1694 &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][5] );
1695 Kokkos::atomic_add(
1696 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell, dim_add ), dst8[dim_add][6] );
1697 Kokkos::atomic_add(
1698 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][7] );
1699 }
1700 }
1701
1702 public:
1703 /**
1704 * @brief Legacy generic team operator.
1705 *
1706 * Kept for compatibility/debugging, but no longer used by apply_impl().
1707 * The host now dispatches directly to path-specific kernels.
1708 *
1709 * This function still works, but it reintroduces a device-side runtime branch on
1710 * `kernel_path_` and should therefore be avoided in performance-critical use.
1711 */
1712 KOKKOS_INLINE_FUNCTION
1713 void operator()( const Team& team ) const
1714 {
1715 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
1716 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
1717
1718 if ( tr >= r_tile_ )
1719 return;
1720
1721 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
1722 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
1723
1724 if ( kernel_path_ == KernelPath::Slow )
1725 {
1726 operator_slow_path(
1727 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
1728 }
1729 else if ( kernel_path_ == KernelPath::FastFreeslip )
1730 {
1731 operator_fast_freeslip_path(
1732 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
1733 }
1734 else
1735 {
1736 operator_fast_dirichlet_neumann_path(
1737 team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell, at_cmb, at_surface );
1738 }
1739 }
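 // Hedged sketch of the host-side dispatch that supersedes the branch above
 // (illustrative only; the functor and policy names are assumptions, the actual
 // launch code lives in apply_impl()):
 //
 //   switch ( kernel_path_ )
 //   {
 //   case KernelPath::Slow:
 //       Kokkos::parallel_for( "epsdivdiv_slow", policy, slow_kernel );
 //       break;
 //   case KernelPath::FastFreeslip:
 //       Kokkos::parallel_for( "epsdivdiv_freeslip", policy, freeslip_kernel );
 //       break;
 //   default:
 //       Kokkos::parallel_for( "epsdivdiv_dirichlet_neumann", policy, dn_kernel );
 //   }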
1740
1741 KOKKOS_INLINE_FUNCTION void assemble_trial_test_vecs(
1742 const int wedge,
1743 const dense::Vec< ScalarType, VecDim >& quad_point,
1744 const ScalarType quad_weight,
1745 const ScalarT r_1,
1746 const ScalarT r_2,
1747 dense::Vec< ScalarT, 3 > ( *wedge_phy_surf )[3],
1748 const dense::Vec< ScalarT, 6 >* k_local_hex,
1749 const int dimi,
1750 const int dimj,
1751 dense::Mat< ScalarType, VecDim, VecDim >* sym_grad_i,
1752 dense::Mat< ScalarType, VecDim, VecDim >* sym_grad_j,
1753 ScalarType& jdet_keval_quadweight ) const
1754 {
1755 dense::Mat< ScalarType, VecDim, VecDim > J = jac( wedge_phy_surf[wedge], r_1, r_2, quad_point );
1756 const auto det = J.det();
1757 const auto abs_det = Kokkos::abs( det );
1758 const dense::Mat< ScalarType, VecDim, VecDim > J_inv_transposed = J.inv_transposed( det );
1759
1760 ScalarType k_eval = 0.0;
1761 for ( int k = 0; k < num_nodes_per_wedge; k++ )
1762 {
1763 k_eval += shape( k, quad_point ) * k_local_hex[wedge]( k );
1764 }
1765
1766 for ( int k = 0; k < num_nodes_per_wedge; k++ )
1767 {
1768 sym_grad_i[k] = symmetric_grad( J_inv_transposed, quad_point, k, dimi );
1769 sym_grad_j[k] = symmetric_grad( J_inv_transposed, quad_point, k, dimj );
1770 }
1771
1772 jdet_keval_quadweight = quad_weight * k_eval * abs_det;
1773 }
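 // Note: jdet_keval_quadweight packs w_q * k(x_q) * |det J| into a single factor, so
 // the 18x18 accumulation in assemble_local_matrix() only multiplies the precomputed
 // symmetric gradients.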
1774
1775 /**
1776 * @brief Assemble one wedge-local 18x18 matrix (slow path / on-demand assembly).
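 *
 * The integrand matches the epsilon-div-div weak form
 * \f[
 *   a(u, v) = \int_T k \left( 2\, \varepsilon(u) : \varepsilon(v)
 *             - \tfrac{2}{3}\, (\nabla \cdot u)(\nabla \cdot v) \right) \mathrm{d}x ,
 * \f]
 * accumulated per Cartesian block (dimi, dimj) with the Felippa 1x1 wedge rule.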
1777 */
1778 KOKKOS_INLINE_FUNCTION
1779 dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > assemble_local_matrix(
1780 const int local_subdomain_id,
1781 const int x_cell,
1782 const int y_cell,
1783 const int r_cell,
1784 const int wedge ) const
1785 {
1786 dense::Vec< ScalarT, 3 > wedge_phy_surf[num_wedges_per_hex_cell][num_nodes_per_wedge_surface];
1787 wedge_surface_physical_coords( wedge_phy_surf, grid_, local_subdomain_id, x_cell, y_cell );
1788
1789 const ScalarT r_1 = radii_( local_subdomain_id, r_cell );
1790 const ScalarT r_2 = radii_( local_subdomain_id, r_cell + 1 );
1791
1792 dense::Vec< ScalarT, 6 > k_local_hex[num_wedges_per_hex_cell];
1793 extract_local_wedge_scalar_coefficients( k_local_hex, local_subdomain_id, x_cell, y_cell, r_cell, k_ );
1794
1795 dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > A( 0.0 ); // zero-initialized accumulator
1796
1797 for ( int dimi = 0; dimi < 3; ++dimi )
1798 {
1799 for ( int dimj = 0; dimj < 3; ++dimj )
1800 {
1801 for ( int q = 0; q < num_quad_points; q++ )
1802 {
1803 dense::Mat< ScalarType, VecDim, VecDim > sym_grad_i[num_nodes_per_wedge];
1804 dense::Mat< ScalarType, VecDim, VecDim > sym_grad_j[num_nodes_per_wedge];
1805 ScalarType jdet_keval_quadweight = 0;
1806
1807 assemble_trial_test_vecs(
1808 wedge,
1809 quad_points[q],
1810 quad_weights[q],
1811 r_1,
1812 r_2,
1813 wedge_phy_surf,
1814 k_local_hex,
1815 dimi,
1816 dimj,
1817 sym_grad_i,
1818 sym_grad_j,
1819 jdet_keval_quadweight );
1820
1821 for ( int i = 0; i < num_nodes_per_wedge; i++ )
1822 {
1823 for ( int j = 0; j < num_nodes_per_wedge; j++ )
1824 {
1825 A( i + dimi * num_nodes_per_wedge, j + dimj * num_nodes_per_wedge ) +=
1826 jdet_keval_quadweight *
1827 ( 2 * sym_grad_j[j].double_contract( sym_grad_i[i] ) -
1828 2.0 / 3.0 * sym_grad_j[j]( dimj, dimj ) * sym_grad_i[i]( dimi, dimi ) );
1829 }
1830 }
1831 }
1832 }
1833 }
1834
1835 return A;
1836 }
1837};
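// Hedged usage sketch (argument order follows the constructor declared earlier in
// this file; setting up domain, grid, radii, mask, k, and bcs is omitted):
//
//   EpsilonDivDivKerngenV08ScalarCoalesced< double > op(
//       domain, grid, radii, mask, k, bcs, /*diagonal=*/false );
//   op.set_boundary_conditions( bcs ); // recomputes the cached kernel path on the host
//   op.apply_impl( src, dst );         // launches the kernel for the cached path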
1838
1841
1842} // namespace terra::fe::wedge::operators::shell::epsdivdiv_history