epsilon_divdiv_kerngen_v06_xy_tiling.hpp
Documentation listing of this header file (generated view; navigation controls omitted).
1#pragma once
2
3#include "../../../quadrature/quadrature.hpp"
5#include "dense/vec.hpp"
9#include "impl/Kokkos_Profiling.hpp"
10#include "linalg/operator.hpp"
13#include "linalg/vector.hpp"
14#include "linalg/vector_q1.hpp"
15#include "util/timer.hpp"
16
18
29
30template < typename ScalarT, int VecDim = 3 >
32{
33 public:
36 using ScalarType = ScalarT;
37 static constexpr int LocalMatrixDim = 18;
40
41 private:
42 LocalMatrixStorage local_matrix_storage_;
43
45
50 BoundaryConditions bcs_;
51
52 bool diagonal_;
53
54 linalg::OperatorApplyMode operator_apply_mode_;
55 linalg::OperatorCommunicationMode operator_communication_mode_;
56 linalg::OperatorStoredMatrixMode operator_stored_matrix_mode_;
57
60
63
64 // Quadrature points.
65 const int num_quad_points = quadrature::quad_felippa_1x1_num_quad_points;
66
69
70 int local_subdomains_;
71 int hex_lat_;
72 int hex_rad_;
73 int lat_refinement_level_;
74
75 // --- NEW: 3D tiling parameters ---
76 int lat_tile_; // slab size in x and y (same)
77 int r_tile_; // slab size in r (team's r dimension)
78 int lat_tiles_; // number of tiles per lateral dimension (x AND y)
79 int r_tiles_; // number of tiles in r
80
81 int team_size_; // = lat_tile_*lat_tile_*r_tile_
82 int blocks_; // league size = local_subdomains_ * lat_tiles_^2 * r_tiles_
83
84 ScalarT r_max_;
85 ScalarT r_min_;
86
87 public:
94 BoundaryConditions bcs,
95 bool diagonal,
97 linalg::OperatorCommunicationMode operator_communication_mode =
100 : domain_( domain )
101 , grid_( grid )
102 , radii_( radii )
103 , mask_( mask )
104 , k_( k )
105 , diagonal_( diagonal )
106 , operator_apply_mode_( operator_apply_mode )
107 , operator_communication_mode_( operator_communication_mode )
108 , operator_stored_matrix_mode_( operator_stored_matrix_mode )
109 , send_buffers_( domain )
110 , recv_buffers_( domain )
111 {
112 bcs_[0] = bcs[0];
113 bcs_[1] = bcs[1];
116
117 const grid::shell::DomainInfo& domain_info = domain_.domain_info();
118 local_subdomains_ = domain_.subdomains().size();
119 hex_lat_ = domain_info.subdomain_num_nodes_per_side_laterally() - 1;
120 hex_rad_ = domain_info.subdomain_num_nodes_radially() - 1;
121 lat_refinement_level_ = domain_info.diamond_lateral_refinement_level();
122
123 // ---- choose tiles (tune) ----
124 // must keep team_size_ reasonable for backend (GPU often <= 1024)
125 lat_tile_ = 4; // x=y slab size
126 r_tile_ = 8; // r slab size
127
128 lat_tiles_ = (hex_lat_ + lat_tile_ - 1) / lat_tile_;
129 r_tiles_ = (hex_rad_ + r_tile_ - 1) / r_tile_;
130
131 team_size_ = lat_tile_ * lat_tile_ * r_tile_;
132 blocks_ = local_subdomains_ * lat_tiles_ * lat_tiles_ * r_tiles_;
133
134 r_min_ = domain_info.radii()[0];
135 r_max_ = domain_info.radii()[domain_info.radii().size() - 1];
136
137 util::logroot << "[EpsilonDivDiv] tile size (x,y,r)=(" << lat_tile_ << "," << lat_tile_ << "," << r_tile_
138 << ")" << std::endl;
139 util::logroot << "[EpsilonDivDiv] number of tiles (x,y,r)=(" << lat_tiles_ << "," << lat_tiles_ << "," << r_tiles_
140 << "), team_size=" << team_size_ << ", blocks=" << blocks_ << std::endl;
141
142 }
143
145 const linalg::OperatorApplyMode operator_apply_mode,
146 const linalg::OperatorCommunicationMode operator_communication_mode )
147 {
148 operator_apply_mode_ = operator_apply_mode;
149 operator_communication_mode_ = operator_communication_mode;
150 }
151
152 /// @brief S/Getter for diagonal member
153 void set_diagonal( bool v ) { diagonal_ = v; }
154
155 /// @brief Getter for coefficient
157
158 /// @brief Getter for domain member
159 const grid::shell::DistributedDomain& get_domain() const { return domain_; }
160
161 /// @brief Getter for radii member
163
164 /// @brief Getter for grid member
166
167 /// @brief Getter for mask member
168 KOKKOS_INLINE_FUNCTION
170 const int local_subdomain_id,
171 const int x_cell,
172 const int y_cell,
173 const int r_cell,
175 {
176 return util::has_flag( mask_( local_subdomain_id, x_cell, y_cell, r_cell ), flag );
177 }
178
179 /// @brief allocates memory for the local matrices
181 linalg::OperatorStoredMatrixMode operator_stored_matrix_mode,
182 int level_range,
184 {
185 operator_stored_matrix_mode_ = operator_stored_matrix_mode;
186
187 // allocate storage if necessary
188 if ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off )
189 {
191 domain_, operator_stored_matrix_mode_, level_range, GCAElements );
192 }
193 }
194
195 linalg::OperatorStoredMatrixMode get_stored_matrix_mode() { return operator_stored_matrix_mode_; }
196
197 /// @brief Set the local matrix stored in the operator
198 KOKKOS_INLINE_FUNCTION
200 const int local_subdomain_id,
201 const int x_cell,
202 const int y_cell,
203 const int r_cell,
204 const int wedge,
206 {
207 KOKKOS_ASSERT( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off );
208 local_matrix_storage_.set_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge, mat );
209 }
210
211 /// @brief Retrives the local matrix
212 /// if there is stored local matrices, the desired local matrix is loaded and returned
213 /// if not, the local matrix is assembled on-the-fly
214 KOKKOS_INLINE_FUNCTION
216 const int local_subdomain_id,
217 const int x_cell,
218 const int y_cell,
219 const int r_cell,
220 const int wedge ) const
221 {
222 // request from storage
223 if ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off )
224 {
225 if ( !local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge ) )
226 {
227 Kokkos::abort( "No matrix found at that spatial index." );
228 }
229 return local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge );
230 }
231 else
232 {
233 return assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge );
234 }
235 }
236
237 void apply_impl( const SrcVectorType& src, DstVectorType& dst )
238 {
239 util::Timer timer_apply( "epsilon_divdiv_apply" );
240
241 if ( operator_apply_mode_ == linalg::OperatorApplyMode::Replace )
242 {
243 assign( dst, 0 );
244 }
245
246 dst_ = dst.grid_data();
247 src_ = src.grid_data();
248
249 util::Timer timer_kernel( "epsilon_divdiv_kernel" );
250 Kokkos::TeamPolicy<> policy( blocks_, team_size_ );
251 Kokkos::parallel_for( "matvec", policy, *this );
252 // grid::shell::local_domain_md_range_policy_cells( domain_ ),
253 //s *this );
254 Kokkos::fence();
255 timer_kernel.stop();
256
257 if ( operator_communication_mode_ == linalg::OperatorCommunicationMode::CommunicateAdditively )
258 {
259 util::Timer timer_comm( "epsilon_divdiv_comm" );
260
262 domain_, dst_, send_buffers_, recv_buffers_ );
264 }
265 }
266
267 KOKKOS_INLINE_FUNCTION
269 const int dim,
270 const double g0,
271 const double g1,
272 const double g2,
273 double& E00,
274 double& E11,
275 double& E22,
276 double& sym01,
277 double& sym02,
278 double& sym12,
279 double& gdd ) const
280 {
281 E00 = 0.0;
282 E11 = 0.0;
283 E22 = 0.0;
284 sym01 = 0.0;
285 sym02 = 0.0;
286 sym12 = 0.0;
287 gdd = 0.0;
288
289 // dim selects which COLUMN is populated:
290 // dim==0: E[0][0]=g0, E[1][0]=g1, E[2][0]=g2
291 // dim==1: E[0][1]=g0, E[1][1]=g1, E[2][1]=g2
292 // dim==2: E[0][2]=g0, E[1][2]=g1, E[2][2]=g2
293 switch ( dim )
294 {
295 case 0:
296 E00 = g0;
297 gdd = g0; // E[0][0]
298 sym01 = 0.5 * g1; // 0.5*(E[0][1]=0 + E[1][0]=g1)
299 sym02 = 0.5 * g2; // 0.5*(E[0][2]=0 + E[2][0]=g2)
300 sym12 = 0.0;
301 break;
302
303 case 1:
304 E11 = g1;
305 gdd = g1; // E[1][1]
306 sym01 = 0.5 * g0; // 0.5*(E[0][1]=g0 + E[1][0]=0)
307 sym02 = 0.0;
308 sym12 = 0.5 * g2; // 0.5*(E[1][2]=0 + E[2][1]=g2)
309 break;
310
311 default: // case 2
312 E22 = g2;
313 gdd = g2; // E[2][2]
314 sym01 = 0.0;
315 sym02 = 0.5 * g0; // 0.5*(E[0][2]=g0 + E[2][0]=0)
316 sym12 = 0.5 * g1; // 0.5*(E[1][2]=g1 + E[2][1]=0)
317 break;
318 }
319 }
320
321 using Team = Kokkos::TeamPolicy<>::member_type;
322
323 // scratch size for the WHOLE (x,y,r) tile slab
324 KOKKOS_INLINE_FUNCTION
325 size_t team_shmem_size( const int /*ts*/ ) const
326 {
327 const int nlev = r_tile_ + 1;
328 const int n = lat_tile_ + 1;
329 const int nxy = n * n;
330
331 // coords_sh: nxy * 3
332 // src_sh: nxy * 3 * nlev
333 // k_sh: nxy * nlev
334 // r_sh: nlev
335 const size_t ndoubles =
336 size_t(nxy) * 3 + size_t(nxy) * 3 * nlev + size_t(nxy) * nlev + size_t(nlev);
337
338 return sizeof(double) * ndoubles;
339 }
340
341 KOKKOS_INLINE_FUNCTION
342 void operator()(const Team& team) const
343 {
344 // league_rank -> (subdomain, lat_y_id, lat_x_id, r_tile_id)
345 int tmp = team.league_rank();
346
347 const int r_tile_id = tmp % r_tiles_;
348 tmp /= r_tiles_;
349
350 const int lat_y_id = tmp % lat_tiles_;
351 tmp /= lat_tiles_;
352
353 const int lat_x_id = tmp % lat_tiles_;
354 tmp /= lat_tiles_;
355
356 const int local_subdomain_id = tmp;
357
358 const int x0 = lat_x_id * lat_tile_;
359 const int y0 = lat_y_id * lat_tile_;
360 const int r0 = r_tile_id * r_tile_;
361
362 // team_rank -> (tx, ty, tr) where tx,ty in [0..lat_tile_-1], tr in [0..r_tile_-1]
363 const int tid = team.team_rank();
364 const int tx = tid % lat_tile_;
365 const int ty = (tid / lat_tile_) % lat_tile_;
366 const int tr = tid / (lat_tile_ * lat_tile_);
367
368 if (tr >= r_tile_) return;
369
370 const int x_cell = x0 + tx;
371 const int y_cell = y0 + ty;
372 const int r_cell = r0 + tr;
373
374 // Each element needs x_cell+1, y_cell+1 and r_cell+1
375
376 // ---- shared slab dims ----
377 const int nlev = r_tile_ + 1;
378 const int nxy = (lat_tile_ + 1) * (lat_tile_ + 1);
379
380 double* shmem = reinterpret_cast<double*>(
381 team.team_shmem().get_shmem(team_shmem_size(team.team_size())));
382
383 using ScratchCoords =
384 Kokkos::View<double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged>;
385 using ScratchSrc =
386 Kokkos::View<double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged>;
387 using ScratchK =
388 Kokkos::View<double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged>;
389
390 ScratchCoords coords_sh(shmem, nxy, 3); shmem += nxy * 3;
391 ScratchSrc src_sh (shmem, nxy, 3, nlev); shmem += nxy * 3 * nlev;
392 ScratchK k_sh (shmem, nxy, nlev); shmem += nxy * nlev;
393
394 auto r_sh =
395 Kokkos::View<double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged>(
396 shmem, nlev);
397
398 auto node_id = [&](int nx, int ny) -> int { return nx + (lat_tile_ + 1) * ny; };
399
400 // ---- cooperative loads for whole tile slab ----
401 // coords
402 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nxy), [&](int n) {
403 const int dxn = n % (lat_tile_ + 1) ;
404 const int dyn = n / (lat_tile_ + 1) ;
405 const int xi = x0 + dxn;
406 const int yi = y0 + dyn;
407
408 if (xi <= hex_lat_ && yi <= hex_lat_) {
409 coords_sh(n,0) = grid_(local_subdomain_id, xi, yi, 0);
410 coords_sh(n,1) = grid_(local_subdomain_id, xi, yi, 1);
411 coords_sh(n,2) = grid_(local_subdomain_id, xi, yi, 2);
412 } else {
413 coords_sh(n,0) = coords_sh(n,1) = coords_sh(n,2) = 0.0;
414 }
415 });
416
417 // radii
418 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nlev), [&](int lvl) {
419 const int rr = r0 + lvl;
420 r_sh(lvl) = (rr <= hex_rad_) ? radii_(local_subdomain_id, rr) : 0.0;
421 });
422
423 // src/k dofs
424 const int total_pairs = nxy * nlev;
425 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, total_pairs), [&](int t) {
426 const int lvl = t / nxy;
427 const int node = t - lvl * nxy;
428
429 const int dxn = node % (lat_tile_ + 1) ;
430 const int dyn = node / (lat_tile_ + 1) ;
431
432 const int xi = x0 + dxn;
433 const int yi = y0 + dyn;
434 const int rr = r0 + lvl;
435
436 if (xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_) {
437 k_sh(node, lvl) = k_(local_subdomain_id, xi, yi, rr);
438 src_sh(node, 0, lvl) = src_(local_subdomain_id, xi, yi, rr, 0);
439 src_sh(node, 1, lvl) = src_(local_subdomain_id, xi, yi, rr, 1);
440 src_sh(node, 2, lvl) = src_(local_subdomain_id, xi, yi, rr, 2);
441 } else {
442 k_sh(node, lvl) = 0.0;
443 src_sh(node,0,lvl) = src_sh(node,1,lvl) = src_sh(node,2,lvl) = 0.0;
444 }
445 });
446
447 team.team_barrier();
448
449 // ---------------- per-thread element compute (ONE element)
450 // radii for this element's r (local slab index = tr)
451
452 if (x_cell < hex_lat_ && y_cell < hex_lat_ && r_cell < hex_rad_) {
453
454 const int lvl0 = tr;
455 const double r_0 = r_sh( lvl0 );
456 const double r_1 = r_sh( lvl0 + 1 );
457
458 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
459 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
460
461 const bool at_boundary = at_cmb || at_surface;
462 bool treat_boundary = false;
463 if ( at_boundary )
464 {
465 const ShellBoundaryFlag sbf = at_cmb ? CMB : SURFACE;
466 treat_boundary = ( get_boundary_condition_flag( bcs_, sbf ) == DIRICHLET );
467 }
468
469 const int cmb_shift = ( ( at_boundary && treat_boundary && ( !diagonal_ ) && at_cmb ) ? 3 : 0 );
470 const int surface_shift = ( ( at_boundary && treat_boundary && ( !diagonal_ ) && at_surface ) ? 3 : 0 );
471
472 // ---- your existing constants (unchanged) ----
473 static constexpr int WEDGE_NODE_OFF[2][6][3] = {
474 { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
475 { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };
476
477 static constexpr int WEDGE_TO_UNIQUE[2][6] = {
478 { 0, 1, 2, 3, 4, 5 }, // wedge 0
479 { 6, 2, 1, 7, 5, 4 } // wedge 1
480 };
481
482 constexpr double ONE_THIRD = 1.0 / 3.0;
483 constexpr double ONE_SIXTH = 1.0 / 6.0;
484 constexpr double NEG_TWO_THIRDS = -0.66666666666666663;
485
486 static constexpr double dN_ref[6][3] = {
487 { -0.5, -0.5, -ONE_SIXTH },
488 { 0.5, 0.0, -ONE_SIXTH },
489 { 0.0, 0.5, -ONE_SIXTH },
490 { -0.5, -0.5, ONE_SIXTH },
491 { 0.5, 0.0, ONE_SIXTH },
492 { 0.0, 0.5, ONE_SIXTH } };
493
494
495
496 // 4 surface nodes ids (within tile footprint)
497 const int n00 = node_id( tx, ty );
498 const int n01 = node_id( tx, ty + 1 );
499 const int n10 = node_id( tx + 1, ty );
500 const int n11 = node_id( tx + 1, ty + 1 );
501
502 // wedge surface coords in registers
503 double ws[2][3][3];
504
505 // wedge 0: (q00,q10,q01)
506 ws[0][0][0] = coords_sh( n00, 0 ); ws[0][0][1] = coords_sh( n00, 1 ); ws[0][0][2] = coords_sh( n00, 2 );
507 ws[0][1][0] = coords_sh( n10, 0 ); ws[0][1][1] = coords_sh( n10, 1 ); ws[0][1][2] = coords_sh( n10, 2 );
508 ws[0][2][0] = coords_sh( n01, 0 ); ws[0][2][1] = coords_sh( n01, 1 ); ws[0][2][2] = coords_sh( n01, 2 );
509
510 // wedge 1: (q11,q01,q10)
511 ws[1][0][0] = coords_sh( n11, 0 ); ws[1][0][1] = coords_sh( n11, 1 ); ws[1][0][2] = coords_sh( n11, 2 );
512 ws[1][1][0] = coords_sh( n01, 0 ); ws[1][1][1] = coords_sh( n01, 1 ); ws[1][1][2] = coords_sh( n01, 2 );
513 ws[1][2][0] = coords_sh( n10, 0 ); ws[1][2][1] = coords_sh( n10, 1 ); ws[1][2][2] = coords_sh( n10, 2 );
514
515 // per-thread accumulation (same as your dst8 approach)
516 double dst8[3][8] = { 0.0 };
517
518 for ( int w = 0; w < 2; ++w )
519 {
520
521 // k_eval from shared slab
522 double k_sum = 0.0;
523#pragma unroll
524 for ( int node = 0; node < 6; ++node )
525 {
526 const int ddx = WEDGE_NODE_OFF[w][node][0];
527 const int ddy = WEDGE_NODE_OFF[w][node][1];
528 const int ddr = WEDGE_NODE_OFF[w][node][2];
529
530 const int nid = node_id( tx + ddx, ty + ddy );
531 const int lvl = lvl0 + ddr;
532
533 k_sum += k_sh( nid, lvl );
534 }
535 const double k_eval = ONE_SIXTH * k_sum;
536
537 double wJ = 0.0;
538
539 double i00, i01, i02;
540 double i10, i11, i12;
541 double i20, i21, i22;
542
543 {
544 const double half_dr = 0.5 * ( r_1 - r_0 );
545 const double r_mid = 0.5 * ( r_0 + r_1 );
546
547 const double J_0_0 = r_mid * ( -ws[w][0][0] + ws[w][1][0] );
548 const double J_0_1 = r_mid * ( -ws[w][0][0] + ws[w][2][0] );
549 const double J_0_2 = half_dr * ( ONE_THIRD * ( ws[w][0][0] + ws[w][1][0] + ws[w][2][0] ) );
550
551 const double J_1_0 = r_mid * ( -ws[w][0][1] + ws[w][1][1] );
552 const double J_1_1 = r_mid * ( -ws[w][0][1] + ws[w][2][1] );
553 const double J_1_2 = half_dr * ( ONE_THIRD * ( ws[w][0][1] + ws[w][1][1] + ws[w][2][1] ) );
554
555 const double J_2_0 = r_mid * ( -ws[w][0][2] + ws[w][1][2] );
556 const double J_2_1 = r_mid * ( -ws[w][0][2] + ws[w][2][2] );
557 const double J_2_2 = half_dr * ( ONE_THIRD * ( ws[w][0][2] + ws[w][1][2] + ws[w][2][2] ) );
558
559 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1 -
560 J_0_1 * J_1_0 * J_2_2 + J_0_1 * J_1_2 * J_2_0 +
561 J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
562
563 const double invJ = 1.0 / J_det;
564
565 i00 = invJ * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
566 i01 = invJ * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
567 i02 = invJ * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
568
569 i10 = invJ * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
570 i11 = invJ * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
571 i12 = invJ * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
572
573 i20 = invJ * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
574 i21 = invJ * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
575 i22 = invJ * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
576
577 wJ = Kokkos::abs( J_det );
578 }
579
580 const double kwJ = k_eval * wJ;
581
582 double gu00 = 0.0;
583 double gu10 = 0.0, gu11 = 0.0;
584 double gu20 = 0.0, gu21 = 0.0, gu22 = 0.0;
585 double div_u = 0.0;
586
587 if ( !diagonal_ )
588 {
589 for ( int dimj = 0; dimj < 3; ++dimj )
590 {
591#pragma unroll
592 for ( int node_idx = cmb_shift; node_idx < 6 - surface_shift; ++node_idx )
593 {
594 const double gx = dN_ref[node_idx][0];
595 const double gy = dN_ref[node_idx][1];
596 const double gz = dN_ref[node_idx][2];
597
598 const double g0 = i00 * gx + i01 * gy + i02 * gz;
599 const double g1 = i10 * gx + i11 * gy + i12 * gz;
600 const double g2 = i20 * gx + i21 * gy + i22 * gz;
601
602 double E00, E11, E22, sym01, sym02, sym12, gdd;
603 column_grad_to_sym( dimj, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );
604
605 const int ddx = WEDGE_NODE_OFF[w][node_idx][0];
606 const int ddy = WEDGE_NODE_OFF[w][node_idx][1];
607 const int ddr = WEDGE_NODE_OFF[w][node_idx][2];
608
609 const int nid = node_id( tx + ddx, ty + ddy );
610 const int lvl = lvl0 + ddr;
611
612 const double s = src_sh( nid, dimj, lvl );
613
614 gu00 += E00 * s;
615 gu10 += sym01 * s;
616 gu11 += E11 * s;
617 gu20 += sym02 * s;
618 gu21 += sym12 * s;
619 gu22 += E22 * s;
620
621 div_u += gdd * s;
622 }
623 }
624
625 for ( int dimi = 0; dimi < 3; ++dimi )
626 {
627#pragma unroll
628 for ( int node_idx = cmb_shift; node_idx < 6 - surface_shift; ++node_idx )
629 {
630 const double gx = dN_ref[node_idx][0];
631 const double gy = dN_ref[node_idx][1];
632 const double gz = dN_ref[node_idx][2];
633
634 const double g0 = i00 * gx + i01 * gy + i02 * gz;
635 const double g1 = i10 * gx + i11 * gy + i12 * gz;
636 const double g2 = i20 * gx + i21 * gy + i22 * gz;
637
638 double E00, E11, E22, sym01, sym02, sym12, gdd;
639 column_grad_to_sym( dimi, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );
640
641 const double pairing0 = 2.0 * sym01;
642 const double pairing1 = 2.0 * sym02;
643 const double pairing2 = 2.0 * sym12;
644
645 const int u = WEDGE_TO_UNIQUE[w][node_idx];
646
647 dst8[dimi][u] += kwJ * ( NEG_TWO_THIRDS * div_u * gdd + 2.0 * pairing0 * gu10 +
648 2.0 * pairing1 * gu20 + 2.0 * pairing2 * gu21 + 2.0 * E00 * gu00 +
649 2.0 * E11 * gu11 + 2.0 * E22 * gu22 );
650 }
651 }
652 }
653
654 if ( diagonal_ || ( treat_boundary && at_boundary ) )
655 {
656 for ( int dim_diagBC = 0; dim_diagBC < 3; ++dim_diagBC )
657 {
658#pragma unroll
659 for ( int node_idx = surface_shift; node_idx < 6 - cmb_shift; ++node_idx )
660 {
661 const double gx = dN_ref[node_idx][0];
662 const double gy = dN_ref[node_idx][1];
663 const double gz = dN_ref[node_idx][2];
664
665 const double g0 = i00 * gx + i01 * gy + i02 * gz;
666 const double g1 = i10 * gx + i11 * gy + i12 * gz;
667 const double g2 = i20 * gx + i21 * gy + i22 * gz;
668
669 double E00, E11, E22, sym01, sym02, sym12, gdd;
670 column_grad_to_sym( dim_diagBC, g0, g1, g2, E00, E11, E22, sym01, sym02, sym12, gdd );
671
672 const int ddx = WEDGE_NODE_OFF[w][node_idx][0];
673 const int ddy = WEDGE_NODE_OFF[w][node_idx][1];
674 const int ddr = WEDGE_NODE_OFF[w][node_idx][2];
675
676 const int nid = node_id( tx + ddx, ty + ddy );
677 const int lvl = lvl0 + ddr;
678
679 const double s = src_sh( nid, dim_diagBC, lvl );
680
681 const double pairing0 = 4.0 * s;
682 const double pairing1 = 2.0 * s;
683
684 const int u = WEDGE_TO_UNIQUE[w][node_idx];
685
686 dst8[dim_diagBC][u] +=
687 kwJ * ( pairing0 * ( sym01 * sym01 ) + pairing0 * ( sym02 * sym02 ) +
688 pairing0 * ( sym12 * sym12 ) + pairing1 * ( E00 * E00 ) + pairing1 * ( E11 * E11 ) +
689 pairing1 * ( E22 * E22 ) + NEG_TWO_THIRDS * ( gdd * gdd ) * s );
690 }
691 }
692 }
693 } // w
694
695 // scatter
696 for ( int dim_add = 0; dim_add < 3; ++dim_add )
697 {
698 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell, dim_add ), dst8[dim_add][0] );
699 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell, dim_add ), dst8[dim_add][1] );
700 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell, dim_add ), dst8[dim_add][2] );
701 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell, r_cell + 1, dim_add ), dst8[dim_add][3] );
702 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1, dim_add ), dst8[dim_add][4] );
703 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][5] );
704 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell, dim_add ), dst8[dim_add][6] );
705 Kokkos::atomic_add( &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell + 1, dim_add ), dst8[dim_add][7] );
706 }
707 }
708 }
709
710 /// @brief: For both trial and test space this function sets up a vector:
711 /// each vector element holds the symmetric gradient (a 3x3 matrix) of the shape function of the corresponding dof
712 /// (if dimi == dimj, these are the same and we are on the diagonal of the vectorial diffusion operator)
713 /// Additionally, we compute the scalar factor for the numerical integral comp: determinant of the jacobian,
714 /// evaluation of the coefficient k on the element and the quadrature weight of the current quad-point.
715
716 /// The idea of this function is that the two vectors can be:
717 /// - accumulated to the result of the local matvec with 2 * num_nodes_per_wedge complexity
718 /// by scaling the dot product of the trial vec and local src dofs with each element of the test vec
719 /// (and adding to the dst dofs, this is the fused local matvec).
720 /// - propagated to the local matrix by an outer product of the two vectors
721 /// (without applying it to dofs). This is e.g. required to assemble the finest grid local
722 /// matrix on-the-fly during GCA/Galerkin coarsening.
723
724 ///
725 KOKKOS_INLINE_FUNCTION void assemble_trial_test_vecs(
726 const int wedge,
727 const dense::Vec< ScalarType, VecDim >& quad_point,
728 const ScalarType quad_weight,
729 const ScalarT r_1,
730 const ScalarT r_2,
731 dense::Vec< ScalarT, 3 > ( *wedge_phy_surf )[3],
732 const dense::Vec< ScalarT, 6 >* k_local_hex,
733 const int dimi,
734 const int dimj,
737 ScalarType& jdet_keval_quadweight ) const
738 {
739 dense::Mat< ScalarType, VecDim, VecDim > J = jac( wedge_phy_surf[wedge], r_1, r_2, quad_point );
740 const auto det = J.det();
741 const auto abs_det = Kokkos::abs( det );
742 const dense::Mat< ScalarType, VecDim, VecDim > J_inv_transposed = J.inv_transposed( det );
743
744 // dot of coeff dofs and element-local shape functions to evaluate the coefficent on the current element
745 ScalarType k_eval = 0.0;
746 for ( int k = 0; k < num_nodes_per_wedge; k++ )
747 {
748 k_eval += shape( k, quad_point ) * k_local_hex[wedge]( k );
749 }
750
751 for ( int k = 0; k < num_nodes_per_wedge; k++ )
752 {
753 sym_grad_i[k] = symmetric_grad( J_inv_transposed, quad_point, k, dimi );
754 sym_grad_j[k] = symmetric_grad( J_inv_transposed, quad_point, k, dimj );
755 }
756 jdet_keval_quadweight = quad_weight * k_eval * abs_det;
757 }
758
759 /// @brief assemble the local matrix and return it for a given element, wedge, and vectorial component
760 /// (determined by dimi, dimj)
761 KOKKOS_INLINE_FUNCTION
763 const int local_subdomain_id,
764 const int x_cell,
765 const int y_cell,
766 const int r_cell,
767 const int wedge ) const
768 {
769 // Gather surface points for each wedge.
770 // TODO gather this for only 1 wedge
772 wedge_surface_physical_coords( wedge_phy_surf, grid_, local_subdomain_id, x_cell, y_cell );
773
774 // Gather wedge radii.
775 const ScalarT r_1 = radii_( local_subdomain_id, r_cell );
776 const ScalarT r_2 = radii_( local_subdomain_id, r_cell + 1 );
777
779 extract_local_wedge_scalar_coefficients( k_local_hex, local_subdomain_id, x_cell, y_cell, r_cell, k_ );
780
781 // Compute the local element matrix.
783 for ( int dimi = 0; dimi < 3; ++dimi )
784 {
785 for ( int dimj = 0; dimj < 3; ++dimj )
786 {
787 // spatial dimensions: quadrature points and wedge
788 for ( int q = 0; q < num_quad_points; q++ )
789 {
792 ScalarType jdet_keval_quadweight = 0;
794 wedge,
795 quad_points[q],
796 quad_weights[q],
797 r_1,
798 r_2,
799 wedge_phy_surf,
800 k_local_hex,
801 dimi,
802 dimj,
803 sym_grad_i,
804 sym_grad_j,
805 jdet_keval_quadweight );
806
807 // propagate on local matrix by outer product of test and trial vecs
808 for ( int i = 0; i < num_nodes_per_wedge; i++ )
809 {
810 for ( int j = 0; j < num_nodes_per_wedge; j++ )
811 {
812 A( i + dimi * num_nodes_per_wedge, j + dimj * num_nodes_per_wedge ) +=
813 jdet_keval_quadweight *
814 ( 2 * sym_grad_j[j].double_contract( sym_grad_i[i] ) -
815 2.0 / 3.0 * sym_grad_j[j]( dimj, dimj ) * sym_grad_i[i]( dimi, dimi ) );
816 // for the div, we just extract the component from the gradient vector
817 }
818 }
819 }
820 }
821 }
822
823 return A;
824 }
825};
826
829
830} // namespace terra::fe::wedge::operators::shell::epsdivdiv_history
Send and receive buffers for all process-local subdomain boundaries.
Definition communication.hpp:56
dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > get_local_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
Retrieves the local matrix: if local matrices are stored, the desired local matrix is loaded and r...
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:215
grid::Grid3DDataVec< ScalarT, 3 > get_grid()
Getter for grid member.
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:165
void column_grad_to_sym(const int dim, const double g0, const double g1, const double g2, double &E00, double &E11, double &E22, double &sym01, double &sym02, double &sym12, double &gdd) const
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:268
dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > assemble_local_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
assemble the local matrix and return it for a given element, wedge, and vectorial component (determin...
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:762
void set_local_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge, const dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > &mat) const
Set the local matrix stored in the operator.
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:199
terra::grid::Grid4DDataMatrices< ScalarType, LocalMatrixDim, LocalMatrixDim, 2 > Grid4DDataLocalMatrices
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:38
static constexpr int LocalMatrixDim
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:37
linalg::OperatorStoredMatrixMode get_stored_matrix_mode()
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:195
bool has_flag(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, grid::shell::ShellBoundaryFlag flag) const
Getter for mask member.
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:169
void set_diagonal(bool v)
S/Getter for diagonal member.
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:153
const grid::Grid4DDataScalar< ScalarType > & k_grid_data()
Getter for coefficient.
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:156
const grid::shell::DistributedDomain & get_domain() const
Getter for domain member.
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:159
void set_operator_apply_and_communication_modes(const linalg::OperatorApplyMode operator_apply_mode, const linalg::OperatorCommunicationMode operator_communication_mode)
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:144
void operator()(const Team &team) const
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:342
grid::Grid2DDataScalar< ScalarT > get_radii() const
Getter for radii member.
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:162
void apply_impl(const SrcVectorType &src, DstVectorType &dst)
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:237
size_t team_shmem_size(const int) const
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:325
void set_stored_matrix_mode(linalg::OperatorStoredMatrixMode operator_stored_matrix_mode, int level_range, grid::Grid4DDataScalar< ScalarType > GCAElements)
allocates memory for the local matrices
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:180
ScalarT ScalarType
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:36
void assemble_trial_test_vecs(const int wedge, const dense::Vec< ScalarType, VecDim > &quad_point, const ScalarType quad_weight, const ScalarT r_1, const ScalarT r_2, dense::Vec< ScalarT, 3 >(*wedge_phy_surf)[3], const dense::Vec< ScalarT, 6 > *k_local_hex, const int dimi, const int dimj, dense::Mat< ScalarType, VecDim, VecDim > *sym_grad_i, dense::Mat< ScalarType, VecDim, VecDim > *sym_grad_j, ScalarType &jdet_keval_quadweight) const
: For both trial and test space this function sets up a vector: each vector element holds the symmetr...
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:725
Kokkos::TeamPolicy<>::member_type Team
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:321
EpsilonDivDivKerngenV06XyTiling(const grid::shell::DistributedDomain &domain, const grid::Grid3DDataVec< ScalarT, 3 > &grid, const grid::Grid2DDataScalar< ScalarT > &radii, const grid::Grid4DDataScalar< grid::shell::ShellBoundaryFlag > &mask, const grid::Grid4DDataScalar< ScalarT > &k, BoundaryConditions bcs, bool diagonal, linalg::OperatorApplyMode operator_apply_mode=linalg::OperatorApplyMode::Replace, linalg::OperatorCommunicationMode operator_communication_mode=linalg::OperatorCommunicationMode::CommunicateAdditively, linalg::OperatorStoredMatrixMode operator_stored_matrix_mode=linalg::OperatorStoredMatrixMode::Off)
Definition epsilon_divdiv_kerngen_v06_xy_tiling.hpp:88
Parallel data structure organizing the thick spherical shell metadata for distributed (MPI parallel) ...
Definition spherical_shell.hpp:2518
const std::map< SubdomainInfo, std::tuple< LocalSubdomainIdx, SubdomainNeighborhood > > & subdomains() const
Definition spherical_shell.hpp:2650
const DomainInfo & domain_info() const
Returns a const reference.
Definition spherical_shell.hpp:2647
Information about the thick spherical shell mesh.
Definition spherical_shell.hpp:780
const std::vector< double > & radii() const
Definition spherical_shell.hpp:845
int diamond_lateral_refinement_level() const
Definition spherical_shell.hpp:843
int subdomain_num_nodes_radially() const
Equivalent to calling subdomain_num_nodes_radially( subdomain_refinement_level() )
Definition spherical_shell.hpp:861
int subdomain_num_nodes_per_side_laterally() const
Equivalent to calling subdomain_num_nodes_per_side_laterally( subdomain_refinement_level() )
Definition spherical_shell.hpp:852
Static assertion: VectorQ1Scalar satisfies VectorLike concept.
Definition vector_q1.hpp:168
const grid::Grid4DDataVec< ScalarType, VecDim > & grid_data() const
Get const reference to grid data.
Definition vector_q1.hpp:288
bool has_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
Checks for presence of a local matrix for a certain element.
Definition local_matrix_storage.hpp:223
dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > get_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
Retrieves the local matrix: if there are stored local matrices, the desired local matrix is loaded and r...
Definition local_matrix_storage.hpp:175
void set_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge, dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > mat) const
Set the local matrix stored in the operator.
Definition local_matrix_storage.hpp:118
Timer supporting RAII scope or manual stop.
Definition timer.hpp:342
void stop()
Stop the timer and record elapsed time.
Definition timer.hpp:364
Concept for types that can be used as Galerkin coarse-grid operators in a multigrid hierarchy....
Definition operator.hpp:81
void unpack_and_reduce_local_subdomain_boundaries(const grid::shell::DistributedDomain &domain, const GridDataType &data, SubdomainNeighborhoodSendRecvBuffer< typename GridDataType::value_type, grid::grid_data_vec_dim< GridDataType >() > &boundary_recv_buffers, CommunicationReduction reduction=CommunicationReduction::SUM)
Unpacks and reduces local subdomain boundaries.
Definition communication.hpp:672
void pack_send_and_recv_local_subdomain_boundaries(const grid::shell::DistributedDomain &domain, const GridDataType &data, SubdomainNeighborhoodSendRecvBuffer< typename GridDataType::value_type, grid::grid_data_vec_dim< GridDataType >() > &boundary_send_buffers, SubdomainNeighborhoodSendRecvBuffer< typename GridDataType::value_type, grid::grid_data_vec_dim< GridDataType >() > &boundary_recv_buffers)
Packs, sends and recvs local subdomain boundaries using two sets of buffers.
Definition communication.hpp:242
Definition epsilon_divdiv_kerngen_v01_initial.hpp:16
constexpr void quad_felippa_1x1_quad_points(dense::Vec< T, 3 >(&quad_points)[quad_felippa_1x1_num_quad_points])
Definition wedge/quadrature/quadrature.hpp:36
constexpr void quad_felippa_1x1_quad_weights(T(&quad_weights)[quad_felippa_1x1_num_quad_points])
Definition wedge/quadrature/quadrature.hpp:43
constexpr int quad_felippa_1x1_num_quad_points
Definition wedge/quadrature/quadrature.hpp:32
constexpr int num_nodes_per_wedge_surface
Definition kernel_helpers.hpp:6
constexpr dense::Mat< T, 3, 3 > symmetric_grad(const dense::Mat< T, 3, 3 > &J_inv_transposed, const dense::Vec< T, 3 > &quad_point, const int dof, const int dim)
Returns the symmetric gradient of the shape function of a dof at a quadrature point.
Definition integrands.hpp:685
void wedge_surface_physical_coords(dense::Vec< T, 3 >(&wedge_surf_phy_coords)[num_wedges_per_hex_cell][num_nodes_per_wedge_surface], const grid::Grid3DDataVec< T, 3 > &lateral_grid, const int local_subdomain_id, const int x_cell, const int y_cell)
Extracts the (unit sphere) surface vertex coords of the two wedges of a hex cell.
Definition kernel_helpers.hpp:26
constexpr int num_wedges_per_hex_cell
Definition kernel_helpers.hpp:5
void extract_local_wedge_scalar_coefficients(dense::Vec< T, 6 >(&local_coefficients)[2], const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const grid::Grid4DDataScalar< T > &global_coefficients)
Extracts the local vector coefficients for the two wedges of a hex cell from the global coefficient v...
Definition kernel_helpers.hpp:306
constexpr int num_nodes_per_wedge
Definition kernel_helpers.hpp:7
constexpr T shape(const int node_idx, const T xi, const T eta, const T zeta)
(Tensor-product) Shape function.
Definition integrands.hpp:146
constexpr dense::Mat< T, 3, 3 > jac(const dense::Vec< T, 3 > &p1_phy, const dense::Vec< T, 3 > &p2_phy, const dense::Vec< T, 3 > &p3_phy, const T r_1, const T r_2, const T xi, const T eta, const T zeta)
Definition integrands.hpp:657
BoundaryConditionMapping[2] BoundaryConditions
Definition shell/bit_masks.hpp:37
ShellBoundaryFlag
FlagLike that indicates boundary types for the thick spherical shell.
Definition shell/bit_masks.hpp:12
BoundaryConditionFlag get_boundary_condition_flag(const BoundaryConditions bcs, ShellBoundaryFlag sbf)
Retrieve the boundary condition flag that is associated with a location in the shell e....
Definition shell/bit_masks.hpp:42
BoundaryConditionFlag
FlagLike that indicates the type of boundary condition
Definition shell/bit_masks.hpp:25
Kokkos::View< dense::Mat< ScalarType, Rows, Cols > ****[NumMatrices], Layout > Grid4DDataMatrices
Definition grid_types.hpp:173
Kokkos::View< ScalarType ***[VecDim], Layout > Grid3DDataVec
Definition grid_types.hpp:42
Kokkos::View< ScalarType ****, Layout > Grid4DDataScalar
Definition grid_types.hpp:27
Kokkos::View< ScalarType **, Layout > Grid2DDataScalar
Definition grid_types.hpp:21
dense::Mat< ScalarType, 3, 3 > trafo_mat_cartesian_to_normal_tangential(const dense::Vec< ScalarType, 3 > &n_input)
Constructs a robust orthonormal transformation matrix from Cartesian to (normal–tangential–tangential...
Definition local_basis_trafo_normal_tangential.hpp:36
OperatorApplyMode
Modes for applying an operator to a vector.
Definition operator.hpp:30
@ Replace
Overwrite the destination vector.
OperatorStoredMatrixMode
Modes for applying stored matrices.
Definition operator.hpp:47
@ Off
Do not use stored matrices.
OperatorCommunicationMode
Modes for communication during operator application.
Definition operator.hpp:40
@ CommunicateAdditively
Communicate and add results.
constexpr bool has_flag(E mask_value, E flag) noexcept
Checks if a bitmask value contains a specific flag.
Definition bit_masking.hpp:43
detail::PrefixCout logroot([]() { return detail::log_prefix();})
std::ostream subclass that just logs on root and adds a timestamp for each line.
Definition mat.hpp:10
T double_contract(const Mat &mat)
Definition mat.hpp:226
SoA (Structure-of-Arrays) 4D vector grid data.
Definition grid_types.hpp:51