// epsilon_divdiv_kerngen_v03_teams_precomp.hpp
// NOTE(review): this listing was captured from a Doxygen-rendered source page.
// The viewer chrome ("Loading... / Searching... / No Matches" and the
// "Go to the documentation of this file" link) is an extraction artifact, and
// several hyperlinked declaration lines are missing from the listing below.
1#pragma once
2
3#include "../../../quadrature/quadrature.hpp"
5#include "dense/vec.hpp"
9#include "impl/Kokkos_Profiling.hpp"
10#include "linalg/operator.hpp"
13#include "linalg/vector.hpp"
14#include "linalg/vector_q1.hpp"
15#include "util/timer.hpp"
16
18
24
25template < typename ScalarT, int VecDim = 3 >
27{
28 public:
31 using ScalarType = ScalarT;
32 static constexpr int LocalMatrixDim = 18;
35
36 private:
37 LocalMatrixStorage local_matrix_storage_;
38
40
45 BoundaryConditions bcs_;
46
47 bool treat_boundary_;
48 bool diagonal_;
49
50 linalg::OperatorApplyMode operator_apply_mode_;
51 linalg::OperatorCommunicationMode operator_communication_mode_;
52 linalg::OperatorStoredMatrixMode operator_stored_matrix_mode_;
53
56
59
60 // Quadrature points.
61 const int num_quad_points = quadrature::quad_felippa_1x1_num_quad_points;
62
65
66 // subdomain, x, y, 2 wedges, quad points, 9 components in the matrix
67 Kokkos::View< ScalarT*** [2][1][3][3], grid::Layout > g1_;
68
69 // subdomain, x, y, 2 wedges, quad points
70 Kokkos::View< ScalarT*** [2][1], grid::Layout > g2_;
71
72 int local_subdomains_;
73 int hex_lat_;
74 int hex_rad_;
75 int lat_refinement_level_;
76 int block_size_;
77 int blocks_per_column_;
78 int blocks_;
79
80 public:
// Constructor. NOTE(review): the constructor name line (Doxygen line 86) and
// several parameter lines (domain, grid, radii, mask, k, apply/stored-matrix
// mode defaults) were lost in extraction; the initializer list below shows
// which members they feed. g1_/g2_ are sized per lateral cell (extent - 1
// cells per node dimension) and filled by precompute().
 87 BoundaryConditions bcs,
 88 bool diagonal,
 90 linalg::OperatorCommunicationMode operator_communication_mode =
 93 : domain_( domain )
 94 , grid_( grid )
 95 , radii_( radii )
 96 , mask_( mask )
 97 , k_( k )
 98 , treat_boundary_( true )
 99 , diagonal_( diagonal )
 100 , operator_apply_mode_( operator_apply_mode )
 101 , operator_communication_mode_( operator_communication_mode )
 102 , operator_stored_matrix_mode_( operator_stored_matrix_mode )
 103 , g1_( "g1", grid.extent( 0 ), grid.extent( 1 ) - 1, grid.extent( 2 ) - 1 )
 104 , g2_( "g2", grid.extent( 0 ), grid.extent( 1 ) - 1, grid.extent( 2 ) - 1 )
 105 // TODO: we can reuse the send and recv buffers and pass in from the outside somehow
 106 , send_buffers_( domain )
 107 , recv_buffers_( domain )
 108 {
 109 bcs_[0] = bcs[0];
 110 bcs_[1] = bcs[1];
 113 local_subdomains_ = domain_.subdomains().size();
 // Cells per subdomain: nodes-per-side minus one, laterally and radially.
 114 hex_lat_ = domain_.domain_info().subdomain_num_nodes_per_side_laterally() - 1;
 115 hex_rad_ = domain_.domain_info().subdomain_num_nodes_radially() - 1;
 116 lat_refinement_level_ = domain_.domain_info().diamond_lateral_refinement_level();
 // Team launch geometry: one thread per radial cell, capped at 128 threads per
 // team; blocks_per_column_ teams cover one radial column, blocks_ teams total.
 117 const int threads_per_column = hex_rad_;
 118 block_size_ = std::min( 128, threads_per_column );
 119 blocks_per_column_ = ( threads_per_column + block_size_ - 1 ) / block_size_;
 120 blocks_ = local_subdomains_ * hex_lat_ * hex_lat_ * blocks_per_column_;
 121
 // Fill the radially-constant Jacobian factors (g1_) and determinants (g2_).
 122 precompute();
 123 }
124
// Setter for the apply and communication modes used by apply_impl().
// NOTE(review): the function name line (Doxygen line 125) was lost in
// extraction; from the body this is a plain two-member setter.
 126 const linalg::OperatorApplyMode operator_apply_mode,
 127 const linalg::OperatorCommunicationMode operator_communication_mode )
 128 {
 129 operator_apply_mode_ = operator_apply_mode;
 130 operator_communication_mode_ = operator_communication_mode;
 131 }
132
133 /// @brief S/Getter for diagonal member
134 void set_diagonal( bool v ) { diagonal_ = v; }
135
136 /// @brief Getter for coefficient
138
139 /// @brief Getter for domain member
140 const grid::shell::DistributedDomain& get_domain() const { return domain_; }
141
142 /// @brief Getter for radii member
144
145 /// @brief Getter for grid member
147
 148 /// @brief Queries whether the given boundary flag is set in the mask at the given cell.
 149 KOKKOS_INLINE_FUNCTION
 // NOTE(review): the signature line (Doxygen line 150: return type, function
 // name, and the trailing `flag` parameter used below) was lost in extraction;
 // from the body this is a boolean flag test on mask_.
 151 const int local_subdomain_id,
 152 const int x_cell,
 153 const int y_cell,
 154 const int r_cell,
 156 {
 157 return util::has_flag( mask_( local_subdomain_id, x_cell, y_cell, r_cell ), flag );
 158 }
159
 160 /// @brief Allocates memory for the local matrices when a stored-matrix mode is enabled.
 // NOTE(review): the signature line (Doxygen line 161) and the storage-allocation
 // call on line 171 were lost in extraction; the visible arguments show the
 // storage is sized from domain_, the mode, level_range, and GCAElements.
 162 linalg::OperatorStoredMatrixMode operator_stored_matrix_mode,
 163 std::optional< int > level_range,
 164 std::optional< grid::Grid4DDataScalar< ScalarType > > GCAElements )
 165 {
 166 operator_stored_matrix_mode_ = operator_stored_matrix_mode;
 167
 168 // allocate storage if necessary
 169 if ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off )
 170 {
 172 domain_, operator_stored_matrix_mode_, level_range, GCAElements );
 173 }
 174 }
175
176 linalg::OperatorStoredMatrixMode get_stored_matrix_mode() { return operator_stored_matrix_mode_; }
177
 178 /// @brief Set the local matrix stored in the operator
 /// Asserts that a stored-matrix mode is active, then forwards to the storage.
 179 KOKKOS_INLINE_FUNCTION
 // NOTE(review): the signature line (Doxygen line 180) and the `mat` parameter
 // line (186) were lost in extraction; the body forwards all arguments to
 // local_matrix_storage_.set_matrix().
 181 const int local_subdomain_id,
 182 const int x_cell,
 183 const int y_cell,
 184 const int r_cell,
 185 const int wedge,
 187 {
 188 KOKKOS_ASSERT( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off );
 189 local_matrix_storage_.set_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge, mat );
 190 }
191
 192 /// @brief Retrieves the local matrix.
 193 /// If there are stored local matrices, the desired local matrix is loaded and returned;
 194 /// if not, the local matrix is assembled on-the-fly.
 195 KOKKOS_INLINE_FUNCTION
 // NOTE(review): the signature line (Doxygen line 196: return type and function
 // name) was lost in extraction.
 197 const int local_subdomain_id,
 198 const int x_cell,
 199 const int y_cell,
 200 const int r_cell,
 201 const int wedge ) const
 202 {
 203 // request from storage
 204 if ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off )
 205 {
 // Hard failure (device-side abort) when storage is active but the matrix is missing.
 206 if ( !local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge ) )
 207 {
 208 Kokkos::abort( "No matrix found at that spatial index." );
 209 }
 210 return local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge );
 211 }
 212 else
 213 {
 214 return assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, wedge );
 215 }
 216 }
217
 /// @brief Applies the operator: dst (+)= A * src, launched as a Kokkos team kernel.
 /// In Replace mode dst is zeroed first; in CommunicateAdditively mode the
 /// halo contributions in dst are exchanged after the kernel.
 /// @throws std::runtime_error if src/dst or src/grid extents do not match.
 218 void apply_impl( const SrcVectorType& src, DstVectorType& dst )
 219 {
 220 util::Timer timer_apply( "epsilon_divdiv_apply" );
 221
 222 if ( operator_apply_mode_ == linalg::OperatorApplyMode::Replace )
 223 {
 224 assign( dst, 0 );
 225 }
 226
 // Cache raw grid-data views as members so operator()(Team) can access them.
 227 src_ = src.grid_data();
 228 dst_ = dst.grid_data();
 229
 230 if ( src_.extent( 0 ) != dst_.extent( 0 ) || src_.extent( 1 ) != dst_.extent( 1 ) ||
 231 src_.extent( 2 ) != dst_.extent( 2 ) || src_.extent( 3 ) != dst_.extent( 3 ) )
 232 {
 233 throw std::runtime_error( "EpsilonDivDiv: src/dst mismatch" );
 234 }
 235
 // NOTE(review): this second check compares src against the grid, but reuses
 // the "src/dst mismatch" message — consider "src/grid mismatch" here.
 236 if ( src_.extent( 1 ) != grid_.extent( 1 ) || src_.extent( 2 ) != grid_.extent( 2 ) )
 237 {
 238 throw std::runtime_error( "EpsilonDivDiv: src/dst mismatch" );
 239 }
 240
 241 util::Timer timer_kernel( "epsilon_divdiv_kernel" );
 // NOTE(review): num_cells appears unused in the visible code below
 // (the team policy is sized from blocks_/block_size_ instead).
 242 const auto num_cells =
 243 src_.extent( 0 ) * ( src_.extent( 1 ) - 1 ) * ( src_.extent( 2 ) - 1 ) * ( src_.extent( 3 ) - 1 );
 244
 // Per-team scratch: g1 (2 wedges x 1 quad point x 3x3) + g2 (2 wedges x 1 quad point).
 245 constexpr int size_g1 = num_wedges_per_hex_cell * 1 * 9 * sizeof( ScalarT );
 246 constexpr int size_g2 = num_wedges_per_hex_cell * 1 * sizeof( ScalarT );
 247 constexpr int shmem_size = size_g1 + size_g2;
 248 Kokkos::TeamPolicy<> policy =
 249 Kokkos::TeamPolicy<>( blocks_, block_size_ ).set_scratch_size( 0, Kokkos::PerTeam( shmem_size ));
 250
 251
 252 Kokkos::parallel_for( "matvec", policy, *this );
 253
 254 //Kokkos::parallel_for( "matvec", grid::shell::local_domain_md_range_policy_cells( domain_ ), *this );
 255 Kokkos::fence();
 256 timer_kernel.stop();
 257
 258 if ( operator_communication_mode_ == linalg::OperatorCommunicationMode::CommunicateAdditively )
 259 {
 260 util::Timer timer_comm( "epsilon_divdiv_comm" );
 261
 // NOTE(review): the name of the additive-communication routine (Doxygen
 // line 262) was lost in extraction; only its argument list remains.
 263 domain_, dst_, send_buffers_, recv_buffers_ );
 265 }
 266 }
267
 // Precompute body. NOTE(review): the signature line (Doxygen line 268,
 // presumably `void precompute()`) was lost in extraction. Launches the
 // lateral-cell kernel (operator()(int,int,int)) that fills g1_ and g2_ with
 // the radially-constant Jacobian factors and determinants.
 269 {
 270 Kokkos::parallel_for( "precompute", grid::shell::local_domain_md_range_policy_cells_lateral( domain_ ), *this );
 271 }
272
 /// @brief Precompute kernel body: for one lateral cell, stores the lateral
 /// Jacobian determinant (g2_) and inverse-transposed lateral Jacobian (g1_)
 /// for both wedges at every quadrature point.
 273 KOKKOS_INLINE_FUNCTION void operator()( const int local_subdomain_id, const int x_cell, const int y_cell ) const
 274 {
 275 // Gather surface points for each wedge.
 // NOTE(review): the declaration of `wedge_phy_surf` (Doxygen line 276) was
 // lost in extraction; it is the output buffer filled by the call below.
 277 wedge_surface_physical_coords( wedge_phy_surf, grid_, local_subdomain_id, x_cell, y_cell );
 278
 279 for ( int wedge = 0; wedge < num_wedges_per_hex_cell; ++wedge )
 280 {
 281 const auto p1_phy = wedge_phy_surf[wedge][0];
 282 const auto p2_phy = wedge_phy_surf[wedge][1];
 283 const auto p3_phy = wedge_phy_surf[wedge][2];
 284 for ( int q = 0; q < num_quad_points; ++q )
 285 {
 286 const auto quad_point = quad_points[q];
 // Lateral Jacobian at this quadrature point; |det| is the area factor.
 287 const auto b = jac_lat( p1_phy, p2_phy, p3_phy, quad_point( 0 ), quad_point( 1 ) );
 288 const ScalarT det_b = b.det();
 289 const ScalarT g2 = Kokkos::abs( det_b );
 290 g2_( local_subdomain_id, x_cell, y_cell, wedge, q ) = g2;
 291 const auto b_inv_t = b.inv_transposed( det_b );
 292 for ( int d1 = 0; d1 < 3; ++d1 )
 293 {
 294 for ( int d2 = 0; d2 < 3; ++d2 )
 295 {
 296 g1_( local_subdomain_id, x_cell, y_cell, wedge, q, d1, d2 ) = b_inv_t( d1, d2 );
 297 }
 298 }
 299 }
 300 }
 301 }
302
 303 KOKKOS_INLINE_FUNCTION
 // Decomposes a flat global cell index into (subdomain, x, y, r) coordinates.
 // NOTE(review): the function name line (Doxygen line 304) was lost in
 // extraction. Also suspicious: `num_cells_r` is never used, and both y_cell
 // and r_cell are derived from num_cells_x — this looks correct only when the
 // flat index is laid out as ((subdomain, x, y, r)) with the innermost stride
 // being num_cells_x == num_cells_r; verify the intended layout against callers.
 305 int global_cell_idx,
 306 int cells_per_subdomain,
 307 int num_cells_x,
 308 int num_cells_y,
 309 int num_cells_r,
 310 int& local_subdomain_id,
 311 int& x_cell,
 312 int& y_cell,
 313 int& r_cell ) const
 314 {
 315 local_subdomain_id = global_cell_idx / cells_per_subdomain;
 316 int rem = global_cell_idx % cells_per_subdomain;
 317 x_cell = rem / ( num_cells_x * num_cells_y );
 318 rem = rem % ( num_cells_x * num_cells_y );
 319 y_cell = rem / num_cells_x;
 320 r_cell = rem % num_cells_x;
 321 }
322
 323 using Team = Kokkos::TeamPolicy<>::member_type;
 /// @brief Team kernel body: one team per (subdomain, lateral cell, radial block);
 /// one thread per radial cell in the block. Either applies stored/assembled
 /// local matrices, or runs the generated fused local matvec using the
 /// precomputed lateral Jacobian factors (g1_/g2_) staged in team scratch.
 /// Results are accumulated into dst_ with atomic adds (neighbouring cells
 /// share nodes).
 324 KOKKOS_INLINE_FUNCTION void operator()( const Team& team ) const
 325 {
 326 int local_subdomain_id, x_cell, y_cell, r_cell;
 327
 // Decode league rank into (subdomain, x, y, radial block); the bit tricks
 // assume hex_lat_ == 2^lat_refinement_level_ (power of two).
 328 {
 329 const int league_rank = team.league_rank();
 330 int tmp = league_rank;
 331 const int r_block_index = tmp % blocks_per_column_;
 332 tmp /= blocks_per_column_;
 333 y_cell = tmp & ( hex_lat_ - 1 ); // league_rank % hex_lat_
 334 tmp = tmp >> lat_refinement_level_; // league_rank / hex_lat_
 335 x_cell = tmp & ( hex_lat_ - 1 ); // tmp % hex_lat_
 336 local_subdomain_id = tmp >> lat_refinement_level_; // tmp / hex_lat_
 337
 338 const int thread_index = team.team_rank();
 339 const int block_size = team.team_size();
 // NOTE(review): no visible guard for r_cell >= hex_rad_ — when hex_rad_ is
 // not a multiple of block_size_ (possible once hex_rad_ > 128), threads of
 // the last radial block would index out of range. Confirm a guard exists in
 // the stripped lines or that hex_rad_ is always a multiple of block_size_.
 340 r_cell = r_block_index * block_size + thread_index;
 341 }
 342
 343 // If we have stored lmatrices, use them.
 344 // It's the user's responsibility to write meaningful matrices via set_lmatrix()
 345 // We probably never want to assemble lmatrices with DCA and store,
 346 // so GCA should be the actor storing matrices.
 347 // use stored matrices (at least on some elements)
 348 if ( operator_stored_matrix_mode_ != linalg::OperatorStoredMatrixMode::Off )
 349 {
 // NOTE(review): the declaration of the per-wedge local matrices `A[2]`
 // (Doxygen line 350) was lost in extraction.
 351
 352 if ( operator_stored_matrix_mode_ == linalg::OperatorStoredMatrixMode::Full )
 353 {
 354 A[0] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
 355 A[1] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
 356 }
 357 else if ( operator_stored_matrix_mode_ == linalg::OperatorStoredMatrixMode::Selective )
 358 {
 359 if ( local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 ) &&
 360 local_matrix_storage_.has_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 ) )
 361 {
 362 A[0] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
 363 A[1] = local_matrix_storage_.get_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
 364 }
 365 else
 366 {
 367 // Kokkos::abort("Matrix not found.");
 368 A[0] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 0 );
 369 A[1] = assemble_local_matrix( local_subdomain_id, x_cell, y_cell, r_cell, 1 );
 370 }
 371 }
 372 // BCs are applied by GCA ... to be discussed for free-slip
 373
 374 if ( diagonal_ )
 375 {
 376 A[0] = A[0].diagonal();
 377 A[1] = A[1].diagonal();
 378 }
 379
 // Gather per-wedge source DoFs for each vector component into the local
 // vectors. NOTE(review): the declarations of src/src_d and the gather-call
 // name (Doxygen lines 380, 383-384) were lost in extraction.
 381 for ( int dimj = 0; dimj < 3; dimj++ )
 382 {
 385 src_d, local_subdomain_id, x_cell, y_cell, r_cell, dimj, src_ );
 386
 387 for ( int wedge = 0; wedge < num_wedges_per_hex_cell; wedge++ )
 388 {
 389 for ( int i = 0; i < num_nodes_per_wedge; i++ )
 390 {
 391 src[wedge]( dimj * num_nodes_per_wedge + i ) = src_d[wedge]( i );
 392 }
 393 }
 394 }
 395
 // Local matvec per wedge (dst declaration, Doxygen line 396, stripped).
 397
 398 dst[0] = A[0] * src[0];
 399 dst[1] = A[1] * src[1];
 400
 // Scatter per-component slices back into dst_. NOTE(review): the dst_d
 // declaration and the scatter-call name (lines 402-403, 407) were stripped.
 401 for ( int dimi = 0; dimi < 3; dimi++ )
 402 {
 404 dst_d[0] = dst[0].template slice< 6 >( dimi * num_nodes_per_wedge );
 405 dst_d[1] = dst[1].template slice< 6 >( dimi * num_nodes_per_wedge );
 406
 408 dst_, local_subdomain_id, x_cell, y_cell, r_cell, dimi, dst_d );
 409 }
 410 }
 411 else
 412 {
 // On-the-fly fused local matvec (generated code). Gather the four surface
 // nodes of the quad cell and split them into the two triangular wedges.
 413 double wedge_surf_phy_coords[2][3][3];
 414 double quad_surface_coords[2][2][3];
 415 quad_surface_coords[0][0][0] = grid_( local_subdomain_id, x_cell, y_cell, 0 );
 416 quad_surface_coords[0][0][1] = grid_( local_subdomain_id, x_cell, y_cell, 1 );
 417 quad_surface_coords[0][0][2] = grid_( local_subdomain_id, x_cell, y_cell, 2 );
 418 quad_surface_coords[0][1][0] = grid_( local_subdomain_id, x_cell, y_cell + 1, 0 );
 419 quad_surface_coords[0][1][1] = grid_( local_subdomain_id, x_cell, y_cell + 1, 1 );
 420 quad_surface_coords[0][1][2] = grid_( local_subdomain_id, x_cell, y_cell + 1, 2 );
 421 quad_surface_coords[1][0][0] = grid_( local_subdomain_id, x_cell + 1, y_cell, 0 );
 422 quad_surface_coords[1][0][1] = grid_( local_subdomain_id, x_cell + 1, y_cell, 1 );
 423 quad_surface_coords[1][0][2] = grid_( local_subdomain_id, x_cell + 1, y_cell, 2 );
 424 quad_surface_coords[1][1][0] = grid_( local_subdomain_id, x_cell + 1, y_cell + 1, 0 );
 425 quad_surface_coords[1][1][1] = grid_( local_subdomain_id, x_cell + 1, y_cell + 1, 1 );
 426 quad_surface_coords[1][1][2] = grid_( local_subdomain_id, x_cell + 1, y_cell + 1, 2 );
 427 wedge_surf_phy_coords[0][0][0] = quad_surface_coords[0][0][0];
 428 wedge_surf_phy_coords[0][0][1] = quad_surface_coords[0][0][1];
 429 wedge_surf_phy_coords[0][0][2] = quad_surface_coords[0][0][2];
 430 wedge_surf_phy_coords[0][1][0] = quad_surface_coords[1][0][0];
 431 wedge_surf_phy_coords[0][1][1] = quad_surface_coords[1][0][1];
 432 wedge_surf_phy_coords[0][1][2] = quad_surface_coords[1][0][2];
 433 wedge_surf_phy_coords[0][2][0] = quad_surface_coords[0][1][0];
 434 wedge_surf_phy_coords[0][2][1] = quad_surface_coords[0][1][1];
 435 wedge_surf_phy_coords[0][2][2] = quad_surface_coords[0][1][2];
 436 wedge_surf_phy_coords[1][0][0] = quad_surface_coords[1][1][0];
 437 wedge_surf_phy_coords[1][0][1] = quad_surface_coords[1][1][1];
 438 wedge_surf_phy_coords[1][0][2] = quad_surface_coords[1][1][2];
 439 wedge_surf_phy_coords[1][1][0] = quad_surface_coords[0][1][0];
 440 wedge_surf_phy_coords[1][1][1] = quad_surface_coords[0][1][1];
 441 wedge_surf_phy_coords[1][1][2] = quad_surface_coords[0][1][2];
 442 wedge_surf_phy_coords[1][2][0] = quad_surface_coords[1][0][0];
 443 wedge_surf_phy_coords[1][2][1] = quad_surface_coords[1][0][1];
 444 wedge_surf_phy_coords[1][2][2] = quad_surface_coords[1][0][2];
 // Inner/outer radii of this radial cell.
 445 double r_0 = radii_( local_subdomain_id, r_cell );
 446 double r_1 = radii_( local_subdomain_id, r_cell + 1 );
 // Gather the 6 nodal source values per wedge and component.
 447 double src_local_hex[3][2][6];
 448 int dim;
 449 for ( dim = 0; dim < 3; dim += 1 )
 450 {
 451 src_local_hex[dim][0][0] = src_( local_subdomain_id, x_cell, y_cell, r_cell, dim );
 452 src_local_hex[dim][0][1] = src_( local_subdomain_id, x_cell + 1, y_cell, r_cell, dim );
 453 src_local_hex[dim][0][2] = src_( local_subdomain_id, x_cell, y_cell + 1, r_cell, dim );
 454 src_local_hex[dim][0][3] = src_( local_subdomain_id, x_cell, y_cell, r_cell + 1, dim );
 455 src_local_hex[dim][0][4] = src_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1, dim );
 456 src_local_hex[dim][0][5] = src_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1, dim );
 457 src_local_hex[dim][1][0] = src_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell, dim );
 458 src_local_hex[dim][1][1] = src_( local_subdomain_id, x_cell, y_cell + 1, r_cell, dim );
 459 src_local_hex[dim][1][2] = src_( local_subdomain_id, x_cell + 1, y_cell, r_cell, dim );
 460 src_local_hex[dim][1][3] = src_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell + 1, dim );
 461 src_local_hex[dim][1][4] = src_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1, dim );
 462 src_local_hex[dim][1][5] = src_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1, dim );
 463 };
 // Gather the nodal coefficient values per wedge.
 464 double k_local_hex[2][6];
 465 k_local_hex[0][0] = k_( local_subdomain_id, x_cell, y_cell, r_cell );
 466 k_local_hex[0][1] = k_( local_subdomain_id, x_cell + 1, y_cell, r_cell );
 467 k_local_hex[0][2] = k_( local_subdomain_id, x_cell, y_cell + 1, r_cell );
 468 k_local_hex[0][3] = k_( local_subdomain_id, x_cell, y_cell, r_cell + 1 );
 469 k_local_hex[0][4] = k_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1 );
 470 k_local_hex[0][5] = k_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1 );
 471 k_local_hex[1][0] = k_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell );
 472 k_local_hex[1][1] = k_( local_subdomain_id, x_cell, y_cell + 1, r_cell );
 473 k_local_hex[1][2] = k_( local_subdomain_id, x_cell + 1, y_cell, r_cell );
 474 k_local_hex[1][3] = k_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell + 1 );
 475 k_local_hex[1][4] = k_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1 );
 476 k_local_hex[1][5] = k_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1 );
 // Single-point Felippa quadrature rule (barycentric midpoint, weight 1).
 477 double qp_array[1][3];
 478 double qw_array[1];
 479 qp_array[0][0] = 0.33333333333333331;
 480 qp_array[0][1] = 0.33333333333333331;
 481 qp_array[0][2] = 0.0;
 482 qw_array[0] = 1.0;
 // Dirichlet-style boundary handling: when treating boundaries (and not in
 // diagonal mode), skip the 3 nodes on the CMB / surface side of the wedge.
 483 int at_cmb_boundary = has_flag( local_subdomain_id, x_cell, y_cell, r_cell, CMB );
 484 int at_surface_boundary = has_flag( local_subdomain_id, x_cell, y_cell, r_cell + 1, SURFACE );
 485 int cmb_shift = ( ( treat_boundary_ && diagonal_ == false && at_cmb_boundary != 0 ) ? ( 3 ) : ( 0 ) );
 // NOTE(review): max_rad appears unused below — possibly a leftover from a
 // removed radial bounds check.
 486 int max_rad = radii_.extent( 1 ) - 1;
 487 int surface_shift =
 488 ( ( treat_boundary_ && diagonal_ == false && at_surface_boundary != 0 ) ? ( 3 ) : ( 0 ) );
 489 double dst_array[3][2][6] = { 0 };
 // Radial derivative of the reference-to-physical map (constant per cell).
 490 double grad_r = -1.0 / 2.0 * r_0 + ( 1.0 / 2.0 ) * r_1;
 491 double grad_r_inv = 1.0 / grad_r;
 492 int w = 0;
 493
 494 constexpr int N_q = 1;
 495 constexpr int size_g1 = num_wedges_per_hex_cell * N_q * 9 * sizeof( ScalarT );
 496 constexpr int size_g2 = num_wedges_per_hex_cell * N_q * sizeof( ScalarT );
 497
 // Team scratch buffers for the precomputed lateral Jacobian data; all
 // threads of a team share the same lateral cell, so one copy suffices.
 498 ScalarT* shmem_g1 = (ScalarT*) team.team_shmem().get_shmem( size_g1 );
 499 ScalarT* shmem_g2 = (ScalarT*) team.team_shmem().get_shmem( size_g2 );
 500
 // Thread 0 stages g1_/g2_ into scratch; the barrier publishes it to the team.
 501 if ( team.team_rank() == 0 )
 502 {
 503 for ( int wedge = 0; wedge < num_wedges_per_hex_cell; ++wedge )
 504 {
 505 for ( int q = 0; q < N_q; ++q )
 506 {
 507 shmem_g2[wedge * N_q + q] = g2_( local_subdomain_id, x_cell, y_cell, wedge, q );
 508 for ( int d1 = 0; d1 < 3; ++d1 )
 509 {
 510 for ( int d2 = 0; d2 < 3; ++d2 )
 511 {
 512 shmem_g1[wedge * N_q * 9 + q * 9 + 3 * d1 + d2] =
 513 g1_( local_subdomain_id, x_cell, y_cell, wedge, q, d1, d2 );
 514 }
 515 }
 516 }
 517 }
 518 }
 519 team.team_barrier();
 520
 521 /* Apply local matrix for both wedges and accumulated for all quadrature points. */;
 522 for ( w = 0; w < 2; w += 1 )
 523 {
 524 int q = 0;
 525 for ( q = 0; q < 1; q += 1 )
 526 {
 527 /* Coefficient evaluation on current wedge w */;
 528 double tmpcse_k_eval_0 = ( 1.0 / 2.0 ) * qp_array[q][2];
 529 double tmpcse_k_eval_1 = 1.0 / 2.0 - tmpcse_k_eval_0;
 530 double tmpcse_k_eval_2 = tmpcse_k_eval_0 + 1.0 / 2.0;
 531 double tmpcse_k_eval_3 = -qp_array[q][0] - qp_array[q][1] + 1;
 532 double k_eval = tmpcse_k_eval_1 * tmpcse_k_eval_3 * k_local_hex[w][0] +
 533 tmpcse_k_eval_1 * k_local_hex[w][1] * qp_array[q][0] +
 534 tmpcse_k_eval_1 * k_local_hex[w][2] * qp_array[q][1] +
 535 tmpcse_k_eval_2 * tmpcse_k_eval_3 * k_local_hex[w][3] +
 536 tmpcse_k_eval_2 * k_local_hex[w][4] * qp_array[q][0] +
 537 tmpcse_k_eval_2 * k_local_hex[w][5] * qp_array[q][1];
 538 /*
 539 Load the radially constant parts of the Jacobial from storage,
 540 scale with radial parts of the current element. */
 541 ;
 542 double J_invT[3][3] = { 0 };
 543 double r_inv = 1.0 / ( r_0 + ( -1.0 / 2.0 * r_0 + ( 1.0 / 2.0 ) * r_1 ) * ( qp_array[q][2] + 1 ) );
 544 double factors[3] = { r_inv, r_inv, grad_r_inv };
 545 int d1;
 546 int d2;
 547 for ( d1 = 0; d1 < 3; d1 += 1 )
 548 {
 549 for ( d2 = 0; d2 < 3; d2 += 1 )
 550 {
 551 J_invT[d1][d2] = factors[d2] * shmem_g1[w * N_q * 9 + q * 9 + 3 * d1 + d2];
 552 };
 553 };
 554 double g2 = shmem_g2[w * N_q + q];
 555 /* Computation of the gradient of the scalar shape functions belonging to each DoF.
 556 In the Eps-component-loops, we insert the gradient at the entry of the
 557 vectorial gradient matrix corresponding to the Eps-component. */
 558 ;
 559
 560 double scalar_grad[6][3] = { 0 };
 561 double tmpcse_grad_i_0 = ( 1.0 / 2.0 ) * qp_array[q][2];
 562 double tmpcse_grad_i_1 = tmpcse_grad_i_0 - 1.0 / 2.0;
 563 double tmpcse_grad_i_2 = ( 1.0 / 2.0 ) * qp_array[q][0];
 564 double tmpcse_grad_i_3 = ( 1.0 / 2.0 ) * qp_array[q][1];
 565 double tmpcse_grad_i_4 = tmpcse_grad_i_2 + tmpcse_grad_i_3 - 1.0 / 2.0;
 566 double tmpcse_grad_i_5 = tmpcse_grad_i_2 * J_invT[0][2];
 567 double tmpcse_grad_i_6 = -tmpcse_grad_i_1;
 568 double tmpcse_grad_i_7 = tmpcse_grad_i_2 * J_invT[1][2];
 569 double tmpcse_grad_i_8 = tmpcse_grad_i_2 * J_invT[2][2];
 570 double tmpcse_grad_i_9 = tmpcse_grad_i_3 * J_invT[0][2];
 571 double tmpcse_grad_i_10 = tmpcse_grad_i_3 * J_invT[1][2];
 572 double tmpcse_grad_i_11 = tmpcse_grad_i_3 * J_invT[2][2];
 573 double tmpcse_grad_i_12 = tmpcse_grad_i_0 + 1.0 / 2.0;
 574 double tmpcse_grad_i_13 = -tmpcse_grad_i_12;
 575 double tmpcse_grad_i_14 = -tmpcse_grad_i_4;
 576 scalar_grad[0][0] = tmpcse_grad_i_1 * J_invT[0][0] + tmpcse_grad_i_1 * J_invT[0][1] +
 577 tmpcse_grad_i_4 * J_invT[0][2];
 578 scalar_grad[0][1] = tmpcse_grad_i_1 * J_invT[1][0] + tmpcse_grad_i_1 * J_invT[1][1] +
 579 tmpcse_grad_i_4 * J_invT[1][2];
 580 scalar_grad[0][2] = tmpcse_grad_i_1 * J_invT[2][0] + tmpcse_grad_i_1 * J_invT[2][1] +
 581 tmpcse_grad_i_4 * J_invT[2][2];
 582 scalar_grad[1][0] = -tmpcse_grad_i_5 + tmpcse_grad_i_6 * J_invT[0][0];
 583 scalar_grad[1][1] = tmpcse_grad_i_6 * J_invT[1][0] - tmpcse_grad_i_7;
 584 scalar_grad[1][2] = tmpcse_grad_i_6 * J_invT[2][0] - tmpcse_grad_i_8;
 585 scalar_grad[2][0] = tmpcse_grad_i_6 * J_invT[0][1] - tmpcse_grad_i_9;
 586 scalar_grad[2][1] = -tmpcse_grad_i_10 + tmpcse_grad_i_6 * J_invT[1][1];
 587 scalar_grad[2][2] = -tmpcse_grad_i_11 + tmpcse_grad_i_6 * J_invT[2][1];
 588 scalar_grad[3][0] = tmpcse_grad_i_13 * J_invT[0][0] + tmpcse_grad_i_13 * J_invT[0][1] +
 589 tmpcse_grad_i_14 * J_invT[0][2];
 590 scalar_grad[3][1] = tmpcse_grad_i_13 * J_invT[1][0] + tmpcse_grad_i_13 * J_invT[1][1] +
 591 tmpcse_grad_i_14 * J_invT[1][2];
 592 scalar_grad[3][2] = tmpcse_grad_i_13 * J_invT[2][0] + tmpcse_grad_i_13 * J_invT[2][1] +
 593 tmpcse_grad_i_14 * J_invT[2][2];
 594 scalar_grad[4][0] = tmpcse_grad_i_12 * J_invT[0][0] + tmpcse_grad_i_5;
 595 scalar_grad[4][1] = tmpcse_grad_i_12 * J_invT[1][0] + tmpcse_grad_i_7;
 596 scalar_grad[4][2] = tmpcse_grad_i_12 * J_invT[2][0] + tmpcse_grad_i_8;
 597 scalar_grad[5][0] = tmpcse_grad_i_12 * J_invT[0][1] + tmpcse_grad_i_9;
 598 scalar_grad[5][1] = tmpcse_grad_i_10 + tmpcse_grad_i_12 * J_invT[1][1];
 599 scalar_grad[5][2] = tmpcse_grad_i_11 + tmpcse_grad_i_12 * J_invT[2][1];
 600 int dimj;
 601
 602 double grad_u[3][3] = { 0 };
 603 double div_u = 0.0;
 604 int node_idx;
 605 /* In the following, we exploit the outer-product-structure of the local MV both in
 606 the components of the Epsilon operators and in the local DoFs. */
 607 ;
 608 /* Loop to assemble the trial gradient. */;
 609 for ( dimj = 0; dimj < 3; dimj += 1 )
 610 {
 611 if ( diagonal_ == false )
 612 {
 // Boundary nodes (cmb_shift/surface_shift) are excluded from the trial sum.
 613 for ( node_idx = cmb_shift; node_idx < 6 - surface_shift; node_idx += 1 )
 614 {
 615 double E_grad_trial[3][3] = { 0 };
 616 E_grad_trial[0][dimj] = scalar_grad[node_idx][0];
 617 E_grad_trial[1][dimj] = scalar_grad[node_idx][1];
 618 E_grad_trial[2][dimj] = scalar_grad[node_idx][2];
 619 double tmpcse_symgrad_trial_0 = 0.5 * E_grad_trial[0][1] + 0.5 * E_grad_trial[1][0];
 620 double tmpcse_symgrad_trial_1 = 0.5 * E_grad_trial[0][2] + 0.5 * E_grad_trial[2][0];
 621 double tmpcse_symgrad_trial_2 = 0.5 * E_grad_trial[1][2] + 0.5 * E_grad_trial[2][1];
 622 grad_u[0][0] =
 623 1.0 * E_grad_trial[0][0] * src_local_hex[dimj][w][node_idx] + grad_u[0][0];
 624 grad_u[0][1] = tmpcse_symgrad_trial_0 * src_local_hex[dimj][w][node_idx] + grad_u[0][1];
 625 grad_u[0][2] = tmpcse_symgrad_trial_1 * src_local_hex[dimj][w][node_idx] + grad_u[0][2];
 626 grad_u[1][0] = tmpcse_symgrad_trial_0 * src_local_hex[dimj][w][node_idx] + grad_u[1][0];
 627 grad_u[1][1] =
 628 1.0 * E_grad_trial[1][1] * src_local_hex[dimj][w][node_idx] + grad_u[1][1];
 629 grad_u[1][2] = tmpcse_symgrad_trial_2 * src_local_hex[dimj][w][node_idx] + grad_u[1][2];
 630 grad_u[2][0] = tmpcse_symgrad_trial_1 * src_local_hex[dimj][w][node_idx] + grad_u[2][0];
 631 grad_u[2][1] = tmpcse_symgrad_trial_2 * src_local_hex[dimj][w][node_idx] + grad_u[2][1];
 632 grad_u[2][2] =
 633 1.0 * E_grad_trial[2][2] * src_local_hex[dimj][w][node_idx] + grad_u[2][2];
 634 div_u = div_u + E_grad_trial[dimj][dimj] * src_local_hex[dimj][w][node_idx];
 635 };
 636 };
 637 };
 638 int dimi;
 639 /* Loop to pair the assembled trial gradient with the test gradients. */;
 640 for ( dimi = 0; dimi < 3; dimi += 1 )
 641 {
 642 if ( diagonal_ == false )
 643 {
 644 for ( node_idx = cmb_shift; node_idx < 6 - surface_shift; node_idx += 1 )
 645 {
 646 double E_grad_test[3][3] = { 0 };
 647 E_grad_test[0][dimi] = scalar_grad[node_idx][0];
 648 E_grad_test[1][dimi] = scalar_grad[node_idx][1];
 649 E_grad_test[2][dimi] = scalar_grad[node_idx][2];
 650 double tmpcse_symgrad_test_0 = 0.5 * E_grad_test[0][1] + 0.5 * E_grad_test[1][0];
 651 double tmpcse_symgrad_test_1 = 0.5 * E_grad_test[0][2] + 0.5 * E_grad_test[2][0];
 652 double tmpcse_symgrad_test_2 = 0.5 * E_grad_test[1][2] + 0.5 * E_grad_test[2][1];
 653 double tmpcse_pairing_0 = 2 * tmpcse_symgrad_test_0;
 654 double tmpcse_pairing_1 = 2 * tmpcse_symgrad_test_1;
 655 double tmpcse_pairing_2 = 2 * tmpcse_symgrad_test_2;
 // NOTE(review): `/ pow( r_inv, 2 )` multiplies by r^2; a plain `r * r`
 // (or 1.0/(r_inv*r_inv)) would avoid the transcendental pow() on device.
 656 dst_array[dimi][w][node_idx] =
 657 g2 * grad_r * k_eval *
 658 ( -0.66666666666666663 * div_u * E_grad_test[dimi][dimi] +
 659 tmpcse_pairing_0 * grad_u[0][1] + tmpcse_pairing_0 * grad_u[1][0] +
 660 tmpcse_pairing_1 * grad_u[0][2] + tmpcse_pairing_1 * grad_u[2][0] +
 661 tmpcse_pairing_2 * grad_u[1][2] + tmpcse_pairing_2 * grad_u[2][1] +
 662 2.0 * E_grad_test[0][0] * grad_u[0][0] +
 663 2.0 * E_grad_test[1][1] * grad_u[1][1] +
 664 2.0 * E_grad_test[2][2] * grad_u[2][2] ) *
 665 qw_array[q] / pow( r_inv, 2 ) +
 666 dst_array[dimi][w][node_idx];
 667 };
 668 };
 669 };
 670 int dim_diagBC;
 671 /* Loop to apply BCs or only the diagonal of the operator. */;
 672 for ( dim_diagBC = 0; dim_diagBC < 3; dim_diagBC += 1 )
 673 {
 674 if ( diagonal_ || treat_boundary_ && ( at_cmb_boundary != 0 || at_surface_boundary != 0 ) )
 675 {
 675 // presumably the shifts are swapped on purpose here so that exactly the
 // boundary rows excluded above receive their diagonal entry — TODO confirm.
 676 int node_idx;
 677 for ( node_idx = surface_shift; node_idx < 6 - cmb_shift; node_idx += 1 )
 678 {
 679 double E_grad_test[3][3] = { 0 };
 680 E_grad_test[0][dim_diagBC] = scalar_grad[node_idx][0];
 681 E_grad_test[1][dim_diagBC] = scalar_grad[node_idx][1];
 682 E_grad_test[2][dim_diagBC] = scalar_grad[node_idx][2];
 683
 684 double grad_u_diag[3][3] = { 0 };
 685 double tmpcse_symgrad_test_0 = 0.5 * E_grad_test[0][1] + 0.5 * E_grad_test[1][0];
 686 double tmpcse_symgrad_test_1 = 0.5 * E_grad_test[0][2] + 0.5 * E_grad_test[2][0];
 687 double tmpcse_symgrad_test_2 = 0.5 * E_grad_test[1][2] + 0.5 * E_grad_test[2][1];
 688 grad_u_diag[0][0] = 1.0 * E_grad_test[0][0] * src_local_hex[dim_diagBC][w][node_idx];
 689 grad_u_diag[0][1] = tmpcse_symgrad_test_0 * src_local_hex[dim_diagBC][w][node_idx];
 690 grad_u_diag[0][2] = tmpcse_symgrad_test_1 * src_local_hex[dim_diagBC][w][node_idx];
 691 grad_u_diag[1][0] = tmpcse_symgrad_test_0 * src_local_hex[dim_diagBC][w][node_idx];
 692 grad_u_diag[1][1] = 1.0 * E_grad_test[1][1] * src_local_hex[dim_diagBC][w][node_idx];
 693 grad_u_diag[1][2] = tmpcse_symgrad_test_2 * src_local_hex[dim_diagBC][w][node_idx];
 694 grad_u_diag[2][0] = tmpcse_symgrad_test_1 * src_local_hex[dim_diagBC][w][node_idx];
 695 grad_u_diag[2][1] = tmpcse_symgrad_test_2 * src_local_hex[dim_diagBC][w][node_idx];
 696 grad_u_diag[2][2] = 1.0 * E_grad_test[2][2] * src_local_hex[dim_diagBC][w][node_idx];
 697 double tmpcse_pairing_0 = 4 * src_local_hex[dim_diagBC][w][node_idx];
 698 double tmpcse_pairing_1 = 2.0 * src_local_hex[dim_diagBC][w][node_idx];
 699 dst_array[dim_diagBC][w][node_idx] =
 700 g2 * grad_r * k_eval *
 701 ( tmpcse_pairing_0 * pow( tmpcse_symgrad_test_0, 2 ) +
 702 tmpcse_pairing_0 * pow( tmpcse_symgrad_test_1, 2 ) +
 703 tmpcse_pairing_0 * pow( tmpcse_symgrad_test_2, 2 ) +
 704 tmpcse_pairing_1 * pow( E_grad_test[0][0], 2 ) +
 705 tmpcse_pairing_1 * pow( E_grad_test[1][1], 2 ) +
 706 tmpcse_pairing_1 * pow( E_grad_test[2][2], 2 ) -
 707 0.66666666666666663 * pow( E_grad_test[dim_diagBC][dim_diagBC], 2 ) *
 708 src_local_hex[dim_diagBC][w][node_idx] ) *
 709 qw_array[q] / pow( r_inv, 2 ) +
 710 dst_array[dim_diagBC][w][node_idx];
 711 };
 712 };
 713 };
 714 };
 715 };
 // Scatter: wedge-local node contributions are merged per shared grid node
 // and accumulated atomically (neighbouring cells/wedges write the same nodes).
 716 int dim_add;
 717 for ( dim_add = 0; dim_add < 3; dim_add += 1 )
 718 {
 719 Kokkos::atomic_add(
 720 &dst_( local_subdomain_id, x_cell, y_cell, r_cell, dim_add ), dst_array[dim_add][0][0] );
 721 Kokkos::atomic_add(
 722 &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell, dim_add ),
 723 dst_array[dim_add][0][1] + dst_array[dim_add][1][2] );
 724 Kokkos::atomic_add(
 725 &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell, dim_add ),
 726 dst_array[dim_add][0][2] + dst_array[dim_add][1][1] );
 727 Kokkos::atomic_add(
 728 &dst_( local_subdomain_id, x_cell, y_cell, r_cell + 1, dim_add ), dst_array[dim_add][0][3] );
 729 Kokkos::atomic_add(
 730 &dst_( local_subdomain_id, x_cell + 1, y_cell, r_cell + 1, dim_add ),
 731 dst_array[dim_add][0][4] + dst_array[dim_add][1][5] );
 732 Kokkos::atomic_add(
 733 &dst_( local_subdomain_id, x_cell, y_cell + 1, r_cell + 1, dim_add ),
 734 dst_array[dim_add][0][5] + dst_array[dim_add][1][4] );
 735 Kokkos::atomic_add(
 736 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell, dim_add ), dst_array[dim_add][1][0] );
 737 Kokkos::atomic_add(
 738 &dst_( local_subdomain_id, x_cell + 1, y_cell + 1, r_cell + 1, dim_add ),
 739 dst_array[dim_add][1][3] );
 740 };
 741 }
 742 }
743
744 /// @brief: For both trial and test space this function sets up a vector:
745 /// each vector element holds the symmetric gradient (a 3x3 matrix) of the shape function of the corresponding dof
746 /// (if dimi == dimj, these are the same and we are on the diagonal of the vectorial diffusion operator)
747 /// Additionally, we compute the scalar factor for the numerical integral comp: determinant of the jacobian,
748 /// evaluation of the coefficient k on the element and the quadrature weight of the current quad-point.
749
750 /// The idea of this function is that the two vectors can be:
751 /// - accumulated to the result of the local matvec with 2 * num_nodes_per_wedge complexity
752 /// by scaling the dot product of the trial vec and local src dofs with each element of the test vec
753 /// (and adding to the dst dofs, this is the fused local matvec).
754 /// - propagated to the local matrix by an outer product of the two vectors
755 /// (without applying it to dofs). This is e.g. required to assemble the finest grid local
756 /// matrix on-the-fly during GCA/Galerkin coarsening.
757
758 ///
 759 KOKKOS_INLINE_FUNCTION void assemble_trial_test_vecs(
 760 const int wedge,
 761 const dense::Vec< ScalarType, VecDim >& quad_point,
 762 const ScalarType quad_weight,
 763 const ScalarT r_1,
 764 const ScalarT r_2,
 765 dense::Vec< ScalarT, 3 > ( *wedge_phy_surf )[3],
 766 const dense::Vec< ScalarT, 6 >* k_local_hex,
 767 const int dimi,
 768 const int dimj,
 // NOTE(review): two output parameters (Doxygen lines 769-770, the
 // `sym_grad_i` / `sym_grad_j` arrays written below) were lost in extraction.
 771 ScalarType& jdet_keval_quadweight ) const
 772 {
 // Full 3D Jacobian of the wedge at this quadrature point; its inverse
 // transpose maps reference shape-function gradients to physical space.
 773 dense::Mat< ScalarType, VecDim, VecDim > J = jac( wedge_phy_surf[wedge], r_1, r_2, quad_point );
 774 const auto det = J.det();
 775 const auto abs_det = Kokkos::abs( det );
 776 const dense::Mat< ScalarType, VecDim, VecDim > J_inv_transposed = J.inv_transposed( det );
 777
 778 // dot of coeff dofs and element-local shape functions to evaluate the coefficent on the current element
 779 ScalarType k_eval = 0.0;
 780 for ( int k = 0; k < num_nodes_per_wedge; k++ )
 781 {
 782 k_eval += shape( k, quad_point ) * k_local_hex[wedge]( k );
 783 }
 784
 // Symmetric gradients of each scalar shape function, inserted at vector
 // components dimi (test) and dimj (trial).
 785 for ( int k = 0; k < num_nodes_per_wedge; k++ )
 786 {
 787 sym_grad_i[k] = symmetric_grad( J_inv_transposed, quad_point, k, dimi );
 788 sym_grad_j[k] = symmetric_grad( J_inv_transposed, quad_point, k, dimj );
 789 }
 // Combined scalar integration factor: quad weight * coefficient * |det J|.
 790 jdet_keval_quadweight = quad_weight * k_eval * abs_det;
 791 }
792
793 /// @brief assemble the local matrix and return it for a given element, wedge, and vectorial component
794 /// (determined by dimi, dimj)
795 KOKKOS_INLINE_FUNCTION
797 const int local_subdomain_id,
798 const int x_cell,
799 const int y_cell,
800 const int r_cell,
801 const int wedge ) const
802 {
803 // Gather surface points for each wedge.
804 // TODO gather this for only 1 wedge
806 wedge_surface_physical_coords( wedge_phy_surf, grid_, local_subdomain_id, x_cell, y_cell );
807
808 // Gather wedge radii.
809 const ScalarT r_1 = radii_( local_subdomain_id, r_cell );
810 const ScalarT r_2 = radii_( local_subdomain_id, r_cell + 1 );
811
813 extract_local_wedge_scalar_coefficients( k_local_hex, local_subdomain_id, x_cell, y_cell, r_cell, k_ );
814
815 // Compute the local element matrix.
817 for ( int dimi = 0; dimi < 3; ++dimi )
818 {
819 for ( int dimj = 0; dimj < 3; ++dimj )
820 {
821 // spatial dimensions: quadrature points and wedge
822 for ( int q = 0; q < num_quad_points; q++ )
823 {
826 ScalarType jdet_keval_quadweight = 0;
828 wedge,
829 quad_points[q],
830 quad_weights[q],
831 r_1,
832 r_2,
833 wedge_phy_surf,
834 k_local_hex,
835 dimi,
836 dimj,
837 sym_grad_i,
838 sym_grad_j,
839 jdet_keval_quadweight );
840
841 // propagate on local matrix by outer product of test and trial vecs
842 for ( int i = 0; i < num_nodes_per_wedge; i++ )
843 {
844 for ( int j = 0; j < num_nodes_per_wedge; j++ )
845 {
846 A( i + dimi * num_nodes_per_wedge, j + dimj * num_nodes_per_wedge ) +=
847 jdet_keval_quadweight *
848 ( 2 * sym_grad_j[j].double_contract( sym_grad_i[i] ) -
849 2.0 / 3.0 * sym_grad_j[j]( dimj, dimj ) * sym_grad_i[i]( dimi, dimi ) );
850 // for the div, we just extract the component from the gradient vector
851 }
852 }
853 }
854 }
855 }
856
857 if ( treat_boundary_ )
858 {
860 boundary_mask.fill( 1.0 );
861
862 for ( int dimi = 0; dimi < 3; ++dimi )
863 {
864 for ( int dimj = 0; dimj < 3; ++dimj )
865 {
866 if ( r_cell == 0 )
867 {
868 // Inner boundary (CMB).
869 for ( int i = 0; i < 6; i++ )
870 {
871 for ( int j = 0; j < 6; j++ )
872 {
873 // on diagonal components of the vectorial diffusion operator, we exclude the diagonal entries from elimination
874 if ( ( dimi == dimj && i != j && ( i < 3 || j < 3 ) ) or
875 ( dimi != dimj && ( i < 3 || j < 3 ) ) )
876 {
877 boundary_mask( i + dimi * num_nodes_per_wedge, j + dimj * num_nodes_per_wedge ) =
878 0.0;
879 }
880 }
881 }
882 }
883
884 if ( r_cell + 1 == radii_.extent( 1 ) - 1 )
885 {
886 // Outer boundary (surface).
887 for ( int i = 0; i < 6; i++ )
888 {
889 for ( int j = 0; j < 6; j++ )
890 {
891 // on diagonal components of the vectorial diffusion operator, we exclude the diagonal entries from elimination
892 if ( ( dimi == dimj && i != j && ( i >= 3 || j >= 3 ) ) or
893 ( dimi != dimj && ( i >= 3 || j >= 3 ) ) )
894 {
895 boundary_mask( i + dimi * num_nodes_per_wedge, j + dimj * num_nodes_per_wedge ) =
896 0.0;
897 }
898 }
899 }
900 }
901 }
902 }
903 A.hadamard_product( boundary_mask );
904 }
905
906 return A;
907 }
908
909 // executes the fused local matvec on an element, given the assembled trial and test vectors
910 KOKKOS_INLINE_FUNCTION void fused_local_mv(
911 ScalarType src_local_hex[8],
912 ScalarType dst_local_hex[8],
913 const int wedge,
914 const ScalarType jdet_keval_quadweight,
917 const int dimi,
918 const int dimj,
919 int r_cell ) const
920 {
921 constexpr int offset_x[2][6] = { { 0, 1, 0, 0, 1, 0 }, { 1, 0, 1, 1, 0, 1 } };
922 constexpr int offset_y[2][6] = { { 0, 0, 1, 0, 0, 1 }, { 1, 1, 0, 1, 1, 0 } };
923 constexpr int offset_r[2][6] = { { 0, 0, 0, 1, 1, 1 }, { 0, 0, 0, 1, 1, 1 } };
924
926 ScalarType divu = 0.0;
927 grad_u.fill( 0.0 );
928
929 const bool at_cmb = r_cell == 0;
930 const bool at_surface = r_cell + 1 == radii_.extent( 1 ) - 1;
931 int cmb_shift = 0;
932 int surface_shift = 0;
933
934 // Compute nabla u at this quadrature point.
935 if ( !diagonal_ )
936 {
937 if ( treat_boundary_ && at_cmb )
938 {
939 // at the core-mantle boundary, we exclude dofs that are lower-indexed than the dof on the boundary
940 cmb_shift = 3;
941 }
942 else if ( treat_boundary_ && at_surface )
943 {
944 // at the surface boundary, we exclude dofs that are higher-indexed than the dof on the boundary
945 surface_shift = 3;
946 }
947
948 // accumulate the element-local gradient/divergence of the trial function (loop over columns of local matrix/local dofs)
949 // by dot of trial vec and src dofs
950 for ( int i = 0 + cmb_shift; i < num_nodes_per_wedge - surface_shift; i++ )
951 {
952 grad_u =
953 grad_u +
954 sym_grad_i[i] * src_local_hex[4 * offset_r[wedge][i] + 2 * offset_y[wedge][i] + offset_x[wedge][i]];
955 divu += sym_grad_i[i]( dimi, dimi ) *
956 src_local_hex[4 * offset_r[wedge][i] + 2 * offset_y[wedge][i] + offset_x[wedge][i]];
957 }
958
959 // Add the test function contributions.
960 // for each row of the local matrix (test-functions):
961 // multiply trial part (fully assembled for the current element from loop above) with test part corresponding to the current row/dof
962 // += due to contributions from other elements
963 for ( int j = 0 + cmb_shift; j < num_nodes_per_wedge - surface_shift; j++ )
964 {
965 dst_local_hex[4 * offset_r[wedge][j] + 2 * offset_y[wedge][j] + offset_x[wedge][j]] +=
966 jdet_keval_quadweight * ( 2 * ( sym_grad_j[j] ).double_contract( grad_u ) -
967 2.0 / 3.0 * sym_grad_j[j]( dimj, dimj ) * divu );
968 // for the div, we just extract the component from the gradient vector
969 }
970 }
971
972 // Dirichlet DoFs are only to be eliminated on diagonal blocks of epsilon
973 if ( diagonal_ || ( dimi == dimj && ( treat_boundary_ && ( at_cmb || at_surface ) ) ) )
974 {
975 // for the diagonal elements at the boundary, we switch the shifts
976 for ( int i = 0 + surface_shift; i < num_nodes_per_wedge - cmb_shift; i++ )
977 {
978 const auto grad_u_diag =
979 sym_grad_i[i] * src_local_hex[4 * offset_r[wedge][i] + 2 * offset_y[wedge][i] + offset_x[wedge][i]];
980 const auto div_u_diag =
981 sym_grad_i[i]( dimi, dimi ) *
982 src_local_hex[4 * offset_r[wedge][i] + 2 * offset_y[wedge][i] + offset_x[wedge][i]];
983
984 dst_local_hex[4 * offset_r[wedge][i] + 2 * offset_y[wedge][i] + offset_x[wedge][i]] +=
985 jdet_keval_quadweight * ( 2 * ( sym_grad_j[i] ).double_contract( grad_u_diag ) -
986 2.0 / 3.0 * sym_grad_j[i]( dimj, dimj ) * div_u_diag );
987 }
988 }
989 }
990};
991
994
995} // namespace terra::fe::wedge::operators::shell::epsdivdiv_history
Send and receive buffers for all process-local subdomain boundaries.
Definition communication.hpp:56
grid::Grid2DDataScalar< ScalarT > get_radii() const
Getter for radii member.
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:143
grid::Grid3DDataVec< ScalarT, 3 > get_grid()
Getter for grid member.
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:146
void operator()(const int local_subdomain_id, const int x_cell, const int y_cell) const
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:273
terra::grid::Grid4DDataMatrices< ScalarType, LocalMatrixDim, LocalMatrixDim, 2 > Grid4DDataLocalMatrices
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:33
void assemble_trial_test_vecs(const int wedge, const dense::Vec< ScalarType, VecDim > &quad_point, const ScalarType quad_weight, const ScalarT r_1, const ScalarT r_2, dense::Vec< ScalarT, 3 >(*wedge_phy_surf)[3], const dense::Vec< ScalarT, 6 > *k_local_hex, const int dimi, const int dimj, dense::Mat< ScalarType, VecDim, VecDim > *sym_grad_i, dense::Mat< ScalarType, VecDim, VecDim > *sym_grad_j, ScalarType &jdet_keval_quadweight) const
For both trial and test space this function sets up a vector: each vector element holds the symmetr...
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:759
void set_diagonal(bool v)
Setter/getter for the diagonal member.
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:134
void map_league_idx_to_indices(int global_cell_idx, int cells_per_subdomain, int num_cells_x, int num_cells_y, int num_cells_r, int &local_subdomain_id, int &x_cell, int &y_cell, int &r_cell) const
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:304
void precompute()
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:268
const grid::shell::DistributedDomain & get_domain() const
Getter for domain member.
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:140
void apply_impl(const SrcVectorType &src, DstVectorType &dst)
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:218
bool has_flag(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, grid::shell::ShellBoundaryFlag flag) const
Getter for mask member.
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:150
const grid::Grid4DDataScalar< ScalarType > & k_grid_data()
Getter for coefficient.
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:137
Kokkos::TeamPolicy<>::member_type Team
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:323
void set_local_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge, const dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > &mat) const
Set the local matrix stored in the operator.
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:180
void fused_local_mv(ScalarType src_local_hex[8], ScalarType dst_local_hex[8], const int wedge, const ScalarType jdet_keval_quadweight, dense::Mat< ScalarType, 3, 3 > *sym_grad_i, dense::Mat< ScalarType, 3, 3 > *sym_grad_j, const int dimi, const int dimj, int r_cell) const
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:910
void operator()(const Team &team) const
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:324
ScalarT ScalarType
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:31
void set_stored_matrix_mode(linalg::OperatorStoredMatrixMode operator_stored_matrix_mode, std::optional< int > level_range, std::optional< grid::Grid4DDataScalar< ScalarType > > GCAElements)
allocates memory for the local matrices
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:161
void set_operator_apply_and_communication_modes(const linalg::OperatorApplyMode operator_apply_mode, const linalg::OperatorCommunicationMode operator_communication_mode)
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:125
static constexpr int LocalMatrixDim
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:32
dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > assemble_local_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
assemble the local matrix and return it for a given element, wedge, and vectorial component (determin...
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:796
EpsilonDivDivKerngenV03TeamsPrecomp(const grid::shell::DistributedDomain &domain, const grid::Grid3DDataVec< ScalarT, 3 > &grid, const grid::Grid2DDataScalar< ScalarT > &radii, const grid::Grid4DDataScalar< grid::shell::ShellBoundaryFlag > &mask, const grid::Grid4DDataScalar< ScalarT > &k, BoundaryConditions bcs, bool diagonal, linalg::OperatorApplyMode operator_apply_mode=linalg::OperatorApplyMode::Replace, linalg::OperatorCommunicationMode operator_communication_mode=linalg::OperatorCommunicationMode::CommunicateAdditively, linalg::OperatorStoredMatrixMode operator_stored_matrix_mode=linalg::OperatorStoredMatrixMode::Off)
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:81
dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > get_local_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
Retrieves the local matrix: if there are stored local matrices, the desired local matrix is loaded and r...
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:196
linalg::OperatorStoredMatrixMode get_stored_matrix_mode()
Definition epsilon_divdiv_kerngen_v03_teams_precomp.hpp:176
Parallel data structure organizing the thick spherical shell metadata for distributed (MPI parallel) ...
Definition spherical_shell.hpp:2518
const std::map< SubdomainInfo, std::tuple< LocalSubdomainIdx, SubdomainNeighborhood > > & subdomains() const
Definition spherical_shell.hpp:2650
const DomainInfo & domain_info() const
Returns a const reference.
Definition spherical_shell.hpp:2647
int diamond_lateral_refinement_level() const
Definition spherical_shell.hpp:843
int subdomain_num_nodes_radially() const
Equivalent to calling subdomain_num_nodes_radially( subdomain_refinement_level() )
Definition spherical_shell.hpp:861
int subdomain_num_nodes_per_side_laterally() const
Equivalent to calling subdomain_num_nodes_per_side_laterally( subdomain_refinement_level() )
Definition spherical_shell.hpp:852
Static assertion: VectorQ1Scalar satisfies VectorLike concept.
Definition vector_q1.hpp:168
const grid::Grid4DDataVec< ScalarType, VecDim > & grid_data() const
Get const reference to grid data.
Definition vector_q1.hpp:288
bool has_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
Checks for presence of a local matrix for a certain element.
Definition local_matrix_storage.hpp:223
dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > get_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge) const
Retrieves the local matrix: if there are stored local matrices, the desired local matrix is loaded and r...
Definition local_matrix_storage.hpp:175
void set_matrix(const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int wedge, dense::Mat< ScalarT, LocalMatrixDim, LocalMatrixDim > mat) const
Set the local matrix stored in the operator.
Definition local_matrix_storage.hpp:118
Timer supporting RAII scope or manual stop.
Definition timer.hpp:342
void stop()
Stop the timer and record elapsed time.
Definition timer.hpp:364
Concept for types that can be used as Galerkin coarse-grid operators in a multigrid hierarchy....
Definition operator.hpp:81
void unpack_and_reduce_local_subdomain_boundaries(const grid::shell::DistributedDomain &domain, const GridDataType &data, SubdomainNeighborhoodSendRecvBuffer< typename GridDataType::value_type, grid::grid_data_vec_dim< GridDataType >() > &boundary_recv_buffers, CommunicationReduction reduction=CommunicationReduction::SUM)
Unpacks and reduces local subdomain boundaries.
Definition communication.hpp:672
void pack_send_and_recv_local_subdomain_boundaries(const grid::shell::DistributedDomain &domain, const GridDataType &data, SubdomainNeighborhoodSendRecvBuffer< typename GridDataType::value_type, grid::grid_data_vec_dim< GridDataType >() > &boundary_send_buffers, SubdomainNeighborhoodSendRecvBuffer< typename GridDataType::value_type, grid::grid_data_vec_dim< GridDataType >() > &boundary_recv_buffers)
Packs, sends and recvs local subdomain boundaries using two sets of buffers.
Definition communication.hpp:242
Definition epsilon_divdiv_kerngen_v01_initial.hpp:16
constexpr void quad_felippa_1x1_quad_points(dense::Vec< T, 3 >(&quad_points)[quad_felippa_1x1_num_quad_points])
Definition wedge/quadrature/quadrature.hpp:36
constexpr void quad_felippa_1x1_quad_weights(T(&quad_weights)[quad_felippa_1x1_num_quad_points])
Definition wedge/quadrature/quadrature.hpp:43
constexpr int quad_felippa_1x1_num_quad_points
Definition wedge/quadrature/quadrature.hpp:32
constexpr int num_nodes_per_wedge_surface
Definition kernel_helpers.hpp:6
constexpr dense::Mat< T, 3, 3 > symmetric_grad(const dense::Mat< T, 3, 3 > &J_inv_transposed, const dense::Vec< T, 3 > &quad_point, const int dof, const int dim)
Returns the symmetric gradient of the shape function of a dof at a quadrature point.
Definition integrands.hpp:685
void wedge_surface_physical_coords(dense::Vec< T, 3 >(&wedge_surf_phy_coords)[num_wedges_per_hex_cell][num_nodes_per_wedge_surface], const grid::Grid3DDataVec< T, 3 > &lateral_grid, const int local_subdomain_id, const int x_cell, const int y_cell)
Extracts the (unit sphere) surface vertex coords of the two wedges of a hex cell.
Definition kernel_helpers.hpp:26
void atomically_add_local_wedge_vector_coefficients(const grid::Grid4DDataVec< T, VecDim > &global_coefficients, const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int d, const dense::Vec< T, 6 > local_coefficients[2])
Performs an atomic add of the two local wedge coefficient vectors of a hex cell into the global coeff...
Definition kernel_helpers.hpp:465
constexpr int num_wedges_per_hex_cell
Definition kernel_helpers.hpp:5
void extract_local_wedge_scalar_coefficients(dense::Vec< T, 6 >(&local_coefficients)[2], const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const grid::Grid4DDataScalar< T > &global_coefficients)
Extracts the local vector coefficients for the two wedges of a hex cell from the global coefficient v...
Definition kernel_helpers.hpp:306
void extract_local_wedge_vector_coefficients(dense::Vec< T, 6 >(&local_coefficients)[2], const int local_subdomain_id, const int x_cell, const int y_cell, const int r_cell, const int d, const grid::Grid4DDataVec< T, VecDim > &global_coefficients)
Extracts the local vector coefficients for the two wedges of a hex cell from the global coefficient v...
Definition kernel_helpers.hpp:356
constexpr int num_nodes_per_wedge
Definition kernel_helpers.hpp:7
constexpr dense::Mat< T, 3, 3 > jac_lat(const dense::Vec< T, 3 > &p1_phy, const dense::Vec< T, 3 > &p2_phy, const dense::Vec< T, 3 > &p3_phy, const T xi, const T eta)
Definition integrands.hpp:634
constexpr T shape(const int node_idx, const T xi, const T eta, const T zeta)
(Tensor-product) Shape function.
Definition integrands.hpp:146
constexpr dense::Mat< T, 3, 3 > jac(const dense::Vec< T, 3 > &p1_phy, const dense::Vec< T, 3 > &p2_phy, const dense::Vec< T, 3 > &p3_phy, const T r_1, const T r_2, const T xi, const T eta, const T zeta)
Definition integrands.hpp:657
BoundaryConditionMapping[2] BoundaryConditions
Definition shell/bit_masks.hpp:37
ShellBoundaryFlag
FlagLike that indicates boundary types for the thick spherical shell.
Definition shell/bit_masks.hpp:12
Kokkos::MDRangePolicy< Kokkos::Rank< 3 > > local_domain_md_range_policy_cells_lateral(const DistributedDomain &distributed_domain)
Definition spherical_shell.hpp:2729
BoundaryConditionFlag
FlagLike that indicates the type of boundary condition
Definition shell/bit_masks.hpp:25
Kokkos::View< dense::Mat< ScalarType, Rows, Cols > ****[NumMatrices], Layout > Grid4DDataMatrices
Definition grid_types.hpp:173
Kokkos::View< ScalarType ***[VecDim], Layout > Grid3DDataVec
Definition grid_types.hpp:42
Kokkos::View< ScalarType ****, Layout > Grid4DDataScalar
Definition grid_types.hpp:27
Kokkos::View< ScalarType **, Layout > Grid2DDataScalar
Definition grid_types.hpp:21
OperatorApplyMode
Modes for applying an operator to a vector.
Definition operator.hpp:30
@ Replace
Overwrite the destination vector.
OperatorStoredMatrixMode
Modes for applying stored matrices.
Definition operator.hpp:47
@ Selective
Use stored matrices on selected, marked elements only, assemble on all others.
@ Full
Use stored matrices on all elements.
@ Off
Do not use stored matrices.
OperatorCommunicationMode
Modes for communication during operator application.
Definition operator.hpp:40
@ CommunicateAdditively
Communicate and add results.
constexpr bool has_flag(E mask_value, E flag) noexcept
Checks if a bitmask value contains a specific flag.
Definition bit_masking.hpp:43
Definition mat.hpp:10
void fill(const T value)
Definition mat.hpp:201
constexpr Mat diagonal() const
Definition mat.hpp:377
Mat & hadamard_product(const Mat &mat)
Definition mat.hpp:213
T double_contract(const Mat &mat)
Definition mat.hpp:226
SoA (Structure-of-Arrays) 4D vector grid data.
Definition grid_types.hpp:51
auto extent(int i) const
Definition grid_types.hpp:75