59 using Team = Kokkos::TeamPolicy<>::member_type;
76 BoundaryConditions bcs_;
89 int local_subdomains_;
104 void update_kernel_path_flag_host_only()
107 if constexpr ( std::is_same_v< Kokkos::DefaultExecutionSpace, Kokkos::Serial > )
113 const BoundaryConditionFlag cmb_bc = get_boundary_condition_flag( bcs_, CMB );
114 const BoundaryConditionFlag surface_bc = get_boundary_condition_flag( bcs_, SURFACE );
115 const bool has_freeslip = ( cmb_bc == FREESLIP ) || ( surface_bc == FREESLIP );
127 BoundaryConditions bcs,
131 : domain_fine_( domain_fine )
132 , domain_coarse_( domain_coarse )
133 , grid_fine_( grid_fine )
134 , radii_( radii_fine )
135 , boundary_mask_fine_( boundary_mask_fine )
136 , operator_apply_mode_( operator_apply_mode )
137 , operator_communication_mode_( operator_communication_mode )
138 , recv_buffers_( domain_coarse )
139 , comm_plan_( domain_coarse )
145 local_subdomains_ = domain_fine_.
subdomains().size();
149 if constexpr ( std::is_same_v< Kokkos::DefaultExecutionSpace, Kokkos::Serial > )
151 lat_tile_ = 1; r_tile_ = 1; r_passes_ = 1;
153#ifdef KOKKOS_ENABLE_OPENMP
154 else if constexpr ( std::is_same_v< Kokkos::DefaultExecutionSpace, Kokkos::OpenMP > )
156 const int max_team = std::min( Kokkos::OpenMP().concurrency(),
157 static_cast< int >( Kokkos::Impl::HostThreadTeamData::max_team_members ) );
158 if ( max_team >= 64 ) { lat_tile_ = 4; r_tile_ = 4; r_passes_ = 4; }
159 else if ( max_team >= 16 ) { lat_tile_ = 4; r_tile_ = 1; r_passes_ = 16; }
160 else { lat_tile_ = 1; r_tile_ = 1; r_passes_ = 1; }
165 lat_tile_ = 4; r_tile_ = 8; r_passes_ = 2;
167 r_tile_block_ = r_tile_ * r_passes_;
168 lat_tiles_ = ( hex_lat_ + lat_tile_ - 1 ) / lat_tile_;
169 r_tiles_ = ( hex_rad_ + r_tile_block_ - 1 ) / r_tile_block_;
170 team_size_ = lat_tile_ * lat_tile_ * r_tile_;
171 blocks_ = local_subdomains_ * lat_tiles_ * lat_tiles_ * r_tiles_;
173 update_kernel_path_flag_host_only();
175 util::logroot <<
"[DivergenceKerngen] tile=(" << lat_tile_ <<
"," << lat_tile_ <<
"," << r_tile_
176 <<
"), r_passes=" << r_passes_ <<
", team=" << team_size_ <<
", blocks=" << blocks_
177 <<
", path=" <<
path_name() << std::endl;
182 switch ( kernel_path_ )
186 default:
return "fast-dirichlet-neumann";
199 operator_apply_mode_ = operator_apply_mode;
200 operator_communication_mode_ = operator_communication_mode;
219 Kokkos::TeamPolicy<> policy( blocks_, team_size_ );
222 policy.set_scratch_size( 0, Kokkos::PerTeam(
team_shmem_size( team_size_ ) ) );
227 Kokkos::parallel_for(
228 "divergence_apply_kernel_slow", policy, KOKKOS_CLASS_LAMBDA(
const Team& team ) {
229 this->run_team_slow( team );
234 Kokkos::parallel_for(
235 "divergence_apply_kernel_fast_fs", policy, KOKKOS_CLASS_LAMBDA(
const Team& team ) {
236 this->
template run_team_fast<
true >( team );
241 Kokkos::TeamPolicy< Kokkos::LaunchBounds< 128, 5 > > dn_policy( blocks_, team_size_ );
242 dn_policy.set_scratch_size( 0, Kokkos::PerTeam(
team_shmem_size( team_size_ ) ) );
243 Kokkos::parallel_for(
244 "divergence_apply_kernel_fast_dn", dn_policy, KOKKOS_CLASS_LAMBDA(
const Team& team ) {
245 this->
template run_team_fast<
false >( team );
262 KOKKOS_INLINE_FUNCTION
265 const int nlev = r_tile_block_ + 1;
266 const int n = lat_tile_ + 1;
267 const int nxy = n * n;
269 const size_t nscalars = size_t( nxy ) * 3 + size_t( nxy ) * 3 * nlev + size_t( nlev );
274 KOKKOS_INLINE_FUNCTION
275 void decode_team_indices(
277 int& local_subdomain_id,
288 int tmp = team.league_rank();
289 const int r_tile_id = tmp % r_tiles_;
291 const int lat_y_id = tmp % lat_tiles_;
293 const int lat_x_id = tmp % lat_tiles_;
295 local_subdomain_id = tmp;
297 x0 = lat_x_id * lat_tile_;
298 y0 = lat_y_id * lat_tile_;
299 r0 = r_tile_id * r_tile_block_;
301 const int tid = team.team_rank();
303 tx = ( tid / r_tile_ ) % lat_tile_;
304 ty = tid / ( r_tile_ * lat_tile_ );
311 KOKKOS_INLINE_FUNCTION
321 KOKKOS_INLINE_FUNCTION
322 void run_team_slow(
const Team& team )
const
324 int local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell;
325 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
330 for (
int pass = 0; pass < r_passes_; ++pass )
332 const int r_cell_pass = r0 + pass * r_tile_ + tr;
333 if ( r_cell_pass >= hex_rad_ )
335 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ )
338 process_cell_from_global( local_subdomain_id, x_cell, y_cell, r_cell_pass );
344 KOKKOS_INLINE_FUNCTION
345 void process_cell_from_global(
int s,
int x_cell,
int y_cell,
int r_cell )
const
350 const ScalarT r_1 = radii_( s, r_cell );
351 const ScalarT r_2 = radii_( s, r_cell + 1 );
359 for (
int d = 0; d < 3; d++ )
365 src_loc[w]( d * 6 + i ) = src_d[w]( i );
368 eval_cell( s, x_cell, y_cell, r_cell, r_1, r_2, wedge_phy_surf, quad_points, quad_weights, src_loc );
373 KOKKOS_INLINE_FUNCTION
387 const int fine_radial_wedge_index =
r_cell % 2;
391 for (
int q = 0; q < num_quad_points; q++ )
396 const auto J =
jac( wedge_phy_surf[wedge], r_1, r_2, quad_points[q] );
397 const auto det = Kokkos::abs(
J.det() );
398 const auto J_inv_transposed =
J.inv().transposed();
403 shape_coarse( i, fine_radial_wedge_index, fine_lateral_wedge_index, quad_points[q] );
407 const auto grad_j =
grad_shape( j, quad_points[q] );
408 for (
int d = 0; d < 3; d++ )
410 A[wedge]( i, d * 6 + j ) +=
411 quad_weights[q] * ( -( J_inv_transposed * grad_j )( d ) * shape_i * det );
418 bool at_cmb =
util::has_flag( boundary_mask_fine_( s, x_cell, y_cell, r_cell ), CMB );
419 bool at_surface =
util::has_flag( boundary_mask_fine_( s, x_cell, y_cell, r_cell + 1 ), SURFACE );
424 src_loc[w] = src_in[w];
426 dense::Mat< ScalarT, 6, 18 > boundary_mask;
427 boundary_mask.fill( 1.0 );
429 if ( at_cmb || at_surface )
434 if ( bcf == DIRICHLET )
436 for (
int dimj = 0; dimj < 3; ++dimj )
439 if ( ( at_cmb && j < 3 ) || ( at_surface && j >= 3 ) )
442 else if ( bcf == FREESLIP )
446 for (
int wedge = 0; wedge < 2; ++wedge )
449 for (
int dimj = 0; dimj < 3; ++dimj )
451 A_tmp[wedge]( node_idxi, node_idxj * 3 + dimj ) =
456 constexpr int layer_hex_offset_x[2][3] = { { 0, 1, 0 }, { 1, 0, 1 } };
457 constexpr int layer_hex_offset_y[2][3] = { { 0, 0, 1 }, { 1, 1, 0 } };
460 for (
int wedge = 0; wedge < 2; ++wedge )
462 for (
int i = 0; i < 18; ++i )
463 R[wedge]( i, i ) = 1.0;
465 for (
int bn = 0; bn < 3; ++bn )
469 x_cell + layer_hex_offset_x[wedge][bn],
470 y_cell + layer_hex_offset_y[wedge][bn],
471 r_cell + ( at_cmb ? 0 : 1 ),
476 const int offset_in_R = at_cmb ? 0 : 9;
477 for (
int dimi = 0; dimi < 3; ++dimi )
478 for (
int dimj = 0; dimj < 3; ++dimj )
479 R[wedge]( offset_in_R + bn * 3 + dimi, offset_in_R + bn * 3 + dimj ) = R_i( dimi, dimj );
482 A[wedge] = A_tmp[wedge] * R[wedge].transposed();
484 auto src_tmp = R[wedge] * src_loc[wedge];
485 for (
int i = 0; i < 18; ++i )
486 src_loc[wedge]( i ) = src_tmp( i );
488 const int node_start = at_surface ? 3 : 0;
489 const int node_end = at_surface ? 6 : 3;
490 for (
int node_idx = node_start; node_idx < node_end; ++node_idx )
492 const int idx = node_idx * 3;
493 for (
int k = 0; k < 6; ++k )
494 boundary_mask( k, idx ) = 0.0;
498 else if ( bcf == NEUMANN )
505 A[wedge].hadamard_product( boundary_mask );
508 dst_loc[0] = A[0] * src_loc[0];
509 dst_loc[1] = A[1] * src_loc[1];
512 dst_, s, x_cell / 2, y_cell / 2, r_cell / 2, dst_loc );
519 template <
bool Freeslip >
520 KOKKOS_INLINE_FUNCTION
521 void run_team_fast(
const Team& team )
const
524 decode_team_indices( team, local_subdomain_id, x0, y0, r0, tx, ty, tr, x_cell, y_cell, r_cell );
526 const int nlev = r_tile_block_ + 1;
527 const int n = lat_tile_ + 1;
528 const int nxy = n * n;
531 reinterpret_cast< double*
>( team.team_shmem().get_shmem(
team_shmem_size( team.team_size() ) ) );
533 using ScratchCoords =
534 Kokkos::View< double**, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
536 Kokkos::View< double***, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
538 Kokkos::View< double*, Kokkos::LayoutRight, typename Team::scratch_memory_space, Kokkos::MemoryUnmanaged >;
540 ScratchCoords coords_sh( shmem, nxy, 3 );
542 ScratchSrc src_sh( shmem, nxy, 3, nlev );
543 shmem += nxy * 3 * nlev;
544 ScratchR r_sh( shmem, nlev );
546 auto node_id = [&](
int nx,
int ny ) ->
int {
return nx + n * ny; };
548 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nxy ), [&](
int id ) {
549 const int dxn =
id % n;
550 const int dyn =
id / n;
551 const int xi = x0 + dxn;
552 const int yi = y0 + dyn;
553 if ( xi <= hex_lat_ && yi <= hex_lat_ )
555 coords_sh(
id, 0 ) = grid_fine_( local_subdomain_id, xi, yi, 0 );
556 coords_sh(
id, 1 ) = grid_fine_( local_subdomain_id, xi, yi, 1 );
557 coords_sh(
id, 2 ) = grid_fine_( local_subdomain_id, xi, yi, 2 );
561 coords_sh(
id, 0 ) = coords_sh(
id, 1 ) = coords_sh(
id, 2 ) = 0.0;
565 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, nlev ), [&](
int lvl ) {
566 const int rr = r0 + lvl;
567 r_sh( lvl ) = ( rr <= hex_rad_ ) ? radii_( local_subdomain_id, rr ) : 0.0;
570 const int total_src = nxy * nlev;
571 Kokkos::parallel_for( Kokkos::TeamThreadRange( team, total_src ), [&](
int t ) {
572 const int node = t / nlev;
573 const int lvl = t - node * nlev;
574 const int dxn = node % n;
575 const int dyn = node / n;
576 const int xi = x0 + dxn;
577 const int yi = y0 + dyn;
578 const int rr = r0 + lvl;
579 if ( xi <= hex_lat_ && yi <= hex_lat_ && rr <= hex_rad_ )
581 src_sh( node, 0, lvl ) = src_( local_subdomain_id, xi, yi, rr, 0 );
582 src_sh( node, 1, lvl ) = src_( local_subdomain_id, xi, yi, rr, 1 );
583 src_sh( node, 2, lvl ) = src_( local_subdomain_id, xi, yi, rr, 2 );
587 src_sh( node, 0, lvl ) = src_sh( node, 1, lvl ) = src_sh( node, 2, lvl ) = 0.0;
595 if ( x_cell >= hex_lat_ || y_cell >= hex_lat_ )
616 const ScalarT
qw = quad_weights[0];
620 constexpr int WEDGE_NODE_OFF[2][6][3] = {
621 { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 1, 0, 1 }, { 0, 1, 1 } },
622 { { 1, 1, 0 }, { 0, 1, 0 }, { 1, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 1, 0, 1 } } };
626 const int n00 = node_id( tx, ty );
627 const int n01 = node_id( tx, ty + 1 );
628 const int n10 = node_id( tx + 1, ty );
629 const int n11 = node_id( tx + 1, ty + 1 );
631 for (
int pass = 0; pass < r_passes_; ++pass )
633 const int lvl0 = pass * r_tile_ + tr;
634 const int r_cell_abs = r0 + lvl0;
635 if ( r_cell_abs >= hex_rad_ )
638 const ScalarT r_1 = r_sh( lvl0 );
639 const ScalarT r_2 = r_sh( lvl0 + 1 );
641 const bool at_cmb = has_flag( local_subdomain_id, x_cell, y_cell, r_cell_abs, CMB );
642 const bool at_surface = has_flag( local_subdomain_id, x_cell, y_cell, r_cell_abs + 1, SURFACE );
645 bool treat_dirichlet =
false;
646 bool treat_freeslip =
false;
647 if ( at_cmb || at_surface )
652 if constexpr ( Freeslip )
653 treat_freeslip = ( bcf == FREESLIP );
657 const int boundary_ddr = at_cmb ? 0 : 1;
659 const int fine_rad_wedge = r_cell_abs % 2;
663 const int v0 = ( w == 0 ? n00 : n11 );
664 const int v1 = ( w == 0 ? n10 : n01 );
665 const int v2 = ( w == 0 ? n01 : n10 );
668 constexpr double ONE_THIRD = 1.0 / 3.0;
669 const double half_dr = 0.5 * ( r_2 - r_1 );
670 const double r_mid = 0.5 * ( r_1 + r_2 );
672 const double J_0_0 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v1, 0 ) );
673 const double J_0_1 = r_mid * ( -coords_sh( v0, 0 ) + coords_sh( v2, 0 ) );
674 const double J_0_2 = half_dr * ( ONE_THIRD * ( coords_sh( v0, 0 ) + coords_sh( v1, 0 ) + coords_sh( v2, 0 ) ) );
675 const double J_1_0 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v1, 1 ) );
676 const double J_1_1 = r_mid * ( -coords_sh( v0, 1 ) + coords_sh( v2, 1 ) );
677 const double J_1_2 = half_dr * ( ONE_THIRD * ( coords_sh( v0, 1 ) + coords_sh( v1, 1 ) + coords_sh( v2, 1 ) ) );
678 const double J_2_0 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v1, 2 ) );
679 const double J_2_1 = r_mid * ( -coords_sh( v0, 2 ) + coords_sh( v2, 2 ) );
680 const double J_2_2 = half_dr * ( ONE_THIRD * ( coords_sh( v0, 2 ) + coords_sh( v1, 2 ) + coords_sh( v2, 2 ) ) );
682 const double J_det = J_0_0 * J_1_1 * J_2_2 - J_0_0 * J_1_2 * J_2_1
683 - J_0_1 * J_1_0 * J_2_2 + J_0_1 * J_1_2 * J_2_0
684 + J_0_2 * J_1_0 * J_2_1 - J_0_2 * J_1_1 * J_2_0;
685 const double abs_det = Kokkos::abs( J_det );
686 const double inv_det = 1.0 /
J_det;
689 const double i00 = inv_det * ( J_1_1 * J_2_2 - J_1_2 * J_2_1 );
690 const double i01 = inv_det * ( -J_1_0 * J_2_2 + J_1_2 * J_2_0 );
691 const double i02 = inv_det * ( J_1_0 * J_2_1 - J_1_1 * J_2_0 );
692 const double i10 = inv_det * ( -J_0_1 * J_2_2 + J_0_2 * J_2_1 );
693 const double i11 = inv_det * ( J_0_0 * J_2_2 - J_0_2 * J_2_0 );
694 const double i12 = inv_det * ( -J_0_0 * J_2_1 + J_0_1 * J_2_0 );
695 const double i20 = inv_det * ( J_0_1 * J_1_2 - J_0_2 * J_1_1 );
696 const double i21 = inv_det * ( -J_0_0 * J_1_2 + J_0_2 * J_1_0 );
697 const double i22 = inv_det * ( J_0_0 * J_1_1 - J_0_1 * J_1_0 );
700 constexpr double ONE_SIXTH = 1.0 / 6.0;
701 static constexpr double dN_ref[6][3] = {
702 { -0.5, -0.5, -ONE_SIXTH },
703 { 0.5, 0.0, -ONE_SIXTH },
704 { 0.0, 0.5, -ONE_SIXTH },
705 { -0.5, -0.5, ONE_SIXTH },
706 { 0.5, 0.0, ONE_SIXTH },
707 { 0.0, 0.5, ONE_SIXTH } };
714 const int ddx = WEDGE_NODE_OFF[w][j][0];
715 const int ddy = WEDGE_NODE_OFF[w][j][1];
716 const int ddr = WEDGE_NODE_OFF[w][j][2];
718 const bool is_boundary_node = ( at_cmb || at_surface ) && ( ddr == boundary_ddr );
721 if ( treat_dirichlet && is_boundary_node )
724 const int nid = node_id( tx + ddx, ty + ddy );
725 const int lvl = lvl0 + ddr;
727 double u0 = src_sh( nid, 0, lvl );
728 double u1 = src_sh( nid, 1, lvl );
729 double u2 = src_sh( nid, 2, lvl );
733 if constexpr ( Freeslip )
735 if ( treat_freeslip && is_boundary_node )
737 const double nx = coords_sh( nid, 0 );
738 const double ny = coords_sh( nid, 1 );
739 const double nz = coords_sh( nid, 2 );
740 const double un = u0 * nx + u1 * ny + u2 * nz;
748 const double gx = dN_ref[j][0];
749 const double gy = dN_ref[j][1];
750 const double gz = dN_ref[j][2];
751 const double g0 = i00 * gx + i01 * gy + i02 * gz;
752 const double g1 = i10 * gx + i11 * gy + i12 * gz;
753 const double g2 = i20 * gx + i21 * gy + i22 * gz;
760 const double prefactor = -
qw * abs_det *
div_u;
762 const int xc =
x_cell / 2;
763 const int yc =
y_cell / 2;
764 const int rc = r_cell_abs / 2;
769 const double shape_i =
shape_coarse( i, fine_rad_wedge, fine_lat_wedge, quad_points[0] );
770 const int ddx = WEDGE_NODE_OFF[w][i][0];
771 const int ddy = WEDGE_NODE_OFF[w][i][1];
772 const int ddr = WEDGE_NODE_OFF[w][i][2];
773 Kokkos::atomic_add( &dst_( local_subdomain_id, xc + ddx, yc + ddy, rc + ddr ),
774 prefactor * shape_i );