Loading...
Searching...
No Matches
timer.hpp
Go to the documentation of this file.
1#pragma once
2
#include <Kokkos_Core.hpp>
#include <cstddef>
#include <iostream>
#include <map>
#include <memory>
#include <mpi.h>
#include <mutex>
#include <ranges>
#include <set>
#include <sstream>
#include <string>
#include <vector>
#ifdef TERRANEO_USE_NESMIK
#include <nesmik/nesmik.hpp>
#endif
16
17namespace terra::util {
18
/// @brief Node representing a timed region in the hierarchy.
///
/// A node stores the process-local accumulated time and call count of one
/// named region, owns its nested child regions (keyed and ordered by name) and
/// keeps a non-owning pointer to its parent. The aggregated statistics
/// (`root_time`, `sum_time`, ...) stay at zero until `TimerTree` fills them in.
///
/// @note See class `Timer` for actually running a timer.
class TimerNode
{
    std::string                                           name;              ///< Name of the timer region
    double                                                total_time{ 0.0 }; ///< Accumulated time (per rank)
    int                                                   count{ 0 };        ///< Number of times this node was timed
    std::map< std::string, std::shared_ptr< TimerNode > > children;          ///< Nested child timers
    TimerNode*                                            parent{ nullptr }; ///< Parent node pointer (non-owning)

    // Aggregated statistics across MPI ranks
    double root_time{ 0.0 }, sum_time{ 0.0 }, min_time{ 0.0 }, max_time{ 0.0 }, avg_time{ 0.0 };

    /// @brief Escape a string so it can be embedded in a JSON string literal.
    ///
    /// Quotes, backslashes and control characters would otherwise yield invalid JSON.
    static std::string escape_json( const std::string& raw )
    {
        static constexpr char hex_digits[] = "0123456789abcdef";

        std::string escaped;
        escaped.reserve( raw.size() );
        for ( const char c : raw )
        {
            const auto uc = static_cast< unsigned char >( c );
            switch ( c )
            {
            case '"':
                escaped += "\\\"";
                break;
            case '\\':
                escaped += "\\\\";
                break;
            case '\n':
                escaped += "\\n";
                break;
            case '\r':
                escaped += "\\r";
                break;
            case '\t':
                escaped += "\\t";
                break;
            default:
                if ( uc < 0x20 )
                {
                    // Remaining control characters must be written as \u00XX.
                    escaped += "\\u00";
                    escaped += hex_digits[( uc >> 4 ) & 0xF];
                    escaped += hex_digits[uc & 0xF];
                }
                else
                {
                    escaped += c;
                }
                break;
            }
        }
        return escaped;
    }

    /// @brief Serialize all children as comma-separated JSON objects, one per line.
    ///
    /// @param indent     Indentation (in spaces) handed to each child.
    /// @param aggregated If true, children are written via `to_agg_json()`, else via `to_json()`.
    std::string children_json( int indent, bool aggregated ) const
    {
        std::ostringstream oss;
        std::size_t        emitted = 0;
        for ( const auto& entry : children )
        {
            const auto& child = entry.second;
            oss << ( aggregated ? child->to_agg_json( indent ) : child->to_json( indent ) );
            // No trailing comma after the last element - JSON forbids it.
            if ( ++emitted < children.size() )
            {
                oss << ",";
            }
            oss << "\n";
        }
        return oss.str();
    }

  public:
    friend class TimerTree;

    /// @brief Constructor
    /// @param n Name of the timed region.
    /// @param p Parent node (non-owning), `nullptr` for a node without parent.
    TimerNode( const std::string& n, TimerNode* p = nullptr )
    : name( n )
    , parent( p )
    {}

    /// @brief Reset all timings and statistics of this node and drop all of its children.
    ///
    /// `name` and `parent` are left untouched.
    void clear_this_and_children()
    {
        total_time = 0.0;
        count      = 0;
        root_time  = 0.0;
        sum_time   = 0.0;
        min_time   = 0.0;
        max_time   = 0.0;
        avg_time   = 0.0;
        children.clear();
    }

    /// @brief Convert this node (and children) to JSON (per-rank)
    /// @param indent Number of spaces every emitted line is prefixed with.
    /// @return JSON object with `name`, `total_time`, `count` and the recursively serialized `children`.
    std::string to_json( int indent = 0 ) const
    {
        std::ostringstream oss;
        const std::string  pad( static_cast< std::size_t >( indent ), ' ' );
        oss << pad << "{\n";
        oss << pad << " \"name\": \"" << escape_json( name ) << "\",\n";
        oss << pad << " \"total_time\": " << total_time << ",\n";
        oss << pad << " \"count\": " << count << ",\n";
        oss << pad << " \"children\": [\n";
        oss << children_json( indent + 4, false );
        oss << pad << " ]\n" << pad << "}";
        return oss.str();
    }

    /// @brief Convert this node (and children) to JSON with MPI-aggregated statistics
    /// @param indent Number of spaces every emitted line is prefixed with.
    /// @return JSON object with `name`, the aggregated `*_time` statistics, the local `count` and the
    ///         recursively serialized `children`.
    std::string to_agg_json( int indent = 0 ) const
    {
        std::ostringstream oss;
        const std::string  pad( static_cast< std::size_t >( indent ), ' ' );
        oss << pad << "{\n";
        oss << pad << " \"name\": \"" << escape_json( name ) << "\",\n";
        oss << pad << " \"root_time\": " << root_time << ",\n";
        oss << pad << " \"sum_time\": " << sum_time << ",\n";
        oss << pad << " \"min_time\": " << min_time << ",\n";
        oss << pad << " \"avg_time\": " << avg_time << ",\n";
        oss << pad << " \"max_time\": " << max_time << ",\n";
        oss << pad << " \"count\": " << count << ",\n";
        oss << pad << " \"children\": [\n";
        oss << children_json( indent + 4, true );
        oss << pad << " ]\n" << pad << "}";
        return oss.str();
    }
};
108
109/// @brief Singleton tree managing all timer nodes per MPI rank
110///
111/// @note Use `Timer` class for the actually starting and stopping timers. Internally `Timer` objects will access a
112/// `TimerTree` singleton. So you can easily add timer calls without changing the API of your code.
113///
114/// Can be exported via json.
115///
116/// Example:
117/// @code
118/// auto tt = TimerTree::instance();
119///
120/// tt.aggregate_mpi();
121/// std::cout << tt.json() << std::endl;
122/// std::cout << tt.json_aggregate() << std::endl;
123/// tt.clear();
124/// @endcode
125///
126/// Example output for `json()`.
127/// Note that the root node will always be there carrying no timings.
128/// @code
129/// {
130/// "name": "root",
131/// "total_time": 0,
132/// "count": 0,
133/// "children": [
134/// {
135/// "name": "laplace_apply",
136/// "total_time": 0.356301,
137/// "count": 28,
138/// "children": [
139/// {
140/// "name": "laplace_comm",
141/// "total_time": 0.02748,
142/// "count": 28,
143/// "children": [
144/// ]
145/// },
146/// {
147/// "name": "laplace_kernel",
148/// "total_time": 0.327421,
149/// "count": 28,
150/// "children": [
151/// ]
152/// }
153/// ]
154/// }
155/// ]
156/// }
157/// @endcode
159{
160 TimerNode root{ "root" }; ///< Root node
161 TimerNode* current{ &root }; ///< Pointer to current active node
162 std::mutex mtx; ///< Mutex for thread safety
163
164 public:
165 /// @brief Access the singleton instance
167 {
168 static TimerTree tree;
169 return tree;
170 }
171
172 void clear()
173 {
174 std::lock_guard< std::mutex > lock( mtx );
176 current = &root;
177 }
178
179 /// @brief Enter a new timing scope
180 void enter_scope( const std::string& name )
181 {
182 std::lock_guard< std::mutex > lock( mtx );
183 if ( !current->children.contains( name ) )
184 {
185 current->children[name] = std::make_shared< TimerNode >( name, current );
186 }
187 current = current->children[name].get();
188 }
189
190 /// @brief Exit the current timing scope and record elapsed time
191 void exit_scope( double elapsed )
192 {
193 std::lock_guard< std::mutex > lock( mtx );
194 current->total_time += elapsed;
195 current->count += 1;
196 if ( current->parent )
197 {
198 current = current->parent;
199 }
200 }
201
202 /// @brief Per-rank json tree.
203 ///
204 /// Returns a definitely non-reduced timer tree in json format.
205 /// This means that this returns the process-local timings depending on the process that calls this method.
206 std::string json() { return root.to_json(); }
207
208 /// @brief MPI-reduced / aggregate json.
209 ///
210 /// Returns the timings after reduction over all processes.
211 /// You need to call aggregate_mpi() before this for reasonable results.
212 ///
213 /// This method does not need to be called collectively.
214 std::string json_aggregate() { return root.to_agg_json(); }
215
216 /// @brief Aggregate timings across all MPI ranks
217 ///
218 /// Must be called collectively.
219 void aggregate_mpi() { aggregate_node( &root, MPI_COMM_WORLD ); }
220
221 private:
222 /// @brief Recursively aggregate a node's timings across MPI ranks.
223 ///
224 /// Uses a union-of-children walk: each rank broadcasts its local child
225 /// names, all ranks agree on the union (in the same deterministic order),
226 /// and then walk that union. For a child that a given rank hasn't seen
227 /// locally, we contribute a zero timing so the collective remains
228 /// well-formed on every rank. This is required for agglomerated multigrid,
229 /// where different ranks legitimately have different sub-trees (e.g.
230 /// ranks not on the coarse sub-comm never record mg_coarse_solve).
231 void aggregate_node( TimerNode* node, MPI_Comm comm )
232 {
233 double local_time = node->total_time;
234 double root_time, min_time, max_time, sum_time;
235
236 root_time = local_time;
237 MPI_Bcast( &root_time, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD );
238 MPI_Allreduce( &local_time, &min_time, 1, MPI_DOUBLE, MPI_MIN, comm );
239 MPI_Allreduce( &local_time, &max_time, 1, MPI_DOUBLE, MPI_MAX, comm );
240 MPI_Allreduce( &local_time, &sum_time, 1, MPI_DOUBLE, MPI_SUM, comm );
241
242 int size;
243 MPI_Comm_size( comm, &size );
244 node->root_time = root_time;
245 node->sum_time = sum_time;
246 node->min_time = min_time;
247 node->max_time = max_time;
248 node->avg_time = sum_time / size;
249
250 // Union of child names across all ranks on `comm`.
251 const auto union_children = gather_union_child_names( node, comm );
252
253 // Walk the union in sorted (= deterministic) order. For children the
254 // local rank hasn't seen, create a zero-timing stub so recursion stays
255 // in lockstep and the output tree contains every timer.
256 for ( const auto& name : union_children )
257 {
258 auto it = node->children.find( name );
259 if ( it == node->children.end() )
260 {
261 node->children[name] = std::make_shared< TimerNode >( name, node );
262 it = node->children.find( name );
263 }
264 aggregate_node( it->second.get(), comm );
265 }
266 }
267
268 /// @brief Gather the set-union of child keys of `node` across all ranks on `comm`.
269 ///
270 /// Each rank packs its own children names into a length-prefixed byte buffer,
271 /// MPI_Allgatherv concatenates them, and every rank reconstructs the same
272 /// sorted set.
273 std::set< std::string > gather_union_child_names( const TimerNode* node, MPI_Comm comm )
274 {
275 int rank = 0, size = 0;
276 MPI_Comm_rank( comm, &rank );
277 MPI_Comm_size( comm, &size );
278
279 // Pack local names as "len:name|len:name|..." — simpler than
280 // variable-stride Alltoallv and plenty fast for the ~few-hundred-node
281 // timer trees we emit.
282 std::string local;
283 local.reserve( 256 );
284 for ( const auto& name : node->children | std::ranges::views::keys )
285 {
286 local.append( std::to_string( name.size() ) );
287 local.push_back( ':' );
288 local.append( name );
289 local.push_back( '|' );
290 }
291
292 int local_bytes = static_cast< int >( local.size() );
293 std::vector< int > counts( size ), displs( size );
294 MPI_Allgather( &local_bytes, 1, MPI_INT, counts.data(), 1, MPI_INT, comm );
295 int total = 0;
296 for ( int r = 0; r < size; ++r ) { displs[r] = total; total += counts[r]; }
297
298 std::string all( total, '\0' );
299 MPI_Allgatherv( local.data(), local_bytes, MPI_CHAR,
300 all.data(), counts.data(), displs.data(), MPI_CHAR, comm );
301
302 std::set< std::string > names;
303 size_t i = 0;
304 while ( i < all.size() )
305 {
306 size_t colon = all.find( ':', i );
307 if ( colon == std::string::npos ) break;
308 const int nlen = std::stoi( all.substr( i, colon - i ) );
309 names.insert( all.substr( colon + 1, nlen ) );
310 i = colon + 1 + nlen + 1; // skip name + trailing '|'
311 }
312 return names;
313 }
314};
315
316/// @brief Timer supporting RAII scope or manual stop.
317///
318/// Starts timer on construction.
319///
320/// Automatically adds timing to `TimerTree`'s singleton instance.
321/// See `TimerTree` for details on how to export the timings.
322///
323/// Example usage: scoped
324/// @code
325/// {
326/// Timer t("compute"); // scoped timer - starts here
327/// // do computation
328/// } // timer ends here - writes result to TimerTree::instance()
329/// @endcode
330///
331/// Example usage: stop explicitly
332/// @code
333/// {
334/// Timer t("compute"); // scoped timer - starts here
335/// // do computation
336/// t.stop() // timer ends here - writes result to TimerTree::instance()
337/// // do something that is not included in timing
338/// }
339/// @endcode
340///
341class Timer
342{
343 std::string name; ///< Timer name
344 Kokkos::Timer timer; ///< Underlying Kokkos timer
345 bool running{ false }; ///< Is timer currently running
346
347 public:
348 /// @brief Constructor - starts the timer
349 /// @param n Timer name
350 explicit Timer( const std::string& n )
351 : name( n )
352 {
354 #ifdef TERRANEO_USE_NESMIK
355 nesmik::region_start( name );
356 #endif
357 timer.reset();
358 running = true;
359 }
360
361 /// @brief Stop the timer and record elapsed time.
362 ///
363 /// Can be safely called twice - does not do anything on second call.
364 void stop()
365 {
366 if ( running )
367 {
368 double elapsed = timer.seconds();
369 TimerTree::instance().exit_scope( elapsed );
370 #ifdef TERRANEO_USE_NESMIK
371 nesmik::region_stop( name );
372 #endif
373 running = false;
374 }
375 }
376
377 /// @brief Destructor stops timer if still running.
378 ///
379 /// Can be used instead of stopping manually.
381 {
382 if ( running )
383 {
384 stop();
385 }
386 }
387};
388
389} // namespace terra::util
Node representing a timed region in the hierarchy.
Definition timer.hpp:23
TimerNode(const std::string &n, TimerNode *p=nullptr)
Constructor.
Definition timer.hpp:37
void clear_this_and_children()
Definition timer.hpp:42
std::string to_agg_json(int indent=0) const
Convert this node (and children) to JSON with MPI-aggregated statistics.
Definition timer.hpp:80
std::string to_json(int indent=0) const
Convert this node (and children) to JSON (per-rank)
Definition timer.hpp:55
Singleton tree managing all timer nodes per MPI rank.
Definition timer.hpp:159
void clear()
Definition timer.hpp:172
void exit_scope(double elapsed)
Exit the current timing scope and record elapsed time.
Definition timer.hpp:191
std::string json_aggregate()
MPI-reduced / aggregate json.
Definition timer.hpp:214
std::string json()
Per-rank json tree.
Definition timer.hpp:206
void aggregate_mpi()
Aggregate timings across all MPI ranks.
Definition timer.hpp:219
static TimerTree & instance()
Access the singleton instance.
Definition timer.hpp:166
void enter_scope(const std::string &name)
Enter a new timing scope.
Definition timer.hpp:180
Timer supporting RAII scope or manual stop.
Definition timer.hpp:342
~Timer()
Destructor stops timer if still running.
Definition timer.hpp:380
void stop()
Stop the timer and record elapsed time.
Definition timer.hpp:364
Timer(const std::string &n)
Constructor - starts the timer.
Definition timer.hpp:350
int r
Definition EpsilonDivDiv_kernel_gen.py:345
MPIRank rank()
Definition mpi.hpp:13
Definition solver.hpp:9