Loading...
Searching...
No Matches
table.hpp
Go to the documentation of this file.
1#pragma once
2
#include <algorithm>
#include <cctype>
#include <cerrno>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iomanip>
#include <iostream>
#include <optional>
#include <set>
#include <sstream>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <variant>
#include <vector>
14
15#include "logging.hpp"
16#include "result.hpp"
17#include "terra/mpi/mpi.hpp"
18#include "timestamp.hpp"
19
20namespace terra::util {
21
22/// @brief Table class for storing and manipulating tabular data.
23///
24/// Rows are stored as maps from column names to values.
25/// Columns are dynamically added as needed.
26/// Think: each row is a dict-like type. Columns are essentially keys.
27///
28/// @note Each row automatically gets an "id" and "timestamp" column.
29///
30/// Provides functionality to add rows, select columns, query data, and print in various formats (pretty, JSON lines, CSV).
31///
32/// Not optimized for performance but designed for ease of use and flexibility.
33/// If you need high performance, or millions of rows, consider using a database or specialized library.
34/// But still useful for small to medium datasets, logging, prototyping, and data analysis tasks.
35///
36/// Supports various value types, including strings, numbers, booleans, and None (null).
37///
38/// @note For logging, the convention is that most functions use the key "tag" to add some keyword to the table. To
39/// later sort data, therefore add a "tag" to mark where the data comes from.
40///
41/// Example usage:
42/// @code
43///
44/// terra::util::Table table;
45///
46/// // Add rows with various data types.
47/// table.add_row({{"name", "Charlie"}, {"age", 28}, {"active", true}});
48/// table.add_row({{"name", "Alice"}, {"age", 30}, {"active", false}});
49/// table.add_row({{"name", "Bob"}, {"age", 25}, {"active", true}});
50///
51/// // Add rows with different columns to the same table.
52/// table.add_row({{"city", "Berlin"}, {"country", "Germany"}, {"population", 3769000}});
53///
54/// // Select specific columns
55/// auto selected_table = table.select_columns({ "name", "age" });
56///
57/// // Query rows where age is greater than 26
58/// auto queried_table = table.query_rows_where( "age", []( const auto& value ) {
59/// return std::holds_alternative<int>( value ) && std::get<int>( value ) > 26;
60/// });
61///
62/// // Print the table in different formats
63/// table.print_pretty();
64/// selected_table.print_jsonl();
65/// queried_table.print_csv();
66///
67/// // Print to file
68///
69/// // CSV
70/// std::ofstream file("output.csv");
71/// table.print_csv(file);
72/// file.close();
73///
74/// // JSON lines
75/// std::ofstream json_file("output.jsonl");
76/// table.print_jsonl(json_file);
77/// json_file.close();
78///
79/// // Clear the table
80/// table.clear();
81/// @endcode
82///
83class Table
84{
85 public:
86 /// @brief Max length of string values (required for safe reading of possibly non-null-terminated char arrays).
87 static constexpr int MAX_STRING_LENGTH = 10000;
88
89 private:
90 using ValueBase = std::variant<
91 std::monostate,
92 std::string,
93 char,
94 short,
95 int,
96 long,
97 long long,
98 unsigned char,
99 unsigned short,
100 unsigned int,
101 unsigned long,
102 unsigned long long,
103 float,
104 double,
105 bool >;
106
107 public:
108 /// @brief Type for table cell values.
109 struct Value : ValueBase
110 {
111 // Using a variant directly is annoying because we need to handle string literals via const char *.
112 // (That conversion btw seems to be compiler-dependent :) which makes it even more annoying.)
113 // It is possible but requires another special case throughout the accessors.
114 // So we simply always convert to std::string.
115
116 using ValueBase::ValueBase; // inherit constructors
117
118 Value( const char* arg )
119 : ValueBase( char_ptr_to_string_safe( arg ) )
120 {}
121 };
122
123 /// @brief Type for a table row (mapping column name to value).
124 using Row = std::unordered_map< std::string, Value >;
125
126 /// @brief Construct an empty table.
127 Table() = default;
128
129 /// @brief Get all rows in the table.
130 /// @return Vector of rows.
131 [[nodiscard]] const std::vector< Row >& rows() const { return rows_; }
132
133 /// @brief Get all column names in the table.
134 /// @return Set of column names.
135 [[nodiscard]] const std::set< std::string >& columns() const { return columns_; }
136
137 /// @brief Add a row to the table.
138 /// Adds "id" and "timestamp" columns automatically.
139 /// @param row_data Row data as a map from column name to value.
140 void add_row( const Row& row_data )
141 {
142 Row row;
143 row["id"] = global_id_counter++;
144 row["timestamp"] = current_timestamp();
145 columns_.insert( "id" );
146 columns_.insert( "timestamp" );
147
148 for ( const auto& [key, value] : row_data )
149 {
150 row[key] = value;
151 columns_.insert( key );
152 }
153
154 rows_.emplace_back( row );
155 }
156
157 /// @brief Select a subset of columns from the table.
158 /// @param selected_columns Columns to select.
159 /// @return New Table with only selected columns.
160 [[nodiscard]] Table select_columns( const std::vector< std::string >& selected_columns ) const
161 {
162 Table result;
163
164 for ( const auto& col : selected_columns )
165 {
166 result.columns_.insert( col );
167 }
168
169 for ( const auto& row : rows_ )
170 {
171 Row new_row;
172 for ( const auto& col : result.columns_ )
173 {
174 new_row[col] = get_value_from_row_or_none( row, col );
175 }
176 result.rows_.push_back( std::move( new_row ) );
177 }
178
179 return result;
180 }
181
182 /// @brief Query rows where a column is not None.
183 /// @param column Column name.
184 /// @return New Table with matching rows.
185 [[nodiscard]] Table query_rows_not_none( const std::string& column ) const
186 {
187 Table result;
188 result.columns_ = columns_;
189 for ( const auto& row : rows_ )
190 {
191 if ( auto it = row.find( column );
192 it != row.end() && !std::holds_alternative< std::monostate >( it->second ) )
193 {
194 result.rows_.push_back( row );
195 }
196 }
197 return result;
198 }
199
200 /// @brief Query rows where a column equals a value.
201 /// @param column Column name.
202 /// @param value Value to compare.
203 /// @return New Table with matching rows.
204 [[nodiscard]] Table query_rows_equals( const std::string& column, const Value& value ) const
205 {
206 Table result;
207 result.columns_ = columns_;
208 for ( const auto& row : rows_ )
209 {
210 auto it = row.find( column );
211 if ( it != row.end() && it->second == value )
212 {
213 result.rows_.push_back( row );
214 }
215 }
216 return result;
217 }
218
219 /// @brief Query rows where a column satisfies a predicate.
220 /// @param column Column name.
221 /// @param predicate Predicate function.
222 /// @return New Table with matching rows.
223 Table query_rows_where( const std::string& column, const std::function< bool( const Value& ) >& predicate ) const
224 {
225 Table result;
226 result.columns_ = columns_;
227 for ( const auto& row : rows_ )
228 {
229 auto it = row.find( column );
230 if ( it != row.end() && predicate( it->second ) )
231 {
232 result.rows_.push_back( row );
233 }
234 }
235 return result;
236 }
237
238 /// @brief Returns the values of a column in a vector.
239 ///
240 /// Casts arithmetic values (std::is_arithmetic_v) if the output type is also arithmetic. Skips the row otherwise.
241 /// Skips rows where there is no entry for that column.
242 ///
243 /// @param column Column name.
244 /// @return Vector filled with non-empty values of the column with the specified name.
245 template < typename RawType >
246 [[nodiscard]] std::vector< RawType > column_as_vector( const std::string& column ) const
247 {
248 std::vector< RawType > result;
249
250 for ( const auto& row : rows_ )
251 {
252 auto it = row.find( column );
253 if ( it == row.end() )
254 {
255 continue;
256 }
257
258 const auto& val = it->second;
259
260 std::visit(
261 [&]( auto&& v ) {
262 using V = std::decay_t< decltype( v ) >;
263
264 // Case 1 — Exact match
265 if constexpr ( std::is_same_v< V, RawType > )
266 {
267 result.push_back( v );
268 }
269 // Case 2 — allow numeric → numeric
270 else if constexpr ( std::is_arithmetic_v< RawType > && std::is_arithmetic_v< V > )
271 {
272 result.push_back( static_cast< RawType >( v ) );
273 }
274 // Case 3 — string (or anything else) → skip
275 else
276 {
277 // skip
278 }
279 },
280 val );
281 }
282
283 return result;
284 }
285
286 /// @brief Print the table in a pretty formatted style.
287 /// @param os Output stream (default util::logroot).
288 /// @return Reference to this table.
289 const Table& print_pretty( std::ostream& os = logroot ) const
290 {
291 if ( columns_.empty() || rows_.empty() )
292 {
293 os << "Empty table.\n";
294 return *this;
295 }
296
297 if ( mpi::rank() == 0 )
298 {
299 std::unordered_map< std::string, size_t > widths;
300 for ( const auto& col : columns_ )
301 {
302 widths[col] = col.size();
303 }
304
305 for ( const auto& row : rows_ )
306 {
307 for ( const auto& col : columns_ )
308 {
309 widths[col] =
310 std::max( widths[col], value_to_string( get_value_from_row_or_none( row, col ) ).size() );
311 }
312 }
313
314 auto sep = [&] {
315 for ( const auto& col : columns_ )
316 {
317 os << "+" << std::string( widths[col] + 2, '-' );
318 }
319 os << "+\n";
320 };
321
322 sep();
323 os << "|";
324 for ( const auto& col : columns_ )
325 {
326 os << " " << std::setw( static_cast< int >( widths[col] ) ) << std::right << col << " |";
327 }
328 os << "\n";
329 sep();
330
331 for ( const auto& row : rows_ )
332 {
333 os << "|";
334 for ( const auto& col : columns_ )
335 {
336 os << " " << std::setw( static_cast< int >( widths[col] ) ) << std::right
337 << value_to_string( get_value_from_row_or_none( row, col ) ) << " |";
338 }
339 os << "\n";
340 }
341 sep();
342 }
343
344 return *this;
345 }
346
347 /// @brief Print the table as JSON lines.
348 /// Each row is a JSON object, one per line.
349 ///
350 /// Example output:
351 /// {"id":1,"timestamp":"2024-06-10 12:34:56","name":"Alice","age":30}
352 /// {"id":2,"timestamp":"2024-06-10 12:34:57","name":"Bob","age":25}
353 ///
354 /// To parse with pandas:
355 /// import pandas as pd
356 /// df = pd.read_json("yourfile.jsonl", lines=True)
357 ///
358 /// @param os Output stream (default util::logroot).
359 /// @return Reference to this table.
360 const Table& print_jsonl( std::ostream& os = logroot ) const
361 {
362 if ( mpi::rank() == 0 )
363 {
364 for ( const auto& row : rows_ )
365 {
366 os << "{";
367 bool first = true;
368 for ( const auto& [key, val] : row )
369 {
370 if ( !first )
371 {
372 os << ",";
373 }
374 os << "\"" << key << "\":";
375 if ( std::holds_alternative< std::string >( val ) )
376 {
377 os << "\"" << value_to_string( val ) << "\"";
378 }
379 else if ( std::holds_alternative< std::monostate >( val ) )
380 {
381 os << "null";
382 }
383 else if ( std::holds_alternative< bool >( val ) )
384 {
385 os << ( std::get< bool >( val ) ? "true" : "false" );
386 }
387 else
388 {
389 os << value_to_string( val );
390 }
391 first = false;
392 }
393
394 os << "}\n";
395 }
396 }
397
398 return *this;
399 }
400
401 /// @brief Print the table as CSV.
402 /// @param os Output stream (default util::logroot).
403 /// @return Reference to this table.
404 const Table& print_csv( std::ostream& os = logroot ) const
405 {
406 if ( mpi::rank() == 0 )
407 {
408 print_header( os, "," );
409 for ( const auto& row : rows_ )
410 {
411 bool first = true;
412 for ( const auto& col : columns_ )
413 {
414 if ( !first )
415 os << ",";
416 os << value_to_string( get_value_from_row_or_none( row, col ) );
417 first = false;
418 }
419 os << "\n";
420 }
421 }
422
423 return *this;
424 }
425
426 /// @brief Clear all rows and columns from the table.
427 ///
428 /// This resets the table to an empty state - but does not reset the running id counter.
429 void clear()
430 {
431 rows_.clear();
432 columns_.clear();
433 }
434
435 /// @brief Convert a Value to a string for printing.
436 /// @param v Value to convert.
437 /// @return String representation.
438 static std::string value_to_string( const Value& v )
439 {
440 return std::visit(
441 []( const auto& val ) -> std::string {
442 using T = std::decay_t< decltype( val ) >;
443 if constexpr ( std::is_same_v< T, std::monostate > )
444 {
445 return "None";
446 }
447 else if constexpr ( std::is_same_v< T, std::string > )
448 {
449 return val;
450 }
451 else if constexpr ( std::is_same_v< T, bool > )
452 {
453 return val ? "true" : "false";
454 }
455 else if constexpr ( std::is_same_v< T, float > || std::is_same_v< T, double > )
456 {
457 std::ostringstream ss;
458 ss << std::scientific << std::setprecision( 3 ) << val;
459 return ss.str();
460 }
461 else
462 {
463 return std::to_string( val );
464 }
465 },
466 v );
467 }
468
469 /// @brief Get a value from a row or return None if missing.
470 /// @param row Row to search.
471 /// @param col Column name.
472 /// @return Value or None.
473 static Value get_value_from_row_or_none( const Row& row, const std::string& col )
474 {
475 auto it = row.find( col );
476 return ( it != row.end() ) ? it->second : std::monostate{};
477 }
478
479 private:
480 std::vector< Row > rows_; ///< Rows of the table.
481 std::set< std::string > columns_; ///< Set of column names.
482
483 inline static int global_id_counter = 0; ///< Global row id counter.
484
485 /// @brief Print the table header.
486 /// @param os Output stream.
487 /// @param sep Separator string.
488 void print_header( std::ostream& os, const std::string& sep ) const
489 {
490 bool first = true;
491 for ( const auto& col : columns_ )
492 {
493 if ( !first )
494 {
495 os << sep;
496 }
497
498 os << col;
499 first = false;
500 }
501 os << "\n";
502 }
503
504 /// @brief Safely converts const char * to string, even if not null terminated (constant max string length of MAX_STRING_LENGTH).
505 static std::string char_ptr_to_string_safe( const char* val )
506 {
507 if ( !val )
508 {
509 return std::string{};
510 }
511
512 std::size_t len = strnlen( val, MAX_STRING_LENGTH ); // find '\0' but stop at MAX_LEN
513 return { val, len };
514 }
515};
516
517/// @brief Attempts to read a csv file and converts that into a \ref Table instance.
518///
519/// Recognizes commas as a separator.
520///
521/// @param filename CSV file path
522/// @return Result object either containing a \ref Table on success or an error string.
523[[nodiscard]] inline Result< Table > read_table_from_csv( const std::string& filename )
524{
525 auto trim = []( const std::string& s ) {
526 size_t start = 0;
527 while ( start < s.size() && std::isspace( static_cast< unsigned char >( s[start] ) ) )
528 {
529 start++;
530 }
531 size_t end = s.size();
532 while ( end > start && std::isspace( static_cast< unsigned char >( s[end - 1] ) ) )
533 {
534 end--;
535 }
536 return s.substr( start, end - start );
537 };
538
539 auto parse_line = [trim]( const std::string& line ) {
540 std::vector< std::string > result;
541 std::string field;
542 bool inQuotes = false;
543
544 for ( size_t i = 0; i < line.size(); ++i )
545 {
546 if ( const char c = line[i]; c == '"' )
547 {
548 if ( inQuotes && i + 1 < line.size() && line[i + 1] == '"' )
549 {
550 field.push_back( '"' ); // escaped quote
551 ++i;
552 }
553 else
554 {
555 inQuotes = !inQuotes;
556 }
557 }
558 else if ( c == ',' && !inQuotes )
559 {
560 result.push_back( trim( field ) );
561 field.clear();
562 }
563 else
564 {
565 field.push_back( c );
566 }
567 }
568
569 result.push_back( trim( field ) );
570 return result;
571 };
572
573 auto infer_value = []( const std::string& s ) -> Table::Value {
574 if ( s == "true" || s == "TRUE" )
575 return true;
576 if ( s == "false" || s == "FALSE" )
577 return false;
578
579 char* end = nullptr;
580
581 // try int64
582 long long iv = std::strtoll( s.c_str(), &end, 10 );
583 if ( end && *end == '\0' )
584 {
585 return static_cast< int64_t >( iv );
586 }
587
588 // try double
589 double dv = std::strtod( s.c_str(), &end );
590 if ( end && *end == '\0' )
591 {
592 return dv;
593 }
594
595 // fallback to string
596 return s;
597 };
598
599 auto build_row = [infer_value](
600 const std::vector< std::string >& headers,
601 const std::vector< std::string >& fields ) -> Table::Row {
602 Table::Row row;
603 for ( size_t i = 0; i < headers.size() && i < fields.size(); ++i )
604 {
605 row[headers[i]] = infer_value( fields[i] );
606 }
607 return row;
608 };
609
610 std::ifstream file( filename );
611 if ( !file.is_open() )
612 {
613 return { "Could not open file: " + filename };
614 }
615
616 std::string line;
617 if ( !std::getline( file, line ) )
618 {
619 return { "Could not find a single line of content in file: " + filename };
620 }
621
622 std::vector< std::string > headers = parse_line( line );
623
624 Table table;
625
626 long line_number = 2;
627 while ( std::getline( file, line ) )
628 {
629 auto fields = parse_line( line );
630
631 if ( fields.size() != headers.size() )
632 {
633 return {
634 "Error parsing CSV file (line " + std::to_string( line_number ) +
635 "):\n"
636 "Number of columns in row does not match number of headers.\n"
637 "Note that also empty lines could be the cause of this (an empty line has one column with empty value).\n"
638 "Line:" +
639 line };
640 }
641
642 auto row = build_row( headers, fields );
643 table.add_row( row );
644
645 line_number++;
646 }
647
648 return table;
649}
650
651} // namespace terra::util
Definition result.hpp:13
Table class for storing and manipulating tabular data.
Definition table.hpp:84
static constexpr int MAX_STRING_LENGTH
Max length of string values (required for safe reading of possibly non-null-terminated char arrays).
Definition table.hpp:87
const Table & print_jsonl(std::ostream &os=logroot) const
Print the table as JSON lines. Each row is a JSON object, one per line.
Definition table.hpp:360
static std::string value_to_string(const Value &v)
Convert a Value to a string for printing.
Definition table.hpp:438
void clear()
Clear all rows and columns from the table.
Definition table.hpp:429
Table query_rows_not_none(const std::string &column) const
Query rows where a column is not None.
Definition table.hpp:185
static Value get_value_from_row_or_none(const Row &row, const std::string &col)
Get a value from a row or return None if missing.
Definition table.hpp:473
Table query_rows_where(const std::string &column, const std::function< bool(const Value &) > &predicate) const
Query rows where a column satisfies a predicate.
Definition table.hpp:223
Table()=default
Construct an empty table.
const Table & print_csv(std::ostream &os=logroot) const
Print the table as CSV.
Definition table.hpp:404
Table select_columns(const std::vector< std::string > &selected_columns) const
Select a subset of columns from the table.
Definition table.hpp:160
std::unordered_map< std::string, Value > Row
Type for a table row (mapping column name to value).
Definition table.hpp:124
Table query_rows_equals(const std::string &column, const Value &value) const
Query rows where a column equals a value.
Definition table.hpp:204
void add_row(const Row &row_data)
Add a row to the table. Adds "id" and "timestamp" columns automatically.
Definition table.hpp:140
const Table & print_pretty(std::ostream &os=logroot) const
Print the table in a pretty formatted style.
Definition table.hpp:289
const std::vector< Row > & rows() const
Get all rows in the table.
Definition table.hpp:131
const std::set< std::string > & columns() const
Get all column names in the table.
Definition table.hpp:135
std::vector< RawType > column_as_vector(const std::string &column) const
Returns the values of a column in a vector.
Definition table.hpp:246
MPIRank rank()
Definition mpi.hpp:10
Definition solver.hpp:9
Result< Table > read_table_from_csv(const std::string &filename)
Attempts to read a csv file and converts that into a Table instance.
Definition table.hpp:523
std::string current_timestamp()
Get the current timestamp as a string.
Definition timestamp.hpp:10
detail::PrefixCout logroot([]() { return detail::log_prefix();})
std::ostream subclass that just logs on root and adds a timestamp for each line.
Type for table cell values.
Definition table.hpp:110
Value(const char *arg)
Definition table.hpp:118