diff --git a/changes/bug40222 b/changes/bug40222 new file mode 100644 index 0000000000..8338ea4a7b --- /dev/null +++ b/changes/bug40222 @@ -0,0 +1,6 @@ + o Major features (metrics): + - Introduce a new class of extra-info statistics that allows relays to + expose details of how overloaded they are. This information is controlled + using the OverloadStatistics torrc option, and it will be used to make + more informed decisions about the network's load balancing. Implements + proposal 328; closes ticket 40222. diff --git a/src/app/config/config.c b/src/app/config/config.c index 1ac460bac0..156e90a549 100644 --- a/src/app/config/config.c +++ b/src/app/config/config.c @@ -352,6 +352,7 @@ static const config_var_t option_vars_[] = { V(CacheDirectoryGroupReadable, AUTOBOOL, "auto"), V(CellStatistics, BOOL, "0"), V(PaddingStatistics, BOOL, "1"), + V(OverloadStatistics, BOOL, "1"), V(LearnCircuitBuildTimeout, BOOL, "1"), V(CircuitBuildTimeout, INTERVAL, "0"), OBSOLETE("CircuitIdleTimeout"), diff --git a/src/app/config/or_options_st.h b/src/app/config/or_options_st.h index 90302eae7b..689adbc71b 100644 --- a/src/app/config/or_options_st.h +++ b/src/app/config/or_options_st.h @@ -674,6 +674,9 @@ struct or_options_t { /** If true, include statistics file contents in extra-info documents. */ int ExtraInfoStatistics; + /** If true, include overload statistics in extra-info documents. */ + int OverloadStatistics; + /** If true, do not believe anybody who tells us that a domain resolves * to an internal address, or that an internal address has a PTR mapping. * Helps avoid some cross-site attacks. */ diff --git a/src/feature/relay/router.c b/src/feature/relay/router.c index 73e8393761..88160bd1cb 100644 --- a/src/feature/relay/router.c +++ b/src/feature/relay/router.c @@ -3372,6 +3372,12 @@ extrainfo_dump_to_string_stats_helper(smartlist_t *chunks, if (contents) smartlist_add(chunks, contents); } + if (options->OverloadStatistics) { + contents = rep_hist_get_overload_stats_lines(); + if (contents) { + smartlist_add(chunks, contents); + } + } /* bridge statistics */ if (should_record_bridge_info(options)) { const char *bridge_stats = geoip_get_bridge_stats_extrainfo(now); diff --git a/src/feature/stats/rephist.c b/src/feature/stats/rephist.c index f7c9336bff..64bec000df 100644 --- a/src/feature/stats/rephist.c +++ b/src/feature/stats/rephist.c @@ -183,6 +183,122 @@ static time_t started_tracking_stability = 0; /** Map from hex OR identity digest to or_history_t. */ static digestmap_t *history_map = NULL; +/** Represents a state of overload stats. + * + * All the timestamps in this structure have already been rounded down to the + * nearest hour. */ +typedef struct { + /* When did we last experience a general overload? */ + time_t overload_general_time; + + /* When did we last experience a bandwidth-related overload? */ + time_t overload_ratelimits_time; + /* How many times have we gone off the our read limits? */ + uint64_t overload_read_count; + /* How many times have we gone off the our write limits? */ + uint64_t overload_write_count; + + /* When did we last experience a file descriptor exhaustion? */ + time_t overload_fd_exhausted_time; + /* How many times have we experienced a file descriptor exhaustion? */ + uint64_t overload_fd_exhausted; +} overload_stats_t; + +/** Current state of overload stats */ +static overload_stats_t overload_stats; + +/** Return true if this overload happened within the last `n_hours`. */ +static bool +overload_happened_recently(time_t overload_time, unsigned n_hours) +{ + /* An overload is relevant if it happened in the last 72 hours */ + if (overload_time > approx_time() - 3600 * n_hours) { + return true; + } + return false; +} + +/* The current version of the overload stats version */ +#define OVERLOAD_STATS_VERSION 1 + +/** Returns an allocated string for extra-info documents for publishing + * overload statistics. */ +char * +rep_hist_get_overload_stats_lines(void) +{ + char *result = NULL; + smartlist_t *chunks = smartlist_new(); + char tbuf[ISO_TIME_LEN+1]; + + /* First encode the general overload */ + if (overload_happened_recently(overload_stats.overload_general_time, 72)) { + format_iso_time(tbuf, overload_stats.overload_general_time); + smartlist_add_asprintf(chunks, "overload-general %d %s", + OVERLOAD_STATS_VERSION, tbuf); + } + + /* Now do bandwidth-related overloads */ + if (overload_happened_recently(overload_stats.overload_ratelimits_time,24)) { + const or_options_t *options = get_options(); + format_iso_time(tbuf, overload_stats.overload_ratelimits_time); + smartlist_add_asprintf(chunks, + "overload-ratelimits %d %s %" PRIu64 " %" PRIu64 + " %" PRIu64 " %" PRIu64, + OVERLOAD_STATS_VERSION, tbuf, + options->BandwidthRate, options->BandwidthBurst, + overload_stats.overload_read_count, + overload_stats.overload_write_count); + } + + /* Finally file descriptor overloads */ + if (overload_happened_recently( + overload_stats.overload_fd_exhausted_time, 72)) { + format_iso_time(tbuf, overload_stats.overload_fd_exhausted_time); + smartlist_add_asprintf(chunks, "overload-fd-exhausted %d %s", + OVERLOAD_STATS_VERSION, tbuf); + } + + /* Bail early if we had nothing to write */ + if (smartlist_len(chunks) == 0) { + goto done; + } + + result = smartlist_join_strings(chunks, "\n", 0, NULL); + + done: + SMARTLIST_FOREACH(chunks, char *, cp, tor_free(cp)); + smartlist_free(chunks); + return result; +} + +/** Round down the time in `a` to the beginning of the current hour */ +#define SET_TO_START_OF_HOUR(a) STMT_BEGIN \ + (a) = approx_time() - (approx_time() % 3600); \ +STMT_END + +/** Note down an overload event of type `overload`. */ +void +rep_hist_note_overload(overload_type_t overload) +{ + switch (overload) { + case OVERLOAD_GENERAL: + SET_TO_START_OF_HOUR(overload_stats.overload_general_time); + break; + case OVERLOAD_READ: + SET_TO_START_OF_HOUR(overload_stats.overload_ratelimits_time); + overload_stats.overload_read_count++; + break; + case OVERLOAD_WRITE: + SET_TO_START_OF_HOUR(overload_stats.overload_ratelimits_time); + overload_stats.overload_write_count++; + break; + case OVERLOAD_FD_EXHAUSTED: + SET_TO_START_OF_HOUR(overload_stats.overload_fd_exhausted_time); + overload_stats.overload_fd_exhausted++; + break; + } +} + /** Return the or_history_t for the OR with identity digest id, * creating it if necessary. */ static or_history_t * diff --git a/src/feature/stats/rephist.h b/src/feature/stats/rephist.h index 7ecbfebfd7..45020d7967 100644 --- a/src/feature/stats/rephist.h +++ b/src/feature/stats/rephist.h @@ -140,6 +140,24 @@ void rep_hist_reset_padding_counts(void); void rep_hist_prep_published_padding_counts(time_t now); void rep_hist_padding_count_timers(uint64_t num_timers); +/** + * Represents the various types of overload we keep track of and expose in our + * extra-info descriptor. +*/ +typedef enum { + /* A general overload -- can have many different causes. */ + OVERLOAD_GENERAL, + /* We went over our configured read rate/burst bandwidth limit */ + OVERLOAD_READ, + /* We went over our configured write rate/burst bandwidth limit */ + OVERLOAD_WRITE, + /* We exhausted the file descriptors in this system */ + OVERLOAD_FD_EXHAUSTED, +} overload_type_t; + +void rep_hist_note_overload(overload_type_t overload); +char *rep_hist_get_overload_stats_lines(void); + #ifdef TOR_UNIT_TESTS struct hs_v2_stats_t; const struct hs_v2_stats_t *rep_hist_get_hs_v2_stats(void); diff --git a/src/test/test_stats.c b/src/test/test_stats.c index 617a36faba..1f14255e31 100644 --- a/src/test/test_stats.c +++ b/src/test/test_stats.c @@ -703,6 +703,114 @@ test_load_stats_file(void *arg) tor_free(content); } +/** Test the overload stats logic. */ +static void +test_overload_stats(void *arg) +{ + time_t current_time = 1010101010; + char *stats_str = NULL; + (void) arg; + + /* Change time to 03-01-2002 23:36 UTC */ + /* This should make the extrainfo timestamp be "2002-01-03 23:00:00" */ + update_approx_time(current_time); + + /* With an empty rephist we shouldn't get anything back */ + stats_str = rep_hist_get_overload_stats_lines(); + tt_assert(!stats_str); + + /* Note a DNS overload */ + rep_hist_note_overload(OVERLOAD_GENERAL); + + /* Move the time forward one hour */ + current_time += 3600; + update_approx_time(current_time); + + /* Now check the string */ + stats_str = rep_hist_get_overload_stats_lines(); + tt_str_op("overload-general 1 2002-01-03 23:00:00", OP_EQ, stats_str); + tor_free(stats_str); + + /* Move the time forward 72 hours: see that the line has disappeared. */ + current_time += 3600*72; + update_approx_time(current_time); + + stats_str = rep_hist_get_overload_stats_lines(); + tt_assert(!stats_str); + + /* Now the time should be 2002-01-07 00:00:00 */ + + /* Note a DNS overload */ + rep_hist_note_overload(OVERLOAD_GENERAL); + + stats_str = rep_hist_get_overload_stats_lines(); + tt_str_op("overload-general 1 2002-01-07 00:00:00", OP_EQ, stats_str); + tor_free(stats_str); + + /* Also note an fd exhaustion event */ + rep_hist_note_overload(OVERLOAD_FD_EXHAUSTED); + + stats_str = rep_hist_get_overload_stats_lines(); + tt_str_op("overload-general 1 2002-01-07 00:00:00\n" + "overload-fd-exhausted 1 2002-01-07 00:00:00", OP_EQ, stats_str); + tor_free(stats_str); + + /* Move the time forward. Register DNS overload. See that the time changed */ + current_time += 3600*2; + update_approx_time(current_time); + + rep_hist_note_overload(OVERLOAD_GENERAL); + + stats_str = rep_hist_get_overload_stats_lines(); + tt_str_op("overload-general 1 2002-01-07 02:00:00\n" + "overload-fd-exhausted 1 2002-01-07 00:00:00", OP_EQ, stats_str); + tor_free(stats_str); + + /* Move the time forward. Register a bandwidth ratelimit event. See that the + string is added */ + current_time += 3600*2; + update_approx_time(current_time); + + /* Register the rate limit event */ + rep_hist_note_overload(OVERLOAD_READ); + /* Also set some rate limiting values that should be reflected on the log */ + get_options_mutable()->BandwidthRate = 1000; + get_options_mutable()->BandwidthBurst = 2000; + + stats_str = rep_hist_get_overload_stats_lines(); + tt_str_op("overload-general 1 2002-01-07 02:00:00\n" + "overload-ratelimits 1 2002-01-07 04:00:00 1000 2000 1 0\n" + "overload-fd-exhausted 1 2002-01-07 00:00:00", OP_EQ, stats_str); + tor_free(stats_str); + + /* Move the time forward 24 hours: no rate limit line anymore. */ + current_time += 3600*24; + update_approx_time(current_time); + + stats_str = rep_hist_get_overload_stats_lines(); + tt_str_op("overload-general 1 2002-01-07 02:00:00\n" + "overload-fd-exhausted 1 2002-01-07 00:00:00", OP_EQ, stats_str); + tor_free(stats_str); + + /* Move the time forward 44 hours: no fd exhausted line anymore. */ + current_time += 3600*44; + update_approx_time(current_time); + + stats_str = rep_hist_get_overload_stats_lines(); + tt_str_op("overload-general 1 2002-01-07 02:00:00", OP_EQ, stats_str); + tor_free(stats_str); + + /* Move the time forward 2 hours: there is nothing left. */ + current_time += 3600*2; + update_approx_time(current_time); + + stats_str = rep_hist_get_overload_stats_lines(); + tt_assert(!stats_str); + + done: + tor_free(stats_str); +} + #define ENT(name) \ { #name, test_ ## name , 0, NULL, NULL } #define FORK(name) \ @@ -718,6 +826,7 @@ struct testcase_t stats_tests[] = { FORK(get_bandwidth_lines), FORK(rephist_v3_onions), FORK(load_stats_file), + FORK(overload_stats), END_OF_TESTCASES };