Implement backbone of overload statistics.

- Implement overload statistics structure.
- Implement function that keeps track of overload statistics.
- Implement function that writes overload statistics to descriptor.
- Unittest for the whole logic.
This commit is contained in:
George Kadianakis 2021-03-09 15:36:40 +02:00
parent 80b6054bb0
commit 0a5ecb3342
7 changed files with 259 additions and 0 deletions

6
changes/bug40222 Normal file
View File

@ -0,0 +1,6 @@
o Major features (metrics):
- Introduce a new class of extra-info statistics that allows relays to
expose details of how overloaded they are. This information is controlled
using the OverloadStatistics torrc option, and it will be used to make
more informed decisions about the network's load balancing. Implements
proposal 328; closes ticket 40222.

View File

@ -352,6 +352,7 @@ static const config_var_t option_vars_[] = {
V(CacheDirectoryGroupReadable, AUTOBOOL, "auto"), V(CacheDirectoryGroupReadable, AUTOBOOL, "auto"),
V(CellStatistics, BOOL, "0"), V(CellStatistics, BOOL, "0"),
V(PaddingStatistics, BOOL, "1"), V(PaddingStatistics, BOOL, "1"),
V(OverloadStatistics, BOOL, "1"),
V(LearnCircuitBuildTimeout, BOOL, "1"), V(LearnCircuitBuildTimeout, BOOL, "1"),
V(CircuitBuildTimeout, INTERVAL, "0"), V(CircuitBuildTimeout, INTERVAL, "0"),
OBSOLETE("CircuitIdleTimeout"), OBSOLETE("CircuitIdleTimeout"),

View File

@ -674,6 +674,9 @@ struct or_options_t {
/** If true, include statistics file contents in extra-info documents. */ /** If true, include statistics file contents in extra-info documents. */
int ExtraInfoStatistics; int ExtraInfoStatistics;
/** If true, include overload statistics in extra-info documents. */
int OverloadStatistics;
/** If true, do not believe anybody who tells us that a domain resolves /** If true, do not believe anybody who tells us that a domain resolves
* to an internal address, or that an internal address has a PTR mapping. * to an internal address, or that an internal address has a PTR mapping.
* Helps avoid some cross-site attacks. */ * Helps avoid some cross-site attacks. */

View File

@ -3372,6 +3372,12 @@ extrainfo_dump_to_string_stats_helper(smartlist_t *chunks,
if (contents) if (contents)
smartlist_add(chunks, contents); smartlist_add(chunks, contents);
} }
if (options->OverloadStatistics) {
contents = rep_hist_get_overload_stats_lines();
if (contents) {
smartlist_add(chunks, contents);
}
}
/* bridge statistics */ /* bridge statistics */
if (should_record_bridge_info(options)) { if (should_record_bridge_info(options)) {
const char *bridge_stats = geoip_get_bridge_stats_extrainfo(now); const char *bridge_stats = geoip_get_bridge_stats_extrainfo(now);

View File

@ -183,6 +183,122 @@ static time_t started_tracking_stability = 0;
/** Map from hex OR identity digest to or_history_t. */ /** Map from hex OR identity digest to or_history_t. */
static digestmap_t *history_map = NULL; static digestmap_t *history_map = NULL;
/** Represents a state of overload stats.
*
* All the timestamps in this structure have already been rounded down to the
* nearest hour. */
typedef struct {
/* When did we last experience a general overload? */
time_t overload_general_time;
/* When did we last experience a bandwidth-related overload? */
time_t overload_ratelimits_time;
/* How many times have we gone off the our read limits? */
uint64_t overload_read_count;
/* How many times have we gone off the our write limits? */
uint64_t overload_write_count;
/* When did we last experience a file descriptor exhaustion? */
time_t overload_fd_exhausted_time;
/* How many times have we experienced a file descriptor exhaustion? */
uint64_t overload_fd_exhausted;
} overload_stats_t;
/** Current state of overload stats */
static overload_stats_t overload_stats;
/** Return true if this overload happened within the last `n_hours`. */
static bool
overload_happened_recently(time_t overload_time, unsigned n_hours)
{
/* An overload is relevant if it happened in the last 72 hours */
if (overload_time > approx_time() - 3600 * n_hours) {
return true;
}
return false;
}
/* The current version of the overload stats version */
#define OVERLOAD_STATS_VERSION 1
/** Returns an allocated string for extra-info documents for publishing
* overload statistics. */
char *
rep_hist_get_overload_stats_lines(void)
{
char *result = NULL;
smartlist_t *chunks = smartlist_new();
char tbuf[ISO_TIME_LEN+1];
/* First encode the general overload */
if (overload_happened_recently(overload_stats.overload_general_time, 72)) {
format_iso_time(tbuf, overload_stats.overload_general_time);
smartlist_add_asprintf(chunks, "overload-general %d %s",
OVERLOAD_STATS_VERSION, tbuf);
}
/* Now do bandwidth-related overloads */
if (overload_happened_recently(overload_stats.overload_ratelimits_time,24)) {
const or_options_t *options = get_options();
format_iso_time(tbuf, overload_stats.overload_ratelimits_time);
smartlist_add_asprintf(chunks,
"overload-ratelimits %d %s %" PRIu64 " %" PRIu64
" %" PRIu64 " %" PRIu64,
OVERLOAD_STATS_VERSION, tbuf,
options->BandwidthRate, options->BandwidthBurst,
overload_stats.overload_read_count,
overload_stats.overload_write_count);
}
/* Finally file descriptor overloads */
if (overload_happened_recently(
overload_stats.overload_fd_exhausted_time, 72)) {
format_iso_time(tbuf, overload_stats.overload_fd_exhausted_time);
smartlist_add_asprintf(chunks, "overload-fd-exhausted %d %s",
OVERLOAD_STATS_VERSION, tbuf);
}
/* Bail early if we had nothing to write */
if (smartlist_len(chunks) == 0) {
goto done;
}
result = smartlist_join_strings(chunks, "\n", 0, NULL);
done:
SMARTLIST_FOREACH(chunks, char *, cp, tor_free(cp));
smartlist_free(chunks);
return result;
}
/** Round down the time in `a` to the beginning of the current hour */
#define SET_TO_START_OF_HOUR(a) STMT_BEGIN \
(a) = approx_time() - (approx_time() % 3600); \
STMT_END
/** Note down an overload event of type `overload`. */
void
rep_hist_note_overload(overload_type_t overload)
{
switch (overload) {
case OVERLOAD_GENERAL:
SET_TO_START_OF_HOUR(overload_stats.overload_general_time);
break;
case OVERLOAD_READ:
SET_TO_START_OF_HOUR(overload_stats.overload_ratelimits_time);
overload_stats.overload_read_count++;
break;
case OVERLOAD_WRITE:
SET_TO_START_OF_HOUR(overload_stats.overload_ratelimits_time);
overload_stats.overload_write_count++;
break;
case OVERLOAD_FD_EXHAUSTED:
SET_TO_START_OF_HOUR(overload_stats.overload_fd_exhausted_time);
overload_stats.overload_fd_exhausted++;
break;
}
}
/** Return the or_history_t for the OR with identity digest <b>id</b>, /** Return the or_history_t for the OR with identity digest <b>id</b>,
* creating it if necessary. */ * creating it if necessary. */
static or_history_t * static or_history_t *

View File

@ -140,6 +140,24 @@ void rep_hist_reset_padding_counts(void);
void rep_hist_prep_published_padding_counts(time_t now); void rep_hist_prep_published_padding_counts(time_t now);
void rep_hist_padding_count_timers(uint64_t num_timers); void rep_hist_padding_count_timers(uint64_t num_timers);
/**
* Represents the various types of overload we keep track of and expose in our
* extra-info descriptor.
*/
typedef enum {
/* A general overload -- can have many different causes. */
OVERLOAD_GENERAL,
/* We went over our configured read rate/burst bandwidth limit */
OVERLOAD_READ,
/* We went over our configured write rate/burst bandwidth limit */
OVERLOAD_WRITE,
/* We exhausted the file descriptors in this system */
OVERLOAD_FD_EXHAUSTED,
} overload_type_t;
void rep_hist_note_overload(overload_type_t overload);
char *rep_hist_get_overload_stats_lines(void);
#ifdef TOR_UNIT_TESTS #ifdef TOR_UNIT_TESTS
struct hs_v2_stats_t; struct hs_v2_stats_t;
const struct hs_v2_stats_t *rep_hist_get_hs_v2_stats(void); const struct hs_v2_stats_t *rep_hist_get_hs_v2_stats(void);

View File

@ -703,6 +703,114 @@ test_load_stats_file(void *arg)
tor_free(content); tor_free(content);
} }
/** Test the overload stats logic. */
static void
test_overload_stats(void *arg)
{
time_t current_time = 1010101010;
char *stats_str = NULL;
(void) arg;
/* Change time to 03-01-2002 23:36 UTC */
/* This should make the extrainfo timestamp be "2002-01-03 23:00:00" */
update_approx_time(current_time);
/* With an empty rephist we shouldn't get anything back */
stats_str = rep_hist_get_overload_stats_lines();
tt_assert(!stats_str);
/* Note a DNS overload */
rep_hist_note_overload(OVERLOAD_GENERAL);
/* Move the time forward one hour */
current_time += 3600;
update_approx_time(current_time);
/* Now check the string */
stats_str = rep_hist_get_overload_stats_lines();
tt_str_op("overload-general 1 2002-01-03 23:00:00", OP_EQ, stats_str);
tor_free(stats_str);
/* Move the time forward 72 hours: see that the line has disappeared. */
current_time += 3600*72;
update_approx_time(current_time);
stats_str = rep_hist_get_overload_stats_lines();
tt_assert(!stats_str);
/* Now the time should be 2002-01-07 00:00:00 */
/* Note a DNS overload */
rep_hist_note_overload(OVERLOAD_GENERAL);
stats_str = rep_hist_get_overload_stats_lines();
tt_str_op("overload-general 1 2002-01-07 00:00:00", OP_EQ, stats_str);
tor_free(stats_str);
/* Also note an fd exhaustion event */
rep_hist_note_overload(OVERLOAD_FD_EXHAUSTED);
stats_str = rep_hist_get_overload_stats_lines();
tt_str_op("overload-general 1 2002-01-07 00:00:00\n"
"overload-fd-exhausted 1 2002-01-07 00:00:00", OP_EQ, stats_str);
tor_free(stats_str);
/* Move the time forward. Register DNS overload. See that the time changed */
current_time += 3600*2;
update_approx_time(current_time);
rep_hist_note_overload(OVERLOAD_GENERAL);
stats_str = rep_hist_get_overload_stats_lines();
tt_str_op("overload-general 1 2002-01-07 02:00:00\n"
"overload-fd-exhausted 1 2002-01-07 00:00:00", OP_EQ, stats_str);
tor_free(stats_str);
/* Move the time forward. Register a bandwidth ratelimit event. See that the
string is added */
current_time += 3600*2;
update_approx_time(current_time);
/* Register the rate limit event */
rep_hist_note_overload(OVERLOAD_READ);
/* Also set some rate limiting values that should be reflected on the log */
get_options_mutable()->BandwidthRate = 1000;
get_options_mutable()->BandwidthBurst = 2000;
stats_str = rep_hist_get_overload_stats_lines();
tt_str_op("overload-general 1 2002-01-07 02:00:00\n"
"overload-ratelimits 1 2002-01-07 04:00:00 1000 2000 1 0\n"
"overload-fd-exhausted 1 2002-01-07 00:00:00", OP_EQ, stats_str);
tor_free(stats_str);
/* Move the time forward 24 hours: no rate limit line anymore. */
current_time += 3600*24;
update_approx_time(current_time);
stats_str = rep_hist_get_overload_stats_lines();
tt_str_op("overload-general 1 2002-01-07 02:00:00\n"
"overload-fd-exhausted 1 2002-01-07 00:00:00", OP_EQ, stats_str);
tor_free(stats_str);
/* Move the time forward 44 hours: no fd exhausted line anymore. */
current_time += 3600*44;
update_approx_time(current_time);
stats_str = rep_hist_get_overload_stats_lines();
tt_str_op("overload-general 1 2002-01-07 02:00:00", OP_EQ, stats_str);
tor_free(stats_str);
/* Move the time forward 2 hours: there is nothing left. */
current_time += 3600*2;
update_approx_time(current_time);
stats_str = rep_hist_get_overload_stats_lines();
tt_assert(!stats_str);
done:
tor_free(stats_str);
}
#define ENT(name) \ #define ENT(name) \
{ #name, test_ ## name , 0, NULL, NULL } { #name, test_ ## name , 0, NULL, NULL }
#define FORK(name) \ #define FORK(name) \
@ -718,6 +826,7 @@ struct testcase_t stats_tests[] = {
FORK(get_bandwidth_lines), FORK(get_bandwidth_lines),
FORK(rephist_v3_onions), FORK(rephist_v3_onions),
FORK(load_stats_file), FORK(load_stats_file),
FORK(overload_stats),
END_OF_TESTCASES END_OF_TESTCASES
}; };