From de907893befdb2bf6bb0b95b183cc1f01b8a464c Mon Sep 17 00:00:00 2001 From: David Goulet Date: Tue, 19 Oct 2021 14:13:33 -0400 Subject: [PATCH] relay: Overload state on DNS timeout is now X% over Y secs With this commit, we will only report a general overload state if we've seen more than X% of DNS timeout errors over Y seconds. Previous behavior was to report when a single timeout occured which is really too small of a threshold. The value X is a consensus parameters called "overload_dns_timeout_scale_percent" which is a scaled percentage (factor of 1000) so we can represent decimal points for X like 0.5% for instance. Its default is 1000 which ends up being 1%. The value Y is a consensus parameters called "overload_dns_timeout_period_secs" which is the time period for which will gather DNS errors and once over, we assess if that X% has been reached ultimately triggering a general overload signal. Closes #40491 Signed-off-by: David Goulet --- src/feature/nodelist/networkstatus.c | 2 + src/feature/relay/dns.c | 14 +-- src/feature/stats/rephist.c | 133 +++++++++++++++++++++++++++ src/feature/stats/rephist.h | 3 + 4 files changed, 142 insertions(+), 10 deletions(-) diff --git a/src/feature/nodelist/networkstatus.c b/src/feature/nodelist/networkstatus.c index 2ffa6da1a3..af808a6ba7 100644 --- a/src/feature/nodelist/networkstatus.c +++ b/src/feature/nodelist/networkstatus.c @@ -103,6 +103,7 @@ #include "feature/dirauth/vote_microdesc_hash_st.h" #include "feature/nodelist/vote_routerstatus_st.h" #include "feature/nodelist/routerstatus_st.h" +#include "feature/stats/rephist.h" #ifdef HAVE_UNISTD_H #include @@ -1663,6 +1664,7 @@ notify_before_networkstatus_changes(const networkstatus_t *old_c, dos_consensus_has_changed(new_c); relay_consensus_has_changed(new_c); hs_dos_consensus_has_changed(new_c); + rep_hist_consensus_has_changed(new_c); } /* Called after a new consensus has been put in the global state. It is safe diff --git a/src/feature/relay/dns.c b/src/feature/relay/dns.c index 22f929808e..ed8d235e92 100644 --- a/src/feature/relay/dns.c +++ b/src/feature/relay/dns.c @@ -1548,16 +1548,6 @@ evdns_callback(int result, char type, int count, int ttl, void *addresses, tor_addr_make_unspec(&addr); - /* Note down any DNS errors to the statistics module */ - if (result == DNS_ERR_TIMEOUT) { - /* libevent timed out while resolving a name. However, because libevent - * handles retries and timeouts internally, this means that all attempts of - * libevent timed out. If we wanted to get more granular information about - * individual libevent attempts, we would have to implement our own DNS - * timeout/retry logic */ - rep_hist_note_overload(OVERLOAD_GENERAL); - } - /* Keep track of whether IPv6 is working */ if (type == DNS_IPv6_AAAA) { if (result == DNS_ERR_TIMEOUT) { @@ -1659,6 +1649,10 @@ evdns_callback(int result, char type, int count, int ttl, void *addresses, dns_found_answer(string_address, orig_query_type, result, &addr, hostname, ttl); + /* The result can be changed within this function thus why we note the result + * at the end. */ + rep_hist_note_dns_query(type, result); + tor_free(arg_); } diff --git a/src/feature/stats/rephist.c b/src/feature/stats/rephist.c index e25c01331d..c3a281a8c2 100644 --- a/src/feature/stats/rephist.c +++ b/src/feature/stats/rephist.c @@ -84,6 +84,8 @@ #include "feature/nodelist/networkstatus_st.h" #include "core/or/or_circuit_st.h" +#include + #ifdef HAVE_FCNTL_H #include #endif @@ -204,6 +206,54 @@ typedef struct { uint64_t overload_fd_exhausted; } overload_stats_t; +/***** DNS statistics *****/ + +/** Represents the statistics of DNS queries seen if it is an Exit. */ +typedef struct { + /** Total number of DNS request seen at an Exit. They might not all end + * successfully or might even be lost by tor. This counter is incremented + * right before the DNS request is initiated. */ + uint64_t stats_n_request; + + /** Total number of DNS timeout errors. */ + uint64_t stats_n_error_timeout; + + /** When is the next assessment time of the general overload for DNS errors. + * Once this time is reached, all stats are reset and this time is set to the + * next assessment time. */ + time_t next_assessment_time; +} overload_dns_stats_t; + +/** Keep track of the DNS requests for the general overload state. */ +static overload_dns_stats_t overload_dns_stats; + +/* We use a scale here so we can represent percentages with decimal points by + * scaling the value by this factor and so 0.5% becomes a value of 500. + * Default is 1% and thus min and max range is 0 to 100%. */ +#define OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE 1000.0 +#define OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT 1000 +#define OVERLOAD_DNS_TIMEOUT_PERCENT_MIN 0 +#define OVERLOAD_DNS_TIMEOUT_PERCENT_MAX 100000 + +/** Consensus parameter: indicate what fraction of DNS timeout errors over the + * total number of DNS requests must be reached before we trigger a general + * overload signal .*/ +static double overload_dns_timeout_fraction = + OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT / + OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE / 100.0; + +/* Number of seconds for the assessment period. Default is 10 minutes (600) and + * the min max range is within a 32bit value. */ +#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT (10 * 60) +#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MIN 0 +#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX INT32_MAX + +/** Consensus parameter: Period, in seconds, over which we count the number of + * DNS requests and timeout errors. After that period, we assess if we trigger + * an overload or not. */ +static int32_t overload_dns_timeout_period_secs = + OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT; + /** Current state of overload stats */ static overload_stats_t overload_stats; @@ -218,6 +268,89 @@ overload_happened_recently(time_t overload_time, int n_hours) return false; } +/** Assess the DNS timeout errors and if we have enough to trigger a general + * overload. */ +static void +overload_general_dns_assessment(void) +{ + /* Initialize the time. Should be done once. */ + if (overload_dns_stats.next_assessment_time == 0) { + goto reset; + } + + /* Not the time yet. */ + if (overload_dns_stats.next_assessment_time > approx_time()) { + return; + } + + /* Lets see if we can signal a general overload. */ + double fraction = (double) overload_dns_stats.stats_n_error_timeout / + (double) overload_dns_stats.stats_n_request; + if (fraction >= overload_dns_timeout_fraction) { + log_notice(LD_HIST, "General overload -> DNS timeouts (%" PRIu64 ") " + "fraction %.4f%% is above threshold of %.4f%%", + overload_dns_stats.stats_n_error_timeout, + fraction * 100.0, + overload_dns_timeout_fraction * 100.0); + rep_hist_note_overload(OVERLOAD_GENERAL); + } + + reset: + /* Reset counters for the next period. */ + overload_dns_stats.stats_n_error_timeout = 0; + overload_dns_stats.stats_n_request = 0; + overload_dns_stats.next_assessment_time = + approx_time() + overload_dns_timeout_period_secs; +} + +/** Called just before the consensus will be replaced. Update the consensus + * parameters in case they changed. */ +void +rep_hist_consensus_has_changed(const networkstatus_t *ns) +{ + overload_dns_timeout_fraction = + networkstatus_get_param(ns, "overload_dns_timeout_scale_percent", + OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT, + OVERLOAD_DNS_TIMEOUT_PERCENT_MIN, + OVERLOAD_DNS_TIMEOUT_PERCENT_MAX) / + OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE / 100.0; + + overload_dns_timeout_period_secs = + networkstatus_get_param(ns, "overload_dns_timeout_period_secs", + OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT, + OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MIN, + OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX); +} + +/** Note a DNS error for the given given libevent DNS record type and error + * code. Possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA. + * + * IMPORTANT: Libevent is _not_ returning the type in case of an error and so + * if error is anything but DNS_ERR_NONE, the type is not usable and set to 0. + * + * See: https://gitlab.torproject.org/tpo/core/tor/-/issues/40490 */ +void +rep_hist_note_dns_query(int type, uint8_t error) +{ + (void) type; + + /* Assess if we need to trigger a general overload with regards to the DNS + * errors or not. */ + overload_general_dns_assessment(); + + /* We only care about timeouts for the moment. */ + switch (error) { + case DNS_ERR_TIMEOUT: + overload_dns_stats.stats_n_error_timeout++; + break; + default: + break; + } + + /* Increment total number of requests. */ + overload_dns_stats.stats_n_request++; +} + /* The current version of the overload stats version */ #define OVERLOAD_STATS_VERSION 1 diff --git a/src/feature/stats/rephist.h b/src/feature/stats/rephist.h index d4a2f301cf..749b4996a8 100644 --- a/src/feature/stats/rephist.h +++ b/src/feature/stats/rephist.h @@ -72,11 +72,14 @@ void rep_hist_seen_new_rp_cell(bool is_v2); char *rep_hist_get_hs_v3_stats_string(void); void rep_hist_hsdir_stored_maybe_new_v3_onion(const uint8_t *blinded_key); +void rep_hist_note_dns_query(int type, uint8_t error); + void rep_hist_free_all(void); void rep_hist_note_negotiated_link_proto(unsigned link_proto, int started_here); void rep_hist_log_link_protocol_counts(void); +void rep_hist_consensus_has_changed(const networkstatus_t *ns); extern uint64_t rephist_total_alloc; extern uint32_t rephist_total_num;