From cda7acb35d40c505dc4d2c3b55d611faab189477 Mon Sep 17 00:00:00 2001 From: David Goulet Date: Mon, 13 Dec 2021 10:22:29 -0500 Subject: [PATCH 1/5] relay: Don't make DNS timeout trigger an overload Tor has configure libevent to attempt up to 3 times a DNS query for a maximum of 5 seconds each. Once that 5 seconds has elapsed, it consider the query "Timed Out" but tor only gets a timeout if all 3 attempts have failed. For example, using Unbound, it has a much higher threshold of timeout. It is well defined in https://www.nlnetlabs.nl/documentation/unbound/info-timeout/ and has some complexity to it. But the gist is that if it times out, it will be much more than 5 seconds. And so the Tor DNS timeouts are more of a "UX issue" rather than a "network issue". For this reason, we are removing this metric from the overload general signal. See https://gitlab.torproject.org/tpo/network-health/team/-/issues/139 for more information. Fixes #40527 Signed-off-by: David Goulet --- changes/ticket40527 | 5 +++ src/feature/stats/rephist.c | 12 ------ src/test/test_stats.c | 82 ++----------------------------------- 3 files changed, 8 insertions(+), 91 deletions(-) create mode 100644 changes/ticket40527 diff --git a/changes/ticket40527 b/changes/ticket40527 new file mode 100644 index 0000000000..631b3d4bb9 --- /dev/null +++ b/changes/ticket40527 @@ -0,0 +1,5 @@ + o Major bugfixes (relay, overload): + - Don't make Tor DNS timeout trigger an overload general state. These + timeouts are different from DNS server timeout. They have to be seen as + timeout related to UX and not because of a network problem. Fixes bug + 40527; bugfix on 0.4.6.1-alpha. diff --git a/src/feature/stats/rephist.c b/src/feature/stats/rephist.c index c3a281a8c2..2bfa14d326 100644 --- a/src/feature/stats/rephist.c +++ b/src/feature/stats/rephist.c @@ -283,18 +283,6 @@ overload_general_dns_assessment(void) return; } - /* Lets see if we can signal a general overload. */ - double fraction = (double) overload_dns_stats.stats_n_error_timeout / - (double) overload_dns_stats.stats_n_request; - if (fraction >= overload_dns_timeout_fraction) { - log_notice(LD_HIST, "General overload -> DNS timeouts (%" PRIu64 ") " - "fraction %.4f%% is above threshold of %.4f%%", - overload_dns_stats.stats_n_error_timeout, - fraction * 100.0, - overload_dns_timeout_fraction * 100.0); - rep_hist_note_overload(OVERLOAD_GENERAL); - } - reset: /* Reset counters for the next period. */ overload_dns_stats.stats_n_error_timeout = 0; diff --git a/src/test/test_stats.c b/src/test/test_stats.c index 3b9a192c75..af035ae54c 100644 --- a/src/test/test_stats.c +++ b/src/test/test_stats.c @@ -721,7 +721,7 @@ test_overload_stats(void *arg) stats_str = rep_hist_get_overload_stats_lines(); tt_assert(!stats_str); - /* Note a DNS overload */ + /* Note a overload */ rep_hist_note_overload(OVERLOAD_GENERAL); /* Move the time forward one hour */ @@ -742,7 +742,7 @@ test_overload_stats(void *arg) /* Now the time should be 2002-01-07 00:00:00 */ - /* Note a DNS overload */ + /* Note a overload */ rep_hist_note_overload(OVERLOAD_GENERAL); stats_str = rep_hist_get_overload_general_line(); @@ -760,7 +760,7 @@ test_overload_stats(void *arg) tt_str_op("overload-fd-exhausted 1 2002-01-07 00:00:00\n", OP_EQ, stats_str); tor_free(stats_str); - /* Move the time forward. Register DNS overload. See that the time changed */ + /* Move the time forward. Register overload. See that the time changed */ current_time += 3600*2; update_approx_time(current_time); @@ -867,81 +867,6 @@ test_overload_stats(void *arg) tor_free(stats_str); } -/** Test the overload stats logic. */ -static void -test_overload_dns_timeout(void *arg) -{ - char *stats_str = NULL; - (void) arg; - - /* Lets simulate a series of timeouts but below our default 1% threshold. */ - - for (int i = 0; i < 1000; i++) { - /* This should trigger 9 timeouts which is just below 1% (10) */ - if (i > 0 && !(i % 100)) { - rep_hist_note_dns_query(0, DNS_ERR_TIMEOUT); - } else { - rep_hist_note_dns_query(0, DNS_ERR_NONE); - } - } - - /* No overload yet. */ - stats_str = rep_hist_get_overload_general_line(); - tt_assert(!stats_str); - - /* Move it 10 minutes in the future and see if we get a general overload. */ - update_approx_time(approx_time() + (10 * 60)); - - /* This query should NOT trigger the general overload because we are below - * our default of 1%. */ - rep_hist_note_dns_query(0, DNS_ERR_NONE); - stats_str = rep_hist_get_overload_general_line(); - tt_assert(!stats_str); - - /* We'll now go above our 1% threshold. */ - for (int i = 0; i < 1000; i++) { - /* This should trigger 10 timeouts which is our threshold of 1% (10) */ - if (!(i % 10)) { - rep_hist_note_dns_query(0, DNS_ERR_TIMEOUT); - } else { - rep_hist_note_dns_query(0, DNS_ERR_NONE); - } - } - - /* Move it 10 minutes in the future and see if we get a general overload. */ - update_approx_time(approx_time() + (10 * 60)); - - /* This query should trigger the general overload because we are above 1%. */ - rep_hist_note_dns_query(0, DNS_ERR_NONE); - stats_str = rep_hist_get_overload_general_line(); - tt_assert(stats_str); - tor_free(stats_str); - - /* Move 72h in the future, we should NOT get an overload anymore. */ - update_approx_time(approx_time() + (72 * 3600)); - - stats_str = rep_hist_get_overload_general_line(); - tt_assert(!stats_str); - - /* This query should NOT trigger the general overload. */ - rep_hist_note_dns_query(0, DNS_ERR_TIMEOUT); - stats_str = rep_hist_get_overload_general_line(); - tt_assert(!stats_str); - - /* Move it 10 minutes in the future and see if we get a general overload. We - * have now 100% of requests timing out. */ - update_approx_time(approx_time() + (10 * 60)); - - /* This query should trigger the general overload with 50% of timeouts. */ - rep_hist_note_dns_query(0, DNS_ERR_NONE); - stats_str = rep_hist_get_overload_general_line(); - tt_assert(stats_str); - tor_free(stats_str); - - done: - tor_free(stats_str); -} - #define ENT(name) \ { #name, test_ ## name , 0, NULL, NULL } #define FORK(name) \ @@ -958,7 +883,6 @@ struct testcase_t stats_tests[] = { FORK(rephist_v3_onions), FORK(load_stats_file), FORK(overload_stats), - FORK(overload_dns_timeout), END_OF_TESTCASES }; From bf1ed5c8534c515bf1ee39bfbf8c340d704b4490 Mon Sep 17 00:00:00 2001 From: David Goulet Date: Mon, 13 Dec 2021 10:44:24 -0500 Subject: [PATCH 2/5] relay: Change DNS timeout label on MetricsPort Change it from "timeout" to "tor_timeout" in order to indicate that the DNS timeout is one from tor's DNS threshold and not the DNS server itself. Fixes #40527 Signed-off-by: David Goulet --- changes/ticket40527 | 6 ++ src/feature/nodelist/networkstatus.c | 1 - src/feature/relay/relay_metrics.c | 2 +- src/feature/stats/rephist.c | 88 +--------------------------- src/feature/stats/rephist.h | 2 - 5 files changed, 10 insertions(+), 89 deletions(-) diff --git a/changes/ticket40527 b/changes/ticket40527 index 631b3d4bb9..25bf6c57e3 100644 --- a/changes/ticket40527 +++ b/changes/ticket40527 @@ -3,3 +3,9 @@ timeouts are different from DNS server timeout. They have to be seen as timeout related to UX and not because of a network problem. Fixes bug 40527; bugfix on 0.4.6.1-alpha. + - Change the MetricsPort DNS "timeout" label to be "tor_timeout" in order + to indicate that this was a DNS timeout from tor perspective and not the + DNS server itself. + - Deprecate overload_dns_timeout_period_secs and + overload_dns_timeout_scale_percent consensus parameters as well. They + were used to assess the overload state which is no more now. diff --git a/src/feature/nodelist/networkstatus.c b/src/feature/nodelist/networkstatus.c index d57db4c415..77e2b547f5 100644 --- a/src/feature/nodelist/networkstatus.c +++ b/src/feature/nodelist/networkstatus.c @@ -1666,7 +1666,6 @@ notify_before_networkstatus_changes(const networkstatus_t *old_c, dos_consensus_has_changed(new_c); relay_consensus_has_changed(new_c); hs_dos_consensus_has_changed(new_c); - rep_hist_consensus_has_changed(new_c); } /* Called after a new consensus has been put in the global state. It is safe diff --git a/src/feature/relay/relay_metrics.c b/src/feature/relay/relay_metrics.c index 0886b6ae87..fc8eb10d1b 100644 --- a/src/feature/relay/relay_metrics.c +++ b/src/feature/relay/relay_metrics.c @@ -161,7 +161,7 @@ fill_dns_error_values(void) { .name = "refused", .key = DNS_ERR_REFUSED }, { .name = "truncated", .key = DNS_ERR_TRUNCATED }, { .name = "unknown", .key = DNS_ERR_UNKNOWN }, - { .name = "timeout", .key = DNS_ERR_TIMEOUT }, + { .name = "tor_timeout", .key = DNS_ERR_TIMEOUT }, { .name = "shutdown", .key = DNS_ERR_SHUTDOWN }, { .name = "cancel", .key = DNS_ERR_CANCEL }, { .name = "nodata", .key = DNS_ERR_NODATA }, diff --git a/src/feature/stats/rephist.c b/src/feature/stats/rephist.c index 68c9349bc3..5ff4ef1d2e 100644 --- a/src/feature/stats/rephist.c +++ b/src/feature/stats/rephist.c @@ -219,33 +219,6 @@ static uint64_t stats_n_tcp_exhaustion = 0; /***** DNS statistics *****/ -/* We use a scale here so we can represent percentages with decimal points by - * scaling the value by this factor and so 0.5% becomes a value of 500. - * Default is 1% and thus min and max range is 0 to 100%. */ -#define OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE 1000.0 -#define OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT 1000 -#define OVERLOAD_DNS_TIMEOUT_PERCENT_MIN 0 -#define OVERLOAD_DNS_TIMEOUT_PERCENT_MAX 100000 - -/** Consensus parameter: indicate what fraction of DNS timeout errors over the - * total number of DNS requests must be reached before we trigger a general - * overload signal .*/ -static double overload_dns_timeout_fraction = - OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT / - OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE / 100.0; - -/* Number of seconds for the assessment period. Default is 10 minutes (600) and - * the min max range is within a 32bit value. */ -#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT (10 * 60) -#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MIN 0 -#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX INT32_MAX - -/** Consensus parameter: Period, in seconds, over which we count the number of - * DNS requests and timeout errors. After that period, we assess if we trigger - * an overload or not. */ -static int32_t overload_dns_timeout_period_secs = - OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT; - /** Overload DNS statistics. The information in this object is used to assess * if, due to DNS errors, we should emit a general overload signal or not. * @@ -260,9 +233,6 @@ typedef struct { * right before the DNS request is initiated. */ uint64_t stats_n_request; - /** Total number of DNS timeout errors. */ - uint64_t stats_n_error_timeout; - /** When is the next assessment time of the general overload for DNS errors. * Once this time is reached, all stats are reset and this time is set to the * next assessment time. */ @@ -285,7 +255,7 @@ typedef struct { /* Total number of DNS errors specific to libevent. */ uint64_t stats_n_error_truncated; /* 65 */ uint64_t stats_n_error_unknown; /* 66 */ - uint64_t stats_n_error_timeout; /* 67 */ + uint64_t stats_n_error_tor_timeout; /* 67 */ uint64_t stats_n_error_shutdown; /* 68 */ uint64_t stats_n_error_cancel; /* 69 */ uint64_t stats_n_error_nodata; /* 70 */ @@ -337,48 +307,6 @@ get_dns_stats_by_type(const int type) } #endif -/** Assess the DNS timeout errors and if we have enough to trigger a general - * overload. */ -static void -overload_general_dns_assessment(void) -{ - /* Initialize the time. Should be done once. */ - if (overload_dns_stats.next_assessment_time == 0) { - goto reset; - } - - /* Not the time yet. */ - if (overload_dns_stats.next_assessment_time > approx_time()) { - return; - } - - reset: - /* Reset counters for the next period. */ - overload_dns_stats.stats_n_error_timeout = 0; - overload_dns_stats.stats_n_request = 0; - overload_dns_stats.next_assessment_time = - approx_time() + overload_dns_timeout_period_secs; -} - -/** Called just before the consensus will be replaced. Update the consensus - * parameters in case they changed. */ -void -rep_hist_consensus_has_changed(const networkstatus_t *ns) -{ - overload_dns_timeout_fraction = - networkstatus_get_param(ns, "overload_dns_timeout_scale_percent", - OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT, - OVERLOAD_DNS_TIMEOUT_PERCENT_MIN, - OVERLOAD_DNS_TIMEOUT_PERCENT_MAX) / - OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE / 100.0; - - overload_dns_timeout_period_secs = - networkstatus_get_param(ns, "overload_dns_timeout_period_secs", - OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT, - OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MIN, - OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX); -} - /** Return the DNS error count for the given libevent DNS type and error code. * The possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA. */ uint64_t @@ -407,7 +335,7 @@ rep_hist_get_n_dns_error(int type, uint8_t error) case DNS_ERR_UNKNOWN: return dns_stats->stats_n_error_unknown; case DNS_ERR_TIMEOUT: - return dns_stats->stats_n_error_timeout; + return dns_stats->stats_n_error_tor_timeout; case DNS_ERR_SHUTDOWN: return dns_stats->stats_n_error_shutdown; case DNS_ERR_CANCEL: @@ -442,16 +370,6 @@ rep_hist_get_n_dns_request(int type) void rep_hist_note_dns_error(int type, uint8_t error) { - /* Assess if we need to trigger a general overload with regards to the DNS - * errors or not. */ - overload_general_dns_assessment(); - - /* Because of the libevent problem (see note in the function comment), we - * disregard the DNS query type and keep track of DNS timeout for the - * overload state. */ - if (error == DNS_ERR_TIMEOUT) { - overload_dns_stats.stats_n_error_timeout++; - } overload_dns_stats.stats_n_request++; /* Again, the libevent bug (see function comment), for an error that is @@ -490,7 +408,7 @@ rep_hist_note_dns_error(int type, uint8_t error) dns_stats->stats_n_error_unknown++; break; case DNS_ERR_TIMEOUT: - dns_stats->stats_n_error_timeout++; + dns_stats->stats_n_error_tor_timeout++; break; case DNS_ERR_SHUTDOWN: dns_stats->stats_n_error_shutdown++; diff --git a/src/feature/stats/rephist.h b/src/feature/stats/rephist.h index 6918ed18c2..7f414de4c8 100644 --- a/src/feature/stats/rephist.h +++ b/src/feature/stats/rephist.h @@ -89,8 +89,6 @@ uint64_t rep_hist_get_n_dns_request(int type); void rep_hist_note_dns_request(int type); void rep_hist_note_dns_error(int type, uint8_t error); -void rep_hist_consensus_has_changed(const networkstatus_t *ns); - extern uint64_t rephist_total_alloc; extern uint32_t rephist_total_num; #ifdef TOR_UNIT_TESTS From 122c60c323bf6467c4cd998bf19ce44426861cd8 Mon Sep 17 00:00:00 2001 From: David Goulet Date: Tue, 14 Dec 2021 12:14:39 -0500 Subject: [PATCH 3/5] fixup! relay: Change DNS timeout label on MetricsPort --- config.rust | 24 ------------------------ 1 file changed, 24 deletions(-) delete mode 100644 config.rust diff --git a/config.rust b/config.rust deleted file mode 100644 index 878a231cda..0000000000 --- a/config.rust +++ /dev/null @@ -1,24 +0,0 @@ -# Used by our cargo build.rs script to get variables from autoconf. -# -# The "configure" script will generate "config.rust" from "config.rust.in", -# and then build.rs will read "config.rust". - -BUILDDIR=/home/dgoulet/Documents/git/tor -TOR_LDFLAGS_zlib= -TOR_LDFLAGS_nss=@TOR_LDFLAGS_nss@ -TOR_LDFLAGS_openssl= -TOR_LDFLAGS_libevent= -TOR_ZLIB_LIBS=-lz -TOR_LIB_MATH=-lm -TOR_LIBEVENT_LIBS=-levent -TOR_OPENSSL_LIBS=-lssl -lcrypto -TOR_LIB_WS32= -TOR_LIB_GDI= -TOR_LIB_USERENV= -CURVE25519_LIBS= -TOR_SYSTEMD_LIBS=-lsystemd -TOR_LZMA_LIBS=-llzma -TOR_ZSTD_LIBS=-lzstd -LIBS=-lseccomp -lcap -LDFLAGS= -pie -z relro -z now -rdynamic -NSS_LIBS= From b37674fec75f14a0be9dbeff458355d91897cbe5 Mon Sep 17 00:00:00 2001 From: David Goulet Date: Tue, 14 Dec 2021 16:12:17 -0500 Subject: [PATCH 4/5] fixup! relay: Change DNS timeout label on MetricsPort Signed-off-by: David Goulet --- src/rust/.cargo/config | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 src/rust/.cargo/config diff --git a/src/rust/.cargo/config b/src/rust/.cargo/config deleted file mode 100644 index d91e26719d..0000000000 --- a/src/rust/.cargo/config +++ /dev/null @@ -1,12 +0,0 @@ -[source] - - [source.crates-io] - registry = 'https://github.com/rust-lang/crates.io-index' - replace-with = 'vendored-sources' - - [source.vendored-sources] - directory = '' - -[build] - rustflags = [ "-D", "warnings" ] - From eb06d52dae9cbb8344f543e1077ca6f0b62a317b Mon Sep 17 00:00:00 2001 From: David Goulet Date: Tue, 14 Dec 2021 16:13:00 -0500 Subject: [PATCH 5/5] fixup! relay: Change DNS timeout label on MetricsPort Signed-off-by: David Goulet --- src/test/test_stats.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/test_stats.c b/src/test/test_stats.c index a224c21198..c7e0c10845 100644 --- a/src/test/test_stats.c +++ b/src/test/test_stats.c @@ -721,7 +721,7 @@ test_overload_stats(void *arg) stats_str = rep_hist_get_overload_stats_lines(); tt_assert(!stats_str); - /* Note a overload */ + /* Note an overload */ rep_hist_note_overload(OVERLOAD_GENERAL); /* Move the time forward one hour */ @@ -742,7 +742,7 @@ test_overload_stats(void *arg) /* Now the time should be 2002-01-07 00:00:00 */ - /* Note a overload */ + /* Note an overload */ rep_hist_note_overload(OVERLOAD_GENERAL); stats_str = rep_hist_get_overload_general_line();