Merge remote-tracking branch 'tor-gitlab/mr/500' into main

This commit is contained in:
Alexander Færøy 2021-12-15 12:46:18 +00:00
commit 193781e6ef
6 changed files with 18 additions and 180 deletions

11
changes/ticket40527 Normal file
View File

@ -0,0 +1,11 @@
o Major bugfixes (relay, overload):
- Don't make Tor DNS timeout trigger an overload general state. These
timeouts are different from DNS server timeout. They have to be seen as
timeout related to UX and not because of a network problem. Fixes bug
40527; bugfix on 0.4.6.1-alpha.
- Change the MetricsPort DNS "timeout" label to be "tor_timeout" in order
to indicate that this was a DNS timeout from tor perspective and not the
DNS server itself.
- Deprecate overload_dns_timeout_period_secs and
overload_dns_timeout_scale_percent consensus parameters as well. They
were used to assess the overload state which is no more now.

View File

@ -1666,7 +1666,6 @@ notify_before_networkstatus_changes(const networkstatus_t *old_c,
dos_consensus_has_changed(new_c);
relay_consensus_has_changed(new_c);
hs_dos_consensus_has_changed(new_c);
rep_hist_consensus_has_changed(new_c);
}
/* Called after a new consensus has been put in the global state. It is safe

View File

@ -161,7 +161,7 @@ fill_dns_error_values(void)
{ .name = "refused", .key = DNS_ERR_REFUSED },
{ .name = "truncated", .key = DNS_ERR_TRUNCATED },
{ .name = "unknown", .key = DNS_ERR_UNKNOWN },
{ .name = "timeout", .key = DNS_ERR_TIMEOUT },
{ .name = "tor_timeout", .key = DNS_ERR_TIMEOUT },
{ .name = "shutdown", .key = DNS_ERR_SHUTDOWN },
{ .name = "cancel", .key = DNS_ERR_CANCEL },
{ .name = "nodata", .key = DNS_ERR_NODATA },

View File

@ -219,33 +219,6 @@ static uint64_t stats_n_tcp_exhaustion = 0;
/***** DNS statistics *****/
/* We use a scale here so we can represent percentages with decimal points by
* scaling the value by this factor and so 0.5% becomes a value of 500.
* Default is 1% and thus min and max range is 0 to 100%. */
#define OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE 1000.0
#define OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT 1000
#define OVERLOAD_DNS_TIMEOUT_PERCENT_MIN 0
#define OVERLOAD_DNS_TIMEOUT_PERCENT_MAX 100000
/** Consensus parameter: indicate what fraction of DNS timeout errors over the
* total number of DNS requests must be reached before we trigger a general
* overload signal .*/
static double overload_dns_timeout_fraction =
OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT /
OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE / 100.0;
/* Number of seconds for the assessment period. Default is 10 minutes (600) and
* the min max range is within a 32bit value. */
#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT (10 * 60)
#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MIN 0
#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX INT32_MAX
/** Consensus parameter: Period, in seconds, over which we count the number of
* DNS requests and timeout errors. After that period, we assess if we trigger
* an overload or not. */
static int32_t overload_dns_timeout_period_secs =
OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT;
/** Overload DNS statistics. The information in this object is used to assess
* if, due to DNS errors, we should emit a general overload signal or not.
*
@ -260,9 +233,6 @@ typedef struct {
* right before the DNS request is initiated. */
uint64_t stats_n_request;
/** Total number of DNS timeout errors. */
uint64_t stats_n_error_timeout;
/** When is the next assessment time of the general overload for DNS errors.
* Once this time is reached, all stats are reset and this time is set to the
* next assessment time. */
@ -285,7 +255,7 @@ typedef struct {
/* Total number of DNS errors specific to libevent. */
uint64_t stats_n_error_truncated; /* 65 */
uint64_t stats_n_error_unknown; /* 66 */
uint64_t stats_n_error_timeout; /* 67 */
uint64_t stats_n_error_tor_timeout; /* 67 */
uint64_t stats_n_error_shutdown; /* 68 */
uint64_t stats_n_error_cancel; /* 69 */
uint64_t stats_n_error_nodata; /* 70 */
@ -337,60 +307,6 @@ get_dns_stats_by_type(const int type)
}
#endif
/** Assess the DNS timeout errors and if we have enough to trigger a general
* overload. */
static void
overload_general_dns_assessment(void)
{
/* Initialize the time. Should be done once. */
if (overload_dns_stats.next_assessment_time == 0) {
goto reset;
}
/* Not the time yet. */
if (overload_dns_stats.next_assessment_time > approx_time()) {
return;
}
/* Lets see if we can signal a general overload. */
double fraction = (double) overload_dns_stats.stats_n_error_timeout /
(double) overload_dns_stats.stats_n_request;
if (fraction >= overload_dns_timeout_fraction) {
log_notice(LD_HIST, "General overload -> DNS timeouts (%" PRIu64 ") "
"fraction %.4f%% is above threshold of %.4f%%",
overload_dns_stats.stats_n_error_timeout,
fraction * 100.0,
overload_dns_timeout_fraction * 100.0);
rep_hist_note_overload(OVERLOAD_GENERAL);
}
reset:
/* Reset counters for the next period. */
overload_dns_stats.stats_n_error_timeout = 0;
overload_dns_stats.stats_n_request = 0;
overload_dns_stats.next_assessment_time =
approx_time() + overload_dns_timeout_period_secs;
}
/** Called just before the consensus will be replaced. Update the consensus
* parameters in case they changed. */
void
rep_hist_consensus_has_changed(const networkstatus_t *ns)
{
overload_dns_timeout_fraction =
networkstatus_get_param(ns, "overload_dns_timeout_scale_percent",
OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT,
OVERLOAD_DNS_TIMEOUT_PERCENT_MIN,
OVERLOAD_DNS_TIMEOUT_PERCENT_MAX) /
OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE / 100.0;
overload_dns_timeout_period_secs =
networkstatus_get_param(ns, "overload_dns_timeout_period_secs",
OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT,
OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MIN,
OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX);
}
/** Return the DNS error count for the given libevent DNS type and error code.
* The possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA. */
uint64_t
@ -419,7 +335,7 @@ rep_hist_get_n_dns_error(int type, uint8_t error)
case DNS_ERR_UNKNOWN:
return dns_stats->stats_n_error_unknown;
case DNS_ERR_TIMEOUT:
return dns_stats->stats_n_error_timeout;
return dns_stats->stats_n_error_tor_timeout;
case DNS_ERR_SHUTDOWN:
return dns_stats->stats_n_error_shutdown;
case DNS_ERR_CANCEL:
@ -454,16 +370,6 @@ rep_hist_get_n_dns_request(int type)
void
rep_hist_note_dns_error(int type, uint8_t error)
{
/* Assess if we need to trigger a general overload with regards to the DNS
* errors or not. */
overload_general_dns_assessment();
/* Because of the libevent problem (see note in the function comment), we
* disregard the DNS query type and keep track of DNS timeout for the
* overload state. */
if (error == DNS_ERR_TIMEOUT) {
overload_dns_stats.stats_n_error_timeout++;
}
overload_dns_stats.stats_n_request++;
/* Again, the libevent bug (see function comment), for an error that is
@ -502,7 +408,7 @@ rep_hist_note_dns_error(int type, uint8_t error)
dns_stats->stats_n_error_unknown++;
break;
case DNS_ERR_TIMEOUT:
dns_stats->stats_n_error_timeout++;
dns_stats->stats_n_error_tor_timeout++;
break;
case DNS_ERR_SHUTDOWN:
dns_stats->stats_n_error_shutdown++;

View File

@ -89,8 +89,6 @@ uint64_t rep_hist_get_n_dns_request(int type);
void rep_hist_note_dns_request(int type);
void rep_hist_note_dns_error(int type, uint8_t error);
void rep_hist_consensus_has_changed(const networkstatus_t *ns);
extern uint64_t rephist_total_alloc;
extern uint32_t rephist_total_num;
#ifdef TOR_UNIT_TESTS

View File

@ -721,7 +721,7 @@ test_overload_stats(void *arg)
stats_str = rep_hist_get_overload_stats_lines();
tt_assert(!stats_str);
/* Note a DNS overload */
/* Note an overload */
rep_hist_note_overload(OVERLOAD_GENERAL);
/* Move the time forward one hour */
@ -742,7 +742,7 @@ test_overload_stats(void *arg)
/* Now the time should be 2002-01-07 00:00:00 */
/* Note a DNS overload */
/* Note an overload */
rep_hist_note_overload(OVERLOAD_GENERAL);
stats_str = rep_hist_get_overload_general_line();
@ -760,7 +760,7 @@ test_overload_stats(void *arg)
tt_str_op("overload-fd-exhausted 1 2002-01-07 00:00:00\n", OP_EQ, stats_str);
tor_free(stats_str);
/* Move the time forward. Register DNS overload. See that the time changed */
/* Move the time forward. Register overload. See that the time changed */
current_time += 3600*2;
update_approx_time(current_time);
@ -867,81 +867,6 @@ test_overload_stats(void *arg)
tor_free(stats_str);
}
/** Test the overload stats logic. */
static void
test_overload_dns_timeout(void *arg)
{
char *stats_str = NULL;
(void) arg;
/* Lets simulate a series of timeouts but below our default 1% threshold. */
for (int i = 0; i < 1000; i++) {
/* This should trigger 9 timeouts which is just below 1% (10) */
if (i > 0 && !(i % 100)) {
rep_hist_note_dns_error(0, DNS_ERR_TIMEOUT);
} else {
rep_hist_note_dns_error(0, DNS_ERR_NONE);
}
}
/* No overload yet. */
stats_str = rep_hist_get_overload_general_line();
tt_assert(!stats_str);
/* Move it 10 minutes in the future and see if we get a general overload. */
update_approx_time(approx_time() + (10 * 60));
/* This query should NOT trigger the general overload because we are below
* our default of 1%. */
rep_hist_note_dns_error(0, DNS_ERR_NONE);
stats_str = rep_hist_get_overload_general_line();
tt_assert(!stats_str);
/* We'll now go above our 1% threshold. */
for (int i = 0; i < 1000; i++) {
/* This should trigger 10 timeouts which is our threshold of 1% (10) */
if (!(i % 10)) {
rep_hist_note_dns_error(0, DNS_ERR_TIMEOUT);
} else {
rep_hist_note_dns_error(0, DNS_ERR_NONE);
}
}
/* Move it 10 minutes in the future and see if we get a general overload. */
update_approx_time(approx_time() + (10 * 60));
/* This query should trigger the general overload because we are above 1%. */
rep_hist_note_dns_error(0, DNS_ERR_NONE);
stats_str = rep_hist_get_overload_general_line();
tt_assert(stats_str);
tor_free(stats_str);
/* Move 72h in the future, we should NOT get an overload anymore. */
update_approx_time(approx_time() + (72 * 3600));
stats_str = rep_hist_get_overload_general_line();
tt_assert(!stats_str);
/* This query should NOT trigger the general overload. */
rep_hist_note_dns_error(0, DNS_ERR_TIMEOUT);
stats_str = rep_hist_get_overload_general_line();
tt_assert(!stats_str);
/* Move it 10 minutes in the future and see if we get a general overload. We
* have now 100% of requests timing out. */
update_approx_time(approx_time() + (10 * 60));
/* This query should trigger the general overload with 50% of timeouts. */
rep_hist_note_dns_error(0, DNS_ERR_NONE);
stats_str = rep_hist_get_overload_general_line();
tt_assert(stats_str);
tor_free(stats_str);
done:
tor_free(stats_str);
}
#define ENT(name) \
{ #name, test_ ## name , 0, NULL, NULL }
#define FORK(name) \
@ -958,7 +883,6 @@ struct testcase_t stats_tests[] = {
FORK(rephist_v3_onions),
FORK(load_stats_file),
FORK(overload_stats),
FORK(overload_dns_timeout),
END_OF_TESTCASES
};