mirror of
https://gitlab.torproject.org/tpo/core/tor.git
synced 2024-11-27 22:03:31 +01:00
relay: Overload state on DNS timeout is now X% over Y secs
With this commit, we will only report a general overload state if we've seen more than X% of DNS timeout errors over Y seconds. Previous behavior was to report when a single timeout occurred, which is really too small of a threshold. The value X is a consensus parameter called "overload_dns_timeout_scale_percent", which is a scaled percentage (factor of 1000) so we can represent decimal points for X, like 0.5% for instance. Its default is 1000, which ends up being 1%. The value Y is a consensus parameter called "overload_dns_timeout_period_secs", which is the time period over which we will gather DNS errors; once it is over, we assess whether that X% has been reached, ultimately triggering a general overload signal. Closes #40491 Signed-off-by: David Goulet <dgoulet@torproject.org>
This commit is contained in:
parent
af48f5736a
commit
de907893be
@ -103,6 +103,7 @@
|
||||
#include "feature/dirauth/vote_microdesc_hash_st.h"
|
||||
#include "feature/nodelist/vote_routerstatus_st.h"
|
||||
#include "feature/nodelist/routerstatus_st.h"
|
||||
#include "feature/stats/rephist.h"
|
||||
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
@ -1663,6 +1664,7 @@ notify_before_networkstatus_changes(const networkstatus_t *old_c,
|
||||
dos_consensus_has_changed(new_c);
|
||||
relay_consensus_has_changed(new_c);
|
||||
hs_dos_consensus_has_changed(new_c);
|
||||
rep_hist_consensus_has_changed(new_c);
|
||||
}
|
||||
|
||||
/* Called after a new consensus has been put in the global state. It is safe
|
||||
|
@ -1548,16 +1548,6 @@ evdns_callback(int result, char type, int count, int ttl, void *addresses,
|
||||
|
||||
tor_addr_make_unspec(&addr);
|
||||
|
||||
/* Note down any DNS errors to the statistics module */
|
||||
if (result == DNS_ERR_TIMEOUT) {
|
||||
/* libevent timed out while resolving a name. However, because libevent
|
||||
* handles retries and timeouts internally, this means that all attempts of
|
||||
* libevent timed out. If we wanted to get more granular information about
|
||||
* individual libevent attempts, we would have to implement our own DNS
|
||||
* timeout/retry logic */
|
||||
rep_hist_note_overload(OVERLOAD_GENERAL);
|
||||
}
|
||||
|
||||
/* Keep track of whether IPv6 is working */
|
||||
if (type == DNS_IPv6_AAAA) {
|
||||
if (result == DNS_ERR_TIMEOUT) {
|
||||
@ -1659,6 +1649,10 @@ evdns_callback(int result, char type, int count, int ttl, void *addresses,
|
||||
dns_found_answer(string_address, orig_query_type,
|
||||
result, &addr, hostname, ttl);
|
||||
|
||||
/* The result can be changed within this function thus why we note the result
|
||||
* at the end. */
|
||||
rep_hist_note_dns_query(type, result);
|
||||
|
||||
tor_free(arg_);
|
||||
}
|
||||
|
||||
|
@ -84,6 +84,8 @@
|
||||
#include "feature/nodelist/networkstatus_st.h"
|
||||
#include "core/or/or_circuit_st.h"
|
||||
|
||||
#include <event2/dns.h>
|
||||
|
||||
#ifdef HAVE_FCNTL_H
|
||||
#include <fcntl.h>
|
||||
#endif
|
||||
@ -204,6 +206,54 @@ typedef struct {
|
||||
uint64_t overload_fd_exhausted;
|
||||
} overload_stats_t;
|
||||
|
||||
/***** DNS statistics *****/
|
||||
|
||||
/** Counters used to decide whether DNS timeouts seen by an Exit warrant a
 * general overload signal. Both counters cover only the current assessment
 * period: they are zeroed each time the period is (re)started. */
typedef struct {
  /** Number of DNS queries noted during the current assessment period,
   * regardless of how they ended. Bumped once per call to
   * rep_hist_note_dns_query(). */
  uint64_t stats_n_request;

  /** Number of those queries that ended with a DNS timeout error. */
  uint64_t stats_n_error_timeout;

  /** Time at which the counters are next assessed against the overload
   * threshold. A value of 0 means no period has been started yet. When this
   * time is reached, the counters are reset and the next assessment time is
   * scheduled. */
  time_t next_assessment_time;
} overload_dns_stats_t;

/** DNS query and timeout counters feeding the general overload state. */
static overload_dns_stats_t overload_dns_stats;
|
||||
|
||||
/* The threshold is carried as a percentage multiplied by this scale so that
 * fractional percentages fit in an integer parameter: 0.5% is expressed as
 * 500. The default of 1000 is therefore 1%, and the accepted range of 0 to
 * 100000 spans 0% to 100%. */
#define OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE 1000.0
#define OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT 1000
#define OVERLOAD_DNS_TIMEOUT_PERCENT_MIN 0
#define OVERLOAD_DNS_TIMEOUT_PERCENT_MAX 100000

/** Consensus parameter: fraction (0.0 to 1.0) of DNS timeout errors over the
 * total number of DNS requests that must be reached before a general
 * overload signal is triggered. Derived from the scaled percentage by
 * dividing by the scale and then by 100. */
static double overload_dns_timeout_fraction =
  OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT /
  OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE / 100.0;

/* Length of the assessment period, in seconds. The default is 10 minutes
 * (600) and any non-negative 32-bit value is accepted. */
#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT (10 * 60)
#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MIN 0
#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX INT32_MAX

/** Consensus parameter: period, in seconds, over which DNS requests and
 * timeout errors are counted. Once the period is over, the counters are
 * assessed to decide whether an overload is triggered. */
static int32_t overload_dns_timeout_period_secs =
  OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT;
|
||||
|
||||
/** Current state of overload stats */
|
||||
static overload_stats_t overload_stats;
|
||||
|
||||
@ -218,6 +268,89 @@ overload_happened_recently(time_t overload_time, int n_hours)
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Assess the DNS timeout errors and if we have enough to trigger a general
|
||||
* overload. */
|
||||
static void
|
||||
overload_general_dns_assessment(void)
|
||||
{
|
||||
/* Initialize the time. Should be done once. */
|
||||
if (overload_dns_stats.next_assessment_time == 0) {
|
||||
goto reset;
|
||||
}
|
||||
|
||||
/* Not the time yet. */
|
||||
if (overload_dns_stats.next_assessment_time > approx_time()) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* Lets see if we can signal a general overload. */
|
||||
double fraction = (double) overload_dns_stats.stats_n_error_timeout /
|
||||
(double) overload_dns_stats.stats_n_request;
|
||||
if (fraction >= overload_dns_timeout_fraction) {
|
||||
log_notice(LD_HIST, "General overload -> DNS timeouts (%" PRIu64 ") "
|
||||
"fraction %.4f%% is above threshold of %.4f%%",
|
||||
overload_dns_stats.stats_n_error_timeout,
|
||||
fraction * 100.0,
|
||||
overload_dns_timeout_fraction * 100.0);
|
||||
rep_hist_note_overload(OVERLOAD_GENERAL);
|
||||
}
|
||||
|
||||
reset:
|
||||
/* Reset counters for the next period. */
|
||||
overload_dns_stats.stats_n_error_timeout = 0;
|
||||
overload_dns_stats.stats_n_request = 0;
|
||||
overload_dns_stats.next_assessment_time =
|
||||
approx_time() + overload_dns_timeout_period_secs;
|
||||
}
|
||||
|
||||
/** Called just before the consensus will be replaced. Update the consensus
|
||||
* parameters in case they changed. */
|
||||
void
|
||||
rep_hist_consensus_has_changed(const networkstatus_t *ns)
|
||||
{
|
||||
overload_dns_timeout_fraction =
|
||||
networkstatus_get_param(ns, "overload_dns_timeout_scale_percent",
|
||||
OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT,
|
||||
OVERLOAD_DNS_TIMEOUT_PERCENT_MIN,
|
||||
OVERLOAD_DNS_TIMEOUT_PERCENT_MAX) /
|
||||
OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE / 100.0;
|
||||
|
||||
overload_dns_timeout_period_secs =
|
||||
networkstatus_get_param(ns, "overload_dns_timeout_period_secs",
|
||||
OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT,
|
||||
OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MIN,
|
||||
OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX);
|
||||
}
|
||||
|
||||
/** Note a DNS error for the given given libevent DNS record type and error
|
||||
* code. Possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA.
|
||||
*
|
||||
* IMPORTANT: Libevent is _not_ returning the type in case of an error and so
|
||||
* if error is anything but DNS_ERR_NONE, the type is not usable and set to 0.
|
||||
*
|
||||
* See: https://gitlab.torproject.org/tpo/core/tor/-/issues/40490 */
|
||||
void
|
||||
rep_hist_note_dns_query(int type, uint8_t error)
|
||||
{
|
||||
(void) type;
|
||||
|
||||
/* Assess if we need to trigger a general overload with regards to the DNS
|
||||
* errors or not. */
|
||||
overload_general_dns_assessment();
|
||||
|
||||
/* We only care about timeouts for the moment. */
|
||||
switch (error) {
|
||||
case DNS_ERR_TIMEOUT:
|
||||
overload_dns_stats.stats_n_error_timeout++;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
/* Increment total number of requests. */
|
||||
overload_dns_stats.stats_n_request++;
|
||||
}
|
||||
|
||||
/* The current version of the overload stats version */
|
||||
#define OVERLOAD_STATS_VERSION 1
|
||||
|
||||
|
@ -72,11 +72,14 @@ void rep_hist_seen_new_rp_cell(bool is_v2);
|
||||
char *rep_hist_get_hs_v3_stats_string(void);
|
||||
void rep_hist_hsdir_stored_maybe_new_v3_onion(const uint8_t *blinded_key);
|
||||
|
||||
void rep_hist_note_dns_query(int type, uint8_t error);
|
||||
|
||||
void rep_hist_free_all(void);
|
||||
|
||||
void rep_hist_note_negotiated_link_proto(unsigned link_proto,
|
||||
int started_here);
|
||||
void rep_hist_log_link_protocol_counts(void);
|
||||
void rep_hist_consensus_has_changed(const networkstatus_t *ns);
|
||||
|
||||
extern uint64_t rephist_total_alloc;
|
||||
extern uint32_t rephist_total_num;
|
||||
|
Loading…
Reference in New Issue
Block a user