Merge branch 'bug40168+34088-035-v3' into bug40168+34088-035-v3-master

This commit is contained in:
Mike Perry 2021-02-18 18:17:18 +00:00
commit b2f025cb56
5 changed files with 183 additions and 130 deletions

4
changes/bug34088 Normal file
View File

@ -0,0 +1,4 @@
o Minor bugfixes (circuit build timeout):
- Remove max_time calculation and associated warn from circuit build
timeout 'alpha' parameter estimation, as this is no longer needed
by our new estimator from 40168. Fixes bug 34088; bugfix on 0.2.2.9-alpha.

16
changes/bug40168 Normal file
View File

@ -0,0 +1,16 @@
o Minor bugfixes (circuit build timeout):
- Improve the accuracy of our circuit build timeout calculation for 60%,
70%, and 80% build rates for various guard choices. We now use a maximum
likelihood estimator for Pareto parameters of the circuit build time
distribution, instead of a "right-censored estimator". This causes
clients to ignore circuits that never finish building in their timeout
calculations. Previously, clients were counting such unfinished circuits
as having the highest possible build time value, when in reality these
circuits most likely just contain relays that are offline. We also now
wait a bit longer to let circuits complete for measurement purposes,
lower the minimum possible effective timeout from 1.5 seconds to 10ms,
and increase the resolution of the circuit build time histogram from
50ms bin widths to 10ms bin widths. Additionally, we alter our estimate
Xm by taking the maximum of the top 10 most common build time values
of the 10ms histogram, and compute Xm as the average of these.
Fixes bug 40168; bugfix on 0.2.2.14-alpha.

View File

@ -203,10 +203,10 @@ circuit_build_times_max_timeouts(void)
* Retrieve and bounds-check the cbtnummodes consensus parameter.
*
* Effect: This value governs how many modes to use in the weighted
* average calculation of Pareto parameter Xm. A value of 3 introduces
* some bias (2-5% of CDF) under ideal conditions, but allows for better
* performance in the event that a client chooses guard nodes of radically
* different performance characteristics.
* average calculation of Pareto parameter Xm. Analysis of pairs of
* geographically near, far, and mixed guaeds has shown that a value of
* 10 introduces some allows for the actual timeout rate to be within
* 2-7% of the cutoff quantile, for quantiles between 60-80%.
*/
static int32_t
circuit_build_times_default_num_xm_modes(void)
@ -851,58 +851,49 @@ circuit_build_times_create_histogram(const circuit_build_times_t *cbt,
* Return the Pareto start-of-curve parameter Xm.
*
* Because we are not a true Pareto curve, we compute this as the
* weighted average of the N most frequent build time bins. N is either
* 1 if we don't have enough circuit build time data collected, or
* determined by the consensus parameter cbtnummodes (default 3).
* weighted average of the 10 most frequent build time bins. This
* heuristic allowed for the actual timeout rate to be closest
* to the chosen quantile cutoff, for quantiles 60-80%, out of
* many variant approaches (see #40157 for analysis).
*/
static build_time_t
STATIC build_time_t
circuit_build_times_get_xm(circuit_build_times_t *cbt)
{
build_time_t i, nbins;
build_time_t nbins = 0;
build_time_t *nth_max_bin;
int32_t bin_counts=0;
build_time_t ret = 0;
uint32_t *histogram = circuit_build_times_create_histogram(cbt, &nbins);
int n=0;
build_time_t xm_total = 0;
build_time_t Xm = 0;
int32_t xm_counts=0;
int num_modes = circuit_build_times_default_num_xm_modes();
uint32_t *histogram = circuit_build_times_create_histogram(cbt, &nbins);
tor_assert(nbins > 0);
tor_assert(num_modes > 0);
// Only use one mode if < 1000 buildtimes. Not enough data
// for multiple.
if (cbt->total_build_times < CBT_NCIRCUITS_TO_OBSERVE)
num_modes = 1;
nth_max_bin = tor_calloc(num_modes, sizeof(build_time_t));
/* Determine the N most common build times */
for (i = 0; i < nbins; i++) {
if (histogram[i] >= histogram[nth_max_bin[0]]) {
nth_max_bin[0] = i;
}
for (n = 1; n < num_modes; n++) {
if (histogram[i] >= histogram[nth_max_bin[n]] &&
(!histogram[nth_max_bin[n-1]]
|| histogram[i] < histogram[nth_max_bin[n-1]])) {
/* Determine the N most common build times, by selecting the
* nth largest mode, counting it, and removing it from the histogram. */
for (int n = 0; n < num_modes; n++) {
/* Get nth mode */
for (build_time_t i = 0; i < nbins; i++) {
if (histogram[i] > histogram[nth_max_bin[n]]) {
nth_max_bin[n] = i;
}
}
/* Update average */
xm_counts += histogram[nth_max_bin[n]];
xm_total += CBT_BIN_TO_MS(nth_max_bin[n])*histogram[nth_max_bin[n]];
/* Prevent from re-counting this value */
histogram[nth_max_bin[n]] = 0;
}
for (n = 0; n < num_modes; n++) {
bin_counts += histogram[nth_max_bin[n]];
ret += CBT_BIN_TO_MS(nth_max_bin[n])*histogram[nth_max_bin[n]];
log_info(LD_CIRC, "Xm mode #%d: %u %u", n, CBT_BIN_TO_MS(nth_max_bin[n]),
histogram[nth_max_bin[n]]);
}
/* bin_counts can become zero if all of our last CBT_NCIRCUITS_TO_OBSERVE
/* xm_counts can become zero if all of our last CBT_NCIRCUITS_TO_OBSERVE
* circuits were abandoned before they completed. This shouldn't happen,
* though. We should have reset/re-learned a lower timeout first. */
if (bin_counts == 0) {
ret = 0;
if (xm_counts == 0) {
log_warn(LD_CIRC,
"No valid circuit build time data out of %d times, %u modes, "
"have_timeout=%d, %lfms", cbt->total_build_times, num_modes,
@ -910,15 +901,13 @@ circuit_build_times_get_xm(circuit_build_times_t *cbt)
goto done;
}
tor_assert(bin_counts > 0);
ret /= bin_counts;
Xm = xm_total / xm_counts;
done:
tor_free(histogram);
tor_free(nth_max_bin);
return ret;
return Xm;
}
/**
@ -1007,43 +996,6 @@ circuit_build_times_shuffle_and_store_array(circuit_build_times_t *cbt,
}
}
/**
* Filter old synthetic timeouts that were created before the
* new right-censored Pareto calculation was deployed.
*
* Once all clients before 0.2.1.13-alpha are gone, this code
* will be unused.
*/
static int
circuit_build_times_filter_timeouts(circuit_build_times_t *cbt)
{
int num_filtered=0, i=0;
double timeout_rate = 0;
build_time_t max_timeout = 0;
timeout_rate = circuit_build_times_timeout_rate(cbt);
max_timeout = (build_time_t)cbt->close_ms;
for (i = 0; i < CBT_NCIRCUITS_TO_OBSERVE; i++) {
if (cbt->circuit_build_times[i] > max_timeout) {
build_time_t replaced = cbt->circuit_build_times[i];
num_filtered++;
cbt->circuit_build_times[i] = CBT_BUILD_ABANDONED;
log_debug(LD_CIRC, "Replaced timeout %d with %d", replaced,
cbt->circuit_build_times[i]);
}
}
log_info(LD_CIRC,
"We had %d timeouts out of %d build times, "
"and filtered %d above the max of %u",
(int)(cbt->total_build_times*timeout_rate),
cbt->total_build_times, num_filtered, max_timeout);
return num_filtered;
}
/**
* Load histogram from <b>state</b>, shuffling the resulting array
* after we do so. Use this result to estimate parameters and
@ -1169,10 +1121,6 @@ circuit_build_times_parse_state(circuit_build_times_t *cbt,
circuit_build_times_set_timeout(cbt);
if (!state->CircuitBuildAbandonedCount && cbt->total_build_times) {
circuit_build_times_filter_timeouts(cbt);
}
done:
tor_free(loaded_times);
return err ? -1 : 0;
@ -1193,7 +1141,6 @@ circuit_build_times_update_alpha(circuit_build_times_t *cbt)
build_time_t *x=cbt->circuit_build_times;
double a = 0;
int n=0,i=0,abandoned_count=0;
build_time_t max_time=0;
/* https://en.wikipedia.org/wiki/Pareto_distribution#Parameter_estimation */
/* We sort of cheat here and make our samples slightly more pareto-like
@ -1213,14 +1160,13 @@ circuit_build_times_update_alpha(circuit_build_times_t *cbt)
if (x[i] < cbt->Xm) {
a += tor_mathlog(cbt->Xm);
n++;
} else if (x[i] == CBT_BUILD_ABANDONED) {
abandoned_count++;
} else {
a += tor_mathlog(x[i]);
if (x[i] > max_time)
max_time = x[i];
n++;
}
n++;
}
/*
@ -1229,30 +1175,23 @@ circuit_build_times_update_alpha(circuit_build_times_t *cbt)
* performs this same check, and resets state if it hits it. If we
* hit it at runtime, something serious has gone wrong.
*/
if (n!=cbt->total_build_times) {
if (n!=cbt->total_build_times-abandoned_count) {
log_err(LD_CIRC, "Discrepancy in build times count: %d vs %d", n,
cbt->total_build_times);
}
tor_assert(n==cbt->total_build_times);
if (max_time <= 0) {
/* This can happen if Xm is actually the *maximum* value in the set.
* It can also happen if we've abandoned every single circuit somehow.
* In either case, tell the caller not to compute a new build timeout. */
log_warn(LD_BUG,
"Could not determine largest build time (%d). "
"Xm is %dms and we've abandoned %d out of %d circuits.", max_time,
cbt->Xm, abandoned_count, n);
return 0;
}
a += abandoned_count*tor_mathlog(max_time);
tor_assert_nonfatal(n==cbt->total_build_times-abandoned_count);
/* This is the "Maximum Likelihood Estimator" for parameter alpha of a Pareto
* Distribution. See:
* https://en.wikipedia.org/wiki/Pareto_distribution#Estimation_of_parameters
*
* The division in the estimator is done with subtraction outside the ln(),
* with the sum occurring in the for loop above.
*
* This done is to avoid the precision issues of logs of small values.
*/
a -= n*tor_mathlog(cbt->Xm);
// Estimator comes from Eq #4 in:
// "Bayesian estimation based on trimmed samples from Pareto populations"
// by Arturo J. Fernández. We are right-censored only.
a = (n-abandoned_count)/a;
a = n/a;
cbt->alpha = a;
@ -1661,9 +1600,8 @@ circuit_build_times_network_check_changed(circuit_build_times_t *cbt)
log_notice(LD_CIRC,
"Your network connection speed appears to have changed. Resetting "
"timeout to %lds after %d timeouts and %d buildtimes.",
tor_lround(cbt->timeout_ms/1000), timeout_count,
total_build_times);
"timeout to %ldms after %d timeouts and %d buildtimes.",
tor_lround(cbt->timeout_ms), timeout_count, total_build_times);
return 1;
}
@ -1829,7 +1767,7 @@ circuit_build_times_set_timeout(circuit_build_times_t *cbt)
return;
if (cbt->timeout_ms < circuit_build_times_min_timeout()) {
log_info(LD_CIRC, "Set buildtimeout to low value %fms. Setting to %dms",
log_notice(LD_CIRC, "Set buildtimeout to low value %fms. Setting to %dms",
cbt->timeout_ms, circuit_build_times_min_timeout());
cbt->timeout_ms = circuit_build_times_min_timeout();
if (cbt->close_ms < cbt->timeout_ms) {
@ -1847,9 +1785,9 @@ circuit_build_times_set_timeout(circuit_build_times_t *cbt)
log_info(LD_CIRC,
"Based on %d circuit times, it looks like we don't need to "
"wait so long for circuits to finish. We will now assume a "
"circuit is too slow to use after waiting %ld seconds.",
"circuit is too slow to use after waiting %ld milliseconds.",
cbt->total_build_times,
tor_lround(cbt->timeout_ms/1000));
tor_lround(cbt->timeout_ms));
log_info(LD_CIRC,
"Circuit timeout data: %fms, %fms, Xm: %d, a: %f, r: %f",
cbt->timeout_ms, cbt->close_ms, cbt->Xm, cbt->alpha,
@ -1858,18 +1796,18 @@ circuit_build_times_set_timeout(circuit_build_times_t *cbt)
log_info(LD_CIRC,
"Based on %d circuit times, it looks like we need to wait "
"longer for circuits to finish. We will now assume a "
"circuit is too slow to use after waiting %ld seconds.",
"circuit is too slow to use after waiting %ld milliseconds.",
cbt->total_build_times,
tor_lround(cbt->timeout_ms/1000));
tor_lround(cbt->timeout_ms));
log_info(LD_CIRC,
"Circuit timeout data: %fms, %fms, Xm: %d, a: %f, r: %f",
cbt->timeout_ms, cbt->close_ms, cbt->Xm, cbt->alpha,
timeout_rate);
} else {
log_info(LD_CIRC,
"Set circuit build timeout to %lds (%fms, %fms, Xm: %d, a: %f,"
"Set circuit build timeout to %ldms (%fms, %fms, Xm: %d, a: %f,"
" r: %f) based on %d circuit times",
tor_lround(cbt->timeout_ms/1000),
tor_lround(cbt->timeout_ms),
cbt->timeout_ms, cbt->close_ms, cbt->Xm, cbt->alpha, timeout_rate,
cbt->total_build_times);
}

View File

@ -56,10 +56,10 @@ void circuit_build_times_reset(circuit_build_times_t *cbt);
#define CBT_NCIRCUITS_TO_OBSERVE 1000
/** Width of the histogram bins in milliseconds */
#define CBT_BIN_WIDTH ((build_time_t)50)
#define CBT_BIN_WIDTH ((build_time_t)10)
/** Number of modes to use in the weighted-avg computation of Xm */
#define CBT_DEFAULT_NUM_XM_MODES 3
#define CBT_DEFAULT_NUM_XM_MODES 10
#define CBT_MIN_NUM_XM_MODES 1
#define CBT_MAX_NUM_XM_MODES 20
@ -79,7 +79,7 @@ void circuit_build_times_reset(circuit_build_times_t *cbt);
* How long to wait before actually closing circuits that take too long to
* build in terms of CDF quantile.
*/
#define CBT_DEFAULT_CLOSE_QUANTILE 95
#define CBT_DEFAULT_CLOSE_QUANTILE 99
#define CBT_MIN_CLOSE_QUANTILE CBT_MIN_QUANTILE_CUTOFF
#define CBT_MAX_CLOSE_QUANTILE CBT_MAX_QUANTILE_CUTOFF
@ -120,8 +120,8 @@ double circuit_build_times_quantile_cutoff(void);
#define CBT_MAX_TEST_FREQUENCY INT32_MAX
/** Lowest allowable value for CircuitBuildTimeout in milliseconds */
#define CBT_DEFAULT_TIMEOUT_MIN_VALUE (1500)
#define CBT_MIN_TIMEOUT_MIN_VALUE 500
#define CBT_DEFAULT_TIMEOUT_MIN_VALUE (CBT_BIN_WIDTH)
#define CBT_MIN_TIMEOUT_MIN_VALUE CBT_BIN_WIDTH
#define CBT_MAX_TIMEOUT_MIN_VALUE INT32_MAX
/** Initial circuit build timeout in milliseconds */
@ -142,6 +142,7 @@ STATIC int circuit_build_times_update_alpha(circuit_build_times_t *cbt);
/* Network liveness functions */
STATIC int circuit_build_times_network_check_changed(
circuit_build_times_t *cbt);
STATIC build_time_t circuit_build_times_get_xm(circuit_build_times_t *cbt);
#endif /* defined(CIRCUITSTATS_PRIVATE) */
#ifdef TOR_UNIT_TESTS

View File

@ -58,6 +58,7 @@
#include "core/or/policies.h"
#include "app/config/statefile.h"
#include "lib/crypt_ops/crypto_curve25519.h"
#include "feature/nodelist/networkstatus.h"
#include "core/or/extend_info_st.h"
#include "core/or/or_circuit_st.h"
@ -355,6 +356,106 @@ test_onion_queues(void *arg)
tor_free(onionskin);
}
static int32_t cbtnummodes = 10;
static int32_t
mock_xm_networkstatus_get_param(
const networkstatus_t *ns, const char *param_name, int32_t default_val,
int32_t min_val, int32_t max_val)
{
(void)ns;
(void)default_val;
(void)min_val;
(void)max_val;
// only support cbtnummodes right now
tor_assert(strcmp(param_name, "cbtnummodes")==0);
return cbtnummodes;
}
static void
test_circuit_timeout_xm_alpha(void *arg)
{
circuit_build_times_t cbt;
build_time_t Xm;
int alpha_ret;
circuit_build_times_init(&cbt);
(void)arg;
/* Plan:
* 1. Create array of build times with 10 modes.
* 2. Make sure Xm calc is sane for 1,3,5,10,15,20 modes.
* 3. Make sure alpha calc is sane for 1,3,5,10,15,20 modes.
*/
/* 110 build times, 9 modes, 8 mode ties, 10 abandoned */
build_time_t circuit_build_times[] = {
100, 20, 1000, 500, 200, 5000, 30, 600, 200, 300, CBT_BUILD_ABANDONED,
101, 21, 1001, 501, 201, 5001, 31, 601, 201, 301, CBT_BUILD_ABANDONED,
102, 22, 1002, 502, 202, 5002, 32, 602, 202, 302, CBT_BUILD_ABANDONED,
103, 23, 1003, 503, 203, 5003, 33, 603, 203, 303, CBT_BUILD_ABANDONED,
104, 24, 1004, 504, 204, 5004, 34, 604, 204, 304, CBT_BUILD_ABANDONED,
105, 25, 1005, 505, 205, 5005, 35, 605, 205, 305, CBT_BUILD_ABANDONED,
106, 26, 1006, 506, 206, 5006, 36, 606, 206, 306, CBT_BUILD_ABANDONED,
107, 27, 1007, 507, 207, 5007, 37, 607, 207, 307, CBT_BUILD_ABANDONED,
108, 28, 1008, 508, 208, 5008, 38, 608, 208, 308, CBT_BUILD_ABANDONED,
109, 29, 1009, 509, 209, 5009, 39, 609, 209, 309, CBT_BUILD_ABANDONED
};
memcpy(cbt.circuit_build_times, circuit_build_times,
sizeof(circuit_build_times));
cbt.total_build_times = 110;
MOCK(networkstatus_get_param, mock_xm_networkstatus_get_param);
#define CBT_ALPHA_PRECISION 0.00001
cbtnummodes = 1;
Xm = circuit_build_times_get_xm(&cbt);
alpha_ret = circuit_build_times_update_alpha(&cbt);
tt_int_op(alpha_ret, OP_EQ, 1);
tt_int_op(Xm, OP_EQ, 205);
tt_assert(fabs(cbt.alpha - 1.394401) < CBT_ALPHA_PRECISION);
cbtnummodes = 3;
Xm = circuit_build_times_get_xm(&cbt);
alpha_ret = circuit_build_times_update_alpha(&cbt);
tt_int_op(alpha_ret, OP_EQ, 1);
tt_int_op(Xm, OP_EQ, 117);
tt_assert(fabs(cbt.alpha - 0.902313) < CBT_ALPHA_PRECISION);
cbtnummodes = 5;
Xm = circuit_build_times_get_xm(&cbt);
alpha_ret = circuit_build_times_update_alpha(&cbt);
tt_int_op(alpha_ret, OP_EQ, 1);
tt_int_op(Xm, OP_EQ, 146);
tt_assert(fabs(cbt.alpha - 1.049032) < CBT_ALPHA_PRECISION);
cbtnummodes = 10;
Xm = circuit_build_times_get_xm(&cbt);
alpha_ret = circuit_build_times_update_alpha(&cbt);
tt_int_op(alpha_ret, OP_EQ, 1);
tt_int_op(Xm, OP_EQ, 800);
tt_assert(fabs(cbt.alpha - 4.851754) < CBT_ALPHA_PRECISION);
cbtnummodes = 15;
Xm = circuit_build_times_get_xm(&cbt);
alpha_ret = circuit_build_times_update_alpha(&cbt);
tt_int_op(alpha_ret, OP_EQ, 1);
tt_int_op(Xm, OP_EQ, 800);
tt_assert(fabs(cbt.alpha - 4.851754) < CBT_ALPHA_PRECISION);
cbtnummodes = 20;
Xm = circuit_build_times_get_xm(&cbt);
alpha_ret = circuit_build_times_update_alpha(&cbt);
tt_int_op(alpha_ret, OP_EQ, 1);
tt_int_op(Xm, OP_EQ, 800);
tt_assert(fabs(cbt.alpha - 4.851754) < CBT_ALPHA_PRECISION);
done:
#undef CBT_ALPHA_PRECISION
UNMOCK(networkstatus_get_param);
circuit_build_times_free_timeouts(&cbt);
}
static void
test_circuit_timeout(void *arg)
{
@ -373,7 +474,6 @@ test_circuit_timeout(void *arg)
double timeout1, timeout2;
or_state_t *state=NULL;
int i, runs;
double close_ms;
(void)arg;
initialize_periodic_events();
@ -394,18 +494,11 @@ test_circuit_timeout(void *arg)
circuit_build_times_initial_alpha(&initial,
CBT_DEFAULT_QUANTILE_CUTOFF/100.0,
timeout0);
close_ms = MAX(circuit_build_times_calculate_timeout(&initial,
CBT_DEFAULT_CLOSE_QUANTILE/100.0),
CBT_DEFAULT_TIMEOUT_INITIAL_VALUE);
do {
for (i=0; i < CBT_DEFAULT_MIN_CIRCUITS_TO_OBSERVE; i++) {
build_time_t sample = circuit_build_times_generate_sample(&initial,0,1);
if (sample > close_ms) {
circuit_build_times_add_time(&estimate, CBT_BUILD_ABANDONED);
} else {
circuit_build_times_add_time(&estimate, sample);
}
circuit_build_times_add_time(&estimate, sample);
}
circuit_build_times_update_alpha(&estimate);
timeout1 = circuit_build_times_calculate_timeout(&estimate,
@ -659,6 +752,7 @@ static struct testcase_t test_array[] = {
{ "ntor_handshake", test_ntor_handshake, 0, NULL, NULL },
{ "fast_handshake", test_fast_handshake, 0, NULL, NULL },
FORK(circuit_timeout),
FORK(circuit_timeout_xm_alpha),
FORK(rend_fns),
END_OF_TESTCASES