r14758@catbus: nickm | 2007-08-21 01:36:03 -0400

Finish implementing and documenting proposal 108: Authorities now use MTBF data to set their stability flags, once they have at least 4 days of data to use.


svn:r11240
This commit is contained in:
Nick Mathewson 2007-08-21 05:37:24 +00:00
parent 8cb6b2bc74
commit 7dbe7fd4d8
6 changed files with 162 additions and 25 deletions

View File

@ -1,4 +1,10 @@
Changes in version 0.2.0.6-alpha - 2007-??-??
o Major features (directory authorities):
- Authorities now track routers by weighted mean-times-between-failures. When we
have 4 or more days of data, use measured MTBF rather than declared
uptime to decide whether to call a router Stable. Implements proposal
108.
o Major bugfixes:
- Handle unexpected whitespace better in malformed descriptors. Bug
found using Benedikt Boss's new Tor fuzzer! Bugfix on 0.2.0.x.

View File

@ -88,8 +88,7 @@ Things we'd like to do in 0.2.0.x:
- Download as needed.
o Serve list as needed.
o Avoid double-checking signatures every time we get a vote.
- Warn about expired stuff.
- Fix all XXXX020s in vote code
. Code to generate consensus from a list of votes
* Detect whether votes are really all for the same period.
. Push/pull documents as appropriate.
@ -100,23 +99,34 @@ Things we'd like to do in 0.2.0.x:
o Have clients know which authorities are v3 authorities, and what
their keys are.
- While we're at it, let v3 authorities have fqdns lines.
- Fix all XXXX020s in vote code
- Validate information properly.
- Warn if we get a vote with different authorities than we know.
- Don't count votes with a different valid-after when generating
the same consensus.
- Dump certificates with the wrong time. Or just warn?
- Warn authority ops when their certs are nearly invalid.
- When checking a consensus, make sure that its times are plausible.
- Add a function that will eventually tell us about our clock skew.
For now, just require that authorities not be skewed.
- Start caching consensus documents once authorities make them
- Start downloading and using consensus documents once caches serve them
. 104: Long and Short Router Descriptors
o Merge proposal
- Drop bandwidth history from router-descriptors
- 105: Version negotiation for the Tor protocol
. 108: Base "Stable" Flag on Mean Time Between Failures
o 108: Base "Stable" Flag on Mean Time Between Failures
o Track mtbf in rephist.c
o Do not delete old stability information if we're an authority.
o Make sure authorities call up/down functions as appropriate.
o Record mtbf between invocations
- Base Stable on mtbf.
- Test mtbf logic.
o Base Stable on mtbf.
o Test mtbf logic.
- 113: Simplifying directory authority administration
- 110: prevent infinite-length circuits (phase one)
- servers should recognize relay_extend cells and pass them
on just like relay cells
- Refactoring:
- Make resolves no longer use edge_connection_t unless they are actually
_on_ a socks connection: have edge_connection_t and (say)

View File

@ -1199,6 +1199,10 @@ Only used by servers. Holds the fingerprint of the server's identity key.
Only for naming authoritative directory servers (see \fBNamingAuthoritativeDirectory\fP). This file lists nickname to identity bindings. Each line lists a nickname and a fingerprint separated by whitespace. See your \fBfingerprint\fP file in the \fIDataDirectory\fP for an example line. If the nickname is \fB!reject\fP then descriptors from the given identity (fingerprint) are rejected by this server. If it is \fB!invalid\fP then descriptors are accepted but marked in the directory as not valid, that is, not recommended.
.LP
.TP
.B \fIDataDirectory\fP/router-stability
Only used by authoritative directory servers. Tracks measurements for router mean-time-between-failures so that authorities have a good idea of how to set their Stable flags.
.LP
.TP
.B \fIHiddenServiceDirectory\fP/hostname
The <base32-encoded-fingerprint>.onion domain name for this hidden service.
.LP

View File

@ -1500,6 +1500,9 @@ should_generate_v2_networkstatus(void)
* network using allegedly high-uptime nodes, displacing all the
* current guards. */
#define UPTIME_TO_GUARANTEE_STABLE (3600*24*30)
/** If a router's MTBF is at least this value (10 days, expressed in
 * seconds), then it is always stable, even if that is below the median
 * MTBF. See above. */
#define MTBF_TO_GUARANTEE_STABLE (60*60*24*10)
/** Similarly, we protect sufficiently fast nodes from being pushed
* out of the set of Fast nodes. */
#define BANDWIDTH_TO_GUARANTEE_FAST (100*1024)
@ -1511,6 +1514,8 @@ should_generate_v2_networkstatus(void)
* dirserv_compute_performance_thresholds, and used by
* generate_v2_networkstatus */
static uint32_t stable_uptime = 0; /* start at a safe value */
static double stable_mtbf = 0.0;
static int enough_mtbf_info = 0;
static uint32_t fast_bandwidth = 0;
static uint32_t guard_bandwidth_including_exits = 0;
static uint32_t guard_bandwidth_excluding_exits = 0;
@ -1539,10 +1544,20 @@ dirserv_thinks_router_is_unreliable(time_t now,
int need_uptime, int need_capacity)
{
if (need_uptime) {
int uptime = real_uptime(router, now);
if ((unsigned)uptime < stable_uptime &&
(unsigned)uptime < UPTIME_TO_GUARANTEE_STABLE)
return 1;
if (!enough_mtbf_info) {
/* XXXX Once most authorities are on v3, we should change the rule from
* "use uptime if we don't have mtbf data" to "don't advertise Stable on
* v3 if we don't have enough mtbf data." */
int uptime = real_uptime(router, now);
if ((unsigned)uptime < stable_uptime &&
(unsigned)uptime < UPTIME_TO_GUARANTEE_STABLE)
return 1;
} else {
double mtbf =
rep_hist_get_stability(router->cache_info.identity_digest, now);
if (mtbf < stable_mtbf && mtbf < MTBF_TO_GUARANTEE_STABLE)
return 1;
}
}
if (need_capacity) {
uint32_t bw = router_get_advertised_bandwidth(router);
@ -1563,6 +1578,17 @@ _compare_uint32(const void **a, const void **b)
return 0;
}
/** Helper for sorting lists of pointers to doubles: compare
 * **(double**)<b>a</b> with **(double**)<b>b</b>.  Return -1 if the first
 * is smaller, 1 if it is larger, and 0 otherwise. */
static int
_compare_double(const void **a, const void **b)
{
  const double lhs = **(const double *const *)a;
  const double rhs = **(const double *const *)b;

  /* Each comparison yields 0 or 1, so the difference is -1, 0, or 1. */
  return (lhs > rhs) - (lhs < rhs);
}
/** Look through the routerlist, and assign the median uptime of running valid
* servers to stable_uptime, and the relative bandwidth capacities to
* fast_bandwidth and guard_bandwidth. Set total_bandwidth to the total
@ -1572,7 +1598,7 @@ _compare_uint32(const void **a, const void **b)
static void
dirserv_compute_performance_thresholds(routerlist_t *rl)
{
smartlist_t *uptimes, *bandwidths, *bandwidths_excluding_exits;
smartlist_t *uptimes, *mtbfs, *bandwidths, *bandwidths_excluding_exits;
time_t now = time(NULL);
/* initialize these all here, in case there are no routers */
@ -1585,16 +1611,21 @@ dirserv_compute_performance_thresholds(routerlist_t *rl)
total_exit_bandwidth = 0;
uptimes = smartlist_create();
mtbfs = smartlist_create();
bandwidths = smartlist_create();
bandwidths_excluding_exits = smartlist_create();
/* XXXX020 we should just use arrays and qsort. */
SMARTLIST_FOREACH(rl->routers, routerinfo_t *, ri, {
if (router_is_active(ri, now)) {
uint32_t *up = tor_malloc(sizeof(uint32_t));
uint32_t *bw = tor_malloc(sizeof(uint32_t));
/* This must be a double*: rep_hist_get_stability() returns a double, and
 * both _compare_double() and the median lookup read the list elements back
 * as doubles. */
double *mtbf = tor_malloc(sizeof(double));
ri->is_exit = exit_policy_is_general_exit(ri->exit_policy);
*up = (uint32_t) real_uptime(ri, now);
smartlist_add(uptimes, up);
*mtbf = rep_hist_get_stability(ri->cache_info.identity_digest, now);
smartlist_add(mtbfs, mtbf);
*bw = router_get_advertised_bandwidth(ri);
total_bandwidth += *bw;
if (ri->is_exit && !ri->is_bad_exit) {
@ -1609,6 +1640,7 @@ dirserv_compute_performance_thresholds(routerlist_t *rl)
});
smartlist_sort(uptimes, _compare_uint32);
smartlist_sort(mtbfs, _compare_double);
smartlist_sort(bandwidths, _compare_uint32);
smartlist_sort(bandwidths_excluding_exits, _compare_uint32);
@ -1616,6 +1648,11 @@ dirserv_compute_performance_thresholds(routerlist_t *rl)
stable_uptime = *(uint32_t*)smartlist_get(uptimes,
smartlist_len(uptimes)/2);
if (smartlist_len(mtbfs))
stable_mtbf = *(double*)smartlist_get(mtbfs,
smartlist_len(mtbfs)/2);
enough_mtbf_info = rep_hist_have_measured_enough_stability();
if (smartlist_len(bandwidths)) {
fast_bandwidth = *(uint32_t*)smartlist_get(bandwidths,
smartlist_len(bandwidths)/8);
@ -1640,9 +1677,11 @@ dirserv_compute_performance_thresholds(routerlist_t *rl)
(unsigned long)guard_bandwidth_excluding_exits);
SMARTLIST_FOREACH(uptimes, uint32_t *, up, tor_free(up));
SMARTLIST_FOREACH(mtbfs, double *, mtbf, tor_free(mtbf));
SMARTLIST_FOREACH(bandwidths, uint32_t *, bw, tor_free(bw));
SMARTLIST_FOREACH(bandwidths_excluding_exits, uint32_t *, bw, tor_free(bw));
smartlist_free(uptimes);
smartlist_free(mtbfs);
smartlist_free(bandwidths);
smartlist_free(bandwidths_excluding_exits);
}

View File

@ -3124,6 +3124,7 @@ int rep_hist_load_mtbf_data(time_t now);
time_t rep_hist_downrate_old_runs(time_t now);
double rep_hist_get_stability(const char *id, time_t when);
int rep_hist_have_measured_enough_stability(void);
void rep_hist_note_used_port(uint16_t port, time_t now);
smartlist_t *rep_hist_get_predicted_ports(time_t now);

View File

@ -20,9 +20,18 @@ static void hs_usage_init(void);
uint64_t rephist_total_alloc=0;
uint32_t rephist_total_num=0;
/** If the total weighted run count of all runs for a router ever falls
* below this amount, the router can be treated as having 0 MTBF. */
#define STABILITY_EPSILON 0.0001
#define STABILITY_ALPHA 0.9
/** Value by which to discount all old intervals for MTBF purposes. This
 * is compounded every STABILITY_INTERVAL. */
#define STABILITY_ALPHA 0.95
/** Interval at which to discount all old intervals for MTBF purposes. */
#define STABILITY_INTERVAL (12*60*60)
/* (This combination of ALPHA, INTERVAL, and EPSILON makes it so that an
* interval that just ended counts twice as much as one that ended a week ago,
* 20X as much as one that ended a month ago, and routers that have had no
* uptime data for about half a year will get forgotten.) */
/** History of an OR-\>OR link. */
typedef struct link_history_t {
@ -56,18 +65,30 @@ typedef struct or_history_t {
time_t up_since;
/** If nonzero, we have been unable to connect since this time. */
time_t down_since;
/** DOCDOC */
/* === For MTBF tracking: */
/** Weighted sum total of all times that this router has been online.
*/
unsigned long weighted_run_length;
/** If the router is now online (according to stability-checking rules),
* when did it come online? */
time_t start_of_run;
/** Sum of weights for runs in weighted_run_length. */
double total_run_weights;
/** Map from hex OR2 identity digest to a link_history_t for the link
* from this OR to OR2. */
digestmap_t *link_history_map;
} or_history_t;
/** DOCDOC */
/** When did we last multiply all routers' weighted_run_length and
* total_run_weights by STABILITY_ALPHA? */
static time_t stability_last_downrated = 0;
/** When did we start tracking router stability?  Zero if we have not
 * started yet.  Set the first time a router is noted as reachable or
 * unreachable, or restored when MTBF data is loaded from disk. */
static time_t started_tracking_stability = 0;
/** Map from hex OR identity digest to or_history_t. */
static digestmap_t *history_map = NULL;
@ -163,7 +184,9 @@ rep_hist_init(void)
hs_usage_init();
}
/** DOCDOC */
/** Helper: note that we are no longer connected to the router with history
* <b>hist</b>. If <b>failed</b>, the connection failed; otherwise, it was
* closed correctly. */
static void
mark_or_down(or_history_t *hist, time_t when, int failed)
{
@ -176,7 +199,8 @@ mark_or_down(or_history_t *hist, time_t when, int failed)
}
}
/** DOCDOC */
/** Helper: note that we are connected to the router with history
* <b>hist</b>. */
static void
mark_or_up(or_history_t *hist, time_t when)
{
@ -259,6 +283,8 @@ void
rep_hist_note_router_reachable(const char *id, time_t when)
{
or_history_t *hist = get_or_history(id);
if (!started_tracking_stability)
started_tracking_stability = time(NULL);
if (hist && !hist->start_of_run) {
hist->start_of_run = when;
}
@ -270,6 +296,8 @@ void
rep_hist_note_router_unreachable(const char *id, time_t when)
{
or_history_t *hist = get_or_history(id);
if (!started_tracking_stability)
started_tracking_stability = time(NULL);
if (hist && hist->start_of_run) {
/*XXXX020 treat failure specially? */
long run_length = when - hist->start_of_run;
@ -279,7 +307,8 @@ rep_hist_note_router_unreachable(const char *id, time_t when)
}
}
/**DOCDOC*/
/** Helper: Discount all old MTBF data, if it is time to do so. Return
* the time at which we should next discount MTBF data. */
time_t
rep_hist_downrate_old_runs(time_t now)
{
@ -296,11 +325,13 @@ rep_hist_downrate_old_runs(time_t now)
if (stability_last_downrated + STABILITY_INTERVAL > now)
return stability_last_downrated + STABILITY_INTERVAL;
/* Okay, we should downrate the data. By how much? */
while (stability_last_downrated + STABILITY_INTERVAL < now) {
stability_last_downrated += STABILITY_INTERVAL;
alpha *= STABILITY_ALPHA;
}
/* Multiply every w_r_l, t_r_w pair by alpha. */
for (orhist_it = digestmap_iter_init(history_map);
!digestmap_iter_done(orhist_it);
orhist_it = digestmap_iter_next(history_map,orhist_it)) {
@ -315,7 +346,7 @@ rep_hist_downrate_old_runs(time_t now)
return stability_last_downrated + STABILITY_INTERVAL;
}
/** DOCDOC */
/** Helper: Return the weighted MTBF of the router with history <b>hist</b>. */
static double
get_stability(or_history_t *hist, time_t when)
{
@ -323,16 +354,21 @@ get_stability(or_history_t *hist, time_t when)
double total_weights = hist->total_run_weights;
if (hist->start_of_run) {
/* We're currently in a run. Let total and total_weights hold the values
* they would hold if the current run were to end now. */
total += (when-hist->start_of_run);
total_weights += 1.0;
}
if (total_weights < STABILITY_EPSILON)
if (total_weights < STABILITY_EPSILON) {
/* Round down to zero, and avoid divide-by-zero. */
return 0.0;
}
return total / total_weights;
}
/**DOCDOC*/
/** Return an estimated MTBF for the router whose identity digest is
* <b>id</b>. Return 0 if the router is unknown. */
double
rep_hist_get_stability(const char *id, time_t when)
{
@ -343,6 +379,16 @@ rep_hist_get_stability(const char *id, time_t when)
return get_stability(hist, when);
}
/** How long (in seconds) we must have been tracking router stability before
 * we trust our measured MTBFs enough to base the Stable flag on them.  The
 * changelog entry for this feature promises "4 or more days of data". */
#define MIN_STABILITY_TRACKING_TIME (4*24*60*60)

/** Return true if we've been measuring MTBFs for long enough to
 * pronounce on Stability.  Return false if we have not started tracking
 * stability at all. */
int
rep_hist_have_measured_enough_stability(void)
{
  /* XXXX020 This doesn't do so well when we change our opinion
   * as to whether we're tracking router stability. */
  if (!started_tracking_stability) {
    /* A zero start time means "never started", not "started at the epoch";
     * without this check we would claim to have years of data. */
    return 0;
  }
  return started_tracking_stability <
         time(NULL) - MIN_STABILITY_TRACKING_TIME;
}
/** Remember that we successfully extended from the OR with identity
* digest <b>from_id</b> to the OR with identity digest
* <b>to_name</b>.
@ -502,7 +548,8 @@ rep_history_clean(time_t before)
}
}
/** DOCDOC */
/** Return a newly allocated string holding the filename in which we store
* MTBF information. */
static char *
get_mtbf_filename(void)
{
@ -513,7 +560,7 @@ get_mtbf_filename(void)
return fn;
}
/** DOCDOC */
/** Write MTBF data to disk. Returns 0 on success, negative on failure. */
int
rep_hist_record_mtbf_data(void)
{
@ -526,6 +573,16 @@ rep_hist_record_mtbf_data(void)
void *or_history_p;
or_history_t *hist;
/* File format is:
* FormatLine *KeywordLine Data
*
* FormatLine = "format 1" NL
* KeywordLine = Keyword SP Arguments NL
* Data = "data" NL *RouterMTBFLine "." NL
* RouterMTBFLine = Fingerprint SP WeightedRunLen SP
* TotalRunWeights [SP S=StartRunTime] NL
*/
lines = smartlist_create();
smartlist_add(lines, tor_strdup("format 1\n"));
@ -534,6 +591,11 @@ rep_hist_record_mtbf_data(void)
tor_snprintf(buf, sizeof(buf), "stored-at %s\n", time_buf);
smartlist_add(lines, tor_strdup(buf));
if (started_tracking_stability) {
format_iso_time(time_buf, started_tracking_stability);
tor_snprintf(buf, sizeof(buf), "tracked-since %s\n", time_buf);
smartlist_add(lines, tor_strdup(buf));
}
if (stability_last_downrated) {
format_iso_time(time_buf, stability_last_downrated);
tor_snprintf(buf, sizeof(buf), "last-downrated %s\n", time_buf);
@ -579,7 +641,8 @@ rep_hist_record_mtbf_data(void)
}
}
/** DOCDOC */
/** Load MTBF data from disk. Returns 0 on success or recoverable error, -1
* on failure. */
int
rep_hist_load_mtbf_data(time_t now)
{
@ -587,7 +650,8 @@ rep_hist_load_mtbf_data(time_t now)
smartlist_t *lines;
const char *line = NULL;
int r=0, i;
time_t last_downrated = 0, stored_at = 0;
time_t last_downrated = 0, stored_at = 0, tracked_since = 0;
time_t latest_possible_start = now;
{
char *filename = get_mtbf_filename();
@ -618,9 +682,16 @@ rep_hist_load_mtbf_data(time_t now)
log_warn(LD_GENERAL,"Couldn't parse stored time in mtbf "
"history file.");
}
if (!strcmpstart(line, "tracked-since ")) {
if (parse_iso_time(line+strlen("tracked-since "), &tracked_since)<0)
log_warn(LD_GENERAL,"Couldn't parse started-tracking time in mtbf "
"history file.");
}
}
if (last_downrated > now)
last_downrated = now;
if (tracked_since > now)
tracked_since = now;
if (!stored_at) {
log_warn(LD_GENERAL, "No stored time recorded.");
@ -635,7 +706,7 @@ rep_hist_load_mtbf_data(time_t now)
char hexbuf[HEX_DIGEST_LEN+1];
char timebuf[ISO_TIME_LEN+1];
time_t start_of_run = 0;
unsigned long wrl;
long wrl;
double trw;
int n;
or_history_t *hist;
@ -643,7 +714,7 @@ rep_hist_load_mtbf_data(time_t now)
if (!strcmp(line, "."))
break;
/* XXXX020 audit the heck out of my scanf usage. */
n = sscanf(line, "%40s %lu %lf S=%10s %8s",
n = sscanf(line, "%40s %ld %lf S=%10s %8s",
hexbuf, &wrl, &trw, timebuf, timebuf+11);
if (n != 3 && n != 5) {
log_warn(LD_GENERAL, "Couldn't scan line %s", escaped(line));
@ -668,6 +739,8 @@ rep_hist_load_mtbf_data(time_t now)
long run_length = stored_at - start_of_run;
hist->start_of_run = now - run_length;
}
if (hist->start_of_run < latest_possible_start + wrl)
latest_possible_start = hist->start_of_run - wrl;
hist->weighted_run_length = wrl;
hist->total_run_weights = trw;
@ -675,7 +748,11 @@ rep_hist_load_mtbf_data(time_t now)
if (strcmp(line, "."))
log_warn(LD_GENERAL, "Truncated MTBF file.");
if (!tracked_since)
tracked_since = latest_possible_start;
stability_last_downrated = last_downrated;
started_tracking_stability = tracked_since;
goto done;
err: