diff --git a/ChangeLog b/ChangeLog index 2033738d8e..ebb7b657b8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,10 @@ Changes in version 0.2.0.6-alpha - 2007-??-?? + o Major features (directory authorities): + - Track authorities by weighted mean-times-between-failures. When we + have 4 or more days of data, use measured MTBF rather than declared + uptime to decide whether to call a router Stable. Implements proposal + 108. + o Major bugfixes: - Handle unexpected whitespace better in malformed descriptors. Bug found using Benedikt Boss's new Tor fuzzer! Bugfix on 0.2.0.x. diff --git a/doc/TODO b/doc/TODO index 4804ae1ba0..fa6bce9c85 100644 --- a/doc/TODO +++ b/doc/TODO @@ -88,8 +88,7 @@ Things we'd like to do in 0.2.0.x: - Download as needed. o Serve list as needed. o Avoid double-checking signatures every time we get a vote. - - Warn about expired stuff. - - Fix all XXXX020s in vote code + . Code to generate consensus from a list of votes * Detect whether votes are really all for the same period. . Push/pull documents as appropriate. @@ -100,23 +99,34 @@ Things we'd like to do in 0.2.0.x: o Have clients know which authorities are v3 authorities, and what their keys are. - While we're at it, let v3 authorities have fqdns lines. + - Fix all XXXX020s in vote code + - Validate information properly. + - Warn if we get a vote with different authorities than we know. + - Don't count votes with a different valid-after when generating + the same consensus. + - Dump certificates with the wrong time. Or just warn? + - Warn authority ops when their certs are nearly invalid. + - When checking a consensus, make sure that its times are plausible. + - Add a function that will eventually tell us about our clock skew. + For now, just require that authorities not be skewed. - Start caching consensus documents once authorities make them - Start downloading and using consensus documents once caches serve them . 
104: Long and Short Router Descriptors o Merge proposal - Drop bandwidth history from router-descriptors - 105: Version negotiation for the Tor protocol - . 108: Base "Stable" Flag on Mean Time Between Failures + o 108: Base "Stable" Flag on Mean Time Between Failures o Track mtbf in rephist.c o Do not delete old stability information if we're an authority. o Make sure authorities call up/down functions as appropriate. o Record mtbf between invocations - - Base Stable on mtbf. - - Test mtbf logic. + o Base Stable on mtbf. + o Test mtbf logic. - 113: Simplifying directory authority administration - 110: prevent infinite-length circuits (phase one) - servers should recognize relay_extend cells and pass them on just like relay cells + - Refactoring: - Make resolves no longer use edge_connection_t unless they are actually _on_ a socks connection: have edge_connection_t and (say) diff --git a/doc/tor.1.in b/doc/tor.1.in index 5be54902ef..e53a89e263 100644 --- a/doc/tor.1.in +++ b/doc/tor.1.in @@ -1199,6 +1199,10 @@ Only used by servers. Holds the fingerprint of the server's identity key. Only for naming authoritative directory servers (see \fBNamingAuthoritativeDirectory\fP). This file lists nickname to identity bindings. Each line lists a nickname and a fingerprint separated by whitespace. See your \fBfingerprint\fP file in the \fIDataDirectory\fP for an example line. If the nickname is \fB!reject\fP then descriptors from the given identity (fingerprint) are rejected by this server. If it is \fB!invalid\fP then descriptors are accepted but marked in the directory as not valid, that is, not recommended. .LP .TP +.B \fIDataDirectory\fP/router-stability +Only used by authoritative directory servers. Tracks measurements for router mean-time-between-failures so that authorities have a good idea of how to set their Stable flags. +.LP +.TP .B \fIHiddenServiceDirectory\fP/hostname The .onion domain name for this hidden service. 
.LP diff --git a/src/or/dirserv.c b/src/or/dirserv.c index 4762b99805..1401a6c907 100644 --- a/src/or/dirserv.c +++ b/src/or/dirserv.c @@ -1500,6 +1500,9 @@ should_generate_v2_networkstatus(void) * network using allegedly high-uptime nodes, displacing all the * current guards. */ #define UPTIME_TO_GUARANTEE_STABLE (3600*24*30) +/* If a router's MTBF is at least this value (ten days), then it is always + * stable. See above. */ +#define MTBF_TO_GUARANTEE_STABLE (60*60*24*10) /** Similarly, we protect sufficiently fast nodes from being pushed * out of the set of Fast nodes. */ #define BANDWIDTH_TO_GUARANTEE_FAST (100*1024) @@ -1511,6 +1514,8 @@ should_generate_v2_networkstatus(void) * dirserv_compute_performance_thresholds, and used by * generate_v2_networkstatus */ static uint32_t stable_uptime = 0; /* start at a safe value */ +static double stable_mtbf = 0.0; /* median MTBF of the active routers */ +static int enough_mtbf_info = 0; /* nonzero: judge Stable by MTBF, not uptime */ static uint32_t fast_bandwidth = 0; static uint32_t guard_bandwidth_including_exits = 0; static uint32_t guard_bandwidth_excluding_exits = 0; @@ -1539,10 +1544,20 @@ dirserv_thinks_router_is_unreliable(time_t now, int need_uptime, int need_capacity) { if (need_uptime) { - int uptime = real_uptime(router, now); - if ((unsigned)uptime < stable_uptime && - (unsigned)uptime < UPTIME_TO_GUARANTEE_STABLE) - return 1; + if (!enough_mtbf_info) { + /* XXXX Once most authorities are on v3, we should change the rule from + * "use uptime if we don't have mtbf data" to "don't advertise Stable on + * v3 if we don't have enough mtbf data." 
*/ + int uptime = real_uptime(router, now); + if ((unsigned)uptime < stable_uptime && + (unsigned)uptime < UPTIME_TO_GUARANTEE_STABLE) + return 1; + } else { + double mtbf = + rep_hist_get_stability(router->cache_info.identity_digest, now); + if (mtbf < stable_mtbf && mtbf < MTBF_TO_GUARANTEE_STABLE) + return 1; + } } if (need_capacity) { uint32_t bw = router_get_advertised_bandwidth(router); @@ -1563,6 +1578,17 @@ _compare_uint32(const void **a, const void **b) return 0; } +/** Helper: compare **(double**)a to **(double**)b. Return -1 if the + * first is smaller, 1 if it is larger, and 0 otherwise. */ +static int +_compare_double(const void **a, const void **b) +{ + double first = **(double **)a, second = **(double **)b; + if (first < second) return -1; + if (first > second) return 1; + return 0; +} + /** Look through the routerlist, and assign the median uptime of running valid * servers to stable_uptime, and the relative bandwidth capacities to * fast_bandwidth and guard_bandwidth. Set total_bandwidth to the total @@ -1572,7 +1598,7 @@ _compare_uint32(const void **a, const void **b) static void dirserv_compute_performance_thresholds(routerlist_t *rl) { - smartlist_t *uptimes, *bandwidths, *bandwidths_excluding_exits; + smartlist_t *uptimes, *mtbfs, *bandwidths, *bandwidths_excluding_exits; time_t now = time(NULL); /* initialize these all here, in case there are no routers */ @@ -1585,16 +1611,21 @@ dirserv_compute_performance_thresholds(routerlist_t *rl) total_exit_bandwidth = 0; uptimes = smartlist_create(); + mtbfs = smartlist_create(); bandwidths = smartlist_create(); bandwidths_excluding_exits = smartlist_create(); + /* XXXX020 we should just use arrays and qsort. 
*/ SMARTLIST_FOREACH(rl->routers, routerinfo_t *, ri, { if (router_is_active(ri, now)) { uint32_t *up = tor_malloc(sizeof(uint32_t)); uint32_t *bw = tor_malloc(sizeof(uint32_t)); + double *mtbf = tor_malloc(sizeof(double)); /* read back as a double */ ri->is_exit = exit_policy_is_general_exit(ri->exit_policy); *up = (uint32_t) real_uptime(ri, now); smartlist_add(uptimes, up); + *mtbf = rep_hist_get_stability(ri->cache_info.identity_digest, now); + smartlist_add(mtbfs, mtbf); *bw = router_get_advertised_bandwidth(ri); total_bandwidth += *bw; if (ri->is_exit && !ri->is_bad_exit) { @@ -1609,6 +1640,7 @@ dirserv_compute_performance_thresholds(routerlist_t *rl) }); smartlist_sort(uptimes, _compare_uint32); + smartlist_sort(mtbfs, _compare_double); smartlist_sort(bandwidths, _compare_uint32); smartlist_sort(bandwidths_excluding_exits, _compare_uint32); @@ -1616,6 +1648,11 @@ dirserv_compute_performance_thresholds(routerlist_t *rl) stable_uptime = *(uint32_t*)smartlist_get(uptimes, smartlist_len(uptimes)/2); + if (smartlist_len(mtbfs)) + stable_mtbf = *(double*)smartlist_get(mtbfs, + smartlist_len(mtbfs)/2); + enough_mtbf_info = rep_hist_have_measured_enough_stability(); + if (smartlist_len(bandwidths)) { fast_bandwidth = *(uint32_t*)smartlist_get(bandwidths, smartlist_len(bandwidths)/8); @@ -1640,9 +1677,11 @@ dirserv_compute_performance_thresholds(routerlist_t *rl) (unsigned long)guard_bandwidth_excluding_exits); SMARTLIST_FOREACH(uptimes, uint32_t *, up, tor_free(up)); + SMARTLIST_FOREACH(mtbfs, double *, mtbf, tor_free(mtbf)); SMARTLIST_FOREACH(bandwidths, uint32_t *, bw, tor_free(bw)); SMARTLIST_FOREACH(bandwidths_excluding_exits, uint32_t *, bw, tor_free(bw)); smartlist_free(uptimes); + smartlist_free(mtbfs); smartlist_free(bandwidths); smartlist_free(bandwidths_excluding_exits); } diff --git a/src/or/or.h b/src/or/or.h index 809d69695c..409af8ade3 100644 --- a/src/or/or.h +++ b/src/or/or.h @@ -3124,6 +3124,7 @@ int rep_hist_load_mtbf_data(time_t now); time_t 
rep_hist_downrate_old_runs(time_t now); double rep_hist_get_stability(const char *id, time_t when); +int rep_hist_have_measured_enough_stability(void); void rep_hist_note_used_port(uint16_t port, time_t now); smartlist_t *rep_hist_get_predicted_ports(time_t now); diff --git a/src/or/rephist.c b/src/or/rephist.c index 0474ee644a..254f5f3b2d 100644 --- a/src/or/rephist.c +++ b/src/or/rephist.c @@ -20,9 +20,18 @@ static void hs_usage_init(void); uint64_t rephist_total_alloc=0; uint32_t rephist_total_num=0; +/** If the total weighted run count of all runs for a router ever falls + * below this amount, the router can be treated as having 0 MTBF. */ #define STABILITY_EPSILON 0.0001 -#define STABILITY_ALPHA 0.9 +/** Value by which to discount all old intervals for MTBF purposes. This + * is compounded every STABILITY_INTERVAL. */ +#define STABILITY_ALPHA 0.95 +/** Interval at which to discount all old intervals for MTBF purposes. */ #define STABILITY_INTERVAL (12*60*60) +/* (This combination of ALPHA, INTERVAL, and EPSILON makes it so that an + * interval that just ended counts twice as much as one that ended a week ago, + * 20X as much as one that ended a month ago, and routers that have had no + * uptime data for about half a year will get forgotten.) */ /** History of an OR-\>OR link. */ typedef struct link_history_t { @@ -56,18 +65,30 @@ typedef struct or_history_t { time_t up_since; /** If nonzero, we have been unable to connect since this time. */ time_t down_since; - /** DOCDOC */ + + + /* === For MTBF tracking: */ + /** Weighted sum total of all times that this router has been online. + */ unsigned long weighted_run_length; + /** If the router is now online (according to stability-checking rules), + * when did it come online? */ time_t start_of_run; + /** Sum of weights for runs in weighted_run_length. */ double total_run_weights; + /** Map from hex OR2 identity digest to a link_history_t for the link * from this OR to OR2. 
*/ digestmap_t *link_history_map; } or_history_t; -/** DOCDOC */ +/** When did we last multiply all routers' weighted_run_length and + * total_run_weights by STABILITY_ALPHA? */ static time_t stability_last_downrated = 0; +/** When did we first start tracking router stability? Zero if never. */ +static time_t started_tracking_stability = 0; + /** Map from hex OR identity digest to or_history_t. */ static digestmap_t *history_map = NULL; @@ -163,7 +184,9 @@ rep_hist_init(void) hs_usage_init(); } -/** DOCDOC */ +/** Helper: note that we are no longer connected to the router with history + * hist. If failed, the connection failed; otherwise, it was + * closed correctly. */ static void mark_or_down(or_history_t *hist, time_t when, int failed) { @@ -176,7 +199,8 @@ mark_or_down(or_history_t *hist, time_t when, int failed) } } -/** DOCDOC */ +/** Helper: note that we are connected to the router with history + * hist. */ static void mark_or_up(or_history_t *hist, time_t when) { @@ -259,6 +283,8 @@ void rep_hist_note_router_reachable(const char *id, time_t when) { or_history_t *hist = get_or_history(id); + if (!started_tracking_stability) + started_tracking_stability = time(NULL); if (hist && !hist->start_of_run) { hist->start_of_run = when; } @@ -270,6 +296,8 @@ void rep_hist_note_router_unreachable(const char *id, time_t when) { or_history_t *hist = get_or_history(id); + if (!started_tracking_stability) + started_tracking_stability = time(NULL); if (hist && hist->start_of_run) { /*XXXX020 treat failure specially? */ long run_length = when - hist->start_of_run; @@ -279,7 +307,8 @@ rep_hist_note_router_unreachable(const char *id, time_t when) } } -/**DOCDOC*/ +/** Helper: Discount all old MTBF data, if it is time to do so. Return + * the time at which we should next discount MTBF data. 
*/ time_t rep_hist_downrate_old_runs(time_t now) { @@ -296,11 +325,13 @@ rep_hist_downrate_old_runs(time_t now) if (stability_last_downrated + STABILITY_INTERVAL > now) return stability_last_downrated + STABILITY_INTERVAL; + /* Okay, we should downrate the data. By how much? */ while (stability_last_downrated + STABILITY_INTERVAL < now) { stability_last_downrated += STABILITY_INTERVAL; alpha *= STABILITY_ALPHA; } + /* Multiply every w_r_l, t_r_w pair by alpha. */ for (orhist_it = digestmap_iter_init(history_map); !digestmap_iter_done(orhist_it); orhist_it = digestmap_iter_next(history_map,orhist_it)) { @@ -315,7 +346,7 @@ rep_hist_downrate_old_runs(time_t now) return stability_last_downrated + STABILITY_INTERVAL; } -/** DOCDOC */ +/** Helper: Return the weighted MTBF of the router with history hist. */ static double get_stability(or_history_t *hist, time_t when) { @@ -323,16 +354,21 @@ get_stability(or_history_t *hist, time_t when) double total_weights = hist->total_run_weights; if (hist->start_of_run) { + /* We're currently in a run. Let total and total_weights hold the values + * they would hold if the current run were to end now. */ total += (when-hist->start_of_run); total_weights += 1.0; } - if (total_weights < STABILITY_EPSILON) + if (total_weights < STABILITY_EPSILON) { + /* Round down to zero, and avoid divide-by-zero. */ return 0.0; + } return total / total_weights; } -/**DOCDOC*/ +/** Return an estimated MTBF for the router whose identity digest is + * id. Return 0 if the router is unknown. */ double rep_hist_get_stability(const char *id, time_t when) { @@ -343,6 +379,16 @@ rep_hist_get_stability(const char *id, time_t when) return get_stability(hist, when); } +/** Return true if we've been measuring MTBFs for long enough to + * pronounce on Stability. */ +int +rep_hist_have_measured_enough_stability(void) +{ + /* XXXX020 This doesn't do so well when we change our opinion + * as to whether we're tracking router stability. 
*/ + return started_tracking_stability < time(NULL) - 4*60*60; +} + /** Remember that we successfully extended from the OR with identity * digest from_id to the OR with identity digest * to_name. @@ -502,7 +548,8 @@ rep_history_clean(time_t before) } } -/** DOCDOC */ +/** Return a newly allocated string holding the filename in which we store + * MTBF information. */ static char * get_mtbf_filename(void) { @@ -513,7 +560,7 @@ get_mtbf_filename(void) return fn; } -/** DOCDOC */ +/** Write MTBF data to disk. Returns 0 on success, negative on failure. */ int rep_hist_record_mtbf_data(void) { @@ -526,6 +573,16 @@ rep_hist_record_mtbf_data(void) void *or_history_p; or_history_t *hist; + /* File format is: + * FormatLine *KeywordLine Data + * + * FormatLine = "format 1" NL + * KeywordLine = Keyword SP Arguments NL + * Data = "data" NL *RouterMTBFLine "." NL + * RouterMTBFLine = Fingerprint SP WeightedRunLen SP + * TotalRunWeights [SP S=StartRunTime] NL + */ + lines = smartlist_create(); smartlist_add(lines, tor_strdup("format 1\n")); @@ -534,6 +591,11 @@ rep_hist_record_mtbf_data(void) tor_snprintf(buf, sizeof(buf), "stored-at %s\n", time_buf); smartlist_add(lines, tor_strdup(buf)); + if (started_tracking_stability) { + format_iso_time(time_buf, started_tracking_stability); + tor_snprintf(buf, sizeof(buf), "tracked-since %s\n", time_buf); + smartlist_add(lines, tor_strdup(buf)); + } if (stability_last_downrated) { format_iso_time(time_buf, stability_last_downrated); tor_snprintf(buf, sizeof(buf), "last-downrated %s\n", time_buf); @@ -579,7 +641,8 @@ rep_hist_record_mtbf_data(void) } } -/** DOCDOC */ +/** Load MTBF data from disk. Returns 0 on success or recoverable error, -1 + * on failure. 
*/ int rep_hist_load_mtbf_data(time_t now) { @@ -587,7 +650,8 @@ rep_hist_load_mtbf_data(time_t now) smartlist_t *lines; const char *line = NULL; int r=0, i; - time_t last_downrated = 0, stored_at = 0; + time_t last_downrated = 0, stored_at = 0, tracked_since = 0; + time_t latest_possible_start = now; { char *filename = get_mtbf_filename(); @@ -618,9 +682,16 @@ rep_hist_load_mtbf_data(time_t now) log_warn(LD_GENERAL,"Couldn't parse stored time in mtbf " "history file."); } + if (!strcmpstart(line, "tracked-since ")) { + if (parse_iso_time(line+strlen("tracked-since "), &tracked_since)<0) + log_warn(LD_GENERAL,"Couldn't parse started-tracking time in mtbf " + "history file."); + } } if (last_downrated > now) last_downrated = now; + if (tracked_since > now) + tracked_since = now; if (!stored_at) { log_warn(LD_GENERAL, "No stored time recorded."); @@ -635,7 +706,7 @@ rep_hist_load_mtbf_data(time_t now) char hexbuf[HEX_DIGEST_LEN+1]; char timebuf[ISO_TIME_LEN+1]; time_t start_of_run = 0; - unsigned long wrl; + long wrl; double trw; int n; or_history_t *hist; @@ -643,7 +714,7 @@ rep_hist_load_mtbf_data(time_t now) if (!strcmp(line, ".")) break; /* XXXX020 audit the heck out of my scanf usage. 
*/ - n = sscanf(line, "%40s %lu %lf S=%10s %8s", + n = sscanf(line, "%40s %ld %lf S=%10s %8s", hexbuf, &wrl, &trw, timebuf, timebuf+11); if (n != 3 && n != 5) { log_warn(LD_GENERAL, "Couldn't scan line %s", escaped(line)); @@ -668,6 +739,8 @@ rep_hist_load_mtbf_data(time_t now) long run_length = stored_at - start_of_run; hist->start_of_run = now - run_length; } + if (hist->start_of_run < latest_possible_start + wrl) + latest_possible_start = hist->start_of_run - wrl; hist->weighted_run_length = wrl; hist->total_run_weights = trw; @@ -675,7 +748,11 @@ rep_hist_load_mtbf_data(time_t now) if (strcmp(line, ".")) log_warn(LD_GENERAL, "Truncated MTBF file."); + if (!tracked_since) + tracked_since = latest_possible_start; + stability_last_downrated = last_downrated; + started_tracking_stability = tracked_since; goto done; err: