From e7bc189f7c8fb4c2a490f10bd26d81893626ade1 Mon Sep 17 00:00:00 2001 From: Roger Dingledine Date: Sat, 20 Jun 2009 05:25:14 -0400 Subject: [PATCH] the third piece of bug 969 fixing When we write out our stability info, detect relays that have slipped through the cracks. Log them and correct the problem. If we continue to see a lot of these over time, it means there's another spot where relays fall out of the routerlist without being marked as unreachable. --- src/or/main.c | 9 +++++---- src/or/or.h | 2 +- src/or/rephist.c | 20 ++++++++++++++++++-- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/or/main.c b/src/or/main.c index 8fc712bba3..60c42aaae3 100644 --- a/src/or/main.c +++ b/src/or/main.c @@ -903,7 +903,7 @@ run_scheduled_events(time_t now) time_to_downrate_stability = rep_hist_downrate_old_runs(now); if (authdir_mode_tests_reachability(options)) { if (time_to_save_stability < now) { - if (time_to_save_stability && rep_hist_record_mtbf_data()<0) { + if (time_to_save_stability && rep_hist_record_mtbf_data(now, 1)<0) { log_warn(LD_GENERAL, "Couldn't store mtbf data."); } #define SAVE_STABILITY_INTERVAL (30*60) @@ -1955,14 +1955,15 @@ tor_cleanup(void) /* Remove our pid file. We don't care if there was an error when we * unlink, nothing we could do about it anyways. */ if (options->command == CMD_RUN_TOR) { + time_t now = time(NULL); if (options->PidFile) unlink(options->PidFile); if (accounting_is_enabled(options)) - accounting_record_bandwidth_usage(time(NULL), get_or_state()); + accounting_record_bandwidth_usage(now, get_or_state()); or_state_mark_dirty(get_or_state(), 0); /* force an immediate save. 
*/ - or_state_save(time(NULL)); + or_state_save(now); if (authdir_mode_tests_reachability(options)) - rep_hist_record_mtbf_data(); + rep_hist_record_mtbf_data(now, 0); } #ifdef USE_DMALLOC dmalloc_log_stats(); diff --git a/src/or/or.h b/src/or/or.h index f37b417fe5..eddeda1531 100644 --- a/src/or/or.h +++ b/src/or/or.h @@ -3970,7 +3970,7 @@ void rep_history_clean(time_t before); void rep_hist_note_router_reachable(const char *id, time_t when); void rep_hist_note_router_unreachable(const char *id, time_t when); -int rep_hist_record_mtbf_data(void); +int rep_hist_record_mtbf_data(time_t now, int missing_means_down); int rep_hist_load_mtbf_data(time_t now); time_t rep_hist_downrate_old_runs(time_t now); diff --git a/src/or/rephist.c b/src/or/rephist.c index 11e040c945..13fdb58b5e 100644 --- a/src/or/rephist.c +++ b/src/or/rephist.c @@ -683,9 +683,13 @@ rep_history_clean(time_t before) } } -/** Write MTBF data to disk. Returns 0 on success, negative on failure. */ +/** Write MTBF data to disk. Return 0 on success, negative on failure. + * + * If missing_means_down, then if we're about to write an entry + * that is still considered up but isn't in our routerlist, consider it + * to be down. */ int -rep_hist_record_mtbf_data(void) +rep_hist_record_mtbf_data(time_t now, int missing_means_down) { char time_buf[ISO_TIME_LEN+1]; @@ -745,6 +749,18 @@ rep_hist_record_mtbf_data(void) hist = (or_history_t*) or_history_p; base16_encode(dbuf, sizeof(dbuf), digest, DIGEST_LEN); + + if (missing_means_down && hist->start_of_run && + !router_get_by_digest(digest)) { + /* We think this relay is running, but it's not listed in our + * routerlist. Somehow it fell out without telling us it went + * down. Complain and also correct it. */ + log_info(LD_HIST, + "Relay '%s' is listed as up in rephist, but it's not in " + "our routerlist. 
Correcting.", dbuf); + rep_hist_note_router_unreachable(digest, now); + } + PRINTF((f, "R %s\n", dbuf)); if (hist->start_of_run > 0) { format_iso_time(time_buf, hist->start_of_run);