Merge remote branch 'sebastian/bug1035' into maint-0.2.2

This commit is contained in:
Nick Mathewson 2011-03-08 15:52:43 -05:00
commit 0d78a16c36
5 changed files with 84 additions and 16 deletions

13
changes/bug1035 Normal file
View File

@ -0,0 +1,13 @@
o Minor features (authorities)
- Take altered router IP addresses and ORPorts into account when
determining router stability. Previously, if a router changed
its IP or ORPort, the authorities would not treat it as having
any downtime for the purposes of stability calculation, whereas
clients would experience downtime since the change could take a
while to propagate to them. Resolves issue 1035.
o Minor bugfixes (authorities)
- Try to be more robust to hops back in time when calculating
router stability. Previously, if a run of uptime or downtime
appeared to be negative, the calculation could give incorrect
results. Bugfix on 0.2.0.6-alpha.

View File

@ -1098,9 +1098,6 @@ connection_or_check_valid_tls_handshake(or_connection_t *conn,
as_advertised = 0;
}
if (authdir_mode_tests_reachability(options)) {
/* We initiated this connection to address:port. Drop all routers
* with the same address:port and a different key.
*/
dirserv_orconn_tls_done(conn->_base.address, conn->_base.port,
digest_rcvd_out, as_advertised);
}

View File

@ -3116,19 +3116,27 @@ dirserv_orconn_tls_done(const char *address,
tor_assert(address);
tor_assert(digest_rcvd);
SMARTLIST_FOREACH(rl->routers, routerinfo_t *, ri, {
/* XXX023 Doing a loop like this is stupid. We should just look up the
* router by digest_rcvd, and see if address, orport, and as_advertised
* match up. -NM */
SMARTLIST_FOREACH_BEGIN(rl->routers, routerinfo_t *, ri) {
if (!strcasecmp(address, ri->address) && or_port == ri->or_port &&
as_advertised &&
!memcmp(ri->cache_info.identity_digest, digest_rcvd, DIGEST_LEN)) {
/* correct digest. mark this router reachable! */
if (!bridge_auth || ri->purpose == ROUTER_PURPOSE_BRIDGE) {
log_info(LD_DIRSERV, "Found router %s to be reachable. Yay.",
ri->nickname);
rep_hist_note_router_reachable(digest_rcvd, now);
tor_addr_t addr, *addrp=NULL;
log_info(LD_DIRSERV, "Found router %s to be reachable at %s:%d. Yay.",
ri->nickname, address, ri->or_port );
if (tor_addr_from_str(&addr, ri->address) != -1)
addrp = &addr;
else
log_warn(LD_BUG, "Couldn't parse IP address \"%s\"", ri->address);
rep_hist_note_router_reachable(digest_rcvd, addrp, or_port, now);
ri->last_reachable = now;
}
}
});
} SMARTLIST_FOREACH_END(ri);
/* FFFF Maybe we should reinstate the code that dumps routers with the same
* addr/port but with nonmatching keys, but instead of dumping, we should
* skip testing. */

View File

@ -14,6 +14,7 @@
#include "circuitlist.h"
#include "circuituse.h"
#include "config.h"
#include "networkstatus.h"
#include "rephist.h"
#include "router.h"
#include "routerlist.h"
@ -73,6 +74,13 @@ typedef struct or_history_t {
/** If nonzero, we have been unable to connect since this time. */
time_t down_since;
/** The address at which we most recently connected to this OR
* successfully. */
tor_addr_t last_reached_addr;
/** The port at which we most recently connected to this OR successfully. */
uint16_t last_reached_port;
/* === For MTBF tracking: */
/** Weighted sum total of all times that this router has been online.
*/
@ -119,6 +127,7 @@ get_or_history(const char* id)
rephist_total_num++;
hist->link_history_map = digestmap_new();
hist->since = hist->changed = time(NULL);
tor_addr_make_unspec(&hist->last_reached_addr);
digestmap_set(history_map, id, hist);
}
return hist;
@ -289,13 +298,20 @@ rep_hist_note_connection_died(const char* id, time_t when)
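/** We have just decided that this router with identity digest <b>id</b> is
* reachable, meaning we will give it a "Running" flag for the next while.
* <b>at_addr</b> and <b>at_port</b> are the address and ORPort at which the
* router was reached; they must be given together, or both omitted (NULL
* and 0). */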
void
rep_hist_note_router_reachable(const char *id, time_t when)
rep_hist_note_router_reachable(const char *id, const tor_addr_t *at_addr,
const uint16_t at_port, time_t when)
{
or_history_t *hist = get_or_history(id);
int was_in_run = 1;
char tbuf[ISO_TIME_LEN+1];
int addr_changed, port_changed;
tor_assert(hist);
tor_assert((!at_addr && !at_port) || (at_addr && at_port));
addr_changed = at_addr &&
tor_addr_compare(at_addr, &hist->last_reached_addr, CMP_EXACT) != 0;
port_changed = at_port && at_port != hist->last_reached_port;
if (!started_tracking_stability)
started_tracking_stability = time(NULL);
@ -315,6 +331,27 @@ rep_hist_note_router_reachable(const char *id, time_t when)
down_length = when - hist->start_of_downtime;
hist->total_weighted_time += down_length;
hist->start_of_downtime = 0;
} else if (addr_changed || port_changed) {
/* If we're reachable, but the address or ORPort changed, treat this as
* some downtime. */
int penalty = get_options()->TestingTorNetwork ? 240 : 3600;
networkstatus_t *ns;
if ((ns = networkstatus_get_latest_consensus())) {
int fresh_interval = (int)(ns->fresh_until - ns->valid_after);
int live_interval = (int)(ns->valid_until - ns->valid_after);
/* On average, a descriptor address change takes half a fresh interval to
* make it into a consensus, and half a liveness period to make it to
* clients. */
penalty = (int)(fresh_interval + live_interval) / 2;
}
format_local_iso_time(tbuf, hist->start_of_run);
log_info(LD_HIST,"Router %s still seems Running, but its address appears "
"to have changed since the last time it was reachable. I'm "
"going to treat it as having been down for %d seconds",
hex_str(id, DIGEST_LEN), penalty);
rep_hist_note_router_unreachable(id, when-penalty);
rep_hist_note_router_reachable(id, NULL, 0, when);
} else {
format_local_iso_time(tbuf, hist->start_of_run);
if (was_in_run)
@ -324,6 +361,10 @@ rep_hist_note_router_reachable(const char *id, time_t when)
log_info(LD_HIST,"Router %s is now Running; it was previously untracked",
hex_str(id, DIGEST_LEN));
}
if (at_addr)
tor_addr_copy(&hist->last_reached_addr, at_addr);
if (at_port)
hist->last_reached_port = at_port;
}
/** We have just decided that this router is unreachable, meaning
@ -344,12 +385,20 @@ rep_hist_note_router_unreachable(const char *id, time_t when)
long run_length = when - hist->start_of_run;
format_local_iso_time(tbuf, hist->start_of_run);
hist->weighted_run_length += run_length;
hist->total_run_weights += 1.0;
hist->start_of_run = 0;
hist->weighted_uptime += run_length;
hist->total_weighted_time += run_length;
if (run_length < 0) {
unsigned long penalty = -run_length;
#define SUBTRACT_CLAMPED(var, penalty) \
do { (var) = (var) < (penalty) ? 0 : (var) - (penalty); } while (0)
SUBTRACT_CLAMPED(hist->weighted_run_length, penalty);
SUBTRACT_CLAMPED(hist->weighted_uptime, penalty);
} else {
hist->weighted_run_length += run_length;
hist->weighted_uptime += run_length;
hist->total_weighted_time += run_length;
}
was_running = 1;
log_info(LD_HIST, "Router %s is now non-Running: it had previously been "
"Running since %s. Its total weighted uptime is %lu/%lu.",
@ -422,7 +471,7 @@ rep_hist_downrate_old_runs(time_t now)
static double
get_stability(or_history_t *hist, time_t when)
{
unsigned long total = hist->weighted_run_length;
long total = hist->weighted_run_length;
double total_weights = hist->total_run_weights;
if (hist->start_of_run) {
@ -458,8 +507,8 @@ get_total_weighted_time(or_history_t *hist, time_t when)
static double
get_weighted_fractional_uptime(or_history_t *hist, time_t when)
{
unsigned long total = hist->total_weighted_time;
unsigned long up = hist->weighted_uptime;
long total = hist->total_weighted_time;
long up = hist->weighted_uptime;
if (hist->start_of_run) {
long run_length = (when - hist->start_of_run);

View File

@ -33,7 +33,8 @@ void rep_hist_update_state(or_state_t *state);
int rep_hist_load_state(or_state_t *state, char **err);
void rep_history_clean(time_t before);
void rep_hist_note_router_reachable(const char *id, time_t when);
void rep_hist_note_router_reachable(const char *id, const tor_addr_t *at_addr,
uint16_t at_port, time_t when);
void rep_hist_note_router_unreachable(const char *id, time_t when);
int rep_hist_record_mtbf_data(time_t now, int missing_means_down);
int rep_hist_load_mtbf_data(time_t now);