From dcabf801e52a83e2c3cc23ccc1fa906582a927d6 Mon Sep 17 00:00:00 2001 From: David Goulet Date: Wed, 8 Nov 2017 09:44:39 -0500 Subject: [PATCH 1/6] sched: Ignore closed channel after flushing cells The flush cells process can close a channel if the connection write fails but still return that it flushed at least one cell. This is due because the error is not propagated up the call stack so there is no way of knowing if the flush actually was successful or not. Because this would require an important refactoring touching multiple subsystems, this patch is a bandaid to avoid the KIST scheduler to handle closed channel in its loop. Bandaid on #23751. Signed-off-by: David Goulet --- changes/bug23751 | 6 ++++++ src/or/scheduler_kist.c | 9 +++++++++ 2 files changed, 15 insertions(+) create mode 100644 changes/bug23751 diff --git a/changes/bug23751 b/changes/bug23751 new file mode 100644 index 0000000000..2fd7021664 --- /dev/null +++ b/changes/bug23751 @@ -0,0 +1,6 @@ + o Minor bugfixes (scheduler, channel): + - Ignore channels that have been closed while flushing cells. This can + happen if the write on the connection fails leading to the channel being + closed while in the scheduler loop. This is not a complete fix, it is a + bandaid until we are able to refactor those interactions. Fixes bug + 23751; bugfix on 0.3.2.1-alpha. diff --git a/src/or/scheduler_kist.c b/src/or/scheduler_kist.c index a3b74e8cc9..d1726ba345 100644 --- a/src/or/scheduler_kist.c +++ b/src/or/scheduler_kist.c @@ -598,6 +598,15 @@ kist_scheduler_run(void) if (socket_can_write(&socket_table, chan)) { /* flush to channel queue/outbuf */ flush_result = (int)channel_flush_some_cells(chan, 1); // 1 for num cells + /* XXX: While flushing cells, it is possible that the connection write + * fails leading to the channel to be closed which triggers a release + * and free its entry in the socket table. And because of a engineering + * design issue, the error is not propagated back so we don't get an + * error at this poin. So before we continue, make sure the channel is + * open and if not just ignore it. See #23751. */ + if (!CHANNEL_IS_OPEN(chan)) { + continue; + } /* flush_result has the # cells flushed */ if (flush_result > 0) { update_socket_written(&socket_table, chan, flush_result * From 3c03e237ab372f495fa2498e925813931ba381da Mon Sep 17 00:00:00 2001 From: Nick Mathewson Date: Fri, 22 Sep 2017 15:29:15 -0400 Subject: [PATCH 2/6] Remove an erroneous 0.5 in compute_weighted_bandwidths() Back in 0.2.4.3-alpha (e106812a778f537), when we switched from using double to using uint64 for selecting by bandwidth, I got the math wrong: I should have used llround(x), or (uint64_t)(x+0.5), but instead I wrote llround(x+0.5). That means we would always round up, rather than rounding to the closest integer Fixes bug 23318; bugfix on 0.2.4.3-alpha. --- changes/bug23318 | 7 +++++++ src/or/routerlist.c | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 changes/bug23318 diff --git a/changes/bug23318 b/changes/bug23318 new file mode 100644 index 0000000000..32c85eb194 --- /dev/null +++ b/changes/bug23318 @@ -0,0 +1,7 @@ + o Minor bugfixes (path selection): + - When selecting relays by bandwidth, avoid a rounding error that + could sometimes cause load to be imbalanced incorrectly. Previously, + we would always round upwards; now, we round towards the nearest + integer. This had the biggest effect when a relay's weight adjustments + should have given it weight 0, but it got weight 1 instead. + Fixes bug 23318; bugfix on 0.2.4.3-alpha. diff --git a/src/or/routerlist.c b/src/or/routerlist.c index 2365f28fd2..faf2eeda5d 100644 --- a/src/or/routerlist.c +++ b/src/or/routerlist.c @@ -2713,7 +2713,7 @@ compute_weighted_bandwidths(const smartlist_t *sl, final_weight = weight*this_bw; } - bandwidths[node_sl_idx] = final_weight + 0.5; + bandwidths[node_sl_idx] = final_weight; } SMARTLIST_FOREACH_END(node); log_debug(LD_CIRC, "Generated weighted bandwidths for rule %s based " From 14b0bba06e65a1dfca6a2f15f3016cb36409183f Mon Sep 17 00:00:00 2001 From: teor Date: Wed, 8 Nov 2017 14:09:50 +1100 Subject: [PATCH 3/6] Use node counts in networks with all zero-bandwidths When calculating the fraction of nodes that have descriptors, and all all nodes in the network have zero bandwidths, count the number of nodes instead. Fixes bug 23318; bugfix on 0.2.4.10-alpha. --- changes/bug23318 | 4 ++++ src/or/routerlist.c | 28 ++++++++++++++++++++-------- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/changes/bug23318 b/changes/bug23318 index 32c85eb194..7fcb8d4487 100644 --- a/changes/bug23318 +++ b/changes/bug23318 @@ -5,3 +5,7 @@ integer. This had the biggest effect when a relay's weight adjustments should have given it weight 0, but it got weight 1 instead. Fixes bug 23318; bugfix on 0.2.4.3-alpha. + - When calculating the fraction of nodes that have descriptors, and all + all nodes in the network have zero bandwidths, count the number of nodes + instead. + Fixes bug 23318; bugfix on 0.2.4.10-alpha. diff --git a/src/or/routerlist.c b/src/or/routerlist.c index faf2eeda5d..d6347c49f6 100644 --- a/src/or/routerlist.c +++ b/src/or/routerlist.c @@ -149,7 +149,8 @@ typedef struct cert_list_t cert_list_t; /* static function prototypes */ static int compute_weighted_bandwidths(const smartlist_t *sl, bandwidth_weight_rule_t rule, - double **bandwidths_out); + double **bandwidths_out, + double *total_bandwidth_out); static const routerstatus_t *router_pick_trusteddirserver_impl( const smartlist_t *sourcelist, dirinfo_type_t auth, int flags, int *n_busy_out); @@ -2513,7 +2514,7 @@ smartlist_choose_node_by_bandwidth_weights(const smartlist_t *sl, double *bandwidths_dbl=NULL; uint64_t *bandwidths_u64=NULL; - if (compute_weighted_bandwidths(sl, rule, &bandwidths_dbl) < 0) + if (compute_weighted_bandwidths(sl, rule, &bandwidths_dbl, NULL) < 0) return NULL; bandwidths_u64 = tor_calloc(smartlist_len(sl), sizeof(uint64_t)); @@ -2533,11 +2534,14 @@ smartlist_choose_node_by_bandwidth_weights(const smartlist_t *sl, * smartlist_choose_node_by_bandwidth_weights, compute weighted bandwidth * values for each node and store them in a freshly allocated * *bandwidths_out of the same length as sl, and holding results - * as doubles. Return 0 on success, -1 on failure. */ + * as doubles. If total_bandwidth_out is non-NULL, set it to the total + * of all the bandwidths. + * Return 0 on success, -1 on failure. */ static int compute_weighted_bandwidths(const smartlist_t *sl, bandwidth_weight_rule_t rule, - double **bandwidths_out) + double **bandwidths_out, + double *total_bandwidth_out) { int64_t weight_scale; double Wg = -1, Wm = -1, We = -1, Wd = -1; @@ -2545,6 +2549,7 @@ compute_weighted_bandwidths(const smartlist_t *sl, uint64_t weighted_bw = 0; guardfraction_bandwidth_t guardfraction_bw; double *bandwidths; + double total_bandwidth = 0.0; /* Can't choose exit and guard at same time */ tor_assert(rule == NO_WEIGHTING || @@ -2553,6 +2558,10 @@ compute_weighted_bandwidths(const smartlist_t *sl, rule == WEIGHT_FOR_MID || rule == WEIGHT_FOR_DIR); + if (total_bandwidth_out) { + *total_bandwidth_out = 0.0; + } + if (smartlist_len(sl) == 0) { log_info(LD_CIRC, "Empty routerlist passed in to consensus weight node " @@ -2714,6 +2723,7 @@ compute_weighted_bandwidths(const smartlist_t *sl, } bandwidths[node_sl_idx] = final_weight; + total_bandwidth += final_weight; } SMARTLIST_FOREACH_END(node); log_debug(LD_CIRC, "Generated weighted bandwidths for rule %s based " @@ -2724,6 +2734,10 @@ compute_weighted_bandwidths(const smartlist_t *sl, *bandwidths_out = bandwidths; + if (total_bandwidth_out) { + *total_bandwidth_out = total_bandwidth; + } + return 0; } @@ -2740,7 +2754,8 @@ frac_nodes_with_descriptors(const smartlist_t *sl, if (smartlist_len(sl) == 0) return 0.0; - if (compute_weighted_bandwidths(sl, rule, &bandwidths) < 0) { + if (compute_weighted_bandwidths(sl, rule, &bandwidths, &total) < 0 || + total <= 0.0) { int n_with_descs = 0; SMARTLIST_FOREACH(sl, const node_t *, node, { if (node_has_descriptor(node)) @@ -2759,9 +2774,6 @@ frac_nodes_with_descriptors(const smartlist_t *sl, tor_free(bandwidths); - if (total < 1.0) - return 0; - return present / total; } From fcaa4ab82463a0a27a0605d5aa01ed7d01e3387a Mon Sep 17 00:00:00 2001 From: teor Date: Wed, 8 Nov 2017 14:17:03 +1100 Subject: [PATCH 4/6] Actually log the total bandwidth in compute_weighted_bandwidths() Fixes bug 24170; bugfix on 0.2.4.3-alpha. --- changes/bug24170 | 3 +++ src/or/routerlist.c | 5 ++--- 2 files changed, 5 insertions(+), 3 deletions(-) create mode 100644 changes/bug24170 diff --git a/changes/bug24170 b/changes/bug24170 new file mode 100644 index 0000000000..d3d7347693 --- /dev/null +++ b/changes/bug24170 @@ -0,0 +1,3 @@ + o Minor bugfixes (path selection): + - Actually log the total bandwidth in compute_weighted_bandwidths(). + Fixes bug 24170; bugfix on 0.2.4.3-alpha. diff --git a/src/or/routerlist.c b/src/or/routerlist.c index d6347c49f6..80486cccbc 100644 --- a/src/or/routerlist.c +++ b/src/or/routerlist.c @@ -2546,7 +2546,6 @@ compute_weighted_bandwidths(const smartlist_t *sl, int64_t weight_scale; double Wg = -1, Wm = -1, We = -1, Wd = -1; double Wgb = -1, Wmb = -1, Web = -1, Wdb = -1; - uint64_t weighted_bw = 0; guardfraction_bandwidth_t guardfraction_bw; double *bandwidths; double total_bandwidth = 0.0; @@ -2728,9 +2727,9 @@ compute_weighted_bandwidths(const smartlist_t *sl, log_debug(LD_CIRC, "Generated weighted bandwidths for rule %s based " "on weights " - "Wg=%f Wm=%f We=%f Wd=%f with total bw "U64_FORMAT, + "Wg=%f Wm=%f We=%f Wd=%f with total bw %f", bandwidth_weight_rule_to_string(rule), - Wg, Wm, We, Wd, U64_PRINTF_ARG(weighted_bw)); + Wg, Wm, We, Wd, total_bandwidth); *bandwidths_out = bandwidths; From 4f944cc4cc1971baf2924bc3f713feef7b010691 Mon Sep 17 00:00:00 2001 From: teor Date: Wed, 8 Nov 2017 14:17:37 +1100 Subject: [PATCH 5/6] Check arguments and initialise variables in compute_weighted_bandwidths() Cleanup after 23318. --- src/or/routerlist.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/or/routerlist.c b/src/or/routerlist.c index 80486cccbc..c9d2cbaeaf 100644 --- a/src/or/routerlist.c +++ b/src/or/routerlist.c @@ -2547,9 +2547,12 @@ compute_weighted_bandwidths(const smartlist_t *sl, double Wg = -1, Wm = -1, We = -1, Wd = -1; double Wgb = -1, Wmb = -1, Web = -1, Wdb = -1; guardfraction_bandwidth_t guardfraction_bw; - double *bandwidths; + double *bandwidths = NULL; double total_bandwidth = 0.0; + tor_assert(sl); + tor_assert(bandwidths_out); + /* Can't choose exit and guard at same time */ tor_assert(rule == NO_WEIGHTING || rule == WEIGHT_FOR_EXIT || @@ -2557,6 +2560,8 @@ compute_weighted_bandwidths(const smartlist_t *sl, rule == WEIGHT_FOR_MID || rule == WEIGHT_FOR_DIR); + *bandwidths_out = NULL; + if (total_bandwidth_out) { *total_bandwidth_out = 0.0; } From 6e4ebd41bb5479ffe05ed173a1d7aeffcf592248 Mon Sep 17 00:00:00 2001 From: teor Date: Wed, 8 Nov 2017 14:18:46 +1100 Subject: [PATCH 6/6] Stop calculating total twice in frac_nodes_with_descriptors() Cleanup after 23318. --- src/or/routerlist.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/or/routerlist.c b/src/or/routerlist.c index c9d2cbaeaf..8f53c02009 100644 --- a/src/or/routerlist.c +++ b/src/or/routerlist.c @@ -2768,12 +2768,10 @@ frac_nodes_with_descriptors(const smartlist_t *sl, return ((double)n_with_descs) / (double)smartlist_len(sl); } - total = present = 0.0; + present = 0.0; SMARTLIST_FOREACH_BEGIN(sl, const node_t *, node) { - const double bw = bandwidths[node_sl_idx]; - total += bw; if (node_has_descriptor(node)) - present += bw; + present += bandwidths[node_sl_idx]; } SMARTLIST_FOREACH_END(node); tor_free(bandwidths);