diff --git a/src/core/or/congestion_control_common.c b/src/core/or/congestion_control_common.c new file mode 100644 index 0000000000..9db1d7d664 --- /dev/null +++ b/src/core/or/congestion_control_common.c @@ -0,0 +1,933 @@ +/* Copyright (c) 2021, The Tor Project, Inc. */ +/* See LICENSE for licensing information */ + +/** + * \file congestion_control_common.c + * \brief Common code used by all congestion control algorithms. + */ + +#define TOR_CONGESTION_CONTROL_COMMON_PRIVATE + +#include "core/or/or.h" + +#include "core/or/circuitlist.h" +#include "core/or/crypt_path.h" +#include "core/or/or_circuit_st.h" +#include "core/or/origin_circuit_st.h" +#include "core/or/channel.h" +#include "core/mainloop/connection.h" +#include "core/or/sendme.h" +#include "core/or/congestion_control_common.h" +#include "core/or/congestion_control_vegas.h" +#include "core/or/congestion_control_nola.h" +#include "core/or/congestion_control_westwood.h" +#include "core/or/congestion_control_st.h" +#include "lib/time/compat_time.h" +#include "feature/nodelist/networkstatus.h" + +/* Consensus parameter defaults */ +#define CIRCWINDOW_INIT (500) + +#define CWND_INC_PCT_SS_DFLT (100) + +#define SENDME_INC_DFLT (50) +#define CWND_MIN_DFLT (MAX(100, SENDME_INC_DFLT)) + +#define CWND_INC_DFLT (50) + +#define CWND_INC_RATE_DFLT (1) + +#define WESTWOOD_BDP_ALG BDP_ALG_PIECEWISE +#define VEGAS_BDP_MIX_ALG BDP_ALG_PIECEWISE +#define NOLA_BDP_ALG BDP_ALG_PIECEWISE + +#define EWMA_CWND_COUNT_DFLT 2 + +#define BWE_SENDME_MIN_DFLT 5 + +static uint64_t congestion_control_update_circuit_rtt(congestion_control_t *, + uint64_t); +static bool congestion_control_update_circuit_bdp(congestion_control_t *, + const circuit_t *, + const crypt_path_t *, + uint64_t, uint64_t); + +/** + * Set congestion control parameters on a circuit's congestion + * control object based on values from the consensus. + * + * cc_alg is the negotiated congestion control algorithm. + * + * sendme_inc is the number of packaged cells that a sendme cell + * acks. This parameter will come from circuit negotiation. 
+ */ +static void +congestion_control_init_params(congestion_control_t *cc, + cc_alg_t cc_alg, + int sendme_inc) +{ +#define CWND_INIT_MIN 100 +#define CWND_INIT_MAX (10000) + cc->cwnd = + networkstatus_get_param(NULL, "cc_cwnd_init", + CIRCWINDOW_INIT, + CWND_INIT_MIN, + CWND_INIT_MAX); + +#define CWND_INC_PCT_SS_MIN 1 +#define CWND_INC_PCT_SS_MAX (500) + cc->cwnd_inc_pct_ss = + networkstatus_get_param(NULL, "cc_cwnd_inc_pct_ss", + CWND_INC_PCT_SS_DFLT, + CWND_INC_PCT_SS_MIN, + CWND_INC_PCT_SS_MAX); + +#define CWND_INC_MIN 1 +#define CWND_INC_MAX (1000) + cc->cwnd_inc = + networkstatus_get_param(NULL, "cc_cwnd_inc", + CWND_INC_DFLT, + CWND_INC_MIN, + CWND_INC_MAX); + +#define CWND_INC_RATE_MIN 1 +#define CWND_INC_RATE_MAX (250) + cc->cwnd_inc_rate = + networkstatus_get_param(NULL, "cc_cwnd_inc_rate", + CWND_INC_RATE_DFLT, + CWND_INC_RATE_MIN, + CWND_INC_RATE_MAX); + +#define SENDME_INC_MIN 10 +#define SENDME_INC_MAX (1000) + cc->sendme_inc = + networkstatus_get_param(NULL, "cc_sendme_inc", + sendme_inc, + SENDME_INC_MIN, + SENDME_INC_MAX); + + // XXX: this min needs to abide by sendme_inc range rules somehow +#define CWND_MIN_MIN sendme_inc +#define CWND_MIN_MAX (1000) + cc->cwnd_min = + networkstatus_get_param(NULL, "cc_cwnd_min", + CWND_MIN_DFLT, + CWND_MIN_MIN, + CWND_MIN_MAX); + +#define EWMA_CWND_COUNT_MIN 1 +#define EWMA_CWND_COUNT_MAX (100) + cc->ewma_cwnd_cnt = + networkstatus_get_param(NULL, "cc_ewma_cwnd_cnt", + EWMA_CWND_COUNT_DFLT, + EWMA_CWND_COUNT_MIN, + EWMA_CWND_COUNT_MAX); + +#define BWE_SENDME_MIN_MIN 2 +#define BWE_SENDME_MIN_MAX (20) + cc->bwe_sendme_min = + networkstatus_get_param(NULL, "cc_bwe_min", + BWE_SENDME_MIN_DFLT, + BWE_SENDME_MIN_MIN, + BWE_SENDME_MIN_MAX); + +#define CC_ALG_MIN 0 +#define CC_ALG_MAX (NUM_CC_ALGS-1) + cc->cc_alg = + networkstatus_get_param(NULL, "cc_alg", + cc_alg, + CC_ALG_MIN, + CC_ALG_MAX); + + bdp_alg_t default_bdp_alg = 0; + + switch (cc->cc_alg) { + case CC_ALG_WESTWOOD: + default_bdp_alg = WESTWOOD_BDP_ALG; + break; + case CC_ALG_VEGAS: + default_bdp_alg = VEGAS_BDP_MIX_ALG; + break; + case CC_ALG_NOLA: + default_bdp_alg = NOLA_BDP_ALG; + break; + case CC_ALG_SENDME: + default: + tor_fragile_assert(); + return; // No alg-specific params + } + + cc->bdp_alg = + networkstatus_get_param(NULL, "cc_bdp_alg", + default_bdp_alg, + 0, + NUM_BDP_ALGS-1); + + /* Algorithm-specific parameters */ + if (cc->cc_alg == CC_ALG_WESTWOOD) { + congestion_control_westwood_set_params(cc); + } else if (cc->cc_alg == CC_ALG_VEGAS) { + congestion_control_vegas_set_params(cc); + } else if (cc->cc_alg == CC_ALG_NOLA) { + congestion_control_nola_set_params(cc); + } +} + +/** + * Allocate and initialize fields in congestion control object. + * + * cc_alg is the negotiated congestion control algorithm. + * + * sendme_inc is the number of packaged cells that a sendme cell + * acks. This parameter will come from circuit negotiation. 
+ */
+static void
+congestion_control_init(congestion_control_t *cc, cc_alg_t cc_alg,
+                        int sendme_inc)
+{
+  cc->sendme_pending_timestamps = smartlist_new();
+  cc->sendme_arrival_timestamps = smartlist_new();
+
+  cc->in_slow_start = 1;
+  congestion_control_init_params(cc, cc_alg, sendme_inc);
+
+  cc->next_cc_event = CWND_UPDATE_RATE(cc);
+}
+
+/** Allocate and initialize a new congestion control object */
+congestion_control_t *
+congestion_control_new(void)
+{
+  congestion_control_t *cc = tor_malloc_zero(sizeof(congestion_control_t));
+
+  // XXX: the alg and the sendme_inc need to be negotiated during
+  // circuit handshake
+  congestion_control_init(cc, CC_ALG_VEGAS, SENDME_INC_DFLT);
+
+  return cc;
+}
+
+/**
+ * Free a congestion control object and its associated state.
+ */
+void
+congestion_control_free_(congestion_control_t *cc)
+{
+  if (!cc)
+    return;
+
+  SMARTLIST_FOREACH(cc->sendme_pending_timestamps, uint64_t *, t, tor_free(t));
+  SMARTLIST_FOREACH(cc->sendme_arrival_timestamps, uint64_t *, t, tor_free(t));
+  smartlist_free(cc->sendme_pending_timestamps);
+  smartlist_free(cc->sendme_arrival_timestamps);
+
+  tor_free(cc);
+}
+
+/**
+ * Compute an N-count EWMA, aka N-EWMA. N-EWMA is defined as:
+ *  EWMA = alpha*value + (1-alpha)*EWMA_prev
+ * with alpha = 2/(N+1).
+ *
+ * This works out to:
+ *  EWMA = value*2/(N+1) + EWMA_prev*(N-1)/(N+1)
+ *       = (value*2 + EWMA_prev*(N-1))/(N+1)
+ */
+static inline uint64_t
+n_count_ewma(uint64_t curr, uint64_t prev, uint64_t N)
+{
+  if (prev == 0)
+    return curr;
+  else
+    return (2*curr + (N-1)*prev)/(N+1);
+}
+
+/**
+ * Enqueue a u64 timestamp to the end of a queue of timestamps.
+ */
+static inline void
+enqueue_timestamp(smartlist_t *timestamps_u64, uint64_t timestamp_usec)
+{
+  uint64_t *timestamp_ptr = tor_malloc(sizeof(uint64_t));
+  *timestamp_ptr = timestamp_usec;
+
+  smartlist_add(timestamps_u64, timestamp_ptr);
+}
+
+/**
+ * Peek at the head of a smartlist queue of u64 timestamps.
+ */
+static inline uint64_t
+peek_timestamp(const smartlist_t *timestamps_u64_usecs)
+{
+  uint64_t *timestamp_ptr = smartlist_get(timestamps_u64_usecs, 0);
+
+  if (BUG(!timestamp_ptr)) {
+    log_err(LD_CIRC, "Congestion control timestamp list became empty!");
+    return 0;
+  }
+
+  return *timestamp_ptr;
+}
+
+/**
+ * Dequeue a u64 monotime usec timestamp from the front of a
+ * smartlist of pointers to u64.
+ */
+static inline uint64_t
+dequeue_timestamp(smartlist_t *timestamps_u64_usecs)
+{
+  uint64_t *timestamp_ptr = smartlist_get(timestamps_u64_usecs, 0);
+  uint64_t timestamp_u64;
+
+  if (BUG(!timestamp_ptr)) {
+    log_err(LD_CIRC, "Congestion control timestamp list became empty!");
+    return 0;
+  }
+
+  timestamp_u64 = *timestamp_ptr;
+  smartlist_del_keeporder(timestamps_u64_usecs, 0);
+  tor_free(timestamp_ptr);
+
+  return timestamp_u64;
+}
+
+/**
+ * Returns the number of sendme acks that will be received in the
+ * current congestion window size, rounded to nearest int.
+ */
+static inline uint64_t
+sendme_acks_per_cwnd(const congestion_control_t *cc)
+{
+  /* We add half a sendme_inc to cwnd to round to the nearest int */
+  return ((cc->cwnd + cc->sendme_inc/2)/cc->sendme_inc);
+}
+
+/**
+ * Get a package window from either old sendme logic, or congestion control.
+ *
+ * A package window is how many cells you can still send.
+ */ +int +congestion_control_get_package_window(const circuit_t *circ, + const crypt_path_t *cpath) +{ + int package_window; + congestion_control_t *cc; + + tor_assert(circ); + + if (cpath) { + package_window = cpath->package_window; + cc = cpath->ccontrol; + } else { + package_window = circ->package_window; + cc = circ->ccontrol; + } + + if (!cc) { + return package_window; + } else { + /* Inflight can be above cwnd if cwnd was just reduced */ + if (cc->inflight > cc->cwnd) + return 0; + /* In the extremely unlikely event that cwnd-inflight is larger than + * INT32_MAX, just return that cap, so old code doesn't explode. */ + else if (cc->cwnd - cc->inflight > INT32_MAX) + return INT32_MAX; + else + return (int)(cc->cwnd - cc->inflight); + } +} + +/** + * Returns the number of cells that are acked by every sendme. + */ +int +sendme_get_inc_count(const circuit_t *circ, const crypt_path_t *layer_hint) +{ + int sendme_inc = CIRCWINDOW_INCREMENT; + congestion_control_t *cc = NULL; + + if (layer_hint) { + cc = layer_hint->ccontrol; + } else { + cc = circ->ccontrol; + } + + if (cc) { + sendme_inc = cc->sendme_inc; + } + + return sendme_inc; +} + +/** Return true iff the next cell we send will result in the other endpoint + * sending a SENDME. + * + * We are able to know that because the package or inflight window value minus + * one cell (the possible SENDME cell) should be a multiple of the + * cells-per-sendme increment value (set via consensus parameter, negotiated + * for the circuit, and passed in as sendme_inc). + * + * This function is used when recording a cell digest and this is done quite + * low in the stack when decrypting or encrypting a cell. The window is only + * updated once the cell is actually put in the outbuf. + */ +bool +circuit_sent_cell_for_sendme(const circuit_t *circ, + const crypt_path_t *layer_hint) +{ + congestion_control_t *cc; + int window; + + tor_assert(circ); + + if (layer_hint) { + window = layer_hint->package_window; + cc = layer_hint->ccontrol; + } else { + window = circ->package_window; + cc = circ->ccontrol; + } + + /* If we are using congestion control and the alg is not + * old-school 'fixed', then use cc->inflight to determine + * when sendmes will be sent */ + if (cc) { + if (!cc->inflight) + return false; + + /* This check must be +1 because this function is called *before* + * inflight is incremented for the sent cell */ + if ((cc->inflight+1) % cc->sendme_inc != 0) + return false; + + return true; + } + + /* At the start of the window, no SENDME will be expected. */ + if (window == CIRCWINDOW_START) { + return false; + } + + /* Are we at the limit of the increment and if not, we don't expect next + * cell is a SENDME. + * + * We test against the window minus 1 because when we are looking if the + * next cell is a SENDME, the window (either package or deliver) hasn't been + * decremented just yet so when this is called, we are currently processing + * the "window - 1" cell. + */ + if (((window - 1) % CIRCWINDOW_INCREMENT) != 0) { + return false; + } + + /* Next cell is expected to be a SENDME. */ + return true; +} + +/** + * Call-in to tell congestion control code that this circuit sent a cell. + * + * This updates the 'inflight' counter, and if this is a cell that will + * cause the other end to send a SENDME, record the current time in a list + * of pending timestamps, so that we can later compute the circuit RTT when + * the SENDME comes back. 
*/ +void +congestion_control_note_cell_sent(congestion_control_t *cc, + const circuit_t *circ, + const crypt_path_t *cpath) +{ + tor_assert(circ); + tor_assert(cc); + + /* Is this the last cell before a SENDME? The idea is that if the + * package_window reaches a multiple of the increment, after this cell, we + * should expect a SENDME. Note that this function must be called *before* + * we account for the sent cell. */ + if (!circuit_sent_cell_for_sendme(circ, cpath)) { + cc->inflight++; + return; + } + + cc->inflight++; + + /* Record this cell time for RTT computation when SENDME arrives */ + enqueue_timestamp(cc->sendme_pending_timestamps, + monotime_absolute_usec()); +} + +/** + * Returns true if any edge connections are active. + * + * We need to know this so that we can stop computing BDP if the + * edges are not sending on the circuit. + */ +static int +circuit_has_active_streams(const circuit_t *circ, + const crypt_path_t *layer_hint) +{ + const edge_connection_t *streams; + + if (CIRCUIT_IS_ORIGIN(circ)) { + streams = CONST_TO_ORIGIN_CIRCUIT(circ)->p_streams; + } else { + streams = CONST_TO_OR_CIRCUIT(circ)->n_streams; + } + + /* Check linked list of streams */ + for (const edge_connection_t *conn = streams; conn != NULL; + conn = conn->next_stream) { + if (conn->base_.marked_for_close) + continue; + + if (!layer_hint || conn->cpath_layer == layer_hint) { + if (connection_get_inbuf_len(TO_CONN(conn)) > 0) { + log_info(LD_CIRC, "CC: More in edge inbuf..."); + return 1; + } + + /* If we did not reach EOF on this read, there's more */ + if (!TO_CONN(conn)->inbuf_reached_eof) { + log_info(LD_CIRC, "CC: More on edge conn..."); + return 1; + } + + if (TO_CONN(conn)->linked_conn) { + if (connection_get_inbuf_len(TO_CONN(conn)->linked_conn) > 0) { + log_info(LD_CIRC, "CC: More in linked inbuf..."); + return 1; + } + + /* If there is a linked conn, and *it* did not each EOF, + * there's more */ + if (!TO_CONN(conn)->linked_conn->inbuf_reached_eof) { + log_info(LD_CIRC, "CC: More on linked conn..."); + return 1; + } + } + } + } + + return 0; +} + +/** + * Upon receipt of a SENDME, pop the oldest timestamp off the timestamp + * list, and use this to update RTT. + * + * Returns true if circuit estimates were successfully updated, false + * otherwise. + */ +bool +congestion_control_update_circuit_estimates(congestion_control_t *cc, + const circuit_t *circ, + const crypt_path_t *layer_hint) +{ + uint64_t now_usec = monotime_absolute_usec(); + + /* Update RTT first, then BDP. BDP needs fresh RTT */ + uint64_t curr_rtt_usec = congestion_control_update_circuit_rtt(cc, now_usec); + return congestion_control_update_circuit_bdp(cc, circ, layer_hint, now_usec, + curr_rtt_usec); +} + +/** + * Returns true if we have enough time data to use heuristics + * to compare RTT to a baseline. + */ +static bool +time_delta_should_use_heuristics(const congestion_control_t *cc) +{ + + /* If we have exited slow start, we should have processed at least + * a cwnd worth of RTTs */ + if (!cc->in_slow_start) { + return true; + } + + /* If we managed to get enough acks to estimate a SENDME BDP, then + * we have enough to estimate clock jumps relative to a baseline, + * too. (This is at least 'cc_bwe_min' acks). */ + if (cc->bdp[BDP_ALG_SENDME_RATE]) { + return true; + } + + /* Not enough data to estimate clock jumps */ + return false; +} + +/** + * Returns true if the monotime delta is 0, or is significantly + * different than the previous delta. Either case indicates + * that the monotime time source stalled or jumped. 
+ */
+static bool
+time_delta_stalled_or_jumped(const congestion_control_t *cc,
+                             uint64_t old_delta, uint64_t new_delta)
+{
+#define DELTA_DISCREPENCY_RATIO_MAX 100
+  /* If we have a 0 new_delta, that is definitely a monotime stall */
+  if (new_delta == 0) {
+    static ratelim_t stall_info_limit = RATELIM_INIT(60);
+    log_fn_ratelim(&stall_info_limit, LOG_INFO, LD_CIRC,
+           "Congestion control cannot measure RTT due to monotime stall.");
+    return true;
+  }
+
+  /* If the old_delta is 0, we have no previous values. So
+   * just assume this one is valid (because it is non-zero) */
+  if (old_delta == 0)
+    return false;
+
+  /*
+   * For the heuristic cases, we need at least a few timestamps,
+   * to average out any previous partial stalls or jumps. So until
+   * that point, let's just declare these time values "good enough
+   * to use".
+   */
+  if (!time_delta_should_use_heuristics(cc)) {
+    return false;
+  }
+
+  /* If old_delta is significantly larger than new_delta, then
+   * this means that the monotime clock recently stopped moving
+   * forward. */
+  if (old_delta > new_delta * DELTA_DISCREPENCY_RATIO_MAX) {
+    static ratelim_t dec_notice_limit = RATELIM_INIT(300);
+    log_fn_ratelim(&dec_notice_limit, LOG_NOTICE, LD_CIRC,
+           "Sudden decrease in circuit RTT (%"PRIu64" vs %"PRIu64
+           "), likely due to clock jump.",
+           new_delta/1000, old_delta/1000);
+
+    return true;
+  }
+
+  /* If new_delta is significantly larger than old_delta, then
+   * this means that the monotime clock suddenly jumped forward. */
+  if (new_delta > old_delta * DELTA_DISCREPENCY_RATIO_MAX) {
+    static ratelim_t dec_notice_limit = RATELIM_INIT(300);
+    log_fn_ratelim(&dec_notice_limit, LOG_NOTICE, LD_CIRC,
+           "Sudden increase in circuit RTT (%"PRIu64" vs %"PRIu64
+           "), likely due to clock jump.",
+           new_delta/1000, old_delta/1000);
+
+    return true;
+  }
+
+  return false;
+}
+
+/**
+ * Called when we get a SENDME. Updates circuit RTT by pulling off a
+ * timestamp of when we sent the CIRCWINDOW_INCREMENT-th cell from
+ * the queue of such timestamps, and comparing that to current time.
+ *
+ * Also updates min, max, and EWMA of RTT.
+ *
+ * Returns the current circuit RTT in usecs, or 0 if it could not be
+ * measured (due to clock jump, stall, etc).
+ */
+static uint64_t
+congestion_control_update_circuit_rtt(congestion_control_t *cc,
+                                      uint64_t now_usec)
+{
+  uint64_t rtt, ewma_cnt;
+  uint64_t sent_at_timestamp;
+
+  tor_assert(cc);
+
+  /* Get the time that we sent the cell that resulted in the other
+   * end sending this sendme. Use this to calculate RTT */
+  sent_at_timestamp = dequeue_timestamp(cc->sendme_pending_timestamps);
+
+  rtt = now_usec - sent_at_timestamp;
+
+  /* Do not update RTT at all if it looks fishy */
+  if (time_delta_stalled_or_jumped(cc, cc->ewma_rtt_usec, rtt)) {
+    return 0;
+  }
+
+  ewma_cnt = cc->ewma_cwnd_cnt*sendme_acks_per_cwnd(cc);
+  ewma_cnt = MAX(ewma_cnt, 2); // Use at least 2
+
+  cc->ewma_rtt_usec = n_count_ewma(rtt, cc->ewma_rtt_usec, ewma_cnt);
+
+  if (rtt > cc->max_rtt_usec) {
+    cc->max_rtt_usec = rtt;
+  }
+
+  if (cc->min_rtt_usec == 0 || rtt < cc->min_rtt_usec) {
+    cc->min_rtt_usec = rtt;
+  }
+
+  return rtt;
+}
+
+/**
+ * Called when we get a SENDME. Updates the bandwidth-delay-product (BDP)
+ * estimates of a circuit. Several methods of computing BDP are used,
+ * depending on scenario. While some congestion control algorithms only
+ * use one of these methods, we update them all because it's quick and easy.
+ *
+ *   - now_usec is the current monotime in usecs.
+ * - curr_rtt_usec is the current circuit RTT in usecs. It may be 0 if no + * RTT could bemeasured. + * + * Returns true if we were able to update BDP, false otherwise. + */ +static bool +congestion_control_update_circuit_bdp(congestion_control_t *cc, + const circuit_t *circ, + const crypt_path_t *layer_hint, + uint64_t now_usec, + uint64_t curr_rtt_usec) +{ + int chan_q = 0; + unsigned int blocked_on_chan = 0; + uint64_t timestamp_usec; + uint64_t sendme_rate_bdp = 0; + + tor_assert(cc); + + if (CIRCUIT_IS_ORIGIN(circ)) { + /* origin circs use n_chan */ + chan_q = circ->n_chan_cells.n; + blocked_on_chan = circ->streams_blocked_on_n_chan; + } else { + /* Both onion services and exits use or_circuit and p_chan */ + chan_q = CONST_TO_OR_CIRCUIT(circ)->p_chan_cells.n; + blocked_on_chan = circ->streams_blocked_on_p_chan; + } + + /* If we have no EWMA RTT, it is because monotime has been stalled + * or messed up the entire time so far. Set our BDP estimates directly + * to current cwnd */ + if (!cc->ewma_rtt_usec) { + uint64_t cwnd = cc->cwnd; + + /* If the channel is blocked, keep subtracting off the chan_q + * until we hit the min cwnd. */ + if (blocked_on_chan) { + cwnd = MAX(cwnd - chan_q, cc->cwnd_min); + cc->blocked_chan = 1; + } else { + cc->blocked_chan = 0; + } + + cc->bdp[BDP_ALG_CWND_RTT] = cwnd; + cc->bdp[BDP_ALG_INFLIGHT_RTT] = cwnd; + cc->bdp[BDP_ALG_SENDME_RATE] = cwnd; + cc->bdp[BDP_ALG_PIECEWISE] = cwnd; + + static ratelim_t dec_notice_limit = RATELIM_INIT(300); + log_fn_ratelim(&dec_notice_limit, LOG_NOTICE, LD_CIRC, + "Our clock has been stalled for the entire lifetime of a circuit. " + "Performance may be sub-optimal."); + + return blocked_on_chan; + } + + /* Congestion window based BDP will respond to changes in RTT only, and is + * relative to cwnd growth. It is useful for correcting for BDP + * overestimation, but if BDP is higher than the current cwnd, it will + * underestimate it. + * + * We multiply here first to avoid precision issues from min_RTT being + * close to ewma RTT. Since all fields are u64, there is plenty of + * room here to multiply first. + */ + cc->bdp[BDP_ALG_CWND_RTT] = cc->cwnd*cc->min_rtt_usec/cc->ewma_rtt_usec; + + /* + * If we have no pending streams, we do not have enough data to fill + * the BDP, so preserve our old estimates but do not make any more. + */ + if (!blocked_on_chan && !circuit_has_active_streams(circ, layer_hint)) { + log_info(LD_CIRC, + "CC: Streams drained. Spare package window: %"PRIu64 + ", no BDP update", cc->cwnd - cc->inflight); + + /* Clear SENDME timestamps; they will be wrong with intermittent data */ + SMARTLIST_FOREACH(cc->sendme_arrival_timestamps, uint64_t *, t, + tor_free(t)); + smartlist_clear(cc->sendme_arrival_timestamps); + } else if (curr_rtt_usec) { + /* Sendme-based BDP will quickly measure BDP in much less than + * a cwnd worth of data when in use (in 2-10 SENDMEs). + * + * But if the link goes idle, it will be vastly lower than true BDP. Hence + * we only compute it if we have either pending stream data, or streams + * are still blocked on the channel queued data. + * + * We also do not compute it if we do not have a current RTT passed in, + * because that means that monotime is currently stalled or just jumped. + */ + enqueue_timestamp(cc->sendme_arrival_timestamps, now_usec); + + if (smartlist_len(cc->sendme_arrival_timestamps) >= cc->bwe_sendme_min) { + /* If we have more sendmes than fit in a cwnd, trim the list. 
+ * Those are not acurrately measuring throughput, if cwnd is + * currently smaller than BDP */ + while (smartlist_len(cc->sendme_arrival_timestamps) > + cc->bwe_sendme_min && + (uint64_t)smartlist_len(cc->sendme_arrival_timestamps) > + sendme_acks_per_cwnd(cc)) { + (void)dequeue_timestamp(cc->sendme_arrival_timestamps); + } + int sendme_cnt = smartlist_len(cc->sendme_arrival_timestamps); + + /* Calculate SENDME_BWE_COUNT pure average */ + timestamp_usec = peek_timestamp(cc->sendme_arrival_timestamps); + uint64_t delta = now_usec - timestamp_usec; + + /* The acked data is in sendme_cnt-1 chunks, because we are counting the + * data that is processed by the other endpoint *between* all of these + * sendmes. There's one less gap between the sendmes than the number + * of sendmes. */ + uint64_t cells = (sendme_cnt-1)*cc->sendme_inc; + + /* The bandwidth estimate is cells/delta, which when multiplied + * by min RTT obtains the BDP. However, we multiply first to + * avoid precision issues with the RTT being close to delta in size. */ + sendme_rate_bdp = cells*cc->min_rtt_usec/delta; + + /* Calculate BDP_EWMA_COUNT N-EWMA */ + cc->bdp[BDP_ALG_SENDME_RATE] = + n_count_ewma(sendme_rate_bdp, cc->bdp[BDP_ALG_SENDME_RATE], + cc->ewma_cwnd_cnt*sendme_acks_per_cwnd(cc)); + } + + /* In-flight BDP will cause the cwnd to drift down when underutilized. + * It is most useful when the local OR conn is blocked, so we only + * compute it if we're utilized. */ + cc->bdp[BDP_ALG_INFLIGHT_RTT] = + (cc->inflight - chan_q)*cc->min_rtt_usec/ + MAX(cc->ewma_rtt_usec, curr_rtt_usec); + } else { + /* We can still update inflight with just an EWMA RTT, but only + * if there is data flowing */ + cc->bdp[BDP_ALG_INFLIGHT_RTT] = + (cc->inflight - chan_q)*cc->min_rtt_usec/cc->ewma_rtt_usec; + } + + /* The orconn is blocked; use smaller of inflight vs SENDME */ + if (blocked_on_chan) { + log_info(LD_CIRC, "CC: Streams blocked on circ channel. Chanq: %d", + chan_q); + + /* A blocked channel is an immediate congestion signal, but it still + * happens only once per cwnd */ + if (!cc->blocked_chan) { + cc->next_cc_event = 0; + cc->blocked_chan = 1; + } + + if (cc->bdp[BDP_ALG_SENDME_RATE]) { + cc->bdp[BDP_ALG_PIECEWISE] = MIN(cc->bdp[BDP_ALG_INFLIGHT_RTT], + cc->bdp[BDP_ALG_SENDME_RATE]); + } else { + cc->bdp[BDP_ALG_PIECEWISE] = cc->bdp[BDP_ALG_INFLIGHT_RTT]; + } + } else { + /* If we were previously blocked, emit a new congestion event + * now that we are unblocked, to re-evaluate cwnd */ + if (cc->blocked_chan) { + cc->blocked_chan = 0; + cc->next_cc_event = 0; + log_info(LD_CIRC, "CC: Streams un-blocked on circ channel. Chanq: %d", + chan_q); + } + + cc->bdp[BDP_ALG_PIECEWISE] = MAX(cc->bdp[BDP_ALG_SENDME_RATE], + cc->bdp[BDP_ALG_CWND_RTT]); + } + + /* We can end up with no piecewise value if we didn't have either + * a SENDME estimate or enough data for an inflight estimate. + * It also happens on the very first sendme, since we need two + * to get a BDP. In these cases, use the cwnd method. */ + if (!cc->bdp[BDP_ALG_PIECEWISE]) { + cc->bdp[BDP_ALG_PIECEWISE] = cc->bdp[BDP_ALG_CWND_RTT]; + log_info(LD_CIRC, "CC: No piecewise BDP. Using %"PRIu64, + cc->bdp[BDP_ALG_PIECEWISE]); + } + + if (cc->next_cc_event == 0) { + if (CIRCUIT_IS_ORIGIN(circ)) { + log_info(LD_CIRC, + "CC: Circuit %d " + "SENDME RTT: %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64", " + "BDP estimates: " + "%"PRIu64", " + "%"PRIu64", " + "%"PRIu64", " + "%"PRIu64", " + "%"PRIu64". 
", + CONST_TO_ORIGIN_CIRCUIT(circ)->global_identifier, + cc->min_rtt_usec/1000, + curr_rtt_usec/1000, + cc->ewma_rtt_usec/1000, + cc->max_rtt_usec/1000, + cc->bdp[BDP_ALG_INFLIGHT_RTT], + cc->bdp[BDP_ALG_CWND_RTT], + sendme_rate_bdp, + cc->bdp[BDP_ALG_SENDME_RATE], + cc->bdp[BDP_ALG_PIECEWISE] + ); + } else { + log_info(LD_CIRC, + "CC: Circuit %"PRIu64":%d " + "SENDME RTT: %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64", " + "%"PRIu64", " + "%"PRIu64", " + "%"PRIu64", " + "%"PRIu64", " + "%"PRIu64". ", + // XXX: actually, is this p_chan here? This is + // an or_circuit (exit or onion) + circ->n_chan->global_identifier, circ->n_circ_id, + cc->min_rtt_usec/1000, + curr_rtt_usec/1000, + cc->ewma_rtt_usec/1000, + cc->max_rtt_usec/1000, + cc->bdp[BDP_ALG_INFLIGHT_RTT], + cc->bdp[BDP_ALG_CWND_RTT], + sendme_rate_bdp, + cc->bdp[BDP_ALG_SENDME_RATE], + cc->bdp[BDP_ALG_PIECEWISE] + ); + } + } + + /* We updated BDP this round if either we had a blocked channel, or + * the curr_rtt_usec was not 0. */ + return (blocked_on_chan || curr_rtt_usec != 0); +} + +/** + * Dispatch the sendme to the appropriate congestion control algorithm. + */ +int +congestion_control_dispatch_cc_alg(congestion_control_t *cc, + const circuit_t *circ, + const crypt_path_t *layer_hint) +{ + switch (cc->cc_alg) { + case CC_ALG_WESTWOOD: + return congestion_control_westwood_process_sendme(cc, circ, layer_hint); + + case CC_ALG_VEGAS: + return congestion_control_vegas_process_sendme(cc, circ, layer_hint); + + case CC_ALG_NOLA: + return congestion_control_nola_process_sendme(cc, circ, layer_hint); + + case CC_ALG_SENDME: + default: + tor_assert(0); + } + + return -END_CIRC_REASON_INTERNAL; +} diff --git a/src/core/or/congestion_control_common.h b/src/core/or/congestion_control_common.h new file mode 100644 index 0000000000..4193d94cba --- /dev/null +++ b/src/core/or/congestion_control_common.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2019-2021, The Tor Project, Inc. */ +/* See LICENSE for licensing information */ + +/** + * \file congestion_control_common.h + * \brief Public APIs for congestion control + **/ + +#ifndef TOR_CONGESTION_CONTROL_COMMON_H +#define TOR_CONGESTION_CONTROL_COMMON_H + +#include "core/or/crypt_path_st.h" +#include "core/or/circuit_st.h" + +typedef struct congestion_control_t congestion_control_t; + +/** Wrapper for the free function, set the CC pointer to NULL after free */ +#define congestion_control_free(cc) \ + FREE_AND_NULL(congestion_control_t, congestion_control_free_, cc) + +void congestion_control_free_(congestion_control_t *cc); + +congestion_control_t *congestion_control_new(void); + +int congestion_control_dispatch_cc_alg(congestion_control_t *cc, + const circuit_t *circ, + const crypt_path_t *layer_hint); + +void congestion_control_note_cell_sent(congestion_control_t *cc, + const circuit_t *circ, + const crypt_path_t *cpath); + +bool congestion_control_update_circuit_estimates(congestion_control_t *, + const circuit_t *, + const crypt_path_t *); + +int congestion_control_get_package_window(const circuit_t *, + const crypt_path_t *); + +int sendme_get_inc_count(const circuit_t *, const crypt_path_t *); +bool circuit_sent_cell_for_sendme(const circuit_t *, const crypt_path_t *); + +/* Private section starts. */ +#ifdef TOR_CONGESTION_CONTROL_PRIVATE + +/* + * Unit tests declaractions. 
diff --git a/src/core/or/congestion_control_st.h b/src/core/or/congestion_control_st.h
new file mode 100644
index 0000000000..251ebd82e3
--- /dev/null
+++ b/src/core/or/congestion_control_st.h
@@ -0,0 +1,257 @@
+/* Copyright (c) 2019-2021, The Tor Project, Inc. */
+/* See LICENSE for licensing information */
+
+/**
+ * \file congestion_control_st.h
+ * \brief Structure definitions for congestion control.
+ **/
+
+#ifndef CONGESTION_CONTROL_ST_H
+#define CONGESTION_CONTROL_ST_H
+
+#include "core/or/crypt_path_st.h"
+#include "core/or/circuit_st.h"
+
+/** Signifies which sendme algorithm to use */
+typedef enum {
+  /** OG Tor fixed-sized circ and stream windows. It sucks, but it is important
+   * to make sure that the new algs can compete with the old garbage. */
+  CC_ALG_SENDME = 0,
+
+  /**
+   * Prop#324 TOR_WESTWOOD - Deliberately aggressive. Westwood may not even
+   * converge to fairness in some cases because max RTT will also increase
+   * on congestion, which boosts the Westwood RTT congestion threshold. So it
+   * can cause runaway queue bloat, which may or may not lead to a robot
+   * uprising... Ok, that's Westworld, not Westwood. Still, we need to test
+   * Vegas and NOLA against something more aggressive to ensure they do not
+   * starve in the presence of cheaters. We also need to make sure cheaters
+   * trigger the oomkiller in those cases.
+   */
+  CC_ALG_WESTWOOD = 1,
+
+  /**
+   * Prop#324 TOR_VEGAS - TCP Vegas-style BDP tracker. Because Vegas backs off
+   * whenever it detects queue delay, it can be beaten out by more aggressive
+   * algs. However, in live network testing, it seems to do just fine against
+   * current SENDMEs. It outperforms Westwood and does not stall. */
+  CC_ALG_VEGAS = 2,
+
+  /**
+   * Prop#324: TOR_NOLA - NOLA looks the BDP right in the eye and uses it
+   * immediately as CWND. No slow start, no other congestion signals, no delay,
+   * no bullshit. Like TOR_VEGAS, it also uses aggressive BDP estimates, to
+   * avoid out-competition. It seems to give a bit better throughput than
+   * Vegas, but its aggressive BDP and rapid updates may lead to more queue
+   * latency. */
+  CC_ALG_NOLA = 3,
+} cc_alg_t;
+
+/* Total number of CC algs in cc_alg_t enum */
+#define NUM_CC_ALGS (CC_ALG_NOLA+1)
+
+/** Signifies how we estimate circuit BDP */
+typedef enum {
+  /* CWND-based BDP will respond to changes in RTT only, and is relative
+   * to cwnd growth. So in slow-start, this will under-estimate BDP */
+  BDP_ALG_CWND_RTT = 0,
+
+  /* Sendme-based BDP will quickly measure BDP in less than
+   * a cwnd worth of data when in use. So it should be good for slow-start.
+   * But if the link goes idle, it will be vastly lower than true BDP. Thus,
+   * this estimate gets reset when the cwnd is not fully utilized. */
+  BDP_ALG_SENDME_RATE = 1,
+
+  /* Inflight BDP is similar to the cwnd estimator, except it uses
+   * packets inflight minus local circuit queues instead of current cwnd.
+   * Because it is strictly less than or equal to the cwnd, it will cause
+   * the cwnd to drift downward. It is only used if the local OR connection
+   * is blocked. */
+  BDP_ALG_INFLIGHT_RTT = 2,
+
+  /* The Piecewise BDP estimator uses the CWND estimator before there
+   * are sufficient SENDMEs to calculate the SENDME estimator. At that
+   * point, it uses the SENDME estimator, unless the local OR connection
+   * becomes blocked. In that case, it switches to the inflight estimator. */
+  BDP_ALG_PIECEWISE = 3,
+
+} bdp_alg_t;
+
+/** Total number of BDP algs in bdp_alg_t enum */
+#define NUM_BDP_ALGS (BDP_ALG_PIECEWISE+1)
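For reference, the arithmetic behind these four estimators, as implemented in congestion_control_update_circuit_bdp() in congestion_control_common.c above, reduces to roughly the following sketch. The helper and its chan_q/blocked/curr_rtt_usec parameters are illustrative only (they mirror the locals in that function) and are not part of this patch.

/* Simplified restatement of the BDP estimators, for illustration only.
 * 'chan_q' is the local channel queue length and 'blocked' is whether the
 * circuit's channel is currently blocked, as in the real function. */
static inline uint64_t
example_bdp_estimate(const congestion_control_t *cc, uint64_t chan_q,
                     bool blocked, uint64_t curr_rtt_usec)
{
  /* cwnd-based: scales cwnd down by how far the EWMA RTT has risen
   * above the minimum observed RTT. */
  uint64_t bdp_cwnd = cc->cwnd * cc->min_rtt_usec / cc->ewma_rtt_usec;

  /* inflight-based: same idea, but uses cells actually in flight,
   * minus anything still sitting in our own channel queue. */
  uint64_t bdp_inflight =
    (cc->inflight - chan_q) * cc->min_rtt_usec /
    MAX(cc->ewma_rtt_usec, curr_rtt_usec);

  /* sendme-based (computation not repeated here): cells acked between the
   * oldest and newest SENDME arrivals, divided by that time span, times
   * the minimum RTT. */
  uint64_t bdp_sendme = cc->bdp[BDP_ALG_SENDME_RATE];

  /* piecewise: while blocked, take the smaller of inflight and sendme;
   * otherwise prefer the sendme estimator, falling back to the cwnd
   * estimator when no sendme estimate exists yet. */
  if (blocked)
    return bdp_sendme ? MIN(bdp_inflight, bdp_sendme) : bdp_inflight;

  return MAX(bdp_sendme, bdp_cwnd);
}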
+
+/** Westwood algorithm parameters */
+struct westwood_params_t {
+  /** Cwnd backoff multiplier upon congestion (as percent) */
+  uint8_t cwnd_backoff_m;
+  /** Max RTT backoff multiplier upon congestion (as percent) */
+  uint8_t rtt_backoff_m;
+
+  /** Threshold between min and max RTT, to signal congestion (percent) */
+  uint8_t rtt_thresh;
+
+  /**
+   * If true, use minimum of BDP and backoff multiplication in backoff.
+   * If false, use maximum of BDP and backoff multiplication in backoff. */
+  bool min_backoff;
+};
+
+/** Vegas algorithm parameters. */
+struct vegas_params_t {
+  /** The queue use allowed before we exit slow start */
+  uint16_t gamma;
+  /** The queue use below which we increment cwnd */
+  uint16_t alpha;
+  /** The queue use above which we decrement cwnd */
+  uint16_t beta;
+  /** Weighted average (percent) between cwnd estimator and
+   * piecewise estimator. */
+  uint8_t bdp_mix_pct;
+};
+
+/** NOLA consensus params */
+struct nola_params_t {
+  /** How many cells to add to BDP estimate to obtain cwnd */
+  uint16_t bdp_overshoot;
+};
+
+/** Fields common to all congestion control algorithms */
+typedef struct congestion_control_t {
+  /**
+   * Smartlist of uint64_t monotime usec timestamps of when we sent a data
+   * cell that is pending a sendme. FIFO queue that is managed similarly to
+   * sendme_last_digests. */
+  smartlist_t *sendme_pending_timestamps;
+
+  /**
+   * Smartlist of uint64_t monotime timestamps of when sendmes arrived.
+   * FIFO queue that is managed similarly to sendme_last_digests.
+   * Used to estimate circuit bandwidth and BDP. */
+  smartlist_t *sendme_arrival_timestamps;
+
+  /** RTT time data for congestion control. */
+  uint64_t ewma_rtt_usec;
+  uint64_t min_rtt_usec;
+  uint64_t max_rtt_usec;
+
+  /* BDP estimates by algorithm */
+  uint64_t bdp[NUM_BDP_ALGS];
+
+  /** Congestion window */
+  uint64_t cwnd;
+
+  /** Number of cells in-flight (sent but awaiting SENDME ack). */
+  uint64_t inflight;
+
+  /**
+   * For steady-state: the number of sendme acks until we will acknowledge
+   * a congestion event again. It starts out as the number of sendme acks
+   * in a congestion window and is decremented each ack. When this reaches
+   * 0, it means we should examine our congestion algorithm conditions.
+   * In this way, we only react to one congestion event per congestion window.
+   *
+   * It is also reset to 0 immediately whenever the circuit's orconn is
+   * blocked, and when a previously blocked orconn is unblocked.
+   */
+  uint64_t next_cc_event;
+
+  /** Are we in slow start? */
+  bool in_slow_start;
+
+  /** Is the local channel blocked on us? That's a congestion signal */
+  bool blocked_chan;
+
+  /* The following parameters are cached from consensus values upon
+   * circuit setup. */
+
+  /** Percent of cwnd to increment by during slow start */
+  uint16_t cwnd_inc_pct_ss;
+
+  /** Number of cells to increment cwnd by during steady state */
+  uint16_t cwnd_inc;
+
+  /** Minimum congestion window (must be at least sendme_inc) */
+  uint16_t cwnd_min;
+
+  /**
+   * Number of times per congestion window to update based on congestion
+   * signals */
+  uint8_t cwnd_inc_rate;
+
+  /**
+   * Number of cwnd worth of sendme acks to smooth RTT and BDP with,
+   * using N-EWMA */
+  uint8_t ewma_cwnd_cnt;
+
+  /**
+   * Minimum number of sendmes before we begin BDP estimates
+   */
+  uint8_t bwe_sendme_min;
+
+  /**
+   * Number of cells to ack with every sendme. Taken from consensus parameter
+   * and negotiation during circuit setup. */
+  uint8_t sendme_inc;
+
+  /** Which congestion control algorithm to use. Taken from
+   * consensus parameter and negotiation during circuit setup. */
+  cc_alg_t cc_alg;
+
+  /** Which algorithm to estimate circuit bandwidth with. Taken from
+   * consensus parameter during circuit setup. */
+  bdp_alg_t bdp_alg;
+
+  /** Algorithm-specific parameters. The specific struct that is used
+   * depends upon the algorithm selected by the cc_alg parameter.
+   * These should not be accessed anywhere other than the algorithm-specific
+   * files. */
+  union {
+    struct westwood_params_t westwood_params;
+    struct vegas_params_t vegas_params;
+    struct nola_params_t nola_params;
+  };
+} congestion_control_t;
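Several of the fields above (ewma_rtt_usec, and the SENDME-rate BDP) are smoothed with the N-EWMA defined in congestion_control_common.c, so a small worked example may help. The helper below simply restates n_count_ewma() and is an editor's illustration, not part of the patch; the numbers use the consensus defaults from that file (cc_cwnd_init=500, cc_sendme_inc=50, cc_ewma_cwnd_cnt=2).

/* Restatement of n_count_ewma() for illustration: alpha = 2/(N+1). */
static inline uint64_t
example_n_ewma(uint64_t curr, uint64_t prev, uint64_t N)
{
  return prev ? (2*curr + (N-1)*prev)/(N+1) : curr;
}

/* With ewma_cwnd_cnt == 2 and roughly 10 SENDME acks per cwnd
 * (cwnd 500 / sendme_inc 50), N is 20. A single 100ms RTT sample then
 * moves a 50ms average by only about 5ms:
 *
 *   example_n_ewma(100000, 50000, 20) == (200000 + 19*50000)/21 == 54761
 */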
+
+/**
+ * Returns the number of sendme acks we will receive before we update cwnd.
+ *
+ * Congestion control literature recommends only one update of cwnd per
+ * cwnd worth of acks. However, we can also tune this to be more frequent
+ * by increasing the 'cc_cwnd_inc_rate' consensus parameter.
+ *
+ * If this returns 0 due to high cwnd_inc_rate, the calling code will
+ * update every sendme ack.
+ */
+static inline uint64_t CWND_UPDATE_RATE(const congestion_control_t *cc)
+{
+  /* We add cwnd_inc_rate*sendme_inc/2 to round to nearest integer number
+   * of acks */
+  return ((cc->cwnd + cc->cwnd_inc_rate*cc->sendme_inc/2)
+          / (cc->cwnd_inc_rate*cc->sendme_inc));
+}
+
+/**
+ * Returns the amount to increment the congestion window each update,
+ * during slow start.
+ *
+ * Congestion control literature recommends either doubling the cwnd
+ * every cwnd during slow start, or some similar exponential growth
+ * (such as 50% more every cwnd, for Vegas).
+ *
+ * This is controlled by the consensus parameter 'cc_cwnd_inc_pct_ss', which
+ * allows us to specify the percent of the current congestion window
+ * to update by.
+ */
+static inline uint64_t CWND_INC_SS(const congestion_control_t *cc)
+{
+  return (cc->cwnd_inc_pct_ss*cc->cwnd/100);
+}
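A quick worked example, using the consensus defaults from congestion_control_common.c (cc_cwnd_init=500, cc_sendme_inc=50, cc_cwnd_inc_rate=1, cc_cwnd_inc_pct_ss=100), shows how these two helpers behave at circuit setup; this is an editor's note, not part of the patch.

/* Worked example at the defaults above (integer division throughout):
 *
 *   CWND_UPDATE_RATE(cc) == (500 + 1*50/2) / (1*50) == 10
 *     -> cwnd is re-evaluated once every 10 SENDME acks, i.e. roughly
 *        once per congestion window.
 *
 *   CWND_INC_SS(cc) == 100 * 500 / 100 == 500
 *     -> in slow start, each update adds a full cwnd worth of cells,
 *        which with one update per cwnd amounts to doubling per cwnd,
 *        matching the doc comment above.
 */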
+
+/**
+ * Returns the amount to increment (and for Vegas, also decrement) the
+ * congestion window by, every update period.
+ *
+ * This is controlled by the cc_cwnd_inc consensus parameter.
+ */
+#define CWND_INC(cc)           ((cc)->cwnd_inc)
+
+#endif /* !defined(CONGESTION_CONTROL_ST_H) */
diff --git a/src/core/or/include.am b/src/core/or/include.am
index b578b75673..6d2b73d03c 100644
--- a/src/core/or/include.am
+++ b/src/core/or/include.am
@@ -35,6 +35,7 @@ LIBTOR_APP_A_SOURCES += \
 	src/core/or/scheduler_kist.c		\
 	src/core/or/scheduler_vanilla.c		\
 	src/core/or/sendme.c			\
+	src/core/or/congestion_control_common.c	\
 	src/core/or/status.c			\
 	src/core/or/versions.c
 
@@ -97,6 +98,7 @@ noinst_HEADERS += \
 	src/core/or/relay_crypto_st.h		\
 	src/core/or/scheduler.h			\
 	src/core/or/sendme.h			\
+	src/core/or/congestion_control_common.h	\
 	src/core/or/server_port_cfg_st.h	\
 	src/core/or/socks_request_st.h		\
 	src/core/or/status.h			\
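Since the TOR_UNIT_TESTS block in congestion_control_common.h is still empty, here is a sketch of the kind of regression test that could eventually back it, using the tt_* macros from Tor's existing test framework. The test name and assertions are illustrative assumptions, not part of this patch, and it presumes no consensus is available so that networkstatus_get_param() returns the compiled-in defaults; suite registration and includes are omitted.

/* Illustrative unit-test sketch only; requires congestion_control_common.h
 * and congestion_control_st.h, plus the usual test harness boilerplate. */
static void
test_congestion_control_defaults(void *arg)
{
  congestion_control_t *cc = NULL;
  (void)arg;

  cc = congestion_control_new();
  tt_assert(cc);

  /* With no consensus, the clamped defaults from
   * congestion_control_init_params() should apply. */
  tt_u64_op(cc->cwnd, OP_GE, 100);
  tt_u64_op(cc->cwnd, OP_LE, 10000);
  tt_int_op(cc->sendme_inc, OP_EQ, 50);
  tt_assert(cc->in_slow_start);

 done:
  congestion_control_free(cc);
}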