Prop#324: Common RTT, BDP, and blocked channel signal support

2024-11-11 05:33:47 +01:00 · 2021-06-10 23:08:24 +00:00 · 2021-06-10 23:08:24 +00:00 · f1d0c2d826
commit f1d0c2d826
parent 4f68fe3e6c
4 changed files with 1247 additions and 0 deletions
--- a/src/core/or/congestion_control_common.c
+++ b/src/core/or/congestion_control_common.c
@ -0,0 +1,933 @@
+/* Copyright (c) 2021, The Tor Project, Inc. */
+/* See LICENSE for licensing information */
+
+/**
+ * \file congestion_control_common.c
+ * \brief Common code used by all congestion control algorithms.
+ */
+
+#define TOR_CONGESTION_CONTROL_COMMON_PRIVATE
+
+#include "core/or/or.h"
+
+#include "core/or/circuitlist.h"
+#include "core/or/crypt_path.h"
+#include "core/or/or_circuit_st.h"
+#include "core/or/origin_circuit_st.h"
+#include "core/or/channel.h"
+#include "core/mainloop/connection.h"
+#include "core/or/sendme.h"
+#include "core/or/congestion_control_common.h"
+#include "core/or/congestion_control_vegas.h"
+#include "core/or/congestion_control_nola.h"
+#include "core/or/congestion_control_westwood.h"
+#include "core/or/congestion_control_st.h"
+#include "lib/time/compat_time.h"
+#include "feature/nodelist/networkstatus.h"
+
+/* Consensus parameter defaults */
+/* Default for "cc_cwnd_init", which seeds cc->cwnd. */
+#define CIRCWINDOW_INIT (500)
+
+/* Default for "cc_cwnd_inc_pct_ss" (stored in cc->cwnd_inc_pct_ss). */
+#define CWND_INC_PCT_SS_DFLT (100)
+
+/* Sendme increment handed to congestion_control_init() by
+ * congestion_control_new(); it becomes the default for "cc_sendme_inc". */
+#define SENDME_INC_DFLT  (50)
+/* Default for "cc_cwnd_min"; by construction never below SENDME_INC_DFLT. */
+#define CWND_MIN_DFLT    (MAX(100, SENDME_INC_DFLT))
+
+/* Default for "cc_cwnd_inc" (stored in cc->cwnd_inc). */
+#define CWND_INC_DFLT (50)
+
+/* Default for "cc_cwnd_inc_rate" (stored in cc->cwnd_inc_rate). */
+#define CWND_INC_RATE_DFLT (1)
+
+/* Per-algorithm defaults for "cc_bdp_alg"; which one applies is chosen from
+ * cc->cc_alg in congestion_control_init_params(). */
+#define WESTWOOD_BDP_ALG BDP_ALG_PIECEWISE
+#define VEGAS_BDP_MIX_ALG BDP_ALG_PIECEWISE
+#define NOLA_BDP_ALG BDP_ALG_PIECEWISE
+
+/* Default for "cc_ewma_cwnd_cnt": multiplied by the number of SENDME acks
+ * per cwnd to get the N used by n_count_ewma(). */
+#define EWMA_CWND_COUNT_DFLT  2
+
+/* Default for "cc_bwe_min": how many SENDME arrival timestamps must be
+ * queued before the SENDME-rate BDP estimate is computed. */
+#define BWE_SENDME_MIN_DFLT   5
+
+/* Forward declarations of the static RTT and BDP estimators defined further
+ * down; congestion_control_update_circuit_estimates() calls both, RTT
+ * first. */
+static uint64_t congestion_control_update_circuit_rtt(congestion_control_t *,
+                                                      uint64_t);
+static bool congestion_control_update_circuit_bdp(congestion_control_t *,
+                                                  const circuit_t *,
+                                                  const crypt_path_t *,
+                                                  uint64_t, uint64_t);
+
+/**
+ * Fill in the tunable fields of <b>cc</b> from the consensus.
+ *
+ * Every field is read with networkstatus_get_param(), giving it a default
+ * and a lower and upper limit.
+ *
+ * cc_alg is the negotiated congestion control algorithm; it is the default
+ * for the "cc_alg" parameter.
+ *
+ * sendme_inc is the number of packaged cells that a sendme cell acks. It
+ * will come from circuit negotiation, and is the default for the
+ * "cc_sendme_inc" parameter.
+ *
+ * If the resulting algorithm is CC_ALG_SENDME or unrecognized, this hits
+ * tor_fragile_assert() and returns before any BDP or algorithm-specific
+ * parameter is set.
+ */
+static void
+congestion_control_init_params(congestion_control_t *cc,
+                               cc_alg_t cc_alg,
+                               int sendme_inc)
+{
+/* Lower and upper limits handed to networkstatus_get_param() below. */
+#define CWND_INIT_MIN 100
+#define CWND_INIT_MAX (10000)
+#define CWND_INC_PCT_SS_MIN 1
+#define CWND_INC_PCT_SS_MAX (500)
+#define CWND_INC_MIN 1
+#define CWND_INC_MAX (1000)
+#define CWND_INC_RATE_MIN 1
+#define CWND_INC_RATE_MAX (250)
+#define SENDME_INC_MIN 10
+#define SENDME_INC_MAX (1000)
+  // XXX: this min needs to abide by sendme_inc range rules somehow
+#define CWND_MIN_MIN sendme_inc
+#define CWND_MIN_MAX (1000)
+#define EWMA_CWND_COUNT_MIN 1
+#define EWMA_CWND_COUNT_MAX (100)
+#define BWE_SENDME_MIN_MIN 2
+#define BWE_SENDME_MIN_MAX (20)
+#define CC_ALG_MIN 0
+#define CC_ALG_MAX (NUM_CC_ALGS-1)
+
+  bdp_alg_t bdp_alg_dflt = 0;
+
+  /* Parameters shared by every algorithm, read in a fixed order. */
+  cc->cwnd = networkstatus_get_param(NULL, "cc_cwnd_init",
+                                     CIRCWINDOW_INIT,
+                                     CWND_INIT_MIN, CWND_INIT_MAX);
+
+  cc->cwnd_inc_pct_ss = networkstatus_get_param(NULL, "cc_cwnd_inc_pct_ss",
+                                                CWND_INC_PCT_SS_DFLT,
+                                                CWND_INC_PCT_SS_MIN,
+                                                CWND_INC_PCT_SS_MAX);
+
+  cc->cwnd_inc = networkstatus_get_param(NULL, "cc_cwnd_inc",
+                                         CWND_INC_DFLT,
+                                         CWND_INC_MIN, CWND_INC_MAX);
+
+  cc->cwnd_inc_rate = networkstatus_get_param(NULL, "cc_cwnd_inc_rate",
+                                              CWND_INC_RATE_DFLT,
+                                              CWND_INC_RATE_MIN,
+                                              CWND_INC_RATE_MAX);
+
+  cc->sendme_inc = networkstatus_get_param(NULL, "cc_sendme_inc",
+                                           sendme_inc,
+                                           SENDME_INC_MIN, SENDME_INC_MAX);
+
+  cc->cwnd_min = networkstatus_get_param(NULL, "cc_cwnd_min",
+                                         CWND_MIN_DFLT,
+                                         CWND_MIN_MIN, CWND_MIN_MAX);
+
+  cc->ewma_cwnd_cnt = networkstatus_get_param(NULL, "cc_ewma_cwnd_cnt",
+                                              EWMA_CWND_COUNT_DFLT,
+                                              EWMA_CWND_COUNT_MIN,
+                                              EWMA_CWND_COUNT_MAX);
+
+  cc->bwe_sendme_min = networkstatus_get_param(NULL, "cc_bwe_min",
+                                               BWE_SENDME_MIN_DFLT,
+                                               BWE_SENDME_MIN_MIN,
+                                               BWE_SENDME_MIN_MAX);
+
+  cc->cc_alg = networkstatus_get_param(NULL, "cc_alg",
+                                       cc_alg,
+                                       CC_ALG_MIN, CC_ALG_MAX);
+
+  /* Pick the BDP estimator that the chosen algorithm defaults to. */
+  if (cc->cc_alg == CC_ALG_WESTWOOD) {
+    bdp_alg_dflt = WESTWOOD_BDP_ALG;
+  } else if (cc->cc_alg == CC_ALG_VEGAS) {
+    bdp_alg_dflt = VEGAS_BDP_MIX_ALG;
+  } else if (cc->cc_alg == CC_ALG_NOLA) {
+    bdp_alg_dflt = NOLA_BDP_ALG;
+  } else {
+    /* CC_ALG_SENDME, or a value we do not recognize */
+    tor_fragile_assert();
+    return; // No alg-specific params
+  }
+
+  cc->bdp_alg = networkstatus_get_param(NULL, "cc_bdp_alg",
+                                        bdp_alg_dflt,
+                                        0, NUM_BDP_ALGS-1);
+
+  /* Algorithm-specific parameters */
+  switch (cc->cc_alg) {
+    case CC_ALG_WESTWOOD:
+      congestion_control_westwood_set_params(cc);
+      break;
+    case CC_ALG_VEGAS:
+      congestion_control_vegas_set_params(cc);
+      break;
+    case CC_ALG_NOLA:
+      congestion_control_nola_set_params(cc);
+      break;
+    case CC_ALG_SENDME:
+    default:
+      /* Already returned above for these values. */
+      break;
+  }
+}
+
+/**
+ * Set up the fields of an already-allocated congestion control object:
+ * create its two timestamp lists, mark it as being in slow start, load its
+ * parameters, and schedule its first congestion control event.
+ *
+ * cc_alg is the negotiated congestion control algorithm.
+ *
+ * sendme_inc is the number of packaged cells that a sendme cell
+ * acks. This parameter will come from circuit negotiation.
+ */
+static void
+congestion_control_init(congestion_control_t *cc, cc_alg_t cc_alg,
+                        int sendme_inc)
+{
+  /* One queue for SENDME arrival times, one for the send times of cells
+   * that are still waiting for their SENDME. */
+  cc->sendme_arrival_timestamps = smartlist_new();
+  cc->sendme_pending_timestamps = smartlist_new();
+
+  cc->in_slow_start = 1;
+
+  congestion_control_init_params(cc, cc_alg, sendme_inc);
+
+  /* Evaluated on cc only after its parameters have been loaded. */
+  cc->next_cc_event = CWND_UPDATE_RATE(cc);
+}
+
+/** Return a newly allocated, zero-filled congestion control object that has
+ * been initialized with CC_ALG_VEGAS and SENDME_INC_DFLT. */
+congestion_control_t *
+congestion_control_new(void)
+{
+  congestion_control_t *new_cc = tor_malloc_zero(sizeof(*new_cc));
+
+  // XXX: the alg and the sendme_inc need to be negotiated during
+  // circuit handshake
+  congestion_control_init(new_cc, CC_ALG_VEGAS, SENDME_INC_DFLT);
+
+  return new_cc;
+}
+
+/**
+ * Release a congestion control object together with both of its timestamp
+ * lists and every timestamp still queued on them. A NULL <b>cc</b> is
+ * ignored.
+ */
+void
+congestion_control_free_(congestion_control_t *cc)
+{
+  if (cc == NULL) {
+    return;
+  }
+
+  /* Each list element is an individually allocated uint64_t; free the
+   * elements before freeing the list that holds them. */
+  SMARTLIST_FOREACH(cc->sendme_pending_timestamps, uint64_t *, ts,
+                    tor_free(ts));
+  smartlist_free(cc->sendme_pending_timestamps);
+
+  SMARTLIST_FOREACH(cc->sendme_arrival_timestamps, uint64_t *, ts,
+                    tor_free(ts));
+  smartlist_free(cc->sendme_arrival_timestamps);
+
+  tor_free(cc);
+}
+
+/**
+ * Compute an N-count EWMA, aka N-EWMA. N-EWMA is defined as:
+ *  EWMA = alpha*value + (1-alpha)*EWMA_prev
+ * with alpha = 2/(N+1).
+ *
+ * This works out to:
+ *  EWMA = value*2/(N+1) + EWMA_prev*(N-1)/(N+1)
+ *       = (value*2 + EWMA_prev*(N-1))/(N+1)
+ *
+ * - curr is the newest sample.
+ * - prev is the previous EWMA; 0 means "no history yet".
+ * - N is the sample count the average is weighted over.
+ *
+ * Returns curr unchanged when there is no history (prev == 0) or when N is
+ * 0. N == 0 has no meaningful alpha, and evaluating the formula with it
+ * would make the unsigned (N-1) wrap around; it is treated like N == 1,
+ * which also yields curr.
+ */
+static inline uint64_t
+n_count_ewma(uint64_t curr, uint64_t prev, uint64_t N)
+{
+  if (prev == 0 || N == 0)
+    return curr;
+
+  return (2*curr + (N-1)*prev)/(N+1);
+}
+
+/**
+ * Append a u64 timestamp to the tail of a queue of timestamps. The value is
+ * stored in a freshly allocated uint64_t that the queue then holds a
+ * pointer to.
+ */
+static inline void
+enqueue_timestamp(smartlist_t *timestamps_u64, uint64_t timestamp_usec)
+{
+  uint64_t *slot = tor_malloc(sizeof(*slot));
+
+  *slot = timestamp_usec;
+  smartlist_add(timestamps_u64, slot);
+}
+
+/**
+ * Peek at the head of a smartlist queue of u64 timestamps.
+ *
+ * Returns the oldest timestamp without removing it. If the queue is empty
+ * (or its head is a NULL pointer), this is reported as a bug and 0 is
+ * returned.
+ */
+static inline uint64_t
+peek_timestamp(const smartlist_t *timestamps_u64_usecs)
+{
+  uint64_t *timestamp_ptr = NULL;
+
+  /* Only read slot 0 when the list really has an element, so that the
+   * emptiness check does not depend on what an out-of-range slot holds. */
+  if (smartlist_len(timestamps_u64_usecs) > 0) {
+    timestamp_ptr = smartlist_get(timestamps_u64_usecs, 0);
+  }
+
+  if (BUG(!timestamp_ptr)) {
+    log_err(LD_CIRC, "Congestion control timestamp list became empty!");
+    return 0;
+  }
+
+  return *timestamp_ptr;
+}
+
+/**
+ * Dequeue a u64 monotime usec timestamp from the front of a
+ * smartlist of pointers to u64.
+ *
+ * The remaining entries keep their order and the storage of the removed
+ * entry is freed. If the queue is empty (or its head is a NULL pointer),
+ * this is reported as a bug, the queue is left untouched, and 0 is
+ * returned.
+ */
+static inline uint64_t
+dequeue_timestamp(smartlist_t *timestamps_u64_usecs)
+{
+  uint64_t *timestamp_ptr = NULL;
+  uint64_t timestamp_u64;
+
+  /* Only read slot 0 when the list really has an element, so that the
+   * emptiness check does not depend on what an out-of-range slot holds. */
+  if (smartlist_len(timestamps_u64_usecs) > 0) {
+    timestamp_ptr = smartlist_get(timestamps_u64_usecs, 0);
+  }
+
+  if (BUG(!timestamp_ptr)) {
+    log_err(LD_CIRC, "Congestion control timestamp list became empty!");
+    return 0;
+  }
+
+  timestamp_u64 = *timestamp_ptr;
+  smartlist_del_keeporder(timestamps_u64_usecs, 0);
+  tor_free(timestamp_ptr);
+
+  return timestamp_u64;
+}
+
+/**
+ * Returns the number of sendme acks that will be received in the
+ * current congestion window size, rounded to nearest int.
+ */
+static inline uint64_t
+sendme_acks_per_cwnd(const congestion_control_t *cc)
+{
+  /* Adding half a sendme_inc before dividing turns the truncating integer
+   * division into round-to-nearest. */
+  const uint64_t cwnd_plus_half_inc = cc->cwnd + cc->sendme_inc/2;
+
+  return cwnd_plus_half_inc/cc->sendme_inc;
+}
+
+/**
+ * Get a package window from either old sendme logic, or congestion control.
+ *
+ * A package window is how many cells you can still send.
+ *
+ * The state is taken from <b>cpath</b> when it is non-NULL, and from
+ * <b>circ</b> otherwise. Without a congestion control object the stored
+ * package_window is returned; with one, the result is cwnd minus inflight,
+ * limited to the range [0, INT32_MAX].
+ */
+int
+congestion_control_get_package_window(const circuit_t *circ,
+                                      const crypt_path_t *cpath)
+{
+  const congestion_control_t *cc;
+
+  tor_assert(circ);
+
+  cc = cpath ? cpath->ccontrol : circ->ccontrol;
+
+  /* Old-style fixed windows */
+  if (!cc) {
+    return cpath ? cpath->package_window : circ->package_window;
+  }
+
+  /* Inflight can be above cwnd if cwnd was just reduced */
+  if (cc->inflight > cc->cwnd) {
+    return 0;
+  }
+
+  /* In the extremely unlikely event that cwnd-inflight is larger than
+   * INT32_MAX, just return that cap, so old code doesn't explode. */
+  if (cc->cwnd - cc->inflight > INT32_MAX) {
+    return INT32_MAX;
+  }
+
+  return (int)(cc->cwnd - cc->inflight);
+}
+
+/**
+ * Returns the number of cells that are acked by every sendme.
+ *
+ * This is the sendme_inc of the congestion control object on
+ * <b>layer_hint</b> (if non-NULL) or else on <b>circ</b>; when there is no
+ * such object it is CIRCWINDOW_INCREMENT.
+ */
+int
+sendme_get_inc_count(const circuit_t *circ, const crypt_path_t *layer_hint)
+{
+  const congestion_control_t *cc =
+    layer_hint ? layer_hint->ccontrol : circ->ccontrol;
+
+  if (cc) {
+    return cc->sendme_inc;
+  }
+
+  return CIRCWINDOW_INCREMENT;
+}
+
+/** Return true iff the next cell we send will result in the other endpoint
+ * sending a SENDME.
+ *
+ * With congestion control, that is the case when inflight is non-zero and
+ * inflight plus the cell about to be sent is a multiple of the circuit's
+ * sendme_inc. Without it, that is the case when the package window is not
+ * at CIRCWINDOW_START and the window minus one is a multiple of
+ * CIRCWINDOW_INCREMENT.
+ *
+ * This function is used when recording a cell digest and this is done quite
+ * low in the stack when decrypting or encrypting a cell. The window is only
+ * updated once the cell is actually put in the outbuf, so neither counter
+ * includes the cell being considered yet.
+ */
+bool
+circuit_sent_cell_for_sendme(const circuit_t *circ,
+                             const crypt_path_t *layer_hint)
+{
+  const congestion_control_t *cc;
+  int window;
+
+  tor_assert(circ);
+
+  cc = layer_hint ? layer_hint->ccontrol : circ->ccontrol;
+
+  /* If we are using congestion control, then use cc->inflight to
+   * determine when sendmes will be sent. The +1 is there because this
+   * function is called *before* inflight is incremented for the sent
+   * cell. */
+  if (cc) {
+    return cc->inflight != 0 &&
+           ((cc->inflight+1) % cc->sendme_inc) == 0;
+  }
+
+  window = layer_hint ? layer_hint->package_window : circ->package_window;
+
+  /* At the start of the window, no SENDME will be expected. Otherwise we
+   * test the window minus 1: the window (either package or deliver) hasn't
+   * been decremented just yet, so we are currently processing the
+   * "window - 1" cell. */
+  return window != CIRCWINDOW_START &&
+         ((window - 1) % CIRCWINDOW_INCREMENT) == 0;
+}
+
+/**
+ * Call-in to tell congestion control code that this circuit sent a cell.
+ *
+ * This increments the 'inflight' counter. If the cell is one that will
+ * cause the other end to send a SENDME, the current monotime is also
+ * appended to the list of pending timestamps, so that the circuit RTT can
+ * be computed when that SENDME comes back. */
+void
+congestion_control_note_cell_sent(congestion_control_t *cc,
+                                  const circuit_t *circ,
+                                  const crypt_path_t *cpath)
+{
+  bool triggers_sendme;
+
+  tor_assert(circ);
+  tor_assert(cc);
+
+  /* This has to be asked *before* we account for the sent cell, because
+   * the check itself adds one for the cell being sent. */
+  triggers_sendme = circuit_sent_cell_for_sendme(circ, cpath);
+
+  cc->inflight++;
+
+  if (triggers_sendme) {
+    /* Record this cell time for RTT computation when SENDME arrives */
+    enqueue_timestamp(cc->sendme_pending_timestamps,
+                      monotime_absolute_usec());
+  }
+}
+
+/**
+ * Returns 1 if any edge connection on the circuit may still have data to
+ * send, 0 otherwise.
+ *
+ * We need to know this so that we can stop computing BDP if the
+ * edges are not sending on the circuit.
+ *
+ * Streams marked for close are skipped, and when <b>layer_hint</b> is
+ * non-NULL only streams attached to that hop are considered. A stream
+ * counts as active if it, or its linked connection, has bytes in its inbuf
+ * or has not yet reached EOF.
+ */
+static int
+circuit_has_active_streams(const circuit_t *circ,
+                           const crypt_path_t *layer_hint)
+{
+  const edge_connection_t *conn;
+
+  /* Origin circuits keep their streams on p_streams, others on n_streams */
+  if (CIRCUIT_IS_ORIGIN(circ)) {
+    conn = CONST_TO_ORIGIN_CIRCUIT(circ)->p_streams;
+  } else {
+    conn = CONST_TO_OR_CIRCUIT(circ)->n_streams;
+  }
+
+  /* Walk the linked list of streams */
+  for (; conn != NULL; conn = conn->next_stream) {
+    connection_t *linked;
+
+    if (conn->base_.marked_for_close) {
+      continue;
+    }
+
+    /* Not on the hop we were asked about */
+    if (layer_hint && conn->cpath_layer != layer_hint) {
+      continue;
+    }
+
+    if (connection_get_inbuf_len(TO_CONN(conn)) > 0) {
+      log_info(LD_CIRC, "CC: More in edge inbuf...");
+      return 1;
+    }
+
+    /* If we did not reach EOF on this read, there's more */
+    if (!TO_CONN(conn)->inbuf_reached_eof) {
+      log_info(LD_CIRC, "CC: More on edge conn...");
+      return 1;
+    }
+
+    linked = TO_CONN(conn)->linked_conn;
+    if (!linked) {
+      continue;
+    }
+
+    if (connection_get_inbuf_len(linked) > 0) {
+      log_info(LD_CIRC, "CC: More in linked inbuf...");
+      return 1;
+    }
+
+    /* If there is a linked conn, and *it* did not reach EOF,
+     * there's more */
+    if (!linked->inbuf_reached_eof) {
+      log_info(LD_CIRC, "CC: More on linked conn...");
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+/**
+ * Upon receipt of a SENDME, refresh the circuit's RTT and BDP estimates
+ * using a single reading of the monotonic clock. The RTT update pops the
+ * oldest timestamp off the pending timestamp list.
+ *
+ * Returns true if circuit estimates were successfully updated, false
+ * otherwise.
+ */
+bool
+congestion_control_update_circuit_estimates(congestion_control_t *cc,
+                                            const circuit_t *circ,
+                                            const crypt_path_t *layer_hint)
+{
+  const uint64_t now = monotime_absolute_usec();
+
+  /* RTT goes first: the BDP update is handed the RTT measured here (0 if
+   * none could be measured). */
+  const uint64_t rtt_usec = congestion_control_update_circuit_rtt(cc, now);
+
+  return congestion_control_update_circuit_bdp(cc, circ, layer_hint, now,
+                                               rtt_usec);
+}
+
+/**
+ * Returns true if we have enough time data to use heuristics
+ * to compare RTT to a baseline.
+ *
+ * That is the case once either of these holds:
+ *  - we have exited slow start, by which point we should have processed
+ *    at least a cwnd worth of RTTs; or
+ *  - we have a non-zero SENDME-rate BDP estimate, which takes at least
+ *    'cc_bwe_min' acks to produce.
+ * Otherwise there is not enough data to estimate clock jumps.
+ */
+static bool
+time_delta_should_use_heuristics(const congestion_control_t *cc)
+{
+  return !cc->in_slow_start || cc->bdp[BDP_ALG_SENDME_RATE] != 0;
+}
+
+/**
+ * Returns true if the monotime delta is 0, or is significantly
+ * different than the previous delta. Either case indicates
+ * that the monotime time source stalled or jumped.
+ *
+ * "Significantly different" means that one delta is more than
+ * DELTA_DISCREPENCY_RATIO_MAX times the other. That comparison is only
+ * made when there is a previous delta and
+ * time_delta_should_use_heuristics() says there is enough data.
+ */
+static bool
+time_delta_stalled_or_jumped(const congestion_control_t *cc,
+                             uint64_t old_delta, uint64_t new_delta)
+{
+#define DELTA_DISCREPENCY_RATIO_MAX 100
+  /* A new_delta of 0 is definitely a monotime stall */
+  if (new_delta == 0) {
+    static ratelim_t stall_info_limit = RATELIM_INIT(60);
+    log_fn_ratelim(&stall_info_limit, LOG_INFO, LD_CIRC,
+           "Congestion control cannot measure RTT due to monotime stall.");
+    return true;
+  }
+
+  /* With no previous value (old_delta of 0), the non-zero new value is
+   * taken as valid. The ratio checks below also need at least a few
+   * timestamps to average out earlier partial stalls or jumps; until that
+   * point the time values are declared "good enough to use". */
+  if (old_delta == 0 || !time_delta_should_use_heuristics(cc)) {
+    return false;
+  }
+
+  /* old_delta much larger than new_delta: the monotime clock recently
+   * stopped moving forward. */
+  if (old_delta > new_delta * DELTA_DISCREPENCY_RATIO_MAX) {
+    static ratelim_t shrink_notice_limit = RATELIM_INIT(300);
+    log_fn_ratelim(&shrink_notice_limit, LOG_NOTICE, LD_CIRC,
+           "Sudden decrease in circuit RTT (%"PRIu64" vs %"PRIu64
+           "), likely due to clock jump.",
+           new_delta/1000, old_delta/1000);
+
+    return true;
+  }
+
+  /* new_delta much larger than old_delta: the monotime clock suddenly
+   * jumped forward. */
+  if (new_delta > old_delta * DELTA_DISCREPENCY_RATIO_MAX) {
+    static ratelim_t growth_notice_limit = RATELIM_INIT(300);
+    log_fn_ratelim(&growth_notice_limit, LOG_NOTICE, LD_CIRC,
+           "Sudden increase in circuit RTT (%"PRIu64" vs %"PRIu64
+           "), likely due to clock jump.",
+           new_delta/1000, old_delta/1000);
+
+    return true;
+  }
+
+  return false;
+}
+
+/**
+ * Called when we get a SENDME. Updates circuit RTT by pulling the oldest
+ * entry off the queue of pending timestamps (each recorded when we sent a
+ * cell that triggers a SENDME), and comparing that to current time.
+ *
+ * Also updates min, max, and EWMA of RTT.
+ *
+ * Returns the current circuit RTT in usecs, or 0 if it could not be
+ * measured (due to clock jump, stall, etc). In that case none of the
+ * stored RTT values are changed.
+ */
+static uint64_t
+congestion_control_update_circuit_rtt(congestion_control_t *cc,
+                                      uint64_t now_usec)
+{
+  tor_assert(cc);
+
+  /* The time that we sent the cell that resulted in the other end
+   * sending this sendme gives us the RTT. */
+  const uint64_t sent_usec = dequeue_timestamp(cc->sendme_pending_timestamps);
+  const uint64_t rtt_usec = now_usec - sent_usec;
+
+  /* Do not update RTT at all if it looks fishy */
+  if (time_delta_stalled_or_jumped(cc, cc->ewma_rtt_usec, rtt_usec)) {
+    return 0;
+  }
+
+  /* Average over ewma_cwnd_cnt congestion windows worth of acks, but
+   * always over at least 2 samples. */
+  uint64_t n_samples = cc->ewma_cwnd_cnt*sendme_acks_per_cwnd(cc);
+  n_samples = MAX(n_samples, 2);
+
+  cc->ewma_rtt_usec = n_count_ewma(rtt_usec, cc->ewma_rtt_usec, n_samples);
+
+  /* A min of 0 means that no minimum has been recorded yet. */
+  if (cc->min_rtt_usec == 0 || rtt_usec < cc->min_rtt_usec) {
+    cc->min_rtt_usec = rtt_usec;
+  }
+
+  if (rtt_usec > cc->max_rtt_usec) {
+    cc->max_rtt_usec = rtt_usec;
+  }
+
+  return rtt_usec;
+}
+
+/**
+ * Called when we get a SENDME. Updates the bandwidth-delay-product (BDP)
+ * estimates of a circuit. Several methods of computing BDP are used,
+ * depending on scenario. While some congestion control algorithms only
+ * use one of these methods, we update them all because it's quick and easy.
+ *
+ * - circ and layer_hint identify the circuit (and, when non-NULL, the hop)
+ *   whose channel queue and streams are inspected.
+ * - now_usec is the current monotime in usecs.
+ * - curr_rtt_usec is the current circuit RTT in usecs. It may be 0 if no
+ *   RTT could be measured.
+ *
+ * Subtractions of the channel queue length or of inflight saturate at 0
+ * instead of wrapping around, and the SENDME-rate estimate is left alone
+ * when no time has elapsed across the sampled SENDMEs.
+ *
+ * Returns true if we were able to update BDP, false otherwise.
+ */
+static bool
+congestion_control_update_circuit_bdp(congestion_control_t *cc,
+                                      const circuit_t *circ,
+                                      const crypt_path_t *layer_hint,
+                                      uint64_t now_usec,
+                                      uint64_t curr_rtt_usec)
+{
+  int chan_q = 0;
+  unsigned int blocked_on_chan = 0;
+  uint64_t timestamp_usec;
+  uint64_t sendme_rate_bdp = 0;
+  uint64_t queued_cells;
+  uint64_t inflight_sent;
+
+  tor_assert(cc);
+
+  if (CIRCUIT_IS_ORIGIN(circ)) {
+    /* origin circs use n_chan */
+    chan_q = circ->n_chan_cells.n;
+    blocked_on_chan = circ->streams_blocked_on_n_chan;
+  } else {
+    /* Both onion services and exits use or_circuit and p_chan */
+    chan_q = CONST_TO_OR_CIRCUIT(circ)->p_chan_cells.n;
+    blocked_on_chan = circ->streams_blocked_on_p_chan;
+  }
+
+  /* Cells still sitting in our own channel queue are discounted from
+   * inflight. Clamp the unsigned subtraction: a queue longer than inflight
+   * must give 0, not a value that wrapped around to something huge. */
+  queued_cells = chan_q > 0 ? (uint64_t)chan_q : 0;
+  inflight_sent = cc->inflight > queued_cells ?
+                  cc->inflight - queued_cells : 0;
+
+  /* If we have no EWMA RTT, it is because monotime has been stalled
+   * or messed up the entire time so far. Set our BDP estimates directly
+   * to current cwnd */
+  if (!cc->ewma_rtt_usec) {
+    uint64_t cwnd = cc->cwnd;
+
+    /* If the channel is blocked, take the chan_q off the estimate, but
+     * never go below the min cwnd (nor below 0 on the way there). */
+    if (blocked_on_chan) {
+      cwnd = cwnd > queued_cells ? cwnd - queued_cells : 0;
+      cwnd = MAX(cwnd, cc->cwnd_min);
+      cc->blocked_chan = 1;
+    } else {
+      cc->blocked_chan = 0;
+    }
+
+    cc->bdp[BDP_ALG_CWND_RTT] = cwnd;
+    cc->bdp[BDP_ALG_INFLIGHT_RTT] = cwnd;
+    cc->bdp[BDP_ALG_SENDME_RATE] = cwnd;
+    cc->bdp[BDP_ALG_PIECEWISE] = cwnd;
+
+    static ratelim_t dec_notice_limit = RATELIM_INIT(300);
+    log_fn_ratelim(&dec_notice_limit, LOG_NOTICE, LD_CIRC,
+           "Our clock has been stalled for the entire lifetime of a circuit. "
+           "Performance may be sub-optimal.");
+
+    return blocked_on_chan;
+  }
+
+  /* Congestion window based BDP will respond to changes in RTT only, and is
+   * relative to cwnd growth. It is useful for correcting for BDP
+   * overestimation, but if BDP is higher than the current cwnd, it will
+   * underestimate it.
+   *
+   * We multiply here first to avoid precision issues from min_RTT being
+   * close to ewma RTT. Since all fields are u64, there is plenty of
+   * room here to multiply first.
+   */
+  cc->bdp[BDP_ALG_CWND_RTT] = cc->cwnd*cc->min_rtt_usec/cc->ewma_rtt_usec;
+
+  /*
+   * If we have no pending streams, we do not have enough data to fill
+   * the BDP, so preserve our old estimates but do not make any more.
+   */
+  if (!blocked_on_chan && !circuit_has_active_streams(circ, layer_hint)) {
+    /* Inflight can be above cwnd if cwnd was just reduced; report a spare
+     * window of 0 then, rather than a wrapped-around value. */
+    log_info(LD_CIRC,
+               "CC: Streams drained. Spare package window: %"PRIu64
+               ", no BDP update",
+               cc->cwnd > cc->inflight ? cc->cwnd - cc->inflight : 0);
+
+    /* Clear SENDME timestamps; they will be wrong with intermittent data */
+    SMARTLIST_FOREACH(cc->sendme_arrival_timestamps, uint64_t *, t,
+                      tor_free(t));
+    smartlist_clear(cc->sendme_arrival_timestamps);
+  } else if (curr_rtt_usec) {
+    /* Sendme-based BDP will quickly measure BDP in much less than
+     * a cwnd worth of data when in use (in 2-10 SENDMEs).
+     *
+     * But if the link goes idle, it will be vastly lower than true BDP. Hence
+     * we only compute it if we have either pending stream data, or streams
+     * are still blocked on the channel queued data.
+     *
+     * We also do not compute it if we do not have a current RTT passed in,
+     * because that means that monotime is currently stalled or just jumped.
+     */
+    enqueue_timestamp(cc->sendme_arrival_timestamps, now_usec);
+
+    if (smartlist_len(cc->sendme_arrival_timestamps) >= cc->bwe_sendme_min) {
+      /* If we have more sendmes than fit in a cwnd, trim the list.
+       * Those are not accurately measuring throughput, if cwnd is
+       * currently smaller than BDP */
+      while (smartlist_len(cc->sendme_arrival_timestamps) >
+             cc->bwe_sendme_min &&
+             (uint64_t)smartlist_len(cc->sendme_arrival_timestamps) >
+                       sendme_acks_per_cwnd(cc)) {
+        (void)dequeue_timestamp(cc->sendme_arrival_timestamps);
+      }
+      int sendme_cnt = smartlist_len(cc->sendme_arrival_timestamps);
+
+      /* Calculate SENDME_BWE_COUNT pure average */
+      timestamp_usec = peek_timestamp(cc->sendme_arrival_timestamps);
+      uint64_t delta = now_usec - timestamp_usec;
+
+      /* The oldest and newest sampled SENDME can carry the same timestamp
+       * (for instance if the clock did not advance between them). There is
+       * no rate to compute then, and dividing by delta would be a division
+       * by zero, so keep the previous SENDME-rate estimate. */
+      if (delta > 0) {
+        /* The acked data is in sendme_cnt-1 chunks, because we are counting
+         * the data that is processed by the other endpoint *between* all of
+         * these sendmes. There's one less gap between the sendmes than the
+         * number of sendmes. */
+        uint64_t cells = (sendme_cnt-1)*cc->sendme_inc;
+
+        /* The bandwidth estimate is cells/delta, which when multiplied
+         * by min RTT obtains the BDP. However, we multiply first to
+         * avoid precision issues with the RTT being close to delta in
+         * size. */
+        sendme_rate_bdp = cells*cc->min_rtt_usec/delta;
+
+        /* Calculate BDP_EWMA_COUNT N-EWMA */
+        cc->bdp[BDP_ALG_SENDME_RATE] =
+                   n_count_ewma(sendme_rate_bdp, cc->bdp[BDP_ALG_SENDME_RATE],
+                                cc->ewma_cwnd_cnt*sendme_acks_per_cwnd(cc));
+      }
+    }
+
+    /* In-flight BDP will cause the cwnd to drift down when underutilized.
+     * It is most useful when the local OR conn is blocked, so we only
+     * compute it if we're utilized. */
+    cc->bdp[BDP_ALG_INFLIGHT_RTT] =
+        inflight_sent*cc->min_rtt_usec/
+                              MAX(cc->ewma_rtt_usec, curr_rtt_usec);
+  } else {
+    /* We can still update inflight with just an EWMA RTT, but only
+     * if there is data flowing */
+    cc->bdp[BDP_ALG_INFLIGHT_RTT] =
+        inflight_sent*cc->min_rtt_usec/cc->ewma_rtt_usec;
+  }
+
+  /* The orconn is blocked; use smaller of inflight vs SENDME */
+  if (blocked_on_chan) {
+    log_info(LD_CIRC, "CC: Streams blocked on circ channel. Chanq: %d",
+             chan_q);
+
+    /* A blocked channel is an immediate congestion signal, but it still
+     * happens only once per cwnd */
+    if (!cc->blocked_chan) {
+      cc->next_cc_event = 0;
+      cc->blocked_chan = 1;
+    }
+
+    if (cc->bdp[BDP_ALG_SENDME_RATE]) {
+      cc->bdp[BDP_ALG_PIECEWISE] = MIN(cc->bdp[BDP_ALG_INFLIGHT_RTT],
+                                      cc->bdp[BDP_ALG_SENDME_RATE]);
+    } else {
+      cc->bdp[BDP_ALG_PIECEWISE] = cc->bdp[BDP_ALG_INFLIGHT_RTT];
+    }
+  } else {
+    /* If we were previously blocked, emit a new congestion event
+     * now that we are unblocked, to re-evaluate cwnd */
+    if (cc->blocked_chan) {
+      cc->blocked_chan = 0;
+      cc->next_cc_event = 0;
+      log_info(LD_CIRC, "CC: Streams un-blocked on circ channel. Chanq: %d",
+               chan_q);
+    }
+
+    cc->bdp[BDP_ALG_PIECEWISE] = MAX(cc->bdp[BDP_ALG_SENDME_RATE],
+                                     cc->bdp[BDP_ALG_CWND_RTT]);
+  }
+
+  /* We can end up with no piecewise value if we didn't have either
+   * a SENDME estimate or enough data for an inflight estimate.
+   * It also happens on the very first sendme, since we need two
+   * to get a BDP. In these cases, use the cwnd method. */
+  if (!cc->bdp[BDP_ALG_PIECEWISE]) {
+    cc->bdp[BDP_ALG_PIECEWISE] = cc->bdp[BDP_ALG_CWND_RTT];
+    log_info(LD_CIRC, "CC: No piecewise BDP. Using %"PRIu64,
+             cc->bdp[BDP_ALG_PIECEWISE]);
+  }
+
+  if (cc->next_cc_event == 0) {
+    if (CIRCUIT_IS_ORIGIN(circ)) {
+      log_info(LD_CIRC,
+                 "CC: Circuit %d "
+                 "SENDME RTT: %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64", "
+                 "BDP estimates: "
+                 "%"PRIu64", "
+                 "%"PRIu64", "
+                 "%"PRIu64", "
+                 "%"PRIu64", "
+                 "%"PRIu64". ",
+               CONST_TO_ORIGIN_CIRCUIT(circ)->global_identifier,
+               cc->min_rtt_usec/1000,
+               curr_rtt_usec/1000,
+               cc->ewma_rtt_usec/1000,
+               cc->max_rtt_usec/1000,
+               cc->bdp[BDP_ALG_INFLIGHT_RTT],
+               cc->bdp[BDP_ALG_CWND_RTT],
+               sendme_rate_bdp,
+               cc->bdp[BDP_ALG_SENDME_RATE],
+               cc->bdp[BDP_ALG_PIECEWISE]
+               );
+    } else {
+      /* This is an or_circuit (exit or onion service side): the channel we
+       * queue on is p_chan, as used for chan_q above. n_chan may be NULL
+       * on such a circuit, so it must not be dereferenced here. */
+      const or_circuit_t *or_circ = CONST_TO_OR_CIRCUIT(circ);
+      log_info(LD_CIRC,
+                 "CC: Circuit %"PRIu64":%d "
+                 "SENDME RTT: %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64", "
+                 "%"PRIu64", "
+                 "%"PRIu64", "
+                 "%"PRIu64", "
+                 "%"PRIu64", "
+                 "%"PRIu64". ",
+                 or_circ->p_chan ? or_circ->p_chan->global_identifier : 0,
+                 or_circ->p_circ_id,
+                 cc->min_rtt_usec/1000,
+                 curr_rtt_usec/1000,
+                 cc->ewma_rtt_usec/1000,
+                 cc->max_rtt_usec/1000,
+                 cc->bdp[BDP_ALG_INFLIGHT_RTT],
+                 cc->bdp[BDP_ALG_CWND_RTT],
+                 sendme_rate_bdp,
+                 cc->bdp[BDP_ALG_SENDME_RATE],
+                 cc->bdp[BDP_ALG_PIECEWISE]
+                 );
+    }
+  }
+
+  /* We updated BDP this round if either we had a blocked channel, or
+   * the curr_rtt_usec was not 0. */
+  return (blocked_on_chan || curr_rtt_usec != 0);
+}
+
+/**
+ * Hand a received sendme to the process_sendme routine of the circuit's
+ * congestion control algorithm, and return that routine's result.
+ *
+ * CC_ALG_SENDME and unknown algorithm values trip an assertion.
+ */
+int
+congestion_control_dispatch_cc_alg(congestion_control_t *cc,
+                                   const circuit_t *circ,
+                                   const crypt_path_t *layer_hint)
+{
+  if (cc->cc_alg == CC_ALG_WESTWOOD) {
+    return congestion_control_westwood_process_sendme(cc, circ, layer_hint);
+  }
+
+  if (cc->cc_alg == CC_ALG_VEGAS) {
+    return congestion_control_vegas_process_sendme(cc, circ, layer_hint);
+  }
+
+  if (cc->cc_alg == CC_ALG_NOLA) {
+    return congestion_control_nola_process_sendme(cc, circ, layer_hint);
+  }
+
+  /* CC_ALG_SENDME, or a value we do not know about */
+  tor_assert(0);
+
+  return -END_CIRC_REASON_INTERNAL;
+}
--- a/src/core/or/congestion_control_common.h
+++ b/src/core/or/congestion_control_common.h
@ -0,0 +1,55 @@
+/* Copyright (c) 2019-2021, The Tor Project, Inc. */
+/* See LICENSE for licensing information */
+
+/**
+ * \file congestion_control_common.h
+ * \brief Public APIs for congestion control
+ **/
+
+#ifndef TOR_CONGESTION_CONTROL_COMMON_H
+#define TOR_CONGESTION_CONTROL_COMMON_H
+
+#include "core/or/crypt_path_st.h"
+#include "core/or/circuit_st.h"
+
+/** Opaque handle for per-circuit (or per-hop) congestion control state. */
+typedef struct congestion_control_t congestion_control_t;
+
+/** Wrapper for the free function, set the CC pointer to NULL after free */
+#define congestion_control_free(cc) \
+    FREE_AND_NULL(congestion_control_t, congestion_control_free_, cc)
+
+/** Free <b>cc</b>, its timestamp lists and their entries. NULL is ignored.
+ * Use congestion_control_free() rather than calling this directly. */
+void congestion_control_free_(congestion_control_t *cc);
+
+/** Allocate a congestion control object and initialize it from the
+ * consensus parameters. */
+congestion_control_t *congestion_control_new(void);
+
+/** Pass a received sendme to the process_sendme routine of the algorithm
+ * selected in <b>cc</b> and return its result. */
+int congestion_control_dispatch_cc_alg(congestion_control_t *cc,
+                                       const circuit_t *circ,
+                                       const crypt_path_t *layer_hint);
+
+/** Note that a cell was sent: increment inflight and, if the cell will
+ * trigger a SENDME, record the send time for a later RTT measurement. Must
+ * be called before the sent cell is otherwise accounted for. */
+void congestion_control_note_cell_sent(congestion_control_t *cc,
+                                       const circuit_t *circ,
+                                       const crypt_path_t *cpath);
+
+/** On receipt of a SENDME, update the RTT and then the BDP estimates.
+ * Returns true if the BDP estimates were updated. */
+bool congestion_control_update_circuit_estimates(congestion_control_t *,
+                                                 const circuit_t *,
+                                                 const crypt_path_t *);
+
+/** Return how many cells may still be sent: the legacy package window when
+ * there is no congestion control object, else cwnd minus inflight limited
+ * to the range [0, INT32_MAX]. */
+int congestion_control_get_package_window(const circuit_t *,
+                                          const crypt_path_t *);
+
+/** Return the number of cells acked by each sendme: the congestion control
+ * object's sendme_inc if there is one, else CIRCWINDOW_INCREMENT. */
+int sendme_get_inc_count(const circuit_t *, const crypt_path_t *);
+/** Return true iff the next cell sent will make the other endpoint send a
+ * SENDME. */
+bool circuit_sent_cell_for_sendme(const circuit_t *, const crypt_path_t *);
+
+/* Private section starts.
+ *
+ * congestion_control_common.c opts in by defining
+ * TOR_CONGESTION_CONTROL_COMMON_PRIVATE before including this header. The
+ * older TOR_CONGESTION_CONTROL_PRIVATE spelling is still accepted, so that
+ * any file already defining it keeps seeing this section. */
+#if defined(TOR_CONGESTION_CONTROL_COMMON_PRIVATE) || \
+    defined(TOR_CONGESTION_CONTROL_PRIVATE)
+
+/*
+ * Unit tests declarations.
+ */
+#ifdef TOR_UNIT_TESTS
+
+#endif /* defined(TOR_UNIT_TESTS) */
+
+#endif /* defined(TOR_CONGESTION_CONTROL_COMMON_PRIVATE) || ... */
+
+#endif /* !defined(TOR_CONGESTION_CONTROL_COMMON_H) */
--- a/src/core/or/congestion_control_st.h
+++ b/src/core/or/congestion_control_st.h
@ -0,0 +1,257 @@
+/* Copyright (c) 2019-2021, The Tor Project, Inc. */
+/* See LICENSE for licensing information */
+
+/**
+ * \file congestion_control_st.h
+ * \brief Structure definitions for congestion control.
+ **/
+
+#ifndef CONGESTION_CONTROL_ST_H
+#define CONGESTION_CONTROL_ST_H
+
+#include "core/or/crypt_path_st.h"
+#include "core/or/circuit_st.h"
+
+/** Signifies which sendme algorithm to use */
+typedef enum {
+  /** OG Tor fixed-sized circ and stream windows. It sucks, but it is important
+   * to make sure that the new algs can compete with the old garbage. */
+  CC_ALG_SENDME = 0,
+
+  /**
+   * Prop#324 TOR_WESTWOOD - Deliberately aggressive. Westwood may not even
+   * converge to fairness in some cases because max RTT will also increase
+   * on congestion, which boosts the Westwood RTT congestion threshold. So it
+   * can cause runaway queue bloat, which may or may not lead to a robot
+   * uprising... Ok that's Westworld, not Westwood. Still, we need to test
+   * Vegas and NOLA against something more aggressive to ensure they do not
+   * starve in the presence of cheaters. We also need to make sure cheaters
+   * trigger the oomkiller in those cases.
+   */
+  CC_ALG_WESTWOOD = 1,
+
+  /**
+   * Prop#324 TOR_VEGAS - TCP Vegas-style BDP tracker. Because Vegas backs off
+   * whenever it detects queue delay, it can be beaten out by more aggressive
+   * algs. However, in live network testing, it seems to do just fine against
+   * current SENDMEs. It outperforms Westwood and does not stall. */
+  CC_ALG_VEGAS = 2,
+
+  /**
+   * Prop#324: TOR_NOLA - NOLA looks the BDP right in the eye and uses it
+   * immediately as CWND. No slow start, no other congestion signals, no delay,
+   * no bullshit. Like TOR_VEGAS, it also uses aggressive BDP estimates, to
+   * avoid out-competition. It seems a bit better throughput than Vegas,
+   * but its aggressive BDP and rapid updates may add more queue latency. */
+  CC_ALG_NOLA = 3,
+} cc_alg_t;
+
+/** Total number of CC algs in cc_alg_t enum */
+#define NUM_CC_ALGS  (CC_ALG_NOLA+1)
+
+/** Signifies how we estimate circuit BDP */
+typedef enum {
+  /** CWND-based BDP will respond to changes in RTT only, and is relative
+   * to cwnd growth. So in slow-start, this will under-estimate BDP */
+  BDP_ALG_CWND_RTT = 0,
+
+  /** Sendme-based BDP will quickly measure BDP in less than
+   * a cwnd worth of data when in use. So it should be good for slow-start.
+   * But if the link goes idle, it will be vastly lower than true BDP. Thus,
+   * this estimate gets reset when the cwnd is not fully utilized. */
+  BDP_ALG_SENDME_RATE = 1,
+
+  /** Inflight BDP is similar to the cwnd estimator, except it uses
+   * packets inflight minus local circuit queues instead of current cwnd.
+   * Because it is strictly less than or equal to the cwnd, it will cause
+   * the cwnd to drift downward. It is only used if the local OR connection
+   * is blocked. */
+  BDP_ALG_INFLIGHT_RTT = 2,
+
+  /** The Piecewise BDP estimator uses the CWND estimator before there
+   * are sufficient SENDMEs to calculate the SENDME estimator. At that
+   * point, it uses the SENDME estimator, unless the local OR connection
+   * becomes blocked. In that case, it switches to the inflight estimator. */
+  BDP_ALG_PIECEWISE = 3,
+
+} bdp_alg_t;
+
+/** Total number of BDP algs in bdp_alg_t enum */
+#define NUM_BDP_ALGS (BDP_ALG_PIECEWISE+1)
+
+/** Westwood algorithm parameters (used when cc_alg is CC_ALG_WESTWOOD) */
+struct westwood_params_t {
+    /** Cwnd backoff multiplier upon congestion (as percent) */
+    uint8_t cwnd_backoff_m;
+    /** Max RTT backoff multiplier upon congestion (as percent) */
+    uint8_t rtt_backoff_m;
+
+    /** Threshold between min and max RTT, to signal congestion (percent) */
+    uint8_t rtt_thresh;
+
+    /**
+     * If true, use minimum of BDP and backoff multiplication in backoff.
+     * If false, use maximum of BDP and backoff multiplication in backoff. */
+    bool min_backoff;
+};
+
+/** Vegas algorithm parameters (used when cc_alg is CC_ALG_VEGAS). */
+struct vegas_params_t {
+    /** The queue use allowed before we exit slow start */
+    uint16_t gamma;
+    /** The queue use below which we increment cwnd */
+    uint16_t alpha;
+    /** The queue use above which we decrement cwnd */
+    uint16_t beta;
+    /** Weighted average (percent) between cwnd estimator and
+     * piecewise estimator. */
+    uint8_t bdp_mix_pct;
+};
+
+/** NOLA consensus params (used when cc_alg is CC_ALG_NOLA) */
+struct nola_params_t {
+    /** How many cells to add to BDP estimate to obtain cwnd */
+    uint16_t bdp_overshoot;
+};
+
+/** Fields common to all congestion control algorithms */
+typedef struct congestion_control_t {
+  /**
+   * Smartlist of uint64_t monotime usec timestamps of when we sent a data
+   * cell that is pending a sendme. FIFO queue that is managed similar to
+   * sendme_last_digests. */
+  smartlist_t *sendme_pending_timestamps;
+
+  /**
+   * Smartlist of uint64_t monotime timestamps of when SENDMEs arrived.
+   * FIFO queue that is managed similar to sendme_last_digests.
+   * Used to estimate circuit bandwidth and BDP. */
+  smartlist_t *sendme_arrival_timestamps;
+
+  /** RTT time data for congestion control. */
+  uint64_t ewma_rtt_usec;
+  uint64_t min_rtt_usec;
+  uint64_t max_rtt_usec;
+
+  /** BDP estimates, one slot per bdp_alg_t value */
+  uint64_t bdp[NUM_BDP_ALGS];
+
+  /** Congestion window */
+  uint64_t cwnd;
+
+  /** Number of cells in-flight (sent but awaiting SENDME ack). */
+  uint64_t inflight;
+
+  /**
+   * For steady-state: the number of sendme acks until we will acknowledge
+   * a congestion event again. It starts out as the number of sendme acks
+   * in a congestion window, and is decremented each ack. When this reaches
+   * 0, it means we should examine our congestion algorithm conditions.
+   * In this way, we only react to one congestion event per congestion window.
+   *
+   * It is also reset to 0 immediately whenever the circuit's orconn is
+   * blocked, and when a previously blocked orconn is unblocked.
+   */
+  uint64_t next_cc_event;
+
+  /** Are we in slow start? */
+  bool in_slow_start;
+
+  /** Is the local channel blocked on us? That's a congestion signal */
+  bool blocked_chan;
+
+  /* The following parameters are cached from consensus values upon
+   * circuit setup. */
+
+  /** Percent of cwnd to increment by during slow start */
+  uint16_t cwnd_inc_pct_ss;
+
+  /** Number of cells to increment cwnd by during steady state */
+  uint16_t cwnd_inc;
+
+  /** Minimum congestion window (must be at least sendme_inc) */
+  uint16_t cwnd_min;
+
+  /**
+   * Number of times per congestion window to update based on congestion
+   * signals */
+  uint8_t cwnd_inc_rate;
+
+  /**
+   * Number of cwnd worth of sendme acks to smooth RTT and BDP with,
+   * using N_EWMA */
+  uint8_t ewma_cwnd_cnt;
+
+  /**
+   * Minimum number of sendmes before we begin BDP estimates
+   */
+  uint8_t bwe_sendme_min;
+
+  /**
+   * Number of cells to ack with every sendme. Taken from consensus parameter
+   * and negotiation during circuit setup. */
+  uint8_t sendme_inc;
+
+  /** Which congestion control algorithm to use. Taken from
+   * consensus parameter and negotiation during circuit setup. */
+  cc_alg_t cc_alg;
+
+  /** Which algorithm to estimate circuit bandwidth with. Taken from
+   * consensus parameter during circuit setup. */
+  bdp_alg_t bdp_alg;
+
+  /** Algorithm-specific parameters. The specific struct that is used
+   * depends upon the algorithm selected by the cc_alg parameter.
+   * These should not be accessed anywhere other than the algorithm-specific
+   * files. */
+  union {
+    struct westwood_params_t westwood_params;
+    struct vegas_params_t vegas_params;
+    struct nola_params_t nola_params;
+  };
+} congestion_control_t;
+
+/**
+ * Compute how many sendme acks must arrive between congestion window updates.
+ *
+ * The usual recommendation in congestion control literature is to adjust cwnd
+ * once per cwnd worth of acks. Raising the 'cc_cwnd_inc_rate' consensus
+ * parameter makes the updates proportionally more frequent.
+ *
+ * A result of 0 (cwnd_inc_rate high relative to cwnd) means the calling
+ * code will update on every sendme ack.
+ */
+static inline uint64_t CWND_UPDATE_RATE(const congestion_control_t *cc)
+{
+  /* Cells acked per update interval. Both factors are uint8_t, so the int
+   * product cannot overflow. Adding half of it rounds to the nearest ack. */
+  const int cells_per_update = cc->cwnd_inc_rate * cc->sendme_inc;
+  return (cc->cwnd + cells_per_update/2) / cells_per_update;
+}
+
+/**
+ * Compute the slow-start increment applied to the congestion window at
+ * each update.
+ *
+ * Slow start is meant to grow cwnd exponentially: the literature suggests
+ * doubling it every cwnd, or similar growth (Vegas adds 50% every cwnd).
+ *
+ * The growth comes from the 'cwnd_inc_pct_ss' consensus parameter, taken
+ * as a percent of the current congestion window. The result is an integer
+ * division, so any fractional part is dropped.
+ */
+static inline uint64_t CWND_INC_SS(const congestion_control_t *cc)
+{
+  const uint64_t scaled = cc->cwnd_inc_pct_ss * cc->cwnd;
+  return scaled / 100;
+}
+
+/**
+ * Returns the amount to increment (and for Vegas, also decrement) the
+ * congestion window by, every update period: the cwnd_inc field of <b>cc</b>.
+ *
+ * This is controlled by the cc_cwnd_inc consensus parameter.
+ */
+#define CWND_INC(cc)           ((cc)->cwnd_inc)
+
+#endif /* !defined(CONGESTION_CONTROL_ST_H) */
--- a/src/core/or/include.am
+++ b/src/core/or/include.am
@ -35,6 +35,7 @@ LIBTOR_APP_A_SOURCES += 				\
 	src/core/or/scheduler_kist.c		\
 	src/core/or/scheduler_vanilla.c		\
 	src/core/or/sendme.c			\
+	src/core/or/sendme_common.c			\
 	src/core/or/status.c			\
 	src/core/or/versions.c

@ -97,6 +98,7 @@ noinst_HEADERS +=					\
 	src/core/or/relay_crypto_st.h			\
 	src/core/or/scheduler.h				\
 	src/core/or/sendme.h				\
+	src/core/or/sendme_common.h				\
 	src/core/or/server_port_cfg_st.h		\
 	src/core/or/socks_request_st.h			\
 	src/core/or/status.h				\