diff --git a/changes/bug12184_diagnostic b/changes/bug12184_diagnostic
new file mode 100644
index 0000000000..3c783b17d9
--- /dev/null
+++ b/changes/bug12184_diagnostic
@@ -0,0 +1,4 @@
+ o Minor features (diagnostic):
+ - Add significant new logging code to attempt to diagnose bug 12184,
+ where relays seem to run out of available circuit IDs.
+
diff --git a/src/or/circuitbuild.c b/src/or/circuitbuild.c
index 9d06759dec..e743a25f48 100644
--- a/src/or/circuitbuild.c
+++ b/src/or/circuitbuild.c
@@ -96,10 +96,12 @@ get_unique_circ_id_by_chan(channel_t *chan)
* N=256. */
#define MAX_CIRCID_ATTEMPTS 64
int in_use;
- unsigned n_with_circ = 0, n_pending_destroy = 0;
+ unsigned n_with_circ = 0, n_pending_destroy = 0, n_weird_pending_destroy = 0;
circid_t test_circ_id;
circid_t attempts=0;
circid_t high_bit, max_range, mask;
+ int64_t pending_destroy_time_total = 0;
+ int64_t pending_destroy_time_max = 0;
tor_assert(chan);
@@ -130,15 +132,57 @@ get_unique_circ_id_by_chan(channel_t *chan)
* whole circuit ID space every time we extend a circuit, which is
* not so great either.
*/
- log_fn_ratelim(&chan->last_warned_circ_ids_exhausted, LOG_WARN,
- LD_CIRC,"No unused circIDs found on channel %s wide "
+ int64_t queued_destroys;
+ char *m = rate_limit_log(&chan->last_warned_circ_ids_exhausted,
+ approx_time());
+ if (m == NULL)
+ return 0; /* This message has been rate-limited away. */
+ if (n_pending_destroy)
+ pending_destroy_time_total /= n_pending_destroy;
+ log_warn(LD_CIRC,"No unused circIDs found on channel %s wide "
"circID support, with %u inbound and %u outbound circuits. "
"Found %u circuit IDs in use by circuits, and %u with "
- "pending destroy cells."
- "Failing a circuit.",
+ "pending destroy cells. (%u of those were marked bogusly.) "
+ "The ones with pending destroy cells "
+ "have been marked unusable for an average of %ld seconds "
+ "and a maximum of %ld seconds. This channel is %ld seconds "
+ "old. Failing a circuit.%s",
chan->wide_circ_ids ? "with" : "without",
chan->num_p_circuits, chan->num_n_circuits,
- n_with_circ, n_pending_destroy);
+ n_with_circ, n_pending_destroy, n_weird_pending_destroy,
+ (long)pending_destroy_time_total,
+ (long)pending_destroy_time_max,
+ (long)(approx_time() - chan->timestamp_created),
+ m);
+ tor_free(m);
+
+ if (!chan->cmux) {
+ /* This warning should be impossible. */
+ log_warn(LD_BUG, " This channel somehow has no cmux on it!");
+ return 0;
+ }
+
+ /* analysis so far on 12184 suggests that we're running out of circuit
+ IDs because it looks like we have too many pending destroy
+ cells. Let's see how many we really have pending.
+ */
+ queued_destroys = circuitmux_count_queued_destroy_cells(chan,
+ chan->cmux);
+
+ log_warn(LD_CIRC, " Circuitmux on this channel has %u circuits, "
+ "of which %u are active. It says it has "I64_FORMAT
+ " destroy cells queued.",
+ circuitmux_num_circuits(chan->cmux),
+ circuitmux_num_active_circuits(chan->cmux),
+ I64_PRINTF_ARG(queued_destroys));
+
+ /* Change this into "if (1)" in order to get more information about
+ * possible failure modes here. You'll need to know how to use gdb with
+ * Tor: this will make Tor exit with an assertion failure if the cmux is
+ * corrupt. */
+ if (0)
+ circuitmux_assert_okay(chan->cmux);
+
return 0;
}
@@ -152,8 +196,20 @@ get_unique_circ_id_by_chan(channel_t *chan)
in_use = circuit_id_in_use_on_channel(test_circ_id, chan);
if (in_use == 1)
++n_with_circ;
- else if (in_use == 2)
+ else if (in_use == 2) {
+ time_t since_when;
++n_pending_destroy;
+ since_when =
+ circuit_id_when_marked_unusable_on_channel(test_circ_id, chan);
+ if (since_when) {
+ time_t waiting = approx_time() - since_when;
+ pending_destroy_time_total += waiting;
+ if (waiting > pending_destroy_time_max)
+ pending_destroy_time_max = waiting;
+ } else {
+ ++n_weird_pending_destroy;
+ }
+ }
} while (in_use);
return test_circ_id;
}
diff --git a/src/or/circuitlist.c b/src/or/circuitlist.c
index 0140afcd77..a2dd07fbe4 100644
--- a/src/or/circuitlist.c
+++ b/src/or/circuitlist.c
@@ -59,6 +59,8 @@ typedef struct chan_circid_circuit_map_t {
channel_t *chan;
circid_t circ_id;
circuit_t *circuit;
+ /* For debugging 12184: when was this placeholder item added? */
+ time_t made_placeholder_at;
} chan_circid_circuit_map_t;
/** Helper for hash tables: compare the channel and circuit ID for a and
@@ -184,6 +186,7 @@ circuit_set_circid_chan_helper(circuit_t *circ, int direction,
found = HT_FIND(chan_circid_map, &chan_circid_map, &search);
if (found) {
found->circuit = circ;
+ found->made_placeholder_at = 0;
} else {
found = tor_malloc_zero(sizeof(chan_circid_circuit_map_t));
found->circ_id = id;
@@ -241,11 +244,14 @@ channel_mark_circid_unusable(channel_t *chan, circid_t id)
"a circuit there.", (unsigned)id, chan);
} else if (ent) {
/* It's already marked. */
+ if (!ent->made_placeholder_at)
+ ent->made_placeholder_at = approx_time();
} else {
ent = tor_malloc_zero(sizeof(chan_circid_circuit_map_t));
ent->chan = chan;
ent->circ_id = id;
- /* leave circuit at NULL */
+ /* leave circuit at NULL. */
+ ent->made_placeholder_at = approx_time();
HT_INSERT(chan_circid_map, &chan_circid_map, ent);
}
}
@@ -1096,6 +1102,27 @@ circuit_id_in_use_on_channel(circid_t circ_id, channel_t *chan)
return 0;
}
+/** Helper for debugging 12184. Returns the time since which 'circ_id' has
+ * been marked unusable on 'chan'. */
+time_t
+circuit_id_when_marked_unusable_on_channel(circid_t circ_id, channel_t *chan)
+{
+ chan_circid_circuit_map_t search;
+ chan_circid_circuit_map_t *found;
+
+ memset(&search, 0, sizeof(search));
+ search.circ_id = circ_id;
+ search.chan = chan;
+
+ found = HT_FIND(chan_circid_map, &chan_circid_map, &search);
+
+ if (! found || found->circuit)
+ return 0;
+
+ return found->made_placeholder_at;
+}
+
+
/** Return the circuit that a given edge connection is using. */
circuit_t *
circuit_get_by_edge_conn(edge_connection_t *conn)
diff --git a/src/or/circuitlist.h b/src/or/circuitlist.h
index 916afba215..d48d7c3963 100644
--- a/src/or/circuitlist.h
+++ b/src/or/circuitlist.h
@@ -28,6 +28,8 @@ void circuit_set_n_circid_chan(circuit_t *circ, circid_t id,
channel_t *chan);
void channel_mark_circid_unusable(channel_t *chan, circid_t id);
void channel_mark_circid_usable(channel_t *chan, circid_t id);
+time_t circuit_id_when_marked_unusable_on_channel(circid_t circ_id,
+ channel_t *chan);
void circuit_set_state(circuit_t *circ, uint8_t state);
void circuit_close_all_marked(void);
int32_t circuit_initial_package_window(void);
diff --git a/src/or/circuitmux.c b/src/or/circuitmux.c
index 52ebfef084..563166eb46 100644
--- a/src/or/circuitmux.c
+++ b/src/or/circuitmux.c
@@ -1909,3 +1909,38 @@ circuitmux_append_destroy_cell(channel_t *chan,
}
}
+/*DOCDOC; for debugging 12184. This runs slowly. */
+int64_t
+circuitmux_count_queued_destroy_cells(const channel_t *chan,
+ const circuitmux_t *cmux)
+{
+ int64_t n_destroy_cells = cmux->destroy_ctr;
+ int64_t destroy_queue_size = cmux->destroy_cell_queue.n;
+
+ int64_t manual_total = 0;
+ int64_t manual_total_in_map = 0;
+ packed_cell_t *cell;
+
+ TOR_SIMPLEQ_FOREACH(cell, &cmux->destroy_cell_queue.head, next) {
+ circid_t id;
+ ++manual_total;
+
+ id = packed_cell_get_circid(cell, chan->wide_circ_ids);
+ if (circuit_id_in_use_on_channel(id, (channel_t*)chan))
+ ++manual_total_in_map;
+ }
+
+ if (n_destroy_cells != destroy_queue_size ||
+ n_destroy_cells != manual_total ||
+ n_destroy_cells != manual_total_in_map) {
+ log_warn(LD_BUG, " Discrepancy in counts for queued destroy cells on "
+ "circuitmux. n="I64_FORMAT". queue_size="I64_FORMAT". "
+ "manual_total="I64_FORMAT". manual_total_in_map="I64_FORMAT".",
+ I64_PRINTF_ARG(n_destroy_cells),
+ I64_PRINTF_ARG(destroy_queue_size),
+ I64_PRINTF_ARG(manual_total),
+ I64_PRINTF_ARG(manual_total_in_map));
+ }
+
+ return n_destroy_cells;
+}
diff --git a/src/or/circuitmux.h b/src/or/circuitmux.h
index c4c0649c6c..2b5fb7e51e 100644
--- a/src/or/circuitmux.h
+++ b/src/or/circuitmux.h
@@ -121,6 +121,10 @@ unsigned int circuitmux_num_cells(circuitmux_t *cmux);
unsigned int circuitmux_num_circuits(circuitmux_t *cmux);
unsigned int circuitmux_num_active_circuits(circuitmux_t *cmux);
+/* Debuging interface - slow. */
+int64_t circuitmux_count_queued_destroy_cells(const channel_t *chan,
+ const circuitmux_t *cmux);
+
/* Channel interface */
circuit_t * circuitmux_get_first_active_circuit(circuitmux_t *cmux,
cell_queue_t **destroy_queue_out);
diff --git a/src/or/relay.c b/src/or/relay.c
index 66d1251eb7..f42602d412 100644
--- a/src/or/relay.c
+++ b/src/or/relay.c
@@ -2569,6 +2569,17 @@ packed_cell_get_command(const packed_cell_t *cell, int wide_circ_ids)
}
}
+/** Extract the circuit ID from a packed cell. */
+circid_t
+packed_cell_get_circid(const packed_cell_t *cell, int wide_circ_ids)
+{
+ if (wide_circ_ids) {
+ return ntohl(get_uint32(cell->body));
+ } else {
+ return ntohs(get_uint16(cell->body));
+ }
+}
+
/** Pull as many cells as possible (but no more than max) from the
* queue of the first active circuit on chan, and write them to
* chan->outbuf. Return the number of cells written. Advance
diff --git a/src/or/relay.h b/src/or/relay.h
index 479d474b35..969c6fb61d 100644
--- a/src/or/relay.h
+++ b/src/or/relay.h
@@ -82,6 +82,8 @@ void stream_choice_seed_weak_rng(void);
int relay_crypt(circuit_t *circ, cell_t *cell, cell_direction_t cell_direction,
crypt_path_t **layer_hint, char *recognized);
+circid_t packed_cell_get_circid(const packed_cell_t *cell, int wide_circ_ids);
+
#ifdef RELAY_PRIVATE
STATIC int connected_cell_parse(const relay_header_t *rh, const cell_t *cell,
tor_addr_t *addr_out, int *ttl_out);