diff --git a/changes/bug12184_diagnostic b/changes/bug12184_diagnostic new file mode 100644 index 0000000000..3c783b17d9 --- /dev/null +++ b/changes/bug12184_diagnostic @@ -0,0 +1,4 @@ + o Minor features (diagnostic): + - Add significant new logging code to attempt to diagnose bug 12184, + where relays seem to run out of available circuit IDs. + diff --git a/src/or/circuitbuild.c b/src/or/circuitbuild.c index 9d06759dec..e743a25f48 100644 --- a/src/or/circuitbuild.c +++ b/src/or/circuitbuild.c @@ -96,10 +96,12 @@ get_unique_circ_id_by_chan(channel_t *chan) * N=256. */ #define MAX_CIRCID_ATTEMPTS 64 int in_use; - unsigned n_with_circ = 0, n_pending_destroy = 0; + unsigned n_with_circ = 0, n_pending_destroy = 0, n_weird_pending_destroy = 0; circid_t test_circ_id; circid_t attempts=0; circid_t high_bit, max_range, mask; + int64_t pending_destroy_time_total = 0; + int64_t pending_destroy_time_max = 0; tor_assert(chan); @@ -130,15 +132,57 @@ get_unique_circ_id_by_chan(channel_t *chan) * whole circuit ID space every time we extend a circuit, which is * not so great either. */ - log_fn_ratelim(&chan->last_warned_circ_ids_exhausted, LOG_WARN, - LD_CIRC,"No unused circIDs found on channel %s wide " + int64_t queued_destroys; + char *m = rate_limit_log(&chan->last_warned_circ_ids_exhausted, + approx_time()); + if (m == NULL) + return 0; /* This message has been rate-limited away. */ + if (n_pending_destroy) + pending_destroy_time_total /= n_pending_destroy; + log_warn(LD_CIRC,"No unused circIDs found on channel %s wide " "circID support, with %u inbound and %u outbound circuits. " "Found %u circuit IDs in use by circuits, and %u with " - "pending destroy cells." - "Failing a circuit.", + "pending destroy cells. (%u of those were marked bogusly.) " + "The ones with pending destroy cells " + "have been marked unusable for an average of %ld seconds " + "and a maximum of %ld seconds. This channel is %ld seconds " + "old. Failing a circuit.%s", chan->wide_circ_ids ? "with" : "without", chan->num_p_circuits, chan->num_n_circuits, - n_with_circ, n_pending_destroy); + n_with_circ, n_pending_destroy, n_weird_pending_destroy, + (long)pending_destroy_time_total, + (long)pending_destroy_time_max, + (long)(approx_time() - chan->timestamp_created), + m); + tor_free(m); + + if (!chan->cmux) { + /* This warning should be impossible. */ + log_warn(LD_BUG, " This channel somehow has no cmux on it!"); + return 0; + } + + /* analysis so far on 12184 suggests that we're running out of circuit + IDs because it looks like we have too many pending destroy + cells. Let's see how many we really have pending. + */ + queued_destroys = circuitmux_count_queued_destroy_cells(chan, + chan->cmux); + + log_warn(LD_CIRC, " Circuitmux on this channel has %u circuits, " + "of which %u are active. It says it has "I64_FORMAT + " destroy cells queued.", + circuitmux_num_circuits(chan->cmux), + circuitmux_num_active_circuits(chan->cmux), + I64_PRINTF_ARG(queued_destroys)); + + /* Change this into "if (1)" in order to get more information about + * possible failure modes here. You'll need to know how to use gdb with + * Tor: this will make Tor exit with an assertion failure if the cmux is + * corrupt. */ + if (0) + circuitmux_assert_okay(chan->cmux); + return 0; } @@ -152,8 +196,20 @@ get_unique_circ_id_by_chan(channel_t *chan) in_use = circuit_id_in_use_on_channel(test_circ_id, chan); if (in_use == 1) ++n_with_circ; - else if (in_use == 2) + else if (in_use == 2) { + time_t since_when; ++n_pending_destroy; + since_when = + circuit_id_when_marked_unusable_on_channel(test_circ_id, chan); + if (since_when) { + time_t waiting = approx_time() - since_when; + pending_destroy_time_total += waiting; + if (waiting > pending_destroy_time_max) + pending_destroy_time_max = waiting; + } else { + ++n_weird_pending_destroy; + } + } } while (in_use); return test_circ_id; } diff --git a/src/or/circuitlist.c b/src/or/circuitlist.c index 0140afcd77..a2dd07fbe4 100644 --- a/src/or/circuitlist.c +++ b/src/or/circuitlist.c @@ -59,6 +59,8 @@ typedef struct chan_circid_circuit_map_t { channel_t *chan; circid_t circ_id; circuit_t *circuit; + /* For debugging 12184: when was this placeholder item added? */ + time_t made_placeholder_at; } chan_circid_circuit_map_t; /** Helper for hash tables: compare the channel and circuit ID for a and @@ -184,6 +186,7 @@ circuit_set_circid_chan_helper(circuit_t *circ, int direction, found = HT_FIND(chan_circid_map, &chan_circid_map, &search); if (found) { found->circuit = circ; + found->made_placeholder_at = 0; } else { found = tor_malloc_zero(sizeof(chan_circid_circuit_map_t)); found->circ_id = id; @@ -241,11 +244,14 @@ channel_mark_circid_unusable(channel_t *chan, circid_t id) "a circuit there.", (unsigned)id, chan); } else if (ent) { /* It's already marked. */ + if (!ent->made_placeholder_at) + ent->made_placeholder_at = approx_time(); } else { ent = tor_malloc_zero(sizeof(chan_circid_circuit_map_t)); ent->chan = chan; ent->circ_id = id; - /* leave circuit at NULL */ + /* leave circuit at NULL. */ + ent->made_placeholder_at = approx_time(); HT_INSERT(chan_circid_map, &chan_circid_map, ent); } } @@ -1096,6 +1102,27 @@ circuit_id_in_use_on_channel(circid_t circ_id, channel_t *chan) return 0; } +/** Helper for debugging 12184. Returns the time since which 'circ_id' has + * been marked unusable on 'chan'. */ +time_t +circuit_id_when_marked_unusable_on_channel(circid_t circ_id, channel_t *chan) +{ + chan_circid_circuit_map_t search; + chan_circid_circuit_map_t *found; + + memset(&search, 0, sizeof(search)); + search.circ_id = circ_id; + search.chan = chan; + + found = HT_FIND(chan_circid_map, &chan_circid_map, &search); + + if (! found || found->circuit) + return 0; + + return found->made_placeholder_at; +} + + /** Return the circuit that a given edge connection is using. */ circuit_t * circuit_get_by_edge_conn(edge_connection_t *conn) diff --git a/src/or/circuitlist.h b/src/or/circuitlist.h index 916afba215..d48d7c3963 100644 --- a/src/or/circuitlist.h +++ b/src/or/circuitlist.h @@ -28,6 +28,8 @@ void circuit_set_n_circid_chan(circuit_t *circ, circid_t id, channel_t *chan); void channel_mark_circid_unusable(channel_t *chan, circid_t id); void channel_mark_circid_usable(channel_t *chan, circid_t id); +time_t circuit_id_when_marked_unusable_on_channel(circid_t circ_id, + channel_t *chan); void circuit_set_state(circuit_t *circ, uint8_t state); void circuit_close_all_marked(void); int32_t circuit_initial_package_window(void); diff --git a/src/or/circuitmux.c b/src/or/circuitmux.c index 52ebfef084..563166eb46 100644 --- a/src/or/circuitmux.c +++ b/src/or/circuitmux.c @@ -1909,3 +1909,38 @@ circuitmux_append_destroy_cell(channel_t *chan, } } +/*DOCDOC; for debugging 12184. This runs slowly. */ +int64_t +circuitmux_count_queued_destroy_cells(const channel_t *chan, + const circuitmux_t *cmux) +{ + int64_t n_destroy_cells = cmux->destroy_ctr; + int64_t destroy_queue_size = cmux->destroy_cell_queue.n; + + int64_t manual_total = 0; + int64_t manual_total_in_map = 0; + packed_cell_t *cell; + + TOR_SIMPLEQ_FOREACH(cell, &cmux->destroy_cell_queue.head, next) { + circid_t id; + ++manual_total; + + id = packed_cell_get_circid(cell, chan->wide_circ_ids); + if (circuit_id_in_use_on_channel(id, (channel_t*)chan)) + ++manual_total_in_map; + } + + if (n_destroy_cells != destroy_queue_size || + n_destroy_cells != manual_total || + n_destroy_cells != manual_total_in_map) { + log_warn(LD_BUG, " Discrepancy in counts for queued destroy cells on " + "circuitmux. n="I64_FORMAT". queue_size="I64_FORMAT". " + "manual_total="I64_FORMAT". manual_total_in_map="I64_FORMAT".", + I64_PRINTF_ARG(n_destroy_cells), + I64_PRINTF_ARG(destroy_queue_size), + I64_PRINTF_ARG(manual_total), + I64_PRINTF_ARG(manual_total_in_map)); + } + + return n_destroy_cells; +} diff --git a/src/or/circuitmux.h b/src/or/circuitmux.h index c4c0649c6c..2b5fb7e51e 100644 --- a/src/or/circuitmux.h +++ b/src/or/circuitmux.h @@ -121,6 +121,10 @@ unsigned int circuitmux_num_cells(circuitmux_t *cmux); unsigned int circuitmux_num_circuits(circuitmux_t *cmux); unsigned int circuitmux_num_active_circuits(circuitmux_t *cmux); +/* Debuging interface - slow. */ +int64_t circuitmux_count_queued_destroy_cells(const channel_t *chan, + const circuitmux_t *cmux); + /* Channel interface */ circuit_t * circuitmux_get_first_active_circuit(circuitmux_t *cmux, cell_queue_t **destroy_queue_out); diff --git a/src/or/relay.c b/src/or/relay.c index 66d1251eb7..f42602d412 100644 --- a/src/or/relay.c +++ b/src/or/relay.c @@ -2569,6 +2569,17 @@ packed_cell_get_command(const packed_cell_t *cell, int wide_circ_ids) } } +/** Extract the circuit ID from a packed cell. */ +circid_t +packed_cell_get_circid(const packed_cell_t *cell, int wide_circ_ids) +{ + if (wide_circ_ids) { + return ntohl(get_uint32(cell->body)); + } else { + return ntohs(get_uint16(cell->body)); + } +} + /** Pull as many cells as possible (but no more than max) from the * queue of the first active circuit on chan, and write them to * chan->outbuf. Return the number of cells written. Advance diff --git a/src/or/relay.h b/src/or/relay.h index 479d474b35..969c6fb61d 100644 --- a/src/or/relay.h +++ b/src/or/relay.h @@ -82,6 +82,8 @@ void stream_choice_seed_weak_rng(void); int relay_crypt(circuit_t *circ, cell_t *cell, cell_direction_t cell_direction, crypt_path_t **layer_hint, char *recognized); +circid_t packed_cell_get_circid(const packed_cell_t *cell, int wide_circ_ids); + #ifdef RELAY_PRIVATE STATIC int connected_cell_parse(const relay_header_t *rh, const cell_t *cell, tor_addr_t *addr_out, int *ttl_out);