/* * Copyright (c) 2013, The Tor Project, Inc. */ /* See LICENSE for licensing information */ /** * \file scheduler.c * \brief Relay scheduling system **/ #include "or.h" #define TOR_CHANNEL_INTERNAL_ /* For channel_flush_some_cells() */ #include "channel.h" #include "compat_libevent.h" #define SCHEDULER_PRIVATE_ #include "scheduler.h" #ifdef HAVE_EVENT2_EVENT_H #include #else #include #endif /* * Scheduler high/low watermarks */ static uint32_t sched_q_low_water = 16384; static uint32_t sched_q_high_water = 32768; /* * Maximum cells to flush in a single call to channel_flush_some_cells(); * setting this low means more calls, but too high and we could overshoot * sched_q_high_water. */ static uint32_t sched_max_flush_cells = 16; /* * Write scheduling works by keeping track of which channels can * accept cells, and have cells to write. From the scheduler's perspective, * a channel can be in four possible states: * * 1.) Not open for writes, no cells to send * - Not much to do here, and the channel will have scheduler_state == * SCHED_CHAN_IDLE * - Transitions from: * - Open for writes/has cells by simultaneously draining all circuit * queues and filling the output buffer. * - Transitions to: * - Not open for writes/has cells by arrival of cells on an attached * circuit (this would be driven from append_cell_to_circuit_queue()) * - Open for writes/no cells by a channel type specific path; * driven from connection_or_flushed_some() for channel_tls_t. * * 2.) Open for writes, no cells to send * - Not much here either; this will be the state an idle but open channel * can be expected to settle in. It will have scheduler_state == * SCHED_CHAN_WAITING_FOR_CELLS * - Transitions from: * - Not open for writes/no cells by flushing some of the output * buffer. * - Open for writes/has cells by the scheduler moving cells from * circuit queues to channel output queue, but not having enough * to fill the output queue. * - Transitions to: * - Open for writes/has cells by arrival of new cells on an attached * circuit, in append_cell_to_circuit_queue() * * 3.) Not open for writes, cells to send * - This is the state of a busy circuit limited by output bandwidth; * cells have piled up in the circuit queues waiting to be relayed. * The channel will have scheduler_state == SCHED_CHAN_WAITING_TO_WRITE. * - Transitions from: * - Not open for writes/no cells by arrival of cells on an attached * circuit * - Open for writes/has cells by filling an output buffer without * draining all cells from attached circuits * - Transitions to: * - Opens for writes/has cells by draining some of the output buffer * via the connection_or_flushed_some() path (for channel_tls_t). * * 4.) Open for writes, cells to send * - This connection is ready to relay some cells and waiting for * the scheduler to choose it. The channel will have scheduler_state == * SCHED_CHAN_PENDING. * - Transitions from: * - Not open for writes/has cells by the connection_or_flushed_some() * path * - Open for writes/no cells by the append_cell_to_circuit_queue() * path * - Transitions to: * - Not open for writes/no cells by draining all circuit queues and * simultaneously filling the output buffer. * - Not open for writes/has cells by writing enough cells to fill the * output buffer * - Open for writes/no cells by draining all attached circuit queues * without also filling the output buffer * * Other event-driven parts of the code move channels between these scheduling * states by calling scheduler functions; the scheduler only runs on open-for- * writes/has-cells channels and is the only path for those to transition to * other states. The scheduler_run() function gives us the opportunity to do * scheduling work, and is called from other scheduler functions whenever a * state transition occurs, and periodically from the main event loop. */ /* Scheduler global data structures */ /* * We keep a list of channels that are pending - i.e, have cells to write * and can accept them to send. The enum scheduler_state in channel_t * is reserved for our use. */ /* Pqueue of channels that can write and have cells (pending work) */ STATIC smartlist_t *channels_pending = NULL; /* * This event runs the scheduler from its callback, and is manually * activated whenever a channel enters open for writes/cells to send. */ STATIC struct event *run_sched_ev = NULL; /* * Queue heuristic; this is not the queue size, but an 'effective queuesize' * that ages out contributions from stalled channels. */ STATIC uint64_t queue_heuristic = 0; /* * Timestamp for last queue heuristic update */ STATIC time_t queue_heuristic_timestamp = 0; /* Scheduler static function declarations */ static void scheduler_evt_callback(evutil_socket_t fd, short events, void *arg); static int scheduler_more_work(void); static void scheduler_retrigger(void); #if 0 static void scheduler_trigger(void); #endif /* Scheduler function implementations */ /** Free everything and shut down the scheduling system */ void scheduler_free_all(void) { log_debug(LD_SCHED, "Shutting down scheduler"); if (run_sched_ev) { event_del(run_sched_ev); tor_event_free(run_sched_ev); run_sched_ev = NULL; } if (channels_pending) { smartlist_free(channels_pending); channels_pending = NULL; } } /** * Comparison function to use when sorting pending channels */ MOCK_IMPL(STATIC int, scheduler_compare_channels, (const void *c1_v, const void *c2_v)) { channel_t *c1 = NULL, *c2 = NULL; /* These are a workaround for -Wbad-function-cast throwing a fit */ const circuitmux_policy_t *p1, *p2; uintptr_t p1_i, p2_i; tor_assert(c1_v); tor_assert(c2_v); c1 = (channel_t *)(c1_v); c2 = (channel_t *)(c2_v); tor_assert(c1); tor_assert(c2); if (c1 != c2) { if (circuitmux_get_policy(c1->cmux) == circuitmux_get_policy(c2->cmux)) { /* Same cmux policy, so use the mux comparison */ return circuitmux_compare_muxes(c1->cmux, c2->cmux); } else { /* * Different policies; not important to get this edge case perfect * because the current code never actually gives different channels * different cmux policies anyway. Just use this arbitrary but * definite choice. */ p1 = circuitmux_get_policy(c1->cmux); p2 = circuitmux_get_policy(c2->cmux); p1_i = (uintptr_t)p1; p2_i = (uintptr_t)p2; return (p1_i < p2_i) ? -1 : 1; } } else { /* c1 == c2, so always equal */ return 0; } } /* * Scheduler event callback; this should get triggered once per event loop * if any scheduling work was created during the event loop. */ static void scheduler_evt_callback(evutil_socket_t fd, short events, void *arg) { (void)fd; (void)events; (void)arg; log_debug(LD_SCHED, "Scheduler event callback called"); tor_assert(run_sched_ev); /* Run the scheduler */ scheduler_run(); /* Do we have more work to do? */ if (scheduler_more_work()) scheduler_retrigger(); } /** Mark a channel as no longer ready to accept writes */ MOCK_IMPL(void, scheduler_channel_doesnt_want_writes,(channel_t *chan)) { tor_assert(chan); tor_assert(channels_pending); /* If it's already in pending, we can put it in waiting_to_write */ if (chan->scheduler_state == SCHED_CHAN_PENDING) { /* * It's in channels_pending, so it shouldn't be in any of * the other lists. It can't write any more, so it goes to * channels_waiting_to_write. */ smartlist_pqueue_remove(channels_pending, scheduler_compare_channels, STRUCT_OFFSET(channel_t, sched_heap_idx), chan); chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE; log_debug(LD_SCHED, "Channel " U64_FORMAT " at %p went from pending " "to waiting_to_write", U64_PRINTF_ARG(chan->global_identifier), chan); } else { /* * It's not in pending, so it can't become waiting_to_write; it's * either not in any of the lists (nothing to do) or it's already in * waiting_for_cells (remove it, can't write any more). */ if (chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS) { chan->scheduler_state = SCHED_CHAN_IDLE; log_debug(LD_SCHED, "Channel " U64_FORMAT " at %p left waiting_for_cells", U64_PRINTF_ARG(chan->global_identifier), chan); } } } /** Mark a channel as having waiting cells */ MOCK_IMPL(void, scheduler_channel_has_waiting_cells,(channel_t *chan)) { int became_pending = 0; tor_assert(chan); tor_assert(channels_pending); /* First, check if this one also writeable */ if (chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS) { /* * It's in channels_waiting_for_cells, so it shouldn't be in any of * the other lists. It has waiting cells now, so it goes to * channels_pending. */ chan->scheduler_state = SCHED_CHAN_PENDING; smartlist_pqueue_add(channels_pending, scheduler_compare_channels, STRUCT_OFFSET(channel_t, sched_heap_idx), chan); log_debug(LD_SCHED, "Channel " U64_FORMAT " at %p went from waiting_for_cells " "to pending", U64_PRINTF_ARG(chan->global_identifier), chan); became_pending = 1; } else { /* * It's not in waiting_for_cells, so it can't become pending; it's * either not in any of the lists (we add it to waiting_to_write) * or it's already in waiting_to_write or pending (we do nothing) */ if (!(chan->scheduler_state == SCHED_CHAN_WAITING_TO_WRITE || chan->scheduler_state == SCHED_CHAN_PENDING)) { chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE; log_debug(LD_SCHED, "Channel " U64_FORMAT " at %p entered waiting_to_write", U64_PRINTF_ARG(chan->global_identifier), chan); } } /* * If we made a channel pending, we potentially have scheduling work * to do. */ if (became_pending) scheduler_retrigger(); } /** Set up the scheduling system */ void scheduler_init(void) { log_debug(LD_SCHED, "Initting scheduler"); tor_assert(!run_sched_ev); run_sched_ev = tor_event_new(tor_libevent_get_base(), -1, 0, scheduler_evt_callback, NULL); channels_pending = smartlist_new(); queue_heuristic = 0; queue_heuristic_timestamp = approx_time(); } /** Check if there's more scheduling work */ static int scheduler_more_work(void) { tor_assert(channels_pending); return ((scheduler_get_queue_heuristic() < sched_q_low_water) && ((smartlist_len(channels_pending) > 0))) ? 1 : 0; } /** Retrigger the scheduler in a way safe to use from the callback */ static void scheduler_retrigger(void) { tor_assert(run_sched_ev); event_active(run_sched_ev, EV_TIMEOUT, 1); } /** Notify the scheduler of a channel being closed */ MOCK_IMPL(void, scheduler_release_channel,(channel_t *chan)) { tor_assert(chan); tor_assert(channels_pending); if (chan->scheduler_state == SCHED_CHAN_PENDING) { smartlist_pqueue_remove(channels_pending, scheduler_compare_channels, STRUCT_OFFSET(channel_t, sched_heap_idx), chan); } chan->scheduler_state = SCHED_CHAN_IDLE; } /** Run the scheduling algorithm if necessary */ MOCK_IMPL(void, scheduler_run, (void)) { int n_cells, n_chans_before, n_chans_after; uint64_t q_len_before, q_heur_before, q_len_after, q_heur_after; ssize_t flushed, flushed_this_time; smartlist_t *to_readd = NULL; channel_t *chan = NULL; log_debug(LD_SCHED, "We have a chance to run the scheduler"); if (scheduler_get_queue_heuristic() < sched_q_low_water) { n_chans_before = smartlist_len(channels_pending); q_len_before = channel_get_global_queue_estimate(); q_heur_before = scheduler_get_queue_heuristic(); while (scheduler_get_queue_heuristic() <= sched_q_high_water && smartlist_len(channels_pending) > 0) { /* Pop off a channel */ chan = smartlist_pqueue_pop(channels_pending, scheduler_compare_channels, STRUCT_OFFSET(channel_t, sched_heap_idx)); tor_assert(chan); /* Figure out how many cells we can write */ n_cells = channel_num_cells_writeable(chan); if (n_cells > 0) { log_debug(LD_SCHED, "Scheduler saw pending channel " U64_FORMAT " at %p with " "%d cells writeable", U64_PRINTF_ARG(chan->global_identifier), chan, n_cells); flushed = 0; while (flushed < n_cells && scheduler_get_queue_heuristic() <= sched_q_high_water) { flushed_this_time = channel_flush_some_cells(chan, MIN(sched_max_flush_cells, (size_t) n_cells - flushed)); if (flushed_this_time <= 0) break; flushed += flushed_this_time; } if (flushed < n_cells) { /* We ran out of cells to flush */ chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS; log_debug(LD_SCHED, "Channel " U64_FORMAT " at %p " "entered waiting_for_cells from pending", U64_PRINTF_ARG(chan->global_identifier), chan); } else { /* The channel may still have some cells */ if (channel_more_to_flush(chan)) { /* The channel goes to either pending or waiting_to_write */ if (channel_num_cells_writeable(chan) > 0) { /* Add it back to pending later */ if (!to_readd) to_readd = smartlist_new(); smartlist_add(to_readd, chan); log_debug(LD_SCHED, "Channel " U64_FORMAT " at %p " "is still pending", U64_PRINTF_ARG(chan->global_identifier), chan); } else { /* It's waiting to be able to write more */ chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE; log_debug(LD_SCHED, "Channel " U64_FORMAT " at %p " "entered waiting_to_write from pending", U64_PRINTF_ARG(chan->global_identifier), chan); } } else { /* No cells left; it can go to idle or waiting_for_cells */ if (channel_num_cells_writeable(chan) > 0) { /* * It can still accept writes, so it goes to * waiting_for_cells */ chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS; log_debug(LD_SCHED, "Channel " U64_FORMAT " at %p " "entered waiting_for_cells from pending", U64_PRINTF_ARG(chan->global_identifier), chan); } else { /* * We exactly filled up the output queue with all available * cells; go to idle. */ chan->scheduler_state = SCHED_CHAN_IDLE; log_debug(LD_SCHED, "Channel " U64_FORMAT " at %p " "become idle from pending", U64_PRINTF_ARG(chan->global_identifier), chan); } } } log_debug(LD_SCHED, "Scheduler flushed %d cells onto pending channel " U64_FORMAT " at %p", (int)flushed, U64_PRINTF_ARG(chan->global_identifier), chan); } else { log_info(LD_SCHED, "Scheduler saw pending channel " U64_FORMAT " at %p with " "no cells writeable", U64_PRINTF_ARG(chan->global_identifier), chan); /* Put it back to WAITING_TO_WRITE */ chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE; } } /* Readd any channels we need to */ if (to_readd) { SMARTLIST_FOREACH_BEGIN(to_readd, channel_t *, chan) { chan->scheduler_state = SCHED_CHAN_PENDING; smartlist_pqueue_add(channels_pending, scheduler_compare_channels, STRUCT_OFFSET(channel_t, sched_heap_idx), chan); } SMARTLIST_FOREACH_END(chan); smartlist_free(to_readd); } n_chans_after = smartlist_len(channels_pending); q_len_after = channel_get_global_queue_estimate(); q_heur_after = scheduler_get_queue_heuristic(); log_debug(LD_SCHED, "Scheduler handled %d of %d pending channels, queue size from " U64_FORMAT " to " U64_FORMAT ", queue heuristic from " U64_FORMAT " to " U64_FORMAT, n_chans_before - n_chans_after, n_chans_before, U64_PRINTF_ARG(q_len_before), U64_PRINTF_ARG(q_len_after), U64_PRINTF_ARG(q_heur_before), U64_PRINTF_ARG(q_heur_after)); } } /** Trigger the scheduling event so we run the scheduler later */ #if 0 static void scheduler_trigger(void) { log_debug(LD_SCHED, "Triggering scheduler event"); tor_assert(run_sched_ev); event_add(run_sched_ev, EV_TIMEOUT, 1); } #endif /** Mark a channel as ready to accept writes */ void scheduler_channel_wants_writes(channel_t *chan) { int became_pending = 0; tor_assert(chan); tor_assert(channels_pending); /* If it's already in waiting_to_write, we can put it in pending */ if (chan->scheduler_state == SCHED_CHAN_WAITING_TO_WRITE) { /* * It can write now, so it goes to channels_pending. */ smartlist_pqueue_add(channels_pending, scheduler_compare_channels, STRUCT_OFFSET(channel_t, sched_heap_idx), chan); chan->scheduler_state = SCHED_CHAN_PENDING; log_debug(LD_SCHED, "Channel " U64_FORMAT " at %p went from waiting_to_write " "to pending", U64_PRINTF_ARG(chan->global_identifier), chan); became_pending = 1; } else { /* * It's not in SCHED_CHAN_WAITING_TO_WRITE, so it can't become pending; * it's either idle and goes to WAITING_FOR_CELLS, or it's a no-op. */ if (!(chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS || chan->scheduler_state == SCHED_CHAN_PENDING)) { chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS; log_debug(LD_SCHED, "Channel " U64_FORMAT " at %p entered waiting_for_cells", U64_PRINTF_ARG(chan->global_identifier), chan); } } /* * If we made a channel pending, we potentially have scheduling work * to do. */ if (became_pending) scheduler_retrigger(); } /** * Notify the scheduler that a channel's position in the pqueue may have * changed */ void scheduler_touch_channel(channel_t *chan) { tor_assert(chan); if (chan->scheduler_state == SCHED_CHAN_PENDING) { /* Remove and re-add it */ smartlist_pqueue_remove(channels_pending, scheduler_compare_channels, STRUCT_OFFSET(channel_t, sched_heap_idx), chan); smartlist_pqueue_add(channels_pending, scheduler_compare_channels, STRUCT_OFFSET(channel_t, sched_heap_idx), chan); } /* else no-op, since it isn't in the queue */ } /** * Notify the scheduler of a queue size adjustment, to recalculate the * queue heuristic. */ void scheduler_adjust_queue_size(channel_t *chan, char dir, uint64_t adj) { time_t now = approx_time(); log_debug(LD_SCHED, "Queue size adjustment by %s" U64_FORMAT " for channel " U64_FORMAT, (dir >= 0) ? "+" : "-", U64_PRINTF_ARG(adj), U64_PRINTF_ARG(chan->global_identifier)); /* Get the queue heuristic up to date */ scheduler_update_queue_heuristic(now); /* Adjust as appropriate */ if (dir >= 0) { /* Increasing it */ queue_heuristic += adj; } else { /* Decreasing it */ if (queue_heuristic > adj) queue_heuristic -= adj; else queue_heuristic = 0; } log_debug(LD_SCHED, "Queue heuristic is now " U64_FORMAT, U64_PRINTF_ARG(queue_heuristic)); } /** * Query the current value of the queue heuristic */ STATIC uint64_t scheduler_get_queue_heuristic(void) { time_t now = approx_time(); scheduler_update_queue_heuristic(now); return queue_heuristic; } /** * Adjust the queue heuristic value to the present time */ STATIC void scheduler_update_queue_heuristic(time_t now) { time_t diff; if (queue_heuristic_timestamp == 0) { /* * Nothing we can sensibly do; must not have been initted properly. * Oh well. */ queue_heuristic_timestamp = now; } else if (queue_heuristic_timestamp < now) { diff = now - queue_heuristic_timestamp; /* * This is a simple exponential age-out; the other proposed alternative * was a linear age-out using the bandwidth history in rephist.c; I'm * going with this out of concern that if an adversary can jam the * scheduler long enough, it would cause the bandwidth to drop to * zero and render the aging mechanism ineffective thereafter. */ if (0 <= diff && diff < 64) queue_heuristic >>= diff; else queue_heuristic = 0; queue_heuristic_timestamp = now; log_debug(LD_SCHED, "Queue heuristic is now " U64_FORMAT, U64_PRINTF_ARG(queue_heuristic)); } /* else no update needed, or time went backward */ } /** * Set scheduler watermarks and flush size */ void scheduler_set_watermarks(uint32_t lo, uint32_t hi, uint32_t max_flush) { /* Sanity assertions - caller should ensure these are true */ tor_assert(lo > 0); tor_assert(hi > lo); tor_assert(max_flush > 0); sched_q_low_water = lo; sched_q_high_water = hi; sched_max_flush_cells = max_flush; }