diff --git a/changes/bug24767 b/changes/bug24767
new file mode 100644
index 0000000000..56fbe51a98
--- /dev/null
+++ b/changes/bug24767
@@ -0,0 +1,5 @@
+ o Major bugfixes (relay, connection):
+ - Refuse to connect again to a relay from which we failed previously with
+ a connection refused, timeout or error (at the TCP level). The relay
+ won't be retried for 60 seconds after the failure occurred. Fixes bug
+ 24767; bugfix on 0.0.6.
diff --git a/src/or/connection_or.c b/src/or/connection_or.c
index 272a086a32..267463312c 100644
--- a/src/or/connection_or.c
+++ b/src/or/connection_or.c
@@ -28,6 +28,7 @@
* part of a subclass (channel_tls_t).
*/
#define TOR_CHANNEL_INTERNAL_
+#define CONNECTION_OR_PRIVATE
#include "channel.h"
#include "channeltls.h"
#include "circuitbuild.h"
@@ -1122,6 +1123,216 @@ connection_or_group_set_badness_(smartlist_t *group, int force)
} SMARTLIST_FOREACH_END(or_conn);
}
+/* Lifetime of a connection failure entry, in seconds. Once it has elapsed,
+ * connecting to that destination is allowed again. */
+#define OR_CONNECT_FAILURE_LIFETIME 60
+/* Minimum delay, in seconds, between two cleanups of the failure cache. */
+#define OR_CONNECT_FAILURE_CLEANUP_INTERVAL 60
+
+/* Earliest time at which the failure map gets cleaned up again. We keep
+ * this because should_connect_to_relay() cleans it opportunistically. */
+static time_t or_connect_failure_map_next_cleanup_ts = 0;
+
+/* OR connection failure entry data structure. It is kept in the connection
+ * failure map defined below and indexed by OR identity digest, address and
+ * port.
+ *
+ * A failure is identified by all three values so that we don't wrongfully
+ * blacklist a relay when someone tries to extend to a known identity digest
+ * but with the wrong IP/port. For instance, this can happen if a relay
+ * changed its port while the client still has an old descriptor listing the
+ * old port. What we stop connecting to is that exact IP/port/identity
+ * combination, not the relay identity as a whole. */
+typedef struct or_connect_failure_entry_t {
+ HT_ENTRY(or_connect_failure_entry_t) node; /* Linkage for the table. */
+ /* Identity digest of the relay that the connection was made to. */
+ uint8_t identity_digest[DIGEST_LEN];
+ /* Address of the destination. Stored entries take it from the real_addr
+ * field of the or_connection_t, see or_connect_failure_new().
+ * NOTE(review): the previous note here described the base connection_t
+ * address instead; confirm which of the two addresses is intended. */
+ tor_addr_t addr;
+ uint16_t port; /* Port of the destination. */
+ /* Last time we were unable to connect, as given by approx_time(). */
+ time_t last_failed_connect_ts;
+} or_connect_failure_entry_t;
+
+/* Map where we keep connection failure entries, indexed by addr, port and
+ * identity digest. note_or_connect_failed() adds the entries to it. */
+static HT_HEAD(or_connect_failure_ht, or_connect_failure_entry_t)
+ or_connect_failures_map = HT_INITIALIZER();
+
+/* Helper: Hashtable equality function. Return 1 if a equals b, else 0. */
+static int
+or_connect_failure_ht_eq(const or_connect_failure_entry_t *a,
+                         const or_connect_failure_entry_t *b)
+{
+  return a->port == b->port &&
+         tor_addr_eq(&a->addr, &b->addr) &&
+         fast_memeq(a->identity_digest, b->identity_digest, DIGEST_LEN);
+}
+
+/* Helper: Hashtable hash function. The hashed key is made of the raw address
+ * bytes, followed by the identity digest and then by the port. */
+static unsigned int
+or_connect_failure_ht_hash(const or_connect_failure_entry_t *entry)
+{
+  /* Sized for the largest key, that is the one with an IPv6 address. */
+  uint8_t key[16 + sizeof(uint16_t) + DIGEST_LEN];
+  uint8_t *cursor = key;
+  const void *raw_addr;
+  size_t raw_addr_len;
+
+  switch (tor_addr_family(&entry->addr)) {
+    case AF_INET6:
+      raw_addr = &entry->addr.addr.in6_addr.s6_addr;
+      raw_addr_len = 16;
+      break;
+    case AF_INET:
+      raw_addr = &entry->addr.addr.in_addr.s_addr;
+      raw_addr_len = 4;
+      break;
+    default:
+      tor_assert_nonfatal_unreached();
+      return 0;
+  }
+
+  memcpy(cursor, raw_addr, raw_addr_len);
+  cursor += raw_addr_len;
+  memcpy(cursor, entry->identity_digest, DIGEST_LEN);
+  cursor += DIGEST_LEN;
+  set_uint16(cursor, entry->port);
+  cursor += sizeof(uint16_t);
+
+  return (unsigned int) siphash24g(key, (size_t)(cursor - key));
+}
+
+HT_PROTOTYPE(or_connect_failure_ht, or_connect_failure_entry_t, node,
+ or_connect_failure_ht_hash, or_connect_failure_ht_eq)
+/* Both macros are given the same node, hash and eq arguments: keep in sync. */
+HT_GENERATE2(or_connect_failure_ht, or_connect_failure_entry_t, node,
+ or_connect_failure_ht_hash, or_connect_failure_ht_eq,
+ 0.6, tor_reallocarray_, tor_free_)
+
+/* Fill in the key fields of ocf from identity_digest, addr and port. Only
+ * ocf is mandatory; a NULL identity_digest or addr leaves that field alone. */
+static void
+or_connect_failure_init(const char *identity_digest, const tor_addr_t *addr,
+                        uint16_t port, or_connect_failure_entry_t *ocf)
+{
+  tor_assert(ocf);
+  ocf->port = port;
+  if (addr != NULL) {
+    tor_addr_copy(&ocf->addr, addr);
+  }
+  if (identity_digest != NULL) {
+    memcpy(ocf->identity_digest, identity_digest,
+           sizeof(ocf->identity_digest));
+  }
+}
+
+/* Return a newly allocated connection failure entry keyed on the identity
+ * digest, real_addr and port of the given or_conn. This can't fail. */
+static or_connect_failure_entry_t *
+or_connect_failure_new(const or_connection_t *or_conn)
+{
+  or_connect_failure_entry_t *new_entry = tor_malloc_zero(sizeof(*new_entry));
+  or_connect_failure_init(or_conn->identity_digest, &or_conn->real_addr,
+                          TO_CONN(or_conn)->port, new_entry);
+  return new_entry;
+}
+
+/* Return the failure entry matching or_conn's identity digest, real_addr and
+ * port, or NULL if none. The key must mirror or_connect_failure_new(). */
+static or_connect_failure_entry_t *
+or_connect_failure_find(const or_connection_t *or_conn)
+{
+  or_connect_failure_entry_t lookup;
+  tor_assert(or_conn);
+  or_connect_failure_init(or_conn->identity_digest, &or_conn->real_addr,
+                          TO_CONN(or_conn)->port, &lookup);
+  return HT_FIND(or_connect_failure_ht, &or_connect_failures_map, &lookup);
+}
+
+/* Record in the connection failure cache that connecting to the destination
+ * of or_conn just failed, creating the cache entry if there is none yet. */
+STATIC void
+note_or_connect_failed(const or_connection_t *or_conn)
+{
+  or_connect_failure_entry_t *entry;
+
+  tor_assert(or_conn);
+
+  entry = or_connect_failure_find(or_conn);
+  if (!entry) {
+    entry = or_connect_failure_new(or_conn);
+    HT_INSERT(or_connect_failure_ht, &or_connect_failures_map, entry);
+  }
+  entry->last_failed_connect_ts = approx_time();
+}
+
+/* Cleanup the connection failure cache: remove and free every entry whose
+ * last failure is at or before the given cutoff. */
+static void
+or_connect_failure_map_cleanup(time_t cutoff)
+{
+  or_connect_failure_entry_t **iter;
+
+  iter = HT_START(or_connect_failure_ht, &or_connect_failures_map);
+  while (iter != NULL) {
+    or_connect_failure_entry_t *current = *iter;
+    if (current->last_failed_connect_ts > cutoff) {
+      iter = HT_NEXT(or_connect_failure_ht, &or_connect_failures_map, iter);
+      continue;
+    }
+    iter = HT_NEXT_RMV(or_connect_failure_ht, &or_connect_failures_map, iter);
+    tor_free(current);
+  }
+}
+
+/* Return 1 if the given OR connection may connect to its destination, that
+ * is the triplet identity_digest, address and port. Return 0 if a failure
+ * to connect to it was noted less than OR_CONNECT_FAILURE_LIFETIME seconds
+ * ago. This also purges the failure cache when a cleanup is due.
+ *
+ * The or_conn MUST have gone through connection_or_check_canonicity() so
+ * that its address fields are properly set. */
+STATIC int
+should_connect_to_relay(const or_connection_t *or_conn)
+{
+  time_t now, cutoff;
+  const or_connect_failure_entry_t *ocf;
+
+  tor_assert(or_conn);
+
+  now = approx_time();
+  cutoff = now - OR_CONNECT_FAILURE_LIFETIME;
+
+  /* Opportunistically try to cleanup the failure cache. We do that at
+   * regular interval so it doesn't grow too big. */
+  if (or_connect_failure_map_next_cleanup_ts <= now) {
+    or_connect_failure_map_cleanup(cutoff);
+    or_connect_failure_map_next_cleanup_ts =
+      now + OR_CONNECT_FAILURE_CLEANUP_INTERVAL;
+  }
+
+  /* We never failed to connect to this destination, or that failure has
+   * been cleaned up since: let it connect. Checking this explicitly, rather
+   * than using 0 as a "never failed" timestamp, keeps the answer right even
+   * when the cutoff is negative because the clock is close to the epoch. */
+  ocf = or_connect_failure_find(or_conn);
+  if (ocf == NULL) {
+    return 1;
+  }
+
+  /* A failure that is more recent than the cutoff is still within its
+   * lifetime: refuse to connect. Otherwise, it is old enough to retry. */
+  if (ocf->last_failed_connect_ts > cutoff) {
+    return 0;
+  }
+  return 1;
+}
+
/** conn is in the 'connecting' state, and it failed to complete
* a TCP connection. Send notifications appropriately.
*
@@ -1135,6 +1346,7 @@ connection_or_connect_failed(or_connection_t *conn,
control_event_or_conn_status(conn, OR_CONN_EVENT_FAILED, reason);
if (!authdir_mode_tests_reachability(get_options()))
control_event_bootstrap_prob_or(msg, reason, conn);
+ note_or_connect_failed(conn);
}
/** conn got an error in connection_handle_read_impl() or
@@ -1225,6 +1437,19 @@ connection_or_connect, (const tor_addr_t *_addr, uint16_t port,
conn->chan = chan;
chan->conn = conn;
connection_or_init_conn_from_address(conn, &addr, port, id_digest, ed_id, 1);
+
+ /* We have a proper OR connection setup, now check if we can connect to it
+ * that is we haven't had a failure earlier. This is to avoid to try to
+ * constantly connect to relays that we think are not reachable. */
+ if (!should_connect_to_relay(conn)) {
+ log_info(LD_GENERAL, "Can't connect to identity %s at %s:%u because we "
+ "failed earlier. Refusing.",
+ hex_str(id_digest, DIGEST_LEN), fmt_addr(&TO_CONN(conn)->addr),
+ TO_CONN(conn)->port);
+ connection_free_(TO_CONN(conn));
+ return NULL;
+ }
+
connection_or_change_state(conn, OR_CONN_STATE_CONNECTING);
control_event_or_conn_status(conn, OR_CONN_EVENT_LAUNCHED, 0);
diff --git a/src/or/connection_or.h b/src/or/connection_or.h
index 7c1dced631..158eb1fdad 100644
--- a/src/or/connection_or.h
+++ b/src/or/connection_or.h
@@ -120,6 +120,11 @@ int connection_or_single_set_badness_(time_t now,
int force);
void connection_or_group_set_badness_(smartlist_t *group, int force);
+#ifdef CONNECTION_OR_PRIVATE
+STATIC int should_connect_to_relay(const or_connection_t *or_conn);
+STATIC void note_or_connect_failed(const or_connection_t *or_conn);
+#endif /* defined(CONNECTION_OR_PRIVATE) */
+
#ifdef TOR_UNIT_TESTS
extern int certs_cell_ed25519_disabled_for_testing;
#endif
diff --git a/src/or/nodelist.c b/src/or/nodelist.c
index 125dd8b9f1..3a26aee611 100644
--- a/src/or/nodelist.c
+++ b/src/or/nodelist.c
@@ -161,8 +161,8 @@ init_nodelist(void)
}
/** As node_get_by_id, but returns a non-const pointer */
-node_t *
-node_get_mutable_by_id(const char *identity_digest)
+MOCK_IMPL(node_t *,
+node_get_mutable_by_id,(const char *identity_digest))
{
node_t search, *node;
if (PREDICT_UNLIKELY(the_nodelist == NULL))
diff --git a/src/or/nodelist.h b/src/or/nodelist.h
index dc20eaf0a5..043d7b3414 100644
--- a/src/or/nodelist.h
+++ b/src/or/nodelist.h
@@ -16,7 +16,7 @@
tor_assert((n)->ri || (n)->rs); \
} STMT_END
-node_t *node_get_mutable_by_id(const char *identity_digest);
+MOCK_DECL(node_t *, node_get_mutable_by_id,(const char *identity_digest));
MOCK_DECL(const node_t *, node_get_by_id, (const char *identity_digest));
node_t *node_get_mutable_by_ed25519_id(const ed25519_public_key_t *ed_id);
MOCK_DECL(const node_t *, node_get_by_ed25519_id,
diff --git a/src/test/test_connection.c b/src/test/test_connection.c
index 33f453b8b2..dc0f6860d9 100644
--- a/src/test/test_connection.c
+++ b/src/test/test_connection.c
@@ -5,6 +5,7 @@
#define CONNECTION_PRIVATE
#define MAIN_PRIVATE
+#define CONNECTION_OR_PRIVATE
#include "or.h"
#include "test.h"
@@ -13,9 +14,11 @@
#include "hs_common.h"
#include "main.h"
#include "microdesc.h"
+#include "nodelist.h"
#include "networkstatus.h"
#include "rendcache.h"
#include "directory.h"
+#include "connection_or.h"
#include "test_connection.h"
#include "test_helpers.h"
@@ -776,6 +779,99 @@ test_conn_download_status(void *arg)
/* the teardown function removes all the connections in the global list*/;
}
+/* Node handed out by the mocked node_get_*by_id() functions. */
+static node_t test_node;
+
+/* Mock: ignore digest, return test_node set up as 'c'-identity 18.0.0.1:1. */
+static node_t *
+mock_node_get_mutable_by_id(const char *digest)
+{
+  static routerinfo_t mock_ri;
+  tor_addr_t parsed_addr;
+  (void) digest;
+
+  memset(&mock_ri, 0, sizeof(mock_ri));
+  tor_addr_parse(&parsed_addr, "18.0.0.1");
+  mock_ri.addr = tor_addr_to_ipv4h(&parsed_addr);
+  mock_ri.or_port = 1;
+  memset(test_node.identity, 'c', sizeof(test_node.identity));
+  test_node.ri = &mock_ri;
+  return &test_node;
+}
+
+static const node_t *
+mock_node_get_by_id(const char *digest)
+{
+ (void) digest; /* Unused: this mock always returns test_node. */
+ memset(test_node.identity, 'c', sizeof(test_node.identity));
+ return &test_node;
+}
+
+/* Test whether we correctly track failed connections between relays. */
+static void
+test_failed_orconn_tracker(void *arg)
+{
+  (void) arg;
+
+  int can_connect;
+  time_t now = 1281533250; /* 2010-08-11 13:27:30 UTC */
+
+  update_approx_time(now);
+
+  /* Prepare the (zeroed) OR connection that will be used in this test */
+  or_connection_t or_conn;
+  memset(&or_conn, 0, sizeof(or_conn));
+  tt_int_op(AF_INET,OP_EQ, tor_addr_parse(&or_conn.real_addr, "18.0.0.1"));
+  tt_int_op(AF_INET,OP_EQ, tor_addr_parse(&or_conn.base_.addr, "18.0.0.1"));
+  or_conn.base_.port = 1;
+  memset(or_conn.identity_digest, 'c', sizeof(or_conn.identity_digest));
+
+  /* With an empty failure cache, connecting should be allowed */
+  can_connect = should_connect_to_relay(&or_conn);
+  tt_int_op(can_connect, OP_EQ, 1);
+
+  /* Now add the destination to the failure cache */
+  note_or_connect_failed(&or_conn);
+
+  /* Check again: now it shouldn't connect */
+  can_connect = should_connect_to_relay(&or_conn);
+  tt_int_op(can_connect, OP_EQ, 0);
+
+  /* Move time forward and check again: the cache should have been cleared and
+   * now it should connect */
+  now += 3600;
+  update_approx_time(now);
+  can_connect = should_connect_to_relay(&or_conn);
+  tt_int_op(can_connect, OP_EQ, 1);
+
+  /* Now mock the node_get_*by_id() functions to start using the node subsystem
+   * optimization. */
+  MOCK(node_get_by_id, mock_node_get_by_id);
+  MOCK(node_get_mutable_by_id, mock_node_get_mutable_by_id);
+
+  /* Since we just started using the node subsystem it will allow connections
+   * now */
+  can_connect = should_connect_to_relay(&or_conn);
+  tt_int_op(can_connect, OP_EQ, 1);
+
+  /* Mark it as failed */
+  note_or_connect_failed(&or_conn);
+
+  /* Check that it shouldn't connect now */
+  can_connect = should_connect_to_relay(&or_conn);
+  tt_int_op(can_connect, OP_EQ, 0);
+
+  /* Move time forward and check again: now it should connect */
+  now += 3600;
+  update_approx_time(now);
+  can_connect = should_connect_to_relay(&or_conn);
+  tt_int_op(can_connect, OP_EQ, 1);
+
+ done:
+  UNMOCK(node_get_by_id);
+  UNMOCK(node_get_mutable_by_id);
+}
+
#define CONNECTION_TESTCASE(name, fork, setup) \
{ #name, test_conn_##name, fork, &setup, NULL }
@@ -792,6 +888,7 @@ struct testcase_t connection_tests[] = {
CONNECTION_TESTCASE_ARG(download_status, TT_FORK,
test_conn_download_status_st, FLAV_NS),
//CONNECTION_TESTCASE(func_suffix, TT_FORK, setup_func_pair),
+ { "failed_orconn_tracker", test_failed_orconn_tracker, TT_FORK, NULL, NULL },
END_OF_TESTCASES
};