From bf89278c799e718a3b6c1ea139a6069db054ac09 Mon Sep 17 00:00:00 2001 From: Nick Mathewson Date: Tue, 26 Jun 2018 13:18:23 -0400 Subject: [PATCH] Refactor bloom filter logic not to be digest-specific. Now the address-set code and the digest-set code share the same backend. Closes ticket 26510 --- changes/ticket26510 | 4 ++ src/common/address_set.c | 85 +++++++---------------------------- src/common/address_set.h | 12 ++--- src/lib/container/bloomfilt.c | 79 +++++++++++++++++++++++++++++--- src/lib/container/bloomfilt.h | 61 ++++++++----------------- src/lib/crypt_ops/digestset.c | 58 ++++++++++++++++++++++++ src/lib/crypt_ops/digestset.h | 32 +++++++++++++ src/lib/crypt_ops/include.am | 6 ++- src/or/entrynodes.c | 2 +- src/or/routerlist.c | 2 +- src/test/bench.c | 4 +- src/test/test_containers.c | 2 +- 12 files changed, 214 insertions(+), 133 deletions(-) create mode 100644 changes/ticket26510 create mode 100644 src/lib/crypt_ops/digestset.c create mode 100644 src/lib/crypt_ops/digestset.h diff --git a/changes/ticket26510 b/changes/ticket26510 new file mode 100644 index 0000000000..f00457964d --- /dev/null +++ b/changes/ticket26510 @@ -0,0 +1,4 @@ + o Code simplification and refactoring: + - Unify our bloom filter logic. Previously we had two copies of this + code: one for routerlist filtering, and one for address set + calculations. Closes ticket 26510. diff --git a/src/common/address_set.c b/src/common/address_set.c index f4c5f581c8..369d33e9a6 100644 --- a/src/common/address_set.c +++ b/src/common/address_set.c @@ -14,35 +14,19 @@ #include "common/address_set.h" #include "common/address.h" #include "common/compat.h" -#include "lib/container/bitarray.h" +#include "lib/container/bloomfilt.h" #include "lib/crypt_ops/crypto_rand.h" #include "common/util.h" #include "siphash.h" -/** How many 64-bit siphash values to extract per address */ -#define N_HASHES 2 -/** How many bloom-filter bits we set per address. This is twice the N_HASHES - * value, since we split the siphash output into two 32-bit values. */ -#define N_BITS_PER_ITEM (N_HASHES * 2) - -/* XXXX This code is largely duplicated with digestset_t. We should merge - * them together into a common bloom-filter implementation. I'm keeping - * them separate for now, though, since this module needs to be backported - * all the way to 0.2.9. - * - * The main difference between digestset_t and this code is that we use - * independent siphashes rather than messing around with bit-shifts. The - * approach here is probably more sound, and we should prefer it if&when we - * unify the implementations. - */ - -struct address_set_t { - /** siphash keys to make N_HASHES independent hashes for each address. */ - struct sipkey key[N_HASHES]; - int mask; /**< One less than the number of bits in ba; always one less - * than a power of two. */ - bitarray_t *ba; /**< A bit array to implement the Bloom filter. */ -}; +/* Wrap our hash function to have the signature that the bloom filter + * needs. */ +static uint64_t +bloomfilt_addr_hash(const struct sipkey *key, + const void *item) +{ + return tor_addr_keyed_hash(key, item); +} /** * Allocate and return an address_set, suitable for holding up to @@ -51,33 +35,11 @@ struct address_set_t { address_set_t * address_set_new(int max_addresses_guess) { - /* See digestset_new() for rationale on this equation. */ - int n_bits = 1u << (tor_log2(max_addresses_guess)+5); - - address_set_t *set = tor_malloc_zero(sizeof(address_set_t)); - set->mask = n_bits - 1; - set->ba = bitarray_init_zero(n_bits); - crypto_rand((char*) set->key, sizeof(set->key)); - - return set; + uint8_t k[BLOOMFILT_KEY_LEN]; + crypto_rand((void*)k, sizeof(k)); + return bloomfilt_new(max_addresses_guess, bloomfilt_addr_hash, k); } -/** - * Release all storage associated with set. - */ -void -address_set_free(address_set_t *set) -{ - if (! set) - return; - - bitarray_free(set->ba); - tor_free(set); -} - -/** Yield the bit index corresponding to 'val' for set. */ -#define BIT(set, val) ((val) & (set)->mask) - /** * Add addr to set. * @@ -87,14 +49,7 @@ address_set_free(address_set_t *set) void address_set_add(address_set_t *set, const struct tor_addr_t *addr) { - int i; - for (i = 0; i < N_HASHES; ++i) { - uint64_t h = tor_addr_keyed_hash(&set->key[i], addr); - uint32_t high_bits = (uint32_t)(h >> 32); - uint32_t low_bits = (uint32_t)(h); - bitarray_set(set->ba, BIT(set, high_bits)); - bitarray_set(set->ba, BIT(set, low_bits)); - } + bloomfilt_add(set, addr); } /** As address_set_add(), but take an ipv4 address in host order. */ @@ -111,18 +66,8 @@ address_set_add_ipv4h(address_set_t *set, uint32_t addr) * return false if addr is not a member of set.) */ int -address_set_probably_contains(address_set_t *set, +address_set_probably_contains(const address_set_t *set, const struct tor_addr_t *addr) { - int i, matches = 0; - for (i = 0; i < N_HASHES; ++i) { - uint64_t h = tor_addr_keyed_hash(&set->key[i], addr); - uint32_t high_bits = (uint32_t)(h >> 32); - uint32_t low_bits = (uint32_t)(h); - // Note that !! is necessary here, since bitarray_is_set does not - // necessarily return 1 on true. - matches += !! bitarray_is_set(set->ba, BIT(set, high_bits)); - matches += !! bitarray_is_set(set->ba, BIT(set, low_bits)); - } - return matches == N_BITS_PER_ITEM; + return bloomfilt_probably_contains(set, addr); } diff --git a/src/common/address_set.h b/src/common/address_set.h index cfee89cfb8..2efa1cb03b 100644 --- a/src/common/address_set.h +++ b/src/common/address_set.h @@ -4,10 +4,6 @@ /** * \file address_set.h * \brief Types to handle sets of addresses. - * - * This module was first written on a semi-emergency basis to improve the - * robustness of the anti-DoS module. As such, it's written in a pretty - * conservative way, and should be susceptible to improvement later on. **/ #ifndef TOR_ADDRESS_SET_H @@ -15,21 +11,21 @@ #include "orconfig.h" #include "lib/cc/torint.h" +#include "lib/container/bloomfilt.h" /** * An address_set_t represents a set of tor_addr_t values. The implementation * is probabilistic: false negatives cannot occur but false positives are * possible. */ -typedef struct address_set_t address_set_t; +typedef struct bloomfilt_t address_set_t; struct tor_addr_t; address_set_t *address_set_new(int max_addresses_guess); -void address_set_free(address_set_t *set); +#define address_set_free(set) bloomfilt_free(set) void address_set_add(address_set_t *set, const struct tor_addr_t *addr); void address_set_add_ipv4h(address_set_t *set, uint32_t addr); -int address_set_probably_contains(address_set_t *set, +int address_set_probably_contains(const address_set_t *set, const struct tor_addr_t *addr); #endif - diff --git a/src/lib/container/bloomfilt.c b/src/lib/container/bloomfilt.c index 2133c280a5..1cab817e18 100644 --- a/src/lib/container/bloomfilt.c +++ b/src/lib/container/bloomfilt.c @@ -9,14 +9,75 @@ **/ #include + #include "lib/malloc/util_malloc.h" #include "lib/container/bloomfilt.h" #include "lib/intmath/bits.h" +#include "lib/log/util_bug.h" +#include "siphash.h" -/** Return a newly allocated digestset_t, optimized to hold a total of - * max_elements digests with a reasonably low false positive weight. */ -digestset_t * -digestset_new(int max_elements) +/** How many bloom-filter bits we set per address. This is twice the + * BLOOMFILT_N_HASHES value, since we split the siphash output into two 32-bit + * values. */ +#define N_BITS_PER_ITEM (BLOOMFILT_N_HASHES * 2) + +struct bloomfilt_t { + /** siphash keys to make BLOOMFILT_N_HASHES independent hashes for each + * items. */ + struct sipkey key[BLOOMFILT_N_HASHES]; + bloomfilt_hash_fn hashfn; /**< Function used to generate hashes */ + uint32_t mask; /**< One less than the number of bits in ba; always + * one less than a power of two. */ + bitarray_t *ba; /**< A bit array to implement the Bloom filter. */ +}; + +#define BIT(set, n) ((n) & (set)->mask) + +/** Add the element item to set. */ +void +bloomfilt_add(bloomfilt_t *set, + const void *item) +{ + int i; + for (i = 0; i < BLOOMFILT_N_HASHES; ++i) { + uint64_t h = set->hashfn(&set->key[i], item); + uint32_t high_bits = (uint32_t)(h >> 32); + uint32_t low_bits = (uint32_t)(h); + bitarray_set(set->ba, BIT(set, high_bits)); + bitarray_set(set->ba, BIT(set, low_bits)); + } +} + +/** If item is in set, return nonzero. Otherwise, + * probably return zero. */ +int +bloomfilt_probably_contains(const bloomfilt_t *set, + const void *item) +{ + int i, matches = 0; + for (i = 0; i < BLOOMFILT_N_HASHES; ++i) { + uint64_t h = set->hashfn(&set->key[i], item); + uint32_t high_bits = (uint32_t)(h >> 32); + uint32_t low_bits = (uint32_t)(h); + // Note that !! is necessary here, since bitarray_is_set does not + // necessarily return 1 on true. + matches += !! bitarray_is_set(set->ba, BIT(set, high_bits)); + matches += !! bitarray_is_set(set->ba, BIT(set, low_bits)); + } + return matches == N_BITS_PER_ITEM; +} + +/** Return a newly allocated bloomfilt_t, optimized to hold a total of + * max_elements elements with a reasonably low false positive weight. + * + * Uses the siphash-based function hashfn to compute hard-to-collide + * functions of the items, and the key material random_key to + * key the hash. There must be BLOOMFILT_KEY_LEN bytes in the supplied key. + **/ +bloomfilt_t * +bloomfilt_new(int max_elements, + bloomfilt_hash_fn hashfn, + const uint8_t *random_key) { /* The probability of false positives is about P=(1 - exp(-kn/m))^k, where k * is the number of hash functions per entry, m is the bits in the array, @@ -29,15 +90,21 @@ digestset_new(int max_elements) * conserve CPU, and k==13 is pretty big. */ int n_bits = 1u << (tor_log2(max_elements)+5); - digestset_t *r = tor_malloc(sizeof(digestset_t)); + bloomfilt_t *r = tor_malloc(sizeof(bloomfilt_t)); r->mask = n_bits - 1; r->ba = bitarray_init_zero(n_bits); + + tor_assert(sizeof(r->key) == BLOOMFILT_KEY_LEN); + memcpy(r->key, random_key, sizeof(r->key)); + + r->hashfn = hashfn; + return r; } /** Free all storage held in set. */ void -digestset_free_(digestset_t *set) +bloomfilt_free_(bloomfilt_t *set) { if (!set) return; diff --git a/src/lib/container/bloomfilt.h b/src/lib/container/bloomfilt.h index adcdb10fc3..577acf5e48 100644 --- a/src/lib/container/bloomfilt.h +++ b/src/lib/container/bloomfilt.h @@ -9,50 +9,27 @@ #include "orconfig.h" #include "lib/cc/torint.h" #include "lib/container/bitarray.h" -#include "siphash.h" -/** A set of digests, implemented as a Bloom filter. */ -typedef struct { - int mask; /**< One less than the number of bits in ba; always one less - * than a power of two. */ - bitarray_t *ba; /**< A bit array to implement the Bloom filter. */ -} digestset_t; +/** A set of elements, implemented as a Bloom filter. */ +typedef struct bloomfilt_t bloomfilt_t; -#define BIT(n) ((n) & set->mask) -/** Add the digest digest to set. */ -static inline void -digestset_add(digestset_t *set, const char *digest) -{ - const uint64_t x = siphash24g(digest, 20); - const uint32_t d1 = (uint32_t) x; - const uint32_t d2 = (uint32_t)( (x>>16) + x); - const uint32_t d3 = (uint32_t)( (x>>32) + x); - const uint32_t d4 = (uint32_t)( (x>>48) + x); - bitarray_set(set->ba, BIT(d1)); - bitarray_set(set->ba, BIT(d2)); - bitarray_set(set->ba, BIT(d3)); - bitarray_set(set->ba, BIT(d4)); -} +/** How many 64-bit siphash values to extract per item. */ +#define BLOOMFILT_N_HASHES 2 -/** If digest is in set, return nonzero. Otherwise, - * probably return zero. */ -static inline int -digestset_contains(const digestset_t *set, const char *digest) -{ - const uint64_t x = siphash24g(digest, 20); - const uint32_t d1 = (uint32_t) x; - const uint32_t d2 = (uint32_t)( (x>>16) + x); - const uint32_t d3 = (uint32_t)( (x>>32) + x); - const uint32_t d4 = (uint32_t)( (x>>48) + x); - return bitarray_is_set(set->ba, BIT(d1)) && - bitarray_is_set(set->ba, BIT(d2)) && - bitarray_is_set(set->ba, BIT(d3)) && - bitarray_is_set(set->ba, BIT(d4)); -} -#undef BIT +/** How much key material do we need to randomize hashes? */ +#define BLOOMFILT_KEY_LEN (BLOOMFILT_N_HASHES * 16) -digestset_t *digestset_new(int max_elements); -void digestset_free_(digestset_t* set); -#define digestset_free(set) FREE_AND_NULL(digestset_t, digestset_free_, (set)) +struct sipkey; +typedef uint64_t (*bloomfilt_hash_fn)(const struct sipkey *key, + const void *item); -#endif /* !defined(TOR_CONTAINER_H) */ +void bloomfilt_add(bloomfilt_t *set, const void *item); +int bloomfilt_probably_contains(const bloomfilt_t *set, const void *item); + +bloomfilt_t *bloomfilt_new(int max_elements, + bloomfilt_hash_fn hashfn, + const uint8_t *random_key); +void bloomfilt_free_(bloomfilt_t* set); +#define bloomfilt_free(set) FREE_AND_NULL(bloomfilt_t, bloomfilt_free_, (set)) + +#endif /* !defined(TOR_BLOOMFILT_H) */ diff --git a/src/lib/crypt_ops/digestset.c b/src/lib/crypt_ops/digestset.c new file mode 100644 index 0000000000..89dd377a9c --- /dev/null +++ b/src/lib/crypt_ops/digestset.c @@ -0,0 +1,58 @@ +/* Copyright (c) 2018-2018, The Tor Project, Inc. */ +/* See LICENSE for licensing information */ + +/** + * \file digestset.c + * \brief Implementation for a set of digests + **/ + +#include "orconfig.h" +#include "lib/container/bloomfilt.h" +#include "lib/crypt_ops/crypto_rand.h" +#include "lib/defs/digest_sizes.h" +#include "lib/crypt_ops/digestset.h" +#include "siphash.h" + +/* Wrap our hash function to have the signature that the bloom filter + * needs. */ +static uint64_t +bloomfilt_digest_hash(const struct sipkey *key, + const void *item) +{ + return siphash24(item, DIGEST_LEN, key); +} + +/** + * Allocate and return an digestset, suitable for holding up to + * max_guess distinct values. + */ +digestset_t * +digestset_new(int max_guess) +{ + uint8_t k[BLOOMFILT_KEY_LEN]; + crypto_rand((void*)k, sizeof(k)); + return bloomfilt_new(max_guess, bloomfilt_digest_hash, k); +} + +/** + * Add digest to set. + * + * All future queries for digest in set will return true. Removing + * items is not possible. + */ +void +digestset_add(digestset_t *set, const char *digest) +{ + bloomfilt_add(set, digest); +} + +/** + * Return true if digest is a member of set. (And probably, + * return false if digest is not a member of set.) + */ +int +digestset_probably_contains(const digestset_t *set, + const char *digest) +{ + return bloomfilt_probably_contains(set, digest); +} diff --git a/src/lib/crypt_ops/digestset.h b/src/lib/crypt_ops/digestset.h new file mode 100644 index 0000000000..3b5d62c310 --- /dev/null +++ b/src/lib/crypt_ops/digestset.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018, The Tor Project, Inc. */ +/* See LICENSE for licensing information */ + +/** + * \file digestset.h + * \brief Types to handle sets of digests, based on bloom filters. + **/ + +#ifndef TOR_DIGESTSET_H +#define TOR_DIGESTSET_H + +#include "orconfig.h" +#include "lib/cc/torint.h" +#include "lib/container/bloomfilt.h" + +/** + * An digestset_t represents a set of 20-byte digest values. The + * implementation is probabilistic: false negatives cannot occur but false + * positives are possible. + */ +typedef struct bloomfilt_t digestset_t; + +digestset_t *digestset_new(int max_addresses_guess); +#define digestset_free(set) bloomfilt_free(set) +void digestset_add(digestset_t *set, const char *addr); +int digestset_probably_contains(const digestset_t *set, + const char *addr); + +// XXXX to remove. +#define digestset_contains digestset_probably_contains + +#endif diff --git a/src/lib/crypt_ops/include.am b/src/lib/crypt_ops/include.am index b881c689d8..1b88b880d0 100644 --- a/src/lib/crypt_ops/include.am +++ b/src/lib/crypt_ops/include.am @@ -19,7 +19,8 @@ src_lib_libtor_crypt_ops_a_SOURCES = \ src/lib/crypt_ops/crypto_rand.c \ src/lib/crypt_ops/crypto_rsa.c \ src/lib/crypt_ops/crypto_s2k.c \ - src/lib/crypt_ops/crypto_util.c + src/lib/crypt_ops/crypto_util.c \ + src/lib/crypt_ops/digestset.c src_lib_libtor_crypt_ops_testing_a_SOURCES = \ $(src_lib_libtor_crypt_ops_a_SOURCES) @@ -41,4 +42,5 @@ noinst_HEADERS += \ src/lib/crypt_ops/crypto_rand.h \ src/lib/crypt_ops/crypto_rsa.h \ src/lib/crypt_ops/crypto_s2k.h \ - src/lib/crypt_ops/crypto_util.h + src/lib/crypt_ops/crypto_util.h \ + src/lib/crypt_ops/digestset.h diff --git a/src/or/entrynodes.c b/src/or/entrynodes.c index 3c63d3c1c2..9d27d80a84 100644 --- a/src/or/entrynodes.c +++ b/src/or/entrynodes.c @@ -142,7 +142,7 @@ #include "or/node_st.h" #include "or/origin_circuit_st.h" -#include "lib/container/bloomfilt.h" +#include "lib/crypt_ops/digestset.h" /** A list of existing guard selection contexts. */ static smartlist_t *guard_contexts = NULL; diff --git a/src/or/routerlist.c b/src/or/routerlist.c index 5af09fad2b..a7dda68a56 100644 --- a/src/or/routerlist.c +++ b/src/or/routerlist.c @@ -137,7 +137,7 @@ #include "or/routerlist_st.h" #include "or/vote_routerstatus_st.h" -#include "lib/container/bloomfilt.h" +#include "lib/crypt_ops/digestset.h" // #define DEBUG_ROUTERLIST diff --git a/src/test/bench.c b/src/test/bench.c index 3cfdd0ae4f..aa4af703f5 100644 --- a/src/test/bench.c +++ b/src/test/bench.c @@ -29,7 +29,7 @@ #include "or/cell_st.h" #include "or/or_circuit_st.h" -#include "lib/container/bloomfilt.h" +#include "lib/crypt_ops/digestset.h" #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_PROCESS_CPUTIME_ID) static uint64_t nanostart; @@ -376,7 +376,7 @@ bench_dmap(void) crypto_rand(d, 20); smartlist_add(sl2, tor_memdup(d, 20)); } - printf("nbits=%d\n", ds->mask+1); + //printf("nbits=%d\n", ds->mask+1); reset_perftime(); diff --git a/src/test/test_containers.c b/src/test/test_containers.c index f45082be0b..acb7543b39 100644 --- a/src/test/test_containers.c +++ b/src/test/test_containers.c @@ -10,8 +10,8 @@ #include "test/test.h" #include "lib/container/bitarray.h" -#include "lib/container/bloomfilt.h" #include "lib/container/order.h" +#include "lib/crypt_ops/digestset.h" /** Helper: return a tristate based on comparing the strings in *a and * *b. */