author     Kent Overstreet <kent.overstreet@gmail.com>    2021-03-28 17:38:28 -0400
committer  Kent Overstreet <kent.overstreet@gmail.com>    2021-03-29 00:22:38 -0400
commit     a2094890a90a2f865e49f94e8448deca7e5852ef (patch)
tree       11bf5f426509e288b2b3482492c805a26bb1885a /linux
parent     bb6eccc2ecd4728871bfc70462d3a4a20daa9d68 (diff)
Update bcachefs sources to 18686af684 bcachefs: Inode backpointers (v0.13)
Diffstat (limited to 'linux')
-rw-r--r--  linux/rhashtable.c | 1081
-rw-r--r--  linux/six.c        |   35
2 files changed, 930 insertions, 186 deletions
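The bulk of the rhashtable.c change below tracks the upstream kernel rewrite of the resizable hash table: the old per-table array of spinlocks (alloc_bucket_locks(), locks_mul) is removed, each bucket is instead locked by a bit spinlock kept in bit 0 of its own head pointer (rht_lock()/rht_assign_unlock()), and page-sized nested bucket tables provide a fallback so large tables can still be allocated when kvzalloc() fails, e.g. under GFP_ATOMIC during an insert-triggered rehash. The standalone C sketch below models only the bit-in-pointer locking idea; the names bucket_lock(), bucket_unlock_assign() and bucket_head() are illustrative and are not the kernel helpers, which additionally handle RCU, NULLS markers and lockdep.

/*
 * Simplified model of a bucket head whose least-significant bit doubles
 * as that bucket's spinlock, so no separate lock array is needed.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct node {
	struct node *next;
	int key;
};

typedef _Atomic uintptr_t bucket_t;

#define BUCKET_LOCK_BIT ((uintptr_t)1)

static void bucket_lock(bucket_t *b)
{
	uintptr_t old;

	for (;;) {
		/* Only succeed if the lock bit is currently clear. */
		old = atomic_load_explicit(b, memory_order_relaxed) & ~BUCKET_LOCK_BIT;
		if (atomic_compare_exchange_weak_explicit(b, &old,
							  old | BUCKET_LOCK_BIT,
							  memory_order_acquire,
							  memory_order_relaxed))
			return;
		/* The kernel version spins with bit_spin_lock()/cpu_relax(). */
	}
}

static void bucket_unlock_assign(bucket_t *b, struct node *new_head)
{
	/* Publishing the new head and dropping the lock is one release store,
	 * roughly what rht_assign_unlock() does in the real code. */
	atomic_store_explicit(b, (uintptr_t)new_head, memory_order_release);
}

static struct node *bucket_head(bucket_t *b)
{
	/* Strip the lock bit before treating the value as a pointer. */
	return (struct node *)(atomic_load_explicit(b, memory_order_acquire) &
			       ~BUCKET_LOCK_BIT);
}

int main(void)
{
	static struct node n1 = { .key = 1 }, n2 = { .key = 2 };
	bucket_t bucket = 0;

	bucket_lock(&bucket);
	n1.next = bucket_head(&bucket);
	bucket_unlock_assign(&bucket, &n1);

	bucket_lock(&bucket);
	n2.next = bucket_head(&bucket);
	bucket_unlock_assign(&bucket, &n2);

	for (struct node *n = bucket_head(&bucket); n; n = n->next)
		printf("key %d\n", n->key);
	return 0;
}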
diff --git a/linux/rhashtable.c b/linux/rhashtable.c index 351eac79..ba2196fc 100644 --- a/linux/rhashtable.c +++ b/linux/rhashtable.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Resizable, Scalable, Concurrent Hash Table * @@ -8,27 +9,29 @@ * Code partially derived from nft_hash * Rewritten with rehash code from br_multicast plus single list * pointer as suggested by Josh Triplett - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/atomic.h> -#include <linux/cpumask.h> #include <linux/kernel.h> #include <linux/log2.h> #include <linux/sched.h> +#include <linux/rculist.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/jhash.h> +#include <linux/overflow.h> #include <linux/random.h> #include <linux/rhashtable.h> #include <linux/err.h> +#include <linux/export.h> #define HASH_DEFAULT_SIZE 64UL #define HASH_MIN_SIZE 4U -#define BUCKET_LOCKS_PER_CPU 32UL + +union nested_table { + union nested_table __rcu *table; + struct rhash_lock_head __rcu *bucket; +}; static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, @@ -37,40 +40,75 @@ static u32 head_hashfn(struct rhashtable *ht, return rht_head_hashfn(ht, tbl, he, ht->p); } -static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, - gfp_t gfp) -{ - unsigned int i, size; - unsigned int nr_pcpus = num_possible_cpus(); +#ifdef CONFIG_PROVE_LOCKING +#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) - nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL); - size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul); +int lockdep_rht_mutex_is_held(struct rhashtable *ht) +{ + return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1; +} +EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); - /* Never allocate more than 0.5 locks per bucket */ - size = min_t(unsigned int, size, tbl->size >> 1); +int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) +{ + if (!debug_locks) + return 1; + if (unlikely(tbl->nest)) + return 1; + return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]); +} +EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); +#else +#define ASSERT_RHT_MUTEX(HT) +#endif - if (sizeof(spinlock_t) != 0) { - tbl->locks = NULL; - if (gfp != GFP_KERNEL) - gfp |= __GFP_NOWARN | __GFP_NORETRY; +static inline union nested_table *nested_table_top( + const struct bucket_table *tbl) +{ + /* The top-level bucket entry does not need RCU protection + * because it's set at the same time as tbl->nest. 
+ */ + return (void *)rcu_dereference_protected(tbl->buckets[0], 1); +} - if (!tbl->locks) - tbl->locks = kmalloc_array(size, sizeof(spinlock_t), - gfp); - if (!tbl->locks) - return -ENOMEM; - for (i = 0; i < size; i++) - spin_lock_init(&tbl->locks[i]); +static void nested_table_free(union nested_table *ntbl, unsigned int size) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + const unsigned int len = 1 << shift; + unsigned int i; + + ntbl = rcu_dereference_protected(ntbl->table, 1); + if (!ntbl) + return; + + if (size > len) { + size >>= shift; + for (i = 0; i < len; i++) + nested_table_free(ntbl + i, size); } - tbl->locks_mask = size - 1; - return 0; + kfree(ntbl); +} + +static void nested_bucket_table_free(const struct bucket_table *tbl) +{ + unsigned int size = tbl->size >> tbl->nest; + unsigned int len = 1 << tbl->nest; + union nested_table *ntbl; + unsigned int i; + + ntbl = nested_table_top(tbl); + + for (i = 0; i < len; i++) + nested_table_free(ntbl + i, size); + + kfree(ntbl); } static void bucket_table_free(struct bucket_table *tbl) { - if (tbl) - kvfree(tbl->locks); + if (tbl->nest) + nested_bucket_table_free(tbl); kvfree(tbl); } @@ -80,6 +118,59 @@ static void bucket_table_free_rcu(struct rcu_head *head) bucket_table_free(container_of(head, struct bucket_table, rcu)); } +static union nested_table *nested_table_alloc(struct rhashtable *ht, + union nested_table __rcu **prev, + bool leaf) +{ + union nested_table *ntbl; + int i; + + ntbl = rcu_dereference(*prev); + if (ntbl) + return ntbl; + + ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC); + + if (ntbl && leaf) { + for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++) + INIT_RHT_NULLS_HEAD(ntbl[i].bucket); + } + + if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL) + return ntbl; + /* Raced with another thread. 
*/ + kfree(ntbl); + return rcu_dereference(*prev); +} + +static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, + size_t nbuckets, + gfp_t gfp) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + struct bucket_table *tbl; + size_t size; + + if (nbuckets < (1 << (shift + 1))) + return NULL; + + size = sizeof(*tbl) + sizeof(tbl->buckets[0]); + + tbl = kzalloc(size, gfp); + if (!tbl) + return NULL; + + if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets, + false)) { + kfree(tbl); + return NULL; + } + + tbl->nest = (ilog2(nbuckets) - 1) % shift + 1; + + return tbl; +} + static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size_t nbuckets, gfp_t gfp) @@ -88,28 +179,27 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size_t size; int i; - size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) || - gfp != GFP_KERNEL) - tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); - if (tbl == NULL && gfp == GFP_KERNEL) - tbl = vzalloc(size); - if (tbl == NULL) - return NULL; + tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp); - tbl->size = nbuckets; + size = nbuckets; - if (alloc_bucket_locks(ht, tbl, gfp) < 0) { - bucket_table_free(tbl); - return NULL; + if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) { + tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); + nbuckets = 0; } + if (tbl == NULL) + return NULL; + + tbl->size = size; + + rcu_head_init(&tbl->rcu); INIT_LIST_HEAD(&tbl->walkers); - get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); + tbl->hash_rnd = get_random_u32(); for (i = 0; i < nbuckets; i++) - INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i); + INIT_RHT_NULLS_HEAD(tbl->buckets[i]); return tbl; } @@ -127,18 +217,24 @@ static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, return new_tbl; } -static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash) +static int rhashtable_rehash_one(struct rhashtable *ht, + struct rhash_lock_head __rcu **bkt, + unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); - struct bucket_table *new_tbl = rhashtable_last_table(ht, - rht_dereference_rcu(old_tbl->future_tbl, ht)); - struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash]; - int err = -ENOENT; + struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl); + int err = -EAGAIN; struct rhash_head *head, *next, *entry; - spinlock_t *new_bucket_lock; + struct rhash_head __rcu **pprev = NULL; unsigned int new_hash; - rht_for_each(entry, old_tbl, old_hash) { + if (new_tbl->nest) + goto out; + + err = -ENOENT; + + rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash), + old_tbl, old_hash) { err = 0; next = rht_dereference_bucket(entry->next, old_tbl, old_hash); @@ -153,57 +249,58 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash) new_hash = head_hashfn(ht, new_tbl, entry); - new_bucket_lock = rht_bucket_lock(new_tbl, new_hash); + rht_lock(new_tbl, &new_tbl->buckets[new_hash]); - spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING); - head = rht_dereference_bucket(new_tbl->buckets[new_hash], - new_tbl, new_hash); + head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash); RCU_INIT_POINTER(entry->next, head); - rcu_assign_pointer(new_tbl->buckets[new_hash], entry); - spin_unlock(new_bucket_lock); + rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry); - rcu_assign_pointer(*pprev, next); + if (pprev) + 
rcu_assign_pointer(*pprev, next); + else + /* Need to preserved the bit lock. */ + rht_assign_locked(bkt, next); out: return err; } -static void rhashtable_rehash_chain(struct rhashtable *ht, +static int rhashtable_rehash_chain(struct rhashtable *ht, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); - spinlock_t *old_bucket_lock; + struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash); + int err; - old_bucket_lock = rht_bucket_lock(old_tbl, old_hash); + if (!bkt) + return 0; + rht_lock(old_tbl, bkt); - spin_lock_bh(old_bucket_lock); - while (!rhashtable_rehash_one(ht, old_hash)) + while (!(err = rhashtable_rehash_one(ht, bkt, old_hash))) ; - old_tbl->rehash++; - spin_unlock_bh(old_bucket_lock); + + if (err == -ENOENT) + err = 0; + rht_unlock(old_tbl, bkt); + + return err; } static int rhashtable_rehash_attach(struct rhashtable *ht, struct bucket_table *old_tbl, struct bucket_table *new_tbl) { - /* Protect future_tbl using the first bucket lock. */ - spin_lock_bh(old_tbl->locks); - - /* Did somebody beat us to it? */ - if (rcu_access_pointer(old_tbl->future_tbl)) { - spin_unlock_bh(old_tbl->locks); - return -EEXIST; - } - /* Make insertions go into the new, empty table right away. Deletions * and lookups will be attempted in both tables until we synchronize. + * As cmpxchg() provides strong barriers, we do not need + * rcu_assign_pointer(). */ - rcu_assign_pointer(old_tbl->future_tbl, new_tbl); - spin_unlock_bh(old_tbl->locks); + if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL, + new_tbl) != NULL) + return -EEXIST; return 0; } @@ -214,13 +311,18 @@ static int rhashtable_rehash_table(struct rhashtable *ht) struct bucket_table *new_tbl; struct rhashtable_walker *walker; unsigned int old_hash; + int err; new_tbl = rht_dereference(old_tbl->future_tbl, ht); if (!new_tbl) return 0; - for (old_hash = 0; old_hash < old_tbl->size; old_hash++) - rhashtable_rehash_chain(ht, old_hash); + for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { + err = rhashtable_rehash_chain(ht, old_hash); + if (err) + return err; + cond_resched(); + } /* Publish the new table pointer. */ rcu_assign_pointer(ht->tbl, new_tbl); @@ -228,25 +330,30 @@ static int rhashtable_rehash_table(struct rhashtable *ht) spin_lock(&ht->lock); list_for_each_entry(walker, &old_tbl->walkers, list) walker->tbl = NULL; - spin_unlock(&ht->lock); /* Wait for readers. All new readers will see the new * table, and thus no references to the old table will * remain. + * We do this inside the locked region so that + * rhashtable_walk_stop() can use rcu_head_after_call_rcu() + * to check if it should not re-link the table. */ call_rcu(&old_tbl->rcu, bucket_table_free_rcu); + spin_unlock(&ht->lock); return rht_dereference(new_tbl->future_tbl, ht) ? 
-EAGAIN : 0; } -static int rhashtable_expand(struct rhashtable *ht) +static int rhashtable_rehash_alloc(struct rhashtable *ht, + struct bucket_table *old_tbl, + unsigned int size) { - struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl; int err; - old_tbl = rhashtable_last_table(ht, old_tbl); + ASSERT_RHT_MUTEX(ht); - new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL); + new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (new_tbl == NULL) return -ENOMEM; @@ -257,12 +364,27 @@ static int rhashtable_expand(struct rhashtable *ht) return err; } +/** + * rhashtable_shrink - Shrink hash table while allowing concurrent lookups + * @ht: the hash table to shrink + * + * This function shrinks the hash table to fit, i.e., the smallest + * size would not cause it to expand right away automatically. + * + * The caller must ensure that no concurrent resizing occurs by holding + * ht->mutex. + * + * The caller must ensure that no concurrent table mutations take place. + * It is however valid to have concurrent lookups if they are RCU protected. + * + * It is valid to have concurrent insertions and deletions protected by per + * bucket locks or concurrent RCU protected lookups and traversals. + */ static int rhashtable_shrink(struct rhashtable *ht) { - struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); unsigned int nelems = atomic_read(&ht->nelems); unsigned int size = 0; - int err; if (nelems) size = roundup_pow_of_two(nelems * 3 / 2); @@ -275,15 +397,7 @@ static int rhashtable_shrink(struct rhashtable *ht) if (rht_dereference(old_tbl->future_tbl, ht)) return -EEXIST; - new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); - if (new_tbl == NULL) - return -ENOMEM; - - err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); - if (err) - bucket_table_free(new_tbl); - - return err; + return rhashtable_rehash_alloc(ht, old_tbl, size); } static void rht_deferred_worker(struct work_struct *work) @@ -299,11 +413,18 @@ static void rht_deferred_worker(struct work_struct *work) tbl = rhashtable_last_table(ht, tbl); if (rht_grow_above_75(ht, tbl)) - rhashtable_expand(ht); + err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2); else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) - rhashtable_shrink(ht); + err = rhashtable_shrink(ht); + else if (tbl->nest) + err = rhashtable_rehash_alloc(ht, tbl, tbl->size); + + if (!err || err == -EEXIST) { + int nerr; - err = rhashtable_rehash_table(ht); + nerr = rhashtable_rehash_table(ht); + err = err ?: nerr; + } mutex_unlock(&ht->mutex); @@ -311,22 +432,8 @@ static void rht_deferred_worker(struct work_struct *work) schedule_work(&ht->run_work); } -static bool rhashtable_check_elasticity(struct rhashtable *ht, - struct bucket_table *tbl, - unsigned int hash) -{ - unsigned int elasticity = ht->elasticity; - struct rhash_head *head; - - rht_for_each(head, tbl, hash) - if (!--elasticity) - return true; - - return false; -} - -int rhashtable_insert_rehash(struct rhashtable *ht, - struct bucket_table *tbl) +static int rhashtable_insert_rehash(struct rhashtable *ht, + struct bucket_table *tbl) { struct bucket_table *old_tbl; struct bucket_table *new_tbl; @@ -347,7 +454,7 @@ int rhashtable_insert_rehash(struct rhashtable *ht, err = -ENOMEM; - new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC); + new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN); if (new_tbl == NULL) goto fail; @@ -363,7 +470,7 @@ int 
rhashtable_insert_rehash(struct rhashtable *ht, fail: /* Do not fail the insert if someone else did a rehash. */ - if (likely(rcu_dereference_raw(tbl->future_tbl))) + if (likely(rcu_access_pointer(tbl->future_tbl))) return 0; /* Schedule async rehash to retry allocation in process context. */ @@ -373,57 +480,485 @@ fail: return err; } -struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, - const void *key, - struct rhash_head *obj, - struct bucket_table *tbl) +static void *rhashtable_lookup_one(struct rhashtable *ht, + struct rhash_lock_head __rcu **bkt, + struct bucket_table *tbl, unsigned int hash, + const void *key, struct rhash_head *obj) { + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct rhash_head __rcu **pprev = NULL; struct rhash_head *head; - unsigned int hash; - int err; + int elasticity; + + elasticity = RHT_ELASTICITY; + rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { + struct rhlist_head *list; + struct rhlist_head *plist; + + elasticity--; + if (!key || + (ht->p.obj_cmpfn ? + ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) { + pprev = &head->next; + continue; + } - tbl = rhashtable_last_table(ht, tbl); - hash = head_hashfn(ht, tbl, obj); - spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); + if (!ht->rhlist) + return rht_obj(ht, head); - err = -EEXIST; - if (key && rhashtable_lookup_fast(ht, key, ht->p)) - goto exit; + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); - err = -E2BIG; - if (unlikely(rht_grow_above_max(ht, tbl))) - goto exit; + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + if (pprev) + rcu_assign_pointer(*pprev, obj); + else + /* Need to preserve the bit lock */ + rht_assign_locked(bkt, obj); + + return NULL; + } + + if (elasticity <= 0) + return ERR_PTR(-EAGAIN); + + return ERR_PTR(-ENOENT); +} + +static struct bucket_table *rhashtable_insert_one( + struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, + struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj, + void *data) +{ + struct bucket_table *new_tbl; + struct rhash_head *head; + + if (!IS_ERR_OR_NULL(data)) + return ERR_PTR(-EEXIST); + + if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (new_tbl) + return new_tbl; + + if (PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); - err = -EAGAIN; - if (rhashtable_check_elasticity(ht, tbl, hash) || - rht_grow_above_100(ht, tbl)) - goto exit; + if (unlikely(rht_grow_above_max(ht, tbl))) + return ERR_PTR(-E2BIG); - err = 0; + if (unlikely(rht_grow_above_100(ht, tbl))) + return ERR_PTR(-EAGAIN); - head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); + head = rht_ptr(bkt, tbl, hash); RCU_INIT_POINTER(obj->next, head); + if (ht->rhlist) { + struct rhlist_head *list; - rcu_assign_pointer(tbl->buckets[hash], obj); + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } + + /* bkt is always the head of the list, so it holds + * the lock, which we need to preserve + */ + rht_assign_locked(bkt, obj); atomic_inc(&ht->nelems); + if (rht_grow_above_75(ht, tbl)) + schedule_work(&ht->run_work); + + return NULL; +} + +static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + struct bucket_table *new_tbl; 
+ struct bucket_table *tbl; + struct rhash_lock_head __rcu **bkt; + unsigned int hash; + void *data; + + new_tbl = rcu_dereference(ht->tbl); + + do { + tbl = new_tbl; + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + if (rcu_access_pointer(tbl->future_tbl)) + /* Failure is OK */ + bkt = rht_bucket_var(tbl, hash); + else + bkt = rht_bucket_insert(ht, tbl, hash); + if (bkt == NULL) { + new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); + data = ERR_PTR(-EAGAIN); + } else { + rht_lock(tbl, bkt); + data = rhashtable_lookup_one(ht, bkt, tbl, + hash, key, obj); + new_tbl = rhashtable_insert_one(ht, bkt, tbl, + hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + rht_unlock(tbl, bkt); + } + } while (!IS_ERR_OR_NULL(new_tbl)); + + if (PTR_ERR(data) == -EAGAIN) + data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: + -EAGAIN); + + return data; +} + +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + void *data; + + do { + rcu_read_lock(); + data = rhashtable_try_insert(ht, key, obj); + rcu_read_unlock(); + } while (PTR_ERR(data) == -EAGAIN); -exit: - spin_unlock(rht_bucket_lock(tbl, hash)); + return data; +} +EXPORT_SYMBOL_GPL(rhashtable_insert_slow); - if (err == 0) +/** + * rhashtable_walk_enter - Initialise an iterator + * @ht: Table to walk over + * @iter: Hash table Iterator + * + * This function prepares a hash table walk. + * + * Note that if you restart a walk after rhashtable_walk_stop you + * may see the same object twice. Also, you may miss objects if + * there are removals in between rhashtable_walk_stop and the next + * call to rhashtable_walk_start. + * + * For a completely stable walk you should construct your own data + * structure outside the hash table. + * + * This function may be called from any process context, including + * non-preemptable context, but cannot be called from softirq or + * hardirq context. + * + * You must call rhashtable_walk_exit after this function returns. + */ +void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter) +{ + iter->ht = ht; + iter->p = NULL; + iter->slot = 0; + iter->skip = 0; + iter->end_of_table = 0; + + spin_lock(&ht->lock); + iter->walker.tbl = + rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); + list_add(&iter->walker.list, &iter->walker.tbl->walkers); + spin_unlock(&ht->lock); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_enter); + +/** + * rhashtable_walk_exit - Free an iterator + * @iter: Hash table Iterator + * + * This function frees resources allocated by rhashtable_walk_enter. + */ +void rhashtable_walk_exit(struct rhashtable_iter *iter) +{ + spin_lock(&iter->ht->lock); + if (iter->walker.tbl) + list_del(&iter->walker.list); + spin_unlock(&iter->ht->lock); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_exit); + +/** + * rhashtable_walk_start_check - Start a hash table walk + * @iter: Hash table iterator + * + * Start a hash table walk at the current iterator position. Note that we take + * the RCU lock in all cases including when we return an error. So you must + * always call rhashtable_walk_stop to clean up. + * + * Returns zero if successful. + * + * Returns -EAGAIN if resize event occured. Note that the iterator + * will rewind back to the beginning and you may use it immediately + * by calling rhashtable_walk_next. + * + * rhashtable_walk_start is defined as an inline variant that returns + * void. This is preferred in cases where the caller would ignore + * resize events and always continue. 
+ */ +int rhashtable_walk_start_check(struct rhashtable_iter *iter) + __acquires(RCU) +{ + struct rhashtable *ht = iter->ht; + bool rhlist = ht->rhlist; + + rcu_read_lock(); + + spin_lock(&ht->lock); + if (iter->walker.tbl) + list_del(&iter->walker.list); + spin_unlock(&ht->lock); + + if (iter->end_of_table) + return 0; + if (!iter->walker.tbl) { + iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht); + iter->slot = 0; + iter->skip = 0; + return -EAGAIN; + } + + if (iter->p && !rhlist) { + /* + * We need to validate that 'p' is still in the table, and + * if so, update 'skip' + */ + struct rhash_head *p; + int skip = 0; + rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { + skip++; + if (p == iter->p) { + iter->skip = skip; + goto found; + } + } + iter->p = NULL; + } else if (iter->p && rhlist) { + /* Need to validate that 'list' is still in the table, and + * if so, update 'skip' and 'p'. + */ + struct rhash_head *p; + struct rhlist_head *list; + int skip = 0; + rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { + for (list = container_of(p, struct rhlist_head, rhead); + list; + list = rcu_dereference(list->next)) { + skip++; + if (list == iter->list) { + iter->p = p; + iter->skip = skip; + goto found; + } + } + } + iter->p = NULL; + } +found: + return 0; +} +EXPORT_SYMBOL_GPL(rhashtable_walk_start_check); + +/** + * __rhashtable_walk_find_next - Find the next element in a table (or the first + * one in case of a new walk). + * + * @iter: Hash table iterator + * + * Returns the found object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. + */ +static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter) +{ + struct bucket_table *tbl = iter->walker.tbl; + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; + + if (!tbl) return NULL; - else if (err == -EAGAIN) - return tbl; + + for (; iter->slot < tbl->size; iter->slot++) { + int skip = iter->skip; + + rht_for_each_rcu(p, tbl, iter->slot) { + if (rhlist) { + list = container_of(p, struct rhlist_head, + rhead); + do { + if (!skip) + goto next; + skip--; + list = rcu_dereference(list->next); + } while (list); + + continue; + } + if (!skip) + break; + skip--; + } + +next: + if (!rht_is_a_nulls(p)) { + iter->skip++; + iter->p = p; + iter->list = list; + return rht_obj(ht, rhlist ? &list->rhead : p); + } + + iter->skip = 0; + } + + iter->p = NULL; + + /* Ensure we see any new tables. */ + smp_rmb(); + + iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (iter->walker.tbl) { + iter->slot = 0; + iter->skip = 0; + return ERR_PTR(-EAGAIN); + } else { + iter->end_of_table = true; + } + + return NULL; +} + +/** + * rhashtable_walk_next - Return the next object and advance the iterator + * @iter: Hash table iterator + * + * Note that you must call rhashtable_walk_stop when you are finished + * with the walk. + * + * Returns the next object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. Note that the iterator + * will rewind back to the beginning and you may continue to use it. 
+ */ +void *rhashtable_walk_next(struct rhashtable_iter *iter) +{ + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; + + if (p) { + if (!rhlist || !(list = rcu_dereference(list->next))) { + p = rcu_dereference(p->next); + list = container_of(p, struct rhlist_head, rhead); + } + if (!rht_is_a_nulls(p)) { + iter->skip++; + iter->p = p; + iter->list = list; + return rht_obj(ht, rhlist ? &list->rhead : p); + } + + /* At the end of this slot, switch to next one and then find + * next entry from that point. + */ + iter->skip = 0; + iter->slot++; + } + + return __rhashtable_walk_find_next(iter); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_next); + +/** + * rhashtable_walk_peek - Return the next object but don't advance the iterator + * @iter: Hash table iterator + * + * Returns the next object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. Note that the iterator + * will rewind back to the beginning and you may continue to use it. + */ +void *rhashtable_walk_peek(struct rhashtable_iter *iter) +{ + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + + if (p) + return rht_obj(ht, ht->rhlist ? &list->rhead : p); + + /* No object found in current iter, find next one in the table. */ + + if (iter->skip) { + /* A nonzero skip value points to the next entry in the table + * beyond that last one that was found. Decrement skip so + * we find the current value. __rhashtable_walk_find_next + * will restore the original value of skip assuming that + * the table hasn't changed. + */ + iter->skip--; + } + + return __rhashtable_walk_find_next(iter); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_peek); + +/** + * rhashtable_walk_stop - Finish a hash table walk + * @iter: Hash table iterator + * + * Finish a hash table walk. Does not reset the iterator to the start of the + * hash table. + */ +void rhashtable_walk_stop(struct rhashtable_iter *iter) + __releases(RCU) +{ + struct rhashtable *ht; + struct bucket_table *tbl = iter->walker.tbl; + + if (!tbl) + goto out; + + ht = iter->ht; + + spin_lock(&ht->lock); + if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu)) + /* This bucket table is being freed, don't re-link it. */ + iter->walker.tbl = NULL; else - return ERR_PTR(err); + list_add(&iter->walker.list, &tbl->walkers); + spin_unlock(&ht->lock); + +out: + rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(rhashtable_walk_stop); static size_t rounded_hashtable_size(const struct rhashtable_params *params) { - return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), - (unsigned long)params->min_size); + size_t retsize; + + if (params->nelem_hint) + retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3), + (unsigned long)params->min_size); + else + retsize = max(HASH_DEFAULT_SIZE, + (unsigned long)params->min_size); + + return retsize; } static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) @@ -431,21 +966,58 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) return jhash2(key, length, seed); } +/** + * rhashtable_init - initialize a new hash table + * @ht: hash table to be initialized + * @params: configuration parameters + * + * Initializes a new hash table based on the provided configuration + * parameters. 
A table can be configured either with a variable or + * fixed length key: + * + * Configuration Example 1: Fixed length keys + * struct test_obj { + * int key; + * void * my_member; + * struct rhash_head node; + * }; + * + * struct rhashtable_params params = { + * .head_offset = offsetof(struct test_obj, node), + * .key_offset = offsetof(struct test_obj, key), + * .key_len = sizeof(int), + * .hashfn = jhash, + * }; + * + * Configuration Example 2: Variable length keys + * struct test_obj { + * [...] + * struct rhash_head node; + * }; + * + * u32 my_hash_fn(const void *data, u32 len, u32 seed) + * { + * struct test_obj *obj = data; + * + * return [... hash ...]; + * } + * + * struct rhashtable_params params = { + * .head_offset = offsetof(struct test_obj, node), + * .hashfn = jhash, + * .obj_hashfn = my_hash_fn, + * }; + */ int rhashtable_init(struct rhashtable *ht, const struct rhashtable_params *params) { struct bucket_table *tbl; size_t size; - size = HASH_DEFAULT_SIZE; - if ((!params->key_len && !params->obj_hashfn) || (params->obj_hashfn && !params->obj_cmpfn)) return -EINVAL; - if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT)) - return -EINVAL; - memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); spin_lock_init(&ht->lock); @@ -454,39 +1026,18 @@ int rhashtable_init(struct rhashtable *ht, if (params->min_size) ht->p.min_size = roundup_pow_of_two(params->min_size); - if (params->max_size) - ht->p.max_size = rounddown_pow_of_two(params->max_size); + /* Cap total entries at 2^31 to avoid nelems overflow. */ + ht->max_elems = 1u << 31; - if (params->insecure_max_entries) - ht->p.insecure_max_entries = - rounddown_pow_of_two(params->insecure_max_entries); - else - ht->p.insecure_max_entries = ht->p.max_size * 2; - - ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE); + if (params->max_size) { + ht->p.max_size = rounddown_pow_of_two(params->max_size); + if (ht->p.max_size < ht->max_elems / 2) + ht->max_elems = ht->p.max_size * 2; + } - if (params->nelem_hint) - size = rounded_hashtable_size(&ht->p); - - /* The maximum (not average) chain length grows with the - * size of the hash table, at a rate of (log N)/(log log N). - * The value of 16 is selected so that even if the hash - * table grew to 2^32 you would not expect the maximum - * chain length to exceed it unless we are under attack - * (or extremely unlucky). - * - * As this limit is only to detect attacks, we don't need - * to set it to a lower value as you'd need the chain - * length to vastly exceed 16 to have any real effect - * on the system. - */ - if (!params->insecure_elasticity) - ht->elasticity = 16; + ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); - if (params->locks_mul) - ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); - else - ht->p.locks_mul = BUCKET_LOCKS_PER_CPU; + size = rounded_hashtable_size(&ht->p); ht->key_len = ht->p.key_len; if (!params->hashfn) { @@ -498,9 +1049,16 @@ int rhashtable_init(struct rhashtable *ht, } } + /* + * This is api initialization and thus we need to guarantee the + * initial rhashtable allocation. Upon failure, retry with the + * smallest possible size with __GFP_NOFAIL semantics. 
+ */ tbl = bucket_table_alloc(ht, size, GFP_KERNEL); - if (tbl == NULL) - return -ENOMEM; + if (unlikely(tbl == NULL)) { + size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); + tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL); + } atomic_set(&ht->nelems, 0); @@ -510,15 +1068,170 @@ int rhashtable_init(struct rhashtable *ht, return 0; } +EXPORT_SYMBOL_GPL(rhashtable_init); -void rhashtable_destroy(struct rhashtable *ht) +/** + * rhltable_init - initialize a new hash list table + * @hlt: hash list table to be initialized + * @params: configuration parameters + * + * Initializes a new hash list table. + * + * See documentation for rhashtable_init. + */ +int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params) { - struct bucket_table *tbl; + int err; + + err = rhashtable_init(&hlt->ht, params); + hlt->ht.rhlist = true; + return err; +} +EXPORT_SYMBOL_GPL(rhltable_init); + +static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, + void (*free_fn)(void *ptr, void *arg), + void *arg) +{ + struct rhlist_head *list; + + if (!ht->rhlist) { + free_fn(rht_obj(ht, obj), arg); + return; + } + + list = container_of(obj, struct rhlist_head, rhead); + do { + obj = &list->rhead; + list = rht_dereference(list->next, ht); + free_fn(rht_obj(ht, obj), arg); + } while (list); +} + +/** + * rhashtable_free_and_destroy - free elements and destroy hash table + * @ht: the hash table to destroy + * @free_fn: callback to release resources of element + * @arg: pointer passed to free_fn + * + * Stops an eventual async resize. If defined, invokes free_fn for each + * element to releasal resources. Please note that RCU protected + * readers may still be accessing the elements. Releasing of resources + * must occur in a compatible manner. Then frees the bucket array. + * + * This function will eventually sleep to wait for an async resize + * to complete. The caller is responsible that no further write operations + * occurs in parallel. + */ +void rhashtable_free_and_destroy(struct rhashtable *ht, + void (*free_fn)(void *ptr, void *arg), + void *arg) +{ + struct bucket_table *tbl, *next_tbl; + unsigned int i; cancel_work_sync(&ht->run_work); mutex_lock(&ht->mutex); tbl = rht_dereference(ht->tbl, ht); +restart: + if (free_fn) { + for (i = 0; i < tbl->size; i++) { + struct rhash_head *pos, *next; + + cond_resched(); + for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)), + next = !rht_is_a_nulls(pos) ? + rht_dereference(pos->next, ht) : NULL; + !rht_is_a_nulls(pos); + pos = next, + next = !rht_is_a_nulls(pos) ? 
+ rht_dereference(pos->next, ht) : NULL) + rhashtable_free_one(ht, pos, free_fn, arg); + } + } + + next_tbl = rht_dereference(tbl->future_tbl, ht); bucket_table_free(tbl); + if (next_tbl) { + tbl = next_tbl; + goto restart; + } mutex_unlock(&ht->mutex); } +EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy); + +void rhashtable_destroy(struct rhashtable *ht) +{ + return rhashtable_free_and_destroy(ht, NULL, NULL); +} +EXPORT_SYMBOL_GPL(rhashtable_destroy); + +struct rhash_lock_head __rcu **__rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + unsigned int subhash = hash; + union nested_table *ntbl; + + ntbl = nested_table_top(tbl); + ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); + subhash >>= tbl->nest; + + while (ntbl && size > (1 << shift)) { + index = subhash & ((1 << shift) - 1); + ntbl = rht_dereference_bucket_rcu(ntbl[index].table, + tbl, hash); + size >>= shift; + subhash >>= shift; + } + + if (!ntbl) + return NULL; + + return &ntbl[subhash].bucket; + +} +EXPORT_SYMBOL_GPL(__rht_bucket_nested); + +struct rhash_lock_head __rcu **rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash) +{ + static struct rhash_lock_head __rcu *rhnull; + + if (!rhnull) + INIT_RHT_NULLS_HEAD(rhnull); + return __rht_bucket_nested(tbl, hash) ?: &rhnull; +} +EXPORT_SYMBOL_GPL(rht_bucket_nested); + +struct rhash_lock_head __rcu **rht_bucket_nested_insert( + struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + union nested_table *ntbl; + + ntbl = nested_table_top(tbl); + hash >>= tbl->nest; + ntbl = nested_table_alloc(ht, &ntbl[index].table, + size <= (1 << shift)); + + while (ntbl && size > (1 << shift)) { + index = hash & ((1 << shift) - 1); + size >>= shift; + hash >>= shift; + ntbl = nested_table_alloc(ht, &ntbl[index].table, + size <= (1 << shift)); + } + + if (!ntbl) + return NULL; + + return &ntbl[hash].bucket; + +} +EXPORT_SYMBOL_GPL(rht_bucket_nested_insert); diff --git a/linux/six.c b/linux/six.c index 53280044..fe721891 100644 --- a/linux/six.c +++ b/linux/six.c @@ -8,6 +8,7 @@ #include <linux/sched.h> #include <linux/sched/rt.h> #include <linux/six.h> +#include <linux/slab.h> #ifdef DEBUG #define EBUG_ON(cond) BUG_ON(cond) @@ -309,6 +310,9 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, wake_up_process(p); } + if (ret) + six_acquire(&lock->dep_map, 1); + return ret; } @@ -560,6 +564,7 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) lock->readers) { smp_mb(); /* unlock barrier */ this_cpu_dec(*lock->readers); + smp_mb(); /* between unlocking and checking for waiters */ state.v = READ_ONCE(lock->state.v); } else { EBUG_ON(!(lock->state.v & l[type].held_mask)); @@ -705,6 +710,34 @@ void six_lock_wakeup_all(struct six_lock *lock) } EXPORT_SYMBOL_GPL(six_lock_wakeup_all); +struct free_pcpu_rcu { + struct rcu_head rcu; + void __percpu *p; +}; + +static void free_pcpu_rcu_fn(struct rcu_head *_rcu) +{ + struct free_pcpu_rcu *rcu = + container_of(_rcu, struct free_pcpu_rcu, rcu); + + free_percpu(rcu->p); + kfree(rcu); +} + +void six_lock_pcpu_free_rcu(struct six_lock *lock) +{ + struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL); + + 
if (!rcu) + return; + + rcu->p = lock->readers; + lock->readers = NULL; + + call_rcu(&rcu->rcu, free_pcpu_rcu_fn); +} +EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu); + void six_lock_pcpu_free(struct six_lock *lock) { BUG_ON(lock->readers && pcpu_read_count(lock)); @@ -717,8 +750,6 @@ EXPORT_SYMBOL_GPL(six_lock_pcpu_free); void six_lock_pcpu_alloc(struct six_lock *lock) { - BUG_ON(lock->readers && pcpu_read_count(lock)); - BUG_ON(lock->state.read_lock); #ifdef __KERNEL__ if (!lock->readers) lock->readers = alloc_percpu(unsigned); |
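In six.c, besides the new six_lock_pcpu_free_rcu() (which defers the free_percpu() of the reader counters past an RCU grace period) and the lockdep acquire added to __six_relock_type(), the new smp_mb() in __six_unlock_type() orders the per-cpu reader decrement against the subsequent re-read of lock->state, so a writer that concurrently sets a waiter bit cannot be missed. The C11 sketch below models that store->load requirement with plain atomics; "readers" and "waiters" are simplified stand-ins, not the actual six lock fields.

/*
 * Two-thread model of the ordering the added smp_mb() provides: each side
 * does a store, a full barrier, then a load of the other side's variable,
 * so at least one of them observes the other.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <threads.h>

static atomic_int readers;
static atomic_int waiters;

static int reader_unlock(void *arg)
{
	(void)arg;
	/* Drop our read lock ... */
	atomic_fetch_sub_explicit(&readers, 1, memory_order_relaxed);
	/* ... full barrier (the added smp_mb()) ... */
	atomic_thread_fence(memory_order_seq_cst);
	/* ... then check for a writer that queued up concurrently. */
	if (atomic_load_explicit(&waiters, memory_order_relaxed))
		printf("reader: saw waiter, would wake it\n");
	return 0;
}

static int writer_trylock(void *arg)
{
	(void)arg;
	/* Announce ourselves as waiting ... */
	atomic_store_explicit(&waiters, 1, memory_order_relaxed);
	/* ... full barrier ... */
	atomic_thread_fence(memory_order_seq_cst);
	/* ... then check whether any readers remain. */
	if (!atomic_load_explicit(&readers, memory_order_relaxed))
		printf("writer: no readers, can take the lock\n");
	return 0;
}

int main(void)
{
	thrd_t r, w;

	atomic_store(&readers, 1);	/* one read lock held */

	thrd_create(&r, reader_unlock, NULL);
	thrd_create(&w, writer_trylock, NULL);
	thrd_join(r, NULL);
	thrd_join(w, NULL);

	/* Without the reader-side fence, both loads could miss, and the
	 * unlocking reader would never wake the waiting writer. */
	return 0;
}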