 fs/bcachefs/alloc_background.c      | 560
 fs/bcachefs/alloc_background.h      |  33
 fs/bcachefs/alloc_foreground.c      | 561
 fs/bcachefs/alloc_foreground.h      |  11
 fs/bcachefs/alloc_types.h           |  23
 fs/bcachefs/bcachefs.h              |  23
 fs/bcachefs/btree_gc.c              |  10
 fs/bcachefs/btree_update_interior.c |  17
 fs/bcachefs/buckets.c               |  72
 fs/bcachefs/buckets.h               |  62
 fs/bcachefs/buckets_types.h         |   2
 fs/bcachefs/ec.c                    |  13
 fs/bcachefs/journal.c               |   2
 fs/bcachefs/journal_io.c            |   4
 fs/bcachefs/movinggc.c              |  23
 fs/bcachefs/recovery.c              |   2
 fs/bcachefs/super.c                 |  82
 fs/bcachefs/sysfs.c                 |  47
 fs/bcachefs/trace.h                 |  67
 19 files changed, 615 insertions(+), 999 deletions(-)
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 5d553d9b6151..3ba2b35fad53 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -27,13 +27,6 @@
#include <linux/sched/task.h>
#include <linux/sort.h>
-const char * const bch2_allocator_states[] = {
-#define x(n) #n,
- ALLOC_THREAD_STATES()
-#undef x
- NULL
-};
-
/* Persistent alloc info: */
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
@@ -431,7 +424,6 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only)
g->_mark.gen = a.gen;
g->io_time[READ] = a.io_time[READ];
g->io_time[WRITE] = a.io_time[WRITE];
- g->oldest_gen = !gc ? a.oldest_gen : a.gen;
g->gen_valid = 1;
if (!gc ||
@@ -553,7 +545,6 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
- SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
}
if (old_a.data_type && !new_a->data_type &&
@@ -698,493 +689,6 @@ out:
return ret;
}
-/* Background allocator thread: */
-
-/*
- * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
- * (marking them as invalidated on disk), then optionally issues discard
- * commands to the newly free buckets, then puts them on the various freelists.
- */
-
-/*
- * bucket_gc_gen() returns the difference between the bucket's current gen and
- * the oldest gen of any pointer into that bucket in the btree.
- */
-
-static inline u8 bucket_gc_gen(struct bucket *g)
-{
- return g->mark.gen - g->oldest_gen;
-}
-
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
- struct bucket_mark m)
-{
- u8 gc_gen;
-
- if (!is_available_bucket(m))
- return false;
-
- if (m.owned_by_allocator)
- return false;
-
- if (ca->buckets_nouse &&
- test_bit(b, ca->buckets_nouse))
- return false;
-
- if (ca->new_fs_bucket_idx) {
- /*
- * Device or filesystem is still being initialized, and we
- * haven't fully marked superblocks & journal:
- */
- if (is_superblock_bucket(ca, b))
- return false;
-
- if (b < ca->new_fs_bucket_idx)
- return false;
- }
-
- gc_gen = bucket_gc_gen(bucket(ca, b));
-
- ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2;
- ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX;
-
- return gc_gen < BUCKET_GC_GEN_MAX;
-}
-
-/*
- * Determines what order we're going to reuse buckets, smallest bucket_key()
- * first.
- */
-
-static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
- u64 now, u64 last_seq_ondisk)
-{
- unsigned used = m.cached_sectors;
-
- if (used) {
- /*
- * Prefer to keep buckets that have been read more recently, and
- * buckets that have more data in them:
- */
- u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
- u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
-
- return -last_read_scaled;
- } else {
- /*
- * Prefer to use buckets with smaller gc_gen so that we don't
- * have to walk the btree and recalculate oldest_gen - but shift
- * off the low bits so that buckets will still have equal sort
- * keys when there's only a small difference, so that we can
- * keep sequential buckets together:
- */
- return bucket_gc_gen(g) >> 4;
- }
-}
-
-static inline int bucket_alloc_cmp(alloc_heap *h,
- struct alloc_heap_entry l,
- struct alloc_heap_entry r)
-{
- return cmp_int(l.key, r.key) ?:
- cmp_int(r.nr, l.nr) ?:
- cmp_int(l.bucket, r.bucket);
-}
-
-static inline int bucket_idx_cmp(const void *_l, const void *_r)
-{
- const struct alloc_heap_entry *l = _l, *r = _r;
-
- return cmp_int(l->bucket, r->bucket);
-}
-
-static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bucket_array *buckets;
- struct alloc_heap_entry e = { 0 };
- u64 now, last_seq_ondisk;
- size_t b, i, nr = 0;
-
- down_read(&ca->bucket_lock);
-
- buckets = bucket_array(ca);
- ca->alloc_heap.used = 0;
- now = atomic64_read(&c->io_clock[READ].now);
- last_seq_ondisk = c->journal.flushed_seq_ondisk;
-
- /*
- * Find buckets with lowest read priority, by building a maxheap sorted
- * by read priority and repeatedly replacing the maximum element until
- * all buckets have been visited.
- */
- for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
- struct bucket *g = &buckets->b[b];
- struct bucket_mark m = READ_ONCE(g->mark);
- unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
-
- cond_resched();
-
- if (!bch2_can_invalidate_bucket(ca, b, m))
- continue;
-
- if (!m.data_type &&
- bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- last_seq_ondisk,
- ca->dev_idx, b)) {
- ca->buckets_waiting_on_journal++;
- continue;
- }
-
- if (e.nr && e.bucket + e.nr == b && e.key == key) {
- e.nr++;
- } else {
- if (e.nr)
- heap_add_or_replace(&ca->alloc_heap, e,
- -bucket_alloc_cmp, NULL);
-
- e = (struct alloc_heap_entry) {
- .bucket = b,
- .nr = 1,
- .key = key,
- };
- }
- }
-
- if (e.nr)
- heap_add_or_replace(&ca->alloc_heap, e,
- -bucket_alloc_cmp, NULL);
-
- for (i = 0; i < ca->alloc_heap.used; i++)
- nr += ca->alloc_heap.data[i].nr;
-
- while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
- nr -= ca->alloc_heap.data[0].nr;
- heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
- }
-
- up_read(&ca->bucket_lock);
-}
-
-static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
-{
- size_t i, nr = 0;
-
- ca->inc_gen_needs_gc = 0;
- ca->inc_gen_really_needs_gc = 0;
- ca->buckets_waiting_on_journal = 0;
-
- find_reclaimable_buckets_lru(c, ca);
-
- heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
-
- for (i = 0; i < ca->alloc_heap.used; i++)
- nr += ca->alloc_heap.data[i].nr;
-
- return nr;
-}
-
-static int bucket_invalidate_btree(struct btree_trans *trans,
- struct bch_dev *ca, u64 b,
- struct bkey_i_alloc_v4 *a)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- POS(ca->dev_idx, b),
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- bkey_alloc_v4_init(&a->k_i);
- a->k.p = iter.pos;
- bch2_alloc_to_v4(k, &a->v);
- a->v.gen++;
- a->v.data_type = 0;
- a->v.dirty_sectors = 0;
- a->v.cached_sectors = 0;
- a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
- a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now);
-
- ret = bch2_trans_update(trans, &iter, &a->k_i,
- BTREE_TRIGGER_BUCKET_INVALIDATE|
- BTREE_UPDATE_NO_KEY_CACHE_COHERENCY);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
- u64 *journal_seq, unsigned flags)
-{
- struct bkey_i_alloc_v4 a;
- size_t b;
- u64 commit_seq = 0;
- int ret = 0;
-
- /*
- * If the read-only path is trying to shut down, we can't be generating
- * new btree updates:
- */
- if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags))
- return 1;
-
- BUG_ON(!ca->alloc_heap.used ||
- !ca->alloc_heap.data[0].nr);
- b = ca->alloc_heap.data[0].bucket;
-
- /* first, put on free_inc and mark as owned by allocator: */
- percpu_down_read(&c->mark_lock);
-
- bch2_mark_alloc_bucket(c, ca, b, true);
-
- spin_lock(&c->freelist_lock);
- verify_not_on_freelist(c, ca, b);
- BUG_ON(!fifo_push(&ca->free_inc, b));
- spin_unlock(&c->freelist_lock);
-
- percpu_up_read(&c->mark_lock);
-
- ret = bch2_trans_do(c, NULL, &commit_seq,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- flags,
- bucket_invalidate_btree(&trans, ca, b, &a));
-
- if (!ret) {
- /* remove from alloc_heap: */
- struct alloc_heap_entry e, *top = ca->alloc_heap.data;
-
- top->bucket++;
- top->nr--;
-
- if (!top->nr)
- heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
-
- /*
- * If we invalidating cached data then we need to wait on the
- * journal commit:
- */
- if (a.v.data_type)
- *journal_seq = max(*journal_seq, commit_seq);
-
- /*
- * We already waiting on u.alloc_seq when we filtered out
- * buckets that need journal commit:
- */
- BUG_ON(*journal_seq > a.v.journal_seq);
- } else {
- size_t b2;
-
- /* remove from free_inc: */
- percpu_down_read(&c->mark_lock);
- spin_lock(&c->freelist_lock);
-
- bch2_mark_alloc_bucket(c, ca, b, false);
-
- BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
- BUG_ON(b != b2);
-
- spin_unlock(&c->freelist_lock);
- percpu_up_read(&c->mark_lock);
- }
-
- return ret < 0 ? ret : 0;
-}
-
-/*
- * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
- */
-static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
-{
- u64 journal_seq = 0;
- int ret = 0;
-
- /* Only use nowait if we've already invalidated at least one bucket: */
- while (!ret &&
- !fifo_full(&ca->free_inc) &&
- ca->alloc_heap.used) {
- if (kthread_should_stop()) {
- ret = 1;
- break;
- }
-
- ret = bch2_invalidate_one_bucket(c, ca, &journal_seq,
- (!fifo_empty(&ca->free_inc)
- ? BTREE_INSERT_NOWAIT : 0));
- /*
- * We only want to batch up invalidates when they're going to
- * require flushing the journal:
- */
- if (!journal_seq)
- break;
- }
-
- /* If we used NOWAIT, don't return the error: */
- if (!fifo_empty(&ca->free_inc))
- ret = 0;
- if (ret < 0)
- bch_err(ca, "error invalidating buckets: %i", ret);
- if (ret)
- return ret;
-
- if (journal_seq)
- ret = bch2_journal_flush_seq(&c->journal, journal_seq);
- if (ret) {
- bch_err(ca, "journal error: %i", ret);
- return ret;
- }
-
- return 0;
-}
-
-static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state)
-{
- if (ca->allocator_state != new_state) {
- ca->allocator_state = new_state;
- closure_wake_up(&ca->fs->freelist_wait);
- }
-}
-
-static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
-{
- unsigned i;
- int ret = 0;
-
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++) {
- /*
- * Don't strand buckets on the copygc freelist until
- * after recovery is finished:
- */
- if (i == RESERVE_movinggc &&
- !test_bit(BCH_FS_STARTED, &c->flags))
- continue;
-
- if (fifo_push(&ca->free[i], b)) {
- fifo_pop(&ca->free_inc, b);
- ret = 1;
- break;
- }
- }
- spin_unlock(&c->freelist_lock);
-
- ca->allocator_state = ret
- ? ALLOCATOR_running
- : ALLOCATOR_blocked_full;
- closure_wake_up(&c->freelist_wait);
- return ret;
-}
-
-static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
-{
- if (!c->opts.nochanges &&
- ca->mi.discard &&
- bdev_max_discard_sectors(ca->disk_sb.bdev))
- blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b),
- ca->mi.bucket_size, GFP_NOFS);
-}
-
-static bool allocator_thread_running(struct bch_dev *ca)
-{
- unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
- test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
- ? ALLOCATOR_running
- : ALLOCATOR_stopped;
- alloc_thread_set_state(ca, state);
- return state == ALLOCATOR_running;
-}
-
-static int buckets_available(struct bch_dev *ca, unsigned long gc_count)
-{
- s64 available = dev_buckets_reclaimable(ca) -
- (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0);
- bool ret = available > 0;
-
- alloc_thread_set_state(ca, ret
- ? ALLOCATOR_running
- : ALLOCATOR_blocked);
- return ret;
-}
-
-/**
- * bch_allocator_thread - move buckets from free_inc to reserves
- *
- * The free_inc FIFO is populated by find_reclaimable_buckets(), and
- * the reserves are depleted by bucket allocation. When we run out
- * of free_inc, try to invalidate some buckets and write out
- * prios and gens.
- */
-static int bch2_allocator_thread(void *arg)
-{
- struct bch_dev *ca = arg;
- struct bch_fs *c = ca->fs;
- unsigned long gc_count = c->gc_count;
- size_t nr;
- int ret;
-
- set_freezable();
-
- while (1) {
- ret = kthread_wait_freezable(allocator_thread_running(ca));
- if (ret)
- goto stop;
-
- while (!ca->alloc_heap.used) {
- cond_resched();
-
- ret = kthread_wait_freezable(buckets_available(ca, gc_count));
- if (ret)
- goto stop;
-
- gc_count = c->gc_count;
- nr = find_reclaimable_buckets(c, ca);
-
- if (!nr && ca->buckets_waiting_on_journal) {
- ret = bch2_journal_flush(&c->journal);
- if (ret)
- goto stop;
- } else if (nr < (ca->mi.nbuckets >> 6) &&
- ca->buckets_waiting_on_journal >= nr / 2) {
- bch2_journal_flush_async(&c->journal, NULL);
- }
-
- if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
- ca->inc_gen_really_needs_gc) &&
- c->gc_thread) {
- atomic_inc(&c->kick_gc);
- wake_up_process(c->gc_thread);
- }
-
- trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
- ca->inc_gen_really_needs_gc);
- }
-
- ret = bch2_invalidate_buckets(c, ca);
- if (ret)
- goto stop;
-
- while (!fifo_empty(&ca->free_inc)) {
- u64 b = fifo_peek(&ca->free_inc);
-
- discard_one_bucket(c, ca, b);
-
- ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b));
- if (ret)
- goto stop;
- }
- }
-stop:
- alloc_thread_set_state(ca, ALLOCATOR_stopped);
- return 0;
-}
-
/* Startup/shutdown (ro/rw): */
void bch2_recalc_capacity(struct bch_fs *c)
@@ -1193,7 +697,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
u64 capacity = 0, reserved_sectors = 0, gc_reserve;
unsigned bucket_size_max = 0;
unsigned long ra_pages = 0;
- unsigned i, j;
+ unsigned i;
lockdep_assert_held(&c->state_lock);
@@ -1224,8 +728,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
* allocations for foreground writes must wait -
* not -ENOSPC calculations.
*/
- for (j = 0; j < RESERVE_none; j++)
- dev_reserve += ca->free[j].size;
+
+ dev_reserve += ca->nr_btree_reserve * 2;
+ dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
dev_reserve += 1; /* btree write point */
dev_reserve += 1; /* copygc write point */
@@ -1281,8 +786,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
unsigned i;
- BUG_ON(ca->alloc_thread);
-
/* First, remove device from allocation groups: */
for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
@@ -1356,61 +859,6 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
-void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
-{
- if (ca->alloc_thread)
- closure_wait_event(&c->freelist_wait,
- ca->allocator_state != ALLOCATOR_running);
-}
-
-/* stop allocator thread: */
-void bch2_dev_allocator_stop(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- p = rcu_dereference_protected(ca->alloc_thread, 1);
- ca->alloc_thread = NULL;
-
- /*
- * We need an rcu barrier between setting ca->alloc_thread = NULL and
- * the thread shutting down to avoid bch2_wake_allocator() racing:
- *
- * XXX: it would be better to have the rcu barrier be asynchronous
- * instead of blocking us here
- */
- synchronize_rcu();
-
- if (p) {
- kthread_stop(p);
- put_task_struct(p);
- }
-}
-
-/* start allocator thread: */
-int bch2_dev_allocator_start(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- /*
- * allocator thread already started?
- */
- if (ca->alloc_thread)
- return 0;
-
- p = kthread_create(bch2_allocator_thread, ca,
- "bch-alloc/%s", ca->name);
- if (IS_ERR(p)) {
- bch_err(ca->fs, "error creating allocator thread: %li",
- PTR_ERR(p));
- return PTR_ERR(p);
- }
-
- get_task_struct(p);
- rcu_assign_pointer(ca->alloc_thread, p);
- wake_up_process(p);
- return 0;
-}
-
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
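The deletions above remove the per-device background allocator thread and its whole pipeline; its replacement lives in alloc_foreground.c below. A rough before/after sketch of the control flow (illustrative pseudocode summarizing this patch, not literal kernel code):

	/*
	 * Old model (removed here): a kthread per device kept the freelists full:
	 *
	 *	loop:
	 *		find_reclaimable_buckets()	-> fill ca->alloc_heap
	 *		bch2_invalidate_buckets()	-> bump gens, push to ca->free_inc
	 *		discard, then push onto the ca->free[reserve] FIFOs
	 *
	 *	foreground allocation == fifo_pop(&ca->free[RESERVE_none], b)
	 *
	 * New model (added in alloc_foreground.c): no thread and no FIFOs; a
	 * foreground allocation walks the freespace btree (or the alloc btree
	 * before freespace is initialized) from ca->alloc_cursor and takes the
	 * first bucket that is free, not currently open, not marked nouse, and
	 * whose previous contents have already been flushed by the journal.
	 */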
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 8de109674f0f..74b23f9b1bd3 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -8,8 +8,6 @@
#include "debug.h"
#include "super.h"
-extern const char * const bch2_allocator_states[];
-
/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U
@@ -117,42 +115,11 @@ int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c,
struct bkey_i *, unsigned);
int bch2_fs_freespace_init(struct bch_fs *);
-static inline void bch2_wake_allocator(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- rcu_read_lock();
- p = rcu_dereference(ca->alloc_thread);
- if (p)
- wake_up_process(p);
- rcu_read_unlock();
-}
-
-static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
- size_t bucket)
-{
- if (bch2_expensive_debug_checks) {
- size_t iter;
- long i;
- unsigned j;
-
- for (j = 0; j < RESERVE_NR; j++)
- fifo_for_each_entry(i, &ca->free[j], iter)
- BUG_ON(i == bucket);
- fifo_for_each_entry(i, &ca->free_inc, iter)
- BUG_ON(i == bucket);
- }
-}
-
void bch2_recalc_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_stop(struct bch_dev *);
-int bch2_dev_allocator_start(struct bch_dev *);
-
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index c4b4689fdd0f..01abcf43341f 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -14,13 +14,18 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "btree_iter.h"
+#include "btree_update.h"
#include "btree_gc.h"
#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
+#include "error.h"
#include "io.h"
+#include "journal.h"
#include "trace.h"
#include <linux/math64.h>
@@ -50,6 +55,17 @@ const char * const bch2_alloc_reserves[] = {
* reference _after_ doing the index update that makes its allocation reachable.
*/
+void bch2_reset_alloc_cursors(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL)
+ ca->alloc_cursor = 0;
+ rcu_read_unlock();
+}
+
static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
{
open_bucket_idx_t idx = ob - c->open_buckets;
@@ -85,7 +101,6 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
percpu_down_read(&c->mark_lock);
spin_lock(&ob->lock);
- bch2_mark_alloc_bucket(c, ca, ob->bucket, false);
ob->valid = false;
ob->data_type = 0;
@@ -185,39 +200,35 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
}
}
-/**
- * bch_bucket_alloc - allocate a single bucket from a specific device
- *
- * Returns index of bucket on success, 0 on failure
- * */
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
- enum alloc_reserve reserve,
- bool may_alloc_partial,
- struct closure *cl)
+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+ u64 bucket,
+ enum alloc_reserve reserve,
+ struct bch_alloc_v4 *a,
+ u64 *skipped_open,
+ u64 *skipped_need_journal_commit,
+ u64 *skipped_nouse,
+ struct closure *cl)
{
struct open_bucket *ob;
- long b = 0;
- spin_lock(&c->freelist_lock);
+ if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
+ (*skipped_nouse)++;
+ return NULL;
+ }
- if (may_alloc_partial) {
- int i;
-
- for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
- ob = c->open_buckets + ca->open_buckets_partial[i];
-
- if (reserve <= ob->alloc_reserve) {
- array_remove_item(ca->open_buckets_partial,
- ca->open_buckets_partial_nr,
- i);
- ob->on_partial_list = false;
- ob->alloc_reserve = reserve;
- spin_unlock(&c->freelist_lock);
- return ob;
- }
- }
+ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+ (*skipped_open)++;
+ return NULL;
+ }
+
+ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
+ (*skipped_need_journal_commit)++;
+ return NULL;
}
+ spin_lock(&c->freelist_lock);
+
if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
if (cl)
closure_wait(&c->open_buckets_wait, cl);
@@ -226,36 +237,16 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
c->blocked_allocate_open_bucket = local_clock();
spin_unlock(&c->freelist_lock);
- trace_open_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve]);
return ERR_PTR(-OPEN_BUCKETS_EMPTY);
}
- if (likely(fifo_pop(&ca->free[RESERVE_none], b)))
- goto out;
-
- switch (reserve) {
- case RESERVE_btree_movinggc:
- case RESERVE_movinggc:
- if (fifo_pop(&ca->free[RESERVE_movinggc], b))
- goto out;
- break;
- default:
- break;
+ /* Recheck under lock: */
+ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+ spin_unlock(&c->freelist_lock);
+ (*skipped_open)++;
+ return NULL;
}
- if (cl)
- closure_wait(&c->freelist_wait, cl);
-
- if (!c->blocked_allocate)
- c->blocked_allocate = local_clock();
-
- spin_unlock(&c->freelist_lock);
-
- trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve]);
- return ERR_PTR(-FREELIST_EMPTY);
-out:
- verify_not_on_freelist(c, ca, b);
-
ob = bch2_open_bucket_alloc(c);
spin_lock(&ob->lock);
@@ -264,8 +255,8 @@ out:
ob->sectors_free = ca->mi.bucket_size;
ob->alloc_reserve = reserve;
ob->dev = ca->dev_idx;
- ob->gen = *bucket_gen(ca, b);
- ob->bucket = b;
+ ob->gen = a->gen;
+ ob->bucket = bucket;
spin_unlock(&ob->lock);
ca->nr_open_buckets++;
@@ -286,10 +277,326 @@ out:
}
spin_unlock(&c->freelist_lock);
+ return ob;
+}
+
+static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
+ enum alloc_reserve reserve, u64 free_entry,
+ u64 *skipped_open,
+ u64 *skipped_need_journal_commit,
+ u64 *skipped_nouse,
+ struct bkey_s_c freespace_k,
+ struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c k;
+ struct open_bucket *ob;
+ struct bch_alloc_v4 a;
+ u64 b = free_entry & ~(~0ULL << 56);
+ unsigned genbits = free_entry >> 56;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
+ pr_buf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
+ " freespace key ",
+ ca->mi.first_bucket, ca->mi.nbuckets);
+ bch2_bkey_val_to_text(&buf, c, freespace_k);
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ ob = ERR_PTR(-EIO);
+ goto err;
+ }
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret) {
+ ob = ERR_PTR(ret);
+ goto err;
+ }
+
+ bch2_alloc_to_v4(k, &a);
+
+ if (genbits != (alloc_freespace_genbits(a) >> 56)) {
+ pr_buf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
+ " freespace key ",
+ genbits, alloc_freespace_genbits(a) >> 56);
+ bch2_bkey_val_to_text(&buf, c, freespace_k);
+ pr_buf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ ob = ERR_PTR(-EIO);
+ goto err;
+
+ }
+
+ if (a.data_type != BUCKET_free) {
+ pr_buf(&buf, "non free bucket in freespace btree\n"
+ " freespace key ");
+ bch2_bkey_val_to_text(&buf, c, freespace_k);
+ pr_buf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ ob = ERR_PTR(-EIO);
+ goto err;
+ }
+
+ ob = __try_alloc_bucket(c, ca, b, reserve, &a,
+ skipped_open,
+ skipped_need_journal_commit,
+ skipped_nouse,
+ cl);
+ if (!ob)
+ iter.path->preserve = false;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ob;
+}
+
+static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca,
+ enum alloc_reserve reserve)
+{
+ struct open_bucket *ob;
+ int i;
+
+ spin_lock(&c->freelist_lock);
+
+ for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
+ ob = c->open_buckets + ca->open_buckets_partial[i];
+
+ if (reserve <= ob->alloc_reserve) {
+ array_remove_item(ca->open_buckets_partial,
+ ca->open_buckets_partial_nr,
+ i);
+ ob->on_partial_list = false;
+ ob->alloc_reserve = reserve;
+ spin_unlock(&c->freelist_lock);
+ return ob;
+ }
+ }
+
+ spin_unlock(&c->freelist_lock);
+ return NULL;
+}
+
+/*
+ * This path is for before the freespace btree is initialized:
+ *
+ * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
+ * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
+ */
+static noinline struct open_bucket *
+bch2_bucket_alloc_early(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ u64 *buckets_seen,
+ u64 *skipped_open,
+ u64 *skipped_need_journal_commit,
+ u64 *skipped_nouse,
+ struct closure *cl)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct open_bucket *ob = NULL;
+ u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
+ u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor));
+ int ret;
+again:
+ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
+ BTREE_ITER_SLOTS, k, ret) {
+ struct bch_alloc_v4 a;
+
+ if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
+ break;
+
+ if (ca->new_fs_bucket_idx &&
+ is_superblock_bucket(ca, k.k->p.offset))
+ continue;
+
+ bch2_alloc_to_v4(k, &a);
+
+ if (bucket_state(a) != BUCKET_free)
+ continue;
+
+ (*buckets_seen)++;
+
+ ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a,
+ skipped_open,
+ skipped_need_journal_commit,
+ skipped_nouse,
+ cl);
+ if (ob)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ ca->alloc_cursor = alloc_cursor;
+
+ if (!ob && alloc_cursor > alloc_start) {
+ alloc_cursor = alloc_start;
+ goto again;
+ }
+
+ return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY);
+}
+
+static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ u64 *buckets_seen,
+ u64 *skipped_open,
+ u64 *skipped_need_journal_commit,
+ u64 *skipped_nouse,
+ struct closure *cl)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct open_bucket *ob = NULL;
+ u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
+ u64 alloc_cursor = alloc_start;
+ int ret;
+
+ BUG_ON(ca->new_fs_bucket_idx);
+again:
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
+ POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
+ if (k.k->p.inode != ca->dev_idx)
+ break;
+
+ for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
+ alloc_cursor < k.k->p.offset;
+ alloc_cursor++) {
+ if (btree_trans_too_many_iters(trans)) {
+ ob = ERR_PTR(-EINTR);
+ break;
+ }
+
+ (*buckets_seen)++;
+
+ ob = try_alloc_bucket(trans, ca, reserve,
+ alloc_cursor,
+ skipped_open,
+ skipped_need_journal_commit,
+ skipped_nouse,
+ k, cl);
+ if (ob) {
+ iter.path->preserve = false;
+ break;
+ }
+ }
+ if (ob)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ ca->alloc_cursor = alloc_cursor;
+
+ if (!ob && ret)
+ ob = ERR_PTR(ret);
+
+ if (!ob && alloc_start > ca->mi.first_bucket) {
+ alloc_cursor = alloc_start = ca->mi.first_bucket;
+ goto again;
+ }
+
+ return ob;
+}
+
+/**
+ * bch_bucket_alloc - allocate a single bucket from a specific device
+ *
+ * Returns index of bucket on success, 0 on failure
+ * */
+static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ bool may_alloc_partial,
+ struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct open_bucket *ob = NULL;
+ u64 avail = dev_buckets_available(ca, reserve);
+ u64 buckets_seen = 0;
+ u64 skipped_open = 0;
+ u64 skipped_need_journal_commit = 0;
+ u64 skipped_nouse = 0;
+
+ if (may_alloc_partial) {
+ ob = try_alloc_partial_bucket(c, ca, reserve);
+ if (ob)
+ return ob;
+ }
+again:
+ if (!avail) {
+ if (cl) {
+ closure_wait(&c->freelist_wait, cl);
+ /* recheck after putting ourselves on waitlist */
+ avail = dev_buckets_available(ca, reserve);
+ if (avail) {
+ closure_wake_up(&c->freelist_wait);
+ goto again;
+ }
+ }
+
+ if (!c->blocked_allocate)
+ c->blocked_allocate = local_clock();
+
+ ob = ERR_PTR(-FREELIST_EMPTY);
+ goto err;
+ }
+
+ ob = likely(ca->mi.freespace_initialized)
+ ? bch2_bucket_alloc_freelist(trans, ca, reserve,
+ &buckets_seen,
+ &skipped_open,
+ &skipped_need_journal_commit,
+ &skipped_nouse,
+ cl)
+ : bch2_bucket_alloc_early(trans, ca, reserve,
+ &buckets_seen,
+ &skipped_open,
+ &skipped_need_journal_commit,
+ &skipped_nouse,
+ cl);
+
+ if (skipped_need_journal_commit * 2 > avail)
+ bch2_journal_flush_async(&c->journal, NULL);
+err:
+ if (!ob)
+ ob = ERR_PTR(-FREELIST_EMPTY);
+
+ if (!IS_ERR(ob)) {
+ trace_bucket_alloc(ca, bch2_alloc_reserves[reserve], avail,
+ buckets_seen,
+ skipped_open,
+ skipped_need_journal_commit,
+ skipped_nouse,
+ cl == NULL, PTR_ERR_OR_ZERO(ob));
+ } else {
+ trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], avail,
+ buckets_seen,
+ skipped_open,
+ skipped_need_journal_commit,
+ skipped_nouse,
+ cl == NULL, PTR_ERR_OR_ZERO(ob));
+ atomic_long_inc(&c->bucket_alloc_fail);
+ }
+
+ return ob;
+}
- bch2_wake_allocator(ca);
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ bool may_alloc_partial,
+ struct closure *cl)
+{
+ struct open_bucket *ob;
- trace_bucket_alloc(ca, bch2_alloc_reserves[reserve]);
+ bch2_trans_do(c, NULL, NULL, 0,
+ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve,
+ may_alloc_partial, cl)));
return ob;
}
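The decode in try_alloc_bucket() above splits each freespace entry into a 56-bit bucket number plus 8 high generation bits used to detect stale entries. A minimal standalone sketch of that packing; the helper names are illustrative and not part of this patch:

	#include <linux/types.h>

	static inline u64 freespace_entry(u64 bucket, unsigned genbits)
	{
		/* high 8 bits: genbits, low 56 bits: bucket number */
		return ((u64) genbits << 56) | (bucket & ~(~0ULL << 56));
	}

	static inline u64 freespace_entry_bucket(u64 entry)
	{
		return entry & ~(~0ULL << 56);	/* matches: b = free_entry & ~(~0ULL << 56) */
	}

	static inline unsigned freespace_entry_genbits(u64 entry)
	{
		return entry >> 56;		/* matches: genbits = free_entry >> 56 */
	}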
@@ -320,7 +627,7 @@ void bch2_dev_stripe_increment(struct bch_dev *ca,
struct dev_stripe_state *stripe)
{
u64 *v = stripe->next_alloc + ca->dev_idx;
- u64 free_space = dev_buckets_available(ca);
+ u64 free_space = dev_buckets_available(ca, RESERVE_none);
u64 free_space_inv = free_space
? div64_u64(1ULL << 48, free_space)
: 1ULL << 48;
@@ -358,7 +665,7 @@ static void add_new_bucket(struct bch_fs *c,
ob_push(c, ptrs, ob);
}
-int bch2_bucket_alloc_set(struct bch_fs *c,
+static int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
struct open_buckets *ptrs,
struct dev_stripe_state *stripe,
struct bch_devs_mask *devs_may_alloc,
@@ -369,10 +676,12 @@ int bch2_bucket_alloc_set(struct bch_fs *c,
unsigned flags,
struct closure *cl)
{
+ struct bch_fs *c = trans->c;
struct dev_alloc_list devs_sorted =
bch2_dev_alloc_list(c, stripe, devs_may_alloc);
+ unsigned dev;
struct bch_dev *ca;
- int ret = -INSUFFICIENT_DEVICES;
+ int ret = 0;
unsigned i;
BUG_ON(*nr_effective >= nr_replicas);
@@ -380,35 +689,68 @@ int bch2_bucket_alloc_set(struct bch_fs *c,
for (i = 0; i < devs_sorted.nr; i++) {
struct open_bucket *ob;
- ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+ dev = devs_sorted.devs[i];
+
+ rcu_read_lock();
+ ca = rcu_dereference(c->devs[dev]);
+ if (ca)
+ percpu_ref_get(&ca->ref);
+ rcu_read_unlock();
+
if (!ca)
continue;
- if (!ca->mi.durability && *have_cache)
+ if (!ca->mi.durability && *have_cache) {
+ percpu_ref_put(&ca->ref);
continue;
+ }
- ob = bch2_bucket_alloc(c, ca, reserve,
+ ob = bch2_bucket_alloc_trans(trans, ca, reserve,
flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
- if (IS_ERR(ob)) {
- ret = PTR_ERR(ob);
-
- if (cl)
- return ret;
+ if (!IS_ERR(ob))
+ bch2_dev_stripe_increment(ca, stripe);
+ percpu_ref_put(&ca->ref);
+
+ ret = PTR_ERR_OR_ZERO(ob);
+ if (ret) {
+ if (ret == -EINTR || cl)
+ break;
continue;
}
add_new_bucket(c, ptrs, devs_may_alloc,
nr_effective, have_cache, flags, ob);
- bch2_dev_stripe_increment(ca, stripe);
-
if (*nr_effective >= nr_replicas)
- return 0;
+ break;
}
+ if (*nr_effective >= nr_replicas)
+ ret = 0;
+ else if (!ret)
+ ret = -INSUFFICIENT_DEVICES;
+
return ret;
}
+int bch2_bucket_alloc_set(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct dev_stripe_state *stripe,
+ struct bch_devs_mask *devs_may_alloc,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ enum alloc_reserve reserve,
+ unsigned flags,
+ struct closure *cl)
+{
+ return bch2_trans_do(c, NULL, NULL, 0,
+ bch2_bucket_alloc_set_trans(&trans, ptrs, stripe,
+ devs_may_alloc, nr_replicas,
+ nr_effective, have_cache, reserve,
+ flags, cl));
+}
+
/* Allocate from stripes: */
/*
@@ -513,7 +855,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c,
wp->ptrs = ptrs_skip;
}
-static int open_bucket_add_buckets(struct bch_fs *c,
+static int open_bucket_add_buckets(struct btree_trans *trans,
struct open_buckets *ptrs,
struct write_point *wp,
struct bch_devs_list *devs_have,
@@ -526,6 +868,7 @@ static int open_bucket_add_buckets(struct bch_fs *c,
unsigned flags,
struct closure *_cl)
{
+ struct bch_fs *c = trans->c;
struct bch_devs_mask devs;
struct open_bucket *ob;
struct closure *cl = NULL;
@@ -557,7 +900,8 @@ static int open_bucket_add_buckets(struct bch_fs *c,
target, erasure_code,
nr_replicas, nr_effective,
have_cache, flags, _cl);
- if (ret == -FREELIST_EMPTY ||
+ if (ret == -EINTR ||
+ ret == -FREELIST_EMPTY ||
ret == -OPEN_BUCKETS_EMPTY)
return ret;
if (*nr_effective >= nr_replicas)
@@ -571,25 +915,22 @@ static int open_bucket_add_buckets(struct bch_fs *c,
if (*nr_effective >= nr_replicas)
return 0;
- percpu_down_read(&c->mark_lock);
- rcu_read_lock();
-
retry_blocking:
/*
* Try nonblocking first, so that if one device is full we'll try from
* other devices:
*/
- ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
+ ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
nr_replicas, nr_effective, have_cache,
reserve, flags, cl);
- if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) {
+ if (ret &&
+ ret != -EINTR &&
+ ret != -INSUFFICIENT_DEVICES &&
+ !cl && _cl) {
cl = _cl;
goto retry_blocking;
}
- rcu_read_unlock();
- percpu_up_read(&c->mark_lock);
-
return ret;
}
@@ -703,15 +1044,25 @@ static bool try_decrease_writepoints(struct bch_fs *c,
return true;
}
-static struct write_point *writepoint_find(struct bch_fs *c,
+static void bch2_trans_mutex_lock(struct btree_trans *trans,
+ struct mutex *lock)
+{
+ if (!mutex_trylock(lock)) {
+ bch2_trans_unlock(trans);
+ mutex_lock(lock);
+ }
+}
+
+static struct write_point *writepoint_find(struct btree_trans *trans,
unsigned long write_point)
{
+ struct bch_fs *c = trans->c;
struct write_point *wp, *oldest;
struct hlist_head *head;
if (!(write_point & 1UL)) {
wp = (struct write_point *) write_point;
- mutex_lock(&wp->lock);
+ bch2_trans_mutex_lock(trans, &wp->lock);
return wp;
}
@@ -720,7 +1071,7 @@ restart_find:
wp = __writepoint_find(head, write_point);
if (wp) {
lock_wp:
- mutex_lock(&wp->lock);
+ bch2_trans_mutex_lock(trans, &wp->lock);
if (wp->write_point == write_point)
goto out;
mutex_unlock(&wp->lock);
@@ -733,8 +1084,8 @@ restart_find_oldest:
if (!oldest || time_before64(wp->last_used, oldest->last_used))
oldest = wp;
- mutex_lock(&oldest->lock);
- mutex_lock(&c->write_points_hash_lock);
+ bch2_trans_mutex_lock(trans, &oldest->lock);
+ bch2_trans_mutex_lock(trans, &c->write_points_hash_lock);
if (oldest >= c->write_points + c->write_points_nr ||
try_increase_writepoints(c)) {
mutex_unlock(&c->write_points_hash_lock);
@@ -762,7 +1113,7 @@ out:
/*
* Get us an open_bucket we can allocate from, return with it locked:
*/
-int bch2_alloc_sectors_start(struct bch_fs *c,
+int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
unsigned target,
unsigned erasure_code,
struct write_point_specifier write_point,
@@ -774,6 +1125,7 @@ int bch2_alloc_sectors_start(struct bch_fs *c,
struct closure *cl,
struct write_point **wp_ret)
{
+ struct bch_fs *c = trans->c;
struct write_point *wp;
struct open_bucket *ob;
struct open_buckets ptrs;
@@ -793,7 +1145,7 @@ retry:
write_points_nr = c->write_points_nr;
have_cache = false;
- *wp_ret = wp = writepoint_find(c, write_point.v);
+ *wp_ret = wp = writepoint_find(trans, write_point.v);
if (wp->data_type == BCH_DATA_user)
ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
@@ -803,21 +1155,21 @@ retry:
have_cache = true;
if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
- ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
target, erasure_code,
nr_replicas, &nr_effective,
&have_cache, reserve,
ob_flags, cl);
} else {
- ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
target, erasure_code,
nr_replicas, &nr_effective,
&have_cache, reserve,
ob_flags, NULL);
- if (!ret)
+ if (!ret || ret == -EINTR)
goto alloc_done;
- ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
0, erasure_code,
nr_replicas, &nr_effective,
&have_cache, reserve,
@@ -871,10 +1223,33 @@ err:
case -INSUFFICIENT_DEVICES:
return -EROFS;
default:
- BUG();
+ return ret;
}
}
+int bch2_alloc_sectors_start(struct bch_fs *c,
+ unsigned target,
+ unsigned erasure_code,
+ struct write_point_specifier write_point,
+ struct bch_devs_list *devs_have,
+ unsigned nr_replicas,
+ unsigned nr_replicas_required,
+ enum alloc_reserve reserve,
+ unsigned flags,
+ struct closure *cl,
+ struct write_point **wp_ret)
+{
+ return bch2_trans_do(c, NULL, NULL, 0,
+ bch2_alloc_sectors_start_trans(&trans, target,
+ erasure_code,
+ write_point,
+ devs_have,
+ nr_replicas,
+ nr_replicas_required,
+ reserve,
+ flags, cl, wp_ret));
+}
+
struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
{
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
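On writepoint_find() now taking a btree_trans (reasoning inferred from the surrounding changes): bucket allocation now happens inside a btree transaction, so sleeping on a write-point mutex while still holding btree node locks could deadlock against a task that owns that write point and needs those nodes. The pattern bch2_trans_mutex_lock() uses, sketched with added comments:

	if (!mutex_trylock(lock)) {
		/*
		 * Contended: drop the transaction's btree node locks before
		 * sleeping, so whoever holds the lock can still make progress;
		 * the transaction relocks (or restarts) on its next btree op.
		 */
		bch2_trans_unlock(trans);
		mutex_lock(lock);
	}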
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index efb6d155bd00..12583a7e7aa3 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -14,6 +14,8 @@ struct bch_devs_List;
extern const char * const bch2_alloc_reserves[];
+void bch2_reset_alloc_cursors(struct bch_fs *);
+
struct dev_alloc_list {
unsigned nr;
u8 devs[BCH_SB_MEMBERS_MAX];
@@ -136,6 +138,15 @@ int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
unsigned, unsigned *, bool *, enum alloc_reserve,
unsigned, struct closure *);
+int bch2_alloc_sectors_start_trans(struct btree_trans *,
+ unsigned, unsigned,
+ struct write_point_specifier,
+ struct bch_devs_list *,
+ unsigned, unsigned,
+ enum alloc_reserve,
+ unsigned,
+ struct closure *,
+ struct write_point **);
int bch2_alloc_sectors_start(struct bch_fs *,
unsigned, unsigned,
struct write_point_specifier,
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 9e00afb17559..b3bef7074511 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -10,18 +10,6 @@
struct ec_bucket_buf;
-#define ALLOC_THREAD_STATES() \
- x(stopped) \
- x(running) \
- x(blocked) \
- x(blocked_full)
-
-enum allocator_states {
-#define x(n) ALLOCATOR_##n,
- ALLOC_THREAD_STATES()
-#undef x
-};
-
#define BCH_ALLOC_RESERVES() \
x(btree_movinggc) \
x(btree) \
@@ -32,11 +20,8 @@ enum alloc_reserve {
#define x(name) RESERVE_##name,
BCH_ALLOC_RESERVES()
#undef x
- RESERVE_NR
};
-typedef FIFO(long) alloc_fifo;
-
#define OPEN_BUCKETS_COUNT 1024
#define WRITE_POINT_HASH_NR 32
@@ -127,12 +112,4 @@ struct write_point_specifier {
unsigned long v;
};
-struct alloc_heap_entry {
- size_t bucket;
- size_t nr;
- unsigned long key;
-};
-
-typedef HEAP(struct alloc_heap_entry) alloc_heap;
-
#endif /* _BCACHEFS_ALLOC_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 6c11ebee73a9..879b2adc8b42 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -462,34 +462,18 @@ struct bch_dev {
/* Allocator: */
u64 new_fs_bucket_idx;
- struct task_struct __rcu *alloc_thread;
+ u64 alloc_cursor;
- /*
- * free: Buckets that are ready to be used
- *
- * free_inc: Incoming buckets - these are buckets that currently have
- * cached data in them, and we can't reuse them until after we write
- * their new gen to disk. After prio_write() finishes writing the new
- * gens/prios, they'll be moved to the free list (and possibly discarded
- * in the process)
- */
- alloc_fifo free[RESERVE_NR];
- alloc_fifo free_inc;
unsigned nr_open_buckets;
+ unsigned nr_btree_reserve;
open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT];
open_bucket_idx_t open_buckets_partial_nr;
- size_t fifo_last_bucket;
-
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
size_t buckets_waiting_on_journal;
- enum allocator_states allocator_state;
-
- alloc_heap alloc_heap;
-
atomic64_t rebalance_work;
struct journal_device journal;
@@ -511,8 +495,6 @@ struct bch_dev {
enum {
/* startup: */
BCH_FS_ALLOC_CLEAN,
- BCH_FS_ALLOCATOR_RUNNING,
- BCH_FS_ALLOCATOR_STOPPING,
BCH_FS_INITIAL_GC_DONE,
BCH_FS_INITIAL_GC_UNFIXED,
BCH_FS_TOPOLOGY_REPAIR_DONE,
@@ -914,6 +896,7 @@ mempool_t bio_bounce_pages;
atomic_long_t read_realloc_races;
atomic_long_t extent_migrate_done;
atomic_long_t extent_migrate_raced;
+ atomic_long_t bucket_alloc_fail;
unsigned btree_gc_periodic:1;
unsigned copy_gc_enabled:1;
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index c3d6c62ef062..7078b277e23b 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -1684,9 +1684,8 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
*/
int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
{
- struct bch_dev *ca;
u64 start_time = local_clock();
- unsigned i, iter = 0;
+ unsigned iter = 0;
int ret;
lockdep_assert_held(&c->state_lock);
@@ -1788,13 +1787,6 @@ out:
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
/*
- * Wake up allocator in case it was waiting for buckets
- * because of not being able to inc gens
- */
- for_each_member_device(ca, c, i)
- bch2_wake_allocator(ca);
-
- /*
* At startup, allocations can happen directly instead of via the
* allocator thread - issue wakeup in case they blocked on gc_lock:
*/
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index d1e3e2c76e30..2e958f88777b 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -178,12 +178,13 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
six_unlock_intent(&b->c.lock);
}
-static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
+static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct disk_reservation *res,
struct closure *cl,
bool interior_node,
unsigned flags)
{
+ struct bch_fs *c = trans->c;
struct write_point *wp;
struct btree *b;
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
@@ -214,7 +215,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
- ret = bch2_alloc_sectors_start(c,
+ ret = bch2_alloc_sectors_start_trans(trans,
c->opts.metadata_target ?:
c->opts.foreground_target,
0,
@@ -414,7 +415,8 @@ static void bch2_btree_reserve_put(struct btree_update *as)
mutex_unlock(&c->btree_reserve_cache_lock);
}
-static int bch2_btree_reserve_get(struct btree_update *as,
+static int bch2_btree_reserve_get(struct btree_trans *trans,
+ struct btree_update *as,
unsigned nr_nodes[2],
unsigned flags,
struct closure *cl)
@@ -441,7 +443,7 @@ static int bch2_btree_reserve_get(struct btree_update *as,
struct prealloc_nodes *p = as->prealloc_nodes + interior;
while (p->nr < nr_nodes[interior]) {
- b = __bch2_btree_node_alloc(c, &as->disk_res,
+ b = __bch2_btree_node_alloc(trans, &as->disk_res,
flags & BTREE_INSERT_NOWAIT ? NULL : cl,
interior, flags);
if (IS_ERR(b)) {
@@ -1066,8 +1068,9 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
if (ret)
goto err;
- ret = bch2_btree_reserve_get(as, nr_nodes, flags, NULL);
- if (ret) {
+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
+ if (ret == -EAGAIN ||
+ ret == -ENOMEM) {
struct closure cl;
closure_init_stack(&cl);
@@ -1075,7 +1078,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
bch2_trans_unlock(trans);
do {
- ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl);
+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
closure_sync(&cl);
} while (ret == -EAGAIN);
}
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 5e247235ab69..2c6fdf385ba3 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -296,11 +296,6 @@ static inline int bucket_sectors_fragmented(struct bch_dev *ca,
: 0;
}
-static inline int is_stripe_data_bucket(struct bucket_mark m)
-{
- return m.stripe && m.data_type != BCH_DATA_parity;
-}
-
static inline enum bch_data_type bucket_type(struct bucket_mark m)
{
return m.cached_sectors && !m.dirty_sectors
@@ -350,9 +345,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
preempt_enable();
-
- if (!is_available_bucket(old) && is_available_bucket(new))
- bch2_wake_allocator(ca);
}
static inline int __update_replicas(struct bch_fs *c,
@@ -488,19 +480,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans,
update_replicas_list(trans, &r.e, sectors);
}
-void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, bool owned_by_allocator)
-{
- struct bucket *g = bucket(ca, b);
- struct bucket_mark old, new;
-
- old = bucket_cmpxchg(g, new, ({
- new.owned_by_allocator = owned_by_allocator;
- }));
-
- BUG_ON(owned_by_allocator == old.owned_by_allocator);
-}
-
int bch2_mark_alloc(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
@@ -560,6 +539,10 @@ int bch2_mark_alloc(struct btree_trans *trans,
}
}
+ if (!new_a.data_type &&
+ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk))
+ closure_wake_up(&c->freelist_wait);
+
if (bucket_state(new_a) == BUCKET_need_gc_gens) {
atomic_inc(&c->kick_gc);
wake_up_process(c->gc_thread);
@@ -583,7 +566,6 @@ int bch2_mark_alloc(struct btree_trans *trans,
g->io_time[READ] = new_a.io_time[READ];
g->io_time[WRITE] = new_a.io_time[WRITE];
- g->oldest_gen = new_a.oldest_gen;
g->gen_valid = 1;
g->stripe = new_a.stripe;
g->stripe_redundancy = new_a.stripe_redundancy;
@@ -1861,8 +1843,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
a->v.data_type = type;
a->v.dirty_sectors = sectors;
- ret = bch2_trans_update(trans, &iter, &a->k_i,
- BTREE_UPDATE_NO_KEY_CACHE_COHERENCY);
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
if (ret)
goto out;
out:
@@ -2048,24 +2029,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
struct bucket_array *buckets = NULL, *old_buckets = NULL;
struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
unsigned long *buckets_nouse = NULL;
- alloc_fifo free[RESERVE_NR];
- alloc_fifo free_inc;
- alloc_heap alloc_heap;
-
- size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
- ca->mi.bucket_size / btree_sectors(c));
- /* XXX: these should be tunable */
- size_t reserve_none = max_t(size_t, 1, nbuckets >> 9);
- size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6);
- size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
- btree_reserve * 2);
bool resize = ca->buckets[0] != NULL;
int ret = -ENOMEM;
- unsigned i;
-
- memset(&free, 0, sizeof(free));
- memset(&free_inc, 0, sizeof(free_inc));
- memset(&alloc_heap, 0, sizeof(alloc_heap));
if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
nbuckets * sizeof(struct bucket),
@@ -2075,12 +2040,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
(c->opts.buckets_nouse &&
!(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
- GFP_KERNEL|__GFP_ZERO))) ||
- !init_fifo(&free[RESERVE_movinggc],
- copygc_reserve, GFP_KERNEL) ||
- !init_fifo(&free[RESERVE_none], reserve_none, GFP_KERNEL) ||
- !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) ||
- !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL))
+ GFP_KERNEL|__GFP_ZERO))))
goto err;
buckets->first_bucket = ca->mi.first_bucket;
@@ -2126,18 +2086,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
up_write(&c->gc_lock);
}
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++) {
- fifo_move(&free[i], &ca->free[i]);
- swap(ca->free[i], free[i]);
- }
- fifo_move(&free_inc, &ca->free_inc);
- swap(ca->free_inc, free_inc);
- spin_unlock(&c->freelist_lock);
-
- /* with gc lock held, alloc_heap can't be in use: */
- swap(ca->alloc_heap, alloc_heap);
-
nbuckets = ca->mi.nbuckets;
if (resize)
@@ -2145,10 +2093,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
ret = 0;
err:
- free_heap(&alloc_heap);
- free_fifo(&free_inc);
- for (i = 0; i < RESERVE_NR; i++)
- free_fifo(&free[i]);
kvpfree(buckets_nouse,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
if (bucket_gens)
@@ -2163,10 +2107,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
{
unsigned i;
- free_heap(&ca->alloc_heap);
- free_fifo(&ca->free_inc);
- for (i = 0; i < RESERVE_NR; i++)
- free_fifo(&ca->free[i]);
kvpfree(ca->buckets_nouse,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 757919d5e20f..bcb40f15f82e 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -58,11 +58,6 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
return __bucket(ca, b, true);
}
-static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
-{
- return __bucket(ca, b, false);
-}
-
static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
{
return rcu_dereference_check(ca->bucket_gens,
@@ -151,50 +146,50 @@ static inline bool is_available_bucket(struct bucket_mark mark)
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
- struct bch_dev_usage stats)
+ struct bch_dev_usage stats,
+ enum alloc_reserve reserve)
{
- u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
+ s64 total = ca->mi.nbuckets - ca->mi.first_bucket;
+ s64 reserved = 0;
+
+ switch (reserve) {
+ case RESERVE_none:
+ reserved += ca->mi.nbuckets >> 6;
+ fallthrough;
+ case RESERVE_movinggc:
+ reserved += ca->nr_btree_reserve;
+ fallthrough;
+ case RESERVE_btree:
+ reserved += ca->nr_btree_reserve;
+ fallthrough;
+ case RESERVE_btree_movinggc:
+ break;
+ default:
+ BUG();
+ }
if (WARN_ONCE(stats.buckets_unavailable > total,
"buckets_unavailable overflow (%llu > %llu)\n",
stats.buckets_unavailable, total))
return 0;
- return total - stats.buckets_unavailable;
+ return max_t(s64, 0,
+ total -
+ stats.buckets_unavailable -
+ ca->nr_open_buckets -
+ reserved);
}
-static inline u64 dev_buckets_available(struct bch_dev *ca)
+static inline u64 dev_buckets_available(struct bch_dev *ca,
+ enum alloc_reserve reserve)
{
- return __dev_buckets_available(ca, bch2_dev_usage_read(ca));
-}
-
-static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca,
- struct bch_dev_usage stats)
-{
- struct bch_fs *c = ca->fs;
- s64 available = __dev_buckets_available(ca, stats);
- unsigned i;
-
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++)
- available -= fifo_used(&ca->free[i]);
- available -= fifo_used(&ca->free_inc);
- available -= ca->nr_open_buckets;
- spin_unlock(&c->freelist_lock);
-
- return max(available, 0LL);
-}
-
-static inline u64 dev_buckets_reclaimable(struct bch_dev *ca)
-{
- return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca));
+ return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve);
}
/* Filesystem usage: */
static inline unsigned fs_usage_u64s(struct bch_fs *c)
{
-
return sizeof(struct bch_fs_usage) / sizeof(u64) +
READ_ONCE(c->replicas.nr);
}
@@ -222,7 +217,6 @@ bch2_fs_usage_read_short(struct bch_fs *);
void bch2_fs_usage_initialize(struct bch_fs *);
-void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
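As a worked example of the fallthrough cascade in __dev_buckets_available() above (numbers are illustrative, not from the patch):

	/*
	 * nbuckets = 1 << 20, nr_btree_reserve = 64:
	 *
	 *	RESERVE_btree_movinggc:	reserved = 0
	 *	RESERVE_btree:		reserved = 64
	 *	RESERVE_movinggc:	reserved = 64 + 64		= 128
	 *	RESERVE_none:		reserved = 128 + (nbuckets >> 6)
	 *				         = 128 + 16384		= 16512
	 *
	 * so ordinary foreground writes (RESERVE_none) see the fewest available
	 * buckets, and the btree and copygc reserves can still allocate after
	 * RESERVE_none reports nothing left.
	 */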
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 4f7018398385..6ddbea4da7d1 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -14,7 +14,6 @@ struct bucket_mark {
struct {
u8 gen;
u8 data_type:3,
- owned_by_allocator:1,
stripe:1;
u16 dirty_sectors;
u16 cached_sectors;
@@ -29,7 +28,6 @@ struct bucket {
};
u64 io_time[2];
- u8 oldest_gen;
unsigned gen_valid:1;
u8 stripe_redundancy;
u32 stripe;
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 9dc2f9f822c8..5030a5b831af 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -1295,9 +1295,6 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
BUG_ON(nr_have_data > h->s->nr_data);
BUG_ON(nr_have_parity > h->s->nr_parity);
- percpu_down_read(&c->mark_lock);
- rcu_read_lock();
-
buckets.nr = 0;
if (nr_have_parity < h->s->nr_parity) {
ret = bch2_bucket_alloc_set(c, &buckets,
@@ -1324,7 +1321,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
}
if (ret)
- goto err;
+ return ret;
}
buckets.nr = 0;
@@ -1352,12 +1349,10 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
}
if (ret)
- goto err;
+ return ret;
}
-err:
- rcu_read_unlock();
- percpu_up_read(&c->mark_lock);
- return ret;
+
+ return 0;
}
/* XXX: doesn't obey target: */
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index cb15d1c8a135..f87f76553bf4 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -812,10 +812,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
break;
}
} else {
- rcu_read_lock();
ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none,
false, cl);
- rcu_read_unlock();
if (IS_ERR(ob[nr_got])) {
ret = cl ? -EAGAIN : -ENOSPC;
break;
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index b2c3ee336c1f..3e418342ee67 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1398,6 +1398,10 @@ static void journal_write_done(struct closure *cl)
if (!JSET_NO_FLUSH(w->data)) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;
+
+ closure_wake_up(&c->freelist_wait);
+
+ bch2_reset_alloc_cursors(c);
}
} else if (!j->err_seq || seq < j->err_seq)
j->err_seq = seq;
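On the two calls added to journal_write_done() (reasoning inferred from the allocator changes): __try_alloc_bucket() skips buckets whose previous contents may not yet be on disk, so advancing flushed_seq_ondisk makes those buckets eligible again; the wakeup unblocks allocators waiting on c->freelist_wait, and resetting each device's alloc_cursor lets the next freespace scan revisit buckets the previous pass skipped. The check being satisfied is the one from alloc_foreground.c, quoted here with a comment added:

	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
					     c->journal.flushed_seq_ondisk,
					     ca->dev_idx, bucket)) {
		(*skipped_need_journal_commit)++;	/* eligible again once this flush lands */
		return NULL;
	}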
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index aecec55eb421..b9e1bd7b1d05 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -104,18 +104,6 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
return DATA_SKIP;
}
-static bool have_copygc_reserve(struct bch_dev *ca)
-{
- bool ret;
-
- spin_lock(&ca->fs->freelist_lock);
- ret = fifo_full(&ca->free[RESERVE_movinggc]) ||
- ca->allocator_state != ALLOCATOR_running;
- spin_unlock(&ca->fs->freelist_lock);
-
- return ret;
-}
-
static inline int fragmentation_cmp(copygc_heap *heap,
struct copygc_heap_entry l,
struct copygc_heap_entry r)
@@ -247,11 +235,10 @@ static int bch2_copygc(struct bch_fs *c)
}
for_each_rw_member(ca, c, dev_idx) {
- closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
+ s64 avail = min(dev_buckets_available(ca, RESERVE_movinggc),
+ ca->mi.nbuckets >> 6);
- spin_lock(&ca->fs->freelist_lock);
- sectors_reserved += fifo_used(&ca->free[RESERVE_movinggc]) * ca->mi.bucket_size;
- spin_unlock(&ca->fs->freelist_lock);
+ sectors_reserved += avail * ca->mi.bucket_size;
}
ret = walk_buckets_to_copygc(c);
@@ -352,8 +339,8 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
for_each_rw_member(ca, c, dev_idx) {
struct bch_dev_usage usage = bch2_dev_usage_read(ca);
- fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) *
- ca->mi.bucket_size) >> 1);
+ fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) *
+ ca->mi.bucket_size) >> 1);
fragmented = usage.d[BCH_DATA_user].fragmented;
wait = min(wait, max(0LL, fragmented_allowed - fragmented));
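For scale on the new copygc reserve computation above (numbers are illustrative only):

	/*
	 * nbuckets = 1 << 20, bucket_size = 1024 sectors:
	 *
	 *	avail		  = min(dev_buckets_available(ca, RESERVE_movinggc), 1 << 14);
	 *	sectors_reserved += avail * 1024;	// at most 2^24 sectors, 8 GiB at 512-byte sectors
	 *
	 * i.e. copygc now sizes its reserve from the accounting in
	 * __dev_buckets_available() rather than from a FIFO that no longer exists.
	 */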
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 690a36ea1383..50e5c5e852f7 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -1374,6 +1374,7 @@ int bch2_fs_initialize(struct bch_fs *c)
* Write out the superblock and journal buckets, now that we can do
* btree updates
*/
+ bch_verbose(c, "marking superblocks");
err = "error marking superblock and journal";
for_each_member_device(ca, c, i) {
ret = bch2_trans_mark_dev_sb(c, ca);
@@ -1385,6 +1386,7 @@ int bch2_fs_initialize(struct bch_fs *c)
ca->new_fs_bucket_idx = 0;
}
+ bch_verbose(c, "initializing freespace");
err = "error initializing freespace";
ret = bch2_fs_freespace_init(c);
if (ret)
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index c6585034f4d4..3a8740fde9de 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -206,17 +206,9 @@ static void __bch2_fs_read_only(struct bch_fs *c)
*/
bch2_journal_flush_all_pins(&c->journal);
- /*
- * If the allocator threads didn't all start up, the btree updates to
- * write out alloc info aren't going to work:
- */
- if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags))
- goto nowrote_alloc;
-
bch_verbose(c, "flushing journal and stopping allocators");
bch2_journal_flush_all_pins(&c->journal);
- set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
do {
clean_passes++;
@@ -241,17 +233,11 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch_verbose(c, "flushing journal and stopping allocators complete");
set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
-nowrote_alloc:
+
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
flush_work(&c->btree_interior_update_work);
- for_each_member_device(ca, c, i)
- bch2_dev_allocator_stop(ca);
-
- clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
- clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
-
bch2_fs_journal_stop(&c->journal);
/*
@@ -287,10 +273,6 @@ void bch2_fs_read_only(struct bch_fs *c)
/*
* Block new foreground-end write operations from starting - any new
* writes will return -EROFS:
- *
- * (This is really blocking new _allocations_, writes to previously
- * allocated space can still happen until stopping the allocator in
- * bch2_dev_allocator_stop()).
*/
percpu_ref_kill(&c->writes);
@@ -419,20 +401,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- for_each_rw_member(ca, c, i) {
- ret = bch2_dev_allocator_start(ca);
- if (ret) {
- bch_err(c, "error starting allocator threads");
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
- }
-
- set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
-
- for_each_rw_member(ca, c, i)
- bch2_wake_allocator(ca);
-
if (!early) {
ret = bch2_fs_read_write_late(c);
if (ret)
@@ -946,20 +914,6 @@ int bch2_fs_start(struct bch_fs *c)
set_bit(BCH_FS_STARTED, &c->flags);
- /*
- * Allocator threads don't start filling copygc reserve until after we
- * set BCH_FS_STARTED - wake them now:
- *
- * XXX ugly hack:
- * Need to set ca->allocator_state here instead of relying on the
- * allocator threads to do it to avoid racing with the copygc threads
- * checking it and thinking they have no alloc reserve:
- */
- for_each_online_member(ca, c, i) {
- ca->allocator_state = ALLOCATOR_running;
- bch2_wake_allocator(ca);
- }
-
if (c->opts.read_only || c->opts.nochanges) {
bch2_fs_read_only(c);
} else {
@@ -1051,8 +1005,6 @@ static void bch2_dev_release(struct kobject *kobj)
static void bch2_dev_free(struct bch_dev *ca)
{
- bch2_dev_allocator_stop(ca);
-
cancel_work_sync(&ca->io_error_work);
if (ca->kobj.state_in_sysfs &&
@@ -1167,6 +1119,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
ca->mi = bch2_mi_to_cpu(member);
ca->uuid = member->uuid;
+ ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
+ ca->mi.bucket_size / btree_sectors(c));
+
if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) ||
percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
@@ -1216,12 +1171,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
ca->fs = c;
- if (ca->mi.state == BCH_MEMBER_STATE_rw &&
- bch2_dev_allocator_start(ca)) {
- bch2_dev_free(ca);
- goto err;
- }
-
bch2_dev_attach(c, ca, dev_idx);
out:
pr_verbose_init(c->opts, "ret %i", ret);
@@ -1405,14 +1354,13 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
/*
* The allocator thread itself allocates btree nodes, so stop it first:
*/
- bch2_dev_allocator_stop(ca);
bch2_dev_allocator_remove(c, ca);
bch2_dev_journal_stop(&c->journal, ca);
bch2_copygc_start(c);
}
-static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
+static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
{
lockdep_assert_held(&c->state_lock);
@@ -1420,8 +1368,6 @@ static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
-
- return bch2_dev_allocator_start(ca);
}
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
@@ -1448,7 +1394,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
mutex_unlock(&c->sb_lock);
if (new_state == BCH_MEMBER_STATE_rw)
- ret = __bch2_dev_read_write(c, ca);
+ __bch2_dev_read_write(c, ca);
rebalance_wakeup(c);
@@ -1710,13 +1656,8 @@ have_slot:
ca->new_fs_bucket_idx = 0;
- if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- ret = __bch2_dev_read_write(c, ca);
- if (ret) {
- bch_err(c, "device add error: error going RW on new device: %i", ret);
- goto err_late;
- }
- }
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
up_write(&c->state_lock);
return 0;
@@ -1776,11 +1717,8 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
goto err;
}
- if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- ret = __bch2_dev_read_write(c, ca);
- if (ret)
- goto err;
- }
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb.sb);
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index ec672134cb18..e995b84b6172 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -170,7 +170,6 @@ read_attribute(congested);
read_attribute(btree_avg_write_size);
-read_attribute(reserve_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
read_attribute(journal_debug);
@@ -186,11 +185,11 @@ read_attribute(internal_uuid);
read_attribute(has_data);
read_attribute(alloc_debug);
-write_attribute(wake_allocator);
read_attribute(read_realloc_races);
read_attribute(extent_migrate_done);
read_attribute(extent_migrate_raced);
+read_attribute(bucket_alloc_fail);
rw_attribute(discard);
rw_attribute(label);
@@ -377,6 +376,8 @@ SHOW(bch2_fs)
atomic_long_read(&c->extent_migrate_done));
sysfs_print(extent_migrate_raced,
atomic_long_read(&c->extent_migrate_raced));
+ sysfs_print(bucket_alloc_fail,
+ atomic_long_read(&c->bucket_alloc_fail));
sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
@@ -577,6 +578,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_read_realloc_races,
&sysfs_extent_migrate_done,
&sysfs_extent_migrate_raced,
+ &sysfs_bucket_alloc_fail,
&sysfs_gc_gens_pos,
@@ -705,24 +707,6 @@ struct attribute *bch2_fs_time_stats_files[] = {
NULL
};
-static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca)
-{
- enum alloc_reserve i;
-
- spin_lock(&ca->fs->freelist_lock);
-
- pr_buf(out, "free_inc:\t%zu\t%zu\n",
- fifo_used(&ca->free_inc),
- ca->free_inc.size);
-
- for (i = 0; i < RESERVE_NR; i++)
- pr_buf(out, "free[%u]:\t%zu\t%zu\n", i,
- fifo_used(&ca->free[i]),
- ca->free[i].size);
-
- spin_unlock(&ca->fs->freelist_lock);
-}
-
static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
@@ -748,9 +732,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
"ec\t%16llu\n"
"available%15llu\n"
"\n"
- "free_inc\t\t%zu/%zu\n"
- "free[RESERVE_MOVINGGC]\t%zu/%zu\n"
- "free[RESERVE_NONE]\t%zu/%zu\n"
"freelist_wait\t\t%s\n"
"open buckets allocated\t%u\n"
"open buckets this dev\t%u\n"
@@ -758,13 +739,9 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
"open_buckets_wait\t%s\n"
"open_buckets_btree\t%u\n"
"open_buckets_user\t%u\n"
- "btree reserve cache\t%u\n"
- "thread state:\t\t%s\n",
+ "btree reserve cache\t%u\n",
stats.buckets_ec,
- __dev_buckets_available(ca, stats),
- fifo_used(&ca->free_inc), ca->free_inc.size,
- fifo_used(&ca->free[RESERVE_movinggc]), ca->free[RESERVE_movinggc].size,
- fifo_used(&ca->free[RESERVE_none]), ca->free[RESERVE_none].size,
+ __dev_buckets_available(ca, stats, RESERVE_none),
c->freelist_wait.list.first ? "waiting" : "empty",
OPEN_BUCKETS_COUNT - c->open_buckets_nr_free,
ca->nr_open_buckets,
@@ -772,8 +749,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
c->open_buckets_wait.list.first ? "waiting" : "empty",
nr[BCH_DATA_btree],
nr[BCH_DATA_user],
- c->btree_reserve_cache_nr,
- bch2_allocator_states[ca->allocator_state]);
+ c->btree_reserve_cache_nr);
}
static const char * const bch2_rw[] = {
@@ -848,9 +824,6 @@ SHOW(bch2_dev)
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
* 100 / CONGESTED_MAX);
- if (attr == &sysfs_reserve_stats)
- reserve_stats_to_text(out, ca);
-
if (attr == &sysfs_alloc_debug)
dev_alloc_debug_to_text(out, ca);
@@ -890,9 +863,6 @@ STORE(bch2_dev)
return ret;
}
- if (attr == &sysfs_wake_allocator)
- bch2_wake_allocator(ca);
-
return size;
}
SYSFS_OPS(bch2_dev);
@@ -918,11 +888,8 @@ struct attribute *bch2_dev_files[] = {
&sysfs_io_latency_stats_write,
&sysfs_congested,
- &sysfs_reserve_stats,
-
/* debug: */
&sysfs_alloc_debug,
- &sysfs_wake_allocator,
NULL
};
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 89207fd7b617..caf59b977e2f 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -471,37 +471,74 @@ TRACE_EVENT(invalidate,
);
DECLARE_EVENT_CLASS(bucket_alloc,
- TP_PROTO(struct bch_dev *ca, const char *alloc_reserve),
- TP_ARGS(ca, alloc_reserve),
+ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+ u64 avail,
+ u64 seen,
+ u64 open,
+ u64 need_journal_commit,
+ u64 nouse,
+ bool nonblocking,
+ int ret),
+ TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret),
TP_STRUCT__entry(
- __field(dev_t, dev )
- __array(char, reserve, 16 )
+ __field(dev_t, dev )
+ __array(char, reserve, 16 )
+ __field(u64, avail )
+ __field(u64, seen )
+ __field(u64, open )
+ __field(u64, need_journal_commit )
+ __field(u64, nouse )
+ __field(bool, nonblocking )
+ __field(int, ret )
),
TP_fast_assign(
__entry->dev = ca->dev;
strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve));
+ __entry->avail = avail;
+ __entry->seen = seen;
+ __entry->open = open;
+ __entry->need_journal_commit = need_journal_commit;
+ __entry->nouse = nouse;
+ __entry->nonblocking = nonblocking;
+ __entry->ret = ret;
),
- TP_printk("%d,%d reserve %s",
+ TP_printk("%d,%d reserve %s avail %llu seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u ret %i",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->reserve)
+ __entry->reserve,
+ __entry->avail,
+ __entry->seen,
+ __entry->open,
+ __entry->need_journal_commit,
+ __entry->nouse,
+ __entry->nonblocking,
+ __entry->ret)
);
DEFINE_EVENT(bucket_alloc, bucket_alloc,
- TP_PROTO(struct bch_dev *ca, const char *alloc_reserve),
- TP_ARGS(ca, alloc_reserve)
+ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+ u64 avail,
+ u64 seen,
+ u64 open,
+ u64 need_journal_commit,
+ u64 nouse,
+ bool nonblocking,
+ int ret),
+ TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret)
);
DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
- TP_PROTO(struct bch_dev *ca, const char *alloc_reserve),
- TP_ARGS(ca, alloc_reserve)
-);
-
-DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail,
- TP_PROTO(struct bch_dev *ca, const char *alloc_reserve),
- TP_ARGS(ca, alloc_reserve)
+ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+ u64 avail,
+ u64 seen,
+ u64 open,
+ u64 need_journal_commit,
+ u64 nouse,
+ bool nonblocking,
+ int ret),
+ TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret)
);
/* Moving IO */