-rw-r--r--  .bcachefs_revision  2
-rw-r--r--  include/trace/events/bcachefs.h  102
-rw-r--r--  libbcachefs/alloc_background.c  140
-rw-r--r--  libbcachefs/bcachefs.h  4
-rw-r--r--  libbcachefs/btree_cache.c  4
-rw-r--r--  libbcachefs/btree_gc.c  150
-rw-r--r--  libbcachefs/btree_iter.c  194
-rw-r--r--  libbcachefs/btree_iter.h  26
-rw-r--r--  libbcachefs/btree_key_cache.c  3
-rw-r--r--  libbcachefs/btree_types.h  1
-rw-r--r--  libbcachefs/btree_update.h  23
-rw-r--r--  libbcachefs/btree_update_interior.c  1
-rw-r--r--  libbcachefs/btree_update_leaf.c  214
-rw-r--r--  libbcachefs/buckets.c  96
-rw-r--r--  libbcachefs/buckets.h  13
-rw-r--r--  libbcachefs/buckets_types.h  9
-rw-r--r--  libbcachefs/buckets_waiting_for_journal.c  143
-rw-r--r--  libbcachefs/buckets_waiting_for_journal.h  13
-rw-r--r--  libbcachefs/buckets_waiting_for_journal_types.h  16
-rw-r--r--  libbcachefs/inode.c  57
-rw-r--r--  libbcachefs/journal_io.c  4
-rw-r--r--  libbcachefs/move.c  13
-rw-r--r--  libbcachefs/movinggc.c  26
-rw-r--r--  libbcachefs/subvolume.c  10
-rw-r--r--  libbcachefs/subvolume.h  4
-rw-r--r--  libbcachefs/super.c  3
-rw-r--r--  libbcachefs/sysfs.c  36
-rw-r--r--  libbcachefs/tests.c  71
28 files changed, 888 insertions, 490 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 8226b3a6..71e83e28 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-5242db9aec10220b6ee7162ba7bec173417348cf
+bf340e68c74cdb70c692698ef7367b9dc6f6e61f
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index 295dcd60..8f10d13b 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -346,6 +346,52 @@ TRACE_EVENT(btree_cache_scan,
__entry->ret)
);
+TRACE_EVENT(btree_node_relock_fail,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ enum btree_id btree_id,
+ struct bpos *pos,
+ unsigned long node,
+ u32 iter_lock_seq,
+ u32 node_lock_seq),
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos, node, iter_lock_seq, node_lock_seq),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 24 )
+ __array(char, caller, 32 )
+ __field(u8, btree_id )
+ __field(u64, pos_inode )
+ __field(u64, pos_offset )
+ __field(u32, pos_snapshot )
+ __field(unsigned long, node )
+ __field(u32, iter_lock_seq )
+ __field(u32, node_lock_seq )
+ ),
+
+ TP_fast_assign(
+ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+ snprintf(__entry->caller, sizeof(__entry->caller), "%pS", (void *) caller_ip);
+ __entry->btree_id = btree_id;
+ __entry->pos_inode = pos->inode;
+ __entry->pos_offset = pos->offset;
+ __entry->pos_snapshot = pos->snapshot;
+ __entry->node = node;
+ __entry->iter_lock_seq = iter_lock_seq;
+ __entry->node_lock_seq = node_lock_seq;
+ ),
+
+ TP_printk("%s %s btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u",
+ __entry->trans_fn,
+ __entry->caller,
+ __entry->btree_id,
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->node,
+ __entry->iter_lock_seq,
+ __entry->node_lock_seq)
+);
+
/* Garbage collection */
DEFINE_EVENT(btree_node, btree_gc_rewrite_node,
@@ -621,7 +667,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter,
TP_STRUCT__entry(
__array(char, trans_fn, 24 )
- __field(unsigned long, caller_ip )
+ __array(char, caller, 32 )
__field(u8, btree_id )
__field(u64, pos_inode )
__field(u64, pos_offset )
@@ -630,16 +676,16 @@ DECLARE_EVENT_CLASS(transaction_restart_iter,
TP_fast_assign(
strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
+ snprintf(__entry->caller, sizeof(__entry->caller), "%pS", (void *) caller_ip);
__entry->btree_id = btree_id;
__entry->pos_inode = pos->inode;
__entry->pos_offset = pos->offset;
__entry->pos_snapshot = pos->snapshot;
),
- TP_printk("%s %pS btree %u pos %llu:%llu:%u",
+ TP_printk("%s %s btree %u pos %llu:%llu:%u",
__entry->trans_fn,
- (void *) __entry->caller_ip,
+ __entry->caller,
__entry->btree_id,
__entry->pos_inode,
__entry->pos_offset,
@@ -694,6 +740,54 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
TP_ARGS(trans_fn, caller_ip, btree_id, pos)
);
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ enum btree_id btree_id,
+ struct bpos *pos),
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ enum btree_id btree_id,
+ struct bpos *pos),
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ enum btree_id btree_id,
+ struct bpos *pos),
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ enum btree_id btree_id,
+ struct bpos *pos),
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ enum btree_id btree_id,
+ struct bpos *pos),
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ enum btree_id btree_id,
+ struct bpos *pos),
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse,
TP_PROTO(const char *trans_fn,
unsigned long caller_ip,
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 688a53b4..7ad16c21 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -9,6 +9,7 @@
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
@@ -463,19 +464,20 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
+ struct bkey_s_c k;
struct bkey_alloc_unpacked u;
u64 *time, now;
int ret = 0;
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr),
BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(&iter);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
if (ret)
goto out;
- u = alloc_mem_to_key(c, &iter);
+ u = bch2_alloc_unpack(k);
time = rw == READ ? &u.read_time : &u.write_time;
now = atomic64_read(&c->io_clock[rw].now);
@@ -542,7 +544,7 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
u64 now, u64 last_seq_ondisk)
{
- unsigned used = bucket_sectors_used(m);
+ unsigned used = m.cached_sectors;
if (used) {
/*
@@ -561,8 +563,7 @@ static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
* keys when there's only a small difference, so that we can
* keep sequential buckets together:
*/
- return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
- (bucket_gc_gen(g) >> 4);
+ return bucket_gc_gen(g) >> 4;
}
}
@@ -611,6 +612,13 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
if (!bch2_can_invalidate_bucket(ca, b, m))
continue;
+ if (!m.data_type &&
+ bch2_bucket_needs_journal_commit(c, last_seq_ondisk,
+ ca->dev_idx, b)) {
+ ca->buckets_waiting_on_journal++;
+ continue;
+ }
+
if (e.nr && e.bucket + e.nr == b && e.key == key) {
e.nr++;
} else {
@@ -647,6 +655,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
ca->inc_gen_needs_gc = 0;
ca->inc_gen_really_needs_gc = 0;
+ ca->buckets_waiting_on_journal = 0;
find_reclaimable_buckets_lru(c, ca);
@@ -658,56 +667,34 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
return nr;
}
-/*
- * returns sequence number of most recent journal entry that updated this
- * bucket:
- */
-static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
-{
- if (m.journal_seq_valid) {
- u64 journal_seq = atomic64_read(&c->journal.seq);
- u64 bucket_seq = journal_seq;
-
- bucket_seq &= ~((u64) U16_MAX);
- bucket_seq |= m.journal_seq;
-
- if (bucket_seq > journal_seq)
- bucket_seq -= 1 << 16;
-
- return bucket_seq;
- } else {
- return 0;
- }
-}
-
static int bucket_invalidate_btree(struct btree_trans *trans,
- struct bch_dev *ca, u64 b)
+ struct bch_dev *ca, u64 b,
+ struct bkey_alloc_unpacked *u)
{
struct bch_fs *c = trans->c;
- struct bkey_alloc_unpacked u;
struct btree_iter iter;
+ struct bkey_s_c k;
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
POS(ca->dev_idx, b),
BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(&iter);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
if (ret)
goto err;
- u = alloc_mem_to_key(c, &iter);
-
- u.gen++;
- u.data_type = 0;
- u.dirty_sectors = 0;
- u.cached_sectors = 0;
- u.read_time = atomic64_read(&c->io_clock[READ].now);
- u.write_time = atomic64_read(&c->io_clock[WRITE].now);
+ *u = bch2_alloc_unpack(k);
+ u->gen++;
+ u->data_type = 0;
+ u->dirty_sectors = 0;
+ u->cached_sectors = 0;
+ u->read_time = atomic64_read(&c->io_clock[READ].now);
+ u->write_time = atomic64_read(&c->io_clock[WRITE].now);
- ret = bch2_alloc_write(trans, &iter, &u,
+ ret = bch2_alloc_write(trans, &iter, u,
BTREE_TRIGGER_BUCKET_INVALIDATE);
err:
bch2_trans_iter_exit(trans, &iter);
@@ -717,21 +704,24 @@ err:
static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
u64 *journal_seq, unsigned flags)
{
- struct bucket *g;
- struct bucket_mark m;
+ struct bkey_alloc_unpacked u;
size_t b;
+ u64 commit_seq = 0;
int ret = 0;
+ /*
+ * If the read-only path is trying to shut down, we can't be generating
+ * new btree updates:
+ */
+ if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags))
+ return 1;
+
BUG_ON(!ca->alloc_heap.used ||
!ca->alloc_heap.data[0].nr);
b = ca->alloc_heap.data[0].bucket;
/* first, put on free_inc and mark as owned by allocator: */
percpu_down_read(&c->mark_lock);
- g = bucket(ca, b);
- m = READ_ONCE(g->mark);
-
- BUG_ON(m.dirty_sectors);
bch2_mark_alloc_bucket(c, ca, b, true);
@@ -740,38 +730,15 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
BUG_ON(!fifo_push(&ca->free_inc, b));
spin_unlock(&c->freelist_lock);
- /*
- * If we're not invalidating cached data, we only increment the bucket
- * gen in memory here, the incremented gen will be updated in the btree
- * by bch2_trans_mark_pointer():
- */
- if (!m.cached_sectors &&
- !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) {
- BUG_ON(m.data_type);
- bucket_cmpxchg(g, m, m.gen++);
- *bucket_gen(ca, b) = m.gen;
- percpu_up_read(&c->mark_lock);
- goto out;
- }
-
percpu_up_read(&c->mark_lock);
- /*
- * If the read-only path is trying to shut down, we can't be generating
- * new btree updates:
- */
- if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) {
- ret = 1;
- goto out;
- }
-
- ret = bch2_trans_do(c, NULL, journal_seq,
+ ret = bch2_trans_do(c, NULL, &commit_seq,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_RESERVED|
flags,
- bucket_invalidate_btree(&trans, ca, b));
-out:
+ bucket_invalidate_btree(&trans, ca, b, &u));
+
if (!ret) {
/* remove from alloc_heap: */
struct alloc_heap_entry e, *top = ca->alloc_heap.data;
@@ -783,11 +750,17 @@ out:
heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
/*
- * Make sure we flush the last journal entry that updated this
- * bucket (i.e. deleting the last reference) before writing to
- * this bucket again:
+	 * If we're invalidating cached data, we need to wait on the
+ * journal commit:
+ */
+ if (u.data_type)
+ *journal_seq = max(*journal_seq, commit_seq);
+
+ /*
+	 * We already waited on u.journal_seq when we filtered out
+ * buckets that need journal commit:
*/
- *journal_seq = max(*journal_seq, bucket_journal_seq(c, m));
+ BUG_ON(*journal_seq > u.journal_seq);
} else {
size_t b2;
@@ -954,8 +927,14 @@ static int bch2_allocator_thread(void *arg)
gc_count = c->gc_count;
nr = find_reclaimable_buckets(c, ca);
- trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
- ca->inc_gen_really_needs_gc);
+ if (!nr && ca->buckets_waiting_on_journal) {
+ ret = bch2_journal_flush(&c->journal);
+ if (ret)
+ goto stop;
+ } else if (nr < (ca->mi.nbuckets >> 6) &&
+ ca->buckets_waiting_on_journal >= nr / 2) {
+ bch2_journal_flush_async(&c->journal, NULL);
+ }
if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
ca->inc_gen_really_needs_gc) &&
@@ -963,6 +942,9 @@ static int bch2_allocator_thread(void *arg)
atomic_inc(&c->kick_gc);
wake_up_process(c->gc_thread);
}
+
+ trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
+ ca->inc_gen_really_needs_gc);
}
ret = bch2_invalidate_buckets(c, ca);
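
A note on the deleted bucket_journal_seq(): bucket_mark stored only the low 16 bits of the journal sequence, so the full value had to be reconstructed relative to the current journal position, and bch2_bucket_seq_cleanup() (removed from buckets.c below) had to run periodically to keep wraparound at bay. A standalone sketch of that reconstruction, with illustrative numbers:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the deleted bucket_journal_seq(): rebuild a full sequence
 * number from its low 16 bits plus the current journal sequence. */
static uint64_t bucket_journal_seq(uint64_t journal_seq, uint16_t bucket_lo16)
{
	uint64_t bucket_seq = (journal_seq & ~(uint64_t) UINT16_MAX) | bucket_lo16;

	if (bucket_seq > journal_seq)		/* low bits wrapped since then */
		bucket_seq -= 1 << 16;

	return bucket_seq;
}

int main(void)
{
	/* journal at 70000, bucket last touched at 69000 (low 16 bits: 3464) */
	printf("%llu\n", (unsigned long long) bucket_journal_seq(70000, 3464));
	/* prints 69000 -- but any bucket older than 2^16 entries aliases,
	 * which is what the new buckets_waiting_for_journal table avoids */
	return 0;
}
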
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index c64db2bf..a28ddcd5 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -355,6 +355,7 @@ enum bch_time_stats {
#include "alloc_types.h"
#include "btree_types.h"
#include "buckets_types.h"
+#include "buckets_waiting_for_journal_types.h"
#include "clock_types.h"
#include "ec_types.h"
#include "journal_types.h"
@@ -482,6 +483,7 @@ struct bch_dev {
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
+ size_t buckets_waiting_on_journal;
enum allocator_states allocator_state;
@@ -777,6 +779,8 @@ struct bch_fs {
struct mutex write_points_hash_lock;
unsigned write_points_nr;
+ struct buckets_waiting_for_journal buckets_waiting_for_journal;
+
/* GARBAGE COLLECTION */
struct task_struct *gc_thread;
atomic_t kick_gc;
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index fc6c4d4c..986d08d7 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -666,6 +666,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
* been freed:
*/
if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
+ trace_trans_restart_relock_parent_for_fill(trans->fn,
+ _THIS_IP_, btree_id, &path->pos);
btree_trans_restart(trans);
return ERR_PTR(-EINTR);
}
@@ -713,6 +715,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
}
if (!six_relock_type(&b->c.lock, lock_type, seq)) {
+ trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_,
+ btree_id, &path->pos);
btree_trans_restart(trans);
return ERR_PTR(-EINTR);
}
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index a201052e..809c9a76 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -604,8 +604,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
if (data_type == BCH_DATA_btree) {
g2->_mark.data_type = g->_mark.data_type = data_type;
- g2->gen_valid = g->gen_valid = true;
set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
} else {
do_update = true;
}
@@ -1327,12 +1327,6 @@ static int bch2_gc_start(struct bch_fs *c,
percpu_down_write(&c->mark_lock);
- /*
- * indicate to stripe code that we need to allocate for the gc stripes
- * radix tree, too
- */
- gc_pos_set(c, gc_phase(GC_PHASE_START));
-
for_each_member_device(ca, c, i) {
struct bucket_array *dst = __bucket_array(ca, 1);
struct bucket_array *src = __bucket_array(ca, 0);
@@ -1360,6 +1354,27 @@ static int bch2_gc_start(struct bch_fs *c,
return 0;
}
+static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_member_device(ca, c, i) {
+ struct bucket_array *buckets = __bucket_array(ca, true);
+ struct bucket *g;
+
+ for_each_bucket(g, buckets) {
+ if (metadata_only &&
+ (g->mark.data_type == BCH_DATA_user ||
+ g->mark.data_type == BCH_DATA_cached ||
+ g->mark.data_type == BCH_DATA_parity))
+ continue;
+ g->_mark.dirty_sectors = 0;
+ g->_mark.cached_sectors = 0;
+ }
+	}
+}
+
static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
bool metadata_only)
{
@@ -1430,6 +1445,55 @@ fsck_err:
return ret;
}
+static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
+ bool metadata_only)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct reflink_gc *r;
+ int ret = 0;
+
+ if (metadata_only)
+ return 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+ c->reflink_gc_nr = 0;
+
+ for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ const __le64 *refcount = bkey_refcount_c(k);
+
+ if (!refcount)
+ continue;
+
+ r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
+ GFP_KERNEL);
+ if (!r) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ r->offset = k.k->p.offset;
+ r->size = k.k->size;
+ r->refcount = 0;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
+static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial,
+ bool metadata_only)
+{
+ struct genradix_iter iter;
+ struct reflink_gc *r;
+
+ genradix_for_each(&c->reflink_gc_table, iter, r)
+ r->refcount = 0;
+}
+
static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
bool metadata_only)
{
@@ -1493,43 +1557,10 @@ fsck_err:
return ret;
}
-static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
- bool metadata_only)
+static void bch2_gc_stripes_reset(struct bch_fs *c, bool initial,
+ bool metadata_only)
{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct reflink_gc *r;
- int ret = 0;
-
- if (metadata_only)
- return 0;
-
- bch2_trans_init(&trans, c, 0, 0);
- c->reflink_gc_nr = 0;
-
- for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
- const __le64 *refcount = bkey_refcount_c(k);
-
- if (!refcount)
- continue;
-
- r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
- GFP_KERNEL);
- if (!r) {
- ret = -ENOMEM;
- break;
- }
-
- r->offset = k.k->p.offset;
- r->size = k.k->size;
- r->refcount = 0;
- }
- bch2_trans_iter_exit(&trans, &iter);
-
- bch2_trans_exit(&trans);
- return ret;
+ genradix_free(&c->gc_stripes);
}
/**
@@ -1565,11 +1596,13 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
/* flush interior btree updates: */
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
-again:
+
ret = bch2_gc_start(c, metadata_only) ?:
bch2_gc_reflink_start(c, initial, metadata_only);
if (ret)
goto out;
+again:
+ gc_pos_set(c, gc_phase(GC_PHASE_START));
bch2_mark_superblocks(c);
@@ -1607,25 +1640,26 @@ again:
if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
(!iter && bch2_test_restart_gc)) {
+ if (iter++ > 2) {
+ bch_info(c, "Unable to fix bucket gens, looping");
+ ret = -EINVAL;
+ goto out;
+ }
+
/*
* XXX: make sure gens we fixed got saved
*/
- if (iter++ <= 2) {
- bch_info(c, "Second GC pass needed, restarting:");
- clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
- __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
-
- percpu_down_write(&c->mark_lock);
- bch2_gc_free(c);
- percpu_up_write(&c->mark_lock);
- /* flush fsck errors, reset counters */
- bch2_flush_fsck_errs(c);
+ bch_info(c, "Second GC pass needed, restarting:");
+ clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
- goto again;
- }
+ bch2_gc_stripes_reset(c, initial, metadata_only);
+ bch2_gc_alloc_reset(c, initial, metadata_only);
+ bch2_gc_reflink_reset(c, initial, metadata_only);
- bch_info(c, "Unable to fix bucket gens, looping");
- ret = -EINVAL;
+ /* flush fsck errors, reset counters */
+ bch2_flush_fsck_errs(c);
+ goto again;
}
out:
if (!ret) {
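
Taken together, the btree_gc.c hunks above change the retry shape of bch2_gc(): the shadow structures are now allocated once, before the again: label, and a second pass zeroes them in place with the new *_reset() helpers rather than freeing and reallocating under mark_lock. Reassembled from the hunks, the resulting control flow looks roughly like this:

	ret = bch2_gc_start(c, metadata_only) ?:
	      bch2_gc_reflink_start(c, initial, metadata_only);
	if (ret)
		goto out;
again:
	gc_pos_set(c, gc_phase(GC_PHASE_START));

	/* ... mark superblocks, btree keys, pending btree node frees ... */

	if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
	    (!iter && bch2_test_restart_gc)) {
		if (iter++ > 2) {
			bch_info(c, "Unable to fix bucket gens, looping");
			ret = -EINVAL;
			goto out;
		}

		bch_info(c, "Second GC pass needed, restarting:");
		clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
		__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));

		/* reset in place instead of bch2_gc_free() + reallocate: */
		bch2_gc_stripes_reset(c, initial, metadata_only);
		bch2_gc_alloc_reset(c, initial, metadata_only);
		bch2_gc_reflink_reset(c, initial, metadata_only);

		bch2_flush_fsck_errs(c);
		goto again;
	}
out:
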
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 2ae4e523..efe9b8cb 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -178,19 +178,25 @@ bool __bch2_btree_node_relock(struct btree_trans *trans,
int want = __btree_lock_want(path, level);
if (!is_btree_node(path, level))
- return false;
+ goto fail;
if (race_fault())
- return false;
+ goto fail;
if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
(btree_node_lock_seq_matches(path, b, level) &&
btree_node_lock_increment(trans, b, level, want))) {
mark_btree_node_locked(path, level, want);
return true;
- } else {
- return false;
}
+fail:
+ trace_btree_node_relock_fail(trans->fn, _RET_IP_,
+ path->btree_id,
+ &path->pos,
+ (unsigned long) b,
+ path->l[level].lock_seq,
+ is_btree_node(path, level) ? b->c.lock.state.seq : 0);
+ return false;
}
bool bch2_btree_node_upgrade(struct btree_trans *trans,
@@ -237,7 +243,7 @@ success:
static inline bool btree_path_get_locks(struct btree_trans *trans,
struct btree_path *path,
- bool upgrade, unsigned long trace_ip)
+ bool upgrade)
{
unsigned l = path->level;
int fail_idx = -1;
@@ -440,6 +446,8 @@ bool bch2_btree_path_relock_intent(struct btree_trans *trans,
if (!bch2_btree_node_relock(trans, path, l)) {
__bch2_btree_path_unlock(path);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_,
+ path->btree_id, &path->pos);
btree_trans_restart(trans);
return false;
}
@@ -452,10 +460,13 @@ __flatten
static bool bch2_btree_path_relock(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
{
- bool ret = btree_path_get_locks(trans, path, false, trace_ip);
+ bool ret = btree_path_get_locks(trans, path, false);
- if (!ret)
+ if (!ret) {
+ trace_trans_restart_relock_path(trans->fn, trace_ip,
+ path->btree_id, &path->pos);
btree_trans_restart(trans);
+ }
return ret;
}
@@ -469,7 +480,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
path->locks_want = new_locks_want;
- if (btree_path_get_locks(trans, path, true, _THIS_IP_))
+ if (btree_path_get_locks(trans, path, true))
return true;
/*
@@ -497,7 +508,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
linked->btree_id == path->btree_id &&
linked->locks_want < new_locks_want) {
linked->locks_want = new_locks_want;
- btree_path_get_locks(trans, linked, true, _THIS_IP_);
+ btree_path_get_locks(trans, linked, true);
}
return false;
@@ -701,9 +712,6 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
- BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
- iter->pos.snapshot != iter->snapshot);
-
BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
@@ -711,6 +719,8 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
!btree_type_has_snapshots(iter->btree_id));
+ if (iter->update_path)
+ bch2_btree_path_verify(trans, iter->update_path);
bch2_btree_path_verify(trans, iter->path);
}
@@ -1962,7 +1972,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
locks_want = min(locks_want, BTREE_MAX_DEPTH);
if (locks_want > path->locks_want) {
path->locks_want = locks_want;
- btree_path_get_locks(trans, path, true, _THIS_IP_);
+ btree_path_get_locks(trans, path, true);
}
return path;
@@ -2099,6 +2109,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
__bch2_btree_path_unlock(path);
path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS;
path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS;
+ trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_,
+ path->btree_id, &path->pos);
btree_trans_restart(trans);
ret = -EINTR;
goto err;
@@ -2182,6 +2194,23 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
return ret;
}
+static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bpos pos)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i)
+ if ((cmp_int(btree_id, i->btree_id) ?:
+ bpos_cmp(pos, i->k->k.p)) <= 0) {
+ if (btree_id == i->btree_id)
+ return i->k;
+ break;
+ }
+
+ return NULL;
+}
+
static noinline
struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans,
struct btree_path *path)
@@ -2218,21 +2247,15 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
return k;
}
-/**
- * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
- * current position
- */
-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
{
struct btree_trans *trans = iter->trans;
- struct bpos search_key = btree_iter_search_key(iter);
struct bkey_i *next_update;
struct bkey_s_c k;
int ret;
EBUG_ON(iter->path->cached || iter->path->level);
bch2_btree_iter_verify(iter);
- bch2_btree_iter_verify_entry_exit(iter);
while (1) {
iter->path = btree_path_set_pos(trans, iter->path, search_key,
@@ -2277,24 +2300,6 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
}
if (likely(k.k)) {
- /*
- * We can never have a key in a leaf node at POS_MAX, so
- * we don't have to check these successor() calls:
- */
- if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
- !bch2_snapshot_is_ancestor(trans->c,
- iter->snapshot,
- k.k->p.snapshot)) {
- search_key = bpos_successor(k.k->p);
- continue;
- }
-
- if (bkey_whiteout(k.k) &&
- !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
- search_key = bkey_successor(iter, k.k->p);
- continue;
- }
-
break;
} else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) {
/* Advance to next leaf node: */
@@ -2306,6 +2311,92 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
goto out;
}
}
+out:
+ bch2_btree_iter_verify(iter);
+
+ return k;
+}
+
+/**
+ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
+ * current position
+ */
+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bpos search_key = btree_iter_search_key(iter);
+ struct bkey_s_c k;
+ int ret;
+
+ if (iter->update_path) {
+ bch2_path_put(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = NULL;
+ }
+
+ bch2_btree_iter_verify_entry_exit(iter);
+
+ while (1) {
+ k = __bch2_btree_iter_peek(iter, search_key);
+ if (!k.k || bkey_err(k))
+ goto out;
+
+ if (iter->update_path &&
+ bkey_cmp(iter->update_path->pos, k.k->p)) {
+ bch2_path_put(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = NULL;
+ }
+
+ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ (iter->flags & BTREE_ITER_INTENT) &&
+ !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ !iter->update_path) {
+ struct bpos pos = k.k->p;
+
+ if (pos.snapshot < iter->snapshot) {
+ search_key = bpos_successor(k.k->p);
+ continue;
+ }
+
+ pos.snapshot = iter->snapshot;
+
+ /*
+ * advance, same as on exit for iter->path, but only up
+ * to snapshot
+ */
+ __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = iter->path;
+
+ iter->update_path = btree_path_set_pos(trans,
+ iter->update_path, pos,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ BUG_ON(!(iter->update_path->nodes_locked & 1));
+ iter->update_path->should_be_locked = true;
+ }
+
+ /*
+ * We can never have a key in a leaf node at POS_MAX, so
+ * we don't have to check these successor() calls:
+ */
+ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ !bch2_snapshot_is_ancestor(trans->c,
+ iter->snapshot,
+ k.k->p.snapshot)) {
+ search_key = bpos_successor(k.k->p);
+ continue;
+ }
+
+ if (bkey_whiteout(k.k) &&
+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+ search_key = bkey_successor(iter, k.k->p);
+ continue;
+ }
+
+ break;
+ }
/*
* iter->pos should be monotonically increasing, and always be equal to
@@ -2316,21 +2407,27 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
iter->pos = bkey_start_pos(k.k);
- if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
- iter->pos.snapshot = iter->snapshot;
-
iter->path = btree_path_set_pos(trans, iter->path, k.k->p,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
BUG_ON(!iter->path->nodes_locked);
out:
+ if (iter->update_path) {
+ BUG_ON(!(iter->update_path->nodes_locked & 1));
+ iter->update_path->should_be_locked = true;
+ }
iter->path->should_be_locked = true;
- bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
+ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ iter->pos.snapshot = iter->snapshot;
+
ret = bch2_btree_iter_verify_ret(iter, k);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
+ if (unlikely(ret)) {
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ }
+
+ bch2_btree_iter_verify_entry_exit(iter);
return k;
}
@@ -2720,7 +2817,11 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
if (iter->path)
bch2_path_put(trans, iter->path,
iter->flags & BTREE_ITER_INTENT);
+ if (iter->update_path)
+ bch2_path_put(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
iter->path = NULL;
+ iter->update_path = NULL;
}
static void __bch2_trans_iter_init(struct btree_trans *trans,
@@ -2750,6 +2851,7 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
iter->trans = trans;
iter->path = NULL;
+ iter->update_path = NULL;
iter->btree_id = btree_id;
iter->min_depth = depth;
iter->flags = flags;
@@ -2798,6 +2900,8 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
*dst = *src;
if (src->path)
__btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
+ if (src->update_path)
+ __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
}
void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
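
The new iter->update_path is worth a note: with BTREE_ITER_FILTER_SNAPSHOTS|BTREE_ITER_INTENT, peek may return a key that lives in an ancestor snapshot, but an update through that iterator must land at the iterator's own snapshot. update_path is a second intent-locked path held at that position, and bch2_trans_update() (btree_update_leaf.c below) prefers it over iter->path. A usage sketch, as a fragment that is not compilable on its own, with illustrative snapshot IDs:

	/* iterate at snapshot 4; the visible key may come from ancestor snapshot 1 */
	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(inum, 0, 4),
			     BTREE_ITER_FILTER_SNAPSHOTS|BTREE_ITER_INTENT);

	k = bch2_btree_iter_peek(&iter);
	/* iter->path points at the ancestor's key; iter->update_path is
	 * locked at (inum, k.k->p.offset, 4) */

	ret = bch2_trans_update(trans, &iter, new_k, 0);
	/* routed via iter->update_path ?: iter->path, so the write lands
	 * in snapshot 4 rather than overwriting the ancestor's key */
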
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index eceec5d5..5205d53c 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -222,11 +222,8 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
bool bch2_btree_iter_advance(struct btree_iter *);
bool bch2_btree_iter_rewind(struct btree_iter *);
-static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
- if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
- new_pos.snapshot = iter->snapshot;
-
iter->k.type = KEY_TYPE_deleted;
iter->k.p.inode = iter->pos.inode = new_pos.inode;
iter->k.p.offset = iter->pos.offset = new_pos.offset;
@@ -234,6 +231,19 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos
iter->k.size = 0;
}
+static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+ if (unlikely(iter->update_path))
+ bch2_path_put(iter->trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = NULL;
+
+ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ new_pos.snapshot = iter->snapshot;
+
+ __bch2_btree_iter_set_pos(iter, new_pos);
+}
+
static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
{
BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS));
@@ -295,7 +305,7 @@ static inline int bkey_err(struct bkey_s_c k)
return PTR_ERR_OR_ZERO(k.k);
}
-static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
+static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
unsigned flags)
{
return flags & BTREE_ITER_SLOTS
@@ -316,7 +326,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
struct bkey_s_c k;
while (btree_trans_too_many_iters(trans) ||
- (k = __bch2_btree_iter_peek(iter, flags),
+ (k = bch2_btree_iter_peek_type(iter, flags),
bkey_err(k) == -EINTR))
bch2_trans_begin(trans);
@@ -335,7 +345,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
_start, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
- (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \
+ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
@@ -347,7 +357,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
for (; \
- (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \
+ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 1d7b1012..faed51e7 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -222,7 +222,8 @@ static int btree_key_cache_fill(struct btree_trans *trans,
goto err;
if (!bch2_btree_node_relock(trans, ck_path, 0)) {
- trace_transaction_restart_ip(trans->fn, _THIS_IP_);
+ trace_trans_restart_relock_key_cache_fill(trans->fn,
+ _THIS_IP_, ck_path->btree_id, &ck_path->pos);
ret = btree_trans_restart(trans);
goto err;
}
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 914d536c..65f460e3 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -276,6 +276,7 @@ static inline struct btree_path_level *path_l(struct btree_path *path)
struct btree_iter {
struct btree_trans *trans;
struct btree_path *path;
+ struct btree_path *update_path;
enum btree_id btree_id:4;
unsigned min_depth:4;
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 16ebf1a2..5e5a1b5e 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -73,8 +73,14 @@ int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
int bch2_btree_node_update_key_get_iter(struct btree_trans *,
struct btree *, struct bkey_i *, bool);
+int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, enum btree_update_flags);
+
+int __must_check bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+ struct bkey_i *, enum btree_update_flags);
int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_update_flags);
+
void bch2_trans_commit_hook(struct btree_trans *,
struct btree_trans_commit_hook *);
int __bch2_trans_commit(struct btree_trans *);
@@ -135,21 +141,4 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
(_i) < (_trans)->updates + (_trans)->nr_updates; \
(_i)++)
-static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
- enum btree_id btree_id,
- struct bpos pos)
-{
- struct btree_insert_entry *i;
-
- trans_for_each_update(trans, i)
- if ((cmp_int(btree_id, i->btree_id) ?:
- bpos_cmp(pos, i->k->k.p)) <= 0) {
- if (btree_id == i->btree_id)
- return i->k;
- break;
- }
-
- return NULL;
-}
-
#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 47568a0b..7b8ca115 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -1938,6 +1938,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
ret = bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_JOURNAL_RECLAIM|
BTREE_INSERT_JOURNAL_RESERVED);
if (ret)
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index ca98e685..7186457d 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -828,7 +828,8 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
struct bch_fs *c = trans->c;
int ret;
- if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)))
+ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) ||
+ test_bit(BCH_FS_STARTED, &c->flags))
return -EROFS;
bch2_trans_unlock(trans);
@@ -844,28 +845,63 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
return 0;
}
-static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
+static int run_one_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
+ bool overwrite)
{
struct bkey _deleted = KEY(0, 0, 0);
struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
struct bkey_s_c old;
struct bkey unpacked;
- struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
- bool trans_trigger_run;
- unsigned btree_id = 0;
int ret = 0;
- /*
- *
- * For a given btree, this algorithm runs insert triggers before
- * overwrite triggers: this is so that when extents are being moved
- * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
- * they are re-added.
- */
- for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
- while (btree_id_start < trans->updates + trans->nr_updates &&
- btree_id_start->btree_id < btree_id)
- btree_id_start++;
+ if ((i->flags & BTREE_TRIGGER_NORUN) ||
+ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
+ return 0;
+
+ if (!overwrite) {
+ if (i->insert_trigger_run)
+ return 0;
+
+ BUG_ON(i->overwrite_trigger_run);
+ i->insert_trigger_run = true;
+ } else {
+ if (i->overwrite_trigger_run)
+ return 0;
+
+ BUG_ON(!i->insert_trigger_run);
+ i->overwrite_trigger_run = true;
+ }
+
+ old = bch2_btree_path_peek_slot(i->path, &unpacked);
+ _deleted.p = i->path->pos;
+
+ if (overwrite) {
+ ret = bch2_trans_mark_key(trans, old, deleted,
+ BTREE_TRIGGER_OVERWRITE|i->flags);
+ } else if (old.k->type == i->k->k.type &&
+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+ i->overwrite_trigger_run = true;
+ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k),
+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags);
+ } else {
+ ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k),
+ BTREE_TRIGGER_INSERT|i->flags);
+ }
+
+ if (ret == -EINTR)
+ trace_trans_restart_mark(trans->fn, _RET_IP_,
+ i->btree_id, &i->path->pos);
+ return ret ?: 1;
+}
+
+static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
+ struct btree_insert_entry *btree_id_start)
+{
+ struct btree_insert_entry *i;
+ bool trans_trigger_run;
+ int ret, overwrite;
+
+ for (overwrite = 0; overwrite < 2; overwrite++) {
/*
* Running triggers will append more updates to the list of updates as
@@ -877,66 +913,39 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
for (i = btree_id_start;
i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
i++) {
- if (i->insert_trigger_run ||
- (i->flags & BTREE_TRIGGER_NORUN) ||
- !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
- continue;
-
- BUG_ON(i->overwrite_trigger_run);
-
- i->insert_trigger_run = true;
- trans_trigger_run = true;
-
- old = bch2_btree_path_peek_slot(i->path, &unpacked);
- _deleted.p = i->path->pos;
-
- if (old.k->type == i->k->k.type &&
- ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
- i->overwrite_trigger_run = true;
- ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k),
- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags);
- } else {
- ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k),
- BTREE_TRIGGER_INSERT|i->flags);
- }
-
- if (ret == -EINTR)
- trace_trans_restart_mark(trans->fn, _RET_IP_,
- i->btree_id, &i->path->pos);
- if (ret)
+ ret = run_one_trigger(trans, i, overwrite);
+ if (ret < 0)
return ret;
+ if (ret)
+ trans_trigger_run = true;
}
} while (trans_trigger_run);
+ }
- do {
- trans_trigger_run = false;
-
- for (i = btree_id_start;
- i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
- i++) {
- if (i->overwrite_trigger_run ||
- (i->flags & BTREE_TRIGGER_NORUN) ||
- !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
- continue;
-
- BUG_ON(!i->insert_trigger_run);
-
- i->overwrite_trigger_run = true;
- trans_trigger_run = true;
+ return 0;
+}
- old = bch2_btree_path_peek_slot(i->path, &unpacked);
- _deleted.p = i->path->pos;
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
+ unsigned btree_id = 0;
+ int ret = 0;
- ret = bch2_trans_mark_key(trans, old, deleted,
- BTREE_TRIGGER_OVERWRITE|i->flags);
+ /*
+ *
+ * For a given btree, this algorithm runs insert triggers before
+ * overwrite triggers: this is so that when extents are being moved
+ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
+ * they are re-added.
+ */
+ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+ while (btree_id_start < trans->updates + trans->nr_updates &&
+ btree_id_start->btree_id < btree_id)
+ btree_id_start++;
- if (ret == -EINTR)
- trace_trans_restart_mark(trans->fn, _RET_IP_,
- i->btree_id, &i->path->pos);
- if (ret)
- return ret;
- }
- } while (trans_trigger_run);
+ ret = run_btree_triggers(trans, btree_id, btree_id_start);
+ if (ret)
+ return ret;
}
trans_for_each_update(trans, i)
@@ -1072,6 +1081,9 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
+ if (!btree_type_has_snapshots(id))
+ return 0;
+
if (!snapshot_t(c, pos.snapshot)->children[0])
return 0;
@@ -1100,10 +1112,10 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans,
return ret;
}
-static int bch2_trans_update_extent(struct btree_trans *trans,
- struct btree_iter *orig_iter,
- struct bkey_i *insert,
- enum btree_update_flags flags)
+int bch2_trans_update_extent(struct btree_trans *trans,
+ struct btree_iter *orig_iter,
+ struct bkey_i *insert,
+ enum btree_update_flags flags)
{
struct bch_fs *c = trans->c;
struct btree_iter iter, update_iter;
@@ -1261,13 +1273,9 @@ nomerge1:
bkey_reassemble(update, k);
bch2_cut_front(insert->k.p, update);
- bch2_trans_copy_iter(&update_iter, &iter);
- update_iter.pos = update->k.p;
- ret = bch2_trans_update(trans, &update_iter, update,
+ ret = bch2_trans_update_by_path(trans, iter.path, update,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
flags);
- bch2_trans_iter_exit(trans, &update_iter);
-
if (ret)
goto err;
goto out;
@@ -1350,26 +1358,23 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
return ret;
}
-int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
struct bkey_i *k, enum btree_update_flags flags)
{
struct btree_insert_entry *i, n;
- BUG_ON(!iter->path->should_be_locked);
-
- if (iter->flags & BTREE_ITER_IS_EXTENTS)
- return bch2_trans_update_extent(trans, iter, k, flags);
+ BUG_ON(!path->should_be_locked);
BUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
- BUG_ON(bpos_cmp(k->k.p, iter->path->pos));
+ BUG_ON(bpos_cmp(k->k.p, path->pos));
n = (struct btree_insert_entry) {
.flags = flags,
- .bkey_type = __btree_node_type(iter->path->level, iter->btree_id),
- .btree_id = iter->btree_id,
- .level = iter->path->level,
- .cached = iter->flags & BTREE_ITER_CACHED,
- .path = iter->path,
+ .bkey_type = __btree_node_type(path->level, path->btree_id),
+ .btree_id = path->btree_id,
+ .level = path->level,
+ .cached = path->cached,
+ .path = path,
.k = k,
.ip_allocated = _RET_IP_,
};
@@ -1380,16 +1385,6 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
btree_insert_entry_cmp(i - 1, i) >= 0);
#endif
- if (bkey_deleted(&n.k->k) &&
- (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
- int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p);
- if (unlikely(ret < 0))
- return ret;
-
- if (ret)
- n.k->k.type = KEY_TYPE_whiteout;
- }
-
/*
* Pending updates are kept sorted: first, find position of new update,
* then delete/trim any updates the new update overwrites:
@@ -1420,10 +1415,29 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
i - trans->updates, n);
__btree_path_get(n.path, true);
-
return 0;
}
+int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_i *k, enum btree_update_flags flags)
+{
+ if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ return bch2_trans_update_extent(trans, iter, k, flags);
+
+ if (bkey_deleted(&k->k) &&
+ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
+ int ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (ret)
+ k->k.type = KEY_TYPE_whiteout;
+ }
+
+ return bch2_trans_update_by_path(trans, iter->update_path ?: iter->path,
+ k, flags);
+}
+
void bch2_trans_commit_hook(struct btree_trans *trans,
struct btree_trans_commit_hook *h)
{
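
The trigger rework above is behavior-preserving: the two near-duplicate loop bodies in bch2_trans_commit_run_triggers() become a single run_one_trigger(), and run_btree_triggers() keeps the guarantee that, within each btree, insert triggers run to a fixpoint before any overwrite trigger (triggers may append updates while running, hence the do/while). A standalone toy model of just that control flow; the update list and trigger body are stand-ins:

#include <stdbool.h>
#include <stdio.h>

struct update { bool insert_run, overwrite_run; };

static int run_one_trigger(struct update *u, bool overwrite)
{
	if (!overwrite) {
		if (u->insert_run)
			return 0;
		u->insert_run = true;
	} else {
		if (u->overwrite_run)
			return 0;
		u->overwrite_run = true;
	}
	/* real triggers may append more updates here, hence the fixpoint loop */
	return 1;
}

int main(void)
{
	struct update updates[3] = {0};
	bool progress;

	for (int overwrite = 0; overwrite < 2; overwrite++)	/* inserts first */
		do {
			progress = false;
			for (int i = 0; i < 3; i++)
				progress |= run_one_trigger(&updates[i], overwrite);
		} while (progress);

	printf("all insert triggers ran before any overwrite trigger\n");
	return 0;
}
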
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 895ff255..bf5ad436 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -11,6 +11,7 @@
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
#include "ec.h"
#include "error.h"
#include "inode.h"
@@ -43,43 +44,6 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
}
}
-/*
- * Clear journal_seq_valid for buckets for which it's not needed, to prevent
- * wraparound:
- */
-void bch2_bucket_seq_cleanup(struct bch_fs *c)
-{
- u64 journal_seq = atomic64_read(&c->journal.seq);
- u16 last_seq_ondisk = c->journal.flushed_seq_ondisk;
- struct bch_dev *ca;
- struct bucket_array *buckets;
- struct bucket *g;
- struct bucket_mark m;
- unsigned i;
-
- if (journal_seq - c->last_bucket_seq_cleanup <
- (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
- return;
-
- c->last_bucket_seq_cleanup = journal_seq;
-
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for_each_bucket(g, buckets) {
- bucket_cmpxchg(g, m, ({
- if (!m.journal_seq_valid ||
- bucket_needs_journal_commit(m, last_seq_ondisk))
- break;
-
- m.journal_seq_valid = 0;
- }));
- }
- up_read(&ca->bucket_lock);
- }
-}
-
void bch2_fs_usage_initialize(struct bch_fs *c)
{
struct bch_fs_usage *usage;
@@ -323,8 +287,8 @@ static inline int is_unavailable_bucket(struct bucket_mark m)
static inline int bucket_sectors_fragmented(struct bch_dev *ca,
struct bucket_mark m)
{
- return bucket_sectors_used(m)
- ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m))
+ return m.dirty_sectors
+ ? max(0, (int) ca->mi.bucket_size - (int) m.dirty_sectors)
: 0;
}
@@ -570,16 +534,24 @@ static int bch2_mark_alloc(struct btree_trans *trans,
v->journal_seq = cpu_to_le64(new_u.journal_seq);
}
- ca = bch_dev_bkey_exists(c, new.k->p.inode);
+ if (old_u.data_type && !new_u.data_type && new_u.journal_seq) {
+ ret = bch2_set_bucket_needs_journal_commit(c,
+ new_u.dev, new_u.bucket,
+ new_u.journal_seq);
+ if (ret)
+ return ret;
+ }
+
+ ca = bch_dev_bkey_exists(c, new_u.dev);
- if (new.k->p.offset >= ca->mi.nbuckets)
+ if (new_u.bucket >= ca->mi.nbuckets)
return 0;
percpu_down_read(&c->mark_lock);
if (!gc && new_u.gen != old_u.gen)
- *bucket_gen(ca, new.k->p.offset) = new_u.gen;
+ *bucket_gen(ca, new_u.bucket) = new_u.gen;
- g = __bucket(ca, new.k->p.offset, gc);
+ g = __bucket(ca, new_u.bucket, gc);
old_m = bucket_cmpxchg(g, m, ({
m.gen = new_u.gen;
@@ -587,11 +559,6 @@ static int bch2_mark_alloc(struct btree_trans *trans,
m.dirty_sectors = new_u.dirty_sectors;
m.cached_sectors = new_u.cached_sectors;
m.stripe = new_u.stripe != 0;
-
- if (journal_seq) {
- m.journal_seq_valid = 1;
- m.journal_seq = journal_seq;
- }
}));
bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc);
@@ -619,7 +586,7 @@ static int bch2_mark_alloc(struct btree_trans *trans,
return ret;
}
- trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset),
+ trace_invalidate(ca, bucket_to_sector(ca, new_u.bucket),
old_m.cached_sectors);
}
@@ -767,9 +734,10 @@ static int check_bucket_ref(struct bch_fs *c,
static int mark_stripe_bucket(struct btree_trans *trans,
struct bkey_s_c k,
unsigned ptr_idx,
- u64 journal_seq, unsigned flags)
+ unsigned flags)
{
struct bch_fs *c = trans->c;
+ u64 journal_seq = trans->journal_res.seq;
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
unsigned nr_data = s->nr_blocks - s->nr_redundant;
bool parity = ptr_idx >= nr_data;
@@ -810,11 +778,6 @@ static int mark_stripe_bucket(struct btree_trans *trans,
if (data_type)
new.data_type = data_type;
- if (journal_seq) {
- new.journal_seq_valid = 1;
- new.journal_seq = journal_seq;
- }
-
new.stripe = true;
}));
@@ -886,11 +849,6 @@ static int bch2_mark_pointer(struct btree_trans *trans,
new.data_type = bucket_data_type;
- if (journal_seq) {
- new.journal_seq_valid = 1;
- new.journal_seq = journal_seq;
- }
-
if (flags & BTREE_TRIGGER_NOATOMIC) {
g->_mark = new;
break;
@@ -1111,7 +1069,7 @@ static int bch2_mark_stripe(struct btree_trans *trans,
memset(m->block_sectors, 0, sizeof(m->block_sectors));
for (i = 0; i < new_s->nr_blocks; i++) {
- ret = mark_stripe_bucket(trans, new, i, journal_seq, flags);
+ ret = mark_stripe_bucket(trans, new, i, flags);
if (ret)
return ret;
}
@@ -1459,24 +1417,22 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
- struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos);
+ struct bkey_s_c k;
int ret;
- bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos,
+ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc,
+ POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)),
+ BTREE_ITER_WITH_UPDATES|
BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(iter);
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
if (ret) {
bch2_trans_iter_exit(trans, iter);
return ret;
}
- *u = update && !bpos_cmp(update->k.p, pos)
- ? bch2_alloc_unpack(bkey_i_to_s_c(update))
- : alloc_mem_to_key(c, iter);
-
+ *u = bch2_alloc_unpack(k);
return 0;
}
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 45c6d230..d35c96bc 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -149,23 +149,11 @@ static inline u8 ptr_stale(struct bch_dev *ca,
/* bucket gc marks */
-static inline unsigned bucket_sectors_used(struct bucket_mark mark)
-{
- return mark.dirty_sectors + mark.cached_sectors;
-}
-
static inline bool is_available_bucket(struct bucket_mark mark)
{
return !mark.dirty_sectors && !mark.stripe;
}
-static inline bool bucket_needs_journal_commit(struct bucket_mark m,
- u16 last_seq_ondisk)
-{
- return m.journal_seq_valid &&
- ((s16) m.journal_seq - (s16) last_seq_ondisk > 0);
-}
-
/* Device usage: */
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
@@ -240,7 +228,6 @@ bch2_fs_usage_read_short(struct bch_fs *);
/* key/bucket marking: */
-void bch2_bucket_seq_cleanup(struct bch_fs *);
void bch2_fs_usage_initialize(struct bch_fs *);
void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index 18bca269..24139831 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -15,18 +15,9 @@ struct bucket_mark {
u8 gen;
u8 data_type:3,
owned_by_allocator:1,
- journal_seq_valid:1,
stripe:1;
u16 dirty_sectors;
u16 cached_sectors;
-
- /*
- * low bits of journal sequence number when this bucket was most
- * recently modified: if journal_seq_valid is set, this bucket can't be
- * reused until the journal sequence number written to disk is >= the
- * bucket's journal sequence number:
- */
- u16 journal_seq;
};
};
};
diff --git a/libbcachefs/buckets_waiting_for_journal.c b/libbcachefs/buckets_waiting_for_journal.c
new file mode 100644
index 00000000..33ae6370
--- /dev/null
+++ b/libbcachefs/buckets_waiting_for_journal.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "buckets_waiting_for_journal.h"
+#include <linux/jhash.h>
+
+static u32 hash_seeds[] = {
+ 2168153708,
+ 1262039142,
+ 1183479835,
+};
+
+static inline unsigned bucket_hash(u64 dev_bucket, unsigned hash_seed_idx)
+{
+ return jhash_2words(dev_bucket >> 32, dev_bucket, hash_seeds[hash_seed_idx]);
+}
+
+bool bch2_bucket_needs_journal_commit(struct bch_fs *c,
+ u64 flushed_seq,
+ unsigned dev, u64 bucket)
+{
+ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+ u64 dev_bucket = (u64) dev << 56 | bucket;
+ bool ret = false;
+ unsigned i;
+
+ mutex_lock(&b->lock);
+ BUG_ON(!is_power_of_2(b->nr));
+
+ for (i = 0; i < ARRAY_SIZE(hash_seeds); i++) {
+ u32 h = bucket_hash(dev_bucket, i) & (b->nr - 1);
+
+ if (b->d[h].dev_bucket == dev_bucket) {
+ ret = b->d[h].journal_seq > flushed_seq;
+ break;
+ }
+ }
+
+ mutex_unlock(&b->lock);
+
+ return ret;
+}
+
+static int bch2_buckets_waiting_for_journal_rehash(struct bch_fs *c)
+{
+ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+ u64 flushed_seq = c->journal.flushed_seq_ondisk;
+ unsigned i, j, h, new_nr = b->nr * 2, elements = 0;
+ struct bucket_hashed *new_table;
+
+ new_table = kvmalloc_array(new_nr, sizeof(*new_table), GFP_KERNEL|__GFP_ZERO);
+ if (!new_table)
+ return -ENOMEM;
+
+ for (i = 0; i < b->nr; i++) {
+ if (b->d[i].journal_seq < flushed_seq)
+ continue;
+
+ for (j = 0; j < ARRAY_SIZE(hash_seeds); j++) {
+ h = bucket_hash(b->d[i].dev_bucket, j);
+ if ((h & (b->nr - 1)) == i)
+ break;
+ }
+
+ BUG_ON(j == ARRAY_SIZE(hash_seeds));
+ BUG_ON(new_table[h & (new_nr - 1)].dev_bucket);
+
+ new_table[h & (new_nr - 1)] = b->d[i];
+
+ elements++;
+ }
+
+ kvfree(b->d);
+ b->nr = new_nr;
+ b->d = new_table;
+ return 0;
+}
+
+int bch2_set_bucket_needs_journal_commit(struct bch_fs *c, unsigned dev, u64 bucket,
+ u64 journal_seq)
+{
+ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+ struct bucket_hashed new = {
+ .dev_bucket = (u64) dev << 56 | bucket,
+ .journal_seq = journal_seq,
+ }, *last_evicted = NULL;
+ u64 flushed_seq = c->journal.flushed_seq_ondisk;
+ unsigned tries, i;
+ int ret = 0;
+
+ mutex_lock(&b->lock);
+ BUG_ON(!is_power_of_2(b->nr));
+retry:
+ for (tries = 0; tries < 5; tries++) {
+ struct bucket_hashed *old, *victim = NULL;
+
+ for (i = 0; i < ARRAY_SIZE(hash_seeds); i++) {
+ old = b->d + (bucket_hash(new.dev_bucket, i) & (b->nr - 1));
+
+ if (old->dev_bucket == new.dev_bucket ||
+ old->journal_seq <= flushed_seq) {
+ *old = new;
+ goto out;
+ }
+
+ if (last_evicted != old)
+ victim = old;
+ }
+
+ /* Failed to find an empty slot: */
+ swap(new, *victim);
+ last_evicted = victim;
+ }
+
+ ret = bch2_buckets_waiting_for_journal_rehash(c);
+ if (!ret)
+ goto retry;
+out:
+ mutex_unlock(&b->lock);
+
+ return ret;
+}
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c)
+{
+ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+ kvfree(b->d);
+}
+
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
+{
+ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+ mutex_init(&b->lock);
+
+ b->nr = 8;
+ b->d = kvmalloc_array(b->nr, sizeof(*b->d), GFP_KERNEL|__GFP_ZERO);
+ if (!b->d)
+ return -ENOMEM;
+
+ return 0;
+}
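
The new file above implements a small cuckoo hash: each dev/bucket key may live in one of three slots (one per hash seed); when all three are live, a victim is evicted and re-placed at one of its own slots, and after five rounds the table is doubled and rehashed. A minimal userspace sketch of the eviction loop, with a stand-in mixer where the real code uses jhash_2words():

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct entry { uint64_t key, seq; };

#define NR_HASHES 3

static unsigned hash(uint64_t key, unsigned idx)
{
	/* stand-in mixer; the real code uses jhash_2words() with fixed seeds */
	key ^= (uint64_t) (idx + 1) * 0x9e3779b97f4a7c15ULL;
	key ^= key >> 33;
	key *= 0xff51afd7ed558ccdULL;
	return (unsigned) (key ^ (key >> 33));
}

/* nr must be a power of two; returns false when the caller must rehash.
 * Mirrors bch2_set_bucket_needs_journal_commit()'s retry loop. */
static bool insert(struct entry *tbl, size_t nr, struct entry new,
		   uint64_t flushed_seq)
{
	struct entry *last_evicted = NULL;

	for (unsigned tries = 0; tries < 5; tries++) {
		struct entry *old, *victim = NULL;

		for (unsigned i = 0; i < NR_HASHES; i++) {
			old = tbl + (hash(new.key, i) & (nr - 1));

			/* same bucket, or a slot already flushed to disk: */
			if (old->key == new.key || old->seq <= flushed_seq) {
				*old = new;
				return true;
			}

			/* don't immediately kick back what we just displaced */
			if (last_evicted != old)
				victim = old;
		}

		struct entry tmp = *victim;	/* cuckoo step: evict, re-place */
		*victim = new;
		new = tmp;
		last_evicted = victim;
	}

	return false;	/* grow the table, rehash live entries, retry */
}
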
diff --git a/libbcachefs/buckets_waiting_for_journal.h b/libbcachefs/buckets_waiting_for_journal.h
new file mode 100644
index 00000000..079a591c
--- /dev/null
+++ b/libbcachefs/buckets_waiting_for_journal.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_H
+
+#include "buckets_waiting_for_journal_types.h"
+
+bool bch2_bucket_needs_journal_commit(struct bch_fs *, u64, unsigned, u64);
+int bch2_set_bucket_needs_journal_commit(struct bch_fs *, unsigned, u64, u64);
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *);
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *);
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */
diff --git a/libbcachefs/buckets_waiting_for_journal_types.h b/libbcachefs/buckets_waiting_for_journal_types.h
new file mode 100644
index 00000000..99d17ffb
--- /dev/null
+++ b/libbcachefs/buckets_waiting_for_journal_types.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+
+struct bucket_hashed {
+ u64 dev_bucket;
+ u64 journal_seq;
+};
+
+struct buckets_waiting_for_journal {
+ struct mutex lock;
+ size_t nr;
+ struct bucket_hashed *d;
+};
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index ef6da535..3a7c1468 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -585,62 +585,49 @@ found_slot:
static int bch2_inode_delete_keys(struct btree_trans *trans,
subvol_inum inum, enum btree_id id)
{
- u64 offset = 0;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i delete;
+ u32 snapshot;
int ret = 0;
- while (!ret || ret == -EINTR) {
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(trans->c, 0);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_i delete;
- u32 snapshot;
+ /*
+ * We're never going to be deleting extents, no need to use an extent
+ * iterator:
+ */
+ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_INTENT);
+ while (1) {
bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
- continue;
+ goto err;
- bch2_trans_iter_init(trans, &iter, id,
- SPOS(inum.inum, offset, snapshot),
- BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek(&iter);
-
- if (!k.k || iter.pos.inode != inum.inum) {
- bch2_trans_iter_exit(trans, &iter);
- break;
- }
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+ k = bch2_btree_iter_peek(&iter);
ret = bkey_err(k);
if (ret)
goto err;
+ if (!k.k || iter.pos.inode != inum.inum)
+ break;
+
bkey_init(&delete.k);
delete.k.p = iter.pos;
- if (btree_node_type_is_extents(iter.btree_id)) {
- unsigned max_sectors =
- min_t(u64, U64_MAX - iter.pos.offset,
- KEY_SIZE_MAX & (~0 << trans->c->block_bits));
-
- /* create the biggest key we can */
- bch2_key_resize(&delete.k, max_sectors);
-
- ret = bch2_extent_trim_atomic(trans, &iter, &delete);
- if (ret)
- goto err;
- }
-
ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
- bch2_trans_commit(trans, &disk_res, NULL,
+ bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
- bch2_disk_reservation_put(trans->c, &disk_res);
err:
- offset = iter.pos.offset;
- bch2_trans_iter_exit(trans, &iter);
+ if (ret && ret != -EINTR)
+ break;
}
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
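
The rewritten deletion loop keeps a single iterator alive across transaction restarts instead of re-creating it (and re-seeking by saved offset) on every pass; only the snapshot field is refreshed per iteration. Reduced to a skeleton, with the per-key work folded into a hypothetical delete_one() helper:

	bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
			     BTREE_ITER_NOT_EXTENTS|
			     BTREE_ITER_INTENT);
	while (1) {
		bch2_trans_begin(trans);

		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (!ret) {
			bch2_btree_iter_set_snapshot(&iter, snapshot);
			ret = delete_one(trans, &iter);	/* peek + whiteout + commit;
							 * returns 1 once past the inode */
		}
		if (ret == 1 || (ret && ret != -EINTR))	/* -EINTR: go around again */
			break;
	}
	bch2_trans_iter_exit(trans, &iter);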
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index df4d1a7a..e566f851 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1671,13 +1671,9 @@ retry_alloc:
}
}
- bch2_bucket_seq_cleanup(c);
-
continue_at(cl, do_journal_write, c->io_complete_wq);
return;
no_io:
- bch2_bucket_seq_cleanup(c);
-
continue_at(cl, journal_write_done, c->io_complete_wq);
return;
err:
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index f73be9cb..3e3dcec3 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -700,17 +700,20 @@ static int __bch2_move_data(struct bch_fs *c,
bch2_trans_begin(&trans);
k = bch2_btree_iter_peek(&iter);
-
- stats->pos = iter.pos;
-
if (!k.k)
break;
+
ret = bkey_err(k);
+ if (ret == -EINTR)
+ continue;
if (ret)
break;
+
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
+ stats->pos = iter.pos;
+
if (!bkey_extent_is_direct_data(k.k))
goto next_nondata;
@@ -753,10 +756,8 @@ static int __bch2_move_data(struct bch_fs *c,
ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
data_cmd, data_opts);
if (ret2) {
- if (ret2 == -EINTR) {
- bch2_trans_begin(&trans);
+ if (ret2 == -EINTR)
continue;
- }
if (ret2 == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
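
With bch2_trans_begin() already running at the top of every iteration, an -EINTR from either the peek or the extent move no longer needs an explicit restart of its own: a bare continue suffices, since the next pass re-begins the transaction and re-peeks from the iterator's current position. Isolated, the pattern is:

	k = bch2_btree_iter_peek(&iter);
	ret = bkey_err(k);
	if (ret == -EINTR)	/* restart: loop top re-begins the transaction */
		continue;
	if (ret)
		break;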
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 7cd1b0cf..92f78907 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -69,10 +69,14 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
.dev = p.ptr.dev,
.offset = p.ptr.offset,
};
+ ssize_t i;
- ssize_t i = eytzinger0_find_le(h->data, h->used,
- sizeof(h->data[0]),
- bucket_offset_cmp, &search);
+ if (p.ptr.cached)
+ continue;
+
+ i = eytzinger0_find_le(h->data, h->used,
+ sizeof(h->data[0]),
+ bucket_offset_cmp, &search);
#if 0
/* eytzinger search verify code: */
ssize_t j = -1, k;
@@ -185,8 +189,7 @@ static int bch2_copygc(struct bch_fs *c)
if (m.owned_by_allocator ||
m.data_type != BCH_DATA_user ||
- !bucket_sectors_used(m) ||
- bucket_sectors_used(m) >= ca->mi.bucket_size)
+ m.dirty_sectors >= ca->mi.bucket_size)
continue;
WARN_ON(m.stripe && !g->stripe_redundancy);
@@ -195,9 +198,9 @@ static int bch2_copygc(struct bch_fs *c)
.dev = dev_idx,
.gen = m.gen,
.replicas = 1 + g->stripe_redundancy,
- .fragmentation = bucket_sectors_used(m) * (1U << 15)
+ .fragmentation = m.dirty_sectors * (1U << 15)
/ ca->mi.bucket_size,
- .sectors = bucket_sectors_used(m),
+ .sectors = m.dirty_sectors,
.offset = bucket_to_sector(ca, b),
};
heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
@@ -231,8 +234,11 @@ static int bch2_copygc(struct bch_fs *c)
buckets_to_move = h->used;
- if (!buckets_to_move)
+ if (!buckets_to_move) {
+ bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!",
+ sectors_reserved);
return 0;
+ }
eytzinger0_sort(h->data, h->used,
sizeof(h->data[0]),
@@ -260,8 +266,8 @@ static int bch2_copygc(struct bch_fs *c)
m = READ_ONCE(buckets->b[b].mark);
if (i->gen == m.gen &&
- bucket_sectors_used(m)) {
- sectors_not_moved += bucket_sectors_used(m);
+ m.dirty_sectors) {
+ sectors_not_moved += m.dirty_sectors;
buckets_not_moved++;
}
}
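
The heap key switches from bucket_sectors_used() to m.dirty_sectors throughout, and the fragmentation field stays a 15-bit fixed-point fill fraction so buckets of different sizes compare on equal terms. An illustrative helper (not a kernel symbol): 128 dirty sectors in a 512-sector bucket gives 128 * 32768 / 512 = 8192, a quarter of the 1U << 15 scale.

	static inline unsigned fill_frac(unsigned dirty_sectors,
					 unsigned bucket_size)
	{
		return dirty_sectors * (1U << 15) / bucket_size;	/* 0..32768 */
	}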
diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c
index 8aeb2e41..69603327 100644
--- a/libbcachefs/subvolume.c
+++ b/libbcachefs/subvolume.c
@@ -456,10 +456,10 @@ err:
return ret;
}
-static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
- u32 *new_snapids,
- u32 *snapshot_subvols,
- unsigned nr_snapids)
+int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
+ u32 *new_snapids,
+ u32 *snapshot_subvols,
+ unsigned nr_snapids)
{
struct btree_iter iter;
struct bkey_i_snapshot *n;
@@ -522,7 +522,7 @@ static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
n = bch2_trans_kmalloc(trans, sizeof(*n));
ret = PTR_ERR_OR_ZERO(n);
if (ret)
- return ret;
+ goto err;
bkey_reassemble(&n->k_i, k);
diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h
index e4c3fdcd..4abe53df 100644
--- a/libbcachefs/subvolume.h
+++ b/libbcachefs/subvolume.h
@@ -122,6 +122,10 @@ int bch2_snapshot_get_subvol(struct btree_trans *, u32,
struct bch_subvolume *);
int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
+/* only exported for tests: */
+int bch2_snapshot_node_create(struct btree_trans *, u32,
+ u32 *, u32 *, unsigned);
+
int bch2_subvolume_delete(struct btree_trans *, u32);
int bch2_subvolume_unlink(struct btree_trans *, u32);
int bch2_subvolume_create(struct btree_trans *, u64, u32,
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 577b58e4..586ba60d 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -16,6 +16,7 @@
#include "btree_key_cache.h"
#include "btree_update_interior.h"
#include "btree_io.h"
+#include "buckets_waiting_for_journal.h"
#include "chardev.h"
#include "checksum.h"
#include "clock.h"
@@ -468,6 +469,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_fs_ec_exit(c);
bch2_fs_encryption_exit(c);
bch2_fs_io_exit(c);
+ bch2_fs_buckets_waiting_for_journal_exit(c);
bch2_fs_btree_interior_update_exit(c);
bch2_fs_btree_iter_exit(c);
bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
@@ -810,6 +812,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
bch2_fs_btree_iter_init(c) ?:
bch2_fs_btree_interior_update_init(c) ?:
+		bch2_fs_buckets_waiting_for_journal_init(c) ?:
bch2_fs_subvolumes_init(c) ?:
bch2_fs_io_init(c) ?:
bch2_fs_encryption_init(c) ?:
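
The init calls in bch2_fs_alloc() are chained with GCC's a ?: b extension: each *_init() returns 0 on success, evaluation runs left to right, and the first nonzero error short-circuits everything after it, which is why every line in the chain must end in ?: rather than ; until the final call. A minimal model with hypothetical init functions:

	extern int init_a(void), init_b(void), init_c(void);

	static int init_all(void)
	{
		/* b in a ?: b is evaluated only when a returned 0, so the
		 * first failing init aborts the rest of the chain: */
		return init_a() ?:
		       init_b() ?:
		       init_c();
	}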
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 6d159632..ed9a0950 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -192,7 +192,7 @@ read_attribute(new_stripes);
read_attribute(io_timers_read);
read_attribute(io_timers_write);
-read_attribute(data_op_data_progress);
+read_attribute(data_jobs);
#ifdef CONFIG_BCACHEFS_TESTS
write_attribute(perf_test);
@@ -230,32 +230,20 @@ static size_t bch2_btree_avg_write_size(struct bch_fs *c)
return nr ? div64_u64(sectors, nr) : 0;
}
-static long stats_to_text(struct printbuf *out, struct bch_fs *c,
- struct bch_move_stats *stats)
-{
- pr_buf(out, "%s: data type %s btree_id %s position: ",
- stats->name,
- bch2_data_types[stats->data_type],
- bch2_btree_ids[stats->btree_id]);
- bch2_bpos_to_text(out, stats->pos);
- pr_buf(out, "%s", "\n");
-
- return 0;
-}
-
static long data_progress_to_text(struct printbuf *out, struct bch_fs *c)
{
long ret = 0;
- struct bch_move_stats *iter;
+ struct bch_move_stats *stats;
mutex_lock(&c->data_progress_lock);
-
- if (list_empty(&c->data_progress_list))
- pr_buf(out, "%s", "no progress to report\n");
- else
- list_for_each_entry(iter, &c->data_progress_list, list) {
- stats_to_text(out, c, iter);
- }
+ list_for_each_entry(stats, &c->data_progress_list, list) {
+ pr_buf(out, "%s: data type %s btree_id %s position: ",
+ stats->name,
+ bch2_data_types[stats->data_type],
+ bch2_btree_ids[stats->btree_id]);
+ bch2_bpos_to_text(out, stats->pos);
+ pr_buf(out, "%s", "\n");
+ }
mutex_unlock(&c->data_progress_lock);
return ret;
@@ -463,7 +451,7 @@ SHOW(bch2_fs)
return out.pos - buf;
}
- if (attr == &sysfs_data_op_data_progress) {
+ if (attr == &sysfs_data_jobs) {
data_progress_to_text(&out, c);
return out.pos - buf;
}
@@ -616,7 +604,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_rebalance_work,
sysfs_pd_controller_files(rebalance),
- &sysfs_data_op_data_progress,
+ &sysfs_data_jobs,
&sysfs_internal_uuid,
NULL
diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c
index 16d67eb6..de84ce83 100644
--- a/libbcachefs/tests.c
+++ b/libbcachefs/tests.c
@@ -4,6 +4,7 @@
#include "bcachefs.h"
#include "btree_update.h"
#include "journal_reclaim.h"
+#include "subvolume.h"
#include "tests.h"
#include "linux/kthread.h"
@@ -461,6 +462,70 @@ static int test_extent_overwrite_all(struct bch_fs *c, u64 nr)
__test_extent_overwrite(c, 32, 64, 32, 128);
}
+/* snapshot unit tests */
+
+/* Test skipping over keys in unrelated snapshots: */
+static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_cookie cookie;
+ int ret;
+
+ bkey_cookie_init(&cookie.k_i);
+ cookie.k.p.snapshot = snapid_hi;
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
+ NULL, NULL, 0);
+ if (ret)
+ return ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+ SPOS(0, 0, snapid_lo), 0);
+ k = bch2_btree_iter_peek(&iter);
+
+ BUG_ON(k.k->p.snapshot != U32_MAX);
+
+ bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
+static int test_snapshots(struct bch_fs *c, u64 nr)
+{
+ struct bkey_i_cookie cookie;
+ u32 snapids[2];
+ u32 snapid_subvols[2] = { 1, 1 };
+ int ret;
+
+ bkey_cookie_init(&cookie.k_i);
+ cookie.k.p.snapshot = U32_MAX;
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
+ NULL, NULL, 0);
+ if (ret)
+ return ret;
+
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_snapshot_node_create(&trans, U32_MAX,
+ snapids,
+ snapid_subvols,
+ 2));
+ if (ret)
+ return ret;
+
+ if (snapids[0] > snapids[1])
+ swap(snapids[0], snapids[1]);
+
+ ret = test_snapshot_filter(c, snapids[0], snapids[1]);
+ if (ret) {
+ bch_err(c, "err %i from test_snapshot_filter", ret);
+ return ret;
+ }
+
+ return 0;
+}
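
bch2_trans_do() above is the stock wrapper for running a transactional update with retry on restart; the call expands roughly to the following (a sketch: the real macro also threads through disk reservations, a journal seq pointer, and commit flags):

	struct btree_trans trans;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
	do {
		bch2_trans_begin(&trans);
		ret = bch2_snapshot_node_create(&trans, U32_MAX, snapids,
						snapid_subvols, 2) ?:
		      bch2_trans_commit(&trans, NULL, NULL, 0);
	} while (ret == -EINTR);
	bch2_trans_exit(&trans);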
+
/* perf tests */
static u64 test_rand(void)
@@ -789,8 +854,10 @@ static int btree_perf_test_thread(void *data)
}
ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads));
- if (ret)
+ if (ret) {
+ bch_err(j->c, "%ps: error %i", j->fn, ret);
j->ret = ret;
+ }
if (atomic_dec_and_test(&j->done)) {
j->finish = sched_clock();
@@ -843,6 +910,8 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
perf_test(test_extent_overwrite_middle);
perf_test(test_extent_overwrite_all);
+ perf_test(test_snapshots);
+
if (!j.fn) {
pr_err("unknown test %s", testname);
return -EINVAL;