-rw-r--r--  .bcachefs_revision                         |   2
-rw-r--r--  cmd_migrate.c                              |   2
-rw-r--r--  include/linux/compiler.h                   |   1
-rw-r--r--  include/trace/events/bcachefs.h            |   6
-rw-r--r--  libbcachefs/alloc_background.c             | 139
-rw-r--r--  libbcachefs/alloc_background.h             |  42
-rw-r--r--  libbcachefs/bcachefs.h                     |  10
-rw-r--r--  libbcachefs/btree_gc.c                     | 379
-rw-r--r--  libbcachefs/btree_iter.c                   | 115
-rw-r--r--  libbcachefs/btree_iter.h                   |   8
-rw-r--r--  libbcachefs/btree_key_cache.c              |  33
-rw-r--r--  libbcachefs/btree_types.h                  |  11
-rw-r--r--  libbcachefs/btree_update.h                 |   2
-rw-r--r--  libbcachefs/btree_update_interior.c        |  90
-rw-r--r--  libbcachefs/btree_update_leaf.c            |  73
-rw-r--r--  libbcachefs/buckets.c                      |  14
-rw-r--r--  libbcachefs/buckets.h                      |   6
-rw-r--r--  libbcachefs/buckets_types.h                |   1
-rw-r--r--  libbcachefs/buckets_waiting_for_journal.c  |   4
-rw-r--r--  libbcachefs/error.c                        |   4
-rw-r--r--  libbcachefs/fs.c                           |   4
-rw-r--r--  libbcachefs/inode.c                        |  15
-rw-r--r--  libbcachefs/inode.h                        |   2
-rw-r--r--  libbcachefs/journal_io.c                   |   2
-rw-r--r--  libbcachefs/movinggc.c                     | 155
-rw-r--r--  libbcachefs/recovery.c                     |  23
-rw-r--r--  libbcachefs/replicas.c                     |  12
-rw-r--r--  libbcachefs/super.c                        |   7
28 files changed, 685 insertions, 477 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 71e83e28..4bc1040c 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-bf340e68c74cdb70c692698ef7367b9dc6f6e61f
+b84661c042c7d5caaab3f79661d04789070bea78
diff --git a/cmd_migrate.c b/cmd_migrate.c
index fc863f89..4772b3bd 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -328,7 +328,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
bch2_bkey_append_ptr(&e->k_i, (struct bch_extent_ptr) {
.offset = physical,
.dev = 0,
- .gen = bucket(ca, b)->mark.gen,
+ .gen = *bucket_gen(ca, b),
});
ret = bch2_disk_reservation_get(c, &res, sectors, 1,
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 2bfbfadb..6d039ea3 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -60,6 +60,7 @@
#define unlikely(x) __builtin_expect(!!(x), 0)
#define unreachable() __builtin_unreachable()
#define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
+#define fallthrough __attribute__((__fallthrough__))
#define ___PASTE(a,b) a##b
#define __PASTE(a,b) ___PASTE(a,b)
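
The compiler.h hunk above adds a fallthrough macro wrapping the compiler's implicit-fallthrough attribute, so deliberate case fall-through can be annotated and -Wimplicit-fallthrough kept quiet. A minimal standalone sketch of how such an annotation is used (illustration only, not part of this commit; builds with GCC or Clang):

#include <stdio.h>

#define fallthrough __attribute__((__fallthrough__))

static const char *classify(int n)
{
	switch (n) {
	case 0:
		/* deliberate fall-through: zero is also even */
		fallthrough;
	case 2:
	case 4:
		return "even";
	default:
		return "odd";
	}
}

int main(void)
{
	printf("%s %s %s\n", classify(0), classify(4), classify(3)); /* even even odd */
	return 0;
}
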
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index 8f10d13b..36c4c884 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -658,6 +658,12 @@ DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas,
TP_ARGS(trans_fn, caller_ip)
);
+DEFINE_EVENT(transaction_restart, trans_restart_key_cache_raced,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip),
+ TP_ARGS(trans_fn, caller_ip)
+);
+
DECLARE_EVENT_CLASS(transaction_restart_iter,
TP_PROTO(const char *trans_fn,
unsigned long caller_ip,
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 7ad16c21..0a5ec99e 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -39,15 +39,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#undef x
};
-struct bkey_alloc_buf {
- struct bkey_i k;
- struct bch_alloc_v3 v;
-
-#define x(_name, _bits) + _bits / 8
- u8 _pad[0 + BCH_ALLOC_FIELDS_V2()];
-#undef x
-} __attribute__((packed, aligned(8)));
-
/* Persistent alloc info: */
static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
@@ -254,24 +245,25 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
return ret;
}
-static void bch2_alloc_pack(struct bch_fs *c,
- struct bkey_alloc_buf *dst,
- const struct bkey_alloc_unpacked src)
+struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *trans,
+ const struct bkey_alloc_unpacked src)
{
- bch2_alloc_pack_v3(dst, src);
+ struct bkey_alloc_buf *dst;
+
+ dst = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
+ if (!IS_ERR(dst))
+ bch2_alloc_pack_v3(dst, src);
+
+ return dst;
}
int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_alloc_unpacked *u, unsigned trigger_flags)
{
- struct bkey_alloc_buf *a;
-
- a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
- if (IS_ERR(a))
- return PTR_ERR(a);
+ struct bkey_alloc_buf *a = bch2_alloc_pack(trans, *u);
- bch2_alloc_pack(trans->c, a, *u);
- return bch2_trans_update(trans, iter, &a->k, trigger_flags);
+ return PTR_ERR_OR_ZERO(a) ?:
+ bch2_trans_update(trans, iter, &a->k, trigger_flags);
}
static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
@@ -341,7 +333,7 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
#undef x
}
-int bch2_alloc_read(struct bch_fs *c)
+int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only)
{
struct btree_trans trans;
struct btree_iter iter;
@@ -352,108 +344,43 @@ int bch2_alloc_read(struct bch_fs *c)
int ret;
bch2_trans_init(&trans, c, 0, 0);
- down_read(&c->gc_lock);
for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
- if (!bkey_is_alloc(k.k))
- continue;
-
ca = bch_dev_bkey_exists(c, k.k->p.inode);
- g = bucket(ca, k.k->p.offset);
+ g = __bucket(ca, k.k->p.offset, gc);
u = bch2_alloc_unpack(k);
- *bucket_gen(ca, k.k->p.offset) = u.gen;
+ if (!gc)
+ *bucket_gen(ca, k.k->p.offset) = u.gen;
+
g->_mark.gen = u.gen;
- g->_mark.data_type = u.data_type;
- g->_mark.dirty_sectors = u.dirty_sectors;
- g->_mark.cached_sectors = u.cached_sectors;
- g->_mark.stripe = u.stripe != 0;
- g->stripe = u.stripe;
- g->stripe_redundancy = u.stripe_redundancy;
g->io_time[READ] = u.read_time;
g->io_time[WRITE] = u.write_time;
- g->oldest_gen = u.oldest_gen;
+ g->oldest_gen = !gc ? u.oldest_gen : u.gen;
g->gen_valid = 1;
- }
- bch2_trans_iter_exit(&trans, &iter);
- up_read(&c->gc_lock);
- bch2_trans_exit(&trans);
+ if (!gc ||
+ (metadata_only &&
+ (u.data_type == BCH_DATA_user ||
+ u.data_type == BCH_DATA_cached ||
+ u.data_type == BCH_DATA_parity))) {
+ g->_mark.data_type = u.data_type;
+ g->_mark.dirty_sectors = u.dirty_sectors;
+ g->_mark.cached_sectors = u.cached_sectors;
+ g->_mark.stripe = u.stripe != 0;
+ g->stripe = u.stripe;
+ g->stripe_redundancy = u.stripe_redundancy;
+ }
- if (ret) {
- bch_err(c, "error reading alloc info: %i", ret);
- return ret;
}
+ bch2_trans_iter_exit(&trans, &iter);
- return 0;
-}
-
-static int bch2_alloc_write_key(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c k;
- struct bkey_alloc_unpacked old_u, new_u;
- int ret;
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_btree_key_cache_flush(trans,
- BTREE_ID_alloc, iter->pos);
- if (ret)
- goto err;
+ bch2_trans_exit(&trans);
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
if (ret)
- goto err;
-
- old_u = bch2_alloc_unpack(k);
- new_u = alloc_mem_to_key(c, iter);
-
- if (!bkey_alloc_unpacked_cmp(old_u, new_u))
- return 0;
-
- ret = bch2_alloc_write(trans, iter, &new_u,
- BTREE_TRIGGER_NORUN) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|flags);
-err:
- if (ret == -EINTR)
- goto retry;
- return ret;
-}
-
-int bch2_alloc_write_all(struct bch_fs *c, unsigned flags)
-{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
-
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
- for_each_member_device(ca, c, i) {
- bch2_btree_iter_set_pos(&iter,
- POS(ca->dev_idx, ca->mi.first_bucket));
+ bch_err(c, "error reading alloc info: %i", ret);
- while (iter.pos.offset < ca->mi.nbuckets) {
- ret = bch2_alloc_write_key(&trans, &iter, flags);
- if (ret) {
- percpu_ref_put(&ca->ref);
- goto err;
- }
- bch2_btree_iter_advance(&iter);
- }
- }
-err:
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
return ret;
}
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index 86b64177..98c7866e 100644
--- a/libbcachefs/alloc_background.h
+++ b/libbcachefs/alloc_background.h
@@ -38,40 +38,23 @@ static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
;
}
+struct bkey_alloc_buf {
+ struct bkey_i k;
+ struct bch_alloc_v3 v;
+
+#define x(_name, _bits) + _bits / 8
+ u8 _pad[0 + BCH_ALLOC_FIELDS_V2()];
+#undef x
+} __attribute__((packed, aligned(8)));
+
struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
+struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *,
+ const struct bkey_alloc_unpacked);
int bch2_alloc_write(struct btree_trans *, struct btree_iter *,
struct bkey_alloc_unpacked *, unsigned);
int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
-static inline struct bkey_alloc_unpacked
-alloc_mem_to_key(struct bch_fs *c, struct btree_iter *iter)
-{
- struct bch_dev *ca;
- struct bucket *g;
- struct bkey_alloc_unpacked ret;
-
- percpu_down_read(&c->mark_lock);
- ca = bch_dev_bkey_exists(c, iter->pos.inode);
- g = bucket(ca, iter->pos.offset);
- ret = (struct bkey_alloc_unpacked) {
- .dev = iter->pos.inode,
- .bucket = iter->pos.offset,
- .gen = g->mark.gen,
- .oldest_gen = g->oldest_gen,
- .data_type = g->mark.data_type,
- .dirty_sectors = g->mark.dirty_sectors,
- .cached_sectors = g->mark.cached_sectors,
- .read_time = g->io_time[READ],
- .write_time = g->io_time[WRITE],
- .stripe = g->stripe,
- .stripe_redundancy = g->stripe_redundancy,
- };
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
-
#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
@@ -101,7 +84,7 @@ static inline bool bkey_is_alloc(const struct bkey *k)
k->type == KEY_TYPE_alloc_v3;
}
-int bch2_alloc_read(struct bch_fs *);
+int bch2_alloc_read(struct bch_fs *, bool, bool);
static inline void bch2_wake_allocator(struct bch_dev *ca)
{
@@ -139,7 +122,6 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
-int bch2_alloc_write_all(struct bch_fs *, unsigned);
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index a28ddcd5..eec02f8a 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -451,7 +451,8 @@ struct bch_dev {
* Or rcu_read_lock(), but only for ptr_stale():
*/
struct bucket_array __rcu *buckets[2];
- struct bucket_gens *bucket_gens;
+ struct bucket_gens __rcu *bucket_gens;
+ u8 *oldest_gen;
unsigned long *buckets_nouse;
struct rw_semaphore bucket_lock;
@@ -536,7 +537,6 @@ enum {
/* misc: */
BCH_FS_NEED_ANOTHER_GC,
BCH_FS_DELETED_NODES,
- BCH_FS_NEED_ALLOC_WRITE,
BCH_FS_REBUILD_REPLICAS,
BCH_FS_HOLD_BTREE_WRITES,
};
@@ -716,6 +716,7 @@ struct bch_fs {
bool btree_trans_barrier_initialized;
struct btree_key_cache btree_key_cache;
+ unsigned btree_key_cache_btrees;
struct workqueue_struct *btree_update_wq;
struct workqueue_struct *btree_io_complete_wq;
@@ -952,6 +953,11 @@ static inline size_t btree_sectors(const struct bch_fs *c)
return c->opts.btree_node_size >> 9;
}
+static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
+{
+ return c->btree_key_cache_btrees & (1U << btree);
+}
+
static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
{
struct timespec64 t;
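
btree_key_cache_btrees in the bcachefs.h hunk above is a bitmask with one bit per btree ID, and btree_id_cached() tests it to decide whether iterators on that btree may use the key cache. A standalone sketch of the same bitmask pattern (the enum and struct below are simplified stand-ins, not the real bcachefs definitions):

#include <stdbool.h>
#include <stdio.h>

/* stand-ins for the real bcachefs enum btree_id and struct bch_fs: */
enum btree_id { BTREE_ID_extents, BTREE_ID_inodes, BTREE_ID_alloc, BTREE_ID_NR };

struct fs { unsigned btree_key_cache_btrees; };

static bool btree_id_cached(const struct fs *c, enum btree_id btree)
{
	return c->btree_key_cache_btrees & (1U << btree);
}

int main(void)
{
	/* enable the key cache for the inodes and alloc btrees only: */
	struct fs c = { .btree_key_cache_btrees =
		(1U << BTREE_ID_inodes)|(1U << BTREE_ID_alloc) };

	printf("inodes: %d extents: %d\n",
	       btree_id_cached(&c, BTREE_ID_inodes),   /* 1 */
	       btree_id_cached(&c, BTREE_ID_extents)); /* 0 */
	return 0;
}
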
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 809c9a76..7cab220c 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -9,6 +9,7 @@
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
+#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "btree_io.h"
@@ -533,7 +534,6 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- struct bucket *g2 = PTR_BUCKET(ca, &p.ptr);
enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
if (fsck_err_on(!g->gen_valid, c,
@@ -544,9 +544,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
p.ptr.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
if (!p.ptr.cached) {
- g2->_mark.gen = g->_mark.gen = p.ptr.gen;
- g2->gen_valid = g->gen_valid = true;
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ g->_mark.gen = p.ptr.gen;
+ g->gen_valid = true;
} else {
do_update = true;
}
@@ -560,13 +559,12 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
p.ptr.gen, g->mark.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
if (!p.ptr.cached) {
- g2->_mark.gen = g->_mark.gen = p.ptr.gen;
- g2->gen_valid = g->gen_valid = true;
- g2->_mark.data_type = 0;
- g2->_mark.dirty_sectors = 0;
- g2->_mark.cached_sectors = 0;
+ g->_mark.gen = p.ptr.gen;
+ g->gen_valid = true;
+ g->_mark.data_type = 0;
+ g->_mark.dirty_sectors = 0;
+ g->_mark.cached_sectors = 0;
set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
} else {
do_update = true;
}
@@ -603,8 +601,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
bch2_data_types[data_type],
(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
if (data_type == BCH_DATA_btree) {
- g2->_mark.data_type = g->_mark.data_type = data_type;
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ g->_mark.data_type = data_type;
set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
} else {
do_update = true;
@@ -1169,13 +1166,14 @@ static int bch2_gc_done(struct bch_fs *c,
unsigned i, dev;
int ret = 0;
+ percpu_down_write(&c->mark_lock);
+
#define copy_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
if (verify) \
fsck_err(c, _msg ": got %llu, should be %llu" \
, ##__VA_ARGS__, dst->_f, src->_f); \
dst->_f = src->_f; \
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_stripe_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
@@ -1185,18 +1183,6 @@ static int bch2_gc_done(struct bch_fs *c,
iter.pos, ##__VA_ARGS__, \
dst->_f, src->_f); \
dst->_f = src->_f; \
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
- }
-#define copy_bucket_field(_f) \
- if (dst->b[b]._f != src->b[b]._f) { \
- if (verify) \
- fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \
- ": got %u, should be %u", dev, b, \
- dst->b[b].mark.gen, \
- bch2_data_types[dst->b[b].mark.data_type],\
- dst->b[b]._f, src->b[b]._f); \
- dst->b[b]._f = src->b[b]._f; \
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_dev_field(_f, _msg, ...) \
copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
@@ -1207,36 +1193,18 @@ static int bch2_gc_done(struct bch_fs *c,
bch2_fs_usage_acc_to_base(c, i);
for_each_member_device(ca, c, dev) {
- struct bucket_array *dst = __bucket_array(ca, 0);
- struct bucket_array *src = __bucket_array(ca, 1);
- size_t b;
-
- for (b = 0; b < src->nbuckets; b++) {
- copy_bucket_field(_mark.gen);
- copy_bucket_field(_mark.data_type);
- copy_bucket_field(_mark.stripe);
- copy_bucket_field(_mark.dirty_sectors);
- copy_bucket_field(_mark.cached_sectors);
- copy_bucket_field(stripe_redundancy);
- copy_bucket_field(stripe);
-
- dst->b[b].oldest_gen = src->b[b].oldest_gen;
- }
-
- {
- struct bch_dev_usage *dst = ca->usage_base;
- struct bch_dev_usage *src = (void *)
- bch2_acc_percpu_u64s((void *) ca->usage_gc,
- dev_usage_u64s());
-
- copy_dev_field(buckets_ec, "buckets_ec");
- copy_dev_field(buckets_unavailable, "buckets_unavailable");
-
- for (i = 0; i < BCH_DATA_NR; i++) {
- copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
- copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
- copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
- }
+ struct bch_dev_usage *dst = ca->usage_base;
+ struct bch_dev_usage *src = (void *)
+ bch2_acc_percpu_u64s((void *) ca->usage_gc,
+ dev_usage_u64s());
+
+ copy_dev_field(buckets_ec, "buckets_ec");
+ copy_dev_field(buckets_unavailable, "buckets_unavailable");
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
+ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
+ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
}
};
@@ -1278,7 +1246,6 @@ static int bch2_gc_done(struct bch_fs *c,
#undef copy_fs_field
#undef copy_dev_field
-#undef copy_bucket_field
#undef copy_stripe_field
#undef copy_field
fsck_err:
@@ -1286,6 +1253,8 @@ fsck_err:
percpu_ref_put(&ca->ref);
if (ret)
bch_err(c, "%s: ret %i", __func__, ret);
+
+ percpu_up_write(&c->mark_lock);
return ret;
}
@@ -1308,15 +1277,6 @@ static int bch2_gc_start(struct bch_fs *c,
BUG_ON(ca->buckets[1]);
BUG_ON(ca->usage_gc);
- ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
- ca->mi.nbuckets * sizeof(struct bucket),
- GFP_KERNEL|__GFP_ZERO);
- if (!ca->buckets[1]) {
- percpu_ref_put(&ca->ref);
- bch_err(c, "error allocating ca->buckets[gc]");
- return -ENOMEM;
- }
-
ca->usage_gc = alloc_percpu(struct bch_dev_usage);
if (!ca->usage_gc) {
bch_err(c, "error allocating ca->usage_gc");
@@ -1325,33 +1285,151 @@ static int bch2_gc_start(struct bch_fs *c,
}
}
- percpu_down_write(&c->mark_lock);
+ return 0;
+}
+
+static int bch2_alloc_write_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ bool initial, bool metadata_only)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
+ struct bucket *g;
+ struct bkey_s_c k;
+ struct bkey_alloc_unpacked old_u, new_u, gc_u;
+ struct bkey_alloc_buf *a;
+ int ret;
+
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ old_u = new_u = bch2_alloc_unpack(k);
+
+ percpu_down_read(&c->mark_lock);
+ g = gc_bucket(ca, iter->pos.offset);
+ gc_u = (struct bkey_alloc_unpacked) {
+ .dev = iter->pos.inode,
+ .bucket = iter->pos.offset,
+ .gen = g->mark.gen,
+ .oldest_gen = g->oldest_gen,
+ .data_type = g->mark.data_type,
+ .dirty_sectors = g->mark.dirty_sectors,
+ .cached_sectors = g->mark.cached_sectors,
+ .read_time = g->io_time[READ],
+ .write_time = g->io_time[WRITE],
+ .stripe = g->stripe,
+ .stripe_redundancy = g->stripe_redundancy,
+ };
+ percpu_up_read(&c->mark_lock);
+
+ if (metadata_only &&
+ gc_u.data_type != BCH_DATA_sb &&
+ gc_u.data_type != BCH_DATA_journal &&
+ gc_u.data_type != BCH_DATA_btree)
+ return 0;
+
+ if (!bkey_alloc_unpacked_cmp(old_u, gc_u) ||
+ gen_after(old_u.gen, gc_u.gen))
+ return 0;
+
+#define copy_bucket_field(_f) \
+ if (fsck_err_on(new_u._f != gc_u._f, c, \
+ "bucket %llu:%llu gen %u data type %s has wrong " #_f \
+ ": got %u, should be %u", \
+ iter->pos.inode, iter->pos.offset, \
+ new_u.gen, \
+ bch2_data_types[new_u.data_type], \
+ new_u._f, gc_u._f)) \
+ new_u._f = gc_u._f; \
+
+ copy_bucket_field(gen);
+ copy_bucket_field(data_type);
+ copy_bucket_field(stripe);
+ copy_bucket_field(dirty_sectors);
+ copy_bucket_field(cached_sectors);
+ copy_bucket_field(stripe_redundancy);
+ copy_bucket_field(stripe);
+#undef copy_bucket_field
+
+ new_u.oldest_gen = gc_u.oldest_gen;
+
+ if (!bkey_alloc_unpacked_cmp(old_u, new_u))
+ return 0;
+
+ a = bch2_alloc_pack(trans, new_u);
+ if (IS_ERR(a))
+ return PTR_ERR(a);
+
+ ret = initial
+ ? bch2_journal_key_insert(c, BTREE_ID_alloc, 0, &a->k)
+ : bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN);
+fsck_err:
+ return ret;
+}
+
+static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_dev *ca;
+ unsigned i;
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
for_each_member_device(ca, c, i) {
- struct bucket_array *dst = __bucket_array(ca, 1);
- struct bucket_array *src = __bucket_array(ca, 0);
- size_t b;
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, ca->mi.first_bucket),
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_PREFETCH, k, ret) {
+ if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
+ break;
- dst->first_bucket = src->first_bucket;
- dst->nbuckets = src->nbuckets;
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW,
+ bch2_alloc_write_key(&trans, &iter,
+ initial, metadata_only));
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
- for (b = 0; b < src->nbuckets; b++) {
- struct bucket *d = &dst->b[b];
- struct bucket *s = &src->b[b];
+ if (ret) {
+ bch_err(c, "error writing alloc info: %i", ret);
+ percpu_ref_put(&ca->ref);
+ break;
+ }
+ }
- d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
- d->gen_valid = s->gen_valid;
+ bch2_trans_exit(&trans);
+ return ret;
+}
- if (metadata_only &&
- (s->mark.data_type == BCH_DATA_user ||
- s->mark.data_type == BCH_DATA_cached))
- d->_mark = s->mark;
+static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_only)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_member_device(ca, c, i) {
+ struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
+ ca->mi.nbuckets * sizeof(struct bucket),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!buckets) {
+ percpu_ref_put(&ca->ref);
+ percpu_up_write(&c->mark_lock);
+ bch_err(c, "error allocating ca->buckets[gc]");
+ return -ENOMEM;
}
- };
- percpu_up_write(&c->mark_lock);
+ buckets->first_bucket = ca->mi.first_bucket;
+ buckets->nbuckets = ca->mi.nbuckets;
+ rcu_assign_pointer(ca->buckets[1], buckets);
+ };
- return 0;
+ return bch2_alloc_read(c, true, metadata_only);
}
static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only)
@@ -1423,10 +1501,18 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
bkey_reassemble(new, k);
- if (!r->refcount)
+ if (!r->refcount) {
new->k.type = KEY_TYPE_deleted;
- else
+ /*
+ * XXX ugly: bch2_journal_key_insert() queues up
+ * the key for the journal replay code, which
+ * doesn't run the extent overwrite pass
+ */
+ if (initial)
+ new->k.size = 0;
+ } else {
*bkey_refcount(new) = cpu_to_le64(r->refcount);
+ }
ret = initial
? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, new)
@@ -1598,6 +1684,7 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
!bch2_btree_interior_updates_nr_pending(c));
ret = bch2_gc_start(c, metadata_only) ?:
+ bch2_gc_alloc_start(c, initial, metadata_only) ?:
bch2_gc_reflink_start(c, initial, metadata_only);
if (ret)
goto out;
@@ -1665,16 +1752,15 @@ out:
if (!ret) {
bch2_journal_block(&c->journal);
- percpu_down_write(&c->mark_lock);
- ret = bch2_gc_reflink_done(c, initial, metadata_only) ?:
- bch2_gc_stripes_done(c, initial, metadata_only) ?:
+ ret = bch2_gc_stripes_done(c, initial, metadata_only) ?:
+ bch2_gc_reflink_done(c, initial, metadata_only) ?:
+ bch2_gc_alloc_done(c, initial, metadata_only) ?:
bch2_gc_done(c, initial, metadata_only);
bch2_journal_unblock(&c->journal);
- } else {
- percpu_down_write(&c->mark_lock);
}
+ percpu_down_write(&c->mark_lock);
/* Indicates that gc is no longer in progress: */
__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
@@ -1709,9 +1795,8 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
percpu_down_read(&c->mark_lock);
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr);
- if (gen_after(g->mark.gen, ptr->gen) > 16) {
+ if (ptr_stale(ca, ptr) > 16) {
percpu_up_read(&c->mark_lock);
return true;
}
@@ -1719,10 +1804,10 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr);
+ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
- if (gen_after(g->gc_gen, ptr->gen))
- g->gc_gen = ptr->gen;
+ if (gen_after(*gen, ptr->gen))
+ *gen = ptr->gen;
}
percpu_up_read(&c->mark_lock);
@@ -1733,23 +1818,22 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
* For recalculating oldest gen, we only need to walk keys in leaf nodes; btree
* node pointers currently never have cached pointers that can become stale:
*/
-static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
+static int bch2_gc_btree_gens(struct btree_trans *trans, enum btree_id btree_id)
{
- struct btree_trans trans;
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_buf sk;
int ret = 0, commit_err = 0;
bch2_bkey_buf_init(&sk);
- bch2_trans_init(&trans, c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN,
+ bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
BTREE_ITER_PREFETCH|
BTREE_ITER_NOT_EXTENTS|
BTREE_ITER_ALL_SNAPSHOTS);
- while ((bch2_trans_begin(&trans),
+ while ((bch2_trans_begin(trans),
k = bch2_btree_iter_peek(&iter)).k) {
ret = bkey_err(k);
@@ -1765,10 +1849,10 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
bch2_extent_normalize(c, bkey_i_to_s(sk.k));
commit_err =
- bch2_trans_update(&trans, &iter, sk.k, 0) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOWAIT|
- BTREE_INSERT_NOFAIL);
+ bch2_trans_update(trans, &iter, sk.k, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOWAIT|
+ BTREE_INSERT_NOFAIL);
if (commit_err == -EINTR) {
commit_err = 0;
continue;
@@ -1777,20 +1861,42 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
bch2_btree_iter_advance(&iter);
}
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
- bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
return ret;
}
+static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
+ struct bkey_s_c k;
+ struct bkey_alloc_unpacked u;
+ int ret;
+
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ u = bch2_alloc_unpack(k);
+
+ if (u.oldest_gen == ca->oldest_gen[iter->pos.offset])
+ return 0;
+
+ u.oldest_gen = ca->oldest_gen[iter->pos.offset];
+
+ return bch2_alloc_write(trans, iter, &u, BTREE_TRIGGER_NORUN);
+}
+
int bch2_gc_gens(struct bch_fs *c)
{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
struct bch_dev *ca;
- struct bucket_array *buckets;
- struct bucket *g;
- u64 start_time = local_clock();
+ u64 b, start_time = local_clock();
unsigned i;
int ret;
@@ -1800,21 +1906,32 @@ int bch2_gc_gens(struct bch_fs *c)
* lock at the start of going RO, thus the gc thread may get stuck:
*/
down_read(&c->gc_lock);
+ bch2_trans_init(&trans, c, 0, 0);
for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
+ struct bucket_gens *gens;
+
+ BUG_ON(ca->oldest_gen);
+
+ ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL);
+ if (!ca->oldest_gen) {
+ percpu_ref_put(&ca->ref);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ gens = bucket_gens(ca);
- for_each_bucket(g, buckets)
- g->gc_gen = g->mark.gen;
- up_read(&ca->bucket_lock);
+ for (b = gens->first_bucket;
+ b < gens->nbuckets; b++)
+ ca->oldest_gen[b] = gens->b[b];
}
for (i = 0; i < BTREE_ID_NR; i++)
if ((1 << i) & BTREE_ID_HAS_PTRS) {
c->gc_gens_btree = i;
c->gc_gens_pos = POS_MIN;
- ret = bch2_gc_btree_gens(c, i);
+ ret = bch2_gc_btree_gens(&trans, i);
if (ret) {
bch_err(c, "error recalculating oldest_gen: %i", ret);
goto err;
@@ -1822,12 +1939,28 @@ int bch2_gc_gens(struct bch_fs *c)
}
for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, ca->mi.first_bucket),
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_PREFETCH, k, ret) {
+ if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
+ break;
+
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ bch2_alloc_write_oldest_gen(&trans, &iter));
+ if (ret) {
+ bch_err(c, "error writing oldest_gen: %i", ret);
+ break;
+ }
+ }
+ bch2_trans_iter_exit(&trans, &iter);
- for_each_bucket(g, buckets)
- g->oldest_gen = g->gc_gen;
- up_read(&ca->bucket_lock);
+ if (ret) {
+ percpu_ref_put(&ca->ref);
+ break;
+ }
}
c->gc_gens_btree = 0;
@@ -1837,6 +1970,12 @@ int bch2_gc_gens(struct bch_fs *c)
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
err:
+ for_each_member_device(ca, c, i) {
+ kvfree(ca->oldest_gen);
+ ca->oldest_gen = NULL;
+ }
+
+ bch2_trans_exit(&trans);
up_read(&c->gc_lock);
return ret;
}
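
Bucket generations in the gc code above (gen_after(), ptr_stale() > 16, oldest_gen) are 8-bit counters that wrap around, so "newer than" must be computed from the modular difference rather than by magnitude comparison. A standalone sketch of wrap-safe generation comparison (my own illustration of the idea, not the exact bcachefs helper):

#include <stdio.h>

typedef unsigned char u8;

/* Returns how far gen a is ahead of gen b, treating the u8 difference
 * as wrap-safe: differences > 128 are taken to mean a is behind b. */
static u8 gen_after(u8 a, u8 b)
{
	u8 r = a - b;

	return r > 128 ? 0 : r;
}

int main(void)
{
	printf("%u\n", gen_after(5, 3));   /* 2: a is newer */
	printf("%u\n", gen_after(3, 5));   /* 0: a is older */
	printf("%u\n", gen_after(2, 250)); /* 8: still newer across the wrap */
	return 0;
}
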
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index efe9b8cb..8505ad5c 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -1717,8 +1717,8 @@ bch2_btree_path_make_mut(struct btree_trans *trans,
return path;
}
-static struct btree_path * __must_check
-btree_path_set_pos(struct btree_trans *trans,
+struct btree_path * __must_check
+bch2_btree_path_set_pos(struct btree_trans *trans,
struct btree_path *path, struct bpos new_pos,
bool intent, unsigned long ip)
{
@@ -1932,7 +1932,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
path_pos->btree_id == btree_id &&
path_pos->level == level) {
__btree_path_get(path_pos, intent);
- path = btree_path_set_pos(trans, path_pos, pos, intent, ip);
+ path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
} else {
path = btree_path_alloc(trans, path_pos);
path_pos = NULL;
@@ -1983,13 +1983,13 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct
struct bkey_s_c k;
- BUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
-
if (!path->cached) {
struct btree_path_level *l = path_l(path);
- struct bkey_packed *_k =
- bch2_btree_node_iter_peek_all(&l->iter, l->b);
+ struct bkey_packed *_k;
+
+ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+ _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0);
@@ -1999,12 +1999,15 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct
} else {
struct bkey_cached *ck = (void *) path->l[0].b;
- EBUG_ON(path->btree_id != ck->key.btree_id ||
- bkey_cmp(path->pos, ck->key.pos));
+ EBUG_ON(ck &&
+ (path->btree_id != ck->key.btree_id ||
+ bkey_cmp(path->pos, ck->key.pos)));
- /* BTREE_ITER_CACHED_NOFILL? */
- if (unlikely(!ck->valid))
- goto hole;
+ /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? */
+ if (unlikely(!ck || !ck->valid))
+ return bkey_s_c_null;
+
+ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
k = bkey_i_to_s_c(ck->k);
}
@@ -2029,7 +2032,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
{
int ret;
- iter->path = btree_path_set_pos(iter->trans, iter->path,
+ iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
btree_iter_search_key(iter),
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
@@ -2066,7 +2069,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
bkey_init(&iter->k);
iter->k.p = iter->pos = b->key.k.p;
- iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
iter->path->should_be_locked = true;
@@ -2128,7 +2131,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
* the next child node
*/
path = iter->path =
- btree_path_set_pos(trans, path, bpos_successor(iter->pos),
+ bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
@@ -2151,7 +2154,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
bkey_init(&iter->k);
iter->k.p = iter->pos = b->key.k.p;
- iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
iter->path->should_be_locked = true;
@@ -2247,18 +2250,52 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
return k;
}
+/*
+ * Checks btree key cache for key at iter->pos and returns it if present, or
+ * bkey_s_c_null:
+ */
+static noinline
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bch_fs *c = trans->c;
+ struct bkey u;
+ int ret;
+
+ if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
+ return bkey_s_c_null;
+
+ if (!iter->key_cache_path)
+ iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
+ iter->flags & BTREE_ITER_INTENT, 0,
+ iter->flags|BTREE_ITER_CACHED,
+ _THIS_IP_);
+
+ iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
+
+ iter->key_cache_path->should_be_locked = true;
+
+ return bch2_btree_path_peek_slot(iter->key_cache_path, &u);
+}
+
static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
{
struct btree_trans *trans = iter->trans;
struct bkey_i *next_update;
- struct bkey_s_c k;
+ struct bkey_s_c k, k2;
int ret;
EBUG_ON(iter->path->cached || iter->path->level);
bch2_btree_iter_verify(iter);
while (1) {
- iter->path = btree_path_set_pos(trans, iter->path, search_key,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
@@ -2270,8 +2307,23 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
goto out;
}
+ iter->path->should_be_locked = true;
+
k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k);
+ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+ k.k &&
+ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+ ret = bkey_err(k2);
+ if (ret) {
+ k = k2;
+ goto out;
+ }
+
+ k = k2;
+ iter->k = *k.k;
+ }
+
if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
k = btree_trans_peek_journal(trans, iter, k);
@@ -2368,7 +2420,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
__btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
iter->update_path = iter->path;
- iter->update_path = btree_path_set_pos(trans,
+ iter->update_path = bch2_btree_path_set_pos(trans,
iter->update_path, pos,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
@@ -2407,7 +2459,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
iter->pos = bkey_start_pos(k.k);
- iter->path = btree_path_set_pos(trans, iter->path, k.k->p,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
BUG_ON(!iter->path->nodes_locked);
@@ -2471,7 +2523,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
search_key.snapshot = U32_MAX;
while (1) {
- iter->path = btree_path_set_pos(trans, iter->path, search_key,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
@@ -2602,7 +2654,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
}
search_key = btree_iter_search_key(iter);
- iter->path = btree_path_set_pos(trans, iter->path, search_key,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
@@ -2631,6 +2683,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
goto out;
}
+ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+ (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
+ if (!bkey_err(k))
+ iter->k = *k.k;
+ goto out;
+ }
+
k = bch2_btree_path_peek_slot(iter->path, &iter->k);
} else {
struct bpos next;
@@ -2820,8 +2879,12 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
if (iter->update_path)
bch2_path_put(trans, iter->update_path,
iter->flags & BTREE_ITER_INTENT);
+ if (iter->key_cache_path)
+ bch2_path_put(trans, iter->key_cache_path,
+ iter->flags & BTREE_ITER_INTENT);
iter->path = NULL;
iter->update_path = NULL;
+ iter->key_cache_path = NULL;
}
static void __bch2_trans_iter_init(struct btree_trans *trans,
@@ -2849,9 +2912,16 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags))
flags |= BTREE_ITER_WITH_JOURNAL;
+ if (!btree_id_cached(trans->c, btree_id)) {
+ flags &= ~BTREE_ITER_CACHED;
+ flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+ } else if (!(flags & BTREE_ITER_CACHED))
+ flags |= BTREE_ITER_WITH_KEY_CACHE;
+
iter->trans = trans;
iter->path = NULL;
iter->update_path = NULL;
+ iter->key_cache_path = NULL;
iter->btree_id = btree_id;
iter->min_depth = depth;
iter->flags = flags;
@@ -2902,6 +2972,7 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
__btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
if (src->update_path)
__btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
+ dst->key_cache_path = NULL;
}
void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 5205d53c..759c7b52 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -50,11 +50,6 @@ static inline struct btree *btree_node_parent(struct btree_path *path,
return btree_path_node(path, b->c.level + 1);
}
-static inline int btree_iter_err(const struct btree_iter *iter)
-{
- return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
-}
-
/* Iterate over paths within a transaction: */
static inline struct btree_path *
@@ -132,6 +127,9 @@ __trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
struct btree_path * __must_check
bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
bool, unsigned long);
+struct btree_path * __must_check
+bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *,
+ struct bpos, bool, unsigned long);
int __must_check bch2_btree_path_traverse(struct btree_trans *,
struct btree_path *, unsigned);
struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index faed51e7..df016c98 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -208,19 +208,21 @@ static int btree_key_cache_fill(struct btree_trans *trans,
struct btree_path *ck_path,
struct bkey_cached *ck)
{
- struct btree_iter iter;
+ struct btree_path *path;
struct bkey_s_c k;
unsigned new_u64s = 0;
struct bkey_i *new_k = NULL;
+ struct bkey u;
int ret;
- bch2_trans_iter_init(trans, &iter, ck->key.btree_id,
- ck->key.pos, BTREE_ITER_SLOTS);
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
+ path = bch2_path_get(trans, ck->key.btree_id,
+ ck->key.pos, 0, 0, 0, _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, path, 0);
if (ret)
goto err;
+ k = bch2_btree_path_peek_slot(path, &u);
+
if (!bch2_btree_node_relock(trans, ck_path, 0)) {
trace_trans_restart_relock_key_cache_fill(trans->fn,
_THIS_IP_, ck_path->btree_id, &ck_path->pos);
@@ -261,9 +263,9 @@ static int btree_key_cache_fill(struct btree_trans *trans,
bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
/* We're not likely to need this iterator again: */
- set_btree_iter_dontneed(&iter);
+ path->preserve = false;
err:
- bch2_trans_iter_exit(trans, &iter);
+ bch2_path_put(trans, path, 0);
return ret;
}
@@ -384,21 +386,27 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_CACHED_NOCREATE|
BTREE_ITER_INTENT);
+ b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+
ret = bch2_btree_iter_traverse(&c_iter);
if (ret)
goto out;
ck = (void *) c_iter.path->l[0].b;
- if (!ck ||
- (journal_seq && ck->journal.seq != journal_seq))
+ if (!ck)
goto out;
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- if (!evict)
- goto out;
- goto evict;
+ if (evict)
+ goto evict;
+ goto out;
}
+ BUG_ON(!ck->valid);
+
+ if (journal_seq && ck->journal.seq != journal_seq)
+ goto out;
+
/*
* Since journal reclaim depends on us making progress here, and the
* allocator/copygc depend on journal reclaim making progress, we need
@@ -406,6 +414,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
* */
ret = bch2_btree_iter_traverse(&b_iter) ?:
bch2_trans_update(trans, &b_iter, ck->k,
+ BTREE_UPDATE_KEY_CACHE_RECLAIM|
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
BTREE_TRIGGER_NORUN) ?:
bch2_trans_commit(trans, NULL, NULL,
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 65f460e3..989129f9 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -202,10 +202,10 @@ struct btree_node_iter {
*/
#define BTREE_ITER_IS_EXTENTS (1 << 4)
#define BTREE_ITER_NOT_EXTENTS (1 << 5)
-#define BTREE_ITER_ERROR (1 << 6)
-#define BTREE_ITER_CACHED (1 << 7)
-#define BTREE_ITER_CACHED_NOFILL (1 << 8)
-#define BTREE_ITER_CACHED_NOCREATE (1 << 9)
+#define BTREE_ITER_CACHED (1 << 6)
+#define BTREE_ITER_CACHED_NOFILL (1 << 7)
+#define BTREE_ITER_CACHED_NOCREATE (1 << 8)
+#define BTREE_ITER_WITH_KEY_CACHE (1 << 9)
#define BTREE_ITER_WITH_UPDATES (1 << 10)
#define BTREE_ITER_WITH_JOURNAL (1 << 11)
#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12)
@@ -277,6 +277,7 @@ struct btree_iter {
struct btree_trans *trans;
struct btree_path *path;
struct btree_path *update_path;
+ struct btree_path *key_cache_path;
enum btree_id btree_id:4;
unsigned min_depth:4;
@@ -636,6 +637,7 @@ static inline bool btree_type_has_snapshots(enum btree_id id)
enum btree_update_flags {
__BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
+ __BTREE_UPDATE_KEY_CACHE_RECLAIM,
__BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
@@ -648,6 +650,7 @@ enum btree_update_flags {
};
#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
+#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 5e5a1b5e..d9a406a2 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -76,8 +76,6 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *,
int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_update_flags);
-int __must_check bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
- struct bkey_i *, enum btree_update_flags);
int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_update_flags);
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 7b8ca115..a0f7a9f0 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -243,6 +243,8 @@ retry:
bch2_alloc_sectors_done(c, wp);
mem_alloc:
b = bch2_btree_node_mem_alloc(c);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
/* we hold cannibalize_lock: */
BUG_ON(IS_ERR(b));
@@ -265,6 +267,9 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
b = as->prealloc_nodes[--as->nr_prealloc_nodes];
+ six_lock_intent(&b->c.lock, NULL, NULL);
+ six_lock_write(&b->c.lock, NULL, NULL);
+
set_btree_node_accessed(b);
set_btree_node_dirty(c, b);
set_btree_node_need_write(b);
@@ -378,7 +383,8 @@ static void bch2_btree_reserve_put(struct btree_update *as)
while (as->nr_prealloc_nodes) {
struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
- six_unlock_write(&b->c.lock);
+ six_lock_intent(&b->c.lock, NULL, NULL);
+ six_lock_write(&b->c.lock, NULL, NULL);
if (c->btree_reserve_cache_nr <
ARRAY_SIZE(c->btree_reserve_cache)) {
@@ -392,10 +398,8 @@ static void bch2_btree_reserve_put(struct btree_update *as)
bch2_open_buckets_put(c, &b->ob);
}
- btree_node_lock_type(c, b, SIX_LOCK_write);
__btree_node_free(c, b);
six_unlock_write(&b->c.lock);
-
six_unlock_intent(&b->c.lock);
}
@@ -403,39 +407,52 @@ static void bch2_btree_reserve_put(struct btree_update *as)
}
static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
- unsigned flags, struct closure *cl)
+ unsigned flags)
{
struct bch_fs *c = as->c;
+ struct closure cl;
struct btree *b;
int ret;
+ closure_init_stack(&cl);
+retry:
+
BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
/*
* Protects reaping from the btree node cache and using the btree node
* open bucket reserve:
+ *
+ * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
+ * blocking on this lock:
*/
- ret = bch2_btree_cache_cannibalize_lock(c, cl);
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
if (ret)
- return ret;
+ goto err;
while (as->nr_prealloc_nodes < nr_nodes) {
b = __bch2_btree_node_alloc(c, &as->disk_res,
flags & BTREE_INSERT_NOWAIT
- ? NULL : cl, flags);
+ ? NULL : &cl, flags);
if (IS_ERR(b)) {
ret = PTR_ERR(b);
- goto err_free;
+ goto err;
}
as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
}
bch2_btree_cache_cannibalize_unlock(c);
+ closure_sync(&cl);
return 0;
-err_free:
+err:
bch2_btree_cache_cannibalize_unlock(c);
- trace_btree_reserve_get_fail(c, nr_nodes, cl);
+ closure_sync(&cl);
+
+ if (ret == -EAGAIN)
+ goto retry;
+
+ trace_btree_reserve_get_fail(c, nr_nodes, &cl);
return ret;
}
@@ -935,7 +952,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
{
struct bch_fs *c = trans->c;
struct btree_update *as;
- struct closure cl;
u64 start_time = local_clock();
int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
? BCH_DISK_RESERVATION_NOFAIL : 0;
@@ -946,9 +962,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
if (flags & BTREE_INSERT_JOURNAL_RESERVED)
journal_flags |= JOURNAL_RES_GET_RESERVED;
-
- closure_init_stack(&cl);
-retry:
+ if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
+ journal_flags |= JOURNAL_RES_GET_NONBLOCK;
/*
* XXX: figure out how far we might need to split,
@@ -1003,30 +1018,16 @@ retry:
if (ret)
goto err;
+ bch2_trans_unlock(trans);
+
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES,
- journal_flags|JOURNAL_RES_GET_NONBLOCK);
- if (ret == -EAGAIN) {
- bch2_trans_unlock(trans);
-
- if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
- bch2_btree_update_free(as);
- btree_trans_restart(trans);
- return ERR_PTR(ret);
- }
-
- ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
- BTREE_UPDATE_JOURNAL_RES,
- journal_flags);
- if (ret) {
- trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_);
- goto err;
- }
-
- if (!bch2_trans_relock(trans)) {
- ret = -EINTR;
- goto err;
- }
+ journal_flags);
+ if (ret) {
+ bch2_btree_update_free(as);
+ trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_);
+ btree_trans_restart(trans);
+ return ERR_PTR(ret);
}
ret = bch2_disk_reservation_get(c, &as->disk_res,
@@ -1036,10 +1037,15 @@ retry:
if (ret)
goto err;
- ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl);
+ ret = bch2_btree_reserve_get(as, nr_nodes, flags);
if (ret)
goto err;
+ if (!bch2_trans_relock(trans)) {
+ ret = -EINTR;
+ goto err;
+ }
+
bch2_journal_pin_add(&c->journal,
atomic64_read(&c->journal.seq),
&as->journal, NULL);
@@ -1047,16 +1053,6 @@ retry:
return as;
err:
bch2_btree_update_free(as);
-
- if (ret == -EAGAIN) {
- bch2_trans_unlock(trans);
- closure_sync(&cl);
- ret = -EINTR;
- }
-
- if (ret == -EINTR && bch2_trans_relock(trans))
- goto retry;
-
return ERR_PTR(ret);
}
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 7186457d..9d954537 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -23,6 +23,10 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+ struct bkey_i *, enum btree_update_flags);
+
static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
const struct btree_insert_entry *r)
{
@@ -650,9 +654,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
char buf[200];
bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
- bch_err(c, "invalid bkey %s on insert from %s -> %ps: %s\n",
- buf, trans->fn, (void *) i->ip_allocated, invalid);
- bch2_fatal_error(c);
+ bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n",
+ buf, trans->fn, (void *) i->ip_allocated, invalid);
return -EINVAL;
}
btree_insert_entry_checks(trans, i);
@@ -1358,8 +1361,9 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
return ret;
}
-int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
- struct bkey_i *k, enum btree_update_flags flags)
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_i *k, enum btree_update_flags flags)
{
struct btree_insert_entry *i, n;
@@ -1397,17 +1401,6 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr
!btree_insert_entry_cmp(&n, i)) {
BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
- /*
- * This is a hack to ensure that inode creates update the btree,
- * not the key cache, which helps with cache coherency issues in
- * other areas:
- */
- if (n.cached && !i->cached) {
- i->k = n.k;
- i->flags = n.flags;
- return 0;
- }
-
bch2_path_put(trans, i->path, true);
*i = n;
} else
@@ -1421,12 +1414,17 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr
int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_i *k, enum btree_update_flags flags)
{
+ struct btree_path *path = iter->update_path ?: iter->path;
+ struct bkey_cached *ck;
+ int ret;
+
if (iter->flags & BTREE_ITER_IS_EXTENTS)
return bch2_trans_update_extent(trans, iter, k, flags);
if (bkey_deleted(&k->k) &&
+ !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
- int ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
+ ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
if (unlikely(ret < 0))
return ret;
@@ -1434,8 +1432,45 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
k->k.type = KEY_TYPE_whiteout;
}
- return bch2_trans_update_by_path(trans, iter->update_path ?: iter->path,
- k, flags);
+ if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+ !path->cached &&
+ !path->level &&
+ btree_id_cached(trans->c, path->btree_id)) {
+ if (!iter->key_cache_path ||
+ !iter->key_cache_path->should_be_locked ||
+ bpos_cmp(iter->key_cache_path->pos, k->k.p)) {
+ if (!iter->key_cache_path)
+ iter->key_cache_path =
+ bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_CACHED, _THIS_IP_);
+
+ iter->key_cache_path =
+ bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
+ iter->flags & BTREE_ITER_INTENT,
+ _THIS_IP_);
+
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+ BTREE_ITER_CACHED|
+ BTREE_ITER_CACHED_NOFILL);
+ if (unlikely(ret))
+ return ret;
+
+ ck = (void *) iter->key_cache_path->l[0].b;
+
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_);
+ btree_trans_restart(trans);
+ return -EINTR;
+ }
+
+ iter->key_cache_path->should_be_locked = true;
+ }
+
+ path = iter->key_cache_path;
+ }
+
+ return bch2_trans_update_by_path(trans, path, k, flags);
}
void bch2_trans_commit_hook(struct btree_trans *trans,
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index bf5ad436..b9f09b82 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -520,6 +520,7 @@ static int bch2_mark_alloc(struct btree_trans *trans,
!old_u.data_type != !new_u.data_type &&
new.k->type == KEY_TYPE_alloc_v3) {
struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v;
+ u64 old_journal_seq = le64_to_cpu(v->journal_seq);
BUG_ON(!journal_seq);
@@ -529,7 +530,8 @@ static int bch2_mark_alloc(struct btree_trans *trans,
* to wait on a journal flush before we can reuse the bucket:
*/
new_u.journal_seq = !new_u.data_type &&
- bch2_journal_noflush_seq(&c->journal, journal_seq)
+ (journal_seq == old_journal_seq ||
+ bch2_journal_noflush_seq(&c->journal, old_journal_seq))
? 0 : journal_seq;
v->journal_seq = cpu_to_le64(new_u.journal_seq);
}
@@ -2094,7 +2096,7 @@ static void buckets_free_rcu(struct rcu_head *rcu)
container_of(rcu, struct bucket_array, rcu);
kvpfree(buckets,
- sizeof(struct bucket_array) +
+ sizeof(*buckets) +
buckets->nbuckets * sizeof(struct bucket));
}
@@ -2103,7 +2105,7 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu)
struct bucket_gens *buckets =
container_of(rcu, struct bucket_gens, rcu);
- kvpfree(buckets, sizeof(struct bucket_array) + buckets->nbuckets);
+ kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
}
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
@@ -2213,9 +2215,9 @@ err:
kvpfree(buckets_nouse,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
if (bucket_gens)
- call_rcu(&old_buckets->rcu, bucket_gens_free_rcu);
+ call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
if (buckets)
- call_rcu(&old_buckets->rcu, buckets_free_rcu);
+ call_rcu(&buckets->rcu, buckets_free_rcu);
return ret;
}
@@ -2230,6 +2232,8 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
free_fifo(&ca->free[i]);
kvpfree(ca->buckets_nouse,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
+ kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
+ sizeof(struct bucket_gens) + ca->mi.nbuckets);
kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index d35c96bc..7c6c59c7 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -97,12 +97,6 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
return sector_to_bucket(ca, ptr->offset);
}
-static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
-{
- return bucket(ca, PTR_BUCKET_NR(ca, ptr));
-}
-
static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
{
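
PTR_BUCKET_NR() in the hunk above converts an extent pointer's device offset, measured in 512-byte sectors, into a bucket index via sector_to_bucket(); movinggc's bucket_to_sector() goes the other way. A standalone sketch of that arithmetic under an assumed fixed bucket size (the struct and numbers are hypothetical, not the real bch_dev layout):

#include <stdint.h>
#include <stdio.h>

/* hypothetical device description; the real code reads this from bch_dev: */
struct dev { uint32_t bucket_size; };	/* bucket size in 512-byte sectors */

static uint64_t sector_to_bucket(const struct dev *ca, uint64_t sector)
{
	return sector / ca->bucket_size;
}

static uint64_t bucket_to_sector(const struct dev *ca, uint64_t bucket)
{
	return bucket * ca->bucket_size;
}

int main(void)
{
	struct dev ca = { .bucket_size = 1024 };	/* 512KiB buckets */

	printf("%llu\n", (unsigned long long) sector_to_bucket(&ca, 1 << 20)); /* 1024 */
	printf("%llu\n", (unsigned long long) bucket_to_sector(&ca, 1024));    /* 1048576 */
	return 0;
}
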
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index 24139831..2c73dc60 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -30,7 +30,6 @@ struct bucket {
u64 io_time[2];
u8 oldest_gen;
- u8 gc_gen;
unsigned gen_valid:1;
u8 stripe_redundancy;
u32 stripe;
diff --git a/libbcachefs/buckets_waiting_for_journal.c b/libbcachefs/buckets_waiting_for_journal.c
index 33ae6370..56b37b24 100644
--- a/libbcachefs/buckets_waiting_for_journal.c
+++ b/libbcachefs/buckets_waiting_for_journal.c
@@ -107,6 +107,10 @@ retry:
victim = old;
}
+ /* hashed to same slot 3 times: */
+ if (!victim)
+ break;
+
/* Failed to find an empty slot: */
swap(new, *victim);
last_evicted = victim;
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index 2cea6945..8279a9ba 100644
--- a/libbcachefs/error.c
+++ b/libbcachefs/error.c
@@ -15,7 +15,7 @@ bool bch2_inconsistent_error(struct bch_fs *c)
return false;
case BCH_ON_ERROR_ro:
if (bch2_fs_emergency_read_only(c))
- bch_err(c, "emergency read only");
+ bch_err(c, "inconsistency detected - emergency read only");
return true;
case BCH_ON_ERROR_panic:
panic(bch2_fmt(c, "panic after error"));
@@ -35,7 +35,7 @@ void bch2_topology_error(struct bch_fs *c)
void bch2_fatal_error(struct bch_fs *c)
{
if (bch2_fs_emergency_read_only(c))
- bch_err(c, "emergency read only");
+ bch_err(c, "fatal error - emergency read only");
}
void bch2_io_error_work(struct work_struct *work)
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 472c03d2..91fa1897 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -104,7 +104,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans,
bch2_assert_pos_locked(trans, BTREE_ID_inodes,
POS(0, bi->bi_inum),
- 0 && c->opts.inodes_use_key_cache);
+ c->opts.inodes_use_key_cache);
set_nlink(&inode->v, bch2_inode_nlink_get(bi));
i_uid_write(&inode->v, bi->bi_uid);
@@ -1471,7 +1471,7 @@ static void bch2_evict_inode(struct inode *vinode)
KEY_TYPE_QUOTA_WARN);
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
- bch2_inode_rm(c, inode_inum(inode), true);
+ bch2_inode_rm(c, inode_inum(inode));
}
}
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 3a7c1468..78e2db6c 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -252,15 +252,13 @@ int bch2_inode_peek(struct btree_trans *trans,
u32 snapshot;
int ret;
- if (0 && trans->c->opts.inodes_use_key_cache)
- flags |= BTREE_ITER_CACHED;
-
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
return ret;
bch2_trans_iter_init(trans, iter, BTREE_ID_inodes,
- SPOS(0, inum.inum, snapshot), flags);
+ SPOS(0, inum.inum, snapshot),
+ flags|BTREE_ITER_CACHED);
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
@@ -631,20 +629,16 @@ err:
return ret;
}
-int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached)
+int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
{
struct btree_trans trans;
struct btree_iter iter = { NULL };
struct bkey_i_inode_generation delete;
struct bch_inode_unpacked inode_u;
struct bkey_s_c k;
- unsigned iter_flags = BTREE_ITER_INTENT;
u32 snapshot;
int ret;
- if (0 && cached && c->opts.inodes_use_key_cache)
- iter_flags |= BTREE_ITER_CACHED;
-
bch2_trans_init(&trans, c, 0, 1024);
/*
@@ -668,7 +662,8 @@ retry:
goto err;
bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes,
- SPOS(0, inum.inum, snapshot), iter_flags);
+ SPOS(0, inum.inum, snapshot),
+ BTREE_ITER_INTENT|BTREE_ITER_CACHED);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
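
bch2_inode_peek() and bch2_inode_rm() now request BTREE_ITER_CACHED unconditionally; whether the key cache is actually consulted becomes a per-btree decision, via the btree_key_cache_btrees mask set up in the super.c hunk below. A sketch of how such a gate might look in the iterator paths (hypothetical, not the tree's actual code):

if ((flags & BTREE_ITER_CACHED) &&
    !(trans->c->btree_key_cache_btrees & (1U << btree_id)))
	flags &= ~BTREE_ITER_CACHED;
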
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index 723186d8..77957cc7 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -87,7 +87,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
int bch2_inode_create(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, u32, u64);
-int bch2_inode_rm(struct bch_fs *, subvol_inum, bool);
+int bch2_inode_rm(struct bch_fs *, subvol_inum);
int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *);
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index e566f851..651828b8 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1677,6 +1677,6 @@ no_io:
continue_at(cl, journal_write_done, c->io_complete_wq);
return;
err:
- bch2_inconsistent_error(c);
+ bch2_fatal_error(c);
continue_at(cl, journal_write_done, c->io_complete_wq);
}
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 92f78907..c82ecff3 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -6,6 +6,7 @@
*/
#include "bcachefs.h"
+#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
#include "btree_update.h"
@@ -137,18 +138,106 @@ static inline int fragmentation_cmp(copygc_heap *heap,
return cmp_int(l.fragmentation, r.fragmentation);
}
+static int walk_buckets_to_copygc(struct bch_fs *c)
+{
+ copygc_heap *h = &c->copygc_heap;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_alloc_unpacked u;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode);
+ struct copygc_heap_entry e;
+
+ u = bch2_alloc_unpack(k);
+
+ if (u.data_type != BCH_DATA_user ||
+ u.dirty_sectors >= ca->mi.bucket_size ||
+ bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset))
+ continue;
+
+ e = (struct copygc_heap_entry) {
+ .dev = iter.pos.inode,
+ .gen = u.gen,
+ .replicas = 1 + u.stripe_redundancy,
+ .fragmentation = u.dirty_sectors * (1U << 15)
+ / ca->mi.bucket_size,
+ .sectors = u.dirty_sectors,
+ .offset = bucket_to_sector(ca, iter.pos.offset),
+ };
+ heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
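
The fragmentation score is the bucket's fullness in 15-bit fixed point (0 = empty, 1U << 15 = full), so heap comparisons stay integer-only; full buckets (dirty_sectors >= bucket_size) and open buckets are skipped outright. For example:

/* a 512-sector bucket with 128 dirty sectors: */
unsigned frag = 128 * (1U << 15) / 512;	/* 8192, i.e. 25% full */
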
+static int bucket_inorder_cmp(const void *_l, const void *_r)
+{
+ const struct copygc_heap_entry *l = _l;
+ const struct copygc_heap_entry *r = _r;
+
+ return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset);
+}
+
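
bucket_inorder_cmp() chains its comparisons with the GNU a ?: b shorthand: cmp_int() yields -1/0/1, and the offset comparison only runs when the devs tie. For instance:

int r = cmp_int(2, 2) ?: cmp_int(10, 7);	/* 0 ?: 1 == 1 */
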
+static int check_copygc_was_done(struct bch_fs *c,
+ u64 *sectors_not_moved,
+ u64 *buckets_not_moved)
+{
+ copygc_heap *h = &c->copygc_heap;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_alloc_unpacked u;
+ struct copygc_heap_entry *i;
+ int ret = 0;
+
+ sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL);
+
+ bch2_trans_init(&trans, c, 0, 0);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 0);
+
+ for (i = h->data; i < h->data + h->used; i++) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
+
+ bch2_btree_iter_set_pos(&iter, POS(i->dev, sector_to_bucket(ca, i->offset)));
+
+ ret = lockrestart_do(&trans,
+ bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (ret)
+ break;
+
+ u = bch2_alloc_unpack(k);
+
+ if (u.gen == i->gen && u.dirty_sectors) {
+ *sectors_not_moved += u.dirty_sectors;
+ *buckets_not_moved += 1;
+ }
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
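
Sorting the surviving heap entries into (dev, offset) order lets a single alloc iterator sweep forward to re-read every candidate bucket: a bucket that still has dirty sectors at the gen copygc sampled was not moved. lockrestart_do() retries its expression across transaction restarts; roughly equivalent open-coded (a sketch, assuming restarts surface as -EINTR as elsewhere in this tree):

do {
	bch2_trans_begin(&trans);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
} while (ret == -EINTR);
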
static int bch2_copygc(struct bch_fs *c)
{
copygc_heap *h = &c->copygc_heap;
struct copygc_heap_entry e, *i;
- struct bucket_array *buckets;
struct bch_move_stats move_stats;
u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0;
u64 sectors_reserved = 0;
u64 buckets_to_move, buckets_not_moved = 0;
struct bch_dev *ca;
unsigned dev_idx;
- size_t b, heap_size = 0;
+ size_t heap_size = 0;
int ret;
bch_move_stats_init(&move_stats, "copygc");
@@ -178,34 +267,12 @@ static int bch2_copygc(struct bch_fs *c)
spin_lock(&ca->fs->freelist_lock);
sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size;
spin_unlock(&ca->fs->freelist_lock);
+ }
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
- struct bucket *g = buckets->b + b;
- struct bucket_mark m = READ_ONCE(g->mark);
- struct copygc_heap_entry e;
-
- if (m.owned_by_allocator ||
- m.data_type != BCH_DATA_user ||
- m.dirty_sectors >= ca->mi.bucket_size)
- continue;
-
- WARN_ON(m.stripe && !g->stripe_redundancy);
-
- e = (struct copygc_heap_entry) {
- .dev = dev_idx,
- .gen = m.gen,
- .replicas = 1 + g->stripe_redundancy,
- .fragmentation = m.dirty_sectors * (1U << 15)
- / ca->mi.bucket_size,
- .sectors = m.dirty_sectors,
- .offset = bucket_to_sector(ca, b),
- };
- heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
- }
- up_read(&ca->bucket_lock);
+ ret = walk_buckets_to_copygc(c);
+ if (ret) {
+ bch2_fs_fatal_error(c, "error walking buckets to copygc!");
+ return ret;
}
if (!h->used) {
@@ -251,30 +318,18 @@ static int bch2_copygc(struct bch_fs *c)
writepoint_ptr(&c->copygc_write_point),
copygc_pred, NULL,
&move_stats);
+ if (ret) {
+ bch_err(c, "error %i from bch2_move_data() in copygc", ret);
+ return ret;
+ }
- for_each_rw_member(ca, c, dev_idx) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
- for (i = h->data; i < h->data + h->used; i++) {
- struct bucket_mark m;
- size_t b;
-
- if (i->dev != dev_idx)
- continue;
-
- b = sector_to_bucket(ca, i->offset);
- m = READ_ONCE(buckets->b[b].mark);
-
- if (i->gen == m.gen &&
- m.dirty_sectors) {
- sectors_not_moved += m.dirty_sectors;
- buckets_not_moved++;
- }
- }
- up_read(&ca->bucket_lock);
+ ret = check_copygc_was_done(c, &sectors_not_moved, &buckets_not_moved);
+ if (ret) {
+ bch_err(c, "error %i from check_copygc_was_done()", ret);
+ return ret;
}
- if (sectors_not_moved && !ret)
+ if (sectors_not_moved)
bch_warn_ratelimited(c,
"copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)",
sectors_not_moved, sectors_to_move,
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index b818093e..7e4400cc 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -1095,7 +1095,11 @@ use_clean:
bch_verbose(c, "starting alloc read");
err = "error reading allocation information";
- ret = bch2_alloc_read(c);
+
+ down_read(&c->gc_lock);
+ ret = bch2_alloc_read(c, false, false);
+ up_read(&c->gc_lock);
+
if (ret)
goto err;
bch_verbose(c, "alloc read done");
@@ -1153,23 +1157,6 @@ use_clean:
if (c->opts.verbose || !c->sb.clean)
bch_info(c, "journal replay done");
- if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) &&
- !c->opts.nochanges) {
- /*
- * note that even when filesystem was clean there might be work
- * to do here, if we ran gc (because of fsck) which recalculated
- * oldest_gen:
- */
- bch_verbose(c, "writing allocation info");
- err = "error writing out alloc info";
- ret = bch2_alloc_write_all(c, BTREE_INSERT_LAZY_RW);
- if (ret) {
- bch_err(c, "error writing alloc info");
- goto err;
- }
- bch_verbose(c, "alloc write done");
- }
-
if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
bch2_fs_lazy_rw(c);
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index a08f1e08..96994b7a 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -414,18 +414,10 @@ err:
goto out;
}
-static int __bch2_mark_replicas(struct bch_fs *c,
- struct bch_replicas_entry *r,
- bool check)
-{
- return likely(bch2_replicas_marked(c, r)) ? 0
- : check ? -1
- : bch2_mark_replicas_slowpath(c, r);
-}
-
int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
{
- return __bch2_mark_replicas(c, r, false);
+ return likely(bch2_replicas_marked(c, r))
+ ? 0 : bch2_mark_replicas_slowpath(c, r);
}
/* replicas delta list: */
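
With the check parameter gone, __bch2_mark_replicas() folds into its only remaining caller. Code that wanted the old check-only behavior (fail rather than mark) can test directly, e.g.:

if (!bch2_replicas_marked(c, r))
	return -1;
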
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 586ba60d..d8b72d8d 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -762,6 +762,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_opts_apply(&c->opts, opts);
+ /* key cache currently disabled for inodes, because of snapshots: */
+ c->opts.inodes_use_key_cache = 0;
+
+ c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
+ if (c->opts.inodes_use_key_cache)
+ c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
+
c->block_bits = ilog2(block_sectors(c));
c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
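
The btree_key_cache_btrees bitmask makes key cache use a per-btree property: always on for the alloc btree, and on for inodes only once the snapshot interaction is resolved. Consumers would test the mask with a helper along these lines (a sketch; the real checks live in the key cache and iterator paths):

static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
{
	return c->btree_key_cache_btrees & (1U << btree);
}
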