-rw-r--r--  .bcachefs_revision                    2
-rw-r--r--  cmd_migrate.c                         2
-rw-r--r--  libbcachefs/alloc_background.c        3
-rw-r--r--  libbcachefs/alloc_background.h        2
-rw-r--r--  libbcachefs/alloc_foreground.c       35
-rw-r--r--  libbcachefs/alloc_foreground.h       10
-rw-r--r--  libbcachefs/bcachefs.h                1
-rw-r--r--  libbcachefs/btree_gc.c               35
-rw-r--r--  libbcachefs/btree_iter.c             60
-rw-r--r--  libbcachefs/btree_iter.h             14
-rw-r--r--  libbcachefs/btree_types.h             3
-rw-r--r--  libbcachefs/btree_update_interior.c   7
-rw-r--r--  libbcachefs/btree_update_leaf.c      24
-rw-r--r--  libbcachefs/buckets.c               284
-rw-r--r--  libbcachefs/buckets.h                 4
-rw-r--r--  libbcachefs/dirent.c                  5
-rw-r--r--  libbcachefs/ec.c                     46
-rw-r--r--  libbcachefs/errcode.h                12
-rw-r--r--  libbcachefs/fs-io.c                 382
-rw-r--r--  libbcachefs/fs.c                     28
-rw-r--r--  libbcachefs/fs.h                      6
-rw-r--r--  libbcachefs/io.c                      4
-rw-r--r--  libbcachefs/opts.h                   12
-rw-r--r--  libbcachefs/quota.c                  69
-rw-r--r--  libbcachefs/reflink.c                 3
-rw-r--r--  libbcachefs/replicas.c               66
-rw-r--r--  libbcachefs/replicas.h                3
-rw-r--r--  libbcachefs/subvolume.c               9
-rw-r--r--  libbcachefs/subvolume.h               2
-rw-r--r--  libbcachefs/util.h                    2
30 files changed, 735 insertions, 400 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index e4c9f208..236c0c17 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-4c24a1cf56583a3da1e14eb1bce2c3240d860b06
+50d6a25d9c0090d84ad9aadd29f76bc0abff5423
diff --git a/cmd_migrate.c b/cmd_migrate.c
index 41cfe5d9..fa467306 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -331,8 +331,6 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
die("error reserving space in new filesystem: %s",
strerror(-ret));
- bch2_mark_bkey_replicas(c, extent_i_to_s_c(e).s_c);
-
ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i,
&res, NULL, 0);
if (ret)
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index b2735c85..bf3611e7 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -336,6 +336,9 @@ static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k)
g->_mark.data_type = u.data_type;
g->_mark.dirty_sectors = u.dirty_sectors;
g->_mark.cached_sectors = u.cached_sectors;
+ g->_mark.stripe = u.stripe != 0;
+ g->stripe = u.stripe;
+ g->stripe_redundancy = u.stripe_redundancy;
g->io_time[READ] = u.read_time;
g->io_time[WRITE] = u.write_time;
g->oldest_gen = u.oldest_gen;
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index 370573f8..b1efc149 100644
--- a/libbcachefs/alloc_background.h
+++ b/libbcachefs/alloc_background.h
@@ -65,6 +65,8 @@ alloc_mem_to_key(struct btree_iter *iter,
.cached_sectors = m.cached_sectors,
.read_time = g->io_time[READ],
.write_time = g->io_time[WRITE],
+ .stripe = g->stripe,
+ .stripe_redundancy = g->stripe_redundancy,
};
}
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 412fed47..2bb107b8 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -348,8 +348,7 @@ static void add_new_bucket(struct bch_fs *c,
ob_push(c, ptrs, ob);
}
-enum bucket_alloc_ret
-bch2_bucket_alloc_set(struct bch_fs *c,
+int bch2_bucket_alloc_set(struct bch_fs *c,
struct open_buckets *ptrs,
struct dev_stripe_state *stripe,
struct bch_devs_mask *devs_may_alloc,
@@ -363,7 +362,7 @@ bch2_bucket_alloc_set(struct bch_fs *c,
struct dev_alloc_list devs_sorted =
bch2_dev_alloc_list(c, stripe, devs_may_alloc);
struct bch_dev *ca;
- enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES;
+ int ret = -INSUFFICIENT_DEVICES;
unsigned i;
BUG_ON(*nr_effective >= nr_replicas);
@@ -381,7 +380,7 @@ bch2_bucket_alloc_set(struct bch_fs *c,
ob = bch2_bucket_alloc(c, ca, reserve,
flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
if (IS_ERR(ob)) {
- ret = -PTR_ERR(ob);
+ ret = PTR_ERR(ob);
if (cl)
return ret;
@@ -394,7 +393,7 @@ bch2_bucket_alloc_set(struct bch_fs *c,
bch2_dev_stripe_increment(ca, stripe);
if (*nr_effective >= nr_replicas)
- return ALLOC_SUCCESS;
+ return 0;
}
return ret;
@@ -408,8 +407,7 @@ bch2_bucket_alloc_set(struct bch_fs *c,
* it's to a device we don't want:
*/
-static enum bucket_alloc_ret
-bucket_alloc_from_stripe(struct bch_fs *c,
+static int bucket_alloc_from_stripe(struct bch_fs *c,
struct open_buckets *ptrs,
struct write_point *wp,
struct bch_devs_mask *devs_may_alloc,
@@ -505,8 +503,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c,
wp->ptrs = ptrs_skip;
}
-static enum bucket_alloc_ret
-open_bucket_add_buckets(struct bch_fs *c,
+static int open_bucket_add_buckets(struct bch_fs *c,
struct open_buckets *ptrs,
struct write_point *wp,
struct bch_devs_list *devs_have,
@@ -522,7 +519,7 @@ open_bucket_add_buckets(struct bch_fs *c,
struct bch_devs_mask devs;
struct open_bucket *ob;
struct closure *cl = NULL;
- enum bucket_alloc_ret ret;
+ int ret;
unsigned i;
rcu_read_lock();
@@ -550,8 +547,8 @@ open_bucket_add_buckets(struct bch_fs *c,
target, erasure_code,
nr_replicas, nr_effective,
have_cache, flags, _cl);
- if (ret == FREELIST_EMPTY ||
- ret == OPEN_BUCKETS_EMPTY)
+ if (ret == -FREELIST_EMPTY ||
+ ret == -OPEN_BUCKETS_EMPTY)
return ret;
if (*nr_effective >= nr_replicas)
return 0;
@@ -575,7 +572,7 @@ retry_blocking:
ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
nr_replicas, nr_effective, have_cache,
reserve, flags, cl);
- if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) {
+ if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) {
cl = _cl;
goto retry_blocking;
}
@@ -772,7 +769,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
unsigned nr_effective, write_points_nr;
unsigned ob_flags = 0;
bool have_cache;
- enum bucket_alloc_ret ret;
+ int ret;
int i;
if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS))
@@ -821,7 +818,7 @@ alloc_done:
if (erasure_code && !ec_open_bucket(c, &ptrs))
pr_debug("failed to get ec bucket: ret %u", ret);
- if (ret == INSUFFICIENT_DEVICES &&
+ if (ret == -INSUFFICIENT_DEVICES &&
nr_effective >= nr_replicas_required)
ret = 0;
@@ -854,15 +851,15 @@ err:
mutex_unlock(&wp->lock);
- if (ret == FREELIST_EMPTY &&
+ if (ret == -FREELIST_EMPTY &&
try_decrease_writepoints(c, write_points_nr))
goto retry;
switch (ret) {
- case OPEN_BUCKETS_EMPTY:
- case FREELIST_EMPTY:
+ case -OPEN_BUCKETS_EMPTY:
+ case -FREELIST_EMPTY:
return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC);
- case INSUFFICIENT_DEVICES:
+ case -INSUFFICIENT_DEVICES:
return ERR_PTR(-EROFS);
default:
BUG();
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index c658295c..2e81712b 100644
--- a/libbcachefs/alloc_foreground.h
+++ b/libbcachefs/alloc_foreground.h
@@ -12,13 +12,6 @@ struct bch_dev;
struct bch_fs;
struct bch_devs_list;
-enum bucket_alloc_ret {
- ALLOC_SUCCESS,
- OPEN_BUCKETS_EMPTY,
- FREELIST_EMPTY, /* Allocator thread not keeping up */
- INSUFFICIENT_DEVICES,
-};
-
struct dev_alloc_list {
unsigned nr;
u8 devs[BCH_SB_MEMBERS_MAX];
@@ -98,8 +91,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c,
}
}
-enum bucket_alloc_ret
-bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
+int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
struct dev_stripe_state *, struct bch_devs_mask *,
unsigned, unsigned *, bool *, enum alloc_reserve,
unsigned, struct closure *);
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index fdf3a777..0439f3e0 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -200,6 +200,7 @@
#include <linux/zstd.h>
#include "bcachefs_format.h"
+#include "errcode.h"
#include "fifo.h"
#include "opts.h"
#include "util.h"
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 091bddee..4deb87f9 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -710,12 +710,15 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs;
const struct bch_extent_ptr *ptr;
+ struct bkey deleted = KEY(0, 0, 0);
+ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
unsigned flags =
BTREE_TRIGGER_GC|
(initial ? BTREE_TRIGGER_NOATOMIC : 0);
- char buf[200];
int ret = 0;
+ deleted.p = k->k->p;
+
if (initial) {
BUG_ON(bch2_journal_seq_verify &&
k->k->version.lo > journal_cur_seq(&c->journal));
@@ -729,18 +732,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
k->k->version.lo,
atomic64_read(&c->key_version)))
atomic64_set(&c->key_version, k->k->version.lo);
-
- if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c,
- "superblock not marked as containing replicas\n"
- " while marking %s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
- ret = bch2_mark_bkey_replicas(c, *k);
- if (ret) {
- bch_err(c, "error marking bkey replicas: %i", ret);
- goto err;
- }
- }
}
ptrs = bch2_bkey_ptrs_c(*k);
@@ -754,7 +745,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
*max_stale = max(*max_stale, ptr_stale(ca, ptr));
}
- ret = bch2_mark_key(trans, *k, flags);
+ ret = bch2_mark_key(trans, old, *k, flags);
fsck_err:
err:
if (ret)
@@ -1185,14 +1176,14 @@ static int bch2_gc_done(struct bch_fs *c,
set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_bucket_field(_f) \
- if (dst->b[b].mark._f != src->b[b].mark._f) { \
+ if (dst->b[b]._f != src->b[b]._f) { \
if (verify) \
fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \
": got %u, should be %u", dev, b, \
dst->b[b].mark.gen, \
bch2_data_types[dst->b[b].mark.data_type],\
- dst->b[b].mark._f, src->b[b].mark._f); \
- dst->b[b]._mark._f = src->b[b].mark._f; \
+ dst->b[b]._f, src->b[b]._f); \
+ dst->b[b]._f = src->b[b]._f; \
set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_dev_field(_f, _msg, ...) \
@@ -1238,11 +1229,13 @@ static int bch2_gc_done(struct bch_fs *c,
size_t b;
for (b = 0; b < src->nbuckets; b++) {
- copy_bucket_field(gen);
- copy_bucket_field(data_type);
+ copy_bucket_field(_mark.gen);
+ copy_bucket_field(_mark.data_type);
+ copy_bucket_field(_mark.stripe);
+ copy_bucket_field(_mark.dirty_sectors);
+ copy_bucket_field(_mark.cached_sectors);
+ copy_bucket_field(stripe_redundancy);
copy_bucket_field(stripe);
- copy_bucket_field(dirty_sectors);
- copy_bucket_field(cached_sectors);
dst->b[b].oldest_gen = src->b[b].oldest_gen;
}
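
A note on the copy_bucket_field() change above: the macro argument is now the full member path rather than a field name implicitly nested under ._mark, so one macro covers both the packed bucket_mark fields and top-level bucket fields such as stripe_redundancy. A sketch of the resulting expansions, using names from this hunk:

	copy_bucket_field(_mark.gen);		/* compares dst->b[b]._mark.gen */
	copy_bucket_field(stripe_redundancy);	/* compares dst->b[b].stripe_redundancy */
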
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index f43044e6..fc9d5bac 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -25,6 +25,15 @@ static inline void btree_path_list_remove(struct btree_trans *, struct btree_pat
static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
struct btree_path *);
+static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ return iter->ip_allocated;
+#else
+ return 0;
+#endif
+}
+
static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
/*
@@ -1601,14 +1610,15 @@ static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btr
inline struct btree_path * __must_check
bch2_btree_path_make_mut(struct btree_trans *trans,
- struct btree_path *path, bool intent)
+ struct btree_path *path, bool intent,
+ unsigned long ip)
{
if (path->ref > 1 || path->preserve) {
__btree_path_put(path, intent);
path = btree_path_clone(trans, path, intent);
path->preserve = false;
#ifdef CONFIG_BCACHEFS_DEBUG
- path->ip_allocated = _RET_IP_;
+ path->ip_allocated = ip;
#endif
btree_trans_verify_sorted(trans);
}
@@ -1619,7 +1629,7 @@ bch2_btree_path_make_mut(struct btree_trans *trans,
static struct btree_path * __must_check
btree_path_set_pos(struct btree_trans *trans,
struct btree_path *path, struct bpos new_pos,
- bool intent)
+ bool intent, unsigned long ip)
{
int cmp = bpos_cmp(new_pos, path->pos);
unsigned l = path->level;
@@ -1630,7 +1640,7 @@ btree_path_set_pos(struct btree_trans *trans,
if (!cmp)
return path;
- path = bch2_btree_path_make_mut(trans, path, intent);
+ path = bch2_btree_path_make_mut(trans, path, intent, ip);
path->pos = new_pos;
path->should_be_locked = false;
@@ -1806,7 +1816,7 @@ static struct btree_path *btree_path_alloc(struct btree_trans *trans,
struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached,
enum btree_id btree_id, struct bpos pos,
unsigned locks_want, unsigned level,
- bool intent)
+ bool intent, unsigned long ip)
{
struct btree_path *path, *path_pos = NULL;
int i;
@@ -1829,7 +1839,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached,
path_pos->btree_id == btree_id &&
path_pos->level == level) {
__btree_path_get(path_pos, intent);
- path = btree_path_set_pos(trans, path_pos, pos, intent);
+ path = btree_path_set_pos(trans, path_pos, pos, intent, ip);
path->preserve = true;
} else {
path = btree_path_alloc(trans, path_pos);
@@ -1849,7 +1859,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached,
for (i = 0; i < ARRAY_SIZE(path->l); i++)
path->l[i].b = BTREE_ITER_NO_NODE_INIT;
#ifdef CONFIG_BCACHEFS_DEBUG
- path->ip_allocated = _RET_IP_;
+ path->ip_allocated = ip;
#endif
btree_trans_verify_sorted(trans);
}
@@ -1927,7 +1937,8 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
iter->path = btree_path_set_pos(iter->trans, iter->path,
btree_iter_search_key(iter),
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
if (ret)
@@ -1962,7 +1973,8 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
iter->k.p = iter->pos = b->key.k.p;
iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
iter->path->should_be_locked = true;
BUG_ON(iter->path->uptodate);
out:
@@ -2021,7 +2033,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
*/
path = iter->path =
btree_path_set_pos(trans, path, bpos_successor(iter->pos),
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
path->level = iter->min_depth;
@@ -2043,7 +2056,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
iter->k.p = iter->pos = b->key.k.p;
iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
iter->path->should_be_locked = true;
BUG_ON(iter->path->uptodate);
out:
@@ -2102,7 +2116,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
while (1) {
iter->path = btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret)) {
@@ -2178,7 +2193,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
cmp = bpos_cmp(k.k->p, iter->path->pos);
if (cmp) {
iter->path = bch2_btree_path_make_mut(trans, iter->path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
iter->path->pos = k.k->p;
btree_path_check_sort(trans, iter->path, cmp);
}
@@ -2230,7 +2246,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
while (1) {
iter->path = btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret)) {
@@ -2360,7 +2377,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
search_key = btree_iter_search_key(iter);
iter->path = btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret))
@@ -2571,7 +2589,8 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
unsigned btree_id, struct bpos pos,
unsigned locks_want,
unsigned depth,
- unsigned flags)
+ unsigned flags,
+ unsigned long ip)
{
EBUG_ON(trans->restarted);
@@ -2597,6 +2616,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
iter->k.type = KEY_TYPE_deleted;
iter->k.p = pos;
iter->k.size = 0;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ iter->ip_allocated = ip;
+#endif
iter->path = bch2_path_get(trans,
flags & BTREE_ITER_CACHED,
@@ -2604,7 +2626,7 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
iter->pos,
locks_want,
depth,
- flags & BTREE_ITER_INTENT);
+ flags & BTREE_ITER_INTENT, ip);
}
void bch2_trans_iter_init(struct btree_trans *trans,
@@ -2613,7 +2635,7 @@ void bch2_trans_iter_init(struct btree_trans *trans,
unsigned flags)
{
__bch2_trans_iter_init(trans, iter, btree_id, pos,
- 0, 0, flags);
+ 0, 0, flags, _RET_IP_);
}
void bch2_trans_node_iter_init(struct btree_trans *trans,
@@ -2628,7 +2650,7 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
BTREE_ITER_NOT_EXTENTS|
__BTREE_ITER_ALL_SNAPSHOTS|
BTREE_ITER_ALL_SNAPSHOTS|
- flags);
+ flags, _RET_IP_);
BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
BUG_ON(iter->path->level != depth);
BUG_ON(iter->min_depth != depth);
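
The ip_allocated plumbing added above records, in debug builds, the address of the code that created each iterator, and threads that value through every internal path operation so that clones and repositions keep blaming the original call site. By kernel convention _RET_IP_ is the current function's return address (i.e. the API caller) and _THIS_IP_ is the current location itself; the two uses, quoted from the btree_iter.c hunks above and the btree_update_interior.c hunk below:

	/* public wrapper: attribute the path to our caller */
	__bch2_trans_iter_init(trans, iter, btree_id, pos,
			       0, 0, flags, _RET_IP_);

	/* internal user: attribute the path to this function */
	sib_path = bch2_path_get(trans, false, path->btree_id, sib_pos,
				 U8_MAX, level, true, _THIS_IP_);

In non-debug builds btree_iter_ip_allocated() returns 0, so the extra argument costs nothing there.
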
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 33a703c2..26eb90a7 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -130,11 +130,13 @@ __trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
(_path)->idx + 1))
struct btree_path * __must_check
-bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, bool);
+bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
+ bool, unsigned long);
int __must_check bch2_btree_path_traverse(struct btree_trans *,
struct btree_path *, unsigned);
struct btree_path *bch2_path_get(struct btree_trans *, bool, enum btree_id,
- struct bpos, unsigned, unsigned, bool);
+ struct bpos, unsigned, unsigned, bool,
+ unsigned long);
inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -302,13 +304,19 @@ static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
: bch2_btree_iter_peek(iter);
}
+static inline int btree_trans_too_many_iters(struct btree_trans *trans)
+{
+ return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2
+ ? -EINTR : 0;
+}
+
static inline struct bkey_s_c
__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
struct btree_iter *iter, unsigned flags)
{
struct bkey_s_c k;
- while ((hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2) ||
+ while (btree_trans_too_many_iters(trans) ||
(k = __bch2_btree_iter_peek(iter, flags),
bkey_err(k) == -EINTR))
bch2_trans_begin(trans);
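
btree_trans_too_many_iters() gives a name to the check previously open coded in __bch2_btree_iter_peek_and_restart(): once more than half of BTREE_ITER_MAX paths are allocated it returns -EINTR, which the usual restart machinery treats as "unlock, bch2_trans_begin(), retry". The call-site shape, as adopted by the dirent.c hunk below:

	ret = btree_trans_too_many_iters(&trans);
	if (ret)	/* -EINTR: break out and let the caller restart */
		break;
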
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 0d0a719f..2c2e2f79 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -291,6 +291,9 @@ struct btree_iter {
* bch2_btree_iter_next_slot() can correctly advance pos.
*/
struct bkey k;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ unsigned long ip_allocated;
+#endif
};
struct btree_key_cache {
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 61c7757b..dfff9725 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -1590,8 +1590,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
? bpos_predecessor(b->data->min_key)
: bpos_successor(b->data->max_key);
- sib_path = bch2_path_get(trans, false, path->btree_id,
- sib_pos, U8_MAX, level, true);
+ sib_path = bch2_path_get(trans, false, path->btree_id, sib_pos,
+ U8_MAX, level, true, _THIS_IP_);
ret = bch2_btree_path_traverse(trans, sib_path, false);
if (ret)
goto err;
@@ -1888,7 +1888,8 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
bch2_trans_copy_iter(&iter2, iter);
iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
- iter2.flags & BTREE_ITER_INTENT);
+ iter2.flags & BTREE_ITER_INTENT,
+ _THIS_IP_);
BUG_ON(iter2.path->level != b->c.level);
BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p));
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 112ac7ca..131fd4c1 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -437,17 +437,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
marking = true;
}
- if (marking) {
- percpu_down_read(&c->mark_lock);
- }
-
- /* Must be called under mark_lock: */
- if (marking && trans->fs_usage_deltas &&
- !bch2_replicas_delta_list_marked(c, trans->fs_usage_deltas)) {
- ret = BTREE_INSERT_NEED_MARK_REPLICAS;
- goto err;
- }
-
/*
* Don't get journal reservation until after we know insert will
* succeed:
@@ -456,7 +445,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
ret = bch2_trans_journal_res_get(trans,
JOURNAL_RES_GET_NONBLOCK);
if (ret)
- goto err;
+ return ret;
} else {
trans->journal_res.seq = c->journal.replay_journal_seq;
}
@@ -484,22 +473,19 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
i->k->k.version = MAX_VERSION;
}
+ if (trans->fs_usage_deltas &&
+ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
+ return BTREE_INSERT_NEED_MARK_REPLICAS;
+
trans_for_each_update(trans, i)
if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type))
bch2_mark_update(trans, i->path, i->k, i->flags);
- if (marking && trans->fs_usage_deltas)
- bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas);
-
if (unlikely(c->gc_pos.phase))
bch2_trans_mark_gc(trans);
trans_for_each_update(trans, i)
do_btree_insert_one(trans, i);
-err:
- if (marking) {
- percpu_up_read(&c->mark_lock);
- }
return ret;
}
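
The net effect of the btree_update_leaf.c hunks: the commit path no longer takes mark_lock itself or pre-checks the replicas delta list. Missing replicas entries are instead detected when the deltas are applied; bch2_trans_fs_usage_apply() (reworked in buckets.c below) returns nonzero in that case, after reverting any deltas it had already applied, and the commit returns BTREE_INSERT_NEED_MARK_REPLICAS so the outer commit loop can mark the entry and retry. The control flow, quoted from the hunk above:

	if (trans->fs_usage_deltas &&
	    bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
		return BTREE_INSERT_NEED_MARK_REPLICAS;
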
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 6fc93b56..4d55ef51 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -144,6 +144,7 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
unsigned journal_seq,
bool gc)
{
+ percpu_rwsem_assert_held(&c->mark_lock);
BUG_ON(!gc && !journal_seq);
return this_cpu_ptr(gc
@@ -371,8 +372,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
if (!journal_seq && !test_bit(BCH_FS_INITIALIZED, &c->flags))
journal_seq = 1;
- percpu_rwsem_assert_held(&c->mark_lock);
-
preempt_disable();
fs_usage = fs_usage_ptr(c, journal_seq, gc);
u = dev_usage_ptr(ca, journal_seq, gc);
@@ -418,25 +417,48 @@ static inline int __update_replicas(struct bch_fs *c,
return 0;
}
-static inline int update_replicas(struct bch_fs *c,
+static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
struct bch_replicas_entry *r, s64 sectors,
unsigned journal_seq, bool gc)
{
struct bch_fs_usage __percpu *fs_usage;
- int idx = bch2_replicas_entry_idx(c, r);
+ int idx, ret = 0;
+ char buf[200];
- if (idx < 0)
- return -1;
+ percpu_down_read(&c->mark_lock);
+
+ idx = bch2_replicas_entry_idx(c, r);
+ if (idx < 0 &&
+ (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
+ fsck_err(c, "no replicas entry\n"
+ " while marking %s",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))) {
+ percpu_up_read(&c->mark_lock);
+ ret = bch2_mark_replicas(c, r);
+ if (ret)
+ return ret;
+
+ percpu_down_read(&c->mark_lock);
+ idx = bch2_replicas_entry_idx(c, r);
+ }
+ if (idx < 0) {
+ ret = -1;
+ goto err;
+ }
preempt_disable();
fs_usage = fs_usage_ptr(c, journal_seq, gc);
fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
fs_usage->replicas[idx] += sectors;
preempt_enable();
- return 0;
+err:
+fsck_err:
+ percpu_up_read(&c->mark_lock);
+ return ret;
}
static inline int update_cached_sectors(struct bch_fs *c,
+ struct bkey_s_c k,
unsigned dev, s64 sectors,
unsigned journal_seq, bool gc)
{
@@ -444,7 +466,7 @@ static inline int update_cached_sectors(struct bch_fs *c,
bch2_replicas_entry_cached(&r.e, dev);
- return update_replicas(c, &r.e, sectors, journal_seq, gc);
+ return update_replicas(c, k, &r.e, sectors, journal_seq, gc);
}
static struct replicas_delta_list *
@@ -547,6 +569,7 @@ static int bch2_mark_alloc(struct btree_trans *trans,
struct bch_dev *ca;
struct bucket *g;
struct bucket_mark old_m, m;
+ int ret = 0;
/* We don't do anything for deletions - do we?: */
if (!bkey_is_alloc(new.k))
@@ -573,6 +596,7 @@ static int bch2_mark_alloc(struct btree_trans *trans,
if (new.k->p.offset >= ca->mi.nbuckets)
return 0;
+ percpu_down_read(&c->mark_lock);
g = __bucket(ca, new.k->p.offset, gc);
u = bch2_alloc_unpack(new);
@@ -597,6 +621,7 @@ static int bch2_mark_alloc(struct btree_trans *trans,
g->gen_valid = 1;
g->stripe = u.stripe;
g->stripe_redundancy = u.stripe_redundancy;
+ percpu_up_read(&c->mark_lock);
/*
* need to know if we're getting called from the invalidate path or
@@ -605,10 +630,12 @@ static int bch2_mark_alloc(struct btree_trans *trans,
if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
old_m.cached_sectors) {
- if (update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors,
- journal_seq, gc)) {
+ ret = update_cached_sectors(c, new, ca->dev_idx,
+ -old_m.cached_sectors,
+ journal_seq, gc);
+ if (ret) {
bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors");
- return -1;
+ return ret;
}
trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset),
@@ -779,43 +806,57 @@ static int mark_stripe_bucket(struct btree_trans *trans,
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
unsigned nr_data = s->nr_blocks - s->nr_redundant;
bool parity = ptr_idx >= nr_data;
+ enum bch_data_type data_type = parity ? BCH_DATA_parity : 0;
+ s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
bool gc = flags & BTREE_TRIGGER_GC;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr, gc);
+ struct bucket *g;
struct bucket_mark new, old;
char buf[200];
- int ret;
+ int ret = 0;
- if (g->stripe && g->stripe != k.k->p.offset) {
+	/* XXX doesn't handle deletion */
+
+ percpu_down_read(&c->mark_lock);
+ g = PTR_BUCKET(ca, ptr, gc);
+
+ if (g->mark.dirty_sectors ||
+ (g->stripe && g->stripe != k.k->p.offset)) {
bch2_fs_inconsistent(c,
"bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
old = bucket_cmpxchg(g, new, ({
- ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type,
+ ret = check_bucket_ref(c, k, ptr, sectors, data_type,
+ new.gen, new.data_type,
new.dirty_sectors, new.cached_sectors);
if (ret)
- return ret;
+ goto err;
- if (parity) {
- new.data_type = BCH_DATA_parity;
- new.dirty_sectors = le16_to_cpu(s->sectors);
- }
+ new.dirty_sectors += sectors;
+ if (data_type)
+ new.data_type = data_type;
if (journal_seq) {
new.journal_seq_valid = 1;
new.journal_seq = journal_seq;
}
+
+ new.stripe = true;
}));
g->stripe = k.k->p.offset;
g->stripe_redundancy = s->nr_redundant;
bch2_dev_usage_update(c, ca, old, new, journal_seq, gc);
+err:
+ percpu_up_read(&c->mark_lock);
+
return 0;
}
@@ -856,7 +897,10 @@ static int bch2_mark_pointer(struct btree_trans *trans,
-	struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc);
+	struct bucket *g;
u8 bucket_data_type;
u64 v;
- int ret;
+ int ret = 0;
+
+ percpu_down_read(&c->mark_lock);
+ g = PTR_BUCKET(ca, &p.ptr, gc);
v = atomic64_read(&g->_mark.v);
do {
@@ -869,7 +913,7 @@ static int bch2_mark_pointer(struct btree_trans *trans,
&new.dirty_sectors,
&new.cached_sectors);
if (ret)
- return ret;
+ goto err;
new.data_type = bucket_data_type;
@@ -889,11 +933,14 @@ static int bch2_mark_pointer(struct btree_trans *trans,
bch2_dev_usage_update(c, ca, old, new, journal_seq, gc);
BUG_ON(!gc && bucket_became_unavailable(old, new));
+err:
+ percpu_up_read(&c->mark_lock);
- return 0;
+ return ret;
}
static int bch2_mark_stripe_ptr(struct btree_trans *trans,
+ struct bkey_s_c k,
struct bch_extent_stripe_ptr p,
enum bch_data_type data_type,
s64 sectors,
@@ -933,7 +980,7 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans,
spin_unlock(&c->ec_stripes_heap_lock);
r.e.data_type = data_type;
- update_replicas(c, &r.e, sectors, trans->journal_res.seq, gc);
+ update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, gc);
return 0;
}
@@ -978,18 +1025,19 @@ static int bch2_mark_extent(struct btree_trans *trans,
stale = ret > 0;
if (p.ptr.cached) {
- if (!stale)
- if (update_cached_sectors(c, p.ptr.dev, disk_sectors,
- journal_seq, gc)) {
+ if (!stale) {
+ ret = update_cached_sectors(c, k, p.ptr.dev,
+ disk_sectors, journal_seq, gc);
+ if (ret) {
bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors");
- return -1;
-
+ return ret;
}
+ }
} else if (!p.has_ec) {
dirty_sectors += disk_sectors;
r.e.devs[r.e.nr_devs++] = p.ptr.dev;
} else {
- ret = bch2_mark_stripe_ptr(trans, p.ec, data_type,
+ ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type,
disk_sectors, flags);
if (ret)
return ret;
@@ -1004,12 +1052,13 @@ static int bch2_mark_extent(struct btree_trans *trans,
}
if (r.e.nr_devs) {
- if (update_replicas(c, &r.e, dirty_sectors, journal_seq, gc)) {
+ ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, gc);
+ if (ret) {
char buf[200];
bch2_bkey_val_to_text(&PBUF(buf), c, k);
bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
- return -1;
+ return ret;
}
}
@@ -1079,6 +1128,11 @@ static int bch2_mark_stripe(struct btree_trans *trans,
if (gc) {
/*
+ * This will be wrong when we bring back runtime gc: we should
+ * be unmarking the old key and then marking the new key
+ */
+
+ /*
* gc recalculates this field from stripe ptr
* references:
*/
@@ -1091,14 +1145,15 @@ static int bch2_mark_stripe(struct btree_trans *trans,
return ret;
}
- if (update_replicas(c, &m->r.e,
- ((s64) m->sectors * m->nr_redundant),
- journal_seq, gc)) {
+ ret = update_replicas(c, new, &m->r.e,
+ ((s64) m->sectors * m->nr_redundant),
+ journal_seq, gc);
+ if (ret) {
char buf[200];
bch2_bkey_val_to_text(&PBUF(buf), c, new);
bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
- return -1;
+ return ret;
}
}
@@ -1123,11 +1178,15 @@ static int bch2_mark_inode(struct btree_trans *trans,
}
if (flags & BTREE_TRIGGER_GC) {
+ percpu_down_read(&c->mark_lock);
preempt_disable();
+
fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
fs_usage->nr_inodes += bkey_is_inode(new.k);
fs_usage->nr_inodes -= bkey_is_inode(old.k);
+
preempt_enable();
+ percpu_up_read(&c->mark_lock);
}
return 0;
}
@@ -1146,14 +1205,18 @@ static int bch2_mark_reservation(struct btree_trans *trans,
sectors = -sectors;
sectors *= replicas;
+ percpu_down_read(&c->mark_lock);
preempt_disable();
+
fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC);
replicas = clamp_t(unsigned, replicas, 1,
ARRAY_SIZE(fs_usage->persistent_reserved));
fs_usage->reserved += sectors;
fs_usage->persistent_reserved[replicas - 1] += sectors;
+
preempt_enable();
+ percpu_up_read(&c->mark_lock);
return 0;
}
@@ -1241,10 +1304,10 @@ static int bch2_mark_reflink_p(struct btree_trans *trans,
return ret;
}
-static int bch2_mark_key_locked(struct btree_trans *trans,
- struct bkey_s_c old,
- struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_key(struct btree_trans *trans,
+ struct bkey_s_c old,
+ struct bkey_s_c new,
+ unsigned flags)
{
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
@@ -1274,22 +1337,6 @@ static int bch2_mark_key_locked(struct btree_trans *trans,
}
}
-int bch2_mark_key(struct btree_trans *trans, struct bkey_s_c new, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey deleted = KEY(0, 0, 0);
- struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
- int ret;
-
- deleted.p = new.k->p;
-
- percpu_down_read(&c->mark_lock);
- ret = bch2_mark_key_locked(trans, old, new, flags);
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
-
int bch2_mark_update(struct btree_trans *trans, struct btree_path *path,
struct bkey_i *new, unsigned flags)
{
@@ -1311,12 +1358,12 @@ int bch2_mark_update(struct btree_trans *trans, struct btree_path *path,
if (old.k->type == new->k.type &&
((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
- ret = bch2_mark_key_locked(trans, old, bkey_i_to_s_c(new),
+ ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new),
BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
} else {
- ret = bch2_mark_key_locked(trans, deleted, bkey_i_to_s_c(new),
+ ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new),
BTREE_TRIGGER_INSERT|flags) ?:
- bch2_mark_key_locked(trans, old, deleted,
+ bch2_mark_key(trans, old, deleted,
BTREE_TRIGGER_OVERWRITE|flags);
}
@@ -1359,21 +1406,20 @@ void fs_usage_apply_warn(struct btree_trans *trans,
__WARN();
}
-void bch2_trans_fs_usage_apply(struct btree_trans *trans,
- struct replicas_delta_list *deltas)
+int bch2_trans_fs_usage_apply(struct btree_trans *trans,
+ struct replicas_delta_list *deltas)
{
struct bch_fs *c = trans->c;
static int warned_disk_usage = 0;
bool warn = false;
unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
- struct replicas_delta *d = deltas->d;
+ struct replicas_delta *d = deltas->d, *d2;
struct replicas_delta *top = (void *) deltas->d + deltas->used;
struct bch_fs_usage *dst;
s64 added = 0, should_not_have_added;
unsigned i;
- percpu_rwsem_assert_held(&c->mark_lock);
-
+ percpu_down_read(&c->mark_lock);
preempt_disable();
dst = fs_usage_ptr(c, trans->journal_res.seq, false);
@@ -1385,7 +1431,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
added += d->delta;
}
- BUG_ON(__update_replicas(c, dst, &d->r, d->delta));
+ if (__update_replicas(c, dst, &d->r, d->delta))
+ goto need_mark;
}
dst->nr_inodes += deltas->nr_inodes;
@@ -1420,9 +1467,19 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
}
preempt_enable();
+ percpu_up_read(&c->mark_lock);
if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added);
+ return 0;
+need_mark:
+ /* revert changes: */
+ for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2))
+ BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta));
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
+ return -1;
}
/* trans_mark: */
@@ -1606,50 +1663,75 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
return 0;
}
-static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
- struct bkey_s_c_stripe s,
- unsigned idx, bool deleting)
+static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
+ struct bkey_s_c_stripe s,
+ unsigned idx, bool deleting)
{
struct bch_fs *c = trans->c;
const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
struct bkey_alloc_buf *a;
struct btree_iter iter;
struct bkey_alloc_unpacked u;
- bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant;
+ enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
+ ? BCH_DATA_parity : 0;
+ s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
int ret = 0;
+ if (deleting)
+ sectors = -sectors;
+
a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
if (IS_ERR(a))
return PTR_ERR(a);
- if (parity) {
- s64 sectors = le16_to_cpu(s.v->sectors);
-
- if (deleting)
- sectors = -sectors;
-
- u.dirty_sectors += sectors;
- u.data_type = u.dirty_sectors
- ? BCH_DATA_parity
- : 0;
- }
+ ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type,
+ u.gen, u.data_type,
+ u.dirty_sectors, u.cached_sectors);
+ if (ret)
+ goto err;
if (!deleting) {
- if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c,
- "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)",
+ if (bch2_fs_inconsistent_on(u.stripe ||
+ u.stripe_redundancy, c,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
iter.pos.inode, iter.pos.offset, u.gen,
+ bch2_data_types[u.data_type],
+ u.dirty_sectors,
u.stripe, s.k->p.offset)) {
ret = -EIO;
goto err;
}
+ if (bch2_fs_inconsistent_on(data_type && u.dirty_sectors, c,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
+ iter.pos.inode, iter.pos.offset, u.gen,
+ bch2_data_types[u.data_type],
+ u.dirty_sectors,
+ s.k->p.offset)) {
+ ret = -EIO;
+ goto err;
+ }
+
u.stripe = s.k->p.offset;
u.stripe_redundancy = s.v->nr_redundant;
} else {
+ if (bch2_fs_inconsistent_on(u.stripe != s.k->p.offset ||
+ u.stripe_redundancy != s.v->nr_redundant, c,
+ "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
+ iter.pos.inode, iter.pos.offset, u.gen,
+ s.k->p.offset, u.stripe)) {
+ ret = -EIO;
+ goto err;
+ }
+
u.stripe = 0;
u.stripe_redundancy = 0;
}
+ u.dirty_sectors += sectors;
+ if (data_type)
+ u.data_type = !deleting ? data_type : 0;
+
bch2_alloc_pack(c, a, u);
bch2_trans_update(trans, &iter, &a->k, 0);
err:
@@ -1664,7 +1746,7 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
struct bkey_s_c_stripe old_s = { .k = NULL };
struct bkey_s_c_stripe new_s = { .k = NULL };
struct bch_replicas_padded r;
- unsigned i;
+ unsigned i, nr_blocks;
int ret = 0;
if (old.k->type == KEY_TYPE_stripe)
@@ -1682,18 +1764,17 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
return 0;
+ BUG_ON(new_s.k && old_s.k &&
+ (new_s.v->nr_blocks != old_s.v->nr_blocks ||
+ new_s.v->nr_redundant != old_s.v->nr_redundant));
+
+ nr_blocks = new_s.k ? new_s.v->nr_blocks : old_s.v->nr_blocks;
+
if (new_s.k) {
s64 sectors = le16_to_cpu(new_s.v->sectors);
bch2_bkey_to_replicas(&r.e, new);
update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);
-
- for (i = 0; i < new_s.v->nr_blocks; i++) {
- ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s,
- i, false);
- if (ret)
- return ret;
- }
}
if (old_s.k) {
@@ -1701,12 +1782,25 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
bch2_bkey_to_replicas(&r.e, old);
update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);
+ }
- for (i = 0; i < old_s.v->nr_blocks; i++) {
- ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s,
- i, true);
+ for (i = 0; i < nr_blocks; i++) {
+ if (new_s.k && old_s.k &&
+ !memcmp(&new_s.v->ptrs[i],
+ &old_s.v->ptrs[i],
+ sizeof(new_s.v->ptrs[i])))
+ continue;
+
+ if (new_s.k) {
+ ret = bch2_trans_mark_stripe_bucket(trans, new_s, i, false);
if (ret)
- return ret;
+ break;
+ }
+
+ if (old_s.k) {
+ ret = bch2_trans_mark_stripe_bucket(trans, old_s, i, true);
+ if (ret)
+ break;
}
}
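
A recurring pattern in the buckets.c hunks above: each mark function now takes mark_lock around exactly the region that touches bucket or fs_usage state, rather than relying on the transaction commit path holding it, and fs_usage_ptr() asserts the lock so any unlocked access trips immediately. The shape, quoted from bch2_mark_pointer():

	percpu_down_read(&c->mark_lock);
	g = PTR_BUCKET(ca, &p.ptr, gc);
	/* ... bucket cmpxchg loop, goto err on failure ... */
err:
	percpu_up_read(&c->mark_lock);
	return ret;
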
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 5ed9441c..ac9b554a 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -226,14 +226,14 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
-int bch2_mark_key(struct btree_trans *, struct bkey_s_c, unsigned);
+int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
int bch2_mark_update(struct btree_trans *, struct btree_path *,
struct bkey_i *, unsigned);
int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
struct bkey_s_c, unsigned);
-void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
+int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
size_t, enum bch_data_type, unsigned);
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index 4dfcc955..fe4a85a6 100644
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -531,10 +531,9 @@ retry:
* read_target looks up subvolumes, we can overflow paths if the
* directory has many subvolumes in it
*/
- if (hweight64(trans.paths_allocated) > BTREE_ITER_MAX / 2) {
- ret = -EINTR;
+ ret = btree_trans_too_many_iters(&trans);
+ if (ret)
break;
- }
}
bch2_trans_iter_exit(&trans, &iter);
err:
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index bca1b8a7..71d85c93 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -15,6 +15,7 @@
#include "io.h"
#include "keylist.h"
#include "recovery.h"
+#include "replicas.h"
#include "super-io.h"
#include "util.h"
@@ -1272,16 +1273,15 @@ found:
return h;
}
-static enum bucket_alloc_ret
-new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
- struct closure *cl)
+static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
+ struct closure *cl)
{
struct bch_devs_mask devs = h->devs;
struct open_bucket *ob;
struct open_buckets buckets;
unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
bool have_cache = true;
- enum bucket_alloc_ret ret = ALLOC_SUCCESS;
+ int ret = 0;
for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
if (test_bit(i, h->s->blocks_gotten)) {
@@ -1516,7 +1516,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
err:
bch2_ec_stripe_head_put(c, h);
- return ERR_PTR(-ret);
+ return ERR_PTR(ret);
}
void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
@@ -1636,13 +1636,41 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags)
static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k)
{
+ const struct bch_stripe *s;
struct bch_fs *c = trans->c;
+ struct stripe *m;
+ unsigned i;
int ret = 0;
- if (k.k->type == KEY_TYPE_stripe)
- ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?:
- bch2_mark_key(trans, k,
- BTREE_TRIGGER_NOATOMIC);
+ if (k.k->type != KEY_TYPE_stripe)
+ return 0;
+
+ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ s = bkey_s_c_to_stripe(k).v;
+
+ m = genradix_ptr(&c->stripes[0], k.k->p.offset);
+ m->alive = true;
+ m->sectors = le16_to_cpu(s->sectors);
+ m->algorithm = s->algorithm;
+ m->nr_blocks = s->nr_blocks;
+ m->nr_redundant = s->nr_redundant;
+ m->blocks_nonempty = 0;
+
+ for (i = 0; i < s->nr_blocks; i++) {
+ m->block_sectors[i] =
+ stripe_blockcount_get(s, i);
+ m->blocks_nonempty += !!m->block_sectors[i];
+ m->ptrs[i] = s->ptrs[i];
+ }
+
+ bch2_bkey_to_replicas(&m->r.e, k);
+
+ spin_lock(&c->ec_stripes_heap_lock);
+ bch2_stripes_heap_update(c, m, k.k->p.offset);
+ spin_unlock(&c->ec_stripes_heap_lock);
return ret;
}
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
new file mode 100644
index 00000000..f7d12915
--- /dev/null
+++ b/libbcachefs/errcode.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ERRCODE_H
+#define _BCACHEFS_ERRCODE_H
+
+enum {
+ /* Bucket allocator: */
+ OPEN_BUCKETS_EMPTY = 2048,
+ FREELIST_EMPTY, /* Allocator thread not keeping up */
+ INSUFFICIENT_DEVICES,
+};
+
+#endif /* _BCACHEFS_ERRCODE_H */
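
These constants start at 2048, above any standard errno value but still within MAX_ERRNO, so when negated they pass through IS_ERR()/PTR_ERR() like ordinary error codes; presumably that is why 2048 was chosen. With bucket_alloc_ret gone, alloc_foreground.c returns them negated in a plain int, as in the hunks above:

	int ret = -INSUFFICIENT_DEVICES;
	...
	if (ret == -FREELIST_EMPTY ||
	    ret == -OPEN_BUCKETS_EMPTY)
		return ret;
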
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index d3d48a5b..5bcdfe3c 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -223,6 +223,9 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
return;
mutex_lock(&inode->ei_quota_lock);
+ BUG_ON((s64) inode->v.i_blocks + sectors < 0);
+ inode->v.i_blocks += sectors;
+
#ifdef CONFIG_BCACHEFS_QUOTA
if (quota_res && sectors > 0) {
BUG_ON(sectors > quota_res->sectors);
@@ -234,7 +237,6 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
}
#endif
- inode->v.i_blocks += sectors;
mutex_unlock(&inode->ei_quota_lock);
}
@@ -243,24 +245,26 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
/* stored in page->private: */
struct bch_page_sector {
- /* Uncompressed, fully allocated replicas: */
- unsigned nr_replicas:3;
+ /* Uncompressed, fully allocated replicas (or on disk reservation): */
+ unsigned nr_replicas:4;
- /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */
- unsigned replicas_reserved:3;
+ /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */
+ unsigned replicas_reserved:4;
/* i_sectors: */
enum {
SECTOR_UNALLOCATED,
SECTOR_RESERVED,
SECTOR_DIRTY,
+ SECTOR_DIRTY_RESERVED,
SECTOR_ALLOCATED,
- } state:2;
+ } state:8;
};
struct bch_page_state {
spinlock_t lock;
atomic_t write_count;
+ bool uptodate;
struct bch_page_sector s[PAGE_SECTORS];
};
@@ -311,6 +315,212 @@ static struct bch_page_state *bch2_page_state_create(struct page *page,
return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp);
}
+static unsigned bkey_to_sector_state(const struct bkey *k)
+{
+ if (k->type == KEY_TYPE_reservation)
+ return SECTOR_RESERVED;
+ if (bkey_extent_is_allocation(k))
+ return SECTOR_ALLOCATED;
+ return SECTOR_UNALLOCATED;
+}
+
+static void __bch2_page_state_set(struct page *page,
+ unsigned pg_offset, unsigned pg_len,
+ unsigned nr_ptrs, unsigned state)
+{
+ struct bch_page_state *s = bch2_page_state_create(page, __GFP_NOFAIL);
+ unsigned i;
+
+ BUG_ON(pg_offset >= PAGE_SECTORS);
+ BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
+
+ spin_lock(&s->lock);
+
+ for (i = pg_offset; i < pg_offset + pg_len; i++) {
+ s->s[i].nr_replicas = nr_ptrs;
+ s->s[i].state = state;
+ }
+
+ if (i == PAGE_SECTORS)
+ s->uptodate = true;
+
+ spin_unlock(&s->lock);
+}
+
+static int bch2_page_state_set(struct bch_fs *c, subvol_inum inum,
+ struct page **pages, unsigned nr_pages)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 offset = pages[0]->index << PAGE_SECTORS_SHIFT;
+ unsigned pg_idx = 0;
+ u32 snapshot;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+ SPOS(inum.inum, offset, snapshot),
+ BTREE_ITER_SLOTS, k, ret) {
+ unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
+ unsigned state = bkey_to_sector_state(k.k);
+
+ while (pg_idx < nr_pages) {
+ struct page *page = pages[pg_idx];
+ u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
+ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
+ unsigned pg_offset = max(bkey_start_offset(k.k), pg_start) - pg_start;
+ unsigned pg_len = min(k.k->p.offset, pg_end) - pg_offset - pg_start;
+
+ BUG_ON(k.k->p.offset < pg_start);
+ BUG_ON(bkey_start_offset(k.k) > pg_end);
+
+ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate)
+ __bch2_page_state_set(page, pg_offset, pg_len, nr_ptrs, state);
+
+ if (k.k->p.offset < pg_end)
+ break;
+ pg_idx++;
+ }
+
+ if (pg_idx == nr_pages)
+ break;
+ }
+
+ offset = iter.pos.offset;
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (ret == -EINTR)
+ goto retry;
+ bch2_trans_exit(&trans);
+
+ return ret;
+}
+
+static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
+ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
+ unsigned state = bkey_to_sector_state(k.k);
+
+ bio_for_each_segment(bv, bio, iter)
+ __bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9,
+ bv.bv_len >> 9, nr_ptrs, state);
+}
+
+static void mark_pagecache_unallocated(struct bch_inode_info *inode,
+ u64 start, u64 end)
+{
+ pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+ struct pagevec pvec;
+
+ if (end <= start)
+ return;
+
+ pagevec_init(&pvec);
+
+ do {
+ unsigned nr_pages, i, j;
+
+ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
+ &index, end_index);
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
+ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
+ unsigned pg_offset = max(start, pg_start) - pg_start;
+ unsigned pg_len = min(end, pg_end) - pg_offset - pg_start;
+ struct bch_page_state *s;
+
+ BUG_ON(end <= pg_start);
+ BUG_ON(pg_offset >= PAGE_SECTORS);
+ BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
+
+ lock_page(page);
+ s = bch2_page_state(page);
+
+ if (s) {
+ spin_lock(&s->lock);
+ for (j = pg_offset; j < pg_offset + pg_len; j++)
+ s->s[j].nr_replicas = 0;
+ spin_unlock(&s->lock);
+ }
+
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ } while (index <= end_index);
+}
+
+static void mark_pagecache_reserved(struct bch_inode_info *inode,
+ u64 start, u64 end)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+ struct pagevec pvec;
+ s64 i_sectors_delta = 0;
+
+ if (end <= start)
+ return;
+
+ pagevec_init(&pvec);
+
+ do {
+ unsigned nr_pages, i, j;
+
+ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
+ &index, end_index);
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
+ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
+ unsigned pg_offset = max(start, pg_start) - pg_start;
+ unsigned pg_len = min(end, pg_end) - pg_offset - pg_start;
+ struct bch_page_state *s;
+
+ BUG_ON(end <= pg_start);
+ BUG_ON(pg_offset >= PAGE_SECTORS);
+ BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
+
+ lock_page(page);
+ s = bch2_page_state(page);
+
+ if (s) {
+ spin_lock(&s->lock);
+ for (j = pg_offset; j < pg_offset + pg_len; j++)
+ switch (s->s[j].state) {
+ case SECTOR_UNALLOCATED:
+ s->s[j].state = SECTOR_RESERVED;
+ break;
+ case SECTOR_DIRTY:
+ s->s[j].state = SECTOR_DIRTY_RESERVED;
+ i_sectors_delta--;
+ break;
+ default:
+ break;
+ }
+ spin_unlock(&s->lock);
+ }
+
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ } while (index <= end_index);
+
+ i_sectors_acct(c, inode, NULL, i_sectors_delta);
+}
+
static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
{
/* XXX: this should not be open coded */
@@ -395,6 +605,8 @@ static int bch2_page_reservation_get(struct bch_fs *c,
if (!s)
return -ENOMEM;
+ BUG_ON(!s->uptodate);
+
for (i = round_down(offset, block_bytes(c)) >> 9;
i < round_up(offset + len, block_bytes(c)) >> 9;
i++) {
@@ -449,16 +661,22 @@ static void bch2_clear_page_bits(struct page *page)
disk_res.sectors += s->s[i].replicas_reserved;
s->s[i].replicas_reserved = 0;
- if (s->s[i].state == SECTOR_DIRTY) {
- dirty_sectors++;
+ switch (s->s[i].state) {
+ case SECTOR_DIRTY:
s->s[i].state = SECTOR_UNALLOCATED;
+ --dirty_sectors;
+ break;
+ case SECTOR_DIRTY_RESERVED:
+ s->s[i].state = SECTOR_RESERVED;
+ break;
+ default:
+ break;
}
}
bch2_disk_reservation_put(c, &disk_res);
- if (dirty_sectors)
- i_sectors_acct(c, inode, NULL, -dirty_sectors);
+ i_sectors_acct(c, inode, NULL, dirty_sectors);
bch2_page_state_release(page);
}
@@ -491,16 +709,22 @@ static void bch2_set_page_dirty(struct bch_fs *c,
s->s[i].replicas_reserved += sectors;
res->disk.sectors -= sectors;
- if (s->s[i].state == SECTOR_UNALLOCATED)
+ switch (s->s[i].state) {
+ case SECTOR_UNALLOCATED:
+ s->s[i].state = SECTOR_DIRTY;
dirty_sectors++;
-
- s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY);
+ break;
+ case SECTOR_RESERVED:
+ s->s[i].state = SECTOR_DIRTY_RESERVED;
+ break;
+ default:
+ break;
+ }
}
spin_unlock(&s->lock);
- if (dirty_sectors)
- i_sectors_acct(c, inode, &res->quota, dirty_sectors);
+ i_sectors_acct(c, inode, &res->quota, dirty_sectors);
if (!PageDirty(page))
__set_page_dirty_nobuffers(page);
@@ -554,7 +778,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
struct bch2_page_reservation res;
unsigned len;
loff_t isize;
- int ret = VM_FAULT_LOCKED;
+ int ret;
bch2_page_reservation_init(c, inode, &res);
@@ -580,6 +804,14 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page));
+ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
+ if (bch2_page_state_set(c, inode_inum(inode), &page, 1)) {
+ unlock_page(page);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ }
+
if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) {
unlock_page(page);
ret = VM_FAULT_SIGBUS;
@@ -590,6 +822,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
bch2_page_reservation_put(c, inode, &res);
wait_for_stable_page(page);
+ ret = VM_FAULT_LOCKED;
out:
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
sb_end_pagefault(inode->v.i_sb);
@@ -703,29 +936,6 @@ static inline struct page *readpage_iter_next(struct readpages_iter *iter)
return iter->pages[iter->idx];
}
-static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
-{
- struct bvec_iter iter;
- struct bio_vec bv;
- unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
- ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
- unsigned state = k.k->type == KEY_TYPE_reservation
- ? SECTOR_RESERVED
- : SECTOR_ALLOCATED;
-
- bio_for_each_segment(bv, bio, iter) {
- struct bch_page_state *s = bch2_page_state(bv.bv_page);
- unsigned i;
-
- for (i = bv.bv_offset >> 9;
- i < (bv.bv_offset + bv.bv_len) >> 9;
- i++) {
- s->s[i].nr_replicas = nr_ptrs;
- s->s[i].state = state;
- }
- }
-}
-
static bool extent_partial_reads_expensive(struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -745,7 +955,7 @@ static void readpage_bio_extend(struct readpages_iter *iter,
{
while (bio_sectors(bio) < sectors_this_extent &&
bio->bi_vcnt < bio->bi_max_vecs) {
- pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT;
+ pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
struct page *page = readpage_iter_next(iter);
int ret;
@@ -864,8 +1074,7 @@ retry:
if (rbio->bio.bi_iter.bi_size == bytes)
flags |= BCH_READ_LAST_FRAGMENT;
- if (bkey_extent_is_allocation(k.k))
- bch2_add_page_sectors(&rbio->bio, k);
+ bch2_bio_page_state_set(&rbio->bio, k);
bch2_read_extent(trans, rbio, iter.pos,
data_btree, k, offset_into_extent, flags);
@@ -875,6 +1084,10 @@ retry:
swap(rbio->bio.bi_iter.bi_size, bytes);
bio_advance(&rbio->bio, bytes);
+
+ ret = btree_trans_too_many_iters(trans);
+ if (ret)
+ break;
}
err:
bch2_trans_iter_exit(trans, &iter);
@@ -922,7 +1135,7 @@ void bch2_readahead(struct readahead_control *ractl)
readpages_iter.idx++;
bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0);
- rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT;
+ rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT;
rbio->bio.bi_end_io = bch2_readpages_end_io;
BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
@@ -945,7 +1158,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
rbio->bio.bi_iter.bi_sector =
- (sector_t) page->index << PAGE_SECTOR_SHIFT;
+ (sector_t) page->index << PAGE_SECTORS_SHIFT;
BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
bch2_trans_init(&trans, c, 0, 0);
@@ -1232,7 +1445,7 @@ do_io:
}
BUG_ON(!sectors);
- sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset;
+ sector = ((u64) page->index << PAGE_SECTORS_SHIFT) + offset;
if (w->io &&
(w->io->op.res.nr_replicas != nr_replicas_this_write ||
@@ -1349,6 +1562,12 @@ readpage:
if (ret)
goto err;
out:
+ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
+ ret = bch2_page_state_set(c, inode_inum(inode), &page, 1);
+ if (ret)
+ goto out;
+ }
+
ret = bch2_page_reservation_get(c, inode, page, res,
offset, len, true);
if (ret) {
@@ -1478,20 +1697,21 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
}
while (reserved < len) {
- struct page *page = pages[(offset + reserved) >> PAGE_SHIFT];
+ unsigned i = (offset + reserved) >> PAGE_SHIFT;
+ struct page *page = pages[i];
unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1);
unsigned pg_len = min_t(unsigned, len - reserved,
PAGE_SIZE - pg_offset);
-retry_reservation:
- ret = bch2_page_reservation_get(c, inode, page, &res,
- pg_offset, pg_len, true);
- if (ret && !PageUptodate(page)) {
- ret = bch2_read_single_page(page, mapping);
- if (!ret)
- goto retry_reservation;
+ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
+ ret = bch2_page_state_set(c, inode_inum(inode),
+ pages + i, nr_pages - i);
+ if (ret)
+ goto out;
}
+ ret = bch2_page_reservation_get(c, inode, page, &res,
+ pg_offset, pg_len, true);
if (ret)
goto out;
@@ -2245,6 +2465,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
unsigned i;
struct page *page;
+ s64 i_sectors_delta = 0;
int ret = 0;
/* Page boundary? Nothing to do */
@@ -2263,8 +2484,8 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
* page
*/
ret = range_has_data(c, inode->ei_subvol,
- POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT),
- POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT));
+ POS(inode->v.i_ino, index << PAGE_SECTORS_SHIFT),
+ POS(inode->v.i_ino, (index + 1) << PAGE_SECTORS_SHIFT));
if (ret <= 0)
return ret;
@@ -2296,9 +2517,13 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
i < round_down(end_offset, block_bytes(c)) >> 9;
i++) {
s->s[i].nr_replicas = 0;
+ if (s->s[i].state == SECTOR_DIRTY)
+ i_sectors_delta--;
s->s[i].state = SECTOR_UNALLOCATED;
}
+ i_sectors_acct(c, inode, NULL, i_sectors_delta);
+
/*
* Caller needs to know whether this page will be written out by
* writeback - doing an i_size update if necessary - or whether it will
@@ -2480,6 +2705,8 @@ int bch2_truncate(struct user_namespace *mnt_userns,
U64_MAX, &i_sectors_delta);
i_sectors_acct(c, inode, NULL, i_sectors_delta);
+ BUG_ON(!inode->v.i_size && inode->v.i_blocks);
+
if (unlikely(ret))
goto err;
@@ -2810,6 +3037,8 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
&reservation.k_i,
&disk_res, NULL,
0, &i_sectors_delta, true);
+ if (ret)
+ goto bkey_err;
i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
bkey_err:
bch2_quota_reservation_put(c, inode, &quota_res);
@@ -2818,6 +3047,9 @@ bkey_err:
ret = 0;
}
+ bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */
+ mark_pagecache_reserved(inode, start_sector, iter.pos.offset);
+
if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) {
struct quota_res quota_res = { 0 };
s64 i_sectors_delta = 0;
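
Two fixes land in __bchfs_fallocate(): the btree insert's return code is no longer dropped on the way to i_sectors_acct(), and the transaction is unlocked before mark_pagecache_reserved() runs, presumably because the legal ordering is pagecache lock first, then btree locks. A toy pthread model of that rule (stand-ins, not the real btree or pagecache locks):

#include <pthread.h>

static pthread_mutex_t btree_lock     = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t pagecache_lock = PTHREAD_MUTEX_INITIALIZER;

static void fallocate_path(void)
{
	pthread_mutex_lock(&btree_lock);
	/* ... btree updates for the reservation ... */
	pthread_mutex_unlock(&btree_lock);	/* bch2_trans_unlock() */

	/* safe only now that no btree locks are held: */
	pthread_mutex_lock(&pagecache_lock);	/* mark_pagecache_reserved() */
	/* ... walk the pagecache, marking sectors reserved ... */
	pthread_mutex_unlock(&pagecache_lock);
}

static void writeback_path(void)
{
	pthread_mutex_lock(&pagecache_lock);	/* pagecache first ... */
	pthread_mutex_lock(&btree_lock);	/* ... then btree: the legal order */
	pthread_mutex_unlock(&btree_lock);
	pthread_mutex_unlock(&pagecache_lock);
}

int main(void)
{
	writeback_path();
	fallocate_path();
	return 0;
}
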
@@ -2923,43 +3155,6 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
return ret;
}
-static void mark_range_unallocated(struct bch_inode_info *inode,
- loff_t start, loff_t end)
-{
- pgoff_t index = start >> PAGE_SHIFT;
- pgoff_t end_index = (end - 1) >> PAGE_SHIFT;
- struct pagevec pvec;
-
- pagevec_init(&pvec);
-
- do {
- unsigned nr_pages, i, j;
-
- nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
- &index, end_index);
- if (nr_pages == 0)
- break;
-
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- struct bch_page_state *s;
-
- lock_page(page);
- s = bch2_page_state(page);
-
- if (s) {
- spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
- spin_unlock(&s->lock);
- }
-
- unlock_page(page);
- }
- pagevec_release(&pvec);
- } while (index <= end_index);
-}
-
loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
struct file *file_dst, loff_t pos_dst,
loff_t len, unsigned remap_flags)
@@ -3005,7 +3200,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
if (ret)
goto err;
- mark_range_unallocated(src, pos_src, pos_src + aligned_len);
+ mark_pagecache_unallocated(src, pos_src >> 9,
+ (pos_src + aligned_len) >> 9);
ret = bch2_remap_range(c,
inode_inum(dst), pos_dst >> 9,
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index fc29e6c4..7eb33da9 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -38,7 +38,8 @@ static struct kmem_cache *bch2_inode_cache;
static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
struct bch_inode_info *,
- struct bch_inode_unpacked *);
+ struct bch_inode_unpacked *,
+ struct bch_subvolume *);
static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
{
@@ -224,6 +225,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
struct bch_inode_unpacked inode_u;
struct bch_inode_info *inode;
struct btree_trans trans;
+ struct bch_subvolume subvol;
int ret;
inode = to_bch_ei(iget5_locked(c->vfs_sb,
@@ -238,10 +240,11 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
bch2_trans_init(&trans, c, 8, 0);
ret = lockrestart_do(&trans,
+ bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?:
bch2_inode_find_by_inum_trans(&trans, inum, &inode_u));
if (!ret)
- bch2_vfs_inode_init(&trans, inum, inode, &inode_u);
+ bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
bch2_trans_exit(&trans);
if (ret) {
@@ -267,6 +270,7 @@ __bch2_create(struct user_namespace *mnt_userns,
struct bch_inode_unpacked inode_u;
struct posix_acl *default_acl = NULL, *acl = NULL;
subvol_inum inum;
+ struct bch_subvolume subvol;
u64 journal_seq = 0;
int ret;
@@ -309,7 +313,12 @@ retry:
if (unlikely(ret))
goto err_before_quota;
- ret = bch2_trans_commit(&trans, NULL, &journal_seq, 0);
+ inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
+ inum.inum = inode_u.bi_inum;
+
+ ret = bch2_subvolume_get(&trans, inum.subvol, true,
+ BTREE_ITER_WITH_UPDATES, &subvol) ?:
+ bch2_trans_commit(&trans, NULL, &journal_seq, 0);
if (unlikely(ret)) {
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
@@ -325,11 +334,8 @@ err_before_quota:
mutex_unlock(&dir->ei_update_lock);
}
- inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
- inum.inum = inode_u.bi_inum;
-
bch2_iget5_set(&inode->v, &inum);
- bch2_vfs_inode_init(&trans, inum, inode, &inode_u);
+ bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
@@ -1350,10 +1356,16 @@ static const struct export_operations bch_export_ops = {
static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi)
+ struct bch_inode_unpacked *bi,
+ struct bch_subvolume *subvol)
{
bch2_inode_update_after_write(trans, inode, bi, ~0);
+ if (BCH_SUBVOLUME_SNAP(subvol))
+ set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
+ else
+ clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
+
inode->v.i_blocks = bi->bi_sectors;
inode->v.i_ino = bi->bi_inum;
inode->v.i_rdev = bi->bi_dev;
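
bch2_vfs_inode_init() now receives the inode's subvolume so it can cache the snapshot bit in ei_flags; __bch2_create() looks the subvolume up with BTREE_ITER_WITH_UPDATES before committing, since a subvolume created in the same transaction would not be visible otherwise. The diff itself only sets and clears the bit; a hypothetical sketch (not a quote of bcachefs code) of how such a flag can then gate quota accounting, per the fs.h comment below:

#include <stdbool.h>

#define EI_INODE_SNAPSHOT 1

struct inode_info { unsigned long ei_flags; };

static bool inode_flag_test(const struct inode_info *inode, int nr)
{
	return inode->ei_flags & (1UL << nr);
}

static void maybe_quota_acct(struct inode_info *inode, long sectors)
{
	if (inode_flag_test(inode, EI_INODE_SNAPSHOT))
		return;		/* no quota accounting in snapshots */

	/* ... charge sectors to the inode's quota ids ... */
	(void)sectors;
}

int main(void)
{
	struct inode_info snap = { .ei_flags = 1UL << EI_INODE_SNAPSHOT };

	maybe_quota_acct(&snap, 8);	/* no-op: inode is in a snapshot */
	return 0;
}
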
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index 27aacd7e..b2211ec7 100644
--- a/libbcachefs/fs.h
+++ b/libbcachefs/fs.h
@@ -64,6 +64,12 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode)
*/
#define EI_INODE_ERROR 0
+/*
+ * Set if the inode is in a snapshot subvolume - we don't do quota accounting in
+ * those:
+ */
+#define EI_INODE_SNAPSHOT 1
+
#define to_bch_ei(_inode) \
container_of_or_null(_inode, struct bch_inode_info, v)
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 3a6b4446..5a3c9eff 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -2323,6 +2323,10 @@ retry:
swap(bvec_iter.bi_size, bytes);
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
+
+ ret = btree_trans_too_many_iters(&trans);
+ if (ret)
+ break;
}
err:
bch2_trans_iter_exit(&trans, &iter);
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 5d9c00af..afb1bb2a 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -223,19 +223,19 @@ enum opt_type {
BCH_SB_POSIX_ACL, true, \
NULL, "Enable POSIX acls") \
x(usrquota, u8, \
- 0, \
+ OPT_FORMAT|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH_SB_USRQUOTA, false, \
NULL, "Enable user quotas") \
x(grpquota, u8, \
- 0, \
+ OPT_FORMAT|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH_SB_GRPQUOTA, false, \
NULL, "Enable group quotas") \
x(prjquota, u8, \
- 0, \
+ OPT_FORMAT|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH_SB_PRJQUOTA, false, \
NULL, "Enable project quotas") \
x(degraded, u8, \
OPT_MOUNT, \
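
The three quota options previously had no flags and no superblock field (NO_SB_OPT), so they could not be persisted; they are now backed by BCH_SB_USRQUOTA and friends and settable at format or mount time. For readers unfamiliar with the layout: the option table is an x-macro, expanded several ways to generate enums, name tables and so on. An illustrative (not verbatim) sketch of the pattern:

#include <stdio.h>

#define OPT_FORMAT	(1 << 0)
#define OPT_MOUNT	(1 << 1)

#define SKETCH_OPTS()				\
	x(usrquota, OPT_FORMAT|OPT_MOUNT)	\
	x(grpquota, OPT_FORMAT|OPT_MOUNT)

#define x(_name, _flags) Opt_##_name,
enum sketch_opt_id { SKETCH_OPTS() sketch_opt_nr };
#undef x

#define x(_name, _flags) [Opt_##_name] = #_name,
static const char * const sketch_opt_names[] = { SKETCH_OPTS() };
#undef x

int main(void)
{
	printf("%s\n", sketch_opt_names[Opt_usrquota]);	/* "usrquota" */
	return 0;
}
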
diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c
index 5f1216da..8f8f4b0a 100644
--- a/libbcachefs/quota.c
+++ b/libbcachefs/quota.c
@@ -3,6 +3,7 @@
#include "btree_update.h"
#include "inode.h"
#include "quota.h"
+#include "subvolume.h"
#include "super-io.h"
static const char *bch2_sb_validate_quota(struct bch_sb *sb,
@@ -415,14 +416,55 @@ static void bch2_sb_quota_read(struct bch_fs *c)
}
}
+static int bch2_fs_quota_read_inode(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked u;
+ struct bch_subvolume subvolume;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_btree_iter_peek(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (!k.k)
+ return 1;
+
+ ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume);
+ if (ret)
+ return ret;
+
+ /*
+ * We don't do quota accounting in snapshots:
+ */
+ if (BCH_SUBVOLUME_SNAP(&subvolume))
+ goto advance;
+
+ if (!bkey_is_inode(k.k))
+ goto advance;
+
+ ret = bch2_inode_unpack(k, &u);
+ if (ret)
+ return ret;
+
+ bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
+ KEY_TYPE_QUOTA_NOCHECK);
+ bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
+ KEY_TYPE_QUOTA_NOCHECK);
+advance:
+ bch2_btree_iter_set_pos(iter, POS(iter->pos.inode, iter->pos.offset + 1));
+ return 0;
+}
+
int bch2_fs_quota_read(struct bch_fs *c)
{
unsigned i, qtypes = enabled_qtypes(c);
struct bch_memquota_type *q;
struct btree_trans trans;
struct btree_iter iter;
- struct bch_inode_unpacked u;
- struct bkey_s_c k;
int ret;
mutex_lock(&c->sb_lock);
@@ -437,23 +479,18 @@ int bch2_fs_quota_read(struct bch_fs *c)
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
- if (bkey_is_inode(k.k)) {
- ret = bch2_inode_unpack(k, &u);
- if (ret)
- return ret;
-
- bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
- KEY_TYPE_QUOTA_NOCHECK);
- bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
- KEY_TYPE_QUOTA_NOCHECK);
- }
- }
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ do {
+ ret = lockrestart_do(&trans,
+ bch2_fs_quota_read_inode(&trans, &iter));
+ } while (!ret);
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
- return ret;
+ return ret < 0 ? ret : 0;
}
/* Enable/disable/delete quotas for an entire filesystem: */
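
bch2_fs_quota_read() is restructured around a per-key helper so that each step runs inside lockrestart_do(), which (as used elsewhere in this tree) reruns its body when the transaction restarts; the helper also skips inodes in snapshots, matching the EI_INODE_SNAPSHOT change above. Its return convention drives the loop: 0 to continue, 1 when the btree is exhausted, negative errno on failure, with the final `ret < 0 ? ret : 0` mapping "done" to success. A standalone model with a stub in place of the bcachefs API:

#include <stdio.h>

static int quota_read_one(unsigned *pos, unsigned nr_inodes)
{
	if (*pos >= nr_inodes)
		return 1;		/* no key: iteration is complete */
	/* ... skip snapshots, unpack the inode, account it ... */
	(*pos)++;			/* advance the iterator */
	return 0;
}

int main(void)
{
	unsigned pos = 0;
	int ret;

	do {
		ret = quota_read_one(&pos, 3);	/* stands in for lockrestart_do(...) */
	} while (!ret);

	printf("%d\n", ret < 0 ? ret : 0);	/* 1 ("done") maps to success */
	return 0;
}
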
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
index 8dcac781..c8d6d736 100644
--- a/libbcachefs/reflink.c
+++ b/libbcachefs/reflink.c
@@ -184,7 +184,8 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
- ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, 0);
+ ret = bch2_trans_update(trans, extent_iter, &r_p->k_i,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
err:
c->reflink_hint = reflink_iter.pos.offset;
bch2_trans_iter_exit(trans, &reflink_iter);
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index 00200659..6c5ea78d 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -427,61 +427,8 @@ int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
return __bch2_mark_replicas(c, r, false);
}
-static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k,
- bool check)
-{
- struct bch_replicas_padded search;
- struct bch_devs_list cached = bch2_bkey_cached_devs(k);
- unsigned i;
- int ret;
-
- memset(&search, 0, sizeof(search));
-
- for (i = 0; i < cached.nr; i++) {
- bch2_replicas_entry_cached(&search.e, cached.devs[i]);
-
- ret = __bch2_mark_replicas(c, &search.e, check);
- if (ret)
- return ret;
- }
-
- bch2_bkey_to_replicas(&search.e, k);
-
- ret = __bch2_mark_replicas(c, &search.e, check);
- if (ret)
- return ret;
-
- if (search.e.data_type == BCH_DATA_parity) {
- search.e.data_type = BCH_DATA_cached;
- ret = __bch2_mark_replicas(c, &search.e, check);
- if (ret)
- return ret;
-
- search.e.data_type = BCH_DATA_user;
- ret = __bch2_mark_replicas(c, &search.e, check);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
/* replicas delta list: */
-bool bch2_replicas_delta_list_marked(struct bch_fs *c,
- struct replicas_delta_list *r)
-{
- struct replicas_delta *d = r->d;
- struct replicas_delta *top = (void *) r->d + r->used;
-
- percpu_rwsem_assert_held(&c->mark_lock);
-
- for (d = r->d; d != top; d = replicas_delta_next(d))
- if (bch2_replicas_entry_idx(c, &d->r) < 0)
- return false;
- return true;
-}
-
int bch2_replicas_delta_list_mark(struct bch_fs *c,
struct replicas_delta_list *r)
{
@@ -494,19 +441,6 @@ int bch2_replicas_delta_list_mark(struct bch_fs *c,
return ret;
}
-/* bkey replicas: */
-
-bool bch2_bkey_replicas_marked(struct bch_fs *c,
- struct bkey_s_c k)
-{
- return __bch2_mark_bkey_replicas(c, k, true) == 0;
-}
-
-int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
-{
- return __bch2_mark_bkey_replicas(c, k, false);
-}
-
/*
* Old replicas_gc mechanism: only used for journal replicas entries now, should
* die at some point:
diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h
index 72ac544f..d237d7c5 100644
--- a/libbcachefs/replicas.h
+++ b/libbcachefs/replicas.h
@@ -48,12 +48,9 @@ replicas_delta_next(struct replicas_delta *d)
return (void *) d + replicas_entry_bytes(&d->r) + 8;
}
-bool bch2_replicas_delta_list_marked(struct bch_fs *, struct replicas_delta_list *);
int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
-bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c);
-int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
unsigned dev)
diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c
index 0ef625d2..7e909a11 100644
--- a/libbcachefs/subvolume.c
+++ b/libbcachefs/subvolume.c
@@ -789,6 +789,15 @@ int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
return ret;
}
+int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
+ struct bch_subvolume *subvol)
+{
+ struct bch_snapshot snap;
+
+ return snapshot_lookup(trans, snapshot, &snap) ?:
+ bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
+}
+
int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol,
u32 *snapid)
{
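
The new bch2_snapshot_get_subvol() composes two lookups, snapshot ID to bch_snapshot to its owning subvolume, using the `a ?: b` chaining this codebase favors: a GNU C extension that yields a when a is nonzero (the error case) and only then evaluates b. A two-line demonstration:

#include <stdio.h>

static int step1(void) { return 0; }	/* succeeds */
static int step2(void) { return -5; }	/* fails */

int main(void)
{
	int ret = step1() ?: step2();	/* stops at the first nonzero result */

	printf("%d\n", ret);		/* -5 */
	return 0;
}
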
diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h
index dde755b4..e4c3fdcd 100644
--- a/libbcachefs/subvolume.h
+++ b/libbcachefs/subvolume.h
@@ -118,6 +118,8 @@ void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c)
int bch2_subvolume_get(struct btree_trans *, unsigned,
bool, int, struct bch_subvolume *);
+int bch2_snapshot_get_subvol(struct btree_trans *, u32,
+ struct bch_subvolume *);
int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
int bch2_subvolume_delete(struct btree_trans *, u32);
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index bec84d8a..80402b39 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -18,8 +18,6 @@
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
-#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9)
-
struct closure;
#ifdef CONFIG_BCACHEFS_DEBUG