author    Kent Overstreet <kent.overstreet@gmail.com>  2017-12-21 18:00:30 -0500
committer Kent Overstreet <kent.overstreet@gmail.com>  2017-12-21 18:06:45 -0500
commit    1cf4d51dc4661f336f5318c176a3561ddf5bf04f (patch)
tree      8b390ccd48361ba1408be6799d46e62c6382cc39 /libbcachefs
parent    8acc54456e11ee0ec80ed0c6abb6d68abae60592 (diff)
Update bcachefs sources to 14ce2a2031 bcachefs: fixes for building in userspace
Diffstat (limited to 'libbcachefs')
-rw-r--r--  libbcachefs/acl.c                         3
-rw-r--r--  libbcachefs/alloc.c                      18
-rw-r--r--  libbcachefs/bcachefs.h                   15
-rw-r--r--  libbcachefs/bcachefs_format.h            37
-rw-r--r--  libbcachefs/bkey.c                       37
-rw-r--r--  libbcachefs/bkey.h                        4
-rw-r--r--  libbcachefs/bkey_methods.c              127
-rw-r--r--  libbcachefs/bkey_methods.h               16
-rw-r--r--  libbcachefs/btree_gc.c                   21
-rw-r--r--  libbcachefs/btree_io.c                  141
-rw-r--r--  libbcachefs/btree_iter.c                 23
-rw-r--r--  libbcachefs/btree_locking.h               3
-rw-r--r--  libbcachefs/btree_types.h                 2
-rw-r--r--  libbcachefs/btree_update.h                4
-rw-r--r--  libbcachefs/btree_update_interior.c     192
-rw-r--r--  libbcachefs/buckets.c                     6
-rw-r--r--  libbcachefs/buckets_types.h               6
-rw-r--r--  libbcachefs/chardev.c                     5
-rw-r--r--  libbcachefs/checksum.h                   13
-rw-r--r--  libbcachefs/error.c                      10
-rw-r--r--  libbcachefs/error.h                       9
-rw-r--r--  libbcachefs/extents.c                    73
-rw-r--r--  libbcachefs/extents.h                     9
-rw-r--r--  libbcachefs/fs-io.c                     623
-rw-r--r--  libbcachefs/fs-ioctl.c                    8
-rw-r--r--  libbcachefs/fs.c                         44
-rw-r--r--  libbcachefs/fs.h                         17
-rw-r--r--  libbcachefs/fsck.c                       16
-rw-r--r--  libbcachefs/inode.c                      29
-rw-r--r--  libbcachefs/inode.h                      44
-rw-r--r--  libbcachefs/io.c                        128
-rw-r--r--  libbcachefs/io.h                         50
-rw-r--r--  libbcachefs/io_types.h                    8
-rw-r--r--  libbcachefs/journal.c                   119
-rw-r--r--  libbcachefs/journal.h                     4
-rw-r--r--  libbcachefs/journal_types.h               1
-rw-r--r--  libbcachefs/migrate.c                   197
-rw-r--r--  libbcachefs/migrate.h                     5
-rw-r--r--  libbcachefs/move.c                       36
-rw-r--r--  libbcachefs/opts.c                       71
-rw-r--r--  libbcachefs/opts.h                       28
-rw-r--r--  libbcachefs/super-io.c                  716
-rw-r--r--  libbcachefs/super-io.h                   28
-rw-r--r--  libbcachefs/super.c                     193
-rw-r--r--  libbcachefs/super.h                      30
-rw-r--r--  libbcachefs/sysfs.c                       2
-rw-r--r--  libbcachefs/tier.c                        3
-rw-r--r--  libbcachefs/util.h                        6
-rw-r--r--  libbcachefs/vstructs.h                    8
-rw-r--r--  libbcachefs/xattr.c                     131
50 files changed, 1952 insertions, 1367 deletions
diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c
index 2632d21c..480941d6 100644
--- a/libbcachefs/acl.c
+++ b/libbcachefs/acl.c
@@ -193,8 +193,7 @@ int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
if (ret < 0)
return ret;
else {
- inode->v.i_ctime =
- current_fs_time(inode->v.i_sb);
+ inode->v.i_ctime = current_time(&inode->v);
mark_inode_dirty(&inode->v);
if (ret == 0)
acl = NULL;
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index d29d871a..29799df6 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -257,7 +257,7 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
return;
a = bkey_s_c_to_alloc(k);
- ca = c->devs[a.k->p.inode];
+ ca = bch_dev_bkey_exists(c, a.k->p.inode);
if (a.k->p.offset >= ca->mi.nbuckets)
return;
@@ -305,10 +305,12 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
bch2_alloc_read_key(c, bkey_i_to_s_c(k));
}
+ mutex_lock(&c->bucket_lock);
for_each_member_device(ca, c, i) {
bch2_recalc_min_prio(c, ca, READ);
bch2_recalc_min_prio(c, ca, WRITE);
}
+ mutex_unlock(&c->bucket_lock);
return 0;
}
@@ -368,7 +370,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
return 0;
- ca = c->devs[pos.inode];
+ ca = bch_dev_bkey_exists(c, pos.inode);
if (pos.offset >= ca->mi.nbuckets)
return 0;
@@ -461,7 +463,7 @@ static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
/* Bucket heap / gen */
-void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
+static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
{
struct prio_clock *clock = &c->prio_clock[rw];
struct bucket *g;
@@ -975,7 +977,7 @@ static int bch2_allocator_thread(void *arg)
void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
- struct bch_dev *ca = c->devs[ob->ptr.dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
spin_lock(&ob->lock);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), false,
@@ -1303,7 +1305,7 @@ static void writepoint_drop_ptrs(struct bch_fs *c,
for (i = wp->nr_ptrs - 1; i >= 0; --i) {
struct open_bucket *ob = wp->ptrs[i];
- struct bch_dev *ca = c->devs[ob->ptr.dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
if (nr_ptrs_dislike && !test_bit(ob->ptr.dev, devs->d)) {
BUG_ON(ca->open_buckets_partial_nr >=
@@ -1331,7 +1333,7 @@ static void verify_not_stale(struct bch_fs *c, const struct write_point *wp)
unsigned i;
writepoint_for_each_ptr(wp, ob, i) {
- struct bch_dev *ca = c->devs[ob->ptr.dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
BUG_ON(ptr_stale(ca, &ob->ptr));
}
@@ -1537,7 +1539,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
for (i = 0; i < wp->nr_ptrs_can_use; i++) {
struct open_bucket *ob = wp->ptrs[i];
- struct bch_dev *ca = c->devs[ob->ptr.dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
struct bch_extent_ptr tmp = ob->ptr;
EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev));
@@ -1589,7 +1591,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
ra_pages += bdi->ra_pages;
}
- c->bdi.ra_pages = ra_pages;
+ bch2_set_ra_pages(c, ra_pages);
/* Find fastest, slowest tiers with devices: */
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index b679dd16..e25baf56 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -326,9 +326,9 @@ struct io_count {
struct bch_dev {
struct kobject kobj;
struct percpu_ref ref;
+ struct completion ref_completion;
struct percpu_ref io_ref;
- struct completion stop_complete;
- struct completion offline_complete;
+ struct completion io_ref_completion;
struct bch_fs *fs;
@@ -515,12 +515,11 @@ struct bch_fs {
struct closure sb_write;
struct mutex sb_lock;
- struct backing_dev_info bdi;
-
/* BTREE CACHE */
struct bio_set btree_read_bio;
struct btree_root btree_roots[BTREE_ID_NR];
+ bool btree_roots_dirty;
struct mutex btree_root_lock;
struct btree_cache btree_cache;
@@ -710,6 +709,14 @@ struct bch_fs {
#undef BCH_TIME_STAT
};
+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
+{
+#ifndef NO_BCACHEFS_FS
+ if (c->vfs_sb)
+ c->vfs_sb->s_bdi->ra_pages = ra_pages;
+#endif
+}
+
static inline bool bch2_fs_running(struct bch_fs *c)
{
return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 2dc9a7e0..6e0e0452 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -593,18 +593,24 @@ struct bch_inode_generation {
} __attribute__((packed, aligned(8)));
BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION);
-#define BCH_INODE_FIELDS() \
- BCH_INODE_FIELD(bi_atime, 64) \
- BCH_INODE_FIELD(bi_ctime, 64) \
- BCH_INODE_FIELD(bi_mtime, 64) \
- BCH_INODE_FIELD(bi_otime, 64) \
- BCH_INODE_FIELD(bi_size, 64) \
- BCH_INODE_FIELD(bi_sectors, 64) \
- BCH_INODE_FIELD(bi_uid, 32) \
- BCH_INODE_FIELD(bi_gid, 32) \
- BCH_INODE_FIELD(bi_nlink, 32) \
- BCH_INODE_FIELD(bi_generation, 32) \
- BCH_INODE_FIELD(bi_dev, 32)
+#define BCH_INODE_FIELDS() \
+ BCH_INODE_FIELD(bi_atime, 64) \
+ BCH_INODE_FIELD(bi_ctime, 64) \
+ BCH_INODE_FIELD(bi_mtime, 64) \
+ BCH_INODE_FIELD(bi_otime, 64) \
+ BCH_INODE_FIELD(bi_size, 64) \
+ BCH_INODE_FIELD(bi_sectors, 64) \
+ BCH_INODE_FIELD(bi_uid, 32) \
+ BCH_INODE_FIELD(bi_gid, 32) \
+ BCH_INODE_FIELD(bi_nlink, 32) \
+ BCH_INODE_FIELD(bi_generation, 32) \
+ BCH_INODE_FIELD(bi_dev, 32) \
+ BCH_INODE_FIELD(bi_data_checksum, 8) \
+ BCH_INODE_FIELD(bi_compression, 8)
+
+#define BCH_INODE_FIELDS_INHERIT() \
+ BCH_INODE_FIELD(bi_data_checksum) \
+ BCH_INODE_FIELD(bi_compression)
enum {
/*
@@ -794,7 +800,7 @@ struct bch_sb_layout {
__u8 sb_max_size_bits; /* base 2 of 512 byte sectors */
__u8 nr_superblocks;
__u8 pad[5];
- __u64 sb_offset[61];
+ __le64 sb_offset[61];
} __attribute__((packed, aligned(8)));
#define BCH_SB_LAYOUT_SECTOR 7
@@ -1089,6 +1095,11 @@ struct jset_entry {
};
};
+struct jset_entry_blacklist {
+ struct jset_entry entry;
+ __le64 seq;
+};
+
#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
enum {
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index 73089a90..97015084 100644
--- a/libbcachefs/bkey.c
+++ b/libbcachefs/bkey.c
@@ -1,6 +1,7 @@
#include "bcachefs.h"
#include "bkey.h"
+#include "bkey_methods.h"
#include "bset.h"
#include "util.h"
@@ -80,37 +81,6 @@ static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
const struct bkey_format *format) {}
#endif
-int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
-{
- char *out = buf, *end = buf + size;
-
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
-
- p("u64s %u type %u %llu:%llu snap %u len %u ver %llu",
- k->u64s, k->type, k->p.inode, k->p.offset,
- k->p.snapshot, k->size, k->version.lo);
-
- BUG_ON(bkey_packed(k));
-
- switch (k->type) {
- case KEY_TYPE_DELETED:
- p(" deleted");
- break;
- case KEY_TYPE_DISCARD:
- p(" discard");
- break;
- case KEY_TYPE_ERROR:
- p(" error");
- break;
- case KEY_TYPE_COOKIE:
- p(" cookie");
- break;
- }
-#undef p
-
- return out - buf;
-}
-
struct pack_state {
const struct bkey_format *format;
unsigned bits; /* bits remaining in current word */
@@ -336,7 +306,8 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
* Extents - we have to guarantee that if an extent is packed, a trimmed
* version will also pack:
*/
- if (bkey_start_offset(in) < format->field_offset[BKEY_FIELD_OFFSET])
+ if (bkey_start_offset(in) <
+ le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET]))
return false;
pack_state_finish(&state, out);
@@ -800,7 +771,7 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
bool *eax_zeroed)
{
unsigned bits = format->bits_per_field[field];
- u64 offset = format->field_offset[field];
+ u64 offset = le64_to_cpu(format->field_offset[field]);
unsigned i, byte, bit_offset, align, shl, shr;
if (!bits && !offset) {
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index dc0b88f7..89697956 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -8,7 +8,6 @@
#include "vstructs.h"
void bch2_to_binary(char *, const u64 *, unsigned);
-int bch2_bkey_to_text(char *, size_t, const struct bkey *);
#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
@@ -377,7 +376,8 @@ static inline u64 bkey_field_max(const struct bkey_format *f,
enum bch_bkey_fields nr)
{
return f->bits_per_field[nr] < 64
- ? f->field_offset[nr] + ~(~0ULL << f->bits_per_field[nr])
+ ? (le64_to_cpu(f->field_offset[nr]) +
+ ~(~0ULL << f->bits_per_field[nr]))
: U64_MAX;
}
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 23894158..1736a483 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -18,28 +18,11 @@ const struct bkey_ops *bch2_bkey_ops[] = {
[BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops,
};
-/* Returns string indicating reason for being invalid, or NULL if valid: */
-const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
- struct bkey_s_c k)
+const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
+ struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
- if (k.k->u64s < BKEY_U64s)
- return "u64s too small";
-
- if (!ops->is_extents) {
- if (k.k->size)
- return "nonzero size field";
- } else {
- if ((k.k->size == 0) != bkey_deleted(k.k))
- return "bad size field";
- }
-
- if (ops->is_extents &&
- !k.k->size &&
- !bkey_deleted(k.k))
- return "zero size field";
-
switch (k.k->type) {
case KEY_TYPE_DELETED:
case KEY_TYPE_DISCARD:
@@ -63,8 +46,41 @@ const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
}
}
-const char *bch2_btree_bkey_invalid(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k)
+const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
+ struct bkey_s_c k)
+{
+ const struct bkey_ops *ops = bch2_bkey_ops[type];
+
+ if (k.k->u64s < BKEY_U64s)
+ return "u64s too small";
+
+ if (!ops->is_extents) {
+ if (k.k->size)
+ return "nonzero size field";
+ } else {
+ if ((k.k->size == 0) != bkey_deleted(k.k))
+ return "bad size field";
+ }
+
+ if (ops->is_extents &&
+ !k.k->size &&
+ !bkey_deleted(k.k))
+ return "zero size field";
+
+ if (k.k->p.snapshot)
+ return "nonzero snapshot";
+
+ return NULL;
+}
+
+const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
+ struct bkey_s_c k)
+{
+ return __bch2_bkey_invalid(c, type, k) ?:
+ bch2_bkey_val_invalid(c, type, k);
+}
+
+const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
{
if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
return "key before start of btree node";
@@ -72,10 +88,7 @@ const char *bch2_btree_bkey_invalid(struct bch_fs *c, struct btree *b,
if (bkey_cmp(k.k->p, b->data->max_key) > 0)
return "key past end of btree node";
- if (k.k->p.snapshot)
- return "nonzero snapshot";
-
- return bch2_bkey_invalid(c, btree_node_type(b), k);
+ return NULL;
}
void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
@@ -86,7 +99,8 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
BUG_ON(!k.k->u64s);
- invalid = bch2_btree_bkey_invalid(c, b, k);
+ invalid = bch2_bkey_invalid(c, type, k) ?:
+ bch2_bkey_in_btree_node(b, k);
if (invalid) {
char buf[160];
@@ -100,33 +114,62 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
ops->key_debugcheck(c, b, k);
}
-char *bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
- char *buf, size_t size, struct bkey_s_c k)
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
{
- const struct bkey_ops *ops = bch2_bkey_ops[type];
+ char *out = buf, *end = buf + size;
- if (k.k->type >= KEY_TYPE_GENERIC_NR &&
- ops->val_to_text)
- ops->val_to_text(c, buf, size, k);
+ p("u64s %u type %u ", k->u64s, k->type);
+
+ if (bkey_cmp(k->p, POS_MAX))
+ p("%llu:%llu", k->p.inode, k->p.offset);
+ else
+ p("POS_MAX");
- return buf;
+ p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo);
+
+ return out - buf;
}
-char *bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
- char *buf, size_t size, struct bkey_s_c k)
+int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
+ char *buf, size_t size, struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
char *out = buf, *end = buf + size;
- out += bch2_bkey_to_text(out, end - out, k.k);
-
- if (k.k->type >= KEY_TYPE_GENERIC_NR &&
- ops->val_to_text) {
- out += scnprintf(out, end - out, ": ");
- ops->val_to_text(c, out, end - out, k);
+ switch (k.k->type) {
+ case KEY_TYPE_DELETED:
+ p(" deleted");
+ break;
+ case KEY_TYPE_DISCARD:
+ p(" discard");
+ break;
+ case KEY_TYPE_ERROR:
+ p(" error");
+ break;
+ case KEY_TYPE_COOKIE:
+ p(" cookie");
+ break;
+ default:
+ if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text)
+ ops->val_to_text(c, buf, size, k);
+ break;
}
- return buf;
+ return out - buf;
+}
+
+int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
+ char *buf, size_t size, struct bkey_s_c k)
+{
+ char *out = buf, *end = buf + size;
+
+ out += bch2_bkey_to_text(out, end - out, k.k);
+ out += scnprintf(out, end - out, ": ");
+ out += bch2_val_to_text(c, type, out, end - out, k);
+
+ return out - buf;
}
void bch2_bkey_swab(enum bkey_type type,
diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h
index 29c1abd3..59db3037 100644
--- a/libbcachefs/bkey_methods.h
+++ b/libbcachefs/bkey_methods.h
@@ -64,15 +64,19 @@ struct bkey_ops {
bool is_extents;
};
+const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type,
+ struct bkey_s_c);
+const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
-const char *bch2_btree_bkey_invalid(struct bch_fs *, struct btree *,
- struct bkey_s_c);
+const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
-char *bch2_val_to_text(struct bch_fs *, enum bkey_type,
- char *, size_t, struct bkey_s_c);
-char *bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
- char *, size_t, struct bkey_s_c);
+
+int bch2_bkey_to_text(char *, size_t, const struct bkey *);
+int bch2_val_to_text(struct bch_fs *, enum bkey_type,
+ char *, size_t, struct bkey_s_c);
+int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
+ char *, size_t, struct bkey_s_c);
void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
struct bkey_packed *);
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 1198fe39..2294cc3a 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -96,7 +96,7 @@ u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
extent_for_each_ptr(e, ptr) {
- struct bch_dev *ca = c->devs[ptr->dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t b = PTR_BUCKET_NR(ca, ptr);
if (gen_after(ca->oldest_gens[b], ptr->gen))
@@ -159,14 +159,15 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
(!c->opts.nofsck &&
fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c,
- "superblock not marked as containing replicas"))) {
+ "superblock not marked as containing replicas (type %u)",
+ data_type))) {
ret = bch2_check_mark_super(c, e, data_type);
if (ret)
return ret;
}
extent_for_each_ptr(e, ptr) {
- struct bch_dev *ca = c->devs[ptr->dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = PTR_BUCKET(ca, ptr);
if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
@@ -315,14 +316,14 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
lockdep_assert_held(&c->sb_lock);
for (i = 0; i < layout->nr_superblocks; i++) {
- if (layout->sb_offset[i] == BCH_SB_SECTOR)
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
+
+ if (offset == BCH_SB_SECTOR)
mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
BUCKET_SB, flags);
- mark_metadata_sectors(c, ca,
- layout->sb_offset[i],
- layout->sb_offset[i] +
- (1 << layout->sb_max_size_bits),
+ mark_metadata_sectors(c, ca, offset,
+ offset + (1 << layout->sb_max_size_bits),
BUCKET_SB, flags);
}
@@ -414,7 +415,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
spin_lock(&ob->lock);
if (ob->valid) {
gc_pos_set(c, gc_pos_alloc(c, ob));
- ca = c->devs[ob->ptr.dev];
+ ca = bch_dev_bkey_exists(c, ob->ptr.dev);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), true,
gc_pos_alloc(c, ob),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
@@ -424,7 +425,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
}
}
-void bch2_gc_start(struct bch_fs *c)
+static void bch2_gc_start(struct bch_fs *c)
{
struct bch_dev *ca;
struct bucket *g;
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 38c373c6..87a8ddf9 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -556,7 +556,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
struct bset_tree *t;
struct bset *start_bset = bset(b, &b->set[start_idx]);
bool used_mempool = false;
- u64 start_time;
+ u64 start_time, seq = 0;
unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
bool sorting_entire_node = start_idx == 0 &&
end_idx == b->nsets;
@@ -595,12 +595,9 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
bch2_time_stats_update(&c->btree_sort_time, start_time);
/* Make sure we preserve bset journal_seq: */
- for (t = b->set + start_idx + 1;
- t < b->set + end_idx;
- t++)
- start_bset->journal_seq =
- max(start_bset->journal_seq,
- bset(b, t)->journal_seq);
+ for (t = b->set + start_idx; t < b->set + end_idx; t++)
+ seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
+ start_bset->journal_seq = cpu_to_le64(seq);
if (sorting_entire_node) {
unsigned u64s = le16_to_cpu(out->keys.u64s);
@@ -958,6 +955,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
{
struct bkey_packed *k, *prev = NULL;
struct bpos prev_pos = POS_MIN;
+ enum bkey_type type = btree_node_type(b);
bool seen_non_whiteout = false;
const char *err;
int ret = 0;
@@ -1025,7 +1023,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
if (!BSET_SEPARATE_WHITEOUTS(i)) {
seen_non_whiteout = true;
- whiteout_u64s = 0;
+ *whiteout_u64s = 0;
}
for (k = i->start;
@@ -1059,16 +1057,17 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
}
if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
- bch2_bkey_swab(btree_node_type(b), &b->format, k);
+ bch2_bkey_swab(type, &b->format, k);
u = bkey_disassemble(b, k, &tmp);
- invalid = bch2_btree_bkey_invalid(c, b, u);
+ invalid = __bch2_bkey_invalid(c, type, u) ?:
+ bch2_bkey_in_btree_node(b, u) ?:
+ (write ? bch2_bkey_val_invalid(c, type, u) : NULL);
if (invalid) {
char buf[160];
- bch2_bkey_val_to_text(c, btree_node_type(b),
- buf, sizeof(buf), u);
+ bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
btree_err(BTREE_ERR_FIXABLE, c, b, i,
"invalid bkey %s: %s", buf, invalid);
@@ -1114,6 +1113,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
struct btree_node_entry *bne;
struct btree_node_iter *iter;
struct btree_node *sorted;
+ struct bkey_packed *k;
+ struct bset *i;
bool used_mempool;
unsigned u64s;
int ret, retry_read = 0, write = READ;
@@ -1137,7 +1138,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
unsigned sectors, whiteout_u64s = 0;
struct nonce nonce;
struct bch_csum csum;
- struct bset *i;
if (!b->written) {
i = &b->data->keys;
@@ -1238,6 +1238,31 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
+ i = &b->data->keys;
+ for (k = i->start; k != vstruct_last(i);) {
+ enum bkey_type type = btree_node_type(b);
+ struct bkey tmp;
+ struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
+ const char *invalid = bch2_bkey_val_invalid(c, type, u);
+
+ if (invalid) {
+ char buf[160];
+
+ bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
+ btree_err(BTREE_ERR_FIXABLE, c, b, i,
+ "invalid bkey %s: %s", buf, invalid);
+
+ btree_keys_account_key_drop(&b->nr, 0, k);
+
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+ memmove_u64s_down(k, bkey_next(k),
+ (u64 *) vstruct_end(i) - (u64 *) k);
+ continue;
+ }
+
+ k = bkey_next(k);
+ }
+
bch2_bset_build_aux_tree(b, b->set, false);
set_needs_whiteout(btree_bset_first(b));
@@ -1278,13 +1303,13 @@ static void btree_node_read_work(struct work_struct *work)
bio->bi_iter.bi_size = btree_bytes(c);
submit_bio_wait(bio);
start:
- bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read");
+ bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read");
percpu_ref_put(&rb->pick.ca->io_ref);
__set_bit(rb->pick.ca->dev_idx, avoid.d);
rb->pick = bch2_btree_pick_ptr(c, b, &avoid);
- if (!bio->bi_error &&
+ if (!bio->bi_status &&
!bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca)))
goto out;
} while (!IS_ERR_OR_NULL(rb->pick.ca));
@@ -1377,17 +1402,24 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
bch2_btree_node_read(c, b, true);
- six_unlock_write(&b->lock);
if (btree_node_read_error(b)) {
- six_unlock_intent(&b->lock);
- return -EIO;
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ mutex_lock(&c->btree_cache.lock);
+ list_move(&b->list, &c->btree_cache.freeable);
+ mutex_unlock(&c->btree_cache.lock);
+
+ ret = -EIO;
+ goto err;
}
bch2_btree_set_root_for_read(c, b);
+err:
+ six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
- return 0;
+ return ret;
}
void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
@@ -1412,35 +1444,57 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
struct closure *cl = wbio->cl;
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
struct bkey_i_extent *new_key;
+ struct bkey_s_extent e;
+ struct bch_extent_ptr *ptr;
+ struct btree_iter iter;
+ int ret;
- six_lock_read(&b->lock);
- bkey_copy(&tmp.k, &b->key);
- six_unlock_read(&b->lock);
+ __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
+ BTREE_MAX_DEPTH,
+ b->level, 0);
+retry:
+ ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto err;
- if (!bkey_extent_is_data(&tmp.k.k) || !PTR_HASH(&tmp.k)) {
- /* Node has been freed: */
+ /* has node been freed? */
+ if (iter.nodes[b->level] != b) {
+ /* node has been freed: */
+ if (!btree_node_dying(b))
+ panic("foo4\n");
goto out;
}
- new_key = bkey_i_to_extent(&tmp.k);
+ if (!btree_node_hashed(b))
+ panic("foo5\n");
- while (wbio->replicas_failed) {
- unsigned idx = __fls(wbio->replicas_failed);
+ bkey_copy(&tmp.k, &b->key);
- bch2_extent_drop_ptr_idx(extent_i_to_s(new_key), idx);
- wbio->replicas_failed ^= 1 << idx;
- }
+ new_key = bkey_i_to_extent(&tmp.k);
+ e = extent_i_to_s(new_key);
+ extent_for_each_ptr_backwards(e, ptr)
+ if (bch2_dev_list_has_dev(wbio->failed, ptr->dev))
+ bch2_extent_drop_ptr(e, ptr);
- if (!bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)) ||
- bch2_btree_node_update_key(c, b, new_key)) {
- set_btree_node_noevict(b);
- bch2_fatal_error(c);
- }
+ if (!bch2_extent_nr_ptrs(e.c))
+ goto err;
+
+ ret = bch2_btree_node_update_key(c, &iter, b, new_key);
+ if (ret == -EINTR)
+ goto retry;
+ if (ret)
+ goto err;
out:
+ bch2_btree_iter_unlock(&iter);
bio_put(&wbio->bio);
btree_node_write_done(c, b);
if (cl)
closure_put(cl);
+ return;
+err:
+ set_btree_node_noevict(b);
+ bch2_fs_fatal_error(c, "fatal error writing btree node");
+ goto out;
}
void bch2_btree_write_error_work(struct work_struct *work)
@@ -1470,12 +1524,17 @@ static void btree_node_write_endio(struct bio *bio)
struct closure *cl = !wbio->split ? wbio->cl : NULL;
struct bch_fs *c = wbio->c;
struct bch_dev *ca = wbio->ca;
+ unsigned long flags;
bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
- if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") ||
- bch2_meta_write_fault("btree"))
- set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
+ if (bio->bi_status == BLK_STS_REMOVED ||
+ bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
+ bch2_meta_write_fault("btree")) {
+ spin_lock_irqsave(&c->btree_write_error_lock, flags);
+ bch2_dev_list_add_dev(&orig->failed, ca->dev_idx);
+ spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+ }
if (wbio->have_io_ref)
percpu_ref_put(&ca->io_ref);
@@ -1491,12 +1550,11 @@ static void btree_node_write_endio(struct bio *bio)
wbio->used_mempool,
wbio->data);
- if (wbio->replicas_failed) {
- unsigned long flags;
-
+ if (wbio->failed.nr) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bio_list_add(&c->btree_write_error_list, &wbio->bio);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+
queue_work(c->wq, &c->btree_write_error_work);
return;
}
@@ -1707,6 +1765,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write));
wbio->cl = parent;
+ wbio->failed.nr = 0;
wbio->order = order;
wbio->used_mempool = used_mempool;
wbio->data = data;
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index b0e64957..0b505a73 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -75,8 +75,8 @@ bool bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
{
struct btree_iter *linked;
struct btree *b = iter->nodes[level];
- enum btree_node_locked_type want = btree_lock_want(iter, level);
- enum btree_node_locked_type have = btree_node_locked_type(iter, level);
+ int want = btree_lock_want(iter, level);
+ int have = btree_node_locked_type(iter, level);
if (want == have)
return true;
@@ -108,6 +108,17 @@ success:
return true;
}
+bool bch2_btree_iter_relock(struct btree_iter *iter)
+{
+ unsigned l;
+
+ for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
+ if (!bch2_btree_node_relock(iter, l))
+ return false;
+
+ return true;
+}
+
/* Slowpath: */
bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
unsigned level,
@@ -214,7 +225,6 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
unsigned new_locks_want)
{
struct btree_iter *linked;
- unsigned l;
/* Drop locks we don't want anymore: */
if (new_locks_want < iter->locks_want)
@@ -228,12 +238,9 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
iter->locks_want = new_locks_want;
btree_iter_drop_extra_locks(iter);
- for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
- if (!bch2_btree_node_relock(iter, l))
- goto fail;
+ if (bch2_btree_iter_relock(iter))
+ return true;
- return true;
-fail:
/*
* Just an optimization: ancestor nodes must be locked before child
* nodes, so set locks_want on iterators that might lock ancestors
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index c2711892..acfe5b59 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -75,7 +75,7 @@ static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
mark_btree_node_locked(iter, level, SIX_LOCK_intent);
}
-static inline int btree_lock_want(struct btree_iter *iter, int level)
+static inline enum six_lock_type btree_lock_want(struct btree_iter *iter, int level)
{
return level < iter->locks_want
? SIX_LOCK_intent
@@ -111,6 +111,7 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos,
}
bool bch2_btree_node_relock(struct btree_iter *, unsigned);
+bool bch2_btree_iter_relock(struct btree_iter *);
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
void bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index f1e06a37..f0e6896a 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -196,6 +196,7 @@ enum btree_flags {
BTREE_NODE_accessed,
BTREE_NODE_write_in_flight,
BTREE_NODE_just_written,
+ BTREE_NODE_dying,
};
BTREE_FLAG(read_in_flight);
@@ -207,6 +208,7 @@ BTREE_FLAG(write_idx);
BTREE_FLAG(accessed);
BTREE_FLAG(write_in_flight);
BTREE_FLAG(just_written);
+BTREE_FLAG(dying);
static inline struct btree_write *btree_current_write(struct btree *b)
{
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index e11fcec9..c7c29306 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -130,7 +130,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
__le64, unsigned);
-int bch2_btree_node_update_key(struct bch_fs *, struct btree *,
- struct bkey_i_extent *);
+int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
+ struct btree *, struct bkey_i_extent *);
#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 1fe8fff8..04854532 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -21,7 +21,7 @@
static void btree_node_will_make_reachable(struct btree_update *,
struct btree *);
static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
-static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *);
+static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
/* Debug code: */
@@ -686,7 +686,7 @@ retry:
BUG_ON(c->btree_roots[b->btree_id].as != as);
c->btree_roots[b->btree_id].as = NULL;
- bch2_btree_set_root_ondisk(c, b);
+ bch2_btree_set_root_ondisk(c, b, WRITE);
/*
* We don't have to wait anything anything here (before
@@ -914,6 +914,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
struct btree_write *w;
struct bset_tree *t;
+ set_btree_node_dying(b);
btree_interior_update_add_node_reference(as, b);
/*
@@ -925,7 +926,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
* in with keys that aren't in the journal anymore:
*/
for_each_bset(b, t)
- as->journal_seq = max(as->journal_seq, bset(b, t)->journal_seq);
+ as->journal_seq = max(as->journal_seq,
+ le64_to_cpu(bset(b, t)->journal_seq));
mutex_lock(&c->btree_interior_update_lock);
@@ -1027,6 +1029,10 @@ static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
mutex_unlock(&c->btree_cache.lock);
mutex_lock(&c->btree_root_lock);
+ BUG_ON(btree_node_root(c, b) &&
+ (b->level < btree_node_root(c, b)->level ||
+ !btree_node_dying(btree_node_root(c, b))));
+
btree_node_root(c, b) = b;
mutex_unlock(&c->btree_root_lock);
@@ -1054,7 +1060,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
gc_pos_btree_root(b->btree_id));
}
-static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b)
+static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
{
struct btree_root *r = &c->btree_roots[b->btree_id];
@@ -1064,6 +1070,8 @@ static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b)
bkey_copy(&r->key, &b->key);
r->level = b->level;
r->alive = true;
+ if (rw == WRITE)
+ c->btree_roots_dirty = true;
mutex_unlock(&c->btree_root_lock);
}
@@ -1787,64 +1795,16 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
return ret;
}
-int bch2_btree_node_update_key(struct bch_fs *c, struct btree *b,
- struct bkey_i_extent *new_key)
+static void __bch2_btree_node_update_key(struct bch_fs *c,
+ struct btree_update *as,
+ struct btree_iter *iter,
+ struct btree *b, struct btree *new_hash,
+ struct bkey_i_extent *new_key)
{
- struct btree_update *as = NULL;
- struct btree *parent, *new_hash = NULL;
- struct btree_iter iter;
- struct closure cl;
+ struct btree *parent;
bool must_rewrite_parent = false;
int ret;
- __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
- BTREE_MAX_DEPTH,
- b->level, 0);
- closure_init_stack(&cl);
-
- ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
- if (ret)
- return ret;
-
-retry:
- down_read(&c->gc_lock);
- ret = bch2_btree_iter_traverse(&iter);
- if (ret)
- goto err;
-
- /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
- if (!new_hash &&
- PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
- /* bch2_btree_reserve_get will unlock */
- do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
- closure_sync(&cl);
- } while (ret == -EAGAIN);
-
- BUG_ON(ret);
-
- new_hash = bch2_btree_node_mem_alloc(c);
- }
-
- as = bch2_btree_update_start(c, iter.btree_id,
- btree_update_reserve_required(c, b),
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE,
- &cl);
- if (IS_ERR(as)) {
- ret = PTR_ERR(as);
- if (ret == -EAGAIN || ret == -EINTR) {
- bch2_btree_iter_unlock(&iter);
- up_read(&c->gc_lock);
- closure_sync(&cl);
- goto retry;
- }
- goto err;
- }
-
- mutex_lock(&c->btree_interior_update_lock);
-
/*
* Two corner cases that need to be thought about here:
*
@@ -1869,22 +1829,12 @@ retry:
if (b->will_make_reachable)
must_rewrite_parent = true;
- /* other case: btree node being freed */
- if (iter.nodes[b->level] != b) {
- /* node has been freed: */
- BUG_ON(btree_node_hashed(b));
- mutex_unlock(&c->btree_interior_update_lock);
- goto err;
- }
-
- mutex_unlock(&c->btree_interior_update_lock);
-
if (must_rewrite_parent)
as->flags |= BTREE_INTERIOR_UPDATE_MUST_REWRITE;
btree_interior_update_add_node_reference(as, b);
- parent = iter.nodes[b->level + 1];
+ parent = iter->nodes[b->level + 1];
if (parent) {
if (new_hash) {
bkey_copy(&new_hash->key, &new_key->k_i);
@@ -1893,8 +1843,8 @@ retry:
BUG_ON(ret);
}
- bch2_btree_insert_node(as, parent, &iter,
- &keylist_single(&new_key->k_i));
+ bch2_keylist_add(&as->parent_keys, &new_key->k_i);
+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
@@ -1914,7 +1864,7 @@ retry:
BUG_ON(btree_node_root(c, b) != b);
- bch2_btree_node_lock_write(b, &iter);
+ bch2_btree_node_lock_write(b, iter);
bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
c->opts.btree_node_size, true,
@@ -1925,14 +1875,94 @@ retry:
&stats);
bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
gc_pos_btree_root(b->btree_id));
- bkey_copy(&b->key, &new_key->k_i);
+
+ if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, &new_key->k_i);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+ mutex_unlock(&c->btree_cache.lock);
+ } else {
+ bkey_copy(&b->key, &new_key->k_i);
+ }
btree_update_updated_root(as);
- bch2_btree_node_unlock_write(b, &iter);
+ bch2_btree_node_unlock_write(b, iter);
}
bch2_btree_update_done(as);
-out:
+}
+
+int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
+ struct btree *b, struct bkey_i_extent *new_key)
+{
+ struct btree_update *as = NULL;
+ struct btree *new_hash = NULL;
+ struct closure cl;
+ int ret;
+
+ closure_init_stack(&cl);
+
+ if (!down_read_trylock(&c->gc_lock)) {
+ bch2_btree_iter_unlock(iter);
+ down_read(&c->gc_lock);
+
+ if (!bch2_btree_iter_relock(iter)) {
+ ret = -EINTR;
+ goto err;
+ }
+ }
+
+ /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
+ if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+ /* bch2_btree_reserve_get will unlock */
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ if (ret) {
+ ret = -EINTR;
+
+ bch2_btree_iter_unlock(iter);
+ up_read(&c->gc_lock);
+ closure_sync(&cl);
+ down_read(&c->gc_lock);
+
+ if (!bch2_btree_iter_relock(iter))
+ goto err;
+ }
+
+ new_hash = bch2_btree_node_mem_alloc(c);
+ }
+
+ as = bch2_btree_update_start(c, iter->btree_id,
+ btree_update_reserve_required(c, b),
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_USE_ALLOC_RESERVE,
+ &cl);
+ if (IS_ERR(as)) {
+ ret = PTR_ERR(as);
+ if (ret == -EAGAIN)
+ ret = -EINTR;
+
+ if (ret != -EINTR)
+ goto err;
+
+ bch2_btree_iter_unlock(iter);
+ up_read(&c->gc_lock);
+ closure_sync(&cl);
+ down_read(&c->gc_lock);
+
+ if (!bch2_btree_iter_relock(iter))
+ goto err;
+ }
+
+ ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
+ if (ret)
+ goto err_free_update;
+
+ __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
+err:
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
list_move(&new_hash->list, &c->btree_cache.freeable);
@@ -1941,14 +1971,12 @@ out:
six_unlock_write(&new_hash->lock);
six_unlock_intent(&new_hash->lock);
}
- bch2_btree_iter_unlock(&iter);
up_read(&c->gc_lock);
closure_sync(&cl);
return ret;
-err:
- if (as)
- bch2_btree_update_free(as);
- goto out;
+err_free_update:
+ bch2_btree_update_free(as);
+ goto err;
}
/* Init code: */
@@ -1962,7 +1990,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
BUG_ON(btree_node_root(c, b));
__bch2_btree_set_root_inmem(c, b);
- bch2_btree_set_root_ondisk(c, b);
+ bch2_btree_set_root_ondisk(c, b, READ);
}
int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
@@ -1998,7 +2026,7 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
BUG_ON(btree_node_root(c, b));
bch2_btree_set_root_inmem(as, b);
- bch2_btree_set_root_ondisk(c, b);
+ bch2_btree_set_root_ondisk(c, b, WRITE);
bch2_btree_open_bucket_put(c, b);
six_unlock_intent(&b->lock);
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index b73002de..f0a63232 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -174,9 +174,11 @@ do { \
#define bch2_usage_read_raw(_stats) \
({ \
- typeof(*this_cpu_ptr(_stats)) _acc = { 0 }; \
+ typeof(*this_cpu_ptr(_stats)) _acc; \
int cpu; \
\
+ memset(&_acc, 0, sizeof(_acc)); \
+ \
for_each_possible_cpu(cpu) \
bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \
\
@@ -479,7 +481,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
{
struct bucket_mark old, new;
unsigned saturated;
- struct bch_dev *ca = c->devs[ptr->dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
unsigned data_type = type == S_META
? BUCKET_BTREE : BUCKET_DATA;
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index 0bd8d2d8..6f9b1226 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -68,16 +68,14 @@ struct bch_dev_usage {
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
-
/* _uncompressed_ sectors: */
+ u64 online_reserved;
+ u64 available_cache;
struct {
u64 data[S_ALLOC_NR];
u64 persistent_reserved;
} s[BCH_REPLICAS_MAX];
-
- u64 online_reserved;
- u64 available_cache;
};
/*
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index d9a3212c..24af2ca1 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "bcachefs_ioctl.h"
+#include "chardev.h"
#include "super.h"
#include "super-io.h"
@@ -25,7 +26,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
return ERR_PTR(-EINVAL);
rcu_read_lock();
- ca = c->devs[dev];
+ ca = rcu_dereference(c->devs[dev]);
if (ca)
percpu_ref_get(&ca->ref);
rcu_read_unlock();
@@ -80,7 +81,7 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
- if (copy_from_user(user_devs, arg.devs,
+ if (copy_from_user(user_devs, user_arg->devs,
sizeof(u64) * arg.nr_devs))
goto err;
diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h
index 1a089417..b0c8a50e 100644
--- a/libbcachefs/checksum.h
+++ b/libbcachefs/checksum.h
@@ -72,14 +72,15 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
}
}
-static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c)
+static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
+ unsigned opt)
{
if (c->sb.encryption_type)
return c->opts.wide_macs
? BCH_CSUM_CHACHA20_POLY1305_128
: BCH_CSUM_CHACHA20_POLY1305_80;
- return bch2_csum_opt_to_type(c->opts.data_checksum, true);
+ return bch2_csum_opt_to_type(opt, true);
}
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
@@ -143,6 +144,14 @@ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
return nonce;
}
+static inline struct nonce null_nonce(void)
+{
+ struct nonce ret;
+
+ memset(&ret, 0, sizeof(ret));
+ return ret;
+}
+
static inline struct nonce extent_nonce(struct bversion version,
struct bch_extent_crc_unpacked crc)
{
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index 8357c8de..ca2a06e2 100644
--- a/libbcachefs/error.c
+++ b/libbcachefs/error.c
@@ -95,11 +95,17 @@ print:
vscnprintf(buf, sizeof(_buf), fmt, args);
va_end(args);
+ if (c->opts.fix_errors == FSCK_OPT_EXIT) {
+ bch_err(c, "%s, exiting", buf);
+ mutex_unlock(&c->fsck_error_lock);
+ return FSCK_ERR_EXIT;
+ }
+
if (flags & FSCK_CAN_FIX) {
- if (c->opts.fix_errors == FSCK_ERR_ASK) {
+ if (c->opts.fix_errors == FSCK_OPT_ASK) {
printk(KERN_ERR "%s: fix?", buf);
fix = ask_yn();
- } else if (c->opts.fix_errors == FSCK_ERR_YES ||
+ } else if (c->opts.fix_errors == FSCK_OPT_YES ||
(c->opts.nochanges &&
!(flags & FSCK_CAN_IGNORE))) {
if (print)
diff --git a/libbcachefs/error.h b/libbcachefs/error.h
index 68635eee..28fe4fce 100644
--- a/libbcachefs/error.h
+++ b/libbcachefs/error.h
@@ -96,9 +96,10 @@ enum {
};
enum fsck_err_opts {
- FSCK_ERR_NO,
- FSCK_ERR_YES,
- FSCK_ERR_ASK,
+ FSCK_OPT_EXIT,
+ FSCK_OPT_YES,
+ FSCK_OPT_NO,
+ FSCK_OPT_ASK,
};
enum fsck_err_ret {
@@ -217,7 +218,7 @@ do { \
#define bcache_io_error(c, bio, fmt, ...) \
do { \
__bcache_io_error(c, fmt, ##__VA_ARGS__); \
- (bio)->bi_error = -EIO; \
+ (bio)->bi_status = BLK_STS_IOERR; \
} while (0)
#endif /* _BCACHEFS_ERROR_H */
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 6e79f491..176978ca 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -18,6 +18,7 @@
#include "extents.h"
#include "inode.h"
#include "journal.h"
+#include "super.h"
#include "super-io.h"
#include "util.h"
#include "xattr.h"
@@ -156,6 +157,19 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
return nr_ptrs;
}
+unsigned bch2_extent_nr_good_ptrs(struct bch_fs *c, struct bkey_s_c_extent e)
+{
+ const struct bch_extent_ptr *ptr;
+ unsigned nr_ptrs = 0;
+
+ extent_for_each_ptr(e, ptr)
+ nr_ptrs += (!ptr->cached &&
+ bch_dev_bkey_exists(c, ptr->dev)->mi.state !=
+ BCH_MEMBER_STATE_FAILED);
+
+ return nr_ptrs;
+}
+
unsigned bch2_extent_is_compressed(struct bkey_s_c k)
{
struct bkey_s_c_extent e;
@@ -362,7 +376,7 @@ static bool should_drop_ptr(const struct bch_fs *c,
struct bkey_s_c_extent e,
const struct bch_extent_ptr *ptr)
{
- return ptr->cached && ptr_stale(c->devs[ptr->dev], ptr);
+ return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr);
}
static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
@@ -411,8 +425,10 @@ static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
break;
case BCH_EXTENT_ENTRY_crc128:
- entry->crc128.csum.hi = swab64(entry->crc64.csum_hi);
- entry->crc128.csum.lo = swab64(entry->crc64.csum_lo);
+ entry->crc128.csum.hi = (__force __le64)
+ swab64((__force u64) entry->crc128.csum.hi);
+ entry->crc128.csum.lo = (__force __le64)
+ swab64((__force u64) entry->crc128.csum.lo);
break;
case BCH_EXTENT_ENTRY_ptr:
break;
@@ -432,10 +448,11 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
const struct bch_extent_ptr *ptr2;
struct bch_dev *ca;
- if (ptr->dev >= c->sb.nr_devices)
+ if (ptr->dev >= c->sb.nr_devices ||
+ !c->devs[ptr->dev])
return "pointer to invalid device";
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
if (!ca)
return "pointer to invalid device";
@@ -487,7 +504,9 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
break;
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
- ca = c->devs[ptr->dev];
+ ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+ ? bch_dev_bkey_exists(c, ptr->dev)
+ : NULL;
p("ptr: %u:%llu gen %u%s", ptr->dev,
(u64) ptr->offset, ptr->gen,
@@ -528,7 +547,7 @@ static void extent_pick_read_device(struct bch_fs *c,
struct bch_extent_crc_unpacked crc;
extent_for_each_ptr_crc(e, ptr, crc) {
- struct bch_dev *ca = c->devs[ptr->dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (ptr->cached && ptr_stale(ca, ptr))
continue;
@@ -621,7 +640,7 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
bool bad;
extent_for_each_ptr(e, ptr) {
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
g = PTR_BUCKET(ca, ptr);
replicas++;
@@ -1730,7 +1749,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
extent_for_each_ptr(e, ptr) {
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
g = PTR_BUCKET(ca, ptr);
replicas++;
ptrs_per_tier[ca->mi.tier]++;
@@ -1844,7 +1863,7 @@ static void bch2_extent_to_text(struct bch_fs *c, char *buf,
static unsigned PTR_TIER(struct bch_fs *c,
const struct bch_extent_ptr *ptr)
{
- return c->devs[ptr->dev]->mi.tier;
+ return bch_dev_bkey_exists(c, ptr->dev)->mi.tier;
}
static void bch2_extent_crc_init(union bch_extent_crc *crc,
@@ -1971,14 +1990,10 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
struct bkey_s_extent e)
{
struct bch_extent_ptr *ptr;
- unsigned tier = 0, nr_cached = 0, nr_good = 0;
+ unsigned tier = 0, nr_cached = 0;
+ unsigned nr_good = bch2_extent_nr_good_ptrs(c, e.c);
bool have_higher_tier;
- extent_for_each_ptr(e, ptr)
- if (!ptr->cached &&
- c->devs[ptr->dev]->mi.state != BCH_MEMBER_STATE_FAILED)
- nr_good++;
-
if (nr_good <= c->opts.data_replicas)
return;
@@ -2103,7 +2118,7 @@ static enum merge_result bch2_extent_merge(struct bch_fs *c,
return BCH_MERGE_NOMERGE;
/* We don't allow extents to straddle buckets: */
- ca = c->devs[lp->dev];
+ ca = bch_dev_bkey_exists(c, lp->dev);
if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
return BCH_MERGE_NOMERGE;
@@ -2347,6 +2362,30 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
}
}
+int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
+{
+ struct btree_iter iter;
+ struct bpos end = pos;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ end.offset += size;
+
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
+ BTREE_ITER_WITH_HOLES, k) {
+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+ break;
+
+ if (!bch2_extent_is_fully_allocated(k)) {
+ ret = -ENOSPC;
+ break;
+ }
+ }
+ bch2_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
const struct bkey_ops bch2_bkey_extent_ops = {
.key_invalid = bch2_extent_invalid,
.key_debugcheck = bch2_extent_debugcheck,
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 1ec2db5e..ab7993ab 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -45,6 +45,7 @@ bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
+unsigned bch2_extent_nr_good_ptrs(struct bch_fs *, struct bkey_s_c_extent);
unsigned bch2_extent_is_compressed(struct bkey_s_c);
bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
@@ -243,14 +244,14 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
case BCH_EXTENT_CRC32:
return (struct bch_extent_crc_unpacked) {
common_fields(crc->crc32),
- .csum.lo = crc->crc32.csum,
+ .csum.lo = (__force __le64) crc->crc32.csum,
};
case BCH_EXTENT_CRC64:
return (struct bch_extent_crc_unpacked) {
common_fields(crc->crc64),
.nonce = crc->crc64.nonce,
- .csum.lo = crc->crc64.csum_lo,
- .csum.hi = crc->crc64.csum_hi,
+ .csum.lo = (__force __le64) crc->crc64.csum_lo,
+ .csum.hi = (__force __le64) crc->crc64.csum_hi,
};
case BCH_EXTENT_CRC128:
return (struct bch_extent_crc_unpacked) {
@@ -425,4 +426,6 @@ bool bch2_cut_front(struct bpos, struct bkey_i *);
bool bch2_cut_back(struct bpos, struct bkey *);
void bch2_key_resize(struct bkey *, unsigned);
+int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
+
#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 298e3592..2c34a85c 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -28,8 +28,11 @@
struct i_sectors_hook {
struct extent_insert_hook hook;
- s64 sectors;
struct bch_inode_info *inode;
+ s64 sectors;
+ u64 new_i_size;
+ unsigned flags;
+ unsigned appending:1;
};
struct bchfs_write_op {
@@ -43,17 +46,6 @@ struct bchfs_write_op {
struct bch_write_op op;
};
-static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
- struct bch_inode_info *inode,
- bool is_dio)
-{
- op->inode = inode;
- op->sectors_added = 0;
- op->is_dio = is_dio;
- op->unalloc = false;
- op->new_i_size = U64_MAX;
-}
-
struct bch_writepage_io {
struct closure cl;
@@ -65,12 +57,8 @@ struct dio_write {
struct closure cl;
struct kiocb *req;
struct bch_fs *c;
- long written;
- long error;
loff_t offset;
- struct disk_reservation res;
-
struct iovec *iovec;
struct iovec inline_vecs[UIO_FASTIOV];
struct iov_iter iter;
@@ -129,12 +117,6 @@ static int inode_set_size(struct bch_inode_info *inode,
lockdep_assert_held(&inode->ei_update_lock);
bi->bi_size = *new_i_size;
-
- if (atomic_long_read(&inode->ei_size_dirty_count))
- bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY;
- else
- bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
-
return 0;
}
@@ -145,16 +127,16 @@ static int __must_check bch2_write_inode_size(struct bch_fs *c,
return __bch2_write_inode(c, inode, inode_set_size, &new_size);
}
-static inline void i_size_dirty_put(struct bch_inode_info *inode)
+static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, int sectors)
{
- atomic_long_dec_bug(&inode->ei_size_dirty_count);
+ inode->v.i_blocks += sectors;
}
-static inline void i_size_dirty_get(struct bch_inode_info *inode)
+static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, int sectors)
{
- lockdep_assert_held(&inode->v.i_rwsem);
-
- atomic_long_inc(&inode->ei_size_dirty_count);
+ mutex_lock(&inode->ei_update_lock);
+ __i_sectors_acct(c, inode, sectors);
+ mutex_unlock(&inode->ei_update_lock);
}
/* i_sectors accounting: */
@@ -172,90 +154,83 @@ i_sectors_hook_fn(struct extent_insert_hook *hook,
int sign = bkey_extent_is_allocation(&insert->k) -
(k.k && bkey_extent_is_allocation(k.k));
- EBUG_ON(!(h->inode->ei_flags & BCH_INODE_I_SECTORS_DIRTY));
- EBUG_ON(!atomic_long_read(&h->inode->ei_sectors_dirty_count));
+ EBUG_ON(!(h->inode->ei_inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY));
h->sectors += sectors * sign;
return BTREE_INSERT_OK;
}
-static int inode_set_i_sectors_dirty(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi, void *p)
-{
- BUG_ON(bi->bi_flags & BCH_INODE_I_SECTORS_DIRTY);
-
- bi->bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
- return 0;
-}
-
-static int inode_clear_i_sectors_dirty(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
+static int i_sectors_dirty_finish_fn(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
{
- BUG_ON(!(bi->bi_flags & BCH_INODE_I_SECTORS_DIRTY));
+ struct i_sectors_hook *h = p;
- bi->bi_sectors = atomic64_read(&inode->ei_sectors);
- bi->bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
+ if (h->new_i_size != U64_MAX &&
+ (!h->appending ||
+ h->new_i_size > bi->bi_size))
+ bi->bi_size = h->new_i_size;
+ bi->bi_sectors += h->sectors;
+ bi->bi_flags &= ~h->flags;
return 0;
}
-static void i_sectors_dirty_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct i_sectors_hook *h)
+static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h)
{
- if (h->sectors) {
- spin_lock(&inode->v.i_lock);
- inode->v.i_blocks += h->sectors;
- spin_unlock(&inode->v.i_lock);
+ int ret;
- atomic64_add(h->sectors, &inode->ei_sectors);
- EBUG_ON(atomic64_read(&inode->ei_sectors) < 0);
- }
+ mutex_lock(&h->inode->ei_update_lock);
+ if (h->new_i_size != U64_MAX)
+ i_size_write(&h->inode->v, h->new_i_size);
- EBUG_ON(atomic_long_read(&inode->ei_sectors_dirty_count) <= 0);
+ __i_sectors_acct(c, h->inode, h->sectors);
- mutex_lock(&inode->ei_update_lock);
+ ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h);
+ mutex_unlock(&h->inode->ei_update_lock);
- if (atomic_long_dec_and_test(&inode->ei_sectors_dirty_count)) {
- int ret = __bch2_write_inode(c, inode,
- inode_clear_i_sectors_dirty, NULL);
+ h->sectors = 0;
- ret = ret;
- }
-
- mutex_unlock(&inode->ei_update_lock);
+ return ret;
}
-static int __must_check i_sectors_dirty_get(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct i_sectors_hook *h)
+static int i_sectors_dirty_start_fn(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi, void *p)
{
- int ret = 0;
+ struct i_sectors_hook *h = p;
- h->hook.fn = i_sectors_hook_fn;
- h->sectors = 0;
-#ifdef CONFIG_BCACHEFS_DEBUG
- h->inode = inode;
-#endif
+ if (h->flags & BCH_INODE_I_SIZE_DIRTY)
+ bi->bi_size = h->new_i_size;
- if (atomic_long_inc_not_zero(&inode->ei_sectors_dirty_count))
- return 0;
-
- mutex_lock(&inode->ei_update_lock);
-
- if (!(inode->ei_flags & BCH_INODE_I_SECTORS_DIRTY))
- ret = __bch2_write_inode(c, inode, inode_set_i_sectors_dirty,
- NULL);
+ bi->bi_flags |= h->flags;
+ return 0;
+}
- if (!ret)
- atomic_long_inc(&inode->ei_sectors_dirty_count);
+static int i_sectors_dirty_start(struct bch_fs *c, struct i_sectors_hook *h)
+{
+ int ret;
- mutex_unlock(&inode->ei_update_lock);
+ mutex_lock(&h->inode->ei_update_lock);
+ ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h);
+ mutex_unlock(&h->inode->ei_update_lock);
return ret;
}
+static inline struct i_sectors_hook
+i_sectors_hook_init(struct bch_inode_info *inode, unsigned flags)
+{
+ return (struct i_sectors_hook) {
+ .hook.fn = i_sectors_hook_fn,
+ .inode = inode,
+ .sectors = 0,
+ .new_i_size = U64_MAX,
+ .flags = flags|BCH_INODE_I_SECTORS_DIRTY,
+ };
+}
+
+/* normal i_size/i_sectors update machinery: */
+
struct bchfs_extent_trans_hook {
struct bchfs_write_op *op;
struct extent_insert_hook hook;
@@ -289,18 +264,18 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE));
/* XXX: inode->i_size locking */
- if (offset > inode->ei_size) {
- BUG_ON(inode->ei_flags & BCH_INODE_I_SIZE_DIRTY);
-
+ if (offset > inode->ei_inode.bi_size) {
if (!h->need_inode_update) {
h->need_inode_update = true;
return BTREE_INSERT_NEED_TRAVERSE;
}
+ BUG_ON(h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY);
+
h->inode_u.bi_size = offset;
do_pack = true;
- inode->ei_size = offset;
+ inode->ei_inode.bi_size = offset;
if (h->op->is_dio)
i_size_write(&inode->v, offset);
@@ -315,15 +290,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
h->inode_u.bi_sectors += sectors;
do_pack = true;
- atomic64_add(sectors, &inode->ei_sectors);
-
h->op->sectors_added += sectors;
-
- if (h->op->is_dio) {
- spin_lock(&inode->v.i_lock);
- inode->v.i_blocks += sectors;
- spin_unlock(&inode->v.i_lock);
- }
}
if (do_pack)
@@ -340,6 +307,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
struct btree_iter extent_iter, inode_iter;
struct bchfs_extent_trans_hook hook;
struct bkey_i *k = bch2_keylist_front(keys);
+ s64 orig_sectors_added = op->sectors_added;
int ret;
BUG_ON(k->k.p.inode != op->inode->v.i_ino);
@@ -362,7 +330,8 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
/* XXX: inode->i_size locking */
k = bch2_keylist_front(keys);
- if (min(k->k.p.offset << 9, op->new_i_size) > op->inode->ei_size)
+ if (min(k->k.p.offset << 9, op->new_i_size) >
+ op->inode->ei_inode.bi_size)
hook.need_inode_update = true;
if (hook.need_inode_update) {
@@ -430,9 +399,41 @@ err:
bch2_btree_iter_unlock(&extent_iter);
bch2_btree_iter_unlock(&inode_iter);
+ if (op->is_dio)
+ i_sectors_acct(wop->c, op->inode,
+ op->sectors_added - orig_sectors_added);
+
return ret;
}
+static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
+ struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch_io_opts opts,
+ bool is_dio)
+{
+ op->inode = inode;
+ op->sectors_added = 0;
+ op->is_dio = is_dio;
+ op->unalloc = false;
+ op->new_i_size = U64_MAX;
+
+ bch2_write_op_init(&op->op, c);
+ op->op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
+ op->op.compression_type = bch2_compression_opt_to_type(opts.compression);
+ op->op.devs = c->fastest_devs;
+ op->op.index_update_fn = bchfs_write_index_update;
+ op_journal_seq_set(&op->op, &inode->ei_journal_seq);
+}
+
+static inline struct bch_io_opts io_opts(struct bch_fs *c, struct bch_inode_info *inode)
+{
+ struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
+
+ bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode->ei_inode));
+ return opts;
+}
+
/* page state: */
/* stored in page->private: */
@@ -551,11 +552,8 @@ static void bch2_clear_page_bits(struct page *page)
s = xchg(page_state(page), (struct bch_page_state) { .v = 0 });
ClearPagePrivate(page);
- if (s.dirty_sectors) {
- spin_lock(&inode->v.i_lock);
- inode->v.i_blocks -= s.dirty_sectors;
- spin_unlock(&inode->v.i_lock);
- }
+ if (s.dirty_sectors)
+ i_sectors_acct(c, inode, -s.dirty_sectors);
if (s.reserved)
bch2_disk_reservation_put(c, &res);
@@ -563,19 +561,16 @@ static void bch2_clear_page_bits(struct page *page)
int bch2_set_page_dirty(struct page *page)
{
+ struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_page_state old, new;
old = page_state_cmpxchg(page_state(page), new,
new.dirty_sectors = PAGE_SECTORS - new.sectors;
);
- if (old.dirty_sectors != new.dirty_sectors) {
- struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
-
- spin_lock(&inode->v.i_lock);
- inode->v.i_blocks += new.dirty_sectors - old.dirty_sectors;
- spin_unlock(&inode->v.i_lock);
- }
+ if (old.dirty_sectors != new.dirty_sectors)
+ i_sectors_acct(c, inode, new.dirty_sectors - old.dirty_sectors);
return __set_page_dirty_nobuffers(page);
}
@@ -624,7 +619,7 @@ static void bch2_readpages_end_io(struct bio *bio)
bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
- if (!bio->bi_error) {
+ if (!bio->bi_status) {
SetPageUptodate(page);
} else {
ClearPageUptodate(page);
@@ -846,6 +841,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
{
struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_io_opts opts = io_opts(c, inode);
struct btree_iter iter;
struct page *page;
struct readpages_iter readpages_iter = {
@@ -868,7 +864,8 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
c->sb.encoded_extent_max >> PAGE_SECTOR_SHIFT);
struct bch_read_bio *rbio =
- to_rbio(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read));
+ rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read),
+ opts);
rbio->bio.bi_end_io = bch2_readpages_end_io;
bio_add_page_contig(&rbio->bio, page);
@@ -914,9 +911,10 @@ int bch2_readpage(struct file *file, struct page *page)
{
struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_io_opts opts = io_opts(c, inode);
struct bch_read_bio *rbio;
- rbio = to_rbio(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read));
+ rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts);
rbio->bio.bi_end_io = bch2_readpages_end_io;
__bchfs_readpage(c, rbio, inode->v.i_ino, page);
@@ -925,8 +923,15 @@ int bch2_readpage(struct file *file, struct page *page)
struct bch_writepage_state {
struct bch_writepage_io *io;
+ struct bch_io_opts opts;
};
+static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
+ struct bch_inode_info *inode)
+{
+ return (struct bch_writepage_state) { .opts = io_opts(c, inode) };
+}
+
static void bch2_writepage_io_free(struct closure *cl)
{
struct bch_writepage_io *io = container_of(cl,
@@ -982,13 +987,8 @@ static void bch2_writepage_io_done(struct closure *cl)
* PageWriteback is effectively our ref on the inode - fixup i_blocks
* before calling end_page_writeback:
*/
- if (io->op.sectors_added) {
- struct bch_inode_info *inode = io->op.inode;
-
- spin_lock(&inode->v.i_lock);
- inode->v.i_blocks += io->op.sectors_added;
- spin_unlock(&inode->v.i_lock);
- }
+ if (io->op.sectors_added)
+ i_sectors_acct(c, io->op.inode, io->op.sectors_added);
bio_for_each_segment_all(bvec, bio, i)
end_page_writeback(bvec->bv_page);
@@ -1004,8 +1004,6 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w)
w->io = NULL;
atomic_add(bio->bi_vcnt, &io->op.op.c->writeback_pages);
- io->op.op.pos.offset = bio->bi_iter.bi_sector;
-
closure_call(&io->op.op.cl, bch2_write, NULL, &io->cl);
continue_at(&io->cl, bch2_writepage_io_done, NULL);
}
@@ -1017,46 +1015,26 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w)
static void bch2_writepage_io_alloc(struct bch_fs *c,
struct bch_writepage_state *w,
struct bch_inode_info *inode,
- struct page *page)
-{
- u64 inum = inode->v.i_ino;
- unsigned nr_replicas = page_state(page)->nr_replicas;
-
- EBUG_ON(!nr_replicas);
- /* XXX: disk_reservation->gen isn't plumbed through */
-
- if (!w->io) {
-alloc_io:
- w->io = container_of(bio_alloc_bioset(GFP_NOFS,
- BIO_MAX_PAGES,
- &c->writepage_bioset),
- struct bch_writepage_io, op.op.wbio.bio);
-
- closure_init(&w->io->cl, NULL);
- bch2_fswrite_op_init(&w->io->op, inode, false);
- bch2_write_op_init(&w->io->op.op, c,
- (struct disk_reservation) {
- .nr_replicas = c->opts.data_replicas,
- },
- c->fastest_devs,
- writepoint_hashed(inode->ei_last_dirtied),
- POS(inum, 0),
- &inode->ei_journal_seq,
- 0);
- w->io->op.op.index_update_fn = bchfs_write_index_update;
- }
+ struct page *page,
+ struct bch_page_state s)
+{
+ struct bch_write_op *op;
+ u64 offset = (u64) page->index << PAGE_SECTOR_SHIFT;
- if (w->io->op.op.res.nr_replicas != nr_replicas ||
- bio_add_page_contig(&w->io->op.op.wbio.bio, page)) {
- bch2_writepage_do_io(w);
- goto alloc_io;
- }
+ w->io = container_of(bio_alloc_bioset(GFP_NOFS,
+ BIO_MAX_PAGES,
+ &c->writepage_bioset),
+ struct bch_writepage_io, op.op.wbio.bio);
+ op = &w->io->op.op;
- /*
- * We shouldn't ever be handed pages for multiple inodes in a single
- * pass - right?
- */
- BUG_ON(inode != w->io->op.inode);
+ closure_init(&w->io->cl, NULL);
+
+ bch2_fswrite_op_init(&w->io->op, c, inode, w->opts, false);
+ op->nr_replicas = s.nr_replicas;
+ op->res.nr_replicas = s.nr_replicas;
+ op->write_point = writepoint_hashed(inode->ei_last_dirtied);
+ op->pos = POS(inode->v.i_ino, offset);
+ op->wbio.bio.bi_iter.bi_sector = offset;
}
static int __bch2_writepage(struct bch_fs *c, struct page *page,
@@ -1091,32 +1069,39 @@ static int __bch2_writepage(struct bch_fs *c, struct page *page,
*/
zero_user_segment(page, offset, PAGE_SIZE);
do_io:
- bch2_writepage_io_alloc(c, w, inode, page);
-
- /* while page is locked: */
- w->io->op.new_i_size = i_size;
-
- if (wbc->sync_mode == WB_SYNC_ALL)
- w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
-
/* Before unlocking the page, transfer reservation to w->io: */
old = page_state_cmpxchg(page_state(page), new, {
EBUG_ON(!new.reserved &&
(new.sectors != PAGE_SECTORS ||
!new.allocated));
- if (new.allocated &&
- w->io->op.op.compression_type != BCH_COMPRESSION_NONE)
+ if (new.allocated && w->opts.compression)
new.allocated = 0;
else if (!new.reserved)
- goto out;
+ break;
new.reserved = 0;
});
- w->io->op.op.res.sectors += PAGE_SECTORS *
- (old.reserved - new.reserved) *
- old.nr_replicas;
-out:
+ if (w->io &&
+ (w->io->op.op.res.nr_replicas != old.nr_replicas ||
+ !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page)))
+ bch2_writepage_do_io(w);
+
+ if (!w->io)
+ bch2_writepage_io_alloc(c, w, inode, page, old);
+
+ BUG_ON(inode != w->io->op.inode);
+ BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page));
+
+ if (old.reserved)
+ w->io->op.op.res.sectors += old.nr_replicas * PAGE_SECTORS;
+
+ /* while page is locked: */
+ w->io->op.new_i_size = i_size;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
+
BUG_ON(PageWriteback(page));
set_page_writeback(page);
unlock_page(page);
@@ -1127,7 +1112,8 @@ out:
int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct bch_fs *c = mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state w = { NULL };
+ struct bch_writepage_state w =
+ bch_writepage_state_init(c, to_bch_ei(mapping->host));
struct pagecache_iter iter;
struct page *page;
int ret = 0;
@@ -1275,7 +1261,8 @@ continue_unlock:
int bch2_writepage(struct page *page, struct writeback_control *wbc)
{
struct bch_fs *c = page->mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state w = { NULL };
+ struct bch_writepage_state w =
+ bch_writepage_state_init(c, to_bch_ei(page->mapping->host));
int ret;
ret = __bch2_writepage(c, page, wbc, &w);
@@ -1306,7 +1293,7 @@ static int bch2_read_single_page(struct page *page,
__bchfs_readpage(c, rbio, inode->v.i_ino, page);
wait_for_completion(&done);
- ret = rbio->bio.bi_error;
+ ret = blk_status_to_errno(rbio->bio.bi_status);
bio_put(&rbio->bio);
if (ret < 0)
@@ -1440,8 +1427,8 @@ static void bch2_direct_IO_read_endio(struct bio *bio)
{
struct dio_read *dio = bio->bi_private;
- if (bio->bi_error)
- dio->ret = bio->bi_error;
+ if (bio->bi_status)
+ dio->ret = blk_status_to_errno(bio->bi_status);
closure_put(&dio->cl);
}
@@ -1456,6 +1443,7 @@ static int bch2_direct_IO_read(struct bch_fs *c, struct kiocb *req,
struct file *file, struct bch_inode_info *inode,
struct iov_iter *iter, loff_t offset)
{
+ struct bch_io_opts opts = io_opts(c, inode);
struct dio_read *dio;
struct bio *bio;
bool sync = is_sync_kiocb(req);
@@ -1512,7 +1500,7 @@ start:
ret = bio_iov_iter_get_pages(bio, iter);
if (ret < 0) {
/* XXX: fault inject this path */
- bio->bi_error = ret;
+ bio->bi_status = BLK_STS_RESOURCE;
bio_endio(bio);
break;
}
@@ -1523,7 +1511,7 @@ start:
if (iter->count)
closure_get(&dio->cl);
- bch2_read(c, to_rbio(bio), inode->v.i_ino);
+ bch2_read(c, rbio_init(bio, opts), inode->v.i_ino);
}
if (sync) {
@@ -1542,9 +1530,9 @@ static long __bch2_dio_write_complete(struct dio_write *dio)
struct file *file = dio->req->ki_filp;
struct address_space *mapping = file->f_mapping;
struct bch_inode_info *inode = file_bch_inode(file);
- long ret = dio->error ?: dio->written;
+ long ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9);
- bch2_disk_reservation_put(dio->c, &dio->res);
+ bch2_disk_reservation_put(dio->c, &dio->iop.op.res);
__pagecache_block_put(&mapping->add_lock);
inode_dio_end(&inode->v);
@@ -1569,11 +1557,6 @@ static void bch2_dio_write_done(struct dio_write *dio)
struct bio_vec *bv;
int i;
- dio->written += dio->iop.op.written << 9;
-
- if (dio->iop.op.error)
- dio->error = dio->iop.op.error;
-
bio_for_each_segment_all(bv, &dio->iop.op.wbio.bio, i)
put_page(bv->bv_page);
@@ -1586,38 +1569,15 @@ static void bch2_do_direct_IO_write(struct dio_write *dio)
struct file *file = dio->req->ki_filp;
struct bch_inode_info *inode = file_bch_inode(file);
struct bio *bio = &dio->iop.op.wbio.bio;
- unsigned flags = 0;
int ret;
- if ((dio->req->ki_flags & IOCB_DSYNC) &&
- !dio->c->opts.journal_flush_disabled)
- flags |= BCH_WRITE_FLUSH;
-
ret = bio_iov_iter_get_pages(bio, &dio->iter);
if (ret < 0) {
- /*
- * these didn't get initialized, but bch2_dio_write_done() will
- * look at them:
- */
- dio->iop.op.error = 0;
- dio->iop.op.written = 0;
- dio->error = ret;
+ dio->iop.op.error = ret;
return;
}
- dio->iop.sectors_added = 0;
- bch2_write_op_init(&dio->iop.op, dio->c, dio->res,
- dio->c->fastest_devs,
- writepoint_hashed((unsigned long) dio->task),
- POS(inode->v.i_ino, (dio->offset + dio->written) >> 9),
- &inode->ei_journal_seq,
- flags);
- dio->iop.op.index_update_fn = bchfs_write_index_update;
-
- if (!dio->iop.unalloc) {
- dio->res.sectors -= bio_sectors(bio);
- dio->iop.op.res.sectors = bio_sectors(bio);
- }
+ dio->iop.op.pos = POS(inode->v.i_ino, (dio->offset >> 9) + dio->iop.op.written);
task_io_account_write(bio->bi_iter.bi_size);
@@ -1632,7 +1592,7 @@ static void bch2_dio_write_loop_async(struct closure *cl)
bch2_dio_write_done(dio);
- if (dio->iter.count && !dio->error) {
+ if (dio->iter.count && !dio->iop.op.error) {
use_mm(dio->task->mm);
pagecache_block_get(&mapping->add_lock);
@@ -1652,31 +1612,6 @@ static void bch2_dio_write_loop_async(struct closure *cl)
}
}
-static int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos,
- u64 size)
-{
- struct btree_iter iter;
- struct bpos end = pos;
- struct bkey_s_c k;
- int ret = 0;
-
- end.offset += size;
-
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
- BTREE_ITER_WITH_HOLES, k) {
- if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
- break;
-
- if (!bch2_extent_is_fully_allocated(k)) {
- ret = -ENOSPC;
- break;
- }
- }
- bch2_btree_iter_unlock(&iter);
-
- return ret;
-}
-
static int bch2_direct_IO_write(struct bch_fs *c,
struct kiocb *req, struct file *file,
struct bch_inode_info *inode,
@@ -1703,13 +1638,17 @@ static int bch2_direct_IO_write(struct bch_fs *c,
closure_init(&dio->cl, NULL);
dio->req = req;
dio->c = c;
- dio->written = 0;
- dio->error = 0;
dio->offset = offset;
dio->iovec = NULL;
dio->iter = *iter;
dio->task = current;
- bch2_fswrite_op_init(&dio->iop, inode, true);
+ bch2_fswrite_op_init(&dio->iop, c, inode, io_opts(c, inode), true);
+ dio->iop.op.write_point = writepoint_hashed((unsigned long) dio->task);
+ dio->iop.op.flags |= BCH_WRITE_NOPUT_RESERVATION;
+
+ if ((dio->req->ki_flags & IOCB_DSYNC) &&
+ !c->opts.journal_flush_disabled)
+ dio->iop.op.flags |= BCH_WRITE_FLUSH;
if (offset + iter->count > inode->v.i_size)
sync = true;
@@ -1722,7 +1661,7 @@ static int bch2_direct_IO_write(struct bch_fs *c,
* Have to then guard against racing with truncate (deleting data that
* we would have been overwriting)
*/
- ret = bch2_disk_reservation_get(c, &dio->res, iter->count >> 9, 0);
+ ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9, 0);
if (unlikely(ret)) {
if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
offset >> 9),
@@ -1735,6 +1674,8 @@ static int bch2_direct_IO_write(struct bch_fs *c,
dio->iop.unalloc = true;
}
+ dio->iop.op.nr_replicas = dio->iop.op.res.nr_replicas;
+
inode_dio_begin(&inode->v);
__pagecache_block_get(&mapping->add_lock);
@@ -1744,20 +1685,20 @@ static int bch2_direct_IO_write(struct bch_fs *c,
closure_sync(&dio->cl);
bch2_dio_write_done(dio);
- } while (dio->iter.count && !dio->error);
+ } while (dio->iter.count && !dio->iop.op.error);
closure_debug_destroy(&dio->cl);
return __bch2_dio_write_complete(dio);
} else {
bch2_do_direct_IO_write(dio);
- if (dio->iter.count && !dio->error) {
+ if (dio->iter.count && !dio->iop.op.error) {
if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
dio->iovec = kmalloc(dio->iter.nr_segs *
sizeof(struct iovec),
GFP_KERNEL);
if (!dio->iovec)
- dio->error = -ENOMEM;
+ dio->iop.op.error = -ENOMEM;
} else {
dio->iovec = dio->inline_vecs;
}
@@ -1965,11 +1906,11 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
return bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq);
}
-static int __bch2_truncate_page(struct address_space *mapping,
+static int __bch2_truncate_page(struct bch_inode_info *inode,
pgoff_t index, loff_t start, loff_t end)
{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct address_space *mapping = inode->v.i_mapping;
unsigned start_offset = start & (PAGE_SIZE - 1);
unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
struct page *page;
@@ -2049,10 +1990,10 @@ out:
return ret;
}
-static int bch2_truncate_page(struct address_space *mapping, loff_t from)
+static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
{
- return __bch2_truncate_page(mapping, from >> PAGE_SHIFT,
- from, from + PAGE_SIZE);
+ return __bch2_truncate_page(inode, from >> PAGE_SHIFT,
+ from, from + PAGE_SIZE);
}
int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
@@ -2060,6 +2001,8 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
bool shrink = iattr->ia_size <= inode->v.i_size;
+ struct i_sectors_hook i_sectors_hook =
+ i_sectors_hook_init(inode, BCH_INODE_I_SIZE_DIRTY);
int ret = 0;
inode_dio_wait(&inode->v);
@@ -2069,17 +2012,15 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
/* sync appends.. */
/* XXX what protects inode->i_size? */
- if (iattr->ia_size > inode->ei_size)
+ if (iattr->ia_size > inode->ei_inode.bi_size)
ret = filemap_write_and_wait_range(mapping,
- inode->ei_size, S64_MAX);
+ inode->ei_inode.bi_size, S64_MAX);
if (ret)
goto err_put_pagecache;
- mutex_lock(&inode->ei_update_lock);
- i_size_dirty_get(inode);
- ret = bch2_write_inode_size(c, inode, inode->v.i_size);
- mutex_unlock(&inode->ei_update_lock);
+ i_sectors_hook.new_i_size = iattr->ia_size;
+ ret = i_sectors_dirty_start(c, &i_sectors_hook);
if (unlikely(ret))
goto err;
@@ -2090,45 +2031,32 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
* here (new i_size < current i_size):
*/
if (shrink) {
- struct i_sectors_hook i_sectors_hook;
- int ret;
-
- ret = i_sectors_dirty_get(c, inode, &i_sectors_hook);
+ ret = bch2_truncate_page(inode, iattr->ia_size);
if (unlikely(ret))
goto err;
- ret = bch2_truncate_page(inode->v.i_mapping, iattr->ia_size);
- if (unlikely(ret)) {
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
- goto err;
- }
-
ret = bch2_inode_truncate(c, inode->v.i_ino,
- round_up(iattr->ia_size, PAGE_SIZE) >> 9,
- &i_sectors_hook.hook,
- &inode->ei_journal_seq);
-
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
-
+ round_up(iattr->ia_size, PAGE_SIZE) >> 9,
+ &i_sectors_hook.hook,
+ &inode->ei_journal_seq);
if (unlikely(ret))
goto err;
}
- mutex_lock(&inode->ei_update_lock);
setattr_copy(&inode->v, iattr);
- inode->v.i_mtime = inode->v.i_ctime = current_fs_time(inode->v.i_sb);
-out:
- /* clear I_SIZE_DIRTY: */
- i_size_dirty_put(inode);
- ret = bch2_write_inode_size(c, inode, inode->v.i_size);
- mutex_unlock(&inode->ei_update_lock);
+ inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v);
+err:
+ /*
+ * On error - in particular, bch2_truncate_page() error - don't clear
+ * I_SIZE_DIRTY, as we've left data above i_size!:
+ */
+ if (ret)
+ i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY;
+ ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
err_put_pagecache:
pagecache_block_put(&mapping->add_lock);
return ret;
-err:
- mutex_lock(&inode->ei_update_lock);
- goto out;
}
static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
@@ -2144,33 +2072,41 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
inode_dio_wait(&inode->v);
pagecache_block_get(&mapping->add_lock);
- ret = __bch2_truncate_page(mapping,
+ ret = __bch2_truncate_page(inode,
offset >> PAGE_SHIFT,
offset, offset + len);
if (unlikely(ret))
- goto out;
+ goto err;
if (offset >> PAGE_SHIFT !=
(offset + len) >> PAGE_SHIFT) {
- ret = __bch2_truncate_page(mapping,
+ ret = __bch2_truncate_page(inode,
(offset + len) >> PAGE_SHIFT,
offset, offset + len);
if (unlikely(ret))
- goto out;
+ goto err;
}
truncate_pagecache_range(&inode->v, offset, offset + len - 1);
if (discard_start < discard_end) {
struct disk_reservation disk_res;
- struct i_sectors_hook i_sectors_hook;
+ struct i_sectors_hook i_sectors_hook =
+ i_sectors_hook_init(inode, 0);
int ret;
- BUG_ON(bch2_disk_reservation_get(c, &disk_res, 0, 0));
-
- ret = i_sectors_dirty_get(c, inode, &i_sectors_hook);
+ ret = i_sectors_dirty_start(c, &i_sectors_hook);
if (unlikely(ret))
- goto out;
+ goto err;
+
+ /*
+ * We need to pass in a disk reservation here because we might
+ * be splitting a compressed extent into two. This isn't a
+ * problem with truncate because truncate will never split an
+ * extent, only truncate it...
+ */
+ ret = bch2_disk_reservation_get(c, &disk_res, 0, 0);
+ BUG_ON(ret);
ret = bch2_btree_delete_range(c,
BTREE_ID_EXTENTS,
@@ -2180,11 +2116,11 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
&disk_res,
&i_sectors_hook.hook,
&inode->ei_journal_seq);
-
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
bch2_disk_reservation_put(c, &disk_res);
+
+ ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
}
-out:
+err:
pagecache_block_put(&mapping->add_lock);
inode_unlock(&inode->v);
@@ -2200,7 +2136,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
struct btree_iter dst;
BKEY_PADDED(k) copy;
struct bkey_s_c k;
- struct i_sectors_hook i_sectors_hook;
+ struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0);
loff_t new_size;
int ret;
@@ -2237,7 +2173,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
if (ret)
goto err;
- ret = i_sectors_dirty_get(c, inode, &i_sectors_hook);
+ ret = i_sectors_dirty_start(c, &i_sectors_hook);
if (ret)
goto err;
@@ -2278,8 +2214,14 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
BTREE_INSERT_ENTRY(&dst, &copy.k));
bch2_disk_reservation_put(c, &disk_res);
btree_iter_err:
- if (ret < 0 && ret != -EINTR)
- goto err_unwind;
+ if (ret == -EINTR)
+ ret = 0;
+ if (ret)
+ goto err_put_sectors_dirty;
+ /*
+ * XXX: if we error here we've left data with multiple
+ * pointers... which isn't a _super_ serious problem...
+ */
bch2_btree_iter_cond_resched(&src);
}
@@ -2292,30 +2234,18 @@ btree_iter_err:
&i_sectors_hook.hook,
&inode->ei_journal_seq);
if (ret)
- goto err_unwind;
-
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
+ goto err_put_sectors_dirty;
- mutex_lock(&inode->ei_update_lock);
i_size_write(&inode->v, new_size);
- ret = bch2_write_inode_size(c, inode, inode->v.i_size);
- mutex_unlock(&inode->ei_update_lock);
-
+ i_sectors_hook.new_i_size = new_size;
+err_put_sectors_dirty:
+ ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
+err:
pagecache_block_put(&mapping->add_lock);
inode_unlock(&inode->v);
- return ret;
-err_unwind:
- /*
- * XXX: we've left data with multiple pointers... which isn't a _super_
- * serious problem...
- */
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
-err:
bch2_btree_iter_unlock(&src);
bch2_btree_iter_unlock(&dst);
- pagecache_block_put(&mapping->add_lock);
- inode_unlock(&inode->v);
return ret;
}
@@ -2324,11 +2254,11 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
{
struct address_space *mapping = inode->v.i_mapping;
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct i_sectors_hook i_sectors_hook;
+ struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0);
struct btree_iter iter;
- struct bpos end;
+ struct bpos end_pos;
loff_t block_start, block_end;
- loff_t new_size = offset + len;
+ loff_t end = offset + len;
unsigned sectors;
unsigned replicas = READ_ONCE(c->opts.data_replicas);
int ret;
@@ -2340,45 +2270,43 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
inode_dio_wait(&inode->v);
pagecache_block_get(&mapping->add_lock);
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- new_size > inode->v.i_size) {
- ret = inode_newsize_ok(&inode->v, new_size);
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
+ ret = inode_newsize_ok(&inode->v, end);
if (ret)
goto err;
}
if (mode & FALLOC_FL_ZERO_RANGE) {
- ret = __bch2_truncate_page(mapping,
+ ret = __bch2_truncate_page(inode,
offset >> PAGE_SHIFT,
- offset, offset + len);
+ offset, end);
if (!ret &&
- offset >> PAGE_SHIFT !=
- (offset + len) >> PAGE_SHIFT)
- ret = __bch2_truncate_page(mapping,
- (offset + len) >> PAGE_SHIFT,
- offset, offset + len);
+ offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
+ ret = __bch2_truncate_page(inode,
+ end >> PAGE_SHIFT,
+ offset, end);
if (unlikely(ret))
goto err;
- truncate_pagecache_range(&inode->v, offset, offset + len - 1);
+ truncate_pagecache_range(&inode->v, offset, end - 1);
block_start = round_up(offset, PAGE_SIZE);
- block_end = round_down(offset + len, PAGE_SIZE);
+ block_end = round_down(end, PAGE_SIZE);
} else {
block_start = round_down(offset, PAGE_SIZE);
- block_end = round_up(offset + len, PAGE_SIZE);
+ block_end = round_up(end, PAGE_SIZE);
}
bch2_btree_iter_set_pos(&iter, POS(inode->v.i_ino, block_start >> 9));
- end = POS(inode->v.i_ino, block_end >> 9);
+ end_pos = POS(inode->v.i_ino, block_end >> 9);
- ret = i_sectors_dirty_get(c, inode, &i_sectors_hook);
+ ret = i_sectors_dirty_start(c, &i_sectors_hook);
if (unlikely(ret))
goto err;
- while (bkey_cmp(iter.pos, end) < 0) {
+ while (bkey_cmp(iter.pos, end_pos) < 0) {
struct disk_reservation disk_res = { 0 };
struct bkey_i_reservation reservation;
struct bkey_s_c k;
@@ -2407,7 +2335,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
reservation.k.size = k.k->size;
bch2_cut_front(iter.pos, &reservation.k_i);
- bch2_cut_back(end, &reservation.k);
+ bch2_cut_back(end_pos, &reservation.k);
sectors = reservation.k.size;
reservation.v.nr_replicas = bch2_extent_nr_dirty_ptrs(k);
@@ -2435,11 +2363,11 @@ btree_iter_err:
}
bch2_btree_iter_unlock(&iter);
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
+ ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- new_size > inode->v.i_size) {
- i_size_write(&inode->v, new_size);
+ end > inode->v.i_size) {
+ i_size_write(&inode->v, end);
mutex_lock(&inode->ei_update_lock);
ret = bch2_write_inode_size(c, inode, inode->v.i_size);
@@ -2449,14 +2377,14 @@ btree_iter_err:
/* blech */
if ((mode & FALLOC_FL_KEEP_SIZE) &&
(mode & FALLOC_FL_ZERO_RANGE) &&
- inode->ei_size != inode->v.i_size) {
+ inode->ei_inode.bi_size != inode->v.i_size) {
/* sync appends.. */
ret = filemap_write_and_wait_range(mapping,
- inode->ei_size, S64_MAX);
+ inode->ei_inode.bi_size, S64_MAX);
if (ret)
goto err;
- if (inode->ei_size != inode->v.i_size) {
+ if (inode->ei_inode.bi_size != inode->v.i_size) {
mutex_lock(&inode->ei_update_lock);
ret = bch2_write_inode_size(c, inode, inode->v.i_size);
mutex_unlock(&inode->ei_update_lock);
@@ -2468,7 +2396,7 @@ btree_iter_err:
return 0;
err_put_sectors_dirty:
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
+ ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
err:
bch2_btree_iter_unlock(&iter);
pagecache_block_put(&mapping->add_lock);
@@ -2669,11 +2597,14 @@ void bch2_fs_fsio_exit(struct bch_fs *c)
int bch2_fs_fsio_init(struct bch_fs *c)
{
if (bioset_init(&c->writepage_bioset,
- 4, offsetof(struct bch_writepage_io, op.op.wbio.bio)) ||
+ 4, offsetof(struct bch_writepage_io, op.op.wbio.bio),
+ BIOSET_NEED_BVECS) ||
bioset_init(&c->dio_read_bioset,
- 4, offsetof(struct dio_read, rbio.bio)) ||
+ 4, offsetof(struct dio_read, rbio.bio),
+ BIOSET_NEED_BVECS) ||
bioset_init(&c->dio_write_bioset,
- 4, offsetof(struct dio_write, iop.op.wbio.bio)))
+ 4, offsetof(struct dio_write, iop.op.wbio.bio),
+ BIOSET_NEED_BVECS))
return -ENOMEM;
return 0;
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index bd915fec..24228c8e 100644
--- a/libbcachefs/fs-ioctl.c
+++ b/libbcachefs/fs-ioctl.c
@@ -75,7 +75,7 @@ do { \
/* Set VFS inode flags from bcachefs inode: */
void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
{
- set_flags(bch_flags_to_vfs, inode->ei_flags, inode->v.i_flags);
+ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
}
static int bch2_inode_flags_set(struct bch_inode_info *inode,
@@ -99,13 +99,13 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode,
return -EINVAL;
bi->bi_flags = newflags;
- inode->v.i_ctime = current_fs_time(inode->v.i_sb);
+ inode->v.i_ctime = current_time(&inode->v);
return 0;
}
static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
{
- unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_flags);
+ unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
return put_user(flags, arg);
}
@@ -153,7 +153,7 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
{
struct fsxattr fa = { 0 };
- fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_flags);
+ fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
return copy_to_user(arg, &fa, sizeof(fa));
}
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 43688cd3..cb0397f1 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -12,6 +12,7 @@
#include "fs-ioctl.h"
#include "fsck.h"
#include "inode.h"
+#include "io.h"
#include "journal.h"
#include "keylist.h"
#include "super.h"
@@ -130,10 +131,8 @@ int __must_check __bch2_write_inode(struct bch_fs *c,
BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
} while (ret == -EINTR);
- if (!ret) {
- inode->ei_size = inode_u.bi_size;
- inode->ei_flags = inode_u.bi_flags;
- }
+ if (!ret)
+ inode->ei_inode = inode_u;
out:
bch2_btree_iter_unlock(&iter);
@@ -146,7 +145,7 @@ int __must_check bch2_write_inode(struct bch_fs *c,
return __bch2_write_inode(c, inode, NULL, NULL);
}
-int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
+static int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
{
int ret;
@@ -158,7 +157,7 @@ int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
return ret;
}
-int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
+static int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
{
int ret = 0;
@@ -223,7 +222,9 @@ static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c,
bch2_inode_init(c, &inode_u,
i_uid_read(&inode->v),
i_gid_read(&inode->v),
- inode->v.i_mode, rdev);
+ inode->v.i_mode, rdev,
+ &dir->ei_inode);
+
ret = bch2_inode_create(c, &inode_u,
BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);
@@ -277,7 +278,7 @@ static int bch2_vfs_dirent_create(struct bch_fs *c,
if (unlikely(ret))
return ret;
- dir->v.i_mtime = dir->v.i_ctime = current_fs_time(c->vfs_sb);
+ dir->v.i_mtime = dir->v.i_ctime = current_time(&dir->v);
mark_inode_dirty_sync(&dir->v);
return 0;
}
@@ -344,7 +345,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
lockdep_assert_held(&inode->v.i_rwsem);
- inode->v.i_ctime = current_fs_time(dir->v.i_sb);
+ inode->v.i_ctime = current_time(&dir->v);
ret = bch2_inc_nlink(c, inode);
if (ret)
@@ -473,7 +474,7 @@ static int bch2_rename(struct bch_fs *c,
{
struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
- struct timespec now = current_fs_time(old_dir->v.i_sb);
+ struct timespec now = current_time(&old_dir->v);
int ret;
lockdep_assert_held(&old_dir->v.i_rwsem);
@@ -551,7 +552,7 @@ static int bch2_rename_exchange(struct bch_fs *c,
{
struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
- struct timespec now = current_fs_time(old_dir->v.i_sb);
+ struct timespec now = current_time(&old_dir->v);
int ret;
ret = bch2_dirent_rename(c,
@@ -909,10 +910,8 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
inode->ei_journal_seq = 0;
- inode->ei_size = bi->bi_size;
- inode->ei_flags = bi->bi_flags;
- atomic64_set(&inode->ei_sectors, bi->bi_sectors);
inode->ei_str_hash = bch2_hash_info_init(c, bi);
+ inode->ei_inode = *bi;
bch2_inode_flags_to_vfs(inode);
@@ -949,8 +948,6 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
inode_init_once(&inode->v);
mutex_init(&inode->ei_update_lock);
inode->ei_journal_seq = 0;
- atomic_long_set(&inode->ei_size_dirty_count, 0);
- atomic_long_set(&inode->ei_sectors_dirty_count, 0);
return &inode->v;
}
@@ -995,12 +992,6 @@ static void bch2_evict_inode(struct inode *vinode)
truncate_inode_pages_final(&inode->v.i_data);
- if (!bch2_journal_error(&c->journal) && !is_bad_inode(&inode->v)) {
- /* XXX - we want to check this stuff iff there weren't IO errors: */
- BUG_ON(atomic_long_read(&inode->ei_sectors_dirty_count));
- BUG_ON(atomic64_read(&inode->ei_sectors) != inode->v.i_blocks);
- }
-
clear_inode(&inode->v);
if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
@@ -1272,9 +1263,16 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
sb->s_magic = BCACHEFS_STATFS_MAGIC;
sb->s_time_gran = c->sb.time_precision;
c->vfs_sb = sb;
- sb->s_bdi = &c->bdi;
strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
+ ret = super_setup_bdi(sb);
+ if (ret)
+ goto err_put_super;
+
+ sb->s_bdi->congested_fn = bch2_congested;
+ sb->s_bdi->congested_data = c;
+ sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+
for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev;
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index d255ca7c..652105fb 100644
--- a/libbcachefs/fs.h
+++ b/libbcachefs/fs.h
@@ -1,6 +1,7 @@
#ifndef _BCACHEFS_FS_H
#define _BCACHEFS_FS_H
+#include "opts.h"
#include "str_hash.h"
#include <linux/seqlock.h>
@@ -11,22 +12,12 @@ struct bch_inode_info {
struct mutex ei_update_lock;
u64 ei_journal_seq;
-
- atomic_long_t ei_size_dirty_count;
-
- /*
- * these are updated whenever we update the inode in the btree - for
- * e.g. fsync
- */
- u64 ei_size;
- u32 ei_flags;
-
- atomic_long_t ei_sectors_dirty_count;
- atomic64_t ei_sectors;
+ unsigned long ei_last_dirtied;
struct bch_hash_info ei_str_hash;
- unsigned long ei_last_dirtied;
+ /* copy of inode in btree: */
+ struct bch_inode_unpacked ei_inode;
};
#define to_bch_ei(_inode) \
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 4760b16e..696926fe 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -204,7 +204,7 @@ static int hash_check_key(const struct bch_hash_desc desc,
"hash table key at wrong offset: %llu, "
"hashed to %llu chain starts at %llu\n%s",
k.k->p.offset, hashed, h->chain.pos.offset,
- bch2_bkey_val_to_text(c, desc.btree_id,
+ bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
buf, sizeof(buf), k))) {
ret = hash_redo_key(desc, h, c, k_iter, k, hashed);
if (ret) {
@@ -224,7 +224,7 @@ static int hash_check_key(const struct bch_hash_desc desc,
if (fsck_err_on(k2.k->type == desc.key_type &&
!desc.cmp_bkey(k, k2), c,
"duplicate hash table keys:\n%s",
- bch2_bkey_val_to_text(c, desc.btree_id,
+ bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
buf, sizeof(buf), k))) {
ret = bch2_hash_delete_at(desc, &h->info, &h->iter, NULL);
if (ret)
@@ -397,9 +397,9 @@ static int check_dirents(struct bch_fs *c)
if (fsck_err_on(have_target &&
d.v->d_type !=
- mode_to_type(le16_to_cpu(target.bi_mode)), c,
+ mode_to_type(target.bi_mode), c,
"incorrect d_type: should be %u:\n%s",
- mode_to_type(le16_to_cpu(target.bi_mode)),
+ mode_to_type(target.bi_mode),
bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
buf, sizeof(buf), k))) {
struct bkey_i_dirent *n;
@@ -411,7 +411,7 @@ static int check_dirents(struct bch_fs *c)
}
bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_type = mode_to_type(le16_to_cpu(target.bi_mode));
+ n->v.d_type = mode_to_type(target.bi_mode);
ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_NOFAIL,
@@ -493,7 +493,8 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
fsck_err:
return ret;
create_root:
- bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
+ bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
+ 0, NULL);
root_inode->bi_inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(&packed, root_inode);
@@ -545,7 +546,8 @@ create_lostfound:
if (ret)
return ret;
- bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
+ bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
+ 0, root_inode);
ret = bch2_inode_create(c, lostfound_inode, BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 05f617ae..71a24cc6 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -198,6 +198,12 @@ static const char *bch2_inode_invalid(const struct bch_fs *c,
if (bch2_inode_unpack(inode, &unpacked))
return "invalid variable length fields";
+ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
+ return "invalid data checksum type";
+
+ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
+ return "invalid compression type";
+
return NULL;
}
case BCH_INODE_BLOCKDEV:
@@ -221,6 +227,7 @@ static const char *bch2_inode_invalid(const struct bch_fs *c,
static void bch2_inode_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
+ char *out = buf, *end = out + size;
struct bkey_s_c_inode inode;
struct bch_inode_unpacked unpacked;
@@ -228,11 +235,14 @@ static void bch2_inode_to_text(struct bch_fs *c, char *buf,
case BCH_INODE_FS:
inode = bkey_s_c_to_inode(k);
if (bch2_inode_unpack(inode, &unpacked)) {
- scnprintf(buf, size, "(unpack error)");
+ out += scnprintf(out, end - out, "(unpack error)");
break;
}
- scnprintf(buf, size, "i_size %llu", unpacked.bi_size);
+#define BCH_INODE_FIELD(_name, _bits) \
+ out += scnprintf(out, end - out, #_name ": %llu ", (u64) unpacked._name);
+ BCH_INODE_FIELDS()
+#undef BCH_INODE_FIELD
break;
}
}
@@ -243,9 +253,12 @@ const struct bkey_ops bch2_bkey_inode_ops = {
};
void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
- uid_t uid, gid_t gid, umode_t mode, dev_t rdev)
+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+ struct bch_inode_unpacked *parent)
{
- s64 now = timespec_to_bch2_time(c, CURRENT_TIME);
+ s64 now = timespec_to_bch2_time(c,
+ timespec_trunc(current_kernel_time(),
+ c->sb.time_precision));
memset(inode_u, 0, sizeof(*inode_u));
@@ -261,6 +274,12 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
inode_u->bi_mtime = now;
inode_u->bi_ctime = now;
inode_u->bi_otime = now;
+
+ if (parent) {
+#define BCH_INODE_FIELD(_name) inode_u->_name = parent->_name;
+ BCH_INODE_FIELDS_INHERIT()
+#undef BCH_INODE_FIELD
+ }
}
int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
@@ -416,7 +435,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
struct bch_inode_unpacked inode_u;
if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
- bi_generation = cpu_to_le32(inode_u.bi_generation) + 1;
+ bi_generation = inode_u.bi_generation + 1;
break;
}
case BCH_INODE_GENERATION: {
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index 53c70617..8ebb6fb6 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -1,6 +1,8 @@
#ifndef _BCACHEFS_INODE_H
#define _BCACHEFS_INODE_H
+#include "opts.h"
+
#include <linux/math64.h>
extern const struct bkey_ops bch2_bkey_inode_ops;
@@ -28,7 +30,8 @@ void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *)
int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
- uid_t, gid_t, umode_t, dev_t);
+ uid_t, gid_t, umode_t, dev_t,
+ struct bch_inode_unpacked *);
int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *,
u64, u64, u64 *);
int bch2_inode_truncate(struct bch_fs *, u64, u64,
@@ -55,6 +58,45 @@ static inline u64 timespec_to_bch2_time(struct bch_fs *c, struct timespec ts)
return div_s64(ns, c->sb.time_precision);
}
+static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
+{
+ struct bch_io_opts ret = { 0 };
+
+#define BCH_INODE_OPT(_name, _bits) \
+ if (inode->bi_##_name) \
+ opt_set(ret, _name, inode->bi_##_name - 1);
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+ return ret;
+}
+
+static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode,
+ enum bch_opt_id id, u64 v)
+{
+ switch (id) {
+#define BCH_INODE_OPT(_name, ...) \
+ case Opt_##_name: \
+ inode->bi_##_name = v; \
+ break;
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+ default:
+ BUG();
+ }
+}
+
+static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
+ enum bch_opt_id id, u64 v)
+{
+ return __bch2_inode_opt_set(inode, id, v + 1);
+}
+
+static inline void bch2_inode_opt_clear(struct bch_inode_unpacked *inode,
+ enum bch_opt_id id)
+{
+ return __bch2_inode_opt_set(inode, id, 0);
+}
+
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_inode_pack_test(void);
#else
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 0c41e411..3369a2ff 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -20,6 +20,7 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
+#include "super.h"
#include "super-io.h"
#include <linux/blkdev.h>
@@ -139,7 +140,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
const struct bch_extent_ptr *ptr;
struct bch_write_bio *n;
struct bch_dev *ca;
- unsigned ptr_idx = 0;
BUG_ON(c->opts.nochanges);
@@ -147,7 +147,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
!c->devs[ptr->dev]);
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
if (ptr + 1 < &extent_entry_last(e)->ptr) {
n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
@@ -168,7 +168,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
n->c = c;
n->ca = ca;
- n->ptr_idx = ptr_idx++;
n->submit_time_us = local_clock_us();
n->bio.bi_iter.bi_sector = ptr->offset;
@@ -184,7 +183,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
submit_bio(&n->bio);
} else {
n->have_io_ref = false;
- bcache_io_error(c, &n->bio, "device has been removed");
+ n->bio.bi_status = BLK_STS_REMOVED;
bio_endio(&n->bio);
}
}
@@ -201,9 +200,12 @@ static void bch2_write_done(struct closure *cl)
if (!op->error && (op->flags & BCH_WRITE_FLUSH))
op->error = bch2_journal_error(&op->c->journal);
- bch2_disk_reservation_put(op->c, &op->res);
+ if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
+ bch2_disk_reservation_put(op->c, &op->res);
percpu_ref_put(&op->c->writes);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
+ op->flags &= ~(BCH_WRITE_DONE|BCH_WRITE_LOOPED);
+
closure_return(cl);
}
@@ -244,9 +246,37 @@ static void bch2_write_index(struct closure *cl)
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
+ struct bkey_s_extent e;
+ struct bch_extent_ptr *ptr;
+ struct bkey_i *src, *dst = keys->keys, *n;
+ int ret;
op->flags |= BCH_WRITE_LOOPED;
+ for (src = keys->keys; src != keys->top; src = n) {
+ n = bkey_next(src);
+ bkey_copy(dst, src);
+
+ e = bkey_i_to_s_extent(dst);
+ extent_for_each_ptr_backwards(e, ptr)
+ if (test_bit(ptr->dev, op->failed.d))
+ bch2_extent_drop_ptr(e, ptr);
+
+ ret = bch2_extent_nr_ptrs(e.c)
+ ? bch2_check_mark_super(c, e.c, BCH_DATA_USER)
+ : -EIO;
+ if (ret) {
+ keys->top = keys->keys;
+ op->error = ret;
+ op->flags |= BCH_WRITE_DONE;
+ goto err;
+ }
+
+ dst = bkey_next(dst);
+ }
+
+ keys->top = dst;
+
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
int ret = op->index_update_fn(op);
@@ -260,7 +290,7 @@ static void bch2_write_index(struct closure *cl)
op->error = ret;
}
}
-
+err:
bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets);
if (!(op->flags & BCH_WRITE_DONE))
@@ -276,43 +306,6 @@ static void bch2_write_index(struct closure *cl)
}
}
-static void bch2_write_io_error(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct keylist *keys = &op->insert_keys;
- struct bch_fs *c = op->c;
- struct bch_extent_ptr *ptr;
- struct bkey_i *k;
- int ret;
-
- for_each_keylist_key(keys, k) {
- struct bkey_i *n = bkey_next(k);
- struct bkey_s_extent e = bkey_i_to_s_extent(k);
-
- extent_for_each_ptr_backwards(e, ptr)
- if (test_bit(ptr->dev, op->failed.d))
- bch2_extent_drop_ptr(e, ptr);
-
- memmove(bkey_next(k), n, (void *) keys->top - (void *) n);
- keys->top_p -= (u64 *) n - (u64 *) bkey_next(k);
-
- ret = bch2_extent_nr_ptrs(e.c)
- ? bch2_check_mark_super(c, e.c, BCH_DATA_USER)
- : -EIO;
- if (ret) {
- keys->top = keys->keys;
- op->error = ret;
- op->flags |= BCH_WRITE_DONE;
- break;
- }
- }
-
- memset(&op->failed, 0, sizeof(op->failed));
-
- bch2_write_index(cl);
- return;
-}
-
static void bch2_write_endio(struct bio *bio)
{
struct closure *cl = bio->bi_private;
@@ -324,10 +317,8 @@ static void bch2_write_endio(struct bio *bio)
bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
- if (bch2_dev_io_err_on(bio->bi_error, ca, "data write")) {
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
set_bit(ca->dev_idx, op->failed.d);
- set_closure_fn(cl, bch2_write_io_error, index_update_wq(op));
- }
if (wbio->have_io_ref)
percpu_ref_put(&ca->io_ref);
@@ -706,11 +697,6 @@ do_write:
key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
- ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
- BCH_DATA_USER);
- if (ret)
- goto err;
-
dst->bi_end_io = bch2_write_endio;
dst->bi_private = &op->cl;
bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
@@ -870,7 +856,8 @@ void bch2_write(struct closure *cl)
!percpu_ref_tryget(&c->writes)) {
__bcache_io_error(c, "read only");
op->error = -EROFS;
- bch2_disk_reservation_put(c, &op->res);
+ if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
+ bch2_disk_reservation_put(c, &op->res);
closure_return(cl);
}
@@ -916,7 +903,10 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
rbio->promote = NULL;
- __bch2_write_op_init(&op->write.op, c);
+ bch2_write_op_init(&op->write.op, c);
+ op->write.op.csum_type = bch2_data_checksum_type(c, rbio->opts.data_checksum);
+ op->write.op.compression_type =
+ bch2_compression_opt_to_type(rbio->opts.compression);
op->write.move_dev = -1;
op->write.op.devs = c->fastest_devs;
@@ -1060,7 +1050,7 @@ static void bch2_rbio_retry(struct work_struct *work)
if (rbio->split)
rbio = bch2_rbio_free(rbio);
else
- rbio->bio.bi_error = 0;
+ rbio->bio.bi_status = 0;
if (!(flags & BCH_READ_NODECODE))
flags |= BCH_READ_MUST_CLONE;
@@ -1073,7 +1063,8 @@ static void bch2_rbio_retry(struct work_struct *work)
__bch2_read(c, rbio, iter, inode, &avoid, flags);
}
-static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
+ blk_status_t error)
{
rbio->retry = retry;
@@ -1081,7 +1072,7 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
return;
if (retry == READ_ERR) {
- bch2_rbio_parent(rbio)->bio.bi_error = error;
+ bch2_rbio_parent(rbio)->bio.bi_status = error;
bch2_rbio_done(rbio);
} else {
bch2_rbio_punt(rbio, bch2_rbio_retry,
@@ -1236,7 +1227,7 @@ csum_err:
*/
if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
rbio->flags |= BCH_READ_MUST_BOUNCE;
- bch2_rbio_error(rbio, READ_RETRY, -EIO);
+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
return;
}
@@ -1245,13 +1236,13 @@ csum_err:
rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
csum.hi, csum.lo, crc.csum_type);
- bch2_rbio_error(rbio, READ_RETRY_AVOID, -EIO);
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
return;
decompression_err:
__bcache_io_error(c, "decompression error, inode %llu offset %llu",
rbio->pos.inode,
(u64) rbio->bvec_iter.bi_sector);
- bch2_rbio_error(rbio, READ_ERR, -EIO);
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
return;
}
@@ -1270,8 +1261,8 @@ static void bch2_read_endio(struct bio *bio)
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
- if (bch2_dev_io_err_on(bio->bi_error, rbio->pick.ca, "data read")) {
- bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_error);
+ if (bch2_dev_io_err_on(bio->bi_status, rbio->pick.ca, "data read")) {
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
return;
}
@@ -1281,9 +1272,9 @@ static void bch2_read_endio(struct bio *bio)
atomic_long_inc(&c->read_realloc_races);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
- bch2_rbio_error(rbio, READ_RETRY, -EINTR);
+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
else
- bch2_rbio_error(rbio, READ_ERR, -EINTR);
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
return;
}
@@ -1360,7 +1351,8 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
DIV_ROUND_UP(sectors, PAGE_SECTORS),
- &c->bio_read_split));
+ &c->bio_read_split),
+ orig->opts);
bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
split = true;
@@ -1374,7 +1366,8 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
* lose the error)
*/
rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
- &c->bio_read_split));
+ &c->bio_read_split),
+ orig->opts);
rbio->bio.bi_iter = iter;
split = true;
} else {
@@ -1428,6 +1421,8 @@ noclone:
bch2_read_endio(&rbio->bio);
ret = rbio->retry;
+ if (rbio->split)
+ rbio = bch2_rbio_free(rbio);
if (!ret)
bch2_rbio_done(rbio);
}
@@ -1503,7 +1498,7 @@ err:
* possibly bigger than the memory that was
* originally allocated)
*/
- rbio->bio.bi_error = -EINTR;
+ rbio->bio.bi_status = BLK_STS_AGAIN;
bio_endio(&rbio->bio);
return;
}
@@ -1561,6 +1556,7 @@ retry:
case READ_RETRY:
goto retry;
case READ_ERR:
+ rbio->bio.bi_status = BLK_STS_IOERR;
bio_endio(&rbio->bio);
return;
};
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
index bd0d7c43..0c145eb6 100644
--- a/libbcachefs/io.h
+++ b/libbcachefs/io.h
@@ -21,6 +21,8 @@ void bch2_latency_acct(struct bch_dev *, unsigned, int);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *);
+#define BLK_STS_REMOVED ((__force blk_status_t)128)
+
enum bch_write_flags {
BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
BCH_WRITE_CACHED = (1 << 1),
@@ -29,11 +31,12 @@ enum bch_write_flags {
BCH_WRITE_PAGES_STABLE = (1 << 4),
BCH_WRITE_PAGES_OWNED = (1 << 5),
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
+ BCH_WRITE_NOPUT_RESERVATION = (1 << 7),
/* Internal: */
- BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 7),
- BCH_WRITE_DONE = (1 << 8),
- BCH_WRITE_LOOPED = (1 << 9),
+ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8),
+ BCH_WRITE_DONE = (1 << 9),
+ BCH_WRITE_LOOPED = (1 << 10),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
@@ -42,6 +45,12 @@ static inline u64 *op_journal_seq(struct bch_write_op *op)
? op->journal_seq_p : &op->journal_seq;
}
+static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq)
+{
+ op->journal_seq_p = journal_seq;
+ op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
+}
+
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
return op->alloc_reserve == RESERVE_MOVINGGC
@@ -51,14 +60,14 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
int bch2_write_index_default(struct bch_write_op *);
-static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
{
op->c = c;
op->io_wq = index_update_wq(op);
op->flags = 0;
op->written = 0;
op->error = 0;
- op->csum_type = bch2_data_checksum_type(c);
+ op->csum_type = bch2_data_checksum_type(c, c->opts.data_checksum);
op->compression_type =
bch2_compression_opt_to_type(c->opts.compression);
op->nr_replicas = 0;
@@ -75,27 +84,6 @@ static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *
op->index_update_fn = bch2_write_index_default;
}
-static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
- struct disk_reservation res,
- struct bch_devs_mask *devs,
- struct write_point_specifier write_point,
- struct bpos pos,
- u64 *journal_seq, unsigned flags)
-{
- __bch2_write_op_init(op, c);
- op->flags = flags;
- op->nr_replicas = res.nr_replicas;
- op->pos = pos;
- op->res = res;
- op->devs = devs;
- op->write_point = write_point;
-
- if (journal_seq) {
- op->journal_seq_p = journal_seq;
- op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
- }
-}
-
void bch2_write(struct closure *);
static inline struct bch_write_bio *wbio_init(struct bio *bio)
@@ -134,25 +122,27 @@ static inline void bch2_read_extent(struct bch_fs *c,
struct extent_pick_ptr *pick,
unsigned flags)
{
- rbio->_state = 0;
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags);
}
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
u64 inode)
{
- rbio->_state = 0;
+ BUG_ON(rbio->_state);
__bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL,
BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE|
BCH_READ_USER_MAPPED);
}
-static inline struct bch_read_bio *rbio_init(struct bio *bio)
+static inline struct bch_read_bio *rbio_init(struct bio *bio,
+ struct bch_io_opts opts)
{
struct bch_read_bio *rbio = to_rbio(bio);
- rbio->_state = 0;
+ rbio->_state = 0;
+ rbio->promote = NULL;
+ rbio->opts = opts;
return rbio;
}
diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h
index ed9a4bbe..ff18fdc9 100644
--- a/libbcachefs/io_types.h
+++ b/libbcachefs/io_types.h
@@ -6,6 +6,7 @@
#include "buckets_types.h"
#include "extents_types.h"
#include "keylist_types.h"
+#include "opts.h"
#include "super_types.h"
#include <linux/llist.h>
@@ -56,6 +57,8 @@ struct bch_read_bio {
struct promote_op *promote;
+ struct bch_io_opts opts;
+
struct work_struct work;
struct bio bio;
@@ -69,8 +72,7 @@ struct bch_write_bio {
struct closure *cl;
};
- u8 ptr_idx;
- u8 replicas_failed;
+ struct bch_devs_list failed;
u8 order;
unsigned split:1,
@@ -90,8 +92,8 @@ struct bch_write_op {
struct bch_fs *c;
struct workqueue_struct *io_wq;
+ unsigned written; /* sectors */
u16 flags;
- u16 written; /* sectors */
s8 error;
unsigned csum_type:4;
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 5d9a298d..b4e149ac 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -338,8 +338,8 @@ struct journal_list {
* Given a journal entry we just read, add it to the list of journal entries to
* be replayed:
*/
-static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
- struct jset *j)
+static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
+ struct journal_list *jlist, struct jset *j)
{
struct journal_replay *i, *pos;
struct list_head *where;
@@ -347,8 +347,6 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
__le64 last_seq;
int ret;
- mutex_lock(&jlist->lock);
-
last_seq = !list_empty(jlist->head)
? list_last_entry(jlist->head, struct journal_replay,
list)->j.last_seq
@@ -376,9 +374,7 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
memcmp(j, &i->j, bytes), c,
"found duplicate but non identical journal entries (seq %llu)",
le64_to_cpu(j->seq));
-
- ret = JOURNAL_ENTRY_ADD_OK;
- goto out;
+ goto found;
}
if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
@@ -395,12 +391,16 @@ add:
goto out;
}
- memcpy(&i->j, j, bytes);
list_add(&i->list, where);
+ i->devs.nr = 0;
+ memcpy(&i->j, j, bytes);
+found:
+ if (!fsck_err_on(bch2_dev_list_has_dev(i->devs, ca->dev_idx),
+ c, "duplicate journal entries on same device"))
+ bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
ret = JOURNAL_ENTRY_ADD_OK;
out:
fsck_err:
- mutex_unlock(&jlist->lock);
return ret;
}
@@ -496,8 +496,8 @@ fsck_err:
#define journal_entry_err_on(cond, c, msg, ...) \
((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
-static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
- int write)
+static int journal_entry_validate_entries(struct bch_fs *c, struct jset *j,
+ int write)
{
struct jset_entry *entry;
int ret = 0;
@@ -508,7 +508,7 @@ static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
if (journal_entry_err_on(vstruct_next(entry) >
vstruct_last(j), c,
"journal entry extends past end of jset")) {
- j->u64s = cpu_to_le64((u64 *) entry - j->_data);
+ j->u64s = cpu_to_le32((u64 *) entry - j->_data);
break;
}
@@ -614,7 +614,7 @@ static int journal_entry_validate(struct bch_fs *c,
"invalid journal entry: last_seq > seq"))
j->last_seq = j->seq;
- return __journal_entry_validate(c, j, write);
+ return 0;
fsck_err:
return ret;
}
@@ -722,7 +722,10 @@ reread: sectors_read = min_t(unsigned,
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
- ret = journal_entry_add(c, jlist, j);
+ mutex_lock(&jlist->lock);
+ ret = journal_entry_add(c, ca, jlist, j);
+ mutex_unlock(&jlist->lock);
+
switch (ret) {
case JOURNAL_ENTRY_ADD_OK:
*entries_found = true;
@@ -916,7 +919,9 @@ static int journal_seq_blacklist_read(struct journal *j,
for_each_jset_entry_type(entry, &i->j,
JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) {
- seq = le64_to_cpu(entry->_data[0]);
+ struct jset_entry_blacklist *bl_entry =
+ container_of(entry, struct jset_entry_blacklist, entry);
+ seq = le64_to_cpu(bl_entry->seq);
bch_verbose(c, "blacklisting existing journal seq %llu", seq);
@@ -982,6 +987,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
fsck_err_on(c->sb.clean && journal_has_keys(list), c,
"filesystem marked clean but journal has keys to replay");
+ list_for_each_entry(i, list, list) {
+ ret = journal_entry_validate_entries(c, &i->j, READ);
+ if (ret)
+ goto fsck_err;
+ }
+
i = list_last_entry(list, struct journal_replay, list);
unfixable_fsck_err_on(le64_to_cpu(i->j.seq) -
@@ -1002,6 +1013,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, 0);
+ p->devs.nr = 0;
}
mutex_lock(&j->blacklist_lock);
@@ -1010,6 +1022,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
atomic_set(&p->count, 1);
+ p->devs = i->devs;
if (journal_seq_blacklist_read(j, i, p)) {
mutex_unlock(&j->blacklist_lock);
@@ -1090,7 +1103,7 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
{
struct journal_buf *w = journal_prev_buf(j);
- atomic_dec_bug(&journal_seq_pin(j, w->data->seq)->count);
+ atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
if (!need_write_just_set &&
test_bit(JOURNAL_NEED_WRITE, &j->flags))
@@ -1122,6 +1135,7 @@ static void __journal_entry_new(struct journal *j, int count)
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, count);
+ p->devs.nr = 0;
}
static void __bch2_journal_next_entry(struct journal *j)
@@ -1851,6 +1865,21 @@ void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
bch2_journal_error(j));
}
+int bch2_journal_flush_all_pins(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool flush;
+
+ bch2_journal_flush_pins(j, U64_MAX);
+
+ spin_lock(&j->lock);
+ flush = last_seq(j) != j->last_seq_ondisk ||
+ c->btree_roots_dirty;
+ spin_unlock(&j->lock);
+
+ return flush ? bch2_journal_meta(j) : 0;
+}
+
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
bool ret;
@@ -2002,7 +2031,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
* i.e. whichever device was limiting the current journal entry size.
*/
extent_for_each_ptr_backwards(e, ptr) {
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
if (ca->mi.state != BCH_MEMBER_STATE_RW ||
ca->journal.sectors_free <= sectors)
@@ -2197,7 +2226,7 @@ static void journal_write_endio(struct bio *bio)
struct bch_dev *ca = bio->bi_private;
struct journal *j = &ca->fs->journal;
- if (bch2_dev_io_err_on(bio->bi_error, ca, "journal write") ||
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") ||
bch2_meta_write_fault("journal")) {
/* Was this a flush or an actual journal write? */
if (ca->journal.ptr_idx != U8_MAX) {
@@ -2233,6 +2262,7 @@ static void journal_write(struct closure *cl)
if (r->alive)
bch2_journal_add_btree_root(w, i, &r->key, r->level);
}
+ c->btree_roots_dirty = false;
mutex_unlock(&c->btree_root_lock);
journal_write_compact(jset);
@@ -2246,7 +2276,7 @@ static void journal_write(struct closure *cl)
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
- __journal_entry_validate(c, jset, WRITE))
+ journal_entry_validate_entries(c, jset, WRITE))
goto err;
bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
@@ -2257,7 +2287,7 @@ static void journal_write(struct closure *cl)
journal_nonce(jset), jset);
if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
- __journal_entry_validate(c, jset, WRITE))
+ journal_entry_validate_entries(c, jset, WRITE))
goto err;
sectors = vstruct_sectors(jset, c->block_bits);
@@ -2277,6 +2307,9 @@ static void journal_write(struct closure *cl)
BCH_DATA_JOURNAL))
goto err;
+ journal_seq_pin(j, le64_to_cpu(jset->seq))->devs =
+ bch2_extent_devs(bkey_i_to_s_c_extent(&j->key));
+
/*
* XXX: we really should just disable the entire journal in nochanges
* mode
@@ -2285,7 +2318,7 @@ static void journal_write(struct closure *cl)
goto no_io;
extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) {
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
if (!percpu_ref_tryget(&ca->io_ref)) {
/* XXX: fix this */
bch_err(c, "missing device for journal write\n");
@@ -2693,6 +2726,46 @@ int bch2_journal_flush(struct journal *j)
return bch2_journal_flush_seq(j, seq);
}
+int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_entry_pin_list *p;
+ struct bch_devs_list devs;
+ u64 seq = 0;
+ unsigned iter;
+ int ret = 0;
+
+ spin_lock(&j->lock);
+ fifo_for_each_entry_ptr(p, &j->pin, iter)
+ if (bch2_dev_list_has_dev(p->devs, dev_idx))
+ seq = journal_pin_seq(j, p);
+ spin_unlock(&j->lock);
+
+ bch2_journal_flush_pins(j, seq);
+
+ mutex_lock(&c->replicas_gc_lock);
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
+
+ seq = 0;
+
+ spin_lock(&j->lock);
+ while (!ret && seq < atomic64_read(&j->seq)) {
+ seq = max(seq, last_seq(j));
+ devs = journal_seq_pin(j, seq)->devs;
+ seq++;
+
+ spin_unlock(&j->lock);
+ ret = bch2_check_mark_super_devlist(c, &devs, BCH_DATA_JOURNAL);
+ spin_lock(&j->lock);
+ }
+ spin_unlock(&j->lock);
+
+ bch2_replicas_gc_end(c, ret);
+ mutex_unlock(&c->replicas_gc_lock);
+
+ return ret;
+}
+
ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
@@ -2862,9 +2935,7 @@ void bch2_fs_journal_stop(struct journal *j)
* journal entries, then force a brand new empty journal entry to be
* written:
*/
- bch2_journal_flush_pins(j, U64_MAX);
- bch2_journal_flush_async(j, NULL);
- bch2_journal_meta(j);
+ bch2_journal_flush_all_pins(j);
cancel_delayed_work_sync(&j->write_work);
cancel_delayed_work_sync(&j->reclaim_work);
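
journal_entry_add() above now records, per journal entry, which devices the
entry was found on. bch2_dev_list_has_dev() and bch2_dev_list_add_dev() are not
part of this diff; assuming bch_devs_list is the small fixed array of device
indices that the devs.nr resets and devs->devs[i] accesses in this patch imply,
they behave roughly like:

	/* sketch only -- the real helpers live elsewhere in the tree: */
	static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
						 unsigned dev)
	{
		unsigned i;

		for (i = 0; i < devs.nr; i++)
			if (devs.devs[i] == dev)
				return true;
		return false;
	}

	static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
						 unsigned dev)
	{
		devs->devs[devs->nr++] = dev;
	}

The same device lists are copied onto journal pins (p->devs above), which is
what lets bch2_journal_flush_device() find the entries still pinned to a given
device.
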
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 9d6c79c6..5f3ece08 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -118,6 +118,8 @@
*/
struct journal_replay {
struct list_head list;
+ struct bch_devs_list devs;
+ /* must be last: */
struct jset j;
};
@@ -164,6 +166,7 @@ void bch2_journal_pin_add_if_older(struct journal *,
struct journal_entry_pin *,
journal_pin_flush_fn);
void bch2_journal_flush_pins(struct journal *, u64);
+int bch2_journal_flush_all_pins(struct journal *);
struct closure;
struct bch_fs;
@@ -356,6 +359,7 @@ void bch2_journal_meta_async(struct journal *, struct closure *);
int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *);
int bch2_journal_meta(struct journal *);
+int bch2_journal_flush_device(struct journal *, unsigned);
void bch2_journal_halt(struct journal *);
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 55b41c56..87f378a6 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -34,6 +34,7 @@ struct journal_entry_pin_list {
struct list_head list;
struct list_head flushed;
atomic_t count;
+ struct bch_devs_list devs;
};
struct journal;
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 8d1c0ee0..e11ee953 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -27,23 +27,9 @@ static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
#define MAX_DATA_OFF_ITER 10
-/*
- * This moves only the data off, leaving the meta-data (if any) in place.
- * It walks the key space, and for any key with a valid pointer to the
- * relevant device, it copies it elsewhere, updating the key to point to
- * the copy.
- * The meta-data is moved off by bch_move_meta_data_off_device.
- *
- * Note: If the number of data replicas desired is > 1, ideally, any
- * new copies would not be made in the same device that already have a
- * copy (if there are enough devices).
- * This is _not_ currently implemented. The multiple replicas can
- * land in the same device even if there are others available.
- */
-
-int bch2_move_data_off_device(struct bch_dev *ca)
+static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
+ int flags)
{
- struct bch_fs *c = ca->fs;
struct btree_iter iter;
struct bkey_s_c k;
u64 keys_moved, sectors_moved;
@@ -113,10 +99,6 @@ int bch2_move_data_off_device(struct bch_dev *ca)
return ret;
}
-/*
- * This walks the btree, and for any node on the relevant device it moves the
- * node elsewhere.
- */
static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
enum btree_id id)
{
@@ -200,9 +182,9 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
* is written.
*/
-int bch2_move_metadata_off_device(struct bch_dev *ca)
+static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca,
+ int flags)
{
- struct bch_fs *c = ca->fs;
unsigned i;
int ret = 0;
@@ -240,37 +222,31 @@ err:
return ret;
}
-/*
- * Flagging data bad when forcibly removing a device after failing to
- * migrate the data off the device.
- */
+int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags)
+{
+ return bch2_dev_usrdata_migrate(c, ca, flags) ?:
+ bch2_dev_metadata_migrate(c, ca, flags);
+}
-static int bch2_flag_key_bad(struct btree_iter *iter,
- struct bch_dev *ca,
- struct bkey_s_c_extent orig)
+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
+ unsigned dev_idx, int flags, bool metadata)
{
- BKEY_PADDED(key) tmp;
- struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
- struct bch_fs *c = ca->fs;
-
- bkey_reassemble(&tmp.key, orig.s_c);
- e = bkey_i_to_s_extent(&tmp.key);
+ unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
+ unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
+ unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
+ unsigned nr_good;
extent_for_each_ptr_backwards(e, ptr)
- if (ptr->dev == ca->dev_idx)
+ if (ptr->dev == dev_idx)
bch2_extent_drop_ptr(e, ptr);
- /*
- * If the new extent no longer has any pointers, bch2_extent_normalize()
- * will do the appropriate thing with it (turning it into a
- * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
- */
- bch2_extent_normalize(c, e.s);
+ nr_good = bch2_extent_nr_good_ptrs(c, e.c);
+ if ((!nr_good && !(flags & lost)) ||
+ (nr_good < replicas && !(flags & degraded)))
+ return -EINVAL;
- return bch2_btree_insert_at(c, NULL, NULL, NULL,
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(iter, &tmp.key));
+ return 0;
}
/*
@@ -284,11 +260,11 @@ static int bch2_flag_key_bad(struct btree_iter *iter,
* that we've already tried to move the data MAX_DATA_OFF_ITER times and
* are not likely to succeed if we try again.
*/
-int bch2_flag_data_bad(struct bch_dev *ca)
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
- struct bch_fs *c = ca->fs;
struct bkey_s_c k;
- struct bkey_s_c_extent e;
+ struct bkey_s_extent e;
+ BKEY_PADDED(key) tmp;
struct btree_iter iter;
int ret = 0;
@@ -303,11 +279,33 @@ int bch2_flag_data_bad(struct bch_dev *ca)
if (!bkey_extent_is_data(k.k))
goto advance;
- e = bkey_s_c_to_extent(k);
- if (!bch2_extent_has_device(e, ca->dev_idx))
+ if (!bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx))
goto advance;
- ret = bch2_flag_key_bad(&iter, ca, e);
+ bkey_reassemble(&tmp.key, k);
+ e = bkey_i_to_s_extent(&tmp.key);
+
+ ret = drop_dev_ptrs(c, e, dev_idx, flags, false);
+ if (ret)
+ break;
+
+ /*
+ * If the new extent no longer has any pointers, bch2_extent_normalize()
+ * will do the appropriate thing with it (turning it into a
+ * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
+ */
+ bch2_extent_normalize(c, e.s);
+
+ if (bkey_extent_is_data(e.k) &&
+ (ret = bch2_check_mark_super(c, e.c, BCH_DATA_USER)))
+ break;
+
+ iter.pos = bkey_start_pos(&tmp.key.k);
+
+ ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL,
+ BTREE_INSERT_ENTRY(&iter, &tmp.key));
/*
* don't want to leave ret == -EINTR, since if we raced and
@@ -319,26 +317,6 @@ int bch2_flag_data_bad(struct bch_dev *ca)
if (ret)
break;
- /*
- * If the replica we're dropping was dirty and there is an
- * additional cached replica, the cached replica will now be
- * considered dirty - upon inserting the new version of the key,
- * the bucket accounting will be updated to reflect the fact
- * that the cached data is now dirty and everything works out as
- * if by magic without us having to do anything.
- *
- * The one thing we need to be concerned with here is there's a
- * race between when we drop any stale pointers from the key
- * we're about to insert, and when the key actually gets
- * inserted and the cached data is marked as dirty - we could
- * end up trying to insert a key with a pointer that should be
- * dirty, but points to stale data.
- *
- * If that happens the insert code just bails out and doesn't do
- * the insert - however, it doesn't return an error. Hence we
- * need to always recheck the current key before advancing to
- * the next:
- */
continue;
advance:
if (bkey_extent_is_data(k.k)) {
@@ -357,3 +335,80 @@ advance:
return ret;
}
+
+static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+ struct btree_iter iter;
+ struct closure cl;
+ struct btree *b;
+ unsigned id;
+ int ret;
+
+ /* don't handle this yet: */
+ if (flags & BCH_FORCE_IF_METADATA_LOST)
+ return -EINVAL;
+
+ closure_init_stack(&cl);
+
+ mutex_lock(&c->replicas_gc_lock);
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
+
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+ struct bkey_i_extent *new_key;
+retry:
+ if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
+ dev_idx)) {
+ bch2_btree_iter_set_locks_want(&iter, 0);
+
+ ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key),
+ BCH_DATA_BTREE);
+ if (ret)
+ goto err;
+ } else {
+ bkey_copy(&tmp.k, &b->key);
+ new_key = bkey_i_to_extent(&tmp.k);
+
+ ret = drop_dev_ptrs(c, extent_i_to_s(new_key),
+ dev_idx, flags, true);
+ if (ret)
+ goto err;
+
+ if (!bch2_btree_iter_set_locks_want(&iter, U8_MAX)) {
+ b = bch2_btree_iter_peek_node(&iter);
+ goto retry;
+ }
+
+ ret = bch2_btree_node_update_key(c, &iter, b, new_key);
+ if (ret == -EINTR) {
+ b = bch2_btree_iter_peek_node(&iter);
+ goto retry;
+ }
+ if (ret)
+ goto err;
+ }
+ }
+ bch2_btree_iter_unlock(&iter);
+
+ /* btree root */
+ mutex_lock(&c->btree_root_lock);
+ mutex_unlock(&c->btree_root_lock);
+ }
+
+ ret = 0;
+out:
+ bch2_replicas_gc_end(c, ret);
+ mutex_unlock(&c->replicas_gc_lock);
+
+ return ret;
+err:
+ bch2_btree_iter_unlock(&iter);
+ goto out;
+}
+
+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+ return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
+ bch2_dev_metadata_drop(c, dev_idx, flags);
+}
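
To make the force-flag logic in drop_dev_ptrs() concrete: with data_replicas = 2,
dropping the departing device's pointer from an extent that had two dirty copies
leaves nr_good = 1, which is refused unless BCH_FORCE_IF_DATA_DEGRADED was
passed; if that was the only copy, nr_good = 0 and BCH_FORCE_IF_DATA_LOST is
required. An illustrative restatement of the check, not part of the patch:

	static int may_drop_ptr(unsigned nr_good, unsigned replicas, int flags,
				unsigned lost, unsigned degraded)
	{
		if (!nr_good && !(flags & lost))
			return -EINVAL;		/* would lose the last copy */
		if (nr_good < replicas && !(flags & degraded))
			return -EINVAL;		/* would leave data degraded */
		return 0;
	}
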
diff --git a/libbcachefs/migrate.h b/libbcachefs/migrate.h
index 9bdaa792..6db7b911 100644
--- a/libbcachefs/migrate.h
+++ b/libbcachefs/migrate.h
@@ -1,8 +1,7 @@
#ifndef _BCACHEFS_MIGRATE_H
#define _BCACHEFS_MIGRATE_H
-int bch2_move_data_off_device(struct bch_dev *);
-int bch2_move_metadata_off_device(struct bch_dev *);
-int bch2_flag_data_bad(struct bch_dev *);
+int bch2_dev_data_migrate(struct bch_fs *, struct bch_dev *, int);
+int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
#endif /* _BCACHEFS_MIGRATE_H */
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 5eaf0cf8..8ce63d66 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -3,6 +3,7 @@
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
+#include "inode.h"
#include "io.h"
#include "move.h"
#include "super-io.h"
@@ -206,7 +207,7 @@ static void move_write(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
- if (likely(!io->rbio.bio.bi_error)) {
+ if (likely(!io->rbio.bio.bi_status)) {
bch2_migrate_write_init(&io->write, &io->rbio);
closure_call(&io->write.op.cl, bch2_write, NULL, cl);
}
@@ -240,6 +241,7 @@ static int bch2_move_extent(struct bch_fs *c,
struct write_point_specifier wp,
int btree_insert_flags,
int move_device,
+ struct bch_io_opts opts,
struct bkey_s_c k)
{
struct extent_pick_ptr pick;
@@ -276,6 +278,7 @@ static int bch2_move_extent(struct bch_fs *c,
goto err;
}
+ io->rbio.opts = opts;
bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9;
@@ -284,9 +287,13 @@ static int bch2_move_extent(struct bch_fs *c,
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
io->rbio.bio.bi_end_io = move_read_endio;
- __bch2_write_op_init(&io->write.op, c);
io->write.btree_insert_flags = btree_insert_flags;
io->write.move_dev = move_device;
+
+ bch2_write_op_init(&io->write.op, c);
+ io->write.op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
+ io->write.op.compression_type =
+ bch2_compression_opt_to_type(opts.compression);
io->write.op.devs = devs;
io->write.op.write_point = wp;
@@ -371,9 +378,11 @@ int bch2_move_data(struct bch_fs *c,
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct moving_context ctxt;
+ struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
struct btree_iter iter;
BKEY_PADDED(k) tmp;
struct bkey_s_c k;
+ u64 cur_inum = U64_MAX;
int ret = 0;
bch2_move_ctxt_init(&ctxt);
@@ -396,7 +405,7 @@ int bch2_move_data(struct bch_fs *c,
(bch2_btree_iter_unlock(&iter),
(ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
break;
-
+peek:
k = bch2_btree_iter_peek(&iter);
if (!k.k)
break;
@@ -404,8 +413,23 @@ int bch2_move_data(struct bch_fs *c,
if (ret)
break;
- if (!bkey_extent_is_data(k.k) ||
- !pred(arg, bkey_s_c_to_extent(k)))
+ if (!bkey_extent_is_data(k.k))
+ goto next;
+
+ if (cur_inum != k.k->p.inode) {
+ struct bch_inode_unpacked inode;
+
+ /* don't hold btree locks while looking up inode: */
+ bch2_btree_iter_unlock(&iter);
+
+ opts = bch2_opts_to_inode_opts(c->opts);
+ if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
+ bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode));
+ cur_inum = k.k->p.inode;
+ goto peek;
+ }
+
+ if (!pred(arg, bkey_s_c_to_extent(k)))
goto next;
/* unlock before doing IO: */
@@ -415,7 +439,7 @@ int bch2_move_data(struct bch_fs *c,
if (bch2_move_extent(c, &ctxt, devs, wp,
btree_insert_flags,
- move_device, k)) {
+ move_device, opts, k)) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(&ctxt);
continue;
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index c9482151..28e40e41 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -76,16 +76,27 @@ void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
#undef BCH_OPT
}
-u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
{
switch (id) {
#define BCH_OPT(_name, ...) \
case Opt_##_name: \
- return opts->_name; \
-
+ return opt_defined(*opts, _name);
BCH_OPTS()
#undef BCH_OPT
+ default:
+ BUG();
+ }
+}
+u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+{
+ switch (id) {
+#define BCH_OPT(_name, ...) \
+ case Opt_##_name: \
+ return opts->_name;
+ BCH_OPTS()
+#undef BCH_OPT
default:
BUG();
}
@@ -98,10 +109,8 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
case Opt_##_name: \
opt_set(*opts, _name, v); \
break;
-
BCH_OPTS()
#undef BCH_OPT
-
default:
BUG();
}
@@ -118,7 +127,6 @@ struct bch_opts bch2_opts_from_sb(struct bch_sb *sb)
#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \
if (_sb_opt != NO_SB_OPT) \
opt_set(opts, _name, _sb_opt(sb));
-
BCH_OPTS()
#undef BCH_OPT
@@ -145,7 +153,7 @@ const struct bch_option bch2_opt_table[] = {
#undef BCH_OPT
};
-static int bch2_opt_lookup(const char *name)
+int bch2_opt_lookup(const char *name)
{
const struct bch_option *i;
@@ -247,3 +255,52 @@ no_val:
pr_err("Mount option %s requires a value", name);
return -1;
}
+
+/* io opts: */
+
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
+{
+ struct bch_io_opts ret = { 0 };
+#define BCH_INODE_OPT(_name, _bits) \
+ if (opt_defined(src, _name)) \
+ opt_set(ret, _name, src._name);
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+ return ret;
+}
+
+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src)
+{
+ struct bch_opts ret = { 0 };
+#define BCH_INODE_OPT(_name, _bits) \
+ if (opt_defined(src, _name)) \
+ opt_set(ret, _name, src._name);
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+ return ret;
+}
+
+void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src)
+{
+#define BCH_INODE_OPT(_name, _bits) \
+ if (opt_defined(src, _name)) \
+ opt_set(*dst, _name, src._name);
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+}
+
+bool bch2_opt_is_inode_opt(enum bch_opt_id id)
+{
+ static const enum bch_opt_id inode_opt_list[] = {
+#define BCH_INODE_OPT(_name, _bits) Opt_##_name,
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+ };
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
+ if (inode_opt_list[i] == id)
+ return true;
+
+ return false;
+}
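
The BCH_INODE_OPT x-macro keeps these helpers in sync with the option list in
opts.h; with the two entries currently defined (data_checksum and compression),
bch2_opts_to_inode_opts() expands to the equivalent of:

	struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
	{
		struct bch_io_opts ret = { 0 };

		if (opt_defined(src, data_checksum))
			opt_set(ret, data_checksum, src.data_checksum);
		if (opt_defined(src, compression))
			opt_set(ret, compression, src.compression);
		return ret;
	}
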
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 33e3a2c8..126056e6 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -181,10 +181,7 @@ do { \
static inline struct bch_opts bch2_opts_empty(void)
{
- struct bch_opts opts;
-
- memset(&opts, 0, sizeof(opts));
- return opts;
+ return (struct bch_opts) { 0 };
}
void bch2_opts_apply(struct bch_opts *, struct bch_opts);
@@ -215,12 +212,35 @@ struct bch_option {
extern const struct bch_option bch2_opt_table[];
+bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
struct bch_opts bch2_opts_from_sb(struct bch_sb *);
+int bch2_opt_lookup(const char *);
int bch2_opt_parse(const struct bch_option *, const char *, u64 *);
int bch2_parse_mount_opts(struct bch_opts *, char *);
+/* inode opts: */
+
+#define BCH_INODE_OPTS() \
+ BCH_INODE_OPT(data_checksum, 8) \
+ BCH_INODE_OPT(compression, 8)
+
+struct bch_io_opts {
+#define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1;
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+
+#define BCH_INODE_OPT(_name, _bits) u##_bits _name;
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+};
+
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts);
+void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts);
+bool bch2_opt_is_inode_opt(enum bch_opt_id);
+
#endif /* _BCACHEFS_OPTS_H */
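
For reference, with the two BCH_INODE_OPTS entries above, struct bch_io_opts
expands to the equivalent of:

	struct bch_io_opts {
		unsigned	data_checksum_defined:1;
		unsigned	compression_defined:1;

		u8		data_checksum;
		u8		compression;
	};
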
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index a3ecfb92..3f55c244 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -12,6 +12,8 @@
#include <linux/sort.h>
static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
+ struct bch_replicas_cpu *);
static const char *bch2_sb_validate_replicas(struct bch_sb *);
static inline void __bch2_sb_layout_size_assert(void)
@@ -157,7 +159,7 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
return NULL;
f = __bch2_sb_field_resize(sb->sb, f, u64s);
- f->type = type;
+ f->type = cpu_to_le32(type);
return f;
}
@@ -188,7 +190,7 @@ struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *c,
}
f = __bch2_sb_field_resize(c->disk_sb, f, u64s);
- f->type = type;
+ f->type = cpu_to_le32(type);
return f;
}
@@ -354,7 +356,16 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
- return "Invalid number of metadata replicas";
+ return "Invalid number of data replicas";
+
+ if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
+ return "Invalid metadata checksum type";
+
+ if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
+ return "Invalid metadata checksum type";
+
+ if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR)
+ return "Invalid compression type";
if (!BCH_SB_BTREE_NODE_SIZE(sb))
return "Btree node size not set";
@@ -507,7 +518,7 @@ static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
if (src_f->type == BCH_SB_FIELD_journal)
continue;
- dst_f = bch2_sb_field_get(dst, src_f->type);
+ dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type));
dst_f = __bch2_sb_field_resize(dst, dst_f,
le32_to_cpu(src_f->u64s));
@@ -601,7 +612,7 @@ reread:
/* XXX: verify MACs */
csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
- (struct nonce) { 0 }, sb->sb);
+ null_nonce(), sb->sb);
if (bch2_crc_cmp(csum, sb->sb->csum))
return "bad checksum reading superblock";
@@ -688,9 +699,9 @@ const char *bch2_read_super(const char *path,
got_super:
pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
le64_to_cpu(ret->sb->version),
- le64_to_cpu(ret->sb->flags),
+ le64_to_cpu(ret->sb->flags[0]),
le64_to_cpu(ret->sb->seq),
- le16_to_cpu(ret->sb->u64s));
+ le32_to_cpu(ret->sb->u64s));
err = "Superblock block size smaller than device block size";
if (le16_to_cpu(ret->sb->block_size) << 9 <
@@ -711,7 +722,7 @@ static void write_super_endio(struct bio *bio)
/* XXX: return errors directly */
- if (bch2_dev_io_err_on(bio->bi_error, ca, "superblock write"))
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write"))
ca->sb_write_error = 1;
closure_put(&ca->fs->sb_write);
@@ -727,7 +738,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
- (struct nonce) { 0 }, sb);
+ null_nonce(), sb);
bio_reset(bio);
bio->bi_bdev = ca->disk_sb.bdev;
@@ -830,7 +841,12 @@ out:
bch2_sb_update(c);
}
-/* replica information: */
+/* Replicas tracking - in memory: */
+
+#define for_each_cpu_replicas_entry(_r, _i) \
+ for (_i = (_r)->entries; \
+ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
+ _i = (void *) (_i) + (_r)->entry_size)
static inline struct bch_replicas_cpu_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
@@ -838,6 +854,11 @@ cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
return (void *) r->entries + r->entry_size * i;
}
+static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
+{
+ eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
+}
+
static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
unsigned dev)
{
@@ -856,6 +877,246 @@ static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
}
+static unsigned bkey_to_replicas(struct bkey_s_c_extent e,
+ enum bch_data_type data_type,
+ struct bch_replicas_cpu_entry *r,
+ unsigned *max_dev)
+{
+ const struct bch_extent_ptr *ptr;
+ unsigned nr = 0;
+
+ BUG_ON(!data_type ||
+ data_type == BCH_DATA_SB ||
+ data_type >= BCH_DATA_NR);
+
+ memset(r, 0, sizeof(*r));
+ r->data_type = data_type;
+
+ *max_dev = 0;
+
+ extent_for_each_ptr(e, ptr)
+ if (!ptr->cached) {
+ *max_dev = max_t(unsigned, *max_dev, ptr->dev);
+ replicas_set_dev(r, ptr->dev);
+ nr++;
+ }
+ return nr;
+}
+
+static struct bch_replicas_cpu *
+cpu_replicas_add_entry(struct bch_replicas_cpu *old,
+ struct bch_replicas_cpu_entry new_entry,
+ unsigned max_dev)
+{
+ struct bch_replicas_cpu *new;
+ unsigned i, nr, entry_size;
+
+ entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
+ DIV_ROUND_UP(max_dev + 1, 8);
+ entry_size = max(entry_size, old->entry_size);
+ nr = old->nr + 1;
+
+ new = kzalloc(sizeof(struct bch_replicas_cpu) +
+ nr * entry_size, GFP_NOIO);
+ if (!new)
+ return NULL;
+
+ new->nr = nr;
+ new->entry_size = entry_size;
+
+ for (i = 0; i < old->nr; i++)
+ memcpy(cpu_replicas_entry(new, i),
+ cpu_replicas_entry(old, i),
+ min(new->entry_size, old->entry_size));
+
+ memcpy(cpu_replicas_entry(new, old->nr),
+ &new_entry,
+ new->entry_size);
+
+ bch2_cpu_replicas_sort(new);
+ return new;
+}
+
+static bool replicas_has_entry(struct bch_replicas_cpu *r,
+ struct bch_replicas_cpu_entry search,
+ unsigned max_dev)
+{
+ return max_dev < replicas_dev_slots(r) &&
+ eytzinger0_find(r->entries, r->nr,
+ r->entry_size,
+ memcmp, &search) < r->nr;
+}
+
+noinline
+static int bch2_check_mark_super_slowpath(struct bch_fs *c,
+ struct bch_replicas_cpu_entry new_entry,
+ unsigned max_dev)
+{
+ struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r;
+ int ret = -ENOMEM;
+
+ mutex_lock(&c->sb_lock);
+
+ old_gc = rcu_dereference_protected(c->replicas_gc,
+ lockdep_is_held(&c->sb_lock));
+ if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
+ new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
+ if (!new_gc)
+ goto err;
+ }
+
+ old_r = rcu_dereference_protected(c->replicas,
+ lockdep_is_held(&c->sb_lock));
+ /* recheck, might have raced */
+ if (replicas_has_entry(old_r, new_entry, max_dev))
+ goto out;
+
+ new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
+ if (!new_r)
+ goto err;
+
+ ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
+ if (ret)
+ goto err;
+
+ if (new_gc) {
+ rcu_assign_pointer(c->replicas_gc, new_gc);
+ kfree_rcu(old_gc, rcu);
+ }
+
+ rcu_assign_pointer(c->replicas, new_r);
+ kfree_rcu(old_r, rcu);
+
+ bch2_write_super(c);
+out:
+ ret = 0;
+err:
+ mutex_unlock(&c->sb_lock);
+ return ret;
+}
+
+static inline int __bch2_check_mark_super(struct bch_fs *c,
+ struct bch_replicas_cpu_entry search,
+ unsigned max_dev)
+{
+ struct bch_replicas_cpu *r, *gc_r;
+ bool marked;
+
+ rcu_read_lock();
+ r = rcu_dereference(c->replicas);
+ gc_r = rcu_dereference(c->replicas_gc);
+ marked = replicas_has_entry(r, search, max_dev) &&
+ (!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
+ rcu_read_unlock();
+
+ return likely(marked) ? 0
+ : bch2_check_mark_super_slowpath(c, search, max_dev);
+}
+
+int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
+ enum bch_data_type data_type)
+{
+ struct bch_replicas_cpu_entry search;
+ unsigned max_dev;
+
+ if (!bkey_to_replicas(e, data_type, &search, &max_dev))
+ return 0;
+
+ return __bch2_check_mark_super(c, search, max_dev);
+}
+
+int bch2_check_mark_super_devlist(struct bch_fs *c,
+ struct bch_devs_list *devs,
+ enum bch_data_type data_type)
+{
+ struct bch_replicas_cpu_entry search = { .data_type = data_type };
+ unsigned i, max_dev = 0;
+
+ if (!devs->nr)
+ return 0;
+
+ for (i = 0; i < devs->nr; i++) {
+ max_dev = max_t(unsigned, max_dev, devs->devs[i]);
+ replicas_set_dev(&search, devs->devs[i]);
+ }
+
+ return __bch2_check_mark_super(c, search, max_dev);
+}
+
+int bch2_replicas_gc_end(struct bch_fs *c, int err)
+{
+ struct bch_replicas_cpu *new_r, *old_r;
+ int ret = 0;
+
+ lockdep_assert_held(&c->replicas_gc_lock);
+
+ mutex_lock(&c->sb_lock);
+
+ new_r = rcu_dereference_protected(c->replicas_gc,
+ lockdep_is_held(&c->sb_lock));
+
+ if (err) {
+ rcu_assign_pointer(c->replicas_gc, NULL);
+ kfree_rcu(new_r, rcu);
+ goto err;
+ }
+
+ if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
+ ret = -ENOSPC;
+ goto err;
+ }
+
+ old_r = rcu_dereference_protected(c->replicas,
+ lockdep_is_held(&c->sb_lock));
+
+ rcu_assign_pointer(c->replicas, new_r);
+ rcu_assign_pointer(c->replicas_gc, NULL);
+ kfree_rcu(old_r, rcu);
+
+ bch2_write_super(c);
+err:
+ mutex_unlock(&c->sb_lock);
+ return ret;
+}
+
+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
+{
+ struct bch_replicas_cpu *dst, *src;
+ struct bch_replicas_cpu_entry *e;
+
+ lockdep_assert_held(&c->replicas_gc_lock);
+
+ mutex_lock(&c->sb_lock);
+ BUG_ON(c->replicas_gc);
+
+ src = rcu_dereference_protected(c->replicas,
+ lockdep_is_held(&c->sb_lock));
+
+ dst = kzalloc(sizeof(struct bch_replicas_cpu) +
+ src->nr * src->entry_size, GFP_NOIO);
+ if (!dst) {
+ mutex_unlock(&c->sb_lock);
+ return -ENOMEM;
+ }
+
+ dst->nr = 0;
+ dst->entry_size = src->entry_size;
+
+ for_each_cpu_replicas_entry(src, e)
+ if (!((1 << e->data_type) & typemask))
+ memcpy(cpu_replicas_entry(dst, dst->nr++),
+ e, dst->entry_size);
+
+ bch2_cpu_replicas_sort(dst);
+
+ rcu_assign_pointer(c->replicas_gc, dst);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+}
+
+/* Replicas tracking - superblock: */
+
static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
unsigned *nr,
unsigned *bytes,
@@ -914,10 +1175,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
}
}
- eytzinger0_sort(cpu_r->entries,
- cpu_r->nr,
- cpu_r->entry_size,
- memcmp, NULL);
+ bch2_cpu_replicas_sort(cpu_r);
return cpu_r;
}
@@ -926,14 +1184,12 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *cpu_r, *old_r;
- lockdep_assert_held(&c->sb_lock);
-
sb_r = bch2_sb_get_replicas(c->disk_sb);
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
return -ENOMEM;
- old_r = c->replicas;
+ old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, cpu_r);
if (old_r)
kfree_rcu(old_r, rcu);
@@ -941,192 +1197,133 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
return 0;
}
-static void bkey_to_replicas(struct bkey_s_c_extent e,
- enum bch_data_type data_type,
- struct bch_replicas_cpu_entry *r,
- unsigned *max_dev)
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
+ struct bch_replicas_cpu *r)
{
- const struct bch_extent_ptr *ptr;
-
- BUG_ON(!data_type ||
- data_type == BCH_DATA_SB ||
- data_type >= BCH_DATA_NR);
-
- memset(r, 0, sizeof(*r));
- r->data_type = data_type;
-
- *max_dev = 0;
-
- extent_for_each_ptr(e, ptr)
- if (!ptr->cached) {
- *max_dev = max_t(unsigned, *max_dev, ptr->dev);
- replicas_set_dev(r, ptr->dev);
- }
-}
+ struct bch_sb_field_replicas *sb_r;
+ struct bch_replicas_entry *sb_e;
+ struct bch_replicas_cpu_entry *e;
+ size_t i, bytes;
-/*
- * for when gc of replica information is in progress:
- */
-static int bch2_update_gc_replicas(struct bch_fs *c,
- struct bch_replicas_cpu *gc_r,
- struct bkey_s_c_extent e,
- enum bch_data_type data_type)
-{
- struct bch_replicas_cpu_entry new_e;
- struct bch_replicas_cpu *new;
- unsigned i, nr, entry_size, max_dev;
+ bytes = sizeof(struct bch_sb_field_replicas);
- bkey_to_replicas(e, data_type, &new_e, &max_dev);
+ for_each_cpu_replicas_entry(r, e) {
+ bytes += sizeof(struct bch_replicas_entry);
+ for (i = 0; i < r->entry_size - 1; i++)
+ bytes += hweight8(e->devs[i]);
+ }
- entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
- DIV_ROUND_UP(max_dev + 1, 8);
- entry_size = max(entry_size, gc_r->entry_size);
- nr = gc_r->nr + 1;
+ sb_r = bch2_fs_sb_resize_replicas(c,
+ DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
+ if (!sb_r)
+ return -ENOSPC;
- new = kzalloc(sizeof(struct bch_replicas_cpu) +
- nr * entry_size, GFP_NOIO);
- if (!new)
- return -ENOMEM;
+ memset(&sb_r->entries, 0,
+ vstruct_end(&sb_r->field) -
+ (void *) &sb_r->entries);
- new->nr = nr;
- new->entry_size = entry_size;
+ sb_e = sb_r->entries;
+ for_each_cpu_replicas_entry(r, e) {
+ sb_e->data_type = e->data_type;
- for (i = 0; i < gc_r->nr; i++)
- memcpy(cpu_replicas_entry(new, i),
- cpu_replicas_entry(gc_r, i),
- gc_r->entry_size);
+ for (i = 0; i < replicas_dev_slots(r); i++)
+ if (replicas_test_dev(e, i))
+ sb_e->devs[sb_e->nr++] = i;
- memcpy(cpu_replicas_entry(new, nr - 1),
- &new_e,
- new->entry_size);
+ sb_e = replicas_entry_next(sb_e);
- eytzinger0_sort(new->entries,
- new->nr,
- new->entry_size,
- memcmp, NULL);
+ BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
+ }
- rcu_assign_pointer(c->replicas_gc, new);
- kfree_rcu(gc_r, rcu);
return 0;
}
-static bool replicas_has_extent(struct bch_replicas_cpu *r,
- struct bkey_s_c_extent e,
- enum bch_data_type data_type)
-{
- struct bch_replicas_cpu_entry search;
- unsigned max_dev;
-
- bkey_to_replicas(e, data_type, &search, &max_dev);
-
- return max_dev < replicas_dev_slots(r) &&
- eytzinger0_find(r->entries, r->nr,
- r->entry_size,
- memcmp, &search) < r->nr;
-}
-
-bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
- enum bch_data_type data_type)
-{
- bool ret;
-
- rcu_read_lock();
- ret = replicas_has_extent(rcu_dereference(c->replicas),
- e, data_type);
- rcu_read_unlock();
-
- return ret;
-}
-
-noinline
-static int bch2_check_mark_super_slowpath(struct bch_fs *c,
- struct bkey_s_c_extent e,
- enum bch_data_type data_type)
+static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
{
- struct bch_replicas_cpu *gc_r;
- const struct bch_extent_ptr *ptr;
+ struct bch_sb_field_members *mi;
struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_entry *new_entry;
- unsigned new_entry_bytes, new_u64s, nr, bytes, max_dev;
- int ret = 0;
+ struct bch_replicas_cpu *cpu_r = NULL;
+ struct bch_replicas_entry *e;
+ const char *err;
+ unsigned i;
- mutex_lock(&c->sb_lock);
+ mi = bch2_sb_get_members(sb);
+ sb_r = bch2_sb_get_replicas(sb);
+ if (!sb_r)
+ return NULL;
- gc_r = rcu_dereference_protected(c->replicas_gc,
- lockdep_is_held(&c->sb_lock));
- if (gc_r &&
- !replicas_has_extent(gc_r, e, data_type)) {
- ret = bch2_update_gc_replicas(c, gc_r, e, data_type);
- if (ret)
+ for_each_replicas_entry(sb_r, e) {
+ err = "invalid replicas entry: invalid data type";
+ if (e->data_type >= BCH_DATA_NR)
goto err;
- }
-
- /* recheck, might have raced */
- if (bch2_sb_has_replicas(c, e, data_type)) {
- mutex_unlock(&c->sb_lock);
- return 0;
- }
- new_entry_bytes = sizeof(struct bch_replicas_entry) +
- bch2_extent_nr_dirty_ptrs(e.s_c);
-
- sb_r = bch2_sb_get_replicas(c->disk_sb);
+ err = "invalid replicas entry: no devices";
+ if (!e->nr)
+ goto err;
- bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
+ err = "invalid replicas entry: too many devices";
+ if (e->nr >= BCH_REPLICAS_MAX)
+ goto err;
- new_u64s = DIV_ROUND_UP(bytes + new_entry_bytes, sizeof(u64));
+ err = "invalid replicas entry: invalid device";
+ for (i = 0; i < e->nr; i++)
+ if (!bch2_dev_exists(sb, mi, e->devs[i]))
+ goto err;
+ }
- sb_r = bch2_fs_sb_resize_replicas(c,
- DIV_ROUND_UP(sizeof(*sb_r) + bytes + new_entry_bytes,
- sizeof(u64)));
- if (!sb_r) {
- ret = -ENOSPC;
+ err = "cannot allocate memory";
+ cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
+ if (!cpu_r)
goto err;
- }
- new_entry = (void *) sb_r + bytes;
- new_entry->data_type = data_type;
- new_entry->nr = 0;
+ sort_cmp_size(cpu_r->entries,
+ cpu_r->nr,
+ cpu_r->entry_size,
+ memcmp, NULL);
+
+ for (i = 0; i + 1 < cpu_r->nr; i++) {
+ struct bch_replicas_cpu_entry *l =
+ cpu_replicas_entry(cpu_r, i);
+ struct bch_replicas_cpu_entry *r =
+ cpu_replicas_entry(cpu_r, i + 1);
- extent_for_each_ptr(e, ptr)
- if (!ptr->cached)
- new_entry->devs[new_entry->nr++] = ptr->dev;
+ BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
- ret = bch2_sb_replicas_to_cpu_replicas(c);
- if (ret) {
- memset(new_entry, 0,
- vstruct_end(&sb_r->field) - (void *) new_entry);
- goto err;
+ err = "duplicate replicas entry";
+ if (!memcmp(l, r, cpu_r->entry_size))
+ goto err;
}
- bch2_write_super(c);
+ err = NULL;
err:
- mutex_unlock(&c->sb_lock);
- return ret;
+ kfree(cpu_r);
+ return err;
}
-int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
+/* Query replicas: */
+
+bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
enum bch_data_type data_type)
{
- struct bch_replicas_cpu *gc_r;
- bool marked;
+ struct bch_replicas_cpu_entry search;
+ unsigned max_dev;
+ bool ret;
+
+ if (!bkey_to_replicas(e, data_type, &search, &max_dev))
+ return true;
rcu_read_lock();
- marked = replicas_has_extent(rcu_dereference(c->replicas),
- e, data_type) &&
- (!(gc_r = rcu_dereference(c->replicas_gc)) ||
- replicas_has_extent(gc_r, e, data_type));
+ ret = replicas_has_entry(rcu_dereference(c->replicas),
+ search, max_dev);
rcu_read_unlock();
- if (marked)
- return 0;
-
- return bch2_check_mark_super_slowpath(c, e, data_type);
+ return ret;
}
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
- struct bch_devs_mask online_devs)
+ struct bch_devs_mask online_devs)
{
+ struct bch_sb_field_members *mi;
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
unsigned i, dev, dev_slots, nr_online, nr_offline;
@@ -1137,14 +1334,15 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
ret.replicas[i].nr_online = UINT_MAX;
+ mi = bch2_sb_get_members(c->disk_sb);
rcu_read_lock();
- r = rcu_dereference(c->replicas);
- dev_slots = min_t(unsigned, replicas_dev_slots(r), c->sb.nr_devices);
- for (i = 0; i < r->nr; i++) {
- e = cpu_replicas_entry(r, i);
+ r = rcu_dereference(c->replicas);
+ dev_slots = replicas_dev_slots(r);
- BUG_ON(e->data_type >= ARRAY_SIZE(ret.replicas));
+ for_each_cpu_replicas_entry(r, e) {
+ if (e->data_type >= ARRAY_SIZE(ret.replicas))
+ panic("e %p data_type %u\n", e, e->data_type);
nr_online = nr_offline = 0;
@@ -1152,6 +1350,8 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
if (!replicas_test_dev(e, dev))
continue;
+ BUG_ON(!bch2_dev_exists(c->disk_sb, mi, dev));
+
if (test_bit(dev, online_devs.d))
nr_online++;
else
@@ -1216,7 +1416,7 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
- unsigned i, ret = 0;
+ unsigned ret = 0;
rcu_read_lock();
r = rcu_dereference(c->replicas);
@@ -1224,191 +1424,13 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
if (ca->dev_idx >= replicas_dev_slots(r))
goto out;
- for (i = 0; i < r->nr; i++) {
- e = cpu_replicas_entry(r, i);
-
+ for_each_cpu_replicas_entry(r, e)
if (replicas_test_dev(e, ca->dev_idx)) {
ret |= 1 << e->data_type;
break;
}
- }
out:
rcu_read_unlock();
return ret;
}
-
-static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
-{
- struct bch_sb_field_members *mi;
- struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_cpu *cpu_r = NULL;
- struct bch_replicas_entry *e;
- const char *err;
- unsigned i;
-
- mi = bch2_sb_get_members(sb);
- sb_r = bch2_sb_get_replicas(sb);
- if (!sb_r)
- return NULL;
-
- for_each_replicas_entry(sb_r, e) {
- err = "invalid replicas entry: invalid data type";
- if (e->data_type >= BCH_DATA_NR)
- goto err;
-
- err = "invalid replicas entry: too many devices";
- if (e->nr >= BCH_REPLICAS_MAX)
- goto err;
-
- err = "invalid replicas entry: invalid device";
- for (i = 0; i < e->nr; i++)
- if (!bch2_dev_exists(sb, mi, e->devs[i]))
- goto err;
- }
-
- err = "cannot allocate memory";
- cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
- if (!cpu_r)
- goto err;
-
- sort_cmp_size(cpu_r->entries,
- cpu_r->nr,
- cpu_r->entry_size,
- memcmp, NULL);
-
- for (i = 0; i + 1 < cpu_r->nr; i++) {
- struct bch_replicas_cpu_entry *l =
- cpu_replicas_entry(cpu_r, i);
- struct bch_replicas_cpu_entry *r =
- cpu_replicas_entry(cpu_r, i + 1);
-
- BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
-
- err = "duplicate replicas entry";
- if (!memcmp(l, r, cpu_r->entry_size))
- goto err;
- }
-
- err = NULL;
-err:
- kfree(cpu_r);
- return err;
-}
-
-int bch2_replicas_gc_end(struct bch_fs *c, int err)
-{
- struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_cpu *r, *old_r;
- struct bch_replicas_entry *dst_e;
- size_t i, j, bytes, dev_slots;
- int ret = 0;
-
- lockdep_assert_held(&c->replicas_gc_lock);
-
- mutex_lock(&c->sb_lock);
-
- r = rcu_dereference_protected(c->replicas_gc,
- lockdep_is_held(&c->sb_lock));
-
- if (err) {
- rcu_assign_pointer(c->replicas_gc, NULL);
- kfree_rcu(r, rcu);
- goto err;
- }
-
- dev_slots = replicas_dev_slots(r);
-
- bytes = sizeof(struct bch_sb_field_replicas);
-
- for (i = 0; i < r->nr; i++) {
- struct bch_replicas_cpu_entry *e =
- cpu_replicas_entry(r, i);
-
- bytes += sizeof(struct bch_replicas_entry);
- for (j = 0; j < r->entry_size - 1; j++)
- bytes += hweight8(e->devs[j]);
- }
-
- sb_r = bch2_fs_sb_resize_replicas(c,
- DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
- if (!sb_r) {
- ret = -ENOSPC;
- goto err;
- }
-
- memset(&sb_r->entries, 0,
- vstruct_end(&sb_r->field) -
- (void *) &sb_r->entries);
-
- dst_e = sb_r->entries;
- for (i = 0; i < r->nr; i++) {
- struct bch_replicas_cpu_entry *src_e =
- cpu_replicas_entry(r, i);
-
- dst_e->data_type = src_e->data_type;
-
- for (j = 0; j < dev_slots; j++)
- if (replicas_test_dev(src_e, j))
- dst_e->devs[dst_e->nr++] = j;
-
- dst_e = replicas_entry_next(dst_e);
- }
-
- old_r = rcu_dereference_protected(c->replicas,
- lockdep_is_held(&c->sb_lock));
- rcu_assign_pointer(c->replicas, r);
- rcu_assign_pointer(c->replicas_gc, NULL);
- kfree_rcu(old_r, rcu);
-
- bch2_write_super(c);
-err:
- mutex_unlock(&c->sb_lock);
- return ret;
-}
-
-int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
-{
- struct bch_replicas_cpu *r, *src;
- unsigned i;
-
- lockdep_assert_held(&c->replicas_gc_lock);
-
- mutex_lock(&c->sb_lock);
- BUG_ON(c->replicas_gc);
-
- src = rcu_dereference_protected(c->replicas,
- lockdep_is_held(&c->sb_lock));
-
- r = kzalloc(sizeof(struct bch_replicas_cpu) +
- src->nr * src->entry_size, GFP_NOIO);
- if (!r) {
- mutex_unlock(&c->sb_lock);
- return -ENOMEM;
- }
-
- r->entry_size = src->entry_size;
- r->nr = 0;
-
- for (i = 0; i < src->nr; i++) {
- struct bch_replicas_cpu_entry *dst_e =
- cpu_replicas_entry(r, r->nr);
- struct bch_replicas_cpu_entry *src_e =
- cpu_replicas_entry(src, i);
-
- if (!(src_e->data_type & typemask)) {
- memcpy(dst_e, src_e, r->entry_size);
- r->nr++;
- }
- }
-
- eytzinger0_sort(r->entries,
- r->nr,
- r->entry_size,
- memcmp, NULL);
-
- rcu_assign_pointer(c->replicas_gc, r);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
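
bch2_replicas_gc_start()/_end() are meant to be used by walking everything that
still references a given data type and re-marking it; entries of that type that
are not re-marked are dropped when gc ends. Both new callers in this patch
(bch2_journal_flush_device() and bch2_dev_metadata_drop()) follow the same
idiom, sketched here with c, devs and ret assumed in scope:

	mutex_lock(&c->replicas_gc_lock);
	bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);	/* type(s) being rebuilt */

	/* re-mark every replicas entry of that type still in use: */
	ret = bch2_check_mark_super_devlist(c, &devs, BCH_DATA_JOURNAL);

	bch2_replicas_gc_end(c, ret);	/* on success, swap in the pruned table */
	mutex_unlock(&c->replicas_gc_lock);
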
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index 8cafb301..4096efb2 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -125,23 +125,12 @@ void bch2_write_super(struct bch_fs *);
/* replicas: */
-/* iterate over bch_sb_field_replicas: */
-
-static inline struct bch_replicas_entry *
-replicas_entry_next(struct bch_replicas_entry *i)
-{
- return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
-}
-
-#define for_each_replicas_entry(_r, _i) \
- for (_i = (_r)->entries; \
- (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
- (_i) = replicas_entry_next(_i))
-
bool bch2_sb_has_replicas(struct bch_fs *, struct bkey_s_c_extent,
enum bch_data_type);
int bch2_check_mark_super(struct bch_fs *, struct bkey_s_c_extent,
enum bch_data_type);
+int bch2_check_mark_super_devlist(struct bch_fs *, struct bch_devs_list *,
+ enum bch_data_type);
struct replicas_status {
struct {
@@ -161,4 +150,17 @@ unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
+/* iterate over superblock replicas - used by userspace tools: */
+
+static inline struct bch_replicas_entry *
+replicas_entry_next(struct bch_replicas_entry *i)
+{
+ return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
+}
+
+#define for_each_replicas_entry(_r, _i) \
+ for (_i = (_r)->entries; \
+ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+ (_i) = replicas_entry_next(_i))
+
#endif /* _BCACHEFS_SUPER_IO_H */
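
Each on-disk replicas entry iterated by for_each_replicas_entry() is a data
type, a device count, and that many device indices (see
bch2_cpu_replicas_to_sb_replicas() in super-io.c). For example, user data
replicated across devices 1 and 3 would be stored as an entry equivalent to:

	/* illustration of one struct bch_replicas_entry: */
	.data_type	= BCH_DATA_USER,
	.nr		= 2,
	.devs		= { 1, 3 },
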
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 4e8b0a51..60a2d83e 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -140,8 +140,9 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
return c;
}
-int bch2_congested(struct bch_fs *c, int bdi_bits)
+int bch2_congested(void *data, int bdi_bits)
{
+ struct bch_fs *c = data;
struct backing_dev_info *bdi;
struct bch_dev *ca;
unsigned i;
@@ -178,13 +179,6 @@ int bch2_congested(struct bch_fs *c, int bdi_bits)
return ret;
}
-static int bch2_congested_fn(void *data, int bdi_bits)
-{
- struct bch_fs *c = data;
-
- return bch2_congested(c, bdi_bits);
-}
-
/* Filesystem RO/RW: */
/*
@@ -218,7 +212,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
*/
- bch2_journal_flush_pins(&c->journal, U64_MAX);
+ bch2_journal_flush_all_pins(&c->journal);
if (!bch2_journal_error(&c->journal))
bch2_btree_verify_flushed(c);
@@ -379,8 +373,6 @@ static void bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
- if (c->bdi.bdi_list.next)
- bdi_destroy(&c->bdi);
lg_lock_free(&c->usage_lock);
free_percpu(c->usage_percpu);
mempool_exit(&c->btree_bounce_pool);
@@ -393,7 +385,7 @@ static void bch2_fs_free(struct bch_fs *c)
mempool_exit(&c->btree_reserve_pool);
mempool_exit(&c->fill_iter);
percpu_ref_exit(&c->writes);
- kfree(c->replicas);
+ kfree(rcu_dereference_protected(c->replicas, 1));
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
@@ -414,7 +406,7 @@ static void bch2_fs_exit(struct bch_fs *c)
for (i = 0; i < c->sb.nr_devices; i++)
if (c->devs[i])
- bch2_dev_free(c->devs[i]);
+ bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
closure_debug_destroy(&c->cl);
kobject_put(&c->kobj);
@@ -576,10 +568,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
sizeof(struct btree_update)) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
bioset_init(&c->btree_read_bio, 1,
- offsetof(struct btree_read_bio, bio)) ||
- bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
- bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
- bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
+ offsetof(struct btree_read_bio, bio),
+ BIOSET_NEED_BVECS) ||
+ bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS) ||
+ bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS) ||
+ bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+ BIOSET_NEED_BVECS) ||
mempool_init_page_pool(&c->bio_bounce_pages,
max_t(unsigned,
c->opts.btree_node_size,
@@ -588,7 +584,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
lg_lock_init(&c->usage_lock) ||
mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
- bdi_setup_and_register(&c->bdi, "bcachefs") ||
bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||
@@ -599,10 +594,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_fsio_init(c))
goto err;
- c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
- c->bdi.congested_fn = bch2_congested_fn;
- c->bdi.congested_data = c;
-
mi = bch2_sb_get_members(c->disk_sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb, mi, i) &&
@@ -729,8 +720,12 @@ static const char *__bch2_fs_start(struct bch_fs *c)
continue;
err = "error reading btree root";
- if (bch2_btree_root_read(c, i, k, level))
- goto err;
+ if (bch2_btree_root_read(c, i, k, level)) {
+ if (i != BTREE_ID_ALLOC)
+ goto err;
+
+ mustfix_fsck_err(c, "error reading btree root");
+ }
}
err = "error reading allocation information";
@@ -830,7 +825,7 @@ static const char *__bch2_fs_start(struct bch_fs *c)
closure_sync(&cl);
bch2_inode_init(c, &inode, 0, 0,
- S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
+ S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
inode.bi_inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(&packed_inode, &inode);
@@ -877,6 +872,7 @@ out:
bch2_journal_entries_free(&journal);
return err;
err:
+fsck_err:
closure_sync(&cl);
switch (ret) {
@@ -995,24 +991,20 @@ static void bch2_dev_free(struct bch_dev *ca)
kobject_put(&ca->kobj);
}
-static void bch2_dev_io_ref_release(struct percpu_ref *ref)
-{
- struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
-
- complete(&ca->offline_complete);
-}
-
static void __bch2_dev_offline(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
lockdep_assert_held(&c->state_lock);
+ if (percpu_ref_is_zero(&ca->io_ref))
+ return;
+
__bch2_dev_read_only(c, ca);
- reinit_completion(&ca->offline_complete);
+ reinit_completion(&ca->io_ref_completion);
percpu_ref_kill(&ca->io_ref);
- wait_for_completion(&ca->offline_complete);
+ wait_for_completion(&ca->io_ref_completion);
if (ca->kobj.state_in_sysfs) {
struct kobject *block =
@@ -1026,27 +1018,18 @@ static void __bch2_dev_offline(struct bch_dev *ca)
bch2_dev_journal_exit(ca);
}
-static void bch2_dev_ref_release(struct percpu_ref *ref)
+static void bch2_dev_ref_complete(struct percpu_ref *ref)
{
struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
- complete(&ca->stop_complete);
+ complete(&ca->ref_completion);
}
-static void bch2_dev_stop(struct bch_dev *ca)
+static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
{
- struct bch_fs *c = ca->fs;
-
- lockdep_assert_held(&c->state_lock);
-
- BUG_ON(rcu_access_pointer(c->devs[ca->dev_idx]) != ca);
- rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
-
- synchronize_rcu();
+ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
- reinit_completion(&ca->stop_complete);
- percpu_ref_kill(&ca->ref);
- wait_for_completion(&ca->stop_complete);
+ complete(&ca->io_ref_completion);
}
static int bch2_dev_sysfs_online(struct bch_dev *ca)
@@ -1095,8 +1078,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
return -ENOMEM;
kobject_init(&ca->kobj, &bch2_dev_ktype);
- init_completion(&ca->stop_complete);
- init_completion(&ca->offline_complete);
+ init_completion(&ca->ref_completion);
+ init_completion(&ca->io_ref_completion);
ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d);
@@ -1132,9 +1115,9 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
DIV_ROUND_UP(BTREE_NODE_RESERVE,
ca->mi.bucket_size / c->opts.btree_node_size);
- if (percpu_ref_init(&ca->ref, bch2_dev_ref_release,
+ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) ||
- percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_release,
+ percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_BTREE], btree_node_reserve_buckets,
GFP_KERNEL) ||
@@ -1155,7 +1138,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
GFP_KERNEL|__GFP_ZERO)) ||
!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
bioset_init(&ca->replica_set, 4,
- offsetof(struct bch_write_bio, bio)) ||
+ offsetof(struct bch_write_bio, bio), 0) ||
!(ca->io_done = alloc_percpu(*ca->io_done)))
goto err;
@@ -1180,8 +1163,6 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
struct bch_dev *ca;
int ret;
- lockdep_assert_held(&c->sb_lock);
-
if (le64_to_cpu(sb->sb->seq) >
le64_to_cpu(c->disk_sb->seq))
bch2_sb_to_fs(c, sb->sb);
@@ -1189,13 +1170,15 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
!c->devs[sb->sb->dev_idx]);
- ca = c->devs[sb->sb->dev_idx];
+ ca = bch_dev_locked(c, sb->sb->dev_idx);
if (ca->disk_sb.bdev) {
bch_err(c, "already have device online in slot %u",
sb->sb->dev_idx);
return -EINVAL;
}
+ BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
+
ret = bch2_dev_journal_init(ca, sb->sb);
if (ret)
return ret;
@@ -1222,7 +1205,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
if (bch2_dev_sysfs_online(ca))
pr_warn("error creating sysfs objects");
- bch2_mark_dev_superblock(c, ca, 0);
+ bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
if (ca->mi.state == BCH_MEMBER_STATE_RW)
bch2_dev_allocator_add(c, ca);
@@ -1293,6 +1276,7 @@ static bool bch2_fs_may_start(struct bch_fs *c)
{
struct replicas_status s;
struct bch_sb_field_members *mi;
+ struct bch_dev *ca;
unsigned i, flags = c->opts.degraded
? BCH_FORCE_IF_DEGRADED
: 0;
@@ -1301,14 +1285,19 @@ static bool bch2_fs_may_start(struct bch_fs *c)
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb);
- for (i = 0; i < c->disk_sb->nr_devices; i++)
- if (bch2_dev_exists(c->disk_sb, mi, i) &&
- !bch2_dev_is_online(c->devs[i]) &&
- (c->devs[i]->mi.state == BCH_MEMBER_STATE_RW ||
- c->devs[i]->mi.state == BCH_MEMBER_STATE_RO)) {
+ for (i = 0; i < c->disk_sb->nr_devices; i++) {
+ if (!bch2_dev_exists(c->disk_sb, mi, i))
+ continue;
+
+ ca = bch_dev_locked(c, i);
+
+ if (!bch2_dev_is_online(ca) &&
+ (ca->mi.state == BCH_MEMBER_STATE_RW ||
+ ca->mi.state == BCH_MEMBER_STATE_RO)) {
mutex_unlock(&c->sb_lock);
return false;
}
+ }
mutex_unlock(&c->sb_lock);
}
@@ -1419,22 +1408,59 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
*
* flag_data_bad() does not check btree pointers
*/
- ret = bch2_flag_data_bad(ca);
+ ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
if (ret) {
- bch_err(ca, "Remove failed");
+ bch_err(ca, "Remove failed: error %i dropping data", ret);
+ goto err;
+ }
+
+ ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
+ if (ret) {
+ bch_err(ca, "Remove failed: error %i flushing journal", ret);
goto err;
}
data = bch2_dev_has_data(c, ca);
if (data) {
- bch_err(ca, "Remove failed, still has data (%x)", data);
+ char data_has_str[100];
+ bch2_scnprint_flag_list(data_has_str,
+ sizeof(data_has_str),
+ bch2_data_types,
+ data);
+ bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
+ ret = -EBUSY;
goto err;
}
- bch2_journal_meta(&c->journal);
+ ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
+ POS(ca->dev_idx, 0),
+ POS(ca->dev_idx + 1, 0),
+ ZERO_VERSION,
+ NULL, NULL, NULL);
+ if (ret) {
+ bch_err(ca, "Remove failed, error deleting alloc info");
+ goto err;
+ }
+
+ /*
+ * must flush all existing journal entries; they might have
+ * (overwritten) keys that point to the device we're removing:
+ */
+ ret = bch2_journal_flush_all_pins(&c->journal);
+ if (ret) {
+ bch_err(ca, "Remove failed, journal error");
+ goto err;
+ }
__bch2_dev_offline(ca);
- bch2_dev_stop(ca);
+
+ mutex_lock(&c->sb_lock);
+ rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
+ mutex_unlock(&c->sb_lock);
+
+ percpu_ref_kill(&ca->ref);
+ wait_for_completion(&ca->ref_completion);
+
bch2_dev_free(ca);
/*
@@ -1542,7 +1568,7 @@ have_slot:
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- ca = c->devs[dev_idx];
+ ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = "journal alloc failed";
if (bch2_dev_journal_alloc(ca))
@@ -1568,7 +1594,7 @@ err:
/* Hot add existing device to running filesystem: */
int bch2_dev_online(struct bch_fs *c, const char *path)
{
- struct bch_sb_handle sb = { 0 };
+ struct bch_sb_handle sb = { NULL };
struct bch_dev *ca;
unsigned dev_idx;
const char *err;
@@ -1593,7 +1619,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
}
mutex_unlock(&c->sb_lock);
- ca = c->devs[dev_idx];
+ ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca);
if (err)
@@ -1619,7 +1645,6 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
return -EINVAL;
}
- __bch2_dev_read_only(c, ca);
__bch2_dev_offline(ca);
mutex_unlock(&c->state_lock);
@@ -1629,37 +1654,31 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
{
unsigned data;
- int ret;
+ int ret = 0;
mutex_lock(&c->state_lock);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
bch_err(ca, "Cannot migrate data off RW device");
- mutex_unlock(&c->state_lock);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
- mutex_unlock(&c->state_lock);
-
- ret = bch2_move_data_off_device(ca);
+ ret = bch2_dev_data_migrate(c, ca, 0);
if (ret) {
bch_err(ca, "Error migrating data: %i", ret);
- return ret;
- }
-
- ret = bch2_move_metadata_off_device(ca);
- if (ret) {
- bch_err(ca, "Error migrating metadata: %i", ret);
- return ret;
+ goto err;
}
data = bch2_dev_has_data(c, ca);
if (data) {
bch_err(ca, "Migrate error: data still present (%x)", data);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
-
- return 0;
+err:
+ mutex_unlock(&c->state_lock);
+ return ret;
}
/* Filesystem open: */
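The super.c changes above replace the old stop_complete/offline_complete pair with one completion per percpu_ref (ref_completion for ca->ref, io_ref_completion for ca->io_ref), so teardown can drain the I/O references and the long-lived device reference independently. A minimal sketch of that kill-and-wait pattern, with hypothetical names (my_dev, my_dev_ref_complete, my_dev_stop) rather than the bcachefs structures:

/*
 * Sketch of the percpu_ref + completion teardown pattern used above.
 * Only percpu_ref_*(), the completion API and container_of() are real
 * kernel interfaces; the my_dev names are invented for illustration.
 */
#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/completion.h>
#include <linux/gfp.h>

struct my_dev {
        struct percpu_ref       ref;
        struct completion       ref_completion;
};

/* release callback: runs once the last reference is dropped after kill */
static void my_dev_ref_complete(struct percpu_ref *ref)
{
        struct my_dev *d = container_of(ref, struct my_dev, ref);

        complete(&d->ref_completion);
}

static int my_dev_init(struct my_dev *d)
{
        init_completion(&d->ref_completion);
        return percpu_ref_init(&d->ref, my_dev_ref_complete, 0, GFP_KERNEL);
}

static void my_dev_stop(struct my_dev *d)
{
        percpu_ref_kill(&d->ref);                /* no new references */
        wait_for_completion(&d->ref_completion); /* existing ones drain */
        percpu_ref_exit(&d->ref);
}

bch2_dev_remove() above follows the same ordering: clear the published c->devs[] pointer under sb_lock, then percpu_ref_kill(&ca->ref) and wait_for_completion(&ca->ref_completion) before bch2_dev_free().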
diff --git a/libbcachefs/super.h b/libbcachefs/super.h
index eb1d2f3d..7ebe5981 100644
--- a/libbcachefs/super.h
+++ b/libbcachefs/super.h
@@ -59,6 +59,14 @@ static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
}
}
+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
+ unsigned dev)
+{
+ BUG_ON(bch2_dev_list_has_dev(*devs, dev));
+ BUG_ON(devs->nr >= BCH_REPLICAS_MAX);
+ devs->devs[devs->nr++] = dev;
+}
+
static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
struct bch_devs_mask *mask)
{
@@ -131,6 +139,26 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
__for_each_online_member(ca, c, iter, \
(1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO))
+/*
+ * If a key exists that references a device, the device won't be going away and
+ * we can omit rcu_read_lock():
+ */
+static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
+{
+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+ return rcu_dereference_check(c->devs[idx], 1);
+}
+
+static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
+{
+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+ return rcu_dereference_protected(c->devs[idx],
+ lockdep_is_held(&c->sb_lock) ||
+ lockdep_is_held(&c->state_lock));
+}
+
/* XXX kill, move to struct bch_fs */
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
{
@@ -146,7 +174,7 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
struct bch_fs *bch2_bdev_to_fs(struct block_device *);
struct bch_fs *bch2_uuid_to_fs(uuid_le);
-int bch2_congested(struct bch_fs *, int);
+int bch2_congested(void *, int);
bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
enum bch_member_state, int);
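The new bch_dev_bkey_exists()/bch_dev_locked() helpers above encode the two legal ways of dereferencing c->devs[] without rcu_read_lock(): an existing bkey pins the device (so the lockdep check condition is simply 1), or the caller holds sb_lock or state_lock, which rcu_dereference_protected() lets lockdep verify. A generic sketch of the same accessor split for an RCU-managed pointer array, with hypothetical names (my_table, my_item):

/*
 * Sketch only: my_table/my_item are invented; rcu_dereference_check(),
 * rcu_dereference_protected() and lockdep_is_held() are the real APIs.
 */
#include <linux/rcupdate.h>
#include <linux/mutex.h>

struct my_item;

struct my_table {
        struct mutex lock;
        struct my_item __rcu *slots[16];
};

/* caller holds something (e.g. an on-disk reference) that pins the slot */
static inline struct my_item *my_table_get_pinned(struct my_table *t, unsigned i)
{
        return rcu_dereference_check(t->slots[i], 1);
}

/* caller holds the table lock, which lockdep can verify */
static inline struct my_item *my_table_get_locked(struct my_table *t, unsigned i)
{
        return rcu_dereference_protected(t->slots[i],
                                         lockdep_is_held(&t->lock));
}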
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 35f1e561..3197a2e4 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -739,7 +739,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
c->open_buckets_wait.list.first ? "waiting" : "empty");
}
-const char * const bch2_rw[] = {
+static const char * const bch2_rw[] = {
"read",
"write",
NULL
diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c
index 2e29f741..f5007864 100644
--- a/libbcachefs/tier.c
+++ b/libbcachefs/tier.c
@@ -6,7 +6,6 @@
#include "clock.h"
#include "extents.h"
#include "io.h"
-#include "keylist.h"
#include "move.h"
#include "super-io.h"
#include "tier.h"
@@ -28,7 +27,7 @@ static bool tiering_pred(void *arg, struct bkey_s_c_extent e)
return false;
extent_for_each_ptr(e, ptr)
- if (c->devs[ptr->dev]->mi.tier >= tier->idx)
+ if (bch_dev_bkey_exists(c, ptr->dev)->mi.tier >= tier->idx)
replicas++;
return replicas < c->opts.data_replicas;
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index a251bf9c..6e97e831 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -34,8 +34,12 @@ struct closure;
#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0)
-#define memcpy(_dst, _src, _len) \
+#define memcpy(dst, src, len) \
({ \
+ void *_dst = (dst); \
+ const void *_src = (src); \
+ size_t _len = (len); \
+ \
BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \
(void *) (_dst) + (_len) <= (void *) (_src))); \
memcpy(_dst, _src, _len); \
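The util.h hunk above makes the overlap-asserting memcpy() wrapper evaluate dst, src and len exactly once, instead of re-expanding each argument inside both the BUG_ON() and the real memcpy(). A small userspace illustration of the hazard this avoids, with assert() standing in for BUG_ON() and a hypothetical memcpy_checked() macro:

/* Userspace illustration of single-evaluation macro arguments. */
#include <assert.h>
#include <stdio.h>
#include <string.h>

#define memcpy_checked(dst, src, len)                                   \
({                                                                      \
        void *_dst = (dst);                                             \
        const void *_src = (src);                                       \
        size_t _len = (len);                                            \
                                                                        \
        /* destination and source ranges must not overlap */           \
        assert((const char *) _dst >= (const char *) _src + _len ||     \
               (const char *) _dst + _len <= (const char *) _src);      \
        memcpy(_dst, _src, _len);                                       \
})

int main(void)
{
        char buf[2][8] = { "source" };
        int i = 0;

        /* buf[i++] is evaluated once, not once per mention in the macro */
        memcpy_checked(buf[1], buf[i++], sizeof(buf[0]));
        printf("%s %d\n", buf[1], i);   /* prints "source 1" */
        return 0;
}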
diff --git a/libbcachefs/vstructs.h b/libbcachefs/vstructs.h
index ce2cece0..79566442 100644
--- a/libbcachefs/vstructs.h
+++ b/libbcachefs/vstructs.h
@@ -9,10 +9,10 @@
*/
#define __vstruct_u64s(_s) \
({ \
- ( type_is((_s)->u64s, u64) ? le64_to_cpu((_s)->u64s) \
- : type_is((_s)->u64s, u32) ? le32_to_cpu((_s)->u64s) \
- : type_is((_s)->u64s, u16) ? le16_to_cpu((_s)->u64s) \
- : ((_s)->u64s)); \
+ ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \
+ : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \
+ : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \
+ : ((__force u8) ((_s)->u64s))); \
})
#define __vstruct_bytes(_type, _u64s) \
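__vstruct_u64s() above reads a structure's u64s field whatever its declared width (__le16, __le32, __le64 or plain u8) and applies the matching le*_to_cpu() conversion; the __force casts added here only silence sparse's bitwise-type warnings and change no behaviour. type_is() is presumably built on __builtin_types_compatible_p(); below is a portable userspace sketch of the same "dispatch on the field's declared type" idea, using C11 _Generic and glibc's le*toh() helpers (on_disk_hdr is an invented example type):

/* Userspace sketch of type-directed endian conversion, not bcachefs code. */
#define _DEFAULT_SOURCE         /* for le16toh() and friends on glibc */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

#define u64s_to_cpu(_v)                         \
        _Generic((_v),                          \
                 uint64_t: le64toh(_v),         \
                 uint32_t: le32toh(_v),         \
                 uint16_t: le16toh(_v))

struct on_disk_hdr {
        uint16_t u64s;  /* payload length in 64-bit words, little endian */
};

int main(void)
{
        struct on_disk_hdr h = { .u64s = htole16(5) };

        printf("payload bytes: %zu\n",
               (size_t) u64s_to_cpu(h.u64s) * sizeof(uint64_t));
        return 0;
}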
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index 3a49d728..1d6cbe72 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_update.h"
+#include "compress.h"
#include "extents.h"
#include "fs.h"
#include "str_hash.h"
@@ -358,25 +359,139 @@ static const struct xattr_handler bch_xattr_security_handler = {
.flags = BCH_XATTR_INDEX_SECURITY,
};
-static const struct xattr_handler *bch_xattr_handler_map[] = {
- [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler,
- [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] =
- &posix_acl_access_xattr_handler,
- [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT] =
- &posix_acl_default_xattr_handler,
- [BCH_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler,
- [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
+#ifndef NO_BCACHEFS_FS
+
+static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, void *buffer, size_t size)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_opts opts =
+ bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode));
+ const struct bch_option *opt;
+ int ret, id;
+ u64 v;
+
+ id = bch2_opt_lookup(name);
+ if (id < 0 || !bch2_opt_is_inode_opt(id))
+ return -EINVAL;
+
+ opt = bch2_opt_table + id;
+
+ if (!bch2_opt_defined_by_id(&opts, id))
+ return -ENODATA;
+
+ v = bch2_opt_get_by_id(&opts, id);
+
+ if (opt->type == BCH_OPT_STR)
+ ret = snprintf(buffer, size, "%s", opt->choices[v]);
+ else
+ ret = snprintf(buffer, size, "%llu", v);
+
+ return ret <= size || !buffer ? ret : -ERANGE;
+}
+
+struct inode_opt_set {
+ int id;
+ u64 v;
+ bool defined;
};
+static int inode_opt_set_fn(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct inode_opt_set *s = p;
+
+ if (s->defined)
+ bch2_inode_opt_set(bi, s->id, s->v);
+ else
+ bch2_inode_opt_clear(bi, s->id);
+ return 0;
+}
+
+static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ const struct bch_option *opt;
+ char *buf;
+ struct inode_opt_set s;
+ int ret;
+
+ s.id = bch2_opt_lookup(name);
+ if (s.id < 0 || !bch2_opt_is_inode_opt(s.id))
+ return -EINVAL;
+
+ opt = bch2_opt_table + s.id;
+
+ if (value) {
+ buf = kmalloc(size + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ memcpy(buf, value, size);
+ buf[size] = '\0';
+
+ ret = bch2_opt_parse(opt, buf, &s.v);
+ kfree(buf);
+
+ if (ret < 0)
+ return ret;
+
+ if (s.id == Opt_compression) {
+ mutex_lock(&c->sb_lock);
+ ret = bch2_check_set_has_compressed_data(c, s.v);
+ mutex_unlock(&c->sb_lock);
+
+ if (ret)
+ return ret;
+ }
+
+ s.defined = true;
+ } else {
+ s.defined = false;
+ }
+
+ mutex_lock(&inode->ei_update_lock);
+ ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s);
+ mutex_unlock(&inode->ei_update_lock);
+
+ return ret;
+}
+
+static const struct xattr_handler bch_xattr_bcachefs_handler = {
+ .prefix = "bcachefs.",
+ .get = bch2_xattr_bcachefs_get,
+ .set = bch2_xattr_bcachefs_set,
+};
+
+#endif /* NO_BCACHEFS_FS */
+
const struct xattr_handler *bch2_xattr_handlers[] = {
&bch_xattr_user_handler,
&posix_acl_access_xattr_handler,
&posix_acl_default_xattr_handler,
&bch_xattr_trusted_handler,
&bch_xattr_security_handler,
+#ifndef NO_BCACHEFS_FS
+ &bch_xattr_bcachefs_handler,
+#endif
NULL
};
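The bch_xattr_bcachefs_handler added above exposes per-inode IO options (the bch2_opt_is_inode_opt() subset, e.g. Opt_compression) through extended attributes under a "bcachefs." prefix: the name after the prefix is resolved with bch2_opt_lookup(), the value is parsed with bch2_opt_parse(), and the result is written back via __bch2_write_inode(). A hedged userspace sketch of driving that interface with the standard xattr syscalls; the exact attribute name ("bcachefs.compression") and value ("lz4") are assumptions inferred from the prefix and option table, not taken from this diff:

/*
 * Set and read back a per-inode option through the "bcachefs." xattr
 * namespace.  Attribute name and value are assumed; see above.
 */
#include <sys/types.h>
#include <sys/xattr.h>
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
        const char *path = argc > 1 ? argv[1] : ".";
        char buf[64];
        ssize_t len;

        if (setxattr(path, "bcachefs.compression", "lz4", strlen("lz4"), 0))
                perror("setxattr");

        len = getxattr(path, "bcachefs.compression", buf, sizeof(buf) - 1);
        if (len < 0) {
                perror("getxattr");
                return 1;
        }

        buf[len] = '\0';
        printf("compression = %s\n", buf);
        return 0;
}

Removing the attribute (removexattr(2)) reaches the handler's set() with a NULL value, which inode_opt_set_fn() above turns into bch2_inode_opt_clear(), so the inode falls back to the filesystem-wide default.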
+static const struct xattr_handler *bch_xattr_handler_map[] = {
+ [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler,
+ [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] =
+ &posix_acl_access_xattr_handler,
+ [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT] =
+ &posix_acl_default_xattr_handler,
+ [BCH_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler,
+ [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
+};
+
static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
{
return type < ARRAY_SIZE(bch_xattr_handler_map)