summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.bcachefs_revision2
-rw-r--r--.github/workflows/build-packages.yml2
-rw-r--r--Cargo.lock2
-rw-r--r--Cargo.toml2
-rw-r--r--Makefile2
-rw-r--r--c_src/cmd_dump.c372
-rw-r--r--c_src/cmd_image.c5
-rw-r--r--c_src/cmd_strip_alloc.c3
-rw-r--r--c_src/libbcachefs.c11
-rw-r--r--c_src/posix_to_bcachefs.c94
-rw-r--r--c_src/qcow2.c138
-rw-r--r--c_src/qcow2.h19
-rw-r--r--c_src/tools-util.c7
-rw-r--r--c_src/tools-util.h1
-rw-r--r--debian/changelog8
-rw-r--r--flake.lock42
-rw-r--r--flake.nix14
-rw-r--r--include/linux/blkdev.h2
-rw-r--r--libbcachefs/acl.c4
-rw-r--r--libbcachefs/alloc_background.c29
-rw-r--r--libbcachefs/bcachefs.h7
-rw-r--r--libbcachefs/btree_gc.c31
-rw-r--r--libbcachefs/btree_io.c24
-rw-r--r--libbcachefs/btree_iter.c5
-rw-r--r--libbcachefs/btree_key_cache.c8
-rw-r--r--libbcachefs/btree_node_scan.c23
-rw-r--r--libbcachefs/btree_trans_commit.c19
-rw-r--r--libbcachefs/btree_update_interior.c41
-rw-r--r--libbcachefs/clock.c6
-rw-r--r--libbcachefs/debug.c2
-rw-r--r--libbcachefs/dirent.h15
-rw-r--r--libbcachefs/ec.c3
-rw-r--r--libbcachefs/errcode.h8
-rw-r--r--libbcachefs/extents.c18
-rw-r--r--libbcachefs/extents.h2
-rw-r--r--libbcachefs/fs-io-direct.c2
-rw-r--r--libbcachefs/fs-io.c3
-rw-r--r--libbcachefs/fs.c8
-rw-r--r--libbcachefs/fsck.c62
-rw-r--r--libbcachefs/io_write.c7
-rw-r--r--libbcachefs/journal.c11
-rw-r--r--libbcachefs/journal.h2
-rw-r--r--libbcachefs/journal_io.c2
-rw-r--r--libbcachefs/journal_reclaim.c29
-rw-r--r--libbcachefs/lru.c10
-rw-r--r--libbcachefs/move.c112
-rw-r--r--libbcachefs/move.h12
-rw-r--r--libbcachefs/progress.c3
-rw-r--r--libbcachefs/progress.h3
-rw-r--r--libbcachefs/rebalance.c6
-rw-r--r--libbcachefs/recovery.c62
-rw-r--r--libbcachefs/recovery_passes.c40
-rw-r--r--libbcachefs/recovery_passes.h3
-rw-r--r--libbcachefs/sb-members_format.h2
-rw-r--r--libbcachefs/str_hash.c1
-rw-r--r--libbcachefs/super.c10
-rw-r--r--libbcachefs/sysfs.c8
-rw-r--r--libbcachefs/util.h7
-rw-r--r--linux/closure.c12
59 files changed, 913 insertions, 475 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 7abb0304..8f8e5f44 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-deeffbdc52f1092dadb3d523c4429e002c7fc485
+e54ff0aa96886b753343100125bd3dfab1a8e337
diff --git a/.github/workflows/build-packages.yml b/.github/workflows/build-packages.yml
index 052f366a..6610a50e 100644
--- a/.github/workflows/build-packages.yml
+++ b/.github/workflows/build-packages.yml
@@ -8,7 +8,7 @@ jobs:
name: bcachefs-tools-deb
strategy:
matrix:
- os: [ubuntu-22.04, ubuntu-24.04]
+ os: [ubuntu-24.04]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
diff --git a/Cargo.lock b/Cargo.lock
index 440e6133..0b63b629 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -68,7 +68,7 @@ checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6"
[[package]]
name = "bcachefs-tools"
-version = "1.25.2"
+version = "1.25.3"
dependencies = [
"anyhow",
"bch_bindgen",
diff --git a/Cargo.toml b/Cargo.toml
index ea82379c..39bf51bc 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,7 +4,7 @@ default-members = [".", "bch_bindgen"]
[package]
name = "bcachefs-tools"
-version = "1.25.2"
+version = "1.25.3"
authors = ["Yuxuan Shui <yshuiv7@gmail.com>", "Kayla Firestack <dev@kaylafire.me>", "Kent Overstreet <kent.overstreet@linux.dev>" ]
edition = "2021"
rust-version = "1.77.0"
diff --git a/Makefile b/Makefile
index c7795af9..b0e7e5d3 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-VERSION=1.25.2
+VERSION=1.25.3
PREFIX?=/usr/local
LIBEXECDIR?=$(PREFIX)/libexec
diff --git a/c_src/cmd_dump.c b/c_src/cmd_dump.c
index eb338858..08051802 100644
--- a/c_src/cmd_dump.c
+++ b/c_src/cmd_dump.c
@@ -14,71 +14,216 @@
#include "libbcachefs/btree_iter.h"
#include "libbcachefs/error.h"
#include "libbcachefs/extents.h"
+#include "libbcachefs/journal_io.h"
#include "libbcachefs/sb-members.h"
#include "libbcachefs/super.h"
-static void dump_usage(void)
-{
- puts("bcachefs dump - dump filesystem metadata\n"
- "Usage: bcachefs dump [OPTION]... <devices>\n"
- "\n"
- "Options:\n"
- " -o output Output qcow2 image(s)\n"
- " -f, --force Force; overwrite when needed\n"
- " --nojournal Don't dump entire journal, just dirty entries\n"
- " --noexcl Open devices with O_NOEXCL (not recommended)\n"
- " -h, --help Display this help and exit\n"
- "Report bugs to <linux-bcachefs@vger.kernel.org>");
-}
+struct dump_dev {
+ ranges sb, journal, btree;
+};
+typedef DARRAY(struct dump_dev) dump_devs;
-static void dump_node(struct bch_fs *c, struct bch_dev *ca, struct bkey_s_c k, ranges *data)
+static void dump_node(struct bch_fs *c, dump_devs *devs, struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned bytes = btree_ptr_sectors_written(k) << 9 ?: c->opts.btree_node_size;
bkey_for_each_ptr(ptrs, ptr)
- if (ptr->dev == ca->dev_idx)
- range_add(data, ptr->offset << 9, c->opts.btree_node_size);
+ range_add(&devs->data[ptr->dev].btree,
+ ptr->offset << 9, bytes);
}
-static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd,
- bool entire_journal)
+static void get_sb_journal(struct bch_fs *c, struct bch_dev *ca,
+ bool entire_journal,
+ struct dump_dev *d)
{
struct bch_sb *sb = ca->disk_sb.sb;
- ranges data = { 0 };
- unsigned i;
- int ret;
/* Superblock: */
- range_add(&data, BCH_SB_LAYOUT_SECTOR << 9,
+ range_add(&d->sb, BCH_SB_LAYOUT_SECTOR << 9,
sizeof(struct bch_sb_layout));
- for (i = 0; i < sb->layout.nr_superblocks; i++)
- range_add(&data,
+ for (unsigned i = 0; i < sb->layout.nr_superblocks; i++)
+ range_add(&d->sb,
le64_to_cpu(sb->layout.sb_offset[i]) << 9,
vstruct_bytes(sb));
/* Journal: */
- for (i = 0; i < ca->journal.nr; i++)
+ for (unsigned i = 0; i < ca->journal.nr; i++)
if (entire_journal ||
ca->journal.bucket_seq[i] >= c->journal.last_seq_ondisk) {
u64 bucket = ca->journal.buckets[i];
- range_add(&data,
+ range_add(&d->journal,
bucket_bytes(ca) * bucket,
bucket_bytes(ca));
}
+}
+
+struct dump_opts {
+ char *out;
+ bool force;
+ bool sanitize;
+ bool entire_journal;
+ bool noexcl;
+};
+
+static void sanitize_key(struct bkey_packed *k, struct bkey_format *f, void *end,
+ bool *modified)
+{
+ struct bch_val *v = bkeyp_val(f, k);
+ unsigned len = min_t(unsigned, end - (void *) v, bkeyp_val_bytes(f, k));
+
+ switch (k->type) {
+ case KEY_TYPE_inline_data: {
+ struct bch_inline_data *d = container_of(v, struct bch_inline_data, v);
+
+ memset(&d->data[0], 0, len - offsetof(struct bch_inline_data, data));
+ *modified = true;
+ break;
+ }
+ case KEY_TYPE_indirect_inline_data: {
+ struct bch_indirect_inline_data *d = container_of(v, struct bch_indirect_inline_data, v);
+
+ memset(&d->data[0], 0, len - offsetof(struct bch_indirect_inline_data, data));
+ *modified = true;
+ break;
+ }
+ }
+}
+
+static void sanitize_journal(struct bch_fs *c, void *buf, size_t len)
+{
+ struct bkey_format f = BKEY_FORMAT_CURRENT;
+ void *end = buf + len;
+
+ while (len) {
+ struct jset *j = buf;
+ bool modified = false;
+
+ if (le64_to_cpu(j->magic) != jset_magic(c))
+ break;
+
+ vstruct_for_each(j, i) {
+ if ((void *) i >= end)
+ break;
+
+ if (!jset_entry_is_key(i))
+ continue;
+
+ jset_entry_for_each_key(i, k) {
+ if ((void *) k >= end)
+ break;
+ if (!k->k.u64s)
+ break;
+ sanitize_key(bkey_to_packed(k), &f, end, &modified);
+ }
+ }
+
+ if (modified) {
+ memset(&j->csum, 0, sizeof(j->csum));
+ SET_JSET_CSUM_TYPE(j, 0);
+ }
+
+ unsigned b = min(len, vstruct_sectors(j, c->block_bits) << 9);
+ len -= b;
+ buf += b;
+ }
+}
+
+static void sanitize_btree(struct bch_fs *c, void *buf, size_t len)
+{
+ void *end = buf + len;
+ bool first = true;
+ struct bkey_format f_current = BKEY_FORMAT_CURRENT;
+ struct bkey_format f;
+ u64 seq;
+
+ while (len) {
+ unsigned sectors;
+ struct bset *i;
+ bool modified = false;
+
+ if (first) {
+ struct btree_node *bn = buf;
+
+ if (le64_to_cpu(bn->magic) != bset_magic(c))
+ break;
+
+ i = &bn->keys;
+ seq = bn->keys.seq;
+ f = bn->format;
+
+ sectors = vstruct_sectors(bn, c->block_bits);
+ } else {
+ struct btree_node_entry *bne = buf;
+
+ if (bne->keys.seq != seq)
+ break;
+
+ i = &bne->keys;
+ sectors = vstruct_sectors(bne, c->block_bits);
+ }
+
+ vstruct_for_each(i, k) {
+ if ((void *) k >= end)
+ break;
+ if (!k->u64s)
+ break;
+
+ sanitize_key(k, bkey_packed(k) ? &f : &f_current, end, &modified);
+ }
+
+ if (modified) {
+ if (first) {
+ struct btree_node *bn = buf;
+ memset(&bn->csum, 0, sizeof(bn->csum));
+ } else {
+ struct btree_node_entry *bne = buf;
+ memset(&bne->csum, 0, sizeof(bne->csum));
+ }
+ SET_BSET_CSUM_TYPE(i, 0);
+ }
+
+ first = false;
+
+ unsigned b = min(len, sectors << 9);
+ len -= b;
+ buf += b;
+ }
+}
+
+static int dump_fs(struct bch_fs *c, struct dump_opts opts)
+{
+ if (opts.sanitize)
+ printf("Sanitizing inline data extents\n");
+
+ dump_devs devs = {};
+ while (devs.nr < c->sb.nr_devices)
+ darray_push(&devs, (struct dump_dev) {});
+
+ down_read(&c->state_lock);
+
+ unsigned nr_online = 0;
+ for_each_online_member(c, ca, 0) {
+ if (opts.sanitize && ca->mi.bucket_size % block_sectors(c))
+ die("%s has unaligned buckets, cannot sanitize", ca->name);
+
+ get_sb_journal(c, ca, opts.entire_journal, &devs.data[ca->dev_idx]);
+ nr_online++;
+ }
- /* Btree: */
- for (i = 0; i < BTREE_ID_NR; i++) {
- struct btree_trans *trans = bch2_trans_get(c);
+ bch_verbose(c, "walking metadata to dump");
+ for (unsigned i = 0; i < BTREE_ID_NR; i++) {
+ CLASS(btree_trans, trans)(c);
- ret = __for_each_btree_node(trans, iter, i, POS_MIN, 0, 1, 0, b, ({
+ int ret = __for_each_btree_node(trans, iter, i, POS_MIN, 0, 1, 0, b, ({
struct btree_node_iter iter;
struct bkey u;
struct bkey_s_c k;
for_each_btree_node_key_unpack(b, k, &iter, &u)
- dump_node(c, ca, k, &data);
+ dump_node(c, &devs, k);
0;
}));
@@ -87,57 +232,135 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd,
struct btree *b = bch2_btree_id_root(c, i)->b;
if (!btree_node_fake(b))
- dump_node(c, ca, bkey_i_to_s_c(&b->key), &data);
+ dump_node(c, &devs, bkey_i_to_s_c(&b->key));
+ }
+
+ bch_verbose(c, "writing metadata image(s)");
+ for_each_online_member(c, ca, 0) {
+ int flags = O_WRONLY|O_CREAT|O_TRUNC;
+
+ if (!opts.force)
+ flags |= O_EXCL;
+
+ char *path = nr_online > 1
+ ? mprintf("%s.%u.qcow2", opts.out, ca->dev_idx)
+ : mprintf("%s.qcow2", opts.out);
+ int fd = xopen(path, flags, 0600);
+ free(path);
+
+ struct qcow2_image img;
+ qcow2_image_init(&img, ca->disk_sb.bdev->bd_fd, fd, c->opts.block_size);
+
+ struct dump_dev *d = &devs.data[ca->dev_idx];
+
+ qcow2_write_ranges(&img, &d->sb);
+
+ if (!opts.sanitize) {
+ qcow2_write_ranges(&img, &d->journal);
+ qcow2_write_ranges(&img, &d->btree);
+ } else {
+ ranges_sort(&d->journal);
+ ranges_sort(&d->btree);
+
+ u64 bucket_bytes = ca->mi.bucket_size << 9;
+ char *buf = xmalloc(bucket_bytes);
- bch2_trans_put(trans);
+ darray_for_each(d->journal, r) {
+ u64 len = r->end - r->start;
+ BUG_ON(len > bucket_bytes);
+
+ xpread(img.infd, buf, len, r->start);
+ sanitize_journal(c, buf, len);
+ qcow2_write_buf(&img, buf, len, r->start);
+ }
+
+ darray_for_each(d->btree, r) {
+ u64 len = r->end - r->start;
+ BUG_ON(len > bucket_bytes);
+
+ xpread(img.infd, buf, len, r->start);
+ sanitize_btree(c, buf, len);
+ qcow2_write_buf(&img, buf, len, r->start);
+ }
+ free(buf);
+ }
+
+ qcow2_image_finish(&img);
+ xclose(fd);
}
- qcow2_write_image(ca->disk_sb.bdev->bd_fd, fd, &data,
- max_t(unsigned, c->opts.btree_node_size / 8, block_bytes(c)));
- darray_exit(&data);
+ up_read(&c->state_lock);
+
+ bch2_fs_stop(c);
+
+ darray_for_each(devs, d) {
+ darray_exit(&d->sb);
+ darray_exit(&d->journal);
+ darray_exit(&d->btree);
+ }
+ darray_exit(&devs);
+ return 0;
+}
+
+static void dump_usage(void)
+{
+ puts("bcachefs dump - dump filesystem metadata\n"
+ "Usage: bcachefs dump [OPTION]... <devices>\n"
+ "\n"
+ "Options:\n"
+ " -o output Output qcow2 image(s)\n"
+ " -f, --force Force; overwrite when needed\n"
+ " -s, --sanitize Zero out inline data extents\n"
+ " --nojournal Don't dump entire journal, just dirty entries\n"
+ " --noexcl Open devices with O_NOEXCL (not recommended)\n"
+ " -v, --verbose\n"
+ " -h, --help Display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
}
int cmd_dump(int argc, char *argv[])
{
static const struct option longopts[] = {
{ "force", no_argument, NULL, 'f' },
+ { "sanitize", no_argument, NULL, 's' },
{ "nojournal", no_argument, NULL, 'j' },
{ "noexcl", no_argument, NULL, 'e' },
{ "verbose", no_argument, NULL, 'v' },
{ "help", no_argument, NULL, 'h' },
{ NULL }
};
- struct bch_opts opts = bch2_opts_empty();
- char *out = NULL;
- unsigned nr_devices = 0;
- bool force = false, entire_journal = true;
- int fd, opt;
-
- opt_set(opts, direct_io, false);
- opt_set(opts, read_only, true);
- opt_set(opts, nochanges, true);
- opt_set(opts, norecovery, true);
- opt_set(opts, degraded, BCH_DEGRADED_very);
- opt_set(opts, errors, BCH_ON_ERROR_continue);
- opt_set(opts, fix_errors, FSCK_FIX_no);
-
- while ((opt = getopt_long(argc, argv, "o:fvh",
+ struct bch_opts fs_opts = bch2_opts_empty();
+ struct dump_opts opts = { .entire_journal = true };
+ int opt;
+
+ opt_set(fs_opts, direct_io, false);
+ opt_set(fs_opts, read_only, true);
+ opt_set(fs_opts, nochanges, true);
+ opt_set(fs_opts, norecovery, true);
+ opt_set(fs_opts, degraded, BCH_DEGRADED_very);
+ opt_set(fs_opts, errors, BCH_ON_ERROR_continue);
+ opt_set(fs_opts, fix_errors, FSCK_FIX_no);
+
+ while ((opt = getopt_long(argc, argv, "o:fsvh",
longopts, NULL)) != -1)
switch (opt) {
case 'o':
- out = optarg;
+ opts.out = optarg;
break;
case 'f':
- force = true;
+ opts.force = true;
+ break;
+ case 's':
+ opts.sanitize = true;
break;
case 'j':
- entire_journal = false;
+ opts.entire_journal = false;
break;
case 'e':
- opt_set(opts, noexcl, true);
+ opt_set(fs_opts, noexcl, true);
break;
case 'v':
- opt_set(opts, verbose, true);
+ opt_set(fs_opts, verbose, true);
break;
case 'h':
dump_usage();
@@ -145,44 +368,19 @@ int cmd_dump(int argc, char *argv[])
}
args_shift(optind);
- if (!out)
+ if (!opts.out)
die("Please supply output filename");
if (!argc)
die("Please supply device(s) to check");
- darray_const_str devs = get_or_split_cmdline_devs(argc, argv);
+ darray_const_str dev_names = get_or_split_cmdline_devs(argc, argv);
- struct bch_fs *c = bch2_fs_open(&devs, &opts);
+ struct bch_fs *c = bch2_fs_open(&dev_names, &fs_opts);
if (IS_ERR(c))
die("error opening devices: %s", bch2_err_str(PTR_ERR(c)));
- down_read(&c->state_lock);
-
- for_each_online_member(c, ca, 0)
- nr_devices++;
-
- BUG_ON(!nr_devices);
-
- for_each_online_member(c, ca, 0) {
- int flags = O_WRONLY|O_CREAT|O_TRUNC;
-
- if (!force)
- flags |= O_EXCL;
-
- char *path = nr_devices > 1
- ? mprintf("%s.%u.qcow2", out, ca->dev_idx)
- : mprintf("%s.qcow2", out);
- fd = xopen(path, flags, 0600);
- free(path);
-
- dump_one_device(c, ca, fd, entire_journal);
- xclose(fd);
- }
-
- up_read(&c->state_lock);
-
- bch2_fs_stop(c);
- darray_exit(&devs);
- return 0;
+ int ret = dump_fs(c, opts);
+ darray_exit(&dev_names);
+ return ret;
}
diff --git a/c_src/cmd_image.c b/c_src/cmd_image.c
index d00d85cf..467378b0 100644
--- a/c_src/cmd_image.c
+++ b/c_src/cmd_image.c
@@ -665,7 +665,10 @@ static int image_update(const char *src_path, const char *dst_image,
goto err;
}
- if (ftruncate(dev_opts.bdev->bd_fd, input_bytes)) {
+ u64 metadata_dev_size = max(input_bytes,
+ c->opts.btree_node_size * BCH_MIN_NR_NBUCKETS);
+
+ if (ftruncate(dev_opts.bdev->bd_fd, metadata_dev_size)) {
fprintf(stderr, "ftruncate error: %m");
goto err;
}
diff --git a/c_src/cmd_strip_alloc.c b/c_src/cmd_strip_alloc.c
index c313b665..e16eb093 100644
--- a/c_src/cmd_strip_alloc.c
+++ b/c_src/cmd_strip_alloc.c
@@ -104,8 +104,9 @@ int cmd_strip_alloc(int argc, char *argv[])
struct bch_opts opts = bch2_opts_empty();
opt_set(opts, nostart, true);
+ struct bch_fs *c;
reopen:
- struct bch_fs *c = bch2_fs_open(&devs, &opts);
+ c = bch2_fs_open(&devs, &opts);
int ret = PTR_ERR_OR_ZERO(c);
if (ret)
die("Error opening filesystem: %s", bch2_err_str(ret));
diff --git a/c_src/libbcachefs.c b/c_src/libbcachefs.c
index 935b13ce..6b31d56f 100644
--- a/c_src/libbcachefs.c
+++ b/c_src/libbcachefs.c
@@ -79,9 +79,14 @@ void bch2_sb_layout_init(struct bch_sb_layout *l,
}
}
-static u64 dev_max_bucket_size(u64 dev_size)
+static u64 dev_max_bucket_size(struct bch_opts fs_opts, u64 dev_size)
{
- return rounddown_pow_of_two(dev_size / (BCH_MIN_NR_NBUCKETS * 4));
+ u64 size = rounddown_pow_of_two(dev_size / (BCH_MIN_NR_NBUCKETS * 4));
+ if (opt_defined(fs_opts, btree_node_size))
+ size = max(size, fs_opts.btree_node_size);
+ if (size * BCH_MIN_NR_NBUCKETS > dev_size)
+ die("bucket size %llu too big for device size", size);
+ return size;
}
u64 bch2_pick_bucket_size(struct bch_opts opts, dev_opts_list devs)
@@ -209,7 +214,7 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs,
darray_for_each(devs, i)
if (!opt_defined(i->opts, bucket_size))
opt_set(i->opts, bucket_size,
- min(fs_bucket_size, dev_max_bucket_size(i->fs_size)));
+ min(fs_bucket_size, dev_max_bucket_size(fs_opts, i->fs_size)));
darray_for_each(devs, i) {
i->nbuckets = i->fs_size / i->opts.bucket_size;
diff --git a/c_src/posix_to_bcachefs.c b/c_src/posix_to_bcachefs.c
index 0e7d4c29..ca44db32 100644
--- a/c_src/posix_to_bcachefs.c
+++ b/c_src/posix_to_bcachefs.c
@@ -439,40 +439,38 @@ static void link_file_data(struct bch_fs *c,
fiemap_iter_exit(&iter);
}
-static struct range seek_data_aligned(int fd, u64 i_size, loff_t o, unsigned bs)
+static struct range align_range(struct range r, unsigned bs)
{
- struct range seek_data(int fd, loff_t o)
- {
- s64 s = lseek(fd, o, SEEK_DATA);
- if (s < 0 && errno == ENXIO)
- return (struct range) {};
- if (s < 0)
- die("lseek error: %m");
-
- s64 e = lseek(fd, s, SEEK_HOLE);
- if (e < 0 && errno == ENXIO)
- e = i_size;
- if (e < 0)
- die("lseek error: %m");
-
- return (struct range) { s, e };
- }
-
- struct range __seek_data_aligned(int fd, loff_t o, unsigned bs)
- {
- struct range r = seek_data(fd, o);
+ r.start = round_down(r.start, bs);
+ r.end = round_up(r.end, bs);
+ return r;
+}
- r.start = round_down(r.start, bs);
- r.end = round_up(r.end, bs);
- return r;
- }
+struct range seek_data(int fd, u64 i_size, loff_t o)
+{
+ s64 s = lseek(fd, o, SEEK_DATA);
+ if (s < 0 && errno == ENXIO)
+ return (struct range) {};
+ if (s < 0)
+ die("lseek error: %m");
+
+ s64 e = lseek(fd, s, SEEK_HOLE);
+ if (e < 0 && errno == ENXIO)
+ e = i_size;
+ if (e < 0)
+ die("lseek error: %m");
+
+ return (struct range) { s, e };
+}
- struct range r = __seek_data_aligned(fd, o, bs);
+static struct range seek_data_aligned(int fd, u64 i_size, loff_t o, unsigned bs)
+{
+ struct range r = align_range(seek_data(fd, i_size, o), bs);
if (!r.end)
return r;
while (true) {
- struct range n = __seek_data_aligned(fd, r.end, bs);
+ struct range n = align_range(seek_data(fd, i_size, r.end), bs);
if (!n.end || r.end < n.start)
break;
@@ -482,38 +480,30 @@ static struct range seek_data_aligned(int fd, u64 i_size, loff_t o, unsigned bs)
return r;
}
-static struct range seek_mismatch_aligned(const char *buf1, const char *buf2,
- unsigned offset, unsigned len,
- unsigned bs)
+struct range seek_mismatch(const char *buf1, const char *buf2,
+ unsigned o, unsigned len)
{
- struct range seek_mismatch(unsigned o)
- {
- while (o < len && buf1[o] == buf2[o])
- o++;
-
- if (o == len)
- return (struct range) {};
+ while (o < len && buf1[o] == buf2[o])
+ o++;
- unsigned s = o;
- while (o < len && buf1[o] != buf2[o])
- o++;
+ if (o == len)
+ return (struct range) {};
- return (struct range) { s, o };
- }
-
- struct range __seek_mismatch_aligned(unsigned o)
- {
- struct range r = seek_mismatch(o);
+ unsigned s = o;
+ while (o < len && buf1[o] != buf2[o])
+ o++;
- r.start = round_down(r.start, bs);
- r.end = round_up(r.end, bs);
- return r;
- }
+ return (struct range) { s, o };
+}
- struct range r = __seek_mismatch_aligned(offset);
+static struct range seek_mismatch_aligned(const char *buf1, const char *buf2,
+ unsigned offset, unsigned len,
+ unsigned bs)
+{
+ struct range r = align_range(seek_mismatch(buf1, buf2, offset, len), bs);
if (r.end)
while (true) {
- struct range n = __seek_mismatch_aligned(r.end);
+ struct range n = align_range(seek_mismatch(buf1, buf2, r.end, len), bs);
if (!n.end || r.end < n.start)
break;
diff --git a/c_src/qcow2.c b/c_src/qcow2.c
index 30a6e056..53959a00 100644
--- a/c_src/qcow2.c
+++ b/c_src/qcow2.c
@@ -31,24 +31,21 @@ struct qcow2_hdr {
u64 snapshots_offset;
};
-struct qcow2_image {
- int fd;
- u32 block_size;
- u64 *l1_table;
- u64 l1_offset;
- u32 l1_index;
- u64 *l2_table;
- u64 offset;
-};
+static void __qcow2_write_buf(struct qcow2_image *img, void *buf, unsigned len)
+{
+ assert(!(len % img->block_size));
+
+ xpwrite(img->outfd, buf, len, img->offset, "qcow2 data");
+ img->offset += len;
+}
static void flush_l2(struct qcow2_image *img)
{
if (img->l1_index != -1) {
img->l1_table[img->l1_index] =
cpu_to_be64(img->offset|QCOW_OFLAG_COPIED);
- xpwrite(img->fd, img->l2_table, img->block_size, img->offset,
- "qcow2 l2 table");
- img->offset += img->block_size;
+
+ __qcow2_write_buf(img, img->l2_table, img->block_size);
memset(img->l2_table, 0, img->block_size);
img->l1_index = -1;
@@ -69,66 +66,97 @@ static void add_l2(struct qcow2_image *img, u64 src_blk, u64 dst_offset)
img->l2_table[l2_index] = cpu_to_be64(dst_offset|QCOW_OFLAG_COPIED);
}
-void qcow2_write_image(int infd, int outfd, ranges *data,
- unsigned block_size)
+void qcow2_write_buf(struct qcow2_image *img, void *buf, unsigned len, u64 src_offset)
{
- u64 image_size = get_size(infd);
- unsigned l2_size = block_size / sizeof(u64);
- unsigned l1_size = DIV_ROUND_UP(image_size, (u64) block_size * l2_size);
- struct qcow2_hdr hdr = { 0 };
- struct qcow2_image img = {
- .fd = outfd,
- .block_size = block_size,
- .l2_table = xcalloc(l2_size, sizeof(u64)),
- .l1_table = xcalloc(l1_size, sizeof(u64)),
- .l1_index = -1,
- .offset = round_up(sizeof(hdr), block_size),
- };
- char *buf = xmalloc(block_size);
- u64 src_offset, dst_offset;
-
- assert(is_power_of_2(block_size));
+ u64 dst_offset = img->offset;
+ __qcow2_write_buf(img, buf, len);
+
+ while (len) {
+ add_l2(img, src_offset / img->block_size, dst_offset);
+ dst_offset += img->block_size;
+ src_offset += img->block_size;
+ len -= img->block_size;
+ }
+}
- ranges_roundup(data, block_size);
+void qcow2_write_ranges(struct qcow2_image *img, ranges *data)
+{
+ ranges_roundup(data, img->block_size);
ranges_sort_merge(data);
+ char *buf = xmalloc(img->block_size);
+
/* Write data: */
darray_for_each(*data, r)
- for (src_offset = r->start;
+ for (u64 src_offset = r->start;
src_offset < r->end;
- src_offset += block_size) {
- dst_offset = img.offset;
- img.offset += img.block_size;
+ src_offset += img->block_size) {
+ xpread(img->infd, buf, img->block_size, src_offset);
+ qcow2_write_buf(img, buf, img->block_size, src_offset);
+ }
+
+ free(buf);
+}
- xpread(infd, buf, block_size, src_offset);
- xpwrite(outfd, buf, block_size, dst_offset,
- "qcow2 data");
+void qcow2_image_init(struct qcow2_image *img, int infd, int outfd, unsigned block_size)
+{
+ assert(is_power_of_2(block_size));
- add_l2(&img, src_offset / block_size, dst_offset);
- }
+ u64 image_size = get_size(infd);
+ unsigned l2_size = block_size / sizeof(u64);
+ unsigned l1_size = DIV_ROUND_UP(image_size, (u64) block_size * l2_size);
+
+ *img = (struct qcow2_image) {
+ .infd = infd,
+ .outfd = outfd,
+ .image_size = image_size,
+ .block_size = block_size,
+ .l1_size = l1_size,
+ .l1_table = xcalloc(l1_size, sizeof(u64)),
+ .l1_index = -1,
+ .l2_table = xcalloc(l2_size, sizeof(u64)),
+ .offset = round_up(sizeof(struct qcow2_hdr), block_size),
+ };
+}
+
+void qcow2_image_finish(struct qcow2_image *img)
+{
+ char *buf = xmalloc(img->block_size);
- flush_l2(&img);
+ flush_l2(img);
/* Write L1 table: */
- dst_offset = img.offset;
- img.offset += round_up(l1_size * sizeof(u64), block_size);
- xpwrite(img.fd, img.l1_table, l1_size * sizeof(u64), dst_offset,
+ u64 dst_offset = img->offset;
+ img->offset += round_up(img->l1_size * sizeof(u64), img->block_size);
+ xpwrite(img->outfd, img->l1_table, img->l1_size * sizeof(u64), dst_offset,
"qcow2 l1 table");
/* Write header: */
- hdr.magic = cpu_to_be32(QCOW_MAGIC);
- hdr.version = cpu_to_be32(QCOW_VERSION);
- hdr.block_bits = cpu_to_be32(ilog2(block_size));
- hdr.size = cpu_to_be64(image_size);
- hdr.l1_size = cpu_to_be32(l1_size);
- hdr.l1_table_offset = cpu_to_be64(dst_offset);
-
- memset(buf, 0, block_size);
+ struct qcow2_hdr hdr = {
+ .magic = cpu_to_be32(QCOW_MAGIC),
+ .version = cpu_to_be32(QCOW_VERSION),
+ .block_bits = cpu_to_be32(ilog2(img->block_size)),
+ .size = cpu_to_be64(img->image_size),
+ .l1_size = cpu_to_be32(img->l1_size),
+ .l1_table_offset = cpu_to_be64(dst_offset),
+ };
+
+ memset(buf, 0, img->block_size);
memcpy(buf, &hdr, sizeof(hdr));
- xpwrite(img.fd, buf, block_size, 0,
+ xpwrite(img->outfd, buf, img->block_size, 0,
"qcow2 header");
- free(img.l2_table);
- free(img.l1_table);
+ free(img->l2_table);
+ free(img->l1_table);
free(buf);
}
+
+void qcow2_write_image(int infd, int outfd, ranges *data,
+ unsigned block_size)
+{
+ struct qcow2_image img;
+
+ qcow2_image_init(&img, infd, outfd, block_size);
+ qcow2_write_ranges(&img, data);
+ qcow2_image_finish(&img);
+}
diff --git a/c_src/qcow2.h b/c_src/qcow2.h
index 0943d55c..c7b35627 100644
--- a/c_src/qcow2.h
+++ b/c_src/qcow2.h
@@ -4,6 +4,25 @@
#include <linux/types.h>
#include "tools-util.h"
+struct qcow2_image {
+ int infd;
+ int outfd;
+ u64 image_size;
+ u32 block_size;
+ u32 l1_size;
+ u64 *l1_table;
+ u64 l1_offset;
+ u32 l1_index;
+ u64 *l2_table;
+ u64 offset;
+};
+
+void qcow2_write_buf(struct qcow2_image *, void *, unsigned, u64);
+void qcow2_write_ranges(struct qcow2_image *, ranges *);
+
+void qcow2_image_init(struct qcow2_image *, int, int, unsigned);
+void qcow2_image_finish(struct qcow2_image *);
+
void qcow2_write_image(int, int, ranges *, unsigned);
#endif /* _QCOW2_H */
diff --git a/c_src/tools-util.c b/c_src/tools-util.c
index a31adcb0..5a15f306 100644
--- a/c_src/tools-util.c
+++ b/c_src/tools-util.c
@@ -291,11 +291,16 @@ static int range_cmp(const void *_l, const void *_r)
return 0;
}
+void ranges_sort(ranges *r)
+{
+ sort(r->data, r->nr, sizeof(r->data[0]), range_cmp, NULL);
+}
+
void ranges_sort_merge(ranges *r)
{
ranges tmp = { 0 };
- sort(r->data, r->nr, sizeof(r->data[0]), range_cmp, NULL);
+ ranges_sort(r);
/* Merge contiguous ranges: */
darray_for_each(*r, i) {
diff --git a/c_src/tools-util.h b/c_src/tools-util.h
index 239d7e29..b8104002 100644
--- a/c_src/tools-util.h
+++ b/c_src/tools-util.h
@@ -117,6 +117,7 @@ static inline void range_add(ranges *data, u64 offset, u64 size)
}));
}
+void ranges_sort(ranges *);
void ranges_sort_merge(ranges *);
void ranges_roundup(ranges *, unsigned);
void ranges_rounddown(ranges *, unsigned);
diff --git a/debian/changelog b/debian/changelog
index b1ef3bd9..a5b9d3eb 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,11 @@
+bcachefs-tools (1:1.25.3-1) unstable; urgency=medium
+
+ New image tooling:
+ * bcachefs image create
+ * bcachefs image update
+
+ -- Kent Overstreet <kent.overstreet@linux.dev> Sun, 20 Jul 2025 12:21:03 -0400
+
bcachefs-tools (1:1.25.2-1) unstable; urgency=medium
* don't pick a non power of two bucket size
diff --git a/flake.lock b/flake.lock
index 0a9b2b85..f4bc8dca 100644
--- a/flake.lock
+++ b/flake.lock
@@ -2,11 +2,11 @@
"nodes": {
"crane": {
"locked": {
- "lastModified": 1742394900,
- "narHash": "sha256-vVOAp9ahvnU+fQoKd4SEXB2JG2wbENkpqcwlkIXgUC0=",
+ "lastModified": 1752946753,
+ "narHash": "sha256-g5uP3jIj+STUcfTJDKYopxnSijs2agRg13H0SGL5iE4=",
"owner": "ipetkov",
"repo": "crane",
- "rev": "70947c1908108c0c551ddfd73d4f750ff2ea67cd",
+ "rev": "544d09fecc8c2338542c57f3f742f1a0c8c71e13",
"type": "github"
},
"original": {
@@ -18,11 +18,11 @@
"flake-compat": {
"flake": false,
"locked": {
- "lastModified": 1733328505,
- "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
+ "lastModified": 1747046372,
+ "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
"owner": "edolstra",
"repo": "flake-compat",
- "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
+ "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
"type": "github"
},
"original": {
@@ -36,11 +36,11 @@
"nixpkgs-lib": "nixpkgs-lib"
},
"locked": {
- "lastModified": 1741352980,
- "narHash": "sha256-+u2UunDA4Cl5Fci3m7S643HzKmIDAe+fiXrLqYsR2fs=",
+ "lastModified": 1751413152,
+ "narHash": "sha256-Tyw1RjYEsp5scoigs1384gIg6e0GoBVjms4aXFfRssQ=",
"owner": "hercules-ci",
"repo": "flake-parts",
- "rev": "f4330d22f1c5d2ba72d3d22df5597d123fdb60a9",
+ "rev": "77826244401ea9de6e3bac47c2db46005e1f30b5",
"type": "github"
},
"original": {
@@ -71,11 +71,11 @@
},
"nixpkgs": {
"locked": {
- "lastModified": 1742422364,
- "narHash": "sha256-mNqIplmEohk5jRkqYqG19GA8MbQ/D4gQSK0Mu4LvfRQ=",
+ "lastModified": 1752950548,
+ "narHash": "sha256-NS6BLD0lxOrnCiEOcvQCDVPXafX1/ek1dfJHX1nUIzc=",
"owner": "nixos",
"repo": "nixpkgs",
- "rev": "a84ebe20c6bc2ecbcfb000a50776219f48d134cc",
+ "rev": "c87b95e25065c028d31a94f06a62927d18763fdf",
"type": "github"
},
"original": {
@@ -87,11 +87,11 @@
},
"nixpkgs-lib": {
"locked": {
- "lastModified": 1740877520,
- "narHash": "sha256-oiwv/ZK/2FhGxrCkQkB83i7GnWXPPLzoqFHpDD3uYpk=",
+ "lastModified": 1751159883,
+ "narHash": "sha256-urW/Ylk9FIfvXfliA1ywh75yszAbiTEVgpPeinFyVZo=",
"owner": "nix-community",
"repo": "nixpkgs.lib",
- "rev": "147dee35aab2193b174e4c0868bd80ead5ce755c",
+ "rev": "14a40a1d7fb9afa4739275ac642ed7301a9ba1ab",
"type": "github"
},
"original": {
@@ -118,11 +118,11 @@
]
},
"locked": {
- "lastModified": 1742524367,
- "narHash": "sha256-KzTwk/5ETJavJZYV1DEWdCx05M4duFCxCpRbQSKWpng=",
+ "lastModified": 1752979888,
+ "narHash": "sha256-qRRP3QavbwW0o+LOh31QNEfCgPlzK5SKlWALUJL6T7E=",
"owner": "oxalica",
"repo": "rust-overlay",
- "rev": "70bf752d176b2ce07417e346d85486acea9040ef",
+ "rev": "95719de18aefa63a624bf75a1ff98744b089ec12",
"type": "github"
},
"original": {
@@ -138,11 +138,11 @@
]
},
"locked": {
- "lastModified": 1742370146,
- "narHash": "sha256-XRE8hL4vKIQyVMDXykFh4ceo3KSpuJF3ts8GKwh5bIU=",
+ "lastModified": 1753006367,
+ "narHash": "sha256-tzbhc4XttkyEhswByk5R38l+ztN9UDbnj0cTcP6Hp9A=",
"owner": "numtide",
"repo": "treefmt-nix",
- "rev": "adc195eef5da3606891cedf80c0d9ce2d3190808",
+ "rev": "421b56313c65a0815a52b424777f55acf0b56ddf",
"type": "github"
},
"original": {
diff --git a/flake.nix b/flake.nix
index 2350f270..013f90cb 100644
--- a/flake.nix
+++ b/flake.nix
@@ -245,13 +245,13 @@
bcachefs-tools-fuse-i686-linux
;
- cargo-clippy = common.craneLib.cargoClippy (
- common.args
- // {
- inherit (common) cargoArtifacts;
- cargoClippyExtraArgs = "--all-targets --all-features -- --deny warnings";
- }
- );
+ #cargo-clippy = common.craneLib.cargoClippy (
+ # common.args
+ # // {
+ # inherit (common) cargoArtifacts;
+ # cargoClippyExtraArgs = "--all-targets --all-features -- --deny warnings";
+ # }
+ #);
# we have to build our own `craneLib.cargoTest`
cargo-test = common.craneLib.mkCargoDerivation (
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7c2ec1b2..7cf1a833 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -87,7 +87,7 @@ struct super_block {
};
static inline void evict_inodes(struct super_block *sb) {}
-static inline int sync_filesystem(struct super_block *) { return 0; }
+static inline int sync_filesystem(struct super_block *sb) { return 0; }
/*
* File types
diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c
index 307824d6..8f970dc1 100644
--- a/libbcachefs/acl.c
+++ b/libbcachefs/acl.c
@@ -138,8 +138,8 @@ static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans,
acl = allocate_dropping_locks(trans, ret,
posix_acl_alloc(count, _gfp));
- if (!acl)
- return ERR_PTR(-ENOMEM);
+ if (!acl && !ret)
+ ret = bch_err_throw(trans->c, ENOMEM_acl);
if (ret) {
kfree(acl);
return ERR_PTR(ret);
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 4c1604fd..afc0ab75 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -20,6 +20,7 @@
#include "enumerated_ref.h"
#include "error.h"
#include "lru.h"
+#include "progress.h"
#include "recovery.h"
#include "varint.h"
@@ -337,9 +338,10 @@ void bch2_alloc_v4_swab(struct bkey_s k)
}
static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c,
- unsigned dev, const struct bch_alloc_v4 *a)
+ struct bkey_s_c k,
+ const struct bch_alloc_v4 *a)
{
- struct bch_dev *ca = c ? bch2_dev_tryget_noerror(c, dev) : NULL;
+ struct bch_dev *ca = c ? bch2_dev_tryget_noerror(c, k.k->p.inode) : NULL;
prt_newline(out);
printbuf_indent_add(out, 2);
@@ -348,11 +350,14 @@ static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *
bch2_prt_data_type(out, a->data_type);
prt_newline(out);
prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty);
- prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty);
+ if (bkey_val_bytes(k.k) > offsetof(struct bch_alloc_v4, journal_seq_empty))
+ prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty);
+
prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
- prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors);
+ if (bkey_val_bytes(k.k) > offsetof(struct bch_alloc_v4, stripe_sectors))
+ prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors);
prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
prt_printf(out, "stripe %u\n", a->stripe);
prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
@@ -372,12 +377,12 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
struct bch_alloc_v4 _a;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
- __bch2_alloc_v4_to_text(out, c, k.k->p.inode, a);
+ __bch2_alloc_v4_to_text(out, c, k, a);
}
void bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
- __bch2_alloc_v4_to_text(out, c, k.k->p.inode, bkey_s_c_to_alloc_v4(k).v);
+ __bch2_alloc_v4_to_text(out, c, k, bkey_s_c_to_alloc_v4(k).v);
}
void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
@@ -385,7 +390,7 @@ void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
if (k.k->type == KEY_TYPE_alloc_v4) {
void *src, *dst;
- *out = *bkey_s_c_to_alloc_v4(k).v;
+ bkey_val_copy(out, bkey_s_c_to_alloc_v4(k));
src = alloc_v4_backpointers(out);
SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
@@ -1732,12 +1737,16 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
+ struct progress_indicator_state progress;
+ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_alloc));
+
CLASS(btree_trans, trans)(c);
int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed)) ?:
- bch2_check_stripe_to_lru_refs(trans);
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ progress_update_iter(trans, &progress, &iter);
+ bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed);
+ }))?: bch2_check_stripe_to_lru_refs(trans);
bch2_bkey_buf_exit(&last_flushed, c);
return ret;
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 8a6f886b..45c15bda 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -1277,4 +1277,11 @@ static inline int bch2_fs_casefold_enabled(struct bch_fs *c)
return 0;
}
+static inline const char *strip_bch2(const char *msg)
+{
+ if (!strncmp("bch2_", msg, 5))
+ return msg + 5;
+ return msg;
+}
+
#endif /* _BCACHEFS_H */
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 34cb8a43..e95bb684 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -44,10 +44,6 @@
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
-#define DROP_THIS_NODE 10
-#define DROP_PREV_NODE 11
-#define DID_FILL_FROM_SCAN 12
-
/*
* Returns true if it's a btree we can easily reconstruct, or otherwise won't
* cause data loss if it's missing:
@@ -252,7 +248,7 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *
return ret;
*pulled_from_scan = cur->data->min_key;
- ret = DID_FILL_FROM_SCAN;
+ ret = bch_err_throw(c, topology_repair_did_fill_from_scan);
} else {
if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
"btree node with incorrect min_key%s", buf.buf))
@@ -263,7 +259,7 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *
if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node,
"btree node overwritten by next node%s", buf.buf))
- ret = DROP_PREV_NODE;
+ ret = bch_err_throw(c, topology_repair_drop_prev_node);
} else {
if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
"btree node with incorrect max_key%s", buf.buf))
@@ -274,7 +270,7 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *
if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node,
"btree node overwritten by prev node%s", buf.buf))
- ret = DROP_THIS_NODE;
+ ret = bch_err_throw(c, topology_repair_drop_this_node);
} else {
if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
"btree node with incorrect min_key%s", buf.buf))
@@ -314,7 +310,7 @@ static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
return ret;
*pulled_from_scan = b->key.k.p;
- ret = DID_FILL_FROM_SCAN;
+ ret = bch_err_throw(c, topology_repair_did_fill_from_scan);
} else {
ret = set_node_max(c, child, b->key.k.p);
}
@@ -391,15 +387,15 @@ again:
ret = lockrestart_do(trans,
btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan));
- if (ret < 0)
+ if (ret && !bch2_err_matches(ret, BCH_ERR_topology_repair))
goto err;
- if (ret == DID_FILL_FROM_SCAN) {
+ if (bch2_err_matches(ret, BCH_ERR_topology_repair_did_fill_from_scan)) {
new_pass = true;
ret = 0;
}
- if (ret == DROP_THIS_NODE) {
+ if (bch2_err_matches(ret, BCH_ERR_topology_repair_drop_this_node)) {
six_unlock_read(&cur->c.lock);
bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
@@ -414,7 +410,7 @@ again:
six_unlock_read(&prev->c.lock);
prev = NULL;
- if (ret == DROP_PREV_NODE) {
+ if (bch2_err_matches(ret, BCH_ERR_topology_repair_drop_prev_node)) {
bch_info(c, "dropped prev node");
bch2_btree_node_evict(trans, prev_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
@@ -436,7 +432,7 @@ again:
BUG_ON(cur);
ret = lockrestart_do(trans,
btree_repair_node_end(trans, b, prev, pulled_from_scan));
- if (ret == DID_FILL_FROM_SCAN) {
+ if (bch2_err_matches(ret, BCH_ERR_topology_repair_did_fill_from_scan)) {
new_pass = true;
ret = 0;
}
@@ -477,7 +473,7 @@ again:
six_unlock_read(&cur->c.lock);
cur = NULL;
- if (ret == DROP_THIS_NODE) {
+ if (bch2_err_matches(ret, BCH_ERR_topology_repair_drop_this_node)) {
bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
@@ -504,7 +500,7 @@ again:
if (mustfix_fsck_err_on(!have_child,
c, btree_node_topology_interior_node_empty,
"empty interior btree node at %s", buf.buf))
- ret = DROP_THIS_NODE;
+ ret = bch_err_throw(c, topology_repair_drop_this_node);
err:
fsck_err:
if (!IS_ERR_OR_NULL(prev))
@@ -521,7 +517,8 @@ fsck_err:
bch2_bkey_buf_exit(&prev_k, c);
bch2_bkey_buf_exit(&cur_k, c);
- bch_err_fn(c, ret);
+ if (!bch2_err_matches(ret, BCH_ERR_topology_repair))
+ bch_err_fn(c, ret);
return ret;
}
@@ -592,7 +589,7 @@ recover:
ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
six_unlock_read(&b->c.lock);
- if (ret == DROP_THIS_NODE) {
+ if (bch2_err_matches(ret, BCH_ERR_topology_repair_drop_this_node)) {
scoped_guard(mutex, &c->btree_cache.lock)
bch2_btree_node_hash_remove(&c->btree_cache, b);
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index bd86dd71..83c83608 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1405,10 +1405,8 @@ static void btree_node_read_work(struct work_struct *work)
ret = bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
&failed, &rb->pick, -1);
- if (ret <= 0) {
- set_btree_node_read_error(b);
+ if (ret <= 0)
break;
- }
ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
rb->have_ioref = ca != NULL;
@@ -1442,27 +1440,21 @@ start:
bch2_maybe_corrupt_bio(bio, bch2_btree_read_corrupt_ratio);
ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
- if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
- ret == -BCH_ERR_btree_node_read_err_must_retry)
- continue;
-
- if (ret)
- set_btree_node_read_error(b);
-
- break;
+ if (ret != -BCH_ERR_btree_node_read_err_want_retry &&
+ ret != -BCH_ERR_btree_node_read_err_must_retry)
+ break;
}
bch2_io_failures_to_text(&buf, c, &failed);
- if (btree_node_read_error(b))
- bch2_btree_lost_data(c, &buf, b->c.btree_id);
-
/*
* only print retry success if we read from a replica with no errors
*/
- if (btree_node_read_error(b))
+ if (ret) {
+ set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, &buf, b->c.btree_id);
prt_printf(&buf, "ret %s", bch2_err_str(ret));
- else if (failed.nr) {
+ } else if (failed.nr) {
if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev))
prt_printf(&buf, "retry success");
else
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index cc771aff..a282c388 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -2860,8 +2860,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
!bkey_deleted(k.k) &&
(k2 = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) {
k = k2;
- if (!bkey_err(k))
- iter->k = *k.k;
+ if (bkey_err(k))
+ goto out;
+ iter->k = *k.k;
}
if (unlikely(k.k->type == KEY_TYPE_whiteout &&
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index ebba14da..d61b7820 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -254,11 +254,13 @@ static int btree_key_cache_create(struct btree_trans *trans,
struct bkey_i *new_k = allocate_dropping_locks(trans, ret,
kmalloc(key_u64s * sizeof(u64), _gfp));
- if (unlikely(!new_k)) {
+ if (unlikely(!new_k && !ret)) {
bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_id_str(ck->key.btree_id), key_u64s);
ret = bch_err_throw(c, ENOMEM_btree_key_cache_fill);
- } else if (ret) {
+ }
+
+ if (unlikely(ret)) {
kfree(new_k);
goto err;
}
@@ -407,7 +409,7 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans,
btree_node_unlock(trans, path, 0);
path->l[0].b = ERR_PTR(ret);
}
- } else {
+ } else if (!(flags & BTREE_ITER_cached_nofill)) {
BUG_ON(path->uptodate);
BUG_ON(!path->nodes_locked);
}
diff --git a/libbcachefs/btree_node_scan.c b/libbcachefs/btree_node_scan.c
index d997e381..4b7b5ca7 100644
--- a/libbcachefs/btree_node_scan.c
+++ b/libbcachefs/btree_node_scan.c
@@ -158,14 +158,6 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
return;
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
- bio->bi_iter.bi_sector = offset;
- bch2_bio_map(bio, b->data, c->opts.btree_node_size);
-
- submit_time = local_clock();
- submit_bio_wait(bio);
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
-
rcu_read_lock();
struct found_btree_node n = {
.btree_id = BTREE_NODE_ID(bn),
@@ -182,6 +174,14 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
};
rcu_read_unlock();
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
+ bio->bi_iter.bi_sector = offset;
+ bch2_bio_map(bio, b->data, c->opts.btree_node_size);
+
+ submit_time = local_clock();
+ submit_bio_wait(bio);
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
+
found_btree_node_to_key(&b->key, &n);
CLASS(printbuf, buf)();
@@ -270,6 +270,9 @@ static int read_btree_nodes(struct find_btree_nodes *f)
int ret = 0;
closure_init_stack(&cl);
+ CLASS(printbuf, buf)();
+
+ prt_printf(&buf, "scanning for btree nodes on");
for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) {
if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
@@ -295,10 +298,14 @@ static int read_btree_nodes(struct find_btree_nodes *f)
break;
}
+ prt_printf(&buf, " %s", ca->name);
+
closure_get(&cl);
enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
wake_up_process(t);
}
+
+ bch_notice(c, "%s", buf.buf);
err:
while (closure_sync_timeout(&cl, sysctl_hung_task_timeout_secs * HZ / 2))
;
diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c
index 1f9965ae..58590ccc 100644
--- a/libbcachefs/btree_trans_commit.c
+++ b/libbcachefs/btree_trans_commit.c
@@ -772,12 +772,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
trans->journal_res.offset += trans->journal_entries.u64s;
trans->journal_res.u64s -= trans->journal_entries.u64s;
- memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_write_buffer_keys,
- BTREE_ID_accounting, 0,
- trans->accounting.u64s)->_data,
- btree_trans_subbuf_base(trans, &trans->accounting),
- trans->accounting.u64s);
+ if (trans->accounting.u64s)
+ memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_write_buffer_keys,
+ BTREE_ID_accounting, 0,
+ trans->accounting.u64s)->_data,
+ btree_trans_subbuf_base(trans, &trans->accounting),
+ trans->accounting.u64s);
if (trans->journal_seq)
*trans->journal_seq = trans->journal_res.seq;
@@ -1065,11 +1066,15 @@ int __bch2_trans_commit(struct btree_trans *trans, enum bch_trans_commit_flags f
EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
- journal_u64s = jset_u64s(trans->accounting.u64s);
+ journal_u64s = 0;
+
trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
if (trans->journal_transaction_names)
journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
+ if (trans->accounting.u64s)
+ journal_u64s += jset_u64s(trans->accounting.u64s);
+
trans_for_each_update(trans, i) {
struct btree_path *path = trans->paths + i->path;
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 312ef203..e4aa4fa7 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -14,6 +14,7 @@
#include "btree_locking.h"
#include "buckets.h"
#include "clock.h"
+#include "disk_groups.h"
#include "enumerated_ref.h"
#include "error.h"
#include "extents.h"
@@ -277,6 +278,36 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
bch2_trans_node_drop(trans, b);
}
+static bool can_use_btree_node(struct bch_fs *c,
+ struct disk_reservation *res,
+ unsigned target,
+ struct bkey_s_c k)
+{
+ if (!bch2_bkey_devs_rw(c, k))
+ return false;
+
+ if (target && !bch2_bkey_in_target(c, k, target))
+ return false;
+
+ unsigned durability = bch2_bkey_durability(c, k);
+
+ if (durability >= res->nr_replicas)
+ return true;
+
+ struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_btree, target);
+
+ guard(rcu)();
+
+ unsigned durability_available = 0, i;
+ for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, i);
+ if (ca)
+ durability_available += ca->mi.durability;
+ }
+
+ return durability >= durability_available;
+}
+
static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct disk_reservation *res,
struct closure *cl,
@@ -303,10 +334,14 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
mutex_lock(&c->btree_reserve_cache_lock);
if (c->btree_reserve_cache_nr > nr_reserve) {
for (struct btree_alloc *a = c->btree_reserve_cache;
- a < c->btree_reserve_cache + c->btree_reserve_cache_nr;
- a++) {
- if (target && !bch2_bkey_in_target(c, bkey_i_to_s_c(&a->k), target))
+ a < c->btree_reserve_cache + c->btree_reserve_cache_nr;) {
+ /* check if it has sufficient durability */
+
+ if (!can_use_btree_node(c, res, target, bkey_i_to_s_c(&a->k))) {
+ bch2_open_buckets_put(c, &a->ob);
+ *a = c->btree_reserve_cache[--c->btree_reserve_cache_nr];
continue;
+ }
bkey_copy(&b->key, &a->k);
b->ob = a->ob;
diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c
index 5185794f..1c6d0cdc 100644
--- a/libbcachefs/clock.c
+++ b/libbcachefs/clock.c
@@ -21,7 +21,7 @@ static const struct min_heap_callbacks callbacks = {
void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
{
- guard(spinlock)(&clock->timer_lock);
+ spin_lock(&clock->timer_lock);
if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) {
spin_unlock(&clock->timer_lock);
@@ -31,9 +31,11 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
for (size_t i = 0; i < clock->timers.nr; i++)
if (clock->timers.data[i] == timer)
- return;
+ goto out;
BUG_ON(!min_heap_push(&clock->timers, &timer, &callbacks, NULL));
+out:
+ spin_unlock(&clock->timer_lock);
}
void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c
index 97d7655a..33cb94f7 100644
--- a/libbcachefs/debug.c
+++ b/libbcachefs/debug.c
@@ -462,7 +462,7 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
struct btree *b)
{
if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 32);
+ printbuf_tabstop_push(out, 36);
prt_printf(out, "%px ", b);
bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level);
diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h
index 373d382b..efb58d2d 100644
--- a/libbcachefs/dirent.h
+++ b/libbcachefs/dirent.h
@@ -26,6 +26,13 @@ struct bch_inode_info;
#if IS_ENABLED(CONFIG_UNICODE)
int bch2_casefold(struct btree_trans *, const struct bch_hash_info *,
const struct qstr *, struct qstr *);
+#else
+static inline int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
+ const struct qstr *str, struct qstr *out_cf)
+{
+ return bch_err_throw(trans->c, no_casefolding_without_utf8);
+}
+#endif
static inline int bch2_maybe_casefold(struct btree_trans *trans,
const struct bch_hash_info *info,
@@ -38,14 +45,6 @@ static inline int bch2_maybe_casefold(struct btree_trans *trans,
return bch2_casefold(trans, info, str, out_cf);
}
}
-#else
-static inline int bch2_maybe_casefold(struct btree_trans *trans,
- const struct bch_hash_info *info,
- const struct qstr *str, struct qstr *out_cf)
-{
- return bch_err_throw(trans->c, no_casefolding_without_utf8);
-}
-#endif
struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent);
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 62dda821..bea14f02 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -2060,6 +2060,9 @@ allocated:
BUG_ON(trans->restarted);
return h;
err:
+ if (waiting &&
+ !bch2_err_matches(ret, BCH_ERR_operation_blocked))
+ closure_wake_up(&c->freelist_wait);
bch2_ec_stripe_head_put(c, h);
return ERR_PTR(ret);
}
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index 2de0dc91..cec8b0f4 100644
--- a/libbcachefs/errcode.h
+++ b/libbcachefs/errcode.h
@@ -90,6 +90,8 @@
x(ENOMEM, ENOMEM_disk_accounting) \
x(ENOMEM, ENOMEM_stripe_head_alloc) \
x(ENOMEM, ENOMEM_journal_read_bucket) \
+ x(ENOMEM, ENOMEM_acl) \
+ x(ENOMEM, ENOMEM_move_extent) \
x(ENOSPC, ENOSPC_disk_reservation) \
x(ENOSPC, ENOSPC_bucket_alloc) \
x(ENOSPC, ENOSPC_disk_label_add) \
@@ -216,9 +218,13 @@
x(EINVAL, varint_decode_error) \
x(EINVAL, erasure_coding_found_btree_node) \
x(EINVAL, option_negative) \
+ x(EINVAL, topology_repair) \
+ x(BCH_ERR_topology_repair, topology_repair_drop_this_node) \
+ x(BCH_ERR_topology_repair, topology_repair_drop_prev_node) \
+ x(BCH_ERR_topology_repair, topology_repair_did_fill_from_scan) \
x(EOPNOTSUPP, may_not_use_incompat_feature) \
x(EOPNOTSUPP, no_casefolding_without_utf8) \
- x(EOPNOTSUPP, casefolding_disabled) \
+ x(EOPNOTSUPP, casefolding_disabled) \
x(EOPNOTSUPP, casefold_opt_is_dir_only) \
x(EOPNOTSUPP, unsupported_fsx_flag) \
x(EOPNOTSUPP, unsupported_fa_flag) \
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index b36ecfc0..b879a586 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -282,9 +282,9 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
if (have_pick)
return 1;
- if (!have_dirty_ptrs)
+ if (!have_dirty_ptrs && !bkey_is_btree_ptr(k.k))
return 0;
- if (have_missing_devs)
+ if (have_missing_devs || !have_dirty_ptrs)
return bch_err_throw(c, no_device_to_read_from);
if (have_csum_errors)
return bch_err_throw(c, data_read_csum_err);
@@ -1006,6 +1006,20 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned
return NULL;
}
+bool bch2_bkey_devs_rw(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ guard(rcu)();
+ bkey_for_each_ptr(ptrs, ptr) {
+ CLASS(bch2_dev_tryget, ca)(c, ptr->dev);
+ if (!ca || ca->mi.state != BCH_MEMBER_STATE_rw)
+ return false;
+ }
+
+ return true;
+}
+
bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index f212f91c..35ee03cd 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -614,6 +614,8 @@ static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsig
return (void *) bch2_bkey_has_device_c(k.s_c, dev);
}
+bool bch2_bkey_devs_rw(struct bch_fs *, struct bkey_s_c);
+
bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
bool bch2_bkey_in_target(struct bch_fs *, struct bkey_s_c, unsigned);
diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c
index 73d44875..e53fee05 100644
--- a/libbcachefs/fs-io-direct.c
+++ b/libbcachefs/fs-io-direct.c
@@ -127,7 +127,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
* the dirtying of requests that are internal from the kernel (i.e. from
* loopback), because we'll deadlock on page_lock.
*/
- dio->should_dirty = iter_is_iovec(iter);
+ dio->should_dirty = user_backed_iter(iter);
blk_start_plug(&plug);
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index cc203752..93ad33f0 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -223,9 +223,8 @@ static int bch2_flush_inode(struct bch_fs *c,
if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync))
return -EROFS;
- CLASS(btree_trans, trans)(c);
u64 seq;
- int ret = commit_do(trans, NULL, NULL, 0,
+ int ret = bch2_trans_commit_do(c, NULL, NULL, 0,
bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?:
bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?:
bch2_inode_flush_nocow_writes(c, inode);
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 2789b30a..56b7126b 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -1295,8 +1295,14 @@ static int bch2_fill_extent(struct bch_fs *c,
flags|
FIEMAP_EXTENT_DELALLOC|
FIEMAP_EXTENT_UNWRITTEN);
+ } else if (k.k->type == KEY_TYPE_error) {
+ return 0;
} else {
- BUG();
+ WARN_ONCE(1, "unhandled key type %s",
+ k.k->type < KEY_TYPE_MAX
+ ? bch2_bkey_types[k.k->type]
+ : "(unknown)");
+ return 0;
}
}
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index df0aa252..40fc3c4e 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -15,6 +15,7 @@
#include "io_misc.h"
#include "keylist.h"
#include "namei.h"
+#include "progress.h"
#include "recovery_passes.h"
#include "snapshot.h"
#include "super.h"
@@ -1331,11 +1332,16 @@ int bch2_check_inodes(struct bch_fs *c)
CLASS(btree_trans, trans)(c);
CLASS(snapshots_seen, s)();
+ struct progress_indicator_state progress;
+ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_inodes));
+
return for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_inode(trans, &iter, k, &snapshot_root, &s));
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ progress_update_iter(trans, &progress, &iter);
+ check_inode(trans, &iter, k, &snapshot_root, &s);
+ }));
}
static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
@@ -1422,12 +1428,17 @@ fsck_err:
*/
int bch2_check_unreachable_inodes(struct bch_fs *c)
{
+ struct progress_indicator_state progress;
+ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_inodes));
+
CLASS(btree_trans, trans)(c);
return for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_unreachable_inode(trans, &iter, k));
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ progress_update_iter(trans, &progress, &iter);
+ check_unreachable_inode(trans, &iter, k);
+ }));
}
static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
@@ -1975,6 +1986,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
}
}
+ ret = check_extent_overbig(trans, iter, k);
+ if (ret)
+ goto err;
+
ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto err;
@@ -2017,12 +2032,15 @@ int bch2_check_extents(struct bch_fs *c)
CLASS(inode_walker, w)();
CLASS(extent_ends, extent_ends)();
+ struct progress_indicator_state progress;
+ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_extents));
+
int ret = for_each_btree_key(trans, iter, BTREE_ID_extents,
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
+ progress_update_iter(trans, &progress, &iter);
bch2_disk_reservation_put(c, &res);
- check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
- check_extent_overbig(trans, &iter, k);
+ check_extent(trans, &iter, k, &w, &s, &extent_ends, &res);
})) ?:
check_i_sectors_notnested(trans, &w);
@@ -2035,11 +2053,15 @@ int bch2_check_indirect_extents(struct bch_fs *c)
CLASS(btree_trans, trans)(c);
struct disk_reservation res = { 0 };
+ struct progress_indicator_state progress;
+ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_reflink));
+
int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
POS_MIN,
BTREE_ITER_prefetch, k,
&res, NULL,
BCH_TRANS_COMMIT_no_enospc, ({
+ progress_update_iter(trans, &progress, &iter);
bch2_disk_reservation_put(c, &res);
check_extent_overbig(trans, &iter, k);
}));
@@ -2448,15 +2470,20 @@ int bch2_check_dirents(struct bch_fs *c)
CLASS(snapshots_seen, s)();
CLASS(inode_walker, dir)();
CLASS(inode_walker, target)();
+ struct progress_indicator_state progress;
bool need_second_pass = false, did_second_pass = false;
int ret;
again:
+ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_dirents));
+
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ progress_update_iter(trans, &progress, &iter);
check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s,
- &need_second_pass)) ?:
+ &need_second_pass);
+ })) ?:
check_subdir_count_notnested(trans, &dir);
if (!ret && need_second_pass && !did_second_pass) {
@@ -2516,13 +2543,18 @@ int bch2_check_xattrs(struct bch_fs *c)
CLASS(btree_trans, trans)(c);
CLASS(inode_walker, inode)();
+ struct progress_indicator_state progress;
+ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_xattrs));
+
int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
k,
NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- check_xattr(trans, &iter, k, &hash_info, &inode));
+ BCH_TRANS_COMMIT_no_enospc, ({
+ progress_update_iter(trans, &progress, &iter);
+ check_xattr(trans, &iter, k, &hash_info, &inode);
+ }));
return ret;
}
@@ -2664,10 +2696,16 @@ err:
int bch2_check_subvolume_structure(struct bch_fs *c)
{
CLASS(btree_trans, trans)(c);
+
+ struct progress_indicator_state progress;
+ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_subvolumes));
+
return for_each_btree_key_commit(trans, iter,
BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_subvol_path(trans, &iter, k));
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ progress_update_iter(trans, &progress, &iter);
+ check_subvol_path(trans, &iter, k);
+ }));
}
static int bch2_bi_depth_renumber_one(struct btree_trans *trans,
diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c
index d7620138..44b02d4b 100644
--- a/libbcachefs/io_write.c
+++ b/libbcachefs/io_write.c
@@ -89,7 +89,12 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
new = ewma_add(old, io_latency, 5);
} while (!atomic64_try_cmpxchg(latency, &old, new));
- bch2_congested_acct(ca, io_latency, now, rw);
+ /*
+ * Only track read latency for congestion accounting: writes are subject
+ * to heavy queuing delays from page cache writeback:
+ */
+ if (rw == READ)
+ bch2_congested_acct(ca, io_latency, now, rw);
__bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
}
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 3ba1f9fd..f9e2e1a4 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -182,6 +182,8 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
void bch2_journal_do_writes(struct journal *j)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
for (u64 seq = journal_last_unwritten_seq(j);
seq <= journal_cur_seq(j);
seq++) {
@@ -196,6 +198,7 @@ void bch2_journal_do_writes(struct journal *j)
if (!journal_state_seq_count(j, j->reservations, seq)) {
j->seq_write_started = seq;
w->write_started = true;
+ closure_get(&c->cl);
closure_call(&w->io, bch2_journal_write, j->wq, NULL);
}
@@ -1063,6 +1066,8 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open
? ERR_PTR(-EAGAIN)
: buf;
+ if (!IS_ERR(ret))
+ smp_mb();
break;
}
}
@@ -1467,6 +1472,10 @@ int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq)
last_seq = cur_seq;
u64 nr = cur_seq - last_seq;
+ if (nr * sizeof(struct journal_entry_pin_list) > 1U << 30) {
+ bch_err(c, "too many ntjournal fifo (%llu open entries)", nr);
+ return bch_err_throw(c, ENOMEM_journal_pin_fifo);
+ }
/*
* Extra fudge factor, in case we crashed when the journal pin fifo was
@@ -1479,7 +1488,7 @@ int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq)
nr = max(nr, JOURNAL_PIN);
init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
if (!j->pin.data) {
- bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
+ bch_err(c, "error allocating journal fifo (%llu open entries)", nr);
return bch_err_throw(c, ENOMEM_journal_pin_fifo);
}
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index b46b9718..c05aa942 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -267,7 +267,7 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u
{
union journal_res_state s;
- s.v = atomic64_sub_return(((union journal_res_state) {
+ s.v = atomic64_sub_return_release(((union journal_res_state) {
.buf0_count = idx == 0,
.buf1_count = idx == 1,
.buf2_count = idx == 2,
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 2835250a..47224666 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1820,6 +1820,8 @@ static CLOSURE_CALLBACK(journal_write_done)
if (do_discards)
bch2_do_discards(c);
+
+ closure_put(&c->cl);
}
static void journal_write_endio(struct bio *bio)
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index be50455c..f23e5ee9 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -874,7 +874,34 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
--type)
if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) {
*did_work = true;
- return ret;
+
+ /*
+ * Question from Dan Carpenter, on the early return:
+ *
+ * If journal_flush_pins_or_still_flushing() returns
+ * true, then the flush hasn't complete and we must
+ * return 0; we want the outer closure_wait_event() in
+ * journal_flush_pins() to continue.
+ *
+ * The early return is there because we don't want to
+ * call journal_entry_close() until we've finished
+ * flushing all outstanding journal pins - otherwise
+ * seq_to_flush can be U64_MAX, and we'll close a bunch
+ * of journal entries and write tiny ones completely
+ * unnecessarily.
+ *
+ * Having the early return be in the loop where we loop
+ * over types is important, because flushing one journal
+ * pin can cause new journal pins to be added (even of
+ * the same type, btree node writes may generate more
+ * btree node writes, when updating the parent pointer
+ * hits a full node and has to trigger a split/compact).
+ *
+ * This is part of our shutdown sequence, where order of
+ * flushing is important in order to make sure that it
+ * terminates...
+ */
+ return 0;
}
if (seq_to_flush > journal_cur_seq(j))
diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c
index ee14656c..76109b37 100644
--- a/libbcachefs/lru.c
+++ b/libbcachefs/lru.c
@@ -9,6 +9,7 @@
#include "ec.h"
#include "error.h"
#include "lru.h"
+#include "progress.h"
#include "recovery.h"
/* KEY_TYPE_lru is obsolete: */
@@ -207,11 +208,16 @@ int bch2_check_lrus(struct bch_fs *c)
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
+ struct progress_indicator_state progress;
+ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_lru));
+
CLASS(btree_trans, trans)(c);
int ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_lru_key(trans, &iter, k, &last_flushed));
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ progress_update_iter(trans, &progress, &iter);
+ bch2_check_lru_key(trans, &iter, k, &last_flushed);
+ }));
bch2_bkey_buf_exit(&last_flushed, c);
return ret;
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 3f44bb54..84a228c4 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -330,7 +330,7 @@ int bch2_move_extent(struct moving_context *ctxt,
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
- int ret = -ENOMEM;
+ int ret = 0;
if (trace_io_move_enabled())
trace_io_move2(c, k, &io_opts, &data_opts);
@@ -351,11 +351,10 @@ int bch2_move_extent(struct moving_context *ctxt,
struct moving_io *io = allocate_dropping_locks(trans, ret,
kzalloc(sizeof(struct moving_io), _gfp));
- if (!io)
- goto err;
-
+ if (!io && !ret)
+ ret = bch_err_throw(c, ENOMEM_move_extent);
if (ret)
- goto err_free;
+ goto err;
INIT_LIST_HEAD(&io->io_list);
io->write.ctxt = ctxt;
@@ -366,7 +365,7 @@ int bch2_move_extent(struct moving_context *ctxt,
ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
&io_opts, data_opts, iter->btree_id, k);
if (ret)
- goto err_free;
+ goto err;
io->write.op.end_io = move_write_done;
} else {
@@ -380,7 +379,7 @@ int bch2_move_extent(struct moving_context *ctxt,
ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
if (ret)
- goto err_free;
+ goto err;
}
io->write.rbio.bio.bi_end_io = move_read_endio;
@@ -423,9 +422,8 @@ int bch2_move_extent(struct moving_context *ctxt,
BCH_READ_last_fragment,
data_opts.scrub ? data_opts.read_dev : -1);
return 0;
-err_free:
- kfree(io);
err:
+ kfree(io);
if (bch2_err_matches(ret, EROFS) ||
bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
@@ -795,50 +793,50 @@ out:
return ret;
}
-int __bch2_move_data(struct moving_context *ctxt,
- struct bbpos start,
- struct bbpos end,
- move_pred_fn pred, void *arg)
+static int bch2_move_data(struct bch_fs *c,
+ struct bbpos start,
+ struct bbpos end,
+ unsigned min_depth,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc,
+ move_pred_fn pred, void *arg)
{
- struct bch_fs *c = ctxt->trans->c;
- enum btree_id id;
int ret = 0;
- for (id = start.btree;
+ struct moving_context ctxt;
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+
+ for (enum btree_id id = start.btree;
id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
id++) {
- ctxt->stats->pos = BBPOS(id, POS_MIN);
+ ctxt.stats->pos = BBPOS(id, POS_MIN);
- if (!btree_type_has_ptrs(id) ||
- !bch2_btree_id_root(c, id)->b)
+ if (!bch2_btree_id_root(c, id)->b)
continue;
- ret = bch2_move_data_btree(ctxt,
- id == start.btree ? start.pos : POS_MIN,
- id == end.btree ? end.pos : POS_MAX,
- pred, arg, id, 0);
+ unsigned min_depth_this_btree = min_depth;
+
+ if (!btree_type_has_ptrs(id))
+ min_depth_this_btree = max(min_depth_this_btree, 1);
+
+ for (unsigned level = min_depth_this_btree;
+ level < BTREE_MAX_DEPTH;
+ level++) {
+ ret = bch2_move_data_btree(&ctxt,
+ id == start.btree ? start.pos : POS_MIN,
+ id == end.btree ? end.pos : POS_MAX,
+ pred, arg, id, level);
+ if (ret)
+ break;
+ }
+
if (ret)
break;
}
- return ret;
-}
-
-int bch2_move_data(struct bch_fs *c,
- struct bbpos start,
- struct bbpos end,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc,
- move_pred_fn pred, void *arg)
-{
- struct moving_context ctxt;
-
- bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- int ret = __bch2_move_data(&ctxt, start, end, pred, arg);
bch2_moving_ctxt_exit(&ctxt);
-
return ret;
}
@@ -1206,14 +1204,6 @@ static bool migrate_pred(struct bch_fs *c, void *arg,
return data_opts->rewrite_ptrs != 0;
}
-static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts);
-}
-
/*
* Ancient versions of bcachefs produced packed formats which could represent
* keys that the in memory format cannot represent; this checks for those
@@ -1293,15 +1283,6 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
return data_opts->kill_ptrs != 0;
}
-static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key),
- io_opts, data_opts);
-}
-
static bool scrub_pred(struct bch_fs *c, void *_arg,
enum btree_id btree, struct bkey_s_c k,
struct bch_io_opts *io_opts,
@@ -1359,14 +1340,11 @@ int bch2_data_job(struct bch_fs *c,
case BCH_DATA_OP_rereplicate:
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
- ret = bch2_move_btree(c, start, end,
- rereplicate_btree_pred, c, stats) ?: ret;
- ret = bch2_move_data(c, start, end,
- NULL,
- stats,
+ ret = bch2_move_data(c, start, end, 0, NULL, stats,
writepoint_hashed((unsigned long) current),
true,
rereplicate_pred, c) ?: ret;
+ bch2_btree_interior_updates_flush(c);
ret = bch2_replicas_gc2(c) ?: ret;
break;
case BCH_DATA_OP_migrate:
@@ -1389,12 +1367,10 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_scan_old_btree_nodes(c, stats);
break;
case BCH_DATA_OP_drop_extra_replicas:
- ret = bch2_move_btree(c, start, end,
- drop_extra_replicas_btree_pred, c, stats) ?: ret;
- ret = bch2_move_data(c, start, end, NULL, stats,
- writepoint_hashed((unsigned long) current),
- true,
- drop_extra_replicas_pred, c) ?: ret;
+ ret = bch2_move_data(c, start, end, 0, NULL, stats,
+ writepoint_hashed((unsigned long) current),
+ true,
+ drop_extra_replicas_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
break;
default:
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index fe92ca6d..481026ff 100644
--- a/libbcachefs/move.h
+++ b/libbcachefs/move.h
@@ -128,18 +128,6 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos,
move_pred_fn, void *, enum btree_id, unsigned);
-int __bch2_move_data(struct moving_context *,
- struct bbpos,
- struct bbpos,
- move_pred_fn, void *);
-int bch2_move_data(struct bch_fs *,
- struct bbpos start,
- struct bbpos end,
- struct bch_ratelimit *,
- struct bch_move_stats *,
- struct write_point_specifier,
- bool,
- move_pred_fn, void *);
int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned,
struct bch_ratelimit *, struct bch_move_stats *,
diff --git a/libbcachefs/progress.c b/libbcachefs/progress.c
index 42353067..792fc6fe 100644
--- a/libbcachefs/progress.c
+++ b/libbcachefs/progress.c
@@ -52,7 +52,8 @@ void bch2_progress_update_iter(struct btree_trans *trans,
: 0;
prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
- msg, percent, s->nodes_seen, s->nodes_total);
+ strip_bch2(msg),
+ percent, s->nodes_seen, s->nodes_total);
bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
bch_info(c, "%s", buf.buf);
diff --git a/libbcachefs/progress.h b/libbcachefs/progress.h
index 23fb1811..972a7308 100644
--- a/libbcachefs/progress.h
+++ b/libbcachefs/progress.h
@@ -26,4 +26,7 @@ void bch2_progress_update_iter(struct btree_trans *,
struct btree_iter *,
const char *);
+#define progress_update_iter(trans, p, iter) \
+ bch2_progress_update_iter(trans, p, iter, __func__)
+
#endif /* _BCACHEFS_PROGRESS_H */
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index 32fa7cf9..c7e7f508 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -15,6 +15,7 @@
#include "inode.h"
#include "io_write.h"
#include "move.h"
+#include "progress.h"
#include "rebalance.h"
#include "subvolume.h"
#include "super-io.h"
@@ -858,7 +859,12 @@ int bch2_check_rebalance_work(struct bch_fs *c)
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
+ struct progress_indicator_state progress;
+ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_rebalance_work));
+
while (!ret) {
+ progress_update_iter(trans, &progress, &rebalance_iter);
+
bch2_trans_begin(trans);
ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed);
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index a8eea478..304473da 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -37,78 +37,82 @@ int bch2_btree_lost_data(struct bch_fs *c,
struct printbuf *msg,
enum btree_id btree)
{
- u64 b = BIT_ULL(btree);
int ret = 0;
guard(mutex)(&c->sb_lock);
+ bool write_sb = false;
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- if (!(c->sb.btrees_lost_data & b)) {
+ if (!(c->sb.btrees_lost_data & BIT_ULL(btree))) {
prt_printf(msg, "flagging btree ");
bch2_btree_id_to_text(msg, btree);
prt_printf(msg, " lost data\n");
- ext->btrees_lost_data |= cpu_to_le64(b);
+ write_sb |= !__test_and_set_bit_le64(btree, &ext->btrees_lost_data);
}
/* Once we have runtime self healing for topology errors we won't need this: */
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0, &write_sb) ?: ret;
/* Btree node accounting will be off: */
- __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
+ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0, &write_sb) ?: ret;
#ifdef CONFIG_BCACHEFS_DEBUG
/*
* These are much more minor, and don't need to be corrected right away,
* but in debug mode we want the next fsck run to be clean:
*/
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0, &write_sb) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0, &write_sb) ?: ret;
#endif
+ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
+ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_backpointer_to_missing_ptr, ext->errors_silent);
+ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
+ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
+
switch (btree) {
case BTREE_ID_alloc:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
-
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret;
+
+ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
+ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
+ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
+ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
goto out;
case BTREE_ID_backpointers:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0, &write_sb) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0, &write_sb) ?: ret;
goto out;
case BTREE_ID_need_discard:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret;
goto out;
case BTREE_ID_freespace:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret;
goto out;
case BTREE_ID_bucket_gens:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret;
goto out;
case BTREE_ID_lru:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret;
goto out;
case BTREE_ID_accounting:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0, &write_sb) ?: ret;
goto out;
case BTREE_ID_snapshots:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0, &write_sb) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0, &write_sb) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0, &write_sb) ?: ret;
goto out;
default:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0, &write_sb) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0, &write_sb) ?: ret;
goto out;
}
out:
- bch2_write_super(c);
+ if (write_sb)
+ bch2_write_super(c);
return ret;
}
diff --git a/libbcachefs/recovery_passes.c b/libbcachefs/recovery_passes.c
index f9d1c492..bd442652 100644
--- a/libbcachefs/recovery_passes.c
+++ b/libbcachefs/recovery_passes.c
@@ -340,7 +340,8 @@ static bool recovery_pass_needs_set(struct bch_fs *c,
int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
struct printbuf *out,
enum bch_recovery_pass pass,
- enum bch_run_recovery_pass_flags flags)
+ enum bch_run_recovery_pass_flags flags,
+ bool *write_sb)
{
struct bch_fs_recovery *r = &c->recovery;
int ret = 0;
@@ -362,7 +363,8 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
if (!(flags & RUN_RECOVERY_PASS_nopersistent)) {
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
+ *write_sb |= !__test_and_set_bit_le64(bch2_recovery_pass_to_stable(pass),
+ ext->recovery_passes_required);
}
if (pass < BCH_RECOVERY_PASS_set_may_go_rw &&
@@ -408,14 +410,19 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c,
enum bch_recovery_pass pass,
enum bch_run_recovery_pass_flags flags)
{
- int ret = 0;
+ /*
+ * With RUN_RECOVERY_PASS_ratelimit, recovery_pass_needs_set needs
+ * sb_lock
+ */
+ if (!(flags & RUN_RECOVERY_PASS_ratelimit) &&
+ !recovery_pass_needs_set(c, pass, &flags))
+ return 0;
- if (recovery_pass_needs_set(c, pass, &flags)) {
- guard(mutex)(&c->sb_lock);
- ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
+ guard(mutex)(&c->sb_lock);
+ bool write_sb = false;
+ int ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags, &write_sb);
+ if (write_sb)
bch2_write_super(c);
- }
-
return ret;
}
@@ -438,14 +445,13 @@ int bch2_require_recovery_pass(struct bch_fs *c,
return 0;
enum bch_run_recovery_pass_flags flags = 0;
- int ret = 0;
- if (recovery_pass_needs_set(c, pass, &flags)) {
- ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
+ bool write_sb = false;
+ int ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags, &write_sb) ?:
+ bch_err_throw(c, recovery_pass_will_run);
+ if (write_sb)
bch2_write_super(c);
- }
-
- return ret ?: bch_err_throw(c, recovery_pass_will_run);
+ return ret;
}
int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
@@ -459,8 +465,10 @@ int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pa
bch2_log_msg_start(c, &buf);
guard(mutex)(&c->sb_lock);
+ bool write_sb = false;
int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass,
- RUN_RECOVERY_PASS_nopersistent);
+ RUN_RECOVERY_PASS_nopersistent,
+ &write_sb);
bch2_print_str(c, KERN_NOTICE, buf.buf);
return ret;
@@ -631,6 +639,8 @@ void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c)
prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]);
prt_passes(out, "Current passes", r->passes_to_run);
}
+
+ prt_printf(out, "Pass done:\t%s\n", bch2_recovery_passes[r->pass_done]);
}
void bch2_fs_recovery_passes_init(struct bch_fs *c)
diff --git a/libbcachefs/recovery_passes.h b/libbcachefs/recovery_passes.h
index 2117f0ce..4f2c2f81 100644
--- a/libbcachefs/recovery_passes.h
+++ b/libbcachefs/recovery_passes.h
@@ -30,7 +30,8 @@ int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pas
int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
enum bch_recovery_pass,
- enum bch_run_recovery_pass_flags);
+ enum bch_run_recovery_pass_flags,
+ bool *);
int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
enum bch_recovery_pass,
enum bch_run_recovery_pass_flags);
diff --git a/libbcachefs/sb-members_format.h b/libbcachefs/sb-members_format.h
index fb72ad73..b2b89268 100644
--- a/libbcachefs/sb-members_format.h
+++ b/libbcachefs/sb-members_format.h
@@ -17,7 +17,7 @@
UUID_INIT(0xffffffff, 0xffff, 0xffff, \
0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
-#define BCH_MIN_NR_NBUCKETS (1 << 6)
+#define BCH_MIN_NR_NBUCKETS (1 << 9)
#define BCH_IOPS_MEASUREMENTS() \
x(seqread, 0) \
diff --git a/libbcachefs/str_hash.c b/libbcachefs/str_hash.c
index dfe4b6ae..3e08e55d 100644
--- a/libbcachefs/str_hash.c
+++ b/libbcachefs/str_hash.c
@@ -329,7 +329,6 @@ duplicate_entries:
out:
fsck_err:
bch2_trans_iter_exit(trans, dup_iter);
- printbuf_exit(&buf);
if (free_snapshots_seen)
darray_exit(&s->ids);
return ret;
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 4e038f65..b3b2d835 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -514,6 +514,10 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
if (ret)
return ret;
+ ret = bch2_fs_mark_dirty(c);
+ if (ret)
+ return ret;
+
clear_bit(BCH_FS_clean_shutdown, &c->flags);
scoped_guard(rcu)
@@ -537,10 +541,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_journal_space_available(&c->journal);
}
- ret = bch2_fs_mark_dirty(c);
- if (ret)
- return ret;
-
/*
* Don't jump to our error path, and call bch2_fs_read_only(), unless we
* successfully marked the filesystem dirty
@@ -729,6 +729,8 @@ void __bch2_fs_stop(struct bch_fs *c)
cancel_work_sync(&ca->io_error_work);
cancel_work_sync(&c->read_only_work);
+
+ flush_work(&c->btree_interior_update_work);
}
void bch2_fs_free(struct bch_fs *c)
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 158f526e..bd3fa9c3 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -18,6 +18,7 @@
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
@@ -150,6 +151,7 @@ write_attribute(trigger_journal_flush);
write_attribute(trigger_journal_writes);
write_attribute(trigger_btree_cache_shrink);
write_attribute(trigger_btree_key_cache_shrink);
+write_attribute(trigger_btree_write_buffer_flush);
write_attribute(trigger_btree_updates);
write_attribute(trigger_freelist_wakeup);
write_attribute(trigger_recalc_capacity);
@@ -539,6 +541,11 @@ STORE(bch2_fs)
c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc);
}
+ if (attr == &sysfs_trigger_btree_write_buffer_flush)
+ bch2_trans_do(c,
+ (bch2_btree_write_buffer_flush_sync(trans),
+ bch2_trans_begin(trans)));
+
if (attr == &sysfs_trigger_gc)
bch2_gc_gens(c);
@@ -709,6 +716,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_journal_writes,
&sysfs_trigger_btree_cache_shrink,
&sysfs_trigger_btree_key_cache_shrink,
+ &sysfs_trigger_btree_write_buffer_flush,
&sysfs_trigger_btree_updates,
&sysfs_trigger_freelist_wakeup,
&sysfs_trigger_recalc_capacity,
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index 768528c2..52ac8230 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -733,6 +733,13 @@ static inline bool test_bit_le64(size_t bit, __le64 *addr)
return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0;
}
+static inline bool __test_and_set_bit_le64(size_t bit, __le64 *addr)
+{
+ bool ret = test_bit_le64(bit, addr);
+ __set_bit_le64(bit, addr);
+ return ret;
+}
+
static inline void memcpy_swab(void *_dst, void *_src, size_t len)
{
u8 *dst = _dst + len;
diff --git a/linux/closure.c b/linux/closure.c
index 2bfe7d2a..4fb78d18 100644
--- a/linux/closure.c
+++ b/linux/closure.c
@@ -13,23 +13,25 @@
#include <linux/seq_file.h>
#include <linux/sched/debug.h>
-static inline void closure_put_after_sub_checks(int flags)
+static inline void closure_put_after_sub_checks(struct closure *cl, int flags)
{
int r = flags & CLOSURE_REMAINING_MASK;
if (WARN(flags & CLOSURE_GUARD_MASK,
- "closure has guard bits set: %x (%u)",
+ "closure %ps has guard bits set: %x (%u)",
+ cl->fn,
flags & CLOSURE_GUARD_MASK, (unsigned) __fls(r)))
r &= ~CLOSURE_GUARD_MASK;
WARN(!r && (flags & ~CLOSURE_DESTRUCTOR),
- "closure ref hit 0 with incorrect flags set: %x (%u)",
+ "closure %ps ref hit 0 with incorrect flags set: %x (%u)",
+ cl->fn,
flags & ~CLOSURE_DESTRUCTOR, (unsigned) __fls(flags));
}
static inline void closure_put_after_sub(struct closure *cl, int flags)
{
- closure_put_after_sub_checks(flags);
+ closure_put_after_sub_checks(cl, flags);
if (!(flags & CLOSURE_REMAINING_MASK)) {
smp_acquire__after_ctrl_dep();
@@ -167,7 +169,7 @@ void __sched closure_return_sync(struct closure *cl)
unsigned flags = atomic_sub_return_release(1 + CLOSURE_RUNNING - CLOSURE_DESTRUCTOR,
&cl->remaining);
- closure_put_after_sub_checks(flags);
+ closure_put_after_sub_checks(cl, flags);
if (unlikely(flags & CLOSURE_REMAINING_MASK)) {
while (1) {