diff options
59 files changed, 913 insertions, 475 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision index 7abb0304..8f8e5f44 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -deeffbdc52f1092dadb3d523c4429e002c7fc485 +e54ff0aa96886b753343100125bd3dfab1a8e337 diff --git a/.github/workflows/build-packages.yml b/.github/workflows/build-packages.yml index 052f366a..6610a50e 100644 --- a/.github/workflows/build-packages.yml +++ b/.github/workflows/build-packages.yml @@ -8,7 +8,7 @@ jobs: name: bcachefs-tools-deb strategy: matrix: - os: [ubuntu-22.04, ubuntu-24.04] + os: [ubuntu-24.04] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 @@ -68,7 +68,7 @@ checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" [[package]] name = "bcachefs-tools" -version = "1.25.2" +version = "1.25.3" dependencies = [ "anyhow", "bch_bindgen", @@ -4,7 +4,7 @@ default-members = [".", "bch_bindgen"] [package] name = "bcachefs-tools" -version = "1.25.2" +version = "1.25.3" authors = ["Yuxuan Shui <yshuiv7@gmail.com>", "Kayla Firestack <dev@kaylafire.me>", "Kent Overstreet <kent.overstreet@linux.dev>" ] edition = "2021" rust-version = "1.77.0" @@ -1,4 +1,4 @@ -VERSION=1.25.2 +VERSION=1.25.3 PREFIX?=/usr/local LIBEXECDIR?=$(PREFIX)/libexec diff --git a/c_src/cmd_dump.c b/c_src/cmd_dump.c index eb338858..08051802 100644 --- a/c_src/cmd_dump.c +++ b/c_src/cmd_dump.c @@ -14,71 +14,216 @@ #include "libbcachefs/btree_iter.h" #include "libbcachefs/error.h" #include "libbcachefs/extents.h" +#include "libbcachefs/journal_io.h" #include "libbcachefs/sb-members.h" #include "libbcachefs/super.h" -static void dump_usage(void) -{ - puts("bcachefs dump - dump filesystem metadata\n" - "Usage: bcachefs dump [OPTION]... <devices>\n" - "\n" - "Options:\n" - " -o output Output qcow2 image(s)\n" - " -f, --force Force; overwrite when needed\n" - " --nojournal Don't dump entire journal, just dirty entries\n" - " --noexcl Open devices with O_NOEXCL (not recommended)\n" - " -h, --help Display this help and exit\n" - "Report bugs to <linux-bcachefs@vger.kernel.org>"); -} +struct dump_dev { + ranges sb, journal, btree; +}; +typedef DARRAY(struct dump_dev) dump_devs; -static void dump_node(struct bch_fs *c, struct bch_dev *ca, struct bkey_s_c k, ranges *data) +static void dump_node(struct bch_fs *c, dump_devs *devs, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned bytes = btree_ptr_sectors_written(k) << 9 ?: c->opts.btree_node_size; bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == ca->dev_idx) - range_add(data, ptr->offset << 9, c->opts.btree_node_size); + range_add(&devs->data[ptr->dev].btree, + ptr->offset << 9, bytes); } -static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd, - bool entire_journal) +static void get_sb_journal(struct bch_fs *c, struct bch_dev *ca, + bool entire_journal, + struct dump_dev *d) { struct bch_sb *sb = ca->disk_sb.sb; - ranges data = { 0 }; - unsigned i; - int ret; /* Superblock: */ - range_add(&data, BCH_SB_LAYOUT_SECTOR << 9, + range_add(&d->sb, BCH_SB_LAYOUT_SECTOR << 9, sizeof(struct bch_sb_layout)); - for (i = 0; i < sb->layout.nr_superblocks; i++) - range_add(&data, + for (unsigned i = 0; i < sb->layout.nr_superblocks; i++) + range_add(&d->sb, le64_to_cpu(sb->layout.sb_offset[i]) << 9, vstruct_bytes(sb)); /* Journal: */ - for (i = 0; i < ca->journal.nr; i++) + for (unsigned i = 0; i < ca->journal.nr; i++) if (entire_journal || ca->journal.bucket_seq[i] >= c->journal.last_seq_ondisk) { u64 bucket = ca->journal.buckets[i]; - range_add(&data, + range_add(&d->journal, bucket_bytes(ca) * bucket, bucket_bytes(ca)); } +} + +struct dump_opts { + char *out; + bool force; + bool sanitize; + bool entire_journal; + bool noexcl; +}; + +static void sanitize_key(struct bkey_packed *k, struct bkey_format *f, void *end, + bool *modified) +{ + struct bch_val *v = bkeyp_val(f, k); + unsigned len = min_t(unsigned, end - (void *) v, bkeyp_val_bytes(f, k)); + + switch (k->type) { + case KEY_TYPE_inline_data: { + struct bch_inline_data *d = container_of(v, struct bch_inline_data, v); + + memset(&d->data[0], 0, len - offsetof(struct bch_inline_data, data)); + *modified = true; + break; + } + case KEY_TYPE_indirect_inline_data: { + struct bch_indirect_inline_data *d = container_of(v, struct bch_indirect_inline_data, v); + + memset(&d->data[0], 0, len - offsetof(struct bch_indirect_inline_data, data)); + *modified = true; + break; + } + } +} + +static void sanitize_journal(struct bch_fs *c, void *buf, size_t len) +{ + struct bkey_format f = BKEY_FORMAT_CURRENT; + void *end = buf + len; + + while (len) { + struct jset *j = buf; + bool modified = false; + + if (le64_to_cpu(j->magic) != jset_magic(c)) + break; + + vstruct_for_each(j, i) { + if ((void *) i >= end) + break; + + if (!jset_entry_is_key(i)) + continue; + + jset_entry_for_each_key(i, k) { + if ((void *) k >= end) + break; + if (!k->k.u64s) + break; + sanitize_key(bkey_to_packed(k), &f, end, &modified); + } + } + + if (modified) { + memset(&j->csum, 0, sizeof(j->csum)); + SET_JSET_CSUM_TYPE(j, 0); + } + + unsigned b = min(len, vstruct_sectors(j, c->block_bits) << 9); + len -= b; + buf += b; + } +} + +static void sanitize_btree(struct bch_fs *c, void *buf, size_t len) +{ + void *end = buf + len; + bool first = true; + struct bkey_format f_current = BKEY_FORMAT_CURRENT; + struct bkey_format f; + u64 seq; + + while (len) { + unsigned sectors; + struct bset *i; + bool modified = false; + + if (first) { + struct btree_node *bn = buf; + + if (le64_to_cpu(bn->magic) != bset_magic(c)) + break; + + i = &bn->keys; + seq = bn->keys.seq; + f = bn->format; + + sectors = vstruct_sectors(bn, c->block_bits); + } else { + struct btree_node_entry *bne = buf; + + if (bne->keys.seq != seq) + break; + + i = &bne->keys; + sectors = vstruct_sectors(bne, c->block_bits); + } + + vstruct_for_each(i, k) { + if ((void *) k >= end) + break; + if (!k->u64s) + break; + + sanitize_key(k, bkey_packed(k) ? &f : &f_current, end, &modified); + } + + if (modified) { + if (first) { + struct btree_node *bn = buf; + memset(&bn->csum, 0, sizeof(bn->csum)); + } else { + struct btree_node_entry *bne = buf; + memset(&bne->csum, 0, sizeof(bne->csum)); + } + SET_BSET_CSUM_TYPE(i, 0); + } + + first = false; + + unsigned b = min(len, sectors << 9); + len -= b; + buf += b; + } +} + +static int dump_fs(struct bch_fs *c, struct dump_opts opts) +{ + if (opts.sanitize) + printf("Sanitizing inline data extents\n"); + + dump_devs devs = {}; + while (devs.nr < c->sb.nr_devices) + darray_push(&devs, (struct dump_dev) {}); + + down_read(&c->state_lock); + + unsigned nr_online = 0; + for_each_online_member(c, ca, 0) { + if (opts.sanitize && ca->mi.bucket_size % block_sectors(c)) + die("%s has unaligned buckets, cannot sanitize", ca->name); + + get_sb_journal(c, ca, opts.entire_journal, &devs.data[ca->dev_idx]); + nr_online++; + } - /* Btree: */ - for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_trans *trans = bch2_trans_get(c); + bch_verbose(c, "walking metadata to dump"); + for (unsigned i = 0; i < BTREE_ID_NR; i++) { + CLASS(btree_trans, trans)(c); - ret = __for_each_btree_node(trans, iter, i, POS_MIN, 0, 1, 0, b, ({ + int ret = __for_each_btree_node(trans, iter, i, POS_MIN, 0, 1, 0, b, ({ struct btree_node_iter iter; struct bkey u; struct bkey_s_c k; for_each_btree_node_key_unpack(b, k, &iter, &u) - dump_node(c, ca, k, &data); + dump_node(c, &devs, k); 0; })); @@ -87,57 +232,135 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd, struct btree *b = bch2_btree_id_root(c, i)->b; if (!btree_node_fake(b)) - dump_node(c, ca, bkey_i_to_s_c(&b->key), &data); + dump_node(c, &devs, bkey_i_to_s_c(&b->key)); + } + + bch_verbose(c, "writing metadata image(s)"); + for_each_online_member(c, ca, 0) { + int flags = O_WRONLY|O_CREAT|O_TRUNC; + + if (!opts.force) + flags |= O_EXCL; + + char *path = nr_online > 1 + ? mprintf("%s.%u.qcow2", opts.out, ca->dev_idx) + : mprintf("%s.qcow2", opts.out); + int fd = xopen(path, flags, 0600); + free(path); + + struct qcow2_image img; + qcow2_image_init(&img, ca->disk_sb.bdev->bd_fd, fd, c->opts.block_size); + + struct dump_dev *d = &devs.data[ca->dev_idx]; + + qcow2_write_ranges(&img, &d->sb); + + if (!opts.sanitize) { + qcow2_write_ranges(&img, &d->journal); + qcow2_write_ranges(&img, &d->btree); + } else { + ranges_sort(&d->journal); + ranges_sort(&d->btree); + + u64 bucket_bytes = ca->mi.bucket_size << 9; + char *buf = xmalloc(bucket_bytes); - bch2_trans_put(trans); + darray_for_each(d->journal, r) { + u64 len = r->end - r->start; + BUG_ON(len > bucket_bytes); + + xpread(img.infd, buf, len, r->start); + sanitize_journal(c, buf, len); + qcow2_write_buf(&img, buf, len, r->start); + } + + darray_for_each(d->btree, r) { + u64 len = r->end - r->start; + BUG_ON(len > bucket_bytes); + + xpread(img.infd, buf, len, r->start); + sanitize_btree(c, buf, len); + qcow2_write_buf(&img, buf, len, r->start); + } + free(buf); + } + + qcow2_image_finish(&img); + xclose(fd); } - qcow2_write_image(ca->disk_sb.bdev->bd_fd, fd, &data, - max_t(unsigned, c->opts.btree_node_size / 8, block_bytes(c))); - darray_exit(&data); + up_read(&c->state_lock); + + bch2_fs_stop(c); + + darray_for_each(devs, d) { + darray_exit(&d->sb); + darray_exit(&d->journal); + darray_exit(&d->btree); + } + darray_exit(&devs); + return 0; +} + +static void dump_usage(void) +{ + puts("bcachefs dump - dump filesystem metadata\n" + "Usage: bcachefs dump [OPTION]... <devices>\n" + "\n" + "Options:\n" + " -o output Output qcow2 image(s)\n" + " -f, --force Force; overwrite when needed\n" + " -s, --sanitize Zero out inline data extents\n" + " --nojournal Don't dump entire journal, just dirty entries\n" + " --noexcl Open devices with O_NOEXCL (not recommended)\n" + " -v, --verbose\n" + " -h, --help Display this help and exit\n" + "Report bugs to <linux-bcachefs@vger.kernel.org>"); } int cmd_dump(int argc, char *argv[]) { static const struct option longopts[] = { { "force", no_argument, NULL, 'f' }, + { "sanitize", no_argument, NULL, 's' }, { "nojournal", no_argument, NULL, 'j' }, { "noexcl", no_argument, NULL, 'e' }, { "verbose", no_argument, NULL, 'v' }, { "help", no_argument, NULL, 'h' }, { NULL } }; - struct bch_opts opts = bch2_opts_empty(); - char *out = NULL; - unsigned nr_devices = 0; - bool force = false, entire_journal = true; - int fd, opt; - - opt_set(opts, direct_io, false); - opt_set(opts, read_only, true); - opt_set(opts, nochanges, true); - opt_set(opts, norecovery, true); - opt_set(opts, degraded, BCH_DEGRADED_very); - opt_set(opts, errors, BCH_ON_ERROR_continue); - opt_set(opts, fix_errors, FSCK_FIX_no); - - while ((opt = getopt_long(argc, argv, "o:fvh", + struct bch_opts fs_opts = bch2_opts_empty(); + struct dump_opts opts = { .entire_journal = true }; + int opt; + + opt_set(fs_opts, direct_io, false); + opt_set(fs_opts, read_only, true); + opt_set(fs_opts, nochanges, true); + opt_set(fs_opts, norecovery, true); + opt_set(fs_opts, degraded, BCH_DEGRADED_very); + opt_set(fs_opts, errors, BCH_ON_ERROR_continue); + opt_set(fs_opts, fix_errors, FSCK_FIX_no); + + while ((opt = getopt_long(argc, argv, "o:fsvh", longopts, NULL)) != -1) switch (opt) { case 'o': - out = optarg; + opts.out = optarg; break; case 'f': - force = true; + opts.force = true; + break; + case 's': + opts.sanitize = true; break; case 'j': - entire_journal = false; + opts.entire_journal = false; break; case 'e': - opt_set(opts, noexcl, true); + opt_set(fs_opts, noexcl, true); break; case 'v': - opt_set(opts, verbose, true); + opt_set(fs_opts, verbose, true); break; case 'h': dump_usage(); @@ -145,44 +368,19 @@ int cmd_dump(int argc, char *argv[]) } args_shift(optind); - if (!out) + if (!opts.out) die("Please supply output filename"); if (!argc) die("Please supply device(s) to check"); - darray_const_str devs = get_or_split_cmdline_devs(argc, argv); + darray_const_str dev_names = get_or_split_cmdline_devs(argc, argv); - struct bch_fs *c = bch2_fs_open(&devs, &opts); + struct bch_fs *c = bch2_fs_open(&dev_names, &fs_opts); if (IS_ERR(c)) die("error opening devices: %s", bch2_err_str(PTR_ERR(c))); - down_read(&c->state_lock); - - for_each_online_member(c, ca, 0) - nr_devices++; - - BUG_ON(!nr_devices); - - for_each_online_member(c, ca, 0) { - int flags = O_WRONLY|O_CREAT|O_TRUNC; - - if (!force) - flags |= O_EXCL; - - char *path = nr_devices > 1 - ? mprintf("%s.%u.qcow2", out, ca->dev_idx) - : mprintf("%s.qcow2", out); - fd = xopen(path, flags, 0600); - free(path); - - dump_one_device(c, ca, fd, entire_journal); - xclose(fd); - } - - up_read(&c->state_lock); - - bch2_fs_stop(c); - darray_exit(&devs); - return 0; + int ret = dump_fs(c, opts); + darray_exit(&dev_names); + return ret; } diff --git a/c_src/cmd_image.c b/c_src/cmd_image.c index d00d85cf..467378b0 100644 --- a/c_src/cmd_image.c +++ b/c_src/cmd_image.c @@ -665,7 +665,10 @@ static int image_update(const char *src_path, const char *dst_image, goto err; } - if (ftruncate(dev_opts.bdev->bd_fd, input_bytes)) { + u64 metadata_dev_size = max(input_bytes, + c->opts.btree_node_size * BCH_MIN_NR_NBUCKETS); + + if (ftruncate(dev_opts.bdev->bd_fd, metadata_dev_size)) { fprintf(stderr, "ftruncate error: %m"); goto err; } diff --git a/c_src/cmd_strip_alloc.c b/c_src/cmd_strip_alloc.c index c313b665..e16eb093 100644 --- a/c_src/cmd_strip_alloc.c +++ b/c_src/cmd_strip_alloc.c @@ -104,8 +104,9 @@ int cmd_strip_alloc(int argc, char *argv[]) struct bch_opts opts = bch2_opts_empty(); opt_set(opts, nostart, true); + struct bch_fs *c; reopen: - struct bch_fs *c = bch2_fs_open(&devs, &opts); + c = bch2_fs_open(&devs, &opts); int ret = PTR_ERR_OR_ZERO(c); if (ret) die("Error opening filesystem: %s", bch2_err_str(ret)); diff --git a/c_src/libbcachefs.c b/c_src/libbcachefs.c index 935b13ce..6b31d56f 100644 --- a/c_src/libbcachefs.c +++ b/c_src/libbcachefs.c @@ -79,9 +79,14 @@ void bch2_sb_layout_init(struct bch_sb_layout *l, } } -static u64 dev_max_bucket_size(u64 dev_size) +static u64 dev_max_bucket_size(struct bch_opts fs_opts, u64 dev_size) { - return rounddown_pow_of_two(dev_size / (BCH_MIN_NR_NBUCKETS * 4)); + u64 size = rounddown_pow_of_two(dev_size / (BCH_MIN_NR_NBUCKETS * 4)); + if (opt_defined(fs_opts, btree_node_size)) + size = max(size, fs_opts.btree_node_size); + if (size * BCH_MIN_NR_NBUCKETS > dev_size) + die("bucket size %llu too big for device size", size); + return size; } u64 bch2_pick_bucket_size(struct bch_opts opts, dev_opts_list devs) @@ -209,7 +214,7 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, darray_for_each(devs, i) if (!opt_defined(i->opts, bucket_size)) opt_set(i->opts, bucket_size, - min(fs_bucket_size, dev_max_bucket_size(i->fs_size))); + min(fs_bucket_size, dev_max_bucket_size(fs_opts, i->fs_size))); darray_for_each(devs, i) { i->nbuckets = i->fs_size / i->opts.bucket_size; diff --git a/c_src/posix_to_bcachefs.c b/c_src/posix_to_bcachefs.c index 0e7d4c29..ca44db32 100644 --- a/c_src/posix_to_bcachefs.c +++ b/c_src/posix_to_bcachefs.c @@ -439,40 +439,38 @@ static void link_file_data(struct bch_fs *c, fiemap_iter_exit(&iter); } -static struct range seek_data_aligned(int fd, u64 i_size, loff_t o, unsigned bs) +static struct range align_range(struct range r, unsigned bs) { - struct range seek_data(int fd, loff_t o) - { - s64 s = lseek(fd, o, SEEK_DATA); - if (s < 0 && errno == ENXIO) - return (struct range) {}; - if (s < 0) - die("lseek error: %m"); - - s64 e = lseek(fd, s, SEEK_HOLE); - if (e < 0 && errno == ENXIO) - e = i_size; - if (e < 0) - die("lseek error: %m"); - - return (struct range) { s, e }; - } - - struct range __seek_data_aligned(int fd, loff_t o, unsigned bs) - { - struct range r = seek_data(fd, o); + r.start = round_down(r.start, bs); + r.end = round_up(r.end, bs); + return r; +} - r.start = round_down(r.start, bs); - r.end = round_up(r.end, bs); - return r; - } +struct range seek_data(int fd, u64 i_size, loff_t o) +{ + s64 s = lseek(fd, o, SEEK_DATA); + if (s < 0 && errno == ENXIO) + return (struct range) {}; + if (s < 0) + die("lseek error: %m"); + + s64 e = lseek(fd, s, SEEK_HOLE); + if (e < 0 && errno == ENXIO) + e = i_size; + if (e < 0) + die("lseek error: %m"); + + return (struct range) { s, e }; +} - struct range r = __seek_data_aligned(fd, o, bs); +static struct range seek_data_aligned(int fd, u64 i_size, loff_t o, unsigned bs) +{ + struct range r = align_range(seek_data(fd, i_size, o), bs); if (!r.end) return r; while (true) { - struct range n = __seek_data_aligned(fd, r.end, bs); + struct range n = align_range(seek_data(fd, i_size, r.end), bs); if (!n.end || r.end < n.start) break; @@ -482,38 +480,30 @@ static struct range seek_data_aligned(int fd, u64 i_size, loff_t o, unsigned bs) return r; } -static struct range seek_mismatch_aligned(const char *buf1, const char *buf2, - unsigned offset, unsigned len, - unsigned bs) +struct range seek_mismatch(const char *buf1, const char *buf2, + unsigned o, unsigned len) { - struct range seek_mismatch(unsigned o) - { - while (o < len && buf1[o] == buf2[o]) - o++; - - if (o == len) - return (struct range) {}; + while (o < len && buf1[o] == buf2[o]) + o++; - unsigned s = o; - while (o < len && buf1[o] != buf2[o]) - o++; + if (o == len) + return (struct range) {}; - return (struct range) { s, o }; - } - - struct range __seek_mismatch_aligned(unsigned o) - { - struct range r = seek_mismatch(o); + unsigned s = o; + while (o < len && buf1[o] != buf2[o]) + o++; - r.start = round_down(r.start, bs); - r.end = round_up(r.end, bs); - return r; - } + return (struct range) { s, o }; +} - struct range r = __seek_mismatch_aligned(offset); +static struct range seek_mismatch_aligned(const char *buf1, const char *buf2, + unsigned offset, unsigned len, + unsigned bs) +{ + struct range r = align_range(seek_mismatch(buf1, buf2, offset, len), bs); if (r.end) while (true) { - struct range n = __seek_mismatch_aligned(r.end); + struct range n = align_range(seek_mismatch(buf1, buf2, r.end, len), bs); if (!n.end || r.end < n.start) break; diff --git a/c_src/qcow2.c b/c_src/qcow2.c index 30a6e056..53959a00 100644 --- a/c_src/qcow2.c +++ b/c_src/qcow2.c @@ -31,24 +31,21 @@ struct qcow2_hdr { u64 snapshots_offset; }; -struct qcow2_image { - int fd; - u32 block_size; - u64 *l1_table; - u64 l1_offset; - u32 l1_index; - u64 *l2_table; - u64 offset; -}; +static void __qcow2_write_buf(struct qcow2_image *img, void *buf, unsigned len) +{ + assert(!(len % img->block_size)); + + xpwrite(img->outfd, buf, len, img->offset, "qcow2 data"); + img->offset += len; +} static void flush_l2(struct qcow2_image *img) { if (img->l1_index != -1) { img->l1_table[img->l1_index] = cpu_to_be64(img->offset|QCOW_OFLAG_COPIED); - xpwrite(img->fd, img->l2_table, img->block_size, img->offset, - "qcow2 l2 table"); - img->offset += img->block_size; + + __qcow2_write_buf(img, img->l2_table, img->block_size); memset(img->l2_table, 0, img->block_size); img->l1_index = -1; @@ -69,66 +66,97 @@ static void add_l2(struct qcow2_image *img, u64 src_blk, u64 dst_offset) img->l2_table[l2_index] = cpu_to_be64(dst_offset|QCOW_OFLAG_COPIED); } -void qcow2_write_image(int infd, int outfd, ranges *data, - unsigned block_size) +void qcow2_write_buf(struct qcow2_image *img, void *buf, unsigned len, u64 src_offset) { - u64 image_size = get_size(infd); - unsigned l2_size = block_size / sizeof(u64); - unsigned l1_size = DIV_ROUND_UP(image_size, (u64) block_size * l2_size); - struct qcow2_hdr hdr = { 0 }; - struct qcow2_image img = { - .fd = outfd, - .block_size = block_size, - .l2_table = xcalloc(l2_size, sizeof(u64)), - .l1_table = xcalloc(l1_size, sizeof(u64)), - .l1_index = -1, - .offset = round_up(sizeof(hdr), block_size), - }; - char *buf = xmalloc(block_size); - u64 src_offset, dst_offset; - - assert(is_power_of_2(block_size)); + u64 dst_offset = img->offset; + __qcow2_write_buf(img, buf, len); + + while (len) { + add_l2(img, src_offset / img->block_size, dst_offset); + dst_offset += img->block_size; + src_offset += img->block_size; + len -= img->block_size; + } +} - ranges_roundup(data, block_size); +void qcow2_write_ranges(struct qcow2_image *img, ranges *data) +{ + ranges_roundup(data, img->block_size); ranges_sort_merge(data); + char *buf = xmalloc(img->block_size); + /* Write data: */ darray_for_each(*data, r) - for (src_offset = r->start; + for (u64 src_offset = r->start; src_offset < r->end; - src_offset += block_size) { - dst_offset = img.offset; - img.offset += img.block_size; + src_offset += img->block_size) { + xpread(img->infd, buf, img->block_size, src_offset); + qcow2_write_buf(img, buf, img->block_size, src_offset); + } + + free(buf); +} - xpread(infd, buf, block_size, src_offset); - xpwrite(outfd, buf, block_size, dst_offset, - "qcow2 data"); +void qcow2_image_init(struct qcow2_image *img, int infd, int outfd, unsigned block_size) +{ + assert(is_power_of_2(block_size)); - add_l2(&img, src_offset / block_size, dst_offset); - } + u64 image_size = get_size(infd); + unsigned l2_size = block_size / sizeof(u64); + unsigned l1_size = DIV_ROUND_UP(image_size, (u64) block_size * l2_size); + + *img = (struct qcow2_image) { + .infd = infd, + .outfd = outfd, + .image_size = image_size, + .block_size = block_size, + .l1_size = l1_size, + .l1_table = xcalloc(l1_size, sizeof(u64)), + .l1_index = -1, + .l2_table = xcalloc(l2_size, sizeof(u64)), + .offset = round_up(sizeof(struct qcow2_hdr), block_size), + }; +} + +void qcow2_image_finish(struct qcow2_image *img) +{ + char *buf = xmalloc(img->block_size); - flush_l2(&img); + flush_l2(img); /* Write L1 table: */ - dst_offset = img.offset; - img.offset += round_up(l1_size * sizeof(u64), block_size); - xpwrite(img.fd, img.l1_table, l1_size * sizeof(u64), dst_offset, + u64 dst_offset = img->offset; + img->offset += round_up(img->l1_size * sizeof(u64), img->block_size); + xpwrite(img->outfd, img->l1_table, img->l1_size * sizeof(u64), dst_offset, "qcow2 l1 table"); /* Write header: */ - hdr.magic = cpu_to_be32(QCOW_MAGIC); - hdr.version = cpu_to_be32(QCOW_VERSION); - hdr.block_bits = cpu_to_be32(ilog2(block_size)); - hdr.size = cpu_to_be64(image_size); - hdr.l1_size = cpu_to_be32(l1_size); - hdr.l1_table_offset = cpu_to_be64(dst_offset); - - memset(buf, 0, block_size); + struct qcow2_hdr hdr = { + .magic = cpu_to_be32(QCOW_MAGIC), + .version = cpu_to_be32(QCOW_VERSION), + .block_bits = cpu_to_be32(ilog2(img->block_size)), + .size = cpu_to_be64(img->image_size), + .l1_size = cpu_to_be32(img->l1_size), + .l1_table_offset = cpu_to_be64(dst_offset), + }; + + memset(buf, 0, img->block_size); memcpy(buf, &hdr, sizeof(hdr)); - xpwrite(img.fd, buf, block_size, 0, + xpwrite(img->outfd, buf, img->block_size, 0, "qcow2 header"); - free(img.l2_table); - free(img.l1_table); + free(img->l2_table); + free(img->l1_table); free(buf); } + +void qcow2_write_image(int infd, int outfd, ranges *data, + unsigned block_size) +{ + struct qcow2_image img; + + qcow2_image_init(&img, infd, outfd, block_size); + qcow2_write_ranges(&img, data); + qcow2_image_finish(&img); +} diff --git a/c_src/qcow2.h b/c_src/qcow2.h index 0943d55c..c7b35627 100644 --- a/c_src/qcow2.h +++ b/c_src/qcow2.h @@ -4,6 +4,25 @@ #include <linux/types.h> #include "tools-util.h" +struct qcow2_image { + int infd; + int outfd; + u64 image_size; + u32 block_size; + u32 l1_size; + u64 *l1_table; + u64 l1_offset; + u32 l1_index; + u64 *l2_table; + u64 offset; +}; + +void qcow2_write_buf(struct qcow2_image *, void *, unsigned, u64); +void qcow2_write_ranges(struct qcow2_image *, ranges *); + +void qcow2_image_init(struct qcow2_image *, int, int, unsigned); +void qcow2_image_finish(struct qcow2_image *); + void qcow2_write_image(int, int, ranges *, unsigned); #endif /* _QCOW2_H */ diff --git a/c_src/tools-util.c b/c_src/tools-util.c index a31adcb0..5a15f306 100644 --- a/c_src/tools-util.c +++ b/c_src/tools-util.c @@ -291,11 +291,16 @@ static int range_cmp(const void *_l, const void *_r) return 0; } +void ranges_sort(ranges *r) +{ + sort(r->data, r->nr, sizeof(r->data[0]), range_cmp, NULL); +} + void ranges_sort_merge(ranges *r) { ranges tmp = { 0 }; - sort(r->data, r->nr, sizeof(r->data[0]), range_cmp, NULL); + ranges_sort(r); /* Merge contiguous ranges: */ darray_for_each(*r, i) { diff --git a/c_src/tools-util.h b/c_src/tools-util.h index 239d7e29..b8104002 100644 --- a/c_src/tools-util.h +++ b/c_src/tools-util.h @@ -117,6 +117,7 @@ static inline void range_add(ranges *data, u64 offset, u64 size) })); } +void ranges_sort(ranges *); void ranges_sort_merge(ranges *); void ranges_roundup(ranges *, unsigned); void ranges_rounddown(ranges *, unsigned); diff --git a/debian/changelog b/debian/changelog index b1ef3bd9..a5b9d3eb 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,11 @@ +bcachefs-tools (1:1.25.3-1) unstable; urgency=medium + + New image tooling: + * bcachefs image create + * bcachefs image update + + -- Kent Overstreet <kent.overstreet@linux.dev> Sun, 20 Jul 2025 12:21:03 -0400 + bcachefs-tools (1:1.25.2-1) unstable; urgency=medium * don't pick a non power of two bucket size @@ -2,11 +2,11 @@ "nodes": { "crane": { "locked": { - "lastModified": 1742394900, - "narHash": "sha256-vVOAp9ahvnU+fQoKd4SEXB2JG2wbENkpqcwlkIXgUC0=", + "lastModified": 1752946753, + "narHash": "sha256-g5uP3jIj+STUcfTJDKYopxnSijs2agRg13H0SGL5iE4=", "owner": "ipetkov", "repo": "crane", - "rev": "70947c1908108c0c551ddfd73d4f750ff2ea67cd", + "rev": "544d09fecc8c2338542c57f3f742f1a0c8c71e13", "type": "github" }, "original": { @@ -18,11 +18,11 @@ "flake-compat": { "flake": false, "locked": { - "lastModified": 1733328505, - "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=", + "lastModified": 1747046372, + "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=", "owner": "edolstra", "repo": "flake-compat", - "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec", + "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885", "type": "github" }, "original": { @@ -36,11 +36,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1741352980, - "narHash": "sha256-+u2UunDA4Cl5Fci3m7S643HzKmIDAe+fiXrLqYsR2fs=", + "lastModified": 1751413152, + "narHash": "sha256-Tyw1RjYEsp5scoigs1384gIg6e0GoBVjms4aXFfRssQ=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "f4330d22f1c5d2ba72d3d22df5597d123fdb60a9", + "rev": "77826244401ea9de6e3bac47c2db46005e1f30b5", "type": "github" }, "original": { @@ -71,11 +71,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1742422364, - "narHash": "sha256-mNqIplmEohk5jRkqYqG19GA8MbQ/D4gQSK0Mu4LvfRQ=", + "lastModified": 1752950548, + "narHash": "sha256-NS6BLD0lxOrnCiEOcvQCDVPXafX1/ek1dfJHX1nUIzc=", "owner": "nixos", "repo": "nixpkgs", - "rev": "a84ebe20c6bc2ecbcfb000a50776219f48d134cc", + "rev": "c87b95e25065c028d31a94f06a62927d18763fdf", "type": "github" }, "original": { @@ -87,11 +87,11 @@ }, "nixpkgs-lib": { "locked": { - "lastModified": 1740877520, - "narHash": "sha256-oiwv/ZK/2FhGxrCkQkB83i7GnWXPPLzoqFHpDD3uYpk=", + "lastModified": 1751159883, + "narHash": "sha256-urW/Ylk9FIfvXfliA1ywh75yszAbiTEVgpPeinFyVZo=", "owner": "nix-community", "repo": "nixpkgs.lib", - "rev": "147dee35aab2193b174e4c0868bd80ead5ce755c", + "rev": "14a40a1d7fb9afa4739275ac642ed7301a9ba1ab", "type": "github" }, "original": { @@ -118,11 +118,11 @@ ] }, "locked": { - "lastModified": 1742524367, - "narHash": "sha256-KzTwk/5ETJavJZYV1DEWdCx05M4duFCxCpRbQSKWpng=", + "lastModified": 1752979888, + "narHash": "sha256-qRRP3QavbwW0o+LOh31QNEfCgPlzK5SKlWALUJL6T7E=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "70bf752d176b2ce07417e346d85486acea9040ef", + "rev": "95719de18aefa63a624bf75a1ff98744b089ec12", "type": "github" }, "original": { @@ -138,11 +138,11 @@ ] }, "locked": { - "lastModified": 1742370146, - "narHash": "sha256-XRE8hL4vKIQyVMDXykFh4ceo3KSpuJF3ts8GKwh5bIU=", + "lastModified": 1753006367, + "narHash": "sha256-tzbhc4XttkyEhswByk5R38l+ztN9UDbnj0cTcP6Hp9A=", "owner": "numtide", "repo": "treefmt-nix", - "rev": "adc195eef5da3606891cedf80c0d9ce2d3190808", + "rev": "421b56313c65a0815a52b424777f55acf0b56ddf", "type": "github" }, "original": { @@ -245,13 +245,13 @@ bcachefs-tools-fuse-i686-linux ; - cargo-clippy = common.craneLib.cargoClippy ( - common.args - // { - inherit (common) cargoArtifacts; - cargoClippyExtraArgs = "--all-targets --all-features -- --deny warnings"; - } - ); + #cargo-clippy = common.craneLib.cargoClippy ( + # common.args + # // { + # inherit (common) cargoArtifacts; + # cargoClippyExtraArgs = "--all-targets --all-features -- --deny warnings"; + # } + #); # we have to build our own `craneLib.cargoTest` cargo-test = common.craneLib.mkCargoDerivation ( diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7c2ec1b2..7cf1a833 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -87,7 +87,7 @@ struct super_block { }; static inline void evict_inodes(struct super_block *sb) {} -static inline int sync_filesystem(struct super_block *) { return 0; } +static inline int sync_filesystem(struct super_block *sb) { return 0; } /* * File types diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c index 307824d6..8f970dc1 100644 --- a/libbcachefs/acl.c +++ b/libbcachefs/acl.c @@ -138,8 +138,8 @@ static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans, acl = allocate_dropping_locks(trans, ret, posix_acl_alloc(count, _gfp)); - if (!acl) - return ERR_PTR(-ENOMEM); + if (!acl && !ret) + ret = bch_err_throw(trans->c, ENOMEM_acl); if (ret) { kfree(acl); return ERR_PTR(ret); diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 4c1604fd..afc0ab75 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -20,6 +20,7 @@ #include "enumerated_ref.h" #include "error.h" #include "lru.h" +#include "progress.h" #include "recovery.h" #include "varint.h" @@ -337,9 +338,10 @@ void bch2_alloc_v4_swab(struct bkey_s k) } static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, - unsigned dev, const struct bch_alloc_v4 *a) + struct bkey_s_c k, + const struct bch_alloc_v4 *a) { - struct bch_dev *ca = c ? bch2_dev_tryget_noerror(c, dev) : NULL; + struct bch_dev *ca = c ? bch2_dev_tryget_noerror(c, k.k->p.inode) : NULL; prt_newline(out); printbuf_indent_add(out, 2); @@ -348,11 +350,14 @@ static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs * bch2_prt_data_type(out, a->data_type); prt_newline(out); prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty); - prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty); + if (bkey_val_bytes(k.k) > offsetof(struct bch_alloc_v4, journal_seq_empty)) + prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty); + prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); - prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); + if (bkey_val_bytes(k.k) > offsetof(struct bch_alloc_v4, stripe_sectors)) + prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); prt_printf(out, "cached_sectors %u\n", a->cached_sectors); prt_printf(out, "stripe %u\n", a->stripe); prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); @@ -372,12 +377,12 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c struct bch_alloc_v4 _a; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); - __bch2_alloc_v4_to_text(out, c, k.k->p.inode, a); + __bch2_alloc_v4_to_text(out, c, k, a); } void bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - __bch2_alloc_v4_to_text(out, c, k.k->p.inode, bkey_s_c_to_alloc_v4(k).v); + __bch2_alloc_v4_to_text(out, c, k, bkey_s_c_to_alloc_v4(k).v); } void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) @@ -385,7 +390,7 @@ void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) if (k.k->type == KEY_TYPE_alloc_v4) { void *src, *dst; - *out = *bkey_s_c_to_alloc_v4(k).v; + bkey_val_copy(out, bkey_s_c_to_alloc_v4(k)); src = alloc_v4_backpointers(out); SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); @@ -1732,12 +1737,16 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_alloc)); + CLASS(btree_trans, trans)(c); int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed)) ?: - bch2_check_stripe_to_lru_refs(trans); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + progress_update_iter(trans, &progress, &iter); + bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed); + }))?: bch2_check_stripe_to_lru_refs(trans); bch2_bkey_buf_exit(&last_flushed, c); return ret; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 8a6f886b..45c15bda 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -1277,4 +1277,11 @@ static inline int bch2_fs_casefold_enabled(struct bch_fs *c) return 0; } +static inline const char *strip_bch2(const char *msg) +{ + if (!strncmp("bch2_", msg, 5)) + return msg + 5; + return msg; +} + #endif /* _BCACHEFS_H */ diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 34cb8a43..e95bb684 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -44,10 +44,6 @@ #include <linux/rcupdate.h> #include <linux/sched/task.h> -#define DROP_THIS_NODE 10 -#define DROP_PREV_NODE 11 -#define DID_FILL_FROM_SCAN 12 - /* * Returns true if it's a btree we can easily reconstruct, or otherwise won't * cause data loss if it's missing: @@ -252,7 +248,7 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * return ret; *pulled_from_scan = cur->data->min_key; - ret = DID_FILL_FROM_SCAN; + ret = bch_err_throw(c, topology_repair_did_fill_from_scan); } else { if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, "btree node with incorrect min_key%s", buf.buf)) @@ -263,7 +259,7 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */ if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node, "btree node overwritten by next node%s", buf.buf)) - ret = DROP_PREV_NODE; + ret = bch_err_throw(c, topology_repair_drop_prev_node); } else { if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key, "btree node with incorrect max_key%s", buf.buf)) @@ -274,7 +270,7 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */ if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node, "btree node overwritten by prev node%s", buf.buf)) - ret = DROP_THIS_NODE; + ret = bch_err_throw(c, topology_repair_drop_this_node); } else { if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, "btree node with incorrect min_key%s", buf.buf)) @@ -314,7 +310,7 @@ static int btree_repair_node_end(struct btree_trans *trans, struct btree *b, return ret; *pulled_from_scan = b->key.k.p; - ret = DID_FILL_FROM_SCAN; + ret = bch_err_throw(c, topology_repair_did_fill_from_scan); } else { ret = set_node_max(c, child, b->key.k.p); } @@ -391,15 +387,15 @@ again: ret = lockrestart_do(trans, btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan)); - if (ret < 0) + if (ret && !bch2_err_matches(ret, BCH_ERR_topology_repair)) goto err; - if (ret == DID_FILL_FROM_SCAN) { + if (bch2_err_matches(ret, BCH_ERR_topology_repair_did_fill_from_scan)) { new_pass = true; ret = 0; } - if (ret == DROP_THIS_NODE) { + if (bch2_err_matches(ret, BCH_ERR_topology_repair_drop_this_node)) { six_unlock_read(&cur->c.lock); bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, @@ -414,7 +410,7 @@ again: six_unlock_read(&prev->c.lock); prev = NULL; - if (ret == DROP_PREV_NODE) { + if (bch2_err_matches(ret, BCH_ERR_topology_repair_drop_prev_node)) { bch_info(c, "dropped prev node"); bch2_btree_node_evict(trans, prev_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, @@ -436,7 +432,7 @@ again: BUG_ON(cur); ret = lockrestart_do(trans, btree_repair_node_end(trans, b, prev, pulled_from_scan)); - if (ret == DID_FILL_FROM_SCAN) { + if (bch2_err_matches(ret, BCH_ERR_topology_repair_did_fill_from_scan)) { new_pass = true; ret = 0; } @@ -477,7 +473,7 @@ again: six_unlock_read(&cur->c.lock); cur = NULL; - if (ret == DROP_THIS_NODE) { + if (bch2_err_matches(ret, BCH_ERR_topology_repair_drop_this_node)) { bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); @@ -504,7 +500,7 @@ again: if (mustfix_fsck_err_on(!have_child, c, btree_node_topology_interior_node_empty, "empty interior btree node at %s", buf.buf)) - ret = DROP_THIS_NODE; + ret = bch_err_throw(c, topology_repair_drop_this_node); err: fsck_err: if (!IS_ERR_OR_NULL(prev)) @@ -521,7 +517,8 @@ fsck_err: bch2_bkey_buf_exit(&prev_k, c); bch2_bkey_buf_exit(&cur_k, c); - bch_err_fn(c, ret); + if (!bch2_err_matches(ret, BCH_ERR_topology_repair)) + bch_err_fn(c, ret); return ret; } @@ -592,7 +589,7 @@ recover: ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan); six_unlock_read(&b->c.lock); - if (ret == DROP_THIS_NODE) { + if (bch2_err_matches(ret, BCH_ERR_topology_repair_drop_this_node)) { scoped_guard(mutex, &c->btree_cache.lock) bch2_btree_node_hash_remove(&c->btree_cache, b); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index bd86dd71..83c83608 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1405,10 +1405,8 @@ static void btree_node_read_work(struct work_struct *work) ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), &failed, &rb->pick, -1); - if (ret <= 0) { - set_btree_node_read_error(b); + if (ret <= 0) break; - } ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read); rb->have_ioref = ca != NULL; @@ -1442,27 +1440,21 @@ start: bch2_maybe_corrupt_bio(bio, bch2_btree_read_corrupt_ratio); ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf); - if (ret == -BCH_ERR_btree_node_read_err_want_retry || - ret == -BCH_ERR_btree_node_read_err_must_retry) - continue; - - if (ret) - set_btree_node_read_error(b); - - break; + if (ret != -BCH_ERR_btree_node_read_err_want_retry && + ret != -BCH_ERR_btree_node_read_err_must_retry) + break; } bch2_io_failures_to_text(&buf, c, &failed); - if (btree_node_read_error(b)) - bch2_btree_lost_data(c, &buf, b->c.btree_id); - /* * only print retry success if we read from a replica with no errors */ - if (btree_node_read_error(b)) + if (ret) { + set_btree_node_read_error(b); + bch2_btree_lost_data(c, &buf, b->c.btree_id); prt_printf(&buf, "ret %s", bch2_err_str(ret)); - else if (failed.nr) { + } else if (failed.nr) { if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev)) prt_printf(&buf, "retry success"); else diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index cc771aff..a282c388 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -2860,8 +2860,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre !bkey_deleted(k.k) && (k2 = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) { k = k2; - if (!bkey_err(k)) - iter->k = *k.k; + if (bkey_err(k)) + goto out; + iter->k = *k.k; } if (unlikely(k.k->type == KEY_TYPE_whiteout && diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index ebba14da..d61b7820 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -254,11 +254,13 @@ static int btree_key_cache_create(struct btree_trans *trans, struct bkey_i *new_k = allocate_dropping_locks(trans, ret, kmalloc(key_u64s * sizeof(u64), _gfp)); - if (unlikely(!new_k)) { + if (unlikely(!new_k && !ret)) { bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", bch2_btree_id_str(ck->key.btree_id), key_u64s); ret = bch_err_throw(c, ENOMEM_btree_key_cache_fill); - } else if (ret) { + } + + if (unlikely(ret)) { kfree(new_k); goto err; } @@ -407,7 +409,7 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, btree_node_unlock(trans, path, 0); path->l[0].b = ERR_PTR(ret); } - } else { + } else if (!(flags & BTREE_ITER_cached_nofill)) { BUG_ON(path->uptodate); BUG_ON(!path->nodes_locked); } diff --git a/libbcachefs/btree_node_scan.c b/libbcachefs/btree_node_scan.c index d997e381..4b7b5ca7 100644 --- a/libbcachefs/btree_node_scan.c +++ b/libbcachefs/btree_node_scan.c @@ -158,14 +158,6 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX) return; - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); - bio->bi_iter.bi_sector = offset; - bch2_bio_map(bio, b->data, c->opts.btree_node_size); - - submit_time = local_clock(); - submit_bio_wait(bio); - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); - rcu_read_lock(); struct found_btree_node n = { .btree_id = BTREE_NODE_ID(bn), @@ -182,6 +174,14 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, }; rcu_read_unlock(); + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); + bio->bi_iter.bi_sector = offset; + bch2_bio_map(bio, b->data, c->opts.btree_node_size); + + submit_time = local_clock(); + submit_bio_wait(bio); + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); + found_btree_node_to_key(&b->key, &n); CLASS(printbuf, buf)(); @@ -270,6 +270,9 @@ static int read_btree_nodes(struct find_btree_nodes *f) int ret = 0; closure_init_stack(&cl); + CLASS(printbuf, buf)(); + + prt_printf(&buf, "scanning for btree nodes on"); for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) { if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree))) @@ -295,10 +298,14 @@ static int read_btree_nodes(struct find_btree_nodes *f) break; } + prt_printf(&buf, " %s", ca->name); + closure_get(&cl); enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); wake_up_process(t); } + + bch_notice(c, "%s", buf.buf); err: while (closure_sync_timeout(&cl, sysctl_hung_task_timeout_secs * HZ / 2)) ; diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c index 1f9965ae..58590ccc 100644 --- a/libbcachefs/btree_trans_commit.c +++ b/libbcachefs/btree_trans_commit.c @@ -772,12 +772,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, trans->journal_res.offset += trans->journal_entries.u64s; trans->journal_res.u64s -= trans->journal_entries.u64s; - memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_write_buffer_keys, - BTREE_ID_accounting, 0, - trans->accounting.u64s)->_data, - btree_trans_subbuf_base(trans, &trans->accounting), - trans->accounting.u64s); + if (trans->accounting.u64s) + memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_write_buffer_keys, + BTREE_ID_accounting, 0, + trans->accounting.u64s)->_data, + btree_trans_subbuf_base(trans, &trans->accounting), + trans->accounting.u64s); if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq; @@ -1065,11 +1066,15 @@ int __bch2_trans_commit(struct btree_trans *trans, enum bch_trans_commit_flags f EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - journal_u64s = jset_u64s(trans->accounting.u64s); + journal_u64s = 0; + trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); if (trans->journal_transaction_names) journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); + if (trans->accounting.u64s) + journal_u64s += jset_u64s(trans->accounting.u64s); + trans_for_each_update(trans, i) { struct btree_path *path = trans->paths + i->path; diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 312ef203..e4aa4fa7 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -14,6 +14,7 @@ #include "btree_locking.h" #include "buckets.h" #include "clock.h" +#include "disk_groups.h" #include "enumerated_ref.h" #include "error.h" #include "extents.h" @@ -277,6 +278,36 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, bch2_trans_node_drop(trans, b); } +static bool can_use_btree_node(struct bch_fs *c, + struct disk_reservation *res, + unsigned target, + struct bkey_s_c k) +{ + if (!bch2_bkey_devs_rw(c, k)) + return false; + + if (target && !bch2_bkey_in_target(c, k, target)) + return false; + + unsigned durability = bch2_bkey_durability(c, k); + + if (durability >= res->nr_replicas) + return true; + + struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_btree, target); + + guard(rcu)(); + + unsigned durability_available = 0, i; + for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, i); + if (ca) + durability_available += ca->mi.durability; + } + + return durability >= durability_available; +} + static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct disk_reservation *res, struct closure *cl, @@ -303,10 +334,14 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, mutex_lock(&c->btree_reserve_cache_lock); if (c->btree_reserve_cache_nr > nr_reserve) { for (struct btree_alloc *a = c->btree_reserve_cache; - a < c->btree_reserve_cache + c->btree_reserve_cache_nr; - a++) { - if (target && !bch2_bkey_in_target(c, bkey_i_to_s_c(&a->k), target)) + a < c->btree_reserve_cache + c->btree_reserve_cache_nr;) { + /* check if it has sufficient durability */ + + if (!can_use_btree_node(c, res, target, bkey_i_to_s_c(&a->k))) { + bch2_open_buckets_put(c, &a->ob); + *a = c->btree_reserve_cache[--c->btree_reserve_cache_nr]; continue; + } bkey_copy(&b->key, &a->k); b->ob = a->ob; diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c index 5185794f..1c6d0cdc 100644 --- a/libbcachefs/clock.c +++ b/libbcachefs/clock.c @@ -21,7 +21,7 @@ static const struct min_heap_callbacks callbacks = { void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) { - guard(spinlock)(&clock->timer_lock); + spin_lock(&clock->timer_lock); if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) { spin_unlock(&clock->timer_lock); @@ -31,9 +31,11 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) for (size_t i = 0; i < clock->timers.nr; i++) if (clock->timers.data[i] == timer) - return; + goto out; BUG_ON(!min_heap_push(&clock->timers, &timer, &callbacks, NULL)); +out: + spin_unlock(&clock->timer_lock); } void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index 97d7655a..33cb94f7 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -462,7 +462,7 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * struct btree *b) { if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); + printbuf_tabstop_push(out, 36); prt_printf(out, "%px ", b); bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level); diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index 373d382b..efb58d2d 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -26,6 +26,13 @@ struct bch_inode_info; #if IS_ENABLED(CONFIG_UNICODE) int bch2_casefold(struct btree_trans *, const struct bch_hash_info *, const struct qstr *, struct qstr *); +#else +static inline int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, + const struct qstr *str, struct qstr *out_cf) +{ + return bch_err_throw(trans->c, no_casefolding_without_utf8); +} +#endif static inline int bch2_maybe_casefold(struct btree_trans *trans, const struct bch_hash_info *info, @@ -38,14 +45,6 @@ static inline int bch2_maybe_casefold(struct btree_trans *trans, return bch2_casefold(trans, info, str, out_cf); } } -#else -static inline int bch2_maybe_casefold(struct btree_trans *trans, - const struct bch_hash_info *info, - const struct qstr *str, struct qstr *out_cf) -{ - return bch_err_throw(trans->c, no_casefolding_without_utf8); -} -#endif struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent); diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 62dda821..bea14f02 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -2060,6 +2060,9 @@ allocated: BUG_ON(trans->restarted); return h; err: + if (waiting && + !bch2_err_matches(ret, BCH_ERR_operation_blocked)) + closure_wake_up(&c->freelist_wait); bch2_ec_stripe_head_put(c, h); return ERR_PTR(ret); } diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index 2de0dc91..cec8b0f4 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -90,6 +90,8 @@ x(ENOMEM, ENOMEM_disk_accounting) \ x(ENOMEM, ENOMEM_stripe_head_alloc) \ x(ENOMEM, ENOMEM_journal_read_bucket) \ + x(ENOMEM, ENOMEM_acl) \ + x(ENOMEM, ENOMEM_move_extent) \ x(ENOSPC, ENOSPC_disk_reservation) \ x(ENOSPC, ENOSPC_bucket_alloc) \ x(ENOSPC, ENOSPC_disk_label_add) \ @@ -216,9 +218,13 @@ x(EINVAL, varint_decode_error) \ x(EINVAL, erasure_coding_found_btree_node) \ x(EINVAL, option_negative) \ + x(EINVAL, topology_repair) \ + x(BCH_ERR_topology_repair, topology_repair_drop_this_node) \ + x(BCH_ERR_topology_repair, topology_repair_drop_prev_node) \ + x(BCH_ERR_topology_repair, topology_repair_did_fill_from_scan) \ x(EOPNOTSUPP, may_not_use_incompat_feature) \ x(EOPNOTSUPP, no_casefolding_without_utf8) \ - x(EOPNOTSUPP, casefolding_disabled) \ + x(EOPNOTSUPP, casefolding_disabled) \ x(EOPNOTSUPP, casefold_opt_is_dir_only) \ x(EOPNOTSUPP, unsupported_fsx_flag) \ x(EOPNOTSUPP, unsupported_fa_flag) \ diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index b36ecfc0..b879a586 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -282,9 +282,9 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (have_pick) return 1; - if (!have_dirty_ptrs) + if (!have_dirty_ptrs && !bkey_is_btree_ptr(k.k)) return 0; - if (have_missing_devs) + if (have_missing_devs || !have_dirty_ptrs) return bch_err_throw(c, no_device_to_read_from); if (have_csum_errors) return bch_err_throw(c, data_read_csum_err); @@ -1006,6 +1006,20 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned return NULL; } +bool bch2_bkey_devs_rw(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + guard(rcu)(); + bkey_for_each_ptr(ptrs, ptr) { + CLASS(bch2_dev_tryget, ca)(c, ptr->dev); + if (!ca || ca->mi.state != BCH_MEMBER_STATE_rw) + return false; + } + + return true; +} + bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index f212f91c..35ee03cd 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -614,6 +614,8 @@ static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsig return (void *) bch2_bkey_has_device_c(k.s_c, dev); } +bool bch2_bkey_devs_rw(struct bch_fs *, struct bkey_s_c); + bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); bool bch2_bkey_in_target(struct bch_fs *, struct bkey_s_c, unsigned); diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c index 73d44875..e53fee05 100644 --- a/libbcachefs/fs-io-direct.c +++ b/libbcachefs/fs-io-direct.c @@ -127,7 +127,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) * the dirtying of requests that are internal from the kernel (i.e. from * loopback), because we'll deadlock on page_lock. */ - dio->should_dirty = iter_is_iovec(iter); + dio->should_dirty = user_backed_iter(iter); blk_start_plug(&plug); diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index cc203752..93ad33f0 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -223,9 +223,8 @@ static int bch2_flush_inode(struct bch_fs *c, if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync)) return -EROFS; - CLASS(btree_trans, trans)(c); u64 seq; - int ret = commit_do(trans, NULL, NULL, 0, + int ret = bch2_trans_commit_do(c, NULL, NULL, 0, bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?: bch2_inode_flush_nocow_writes(c, inode); diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 2789b30a..56b7126b 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -1295,8 +1295,14 @@ static int bch2_fill_extent(struct bch_fs *c, flags| FIEMAP_EXTENT_DELALLOC| FIEMAP_EXTENT_UNWRITTEN); + } else if (k.k->type == KEY_TYPE_error) { + return 0; } else { - BUG(); + WARN_ONCE(1, "unhandled key type %s", + k.k->type < KEY_TYPE_MAX + ? bch2_bkey_types[k.k->type] + : "(unknown)"); + return 0; } } diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index df0aa252..40fc3c4e 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -15,6 +15,7 @@ #include "io_misc.h" #include "keylist.h" #include "namei.h" +#include "progress.h" #include "recovery_passes.h" #include "snapshot.h" #include "super.h" @@ -1331,11 +1332,16 @@ int bch2_check_inodes(struct bch_fs *c) CLASS(btree_trans, trans)(c); CLASS(snapshots_seen, s)(); + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_inodes)); + return for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_inode(trans, &iter, k, &snapshot_root, &s)); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + progress_update_iter(trans, &progress, &iter); + check_inode(trans, &iter, k, &snapshot_root, &s); + })); } static int find_oldest_inode_needs_reattach(struct btree_trans *trans, @@ -1422,12 +1428,17 @@ fsck_err: */ int bch2_check_unreachable_inodes(struct bch_fs *c) { + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_inodes)); + CLASS(btree_trans, trans)(c); return for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_unreachable_inode(trans, &iter, k)); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + progress_update_iter(trans, &progress, &iter); + check_unreachable_inode(trans, &iter, k); + })); } static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode) @@ -1975,6 +1986,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, } } + ret = check_extent_overbig(trans, iter, k); + if (ret) + goto err; + ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) goto err; @@ -2017,12 +2032,15 @@ int bch2_check_extents(struct bch_fs *c) CLASS(inode_walker, w)(); CLASS(extent_ends, extent_ends)(); + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_extents)); + int ret = for_each_btree_key(trans, iter, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ + progress_update_iter(trans, &progress, &iter); bch2_disk_reservation_put(c, &res); - check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?: - check_extent_overbig(trans, &iter, k); + check_extent(trans, &iter, k, &w, &s, &extent_ends, &res); })) ?: check_i_sectors_notnested(trans, &w); @@ -2035,11 +2053,15 @@ int bch2_check_indirect_extents(struct bch_fs *c) CLASS(btree_trans, trans)(c); struct disk_reservation res = { 0 }; + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_reflink)); + int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, POS_MIN, BTREE_ITER_prefetch, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + progress_update_iter(trans, &progress, &iter); bch2_disk_reservation_put(c, &res); check_extent_overbig(trans, &iter, k); })); @@ -2448,15 +2470,20 @@ int bch2_check_dirents(struct bch_fs *c) CLASS(snapshots_seen, s)(); CLASS(inode_walker, dir)(); CLASS(inode_walker, target)(); + struct progress_indicator_state progress; bool need_second_pass = false, did_second_pass = false; int ret; again: + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_dirents)); + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + progress_update_iter(trans, &progress, &iter); check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s, - &need_second_pass)) ?: + &need_second_pass); + })) ?: check_subdir_count_notnested(trans, &dir); if (!ret && need_second_pass && !did_second_pass) { @@ -2516,13 +2543,18 @@ int bch2_check_xattrs(struct bch_fs *c) CLASS(btree_trans, trans)(c); CLASS(inode_walker, inode)(); + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_xattrs)); + int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - check_xattr(trans, &iter, k, &hash_info, &inode)); + BCH_TRANS_COMMIT_no_enospc, ({ + progress_update_iter(trans, &progress, &iter); + check_xattr(trans, &iter, k, &hash_info, &inode); + })); return ret; } @@ -2664,10 +2696,16 @@ err: int bch2_check_subvolume_structure(struct bch_fs *c) { CLASS(btree_trans, trans)(c); + + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_subvolumes)); + return for_each_btree_key_commit(trans, iter, BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_subvol_path(trans, &iter, k)); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + progress_update_iter(trans, &progress, &iter); + check_subvol_path(trans, &iter, k); + })); } static int bch2_bi_depth_renumber_one(struct btree_trans *trans, diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c index d7620138..44b02d4b 100644 --- a/libbcachefs/io_write.c +++ b/libbcachefs/io_write.c @@ -89,7 +89,12 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) new = ewma_add(old, io_latency, 5); } while (!atomic64_try_cmpxchg(latency, &old, new)); - bch2_congested_acct(ca, io_latency, now, rw); + /* + * Only track read latency for congestion accounting: writes are subject + * to heavy queuing delays from page cache writeback: + */ + if (rw == READ) + bch2_congested_acct(ca, io_latency, now, rw); __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now); } diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 3ba1f9fd..f9e2e1a4 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -182,6 +182,8 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) void bch2_journal_do_writes(struct journal *j) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + for (u64 seq = journal_last_unwritten_seq(j); seq <= journal_cur_seq(j); seq++) { @@ -196,6 +198,7 @@ void bch2_journal_do_writes(struct journal *j) if (!journal_state_seq_count(j, j->reservations, seq)) { j->seq_write_started = seq; w->write_started = true; + closure_get(&c->cl); closure_call(&w->io, bch2_journal_write, j->wq, NULL); } @@ -1063,6 +1066,8 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open ? ERR_PTR(-EAGAIN) : buf; + if (!IS_ERR(ret)) + smp_mb(); break; } } @@ -1467,6 +1472,10 @@ int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq) last_seq = cur_seq; u64 nr = cur_seq - last_seq; + if (nr * sizeof(struct journal_entry_pin_list) > 1U << 30) { + bch_err(c, "too many ntjournal fifo (%llu open entries)", nr); + return bch_err_throw(c, ENOMEM_journal_pin_fifo); + } /* * Extra fudge factor, in case we crashed when the journal pin fifo was @@ -1479,7 +1488,7 @@ int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq) nr = max(nr, JOURNAL_PIN); init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); + bch_err(c, "error allocating journal fifo (%llu open entries)", nr); return bch_err_throw(c, ENOMEM_journal_pin_fifo); } diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index b46b9718..c05aa942 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -267,7 +267,7 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u { union journal_res_state s; - s.v = atomic64_sub_return(((union journal_res_state) { + s.v = atomic64_sub_return_release(((union journal_res_state) { .buf0_count = idx == 0, .buf1_count = idx == 1, .buf2_count = idx == 2, diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 2835250a..47224666 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1820,6 +1820,8 @@ static CLOSURE_CALLBACK(journal_write_done) if (do_discards) bch2_do_discards(c); + + closure_put(&c->cl); } static void journal_write_endio(struct bio *bio) diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index be50455c..f23e5ee9 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -874,7 +874,34 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, --type) if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) { *did_work = true; - return ret; + + /* + * Question from Dan Carpenter, on the early return: + * + * If journal_flush_pins_or_still_flushing() returns + * true, then the flush hasn't complete and we must + * return 0; we want the outer closure_wait_event() in + * journal_flush_pins() to continue. + * + * The early return is there because we don't want to + * call journal_entry_close() until we've finished + * flushing all outstanding journal pins - otherwise + * seq_to_flush can be U64_MAX, and we'll close a bunch + * of journal entries and write tiny ones completely + * unnecessarily. + * + * Having the early return be in the loop where we loop + * over types is important, because flushing one journal + * pin can cause new journal pins to be added (even of + * the same type, btree node writes may generate more + * btree node writes, when updating the parent pointer + * has a full node and has to trigger a split/compact). + * + * This is part of our shutdown sequence, where order of + * flushing is important in order to make sure that it + * terminates... + */ + return 0; } if (seq_to_flush > journal_cur_seq(j)) diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c index ee14656c..76109b37 100644 --- a/libbcachefs/lru.c +++ b/libbcachefs/lru.c @@ -9,6 +9,7 @@ #include "ec.h" #include "error.h" #include "lru.h" +#include "progress.h" #include "recovery.h" /* KEY_TYPE_lru is obsolete: */ @@ -207,11 +208,16 @@ int bch2_check_lrus(struct bch_fs *c) bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_lru)); + CLASS(btree_trans, trans)(c); int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_lru_key(trans, &iter, k, &last_flushed)); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + progress_update_iter(trans, &progress, &iter); + bch2_check_lru_key(trans, &iter, k, &last_flushed); + })); bch2_bkey_buf_exit(&last_flushed, c); return ret; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 3f44bb54..84a228c4 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -330,7 +330,7 @@ int bch2_move_extent(struct moving_context *ctxt, { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; - int ret = -ENOMEM; + int ret = 0; if (trace_io_move_enabled()) trace_io_move2(c, k, &io_opts, &data_opts); @@ -351,11 +351,10 @@ int bch2_move_extent(struct moving_context *ctxt, struct moving_io *io = allocate_dropping_locks(trans, ret, kzalloc(sizeof(struct moving_io), _gfp)); - if (!io) - goto err; - + if (!io && !ret) + ret = bch_err_throw(c, ENOMEM_move_extent); if (ret) - goto err_free; + goto err; INIT_LIST_HEAD(&io->io_list); io->write.ctxt = ctxt; @@ -366,7 +365,7 @@ int bch2_move_extent(struct moving_context *ctxt, ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, &io_opts, data_opts, iter->btree_id, k); if (ret) - goto err_free; + goto err; io->write.op.end_io = move_write_done; } else { @@ -380,7 +379,7 @@ int bch2_move_extent(struct moving_context *ctxt, ret = bch2_data_update_bios_init(&io->write, c, &io_opts); if (ret) - goto err_free; + goto err; } io->write.rbio.bio.bi_end_io = move_read_endio; @@ -423,9 +422,8 @@ int bch2_move_extent(struct moving_context *ctxt, BCH_READ_last_fragment, data_opts.scrub ? data_opts.read_dev : -1); return 0; -err_free: - kfree(io); err: + kfree(io); if (bch2_err_matches(ret, EROFS) || bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; @@ -795,50 +793,50 @@ out: return ret; } -int __bch2_move_data(struct moving_context *ctxt, - struct bbpos start, - struct bbpos end, - move_pred_fn pred, void *arg) +static int bch2_move_data(struct bch_fs *c, + struct bbpos start, + struct bbpos end, + unsigned min_depth, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc, + move_pred_fn pred, void *arg) { - struct bch_fs *c = ctxt->trans->c; - enum btree_id id; int ret = 0; - for (id = start.btree; + struct moving_context ctxt; + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); + + for (enum btree_id id = start.btree; id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); id++) { - ctxt->stats->pos = BBPOS(id, POS_MIN); + ctxt.stats->pos = BBPOS(id, POS_MIN); - if (!btree_type_has_ptrs(id) || - !bch2_btree_id_root(c, id)->b) + if (!bch2_btree_id_root(c, id)->b) continue; - ret = bch2_move_data_btree(ctxt, - id == start.btree ? start.pos : POS_MIN, - id == end.btree ? end.pos : POS_MAX, - pred, arg, id, 0); + unsigned min_depth_this_btree = min_depth; + + if (!btree_type_has_ptrs(id)) + min_depth_this_btree = max(min_depth_this_btree, 1); + + for (unsigned level = min_depth_this_btree; + level < BTREE_MAX_DEPTH; + level++) { + ret = bch2_move_data_btree(&ctxt, + id == start.btree ? start.pos : POS_MIN, + id == end.btree ? end.pos : POS_MAX, + pred, arg, id, level); + if (ret) + break; + } + if (ret) break; } - return ret; -} - -int bch2_move_data(struct bch_fs *c, - struct bbpos start, - struct bbpos end, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc, - move_pred_fn pred, void *arg) -{ - struct moving_context ctxt; - - bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - int ret = __bch2_move_data(&ctxt, start, end, pred, arg); bch2_moving_ctxt_exit(&ctxt); - return ret; } @@ -1206,14 +1204,6 @@ static bool migrate_pred(struct bch_fs *c, void *arg, return data_opts->rewrite_ptrs != 0; } -static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts); -} - /* * Ancient versions of bcachefs produced packed formats which could represent * keys that the in memory format cannot represent; this checks for those @@ -1293,15 +1283,6 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, return data_opts->kill_ptrs != 0; } -static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), - io_opts, data_opts); -} - static bool scrub_pred(struct bch_fs *c, void *_arg, enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, @@ -1359,14 +1340,11 @@ int bch2_data_job(struct bch_fs *c, case BCH_DATA_OP_rereplicate: stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); - ret = bch2_move_btree(c, start, end, - rereplicate_btree_pred, c, stats) ?: ret; - ret = bch2_move_data(c, start, end, - NULL, - stats, + ret = bch2_move_data(c, start, end, 0, NULL, stats, writepoint_hashed((unsigned long) current), true, rereplicate_pred, c) ?: ret; + bch2_btree_interior_updates_flush(c); ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_migrate: @@ -1389,12 +1367,10 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_scan_old_btree_nodes(c, stats); break; case BCH_DATA_OP_drop_extra_replicas: - ret = bch2_move_btree(c, start, end, - drop_extra_replicas_btree_pred, c, stats) ?: ret; - ret = bch2_move_data(c, start, end, NULL, stats, - writepoint_hashed((unsigned long) current), - true, - drop_extra_replicas_pred, c) ?: ret; + ret = bch2_move_data(c, start, end, 0, NULL, stats, + writepoint_hashed((unsigned long) current), + true, + drop_extra_replicas_pred, c) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; break; default: diff --git a/libbcachefs/move.h b/libbcachefs/move.h index fe92ca6d..481026ff 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -128,18 +128,6 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos, move_pred_fn, void *, enum btree_id, unsigned); -int __bch2_move_data(struct moving_context *, - struct bbpos, - struct bbpos, - move_pred_fn, void *); -int bch2_move_data(struct bch_fs *, - struct bbpos start, - struct bbpos end, - struct bch_ratelimit *, - struct bch_move_stats *, - struct write_point_specifier, - bool, - move_pred_fn, void *); int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned, struct bch_ratelimit *, struct bch_move_stats *, diff --git a/libbcachefs/progress.c b/libbcachefs/progress.c index 42353067..792fc6fe 100644 --- a/libbcachefs/progress.c +++ b/libbcachefs/progress.c @@ -52,7 +52,8 @@ void bch2_progress_update_iter(struct btree_trans *trans, : 0; prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", - msg, percent, s->nodes_seen, s->nodes_total); + strip_bch2(msg), + percent, s->nodes_seen, s->nodes_total); bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); bch_info(c, "%s", buf.buf); diff --git a/libbcachefs/progress.h b/libbcachefs/progress.h index 23fb1811..972a7308 100644 --- a/libbcachefs/progress.h +++ b/libbcachefs/progress.h @@ -26,4 +26,7 @@ void bch2_progress_update_iter(struct btree_trans *, struct btree_iter *, const char *); +#define progress_update_iter(trans, p, iter) \ + bch2_progress_update_iter(trans, p, iter, __func__) + #endif /* _BCACHEFS_PROGRESS_H */ diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index 32fa7cf9..c7e7f508 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -15,6 +15,7 @@ #include "inode.h" #include "io_write.h" #include "move.h" +#include "progress.h" #include "rebalance.h" #include "subvolume.h" #include "super-io.h" @@ -858,7 +859,12 @@ int bch2_check_rebalance_work(struct bch_fs *c) bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_rebalance_work)); + while (!ret) { + progress_update_iter(trans, &progress, &rebalance_iter); + bch2_trans_begin(trans); ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed); diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index a8eea478..304473da 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -37,78 +37,82 @@ int bch2_btree_lost_data(struct bch_fs *c, struct printbuf *msg, enum btree_id btree) { - u64 b = BIT_ULL(btree); int ret = 0; guard(mutex)(&c->sb_lock); + bool write_sb = false; struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - if (!(c->sb.btrees_lost_data & b)) { + if (!(c->sb.btrees_lost_data & BIT_ULL(btree))) { prt_printf(msg, "flagging btree "); bch2_btree_id_to_text(msg, btree); prt_printf(msg, " lost data\n"); - ext->btrees_lost_data |= cpu_to_le64(b); + write_sb |= !__test_and_set_bit_le64(btree, &ext->btrees_lost_data); } /* Once we have runtime self healing for topology errors we won't need this: */ - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0, &write_sb) ?: ret; /* Btree node accounting will be off: */ - __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret; + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0, &write_sb) ?: ret; #ifdef CONFIG_BCACHEFS_DEBUG /* * These are much more minor, and don't need to be corrected right away, * but in debug mode we want the next fsck run to be clean: */ - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0, &write_sb) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0, &write_sb) ?: ret; #endif + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent); + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_backpointer_to_missing_ptr, ext->errors_silent); + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); + switch (btree) { case BTREE_ID_alloc: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - - __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret; + + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); goto out; case BTREE_ID_backpointers: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0, &write_sb) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0, &write_sb) ?: ret; goto out; case BTREE_ID_need_discard: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret; goto out; case BTREE_ID_freespace: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret; goto out; case BTREE_ID_bucket_gens: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret; goto out; case BTREE_ID_lru: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret; goto out; case BTREE_ID_accounting: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0, &write_sb) ?: ret; goto out; case BTREE_ID_snapshots: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0, &write_sb) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0, &write_sb) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0, &write_sb) ?: ret; goto out; default: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0, &write_sb) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0, &write_sb) ?: ret; goto out; } out: - bch2_write_super(c); + if (write_sb) + bch2_write_super(c); return ret; } diff --git a/libbcachefs/recovery_passes.c b/libbcachefs/recovery_passes.c index f9d1c492..bd442652 100644 --- a/libbcachefs/recovery_passes.c +++ b/libbcachefs/recovery_passes.c @@ -340,7 +340,8 @@ static bool recovery_pass_needs_set(struct bch_fs *c, int __bch2_run_explicit_recovery_pass(struct bch_fs *c, struct printbuf *out, enum bch_recovery_pass pass, - enum bch_run_recovery_pass_flags flags) + enum bch_run_recovery_pass_flags flags, + bool *write_sb) { struct bch_fs_recovery *r = &c->recovery; int ret = 0; @@ -362,7 +363,8 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c, if (!(flags & RUN_RECOVERY_PASS_nopersistent)) { struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); + *write_sb |= !__test_and_set_bit_le64(bch2_recovery_pass_to_stable(pass), + ext->recovery_passes_required); } if (pass < BCH_RECOVERY_PASS_set_may_go_rw && @@ -408,14 +410,19 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass, enum bch_run_recovery_pass_flags flags) { - int ret = 0; + /* + * With RUN_RECOVERY_PASS_ratelimit, recovery_pass_needs_set needs + * sb_lock + */ + if (!(flags & RUN_RECOVERY_PASS_ratelimit) && + !recovery_pass_needs_set(c, pass, &flags)) + return 0; - if (recovery_pass_needs_set(c, pass, &flags)) { - guard(mutex)(&c->sb_lock); - ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); + guard(mutex)(&c->sb_lock); + bool write_sb = false; + int ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags, &write_sb); + if (write_sb) bch2_write_super(c); - } - return ret; } @@ -438,14 +445,13 @@ int bch2_require_recovery_pass(struct bch_fs *c, return 0; enum bch_run_recovery_pass_flags flags = 0; - int ret = 0; - if (recovery_pass_needs_set(c, pass, &flags)) { - ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); + bool write_sb = false; + int ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags, &write_sb) ?: + bch_err_throw(c, recovery_pass_will_run); + if (write_sb) bch2_write_super(c); - } - - return ret ?: bch_err_throw(c, recovery_pass_will_run); + return ret; } int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) @@ -459,8 +465,10 @@ int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pa bch2_log_msg_start(c, &buf); guard(mutex)(&c->sb_lock); + bool write_sb = false; int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass, - RUN_RECOVERY_PASS_nopersistent); + RUN_RECOVERY_PASS_nopersistent, + &write_sb); bch2_print_str(c, KERN_NOTICE, buf.buf); return ret; @@ -631,6 +639,8 @@ void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]); prt_passes(out, "Current passes", r->passes_to_run); } + + prt_printf(out, "Pass done:\t%s\n", bch2_recovery_passes[r->pass_done]); } void bch2_fs_recovery_passes_init(struct bch_fs *c) diff --git a/libbcachefs/recovery_passes.h b/libbcachefs/recovery_passes.h index 2117f0ce..4f2c2f81 100644 --- a/libbcachefs/recovery_passes.h +++ b/libbcachefs/recovery_passes.h @@ -30,7 +30,8 @@ int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pas int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, enum bch_recovery_pass, - enum bch_run_recovery_pass_flags); + enum bch_run_recovery_pass_flags, + bool *); int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, enum bch_recovery_pass, enum bch_run_recovery_pass_flags); diff --git a/libbcachefs/sb-members_format.h b/libbcachefs/sb-members_format.h index fb72ad73..b2b89268 100644 --- a/libbcachefs/sb-members_format.h +++ b/libbcachefs/sb-members_format.h @@ -17,7 +17,7 @@ UUID_INIT(0xffffffff, 0xffff, 0xffff, \ 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) -#define BCH_MIN_NR_NBUCKETS (1 << 6) +#define BCH_MIN_NR_NBUCKETS (1 << 9) #define BCH_IOPS_MEASUREMENTS() \ x(seqread, 0) \ diff --git a/libbcachefs/str_hash.c b/libbcachefs/str_hash.c index dfe4b6ae..3e08e55d 100644 --- a/libbcachefs/str_hash.c +++ b/libbcachefs/str_hash.c @@ -329,7 +329,6 @@ duplicate_entries: out: fsck_err: bch2_trans_iter_exit(trans, dup_iter); - printbuf_exit(&buf); if (free_snapshots_seen) darray_exit(&s->ids); return ret; diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 4e038f65..b3b2d835 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -514,6 +514,10 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) if (ret) return ret; + ret = bch2_fs_mark_dirty(c); + if (ret) + return ret; + clear_bit(BCH_FS_clean_shutdown, &c->flags); scoped_guard(rcu) @@ -537,10 +541,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_journal_space_available(&c->journal); } - ret = bch2_fs_mark_dirty(c); - if (ret) - return ret; - /* * Don't jump to our error path, and call bch2_fs_read_only(), unless we * successfully marked the filesystem dirty @@ -729,6 +729,8 @@ void __bch2_fs_stop(struct bch_fs *c) cancel_work_sync(&ca->io_error_work); cancel_work_sync(&c->read_only_work); + + flush_work(&c->btree_interior_update_work); } void bch2_fs_free(struct bch_fs *c) diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 158f526e..bd3fa9c3 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -18,6 +18,7 @@ #include "btree_key_cache.h" #include "btree_update.h" #include "btree_update_interior.h" +#include "btree_write_buffer.h" #include "btree_gc.h" #include "buckets.h" #include "clock.h" @@ -150,6 +151,7 @@ write_attribute(trigger_journal_flush); write_attribute(trigger_journal_writes); write_attribute(trigger_btree_cache_shrink); write_attribute(trigger_btree_key_cache_shrink); +write_attribute(trigger_btree_write_buffer_flush); write_attribute(trigger_btree_updates); write_attribute(trigger_freelist_wakeup); write_attribute(trigger_recalc_capacity); @@ -539,6 +541,11 @@ STORE(bch2_fs) c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc); } + if (attr == &sysfs_trigger_btree_write_buffer_flush) + bch2_trans_do(c, + (bch2_btree_write_buffer_flush_sync(trans), + bch2_trans_begin(trans))); + if (attr == &sysfs_trigger_gc) bch2_gc_gens(c); @@ -709,6 +716,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_journal_writes, &sysfs_trigger_btree_cache_shrink, &sysfs_trigger_btree_key_cache_shrink, + &sysfs_trigger_btree_write_buffer_flush, &sysfs_trigger_btree_updates, &sysfs_trigger_freelist_wakeup, &sysfs_trigger_recalc_capacity, diff --git a/libbcachefs/util.h b/libbcachefs/util.h index 768528c2..52ac8230 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -733,6 +733,13 @@ static inline bool test_bit_le64(size_t bit, __le64 *addr) return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0; } +static inline bool __test_and_set_bit_le64(size_t bit, __le64 *addr) +{ + bool ret = test_bit_le64(bit, addr); + __set_bit_le64(bit, addr); + return ret; +} + static inline void memcpy_swab(void *_dst, void *_src, size_t len) { u8 *dst = _dst + len; diff --git a/linux/closure.c b/linux/closure.c index 2bfe7d2a..4fb78d18 100644 --- a/linux/closure.c +++ b/linux/closure.c @@ -13,23 +13,25 @@ #include <linux/seq_file.h> #include <linux/sched/debug.h> -static inline void closure_put_after_sub_checks(int flags) +static inline void closure_put_after_sub_checks(struct closure *cl, int flags) { int r = flags & CLOSURE_REMAINING_MASK; if (WARN(flags & CLOSURE_GUARD_MASK, - "closure has guard bits set: %x (%u)", + "closure %ps has guard bits set: %x (%u)", + cl->fn, flags & CLOSURE_GUARD_MASK, (unsigned) __fls(r))) r &= ~CLOSURE_GUARD_MASK; WARN(!r && (flags & ~CLOSURE_DESTRUCTOR), - "closure ref hit 0 with incorrect flags set: %x (%u)", + "closure %ps ref hit 0 with incorrect flags set: %x (%u)", + cl->fn, flags & ~CLOSURE_DESTRUCTOR, (unsigned) __fls(flags)); } static inline void closure_put_after_sub(struct closure *cl, int flags) { - closure_put_after_sub_checks(flags); + closure_put_after_sub_checks(cl, flags); if (!(flags & CLOSURE_REMAINING_MASK)) { smp_acquire__after_ctrl_dep(); @@ -167,7 +169,7 @@ void __sched closure_return_sync(struct closure *cl) unsigned flags = atomic_sub_return_release(1 + CLOSURE_RUNNING - CLOSURE_DESTRUCTOR, &cl->remaining); - closure_put_after_sub_checks(flags); + closure_put_after_sub_checks(cl, flags); if (unlikely(flags & CLOSURE_REMAINING_MASK)) { while (1) { |