summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.bcachefs_revision2
-rw-r--r--bch_bindgen/src/bkey.rs2
-rw-r--r--c_src/cmd_fs.c317
-rw-r--r--libbcachefs/bcachefs.h17
-rw-r--r--libbcachefs/bcachefs_format.h11
-rw-r--r--libbcachefs/bkey_methods.c6
-rw-r--r--libbcachefs/bkey_types.h5
-rw-r--r--libbcachefs/btree_cache.c4
-rw-r--r--libbcachefs/btree_io.c2
-rw-r--r--libbcachefs/btree_iter.c72
-rw-r--r--libbcachefs/btree_iter.h6
-rw-r--r--libbcachefs/btree_trans_commit.c2
-rw-r--r--libbcachefs/btree_update.c67
-rw-r--r--libbcachefs/btree_update_interior.c14
-rw-r--r--libbcachefs/data_update.c9
-rw-r--r--libbcachefs/data_update.h1
-rw-r--r--libbcachefs/extents.c17
-rw-r--r--libbcachefs/extents.h1
-rw-r--r--libbcachefs/fs.c10
-rw-r--r--libbcachefs/fsck.c4
-rw-r--r--libbcachefs/lru.h10
-rw-r--r--libbcachefs/move.c16
-rw-r--r--libbcachefs/movinggc.c188
-rw-r--r--libbcachefs/rebalance.c93
-rw-r--r--libbcachefs/sb-counters_format.h3
-rw-r--r--libbcachefs/super.c19
26 files changed, 572 insertions, 326 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 386580af..abe206fd 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-68b6a68ecf72dba7035a888ad1247fd04d3cc98e
+933c0b52a810e410c1c871dacaaaa0f6a5d67f62
diff --git a/bch_bindgen/src/bkey.rs b/bch_bindgen/src/bkey.rs
index 0c4786eb..1c98b029 100644
--- a/bch_bindgen/src/bkey.rs
+++ b/bch_bindgen/src/bkey.rs
@@ -51,6 +51,7 @@ pub enum BkeyValC<'a> {
logged_op_finsert(&'a c::bch_logged_op_finsert),
accounting(&'a c::bch_accounting),
inode_alloc_cursor(&'a c::bch_inode_alloc_cursor),
+ extent_whiteout,
}
impl<'a, 'b> BkeySC<'a> {
@@ -109,6 +110,7 @@ impl<'a, 'b> BkeySC<'a> {
KEY_TYPE_logged_op_finsert => logged_op_finsert(transmute(self.v)),
KEY_TYPE_accounting => accounting(transmute(self.v)),
KEY_TYPE_inode_alloc_cursor => inode_alloc_cursor(transmute(self.v)),
+ KEY_TYPE_extent_whiteout => extent_whiteout,
KEY_TYPE_MAX => unreachable!(),
}
}
diff --git a/c_src/cmd_fs.c b/c_src/cmd_fs.c
index aa825e90..4eca866a 100644
--- a/c_src/cmd_fs.c
+++ b/c_src/cmd_fs.c
@@ -18,96 +18,85 @@
#include "libbcachefs/darray.h"
-static void __dev_usage_type_to_text(struct printbuf *out,
- enum bch_data_type type,
- unsigned bucket_size,
- u64 buckets, u64 sectors, u64 frag)
-{
- bch2_prt_data_type(out, type);
- prt_char(out, ':');
- prt_tab(out);
-
- prt_units_u64(out, sectors << 9);
- prt_tab_rjust(out);
-
- prt_printf(out, "%llu", buckets);
- prt_tab_rjust(out);
-
- if (frag) {
- prt_units_u64(out, frag << 9);
- prt_tab_rjust(out);
- }
- prt_newline(out);
-}
-
-static void dev_usage_type_to_text(struct printbuf *out,
- struct bch_ioctl_dev_usage_v2 *u,
- enum bch_data_type type)
-{
- u64 sectors = 0;
- switch (type) {
- case BCH_DATA_free:
- case BCH_DATA_need_discard:
- case BCH_DATA_need_gc_gens:
- /* sectors are 0 for these types so calculate sectors for them */
- sectors = u->d[type].buckets * u->bucket_size;
- break;
- default:
- sectors = u->d[type].sectors;
- }
-
- __dev_usage_type_to_text(out, type,
- u->bucket_size,
- u->d[type].buckets,
- sectors,
- u->d[type].fragmented);
-}
+#define FS_USAGE_FIELDS() \
+ x(replicas) \
+ x(btree) \
+ x(compression) \
+ x(rebalance_work) \
+ x(devices)
+
+enum __fs_usage_fields {
+#define x(n) __FS_USAGE_##n,
+ FS_USAGE_FIELDS()
+#undef x
+};
+
+enum fs_usage_fields {
+#define x(n) FS_USAGE_##n = BIT(__FS_USAGE_##n),
+ FS_USAGE_FIELDS()
+#undef x
+};
+
+const char * const fs_usage_field_strs[] = {
+#define x(n) [__FS_USAGE_##n] = #n,
+ FS_USAGE_FIELDS()
+#undef x
+ NULL
+};
static void dev_usage_to_text(struct printbuf *out,
struct bchfs_handle fs,
- struct dev_name *d)
+ struct dev_name *d,
+ bool full)
{
struct bch_ioctl_dev_usage_v2 *u = bchu_dev_usage(fs, d->idx);
- prt_newline(out);
- prt_printf(out, "%s (device %u):", d->label ?: "(no label)", d->idx);
- prt_tab(out);
- prt_str(out, d->dev ?: "(device not found)");
- prt_tab_rjust(out);
-
- prt_str(out, bch2_member_states[u->state]);
- prt_tab_rjust(out);
-
- prt_newline(out);
+ u64 used = 0, capacity = u->nr_buckets * u->bucket_size;
+ for (unsigned type = 0; type < u->nr_data_types; type++) {
+ if (!data_type_is_empty(type))
+ used += u->d[type].sectors;
+ }
- printbuf_indent_add(out, 2);
- prt_tab(out);
+ prt_printf(out, "%s (device %u):\t%s\r%s\r %02u%%\n",
+ d->label ?: "(no label)", d->idx,
+ d->dev ?: "(device not found)",
+ bch2_member_states[u->state],
+ (unsigned) (used * 100 / capacity));
- prt_str(out, "data");
- prt_tab_rjust(out);
+ if (full) {
+ printbuf_indent_add(out, 2);
+ prt_printf(out, "\tdata\rbuckets\rfragmented\r\n");
- prt_str(out, "buckets");
- prt_tab_rjust(out);
+ for (unsigned type = 0; type < u->nr_data_types; type++) {
+ bch2_prt_data_type(out, type);
+ prt_printf(out, ":\t");
- prt_str(out, "fragmented");
- prt_tab_rjust(out);
+ /* sectors are 0 for empty bucket data types, so calculate sectors for them */
+ u64 sectors = data_type_is_empty(type)
+ ? u->d[type].buckets * u->bucket_size
+ : u->d[type].sectors;
+ prt_units_u64(out, sectors << 9);
- prt_newline(out);
+ prt_printf(out, "\r%llu\r", u->d[type].buckets);
- for (unsigned i = 0; i < u->nr_data_types; i++)
- dev_usage_type_to_text(out, u, i);
+ u64 fragmented = u->d[type].buckets * u->bucket_size - sectors;
+ if (fragmented)
+ prt_units_u64(out, fragmented << 9);
+ prt_printf(out, "\r\n");
+ }
- prt_str(out, "capacity:");
- prt_tab(out);
+ prt_printf(out, "capacity:\t");
+ prt_units_u64(out, (u->nr_buckets * u->bucket_size) << 9);
+ prt_printf(out, "\r%llu\r\n", u->nr_buckets);
- prt_units_u64(out, (u->nr_buckets * u->bucket_size) << 9);
- prt_tab_rjust(out);
- prt_printf(out, "%llu", u->nr_buckets);
- prt_tab_rjust(out);
+ prt_printf(out, "bucket size:\t");
+ prt_units_u64(out, u->bucket_size << 9);
+ prt_printf(out, "\r\n");
- printbuf_indent_sub(out, 2);
+ printbuf_indent_sub(out, 2);
+ prt_newline(out);
+ }
- prt_newline(out);
free(u);
}
@@ -124,19 +113,21 @@ static int dev_by_label_cmp(const void *_l, const void *_r)
static void devs_usage_to_text(struct printbuf *out,
struct bchfs_handle fs,
- dev_names dev_names)
+ dev_names dev_names,
+ bool full)
{
sort(dev_names.data, dev_names.nr,
sizeof(dev_names.data[0]), dev_by_label_cmp, NULL);
printbuf_tabstops_reset(out);
+ prt_newline(out);
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 20);
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 14);
darray_for_each(dev_names, dev)
- dev_usage_to_text(out, fs, dev);
+ dev_usage_to_text(out, fs, dev, full);
darray_for_each(dev_names, dev) {
free(dev->dev);
@@ -150,14 +141,9 @@ static void persistent_reserved_to_text(struct printbuf *out,
if (!sectors)
return;
- prt_str(out, "reserved:");
- prt_tab(out);
- prt_printf(out, "%u/%u ", 1, nr_replicas);
- prt_tab(out);
- prt_str(out, "[] ");
+ prt_printf(out, "reserved:\t%u/%u\t[] ", 1, nr_replicas);
prt_units_u64(out, sectors << 9);
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "\r\n");
}
static void replicas_usage_to_text(struct printbuf *out,
@@ -190,21 +176,12 @@ static void replicas_usage_to_text(struct printbuf *out,
*d++ = '\0';
bch2_prt_data_type(out, r->data_type);
- prt_char(out, ':');
- prt_tab(out);
-
- prt_printf(out, "%u/%u ", r->nr_required, r->nr_devs);
- prt_tab(out);
-
- prt_printf(out, "%u ", durability);
- prt_tab(out);
-
- prt_printf(out, "%s ", devs);
- prt_tab(out);
+ prt_printf(out, ":\t%u/%u\t%u\t%s\t",
+ r->nr_required, r->nr_devs,
+ durability, devs);
prt_units_u64(out, sectors << 9);
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "\r\n");
}
#define for_each_usage_replica(_u, _r) \
@@ -251,15 +228,27 @@ static void accounting_swab_if_old(struct bch_ioctl_query_accounting *in)
static int fs_usage_v1_to_text(struct printbuf *out,
struct bchfs_handle fs,
- dev_names dev_names)
+ dev_names dev_names,
+ enum fs_usage_fields fields)
{
- struct bch_ioctl_query_accounting *a =
- bchu_fs_accounting(fs,
- BIT(BCH_DISK_ACCOUNTING_persistent_reserved)|
+ unsigned accounting_types = 0;
+
+ if (fields & FS_USAGE_replicas)
+ accounting_types |=
BIT(BCH_DISK_ACCOUNTING_replicas)|
- BIT(BCH_DISK_ACCOUNTING_compression)|
- BIT(BCH_DISK_ACCOUNTING_btree)|
- BIT(BCH_DISK_ACCOUNTING_rebalance_work));
+ BIT(BCH_DISK_ACCOUNTING_persistent_reserved);
+
+ if (fields & FS_USAGE_compression)
+ accounting_types |= BIT(BCH_DISK_ACCOUNTING_compression);
+
+ if (fields & FS_USAGE_btree)
+ accounting_types |= BIT(BCH_DISK_ACCOUNTING_btree);
+
+ if (fields & FS_USAGE_rebalance_work)
+ accounting_types |= BIT(BCH_DISK_ACCOUNTING_rebalance_work);
+
+ struct bch_ioctl_query_accounting *a =
+ bchu_fs_accounting(fs, accounting_types);
if (!a)
return -1;
@@ -277,45 +266,27 @@ static int fs_usage_v1_to_text(struct printbuf *out,
printbuf_tabstop_push(out, 20);
printbuf_tabstop_push(out, 16);
- prt_str(out, "Size:");
- prt_tab(out);
+ prt_printf(out, "Size:\t");
prt_units_u64(out, a->capacity << 9);
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "\r\n");
- prt_str(out, "Used:");
- prt_tab(out);
+ prt_printf(out, "Used:\t");
prt_units_u64(out, a->used << 9);
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "\r\n");
- prt_str(out, "Online reserved:");
- prt_tab(out);
+ prt_printf(out, "Online reserved:\t");
prt_units_u64(out, a->online_reserved << 9);
- prt_tab_rjust(out);
- prt_newline(out);
-
- prt_newline(out);
-
- printbuf_tabstops_reset(out);
-
- printbuf_tabstop_push(out, 16);
- prt_str(out, "Data type");
- prt_tab(out);
-
- printbuf_tabstop_push(out, 16);
- prt_str(out, "Required/total");
- prt_tab(out);
-
- printbuf_tabstop_push(out, 14);
- prt_str(out, "Durability");
- prt_tab(out);
-
- printbuf_tabstop_push(out, 14);
- prt_str(out, "Devices");
- prt_newline(out);
-
- printbuf_tabstop_push(out, 14);
+ prt_printf(out, "\r\n");
+
+ if (fields & FS_USAGE_replicas) {
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 14);
+ printbuf_tabstop_push(out, 14);
+ printbuf_tabstop_push(out, 14);
+ prt_printf(out, "\nData type\tRequired/total\tDurability\tDevices\n");
+ }
unsigned prev_type = 0;
@@ -364,8 +335,7 @@ static int fs_usage_v1_to_text(struct printbuf *out,
prt_units_u64(out, nr_extents
? div_u64(sectors_uncompressed << 9, nr_extents)
: 0);
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "\r\n");
break;
case BCH_DISK_ACCOUNTING_btree:
if (new_type) {
@@ -376,8 +346,7 @@ static int fs_usage_v1_to_text(struct printbuf *out,
}
prt_printf(out, "%s:\t", bch2_btree_id_str(acc_k.btree.id));
prt_units_u64(out, a->v.d[0] << 9);
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "\r\n");
break;
case BCH_DISK_ACCOUNTING_rebalance_work:
if (new_type)
@@ -395,7 +364,8 @@ static int fs_usage_v1_to_text(struct printbuf *out,
static void fs_usage_v0_to_text(struct printbuf *out,
struct bchfs_handle fs,
- dev_names dev_names)
+ dev_names dev_names,
+ enum fs_usage_fields fields)
{
struct bch_ioctl_fs_usage *u = bchu_fs_usage(fs);
@@ -410,20 +380,17 @@ static void fs_usage_v0_to_text(struct printbuf *out,
prt_str(out, "Size:");
prt_tab(out);
prt_units_u64(out, u->capacity << 9);
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "\r\n");
prt_str(out, "Used:");
prt_tab(out);
prt_units_u64(out, u->used << 9);
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "\r\n");
prt_str(out, "Online reserved:");
prt_tab(out);
prt_units_u64(out, u->online_reserved << 9);
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "\r\n");
prt_newline(out);
@@ -473,43 +440,34 @@ static void fs_usage_v0_to_text(struct printbuf *out,
free(u);
}
-static void fs_usage_to_text(struct printbuf *out, const char *path)
+static void fs_usage_to_text(struct printbuf *out, const char *path,
+ enum fs_usage_fields fields)
{
struct bchfs_handle fs = bcache_fs_open(path);
dev_names dev_names = bchu_fs_get_devices(fs);
- if (!fs_usage_v1_to_text(out, fs, dev_names))
+ if (!fs_usage_v1_to_text(out, fs, dev_names, fields))
goto devs;
- fs_usage_v0_to_text(out, fs, dev_names);
+ fs_usage_v0_to_text(out, fs, dev_names, fields);
devs:
- devs_usage_to_text(out, fs, dev_names);
+ devs_usage_to_text(out, fs, dev_names, fields & FS_USAGE_devices);
darray_exit(&dev_names);
bcache_fs_close(fs);
}
-int fs_usage(void)
-{
- puts("bcachefs fs - manage a running filesystem\n"
- "Usage: bcachefs fs <CMD> [OPTIONS]\n"
- "\n"
- "Commands:\n"
- " usage Display detailed filesystem usage\n"
- " top Show runtime performance information\n"
- "\n"
- "Report bugs to <linux-bcachefs@vger.kernel.org>");
- return 0;
-}
-
static void fs_usage_usage(void)
{
puts("bcachefs fs usage - display detailed filesystem usage\n"
"Usage: bcachefs fs usage [OPTION]... <mountpoint>\n"
"\n"
"Options:\n"
+ " -f, --fields=FIELDS List of accounting sections to print\n"
+ " replicas,btree,compression,rebalance_work,devices"
+ " -a Print all accounting fields\n"
" -h, --human-readable Human readable units\n"
" -H, --help Display this help and exit\n"
"Report bugs to <linux-bcachefs@vger.kernel.org>");
@@ -518,18 +476,26 @@ static void fs_usage_usage(void)
int cmd_fs_usage(int argc, char *argv[])
{
static const struct option longopts[] = {
- { "help", no_argument, NULL, 'H' },
+ { "fields", required_argument, NULL, 'f' },
+ { "all", no_argument, NULL, 'a' },
{ "human-readable", no_argument, NULL, 'h' },
+ { "help", no_argument, NULL, 'H' },
{ NULL }
};
bool human_readable = false;
+ unsigned fields = 0;
struct printbuf buf = PRINTBUF;
char *fs;
int opt;
- while ((opt = getopt_long(argc, argv, "h",
+ while ((opt = getopt_long(argc, argv, "f:ahH",
longopts, NULL)) != -1)
switch (opt) {
+ case 'f':
+ fields |= read_flag_list_or_die(optarg, fs_usage_field_strs, "fields");
+ break;
+ case 'a':
+			fields = ~0; break;
case 'h':
human_readable = true;
break;
@@ -545,13 +511,13 @@ int cmd_fs_usage(int argc, char *argv[])
if (!argc) {
printbuf_reset(&buf);
buf.human_readable_units = human_readable;
- fs_usage_to_text(&buf, ".");
+ fs_usage_to_text(&buf, ".", fields);
printf("%s", buf.buf);
} else {
while ((fs = arg_pop())) {
printbuf_reset(&buf);
buf.human_readable_units = human_readable;
- fs_usage_to_text(&buf, fs);
+ fs_usage_to_text(&buf, fs, fields);
printf("%s", buf.buf);
}
}
@@ -560,6 +526,19 @@ int cmd_fs_usage(int argc, char *argv[])
return 0;
}
+int fs_usage(void)
+{
+ puts("bcachefs fs - manage a running filesystem\n"
+ "Usage: bcachefs fs <CMD> [OPTIONS]\n"
+ "\n"
+ "Commands:\n"
+ " usage Display detailed filesystem usage\n"
+ " top Show runtime performance information\n"
+ "\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+ return 0;
+}
+
int fs_cmds(int argc, char *argv[])
{
char *cmd = pop_cmd(&argc, argv);
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index cdf593c5..16d08dfb 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -386,14 +386,6 @@ do { \
##__VA_ARGS__, bch2_err_str(_ret)); \
} while (0)
-static inline int __bch2_err_trace(struct bch_fs *c, int err)
-{
- trace_error_throw(c, err, _THIS_IP_);
- return err;
-}
-
-#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err)
-
/* Parameters that are useful for debugging, but should always be compiled in: */
#define BCH_DEBUG_PARAMS_ALWAYS() \
BCH_DEBUG_PARAM(key_merging_disabled, \
@@ -1153,6 +1145,15 @@ struct bch_fs {
struct mutex fsck_error_counts_lock;
};
+static inline int __bch2_err_trace(struct bch_fs *c, int err)
+{
+ this_cpu_inc(c->counters[BCH_COUNTER_error_throw]);
+ trace_error_throw(c, err, _THIS_IP_);
+ return err;
+}
+
+#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err)
+
extern struct wait_queue_head bch2_read_only_wait;
static inline bool bch2_ro_ref_tryget(struct bch_fs *c)
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index b4a04df5..a8f59522 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -423,7 +423,8 @@ enum bch_bkey_type_flags {
x(logged_op_truncate, 32, BKEY_TYPE_strict_btree_checks) \
x(logged_op_finsert, 33, BKEY_TYPE_strict_btree_checks) \
x(accounting, 34, BKEY_TYPE_strict_btree_checks) \
- x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks)
+ x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks) \
+ x(extent_whiteout, 36, BKEY_TYPE_strict_btree_checks)
enum bch_bkey_type {
#define x(name, nr, ...) KEY_TYPE_##name = nr,
@@ -440,6 +441,10 @@ struct bch_whiteout {
struct bch_val v;
};
+struct bch_extent_whiteout {
+ struct bch_val v;
+};
+
struct bch_error {
struct bch_val v;
};
@@ -700,7 +705,8 @@ struct bch_sb_field_ext {
x(extent_flags, BCH_VERSION(1, 25)) \
x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \
x(fast_device_removal, BCH_VERSION(1, 27)) \
- x(inode_has_case_insensitive, BCH_VERSION(1, 28))
+ x(inode_has_case_insensitive, BCH_VERSION(1, 28)) \
+ x(extent_snapshot_whiteouts, BCH_VERSION(1, 29))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -1340,6 +1346,7 @@ enum btree_id_flags {
BTREE_IS_snapshots| \
BTREE_IS_data, \
BIT_ULL(KEY_TYPE_whiteout)| \
+ BIT_ULL(KEY_TYPE_extent_whiteout)| \
BIT_ULL(KEY_TYPE_error)| \
BIT_ULL(KEY_TYPE_cookie)| \
BIT_ULL(KEY_TYPE_extent)| \
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index fcd8c82c..75d73677 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -41,6 +41,10 @@ static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k,
.key_validate = deleted_key_validate, \
})
+#define bch2_bkey_ops_extent_whiteout ((struct bkey_ops) { \
+ .key_validate = deleted_key_validate, \
+})
+
static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k,
struct bkey_validate_context from)
{
@@ -203,7 +207,7 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
? bch2_bkey_types[k.k->type]
: "(unknown)");
- if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
+ if (btree_node_type_is_extents(type) && !bkey_extent_whiteout(k.k)) {
bkey_fsck_err_on(k.k->size == 0,
c, bkey_extent_size_zero,
"size == 0");
diff --git a/libbcachefs/bkey_types.h b/libbcachefs/bkey_types.h
index b4f328f9..88a48ce6 100644
--- a/libbcachefs/bkey_types.h
+++ b/libbcachefs/bkey_types.h
@@ -44,6 +44,11 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
#define bkey_whiteout(_k) \
((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
+#define bkey_extent_whiteout(_k) \
+ ((_k)->type == KEY_TYPE_deleted || \
+ (_k)->type == KEY_TYPE_whiteout || \
+ (_k)->type == KEY_TYPE_extent_whiteout)
+
/* bkey with split value, const */
struct bkey_s_c {
const struct bkey *k;
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 9261ad04..3b1d694d 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -804,7 +804,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
goto got_node;
}
- b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
+ b = __btree_node_mem_alloc(c, GFP_NOWAIT);
if (b) {
bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT);
} else {
@@ -842,7 +842,7 @@ got_node:
mutex_unlock(&bc->lock);
- if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
+ if (btree_node_data_alloc(c, b, GFP_NOWAIT)) {
bch2_trans_unlock(trans);
if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
goto err;
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 8a03cd75..276cf088 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -131,7 +131,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
BUG_ON(size > c->opts.btree_node_size);
*used_mempool = false;
- p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
+ p = kvmalloc(size, GFP_NOWAIT);
if (!p) {
*used_mempool = true;
p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index a67babf6..8962c481 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -2450,10 +2450,27 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
continue;
}
- if (bkey_whiteout(k.k) &&
- !(iter->flags & BTREE_ITER_nofilter_whiteouts)) {
- search_key = bkey_successor(iter, k.k->p);
- continue;
+ if (!(iter->flags & BTREE_ITER_nofilter_whiteouts)) {
+ /*
+ * KEY_TYPE_extent_whiteout indicates that there
+ * are no extents that overlap with this
+ * whiteout - meaning bkey_start_pos() is
+ * monotonically increasing when including
+ * KEY_TYPE_extent_whiteout (not
+ * KEY_TYPE_whiteout).
+ *
+ * Without this @end wouldn't be able to
+ * terminate searches and we'd have to scan
+ * through tons of whiteouts:
+ */
+ if (k.k->type == KEY_TYPE_extent_whiteout &&
+ bkey_ge(k.k->p, end))
+ goto end;
+
+ if (bkey_extent_whiteout(k.k)) {
+ search_key = bkey_successor(iter, k.k->p);
+ continue;
+ }
}
}
@@ -2711,7 +2728,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
saved_path = 0;
}
- if (!bkey_whiteout(k.k)) {
+ if (!bkey_extent_whiteout(k.k)) {
saved_path = btree_path_clone(trans, iter->path,
iter->flags & BTREE_ITER_intent,
_THIS_IP_);
@@ -2724,7 +2741,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
continue;
}
- if (bkey_whiteout(k.k)) {
+ if (bkey_extent_whiteout(k.k)) {
search_key = bkey_predecessor(iter, k.k->p);
search_key.snapshot = U32_MAX;
continue;
@@ -2865,7 +2882,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
iter->k = *k.k;
}
- if (unlikely(k.k->type == KEY_TYPE_whiteout &&
+ if (unlikely(bkey_extent_whiteout(k.k) &&
(iter->flags & BTREE_ITER_filter_snapshots) &&
!(iter->flags & BTREE_ITER_nofilter_whiteouts)))
iter->k.type = KEY_TYPE_deleted;
@@ -2878,31 +2895,40 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
EBUG_ON(btree_iter_path(trans, iter)->level);
- if (iter->flags & BTREE_ITER_intent) {
- struct btree_iter iter2;
+ struct btree_iter iter2;
- bch2_trans_copy_iter(&iter2, iter);
- k = bch2_btree_iter_peek_max(&iter2, end);
+ bch2_trans_copy_iter(&iter2, iter);
+ iter2.flags |= BTREE_ITER_nofilter_whiteouts;
- if (k.k && !bkey_err(k)) {
- swap(iter->key_cache_path, iter2.key_cache_path);
- iter->k = iter2.k;
- k.k = &iter->k;
+ while (1) {
+ k = bch2_btree_iter_peek_max(&iter2, end);
+ if ((iter2.flags & BTREE_ITER_is_extents) &&
+ k.k &&
+ !bkey_err(k) &&
+ k.k->type == KEY_TYPE_whiteout) {
+ bch2_btree_iter_set_pos(&iter2, k.k->p);
+ continue;
}
- bch2_trans_iter_exit(&iter2);
- } else {
- struct bpos pos = iter->pos;
- k = bch2_btree_iter_peek_max(iter, end);
- if (unlikely(bkey_err(k)))
- bch2_btree_iter_set_pos(iter, pos);
- else
- iter->pos = pos;
+ break;
+ }
+
+ if (k.k && !bkey_err(k)) {
+ swap(iter->key_cache_path, iter2.key_cache_path);
+ iter->k = iter2.k;
+ k.k = &iter->k;
}
+ bch2_trans_iter_exit(&iter2);
if (unlikely(bkey_err(k)))
goto out;
+ if (unlikely(k.k &&
+ bkey_extent_whiteout(k.k) &&
+ (iter->flags & BTREE_ITER_filter_snapshots) &&
+ !(iter->flags & BTREE_ITER_nofilter_whiteouts)))
+ iter->k.type = KEY_TYPE_deleted;
+
next = k.k ? bkey_start_pos(k.k) : POS_MAX;
if (bkey_lt(iter->pos, next)) {
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index b117cb5d..c8fc6ee0 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -954,7 +954,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
#define allocate_dropping_locks_errcode(_trans, _do) \
({ \
- gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
+ gfp_t _gfp = GFP_NOWAIT; \
int _ret = _do; \
\
if (bch2_err_matches(_ret, ENOMEM)) { \
@@ -966,7 +966,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
#define allocate_dropping_locks(_trans, _ret, _do) \
({ \
- gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
+ gfp_t _gfp = GFP_NOWAIT; \
typeof(_do) _p = _do; \
\
_ret = 0; \
@@ -979,7 +979,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
#define allocate_dropping_locks_norelock(_trans, _lock_dropped, _do) \
({ \
- gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
+ gfp_t _gfp = GFP_NOWAIT; \
typeof(_do) _p = _do; \
_lock_dropped = false; \
if (unlikely(!_p)) { \
diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c
index 8b94a815..4d58bdb2 100644
--- a/libbcachefs/btree_trans_commit.c
+++ b/libbcachefs/btree_trans_commit.c
@@ -449,7 +449,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
return 0;
new_u64s = roundup_pow_of_two(u64s);
- new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
if (unlikely(!new_k))
return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
index 6f3b5757..f59f018f 100644
--- a/libbcachefs/btree_update.c
+++ b/libbcachefs/btree_update.c
@@ -12,6 +12,7 @@
#include "extents.h"
#include "keylist.h"
#include "snapshot.h"
+#include "super-io.h"
#include "trace.h"
#include <linux/string_helpers.h>
@@ -158,6 +159,21 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
return ret;
}
+static inline enum bch_bkey_type extent_whiteout_type(struct bch_fs *c, enum btree_id btree, const struct bkey *k)
+{
+ /*
+ * KEY_TYPE_extent_whiteout indicates that there isn't a real extent
+ * present at that position: key start positions inclusive of
+ * KEY_TYPE_extent_whiteout (but not KEY_TYPE_whiteout) are
+ * monotonically increasing
+ */
+ return btree_id_is_extents_snapshots(btree) &&
+ bkey_deleted(k) &&
+ !bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_snapshot_whiteouts)
+ ? KEY_TYPE_extent_whiteout
+ : KEY_TYPE_whiteout;
+}
+
int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
struct btree_iter *iter,
enum btree_iter_update_trigger_flags flags,
@@ -224,14 +240,14 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
update->k.p = old.k->p;
update->k.p.snapshot = new.k->p.snapshot;
- if (new.k->p.snapshot != old.k->p.snapshot) {
- update->k.type = KEY_TYPE_whiteout;
- } else if (btree_type_has_snapshots(btree_id)) {
- ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
+ if (btree_type_has_snapshots(btree_id)) {
+ ret = new.k->p.snapshot != old.k->p.snapshot
+ ? 1
+ : need_whiteout_for_snapshot(trans, btree_id, update->k.p);
if (ret < 0)
return ret;
if (ret)
- update->k.type = KEY_TYPE_whiteout;
+ update->k.type = extent_whiteout_type(trans->c, iter->btree_id, new.k);
}
ret = bch2_btree_insert_nonextent(trans, btree_id, update,
@@ -265,7 +281,8 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
CLASS(btree_iter, iter)(trans, btree_id, bkey_start_pos(&insert->k),
BTREE_ITER_intent|
BTREE_ITER_with_updates|
- BTREE_ITER_not_extents);
+ BTREE_ITER_not_extents|
+ BTREE_ITER_nofilter_whiteouts);
struct bkey_s_c k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
int ret = bkey_err(k);
if (ret)
@@ -283,12 +300,40 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
goto next;
}
- while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
- bool done = bkey_lt(insert->k.p, k.k->p);
+ while (true) {
+ BUG_ON(bkey_le(k.k->p, bkey_start_pos(&insert->k)));
- ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
- if (ret)
- return ret;
+ /*
+ * When KEY_TYPE_whiteout is included, bkey_start_pos is not
+ * monotonically increasing
+ */
+ if (k.k->type != KEY_TYPE_whiteout && bkey_le(insert->k.p, bkey_start_pos(k.k)))
+ break;
+
+ bool done = k.k->type != KEY_TYPE_whiteout && bkey_lt(insert->k.p, k.k->p);
+
+ if (bkey_extent_whiteout(k.k)) {
+ enum bch_bkey_type whiteout_type = extent_whiteout_type(trans->c, btree_id, &insert->k);
+
+ if (bkey_le(k.k->p, insert->k.p) &&
+ k.k->type != whiteout_type) {
+ struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ return ret;
+
+ update->k.p.snapshot = iter.snapshot;
+ update->k.type = whiteout_type;
+
+ ret = bch2_trans_update(trans, &iter, update, 0);
+ if (ret)
+ return ret;
+ }
+ } else {
+ ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
+ if (ret)
+ return ret;
+ }
if (done)
goto out;
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 76897cf1..65ca54c5 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -336,6 +336,20 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
BUG_ON(b->ob.nr);
mutex_lock(&c->btree_reserve_cache_lock);
+ if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) {
+ guard(spinlock)(&c->freelist_lock);
+ if (c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark)) {
+ if (cl)
+ closure_wait(&c->open_buckets_wait, cl);
+
+ ret = cl
+ ? bch_err_throw(c, bucket_alloc_blocked)
+ : bch_err_throw(c, open_buckets_empty);
+ mutex_unlock(&c->btree_reserve_cache_lock);
+ goto err;
+ }
+ }
+
if (c->btree_reserve_cache_nr > nr_reserve) {
for (struct btree_alloc *a = c->btree_reserve_cache;
a < c->btree_reserve_cache + c->btree_reserve_cache_nr;) {
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index 01838a3a..0bd4dd06 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -693,6 +693,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
if (ret)
return ret;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned i = 0;
+ bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
+ if (data_opts->kill_ec_ptrs & BIT(i))
+ bch2_bkey_drop_ec(n, p.ptr.dev);
+ i++;
+ }
+
while (data_opts->kill_ptrs) {
unsigned i = 0, drop = __fls(data_opts->kill_ptrs);
diff --git a/libbcachefs/data_update.h b/libbcachefs/data_update.h
index 5e14d135..fc12aa65 100644
--- a/libbcachefs/data_update.h
+++ b/libbcachefs/data_update.h
@@ -12,6 +12,7 @@ struct moving_context;
struct data_update_opts {
unsigned rewrite_ptrs;
unsigned kill_ptrs;
+ unsigned kill_ec_ptrs;
u16 target;
u8 extra_replicas;
unsigned btree_insert_flags;
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index b879a586..7ab03987 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -995,6 +995,22 @@ void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
}
+void bch2_bkey_drop_ec(struct bkey_i *k, unsigned dev)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+ union bch_extent_entry *entry, *ec = NULL;
+
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr)
+ ec = entry;
+ else if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_ptr &&
+ entry->ptr.dev == dev) {
+ bch2_bkey_extent_entry_drop(k, ec);
+ return;
+ }
+ }
+}
+
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -1757,3 +1773,4 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k)
memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
return -val_u64s_delta;
}
+
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 35ee03cd..f6dcb171 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -650,6 +650,7 @@ void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
void bch2_bkey_drop_device(struct bkey_s, unsigned);
+void bch2_bkey_drop_ec(struct bkey_i *k, unsigned);
#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond) \
do { \
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index b5e3090f..52722a5e 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -268,7 +268,7 @@ restart:
rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
if (inode->ei_inum.inum == inum) {
ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
- GFP_NOWAIT|__GFP_NOWARN);
+ GFP_NOWAIT);
if (ret) {
rcu_read_unlock();
ret = darray_make_room(&subvols, 1);
@@ -826,14 +826,6 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
bch2_inode_update_after_write(trans, inode, &inode_u,
ATTR_MTIME);
- if (inode_u.bi_subvol) {
- /*
- * Subvolume deletion is asynchronous, but we still want to tell
- * the VFS that it's been deleted here:
- */
- set_nlink(&inode->v, 0);
- }
-
if (IS_CASEFOLDED(vdir))
d_invalidate(dentry);
err:
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 6ccea092..01c1c637 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -1444,7 +1444,7 @@ static int check_key_has_inode(struct btree_trans *trans,
if (ret)
return ret;
- if (k.k->type == KEY_TYPE_whiteout)
+ if (bkey_extent_whiteout(k.k))
return 0;
bool have_inode = i && !i->whiteout;
@@ -1924,7 +1924,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
&inode->recalculate_sums);
if (ret)
goto err;
+ }
+ if (!bkey_extent_whiteout(k.k)) {
/*
* Check inodes in reverse order, from oldest snapshots to
* newest, starting from the inode that matches this extent's
diff --git a/libbcachefs/lru.h b/libbcachefs/lru.h
index 8abd0aa2..6f1e0a7b 100644
--- a/libbcachefs/lru.h
+++ b/libbcachefs/lru.h
@@ -24,6 +24,16 @@ static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time)
return pos;
}
+static inline struct bpos lru_start(u16 lru_id)
+{
+ return lru_pos(lru_id, 0, 0);
+}
+
+static inline struct bpos lru_end(u16 lru_id)
+{
+ return lru_pos(lru_id, U64_MAX, LRU_TIME_MAX);
+}
+
static inline enum bch_lru_type lru_type(struct bkey_s_c l)
{
u16 lru_id = l.k->p.inode >> 48;
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 30fe269d..932b62a9 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -344,7 +344,7 @@ int bch2_move_extent(struct moving_context *ctxt,
if (!data_opts.rewrite_ptrs &&
!data_opts.extra_replicas &&
!data_opts.scrub) {
- if (data_opts.kill_ptrs) {
+ if (data_opts.kill_ptrs|data_opts.kill_ec_ptrs) {
this_cpu_add(c->counters[BCH_COUNTER_io_move_drop_only], k.k->size);
return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
} else {
@@ -542,7 +542,7 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
if (ctxt->wait_on_copygc && c->copygc_running) {
bch2_moving_ctxt_flush_all(ctxt);
- wait_event_killable(c->copygc_running_wq,
+ wait_event_freezable(c->copygc_running_wq,
!c->copygc_running ||
(is_kthread && kthread_should_stop()));
}
@@ -1280,7 +1280,17 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
i++;
}
- return data_opts->kill_ptrs != 0;
+ i = 0;
+ bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
+ if (p.has_ec && durability - p.ec.redundancy >= replicas) {
+ data_opts->kill_ec_ptrs |= BIT(i);
+ durability -= p.ec.redundancy;
+ }
+
+ i++;
+ }
+
+ return (data_opts->kill_ptrs|data_opts->kill_ec_ptrs) != 0;
}
static bool scrub_pred(struct bch_fs *c, void *_arg,
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index b0cbe3c1..f36d60b8 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -14,6 +14,7 @@
#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
+#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "lru.h"
@@ -131,72 +132,153 @@ static bool bucket_in_flight(struct buckets_in_flight *list,
return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params);
}
+static int try_add_copygc_bucket(struct btree_trans *trans,
+ struct buckets_in_flight *buckets_in_flight,
+ struct bpos bucket, u64 lru_time)
+{
+ struct move_bucket b = { .k.bucket = bucket };
+
+ int ret = bch2_bucket_is_movable(trans, &b, lru_time);
+ if (ret <= 0)
+ return ret;
+
+ if (bucket_in_flight(buckets_in_flight, b.k))
+ return 0;
+
+ struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL);
+ if (!b_i)
+ return -ENOMEM;
+
+ *b_i = b;
+
+ ret = darray_push(&buckets_in_flight->to_evacuate, b_i);
+ if (ret) {
+ kfree(b_i);
+ return ret;
+ }
+
+ ret = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash,
+ bch_move_bucket_params);
+ BUG_ON(ret);
+
+ size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
+ return buckets_in_flight->to_evacuate.nr >= nr_to_get;
+}
+
static int bch2_copygc_get_buckets(struct moving_context *ctxt,
struct buckets_in_flight *buckets_in_flight)
{
struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
- size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
- int ret;
- move_buckets_wait(ctxt, buckets_in_flight, false);
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
+ lru_start(BCH_LRU_BUCKET_FRAGMENTATION),
+ lru_end(BCH_LRU_BUCKET_FRAGMENTATION),
+ 0, k,
+ try_add_copygc_bucket(trans, buckets_in_flight,
+ u64_to_bucket(k.k->p.offset),
+ lru_pos_time(k.k->p))
+ );
- ret = bch2_btree_write_buffer_tryflush(trans);
- if (bch2_err_matches(ret, EROFS))
- return ret;
+ return ret < 0 ? ret : 0;
+}
- if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
- return ret;
+static int bch2_copygc_get_stripe_buckets(struct moving_context *ctxt,
+ struct buckets_in_flight *buckets_in_flight)
+{
+ struct btree_trans *trans = ctxt->trans;
- ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
- lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0),
- lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX),
- 0, k, ({
- struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
- int ret2 = 0;
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
+ lru_start(BCH_LRU_STRIPE_FRAGMENTATION),
+ lru_end(BCH_LRU_STRIPE_FRAGMENTATION),
+ 0, lru_k, ({
+ CLASS(btree_iter, s_iter)(trans, BTREE_ID_stripes, POS(0, lru_k.k->p.offset), 0);
+ struct bkey_s_c s_k = bch2_btree_iter_peek_slot(&s_iter);
+ int ret2 = bkey_err(s_k);
+ if (ret2)
+ goto err;
- saw++;
+ if (s_k.k->type != KEY_TYPE_stripe)
+ continue;
- ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p));
- if (ret2 < 0)
- goto err;
+ const struct bch_stripe *s = bkey_s_c_to_stripe(s_k).v;
- if (!ret2)
- not_movable++;
- else if (bucket_in_flight(buckets_in_flight, b.k))
- in_flight++;
- else {
- struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL);
- ret2 = b_i ? 0 : -ENOMEM;
+ /* write buffer race? */
+ if (stripe_lru_pos(s) != lru_pos_time(lru_k.k->p))
+ continue;
+
+ unsigned nr_data = s->nr_blocks - s->nr_redundant;
+ for (unsigned i = 0; i < nr_data; i++) {
+ if (!stripe_blockcount_get(s, i))
+ continue;
+
+ const struct bch_extent_ptr *ptr = s->ptrs + i;
+ CLASS(bch2_dev_tryget, ca)(trans->c, ptr->dev);
+ if (unlikely(!ca))
+ continue;
+
+ ret2 = try_add_copygc_bucket(trans, buckets_in_flight,
+ PTR_BUCKET_POS(ca, ptr), U64_MAX);
if (ret2)
- goto err;
+ break;
+ }
+err:
+ ret2;
+ }));
- *b_i = b;
+ return ret < 0 ? ret : 0;
+}
+
+static bool should_do_ec_copygc(struct btree_trans *trans)
+{
+ u64 stripe_frag_ratio = 0;
+
+ for_each_btree_key_max(trans, iter, BTREE_ID_lru,
+ lru_start(BCH_LRU_STRIPE_FRAGMENTATION),
+ lru_end(BCH_LRU_STRIPE_FRAGMENTATION),
+ 0, lru_k, ({
+ CLASS(btree_iter, s_iter)(trans, BTREE_ID_stripes, POS(0, lru_k.k->p.offset), 0);
+ struct bkey_s_c s_k = bch2_btree_iter_peek_slot(&s_iter);
+ int ret = bkey_err(s_k);
+ if (ret)
+ goto err;
- ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i);
- if (ret2) {
- kfree(b_i);
- goto err;
- }
+ if (s_k.k->type != KEY_TYPE_stripe)
+ continue;
- ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash,
- bch_move_bucket_params);
- BUG_ON(ret2);
+ const struct bch_stripe *s = bkey_s_c_to_stripe(s_k).v;
- sectors += b.sectors;
- }
+ /* write buffer race? */
+ if (stripe_lru_pos(s) != lru_pos_time(lru_k.k->p))
+ continue;
- ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get;
+ unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_nonempty = 0;
+ for (unsigned i = 0; i < nr_data; i++)
+ blocks_nonempty += !!stripe_blockcount_get(s, i);
+
+ /* stripe is pending delete */
+ if (!blocks_nonempty)
+ continue;
+
+ /* This matches the calculation in alloc_lru_idx_fragmentation, so we can
+ * directly compare without actually looking up the bucket pointed to by the
+ * bucket fragmentation lru:
+ */
+ stripe_frag_ratio = div_u64(blocks_nonempty * (1ULL << 31), nr_data);
+ break;
err:
- ret2;
+ ret;
}));
- pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
- buckets_in_flight->nr, buckets_in_flight->sectors,
- saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret);
+ CLASS(btree_iter, iter)(trans, BTREE_ID_lru, lru_start(BCH_LRU_BUCKET_FRAGMENTATION), 0);
+ struct bkey_s_c lru_k;
- return ret < 0 ? ret : 0;
+ lockrestart_do(trans, bkey_err(lru_k = bch2_btree_iter_peek_max(&iter,
+ lru_end(BCH_LRU_BUCKET_FRAGMENTATION))));
+
+ u64 bucket_frag_ratio = lru_k.k && !bkey_err(lru_k) ? lru_pos_time(lru_k.k->p) : 0;
+
+ /* Prefer normal bucket copygc */
+ return stripe_frag_ratio && stripe_frag_ratio * 2 < bucket_frag_ratio;
}
noinline
@@ -213,7 +295,18 @@ static int bch2_copygc(struct moving_context *ctxt,
u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved);
int ret = 0;
- ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight);
+ move_buckets_wait(ctxt, buckets_in_flight, false);
+
+ ret = bch2_btree_write_buffer_tryflush(trans);
+ if (bch2_err_matches(ret, EROFS))
+ goto err;
+
+ if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
+ goto err;
+
+ ret = should_do_ec_copygc(trans)
+ ? bch2_copygc_get_stripe_buckets(ctxt, buckets_in_flight)
+ : bch2_copygc_get_buckets(ctxt, buckets_in_flight);
if (ret)
goto err;
@@ -265,7 +358,8 @@ static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca)
for (unsigned i = 0; i < BCH_DATA_NR; i++)
if (data_type_movable(i))
- fragmented += usage_full.d[i].fragmented;
+ fragmented += usage_full.d[i].buckets * ca->mi.bucket_size -
+ usage_full.d[i].sectors;
return max(0LL, fragmented_allowed - fragmented);
}
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index c0c5fe96..17ca56b0 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -292,12 +292,48 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum,
: 0;
}
-static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
- struct btree_iter *work_iter)
+#define REBALANCE_WORK_BUF_NR 1024
+DEFINE_DARRAY_NAMED(darray_rebalance_work, struct bkey_i_cookie);
+
+static struct bkey_i *next_rebalance_entry(struct btree_trans *trans,
+ darray_rebalance_work *buf, struct bpos *work_pos)
{
- return !kthread_should_stop()
- ? bch2_btree_iter_peek(work_iter)
- : bkey_s_c_null;
+ if (unlikely(!buf->nr)) {
+ /*
+ * Avoid contention with write buffer flush: buffer up rebalance
+ * work entries in a darray
+ */
+
+ BUG_ON(!buf->size);
+
+ bch2_trans_begin(trans);
+
+ for_each_btree_key(trans, iter, BTREE_ID_rebalance_work, *work_pos,
+ BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
+ /* we previously used darray_make_room */
+ BUG_ON(bkey_bytes(k.k) > sizeof(buf->data[0]));
+
+ bkey_reassemble(&darray_top(*buf).k_i, k);
+ buf->nr++;
+
+ *work_pos = bpos_successor(iter.pos);
+ if (buf->nr == buf->size)
+ break;
+ 0;
+ }));
+
+ if (!buf->nr)
+ return NULL;
+
+ unsigned l = 0, r = buf->nr - 1;
+ while (l < r) {
+ swap(buf->data[l], buf->data[r]);
+ l++;
+ r--;
+ }
+ }
+
+ return &(&darray_pop(buf))->k_i;
}
static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
@@ -464,10 +500,9 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
per_snapshot_io_opts_init(&snapshot_io_opts, c);
int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
- r->scan_start.pos, r->scan_end.pos,
- BTREE_ITER_all_snapshots|
- BTREE_ITER_not_extents|
- BTREE_ITER_prefetch, k, ({
+ r->scan_start.pos, r->scan_end.pos,
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_prefetch, k, ({
ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans,
@@ -524,49 +559,37 @@ static int do_rebalance(struct moving_context *ctxt)
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct bch_fs_rebalance *r = &c->rebalance;
- struct btree_iter extent_iter = { NULL };
- struct bkey_s_c k;
+ struct btree_iter extent_iter = {};
u32 kick = r->kick;
- int ret = 0;
- bch2_trans_begin(trans);
+ struct bpos work_pos = POS_MIN;
+ CLASS(darray_rebalance_work, work)();
+ int ret = darray_make_room(&work, REBALANCE_WORK_BUF_NR);
+ if (ret)
+ return ret;
bch2_move_stats_init(&r->work_stats, "rebalance_work");
bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
- CLASS(btree_iter, rebalance_work_iter)(trans,
- BTREE_ID_rebalance_work, POS_MIN,
- BTREE_ITER_all_snapshots);
-
while (!bch2_move_ratelimit(ctxt)) {
if (!bch2_rebalance_enabled(c)) {
bch2_moving_ctxt_flush_all(ctxt);
kthread_wait_freezable(bch2_rebalance_enabled(c) ||
kthread_should_stop());
+ if (kthread_should_stop())
+ break;
}
- if (kthread_should_stop())
+ struct bkey_i *k = next_rebalance_entry(trans, &work, &work_pos);
+ if (!k)
break;
- bch2_trans_begin(trans);
-
- ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret || !k.k)
- break;
-
- ret = k.k->type == KEY_TYPE_cookie
- ? do_rebalance_scan(ctxt, k.k->p.inode,
- le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
- : do_rebalance_extent(ctxt, k.k->p, &extent_iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
+ ret = k->k.type == KEY_TYPE_cookie
+ ? do_rebalance_scan(ctxt, k->k.p.inode,
+ le64_to_cpu(bkey_i_to_cookie(k)->v.cookie))
+ : lockrestart_do(trans, do_rebalance_extent(ctxt, k->k.p, &extent_iter));
if (ret)
break;
-
- bch2_btree_iter_advance(&rebalance_work_iter);
}
bch2_trans_iter_exit(&extent_iter);
diff --git a/libbcachefs/sb-counters_format.h b/libbcachefs/sb-counters_format.h
index f3ea53a5..740859c7 100644
--- a/libbcachefs/sb-counters_format.h
+++ b/libbcachefs/sb-counters_format.h
@@ -101,7 +101,8 @@ enum counters_flags {
x(trans_restart_write_buffer_flush, 75, TYPE_COUNTER) \
x(trans_restart_split_race, 76, TYPE_COUNTER) \
x(write_buffer_flush_slowpath, 77, TYPE_COUNTER) \
- x(write_buffer_flush_sync, 78, TYPE_COUNTER)
+ x(write_buffer_flush_sync, 78, TYPE_COUNTER) \
+ x(error_throw, 93, TYPE_COUNTER)
enum bch_persistent_counters {
#define x(t, n, ...) BCH_COUNTER_##t,
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 6c72d93b..ef15e614 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -1054,16 +1054,19 @@ static int bch2_fs_opt_version_init(struct bch_fs *c)
bch2_print_str(c, KERN_INFO, p.buf);
- if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
- bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
- return -EINVAL;
- }
+ if (BCH_SB_INITIALIZED(c->disk_sb.sb)) {
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
+ bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
+ return -EINVAL;
+ }
- if (!c->sb.clean &&
- !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
- bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
- return -EINVAL;
+ if (!c->sb.clean &&
+ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
+ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
+ return -EINVAL;
+ }
}
+
return 0;
}