summaryrefslogtreecommitdiff
path: root/libbcachefs/super.c
diff options
context:
space:
mode:
Diffstat (limited to 'libbcachefs/super.c')
-rw-r--r--libbcachefs/super.c193
1 files changed, 106 insertions, 87 deletions
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 4e8b0a51..60a2d83e 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -140,8 +140,9 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
return c;
}
-int bch2_congested(struct bch_fs *c, int bdi_bits)
+int bch2_congested(void *data, int bdi_bits)
{
+ struct bch_fs *c = data;
struct backing_dev_info *bdi;
struct bch_dev *ca;
unsigned i;
@@ -178,13 +179,6 @@ int bch2_congested(struct bch_fs *c, int bdi_bits)
return ret;
}
-static int bch2_congested_fn(void *data, int bdi_bits)
-{
- struct bch_fs *c = data;
-
- return bch2_congested(c, bdi_bits);
-}
-
/* Filesystem RO/RW: */
/*
@@ -218,7 +212,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
*/
- bch2_journal_flush_pins(&c->journal, U64_MAX);
+ bch2_journal_flush_all_pins(&c->journal);
if (!bch2_journal_error(&c->journal))
bch2_btree_verify_flushed(c);
@@ -379,8 +373,6 @@ static void bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
- if (c->bdi.bdi_list.next)
- bdi_destroy(&c->bdi);
lg_lock_free(&c->usage_lock);
free_percpu(c->usage_percpu);
mempool_exit(&c->btree_bounce_pool);
@@ -393,7 +385,7 @@ static void bch2_fs_free(struct bch_fs *c)
mempool_exit(&c->btree_reserve_pool);
mempool_exit(&c->fill_iter);
percpu_ref_exit(&c->writes);
- kfree(c->replicas);
+ kfree(rcu_dereference_protected(c->replicas, 1));
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
@@ -414,7 +406,7 @@ static void bch2_fs_exit(struct bch_fs *c)
for (i = 0; i < c->sb.nr_devices; i++)
if (c->devs[i])
- bch2_dev_free(c->devs[i]);
+ bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
closure_debug_destroy(&c->cl);
kobject_put(&c->kobj);
@@ -576,10 +568,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
sizeof(struct btree_update)) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
bioset_init(&c->btree_read_bio, 1,
- offsetof(struct btree_read_bio, bio)) ||
- bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
- bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
- bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
+ offsetof(struct btree_read_bio, bio),
+ BIOSET_NEED_BVECS) ||
+ bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS) ||
+ bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS) ||
+ bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+ BIOSET_NEED_BVECS) ||
mempool_init_page_pool(&c->bio_bounce_pages,
max_t(unsigned,
c->opts.btree_node_size,
@@ -588,7 +584,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
lg_lock_init(&c->usage_lock) ||
mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
- bdi_setup_and_register(&c->bdi, "bcachefs") ||
bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||
@@ -599,10 +594,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_fsio_init(c))
goto err;
- c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
- c->bdi.congested_fn = bch2_congested_fn;
- c->bdi.congested_data = c;
-
mi = bch2_sb_get_members(c->disk_sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb, mi, i) &&
@@ -729,8 +720,12 @@ static const char *__bch2_fs_start(struct bch_fs *c)
continue;
err = "error reading btree root";
- if (bch2_btree_root_read(c, i, k, level))
- goto err;
+ if (bch2_btree_root_read(c, i, k, level)) {
+ if (i != BTREE_ID_ALLOC)
+ goto err;
+
+ mustfix_fsck_err(c, "error reading btree root");
+ }
}
err = "error reading allocation information";
@@ -830,7 +825,7 @@ static const char *__bch2_fs_start(struct bch_fs *c)
closure_sync(&cl);
bch2_inode_init(c, &inode, 0, 0,
- S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
+ S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
inode.bi_inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(&packed_inode, &inode);
@@ -877,6 +872,7 @@ out:
bch2_journal_entries_free(&journal);
return err;
err:
+fsck_err:
closure_sync(&cl);
switch (ret) {
@@ -995,24 +991,20 @@ static void bch2_dev_free(struct bch_dev *ca)
kobject_put(&ca->kobj);
}
-static void bch2_dev_io_ref_release(struct percpu_ref *ref)
-{
- struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
-
- complete(&ca->offline_complete);
-}
-
static void __bch2_dev_offline(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
lockdep_assert_held(&c->state_lock);
+ if (percpu_ref_is_zero(&ca->io_ref))
+ return;
+
__bch2_dev_read_only(c, ca);
- reinit_completion(&ca->offline_complete);
+ reinit_completion(&ca->io_ref_completion);
percpu_ref_kill(&ca->io_ref);
- wait_for_completion(&ca->offline_complete);
+ wait_for_completion(&ca->io_ref_completion);
if (ca->kobj.state_in_sysfs) {
struct kobject *block =
@@ -1026,27 +1018,18 @@ static void __bch2_dev_offline(struct bch_dev *ca)
bch2_dev_journal_exit(ca);
}
-static void bch2_dev_ref_release(struct percpu_ref *ref)
+static void bch2_dev_ref_complete(struct percpu_ref *ref)
{
struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
- complete(&ca->stop_complete);
+ complete(&ca->ref_completion);
}
-static void bch2_dev_stop(struct bch_dev *ca)
+static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
{
- struct bch_fs *c = ca->fs;
-
- lockdep_assert_held(&c->state_lock);
-
- BUG_ON(rcu_access_pointer(c->devs[ca->dev_idx]) != ca);
- rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
-
- synchronize_rcu();
+ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
- reinit_completion(&ca->stop_complete);
- percpu_ref_kill(&ca->ref);
- wait_for_completion(&ca->stop_complete);
+ complete(&ca->io_ref_completion);
}
static int bch2_dev_sysfs_online(struct bch_dev *ca)
@@ -1095,8 +1078,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
return -ENOMEM;
kobject_init(&ca->kobj, &bch2_dev_ktype);
- init_completion(&ca->stop_complete);
- init_completion(&ca->offline_complete);
+ init_completion(&ca->ref_completion);
+ init_completion(&ca->io_ref_completion);
ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d);
@@ -1132,9 +1115,9 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
DIV_ROUND_UP(BTREE_NODE_RESERVE,
ca->mi.bucket_size / c->opts.btree_node_size);
- if (percpu_ref_init(&ca->ref, bch2_dev_ref_release,
+ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) ||
- percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_release,
+ percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_BTREE], btree_node_reserve_buckets,
GFP_KERNEL) ||
@@ -1155,7 +1138,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
GFP_KERNEL|__GFP_ZERO)) ||
!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
bioset_init(&ca->replica_set, 4,
- offsetof(struct bch_write_bio, bio)) ||
+ offsetof(struct bch_write_bio, bio), 0) ||
!(ca->io_done = alloc_percpu(*ca->io_done)))
goto err;
@@ -1180,8 +1163,6 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
struct bch_dev *ca;
int ret;
- lockdep_assert_held(&c->sb_lock);
-
if (le64_to_cpu(sb->sb->seq) >
le64_to_cpu(c->disk_sb->seq))
bch2_sb_to_fs(c, sb->sb);
@@ -1189,13 +1170,15 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
!c->devs[sb->sb->dev_idx]);
- ca = c->devs[sb->sb->dev_idx];
+ ca = bch_dev_locked(c, sb->sb->dev_idx);
if (ca->disk_sb.bdev) {
bch_err(c, "already have device online in slot %u",
sb->sb->dev_idx);
return -EINVAL;
}
+ BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
+
ret = bch2_dev_journal_init(ca, sb->sb);
if (ret)
return ret;
@@ -1222,7 +1205,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
if (bch2_dev_sysfs_online(ca))
pr_warn("error creating sysfs objects");
- bch2_mark_dev_superblock(c, ca, 0);
+ bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
if (ca->mi.state == BCH_MEMBER_STATE_RW)
bch2_dev_allocator_add(c, ca);
@@ -1293,6 +1276,7 @@ static bool bch2_fs_may_start(struct bch_fs *c)
{
struct replicas_status s;
struct bch_sb_field_members *mi;
+ struct bch_dev *ca;
unsigned i, flags = c->opts.degraded
? BCH_FORCE_IF_DEGRADED
: 0;
@@ -1301,14 +1285,19 @@ static bool bch2_fs_may_start(struct bch_fs *c)
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb);
- for (i = 0; i < c->disk_sb->nr_devices; i++)
- if (bch2_dev_exists(c->disk_sb, mi, i) &&
- !bch2_dev_is_online(c->devs[i]) &&
- (c->devs[i]->mi.state == BCH_MEMBER_STATE_RW ||
- c->devs[i]->mi.state == BCH_MEMBER_STATE_RO)) {
+ for (i = 0; i < c->disk_sb->nr_devices; i++) {
+ if (!bch2_dev_exists(c->disk_sb, mi, i))
+ continue;
+
+ ca = bch_dev_locked(c, i);
+
+ if (!bch2_dev_is_online(ca) &&
+ (ca->mi.state == BCH_MEMBER_STATE_RW ||
+ ca->mi.state == BCH_MEMBER_STATE_RO)) {
mutex_unlock(&c->sb_lock);
return false;
}
+ }
mutex_unlock(&c->sb_lock);
}
@@ -1419,22 +1408,59 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
*
* flag_data_bad() does not check btree pointers
*/
- ret = bch2_flag_data_bad(ca);
+ ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
if (ret) {
- bch_err(ca, "Remove failed");
+ bch_err(ca, "Remove failed: error %i dropping data", ret);
+ goto err;
+ }
+
+ ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
+ if (ret) {
+ bch_err(ca, "Remove failed: error %i flushing journal", ret);
goto err;
}
data = bch2_dev_has_data(c, ca);
if (data) {
- bch_err(ca, "Remove failed, still has data (%x)", data);
+ char data_has_str[100];
+ bch2_scnprint_flag_list(data_has_str,
+ sizeof(data_has_str),
+ bch2_data_types,
+ data);
+ bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
+ ret = -EBUSY;
goto err;
}
- bch2_journal_meta(&c->journal);
+ ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
+ POS(ca->dev_idx, 0),
+ POS(ca->dev_idx + 1, 0),
+ ZERO_VERSION,
+ NULL, NULL, NULL);
+ if (ret) {
+ bch_err(ca, "Remove failed, error deleting alloc info");
+ goto err;
+ }
+
+ /*
+ * must flush all existing journal entries, they might have
+ * (overwritten) keys that point to the device we're removing:
+ */
+ ret = bch2_journal_flush_all_pins(&c->journal);
+ if (ret) {
+ bch_err(ca, "Remove failed, journal error");
+ goto err;
+ }
__bch2_dev_offline(ca);
- bch2_dev_stop(ca);
+
+ mutex_lock(&c->sb_lock);
+ rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
+ mutex_unlock(&c->sb_lock);
+
+ percpu_ref_kill(&ca->ref);
+ wait_for_completion(&ca->ref_completion);
+
bch2_dev_free(ca);
/*
@@ -1542,7 +1568,7 @@ have_slot:
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- ca = c->devs[dev_idx];
+ ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = "journal alloc failed";
if (bch2_dev_journal_alloc(ca))
@@ -1568,7 +1594,7 @@ err:
/* Hot add existing device to running filesystem: */
int bch2_dev_online(struct bch_fs *c, const char *path)
{
- struct bch_sb_handle sb = { 0 };
+ struct bch_sb_handle sb = { NULL };
struct bch_dev *ca;
unsigned dev_idx;
const char *err;
@@ -1593,7 +1619,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
}
mutex_unlock(&c->sb_lock);
- ca = c->devs[dev_idx];
+ ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca);
if (err)
@@ -1619,7 +1645,6 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
return -EINVAL;
}
- __bch2_dev_read_only(c, ca);
__bch2_dev_offline(ca);
mutex_unlock(&c->state_lock);
@@ -1629,37 +1654,31 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
{
unsigned data;
- int ret;
+ int ret = 0;
mutex_lock(&c->state_lock);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
bch_err(ca, "Cannot migrate data off RW device");
- mutex_unlock(&c->state_lock);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
- mutex_unlock(&c->state_lock);
-
- ret = bch2_move_data_off_device(ca);
+ ret = bch2_dev_data_migrate(c, ca, 0);
if (ret) {
bch_err(ca, "Error migrating data: %i", ret);
- return ret;
- }
-
- ret = bch2_move_metadata_off_device(ca);
- if (ret) {
- bch_err(ca, "Error migrating metadata: %i", ret);
- return ret;
+ goto err;
}
data = bch2_dev_has_data(c, ca);
if (data) {
bch_err(ca, "Migrate error: data still present (%x)", data);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
-
- return 0;
+err:
+ mutex_unlock(&c->state_lock);
+ return ret;
}
/* Filesystem open: */