Diffstat (limited to 'libbcachefs/super.c')
-rw-r--r--	libbcachefs/super.c	322
1 file changed, 171 insertions, 151 deletions
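Throughout the diff below, open-coded member checks (index in range, member UUID nonzero) are replaced by calls to a bch2_dev_exists() helper whose definition lives outside super.c. A minimal sketch of what that helper presumably looks like, inferred purely from the checks this patch removes:

static inline bool bch2_dev_exists(struct bch_sb *sb,
				   struct bch_sb_field_members *mi,
				   unsigned dev_idx)
{
	/* Hypothetical reconstruction, not part of this patch: a member
	 * slot is in use iff the index is valid and its uuid is nonzero,
	 * exactly the two checks the old code spelled out by hand. */
	return dev_idx < sb->nr_devices &&
		!bch2_is_zero(mi->members[dev_idx].uuid.b, sizeof(uuid_le));
}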
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 2a3947e2..692eb417 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -224,6 +224,9 @@ static void __bch2_fs_read_only(struct bch_fs *c)
 		bch2_dev_allocator_stop(ca);
 
 	bch2_fs_journal_stop(&c->journal);
+
+	for_each_member_device(ca, c, i)
+		bch2_dev_allocator_remove(c, ca);
 }
 
 static void bch2_writes_disabled(struct percpu_ref *writes)
@@ -330,6 +333,10 @@ const char *bch2_fs_read_write(struct bch_fs *c)
 	    c->state != BCH_FS_RO)
 		goto out;
 
+	for_each_rw_member(ca, c, i)
+		bch2_dev_allocator_add(c, ca);
+	bch2_recalc_capacity(c);
+
 	err = "error starting allocator thread";
 	for_each_rw_member(ca, c, i)
 		if (bch2_dev_allocator_start(ca)) {
@@ -484,6 +491,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	mutex_init(&c->state_lock);
 	mutex_init(&c->sb_lock);
+	mutex_init(&c->replicas_gc_lock);
 	mutex_init(&c->btree_cache_lock);
 	mutex_init(&c->bucket_lock);
 	mutex_init(&c->btree_root_lock);
@@ -603,7 +611,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	mi = bch2_sb_get_members(c->disk_sb);
 	for (i = 0; i < c->sb.nr_devices; i++)
-		if (!bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)) &&
+		if (bch2_dev_exists(c->disk_sb, mi, i) &&
 		    bch2_dev_alloc(c, i))
 			goto err;
@@ -681,12 +689,16 @@ static const char *__bch2_fs_start(struct bch_fs *c)
 	const char *err = "cannot allocate memory";
 	struct bch_sb_field_members *mi;
 	struct bch_dev *ca;
-	unsigned i, id;
-	time64_t now;
 	LIST_HEAD(journal);
 	struct jset *j;
+	struct closure cl;
+	u64 journal_seq = 0;
+	time64_t now;
+	unsigned i;
 	int ret = -EINVAL;
 
+	closure_init_stack(&cl);
+
 	BUG_ON(c->state != BCH_FS_STARTING);
 
 	mutex_lock(&c->sb_lock);
@@ -694,6 +706,10 @@ static const char *__bch2_fs_start(struct bch_fs *c)
 		bch2_sb_from_fs(c, ca);
 	mutex_unlock(&c->sb_lock);
 
+	for_each_rw_member(ca, c, i)
+		bch2_dev_allocator_add(c, ca);
+	bch2_recalc_capacity(c);
+
 	if (BCH_SB_INITIALIZED(c->disk_sb)) {
 		ret = bch2_journal_read(c, &journal);
 		if (ret)
@@ -704,44 +720,45 @@ static const char *__bch2_fs_start(struct bch_fs *c)
 		c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
 		c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
 
-		err = "error reading priorities";
-		for_each_readable_member(ca, c, i) {
-			ret = bch2_prio_read(ca);
-			if (ret) {
-				percpu_ref_put(&ca->io_ref);
-				goto err;
-			}
-		}
-
-		for (id = 0; id < BTREE_ID_NR; id++) {
+		for (i = 0; i < BTREE_ID_NR; i++) {
 			unsigned level;
 			struct bkey_i *k;
 
-			err = "bad btree root";
-			k = bch2_journal_find_btree_root(c, j, id, &level);
-			if (!k && id == BTREE_ID_EXTENTS)
+			err = "missing btree root";
+			k = bch2_journal_find_btree_root(c, j, i, &level);
+			if (!k && i < BTREE_ID_ALLOC)
 				goto err;
-			if (!k) {
-				pr_debug("missing btree root: %d", id);
+
+			if (!k)
 				continue;
-			}
 
 			err = "error reading btree root";
-			if (bch2_btree_root_read(c, id, k, level))
+			if (bch2_btree_root_read(c, i, k, level))
 				goto err;
 		}
 
-		bch_verbose(c, "starting mark and sweep:");
+		err = "error reading allocation information";
+		ret = bch2_alloc_read(c, &journal);
+		if (ret)
+			goto err;
 
+		bch_verbose(c, "starting mark and sweep:");
 		err = "error in recovery";
 		ret = bch2_initial_gc(c, &journal);
 		if (ret)
 			goto err;
+		bch_verbose(c, "mark and sweep done");
 
 		if (c->opts.noreplay)
 			goto recovery_done;
 
-		bch_verbose(c, "mark and sweep done");
+		err = "cannot allocate new btree root";
+		for (i = 0; i < BTREE_ID_NR; i++)
+			if (!c->btree_roots[i].b &&
+			    bch2_btree_root_alloc(c, i, &cl))
+				goto err;
+
+		closure_sync(&cl);
 
 		/*
 		 * bch2_journal_start() can't happen sooner, or btree_gc_finish()
@@ -758,12 +775,10 @@ static const char *__bch2_fs_start(struct bch_fs *c)
 		}
 
 		bch_verbose(c, "starting journal replay:");
-
 		err = "journal replay failed";
 		ret = bch2_journal_replay(c, &journal);
 		if (ret)
 			goto err;
-
 		bch_verbose(c, "journal replay done");
 
 		if (c->opts.norecovery)
@@ -774,23 +789,21 @@ static const char *__bch2_fs_start(struct bch_fs *c)
 		ret = bch2_fsck(c, !c->opts.nofsck);
 		if (ret)
 			goto err;
+		bch_verbose(c, "fsck done");
 
 		for_each_rw_member(ca, c, i)
-			if (ca->need_prio_write) {
-				ret = bch2_prio_write(ca);
+			if (ca->need_alloc_write) {
+				ret = bch2_alloc_write(c, ca, &journal_seq);
 				if (ret) {
 					percpu_ref_put(&ca->io_ref);
 					goto err;
 				}
 			}
 
-		bch_verbose(c, "fsck done");
+		bch2_journal_flush_seq(&c->journal, journal_seq);
 	} else {
 		struct bch_inode_unpacked inode;
 		struct bkey_inode_buf packed_inode;
-		struct closure cl;
-
-		closure_init_stack(&cl);
 
 		bch_notice(c, "initializing new filesystem");
@@ -805,6 +818,11 @@ static const char *__bch2_fs_start(struct bch_fs *c)
 			goto err;
 		}
 
+		err = "cannot allocate new btree root";
+		for (i = 0; i < BTREE_ID_NR; i++)
+			if (bch2_btree_root_alloc(c, i, &cl))
+				goto err;
+
 		/*
 		 * journal_res_get() will crash if called before this has
 		 * set up the journal.pin FIFO and journal.cur pointer:
@@ -819,13 +837,6 @@ static const char *__bch2_fs_start(struct bch_fs *c)
 			goto err;
 		}
 
-		err = "cannot allocate new btree root";
-		for (id = 0; id < BTREE_ID_NR; id++)
-			if (bch2_btree_root_alloc(c, id, &cl)) {
-				closure_sync(&cl);
-				goto err;
-			}
-
 		/* Wait for new btree roots to be written: */
 		closure_sync(&cl);
@@ -877,6 +888,8 @@ out:
 	bch2_journal_entries_free(&journal);
 	return err;
 err:
+	closure_sync(&cl);
+
 	switch (ret) {
 	case BCH_FSCK_ERRORS_NOT_FIXED:
 		bch_err(c, "filesystem contains errors: please report this to the developers");
@@ -940,10 +953,7 @@ static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
 	if (uuid_le_cmp(fs->uuid, sb->uuid))
 		return "device not a member of filesystem";
 
-	if (sb->dev_idx >= newest->nr_devices)
-		return "device has invalid dev_idx";
-
-	if (bch2_is_zero(mi->members[sb->dev_idx].uuid.b, sizeof(uuid_le)))
+	if (!bch2_dev_exists(newest, mi, sb->dev_idx))
 		return "device has been removed";
 
 	if (fs->block_size != sb->block_size)
@@ -981,9 +991,6 @@ static void bch2_dev_free(struct bch_dev *ca)
 	free_percpu(ca->sectors_written);
 	bioset_exit(&ca->replica_set);
 	free_percpu(ca->usage_percpu);
-	kvpfree(ca->disk_buckets, bucket_bytes(ca));
-	kfree(ca->prio_buckets);
-	kfree(ca->bio_prio);
 	kvpfree(ca->buckets, ca->mi.nbuckets * sizeof(struct bucket));
 	kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
 	free_heap(&ca->copygc_heap);
@@ -1011,7 +1018,7 @@ static void __bch2_dev_offline(struct bch_dev *ca)
 
 	lockdep_assert_held(&c->state_lock);
 
-	__bch2_dev_read_only(ca->fs, ca);
+	__bch2_dev_read_only(c, ca);
 
 	reinit_completion(&ca->offline_complete);
 	percpu_ref_kill(&ca->io_ref);
@@ -1061,7 +1068,7 @@ static int bch2_dev_sysfs_online(struct bch_dev *ca)
 		return 0;
 
 	if (!ca->kobj.state_in_sysfs) {
-		ret = kobject_add(&ca->kobj, &ca->fs->kobj,
+		ret = kobject_add(&ca->kobj, &c->kobj,
 				  "dev-%u", ca->dev_idx);
 		if (ret)
 			return ret;
@@ -1087,7 +1094,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 	struct bch_member *member;
 	size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
 	size_t heap_size;
-	unsigned i;
+	unsigned i, btree_node_reserve_buckets;
 	struct bch_dev *ca;
 
 	if (bch2_fs_init_fault("dev_alloc"))
@@ -1107,8 +1114,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 	ca->dev_idx = dev_idx;
 
 	spin_lock_init(&ca->freelist_lock);
-	spin_lock_init(&ca->prio_buckets_lock);
-	mutex_init(&ca->prio_write_lock);
 	bch2_dev_moving_gc_init(ca);
 
 	INIT_WORK(&ca->io_error_work, bch2_nonfatal_io_error_work);
@@ -1134,12 +1139,16 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 	free_inc_reserve = movinggc_reserve / 2;
 	heap_size = movinggc_reserve * 8;
 
+	btree_node_reserve_buckets =
+		DIV_ROUND_UP(BTREE_NODE_RESERVE,
+			     ca->mi.bucket_size / c->sb.btree_node_size);
+
 	if (percpu_ref_init(&ca->ref, bch2_dev_ref_release,
 			    0, GFP_KERNEL) ||
 	    percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_release,
 			    PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
-	    !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
-	    !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
+	    !init_fifo(&ca->free[RESERVE_BTREE], btree_node_reserve_buckets,
+		       GFP_KERNEL) ||
 	    !init_fifo(&ca->free[RESERVE_MOVINGGC],
 		       movinggc_reserve, GFP_KERNEL) ||
 	    !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
@@ -1152,18 +1161,12 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 	    !(ca->buckets = kvpmalloc(ca->mi.nbuckets *
				      sizeof(struct bucket),
				      GFP_KERNEL|__GFP_ZERO)) ||
-	    !(ca->prio_buckets = kzalloc(sizeof(u64) * prio_buckets(ca) *
					 2, GFP_KERNEL)) ||
-	    !(ca->disk_buckets = kvpmalloc(bucket_bytes(ca), GFP_KERNEL)) ||
 	    !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
-	    !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
 	    bioset_init(&ca->replica_set, 4,
			offsetof(struct bch_write_bio, bio)) ||
 	    !(ca->sectors_written = alloc_percpu(*ca->sectors_written)))
 		goto err;
 
-	ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
-
 	total_reserve = ca->free_inc.size;
 	for (i = 0; i < RESERVE_NR; i++)
 		total_reserve += ca->free[i].size;
@@ -1232,53 +1235,48 @@ static int __bch2_dev_online(struct bch_fs *c, struct bcache_superblock *sb)
 
 	lg_local_lock(&c->usage_lock);
 	if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA)))
-		bch2_mark_dev_metadata(ca->fs, ca);
+		bch2_mark_dev_metadata(c, ca);
 	lg_local_unlock(&c->usage_lock);
 
+	if (ca->mi.state == BCH_MEMBER_STATE_RW) {
+		struct bch_sb_field_journal *journal_buckets =
+			bch2_sb_get_journal(ca->disk_sb.sb);
+		bool has_journal =
+			bch2_nr_journal_buckets(journal_buckets) >=
+			BCH_JOURNAL_BUCKETS_MIN;
+
+		bch2_dev_group_add(&c->tiers[ca->mi.tier].devs, ca);
+		bch2_dev_group_add(&c->all_devs, ca);
+
+		if (has_journal)
+			bch2_dev_group_add(&c->journal.devs, ca);
+	}
+
 	percpu_ref_reinit(&ca->io_ref);
 	return 0;
 }
 
 /* Device management: */
 
-bool bch2_fs_may_start(struct bch_fs *c, int flags)
+static bool have_enough_devs(struct bch_fs *c,
+			     struct replicas_status s,
+			     unsigned flags)
 {
-	struct bch_sb_field_members *mi;
-	unsigned meta_missing = 0;
-	unsigned data_missing = 0;
-	bool degraded = false;
-	unsigned i;
-
-	mutex_lock(&c->sb_lock);
-	mi = bch2_sb_get_members(c->disk_sb);
-
-	for (i = 0; i < c->disk_sb->nr_devices; i++)
-		if (!c->devs[i] &&
-		    !bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le))) {
-			degraded = true;
-			if (BCH_MEMBER_HAS_METADATA(&mi->members[i]))
-				meta_missing++;
-			if (BCH_MEMBER_HAS_DATA(&mi->members[i]))
-				data_missing++;
-		}
-	mutex_unlock(&c->sb_lock);
-
-	if (degraded &&
-	    !(flags & BCH_FORCE_IF_DEGRADED))
-		return false;
-
-	if (meta_missing &&
+	if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
+	     s.replicas[BCH_DATA_BTREE].nr_offline) &&
 	    !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
 		return false;
 
-	if (meta_missing >= BCH_SB_META_REPLICAS_HAVE(c->disk_sb) &&
+	if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
+	     !s.replicas[BCH_DATA_BTREE].nr_online) &&
 	    !(flags & BCH_FORCE_IF_METADATA_LOST))
 		return false;
 
-	if (data_missing && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
+	if (s.replicas[BCH_DATA_USER].nr_offline &&
+	    !(flags & BCH_FORCE_IF_DATA_DEGRADED))
 		return false;
 
-	if (data_missing >= BCH_SB_DATA_REPLICAS_HAVE(c->disk_sb) &&
+	if (!s.replicas[BCH_DATA_USER].nr_online &&
 	    !(flags & BCH_FORCE_IF_DATA_LOST))
 		return false;
 
@@ -1297,40 +1295,80 @@ bool bch2_fs_may_start(struct bch_fs *c, int flags)
 bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
 			    enum bch_member_state new_state, int flags)
 {
-	lockdep_assert_held(&c->state_lock);
-
-	if (new_state == BCH_MEMBER_STATE_RW)
-		return true;
+	struct replicas_status s;
+	struct bch_dev *ca2;
+	int i, nr_rw = 0, required;
 
-	if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
-		return true;
+	lockdep_assert_held(&c->state_lock);
 
-	/*
-	 * If the device is already offline - whatever is going on with it can't
-	 * possible make the FS need to go RO:
-	 */
-	if (!bch2_dev_is_online(ca))
+	switch (new_state) {
+	case BCH_MEMBER_STATE_RW:
 		return true;
+	case BCH_MEMBER_STATE_RO:
+		if (ca->mi.state != BCH_MEMBER_STATE_RW)
+			return true;
+
+		/* do we have enough devices to write to? */
+		for_each_member_device(ca2, c, i)
+			nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
+
+		required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
+			       ? c->opts.metadata_replicas
+			       : c->opts.metadata_replicas_required,
+			       !(flags & BCH_FORCE_IF_DATA_DEGRADED)
+			       ? c->opts.data_replicas
+			       : c->opts.data_replicas_required);
+
+		return nr_rw - 1 <= required;
+	case BCH_MEMBER_STATE_FAILED:
+	case BCH_MEMBER_STATE_SPARE:
+		if (ca->mi.state != BCH_MEMBER_STATE_RW &&
+		    ca->mi.state != BCH_MEMBER_STATE_RO)
+			return true;
+
+		/* do we have enough devices to read from? */
+		s = __bch2_replicas_status(c, ca);
+
+		pr_info("replicas: j %u %u b %u %u d %u %u",
+			s.replicas[BCH_DATA_JOURNAL].nr_online,
+			s.replicas[BCH_DATA_JOURNAL].nr_offline,
+
+			s.replicas[BCH_DATA_BTREE].nr_online,
+			s.replicas[BCH_DATA_BTREE].nr_offline,
+
+			s.replicas[BCH_DATA_USER].nr_online,
+			s.replicas[BCH_DATA_USER].nr_offline);
+
+		return have_enough_devs(c, s, flags);
+	default:
+		BUG();
+	}
+}
 
-	if (ca->mi.has_data &&
-	    !(flags & BCH_FORCE_IF_DATA_DEGRADED))
-		return false;
-
-	if (ca->mi.has_data &&
-	    c->sb.data_replicas_have <= 1 &&
-	    !(flags & BCH_FORCE_IF_DATA_LOST))
-		return false;
+static bool bch2_fs_may_start(struct bch_fs *c, int flags)
+{
+	struct replicas_status s;
+	struct bch_sb_field_members *mi;
+	unsigned i;
 
-	if (ca->mi.has_metadata &&
-	    !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
-		return false;
+	if (!c->opts.degraded) {
+		mutex_lock(&c->sb_lock);
+		mi = bch2_sb_get_members(c->disk_sb);
+
+		for (i = 0; i < c->disk_sb->nr_devices; i++)
+			if (bch2_dev_exists(c->disk_sb, mi, i) &&
+			    !bch2_dev_is_online(c->devs[i]) &&
+			    (c->devs[i]->mi.state == BCH_MEMBER_STATE_RW ||
+			     c->devs[i]->mi.state == BCH_MEMBER_STATE_RO)) {
+				mutex_unlock(&c->sb_lock);
+				return false;
+			}
+		mutex_unlock(&c->sb_lock);
+	}
 
-	if (ca->mi.has_metadata &&
-	    c->sb.meta_replicas_have <= 1 &&
-	    !(flags & BCH_FORCE_IF_METADATA_LOST))
-		return false;
+	s = bch2_replicas_status(c);
 
-	return true;
+	return have_enough_devs(c, s, flags);
 }
 
 static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
@@ -1343,8 +1381,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
 	 * complete.
 	 */
 	bch2_dev_allocator_stop(ca);
-
-	bch2_dev_group_remove(&c->journal.devs, ca);
+	bch2_dev_allocator_remove(c, ca);
 }
 
 static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
@@ -1353,6 +1390,9 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
 
 	BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW);
 
+	bch2_dev_allocator_add(c, ca);
+	bch2_recalc_capacity(c);
+
 	if (bch2_dev_allocator_start(ca))
 		return "error starting allocator thread";
 
@@ -1411,7 +1451,7 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
 int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 {
 	struct bch_sb_field_members *mi;
-	unsigned dev_idx = ca->dev_idx;
+	unsigned dev_idx = ca->dev_idx, data;
 	int ret = -EINVAL;
 
 	mutex_lock(&c->state_lock);
@@ -1439,19 +1479,12 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 		goto err;
 	}
 
-	if (ca->mi.has_data || ca->mi.has_metadata) {
-		bch_err(ca, "Remove failed, still has data");
+	data = bch2_dev_has_data(c, ca);
+	if (data) {
+		bch_err(ca, "Remove failed, still has data (%x)", data);
 		goto err;
 	}
 
-	/*
-	 * Ok, really doing the remove:
-	 * Drop device's prio pointer before removing it from superblock:
-	 */
-	spin_lock(&c->journal.lock);
-	c->journal.prio_buckets[dev_idx] = 0;
-	spin_unlock(&c->journal.lock);
-
 	bch2_journal_meta(&c->journal);
 
 	__bch2_dev_offline(ca);
@@ -1476,6 +1509,7 @@ err:
 	return ret;
 }
 
+/* Add new device to running filesystem: */
 int bch2_dev_add(struct bch_fs *c, const char *path)
 {
 	struct bcache_superblock sb;
@@ -1490,7 +1524,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 	if (err)
 		return -EINVAL;
 
-	err = bch2_validate_cache_super(&sb);
+	err = bch2_sb_validate(&sb);
 	if (err)
 		return -EINVAL;
 
@@ -1514,9 +1548,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 	mi = bch2_sb_get_members(c->disk_sb);
 
 	for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
-		if (dev_idx >= c->sb.nr_devices ||
-		    bch2_is_zero(mi->members[dev_idx].uuid.b,
-				 sizeof(uuid_le)))
+		if (!bch2_dev_exists(c->disk_sb, mi, dev_idx))
 			goto have_slot;
 no_slot:
 	err = "no slots available in superblock";
@@ -1587,13 +1619,13 @@ err:
 	return ret ?: -EINVAL;
 }
 
+/* Hot add existing device to running filesystem: */
 int bch2_dev_online(struct bch_fs *c, const char *path)
 {
 	struct bcache_superblock sb = { 0 };
 	struct bch_dev *ca;
 	unsigned dev_idx;
 	const char *err;
-	int ret;
 
 	mutex_lock(&c->state_lock);
 
@@ -1616,12 +1648,6 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 	mutex_unlock(&c->sb_lock);
 
 	ca = c->devs[dev_idx];
-	ret = bch2_prio_read(ca);
-	if (ret) {
-		err = "error reading priorities";
-		goto err;
-	}
-
 	if (ca->mi.state == BCH_MEMBER_STATE_RW) {
 		err = __bch2_dev_read_write(c, ca);
 		if (err)
@@ -1656,6 +1682,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
 
 int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
 {
+	unsigned data;
 	int ret;
 
 	mutex_lock(&c->state_lock);
@@ -1680,8 +1707,9 @@ int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
 		return ret;
 	}
 
-	if (ca->mi.has_data || ca->mi.has_metadata) {
-		bch_err(ca, "Migrate error: data still present");
+	data = bch2_dev_has_data(c, ca);
+	if (data) {
+		bch_err(ca, "Migrate error: data still present (%x)", data);
 		return -EINVAL;
 	}
 
@@ -1714,11 +1742,7 @@ const char *bch2_fs_open(char * const *devices, unsigned nr_devices,
 		if (err)
 			goto err;
 
-		err = "attempting to register backing device";
-		if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
-			goto err;
-
-		err = bch2_validate_cache_super(&sb[i]);
+		err = bch2_sb_validate(&sb[i]);
 		if (err)
 			goto err;
 	}
@@ -1790,7 +1814,7 @@ static const char *__bch2_fs_open_incremental(struct bcache_superblock *sb,
 	struct bch_fs *c;
 	bool allocated_fs = false;
 
-	err = bch2_validate_cache_super(sb);
+	err = bch2_sb_validate(sb);
 	if (err)
 		return err;
 
@@ -1855,11 +1879,7 @@ const char *bch2_fs_open_incremental(const char *path)
 	if (err)
 		return err;
 
-	if (!__SB_IS_BDEV(le64_to_cpu(sb.sb->version)))
-		err = __bch2_fs_open_incremental(&sb, opts);
-	else
-		err = "not a bcachefs superblock";
-
+	err = __bch2_fs_open_incremental(&sb, opts);
 	bch2_free_super(&sb);
 	return err;
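The new have_enough_devs() and bch2_dev_state_allowed() above consume a struct replicas_status indexed by data type. Its definition is not part of this file's diff; judging only from how the fields are used here, it plausibly looks like:

struct replicas_status {
	/* Hypothetical reconstruction from usage in this patch: per data
	 * type (BCH_DATA_JOURNAL, BCH_DATA_BTREE, BCH_DATA_USER, ...),
	 * counts of replicas on online vs. offline member devices. */
	struct {
		unsigned nr_online;
		unsigned nr_offline;
	} replicas[BCH_DATA_NR];
};

have_enough_devs() then treats any offline journal or btree replica as metadata degradation (refused unless BCH_FORCE_IF_METADATA_DEGRADED is set) and zero online replicas as metadata loss (refused unless BCH_FORCE_IF_METADATA_LOST is set); user data gets the same two-tier check through the BCH_DATA_USER entry.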