Diffstat (limited to 'libbcachefs/alloc.c')
-rw-r--r--  libbcachefs/alloc.c  |  338
1 file changed, 137 insertions, 201 deletions
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index 953c6b3b..1c5b2e49 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -77,42 +77,6 @@ static void bch2_recalc_min_prio(struct bch_dev *, int);
 
-/* Allocation groups: */
-
-void bch2_dev_group_remove(struct dev_group *grp, struct bch_dev *ca)
-{
-	unsigned i;
-
-	spin_lock(&grp->lock);
-
-	for (i = 0; i < grp->nr; i++)
-		if (grp->d[i].dev == ca) {
-			grp->nr--;
-			memmove(&grp->d[i],
-				&grp->d[i + 1],
-				(grp->nr - i) * sizeof(grp->d[0]));
-			break;
-		}
-
-	spin_unlock(&grp->lock);
-}
-
-void bch2_dev_group_add(struct dev_group *grp, struct bch_dev *ca)
-{
-	unsigned i;
-
-	spin_lock(&grp->lock);
-	for (i = 0; i < grp->nr; i++)
-		if (grp->d[i].dev == ca)
-			goto out;
-
-	BUG_ON(grp->nr >= BCH_SB_MEMBERS_MAX);
-
-	grp->d[grp->nr++].dev = ca;
-out:
-	spin_unlock(&grp->lock);
-}
-
 /* Ratelimiting/PD controllers */
 
 static void pd_controllers_update(struct work_struct *work)
@@ -139,24 +103,24 @@ static void pd_controllers_update(struct work_struct *work)
 				faster_tiers_dirty,
 				-1);
 
-		spin_lock(&c->tiers[i].devs.lock);
-		group_for_each_dev(ca, &c->tiers[i].devs, iter) {
+		for_each_member_device_rcu(ca, c, iter, &c->tiers[i].devs) {
 			struct bch_dev_usage stats = bch2_dev_usage_read(ca);
-			unsigned bucket_bits = ca->bucket_bits + 9;
-			u64 size = (ca->mi.nbuckets -
-				    ca->mi.first_bucket) << bucket_bits;
-			u64 dirty = stats.buckets[S_DIRTY] << bucket_bits;
-			u64 free = __dev_buckets_free(ca, stats) << bucket_bits;
+			u64 size = bucket_to_sector(ca, ca->mi.nbuckets -
+					ca->mi.first_bucket) << 9;
+			u64 dirty = bucket_to_sector(ca,
+					stats.buckets[S_DIRTY]) << 9;
+			u64 free = bucket_to_sector(ca,
+					__dev_buckets_free(ca, stats)) << 9;
 
 			/*
 			 * Bytes of internal fragmentation, which can be
 			 * reclaimed by copy GC
 			 */
-			s64 fragmented = ((stats.buckets[S_DIRTY] +
-					   stats.buckets_cached) <<
-					  bucket_bits) -
-					 ((stats.sectors[S_DIRTY] +
-					   stats.sectors_cached) << 9);
+			s64 fragmented = (bucket_to_sector(ca,
+					stats.buckets[S_DIRTY] +
+					stats.buckets_cached) -
+					(stats.sectors[S_DIRTY] +
+					 stats.sectors_cached)) << 9;
 
 			fragmented = max(0LL, fragmented);
 
@@ -174,7 +138,6 @@ static void pd_controllers_update(struct work_struct *work)
 
 			copygc_can_free += fragmented;
 		}
-		spin_unlock(&c->tiers[i].devs.lock);
 	}
 
 	rcu_read_unlock();
@@ -427,19 +390,22 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
 	return ret;
 }
 
-int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq)
+static int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq)
 {
 	struct btree_iter iter;
-	struct bucket *g;
+	unsigned long bucket;
 	int ret = 0;
 
 	bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
 			     BTREE_ITER_INTENT);
 
-	for_each_bucket(g, ca) {
-		ret = __bch2_alloc_write_key(c, ca, g, &iter, journal_seq);
+	for_each_set_bit(bucket, ca->bucket_dirty, ca->mi.nbuckets) {
+		ret = __bch2_alloc_write_key(c, ca, ca->buckets + bucket,
+					     &iter, journal_seq);
 		if (ret)
 			break;
+
+		clear_bit(bucket, ca->bucket_dirty);
 	}
 
 	bch2_btree_iter_unlock(&iter);
@@ -926,8 +892,10 @@ static int bch2_allocator_thread(void *arg)
 
 		ca->nr_invalidated = ret;
 
-		if (ca->nr_invalidated == fifo_used(&ca->free_inc))
+		if (ca->nr_invalidated == fifo_used(&ca->free_inc)) {
 			ca->alloc_thread_started = true;
+			bch2_alloc_write(c, ca, &journal_seq);
+		}
 
 		if (ca->allocator_invalidating_data)
 			bch2_journal_flush_seq(&c->journal, journal_seq);
@@ -996,6 +964,21 @@
 
 /* Allocation */
 
+/*
+ * XXX: allocation on startup is still sketchy. There is insufficient
+ * synchronization for bch2_bucket_alloc_startup() to work correctly after
+ * bch2_alloc_write() has been called, and we aren't currently doing anything
+ * to guarantee that this won't happen.
+ *
+ * Even aside from that, it's really difficult to avoid situations where on
+ * startup we write out a pointer to a freshly allocated bucket before the
+ * corresponding gen - when we're still digging ourself out of the "i need to
+ * allocate to write bucket gens, but i need to write bucket gens to allocate"
+ * hole.
+ *
+ * Fortunately, bch2_btree_mark_key_initial() will detect and repair this
+ * easily enough...
+ */
 static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
 {
 	struct bucket *g;
@@ -1012,6 +995,7 @@ static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
 		    is_available_bucket(g->mark) &&
 		    bch2_mark_alloc_bucket_startup(ca, g)) {
 			r = g - ca->buckets;
+			set_bit(r, ca->bucket_dirty);
 			break;
 		}
 out:
@@ -1055,6 +1039,7 @@ long bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
 	spin_unlock(&ca->freelist_lock);
 	if (unlikely(!ca->alloc_thread_started) &&
+	    (reserve == RESERVE_ALLOC) &&
 	    (r = bch2_bucket_alloc_startup(c, ca)) >= 0) {
 		verify_not_on_freelist(ca, r);
 		goto out2;
 	}
@@ -1081,92 +1066,87 @@ enum bucket_alloc_ret {
 	FREELIST_EMPTY,		/* Allocator thread not keeping up */
 };
 
-static void recalc_alloc_group_weights(struct bch_fs *c,
-				       struct dev_group *devs)
+struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
+					 struct write_point *wp,
+					 struct bch_devs_mask *devs)
 {
-	struct bch_dev *ca;
-	u64 available_buckets = 1; /* avoid a divide by zero... */
-	unsigned i;
+	struct dev_alloc_list ret = { .nr = 0 };
+	struct bch_dev *ca, *ca2;
+	unsigned i, j;
 
-	for (i = 0; i < devs->nr; i++) {
-		ca = devs->d[i].dev;
+	for_each_member_device_rcu(ca, c, i, devs) {
+		for (j = 0; j < ret.nr; j++) {
+			unsigned idx = ret.devs[j];
 
-		devs->d[i].weight = dev_buckets_free(ca);
-		available_buckets += devs->d[i].weight;
-	}
+			ca2 = rcu_dereference(c->devs[idx]);
+			if (!ca2)
+				break;
+
+			if (ca->mi.tier < ca2->mi.tier)
+				break;
+
+			if (ca->mi.tier == ca2->mi.tier &&
+			    wp->next_alloc[i] < wp->next_alloc[idx])
+				break;
+		}
 
-	for (i = 0; i < devs->nr; i++) {
-		const unsigned min_weight = U32_MAX >> 4;
-		const unsigned max_weight = U32_MAX;
-
-		devs->d[i].weight =
-			min_weight +
-			div64_u64(devs->d[i].weight *
-				  devs->nr *
-				  (max_weight - min_weight),
-				  available_buckets);
-		devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight);
+		memmove(&ret.devs[j + 1],
+			&ret.devs[j],
+			sizeof(ret.devs[0]) * (ret.nr - j));
+		ret.nr++;
+		ret.devs[j] = i;
 	}
+
+	return ret;
+}
+
+void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
+		     struct write_point *wp)
+{
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(wp->next_alloc); i++)
+		wp->next_alloc[i] >>= 1;
 }
 
-static enum bucket_alloc_ret bch2_bucket_alloc_group(struct bch_fs *c,
-						     struct open_bucket *ob,
-						     enum alloc_reserve reserve,
-						     unsigned nr_replicas,
-						     struct dev_group *devs,
-						     long *devs_used)
+static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
						     struct write_point *wp,
+						     struct open_bucket *ob,
+						     unsigned nr_replicas,
+						     enum alloc_reserve reserve,
+						     struct bch_devs_mask *devs)
 {
-	enum bucket_alloc_ret ret;
-	unsigned fail_idx = -1, i;
-	unsigned available = 0;
+	enum bucket_alloc_ret ret = NO_DEVICES;
+	struct dev_alloc_list devs_sorted;
+	unsigned i;
 
 	BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs));
 
 	if (ob->nr_ptrs >= nr_replicas)
 		return ALLOC_SUCCESS;
 
-	spin_lock(&devs->lock);
-
-	for (i = 0; i < devs->nr; i++)
-		available += !test_bit(devs->d[i].dev->dev_idx,
-				       devs_used);
-
-	recalc_alloc_group_weights(c, devs);
-
-	i = devs->cur_device;
+	rcu_read_lock();
+	devs_sorted = bch2_wp_alloc_list(c, wp, devs);
 
-	while (ob->nr_ptrs < nr_replicas) {
-		struct bch_dev *ca;
+	for (i = 0; i < devs_sorted.nr; i++) {
+		struct bch_dev *ca =
+			rcu_dereference(c->devs[devs_sorted.devs[i]]);
 		long bucket;
 
-		if (!available) {
-			ret = NO_DEVICES;
-			goto err;
-		}
-
-		i++;
-		i %= devs->nr;
-
-		ret = FREELIST_EMPTY;
-		if (i == fail_idx)
-			goto err;
-
-		ca = devs->d[i].dev;
-
-		if (test_bit(ca->dev_idx, devs_used))
-			continue;
-
-		if (fail_idx == -1 &&
-		    get_random_int() > devs->d[i].weight)
+		if (!ca)
 			continue;
 
 		bucket = bch2_bucket_alloc(c, ca, reserve);
 		if (bucket < 0) {
-			if (fail_idx == -1)
-				fail_idx = i;
+			ret = FREELIST_EMPTY;
 			continue;
 		}
 
+		wp->next_alloc[ca->dev_idx] +=
+			div64_u64(U64_MAX, dev_buckets_free(ca) *
+				  ca->mi.bucket_size);
+		bch2_wp_rescale(c, ca, wp);
+
 		/*
 		 * open_bucket_add_buckets expects new pointers at the head of
 		 * the list:
@@ -1185,56 +1165,28 @@ static enum bucket_alloc_ret bch2_bucket_alloc_group(struct bch_fs *c,
 		};
 		ob->ptr_offset[0] = 0;
 
-		__set_bit(ca->dev_idx, devs_used);
-		available--;
-		devs->cur_device = i;
+		if (ob->nr_ptrs == nr_replicas) {
+			ret = ALLOC_SUCCESS;
+			break;
+		}
 	}
 
-	ret = ALLOC_SUCCESS;
-err:
 	EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC);
-	spin_unlock(&devs->lock);
+	rcu_read_unlock();
 	return ret;
 }
 
-static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
-					struct write_point *wp,
-					struct open_bucket *ob,
-					unsigned nr_replicas,
-					enum alloc_reserve reserve,
-					long *devs_used)
-{
-	struct bch_tier *tier;
-	/*
-	 * this should implement policy - for a given type of allocation, decide
-	 * which devices to allocate from:
-	 *
-	 * XXX: switch off wp->type and do something more intelligent here
-	 */
-	if (wp->group)
-		return bch2_bucket_alloc_group(c, ob, reserve, nr_replicas,
-					       wp->group, devs_used);
-
-	/* foreground writes: prefer fastest tier: */
-	tier = READ_ONCE(c->fastest_tier);
-	if (tier)
-		bch2_bucket_alloc_group(c, ob, reserve, nr_replicas,
-					&tier->devs, devs_used);
-
-	return bch2_bucket_alloc_group(c, ob, reserve, nr_replicas,
-				       &c->all_devs, devs_used);
-}
-
 static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
 				 struct open_bucket *ob, unsigned nr_replicas,
-				 enum alloc_reserve reserve, long *devs_used,
+				 enum alloc_reserve reserve,
+				 struct bch_devs_mask *devs,
 				 struct closure *cl)
 {
 	bool waiting = false;
 
 	while (1) {
 		switch (__bch2_bucket_alloc_set(c, wp, ob, nr_replicas,
-						reserve, devs_used)) {
+						reserve, devs)) {
 		case ALLOC_SUCCESS:
 			if (waiting)
 				closure_wake_up(&c->freelist_wait);
@@ -1354,13 +1306,12 @@ static unsigned ob_ptr_sectors_free(struct bch_fs *c,
 {
 	struct bch_dev *ca = c->devs[ptr->dev];
 	unsigned i = ptr - ob->ptrs;
-	unsigned bucket_size = ca->mi.bucket_size;
-	unsigned used = (ptr->offset & (bucket_size - 1)) +
+	unsigned used = bucket_remainder(ca, ptr->offset) +
 		ob->ptr_offset[i];
 
-	BUG_ON(used > bucket_size);
+	BUG_ON(used > ca->mi.bucket_size);
 
-	return bucket_size - used;
+	return ca->mi.bucket_size - used;
 }
 
 static unsigned open_bucket_sectors_free(struct bch_fs *c,
@@ -1432,28 +1383,22 @@ static int open_bucket_add_buckets(struct bch_fs *c,
 				   enum alloc_reserve reserve,
 				   struct closure *cl)
 {
-	long devs_used[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
+	struct bch_devs_mask devs = c->rw_devs[wp->type];
 	unsigned i;
 	int ret;
 
-	/*
-	 * We might be allocating pointers to add to an existing extent
-	 * (tiering/copygc/migration) - if so, some of the pointers in our
-	 * existing open bucket might duplicate devices we already have. This is
-	 * moderately annoying.
-	 */
-
-	/* Short circuit all the fun stuff if posssible: */
 	if (ob->nr_ptrs >= nr_replicas)
 		return 0;
 
-	memset(devs_used, 0, sizeof(devs_used));
-
+	/* Don't allocate from devices we already have pointers to: */
 	for (i = 0; i < ob->nr_ptrs; i++)
-		__set_bit(ob->ptrs[i].dev, devs_used);
+		__clear_bit(ob->ptrs[i].dev, devs.d);
+
+	if (wp->group)
+		bitmap_and(devs.d, devs.d, wp->group->d, BCH_SB_MEMBERS_MAX);
 
 	ret = bch2_bucket_alloc_set(c, wp, ob, nr_replicas,
-				    reserve, devs_used, cl);
+				    reserve, &devs, cl);
 
 	if (ret == -EROFS &&
 	    ob->nr_ptrs >= nr_replicas_required)
@@ -1568,8 +1513,6 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e,
 
 		extent_ptr_append(e, tmp);
 		ob->ptr_offset[i] += sectors;
-
-		this_cpu_add(*c->devs[tmp.dev]->sectors_written, sectors);
 	}
 }
@@ -1651,6 +1594,8 @@ void bch2_recalc_capacity(struct bch_fs *c)
 	unsigned long ra_pages = 0;
 	unsigned i, j;
 
+	lockdep_assert_held(&c->state_lock);
+
 	for_each_online_member(ca, c, i) {
 		struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi;
 
@@ -1663,7 +1608,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
 	for (tier = c->tiers;
 	     tier < c->tiers + ARRAY_SIZE(c->tiers);
 	     tier++) {
-		if (!tier->devs.nr)
+		if (!dev_mask_nr(&tier->devs))
 			continue;
 		if (!fastest_tier)
 			fastest_tier = tier;
@@ -1681,8 +1626,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
 	 * Capacity of the filesystem is the capacity of all the devices in the
 	 * slowest (highest) tier - we don't include lower tier devices.
 	 */
-	spin_lock(&slowest_tier->devs.lock);
-	group_for_each_dev(ca, &slowest_tier->devs, i) {
+	for_each_member_device_rcu(ca, c, i, &slowest_tier->devs) {
 		size_t reserve = 0;
 
 		/*
@@ -1712,13 +1656,11 @@ void bch2_recalc_capacity(struct bch_fs *c)
 		reserve += 1;		/* tiering write point */
 		reserve += 1;		/* btree write point */
 
-		reserved_sectors += reserve << ca->bucket_bits;
+		reserved_sectors += bucket_to_sector(ca, reserve);
 
-		capacity += (ca->mi.nbuckets -
-			     ca->mi.first_bucket) <<
-			ca->bucket_bits;
+		capacity += bucket_to_sector(ca, ca->mi.nbuckets -
+					     ca->mi.first_bucket);
 	}
-	spin_unlock(&slowest_tier->devs.lock);
 
 set_capacity:
 	total_capacity = capacity;
@@ -1795,7 +1737,6 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
 /* device goes ro: */
 void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
 {
-	struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
 	struct closure cl;
 	unsigned i;
 
@@ -1805,9 +1746,9 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
 
 	/* First, remove device from allocation groups: */
 
-	bch2_dev_group_remove(&c->journal.devs, ca);
-	bch2_dev_group_remove(tier, ca);
-	bch2_dev_group_remove(&c->all_devs, ca);
+	clear_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d);
+	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+		clear_bit(ca->dev_idx, c->rw_devs[i].d);
 
 	/*
 	 * Capacity is calculated based off of devices in allocation groups:
@@ -1820,7 +1761,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
 
 	bch2_stop_write_point(c, ca, &ca->copygc_write_point);
 	bch2_stop_write_point(c, ca, &c->promote_write_point);
-	bch2_stop_write_point(c, ca, &ca->tiering_write_point);
+	bch2_stop_write_point(c, ca, &c->tiers[ca->mi.tier].wp);
 	bch2_stop_write_point(c, ca, &c->migration_write_point);
 	bch2_stop_write_point(c, ca, &c->btree_write_point);
 
@@ -1862,21 +1803,12 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
 /* device goes rw: */
 void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
 {
-	struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
-	struct bch_sb_field_journal *journal_buckets;
-	bool has_journal;
-
-	bch2_dev_group_add(&c->all_devs, ca);
-	bch2_dev_group_add(tier, ca);
-
-	mutex_lock(&c->sb_lock);
-	journal_buckets = bch2_sb_get_journal(ca->disk_sb.sb);
-	has_journal = bch2_nr_journal_buckets(journal_buckets) >=
-		BCH_JOURNAL_BUCKETS_MIN;
-	mutex_unlock(&c->sb_lock);
+	unsigned i;
 
-	if (has_journal)
-		bch2_dev_group_add(&c->journal.devs, ca);
+	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+		if (ca->mi.data_allowed & (1 << i))
+			set_bit(ca->dev_idx, c->rw_devs[i].d);
+	set_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d);
 }
 
 /* stop allocator thread: */
@@ -1942,13 +1874,17 @@ void bch2_fs_allocator_init(struct bch_fs *c)
 		list_add(&c->open_buckets[i].list, &c->open_buckets_free);
 	}
 
-	spin_lock_init(&c->all_devs.lock);
+	c->journal.wp.type = BCH_DATA_JOURNAL;
+	c->btree_write_point.type = BCH_DATA_BTREE;
 
 	for (i = 0; i < ARRAY_SIZE(c->tiers); i++)
-		spin_lock_init(&c->tiers[i].devs.lock);
+		c->tiers[i].wp.type = BCH_DATA_USER;
 
 	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
-		c->write_points[i].throttle = true;
+		c->write_points[i].type = BCH_DATA_USER;
+
+	c->promote_write_point.type = BCH_DATA_USER;
+	c->migration_write_point.type = BCH_DATA_USER;
 
 	c->pd_controllers_update_seconds = 5;
 	INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
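
Note on the device-selection policy in this patch: bch2_wp_alloc_list() insertion-sorts the candidate devices by tier and then by a per-write-point next_alloc counter, and __bch2_bucket_alloc_set() charges each successful allocation against the chosen device in proportion to the inverse of its free space, with bch2_wp_rescale() halving all counters so old history decays. The standalone sketch below models only that ordering/weighting scheme; the fake_dev and fake_wp types, MAX_DEVS, and the free-space numbers are invented for illustration and are not the bcachefs structures (the real code walks RCU-protected c->devs and divides by dev_buckets_free(ca) * ca->mi.bucket_size).

/*
 * Standalone model of the write-point device ordering introduced above.
 * All names and numbers here are illustrative, not bcachefs internals.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_DEVS 8

struct fake_dev {
	unsigned	tier;		/* lower tier = preferred */
	uint64_t	free_buckets;	/* stand-in for dev_buckets_free() */
};

struct fake_wp {
	uint64_t	next_alloc[MAX_DEVS];
};

/* Insertion sort: primary key tier, secondary key next_alloc (ascending). */
static unsigned sort_devs(struct fake_wp *wp, const struct fake_dev *devs,
			  unsigned nr, unsigned *order)
{
	unsigned i, j, nr_sorted = 0;

	for (i = 0; i < nr; i++) {
		for (j = 0; j < nr_sorted; j++) {
			unsigned idx = order[j];

			if (devs[i].tier < devs[idx].tier)
				break;
			if (devs[i].tier == devs[idx].tier &&
			    wp->next_alloc[i] < wp->next_alloc[idx])
				break;
		}

		/* Shift the tail up and insert device i at position j: */
		memmove(&order[j + 1], &order[j],
			sizeof(order[0]) * (nr_sorted - j));
		order[j] = i;
		nr_sorted++;
	}

	return nr_sorted;
}

/* After allocating from device i: penalize it in proportion to 1/free space. */
static void account_alloc(struct fake_wp *wp, const struct fake_dev *devs,
			  unsigned i)
{
	unsigned j;

	wp->next_alloc[i] += UINT64_MAX / (devs[i].free_buckets + 1);

	/* Rescale so counters stay bounded and old history decays: */
	for (j = 0; j < MAX_DEVS; j++)
		wp->next_alloc[j] >>= 1;
}

int main(void)
{
	struct fake_dev devs[] = {
		{ .tier = 0, .free_buckets = 1000 },
		{ .tier = 0, .free_buckets =  250 },
		{ .tier = 1, .free_buckets = 4000 },
	};
	unsigned order[MAX_DEVS], nr = 3, n, round;
	struct fake_wp wp = { { 0 } };

	for (round = 0; round < 4; round++) {
		n = sort_devs(&wp, devs, nr, order);
		printf("round %u: %u devices sorted, picking dev %u (tier %u)\n",
		       round, n, order[0], devs[order[0]].tier);
		account_alloc(&wp, devs, order[0]);
	}
	return 0;
}

The effect is a weighted round-robin within a tier: devices with more free space accumulate next_alloc "debt" more slowly and therefore get picked more often, while the periodic right-shift keeps the counters from overflowing.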