Diffstat (limited to 'libbcachefs/extents.c')
-rw-r--r-- | libbcachefs/extents.c | 1941
1 file changed, 706 insertions, 1235 deletions
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index b9c69792..6bcc1786 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -9,12 +9,10 @@ #include "bcachefs.h" #include "bkey_methods.h" #include "btree_gc.h" -#include "btree_update.h" -#include "btree_update_interior.h" +#include "btree_iter.h" #include "buckets.h" #include "checksum.h" #include "debug.h" -#include "dirent.h" #include "disk_groups.h" #include "error.h" #include "extents.h" @@ -24,85 +22,18 @@ #include "super.h" #include "super-io.h" #include "util.h" -#include "xattr.h" #include <trace/events/bcachefs.h> -unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) -{ - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - unsigned nr_ptrs = 0; - - bkey_for_each_ptr(p, ptr) - nr_ptrs++; - - return nr_ptrs; -} - -unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k) -{ - unsigned nr_ptrs = 0; - - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: { - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - - bkey_for_each_ptr(p, ptr) - nr_ptrs += !ptr->cached; - BUG_ON(!nr_ptrs); - break; - } - case KEY_TYPE_reservation: - nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas; - break; - } - - return nr_ptrs; -} - -static unsigned bch2_extent_ptr_durability(struct bch_fs *c, - struct extent_ptr_decoded p) -{ - unsigned durability = 0; - struct bch_dev *ca; - - if (p.ptr.cached) - return 0; - - ca = bch_dev_bkey_exists(c, p.ptr.dev); - - if (ca->mi.state != BCH_MEMBER_STATE_FAILED) - durability = max_t(unsigned, durability, ca->mi.durability); - - if (p.has_ec) { - struct stripe *s = - genradix_ptr(&c->stripes[0], p.ec.idx); - - if (WARN_ON(!s)) - goto out; - - durability = max_t(unsigned, durability, s->nr_redundant); - } -out: - return durability; -} - -unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned durability = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - durability += bch2_extent_ptr_durability(c, p); +static unsigned bch2_crc_field_size_max[] = { + [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, +}; - return durability; -} +static void bch2_extent_crc_pack(union bch_extent_crc *, + struct bch_extent_crc_unpacked, + enum bch_extent_entry_type); static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, unsigned dev) @@ -222,172 +153,299 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, return ret; } -void bch2_bkey_append_ptr(struct bkey_i *k, - struct bch_extent_ptr ptr) +/* KEY_TYPE_btree_ptr: */ + +const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) { - EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + return "value too big"; - switch (k->k.type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_extent: - EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); + return bch2_bkey_ptrs_invalid(c, k); +} - ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; +void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + const char *err; + char buf[160]; + struct bucket_mark mark; + struct bch_dev *ca; - memcpy((void *) &k->v + bkey_val_bytes(&k->k), - &ptr, - 
sizeof(ptr)); - k->u64s++; - break; - default: - BUG(); + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked(c, k, false), c, + "btree key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + + if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + return; + + bkey_for_each_ptr(ptrs, ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + + mark = ptr_bucket_mark(ca, ptr); + + err = "stale"; + if (gen_after(mark.gen, ptr->gen)) + goto err; + + err = "inconsistent"; + if (mark.data_type != BCH_DATA_BTREE || + mark.dirty_sectors < c->opts.btree_node_size) + goto err; } + + return; +err: + bch2_bkey_val_to_text(&PBUF(buf), c, k); + bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", + err, buf, PTR_BUCKET_NR(ca, ptr), + mark.gen, (unsigned) mark.v.counter); } -void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) +void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - struct bch_extent_ptr *ptr; + bch2_bkey_ptrs_to_text(out, c, k); +} - bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); +/* KEY_TYPE_extent: */ + +const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + return bch2_bkey_ptrs_invalid(c, k); } -const struct bch_extent_ptr * -bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) +void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + char buf[160]; - bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == dev) - return ptr; + /* + * XXX: we should be doing most/all of these checks at startup time, + * where we check bch2_bkey_invalid() in btree_node_read_done() + * + * But note that we can't check for stale pointers or incorrect gc marks + * until after journal replay is done (it might be an extent that's + * going to get overwritten during replay) + */ - return NULL; -} + if (percpu_down_read_trylock(&c->mark_lock)) { + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, + "extent key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); + percpu_up_read(&c->mark_lock); + } + /* + * If journal replay hasn't finished, we might be seeing keys + * that will be overwritten by the time journal replay is done: + */ + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + return; -bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; + extent_for_each_ptr_decode(e, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); + unsigned stale = gen_after(mark.gen, p.ptr.gen); + unsigned disk_sectors = ptr_disk_sectors(p); + unsigned mark_sectors = p.ptr.cached + ? 
mark.cached_sectors + : mark.dirty_sectors; - bkey_for_each_ptr(ptrs, ptr) - if (bch2_dev_in_target(c, ptr->dev, target) && - (!ptr->cached || - !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) - return true; + bch2_fs_bug_on(stale && !p.ptr.cached, c, + "stale dirty pointer (ptr gen %u bucket %u", + p.ptr.gen, mark.gen); - return false; + bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); + + bch2_fs_bug_on(!stale && + (mark.data_type != BCH_DATA_USER || + mark_sectors < disk_sectors), c, + "extent pointer not marked: %s:\n" + "type %u sectors %u < %u", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), + mark.data_type, + mark_sectors, disk_sectors); + } } -/* extent specific utility code */ +void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + bch2_bkey_ptrs_to_text(out, c, k); +} -const struct bch_extent_ptr * -bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) +enum merge_result bch2_extent_merge(struct bch_fs *c, + struct bkey_s _l, struct bkey_s _r) { - const struct bch_extent_ptr *ptr; + struct bkey_s_extent l = bkey_s_to_extent(_l); + struct bkey_s_extent r = bkey_s_to_extent(_r); + union bch_extent_entry *en_l = l.v->start; + union bch_extent_entry *en_r = r.v->start; + struct bch_extent_crc_unpacked crc_l, crc_r; - extent_for_each_ptr(e, ptr) - if (ptr->dev == dev) - return ptr; + if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) + return BCH_MERGE_NOMERGE; - return NULL; -} + crc_l = bch2_extent_crc_unpack(l.k, NULL); -const struct bch_extent_ptr * -bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group) -{ - const struct bch_extent_ptr *ptr; + extent_for_each_entry(l, en_l) { + en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); - extent_for_each_ptr(e, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + if (extent_entry_type(en_l) != extent_entry_type(en_r)) + return BCH_MERGE_NOMERGE; - if (ca->mi.group && - ca->mi.group - 1 == group) - return ptr; + switch (extent_entry_type(en_l)) { + case BCH_EXTENT_ENTRY_ptr: { + const struct bch_extent_ptr *lp = &en_l->ptr; + const struct bch_extent_ptr *rp = &en_r->ptr; + struct bch_dev *ca; + + if (lp->offset + crc_l.compressed_size != rp->offset || + lp->dev != rp->dev || + lp->gen != rp->gen) + return BCH_MERGE_NOMERGE; + + /* We don't allow extents to straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp->dev); + + if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) + return BCH_MERGE_NOMERGE; + + break; + } + case BCH_EXTENT_ENTRY_stripe_ptr: + if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || + en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) + return BCH_MERGE_NOMERGE; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + + if (crc_l.csum_type != crc_r.csum_type || + crc_l.compression_type != crc_r.compression_type || + crc_l.nonce != crc_r.nonce) + return BCH_MERGE_NOMERGE; + + if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || + crc_r.offset) + return BCH_MERGE_NOMERGE; + + if (!bch2_checksum_mergeable(crc_l.csum_type)) + return BCH_MERGE_NOMERGE; + + if (crc_l.compression_type) + return BCH_MERGE_NOMERGE; + + if (crc_l.csum_type && + crc_l.uncompressed_size + + crc_r.uncompressed_size > c->sb.encoded_extent_max) + return BCH_MERGE_NOMERGE; + + if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 > + 
bch2_crc_field_size_max[extent_entry_type(en_l)]) + return BCH_MERGE_NOMERGE; + + break; + default: + return BCH_MERGE_NOMERGE; + } } - return NULL; -} + extent_for_each_entry(l, en_l) { + struct bch_extent_crc_unpacked crc_l, crc_r; -unsigned bch2_extent_is_compressed(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned ret = 0; + en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && - p.crc.compression_type != BCH_COMPRESSION_NONE) - ret += p.crc.compressed_size; + if (!extent_entry_is_crc(en_l)) + continue; - return ret; -} + crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); -bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, - struct bch_extent_ptr m, u64 offset) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; + crc_l.csum = bch2_checksum_merge(crc_l.csum_type, + crc_l.csum, + crc_r.csum, + crc_r.uncompressed_size << 9); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev == m.dev && - p.ptr.gen == m.gen && - (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == - (s64) m.offset - offset) - return true; + crc_l.uncompressed_size += crc_r.uncompressed_size; + crc_l.compressed_size += crc_r.compressed_size; - return false; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, + extent_entry_type(en_l)); + } + + bch2_key_resize(l.k, l.k->size + r.k->size); + + return BCH_MERGE_MERGE; } -static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, - union bch_extent_entry *entry) +/* KEY_TYPE_reservation: */ + +const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) { - union bch_extent_entry *i = ptrs.start; + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - if (i == entry) - return NULL; + if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) + return "incorrect value size"; - while (extent_entry_next(i) != entry) - i = extent_entry_next(i); - return i; + if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) + return "invalid nr_replicas"; + + return NULL; } -union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, - struct bch_extent_ptr *ptr) +void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *dst, *src, *prev; - bool drop_crc = true; - - EBUG_ON(ptr < &ptrs.start->ptr || - ptr >= &ptrs.end->ptr); - EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - src = extent_entry_next(to_entry(ptr)); - if (src != ptrs.end && - !extent_entry_is_crc(src)) - drop_crc = false; + pr_buf(out, "generation %u replicas %u", + le32_to_cpu(r.v->generation), + r.v->nr_replicas); +} - dst = to_entry(ptr); - while ((prev = extent_entry_prev(ptrs, dst))) { - if (extent_entry_is_ptr(prev)) - break; +enum merge_result bch2_reservation_merge(struct bch_fs *c, + struct bkey_s _l, struct bkey_s _r) +{ + struct bkey_s_reservation l = bkey_s_to_reservation(_l); + struct bkey_s_reservation r = bkey_s_to_reservation(_r); - if (extent_entry_is_crc(prev)) { - if (drop_crc) - dst = prev; - break; - } + if (l.v->generation != r.v->generation || + l.v->nr_replicas != r.v->nr_replicas) + return BCH_MERGE_NOMERGE; - dst = prev; + if ((u64) l.k->size + r.k->size > 
KEY_SIZE_MAX) { + bch2_key_resize(l.k, KEY_SIZE_MAX); + bch2_cut_front_s(l.k->p, r.s); + return BCH_MERGE_PARTIAL; } - memmove_u64s_down(dst, src, - (u64 *) ptrs.end - (u64 *) src); - k.k->u64s -= (u64 *) src - (u64 *) dst; + bch2_key_resize(l.k, l.k->size + r.k->size); - return dst; + return BCH_MERGE_MERGE; +} + +/* Extent checksum entries: */ + +/* returns true if not equal */ +static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, + struct bch_extent_crc_unpacked r) +{ + return (l.csum_type != r.csum_type || + l.compression_type != r.compression_type || + l.compressed_size != r.compressed_size || + l.uncompressed_size != r.uncompressed_size || + l.offset != r.offset || + l.live_size != r.live_size || + l.nonce != r.nonce || + bch2_crc_cmp(l.csum, r.csum)); } static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, @@ -466,52 +524,404 @@ restart_narrow_pointers: return ret; } -/* returns true if not equal */ -static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, - struct bch_extent_crc_unpacked r) +static void bch2_extent_crc_pack(union bch_extent_crc *dst, + struct bch_extent_crc_unpacked src, + enum bch_extent_entry_type type) { - return (l.csum_type != r.csum_type || - l.compression_type != r.compression_type || - l.compressed_size != r.compressed_size || - l.uncompressed_size != r.uncompressed_size || - l.offset != r.offset || - l.live_size != r.live_size || - l.nonce != r.nonce || - bch2_crc_cmp(l.csum, r.csum)); +#define set_common_fields(_dst, _src) \ + _dst.type = 1 << type; \ + _dst.csum_type = _src.csum_type, \ + _dst.compression_type = _src.compression_type, \ + _dst._compressed_size = _src.compressed_size - 1, \ + _dst._uncompressed_size = _src.uncompressed_size - 1, \ + _dst.offset = _src.offset + + switch (type) { + case BCH_EXTENT_ENTRY_crc32: + set_common_fields(dst->crc32, src); + dst->crc32.csum = *((__le32 *) &src.csum.lo); + break; + case BCH_EXTENT_ENTRY_crc64: + set_common_fields(dst->crc64, src); + dst->crc64.nonce = src.nonce; + dst->crc64.csum_lo = src.csum.lo; + dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); + break; + case BCH_EXTENT_ENTRY_crc128: + set_common_fields(dst->crc128, src); + dst->crc128.nonce = src.nonce; + dst->crc128.csum = src.csum; + break; + default: + BUG(); + } +#undef set_common_fields } -void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) +void bch2_extent_crc_append(struct bkey_i *k, + struct bch_extent_crc_unpacked new) { - union bch_extent_entry *entry; - u64 *d = (u64 *) bkeyp_val(f, k); - unsigned i; + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + union bch_extent_crc *crc = (void *) ptrs.end; + enum bch_extent_entry_type type; - for (i = 0; i < bkeyp_val_u64s(f, k); i++) - d[i] = swab64(d[i]); + if (bch_crc_bytes[new.csum_type] <= 4 && + new.uncompressed_size - 1 <= CRC32_SIZE_MAX && + new.nonce <= CRC32_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc32; + else if (bch_crc_bytes[new.csum_type] <= 10 && + new.uncompressed_size - 1 <= CRC64_SIZE_MAX && + new.nonce <= CRC64_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc64; + else if (bch_crc_bytes[new.csum_type] <= 16 && + new.uncompressed_size - 1 <= CRC128_SIZE_MAX && + new.nonce <= CRC128_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc128; + else + BUG(); - for (entry = (union bch_extent_entry *) d; - entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); - entry = extent_entry_next(entry)) { - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - break; - case BCH_EXTENT_ENTRY_crc32: - 
entry->crc32.csum = swab32(entry->crc32.csum); + bch2_extent_crc_pack(crc, new, type); + + k->k.u64s += extent_entry_u64s(ptrs.end); + + EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); +} + +/* Generic code for keys with pointers: */ + +unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) +{ + return bch2_bkey_devs(k).nr; +} + +unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) +{ + return k.k->type == KEY_TYPE_reservation + ? bkey_s_c_to_reservation(k).v->nr_replicas + : bch2_bkey_dirty_devs(k).nr; +} + +unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) +{ + unsigned ret = 0; + + if (k.k->type == KEY_TYPE_reservation) { + ret = bkey_s_c_to_reservation(k).v->nr_replicas; + } else { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + ret += !p.ptr.cached && + p.crc.compression_type == BCH_COMPRESSION_NONE; + } + + return ret; +} + +unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned ret = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && + p.crc.compression_type != BCH_COMPRESSION_NONE) + ret += p.crc.compressed_size; + + return ret; +} + +bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, + unsigned nr_replicas) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bpos end = pos; + struct bkey_s_c k; + bool ret = true; + int err; + + end.offset += size; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, + BTREE_ITER_SLOTS, k, err) { + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); - entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); + + if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) { + ret = false; break; - case BCH_EXTENT_ENTRY_crc128: - entry->crc128.csum.hi = (__force __le64) - swab64((__force u64) entry->crc128.csum.hi); - entry->crc128.csum.lo = (__force __le64) - swab64((__force u64) entry->crc128.csum.lo); + } + } + bch2_trans_exit(&trans); + + return ret; +} + +static unsigned bch2_extent_ptr_durability(struct bch_fs *c, + struct extent_ptr_decoded p) +{ + unsigned durability = 0; + struct bch_dev *ca; + + if (p.ptr.cached) + return 0; + + ca = bch_dev_bkey_exists(c, p.ptr.dev); + + if (ca->mi.state != BCH_MEMBER_STATE_FAILED) + durability = max_t(unsigned, durability, ca->mi.durability); + + if (p.has_ec) { + struct stripe *s = + genradix_ptr(&c->stripes[0], p.ec.idx); + + if (WARN_ON(!s)) + goto out; + + durability = max_t(unsigned, durability, s->nr_redundant); + } +out: + return durability; +} + +unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned durability = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + durability += bch2_extent_ptr_durability(c, p); + + return durability; +} + +void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, + unsigned target, + unsigned nr_desired_replicas) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + struct extent_ptr_decoded p; + int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; + + if (target && extra > 0) + bkey_for_each_ptr_decode(k.k, 
ptrs, p, entry) { + int n = bch2_extent_ptr_durability(c, p); + + if (n && n <= extra && + !bch2_dev_in_target(c, p.ptr.dev, target)) { + entry->ptr.cached = true; + extra -= n; + } + } + + if (extra > 0) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + int n = bch2_extent_ptr_durability(c, p); + + if (n && n <= extra) { + entry->ptr.cached = true; + extra -= n; + } + } +} + +void bch2_bkey_append_ptr(struct bkey_i *k, + struct bch_extent_ptr ptr) +{ + EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); + + switch (k->k.type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_extent: + EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); + + ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + + memcpy((void *) &k->v + bkey_val_bytes(&k->k), + &ptr, + sizeof(ptr)); + k->u64s++; + break; + default: + BUG(); + } +} + +static inline void __extent_entry_insert(struct bkey_i *k, + union bch_extent_entry *dst, + union bch_extent_entry *new) +{ + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); + + memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), + dst, (u64 *) end - (u64 *) dst); + k->k.u64s += extent_entry_u64s(new); + memcpy(dst, new, extent_entry_bytes(new)); +} + +void bch2_extent_ptr_decoded_append(struct bkey_i *k, + struct extent_ptr_decoded *p) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + struct bch_extent_crc_unpacked crc = + bch2_extent_crc_unpack(&k->k, NULL); + union bch_extent_entry *pos; + + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = ptrs.start; + goto found; + } + + bkey_for_each_crc(&k->k, ptrs, crc, pos) + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = extent_entry_next(pos); + goto found; + } + + bch2_extent_crc_append(k, p->crc); + pos = bkey_val_end(bkey_i_to_s(k)); +found: + p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + __extent_entry_insert(k, pos, to_entry(&p->ptr)); + + if (p->has_ec) { + p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; + __extent_entry_insert(k, pos, to_entry(&p->ec)); + } +} + +static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, + union bch_extent_entry *entry) +{ + union bch_extent_entry *i = ptrs.start; + + if (i == entry) + return NULL; + + while (extent_entry_next(i) != entry) + i = extent_entry_next(i); + return i; +} + +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *dst, *src, *prev; + bool drop_crc = true; + + EBUG_ON(ptr < &ptrs.start->ptr || + ptr >= &ptrs.end->ptr); + EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); + + src = extent_entry_next(to_entry(ptr)); + if (src != ptrs.end && + !extent_entry_is_crc(src)) + drop_crc = false; + + dst = to_entry(ptr); + while ((prev = extent_entry_prev(ptrs, dst))) { + if (extent_entry_is_ptr(prev)) break; - case BCH_EXTENT_ENTRY_stripe_ptr: + + if (extent_entry_is_crc(prev)) { + if (drop_crc) + dst = prev; break; } + + dst = prev; } + + memmove_u64s_down(dst, src, + (u64 *) ptrs.end - (u64 *) src); + k.k->u64s -= (u64 *) src - (u64 *) dst; + + return dst; +} + +void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) +{ + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); +} + +const struct bch_extent_ptr * +bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == dev) + return ptr; + + return NULL; +} + +bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, 
unsigned target) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (bch2_dev_in_target(c, ptr->dev, target) && + (!ptr->cached || + !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) + return true; + + return false; +} + +bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, + struct bch_extent_ptr m, u64 offset) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev == m.dev && + p.ptr.gen == m.gen && + (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == + (s64) m.offset - offset) + return true; + + return false; +} + +/* + * bch_extent_normalize - clean up an extent, dropping stale pointers etc. + * + * Returns true if @k should be dropped entirely + * + * For existing keys, only called when btree nodes are being rewritten, not when + * they're merely being compacted/resorted in memory. + */ +bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) +{ + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(k, ptr, + ptr->cached && + ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); + + /* will only happen if all pointers were cached: */ + if (!bch2_bkey_nr_ptrs(k.s_c)) + k.k->type = KEY_TYPE_discard; + + return bkey_whiteout(k.k); } void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, @@ -662,70 +1072,50 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } -/* Btree ptrs */ - -const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) -{ - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) - return "value too big"; - - return bch2_bkey_ptrs_invalid(c, k); -} - -void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) +void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - const char *err; - char buf[160]; - struct bucket_mark mark; - struct bch_dev *ca; - - bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, k, false), c, - "btree key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - - if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) - return; - - bkey_for_each_ptr(ptrs, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - - mark = ptr_bucket_mark(ca, ptr); + union bch_extent_entry *entry; + u64 *d = (u64 *) bkeyp_val(f, k); + unsigned i; - err = "stale"; - if (gen_after(mark.gen, ptr->gen)) - goto err; + for (i = 0; i < bkeyp_val_u64s(f, k); i++) + d[i] = swab64(d[i]); - err = "inconsistent"; - if (mark.data_type != BCH_DATA_BTREE || - mark.dirty_sectors < c->opts.btree_node_size) - goto err; + for (entry = (union bch_extent_entry *) d; + entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); + entry = extent_entry_next(entry)) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + break; + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.csum = swab32(entry->crc32.csum); + break; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); + entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); + break; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.csum.hi = (__force __le64) + swab64((__force u64) entry->crc128.csum.hi); + entry->crc128.csum.lo = (__force __le64) + swab64((__force u64) entry->crc128.csum.lo); + break; + case 
BCH_EXTENT_ENTRY_stripe_ptr: + break; + } } - - return; -err: - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", - err, buf, PTR_BUCKET_NR(ca, ptr), - mark.gen, (unsigned) mark.v.counter); -} - -void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_ptrs_to_text(out, c, k); } -/* Extents */ +/* Generic extent code: */ -void __bch2_cut_front(struct bpos where, struct bkey_s k) +int bch2_cut_front_s(struct bpos where, struct bkey_s k) { + unsigned new_val_u64s = bkey_val_u64s(k.k); + int val_u64s_delta; u64 sub; if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) - return; + return 0; EBUG_ON(bkey_cmp(where, k.k->p) > 0); @@ -733,15 +1123,12 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k) k.k->size -= sub; - if (!k.k->size) + if (!k.k->size) { k.k->type = KEY_TYPE_deleted; + new_val_u64s = 0; + } switch (k.k->type) { - case KEY_TYPE_deleted: - case KEY_TYPE_discard: - case KEY_TYPE_error: - case KEY_TYPE_cookie: - break; case KEY_TYPE_extent: case KEY_TYPE_reflink_v: { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); @@ -779,975 +1166,59 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k) le64_add_cpu(&p.v->idx, sub); break; } - case KEY_TYPE_reservation: - break; - default: - BUG(); - } -} - -bool bch2_cut_back(struct bpos where, struct bkey *k) -{ - u64 len = 0; - - if (bkey_cmp(where, k->p) >= 0) - return false; - - EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0); - - len = where.offset - bkey_start_offset(k); - - k->p = where; - k->size = len; - - if (!len) - k->type = KEY_TYPE_deleted; - - return true; -} - -static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - unsigned ret = 0; - - bkey_extent_entry_for_each(ptrs, entry) { - switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - case BCH_EXTENT_ENTRY_stripe_ptr: - ret++; - } - } - - return ret; -} - -static int count_iters_for_insert(struct btree_trans *trans, - struct bkey_s_c k, - unsigned offset, - struct bpos *end, - unsigned *nr_iters, - unsigned max_iters, - bool overwrite) -{ - int ret = 0; - - switch (k.k->type) { - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - *nr_iters += bch2_bkey_nr_alloc_ptrs(k); - - if (*nr_iters >= max_iters) { - *end = bpos_min(*end, k.k->p); - ret = 1; - } - - break; - case KEY_TYPE_reflink_p: { - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - u64 idx = le64_to_cpu(p.v->idx); - unsigned sectors = bpos_min(*end, p.k->p).offset - - bkey_start_offset(p.k); - struct btree_iter *iter; - struct bkey_s_c r_k; - - for_each_btree_key(trans, iter, - BTREE_ID_REFLINK, POS(0, idx + offset), - BTREE_ITER_SLOTS, r_k, ret) { - if (bkey_cmp(bkey_start_pos(r_k.k), - POS(0, idx + sectors)) >= 0) - break; - - *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); + case KEY_TYPE_inline_data: { + struct bkey_s_inline_data d = bkey_s_to_inline_data(k); - if (*nr_iters >= max_iters) { - struct bpos pos = bkey_start_pos(k.k); - pos.offset += r_k.k->p.offset - idx; + sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); - *end = bpos_min(*end, pos); - ret = 1; - break; - } - } + memmove(d.v->data, + d.v->data + sub, + bkey_val_bytes(d.k) - sub); - bch2_trans_iter_put(trans, iter); + new_val_u64s -= sub >> 3; break; } } - return ret; -} - -#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) - -int bch2_extent_atomic_end(struct btree_iter *iter, - struct bkey_i *insert, - struct bpos *end) -{ - struct 
btree_trans *trans = iter->trans; - struct btree *b; - struct btree_node_iter node_iter; - struct bkey_packed *_k; - unsigned nr_iters = 0; - int ret; - - ret = bch2_btree_iter_traverse(iter); - if (ret) - return ret; - - b = iter->l[0].b; - node_iter = iter->l[0].iter; - - BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); - - *end = bpos_min(insert->k.p, b->key.k.p); - - ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, - &nr_iters, EXTENT_ITERS_MAX / 2, false); - if (ret < 0) - return ret; - - while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, - KEY_TYPE_discard))) { - struct bkey unpacked; - struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); - unsigned offset = 0; - - if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) - break; - - if (bkey_cmp(bkey_start_pos(&insert->k), - bkey_start_pos(k.k)) > 0) - offset = bkey_start_offset(&insert->k) - - bkey_start_offset(k.k); - - ret = count_iters_for_insert(trans, k, offset, end, - &nr_iters, EXTENT_ITERS_MAX, true); - if (ret) - break; - - bch2_btree_node_iter_advance(&node_iter, b); - } - - return ret < 0 ? ret : 0; -} - -int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) -{ - struct bpos end; - int ret; - - ret = bch2_extent_atomic_end(iter, k, &end); - if (ret) - return ret; - - bch2_cut_back(end, &k->k); - return 0; -} - -int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) -{ - struct bpos end; - int ret; - - ret = bch2_extent_atomic_end(iter, k, &end); - if (ret) - return ret; - - return !bkey_cmp(end, k->k.p); -} - -enum btree_insert_ret -bch2_extent_can_insert(struct btree_trans *trans, - struct btree_insert_entry *insert, - unsigned *u64s) -{ - struct btree_iter_level *l = &insert->iter->l[0]; - struct btree_node_iter node_iter = l->iter; - enum bch_extent_overlap overlap; - struct bkey_packed *_k; - struct bkey unpacked; - struct bkey_s_c k; - int sectors; - - /* - * We avoid creating whiteouts whenever possible when deleting, but - * those optimizations mean we may potentially insert two whiteouts - * instead of one (when we overlap with the front of one extent and the - * back of another): - */ - if (bkey_whiteout(&insert->k->k)) - *u64s += BKEY_U64s; - - _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, - KEY_TYPE_discard); - if (!_k) - return BTREE_INSERT_OK; - - k = bkey_disassemble(l->b, _k, &unpacked); - - overlap = bch2_extent_overlap(&insert->k->k, k.k); - - /* account for having to split existing extent: */ - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) - *u64s += _k->u64s; - - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && - (sectors = bch2_extent_is_compressed(k))) { - int flags = trans->flags & BTREE_INSERT_NOFAIL - ? 
BCH_DISK_RESERVATION_NOFAIL : 0; - - switch (bch2_disk_reservation_add(trans->c, - trans->disk_res, - sectors, flags)) { - case 0: - break; - case -ENOSPC: - return BTREE_INSERT_ENOSPC; - default: - BUG(); - } - } - - return BTREE_INSERT_OK; -} - -static void verify_extent_nonoverlapping(struct bch_fs *c, - struct btree *b, - struct btree_node_iter *_iter, - struct bkey_i *insert) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct btree_node_iter iter; - struct bkey_packed *k; - struct bkey uk; - - if (!expensive_debug_checks(c)) - return; - - iter = *_iter; - k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); - - iter = *_iter; - k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); -#if 0 - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); -#else - if (k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { - char buf1[100]; - char buf2[100]; - - bch2_bkey_to_text(&PBUF(buf1), &insert->k); - bch2_bkey_to_text(&PBUF(buf2), &uk); - - bch2_dump_btree_node(b); - panic("insert > next :\n" - "insert %s\n" - "next %s\n", - buf1, buf2); - } -#endif + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); -#endif + set_bkey_val_u64s(k.k, new_val_u64s); + memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); + return -val_u64s_delta; } -static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert) +int bch2_cut_back_s(struct bpos where, struct bkey_s k) { - struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *k = - bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); - - BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - verify_extent_nonoverlapping(c, l->b, &l->iter, insert); - - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); - - bch2_bset_insert(l->b, &l->iter, k, insert, 0); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); -} + unsigned new_val_u64s = bkey_val_u64s(k.k); + int val_u64s_delta; + u64 len = 0; -static void -extent_squash(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert, - struct bkey_packed *_k, struct bkey_s k, - enum bch_extent_overlap overlap) -{ - struct btree_iter_level *l = &iter->l[0]; - - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - /* insert overlaps with start of k: */ - __bch2_cut_front(insert->k.p, k); - EBUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - break; + if (bkey_cmp(where, k.k->p) >= 0) + return 0; - case BCH_EXTENT_OVERLAP_BACK: - /* insert overlaps with end of k: */ - bch2_cut_back(bkey_start_pos(&insert->k), k.k); - EBUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); + EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); - /* - * As the auxiliary tree is indexed by the end of the - * key and we've just changed the end, update the - * auxiliary tree. 
- */ - bch2_bset_fix_invalidated_key(l->b, _k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); - break; + len = where.offset - bkey_start_offset(k.k); - case BCH_EXTENT_OVERLAP_ALL: { - /* The insert key completely covers k, invalidate k */ - if (!bkey_whiteout(k.k)) - btree_account_key_drop(l->b, _k); + k.k->p = where; + k.k->size = len; - k.k->size = 0; + if (!len) { k.k->type = KEY_TYPE_deleted; - - if (_k >= btree_bset_last(l->b)->start) { - unsigned u64s = _k->u64s; - - bch2_bset_delete(l->b, _k, _k->u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, u64s, 0); - } else { - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - } - - break; - } - case BCH_EXTENT_OVERLAP_MIDDLE: { - BKEY_PADDED(k) split; - /* - * The insert key falls 'in the middle' of k - * The insert key splits k in 3: - * - start only in k, preserve - * - middle common section, invalidate in k - * - end only in k, preserve - * - * We update the old key to preserve the start, - * insert will be the new common section, - * we manually insert the end that we are preserving. - * - * modify k _before_ doing the insert (which will move - * what k points to) - */ - bkey_reassemble(&split.k, k.s_c); - split.k.k.needs_whiteout |= bkey_written(l->b, _k); - - bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k); - BUG_ON(bkey_deleted(&split.k.k)); - - __bch2_cut_front(insert->k.p, k); - BUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - - extent_bset_insert(c, iter, &split.k); - break; - } - } -} - -/** - * bch_extent_insert_fixup - insert a new extent and deal with overlaps - * - * this may result in not actually doing the insert, or inserting some subset - * of the insert key. For cmpxchg operations this is where that logic lives. - * - * All subsets of @insert that need to be inserted are inserted using - * bch2_btree_insert_and_journal(). If @b or @res fills up, this function - * returns false, setting @iter->pos for the prefix of @insert that actually got - * inserted. - * - * BSET INVARIANTS: this function is responsible for maintaining all the - * invariants for bsets of extents in memory. things get really hairy with 0 - * size extents - * - * within one bset: - * - * bkey_start_pos(bkey_next(k)) >= k - * or bkey_start_offset(bkey_next(k)) >= k->offset - * - * i.e. strict ordering, no overlapping extents. - * - * multiple bsets (i.e. full btree node): - * - * ∀ k, j - * k.size != 0 ∧ j.size != 0 → - * ¬ (k > bkey_start_pos(j) ∧ k < j) - * - * i.e. no two overlapping keys _of nonzero size_ - * - * We can't realistically maintain this invariant for zero size keys because of - * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j - * there may be another 0 size key between them in another bset, and it will - * thus overlap with the merged key. - * - * In addition, the end of iter->pos indicates how much has been processed. - * If the end of iter->pos is not the same as the end of insert, then - * key insertion needs to continue/be retried. 
- */ -void bch2_insert_fixup_extent(struct btree_trans *trans, - struct btree_insert_entry *insert_entry) -{ - struct bch_fs *c = trans->c; - struct btree_iter *iter = insert_entry->iter; - struct bkey_i *insert = insert_entry->k; - struct btree_iter_level *l = &iter->l[0]; - struct btree_node_iter node_iter = l->iter; - bool deleting = bkey_whiteout(&insert->k); - bool update_journal = !deleting; - bool update_btree = !deleting; - struct bkey_i whiteout = *insert; - struct bkey_packed *_k; - struct bkey unpacked; - - EBUG_ON(iter->level); - EBUG_ON(!insert->k.size); - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - - while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, - KEY_TYPE_discard))) { - struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); - struct bpos cur_end = bpos_min(insert->k.p, k.k->p); - enum bch_extent_overlap overlap = - bch2_extent_overlap(&insert->k, k.k); - - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) - break; - - if (!bkey_whiteout(k.k)) - update_journal = true; - - if (!update_journal) { - bch2_cut_front(cur_end, insert); - bch2_cut_front(cur_end, &whiteout); - bch2_btree_iter_set_pos_same_leaf(iter, cur_end); - goto next; - } - - /* - * When deleting, if possible just do it by switching the type - * of the key we're deleting, instead of creating and inserting - * a new whiteout: - */ - if (deleting && - !update_btree && - !bkey_cmp(insert->k.p, k.k->p) && - !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { - if (!bkey_whiteout(k.k)) { - btree_account_key_drop(l->b, _k); - _k->type = KEY_TYPE_discard; - reserve_whiteout(l->b, _k); - bch2_btree_iter_fix_key_modified(iter, - l->b, _k); - } - break; - } - - if (k.k->needs_whiteout || bkey_written(l->b, _k)) { - insert->k.needs_whiteout = true; - update_btree = true; - } - - if (update_btree && - overlap == BCH_EXTENT_OVERLAP_ALL && - bkey_whiteout(k.k) && - k.k->needs_whiteout) { - unreserve_whiteout(l->b, _k); - _k->needs_whiteout = false; - } - - extent_squash(c, iter, insert, _k, k, overlap); - - if (!update_btree) - bch2_cut_front(cur_end, insert); -next: - node_iter = l->iter; - - if (overlap == BCH_EXTENT_OVERLAP_FRONT || - overlap == BCH_EXTENT_OVERLAP_MIDDLE) - break; - } - - l->iter = node_iter; - bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); - - if (update_btree) { - if (deleting) - insert->k.type = KEY_TYPE_discard; - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - - extent_bset_insert(c, iter, insert); - } - - if (update_journal) { - struct bkey_i *k = !deleting ? 
insert : &whiteout; - - if (deleting) - k->k.type = KEY_TYPE_discard; - - EBUG_ON(bkey_deleted(&k->k) || !k->k.size); - - bch2_btree_journal_key(trans, iter, k); - } - - bch2_cut_front(insert->k.p, insert); -} - -const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) -{ - return bch2_bkey_ptrs_invalid(c, k); -} - -void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - char buf[160]; - - /* - * XXX: we should be doing most/all of these checks at startup time, - * where we check bch2_bkey_invalid() in btree_node_read_done() - * - * But note that we can't check for stale pointers or incorrect gc marks - * until after journal replay is done (it might be an extent that's - * going to get overwritten during replay) - */ - - if (percpu_down_read_trylock(&c->mark_lock)) { - bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, - "extent key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); - percpu_up_read(&c->mark_lock); - } - /* - * If journal replay hasn't finished, we might be seeing keys - * that will be overwritten by the time journal replay is done: - */ - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - return; - - extent_for_each_ptr_decode(e, p, entry) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); - unsigned stale = gen_after(mark.gen, p.ptr.gen); - unsigned disk_sectors = ptr_disk_sectors(p); - unsigned mark_sectors = p.ptr.cached - ? mark.cached_sectors - : mark.dirty_sectors; - - bch2_fs_bug_on(stale && !p.ptr.cached, c, - "stale dirty pointer (ptr gen %u bucket %u", - p.ptr.gen, mark.gen); - - bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); - - bch2_fs_bug_on(!stale && - (mark.data_type != BCH_DATA_USER || - mark_sectors < disk_sectors), c, - "extent pointer not marked: %s:\n" - "type %u sectors %u < %u", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), - mark.data_type, - mark_sectors, disk_sectors); - } -} - -void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_ptrs_to_text(out, c, k); -} - -static unsigned bch2_crc_field_size_max[] = { - [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, - [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, - [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, -}; - -static void bch2_extent_crc_pack(union bch_extent_crc *dst, - struct bch_extent_crc_unpacked src, - enum bch_extent_entry_type type) -{ -#define set_common_fields(_dst, _src) \ - _dst.type = 1 << type; \ - _dst.csum_type = _src.csum_type, \ - _dst.compression_type = _src.compression_type, \ - _dst._compressed_size = _src.compressed_size - 1, \ - _dst._uncompressed_size = _src.uncompressed_size - 1, \ - _dst.offset = _src.offset - - switch (type) { - case BCH_EXTENT_ENTRY_crc32: - set_common_fields(dst->crc32, src); - dst->crc32.csum = *((__le32 *) &src.csum.lo); - break; - case BCH_EXTENT_ENTRY_crc64: - set_common_fields(dst->crc64, src); - dst->crc64.nonce = src.nonce; - dst->crc64.csum_lo = src.csum.lo; - dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); - break; - case BCH_EXTENT_ENTRY_crc128: - set_common_fields(dst->crc128, src); - dst->crc128.nonce = src.nonce; - dst->crc128.csum = src.csum; - break; - default: - BUG(); - } -#undef set_common_fields -} - -void bch2_extent_crc_append(struct 
bkey_i *k, - struct bch_extent_crc_unpacked new) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - union bch_extent_crc *crc = (void *) ptrs.end; - enum bch_extent_entry_type type; - - if (bch_crc_bytes[new.csum_type] <= 4 && - new.uncompressed_size - 1 <= CRC32_SIZE_MAX && - new.nonce <= CRC32_NONCE_MAX) - type = BCH_EXTENT_ENTRY_crc32; - else if (bch_crc_bytes[new.csum_type] <= 10 && - new.uncompressed_size - 1 <= CRC64_SIZE_MAX && - new.nonce <= CRC64_NONCE_MAX) - type = BCH_EXTENT_ENTRY_crc64; - else if (bch_crc_bytes[new.csum_type] <= 16 && - new.uncompressed_size - 1 <= CRC128_SIZE_MAX && - new.nonce <= CRC128_NONCE_MAX) - type = BCH_EXTENT_ENTRY_crc128; - else - BUG(); - - bch2_extent_crc_pack(crc, new, type); - - k->k.u64s += extent_entry_u64s(ptrs.end); - - EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); -} - -static inline void __extent_entry_insert(struct bkey_i *k, - union bch_extent_entry *dst, - union bch_extent_entry *new) -{ - union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); - - memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), - dst, (u64 *) end - (u64 *) dst); - k->k.u64s += extent_entry_u64s(new); - memcpy(dst, new, extent_entry_bytes(new)); -} - -void bch2_extent_ptr_decoded_append(struct bkey_i *k, - struct extent_ptr_decoded *p) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - struct bch_extent_crc_unpacked crc = - bch2_extent_crc_unpack(&k->k, NULL); - union bch_extent_entry *pos; - - if (!bch2_crc_unpacked_cmp(crc, p->crc)) { - pos = ptrs.start; - goto found; - } - - bkey_for_each_crc(&k->k, ptrs, crc, pos) - if (!bch2_crc_unpacked_cmp(crc, p->crc)) { - pos = extent_entry_next(pos); - goto found; - } - - bch2_extent_crc_append(k, p->crc); - pos = bkey_val_end(bkey_i_to_s(k)); -found: - p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - __extent_entry_insert(k, pos, to_entry(&p->ptr)); - - if (p->has_ec) { - p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; - __extent_entry_insert(k, pos, to_entry(&p->ec)); - } -} - -/* - * bch_extent_normalize - clean up an extent, dropping stale pointers etc. - * - * Returns true if @k should be dropped entirely - * - * For existing keys, only called when btree nodes are being rewritten, not when - * they're merely being compacted/resorted in memory. 
- */ -bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) -{ - struct bch_extent_ptr *ptr; - - bch2_bkey_drop_ptrs(k, ptr, - ptr->cached && - ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); - - /* will only happen if all pointers were cached: */ - if (!bkey_val_u64s(k.k)) - k.k->type = KEY_TYPE_discard; - - return bkey_whiteout(k.k); -} - -void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, - unsigned target, - unsigned nr_desired_replicas) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *entry; - struct extent_ptr_decoded p; - int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; - - if (target && extra > 0) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int n = bch2_extent_ptr_durability(c, p); - - if (n && n <= extra && - !bch2_dev_in_target(c, p.ptr.dev, target)) { - entry->ptr.cached = true; - extra -= n; - } - } - - if (extra > 0) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int n = bch2_extent_ptr_durability(c, p); - - if (n && n <= extra) { - entry->ptr.cached = true; - extra -= n; - } - } -} - -enum merge_result bch2_extent_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) -{ - struct bkey_s_extent l = bkey_s_to_extent(_l); - struct bkey_s_extent r = bkey_s_to_extent(_r); - union bch_extent_entry *en_l = l.v->start; - union bch_extent_entry *en_r = r.v->start; - struct bch_extent_crc_unpacked crc_l, crc_r; - - if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) - return BCH_MERGE_NOMERGE; - - crc_l = bch2_extent_crc_unpack(l.k, NULL); - - extent_for_each_entry(l, en_l) { - en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); - - if (extent_entry_type(en_l) != extent_entry_type(en_r)) - return BCH_MERGE_NOMERGE; - - switch (extent_entry_type(en_l)) { - case BCH_EXTENT_ENTRY_ptr: { - const struct bch_extent_ptr *lp = &en_l->ptr; - const struct bch_extent_ptr *rp = &en_r->ptr; - struct bch_dev *ca; - - if (lp->offset + crc_l.compressed_size != rp->offset || - lp->dev != rp->dev || - lp->gen != rp->gen) - return BCH_MERGE_NOMERGE; - - /* We don't allow extents to straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp->dev); - - if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) - return BCH_MERGE_NOMERGE; - - break; - } - case BCH_EXTENT_ENTRY_stripe_ptr: - if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || - en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) - return BCH_MERGE_NOMERGE; - break; - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - - if (crc_l.csum_type != crc_r.csum_type || - crc_l.compression_type != crc_r.compression_type || - crc_l.nonce != crc_r.nonce) - return BCH_MERGE_NOMERGE; - - if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || - crc_r.offset) - return BCH_MERGE_NOMERGE; - - if (!bch2_checksum_mergeable(crc_l.csum_type)) - return BCH_MERGE_NOMERGE; - - if (crc_l.compression_type) - return BCH_MERGE_NOMERGE; - - if (crc_l.csum_type && - crc_l.uncompressed_size + - crc_r.uncompressed_size > c->sb.encoded_extent_max) - return BCH_MERGE_NOMERGE; - - if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 > - bch2_crc_field_size_max[extent_entry_type(en_l)]) - return BCH_MERGE_NOMERGE; - - break; - default: - return BCH_MERGE_NOMERGE; - } + new_val_u64s = 0; } - extent_for_each_entry(l, en_l) { - struct bch_extent_crc_unpacked crc_l, crc_r; - - en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); - - if 
(!extent_entry_is_crc(en_l)) - continue; - - crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - - crc_l.csum = bch2_checksum_merge(crc_l.csum_type, - crc_l.csum, - crc_r.csum, - crc_r.uncompressed_size << 9); - - crc_l.uncompressed_size += crc_r.uncompressed_size; - crc_l.compressed_size += crc_r.compressed_size; - - bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, - extent_entry_type(en_l)); - } - - bch2_key_resize(l.k, l.k->size + r.k->size); - - return BCH_MERGE_MERGE; -} - -bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, - unsigned nr_replicas) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct bpos end = pos; - struct bkey_s_c k; - bool ret = true; - int err; - - end.offset += size; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, - BTREE_ITER_SLOTS, k, err) { - if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) - break; - - if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) { - ret = false; - break; - } - } - bch2_trans_exit(&trans); - - return ret; -} - -unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) -{ - unsigned ret = 0; - switch (k.k->type) { - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - extent_for_each_ptr_decode(e, p, entry) - ret += !p.ptr.cached && - p.crc.compression_type == BCH_COMPRESSION_NONE; + case KEY_TYPE_inline_data: + new_val_u64s = min(new_val_u64s, k.k->size << 6); break; } - case KEY_TYPE_reservation: - ret = bkey_s_c_to_reservation(k).v->nr_replicas; - break; - } - - return ret; -} - -/* KEY_TYPE_reservation: */ - -const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - - if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) - return "incorrect value size"; - - if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) - return "invalid nr_replicas"; - - return NULL; -} - -void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - - pr_buf(out, "generation %u replicas %u", - le32_to_cpu(r.v->generation), - r.v->nr_replicas); -} -enum merge_result bch2_reservation_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) -{ - struct bkey_s_reservation l = bkey_s_to_reservation(_l); - struct bkey_s_reservation r = bkey_s_to_reservation(_r); + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); - if (l.v->generation != r.v->generation || - l.v->nr_replicas != r.v->nr_replicas) - return BCH_MERGE_NOMERGE; - - if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { - bch2_key_resize(l.k, KEY_SIZE_MAX); - __bch2_cut_front(l.k->p, r.s); - return BCH_MERGE_PARTIAL; - } - - bch2_key_resize(l.k, l.k->size + r.k->size); - - return BCH_MERGE_MERGE; + set_bkey_val_u64s(k.k, new_val_u64s); + memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); + return -val_u64s_delta; } |
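
The rewritten cut helpers in this diff replace __bch2_cut_front()/bch2_cut_back() with bch2_cut_front_s()/bch2_cut_back_s(), which also trim KEY_TYPE_inline_data values and return the (non-positive) change in value size in u64s so callers can shrink the key in place. The following is a minimal, self-contained sketch of that front-cut on an inline-data key; the toy struct and helper names are invented for illustration only and are not the bcachefs API.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TOY_MAX_VAL_U64S 32

struct toy_inline_key {
	uint64_t start;			/* first sector covered by the key */
	uint64_t size;			/* extent size, in 512-byte sectors */
	unsigned val_u64s;		/* current value size, in u64 words */
	uint64_t data[TOY_MAX_VAL_U64S];/* inline payload */
};

static uint64_t min_u64(uint64_t a, uint64_t b)
{
	return a < b ? a : b;
}

/* Drop everything in front of sector @where; returns -(u64s freed). */
static int toy_cut_front(uint64_t where, struct toy_inline_key *k)
{
	unsigned new_val_u64s = k->val_u64s;
	uint64_t sub, sub_bytes;
	int delta;

	if (where <= k->start)
		return 0;

	sub = where - k->start;
	k->start = where;
	k->size -= sub;

	/* sectors -> bytes, clamped to the value size, as in the real code */
	sub_bytes = min_u64(sub << 9, (uint64_t) k->val_u64s * 8);

	memmove(k->data, (char *) k->data + sub_bytes,
		(uint64_t) k->val_u64s * 8 - sub_bytes);
	new_val_u64s -= sub_bytes >> 3;

	delta = (int) (k->val_u64s - new_val_u64s);
	k->val_u64s = new_val_u64s;
	memset(&k->data[new_val_u64s], 0, (size_t) delta * sizeof(uint64_t));

	return -delta;
}

int main(void)
{
	struct toy_inline_key k = { .start = 0, .size = 2, .val_u64s = 16 };
	unsigned i;
	int ret;

	for (i = 0; i < k.val_u64s; i++)
		k.data[i] = i;

	ret = toy_cut_front(1, &k);	/* drop the first sector */

	printf("size %llu val_u64s %u delta %d\n",
	       (unsigned long long) k.size, k.val_u64s, ret);
	return 0;
}

Compiled standalone, this prints "size 1 val_u64s 0 delta -16": cutting one sector consumes the whole (sub-sector) inline payload because the byte count is clamped to the value size, mirroring the min_t() clamp in the KEY_TYPE_inline_data case of bch2_cut_front_s() above.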