/*
 * Copyright (C) 2010 Kent Overstreet
 *
 * Code for managing the extent btree and dynamically updating the writeback
 * dirty sector count.
 */

#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "checksum.h"
#include "debug.h"
#include "dirent.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
#include "journal.h"
#include "super-io.h"
#include "xattr.h"

#include <trace/events/bcachefs.h>

static enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
					   struct bkey_i *, struct bkey_i *);

static void sort_key_next(struct btree_node_iter *iter,
			  struct btree *b,
			  struct btree_node_iter_set *i)
{
	i->k += __btree_node_offset_to_key(b, i->k)->u64s;

	if (i->k == i->end)
		*i = iter->data[--iter->used];
}

/*
 * Returns true if l > r - unless l == r, in which case returns true if l is
 * older than r.
 *
 * Necessary for btree_sort_fixup() - if there are multiple keys that compare
 * equal in different sets, we have to process them newest to oldest.
 */
#define key_sort_cmp(l, r)						\
({									\
	int _c = bkey_cmp_packed(b,					\
				 __btree_node_offset_to_key(b, (l).k),	\
				 __btree_node_offset_to_key(b, (r).k));	\
									\
	_c ? _c > 0 : (l).k > (r).k;					\
})

static inline bool should_drop_next_key(struct btree_node_iter *iter,
					struct btree *b)
{
	struct btree_node_iter_set *l = iter->data, *r = iter->data + 1;
	struct bkey_packed *k = __btree_node_offset_to_key(b, l->k);

	if (bkey_whiteout(k))
		return true;

	if (iter->used < 2)
		return false;

	if (iter->used > 2 &&
	    key_sort_cmp(r[0], r[1]))
		r++;

	/*
	 * key_sort_cmp() ensures that when keys compare equal the older key
	 * comes first; so if l->k compares equal to r->k then l->k is older
	 * and should be dropped.
	 */
	return !bkey_cmp_packed(b,
				__btree_node_offset_to_key(b, l->k),
				__btree_node_offset_to_key(b, r->k));
}

struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst,
						   struct btree *b,
						   struct btree_node_iter *iter)
{
	struct bkey_packed *out = dst->start;
	struct btree_nr_keys nr;

	memset(&nr, 0, sizeof(nr));

	heap_resort(iter, key_sort_cmp);

	while (!bch2_btree_node_iter_end(iter)) {
		if (!should_drop_next_key(iter, b)) {
			struct bkey_packed *k =
				__btree_node_offset_to_key(b, iter->data->k);

			bkey_copy(out, k);
			btree_keys_account_key_add(&nr, 0, out);
			out = bkey_next(out);
		}

		sort_key_next(iter, b, iter->data);
		heap_sift(iter, 0, key_sort_cmp);
	}

	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
	return nr;
}

/* Common among btree and extent ptrs */

const struct bch_extent_ptr *
bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
{
	const struct bch_extent_ptr *ptr;

	extent_for_each_ptr(e, ptr)
		if (ptr->dev == dev)
			return ptr;

	return NULL;
}

unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e)
{
	const struct bch_extent_ptr *ptr;
	unsigned nr_ptrs = 0;

	extent_for_each_ptr(e, ptr)
		nr_ptrs++;

	return nr_ptrs;
}

unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
{
	struct bkey_s_c_extent e;
	const struct bch_extent_ptr *ptr;
	unsigned nr_ptrs = 0;

	switch (k.k->type) {
	case BCH_EXTENT:
	case BCH_EXTENT_CACHED:
		e = bkey_s_c_to_extent(k);

		extent_for_each_ptr(e, ptr)
			nr_ptrs += !ptr->cached;
		break;
	case BCH_RESERVATION:
		nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas;
		break;
	}

	return nr_ptrs;
}

/* returns true if equal */
static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r)
{
	return extent_crc_type(l) == extent_crc_type(r) &&
		!memcmp(l, r, extent_entry_bytes(to_entry(l)));
}

/* Increment pointers after @crc by crc's offset until the next crc entry: */
void bch2_extent_crc_narrow_pointers(struct bkey_s_extent e, union
bch_extent_crc *crc) { union bch_extent_entry *entry; extent_for_each_entry_from(e, entry, extent_entry_next(to_entry(crc))) { if (!extent_entry_is_ptr(entry)) return; entry->ptr.offset += crc_offset(crc); } } /* * We're writing another replica for this extent, so while we've got the data in * memory we'll be computing a new checksum for the currently live data. * * If there are other replicas we aren't moving, and they are checksummed but * not compressed, we can modify them to point to only the data that is * currently live (so that readers won't have to bounce) while we've got the * checksum we need: * * XXX: to guard against data being corrupted while in memory, instead of * recomputing the checksum here, it would be better in the read path to instead * of computing the checksum of the entire extent: * * | extent | * * compute the checksums of the live and dead data separately * | dead data || live data || dead data | * * and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then * use crc_live here (that we verified was correct earlier) * * note: doesn't work with encryption */ void bch2_extent_narrow_crcs(struct bkey_s_extent e) { union bch_extent_crc *crc; bool have_wide = false, have_narrow = false; struct bch_csum csum = { 0 }; unsigned csum_type = 0; extent_for_each_crc(e, crc) { if (crc_compression_type(crc) || bch2_csum_type_is_encryption(crc_csum_type(crc))) continue; if (crc_uncompressed_size(e.k, crc) != e.k->size) { have_wide = true; } else { have_narrow = true; csum = crc_csum(crc); csum_type = crc_csum_type(crc); } } if (!have_wide || !have_narrow) return; extent_for_each_crc(e, crc) { if (crc_compression_type(crc)) continue; if (crc_uncompressed_size(e.k, crc) != e.k->size) { switch (extent_crc_type(crc)) { case BCH_EXTENT_CRC_NONE: BUG(); case BCH_EXTENT_CRC32: if (bch_crc_bytes[csum_type] > 4) continue; bch2_extent_crc_narrow_pointers(e, crc); crc->crc32._compressed_size = e.k->size - 1; crc->crc32._uncompressed_size = e.k->size - 1; crc->crc32.offset = 0; crc->crc32.csum_type = csum_type; crc->crc32.csum = csum.lo; break; case BCH_EXTENT_CRC64: if (bch_crc_bytes[csum_type] > 10) continue; bch2_extent_crc_narrow_pointers(e, crc); crc->crc64._compressed_size = e.k->size - 1; crc->crc64._uncompressed_size = e.k->size - 1; crc->crc64.offset = 0; crc->crc64.csum_type = csum_type; crc->crc64.csum_lo = csum.lo; crc->crc64.csum_hi = csum.hi; break; case BCH_EXTENT_CRC128: if (bch_crc_bytes[csum_type] > 16) continue; bch2_extent_crc_narrow_pointers(e, crc); crc->crc128._compressed_size = e.k->size - 1; crc->crc128._uncompressed_size = e.k->size - 1; crc->crc128.offset = 0; crc->crc128.csum_type = csum_type; crc->crc128.csum = csum; break; } } } } void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e) { union bch_extent_entry *entry = e.v->start; union bch_extent_crc *crc, *prev = NULL; while (entry != extent_entry_last(e)) { union bch_extent_entry *next = extent_entry_next(entry); size_t crc_u64s = extent_entry_u64s(entry); if (!extent_entry_is_crc(entry)) goto next; crc = entry_to_crc(entry); if (next == extent_entry_last(e)) { /* crc entry with no pointers after it: */ goto drop; } if (extent_entry_is_crc(next)) { /* no pointers before next crc entry: */ goto drop; } if (prev && crc_cmp(crc, prev)) { /* identical to previous crc entry: */ goto drop; } if (!prev && !crc_csum_type(crc) && !crc_compression_type(crc)) { /* null crc entry: */ bch2_extent_crc_narrow_pointers(e, crc); goto drop; } prev = crc; next: entry = next; continue; drop: 
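		/*
		 * Drop this crc entry: copy everything after it down over it
		 * and shrink the key's value by the entry's size.
		 */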
		memmove_u64s_down(crc, next,
				  (u64 *) extent_entry_last(e) - (u64 *) next);
		e.k->u64s -= crc_u64s;
	}

	EBUG_ON(bkey_val_u64s(e.k) && !bch2_extent_nr_ptrs(e.c));
}

static bool should_drop_ptr(const struct bch_fs *c,
			    struct bkey_s_c_extent e,
			    const struct bch_extent_ptr *ptr)
{
	return ptr->cached && ptr_stale(c->devs[ptr->dev], ptr);
}

static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
{
	struct bch_extent_ptr *ptr = &e.v->start->ptr;
	bool dropped = false;

	while ((ptr = extent_ptr_next(e, ptr)))
		if (should_drop_ptr(c, e.c, ptr)) {
			__bch2_extent_drop_ptr(e, ptr);
			dropped = true;
		} else
			ptr++;

	if (dropped)
		bch2_extent_drop_redundant_crcs(e);
}

static bool bch2_ptr_normalize(struct bch_fs *c, struct btree *bk,
			       struct bkey_s k)
{
	return bch2_extent_normalize(c, k);
}

static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
{
	switch (k->type) {
	case BCH_EXTENT:
	case BCH_EXTENT_CACHED: {
		union bch_extent_entry *entry;
		u64 *d = (u64 *) bkeyp_val(f, k);
		unsigned i;

		for (i = 0; i < bkeyp_val_u64s(f, k); i++)
			d[i] = swab64(d[i]);

		for (entry = (union bch_extent_entry *) d;
		     entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
		     entry = extent_entry_next(entry)) {
			switch (extent_entry_type(entry)) {
			case BCH_EXTENT_ENTRY_crc32:
				entry->crc32.csum = swab32(entry->crc32.csum);
				break;
			case BCH_EXTENT_ENTRY_crc64:
				entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
				entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
				break;
			case BCH_EXTENT_ENTRY_crc128:
				entry->crc128.csum.hi = swab64(entry->crc128.csum.hi);
				entry->crc128.csum.lo = swab64(entry->crc128.csum.lo);
				break;
			case BCH_EXTENT_ENTRY_ptr:
				break;
			}
		}
		break;
	}
	}
}

static const char *extent_ptr_invalid(const struct bch_fs *c,
				      struct bkey_s_c_extent e,
				      const struct bch_extent_ptr *ptr,
				      unsigned size_ondisk,
				      bool metadata)
{
	const struct bch_extent_ptr *ptr2;
	struct bch_dev *ca;

	if (ptr->dev >= c->sb.nr_devices)
		return "pointer to invalid device";

	ca = c->devs[ptr->dev];
	if (!ca)
		return "pointer to invalid device";

	extent_for_each_ptr(e, ptr2)
		if (ptr != ptr2 &&
		    ptr->dev == ptr2->dev)
			return "multiple pointers to same device";

	if (ptr->offset + size_ondisk > ca->mi.bucket_size * ca->mi.nbuckets)
		return "offset past end of device";

	if (ptr->offset < ca->mi.bucket_size * ca->mi.first_bucket)
		return "offset before first bucket";

	if ((ptr->offset & (ca->mi.bucket_size - 1)) +
	    size_ondisk > ca->mi.bucket_size)
		return "spans multiple buckets";

	if (!(metadata ? ca->mi.has_metadata : ca->mi.has_data))
		return "device not marked as containing data";

	return NULL;
}

static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
				size_t size, struct bkey_s_c_extent e)
{
	char *out = buf, *end = buf + size;
	const union bch_extent_entry *entry;
	const union bch_extent_crc *crc;
	const struct bch_extent_ptr *ptr;
	struct bch_dev *ca;
	bool first = true;

#define p(...)	(out += scnprintf(out, end - out, __VA_ARGS__))

	extent_for_each_entry(e, entry) {
		if (!first)
			p(" ");

		switch (__extent_entry_type(entry)) {
		case BCH_EXTENT_ENTRY_crc32:
		case BCH_EXTENT_ENTRY_crc64:
		case BCH_EXTENT_ENTRY_crc128:
			crc = entry_to_crc(entry);

			p("crc: c_size %u size %u offset %u csum %u compress %u",
			  crc_compressed_size(e.k, crc),
			  crc_uncompressed_size(e.k, crc),
			  crc_offset(crc), crc_csum_type(crc),
			  crc_compression_type(crc));
			break;
		case BCH_EXTENT_ENTRY_ptr:
			ptr = entry_to_ptr(entry);
			ca = c->devs[ptr->dev];

			p("ptr: %u:%llu gen %u%s", ptr->dev,
			  (u64) ptr->offset, ptr->gen,
			  ca && ptr_stale(ca, ptr) ?
" stale" : ""); break; default: p("(invalid extent entry %.16llx)", *((u64 *) entry)); goto out; } first = false; } out: if (bkey_extent_is_cached(e.k)) p(" cached"); #undef p return out - buf; } /* Btree ptrs */ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) { if (bkey_extent_is_cached(k.k)) return "cached"; if (k.k->size) return "nonzero key size"; if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) return "value too big"; switch (k.k->type) { case BCH_EXTENT: { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; const struct bch_extent_ptr *ptr; const union bch_extent_crc *crc; const char *reason; extent_for_each_entry(e, entry) if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) return "invalid extent entry type"; extent_for_each_ptr_crc(e, ptr, crc) { reason = extent_ptr_invalid(c, e, ptr, c->sb.btree_node_size, true); if (reason) return reason; } if (crc) return "has crc field"; return NULL; } default: return "invalid value type"; } } static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const struct bch_extent_ptr *ptr; unsigned seq; const char *err; char buf[160]; struct bucket *g; struct bch_dev *ca; unsigned replicas = 0; bool bad; extent_for_each_ptr(e, ptr) { ca = c->devs[ptr->dev]; g = PTR_BUCKET(ca, ptr); replicas++; err = "stale"; if (ptr_stale(ca, ptr)) goto err; do { seq = read_seqcount_begin(&c->gc_pos_lock); bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && g->mark.data_type != BUCKET_BTREE; } while (read_seqcount_retry(&c->gc_pos_lock, seq)); err = "inconsistent"; if (bad) goto err; } if (replicas < c->sb.meta_replicas_have) { bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); bch2_fs_bug(c, "btree key bad (too few replicas, %u < %u): %s", replicas, c->sb.meta_replicas_have, buf); return; } return; err: bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); bch2_fs_bug(c, "%s btree pointer %s: bucket %zi prio %i " "gen %i last_gc %i mark %08x", err, buf, PTR_BUCKET_NR(ca, ptr), g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen, ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)], (unsigned) g->mark.counter); } static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, size_t size, struct bkey_s_c k) { char *out = buf, *end = buf + size; const char *invalid; #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) if (bkey_extent_is_data(k.k)) out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); invalid = bch2_btree_ptr_invalid(c, k); if (invalid) p(" invalid: %s", invalid); #undef p } struct extent_pick_ptr bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b) { struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); const union bch_extent_crc *crc; const struct bch_extent_ptr *ptr; struct extent_pick_ptr pick = { .ca = NULL }; extent_for_each_ptr_crc(e, ptr, crc) { struct bch_dev *ca = c->devs[ptr->dev]; struct btree *root = btree_node_root(c, b); if (bch2_fs_inconsistent_on(crc, c, "btree node pointer with crc at btree %u level %u/%u bucket %zu", b->btree_id, b->level, root ? 
root->level : -1, PTR_BUCKET_NR(ca, ptr))) break; if (ca->mi.state == BCH_MEMBER_STATE_FAILED) continue; if (pick.ca && pick.ca->mi.tier < ca->mi.tier) continue; if (!percpu_ref_tryget(&ca->io_ref)) continue; if (pick.ca) percpu_ref_put(&pick.ca->io_ref); pick.ca = ca; pick.ptr = *ptr; } return pick; } const struct bkey_ops bch2_bkey_btree_ops = { .key_invalid = bch2_btree_ptr_invalid, .key_debugcheck = btree_ptr_debugcheck, .val_to_text = bch2_btree_ptr_to_text, .swab = bch2_ptr_swab, }; /* Extents */ static bool __bch2_cut_front(struct bpos where, struct bkey_s k) { u64 len = 0; if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) return false; EBUG_ON(bkey_cmp(where, k.k->p) > 0); len = k.k->p.offset - where.offset; BUG_ON(len > k.k->size); /* * Don't readjust offset if the key size is now 0, because that could * cause offset to point to the next bucket: */ if (!len) __set_bkey_deleted(k.k); else if (bkey_extent_is_data(k.k)) { struct bkey_s_extent e = bkey_s_to_extent(k); struct bch_extent_ptr *ptr; union bch_extent_crc *crc, *prev_crc = NULL; extent_for_each_ptr_crc(e, ptr, crc) { switch (extent_crc_type(crc)) { case BCH_EXTENT_CRC_NONE: ptr->offset += e.k->size - len; break; case BCH_EXTENT_CRC32: if (prev_crc != crc) crc->crc32.offset += e.k->size - len; break; case BCH_EXTENT_CRC64: if (prev_crc != crc) crc->crc64.offset += e.k->size - len; break; case BCH_EXTENT_CRC128: if (prev_crc != crc) crc->crc128.offset += e.k->size - len; break; } prev_crc = crc; } } k.k->size = len; return true; } bool bch2_cut_front(struct bpos where, struct bkey_i *k) { return __bch2_cut_front(where, bkey_i_to_s(k)); } bool bch2_cut_back(struct bpos where, struct bkey *k) { u64 len = 0; if (bkey_cmp(where, k->p) >= 0) return false; EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0); len = where.offset - bkey_start_offset(k); BUG_ON(len > k->size); k->p = where; k->size = len; if (!len) __set_bkey_deleted(k); return true; } /** * bch_key_resize - adjust size of @k * * bkey_start_offset(k) will be preserved, modifies where the extent ends */ void bch2_key_resize(struct bkey *k, unsigned new_size) { k->p.offset -= k->size; k->p.offset += new_size; k->size = new_size; } /* * In extent_sort_fix_overlapping(), insert_fixup_extent(), * extent_merge_inline() - we're modifying keys in place that are packed. To do * that we have to unpack the key, modify the unpacked key - then this * copies/repacks the unpacked to the original as necessary. */ static bool __extent_save(struct btree *b, struct btree_node_iter *iter, struct bkey_packed *dst, struct bkey *src) { struct bkey_format *f = &b->format; struct bkey_i *dst_unpacked; bool ret; if ((dst_unpacked = packed_to_bkey(dst))) { dst_unpacked->k = *src; ret = true; } else { ret = bch2_bkey_pack_key(dst, src, f); } if (ret && iter) bch2_verify_key_order(b, iter, dst); return ret; } static void extent_save(struct btree *b, struct btree_node_iter *iter, struct bkey_packed *dst, struct bkey *src) { BUG_ON(!__extent_save(b, iter, dst, src)); } /* * Returns true if l > r - unless l == r, in which case returns true if l is * older than r. * * Necessary for sort_fix_overlapping() - if there are multiple keys that * compare equal in different sets, we have to process them newest to oldest. */ #define extent_sort_cmp(l, r) \ ({ \ struct bkey _ul = bkey_unpack_key(b, \ __btree_node_offset_to_key(b, (l).k)); \ struct bkey _ur = bkey_unpack_key(b, \ __btree_node_offset_to_key(b, (r).k)); \ \ int _c = bkey_cmp(bkey_start_pos(&_ul), bkey_start_pos(&_ur)); \ _c ? 
_c > 0 : (l).k < (r).k; \ }) static inline void extent_sort_sift(struct btree_node_iter *iter, struct btree *b, size_t i) { heap_sift(iter, i, extent_sort_cmp); } static inline void extent_sort_next(struct btree_node_iter *iter, struct btree *b, struct btree_node_iter_set *i) { sort_key_next(iter, b, i); heap_sift(iter, i - iter->data, extent_sort_cmp); } static void extent_sort_append(struct bch_fs *c, struct btree *b, struct btree_nr_keys *nr, struct bkey_packed *start, struct bkey_packed **prev, struct bkey_packed *k) { struct bkey_format *f = &b->format; BKEY_PADDED(k) tmp; if (bkey_whiteout(k)) return; bch2_bkey_unpack(b, &tmp.k, k); if (*prev && bch2_extent_merge(c, b, (void *) *prev, &tmp.k)) return; if (*prev) { bch2_bkey_pack(*prev, (void *) *prev, f); btree_keys_account_key_add(nr, 0, *prev); *prev = bkey_next(*prev); } else { *prev = start; } bkey_copy(*prev, &tmp.k); } struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, struct btree *b, struct btree_node_iter *iter) { struct bkey_format *f = &b->format; struct btree_node_iter_set *_l = iter->data, *_r; struct bkey_packed *prev = NULL, *out, *lk, *rk; struct bkey l_unpacked, r_unpacked; struct bkey_s l, r; struct btree_nr_keys nr; memset(&nr, 0, sizeof(nr)); heap_resort(iter, extent_sort_cmp); while (!bch2_btree_node_iter_end(iter)) { lk = __btree_node_offset_to_key(b, _l->k); if (iter->used == 1) { extent_sort_append(c, b, &nr, dst->start, &prev, lk); extent_sort_next(iter, b, _l); continue; } _r = iter->data + 1; if (iter->used > 2 && extent_sort_cmp(_r[0], _r[1])) _r++; rk = __btree_node_offset_to_key(b, _r->k); l = __bkey_disassemble(b, lk, &l_unpacked); r = __bkey_disassemble(b, rk, &r_unpacked); /* If current key and next key don't overlap, just append */ if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { extent_sort_append(c, b, &nr, dst->start, &prev, lk); extent_sort_next(iter, b, _l); continue; } /* Skip 0 size keys */ if (!r.k->size) { extent_sort_next(iter, b, _r); continue; } /* * overlap: keep the newer key and trim the older key so they * don't overlap. comparing pointers tells us which one is * newer, since the bsets are appended one after the other. 
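		 * (A key at a higher offset within the node lives in a later
		 * bset and is therefore newer - that's what the _l->k > _r->k
		 * comparison below relies on.)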
*/ /* can't happen because of comparison func */ BUG_ON(_l->k < _r->k && !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); if (_l->k > _r->k) { /* l wins, trim r */ if (bkey_cmp(l.k->p, r.k->p) >= 0) { sort_key_next(iter, b, _r); } else { __bch2_cut_front(l.k->p, r); extent_save(b, NULL, rk, r.k); } extent_sort_sift(iter, b, _r - iter->data); } else if (bkey_cmp(l.k->p, r.k->p) > 0) { BKEY_PADDED(k) tmp; /* * r wins, but it overlaps in the middle of l - split l: */ bkey_reassemble(&tmp.k, l.s_c); bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k); __bch2_cut_front(r.k->p, l); extent_save(b, NULL, lk, l.k); extent_sort_sift(iter, b, 0); extent_sort_append(c, b, &nr, dst->start, &prev, bkey_to_packed(&tmp.k)); } else { bch2_cut_back(bkey_start_pos(r.k), l.k); extent_save(b, NULL, lk, l.k); } } if (prev) { bch2_bkey_pack(prev, (void *) prev, f); btree_keys_account_key_add(&nr, 0, prev); out = bkey_next(prev); } else { out = dst->start; } dst->u64s = cpu_to_le16((u64 *) out - dst->_data); return nr; } struct extent_insert_state { struct btree_insert *trans; struct btree_insert_entry *insert; struct bpos committed; struct bch_fs_usage stats; /* for deleting: */ struct bkey_i whiteout; bool do_journal; bool deleting; }; static void bch2_add_sectors(struct extent_insert_state *s, struct bkey_s_c k, u64 offset, s64 sectors) { struct bch_fs *c = s->trans->c; struct btree *b = s->insert->iter->nodes[0]; EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0); if (!sectors) return; bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b), &s->stats, s->trans->journal_res.seq); } static void bch2_subtract_sectors(struct extent_insert_state *s, struct bkey_s_c k, u64 offset, s64 sectors) { bch2_add_sectors(s, k, offset, -sectors); } /* These wrappers subtract exactly the sectors that we're removing from @k */ static void bch2_cut_subtract_back(struct extent_insert_state *s, struct bpos where, struct bkey_s k) { bch2_subtract_sectors(s, k.s_c, where.offset, k.k->p.offset - where.offset); bch2_cut_back(where, k.k); } static void bch2_cut_subtract_front(struct extent_insert_state *s, struct bpos where, struct bkey_s k) { bch2_subtract_sectors(s, k.s_c, bkey_start_offset(k.k), where.offset - bkey_start_offset(k.k)); __bch2_cut_front(where, k); } static void bch2_drop_subtract(struct extent_insert_state *s, struct bkey_s k) { if (k.k->size) bch2_subtract_sectors(s, k.s_c, bkey_start_offset(k.k), k.k->size); k.k->size = 0; __set_bkey_deleted(k.k); } static bool bch2_extent_merge_inline(struct bch_fs *, struct btree_iter *, struct bkey_packed *, struct bkey_packed *, bool); #define MAX_LOCK_HOLD_TIME (5 * NSEC_PER_MSEC) static enum btree_insert_ret extent_insert_should_stop(struct extent_insert_state *s) { struct btree *b = s->insert->iter->nodes[0]; /* * Check if we have sufficient space in both the btree node and the * journal reservation: * * Each insert checks for room in the journal entry, but we check for * room in the btree node up-front. In the worst case, bkey_cmpxchg() * will insert two keys, and one iteration of this room will insert one * key, so we need room for three keys. 
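	 * (This check is re-evaluated at the top of every iteration of the
	 * insert loops in bch2_delete_fixup_extent() and
	 * bch2_insert_fixup_extent(), before the current key is touched.)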
*/ if (!bch2_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s)) return BTREE_INSERT_BTREE_NODE_FULL; else if (!journal_res_insert_fits(s->trans, s->insert)) return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */ else return BTREE_INSERT_OK; } static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, struct bkey_i *insert) { struct btree *b = iter->nodes[0]; struct btree_node_iter *node_iter = &iter->node_iters[0]; struct bset_tree *t = bset_tree_last(b); struct bkey_packed *where = bch2_btree_node_iter_bset_pos(node_iter, b, t); struct bkey_packed *prev = bch2_bkey_prev(b, t, where); struct bkey_packed *next_live_key = where; unsigned clobber_u64s; if (prev) where = bkey_next(prev); while (next_live_key != btree_bkey_last(b, t) && bkey_deleted(next_live_key)) next_live_key = bkey_next(next_live_key); /* * Everything between where and next_live_key is now deleted keys, and * is overwritten: */ clobber_u64s = (u64 *) next_live_key - (u64 *) where; if (prev && bch2_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true)) goto drop_deleted_keys; if (next_live_key != btree_bkey_last(b, t) && bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), next_live_key, false)) goto drop_deleted_keys; bch2_bset_insert(b, node_iter, where, insert, clobber_u64s); bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, where->u64s); return; drop_deleted_keys: bch2_bset_delete(b, where, clobber_u64s); bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, 0); } static void extent_insert_committed(struct extent_insert_state *s) { struct bch_fs *c = s->trans->c; struct btree_iter *iter = s->insert->iter; struct bkey_i *insert = !s->deleting ? s->insert->k : &s->whiteout; BKEY_PADDED(k) split; EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0); EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0); if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k))) return; if (s->deleting && !s->do_journal) { bch2_cut_front(s->committed, insert); goto done; } EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); bkey_copy(&split.k, insert); if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) && bkey_cmp(s->committed, insert->k.p) && bkey_extent_is_compressed(bkey_i_to_s_c(insert))) { /* XXX: possibly need to increase our reservation? */ bch2_cut_subtract_back(s, s->committed, bkey_i_to_s(&split.k)); bch2_cut_front(s->committed, insert); bch2_add_sectors(s, bkey_i_to_s_c(insert), bkey_start_offset(&insert->k), insert->k.size); } else { bch2_cut_back(s->committed, &split.k.k); bch2_cut_front(s->committed, insert); } if (debug_check_bkeys(c)) bch2_bkey_debugcheck(c, iter->nodes[iter->level], bkey_i_to_s_c(&split.k)); bch2_btree_journal_key(s->trans, iter, &split.k); if (!s->deleting) extent_bset_insert(c, iter, &split.k); done: bch2_btree_iter_set_pos_same_leaf(iter, s->committed); insert->k.needs_whiteout = false; s->do_journal = false; s->trans->did_work = true; } static enum extent_insert_hook_ret __extent_insert_advance_pos(struct extent_insert_state *s, struct bpos next_pos, struct bkey_s_c k) { struct extent_insert_hook *hook = s->trans->hook; enum extent_insert_hook_ret ret; #if 0 /* * Currently disabled for encryption - broken with fcollapse. 
Will have * to reenable when versions are exposed for send/receive - versions * will have to be monotonic then: */ if (k.k && k.k->size && !bversion_zero(s->insert->k->k.version) && bversion_cmp(k.k->version, s->insert->k->k.version) > 0) { ret = BTREE_HOOK_NO_INSERT; } else #endif if (hook) ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k); else ret = BTREE_HOOK_DO_INSERT; EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size); switch (ret) { case BTREE_HOOK_DO_INSERT: break; case BTREE_HOOK_NO_INSERT: extent_insert_committed(s); bch2_cut_subtract_front(s, next_pos, bkey_i_to_s(s->insert->k)); bch2_btree_iter_set_pos_same_leaf(s->insert->iter, next_pos); break; case BTREE_HOOK_RESTART_TRANS: return ret; } s->committed = next_pos; return ret; } /* * Update iter->pos, marking how much of @insert we've processed, and call hook * fn: */ static enum extent_insert_hook_ret extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k) { struct btree *b = s->insert->iter->nodes[0]; struct bpos next_pos = bpos_min(s->insert->k->k.p, k.k ? k.k->p : b->key.k.p); /* hole? */ if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) { bool have_uncommitted = bkey_cmp(s->committed, bkey_start_pos(&s->insert->k->k)) > 0; switch (__extent_insert_advance_pos(s, bkey_start_pos(k.k), bkey_s_c_null)) { case BTREE_HOOK_DO_INSERT: break; case BTREE_HOOK_NO_INSERT: /* * we had to split @insert and insert the committed * part - need to bail out and recheck journal * reservation/btree node before we advance pos past @k: */ if (have_uncommitted) return BTREE_HOOK_NO_INSERT; break; case BTREE_HOOK_RESTART_TRANS: return BTREE_HOOK_RESTART_TRANS; } } /* avoid redundant calls to hook fn: */ if (!bkey_cmp(s->committed, next_pos)) return BTREE_HOOK_DO_INSERT; return __extent_insert_advance_pos(s, next_pos, k); } static enum btree_insert_ret extent_insert_check_split_compressed(struct extent_insert_state *s, struct bkey_s_c k, enum bch_extent_overlap overlap) { struct bch_fs *c = s->trans->c; unsigned sectors; if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && (sectors = bkey_extent_is_compressed(k))) { int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD; if (s->trans->flags & BTREE_INSERT_NOFAIL) flags |= BCH_DISK_RESERVATION_NOFAIL; switch (bch2_disk_reservation_add(c, s->trans->disk_res, sectors, flags)) { case 0: break; case -ENOSPC: return BTREE_INSERT_ENOSPC; case -EINTR: return BTREE_INSERT_NEED_GC_LOCK; default: BUG(); } } return BTREE_INSERT_OK; } static enum btree_insert_ret extent_squash(struct extent_insert_state *s, struct bkey_i *insert, struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k, enum bch_extent_overlap overlap) { struct bch_fs *c = s->trans->c; struct btree_iter *iter = s->insert->iter; struct btree *b = iter->nodes[0]; struct btree_node_iter *node_iter = &iter->node_iters[0]; switch (overlap) { case BCH_EXTENT_OVERLAP_FRONT: /* insert overlaps with start of k: */ bch2_cut_subtract_front(s, insert->k.p, k); BUG_ON(bkey_deleted(k.k)); extent_save(b, node_iter, _k, k.k); break; case BCH_EXTENT_OVERLAP_BACK: /* insert overlaps with end of k: */ bch2_cut_subtract_back(s, bkey_start_pos(&insert->k), k); BUG_ON(bkey_deleted(k.k)); extent_save(b, node_iter, _k, k.k); /* * As the auxiliary tree is indexed by the end of the * key and we've just changed the end, update the * auxiliary tree. 
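		 * (The bch2_bset_fix_invalidated_key() call below repairs the
		 * auxiliary search tree entries for _k, and
		 * bch2_btree_node_iter_fix() keeps the node iterator
		 * consistent with the modified key.)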
*/ bch2_bset_fix_invalidated_key(b, t, _k); bch2_btree_node_iter_fix(iter, b, node_iter, t, _k, _k->u64s, _k->u64s); break; case BCH_EXTENT_OVERLAP_ALL: { struct bpos orig_pos = k.k->p; /* The insert key completely covers k, invalidate k */ if (!bkey_whiteout(k.k)) btree_keys_account_key_drop(&b->nr, t - b->set, _k); bch2_drop_subtract(s, k); k.k->p = bkey_start_pos(&insert->k); if (!__extent_save(b, node_iter, _k, k.k)) { /* * Couldn't repack: we aren't necessarily able * to repack if the new key is outside the range * of the old extent, so we have to split * @insert: */ k.k->p = orig_pos; extent_save(b, node_iter, _k, k.k); if (extent_insert_advance_pos(s, k.s_c) == BTREE_HOOK_RESTART_TRANS) return BTREE_INSERT_NEED_TRAVERSE; extent_insert_committed(s); /* * We split and inserted upto at k.k->p - that * has to coincide with iter->pos, so that we * don't have anything more we have to insert * until we recheck our journal reservation: */ EBUG_ON(bkey_cmp(s->committed, k.k->p)); } else { bch2_bset_fix_invalidated_key(b, t, _k); bch2_btree_node_iter_fix(iter, b, node_iter, t, _k, _k->u64s, _k->u64s); } break; } case BCH_EXTENT_OVERLAP_MIDDLE: { BKEY_PADDED(k) split; /* * The insert key falls 'in the middle' of k * The insert key splits k in 3: * - start only in k, preserve * - middle common section, invalidate in k * - end only in k, preserve * * We update the old key to preserve the start, * insert will be the new common section, * we manually insert the end that we are preserving. * * modify k _before_ doing the insert (which will move * what k points to) */ bkey_reassemble(&split.k, k.s_c); split.k.k.needs_whiteout |= bset_written(b, bset(b, t)); bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k); BUG_ON(bkey_deleted(&split.k.k)); bch2_cut_subtract_front(s, insert->k.p, k); BUG_ON(bkey_deleted(k.k)); extent_save(b, node_iter, _k, k.k); bch2_add_sectors(s, bkey_i_to_s_c(&split.k), bkey_start_offset(&split.k.k), split.k.k.size); extent_bset_insert(c, iter, &split.k); break; } } return BTREE_INSERT_OK; } static enum btree_insert_ret bch2_delete_fixup_extent(struct extent_insert_state *s) { struct bch_fs *c = s->trans->c; struct btree_iter *iter = s->insert->iter; struct btree *b = iter->nodes[0]; struct btree_node_iter *node_iter = &iter->node_iters[0]; struct bkey_packed *_k; struct bkey unpacked; struct bkey_i *insert = s->insert->k; enum btree_insert_ret ret = BTREE_INSERT_OK; EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); s->whiteout = *insert; s->do_journal = false; while (bkey_cmp(s->committed, insert->k.p) < 0 && (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK && (_k = bch2_btree_node_iter_peek_all(node_iter, b))) { struct bset_tree *t = bch2_bkey_to_bset(b, _k); struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); enum bch_extent_overlap overlap; EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) break; if (bkey_whiteout(k.k)) { s->committed = bpos_min(insert->k.p, k.k->p); goto next; } overlap = bch2_extent_overlap(&insert->k, k.k); ret = extent_insert_check_split_compressed(s, k.s_c, overlap); if (ret != BTREE_INSERT_OK) goto stop; switch (extent_insert_advance_pos(s, k.s_c)) { case BTREE_HOOK_DO_INSERT: break; case BTREE_HOOK_NO_INSERT: continue; case BTREE_HOOK_RESTART_TRANS: ret = BTREE_INSERT_NEED_TRAVERSE; goto stop; } s->do_journal = true; if (overlap == BCH_EXTENT_OVERLAP_ALL) { btree_keys_account_key_drop(&b->nr, t - b->set, _k); 
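			/*
			 * The deletion covers all of k: release its sectors,
			 * then turn it into a discard key and reserve a
			 * whiteout slot for it.
			 */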
bch2_subtract_sectors(s, k.s_c, bkey_start_offset(k.k), k.k->size); _k->type = KEY_TYPE_DISCARD; reserve_whiteout(b, t, _k); } else if (k.k->needs_whiteout || bset_written(b, bset(b, t))) { struct bkey_i discard = *insert; switch (overlap) { case BCH_EXTENT_OVERLAP_FRONT: bch2_cut_front(bkey_start_pos(k.k), &discard); break; case BCH_EXTENT_OVERLAP_BACK: bch2_cut_back(k.k->p, &discard.k); break; default: break; } discard.k.needs_whiteout = true; ret = extent_squash(s, insert, t, _k, k, overlap); BUG_ON(ret != BTREE_INSERT_OK); extent_bset_insert(c, iter, &discard); } else { ret = extent_squash(s, insert, t, _k, k, overlap); BUG_ON(ret != BTREE_INSERT_OK); } next: bch2_cut_front(s->committed, insert); bch2_btree_iter_set_pos_same_leaf(iter, s->committed); } if (bkey_cmp(s->committed, insert->k.p) < 0 && ret == BTREE_INSERT_OK && extent_insert_advance_pos(s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS) ret = BTREE_INSERT_NEED_TRAVERSE; stop: extent_insert_committed(s); bch2_fs_usage_apply(c, &s->stats, s->trans->disk_res, gc_pos_btree_node(b)); EBUG_ON(bkey_cmp(iter->pos, s->committed)); EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf); bch2_cut_front(iter->pos, insert); if (insert->k.size && iter->at_end_of_leaf) ret = BTREE_INSERT_NEED_TRAVERSE; EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK); return ret; } /** * bch_extent_insert_fixup - insert a new extent and deal with overlaps * * this may result in not actually doing the insert, or inserting some subset * of the insert key. For cmpxchg operations this is where that logic lives. * * All subsets of @insert that need to be inserted are inserted using * bch2_btree_insert_and_journal(). If @b or @res fills up, this function * returns false, setting @iter->pos for the prefix of @insert that actually got * inserted. * * BSET INVARIANTS: this function is responsible for maintaining all the * invariants for bsets of extents in memory. things get really hairy with 0 * size extents * * within one bset: * * bkey_start_pos(bkey_next(k)) >= k * or bkey_start_offset(bkey_next(k)) >= k->offset * * i.e. strict ordering, no overlapping extents. * * multiple bsets (i.e. full btree node): * * ∀ k, j * k.size != 0 ∧ j.size != 0 → * ¬ (k > bkey_start_pos(j) ∧ k < j) * * i.e. no two overlapping keys _of nonzero size_ * * We can't realistically maintain this invariant for zero size keys because of * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j * there may be another 0 size key between them in another bset, and it will * thus overlap with the merged key. * * In addition, the end of iter->pos indicates how much has been processed. * If the end of iter->pos is not the same as the end of insert, then * key insertion needs to continue/be retried. 
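 * (When this returns BTREE_INSERT_OK, all of @insert has been consumed -
 * see the EBUG_ON()s at the bottom of bch2_insert_fixup_extent().)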
*/ enum btree_insert_ret bch2_insert_fixup_extent(struct btree_insert *trans, struct btree_insert_entry *insert) { struct bch_fs *c = trans->c; struct btree_iter *iter = insert->iter; struct btree *b = iter->nodes[0]; struct btree_node_iter *node_iter = &iter->node_iters[0]; struct bkey_packed *_k; struct bkey unpacked; enum btree_insert_ret ret = BTREE_INSERT_OK; struct extent_insert_state s = { .trans = trans, .insert = insert, .committed = insert->iter->pos, .deleting = bkey_whiteout(&insert->k->k), }; EBUG_ON(iter->level); EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size); if (s.deleting) return bch2_delete_fixup_extent(&s); /* * As we process overlapping extents, we advance @iter->pos both to * signal to our caller (btree_insert_key()) how much of @insert->k has * been inserted, and also to keep @iter->pos consistent with * @insert->k and the node iterator that we're advancing: */ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) bch2_add_sectors(&s, bkey_i_to_s_c(insert->k), bkey_start_offset(&insert->k->k), insert->k->k.size); while (bkey_cmp(s.committed, insert->k->k.p) < 0 && (ret = extent_insert_should_stop(&s)) == BTREE_INSERT_OK && (_k = bch2_btree_node_iter_peek_all(node_iter, b))) { struct bset_tree *t = bch2_bkey_to_bset(b, _k); struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); enum bch_extent_overlap overlap; EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0) break; overlap = bch2_extent_overlap(&insert->k->k, k.k); ret = extent_insert_check_split_compressed(&s, k.s_c, overlap); if (ret != BTREE_INSERT_OK) goto stop; if (!k.k->size) goto squash; /* * Only call advance pos & call hook for nonzero size extents: * If hook returned BTREE_HOOK_NO_INSERT, @insert->k no longer * overlaps with @k: */ switch (extent_insert_advance_pos(&s, k.s_c)) { case BTREE_HOOK_DO_INSERT: break; case BTREE_HOOK_NO_INSERT: continue; case BTREE_HOOK_RESTART_TRANS: ret = BTREE_INSERT_NEED_TRAVERSE; goto stop; } if (k.k->size && (k.k->needs_whiteout || bset_written(b, bset(b, t)))) insert->k->k.needs_whiteout = true; if (overlap == BCH_EXTENT_OVERLAP_ALL && bkey_whiteout(k.k) && k.k->needs_whiteout) { unreserve_whiteout(b, t, _k); _k->needs_whiteout = false; } squash: ret = extent_squash(&s, insert->k, t, _k, k, overlap); if (ret != BTREE_INSERT_OK) goto stop; } if (bkey_cmp(s.committed, insert->k->k.p) < 0 && ret == BTREE_INSERT_OK && extent_insert_advance_pos(&s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS) ret = BTREE_INSERT_NEED_TRAVERSE; stop: extent_insert_committed(&s); /* * Subtract any remaining sectors from @insert, if we bailed out early * and didn't fully insert @insert: */ if (insert->k->k.size && !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k), bkey_start_offset(&insert->k->k), insert->k->k.size); bch2_fs_usage_apply(c, &s.stats, trans->disk_res, gc_pos_btree_node(b)); EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); EBUG_ON(bkey_cmp(iter->pos, s.committed)); EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf); if (insert->k->k.size && iter->at_end_of_leaf) ret = BTREE_INSERT_NEED_TRAVERSE; EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK); return ret; } static const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) { if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) return "value too big"; if 
(!k.k->size) return "zero key size"; switch (k.k->type) { case BCH_EXTENT: case BCH_EXTENT_CACHED: { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; const union bch_extent_crc *crc; const struct bch_extent_ptr *ptr; unsigned size_ondisk = e.k->size; const char *reason; extent_for_each_entry(e, entry) { if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) return "invalid extent entry type"; if (extent_entry_is_crc(entry)) { crc = entry_to_crc(entry); if (crc_offset(crc) + e.k->size > crc_uncompressed_size(e.k, crc)) return "checksum offset + key size > uncompressed size"; size_ondisk = crc_compressed_size(e.k, crc); if (!bch2_checksum_type_valid(c, crc_csum_type(crc))) return "invalid checksum type"; if (crc_compression_type(crc) >= BCH_COMPRESSION_NR) return "invalid compression type"; } else { ptr = entry_to_ptr(entry); reason = extent_ptr_invalid(c, e, &entry->ptr, size_ondisk, false); if (reason) return reason; } } return NULL; } case BCH_RESERVATION: { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) return "incorrect value size"; if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) return "invalid nr_replicas"; return NULL; } default: return "invalid value type"; } } static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, struct bkey_s_c_extent e) { const struct bch_extent_ptr *ptr; struct bch_dev *ca; struct bucket *g; unsigned seq, stale; char buf[160]; bool bad; unsigned ptrs_per_tier[BCH_TIER_MAX]; unsigned replicas = 0; /* * XXX: we should be doing most/all of these checks at startup time, * where we check bch2_bkey_invalid() in btree_node_read_done() * * But note that we can't check for stale pointers or incorrect gc marks * until after journal replay is done (it might be an extent that's * going to get overwritten during replay) */ memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier)); extent_for_each_ptr(e, ptr) { ca = c->devs[ptr->dev]; g = PTR_BUCKET(ca, ptr); replicas++; ptrs_per_tier[ca->mi.tier]++; /* * If journal replay hasn't finished, we might be seeing keys * that will be overwritten by the time journal replay is done: */ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) continue; stale = 0; do { struct bucket_mark mark; seq = read_seqcount_begin(&c->gc_pos_lock); mark = READ_ONCE(g->mark); /* between mark and bucket gen */ smp_rmb(); stale = ptr_stale(ca, ptr); bch2_fs_bug_on(stale && !ptr->cached, c, "stale dirty pointer"); bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); if (stale) break; bad = (mark.data_type != BUCKET_DATA || (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && !mark.owned_by_allocator && !(ptr->cached ? 
mark.cached_sectors : mark.dirty_sectors))); } while (read_seqcount_retry(&c->gc_pos_lock, seq)); if (bad) goto bad_ptr; } if (replicas > BCH_REPLICAS_MAX) { bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), e.s_c); bch2_fs_bug(c, "extent key bad (too many replicas: %u): %s", replicas, buf); return; } if (!bkey_extent_is_cached(e.k) && replicas < c->sb.data_replicas_have) { bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), e.s_c); bch2_fs_bug(c, "extent key bad (too few replicas, %u < %u): %s", replicas, c->sb.data_replicas_have, buf); return; } return; bad_ptr: bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), e.s_c); bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu prio %i " "gen %i last_gc %i mark 0x%08x", buf, PTR_BUCKET_NR(ca, ptr), g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen, ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)], (unsigned) g->mark.counter); return; } static void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) { switch (k.k->type) { case BCH_EXTENT: case BCH_EXTENT_CACHED: bch2_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k)); break; case BCH_RESERVATION: break; default: BUG(); } } static void bch2_extent_to_text(struct bch_fs *c, char *buf, size_t size, struct bkey_s_c k) { char *out = buf, *end = buf + size; const char *invalid; #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) if (bkey_extent_is_data(k.k)) out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); invalid = bch2_extent_invalid(c, k); if (invalid) p(" invalid: %s", invalid); #undef p } static unsigned PTR_TIER(struct bch_fs *c, const struct bch_extent_ptr *ptr) { return c->devs[ptr->dev]->mi.tier; } static void bch2_extent_crc_init(union bch_extent_crc *crc, unsigned compressed_size, unsigned uncompressed_size, unsigned compression_type, unsigned nonce, struct bch_csum csum, unsigned csum_type) { if (bch_crc_bytes[csum_type] <= 4 && uncompressed_size <= CRC32_SIZE_MAX && nonce <= CRC32_NONCE_MAX) { crc->crc32 = (struct bch_extent_crc32) { .type = 1 << BCH_EXTENT_ENTRY_crc32, ._compressed_size = compressed_size - 1, ._uncompressed_size = uncompressed_size - 1, .offset = 0, .compression_type = compression_type, .csum_type = csum_type, .csum = *((__le32 *) &csum.lo), }; return; } if (bch_crc_bytes[csum_type] <= 10 && uncompressed_size <= CRC64_SIZE_MAX && nonce <= CRC64_NONCE_MAX) { crc->crc64 = (struct bch_extent_crc64) { .type = 1 << BCH_EXTENT_ENTRY_crc64, ._compressed_size = compressed_size - 1, ._uncompressed_size = uncompressed_size - 1, .offset = 0, .nonce = nonce, .compression_type = compression_type, .csum_type = csum_type, .csum_lo = csum.lo, .csum_hi = *((__le16 *) &csum.hi), }; return; } if (bch_crc_bytes[csum_type] <= 16 && uncompressed_size <= CRC128_SIZE_MAX && nonce <= CRC128_NONCE_MAX) { crc->crc128 = (struct bch_extent_crc128) { .type = 1 << BCH_EXTENT_ENTRY_crc128, ._compressed_size = compressed_size - 1, ._uncompressed_size = uncompressed_size - 1, .offset = 0, .nonce = nonce, .compression_type = compression_type, .csum_type = csum_type, .csum = csum, }; return; } BUG(); } void bch2_extent_crc_append(struct bkey_i_extent *e, unsigned compressed_size, unsigned uncompressed_size, unsigned compression_type, unsigned nonce, struct bch_csum csum, unsigned csum_type) { union bch_extent_crc *crc; BUG_ON(compressed_size > uncompressed_size); BUG_ON(uncompressed_size != e->k.size); BUG_ON(!compressed_size || !uncompressed_size); /* * Look up the last crc entry, so we can check if we need to add * another: 
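	 * (After the loop below, @crc points at the extent's last crc entry,
	 * or is NULL if the extent doesn't have any yet.)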
*/ extent_for_each_crc(extent_i_to_s(e), crc) ; if (!crc && !csum_type && !compression_type) return; if (crc && crc_compressed_size(&e->k, crc) == compressed_size && crc_uncompressed_size(&e->k, crc) == uncompressed_size && crc_offset(crc) == 0 && crc_nonce(crc) == nonce && crc_csum_type(crc) == csum_type && crc_compression_type(crc) == compression_type && crc_csum(crc).lo == csum.lo && crc_csum(crc).hi == csum.hi) return; bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), compressed_size, uncompressed_size, compression_type, nonce, csum, csum_type); __extent_entry_push(e); } /* * bch_extent_normalize - clean up an extent, dropping stale pointers etc. * * Returns true if @k should be dropped entirely * * For existing keys, only called when btree nodes are being rewritten, not when * they're merely being compacted/resorted in memory. */ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) { struct bkey_s_extent e; switch (k.k->type) { case KEY_TYPE_ERROR: return false; case KEY_TYPE_DELETED: case KEY_TYPE_COOKIE: return true; case KEY_TYPE_DISCARD: return bversion_zero(k.k->version); case BCH_EXTENT: case BCH_EXTENT_CACHED: e = bkey_s_to_extent(k); bch2_extent_drop_stale(c, e); if (!bkey_val_u64s(e.k)) { if (bkey_extent_is_cached(e.k)) { k.k->type = KEY_TYPE_DISCARD; if (bversion_zero(k.k->version)) return true; } else { k.k->type = KEY_TYPE_ERROR; } } return false; case BCH_RESERVATION: return false; default: BUG(); } } void bch2_extent_mark_replicas_cached(struct bch_fs *c, struct bkey_s_extent e, unsigned nr_cached) { struct bch_extent_ptr *ptr; bool have_higher_tier; unsigned tier = 0; if (!nr_cached) return; do { have_higher_tier = false; extent_for_each_ptr(e, ptr) { if (!ptr->cached && PTR_TIER(c, ptr) == tier) { ptr->cached = true; nr_cached--; if (!nr_cached) return; } if (PTR_TIER(c, ptr) > tier) have_higher_tier = true; } tier++; } while (have_higher_tier); } /* * This picks a non-stale pointer, preferabbly from a device other than * avoid. Avoid can be NULL, meaning pick any. If there are no non-stale * pointers to other devices, it will still pick a pointer from avoid. * Note that it prefers lowered-numbered pointers to higher-numbered pointers * as the pointers are sorted by tier, hence preferring pointers to tier 0 * rather than pointers to tier 1. 
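 * When ret->ca is a real device, an io_ref has been taken on it via
 * percpu_ref_tryget(); the caller is responsible for dropping that ref.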
*/ void bch2_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k, struct bch_dev *avoid, struct extent_pick_ptr *ret) { struct bkey_s_c_extent e; const union bch_extent_crc *crc; const struct bch_extent_ptr *ptr; switch (k.k->type) { case KEY_TYPE_DELETED: case KEY_TYPE_DISCARD: case KEY_TYPE_COOKIE: ret->ca = NULL; return; case KEY_TYPE_ERROR: ret->ca = ERR_PTR(-EIO); return; case BCH_EXTENT: case BCH_EXTENT_CACHED: e = bkey_s_c_to_extent(k); ret->ca = NULL; extent_for_each_ptr_crc(e, ptr, crc) { struct bch_dev *ca = c->devs[ptr->dev]; if (ptr->cached && ptr_stale(ca, ptr)) continue; if (ca->mi.state == BCH_MEMBER_STATE_FAILED) continue; if (ret->ca && (ca == avoid || ret->ca->mi.tier < ca->mi.tier)) continue; if (!percpu_ref_tryget(&ca->io_ref)) continue; if (ret->ca) percpu_ref_put(&ret->ca->io_ref); *ret = (struct extent_pick_ptr) { .crc = crc_to_128(e.k, crc), .ptr = *ptr, .ca = ca, }; } if (!ret->ca && !bkey_extent_is_cached(e.k)) ret->ca = ERR_PTR(-EIO); return; case BCH_RESERVATION: ret->ca = NULL; return; default: BUG(); } } static enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *bk, struct bkey_i *l, struct bkey_i *r) { struct bkey_s_extent el, er; union bch_extent_entry *en_l, *en_r; if (key_merging_disabled(c)) return BCH_MERGE_NOMERGE; /* * Generic header checks * Assumes left and right are in order * Left and right must be exactly aligned */ if (l->k.u64s != r->k.u64s || l->k.type != r->k.type || bversion_cmp(l->k.version, r->k.version) || bkey_cmp(l->k.p, bkey_start_pos(&r->k))) return BCH_MERGE_NOMERGE; switch (l->k.type) { case KEY_TYPE_DELETED: case KEY_TYPE_DISCARD: case KEY_TYPE_ERROR: /* These types are mergeable, and no val to check */ break; case BCH_EXTENT: case BCH_EXTENT_CACHED: el = bkey_i_to_s_extent(l); er = bkey_i_to_s_extent(r); extent_for_each_entry(el, en_l) { struct bch_extent_ptr *lp, *rp; unsigned bucket_size; en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); if ((extent_entry_type(en_l) != extent_entry_type(en_r)) || extent_entry_is_crc(en_l)) return BCH_MERGE_NOMERGE; lp = &en_l->ptr; rp = &en_r->ptr; if (lp->offset + el.k->size != rp->offset || lp->dev != rp->dev || lp->gen != rp->gen) return BCH_MERGE_NOMERGE; /* We don't allow extents to straddle buckets: */ bucket_size = c->devs[lp->dev]->mi.bucket_size; if ((lp->offset & ~((u64) bucket_size - 1)) != (rp->offset & ~((u64) bucket_size - 1))) return BCH_MERGE_NOMERGE; } break; case BCH_RESERVATION: { struct bkey_i_reservation *li = bkey_i_to_reservation(l); struct bkey_i_reservation *ri = bkey_i_to_reservation(r); if (li->v.generation != ri->v.generation || li->v.nr_replicas != ri->v.nr_replicas) return BCH_MERGE_NOMERGE; break; } default: return BCH_MERGE_NOMERGE; } l->k.needs_whiteout |= r->k.needs_whiteout; /* Keys with no pointers aren't restricted to one bucket and could * overflow KEY_SIZE */ if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) { bch2_key_resize(&l->k, KEY_SIZE_MAX); bch2_cut_front(l->k.p, r); return BCH_MERGE_PARTIAL; } bch2_key_resize(&l->k, l->k.size + r->k.size); return BCH_MERGE_MERGE; } static void extent_i_save(struct btree *b, struct bkey_packed *dst, struct bkey_i *src) { struct bkey_format *f = &b->format; struct bkey_i *dst_unpacked; BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k)); /* * We don't want the bch2_verify_key_order() call in extent_save(), * because we may be out of order with deleted keys that are about to be * removed by extent_bset_insert() */ if ((dst_unpacked = packed_to_bkey(dst))) bkey_copy(dst_unpacked, src); 
else BUG_ON(!bch2_bkey_pack(dst, src, f)); } static bool extent_merge_one_overlapping(struct btree_iter *iter, struct bpos new_pos, struct bset_tree *t, struct bkey_packed *k, struct bkey uk, bool check, bool could_pack) { struct btree *b = iter->nodes[0]; struct btree_node_iter *node_iter = &iter->node_iters[0]; BUG_ON(!bkey_deleted(k)); if (check) { return !bkey_packed(k) || could_pack; } else { uk.p = new_pos; extent_save(b, node_iter, k, &uk); bch2_bset_fix_invalidated_key(b, t, k); bch2_btree_node_iter_fix(iter, b, node_iter, t, k, k->u64s, k->u64s); return true; } } static bool extent_merge_do_overlapping(struct btree_iter *iter, struct bkey *m, bool back_merge) { struct btree *b = iter->nodes[0]; struct btree_node_iter *node_iter = &iter->node_iters[0]; struct bset_tree *t; struct bkey_packed *k; struct bkey uk; struct bpos new_pos = back_merge ? m->p : bkey_start_pos(m); bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b); bool check = true; /* * @m is the new merged extent: * * The merge took place in the last bset; we know there can't be any 0 * size extents overlapping with m there because if so they would have * been between the two extents we merged. * * But in the other bsets, we have to check for and fix such extents: */ do_fixup: for_each_bset(b, t) { if (t == bset_tree_last(b)) break; /* * if we don't find this bset in the iterator we already got to * the end of that bset, so start searching from the end. */ k = bch2_btree_node_iter_bset_pos(node_iter, b, t); if (k == btree_bkey_last(b, t)) k = bch2_bkey_prev_all(b, t, k); if (!k) continue; if (back_merge) { /* * Back merge: 0 size extents will be before the key * that was just inserted (and thus the iterator * position) - walk backwards to find them */ for (; k && (uk = bkey_unpack_key(b, k), bkey_cmp(uk.p, bkey_start_pos(m)) > 0); k = bch2_bkey_prev_all(b, t, k)) { if (bkey_cmp(uk.p, m->p) >= 0) continue; if (!extent_merge_one_overlapping(iter, new_pos, t, k, uk, check, could_pack)) return false; } } else { /* Front merge - walk forwards */ for (; k != btree_bkey_last(b, t) && (uk = bkey_unpack_key(b, k), bkey_cmp(uk.p, m->p) < 0); k = bkey_next(k)) { if (bkey_cmp(uk.p, bkey_start_pos(m)) <= 0) continue; if (!extent_merge_one_overlapping(iter, new_pos, t, k, uk, check, could_pack)) return false; } } } if (check) { check = false; goto do_fixup; } return true; } /* * When merging an extent that we're inserting into a btree node, the new merged * extent could overlap with an existing 0 size extent - if we don't fix that, * it'll break the btree node iterator so this code finds those 0 size extents * and shifts them out of the way. * * Also unpacks and repacks. */ static bool bch2_extent_merge_inline(struct bch_fs *c, struct btree_iter *iter, struct bkey_packed *l, struct bkey_packed *r, bool back_merge) { struct btree *b = iter->nodes[0]; struct btree_node_iter *node_iter = &iter->node_iters[0]; const struct bkey_format *f = &b->format; struct bset_tree *t = bset_tree_last(b); struct bkey_packed *m; BKEY_PADDED(k) li; BKEY_PADDED(k) ri; struct bkey_i *mi; struct bkey tmp; /* * We need to save copies of both l and r, because we might get a * partial merge (which modifies both) and then fails to repack */ bch2_bkey_unpack(b, &li.k, l); bch2_bkey_unpack(b, &ri.k, r); m = back_merge ? l : r; mi = back_merge ? 
&li.k : &ri.k; /* l & r should be in last bset: */ EBUG_ON(bch2_bkey_to_bset(b, m) != t); switch (bch2_extent_merge(c, b, &li.k, &ri.k)) { case BCH_MERGE_NOMERGE: return false; case BCH_MERGE_PARTIAL: if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &mi->k, f)) return false; if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge)) return false; extent_i_save(b, m, mi); bch2_bset_fix_invalidated_key(b, t, m); /* * Update iterator to reflect what we just inserted - otherwise, * the iter_fix() call is going to put us _before_ the key we * just partially merged with: */ if (back_merge) bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p); bch2_btree_node_iter_fix(iter, iter->nodes[0], node_iter, t, m, m->u64s, m->u64s); if (!back_merge) bkey_copy(packed_to_bkey(l), &li.k); else bkey_copy(packed_to_bkey(r), &ri.k); return false; case BCH_MERGE_MERGE: if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &li.k.k, f)) return false; if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge)) return false; extent_i_save(b, m, &li.k); bch2_bset_fix_invalidated_key(b, t, m); bch2_btree_node_iter_fix(iter, iter->nodes[0], node_iter, t, m, m->u64s, m->u64s); return true; default: BUG(); } } const struct bkey_ops bch2_bkey_extent_ops = { .key_invalid = bch2_extent_invalid, .key_debugcheck = bch2_extent_debugcheck, .val_to_text = bch2_extent_to_text, .swab = bch2_ptr_swab, .key_normalize = bch2_ptr_normalize, .key_merge = bch2_extent_merge, .is_extents = true, };
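
/*
 * Illustrative sketch (not part of the build) of how the cut helpers above
 * behave. An extent's position is its *end*, so a key with p.offset == 128
 * and size == 16 covers sectors [112, 128). Assuming the usual POS(inode,
 * offset) helper, an inode number inum, and an extent key k (struct bkey_i)
 * of that shape:
 *
 *	bch2_cut_front(POS(inum, 116), &k);	// k now covers [116, 128)
 *	bch2_cut_back(POS(inum, 124), &k.k);	// k now covers [116, 124)
 *
 * bch2_cut_front() also advances pointer/crc offsets (see __bch2_cut_front())
 * so the value still describes the surviving data; bch2_cut_back() only trims
 * the key. Either cut marks the key deleted if it ends up with zero size.
 */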