diff options
Diffstat (limited to 'libbcachefs/btree_io.c')
-rw-r--r-- | libbcachefs/btree_io.c | 165 |
1 files changed, 112 insertions, 53 deletions
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 8152dc4b..82dd196d 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -872,32 +872,57 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, struct nonce nonce) vstruct_end(i) - (void *) i->_data); } -#define btree_node_error(b, c, ptr, fmt, ...) \ - bch2_fs_inconsistent(c, \ - "btree node error at btree %u level %u/%u bucket %zu block %u u64s %u: " fmt,\ - (b)->btree_id, (b)->level, btree_node_root(c, b) \ - ? btree_node_root(c, b)->level : -1, \ - PTR_BUCKET_NR(ca, ptr), (b)->written, \ - le16_to_cpu((i)->u64s), ##__VA_ARGS__) - -static const char *validate_bset(struct bch_fs *c, struct btree *b, - struct bch_dev *ca, - const struct bch_extent_ptr *ptr, - struct bset *i, unsigned sectors, - unsigned *whiteout_u64s) +#define btree_node_error(c, b, ptr, msg, ...) \ +do { \ + if (write == READ && \ + !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ + mustfix_fsck_err(c, \ + "btree node read error at btree %u level %u/%u\n"\ + "sector %llu node offset %u bset u64s %u: " msg,\ + (b)->btree_id, (b)->level, \ + (c)->btree_roots[(b)->btree_id].level, \ + (u64) ptr->offset, (b)->written, \ + le16_to_cpu((i)->u64s), ##__VA_ARGS__); \ + } else { \ + bch_err(c, "%s at btree %u level %u/%u\n" \ + "sector %llu node offset %u bset u64s %u: " msg,\ + write == WRITE \ + ? "corrupt metadata in btree node write" \ + : "btree node error", \ + (b)->btree_id, (b)->level, \ + (c)->btree_roots[(b)->btree_id].level, \ + (u64) ptr->offset, (b)->written, \ + le16_to_cpu((i)->u64s), ##__VA_ARGS__); \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + goto fsck_err; \ + } \ +} while (0) + +static int validate_bset(struct bch_fs *c, struct btree *b, + const struct bch_extent_ptr *ptr, + struct bset *i, unsigned sectors, + unsigned *whiteout_u64s, + int write) { struct bkey_packed *k, *prev = NULL; struct bpos prev_pos = POS_MIN; bool seen_non_whiteout = false; + int ret = 0; - if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION) - return "unsupported bset version"; + if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION) { + btree_node_error(c, b, ptr, "unsupported bset version"); + i->u64s = 0; + return 0; + } - if (b->written + sectors > c->sb.btree_node_size) - return "bset past end of btree node"; + if (b->written + sectors > c->sb.btree_node_size) { + btree_node_error(c, b, ptr, "bset past end of btree node"); + i->u64s = 0; + return 0; + } - if (i != &b->data->keys && !i->u64s) - btree_node_error(b, c, ptr, "empty set"); + if (b->written && !i->u64s) + btree_node_error(c, b, ptr, "empty set"); if (!BSET_SEPARATE_WHITEOUTS(i)) { seen_non_whiteout = true; @@ -911,7 +936,7 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b, const char *invalid; if (!k->u64s) { - btree_node_error(b, c, ptr, + btree_node_error(c, b, ptr, "KEY_U64s 0: %zu bytes of metadata lost", vstruct_end(i) - (void *) k); @@ -920,7 +945,7 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b, } if (bkey_next(k) > vstruct_last(i)) { - btree_node_error(b, c, ptr, + btree_node_error(c, b, ptr, "key extends past end of bset"); i->u64s = cpu_to_le16((u64 *) k - i->_data); @@ -928,7 +953,7 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b, } if (k->format > KEY_FORMAT_CURRENT) { - btree_node_error(b, c, ptr, + btree_node_error(c, b, ptr, "invalid bkey format %u", k->format); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); @@ -947,8 +972,8 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b, char buf[160]; bch2_bkey_val_to_text(c, btree_node_type(b), - buf, sizeof(buf), u); - btree_node_error(b, c, ptr, + buf, sizeof(buf), u); + btree_node_error(c, b, ptr, "invalid bkey %s: %s", buf, invalid); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); @@ -969,7 +994,7 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b, *whiteout_u64s = k->_data - i->_data; seen_non_whiteout = true; } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) { - btree_node_error(b, c, ptr, + btree_node_error(c, b, ptr, "keys out of order: %llu:%llu > %llu:%llu", prev_pos.inode, prev_pos.offset, @@ -984,7 +1009,8 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b, } SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); - return NULL; +fsck_err: + return ret; } static bool extent_contains_ptr(struct bkey_s_c_extent e, @@ -1012,7 +1038,7 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, const char *err; struct bch_csum csum; struct nonce nonce; - int ret; + int ret, write = READ; iter = mempool_alloc(&c->fill_iter, GFP_NOIO); __bch2_btree_node_iter_init(iter, btree_node_is_extents(b)); @@ -1115,9 +1141,10 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, sectors = vstruct_sectors(bne, c->block_bits); } - err = validate_bset(c, b, ca, ptr, i, sectors, &whiteout_u64s); - if (err) - goto err; + ret = validate_bset(c, b, ptr, i, sectors, + &whiteout_u64s, READ); + if (ret) + goto fsck_err; b->written += sectors; @@ -1172,8 +1199,10 @@ out: mempool_free(iter, &c->fill_iter); return; err: + btree_node_error(c, b, ptr, "%s", err); +fsck_err: + bch2_inconsistent_error(c); set_btree_node_read_error(b); - btree_node_error(b, c, ptr, "%s", err); goto out; } @@ -1309,6 +1338,23 @@ static void btree_node_write_endio(struct bio *bio) } } +static int validate_bset_for_write(struct bch_fs *c, struct btree *b, + struct bset *i, unsigned sectors) +{ + const struct bch_extent_ptr *ptr; + unsigned whiteout_u64s = 0; + int ret; + + extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr) + break; + + ret = validate_bset(c, b, ptr, i, sectors, &whiteout_u64s, WRITE); + if (ret) + bch2_fatal_error(c); + + return ret; +} + void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, struct closure *parent, enum six_lock_type lock_type_held) @@ -1343,18 +1389,24 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, if (!(old & (1 << BTREE_NODE_dirty))) return; + if (b->written && + !btree_node_may_write(b)) + return; + if (old & (1 << BTREE_NODE_write_in_flight)) { btree_node_wait_on_io(b); continue; } new &= ~(1 << BTREE_NODE_dirty); + new &= ~(1 << BTREE_NODE_need_write); new |= (1 << BTREE_NODE_write_in_flight); new |= (1 << BTREE_NODE_just_written); new ^= (1 << BTREE_NODE_write_idx); } while (cmpxchg_acquire(&b->flags, old, new) != old); BUG_ON(!list_empty(&b->write_blocked)); + BUG_ON(!list_empty_careful(&b->reachable) != !b->written); BUG_ON(b->written >= c->sb.btree_node_size); BUG_ON(bset_written(b, btree_bset_last(b))); @@ -1430,13 +1482,17 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, clear_needs_whiteout(i); - if (b->written && !i->u64s) { - /* Nothing to write: */ - btree_bounce_free(c, order, used_mempool, data); - btree_node_write_done(c, b); - return; - } + /* do we have data to write? */ + if (b->written && !i->u64s) + goto nowrite; + + bytes_to_write = vstruct_end(i) - data; + sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; + + memset(data + bytes_to_write, 0, + (sectors_to_write << 9) - bytes_to_write); + BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size); BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); BUG_ON(i->seq != b->data->keys.seq); @@ -1445,6 +1501,11 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, nonce = btree_nonce(b, i, b->written << 9); + /* if we're going to be encrypting, check metadata validity first: */ + if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) && + validate_bset_for_write(c, b, i, sectors_to_write)) + goto err; + if (bn) { bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, @@ -1464,15 +1525,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); } - bytes_to_write = vstruct_end(i) - data; - sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; - - memset(data + bytes_to_write, 0, - (sectors_to_write << 9) - bytes_to_write); - - BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size); - - trace_btree_write(b, bytes_to_write, sectors_to_write); + /* if we're not encrypting, check metadata after checksumming: */ + if (!bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) && + validate_bset_for_write(c, b, i, sectors_to_write)) + goto err; /* * We handle btree write errors by immediately halting the journal - @@ -1488,14 +1544,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, * break: */ if (bch2_journal_error(&c->journal) || - c->opts.nochanges) { - set_btree_node_noevict(b); - b->written += sectors_to_write; + c->opts.nochanges) + goto err; - btree_bounce_free(c, order, used_mempool, data); - btree_node_write_done(c, b); - return; - } + trace_btree_write(b, bytes_to_write, sectors_to_write); bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write); @@ -1543,6 +1595,13 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, b->written += sectors_to_write; bch2_submit_wbio_replicas(wbio, c, &k.key); + return; +err: + set_btree_node_noevict(b); + b->written += sectors_to_write; +nowrite: + btree_bounce_free(c, order, used_mempool, data); + btree_node_write_done(c, b); } /* |