diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2017-05-08 02:28:15 -0800 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2017-05-08 06:57:17 -0800 |
commit | 63065c01285601afbe2457e92729efc11581e37d (patch) | |
tree | 8e37af7dcd60f0a260536064f4c6ec0c5dc24a06 /libbcachefs/btree_update.c | |
parent | e57a624feb82e6d1bb8bd77c0f185939b1367b19 (diff) |
Update bcachefs sources to 9ceb982d77 bcachefs: Store bucket gens in a btree
Diffstat (limited to 'libbcachefs/btree_update.c')
-rw-r--r-- | libbcachefs/btree_update.c | 137 |
1 files changed, 110 insertions, 27 deletions
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index 8a4ee6d1..9794ac3b 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -233,17 +233,29 @@ void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b) } static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, - bool use_reserve, - struct disk_reservation *res, - struct closure *cl) + struct disk_reservation *res, + struct closure *cl, + unsigned flags) { BKEY_PADDED(k) tmp; struct open_bucket *ob; struct btree *b; - unsigned reserve = use_reserve ? 0 : BTREE_NODE_RESERVE; + unsigned nr_reserve; + enum alloc_reserve alloc_reserve; + + if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) { + nr_reserve = 0; + alloc_reserve = RESERVE_ALLOC; + } else if (flags & BTREE_INSERT_USE_RESERVE) { + nr_reserve = BTREE_NODE_RESERVE / 2; + alloc_reserve = RESERVE_BTREE; + } else { + nr_reserve = BTREE_NODE_RESERVE; + alloc_reserve = RESERVE_NONE; + } mutex_lock(&c->btree_reserve_cache_lock); - if (c->btree_reserve_cache_nr > reserve) { + if (c->btree_reserve_cache_nr > nr_reserve) { struct btree_alloc *a = &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; @@ -263,8 +275,7 @@ retry: bkey_i_to_extent(&tmp.k), res->nr_replicas, c->opts.metadata_replicas_required, - use_reserve ? 
RESERVE_BTREE : RESERVE_NONE, - cl); + alloc_reserve, cl); if (IS_ERR(ob)) return ERR_CAST(ob); @@ -311,7 +322,7 @@ static struct btree *bch2_btree_node_alloc(struct bch_fs *c, bch2_btree_build_aux_trees(b); - bch2_check_mark_super(c, &b->key, true); + bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key), BCH_DATA_BTREE); trace_btree_node_alloc(c, b); return b; @@ -533,9 +544,6 @@ static struct btree_reserve *__bch2_btree_reserve_get(struct bch_fs *c, if (flags & BTREE_INSERT_NOFAIL) disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL; - if (flags & BTREE_INSERT_NOWAIT) - cl = NULL; - /* * This check isn't necessary for correctness - it's just to potentially * prevent us from doing a lot of work that'll end up being wasted: @@ -565,8 +573,9 @@ static struct btree_reserve *__bch2_btree_reserve_get(struct bch_fs *c, reserve->nr = 0; while (reserve->nr < nr_nodes) { - b = __bch2_btree_node_alloc(c, flags & BTREE_INSERT_USE_RESERVE, - &disk_res, cl); + b = __bch2_btree_node_alloc(c, &disk_res, + flags & BTREE_INSERT_NOWAIT + ? NULL : cl, flags); if (IS_ERR(b)) { ret = PTR_ERR(b); goto err_free; @@ -793,8 +802,8 @@ void bch2_btree_journal_key(struct btree_insert *trans, struct btree_write *w = btree_current_write(b); EBUG_ON(iter->level || b->level); - EBUG_ON(!trans->journal_res.ref && - test_bit(JOURNAL_REPLAY_DONE, &j->flags)); + EBUG_ON(trans->journal_res.ref != + !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); if (!journal_pin_active(&w->journal)) bch2_journal_pin_add(j, &trans->journal_res, @@ -1026,6 +1035,27 @@ retry: */ six_unlock_read(&b->lock); mutex_unlock(&c->btree_interior_update_lock); + + /* + * Bit of funny circularity going on here we have to break: + * + * We have to drop our journal pin before writing the journal + * entry that points to the new btree root: else, we could + * deadlock if the journal currently happens to be full. 
+ * + * This means we're dropping the journal pin _before_ the new + * nodes are technically reachable - but this is safe, because + * after the bch2_btree_set_root_ondisk() call above they will + * be reachable as of the very next journal write: + */ + bch2_journal_pin_drop(&c->journal, &as->journal); + + /* + * And, do a journal write to write the pointer to the new root, + * then wait for it to complete before freeing the nodes we + * replaced: + */ + bch2_journal_meta_async(&c->journal, cl); break; } @@ -1051,19 +1081,70 @@ static void btree_interior_update_updated_btree(struct bch_fs *c, mutex_unlock(&c->btree_interior_update_lock); + /* + * In general, when you're staging things in a journal that will later + * be written elsewhere, and you also want to guarantee ordering: that + * is, if you have updates a, b, c, after a crash you should never see c + * and not a or b - there's a problem: + * + * If the final destination of the update(s) (i.e. btree node) can be + * written/flushed _before_ the relevant journal entry - oops, that + * breaks ordering, since the various leaf nodes can be written in any + * order. + * + * Normally we use bset->journal_seq to deal with this - if during + * recovery we find a btree node write that's newer than the newest + * journal entry, we just ignore it - we don't need it, anything we're + * supposed to have (that we reported as completed via fsync()) will + * still be in the journal, and as far as the state of the journal is + * concerned that btree node write never happened. + * + * That breaks when we're rewriting/splitting/merging nodes, since we're + * mixing btree node writes that haven't happened yet with previously + * written data that has been reported as completed to the journal. + * + * Thus, before making the new nodes reachable, we have to wait for the + * newest journal sequence number we have data for to be written (if it + * hasn't been yet). 
+ */ bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl); continue_at(&as->cl, btree_interior_update_nodes_written, system_freezable_wq); } -static void btree_interior_update_reparent(struct btree_interior_update *as, +static void interior_update_flush(struct journal *j, + struct journal_entry_pin *pin, u64 seq) +{ + struct btree_interior_update *as = + container_of(pin, struct btree_interior_update, journal); + + bch2_journal_flush_seq_async(j, as->journal_seq, NULL); +} + +static void btree_interior_update_reparent(struct bch_fs *c, + struct btree_interior_update *as, struct btree_interior_update *child) { child->b = NULL; child->mode = BTREE_INTERIOR_UPDATING_AS; child->parent_as = as; closure_get(&as->cl); + + /* + * When we write a new btree root, we have to drop our journal pin + * _before_ the new nodes are technically reachable; see + * btree_interior_update_nodes_written(). + * + * This goes for journal pins that are recursively blocked on us - so, + * just transfer the journal pin to the new interior update so + * btree_interior_update_nodes_written() can drop it. 
+ */ + bch2_journal_pin_add_if_older(&c->journal, &child->journal, + &as->journal, interior_update_flush); + bch2_journal_pin_drop(&c->journal, &child->journal); + + as->journal_seq = max(as->journal_seq, child->journal_seq); } static void btree_interior_update_updated_root(struct bch_fs *c, @@ -1081,7 +1162,7 @@ static void btree_interior_update_updated_root(struct bch_fs *c, * btree_interior_update operation to point to us: */ if (r->as) - btree_interior_update_reparent(as, r->as); + btree_interior_update_reparent(c, as, r->as); as->mode = BTREE_INTERIOR_UPDATING_ROOT; as->b = r->b; @@ -1089,19 +1170,21 @@ static void btree_interior_update_updated_root(struct bch_fs *c, mutex_unlock(&c->btree_interior_update_lock); + /* + * When we're rewriting nodes and updating interior nodes, there's an + * issue with updates that haven't been written in the journal getting + * mixed together with older data - see btree_interior_update_updated_btree() + * for the explanation. + * + * However, this doesn't affect us when we're writing a new btree root - + * because to make that new root reachable we have to write out a new + * journal entry, which must necessarily be newer than as->journal_seq. 
+ */ + continue_at(&as->cl, btree_interior_update_nodes_written, system_freezable_wq); } -static void interior_update_flush(struct journal *j, - struct journal_entry_pin *pin, u64 seq) -{ - struct btree_interior_update *as = - container_of(pin, struct btree_interior_update, journal); - - bch2_journal_flush_seq_async(j, as->journal_seq, NULL); -} - /* * @b is being split/rewritten: it may have pointers to not-yet-written btree * nodes and thus outstanding btree_interior_updates - redirect @b's @@ -1150,7 +1233,7 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c, */ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { list_del(&p->write_blocked_list); - btree_interior_update_reparent(as, p); + btree_interior_update_reparent(c, as, p); } clear_btree_node_dirty(b); |