diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2018-05-02 13:56:25 -0400 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2018-05-02 15:54:18 -0400 |
commit | 92fb4f056560452c3b6302b33481e8fe8c2638cf (patch) | |
tree | fc50ea57867307a7e959e8ece31052a1e3d77b62 | |
parent | 9d60f71727b2b1b1f20bb2894fe3c0c07ac21957 (diff) |
bcachefs: Split out journal_seq_blacklist.c
-rw-r--r-- | fs/bcachefs/Makefile | 1 | ||||
-rw-r--r-- | fs/bcachefs/btree_io.c | 2 | ||||
-rw-r--r-- | fs/bcachefs/btree_update_leaf.c | 12 | ||||
-rw-r--r-- | fs/bcachefs/journal.c | 331 | ||||
-rw-r--r-- | fs/bcachefs/journal.h | 50 | ||||
-rw-r--r-- | fs/bcachefs/journal_seq_blacklist.c | 317 | ||||
-rw-r--r-- | fs/bcachefs/journal_seq_blacklist.h | 11 | ||||
-rw-r--r-- | fs/bcachefs/journal_types.h | 2 |
8 files changed, 395 insertions, 331 deletions
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index aff607ec7c8c..fea5f7b0a24d 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -30,6 +30,7 @@ bcachefs-y := \ inode.o \ io.o \ journal.o \ + journal_seq_blacklist.o \ keylist.o \ lz4_decompress.o \ migrate.o \ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 42190f05a0c1..27b86dcd636f 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -13,7 +13,7 @@ #include "error.h" #include "extents.h" #include "io.h" -#include "journal.h" +#include "journal_seq_blacklist.h" #include "super-io.h" #include <trace/events/bcachefs.h> diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 53b39de52c6b..7935226f55fb 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -137,7 +137,7 @@ void bch2_btree_journal_key(struct btree_insert *trans, EBUG_ON(trans->journal_res.ref != !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); - if (likely(trans->journal_res.ref)) { + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { u64 seq = trans->journal_res.seq; bool needs_whiteout = insert->k.needs_whiteout; @@ -155,12 +155,16 @@ void bch2_btree_journal_key(struct btree_insert *trans, btree_bset_last(b)->journal_seq = cpu_to_le64(seq); } - if (unlikely(!journal_pin_active(&w->journal))) - bch2_journal_pin_add(j, &trans->journal_res, - &w->journal, + if (unlikely(!journal_pin_active(&w->journal))) { + u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + ? trans->journal_res.seq + : j->replay_journal_seq; + + bch2_journal_pin_add(j, seq, &w->journal, btree_node_write_idx(b) == 0 ? btree_node_flush0 : btree_node_flush1); + } if (unlikely(!btree_node_dirty(b))) set_btree_node_dirty(b); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index aaa7ffa5237f..837228d9d59c 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -10,8 +10,6 @@ #include "buckets.h" #include "btree_gc.h" #include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_io.h" #include "checksum.h" #include "debug.h" #include "error.h" @@ -19,6 +17,7 @@ #include "io.h" #include "keylist.h" #include "journal.h" +#include "journal_seq_blacklist.h" #include "replicas.h" #include "super-io.h" #include "vstructs.h" @@ -27,10 +26,6 @@ static void journal_write(struct closure *); static void journal_reclaim_fast(struct journal *); -static void journal_pin_add_entry(struct journal *, - struct journal_entry_pin_list *, - struct journal_entry_pin *, - journal_pin_flush_fn); static inline void journal_wake(struct journal *j) { @@ -38,30 +33,6 @@ static inline void journal_wake(struct journal *j) closure_wake_up(&j->async_wait); } -static inline struct journal_buf *journal_cur_buf(struct journal *j) -{ - return j->buf + j->reservations.idx; -} - -static inline struct journal_buf *journal_prev_buf(struct journal *j) -{ - return j->buf + !j->reservations.idx; -} - -/* Sequence number of oldest dirty journal entry */ - -static inline u64 journal_last_seq(struct journal *j) -{ - return j->pin.front; -} - -static inline u64 journal_cur_seq(struct journal *j) -{ - BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); - - return j->pin.back - 1; -} - static inline u64 journal_pin_seq(struct journal *j, struct journal_entry_pin_list *pin_list) { @@ -80,18 +51,6 @@ u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin) return ret; } -static inline void bch2_journal_add_entry_noreservation(struct journal_buf *buf, - unsigned type, enum btree_id id, - unsigned level, - const void *data, size_t u64s) -{ - struct jset *jset = buf->data; - - bch2_journal_add_entry_at(buf, le32_to_cpu(jset->u64s), - type, id, level, data, u64s); - le32_add_cpu(&jset->u64s, jset_u64s(u64s)); -} - static struct jset_entry *bch2_journal_find_entry(struct jset *j, unsigned type, enum btree_id id) { @@ -132,216 +91,6 @@ static void bch2_journal_add_btree_root(struct journal_buf *buf, k, k->k.u64s); } -static void journal_seq_blacklist_flush(struct journal *j, - struct journal_entry_pin *pin, u64 seq) -{ - struct bch_fs *c = - container_of(j, struct bch_fs, journal); - struct journal_seq_blacklist *bl = - container_of(pin, struct journal_seq_blacklist, pin); - struct blacklisted_node n; - struct closure cl; - unsigned i; - int ret; - - closure_init_stack(&cl); - - for (i = 0;; i++) { - struct btree_iter iter; - struct btree *b; - - mutex_lock(&j->blacklist_lock); - if (i >= bl->nr_entries) { - mutex_unlock(&j->blacklist_lock); - break; - } - n = bl->entries[i]; - mutex_unlock(&j->blacklist_lock); - - __bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0); - - b = bch2_btree_iter_peek_node(&iter); - - /* The node might have already been rewritten: */ - - if (b->data->keys.seq == n.seq) { - ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0); - if (ret) { - bch2_btree_iter_unlock(&iter); - bch2_fs_fatal_error(c, - "error %i rewriting btree node with blacklisted journal seq", - ret); - bch2_journal_halt(j); - return; - } - } - - bch2_btree_iter_unlock(&iter); - } - - for (i = 0;; i++) { - struct btree_update *as; - struct pending_btree_node_free *d; - - mutex_lock(&j->blacklist_lock); - if (i >= bl->nr_entries) { - mutex_unlock(&j->blacklist_lock); - break; - } - n = bl->entries[i]; - mutex_unlock(&j->blacklist_lock); -redo_wait: - mutex_lock(&c->btree_interior_update_lock); - - /* - * Is the node on the list of pending interior node updates - - * being freed? If so, wait for that to finish: - */ - for_each_pending_btree_node_free(c, as, d) - if (n.seq == d->seq && - n.btree_id == d->btree_id && - !d->level && - !bkey_cmp(n.pos, d->key.k.p)) { - closure_wait(&as->wait, &cl); - mutex_unlock(&c->btree_interior_update_lock); - closure_sync(&cl); - goto redo_wait; - } - - mutex_unlock(&c->btree_interior_update_lock); - } - - mutex_lock(&j->blacklist_lock); - - bch2_journal_pin_drop(j, &bl->pin); - list_del(&bl->list); - kfree(bl->entries); - kfree(bl); - - mutex_unlock(&j->blacklist_lock); -} - -static struct journal_seq_blacklist * -journal_seq_blacklist_find(struct journal *j, u64 seq) -{ - struct journal_seq_blacklist *bl; - - lockdep_assert_held(&j->blacklist_lock); - - list_for_each_entry(bl, &j->seq_blacklist, list) - if (seq == bl->seq) - return bl; - - return NULL; -} - -static struct journal_seq_blacklist * -bch2_journal_seq_blacklisted_new(struct journal *j, u64 seq) -{ - struct journal_seq_blacklist *bl; - - lockdep_assert_held(&j->blacklist_lock); - - /* - * When we start the journal, bch2_journal_start() will skip over @seq: - */ - - bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return NULL; - - bl->seq = seq; - list_add_tail(&bl->list, &j->seq_blacklist); - return bl; -} - -/* - * Returns true if @seq is newer than the most recent journal entry that got - * written, and data corresponding to @seq should be ignored - also marks @seq - * as blacklisted so that on future restarts the corresponding data will still - * be ignored: - */ -int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b) -{ - struct journal *j = &c->journal; - struct journal_seq_blacklist *bl = NULL; - struct blacklisted_node *n; - u64 journal_seq, i; - int ret = 0; - - if (!seq) - return 0; - - spin_lock(&j->lock); - journal_seq = journal_cur_seq(j); - spin_unlock(&j->lock); - - /* Interier updates aren't journalled: */ - BUG_ON(b->level); - BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)); - - /* - * Decrease this back to j->seq + 2 when we next rev the on disk format: - * increasing it temporarily to work around bug in old kernels - */ - bch2_fs_inconsistent_on(seq > journal_seq + 4, c, - "bset journal seq too far in the future: %llu > %llu", - seq, journal_seq); - - if (seq <= journal_seq && - list_empty_careful(&j->seq_blacklist)) - return 0; - - mutex_lock(&j->blacklist_lock); - - if (seq <= journal_seq) { - bl = journal_seq_blacklist_find(j, seq); - if (!bl) - goto out; - } else { - bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting", - b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq); - - for (i = journal_seq + 1; i <= seq; i++) { - bl = journal_seq_blacklist_find(j, i) ?: - bch2_journal_seq_blacklisted_new(j, i); - if (!bl) { - ret = -ENOMEM; - goto out; - } - } - } - - for (n = bl->entries; n < bl->entries + bl->nr_entries; n++) - if (b->data->keys.seq == n->seq && - b->btree_id == n->btree_id && - !bkey_cmp(b->key.k.p, n->pos)) - goto found_entry; - - if (!bl->nr_entries || - is_power_of_2(bl->nr_entries)) { - n = krealloc(bl->entries, - max(bl->nr_entries * 2, 8UL) * sizeof(*n), - GFP_KERNEL); - if (!n) { - ret = -ENOMEM; - goto out; - } - bl->entries = n; - } - - bl->entries[bl->nr_entries++] = (struct blacklisted_node) { - .seq = b->data->keys.seq, - .btree_id = b->btree_id, - .pos = b->key.k.p, - }; -found_entry: - ret = 1; -out: - mutex_unlock(&j->blacklist_lock); - return ret; -} - /* * Journal replay/recovery: * @@ -947,35 +696,6 @@ void bch2_journal_entries_free(struct list_head *list) } } -static int journal_seq_blacklist_read(struct journal *j, - struct journal_replay *i, - struct journal_entry_pin_list *p) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct jset_entry *entry; - struct journal_seq_blacklist *bl; - u64 seq; - - for_each_jset_entry_type(entry, &i->j, - JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) { - struct jset_entry_blacklist *bl_entry = - container_of(entry, struct jset_entry_blacklist, entry); - seq = le64_to_cpu(bl_entry->seq); - - bch_verbose(c, "blacklisting existing journal seq %llu", seq); - - bl = bch2_journal_seq_blacklisted_new(j, seq); - if (!bl) - return -ENOMEM; - - journal_pin_add_entry(j, p, &bl->pin, - journal_seq_blacklist_flush); - bl->written = true; - } - - return 0; -} - static inline bool journal_has_keys(struct list_head *list) { struct journal_replay *i; @@ -1091,7 +811,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) atomic_set(&p->count, 1); p->devs = i->devs; - if (journal_seq_blacklist_read(j, i, p)) { + if (bch2_journal_seq_blacklist_read(j, i)) { mutex_unlock(&j->blacklist_lock); return -ENOMEM; } @@ -1110,10 +830,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) mutex_lock(&j->blacklist_lock); while (cur_seq < le64_to_cpu(i->j.seq) && - journal_seq_blacklist_find(j, cur_seq)) + bch2_journal_seq_blacklist_find(j, cur_seq)) cur_seq++; - blacklisted = journal_seq_blacklist_find(j, + blacklisted = bch2_journal_seq_blacklist_find(j, le64_to_cpu(i->j.seq)); mutex_unlock(&j->blacklist_lock); @@ -1512,18 +1232,7 @@ void bch2_journal_start(struct bch_fs *c) * disk for the next journal entry - this is ok, because these entries * only have to go down with the next journal entry we write: */ - list_for_each_entry(bl, &j->seq_blacklist, list) - if (!bl->written) { - bch2_journal_add_entry_noreservation(journal_cur_buf(j), - JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED, - 0, 0, &bl->seq, 1); - - journal_pin_add_entry(j, - &fifo_peek_back(&j->pin), - &bl->pin, - journal_seq_blacklist_flush); - bl->written = true; - } + bch2_journal_seq_blacklist_write(j); queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0); } @@ -1531,14 +1240,15 @@ void bch2_journal_start(struct bch_fs *c) int bch2_journal_replay(struct bch_fs *c, struct list_head *list) { struct journal *j = &c->journal; + struct journal_entry_pin_list *pin_list; struct bkey_i *k, *_n; struct jset_entry *entry; struct journal_replay *i, *n; int ret = 0; list_for_each_entry_safe(i, n, list, list) { - j->replay_pin_list = - journal_seq_pin(j, le64_to_cpu(i->j.seq)); + + j->replay_journal_seq = le64_to_cpu(i->j.seq); for_each_jset_key(k, _n, entry, &i->j) { @@ -1572,11 +1282,13 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) cond_resched(); } - if (atomic_dec_and_test(&j->replay_pin_list->count)) + pin_list = journal_seq_pin(j, j->replay_journal_seq); + + if (atomic_dec_and_test(&pin_list->count)) journal_wake(j); } - j->replay_pin_list = NULL; + j->replay_journal_seq = 0; bch2_journal_set_replay_done(j); ret = bch2_journal_flush_all_pins(j); @@ -1806,27 +1518,12 @@ static inline void __journal_pin_add(struct journal *j, journal_wake(j); } -static void journal_pin_add_entry(struct journal *j, - struct journal_entry_pin_list *pin_list, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - spin_lock(&j->lock); - __journal_pin_add(j, pin_list, pin, flush_fn); - spin_unlock(&j->lock); -} - -void bch2_journal_pin_add(struct journal *j, - struct journal_res *res, +void bch2_journal_pin_add(struct journal *j, u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { - struct journal_entry_pin_list *pin_list = res->ref - ? journal_seq_pin(j, res->seq) - : j->replay_pin_list; - spin_lock(&j->lock); - __journal_pin_add(j, pin_list, pin, flush_fn); + __journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn); spin_unlock(&j->lock); } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index cf5cc9ba008e..26bb787937fb 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -112,6 +112,32 @@ #include "journal_types.h" +struct bch_fs; + +static inline struct journal_buf *journal_cur_buf(struct journal *j) +{ + return j->buf + j->reservations.idx; +} + +static inline struct journal_buf *journal_prev_buf(struct journal *j) +{ + return j->buf + !j->reservations.idx; +} + +/* Sequence number of oldest dirty journal entry */ + +static inline u64 journal_last_seq(struct journal *j) +{ + return j->pin.front; +} + +static inline u64 journal_cur_seq(struct journal *j) +{ + BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); + + return j->pin.back - 1; +} + /* * Only used for holding the journal entries we read in btree_journal_read() * during cache_registration @@ -155,13 +181,15 @@ static inline bool journal_pin_active(struct journal_entry_pin *pin) static inline struct journal_entry_pin_list * journal_seq_pin(struct journal *j, u64 seq) { + BUG_ON(seq < j->pin.front || seq >= j->pin.back); + return &j->pin.data[seq & j->pin.mask]; } u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *); -void bch2_journal_pin_add(struct journal *, struct journal_res *, - struct journal_entry_pin *, journal_pin_flush_fn); +void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, + journal_pin_flush_fn); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); void bch2_journal_pin_add_if_older(struct journal *, struct journal_entry_pin *, @@ -170,15 +198,9 @@ void bch2_journal_pin_add_if_older(struct journal *, int bch2_journal_flush_pins(struct journal *, u64); int bch2_journal_flush_all_pins(struct journal *); -struct closure; -struct bch_fs; -struct keylist; - struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *, enum btree_id, unsigned *); -int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *); - u64 bch2_inode_journal_seq(struct journal *, u64); static inline int journal_state_count(union journal_res_state s, int idx) @@ -230,6 +252,18 @@ static inline void bch2_journal_add_entry_at(struct journal_buf *buf, memcpy_u64s(entry->_data, data, u64s); } +static inline void bch2_journal_add_entry_noreservation(struct journal_buf *buf, + unsigned type, enum btree_id id, + unsigned level, + const void *data, size_t u64s) +{ + struct jset *jset = buf->data; + + bch2_journal_add_entry_at(buf, le32_to_cpu(jset->u64s), + type, id, level, data, u64s); + le32_add_cpu(&jset->u64s, jset_u64s(u64s)); +} + static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res, unsigned type, enum btree_id id, unsigned level, diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c new file mode 100644 index 000000000000..ba697763daff --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -0,0 +1,317 @@ + +#include "bcachefs.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "error.h" +#include "journal.h" +#include "journal_seq_blacklist.h" + +/* + * journal_seq_blacklist machinery: + * + * To guarantee order of btree updates after a crash, we need to detect when a + * btree node entry (bset) is newer than the newest journal entry that was + * successfully written, and ignore it - effectively ignoring any btree updates + * that didn't make it into the journal. + * + * If we didn't do this, we might have two btree nodes, a and b, both with + * updates that weren't written to the journal yet: if b was updated after a, + * but b was flushed and not a - oops; on recovery we'll find that the updates + * to b happened, but not the updates to a that happened before it. + * + * Ignoring bsets that are newer than the newest journal entry is always safe, + * because everything they contain will also have been journalled - and must + * still be present in the journal on disk until a journal entry has been + * written _after_ that bset was written. + * + * To accomplish this, bsets record the newest journal sequence number they + * contain updates for; then, on startup, the btree code queries the journal + * code to ask "Is this sequence number newer than the newest journal entry? If + * so, ignore it." + * + * When this happens, we must blacklist that journal sequence number: the + * journal must not write any entries with that sequence number, and it must + * record that it was blacklisted so that a) on recovery we don't think we have + * missing journal entries and b) so that the btree code continues to ignore + * that bset, until that btree node is rewritten. + * + * Blacklisted journal sequence numbers are themselves recorded as entries in + * the journal. + */ + +/* + * Called when journal needs to evict a blacklist entry to reclaim space: find + * any btree nodes that refer to the blacklist journal sequence numbers, and + * rewrite them: + */ +static void journal_seq_blacklist_flush(struct journal *j, + struct journal_entry_pin *pin, u64 seq) +{ + struct bch_fs *c = + container_of(j, struct bch_fs, journal); + struct journal_seq_blacklist *bl = + container_of(pin, struct journal_seq_blacklist, pin); + struct blacklisted_node n; + struct closure cl; + unsigned i; + int ret; + + closure_init_stack(&cl); + + for (i = 0;; i++) { + struct btree_iter iter; + struct btree *b; + + mutex_lock(&j->blacklist_lock); + if (i >= bl->nr_entries) { + mutex_unlock(&j->blacklist_lock); + break; + } + n = bl->entries[i]; + mutex_unlock(&j->blacklist_lock); + + __bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0); + + b = bch2_btree_iter_peek_node(&iter); + + /* The node might have already been rewritten: */ + + if (b->data->keys.seq == n.seq) { + ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0); + if (ret) { + bch2_btree_iter_unlock(&iter); + bch2_fs_fatal_error(c, + "error %i rewriting btree node with blacklisted journal seq", + ret); + bch2_journal_halt(j); + return; + } + } + + bch2_btree_iter_unlock(&iter); + } + + for (i = 0;; i++) { + struct btree_update *as; + struct pending_btree_node_free *d; + + mutex_lock(&j->blacklist_lock); + if (i >= bl->nr_entries) { + mutex_unlock(&j->blacklist_lock); + break; + } + n = bl->entries[i]; + mutex_unlock(&j->blacklist_lock); +redo_wait: + mutex_lock(&c->btree_interior_update_lock); + + /* + * Is the node on the list of pending interior node updates - + * being freed? If so, wait for that to finish: + */ + for_each_pending_btree_node_free(c, as, d) + if (n.seq == d->seq && + n.btree_id == d->btree_id && + !d->level && + !bkey_cmp(n.pos, d->key.k.p)) { + closure_wait(&as->wait, &cl); + mutex_unlock(&c->btree_interior_update_lock); + closure_sync(&cl); + goto redo_wait; + } + + mutex_unlock(&c->btree_interior_update_lock); + } + + mutex_lock(&j->blacklist_lock); + + bch2_journal_pin_drop(j, &bl->pin); + list_del(&bl->list); + kfree(bl->entries); + kfree(bl); + + mutex_unlock(&j->blacklist_lock); +} + +/* + * Determine if a particular sequence number is blacklisted - if so, return + * blacklist entry: + */ +struct journal_seq_blacklist * +bch2_journal_seq_blacklist_find(struct journal *j, u64 seq) +{ + struct journal_seq_blacklist *bl; + + lockdep_assert_held(&j->blacklist_lock); + + list_for_each_entry(bl, &j->seq_blacklist, list) + if (seq == bl->seq) + return bl; + + return NULL; +} + +/* + * Allocate a new, in memory blacklist entry: + */ +static struct journal_seq_blacklist * +bch2_journal_seq_blacklisted_new(struct journal *j, u64 seq) +{ + struct journal_seq_blacklist *bl; + + lockdep_assert_held(&j->blacklist_lock); + + /* + * When we start the journal, bch2_journal_start() will skip over @seq: + */ + + bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return NULL; + + bl->seq = seq; + list_add_tail(&bl->list, &j->seq_blacklist); + return bl; +} + +/* + * Returns true if @seq is newer than the most recent journal entry that got + * written, and data corresponding to @seq should be ignored - also marks @seq + * as blacklisted so that on future restarts the corresponding data will still + * be ignored: + */ +int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b) +{ + struct journal *j = &c->journal; + struct journal_seq_blacklist *bl = NULL; + struct blacklisted_node *n; + u64 journal_seq, i; + int ret = 0; + + if (!seq) + return 0; + + spin_lock(&j->lock); + journal_seq = journal_cur_seq(j); + spin_unlock(&j->lock); + + /* Interier updates aren't journalled: */ + BUG_ON(b->level); + BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)); + + /* + * Decrease this back to j->seq + 2 when we next rev the on disk format: + * increasing it temporarily to work around bug in old kernels + */ + bch2_fs_inconsistent_on(seq > journal_seq + 4, c, + "bset journal seq too far in the future: %llu > %llu", + seq, journal_seq); + + if (seq <= journal_seq && + list_empty_careful(&j->seq_blacklist)) + return 0; + + mutex_lock(&j->blacklist_lock); + + if (seq <= journal_seq) { + bl = bch2_journal_seq_blacklist_find(j, seq); + if (!bl) + goto out; + } else { + bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting", + b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq); + + for (i = journal_seq + 1; i <= seq; i++) { + bl = bch2_journal_seq_blacklist_find(j, i) ?: + bch2_journal_seq_blacklisted_new(j, i); + if (!bl) { + ret = -ENOMEM; + goto out; + } + } + } + + for (n = bl->entries; n < bl->entries + bl->nr_entries; n++) + if (b->data->keys.seq == n->seq && + b->btree_id == n->btree_id && + !bkey_cmp(b->key.k.p, n->pos)) + goto found_entry; + + if (!bl->nr_entries || + is_power_of_2(bl->nr_entries)) { + n = krealloc(bl->entries, + max(bl->nr_entries * 2, 8UL) * sizeof(*n), + GFP_KERNEL); + if (!n) { + ret = -ENOMEM; + goto out; + } + bl->entries = n; + } + + bl->entries[bl->nr_entries++] = (struct blacklisted_node) { + .seq = b->data->keys.seq, + .btree_id = b->btree_id, + .pos = b->key.k.p, + }; +found_entry: + ret = 1; +out: + mutex_unlock(&j->blacklist_lock); + return ret; +} + +/* + * After reading the journal, find existing journal seq blacklist entries and + * read them into memory: + */ +int bch2_journal_seq_blacklist_read(struct journal *j, + struct journal_replay *i) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct jset_entry *entry; + struct journal_seq_blacklist *bl; + u64 seq; + + for_each_jset_entry_type(entry, &i->j, + JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) { + struct jset_entry_blacklist *bl_entry = + container_of(entry, struct jset_entry_blacklist, entry); + seq = le64_to_cpu(bl_entry->seq); + + bch_verbose(c, "blacklisting existing journal seq %llu", seq); + + bl = bch2_journal_seq_blacklisted_new(j, seq); + if (!bl) + return -ENOMEM; + + bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin, + journal_seq_blacklist_flush); + bl->written = true; + } + + return 0; +} + +/* + * After reading the journal and walking the btree, we might have new journal + * sequence numbers to blacklist - add entries to the next journal entry to be + * written: + */ +void bch2_journal_seq_blacklist_write(struct journal *j) +{ + struct journal_seq_blacklist *bl; + + list_for_each_entry(bl, &j->seq_blacklist, list) + if (!bl->written) { + bch2_journal_add_entry_noreservation(journal_cur_buf(j), + JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED, + 0, 0, &bl->seq, 1); + + bch2_journal_pin_add(j, + journal_cur_seq(j), + &bl->pin, + journal_seq_blacklist_flush); + bl->written = true; + } +} diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h new file mode 100644 index 000000000000..3b7e36afc99d --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -0,0 +1,11 @@ +#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H +#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H + +struct journal_seq_blacklist * +bch2_journal_seq_blacklist_find(struct journal *, u64); +int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *); +int bch2_journal_seq_blacklist_read(struct journal *, + struct journal_replay *); +void bch2_journal_seq_blacklist_write(struct journal *); + +#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index e39b18f27058..8a8059ee70db 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -171,7 +171,7 @@ struct journal { u64 front, back, size, mask; struct journal_entry_pin_list *data; } pin; - struct journal_entry_pin_list *replay_pin_list; + u64 replay_journal_seq; struct mutex blacklist_lock; struct list_head seq_blacklist; |