author    Kent Overstreet <kent.overstreet@gmail.com>  2018-05-02 13:56:25 -0400
committer Kent Overstreet <kent.overstreet@gmail.com>  2018-05-02 15:54:18 -0400
commit    92fb4f056560452c3b6302b33481e8fe8c2638cf
tree      fc50ea57867307a7e959e8ece31052a1e3d77b62
parent    9d60f71727b2b1b1f20bb2894fe3c0c07ac21957
bcachefs: Split out journal_seq_blacklist.c
-rw-r--r--  fs/bcachefs/Makefile                  1
-rw-r--r--  fs/bcachefs/btree_io.c                2
-rw-r--r--  fs/bcachefs/btree_update_leaf.c      12
-rw-r--r--  fs/bcachefs/journal.c               331
-rw-r--r--  fs/bcachefs/journal.h                50
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.c 317
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.h  11
-rw-r--r--  fs/bcachefs/journal_types.h           2
8 files changed, 395 insertions, 331 deletions
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index aff607ec7c8c..fea5f7b0a24d 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -30,6 +30,7 @@ bcachefs-y := \
inode.o \
io.o \
journal.o \
+ journal_seq_blacklist.o \
keylist.o \
lz4_decompress.o \
migrate.o \
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 42190f05a0c1..27b86dcd636f 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -13,7 +13,7 @@
#include "error.h"
#include "extents.h"
#include "io.h"
-#include "journal.h"
+#include "journal_seq_blacklist.h"
#include "super-io.h"
#include <trace/events/bcachefs.h>
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 53b39de52c6b..7935226f55fb 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -137,7 +137,7 @@ void bch2_btree_journal_key(struct btree_insert *trans,
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
- if (likely(trans->journal_res.ref)) {
+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
u64 seq = trans->journal_res.seq;
bool needs_whiteout = insert->k.needs_whiteout;
@@ -155,12 +155,16 @@ void bch2_btree_journal_key(struct btree_insert *trans,
btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
}
- if (unlikely(!journal_pin_active(&w->journal)))
- bch2_journal_pin_add(j, &trans->journal_res,
- &w->journal,
+ if (unlikely(!journal_pin_active(&w->journal))) {
+ u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+ ? trans->journal_res.seq
+ : j->replay_journal_seq;
+
+ bch2_journal_pin_add(j, seq, &w->journal,
btree_node_write_idx(b) == 0
? btree_node_flush0
: btree_node_flush1);
+ }
if (unlikely(!btree_node_dirty(b)))
set_btree_node_dirty(b);
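
For reference, a minimal userspace sketch of the pin-sequence selection this hunk introduces: during journal replay there is no journal reservation, so the pin sequence comes from j->replay_journal_seq rather than the reservation. The struct definitions below are simplified stand-ins for illustration, not the kernel types.

#include <stdint.h>
#include <stdio.h>

#define BTREE_INSERT_JOURNAL_REPLAY (1U << 0)

struct journal { uint64_t replay_journal_seq; };

struct btree_insert {
	unsigned flags;
	struct { uint64_t seq; } journal_res;
};

/* Replay has no journal reservation: pin against the entry being replayed */
static uint64_t pin_seq(const struct btree_insert *trans, const struct journal *j)
{
	return !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
		? trans->journal_res.seq
		: j->replay_journal_seq;
}

int main(void)
{
	struct journal j = { .replay_journal_seq = 42 };
	struct btree_insert normal = { .flags = 0, .journal_res = { .seq = 100 } };
	struct btree_insert replay = { .flags = BTREE_INSERT_JOURNAL_REPLAY };

	printf("normal: %llu\n", (unsigned long long)pin_seq(&normal, &j)); /* 100 */
	printf("replay: %llu\n", (unsigned long long)pin_seq(&replay, &j)); /* 42 */
	return 0;
}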
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index aaa7ffa5237f..837228d9d59c 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -10,8 +10,6 @@
#include "buckets.h"
#include "btree_gc.h"
#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
#include "checksum.h"
#include "debug.h"
#include "error.h"
@@ -19,6 +17,7 @@
#include "io.h"
#include "keylist.h"
#include "journal.h"
+#include "journal_seq_blacklist.h"
#include "replicas.h"
#include "super-io.h"
#include "vstructs.h"
@@ -27,10 +26,6 @@
static void journal_write(struct closure *);
static void journal_reclaim_fast(struct journal *);
-static void journal_pin_add_entry(struct journal *,
- struct journal_entry_pin_list *,
- struct journal_entry_pin *,
- journal_pin_flush_fn);
static inline void journal_wake(struct journal *j)
{
@@ -38,30 +33,6 @@ static inline void journal_wake(struct journal *j)
closure_wake_up(&j->async_wait);
}
-static inline struct journal_buf *journal_cur_buf(struct journal *j)
-{
- return j->buf + j->reservations.idx;
-}
-
-static inline struct journal_buf *journal_prev_buf(struct journal *j)
-{
- return j->buf + !j->reservations.idx;
-}
-
-/* Sequence number of oldest dirty journal entry */
-
-static inline u64 journal_last_seq(struct journal *j)
-{
- return j->pin.front;
-}
-
-static inline u64 journal_cur_seq(struct journal *j)
-{
- BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
-
- return j->pin.back - 1;
-}
-
static inline u64 journal_pin_seq(struct journal *j,
struct journal_entry_pin_list *pin_list)
{
@@ -80,18 +51,6 @@ u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin)
return ret;
}
-static inline void bch2_journal_add_entry_noreservation(struct journal_buf *buf,
- unsigned type, enum btree_id id,
- unsigned level,
- const void *data, size_t u64s)
-{
- struct jset *jset = buf->data;
-
- bch2_journal_add_entry_at(buf, le32_to_cpu(jset->u64s),
- type, id, level, data, u64s);
- le32_add_cpu(&jset->u64s, jset_u64s(u64s));
-}
-
static struct jset_entry *bch2_journal_find_entry(struct jset *j, unsigned type,
enum btree_id id)
{
@@ -132,216 +91,6 @@ static void bch2_journal_add_btree_root(struct journal_buf *buf,
k, k->k.u64s);
}
-static void journal_seq_blacklist_flush(struct journal *j,
- struct journal_entry_pin *pin, u64 seq)
-{
- struct bch_fs *c =
- container_of(j, struct bch_fs, journal);
- struct journal_seq_blacklist *bl =
- container_of(pin, struct journal_seq_blacklist, pin);
- struct blacklisted_node n;
- struct closure cl;
- unsigned i;
- int ret;
-
- closure_init_stack(&cl);
-
- for (i = 0;; i++) {
- struct btree_iter iter;
- struct btree *b;
-
- mutex_lock(&j->blacklist_lock);
- if (i >= bl->nr_entries) {
- mutex_unlock(&j->blacklist_lock);
- break;
- }
- n = bl->entries[i];
- mutex_unlock(&j->blacklist_lock);
-
- __bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0);
-
- b = bch2_btree_iter_peek_node(&iter);
-
- /* The node might have already been rewritten: */
-
- if (b->data->keys.seq == n.seq) {
- ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0);
- if (ret) {
- bch2_btree_iter_unlock(&iter);
- bch2_fs_fatal_error(c,
- "error %i rewriting btree node with blacklisted journal seq",
- ret);
- bch2_journal_halt(j);
- return;
- }
- }
-
- bch2_btree_iter_unlock(&iter);
- }
-
- for (i = 0;; i++) {
- struct btree_update *as;
- struct pending_btree_node_free *d;
-
- mutex_lock(&j->blacklist_lock);
- if (i >= bl->nr_entries) {
- mutex_unlock(&j->blacklist_lock);
- break;
- }
- n = bl->entries[i];
- mutex_unlock(&j->blacklist_lock);
-redo_wait:
- mutex_lock(&c->btree_interior_update_lock);
-
- /*
- * Is the node on the list of pending interior node updates -
- * being freed? If so, wait for that to finish:
- */
- for_each_pending_btree_node_free(c, as, d)
- if (n.seq == d->seq &&
- n.btree_id == d->btree_id &&
- !d->level &&
- !bkey_cmp(n.pos, d->key.k.p)) {
- closure_wait(&as->wait, &cl);
- mutex_unlock(&c->btree_interior_update_lock);
- closure_sync(&cl);
- goto redo_wait;
- }
-
- mutex_unlock(&c->btree_interior_update_lock);
- }
-
- mutex_lock(&j->blacklist_lock);
-
- bch2_journal_pin_drop(j, &bl->pin);
- list_del(&bl->list);
- kfree(bl->entries);
- kfree(bl);
-
- mutex_unlock(&j->blacklist_lock);
-}
-
-static struct journal_seq_blacklist *
-journal_seq_blacklist_find(struct journal *j, u64 seq)
-{
- struct journal_seq_blacklist *bl;
-
- lockdep_assert_held(&j->blacklist_lock);
-
- list_for_each_entry(bl, &j->seq_blacklist, list)
- if (seq == bl->seq)
- return bl;
-
- return NULL;
-}
-
-static struct journal_seq_blacklist *
-bch2_journal_seq_blacklisted_new(struct journal *j, u64 seq)
-{
- struct journal_seq_blacklist *bl;
-
- lockdep_assert_held(&j->blacklist_lock);
-
- /*
- * When we start the journal, bch2_journal_start() will skip over @seq:
- */
-
- bl = kzalloc(sizeof(*bl), GFP_KERNEL);
- if (!bl)
- return NULL;
-
- bl->seq = seq;
- list_add_tail(&bl->list, &j->seq_blacklist);
- return bl;
-}
-
-/*
- * Returns true if @seq is newer than the most recent journal entry that got
- * written, and data corresponding to @seq should be ignored - also marks @seq
- * as blacklisted so that on future restarts the corresponding data will still
- * be ignored:
- */
-int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
-{
- struct journal *j = &c->journal;
- struct journal_seq_blacklist *bl = NULL;
- struct blacklisted_node *n;
- u64 journal_seq, i;
- int ret = 0;
-
- if (!seq)
- return 0;
-
- spin_lock(&j->lock);
- journal_seq = journal_cur_seq(j);
- spin_unlock(&j->lock);
-
- /* Interior updates aren't journalled: */
- BUG_ON(b->level);
- BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags));
-
- /*
- * Decrease this back to j->seq + 2 when we next rev the on disk format:
- * increasing it temporarily to work around bug in old kernels
- */
- bch2_fs_inconsistent_on(seq > journal_seq + 4, c,
- "bset journal seq too far in the future: %llu > %llu",
- seq, journal_seq);
-
- if (seq <= journal_seq &&
- list_empty_careful(&j->seq_blacklist))
- return 0;
-
- mutex_lock(&j->blacklist_lock);
-
- if (seq <= journal_seq) {
- bl = journal_seq_blacklist_find(j, seq);
- if (!bl)
- goto out;
- } else {
- bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
- b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);
-
- for (i = journal_seq + 1; i <= seq; i++) {
- bl = journal_seq_blacklist_find(j, i) ?:
- bch2_journal_seq_blacklisted_new(j, i);
- if (!bl) {
- ret = -ENOMEM;
- goto out;
- }
- }
- }
-
- for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
- if (b->data->keys.seq == n->seq &&
- b->btree_id == n->btree_id &&
- !bkey_cmp(b->key.k.p, n->pos))
- goto found_entry;
-
- if (!bl->nr_entries ||
- is_power_of_2(bl->nr_entries)) {
- n = krealloc(bl->entries,
- max(bl->nr_entries * 2, 8UL) * sizeof(*n),
- GFP_KERNEL);
- if (!n) {
- ret = -ENOMEM;
- goto out;
- }
- bl->entries = n;
- }
-
- bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
- .seq = b->data->keys.seq,
- .btree_id = b->btree_id,
- .pos = b->key.k.p,
- };
-found_entry:
- ret = 1;
-out:
- mutex_unlock(&j->blacklist_lock);
- return ret;
-}
-
/*
* Journal replay/recovery:
*
@@ -947,35 +696,6 @@ void bch2_journal_entries_free(struct list_head *list)
}
}
-static int journal_seq_blacklist_read(struct journal *j,
- struct journal_replay *i,
- struct journal_entry_pin_list *p)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct jset_entry *entry;
- struct journal_seq_blacklist *bl;
- u64 seq;
-
- for_each_jset_entry_type(entry, &i->j,
- JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) {
- struct jset_entry_blacklist *bl_entry =
- container_of(entry, struct jset_entry_blacklist, entry);
- seq = le64_to_cpu(bl_entry->seq);
-
- bch_verbose(c, "blacklisting existing journal seq %llu", seq);
-
- bl = bch2_journal_seq_blacklisted_new(j, seq);
- if (!bl)
- return -ENOMEM;
-
- journal_pin_add_entry(j, p, &bl->pin,
- journal_seq_blacklist_flush);
- bl->written = true;
- }
-
- return 0;
-}
-
static inline bool journal_has_keys(struct list_head *list)
{
struct journal_replay *i;
@@ -1091,7 +811,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
atomic_set(&p->count, 1);
p->devs = i->devs;
- if (journal_seq_blacklist_read(j, i, p)) {
+ if (bch2_journal_seq_blacklist_read(j, i)) {
mutex_unlock(&j->blacklist_lock);
return -ENOMEM;
}
@@ -1110,10 +830,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
mutex_lock(&j->blacklist_lock);
while (cur_seq < le64_to_cpu(i->j.seq) &&
- journal_seq_blacklist_find(j, cur_seq))
+ bch2_journal_seq_blacklist_find(j, cur_seq))
cur_seq++;
- blacklisted = journal_seq_blacklist_find(j,
+ blacklisted = bch2_journal_seq_blacklist_find(j,
le64_to_cpu(i->j.seq));
mutex_unlock(&j->blacklist_lock);
@@ -1512,18 +1232,7 @@ void bch2_journal_start(struct bch_fs *c)
* disk for the next journal entry - this is ok, because these entries
* only have to go down with the next journal entry we write:
*/
- list_for_each_entry(bl, &j->seq_blacklist, list)
- if (!bl->written) {
- bch2_journal_add_entry_noreservation(journal_cur_buf(j),
- JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED,
- 0, 0, &bl->seq, 1);
-
- journal_pin_add_entry(j,
- &fifo_peek_back(&j->pin),
- &bl->pin,
- journal_seq_blacklist_flush);
- bl->written = true;
- }
+ bch2_journal_seq_blacklist_write(j);
queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
}
@@ -1531,14 +1240,15 @@ void bch2_journal_start(struct bch_fs *c)
int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
{
struct journal *j = &c->journal;
+ struct journal_entry_pin_list *pin_list;
struct bkey_i *k, *_n;
struct jset_entry *entry;
struct journal_replay *i, *n;
int ret = 0;
list_for_each_entry_safe(i, n, list, list) {
- j->replay_pin_list =
- journal_seq_pin(j, le64_to_cpu(i->j.seq));
+
+ j->replay_journal_seq = le64_to_cpu(i->j.seq);
for_each_jset_key(k, _n, entry, &i->j) {
@@ -1572,11 +1282,13 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
cond_resched();
}
- if (atomic_dec_and_test(&j->replay_pin_list->count))
+ pin_list = journal_seq_pin(j, j->replay_journal_seq);
+
+ if (atomic_dec_and_test(&pin_list->count))
journal_wake(j);
}
- j->replay_pin_list = NULL;
+ j->replay_journal_seq = 0;
bch2_journal_set_replay_done(j);
ret = bch2_journal_flush_all_pins(j);
@@ -1806,27 +1518,12 @@ static inline void __journal_pin_add(struct journal *j,
journal_wake(j);
}
-static void journal_pin_add_entry(struct journal *j,
- struct journal_entry_pin_list *pin_list,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock(&j->lock);
- __journal_pin_add(j, pin_list, pin, flush_fn);
- spin_unlock(&j->lock);
-}
-
-void bch2_journal_pin_add(struct journal *j,
- struct journal_res *res,
+void bch2_journal_pin_add(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
- struct journal_entry_pin_list *pin_list = res->ref
- ? journal_seq_pin(j, res->seq)
- : j->replay_pin_list;
-
spin_lock(&j->lock);
- __journal_pin_add(j, pin_list, pin, flush_fn);
+ __journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn);
spin_unlock(&j->lock);
}
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index cf5cc9ba008e..26bb787937fb 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -112,6 +112,32 @@
#include "journal_types.h"
+struct bch_fs;
+
+static inline struct journal_buf *journal_cur_buf(struct journal *j)
+{
+ return j->buf + j->reservations.idx;
+}
+
+static inline struct journal_buf *journal_prev_buf(struct journal *j)
+{
+ return j->buf + !j->reservations.idx;
+}
+
+/* Sequence number of oldest dirty journal entry */
+
+static inline u64 journal_last_seq(struct journal *j)
+{
+ return j->pin.front;
+}
+
+static inline u64 journal_cur_seq(struct journal *j)
+{
+ BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+
+ return j->pin.back - 1;
+}
+
/*
* Only used for holding the journal entries we read in btree_journal_read()
* during cache_registration
@@ -155,13 +181,15 @@ static inline bool journal_pin_active(struct journal_entry_pin *pin)
static inline struct journal_entry_pin_list *
journal_seq_pin(struct journal *j, u64 seq)
{
+ BUG_ON(seq < j->pin.front || seq >= j->pin.back);
+
return &j->pin.data[seq & j->pin.mask];
}
u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *);
-void bch2_journal_pin_add(struct journal *, struct journal_res *,
- struct journal_entry_pin *, journal_pin_flush_fn);
+void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
+ journal_pin_flush_fn);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
void bch2_journal_pin_add_if_older(struct journal *,
struct journal_entry_pin *,
@@ -170,15 +198,9 @@ void bch2_journal_pin_add_if_older(struct journal *,
int bch2_journal_flush_pins(struct journal *, u64);
int bch2_journal_flush_all_pins(struct journal *);
-struct closure;
-struct bch_fs;
-struct keylist;
-
struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *,
enum btree_id, unsigned *);
-int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
-
u64 bch2_inode_journal_seq(struct journal *, u64);
static inline int journal_state_count(union journal_res_state s, int idx)
@@ -230,6 +252,18 @@ static inline void bch2_journal_add_entry_at(struct journal_buf *buf,
memcpy_u64s(entry->_data, data, u64s);
}
+static inline void bch2_journal_add_entry_noreservation(struct journal_buf *buf,
+ unsigned type, enum btree_id id,
+ unsigned level,
+ const void *data, size_t u64s)
+{
+ struct jset *jset = buf->data;
+
+ bch2_journal_add_entry_at(buf, le32_to_cpu(jset->u64s),
+ type, id, level, data, u64s);
+ le32_add_cpu(&jset->u64s, jset_u64s(u64s));
+}
+
static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
unsigned type, enum btree_id id,
unsigned level,
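
The journal_seq_pin() helper above indexes the pin FIFO by masking: with a power-of-two ring, seq & mask equals seq % size, so any in-flight sequence number maps straight to its slot. A runnable sketch of that indexing follows; the field names mirror the patch, everything else is a stand-in.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct pin_fifo {
	uint64_t front, back;	/* dirty seqs occupy [front, back) */
	uint64_t size, mask;	/* size is a power of two, mask = size - 1 */
	int data[8];		/* stand-in for journal_entry_pin_list slots */
};

static int *seq_pin(struct pin_fifo *p, uint64_t seq)
{
	assert(seq >= p->front && seq < p->back);	/* mirrors the new BUG_ON */
	return &p->data[seq & p->mask];
}

int main(void)
{
	struct pin_fifo p = { .front = 13, .back = 17, .size = 8, .mask = 7 };

	/* seq 13 -> slot 5, seq 16 -> slot 0: the window wraps without division */
	printf("slot for 13: %ld\n", (long)(seq_pin(&p, 13) - p.data));
	printf("slot for 16: %ld\n", (long)(seq_pin(&p, 16) - p.data));
	return 0;
}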
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
new file mode 100644
index 000000000000..ba697763daff
--- /dev/null
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -0,0 +1,317 @@
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_seq_blacklist.h"
+
+/*
+ * journal_seq_blacklist machinery:
+ *
+ * To guarantee order of btree updates after a crash, we need to detect when a
+ * btree node entry (bset) is newer than the newest journal entry that was
+ * successfully written, and ignore it - effectively ignoring any btree updates
+ * that didn't make it into the journal.
+ *
+ * If we didn't do this, we might have two btree nodes, a and b, both with
+ * updates that weren't written to the journal yet: if b was updated after a,
+ * but b was flushed and not a - oops; on recovery we'll find that the updates
+ * to b happened, but not the updates to a that happened before it.
+ *
+ * Ignoring bsets that are newer than the newest journal entry is always safe,
+ * because everything they contain will also have been journalled - and must
+ * still be present in the journal on disk until a journal entry has been
+ * written _after_ that bset was written.
+ *
+ * To accomplish this, bsets record the newest journal sequence number they
+ * contain updates for; then, on startup, the btree code queries the journal
+ * code to ask "Is this sequence number newer than the newest journal entry? If
+ * so, ignore it."
+ *
+ * When this happens, we must blacklist that journal sequence number: the
+ * journal must not write any entries with that sequence number, and it must
+ * record that it was blacklisted so that a) on recovery we don't think we have
+ * missing journal entries and b) so that the btree code continues to ignore
+ * that bset, until that btree node is rewritten.
+ *
+ * Blacklisted journal sequence numbers are themselves recorded as entries in
+ * the journal.
+ */
+
+/*
+ * Called when journal needs to evict a blacklist entry to reclaim space: find
+ * any btree nodes that refer to the blacklisted journal sequence numbers, and
+ * rewrite them:
+ */
+static void journal_seq_blacklist_flush(struct journal *j,
+ struct journal_entry_pin *pin, u64 seq)
+{
+ struct bch_fs *c =
+ container_of(j, struct bch_fs, journal);
+ struct journal_seq_blacklist *bl =
+ container_of(pin, struct journal_seq_blacklist, pin);
+ struct blacklisted_node n;
+ struct closure cl;
+ unsigned i;
+ int ret;
+
+ closure_init_stack(&cl);
+
+ for (i = 0;; i++) {
+ struct btree_iter iter;
+ struct btree *b;
+
+ mutex_lock(&j->blacklist_lock);
+ if (i >= bl->nr_entries) {
+ mutex_unlock(&j->blacklist_lock);
+ break;
+ }
+ n = bl->entries[i];
+ mutex_unlock(&j->blacklist_lock);
+
+ __bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0);
+
+ b = bch2_btree_iter_peek_node(&iter);
+
+ /* The node might have already been rewritten: */
+
+ if (b->data->keys.seq == n.seq) {
+ ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0);
+ if (ret) {
+ bch2_btree_iter_unlock(&iter);
+ bch2_fs_fatal_error(c,
+ "error %i rewriting btree node with blacklisted journal seq",
+ ret);
+ bch2_journal_halt(j);
+ return;
+ }
+ }
+
+ bch2_btree_iter_unlock(&iter);
+ }
+
+ for (i = 0;; i++) {
+ struct btree_update *as;
+ struct pending_btree_node_free *d;
+
+ mutex_lock(&j->blacklist_lock);
+ if (i >= bl->nr_entries) {
+ mutex_unlock(&j->blacklist_lock);
+ break;
+ }
+ n = bl->entries[i];
+ mutex_unlock(&j->blacklist_lock);
+redo_wait:
+ mutex_lock(&c->btree_interior_update_lock);
+
+ /*
+ * Is the node on the list of pending interior node updates -
+ * being freed? If so, wait for that to finish:
+ */
+ for_each_pending_btree_node_free(c, as, d)
+ if (n.seq == d->seq &&
+ n.btree_id == d->btree_id &&
+ !d->level &&
+ !bkey_cmp(n.pos, d->key.k.p)) {
+ closure_wait(&as->wait, &cl);
+ mutex_unlock(&c->btree_interior_update_lock);
+ closure_sync(&cl);
+ goto redo_wait;
+ }
+
+ mutex_unlock(&c->btree_interior_update_lock);
+ }
+
+ mutex_lock(&j->blacklist_lock);
+
+ bch2_journal_pin_drop(j, &bl->pin);
+ list_del(&bl->list);
+ kfree(bl->entries);
+ kfree(bl);
+
+ mutex_unlock(&j->blacklist_lock);
+}
+
+/*
+ * Determine if a particular sequence number is blacklisted - if so, return
+ * blacklist entry:
+ */
+struct journal_seq_blacklist *
+bch2_journal_seq_blacklist_find(struct journal *j, u64 seq)
+{
+ struct journal_seq_blacklist *bl;
+
+ lockdep_assert_held(&j->blacklist_lock);
+
+ list_for_each_entry(bl, &j->seq_blacklist, list)
+ if (seq == bl->seq)
+ return bl;
+
+ return NULL;
+}
+
+/*
+ * Allocate a new, in memory blacklist entry:
+ */
+static struct journal_seq_blacklist *
+bch2_journal_seq_blacklisted_new(struct journal *j, u64 seq)
+{
+ struct journal_seq_blacklist *bl;
+
+ lockdep_assert_held(&j->blacklist_lock);
+
+ /*
+ * When we start the journal, bch2_journal_start() will skip over @seq:
+ */
+
+ bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+ if (!bl)
+ return NULL;
+
+ bl->seq = seq;
+ list_add_tail(&bl->list, &j->seq_blacklist);
+ return bl;
+}
+
+/*
+ * Returns true if @seq is newer than the most recent journal entry that got
+ * written, and data corresponding to @seq should be ignored - also marks @seq
+ * as blacklisted so that on future restarts the corresponding data will still
+ * be ignored:
+ */
+int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
+{
+ struct journal *j = &c->journal;
+ struct journal_seq_blacklist *bl = NULL;
+ struct blacklisted_node *n;
+ u64 journal_seq, i;
+ int ret = 0;
+
+ if (!seq)
+ return 0;
+
+ spin_lock(&j->lock);
+ journal_seq = journal_cur_seq(j);
+ spin_unlock(&j->lock);
+
+ /* Interior updates aren't journalled: */
+ BUG_ON(b->level);
+ BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags));
+
+ /*
+ * Decrease this back to j->seq + 2 when we next rev the on disk format:
+ * increasing it temporarily to work around bug in old kernels
+ */
+ bch2_fs_inconsistent_on(seq > journal_seq + 4, c,
+ "bset journal seq too far in the future: %llu > %llu",
+ seq, journal_seq);
+
+ if (seq <= journal_seq &&
+ list_empty_careful(&j->seq_blacklist))
+ return 0;
+
+ mutex_lock(&j->blacklist_lock);
+
+ if (seq <= journal_seq) {
+ bl = bch2_journal_seq_blacklist_find(j, seq);
+ if (!bl)
+ goto out;
+ } else {
+ bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
+ b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);
+
+ for (i = journal_seq + 1; i <= seq; i++) {
+ bl = bch2_journal_seq_blacklist_find(j, i) ?:
+ bch2_journal_seq_blacklisted_new(j, i);
+ if (!bl) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ }
+
+ for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
+ if (b->data->keys.seq == n->seq &&
+ b->btree_id == n->btree_id &&
+ !bkey_cmp(b->key.k.p, n->pos))
+ goto found_entry;
+
+ if (!bl->nr_entries ||
+ is_power_of_2(bl->nr_entries)) {
+ n = krealloc(bl->entries,
+ max(bl->nr_entries * 2, 8UL) * sizeof(*n),
+ GFP_KERNEL);
+ if (!n) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ bl->entries = n;
+ }
+
+ bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
+ .seq = b->data->keys.seq,
+ .btree_id = b->btree_id,
+ .pos = b->key.k.p,
+ };
+found_entry:
+ ret = 1;
+out:
+ mutex_unlock(&j->blacklist_lock);
+ return ret;
+}
+
+/*
+ * After reading the journal, find existing journal seq blacklist entries and
+ * read them into memory:
+ */
+int bch2_journal_seq_blacklist_read(struct journal *j,
+ struct journal_replay *i)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct jset_entry *entry;
+ struct journal_seq_blacklist *bl;
+ u64 seq;
+
+ for_each_jset_entry_type(entry, &i->j,
+ JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) {
+ struct jset_entry_blacklist *bl_entry =
+ container_of(entry, struct jset_entry_blacklist, entry);
+ seq = le64_to_cpu(bl_entry->seq);
+
+ bch_verbose(c, "blacklisting existing journal seq %llu", seq);
+
+ bl = bch2_journal_seq_blacklisted_new(j, seq);
+ if (!bl)
+ return -ENOMEM;
+
+ bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin,
+ journal_seq_blacklist_flush);
+ bl->written = true;
+ }
+
+ return 0;
+}
+
+/*
+ * After reading the journal and walking the btree, we might have new journal
+ * sequence numbers to blacklist - add entries to the next journal entry to be
+ * written:
+ */
+void bch2_journal_seq_blacklist_write(struct journal *j)
+{
+ struct journal_seq_blacklist *bl;
+
+ list_for_each_entry(bl, &j->seq_blacklist, list)
+ if (!bl->written) {
+ bch2_journal_add_entry_noreservation(journal_cur_buf(j),
+ JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED,
+ 0, 0, &bl->seq, 1);
+
+ bch2_journal_pin_add(j,
+ journal_cur_seq(j),
+ &bl->pin,
+ journal_seq_blacklist_flush);
+ bl->written = true;
+ }
+}
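
To make the machinery comment at the top of this new file concrete, here is a userspace model of the should-ignore decision: a bset seq at or below the newest written journal seq is ignored only if already blacklisted; anything newer gets blacklisted (along with every seq in between) and ignored. Array set, no locking, hypothetical names — a sketch of the idea, not the kernel logic verbatim.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_BL 64

struct model_journal {
	uint64_t cur_seq;		/* newest journal entry written */
	uint64_t blacklist[MAX_BL];
	unsigned nr;
};

static bool seq_blacklisted(const struct model_journal *j, uint64_t seq)
{
	for (unsigned i = 0; i < j->nr; i++)
		if (j->blacklist[i] == seq)
			return true;
	return false;
}

/* Returns true if a bset with journal seq @seq should be ignored on recovery */
static bool seq_should_ignore(struct model_journal *j, uint64_t seq)
{
	if (seq <= j->cur_seq)
		return seq_blacklisted(j, seq);

	/* Newer than anything journalled: blacklist every seq up to @seq */
	for (uint64_t i = j->cur_seq + 1; i <= seq && j->nr < MAX_BL; i++)
		if (!seq_blacklisted(j, i))
			j->blacklist[j->nr++] = i;
	return true;
}

int main(void)
{
	struct model_journal j = { .cur_seq = 10 };

	printf("%d\n", seq_should_ignore(&j, 9));	/* 0: journalled, not blacklisted */
	printf("%d\n", seq_should_ignore(&j, 12));	/* 1: future seq, blacklists 11 and 12 */
	printf("%d\n", seq_should_ignore(&j, 11));	/* 1: already blacklisted */
	return 0;
}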
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
new file mode 100644
index 000000000000..3b7e36afc99d
--- /dev/null
+++ b/fs/bcachefs/journal_seq_blacklist.h
@@ -0,0 +1,11 @@
+#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
+#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
+
+struct journal_seq_blacklist *
+bch2_journal_seq_blacklist_find(struct journal *, u64);
+int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
+int bch2_journal_seq_blacklist_read(struct journal *,
+ struct journal_replay *);
+void bch2_journal_seq_blacklist_write(struct journal *);
+
+#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index e39b18f27058..8a8059ee70db 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -171,7 +171,7 @@ struct journal {
u64 front, back, size, mask;
struct journal_entry_pin_list *data;
} pin;
- struct journal_entry_pin_list *replay_pin_list;
+ u64 replay_journal_seq;
struct mutex blacklist_lock;
struct list_head seq_blacklist;
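
With replay_pin_list gone, replay keeps only the u64 sequence currently being replayed and resolves the pin list from it on demand (see bch2_journal_replay above). A compact sketch of that loop shape — stand-in types, no locking, names hypothetical:

#include <stdint.h>
#include <stdio.h>

struct pin_list { int count; };

struct journal {
	uint64_t mask;			/* pin FIFO mask; size is a power of two */
	struct pin_list pins[8];
	uint64_t replay_journal_seq;
};

static struct pin_list *seq_pin(struct journal *j, uint64_t seq)
{
	return &j->pins[seq & j->mask];
}

static void replay_entry(struct journal *j, uint64_t seq)
{
	j->replay_journal_seq = seq;
	/* ... replay each key, pins added against j->replay_journal_seq ... */
	if (--seq_pin(j, seq)->count == 0)
		printf("seq %llu fully replayed\n", (unsigned long long)seq);
}

int main(void)
{
	struct journal j = { .mask = 7 };

	for (int i = 0; i < 8; i++)
		j.pins[i].count = 1;

	replay_entry(&j, 3);
	replay_entry(&j, 4);
	j.replay_journal_seq = 0;	/* reset at end of replay, as in the patch */
	return 0;
}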