author    Kent Overstreet <kent.overstreet@gmail.com>    2017-04-10 21:19:15 -0800
committer Kent Overstreet <kent.overstreet@gmail.com>    2017-04-10 21:37:18 -0800
commit    03bc9d71b13e6f8e879894f93ea16f1f4a8280c9 (patch)
tree      758cc02fefb0fd507aa04efa04a1f0e68bbed1b8
parent    e394bd4ba3934cea237ad699cae9fe86396d6f15 (diff)
Update bcachefs sources to 3b4024f944
-rw-r--r--   .bcachefs_revision            |    2
-rw-r--r--   libbcachefs/bcachefs.h        |    2
-rw-r--r--   libbcachefs/btree_io.c        |   77
-rw-r--r--   libbcachefs/btree_io.h        |    4
-rw-r--r--   libbcachefs/btree_update.c    |   95
-rw-r--r--   libbcachefs/buckets.h         |   15
-rw-r--r--   libbcachefs/chardev.c         |    6
-rw-r--r--   libbcachefs/io.c              |    2
-rw-r--r--   libbcachefs/journal.c         |  129
-rw-r--r--   libbcachefs/journal.h         |    3
-rw-r--r--   libbcachefs/journal_types.h   |    1
-rw-r--r--   libbcachefs/super.c           |    9
-rw-r--r--   libbcachefs/sysfs.c           |  249
13 files changed, 224 insertions(+), 370 deletions(-)
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 35e8c14b..9a3f6873 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-da037866e669b09edc6b049ce09535d3456474cb
+3b4024f94489e4d8dc8eb7f1278754a2545f8026
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index cf1c4bd6..c170e85a 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -754,7 +754,7 @@ struct bch_fs {
unsigned bucket_journal_seq;
/* The rest of this all shows up in sysfs */
- atomic_long_t cache_read_races;
+ atomic_long_t read_realloc_races;
unsigned foreground_write_ratelimit_enabled:1;
unsigned copy_gc_enabled:1;
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index b56b1735..8152dc4b 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1630,82 +1630,19 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
}
}
-/*
- * Write all dirty btree nodes to disk, including roots
- */
-void bch2_btree_flush(struct bch_fs *c)
+void bch2_btree_verify_flushed(struct bch_fs *c)
{
- struct closure cl;
- struct btree *b;
struct bucket_table *tbl;
struct rhash_head *pos;
- bool saw_dirty;
+ struct btree *b;
unsigned i;
- closure_init_stack(&cl);
-
rcu_read_lock();
+ tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
+ &c->btree_cache_table);
- do {
- saw_dirty = false;
- i = 0;
-restart:
- tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
- &c->btree_cache_table);
-
- for (; i < tbl->size; i++)
- rht_for_each_entry_rcu(b, pos, tbl, i, hash) {
- saw_dirty |= btree_node_dirty(b);
-
- if (btree_node_dirty(b) &&
- btree_node_may_write(b)) {
- rcu_read_unlock();
- six_lock_read(&b->lock);
- bch2_btree_node_write_dirty(c, b, &cl, 1);
- six_unlock_read(&b->lock);
- rcu_read_lock();
- goto restart;
- }
- }
- } while (saw_dirty);
-
+ for (i = 0; i < tbl->size; i++)
+ rht_for_each_entry_rcu(b, pos, tbl, i, hash)
+ BUG_ON(btree_node_dirty(b));
rcu_read_unlock();
-
- closure_sync(&cl);
-}
-
-/**
- * bch_btree_node_flush_journal - flush any journal entries that contain keys
- * from this node
- *
- * The bset's journal sequence number is used for preserving ordering of index
- * updates across unclean shutdowns - it's used to ignore bsets newer than the
- * most recent journal entry.
- *
- * But when rewriting btree nodes we compact all the bsets in a btree node - and
- * if we compacted a bset that should be ignored with bsets we do need, that
- * would be bad. So to avoid that, prior to making the new node visible ensure
- * that the journal has been flushed so that all the bsets we compacted should
- * be visible.
- */
-void bch2_btree_node_flush_journal_entries(struct bch_fs *c,
- struct btree *b,
- struct closure *cl)
-{
- int i = b->nsets;
-
- /*
- * Journal sequence numbers in the different bsets will always be in
- * ascending order, we only need to flush the highest - except that the
- * most recent bset might not have a journal sequence number yet, so we
- * need to loop:
- */
- while (i--) {
- u64 seq = le64_to_cpu(bset(b, &b->set[i])->journal_seq);
-
- if (seq) {
- bch2_journal_flush_seq_async(&c->journal, seq, cl);
- break;
- }
- }
}
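
The removed bch2_btree_flush() looped over the btree cache writing out dirty nodes itself; after this change, writeback is driven entirely by flushing journal pins, and bch2_btree_verify_flushed() only asserts that nothing dirty remains. A minimal sketch of the intended call sequence, based on the super.c hunk later in this diff (the wrapper name here is made up for illustration):

/*
 * Sketch of the shutdown-time ordering this commit moves to (see the
 * super.c hunk below); illustrative only, not a copy of the real code:
 */
static void read_only_flush_example(struct bch_fs *c)
{
	/* Flushing every journal pin forces all dirty btree nodes out: */
	bch2_journal_flush_pins(&c->journal, U64_MAX);

	/* ...so afterwards, barring journal errors, nothing may be dirty: */
	if (!bch2_journal_error(&c->journal))
		bch2_btree_verify_flushed(c);
}
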
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 84731144..3014b5f0 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -94,8 +94,6 @@ do { \
} \
} while (0)
-void bch2_btree_flush(struct bch_fs *);
-void bch2_btree_node_flush_journal_entries(struct bch_fs *, struct btree *,
- struct closure *);
+void bch2_btree_verify_flushed(struct bch_fs *);
#endif /* _BCACHE_BTREE_IO_H */
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
index cdbc0de4..196b7423 100644
--- a/libbcachefs/btree_update.c
+++ b/libbcachefs/btree_update.c
@@ -161,15 +161,14 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
{
trace_btree_node_free(c, b);
+ BUG_ON(btree_node_dirty(b));
BUG_ON(b == btree_node_root(c, b));
BUG_ON(b->ob);
BUG_ON(!list_empty(&b->write_blocked));
- six_lock_write(&b->lock);
+ clear_btree_node_noevict(b);
- if (btree_node_dirty(b))
- bch2_btree_complete_write(c, b, btree_current_write(b));
- clear_btree_node_dirty(b);
+ six_lock_write(&b->lock);
bch2_btree_node_hash_remove(c, b);
@@ -192,6 +191,8 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
b->ob = NULL;
+ clear_btree_node_dirty(b);
+
__btree_node_free(c, b, NULL);
bch2_open_bucket_put(c, ob);
@@ -890,7 +891,8 @@ bch2_btree_interior_update_alloc(struct bch_fs *c)
static void btree_interior_update_free(struct closure *cl)
{
- struct btree_interior_update *as = container_of(cl, struct btree_interior_update, cl);
+ struct btree_interior_update *as =
+ container_of(cl, struct btree_interior_update, cl);
mempool_free(as, &as->c->btree_interior_update_pool);
}
@@ -910,9 +912,6 @@ static void btree_interior_update_nodes_reachable(struct closure *cl)
bch2_btree_node_free_ondisk(c, &as->pending[i]);
as->nr_pending = 0;
- mutex_unlock(&c->btree_interior_update_lock);
-
- mutex_lock(&c->btree_interior_update_lock);
list_del(&as->list);
mutex_unlock(&c->btree_interior_update_lock);
@@ -1039,6 +1038,15 @@ static void btree_interior_update_updated_btree(struct bch_fs *c,
system_freezable_wq);
}
+static void btree_interior_update_reparent(struct btree_interior_update *as,
+ struct btree_interior_update *child)
+{
+ child->b = NULL;
+ child->mode = BTREE_INTERIOR_UPDATING_AS;
+ child->parent_as = as;
+ closure_get(&as->cl);
+}
+
static void btree_interior_update_updated_root(struct bch_fs *c,
struct btree_interior_update *as,
enum btree_id btree_id)
@@ -1053,14 +1061,8 @@ static void btree_interior_update_updated_root(struct bch_fs *c,
* Old root might not be persistent yet - if so, redirect its
* btree_interior_update operation to point to us:
*/
- if (r->as) {
- BUG_ON(r->as->mode != BTREE_INTERIOR_UPDATING_ROOT);
-
- r->as->b = NULL;
- r->as->mode = BTREE_INTERIOR_UPDATING_AS;
- r->as->parent_as = as;
- closure_get(&as->cl);
- }
+ if (r->as)
+ btree_interior_update_reparent(as, r->as);
as->mode = BTREE_INTERIOR_UPDATING_ROOT;
as->b = r->b;
@@ -1068,8 +1070,6 @@ static void btree_interior_update_updated_root(struct bch_fs *c,
mutex_unlock(&c->btree_interior_update_lock);
- bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
-
continue_at(&as->cl, btree_interior_update_nodes_written,
system_freezable_wq);
}
@@ -1092,8 +1092,10 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
struct btree_interior_update *as,
struct btree *b)
{
+ struct closure *cl, *cl_n;
struct btree_interior_update *p, *n;
struct pending_btree_node_free *d;
+ struct btree_write *w;
struct bset_tree *t;
/*
@@ -1107,23 +1109,18 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
for_each_bset(b, t)
as->journal_seq = max(as->journal_seq, bset(b, t)->journal_seq);
- /*
- * Does this node have unwritten data that has a pin on the journal?
- *
- * If so, transfer that pin to the btree_interior_update operation -
- * note that if we're freeing multiple nodes, we only need to keep the
- * oldest pin of any of the nodes we're freeing. We'll release the pin
- * when the new nodes are persistent and reachable on disk:
- */
- bch2_journal_pin_add_if_older(&c->journal,
- &b->writes[0].journal,
- &as->journal, interior_update_flush);
- bch2_journal_pin_add_if_older(&c->journal,
- &b->writes[1].journal,
- &as->journal, interior_update_flush);
-
mutex_lock(&c->btree_interior_update_lock);
+ /* Add this node to the list of nodes being freed: */
+ BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
+
+ d = &as->pending[as->nr_pending++];
+ d->index_update_done = false;
+ d->seq = b->data->keys.seq;
+ d->btree_id = b->btree_id;
+ d->level = b->level;
+ bkey_copy(&d->key, &b->key);
+
/*
* Does this node have any btree_interior_update operations preventing
* it from being written?
@@ -1133,24 +1130,28 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
* operations complete
*/
list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
- BUG_ON(p->mode != BTREE_INTERIOR_UPDATING_NODE);
-
- p->mode = BTREE_INTERIOR_UPDATING_AS;
list_del(&p->write_blocked_list);
- p->b = NULL;
- p->parent_as = as;
- closure_get(&as->cl);
+ btree_interior_update_reparent(as, p);
}
- /* Add this node to the list of nodes being freed: */
- BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
+ clear_btree_node_dirty(b);
+ w = btree_current_write(b);
+
+ llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
+ llist_add(&cl->list, &as->wait.list);
+
+ /*
+ * Does this node have unwritten data that has a pin on the journal?
+ *
+ * If so, transfer that pin to the btree_interior_update operation -
+ * note that if we're freeing multiple nodes, we only need to keep the
+ * oldest pin of any of the nodes we're freeing. We'll release the pin
+ * when the new nodes are persistent and reachable on disk:
+ */
+ bch2_journal_pin_add_if_older(&c->journal, &w->journal,
+ &as->journal, interior_update_flush);
+ bch2_journal_pin_drop(&c->journal, &w->journal);
- d = &as->pending[as->nr_pending++];
- d->index_update_done = false;
- d->seq = b->data->keys.seq;
- d->btree_id = b->btree_id;
- d->level = b->level;
- bkey_copy(&d->key, &b->key);
mutex_unlock(&c->btree_interior_update_lock);
}
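
The reordered hunk above consolidates the journal-pin and waiter hand-off in bch2_btree_interior_update_will_free_node(): the node's in-flight write state is transferred to the btree_interior_update, keeping only the oldest pin across all nodes being freed. A condensed sketch of just that step, using only calls that appear in the hunk (locking and the pending-free bookkeeping are omitted, and the function name is invented):

/*
 * Condensed sketch of the pin/waiter hand-off above; illustrative only.
 */
static void transfer_write_state_example(struct bch_fs *c,
					 struct btree_interior_update *as,
					 struct btree *b)
{
	struct btree_write *w = btree_current_write(b);
	struct closure *cl, *cl_n;

	clear_btree_node_dirty(b);

	/* Waiters on this node's write now wait on the interior update: */
	llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
		llist_add(&cl->list, &as->wait.list);

	/* Keep only the oldest journal pin, now owned by the update: */
	bch2_journal_pin_add_if_older(&c->journal, &w->journal,
				      &as->journal, interior_update_flush);
	bch2_journal_pin_drop(&c->journal, &w->journal);
}
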
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 5303cdc7..3b82d7f3 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -199,21 +199,6 @@ static inline u64 bch2_fs_sectors_used(struct bch_fs *c)
return min(c->capacity, __bch2_fs_sectors_used(c));
}
-/* XXX: kill? */
-static inline u64 sectors_available(struct bch_fs *c)
-{
- struct bch_dev *ca;
- unsigned i;
- u64 ret = 0;
-
- rcu_read_lock();
- for_each_member_device_rcu(ca, c, i)
- ret += dev_buckets_available(ca) << ca->bucket_bits;
- rcu_read_unlock();
-
- return ret;
-}
-
static inline bool is_available_bucket(struct bucket_mark mark)
{
return (!mark.owned_by_allocator &&
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 2d20061d..694fcd2d 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -37,10 +37,10 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
path = strndup_user((const char __user *)
(unsigned long) dev, PATH_MAX);
- if (!path)
- return ERR_PTR(-ENOMEM);
+ if (IS_ERR(path))
+ return ERR_CAST(path);
- bdev = lookup_bdev(strim(path));
+ bdev = lookup_bdev(path);
kfree(path);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
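
The chardev.c change fixes the error check: strndup_user() never returns NULL; on failure it returns an ERR_PTR() (for example -EFAULT or -ENOMEM), so the old !path test could never fire and an error pointer could have reached lookup_bdev(). A minimal sketch of the corrected pattern as used in the hunk above (the function name is illustrative, not part of the driver):

/*
 * Sketch of the corrected strndup_user() handling; illustrative only.
 */
static struct block_device *lookup_user_bdev_example(u64 dev)
{
	struct block_device *bdev;
	char *path;

	path = strndup_user((const char __user *)(unsigned long) dev, PATH_MAX);
	if (IS_ERR(path))
		return ERR_CAST(path);	/* e.g. -EFAULT or -ENOMEM */

	bdev = lookup_bdev(path);
	kfree(path);
	return bdev;			/* may itself be an ERR_PTR() */
}
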
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 0a64f35d..039dd044 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -1046,7 +1046,7 @@ static void bch2_read_endio(struct bio *bio)
if (rbio->ptr.cached &&
(((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
ptr_stale(rbio->ca, &rbio->ptr))) {
- atomic_long_inc(&c->cache_read_races);
+ atomic_long_inc(&c->read_realloc_races);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
bch2_rbio_retry(c, rbio);
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 9e290618..f6203f1e 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -180,8 +180,10 @@ redo_peek:
ret == -EINTR)
goto redo_peek;
- /* -EROFS or perhaps -ENOSPC - bail out: */
- /* XXX warn here */
+ bch2_fs_fatal_error(c,
+ "error %i rewriting btree node with blacklisted journal seq",
+ ret);
+ bch2_journal_halt(j);
return;
}
}
@@ -1018,6 +1020,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
fifo_for_each_entry_ptr(p, &j->pin, iter) {
INIT_LIST_HEAD(&p->list);
+ INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, 0);
}
@@ -1147,6 +1150,7 @@ static void __journal_entry_new(struct journal *j, int count)
&fifo_peek_back(&j->pin));
INIT_LIST_HEAD(&p->list);
+ INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, count);
}
@@ -1516,7 +1520,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
j->replay_pin_list = NULL;
if (did_replay) {
- bch2_btree_flush(c);
+ bch2_journal_flush_pins(&c->journal, U64_MAX);
/*
* Write a new journal entry _before_ we start journalling new data -
@@ -1859,7 +1863,7 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
struct journal_entry_pin, list);
if (ret) {
/* must be list_del_init(), see bch2_journal_pin_drop() */
- list_del_init(&ret->list);
+ list_move(&ret->list, &pin_list->flushed);
*seq = journal_pin_seq(j, pin_list);
break;
}
@@ -1869,28 +1873,32 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
return ret;
}
-static bool journal_has_pins(struct journal *j)
+static bool journal_flush_done(struct journal *j, u64 seq_to_flush)
{
bool ret;
spin_lock(&j->lock);
journal_reclaim_fast(j);
- ret = fifo_used(&j->pin) > 1 ||
- atomic_read(&fifo_peek_front(&j->pin).count) > 1;
+
+ ret = (fifo_used(&j->pin) == 1 &&
+ atomic_read(&fifo_peek_front(&j->pin).count) == 1) ||
+ last_seq(j) > seq_to_flush;
spin_unlock(&j->lock);
return ret;
}
-void bch2_journal_flush_pins(struct journal *j)
+void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
struct journal_entry_pin *pin;
- u64 seq;
+ u64 pin_seq;
- while ((pin = journal_get_next_pin(j, U64_MAX, &seq)))
- pin->flush(j, pin, seq);
+ while ((pin = journal_get_next_pin(j, seq_to_flush, &pin_seq)))
+ pin->flush(j, pin, pin_seq);
- wait_event(j->wait, !journal_has_pins(j) || bch2_journal_error(j));
+ wait_event(j->wait,
+ journal_flush_done(j, seq_to_flush) ||
+ bch2_journal_error(j));
}
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
@@ -2174,9 +2182,18 @@ static void journal_write_done(struct closure *cl)
struct journal *j = container_of(cl, struct journal, io);
struct journal_buf *w = journal_prev_buf(j);
+ __bch2_time_stats_update(j->write_time, j->write_start_time);
+
j->last_seq_ondisk = le64_to_cpu(w->data->last_seq);
- __bch2_time_stats_update(j->write_time, j->write_start_time);
+ /*
+ * Updating last_seq_ondisk may let journal_reclaim_work() discard more
+ * buckets:
+ *
+ * Must come before signaling write completion, for
+ * bch2_fs_journal_stop():
+ */
+ mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
BUG_ON(!j->reservations.prev_buf_unwritten);
atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
@@ -2199,12 +2216,6 @@ static void journal_write_done(struct closure *cl)
closure_wake_up(&w->wait);
wake_up(&j->wait);
-
- /*
- * Updating last_seq_ondisk may let journal_reclaim_work() discard more
- * buckets:
- */
- mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
}
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
@@ -2345,8 +2356,12 @@ static void journal_write_work(struct work_struct *work)
struct journal *j = container_of(to_delayed_work(work),
struct journal, write_work);
spin_lock(&j->lock);
- set_bit(JOURNAL_NEED_WRITE, &j->flags);
+ if (!journal_entry_is_open(j)) {
+ spin_unlock(&j->lock);
+ return;
+ }
+ set_bit(JOURNAL_NEED_WRITE, &j->flags);
if (journal_buf_switch(j, false) != JOURNAL_UNLOCKED)
spin_unlock(&j->lock);
}
@@ -2505,6 +2520,8 @@ void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent
void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent)
{
+ struct journal_buf *buf;
+
spin_lock(&j->lock);
BUG_ON(seq > atomic64_read(&j->seq));
@@ -2517,8 +2534,9 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
if (seq == atomic64_read(&j->seq)) {
bool set_need_write = false;
- if (parent &&
- !closure_wait(&journal_cur_buf(j)->wait, parent))
+ buf = journal_cur_buf(j);
+
+ if (parent && !closure_wait(&buf->wait, parent))
BUG();
if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) {
@@ -2529,7 +2547,7 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
switch (journal_buf_switch(j, set_need_write)) {
case JOURNAL_ENTRY_ERROR:
if (parent)
- closure_wake_up(&journal_cur_buf(j)->wait);
+ closure_wake_up(&buf->wait);
break;
case JOURNAL_ENTRY_CLOSED:
/*
@@ -2545,7 +2563,9 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
} else if (parent &&
seq + 1 == atomic64_read(&j->seq) &&
j->reservations.prev_buf_unwritten) {
- if (!closure_wait(&journal_prev_buf(j)->wait, parent))
+ buf = journal_prev_buf(j);
+
+ if (!closure_wait(&buf->wait, parent))
BUG();
smp_mb();
@@ -2553,7 +2573,7 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
/* check if raced with write completion (or failure) */
if (!j->reservations.prev_buf_unwritten ||
bch2_journal_error(j))
- closure_wake_up(&journal_prev_buf(j)->wait);
+ closure_wake_up(&buf->wait);
}
spin_unlock(&j->lock);
@@ -2698,6 +2718,39 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
return ret;
}
+ssize_t bch2_journal_print_pins(struct journal *j, char *buf)
+{
+ struct journal_entry_pin_list *pin_list;
+ struct journal_entry_pin *pin;
+ ssize_t ret = 0;
+ unsigned i;
+
+ spin_lock_irq(&j->pin_lock);
+ fifo_for_each_entry_ptr(pin_list, &j->pin, i) {
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "%llu: count %u\n",
+ journal_pin_seq(j, pin_list),
+ atomic_read(&pin_list->count));
+
+ list_for_each_entry(pin, &pin_list->list, list)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "\t%p %pf\n",
+ pin, pin->flush);
+
+ if (!list_empty(&pin_list->flushed))
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "flushed:\n");
+
+ list_for_each_entry(pin, &pin_list->flushed, list)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "\t%p %pf\n",
+ pin, pin->flush);
+ }
+ spin_unlock_irq(&j->pin_lock);
+
+ return ret;
+}
+
static bool bch2_journal_writing_to_device(struct bch_dev *ca)
{
struct journal *j = &ca->fs->journal;
@@ -2725,12 +2778,11 @@ static bool bch2_journal_writing_to_device(struct bch_dev *ca)
int bch2_journal_move(struct bch_dev *ca)
{
- u64 last_flushed_seq;
struct journal_device *ja = &ca->journal;
- struct bch_fs *c = ca->fs;
- struct journal *j = &c->journal;
+ struct journal *j = &ca->fs->journal;
+ u64 seq_to_flush = 0;
unsigned i;
- int ret = 0; /* Success */
+ int ret;
if (bch2_journal_writing_to_device(ca)) {
/*
@@ -2744,16 +2796,10 @@ int bch2_journal_move(struct bch_dev *ca)
BUG_ON(bch2_journal_writing_to_device(ca));
}
- /*
- * Flush all btree updates to backing store so that any
- * journal entries written to ca become stale and are no
- * longer needed.
- */
+ for (i = 0; i < ja->nr; i++)
+ seq_to_flush = max(seq_to_flush, ja->bucket_seq[i]);
- /*
- * XXX: switch to normal journal reclaim machinery
- */
- bch2_btree_flush(c);
+ bch2_journal_flush_pins(j, seq_to_flush);
/*
* Force a meta-data journal entry to be written so that
@@ -2767,12 +2813,9 @@ int bch2_journal_move(struct bch_dev *ca)
* the device
*/
spin_lock(&j->lock);
- last_flushed_seq = last_seq(j);
+ ret = j->last_seq_ondisk > seq_to_flush ? 0 : -EIO;
spin_unlock(&j->lock);
- for (i = 0; i < ja->nr; i += 1)
- BUG_ON(ja->bucket_seq[i] > last_flushed_seq);
-
return ret;
}
@@ -2786,7 +2829,7 @@ void bch2_fs_journal_stop(struct journal *j)
* journal entries, then force a brand new empty journal entry to be
* written:
*/
- bch2_journal_flush_pins(j);
+ bch2_journal_flush_pins(j, U64_MAX);
bch2_journal_flush_async(j, NULL);
bch2_journal_meta(j);
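
bch2_journal_flush_pins() now takes an upper sequence bound: callers that used to rely on bch2_btree_flush() pass U64_MAX to flush everything, while bch2_journal_move() flushes only up to the highest sequence recorded in the device's journal buckets. A sketch of the two call patterns drawn from the hunks above (the example_ function names are invented for illustration):

/*
 * Sketch of the two uses of the reworked flush interface; illustrative only.
 */
static void example_flush_everything(struct bch_fs *c)
{
	/* Replaces the old bch2_btree_flush(): flush every pinned entry. */
	bch2_journal_flush_pins(&c->journal, U64_MAX);
}

static void example_flush_device(struct bch_dev *ca)
{
	struct journal_device *ja = &ca->journal;
	struct journal *j = &ca->fs->journal;
	u64 seq_to_flush = 0;
	unsigned i;

	/* Only entries still pinning this device's buckets need flushing: */
	for (i = 0; i < ja->nr; i++)
		seq_to_flush = max(seq_to_flush, ja->bucket_seq[i]);

	bch2_journal_flush_pins(j, seq_to_flush);
}
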
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 9ad82c60..d0dd0d33 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -141,7 +141,7 @@ void bch2_journal_pin_add_if_older(struct journal *,
struct journal_entry_pin *,
struct journal_entry_pin *,
journal_pin_flush_fn);
-void bch2_journal_flush_pins(struct journal *);
+void bch2_journal_flush_pins(struct journal *, u64);
struct closure;
struct bch_fs;
@@ -354,6 +354,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
}
ssize_t bch2_journal_print_debug(struct journal *, char *);
+ssize_t bch2_journal_print_pins(struct journal *, char *);
int bch2_dev_journal_alloc(struct bch_dev *);
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 75712aed..4b01b14a 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -38,6 +38,7 @@ struct journal_buf {
struct journal_entry_pin_list {
struct list_head list;
+ struct list_head flushed;
atomic_t count;
};
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 6cbfc801..f5ee2de3 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -211,7 +211,14 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_gc_thread_stop(c);
- bch2_btree_flush(c);
+ /*
+ * Flush journal before stopping allocators, because flushing journal
+ * blacklist entries involves allocating new btree nodes:
+ */
+ bch2_journal_flush_pins(&c->journal, U64_MAX);
+
+ if (!bch2_journal_error(&c->journal))
+ bch2_btree_verify_flushed(c);
for_each_member_device(ca, c, i)
bch2_dev_allocator_stop(ca);
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 808b3089..ba04bbad 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -120,6 +120,7 @@ do { \
return strtoi_h(buf, &var) ?: (ssize_t) size; \
} while (0)
+write_attribute(trigger_journal_flush);
write_attribute(trigger_btree_coalesce);
write_attribute(trigger_gc);
write_attribute(prune_cache);
@@ -127,35 +128,25 @@ write_attribute(prune_cache);
read_attribute(uuid);
read_attribute(minor);
read_attribute(bucket_size);
-read_attribute(bucket_size_bytes);
read_attribute(block_size);
-read_attribute(block_size_bytes);
read_attribute(btree_node_size);
-read_attribute(btree_node_size_bytes);
read_attribute(first_bucket);
read_attribute(nbuckets);
-read_attribute(tree_depth);
-read_attribute(root_usage_percent);
read_attribute(read_priority_stats);
read_attribute(write_priority_stats);
read_attribute(fragmentation_stats);
read_attribute(oldest_gen_stats);
read_attribute(reserve_stats);
read_attribute(btree_cache_size);
-read_attribute(cache_available_percent);
read_attribute(compression_stats);
read_attribute(written);
read_attribute(btree_written);
read_attribute(metadata_written);
read_attribute(journal_debug);
-write_attribute(journal_flush);
-read_attribute(internal_uuid);
+read_attribute(journal_pins);
-read_attribute(btree_gc_running);
+read_attribute(internal_uuid);
-read_attribute(btree_nodes);
-read_attribute(btree_used_percent);
-read_attribute(average_key_size);
read_attribute(available_buckets);
read_attribute(free_buckets);
read_attribute(dirty_data);
@@ -168,10 +159,9 @@ read_attribute(meta_buckets);
read_attribute(alloc_buckets);
read_attribute(has_data);
read_attribute(has_metadata);
-read_attribute(bset_tree_stats);
read_attribute(alloc_debug);
-read_attribute(cache_read_races);
+read_attribute(read_realloc_races);
rw_attribute(journal_write_delay_ms);
rw_attribute(journal_reclaim_delay_ms);
@@ -221,73 +211,6 @@ static struct attribute sysfs_state_rw = {
.mode = S_IRUGO
};
-static int bch2_bset_print_stats(struct bch_fs *c, char *buf)
-{
- struct bset_stats stats;
- size_t nodes = 0;
- struct btree *b;
- struct bucket_table *tbl;
- struct rhash_head *pos;
- unsigned iter;
-
- memset(&stats, 0, sizeof(stats));
-
- rcu_read_lock();
- for_each_cached_btree(b, c, tbl, iter, pos) {
- bch2_btree_keys_stats(b, &stats);
- nodes++;
- }
- rcu_read_unlock();
-
- return snprintf(buf, PAGE_SIZE,
- "btree nodes: %zu\n"
- "written sets: %zu\n"
- "written key bytes: %zu\n"
- "unwritten sets: %zu\n"
- "unwritten key bytes: %zu\n"
- "no table sets: %zu\n"
- "no table key bytes: %zu\n"
- "floats: %zu\n"
- "failed unpacked: %zu\n"
- "failed prev: %zu\n"
- "failed overflow: %zu\n",
- nodes,
- stats.sets[BSET_RO_AUX_TREE].nr,
- stats.sets[BSET_RO_AUX_TREE].bytes,
- stats.sets[BSET_RW_AUX_TREE].nr,
- stats.sets[BSET_RW_AUX_TREE].bytes,
- stats.sets[BSET_NO_AUX_TREE].nr,
- stats.sets[BSET_NO_AUX_TREE].bytes,
- stats.floats,
- stats.failed_unpacked,
- stats.failed_prev,
- stats.failed_overflow);
-}
-
-static unsigned bch2_root_usage(struct bch_fs *c)
-{
- unsigned bytes = 0;
- struct bkey_packed *k;
- struct btree *b;
- struct btree_node_iter iter;
-
- goto lock_root;
-
- do {
- six_unlock_read(&b->lock);
-lock_root:
- b = c->btree_roots[BTREE_ID_EXTENTS].b;
- six_lock_read(&b->lock);
- } while (b != c->btree_roots[BTREE_ID_EXTENTS].b);
-
- for_each_btree_node_key(b, k, &iter, btree_node_is_extents(b))
- bytes += bkey_bytes(k);
-
- six_unlock_read(&b->lock);
-
- return (bytes * 100) / btree_bytes(c);
-}
-
static size_t bch2_btree_cache_size(struct bch_fs *c)
{
size_t ret = 0;
@@ -301,27 +224,6 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
return ret;
}
-static unsigned bch2_fs_available_percent(struct bch_fs *c)
-{
- return div64_u64((u64) sectors_available(c) * 100,
- c->capacity ?: 1);
-}
-
-#if 0
-static unsigned bch2_btree_used(struct bch_fs *c)
-{
- return div64_u64(c->gc_stats.key_bytes * 100,
- (c->gc_stats.nodes ?: 1) * btree_bytes(c));
-}
-
-static unsigned bch2_average_key_size(struct bch_fs *c)
-{
- return c->gc_stats.nkeys
- ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
- : 0;
-}
-#endif
-
static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
{
struct bch_fs_usage stats = bch2_fs_usage_read(c);
@@ -358,6 +260,9 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
compressed_sectors_compressed = 0,
compressed_sectors_uncompressed = 0;
+ if (!bch2_fs_running(c))
+ return -EPERM;
+
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k)
if (k.k->type == BCH_EXTENT) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
@@ -402,29 +307,17 @@ SHOW(bch2_fs)
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
sysfs_print(minor, c->minor);
+ sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms);
sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
- sysfs_hprint(block_size, block_bytes(c));
- sysfs_print(block_size_bytes, block_bytes(c));
- sysfs_hprint(btree_node_size, c->sb.btree_node_size << 9);
- sysfs_print(btree_node_size_bytes, c->sb.btree_node_size << 9);
-
+ sysfs_print(block_size, block_bytes(c));
+ sysfs_print(btree_node_size, btree_bytes(c));
sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
- sysfs_print(cache_available_percent, bch2_fs_available_percent(c));
- sysfs_print(btree_gc_running, c->gc_pos.phase != GC_PHASE_DONE);
-
-#if 0
- /* XXX: reimplement */
- sysfs_print(btree_used_percent, bch2_btree_used(c));
- sysfs_print(btree_nodes, c->gc_stats.nodes);
- sysfs_hprint(average_key_size, bch2_average_key_size(c));
-#endif
-
- sysfs_print(cache_read_races,
- atomic_long_read(&c->cache_read_races));
+ sysfs_print(read_realloc_races,
+ atomic_long_read(&c->read_realloc_races));
sysfs_printf(foreground_write_ratelimit_enabled, "%i",
c->foreground_write_ratelimit_enabled);
@@ -445,28 +338,21 @@ SHOW(bch2_fs)
/* Debugging: */
- if (attr == &sysfs_journal_debug)
- return bch2_journal_print_debug(&c->journal, buf);
-
-#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
- if (!bch2_fs_running(c))
- return -EPERM;
-
- if (attr == &sysfs_bset_tree_stats)
- return bch2_bset_print_stats(c, buf);
if (attr == &sysfs_alloc_debug)
return show_fs_alloc_debug(c, buf);
- sysfs_print(tree_depth, c->btree_roots[BTREE_ID_EXTENTS].b->level);
- sysfs_print(root_usage_percent, bch2_root_usage(c));
+ if (attr == &sysfs_journal_debug)
+ return bch2_journal_print_debug(&c->journal, buf);
+
+ if (attr == &sysfs_journal_pins)
+ return bch2_journal_print_pins(&c->journal, buf);
if (attr == &sysfs_compression_stats)
return bch2_compression_stats(c, buf);
- sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
+#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
+ BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
return 0;
}
@@ -519,17 +405,14 @@ STORE(__bch2_fs)
if (!bch2_fs_running(c))
return -EPERM;
- if (attr == &sysfs_journal_flush) {
- bch2_journal_meta_async(&c->journal, NULL);
+ /* Debugging: */
- return size;
- }
+ if (attr == &sysfs_trigger_journal_flush)
+ bch2_journal_meta_async(&c->journal, NULL);
if (attr == &sysfs_trigger_btree_coalesce)
bch2_coalesce(c);
- /* Debugging: */
-
if (attr == &sysfs_trigger_gc)
bch2_gc(c);
@@ -557,28 +440,21 @@ STORE(bch2_fs)
SYSFS_OPS(bch2_fs);
struct attribute *bch2_fs_files[] = {
- &sysfs_journal_write_delay_ms,
- &sysfs_journal_reclaim_delay_ms,
-
+ &sysfs_minor,
&sysfs_block_size,
- &sysfs_block_size_bytes,
&sysfs_btree_node_size,
- &sysfs_btree_node_size_bytes,
- &sysfs_tree_depth,
- &sysfs_root_usage_percent,
&sysfs_btree_cache_size,
- &sysfs_cache_available_percent,
- &sysfs_compression_stats,
-
- &sysfs_average_key_size,
&sysfs_meta_replicas_have,
&sysfs_data_replicas_have,
+ &sysfs_journal_write_delay_ms,
+ &sysfs_journal_reclaim_delay_ms,
+
&sysfs_foreground_target_percent,
&sysfs_tiering_percent,
- &sysfs_journal_flush,
+ &sysfs_compression_stats,
NULL
};
@@ -598,21 +474,17 @@ STORE(bch2_fs_internal)
SYSFS_OPS(bch2_fs_internal);
struct attribute *bch2_fs_internal_files[] = {
- &sysfs_journal_debug,
-
&sysfs_alloc_debug,
+ &sysfs_journal_debug,
+ &sysfs_journal_pins,
- &sysfs_btree_gc_running,
-
- &sysfs_btree_nodes,
- &sysfs_btree_used_percent,
-
- &sysfs_bset_tree_stats,
- &sysfs_cache_read_races,
+ &sysfs_read_realloc_races,
+ &sysfs_trigger_journal_flush,
&sysfs_trigger_btree_coalesce,
&sysfs_trigger_gc,
&sysfs_prune_cache,
+
&sysfs_foreground_write_ratelimit_enabled,
&sysfs_copy_gc_enabled,
&sysfs_tiering_enabled,
@@ -853,10 +725,8 @@ SHOW(bch2_dev)
sysfs_printf(uuid, "%pU\n", ca->uuid.b);
- sysfs_hprint(bucket_size, bucket_bytes(ca));
- sysfs_print(bucket_size_bytes, bucket_bytes(ca));
- sysfs_hprint(block_size, block_bytes(c));
- sysfs_print(block_size_bytes, block_bytes(c));
+ sysfs_print(bucket_size, bucket_bytes(ca));
+ sysfs_print(block_size, block_bytes(c));
sysfs_print(first_bucket, ca->mi.first_bucket);
sysfs_print(nbuckets, ca->mi.nbuckets);
sysfs_print(discard, ca->mi.discard);
@@ -979,35 +849,46 @@ SYSFS_OPS(bch2_dev);
struct attribute *bch2_dev_files[] = {
&sysfs_uuid,
&sysfs_bucket_size,
- &sysfs_bucket_size_bytes,
&sysfs_block_size,
- &sysfs_block_size_bytes,
&sysfs_first_bucket,
&sysfs_nbuckets,
- &sysfs_read_priority_stats,
- &sysfs_write_priority_stats,
- &sysfs_fragmentation_stats,
- &sysfs_oldest_gen_stats,
- &sysfs_reserve_stats,
- &sysfs_available_buckets,
- &sysfs_free_buckets,
+
+ /* settings: */
+ &sysfs_discard,
+ &sysfs_cache_replacement_policy,
+ &sysfs_tier,
+ &sysfs_state_rw,
+
+ &sysfs_has_data,
+ &sysfs_has_metadata,
+
+ /* io stats: */
+ &sysfs_written,
+ &sysfs_btree_written,
+ &sysfs_metadata_written,
+
+ /* alloc info - data: */
&sysfs_dirty_data,
&sysfs_dirty_bytes,
- &sysfs_dirty_buckets,
&sysfs_cached_data,
&sysfs_cached_bytes,
+
+ /* alloc info - buckets: */
+ &sysfs_available_buckets,
+ &sysfs_free_buckets,
+ &sysfs_dirty_buckets,
&sysfs_cached_buckets,
&sysfs_meta_buckets,
&sysfs_alloc_buckets,
- &sysfs_has_data,
- &sysfs_has_metadata,
- &sysfs_discard,
- &sysfs_written,
- &sysfs_btree_written,
- &sysfs_metadata_written,
- &sysfs_cache_replacement_policy,
- &sysfs_tier,
- &sysfs_state_rw,
+
+ /* alloc info - other stats: */
+ &sysfs_read_priority_stats,
+ &sysfs_write_priority_stats,
+ &sysfs_fragmentation_stats,
+ &sysfs_oldest_gen_stats,
+ &sysfs_reserve_stats,
+
+ /* debug: */
&sysfs_alloc_debug,
sysfs_pd_controller_files(copy_gc),