author     Kent Overstreet <kent.overstreet@gmail.com>   2018-06-27 14:41:51 -0400
committer  Kent Overstreet <kent.overstreet@gmail.com>   2018-06-27 14:50:43 -0400
commit     17e2f2775be6e10b966cd958bc0461aab662571a (patch)
tree       68b5ea9ddaaaf494d1f155bfa060ba8e86d1384e /libbcachefs
parent     f2f3de4da4d8c5d2abab5b0ba201ea7e839aa418 (diff)
Update bcachefs sources to 2cb70a82bc bcachefs: delete some debug code
Diffstat (limited to 'libbcachefs')
-rw-r--r--  libbcachefs/bcachefs.h | 12
-rw-r--r--  libbcachefs/bcachefs_format.h | 62
-rw-r--r--  libbcachefs/btree_cache.c | 78
-rw-r--r--  libbcachefs/btree_cache.h | 2
-rw-r--r--  libbcachefs/btree_gc.c | 63
-rw-r--r--  libbcachefs/btree_gc.h | 24
-rw-r--r--  libbcachefs/btree_io.c | 4
-rw-r--r--  libbcachefs/btree_iter.c | 437
-rw-r--r--  libbcachefs/btree_iter.h | 103
-rw-r--r--  libbcachefs/btree_locking.h | 28
-rw-r--r--  libbcachefs/btree_update.h | 40
-rw-r--r--  libbcachefs/btree_update_interior.c | 185
-rw-r--r--  libbcachefs/btree_update_interior.h | 38
-rw-r--r--  libbcachefs/btree_update_leaf.c | 312
-rw-r--r--  libbcachefs/buckets.c | 5
-rw-r--r--  libbcachefs/checksum.h | 8
-rw-r--r--  libbcachefs/dirent.c | 32
-rw-r--r--  libbcachefs/fs-io.c | 124
-rw-r--r--  libbcachefs/fs.c | 14
-rw-r--r--  libbcachefs/journal.c | 28
-rw-r--r--  libbcachefs/journal_io.c | 117
-rw-r--r--  libbcachefs/journal_io.h | 4
-rw-r--r--  libbcachefs/journal_reclaim.c | 36
-rw-r--r--  libbcachefs/journal_reclaim.h | 9
-rw-r--r--  libbcachefs/journal_types.h | 1
-rw-r--r--  libbcachefs/migrate.c | 15
-rw-r--r--  libbcachefs/opts.h | 3
-rw-r--r--  libbcachefs/recovery.c | 346
-rw-r--r--  libbcachefs/recovery.h | 7
-rw-r--r--  libbcachefs/replicas.c | 27
-rw-r--r--  libbcachefs/str_hash.h | 1
-rw-r--r--  libbcachefs/super-io.c | 82
-rw-r--r--  libbcachefs/super-io.h | 4
-rw-r--r--  libbcachefs/super.c | 242
-rw-r--r--  libbcachefs/sysfs.c | 29
-rw-r--r--  libbcachefs/tests.c | 289
-rw-r--r--  libbcachefs/tests.h | 14
-rw-r--r--  libbcachefs/util.c | 113
-rw-r--r--  libbcachefs/xattr.c | 2
39 files changed, 1995 insertions, 945 deletions
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 4702b01..1482b80 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -259,6 +259,10 @@ do { \
"Reread btree nodes at various points to verify the " \
"mergesort in the read path against modifications " \
"done in memory") \
+ BCH_DEBUG_PARAM(journal_seq_verify, \
+ "Store the journal sequence number in the version " \
+ "number of every btree key, and verify that btree " \
+ "update ordering is preserved during recovery")
#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
@@ -314,7 +318,13 @@ enum bch_time_stats {
struct btree;
enum gc_phase {
- GC_PHASE_SB = BTREE_ID_NR + 1,
+ GC_PHASE_START,
+ GC_PHASE_SB,
+
+#define DEF_BTREE_ID(kwd, val, name) GC_PHASE_BTREE_##kwd,
+ DEFINE_BCH_BTREE_IDS()
+#undef DEF_BTREE_ID
+
GC_PHASE_PENDING_DELETE,
GC_PHASE_ALLOC,
GC_PHASE_DONE
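
The hunk above replaces the old "one phase per btree ID, offset by hand" scheme with an X-macro expansion, so every btree defined by DEFINE_BCH_BTREE_IDS() automatically gets its own GC_PHASE_BTREE_* constant between GC_PHASE_SB and GC_PHASE_PENDING_DELETE. A minimal standalone sketch of that X-macro technique follows; the DEMO_* names and the two-argument x() macro are illustrative, not the bcachefs definitions.

#include <stdio.h>

/* one list of btree ids, expanded several times with different x() */
#define DEMO_BTREE_IDS()        \
        x(EXTENTS, 0)           \
        x(INODES,  1)           \
        x(DIRENTS, 2)

enum demo_btree_id {
#define x(kwd, val) DEMO_BTREE_ID_##kwd = val,
        DEMO_BTREE_IDS()
#undef x
        DEMO_BTREE_ID_NR
};

enum demo_gc_phase {
        DEMO_GC_PHASE_START,
        DEMO_GC_PHASE_SB,
#define x(kwd, val) DEMO_GC_PHASE_BTREE_##kwd,
        DEMO_BTREE_IDS()
#undef x
        DEMO_GC_PHASE_PENDING_DELETE,
        DEMO_GC_PHASE_DONE,
};

int main(void)
{
        /* the generated phases stay contiguous and follow the id order */
        printf("extents=%d dirents=%d nr_ids=%d\n",
               DEMO_GC_PHASE_BTREE_EXTENTS,
               DEMO_GC_PHASE_BTREE_DIRENTS,
               DEMO_BTREE_ID_NR);
        return 0;
}
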
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index ab8b944..b6e7b98 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -426,6 +426,16 @@ enum bch_csum_type {
BCH_CSUM_NR = 7,
};
+static const unsigned bch_crc_bytes[] = {
+ [BCH_CSUM_NONE] = 0,
+ [BCH_CSUM_CRC32C_NONZERO] = 4,
+ [BCH_CSUM_CRC32C] = 4,
+ [BCH_CSUM_CRC64_NONZERO] = 8,
+ [BCH_CSUM_CRC64] = 8,
+ [BCH_CSUM_CHACHA20_POLY1305_80] = 10,
+ [BCH_CSUM_CHACHA20_POLY1305_128] = 16,
+};
+
static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
{
switch (type) {
@@ -783,6 +793,11 @@ struct bch_dirent {
} __attribute__((packed, aligned(8)));
BKEY_VAL_TYPE(dirent, BCH_DIRENT);
+#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \
+ sizeof(struct bkey) - \
+ offsetof(struct bch_dirent, d_name))
+
+
/* Xattrs */
enum {
@@ -868,7 +883,8 @@ struct bch_sb_field {
x(crypt, 2) \
x(replicas, 3) \
x(quota, 4) \
- x(disk_groups, 5)
+ x(disk_groups, 5) \
+ x(clean, 6)
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
@@ -1038,6 +1054,37 @@ struct bch_sb_field_disk_groups {
struct bch_disk_group entries[0];
};
+/*
+ * On clean shutdown, store btree roots and current journal sequence number in
+ * the superblock:
+ */
+struct jset_entry {
+ __le16 u64s;
+ __u8 btree_id;
+ __u8 level;
+ __u8 type; /* designates what this jset holds */
+ __u8 pad[3];
+
+ union {
+ struct bkey_i start[0];
+ __u64 _data[0];
+ };
+};
+
+struct bch_sb_field_clean {
+ struct bch_sb_field field;
+
+ __le32 flags;
+ __le16 read_clock;
+ __le16 write_clock;
+ __le64 journal_seq;
+
+ union {
+ struct jset_entry start[0];
+ __u64 _data[0];
+ };
+};
+
/* Superblock: */
/*
@@ -1255,19 +1302,6 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
#define BCACHE_JSET_VERSION_JKEYS 2
#define BCACHE_JSET_VERSION 2
-struct jset_entry {
- __le16 u64s;
- __u8 btree_id;
- __u8 level;
- __u8 type; /* designates what this jset holds */
- __u8 pad[3];
-
- union {
- struct bkey_i start[0];
- __u64 _data[0];
- };
-};
-
#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
#define BCH_JSET_ENTRY_TYPES() \
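
The bcachefs_format.h changes move struct jset_entry earlier so the new bch_sb_field_clean can embed a variable-length list of such entries (btree roots and so on) plus the clocks and journal sequence number written on clean shutdown. Below is a minimal sketch of walking that kind of self-describing entry list; it assumes, as JSET_KEYS_U64s suggests, that the 8-byte header is one u64 and that u64s counts only the payload u64s that follow it (endianness conversion omitted). The demo_* names and demo_walk_clean_entries() are hypothetical, not bcachefs API.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_entry {
        uint16_t u64s;          /* payload length in u64s, header excluded */
        uint8_t  btree_id;
        uint8_t  level;
        uint8_t  type;
        uint8_t  pad[3];
        uint64_t payload[];     /* e.g. a btree root key */
};

static const struct demo_entry *demo_entry_next(const struct demo_entry *e)
{
        /* header is exactly one u64 (cf. JSET_KEYS_U64s), then the payload */
        return (const struct demo_entry *) (e->payload + e->u64s);
}

void demo_walk_clean_entries(const void *buf, size_t total_u64s)
{
        const struct demo_entry *e = buf;
        const char *end = (const char *) buf + total_u64s * sizeof(uint64_t);

        while ((const char *) e + sizeof(*e) <= end) {
                printf("entry type %u: btree %u level %u, %u payload u64s\n",
                       e->type, e->btree_id, e->level, e->u64s);
                e = demo_entry_next(e);
        }
}
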
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index c950f25..b0dc4c8 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -649,7 +649,14 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
struct btree *b;
struct bset_tree *t;
- /* btree_node_fill() requires parent to be locked: */
+ /*
+ * XXX: locking optimization
+ *
+ * we can make the locking looser here - caller can drop lock on parent
+ * node before locking child node (and potentially blocking): we just
+ * have to have bch2_btree_node_fill() call relock on the parent and
+ * return -EINTR if that fails
+ */
EBUG_ON(!btree_node_locked(iter, level + 1));
EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
@@ -749,23 +756,22 @@ retry:
struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
struct btree_iter *iter,
struct btree *b,
+ bool may_drop_locks,
enum btree_node_sibling sib)
{
struct btree *parent;
struct btree_node_iter node_iter;
struct bkey_packed *k;
BKEY_PADDED(k) tmp;
- struct btree *ret;
+ struct btree *ret = NULL;
unsigned level = b->level;
parent = btree_iter_node(iter, level + 1);
if (!parent)
return NULL;
- if (!bch2_btree_node_relock(iter, level + 1)) {
- bch2_btree_iter_set_locks_want(iter, level + 2);
- return ERR_PTR(-EINTR);
- }
+ if (!bch2_btree_node_relock(iter, level + 1))
+ goto out_upgrade;
node_iter = iter->l[parent->level].iter;
@@ -778,34 +784,66 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
: (bch2_btree_node_iter_advance(&node_iter, parent),
bch2_btree_node_iter_peek_all(&node_iter, parent));
if (!k)
- return NULL;
+ goto out;
} while (bkey_deleted(k));
bch2_bkey_unpack(parent, &tmp.k, k);
ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent);
- if (IS_ERR(ret) && PTR_ERR(ret) == -EINTR) {
- btree_node_unlock(iter, level);
+ if (PTR_ERR_OR_ZERO(ret) == -EINTR && may_drop_locks) {
+ struct btree_iter *linked;
- if (!bch2_btree_node_relock(iter, level + 1)) {
- bch2_btree_iter_set_locks_want(iter, level + 2);
- return ERR_PTR(-EINTR);
- }
+ if (!bch2_btree_node_relock(iter, level + 1))
+ goto out_upgrade;
- ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent);
- }
+ /*
+ * We might have got -EINTR because trylock failed, and we're
+ * holding other locks that would cause us to deadlock:
+ */
+ for_each_linked_btree_iter(iter, linked)
+ if (btree_iter_cmp(iter, linked) < 0)
+ __bch2_btree_iter_unlock(linked);
+
+ if (sib == btree_prev_sib)
+ btree_node_unlock(iter, level);
- if (!bch2_btree_node_relock(iter, level)) {
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+ ret = bch2_btree_node_get(c, iter, &tmp.k, level,
+ SIX_LOCK_intent);
- if (!IS_ERR(ret)) {
- six_unlock_intent(&ret->lock);
- ret = ERR_PTR(-EINTR);
+ /*
+ * before btree_iter_relock() calls btree_iter_verify_locks():
+ */
+ if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
+ btree_node_unlock(iter, level + 1);
+
+ if (!bch2_btree_node_relock(iter, level)) {
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+
+ if (!IS_ERR(ret)) {
+ six_unlock_intent(&ret->lock);
+ ret = ERR_PTR(-EINTR);
+ }
}
+
+ bch2_btree_iter_relock(iter);
}
+out:
+ if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
+ btree_node_unlock(iter, level + 1);
+
+ bch2_btree_iter_verify_locks(iter);
+
+ BUG_ON((!may_drop_locks || !IS_ERR(ret)) &&
+ (iter->uptodate >= BTREE_ITER_NEED_RELOCK ||
+ !btree_node_locked(iter, level)));
return ret;
+out_upgrade:
+ if (may_drop_locks)
+ bch2_btree_iter_upgrade(iter, level + 2);
+ ret = ERR_PTR(-EINTR);
+ goto out;
}
void bch2_btree_node_prefetch(struct bch_fs *c, const struct bkey_i *k,
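
The rewritten bch2_btree_node_get_sibling() above lets the caller say, via the new may_drop_locks argument, whether the helper may drop conflicting locks and block when a trylock fails, instead of always bouncing back with -EINTR; when it does drop locks it has to relock and revalidate afterwards (hence the bch2_btree_iter_relock()/bch2_btree_iter_verify_locks() calls). Here is a stripped-down sketch of that "trylock, optionally drop and block, else -EINTR" idiom using plain pthread mutexes; lock_sibling() and its arguments are hypothetical names, not bcachefs code.

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

/*
 * Returns 0 if the sibling was locked without blocking, 1 if locks had to
 * be dropped (the caller must relock and revalidate, much as bcachefs does
 * with bch2_btree_node_relock() and lock sequence numbers), and -EINTR if
 * dropping locks was not allowed.
 */
static int lock_sibling(pthread_mutex_t *sibling,
                        pthread_mutex_t *conflicting_child,
                        bool may_drop_locks)
{
        if (!pthread_mutex_trylock(sibling))
                return 0;               /* fast path, no contention */

        if (!may_drop_locks)
                return -EINTR;          /* caller unwinds and retries */

        /* Drop the lock that would deadlock against the sibling's holder,
         * then block; everything it protected is now stale. */
        pthread_mutex_unlock(conflicting_child);
        pthread_mutex_lock(sibling);
        return 1;
}

int main(void)
{
        pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

        pthread_mutex_lock(&b);                 /* pretend we hold a child */
        return lock_sibling(&a, &b, true) < 0;  /* uncontended: returns 0 */
}
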
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index e021d6e..43109d0 100644
--- a/libbcachefs/btree_cache.h
+++ b/libbcachefs/btree_cache.h
@@ -26,7 +26,7 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
enum six_lock_type);
struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
- struct btree *,
+ struct btree *, bool,
enum btree_node_sibling);
void bch2_btree_node_prefetch(struct bch_fs *, const struct bkey_i *,
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 02b14e3..969c1f1 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -148,6 +148,9 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
? BCH_DATA_BTREE : BCH_DATA_USER;
int ret = 0;
+ BUG_ON(journal_seq_verify(c) &&
+ k.k->version.lo > journal_cur_seq(&c->journal));
+
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
fsck_err_on(!bch2_bkey_replicas_marked(c, data_type, k), c,
"superblock not marked as containing replicas (type %u)",
@@ -243,6 +246,11 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
unsigned max_stale;
int ret = 0;
+ gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
+
+ if (!c->btree_roots[btree_id].b)
+ return 0;
+
/*
* if expensive_debug_checks is on, run range_checks on all leaf nodes:
*/
@@ -454,7 +462,7 @@ static void bch2_gc_start(struct bch_fs *c)
* Indicates to buckets code that gc is now in progress - done under
* usage_lock to avoid racing with bch2_mark_key():
*/
- __gc_pos_set(c, GC_POS_MIN);
+ __gc_pos_set(c, gc_phase(GC_PHASE_START));
/* Save a copy of the existing bucket stats while we recompute them: */
for_each_member_device(ca, c, i) {
@@ -535,22 +543,18 @@ void bch2_gc(struct bch_fs *c)
bch2_gc_start(c);
- /* Walk btree: */
- while (c->gc_pos.phase < (int) BTREE_ID_NR) {
- int ret = c->btree_roots[c->gc_pos.phase].b
- ? bch2_gc_btree(c, (int) c->gc_pos.phase)
- : 0;
+ bch2_mark_superblocks(c);
+ /* Walk btree: */
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ int ret = bch2_gc_btree(c, i);
if (ret) {
bch_err(c, "btree gc failed: %d", ret);
set_bit(BCH_FS_GC_FAILURE, &c->flags);
goto out;
}
-
- gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
}
- bch2_mark_superblocks(c);
bch2_mark_pending_btree_node_frees(c);
bch2_mark_allocator_buckets(c);
@@ -780,13 +784,13 @@ next:
bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);
/* Insert the newly coalesced nodes */
- bch2_btree_insert_node(as, parent, iter, &keylist);
+ bch2_btree_insert_node(as, parent, iter, &keylist, 0);
BUG_ON(!bch2_keylist_empty(&keylist));
BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]);
- BUG_ON(!bch2_btree_iter_node_replace(iter, new_nodes[0]));
+ bch2_btree_iter_node_replace(iter, new_nodes[0]);
for (i = 0; i < nr_new_nodes; i++)
bch2_btree_open_bucket_put(c, new_nodes[i]);
@@ -1003,6 +1007,8 @@ static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id)
btree_node_range_checks_init(&r, 0);
+ gc_pos_set(c, gc_pos_btree(id, POS_MIN, 0));
+
if (!c->btree_roots[id].b)
return 0;
@@ -1041,36 +1047,33 @@ err:
return bch2_btree_iter_unlock(&iter) ?: ret;
}
-static int __bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
+int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
{
unsigned iter = 0;
enum btree_id id;
- int ret;
+ int ret = 0;
- mutex_lock(&c->sb_lock);
- if (!bch2_sb_get_replicas(c->disk_sb.sb)) {
- if (BCH_SB_INITIALIZED(c->disk_sb.sb))
- bch_info(c, "building replicas info");
- set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
- }
- mutex_unlock(&c->sb_lock);
+ down_write(&c->gc_lock);
again:
bch2_gc_start(c);
+ bch2_mark_superblocks(c);
+
for (id = 0; id < BTREE_ID_NR; id++) {
ret = bch2_initial_gc_btree(c, id);
if (ret)
- return ret;
+ goto err;
}
ret = bch2_journal_mark(c, journal);
if (ret)
- return ret;
+ goto err;
if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
if (iter++ > 2) {
bch_info(c, "Unable to fix bucket gens, looping");
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
bch_info(c, "Fixed gens, restarting initial mark and sweep:");
@@ -1085,21 +1088,9 @@ again:
if (c->sb.encryption_type)
atomic64_add(1 << 16, &c->key_version);
- bch2_mark_superblocks(c);
-
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
-
- return 0;
-}
-
-int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
-{
- int ret;
-
- down_write(&c->gc_lock);
- ret = __bch2_initial_gc(c, journal);
+err:
up_write(&c->gc_lock);
-
return ret;
}
diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h
index 4d1ab9d..214a3fe 100644
--- a/libbcachefs/btree_gc.h
+++ b/libbcachefs/btree_gc.h
@@ -46,8 +46,6 @@ static inline struct gc_pos gc_phase(enum gc_phase phase)
};
}
-#define GC_POS_MIN gc_phase(0)
-
static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
if (l.phase != r.phase)
@@ -59,17 +57,23 @@ static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
return 0;
}
+static inline struct gc_pos gc_pos_btree(enum btree_id id,
+ struct bpos pos, unsigned level)
+{
+ return (struct gc_pos) {
+ .phase = GC_PHASE_BTREE_EXTENTS + id,
+ .pos = pos,
+ .level = level,
+ };
+}
+
/*
* GC position of the pointers within a btree node: note, _not_ for &b->key
* itself, that lives in the parent node:
*/
static inline struct gc_pos gc_pos_btree_node(struct btree *b)
{
- return (struct gc_pos) {
- .phase = b->btree_id,
- .pos = b->key.k.p,
- .level = b->level,
- };
+ return gc_pos_btree(b->btree_id, b->key.k.p, b->level);
}
/*
@@ -81,11 +85,7 @@ static inline struct gc_pos gc_pos_btree_node(struct btree *b)
*/
static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
{
- return (struct gc_pos) {
- .phase = (int) id,
- .pos = POS_MAX,
- .level = U8_MAX,
- };
+ return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH);
}
static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob)
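
With gc_pos_btree() in place, a GC position is ordered by (phase, pos, level) and the per-btree phases inherit the btree ID ordering, which is what lets gc_pos_btree_root() use POS_MAX/BTREE_MAX_DEPTH to sort after every node of that btree. A simplified comparator mirroring the gc_pos_cmp() shape partly visible in the context above; the demo_* structs are stand-ins, and the exact field comparison order should be taken from the bcachefs source.

#include <stdint.h>

struct demo_bpos   { uint64_t inode, offset; };
struct demo_gc_pos { int phase; struct demo_bpos pos; unsigned level; };

/* total order: phase first, then key position, then level */
static int demo_gc_pos_cmp(struct demo_gc_pos l, struct demo_gc_pos r)
{
        if (l.phase != r.phase)
                return l.phase < r.phase ? -1 : 1;
        if (l.pos.inode != r.pos.inode)
                return l.pos.inode < r.pos.inode ? -1 : 1;
        if (l.pos.offset != r.pos.offset)
                return l.pos.offset < r.pos.offset ? -1 : 1;
        if (l.level != r.level)
                return l.level < r.level ? -1 : 1;
        return 0;
}

/* "has GC already marked this position?" then becomes a single compare */
int demo_gc_visited(struct demo_gc_pos cur, struct demo_gc_pos pos)
{
        return demo_gc_pos_cmp(pos, cur) <= 0;
}
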
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 74ffad4..0c825bc 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -920,7 +920,7 @@ static int btree_err_msg(struct bch_fs *c, struct btree *b, struct bset *i,
char *out = buf, *end = buf + len;
out += scnprintf(out, end - out,
- "error validating btree node %s "
+ "error validating btree node %s"
"at btree %u level %u/%u\n"
"pos %llu:%llu node offset %u",
write ? "before write " : "",
@@ -1120,7 +1120,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
btree_err(BTREE_ERR_FIXABLE, c, b, i,
- "invalid bkey:\n%s\n%s", buf, invalid);
+ "invalid bkey:\n%s\n%s", invalid, buf);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 95ee9f6..682a914 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -34,11 +34,9 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
EBUG_ON(iter->l[b->level].b != b);
EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq);
- for_each_linked_btree_node(iter, b, linked)
+ for_each_btree_iter_with_node(iter, b, linked)
linked->lock_seq[b->level] += 2;
- iter->lock_seq[b->level] += 2;
-
six_unlock_write(&b->lock);
}
@@ -48,6 +46,8 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
struct btree_iter *linked;
unsigned readers = 0;
+ EBUG_ON(btree_node_read_locked(iter, b->level));
+
for_each_linked_btree_iter(iter, linked)
if (linked->l[b->level].b == b &&
btree_node_read_locked(linked, b->level))
@@ -66,15 +66,51 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
&b->lock.state.counter);
}
-bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
+/*
+ * Lock a btree node if we already have it locked on one of our linked
+ * iterators:
+ */
+static inline bool btree_node_lock_increment(struct btree_iter *iter,
+ struct btree *b, unsigned level,
+ enum btree_node_locked_type want)
{
struct btree_iter *linked;
+
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->l[level].b == b &&
+ btree_node_locked_type(linked, level) >= want) {
+ six_lock_increment(&b->lock, want);
+ return true;
+ }
+
+ return false;
+}
+
+bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
+{
struct btree *b = iter->l[level].b;
- int want = btree_lock_want(iter, level);
- int have = btree_node_locked_type(iter, level);
+ int want = __btree_lock_want(iter, level);
- if (want == have)
- return true;
+ if (!is_btree_node(iter, level))
+ return false;
+
+ if (race_fault())
+ return false;
+
+ if (!six_relock_type(&b->lock, want, iter->lock_seq[level]) &&
+ !(iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 &&
+ btree_node_lock_increment(iter, b, level, want)))
+ return false;
+
+ mark_btree_node_locked(iter, level, want);
+ return true;
+}
+
+static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
+{
+ struct btree *b = iter->l[level].b;
+
+ EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
if (!is_btree_node(iter, level))
return false;
@@ -82,42 +118,62 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
if (race_fault())
return false;
- if (have != BTREE_NODE_UNLOCKED
- ? six_trylock_convert(&b->lock, have, want)
- : six_relock_type(&b->lock, want, iter->lock_seq[level]))
+ if (btree_node_intent_locked(iter, level))
+ return true;
+
+ if (btree_node_locked(iter, level)
+ ? six_lock_tryupgrade(&b->lock)
+ : six_relock_type(&b->lock, SIX_LOCK_intent, iter->lock_seq[level]))
goto success;
- for_each_linked_btree_iter(iter, linked)
- if (linked->l[level].b == b &&
- btree_node_locked_type(linked, level) == want &&
- iter->lock_seq[level] == b->lock.state.seq) {
- btree_node_unlock(iter, level);
- six_lock_increment(&b->lock, want);
- goto success;
- }
+ if (iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 &&
+ btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) {
+ btree_node_unlock(iter, level);
+ goto success;
+ }
return false;
success:
- mark_btree_node_unlocked(iter, level);
- mark_btree_node_locked(iter, level, want);
+ mark_btree_node_intent_locked(iter, level);
return true;
}
-bool bch2_btree_iter_relock(struct btree_iter *iter)
+static inline bool btree_iter_get_locks(struct btree_iter *iter,
+ bool upgrade)
{
- unsigned l;
+ unsigned l = iter->level;
+ int fail_idx = -1;
- for (l = iter->level;
- l < max_t(unsigned, iter->locks_want, 1) && iter->l[l].b;
- l++)
- if (!bch2_btree_node_relock(iter, l)) {
+ do {
+ if (!btree_iter_node(iter, l))
+ break;
+
+ if (!(upgrade
+ ? bch2_btree_node_upgrade(iter, l)
+ : bch2_btree_node_relock(iter, l))) {
+ fail_idx = l;
btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- return false;
}
+ l++;
+ } while (l < iter->locks_want);
+
+ /*
+ * When we fail to get a lock, we have to ensure that any child nodes
+ * can't be relocked so bch2_btree_iter_traverse has to walk back up to
+ * the node that we failed to relock:
+ */
+ while (fail_idx >= 0) {
+ btree_node_unlock(iter, fail_idx);
+ iter->l[fail_idx].b = BTREE_ITER_NOT_END;
+ --fail_idx;
+ }
+
if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
iter->uptodate = BTREE_ITER_NEED_PEEK;
- return true;
+
+ bch2_btree_iter_verify_locks(iter);
+ return iter->uptodate < BTREE_ITER_NEED_RELOCK;
}
/* Slowpath: */
@@ -128,6 +184,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
{
struct bch_fs *c = iter->c;
struct btree_iter *linked;
+ bool ret = true;
/* Can't have children locked before ancestors: */
EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked));
@@ -140,15 +197,11 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
EBUG_ON(type == SIX_LOCK_intent &&
iter->nodes_locked != iter->nodes_intent_locked);
- for_each_linked_btree_iter(iter, linked)
- if (linked->l[level].b == b &&
- btree_node_locked_type(linked, level) == type) {
- six_lock_increment(&b->lock, type);
- return true;
- }
+ if (btree_node_lock_increment(iter, b, level, type))
+ return true;
/*
- * Must lock btree nodes in key order - this case hapens when locking
+ * Must lock btree nodes in key order - this case happens when locking
* the prev sibling in btree node merging:
*/
if (iter->nodes_locked &&
@@ -160,6 +213,10 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
if (!linked->nodes_locked)
continue;
+ /* We have to lock btree nodes in key order: */
+ if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
+ ret = false;
+
/*
* Can't block taking an intent lock if we have _any_ nodes read
* locked:
@@ -175,15 +232,12 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
if (type == SIX_LOCK_intent &&
linked->nodes_locked != linked->nodes_intent_locked) {
linked->locks_want = max_t(unsigned,
- linked->locks_want,
- iter->locks_want);
- return false;
+ linked->locks_want,
+ __fls(linked->nodes_locked) + 1);
+ btree_iter_get_locks(linked, true);
+ ret = false;
}
- /* We have to lock btree nodes in key order: */
- if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
- return false;
-
/*
* Interior nodes must be locked before their descendants: if
* another iterator has possible descendants locked of the node
@@ -194,82 +248,133 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
linked->locks_want = max_t(unsigned,
linked->locks_want,
iter->locks_want);
- return false;
+ btree_iter_get_locks(linked, true);
+ ret = false;
}
}
- __btree_node_lock_type(c, b, type);
- return true;
+ if (ret)
+ __btree_node_lock_type(c, b, type);
+ return ret;
}
/* Btree iterator locking: */
-static void btree_iter_drop_extra_locks(struct btree_iter *iter)
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_btree_iter_verify_locks(struct btree_iter *iter)
{
unsigned l;
- while (iter->nodes_locked &&
- (l = __fls(iter->nodes_locked)) > iter->locks_want) {
- if (l > iter->level) {
- btree_node_unlock(iter, l);
- } else {
- if (btree_node_intent_locked(iter, l)) {
- six_lock_downgrade(&iter->l[l].b->lock);
- iter->nodes_intent_locked ^= 1 << l;
- }
- break;
- }
+ if (iter->uptodate == BTREE_ITER_END) {
+ BUG_ON(iter->nodes_locked);
+ return;
+ }
+
+ for (l = 0; btree_iter_node(iter, l); l++) {
+ if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
+ !btree_node_locked(iter, l))
+ continue;
+
+ BUG_ON(btree_lock_want(iter, l) !=
+ btree_node_locked_type(iter, l));
}
}
+#endif
+
+__flatten
+static bool __bch2_btree_iter_relock(struct btree_iter *iter)
+{
+ if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
+ return true;
+
+ if (iter->uptodate > BTREE_ITER_NEED_TRAVERSE)
+ return false;
+
+ return btree_iter_get_locks(iter, false);
+}
-bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
- unsigned new_locks_want)
+bool bch2_btree_iter_relock(struct btree_iter *iter)
{
struct btree_iter *linked;
+ bool ret = true;
- /* Drop locks we don't want anymore: */
- if (new_locks_want < iter->locks_want)
- for_each_linked_btree_iter(iter, linked)
- if (linked->locks_want > new_locks_want) {
- linked->locks_want = max_t(unsigned, 1,
- new_locks_want);
- btree_iter_drop_extra_locks(linked);
- }
+ for_each_btree_iter(iter, linked)
+ ret &= __bch2_btree_iter_relock(linked);
+
+ return ret;
+}
+
+bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
+ unsigned new_locks_want)
+{
+ struct btree_iter *linked;
+
+ EBUG_ON(iter->locks_want >= new_locks_want);
iter->locks_want = new_locks_want;
- btree_iter_drop_extra_locks(iter);
- if (bch2_btree_iter_relock(iter))
+ if (btree_iter_get_locks(iter, true))
return true;
/*
- * Just an optimization: ancestor nodes must be locked before child
- * nodes, so set locks_want on iterators that might lock ancestors
- * before us to avoid getting -EINTR later:
+ * Ancestor nodes must be locked before child nodes, so set locks_want
+ * on iterators that might lock ancestors before us to avoid getting
+ * -EINTR later:
*/
for_each_linked_btree_iter(iter, linked)
if (linked->btree_id == iter->btree_id &&
- btree_iter_cmp(linked, iter) <= 0)
- linked->locks_want = max_t(unsigned, linked->locks_want,
- new_locks_want);
+ btree_iter_cmp(linked, iter) <= 0 &&
+ linked->locks_want < new_locks_want) {
+ linked->locks_want = new_locks_want;
+ btree_iter_get_locks(linked, true);
+ }
+
return false;
}
-static void __bch2_btree_iter_unlock(struct btree_iter *iter)
+void __bch2_btree_iter_downgrade(struct btree_iter *iter,
+ unsigned downgrade_to)
{
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+ struct btree_iter *linked;
+ unsigned l;
+
+ /*
+ * We downgrade linked iterators as well because btree_iter_upgrade
+ * might have had to modify locks_want on linked iterators due to lock
+ * ordering:
+ */
+ for_each_btree_iter(iter, linked) {
+ unsigned new_locks_want = downgrade_to ?:
+ (linked->flags & BTREE_ITER_INTENT ? 1 : 0);
+
+ if (linked->locks_want <= new_locks_want)
+ continue;
- while (iter->nodes_locked)
- btree_node_unlock(iter, __ffs(iter->nodes_locked));
+ linked->locks_want = new_locks_want;
+
+ while (linked->nodes_locked &&
+ (l = __fls(linked->nodes_locked)) >= linked->locks_want) {
+ if (l > linked->level) {
+ btree_node_unlock(linked, l);
+ } else {
+ if (btree_node_intent_locked(linked, l)) {
+ six_lock_downgrade(&linked->l[l].b->lock);
+ linked->nodes_intent_locked ^= 1 << l;
+ }
+ break;
+ }
+ }
+
+ bch2_btree_iter_verify_locks(linked);
+ }
}
int bch2_btree_iter_unlock(struct btree_iter *iter)
{
struct btree_iter *linked;
- for_each_linked_btree_iter(iter, linked)
+ for_each_btree_iter(iter, linked)
__bch2_btree_iter_unlock(linked);
- __bch2_btree_iter_unlock(iter);
return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
}
@@ -320,11 +425,8 @@ void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
{
struct btree_iter *linked;
- if (iter->l[b->level].b == b)
- __bch2_btree_iter_verify(iter, b);
-
- for_each_linked_btree_node(iter, b, linked)
- __bch2_btree_iter_verify(iter, b);
+ for_each_btree_iter_with_node(iter, b, linked)
+ __bch2_btree_iter_verify(linked, b);
}
#endif
@@ -456,12 +558,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter,
__bch2_btree_node_iter_fix(iter, b, node_iter, t,
where, clobber_u64s, new_u64s);
- if (iter->l[b->level].b == b)
- __bch2_btree_node_iter_fix(iter, b,
- &iter->l[b->level].iter, t,
- where, clobber_u64s, new_u64s);
-
- for_each_linked_btree_node(iter, b, linked)
+ for_each_btree_iter_with_node(iter, b, linked)
__bch2_btree_node_iter_fix(linked, b,
&linked->l[b->level].iter, t,
where, clobber_u64s, new_u64s);
@@ -613,11 +710,12 @@ static inline void btree_iter_node_set(struct btree_iter *iter,
* A btree node is being replaced - update the iterator to point to the new
* node:
*/
-bool bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
+void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
{
+ enum btree_node_locked_type t;
struct btree_iter *linked;
- for_each_linked_btree_iter(iter, linked)
+ for_each_btree_iter(iter, linked)
if (btree_iter_pos_in_node(linked, b)) {
/*
* bch2_btree_iter_node_drop() has already been called -
@@ -626,52 +724,28 @@ bool bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
*/
BUG_ON(btree_node_locked(linked, b->level));
- /*
- * If @linked wants this node read locked, we don't want
- * to actually take the read lock now because it's not
- * legal to hold read locks on other nodes while we take
- * write locks, so the journal can make forward
- * progress...
- *
- * Instead, btree_iter_node_set() sets things up so
- * bch2_btree_node_relock() will succeed:
- */
-
- if (btree_want_intent(linked, b->level)) {
- six_lock_increment(&b->lock, SIX_LOCK_intent);
- mark_btree_node_intent_locked(linked, b->level);
+ t = btree_lock_want(linked, b->level);
+ if (t != BTREE_NODE_UNLOCKED) {
+ six_lock_increment(&b->lock, t);
+ mark_btree_node_locked(linked, b->level, t);
}
btree_iter_node_set(linked, b);
}
- if (!btree_iter_pos_in_node(iter, b)) {
- six_unlock_intent(&b->lock);
- return false;
- }
-
- mark_btree_node_intent_locked(iter, b->level);
- btree_iter_node_set(iter, b);
- return true;
-}
-
-void bch2_btree_iter_node_drop_linked(struct btree_iter *iter, struct btree *b)
-{
- struct btree_iter *linked;
-
- for_each_linked_btree_iter(iter, linked)
- bch2_btree_iter_node_drop(linked, b);
+ six_unlock_intent(&b->lock);
}
void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
{
+ struct btree_iter *linked;
unsigned level = b->level;
- if (iter->l[level].b == b) {
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- btree_node_unlock(iter, level);
- iter->l[level].b = BTREE_ITER_NOT_END;
- }
+ for_each_btree_iter(iter, linked)
+ if (linked->l[level].b == b) {
+ btree_node_unlock(linked, level);
+ linked->l[level].b = BTREE_ITER_NOT_END;
+ }
}
/*
@@ -682,9 +756,8 @@ void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
{
struct btree_iter *linked;
- for_each_linked_btree_node(iter, b, linked)
+ for_each_btree_iter_with_node(iter, b, linked)
__btree_iter_init(linked, b);
- __btree_iter_init(iter, b);
}
static inline int btree_iter_lock_root(struct btree_iter *iter,
@@ -713,7 +786,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter,
return 0;
}
- lock_type = btree_lock_want(iter, iter->level);
+ lock_type = __btree_lock_want(iter, iter->level);
if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
iter, lock_type)))
return -EINTR;
@@ -771,7 +844,7 @@ static inline int btree_iter_down(struct btree_iter *iter)
struct btree_iter_level *l = &iter->l[iter->level];
struct btree *b;
unsigned level = iter->level - 1;
- enum six_lock_type lock_type = btree_lock_want(iter, level);
+ enum six_lock_type lock_type = __btree_lock_want(iter, level);
BKEY_PADDED(k) tmp;
BUG_ON(!btree_node_locked(iter, iter->level));
@@ -799,6 +872,12 @@ static void btree_iter_up(struct btree_iter *iter)
btree_node_unlock(iter, iter->level++);
}
+static void btree_iter_set_end(struct btree_iter *iter)
+{
+ iter->uptodate = BTREE_ITER_END;
+ __bch2_btree_iter_unlock(iter);
+}
+
int __must_check __bch2_btree_iter_traverse(struct btree_iter *);
static int btree_iter_traverse_error(struct btree_iter *iter, int ret)
@@ -871,7 +950,7 @@ io_error:
BUG_ON(ret != -EIO);
iter->flags |= BTREE_ITER_ERROR;
- iter->l[iter->level].b = NULL;
+ iter->l[iter->level].b = BTREE_ITER_NOT_END;
goto out;
}
@@ -888,9 +967,12 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
{
unsigned depth_want = iter->level;
- if (unlikely(!iter->l[iter->level].b))
+ if (unlikely(iter->uptodate == BTREE_ITER_END))
return 0;
+ BUG_ON(iter->level >= BTREE_MAX_DEPTH);
+ BUG_ON(!iter->l[iter->level].b);
+
iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF;
/* make sure we have all the intent locks we need - ugh */
@@ -959,6 +1041,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
}
iter->uptodate = BTREE_ITER_NEED_PEEK;
+ bch2_btree_iter_verify_locks(iter);
return 0;
}
@@ -966,13 +1049,15 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
{
int ret;
- if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
+ if (__bch2_btree_iter_relock(iter))
return 0;
ret = __bch2_btree_iter_traverse(iter);
if (unlikely(ret))
ret = btree_iter_traverse_error(iter, ret);
+ BUG_ON(ret == -EINTR && !btree_iter_linked(iter));
+
return ret;
}
@@ -984,18 +1069,29 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
int ret;
EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
+ bch2_btree_iter_verify_locks(iter);
+
+ if (iter->uptodate == BTREE_ITER_UPTODATE)
+ return iter->l[iter->level].b;
+
+ if (unlikely(iter->uptodate == BTREE_ITER_END))
+ return NULL;
ret = bch2_btree_iter_traverse(iter);
if (ret)
return ERR_PTR(ret);
b = iter->l[iter->level].b;
-
- if (b) {
- EBUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
- iter->pos = b->key.k.p;
+ if (!b) {
+ btree_iter_set_end(iter);
+ return NULL;
}
+ BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
+
+ iter->pos = b->key.k.p;
+ iter->uptodate = BTREE_ITER_UPTODATE;
+
return b;
}
@@ -1005,24 +1101,39 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
int ret;
EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
+ bch2_btree_iter_verify_locks(iter);
btree_iter_up(iter);
- if (!btree_iter_node(iter, iter->level))
+ if (!btree_iter_node(iter, iter->level)) {
+ btree_iter_set_end(iter);
return NULL;
+ }
- /* parent node usually won't be locked: redo traversal if necessary */
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- return NULL;
+ if (!bch2_btree_node_relock(iter, iter->level)) {
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ return NULL;
+ }
b = iter->l[iter->level].b;
- if (!b)
- return b;
+ BUG_ON(!b);
if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
- /* Haven't gotten to the end of the parent node: */
+ /*
+ * Haven't gotten to the end of the parent node: go back down to
+ * the next child node
+ */
+
+ /*
+ * We don't really want to be unlocking here except we can't
+ * directly tell btree_iter_traverse() "traverse to this level"
+ * except by setting iter->level, so we have to unlock so we
+ * don't screw up our lock invariants:
+ */
+ if (btree_node_read_locked(iter, iter->level))
+ btree_node_unlock(iter, iter->level);
/* ick: */
iter->pos = iter->btree_id == BTREE_ID_INODES
@@ -1086,8 +1197,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
(iter->btree_id == BTREE_ID_EXTENTS));
EBUG_ON(iter->flags & BTREE_ITER_SLOTS);
- EBUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
- !btree_node_locked(iter, 0));
+ bch2_btree_iter_verify_locks(iter);
if (iter->uptodate == BTREE_ITER_UPTODATE) {
struct bkey_packed *k =
@@ -1117,7 +1227,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
/* got to the end of the leaf, iterator needs to be traversed: */
iter->pos = l->b->key.k.p;
if (!bkey_cmp(iter->pos, POS_MAX)) {
- iter->uptodate = BTREE_ITER_END;
+ btree_iter_set_end(iter);
return bkey_s_c_null;
}
@@ -1144,7 +1254,7 @@ struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter)
iter->pos = l->b->key.k.p;
if (!bkey_cmp(iter->pos, POS_MAX)) {
- iter->uptodate = BTREE_ITER_END;
+ btree_iter_set_end(iter);
return bkey_s_c_null;
}
@@ -1163,6 +1273,7 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
(iter->btree_id == BTREE_ID_EXTENTS));
EBUG_ON(iter->flags & BTREE_ITER_SLOTS);
+ bch2_btree_iter_verify_locks(iter);
if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
k = bch2_btree_iter_peek(iter);
@@ -1225,7 +1336,7 @@ recheck:
if (iter->flags & BTREE_ITER_IS_EXTENTS) {
if (n.p.offset == KEY_OFFSET_MAX) {
if (n.p.inode == KEY_INODE_MAX) {
- iter->uptodate = BTREE_ITER_END;
+ btree_iter_set_end(iter);
return bkey_s_c_null;
}
@@ -1259,8 +1370,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
(iter->btree_id == BTREE_ID_EXTENTS));
EBUG_ON(!(iter->flags & BTREE_ITER_SLOTS));
- EBUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
- !btree_node_locked(iter, 0));
+ bch2_btree_iter_verify_locks(iter);
if (iter->uptodate == BTREE_ITER_UPTODATE) {
struct bkey_s_c ret = { .k = &iter->k };
@@ -1286,6 +1396,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
{
+ EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
+ (iter->btree_id == BTREE_ID_EXTENTS));
+ EBUG_ON(!(iter->flags & BTREE_ITER_SLOTS));
+ bch2_btree_iter_verify_locks(iter);
+
iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
@@ -1347,13 +1462,11 @@ void bch2_btree_iter_unlink(struct btree_iter *iter)
if (!btree_iter_linked(iter))
return;
- for_each_linked_btree_iter(iter, linked) {
-
+ for_each_linked_btree_iter(iter, linked)
if (linked->next == iter) {
linked->next = iter->next;
return;
}
- }
BUG();
}
@@ -1366,9 +1479,9 @@ void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
iter->next = new;
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
- unsigned nr_iters = 1;
+ unsigned nr_iters = 0;
- for_each_linked_btree_iter(iter, new)
+ for_each_btree_iter(iter, new)
nr_iters++;
BUG_ON(nr_iters > SIX_LOCK_MAX_RECURSE);
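
Much of the btree_iter.c churn above funnels relocking and upgrading through one helper, btree_iter_get_locks(), which walks levels from iter->level up to locks_want and, when a level cannot be (re)locked, drops and invalidates every level at and below the failure so a later traverse walks back down from the failed node rather than resuming from a stale child. A sketch of that walk-back idiom over a toy per-level state array follows; the demo_* types and DEMO_MAX_DEPTH are hypothetical.

#include <stdbool.h>

#define DEMO_MAX_DEPTH 4

struct demo_iter {
        unsigned level;                     /* lowest level this iterator uses */
        unsigned locks_want;                /* want locks on [level, locks_want) */
        bool     locked[DEMO_MAX_DEPTH];
        bool     valid[DEMO_MAX_DEPTH];     /* node pointer still usable */
        bool     relockable[DEMO_MAX_DEPTH];/* stub for "lock seq still matches" */
};

static bool demo_relock_one(struct demo_iter *it, unsigned l)
{
        /* stand-in for six_relock_type()/six_lock_tryupgrade() */
        return it->relockable[l];
}

bool demo_get_locks(struct demo_iter *it)
{
        int fail_idx = -1;
        unsigned l = it->level;

        do {
                if (!it->valid[l])
                        break;
                if (demo_relock_one(it, l))
                        it->locked[l] = true;
                else
                        fail_idx = l;       /* highest level that failed so far */
                l++;
        } while (l < it->locks_want);

        /*
         * If any level failed, nothing at or below it may stay relockable:
         * a later traverse has to walk back down from the failed node, not
         * resume from a now-stale child.
         */
        for (; fail_idx >= 0; --fail_idx) {
                it->locked[fail_idx] = false;
                it->valid[fail_idx]  = false;
        }

        return it->locked[it->level];
}
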
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 0097a2a..99e51b2 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -28,40 +28,47 @@ static inline bool btree_iter_linked(const struct btree_iter *iter)
return iter->next != iter;
}
-/**
- * for_each_linked_btree_iter - iterate over all iterators linked with @_iter
- */
-#define for_each_linked_btree_iter(_iter, _linked) \
- for ((_linked) = (_iter)->next; \
- (_linked) != (_iter); \
- (_linked) = (_linked)->next)
+static inline bool __iter_has_node(const struct btree_iter *iter,
+ const struct btree *b)
+{
+ /*
+ * We don't compare the low bits of the lock sequence numbers because
+ * @iter might have taken a write lock on @b, and we don't want to skip
+ * the linked iterator if the sequence numbers were equal before taking
+ * that write lock. The lock sequence number is incremented by taking
+ * and releasing write locks and is even when unlocked:
+ */
+
+ return iter->l[b->level].b == b &&
+ iter->lock_seq[b->level] >> 1 == b->lock.state.seq >> 1;
+}
static inline struct btree_iter *
-__next_linked_btree_node(struct btree_iter *iter, struct btree *b,
- struct btree_iter *linked)
-{
- do {
- linked = linked->next;
-
- if (linked == iter)
- return NULL;
-
- /*
- * We don't compare the low bits of the lock sequence numbers
- * because @iter might have taken a write lock on @b, and we
- * don't want to skip the linked iterator if the sequence
- * numbers were equal before taking that write lock. The lock
- * sequence number is incremented by taking and releasing write
- * locks and is even when unlocked:
- */
- } while (linked->l[b->level].b != b ||
- linked->lock_seq[b->level] >> 1 != b->lock.state.seq >> 1);
+__next_linked_iter(struct btree_iter *iter, struct btree_iter *linked)
+{
+ return linked->next != iter ? linked->next : NULL;
+}
+
+static inline struct btree_iter *
+__next_iter_with_node(struct btree_iter *iter, struct btree *b,
+ struct btree_iter *linked)
+{
+ while (linked && !__iter_has_node(linked, b))
+ linked = __next_linked_iter(iter, linked);
return linked;
}
/**
- * for_each_linked_btree_node - iterate over all iterators linked with @_iter
+ * for_each_btree_iter - iterate over all iterators linked with @_iter,
+ * including @_iter
+ */
+#define for_each_btree_iter(_iter, _linked) \
+ for ((_linked) = (_iter); (_linked); \
+ (_linked) = __next_linked_iter(_iter, _linked))
+
+/**
+ * for_each_btree_iter_with_node - iterate over all iterators linked with @_iter
* that also point to @_b
*
* @_b is assumed to be locked by @_iter
@@ -69,15 +76,27 @@ __next_linked_btree_node(struct btree_iter *iter, struct btree *b,
* Filters out iterators that don't have a valid btree_node iterator for @_b -
* i.e. iterators for which bch2_btree_node_relock() would not succeed.
*/
-#define for_each_linked_btree_node(_iter, _b, _linked) \
+#define for_each_btree_iter_with_node(_iter, _b, _linked) \
for ((_linked) = (_iter); \
- ((_linked) = __next_linked_btree_node(_iter, _b, _linked));)
+ ((_linked) = __next_iter_with_node(_iter, _b, _linked)); \
+ (_linked) = __next_linked_iter(_iter, _linked))
+
+/**
+ * for_each_linked_btree_iter - iterate over all iterators linked with @_iter,
+ * _not_ including @_iter
+ */
+#define for_each_linked_btree_iter(_iter, _linked) \
+ for ((_linked) = (_iter)->next; \
+ (_linked) != (_iter); \
+ (_linked) = (_linked)->next)
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_btree_iter_verify(struct btree_iter *, struct btree *);
+void bch2_btree_iter_verify_locks(struct btree_iter *);
#else
static inline void bch2_btree_iter_verify(struct btree_iter *iter,
- struct btree *b) {}
+ struct btree *b) {}
+static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
#endif
void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
@@ -85,22 +104,28 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
struct bkey_packed *, unsigned, unsigned);
int bch2_btree_iter_unlock(struct btree_iter *);
-bool __bch2_btree_iter_set_locks_want(struct btree_iter *, unsigned);
-static inline bool bch2_btree_iter_set_locks_want(struct btree_iter *iter,
- unsigned new_locks_want)
+bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
+
+static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
+ unsigned new_locks_want)
{
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
- if (iter->locks_want == new_locks_want &&
- iter->nodes_intent_locked == (1 << new_locks_want) - 1)
- return true;
+ return iter->locks_want < new_locks_want
+ ? __bch2_btree_iter_upgrade(iter, new_locks_want)
+ : iter->uptodate <= BTREE_ITER_NEED_PEEK;
+}
+
+void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned);
- return __bch2_btree_iter_set_locks_want(iter, new_locks_want);
+static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
+{
+ if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0)
+ __bch2_btree_iter_downgrade(iter, 0);
}
-bool bch2_btree_iter_node_replace(struct btree_iter *, struct btree *);
-void bch2_btree_iter_node_drop_linked(struct btree_iter *, struct btree *);
+void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *);
void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *);
void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *);
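
The header now distinguishes three iteration macros: for_each_btree_iter (every iterator linked with @_iter, including @_iter itself), for_each_btree_iter_with_node (only those holding @_b), and for_each_linked_btree_iter (everything except @_iter). Linked iterators sit on a circular singly linked list through ->next, with a lone iterator pointing at itself. A small self-contained sketch of the list shape and of the including/excluding loops; the demo_* names are illustrative.

#include <stdio.h>

struct demo_iter {
        int id;
        struct demo_iter *next;   /* circular list; self-linked when alone */
};

/* every iterator linked with _it, including _it itself */
#define demo_for_each_iter(_it, _l)                                     \
        for ((_l) = (_it); (_l);                                        \
             (_l) = ((_l)->next != (_it) ? (_l)->next : NULL))

/* every iterator linked with _it, excluding _it itself */
#define demo_for_each_linked(_it, _l)                                   \
        for ((_l) = (_it)->next; (_l) != (_it); (_l) = (_l)->next)

int main(void)
{
        struct demo_iter a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
        struct demo_iter *l;

        /* link a, b, c into one circular list */
        a.next = &b; b.next = &c; c.next = &a;

        demo_for_each_iter(&a, l)
                printf("all: %d\n", l->id);      /* prints 1 2 3 */
        demo_for_each_linked(&a, l)
                printf("linked: %d\n", l->id);   /* prints 2 3 */
        return 0;
}
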
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index f48084b..1d97520 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -75,16 +75,23 @@ static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
mark_btree_node_locked(iter, level, SIX_LOCK_intent);
}
-static inline enum six_lock_type btree_lock_want(struct btree_iter *iter, int level)
+static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level)
{
return level < iter->locks_want
? SIX_LOCK_intent
: SIX_LOCK_read;
}
-static inline bool btree_want_intent(struct btree_iter *iter, int level)
+static inline enum btree_node_locked_type
+btree_lock_want(struct btree_iter *iter, int level)
{
- return btree_lock_want(iter, level) == SIX_LOCK_intent;
+ if (level < iter->level)
+ return BTREE_NODE_UNLOCKED;
+ if (level < iter->locks_want)
+ return BTREE_NODE_INTENT_LOCKED;
+ if (level == iter->level)
+ return BTREE_NODE_READ_LOCKED;
+ return BTREE_NODE_UNLOCKED;
}
static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
@@ -98,6 +105,14 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
mark_btree_node_unlocked(iter, level);
}
+static inline void __bch2_btree_iter_unlock(struct btree_iter *iter)
+{
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+
+ while (iter->nodes_locked)
+ btree_node_unlock(iter, __ffs(iter->nodes_locked));
+}
+
static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
{
switch (type) {
@@ -150,8 +165,11 @@ bool __bch2_btree_node_relock(struct btree_iter *, unsigned);
static inline bool bch2_btree_node_relock(struct btree_iter *iter,
unsigned level)
{
- return likely(btree_lock_want(iter, level) ==
- btree_node_locked_type(iter, level)) ||
+ EBUG_ON(btree_node_locked(iter, level) &&
+ btree_node_locked_type(iter, level) !=
+ __btree_lock_want(iter, level));
+
+ return likely(btree_node_locked(iter, level)) ||
__bch2_btree_node_relock(iter, level);
}
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index f357095..aac9795 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -85,31 +85,49 @@ int __bch2_btree_insert_at(struct btree_insert *);
__VA_ARGS__ \
}})
+enum {
+ __BTREE_INSERT_ATOMIC,
+ __BTREE_INSERT_NOUNLOCK,
+ __BTREE_INSERT_NOFAIL,
+ __BTREE_INSERT_USE_RESERVE,
+ __BTREE_INSERT_USE_ALLOC_RESERVE,
+ __BTREE_INSERT_JOURNAL_REPLAY,
+ __BTREE_INSERT_NOWAIT,
+ __BTREE_INSERT_GC_LOCK_HELD,
+ __BCH_HASH_SET_MUST_CREATE,
+ __BCH_HASH_SET_MUST_REPLACE,
+};
+
+/*
+ * Don't drop/retake locks before doing btree update, instead return -EINTR if
+ * we had to drop locks for any reason
+ */
+#define BTREE_INSERT_ATOMIC (1 << __BTREE_INSERT_ATOMIC)
+
/*
- * Don't drop/retake locks: instead return -EINTR if need to upgrade to intent
- * locks, -EAGAIN if need to wait on btree reserve
+ * Don't drop locks _after_ successfully updating btree:
*/
-#define BTREE_INSERT_ATOMIC (1 << 0)
+#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK)
/* Don't check for -ENOSPC: */
-#define BTREE_INSERT_NOFAIL (1 << 1)
+#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL)
/* for copygc, or when merging btree nodes */
-#define BTREE_INSERT_USE_RESERVE (1 << 2)
-#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << 3)
+#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE)
+#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
/*
* Insert is for journal replay: don't get journal reservations, or mark extents
* (bch_mark_key)
*/
-#define BTREE_INSERT_JOURNAL_REPLAY (1 << 4)
+#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
/* Don't block on allocation failure (for new btree nodes: */
-#define BTREE_INSERT_NOWAIT (1 << 5)
-#define BTREE_INSERT_GC_LOCK_HELD (1 << 6)
+#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT)
+#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD)
-#define BCH_HASH_SET_MUST_CREATE (1 << 7)
-#define BCH_HASH_SET_MUST_REPLACE (1 << 8)
+#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE)
+#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE)
int bch2_btree_delete_at(struct btree_iter *, unsigned);
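
The flag rework above replaces hand-numbered masks with an enum of bit indices plus (1 << __X) defines, so a new flag such as BTREE_INSERT_NOUNLOCK can be slotted into the middle without renumbering every later mask. A tiny sketch of the pattern; the DEMO_* names are made up.

enum {
        __DEMO_ATOMIC,
        __DEMO_NOUNLOCK,        /* new flags slot in without renumbering */
        __DEMO_NOFAIL,
};

#define DEMO_ATOMIC     (1 << __DEMO_ATOMIC)
#define DEMO_NOUNLOCK   (1 << __DEMO_NOUNLOCK)
#define DEMO_NOFAIL     (1 << __DEMO_NOFAIL)

int demo_insert(unsigned flags)
{
        if (flags & DEMO_NOUNLOCK)
                return -1;      /* e.g. refuse paths that must drop locks */
        return 0;
}

The bit-index enum also keeps the indices usable with test_bit()/__ffs() style helpers, which plain mask defines do not.
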
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 92e19c4..3e13f78 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -223,8 +223,7 @@ found:
mutex_unlock(&c->btree_interior_update_lock);
}
-static void __btree_node_free(struct bch_fs *c, struct btree *b,
- struct btree_iter *iter)
+static void __btree_node_free(struct bch_fs *c, struct btree *b)
{
trace_btree_node_free(c, b);
@@ -237,21 +236,11 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
clear_btree_node_noevict(b);
- btree_node_lock_type(c, b, SIX_LOCK_write);
-
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_lock(&c->btree_cache.lock);
list_move(&b->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
-
- /*
- * By using six_unlock_write() directly instead of
- * bch2_btree_node_unlock_write(), we don't update the iterator's
- * sequence numbers and cause future bch2_btree_node_relock() calls to
- * fail:
- */
- six_unlock_write(&b->lock);
}
void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
@@ -264,7 +253,9 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
clear_btree_node_dirty(b);
- __btree_node_free(c, b, NULL);
+ btree_node_lock_type(c, b, SIX_LOCK_write);
+ __btree_node_free(c, b);
+ six_unlock_write(&b->lock);
bch2_open_bucket_put_refs(c, &ob.nr, ob.refs);
}
@@ -283,9 +274,9 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
*/
btree_update_drop_new_node(c, b);
- bch2_btree_iter_node_drop_linked(iter, b);
-
- __btree_node_free(c, b, iter);
+ __bch2_btree_node_lock_write(b, iter);
+ __btree_node_free(c, b);
+ six_unlock_write(&b->lock);
bch2_btree_iter_node_drop(iter, b);
}
@@ -499,7 +490,9 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser
bch2_btree_open_bucket_put(c, b);
}
- __btree_node_free(c, b, NULL);
+ btree_node_lock_type(c, b, SIX_LOCK_write);
+ __btree_node_free(c, b);
+ six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
}
@@ -1362,7 +1355,8 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
}
static void btree_split(struct btree_update *as, struct btree *b,
- struct btree_iter *iter, struct keylist *keys)
+ struct btree_iter *iter, struct keylist *keys,
+ unsigned flags)
{
struct bch_fs *c = as->c;
struct btree *parent = btree_node_parent(iter, b);
@@ -1425,7 +1419,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
if (parent) {
/* Split a non root node */
- bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
} else if (n3) {
bch2_btree_set_root(as, n3, iter);
} else {
@@ -1491,9 +1485,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
btree_update_updated_node(as, b);
- for_each_linked_btree_node(iter, b, linked)
+ for_each_btree_iter_with_node(iter, b, linked)
bch2_btree_node_iter_peek(&linked->l[b->level].iter, b);
- bch2_btree_node_iter_peek(&iter->l[b->level].iter, b);
bch2_btree_iter_verify(iter, b);
}
@@ -1511,7 +1504,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
* for leaf nodes -- inserts into interior nodes have to be atomic.
*/
void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
- struct btree_iter *iter, struct keylist *keys)
+ struct btree_iter *iter, struct keylist *keys,
+ unsigned flags)
{
struct bch_fs *c = as->c;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
@@ -1551,14 +1545,14 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
btree_node_interior_verify(b);
- bch2_foreground_maybe_merge(c, iter, b->level);
+ bch2_foreground_maybe_merge(c, iter, b->level, flags);
return;
split:
- btree_split(as, b, iter, keys);
+ btree_split(as, b, iter, keys, flags);
}
int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
- unsigned btree_reserve_flags)
+ unsigned flags)
{
struct btree *b = iter->l[0].b;
struct btree_update *as;
@@ -1570,16 +1564,17 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
* We already have a disk reservation and open buckets pinned; this
* allocation must not block:
*/
- for_each_linked_btree_iter(iter, linked)
+ for_each_btree_iter(iter, linked)
if (linked->btree_id == BTREE_ID_EXTENTS)
- btree_reserve_flags |= BTREE_INSERT_USE_RESERVE;
- if (iter->btree_id == BTREE_ID_EXTENTS)
- btree_reserve_flags |= BTREE_INSERT_USE_RESERVE;
+ flags |= BTREE_INSERT_USE_RESERVE;
closure_init_stack(&cl);
/* Hack, because gc and splitting nodes doesn't mix yet: */
if (!down_read_trylock(&c->gc_lock)) {
+ if (flags & BTREE_INSERT_NOUNLOCK)
+ return -EINTR;
+
bch2_btree_iter_unlock(iter);
down_read(&c->gc_lock);
@@ -1591,39 +1586,43 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
* XXX: figure out how far we might need to split,
* instead of locking/reserving all the way to the root:
*/
- if (!bch2_btree_iter_set_locks_want(iter, U8_MAX)) {
+ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
ret = -EINTR;
goto out;
}
as = bch2_btree_update_start(c, iter->btree_id,
- btree_update_reserve_required(c, b),
- btree_reserve_flags, &cl);
+ btree_update_reserve_required(c, b), flags,
+ !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
if (IS_ERR(as)) {
ret = PTR_ERR(as);
if (ret == -EAGAIN) {
+ BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
bch2_btree_iter_unlock(iter);
- up_read(&c->gc_lock);
- closure_sync(&cl);
- return -EINTR;
+ ret = -EINTR;
}
goto out;
}
- btree_split(as, b, iter, NULL);
+ btree_split(as, b, iter, NULL, flags);
bch2_btree_update_done(as);
- bch2_btree_iter_set_locks_want(iter, 1);
+ /*
+ * We haven't successfully inserted yet, so don't downgrade all the way
+ * back to read locks;
+ */
+ __bch2_btree_iter_downgrade(iter, 1);
out:
up_read(&c->gc_lock);
closure_sync(&cl);
return ret;
}
-int __bch2_foreground_maybe_merge(struct bch_fs *c,
- struct btree_iter *iter,
- unsigned level,
- enum btree_node_sibling sib)
+void __bch2_foreground_maybe_merge(struct bch_fs *c,
+ struct btree_iter *iter,
+ unsigned level,
+ unsigned flags,
+ enum btree_node_sibling sib)
{
struct btree_update *as;
struct bkey_format_state new_s;
@@ -1636,29 +1635,29 @@ int __bch2_foreground_maybe_merge(struct bch_fs *c,
closure_init_stack(&cl);
retry:
- if (!bch2_btree_node_relock(iter, level))
- return 0;
+ BUG_ON(!btree_node_locked(iter, level));
b = iter->l[level].b;
parent = btree_node_parent(iter, b);
if (!parent)
- return 0;
+ goto out;
if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
- return 0;
+ goto out;
/* XXX: can't be holding read locks */
- m = bch2_btree_node_get_sibling(c, iter, b, sib);
+ m = bch2_btree_node_get_sibling(c, iter, b,
+ !(flags & BTREE_INSERT_NOUNLOCK), sib);
if (IS_ERR(m)) {
ret = PTR_ERR(m);
- goto out;
+ goto err;
}
/* NULL means no sibling: */
if (!m) {
b->sib_u64s[sib] = U16_MAX;
- return 0;
+ goto out;
}
if (sib == btree_prev_sib) {
@@ -1688,33 +1687,26 @@ retry:
if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
six_unlock_intent(&m->lock);
- return 0;
+ goto out;
}
/* We're changing btree topology, doesn't mix with gc: */
- if (!down_read_trylock(&c->gc_lock)) {
- six_unlock_intent(&m->lock);
- bch2_btree_iter_unlock(iter);
+ if (!down_read_trylock(&c->gc_lock))
+ goto err_cycle_gc_lock;
- down_read(&c->gc_lock);
- up_read(&c->gc_lock);
+ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
ret = -EINTR;
- goto out;
- }
-
- if (!bch2_btree_iter_set_locks_want(iter, U8_MAX)) {
- ret = -EINTR;
- goto out_unlock;
+ goto err_unlock;
}
as = bch2_btree_update_start(c, iter->btree_id,
btree_update_reserve_required(c, parent) + 1,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE,
- &cl);
+ !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
if (IS_ERR(as)) {
ret = PTR_ERR(as);
- goto out_unlock;
+ goto err_unlock;
}
trace_btree_merge(c, b);
@@ -1744,7 +1736,7 @@ retry:
bch2_btree_node_write(c, n, SIX_LOCK_intent);
- bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
bch2_btree_open_bucket_put(c, n);
bch2_btree_node_free_inmem(c, b, iter);
@@ -1754,26 +1746,53 @@ retry:
bch2_btree_iter_verify(iter, n);
bch2_btree_update_done(as);
-out_unlock:
- if (ret != -EINTR && ret != -EAGAIN)
- bch2_btree_iter_set_locks_want(iter, 1);
+
six_unlock_intent(&m->lock);
up_read(&c->gc_lock);
out:
- if (ret == -EAGAIN || ret == -EINTR) {
- bch2_btree_iter_unlock(iter);
- ret = -EINTR;
- }
-
+ /*
+ * Don't downgrade locks here: we're called after successful insert,
+ * and the caller will downgrade locks after a successful insert
+ * anyways (in case e.g. a split was required first)
+ *
+ * And we're also called when inserting into interior nodes in the
+ * split path, and downgrading to read locks in there is potentially
+ * confusing:
+ */
closure_sync(&cl);
+ return;
+
+err_cycle_gc_lock:
+ six_unlock_intent(&m->lock);
+
+ if (flags & BTREE_INSERT_NOUNLOCK)
+ goto out;
+
+ bch2_btree_iter_unlock(iter);
+
+ down_read(&c->gc_lock);
+ up_read(&c->gc_lock);
+ ret = -EINTR;
+ goto err;
+
+err_unlock:
+ six_unlock_intent(&m->lock);
+ up_read(&c->gc_lock);
+err:
+ BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
- if (ret == -EINTR) {
+ if ((ret == -EAGAIN || ret == -EINTR) &&
+ !(flags & BTREE_INSERT_NOUNLOCK)) {
+ bch2_btree_iter_unlock(iter);
+ closure_sync(&cl);
ret = bch2_btree_iter_traverse(iter);
- if (!ret)
- goto retry;
+ if (ret)
+ goto out;
+
+ goto retry;
}
- return ret;
+ goto out;
}
static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
@@ -1806,7 +1825,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
- bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
} else {
bch2_btree_set_root(as, n, iter);
}
@@ -1815,7 +1834,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_node_free_inmem(c, b, iter);
- BUG_ON(!bch2_btree_iter_node_replace(iter, n));
+ bch2_btree_iter_node_replace(iter, n);
bch2_btree_update_done(as);
return 0;
@@ -1830,7 +1849,6 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
__le64 seq, unsigned flags)
{
- unsigned locks_want = iter->locks_want;
struct closure cl;
struct btree *b;
int ret;
@@ -1839,7 +1857,7 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
closure_init_stack(&cl);
- bch2_btree_iter_set_locks_want(iter, U8_MAX);
+ bch2_btree_iter_upgrade(iter, U8_MAX);
if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) {
if (!down_read_trylock(&c->gc_lock)) {
@@ -1866,7 +1884,7 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
closure_sync(&cl);
}
- bch2_btree_iter_set_locks_want(iter, locks_want);
+ bch2_btree_iter_downgrade(iter);
if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
up_read(&c->gc_lock);
@@ -1920,7 +1938,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
}
bch2_keylist_add(&as->parent_keys, &new_key->k_i);
- bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0);
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
@@ -1982,6 +2000,9 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
closure_init_stack(&cl);
+ if (!bch2_btree_iter_upgrade(iter, U8_MAX))
+ return -EINTR;
+
if (!down_read_trylock(&c->gc_lock)) {
bch2_btree_iter_unlock(iter);
down_read(&c->gc_lock);
@@ -2041,6 +2062,8 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
goto err_free_update;
__bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
+
+ bch2_btree_iter_downgrade(iter);
err:
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index abf14e4..3a17de5 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -146,35 +146,51 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *,
struct btree *);
void bch2_btree_insert_node(struct btree_update *, struct btree *,
- struct btree_iter *, struct keylist *);
+ struct btree_iter *, struct keylist *,
+ unsigned);
int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned);
-int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
- unsigned, enum btree_node_sibling);
+void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
+ unsigned, unsigned, enum btree_node_sibling);
-static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
+static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
struct btree_iter *iter,
- unsigned level,
+ unsigned level, unsigned flags,
enum btree_node_sibling sib)
{
struct btree *b;
+ /*
+ * iterators are inconsistent when they hit end of leaf, until
+ * traversed again
+ *
+ * XXX inconsistent how?
+ */
+ if (iter->flags & BTREE_ITER_AT_END_OF_LEAF)
+ return;
+
+ if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
+ return;
+
if (!bch2_btree_node_relock(iter, level))
- return 0;
+ return;
b = iter->l[level].b;
if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
- return 0;
+ return;
- return __bch2_foreground_maybe_merge(c, iter, level, sib);
+ __bch2_foreground_maybe_merge(c, iter, level, flags, sib);
}
static inline void bch2_foreground_maybe_merge(struct bch_fs *c,
struct btree_iter *iter,
- unsigned level)
+ unsigned level,
+ unsigned flags)
{
- bch2_foreground_maybe_merge_sibling(c, iter, level, btree_prev_sib);
- bch2_foreground_maybe_merge_sibling(c, iter, level, btree_next_sib);
+ bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
+ btree_prev_sib);
+ bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
+ btree_next_sib);
}
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index cc41140..a62d830 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -227,19 +227,36 @@ btree_insert_key_leaf(struct btree_insert *trans,
return ret;
}
+#define trans_for_each_entry(trans, i) \
+ for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
+
+/*
+ * We sort transaction entries so that if multiple iterators point to the same
+ * leaf node they'll be adjacent:
+ */
static bool same_leaf_as_prev(struct btree_insert *trans,
struct btree_insert_entry *i)
{
- /*
- * Because we sorted the transaction entries, if multiple iterators
- * point to the same leaf node they'll always be adjacent now:
- */
return i != trans->entries &&
i[0].iter->l[0].b == i[-1].iter->l[0].b;
}
-#define trans_for_each_entry(trans, i) \
- for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
+static inline struct btree_insert_entry *trans_next_leaf(struct btree_insert *trans,
+ struct btree_insert_entry *i)
+{
+ struct btree *b = i->iter->l[0].b;
+
+ do {
+ i++;
+ } while (i < trans->entries + trans->nr && b == i->iter->l[0].b);
+
+ return i;
+}
+
+#define trans_for_each_leaf(trans, i) \
+ for ((i) = (trans)->entries; \
+ (i) < (trans)->entries + (trans)->nr; \
+ (i) = trans_next_leaf(trans, i))
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
struct btree_iter *iter)
@@ -262,19 +279,16 @@ static void multi_lock_write(struct bch_fs *c, struct btree_insert *trans)
{
struct btree_insert_entry *i;
- trans_for_each_entry(trans, i)
- if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_lock_for_insert(c, i->iter->l[0].b,
- i->iter);
+ trans_for_each_leaf(trans, i)
+ bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter);
}
static void multi_unlock_write(struct btree_insert *trans)
{
struct btree_insert_entry *i;
- trans_for_each_entry(trans, i)
- if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
+ trans_for_each_leaf(trans, i)
+ bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
}
static inline int btree_trans_cmp(struct btree_insert_entry l,
@@ -285,56 +299,24 @@ static inline int btree_trans_cmp(struct btree_insert_entry l,
/* Normal update interface: */
-/**
- * __bch_btree_insert_at - insert keys at given iterator positions
- *
- * This is main entry point for btree updates.
- *
- * Return values:
- * -EINTR: locking changed, this function should be called again. Only returned
- * if passed BTREE_INSERT_ATOMIC.
- * -EROFS: filesystem read only
- * -EIO: journal or btree node IO error
+/*
+ * Get journal reservation, take write locks, and attempt to do btree update(s):
*/
-int __bch2_btree_insert_at(struct btree_insert *trans)
+static inline int do_btree_insert_at(struct btree_insert *trans,
+ struct btree_iter **split,
+ bool *cycle_gc_lock)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
- struct btree_iter *split = NULL;
- bool cycle_gc_lock = false;
unsigned u64s;
int ret;
- trans_for_each_entry(trans, i) {
- BUG_ON(i->iter->level);
- BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
- BUG_ON(debug_check_bkeys(c) &&
- bch2_bkey_invalid(c, i->iter->btree_id,
- bkey_i_to_s_c(i->k)));
- }
-
- bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
-
- if (unlikely(!percpu_ref_tryget(&c->writes)))
- return -EROFS;
-retry_locks:
- ret = -EINTR;
- trans_for_each_entry(trans, i) {
- if (!bch2_btree_iter_set_locks_want(i->iter, 1))
- goto err;
+ trans_for_each_entry(trans, i)
+ BUG_ON(i->done);
- if (i->iter->uptodate == BTREE_ITER_NEED_TRAVERSE) {
- ret = bch2_btree_iter_traverse(i->iter);
- if (ret)
- goto err;
- }
- }
-retry:
- trans->did_work = false;
u64s = 0;
trans_for_each_entry(trans, i)
- if (!i->done)
- u64s += jset_u64s(i->k->k.u64s + i->extra_res);
+ u64s += jset_u64s(i->k->k.u64s + i->extra_res);
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
@@ -344,13 +326,13 @@ retry:
u64s, u64s)
: 0;
if (ret)
- goto err;
+ return ret;
multi_lock_write(c, trans);
if (race_fault()) {
ret = -EINTR;
- goto unlock;
+ goto out;
}
u64s = 0;
@@ -365,129 +347,210 @@ retry:
* bch2_btree_node_write(), converting an unwritten bset to a
* written one
*/
- if (!i->done) {
- u64s += i->k->k.u64s + i->extra_res;
- if (!bch2_btree_node_insert_fits(c,
- i->iter->l[0].b, u64s)) {
- split = i->iter;
- goto unlock;
- }
+ u64s += i->k->k.u64s + i->extra_res;
+ if (!bch2_btree_node_insert_fits(c,
+ i->iter->l[0].b, u64s)) {
+ ret = -EINTR;
+ *split = i->iter;
+ goto out;
}
}
- ret = 0;
- split = NULL;
- cycle_gc_lock = false;
+ if (journal_seq_verify(c) &&
+ !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+ trans_for_each_entry(trans, i)
+ i->k->k.version.lo = trans->journal_res.seq;
trans_for_each_entry(trans, i) {
- if (i->done)
- continue;
-
switch (btree_insert_key_leaf(trans, i)) {
case BTREE_INSERT_OK:
i->done = true;
break;
case BTREE_INSERT_JOURNAL_RES_FULL:
case BTREE_INSERT_NEED_TRAVERSE:
- ret = -EINTR;
- break;
case BTREE_INSERT_NEED_RESCHED:
- ret = -EAGAIN;
+ ret = -EINTR;
break;
case BTREE_INSERT_BTREE_NODE_FULL:
- split = i->iter;
+ ret = -EINTR;
+ *split = i->iter;
break;
case BTREE_INSERT_ENOSPC:
ret = -ENOSPC;
break;
case BTREE_INSERT_NEED_GC_LOCK:
- cycle_gc_lock = true;
ret = -EINTR;
+ *cycle_gc_lock = true;
break;
default:
BUG();
}
- if (!trans->did_work && (ret || split))
+ /*
+ * If we did some work (i.e. inserted part of an extent),
+ * we have to do all the other updates as well:
+ */
+ if (!trans->did_work && (ret || *split))
break;
}
-unlock:
+out:
multi_unlock_write(trans);
bch2_journal_res_put(&c->journal, &trans->journal_res);
- if (split)
- goto split;
- if (ret)
- goto err;
+ return ret;
+}
- trans_for_each_entry(trans, i)
- if (i->iter->flags & BTREE_ITER_AT_END_OF_LEAF)
- goto out;
+/**
+ * __bch2_btree_insert_at - insert keys at given iterator positions
+ *
+ * This is the main entry point for btree updates.
+ *
+ * Return values:
+ * -EINTR: locking changed, this function should be called again. Only returned
+ * if passed BTREE_INSERT_ATOMIC.
+ * -EROFS: filesystem read only
+ * -EIO: journal or btree node IO error
+ */
+int __bch2_btree_insert_at(struct btree_insert *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i;
+ struct btree_iter *linked, *split = NULL;
+ bool cycle_gc_lock = false;
+ unsigned flags;
+ int ret;
+
+ for_each_btree_iter(trans->entries[0].iter, linked)
+ bch2_btree_iter_verify_locks(linked);
+
+ /* for the sake of sanity: */
+ BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
trans_for_each_entry(trans, i) {
- /*
- * iterators are inconsistent when they hit end of leaf, until
- * traversed again
- */
- if (i->iter->uptodate < BTREE_ITER_NEED_TRAVERSE &&
- !same_leaf_as_prev(trans, i))
- bch2_foreground_maybe_merge(c, i->iter, 0);
+ BUG_ON(i->iter->level);
+ BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
+ BUG_ON(debug_check_bkeys(c) &&
+ bch2_bkey_invalid(c, i->iter->btree_id,
+ bkey_i_to_s_c(i->k)));
+ BUG_ON(i->iter->uptodate == BTREE_ITER_END);
}
-out:
- /* make sure we didn't lose an error: */
- if (!ret && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- trans_for_each_entry(trans, i)
- BUG_ON(!i->done);
+ bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
+
+ if (unlikely(!percpu_ref_tryget(&c->writes)))
+ return -EROFS;
+retry:
+ split = NULL;
+ cycle_gc_lock = false;
+
+ trans_for_each_entry(trans, i) {
+ if (!bch2_btree_iter_upgrade(i->iter, 1)) {
+ ret = -EINTR;
+ goto err;
+ }
+
+ if (i->iter->flags & BTREE_ITER_ERROR) {
+ ret = -EIO;
+ goto err;
+ }
+ }
+
+ ret = do_btree_insert_at(trans, &split, &cycle_gc_lock);
+ if (unlikely(ret))
+ goto err;
+
+ trans_for_each_leaf(trans, i)
+ bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags);
+
+ trans_for_each_entry(trans, i)
+ bch2_btree_iter_downgrade(i->iter);
+out:
percpu_ref_put(&c->writes);
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+ /* make sure we didn't drop or screw up locks: */
+ for_each_btree_iter(trans->entries[0].iter, linked) {
+ bch2_btree_iter_verify_locks(linked);
+ BUG_ON((trans->flags & BTREE_INSERT_NOUNLOCK) &&
+ trans->did_work &&
+ linked->uptodate >= BTREE_ITER_NEED_RELOCK);
+ }
+
+ /* make sure we didn't lose an error: */
+ if (!ret)
+ trans_for_each_entry(trans, i)
+ BUG_ON(!i->done);
+ }
+
+ BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
+
return ret;
-split:
- /*
- * have to drop journal res before splitting, because splitting means
- * allocating new btree nodes, and holding a journal reservation
- * potentially blocks the allocator:
- */
- ret = bch2_btree_split_leaf(c, split, trans->flags);
+err:
+ flags = trans->flags;
/*
- * This can happen when we insert part of an extent - with an update
- * with multiple keys, we don't want to redo the entire update - that's
- * just too confusing:
+ * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree
+ * update; if we haven't done anything yet it doesn't apply
*/
- if (!ret &&
- (trans->flags & BTREE_INSERT_ATOMIC) &&
- trans->did_work)
- ret = -EINTR;
+ if (!trans->did_work)
+ flags &= ~BTREE_INSERT_NOUNLOCK;
- if (ret)
- goto err;
+ if (split) {
+ ret = bch2_btree_split_leaf(c, split, flags);
+
+ /*
+ * if the split succeeded without dropping locks the insert will
+ * still be atomic (in the BTREE_INSERT_ATOMIC sense, what the
+ * caller peeked() and is overwriting won't have changed)
+ */
+#if 0
+ /*
+ * XXX:
+ * split -> btree node merging (of parent node) might still drop
+ * locks when we're not passing it BTREE_INSERT_NOUNLOCK
+ */
+ if (!ret && !trans->did_work)
+ goto retry;
+#endif
+
+ /*
+ * don't care if we got ENOSPC because we told split it
+ * couldn't block:
+ */
+ if (!ret || (flags & BTREE_INSERT_NOUNLOCK))
+ ret = -EINTR;
+ }
- /*
- * if the split didn't have to drop locks the insert will still be
- * atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked()
- * and is overwriting won't have changed)
- */
- goto retry_locks;
-err:
if (cycle_gc_lock) {
- down_read(&c->gc_lock);
+ if (!down_read_trylock(&c->gc_lock)) {
+ if (flags & BTREE_INSERT_NOUNLOCK)
+ goto out;
+
+ bch2_btree_iter_unlock(trans->entries[0].iter);
+ down_read(&c->gc_lock);
+ }
up_read(&c->gc_lock);
}
if (ret == -EINTR) {
+ if (flags & BTREE_INSERT_NOUNLOCK)
+ goto out;
+
trans_for_each_entry(trans, i) {
int ret2 = bch2_btree_iter_traverse(i->iter);
if (ret2) {
ret = ret2;
goto out;
}
+
+ BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK);
}
/*
* BTREE_INSERT_ATOMIC means we have to return -EINTR if we
* dropped locks:
*/
- if (!(trans->flags & BTREE_INSERT_ATOMIC))
+ if (!(flags & BTREE_INSERT_ATOMIC))
goto retry;
}
@@ -549,7 +612,7 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags,
- BTREE_INSERT_ENTRY(&iter, k));
+ BTREE_INSERT_ENTRY(&iter, k));
bch2_btree_iter_unlock(&iter);
return ret;
@@ -584,6 +647,11 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
if (bkey_cmp(iter.pos, end) >= 0)
break;
+ if (k.k->type == KEY_TYPE_DISCARD) {
+ bch2_btree_iter_next(&iter);
+ continue;
+ }
+
bkey_init(&delete.k);
/*
@@ -615,8 +683,8 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
}
ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq,
- BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(&iter, &delete));
+ BTREE_INSERT_NOFAIL,
+ BTREE_INSERT_ENTRY(&iter, &delete));
if (ret)
break;
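
The trans_for_each_leaf iterator introduced above depends on the bubble_sort() by btree_trans_cmp having placed entries that share a leaf node next to each other; each step then skips past every entry pointing at the same leaf, so multi_lock_write()/multi_unlock_write() take each leaf's write lock exactly once. A minimal standalone sketch of that skip-adjacent-duplicates pattern, using a hypothetical entry type rather than the real btree_insert_entry:

#include <stdio.h>

/* hypothetical stand-ins for btree_insert_entry and its leaf pointer */
struct entry { int leaf; };

/* advance to the first entry that points at a different leaf */
static struct entry *next_leaf(struct entry *i, struct entry *end)
{
	int leaf = i->leaf;

	do {
		i++;
	} while (i < end && i->leaf == leaf);

	return i;
}

int main(void)
{
	/* already sorted, so entries on the same leaf are adjacent */
	struct entry e[] = { {1}, {1}, {2}, {3}, {3}, {3} };
	struct entry *end = e + sizeof(e) / sizeof(e[0]);
	struct entry *i;

	for (i = e; i < end; i = next_leaf(i, end))
		printf("lock and unlock leaf %d once\n", i->leaf);

	return 0;
}

The same do/while shape appears in trans_next_leaf() above, with the leaf identified by i->iter->l[0].b instead of an integer.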
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index b17189e..4311244 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -358,8 +358,9 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
old.data_type != new.data_type) {
BUG_ON(!c);
bch2_fs_inconsistent(c,
- "different types of data in same bucket: %u, %u",
- old.data_type, new.data_type);
+ "different types of data in same bucket: %s, %s",
+ bch2_data_types[old.data_type],
+ bch2_data_types[new.data_type]);
}
dev_usage = this_cpu_ptr(ca->usage_percpu);
diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h
index 2690cc4..031b36f 100644
--- a/libbcachefs/checksum.h
+++ b/libbcachefs/checksum.h
@@ -109,14 +109,6 @@ static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
return true;
}
-static const unsigned bch_crc_bytes[] = {
- [BCH_CSUM_NONE] = 0,
- [BCH_CSUM_CRC32C] = 4,
- [BCH_CSUM_CRC64] = 8,
- [BCH_CSUM_CHACHA20_POLY1305_80] = 10,
- [BCH_CSUM_CHACHA20_POLY1305_128] = 16,
-};
-
/* returns true if not equal */
static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
{
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index df9913f..36dca6b 100644
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -12,7 +12,8 @@
unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
{
- unsigned len = bkey_val_bytes(d.k) - sizeof(struct bch_dirent);
+ unsigned len = bkey_val_bytes(d.k) -
+ offsetof(struct bch_dirent, d_name);
while (len && !d.v->d_name[len - 1])
--len;
@@ -22,7 +23,8 @@ unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
static unsigned dirent_val_u64s(unsigned len)
{
- return DIV_ROUND_UP(sizeof(struct bch_dirent) + len, sizeof(u64));
+ return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len,
+ sizeof(u64));
}
static u64 bch2_dirent_hash(const struct bch_hash_info *info,
@@ -98,7 +100,7 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
if (bkey_val_u64s(k.k) > dirent_val_u64s(len))
return "value too big";
- if (len > NAME_MAX)
+ if (len > BCH_NAME_MAX)
return "dirent name too big";
if (memchr(d.v->d_name, '/', len))
@@ -141,9 +143,14 @@ static struct bkey_i_dirent *dirent_create_key(u8 type,
struct bkey_i_dirent *dirent;
unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
+ if (name->len > BCH_NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ BUG_ON(u64s > U8_MAX);
+
dirent = kmalloc(u64s * sizeof(u64), GFP_NOFS);
if (!dirent)
- return NULL;
+ return ERR_PTR(-ENOMEM);
bkey_dirent_init(&dirent->k_i);
dirent->k.u64s = u64s;
@@ -153,7 +160,8 @@ static struct bkey_i_dirent *dirent_create_key(u8 type,
memcpy(dirent->v.d_name, name->name, name->len);
memset(dirent->v.d_name + name->len, 0,
bkey_val_bytes(&dirent->k) -
- (sizeof(struct bch_dirent) + name->len));
+ offsetof(struct bch_dirent, d_name) -
+ name->len);
EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
@@ -169,8 +177,8 @@ int bch2_dirent_create(struct bch_fs *c, u64 dir_inum,
int ret;
dirent = dirent_create_key(type, name, dst_inum);
- if (!dirent)
- return -ENOMEM;
+ if (IS_ERR(dirent))
+ return PTR_ERR(dirent);
ret = bch2_hash_set(bch2_dirent_hash_desc, hash_info, c, dir_inum,
journal_seq, &dirent->k_i, flags);
@@ -204,7 +212,7 @@ int bch2_dirent_rename(struct bch_fs *c,
struct bpos src_pos = bch2_dirent_pos(src_dir, src_name);
struct bpos dst_pos = bch2_dirent_pos(dst_dir, dst_name);
bool need_whiteout;
- int ret = -ENOMEM;
+ int ret;
bch2_btree_iter_init(&src_iter, c, BTREE_ID_DIRENTS, src_pos,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
@@ -218,15 +226,19 @@ int bch2_dirent_rename(struct bch_fs *c,
if (mode == BCH_RENAME_EXCHANGE) {
new_src = dirent_create_key(0, src_name, 0);
- if (!new_src)
+ if (IS_ERR(new_src)) {
+ ret = PTR_ERR(new_src);
goto err;
+ }
} else {
new_src = (void *) &delete;
}
new_dst = dirent_create_key(0, dst_name, 0);
- if (!new_dst)
+ if (IS_ERR(new_dst)) {
+ ret = PTR_ERR(new_dst);
goto err;
+ }
retry:
/*
* Note that on -EINTR/dropped locks we're not restarting the lookup
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index d7b1719..737b9be 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -257,12 +257,12 @@ static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h)
int ret;
mutex_lock(&h->inode->ei_update_lock);
- if (h->new_i_size != U64_MAX)
- i_size_write(&h->inode->v, h->new_i_size);
-
i_sectors_acct(c, h->inode, &h->quota_res, h->sectors);
ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h);
+
+ if (!ret && h->new_i_size != U64_MAX)
+ i_size_write(&h->inode->v, h->new_i_size);
mutex_unlock(&h->inode->ei_update_lock);
bch2_quota_reservation_put(c, h->inode, &h->quota_res);
@@ -348,17 +348,25 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
return BTREE_INSERT_NEED_TRAVERSE;
}
- BUG_ON(h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY);
+ /* truncate in progress? */
+ if (h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)
+ goto no_i_size_update;
h->inode_u.bi_size = offset;
do_pack = true;
inode->ei_inode.bi_size = offset;
- if (h->op->is_dio)
- i_size_write(&inode->v, offset);
+ spin_lock(&inode->v.i_lock);
+ if (offset > inode->v.i_size) {
+ if (h->op->is_dio)
+ i_size_write(&inode->v, offset);
+ else
+ BUG();
+ }
+ spin_unlock(&inode->v.i_lock);
}
-
+no_i_size_update:
if (sectors) {
if (!h->need_inode_update) {
h->need_inode_update = true;
@@ -1457,8 +1465,10 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
copied = 0;
}
+ spin_lock(&inode->v.i_lock);
if (pos + copied > inode->v.i_size)
i_size_write(&inode->v, pos + copied);
+ spin_unlock(&inode->v.i_lock);
if (copied) {
if (!PageUptodate(page))
@@ -1563,8 +1573,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE);
inode->ei_last_dirtied = (unsigned long) current;
+ spin_lock(&inode->v.i_lock);
if (pos + copied > inode->v.i_size)
i_size_write(&inode->v, pos + copied);
+ spin_unlock(&inode->v.i_lock);
if (copied < len &&
((offset + copied) & (PAGE_SIZE - 1))) {
@@ -2047,10 +2059,17 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
int ret;
- ret = filemap_write_and_wait_range(inode->v.i_mapping, start, end);
+ ret = file_write_and_wait_range(file, start, end);
if (ret)
return ret;
+ if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC))
+ goto out;
+
+ ret = sync_inode_metadata(&inode->v, 1);
+ if (ret)
+ return ret;
+out:
if (c->opts.journal_flush_disabled)
return 0;
@@ -2149,25 +2168,61 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
from, from + PAGE_SIZE);
}
+static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct address_space *mapping = inode->v.i_mapping;
+ int ret;
+
+ ret = filemap_write_and_wait_range(mapping,
+ inode->ei_inode.bi_size, S64_MAX);
+ if (ret)
+ return ret;
+
+ truncate_setsize(&inode->v, iattr->ia_size);
+ setattr_copy(&inode->v, iattr);
+
+ mutex_lock(&inode->ei_update_lock);
+ inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v);
+ ret = bch2_write_inode_size(c, inode, inode->v.i_size);
+ mutex_unlock(&inode->ei_update_lock);
+
+ return ret;
+}
+
int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
- bool shrink = iattr->ia_size <= inode->v.i_size;
struct i_sectors_hook i_sectors_hook =
i_sectors_hook_init(inode, BCH_INODE_I_SIZE_DIRTY);
+ bool shrink;
int ret = 0;
inode_dio_wait(&inode->v);
pagecache_block_get(&mapping->add_lock);
- truncate_setsize(&inode->v, iattr->ia_size);
+ BUG_ON(inode->v.i_size < inode->ei_inode.bi_size);
+
+ shrink = iattr->ia_size <= inode->v.i_size;
+
+ if (!shrink) {
+ ret = bch2_extend(inode, iattr);
+ goto err_put_pagecache;
+ }
+
+ ret = bch2_truncate_page(inode, iattr->ia_size);
+ if (unlikely(ret))
+ goto err_put_pagecache;
- /* sync appends.. */
- /* XXX what protects inode->i_size? */
if (iattr->ia_size > inode->ei_inode.bi_size)
ret = filemap_write_and_wait_range(mapping,
- inode->ei_inode.bi_size, S64_MAX);
+ inode->ei_inode.bi_size,
+ iattr->ia_size - 1);
+ else if (iattr->ia_size & (PAGE_SIZE - 1))
+ ret = filemap_write_and_wait_range(mapping,
+ round_down(iattr->ia_size, PAGE_SIZE),
+ iattr->ia_size - 1);
if (ret)
goto err_put_pagecache;
@@ -2175,41 +2230,31 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
ret = i_sectors_dirty_start(c, &i_sectors_hook);
if (unlikely(ret))
- goto err;
+ goto err_put_pagecache;
- /*
- * There might be persistent reservations (from fallocate())
- * above i_size, which bch2_inode_truncate() will discard - we're
- * only supposed to discard them if we're doing a real truncate
- * here (new i_size < current i_size):
- */
- if (shrink) {
- ret = bch2_truncate_page(inode, iattr->ia_size);
- if (unlikely(ret))
- goto err;
+ truncate_setsize(&inode->v, iattr->ia_size);
- ret = bch2_inode_truncate(c, inode->v.i_ino,
- round_up(iattr->ia_size, PAGE_SIZE) >> 9,
- &i_sectors_hook.hook,
- &inode->ei_journal_seq);
- if (unlikely(ret))
- goto err;
- }
+ ret = bch2_inode_truncate(c, inode->v.i_ino,
+ round_up(iattr->ia_size, PAGE_SIZE) >> 9,
+ &i_sectors_hook.hook,
+ &inode->ei_journal_seq);
+ if (unlikely(ret))
+ goto err_put_sectors_dirty;
setattr_copy(&inode->v, iattr);
inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v);
-err:
- /*
- * On error - in particular, bch2_truncate_page() error - don't clear
- * I_SIZE_DIRTY, as we've left data above i_size!:
- */
- if (ret)
- i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY;
-
+out:
ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
err_put_pagecache:
pagecache_block_put(&mapping->add_lock);
return ret;
+err_put_sectors_dirty:
+ /*
+ * On error - in particular, bch2_truncate_page() error - don't clear
+ * I_SIZE_DIRTY, as we've left data above i_size!:
+ */
+ i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY;
+ goto out;
}
/* fallocate: */
@@ -2389,7 +2434,6 @@ btree_iter_err:
if (ret)
goto err_put_sectors_dirty;
- i_size_write(&inode->v, new_size);
i_sectors_hook.new_i_size = new_size;
err_put_sectors_dirty:
ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
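
Several of the fs-io.c hunks above (bch2_write_end, __bch2_buffered_write, the extent insert hook) now update i_size only while holding inode->i_lock and only when the new end offset is past the current size. A minimal sketch of that grow-only update, written here as a hypothetical helper even though the patch open-codes it at each call site:

#include <linux/fs.h>

/* hypothetical helper: move i_size forward only, under i_lock */
static void i_size_grow(struct inode *inode, loff_t new_size)
{
	spin_lock(&inode->i_lock);
	if (new_size > inode->i_size)
		i_size_write(inode, new_size);
	spin_unlock(&inode->i_lock);
}

bch2_write_end() above effectively does this with new_size = pos + copied.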
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index dc6c651..3b7f78e 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -106,6 +106,8 @@ int __must_check __bch2_write_inode(struct bch_fs *c,
break;
}
+ BUG_ON(inode_u.bi_size != inode->ei_inode.bi_size);
+
if (set) {
ret = set(inode, &inode_u, p);
if (ret)
@@ -114,6 +116,10 @@ int __must_check __bch2_write_inode(struct bch_fs *c,
BUG_ON(i_nlink < nlink_bias(inode->v.i_mode));
+ BUG_ON(inode_u.bi_size != inode->ei_inode.bi_size &&
+ !(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+ inode_u.bi_size > i_size_read(&inode->v));
+
inode_u.bi_mode = inode->v.i_mode;
inode_u.bi_uid = i_uid_read(&inode->v);
inode_u.bi_gid = i_gid_read(&inode->v);
@@ -129,11 +135,17 @@ int __must_check __bch2_write_inode(struct bch_fs *c,
ret = bch2_btree_insert_at(c, NULL, NULL,
&inode->ei_journal_seq,
BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOUNLOCK|
BTREE_INSERT_NOFAIL,
BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
} while (ret == -EINTR);
if (!ret) {
+ /*
+ * the btree node lock protects inode->ei_inode, not
+ * ei_update_lock; this is important for inode updates via
+ * bchfs_write_index_update
+ */
inode->ei_inode = inode_u;
inode->ei_qid = bch_qid(&inode_u);
}
@@ -1107,7 +1119,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
- buf->f_namelen = NAME_MAX;
+ buf->f_namelen = BCH_NAME_MAX;
return 0;
}
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index addd51f..b4fe27f 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -75,6 +75,19 @@ static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf)
return BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
}
+static inline bool journal_entry_empty(struct jset *j)
+{
+ struct jset_entry *i;
+
+ if (j->seq != j->last_seq)
+ return false;
+
+ vstruct_for_each(j, i)
+ if (i->type || i->u64s)
+ return false;
+ return true;
+}
+
static enum {
JOURNAL_ENTRY_ERROR,
JOURNAL_ENTRY_INUSE,
@@ -129,6 +142,11 @@ static enum {
/* XXX: why set this here, and not in bch2_journal_write()? */
buf->data->last_seq = cpu_to_le64(journal_last_seq(j));
+ if (journal_entry_empty(buf->data))
+ clear_bit(JOURNAL_NOT_EMPTY, &j->flags);
+ else
+ set_bit(JOURNAL_NOT_EMPTY, &j->flags);
+
journal_pin_new_entry(j, 1);
bch2_journal_buf_init(j);
@@ -884,8 +902,18 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
void bch2_fs_journal_stop(struct journal *j)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
wait_event(j->wait, journal_flush_write(j));
+ /* do we need to write another journal entry? */
+ if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) ||
+ c->btree_roots_dirty)
+ bch2_journal_meta(j);
+
+ BUG_ON(!bch2_journal_error(j) &&
+ test_bit(JOURNAL_NOT_EMPTY, &j->flags));
+
cancel_delayed_work_sync(&j->write_work);
cancel_delayed_work_sync(&j->reclaim_work);
}
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 36ba6a4..8a4e7b2 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -13,37 +13,6 @@
#include <trace/events/bcachefs.h>
-static struct jset_entry *bch2_journal_find_entry(struct jset *j, unsigned type,
- enum btree_id id)
-{
- struct jset_entry *entry;
-
- for_each_jset_entry_type(entry, j, type)
- if (entry->btree_id == id)
- return entry;
-
- return NULL;
-}
-
-struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *c, struct jset *j,
- enum btree_id id, unsigned *level)
-{
- struct bkey_i *k;
- struct jset_entry *entry =
- bch2_journal_find_entry(j, BCH_JSET_ENTRY_btree_root, id);
-
- if (!entry)
- return NULL;
-
- if (!entry->u64s)
- return ERR_PTR(-EINVAL);
-
- k = entry->start;
- *level = entry->level;
- *level = entry->level;
- return k;
-}
-
struct journal_list {
struct closure cl;
struct mutex lock;
@@ -717,6 +686,37 @@ void bch2_journal_entries_free(struct list_head *list)
}
}
+int bch2_journal_set_seq(struct bch_fs *c, u64 last_seq, u64 end_seq)
+{
+ struct journal *j = &c->journal;
+ struct journal_entry_pin_list *p;
+ u64 seq, nr = end_seq - last_seq + 1;
+
+ if (nr > j->pin.size) {
+ free_fifo(&j->pin);
+ init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
+ if (!j->pin.data) {
+ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
+ return -ENOMEM;
+ }
+ }
+
+ atomic64_set(&j->seq, end_seq);
+ j->last_seq_ondisk = last_seq;
+
+ j->pin.front = last_seq;
+ j->pin.back = end_seq + 1;
+
+ fifo_for_each_entry_ptr(p, &j->pin, seq) {
+ INIT_LIST_HEAD(&p->list);
+ INIT_LIST_HEAD(&p->flushed);
+ atomic_set(&p->count, 0);
+ p->devs.nr = 0;
+ }
+
+ return 0;
+}
+
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
{
struct journal *j = &c->journal;
@@ -724,10 +724,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
struct journal_replay *i;
struct journal_entry_pin_list *p;
struct bch_dev *ca;
- u64 cur_seq, end_seq, seq;
+ u64 cur_seq, end_seq;
unsigned iter;
- size_t entries = 0;
- u64 nr, keys = 0;
+ size_t keys = 0, entries = 0;
bool degraded = false;
int ret = 0;
@@ -783,43 +782,13 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
}
}
- list_for_each_entry(i, list, list) {
- struct jset_entry *entry;
- struct bkey_i *k, *_n;
-
- for_each_jset_key(k, _n, entry, &i->j)
- keys++;
- }
-
i = list_last_entry(list, struct journal_replay, list);
- nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1;
-
- fsck_err_on(c->sb.clean && (keys || nr > 1), c,
- "filesystem marked clean but journal not empty (%llu keys in %llu entries)",
- keys, nr);
-
- if (nr > j->pin.size) {
- free_fifo(&j->pin);
- init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
- if (!j->pin.data) {
- bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
- return -ENOMEM;
- }
- }
-
- atomic64_set(&j->seq, le64_to_cpu(i->j.seq));
- j->last_seq_ondisk = le64_to_cpu(i->j.last_seq);
-
- j->pin.front = le64_to_cpu(i->j.last_seq);
- j->pin.back = le64_to_cpu(i->j.seq) + 1;
-
- fifo_for_each_entry_ptr(p, &j->pin, seq) {
- INIT_LIST_HEAD(&p->list);
- INIT_LIST_HEAD(&p->flushed);
- atomic_set(&p->count, 0);
- p->devs.nr = 0;
- }
+ ret = bch2_journal_set_seq(c,
+ le64_to_cpu(i->j.last_seq),
+ le64_to_cpu(i->j.seq));
+ if (ret)
+ return ret;
mutex_lock(&j->blacklist_lock);
@@ -842,6 +811,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
struct journal_replay, list)->j.seq);
list_for_each_entry(i, list, list) {
+ struct jset_entry *entry;
+ struct bkey_i *k, *_n;
bool blacklisted;
mutex_lock(&j->blacklist_lock);
@@ -863,10 +834,13 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
journal_last_seq(j), end_seq);
cur_seq = le64_to_cpu(i->j.seq) + 1;
+
+ for_each_jset_key(k, _n, entry, &i->j)
+ keys++;
entries++;
}
- bch_info(c, "journal read done, %llu keys in %zu entries, seq %llu",
+ bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
keys, entries, journal_cur_seq(j));
fsck_err:
return ret;
@@ -950,7 +924,8 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
j->replay_journal_seq = 0;
bch2_journal_set_replay_done(j);
- ret = bch2_journal_flush_all_pins(j);
+ bch2_journal_flush_all_pins(j);
+ ret = bch2_journal_error(j);
err:
bch2_journal_entries_free(list);
return ret;
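
bch2_journal_set_seq() above rounds the pin FIFO up to a power of two and then stores absolute journal sequence numbers in front and back, so a sequence number selects its slot by masking with size - 1. A standalone sketch of that absolute-index ring, with hypothetical names standing in for bcachefs' fifo.h helpers:

#include <stdint.h>
#include <stdio.h>

/* hypothetical fixed-size ring indexed by absolute sequence number */
struct seq_fifo {
	uint64_t front, back;	/* absolute sequence numbers, back exclusive */
	uint64_t mask;		/* size - 1, size is a power of two */
	int	 slot[8];
};

static int *seq_fifo_entry(struct seq_fifo *f, uint64_t seq)
{
	return &f->slot[seq & f->mask];
}

int main(void)
{
	struct seq_fifo f = { .front = 1000, .back = 1006, .mask = 7 };
	uint64_t seq;

	/* like fifo_for_each_entry_ptr(): walk [front, back) by sequence */
	for (seq = f.front; seq < f.back; seq++)
		*seq_fifo_entry(&f, seq) = 0;

	printf("entries %llu..%llu initialized\n",
	       (unsigned long long) f.front,
	       (unsigned long long) (f.back - 1));
	return 0;
}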
diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h
index 4236b7f..e303df9 100644
--- a/libbcachefs/journal_io.h
+++ b/libbcachefs/journal_io.h
@@ -1,9 +1,6 @@
#ifndef _BCACHEFS_JOURNAL_IO_H
#define _BCACHEFS_JOURNAL_IO_H
-struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *,
- enum btree_id, unsigned *);
-
/*
* Only used for holding the journal entries we read in btree_journal_read()
* during cache_registration
@@ -37,6 +34,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
vstruct_for_each_safe(entry, k, _n)
+int bch2_journal_set_seq(struct bch_fs *c, u64, u64);
int bch2_journal_read(struct bch_fs *, struct list_head *);
int bch2_journal_entry_sectors(struct journal *);
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 0e3e5b6..394b72b 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -337,34 +337,22 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
return ret;
}
-int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
+void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin *pin;
u64 pin_seq;
- bool flush;
if (!test_bit(JOURNAL_STARTED, &j->flags))
- return 0;
-again:
- wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq));
- if (pin) {
- /* flushing a journal pin might cause a new one to be added: */
- pin->flush(j, pin, pin_seq);
- goto again;
- }
-
- spin_lock(&j->lock);
- flush = journal_last_seq(j) != j->last_seq_ondisk ||
- (seq_to_flush == U64_MAX && c->btree_roots_dirty);
- spin_unlock(&j->lock);
+ return;
- return flush ? bch2_journal_meta(j) : 0;
-}
+ while (1) {
+ wait_event(j->wait, journal_flush_done(j, seq_to_flush,
+ &pin, &pin_seq));
+ if (!pin)
+ break;
-int bch2_journal_flush_all_pins(struct journal *j)
-{
- return bch2_journal_flush_pins(j, U64_MAX);
+ pin->flush(j, pin, pin_seq);
+ }
}
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
@@ -383,7 +371,9 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
seq = iter;
spin_unlock(&j->lock);
- ret = bch2_journal_flush_pins(j, seq);
+ bch2_journal_flush_pins(j, seq);
+
+ ret = bch2_journal_error(j);
if (ret)
return ret;
@@ -404,7 +394,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
}
spin_unlock(&j->lock);
- bch2_replicas_gc_end(c, ret);
+ ret = bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h
index 7d460c3..eb22790 100644
--- a/libbcachefs/journal_reclaim.h
+++ b/libbcachefs/journal_reclaim.h
@@ -29,8 +29,13 @@ void bch2_journal_pin_add_if_older(struct journal *,
void bch2_journal_reclaim_fast(struct journal *);
void bch2_journal_reclaim_work(struct work_struct *);
-int bch2_journal_flush_pins(struct journal *, u64);
-int bch2_journal_flush_all_pins(struct journal *);
+void bch2_journal_flush_pins(struct journal *, u64);
+
+static inline void bch2_journal_flush_all_pins(struct journal *j)
+{
+ bch2_journal_flush_pins(j, U64_MAX);
+}
+
int bch2_journal_flush_device_pins(struct journal *, int);
#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index a27e054..effbeec 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -117,6 +117,7 @@ enum {
JOURNAL_REPLAY_DONE,
JOURNAL_STARTED,
JOURNAL_NEED_WRITE,
+ JOURNAL_NOT_EMPTY,
};
/* Embedded in struct bch_fs */
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index ea51910..215c5aa 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -126,7 +126,13 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
retry:
if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
dev_idx)) {
- bch2_btree_iter_set_locks_want(&iter, 0);
+ /*
+ * we might have found a btree node key we
+ * needed to update, and then tried to update it
+ * but got -EINTR after upgrading the iter, but
+ * then raced and the node is now gone:
+ */
+ bch2_btree_iter_downgrade(&iter);
ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
bkey_i_to_s_c(&b->key));
@@ -141,11 +147,6 @@ retry:
if (ret)
goto err;
- if (!bch2_btree_iter_set_locks_want(&iter, U8_MAX)) {
- b = bch2_btree_iter_peek_node(&iter);
- goto retry;
- }
-
ret = bch2_btree_node_update_key(c, &iter, b, new_key);
if (ret == -EINTR) {
b = bch2_btree_iter_peek_node(&iter);
@@ -160,7 +161,7 @@ retry:
ret = 0;
out:
- bch2_replicas_gc_end(c, ret);
+ ret = bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index e7ab887..f476033 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -137,6 +137,9 @@ enum opt_type {
BCH_OPT(degraded, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false) \
+ BCH_OPT(discard, u8, OPT_MOUNT, \
+ OPT_BOOL(), \
+ NO_SB_OPT, false) \
BCH_OPT(verbose_recovery, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false) \
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
new file mode 100644
index 0000000..58aee7a
--- /dev/null
+++ b/libbcachefs/recovery.c
@@ -0,0 +1,346 @@
+
+#include "bcachefs.h"
+#include "alloc.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "error.h"
+#include "fsck.h"
+#include "journal_io.h"
+#include "quota.h"
+#include "recovery.h"
+#include "super-io.h"
+
+#include <linux/stat.h>
+
+struct bkey_i *btree_root_find(struct bch_fs *c,
+ struct bch_sb_field_clean *clean,
+ struct jset *j,
+ enum btree_id id, unsigned *level)
+{
+ struct bkey_i *k;
+ struct jset_entry *entry, *start, *end;
+
+ if (clean) {
+ start = clean->start;
+ end = vstruct_end(&clean->field);
+ } else {
+ start = j->start;
+ end = vstruct_last(j);
+ }
+
+ for (entry = start; entry < end; entry = vstruct_next(entry))
+ if (entry->type == BCH_JSET_ENTRY_btree_root &&
+ entry->btree_id == id)
+ goto found;
+
+ return NULL;
+found:
+ if (!entry->u64s)
+ return ERR_PTR(-EINVAL);
+
+ k = entry->start;
+ *level = entry->level;
+ return k;
+}
+
+static int verify_superblock_clean(struct bch_fs *c,
+ struct bch_sb_field_clean *clean,
+ struct jset *j)
+{
+ unsigned i;
+ int ret = 0;
+
+ if (!clean || !j)
+ return 0;
+
+ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
+ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
+ le64_to_cpu(clean->journal_seq),
+ le64_to_cpu(j->seq)))
+ bch2_fs_mark_clean(c, false);
+
+ mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
+ "superblock read clock doesn't match journal after clean shutdown");
+ mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
+ "superblock read clock doesn't match journal after clean shutdown");
+
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ struct bkey_i *k1, *k2;
+ unsigned l1 = 0, l2 = 0;
+
+ k1 = btree_root_find(c, clean, NULL, i, &l1);
+ k2 = btree_root_find(c, NULL, j, i, &l2);
+
+ if (!k1 && !k2)
+ continue;
+
+ mustfix_fsck_err_on(!k1 || !k2 ||
+ IS_ERR(k1) ||
+ IS_ERR(k2) ||
+ k1->k.u64s != k2->k.u64s ||
+ memcmp(k1, k2, bkey_bytes(k1)) ||
+ l1 != l2, c,
+ "superblock btree root doesn't match journal after clean shutdown");
+ }
+fsck_err:
+ return ret;
+}
+
+static bool journal_empty(struct list_head *journal)
+{
+ struct journal_replay *i;
+ struct jset_entry *entry;
+
+ if (list_empty(journal))
+ return true;
+
+ i = list_last_entry(journal, struct journal_replay, list);
+
+ if (i->j.last_seq != i->j.seq)
+ return false;
+
+ list_for_each_entry(i, journal, list) {
+ vstruct_for_each(&i->j, entry) {
+ if (entry->type == BCH_JSET_ENTRY_btree_root)
+ continue;
+
+ if (entry->type == BCH_JSET_ENTRY_btree_keys &&
+ !entry->u64s)
+ continue;
+ return false;
+ }
+ }
+
+ return true;
+}
+
+int bch2_fs_recovery(struct bch_fs *c)
+{
+ const char *err = "cannot allocate memory";
+ struct bch_sb_field_clean *clean = NULL, *sb_clean = NULL;
+ LIST_HEAD(journal);
+ struct jset *j = NULL;
+ unsigned i;
+ int ret;
+
+ mutex_lock(&c->sb_lock);
+ if (!bch2_sb_get_replicas(c->disk_sb.sb)) {
+ bch_info(c, "building replicas info");
+ set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
+ }
+
+ if (c->sb.clean)
+ sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
+ if (sb_clean) {
+ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
+ GFP_KERNEL);
+ if (!clean) {
+ ret = -ENOMEM;
+ mutex_unlock(&c->sb_lock);
+ goto err;
+ }
+ }
+ mutex_unlock(&c->sb_lock);
+
+ if (clean)
+ bch_info(c, "recovering from clean shutdown, journal seq %llu",
+ le64_to_cpu(clean->journal_seq));
+
+ if (!clean || !c->opts.nofsck) {
+ ret = bch2_journal_read(c, &journal);
+ if (ret)
+ goto err;
+
+ j = &list_entry(journal.prev, struct journal_replay, list)->j;
+ } else {
+ ret = bch2_journal_set_seq(c,
+ le64_to_cpu(clean->journal_seq),
+ le64_to_cpu(clean->journal_seq));
+ BUG_ON(ret);
+ }
+
+ ret = verify_superblock_clean(c, clean, j);
+ if (ret)
+ goto err;
+
+ fsck_err_on(clean && !journal_empty(&journal), c,
+ "filesystem marked clean but journal not empty");
+
+ if (clean) {
+ c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
+ c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
+ } else {
+ c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
+ c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);
+ }
+
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ unsigned level;
+ struct bkey_i *k;
+
+ k = btree_root_find(c, clean, j, i, &level);
+ if (!k)
+ continue;
+
+ err = "invalid btree root pointer";
+ if (IS_ERR(k))
+ goto err;
+
+ err = "error reading btree root";
+ if (bch2_btree_root_read(c, i, k, level)) {
+ if (i != BTREE_ID_ALLOC)
+ goto err;
+
+ mustfix_fsck_err(c, "error reading btree root");
+ }
+ }
+
+ for (i = 0; i < BTREE_ID_NR; i++)
+ if (!c->btree_roots[i].b)
+ bch2_btree_root_alloc(c, i);
+
+ err = "error reading allocation information";
+ ret = bch2_alloc_read(c, &journal);
+ if (ret)
+ goto err;
+
+ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+
+ bch_verbose(c, "starting mark and sweep:");
+ err = "error in recovery";
+ ret = bch2_initial_gc(c, &journal);
+ if (ret)
+ goto err;
+ bch_verbose(c, "mark and sweep done");
+
+ if (c->opts.noreplay)
+ goto out;
+
+ /*
+ * Mark dirty before journal replay, fsck:
+ * XXX: after a clean shutdown, this could be done lazily only when fsck
+ * finds an error
+ */
+ bch2_fs_mark_clean(c, false);
+
+ /*
+ * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
+ * will give spurious errors about oldest_gen > bucket_gen -
+ * this is a hack but oh well.
+ */
+ bch2_fs_journal_start(&c->journal);
+
+ err = "error starting allocator";
+ if (bch2_fs_allocator_start(c))
+ goto err;
+
+ bch_verbose(c, "starting journal replay:");
+ err = "journal replay failed";
+ ret = bch2_journal_replay(c, &journal);
+ if (ret)
+ goto err;
+ bch_verbose(c, "journal replay done");
+
+ if (c->opts.norecovery)
+ goto out;
+
+ bch_verbose(c, "starting fsck:");
+ err = "error in fsck";
+ ret = bch2_fsck(c, !c->opts.nofsck);
+ if (ret)
+ goto err;
+ bch_verbose(c, "fsck done");
+
+ if (enabled_qtypes(c)) {
+ bch_verbose(c, "reading quotas:");
+ ret = bch2_fs_quota_read(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "quotas done");
+ }
+
+out:
+ bch2_journal_entries_free(&journal);
+ kfree(clean);
+ return ret;
+err:
+fsck_err:
+ BUG_ON(!ret);
+ goto out;
+}
+
+int bch2_fs_initialize(struct bch_fs *c)
+{
+ struct bch_inode_unpacked inode;
+ struct bkey_inode_buf packed_inode;
+ const char *err = "cannot allocate memory";
+ struct bch_dev *ca;
+ LIST_HEAD(journal);
+ unsigned i;
+ int ret;
+
+ bch_notice(c, "initializing new filesystem");
+
+ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+
+ ret = bch2_initial_gc(c, &journal);
+ if (ret)
+ goto err;
+
+ err = "unable to allocate journal buckets";
+ for_each_online_member(ca, c, i)
+ if (bch2_dev_journal_alloc(ca)) {
+ percpu_ref_put(&ca->io_ref);
+ goto err;
+ }
+
+ for (i = 0; i < BTREE_ID_NR; i++)
+ bch2_btree_root_alloc(c, i);
+
+ /*
+ * journal_res_get() will crash if called before this has
+ * set up the journal.pin FIFO and journal.cur pointer:
+ */
+ bch2_fs_journal_start(&c->journal);
+ bch2_journal_set_replay_done(&c->journal);
+
+ err = "error starting allocator";
+ if (bch2_fs_allocator_start(c))
+ goto err;
+
+ bch2_inode_init(c, &inode, 0, 0,
+ S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
+ inode.bi_inum = BCACHEFS_ROOT_INO;
+
+ bch2_inode_pack(&packed_inode, &inode);
+
+ err = "error creating root directory";
+ if (bch2_btree_insert(c, BTREE_ID_INODES,
+ &packed_inode.inode.k_i,
+ NULL, NULL, NULL, 0))
+ goto err;
+
+ if (enabled_qtypes(c)) {
+ ret = bch2_fs_quota_read(c);
+ if (ret)
+ goto err;
+ }
+
+ err = "error writing first journal entry";
+ if (bch2_journal_meta(&c->journal))
+ goto err;
+
+ mutex_lock(&c->sb_lock);
+ SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+err:
+ BUG_ON(!ret);
+ return ret;
+}
diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h
new file mode 100644
index 0000000..685507e
--- /dev/null
+++ b/libbcachefs/recovery.h
@@ -0,0 +1,7 @@
+#ifndef _BCACHEFS_RECOVERY_H
+#define _BCACHEFS_RECOVERY_H
+
+int bch2_fs_recovery(struct bch_fs *);
+int bch2_fs_initialize(struct bch_fs *);
+
+#endif /* _BCACHEFS_RECOVERY_H */
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index 6c52d1d..1e94d35 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -215,10 +215,8 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
return 0;
err:
mutex_unlock(&c->sb_lock);
- if (new_gc)
- kfree(new_gc);
- if (new_r)
- kfree(new_r);
+ kfree(new_gc);
+ kfree(new_r);
return ret;
}
@@ -265,10 +263,9 @@ int bch2_mark_bkey_replicas(struct bch_fs *c,
return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
}
-int bch2_replicas_gc_end(struct bch_fs *c, int err)
+int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
struct bch_replicas_cpu *new_r, *old_r;
- int ret = 0;
lockdep_assert_held(&c->replicas_gc_lock);
@@ -276,29 +273,31 @@ int bch2_replicas_gc_end(struct bch_fs *c, int err)
new_r = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
+ rcu_assign_pointer(c->replicas_gc, NULL);
- if (err) {
- rcu_assign_pointer(c->replicas_gc, NULL);
- kfree_rcu(new_r, rcu);
+ if (ret)
goto err;
- }
if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
ret = -ENOSPC;
goto err;
}
+ bch2_write_super(c);
+
+ /* don't update in memory replicas until changes are persistent */
+
old_r = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, new_r);
- rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(old_r, rcu);
-
- bch2_write_super(c);
-err:
+out:
mutex_unlock(&c->sb_lock);
return ret;
+err:
+ kfree_rcu(new_r, rcu);
+ goto out;
}
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
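
The reordered bch2_replicas_gc_end() above makes the new replicas table durable with bch2_write_super() before publishing it with rcu_assign_pointer() and retiring the old table via kfree_rcu(), per the new comment about not updating in-memory replicas until the change is persistent. A minimal sketch of that persist-then-publish pattern with a hypothetical table type (standard kernel RCU API assumed; the error path is simplified relative to the patch):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct table {
	struct rcu_head	rcu;
	/* ... replicas entries ... */
};

static struct table __rcu *cur_table;

/* hypothetical: publish a new table only once it has been persisted */
static int table_replace(struct table *new, int (*persist)(struct table *))
{
	struct table *old;
	int ret;

	ret = persist(new);		/* e.g. write the superblock */
	if (ret) {
		kfree(new);		/* never published: plain free */
		return ret;
	}

	old = rcu_dereference_protected(cur_table, true);
	rcu_assign_pointer(cur_table, new);
	if (old)
		kfree_rcu(old, rcu);	/* readers may still hold the old copy */
	return 0;
}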
diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h
index f7dd014..c805109 100644
--- a/libbcachefs/str_hash.h
+++ b/libbcachefs/str_hash.h
@@ -237,6 +237,7 @@ static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc,
{
struct bkey_s_c k;
+ bch2_btree_iter_copy(iter, start);
bch2_btree_iter_next_slot(iter);
for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 9772d59..54de9fa 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -4,6 +4,7 @@
#include "disk_groups.h"
#include "error.h"
#include "io.h"
+#include "journal.h"
#include "replicas.h"
#include "quota.h"
#include "super-io.h"
@@ -89,6 +90,9 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
struct bch_sb *new_sb;
struct bio *bio;
+ if (sb->sb && sb->page_order >= order)
+ return 0;
+
if (sb->have_layout) {
u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
@@ -849,6 +853,84 @@ static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
.validate = bch2_sb_validate_crypt,
};
+/* BCH_SB_FIELD_clean: */
+
+void bch2_fs_mark_clean(struct bch_fs *c, bool clean)
+{
+ struct bch_sb_field_clean *sb_clean;
+ unsigned u64s = sizeof(*sb_clean) / sizeof(u64);
+ struct jset_entry *entry;
+ struct btree_root *r;
+
+ mutex_lock(&c->sb_lock);
+ if (clean == BCH_SB_CLEAN(c->disk_sb.sb))
+ goto out;
+
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, clean);
+
+ if (!clean)
+ goto write_super;
+
+ mutex_lock(&c->btree_root_lock);
+
+ for (r = c->btree_roots;
+ r < c->btree_roots + BTREE_ID_NR;
+ r++)
+ if (r->alive)
+ u64s += jset_u64s(r->key.u64s);
+
+ sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s);
+ if (!sb_clean) {
+ bch_err(c, "error resizing superblock while setting filesystem clean");
+ goto out;
+ }
+
+ sb_clean->flags = 0;
+ sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
+ sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
+ sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
+
+ entry = sb_clean->start;
+ memset(entry, 0,
+ vstruct_end(&sb_clean->field) - (void *) entry);
+
+ for (r = c->btree_roots;
+ r < c->btree_roots + BTREE_ID_NR;
+ r++)
+ if (r->alive) {
+ entry->u64s = r->key.u64s;
+ entry->btree_id = r - c->btree_roots;
+ entry->level = r->level;
+ entry->type = BCH_JSET_ENTRY_btree_root;
+ bkey_copy(&entry->start[0], &r->key);
+ entry = vstruct_next(entry);
+ BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
+ }
+
+ BUG_ON(entry != vstruct_end(&sb_clean->field));
+
+ mutex_unlock(&c->btree_root_lock);
+write_super:
+ bch2_write_super(c);
+out:
+ mutex_unlock(&c->sb_lock);
+}
+
+static const char *bch2_sb_validate_clean(struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_clean *clean = field_to_type(f, clean);
+
+ if (vstruct_bytes(&clean->field) < sizeof(*clean))
+ return "invalid field crypt: wrong size";
+
+ return NULL;
+}
+
+static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
+ .validate = bch2_sb_validate_clean,
+};
+
static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
#define x(f, nr) \
[BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index 995b1c9..7d09d8e 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -131,6 +131,10 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
};
}
+/* BCH_SB_FIELD_clean: */
+
+void bch2_fs_mark_clean(struct bch_fs *, bool);
+
size_t bch2_sb_field_to_text(char *, size_t, struct bch_sb *,
struct bch_sb_field *);
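
bch2_fs_mark_clean() above sizes the new clean section by summing jset_u64s() over the live btree roots, then fills it in a second pass and asserts that the write cursor lands exactly at vstruct_end() of the field. A standalone sketch of that size-then-fill pattern, using a hypothetical record layout rather than the real jset_entry:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

/* hypothetical variable-length record: one u64 of header, then payload u64s */
struct rec {
	uint8_t  u64s;			/* payload size in u64s */
	uint8_t  pad[7];
	uint64_t data[];
};

static size_t rec_u64s(unsigned payload_u64s)
{
	return 1 + payload_u64s;	/* header u64 + payload */
}

int main(void)
{
	unsigned payloads[] = { 3, 1, 4 };	/* e.g. per-root key sizes */
	size_t u64s = 0, i;
	uint64_t *buf, *p;

	/* pass 1: size the field */
	for (i = 0; i < 3; i++)
		u64s += rec_u64s(payloads[i]);

	buf = calloc(u64s, sizeof(*buf));
	assert(buf);

	/* pass 2: fill, advancing by each record's own size */
	p = buf;
	for (i = 0; i < 3; i++) {
		struct rec *r = (struct rec *) p;

		r->u64s = payloads[i];	/* payload itself left zeroed here */
		p += rec_u64s(payloads[i]);
	}

	/* must land exactly at the end of the sized field */
	assert(p == buf + u64s);
	free(buf);
	return 0;
}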
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 1eab7c7..a2a32b9 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -10,7 +10,6 @@
#include "alloc.h"
#include "btree_cache.h"
#include "btree_gc.h"
-#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "chardev.h"
@@ -26,14 +25,13 @@
#include "inode.h"
#include "io.h"
#include "journal.h"
-#include "journal_io.h"
#include "journal_reclaim.h"
-#include "keylist.h"
#include "move.h"
#include "migrate.h"
#include "movinggc.h"
#include "quota.h"
#include "rebalance.h"
+#include "recovery.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
@@ -201,18 +199,6 @@ int bch2_congested(void *data, int bdi_bits)
* - allocator depends on the journal (when it rewrites prios and gens)
*/
-static void bch_fs_mark_clean(struct bch_fs *c)
-{
- if (!bch2_journal_error(&c->journal) &&
- !test_bit(BCH_FS_ERROR, &c->flags) &&
- !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
- mutex_lock(&c->sb_lock);
- SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
-}
-
static void __bch2_fs_read_only(struct bch_fs *c)
{
struct bch_dev *ca;
@@ -229,7 +215,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
*/
- bch2_journal_flush_pins(&c->journal, U64_MAX - 1);
+ bch2_journal_flush_all_pins(&c->journal);
for_each_member_device(ca, c, i)
bch2_dev_allocator_stop(ca);
@@ -246,9 +232,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
- if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
- bch2_btree_verify_flushed(c);
-
bch2_fs_journal_stop(&c->journal);
/*
@@ -257,6 +240,8 @@ static void __bch2_fs_read_only(struct bch_fs *c)
*/
if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
bch2_btree_flush_all_writes(c);
+ else
+ bch2_btree_verify_flushed(c);
/*
* After stopping journal:
@@ -275,12 +260,10 @@ static void bch2_writes_disabled(struct percpu_ref *writes)
void bch2_fs_read_only(struct bch_fs *c)
{
- if (c->state != BCH_FS_STARTING &&
- c->state != BCH_FS_RW)
+ if (c->state == BCH_FS_RO)
return;
- if (test_bit(BCH_FS_ERROR, &c->flags))
- return;
+ BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
/*
* Block new foreground-end write operations from starting - any new
@@ -311,13 +294,18 @@ void bch2_fs_read_only(struct bch_fs *c)
__bch2_fs_read_only(c);
- bch_fs_mark_clean(c);
-
wait_event(bch_read_only_wait,
test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
- c->state = BCH_FS_RO;
+
+ if (!bch2_journal_error(&c->journal) &&
+ !test_bit(BCH_FS_ERROR, &c->flags) &&
+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+ bch2_fs_mark_clean(c, true);
+
+ if (c->state != BCH_FS_STOPPING)
+ c->state = BCH_FS_RO;
}
static void bch2_fs_read_only_work(struct work_struct *work)
@@ -352,10 +340,11 @@ const char *bch2_fs_read_write(struct bch_fs *c)
const char *err = NULL;
unsigned i;
- if (c->state != BCH_FS_STARTING &&
- c->state != BCH_FS_RO)
+ if (c->state == BCH_FS_RW)
return NULL;
+ bch2_fs_mark_clean(c, false);
+
for_each_rw_member(ca, c, i)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
@@ -446,11 +435,6 @@ void bch2_fs_stop(struct bch_fs *c)
struct bch_dev *ca;
unsigned i;
- mutex_lock(&c->state_lock);
- BUG_ON(c->state == BCH_FS_STOPPING);
- c->state = BCH_FS_STOPPING;
- mutex_unlock(&c->state_lock);
-
for_each_member_device(ca, c, i)
if (ca->kobj.state_in_sysfs &&
ca->disk_sb.bdev)
@@ -475,11 +459,9 @@ void bch2_fs_stop(struct bch_fs *c)
closure_debug_destroy(&c->cl);
mutex_lock(&c->state_lock);
- __bch2_fs_read_only(c);
+ bch2_fs_read_only(c);
mutex_unlock(&c->state_lock);
- bch_fs_mark_clean(c);
-
/* btree prefetch might have kicked off reads in the background: */
bch2_btree_flush_all_reads(c);
@@ -695,9 +677,7 @@ const char *bch2_fs_start(struct bch_fs *c)
const char *err = "cannot allocate memory";
struct bch_sb_field_members *mi;
struct bch_dev *ca;
- LIST_HEAD(journal);
- struct jset *j;
- time64_t now;
+ time64_t now = ktime_get_seconds();
unsigned i;
int ret = -EINVAL;
@@ -706,157 +686,26 @@ const char *bch2_fs_start(struct bch_fs *c)
BUG_ON(c->state != BCH_FS_STARTING);
mutex_lock(&c->sb_lock);
+
for_each_online_member(ca, c, i)
bch2_sb_from_fs(c, ca);
+
+ mi = bch2_sb_get_members(c->disk_sb.sb);
+ for_each_online_member(ca, c, i)
+ mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
+
mutex_unlock(&c->sb_lock);
for_each_rw_member(ca, c, i)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- if (BCH_SB_INITIALIZED(c->disk_sb.sb)) {
- ret = bch2_journal_read(c, &journal);
- if (ret)
- goto err;
-
- j = &list_entry(journal.prev, struct journal_replay, list)->j;
-
- c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
- c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);
-
- for (i = 0; i < BTREE_ID_NR; i++) {
- unsigned level;
- struct bkey_i *k;
-
- k = bch2_journal_find_btree_root(c, j, i, &level);
- if (!k)
- continue;
-
- err = "invalid btree root pointer";
- if (IS_ERR(k))
- goto err;
-
- err = "error reading btree root";
- if (bch2_btree_root_read(c, i, k, level)) {
- if (i != BTREE_ID_ALLOC)
- goto err;
-
- mustfix_fsck_err(c, "error reading btree root");
- }
- }
-
- for (i = 0; i < BTREE_ID_NR; i++)
- if (!c->btree_roots[i].b)
- bch2_btree_root_alloc(c, i);
-
- err = "error reading allocation information";
- ret = bch2_alloc_read(c, &journal);
- if (ret)
- goto err;
-
- set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
-
- bch_verbose(c, "starting mark and sweep:");
- err = "error in recovery";
- ret = bch2_initial_gc(c, &journal);
- if (ret)
- goto err;
- bch_verbose(c, "mark and sweep done");
-
- if (c->opts.noreplay)
- goto recovery_done;
-
- /*
- * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
- * will give spurious errors about oldest_gen > bucket_gen -
- * this is a hack but oh well.
- */
- bch2_fs_journal_start(&c->journal);
-
- err = "error starting allocator";
- if (bch2_fs_allocator_start(c))
- goto err;
-
- bch_verbose(c, "starting journal replay:");
- err = "journal replay failed";
- ret = bch2_journal_replay(c, &journal);
- if (ret)
- goto err;
- bch_verbose(c, "journal replay done");
-
- if (c->opts.norecovery)
- goto recovery_done;
-
- bch_verbose(c, "starting fsck:");
- err = "error in fsck";
- ret = bch2_fsck(c, !c->opts.nofsck);
- if (ret)
- goto err;
- bch_verbose(c, "fsck done");
-
- if (enabled_qtypes(c)) {
- bch_verbose(c, "reading quotas:");
- ret = bch2_fs_quota_read(c);
- if (ret)
- goto err;
- bch_verbose(c, "quotas done");
- }
- } else {
- struct bch_inode_unpacked inode;
- struct bkey_inode_buf packed_inode;
-
- bch_notice(c, "initializing new filesystem");
-
- set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
-
- ret = bch2_initial_gc(c, &journal);
- if (ret)
- goto err;
-
- err = "unable to allocate journal buckets";
- for_each_online_member(ca, c, i)
- if (bch2_dev_journal_alloc(ca)) {
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
-
- for (i = 0; i < BTREE_ID_NR; i++)
- bch2_btree_root_alloc(c, i);
-
- /*
- * journal_res_get() will crash if called before this has
- * set up the journal.pin FIFO and journal.cur pointer:
- */
- bch2_fs_journal_start(&c->journal);
- bch2_journal_set_replay_done(&c->journal);
-
- err = "error starting allocator";
- if (bch2_fs_allocator_start(c))
- goto err;
-
- bch2_inode_init(c, &inode, 0, 0,
- S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
- inode.bi_inum = BCACHEFS_ROOT_INO;
-
- bch2_inode_pack(&packed_inode, &inode);
-
- err = "error creating root directory";
- if (bch2_btree_insert(c, BTREE_ID_INODES,
- &packed_inode.inode.k_i,
- NULL, NULL, NULL, 0))
- goto err;
-
- if (enabled_qtypes(c)) {
- ret = bch2_fs_quota_read(c);
- if (ret)
- goto err;
- }
+ ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
+ ? bch2_fs_recovery(c)
+ : bch2_fs_initialize(c);
+ if (ret)
+ goto err;
- err = "error writing first journal entry";
- if (bch2_journal_meta(&c->journal))
- goto err;
- }
-recovery_done:
err = "dynamic fault";
if (bch2_fs_init_fault("fs_start"))
goto err;
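
The hunk above replaces the long inline recovery and first-mount initialization block with two entry points. From the call sites alone they presumably have prototypes along these lines (assumed, inferred only from this diff):

	/* Assumed prototypes, inferred from the call sites above: */
	int bch2_fs_recovery(struct bch_fs *c);   /* BCH_SB_INITIALIZED: journal read, btree roots, replay, fsck */
	int bch2_fs_initialize(struct bch_fs *c); /* fresh filesystem: journal/btree root allocation, root inode */

Both return 0 or an error code handled by the existing err: path, including the BCH_FSCK_* codes switched on further down.
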
@@ -869,28 +718,13 @@ recovery_done:
goto err;
}
- mutex_lock(&c->sb_lock);
- mi = bch2_sb_get_members(c->disk_sb.sb);
- now = ktime_get_seconds();
-
- for_each_member_device(ca, c, i)
- mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
-
- SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
- SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
set_bit(BCH_FS_STARTED, &c->flags);
err = NULL;
out:
mutex_unlock(&c->state_lock);
- bch2_journal_entries_free(&journal);
return err;
err:
-fsck_err:
switch (ret) {
case BCH_FSCK_ERRORS_NOT_FIXED:
bch_err(c, "filesystem contains errors: please report this to the developers");
@@ -1091,6 +925,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
ca->mi = bch2_mi_to_cpu(member);
ca->uuid = member->uuid;
+ if (opt_defined(c->opts, discard))
+ ca->mi.discard = opt_get(c->opts, discard);
+
if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) ||
percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
@@ -1454,7 +1291,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
* must flush all existing journal entries, they might have
* (overwritten) keys that point to the device we're removing:
*/
- ret = bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_flush_all_pins(&c->journal);
+ ret = bch2_journal_error(&c->journal);
if (ret) {
bch_err(ca, "Remove failed, journal error");
goto err;
@@ -1615,6 +1453,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
{
struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb = { NULL };
+ struct bch_sb_field_members *mi;
struct bch_dev *ca;
unsigned dev_idx;
const char *err;
@@ -1646,6 +1485,15 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
goto err;
}
+ mutex_lock(&c->sb_lock);
+ mi = bch2_sb_get_members(c->disk_sb.sb);
+
+ mi->members[ca->dev_idx].last_mount =
+ cpu_to_le64(ktime_get_seconds());
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
mutex_unlock(&c->state_lock);
return 0;
err:
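
Also worth noting from the super.c changes: last_mount is now stamped in two places. bch2_fs_start() updates it for every online member while it already holds sb_lock, and bch2_dev_online() (above) updates it when a device is brought back into a running filesystem, so the field reflects the last time the device was actually in use rather than only the last filesystem start.
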
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 66b5b9f..4987ee7 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -27,6 +27,7 @@
#include "rebalance.h"
#include "replicas.h"
#include "super-io.h"
+#include "tests.h"
#include <linux/blkdev.h>
#include <linux/sort.h>
@@ -192,6 +193,10 @@ rw_attribute(pd_controllers_update_seconds);
read_attribute(meta_replicas_have);
read_attribute(data_replicas_have);
+#ifdef CONFIG_BCACHEFS_TESTS
+write_attribute(perf_test);
+#endif /* CONFIG_BCACHEFS_TESTS */
+
#define BCH_DEBUG_PARAM(name, description) \
rw_attribute(name);
@@ -446,7 +451,25 @@ STORE(__bch2_fs)
sc.nr_to_scan = strtoul_or_return(buf);
c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
}
-
+#ifdef CONFIG_BCACHEFS_TESTS
+ if (attr == &sysfs_perf_test) {
+ char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
+ char *test = strsep(&p, " \t\n");
+ char *nr_str = strsep(&p, " \t\n");
+ char *threads_str = strsep(&p, " \t\n");
+ unsigned threads;
+ u64 nr;
+ int ret = -EINVAL;
+
+ if (threads_str &&
+ !(ret = kstrtouint(threads_str, 10, &threads)) &&
+ !(ret = bch2_strtoull_h(nr_str, &nr)))
+ bch2_btree_perf_test(c, test, nr, threads);
+ else
+ size = ret;
+ kfree(tmp);
+ }
+#endif
return size;
}
@@ -477,6 +500,10 @@ struct attribute *bch2_fs_files[] = {
&sysfs_promote_whole_extents,
&sysfs_compression_stats,
+
+#ifdef CONFIG_BCACHEFS_TESTS
+ &sysfs_perf_test,
+#endif
NULL
};
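
Usage note for the new attribute: the write handler above expects a single line of the form "<test> <nr> <threads>". <nr> is parsed with bch2_strtoull_h(), so the 1024-based suffixes from util.c work ("1M" means 1 << 20 iterations), while <threads> is plain decimal. Assuming the filesystem's sysfs directory is the usual /sys/fs/bcachefs/<uuid>/ (path assumed here), writing

	rand_insert 1M 4

to its perf_test file would run the random-insert benchmark with 2^20 keys spread over 4 threads; malformed input makes the write return an error (-EINVAL or -ERANGE) instead of the byte count.
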
diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c
new file mode 100644
index 0000000..9dcadd2
--- /dev/null
+++ b/libbcachefs/tests.c
@@ -0,0 +1,289 @@
+#ifdef CONFIG_BCACHEFS_TESTS
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "tests.h"
+
+#include "linux/kthread.h"
+#include "linux/random.h"
+
+static void test_delete(struct bch_fs *c, u64 nr)
+{
+ struct btree_iter iter;
+ struct bkey_i_cookie k;
+ int ret;
+
+ bkey_cookie_init(&k.k_i);
+
+ bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, k.k.p,
+ BTREE_ITER_INTENT);
+
+ ret = bch2_btree_iter_traverse(&iter);
+ BUG_ON(ret);
+
+ ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+ BTREE_INSERT_ENTRY(&iter, &k.k_i));
+ BUG_ON(ret);
+
+ pr_info("deleting once");
+ ret = bch2_btree_delete_at(&iter, 0);
+ BUG_ON(ret);
+
+ pr_info("deleting twice");
+ ret = bch2_btree_delete_at(&iter, 0);
+ BUG_ON(ret);
+
+ bch2_btree_iter_unlock(&iter);
+}
+
+static u64 test_rand(void)
+{
+ u64 v;
+#if 0
+ v = prandom_u32();
+#else
+ prandom_bytes(&v, sizeof(v));
+#endif
+ return v;
+}
+
+static void rand_insert(struct bch_fs *c, u64 nr)
+{
+ struct bkey_i_cookie k;
+ int ret;
+ u64 i;
+
+ for (i = 0; i < nr; i++) {
+ bkey_cookie_init(&k.k_i);
+ k.k.p.offset = test_rand();
+
+ ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
+ NULL, NULL, NULL, 0);
+ BUG_ON(ret);
+ }
+}
+
+static void rand_lookup(struct bch_fs *c, u64 nr)
+{
+ u64 i;
+
+ for (i = 0; i < nr; i++) {
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS,
+ POS(0, test_rand()), 0);
+
+ k = bch2_btree_iter_peek(&iter);
+ bch2_btree_iter_unlock(&iter);
+ }
+}
+
+static void rand_mixed(struct bch_fs *c, u64 nr)
+{
+ int ret;
+ u64 i;
+
+ for (i = 0; i < nr; i++) {
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS,
+ POS(0, test_rand()), 0);
+
+ k = bch2_btree_iter_peek(&iter);
+
+ if (!(i & 3) && k.k) {
+ struct bkey_i_cookie k;
+
+ bkey_cookie_init(&k.k_i);
+ k.k.p = iter.pos;
+
+ ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+ BTREE_INSERT_ENTRY(&iter, &k.k_i));
+ BUG_ON(ret);
+ }
+
+ bch2_btree_iter_unlock(&iter);
+ }
+}
+
+static void rand_delete(struct bch_fs *c, u64 nr)
+{
+ struct bkey_i k;
+ int ret;
+ u64 i;
+
+ for (i = 0; i < nr; i++) {
+ bkey_init(&k.k);
+ k.k.p.offset = test_rand();
+
+ ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k,
+ NULL, NULL, NULL, 0);
+ BUG_ON(ret);
+ }
+}
+
+static void seq_insert(struct bch_fs *c, u64 nr)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_cookie insert;
+ int ret;
+ u64 i = 0;
+
+ bkey_cookie_init(&insert.k_i);
+
+ for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) {
+ insert.k.p = iter.pos;
+
+ ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+ BTREE_INSERT_ENTRY(&iter, &insert.k_i));
+ BUG_ON(ret);
+
+ if (++i == nr)
+ break;
+ }
+ bch2_btree_iter_unlock(&iter);
+}
+
+static void seq_lookup(struct bch_fs *c, u64 nr)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k)
+ ;
+ bch2_btree_iter_unlock(&iter);
+}
+
+static void seq_overwrite(struct bch_fs *c, u64 nr)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN,
+ BTREE_ITER_INTENT, k) {
+ struct bkey_i_cookie u;
+
+ bkey_reassemble(&u.k_i, k);
+
+ ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+ BTREE_INSERT_ENTRY(&iter, &u.k_i));
+ BUG_ON(ret);
+ }
+ bch2_btree_iter_unlock(&iter);
+}
+
+static void seq_delete(struct bch_fs *c, u64 nr)
+{
+ int ret;
+
+ ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
+ POS_MIN, POS_MAX,
+ ZERO_VERSION, NULL, NULL, NULL);
+ BUG_ON(ret);
+}
+
+typedef void (*perf_test_fn)(struct bch_fs *, u64);
+
+struct test_job {
+ struct bch_fs *c;
+ u64 nr;
+ unsigned nr_threads;
+ perf_test_fn fn;
+
+ atomic_t ready;
+ wait_queue_head_t ready_wait;
+
+ atomic_t done;
+ struct completion done_completion;
+
+ u64 start;
+ u64 finish;
+};
+
+static int btree_perf_test_thread(void *data)
+{
+ struct test_job *j = data;
+
+ if (atomic_dec_and_test(&j->ready)) {
+ wake_up(&j->ready_wait);
+ j->start = sched_clock();
+ } else {
+ wait_event(j->ready_wait, !atomic_read(&j->ready));
+ }
+
+ j->fn(j->c, j->nr / j->nr_threads);
+
+ if (atomic_dec_and_test(&j->done)) {
+ j->finish = sched_clock();
+ complete(&j->done_completion);
+ }
+
+ return 0;
+}
+
+void bch2_btree_perf_test(struct bch_fs *c, const char *testname,
+ u64 nr, unsigned nr_threads)
+{
+ struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
+ char name_buf[20], nr_buf[20], per_sec_buf[20];
+ unsigned i;
+ u64 time;
+
+ atomic_set(&j.ready, nr_threads);
+ init_waitqueue_head(&j.ready_wait);
+
+ atomic_set(&j.done, nr_threads);
+ init_completion(&j.done_completion);
+
+#define perf_test(_test) \
+ if (!strcmp(testname, #_test)) j.fn = _test
+
+ perf_test(rand_insert);
+ perf_test(rand_lookup);
+ perf_test(rand_mixed);
+ perf_test(rand_delete);
+
+ perf_test(seq_insert);
+ perf_test(seq_lookup);
+ perf_test(seq_overwrite);
+ perf_test(seq_delete);
+
+ /* a unit test, not a perf test: */
+ perf_test(test_delete);
+
+ if (!j.fn) {
+ pr_err("unknown test %s", testname);
+ return;
+ }
+
+ //pr_info("running test %s:", testname);
+
+ if (nr_threads == 1)
+ btree_perf_test_thread(&j);
+ else
+ for (i = 0; i < nr_threads; i++)
+ kthread_run(btree_perf_test_thread, &j,
+ "bcachefs perf test[%u]", i);
+
+ while (wait_for_completion_interruptible(&j.done_completion))
+ ;
+
+ time = j.finish - j.start;
+
+ scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
+ bch2_hprint(nr_buf, nr);
+ bch2_hprint(per_sec_buf, nr * NSEC_PER_SEC / time);
+ printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
+ name_buf, nr_buf, nr_threads,
+ time / NSEC_PER_SEC,
+ time * nr_threads / nr,
+ per_sec_buf);
+}
+
+#endif /* CONFIG_BCACHEFS_TESTS */
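
On the timing in bch2_btree_perf_test(): each worker runs nr / nr_threads iterations; j.start is taken by the last thread to decrement j.ready (i.e. once every worker is in place) and j.finish by the last one to decrement j.done, so the elapsed time covers only the fully parallel region. The "nsec per iter" column is therefore time * nr_threads / nr, the average cost of one iteration on a single thread, and the "per sec" column is the aggregate throughput nr * NSEC_PER_SEC / time. nr should be a multiple of nr_threads, since the per-thread iteration count truncates.
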
diff --git a/libbcachefs/tests.h b/libbcachefs/tests.h
new file mode 100644
index 0000000..3f1b8d1
--- /dev/null
+++ b/libbcachefs/tests.h
@@ -0,0 +1,14 @@
+#ifndef _BCACHEFS_TEST_H
+#define _BCACHEFS_TEST_H
+
+struct bch_fs;
+
+#ifdef CONFIG_BCACHEFS_TESTS
+
+void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
+
+#else
+
+#endif /* CONFIG_BCACHEFS_TESTS */
+
+#endif /* _BCACHEFS_TEST_H */
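
The #else branch above is empty: there is no !CONFIG_BCACHEFS_TESTS stub for bch2_btree_perf_test(), which works out because its only caller, the sysfs perf_test handler, is wrapped in the same #ifdef.
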
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index e263dd2..24c6cc5 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -27,55 +27,73 @@
#define simple_strtoint(c, end, base) simple_strtol(c, end, base)
#define simple_strtouint(c, end, base) simple_strtoul(c, end, base)
+static const char si_units[] = "?kMGTPEZY";
+
+static int __bch2_strtoh(const char *cp, u64 *res,
+ u64 t_max, bool t_signed)
+{
+ bool positive = *cp != '-';
+ unsigned u;
+ u64 v = 0;
+
+ if (*cp == '+' || *cp == '-')
+ cp++;
+
+ if (!isdigit(*cp))
+ return -EINVAL;
+
+ do {
+ if (v > U64_MAX / 10)
+ return -ERANGE;
+ v *= 10;
+ if (v > U64_MAX - (*cp - '0'))
+ return -ERANGE;
+ v += *cp - '0';
+ cp++;
+ } while (isdigit(*cp));
+
+ for (u = 1; u < ARRAY_SIZE(si_units); u++)
+ if (*cp == si_units[u]) {
+ cp++;
+ goto got_unit;
+ }
+ u = 0;
+got_unit:
+ if (*cp == '\n')
+ cp++;
+ if (*cp)
+ return -EINVAL;
+
+ if (fls64(v) + u * 10 > 64)
+ return -ERANGE;
+
+ v <<= u * 10;
+
+ if (positive) {
+ if (v > t_max)
+ return -ERANGE;
+ } else {
+ if (v && !t_signed)
+ return -ERANGE;
+
+ if (v > t_max + 1)
+ return -ERANGE;
+ v = -v;
+ }
+
+ *res = v;
+ return 0;
+}
+
#define STRTO_H(name, type) \
int bch2_ ## name ## _h(const char *cp, type *res) \
{ \
- int u = 0; \
- char *e; \
- type i = simple_ ## name(cp, &e, 10); \
- \
- switch (tolower(*e)) { \
- default: \
- return -EINVAL; \
- case 'y': \
- case 'z': \
- u++; \
- case 'e': \
- u++; \
- case 'p': \
- u++; \
- case 't': \
- u++; \
- case 'g': \
- u++; \
- case 'm': \
- u++; \
- case 'k': \
- u++; \
- if (e++ == cp) \
- return -EINVAL; \
- case '\n': \
- case '\0': \
- if (*e == '\n') \
- e++; \
- } \
- \
- if (*e) \
- return -EINVAL; \
- \
- while (u--) { \
- if ((type) ~0 > 0 && \
- (type) ~0 / 1024 <= i) \
- return -EINVAL; \
- if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \
- (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \
- return -EINVAL; \
- i *= 1024; \
- } \
- \
- *res = i; \
- return 0; \
-} \
+ u64 v; \
+ int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \
+ ANYSINT_MAX(type) != ((type) ~0ULL)); \
+ *res = v; \
+ return ret; \
+}
STRTO_H(strtoint, int)
STRTO_H(strtouint, unsigned int)
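
The rewritten parser accepts an optional sign, a decimal integer, and an optional 1024-based suffix from si_units[] ('k' through 'Y', each step adding a 10-bit shift), with explicit overflow checks on both the decimal accumulation and the final shift. A sketch of the expected behaviour (illustrative only; __bch2_strtoh() is static to util.c, so checks like these would have to live in this file):

	u64 v;

	/* "2M": the digits give 2, 'M' is si_units[2], so the result is
	 * shifted left by 2 * 10 bits: v == 2ULL << 20 == 2097152. */
	BUG_ON(__bch2_strtoh("2M", &v, U64_MAX, false) || v != 2ULL << 20);

	/* A negative value with an unsigned target type is rejected: */
	BUG_ON(__bch2_strtoh("-1", &v, U64_MAX, false) != -ERANGE);

	/* "32E" is 32 << 60 == 2^65, which no longer fits in 64 bits;
	 * the fls64() check catches it before the shift: */
	BUG_ON(__bch2_strtoh("32E", &v, U64_MAX, false) != -ERANGE);
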
@@ -84,7 +102,6 @@ STRTO_H(strtoull, unsigned long long)
ssize_t bch2_hprint(char *buf, s64 v)
{
- static const char units[] = "?kMGTPEZY";
char dec[4] = "";
int u, t = 0;
@@ -103,7 +120,7 @@ ssize_t bch2_hprint(char *buf, s64 v)
if (v < 100 && v > -100)
scnprintf(dec, sizeof(dec), ".%i", t / 103);
- return sprintf(buf, "%lli%s%c", v, dec, units[u]);
+ return sprintf(buf, "%lli%s%c", v, dec, si_units[u]);
}
ssize_t bch2_scnprint_string_list(char *buf, size_t size,
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index c89c720..de95480 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -15,7 +15,7 @@
static unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
{
- return DIV_ROUND_UP(sizeof(struct bch_xattr) +
+ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) +
name_len + val_len, sizeof(u64));
}