-rw-r--r--  .bcachefs_revision                 2
-rw-r--r--  cmd_migrate.c                      5
-rw-r--r--  libbcachefs/acl.c                147
-rw-r--r--  libbcachefs/acl.h                 27
-rw-r--r--  libbcachefs/bcachefs.h             7
-rw-r--r--  libbcachefs/bcachefs_format.h      7
-rw-r--r--  libbcachefs/bkey.h                 8
-rw-r--r--  libbcachefs/bset.c                 2
-rw-r--r--  libbcachefs/btree_cache.c          1
-rw-r--r--  libbcachefs/btree_io.c             5
-rw-r--r--  libbcachefs/btree_iter.c         283
-rw-r--r--  libbcachefs/btree_iter.h          64
-rw-r--r--  libbcachefs/btree_types.h         34
-rw-r--r--  libbcachefs/btree_update.h        38
-rw-r--r--  libbcachefs/btree_update_leaf.c   99
-rw-r--r--  libbcachefs/dirent.c             284
-rw-r--r--  libbcachefs/dirent.h              12
-rw-r--r--  libbcachefs/error.c                5
-rw-r--r--  libbcachefs/error.h                6
-rw-r--r--  libbcachefs/fs-io.c               93
-rw-r--r--  libbcachefs/fs-ioctl.c            25
-rw-r--r--  libbcachefs/fs.c                 954
-rw-r--r--  libbcachefs/fs.h                  10
-rw-r--r--  libbcachefs/fsck.c               510
-rw-r--r--  libbcachefs/fsck.h                 2
-rw-r--r--  libbcachefs/inode.c              111
-rw-r--r--  libbcachefs/inode.h                5
-rw-r--r--  libbcachefs/recovery.c            65
-rw-r--r--  libbcachefs/str_hash.h           317
-rw-r--r--  libbcachefs/util.c                 2
-rw-r--r--  libbcachefs/xattr.c               94
-rw-r--r--  libbcachefs/xattr.h                9
32 files changed, 2082 insertions, 1151 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index f180717..dddb044 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-940d6ca657ea70758f3f43323bfd531019a40d3c
+eab3b355cf6fcabbf07d7a9032c68e95cab37ad0
diff --git a/cmd_migrate.c b/cmd_migrate.c
index 6186653..44283c3 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -239,8 +239,9 @@ static void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst,
const struct xattr_handler *h = xattr_resolve_name(&attr);
- int ret = bch2_xattr_set(c, dst->bi_inum, &hash_info, attr,
- val, val_size, 0, h->flags, NULL);
+ int ret = bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC,
+ bch2_xattr_set(&trans, dst->bi_inum, &hash_info, attr,
+ val, val_size, h->flags, 0));
if (ret < 0)
die("error creating xattr: %s", strerror(-ret));
}
diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c
index a8735bc..534ea94 100644
--- a/libbcachefs/acl.c
+++ b/libbcachefs/acl.c
@@ -132,7 +132,8 @@ invalid:
* Convert from in-memory to filesystem representation.
*/
static struct bkey_i_xattr *
-bch2_acl_to_xattr(const struct posix_acl *acl,
+bch2_acl_to_xattr(struct btree_trans *trans,
+ const struct posix_acl *acl,
int type)
{
struct bkey_i_xattr *xattr;
@@ -164,7 +165,7 @@ bch2_acl_to_xattr(const struct posix_acl *acl,
if (u64s > U8_MAX)
return ERR_PTR(-E2BIG);
- xattr = kmalloc(u64s * sizeof(u64), GFP_KERNEL);
+ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
if (IS_ERR(xattr))
return xattr;
@@ -214,20 +215,29 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_iter iter;
+ struct btree_trans trans;
+ struct btree_iter *iter;
struct bkey_s_c_xattr xattr;
- struct bkey_s_c k;
struct posix_acl *acl = NULL;
- int name_index = acl_to_xattr_type(type);
- k = bch2_xattr_get_iter(c, &iter, inode, "", name_index);
- if (IS_ERR(k.k)) {
- if (PTR_ERR(k.k) != -ENOENT)
- acl = ERR_CAST(k.k);
+ bch2_trans_init(&trans, c);
+retry:
+ bch2_trans_begin(&trans);
+
+ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
+ &inode->ei_str_hash, inode->v.i_ino,
+ &X_SEARCH(acl_to_xattr_type(type), "", 0),
+ 0);
+ if (IS_ERR(iter)) {
+ if (PTR_ERR(iter) == -EINTR)
+ goto retry;
+
+ if (PTR_ERR(iter) != -ENOENT)
+ acl = ERR_CAST(iter);
goto out;
}
- xattr = bkey_s_c_to_xattr(k);
+ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
acl = bch2_acl_from_disk(xattr_val(xattr.v),
le16_to_cpu(xattr.v->x_val_len));
@@ -235,49 +245,59 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type)
if (!IS_ERR(acl))
set_cached_acl(&inode->v, type, acl);
out:
- bch2_btree_iter_unlock(&iter);
+ bch2_trans_exit(&trans);
return acl;
}
-int __bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
+int bch2_set_acl_trans(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode_u,
+ const struct bch_hash_info *hash_info,
+ struct posix_acl *acl, int type)
{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
int ret;
if (type == ACL_TYPE_DEFAULT &&
- !S_ISDIR(inode->v.i_mode))
+ !S_ISDIR(inode_u->bi_mode))
return acl ? -EACCES : 0;
if (acl) {
struct bkey_i_xattr *xattr =
- bch2_acl_to_xattr(acl, type);
+ bch2_acl_to_xattr(trans, acl, type);
if (IS_ERR(xattr))
return PTR_ERR(xattr);
- ret = bch2_hash_set(bch2_xattr_hash_desc, &inode->ei_str_hash,
- c, inode->v.i_ino, &inode->ei_journal_seq,
- &xattr->k_i, 0);
- kfree(xattr);
+ ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
+ inode_u->bi_inum, &xattr->k_i, 0);
} else {
struct xattr_search_key search =
X_SEARCH(acl_to_xattr_type(type), "", 0);
- ret = bch2_hash_delete(bch2_xattr_hash_desc, &inode->ei_str_hash,
- c, inode->v.i_ino, &inode->ei_journal_seq,
- &search);
+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
+ inode_u->bi_inum, &search);
}
- if (!ret)
- set_cached_acl(&inode->v, type, acl);
+ return ret == -ENOENT ? 0 : ret;
+}
- return ret;
+static int inode_update_for_set_acl_fn(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct timespec now = current_time(&inode->v);
+ umode_t mode = (unsigned long) p;
+
+ bi->bi_ctime = timespec_to_bch2_time(c, now);
+ bi->bi_mode = mode;
+ return 0;
}
int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct btree_trans trans;
+ struct bch_inode_unpacked inode_u;
umode_t mode = inode->v.i_mode;
int ret;
@@ -287,19 +307,76 @@ int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
return ret;
}
- ret = __bch2_set_acl(vinode, acl, type);
- if (ret)
- return ret;
+ bch2_trans_init(&trans, c);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = bch2_set_acl_trans(&trans,
+ &inode->ei_inode,
+ &inode->ei_str_hash,
+ acl, type) ?:
+ bch2_write_inode_trans(&trans, inode, &inode_u,
+ inode_update_for_set_acl_fn,
+ (void *)(unsigned long) mode) ?:
+ bch2_trans_commit(&trans, NULL, NULL,
+ &inode->ei_journal_seq,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOUNLOCK);
+ if (ret == -EINTR)
+ goto retry;
+ if (unlikely(ret))
+ goto err;
+
+ bch2_inode_update_after_write(c, inode, &inode_u,
+ ATTR_CTIME|ATTR_MODE);
+
+ set_cached_acl(&inode->v, type, acl);
+err:
+ bch2_trans_exit(&trans);
+
+ return ret;
+}
+
+int bch2_acl_chmod(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ umode_t mode,
+ struct posix_acl **new_acl)
+{
+ struct btree_iter *iter;
+ struct bkey_s_c_xattr xattr;
+ struct bkey_i_xattr *new;
+ struct posix_acl *acl;
+ int ret = 0;
- if (mode != inode->v.i_mode) {
- mutex_lock(&inode->ei_update_lock);
- inode->v.i_mode = mode;
- inode->v.i_ctime = current_time(&inode->v);
+ iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
+ &inode->ei_str_hash, inode->v.i_ino,
+ &X_SEARCH(BCH_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
+ BTREE_ITER_INTENT);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter) != -ENOENT ? PTR_ERR(iter) : 0;
+
+ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
+
+ acl = bch2_acl_from_disk(xattr_val(xattr.v),
+ le16_to_cpu(xattr.v->x_val_len));
+ if (IS_ERR_OR_NULL(acl))
+ return PTR_ERR(acl);
+
+ ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
+ if (ret)
+ goto err;
- ret = bch2_write_inode(c, inode);
- mutex_unlock(&inode->ei_update_lock);
+ new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
+ if (IS_ERR(new)) {
+ ret = PTR_ERR(new);
+ goto err;
}
+ bch2_trans_update(trans, iter, &new->k_i, 0);
+ *new_acl = acl;
+ acl = NULL;
+err:
+ kfree(acl);
return ret;
}
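/*
 * A minimal sketch of the lock-restart loop that bch2_set_acl() above (and
 * the other converted paths below) follow.  do_updates() is a hypothetical
 * stand-in for transactional helpers such as bch2_set_acl_trans() or
 * bch2_write_inode_trans() that only queue updates against the transaction:
 */
static int example_atomic_update(struct bch_fs *c, u64 *journal_seq)
{
	struct btree_trans trans;
	int ret;

	bch2_trans_init(&trans, c);
retry:
	bch2_trans_begin(&trans);

	/* queue one or more updates, then commit them atomically: */
	ret = do_updates(&trans) ?:
	      bch2_trans_commit(&trans, NULL, NULL, journal_seq,
				BTREE_INSERT_ATOMIC|
				BTREE_INSERT_NOUNLOCK);

	/* -EINTR means locks were dropped: restart the whole transaction */
	if (ret == -EINTR)
		goto retry;

	bch2_trans_exit(&trans);
	return ret;
}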
diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h
index 0be31ee..e067243 100644
--- a/libbcachefs/acl.h
+++ b/libbcachefs/acl.h
@@ -1,6 +1,11 @@
#ifndef _BCACHEFS_ACL_H
#define _BCACHEFS_ACL_H
+struct bch_inode_unpacked;
+struct bch_hash_info;
+struct bch_inode_info;
+struct posix_acl;
+
#ifdef CONFIG_BCACHEFS_POSIX_ACL
#define BCH_ACL_VERSION 0x0001
@@ -20,20 +25,30 @@ typedef struct {
__le32 a_version;
} bch_acl_header;
-struct posix_acl;
+struct posix_acl *bch2_get_acl(struct inode *, int);
-extern struct posix_acl *bch2_get_acl(struct inode *, int);
-extern int __bch2_set_acl(struct inode *, struct posix_acl *, int);
-extern int bch2_set_acl(struct inode *, struct posix_acl *, int);
+int bch2_set_acl_trans(struct btree_trans *,
+ struct bch_inode_unpacked *,
+ const struct bch_hash_info *,
+ struct posix_acl *, int);
+int bch2_set_acl(struct inode *, struct posix_acl *, int);
+int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *,
+ umode_t, struct posix_acl **);
#else
-static inline int __bch2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+static inline int bch2_set_acl_trans(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode_u,
+ const struct bch_hash_info *hash_info,
+ struct posix_acl *acl, int type)
{
return 0;
}
-static inline int bch2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+static inline int bch2_acl_chmod(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ umode_t mode,
+ struct posix_acl **new_acl)
{
return 0;
}
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 1482b80..bd5ea6f 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -262,7 +262,11 @@ do { \
BCH_DEBUG_PARAM(journal_seq_verify, \
"Store the journal sequence number in the version " \
"number of every btree key, and verify that btree " \
- "update ordering is preserved during recovery")
+ "update ordering is preserved during recovery") \
+ BCH_DEBUG_PARAM(inject_invalid_keys, \
+ "Store the journal sequence number in the version " \
+ "number of every btree key, and verify that btree " \
+ "update ordering is preserved during recovery") \
#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
@@ -465,6 +469,7 @@ enum {
/* misc: */
BCH_FS_BDEV_MOUNTED,
BCH_FS_FSCK_FIXED_ERRORS,
+ BCH_FS_FSCK_UNFIXED_ERRORS,
BCH_FS_FIXED_GENS,
BCH_FS_REBUILD_REPLICAS,
BCH_FS_HOLD_BTREE_WRITES,
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index b6e7b98..e300738 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -722,9 +722,7 @@ enum {
__BCH_INODE_I_SIZE_DIRTY    = 5,
__BCH_INODE_I_SECTORS_DIRTY = 6,
-
- /* not implemented yet: */
- __BCH_INODE_HAS_XATTRS = 7, /* has xattrs in xattr btree */
+ __BCH_INODE_UNLINKED = 7,
/* bits 20+ reserved for packed fields below: */
};
@@ -736,7 +734,7 @@ enum {
#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME)
#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY)
#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY)
-#define BCH_INODE_HAS_XATTRS (1 << __BCH_INODE_HAS_XATTRS)
+#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED)
LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32);
@@ -1222,6 +1220,7 @@ enum bch_sb_features {
BCH_FEATURE_LZ4 = 0,
BCH_FEATURE_GZIP = 1,
BCH_FEATURE_ZSTD = 2,
+ BCH_FEATURE_ATOMIC_NLINK = 3,
};
/* options: */
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index 2f62bd8..bd1d21b 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -206,14 +206,12 @@ void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
{
- if (l.hi != r.hi)
- return l.hi < r.hi ? -1 : 1;
- if (l.lo != r.lo)
- return l.lo < r.lo ? -1 : 1;
- return 0;
+ return (l.hi > r.hi) - (l.hi < r.hi) ?:
+ (l.lo > r.lo) - (l.lo < r.lo);
}
#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
+#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL })
static __always_inline int bversion_zero(struct bversion v)
{
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 5c77787..8c77fc5 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -1449,7 +1449,7 @@ static struct bkey_packed *bch2_bset_search(struct btree *b,
!btree_iter_pos_cmp_packed(b, &search, m, strictly_greater))
m = bkey_next(m);
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+ if (btree_keys_expensive_checks(b)) {
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
BUG_ON(prev &&
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index f15a415..db3712a 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -730,6 +730,7 @@ retry:
if (bch2_btree_node_relock(iter, level + 1))
goto retry;
+ trans_restart();
return ERR_PTR(-EINTR);
}
}
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 847dfd6..94f56db 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1298,7 +1298,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
const char *invalid = bch2_bkey_val_invalid(c, type, u);
- if (invalid) {
+ if (invalid ||
+ (inject_invalid_keys(c) &&
+ !bversion_cmp(u.k->version, MAX_VERSION))) {
char buf[160];
bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
@@ -1310,6 +1312,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
+ set_btree_bset_end(b, b->set);
continue;
}
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 097b68e..a52ec12 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -262,6 +262,9 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
if (ret)
__btree_node_lock_type(c, b, type);
+ else
+ trans_restart();
+
return ret;
}
@@ -1555,6 +1558,7 @@ void bch2_btree_iter_unlink(struct btree_iter *iter)
for_each_linked_btree_iter(iter, linked)
if (linked->next == iter) {
linked->next = iter->next;
+ iter->next = iter;
return;
}
@@ -1571,8 +1575,9 @@ void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
unsigned nr_iters = 0;
- for_each_btree_iter(iter, new)
- nr_iters++;
+ for_each_btree_iter(new, iter)
+ if (iter->btree_id == new->btree_id)
+ nr_iters++;
BUG_ON(nr_iters > SIX_LOCK_MAX_RECURSE);
}
@@ -1580,8 +1585,278 @@ void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
{
+ unsigned i;
+
__bch2_btree_iter_unlock(dst);
memcpy(dst, src, offsetof(struct btree_iter, next));
- dst->nodes_locked = dst->nodes_intent_locked = 0;
- dst->uptodate = BTREE_ITER_NEED_RELOCK;
+
+ for (i = 0; i < BTREE_MAX_DEPTH; i++)
+ if (btree_node_locked(dst, i))
+ six_lock_increment(&dst->l[i].b->lock,
+ __btree_lock_want(dst, i));
+}
+
+/* new transactional stuff: */
+
+static void btree_trans_verify(struct btree_trans *trans)
+{
+ unsigned i;
+
+ for (i = 0; i < trans->nr_iters; i++) {
+ struct btree_iter *iter = &trans->iters[i];
+
+ BUG_ON(btree_iter_linked(iter) !=
+ ((trans->iters_linked & (1 << i)) &&
+ !is_power_of_2(trans->iters_linked)));
+ }
+}
+
+void bch2_trans_iter_free(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ unsigned idx;
+
+ for (idx = 0; idx < trans->nr_iters; idx++)
+ if (&trans->iters[idx] == iter)
+ goto found;
+ BUG();
+found:
+ BUG_ON(!(trans->iters_linked & (1U << idx)));
+
+ trans->iters_live &= ~(1U << idx);
+ trans->iters_linked &= ~(1U << idx);
+ bch2_btree_iter_unlink(iter);
+}
+
+static int btree_trans_realloc_iters(struct btree_trans *trans)
+{
+ struct btree_iter *new_iters;
+ unsigned i;
+
+ bch2_trans_unlock(trans);
+
+ new_iters = kmalloc(sizeof(struct btree_iter) * BTREE_ITER_MAX,
+ GFP_NOFS);
+ if (!new_iters)
+ return -ENOMEM;
+
+ memcpy(new_iters, trans->iters,
+ sizeof(struct btree_iter) * trans->nr_iters);
+ trans->iters = new_iters;
+
+ for (i = 0; i < trans->nr_iters; i++)
+ trans->iters[i].next = &trans->iters[i];
+
+ if (trans->iters_linked) {
+ unsigned first_linked = __ffs(trans->iters_linked);
+
+ for (i = first_linked + 1; i < trans->nr_iters; i++)
+ if (trans->iters_linked & (1 << i))
+ bch2_btree_iter_link(&trans->iters[first_linked],
+ &trans->iters[i]);
+ }
+
+ btree_trans_verify(trans);
+
+ if (trans->iters_live) {
+ trans_restart();
+ return -EINTR;
+ }
+
+ return 0;
+}
+
+int bch2_trans_preload_iters(struct btree_trans *trans)
+{
+ if (trans->iters != trans->iters_onstack)
+ return 0;
+
+ return btree_trans_realloc_iters(trans);
+}
+
+static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
+ unsigned btree_id,
+ unsigned flags, u64 iter_id)
+{
+ struct btree_iter *iter;
+ int idx;
+
+ BUG_ON(trans->nr_iters > BTREE_ITER_MAX);
+
+ for (idx = 0; idx < trans->nr_iters; idx++)
+ if (trans->iter_ids[idx] == iter_id)
+ goto found;
+ idx = -1;
+found:
+ if (idx < 0) {
+ idx = ffz(trans->iters_linked);
+ if (idx < trans->nr_iters)
+ goto got_slot;
+
+ BUG_ON(trans->nr_iters == BTREE_ITER_MAX);
+
+ if (trans->iters == trans->iters_onstack &&
+ trans->nr_iters == ARRAY_SIZE(trans->iters_onstack)) {
+ int ret = btree_trans_realloc_iters(trans);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ idx = trans->nr_iters++;
+got_slot:
+ trans->iter_ids[idx] = iter_id;
+ iter = &trans->iters[idx];
+
+ bch2_btree_iter_init(iter, trans->c, btree_id, POS_MIN, flags);
+ } else {
+ iter = &trans->iters[idx];
+
+ BUG_ON(iter->btree_id != btree_id);
+ BUG_ON((iter->flags ^ flags) &
+ (BTREE_ITER_SLOTS|BTREE_ITER_IS_EXTENTS));
+
+ iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
+ iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
+ }
+
+ BUG_ON(trans->iters_live & (1 << idx));
+ trans->iters_live |= 1 << idx;
+
+ if (trans->iters_linked &&
+ !(trans->iters_linked & (1 << idx)))
+ bch2_btree_iter_link(&trans->iters[__ffs(trans->iters_linked)],
+ iter);
+
+ trans->iters_linked |= 1 << idx;
+
+ btree_trans_verify(trans);
+
+ return iter;
+}
+
+struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bpos pos, unsigned flags,
+ u64 iter_id)
+{
+ struct btree_iter *iter =
+ __btree_trans_get_iter(trans, btree_id, flags, iter_id);
+
+ if (!IS_ERR(iter))
+ bch2_btree_iter_set_pos(iter, pos);
+ return iter;
+}
+
+struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
+ struct btree_iter *src,
+ u64 iter_id)
+{
+ struct btree_iter *iter =
+ __btree_trans_get_iter(trans, src->btree_id,
+ src->flags, iter_id);
+
+ if (!IS_ERR(iter))
+ bch2_btree_iter_copy(iter, src);
+ return iter;
+}
+
+void *bch2_trans_kmalloc(struct btree_trans *trans,
+ size_t size)
+{
+ void *ret;
+
+ if (trans->mem_top + size > trans->mem_bytes) {
+ size_t old_bytes = trans->mem_bytes;
+ size_t new_bytes = roundup_pow_of_two(trans->mem_top + size);
+ void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
+
+ if (!new_mem)
+ return ERR_PTR(-ENOMEM);
+
+ trans->mem = new_mem;
+ trans->mem_bytes = new_bytes;
+
+ if (old_bytes) {
+ trans_restart();
+ return ERR_PTR(-EINTR);
+ }
+ }
+
+ ret = trans->mem + trans->mem_top;
+ trans->mem_top += size;
+ return ret;
+}
+
+int bch2_trans_unlock(struct btree_trans *trans)
+{
+ unsigned iters = trans->iters_linked;
+ int ret = 0;
+
+ while (iters) {
+ unsigned idx = __ffs(iters);
+ struct btree_iter *iter = &trans->iters[idx];
+
+ if (iter->flags & BTREE_ITER_ERROR)
+ ret = -EIO;
+
+ __bch2_btree_iter_unlock(iter);
+ iters ^= 1 << idx;
+ }
+
+ return ret;
+}
+
+void __bch2_trans_begin(struct btree_trans *trans)
+{
+ unsigned idx;
+
+ btree_trans_verify(trans);
+
+ /*
+ * On transaction restart, the transaction isn't required to allocate
+ * all the same iterators it did on the last iteration:
+ *
+ * Unlink any iterators it didn't use this iteration, assuming it got
+ * further (allocated an iter with a higher idx) than where the iter
+ * was originally allocated:
+ */
+ while (trans->iters_linked &&
+ trans->iters_live &&
+ (idx = __fls(trans->iters_linked)) >
+ __fls(trans->iters_live)) {
+ trans->iters_linked ^= 1 << idx;
+ bch2_btree_iter_unlink(&trans->iters[idx]);
+ }
+
+ trans->iters_live = 0;
+ trans->nr_updates = 0;
+ trans->mem_top = 0;
+
+ btree_trans_verify(trans);
+}
+
+void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c)
+{
+ trans->c = c;
+ trans->nr_restarts = 0;
+ trans->nr_iters = 0;
+ trans->iters_live = 0;
+ trans->iters_linked = 0;
+ trans->nr_updates = 0;
+ trans->mem_top = 0;
+ trans->mem_bytes = 0;
+ trans->mem = NULL;
+ trans->iters = trans->iters_onstack;
+}
+
+int bch2_trans_exit(struct btree_trans *trans)
+{
+ int ret = bch2_trans_unlock(trans);
+
+ kfree(trans->mem);
+ if (trans->iters != trans->iters_onstack)
+ kfree(trans->iters);
+ trans->mem = (void *) 0x1;
+ trans->iters = (void *) 0x1;
+ return ret;
}
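/*
 * A minimal sketch of the intended bch2_trans_kmalloc() usage (compare
 * dirent_create_key() and bch2_acl_to_xattr() in this series): the memory
 * belongs to the transaction - it is reclaimed by bch2_trans_exit() and
 * reset by bch2_trans_begin() - and the call may return ERR_PTR(-EINTR)
 * when the buffer has to grow, forcing a transaction restart.
 */
static struct bkey_i *example_alloc_key(struct btree_trans *trans,
					unsigned u64s)
{
	struct bkey_i *k = bch2_trans_kmalloc(trans, u64s * sizeof(u64));

	if (IS_ERR(k))
		return k;	/* -ENOMEM, or -EINTR to trigger a restart */

	bkey_init(&k->k);
	k->k.u64s = u64s;
	return k;
}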
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 5db1cc5..d046ad7 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -269,4 +269,68 @@ static inline int btree_iter_err(struct bkey_s_c k)
return PTR_ERR_OR_ZERO(k.k);
}
+/* new multiple iterator interface: */
+
+int bch2_trans_preload_iters(struct btree_trans *);
+void bch2_trans_iter_free(struct btree_trans *,
+ struct btree_iter *);
+
+struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
+ struct bpos, unsigned, u64);
+struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *,
+ struct btree_iter *, u64);
+
+static __always_inline u64 __btree_iter_id(void)
+{
+ u64 ret = 0;
+
+ ret <<= 32;
+ ret |= _RET_IP_ & U32_MAX;
+ ret <<= 32;
+ ret |= _THIS_IP_ & U32_MAX;
+ return ret;
+}
+
+static __always_inline struct btree_iter *
+bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
+ struct bpos pos, unsigned flags)
+{
+ return __bch2_trans_get_iter(trans, btree_id, pos, flags,
+ __btree_iter_id());
+}
+
+static __always_inline struct btree_iter *
+bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src)
+{
+
+ return __bch2_trans_copy_iter(trans, src, __btree_iter_id());
+}
+
+void __bch2_trans_begin(struct btree_trans *);
+
+void *bch2_trans_kmalloc(struct btree_trans *, size_t);
+int bch2_trans_unlock(struct btree_trans *);
+void bch2_trans_init(struct btree_trans *, struct bch_fs *);
+int bch2_trans_exit(struct btree_trans *);
+
+#ifdef TRACE_TRANSACTION_RESTARTS
+#define bch2_trans_begin(_trans) \
+do { \
+ if (is_power_of_2((_trans)->nr_restarts) && \
+ (_trans)->nr_restarts >= 8) \
+ pr_info("nr restarts: %zu", (_trans)->nr_restarts); \
+ \
+ (_trans)->nr_restarts++; \
+ __bch2_trans_begin(_trans); \
+} while (0)
+#else
+#define bch2_trans_begin(_trans) __bch2_trans_begin(_trans)
+#endif
+
+#ifdef TRACE_TRANSACTION_RESTARTS_ALL
+#define trans_restart(...) pr_info("transaction restart" __VA_ARGS__)
+#else
+#define trans_restart(...) no_printk("transaction restart" __VA_ARGS__)
+#endif
+
#endif /* _BCACHEFS_BTREE_ITER_H */
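/*
 * A minimal sketch of a single-key lookup with the iterator interface
 * declared above: the iterator is owned by the transaction (keyed by call
 * site via __btree_iter_id()) and is cleaned up by bch2_trans_exit(), so
 * nothing needs to be unlocked or freed by hand.  The inodes btree is used
 * here purely as an example position.
 */
static int example_peek_inode(struct btree_trans *trans, u64 inum,
			      struct bkey_s_c *out)
{
	struct btree_iter *iter;
	struct bkey_s_c k;

	iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0),
				   BTREE_ITER_SLOTS);
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	k = bch2_btree_iter_peek_slot(iter);
	if (btree_iter_err(k))
		return btree_iter_err(k);

	*out = k;
	return 0;
}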
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index daa648c..39e2db7 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -253,6 +253,40 @@ struct btree_iter {
struct btree_iter *next;
};
+#define BTREE_ITER_MAX 8
+
+struct btree_insert_entry {
+ struct btree_iter *iter;
+ struct bkey_i *k;
+ unsigned extra_res;
+ /*
+ * true if entire key was inserted - can only be false for
+ * extents
+ */
+ bool done;
+};
+
+struct btree_trans {
+ struct bch_fs *c;
+ size_t nr_restarts;
+
+ u8 nr_iters;
+ u8 iters_live;
+ u8 iters_linked;
+ u8 nr_updates;
+
+ unsigned mem_top;
+ unsigned mem_bytes;
+ void *mem;
+
+ struct btree_iter *iters;
+ u64 iter_ids[BTREE_ITER_MAX];
+
+ struct btree_insert_entry updates[BTREE_ITER_MAX];
+
+ struct btree_iter iters_onstack[2];
+};
+
#define BTREE_FLAG(flag) \
static inline bool btree_node_ ## flag(struct btree *b) \
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index aac9795..5e47d4c 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -27,16 +27,7 @@ struct btree_insert {
bool did_work;
unsigned short nr;
- struct btree_insert_entry {
- struct btree_iter *iter;
- struct bkey_i *k;
- unsigned extra_res;
- /*
- * true if entire key was inserted - can only be false for
- * extents
- */
- bool done;
- } *entries;
+ struct btree_insert_entry *entries;
};
int __bch2_btree_insert_at(struct btree_insert *);
@@ -149,4 +140,31 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
struct btree *, struct bkey_i_extent *);
+/* new transactional interface: */
+
+void bch2_trans_update(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, unsigned);
+int bch2_trans_commit(struct btree_trans *,
+ struct disk_reservation *,
+ struct extent_insert_hook *,
+ u64 *, unsigned);
+
+#define bch2_trans_do(_c, _journal_seq, _flags, _do) \
+({ \
+ struct btree_trans trans; \
+ int _ret; \
+ \
+ bch2_trans_init(&trans, (_c)); \
+ \
+ do { \
+ bch2_trans_begin(&trans); \
+ \
+ _ret = (_do) ?: bch2_trans_commit(&trans, NULL, NULL, \
+ (_journal_seq), (_flags)); \
+ } while (_ret == -EINTR); \
+ \
+ bch2_trans_exit(&trans); \
+ _ret; \
+})
+
#endif /* _BCACHEFS_BTREE_UPDATE_H */
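/*
 * A minimal sketch of a multi-key atomic update with the interface above:
 * updates are queued with bch2_trans_update() and written by a single
 * bch2_trans_commit() (bch2_dirent_rename() below queues the src and dst
 * dirents this way).  The iterators and keys are assumed to have been set
 * up by the caller within the same transaction.
 */
static int example_commit_two_keys(struct btree_trans *trans,
				   struct btree_iter *iter1, struct bkey_i *k1,
				   struct btree_iter *iter2, struct bkey_i *k2,
				   u64 *journal_seq)
{
	bch2_trans_update(trans, iter1, k1, 0);
	bch2_trans_update(trans, iter2, k2, 0);

	return bch2_trans_commit(trans, NULL, NULL, journal_seq,
				 BTREE_INSERT_ATOMIC);
}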
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 588a199..a481b0d 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -309,8 +309,10 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
unsigned u64s;
int ret;
- trans_for_each_entry(trans, i)
+ trans_for_each_entry(trans, i) {
BUG_ON(i->done);
+ BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
+ }
u64s = 0;
trans_for_each_entry(trans, i)
@@ -330,6 +332,7 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
if (race_fault()) {
ret = -EINTR;
+ trans_restart(" (race)");
goto out;
}
@@ -354,10 +357,14 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
}
}
- if (journal_seq_verify(c) &&
- !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
- trans_for_each_entry(trans, i)
- i->k->k.version.lo = trans->journal_res.seq;
+ if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
+ if (journal_seq_verify(c))
+ trans_for_each_entry(trans, i)
+ i->k->k.version.lo = trans->journal_res.seq;
+ else if (inject_invalid_keys(c))
+ trans_for_each_entry(trans, i)
+ i->k->k.version = MAX_VERSION;
+ }
trans_for_each_entry(trans, i) {
switch (btree_insert_key_leaf(trans, i)) {
@@ -398,6 +405,17 @@ out:
return ret;
}
+static inline void btree_insert_entry_checks(struct bch_fs *c,
+ struct btree_insert_entry *i)
+{
+ BUG_ON(i->iter->level);
+ BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
+ BUG_ON(debug_check_bkeys(c) &&
+ !bkey_deleted(&i->k->k) &&
+ bch2_bkey_invalid(c, i->iter->btree_id,
+ bkey_i_to_s_c(i->k)));
+}
+
/**
* __bch_btree_insert_at - insert keys at given iterator positions
*
@@ -418,20 +436,16 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
unsigned flags;
int ret;
+ BUG_ON(!trans->nr);
+
for_each_btree_iter(trans->entries[0].iter, linked)
bch2_btree_iter_verify_locks(linked);
/* for the sake of sanity: */
BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
- trans_for_each_entry(trans, i) {
- BUG_ON(i->iter->level);
- BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
- BUG_ON(debug_check_bkeys(c) &&
- !bkey_deleted(&i->k->k) &&
- bch2_bkey_invalid(c, i->iter->btree_id,
- bkey_i_to_s_c(i->k)));
- }
+ trans_for_each_entry(trans, i)
+ btree_insert_entry_checks(c, i);
bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
@@ -442,7 +456,12 @@ retry:
cycle_gc_lock = false;
trans_for_each_entry(trans, i) {
+ unsigned old_locks_want = i->iter->locks_want;
+ unsigned old_uptodate = i->iter->uptodate;
+
if (!bch2_btree_iter_upgrade(i->iter, 1, true)) {
+ trans_restart(" (failed upgrade, locks_want %u uptodate %u)",
+ old_locks_want, old_uptodate);
ret = -EINTR;
goto err;
}
@@ -515,8 +534,10 @@ err:
* don't care if we got ENOSPC because we told split it
* couldn't block:
*/
- if (!ret || (flags & BTREE_INSERT_NOUNLOCK))
+ if (!ret || (flags & BTREE_INSERT_NOUNLOCK)) {
+ trans_restart(" (split)");
ret = -EINTR;
+ }
}
if (cycle_gc_lock) {
@@ -531,13 +552,16 @@ err:
}
if (ret == -EINTR) {
- if (flags & BTREE_INSERT_NOUNLOCK)
+ if (flags & BTREE_INSERT_NOUNLOCK) {
+ trans_restart(" (can't unlock)");
goto out;
+ }
trans_for_each_entry(trans, i) {
int ret2 = bch2_btree_iter_traverse(i->iter);
if (ret2) {
ret = ret2;
+ trans_restart(" (traverse)");
goto out;
}
@@ -550,11 +574,56 @@ err:
*/
if (!(flags & BTREE_INSERT_ATOMIC))
goto retry;
+
+ trans_restart(" (atomic)");
}
goto out;
}
+void bch2_trans_update(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *k,
+ unsigned extra_journal_res)
+{
+ struct btree_insert_entry *i;
+
+ BUG_ON(trans->nr_updates >= ARRAY_SIZE(trans->updates));
+
+ i = &trans->updates[trans->nr_updates++];
+
+ *i = (struct btree_insert_entry) {
+ .iter = iter,
+ .k = k,
+ .extra_res = extra_journal_res,
+ };
+
+ btree_insert_entry_checks(trans->c, i);
+}
+
+int bch2_trans_commit(struct btree_trans *trans,
+ struct disk_reservation *disk_res,
+ struct extent_insert_hook *hook,
+ u64 *journal_seq,
+ unsigned flags)
+{
+ struct btree_insert insert = {
+ .c = trans->c,
+ .disk_res = disk_res,
+ .journal_seq = journal_seq,
+ .flags = flags,
+ .nr = trans->nr_updates,
+ .entries = trans->updates,
+ };
+
+ if (!trans->nr_updates)
+ return 0;
+
+ trans->nr_updates = 0;
+
+ return __bch2_btree_insert_at(&insert);
+}
+
int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags)
{
struct bkey_i k;
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index d3dd3eb..d979ae0 100644
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -141,8 +141,8 @@ void bch2_dirent_to_text(struct bch_fs *c, char *buf,
}
}
-static struct bkey_i_dirent *dirent_create_key(u8 type,
- const struct qstr *name, u64 dst)
+static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
+ u8 type, const struct qstr *name, u64 dst)
{
struct bkey_i_dirent *dirent;
unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
@@ -152,9 +152,9 @@ static struct bkey_i_dirent *dirent_create_key(u8 type,
BUG_ON(u64s > U8_MAX);
- dirent = kmalloc(u64s * sizeof(u64), GFP_NOFS);
- if (!dirent)
- return ERR_PTR(-ENOMEM);
+ dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+ if (IS_ERR(dirent))
+ return dirent;
bkey_dirent_init(&dirent->k_i);
dirent->k.u64s = u64s;
@@ -172,23 +172,31 @@ static struct bkey_i_dirent *dirent_create_key(u8 type,
return dirent;
}
-int bch2_dirent_create(struct bch_fs *c, u64 dir_inum,
- const struct bch_hash_info *hash_info,
- u8 type, const struct qstr *name, u64 dst_inum,
- u64 *journal_seq, int flags)
+int __bch2_dirent_create(struct btree_trans *trans,
+ u64 dir_inum, const struct bch_hash_info *hash_info,
+ u8 type, const struct qstr *name, u64 dst_inum,
+ int flags)
{
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(type, name, dst_inum);
- if (IS_ERR(dirent))
- return PTR_ERR(dirent);
+ dirent = dirent_create_key(trans, type, name, dst_inum);
+ ret = PTR_ERR_OR_ZERO(dirent);
+ if (ret)
+ return ret;
- ret = bch2_hash_set(bch2_dirent_hash_desc, hash_info, c, dir_inum,
- journal_seq, &dirent->k_i, flags);
- kfree(dirent);
+ return __bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
+ dir_inum, &dirent->k_i, flags);
+}
- return ret;
+int bch2_dirent_create(struct bch_fs *c, u64 dir_inum,
+ const struct bch_hash_info *hash_info,
+ u8 type, const struct qstr *name, u64 dst_inum,
+ u64 *journal_seq, int flags)
+{
+ return bch2_trans_do(c, journal_seq, flags,
+ __bch2_dirent_create(&trans, dir_inum, hash_info,
+ type, name, dst_inum, flags));
}
static void dirent_copy_target(struct bkey_i_dirent *dst,
@@ -204,151 +212,117 @@ static struct bpos bch2_dirent_pos(struct bch_inode_info *inode,
return POS(inode->v.i_ino, bch2_dirent_hash(&inode->ei_str_hash, name));
}
-int bch2_dirent_rename(struct bch_fs *c,
+int bch2_dirent_rename(struct btree_trans *trans,
struct bch_inode_info *src_dir, const struct qstr *src_name,
struct bch_inode_info *dst_dir, const struct qstr *dst_name,
- u64 *journal_seq, enum bch_rename_mode mode)
+ enum bch_rename_mode mode)
{
- struct btree_iter src_iter, dst_iter, whiteout_iter;
+ struct btree_iter *src_iter, *dst_iter;
struct bkey_s_c old_src, old_dst;
- struct bkey delete;
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
- struct bpos src_pos = bch2_dirent_pos(src_dir, src_name);
struct bpos dst_pos = bch2_dirent_pos(dst_dir, dst_name);
- bool need_whiteout;
int ret;
- bch2_btree_iter_init(&src_iter, c, BTREE_ID_DIRENTS, src_pos,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- bch2_btree_iter_init(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- bch2_btree_iter_link(&src_iter, &dst_iter);
-
- bch2_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos,
- BTREE_ITER_SLOTS);
- bch2_btree_iter_link(&src_iter, &whiteout_iter);
-
- if (mode == BCH_RENAME_EXCHANGE) {
- new_src = dirent_create_key(0, src_name, 0);
- if (IS_ERR(new_src)) {
- ret = PTR_ERR(new_src);
- goto err;
- }
- } else {
- new_src = (void *) &delete;
- }
-
- new_dst = dirent_create_key(0, dst_name, 0);
- if (IS_ERR(new_dst)) {
- ret = PTR_ERR(new_dst);
- goto err;
- }
-retry:
- /*
- * Note that on -EINTR/dropped locks we're not restarting the lookup
- * from the original hashed position (like we do when creating dirents,
- * in bch_hash_set) - we never move existing dirents to different slot:
- */
- old_src = bch2_hash_lookup_at(bch2_dirent_hash_desc,
- &src_dir->ei_str_hash,
- &src_iter, src_name);
- if ((ret = btree_iter_err(old_src)))
- goto err;
-
- ret = bch2_hash_needs_whiteout(bch2_dirent_hash_desc,
- &src_dir->ei_str_hash,
- &whiteout_iter, &src_iter);
- if (ret < 0)
- goto err;
- need_whiteout = ret;
-
/*
+ * Lookup dst:
+ *
* Note that in BCH_RENAME mode, we're _not_ checking if
* the target already exists - we're relying on the VFS
* to do that check for us for correctness:
*/
- old_dst = mode == BCH_RENAME
- ? bch2_hash_hole_at(bch2_dirent_hash_desc, &dst_iter)
- : bch2_hash_lookup_at(bch2_dirent_hash_desc,
- &dst_dir->ei_str_hash,
- &dst_iter, dst_name);
- if ((ret = btree_iter_err(old_dst)))
- goto err;
-
- switch (mode) {
- case BCH_RENAME:
- bkey_init(&new_src->k);
- dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
-
- if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
- bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
- /*
- * If we couldn't insert new_dst at its hashed
- * position (dst_pos) due to a hash collision,
- * and we're going to be deleting in
- * between the hashed position and first empty
- * slot we found - just overwrite the pos we
- * were going to delete:
- *
- * Note: this is a correctness issue, in this
- * situation bch2_hash_needs_whiteout() could
- * return false when the whiteout would have
- * been needed if we inserted at the pos
- * __dirent_find_hole() found
- */
- new_dst->k.p = src_iter.pos;
- ret = bch2_btree_insert_at(c, NULL, NULL,
- journal_seq,
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(&src_iter,
- &new_dst->k_i));
- goto err;
- }
+ dst_iter = mode == BCH_RENAME
+ ? bch2_hash_hole(trans, bch2_dirent_hash_desc,
+ &dst_dir->ei_str_hash,
+ dst_dir->v.i_ino, dst_name)
+ : bch2_hash_lookup(trans, bch2_dirent_hash_desc,
+ &dst_dir->ei_str_hash,
+ dst_dir->v.i_ino, dst_name,
+ BTREE_ITER_INTENT);
+ if (IS_ERR(dst_iter))
+ return PTR_ERR(dst_iter);
+ old_dst = bch2_btree_iter_peek_slot(dst_iter);
+
+ /* Lookup src: */
+ src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc,
+ &src_dir->ei_str_hash,
+ src_dir->v.i_ino, src_name,
+ BTREE_ITER_INTENT);
+ if (IS_ERR(src_iter))
+ return PTR_ERR(src_iter);
+ old_src = bch2_btree_iter_peek_slot(src_iter);
+
+ /* Create new dst key: */
+ new_dst = dirent_create_key(trans, 0, dst_name, 0);
+ if (IS_ERR(new_dst))
+ return PTR_ERR(new_dst);
+
+ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
+ new_dst->k.p = dst_iter->pos;
+
+ /* Create new src key: */
+ if (mode == BCH_RENAME_EXCHANGE) {
+ new_src = dirent_create_key(trans, 0, src_name, 0);
+ if (IS_ERR(new_src))
+ return PTR_ERR(new_src);
- if (need_whiteout)
- new_src->k.type = BCH_DIRENT_WHITEOUT;
- break;
- case BCH_RENAME_OVERWRITE:
+ dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
+ new_src->k.p = src_iter->pos;
+ } else {
+ new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+ if (IS_ERR(new_src))
+ return PTR_ERR(new_src);
bkey_init(&new_src->k);
- dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
+ new_src->k.p = src_iter->pos;
- if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
- bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
+ if (bkey_cmp(dst_pos, src_iter->pos) <= 0 &&
+ bkey_cmp(src_iter->pos, dst_iter->pos) < 0) {
/*
- * Same case described above -
- * bch_hash_needs_whiteout could spuriously
- * return false, but we have to insert at
- * dst_iter.pos because we're overwriting
- * another dirent:
+ * We have a hash collision for the new dst key,
+ * and new_src - the key we're deleting - is between
+ * new_dst's hashed slot and the slot we're going to be
+ * inserting it into - oops. This will break the hash
+ * table if we don't deal with it:
*/
- new_src->k.type = BCH_DIRENT_WHITEOUT;
- } else if (need_whiteout)
- new_src->k.type = BCH_DIRENT_WHITEOUT;
- break;
- case BCH_RENAME_EXCHANGE:
- dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
- dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
- break;
+ if (mode == BCH_RENAME) {
+ /*
+ * If we're not overwriting, we can just insert
+ * new_dst at the src position:
+ */
+ new_dst->k.p = src_iter->pos;
+ bch2_trans_update(trans, src_iter, &new_dst->k_i, 0);
+ return 0;
+ } else {
+ /* If we're overwriting, we can't insert new_dst
+ * at a different slot because it has to
+ * overwrite old_dst - just make sure to use a
+ * whiteout when deleting src:
+ */
+ new_src->k.type = BCH_DIRENT_WHITEOUT;
+ }
+ } else {
+ /* Check if we need a whiteout to delete src: */
+ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
+ &src_dir->ei_str_hash,
+ src_iter);
+ if (ret < 0)
+ return ret;
+
+ if (ret)
+ new_src->k.type = BCH_DIRENT_WHITEOUT;
+ }
}
- new_src->k.p = src_iter.pos;
- new_dst->k.p = dst_iter.pos;
- ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(&src_iter, &new_src->k_i),
- BTREE_INSERT_ENTRY(&dst_iter, &new_dst->k_i));
-err:
- if (ret == -EINTR)
- goto retry;
-
- bch2_btree_iter_unlock(&whiteout_iter);
- bch2_btree_iter_unlock(&dst_iter);
- bch2_btree_iter_unlock(&src_iter);
-
- if (new_src != (void *) &delete)
- kfree(new_src);
- kfree(new_dst);
- return ret;
+ bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
+ bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
+ return 0;
+}
+
+int __bch2_dirent_delete(struct btree_trans *trans, u64 dir_inum,
+ const struct bch_hash_info *hash_info,
+ const struct qstr *name)
+{
+ return bch2_hash_delete(trans, bch2_dirent_hash_desc, hash_info,
+ dir_inum, name);
}
int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum,
@@ -356,28 +330,34 @@ int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum,
const struct qstr *name,
u64 *journal_seq)
{
- return bch2_hash_delete(bch2_dirent_hash_desc, hash_info,
- c, dir_inum, journal_seq, name);
+ return bch2_trans_do(c, journal_seq,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL,
+ __bch2_dirent_delete(&trans, dir_inum, hash_info, name));
}
u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
const struct bch_hash_info *hash_info,
const struct qstr *name)
{
- struct btree_iter iter;
+ struct btree_trans trans;
+ struct btree_iter *iter;
struct bkey_s_c k;
- u64 inum;
+ u64 inum = 0;
- k = bch2_hash_lookup(bch2_dirent_hash_desc, hash_info, c,
- dir_inum, &iter, name);
- if (IS_ERR(k.k)) {
- bch2_btree_iter_unlock(&iter);
- return 0;
+ bch2_trans_init(&trans, c);
+
+ iter = bch2_hash_lookup(&trans, bch2_dirent_hash_desc,
+ hash_info, dir_inum, name, 0);
+ if (IS_ERR(iter)) {
+ BUG_ON(PTR_ERR(iter) == -EINTR);
+ goto out;
}
+ k = bch2_btree_iter_peek_slot(iter);
inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
- bch2_btree_iter_unlock(&iter);
-
+out:
+ bch2_trans_exit(&trans);
return inum;
}
diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h
index 5d066af..4d92ffb 100644
--- a/libbcachefs/dirent.h
+++ b/libbcachefs/dirent.h
@@ -21,8 +21,16 @@ struct bch_hash_info;
struct bch_inode_info;
unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent);
+
+int __bch2_dirent_create(struct btree_trans *, u64,
+ const struct bch_hash_info *, u8,
+ const struct qstr *, u64, int);
int bch2_dirent_create(struct bch_fs *c, u64, const struct bch_hash_info *,
u8, const struct qstr *, u64, u64 *, int);
+
+int __bch2_dirent_delete(struct btree_trans *, u64,
+ const struct bch_hash_info *,
+ const struct qstr *);
int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *,
const struct qstr *, u64 *);
@@ -32,10 +40,10 @@ enum bch_rename_mode {
BCH_RENAME_EXCHANGE,
};
-int bch2_dirent_rename(struct bch_fs *,
+int bch2_dirent_rename(struct btree_trans *,
struct bch_inode_info *, const struct qstr *,
struct bch_inode_info *, const struct qstr *,
- u64 *, enum bch_rename_mode);
+ enum bch_rename_mode);
u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
const struct qstr *);
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index 2a357fc..9505b6e 100644
--- a/libbcachefs/error.c
+++ b/libbcachefs/error.c
@@ -131,8 +131,9 @@ print:
mutex_unlock(&c->fsck_error_lock);
- if (fix)
- set_bit(BCH_FS_FSCK_FIXED_ERRORS, &c->flags);
+ set_bit(fix
+ ? BCH_FS_FSCK_FIXED_ERRORS
+ : BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags);
return fix ? FSCK_ERR_FIX
: flags & FSCK_CAN_IGNORE ? FSCK_ERR_IGNORE
diff --git a/libbcachefs/error.h b/libbcachefs/error.h
index f65ef13..588e763 100644
--- a/libbcachefs/error.h
+++ b/libbcachefs/error.h
@@ -147,12 +147,18 @@ void bch2_flush_fsck_errs(struct bch_fs *);
#define need_fsck_err_on(cond, c, ...) \
__fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
+#define need_fsck_err(c, ...) \
+ __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
+
#define mustfix_fsck_err(c, ...) \
__fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__)
#define mustfix_fsck_err_on(cond, c, ...) \
__fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__)
+#define fsck_err(c, ...) \
+ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
+
#define fsck_err_on(cond, c, ...) \
__fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 9e78798..e4d2b39 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -193,7 +193,7 @@ static int __must_check bch2_write_inode_size(struct bch_fs *c,
struct bch_inode_info *inode,
loff_t new_size)
{
- return __bch2_write_inode(c, inode, inode_set_size, &new_size);
+ return __bch2_write_inode(c, inode, inode_set_size, &new_size, 0);
}
static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
@@ -259,7 +259,7 @@ static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h)
mutex_lock(&h->inode->ei_update_lock);
i_sectors_acct(c, h->inode, &h->quota_res, h->sectors);
- ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h);
+ ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h, 0);
if (!ret && h->new_i_size != U64_MAX)
i_size_write(&h->inode->v, h->new_i_size);
@@ -289,7 +289,7 @@ static int i_sectors_dirty_start(struct bch_fs *c, struct i_sectors_hook *h)
int ret;
mutex_lock(&h->inode->ei_update_lock);
- ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h);
+ ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h, 0);
mutex_unlock(&h->inode->ei_update_lock);
return ret;
@@ -390,7 +390,8 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
struct bchfs_write_op *op = container_of(wop,
struct bchfs_write_op, op);
struct keylist *keys = &op->op.insert_keys;
- struct btree_iter extent_iter, inode_iter;
+ struct btree_trans trans;
+ struct btree_iter *extent_iter, *inode_iter = NULL;
struct bchfs_extent_trans_hook hook;
struct bkey_i *k = bch2_keylist_front(keys);
s64 orig_sectors_added = op->sectors_added;
@@ -398,12 +399,13 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
BUG_ON(k->k.p.inode != op->inode->v.i_ino);
- bch2_btree_iter_init(&extent_iter, wop->c, BTREE_ID_EXTENTS,
- bkey_start_pos(&bch2_keylist_front(keys)->k),
- BTREE_ITER_INTENT);
- bch2_btree_iter_init(&inode_iter, wop->c, BTREE_ID_INODES,
- POS(extent_iter.pos.inode, 0),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ bch2_trans_init(&trans, wop->c);
+
+ extent_iter = bch2_trans_get_iter(&trans,
+ BTREE_ID_EXTENTS,
+ bkey_start_pos(&bch2_keylist_front(keys)->k),
+ BTREE_ITER_INTENT);
+ BUG_ON(IS_ERR(extent_iter));
hook.op = op;
hook.hook.fn = bchfs_extent_update_hook;
@@ -416,19 +418,29 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
op->inode->ei_inode.bi_size)
hook.need_inode_update = true;
+ /* optimization for fewer transaction restarts: */
+ ret = bch2_btree_iter_traverse(extent_iter);
+ if (ret)
+ goto err;
+
if (hook.need_inode_update) {
struct bkey_s_c inode;
- if (!btree_iter_linked(&inode_iter))
- bch2_btree_iter_link(&extent_iter, &inode_iter);
+ if (!inode_iter) {
+ inode_iter = bch2_trans_get_iter(&trans,
+ BTREE_ID_INODES,
+ POS(extent_iter->pos.inode, 0),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BUG_ON(IS_ERR(inode_iter));
+ }
- inode = bch2_btree_iter_peek_slot(&inode_iter);
+ inode = bch2_btree_iter_peek_slot(inode_iter);
if ((ret = btree_iter_err(inode)))
goto err;
if (WARN_ONCE(inode.k->type != BCH_INODE_FS,
"inode %llu not found when updating",
- extent_iter.pos.inode)) {
+ extent_iter->pos.inode)) {
ret = -ENOENT;
break;
}
@@ -436,7 +448,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
if (WARN_ONCE(bkey_bytes(inode.k) >
sizeof(hook.inode_p),
"inode %llu too big (%zu bytes, buf %zu)",
- extent_iter.pos.inode,
+ extent_iter->pos.inode,
bkey_bytes(inode.k),
sizeof(hook.inode_p))) {
ret = -ENOENT;
@@ -448,7 +460,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
&hook.inode_u);
if (WARN_ONCE(ret,
"error %i unpacking inode %llu",
- ret, extent_iter.pos.inode)) {
+ ret, extent_iter->pos.inode)) {
ret = -ENOENT;
break;
}
@@ -458,8 +470,8 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
BTREE_INSERT_NOFAIL|
BTREE_INSERT_ATOMIC|
BTREE_INSERT_USE_RESERVE,
- BTREE_INSERT_ENTRY(&extent_iter, k),
- BTREE_INSERT_ENTRY_EXTRA_RES(&inode_iter,
+ BTREE_INSERT_ENTRY(extent_iter, k),
+ BTREE_INSERT_ENTRY_EXTRA_RES(inode_iter,
&hook.inode_p.inode.k_i, 2));
} else {
ret = bch2_btree_insert_at(wop->c, &wop->res,
@@ -467,10 +479,10 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
BTREE_INSERT_NOFAIL|
BTREE_INSERT_ATOMIC|
BTREE_INSERT_USE_RESERVE,
- BTREE_INSERT_ENTRY(&extent_iter, k));
+ BTREE_INSERT_ENTRY(extent_iter, k));
}
- BUG_ON(bkey_cmp(extent_iter.pos, bkey_start_pos(&k->k)));
+ BUG_ON(bkey_cmp(extent_iter->pos, bkey_start_pos(&k->k)));
if (WARN_ONCE(!ret != !k->k.size,
"ret %i k->size %u", ret, k->k.size))
@@ -481,12 +493,11 @@ err:
if (ret)
break;
- BUG_ON(bkey_cmp(extent_iter.pos, k->k.p) < 0);
+ BUG_ON(bkey_cmp(extent_iter->pos, k->k.p) < 0);
bch2_keylist_pop_front(keys);
} while (!bch2_keylist_empty(keys));
- bch2_btree_iter_unlock(&extent_iter);
- bch2_btree_iter_unlock(&inode_iter);
+ bch2_trans_exit(&trans);
if (op->is_dio) {
struct dio_write *dio = container_of(op, struct dio_write, iop);
@@ -2338,8 +2349,8 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
- struct btree_iter src;
- struct btree_iter dst;
+ struct btree_trans trans;
+ struct btree_iter *src, *dst;
BKEY_PADDED(k) copy;
struct bkey_s_c k;
struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0);
@@ -2349,13 +2360,17 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
if ((offset | len) & (block_bytes(c) - 1))
return -EINVAL;
- bch2_btree_iter_init(&dst, c, BTREE_ID_EXTENTS,
+ bch2_trans_init(&trans, c);
+
+ dst = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
POS(inode->v.i_ino, offset >> 9),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BUG_ON(IS_ERR(dst));
+
/* position will be set from dst iter's position: */
- bch2_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN,
+ src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
BTREE_ITER_SLOTS);
- bch2_btree_iter_link(&src, &dst);
+ BUG_ON(IS_ERR(src));
/*
* We need i_mutex to keep the page cache consistent with the extents
@@ -2384,24 +2399,24 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
if (ret)
goto err;
- while (bkey_cmp(dst.pos,
+ while (bkey_cmp(dst->pos,
POS(inode->v.i_ino,
round_up(new_size, PAGE_SIZE) >> 9)) < 0) {
struct disk_reservation disk_res;
- bch2_btree_iter_set_pos(&src,
- POS(dst.pos.inode, dst.pos.offset + (len >> 9)));
+ bch2_btree_iter_set_pos(src,
+ POS(dst->pos.inode, dst->pos.offset + (len >> 9)));
- k = bch2_btree_iter_peek_slot(&src);
+ k = bch2_btree_iter_peek_slot(src);
if ((ret = btree_iter_err(k)))
goto btree_iter_err;
bkey_reassemble(&copy.k, k);
- bch2_cut_front(src.pos, &copy.k);
+ bch2_cut_front(src->pos, &copy.k);
copy.k.k.p.offset -= len >> 9;
- BUG_ON(bkey_cmp(dst.pos, bkey_start_pos(&copy.k.k)));
+ BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(&copy.k.k)));
ret = bch2_disk_reservation_get(c, &disk_res, copy.k.k.size,
bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&copy.k)),
@@ -2412,14 +2427,13 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
&inode->ei_journal_seq,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(&dst, &copy.k));
+ BTREE_INSERT_ENTRY(dst, &copy.k));
bch2_disk_reservation_put(c, &disk_res);
btree_iter_err:
if (ret == -EINTR)
ret = 0;
if (ret) {
- bch2_btree_iter_unlock(&src);
- bch2_btree_iter_unlock(&dst);
+ bch2_trans_exit(&trans);
goto err_put_sectors_dirty;
}
/*
@@ -2427,11 +2441,10 @@ btree_iter_err:
* pointers... which isn't a _super_ serious problem...
*/
- bch2_btree_iter_cond_resched(&src);
+ bch2_btree_iter_cond_resched(src);
}
- bch2_btree_iter_unlock(&src);
- bch2_btree_iter_unlock(&dst);
+ bch2_trans_exit(&trans);
ret = bch2_inode_truncate(c, inode->v.i_ino,
round_up(new_size, block_bytes(c)) >> 9,
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index 2c1ecf7..336dbd4 100644
--- a/libbcachefs/fs-ioctl.c
+++ b/libbcachefs/fs-ioctl.c
@@ -87,6 +87,8 @@ void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
struct flags_set {
unsigned mask;
unsigned flags;
+
+ unsigned projid;
};
static int bch2_inode_flags_set(struct bch_inode_info *inode,
@@ -150,7 +152,7 @@ static int bch2_ioc_setflags(struct bch_fs *c,
}
mutex_lock(&inode->ei_update_lock);
- ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s);
+ ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s, 0);
if (!ret)
bch2_inode_flags_to_vfs(inode);
@@ -185,9 +187,9 @@ static int bch2_set_projid(struct bch_fs *c,
qid.q[QTYP_PRJ] = projid;
- ret = bch2_quota_transfer(c, 1 << QTYP_PRJ, qid, inode->ei_qid,
- inode->v.i_blocks +
- inode->ei_quota_reserved);
+ return bch2_quota_transfer(c, 1 << QTYP_PRJ, qid, inode->ei_qid,
+ inode->v.i_blocks +
+ inode->ei_quota_reserved);
if (ret)
return ret;
@@ -195,6 +197,17 @@ static int bch2_set_projid(struct bch_fs *c,
return 0;
}
+static int fssetxattr_inode_update_fn(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct flags_set *s = p;
+
+ bi->bi_project = s->projid;
+
+ return bch2_inode_flags_set(inode, bi, p);
+}
+
static int bch2_ioc_fssetxattr(struct bch_fs *c,
struct file *file,
struct bch_inode_info *inode,
@@ -211,6 +224,8 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
if (fa.fsx_xflags)
return -EOPNOTSUPP;
+ s.projid = fa.fsx_projid;
+
ret = mnt_want_write_file(file);
if (ret)
return ret;
@@ -226,7 +241,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
if (ret)
goto err_unlock;
- ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s);
+ ret = __bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, 0);
if (!ret)
bch2_inode_flags_to_vfs(inode);
err_unlock:
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 3b7f78e..c51a65d 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -34,6 +34,19 @@ static void bch2_vfs_inode_init(struct bch_fs *,
struct bch_inode_info *,
struct bch_inode_unpacked *);
+static void journal_seq_copy(struct bch_inode_info *dst,
+ u64 journal_seq)
+{
+ u64 old, v = READ_ONCE(dst->ei_journal_seq);
+
+ do {
+ old = v;
+
+ if (old >= journal_seq)
+ break;
+ } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old);
+}
+
/*
* I_SIZE_DIRTY requires special handling:
*
@@ -62,127 +75,113 @@ static void bch2_vfs_inode_init(struct bch_fs *,
* be set explicitly.
*/
-int __must_check __bch2_write_inode(struct bch_fs *c,
- struct bch_inode_info *inode,
- inode_set_fn set,
- void *p)
+void bch2_inode_update_after_write(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ unsigned fields)
{
- struct btree_iter iter;
- struct bch_inode_unpacked inode_u;
- struct bkey_inode_buf inode_p;
+ set_nlink(&inode->v, bi->bi_flags & BCH_INODE_UNLINKED
+ ? 0
+ : bi->bi_nlink + nlink_bias(inode->v.i_mode));
+ i_uid_write(&inode->v, bi->bi_uid);
+ i_gid_write(&inode->v, bi->bi_gid);
+ inode->v.i_mode = bi->bi_mode;
+
+ if (fields & ATTR_ATIME)
+ inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime);
+ if (fields & ATTR_MTIME)
+ inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime);
+ if (fields & ATTR_CTIME)
+ inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
+
+ inode->ei_inode = *bi;
+ inode->ei_qid = bch_qid(bi);
+}
+
+int __must_check bch2_write_inode_trans(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *inode_u,
+ inode_set_fn set,
+ void *p)
+{
+ struct btree_iter *iter;
+ struct bkey_inode_buf *inode_p;
+ struct bkey_s_c k;
u64 inum = inode->v.i_ino;
- unsigned i_nlink = READ_ONCE(inode->v.i_nlink);
int ret;
- /*
- * We can't write an inode with i_nlink == 0 because it's stored biased;
- * however, we don't need to because if i_nlink is 0 the inode is
- * getting deleted when it's evicted.
- */
- if (!i_nlink)
- return 0;
-
lockdep_assert_held(&inode->ei_update_lock);
- bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter);
- do {
- struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(iter);
+ if ((ret = btree_iter_err(k)))
+ return ret;
- if ((ret = btree_iter_err(k)))
- goto out;
+ if (WARN_ONCE(k.k->type != BCH_INODE_FS,
+ "inode %llu not found when updating", inum))
+ return -ENOENT;
- if (WARN_ONCE(k.k->type != BCH_INODE_FS,
- "inode %llu not found when updating", inum)) {
- bch2_btree_iter_unlock(&iter);
- return -ENOENT;
- }
+ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode_u);
+ if (WARN_ONCE(ret,
+ "error %i unpacking inode %llu", ret, inum))
+ return -ENOENT;
- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
- if (WARN_ONCE(ret,
- "error %i unpacking inode %llu", ret, inum)) {
- ret = -ENOENT;
- break;
- }
+ BUG_ON(inode_u->bi_size != inode->ei_inode.bi_size);
- BUG_ON(inode_u.bi_size != inode->ei_inode.bi_size);
+ BUG_ON(inode_u->bi_size != inode->ei_inode.bi_size &&
+ !(inode_u->bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+ inode_u->bi_size > i_size_read(&inode->v));
- if (set) {
- ret = set(inode, &inode_u, p);
- if (ret)
- goto out;
- }
-
- BUG_ON(i_nlink < nlink_bias(inode->v.i_mode));
-
- BUG_ON(inode_u.bi_size != inode->ei_inode.bi_size &&
- !(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
- inode_u.bi_size > i_size_read(&inode->v));
-
- inode_u.bi_mode = inode->v.i_mode;
- inode_u.bi_uid = i_uid_read(&inode->v);
- inode_u.bi_gid = i_gid_read(&inode->v);
- inode_u.bi_project = inode->ei_qid.q[QTYP_PRJ];
- inode_u.bi_nlink = i_nlink - nlink_bias(inode->v.i_mode);
- inode_u.bi_dev = inode->v.i_rdev;
- inode_u.bi_atime = timespec_to_bch2_time(c, inode->v.i_atime);
- inode_u.bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime);
- inode_u.bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime);
-
- bch2_inode_pack(&inode_p, &inode_u);
-
- ret = bch2_btree_insert_at(c, NULL, NULL,
- &inode->ei_journal_seq,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOUNLOCK|
- BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
- } while (ret == -EINTR);
-
- if (!ret) {
- /*
- * the btree node lock protects inode->ei_inode, not
- * ei_update_lock; this is important for inode updates via
- * bchfs_write_index_update
- */
- inode->ei_inode = inode_u;
- inode->ei_qid = bch_qid(&inode_u);
+ if (set) {
+ ret = set(inode, inode_u, p);
+ if (ret)
+ return ret;
}
-out:
- bch2_btree_iter_unlock(&iter);
- return ret < 0 ? ret : 0;
-}
+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
+ if (IS_ERR(inode_p))
+ return PTR_ERR(inode_p);
-int __must_check bch2_write_inode(struct bch_fs *c,
- struct bch_inode_info *inode)
-{
- return __bch2_write_inode(c, inode, NULL, NULL);
+ bch2_inode_pack(inode_p, inode_u);
+ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
+ return 0;
}
-static int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
+int __must_check __bch2_write_inode(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ inode_set_fn set,
+ void *p, unsigned fields)
{
+ struct btree_trans trans;
+ struct bch_inode_unpacked inode_u;
int ret;
- mutex_lock(&inode->ei_update_lock);
- inc_nlink(&inode->v);
- ret = bch2_write_inode(c, inode);
- mutex_unlock(&inode->ei_update_lock);
-
- return ret;
-}
+ bch2_trans_init(&trans, c);
+retry:
+ bch2_trans_begin(&trans);
-static int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
-{
- int ret = 0;
+ ret = bch2_write_inode_trans(&trans, inode, &inode_u, set, p) ?:
+ bch2_trans_commit(&trans, NULL, NULL,
+ &inode->ei_journal_seq,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOUNLOCK|
+ BTREE_INSERT_NOFAIL);
+ if (ret == -EINTR)
+ goto retry;
- mutex_lock(&inode->ei_update_lock);
- drop_nlink(&inode->v);
- ret = bch2_write_inode(c, inode);
- mutex_unlock(&inode->ei_update_lock);
+ /*
+ * the btree node lock protects inode->ei_inode, not ei_update_lock;
+ * this is important for inode updates via bchfs_write_index_update
+ */
+ if (!ret)
+ bch2_inode_update_after_write(c, inode, &inode_u, fields);
- return ret;
+ bch2_trans_exit(&trans);
+ return ret < 0 ? ret : 0;
}
static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
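(Editor's sketch, not part of the patch.) The rewritten __bch2_write_inode() above is the template every converted call site in this change follows: allocate a transaction, rerun the whole body when the commit returns -EINTR, and only then tear the transaction down. A minimal sketch of that pattern, with do_update() as a hypothetical stand-in for the per-call btree work:

/* do_update() is a hypothetical stand-in for the queued btree updates */
static int do_update(struct btree_trans *trans);

static int example_atomic_update(struct bch_fs *c, struct bch_inode_info *inode)
{
	struct btree_trans trans;
	int ret;

	bch2_trans_init(&trans, c);
retry:
	bch2_trans_begin(&trans);	/* reset iterators/updates left over from a failed pass */

	ret =   do_update(&trans) ?:	/* queue btree updates; may return -EINTR */
		bch2_trans_commit(&trans, NULL, NULL,
				  &inode->ei_journal_seq,
				  BTREE_INSERT_ATOMIC|
				  BTREE_INSERT_NOUNLOCK);
	if (ret == -EINTR)		/* lock restart: rerun the whole transaction */
		goto retry;

	bch2_trans_exit(&trans);
	return ret;
}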
@@ -212,125 +211,173 @@ static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
return &inode->v;
}
-static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c,
- struct bch_inode_info *dir,
- umode_t mode, dev_t rdev)
+static void bch2_inode_init_owner(struct bch_inode_unpacked *inode_u,
+ const struct inode *dir, umode_t mode)
{
- struct posix_acl *default_acl = NULL, *acl = NULL;
- struct bch_inode_info *inode;
- struct bch_inode_unpacked inode_u;
- int ret;
+ kuid_t uid = current_fsuid();
+ kgid_t gid;
+
+ if (dir && dir->i_mode & S_ISGID) {
+ gid = dir->i_gid;
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+ } else
+ gid = current_fsgid();
+
+ inode_u->bi_uid = from_kuid(dir->i_sb->s_user_ns, uid);
+ inode_u->bi_gid = from_kgid(dir->i_sb->s_user_ns, gid);
+ inode_u->bi_mode = mode;
+}
- inode = to_bch_ei(new_inode(c->vfs_sb));
- if (unlikely(!inode))
- return ERR_PTR(-ENOMEM);
+static int inode_update_for_create_fn(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_inode_unpacked *new_inode = p;
+ struct timespec now = current_time(&inode->v);
- inode_init_owner(&inode->v, &dir->v, mode);
+ bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now);
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- ret = posix_acl_create(&dir->v, &inode->v.i_mode, &default_acl, &acl);
- if (ret)
- goto err_make_bad;
-#endif
+ if (S_ISDIR(new_inode->bi_mode))
+ bi->bi_nlink++;
- bch2_inode_init(c, &inode_u,
- i_uid_read(&inode->v),
- i_gid_read(&inode->v),
- inode->v.i_mode, rdev,
- &dir->ei_inode);
+ return 0;
+}
+
+static struct bch_inode_info *
+__bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
+ umode_t mode, dev_t rdev, bool tmpfile)
+{
+ struct bch_fs *c = dir->v.i_sb->s_fs_info;
+ struct btree_trans trans;
+ struct bch_inode_unpacked dir_u;
+ struct bch_inode_info *inode, *old;
+ struct bch_inode_unpacked inode_u;
+ struct bch_hash_info hash_info;
+ struct posix_acl *default_acl = NULL, *acl = NULL;
+ int ret;
+
+ bch2_inode_init(c, &inode_u, 0, 0, 0, rdev, &dir->ei_inode);
+ bch2_inode_init_owner(&inode_u, &dir->v, mode);
inode_u.bi_project = dir->ei_qid.q[QTYP_PRJ];
+ hash_info = bch2_hash_info_init(c, &inode_u);
+
+ if (tmpfile)
+ inode_u.bi_flags |= BCH_INODE_UNLINKED;
+
ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, BCH_QUOTA_PREALLOC);
if (ret)
- goto err_make_bad;
+ return ERR_PTR(ret);
- ret = bch2_inode_create(c, &inode_u,
- BLOCKDEV_INODE_MAX, 0,
- &c->unused_inode_hint);
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+ ret = posix_acl_create(&dir->v, &inode_u.bi_mode, &default_acl, &acl);
+ if (ret)
+ goto err;
+#endif
+
+ /*
+ * preallocate vfs inode before btree transaction, so that nothing can
+ * fail after the transaction succeeds:
+ */
+ inode = to_bch_ei(new_inode(c->vfs_sb));
+ if (unlikely(!inode)) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ bch2_trans_init(&trans, c);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = __bch2_inode_create(&trans, &inode_u,
+ BLOCKDEV_INODE_MAX, 0,
+ &c->unused_inode_hint) ?:
+ (default_acl
+ ? bch2_set_acl_trans(&trans, &inode_u, &hash_info,
+ default_acl, ACL_TYPE_DEFAULT)
+ : 0) ?:
+ (acl
+ ? bch2_set_acl_trans(&trans, &inode_u, &hash_info,
+ acl, ACL_TYPE_ACCESS)
+ : 0) ?:
+ (!tmpfile
+ ? __bch2_dirent_create(&trans, dir->v.i_ino,
+ &dir->ei_str_hash,
+ mode_to_type(mode),
+ &dentry->d_name,
+ inode_u.bi_inum,
+ BCH_HASH_SET_MUST_CREATE)
+ : 0) ?:
+ (!tmpfile
+ ? bch2_write_inode_trans(&trans, dir, &dir_u,
+ inode_update_for_create_fn,
+ &inode_u)
+ : 0) ?:
+ bch2_trans_commit(&trans, NULL, NULL,
+ &inode->ei_journal_seq,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOUNLOCK);
+ if (ret == -EINTR)
+ goto retry;
if (unlikely(ret))
- goto err_acct_quota;
+ goto err_trans;
- bch2_vfs_inode_init(c, inode, &inode_u);
atomic_long_inc(&c->nr_inodes);
- if (default_acl) {
- ret = __bch2_set_acl(&inode->v, default_acl, ACL_TYPE_DEFAULT);
- if (unlikely(ret))
- goto err;
+ if (!tmpfile) {
+ bch2_inode_update_after_write(c, dir, &dir_u,
+ ATTR_MTIME|ATTR_CTIME);
+ journal_seq_copy(dir, inode->ei_journal_seq);
}
- if (acl) {
- ret = __bch2_set_acl(&inode->v, acl, ACL_TYPE_ACCESS);
- if (unlikely(ret))
- goto err;
+ bch2_vfs_inode_init(c, inode, &inode_u);
+
+ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
+ set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
+
+ /*
+ * we must insert the new inode into the inode cache before calling
+ * bch2_trans_exit() and dropping locks, else we could race with another
+ * thread pulling the inode in and modifying it:
+ */
+
+ old = to_bch_ei(insert_inode_locked2(&inode->v));
+ if (unlikely(old)) {
+ /*
+ * We raced, another process pulled the new inode into cache
+ * before us:
+ */
+ old->ei_journal_seq = inode->ei_journal_seq;
+ make_bad_inode(&inode->v);
+ iput(&inode->v);
+
+ inode = old;
+ } else {
+ /*
+ * we really don't want insert_inode_locked2() to be setting
+ * I_NEW...
+ */
+ unlock_new_inode(&inode->v);
}
- insert_inode_hash(&inode->v);
+ bch2_trans_exit(&trans);
out:
posix_acl_release(default_acl);
posix_acl_release(acl);
return inode;
-err_acct_quota:
- bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN);
-err_make_bad:
- /*
- * indicate to bch_evict_inode that the inode was never actually
- * created:
- */
+err_trans:
+ bch2_trans_exit(&trans);
make_bad_inode(&inode->v);
-err:
- clear_nlink(&inode->v);
iput(&inode->v);
+err:
+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN);
inode = ERR_PTR(ret);
goto out;
}
-static int bch2_vfs_dirent_create(struct bch_fs *c,
- struct bch_inode_info *dir,
- u8 type, const struct qstr *name,
- u64 dst)
-{
- int ret;
-
- ret = bch2_dirent_create(c, dir->v.i_ino, &dir->ei_str_hash,
- type, name, dst,
- &dir->ei_journal_seq,
- BCH_HASH_SET_MUST_CREATE);
- if (unlikely(ret))
- return ret;
-
- dir->v.i_mtime = dir->v.i_ctime = current_time(&dir->v);
- mark_inode_dirty_sync(&dir->v);
- return 0;
-}
-
-static int __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
- umode_t mode, dev_t rdev)
-{
- struct bch_fs *c = dir->v.i_sb->s_fs_info;
- struct bch_inode_info *inode;
- int ret;
-
- inode = bch2_vfs_inode_create(c, dir, mode, rdev);
- if (unlikely(IS_ERR(inode)))
- return PTR_ERR(inode);
-
- ret = bch2_vfs_dirent_create(c, dir, mode_to_type(mode),
- &dentry->d_name, inode->v.i_ino);
- if (unlikely(ret)) {
- clear_nlink(&inode->v);
- iput(&inode->v);
- return ret;
- }
-
- if (dir->ei_journal_seq > inode->ei_journal_seq)
- inode->ei_journal_seq = dir->ei_journal_seq;
-
- d_instantiate(dentry, &inode->v);
- return 0;
-}
-
/* methods */
static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
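(Editor's note, illustrative only.) __bch2_create() above, and the link/unlink/rename paths below, chain their transaction steps with GCC's binary conditional "a ?: b": the expression yields a when a is nonzero, so the chain stops at the first step that returns an error, and optional steps that are skipped contribute 0, i.e. success. The step functions here are hypothetical:

static int step_one(struct btree_trans *trans);
static int step_two(struct btree_trans *trans);

static int chain_example(struct btree_trans *trans, bool optional)
{
	return  step_one(trans) ?:			/* an error here short-circuits the rest */
		(optional ? step_two(trans) : 0) ?:	/* conditionally included step */
		bch2_trans_commit(trans, NULL, NULL, NULL,
				  BTREE_INSERT_ATOMIC);
}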
@@ -354,7 +401,70 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
static int bch2_create(struct inode *vdir, struct dentry *dentry,
umode_t mode, bool excl)
{
- return __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFREG, 0);
+ struct bch_inode_info *inode =
+ __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFREG, 0, false);
+
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ d_instantiate(dentry, &inode->v);
+ return 0;
+}
+
+static int inode_update_for_link_fn(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct timespec now = current_time(&inode->v);
+
+ bi->bi_ctime = timespec_to_bch2_time(c, now);
+
+ if (bi->bi_flags & BCH_INODE_UNLINKED)
+ bi->bi_flags &= ~BCH_INODE_UNLINKED;
+ else
+ bi->bi_nlink++;
+
+ return 0;
+}
+
+static int __bch2_link(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch_inode_info *dir,
+ struct dentry *dentry)
+{
+ struct btree_trans trans;
+ struct bch_inode_unpacked inode_u;
+ int ret;
+
+ lockdep_assert_held(&inode->v.i_rwsem);
+
+ bch2_trans_init(&trans, c);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = __bch2_dirent_create(&trans, dir->v.i_ino,
+ &dir->ei_str_hash,
+ mode_to_type(inode->v.i_mode),
+ &dentry->d_name,
+ inode->v.i_ino,
+ BCH_HASH_SET_MUST_CREATE) ?:
+ bch2_write_inode_trans(&trans, inode, &inode_u,
+ inode_update_for_link_fn,
+ NULL) ?:
+ bch2_trans_commit(&trans, NULL, NULL,
+ &inode->ei_journal_seq,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOUNLOCK);
+
+ if (ret == -EINTR)
+ goto retry;
+
+ if (likely(!ret))
+ bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
+
+ bch2_trans_exit(&trans);
+ return ret;
}
static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
@@ -365,25 +475,43 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
int ret;
- lockdep_assert_held(&inode->v.i_rwsem);
-
- inode->v.i_ctime = current_time(&dir->v);
-
- ret = bch2_inc_nlink(c, inode);
- if (ret)
+ ret = __bch2_link(c, inode, dir, dentry);
+ if (unlikely(ret))
return ret;
ihold(&inode->v);
+ d_instantiate(dentry, &inode->v);
+ return 0;
+}
- ret = bch2_vfs_dirent_create(c, dir, mode_to_type(inode->v.i_mode),
- &dentry->d_name, inode->v.i_ino);
- if (unlikely(ret)) {
- bch2_dec_nlink(c, inode);
- iput(&inode->v);
- return ret;
- }
+static int inode_update_dir_for_unlink_fn(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_inode_info *unlink_inode = p;
+ struct timespec now = current_time(&inode->v);
+
+ bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now);
+
+ bi->bi_nlink -= S_ISDIR(unlink_inode->v.i_mode);
+
+ return 0;
+}
+
+static int inode_update_for_unlink_fn(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct timespec now = current_time(&inode->v);
+
+ bi->bi_ctime = timespec_to_bch2_time(c, now);
+ if (bi->bi_nlink)
+ bi->bi_nlink--;
+ else
+ bi->bi_flags |= BCH_INODE_UNLINKED;
- d_instantiate(dentry, &inode->v);
return 0;
}
@@ -392,28 +520,44 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
struct bch_fs *c = vdir->i_sb->s_fs_info;
struct bch_inode_info *dir = to_bch_ei(vdir);
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+ struct bch_inode_unpacked dir_u, inode_u;
+ struct btree_trans trans;
int ret;
- lockdep_assert_held(&inode->v.i_rwsem);
-
- ret = bch2_dirent_delete(c, dir->v.i_ino, &dir->ei_str_hash,
- &dentry->d_name, &dir->ei_journal_seq);
+ bch2_trans_init(&trans, c);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = __bch2_dirent_delete(&trans, dir->v.i_ino,
+ &dir->ei_str_hash,
+ &dentry->d_name) ?:
+ bch2_write_inode_trans(&trans, dir, &dir_u,
+ inode_update_dir_for_unlink_fn,
+ inode) ?:
+ bch2_write_inode_trans(&trans, inode, &inode_u,
+ inode_update_for_unlink_fn,
+ NULL) ?:
+ bch2_trans_commit(&trans, NULL, NULL,
+ &dir->ei_journal_seq,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOUNLOCK|
+ BTREE_INSERT_NOFAIL);
+ if (ret == -EINTR)
+ goto retry;
if (ret)
- return ret;
+ goto err;
if (dir->ei_journal_seq > inode->ei_journal_seq)
inode->ei_journal_seq = dir->ei_journal_seq;
- inode->v.i_ctime = dir->v.i_ctime;
-
- if (S_ISDIR(inode->v.i_mode)) {
- bch2_dec_nlink(c, dir);
- drop_nlink(&inode->v);
- }
-
- bch2_dec_nlink(c, inode);
+ bch2_inode_update_after_write(c, dir, &dir_u,
+ ATTR_MTIME|ATTR_CTIME);
+ bch2_inode_update_after_write(c, inode, &inode_u,
+ ATTR_MTIME);
+err:
+ bch2_trans_exit(&trans);
- return 0;
+ return ret;
}
static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
@@ -423,7 +567,7 @@ static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
int ret;
- inode = bch2_vfs_inode_create(c, dir, S_IFLNK|S_IRWXUGO, 0);
+ inode = __bch2_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, true);
if (unlikely(IS_ERR(inode)))
return PTR_ERR(inode);
@@ -438,37 +582,28 @@ static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
if (unlikely(ret))
goto err;
- /* XXX: racy */
- if (dir->ei_journal_seq < inode->ei_journal_seq)
- dir->ei_journal_seq = inode->ei_journal_seq;
+ journal_seq_copy(dir, inode->ei_journal_seq);
- ret = bch2_vfs_dirent_create(c, dir, DT_LNK, &dentry->d_name,
- inode->v.i_ino);
+ ret = __bch2_link(c, inode, dir, dentry);
if (unlikely(ret))
goto err;
d_instantiate(dentry, &inode->v);
return 0;
err:
- clear_nlink(&inode->v);
iput(&inode->v);
return ret;
}
static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode)
{
- struct bch_fs *c = vdir->i_sb->s_fs_info;
- struct bch_inode_info *dir = to_bch_ei(vdir);
- int ret;
-
- lockdep_assert_held(&dir->v.i_rwsem);
-
- ret = __bch2_create(dir, dentry, mode|S_IFDIR, 0);
- if (unlikely(ret))
- return ret;
+ struct bch_inode_info *inode =
+ __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFDIR, 0, false);
- bch2_inc_nlink(c, dir);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ d_instantiate(dentry, &inode->v);
return 0;
}
@@ -485,151 +620,197 @@ static int bch2_rmdir(struct inode *vdir, struct dentry *dentry)
static int bch2_mknod(struct inode *vdir, struct dentry *dentry,
umode_t mode, dev_t rdev)
{
- return __bch2_create(to_bch_ei(vdir), dentry, mode, rdev);
-}
-
-static int bch2_rename(struct bch_fs *c,
- struct bch_inode_info *old_dir,
- struct dentry *old_dentry,
- struct bch_inode_info *new_dir,
- struct dentry *new_dentry)
-{
- struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
- struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
- struct timespec now = current_time(&old_dir->v);
- int ret;
-
- lockdep_assert_held(&old_dir->v.i_rwsem);
- lockdep_assert_held(&new_dir->v.i_rwsem);
+ struct bch_inode_info *inode =
+ __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false);
- if (new_inode)
- filemap_write_and_wait_range(old_inode->v.i_mapping,
- 0, LLONG_MAX);
-
- if (new_inode && S_ISDIR(old_inode->v.i_mode)) {
- lockdep_assert_held(&new_inode->v.i_rwsem);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
- if (!S_ISDIR(new_inode->v.i_mode))
- return -ENOTDIR;
+ d_instantiate(dentry, &inode->v);
+ return 0;
+}
- if (bch2_empty_dir(c, new_inode->v.i_ino))
- return -ENOTEMPTY;
+struct rename_info {
+ u64 now;
+ struct bch_inode_info *src_dir;
+ struct bch_inode_info *dst_dir;
+ struct bch_inode_info *src_inode;
+ struct bch_inode_info *dst_inode;
+ enum bch_rename_mode mode;
+};
- ret = bch2_dirent_rename(c,
- old_dir, &old_dentry->d_name,
- new_dir, &new_dentry->d_name,
- &old_inode->ei_journal_seq, BCH_RENAME_OVERWRITE);
- if (unlikely(ret))
- return ret;
+static int inode_update_for_rename_fn(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct rename_info *info = p;
- clear_nlink(&new_inode->v);
- bch2_dec_nlink(c, old_dir);
- } else if (new_inode) {
- lockdep_assert_held(&new_inode->v.i_rwsem);
+ if (inode == info->src_dir) {
+ bi->bi_nlink -= S_ISDIR(info->src_inode->v.i_mode);
+ bi->bi_nlink += info->dst_inode &&
+ S_ISDIR(info->dst_inode->v.i_mode) &&
+ info->mode == BCH_RENAME_EXCHANGE;
+ }
- ret = bch2_dirent_rename(c,
- old_dir, &old_dentry->d_name,
- new_dir, &new_dentry->d_name,
- &old_inode->ei_journal_seq, BCH_RENAME_OVERWRITE);
- if (unlikely(ret))
- return ret;
+ if (inode == info->dst_dir) {
+ bi->bi_nlink += S_ISDIR(info->src_inode->v.i_mode);
+ bi->bi_nlink -= info->dst_inode &&
+ S_ISDIR(info->dst_inode->v.i_mode);
+ }
- new_inode->v.i_ctime = now;
- bch2_dec_nlink(c, new_inode);
- } else if (S_ISDIR(old_inode->v.i_mode)) {
- ret = bch2_dirent_rename(c,
- old_dir, &old_dentry->d_name,
- new_dir, &new_dentry->d_name,
- &old_inode->ei_journal_seq, BCH_RENAME);
- if (unlikely(ret))
- return ret;
+ if (inode == info->dst_inode &&
+ info->mode == BCH_RENAME_OVERWRITE) {
+ BUG_ON(bi->bi_nlink &&
+ S_ISDIR(info->dst_inode->v.i_mode));
- bch2_inc_nlink(c, new_dir);
- bch2_dec_nlink(c, old_dir);
- } else {
- ret = bch2_dirent_rename(c,
- old_dir, &old_dentry->d_name,
- new_dir, &new_dentry->d_name,
- &old_inode->ei_journal_seq, BCH_RENAME);
- if (unlikely(ret))
- return ret;
+ if (bi->bi_nlink)
+ bi->bi_nlink--;
+ else
+ bi->bi_flags |= BCH_INODE_UNLINKED;
}
- old_dir->v.i_ctime = old_dir->v.i_mtime = now;
- new_dir->v.i_ctime = new_dir->v.i_mtime = now;
- mark_inode_dirty_sync(&old_dir->v);
- mark_inode_dirty_sync(&new_dir->v);
-
- old_inode->v.i_ctime = now;
- mark_inode_dirty_sync(&old_inode->v);
+ if (inode == info->src_dir ||
+ inode == info->dst_dir)
+ bi->bi_mtime = info->now;
+ bi->bi_ctime = info->now;
return 0;
}
-static int bch2_rename_exchange(struct bch_fs *c,
- struct bch_inode_info *old_dir,
- struct dentry *old_dentry,
- struct bch_inode_info *new_dir,
- struct dentry *new_dentry)
+static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
+ struct inode *dst_vdir, struct dentry *dst_dentry,
+ unsigned flags)
{
- struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
- struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
- struct timespec now = current_time(&old_dir->v);
+ struct bch_fs *c = src_vdir->i_sb->s_fs_info;
+ struct rename_info i = {
+ .now = timespec_to_bch2_time(c,
+ current_time(src_vdir)),
+ .src_dir = to_bch_ei(src_vdir),
+ .dst_dir = to_bch_ei(dst_vdir),
+ .src_inode = to_bch_ei(src_dentry->d_inode),
+ .dst_inode = to_bch_ei(dst_dentry->d_inode),
+ .mode = flags & RENAME_EXCHANGE
+ ? BCH_RENAME_EXCHANGE
+ : dst_dentry->d_inode
+ ? BCH_RENAME_OVERWRITE : BCH_RENAME,
+ };
+ struct btree_trans trans;
+ struct bch_inode_unpacked dst_dir_u, src_dir_u;
+ struct bch_inode_unpacked src_inode_u, dst_inode_u;
+ u64 journal_seq = 0;
int ret;
- ret = bch2_dirent_rename(c,
- old_dir, &old_dentry->d_name,
- new_dir, &new_dentry->d_name,
- &old_inode->ei_journal_seq, BCH_RENAME_EXCHANGE);
- if (unlikely(ret))
- return ret;
+ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
+ return -EINVAL;
- if (S_ISDIR(old_inode->v.i_mode) !=
- S_ISDIR(new_inode->v.i_mode)) {
- if (S_ISDIR(old_inode->v.i_mode)) {
- bch2_inc_nlink(c, new_dir);
- bch2_dec_nlink(c, old_dir);
- } else {
- bch2_dec_nlink(c, new_dir);
- bch2_inc_nlink(c, old_dir);
- }
+ if (i.mode == BCH_RENAME_OVERWRITE) {
+ if (S_ISDIR(i.src_inode->v.i_mode) !=
+ S_ISDIR(i.dst_inode->v.i_mode))
+ return -ENOTDIR;
+
+ if (S_ISDIR(i.src_inode->v.i_mode) &&
+ bch2_empty_dir(c, i.dst_inode->v.i_ino))
+ return -ENOTEMPTY;
+
+ ret = filemap_write_and_wait_range(i.src_inode->v.i_mapping,
+ 0, LLONG_MAX);
+ if (ret)
+ return ret;
}
- old_dir->v.i_ctime = old_dir->v.i_mtime = now;
- new_dir->v.i_ctime = new_dir->v.i_mtime = now;
- mark_inode_dirty_sync(&old_dir->v);
- mark_inode_dirty_sync(&new_dir->v);
+ bch2_trans_init(&trans, c);
+retry:
+ bch2_trans_begin(&trans);
+	i.now = timespec_to_bch2_time(c, current_time(src_vdir));
+
+ ret = bch2_dirent_rename(&trans,
+ i.src_dir, &src_dentry->d_name,
+ i.dst_dir, &dst_dentry->d_name,
+ i.mode) ?:
+ bch2_write_inode_trans(&trans, i.src_dir, &src_dir_u,
+ inode_update_for_rename_fn, &i) ?:
+ (i.src_dir != i.dst_dir
+ ? bch2_write_inode_trans(&trans, i.dst_dir, &dst_dir_u,
+ inode_update_for_rename_fn, &i)
+ : 0 ) ?:
+ bch2_write_inode_trans(&trans, i.src_inode, &src_inode_u,
+ inode_update_for_rename_fn, &i) ?:
+ (i.dst_inode
+ ? bch2_write_inode_trans(&trans, i.dst_inode, &dst_inode_u,
+ inode_update_for_rename_fn, &i)
+ : 0 ) ?:
+ bch2_trans_commit(&trans, NULL, NULL,
+ &journal_seq,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOUNLOCK);
+ if (ret == -EINTR)
+ goto retry;
+ if (unlikely(ret))
+ goto err;
- old_inode->v.i_ctime = now;
- new_inode->v.i_ctime = now;
- mark_inode_dirty_sync(&old_inode->v);
- mark_inode_dirty_sync(&new_inode->v);
+ bch2_inode_update_after_write(c, i.src_dir, &src_dir_u,
+ ATTR_MTIME|ATTR_CTIME);
+ journal_seq_copy(i.src_dir, journal_seq);
- return 0;
-}
+ if (i.src_dir != i.dst_dir) {
+ bch2_inode_update_after_write(c, i.dst_dir, &dst_dir_u,
+ ATTR_MTIME|ATTR_CTIME);
+ journal_seq_copy(i.dst_dir, journal_seq);
+ }
-static int bch2_rename2(struct inode *old_vdir, struct dentry *old_dentry,
- struct inode *new_vdir, struct dentry *new_dentry,
- unsigned flags)
-{
- struct bch_fs *c = old_vdir->i_sb->s_fs_info;
- struct bch_inode_info *old_dir = to_bch_ei(old_vdir);
- struct bch_inode_info *new_dir = to_bch_ei(new_vdir);
+ bch2_inode_update_after_write(c, i.src_inode, &src_inode_u,
+ ATTR_CTIME);
+ if (i.dst_inode)
+ bch2_inode_update_after_write(c, i.dst_inode, &dst_inode_u,
+ ATTR_CTIME);
+err:
+ bch2_trans_exit(&trans);
- if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
- return -EINVAL;
+ return ret;
+}
- if (flags & RENAME_EXCHANGE)
- return bch2_rename_exchange(c, old_dir, old_dentry,
- new_dir, new_dentry);
+static int inode_update_for_setattr_fn(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct iattr *attr = p;
+ unsigned int ia_valid = attr->ia_valid;
+
+ if (ia_valid & ATTR_UID)
+ bi->bi_uid = from_kuid(inode->v.i_sb->s_user_ns, attr->ia_uid);
+ if (ia_valid & ATTR_GID)
+ bi->bi_gid = from_kgid(inode->v.i_sb->s_user_ns, attr->ia_gid);
+
+ if (ia_valid & ATTR_ATIME)
+ bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
+ if (ia_valid & ATTR_MTIME)
+ bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
+ if (ia_valid & ATTR_CTIME)
+ bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
+
+ if (ia_valid & ATTR_MODE) {
+ umode_t mode = attr->ia_mode;
+ kgid_t gid = ia_valid & ATTR_GID
+ ? attr->ia_gid
+ : inode->v.i_gid;
+
+ if (!in_group_p(gid) &&
+ !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID))
+ mode &= ~S_ISGID;
+ bi->bi_mode = mode;
+ }
- return bch2_rename(c, old_dir, old_dentry, new_dir, new_dentry);
+ return 0;
}
static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iattr)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_qid qid = inode->ei_qid;
+ struct btree_trans trans;
+ struct bch_inode_unpacked inode_u;
+ struct posix_acl *acl = NULL;
unsigned qtypes = 0;
int ret;
@@ -654,18 +835,38 @@ static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iatt
inode->v.i_blocks +
inode->ei_quota_reserved);
if (ret)
- goto out_unlock;
+ goto err;
}
- setattr_copy(&inode->v, iattr);
+ bch2_trans_init(&trans, c);
+retry:
+ bch2_trans_begin(&trans);
+ kfree(acl);
+ acl = NULL;
+
+ ret = bch2_write_inode_trans(&trans, inode, &inode_u,
+ inode_update_for_setattr_fn, iattr) ?:
+ (iattr->ia_valid & ATTR_MODE
+ ? bch2_acl_chmod(&trans, inode, iattr->ia_mode, &acl)
+ : 0) ?:
+ bch2_trans_commit(&trans, NULL, NULL,
+ &inode->ei_journal_seq,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOUNLOCK|
+ BTREE_INSERT_NOFAIL);
+ if (ret == -EINTR)
+ goto retry;
+ if (unlikely(ret))
+ goto err_trans;
- ret = bch2_write_inode(c, inode);
-out_unlock:
- mutex_unlock(&inode->ei_update_lock);
+ bch2_inode_update_after_write(c, inode, &inode_u, iattr->ia_valid);
- if (!ret &&
- iattr->ia_valid & ATTR_MODE)
- ret = posix_acl_chmod(&inode->v, inode->v.i_mode);
+ if (acl)
+ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
+err_trans:
+ bch2_trans_exit(&trans);
+err:
+ mutex_unlock(&inode->ei_update_lock);
return ret;
}
@@ -723,16 +924,14 @@ static int bch2_setattr(struct dentry *dentry, struct iattr *iattr)
static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode)
{
- struct bch_fs *c = vdir->i_sb->s_fs_info;
- struct bch_inode_info *dir = to_bch_ei(vdir);
- struct bch_inode_info *inode;
+ struct bch_inode_info *inode =
+ __bch2_create(to_bch_ei(vdir), dentry, mode, 0, true);
- /* XXX: i_nlink should be 0? */
- inode = bch2_vfs_inode_create(c, dir, mode, 0);
- if (unlikely(IS_ERR(inode)))
+ if (IS_ERR(inode))
return PTR_ERR(inode);
- d_tmpfile(dentry, &inode->v);
+ d_mark_tmpfile(dentry, &inode->v);
+ d_instantiate(dentry, &inode->v);
return 0;
}
@@ -987,24 +1186,17 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi)
{
- inode->v.i_mode = bi->bi_mode;
- i_uid_write(&inode->v, bi->bi_uid);
- i_gid_write(&inode->v, bi->bi_gid);
+ bch2_inode_update_after_write(c, inode, bi, ~0);
+
inode->v.i_blocks = bi->bi_sectors;
inode->v.i_ino = bi->bi_inum;
- set_nlink(&inode->v, bi->bi_nlink + nlink_bias(inode->v.i_mode));
inode->v.i_rdev = bi->bi_dev;
inode->v.i_generation = bi->bi_generation;
inode->v.i_size = bi->bi_size;
- inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime);
- inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime);
- inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
inode->ei_journal_seq = 0;
inode->ei_quota_reserved = 0;
- inode->ei_qid = bch_qid(bi);
inode->ei_str_hash = bch2_hash_info_init(c, bi);
- inode->ei_inode = *bi;
bch2_inode_flags_to_vfs(inode);
@@ -1059,6 +1251,19 @@ static void bch2_destroy_inode(struct inode *vinode)
call_rcu(&vinode->i_rcu, bch2_i_callback);
}
+static int inode_update_times_fn(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+ bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime);
+ bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime);
+ bi->bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime);
+
+ return 0;
+}
+
static int bch2_vfs_write_inode(struct inode *vinode,
struct writeback_control *wbc)
{
@@ -1067,7 +1272,8 @@ static int bch2_vfs_write_inode(struct inode *vinode,
int ret;
mutex_lock(&inode->ei_update_lock);
- ret = bch2_write_inode(c, inode);
+ ret = __bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+ ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
if (c->opts.journal_flush_disabled)
@@ -1096,7 +1302,9 @@ static void bch2_evict_inode(struct inode *vinode)
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
BCH_QUOTA_WARN);
bch2_inode_rm(c, inode->v.i_ino);
- atomic_long_dec(&c->nr_inodes);
+
+ WARN_ONCE(atomic_long_dec_return(&c->nr_inodes) < 0,
+ "nr_inodes < 0");
}
}
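(Editor's sketch.) Two conventions the new fs.c code leans on are worth spelling out: on disk, bi_nlink stores the link count above the minimum for the file type, and a link count of zero is now represented by the BCH_INODE_UNLINKED flag rather than by deleting the inode immediately. Assuming nlink_bias() is 2 for directories and 1 otherwise, the VFS-visible count used by bch2_inode_update_after_write() is:

static unsigned vfs_nlink_from_disk(const struct bch_inode_unpacked *bi)
{
	if (bi->bi_flags & BCH_INODE_UNLINKED)
		return 0;	/* unlinked; removal is deferred to eviction or fsck */

	/* bi_nlink counts links above the type's minimum (1, or 2 for directories) */
	return bi->bi_nlink + nlink_bias(bi->bi_mode);
}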
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index fbbc7a3..e2fc270 100644
--- a/libbcachefs/fs.h
+++ b/libbcachefs/fs.h
@@ -51,8 +51,16 @@ struct bch_inode_unpacked;
typedef int (*inode_set_fn)(struct bch_inode_info *,
struct bch_inode_unpacked *, void *);
+void bch2_inode_update_after_write(struct bch_fs *,
+ struct bch_inode_info *,
+ struct bch_inode_unpacked *,
+ unsigned);
+int __must_check bch2_write_inode_trans(struct btree_trans *,
+ struct bch_inode_info *,
+ struct bch_inode_unpacked *,
+ inode_set_fn, void *);
int __must_check __bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
- inode_set_fn, void *);
+ inode_set_fn, void *, unsigned);
int __must_check bch2_write_inode(struct bch_fs *,
struct bch_inode_info *);
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index edf714f..f6035cc 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -126,16 +126,22 @@ static int walk_inode(struct bch_fs *c, struct inode_walker *w, u64 inum)
struct hash_check {
struct bch_hash_info info;
- struct btree_iter chain;
- struct btree_iter iter;
+ struct btree_trans *trans;
+
+ /* start of current chain of hash collisions: */
+ struct btree_iter *chain;
+
+ /* next offset in current chain of hash collisions: */
u64 next;
};
static void hash_check_init(const struct bch_hash_desc desc,
- struct hash_check *h, struct bch_fs *c)
+ struct btree_trans *trans,
+ struct hash_check *h)
{
- bch2_btree_iter_init(&h->chain, c, desc.btree_id, POS_MIN, 0);
- bch2_btree_iter_init(&h->iter, c, desc.btree_id, POS_MIN, 0);
+ h->trans = trans;
+ h->chain = bch2_trans_get_iter(trans, desc.btree_id, POS_MIN, 0);
+ h->next = -1;
}
static void hash_check_set_inode(struct hash_check *h, struct bch_fs *c,
@@ -173,6 +179,75 @@ err:
return ret;
}
+/* fsck hasn't been converted to new transactions yet: */
+static int fsck_hash_delete_at(const struct bch_hash_desc desc,
+ struct bch_hash_info *info,
+ struct btree_iter *orig_iter)
+{
+ struct btree_trans trans;
+ struct btree_iter *iter;
+ int ret;
+
+ bch2_btree_iter_unlock(orig_iter);
+
+ bch2_trans_init(&trans, orig_iter->c);
+retry:
+ bch2_trans_begin(&trans);
+
+ iter = bch2_trans_copy_iter(&trans, orig_iter);
+ if (IS_ERR(iter)) {
+ ret = PTR_ERR(iter);
+ goto err;
+ }
+
+ ret = bch2_hash_delete_at(&trans, desc, info, iter) ?:
+ bch2_trans_commit(&trans, NULL, NULL, NULL,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL);
+err:
+ if (ret == -EINTR)
+ goto retry;
+
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
+static int hash_check_duplicates(const struct bch_hash_desc desc,
+ struct hash_check *h, struct bch_fs *c,
+ struct btree_iter *k_iter, struct bkey_s_c k)
+{
+ struct btree_iter *iter;
+ struct bkey_s_c k2;
+ char buf[200];
+ int ret = 0;
+
+ if (!bkey_cmp(h->chain->pos, k_iter->pos))
+ return 0;
+
+ iter = bch2_trans_copy_iter(h->trans, h->chain);
+ BUG_ON(IS_ERR(iter));
+
+ for_each_btree_key_continue(iter, 0, k2) {
+ if (bkey_cmp(k2.k->p, k.k->p) >= 0)
+ break;
+
+ if (fsck_err_on(k2.k->type == desc.key_type &&
+ !desc.cmp_bkey(k, k2), c,
+ "duplicate hash table keys:\n%s",
+ (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
+ buf, sizeof(buf), k), buf))) {
+ ret = fsck_hash_delete_at(desc, &h->info, k_iter);
+ if (ret)
+ return ret;
+ ret = 1;
+ break;
+ }
+ }
+fsck_err:
+ bch2_trans_iter_free(h->trans, iter);
+ return ret;
+}
+
static int hash_check_key(const struct bch_hash_desc desc,
struct hash_check *h, struct bch_fs *c,
struct btree_iter *k_iter, struct bkey_s_c k)
@@ -185,13 +260,8 @@ static int hash_check_key(const struct bch_hash_desc desc,
k.k->type != desc.key_type)
return 0;
- if (k.k->p.offset != h->next) {
- if (!btree_iter_linked(&h->chain)) {
- bch2_btree_iter_link(k_iter, &h->chain);
- bch2_btree_iter_link(k_iter, &h->iter);
- }
- bch2_btree_iter_copy(&h->chain, k_iter);
- }
+ if (k.k->p.offset != h->next)
+ bch2_btree_iter_copy(h->chain, k_iter);
h->next = k.k->p.offset + 1;
if (k.k->type != desc.key_type)
@@ -199,11 +269,11 @@ static int hash_check_key(const struct bch_hash_desc desc,
hashed = desc.hash_bkey(&h->info, k);
- if (fsck_err_on(hashed < h->chain.pos.offset ||
+ if (fsck_err_on(hashed < h->chain->pos.offset ||
hashed > k.k->p.offset, c,
"hash table key at wrong offset: %llu, "
"hashed to %llu chain starts at %llu\n%s",
- k.k->p.offset, hashed, h->chain.pos.offset,
+ k.k->p.offset, hashed, h->chain->pos.offset,
(bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
buf, sizeof(buf), k), buf))) {
ret = hash_redo_key(desc, h, c, k_iter, k, hashed);
@@ -214,25 +284,7 @@ static int hash_check_key(const struct bch_hash_desc desc,
return 1;
}
- if (!bkey_cmp(h->chain.pos, k_iter->pos))
- return 0;
-
- bch2_btree_iter_copy(&h->iter, &h->chain);
- while (bkey_cmp(h->iter.pos, k_iter->pos) < 0) {
- struct bkey_s_c k2 = bch2_btree_iter_peek(&h->iter);
-
- if (fsck_err_on(k2.k->type == desc.key_type &&
- !desc.cmp_bkey(k, k2), c,
- "duplicate hash table keys:\n%s",
- (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
- buf, sizeof(buf), k), buf))) {
- ret = bch2_hash_delete_at(desc, &h->info, &h->iter, NULL);
- if (ret)
- return ret;
- return 1;
- }
- bch2_btree_iter_next(&h->iter);
- }
+ ret = hash_check_duplicates(desc, h, c, k_iter, k);
fsck_err:
return ret;
}
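(Editor's note, illustrative only.) The positional check in hash_check_key() above reduces to one invariant for the open-addressed hash btrees: a key may not sit before the offset it hashes to, and that hash may not precede the start of the collision chain the key was found in. Restated as a predicate:

/* the bounds hash_check_key() verifies for each dirent/xattr key */
static bool hash_offset_ok(u64 chain_start, u64 hashed, u64 key_offset)
{
	return hashed >= chain_start && hashed <= key_offset;
}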
@@ -250,6 +302,8 @@ static int check_extents(struct bch_fs *c)
u64 i_sectors;
int ret = 0;
+ bch_verbose(c, "checking extents");
+
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
POS(BCACHEFS_ROOT_INO, 0), 0, k) {
ret = walk_inode(c, &w, k.k->p.inode);
@@ -332,16 +386,25 @@ static int check_dirents(struct bch_fs *c)
{
struct inode_walker w = inode_walker_init();
struct hash_check h;
- struct btree_iter iter;
+ struct btree_trans trans;
+ struct btree_iter *iter;
struct bkey_s_c k;
unsigned name_len;
char buf[200];
int ret = 0;
- hash_check_init(bch2_dirent_hash_desc, &h, c);
+ bch_verbose(c, "checking dirents");
- for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
- POS(BCACHEFS_ROOT_INO, 0), 0, k) {
+ bch2_trans_init(&trans, c);
+
+ BUG_ON(bch2_trans_preload_iters(&trans));
+
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS,
+ POS(BCACHEFS_ROOT_INO, 0), 0);
+
+ hash_check_init(bch2_dirent_hash_desc, &trans, &h);
+
+ for_each_btree_key_continue(iter, 0, k) {
struct bkey_s_c_dirent d;
struct bch_inode_unpacked target;
bool have_target;
@@ -360,7 +423,7 @@ static int check_dirents(struct bch_fs *c)
mode_to_type(w.inode.bi_mode),
(bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
buf, sizeof(buf), k), buf))) {
- ret = bch2_btree_delete_at(&iter, 0);
+ ret = bch2_btree_delete_at(iter, 0);
if (ret)
goto err;
continue;
@@ -369,7 +432,7 @@ static int check_dirents(struct bch_fs *c)
if (w.first_this_inode && w.have_inode)
hash_check_set_inode(&h, c, &w.inode);
- ret = hash_check_key(bch2_dirent_hash_desc, &h, c, &iter, k);
+ ret = hash_check_key(bch2_dirent_hash_desc, &h, c, iter, k);
if (ret > 0) {
ret = 0;
continue;
@@ -393,7 +456,7 @@ static int check_dirents(struct bch_fs *c)
fsck_err_on(name_len == 2 &&
!memcmp(d.v->d_name, "..", 2), c,
".. dirent")) {
- ret = remove_dirent(c, &iter, d);
+ ret = remove_dirent(c, iter, d);
if (ret)
goto err;
continue;
@@ -403,7 +466,7 @@ static int check_dirents(struct bch_fs *c)
"dirent points to own directory:\n%s",
(bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
buf, sizeof(buf), k), buf))) {
- ret = remove_dirent(c, &iter, d);
+ ret = remove_dirent(c, iter, d);
if (ret)
goto err;
continue;
@@ -420,7 +483,7 @@ static int check_dirents(struct bch_fs *c)
"dirent points to missing inode:\n%s",
(bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
buf, sizeof(buf), k), buf))) {
- ret = remove_dirent(c, &iter, d);
+ ret = remove_dirent(c, iter, d);
if (ret)
goto err;
continue;
@@ -446,7 +509,7 @@ static int check_dirents(struct bch_fs *c)
ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(&iter, &n->k_i));
+ BTREE_INSERT_ENTRY(iter, &n->k_i));
kfree(n);
if (ret)
goto err;
@@ -455,9 +518,7 @@ static int check_dirents(struct bch_fs *c)
}
err:
fsck_err:
- bch2_btree_iter_unlock(&h.chain);
- bch2_btree_iter_unlock(&h.iter);
- return bch2_btree_iter_unlock(&iter) ?: ret;
+ return bch2_trans_exit(&trans) ?: ret;
}
/*
@@ -468,14 +529,23 @@ static int check_xattrs(struct bch_fs *c)
{
struct inode_walker w = inode_walker_init();
struct hash_check h;
- struct btree_iter iter;
+ struct btree_trans trans;
+ struct btree_iter *iter;
struct bkey_s_c k;
int ret = 0;
- hash_check_init(bch2_xattr_hash_desc, &h, c);
+ bch_verbose(c, "checking xattrs");
- for_each_btree_key(&iter, c, BTREE_ID_XATTRS,
- POS(BCACHEFS_ROOT_INO, 0), 0, k) {
+ bch2_trans_init(&trans, c);
+
+ BUG_ON(bch2_trans_preload_iters(&trans));
+
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS,
+ POS(BCACHEFS_ROOT_INO, 0), 0);
+
+ hash_check_init(bch2_xattr_hash_desc, &trans, &h);
+
+ for_each_btree_key_continue(iter, 0, k) {
ret = walk_inode(c, &w, k.k->p.inode);
if (ret)
break;
@@ -483,7 +553,7 @@ static int check_xattrs(struct bch_fs *c)
if (fsck_err_on(!w.have_inode, c,
"xattr for missing inode %llu",
k.k->p.inode)) {
- ret = bch2_btree_delete_at(&iter, 0);
+ ret = bch2_btree_delete_at(iter, 0);
if (ret)
goto err;
continue;
@@ -492,15 +562,13 @@ static int check_xattrs(struct bch_fs *c)
if (w.first_this_inode && w.have_inode)
hash_check_set_inode(&h, c, &w.inode);
- ret = hash_check_key(bch2_xattr_hash_desc, &h, c, &iter, k);
+ ret = hash_check_key(bch2_xattr_hash_desc, &h, c, iter, k);
if (ret)
goto fsck_err;
}
err:
fsck_err:
- bch2_btree_iter_unlock(&h.chain);
- bch2_btree_iter_unlock(&h.iter);
- return bch2_btree_iter_unlock(&iter) ?: ret;
+ return bch2_trans_exit(&trans) ?: ret;
}
/* Get root directory, create if it doesn't exist: */
@@ -509,6 +577,8 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
struct bkey_inode_buf packed;
int ret;
+ bch_verbose(c, "checking root directory");
+
ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode);
if (ret && ret != -ENOENT)
return ret;
@@ -546,6 +616,8 @@ static int check_lostfound(struct bch_fs *c,
u64 inum;
int ret;
+ bch_verbose(c, "checking lost+found");
+
inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info,
&lostfound);
if (!inum) {
@@ -672,6 +744,8 @@ static int check_directory_structure(struct bch_fs *c,
u64 d_inum;
int ret = 0;
+ bch_verbose(c, "checking directory structure");
+
/* DFS: */
restart_dfs:
had_unreachable = false;
@@ -872,64 +946,134 @@ s64 bch2_count_inode_sectors(struct bch_fs *c, u64 inum)
return bch2_btree_iter_unlock(&iter) ?: sectors;
}
-static int bch2_gc_do_inode(struct bch_fs *c,
- struct bch_inode_unpacked *lostfound_inode,
- struct btree_iter *iter,
- struct bkey_s_c_inode inode, struct nlink link)
+static int check_inode_nlink(struct bch_fs *c,
+ struct bch_inode_unpacked *lostfound_inode,
+ struct bch_inode_unpacked *u,
+ struct nlink *link,
+ bool *do_update)
{
- struct bch_inode_unpacked u;
+ u32 i_nlink = u->bi_flags & BCH_INODE_UNLINKED
+ ? 0
+ : u->bi_nlink + nlink_bias(u->bi_mode);
+ u32 real_i_nlink =
+ link->count * nlink_bias(u->bi_mode) +
+ link->dir_count;
int ret = 0;
- u32 i_nlink, real_i_nlink;
- bool do_update = false;
- ret = bch2_inode_unpack(inode, &u);
- if (bch2_fs_inconsistent_on(ret, c,
- "error unpacking inode %llu in fsck",
- inode.k->p.inode))
- return ret;
+ /*
+ * These should have been caught/fixed by earlier passes, we don't
+ * repair them here:
+ */
+ if (S_ISDIR(u->bi_mode) && link->count > 1) {
+ need_fsck_err(c, "directory %llu with multiple hardlinks: %u",
+ u->bi_inum, link->count);
+ return 0;
+ }
- i_nlink = u.bi_nlink + nlink_bias(u.bi_mode);
+ if (S_ISDIR(u->bi_mode) && !link->count) {
+ need_fsck_err(c, "unreachable directory found (inum %llu)",
+ u->bi_inum);
+ return 0;
+ }
- fsck_err_on(i_nlink < link.count, c,
- "inode %llu i_link too small (%u < %u, type %i)",
- inode.k->p.inode, i_nlink,
- link.count, mode_to_type(u.bi_mode));
+ if (!S_ISDIR(u->bi_mode) && link->dir_count) {
+		need_fsck_err(c, "non directory with subdirectories (inum %llu)",
+ u->bi_inum);
+ return 0;
+ }
- /* These should have been caught/fixed by earlier passes: */
- if (S_ISDIR(u.bi_mode)) {
- need_fsck_err_on(link.count > 1, c,
- "directory %llu with multiple hardlinks: %u",
- inode.k->p.inode, link.count);
+ if (!link->count &&
+ !(u->bi_flags & BCH_INODE_UNLINKED) &&
+ (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) {
+ if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)",
+ u->bi_inum, mode_to_type(u->bi_mode)) ==
+ FSCK_ERR_IGNORE)
+ return 0;
- real_i_nlink = link.count * 2 + link.dir_count;
- } else {
- need_fsck_err_on(link.dir_count, c,
- "found dirents for non directory %llu",
- inode.k->p.inode);
+ ret = reattach_inode(c, lostfound_inode, u->bi_inum);
+ if (ret)
+ return ret;
- real_i_nlink = link.count + link.dir_count;
+ link->count = 1;
+ real_i_nlink = nlink_bias(u->bi_mode) + link->dir_count;
+ goto set_i_nlink;
}
- if (!link.count) {
- fsck_err_on(c->sb.clean, c,
- "filesystem marked clean, "
- "but found orphaned inode %llu",
- inode.k->p.inode);
-
- if (fsck_err_on(S_ISDIR(u.bi_mode) &&
- bch2_empty_dir(c, inode.k->p.inode), c,
- "non empty directory with link count 0, "
- "inode nlink %u, dir links found %u",
- i_nlink, link.dir_count)) {
- ret = reattach_inode(c, lostfound_inode,
- inode.k->p.inode);
- if (ret)
- return ret;
+ if (i_nlink < link->count) {
+ if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)",
+ u->bi_inum, i_nlink, link->count,
+ mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE)
+ return 0;
+ goto set_i_nlink;
+ }
+
+ if (i_nlink != real_i_nlink &&
+ c->sb.clean) {
+ if (fsck_err(c, "filesystem marked clean, "
+ "but inode %llu has wrong i_nlink "
+ "(type %u i_nlink %u, should be %u)",
+ u->bi_inum, mode_to_type(u->bi_mode),
+ i_nlink, real_i_nlink) == FSCK_ERR_IGNORE)
+ return 0;
+ goto set_i_nlink;
+ }
+
+ if (i_nlink != real_i_nlink &&
+ (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) {
+ if (fsck_err(c, "inode %llu has wrong i_nlink "
+ "(type %u i_nlink %u, should be %u)",
+ u->bi_inum, mode_to_type(u->bi_mode),
+ i_nlink, real_i_nlink) == FSCK_ERR_IGNORE)
+ return 0;
+ goto set_i_nlink;
+ }
+
+ if (real_i_nlink && i_nlink != real_i_nlink)
+ bch_verbose(c, "setting inode %llu nlink from %u to %u",
+ u->bi_inum, i_nlink, real_i_nlink);
+set_i_nlink:
+ if (i_nlink != real_i_nlink) {
+ if (real_i_nlink) {
+ u->bi_nlink = real_i_nlink - nlink_bias(u->bi_mode);
+ u->bi_flags &= ~BCH_INODE_UNLINKED;
+ } else {
+ u->bi_nlink = 0;
+ u->bi_flags |= BCH_INODE_UNLINKED;
}
- bch_verbose(c, "deleting inode %llu", inode.k->p.inode);
+ *do_update = true;
+ }
+fsck_err:
+ return ret;
+}
+
+static int check_inode(struct bch_fs *c,
+ struct bch_inode_unpacked *lostfound_inode,
+ struct btree_iter *iter,
+ struct bkey_s_c_inode inode,
+ struct nlink *link)
+{
+ struct bch_inode_unpacked u;
+ bool do_update = false;
+ int ret = 0;
+
+ ret = bch2_inode_unpack(inode, &u);
+ if (bch2_fs_inconsistent_on(ret, c,
+ "error unpacking inode %llu in fsck",
+ inode.k->p.inode))
+ return ret;
+
+ if (link) {
+ ret = check_inode_nlink(c, lostfound_inode, &u, link,
+ &do_update);
+ if (ret)
+ return ret;
+ }
+
+ if (u.bi_flags & BCH_INODE_UNLINKED) {
+ bch_verbose(c, "deleting inode %llu", u.bi_inum);
- ret = bch2_inode_rm(c, inode.k->p.inode);
+ ret = bch2_inode_rm(c, u.bi_inum);
if (ret)
bch_err(c, "error in fs gc: error %i "
"while deleting inode", ret);
@@ -940,16 +1084,16 @@ static int bch2_gc_do_inode(struct bch_fs *c,
fsck_err_on(c->sb.clean, c,
"filesystem marked clean, "
"but inode %llu has i_size dirty",
- inode.k->p.inode);
+ u.bi_inum);
- bch_verbose(c, "truncating inode %llu", inode.k->p.inode);
+ bch_verbose(c, "truncating inode %llu", u.bi_inum);
/*
* XXX: need to truncate partial blocks too here - or ideally
* just switch units to bytes and that issue goes away
*/
- ret = bch2_inode_truncate(c, inode.k->p.inode,
+ ret = bch2_inode_truncate(c, u.bi_inum,
round_up(u.bi_size, PAGE_SIZE) >> 9,
NULL, NULL);
if (ret) {
@@ -974,12 +1118,12 @@ static int bch2_gc_do_inode(struct bch_fs *c,
fsck_err_on(c->sb.clean, c,
"filesystem marked clean, "
"but inode %llu has i_sectors dirty",
- inode.k->p.inode);
+ u.bi_inum);
bch_verbose(c, "recounting sectors for inode %llu",
- inode.k->p.inode);
+ u.bi_inum);
- sectors = bch2_count_inode_sectors(c, inode.k->p.inode);
+ sectors = bch2_count_inode_sectors(c, u.bi_inum);
if (sectors < 0) {
bch_err(c, "error in fs gc: error %i "
"recounting inode sectors",
@@ -992,20 +1136,6 @@ static int bch2_gc_do_inode(struct bch_fs *c,
do_update = true;
}
- if (i_nlink != real_i_nlink) {
- fsck_err_on(c->sb.clean, c,
- "filesystem marked clean, "
- "but inode %llu has wrong i_nlink "
- "(type %u i_nlink %u, should be %u)",
- inode.k->p.inode, mode_to_type(u.bi_mode),
- i_nlink, real_i_nlink);
-
- bch_verbose(c, "setting inode %llu nlinks from %u to %u",
- inode.k->p.inode, i_nlink, real_i_nlink);
- u.bi_nlink = real_i_nlink - nlink_bias(u.bi_mode);
- do_update = true;
- }
-
if (do_update) {
struct bkey_inode_buf p;
@@ -1024,9 +1154,9 @@ fsck_err:
noinline_for_stack
static int bch2_gc_walk_inodes(struct bch_fs *c,
- struct bch_inode_unpacked *lostfound_inode,
- nlink_table *links,
- u64 range_start, u64 range_end)
+ struct bch_inode_unpacked *lostfound_inode,
+ nlink_table *links,
+ u64 range_start, u64 range_end)
{
struct btree_iter iter;
struct bkey_s_c k;
@@ -1065,10 +1195,9 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links);
*/
bch2_btree_iter_unlock(&iter);
- ret = bch2_gc_do_inode(c, lostfound_inode, &iter,
- bkey_s_c_to_inode(k), *link);
- if (ret == -EINTR)
- continue;
+ ret = check_inode(c, lostfound_inode, &iter,
+ bkey_s_c_to_inode(k), link);
+ BUG_ON(ret == -EINTR);
if (ret)
break;
@@ -1103,6 +1232,8 @@ static int check_inode_nlinks(struct bch_fs *c,
u64 this_iter_range_start, next_iter_range_start = 0;
int ret = 0;
+ bch_verbose(c, "checking inode nlinks");
+
genradix_init(&links);
do {
@@ -1129,68 +1260,103 @@ static int check_inode_nlinks(struct bch_fs *c,
return ret;
}
+noinline_for_stack
+static int check_inodes_fast(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_inode inode;
+ unsigned long nr_inodes = 0;
+ int ret = 0;
+
+ for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) {
+ if (k.k->type != BCH_INODE_FS)
+ continue;
+
+ inode = bkey_s_c_to_inode(k);
+
+ if (!(inode.v->bi_flags & BCH_INODE_UNLINKED))
+ nr_inodes++;
+
+ if (inode.v->bi_flags &
+ (BCH_INODE_I_SIZE_DIRTY|
+ BCH_INODE_I_SECTORS_DIRTY|
+ BCH_INODE_UNLINKED)) {
+ fsck_err_on(c->sb.clean, c,
+ "filesystem marked clean but found inode %llu with flags %x",
+ inode.k->p.inode, inode.v->bi_flags);
+ ret = check_inode(c, NULL, &iter, inode, NULL);
+ BUG_ON(ret == -EINTR);
+ if (ret)
+ break;
+ }
+ }
+ atomic_long_set(&c->nr_inodes, nr_inodes);
+fsck_err:
+ return bch2_btree_iter_unlock(&iter) ?: ret;
+}
+
/*
* Checks for inconsistencies that shouldn't happen, unless we have a bug.
* Doesn't fix them yet, mainly because they haven't yet been observed:
*/
-int bch2_fsck(struct bch_fs *c, bool full_fsck)
+static int bch2_fsck_full(struct bch_fs *c)
{
struct bch_inode_unpacked root_inode, lostfound_inode;
int ret;
- if (full_fsck) {
- bch_verbose(c, "checking extents");
- ret = check_extents(c);
- if (ret)
- return ret;
+ bch_verbose(c, "starting fsck:");
+ ret = check_extents(c) ?:
+ check_dirents(c) ?:
+ check_xattrs(c) ?:
+ check_root(c, &root_inode) ?:
+ check_lostfound(c, &root_inode, &lostfound_inode) ?:
+ check_directory_structure(c, &lostfound_inode) ?:
+ check_inode_nlinks(c, &lostfound_inode);
- bch_verbose(c, "checking dirents");
- ret = check_dirents(c);
- if (ret)
- return ret;
+ bch2_flush_fsck_errs(c);
+ bch_verbose(c, "fsck done");
- bch_verbose(c, "checking xattrs");
- ret = check_xattrs(c);
- if (ret)
- return ret;
+ return ret;
+}
- bch_verbose(c, "checking root directory");
- ret = check_root(c, &root_inode);
- if (ret)
- return ret;
+static int bch2_fsck_inode_nlink(struct bch_fs *c)
+{
+ struct bch_inode_unpacked root_inode, lostfound_inode;
+ int ret;
- bch_verbose(c, "checking lost+found");
- ret = check_lostfound(c, &root_inode, &lostfound_inode);
- if (ret)
- return ret;
+ bch_verbose(c, "checking inode link counts:");
+ ret = check_root(c, &root_inode) ?:
+ check_lostfound(c, &root_inode, &lostfound_inode) ?:
+ check_inode_nlinks(c, &lostfound_inode);
- bch_verbose(c, "checking directory structure");
- ret = check_directory_structure(c, &lostfound_inode);
- if (ret)
- return ret;
+ bch2_flush_fsck_errs(c);
+ bch_verbose(c, "done");
- bch_verbose(c, "checking inode nlinks");
- ret = check_inode_nlinks(c, &lostfound_inode);
- if (ret)
- return ret;
- } else {
- bch_verbose(c, "checking root directory");
- ret = check_root(c, &root_inode);
- if (ret)
- return ret;
+ return ret;
+}
- bch_verbose(c, "checking lost+found");
- ret = check_lostfound(c, &root_inode, &lostfound_inode);
- if (ret)
- return ret;
+static int bch2_fsck_walk_inodes_only(struct bch_fs *c)
+{
+ int ret;
- bch_verbose(c, "checking inode nlinks");
- ret = check_inode_nlinks(c, &lostfound_inode);
- if (ret)
- return ret;
- }
+ bch_verbose(c, "walking inodes:");
+ ret = check_inodes_fast(c);
bch2_flush_fsck_errs(c);
+ bch_verbose(c, "done");
- return 0;
+ return ret;
+}
+
+int bch2_fsck(struct bch_fs *c)
+{
+ if (!c->opts.nofsck)
+ return bch2_fsck_full(c);
+
+ if (!c->sb.clean &&
+ !(c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK)))
+ return bch2_fsck_inode_nlink(c);
+
+ return bch2_fsck_walk_inodes_only(c);
}
diff --git a/libbcachefs/fsck.h b/libbcachefs/fsck.h
index f9af130..bc9caaf 100644
--- a/libbcachefs/fsck.h
+++ b/libbcachefs/fsck.h
@@ -2,6 +2,6 @@
#define _BCACHEFS_FSCK_H
s64 bch2_count_inode_sectors(struct bch_fs *, u64);
-int bch2_fsck(struct bch_fs *, bool);
+int bch2_fsck(struct bch_fs *);
#endif /* _BCACHEFS_FSCK_H */
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 3ae5ac9..d4139fa 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -203,6 +203,10 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
return "invalid data checksum type";
+ if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
+ unpacked.bi_nlink != 0)
+ return "flagged as unlinked but bi_nlink != 0";
+
return NULL;
}
case BCH_INODE_BLOCKDEV:
@@ -276,12 +280,27 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
}
}
-int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
- u64 min, u64 max, u64 *hint)
+static inline u32 bkey_generation(struct bkey_s_c k)
{
- struct bkey_inode_buf inode_p;
- struct btree_iter iter;
- bool searched_from_start = false;
+ switch (k.k->type) {
+ case BCH_INODE_BLOCKDEV:
+ case BCH_INODE_FS:
+ BUG();
+ case BCH_INODE_GENERATION:
+ return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
+ default:
+ return 0;
+ }
+}
+
+int __bch2_inode_create(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode_u,
+ u64 min, u64 max, u64 *hint)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_inode_buf *inode_p;
+ struct btree_iter *iter;
+ u64 start;
int ret;
if (!max)
@@ -290,82 +309,66 @@ int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
if (c->opts.inodes_32bit)
max = min_t(u64, max, U32_MAX);
- if (*hint >= max || *hint < min)
- *hint = min;
+ start = READ_ONCE(*hint);
- if (*hint == min)
- searched_from_start = true;
-again:
- bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(*hint, 0),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ if (start >= max || start < min)
+ start = min;
+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
+ if (IS_ERR(inode_p))
+ return PTR_ERR(inode_p);
+
+ iter = bch2_trans_get_iter(trans,
+ BTREE_ID_INODES, POS(start, 0),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter);
+again:
while (1) {
- struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
- u32 bi_generation = 0;
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
ret = btree_iter_err(k);
- if (ret) {
- bch2_btree_iter_unlock(&iter);
+ if (ret)
return ret;
- }
switch (k.k->type) {
case BCH_INODE_BLOCKDEV:
case BCH_INODE_FS:
/* slot used */
- if (iter.pos.inode == max)
+ if (iter->pos.inode >= max)
goto out;
- bch2_btree_iter_next_slot(&iter);
+ bch2_btree_iter_next_slot(iter);
break;
- case BCH_INODE_GENERATION: {
- struct bkey_s_c_inode_generation g =
- bkey_s_c_to_inode_generation(k);
- bi_generation = le32_to_cpu(g.v->bi_generation);
- /* fallthrough: */
- }
default:
- inode_u->bi_generation = bi_generation;
-
- bch2_inode_pack(&inode_p, inode_u);
- inode_p.inode.k.p = k.k->p;
-
- ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(&iter,
- &inode_p.inode.k_i));
-
- if (ret != -EINTR) {
- bch2_btree_iter_unlock(&iter);
-
- if (!ret) {
- inode_u->bi_inum =
- inode_p.inode.k.p.inode;
- *hint = inode_p.inode.k.p.inode + 1;
- }
-
- return ret;
- }
-
- if (ret == -EINTR)
- continue;
+ *hint = k.k->p.inode;
+ inode_u->bi_inum = k.k->p.inode;
+ inode_u->bi_generation = bkey_generation(k);
+ bch2_inode_pack(inode_p, inode_u);
+ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
+ return 0;
}
}
out:
- bch2_btree_iter_unlock(&iter);
-
- if (!searched_from_start) {
+ if (start != min) {
/* Retry from start */
- *hint = min;
- searched_from_start = true;
+ start = min;
+ bch2_btree_iter_set_pos(iter, POS(start, 0));
goto again;
}
return -ENOSPC;
}
+int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
+ u64 min, u64 max, u64 *hint)
+{
+ return bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC,
+ __bch2_inode_create(&trans, inode_u, min, max, hint));
+}
+
int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size,
struct extent_insert_hook *hook, u64 *journal_seq)
{
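(Editor's sketch.) __bch2_inode_create() above keeps the old allocation strategy: scan forward from the cached hint, take the first free slot (inheriting bi_generation from any BCH_INODE_GENERATION key found there), and wrap around to the minimum once before returning -ENOSPC. A simplified version of just the search, with slot_free() as a hypothetical predicate and generation handling left out:

static int scan_for_free_inode(bool (*slot_free)(u64), u64 hint,
			       u64 min, u64 max, u64 *inum)
{
	u64 start = (hint < min || hint >= max) ? min : hint;
	u64 pos;

	for (pos = start; pos < max; pos++)	/* forward scan from the hint */
		if (slot_free(pos)) {
			*inum = pos;
			return 0;
		}

	if (start != min)			/* wrap around once, retry from the minimum */
		for (pos = min; pos < start; pos++)
			if (slot_free(pos)) {
				*inum = pos;
				return 0;
			}

	return -ENOSPC;
}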
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index 2646106..a47194a 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -38,8 +38,13 @@ int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
uid_t, gid_t, umode_t, dev_t,
struct bch_inode_unpacked *);
+
+int __bch2_inode_create(struct btree_trans *,
+ struct bch_inode_unpacked *,
+ u64, u64, u64 *);
int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *,
u64, u64, u64 *);
+
int bch2_inode_truncate(struct bch_fs *, u64, u64,
struct extent_insert_hook *, u64 *);
int bch2_inode_rm(struct bch_fs *, u64);
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 58aee7a..0af136d 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -5,6 +5,7 @@
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
+#include "dirent.h"
#include "error.h"
#include "fsck.h"
#include "journal_io.h"
@@ -14,6 +15,8 @@
#include <linux/stat.h>
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+
struct bkey_i *btree_root_find(struct bch_fs *c,
struct bch_sb_field_clean *clean,
struct jset *j,
@@ -233,7 +236,8 @@ int bch2_fs_recovery(struct bch_fs *c)
bch2_fs_journal_start(&c->journal);
err = "error starting allocator";
- if (bch2_fs_allocator_start(c))
+ ret = bch2_fs_allocator_start(c);
+ if (ret)
goto err;
bch_verbose(c, "starting journal replay:");
@@ -246,12 +250,16 @@ int bch2_fs_recovery(struct bch_fs *c)
if (c->opts.norecovery)
goto out;
- bch_verbose(c, "starting fsck:");
err = "error in fsck";
- ret = bch2_fsck(c, !c->opts.nofsck);
+ ret = bch2_fsck(c);
if (ret)
goto err;
- bch_verbose(c, "fsck done");
+
+ if (!test_bit(BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags)) {
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
+ mutex_unlock(&c->sb_lock);
+ }
if (enabled_qtypes(c)) {
bch_verbose(c, "reading quotas:");
@@ -273,8 +281,10 @@ fsck_err:
int bch2_fs_initialize(struct bch_fs *c)
{
- struct bch_inode_unpacked inode;
+ struct bch_inode_unpacked root_inode, lostfound_inode;
struct bkey_inode_buf packed_inode;
+ struct bch_hash_info root_hash_info;
+ struct qstr lostfound = QSTR("lost+found");
const char *err = "cannot allocate memory";
struct bch_dev *ca;
LIST_HEAD(journal);
@@ -307,21 +317,46 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_journal_set_replay_done(&c->journal);
err = "error starting allocator";
- if (bch2_fs_allocator_start(c))
+ ret = bch2_fs_allocator_start(c);
+ if (ret)
goto err;
- bch2_inode_init(c, &inode, 0, 0,
+ bch2_inode_init(c, &root_inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
- inode.bi_inum = BCACHEFS_ROOT_INO;
-
- bch2_inode_pack(&packed_inode, &inode);
+ root_inode.bi_inum = BCACHEFS_ROOT_INO;
+ root_inode.bi_nlink++; /* lost+found */
+ bch2_inode_pack(&packed_inode, &root_inode);
err = "error creating root directory";
- if (bch2_btree_insert(c, BTREE_ID_INODES,
- &packed_inode.inode.k_i,
- NULL, NULL, NULL, 0))
+ ret = bch2_btree_insert(c, BTREE_ID_INODES,
+ &packed_inode.inode.k_i,
+ NULL, NULL, NULL, 0);
+ if (ret)
goto err;
+ bch2_inode_init(c, &lostfound_inode, 0, 0,
+ S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0,
+ &root_inode);
+ lostfound_inode.bi_inum = BCACHEFS_ROOT_INO + 1;
+ bch2_inode_pack(&packed_inode, &lostfound_inode);
+
+ err = "error creating lost+found";
+ ret = bch2_btree_insert(c, BTREE_ID_INODES,
+ &packed_inode.inode.k_i,
+ NULL, NULL, NULL, 0);
+ if (ret)
+ goto err;
+
+ root_hash_info = bch2_hash_info_init(c, &root_inode);
+
+ ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR,
+ &lostfound, lostfound_inode.bi_inum, NULL,
+ BTREE_INSERT_NOFAIL);
+ if (ret)
+ goto err;
+
+ atomic_long_set(&c->nr_inodes, 2);
+
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
if (ret)
@@ -329,12 +364,14 @@ int bch2_fs_initialize(struct bch_fs *c)
}
err = "error writing first journal entry";
- if (bch2_journal_meta(&c->journal))
+ ret = bch2_journal_meta(&c->journal);
+ if (ret)
goto err;
mutex_lock(&c->sb_lock);
SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
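
Both bch2_fs_recovery() and bch2_fs_initialize() now flip BCH_FEATURE_ATOMIC_NLINK once link counts are known to be consistent. A hedged sketch of that superblock update in isolation; the helper name is illustrative, and whether bch2_write_super() is called immediately depends on the call site (bch2_fs_initialize() writes the superblock right away, the fsck path above defers it):

/*
 * Illustrative only: set a feature bit in the cached superblock under
 * sb_lock and persist it. Not part of the patch.
 */
static void example_set_feature(struct bch_fs *c, unsigned feature)
{
	mutex_lock(&c->sb_lock);
	c->disk_sb.sb->features[0] |= 1ULL << feature;
	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);
}
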
diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h
index c805109..99f1fe8 100644
--- a/libbcachefs/str_hash.h
+++ b/libbcachefs/str_hash.h
@@ -125,46 +125,29 @@ struct bch_hash_desc {
bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
};
-static inline struct bkey_s_c
-bch2_hash_lookup_at(const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct btree_iter *iter, const void *search)
+static inline struct btree_iter *
+bch2_hash_lookup(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ u64 inode, const void *key,
+ unsigned flags)
{
- u64 inode = iter->pos.inode;
+ struct btree_iter *iter;
struct bkey_s_c k;
- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
- if (iter->pos.inode != inode)
- break;
-
- if (k.k->type == desc.key_type) {
- if (!desc.cmp_key(k, search))
- return k;
- } else if (k.k->type == desc.whiteout_type) {
- ;
- } else {
- /* hole, not found */
- break;
- }
- }
- return btree_iter_err(k) ? k : bkey_s_c_err(-ENOENT);
-}
-
-static inline struct bkey_s_c
-bch2_hash_lookup_bkey_at(const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct btree_iter *iter, struct bkey_s_c search)
-{
- u64 inode = iter->pos.inode;
- struct bkey_s_c k;
+ iter = bch2_trans_get_iter(trans, desc.btree_id,
+ POS(inode, desc.hash_key(info, key)),
+ BTREE_ITER_SLOTS|flags);
+ if (IS_ERR(iter))
+ return iter;
for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
if (iter->pos.inode != inode)
break;
if (k.k->type == desc.key_type) {
- if (!desc.cmp_bkey(k, search))
- return k;
+ if (!desc.cmp_key(k, key))
+ return iter;
} else if (k.k->type == desc.whiteout_type) {
;
} else {
@@ -172,72 +155,48 @@ bch2_hash_lookup_bkey_at(const struct bch_hash_desc desc,
break;
}
}
- return btree_iter_err(k) ? k : bkey_s_c_err(-ENOENT);
-}
-
-static inline struct bkey_s_c
-bch2_hash_lookup(const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct bch_fs *c, u64 inode,
- struct btree_iter *iter, const void *key)
-{
- bch2_btree_iter_init(iter, c, desc.btree_id,
- POS(inode, desc.hash_key(info, key)),
- BTREE_ITER_SLOTS);
-
- return bch2_hash_lookup_at(desc, info, iter, key);
-}
-
-static inline struct bkey_s_c
-bch2_hash_lookup_intent(const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct bch_fs *c, u64 inode,
- struct btree_iter *iter, const void *key)
-{
- bch2_btree_iter_init(iter, c, desc.btree_id,
- POS(inode, desc.hash_key(info, key)),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- return bch2_hash_lookup_at(desc, info, iter, key);
+ return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOENT);
}
-static inline struct bkey_s_c
-bch2_hash_hole_at(const struct bch_hash_desc desc, struct btree_iter *iter)
+static inline struct btree_iter *
+bch2_hash_hole(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ u64 inode, const void *key)
{
- u64 inode = iter->pos.inode;
+ struct btree_iter *iter;
struct bkey_s_c k;
+ iter = bch2_trans_get_iter(trans, desc.btree_id,
+ POS(inode, desc.hash_key(info, key)),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ if (IS_ERR(iter))
+ return iter;
+
for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
if (iter->pos.inode != inode)
break;
if (k.k->type != desc.key_type)
- return k;
+ return iter;
}
- return btree_iter_err(k) ? k : bkey_s_c_err(-ENOENT);
-}
-
-static inline struct bkey_s_c bch2_hash_hole(const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct bch_fs *c, u64 inode,
- struct btree_iter *iter,
- const void *key)
-{
- bch2_btree_iter_init(iter, c, desc.btree_id,
- POS(inode, desc.hash_key(info, key)),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- return bch2_hash_hole_at(desc, iter);
+ return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOSPC);
}
-static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc,
+static inline int bch2_hash_needs_whiteout(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
const struct bch_hash_info *info,
- struct btree_iter *iter,
struct btree_iter *start)
{
+ struct btree_iter *iter;
struct bkey_s_c k;
- bch2_btree_iter_copy(iter, start);
+ iter = bch2_trans_copy_iter(trans, start);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter);
+
bch2_btree_iter_next_slot(iter);
for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
@@ -252,142 +211,108 @@ static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc,
return btree_iter_err(k);
}
-static inline int bch2_hash_set(const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct bch_fs *c, u64 inode,
- u64 *journal_seq,
- struct bkey_i *insert, int flags)
+static inline int __bch2_hash_set(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ u64 inode, struct bkey_i *insert, int flags)
{
- struct btree_iter iter, hashed_slot;
+ struct btree_iter *iter, *slot = NULL;
struct bkey_s_c k;
- int ret;
- bch2_btree_iter_init(&hashed_slot, c, desc.btree_id,
- POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- bch2_btree_iter_init(&iter, c, desc.btree_id, hashed_slot.pos,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- bch2_btree_iter_link(&hashed_slot, &iter);
-retry:
- /*
- * On hash collision, we have to keep the slot we hashed to locked while
- * we do the insert - to avoid racing with another thread deleting
- * whatever's in the slot we hashed to:
- */
- ret = bch2_btree_iter_traverse(&hashed_slot);
- if (ret)
- goto err;
-
- /*
- * On -EINTR/retry, we dropped locks - always restart from the slot we
- * hashed to:
- */
- bch2_btree_iter_copy(&iter, &hashed_slot);
-
- k = bch2_hash_lookup_bkey_at(desc, info, &iter, bkey_i_to_s_c(insert));
-
- ret = btree_iter_err(k);
- if (ret == -ENOENT) {
- if (flags & BCH_HASH_SET_MUST_REPLACE) {
- ret = -ENOENT;
- goto err;
+ iter = bch2_trans_get_iter(trans, desc.btree_id,
+ POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter);
+
+ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
+ if (iter->pos.inode != inode)
+ break;
+
+ if (k.k->type == desc.key_type) {
+ if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
+ goto found;
+
+ /* hash collision: */
+ continue;
}
- /*
- * Not found, so we're now looking for any open
- * slot - we might have skipped over a whiteout
- * that we could have used, so restart from the
- * slot we hashed to:
- */
- bch2_btree_iter_copy(&iter, &hashed_slot);
- k = bch2_hash_hole_at(desc, &iter);
- if ((ret = btree_iter_err(k)))
- goto err;
- } else if (!ret) {
- if (flags & BCH_HASH_SET_MUST_CREATE) {
- ret = -EEXIST;
- goto err;
+ if (!slot &&
+ !(flags & BCH_HASH_SET_MUST_REPLACE)) {
+ slot = bch2_trans_copy_iter(trans, iter);
+ if (IS_ERR(slot))
+ return PTR_ERR(slot);
}
- } else {
- goto err;
+
+ if (k.k->type != desc.whiteout_type)
+ goto not_found;
}
- insert->k.p = iter.pos;
- ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
- BTREE_INSERT_ATOMIC|flags,
- BTREE_INSERT_ENTRY(&iter, insert));
-err:
- if (ret == -EINTR)
- goto retry;
-
- /*
- * On successful insert, we don't want to clobber ret with error from
- * iter:
- */
- bch2_btree_iter_unlock(&iter);
- bch2_btree_iter_unlock(&hashed_slot);
- return ret;
+ return btree_iter_err(k) ?: -ENOSPC;
+not_found:
+ if (flags & BCH_HASH_SET_MUST_REPLACE)
+ return -ENOENT;
+
+ insert->k.p = slot->pos;
+ bch2_trans_update(trans, slot, insert, 0);
+ return 0;
+found:
+ if (flags & BCH_HASH_SET_MUST_CREATE)
+ return -EEXIST;
+
+ insert->k.p = iter->pos;
+ bch2_trans_update(trans, iter, insert, 0);
+ return 0;
}
-static inline int bch2_hash_delete_at(const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct btree_iter *iter,
- u64 *journal_seq)
+static inline int bch2_hash_set(const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct bch_fs *c, u64 inode,
+ u64 *journal_seq,
+ struct bkey_i *insert, int flags)
{
- struct btree_iter whiteout_iter;
- struct bkey_i delete;
- int ret = -ENOENT;
+ return bch2_trans_do(c, journal_seq, flags|BTREE_INSERT_ATOMIC,
+ __bch2_hash_set(&trans, desc, info,
+ inode, insert, flags));
+}
- bch2_btree_iter_init(&whiteout_iter, iter->c, desc.btree_id,
- iter->pos, BTREE_ITER_SLOTS);
- bch2_btree_iter_link(iter, &whiteout_iter);
+static inline int bch2_hash_delete_at(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct btree_iter *iter)
+{
+ struct bkey_i *delete;
+ int ret;
- ret = bch2_hash_needs_whiteout(desc, info, &whiteout_iter, iter);
+ ret = bch2_hash_needs_whiteout(trans, desc, info, iter);
if (ret < 0)
- goto err;
-
- bkey_init(&delete.k);
- delete.k.p = iter->pos;
- delete.k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED;
-
- ret = bch2_btree_insert_at(iter->c, NULL, NULL, journal_seq,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(iter, &delete));
-err:
- bch2_btree_iter_unlink(&whiteout_iter);
- return ret;
+ return ret;
+
+ delete = bch2_trans_kmalloc(trans, sizeof(*delete));
+ if (IS_ERR(delete))
+ return PTR_ERR(delete);
+
+ bkey_init(&delete->k);
+ delete->k.p = iter->pos;
+ delete->k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED;
+
+ bch2_trans_update(trans, iter, delete, 0);
+ return 0;
}
-static inline int bch2_hash_delete(const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct bch_fs *c, u64 inode,
- u64 *journal_seq, const void *key)
+static inline int bch2_hash_delete(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ u64 inode, const void *key)
{
- struct btree_iter iter, whiteout_iter;
- struct bkey_s_c k;
- int ret = -ENOENT;
-
- bch2_btree_iter_init(&iter, c, desc.btree_id,
- POS(inode, desc.hash_key(info, key)),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- bch2_btree_iter_init(&whiteout_iter, c, desc.btree_id,
- POS(inode, desc.hash_key(info, key)),
- BTREE_ITER_SLOTS);
- bch2_btree_iter_link(&iter, &whiteout_iter);
-retry:
- k = bch2_hash_lookup_at(desc, info, &iter, key);
- if ((ret = btree_iter_err(k)))
- goto err;
-
- ret = bch2_hash_delete_at(desc, info, &iter, journal_seq);
-err:
- if (ret == -EINTR)
- goto retry;
-
- bch2_btree_iter_unlock(&whiteout_iter);
- bch2_btree_iter_unlock(&iter);
- return ret;
+ struct btree_iter *iter;
+
+ iter = bch2_hash_lookup(trans, desc, info, inode, key,
+ BTREE_ITER_INTENT);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter);
+
+ return bch2_hash_delete_at(trans, desc, info, iter);
}
#endif /* _BCACHEFS_STR_HASH_H */
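
With the _at/_intent variants removed, a lookup now hands back a btree_iter owned by the transaction rather than a bkey_s_c, and the caller peeks the slot itself. A minimal read-side sketch under those assumptions; example_hash_peek() is not in the patch, and retrying on -EINTR remains the caller's responsibility (via bch2_trans_do() or an explicit retry loop):

/*
 * Sketch only: look up a hashed key inside an existing transaction and
 * return the bkey the iterator points at.
 */
static int example_hash_peek(struct btree_trans *trans,
			     const struct bch_hash_desc desc,
			     const struct bch_hash_info *info,
			     u64 inode, const void *key,
			     struct bkey_s_c *k)
{
	struct btree_iter *iter;

	iter = bch2_hash_lookup(trans, desc, info, inode, key, 0);
	if (IS_ERR(iter))
		return PTR_ERR(iter);	/* -ENOENT when nothing matches */

	*k = bch2_btree_iter_peek_slot(iter);
	return btree_iter_err(*k);
}
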
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 24c6cc5..1272ea7 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -52,7 +52,7 @@ static int __bch2_strtoh(const char *cp, u64 *res,
cp++;
} while (isdigit(*cp));
- for (u = 1; u < ARRAY_SIZE(si_units); u++)
+ for (u = 1; u < strlen(si_units); u++)
if (*cp == si_units[u]) {
cp++;
goto got_unit;
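
The one-character change in __bch2_strtoh() matters because ARRAY_SIZE() of a string literal counts the terminating NUL, so when the input ends right after the digits the old loop could "match" the NUL and mis-parse a bare number as having a unit suffix; strlen() stops short of the NUL. A standalone userspace demonstration follows; the contents of si_units here are an assumption that only approximates the table in util.c:

#include <stdio.h>
#include <string.h>

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

/* Assumed unit table; index 1 = 'k' = 2^10, and so on. */
static const char si_units[] = "?kMGTPEZY";

int main(void)
{
	const char *cp = "";	/* parser position after the digits: no suffix */
	unsigned u;

	for (u = 1; u < ARRAY_SIZE(si_units); u++)	/* old bound: includes the NUL */
		if (*cp == si_units[u])
			printf("old bound: spurious unit match at u=%u\n", u);

	for (u = 1; u < strlen(si_units); u++)		/* new bound: stops before the NUL */
		if (*cp == si_units[u])
			printf("new bound: match at u=%u\n", u);

	return 0;
}
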
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index c6b5015..7d0fee3 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -74,7 +74,6 @@ const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
const struct xattr_handler *handler;
struct bkey_s_c_xattr xattr;
- unsigned u64s;
switch (k.k->type) {
case BCH_XATTR:
@@ -82,13 +81,15 @@ const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k)
return "value too small";
xattr = bkey_s_c_to_xattr(k);
- u64s = xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len));
- if (bkey_val_u64s(k.k) < u64s)
+ if (bkey_val_u64s(k.k) <
+ xattr_val_u64s(xattr.v->x_name_len,
+ le16_to_cpu(xattr.v->x_val_len)))
return "value too small";
- if (bkey_val_u64s(k.k) > u64s)
+ if (bkey_val_u64s(k.k) >
+ xattr_val_u64s(xattr.v->x_name_len,
+ le16_to_cpu(xattr.v->x_val_len) + 4))
return "value too big";
handler = bch2_xattr_type_to_handler(xattr.v->x_type);
@@ -142,32 +143,28 @@ void bch2_xattr_to_text(struct bch_fs *c, char *buf,
}
}
-struct bkey_s_c bch2_xattr_get_iter(struct bch_fs *c,
- struct btree_iter *iter,
- struct bch_inode_info *inode,
- const char *name, int type)
-{
- return bch2_hash_lookup(bch2_xattr_hash_desc,
- &inode->ei_str_hash,
- c, inode->v.i_ino, iter,
- &X_SEARCH(type, name, strlen(name)));
-}
-
int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
- const char *name, void *buffer, size_t size, int type)
+ const char *name, void *buffer, size_t size, int type)
{
- struct btree_iter iter;
- struct bkey_s_c k;
+ struct btree_trans trans;
+ struct btree_iter *iter;
struct bkey_s_c_xattr xattr;
int ret;
- k = bch2_hash_lookup(bch2_xattr_hash_desc, &inode->ei_str_hash, c,
- inode->v.i_ino, &iter,
- &X_SEARCH(type, name, strlen(name)));
- if (IS_ERR(k.k))
- return bch2_btree_iter_unlock(&iter) ?: -ENODATA;
+ bch2_trans_init(&trans, c);
+
+ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
+ &inode->ei_str_hash, inode->v.i_ino,
+ &X_SEARCH(type, name, strlen(name)),
+ 0);
+ if (IS_ERR(iter)) {
+ bch2_trans_exit(&trans);
+ BUG_ON(PTR_ERR(iter) == -EINTR);
- xattr = bkey_s_c_to_xattr(k);
+ return PTR_ERR(iter) == -ENOENT ? -ENODATA : PTR_ERR(iter);
+ }
+
+ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
ret = le16_to_cpu(xattr.v->x_val_len);
if (buffer) {
if (ret > size)
@@ -176,47 +173,48 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
memcpy(buffer, xattr_val(xattr.v), ret);
}
- bch2_btree_iter_unlock(&iter);
+ bch2_trans_exit(&trans);
return ret;
}
-int bch2_xattr_set(struct bch_fs *c, u64 inum,
+int bch2_xattr_set(struct btree_trans *trans, u64 inum,
const struct bch_hash_info *hash_info,
const char *name, const void *value, size_t size,
- int flags, int type, u64 *journal_seq)
+ int type, int flags)
{
- struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
int ret;
if (value) {
struct bkey_i_xattr *xattr;
+ unsigned namelen = strlen(name);
unsigned u64s = BKEY_U64s +
- xattr_val_u64s(search.name.len, size);
+ xattr_val_u64s(namelen, size);
if (u64s > U8_MAX)
return -ERANGE;
- xattr = kmalloc(u64s * sizeof(u64), GFP_NOFS);
- if (!xattr)
- return -ENOMEM;
+ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+ if (IS_ERR(xattr))
+ return PTR_ERR(xattr);
bkey_xattr_init(&xattr->k_i);
xattr->k.u64s = u64s;
xattr->v.x_type = type;
- xattr->v.x_name_len = search.name.len;
+ xattr->v.x_name_len = namelen;
xattr->v.x_val_len = cpu_to_le16(size);
- memcpy(xattr->v.x_name, search.name.name, search.name.len);
+ memcpy(xattr->v.x_name, name, namelen);
memcpy(xattr_val(&xattr->v), value, size);
- ret = bch2_hash_set(bch2_xattr_hash_desc, hash_info, c,
- inum, journal_seq,
- &xattr->k_i,
- (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
- (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
- kfree(xattr);
+ ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
+ inum, &xattr->k_i,
+ (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
+ (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
} else {
- ret = bch2_hash_delete(bch2_xattr_hash_desc, hash_info,
- c, inum, journal_seq, &search);
+ struct xattr_search_key search =
+ X_SEARCH(type, name, strlen(name));
+
+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc,
+ hash_info, inum, &search);
}
if (ret == -ENOENT)
@@ -308,9 +306,11 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler,
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- return bch2_xattr_set(c, inode->v.i_ino, &inode->ei_str_hash,
- name, value, size, flags, handler->flags,
- &inode->ei_journal_seq);
+ return bch2_trans_do(c, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC,
+ bch2_xattr_set(&trans, inode->v.i_ino,
+ &inode->ei_str_hash,
+ name, value, size,
+ handler->flags, flags));
}
static const struct xattr_handler bch_xattr_user_handler = {
@@ -433,7 +433,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
}
mutex_lock(&inode->ei_update_lock);
- ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s);
+ ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
mutex_unlock(&inode->ei_update_lock);
if (value &&
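
Because bch2_xattr_set() now operates on a btree_trans, several updates can share one atomic commit; the bch2_trans_do() call in bch2_xattr_set_handler() above commits a single update, but nothing prevents chaining more work before the commit. A sketch under that assumption; the helper name, the "?:" chaining, and the choice of type/flags values are illustrative only:

/*
 * Sketch only: set two xattrs on the same inode in one atomic commit by
 * chaining two bch2_xattr_set() calls inside bch2_trans_do(). Not part
 * of the patch; 'type' and 'flags' are whatever the xattr handler would
 * normally supply.
 */
static int example_set_two_xattrs(struct bch_fs *c, struct bch_inode_info *inode,
				  const char *name1, const void *val1, size_t size1,
				  const char *name2, const void *val2, size_t size2,
				  int type, int flags)
{
	return bch2_trans_do(c, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC,
			     bch2_xattr_set(&trans, inode->v.i_ino, &inode->ei_str_hash,
					    name1, val1, size1, type, flags) ?:
			     bch2_xattr_set(&trans, inode->v.i_ino, &inode->ei_str_hash,
					    name2, val2, size2, type, flags));
}
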
diff --git a/libbcachefs/xattr.h b/libbcachefs/xattr.h
index 1365032..0689d32 100644
--- a/libbcachefs/xattr.h
+++ b/libbcachefs/xattr.h
@@ -35,15 +35,12 @@ struct xattr_handler;
struct bch_hash_info;
struct bch_inode_info;
-struct bkey_s_c bch2_xattr_get_iter(struct bch_fs *,
- struct btree_iter *,
- struct bch_inode_info *,
- const char *, int);
int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *,
const char *, void *, size_t, int);
-int bch2_xattr_set(struct bch_fs *, u64, const struct bch_hash_info *,
- const char *, const void *, size_t, int, int, u64 *);
+int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *,
+ const char *, const void *, size_t, int, int);
+
ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
extern const struct xattr_handler *bch2_xattr_handlers[];