-rw-r--r--  .bcachefs_revision                   2
-rw-r--r--  cmd_debug.c                          1
-rw-r--r--  cmd_migrate.c                       17
-rw-r--r--  libbcachefs/acl.c                   25
-rw-r--r--  libbcachefs/acl.h                   11
-rw-r--r--  libbcachefs/bcachefs.h              23
-rw-r--r--  libbcachefs/bcachefs_format.h       59
-rw-r--r--  libbcachefs/bcachefs_ioctl.h        15
-rw-r--r--  libbcachefs/bkey.h                   2
-rw-r--r--  libbcachefs/bkey_methods.c          33
-rw-r--r--  libbcachefs/btree_iter.c           180
-rw-r--r--  libbcachefs/btree_iter.h             9
-rw-r--r--  libbcachefs/btree_key_cache.c       12
-rw-r--r--  libbcachefs/btree_locking.h         17
-rw-r--r--  libbcachefs/btree_types.h           18
-rw-r--r--  libbcachefs/btree_update.h           2
-rw-r--r--  libbcachefs/btree_update_leaf.c    204
-rw-r--r--  libbcachefs/buckets.c                3
-rw-r--r--  libbcachefs/dirent.c               203
-rw-r--r--  libbcachefs/dirent.h                35
-rw-r--r--  libbcachefs/extents.c               32
-rw-r--r--  libbcachefs/extents.h                1
-rw-r--r--  libbcachefs/fs-common.c            282
-rw-r--r--  libbcachefs/fs-common.h             26
-rw-r--r--  libbcachefs/fs-io.c                181
-rw-r--r--  libbcachefs/fs-ioctl.c             176
-rw-r--r--  libbcachefs/fs.c                   161
-rw-r--r--  libbcachefs/fs.h                    17
-rw-r--r--  libbcachefs/fsck.c                1354
-rw-r--r--  libbcachefs/inode.c                128
-rw-r--r--  libbcachefs/inode.h                  7
-rw-r--r--  libbcachefs/io.c                   128
-rw-r--r--  libbcachefs/io.h                    19
-rw-r--r--  libbcachefs/io_types.h               2
-rw-r--r--  libbcachefs/migrate.c                6
-rw-r--r--  libbcachefs/move.c                  84
-rw-r--r--  libbcachefs/opts.c                   3
-rw-r--r--  libbcachefs/opts.h                  12
-rw-r--r--  libbcachefs/recovery.c             126
-rw-r--r--  libbcachefs/reflink.c               38
-rw-r--r--  libbcachefs/reflink.h                4
-rw-r--r--  libbcachefs/str_hash.h              48
-rw-r--r--  libbcachefs/subvolume.c            981
-rw-r--r--  libbcachefs/subvolume.h            115
-rw-r--r--  libbcachefs/super.c                  4
-rw-r--r--  libbcachefs/xattr.c                 23
-rw-r--r--  libbcachefs/xattr.h                  3
47 files changed, 4011 insertions, 821 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index d53addfb..76bc7256 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-bd6ed9fb42c0aa36d1f4a21eeab45fe12e1fb792
+386f00b6399a1eb38053c236aae87678f3535df7
diff --git a/cmd_debug.c b/cmd_debug.c
index b3a6ea0c..aee19fbf 100644
--- a/cmd_debug.c
+++ b/cmd_debug.c
@@ -191,6 +191,7 @@ static void list_keys(struct bch_fs *c, enum btree_id btree_id,
bch2_trans_init(&trans, c, 0, 0);
for_each_btree_key(&trans, iter, btree_id, start,
+ BTREE_ITER_ALL_SNAPSHOTS|
BTREE_ITER_PREFETCH, k, ret) {
if (bkey_cmp(k.k->p, end) > 0)
break;
diff --git a/cmd_migrate.c b/cmd_migrate.c
index 51260906..41cfe5d9 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -138,8 +138,9 @@ static void create_link(struct bch_fs *c,
struct bch_inode_unpacked inode;
int ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_link_trans(&trans, parent->bi_inum, inum,
- &parent_u, &inode, &qstr));
+ bch2_link_trans(&trans,
+ (subvol_inum) { 1, parent->bi_inum }, &parent_u,
+ (subvol_inum) { 1, inum }, &inode, &qstr));
if (ret)
die("error creating hardlink: %s", strerror(-ret));
}
@@ -155,9 +156,10 @@ static struct bch_inode_unpacked create_file(struct bch_fs *c,
int ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_create_trans(&trans,
- parent->bi_inum, parent,
+ (subvol_inum) { 1, parent->bi_inum }, parent,
&new_inode, &qstr,
- uid, gid, mode, rdev, NULL, NULL));
+ uid, gid, mode, rdev, NULL, NULL,
+ (subvol_inum) {}, 0));
if (ret)
die("error creating file: %s", strerror(-ret));
@@ -225,7 +227,9 @@ static void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst,
const struct xattr_handler *h = xattr_resolve_name(&attr);
int ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_xattr_set(&trans, dst->bi_inum, &hash_info, attr,
+ bch2_xattr_set(&trans,
+ (subvol_inum) { 1, dst->bi_inum },
+ &hash_info, attr,
val, val_size, h->flags, 0));
if (ret < 0)
die("error creating xattr: %s", strerror(-ret));
@@ -569,7 +573,8 @@ static void copy_fs(struct bch_fs *c, int src_fd, const char *src_path,
syncfs(src_fd);
struct bch_inode_unpacked root_inode;
- int ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, &root_inode);
+ int ret = bch2_inode_find_by_inum(c, (subvol_inum) { 1, BCACHEFS_ROOT_INO },
+ &root_inode);
if (ret)
die("error looking up root directory: %s", strerror(-ret));
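The bare 1 in the (subvol_inum) { 1, ... } literals above refers to the root subvolume. With the constants this patch adds in bcachefs_format.h and bcachefs.h, the root-directory lookup could equivalently be spelled with the named macros — a sketch only, assuming bch2_inode_find_by_inum keeps the signature used above:

	struct bch_inode_unpacked root_inode;
	/* BCACHEFS_ROOT_SUBVOL_INUM expands to { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO },
	 * i.e. the same { 1, BCACHEFS_ROOT_INO } pair passed explicitly above: */
	int ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_SUBVOL_INUM, &root_inode);
	if (ret)
		die("error looking up root directory: %s", strerror(-ret));
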
diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c
index 2146a63d..f92b52e4 100644
--- a/libbcachefs/acl.c
+++ b/libbcachefs/acl.c
@@ -229,7 +229,7 @@ retry:
bch2_trans_begin(&trans);
ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc,
- &hash, inode->v.i_ino,
+ &hash, inode_inum(inode),
&X_SEARCH(acl_to_xattr_type(type), "", 0),
0);
if (ret) {
@@ -259,11 +259,11 @@ out:
return acl;
}
-int bch2_set_acl_trans(struct btree_trans *trans,
+int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *inode_u,
- const struct bch_hash_info *hash_info,
struct posix_acl *acl, int type)
{
+ struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u);
int ret;
if (type == ACL_TYPE_DEFAULT &&
@@ -276,14 +276,14 @@ int bch2_set_acl_trans(struct btree_trans *trans,
if (IS_ERR(xattr))
return PTR_ERR(xattr);
- ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
- inode_u->bi_inum, &xattr->k_i, 0);
+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info,
+ inum, &xattr->k_i, 0);
} else {
struct xattr_search_key search =
X_SEARCH(acl_to_xattr_type(type), "", 0);
- ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
- inode_u->bi_inum, &search);
+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info,
+ inum, &search);
}
return ret == -ENOENT ? 0 : ret;
@@ -297,7 +297,6 @@ int bch2_set_acl(struct user_namespace *mnt_userns,
struct btree_trans trans;
struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
- struct bch_hash_info hash_info;
struct posix_acl *acl;
umode_t mode;
int ret;
@@ -308,7 +307,7 @@ retry:
bch2_trans_begin(&trans);
acl = _acl;
- ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino,
+ ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
BTREE_ITER_INTENT);
if (ret)
goto btree_err;
@@ -321,9 +320,7 @@ retry:
goto btree_err;
}
- hash_info = bch2_hash_info_init(c, &inode_u);
-
- ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info, acl, type);
+ ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type);
if (ret)
goto btree_err;
@@ -352,7 +349,7 @@ err:
return ret;
}
-int bch2_acl_chmod(struct btree_trans *trans,
+int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *inode,
umode_t mode,
struct posix_acl **new_acl)
@@ -366,7 +363,7 @@ int bch2_acl_chmod(struct btree_trans *trans,
int ret;
ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
- &hash_info, inode->bi_inum,
+ &hash_info, inum,
&X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
BTREE_ITER_INTENT);
if (ret)
diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h
index 25fc54dd..2ad214bd 100644
--- a/libbcachefs/acl.h
+++ b/libbcachefs/acl.h
@@ -28,25 +28,24 @@ typedef struct {
struct posix_acl *bch2_get_acl(struct inode *, int);
-int bch2_set_acl_trans(struct btree_trans *,
+int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
- const struct bch_hash_info *,
struct posix_acl *, int);
int bch2_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int);
-int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *,
+int bch2_acl_chmod(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *,
umode_t, struct posix_acl **);
#else
-static inline int bch2_set_acl_trans(struct btree_trans *trans,
+static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *inode_u,
- const struct bch_hash_info *hash_info,
struct posix_acl *acl, int type)
{
return 0;
}
-static inline int bch2_acl_chmod(struct btree_trans *trans,
+static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *inode,
umode_t mode,
struct posix_acl **new_acl)
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 9975fc17..0efb1aaa 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -380,6 +380,8 @@ enum gc_phase {
GC_PHASE_BTREE_alloc,
GC_PHASE_BTREE_quotas,
GC_PHASE_BTREE_reflink,
+ GC_PHASE_BTREE_subvolumes,
+ GC_PHASE_BTREE_snapshots,
GC_PHASE_PENDING_DELETE,
};
@@ -563,6 +565,21 @@ struct btree_path_buf {
#define REPLICAS_DELTA_LIST_MAX (1U << 16)
+struct snapshot_t {
+ u32 parent;
+ u32 children[2];
+ u32 subvol; /* Nonzero only if a subvolume points to this node: */
+ u32 equiv;
+};
+
+typedef struct {
+ u32 subvol;
+ u64 inum;
+} subvol_inum;
+
+#define BCACHEFS_ROOT_SUBVOL_INUM \
+ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
+
struct bch_fs {
struct closure cl;
@@ -634,6 +651,12 @@ struct bch_fs {
struct closure sb_write;
struct mutex sb_lock;
+ /* snapshot.c: */
+ GENRADIX(struct snapshot_t) snapshots;
+ struct bch_snapshot_table __rcu *snapshot_table;
+ struct mutex snapshot_table_lock;
+ struct work_struct snapshot_delete_work;
+
/* BTREE CACHE */
struct bio_set btree_bio;
struct workqueue_struct *io_complete_wq;
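A key is visible to an iterator when the key's snapshot ID is the iterator's snapshot or one of its ancestors; the parent pointers in struct snapshot_t above exist to answer exactly that question. A minimal sketch of the ancestor walk, assuming ancestor IDs are always numerically greater than their descendants' (the real bch2_snapshot_is_ancestor() and snapshot_t() helpers live in subvolume.c/subvolume.h, whose bodies are not part of this diff):

static bool snapshot_is_ancestor_sketch(struct bch_fs *c, u32 id, u32 ancestor)
{
	/* walk up the parent pointers until we reach or pass the candidate: */
	while (id && id < ancestor)
		id = snapshot_t(c, id)->parent;

	return id == ancestor;
}
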
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 98779e46..c082d5fc 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -323,7 +323,7 @@ static inline void bkey_init(struct bkey *k)
*/
#define BCH_BKEY_TYPES() \
x(deleted, 0) \
- x(discard, 1) \
+ x(whiteout, 1) \
x(error, 2) \
x(cookie, 3) \
x(hash_whiteout, 4) \
@@ -342,7 +342,9 @@ static inline void bkey_init(struct bkey *k)
x(inline_data, 17) \
x(btree_ptr_v2, 18) \
x(indirect_inline_data, 19) \
- x(alloc_v2, 20)
+ x(alloc_v2, 20) \
+ x(subvolume, 21) \
+ x(snapshot, 22)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@@ -355,7 +357,7 @@ struct bch_deleted {
struct bch_val v;
};
-struct bch_discard {
+struct bch_whiteout {
struct bch_val v;
};
@@ -686,6 +688,10 @@ struct bch_inode_generation {
__le32 pad;
} __attribute__((packed, aligned(8)));
+/*
+ * bi_subvol and bi_parent_subvol are only set for subvolume roots:
+ */
+
#define BCH_INODE_FIELDS() \
x(bi_atime, 96) \
x(bi_ctime, 96) \
@@ -709,7 +715,9 @@ struct bch_inode_generation {
x(bi_erasure_code, 16) \
x(bi_fields_set, 16) \
x(bi_dir, 64) \
- x(bi_dir_offset, 64)
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32)
/* subset of BCH_INODE_FIELDS */
#define BCH_INODE_OPTS() \
@@ -792,6 +800,9 @@ struct bch_dirent {
__u8 d_name[];
} __attribute__((packed, aligned(8)));
+#define DT_SUBVOL 16
+#define BCH_DT_MAX 17
+
#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \
sizeof(struct bkey) - \
offsetof(struct bch_dirent, d_name))
@@ -928,6 +939,42 @@ struct bch_inline_data {
u8 data[0];
};
+/* Subvolumes: */
+
+#define SUBVOL_POS_MIN POS(0, 1)
+#define SUBVOL_POS_MAX POS(0, S32_MAX)
+#define BCACHEFS_ROOT_SUBVOL 1
+
+struct bch_subvolume {
+ struct bch_val v;
+ __le32 flags;
+ __le32 snapshot;
+ __le64 inode;
+};
+
+LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
+/*
+ * We need to know whether a subvolume is a snapshot so we can know whether we
+ * can delete it (or whether it should just be rm -rf'd)
+ */
+LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
+
+/* Snapshots */
+
+struct bch_snapshot {
+ struct bch_val v;
+ __le32 flags;
+ __le32 parent;
+ __le32 children[2];
+ __le32 subvol;
+ __le32 pad;
+};
+
+LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
+
+/* True if a subvolume points to this snapshot node: */
+LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
+
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@@ -1695,7 +1742,9 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
x(alloc, 4) \
x(quotas, 5) \
x(stripes, 6) \
- x(reflink, 7)
+ x(reflink, 7) \
+ x(subvolumes, 8) \
+ x(snapshots, 9)
enum btree_id {
#define x(kwd, val) BTREE_ID_##kwd = val,
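Helpers used throughout this patch (bch2_subvolume_get_snapshot(), __bch2_dirent_read_target()) resolve a subvolume ID into the (snapshot, root inode) pair stored in the bch_subvolume value above by looking up the new subvolumes btree at POS(0, subvol). A sketch of that lookup, modelled on the dirent code later in this diff rather than on the actual helper in subvolume.c:

static int subvolume_get_snapshot_sketch(struct btree_trans *trans, u32 subvol,
					 u32 *snapshot, u64 *inum)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
			     POS(0, subvol), BTREE_ITER_CACHED);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);

	if (!ret && k.k->type != KEY_TYPE_subvolume)
		ret = -ENOENT;

	if (!ret) {
		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);

		*snapshot = le32_to_cpu(s.v->snapshot);
		*inum     = le64_to_cpu(s.v->inode);
	}

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
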
diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h
index f679fc21..930981ad 100644
--- a/libbcachefs/bcachefs_ioctl.h
+++ b/libbcachefs/bcachefs_ioctl.h
@@ -78,6 +78,9 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal)
+#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
+#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
+
/* ioctl below act on a particular file, not the filesystem as a whole: */
#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
@@ -349,4 +352,16 @@ struct bch_ioctl_disk_resize_journal {
__u64 nbuckets;
};
+struct bch_ioctl_subvolume {
+ __u32 flags;
+ __u32 dirfd;
+ __u16 mode;
+ __u16 pad[3];
+ __u64 dst_ptr;
+ __u64 src_ptr;
+};
+
+#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0)
+#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1)
+
#endif /* _BCACHEFS_IOCTL_H */
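From userspace, both new ioctls take a struct bch_ioctl_subvolume. A hypothetical invocation is sketched below; the interpretation of dst_ptr and src_ptr as pathname pointers relative to dirfd is an assumption here — the authoritative handler is in fs-ioctl.c, further down this diff:

#include <sys/ioctl.h>

/* Sketch: snapshot the subvolume at 'src' into a new subvolume at 'dst',
 * both taken relative to 'dirfd' (field semantics are assumptions): */
static int snapshot_create_sketch(int dirfd, const char *src, const char *dst)
{
	struct bch_ioctl_subvolume i = {
		.flags		= BCH_SUBVOL_SNAPSHOT_CREATE,
		.dirfd		= dirfd,
		.mode		= 0755,
		.dst_ptr	= (__u64)(unsigned long) dst,
		.src_ptr	= (__u64)(unsigned long) src,
	};

	return ioctl(dirfd, BCH_IOCTL_SUBVOLUME_CREATE, &i);
}
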
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index c4a66f28..7dee3d8e 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -55,7 +55,7 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
#define bkey_whiteout(_k) \
- ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard)
+ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
enum bkey_lr_packed {
BKEY_PACKED_BOTH,
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index a03b5514..874defd8 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -11,6 +11,7 @@
#include "inode.h"
#include "quota.h"
#include "reflink.h"
+#include "subvolume.h"
#include "xattr.h"
const char * const bch2_bkey_types[] = {
@@ -30,7 +31,7 @@ static const char *deleted_key_invalid(const struct bch_fs *c,
.key_invalid = deleted_key_invalid, \
}
-#define bch2_bkey_ops_discard (struct bkey_ops) { \
+#define bch2_bkey_ops_whiteout (struct bkey_ops) { \
.key_invalid = deleted_key_invalid, \
}
@@ -100,6 +101,8 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k)
static unsigned bch2_key_types_allowed[] = {
[BKEY_TYPE_extents] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_whiteout)|
(1U << KEY_TYPE_error)|
(1U << KEY_TYPE_cookie)|
(1U << KEY_TYPE_extent)|
@@ -107,26 +110,43 @@ static unsigned bch2_key_types_allowed[] = {
(1U << KEY_TYPE_reflink_p)|
(1U << KEY_TYPE_inline_data),
[BKEY_TYPE_inodes] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_whiteout)|
(1U << KEY_TYPE_inode)|
(1U << KEY_TYPE_inode_generation),
[BKEY_TYPE_dirents] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_whiteout)|
(1U << KEY_TYPE_hash_whiteout)|
(1U << KEY_TYPE_dirent),
[BKEY_TYPE_xattrs] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_whiteout)|
(1U << KEY_TYPE_cookie)|
(1U << KEY_TYPE_hash_whiteout)|
(1U << KEY_TYPE_xattr),
[BKEY_TYPE_alloc] =
+ (1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_alloc)|
(1U << KEY_TYPE_alloc_v2),
[BKEY_TYPE_quotas] =
+ (1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_quota),
[BKEY_TYPE_stripes] =
+ (1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_stripe),
[BKEY_TYPE_reflink] =
+ (1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_reflink_v)|
(1U << KEY_TYPE_indirect_inline_data),
+ [BKEY_TYPE_subvolumes] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_subvolume),
+ [BKEY_TYPE_snapshots] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_snapshot),
[BKEY_TYPE_btree] =
+ (1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_btree_ptr)|
(1U << KEY_TYPE_btree_ptr_v2),
};
@@ -134,21 +154,18 @@ static unsigned bch2_key_types_allowed[] = {
const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
enum btree_node_type type)
{
- unsigned key_types_allowed = (1U << KEY_TYPE_deleted)|
- bch2_key_types_allowed[type] ;
-
if (k.k->u64s < BKEY_U64s)
return "u64s too small";
- if (!(key_types_allowed & (1U << k.k->type)))
+ if (!(bch2_key_types_allowed[type] & (1U << k.k->type)))
return "invalid key type for this btree";
if (type == BKEY_TYPE_btree &&
bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
return "value too big";
- if (btree_node_type_is_extents(type)) {
- if ((k.k->size == 0) != bkey_deleted(k.k))
+ if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
+ if (k.k->size == 0)
return "bad size field";
if (k.k->size > k.k->p.offset)
@@ -165,7 +182,7 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
if (type != BKEY_TYPE_btree &&
btree_type_has_snapshots(type) &&
- k.k->p.snapshot != U32_MAX)
+ !k.k->p.snapshot)
return "invalid snapshot field";
if (type != BKEY_TYPE_btree &&
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index ce4d7c7e..b5484d77 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -13,6 +13,7 @@
#include "extents.h"
#include "journal.h"
#include "replicas.h"
+#include "subvolume.h"
#include <linux/prefetch.h>
#include <trace/events/bcachefs.h>
@@ -152,7 +153,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans,
if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
(btree_node_lock_seq_matches(path, b, level) &&
btree_node_lock_increment(trans, b, level, want))) {
- mark_btree_node_locked(trans, path, level, want);
+ mark_btree_node_locked(path, level, want);
return true;
} else {
return false;
@@ -188,7 +189,7 @@ static bool bch2_btree_node_upgrade(struct btree_trans *trans,
return false;
success:
- mark_btree_node_intent_locked(trans, path, level);
+ mark_btree_node_intent_locked(path, level);
return true;
}
@@ -674,6 +675,9 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
+ BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ !iter->pos.snapshot);
+
BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
iter->pos.snapshot != iter->snapshot);
@@ -681,6 +685,55 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
bkey_cmp(iter->pos, iter->k.p) > 0);
}
+static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
+{
+ struct btree_trans *trans = iter->trans;
+ struct btree_iter copy;
+ struct bkey_s_c prev;
+ int ret = 0;
+
+ if (!bch2_debug_check_iterators)
+ return 0;
+
+ if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
+ return 0;
+
+ if (bkey_err(k) || !k.k)
+ return 0;
+
+ BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
+ iter->snapshot,
+ k.k->p.snapshot));
+
+ bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
+ BTREE_ITER_ALL_SNAPSHOTS);
+ prev = bch2_btree_iter_prev(&copy);
+ if (!prev.k)
+ goto out;
+
+ ret = bkey_err(prev);
+ if (ret)
+ goto out;
+
+ if (!bkey_cmp(prev.k->p, k.k->p) &&
+ bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
+ prev.k->p.snapshot) > 0) {
+ char buf1[100], buf2[200];
+
+ bch2_bkey_to_text(&PBUF(buf1), k.k);
+ bch2_bkey_to_text(&PBUF(buf2), prev.k);
+
+ panic("iter snap %u\n"
+ "k %s\n"
+ "prev %s\n",
+ iter->snapshot,
+ buf1, buf2);
+ }
+out:
+ bch2_trans_iter_exit(trans, &copy);
+ return ret;
+}
+
#else
static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
@@ -689,6 +742,7 @@ static inline void bch2_btree_path_verify(struct btree_trans *trans,
struct btree_path *path) {}
static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
+static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
#endif
@@ -896,12 +950,12 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
bch2_btree_node_iter_peek_all(&l->iter, l->b));
}
-static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
+static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c,
struct btree_path *path,
struct btree_path_level *l,
struct bkey *u)
{
- struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
+ struct bkey_s_c k = __btree_iter_unpack(c, l, u,
bch2_btree_node_iter_peek(&l->iter, l->b));
path->pos = k.k ? k.k->p : l->b->key.k.p;
@@ -1041,7 +1095,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
t != BTREE_NODE_UNLOCKED) {
btree_node_unlock(path, b->c.level);
six_lock_increment(&b->c.lock, t);
- mark_btree_node_locked(trans, path, b->c.level, t);
+ mark_btree_node_locked(path, b->c.level, t);
}
btree_path_level_init(trans, path, b);
@@ -1118,7 +1172,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
path->l[i].b = NULL;
- mark_btree_node_locked(trans, path, path->level, lock_type);
+ mark_btree_node_locked(path, path->level, lock_type);
btree_path_level_init(trans, path, b);
return 0;
}
@@ -1210,7 +1264,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
if (unlikely(ret))
goto err;
- mark_btree_node_locked(trans, path, level, lock_type);
+ mark_btree_node_locked(path, level, lock_type);
btree_path_level_init(trans, path, b);
if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
@@ -1252,10 +1306,6 @@ retry_all:
btree_trans_verify_sorted(trans);
-#ifdef CONFIG_BCACHEFS_DEBUG
- trans->traverse_all_idx = U8_MAX;
-#endif
-
for (i = trans->nr_sorted - 2; i >= 0; --i) {
struct btree_path *path1 = trans->paths + trans->sorted[i];
struct btree_path *path2 = trans->paths + trans->sorted[i + 1];
@@ -1294,9 +1344,6 @@ retry_all:
path = trans->paths + trans->sorted[i];
EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
-#ifdef CONFIG_BCACHEFS_DEBUG
- trans->traverse_all_idx = path->idx;
-#endif
ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
if (ret)
@@ -1985,11 +2032,25 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
}
if (likely(k.k)) {
- if (likely(!bkey_deleted(k.k)))
- break;
+ /*
+ * We can never have a key in a leaf node at POS_MAX, so
+ * we don't have to check these successor() calls:
+ */
+ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ !bch2_snapshot_is_ancestor(trans->c,
+ iter->snapshot,
+ k.k->p.snapshot)) {
+ search_key = bpos_successor(k.k->p);
+ continue;
+ }
+
+ if (bkey_whiteout(k.k) &&
+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+ search_key = bkey_successor(iter, k.k->p);
+ continue;
+ }
- /* Advance to next key: */
- search_key = bkey_successor(iter, k.k->p);
+ break;
} else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) {
/* Advance to next leaf node: */
search_key = bpos_successor(iter->path->l[0].b->key.k.p);
@@ -2010,6 +2071,9 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
iter->pos = bkey_start_pos(k.k);
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ iter->pos.snapshot = iter->snapshot;
+
cmp = bpos_cmp(k.k->p, iter->path->pos);
if (cmp) {
iter->path = bch2_btree_path_make_mut(trans, iter->path,
@@ -2022,6 +2086,10 @@ out:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
+ ret = bch2_btree_iter_verify_ret(iter, k);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
+
return k;
}
@@ -2045,7 +2113,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
struct bpos search_key = iter->pos;
+ struct btree_path *saved_path = NULL;
struct bkey_s_c k;
+ struct bkey saved_k;
+ const struct bch_val *saved_v;
int ret;
EBUG_ON(iter->path->cached || iter->path->level);
@@ -2053,6 +2124,9 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ search_key.snapshot = U32_MAX;
+
while (1) {
iter->path = btree_path_set_pos(trans, iter->path, search_key,
iter->flags & BTREE_ITER_INTENT);
@@ -2065,18 +2139,61 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
goto out;
}
- k = btree_path_level_peek(trans, iter->path,
+ k = btree_path_level_peek(trans->c, iter->path,
&iter->path->l[0], &iter->k);
if (!k.k ||
((iter->flags & BTREE_ITER_IS_EXTENTS)
- ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0
- : bkey_cmp(k.k->p, iter->pos) > 0))
+ ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0
+ : bpos_cmp(k.k->p, search_key) > 0))
k = btree_path_level_prev(trans->c, iter->path,
&iter->path->l[0], &iter->k);
btree_path_check_sort(trans, iter->path, 0);
if (likely(k.k)) {
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
+ if (k.k->p.snapshot == iter->snapshot)
+ goto got_key;
+
+ /*
+ * If we have a saved candidate, and we're no
+ * longer at the same _key_ (not pos), return
+ * that candidate
+ */
+ if (saved_path && bkey_cmp(k.k->p, saved_k.p)) {
+ bch2_path_put(trans, iter->path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->path = saved_path;
+ saved_path = NULL;
+ iter->k = saved_k;
+ k.v = saved_v;
+ goto got_key;
+ }
+
+ if (bch2_snapshot_is_ancestor(iter->trans->c,
+ iter->snapshot,
+ k.k->p.snapshot)) {
+ if (saved_path)
+ bch2_path_put(trans, saved_path,
+ iter->flags & BTREE_ITER_INTENT);
+ saved_path = btree_path_clone(trans, iter->path,
+ iter->flags & BTREE_ITER_INTENT);
+ saved_k = *k.k;
+ saved_v = k.v;
+ }
+
+ search_key = bpos_predecessor(k.k->p);
+ continue;
+ }
+got_key:
+ if (bkey_whiteout(k.k) &&
+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+ search_key = bkey_predecessor(iter, k.k->p);
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ search_key.snapshot = U32_MAX;
+ continue;
+ }
+
break;
} else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) {
/* Advance to previous leaf node: */
@@ -2094,7 +2211,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
/* Extents can straddle iter->pos: */
if (bkey_cmp(k.k->p, iter->pos) < 0)
iter->pos = k.k->p;
+
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ iter->pos.snapshot = iter->snapshot;
out:
+ if (saved_path)
+ bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
iter->path->should_be_locked = true;
bch2_btree_iter_verify_entry_exit(iter);
@@ -2143,7 +2265,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if (unlikely(ret))
return bkey_s_c_err(ret);
- if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) {
+ if ((iter->flags & BTREE_ITER_CACHED) ||
+ !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
struct bkey_i *next_update;
next_update = iter->flags & BTREE_ITER_WITH_UPDATES
@@ -2202,6 +2325,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
+ ret = bch2_btree_iter_verify_ret(iter, k);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
return k;
}
@@ -2352,13 +2478,13 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
btree_node_type_is_extents(btree_id))
flags |= BTREE_ITER_IS_EXTENTS;
- if (!btree_type_has_snapshots(btree_id) &&
- !(flags & __BTREE_ITER_ALL_SNAPSHOTS))
+ if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
+ !btree_type_has_snapshots(btree_id))
flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
- if (!(flags & BTREE_ITER_ALL_SNAPSHOTS))
- pos.snapshot = btree_type_has_snapshots(btree_id)
- ? U32_MAX : 0;
+ if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ btree_type_has_snapshots(btree_id))
+ flags |= BTREE_ITER_FILTER_SNAPSHOTS;
iter->trans = trans;
iter->path = NULL;
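With BTREE_ITER_FILTER_SNAPSHOTS now set automatically for snapshotted btrees (unless BTREE_ITER_ALL_SNAPSHOTS was requested), the common caller pattern is: resolve the subvolume to its snapshot ID, then iterate at SPOS(inum, offset, snapshot) and let peek skip keys belonging to unrelated snapshots. A sketch of that pattern, mirroring bch2_empty_dir_trans()/bch2_readdir() later in this diff:

static int count_dirents_sketch(struct btree_trans *trans, subvol_inum dir, u64 *nr)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u32 snapshot;
	int ret;

	ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
	if (ret)
		return ret;

	*nr = 0;
	for_each_btree_key(trans, iter, BTREE_ID_dirents,
			   SPOS(dir.inum, 0, snapshot), 0, k, ret) {
		if (k.k->p.inode > dir.inum)
			break;

		/* only keys visible in this snapshot are returned here: */
		if (k.k->type == KEY_TYPE_dirent)
			(*nr)++;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
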
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index be1bb489..19ca73f5 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -234,6 +234,15 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it
iter->pos = bkey_start_pos(&iter->k);
}
+static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot)
+{
+ struct bpos pos = iter->pos;
+
+ iter->snapshot = snapshot;
+ pos.snapshot = snapshot;
+ bch2_btree_iter_set_pos(iter, pos);
+}
+
/*
* Unlocks before scheduling
* Note: does not revalidate iterator
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 938ced36..4f1bc1d1 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -163,6 +163,11 @@ btree_key_cache_create(struct btree_key_cache *c,
was_new = false;
}
+ if (btree_id == BTREE_ID_subvolumes)
+ six_lock_pcpu_alloc(&ck->c.lock);
+ else
+ six_lock_pcpu_free(&ck->c.lock);
+
ck->c.level = 0;
ck->c.btree_id = btree_id;
ck->key.btree_id = btree_id;
@@ -296,7 +301,7 @@ retry:
if (!ck)
goto retry;
- mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+ mark_btree_node_locked(path, 0, SIX_LOCK_intent);
path->locks_want = 1;
} else {
enum six_lock_type lock_want = __btree_lock_want(path, 0);
@@ -318,7 +323,7 @@ retry:
goto retry;
}
- mark_btree_node_locked(trans, path, 0, lock_want);
+ mark_btree_node_locked(path, 0, lock_want);
}
path->l[0].lock_seq = ck->c.lock.state.seq;
@@ -366,7 +371,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
BTREE_ITER_SLOTS|
- BTREE_ITER_INTENT);
+ BTREE_ITER_INTENT|
+ BTREE_ITER_ALL_SNAPSHOTS);
bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index 5c6b7580..d599008c 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -58,8 +58,7 @@ static inline void mark_btree_node_unlocked(struct btree_path *path,
path->nodes_intent_locked &= ~(1 << level);
}
-static inline void mark_btree_node_locked(struct btree_trans *trans,
- struct btree_path *path,
+static inline void mark_btree_node_locked(struct btree_path *path,
unsigned level,
enum six_lock_type type)
{
@@ -69,19 +68,12 @@ static inline void mark_btree_node_locked(struct btree_trans *trans,
path->nodes_locked |= 1 << level;
path->nodes_intent_locked |= type << level;
-#ifdef CONFIG_BCACHEFS_DEBUG
- path->ip_locked = _RET_IP_;
- BUG_ON(trans->in_traverse_all &&
- trans->traverse_all_idx != U8_MAX &&
- path->sorted_idx > trans->paths[trans->traverse_all_idx].sorted_idx);
-#endif
}
-static inline void mark_btree_node_intent_locked(struct btree_trans *trans,
- struct btree_path *path,
+static inline void mark_btree_node_intent_locked(struct btree_path *path,
unsigned level)
{
- mark_btree_node_locked(trans, path, level, SIX_LOCK_intent);
+ mark_btree_node_locked(path, level, SIX_LOCK_intent);
}
static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
@@ -120,9 +112,6 @@ static inline void __bch2_btree_path_unlock(struct btree_path *path)
while (path->nodes_locked)
btree_node_unlock(path, __ffs(path->nodes_locked));
-#ifdef CONFIG_BCACHEFS_DEBUG
- path->ip_locked = 0;
-#endif
}
static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index ccf91ebd..7fcd2ceb 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -209,6 +209,7 @@ struct btree_node_iter {
#define BTREE_ITER_WITH_UPDATES (1 << 10)
#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11)
#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12)
+#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13)
enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0,
@@ -255,7 +256,6 @@ struct btree_path {
} l[BTREE_MAX_DEPTH];
#ifdef CONFIG_BCACHEFS_DEBUG
unsigned long ip_allocated;
- unsigned long ip_locked;
#endif
};
@@ -369,7 +369,6 @@ struct btree_trans {
struct bpos locking_pos;
u8 locking_btree_id;
u8 locking_level;
- u8 traverse_all_idx;
pid_t pid;
#endif
unsigned long ip;
@@ -607,7 +606,8 @@ static inline bool btree_node_is_extents(struct btree *b)
#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \
((1U << BKEY_TYPE_alloc)| \
- (1U << BKEY_TYPE_stripes))
+ (1U << BKEY_TYPE_stripes)| \
+ (1U << BKEY_TYPE_snapshots))
#define BTREE_NODE_TYPE_HAS_TRIGGERS \
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
@@ -654,7 +654,8 @@ enum btree_update_flags {
#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \
((1U << KEY_TYPE_stripe)| \
- (1U << KEY_TYPE_inode))
+ (1U << KEY_TYPE_inode)| \
+ (1U << KEY_TYPE_snapshot))
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
@@ -671,11 +672,6 @@ struct btree_root {
s8 error;
};
-/*
- * Optional hook that will be called just prior to a btree node update, when
- * we're holding the write lock and we know what key is about to be overwritten:
- */
-
enum btree_insert_ret {
BTREE_INSERT_OK,
/* leaf node needs to be split */
@@ -696,8 +692,4 @@ enum btree_node_sibling {
btree_next_sib,
};
-typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
- struct btree *,
- struct btree_node_iter *);
-
#endif /* _BCACHEFS_BTREE_TYPES_H */
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 23b73d3a..4d0ece34 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -61,7 +61,7 @@ int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
struct disk_reservation *, u64 *, int flags);
int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
- struct bpos, struct bpos, u64 *);
+ struct bpos, struct bpos, unsigned, u64 *);
int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
struct bpos, struct bpos, u64 *);
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index a0da9673..f69f919d 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -15,6 +15,7 @@
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
+#include "subvolume.h"
#include "replicas.h"
#include <linux/prefetch.h>
@@ -245,6 +246,11 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
BUG_ON(i->cached != i->path->cached);
BUG_ON(i->level != i->path->level);
BUG_ON(i->btree_id != i->path->btree_id);
+ EBUG_ON(!i->level &&
+ !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
+ test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
+ i->k->k.p.snapshot &&
+ bch2_snapshot_internal_node(trans->c, i->k->k.p.snapshot));
}
static noinline int
@@ -934,6 +940,43 @@ err:
goto retry;
}
+static int check_pos_snapshot_overwritten(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ if (!snapshot_t(c, pos.snapshot)->children[0])
+ return 0;
+
+ bch2_trans_iter_init(trans, &iter, id, pos,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ while (1) {
+ k = bch2_btree_iter_prev(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ break;
+
+ if (!k.k)
+ break;
+
+ if (bkey_cmp(pos, k.k->p))
+ break;
+
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
+ ret = 1;
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
static int bch2_trans_update_extent(struct btree_trans *trans,
struct btree_iter *orig_iter,
struct bkey_i *insert,
@@ -958,6 +1001,28 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
goto out;
if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
+ /*
+ * We can't merge extents if they belong to interior snapshot
+ * tree nodes, and there's a snapshot in which one extent is
+ * visible and the other is not - i.e. if visibility is
+ * different.
+ *
+ * Instead of checking if visibility of the two extents is

+ * different, for now we just check if either has been
+ * overwritten:
+ */
+ ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p);
+ if (ret < 0)
+ goto err;
+ if (ret)
+ goto nomerge1;
+
+ ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p);
+ if (ret < 0)
+ goto err;
+ if (ret)
+ goto nomerge1;
+
update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
if ((ret = PTR_ERR_OR_ZERO(update)))
goto err;
@@ -973,22 +1038,26 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
goto next;
}
}
-
- if (!bkey_cmp(k.k->p, bkey_start_pos(&insert->k)))
+nomerge1:
+ ret = 0;
+ if (!bkey_cmp(k.k->p, start))
goto next;
while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) {
+ bool front_split = bkey_cmp(bkey_start_pos(k.k), start) < 0;
+ bool back_split = bkey_cmp(k.k->p, insert->k.p) > 0;
+
/*
* If we're going to be splitting a compressed extent, note it
* so that __bch2_trans_commit() can increase our disk
* reservation:
*/
- if (bkey_cmp(bkey_start_pos(k.k), start) < 0 &&
- bkey_cmp(k.k->p, insert->k.p) > 0 &&
+ if (((front_split && back_split) ||
+ ((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) &&
(compressed_sectors = bch2_bkey_sectors_compressed(k)))
trans->extra_journal_res += compressed_sectors;
- if (bkey_cmp(bkey_start_pos(k.k), start) < 0) {
+ if (front_split) {
update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
if ((ret = PTR_ERR_OR_ZERO(update)))
goto err;
@@ -999,6 +1068,32 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&update_iter) ?:
+ bch2_trans_update(trans, &update_iter, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ flags);
+ bch2_trans_iter_exit(trans, &update_iter);
+
+ if (ret)
+ goto err;
+ }
+
+ if (k.k->p.snapshot != insert->k.p.snapshot &&
+ (front_split || back_split)) {
+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+
+ bkey_reassemble(update, k);
+
+ bch2_cut_front(start, update);
+ bch2_cut_back(insert->k.p, update);
+
+ bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS|
BTREE_ITER_INTENT);
ret = bch2_btree_iter_traverse(&update_iter) ?:
bch2_trans_update(trans, &update_iter, update,
@@ -1010,12 +1105,32 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
}
if (bkey_cmp(k.k->p, insert->k.p) <= 0) {
- ret = bch2_btree_delete_at(trans, &iter, flags);
+ update = bch2_trans_kmalloc(trans, sizeof(*update));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+
+ bkey_init(&update->k);
+ update->k.p = k.k->p;
+
+ if (insert->k.p.snapshot != k.k->p.snapshot) {
+ update->k.p.snapshot = insert->k.p.snapshot;
+ update->k.type = KEY_TYPE_whiteout;
+ }
+
+ bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&update_iter) ?:
+ bch2_trans_update(trans, &update_iter, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ flags);
+ bch2_trans_iter_exit(trans, &update_iter);
+
if (ret)
goto err;
}
- if (bkey_cmp(k.k->p, insert->k.p) > 0) {
+ if (back_split) {
update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
if ((ret = PTR_ERR_OR_ZERO(update)))
goto err;
@@ -1023,10 +1138,15 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
bkey_reassemble(update, k);
bch2_cut_front(insert->k.p, update);
- ret = bch2_trans_update(trans, &iter, update, flags);
+ bch2_trans_copy_iter(&update_iter, &iter);
+ update_iter.pos = update->k.p;
+ ret = bch2_trans_update(trans, &update_iter, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ flags);
+ bch2_trans_iter_exit(trans, &update_iter);
+
if (ret)
goto err;
-
goto out;
}
next:
@@ -1037,7 +1157,23 @@ next:
goto out;
}
- bch2_bkey_merge(c, bkey_i_to_s(insert), k);
+ if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
+ ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ goto nomerge2;
+
+ ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ goto nomerge2;
+
+ bch2_bkey_merge(c, bkey_i_to_s(insert), k);
+ }
+nomerge2:
+ ret = 0;
out:
if (!bkey_deleted(&insert->k)) {
/*
@@ -1057,6 +1193,39 @@ err:
return ret;
}
+/*
+ * When deleting, check if we need to emit a whiteout (because we're overwriting
+ * something in an ancestor snapshot)
+ */
+static int need_whiteout_for_snapshot(struct btree_trans *trans,
+ enum btree_id btree_id, struct bpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u32 snapshot = pos.snapshot;
+ int ret;
+
+ if (!bch2_snapshot_parent(trans->c, pos.snapshot))
+ return 0;
+
+ pos.snapshot++;
+
+ for_each_btree_key(trans, iter, btree_id, pos,
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (bkey_cmp(k.k->p, pos))
+ break;
+
+ if (bch2_snapshot_is_ancestor(trans->c, snapshot,
+ k.k->p.snapshot)) {
+ ret = !bkey_whiteout(k.k);
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_i *k, enum btree_update_flags flags)
{
@@ -1089,6 +1258,16 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
btree_insert_entry_cmp(i - 1, i) >= 0);
#endif
+ if (bkey_deleted(&n.k->k) &&
+ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
+ int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (ret)
+ n.k->k.type = KEY_TYPE_whiteout;
+ }
+
/*
* Pending updates are kept sorted: first, find position of new update,
* then delete/trim any updates the new update overwrites:
@@ -1175,13 +1354,14 @@ int bch2_btree_delete_at(struct btree_trans *trans,
int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
struct bpos start, struct bpos end,
+ unsigned iter_flags,
u64 *journal_seq)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
- bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags);
retry:
while ((bch2_trans_begin(trans),
(k = bch2_btree_iter_peek(&iter)).k) &&
@@ -1248,5 +1428,5 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
u64 *journal_seq)
{
return bch2_trans_do(c, NULL, journal_seq, 0,
- bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq));
+ bch2_btree_delete_range_trans(&trans, id, start, end, 0, journal_seq));
}
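bch2_btree_delete_range_trans() now takes an extra iter_flags argument: with the default flags the iterator filters by snapshot and bch2_trans_update() converts deletions into KEY_TYPE_whiteout keys (via need_whiteout_for_snapshot()) wherever an ancestor snapshot still holds the key, while passing BTREE_ITER_ALL_SNAPSHOTS instead visits every snapshot's keys in the range. A usage sketch — which form any particular caller wants is an assumption here:

static int delete_inode_xattrs_sketch(struct btree_trans *trans,
				      u64 inum, u32 snapshot, u64 *journal_seq)
{
	/* single-snapshot delete; whiteouts are emitted where needed so the
	 * keys remain visible in ancestor snapshots: */
	return bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
					     SPOS(inum, 0, snapshot),
					     SPOS(inum, U64_MAX, snapshot),
					     0, journal_seq);
}
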
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index df12416e..5fd3aabb 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -16,6 +16,7 @@
#include "movinggc.h"
#include "reflink.h"
#include "replicas.h"
+#include "subvolume.h"
#include <linux/preempt.h>
#include <trace/events/bcachefs.h>
@@ -1200,6 +1201,8 @@ static int bch2_mark_key_locked(struct bch_fs *c,
return bch2_mark_reservation(c, old, new, journal_seq, flags);
case KEY_TYPE_reflink_p:
return bch2_mark_reflink_p(c, old, new, journal_seq, flags);
+ case KEY_TYPE_snapshot:
+ return bch2_mark_snapshot(c, old, new, journal_seq, flags);
default:
return 0;
}
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index 1d510f77..8653a106 100644
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -8,6 +8,7 @@
#include "fs.h"
#include "keylist.h"
#include "str_hash.h"
+#include "subvolume.h"
#include <linux/dcache.h>
@@ -99,7 +100,8 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
if (memchr(d.v->d_name, '/', len))
return "invalid name";
- if (le64_to_cpu(d.v->d_inum) == d.k->p.inode)
+ if (d.v->d_type != DT_SUBVOL &&
+ le64_to_cpu(d.v->d_inum) == d.k->p.inode)
return "dirent points to own directory";
return NULL;
@@ -113,7 +115,7 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
bch_scnmemcpy(out, d.v->d_name,
bch2_dirent_name_bytes(d));
pr_buf(out, " -> %llu type %s", d.v->d_inum,
- d.v->d_type < DT_MAX
+ d.v->d_type < BCH_DT_MAX
? bch2_d_types[d.v->d_type]
: "(bad d_type)");
}
@@ -149,8 +151,8 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
return dirent;
}
-int bch2_dirent_create(struct btree_trans *trans,
- u64 dir_inum, const struct bch_hash_info *hash_info,
+int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
+ const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset, int flags)
{
@@ -163,7 +165,7 @@ int bch2_dirent_create(struct btree_trans *trans,
return ret;
ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
- dir_inum, &dirent->k_i, flags);
+ dir, &dirent->k_i, flags);
*dir_offset = dirent->k.p.offset;
return ret;
@@ -176,22 +178,86 @@ static void dirent_copy_target(struct bkey_i_dirent *dst,
dst->v.d_type = src.v->d_type;
}
+int __bch2_dirent_read_target(struct btree_trans *trans,
+ struct bkey_s_c_dirent d,
+ u32 *subvol, u32 *snapshot, u64 *inum,
+ bool is_fsck)
+{
+ int ret = 0;
+
+ *subvol = 0;
+ *snapshot = d.k->p.snapshot;
+
+ if (likely(d.v->d_type != DT_SUBVOL)) {
+ *inum = le64_to_cpu(d.v->d_inum);
+ } else {
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_subvolume s;
+ int ret;
+
+ *subvol = le64_to_cpu(d.v->d_inum);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
+ POS(0, *subvol),
+ BTREE_ITER_CACHED);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_subvolume) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ s = bkey_s_c_to_subvolume(k);
+ *snapshot = le32_to_cpu(s.v->snapshot);
+ *inum = le64_to_cpu(s.v->inode);
+err:
+ if (ret == -ENOENT && !is_fsck)
+ bch2_fs_inconsistent(trans->c, "pointer to missing subvolume %u",
+ *subvol);
+
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ return ret;
+}
+
+static int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
+ struct bkey_s_c_dirent d, subvol_inum *target)
+{
+ u32 snapshot;
+ int ret = 0;
+
+ ret = __bch2_dirent_read_target(trans, d, &target->subvol, &snapshot,
+ &target->inum, false);
+ if (!target->subvol)
+ target->subvol = dir.subvol;
+
+ return ret;
+}
+
int bch2_dirent_rename(struct btree_trans *trans,
- u64 src_dir, struct bch_hash_info *src_hash,
- u64 dst_dir, struct bch_hash_info *dst_hash,
- const struct qstr *src_name, u64 *src_inum, u64 *src_offset,
- const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset,
- enum bch_rename_mode mode)
+ subvol_inum src_dir, struct bch_hash_info *src_hash,
+ subvol_inum dst_dir, struct bch_hash_info *dst_hash,
+ const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
+ const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
+ enum bch_rename_mode mode)
{
struct btree_iter src_iter = { NULL };
struct btree_iter dst_iter = { NULL };
struct bkey_s_c old_src, old_dst;
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
struct bpos dst_pos =
- POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name));
+ POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
int ret = 0;
- *src_inum = *dst_inum = 0;
+ if (src_dir.subvol != dst_dir.subvol)
+ return -EXDEV;
+
+ memset(src_inum, 0, sizeof(*src_inum));
+ memset(dst_inum, 0, sizeof(*dst_inum));
/*
* Lookup dst:
@@ -214,8 +280,12 @@ int bch2_dirent_rename(struct btree_trans *trans,
if (ret)
goto out;
- if (mode != BCH_RENAME)
- *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum);
+ if (mode != BCH_RENAME) {
+ ret = bch2_dirent_read_target(trans, dst_dir,
+ bkey_s_c_to_dirent(old_dst), dst_inum);
+ if (ret)
+ goto out;
+ }
if (mode != BCH_RENAME_EXCHANGE)
*src_offset = dst_iter.pos.offset;
@@ -231,7 +301,10 @@ int bch2_dirent_rename(struct btree_trans *trans,
if (ret)
goto out;
- *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum);
+ ret = bch2_dirent_read_target(trans, src_dir,
+ bkey_s_c_to_dirent(old_src), src_inum);
+ if (ret)
+ goto out;
/* Create new dst key: */
new_dst = dirent_create_key(trans, 0, dst_name, 0);
@@ -310,63 +383,79 @@ out:
return ret;
}
-int bch2_dirent_delete_at(struct btree_trans *trans,
- const struct bch_hash_info *hash_info,
- struct btree_iter *iter)
-{
- return bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- hash_info, iter);
-}
-
int __bch2_dirent_lookup_trans(struct btree_trans *trans,
struct btree_iter *iter,
- u64 dir_inum,
+ subvol_inum dir,
const struct bch_hash_info *hash_info,
- const struct qstr *name, unsigned flags)
+ const struct qstr *name, subvol_inum *inum,
+ unsigned flags)
{
- return bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
- hash_info, dir_inum, name, flags);
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ u32 snapshot;
+ int ret;
+
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
+ hash_info, dir, name, flags);
+ if (ret)
+ return ret;
+
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
+ }
+
+ d = bkey_s_c_to_dirent(k);
+
+ ret = bch2_dirent_read_target(trans, dir, d, inum);
+ if (ret)
+ bch2_trans_iter_exit(trans, iter);
+
+ return ret;
}
-u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
+u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
const struct bch_hash_info *hash_info,
- const struct qstr *name)
+ const struct qstr *name, subvol_inum *inum)
{
struct btree_trans trans;
struct btree_iter iter;
- struct bkey_s_c k;
- u64 inum = 0;
- int ret = 0;
+ int ret;
bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
- ret = __bch2_dirent_lookup_trans(&trans, &iter, dir_inum,
- hash_info, name, 0);
- if (ret)
- goto out;
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto out;
+ ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info,
+ name, inum, 0);
- inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
bch2_trans_iter_exit(&trans, &iter);
-out:
- BUG_ON(ret == -EINTR);
+ if (ret == -EINTR)
+ goto retry;
bch2_trans_exit(&trans);
- return inum;
+ return ret;
}
-int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
+int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
{
struct btree_iter iter;
struct bkey_s_c k;
+ u32 snapshot;
int ret;
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+ if (ret)
+ return ret;
+
for_each_btree_key(trans, iter, BTREE_ID_dirents,
- POS(dir_inum, 0), 0, k, ret) {
- if (k.k->p.inode > dir_inum)
+ SPOS(dir.inum, 0, snapshot), 0, k, ret) {
+ if (k.k->p.inode > dir.inum)
break;
if (k.k->type == KEY_TYPE_dirent) {
@@ -379,19 +468,26 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
return ret;
}
-int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
+int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
{
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_c_dirent dirent;
+ u32 snapshot;
int ret;
bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
for_each_btree_key(&trans, iter, BTREE_ID_dirents,
- POS(inum, ctx->pos), 0, k, ret) {
- if (k.k->p.inode > inum)
+ SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) {
+ if (k.k->p.inode > inum.inum)
break;
if (k.k->type != KEY_TYPE_dirent)
@@ -407,11 +503,14 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
if (!dir_emit(ctx, dirent.v->d_name,
bch2_dirent_name_bytes(dirent),
le64_to_cpu(dirent.v->d_inum),
- dirent.v->d_type))
+ vfs_d_type(dirent.v->d_type)))
break;
ctx->pos = dirent.k->p.offset + 1;
}
bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (ret == -EINTR)
+ goto retry;
ret = bch2_trans_exit(&trans) ?: ret;
diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h
index c14f6029..e7f65fbd 100644
--- a/libbcachefs/dirent.h
+++ b/libbcachefs/dirent.h
@@ -29,13 +29,17 @@ static inline unsigned dirent_val_u64s(unsigned len)
sizeof(u64));
}
-int bch2_dirent_create(struct btree_trans *, u64,
+int bch2_dirent_create(struct btree_trans *, subvol_inum,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *, int);
-int bch2_dirent_delete_at(struct btree_trans *,
- const struct bch_hash_info *,
- struct btree_iter *);
+int __bch2_dirent_read_target(struct btree_trans *, struct bkey_s_c_dirent,
+ u32 *, u32 *, u64 *, bool);
+
+static inline unsigned vfs_d_type(unsigned type)
+{
+ return type == DT_SUBVOL ? DT_DIR : type;
+}
enum bch_rename_mode {
BCH_RENAME,
@@ -44,19 +48,20 @@ enum bch_rename_mode {
};
int bch2_dirent_rename(struct btree_trans *,
- u64, struct bch_hash_info *,
- u64, struct bch_hash_info *,
- const struct qstr *, u64 *, u64 *,
- const struct qstr *, u64 *, u64 *,
+ subvol_inum, struct bch_hash_info *,
+ subvol_inum, struct bch_hash_info *,
+ const struct qstr *, subvol_inum *, u64 *,
+ const struct qstr *, subvol_inum *, u64 *,
enum bch_rename_mode);
-int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, u64,
- const struct bch_hash_info *,
- const struct qstr *, unsigned);
-u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
- const struct qstr *);
+int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
+ subvol_inum, const struct bch_hash_info *,
+ const struct qstr *, subvol_inum *, unsigned);
+u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
+ const struct bch_hash_info *,
+ const struct qstr *, subvol_inum *);
-int bch2_empty_dir_trans(struct btree_trans *, u64);
-int bch2_readdir(struct bch_fs *, u64, struct dir_context *);
+int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
+int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
#endif /* _BCACHEFS_DIRENT_H */
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index f66640c2..6c2eed77 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -612,38 +612,6 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k)
return false;
}
-bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
- unsigned nr_replicas, bool compressed)
-{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bpos end = pos;
- struct bkey_s_c k;
- bool ret = true;
- int err;
-
- end.offset += size;
-
- bch2_trans_init(&trans, c, 0, 0);
-
- for_each_btree_key(&trans, iter, BTREE_ID_extents, pos,
- BTREE_ITER_SLOTS, k, err) {
- if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
- break;
-
- if (nr_replicas > bch2_bkey_replicas(c, k) ||
- (!compressed && bch2_bkey_sectors_compressed(k))) {
- ret = false;
- break;
- }
- }
- bch2_trans_iter_exit(&trans, &iter);
-
- bch2_trans_exit(&trans);
-
- return ret;
-}
-
unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 43cef0a3..afd3067b 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -567,7 +567,6 @@ unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
bool bch2_bkey_is_incompressible(struct bkey_s_c);
unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
-bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool);
unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c
index 6bc82559..3e8e3c5b 100644
--- a/libbcachefs/fs-common.c
+++ b/libbcachefs/fs-common.c
@@ -6,82 +6,186 @@
#include "dirent.h"
#include "fs-common.h"
#include "inode.h"
+#include "subvolume.h"
#include "xattr.h"
#include <linux/posix_acl.h>
-int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
+static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
+{
+ return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
+}
+
+int bch2_create_trans(struct btree_trans *trans,
+ subvol_inum dir,
struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *new_inode,
const struct qstr *name,
uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
struct posix_acl *default_acl,
- struct posix_acl *acl)
+ struct posix_acl *acl,
+ subvol_inum snapshot_src,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_iter dir_iter = { NULL };
struct btree_iter inode_iter = { NULL };
- struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
+ subvol_inum new_inum = dir;
u64 now = bch2_current_time(c);
u64 cpu = raw_smp_processor_id();
- u64 dir_offset = 0;
+ u64 dir_target;
+ u32 snapshot;
+ unsigned dir_type = mode_to_type(mode);
int ret;
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT);
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
if (ret)
goto err;
- bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
-
- if (!name)
- new_inode->bi_flags |= BCH_INODE_UNLINKED;
-
- ret = bch2_inode_create(trans, &inode_iter, new_inode, U32_MAX, cpu);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
if (ret)
goto err;
- if (default_acl) {
- ret = bch2_set_acl_trans(trans, new_inode, &hash,
- default_acl, ACL_TYPE_DEFAULT);
+ if (!(flags & BCH_CREATE_SNAPSHOT)) {
+ /* Normal create path - allocate a new inode: */
+ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
+
+ if (flags & BCH_CREATE_TMPFILE)
+ new_inode->bi_flags |= BCH_INODE_UNLINKED;
+
+ ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
if (ret)
goto err;
+
+ snapshot_src = (subvol_inum) { 0 };
+ } else {
+ /*
+ * Creating a snapshot - we're not allocating a new inode, but
+ * we do have to lookup the root inode of the subvolume we're
+ * snapshotting and update it (in the new snapshot):
+ */
+
+ if (!snapshot_src.inum) {
+ /* Inode wasn't specified, just snapshot: */
+ struct btree_iter subvol_iter;
+ struct bkey_s_c k;
+
+ bch2_trans_iter_init(trans, &subvol_iter, BTREE_ID_subvolumes,
+ POS(0, snapshot_src.subvol), 0);
+ k = bch2_btree_iter_peek_slot(&subvol_iter);
+
+ ret = bkey_err(k);
+ if (!ret && k.k->type != KEY_TYPE_subvolume) {
+ bch_err(c, "subvolume %u not found",
+ snapshot_src.subvol);
+ ret = -ENOENT;
+ }
+
+ if (!ret)
+ snapshot_src.inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode);
+ bch2_trans_iter_exit(trans, &subvol_iter);
+
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ if (new_inode->bi_subvol != snapshot_src.subvol) {
+ /* Not a subvolume root: */
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /*
+ * If we're not root, we have to own the subvolume being
+ * snapshotted:
+ */
+ if (uid && new_inode->bi_uid != uid) {
+ ret = -EPERM;
+ goto err;
+ }
+
+ flags |= BCH_CREATE_SUBVOL;
}
- if (acl) {
- ret = bch2_set_acl_trans(trans, new_inode, &hash,
- acl, ACL_TYPE_ACCESS);
+ new_inum.inum = new_inode->bi_inum;
+ dir_target = new_inode->bi_inum;
+
+ if (flags & BCH_CREATE_SUBVOL) {
+ u32 new_subvol, dir_snapshot;
+
+ ret = bch2_subvolume_create(trans, new_inode->bi_inum,
+ snapshot_src.subvol,
+ &new_subvol, &snapshot,
+ (flags & BCH_CREATE_SNAPSHOT_RO) != 0);
if (ret)
goto err;
+
+ new_inode->bi_parent_subvol = dir.subvol;
+ new_inode->bi_subvol = new_subvol;
+ new_inum.subvol = new_subvol;
+ dir_target = new_subvol;
+ dir_type = DT_SUBVOL;
+
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot);
+ ret = bch2_btree_iter_traverse(&dir_iter);
+ if (ret)
+ goto err;
+ }
+
+ if (!(flags & BCH_CREATE_SNAPSHOT)) {
+ if (default_acl) {
+ ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+ default_acl, ACL_TYPE_DEFAULT);
+ if (ret)
+ goto err;
+ }
+
+ if (acl) {
+ ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+ acl, ACL_TYPE_ACCESS);
+ if (ret)
+ goto err;
+ }
}
- if (name) {
+ if (!(flags & BCH_CREATE_TMPFILE)) {
struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
- dir_u->bi_mtime = dir_u->bi_ctime = now;
+ u64 dir_offset;
- if (S_ISDIR(new_inode->bi_mode))
+ if (is_subdir_for_nlink(new_inode))
dir_u->bi_nlink++;
+ dir_u->bi_mtime = dir_u->bi_ctime = now;
ret = bch2_inode_write(trans, &dir_iter, dir_u);
if (ret)
goto err;
- ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
- mode_to_type(new_inode->bi_mode),
- name, new_inode->bi_inum,
+ ret = bch2_dirent_create(trans, dir, &dir_hash,
+ dir_type,
+ name,
+ dir_target,
&dir_offset,
BCH_HASH_SET_MUST_CREATE);
if (ret)
goto err;
- }
- if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
- new_inode->bi_dir = dir_u->bi_inum;
- new_inode->bi_dir_offset = dir_offset;
+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+ new_inode->bi_dir = dir_u->bi_inum;
+ new_inode->bi_dir_offset = dir_offset;
+ }
}
- /* XXX use bch2_btree_iter_set_snapshot() */
- inode_iter.snapshot = U32_MAX;
- bch2_btree_iter_set_pos(&inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX));
+ inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+ bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
ret = bch2_btree_iter_traverse(&inode_iter) ?:
bch2_inode_write(trans, &inode_iter, new_inode);
@@ -91,9 +195,10 @@ err:
return ret;
}
-int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
- u64 inum, struct bch_inode_unpacked *dir_u,
- struct bch_inode_unpacked *inode_u, const struct qstr *name)
+int bch2_link_trans(struct btree_trans *trans,
+ subvol_inum dir, struct bch_inode_unpacked *dir_u,
+ subvol_inum inum, struct bch_inode_unpacked *inode_u,
+ const struct qstr *name)
{
struct bch_fs *c = trans->c;
struct btree_iter dir_iter = { NULL };
@@ -103,6 +208,9 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
u64 dir_offset = 0;
int ret;
+ if (dir.subvol != inum.subvol)
+ return -EXDEV;
+
ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
if (ret)
goto err;
@@ -110,7 +218,7 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
inode_u->bi_ctime = now;
bch2_inode_nlink_inc(inode_u);
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
if (ret)
goto err;
@@ -118,15 +226,15 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
dir_hash = bch2_hash_info_init(c, dir_u);
- ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
+ ret = bch2_dirent_create(trans, dir, &dir_hash,
mode_to_type(inode_u->bi_mode),
- name, inum, &dir_offset,
+ name, inum.inum, &dir_offset,
BCH_HASH_SET_MUST_CREATE);
if (ret)
goto err;
if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
- inode_u->bi_dir = dir_inum;
+ inode_u->bi_dir = dir.inum;
inode_u->bi_dir_offset = dir_offset;
}
@@ -139,55 +247,83 @@ err:
}
int bch2_unlink_trans(struct btree_trans *trans,
- u64 dir_inum, struct bch_inode_unpacked *dir_u,
+ subvol_inum dir,
+ struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *inode_u,
- const struct qstr *name)
+ const struct qstr *name,
+ int deleting_snapshot)
{
struct bch_fs *c = trans->c;
struct btree_iter dir_iter = { NULL };
struct btree_iter dirent_iter = { NULL };
struct btree_iter inode_iter = { NULL };
struct bch_hash_info dir_hash;
- u64 inum, now = bch2_current_time(c);
+ subvol_inum inum;
+ u64 now = bch2_current_time(c);
struct bkey_s_c k;
int ret;
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
if (ret)
goto err;
dir_hash = bch2_hash_info_init(c, dir_u);
- ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir_inum, &dir_hash,
- name, BTREE_ITER_INTENT);
+ ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
+ name, &inum, BTREE_ITER_INTENT);
if (ret)
goto err;
- k = bch2_btree_iter_peek_slot(&dirent_iter);
- ret = bkey_err(k);
+ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum,
+ BTREE_ITER_INTENT);
if (ret)
goto err;
- inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
-
- ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
- if (ret)
+ if (deleting_snapshot == 1 && !inode_u->bi_subvol) {
+ ret = -ENOENT;
goto err;
+ }
+
+ if (deleting_snapshot <= 0 && S_ISDIR(inode_u->bi_mode)) {
+ ret = bch2_empty_dir_trans(trans, inum);
+ if (ret)
+ goto err;
+ }
+
+ if (inode_u->bi_subvol) {
+ ret = bch2_subvolume_delete(trans, inode_u->bi_subvol,
+ deleting_snapshot);
+ if (ret)
+ goto err;
+
+ k = bch2_btree_iter_peek_slot(&dirent_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ /*
+ * If we're deleting a subvolume, we need to really delete the
+ * dirent, not just emit a whiteout in the current snapshot:
+ */
+ bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&dirent_iter);
+ if (ret)
+ goto err;
+ }
- if (inode_u->bi_dir == k.k->p.inode &&
- inode_u->bi_dir_offset == k.k->p.offset) {
+ if (inode_u->bi_dir == dirent_iter.pos.inode &&
+ inode_u->bi_dir_offset == dirent_iter.pos.offset) {
inode_u->bi_dir = 0;
inode_u->bi_dir_offset = 0;
}
dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
- dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode);
+ dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
bch2_inode_nlink_dec(inode_u);
- ret = (S_ISDIR(inode_u->bi_mode)
- ? bch2_empty_dir_trans(trans, inum)
- : 0) ?:
- bch2_dirent_delete_at(trans, &dir_hash, &dirent_iter) ?:
+ ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ &dir_hash, &dirent_iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_inode_write(trans, &dir_iter, dir_u) ?:
bch2_inode_write(trans, &inode_iter, inode_u);
err:
@@ -222,8 +358,8 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
}
int bch2_rename_trans(struct btree_trans *trans,
- u64 src_dir, struct bch_inode_unpacked *src_dir_u,
- u64 dst_dir, struct bch_inode_unpacked *dst_dir_u,
+ subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
+ subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
struct bch_inode_unpacked *src_inode_u,
struct bch_inode_unpacked *dst_inode_u,
const struct qstr *src_name,
@@ -236,7 +372,8 @@ int bch2_rename_trans(struct btree_trans *trans,
struct btree_iter src_inode_iter = { NULL };
struct btree_iter dst_inode_iter = { NULL };
struct bch_hash_info src_hash, dst_hash;
- u64 src_inode, src_offset, dst_inode, dst_offset;
+ subvol_inum src_inum, dst_inum;
+ u64 src_offset, dst_offset;
u64 now = bch2_current_time(c);
int ret;
@@ -247,7 +384,8 @@ int bch2_rename_trans(struct btree_trans *trans,
src_hash = bch2_hash_info_init(c, src_dir_u);
- if (dst_dir != src_dir) {
+ if (dst_dir.inum != src_dir.inum ||
+ dst_dir.subvol != src_dir.subvol) {
ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
BTREE_ITER_INTENT);
if (ret)
@@ -262,19 +400,19 @@ int bch2_rename_trans(struct btree_trans *trans,
ret = bch2_dirent_rename(trans,
src_dir, &src_hash,
dst_dir, &dst_hash,
- src_name, &src_inode, &src_offset,
- dst_name, &dst_inode, &dst_offset,
+ src_name, &src_inum, &src_offset,
+ dst_name, &dst_inum, &dst_offset,
mode);
if (ret)
goto err;
- ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inode,
+ ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum,
BTREE_ITER_INTENT);
if (ret)
goto err;
- if (dst_inode) {
- ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inode,
+ if (dst_inum.inum) {
+ ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum,
BTREE_ITER_INTENT);
if (ret)
goto err;
@@ -305,7 +443,7 @@ int bch2_rename_trans(struct btree_trans *trans,
}
if (S_ISDIR(dst_inode_u->bi_mode) &&
- bch2_empty_dir_trans(trans, dst_inode)) {
+ bch2_empty_dir_trans(trans, dst_inum)) {
ret = -ENOTEMPTY;
goto err;
}
@@ -324,12 +462,12 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
- if (S_ISDIR(src_inode_u->bi_mode)) {
+ if (is_subdir_for_nlink(src_inode_u)) {
src_dir_u->bi_nlink--;
dst_dir_u->bi_nlink++;
}
- if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) {
+ if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
dst_dir_u->bi_nlink--;
src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
}
@@ -340,22 +478,22 @@ int bch2_rename_trans(struct btree_trans *trans,
src_dir_u->bi_mtime = now;
src_dir_u->bi_ctime = now;
- if (src_dir != dst_dir) {
+ if (src_dir.inum != dst_dir.inum) {
dst_dir_u->bi_mtime = now;
dst_dir_u->bi_ctime = now;
}
src_inode_u->bi_ctime = now;
- if (dst_inode)
+ if (dst_inum.inum)
dst_inode_u->bi_ctime = now;
ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
- (src_dir != dst_dir
+ (src_dir.inum != dst_dir.inum
? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
: 0 ) ?:
bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
- (dst_inode
+ (dst_inum.inum
? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
: 0 );
err:
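bch2_unlink_trans() now takes an extra int deleting_snapshot whose meaning falls out of the checks above: 1 insists that the dirent points at a subvolume or snapshot root (otherwise -ENOENT), anything <= 0 behaves like an ordinary unlink and requires directories to be empty, and the value is also forwarded to bch2_subvolume_delete(). A hedged sketch of how a caller is expected to drive it, mirroring the two call sites later in this patch (__bch2_unlink() passes -1 from bch2_unlink() and 1 from the subvolume-destroy ioctl); the wrapper name here is made up purely for illustration:

/* Illustrative only, not part of the patch: */
static int unlink_one(struct btree_trans *trans, subvol_inum dir,
		      struct bch_inode_unpacked *dir_u,
		      struct bch_inode_unpacked *inode_u,
		      const struct qstr *name, bool destroying_subvol)
{
	/* 1: target must be a subvolume/snapshot root; -1: ordinary unlink */
	return bch2_unlink_trans(trans, dir, dir_u, inode_u, name,
				 destroying_subvol ? 1 : -1);
}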
diff --git a/libbcachefs/fs-common.h b/libbcachefs/fs-common.h
index 2273b796..9bb0a967 100644
--- a/libbcachefs/fs-common.h
+++ b/libbcachefs/fs-common.h
@@ -4,27 +4,33 @@
struct posix_acl;
-int bch2_create_trans(struct btree_trans *, u64,
+#define BCH_CREATE_TMPFILE (1U << 0)
+#define BCH_CREATE_SUBVOL (1U << 1)
+#define BCH_CREATE_SNAPSHOT (1U << 2)
+#define BCH_CREATE_SNAPSHOT_RO (1U << 3)
+
+int bch2_create_trans(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
struct bch_inode_unpacked *,
const struct qstr *,
uid_t, gid_t, umode_t, dev_t,
struct posix_acl *,
- struct posix_acl *);
+ struct posix_acl *,
+ subvol_inum, unsigned);
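These BCH_CREATE_* flags plus the trailing subvol_inum/unsigned pair replace the old name-or-NULL tmpfile convention. A minimal sketch of a snapshot create under the new signature, based on the flag handling in bch2_create_trans() above (leaving snapshot_src.inum at 0 means "snapshot the source subvolume's root inode"); every identifier below other than the flags and bch2_create_trans() itself is a placeholder, and the transaction/commit boilerplate around it is elided:

/* Sketch: create a read-only snapshot of subvolume src_subvol in directory dir */
subvol_inum snapshot_src = { .subvol = src_subvol };	/* .inum == 0: use the subvol's root */

ret = bch2_create_trans(trans, dir, &dir_u, &new_inode_u, &name,
			uid, gid, S_IFDIR|0755, 0,
			NULL, NULL,	/* ACLs are skipped when BCH_CREATE_SNAPSHOT is set */
			snapshot_src,
			BCH_CREATE_SUBVOL|
			BCH_CREATE_SNAPSHOT|
			BCH_CREATE_SNAPSHOT_RO);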
-int bch2_link_trans(struct btree_trans *, u64,
- u64, struct bch_inode_unpacked *,
- struct bch_inode_unpacked *,
+int bch2_link_trans(struct btree_trans *,
+ subvol_inum, struct bch_inode_unpacked *,
+ subvol_inum, struct bch_inode_unpacked *,
const struct qstr *);
-int bch2_unlink_trans(struct btree_trans *,
- u64, struct bch_inode_unpacked *,
+int bch2_unlink_trans(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *,
struct bch_inode_unpacked *,
- const struct qstr *);
+ const struct qstr *, int);
int bch2_rename_trans(struct btree_trans *,
- u64, struct bch_inode_unpacked *,
- u64, struct bch_inode_unpacked *,
+ subvol_inum, struct bch_inode_unpacked *,
+ subvol_inum, struct bch_inode_unpacked *,
struct bch_inode_unpacked *,
struct bch_inode_unpacked *,
const struct qstr *,
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 29210377..c07755c6 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -786,23 +786,35 @@ static void readpage_bio_extend(struct readpages_iter *iter,
}
}
-static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
- struct bch_read_bio *rbio, u64 inum,
+static void bchfs_read(struct btree_trans *trans,
+ struct bch_read_bio *rbio,
+ subvol_inum inum,
struct readpages_iter *readpages_iter)
{
struct bch_fs *c = trans->c;
+ struct btree_iter iter;
struct bkey_buf sk;
int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE;
+ u32 snapshot;
int ret = 0;
rbio->c = c;
rbio->start_time = local_clock();
+ rbio->subvol = inum.subvol;
bch2_bkey_buf_init(&sk);
retry:
bch2_trans_begin(trans);
+ iter = (struct btree_iter) { NULL };
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
+ BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS);
while (1) {
struct bkey_s_c k;
unsigned bytes, sectors, offset_into_extent;
@@ -817,15 +829,15 @@ retry:
break;
}
- bch2_btree_iter_set_pos(iter,
- POS(inum, rbio->bio.bi_iter.bi_sector));
+ bch2_btree_iter_set_pos(&iter,
+ POS(inum.inum, rbio->bio.bi_iter.bi_sector));
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
break;
- offset_into_extent = iter->pos.offset -
+ offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
@@ -855,7 +867,7 @@ retry:
if (bkey_extent_is_allocation(k.k))
bch2_add_page_sectors(&rbio->bio, k);
- bch2_read_extent(trans, rbio, iter->pos,
+ bch2_read_extent(trans, rbio, iter.pos,
data_btree, k, offset_into_extent, flags);
if (flags & BCH_READ_LAST_FRAGMENT)
@@ -864,12 +876,14 @@ retry:
swap(rbio->bio.bi_iter.bi_size, bytes);
bio_advance(&rbio->bio, bytes);
}
+err:
+ bch2_trans_iter_exit(trans, &iter);
if (ret == -EINTR)
goto retry;
if (ret) {
- bch_err_inum_ratelimited(c, inum,
+ bch_err_inum_ratelimited(c, inum.inum,
"read error %i from btree lookup", ret);
rbio->bio.bi_status = BLK_STS_IOERR;
bio_endio(&rbio->bio);
@@ -884,7 +898,6 @@ void bch2_readahead(struct readahead_control *ractl)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
struct btree_trans trans;
- struct btree_iter iter;
struct page *page;
struct readpages_iter readpages_iter;
int ret;
@@ -893,8 +906,6 @@ void bch2_readahead(struct readahead_control *ractl)
BUG_ON(ret);
bch2_trans_init(&trans, c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN,
- BTREE_ITER_SLOTS);
bch2_pagecache_add_get(&inode->ei_pagecache_lock);
@@ -915,22 +926,20 @@ void bch2_readahead(struct readahead_control *ractl)
rbio->bio.bi_end_io = bch2_readpages_end_io;
BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
- bchfs_read(&trans, &iter, rbio, inode->v.i_ino,
+ bchfs_read(&trans, rbio, inode_inum(inode),
&readpages_iter);
}
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
- bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
kfree(readpages_iter.pages);
}
static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
- u64 inum, struct page *page)
+ subvol_inum inum, struct page *page)
{
struct btree_trans trans;
- struct btree_iter iter;
bch2_page_state_create(page, __GFP_NOFAIL);
@@ -940,12 +949,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
bch2_trans_init(&trans, c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN,
- BTREE_ITER_SLOTS);
-
- bchfs_read(&trans, &iter, rbio, inum, NULL);
-
- bch2_trans_iter_exit(&trans, &iter);
+ bchfs_read(&trans, rbio, inum, NULL);
bch2_trans_exit(&trans);
}
@@ -959,7 +963,7 @@ int bch2_readpage(struct file *file, struct page *page)
rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts);
rbio->bio.bi_end_io = bch2_readpages_end_io;
- __bchfs_readpage(c, rbio, inode->v.i_ino, page);
+ __bchfs_readpage(c, rbio, inode_inum(inode), page);
return 0;
}
@@ -982,7 +986,7 @@ static int bch2_read_single_page(struct page *page,
rbio->bio.bi_private = &done;
rbio->bio.bi_end_io = bch2_read_single_page_end_io;
- __bchfs_readpage(c, rbio, inode->v.i_ino, page);
+ __bchfs_readpage(c, rbio, inode_inum(inode), page);
wait_for_completion(&done);
ret = blk_status_to_errno(rbio->bio.bi_status);
@@ -1126,6 +1130,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
op->nr_replicas = nr_replicas;
op->res.nr_replicas = nr_replicas;
op->write_point = writepoint_hashed(inode->ei_last_dirtied);
+ op->subvol = inode->ei_subvol;
op->pos = POS(inode->v.i_ino, sector);
op->wbio.bio.bi_iter.bi_sector = sector;
op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
@@ -1758,7 +1763,7 @@ start:
if (iter->count)
closure_get(&dio->cl);
- bch2_read(c, rbio_init(bio, opts), inode->v.i_ino);
+ bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
}
iter->count += shorten;
@@ -1813,6 +1818,50 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
/* O_DIRECT writes */
+static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
+ u64 offset, u64 size,
+ unsigned nr_replicas, bool compressed)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 end = offset + size;
+ u32 snapshot;
+ bool ret = true;
+ int err;
+
+ bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (err)
+ goto err;
+
+ for_each_btree_key(&trans, iter, BTREE_ID_extents,
+ SPOS(inum.inum, offset, snapshot),
+ BTREE_ITER_SLOTS, k, err) {
+ if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0)
+ break;
+
+ if (k.k->p.snapshot != snapshot ||
+ nr_replicas > bch2_bkey_replicas(c, k) ||
+ (!compressed && bch2_bkey_sectors_compressed(k))) {
+ ret = false;
+ break;
+ }
+ }
+
+ offset = iter.pos.offset;
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (err == -EINTR)
+ goto retry;
+ bch2_trans_exit(&trans);
+
+ return err ? false : ret;
+}
+
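This rewritten helper is the first of several fs-io.c paths in this patch (range_has_data(), bch2_seek_data(), bch2_seek_hole(), and bch2_fiemap() over in fs.c) that adopt the same shape: begin the transaction, resolve the subvolume to its current snapshot with bch2_subvolume_get_snapshot(), position iterators with SPOS(inum, offset, snapshot), and restart the whole sequence on -EINTR. The skeleton, stripped of the per-key work:

retry:
	bch2_trans_begin(&trans);

	ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	for_each_btree_key(&trans, iter, BTREE_ID_extents,
			   SPOS(inum.inum, offset, snapshot), 0, k, ret) {
		/* ... per-extent work ... */
	}
	bch2_trans_iter_exit(&trans, &iter);
err:
	if (ret == -EINTR)
		goto retry;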
static void bch2_dio_write_loop_async(struct bch_write_op *);
static long bch2_dio_write_loop(struct dio_write *dio)
@@ -1891,6 +1940,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
op_journal_seq_set(&dio->op, &inode->ei_journal_seq);
dio->op.write_point = writepoint_hashed((unsigned long) current);
dio->op.nr_replicas = dio->op.opts.data_replicas;
+ dio->op.subvol = inode->ei_subvol;
dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
if ((req->ki_flags & IOCB_DSYNC) &&
@@ -1901,8 +1951,8 @@ static long bch2_dio_write_loop(struct dio_write *dio)
ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
dio->op.opts.data_replicas, 0);
if (unlikely(ret) &&
- !bch2_check_range_allocated(c, dio->op.pos,
- bio_sectors(bio),
+ !bch2_check_range_allocated(c, inode_inum(inode),
+ dio->op.pos.offset, bio_sectors(bio),
dio->op.opts.data_replicas,
dio->op.opts.compression != 0))
goto err;
@@ -2146,9 +2196,9 @@ out:
/* truncate: */
-static inline int range_has_data(struct bch_fs *c,
- struct bpos start,
- struct bpos end)
+static inline int range_has_data(struct bch_fs *c, u32 subvol,
+ struct bpos start,
+ struct bpos end)
{
struct btree_trans trans;
struct btree_iter iter;
@@ -2156,6 +2206,12 @@ static inline int range_has_data(struct bch_fs *c,
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot);
+ if (ret)
+ goto err;
for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) {
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
@@ -2166,7 +2222,11 @@ static inline int range_has_data(struct bch_fs *c,
break;
}
}
+ start = iter.pos;
bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (ret == -EINTR)
+ goto retry;
return bch2_trans_exit(&trans) ?: ret;
}
@@ -2198,7 +2258,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
* XXX: we're doing two index lookups when we end up reading the
* page
*/
- ret = range_has_data(c,
+ ret = range_has_data(c, inode->ei_subvol,
POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT),
POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT));
if (ret <= 0)
@@ -2332,7 +2392,7 @@ int bch2_truncate(struct user_namespace *mnt_userns,
inode_dio_wait(&inode->v);
bch2_pagecache_block_get(&inode->ei_pagecache_lock);
- ret = bch2_inode_find_by_inum(c, inode->v.i_ino, &inode_u);
+ ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
if (ret)
goto err;
@@ -2390,7 +2450,7 @@ int bch2_truncate(struct user_namespace *mnt_userns,
truncate_setsize(&inode->v, iattr->ia_size);
- ret = bch2_fpunch(c, inode->v.i_ino,
+ ret = bch2_fpunch(c, inode_inum(inode),
round_up(iattr->ia_size, block_bytes(c)) >> 9,
U64_MAX, &inode->ei_journal_seq, &i_sectors_delta);
i_sectors_acct(c, inode, NULL, i_sectors_delta);
@@ -2450,7 +2510,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len
if (discard_start < discard_end) {
s64 i_sectors_delta = 0;
- ret = bch2_fpunch(c, inode->v.i_ino,
+ ret = bch2_fpunch(c, inode_inum(inode),
discard_start, discard_end,
&inode->ei_journal_seq,
&i_sectors_delta);
@@ -2529,7 +2589,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
} else {
s64 i_sectors_delta = 0;
- ret = bch2_fpunch(c, inode->v.i_ino,
+ ret = bch2_fpunch(c, inode_inum(inode),
offset >> 9, (offset + len) >> 9,
&inode->ei_journal_seq,
&i_sectors_delta);
@@ -2556,6 +2616,18 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
struct bpos atomic_end;
unsigned trigger_flags = 0;
+ u32 snapshot;
+
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans,
+ inode->ei_subvol, &snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(&src, snapshot);
+ bch2_btree_iter_set_snapshot(&dst, snapshot);
+ bch2_btree_iter_set_snapshot(&del, snapshot);
bch2_trans_begin(&trans);
@@ -2676,9 +2748,17 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
struct bkey_i_reservation reservation;
struct bkey_s_c k;
unsigned sectors;
+ u32 snapshot;
bch2_trans_begin(&trans);
+ ret = bch2_subvolume_get_snapshot(&trans,
+ inode->ei_subvol, &snapshot);
+ if (ret)
+ goto bkey_err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+
k = bch2_btree_iter_peek_slot(&iter);
if ((ret = bkey_err(k)))
goto bkey_err;
@@ -2725,7 +2805,8 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
reservation.v.nr_replicas = disk_res.nr_replicas;
}
- ret = bch2_extent_update(&trans, &iter, &reservation.k_i,
+ ret = bch2_extent_update(&trans, inode_inum(inode), &iter,
+ &reservation.k_i,
&disk_res, &inode->ei_journal_seq,
0, &i_sectors_delta, true);
i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
@@ -2927,8 +3008,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
mark_range_unallocated(src, pos_src, pos_src + aligned_len);
ret = bch2_remap_range(c,
- POS(dst->v.i_ino, pos_dst >> 9),
- POS(src->v.i_ino, pos_src >> 9),
+ inode_inum(dst), pos_dst >> 9,
+ inode_inum(src), pos_src >> 9,
aligned_len >> 9,
&dst->ei_journal_seq,
pos_dst + len, &i_sectors_delta);
@@ -3019,7 +3100,9 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
+ subvol_inum inum = inode_inum(inode);
u64 isize, next_data = MAX_LFS_FILESIZE;
+ u32 snapshot;
int ret;
isize = i_size_read(&inode->v);
@@ -3027,9 +3110,15 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
return -ENXIO;
bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
for_each_btree_key(&trans, iter, BTREE_ID_extents,
- POS(inode->v.i_ino, offset >> 9), 0, k, ret) {
+ SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
break;
} else if (bkey_extent_is_data(k.k)) {
@@ -3039,6 +3128,9 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
break;
}
bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (ret == -EINTR)
+ goto retry;
ret = bch2_trans_exit(&trans) ?: ret;
if (ret)
@@ -3115,7 +3207,9 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
+ subvol_inum inum = inode_inum(inode);
u64 isize, next_hole = MAX_LFS_FILESIZE;
+ u32 snapshot;
int ret;
isize = i_size_read(&inode->v);
@@ -3123,9 +3217,15 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
return -ENXIO;
bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
for_each_btree_key(&trans, iter, BTREE_ID_extents,
- POS(inode->v.i_ino, offset >> 9),
+ SPOS(inode->v.i_ino, offset >> 9, snapshot),
BTREE_ITER_SLOTS, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
next_hole = bch2_seek_pagecache_hole(&inode->v,
@@ -3143,6 +3243,9 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
}
}
bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (ret == -EINTR)
+ goto retry;
ret = bch2_trans_exit(&trans) ?: ret;
if (ret)
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index 91a0e761..3ed53f42 100644
--- a/libbcachefs/fs-ioctl.c
+++ b/libbcachefs/fs-ioctl.c
@@ -10,7 +10,11 @@
#include "quota.h"
#include <linux/compat.h>
+#include <linux/fsnotify.h>
#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include <linux/writeback.h>
#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32)
#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
@@ -192,7 +196,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
char *kname = NULL;
struct qstr qstr;
int ret = 0;
- u64 inum;
+ subvol_inum inum;
kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
if (!kname)
@@ -205,10 +209,8 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
qstr.len = ret;
qstr.name = kname;
- ret = -ENOENT;
- inum = bch2_dirent_lookup(c, src->v.i_ino, &hash,
- &qstr);
- if (!inum)
+ ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum);
+ if (ret)
goto err1;
vinode = bch2_vfs_inode_get(c, inum);
@@ -294,6 +296,154 @@ err:
return ret;
}
+static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
+{
+ struct inode *dir;
+ struct bch_inode_info *inode;
+ struct user_namespace *s_user_ns;
+ struct dentry *dst_dentry;
+ struct path src_path, dst_path;
+ int how = LOOKUP_FOLLOW;
+ int error;
+ subvol_inum snapshot_src = { 0 };
+ unsigned lookup_flags = 0;
+ unsigned create_flags = BCH_CREATE_SUBVOL;
+
+ if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE|
+ BCH_SUBVOL_SNAPSHOT_RO))
+ return -EINVAL;
+
+ if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+ (arg.src_ptr ||
+ (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)))
+ return -EINVAL;
+
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+ create_flags |= BCH_CREATE_SNAPSHOT;
+
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
+ create_flags |= BCH_CREATE_SNAPSHOT_RO;
+
+ /* why do we need this lock? */
+ down_read(&c->vfs_sb->s_umount);
+
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+ sync_inodes_sb(c->vfs_sb);
+retry:
+ if (arg.src_ptr) {
+ error = user_path_at(arg.dirfd,
+ (const char __user *)(unsigned long)arg.src_ptr,
+ how, &src_path);
+ if (error)
+ goto err1;
+
+ if (src_path.dentry->d_sb->s_fs_info != c) {
+ path_put(&src_path);
+ error = -EXDEV;
+ goto err1;
+ }
+
+ snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode));
+ }
+
+ dst_dentry = user_path_create(arg.dirfd,
+ (const char __user *)(unsigned long)arg.dst_ptr,
+ &dst_path, lookup_flags);
+ error = PTR_ERR_OR_ZERO(dst_dentry);
+ if (error)
+ goto err2;
+
+ if (dst_dentry->d_sb->s_fs_info != c) {
+ error = -EXDEV;
+ goto err3;
+ }
+
+ if (dst_dentry->d_inode) {
+ error = -EEXIST;
+ goto err3;
+ }
+
+ dir = dst_path.dentry->d_inode;
+ if (IS_DEADDIR(dir)) {
+ error = -ENOENT;
+ goto err3;
+ }
+
+ s_user_ns = dir->i_sb->s_user_ns;
+ if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+ !kgid_has_mapping(s_user_ns, current_fsgid())) {
+ error = -EOVERFLOW;
+ goto err3;
+ }
+
+ error = inode_permission(file_mnt_user_ns(filp),
+ dir, MAY_WRITE | MAY_EXEC);
+ if (error)
+ goto err3;
+
+ if (!IS_POSIXACL(dir))
+ arg.mode &= ~current_umask();
+
+ error = security_path_mkdir(&dst_path, dst_dentry, arg.mode);
+ if (error)
+ goto err3;
+
+ if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+ !arg.src_ptr)
+ snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol;
+
+ inode = __bch2_create(file_mnt_user_ns(filp), to_bch_ei(dir),
+ dst_dentry, arg.mode|S_IFDIR,
+ 0, snapshot_src, create_flags);
+ error = PTR_ERR_OR_ZERO(inode);
+ if (error)
+ goto err3;
+
+ d_instantiate(dst_dentry, &inode->v);
+ fsnotify_mkdir(dir, dst_dentry);
+err3:
+ done_path_create(&dst_path, dst_dentry);
+err2:
+ if (arg.src_ptr)
+ path_put(&src_path);
+
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
+err1:
+ up_read(&c->vfs_sb->s_umount);
+
+ return error;
+}
+
+static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
+{
+ struct path path;
+ int ret = 0;
+
+ if (arg.flags)
+ return -EINVAL;
+
+ ret = user_path_at(arg.dirfd,
+ (const char __user *)(unsigned long)arg.dst_ptr,
+ LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ret;
+
+ if (path.dentry->d_sb->s_fs_info != c) {
+ path_put(&path);
+ return -EXDEV;
+ }
+
+ ret = __bch2_unlink(path.dentry->d_parent->d_inode, path.dentry, 1);
+ path_put(&path);
+
+ return ret;
+}
+
long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
struct bch_inode_info *inode = file_bch_inode(file);
@@ -324,6 +474,22 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case FS_IOC_GOINGDOWN:
return bch2_ioc_goingdown(c, (u32 __user *) arg);
+ case BCH_IOCTL_SUBVOLUME_CREATE: {
+ struct bch_ioctl_subvolume i;
+
+ if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
+ return -EFAULT;
+ return bch2_ioctl_subvolume_create(c, file, i);
+ }
+
+ case BCH_IOCTL_SUBVOLUME_DESTROY: {
+ struct bch_ioctl_subvolume i;
+
+ if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
+ return -EFAULT;
+ return bch2_ioctl_subvolume_destroy(c, file, i);
+ }
+
default:
return bch2_fs_ioctl(c, cmd, (void __user *) arg);
}
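For orientation, a hedged sketch of how the two new ioctls look from userspace, using only the struct bch_ioctl_subvolume fields the handlers above dereference (flags, dirfd, mode, src_ptr, dst_ptr). It assumes the usual <sys/ioctl.h>, <fcntl.h> and <err.h> headers plus the bcachefs ioctl UAPI header for the struct and the BCH_IOCTL_SUBVOLUME_* / BCH_SUBVOL_* constants; fs_fd is any open fd on the filesystem and the paths are placeholders:

struct bch_ioctl_subvolume create = {
	.flags	 = BCH_SUBVOL_SNAPSHOT_CREATE|BCH_SUBVOL_SNAPSHOT_RO,
	.dirfd	 = AT_FDCWD,
	.mode	 = 0755,
	.src_ptr = (unsigned long) "/mnt/subvol",	/* what to snapshot */
	.dst_ptr = (unsigned long) "/mnt/subvol.snap",	/* where to put it */
};

if (ioctl(fs_fd, BCH_IOCTL_SUBVOLUME_CREATE, &create))
	err(1, "BCH_IOCTL_SUBVOLUME_CREATE");

/* Destroy path: flags must be zero, only dirfd/dst_ptr are consulted */
struct bch_ioctl_subvolume destroy = {
	.dirfd	 = AT_FDCWD,
	.dst_ptr = (unsigned long) "/mnt/subvol.snap",
};

if (ioctl(fs_fd, BCH_IOCTL_SUBVOLUME_DESTROY, &destroy))
	err(1, "BCH_IOCTL_SUBVOLUME_DESTROY");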
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 6cc56871..2094c18c 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -36,7 +36,7 @@
static struct kmem_cache *bch2_inode_cache;
-static void bch2_vfs_inode_init(struct bch_fs *,
+static void bch2_vfs_inode_init(struct bch_fs *, subvol_inum,
struct bch_inode_info *,
struct bch_inode_unpacked *);
@@ -149,7 +149,7 @@ int __must_check bch2_write_inode(struct bch_fs *c,
retry:
bch2_trans_begin(&trans);
- ret = bch2_inode_peek(&trans, &iter, &inode_u, inode->v.i_ino,
+ ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode),
BTREE_ITER_INTENT) ?:
(set ? set(inode, &inode_u, p) : 0) ?:
bch2_inode_write(&trans, &iter, &inode_u) ?:
@@ -208,13 +208,42 @@ int bch2_fs_quota_transfer(struct bch_fs *c,
return ret;
}
-struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
+static int bch2_iget5_test(struct inode *vinode, void *p)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ subvol_inum *inum = p;
+
+ return inode->ei_subvol == inum->subvol &&
+ inode->ei_inode.bi_inum == inum->inum;
+}
+
+static int bch2_iget5_set(struct inode *vinode, void *p)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ subvol_inum *inum = p;
+
+ inode->v.i_ino = inum->inum;
+ inode->ei_subvol = inum->subvol;
+ inode->ei_inode.bi_inum = inum->inum;
+ return 0;
+}
+
+static unsigned bch2_inode_hash(subvol_inum inum)
+{
+ return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
+}
+
+struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
struct bch_inode_unpacked inode_u;
struct bch_inode_info *inode;
int ret;
- inode = to_bch_ei(iget_locked(c->vfs_sb, inum));
+ inode = to_bch_ei(iget5_locked(c->vfs_sb,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ bch2_iget5_set,
+ &inum));
if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
if (!(inode->v.i_state & I_NEW))
@@ -226,26 +255,20 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
return ERR_PTR(ret);
}
- bch2_vfs_inode_init(c, inode, &inode_u);
+ bch2_vfs_inode_init(c, inum, inode, &inode_u);
- inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum);
+ inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum.inum);
unlock_new_inode(&inode->v);
return &inode->v;
}
-static int inum_test(struct inode *inode, void *p)
-{
- unsigned long *ino = p;
-
- return *ino == inode->i_ino;
-}
-
-static struct bch_inode_info *
+struct bch_inode_info *
__bch2_create(struct user_namespace *mnt_userns,
struct bch_inode_info *dir, struct dentry *dentry,
- umode_t mode, dev_t rdev, bool tmpfile)
+ umode_t mode, dev_t rdev, subvol_inum snapshot_src,
+ unsigned flags)
{
struct bch_fs *c = dir->v.i_sb->s_fs_info;
struct btree_trans trans;
@@ -253,6 +276,7 @@ __bch2_create(struct user_namespace *mnt_userns,
struct bch_inode_info *inode, *old;
struct bch_inode_unpacked inode_u;
struct posix_acl *default_acl = NULL, *acl = NULL;
+ subvol_inum inum;
u64 journal_seq = 0;
int ret;
@@ -273,20 +297,23 @@ __bch2_create(struct user_namespace *mnt_userns,
bch2_inode_init_early(c, &inode_u);
- if (!tmpfile)
+ if (!(flags & BCH_CREATE_TMPFILE))
mutex_lock(&dir->ei_update_lock);
bch2_trans_init(&trans, c, 8,
- 2048 + (!tmpfile ? dentry->d_name.len : 0));
+ 2048 + (!(flags & BCH_CREATE_TMPFILE)
+ ? dentry->d_name.len : 0));
retry:
bch2_trans_begin(&trans);
- ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u,
- !tmpfile ? &dentry->d_name : NULL,
+ ret = bch2_create_trans(&trans,
+ inode_inum(dir), &dir_u, &inode_u,
+ !(flags & BCH_CREATE_TMPFILE)
+ ? &dentry->d_name : NULL,
from_kuid(mnt_userns, current_fsuid()),
from_kgid(mnt_userns, current_fsgid()),
mode, rdev,
- default_acl, acl) ?:
+ default_acl, acl, snapshot_src, flags) ?:
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
KEY_TYPE_QUOTA_PREALLOC);
if (unlikely(ret))
@@ -302,14 +329,17 @@ err_before_quota:
goto err_trans;
}
- if (!tmpfile) {
+ if (!(flags & BCH_CREATE_TMPFILE)) {
bch2_inode_update_after_write(c, dir, &dir_u,
ATTR_MTIME|ATTR_CTIME);
journal_seq_copy(c, dir, journal_seq);
mutex_unlock(&dir->ei_update_lock);
}
- bch2_vfs_inode_init(c, inode, &inode_u);
+ inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
+ inum.inum = inode_u.bi_inum;
+
+ bch2_vfs_inode_init(c, inum, inode, &inode_u);
journal_seq_copy(c, inode, journal_seq);
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
@@ -322,8 +352,12 @@ err_before_quota:
*/
inode->v.i_state |= I_CREATING;
- old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino,
- inum_test, NULL, &inode->v.i_ino));
+
+ old = to_bch_ei(inode_insert5(&inode->v,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ bch2_iget5_set,
+ &inum));
BUG_ON(!old);
if (unlikely(old != inode)) {
@@ -350,7 +384,7 @@ err:
posix_acl_release(acl);
return inode;
err_trans:
- if (!tmpfile)
+ if (!(flags & BCH_CREATE_TMPFILE))
mutex_unlock(&dir->ei_update_lock);
bch2_trans_exit(&trans);
@@ -369,12 +403,13 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
struct bch_inode_info *dir = to_bch_ei(vdir);
struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
struct inode *vinode = NULL;
- u64 inum;
+ subvol_inum inum = { .subvol = 1 };
+ int ret;
- inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash,
- &dentry->d_name);
+ ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
+ &dentry->d_name, &inum);
- if (inum)
+ if (!ret)
vinode = bch2_vfs_inode_get(c, inum);
return d_splice_alias(vinode, dentry);
@@ -385,7 +420,8 @@ static int bch2_mknod(struct user_namespace *mnt_userns,
umode_t mode, dev_t rdev)
{
struct bch_inode_info *inode =
- __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, false);
+ __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev,
+ (subvol_inum) { 0 }, 0);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -415,8 +451,8 @@ static int __bch2_link(struct bch_fs *c,
ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, 0,
bch2_link_trans(&trans,
- dir->v.i_ino,
- inode->v.i_ino, &dir_u, &inode_u,
+ inode_inum(dir), &dir_u,
+ inode_inum(inode), &inode_u,
&dentry->d_name));
if (likely(!ret)) {
@@ -452,7 +488,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
return 0;
}
-static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
+ int deleting_snapshot)
{
struct bch_fs *c = vdir->i_sb->s_fs_info;
struct bch_inode_info *dir = to_bch_ei(vdir);
@@ -467,8 +504,9 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq,
BTREE_INSERT_NOFAIL,
bch2_unlink_trans(&trans,
- dir->v.i_ino, &dir_u,
- &inode_u, &dentry->d_name));
+ inode_inum(dir), &dir_u,
+ &inode_u, &dentry->d_name,
+ deleting_snapshot));
if (likely(!ret)) {
BUG_ON(inode_u.bi_inum != inode->v.i_ino);
@@ -486,6 +524,11 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
return ret;
}
+static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+{
+ return __bch2_unlink(vdir, dentry, -1);
+}
+
static int bch2_symlink(struct user_namespace *mnt_userns,
struct inode *vdir, struct dentry *dentry,
const char *symname)
@@ -494,7 +537,8 @@ static int bch2_symlink(struct user_namespace *mnt_userns,
struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
int ret;
- inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, true);
+ inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
+ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
if (unlikely(IS_ERR(inode)))
return PTR_ERR(inode);
@@ -587,8 +631,8 @@ static int bch2_rename2(struct user_namespace *mnt_userns,
ret = __bch2_trans_do(&trans, NULL, &journal_seq, 0,
bch2_rename_trans(&trans,
- src_dir->v.i_ino, &src_dir_u,
- dst_dir->v.i_ino, &dst_dir_u,
+ inode_inum(src_dir), &src_dir_u,
+ inode_inum(dst_dir), &dst_dir_u,
&src_inode_u,
&dst_inode_u,
&src_dentry->d_name,
@@ -711,7 +755,7 @@ retry:
kfree(acl);
acl = NULL;
- ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino,
+ ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
BTREE_ITER_INTENT);
if (ret)
goto btree_err;
@@ -719,7 +763,8 @@ retry:
bch2_setattr_copy(mnt_userns, inode, &inode_u, attr);
if (attr->ia_valid & ATTR_MODE) {
- ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl);
+ ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u,
+ inode_u.bi_mode, &acl);
if (ret)
goto btree_err;
}
@@ -810,7 +855,8 @@ static int bch2_tmpfile(struct user_namespace *mnt_userns,
struct inode *vdir, struct dentry *dentry, umode_t mode)
{
struct bch_inode_info *inode =
- __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, true);
+ __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0,
+ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -885,6 +931,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
unsigned offset_into_extent, sectors;
bool have_extent = false;
+ u32 snapshot;
int ret = 0;
ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
@@ -894,15 +941,21 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
if (start + len < start)
return -EINVAL;
+ start >>= 9;
+
bch2_bkey_buf_init(&cur);
bch2_bkey_buf_init(&prev);
bch2_trans_init(&trans, c, 0, 0);
-
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
- POS(ei->v.i_ino, start >> 9), 0);
retry:
bch2_trans_begin(&trans);
+ ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ SPOS(ei->v.i_ino, start, snapshot), 0);
+
while ((k = bch2_btree_iter_peek(&iter)).k &&
!(ret = bkey_err(k)) &&
bkey_cmp(iter.pos, end) < 0) {
@@ -951,7 +1004,9 @@ retry:
bch2_btree_iter_set_pos(&iter,
POS(iter.pos.inode, iter.pos.offset + sectors));
}
-
+ start = iter.pos.offset;
+ bch2_trans_iter_exit(&trans, &iter);
+err:
if (ret == -EINTR)
goto retry;
@@ -959,7 +1014,6 @@ retry:
ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
FIEMAP_EXTENT_LAST);
- bch2_trans_iter_exit(&trans, &iter);
ret = bch2_trans_exit(&trans) ?: ret;
bch2_bkey_buf_exit(&cur, c);
bch2_bkey_buf_exit(&prev, c);
@@ -996,7 +1050,7 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit_dots(file, ctx))
return 0;
- return bch2_readdir(c, inode->v.i_ino, ctx);
+ return bch2_readdir(c, inode_inum(inode), ctx);
}
static const struct file_operations bch_file_operations = {
@@ -1096,6 +1150,7 @@ static const struct address_space_operations bch_address_space_operations = {
.error_remove_page = generic_error_remove_page,
};
+#if 0
static struct inode *bch2_nfs_get_inode(struct super_block *sb,
u64 ino, u32 generation)
{
@@ -1129,14 +1184,15 @@ static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
return generic_fh_to_parent(sb, fid, fh_len, fh_type,
bch2_nfs_get_inode);
}
+#endif
static const struct export_operations bch_export_ops = {
- .fh_to_dentry = bch2_fh_to_dentry,
- .fh_to_parent = bch2_fh_to_parent,
+ //.fh_to_dentry = bch2_fh_to_dentry,
+ //.fh_to_parent = bch2_fh_to_parent,
//.get_parent = bch2_get_parent,
};
-static void bch2_vfs_inode_init(struct bch_fs *c,
+static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi)
{
@@ -1152,6 +1208,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
inode->ei_journal_seq = 0;
inode->ei_quota_reserved = 0;
inode->ei_qid = bch_qid(bi);
+ inode->ei_subvol = inum.subvol;
inode->v.i_mapping->a_ops = &bch_address_space_operations;
@@ -1249,7 +1306,7 @@ static void bch2_evict_inode(struct inode *vinode)
KEY_TYPE_QUOTA_WARN);
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
- bch2_inode_rm(c, inode->v.i_ino, true);
+ bch2_inode_rm(c, inode_inum(inode), true);
}
}
@@ -1593,7 +1650,7 @@ got_sb:
sb->s_flags |= SB_POSIXACL;
#endif
- vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO);
+ vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
if (IS_ERR(vinode)) {
bch_err(c, "error mounting: error getting root inode %i",
(int) PTR_ERR(vinode));
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index 36cc6ba2..48fc504e 100644
--- a/libbcachefs/fs.h
+++ b/libbcachefs/fs.h
@@ -45,10 +45,20 @@ struct bch_inode_info {
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
+ u32 ei_subvol;
+
/* copy of inode in btree: */
struct bch_inode_unpacked ei_inode;
};
+static inline subvol_inum inode_inum(struct bch_inode_info *inode)
+{
+ return (subvol_inum) {
+ .subvol = inode->ei_subvol,
+ .inum = inode->ei_inode.bi_inum,
+ };
+}
+
/*
* Set if we've gotten a btree error for this inode, and thus the vfs inode and
* btree inode may be inconsistent:
@@ -135,6 +145,10 @@ struct bch_inode_unpacked;
#ifndef NO_BCACHEFS_FS
+struct bch_inode_info *
+__bch2_create(struct user_namespace *, struct bch_inode_info *,
+ struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
+
int bch2_fs_quota_transfer(struct bch_fs *,
struct bch_inode_info *,
struct bch_qid,
@@ -154,7 +168,7 @@ static inline int bch2_set_projid(struct bch_fs *c,
KEY_TYPE_QUOTA_PREALLOC);
}
-struct inode *bch2_vfs_inode_get(struct bch_fs *, u64);
+struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum);
/* returns 0 if we want to do the update, or error is passed up */
typedef int (*inode_set_fn)(struct bch_inode_info *,
@@ -170,6 +184,7 @@ int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
int bch2_setattr_nonsize(struct user_namespace *,
struct bch_inode_info *,
struct iattr *);
+int __bch2_unlink(struct inode *, struct dentry *, int);
void bch2_vfs_exit(void);
int bch2_vfs_init(void);
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index eb979e79..16a1eae9 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -9,6 +9,7 @@
#include "fsck.h"
#include "inode.h"
#include "keylist.h"
+#include "subvolume.h"
#include "super.h"
#include "xattr.h"
@@ -17,7 +18,8 @@
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
+static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
+ u32 snapshot)
{
struct btree_iter iter;
struct bkey_s_c k;
@@ -25,7 +27,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
int ret;
for_each_btree_key(trans, iter, BTREE_ID_extents,
- POS(inum, 0), 0, k, ret) {
+ SPOS(inum, 0, snapshot), 0, k, ret) {
if (k.k->p.inode != inum)
break;
@@ -38,6 +40,100 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
return ret ?: sectors;
}
+static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
+ u32 snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ u64 subdirs = 0;
+ int ret;
+
+ for_each_btree_key(trans, iter, BTREE_ID_dirents,
+ SPOS(inum, 0, snapshot), 0, k, ret) {
+ if (k.k->p.inode != inum)
+ break;
+
+ if (k.k->type != KEY_TYPE_dirent)
+ continue;
+
+ d = bkey_s_c_to_dirent(k);
+ if (d.v->d_type == DT_DIR)
+ subdirs++;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret ?: subdirs;
+}
+
+static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
+ u32 *subvol)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
+ POS(0, snapshot), 0);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_snapshot) {
+ bch_err(trans->c, "snapshot %u not found", snapshot);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
+ u32 *subvol)
+{
+ return lockrestart_do(trans, __snapshot_lookup_subvol(trans, snapshot, subvol));
+}
+
+static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
+ u32 *snapshot, u64 *inum)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
+ POS(0, subvol), 0);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_subvolume) {
+ bch_err(trans->c, "subvolume %u not found", subvol);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ *snapshot = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
+ *inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int subvol_lookup(struct btree_trans *trans, u32 subvol,
+ u32 *snapshot, u64 *inum)
+{
+ return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
+}
+
static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
struct bch_inode_unpacked *inode,
u32 *snapshot)
@@ -47,14 +143,13 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
- POS(0, inode_nr), 0);
+ SPOS(0, inode_nr, *snapshot), 0);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
- if (snapshot)
- *snapshot = iter.pos.snapshot;
+ *snapshot = iter.pos.snapshot;
ret = k.k->type == KEY_TYPE_inode
? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
: -ENOENT;
@@ -70,6 +165,36 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
}
+static int __lookup_dirent(struct btree_trans *trans,
+ struct bch_hash_info hash_info,
+ subvol_inum dir, struct qstr *name,
+ u64 *target, unsigned *type)
+{
+ struct btree_iter iter;
+ struct bkey_s_c_dirent d;
+ int ret;
+
+ ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
+ &hash_info, dir, name, 0);
+ if (ret)
+ return ret;
+
+ d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
+ *target = le64_to_cpu(d.v->d_inum);
+ *type = d.v->d_type;
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
+static int lookup_dirent(struct btree_trans *trans,
+ struct bch_hash_info hash_info,
+ subvol_inum dir, struct qstr *name,
+ u64 *target, unsigned *type)
+{
+ return lockrestart_do(trans,
+ __lookup_dirent(trans, hash_info, dir, name, target, type));
+}
+
static int __write_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 snapshot)
@@ -100,6 +225,71 @@ static int write_inode(struct btree_trans *trans,
return ret;
}
+static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot)
+{
+ struct btree_iter iter = { NULL };
+ struct bkey_i_inode_generation delete;
+ struct bch_inode_unpacked inode_u;
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL) ?:
+ bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL) ?:
+ bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL);
+ if (ret)
+ goto err;
+retry:
+ bch2_trans_begin(trans);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_inode) {
+ bch2_fs_inconsistent(trans->c,
+ "inode %llu:%u not found when deleting",
+ inum, snapshot);
+ ret = -EIO;
+ goto err;
+ }
+
+ bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
+
+ /* Subvolume root? */
+ if (inode_u.bi_subvol) {
+ ret = bch2_subvolume_delete(trans, inode_u.bi_subvol, -1);
+ if (ret)
+ goto err;
+ }
+
+ bkey_inode_generation_init(&delete.k_i);
+ delete.k.p = iter.pos;
+ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
+
+ ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret == -EINTR)
+ goto retry;
+
+ return ret;
+}
+
static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
{
struct bch_fs *c = trans->c;
@@ -117,7 +307,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash_info, &iter);
+ &dir_hash_info, &iter, 0);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -134,29 +324,49 @@ static int remove_dirent(struct btree_trans *trans, struct bpos pos)
}
/* Get lost+found, create if it doesn't exist: */
-static int lookup_lostfound(struct btree_trans *trans,
+static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
struct bch_inode_unpacked *lostfound)
{
struct bch_fs *c = trans->c;
struct bch_inode_unpacked root;
struct bch_hash_info root_hash_info;
struct qstr lostfound_str = QSTR("lost+found");
- u64 inum;
+ subvol_inum root_inum = { .subvol = subvol };
+ u64 inum = 0;
+ unsigned d_type = 0;
u32 snapshot;
int ret;
- ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root, &snapshot);
- if (ret && ret != -ENOENT)
+ ret = subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
+ if (ret)
+ return ret;
+
+ ret = lookup_inode(trans, root_inum.inum, &root, &snapshot);
+ if (ret) {
+ bch_err(c, "error fetching subvol root: %i", ret);
return ret;
+ }
root_hash_info = bch2_hash_info_init(c, &root);
- inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info,
- &lostfound_str);
- if (!inum) {
+
+ ret = lookup_dirent(trans, root_hash_info, root_inum,
+ &lostfound_str, &inum, &d_type);
+ if (ret == -ENOENT) {
bch_notice(c, "creating lost+found");
goto create_lostfound;
}
+ if (ret) {
+ bch_err(c, "error looking up lost+found: %i", ret);
+ return ret;
+ }
+
+ if (d_type != DT_DIR) {
+ bch_err(c, "error looking up lost+found: not a directory");
+ return -ENOENT;
+ }
+
ret = lookup_inode(trans, inum, lostfound, &snapshot);
if (ret && ret != -ENOENT) {
/*
@@ -174,11 +384,10 @@ create_lostfound:
ret = __bch2_trans_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
- bch2_create_trans(trans,
- BCACHEFS_ROOT_INO, &root,
- lostfound,
- &lostfound_str,
- 0, 0, S_IFDIR|0700, 0, NULL, NULL));
+ bch2_create_trans(trans, root_inum, &root,
+ lostfound, &lostfound_str,
+ 0, 0, S_IFDIR|0700, 0, NULL, NULL,
+ (subvol_inum) { }, 0));
if (ret)
bch_err(c, "error creating lost+found: %i", ret);
}
@@ -187,16 +396,22 @@ create_lostfound:
}
static int reattach_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode)
+ struct bch_inode_unpacked *inode,
+ u32 inode_snapshot)
{
struct bch_hash_info dir_hash;
struct bch_inode_unpacked lostfound;
char name_buf[20];
struct qstr name;
u64 dir_offset = 0;
+ u32 subvol;
int ret;
- ret = lookup_lostfound(trans, &lostfound);
+ ret = snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
+ if (ret)
+ return ret;
+
+ ret = lookup_lostfound(trans, subvol, &lostfound);
if (ret)
return ret;
@@ -214,10 +429,15 @@ static int reattach_inode(struct btree_trans *trans,
name = (struct qstr) QSTR(name_buf);
ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
- bch2_dirent_create(trans, lostfound.bi_inum, &dir_hash,
- mode_to_type(inode->bi_mode),
- &name, inode->bi_inum, &dir_offset,
- BCH_HASH_SET_MUST_CREATE));
+ bch2_dirent_create(trans,
+ (subvol_inum) {
+ .subvol = subvol,
+ .inum = lostfound.bi_inum,
+ },
+ &dir_hash,
+ mode_to_type(inode->bi_mode),
+ &name, inode->bi_inum, &dir_offset,
+ BCH_HASH_SET_MUST_CREATE));
if (ret) {
bch_err(trans->c, "error %i reattaching inode %llu",
ret, inode->bi_inum);
@@ -227,7 +447,7 @@ static int reattach_inode(struct btree_trans *trans,
inode->bi_dir = lostfound.bi_inum;
inode->bi_dir_offset = dir_offset;
- return write_inode(trans, inode, U32_MAX);
+ return write_inode(trans, inode, inode_snapshot);
}
static int remove_backpointer(struct btree_trans *trans,
@@ -254,45 +474,254 @@ out:
return ret;
}
+static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos)
+{
+ pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
+
+ if (bkey_cmp(s->pos, pos))
+ s->nr = 0;
+ s->pos = pos;
+
+ /* Might get called multiple times due to lock restarts */
+ if (s->nr && s->d[s->nr - 1] == pos.snapshot)
+ return 0;
+
+ return snapshots_seen_add(c, s, pos.snapshot);
+}
+
+/**
+ * key_visible_in_snapshot - returns true if @id is a descendant of @ancestor,
+ * and @ancestor hasn't been overwritten in @seen
+ *
+ * That is, returns whether a key in the @ancestor snapshot is visible in the @id snapshot
+ */
+static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
+ u32 id, u32 ancestor)
+{
+ ssize_t i;
+
+ BUG_ON(id > ancestor);
+
+ id = snapshot_t(c, id)->equiv;
+ ancestor = snapshot_t(c, ancestor)->equiv;
+
+ /* @ancestor should be the snapshot most recently added to @seen */
+ BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor);
+ BUG_ON(seen->pos.snapshot != ancestor);
+
+ if (id == ancestor)
+ return true;
+
+ if (!bch2_snapshot_is_ancestor(c, id, ancestor))
+ return false;
+
+ for (i = seen->nr - 2;
+ i >= 0 && seen->d[i] >= id;
+ --i)
+ if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) &&
+ bch2_snapshot_is_ancestor(c, seen->d[i], ancestor))
+ return false;
+
+ return true;
+}
+
+/**
+ * ref_visible - given a key with snapshot id @src that points to a key with
+ * snapshot id @dst, test whether there is some snapshot in which @dst is
+ * visible.
+ *
+ * This assumes we're visiting @src keys in natural key order.
+ *
+ * @s - list of snapshot IDs already seen at @src
+ * @src - snapshot ID of src key
+ * @dst - snapshot ID of dst key
+ */
+static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
+ u32 src, u32 dst)
+{
+ return dst <= src
+ ? key_visible_in_snapshot(c, s, dst, src)
+ : bch2_snapshot_is_ancestor(c, src, dst);
+}
+
+#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
+ for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\
+ if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
+
struct inode_walker {
- bool first_this_inode;
- bool have_inode;
- u64 cur_inum;
- u32 snapshot;
- struct bch_inode_unpacked inode;
+ bool first_this_inode;
+ u64 cur_inum;
+
+ size_t nr;
+ size_t size;
+ struct inode_walker_entry {
+ struct bch_inode_unpacked inode;
+ u32 snapshot;
+ u64 count;
+ } *d;
};
+static void inode_walker_exit(struct inode_walker *w)
+{
+ kfree(w->d);
+ w->d = NULL;
+}
+
static struct inode_walker inode_walker_init(void)
{
- return (struct inode_walker) {
- .cur_inum = -1,
- .have_inode = false,
+ return (struct inode_walker) { 0, };
+}
+
+static int inode_walker_realloc(struct inode_walker *w)
+{
+ if (w->nr == w->size) {
+ size_t new_size = max_t(size_t, 8UL, w->size * 2);
+ void *d = krealloc(w->d, new_size * sizeof(w->d[0]),
+ GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+
+ w->d = d;
+ w->size = new_size;
+ }
+
+ return 0;
+}
+
+static int add_inode(struct bch_fs *c, struct inode_walker *w,
+ struct bkey_s_c_inode inode)
+{
+ struct bch_inode_unpacked u;
+ int ret;
+
+ ret = inode_walker_realloc(w);
+ if (ret)
+ return ret;
+
+ BUG_ON(bch2_inode_unpack(inode, &u));
+
+ w->d[w->nr++] = (struct inode_walker_entry) {
+ .inode = u,
+ .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv,
};
+
+ return 0;
}
static int __walk_inode(struct btree_trans *trans,
- struct inode_walker *w, u64 inum)
+ struct inode_walker *w, struct bpos pos)
{
- if (inum != w->cur_inum) {
- int ret = __lookup_inode(trans, inum, &w->inode, &w->snapshot);
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ unsigned i, ancestor_pos;
+ int ret;
- if (ret && ret != -ENOENT)
- return ret;
+ pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
- w->have_inode = !ret;
- w->cur_inum = inum;
- w->first_this_inode = true;
- } else {
+ if (pos.inode == w->cur_inum) {
w->first_this_inode = false;
+ goto lookup_snapshot;
}
- return 0;
+ w->nr = 0;
+
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (k.k->p.offset != pos.inode)
+ break;
+
+ if (k.k->type == KEY_TYPE_inode)
+ add_inode(c, w, bkey_s_c_to_inode(k));
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ return ret;
+
+ w->cur_inum = pos.inode;
+ w->first_this_inode = true;
+lookup_snapshot:
+ for (i = 0; i < w->nr; i++)
+ if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot))
+ goto found;
+ return INT_MAX;
+found:
+ BUG_ON(pos.snapshot > w->d[i].snapshot);
+
+ if (pos.snapshot != w->d[i].snapshot) {
+ ancestor_pos = i;
+
+ while (i && w->d[i - 1].snapshot > pos.snapshot)
+ --i;
+
+ ret = inode_walker_realloc(w);
+ if (ret)
+ return ret;
+
+ array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]);
+ w->d[i].snapshot = pos.snapshot;
+ w->d[i].count = 0;
+ }
+
+ return i;
}
static int walk_inode(struct btree_trans *trans,
- struct inode_walker *w, u64 inum)
+ struct inode_walker *w, struct bpos pos)
{
- return lockrestart_do(trans, __walk_inode(trans, w, inum));
+ return lockrestart_do(trans, __walk_inode(trans, w, pos));
+}
+
+static int __get_visible_inodes(struct btree_trans *trans,
+ struct inode_walker *w,
+ struct snapshots_seen *s,
+ u64 inum)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ w->nr = 0;
+
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (k.k->p.offset != inum)
+ break;
+
+ if (k.k->type != KEY_TYPE_inode)
+ continue;
+
+ if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) {
+ add_inode(c, w, bkey_s_c_to_inode(k));
+ if (k.k->p.snapshot >= s->pos.snapshot)
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+static int check_key_has_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ char buf[200];
+ int ret = 0;
+
+ if (fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c,
+ "key in missing snapshot: %s",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) {
+ ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
+ bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
+ return ret ?: -EINTR;
+ }
+fsck_err:
+ return ret;
}
static int hash_redo_key(struct btree_trans *trans,
@@ -300,6 +729,9 @@ static int hash_redo_key(struct btree_trans *trans,
struct bch_hash_info *hash_info,
struct btree_iter *k_iter, struct bkey_s_c k)
{
+ bch_err(trans->c, "hash_redo_key() not implemented yet");
+ return -EINVAL;
+#if 0
struct bkey_i *delete;
struct bkey_i *tmp;
@@ -318,6 +750,7 @@ static int hash_redo_key(struct btree_trans *trans,
return bch2_btree_iter_traverse(k_iter) ?:
bch2_trans_update(trans, k_iter, delete, 0) ?:
bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0);
+#endif
}
static int fsck_hash_delete_at(struct btree_trans *trans,
@@ -327,7 +760,7 @@ static int fsck_hash_delete_at(struct btree_trans *trans,
{
int ret;
retry:
- ret = bch2_hash_delete_at(trans, desc, info, iter) ?:
+ ret = bch2_hash_delete_at(trans, desc, info, iter, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW);
@@ -409,30 +842,29 @@ fsck_err:
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_s_c_inode inode)
+ struct bch_inode_unpacked *prev,
+ struct bch_inode_unpacked u)
{
struct bch_fs *c = trans->c;
- struct bch_inode_unpacked u;
bool do_update = false;
int ret = 0;
- ret = bch2_inode_unpack(inode, &u);
-
- if (bch2_fs_inconsistent_on(ret, c,
- "error unpacking inode %llu in fsck",
- inode.k->p.inode))
- return ret;
+ if (fsck_err_on(prev &&
+ (prev->bi_hash_seed != u.bi_hash_seed ||
+ mode_to_type(prev->bi_mode) != mode_to_type(u.bi_mode)), c,
+ "inodes in different snapshots don't match")) {
+ bch_err(c, "repair not implemented yet");
+ return -EINVAL;
+ }
if (u.bi_flags & BCH_INODE_UNLINKED &&
(!c->sb.clean ||
fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
u.bi_inum))) {
- bch_verbose(c, "deleting inode %llu", u.bi_inum);
-
bch2_trans_unlock(trans);
bch2_fs_lazy_rw(c);
- ret = bch2_inode_rm(c, u.bi_inum, false);
+ ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot);
if (ret)
bch_err(c, "error in fsck: error %i while deleting inode", ret);
return ret;
@@ -452,9 +884,10 @@ static int check_inode(struct btree_trans *trans,
* just switch units to bytes and that issue goes away
*/
ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
- POS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9),
+ SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
+ iter->pos.snapshot),
POS(u.bi_inum, U64_MAX),
- NULL);
+ 0, NULL);
if (ret) {
bch_err(c, "error in fsck: error %i truncating inode", ret);
return ret;
@@ -479,7 +912,7 @@ static int check_inode(struct btree_trans *trans,
bch_verbose(c, "recounting sectors for inode %llu",
u.bi_inum);
- sectors = bch2_count_inode_sectors(trans, u.bi_inum);
+ sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
if (sectors < 0) {
bch_err(c, "error in fsck: error %i recounting inode sectors",
(int) sectors);
@@ -499,11 +932,7 @@ static int check_inode(struct btree_trans *trans,
}
if (do_update) {
- ret = __bch2_trans_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- bch2_btree_iter_traverse(iter) ?:
- bch2_inode_write(trans, iter, &u));
+ ret = write_inode(trans, &u, iter->pos.snapshot);
if (ret)
bch_err(c, "error in fsck: error %i "
"updating inode", ret);
@@ -519,26 +948,49 @@ static int check_inodes(struct bch_fs *c, bool full)
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_c_inode inode;
+ struct bch_inode_unpacked prev, u;
int ret;
+ memset(&prev, 0, sizeof(prev));
+
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH, k, ret) {
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ ret = check_key_has_snapshot(&trans, &iter, k);
+ if (ret)
+ break;
+
+ /*
+ * if snapshot id isn't a leaf node, skip it - deletion in
+ * particular is not atomic, so on the internal snapshot nodes
+ * we can see inodes marked for deletion after a clean shutdown
+ */
+ if (bch2_snapshot_internal_node(c, k.k->p.snapshot))
+ continue;
+
if (k.k->type != KEY_TYPE_inode)
continue;
inode = bkey_s_c_to_inode(k);
- if (full ||
- (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY|
- BCH_INODE_I_SECTORS_DIRTY|
- BCH_INODE_UNLINKED))) {
- ret = check_inode(&trans, &iter, inode);
- if (ret)
- break;
- }
+ if (!full &&
+ !(inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY|
+ BCH_INODE_I_SECTORS_DIRTY|
+ BCH_INODE_UNLINKED)))
+ continue;
+
+ BUG_ON(bch2_inode_unpack(inode, &u));
+
+ ret = check_inode(&trans, &iter,
+ full && prev.bi_inum == u.bi_inum
+ ? &prev : NULL, u);
+ if (ret)
+ break;
+
+ prev = u;
}
bch2_trans_iter_exit(&trans, &iter);
@@ -547,6 +999,29 @@ static int check_inodes(struct bch_fs *c, bool full)
return bch2_trans_exit(&trans) ?: ret;
}
+noinline_for_stack
+static int check_subvols(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, POS_MIN,
+ 0, k, ret) {
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
+/*
+ * Checking for overlapping extents needs to be reimplemented
+ */
+#if 0
static int fix_overlapping_extent(struct btree_trans *trans,
struct bkey_s_c k, struct bpos cut_at)
{
@@ -582,16 +1057,18 @@ static int fix_overlapping_extent(struct btree_trans *trans,
bch2_trans_iter_exit(trans, &iter);
return ret;
}
+#endif
static int inode_backpointer_exists(struct btree_trans *trans,
- struct bch_inode_unpacked *inode)
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
- POS(inode->bi_dir, inode->bi_dir_offset), 0);
+ SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot), 0);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
@@ -612,6 +1089,144 @@ static bool inode_backpointer_matches(struct bkey_s_c_dirent d,
d.k->p.offset == inode->bi_dir_offset;
}
+static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+{
+ struct bch_fs *c = trans->c;
+ struct inode_walker_entry *i;
+ int ret = 0, ret2 = 0;
+ s64 count2;
+
+ for (i = w->d; i < w->d + w->nr; i++) {
+ if (i->inode.bi_sectors == i->count)
+ continue;
+
+ count2 = lockrestart_do(trans,
+ bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot));
+
+ if (i->count != count2) {
+ bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu",
+ i->count, count2);
+ i->count = count2;
+ if (i->inode.bi_sectors == i->count)
+ continue;
+ }
+
+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
+ "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
+ w->cur_inum, i->snapshot,
+ i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE)
+ continue;
+
+ i->inode.bi_sectors = i->count;
+ ret = write_inode(trans, &i->inode, i->snapshot);
+ if (ret)
+ break;
+ ret2 = -EINTR;
+ }
+fsck_err:
+ return ret ?: ret2;
+}
+
+static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
+ struct inode_walker *inode,
+ struct snapshots_seen *s)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ struct inode_walker_entry *i;
+ char buf[200];
+ int ret = 0;
+
+ k = bch2_btree_iter_peek(iter);
+ if (!k.k)
+ return 0;
+
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret)
+ return ret;
+
+ ret = snapshots_seen_update(c, s, k.k->p);
+ if (ret)
+ return ret;
+
+ if (k.k->type == KEY_TYPE_whiteout)
+ return 0;
+
+ if (inode->cur_inum != k.k->p.inode) {
+ ret = check_i_sectors(trans, inode);
+ if (ret)
+ return ret;
+ }
+#if 0
+ if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
+ char buf1[200];
+ char buf2[200];
+
+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
+ bch2_bkey_val_to_text(&PBUF(buf2), c, k);
+
+ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
+ return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR;
+ }
+#endif
+ ret = __walk_inode(trans, inode, k.k->p);
+ if (ret < 0)
+ return ret;
+
+ if (fsck_err_on(ret == INT_MAX, c,
+ "extent in missing inode:\n %s",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+ return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
+ bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
+
+ if (ret == INT_MAX)
+ return 0;
+
+ i = inode->d + ret;
+ ret = 0;
+
+ if (fsck_err_on(!S_ISREG(i->inode.bi_mode) &&
+ !S_ISLNK(i->inode.bi_mode), c,
+ "extent in non regular inode mode %o:\n %s",
+ i->inode.bi_mode,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+ return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
+ bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
+
+ if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) {
+ for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) {
+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+ k.k->type != KEY_TYPE_reservation &&
+ k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c,
+ "extent type %u offset %llu past end of inode %llu, i_size %llu",
+ k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) {
+ bch2_fs_lazy_rw(c);
+ return bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+ SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9,
+ k.k->p.snapshot),
+ POS(k.k->p.inode, U64_MAX),
+ 0, NULL) ?: -EINTR;
+ }
+ }
+ }
+
+ if (bkey_extent_is_allocation(k.k))
+ for_each_visible_inode(c, s, inode, k.k->p.snapshot, i)
+ i->count += k.k->size;
+#if 0
+ bch2_bkey_buf_reassemble(&prev, c, k);
+#endif
+
+fsck_err:
+ return ret;
+}
+
/*
* Walk extents: verify that extents have a corresponding S_ISREG inode, and
 * that i_size and i_sectors are consistent
@@ -620,15 +1235,17 @@ noinline_for_stack
static int check_extents(struct bch_fs *c)
{
struct inode_walker w = inode_walker_init();
+ struct snapshots_seen s;
struct btree_trans trans;
struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_buf prev;
- u64 i_sectors = 0;
int ret = 0;
+#if 0
+ struct bkey_buf prev;
bch2_bkey_buf_init(&prev);
prev.k->k = KEY(0, 0, 0);
+#endif
+ snapshots_seen_init(&s);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
bch_verbose(c, "checking extents");
@@ -636,95 +1253,172 @@ static int check_extents(struct bch_fs *c)
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH);
-retry:
- while ((k = bch2_btree_iter_peek(&iter)).k &&
- !(ret = bkey_err(k))) {
- if (w.have_inode &&
- w.cur_inum != k.k->p.inode &&
- !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
- fsck_err_on(w.inode.bi_sectors != i_sectors, c,
- "inode %llu has incorrect i_sectors: got %llu, should be %llu",
- w.inode.bi_inum,
- w.inode.bi_sectors, i_sectors)) {
- w.inode.bi_sectors = i_sectors;
-
- ret = write_inode(&trans, &w.inode, w.snapshot);
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
+
+ do {
+ ret = lockrestart_do(&trans,
+ check_extent(&trans, &iter, &w, &s));
+ if (ret)
+ break;
+ } while (bch2_btree_iter_advance(&iter));
+ bch2_trans_iter_exit(&trans, &iter);
+#if 0
+ bch2_bkey_buf_exit(&prev, c);
+#endif
+ inode_walker_exit(&w);
+ bch2_trans_exit(&trans);
+ snapshots_seen_exit(&s);
+
+ return ret;
+}
+
+static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+{
+ struct bch_fs *c = trans->c;
+ struct inode_walker_entry *i;
+ int ret = 0, ret2 = 0;
+ s64 count2;
+
+ for (i = w->d; i < w->d + w->nr; i++) {
+ if (i->inode.bi_nlink == i->count)
+ continue;
+
+ count2 = lockrestart_do(trans,
+ bch2_count_subdirs(trans, w->cur_inum, i->snapshot));
+
+ if (i->count != count2) {
+ bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
+ i->count, count2);
+ i->count = count2;
+ if (i->inode.bi_nlink == i->count)
+ continue;
+ }
+
+ if (fsck_err_on(i->inode.bi_nlink != i->count, c,
+ "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
+ w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) {
+ i->inode.bi_nlink = i->count;
+ ret = write_inode(trans, &i->inode, i->snapshot);
if (ret)
break;
+ ret2 = -EINTR;
}
+ }
+fsck_err:
+ return ret ?: ret2;
+}
+
+static int check_dirent_target(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ u32 target_snapshot)
+{
+ struct bch_fs *c = trans->c;
+ bool backpointer_exists = true;
+ char buf[200];
+ int ret = 0;
- if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
- char buf1[200];
- char buf2[200];
+ if (!target->bi_dir &&
+ !target->bi_dir_offset) {
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
- bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
- bch2_bkey_val_to_text(&PBUF(buf2), c, k);
+ ret = write_inode(trans, target, target_snapshot);
+ if (ret)
+ goto err;
+ }
+
+ if (!inode_backpointer_matches(d, target)) {
+ ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
+ if (ret < 0)
+ goto err;
- if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
- return fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR;
+ backpointer_exists = ret;
+ ret = 0;
+
+ if (fsck_err_on(S_ISDIR(target->bi_mode) &&
+ backpointer_exists, c,
+ "directory %llu with multiple links",
+ target->bi_inum)) {
+ ret = remove_dirent(trans, d.k->p);
+ if (ret)
+ goto err;
+ return 0;
}
- ret = walk_inode(&trans, &w, k.k->p.inode);
- if (ret)
- break;
+ if (fsck_err_on(backpointer_exists &&
+ !target->bi_nlink, c,
+ "inode %llu has multiple links but i_nlink 0",
+ target->bi_inum)) {
+ target->bi_nlink++;
+ target->bi_flags &= ~BCH_INODE_UNLINKED;
- if (w.first_this_inode)
- i_sectors = 0;
-
- if (fsck_err_on(!w.have_inode, c,
- "extent type %u for missing inode %llu",
- k.k->type, k.k->p.inode) ||
- fsck_err_on(w.have_inode &&
- !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c,
- "extent type %u for non regular file, inode %llu mode %o",
- k.k->type, k.k->p.inode, w.inode.bi_mode)) {
- bch2_fs_lazy_rw(c);
- return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
- POS(k.k->p.inode, 0),
- POS(k.k->p.inode, U64_MAX),
- NULL) ?: -EINTR;
+ ret = write_inode(trans, target, target_snapshot);
+ if (ret)
+ goto err;
}
- if (fsck_err_on(w.have_inode &&
- !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
- k.k->type != KEY_TYPE_reservation &&
- k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c,
- "extent type %u offset %llu past end of inode %llu, i_size %llu",
- k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
- bch2_fs_lazy_rw(c);
- return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
- POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c)) >> 9),
- POS(k.k->p.inode, U64_MAX),
- NULL) ?: -EINTR;
+ if (fsck_err_on(!backpointer_exists, c,
+ "inode %llu has wrong backpointer:\n"
+ "got %llu:%llu\n"
+ "should be %llu:%llu",
+ target->bi_inum,
+ target->bi_dir,
+ target->bi_dir_offset,
+ d.k->p.inode,
+ d.k->p.offset)) {
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+
+ ret = write_inode(trans, target, target_snapshot);
+ if (ret)
+ goto err;
}
+ }
- if (bkey_extent_is_allocation(k.k))
- i_sectors += k.k->size;
- bch2_bkey_buf_reassemble(&prev, c, k);
+ if (fsck_err_on(vfs_d_type(d.v->d_type) != mode_to_type(target->bi_mode), c,
+ "incorrect d_type: should be %u:\n%s",
+ mode_to_type(target->bi_mode),
+ (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) {
+ struct bkey_i_dirent *n;
- bch2_btree_iter_advance(&iter);
+ n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
+ if (!n) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ bkey_reassemble(&n->k_i, d.s_c);
+ n->v.d_type = mode_to_type(target->bi_mode);
+
+ ret = __bch2_trans_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ bch2_trans_update(trans, iter, &n->k_i, 0));
+ kfree(n);
+ if (ret)
+ goto err;
}
+err:
fsck_err:
- if (ret == -EINTR)
- goto retry;
- bch2_trans_iter_exit(&trans, &iter);
- bch2_bkey_buf_exit(&prev, c);
- return bch2_trans_exit(&trans) ?: ret;
+ return ret;
}
static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct bch_hash_info *hash_info,
- struct inode_walker *w, unsigned *nr_subdirs)
+ struct inode_walker *dir,
+ struct inode_walker *target,
+ struct snapshots_seen *s)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k;
struct bkey_s_c_dirent d;
- struct bch_inode_unpacked target;
+ struct inode_walker_entry *i;
u32 target_snapshot;
- bool have_target;
- bool backpointer_exists = true;
- u64 d_inum;
+ u32 target_subvol;
+ u64 target_inum;
char buf[200];
int ret;
@@ -736,38 +1430,49 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
return ret;
- if (w->have_inode &&
- w->cur_inum != k.k->p.inode &&
- fsck_err_on(w->inode.bi_nlink != *nr_subdirs, c,
- "directory %llu with wrong i_nlink: got %u, should be %u",
- w->inode.bi_inum, w->inode.bi_nlink, *nr_subdirs)) {
- w->inode.bi_nlink = *nr_subdirs;
- ret = write_inode(trans, &w->inode, w->snapshot);
- return ret ?: -EINTR;
- }
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret)
+ return ret;
- ret = __walk_inode(trans, w, k.k->p.inode);
+ ret = snapshots_seen_update(c, s, k.k->p);
if (ret)
return ret;
- if (w->first_this_inode)
- *nr_subdirs = 0;
+ if (k.k->type == KEY_TYPE_whiteout)
+ return 0;
+
+ if (dir->cur_inum != k.k->p.inode) {
+ ret = check_subdir_count(trans, dir);
+ if (ret)
+ return ret;
+ }
- if (fsck_err_on(!w->have_inode, c,
+ ret = __walk_inode(trans, dir, k.k->p);
+ if (ret < 0)
+ return ret;
+
+ if (fsck_err_on(ret == INT_MAX, c,
"dirent in nonexisting directory:\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)) ||
- fsck_err_on(!S_ISDIR(w->inode.bi_mode), c,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+ return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
+ bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
+
+ if (ret == INT_MAX)
+ return 0;
+
+ i = dir->d + ret;
+ ret = 0;
+
+ if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
"dirent in non directory inode type %u:\n%s",
- mode_to_type(w->inode.bi_mode),
+ mode_to_type(i->inode.bi_mode),
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
return __bch2_trans_do(trans, NULL, NULL, 0,
bch2_btree_delete_at(trans, iter, 0));
- if (!w->have_inode)
- return 0;
-
- if (w->first_this_inode)
- *hash_info = bch2_hash_info_init(c, &w->inode);
+ if (dir->first_this_inode)
+ *hash_info = bch2_hash_info_init(c, &dir->d[0].inode);
ret = hash_check_key(trans, bch2_dirent_hash_desc,
hash_info, iter, k);
@@ -780,105 +1485,76 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
return 0;
d = bkey_s_c_to_dirent(k);
- d_inum = le64_to_cpu(d.v->d_inum);
- ret = __lookup_inode(trans, d_inum, &target, &target_snapshot);
+ ret = __bch2_dirent_read_target(trans, d,
+ &target_subvol,
+ &target_snapshot,
+ &target_inum,
+ true);
if (ret && ret != -ENOENT)
return ret;
- have_target = !ret;
- ret = 0;
-
- if (fsck_err_on(!have_target, c,
- "dirent points to missing inode:\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c,
- k), buf)))
+ if (fsck_err_on(ret, c,
+ "dirent points to missing subvolume %llu",
+ le64_to_cpu(d.v->d_inum)))
return remove_dirent(trans, d.k->p);
- if (!have_target)
- return 0;
-
- if (!target.bi_dir &&
- !target.bi_dir_offset) {
- target.bi_dir = k.k->p.inode;
- target.bi_dir_offset = k.k->p.offset;
+ if (target_subvol) {
+ struct bch_inode_unpacked subvol_root;
- ret = __write_inode(trans, &target, target_snapshot) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
- if (ret)
- return ret;
- return -EINTR;
- }
-
- if (!inode_backpointer_matches(d, &target)) {
- ret = inode_backpointer_exists(trans, &target);
- if (ret < 0)
+ ret = __lookup_inode(trans, target_inum,
+ &subvol_root, &target_snapshot);
+ if (ret && ret != -ENOENT)
return ret;
- backpointer_exists = ret;
- ret = 0;
+ if (fsck_err_on(ret, c,
+ "subvolume %u points to missing subvolume root %llu",
+ target_subvol,
+ target_inum)) {
+ bch_err(c, "repair not implemented yet");
+ return -EINVAL;
+ }
- if (fsck_err_on(S_ISDIR(target.bi_mode) &&
- backpointer_exists, c,
- "directory %llu with multiple links",
- target.bi_inum))
- return remove_dirent(trans, d.k->p);
+ if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
+ "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
+ target_inum,
+ subvol_root.bi_subvol, target_subvol)) {
+ subvol_root.bi_subvol = target_subvol;
+ ret = write_inode(trans, &subvol_root, target_snapshot);
+ if (ret)
+ return ret;
+ }
- if (fsck_err_on(backpointer_exists &&
- !target.bi_nlink, c,
- "inode %llu has multiple links but i_nlink 0",
- d_inum)) {
- target.bi_nlink++;
- target.bi_flags &= ~BCH_INODE_UNLINKED;
+ ret = check_dirent_target(trans, iter, d, &subvol_root,
+ target_snapshot);
+ if (ret)
+ return ret;
+ } else {
+ ret = __get_visible_inodes(trans, target, s, target_inum);
+ if (ret)
+ return ret;
- ret = write_inode(trans, &target, target_snapshot);
- return ret ?: -EINTR;
+ if (fsck_err_on(!target->nr, c,
+ "dirent points to missing inode:\n%s",
+ (bch2_bkey_val_to_text(&PBUF(buf), c,
+ k), buf))) {
+ ret = remove_dirent(trans, d.k->p);
+ if (ret)
+ return ret;
}
- if (fsck_err_on(!backpointer_exists, c,
- "inode %llu has wrong backpointer:\n"
- "got %llu:%llu\n"
- "should be %llu:%llu",
- d_inum,
- target.bi_dir,
- target.bi_dir_offset,
- k.k->p.inode,
- k.k->p.offset)) {
- target.bi_dir = k.k->p.inode;
- target.bi_dir_offset = k.k->p.offset;
-
- ret = write_inode(trans, &target, target_snapshot);
- return ret ?: -EINTR;
+ for (i = target->d; i < target->d + target->nr; i++) {
+ ret = check_dirent_target(trans, iter, d,
+ &i->inode, i->snapshot);
+ if (ret)
+ return ret;
}
}
- if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c,
- "incorrect d_type: should be %u:\n%s",
- mode_to_type(target.bi_mode),
- (bch2_bkey_val_to_text(&PBUF(buf), c,
- k), buf))) {
- struct bkey_i_dirent *n;
-
- n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
- if (!n)
- return -ENOMEM;
-
- bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_type = mode_to_type(target.bi_mode);
-
- ret = __bch2_trans_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- bch2_btree_iter_traverse(iter) ?:
- bch2_trans_update(trans, iter, &n->k_i, 0));
- kfree(n);
- return ret ?: -EINTR;
- }
+ if (d.v->d_type == DT_DIR)
+ for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
+ i->count++;
- *nr_subdirs += d.v->d_type == DT_DIR;
- return 0;
fsck_err:
return ret;
}
@@ -890,31 +1566,39 @@ fsck_err:
noinline_for_stack
static int check_dirents(struct bch_fs *c)
{
- struct inode_walker w = inode_walker_init();
+ struct inode_walker dir = inode_walker_init();
+ struct inode_walker target = inode_walker_init();
+ struct snapshots_seen s;
struct bch_hash_info hash_info;
struct btree_trans trans;
struct btree_iter iter;
- unsigned nr_subdirs = 0;
int ret = 0;
bch_verbose(c, "checking dirents");
+ snapshots_seen_init(&s);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents,
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
do {
ret = lockrestart_do(&trans,
- check_dirent(&trans, &iter, &hash_info, &w, &nr_subdirs));
+ check_dirent(&trans, &iter, &hash_info,
+ &dir, &target, &s));
if (ret)
break;
} while (bch2_btree_iter_advance(&iter));
bch2_trans_iter_exit(&trans, &iter);
- return bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
+ snapshots_seen_exit(&s);
+ inode_walker_exit(&dir);
+ inode_walker_exit(&target);
+ return ret;
}
/*
@@ -937,15 +1621,22 @@ static int check_xattrs(struct bch_fs *c)
bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
retry:
+ bch2_trans_begin(&trans);
+
while ((k = bch2_btree_iter_peek(&iter)).k &&
!(ret = bkey_err(k))) {
- ret = walk_inode(&trans, &w, k.k->p.inode);
+ ret = check_key_has_snapshot(&trans, &iter, k);
if (ret)
break;
- if (fsck_err_on(!w.have_inode, c,
+ ret = walk_inode(&trans, &w, k.k->p);
+ if (ret < 0)
+ break;
+
+ if (fsck_err_on(ret == INT_MAX, c,
"xattr for missing inode %llu",
k.k->p.inode)) {
ret = bch2_btree_delete_at(&trans, &iter, 0);
@@ -954,14 +1645,18 @@ retry:
continue;
}
- if (w.first_this_inode && w.have_inode)
- hash_info = bch2_hash_info_init(c, &w.inode);
+ if (ret == INT_MAX)
+ goto next;
+ ret = 0;
+
+ if (w.first_this_inode)
+ hash_info = bch2_hash_info_init(c, &w.d[0].inode);
ret = hash_check_key(&trans, bch2_xattr_hash_desc,
&hash_info, &iter, k);
if (ret)
break;
-
+next:
bch2_btree_iter_advance(&iter);
}
fsck_err:
@@ -973,40 +1668,63 @@ fsck_err:
}
/* Get root directory, create if it doesn't exist: */
-static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
+static int check_root(struct bch_fs *c)
{
- struct bkey_inode_buf packed;
+ struct btree_trans trans;
+ struct bch_inode_unpacked root_inode;
u32 snapshot;
+ u64 inum;
int ret;
+ bch2_trans_init(&trans, c, 0, 0);
+
bch_verbose(c, "checking root directory");
- ret = bch2_trans_do(c, NULL, NULL, 0,
- lookup_inode(&trans, BCACHEFS_ROOT_INO, root_inode, &snapshot));
+ ret = subvol_lookup(&trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
if (ret && ret != -ENOENT)
return ret;
- if (fsck_err_on(ret, c, "root directory missing"))
- goto create_root;
+ if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
+ struct bkey_i_subvolume root_subvol;
- if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c,
- "root inode not a directory"))
- goto create_root;
+ snapshot = U32_MAX;
+ inum = BCACHEFS_ROOT_INO;
- return 0;
-fsck_err:
- return ret;
-create_root:
- bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755,
- 0, NULL);
- root_inode->bi_inum = BCACHEFS_ROOT_INO;
+ bkey_subvolume_init(&root_subvol.k_i);
+ root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_subvol.v.flags = 0;
+ root_subvol.v.snapshot = cpu_to_le32(snapshot);
+ root_subvol.v.inode = cpu_to_le64(inum);
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ __bch2_btree_insert(&trans, BTREE_ID_subvolumes, &root_subvol.k_i));
+ if (ret) {
+ bch_err(c, "error writing root subvol: %i", ret);
+ goto err;
+ }
- bch2_inode_pack(c, &packed, root_inode);
+ }
- return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
- NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
+ ret = lookup_inode(&trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
+ if (ret && ret != -ENOENT)
+ return ret;
+
+ if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
+ mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
+ "root inode not a directory")) {
+ bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
+ 0, NULL);
+ root_inode.bi_inum = inum;
+
+ ret = write_inode(&trans, &root_inode, snapshot);
+ if (ret)
+ bch_err(c, "error writing root inode: %i", ret);
+ }
+err:
+fsck_err:
+ bch2_trans_exit(&trans);
+ return ret;
}
struct pathbuf {
@@ -1041,29 +1759,30 @@ static int path_down(struct pathbuf *p, u64 inum)
static int check_path(struct btree_trans *trans,
struct pathbuf *p,
- struct bch_inode_unpacked *inode)
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
{
struct bch_fs *c = trans->c;
- u32 snapshot;
size_t i;
int ret = 0;
+ snapshot = snapshot_t(c, snapshot)->equiv;
p->nr = 0;
while (inode->bi_inum != BCACHEFS_ROOT_INO) {
ret = lockrestart_do(trans,
- inode_backpointer_exists(trans, inode));
+ inode_backpointer_exists(trans, inode, snapshot));
if (ret < 0)
break;
if (!ret) {
- if (fsck_err(c, "unreachable inode %llu, type %u nlink %u backptr %llu:%llu",
- inode->bi_inum,
+ if (fsck_err(c, "unreachable inode %llu:%u, type %u nlink %u backptr %llu:%llu",
+ inode->bi_inum, snapshot,
mode_to_type(inode->bi_mode),
inode->bi_nlink,
inode->bi_dir,
inode->bi_dir_offset))
- ret = reattach_inode(trans, inode);
+ ret = reattach_inode(trans, inode, snapshot);
break;
}
ret = 0;
@@ -1086,13 +1805,13 @@ static int check_path(struct btree_trans *trans,
return 0;
ret = lockrestart_do(trans,
- remove_backpointer(trans, inode));
+ remove_backpointer(trans, inode));
if (ret) {
bch_err(c, "error removing dirent: %i", ret);
break;
}
- ret = reattach_inode(trans, inode);
+ ret = reattach_inode(trans, inode, snapshot);
break;
}
@@ -1127,7 +1846,8 @@ static int check_directory_structure(struct bch_fs *c)
for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH, k, ret) {
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
if (k.k->type != KEY_TYPE_inode)
continue;
@@ -1138,7 +1858,10 @@ static int check_directory_structure(struct bch_fs *c)
break;
}
- ret = check_path(&trans, &path, &u);
+ if (u.bi_flags & BCH_INODE_UNLINKED)
+ continue;
+
+ ret = check_path(&trans, &path, &u, iter.pos.snapshot);
if (ret)
break;
}
@@ -1196,8 +1919,9 @@ static int nlink_cmp(const void *_l, const void *_r)
return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
}
-static void inc_link(struct bch_fs *c, struct nlink_table *links,
- u64 range_start, u64 range_end, u64 inum)
+static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
+ struct nlink_table *links,
+ u64 range_start, u64 range_end, u64 inum, u32 snapshot)
{
struct nlink *link, key = {
.inum = inum, .snapshot = U32_MAX,
@@ -1208,8 +1932,18 @@ static void inc_link(struct bch_fs *c, struct nlink_table *links,
link = __inline_bsearch(&key, links->d, links->nr,
sizeof(links->d[0]), nlink_cmp);
- if (link)
- link->count++;
+ if (!link)
+ return;
+
+ while (link > links->d && link[0].inum == link[-1].inum)
+ --link;
+
+ for (; link < links->d + links->nr && link->inum == inum; link++)
+ if (ref_visible(c, s, snapshot, link->snapshot)) {
+ link->count++;
+ if (link->snapshot >= snapshot)
+ break;
+ }
}
noinline_for_stack
@@ -1229,7 +1963,8 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
for_each_btree_key(&trans, iter, BTREE_ID_inodes,
POS(0, start),
BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH, k, ret) {
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
if (k.k->type != KEY_TYPE_inode)
continue;
@@ -1270,23 +2005,33 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
u64 range_start, u64 range_end)
{
struct btree_trans trans;
+ struct snapshots_seen s;
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_c_dirent d;
int ret;
+ snapshots_seen_init(&s);
+
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN,
BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH, k, ret) {
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ ret = snapshots_seen_update(c, &s, k.k->p);
+ if (ret)
+ break;
+
switch (k.k->type) {
case KEY_TYPE_dirent:
d = bkey_s_c_to_dirent(k);
- if (d.v->d_type != DT_DIR)
- inc_link(c, links, range_start, range_end,
- le64_to_cpu(d.v->d_inum));
+ if (d.v->d_type != DT_DIR &&
+ d.v->d_type != DT_SUBVOL)
+ inc_link(c, &s, links, range_start, range_end,
+ le64_to_cpu(d.v->d_inum),
+ d.k->p.snapshot);
break;
}
@@ -1294,10 +2039,11 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
}
bch2_trans_iter_exit(&trans, &iter);
- ret = bch2_trans_exit(&trans) ?: ret;
if (ret)
bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
+ bch2_trans_exit(&trans);
+ snapshots_seen_exit(&s);
return ret;
}
@@ -1319,7 +2065,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
for_each_btree_key(&trans, iter, BTREE_ID_inodes,
POS(0, range_start),
BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH, k, ret) {
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
if (k.k->p.offset >= range_end)
break;
@@ -1335,7 +2082,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
if (!u.bi_nlink)
continue;
- while (link->inum < k.k->p.offset) {
+ while ((cmp_int(link->inum, k.k->p.offset) ?:
+ cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
link++;
BUG_ON(link >= links->d + links->nr);
}
@@ -1408,13 +2156,13 @@ static int check_nlinks(struct bch_fs *c)
*/
int bch2_fsck_full(struct bch_fs *c)
{
- struct bch_inode_unpacked root_inode;
-
- return check_inodes(c, true) ?:
+ return bch2_fs_snapshots_check(c) ?:
+ check_inodes(c, true) ?:
+ check_subvols(c) ?:
check_extents(c) ?:
check_dirents(c) ?:
check_xattrs(c) ?:
- check_root(c, &root_inode) ?:
+ check_root(c) ?:
check_directory_structure(c) ?:
check_nlinks(c);
}
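
The fsck rework above leans on one idea repeatedly: a key written in an ancestor snapshot is visible in a descendant snapshot unless some snapshot in between has already overwritten it, which is what key_visible_in_snapshot() and ref_visible() test against the ordered list of snapshot IDs seen so far. Below is a minimal, self-contained sketch of that test over a toy snapshot tree; the parent[] table and helper names are illustrative assumptions, not code from this commit (child snapshots get lower IDs than their ancestors, which the sketch mirrors).

/* Illustrative only: a toy model of the snapshot-visibility test. */
#include <stdbool.h>
#include <stdio.h>

#define NR_SNAPSHOTS 8

/* parent[id] is the parent snapshot of @id; 0 means "no parent" (root). */
static const unsigned parent[NR_SNAPSHOTS] = {
	[7] = 0,		/* root */
	[6] = 7, [5] = 7,	/* children of 7 */
	[4] = 6, [3] = 6,	/* children of 6 */
};

/* children always have lower IDs than their ancestors */
static bool is_ancestor(unsigned id, unsigned ancestor)
{
	while (id && id < ancestor)
		id = parent[id];
	return id == ancestor;
}

/*
 * @seen is the list of snapshot IDs (ascending) in which a key has
 * already been seen at this position; the last entry is the snapshot
 * the current key was written in.  That key is visible in snapshot @id
 * unless some intermediate snapshot in @seen overwrites it.
 */
static bool key_visible(unsigned id, const unsigned *seen, int nr)
{
	unsigned ancestor = seen[nr - 1];
	int i;

	if (id == ancestor)
		return true;
	if (!is_ancestor(id, ancestor))
		return false;

	for (i = nr - 2; i >= 0 && seen[i] >= id; --i)
		if (is_ancestor(id, seen[i]) &&
		    is_ancestor(seen[i], ancestor))
			return false;
	return true;
}

int main(void)
{
	/* keys at the same position were seen in snapshots 6 and 7: */
	unsigned seen[] = { 6, 7 };

	/* snapshot 4 descends from 6, so 6's copy hides 7's: */
	printf("visible in 4: %d\n", key_visible(4, seen, 2));	/* 0 */
	/* snapshot 5 descends from 7 directly, so it still sees 7's copy: */
	printf("visible in 5: %d\n", key_visible(5, seen, 2));	/* 1 */
	return 0;
}
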
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 14b0e8c0..9130d571 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -6,8 +6,10 @@
#include "btree_update.h"
#include "error.h"
#include "extents.h"
+#include "extent_update.h"
#include "inode.h"
#include "str_hash.h"
+#include "subvolume.h"
#include "varint.h"
#include <linux/random.h>
@@ -295,15 +297,21 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode,
int bch2_inode_peek(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode,
- u64 inum, unsigned flags)
+ subvol_inum inum, unsigned flags)
{
struct bkey_s_c k;
+ u32 snapshot;
int ret;
- if (trans->c->opts.inodes_use_key_cache)
+ if (0 && trans->c->opts.inodes_use_key_cache)
flags |= BTREE_ITER_CACHED;
- bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, inum), flags);
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ bch2_trans_iter_init(trans, iter, BTREE_ID_inodes,
+ SPOS(0, inum.inum, snapshot), flags);
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
@@ -340,8 +348,8 @@ int bch2_inode_write(struct btree_trans *trans,
const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
- struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
- struct bch_inode_unpacked unpacked;
+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+ struct bch_inode_unpacked unpacked;
if (k.k->p.inode)
return "nonzero k.p.inode";
@@ -368,6 +376,9 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
unpacked.bi_nlink != 0)
return "flagged as unlinked but bi_nlink != 0";
+ if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode))
+ return "subvolume root but not a directory";
+
return NULL;
}
@@ -482,6 +493,9 @@ static inline u32 bkey_generation(struct bkey_s_c k)
}
}
+/*
+ * This just finds an empty slot:
+ */
int bch2_inode_create(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode_u,
@@ -581,19 +595,77 @@ found_slot:
return 0;
}
-int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
+static int bch2_inode_delete_keys(struct btree_trans *trans,
+ subvol_inum inum, enum btree_id id)
+{
+ u64 offset = 0;
+ int ret = 0;
+
+ while (!ret || ret == -EINTR) {
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i delete;
+ u32 snapshot;
+
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ continue;
+
+ bch2_trans_iter_init(trans, &iter, id,
+ SPOS(inum.inum, offset, snapshot),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek(&iter);
+
+ if (!k.k || iter.pos.inode != inum.inum) {
+ bch2_trans_iter_exit(trans, &iter);
+ break;
+ }
+
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ bkey_init(&delete.k);
+ delete.k.p = iter.pos;
+
+ if (btree_node_type_is_extents(iter.btree_id)) {
+ unsigned max_sectors =
+ min_t(u64, U64_MAX - iter.pos.offset,
+ KEY_SIZE_MAX & (~0 << trans->c->block_bits));
+
+ /* create the biggest key we can */
+ bch2_key_resize(&delete.k, max_sectors);
+
+ ret = bch2_extent_trim_atomic(trans, &iter, &delete);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+err:
+ offset = iter.pos.offset;
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ return ret;
+}
+
+int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached)
{
struct btree_trans trans;
struct btree_iter iter = { NULL };
struct bkey_i_inode_generation delete;
- struct bpos start = POS(inode_nr, 0);
- struct bpos end = POS(inode_nr + 1, 0);
struct bch_inode_unpacked inode_u;
struct bkey_s_c k;
unsigned iter_flags = BTREE_ITER_INTENT;
+ u32 snapshot;
int ret;
- if (cached && c->opts.inodes_use_key_cache)
+ if (0 && cached && c->opts.inodes_use_key_cache)
iter_flags |= BTREE_ITER_CACHED;
bch2_trans_init(&trans, c, 0, 1024);
@@ -606,19 +678,20 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
	 * XXX: ideally, deleting a dirent would also delete whiteouts when they're no
* longer needed
*/
- ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
- start, end, NULL) ?:
- bch2_btree_delete_range_trans(&trans, BTREE_ID_xattrs,
- start, end, NULL) ?:
- bch2_btree_delete_range_trans(&trans, BTREE_ID_dirents,
- start, end, NULL);
+ ret = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?:
+ bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?:
+ bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents);
if (ret)
goto err;
retry:
bch2_trans_begin(&trans);
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes,
- POS(0, inode_nr), iter_flags);
+ SPOS(0, inum.inum, snapshot), iter_flags);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
@@ -628,13 +701,20 @@ retry:
if (k.k->type != KEY_TYPE_inode) {
bch2_fs_inconsistent(trans.c,
"inode %llu not found when deleting",
- inode_nr);
+ inum.inum);
ret = -EIO;
goto err;
}
bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
+ /* Subvolume root? */
+ if (inode_u.bi_subvol) {
+ ret = bch2_subvolume_delete(&trans, inode_u.bi_subvol, -1);
+ if (ret)
+ goto err;
+ }
+
bkey_inode_generation_init(&delete.k_i);
delete.k.p = iter.pos;
delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
@@ -651,20 +731,22 @@ err:
return ret;
}
-static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
+static int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
+ subvol_inum inum,
struct bch_inode_unpacked *inode)
{
- struct btree_iter iter = { NULL };
+ struct btree_iter iter;
int ret;
- ret = bch2_inode_peek(trans, &iter, inode, inode_nr, 0);
- bch2_trans_iter_exit(trans, &iter);
+ ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
+ if (!ret)
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
-int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
+int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
struct bch_inode_unpacked *inode)
{
return bch2_trans_do(c, NULL, NULL, 0,
- bch2_inode_find_by_inum_trans(&trans, inode_nr, inode));
+ bch2_inode_find_by_inum_trans(&trans, inum, inode));
}
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index 25bef104..9e84cddc 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -58,7 +58,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *, u64, unsigned);
+ struct bch_inode_unpacked *, subvol_inum, unsigned);
int bch2_inode_write(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *);
@@ -74,9 +74,10 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
int bch2_inode_create(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, u32, u64);
-int bch2_inode_rm(struct bch_fs *, u64, bool);
+int bch2_inode_rm(struct bch_fs *, subvol_inum, bool);
-int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
+ struct bch_inode_unpacked *);
static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
{
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index ccde9001..0bc72d2a 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -27,6 +27,7 @@
#include "keylist.h"
#include "move.h"
#include "rebalance.h"
+#include "subvolume.h"
#include "super.h"
#include "super-io.h"
@@ -220,7 +221,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
: 0;
if (!*usage_increasing &&
- (new_replicas > bch2_bkey_replicas(c, old) ||
+ (new->k.p.snapshot != old.k->p.snapshot ||
+ new_replicas > bch2_bkey_replicas(c, old) ||
(!new_compressed && bch2_bkey_sectors_compressed(old))))
*usage_increasing = true;
@@ -256,6 +258,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
}
int bch2_extent_update(struct btree_trans *trans,
+ subvol_inum inum,
struct btree_iter *iter,
struct bkey_i *k,
struct disk_reservation *disk_res,
@@ -314,8 +317,8 @@ int bch2_extent_update(struct btree_trans *trans,
struct btree_iter inode_iter;
struct bch_inode_unpacked inode_u;
- ret = bch2_inode_peek(trans, &inode_iter, &inode_u,
- k->k.p.inode, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum,
+ BTREE_ITER_INTENT);
if (ret)
return ret;
@@ -371,22 +374,37 @@ int bch2_extent_update(struct btree_trans *trans,
return 0;
}
+/*
+ * Returns -EINTR if we had to drop locks:
+ */
int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos end, u64 *journal_seq,
- s64 *i_sectors_delta)
+ subvol_inum inum, u64 end,
+ u64 *journal_seq, s64 *i_sectors_delta)
{
struct bch_fs *c = trans->c;
unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
+ struct bpos end_pos = POS(inum.inum, end);
struct bkey_s_c k;
int ret = 0, ret2 = 0;
+ u32 snapshot;
- while ((bch2_trans_begin(trans),
- (k = bch2_btree_iter_peek(iter)).k) &&
- bkey_cmp(iter->pos, end) < 0) {
+ while (1) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
struct bkey_i delete;
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto btree_err;
+
+ bch2_btree_iter_set_snapshot(iter, snapshot);
+
+ k = bch2_btree_iter_peek(iter);
+ if (bkey_cmp(iter->pos, end_pos) >= 0)
+ break;
+
ret = bkey_err(k);
if (ret)
goto btree_err;
@@ -396,9 +414,9 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
/* create the biggest key we can */
bch2_key_resize(&delete.k, max_sectors);
- bch2_cut_back(end, &delete);
+ bch2_cut_back(end_pos, &delete);
- ret = bch2_extent_update(trans, iter, &delete,
+ ret = bch2_extent_update(trans, inum, iter, &delete,
&disk_res, journal_seq,
0, i_sectors_delta, false);
bch2_disk_reservation_put(c, &disk_res);
@@ -411,36 +429,31 @@ btree_err:
break;
}
- if (bkey_cmp(iter->pos, end) > 0) {
- bch2_btree_iter_set_pos(iter, end);
- ret = bch2_btree_iter_traverse(iter);
- }
+ if (bkey_cmp(iter->pos, end_pos) > 0)
+ bch2_btree_iter_set_pos(iter, end_pos);
return ret ?: ret2;
}
-int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
+int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
u64 *journal_seq, s64 *i_sectors_delta)
{
struct btree_trans trans;
struct btree_iter iter;
- int ret = 0;
+ int ret;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
- POS(inum, start),
- BTREE_ITER_INTENT);
+ POS(inum.inum, start),
+ BTREE_ITER_INTENT);
- ret = bch2_fpunch_at(&trans, &iter, POS(inum, end),
+ ret = bch2_fpunch_at(&trans, &iter, inum, end,
journal_seq, i_sectors_delta);
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
- if (ret == -EINTR)
- ret = 0;
-
- return ret;
+ return ret == -EINTR ? 0 : ret;
}
int bch2_write_index_default(struct bch_write_op *op)
@@ -451,40 +464,51 @@ int bch2_write_index_default(struct bch_write_op *op)
struct bkey_i *k = bch2_keylist_front(keys);
struct btree_trans trans;
struct btree_iter iter;
+ subvol_inum inum = {
+ .subvol = op->subvol,
+ .inum = k->k.p.inode,
+ };
int ret;
+ BUG_ON(!inum.subvol);
+
bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
- bkey_start_pos(&k->k),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
do {
bch2_trans_begin(&trans);
k = bch2_keylist_front(keys);
+ bch2_bkey_buf_copy(&sk, c, k);
- k->k.p.snapshot = iter.snapshot;
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol,
+ &sk.k->k.p.snapshot);
+ if (ret == -EINTR)
+ continue;
+ if (ret)
+ break;
- bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
- bkey_copy(sk.k, k);
- bch2_cut_front(iter.pos, sk.k);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ bkey_start_pos(&sk.k->k),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- ret = bch2_extent_update(&trans, &iter, sk.k,
+ ret = bch2_extent_update(&trans, inum, &iter, sk.k,
&op->res, op_journal_seq(op),
op->new_i_size, &op->i_sectors_delta,
op->flags & BCH_WRITE_CHECK_ENOSPC);
+ bch2_trans_iter_exit(&trans, &iter);
+
if (ret == -EINTR)
continue;
if (ret)
break;
if (bkey_cmp(iter.pos, k->k.p) >= 0)
- bch2_keylist_pop_front(keys);
+ bch2_keylist_pop_front(&op->insert_keys);
+ else
+ bch2_cut_front(iter.pos, k);
} while (!bch2_keylist_empty(keys));
- bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
@@ -1645,7 +1669,7 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
}
static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, u64 inode,
+ struct bvec_iter bvec_iter,
struct bch_io_failures *failed,
unsigned flags)
{
@@ -1709,7 +1733,10 @@ static void bch2_rbio_retry(struct work_struct *work)
struct bch_fs *c = rbio->c;
struct bvec_iter iter = rbio->bvec_iter;
unsigned flags = rbio->flags;
- u64 inode = rbio->read_pos.inode;
+ subvol_inum inum = {
+ .subvol = rbio->subvol,
+ .inum = rbio->read_pos.inode,
+ };
struct bch_io_failures failed = { .nr = 0 };
trace_read_retry(&rbio->bio);
@@ -1725,12 +1752,12 @@ static void bch2_rbio_retry(struct work_struct *work)
flags &= ~BCH_READ_MAY_PROMOTE;
if (flags & BCH_READ_NODECODE) {
- bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
+ bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
} else {
flags &= ~BCH_READ_LAST_FRAGMENT;
flags |= BCH_READ_MUST_CLONE;
- __bch2_read(c, rbio, iter, inode, &failed, flags);
+ __bch2_read(c, rbio, iter, inum, &failed, flags);
}
}
@@ -1804,7 +1831,8 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
if (!bch2_bkey_narrow_crcs(new, new_crc))
goto out;
- ret = bch2_trans_update(trans, &iter, new, 0);
+ ret = bch2_trans_update(trans, &iter, new,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
out:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -2172,6 +2200,7 @@ get_bio:
/* XXX: only initialize this if needed */
rbio->devs_have = bch2_bkey_devs(k);
rbio->pick = pick;
+ rbio->subvol = orig->subvol;
rbio->read_pos = read_pos;
rbio->data_btree = data_btree;
rbio->data_pos = data_pos;
@@ -2274,25 +2303,31 @@ out_read_done:
}
void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, u64 inode,
+ struct bvec_iter bvec_iter, subvol_inum inum,
struct bch_io_failures *failed, unsigned flags)
{
struct btree_trans trans;
struct btree_iter iter;
struct bkey_buf sk;
struct bkey_s_c k;
+ u32 snapshot;
int ret;
BUG_ON(flags & BCH_READ_NODECODE);
bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
- POS(inode, bvec_iter.bi_sector),
- BTREE_ITER_SLOTS);
retry:
bch2_trans_begin(&trans);
+ iter = (struct btree_iter) { NULL };
+
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
+ BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS);
while (1) {
unsigned bytes, sectors, offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
@@ -2307,7 +2342,7 @@ retry:
}
bch2_btree_iter_set_pos(&iter,
- POS(inode, bvec_iter.bi_sector));
+ POS(inum.inum, bvec_iter.bi_sector));
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
@@ -2357,16 +2392,17 @@ retry:
swap(bvec_iter.bi_size, bytes);
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
}
+err:
+ bch2_trans_iter_exit(&trans, &iter);
if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID)
goto retry;
- bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
if (ret) {
- bch_err_inum_ratelimited(c, inode,
+ bch_err_inum_ratelimited(c, inum.inum,
"read error %i from btree lookup", ret);
rbio->bio.bi_status = BLK_STS_IOERR;
bch2_rbio_done(rbio);
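
One pattern worth noting in the I/O paths above: because the subvolume-to-snapshot mapping can only be read inside a btree transaction, every restartable loop now begins with bch2_trans_begin(), re-resolves the snapshot with bch2_subvolume_get_snapshot(), and treats -EINTR as "retry from the top". A rough sketch of that shape follows; it is illustrative only, and do_one_step() is a placeholder, not a real function.

/*
 * Sketch of the restart loop used by bch2_fpunch_at(),
 * bch2_write_index_default() and bch2_inode_delete_keys() after this
 * change.  do_one_step() stands in for the real per-iteration work
 * (position an iterator at SPOS(inum.inum, offset, snapshot), update,
 * commit); it sets *done when there is no more work and never does so
 * on error.
 */
static int do_work_with_restarts(struct btree_trans *trans, subvol_inum inum)
{
	u32 snapshot;
	bool done = false;
	int ret = 0;

	while (!done && (!ret || ret == -EINTR)) {
		bch2_trans_begin(trans);

		/* must be re-read after every transaction restart: */
		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			continue;

		ret = do_one_step(trans, inum, snapshot, &done);
	}

	return ret;
}
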
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
index bc0a0bd6..38efd39c 100644
--- a/libbcachefs/io.h
+++ b/libbcachefs/io.h
@@ -63,12 +63,13 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
struct bkey_i *, bool *, bool *, s64 *, s64 *);
-int bch2_extent_update(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, struct disk_reservation *,
- u64 *, u64, s64 *, bool);
+int bch2_extent_update(struct btree_trans *, subvol_inum,
+ struct btree_iter *, struct bkey_i *,
+ struct disk_reservation *, u64 *, u64, s64 *, bool);
+
int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
- struct bpos, u64 *, s64 *);
-int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *);
+ subvol_inum, u64, u64 *, s64 *);
+int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, u64 *, s64 *);
int bch2_write_index_default(struct bch_write_op *);
@@ -90,6 +91,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
op->devs_have.nr = 0;
op->target = 0;
op->opts = opts;
+ op->subvol = 0;
op->pos = POS_MAX;
op->version = ZERO_VERSION;
op->write_point = (struct write_point_specifier) { 0 };
@@ -157,10 +159,10 @@ static inline void bch2_read_extent(struct btree_trans *trans,
}
void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
- u64, struct bch_io_failures *, unsigned flags);
+ subvol_inum, struct bch_io_failures *, unsigned flags);
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- u64 inode)
+ subvol_inum inum)
{
struct bch_io_failures failed = { .nr = 0 };
@@ -168,8 +170,9 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
rbio->c = c;
rbio->start_time = local_clock();
+ rbio->subvol = inum.subvol;
- __bch2_read(c, rbio, rbio->bio.bi_iter, inode, &failed,
+ __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE|
BCH_READ_USER_MAPPED);
diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h
index 0aab7795..78bff13d 100644
--- a/libbcachefs/io_types.h
+++ b/libbcachefs/io_types.h
@@ -62,6 +62,7 @@ struct bch_read_bio {
/*
* pos we read from - different from data_pos for indirect extents:
*/
+ u32 subvol;
struct bpos read_pos;
/*
@@ -122,6 +123,7 @@ struct bch_write_op {
u16 nonce;
struct bch_io_opts opts;
+ u32 subvol;
struct bpos pos;
struct bversion version;
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 1899326d..7c764ee4 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -48,7 +48,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
while ((k = bch2_btree_iter_peek(&iter)).k &&
!(ret = bkey_err(k))) {
@@ -74,7 +75,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k));
ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(&trans, &iter, sk.k, 0) ?:
+ bch2_trans_update(&trans, &iter, sk.k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 7001e3cd..44a61818 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -13,6 +13,7 @@
#include "journal_reclaim.h"
#include "move.h"
#include "replicas.h"
+#include "subvolume.h"
#include "super-io.h"
#include "keylist.h"
@@ -53,6 +54,81 @@ struct moving_context {
wait_queue_head_t wait;
};
+static int insert_snapshot_whiteouts(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos old_pos,
+ struct bpos new_pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter, update_iter;
+ struct bkey_s_c k;
+ struct snapshots_seen s;
+ int ret;
+
+ if (!btree_type_has_snapshots(id))
+ return 0;
+
+ snapshots_seen_init(&s);
+
+ if (!bkey_cmp(old_pos, new_pos))
+ return 0;
+
+ if (!snapshot_t(c, old_pos.snapshot)->children[0])
+ return 0;
+
+ bch2_trans_iter_init(trans, &iter, id, old_pos,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ while (1) {
+next:
+ k = bch2_btree_iter_prev(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ break;
+
+ if (bkey_cmp(old_pos, k.k->p))
+ break;
+
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) {
+ struct bkey_i *update;
+ size_t i;
+
+ for (i = 0; i < s.nr; i++)
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i]))
+ goto next;
+
+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ break;
+
+ bkey_init(&update->k);
+ update->k.p = new_pos;
+ update->k.p.snapshot = k.k->p.snapshot;
+
+ bch2_trans_iter_init(trans, &update_iter, id, update->k.p,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&update_iter) ?:
+ bch2_trans_update(trans, &update_iter, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_iter_exit(trans, &update_iter);
+ if (ret)
+ break;
+
+ ret = snapshots_seen_add(c, &s, k.k->p.snapshot);
+ if (ret)
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ kfree(s.d);
+
+ return ret;
+}
+
static int bch2_migrate_index_update(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
@@ -166,7 +242,10 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
next_pos = insert->k.p;
- ret = bch2_trans_update(&trans, &iter, insert, 0) ?:
+ ret = insert_snapshot_whiteouts(&trans, m->btree_id,
+ k.k->p, insert->k.p) ?:
+ bch2_trans_update(&trans, &iter, insert,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(&trans, &op->res,
op_journal_seq(op),
BTREE_INSERT_NOFAIL|
@@ -581,7 +660,8 @@ static int __bch2_move_data(struct bch_fs *c,
stats->pos = start;
bch2_trans_iter_init(&trans, &iter, btree_id, start,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
if (rate)
bch2_ratelimit_reset(rate);
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index 5de29607..ff99c6d2 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -63,7 +63,7 @@ const char * const bch2_member_states[] = {
#undef x
-const char * const bch2_d_types[DT_MAX] = {
+const char * const bch2_d_types[BCH_DT_MAX] = {
[DT_UNKNOWN] = "unknown",
[DT_FIFO] = "fifo",
[DT_CHR] = "chr",
@@ -73,6 +73,7 @@ const char * const bch2_d_types[DT_MAX] = {
[DT_LNK] = "lnk",
[DT_SOCK] = "sock",
[DT_WHT] = "whiteout",
+ [DT_SUBVOL] = "subvol",
};
void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 147b4021..d39d6a54 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -215,19 +215,19 @@ enum opt_type {
BCH_SB_POSIX_ACL, true, \
NULL, "Enable POSIX acls") \
x(usrquota, u8, \
- OPT_FORMAT|OPT_MOUNT, \
+ 0, \
OPT_BOOL(), \
- BCH_SB_USRQUOTA, false, \
+ NO_SB_OPT, false, \
NULL, "Enable user quotas") \
x(grpquota, u8, \
- OPT_FORMAT|OPT_MOUNT, \
+ 0, \
OPT_BOOL(), \
- BCH_SB_GRPQUOTA, false, \
+ NO_SB_OPT, false, \
NULL, "Enable group quotas") \
x(prjquota, u8, \
- OPT_FORMAT|OPT_MOUNT, \
+ 0, \
OPT_BOOL(), \
- BCH_SB_PRJQUOTA, false, \
+ NO_SB_OPT, false, \
NULL, "Enable project quotas") \
x(degraded, u8, \
OPT_MOUNT, \
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 11208e83..64e0b542 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -20,6 +20,7 @@
#include "quota.h"
#include "recovery.h"
#include "replicas.h"
+#include "subvolume.h"
#include "super-io.h"
#include <linux/sort.h>
@@ -961,6 +962,81 @@ fsck_err:
return ret;
}
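+/*
+ * Create the initial snapshot node (id U32_MAX) and the root subvolume
+ * pointing at it; used both when initializing a new filesystem and when
+ * upgrading an existing one to the snapshot version:
+ */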
+static int bch2_fs_initialize_subvolumes(struct bch_fs *c)
+{
+ struct bkey_i_snapshot root_snapshot;
+ struct bkey_i_subvolume root_volume;
+ int ret;
+
+ bkey_snapshot_init(&root_snapshot.k_i);
+ root_snapshot.k.p.offset = U32_MAX;
+ root_snapshot.v.flags = 0;
+ root_snapshot.v.parent = 0;
+ root_snapshot.v.subvol = BCACHEFS_ROOT_SUBVOL;
+ root_snapshot.v.pad = 0;
+ SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
+
+ ret = bch2_btree_insert(c, BTREE_ID_snapshots,
+ &root_snapshot.k_i,
+ NULL, NULL, 0);
+ if (ret)
+ return ret;
+
+ bkey_subvolume_init(&root_volume.k_i);
+ root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_volume.v.flags = 0;
+ root_volume.v.snapshot = cpu_to_le32(U32_MAX);
+ root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
+
+ ret = bch2_btree_insert(c, BTREE_ID_subvolumes,
+ &root_volume.k_i,
+ NULL, NULL, 0);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
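+/*
+ * When upgrading to the snapshot/subvolume format, the existing root inode
+ * needs bi_subvol set to the root subvolume:
+ */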
+static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_inode_unpacked inode;
+ struct bkey_inode_buf *packed;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+ POS(0, BCACHEFS_ROOT_INO), 0);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_inode) {
+ bch_err(c, "root inode not found");
+ ret = -ENOENT;
+ goto err;
+ }
+
+ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &inode);
+ BUG_ON(ret);
+
+ inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
+
+ packed = bch2_trans_kmalloc(trans, sizeof(*packed));
+ ret = PTR_ERR_OR_ZERO(packed);
+ if (ret)
+ goto err;
+
+ bch2_inode_pack(c, packed, &inode);
+ ret = bch2_trans_update(trans, &iter, &packed->inode.k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
int bch2_fs_recovery(struct bch_fs *c)
{
const char *err = "cannot allocate memory";
@@ -1017,11 +1093,12 @@ int bch2_fs_recovery(struct bch_fs *c)
c->opts.version_upgrade = true;
c->opts.fsck = true;
c->opts.fix_errors = FSCK_OPT_YES;
- }
-
- if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) {
+ } else if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) {
bch_info(c, "version prior to btree_ptr_sectors_written, upgrade required");
c->opts.version_upgrade = true;
+ } else if (c->sb.version < bcachefs_metadata_version_snapshot) {
+ bch_info(c, "filesystem version is prior to snapshot field - upgrading");
+ c->opts.version_upgrade = true;
}
ret = bch2_blacklist_table_initialize(c);
@@ -1190,6 +1267,29 @@ use_clean:
bch_verbose(c, "alloc write done");
}
+ if (c->sb.version < bcachefs_metadata_version_snapshot) {
+ err = "error creating root snapshot node";
+ ret = bch2_fs_initialize_subvolumes(c);
+ if (ret)
+ goto err;
+ }
+
+ bch_verbose(c, "reading snapshots table");
+ err = "error reading snapshots table";
+ ret = bch2_fs_snapshots_start(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "reading snapshots done");
+
+ if (c->sb.version < bcachefs_metadata_version_snapshot) {
+ /* set bi_subvol on root inode */
+ err = "error upgrading root inode for subvolumes";
+ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
+ bch2_fs_upgrade_for_subvolumes(&trans));
+ if (ret)
+ goto err;
+ }
+
if (c->opts.fsck) {
bch_info(c, "starting fsck");
err = "error in fsck";
@@ -1350,9 +1450,22 @@ int bch2_fs_initialize(struct bch_fs *c)
}
}
+ err = "error creating root snapshot node";
+ ret = bch2_fs_initialize_subvolumes(c);
+ if (ret)
+ goto err;
+
+ bch_verbose(c, "reading snapshots table");
+ err = "error reading snapshots table";
+ ret = bch2_fs_snapshots_start(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "reading snapshots done");
+
bch2_inode_init(c, &root_inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
- root_inode.bi_inum = BCACHEFS_ROOT_INO;
+ root_inode.bi_inum = BCACHEFS_ROOT_INO;
+ root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
bch2_inode_pack(c, &packed_inode, &root_inode);
packed_inode.inode.k.p.snapshot = U32_MAX;
@@ -1367,11 +1480,12 @@ int bch2_fs_initialize(struct bch_fs *c)
err = "error creating lost+found";
ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_create_trans(&trans, BCACHEFS_ROOT_INO,
+ bch2_create_trans(&trans,
+ BCACHEFS_ROOT_SUBVOL_INUM,
&root_inode, &lostfound_inode,
&lostfound,
0, 0, S_IFDIR|0700, 0,
- NULL, NULL));
+ NULL, NULL, (subvol_inum) { 0 }, 0));
if (ret) {
bch_err(c, "error creating lost+found");
goto err;
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
index 576cfbcc..92ff6094 100644
--- a/libbcachefs/reflink.c
+++ b/libbcachefs/reflink.c
@@ -7,6 +7,7 @@
#include "inode.h"
#include "io.h"
#include "reflink.h"
+#include "subvolume.h"
#include <linux/sched/signal.h>
@@ -197,7 +198,8 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
}
s64 bch2_remap_range(struct bch_fs *c,
- struct bpos dst_start, struct bpos src_start,
+ subvol_inum dst_inum, u64 dst_offset,
+ subvol_inum src_inum, u64 src_offset,
u64 remap_sectors, u64 *journal_seq,
u64 new_i_size, s64 *i_sectors_delta)
{
@@ -205,9 +207,12 @@ s64 bch2_remap_range(struct bch_fs *c,
struct btree_iter dst_iter, src_iter;
struct bkey_s_c src_k;
struct bkey_buf new_dst, new_src;
+ struct bpos dst_start = POS(dst_inum.inum, dst_offset);
+ struct bpos src_start = POS(src_inum.inum, src_offset);
struct bpos dst_end = dst_start, src_end = src_start;
struct bpos src_want;
u64 dst_done;
+ u32 dst_snapshot, src_snapshot;
int ret = 0, ret2 = 0;
if (!percpu_ref_tryget(&c->writes))
@@ -238,6 +243,20 @@ s64 bch2_remap_range(struct bch_fs *c,
break;
}
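+ /*
+ * Look up the snapshot to read from and the snapshot to write to for
+ * the source and destination subvolumes on each iteration:
+ */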
+ ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol,
+ &src_snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
+
+ ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol,
+ &dst_snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
+
dst_done = dst_iter.pos.offset - dst_start.offset;
src_want = POS(src_start.inode, src_start.offset + dst_done);
bch2_btree_iter_set_pos(&src_iter, src_want);
@@ -248,11 +267,11 @@ s64 bch2_remap_range(struct bch_fs *c,
continue;
if (bkey_cmp(src_want, src_iter.pos) < 0) {
- ret = bch2_fpunch_at(&trans, &dst_iter,
- bpos_min(dst_end,
- POS(dst_iter.pos.inode, dst_iter.pos.offset +
- src_iter.pos.offset - src_want.offset)),
- journal_seq, i_sectors_delta);
+ ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum,
+ min(dst_end.offset,
+ dst_iter.pos.offset +
+ src_iter.pos.offset - src_want.offset),
+ journal_seq, i_sectors_delta);
continue;
}
@@ -289,8 +308,9 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_key_resize(&new_dst.k->k,
min(src_k.k->p.offset - src_want.offset,
dst_end.offset - dst_iter.pos.offset));
- ret = bch2_extent_update(&trans, &dst_iter, new_dst.k,
- &disk_res, journal_seq,
+
+ ret = bch2_extent_update(&trans, dst_inum, &dst_iter,
+ new_dst.k, &disk_res, journal_seq,
new_i_size, i_sectors_delta,
true);
bch2_disk_reservation_put(c, &disk_res);
@@ -311,7 +331,7 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_trans_begin(&trans);
ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u,
- dst_start.inode, BTREE_ITER_INTENT);
+ dst_inum, BTREE_ITER_INTENT);
if (!ret2 &&
inode_u.bi_size < new_i_size) {
diff --git a/libbcachefs/reflink.h b/libbcachefs/reflink.h
index 68c5cb5a..4c1b8286 100644
--- a/libbcachefs/reflink.h
+++ b/libbcachefs/reflink.h
@@ -57,7 +57,7 @@ static inline __le64 *bkey_refcount(struct bkey_i *k)
}
}
-s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos,
- u64, u64 *, u64, s64 *);
+s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64,
+ subvol_inum, u64, u64, u64 *, u64, s64 *);
#endif /* _BCACHEFS_REFLINK_H */
diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h
index c6a132b3..6486e709 100644
--- a/libbcachefs/str_hash.h
+++ b/libbcachefs/str_hash.h
@@ -8,6 +8,7 @@
#include "error.h"
#include "inode.h"
#include "siphash.h"
+#include "subvolume.h"
#include "super.h"
#include <linux/crc32c.h>
@@ -144,16 +145,21 @@ bch2_hash_lookup(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
- u64 inode, const void *key,
+ subvol_inum inum, const void *key,
unsigned flags)
{
struct bkey_s_c k;
+ u32 snapshot;
int ret;
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
for_each_btree_key(trans, *iter, desc.btree_id,
- POS(inode, desc.hash_key(info, key)),
+ SPOS(inum.inum, desc.hash_key(info, key), snapshot),
BTREE_ITER_SLOTS|flags, k, ret) {
- if (iter->pos.inode != inode)
+ if (iter->pos.inode != inum.inum)
break;
if (k.k->type == desc.key_type) {
@@ -176,15 +182,20 @@ bch2_hash_hole(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
- u64 inode, const void *key)
+ subvol_inum inum, const void *key)
{
struct bkey_s_c k;
+ u32 snapshot;
int ret;
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
for_each_btree_key(trans, *iter, desc.btree_id,
- POS(inode, desc.hash_key(info, key)),
+ SPOS(inum.inum, desc.hash_key(info, key), snapshot),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- if (iter->pos.inode != inode)
+ if (iter->pos.inode != inum.inum)
break;
if (k.k->type != desc.key_type)
@@ -229,17 +240,25 @@ static __always_inline
int bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
- u64 inode, struct bkey_i *insert, int flags)
+ subvol_inum inum,
+ struct bkey_i *insert, int flags)
{
struct btree_iter iter, slot = { NULL };
struct bkey_s_c k;
bool found = false;
+ u32 snapshot;
int ret;
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
for_each_btree_key(trans, iter, desc.btree_id,
- POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
+ SPOS(inum.inum,
+ desc.hash_bkey(info, bkey_i_to_s_c(insert)),
+ snapshot),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- if (iter.pos.inode != inode)
+ if (iter.pos.inode != inum.inum)
break;
if (k.k->type == desc.key_type) {
@@ -288,7 +307,8 @@ static __always_inline
int bch2_hash_delete_at(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
- struct btree_iter *iter)
+ struct btree_iter *iter,
+ unsigned update_flags)
{
struct bkey_i *delete;
int ret;
@@ -306,24 +326,24 @@ int bch2_hash_delete_at(struct btree_trans *trans,
delete->k.p = iter->pos;
delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
- return bch2_trans_update(trans, iter, delete, 0);
+ return bch2_trans_update(trans, iter, delete, update_flags);
}
static __always_inline
int bch2_hash_delete(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
- u64 inode, const void *key)
+ subvol_inum inum, const void *key)
{
struct btree_iter iter;
int ret;
- ret = bch2_hash_lookup(trans, &iter, desc, info, inode, key,
+ ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
BTREE_ITER_INTENT);
if (ret)
return ret;
- ret = bch2_hash_delete_at(trans, desc, info, &iter);
+ ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c
new file mode 100644
index 00000000..ff3b4d2d
--- /dev/null
+++ b/libbcachefs/subvolume.c
@@ -0,0 +1,981 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "error.h"
+#include "subvolume.h"
+
+/* Snapshot tree: */
+
+static void bch2_delete_dead_snapshots_work(struct work_struct *);
+static void bch2_delete_dead_snapshots(struct bch_fs *);
+
+void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
+
+ pr_buf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u",
+ BCH_SNAPSHOT_SUBVOL(s.v),
+ BCH_SNAPSHOT_DELETED(s.v),
+ le32_to_cpu(s.v->parent),
+ le32_to_cpu(s.v->children[0]),
+ le32_to_cpu(s.v->children[1]),
+ le32_to_cpu(s.v->subvol));
+}
+
+const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_snapshot s;
+ u32 i, id;
+
+ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 ||
+ bkey_cmp(k.k->p, POS(0, 1)) < 0)
+ return "bad pos";
+
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot))
+ return "bad val size";
+
+ s = bkey_s_c_to_snapshot(k);
+
+ id = le32_to_cpu(s.v->parent);
+ if (id && id <= k.k->p.offset)
+ return "bad parent node";
+
+ if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]))
+ return "children not normalized";
+
+ if (s.v->children[0] &&
+ s.v->children[0] == s.v->children[1])
+ return "duplicate child nodes";
+
+ for (i = 0; i < 2; i++) {
+ id = le32_to_cpu(s.v->children[i]);
+
+ if (id >= k.k->p.offset)
+ return "bad child node";
+ }
+
+ return NULL;
+}
+
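+/*
+ * Keep the in-memory snapshot table (c->snapshots) in sync with the snapshots
+ * btree; entries are indexed by U32_MAX - id, since snapshot IDs are allocated
+ * counting down from U32_MAX:
+ */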
+int bch2_mark_snapshot(struct bch_fs *c,
+ struct bkey_s_c old, struct bkey_s_c new,
+ u64 journal_seq, unsigned flags)
+{
+ struct snapshot_t *t;
+
+ t = genradix_ptr_alloc(&c->snapshots,
+ U32_MAX - new.k->p.offset,
+ GFP_KERNEL);
+ if (!t)
+ return -ENOMEM;
+
+ if (new.k->type == KEY_TYPE_snapshot) {
+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
+
+ t->parent = le32_to_cpu(s.v->parent);
+ t->children[0] = le32_to_cpu(s.v->children[0]);
+ t->children[1] = le32_to_cpu(s.v->children[1]);
+ t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
+ } else {
+ t->parent = 0;
+ t->children[0] = 0;
+ t->children[1] = 0;
+ t->subvol = 0;
+ }
+
+ return 0;
+}
+
+static int subvol_lookup(struct btree_trans *trans, unsigned id, struct bch_subvolume *s)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, id), 0);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -ENOENT;
+
+ if (!ret)
+ *s = *bkey_s_c_to_subvolume(k).v;
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int snapshot_lookup(struct btree_trans *trans, u32 id,
+ struct bch_snapshot *s)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+ BTREE_ITER_WITH_UPDATES);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 0 : -ENOENT;
+
+ if (!ret)
+ *s = *bkey_s_c_to_snapshot(k).v;
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int snapshot_live(struct btree_trans *trans, u32 id)
+{
+ struct bch_snapshot v;
+ int ret;
+
+ if (!id)
+ return 0;
+
+ ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+ if (ret == -ENOENT)
+ bch_err(trans->c, "snapshot node %u not found", id);
+ if (ret)
+ return ret;
+
+ return !BCH_SNAPSHOT_DELETED(&v);
+}
+
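+/*
+ * Compute equivalence classes: a snapshot node with exactly one live child
+ * shares an equivalence class with that child. The equivalence classes are
+ * used when deleting dead snapshots to detect redundant keys:
+ */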
+static int bch2_snapshots_set_equiv(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_snapshot snap;
+ unsigned i;
+ int ret;
+
+ for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ u32 id = k.k->p.offset, child[2];
+ unsigned nr_live = 0, live_idx = 0;
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ continue;
+
+ snap = bkey_s_c_to_snapshot(k);
+ child[0] = le32_to_cpu(snap.v->children[0]);
+ child[1] = le32_to_cpu(snap.v->children[1]);
+
+ for (i = 0; i < 2; i++) {
+ ret = snapshot_live(trans, child[i]);
+ if (ret < 0)
+ break;
+
+ if (ret)
+ live_idx = i;
+ nr_live += ret;
+ }
+
+ snapshot_t(c, id)->equiv = nr_live == 1
+ ? snapshot_t(c, child[live_idx])->equiv
+ : id;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ bch_err(c, "error walking snapshots: %i", ret);
+
+ return ret;
+}
+
+/* fsck: */
+static int bch2_snapshot_check(struct btree_trans *trans,
+ struct bkey_s_c_snapshot s)
+{
+ struct bch_subvolume subvol;
+ struct bch_snapshot v;
+ u32 i, id;
+ int ret;
+
+ id = le32_to_cpu(s.v->subvol);
+ ret = lockrestart_do(trans, subvol_lookup(trans, id, &subvol));
+ if (ret == -ENOENT)
+ bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u",
+ s.k->p.offset, id);
+ if (ret)
+ return ret;
+
+ if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) {
+ bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
+ s.k->p.offset);
+ return -EINVAL;
+ }
+
+ id = le32_to_cpu(s.v->parent);
+ if (id) {
+ ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+ if (ret == -ENOENT)
+ bch_err(trans->c, "snapshot node %llu has nonexistent parent %u",
+ s.k->p.offset, id);
+ if (ret)
+ return ret;
+
+ if (le32_to_cpu(v.children[0]) != s.k->p.offset &&
+ le32_to_cpu(v.children[1]) != s.k->p.offset) {
+ bch_err(trans->c, "snapshot parent %u missing pointer to child %llu",
+ id, s.k->p.offset);
+ return -EINVAL;
+ }
+ }
+
+ for (i = 0; i < 2 && s.v->children[i]; i++) {
+ id = le32_to_cpu(s.v->children[i]);
+
+ ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+ if (ret == -ENOENT)
+ bch_err(trans->c, "snapshot node %llu has nonexistent child %u",
+ s.k->p.offset, id);
+ if (ret)
+ return ret;
+
+ if (le32_to_cpu(v.parent) != s.k->p.offset) {
+ bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)",
+ id, le32_to_cpu(v.parent), s.k->p.offset);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+int bch2_fs_snapshots_check(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_snapshot s;
+ unsigned id;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_snapshot)
+ continue;
+
+ ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k));
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (ret) {
+ bch_err(c, "error %i checking snapshots", ret);
+ goto err;
+ }
+
+ for_each_btree_key(&trans, iter, BTREE_ID_subvolumes,
+ POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_subvolume)
+ continue;
+again_2:
+ id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
+ ret = snapshot_lookup(&trans, id, &s);
+
+ if (ret == -EINTR) {
+ k = bch2_btree_iter_peek(&iter);
+ goto again_2;
+ } else if (ret == -ENOENT)
+ bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
+ k.k->p.offset, id);
+ else if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
+void bch2_fs_snapshots_exit(struct bch_fs *c)
+{
+ genradix_free(&c->snapshots);
+}
+
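+/*
+ * Called at startup/recovery: load the snapshots btree into the in-memory
+ * table, compute equivalence classes, and resume deletion of any snapshots
+ * that were marked deleted but not yet cleaned up:
+ */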
+int bch2_fs_snapshots_start(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ bool have_deleted = false;
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0)
+ break;
+
+ if (k.k->type != KEY_TYPE_snapshot) {
+ bch_err(c, "found wrong key type %u in snapshot node table",
+ k.k->type);
+ continue;
+ }
+
+ if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
+ have_deleted = true;
+
+ ret = bch2_mark_snapshot(c, bkey_s_c_null, k, 0, 0);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (ret)
+ goto err;
+
+ ret = bch2_snapshots_set_equiv(&trans);
+ if (ret)
+ goto err;
+err:
+ bch2_trans_exit(&trans);
+
+ if (!ret && have_deleted) {
+ bch_info(c, "restarting deletion of dead snapshots");
+ if (c->opts.fsck) {
+ bch2_delete_dead_snapshots_work(&c->snapshot_delete_work);
+ } else {
+ bch2_delete_dead_snapshots(c);
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Mark a snapshot as deleted, for future cleanup:
+ */
+static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_snapshot *s;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_snapshot) {
+ bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ /* already deleted? */
+ if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
+ goto err;
+
+ s = bch2_trans_kmalloc(trans, sizeof(*s));
+ ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&s->k_i, k);
+
+ SET_BCH_SNAPSHOT_DELETED(&s->v, true);
+ ret = bch2_trans_update(trans, &iter, &s->k_i, 0);
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
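+/*
+ * Remove a deleted snapshot node: clear the parent's pointer to it (keeping
+ * the children array normalized) and delete the node's key:
+ */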
+static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
+{
+ struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
+ struct bkey_s_c k;
+ struct bkey_s_c_snapshot s;
+ struct bkey_i_snapshot *parent;
+ u32 parent_id;
+ unsigned i;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_snapshot) {
+ bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ s = bkey_s_c_to_snapshot(k);
+
+ BUG_ON(!BCH_SNAPSHOT_DELETED(s.v));
+ parent_id = le32_to_cpu(s.v->parent);
+
+ if (parent_id) {
+ bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots,
+ POS(0, parent_id),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&p_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_snapshot) {
+ bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ parent = bch2_trans_kmalloc(trans, sizeof(*parent));
+ ret = PTR_ERR_OR_ZERO(parent);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&parent->k_i, k);
+
+ for (i = 0; i < 2; i++)
+ if (le32_to_cpu(parent->v.children[i]) == id)
+ break;
+
+ if (i == 2)
+ bch_err(trans->c, "snapshot %u missing child pointer to %u",
+ parent_id, id);
+ else
+ parent->v.children[i] = 0;
+
+ if (le32_to_cpu(parent->v.children[0]) <
+ le32_to_cpu(parent->v.children[1]))
+ swap(parent->v.children[0],
+ parent->v.children[1]);
+
+ ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &p_iter);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
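+/*
+ * Allocate nr_snapids new snapshot nodes as children of @parent (or as roots,
+ * if @parent is zero). IDs are taken from unused slots below the lowest
+ * existing node, so they count down from U32_MAX:
+ */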
+static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
+ u32 *new_snapids,
+ u32 *snapshot_subvols,
+ unsigned nr_snapids)
+{
+ struct btree_iter iter;
+ struct bkey_i_snapshot *n;
+ struct bkey_s_c k;
+ unsigned i;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
+ POS_MIN, BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < nr_snapids; i++) {
+ k = bch2_btree_iter_prev_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k || !k.k->p.offset) {
+ ret = -ENOSPC;
+ goto err;
+ }
+
+ n = bch2_trans_kmalloc(trans, sizeof(*n));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bkey_snapshot_init(&n->k_i);
+ n->k.p = iter.pos;
+ n->v.flags = 0;
+ n->v.parent = cpu_to_le32(parent);
+ n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
+ n->v.pad = 0;
+ SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
+
+ bch2_trans_update(trans, &iter, &n->k_i, 0);
+
+ ret = bch2_mark_snapshot(trans->c, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0, 0);
+ if (ret)
+ break;
+
+ new_snapids[i] = iter.pos.offset;
+ }
+
+ if (parent) {
+ bch2_btree_iter_set_pos(&iter, POS(0, parent));
+ k = bch2_btree_iter_peek(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_snapshot) {
+ bch_err(trans->c, "snapshot %u not found", parent);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ n = bch2_trans_kmalloc(trans, sizeof(*n));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(&n->k_i, k);
+
+ if (n->v.children[0] || n->v.children[1]) {
+ bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ n->v.children[0] = cpu_to_le32(new_snapids[0]);
+ n->v.children[1] = cpu_to_le32(new_snapids[1]);
+ SET_BCH_SNAPSHOT_SUBVOL(&n->v, false);
+ bch2_trans_update(trans, &iter, &n->k_i, 0);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/* List of snapshot IDs that are being deleted: */
+struct snapshot_id_list {
+ u32 nr;
+ u32 size;
+ u32 *d;
+};
+
+static bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id)
+{
+ unsigned i;
+
+ for (i = 0; i < s->nr; i++)
+ if (id == s->d[i])
+ return true;
+ return false;
+}
+
+static int snapshot_id_add(struct snapshot_id_list *s, u32 id)
+{
+ BUG_ON(snapshot_list_has_id(s, id));
+
+ if (s->nr == s->size) {
+ size_t new_size = max(8U, s->size * 2);
+ void *n = krealloc(s->d,
+ new_size * sizeof(s->d[0]),
+ GFP_KERNEL);
+ if (!n) {
+ pr_err("error allocating snapshot ID list");
+ return -ENOMEM;
+ }
+
+ s->d = n;
+ s->size = new_size;
+ }
+
+ s->d[s->nr++] = id;
+ return 0;
+}
+
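+/*
+ * Walk one btree and delete keys that belong to deleted snapshots, as well as
+ * keys made redundant because an equivalent snapshot ID was already seen at
+ * the same position:
+ */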
+static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans,
+ struct snapshot_id_list *deleted,
+ enum btree_id btree_id)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct snapshot_id_list equiv_seen = { 0 };
+ struct bpos last_pos = POS_MIN;
+ int ret = 0;
+
+ /*
+ * XXX: We should also delete whiteouts that no longer overwrite
+ * anything
+ */
+
+ bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
+
+ while ((bch2_trans_begin(trans),
+ (k = bch2_btree_iter_peek(&iter)).k) &&
+ !(ret = bkey_err(k))) {
+ u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
+
+ if (bkey_cmp(k.k->p, last_pos))
+ equiv_seen.nr = 0;
+ last_pos = k.k->p;
+
+ if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
+ snapshot_list_has_id(&equiv_seen, equiv)) {
+ if (btree_id == BTREE_ID_inodes &&
+ bch2_btree_key_cache_flush(trans, btree_id, iter.pos))
+ continue;
+
+ ret = __bch2_trans_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(trans, &iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
+ if (ret)
+ break;
+ } else {
+ ret = snapshot_id_add(&equiv_seen, equiv);
+ if (ret)
+ break;
+ }
+
+ bch2_btree_iter_advance(&iter);
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ kfree(equiv_seen.d);
+
+ return ret;
+}
+
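+/*
+ * Background deletion of dead snapshots: mark snapshot nodes with no live
+ * children and no subvolume as deleted, recompute equivalence classes, delete
+ * their keys from every snapshot btree, then delete the nodes themselves:
+ */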
+static void bch2_delete_dead_snapshots_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_snapshot snap;
+ struct snapshot_id_list deleted = { 0 };
+ u32 i, id, children[2];
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ /*
+ * For every snapshot node: If we have no live children and it's not
+ * pointed to by a subvolume, delete it:
+ */
+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_snapshot)
+ continue;
+
+ snap = bkey_s_c_to_snapshot(k);
+ if (BCH_SNAPSHOT_DELETED(snap.v) ||
+ BCH_SNAPSHOT_SUBVOL(snap.v))
+ continue;
+
+ children[0] = le32_to_cpu(snap.v->children[0]);
+ children[1] = le32_to_cpu(snap.v->children[1]);
+
+ ret = snapshot_live(&trans, children[0]) ?:
+ snapshot_live(&trans, children[1]);
+ if (ret < 0)
+ break;
+ if (ret)
+ continue;
+
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ bch2_snapshot_node_set_deleted(&trans, iter.pos.offset));
+ if (ret) {
+ bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret);
+ break;
+ }
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (ret) {
+ bch_err(c, "error walking snapshots: %i", ret);
+ goto err;
+ }
+
+ ret = bch2_snapshots_set_equiv(&trans);
+ if (ret)
+ goto err;
+
+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_snapshot)
+ continue;
+
+ snap = bkey_s_c_to_snapshot(k);
+ if (BCH_SNAPSHOT_DELETED(snap.v)) {
+ ret = snapshot_id_add(&deleted, k.k->p.offset);
+ if (ret)
+ break;
+ }
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (ret) {
+ bch_err(c, "error walking snapshots: %i", ret);
+ goto err;
+ }
+
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ if (!btree_type_has_snapshots(id))
+ continue;
+
+ ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id);
+ if (ret) {
+ bch_err(c, "error deleting snapshot keys: %i", ret);
+ goto err;
+ }
+ }
+
+ for (i = 0; i < deleted.nr; i++) {
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ bch2_snapshot_node_delete(&trans, deleted.d[i]));
+ if (ret) {
+ bch_err(c, "error deleting snapshot %u: %i",
+ deleted.d[i], ret);
+ goto err;
+ }
+ }
+err:
+ kfree(deleted.d);
+ bch2_trans_exit(&trans);
+ percpu_ref_put(&c->writes);
+}
+
+static void bch2_delete_dead_snapshots(struct bch_fs *c)
+{
+ if (unlikely(!percpu_ref_tryget(&c->writes)))
+ return;
+
+ if (!queue_work(system_long_wq, &c->snapshot_delete_work))
+ percpu_ref_put(&c->writes);
+}
+
+static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
+ struct btree_trans_commit_hook *h)
+{
+ bch2_delete_dead_snapshots(trans->c);
+ return 0;
+}
+
+/* Subvolumes: */
+
+const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+ if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0)
+ return "invalid pos";
+
+ if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
+ return "invalid pos";
+
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume))
+ return "bad val size";
+
+ return NULL;
+}
+
+void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+
+ pr_buf(out, "root %llu snapshot id %u",
+ le64_to_cpu(s.v->inode),
+ le32_to_cpu(s.v->snapshot));
+}
+
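+/*
+ * Look up the snapshot ID a subvolume currently points at (via the btree key
+ * cache); callers use this to pin btree iterators to the right snapshot:
+ */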
+int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol,
+ u32 *snapid)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
+ POS(0, subvol),
+ BTREE_ITER_CACHED|
+ BTREE_ITER_WITH_UPDATES);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_subvolume) {
+ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol);
+ ret = -EIO;
+ goto err;
+ }
+
+ *snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/* XXX: mark snapshot id for deletion, walk btree and delete: */
+int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid,
+ int deleting_snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_subvolume subvol;
+ struct btree_trans_commit_hook *h;
+ struct bkey_i *delete;
+ u32 snapid;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
+ POS(0, subvolid),
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_subvolume) {
+ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid);
+ ret = -EIO;
+ goto err;
+ }
+
+ subvol = bkey_s_c_to_subvolume(k);
+ snapid = le32_to_cpu(subvol.v->snapshot);
+
+ if (deleting_snapshot >= 0 &&
+ deleting_snapshot != BCH_SUBVOLUME_SNAP(subvol.v)) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ delete = bch2_trans_kmalloc(trans, sizeof(*delete));
+ ret = PTR_ERR_OR_ZERO(delete);
+ if (ret)
+ goto err;
+
+ bkey_init(&delete->k);
+ delete->k.p = iter.pos;
+ ret = bch2_trans_update(trans, &iter, delete, 0);
+ if (ret)
+ goto err;
+
+ ret = bch2_snapshot_node_set_deleted(trans, snapid);
+ if (ret)
+ goto err;
+
+ h = bch2_trans_kmalloc(trans, sizeof(*h));
+ ret = PTR_ERR_OR_ZERO(h);
+ if (ret)
+ goto err;
+
+ h->fn = bch2_delete_dead_snapshots_hook;
+ bch2_trans_commit_hook(trans, h);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
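+/*
+ * Create a new subvolume in the first unused slot. If @src_subvolid is
+ * nonzero this is a snapshot: both the source subvolume and the new one are
+ * moved onto fresh child snapshot nodes of the source's current snapshot:
+ */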
+int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
+ u32 src_subvolid,
+ u32 *new_subvolid,
+ u32 *new_snapshotid,
+ bool ro)
+{
+ struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
+ struct bkey_i_subvolume *new_subvol = NULL;
+ struct bkey_i_subvolume *src_subvol = NULL;
+ struct bkey_s_c k;
+ u32 parent = 0, new_nodes[2], snapshot_subvols[2];
+ int ret = 0;
+
+ for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+ if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
+ break;
+ if (bkey_deleted(k.k))
+ goto found_slot;
+ }
+
+ if (!ret)
+ ret = -ENOSPC;
+ goto err;
+found_slot:
+ snapshot_subvols[0] = dst_iter.pos.offset;
+ snapshot_subvols[1] = src_subvolid;
+
+ if (src_subvolid) {
+ /* Creating a snapshot: */
+ src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol));
+ ret = PTR_ERR_OR_ZERO(src_subvol);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes,
+ POS(0, src_subvolid),
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&src_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_subvolume) {
+ bch_err(trans->c, "subvolume %u not found", src_subvolid);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ bkey_reassemble(&src_subvol->k_i, k);
+ parent = le32_to_cpu(src_subvol->v.snapshot);
+ }
+
+ ret = bch2_snapshot_node_create(trans, parent, new_nodes,
+ snapshot_subvols,
+ src_subvolid ? 2 : 1);
+ if (ret)
+ goto err;
+
+ if (src_subvolid) {
+ src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
+ bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
+ }
+
+ new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
+ ret = PTR_ERR_OR_ZERO(new_subvol);
+ if (ret)
+ goto err;
+
+ bkey_subvolume_init(&new_subvol->k_i);
+ new_subvol->v.flags = 0;
+ new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
+ new_subvol->v.inode = cpu_to_le64(inode);
+ SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
+ SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
+ new_subvol->k.p = dst_iter.pos;
+ bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0);
+
+ *new_subvolid = new_subvol->k.p.offset;
+ *new_snapshotid = new_nodes[0];
+err:
+ bch2_trans_iter_exit(trans, &src_iter);
+ bch2_trans_iter_exit(trans, &dst_iter);
+ return ret;
+}
+
+int bch2_fs_subvolumes_init(struct bch_fs *c)
+{
+ INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
+ return 0;
+}
diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h
new file mode 100644
index 00000000..0740c7b7
--- /dev/null
+++ b/libbcachefs/subvolume.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_H
+#define _BCACHEFS_SUBVOLUME_H
+
+void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_snapshot (struct bkey_ops) { \
+ .key_invalid = bch2_snapshot_invalid, \
+ .val_to_text = bch2_snapshot_to_text, \
+}
+
+int bch2_mark_snapshot(struct bch_fs *, struct bkey_s_c,
+ struct bkey_s_c, u64, unsigned);
+
+static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
+{
+ return genradix_ptr(&c->snapshots, U32_MAX - id);
+}
+
+static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
+{
+ return snapshot_t(c, id)->parent;
+}
+
+static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id)
+{
+ struct snapshot_t *s = snapshot_t(c, id);
+
+ return s->children[0] || s->children[1];
+}
+
+static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
+{
+ struct snapshot_t *s;
+ u32 parent = bch2_snapshot_parent(c, id);
+
+ if (!parent)
+ return 0;
+
+ s = snapshot_t(c, bch2_snapshot_parent(c, id));
+ if (id == s->children[0])
+ return s->children[1];
+ if (id == s->children[1])
+ return s->children[0];
+ return 0;
+}
+
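+/*
+ * Snapshot IDs are allocated in decreasing order, so an ancestor always has a
+ * higher ID than its descendants; walk the parent pointers until we reach or
+ * pass @ancestor:
+ */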
+static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
+{
+ while (id && id < ancestor)
+ id = bch2_snapshot_parent(c, id);
+
+ return id == ancestor;
+}
+
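+/*
+ * Scratch list of snapshot IDs already seen while walking a btree with
+ * BTREE_ITER_ALL_SNAPSHOTS (see e.g. insert_snapshot_whiteouts() in move.c):
+ */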
+struct snapshots_seen {
+ struct bpos pos;
+ size_t nr;
+ size_t size;
+ u32 *d;
+};
+
+static inline void snapshots_seen_exit(struct snapshots_seen *s)
+{
+ kfree(s->d);
+ s->d = NULL;
+}
+
+static inline void snapshots_seen_init(struct snapshots_seen *s)
+{
+ memset(s, 0, sizeof(*s));
+}
+
+static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id)
+{
+ if (s->nr == s->size) {
+ size_t new_size = max(s->size, 128UL) * 2;
+ u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL);
+
+ if (!d) {
+ bch_err(c, "error reallocating snapshots_seen table (new size %zu)",
+ new_size);
+ return -ENOMEM;
+ }
+
+ s->size = new_size;
+ s->d = d;
+ }
+
+ s->d[s->nr++] = id;
+ return 0;
+}
+
+int bch2_fs_snapshots_check(struct bch_fs *);
+void bch2_fs_snapshots_exit(struct bch_fs *);
+int bch2_fs_snapshots_start(struct bch_fs *);
+
+const char *bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_subvolume (struct bkey_ops) { \
+ .key_invalid = bch2_subvolume_invalid, \
+ .val_to_text = bch2_subvolume_to_text, \
+}
+
+int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
+
+int bch2_subvolume_delete(struct btree_trans *, u32, int);
+int bch2_subvolume_create(struct btree_trans *, u64, u32,
+ u32 *, u32 *, bool);
+
+int bch2_fs_subvolumes_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_SUBVOLUME_H */
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 8f847661..1feb7dee 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -39,6 +39,7 @@
#include "rebalance.h"
#include "recovery.h"
#include "replicas.h"
+#include "subvolume.h"
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
@@ -468,6 +469,7 @@ static void __bch2_fs_free(struct bch_fs *c)
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
+ bch2_fs_snapshots_exit(c);
bch2_fs_quota_exit(c);
bch2_fs_fsio_exit(c);
bch2_fs_ec_exit(c);
@@ -686,6 +688,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mutex_init(&c->usage_scratch_lock);
mutex_init(&c->bio_bounce_pages_lock);
+ mutex_init(&c->snapshot_table_lock);
spin_lock_init(&c->btree_write_error_lock);
@@ -789,6 +792,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ||
bch2_fs_btree_iter_init(c) ||
bch2_fs_btree_interior_update_init(c) ||
+ bch2_fs_subvolumes_init(c) ||
bch2_fs_io_init(c) ||
bch2_fs_encryption_init(c) ||
bch2_fs_compress_init(c) ||
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index ef6ae97e..a182e242 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -128,7 +128,7 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info
int ret;
ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
- inode->v.i_ino,
+ inode_inum(inode),
&X_SEARCH(type, name, strlen(name)),
0);
if (ret)
@@ -160,7 +160,7 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
bch2_xattr_get_trans(&trans, inode, name, buffer, size, type));
}
-int bch2_xattr_set(struct btree_trans *trans, u64 inum,
+int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
const struct bch_hash_info *hash_info,
const char *name, const void *value, size_t size,
int type, int flags)
@@ -282,13 +282,21 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
struct btree_iter iter;
struct bkey_s_c k;
struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
- u64 inum = dentry->d_inode->i_ino;
+ u64 offset = 0, inum = inode->ei_inode.bi_inum;
+ u32 snapshot;
int ret;
bch2_trans_init(&trans, c, 0, 0);
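+ /*
+ * The listing is restartable: on transaction restart we re-resolve the
+ * subvolume's snapshot and resume from the last offset seen:
+ */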
+retry:
+ bch2_trans_begin(&trans);
+ iter = (struct btree_iter) { NULL };
+
+ ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
+ if (ret)
+ goto err;
for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
- POS(inum, 0), 0, k, ret) {
+ SPOS(inum, offset, snapshot), 0, k, ret) {
BUG_ON(k.k->p.inode < inum);
if (k.k->p.inode > inum)
@@ -301,7 +309,12 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
if (ret)
break;
}
+
+ offset = iter.pos.offset;
bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (ret == -EINTR)
+ goto retry;
ret = bch2_trans_exit(&trans) ?: ret;
@@ -340,7 +353,7 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler,
struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0,
- bch2_xattr_set(&trans, inode->v.i_ino, &hash,
+ bch2_xattr_set(&trans, inode_inum(inode), &hash,
name, value, size,
handler->flags, flags));
}
diff --git a/libbcachefs/xattr.h b/libbcachefs/xattr.h
index 4151065a..f4f89654 100644
--- a/libbcachefs/xattr.h
+++ b/libbcachefs/xattr.h
@@ -39,7 +39,8 @@ struct bch_inode_info;
int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *,
const char *, void *, size_t, int);
-int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *,
+int bch2_xattr_set(struct btree_trans *, subvol_inum,
+ const struct bch_hash_info *,
const char *, const void *, size_t, int, int);
ssize_t bch2_xattr_list(struct dentry *, char *, size_t);