From 4f06dd92b5d0a6f8eec6a34b8d6ef3e1f4ac1e10 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 21 Oct 2020 16:12:49 -0400 Subject: fuse: fix write deadlock There are two modes for write(2) and friends in fuse: a) write through (update page cache, send sync WRITE request to userspace) b) buffered write (update page cache, async writeout later) The write through method kept all the page cache pages locked that were used for the request. Keeping more than one page locked is deadlock prone and Qian Cai demonstrated this with trinity fuzzing. The reason for keeping the pages locked is that concurrent mapped reads shouldn't try to pull possibly stale data into the page cache. For full page writes, the easy way to fix this is to make the cached page be the authoritative source by marking the page PG_uptodate immediately. After this the page can be safely unlocked, since mapped/cached reads will take the written data from the cache. Concurrent mapped writes will now cause data in the original WRITE request to be updated; this however doesn't cause any data inconsistency and this scenario should be exceedingly rare anyway. If the WRITE request returns with an error in the above case, currently the page is not marked uptodate; this means that a concurrent read will always read consistent data. After this patch the page is uptodate between writing to the cache and receiving the error: there's window where a cached read will read the wrong data. While theoretically this could be a regression, it is unlikely to be one in practice, since this is normal for buffered writes. In case of a partial page write to an already uptodate page the locking is also unnecessary, with the above caveats. Partial write of a not uptodate page still needs to be handled. One way would be to read the complete page before doing the write. This is not possible, since it might break filesystems that don't expect any READ requests when the file was opened O_WRONLY. The other solution is to serialize the synchronous write with reads from the partial pages. The easiest way to do this is to keep the partial pages locked. The problem is that a write() may involve two such pages (one head and one tail). This patch fixes it by only locking the partial tail page. If there's a partial head page as well, then split that off as a separate WRITE request. Reported-by: Qian Cai Link: https://lore.kernel.org/linux-fsdevel/4794a3fa3742a5e84fb0f934944204b55730829b.camel@lca.pw/ Fixes: ea9b9907b82a ("fuse: implement perform_write") Cc: # v2.6.26 Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/fuse/fuse_i.h') diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 63d97a15ffde..74d888c78fa4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -912,6 +912,7 @@ struct fuse_io_args { struct { struct fuse_write_in in; struct fuse_write_out out; + bool page_locked; } write; }; struct fuse_args_pages ap; -- cgit v1.2.3 From 4b91459ad283a7b174c7a092e31c470f217d1a31 Mon Sep 17 00:00:00 2001 From: Connor Kuehl Date: Thu, 18 Mar 2021 08:52:23 -0500 Subject: fuse: fix typo for fuse_conn.max_pages comment 'Maxmum' -> 'Maximum' Signed-off-by: Connor Kuehl Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/fuse/fuse_i.h') diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 74d888c78fa4..79c98df34deb 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -552,7 +552,7 @@ struct fuse_conn { /** Maximum write size */ unsigned max_write; - /** Maxmum number of pages that can be used in a single request */ + /** Maximum number of pages that can be used in a single request */ unsigned int max_pages; /** Input queue */ -- cgit v1.2.3 From aa6ff555f0e62bc1c85a2d181c1fae95d47c00ce Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Fri, 26 Mar 2021 06:46:42 +0530 Subject: fuse: fix a typo s/reponsible/responsible/ Signed-off-by: Bhaskar Chowdhury Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/fuse/fuse_i.h') diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 79c98df34deb..e7775d9c3314 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -713,7 +713,7 @@ struct fuse_conn { /** Use enhanced/automatic page cache invalidation. */ unsigned auto_inval_data:1; - /** Filesystem is fully reponsible for page cache invalidation. */ + /** Filesystem is fully responsible for page cache invalidation. */ unsigned explicit_inval_data:1; /** Does the filesystem support readdirplus? */ -- cgit v1.2.3 From 52a4c95f4d24b8bcb50745732f7b9f8513c49c5f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 25 Mar 2021 11:18:22 -0400 Subject: fuse: extend FUSE_SETXATTR request Fuse client needs to send additional information to file server when it calls SETXATTR(system.posix_acl_access), so add extra flags field to the structure. Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/acl.c | 2 +- fs/fuse/fuse_i.h | 5 ++++- fs/fuse/inode.c | 4 +++- fs/fuse/xattr.c | 9 ++++++--- include/uapi/linux/fuse.h | 7 +++++++ 5 files changed, 21 insertions(+), 6 deletions(-) (limited to 'fs/fuse/fuse_i.h') diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index e9c0f916349d..d31260a139d4 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -94,7 +94,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return ret; } - ret = fuse_setxattr(inode, name, value, size, 0); + ret = fuse_setxattr(inode, name, value, size, 0, 0); kfree(value); } else { ret = fuse_removexattr(inode, name); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e7775d9c3314..979b680bb47e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -668,6 +668,9 @@ struct fuse_conn { /** Is setxattr not implemented by fs? */ unsigned no_setxattr:1; + /** Does file server support extended setxattr */ + unsigned setxattr_ext:1; + /** Is getxattr not implemented by fs? */ unsigned no_getxattr:1; @@ -1171,7 +1174,7 @@ void fuse_unlock_inode(struct inode *inode, bool locked); bool fuse_lock_inode(struct inode *inode); int fuse_setxattr(struct inode *inode, const char *name, const void *value, - size_t size, int flags); + size_t size, int flags, unsigned int extra_flags); ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, size_t size); ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index b0e18b470e91..06a68cfa76d8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1052,6 +1052,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->handle_killpriv_v2 = 1; fm->sb->s_flags |= SB_NOSEC; } + if (arg->flags & FUSE_SETXATTR_EXT) + fc->setxattr_ext = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1095,7 +1097,7 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | - FUSE_HANDLE_KILLPRIV_V2; + FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) ia->in.flags |= FUSE_MAP_ALIGNMENT; diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 1a7d7ace54e1..61dfaf7b7d20 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -12,7 +12,7 @@ #include int fuse_setxattr(struct inode *inode, const char *name, const void *value, - size_t size, int flags) + size_t size, int flags, unsigned int extra_flags) { struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); @@ -25,10 +25,13 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, memset(&inarg, 0, sizeof(inarg)); inarg.size = size; inarg.flags = flags; + inarg.setxattr_flags = extra_flags; + args.opcode = FUSE_SETXATTR; args.nodeid = get_node_id(inode); args.in_numargs = 3; - args.in_args[0].size = sizeof(inarg); + args.in_args[0].size = fm->fc->setxattr_ext ? + sizeof(inarg) : FUSE_COMPAT_SETXATTR_IN_SIZE; args.in_args[0].value = &inarg; args.in_args[1].size = strlen(name) + 1; args.in_args[1].value = name; @@ -199,7 +202,7 @@ static int fuse_xattr_set(const struct xattr_handler *handler, if (!value) return fuse_removexattr(inode, name); - return fuse_setxattr(inode, name, value, size, flags); + return fuse_setxattr(inode, name, value, size, flags, 0); } static bool no_xattr_list(struct dentry *dentry) diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 54442612c48b..390f7b4c889d 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -179,6 +179,7 @@ * 7.33 * - add FUSE_HANDLE_KILLPRIV_V2, FUSE_WRITE_KILL_SUIDGID, FATTR_KILL_SUIDGID * - add FUSE_OPEN_KILL_SUIDGID + * - extend fuse_setxattr_in, add FUSE_SETXATTR_EXT */ #ifndef _LINUX_FUSE_H @@ -330,6 +331,7 @@ struct fuse_file_lock { * does not have CAP_FSETID. Additionally upon * write/truncate sgid is killed only if file has group * execute permission. (Same as Linux VFS behavior). + * FUSE_SETXATTR_EXT: Server supports extended struct fuse_setxattr_in */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -360,6 +362,7 @@ struct fuse_file_lock { #define FUSE_MAP_ALIGNMENT (1 << 26) #define FUSE_SUBMOUNTS (1 << 27) #define FUSE_HANDLE_KILLPRIV_V2 (1 << 28) +#define FUSE_SETXATTR_EXT (1 << 29) /** * CUSE INIT request/reply flags @@ -681,9 +684,13 @@ struct fuse_fsync_in { uint32_t padding; }; +#define FUSE_COMPAT_SETXATTR_IN_SIZE 8 + struct fuse_setxattr_in { uint32_t size; uint32_t flags; + uint32_t setxattr_flags; + uint32_t padding; }; struct fuse_getxattr_in { -- cgit v1.2.3 From a7f0d7aab0b4f3f0780b1f77356e2fe7202ac0cb Mon Sep 17 00:00:00 2001 From: Connor Kuehl Date: Thu, 18 Mar 2021 08:52:22 -0500 Subject: virtiofs: split requests that exceed virtqueue size If an incoming FUSE request can't fit on the virtqueue, the request is placed onto a workqueue so a worker can try to resubmit it later where there will (hopefully) be space for it next time. This is fine for requests that aren't larger than a virtqueue's maximum capacity. However, if a request's size exceeds the maximum capacity of the virtqueue (even if the virtqueue is empty), it will be doomed to a life of being placed on the workqueue, removed, discovered it won't fit, and placed on the workqueue yet again. Furthermore, from section 2.6.5.3.1 (Driver Requirements: Indirect Descriptors) of the virtio spec: "A driver MUST NOT create a descriptor chain longer than the Queue Size of the device." To fix this, limit the number of pages FUSE will use for an overall request. This way, each request can realistically fit on the virtqueue when it is decomposed into a scattergather list and avoid violating section 2.6.5.3.1 of the virtio spec. Signed-off-by: Connor Kuehl Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 3 ++- fs/fuse/virtio_fs.c | 19 +++++++++++++++++-- 3 files changed, 22 insertions(+), 3 deletions(-) (limited to 'fs/fuse/fuse_i.h') diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 979b680bb47e..38369a5094af 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -555,6 +555,9 @@ struct fuse_conn { /** Maximum number of pages that can be used in a single request */ unsigned int max_pages; + /** Constrain ->max_pages to this value during feature negotiation */ + unsigned int max_pages_limit; + /** Input queue */ struct fuse_iqueue iq; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 06a68cfa76d8..2f3161bb4b1c 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -712,6 +712,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; + fc->max_pages_limit = FUSE_MAX_MAX_PAGES; INIT_LIST_HEAD(&fc->mounts); list_add(&fm->fc_entry, &fc->mounts); @@ -1040,7 +1041,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->abort_err = 1; if (arg->flags & FUSE_MAX_PAGES) { fc->max_pages = - min_t(unsigned int, FUSE_MAX_MAX_PAGES, + min_t(unsigned int, fc->max_pages_limit, max_t(unsigned int, arg->max_pages, 1)); } if (IS_ENABLED(CONFIG_FUSE_DAX) && diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 1e5affed158e..b55e977f50ac 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -18,6 +18,12 @@ #include #include "fuse_i.h" +/* Used to help calculate the FUSE connection's max_pages limit for a request's + * size. Parts of the struct fuse_req are sliced into scattergather lists in + * addition to the pages used, so this can help account for that overhead. + */ +#define FUSE_HEADER_OVERHEAD 4 + /* List of virtio-fs device instances and a lock for the list. Also provides * mutual exclusion in device removal and mounting path */ @@ -1414,9 +1420,10 @@ static int virtio_fs_get_tree(struct fs_context *fsc) { struct virtio_fs *fs; struct super_block *sb; - struct fuse_conn *fc; + struct fuse_conn *fc = NULL; struct fuse_mount *fm; - int err; + unsigned int virtqueue_size; + int err = -EIO; /* This gets a reference on virtio_fs object. This ptr gets installed * in fc->iq->priv. Once fuse_conn is going away, it calls ->put() @@ -1428,6 +1435,10 @@ static int virtio_fs_get_tree(struct fs_context *fsc) return -EINVAL; } + virtqueue_size = virtqueue_get_vring_size(fs->vqs[VQ_REQUEST].vq); + if (WARN_ON(virtqueue_size <= FUSE_HEADER_OVERHEAD)) + goto out_err; + err = -ENOMEM; fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL); if (!fc) @@ -1443,6 +1454,10 @@ static int virtio_fs_get_tree(struct fs_context *fsc) fc->delete_stale = true; fc->auto_submounts = true; + /* Tell FUSE to split requests that exceed the virtqueue's size */ + fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, + virtqueue_size - FUSE_HEADER_OVERHEAD); + fsc->s_fs_info = fm; sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc); if (fsc->s_fs_info) { -- cgit v1.2.3