Diffstat (limited to 'fs')
-rw-r--r--  fs/debugfs/file.c                       |  165
-rw-r--r--  fs/debugfs/inode.c                      |  208
-rw-r--r--  fs/debugfs/internal.h                   |   50
-rw-r--r--  fs/fuse/Kconfig                         |   12
-rw-r--r--  fs/fuse/Makefile                        |    1
-rw-r--r--  fs/fuse/dax.c                           |   11
-rw-r--r--  fs/fuse/dev.c                           |  127
-rw-r--r--  fs/fuse/dev_uring.c                     | 1319
-rw-r--r--  fs/fuse/dev_uring_i.h                   |  205
-rw-r--r--  fs/fuse/dir.c                           |   32
-rw-r--r--  fs/fuse/fuse_dev_i.h                    |   66
-rw-r--r--  fs/fuse/fuse_i.h                        |   32
-rw-r--r--  fs/fuse/inode.c                         |   14
-rw-r--r--  fs/fuse/xattr.c                         |    7
-rw-r--r--  fs/nfs/Kconfig                          |    3
-rw-r--r--  fs/nfs/callback_proc.c                  |    2
-rw-r--r--  fs/nfs/client.c                         |    6
-rw-r--r--  fs/nfs/direct.c                         |    1
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c  |   52
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.h  |    1
-rw-r--r--  fs/nfs/inode.c                          |    3
-rw-r--r--  fs/nfs/internal.h                       |    9
-rw-r--r--  fs/nfs/localio.c                        |  236
-rw-r--r--  fs/nfs/nfs3proc.c                       |   46
-rw-r--r--  fs/nfs/nfs42proc.c                      |   24
-rw-r--r--  fs/nfs/nfs42xdr.c                       |    4
-rw-r--r--  fs/nfs/nfs4state.c                      |    1
-rw-r--r--  fs/nfs/nfstrace.h                       |   32
-rw-r--r--  fs/nfs/pagelist.c                       |    5
-rw-r--r--  fs/nfs/sysfs.c                          |    6
-rw-r--r--  fs/nfs/write.c                          |    3
-rw-r--r--  fs/nfs_common/Makefile                  |    3
-rw-r--r--  fs/nfs_common/common.c                  |   89
-rw-r--r--  fs/nfs_common/localio_trace.c           |   10
-rw-r--r--  fs/nfs_common/localio_trace.h           |   56
-rw-r--r--  fs/nfs_common/nfslocalio.c              |  250
-rw-r--r--  fs/nfsd/filecache.c                     |   20
-rw-r--r--  fs/nfsd/localio.c                       |    9
-rw-r--r--  fs/nfsd/netns.h                         |   12
-rw-r--r--  fs/nfsd/nfsctl.c                        |    6
-rw-r--r--  fs/nfsd/nfssvc.c                        |   40
-rw-r--r--  fs/orangefs/orangefs-debugfs.c          |   16
-rw-r--r--  fs/sysfs/file.c                         |    2
43 files changed, 2664 insertions(+), 532 deletions(-)
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 16e198a26339..e33cc77699cd 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -47,11 +47,17 @@ const struct file_operations debugfs_noop_file_operations = {
#define F_DENTRY(filp) ((filp)->f_path.dentry)
+const void *debugfs_get_aux(const struct file *file)
+{
+ return DEBUGFS_I(file_inode(file))->aux;
+}
+EXPORT_SYMBOL_GPL(debugfs_get_aux);
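
debugfs_get_aux() exposes the new per-inode aux pointer to handlers, leaving inode->i_private free for the regular data argument. A minimal hypothetical consumer, assuming aux was supplied at creation time through the aux parameter added to debugfs_create_file_full()/_short() further down; my_stats and my_stats_read are illustrative names:

	struct my_stats {
		u64 hits;
	};

	/* sketch: fetch the aux pointer from inside a read handler */
	static ssize_t my_stats_read(struct file *file, char __user *buf,
				     size_t count, loff_t *ppos)
	{
		const struct my_stats *stats = debugfs_get_aux(file);
		char tmp[24];
		int len = scnprintf(tmp, sizeof(tmp), "%llu\n", stats->hits);

		return simple_read_from_buffer(buf, count, ppos, tmp, len);
	}
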
+
const struct file_operations *debugfs_real_fops(const struct file *filp)
{
struct debugfs_fsdata *fsd = F_DENTRY(filp)->d_fsdata;
- if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) {
+ if (!fsd) {
/*
* Urgh, we've been called w/o a protecting
* debugfs_file_get().
@@ -84,9 +90,11 @@ static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode)
return -EINVAL;
d_fsd = READ_ONCE(dentry->d_fsdata);
- if (!((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) {
+ if (d_fsd) {
fsd = d_fsd;
} else {
+ struct inode *inode = dentry->d_inode;
+
if (WARN_ON(mode == DBGFS_GET_ALREADY))
return -EINVAL;
@@ -95,23 +103,38 @@ static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode)
return -ENOMEM;
if (mode == DBGFS_GET_SHORT) {
- fsd->real_fops = NULL;
- fsd->short_fops = (void *)((unsigned long)d_fsd &
- ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
+ const struct debugfs_short_fops *ops;
+ ops = fsd->short_fops = DEBUGFS_I(inode)->short_fops;
+ if (ops->llseek)
+ fsd->methods |= HAS_LSEEK;
+ if (ops->read)
+ fsd->methods |= HAS_READ;
+ if (ops->write)
+ fsd->methods |= HAS_WRITE;
} else {
- fsd->real_fops = (void *)((unsigned long)d_fsd &
- ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
- fsd->short_fops = NULL;
+ const struct file_operations *ops;
+ ops = fsd->real_fops = DEBUGFS_I(inode)->real_fops;
+ if (ops->llseek)
+ fsd->methods |= HAS_LSEEK;
+ if (ops->read)
+ fsd->methods |= HAS_READ;
+ if (ops->write)
+ fsd->methods |= HAS_WRITE;
+ if (ops->unlocked_ioctl)
+ fsd->methods |= HAS_IOCTL;
+ if (ops->poll)
+ fsd->methods |= HAS_POLL;
}
refcount_set(&fsd->active_users, 1);
init_completion(&fsd->active_users_drained);
INIT_LIST_HEAD(&fsd->cancellations);
mutex_init(&fsd->cancellations_mtx);
- if (cmpxchg(&dentry->d_fsdata, d_fsd, fsd) != d_fsd) {
+ d_fsd = cmpxchg(&dentry->d_fsdata, NULL, fsd);
+ if (d_fsd) {
mutex_destroy(&fsd->cancellations_mtx);
kfree(fsd);
- fsd = READ_ONCE(dentry->d_fsdata);
+ fsd = d_fsd;
}
}
@@ -208,8 +231,7 @@ void debugfs_enter_cancellation(struct file *file,
return;
fsd = READ_ONCE(dentry->d_fsdata);
- if (WARN_ON(!fsd ||
- ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)))
+ if (WARN_ON(!fsd))
return;
mutex_lock(&fsd->cancellations_mtx);
@@ -240,8 +262,7 @@ void debugfs_leave_cancellation(struct file *file,
return;
fsd = READ_ONCE(dentry->d_fsdata);
- if (WARN_ON(!fsd ||
- ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)))
+ if (WARN_ON(!fsd))
return;
mutex_lock(&fsd->cancellations_mtx);
@@ -322,13 +343,16 @@ const struct file_operations debugfs_open_proxy_file_operations = {
#define PROTO(args...) args
#define ARGS(args...) args
-#define FULL_PROXY_FUNC(name, ret_type, filp, proto, args) \
+#define FULL_PROXY_FUNC(name, ret_type, filp, proto, args, bit, ret) \
static ret_type full_proxy_ ## name(proto) \
{ \
- struct dentry *dentry = F_DENTRY(filp); \
+ struct dentry *dentry = F_DENTRY(filp); \
+ struct debugfs_fsdata *fsd = dentry->d_fsdata; \
const struct file_operations *real_fops; \
ret_type r; \
\
+ if (!(fsd->methods & bit)) \
+ return ret; \
r = debugfs_file_get(dentry); \
if (unlikely(r)) \
return r; \
@@ -338,17 +362,18 @@ static ret_type full_proxy_ ## name(proto) \
return r; \
}
-#define FULL_PROXY_FUNC_BOTH(name, ret_type, filp, proto, args) \
+#define FULL_PROXY_FUNC_BOTH(name, ret_type, filp, proto, args, bit, ret) \
static ret_type full_proxy_ ## name(proto) \
{ \
struct dentry *dentry = F_DENTRY(filp); \
- struct debugfs_fsdata *fsd; \
+ struct debugfs_fsdata *fsd = dentry->d_fsdata; \
ret_type r; \
\
+ if (!(fsd->methods & bit)) \
+ return ret; \
r = debugfs_file_get(dentry); \
if (unlikely(r)) \
return r; \
- fsd = dentry->d_fsdata; \
if (fsd->real_fops) \
r = fsd->real_fops->name(args); \
else \
@@ -359,29 +384,32 @@ static ret_type full_proxy_ ## name(proto) \
FULL_PROXY_FUNC_BOTH(llseek, loff_t, filp,
PROTO(struct file *filp, loff_t offset, int whence),
- ARGS(filp, offset, whence));
+ ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE);
FULL_PROXY_FUNC_BOTH(read, ssize_t, filp,
PROTO(struct file *filp, char __user *buf, size_t size,
loff_t *ppos),
- ARGS(filp, buf, size, ppos));
+ ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL);
FULL_PROXY_FUNC_BOTH(write, ssize_t, filp,
PROTO(struct file *filp, const char __user *buf,
size_t size, loff_t *ppos),
- ARGS(filp, buf, size, ppos));
+ ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL);
FULL_PROXY_FUNC(unlocked_ioctl, long, filp,
PROTO(struct file *filp, unsigned int cmd, unsigned long arg),
- ARGS(filp, cmd, arg));
+ ARGS(filp, cmd, arg), HAS_IOCTL, -ENOTTY);
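
Since the proxy fops are now a single static instance (debugfs_full_proxy_file_operations below) rather than a per-open copy, the methods bitmask decides at call time whether the real fops implements a handler. Hand-expanded for the ioctl case, the invocation above yields roughly the following (a sketch; the middle of the macro body is unchanged context not shown in this hunk):

	static long full_proxy_unlocked_ioctl(struct file *filp,
					      unsigned int cmd,
					      unsigned long arg)
	{
		struct dentry *dentry = F_DENTRY(filp);
		struct debugfs_fsdata *fsd = dentry->d_fsdata;
		const struct file_operations *real_fops;
		long r;

		/* per-method error if the real fops lacks the handler */
		if (!(fsd->methods & HAS_IOCTL))
			return -ENOTTY;
		r = debugfs_file_get(dentry);
		if (unlikely(r))
			return r;
		real_fops = debugfs_real_fops(filp);
		r = real_fops->unlocked_ioctl(filp, cmd, arg);
		debugfs_file_put(dentry);
		return r;
	}
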
static __poll_t full_proxy_poll(struct file *filp,
struct poll_table_struct *wait)
{
struct dentry *dentry = F_DENTRY(filp);
+ struct debugfs_fsdata *fsd = dentry->d_fsdata;
__poll_t r = 0;
const struct file_operations *real_fops;
+ if (!(fsd->methods & HAS_POLL))
+ return DEFAULT_POLLMASK;
if (debugfs_file_get(dentry))
return EPOLLHUP;
@@ -393,9 +421,7 @@ static __poll_t full_proxy_poll(struct file *filp,
static int full_proxy_release(struct inode *inode, struct file *filp)
{
- const struct dentry *dentry = F_DENTRY(filp);
const struct file_operations *real_fops = debugfs_real_fops(filp);
- const struct file_operations *proxy_fops = filp->f_op;
int r = 0;
/*
@@ -404,49 +430,21 @@ static int full_proxy_release(struct inode *inode, struct file *filp)
* not to leak any resources. Releasers must not assume that
* ->i_private is still being meaningful here.
*/
- if (real_fops && real_fops->release)
+ if (real_fops->release)
r = real_fops->release(inode, filp);
- replace_fops(filp, d_inode(dentry)->i_fop);
- kfree(proxy_fops);
fops_put(real_fops);
return r;
}
-static void __full_proxy_fops_init(struct file_operations *proxy_fops,
- struct debugfs_fsdata *fsd)
-{
- proxy_fops->release = full_proxy_release;
-
- if ((fsd->real_fops && fsd->real_fops->llseek) ||
- (fsd->short_fops && fsd->short_fops->llseek))
- proxy_fops->llseek = full_proxy_llseek;
-
- if ((fsd->real_fops && fsd->real_fops->read) ||
- (fsd->short_fops && fsd->short_fops->read))
- proxy_fops->read = full_proxy_read;
-
- if ((fsd->real_fops && fsd->real_fops->write) ||
- (fsd->short_fops && fsd->short_fops->write))
- proxy_fops->write = full_proxy_write;
-
- if (fsd->real_fops && fsd->real_fops->poll)
- proxy_fops->poll = full_proxy_poll;
-
- if (fsd->real_fops && fsd->real_fops->unlocked_ioctl)
- proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl;
-}
-
-static int full_proxy_open(struct inode *inode, struct file *filp,
- enum dbgfs_get_mode mode)
+static int full_proxy_open_regular(struct inode *inode, struct file *filp)
{
struct dentry *dentry = F_DENTRY(filp);
const struct file_operations *real_fops;
- struct file_operations *proxy_fops = NULL;
struct debugfs_fsdata *fsd;
int r;
- r = __debugfs_file_get(dentry, mode);
+ r = __debugfs_file_get(dentry, DBGFS_GET_REGULAR);
if (r)
return r == -EIO ? -ENOENT : r;
@@ -456,7 +454,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp,
if (r)
goto out;
- if (real_fops && !fops_get(real_fops)) {
+ if (!fops_get(real_fops)) {
#ifdef CONFIG_MODULES
if (real_fops->owner &&
real_fops->owner->state == MODULE_STATE_GOING) {
@@ -472,55 +470,52 @@ static int full_proxy_open(struct inode *inode, struct file *filp,
goto out;
}
- proxy_fops = kzalloc(sizeof(*proxy_fops), GFP_KERNEL);
- if (!proxy_fops) {
- r = -ENOMEM;
- goto free_proxy;
- }
- __full_proxy_fops_init(proxy_fops, fsd);
- replace_fops(filp, proxy_fops);
-
- if (!real_fops || real_fops->open) {
- if (real_fops)
- r = real_fops->open(inode, filp);
- else
- r = simple_open(inode, filp);
+ if (real_fops->open) {
+ r = real_fops->open(inode, filp);
if (r) {
- replace_fops(filp, d_inode(dentry)->i_fop);
- goto free_proxy;
- } else if (filp->f_op != proxy_fops) {
+ fops_put(real_fops);
+ } else if (filp->f_op != &debugfs_full_proxy_file_operations) {
/* No protection against file removal anymore. */
WARN(1, "debugfs file owner replaced proxy fops: %pd",
dentry);
- goto free_proxy;
+ fops_put(real_fops);
}
}
-
- goto out;
-free_proxy:
- kfree(proxy_fops);
- fops_put(real_fops);
out:
debugfs_file_put(dentry);
return r;
}
-static int full_proxy_open_regular(struct inode *inode, struct file *filp)
-{
- return full_proxy_open(inode, filp, DBGFS_GET_REGULAR);
-}
-
const struct file_operations debugfs_full_proxy_file_operations = {
.open = full_proxy_open_regular,
+ .release = full_proxy_release,
+ .llseek = full_proxy_llseek,
+ .read = full_proxy_read,
+ .write = full_proxy_write,
+ .poll = full_proxy_poll,
+ .unlocked_ioctl = full_proxy_unlocked_ioctl
};
static int full_proxy_open_short(struct inode *inode, struct file *filp)
{
- return full_proxy_open(inode, filp, DBGFS_GET_SHORT);
+ struct dentry *dentry = F_DENTRY(filp);
+ int r;
+
+ r = __debugfs_file_get(dentry, DBGFS_GET_SHORT);
+ if (r)
+ return r == -EIO ? -ENOENT : r;
+ r = debugfs_locked_down(inode, filp, NULL);
+ if (!r)
+ r = simple_open(inode, filp);
+ debugfs_file_put(dentry);
+ return r;
}
const struct file_operations debugfs_full_short_proxy_file_operations = {
.open = full_proxy_open_short,
+ .llseek = full_proxy_llseek,
+ .read = full_proxy_read,
+ .write = full_proxy_write,
};
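
For context, a hypothetical registration that lands on these static short-proxy fops; a short-fops file has no open/release of its own (full_proxy_open_short() above uses simple_open()), so only llseek/read/write can appear:

	static const struct debugfs_short_fops my_short_fops = {
		.read	= my_stats_read,	/* illustrative handler */
		.llseek	= default_llseek,
	};

	/* aux is the creation-side argument added in inode.c below;
	 * NULL if unused */
	debugfs_create_file_short("stats", 0444, parent, data, NULL,
				  &my_short_fops);
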
ssize_t debugfs_attr_read(struct file *file, char __user *buf,
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index e752009de929..75715d8877ee 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -208,16 +208,34 @@ static int debugfs_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
+static struct kmem_cache *debugfs_inode_cachep __ro_after_init;
+
+static void init_once(void *foo)
+{
+ struct debugfs_inode_info *info = foo;
+ inode_init_once(&info->vfs_inode);
+}
+
+static struct inode *debugfs_alloc_inode(struct super_block *sb)
+{
+ struct debugfs_inode_info *info;
+ info = alloc_inode_sb(sb, debugfs_inode_cachep, GFP_KERNEL);
+ if (!info)
+ return NULL;
+ return &info->vfs_inode;
+}
+
static void debugfs_free_inode(struct inode *inode)
{
if (S_ISLNK(inode->i_mode))
kfree(inode->i_link);
- free_inode_nonrcu(inode);
+ kmem_cache_free(debugfs_inode_cachep, DEBUGFS_I(inode));
}
static const struct super_operations debugfs_super_operations = {
.statfs = simple_statfs,
.show_options = debugfs_show_options,
+ .alloc_inode = debugfs_alloc_inode,
.free_inode = debugfs_free_inode,
};
@@ -225,23 +243,18 @@ static void debugfs_release_dentry(struct dentry *dentry)
{
struct debugfs_fsdata *fsd = dentry->d_fsdata;
- if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)
- return;
-
- /* check it wasn't a dir (no fsdata) or automount (no real_fops) */
- if (fsd && (fsd->real_fops || fsd->short_fops)) {
+ if (fsd) {
WARN_ON(!list_empty(&fsd->cancellations));
mutex_destroy(&fsd->cancellations_mtx);
}
-
kfree(fsd);
}
static struct vfsmount *debugfs_automount(struct path *path)
{
- struct debugfs_fsdata *fsd = path->dentry->d_fsdata;
+ struct inode *inode = path->dentry->d_inode;
- return fsd->automount(path->dentry, d_inode(path->dentry)->i_private);
+ return DEBUGFS_I(inode)->automount(path->dentry, inode->i_private);
}
static const struct dentry_operations debugfs_dops = {
@@ -411,6 +424,7 @@ static struct dentry *end_creating(struct dentry *dentry)
static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
+ const void *aux,
const struct file_operations *proxy_fops,
const void *real_fops)
{
@@ -441,9 +455,11 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
inode->i_private = data;
inode->i_op = &debugfs_file_inode_operations;
+ if (!real_fops)
+ proxy_fops = &debugfs_noop_file_operations;
inode->i_fop = proxy_fops;
- dentry->d_fsdata = (void *)((unsigned long)real_fops |
- DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
+ DEBUGFS_I(inode)->raw = real_fops;
+ DEBUGFS_I(inode)->aux = aux;
d_instantiate(dentry, inode);
fsnotify_create(d_inode(dentry->d_parent), dentry);
@@ -452,30 +468,22 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
struct dentry *debugfs_create_file_full(const char *name, umode_t mode,
struct dentry *parent, void *data,
+ const void *aux,
const struct file_operations *fops)
{
- if (WARN_ON((unsigned long)fops &
- DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))
- return ERR_PTR(-EINVAL);
-
- return __debugfs_create_file(name, mode, parent, data,
- fops ? &debugfs_full_proxy_file_operations :
- &debugfs_noop_file_operations,
+ return __debugfs_create_file(name, mode, parent, data, aux,
+ &debugfs_full_proxy_file_operations,
fops);
}
EXPORT_SYMBOL_GPL(debugfs_create_file_full);
struct dentry *debugfs_create_file_short(const char *name, umode_t mode,
- struct dentry *parent, void *data,
- const struct debugfs_short_fops *fops)
+ struct dentry *parent, void *data,
+ const void *aux,
+ const struct debugfs_short_fops *fops)
{
- if (WARN_ON((unsigned long)fops &
- DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))
- return ERR_PTR(-EINVAL);
-
- return __debugfs_create_file(name, mode, parent, data,
- fops ? &debugfs_full_short_proxy_file_operations :
- &debugfs_noop_file_operations,
+ return __debugfs_create_file(name, mode, parent, data, aux,
+ &debugfs_full_short_proxy_file_operations,
fops);
}
EXPORT_SYMBOL_GPL(debugfs_create_file_short);
@@ -512,9 +520,8 @@ struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode,
const struct file_operations *fops)
{
- return __debugfs_create_file(name, mode, parent, data,
- fops ? &debugfs_open_proxy_file_operations :
- &debugfs_noop_file_operations,
+ return __debugfs_create_file(name, mode, parent, data, NULL,
+ &debugfs_open_proxy_file_operations,
fops);
}
EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe);
@@ -624,23 +631,13 @@ struct dentry *debugfs_create_automount(const char *name,
void *data)
{
struct dentry *dentry = start_creating(name, parent);
- struct debugfs_fsdata *fsd;
struct inode *inode;
if (IS_ERR(dentry))
return dentry;
- fsd = kzalloc(sizeof(*fsd), GFP_KERNEL);
- if (!fsd) {
- failed_creating(dentry);
- return ERR_PTR(-ENOMEM);
- }
-
- fsd->automount = f;
-
if (!(debugfs_allow & DEBUGFS_ALLOW_API)) {
failed_creating(dentry);
- kfree(fsd);
return ERR_PTR(-EPERM);
}
@@ -648,14 +645,13 @@ struct dentry *debugfs_create_automount(const char *name,
if (unlikely(!inode)) {
pr_err("out of free dentries, can not create automount '%s'\n",
name);
- kfree(fsd);
return failed_creating(dentry);
}
make_empty_dir_inode(inode);
inode->i_flags |= S_AUTOMOUNT;
inode->i_private = data;
- dentry->d_fsdata = fsd;
+ DEBUGFS_I(inode)->automount = f;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
d_instantiate(dentry, inode);
@@ -730,7 +726,7 @@ static void __debugfs_file_removed(struct dentry *dentry)
*/
smp_mb();
fsd = READ_ONCE(dentry->d_fsdata);
- if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)
+ if (!fsd)
return;
/* if this was the last reference, we're done */
@@ -834,76 +830,70 @@ void debugfs_lookup_and_remove(const char *name, struct dentry *parent)
EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove);
/**
- * debugfs_rename - rename a file/directory in the debugfs filesystem
- * @old_dir: a pointer to the parent dentry for the renamed object. This
- * should be a directory dentry.
- * @old_dentry: dentry of an object to be renamed.
- * @new_dir: a pointer to the parent dentry where the object should be
- * moved. This should be a directory dentry.
- * @new_name: a pointer to a string containing the target name.
+ * debugfs_change_name - rename a file/directory in the debugfs filesystem
+ * @dentry: dentry of an object to be renamed.
+ * @fmt: format for new name
*
* This function renames a file/directory in debugfs. The target must not
* exist for rename to succeed.
*
- * This function will return a pointer to old_dentry (which is updated to
- * reflect renaming) if it succeeds. If an error occurs, ERR_PTR(-ERROR)
- * will be returned.
+ * This function will return 0 on success and -E... on failure.
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
*/
-struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
- struct dentry *new_dir, const char *new_name)
+int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, ...)
{
- int error;
- struct dentry *dentry = NULL, *trap;
+ int error = 0;
+ const char *new_name;
struct name_snapshot old_name;
+ struct dentry *parent, *target;
+ struct inode *dir;
+ va_list ap;
- if (IS_ERR(old_dir))
- return old_dir;
- if (IS_ERR(new_dir))
- return new_dir;
- if (IS_ERR_OR_NULL(old_dentry))
- return old_dentry;
-
- trap = lock_rename(new_dir, old_dir);
- /* Source or destination directories don't exist? */
- if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir))
- goto exit;
- /* Source does not exist, cyclic rename, or mountpoint? */
- if (d_really_is_negative(old_dentry) || old_dentry == trap ||
- d_mountpoint(old_dentry))
- goto exit;
- dentry = lookup_one_len(new_name, new_dir, strlen(new_name));
- /* Lookup failed, cyclic rename or target exists? */
- if (IS_ERR(dentry) || dentry == trap || d_really_is_positive(dentry))
- goto exit;
-
- take_dentry_name_snapshot(&old_name, old_dentry);
-
- error = simple_rename(&nop_mnt_idmap, d_inode(old_dir), old_dentry,
- d_inode(new_dir), dentry, 0);
- if (error) {
- release_dentry_name_snapshot(&old_name);
- goto exit;
+ if (IS_ERR_OR_NULL(dentry))
+ return 0;
+
+ va_start(ap, fmt);
+ new_name = kvasprintf_const(GFP_KERNEL, fmt, ap);
+ va_end(ap);
+ if (!new_name)
+ return -ENOMEM;
+
+ parent = dget_parent(dentry);
+ dir = d_inode(parent);
+ inode_lock(dir);
+
+ take_dentry_name_snapshot(&old_name, dentry);
+
+ if (WARN_ON_ONCE(dentry->d_parent != parent)) {
+ error = -EINVAL;
+ goto out;
}
- d_move(old_dentry, dentry);
- fsnotify_move(d_inode(old_dir), d_inode(new_dir), &old_name.name,
- d_is_dir(old_dentry),
- NULL, old_dentry);
+ if (strcmp(old_name.name.name, new_name) == 0)
+ goto out;
+ target = lookup_one_len(new_name, parent, strlen(new_name));
+ if (IS_ERR(target)) {
+ error = PTR_ERR(target);
+ goto out;
+ }
+ if (d_really_is_positive(target)) {
+ dput(target);
+ error = -EINVAL;
+ goto out;
+ }
+ simple_rename_timestamp(dir, dentry, dir, target);
+ d_move(dentry, target);
+ dput(target);
+ fsnotify_move(dir, dir, &old_name.name, d_is_dir(dentry), NULL, dentry);
+out:
release_dentry_name_snapshot(&old_name);
- unlock_rename(new_dir, old_dir);
- dput(dentry);
- return old_dentry;
-exit:
- if (dentry && !IS_ERR(dentry))
- dput(dentry);
- unlock_rename(new_dir, old_dir);
- if (IS_ERR(dentry))
- return dentry;
- return ERR_PTR(-EINVAL);
+ inode_unlock(dir);
+ dput(parent);
+ kfree_const(new_name);
+ return error;
}
-EXPORT_SYMBOL_GPL(debugfs_rename);
+EXPORT_SYMBOL_GPL(debugfs_change_name);
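
A hypothetical caller conversion for illustration: where a driver previously renamed a node within one parent through the removed API, the new helper takes only the dentry plus a printf-style name and returns an errno:

	/* before (removed):
	 *	debugfs_rename(parent, dentry, parent, newname);
	 * after: */
	int err = debugfs_change_name(dentry, "eth%d", unit_nr);

	if (err)
		pr_debug("debugfs rename failed: %d\n", err);
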
/**
* debugfs_initialized - Tells whether debugfs has been registered
@@ -939,12 +929,22 @@ static int __init debugfs_init(void)
if (retval)
return retval;
- retval = register_filesystem(&debug_fs_type);
- if (retval)
+ debugfs_inode_cachep = kmem_cache_create("debugfs_inode_cache",
+ sizeof(struct debugfs_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
+ init_once);
+ if (debugfs_inode_cachep == NULL) {
sysfs_remove_mount_point(kernel_kobj, "debug");
- else
- debugfs_registered = true;
+ return -ENOMEM;
+ }
- return retval;
+ retval = register_filesystem(&debug_fs_type);
+ if (retval) { // Really not going to happen
+ sysfs_remove_mount_point(kernel_kobj, "debug");
+ kmem_cache_destroy(debugfs_inode_cachep);
+ return retval;
+ }
+ debugfs_registered = true;
+ return 0;
}
core_initcall(debugfs_init);
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h
index bbae4a228ef4..93483fe84425 100644
--- a/fs/debugfs/internal.h
+++ b/fs/debugfs/internal.h
@@ -11,6 +11,22 @@
struct file_operations;
+struct debugfs_inode_info {
+ struct inode vfs_inode;
+ union {
+ const void *raw;
+ const struct file_operations *real_fops;
+ const struct debugfs_short_fops *short_fops;
+ debugfs_automount_t automount;
+ };
+ const void *aux;
+};
+
+static inline struct debugfs_inode_info *DEBUGFS_I(struct inode *inode)
+{
+ return container_of(inode, struct debugfs_inode_info, vfs_inode);
+}
+
/* declared over in file.c */
extern const struct file_operations debugfs_noop_file_operations;
extern const struct file_operations debugfs_open_proxy_file_operations;
@@ -20,29 +36,25 @@ extern const struct file_operations debugfs_full_short_proxy_file_operations;
struct debugfs_fsdata {
const struct file_operations *real_fops;
const struct debugfs_short_fops *short_fops;
- union {
- /* automount_fn is used when real_fops is NULL */
- debugfs_automount_t automount;
- struct {
- refcount_t active_users;
- struct completion active_users_drained;
-
- /* protect cancellations */
- struct mutex cancellations_mtx;
- struct list_head cancellations;
- };
+ struct {
+ refcount_t active_users;
+ struct completion active_users_drained;
+
+ /* protect cancellations */
+ struct mutex cancellations_mtx;
+ struct list_head cancellations;
+ unsigned int methods;
};
};
-/*
- * A dentry's ->d_fsdata either points to the real fops or to a
- * dynamically allocated debugfs_fsdata instance.
- * In order to distinguish between these two cases, a real fops
- * pointer gets its lowest bit set.
- */
-#define DEBUGFS_FSDATA_IS_REAL_FOPS_BIT BIT(0)
+enum {
+ HAS_READ = 1,
+ HAS_WRITE = 2,
+ HAS_LSEEK = 4,
+ HAS_POLL = 8,
+ HAS_IOCTL = 16
+};
-/* Access BITS */
#define DEBUGFS_ALLOW_API BIT(0)
#define DEBUGFS_ALLOW_MOUNT BIT(1)
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 8674dbfbe59d..ca215a3cba3e 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -63,3 +63,15 @@ config FUSE_PASSTHROUGH
to be performed directly on a backing file.
If you want to allow passthrough operations, answer Y.
+
+config FUSE_IO_URING
+ bool "FUSE communication over io-uring"
+ default y
+ depends on FUSE_FS
+ depends on IO_URING
+ help
+ This allows sending FUSE requests over the io-uring interface and
+ also adds request core affinity.
+
+ If you want to allow fuse server/client communication through io-uring,
+ answer Y.
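
For reference, the dependencies above correspond to a .config fragment like the one below; note that dev_uring.c additionally gates the feature at run time behind the enable_uring module parameter:

	CONFIG_FUSE_FS=y
	CONFIG_IO_URING=y
	# default y, selectable only when both dependencies are met
	CONFIG_FUSE_IO_URING=y
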
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 2c372180d631..3f0f312a31c1 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -15,5 +15,6 @@ fuse-y += iomode.o
fuse-$(CONFIG_FUSE_DAX) += dax.o
fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
fuse-$(CONFIG_SYSCTL) += sysctl.o
+fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 9abbc2f2894f..0b6ee6dd1fd6 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -240,11 +240,12 @@ static int fuse_send_removemapping(struct inode *inode,
args.opcode = FUSE_REMOVEMAPPING;
args.nodeid = fi->nodeid;
- args.in_numargs = 2;
- args.in_args[0].size = sizeof(*inargp);
- args.in_args[0].value = inargp;
- args.in_args[1].size = inargp->count * sizeof(*remove_one);
- args.in_args[1].value = remove_one;
+ args.in_numargs = 3;
+ fuse_set_zero_arg0(&args);
+ args.in_args[1].size = sizeof(*inargp);
+ args.in_args[1].value = inargp;
+ args.in_args[2].size = inargp->count * sizeof(*remove_one);
+ args.in_args[2].value = remove_one;
return fuse_simple_request(fm, &args);
}
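
fuse_set_zero_arg0() is declared in fuse_dev_i.h, which this excerpt does not include; judging from its use here and in dev.c, it presumably reserves in_args[0] as an empty per-op header slot, roughly along these lines, so that every request carries its op header at index 0 and the io-uring path can copy headers and payload uniformly:

	/* sketch only - assumed shape of the helper, not shown in this diff */
	static inline void fuse_set_zero_arg0(struct fuse_args *args)
	{
		args->in_args[0].size = 0;
		args->in_args[0].value = NULL;
	}
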
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 27ccae63495d..5b5f789b37eb 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -6,7 +6,9 @@
See the file COPYING.
*/
+#include "dev_uring_i.h"
#include "fuse_i.h"
+#include "fuse_dev_i.h"
#include <linux/init.h>
#include <linux/module.h>
@@ -28,23 +30,8 @@
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
-/* Ordinary requests have even IDs, while interrupts IDs are odd */
-#define FUSE_INT_REQ_BIT (1ULL << 0)
-#define FUSE_REQ_ID_STEP (1ULL << 1)
-
static struct kmem_cache *fuse_req_cachep;
-static void end_requests(struct list_head *head);
-
-static struct fuse_dev *fuse_get_dev(struct file *file)
-{
- /*
- * Lockless access is OK, because file->private data is set
- * once during mount and is valid until the file is released.
- */
- return READ_ONCE(file->private_data);
-}
-
static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
{
INIT_LIST_HEAD(&req->list);
@@ -89,7 +76,8 @@ void fuse_set_initialized(struct fuse_conn *fc)
static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
{
- return !fc->initialized || (for_background && fc->blocked);
+ return !fc->initialized || (for_background && fc->blocked) ||
+ (fc->io_uring && !fuse_uring_ready(fc));
}
static void fuse_drop_waiting(struct fuse_conn *fc)
@@ -234,7 +222,7 @@ u64 fuse_get_unique(struct fuse_iqueue *fiq)
}
EXPORT_SYMBOL_GPL(fuse_get_unique);
-static unsigned int fuse_req_hash(u64 unique)
+unsigned int fuse_req_hash(u64 unique)
{
return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS);
}
@@ -250,7 +238,8 @@ __releases(fiq->lock)
spin_unlock(&fiq->lock);
}
-static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *forget)
+void fuse_dev_queue_forget(struct fuse_iqueue *fiq,
+ struct fuse_forget_link *forget)
{
spin_lock(&fiq->lock);
if (fiq->connected) {
@@ -263,7 +252,7 @@ static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_li
}
}
-static void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
+void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
{
spin_lock(&fiq->lock);
if (list_empty(&req->intr_entry)) {
@@ -580,7 +569,25 @@ ssize_t __fuse_simple_request(struct mnt_idmap *idmap,
return ret;
}
-static bool fuse_request_queue_background(struct fuse_req *req)
+#ifdef CONFIG_FUSE_IO_URING
+static bool fuse_request_queue_background_uring(struct fuse_conn *fc,
+ struct fuse_req *req)
+{
+ struct fuse_iqueue *fiq = &fc->iq;
+
+ req->in.h.unique = fuse_get_unique(fiq);
+ req->in.h.len = sizeof(struct fuse_in_header) +
+ fuse_len_args(req->args->in_numargs,
+ (struct fuse_arg *) req->args->in_args);
+
+ return fuse_uring_queue_bq_req(req);
+}
+#endif
+
+/*
+ * @return true if queued
+ */
+static int fuse_request_queue_background(struct fuse_req *req)
{
struct fuse_mount *fm = req->fm;
struct fuse_conn *fc = fm->fc;
@@ -592,6 +599,12 @@ static bool fuse_request_queue_background(struct fuse_req *req)
atomic_inc(&fc->num_waiting);
}
__set_bit(FR_ISREPLY, &req->flags);
+
+#ifdef CONFIG_FUSE_IO_URING
+ if (fuse_uring_ready(fc))
+ return fuse_request_queue_background_uring(fc, req);
+#endif
+
spin_lock(&fc->bg_lock);
if (likely(fc->connected)) {
fc->num_background++;
@@ -692,22 +705,8 @@ static int unlock_request(struct fuse_req *req)
return err;
}
-struct fuse_copy_state {
- int write;
- struct fuse_req *req;
- struct iov_iter *iter;
- struct pipe_buffer *pipebufs;
- struct pipe_buffer *currbuf;
- struct pipe_inode_info *pipe;
- unsigned long nr_segs;
- struct page *pg;
- unsigned len;
- unsigned offset;
- unsigned move_pages:1;
-};
-
-static void fuse_copy_init(struct fuse_copy_state *cs, int write,
- struct iov_iter *iter)
+void fuse_copy_init(struct fuse_copy_state *cs, int write,
+ struct iov_iter *iter)
{
memset(cs, 0, sizeof(*cs));
cs->write = write;
@@ -814,6 +813,9 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
*size -= ncpy;
cs->len -= ncpy;
cs->offset += ncpy;
+ if (cs->is_uring)
+ cs->ring.copied_sz += ncpy;
+
return ncpy;
}
@@ -1068,9 +1070,9 @@ static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
}
/* Copy request arguments to/from userspace buffer */
-static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
- unsigned argpages, struct fuse_arg *args,
- int zeroing)
+int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
+ unsigned argpages, struct fuse_arg *args,
+ int zeroing)
{
int err = 0;
unsigned i;
@@ -1760,7 +1762,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
args = &ap->args;
args->nodeid = outarg->nodeid;
args->opcode = FUSE_NOTIFY_REPLY;
- args->in_numargs = 2;
+ args->in_numargs = 3;
args->in_pages = true;
args->end = fuse_retrieve_end;
@@ -1788,9 +1790,10 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
}
ra->inarg.offset = outarg->offset;
ra->inarg.size = total_len;
- args->in_args[0].size = sizeof(ra->inarg);
- args->in_args[0].value = &ra->inarg;
- args->in_args[1].size = total_len;
+ fuse_set_zero_arg0(args);
+ args->in_args[1].size = sizeof(ra->inarg);
+ args->in_args[1].value = &ra->inarg;
+ args->in_args[2].size = total_len;
err = fuse_simple_notify_reply(fm, args, outarg->notify_unique);
if (err)
@@ -1885,7 +1888,7 @@ static void fuse_resend(struct fuse_conn *fc)
spin_unlock(&fiq->lock);
list_for_each_entry(req, &to_queue, list)
clear_bit(FR_PENDING, &req->flags);
- end_requests(&to_queue);
+ fuse_dev_end_requests(&to_queue);
return;
}
/* iq and pq requests are both oldest to newest */
@@ -1934,7 +1937,7 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
}
/* Look up request on processing list by unique ID */
-static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique)
+struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique)
{
unsigned int hash = fuse_req_hash(unique);
struct fuse_req *req;
@@ -1946,10 +1949,17 @@ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique)
return NULL;
}
-static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
- unsigned nbytes)
+int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
+ unsigned nbytes)
{
- unsigned reqsize = sizeof(struct fuse_out_header);
+
+ unsigned int reqsize = 0;
+
+ /*
+ * Uring has all headers separated from args - args is payload only
+ */
+ if (!cs->is_uring)
+ reqsize = sizeof(struct fuse_out_header);
reqsize += fuse_len_args(args->out_numargs, args->out_args);
@@ -2011,7 +2021,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
spin_lock(&fpq->lock);
req = NULL;
if (fpq->connected)
- req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT);
+ req = fuse_request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT);
err = -ENOENT;
if (!req) {
@@ -2049,7 +2059,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
if (oh.error)
err = nbytes != sizeof(oh) ? -EINVAL : 0;
else
- err = copy_out_args(cs, req->args, nbytes);
+ err = fuse_copy_out_args(cs, req->args, nbytes);
fuse_copy_finish(cs);
spin_lock(&fpq->lock);
@@ -2204,7 +2214,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait)
}
/* Abort all requests on the given list (pending or processing) */
-static void end_requests(struct list_head *head)
+void fuse_dev_end_requests(struct list_head *head)
{
while (!list_empty(head)) {
struct fuse_req *req;
@@ -2307,7 +2317,13 @@ void fuse_abort_conn(struct fuse_conn *fc)
wake_up_all(&fc->blocked_waitq);
spin_unlock(&fc->lock);
- end_requests(&to_end);
+ fuse_dev_end_requests(&to_end);
+
+ /*
+ * fc->lock must not be taken to avoid conflicts with io-uring
+ * locks
+ */
+ fuse_uring_abort(fc);
} else {
spin_unlock(&fc->lock);
}
@@ -2319,6 +2335,8 @@ void fuse_wait_aborted(struct fuse_conn *fc)
/* matches implicit memory barrier in fuse_drop_waiting() */
smp_mb();
wait_event(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0);
+
+ fuse_uring_wait_stopped_queues(fc);
}
int fuse_dev_release(struct inode *inode, struct file *file)
@@ -2337,7 +2355,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
list_splice_init(&fpq->processing[i], &to_end);
spin_unlock(&fpq->lock);
- end_requests(&to_end);
+ fuse_dev_end_requests(&to_end);
/* Are we the last open device? */
if (atomic_dec_and_test(&fc->dev_count)) {
@@ -2475,6 +2493,9 @@ const struct file_operations fuse_dev_operations = {
.fasync = fuse_dev_fasync,
.unlocked_ioctl = fuse_dev_ioctl,
.compat_ioctl = compat_ptr_ioctl,
+#ifdef CONFIG_FUSE_IO_URING
+ .uring_cmd = fuse_uring_cmd,
+#endif
};
EXPORT_SYMBOL_GPL(fuse_dev_operations);
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
new file mode 100644
index 000000000000..ebd2931b4f2a
--- /dev/null
+++ b/fs/fuse/dev_uring.c
@@ -0,0 +1,1319 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (c) 2023-2024 DataDirect Networks.
+ */
+
+#include "fuse_i.h"
+#include "dev_uring_i.h"
+#include "fuse_dev_i.h"
+
+#include <linux/fs.h>
+#include <linux/io_uring/cmd.h>
+
+static bool __read_mostly enable_uring;
+module_param(enable_uring, bool, 0644);
+MODULE_PARM_DESC(enable_uring,
+ "Enable userspace communication through io-uring");
+
+#define FUSE_URING_IOV_SEGS 2 /* header and payload */
+
+bool fuse_uring_enabled(void)
+{
+ return enable_uring;
+}
+
+struct fuse_uring_pdu {
+ struct fuse_ring_ent *ent;
+};
+
+static const struct fuse_iqueue_ops fuse_io_uring_ops;
+
+static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd,
+ struct fuse_ring_ent *ring_ent)
+{
+ struct fuse_uring_pdu *pdu =
+ io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu);
+
+ pdu->ent = ring_ent;
+}
+
+static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd)
+{
+ struct fuse_uring_pdu *pdu =
+ io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu);
+
+ return pdu->ent;
+}
+
+static void fuse_uring_flush_bg(struct fuse_ring_queue *queue)
+{
+ struct fuse_ring *ring = queue->ring;
+ struct fuse_conn *fc = ring->fc;
+
+ lockdep_assert_held(&queue->lock);
+ lockdep_assert_held(&fc->bg_lock);
+
+ /*
+ * Allow one bg request per queue, ignoring global fc limits.
+ * This prevents a single queue from consuming all resources and
+ * eliminates the need for remote queue wake-ups when global
+ * limits are met but this queue has no more waiting requests.
+ */
+ while ((fc->active_background < fc->max_background ||
+ !queue->active_background) &&
+ (!list_empty(&queue->fuse_req_bg_queue))) {
+ struct fuse_req *req;
+
+ req = list_first_entry(&queue->fuse_req_bg_queue,
+ struct fuse_req, list);
+ fc->active_background++;
+ queue->active_background++;
+
+ list_move_tail(&req->list, &queue->fuse_req_queue);
+ }
+}
+
+static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req,
+ int error)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+ struct fuse_ring *ring = queue->ring;
+ struct fuse_conn *fc = ring->fc;
+
+ lockdep_assert_not_held(&queue->lock);
+ spin_lock(&queue->lock);
+ ent->fuse_req = NULL;
+ if (test_bit(FR_BACKGROUND, &req->flags)) {
+ queue->active_background--;
+ spin_lock(&fc->bg_lock);
+ fuse_uring_flush_bg(queue);
+ spin_unlock(&fc->bg_lock);
+ }
+
+ spin_unlock(&queue->lock);
+
+ if (error)
+ req->out.h.error = error;
+
+ clear_bit(FR_SENT, &req->flags);
+ fuse_request_end(req);
+}
+
+/* Abort all queued requests on the given ring queue */
+static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue)
+{
+ struct fuse_req *req;
+ LIST_HEAD(req_list);
+
+ spin_lock(&queue->lock);
+ list_for_each_entry(req, &queue->fuse_req_queue, list)
+ clear_bit(FR_PENDING, &req->flags);
+ list_splice_init(&queue->fuse_req_queue, &req_list);
+ spin_unlock(&queue->lock);
+
+ /* must not hold queue lock to avoid order issues with fi->lock */
+ fuse_dev_end_requests(&req_list);
+}
+
+void fuse_uring_abort_end_requests(struct fuse_ring *ring)
+{
+ int qid;
+ struct fuse_ring_queue *queue;
+ struct fuse_conn *fc = ring->fc;
+
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ queue = READ_ONCE(ring->queues[qid]);
+ if (!queue)
+ continue;
+
+ queue->stopped = true;
+
+ WARN_ON_ONCE(ring->fc->max_background != UINT_MAX);
+ spin_lock(&queue->lock);
+ spin_lock(&fc->bg_lock);
+ fuse_uring_flush_bg(queue);
+ spin_unlock(&fc->bg_lock);
+ spin_unlock(&queue->lock);
+ fuse_uring_abort_end_queue_requests(queue);
+ }
+}
+
+void fuse_uring_destruct(struct fuse_conn *fc)
+{
+ struct fuse_ring *ring = fc->ring;
+ int qid;
+
+ if (!ring)
+ return;
+
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ struct fuse_ring_queue *queue = ring->queues[qid];
+ struct fuse_ring_ent *ent, *next;
+
+ if (!queue)
+ continue;
+
+ WARN_ON(!list_empty(&queue->ent_avail_queue));
+ WARN_ON(!list_empty(&queue->ent_w_req_queue));
+ WARN_ON(!list_empty(&queue->ent_commit_queue));
+ WARN_ON(!list_empty(&queue->ent_in_userspace));
+
+ list_for_each_entry_safe(ent, next, &queue->ent_released,
+ list) {
+ list_del_init(&ent->list);
+ kfree(ent);
+ }
+
+ kfree(queue->fpq.processing);
+ kfree(queue);
+ ring->queues[qid] = NULL;
+ }
+
+ kfree(ring->queues);
+ kfree(ring);
+ fc->ring = NULL;
+}
+
+/*
+ * Basic ring setup for this connection based on the provided configuration
+ */
+static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
+{
+ struct fuse_ring *ring;
+ size_t nr_queues = num_possible_cpus();
+ struct fuse_ring *res = NULL;
+ size_t max_payload_size;
+
+ ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT);
+ if (!ring)
+ return NULL;
+
+ ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *),
+ GFP_KERNEL_ACCOUNT);
+ if (!ring->queues)
+ goto out_err;
+
+ max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write);
+ max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE);
+
+ spin_lock(&fc->lock);
+ if (fc->ring) {
+ /* race, another thread created the ring in the meantime */
+ spin_unlock(&fc->lock);
+ res = fc->ring;
+ goto out_err;
+ }
+
+ init_waitqueue_head(&ring->stop_waitq);
+
+ fc->ring = ring;
+ ring->nr_queues = nr_queues;
+ ring->fc = fc;
+ ring->max_payload_sz = max_payload_size;
+ atomic_set(&ring->queue_refs, 0);
+
+ spin_unlock(&fc->lock);
+ return ring;
+
+out_err:
+ kfree(ring->queues);
+ kfree(ring);
+ return res;
+}
+
+static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring,
+ int qid)
+{
+ struct fuse_conn *fc = ring->fc;
+ struct fuse_ring_queue *queue;
+ struct list_head *pq;
+
+ queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT);
+ if (!queue)
+ return NULL;
+ pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL);
+ if (!pq) {
+ kfree(queue);
+ return NULL;
+ }
+
+ queue->qid = qid;
+ queue->ring = ring;
+ spin_lock_init(&queue->lock);
+
+ INIT_LIST_HEAD(&queue->ent_avail_queue);
+ INIT_LIST_HEAD(&queue->ent_commit_queue);
+ INIT_LIST_HEAD(&queue->ent_w_req_queue);
+ INIT_LIST_HEAD(&queue->ent_in_userspace);
+ INIT_LIST_HEAD(&queue->fuse_req_queue);
+ INIT_LIST_HEAD(&queue->fuse_req_bg_queue);
+ INIT_LIST_HEAD(&queue->ent_released);
+
+ queue->fpq.processing = pq;
+ fuse_pqueue_init(&queue->fpq);
+
+ spin_lock(&fc->lock);
+ if (ring->queues[qid]) {
+ spin_unlock(&fc->lock);
+ kfree(queue->fpq.processing);
+ kfree(queue);
+ return ring->queues[qid];
+ }
+
+ /*
+ * WRITE_ONCE() under the lock, as readers of ring->queues[qid]
+ * mostly don't take the lock at all
+ */
+ WRITE_ONCE(ring->queues[qid], queue);
+ spin_unlock(&fc->lock);
+
+ return queue;
+}
+
+static void fuse_uring_stop_fuse_req_end(struct fuse_req *req)
+{
+ clear_bit(FR_SENT, &req->flags);
+ req->out.h.error = -ECONNABORTED;
+ fuse_request_end(req);
+}
+
+/*
+ * Release a request/entry on connection tear down
+ */
+static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent)
+{
+ struct fuse_req *req;
+ struct io_uring_cmd *cmd;
+
+ struct fuse_ring_queue *queue = ent->queue;
+
+ spin_lock(&queue->lock);
+ cmd = ent->cmd;
+ ent->cmd = NULL;
+ req = ent->fuse_req;
+ ent->fuse_req = NULL;
+ if (req) {
+ /* remove entry from queue->fpq->processing */
+ list_del_init(&req->list);
+ }
+
+ /*
+ * The entry must not be freed immediately: IO_URING_F_CANCEL handling
+ * accesses entries through direct pointers, so there is a risk of a
+ * race with daemon termination, which triggers IO_URING_F_CANCEL and
+ * accesses entries without checking the list state first.
+ */
+ list_move(&ent->list, &queue->ent_released);
+ ent->state = FRRS_RELEASED;
+ spin_unlock(&queue->lock);
+
+ if (cmd)
+ io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED);
+
+ if (req)
+ fuse_uring_stop_fuse_req_end(req);
+}
+
+static void fuse_uring_stop_list_entries(struct list_head *head,
+ struct fuse_ring_queue *queue,
+ enum fuse_ring_req_state exp_state)
+{
+ struct fuse_ring *ring = queue->ring;
+ struct fuse_ring_ent *ent, *next;
+ ssize_t queue_refs = SSIZE_MAX;
+ LIST_HEAD(to_teardown);
+
+ spin_lock(&queue->lock);
+ list_for_each_entry_safe(ent, next, head, list) {
+ if (ent->state != exp_state) {
+ pr_warn("entry teardown qid=%d state=%d expected=%d",
+ queue->qid, ent->state, exp_state);
+ continue;
+ }
+
+ ent->state = FRRS_TEARDOWN;
+ list_move(&ent->list, &to_teardown);
+ }
+ spin_unlock(&queue->lock);
+
+ /* no queue lock to avoid lock order issues */
+ list_for_each_entry_safe(ent, next, &to_teardown, list) {
+ fuse_uring_entry_teardown(ent);
+ queue_refs = atomic_dec_return(&ring->queue_refs);
+ WARN_ON_ONCE(queue_refs < 0);
+ }
+}
+
+static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue)
+{
+ fuse_uring_stop_list_entries(&queue->ent_in_userspace, queue,
+ FRRS_USERSPACE);
+ fuse_uring_stop_list_entries(&queue->ent_avail_queue, queue,
+ FRRS_AVAILABLE);
+}
+
+/*
+ * Log state debug info
+ */
+static void fuse_uring_log_ent_state(struct fuse_ring *ring)
+{
+ int qid;
+ struct fuse_ring_ent *ent;
+
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ struct fuse_ring_queue *queue = ring->queues[qid];
+
+ if (!queue)
+ continue;
+
+ spin_lock(&queue->lock);
+ /*
+ * Log entries from the intermediate queues; the other queues
+ * should be empty
+ */
+ list_for_each_entry(ent, &queue->ent_w_req_queue, list) {
+ pr_info(" ent-req-queue ring=%p qid=%d ent=%p state=%d\n",
+ ring, qid, ent, ent->state);
+ }
+ list_for_each_entry(ent, &queue->ent_commit_queue, list) {
+ pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n",
+ ring, qid, ent, ent->state);
+ }
+ spin_unlock(&queue->lock);
+ }
+ ring->stop_debug_log = 1;
+}
+
+static void fuse_uring_async_stop_queues(struct work_struct *work)
+{
+ int qid;
+ struct fuse_ring *ring =
+ container_of(work, struct fuse_ring, async_teardown_work.work);
+
+ /* XXX code dup */
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]);
+
+ if (!queue)
+ continue;
+
+ fuse_uring_teardown_entries(queue);
+ }
+
+ /*
+ * Some ring entries might be in the middle of IO operations,
+ * i.e. in the process of getting handled by file_operations::uring_cmd
+ * or on the way to userspace - we could handle that with conditions in
+ * the run-time code, but it is easier/cleaner to have an async teardown
+ * handler that reschedules itself while queue references are left.
+ */
+ if (atomic_read(&ring->queue_refs) > 0) {
+ if (time_after(jiffies,
+ ring->teardown_time + FUSE_URING_TEARDOWN_TIMEOUT))
+ fuse_uring_log_ent_state(ring);
+
+ schedule_delayed_work(&ring->async_teardown_work,
+ FUSE_URING_TEARDOWN_INTERVAL);
+ } else {
+ wake_up_all(&ring->stop_waitq);
+ }
+}
+
+/*
+ * Stop the ring queues
+ */
+void fuse_uring_stop_queues(struct fuse_ring *ring)
+{
+ int qid;
+
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]);
+
+ if (!queue)
+ continue;
+
+ fuse_uring_teardown_entries(queue);
+ }
+
+ if (atomic_read(&ring->queue_refs) > 0) {
+ ring->teardown_time = jiffies;
+ INIT_DELAYED_WORK(&ring->async_teardown_work,
+ fuse_uring_async_stop_queues);
+ schedule_delayed_work(&ring->async_teardown_work,
+ FUSE_URING_TEARDOWN_INTERVAL);
+ } else {
+ wake_up_all(&ring->stop_waitq);
+ }
+}
+
+/*
+ * Handle IO_URING_F_CANCEL, which typically comes on daemon termination.
+ *
+ * Releasing the last entry should trigger fuse_dev_release() if
+ * the daemon was terminated
+ */
+static void fuse_uring_cancel(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd);
+ struct fuse_ring_queue *queue;
+ bool need_cmd_done = false;
+
+ /*
+ * direct access on ent - it must not be destructed as long as
+ * IO_URING_F_CANCEL might come up
+ */
+ queue = ent->queue;
+ spin_lock(&queue->lock);
+ if (ent->state == FRRS_AVAILABLE) {
+ ent->state = FRRS_USERSPACE;
+ list_move(&ent->list, &queue->ent_in_userspace);
+ need_cmd_done = true;
+ ent->cmd = NULL;
+ }
+ spin_unlock(&queue->lock);
+
+ if (need_cmd_done) {
+ /* no queue lock to avoid lock order issues */
+ io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags);
+ }
+}
+
+static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags,
+ struct fuse_ring_ent *ring_ent)
+{
+ uring_cmd_set_ring_ent(cmd, ring_ent);
+ io_uring_cmd_mark_cancelable(cmd, issue_flags);
+}
+
+/*
+ * Checks for errors and stores them in the request
+ */
+static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
+ struct fuse_req *req,
+ struct fuse_conn *fc)
+{
+ int err;
+
+ err = -EINVAL;
+ if (oh->unique == 0) {
+ /* Not supported through io-uring yet */
+ pr_warn_once("notify through fuse-io-uring not supported\n");
+ goto err;
+ }
+
+ if (oh->error <= -ERESTARTSYS || oh->error > 0)
+ goto err;
+
+ if (oh->error) {
+ err = oh->error;
+ goto err;
+ }
+
+ err = -ENOENT;
+ if ((oh->unique & ~FUSE_INT_REQ_BIT) != req->in.h.unique) {
+ pr_warn_ratelimited("unique mismatch, expected: %llu got %llu\n",
+ req->in.h.unique,
+ oh->unique & ~FUSE_INT_REQ_BIT);
+ goto err;
+ }
+
+ /*
+ * Is it an interrupt reply ID?
+ * XXX: Not supported through fuse-io-uring yet, it should not even
+ * find the request - should not happen.
+ */
+ WARN_ON_ONCE(oh->unique & FUSE_INT_REQ_BIT);
+
+ err = 0;
+err:
+ return err;
+}
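
Put differently, a reply header written by the daemon passes these checks only if it looks roughly like the following (hypothetical values; oh->len is apparently not validated on this path, since headers and payload arrive separately):

	struct fuse_out_header oh = {
		.unique	= req_unique,	/* echoed from the fetched request,
					 * FUSE_INT_REQ_BIT must be clear */
		.error	= 0,		/* or a negative errno > -ERESTARTSYS */
	};
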
+
+static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
+ struct fuse_req *req,
+ struct fuse_ring_ent *ent)
+{
+ struct fuse_copy_state cs;
+ struct fuse_args *args = req->args;
+ struct iov_iter iter;
+ int err;
+ struct fuse_uring_ent_in_out ring_in_out;
+
+ err = copy_from_user(&ring_in_out, &ent->headers->ring_ent_in_out,
+ sizeof(ring_in_out));
+ if (err)
+ return -EFAULT;
+
+ err = import_ubuf(ITER_SOURCE, ent->payload, ring->max_payload_sz,
+ &iter);
+ if (err)
+ return err;
+
+ fuse_copy_init(&cs, 0, &iter);
+ cs.is_uring = 1;
+ cs.req = req;
+
+ return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz);
+}
+
+/*
+ * Copy data from the req to the ring buffer
+ */
+static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
+ struct fuse_ring_ent *ent)
+{
+ struct fuse_copy_state cs;
+ struct fuse_args *args = req->args;
+ struct fuse_in_arg *in_args = args->in_args;
+ int num_args = args->in_numargs;
+ int err;
+ struct iov_iter iter;
+ struct fuse_uring_ent_in_out ent_in_out = {
+ .flags = 0,
+ .commit_id = req->in.h.unique,
+ };
+
+ err = import_ubuf(ITER_DEST, ent->payload, ring->max_payload_sz, &iter);
+ if (err) {
+ pr_info_ratelimited("fuse: Import of user buffer failed\n");
+ return err;
+ }
+
+ fuse_copy_init(&cs, 1, &iter);
+ cs.is_uring = 1;
+ cs.req = req;
+
+ if (num_args > 0) {
+ /*
+ * Expectation is that the first argument is the per op header.
+ * Some opcodes have that as zero size.
+ */
+ if (args->in_args[0].size > 0) {
+ err = copy_to_user(&ent->headers->op_in, in_args->value,
+ in_args->size);
+ if (err) {
+ pr_info_ratelimited(
+ "Copying the header failed.\n");
+ return -EFAULT;
+ }
+ }
+ in_args++;
+ num_args--;
+ }
+
+ /* copy the payload */
+ err = fuse_copy_args(&cs, num_args, args->in_pages,
+ (struct fuse_arg *)in_args, 0);
+ if (err) {
+ pr_info_ratelimited("%s fuse_copy_args failed\n", __func__);
+ return err;
+ }
+
+ ent_in_out.payload_sz = cs.ring.copied_sz;
+ err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out,
+ sizeof(ent_in_out));
+ return err ? -EFAULT : 0;
+}
+
+static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent,
+ struct fuse_req *req)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+ struct fuse_ring *ring = queue->ring;
+ int err;
+
+ err = -EIO;
+ if (WARN_ON(ent->state != FRRS_FUSE_REQ)) {
+ pr_err("qid=%d ring-req=%p invalid state %d on send\n",
+ queue->qid, ent, ent->state);
+ return err;
+ }
+
+ err = -EINVAL;
+ if (WARN_ON(req->in.h.unique == 0))
+ return err;
+
+ /* copy the request */
+ err = fuse_uring_args_to_ring(ring, req, ent);
+ if (unlikely(err)) {
+ pr_info_ratelimited("Copy to ring failed: %d\n", err);
+ return err;
+ }
+
+ /* copy fuse_in_header */
+ err = copy_to_user(&ent->headers->in_out, &req->in.h,
+ sizeof(req->in.h));
+ if (err) {
+ err = -EFAULT;
+ return err;
+ }
+
+ return 0;
+}
+
+static int fuse_uring_prepare_send(struct fuse_ring_ent *ent,
+ struct fuse_req *req)
+{
+ int err;
+
+ err = fuse_uring_copy_to_ring(ent, req);
+ if (!err)
+ set_bit(FR_SENT, &req->flags);
+ else
+ fuse_uring_req_end(ent, req, err);
+
+ return err;
+}
+
+/*
+ * Write data to the ring buffer and send the request to userspace;
+ * userspace will read it.
+ * This is comparable with a classical read(/dev/fuse).
+ */
+static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent,
+ struct fuse_req *req,
+ unsigned int issue_flags)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+ int err;
+ struct io_uring_cmd *cmd;
+
+ err = fuse_uring_prepare_send(ent, req);
+ if (err)
+ return err;
+
+ spin_lock(&queue->lock);
+ cmd = ent->cmd;
+ ent->cmd = NULL;
+ ent->state = FRRS_USERSPACE;
+ list_move(&ent->list, &queue->ent_in_userspace);
+ spin_unlock(&queue->lock);
+
+ io_uring_cmd_done(cmd, 0, 0, issue_flags);
+ return 0;
+}
+
+/*
+ * Make a ring entry available for fuse_req assignment
+ */
+static void fuse_uring_ent_avail(struct fuse_ring_ent *ent,
+ struct fuse_ring_queue *queue)
+{
+ WARN_ON_ONCE(!ent->cmd);
+ list_move(&ent->list, &queue->ent_avail_queue);
+ ent->state = FRRS_AVAILABLE;
+}
+
+/* Used to find the request on SQE commit */
+static void fuse_uring_add_to_pq(struct fuse_ring_ent *ent,
+ struct fuse_req *req)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+ struct fuse_pqueue *fpq = &queue->fpq;
+ unsigned int hash;
+
+ req->ring_entry = ent;
+ hash = fuse_req_hash(req->in.h.unique);
+ list_move_tail(&req->list, &fpq->processing[hash]);
+}
+
+/*
+ * Assign a fuse request to the given ring entry
+ */
+static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent,
+ struct fuse_req *req)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+ struct fuse_conn *fc = req->fm->fc;
+ struct fuse_iqueue *fiq = &fc->iq;
+
+ lockdep_assert_held(&queue->lock);
+
+ if (WARN_ON_ONCE(ent->state != FRRS_AVAILABLE &&
+ ent->state != FRRS_COMMIT)) {
+ pr_warn("%s qid=%d state=%d\n", __func__, ent->queue->qid,
+ ent->state);
+ }
+
+ spin_lock(&fiq->lock);
+ clear_bit(FR_PENDING, &req->flags);
+ spin_unlock(&fiq->lock);
+ ent->fuse_req = req;
+ ent->state = FRRS_FUSE_REQ;
+ list_move(&ent->list, &queue->ent_w_req_queue);
+ fuse_uring_add_to_pq(ent, req);
+}
+
+/* Fetch the next fuse request if available */
+static struct fuse_req *fuse_uring_ent_assign_req(struct fuse_ring_ent *ent)
+ __must_hold(&queue->lock)
+{
+ struct fuse_req *req;
+ struct fuse_ring_queue *queue = ent->queue;
+ struct list_head *req_queue = &queue->fuse_req_queue;
+
+ lockdep_assert_held(&queue->lock);
+
+ /* get and assign the next request while still holding the lock */
+ req = list_first_entry_or_null(req_queue, struct fuse_req, list);
+ if (req)
+ fuse_uring_add_req_to_ring_ent(ent, req);
+
+ return req;
+}
+
+/*
+ * Read data from the ring buffer, which user space has written to.
+ * This is comparable with the handling of a classical write(/dev/fuse).
+ * Also make the ring request available again for new fuse requests.
+ */
+static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req,
+ unsigned int issue_flags)
+{
+ struct fuse_ring *ring = ent->queue->ring;
+ struct fuse_conn *fc = ring->fc;
+ ssize_t err = 0;
+
+ err = copy_from_user(&req->out.h, &ent->headers->in_out,
+ sizeof(req->out.h));
+ if (err) {
+ req->out.h.error = -EFAULT;
+ goto out;
+ }
+
+ err = fuse_uring_out_header_has_err(&req->out.h, req, fc);
+ if (err) {
+ /* req->out.h.error already set */
+ goto out;
+ }
+
+ err = fuse_uring_copy_from_ring(ring, req, ent);
+out:
+ fuse_uring_req_end(ent, req, err);
+}
+
+/*
+ * Get the next fuse req and send it
+ */
+static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ent,
+ struct fuse_ring_queue *queue,
+ unsigned int issue_flags)
+{
+ int err;
+ struct fuse_req *req;
+
+retry:
+ spin_lock(&queue->lock);
+ fuse_uring_ent_avail(ent, queue);
+ req = fuse_uring_ent_assign_req(ent);
+ spin_unlock(&queue->lock);
+
+ if (req) {
+ err = fuse_uring_send_next_to_ring(ent, req, issue_flags);
+ if (err)
+ goto retry;
+ }
+}
+
+static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+
+ lockdep_assert_held(&queue->lock);
+
+ if (WARN_ON_ONCE(ent->state != FRRS_USERSPACE))
+ return -EIO;
+
+ ent->state = FRRS_COMMIT;
+ list_move(&ent->list, &queue->ent_commit_queue);
+
+ return 0;
+}
+
+/* FUSE_URING_CMD_COMMIT_AND_FETCH handler */
+static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
+ struct fuse_conn *fc)
+{
+ const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe);
+ struct fuse_ring_ent *ent;
+ int err;
+ struct fuse_ring *ring = fc->ring;
+ struct fuse_ring_queue *queue;
+ uint64_t commit_id = READ_ONCE(cmd_req->commit_id);
+ unsigned int qid = READ_ONCE(cmd_req->qid);
+ struct fuse_pqueue *fpq;
+ struct fuse_req *req;
+
+ err = -ENOTCONN;
+ if (!ring)
+ return err;
+
+ if (qid >= ring->nr_queues)
+ return -EINVAL;
+
+ queue = ring->queues[qid];
+ if (!queue)
+ return err;
+ fpq = &queue->fpq;
+
+ if (!READ_ONCE(fc->connected) || READ_ONCE(queue->stopped))
+ return err;
+
+ spin_lock(&queue->lock);
+ /*
+ * Find a request based on the unique ID of the fuse request.
+ * This should get revised, as it needs a hash calculation and a list
+ * search; the full struct fuse_pqueue (memory overhead) and the link
+ * from req to ring_ent are also needed.
+ */
+ req = fuse_request_find(fpq, commit_id);
+ err = -ENOENT;
+ if (!req) {
+ pr_info("qid=%d commit_id %llu not found\n", queue->qid,
+ commit_id);
+ spin_unlock(&queue->lock);
+ return err;
+ }
+ list_del_init(&req->list);
+ ent = req->ring_entry;
+ req->ring_entry = NULL;
+
+ err = fuse_ring_ent_set_commit(ent);
+ if (err != 0) {
+ pr_info_ratelimited("qid=%d commit_id %llu state %d",
+ queue->qid, commit_id, ent->state);
+ spin_unlock(&queue->lock);
+ req->out.h.error = err;
+ clear_bit(FR_SENT, &req->flags);
+ fuse_request_end(req);
+ return err;
+ }
+
+ ent->cmd = cmd;
+ spin_unlock(&queue->lock);
+
+ /* without the queue lock, as other locks are taken */
+ fuse_uring_prepare_cancel(cmd, issue_flags, ent);
+ fuse_uring_commit(ent, req, issue_flags);
+
+ /*
+ * Fetching the next request is absolutely required as queued
+ * fuse requests would otherwise not get processed - committing
+ * and fetching is done in one step vs legacy fuse, which has separated
+ * read (fetch request) and write (commit result).
+ */
+ fuse_uring_next_fuse_req(ent, queue, issue_flags);
+ return 0;
+}
+
+static bool is_ring_ready(struct fuse_ring *ring, int current_qid)
+{
+ int qid;
+ struct fuse_ring_queue *queue;
+ bool ready = true;
+
+ for (qid = 0; qid < ring->nr_queues && ready; qid++) {
+ if (current_qid == qid)
+ continue;
+
+ queue = ring->queues[qid];
+ if (!queue) {
+ ready = false;
+ break;
+ }
+
+ spin_lock(&queue->lock);
+ if (list_empty(&queue->ent_avail_queue))
+ ready = false;
+ spin_unlock(&queue->lock);
+ }
+
+ return ready;
+}
+
+/*
+ * FUSE_IO_URING_CMD_REGISTER (fuse_uring_req_fetch) command handling
+ */
+static void fuse_uring_do_register(struct fuse_ring_ent *ent,
+ struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+ struct fuse_ring *ring = queue->ring;
+ struct fuse_conn *fc = ring->fc;
+ struct fuse_iqueue *fiq = &fc->iq;
+
+ fuse_uring_prepare_cancel(cmd, issue_flags, ent);
+
+ spin_lock(&queue->lock);
+ ent->cmd = cmd;
+ fuse_uring_ent_avail(ent, queue);
+ spin_unlock(&queue->lock);
+
+ if (!ring->ready) {
+ bool ready = is_ring_ready(ring, queue->qid);
+
+ if (ready) {
+ WRITE_ONCE(fiq->ops, &fuse_io_uring_ops);
+ WRITE_ONCE(ring->ready, true);
+ wake_up_all(&fc->blocked_waitq);
+ }
+ }
+}
+
+/*
+ * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1]
+ * the payload
+ */
+static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe,
+ struct iovec iov[FUSE_URING_IOV_SEGS])
+{
+ struct iovec __user *uiov = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ struct iov_iter iter;
+ ssize_t ret;
+
+ if (sqe->len != FUSE_URING_IOV_SEGS)
+ return -EINVAL;
+
+ /*
+	 * The direction of buffer access will actually be both READ and
+	 * WRITE; importing with WRITE implies READ access as well.
+ */
+ ret = import_iovec(WRITE, uiov, FUSE_URING_IOV_SEGS,
+ FUSE_URING_IOV_SEGS, &iov, &iter);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
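
For illustration, the userspace side of this two-segment contract might look
as follows (a minimal sketch; the buffer names and sizes are hypothetical,
and the payload buffer must be at least the ring's max_payload_sz):

    struct fuse_uring_req_header hdr;
    static char payload[1024 * 1024];       /* >= ring->max_payload_sz */
    struct iovec iov[2] = {
        { .iov_base = &hdr,    .iov_len = sizeof(hdr) },     /* iov[0]: headers */
        { .iov_base = payload, .iov_len = sizeof(payload) }, /* iov[1]: payload */
    };

    sqe->addr = (uint64_t)(uintptr_t)iov;   /* kernel imports both segments */
    sqe->len  = 2;                          /* FUSE_URING_IOV_SEGS */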
+
+static struct fuse_ring_ent *
+fuse_uring_create_ring_ent(struct io_uring_cmd *cmd,
+ struct fuse_ring_queue *queue)
+{
+ struct fuse_ring *ring = queue->ring;
+ struct fuse_ring_ent *ent;
+ size_t payload_size;
+ struct iovec iov[FUSE_URING_IOV_SEGS];
+ int err;
+
+ err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov);
+ if (err) {
+ pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n",
+ err);
+ return ERR_PTR(err);
+ }
+
+ err = -EINVAL;
+ if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) {
+ pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len);
+ return ERR_PTR(err);
+ }
+
+ payload_size = iov[1].iov_len;
+ if (payload_size < ring->max_payload_sz) {
+ pr_info_ratelimited("Invalid req payload len %zu\n",
+ payload_size);
+ return ERR_PTR(err);
+ }
+
+ err = -ENOMEM;
+ ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT);
+ if (!ent)
+ return ERR_PTR(err);
+
+ INIT_LIST_HEAD(&ent->list);
+
+ ent->queue = queue;
+ ent->headers = iov[0].iov_base;
+ ent->payload = iov[1].iov_base;
+
+ atomic_inc(&ring->queue_refs);
+ return ent;
+}
+
+/*
+ * Register the header and payload buffers with the kernel and put the
+ * entry on the queue as "ready to get fuse requests"
+ */
+static int fuse_uring_register(struct io_uring_cmd *cmd,
+ unsigned int issue_flags, struct fuse_conn *fc)
+{
+ const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe);
+ struct fuse_ring *ring = fc->ring;
+ struct fuse_ring_queue *queue;
+ struct fuse_ring_ent *ent;
+ int err;
+ unsigned int qid = READ_ONCE(cmd_req->qid);
+
+ err = -ENOMEM;
+ if (!ring) {
+ ring = fuse_uring_create(fc);
+ if (!ring)
+ return err;
+ }
+
+ if (qid >= ring->nr_queues) {
+ pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid);
+ return -EINVAL;
+ }
+
+ queue = ring->queues[qid];
+ if (!queue) {
+ queue = fuse_uring_create_queue(ring, qid);
+ if (!queue)
+ return err;
+ }
+
+ /*
+	 * The queue created above does not need to be destroyed if entry
+	 * creation fails below; that is done at ring destruction time.
+ */
+
+ ent = fuse_uring_create_ring_ent(cmd, queue);
+ if (IS_ERR(ent))
+ return PTR_ERR(ent);
+
+ fuse_uring_do_register(ent, cmd, issue_flags);
+
+ return 0;
+}
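
Registration is per queue, and is_ring_ready() above only flips ring->ready
once every queue has at least one available entry. A hedged sketch of the
implied server-side startup (register_queue_entry() is a hypothetical helper
wrapping the SQE setup shown in the fuller loop sketch after
fuse_io_uring_ops below):

    /* one (or more) REGISTER commands per CPU queue */
    for (unsigned int qid = 0; qid < nr_cpus; qid++)
        register_queue_entry(uring, fuse_fd, qid);
    /* only after all queues are populated does the kernel start
     * routing requests through fuse_io_uring_ops */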
+
+/*
+ * Entry function from io_uring to handle the given passthrough command
+ * (op code IORING_OP_URING_CMD)
+ */
+int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ struct fuse_dev *fud;
+ struct fuse_conn *fc;
+ u32 cmd_op = cmd->cmd_op;
+ int err;
+
+	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
+ fuse_uring_cancel(cmd, issue_flags);
+ return 0;
+ }
+
+ /* This extra SQE size holds struct fuse_uring_cmd_req */
+ if (!(issue_flags & IO_URING_F_SQE128))
+ return -EINVAL;
+
+ fud = fuse_get_dev(cmd->file);
+ if (!fud) {
+ pr_info_ratelimited("No fuse device found\n");
+ return -ENOTCONN;
+ }
+ fc = fud->fc;
+
+ /* Once a connection has io-uring enabled on it, it can't be disabled */
+ if (!enable_uring && !fc->io_uring) {
+ pr_info_ratelimited("fuse-io-uring is disabled\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (fc->aborted)
+ return -ECONNABORTED;
+ if (!fc->connected)
+ return -ENOTCONN;
+
+ /*
+ * fuse_uring_register() needs the ring to be initialized,
+ * we need to know the max payload size
+ */
+ if (!fc->initialized)
+ return -EAGAIN;
+
+ switch (cmd_op) {
+ case FUSE_IO_URING_CMD_REGISTER:
+ err = fuse_uring_register(cmd, issue_flags, fc);
+ if (err) {
+ pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n",
+ err);
+ fc->io_uring = 0;
+ wake_up_all(&fc->blocked_waitq);
+ return err;
+ }
+ break;
+ case FUSE_IO_URING_CMD_COMMIT_AND_FETCH:
+ err = fuse_uring_commit_fetch(cmd, issue_flags, fc);
+ if (err) {
+ pr_info_once("FUSE_IO_URING_COMMIT_AND_FETCH failed err=%d\n",
+ err);
+ return err;
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return -EIOCBQUEUED;
+}
+
+static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd,
+ ssize_t ret, unsigned int issue_flags)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+
+ spin_lock(&queue->lock);
+ ent->state = FRRS_USERSPACE;
+ list_move(&ent->list, &queue->ent_in_userspace);
+ ent->cmd = NULL;
+ spin_unlock(&queue->lock);
+
+ io_uring_cmd_done(cmd, ret, 0, issue_flags);
+}
+
+/*
+ * This prepares and sends the ring request in fuse-uring task context.
+ * User buffers are not mapped yet - the application does not have permission
+ * to write to them - this has to be executed in ring task context.
+ */
+static void fuse_uring_send_in_task(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd);
+ struct fuse_ring_queue *queue = ent->queue;
+ int err;
+
+ if (!(issue_flags & IO_URING_F_TASK_DEAD)) {
+ err = fuse_uring_prepare_send(ent, ent->fuse_req);
+ if (err) {
+ fuse_uring_next_fuse_req(ent, queue, issue_flags);
+ return;
+ }
+ } else {
+ err = -ECANCELED;
+ }
+
+ fuse_uring_send(ent, cmd, err, issue_flags);
+}
+
+static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring)
+{
+ unsigned int qid;
+ struct fuse_ring_queue *queue;
+
+ qid = task_cpu(current);
+
+ if (WARN_ONCE(qid >= ring->nr_queues,
+ "Core number (%u) exceeds nr queues (%zu)\n", qid,
+ ring->nr_queues))
+ qid = 0;
+
+ queue = ring->queues[qid];
+ WARN_ONCE(!queue, "Missing queue for qid %d\n", qid);
+
+ return queue;
+}
+
+static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent)
+{
+ struct io_uring_cmd *cmd = ent->cmd;
+
+ uring_cmd_set_ring_ent(cmd, ent);
+ io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task);
+}
+
+/* queue a fuse request and send it if a ring entry is available */
+void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
+{
+ struct fuse_conn *fc = req->fm->fc;
+ struct fuse_ring *ring = fc->ring;
+ struct fuse_ring_queue *queue;
+ struct fuse_ring_ent *ent = NULL;
+ int err;
+
+ err = -EINVAL;
+ queue = fuse_uring_task_to_queue(ring);
+ if (!queue)
+ goto err;
+
+ if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
+ req->in.h.unique = fuse_get_unique(fiq);
+
+ spin_lock(&queue->lock);
+ err = -ENOTCONN;
+ if (unlikely(queue->stopped))
+ goto err_unlock;
+
+ ent = list_first_entry_or_null(&queue->ent_avail_queue,
+ struct fuse_ring_ent, list);
+ if (ent)
+ fuse_uring_add_req_to_ring_ent(ent, req);
+ else
+ list_add_tail(&req->list, &queue->fuse_req_queue);
+ spin_unlock(&queue->lock);
+
+ if (ent)
+ fuse_uring_dispatch_ent(ent);
+
+ return;
+
+err_unlock:
+ spin_unlock(&queue->lock);
+err:
+ req->out.h.error = err;
+ clear_bit(FR_PENDING, &req->flags);
+ fuse_request_end(req);
+}
+
+bool fuse_uring_queue_bq_req(struct fuse_req *req)
+{
+ struct fuse_conn *fc = req->fm->fc;
+ struct fuse_ring *ring = fc->ring;
+ struct fuse_ring_queue *queue;
+ struct fuse_ring_ent *ent = NULL;
+
+ queue = fuse_uring_task_to_queue(ring);
+ if (!queue)
+ return false;
+
+ spin_lock(&queue->lock);
+ if (unlikely(queue->stopped)) {
+ spin_unlock(&queue->lock);
+ return false;
+ }
+
+ list_add_tail(&req->list, &queue->fuse_req_bg_queue);
+
+ ent = list_first_entry_or_null(&queue->ent_avail_queue,
+ struct fuse_ring_ent, list);
+ spin_lock(&fc->bg_lock);
+ fc->num_background++;
+ if (fc->num_background == fc->max_background)
+ fc->blocked = 1;
+ fuse_uring_flush_bg(queue);
+ spin_unlock(&fc->bg_lock);
+
+ /*
+	 * Due to bg_queue flush limits, there might be other bg requests
+	 * in the queue that need to be handled first, or no further req
+	 * might be available.
+ */
+ req = list_first_entry_or_null(&queue->fuse_req_queue, struct fuse_req,
+ list);
+ if (ent && req) {
+ fuse_uring_add_req_to_ring_ent(ent, req);
+ spin_unlock(&queue->lock);
+
+ fuse_uring_dispatch_ent(ent);
+ } else {
+ spin_unlock(&queue->lock);
+ }
+
+ return true;
+}
+
+static const struct fuse_iqueue_ops fuse_io_uring_ops = {
+	/* should be sent over io-uring as an enhancement */
+ .send_forget = fuse_dev_queue_forget,
+
+ /*
+	 * could be sent over io-uring, but interrupts should be rare,
+	 * so there is no need to make the code complex
+ */
+ .send_interrupt = fuse_dev_queue_interrupt,
+ .send_req = fuse_uring_queue_fuse_req,
+};
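
To make the command flow above concrete, here is a minimal, hypothetical
userspace queue loop (not the libfuse implementation). It assumes liburing,
an SQE128 ring, and the fuse_uring_cmd_req / fuse_uring_req_header layouts
from <linux/fuse.h>; process_request() is a made-up helper and all error
handling (including checking cqe->res) is elided:

    #include <liburing.h>
    #include <linux/fuse.h>
    #include <stdint.h>
    #include <string.h>
    #include <sys/uio.h>

    /* Hypothetical: parse hdr/payload, build the reply in place, and
     * return the commit_id echoed from the fetched request header. */
    extern uint64_t process_request(struct fuse_uring_req_header *hdr,
                                    void *payload);

    static void prep_fuse_cmd(struct io_uring_sqe *sqe, int fuse_fd,
                              uint32_t cmd_op, unsigned int qid,
                              uint64_t commit_id)
    {
        memset(sqe, 0, 2 * sizeof(*sqe));   /* SQE128: 128-byte entries */
        sqe->opcode = IORING_OP_URING_CMD;
        sqe->fd = fuse_fd;
        sqe->cmd_op = cmd_op;

        struct fuse_uring_cmd_req *req = (void *)sqe->cmd;
        req->qid = qid;
        req->commit_id = commit_id;
    }

    static void queue_loop(struct io_uring *uring, int fuse_fd,
                           unsigned int qid,
                           struct fuse_uring_req_header *hdr,
                           void *payload, size_t payload_sz)
    {
        struct iovec iov[2] = { { hdr, sizeof(*hdr) },
                                { payload, payload_sz } };
        struct io_uring_sqe *sqe = io_uring_get_sqe(uring);

        /* 1) REGISTER: hand this entry's buffers to the kernel */
        prep_fuse_cmd(sqe, fuse_fd, FUSE_IO_URING_CMD_REGISTER, qid, 0);
        sqe->addr = (uint64_t)(uintptr_t)iov;
        sqe->len = 2;                       /* FUSE_URING_IOV_SEGS */
        io_uring_submit(uring);

        for (;;) {
            struct io_uring_cqe *cqe;

            /* a completion means a fuse request now sits in hdr/payload */
            io_uring_wait_cqe(uring, &cqe);
            io_uring_cqe_seen(uring, cqe);

            uint64_t commit_id = process_request(hdr, payload);

            /* 2) COMMIT_AND_FETCH: return the result, rearm the entry */
            sqe = io_uring_get_sqe(uring);
            prep_fuse_cmd(sqe, fuse_fd,
                          FUSE_IO_URING_CMD_COMMIT_AND_FETCH,
                          qid, commit_id);
            io_uring_submit(uring);
        }
    }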
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
new file mode 100644
index 000000000000..2102b3d0c1ae
--- /dev/null
+++ b/fs/fuse/dev_uring_i.h
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * FUSE: Filesystem in Userspace
+ * Copyright (c) 2023-2024 DataDirect Networks.
+ */
+
+#ifndef _FS_FUSE_DEV_URING_I_H
+#define _FS_FUSE_DEV_URING_I_H
+
+#include "fuse_i.h"
+
+#ifdef CONFIG_FUSE_IO_URING
+
+#define FUSE_URING_TEARDOWN_TIMEOUT (5 * HZ)
+#define FUSE_URING_TEARDOWN_INTERVAL (HZ/20)
+
+enum fuse_ring_req_state {
+ FRRS_INVALID = 0,
+
+ /* The ring entry received from userspace and it is being processed */
+ FRRS_COMMIT,
+
+ /* The ring entry is waiting for new fuse requests */
+ FRRS_AVAILABLE,
+
+ /* The ring entry got assigned a fuse req */
+ FRRS_FUSE_REQ,
+
+ /* The ring entry is in or on the way to user space */
+ FRRS_USERSPACE,
+
+ /* The ring entry is in teardown */
+ FRRS_TEARDOWN,
+
+ /* The ring entry is released, but not freed yet */
+ FRRS_RELEASED,
+};
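
Pieced together from the handlers in dev_uring.c above, the normal life
cycle of an entry is the following loop (a non-normative sketch; teardown
can interrupt from any state):

    FRRS_AVAILABLE --assign req--> FRRS_FUSE_REQ --send--> FRRS_USERSPACE
          ^                                                      |
          |                                             commit-and-fetch
          +------------------- FRRS_COMMIT <---------------------+

    any state --teardown--> FRRS_TEARDOWN --> FRRS_RELEASED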
+
+/** A fuse ring entry, part of the ring queue */
+struct fuse_ring_ent {
+ /* userspace buffer */
+ struct fuse_uring_req_header __user *headers;
+ void __user *payload;
+
+ /* the ring queue that owns the request */
+ struct fuse_ring_queue *queue;
+
+ /* fields below are protected by queue->lock */
+
+ struct io_uring_cmd *cmd;
+
+ struct list_head list;
+
+ enum fuse_ring_req_state state;
+
+ struct fuse_req *fuse_req;
+};
+
+struct fuse_ring_queue {
+ /*
+ * back pointer to the main fuse uring structure that holds this
+ * queue
+ */
+ struct fuse_ring *ring;
+
+ /* queue id, corresponds to the cpu core */
+ unsigned int qid;
+
+ /*
+	 * queue lock, taken when any value in the queue changes _and_ also
+	 * when a ring entry state changes.
+ */
+ spinlock_t lock;
+
+ /* available ring entries (struct fuse_ring_ent) */
+ struct list_head ent_avail_queue;
+
+ /*
+ * entries in the process of being committed or in the process
+ * to be sent to userspace
+ */
+ struct list_head ent_w_req_queue;
+ struct list_head ent_commit_queue;
+
+ /* entries in userspace */
+ struct list_head ent_in_userspace;
+
+ /* entries that are released */
+ struct list_head ent_released;
+
+ /* fuse requests waiting for an entry slot */
+ struct list_head fuse_req_queue;
+
+ /* background fuse requests */
+ struct list_head fuse_req_bg_queue;
+
+ struct fuse_pqueue fpq;
+
+ unsigned int active_background;
+
+ bool stopped;
+};
+
+/**
+ * Describes whether io-uring is used for communication and holds all
+ * the data needed for io-uring communication
+ */
+struct fuse_ring {
+ /* back pointer */
+ struct fuse_conn *fc;
+
+ /* number of ring queues */
+ size_t nr_queues;
+
+ /* maximum payload/arg size */
+ size_t max_payload_sz;
+
+ struct fuse_ring_queue **queues;
+
+ /*
+ * Log ring entry states on stop when entries cannot be released
+ */
+ unsigned int stop_debug_log : 1;
+
+ wait_queue_head_t stop_waitq;
+
+ /* async tear down */
+ struct delayed_work async_teardown_work;
+
+ /* log */
+ unsigned long teardown_time;
+
+ atomic_t queue_refs;
+
+ bool ready;
+};
+
+bool fuse_uring_enabled(void);
+void fuse_uring_destruct(struct fuse_conn *fc);
+void fuse_uring_stop_queues(struct fuse_ring *ring);
+void fuse_uring_abort_end_requests(struct fuse_ring *ring);
+int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
+void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req);
+bool fuse_uring_queue_bq_req(struct fuse_req *req);
+
+static inline void fuse_uring_abort(struct fuse_conn *fc)
+{
+ struct fuse_ring *ring = fc->ring;
+
+ if (ring == NULL)
+ return;
+
+ if (atomic_read(&ring->queue_refs) > 0) {
+ fuse_uring_abort_end_requests(ring);
+ fuse_uring_stop_queues(ring);
+ }
+}
+
+static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc)
+{
+ struct fuse_ring *ring = fc->ring;
+
+ if (ring)
+ wait_event(ring->stop_waitq,
+ atomic_read(&ring->queue_refs) == 0);
+}
+
+static inline bool fuse_uring_ready(struct fuse_conn *fc)
+{
+ return fc->ring && fc->ring->ready;
+}
+
+#else /* CONFIG_FUSE_IO_URING */
+
+struct fuse_ring;
+
+static inline void fuse_uring_create(struct fuse_conn *fc)
+{
+}
+
+static inline void fuse_uring_destruct(struct fuse_conn *fc)
+{
+}
+
+static inline bool fuse_uring_enabled(void)
+{
+ return false;
+}
+
+static inline void fuse_uring_abort(struct fuse_conn *fc)
+{
+}
+
+static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc)
+{
+}
+
+static inline bool fuse_uring_ready(struct fuse_conn *fc)
+{
+ return false;
+}
+
+#endif /* CONFIG_FUSE_IO_URING */
+
+#endif /* _FS_FUSE_DEV_URING_I_H */
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index bf057cf7098d..be693a8a1010 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -175,9 +175,10 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args,
memset(outarg, 0, sizeof(struct fuse_entry_out));
args->opcode = FUSE_LOOKUP;
args->nodeid = nodeid;
- args->in_numargs = 1;
- args->in_args[0].size = name->len + 1;
- args->in_args[0].value = name->name;
+ args->in_numargs = 2;
+ fuse_set_zero_arg0(args);
+ args->in_args[1].size = name->len + 1;
+ args->in_args[1].value = name->name;
args->out_numargs = 1;
args->out_args[0].size = sizeof(struct fuse_entry_out);
args->out_args[0].value = outarg;
@@ -929,11 +930,12 @@ static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir,
FUSE_ARGS(args);
args.opcode = FUSE_SYMLINK;
- args.in_numargs = 2;
- args.in_args[0].size = entry->d_name.len + 1;
- args.in_args[0].value = entry->d_name.name;
- args.in_args[1].size = len;
- args.in_args[1].value = link;
+ args.in_numargs = 3;
+ fuse_set_zero_arg0(&args);
+ args.in_args[1].size = entry->d_name.len + 1;
+ args.in_args[1].value = entry->d_name.name;
+ args.in_args[2].size = len;
+ args.in_args[2].value = link;
return create_new_entry(idmap, fm, &args, dir, entry, S_IFLNK);
}
@@ -993,9 +995,10 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
args.opcode = FUSE_UNLINK;
args.nodeid = get_node_id(dir);
- args.in_numargs = 1;
- args.in_args[0].size = entry->d_name.len + 1;
- args.in_args[0].value = entry->d_name.name;
+ args.in_numargs = 2;
+ fuse_set_zero_arg0(&args);
+ args.in_args[1].size = entry->d_name.len + 1;
+ args.in_args[1].value = entry->d_name.name;
err = fuse_simple_request(fm, &args);
if (!err) {
fuse_dir_changed(dir);
@@ -1016,9 +1019,10 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
args.opcode = FUSE_RMDIR;
args.nodeid = get_node_id(dir);
- args.in_numargs = 1;
- args.in_args[0].size = entry->d_name.len + 1;
- args.in_args[0].value = entry->d_name.name;
+ args.in_numargs = 2;
+ fuse_set_zero_arg0(&args);
+ args.in_args[1].size = entry->d_name.len + 1;
+ args.in_args[1].value = entry->d_name.name;
err = fuse_simple_request(fm, &args);
if (!err) {
fuse_dir_changed(dir);
diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h
new file mode 100644
index 000000000000..3b2bfe1248d3
--- /dev/null
+++ b/fs/fuse/fuse_dev_i.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
+ */
+#ifndef _FS_FUSE_DEV_I_H
+#define _FS_FUSE_DEV_I_H
+
+#include <linux/types.h>
+
+/* Ordinary requests have even IDs, while interrupt IDs are odd */
+#define FUSE_INT_REQ_BIT (1ULL << 0)
+#define FUSE_REQ_ID_STEP (1ULL << 1)
+
+struct fuse_arg;
+struct fuse_args;
+struct fuse_pqueue;
+struct fuse_req;
+struct fuse_iqueue;
+struct fuse_forget_link;
+
+struct fuse_copy_state {
+ int write;
+ struct fuse_req *req;
+ struct iov_iter *iter;
+ struct pipe_buffer *pipebufs;
+ struct pipe_buffer *currbuf;
+ struct pipe_inode_info *pipe;
+ unsigned long nr_segs;
+ struct page *pg;
+ unsigned int len;
+ unsigned int offset;
+ unsigned int move_pages:1;
+ unsigned int is_uring:1;
+ struct {
+ unsigned int copied_sz; /* copied size into the user buffer */
+ } ring;
+};
+
+static inline struct fuse_dev *fuse_get_dev(struct file *file)
+{
+ /*
+	 * Lockless access is OK, because file->private_data is set
+ * once during mount and is valid until the file is released.
+ */
+ return READ_ONCE(file->private_data);
+}
+
+unsigned int fuse_req_hash(u64 unique);
+struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique);
+
+void fuse_dev_end_requests(struct list_head *head);
+
+void fuse_copy_init(struct fuse_copy_state *cs, int write,
+ struct iov_iter *iter);
+int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs,
+ unsigned int argpages, struct fuse_arg *args,
+ int zeroing);
+int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
+ unsigned int nbytes);
+void fuse_dev_queue_forget(struct fuse_iqueue *fiq,
+ struct fuse_forget_link *forget);
+void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req);
+
+#endif
+
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 74744c6f2860..fee96fe7887b 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -310,7 +310,7 @@ struct fuse_args {
bool is_ext:1;
bool is_pinned:1;
bool invalidate_vmap:1;
- struct fuse_in_arg in_args[3];
+ struct fuse_in_arg in_args[4];
struct fuse_arg out_args[2];
void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error);
/* Used for kvec iter backed by vmalloc address */
@@ -438,6 +438,10 @@ struct fuse_req {
/** fuse_mount this request belongs to */
struct fuse_mount *fm;
+
+#ifdef CONFIG_FUSE_IO_URING
+ void *ring_entry;
+#endif
};
struct fuse_iqueue;
@@ -863,6 +867,9 @@ struct fuse_conn {
/* Use pages instead of pointer for kernel I/O */
unsigned int use_pages_for_kvec_io:1;
+ /* Use io_uring for communication */
+ unsigned int io_uring;
+
/** Maximum stack depth for passthrough backing files */
int max_stack_depth;
@@ -923,6 +930,11 @@ struct fuse_conn {
/** IDR for backing files ids */
struct idr backing_files_map;
#endif
+
+#ifdef CONFIG_FUSE_IO_URING
+	/** uring connection information */
+ struct fuse_ring *ring;
+#endif
};
/*
@@ -947,6 +959,19 @@ struct fuse_mount {
struct rcu_head rcu;
};
+/*
+ * Empty header for FUSE opcodes without specific header needs.
+ * Used as a placeholder in args->in_args[0] for consistency
+ * across all FUSE operations, simplifying request handling.
+ */
+struct fuse_zero_header {};
+
+static inline void fuse_set_zero_arg0(struct fuse_args *args)
+{
+ args->in_args[0].size = sizeof(struct fuse_zero_header);
+ args->in_args[0].value = NULL;
+}
+
static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb)
{
return sb->s_fs_info;
@@ -1220,6 +1245,11 @@ void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o);
struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
/**
+ * Initialize the fuse processing queue
+ */
+void fuse_pqueue_init(struct fuse_pqueue *fpq);
+
+/**
* Initialize fuse_conn
*/
void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3ce4f4e81d09..e9db2cb8c150 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -7,6 +7,7 @@
*/
#include "fuse_i.h"
+#include "dev_uring_i.h"
#include <linux/pagemap.h>
#include <linux/slab.h>
@@ -937,7 +938,7 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq,
fiq->priv = priv;
}
-static void fuse_pqueue_init(struct fuse_pqueue *fpq)
+void fuse_pqueue_init(struct fuse_pqueue *fpq)
{
unsigned int i;
@@ -992,6 +993,8 @@ static void delayed_release(struct rcu_head *p)
{
struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu);
+ fuse_uring_destruct(fc);
+
put_user_ns(fc->user_ns);
fc->release(fc);
}
@@ -1387,6 +1390,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
else
ok = false;
}
+ if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
+ fc->io_uring = 1;
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
@@ -1446,6 +1451,13 @@ void fuse_send_init(struct fuse_mount *fm)
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
flags |= FUSE_PASSTHROUGH;
+ /*
+ * This is just an information flag for fuse server. No need to check
+ * the reply - server is either sending IORING_OP_URING_CMD or not.
+ */
+ if (fuse_uring_enabled())
+ flags |= FUSE_OVER_IO_URING;
+
ia->in.flags = flags;
ia->in.flags2 = flags >> 32;
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 9f568d345c51..93dfb06b6cea 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -164,9 +164,10 @@ int fuse_removexattr(struct inode *inode, const char *name)
args.opcode = FUSE_REMOVEXATTR;
args.nodeid = get_node_id(inode);
- args.in_numargs = 1;
- args.in_args[0].size = strlen(name) + 1;
- args.in_args[0].value = name;
+ args.in_numargs = 2;
+ fuse_set_zero_arg0(&args);
+ args.in_args[1].size = strlen(name) + 1;
+ args.in_args[1].value = name;
err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
fm->fc->no_removexattr = 1;
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 0eb20012792f..d3f76101ad4b 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -170,7 +170,8 @@ config ROOT_NFS
config NFS_FSCACHE
bool "Provide NFS client caching support"
- depends on NFS_FS=m && NETFS_SUPPORT || NFS_FS=y && NETFS_SUPPORT=y
+ depends on NFS_FS
+ select NETFS_SUPPORT
select FSCACHE
help
Say Y here if you want NFS data to be cached locally on disc through
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 7832fb0369a1..8397c43358bd 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -718,7 +718,7 @@ __be32 nfs4_callback_offload(void *data, void *dummy,
copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL);
if (!copy)
- return htonl(NFS4ERR_SERVERFAULT);
+ return cpu_to_be32(NFS4ERR_DELAY);
spin_lock(&cps->clp->cl_lock);
rcu_read_lock();
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 550ca934c9cf..3b0918ade53c 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -38,7 +38,7 @@
#include <linux/sunrpc/bc_xprt.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
-
+#include <linux/nfslocalio.h>
#include "nfs4_fs.h"
#include "callback.h"
@@ -186,7 +186,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
seqlock_init(&clp->cl_boot_lock);
ktime_get_real_ts64(&clp->cl_nfssvc_boot);
nfs_uuid_init(&clp->cl_uuid);
- spin_lock_init(&clp->cl_localio_lock);
+ INIT_WORK(&clp->cl_local_probe_work, nfs_local_probe_async_work);
#endif /* CONFIG_NFS_LOCALIO */
clp->cl_principal = "*";
@@ -244,7 +244,7 @@ static void pnfs_init_server(struct nfs_server *server)
*/
void nfs_free_client(struct nfs_client *clp)
{
- nfs_local_disable(clp);
+ nfs_localio_disable_client(clp);
/* -EIO all pending I/O */
if (!IS_ERR(clp->cl_rpcclient))
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index b08dbe96bc57..f45beea92d03 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -303,6 +303,7 @@ static void nfs_read_sync_pgio_error(struct list_head *head, int error)
static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
{
get_dreq(hdr->dreq);
+ set_bit(NFS_IOHDR_ODIRECT, &hdr->flags);
}
static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index f78115c6c2c1..98b45b636be3 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -164,18 +164,17 @@ decode_name(struct xdr_stream *xdr, u32 *id)
}
static struct nfsd_file *
-ff_local_open_fh(struct nfs_client *clp, const struct cred *cred,
+ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx,
+ struct nfs_client *clp, const struct cred *cred,
struct nfs_fh *fh, fmode_t mode)
{
- if (mode & FMODE_WRITE) {
- /*
- * Always request read and write access since this corresponds
- * to a rw layout.
- */
- mode |= FMODE_READ;
- }
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+ struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
- return nfs_local_open_fh(clp, cred, fh, mode);
+ return nfs_local_open_fh(clp, cred, fh, &mirror->nfl, mode);
+#else
+ return NULL;
+#endif
}
static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
@@ -247,6 +246,7 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
spin_lock_init(&mirror->lock);
refcount_set(&mirror->ref, 1);
INIT_LIST_HEAD(&mirror->mirrors);
+ nfs_localio_file_init(&mirror->nfl);
}
return mirror;
}
@@ -257,6 +257,7 @@ static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
ff_layout_remove_mirror(mirror);
kfree(mirror->fh_versions);
+ nfs_close_local_fh(&mirror->nfl);
cred = rcu_access_pointer(mirror->ro_cred);
put_cred(cred);
cred = rcu_access_pointer(mirror->rw_cred);
@@ -847,6 +848,9 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs4_pnfs_ds *ds;
u32 ds_idx;
+ if (NFS_SERVER(pgio->pg_inode)->flags &
+ (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
+ pgio->pg_maxretrans = io_maxretrans;
retry:
pnfs_generic_pg_check_layout(pgio, req);
/* Use full layout for now */
@@ -860,6 +864,8 @@ retry:
if (!pgio->pg_lseg)
goto out_nolseg;
}
+ /* Reset wb_nio, since getting layout segment was successful */
+ req->wb_nio = 0;
ds = ff_layout_get_ds_for_read(pgio, &ds_idx);
if (!ds) {
@@ -876,14 +882,24 @@ retry:
pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
pgio->pg_mirror_idx = ds_idx;
-
- if (NFS_SERVER(pgio->pg_inode)->flags &
- (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
- pgio->pg_maxretrans = io_maxretrans;
return;
out_nolseg:
- if (pgio->pg_error < 0)
- return;
+ if (pgio->pg_error < 0) {
+ if (pgio->pg_error != -EAGAIN)
+ return;
+ /* Retry getting layout segment if lower layer returned -EAGAIN */
+ if (pgio->pg_maxretrans && req->wb_nio++ > pgio->pg_maxretrans) {
+ if (NFS_SERVER(pgio->pg_inode)->flags & NFS_MOUNT_SOFTERR)
+ pgio->pg_error = -ETIMEDOUT;
+ else
+ pgio->pg_error = -EIO;
+ return;
+ }
+ pgio->pg_error = 0;
+ /* Sleep for 1 second before retrying */
+ ssleep(1);
+ goto retry;
+ }
out_mds:
trace_pnfs_mds_fallback_pg_init_read(pgio->pg_inode,
0, NFS4_MAX_UINT64, IOMODE_READ,
@@ -1820,7 +1836,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
hdr->mds_offset = offset;
/* Start IO accounting for local read */
- localio = ff_local_open_fh(ds->ds_clp, ds_cred, fh, FMODE_READ);
+ localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, FMODE_READ);
if (localio) {
hdr->task.tk_start = ktime_get();
ff_layout_read_record_layoutstats_start(&hdr->task, hdr);
@@ -1896,7 +1912,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
hdr->args.offset = offset;
/* Start IO accounting for local write */
- localio = ff_local_open_fh(ds->ds_clp, ds_cred, fh,
+ localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh,
FMODE_READ|FMODE_WRITE);
if (localio) {
hdr->task.tk_start = ktime_get();
@@ -1981,7 +1997,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
data->args.fh = fh;
/* Start IO accounting for local commit */
- localio = ff_local_open_fh(ds->ds_clp, ds_cred, fh,
+ localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh,
FMODE_READ|FMODE_WRITE);
if (localio) {
data->task.tk_start = ktime_get();
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index f84b3fb0dddd..095df09017a5 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -83,6 +83,7 @@ struct nfs4_ff_layout_mirror {
nfs4_stateid stateid;
const struct cred __rcu *ro_cred;
const struct cred __rcu *rw_cred;
+ struct nfs_file_localio nfl;
refcount_t ref;
spinlock_t lock;
unsigned long flags;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 596f35170137..1aa67fca69b2 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1137,6 +1137,8 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
ctx->lock_context.open_context = ctx;
INIT_LIST_HEAD(&ctx->list);
ctx->mdsthreshold = NULL;
+ nfs_localio_file_init(&ctx->nfl);
+
return ctx;
}
EXPORT_SYMBOL_GPL(alloc_nfs_open_context);
@@ -1168,6 +1170,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
nfs_sb_deactive(sb);
put_rpccred(rcu_dereference_protected(ctx->ll_cred, 1));
kfree(ctx->mdsthreshold);
+ nfs_close_local_fh(&ctx->nfl);
kfree_rcu(ctx, rcu_head);
}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e564bd11ba60..fae2c7ae4acc 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -455,11 +455,13 @@ extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
/* localio.c */
-extern void nfs_local_disable(struct nfs_client *);
extern void nfs_local_probe(struct nfs_client *);
+extern void nfs_local_probe_async(struct nfs_client *);
+extern void nfs_local_probe_async_work(struct work_struct *);
extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *,
const struct cred *,
struct nfs_fh *,
+ struct nfs_file_localio *,
const fmode_t);
extern int nfs_local_doio(struct nfs_client *,
struct nfsd_file *,
@@ -471,11 +473,12 @@ extern int nfs_local_commit(struct nfsd_file *,
extern bool nfs_server_is_local(const struct nfs_client *clp);
#else /* CONFIG_NFS_LOCALIO */
-static inline void nfs_local_disable(struct nfs_client *clp) {}
static inline void nfs_local_probe(struct nfs_client *clp) {}
+static inline void nfs_local_probe_async(struct nfs_client *clp) {}
static inline struct nfsd_file *
nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
- struct nfs_fh *fh, const fmode_t mode)
+ struct nfs_fh *fh, struct nfs_file_localio *nfl,
+ const fmode_t mode)
{
return NULL;
}
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 4b8618cf114c..5c21caeae075 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -35,6 +35,7 @@ struct nfs_local_kiocb {
struct bio_vec *bvec;
struct nfs_pgio_header *hdr;
struct work_struct work;
+ void (*aio_complete_work)(struct work_struct *);
struct nfsd_file *localio;
};
@@ -48,9 +49,14 @@ struct nfs_local_fsync_ctx {
static bool localio_enabled __read_mostly = true;
module_param(localio_enabled, bool, 0644);
+static bool localio_O_DIRECT_semantics __read_mostly = false;
+module_param(localio_O_DIRECT_semantics, bool, 0644);
+MODULE_PARM_DESC(localio_O_DIRECT_semantics,
+ "LOCALIO will use O_DIRECT semantics to filesystem.");
+
static inline bool nfs_client_is_local(const struct nfs_client *clp)
{
- return !!test_bit(NFS_CS_LOCAL_IO, &clp->cl_flags);
+ return !!rcu_access_pointer(clp->cl_uuid.net);
}
bool nfs_server_is_local(const struct nfs_client *clp)
@@ -116,30 +122,6 @@ const struct rpc_program nfslocalio_program = {
};
/*
- * nfs_local_enable - enable local i/o for an nfs_client
- */
-static void nfs_local_enable(struct nfs_client *clp)
-{
- spin_lock(&clp->cl_localio_lock);
- set_bit(NFS_CS_LOCAL_IO, &clp->cl_flags);
- trace_nfs_local_enable(clp);
- spin_unlock(&clp->cl_localio_lock);
-}
-
-/*
- * nfs_local_disable - disable local i/o for an nfs_client
- */
-void nfs_local_disable(struct nfs_client *clp)
-{
- spin_lock(&clp->cl_localio_lock);
- if (test_and_clear_bit(NFS_CS_LOCAL_IO, &clp->cl_flags)) {
- trace_nfs_local_disable(clp);
- nfs_uuid_invalidate_one_client(&clp->cl_uuid);
- }
- spin_unlock(&clp->cl_localio_lock);
-}
-
-/*
* nfs_init_localioclient - Initialise an NFS localio client connection
*/
static struct rpc_clnt *nfs_init_localioclient(struct nfs_client *clp)
@@ -178,7 +160,7 @@ static bool nfs_server_uuid_is_local(struct nfs_client *clp)
rpc_shutdown_client(rpcclient_localio);
/* Server is only local if it initialized required struct members */
- if (status || !clp->cl_uuid.net || !clp->cl_uuid.dom)
+ if (status || !rcu_access_pointer(clp->cl_uuid.net) || !clp->cl_uuid.dom)
return false;
return true;
@@ -194,44 +176,64 @@ void nfs_local_probe(struct nfs_client *clp)
/* Disallow localio if disabled via sysfs or AUTH_SYS isn't used */
if (!localio_enabled ||
clp->cl_rpcclient->cl_auth->au_flavor != RPC_AUTH_UNIX) {
- nfs_local_disable(clp);
+ nfs_localio_disable_client(clp);
return;
}
if (nfs_client_is_local(clp)) {
/* If already enabled, disable and re-enable */
- nfs_local_disable(clp);
+ nfs_localio_disable_client(clp);
}
if (!nfs_uuid_begin(&clp->cl_uuid))
return;
if (nfs_server_uuid_is_local(clp))
- nfs_local_enable(clp);
+ nfs_localio_enable_client(clp);
nfs_uuid_end(&clp->cl_uuid);
}
EXPORT_SYMBOL_GPL(nfs_local_probe);
+void nfs_local_probe_async_work(struct work_struct *work)
+{
+ struct nfs_client *clp =
+ container_of(work, struct nfs_client, cl_local_probe_work);
+
+ nfs_local_probe(clp);
+}
+
+void nfs_local_probe_async(struct nfs_client *clp)
+{
+ queue_work(nfsiod_workqueue, &clp->cl_local_probe_work);
+}
+EXPORT_SYMBOL_GPL(nfs_local_probe_async);
+
+static inline struct nfsd_file *nfs_local_file_get(struct nfsd_file *nf)
+{
+ return nfs_to->nfsd_file_get(nf);
+}
+
+static inline void nfs_local_file_put(struct nfsd_file *nf)
+{
+ nfs_to->nfsd_file_put(nf);
+}
+
/*
- * nfs_local_open_fh - open a local filehandle in terms of nfsd_file
+ * __nfs_local_open_fh - open a local filehandle in terms of nfsd_file.
*
- * Returns a pointer to a struct nfsd_file or NULL
+ * Returns a pointer to a struct nfsd_file or ERR_PTR.
+ * Caller must release returned nfsd_file with nfs_to_nfsd_file_put_local().
*/
-struct nfsd_file *
-nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
- struct nfs_fh *fh, const fmode_t mode)
+static struct nfsd_file *
+__nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
+ struct nfs_fh *fh, struct nfs_file_localio *nfl,
+ const fmode_t mode)
{
struct nfsd_file *localio;
- int status;
-
- if (!nfs_server_is_local(clp))
- return NULL;
- if (mode & ~(FMODE_READ | FMODE_WRITE))
- return NULL;
localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient,
- cred, fh, mode);
+ cred, fh, nfl, mode);
if (IS_ERR(localio)) {
- status = PTR_ERR(localio);
+ int status = PTR_ERR(localio);
trace_nfs_local_open_fh(fh, mode, status);
switch (status) {
case -ENOMEM:
@@ -240,10 +242,59 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
/* Revalidate localio, will disable if unsupported */
nfs_local_probe(clp);
}
- return NULL;
}
return localio;
}
+
+/*
+ * nfs_local_open_fh - open a local filehandle in terms of nfsd_file.
+ * First check if the open nfsd_file is already cached; otherwise open
+ * via __nfs_local_open_fh and insert the nfsd_file into nfs_file_localio.
+ *
+ * Returns a pointer to a struct nfsd_file or NULL.
+ */
+struct nfsd_file *
+nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
+ struct nfs_fh *fh, struct nfs_file_localio *nfl,
+ const fmode_t mode)
+{
+ struct nfsd_file *nf, *new, __rcu **pnf;
+
+ if (!nfs_server_is_local(clp))
+ return NULL;
+ if (mode & ~(FMODE_READ | FMODE_WRITE))
+ return NULL;
+
+ if (mode & FMODE_WRITE)
+ pnf = &nfl->rw_file;
+ else
+ pnf = &nfl->ro_file;
+
+ new = NULL;
+ rcu_read_lock();
+ nf = rcu_dereference(*pnf);
+ if (!nf) {
+ rcu_read_unlock();
+ new = __nfs_local_open_fh(clp, cred, fh, nfl, mode);
+ if (IS_ERR(new))
+ return NULL;
+ /* try to swap in the pointer */
+ spin_lock(&clp->cl_uuid.lock);
+ nf = rcu_dereference_protected(*pnf, 1);
+ if (!nf) {
+ nf = new;
+ new = NULL;
+ rcu_assign_pointer(*pnf, nf);
+ }
+ spin_unlock(&clp->cl_uuid.lock);
+ rcu_read_lock();
+ }
+ nf = nfs_local_file_get(nf);
+ rcu_read_unlock();
+ if (new)
+ nfs_to_nfsd_file_put_local(new);
+ return nf;
+}
EXPORT_SYMBOL_GPL(nfs_local_open_fh);
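
The function above is an instance of a common kernel pattern: an RCU-guarded
lockless fast path, allocation outside all locks, installation under a
spinlock, and discarding the loser's copy on a race. A distilled sketch with
generic names (not NFS-specific; alloc_obj()/free_obj() are placeholders):

    rcu_read_lock();
    obj = rcu_dereference(*slot);           /* lockless fast path */
    if (!obj) {
        rcu_read_unlock();
        new = alloc_obj();                  /* may sleep: outside locks */
        spin_lock(&lock);
        obj = rcu_dereference_protected(*slot, 1);
        if (!obj) {                         /* we won the race: publish */
            obj = new;
            new = NULL;
            rcu_assign_pointer(*slot, obj);
        }
        spin_unlock(&lock);
        rcu_read_lock();
    }
    /* ...take a reference on obj under rcu_read_lock()... */
    rcu_read_unlock();
    if (new)
        free_obj(new);                      /* we lost: drop our copy */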
static struct bio_vec *
@@ -285,10 +336,19 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
kfree(iocb);
return NULL;
}
- init_sync_kiocb(&iocb->kiocb, file);
+
+ if (localio_O_DIRECT_semantics &&
+ test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) {
+ iocb->kiocb.ki_filp = file;
+ iocb->kiocb.ki_flags = IOCB_DIRECT;
+ } else
+ init_sync_kiocb(&iocb->kiocb, file);
+
iocb->kiocb.ki_pos = hdr->args.offset;
iocb->hdr = hdr;
iocb->kiocb.ki_flags &= ~IOCB_APPEND;
+ iocb->aio_complete_work = NULL;
+
return iocb;
}
@@ -328,7 +388,7 @@ nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
hdr->res.op_status = NFS4_OK;
hdr->task.tk_status = 0;
} else {
- hdr->res.op_status = nfs4_stat_to_errno(status);
+ hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status);
hdr->task.tk_status = status;
}
}
@@ -338,11 +398,23 @@ nfs_local_pgio_release(struct nfs_local_kiocb *iocb)
{
struct nfs_pgio_header *hdr = iocb->hdr;
- nfs_to_nfsd_file_put_local(iocb->localio);
+ nfs_local_file_put(iocb->localio);
nfs_local_iocb_free(iocb);
nfs_local_hdr_release(hdr, hdr->task.tk_ops);
}
+/*
+ * Complete the I/O from iocb->kiocb.ki_complete()
+ *
+ * Note that this function can be called from a bottom half context,
+ * hence we need to queue the rpc_call_done() etc to a workqueue
+ */
+static inline void nfs_local_pgio_aio_complete(struct nfs_local_kiocb *iocb)
+{
+ INIT_WORK(&iocb->work, iocb->aio_complete_work);
+ queue_work(nfsiod_workqueue, &iocb->work);
+}
+
static void
nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
{
@@ -365,6 +437,23 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
status > 0 ? status : 0, hdr->res.eof);
}
+static void nfs_local_read_aio_complete_work(struct work_struct *work)
+{
+ struct nfs_local_kiocb *iocb =
+ container_of(work, struct nfs_local_kiocb, work);
+
+ nfs_local_pgio_release(iocb);
+}
+
+static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret)
+{
+ struct nfs_local_kiocb *iocb =
+ container_of(kiocb, struct nfs_local_kiocb, kiocb);
+
+ nfs_local_read_done(iocb, ret);
+ nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */
+}
+
static void nfs_local_call_read(struct work_struct *work)
{
struct nfs_local_kiocb *iocb =
@@ -379,10 +468,10 @@ static void nfs_local_call_read(struct work_struct *work)
nfs_local_iter_init(&iter, iocb, READ);
status = filp->f_op->read_iter(&iocb->kiocb, &iter);
- WARN_ON_ONCE(status == -EIOCBQUEUED);
-
- nfs_local_read_done(iocb, status);
- nfs_local_pgio_release(iocb);
+ if (status != -EIOCBQUEUED) {
+ nfs_local_read_done(iocb, status);
+ nfs_local_pgio_release(iocb);
+ }
revert_creds(save_cred);
}
@@ -410,6 +499,11 @@ nfs_do_local_read(struct nfs_pgio_header *hdr,
nfs_local_pgio_init(hdr, call_ops);
hdr->res.eof = false;
+ if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
+ iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
+ iocb->aio_complete_work = nfs_local_read_aio_complete_work;
+ }
+
INIT_WORK(&iocb->work, nfs_local_call_read);
queue_work(nfslocaliod_workqueue, &iocb->work);
@@ -534,6 +628,24 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
nfs_local_pgio_done(hdr, status);
}
+static void nfs_local_write_aio_complete_work(struct work_struct *work)
+{
+ struct nfs_local_kiocb *iocb =
+ container_of(work, struct nfs_local_kiocb, work);
+
+ nfs_local_vfs_getattr(iocb);
+ nfs_local_pgio_release(iocb);
+}
+
+static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret)
+{
+ struct nfs_local_kiocb *iocb =
+ container_of(kiocb, struct nfs_local_kiocb, kiocb);
+
+ nfs_local_write_done(iocb, ret);
+ nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */
+}
+
static void nfs_local_call_write(struct work_struct *work)
{
struct nfs_local_kiocb *iocb =
@@ -552,11 +664,11 @@ static void nfs_local_call_write(struct work_struct *work)
file_start_write(filp);
status = filp->f_op->write_iter(&iocb->kiocb, &iter);
file_end_write(filp);
- WARN_ON_ONCE(status == -EIOCBQUEUED);
-
- nfs_local_write_done(iocb, status);
- nfs_local_vfs_getattr(iocb);
- nfs_local_pgio_release(iocb);
+ if (status != -EIOCBQUEUED) {
+ nfs_local_write_done(iocb, status);
+ nfs_local_vfs_getattr(iocb);
+ nfs_local_pgio_release(iocb);
+ }
revert_creds(save_cred);
current->flags = old_flags;
@@ -592,10 +704,16 @@ nfs_do_local_write(struct nfs_pgio_header *hdr,
case NFS_FILE_SYNC:
iocb->kiocb.ki_flags |= IOCB_DSYNC|IOCB_SYNC;
}
+
nfs_local_pgio_init(hdr, call_ops);
nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable);
+ if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
+ iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
+ iocb->aio_complete_work = nfs_local_write_aio_complete_work;
+ }
+
INIT_WORK(&iocb->work, nfs_local_call_write);
queue_work(nfslocaliod_workqueue, &iocb->work);
@@ -626,8 +744,8 @@ int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio,
if (status != 0) {
if (status == -EAGAIN)
- nfs_local_disable(clp);
- nfs_to_nfsd_file_put_local(localio);
+ nfs_localio_disable_client(clp);
+ nfs_local_file_put(localio);
hdr->task.tk_status = status;
nfs_local_hdr_release(hdr, call_ops);
}
@@ -668,7 +786,7 @@ nfs_local_commit_done(struct nfs_commit_data *data, int status)
data->task.tk_status = 0;
} else {
nfs_reset_boot_verifier(data->inode);
- data->res.op_status = nfs4_stat_to_errno(status);
+ data->res.op_status = nfs_localio_errno_to_nfs4_stat(status);
data->task.tk_status = status;
}
}
@@ -678,7 +796,7 @@ nfs_local_release_commit_data(struct nfsd_file *localio,
struct nfs_commit_data *data,
const struct rpc_call_ops *call_ops)
{
- nfs_to_nfsd_file_put_local(localio);
+ nfs_local_file_put(localio);
call_ops->rpc_call_done(&data->task, data);
call_ops->rpc_release(data);
}
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 1566163c6d85..7359e1a3bd84 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -844,6 +844,41 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
return status;
}
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+
+static unsigned nfs3_localio_probe_throttle __read_mostly = 0;
+module_param(nfs3_localio_probe_throttle, uint, 0644);
+MODULE_PARM_DESC(nfs3_localio_probe_throttle,
+ "Probe for NFSv3 LOCALIO every N IO requests. Must be power-of-2, defaults to 0 (probing disabled).");
+
+static void nfs3_localio_probe(struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+
+ /* Throttled to reduce nfs_local_probe_async() frequency */
+ if (!nfs3_localio_probe_throttle || nfs_server_is_local(clp))
+ return;
+
+ /*
+	 * Try (re)enabling LOCALIO if it isn't enabled -- the admin deems
+	 * it worthwhile to periodically check if LOCALIO is possible by
+ * setting the 'nfs3_localio_probe_throttle' module parameter.
+ *
+ * This is useful if LOCALIO was previously enabled, but was
+ * disabled due to server restart, and IO has successfully
+ * completed in terms of normal RPC.
+ */
+ if ((clp->cl_uuid.nfs3_localio_probe_count++ &
+ (nfs3_localio_probe_throttle - 1)) == 0) {
+ if (!nfs_server_is_local(clp))
+ nfs_local_probe_async(clp);
+ }
+}
+
+#else
+static void nfs3_localio_probe(struct nfs_server *server) {}
+#endif
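
The throttle test relies on power-of-2 masking: for a power-of-2 N,
(count++ & (N - 1)) == 0 holds exactly once every N calls, which is why the
parameter description insists on a power of 2. A standalone userspace
demonstration of the same arithmetic (hypothetical, for illustration only):

    #include <stdio.h>

    int main(void)
    {
        unsigned int count = 0;
        const unsigned int throttle = 8;    /* must be a power of 2 */

        for (int i = 0; i < 24; i++) {
            if ((count++ & (throttle - 1)) == 0)
                printf("probe on request %d\n", i); /* fires at 0, 8, 16 */
        }
        return 0;
    }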
+
static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
struct inode *inode = hdr->inode;
@@ -855,8 +890,11 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
if (nfs3_async_handle_jukebox(task, inode))
return -EAGAIN;
- if (task->tk_status >= 0 && !server->read_hdrsize)
- cmpxchg(&server->read_hdrsize, 0, hdr->res.replen);
+ if (task->tk_status >= 0) {
+ if (!server->read_hdrsize)
+ cmpxchg(&server->read_hdrsize, 0, hdr->res.replen);
+ nfs3_localio_probe(server);
+ }
nfs_invalidate_atime(inode);
nfs_refresh_inode(inode, &hdr->fattr);
@@ -886,8 +924,10 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
if (nfs3_async_handle_jukebox(task, inode))
return -EAGAIN;
- if (task->tk_status >= 0)
+ if (task->tk_status >= 0) {
nfs_writeback_update_inode(hdr);
+ nfs3_localio_probe(NFS_SERVER(inode));
+ }
return 0;
}
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 531c9c20ef1d..1924c4a2077b 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -498,15 +498,15 @@ out_put_src_lock:
return err;
}
-struct nfs42_offloadcancel_data {
+struct nfs42_offload_data {
struct nfs_server *seq_server;
struct nfs42_offload_status_args args;
struct nfs42_offload_status_res res;
};
-static void nfs42_offload_cancel_prepare(struct rpc_task *task, void *calldata)
+static void nfs42_offload_prepare(struct rpc_task *task, void *calldata)
{
- struct nfs42_offloadcancel_data *data = calldata;
+ struct nfs42_offload_data *data = calldata;
nfs4_setup_sequence(data->seq_server->nfs_client,
&data->args.osa_seq_args,
@@ -515,7 +515,7 @@ static void nfs42_offload_cancel_prepare(struct rpc_task *task, void *calldata)
static void nfs42_offload_cancel_done(struct rpc_task *task, void *calldata)
{
- struct nfs42_offloadcancel_data *data = calldata;
+ struct nfs42_offload_data *data = calldata;
trace_nfs4_offload_cancel(&data->args, task->tk_status);
nfs41_sequence_done(task, &data->res.osr_seq_res);
@@ -525,22 +525,22 @@ static void nfs42_offload_cancel_done(struct rpc_task *task, void *calldata)
rpc_restart_call_prepare(task);
}
-static void nfs42_free_offloadcancel_data(void *data)
+static void nfs42_offload_release(void *data)
{
kfree(data);
}
static const struct rpc_call_ops nfs42_offload_cancel_ops = {
- .rpc_call_prepare = nfs42_offload_cancel_prepare,
+ .rpc_call_prepare = nfs42_offload_prepare,
.rpc_call_done = nfs42_offload_cancel_done,
- .rpc_release = nfs42_free_offloadcancel_data,
+ .rpc_release = nfs42_offload_release,
};
static int nfs42_do_offload_cancel_async(struct file *dst,
nfs4_stateid *stateid)
{
struct nfs_server *dst_server = NFS_SERVER(file_inode(dst));
- struct nfs42_offloadcancel_data *data = NULL;
+ struct nfs42_offload_data *data = NULL;
struct nfs_open_context *ctx = nfs_file_open_context(dst);
struct rpc_task *task;
struct rpc_message msg = {
@@ -552,14 +552,14 @@ static int nfs42_do_offload_cancel_async(struct file *dst,
.rpc_message = &msg,
.callback_ops = &nfs42_offload_cancel_ops,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE,
};
int status;
if (!(dst_server->caps & NFS_CAP_OFFLOAD_CANCEL))
return -EOPNOTSUPP;
- data = kzalloc(sizeof(struct nfs42_offloadcancel_data), GFP_KERNEL);
+ data = kzalloc(sizeof(struct nfs42_offload_data), GFP_KERNEL);
if (data == NULL)
return -ENOMEM;
@@ -861,7 +861,7 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server,
.rpc_message = &msg,
.callback_ops = &nfs42_layoutstat_ops,
.callback_data = data,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE,
};
struct rpc_task *task;
@@ -1016,7 +1016,7 @@ int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
struct rpc_task_setup task_setup = {
.rpc_message = &msg,
.callback_ops = &nfs42_layouterror_ops,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE,
};
unsigned int i;
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 9e3ae53e2205..5072d7ea72e9 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -144,9 +144,11 @@
decode_putfh_maxsz + \
decode_offload_cancel_maxsz)
#define NFS4_enc_copy_notify_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_copy_notify_maxsz)
#define NFS4_dec_copy_notify_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_copy_notify_maxsz)
#define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \
@@ -549,7 +551,7 @@ static void nfs4_xdr_enc_copy(struct rpc_rqst *req,
}
/*
- * Encode OFFLOAD_CANEL request
+ * Encode OFFLOAD_CANCEL request
*/
static void nfs4_xdr_enc_offload_cancel(struct rpc_rqst *req,
struct xdr_stream *xdr,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 9a9f60a2291b..542cdf71229f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1955,6 +1955,7 @@ restart:
}
rcu_read_unlock();
nfs4_free_state_owners(&freeme);
+ nfs_local_probe_async(clp);
if (lost_locks)
pr_warn("NFS: %s: lost %d locks\n",
clp->cl_hostname, lost_locks);
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 1eab98c277fa..7a058bd8c566 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -1714,38 +1714,6 @@ TRACE_EVENT(nfs_local_open_fh,
)
);
-DECLARE_EVENT_CLASS(nfs_local_client_event,
- TP_PROTO(
- const struct nfs_client *clp
- ),
-
- TP_ARGS(clp),
-
- TP_STRUCT__entry(
- __field(unsigned int, protocol)
- __string(server, clp->cl_hostname)
- ),
-
- TP_fast_assign(
- __entry->protocol = clp->rpc_ops->version;
- __assign_str(server);
- ),
-
- TP_printk(
- "server=%s NFSv%u", __get_str(server), __entry->protocol
- )
-);
-
-#define DEFINE_NFS_LOCAL_CLIENT_EVENT(name) \
- DEFINE_EVENT(nfs_local_client_event, name, \
- TP_PROTO( \
- const struct nfs_client *clp \
- ), \
- TP_ARGS(clp))
-
-DEFINE_NFS_LOCAL_CLIENT_EVENT(nfs_local_enable);
-DEFINE_NFS_LOCAL_CLIENT_EVENT(nfs_local_disable);
-
DECLARE_EVENT_CLASS(nfs_xdr_event,
TP_PROTO(
const struct xdr_stream *xdr,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e27c07bd8929..11968dcb7243 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -961,8 +961,9 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
struct nfs_client *clp = NFS_SERVER(hdr->inode)->nfs_client;
struct nfsd_file *localio =
- nfs_local_open_fh(clp, hdr->cred,
- hdr->args.fh, hdr->args.context->mode);
+ nfs_local_open_fh(clp, hdr->cred, hdr->args.fh,
+ &hdr->args.context->nfl,
+ hdr->args.context->mode);
if (NFS_SERVER(hdr->inode)->nfs_client->cl_minorversion)
task_flags = RPC_TASK_MOVEABLE;
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
index bf378ecd5d9f..7b59a40d40c0 100644
--- a/fs/nfs/sysfs.c
+++ b/fs/nfs/sysfs.c
@@ -280,9 +280,9 @@ void nfs_sysfs_link_rpc_client(struct nfs_server *server,
char name[RPC_CLIENT_NAME_SIZE];
int ret;
- strcpy(name, clnt->cl_program->name);
- strcat(name, uniq ? uniq : "");
- strcat(name, "_client");
+ strscpy(name, clnt->cl_program->name, sizeof(name));
+ strncat(name, uniq ? uniq : "", sizeof(name) - strlen(name) - 1);
+ strncat(name, "_client", sizeof(name) - strlen(name) - 1);
ret = sysfs_create_link_nowarn(&server->kobj,
&clnt->cl_sysfs->kobject, name);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 50fa539611f5..aa3d8bea3ec0 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1826,7 +1826,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
task_flags = RPC_TASK_MOVEABLE;
localio = nfs_local_open_fh(NFS_SERVER(inode)->nfs_client, data->cred,
- data->args.fh, data->context->mode);
+ data->args.fh, &data->context->nfl,
+ data->context->mode);
return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
data->mds_ops, how,
RPC_TASK_CRED_NOREF | task_flags, localio);
diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile
index a5e54809701e..c10ead273ff2 100644
--- a/fs/nfs_common/Makefile
+++ b/fs/nfs_common/Makefile
@@ -6,8 +6,9 @@
obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o
nfs_acl-objs := nfsacl.o
+CFLAGS_localio_trace.o += -I$(src)
obj-$(CONFIG_NFS_COMMON_LOCALIO_SUPPORT) += nfs_localio.o
-nfs_localio-objs := nfslocalio.o
+nfs_localio-objs := nfslocalio.o localio_trace.o
obj-$(CONFIG_GRACE_PERIOD) += grace.o
obj-$(CONFIG_NFS_V4_2_SSC_HELPER) += nfs_ssc.o
diff --git a/fs/nfs_common/common.c b/fs/nfs_common/common.c
index 34a115176f97..af09aed09fd2 100644
--- a/fs/nfs_common/common.c
+++ b/fs/nfs_common/common.c
@@ -15,7 +15,7 @@ static const struct {
{ NFS_OK, 0 },
{ NFSERR_PERM, -EPERM },
{ NFSERR_NOENT, -ENOENT },
- { NFSERR_IO, -errno_NFSERR_IO},
+ { NFSERR_IO, -EIO },
{ NFSERR_NXIO, -ENXIO },
/* { NFSERR_EAGAIN, -EAGAIN }, */
{ NFSERR_ACCES, -EACCES },
@@ -45,7 +45,6 @@ static const struct {
{ NFSERR_SERVERFAULT, -EREMOTEIO },
{ NFSERR_BADTYPE, -EBADTYPE },
{ NFSERR_JUKEBOX, -EJUKEBOX },
- { -1, -EIO }
};
/**
@@ -59,26 +58,29 @@ int nfs_stat_to_errno(enum nfs_stat status)
{
int i;
- for (i = 0; nfs_errtbl[i].stat != -1; i++) {
+ for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) {
if (nfs_errtbl[i].stat == (int)status)
return nfs_errtbl[i].errno;
}
- return nfs_errtbl[i].errno;
+ return -EIO;
}
EXPORT_SYMBOL_GPL(nfs_stat_to_errno);
/*
* We need to translate between nfs v4 status return values and
* the local errno values which may not be the same.
+ *
+ * nfs4_errtbl_common[] is consulted before the more specialized
+ * mappings available in nfs4_errtbl[] or nfs4_errtbl_localio[].
*/
static const struct {
int stat;
int errno;
-} nfs4_errtbl[] = {
+} nfs4_errtbl_common[] = {
{ NFS4_OK, 0 },
{ NFS4ERR_PERM, -EPERM },
{ NFS4ERR_NOENT, -ENOENT },
- { NFS4ERR_IO, -errno_NFSERR_IO},
+ { NFS4ERR_IO, -EIO },
{ NFS4ERR_NXIO, -ENXIO },
{ NFS4ERR_ACCESS, -EACCES },
{ NFS4ERR_EXIST, -EEXIST },
@@ -98,15 +100,20 @@ static const struct {
{ NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
{ NFS4ERR_NOTSUPP, -ENOTSUPP },
{ NFS4ERR_TOOSMALL, -ETOOSMALL },
- { NFS4ERR_SERVERFAULT, -EREMOTEIO },
{ NFS4ERR_BADTYPE, -EBADTYPE },
- { NFS4ERR_LOCKED, -EAGAIN },
{ NFS4ERR_SYMLINK, -ELOOP },
- { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
{ NFS4ERR_DEADLOCK, -EDEADLK },
+};
+
+static const struct {
+ int stat;
+ int errno;
+} nfs4_errtbl[] = {
+ { NFS4ERR_SERVERFAULT, -EREMOTEIO },
+ { NFS4ERR_LOCKED, -EAGAIN },
+ { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
{ NFS4ERR_NOXATTR, -ENODATA },
{ NFS4ERR_XATTR2BIG, -E2BIG },
- { -1, -EIO }
};
/*
@@ -116,7 +123,14 @@ static const struct {
int nfs4_stat_to_errno(int stat)
{
int i;
- for (i = 0; nfs4_errtbl[i].stat != -1; i++) {
+
+ /* First check nfs4_errtbl_common */
+ for (i = 0; i < ARRAY_SIZE(nfs4_errtbl_common); i++) {
+ if (nfs4_errtbl_common[i].stat == stat)
+ return nfs4_errtbl_common[i].errno;
+ }
+ /* Then check nfs4_errtbl */
+ for (i = 0; i < ARRAY_SIZE(nfs4_errtbl); i++) {
if (nfs4_errtbl[i].stat == stat)
return nfs4_errtbl[i].errno;
}
@@ -132,3 +146,56 @@ int nfs4_stat_to_errno(int stat)
return -stat;
}
EXPORT_SYMBOL_GPL(nfs4_stat_to_errno);
+
+/*
+ * This table is useful for conversion from local errno to NFS error.
+ * It provides more logically correct mappings for use with LOCALIO
+ * (which is focused on converting from errno to NFS status).
+ */
+static const struct {
+ int stat;
+ int errno;
+} nfs4_errtbl_localio[] = {
+ /* Map errors differently than nfs4_errtbl */
+ { NFS4ERR_IO, -EREMOTEIO },
+ { NFS4ERR_DELAY, -EAGAIN },
+ { NFS4ERR_FBIG, -E2BIG },
+ /* Map errors not handled by nfs4_errtbl */
+ { NFS4ERR_STALE, -EBADF },
+ { NFS4ERR_STALE, -EOPENSTALE },
+ { NFS4ERR_DELAY, -ETIMEDOUT },
+ { NFS4ERR_DELAY, -ERESTARTSYS },
+ { NFS4ERR_DELAY, -ENOMEM },
+ { NFS4ERR_IO, -ETXTBSY },
+ { NFS4ERR_IO, -EBUSY },
+ { NFS4ERR_SERVERFAULT, -ESERVERFAULT },
+ { NFS4ERR_SERVERFAULT, -ENFILE },
+ { NFS4ERR_IO, -EUCLEAN },
+ { NFS4ERR_PERM, -ENOKEY },
+};
+
+/*
+ * Convert an errno to an NFS error code for LOCALIO.
+ */
+__u32 nfs_localio_errno_to_nfs4_stat(int errno)
+{
+ int i;
+
+ /* First check nfs4_errtbl_common */
+ for (i = 0; i < ARRAY_SIZE(nfs4_errtbl_common); i++) {
+ if (nfs4_errtbl_common[i].errno == errno)
+ return nfs4_errtbl_common[i].stat;
+ }
+ /* Then check nfs4_errtbl_localio */
+ for (i = 0; i < ARRAY_SIZE(nfs4_errtbl_localio); i++) {
+ if (nfs4_errtbl_localio[i].errno == errno)
+ return nfs4_errtbl_localio[i].stat;
+ }
+ /* If we cannot translate the error, the recovery routines should
+ * handle it.
+	 * Note: remaining NFSv4 error codes have values > 10000, so they should
+ * not conflict with native Linux error codes.
+ */
+ return NFS4ERR_SERVERFAULT;
+}
+EXPORT_SYMBOL_GPL(nfs_localio_errno_to_nfs4_stat);
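
Given the two-table lookup order above, some illustrative results (derived
by reading the tables, not taken from a test suite):

    nfs_localio_errno_to_nfs4_stat(-EIO)    == NFS4ERR_IO          /* common table */
    nfs_localio_errno_to_nfs4_stat(-EAGAIN) == NFS4ERR_DELAY       /* localio table */
    nfs_localio_errno_to_nfs4_stat(-ENOMEM) == NFS4ERR_DELAY
    nfs_localio_errno_to_nfs4_stat(-EBADF)  == NFS4ERR_STALE
    nfs_localio_errno_to_nfs4_stat(-EPIPE)  == NFS4ERR_SERVERFAULT /* fallback */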
diff --git a/fs/nfs_common/localio_trace.c b/fs/nfs_common/localio_trace.c
new file mode 100644
index 000000000000..7decfe57abeb
--- /dev/null
+++ b/fs/nfs_common/localio_trace.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2024 Trond Myklebust <trond.myklebust@hammerspace.com>
+ * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com>
+ */
+#include <linux/nfs_fs.h>
+#include <linux/namei.h>
+
+#define CREATE_TRACE_POINTS
+#include "localio_trace.h"
diff --git a/fs/nfs_common/localio_trace.h b/fs/nfs_common/localio_trace.h
new file mode 100644
index 000000000000..4055aec9ff8d
--- /dev/null
+++ b/fs/nfs_common/localio_trace.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Trond Myklebust <trond.myklebust@hammerspace.com>
+ * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com>
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nfs_localio
+
+#if !defined(_TRACE_NFS_COMMON_LOCALIO_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NFS_COMMON_LOCALIO_H
+
+#include <linux/tracepoint.h>
+
+#include <trace/misc/fs.h>
+#include <trace/misc/nfs.h>
+#include <trace/misc/sunrpc.h>
+
+DECLARE_EVENT_CLASS(nfs_local_client_event,
+ TP_PROTO(
+ const struct nfs_client *clp
+ ),
+
+ TP_ARGS(clp),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, protocol)
+ __string(server, clp->cl_hostname)
+ ),
+
+ TP_fast_assign(
+ __entry->protocol = clp->rpc_ops->version;
+ __assign_str(server);
+ ),
+
+ TP_printk(
+ "server=%s NFSv%u", __get_str(server), __entry->protocol
+ )
+);
+
+#define DEFINE_NFS_LOCAL_CLIENT_EVENT(name) \
+ DEFINE_EVENT(nfs_local_client_event, name, \
+ TP_PROTO( \
+ const struct nfs_client *clp \
+ ), \
+ TP_ARGS(clp))
+
+DEFINE_NFS_LOCAL_CLIENT_EVENT(nfs_localio_enable_client);
+DEFINE_NFS_LOCAL_CLIENT_EVENT(nfs_localio_disable_client);
+
+#endif /* _TRACE_NFS_COMMON_LOCALIO_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE localio_trace
+/* This part must be outside protection */
+#include <trace/define_trace.h>
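Editor's note: each DEFINE_NFS_LOCAL_CLIENT_EVENT() above expands into a trace_<name>() helper taking the same arguments as the event class. A sketch of use, assuming clp is a valid nfs_client (the real call sites appear in nfslocalio.c below):

	trace_nfs_localio_enable_client(clp);	/* logs "server=<hostname> NFSv<version>" */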
diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c
index a74ec08f6c96..6a0bdea6d644 100644
--- a/fs/nfs_common/nfslocalio.c
+++ b/fs/nfs_common/nfslocalio.c
@@ -7,38 +7,67 @@
#include <linux/module.h>
#include <linux/list.h>
#include <linux/nfslocalio.h>
+#include <linux/nfs3.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
#include <net/netns/generic.h>
+#include "localio_trace.h"
+
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("NFS localio protocol bypass support");
-static DEFINE_SPINLOCK(nfs_uuid_lock);
+static DEFINE_SPINLOCK(nfs_uuids_lock);
/*
* Global list of nfs_uuid_t instances
- * that is protected by nfs_uuid_lock.
+ * that is protected by nfs_uuids_lock.
*/
static LIST_HEAD(nfs_uuids);
+/*
+ * Lock ordering:
+ * 1: nfs_uuid->lock
+ * 2: nfs_uuids_lock
+ * 3: nfs_uuid->list_lock (aka nn->local_clients_lock)
+ *
+ * May skip locks in select cases, but never hold multiple
+ * locks out of order.
+ */
+
void nfs_uuid_init(nfs_uuid_t *nfs_uuid)
{
- nfs_uuid->net = NULL;
+ RCU_INIT_POINTER(nfs_uuid->net, NULL);
nfs_uuid->dom = NULL;
+ nfs_uuid->list_lock = NULL;
INIT_LIST_HEAD(&nfs_uuid->list);
+ INIT_LIST_HEAD(&nfs_uuid->files);
+ spin_lock_init(&nfs_uuid->lock);
+ nfs_uuid->nfs3_localio_probe_count = 0;
}
EXPORT_SYMBOL_GPL(nfs_uuid_init);
bool nfs_uuid_begin(nfs_uuid_t *nfs_uuid)
{
- spin_lock(&nfs_uuid_lock);
- /* Is this nfs_uuid already in use? */
+ spin_lock(&nfs_uuid->lock);
+ if (rcu_access_pointer(nfs_uuid->net)) {
+ /* This nfs_uuid is already in use */
+ spin_unlock(&nfs_uuid->lock);
+ return false;
+ }
+
+ spin_lock(&nfs_uuids_lock);
if (!list_empty(&nfs_uuid->list)) {
- spin_unlock(&nfs_uuid_lock);
+ /* This nfs_uuid is already in use */
+ spin_unlock(&nfs_uuids_lock);
+ spin_unlock(&nfs_uuid->lock);
return false;
}
- uuid_gen(&nfs_uuid->uuid);
list_add_tail(&nfs_uuid->list, &nfs_uuids);
- spin_unlock(&nfs_uuid_lock);
+ spin_unlock(&nfs_uuids_lock);
+
+ uuid_gen(&nfs_uuid->uuid);
+ spin_unlock(&nfs_uuid->lock);
return true;
}
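Editor's note: a sketch of the documented lock ordering in isolation (generic body, not patch code): the level-1 nfs_uuid->lock is taken before the level-2 nfs_uuids_lock and released after it, exactly as nfs_uuid_begin() does above.

	spin_lock(&nfs_uuid->lock);	/* level 1: per-client state */
	spin_lock(&nfs_uuids_lock);	/* level 2: global registry */
	/* ... add to or remove from nfs_uuid->list here ... */
	spin_unlock(&nfs_uuids_lock);
	spin_unlock(&nfs_uuid->lock);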
@@ -46,12 +75,16 @@ EXPORT_SYMBOL_GPL(nfs_uuid_begin);
void nfs_uuid_end(nfs_uuid_t *nfs_uuid)
{
- if (nfs_uuid->net == NULL) {
- spin_lock(&nfs_uuid_lock);
- if (nfs_uuid->net == NULL)
+ if (!rcu_access_pointer(nfs_uuid->net)) {
+ spin_lock(&nfs_uuid->lock);
+ if (!rcu_access_pointer(nfs_uuid->net)) {
+ /* Not local, remove from nfs_uuids */
+ spin_lock(&nfs_uuids_lock);
list_del_init(&nfs_uuid->list);
- spin_unlock(&nfs_uuid_lock);
- }
+ spin_unlock(&nfs_uuids_lock);
+ }
+ spin_unlock(&nfs_uuid->lock);
+ }
}
EXPORT_SYMBOL_GPL(nfs_uuid_end);
@@ -69,68 +102,142 @@ static nfs_uuid_t * nfs_uuid_lookup_locked(const uuid_t *uuid)
static struct module *nfsd_mod;
void nfs_uuid_is_local(const uuid_t *uuid, struct list_head *list,
- struct net *net, struct auth_domain *dom,
- struct module *mod)
+ spinlock_t *list_lock, struct net *net,
+ struct auth_domain *dom, struct module *mod)
{
nfs_uuid_t *nfs_uuid;
- spin_lock(&nfs_uuid_lock);
+ spin_lock(&nfs_uuids_lock);
nfs_uuid = nfs_uuid_lookup_locked(uuid);
- if (nfs_uuid) {
- kref_get(&dom->ref);
- nfs_uuid->dom = dom;
- /*
- * We don't hold a ref on the net, but instead put
- * ourselves on a list so the net pointer can be
- * invalidated.
- */
- list_move(&nfs_uuid->list, list);
- rcu_assign_pointer(nfs_uuid->net, net);
-
- __module_get(mod);
- nfsd_mod = mod;
+ if (!nfs_uuid) {
+ spin_unlock(&nfs_uuids_lock);
+ return;
}
- spin_unlock(&nfs_uuid_lock);
+
+ /*
+ * We don't hold a ref on the net, but instead put
+ * ourselves on @list (nn->local_clients) so the net
+ * pointer can be invalidated.
+ */
+ spin_lock(list_lock); /* list_lock is nn->local_clients_lock */
+ list_move(&nfs_uuid->list, list);
+ spin_unlock(list_lock);
+
+ spin_unlock(&nfs_uuids_lock);
+ /* Once nfs_uuid is parented to @list, avoid global nfs_uuids_lock */
+ spin_lock(&nfs_uuid->lock);
+
+ __module_get(mod);
+ nfsd_mod = mod;
+
+ nfs_uuid->list_lock = list_lock;
+ kref_get(&dom->ref);
+ nfs_uuid->dom = dom;
+ rcu_assign_pointer(nfs_uuid->net, net);
+ spin_unlock(&nfs_uuid->lock);
}
EXPORT_SYMBOL_GPL(nfs_uuid_is_local);
-static void nfs_uuid_put_locked(nfs_uuid_t *nfs_uuid)
+void nfs_localio_enable_client(struct nfs_client *clp)
+{
+ /* nfs_uuid_is_local() does the actual enablement */
+ trace_nfs_localio_enable_client(clp);
+}
+EXPORT_SYMBOL_GPL(nfs_localio_enable_client);
+
+/*
+ * Clean up the nfs_uuid_t embedded in an nfs_client.
+ * This is the long-form of nfs_uuid_init().
+ */
+static bool nfs_uuid_put(nfs_uuid_t *nfs_uuid)
{
- if (nfs_uuid->net) {
- module_put(nfsd_mod);
- nfs_uuid->net = NULL;
+ LIST_HEAD(local_files);
+ struct nfs_file_localio *nfl, *tmp;
+
+ spin_lock(&nfs_uuid->lock);
+ if (unlikely(!rcu_access_pointer(nfs_uuid->net))) {
+ spin_unlock(&nfs_uuid->lock);
+ return false;
}
+ RCU_INIT_POINTER(nfs_uuid->net, NULL);
+
if (nfs_uuid->dom) {
auth_domain_put(nfs_uuid->dom);
nfs_uuid->dom = NULL;
}
- list_del_init(&nfs_uuid->list);
+
+ list_splice_init(&nfs_uuid->files, &local_files);
+ spin_unlock(&nfs_uuid->lock);
+
+ /* Walk list of files and ensure their last references are dropped */
+ list_for_each_entry_safe(nfl, tmp, &local_files, list) {
+ nfs_close_local_fh(nfl);
+ cond_resched();
+ }
+
+ spin_lock(&nfs_uuid->lock);
+ BUG_ON(!list_empty(&nfs_uuid->files));
+
+ /* Remove client from nn->local_clients */
+ if (nfs_uuid->list_lock) {
+ spin_lock(nfs_uuid->list_lock);
+ BUG_ON(list_empty(&nfs_uuid->list));
+ list_del_init(&nfs_uuid->list);
+ spin_unlock(nfs_uuid->list_lock);
+ nfs_uuid->list_lock = NULL;
+ }
+
+ module_put(nfsd_mod);
+ spin_unlock(&nfs_uuid->lock);
+
+ return true;
}
-void nfs_uuid_invalidate_clients(struct list_head *list)
+void nfs_localio_disable_client(struct nfs_client *clp)
{
+ if (nfs_uuid_put(&clp->cl_uuid))
+ trace_nfs_localio_disable_client(clp);
+}
+EXPORT_SYMBOL_GPL(nfs_localio_disable_client);
+
+void nfs_localio_invalidate_clients(struct list_head *nn_local_clients,
+ spinlock_t *nn_local_clients_lock)
+{
+ LIST_HEAD(local_clients);
nfs_uuid_t *nfs_uuid, *tmp;
+ struct nfs_client *clp;
- spin_lock(&nfs_uuid_lock);
- list_for_each_entry_safe(nfs_uuid, tmp, list, list)
- nfs_uuid_put_locked(nfs_uuid);
- spin_unlock(&nfs_uuid_lock);
+ spin_lock(nn_local_clients_lock);
+ list_splice_init(nn_local_clients, &local_clients);
+ spin_unlock(nn_local_clients_lock);
+ list_for_each_entry_safe(nfs_uuid, tmp, &local_clients, list) {
+ if (WARN_ON(nfs_uuid->list_lock != nn_local_clients_lock))
+ break;
+ clp = container_of(nfs_uuid, struct nfs_client, cl_uuid);
+ nfs_localio_disable_client(clp);
+ }
}
-EXPORT_SYMBOL_GPL(nfs_uuid_invalidate_clients);
+EXPORT_SYMBOL_GPL(nfs_localio_invalidate_clients);
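Editor's note: nfs_localio_invalidate_clients() uses the splice-then-walk teardown idiom: detach the whole list while holding the lock, then walk the private copy unlocked, so per-entry cleanup (which takes nfs_uuid->lock and may resched in nfs_close_local_fh()) never nests inside nn->local_clients_lock. The shape, with generic names (sketch, not patch code):

	struct item *entry, *tmp;	/* illustrative types */
	LIST_HEAD(victims);

	spin_lock(shared_lock);
	list_splice_init(shared_list, &victims);	/* shared list now empty */
	spin_unlock(shared_lock);

	list_for_each_entry_safe(entry, tmp, &victims, list)
		teardown(entry);	/* free to sleep or take other locks */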
-void nfs_uuid_invalidate_one_client(nfs_uuid_t *nfs_uuid)
+static void nfs_uuid_add_file(nfs_uuid_t *nfs_uuid, struct nfs_file_localio *nfl)
{
- if (nfs_uuid->net) {
- spin_lock(&nfs_uuid_lock);
- nfs_uuid_put_locked(nfs_uuid);
- spin_unlock(&nfs_uuid_lock);
+ /* Add nfl to nfs_uuid->files if it isn't already */
+ spin_lock(&nfs_uuid->lock);
+ if (list_empty(&nfl->list)) {
+ rcu_assign_pointer(nfl->nfs_uuid, nfs_uuid);
+ list_add_tail(&nfl->list, &nfs_uuid->files);
}
+ spin_unlock(&nfs_uuid->lock);
}
-EXPORT_SYMBOL_GPL(nfs_uuid_invalidate_one_client);
+/*
+ * Caller is responsible for calling nfsd_net_put and
+ * nfsd_file_put (via nfs_to_nfsd_file_put_local).
+ */
struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid,
struct rpc_clnt *rpc_clnt, const struct cred *cred,
- const struct nfs_fh *nfs_fh, const fmode_t fmode)
+ const struct nfs_fh *nfs_fh, struct nfs_file_localio *nfl,
+ const fmode_t fmode)
{
struct net *net;
struct nfsd_file *localio;
@@ -139,7 +246,7 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid,
* Not running in nfsd context, so must safely get reference on nfsd_serv.
* But the server may already be shutting down, if so disallow new localio.
* uuid->net is NOT a counted reference, but rcu_read_lock() ensures that
- * if uuid->net is not NULL, then calling nfsd_serv_try_get() is safe
+ * if uuid->net is not NULL, then calling nfsd_net_try_get() is safe
* and if it succeeds we will have an implied reference to the net.
*
* Otherwise NFS may not have ref on NFSD and therefore cannot safely
@@ -147,21 +254,62 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid,
*/
rcu_read_lock();
net = rcu_dereference(uuid->net);
- if (!net || !nfs_to->nfsd_serv_try_get(net)) {
+ if (!net || !nfs_to->nfsd_net_try_get(net)) {
rcu_read_unlock();
return ERR_PTR(-ENXIO);
}
rcu_read_unlock();
- /* We have an implied reference to net thanks to nfsd_serv_try_get */
+ /* We have an implied reference to net thanks to nfsd_net_try_get */
localio = nfs_to->nfsd_open_local_fh(net, uuid->dom, rpc_clnt,
cred, nfs_fh, fmode);
if (IS_ERR(localio))
nfs_to_nfsd_net_put(net);
+ else
+ nfs_uuid_add_file(uuid, nfl);
return localio;
}
EXPORT_SYMBOL_GPL(nfs_open_local_fh);
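Editor's note: a hypothetical caller, to make the ownership contract concrete (names other than the exported helpers are illustrative): on success the caller holds both the implied net reference and the nfsd_file, and drops both through nfs_to_nfsd_file_put_local().

	localio = nfs_open_local_fh(uuid, clnt, cred, fh, nfl, FMODE_READ);
	if (!IS_ERR(localio)) {
		struct file *filp = nfs_to->nfsd_file_file(localio);
		/* ... issue I/O directly against filp ... */
		nfs_to_nfsd_file_put_local(localio);	/* puts file, then net */
	}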
+void nfs_close_local_fh(struct nfs_file_localio *nfl)
+{
+ struct nfsd_file *ro_nf = NULL;
+ struct nfsd_file *rw_nf = NULL;
+ nfs_uuid_t *nfs_uuid;
+
+ rcu_read_lock();
+ nfs_uuid = rcu_dereference(nfl->nfs_uuid);
+ if (!nfs_uuid) {
+ /* regular (non-LOCALIO) NFS will hammer this */
+ rcu_read_unlock();
+ return;
+ }
+
+ ro_nf = rcu_access_pointer(nfl->ro_file);
+ rw_nf = rcu_access_pointer(nfl->rw_file);
+ if (ro_nf || rw_nf) {
+ spin_lock(&nfs_uuid->lock);
+ if (ro_nf)
+ ro_nf = rcu_dereference_protected(xchg(&nfl->ro_file, NULL), 1);
+ if (rw_nf)
+ rw_nf = rcu_dereference_protected(xchg(&nfl->rw_file, NULL), 1);
+
+ /* Remove nfl from nfs_uuid->files list */
+ RCU_INIT_POINTER(nfl->nfs_uuid, NULL);
+ list_del_init(&nfl->list);
+ spin_unlock(&nfs_uuid->lock);
+ rcu_read_unlock();
+
+ if (ro_nf)
+ nfs_to_nfsd_file_put_local(ro_nf);
+ if (rw_nf)
+ nfs_to_nfsd_file_put_local(rw_nf);
+ return;
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(nfs_close_local_fh);
+
/*
* The NFS LOCALIO code needs to call into NFSD using various symbols,
* but cannot be statically linked, because that will make the NFS
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index dc5c9d8e8202..0e552d873eaa 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -39,6 +39,7 @@
#include <linux/fsnotify.h>
#include <linux/seq_file.h>
#include <linux/rhashtable.h>
+#include <linux/nfslocalio.h>
#include "vfs.h"
#include "nfsd.h"
@@ -391,7 +392,7 @@ nfsd_file_put(struct nfsd_file *nf)
}
/**
- * nfsd_file_put_local - put nfsd_file reference and arm nfsd_serv_put in caller
+ * nfsd_file_put_local - put nfsd_file reference and arm nfsd_net_put in caller
* @nf: nfsd_file of which to put the reference
*
* First save the associated net to return to caller, then put
@@ -833,6 +834,14 @@ __nfsd_file_cache_purge(struct net *net)
struct nfsd_file *nf;
LIST_HEAD(dispose);
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+ if (net) {
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ nfs_localio_invalidate_clients(&nn->local_clients,
+ &nn->local_clients_lock);
+ }
+#endif
+
rhltable_walk_enter(&nfsd_file_rhltable, &iter);
do {
rhashtable_walk_start(&iter);
@@ -1222,10 +1231,9 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
* a file. The security implications of this should be carefully
* considered before use.
*
- * The nfsd_file object returned by this API is reference-counted
- * and garbage-collected. The object is retained for a few
- * seconds after the final nfsd_file_put() in case the caller
- * wants to re-use it.
+ * The nfsd_file object returned by this API is reference-counted
+ * but not garbage-collected. The object is unhashed after the
+ * final nfsd_file_put().
*
* Return values:
* %nfs_ok - @pnf points to an nfsd_file with its reference
@@ -1247,7 +1255,7 @@ nfsd_file_acquire_local(struct net *net, struct svc_cred *cred,
__be32 beres;
beres = nfsd_file_do_acquire(NULL, net, cred, client,
- fhp, may_flags, NULL, pnf, true);
+ fhp, may_flags, NULL, pnf, false);
put_cred(revert_creds(save_cred));
return beres;
}
diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
index f441cb9f74d5..238647fa379e 100644
--- a/fs/nfsd/localio.c
+++ b/fs/nfsd/localio.c
@@ -25,10 +25,12 @@
#include "cache.h"
static const struct nfsd_localio_operations nfsd_localio_ops = {
- .nfsd_serv_try_get = nfsd_serv_try_get,
- .nfsd_serv_put = nfsd_serv_put,
+ .nfsd_net_try_get = nfsd_net_try_get,
+ .nfsd_net_put = nfsd_net_put,
.nfsd_open_local_fh = nfsd_open_local_fh,
.nfsd_file_put_local = nfsd_file_put_local,
+ .nfsd_file_get = nfsd_file_get,
+ .nfsd_file_put = nfsd_file_put,
.nfsd_file_file = nfsd_file_file,
};
@@ -52,7 +54,7 @@ void nfsd_localio_ops_init(void)
* avoid all the NFS overhead with reads, writes and commits.
*
* On successful return, returned nfsd_file will have its nf_net member
- * set. Caller (NFS client) is responsible for calling nfsd_serv_put and
+ * set. Caller (NFS client) is responsible for calling nfsd_net_put and
* nfsd_file_put (via nfs_to_nfsd_file_put_local).
*/
struct nfsd_file *
@@ -114,6 +116,7 @@ static __be32 localio_proc_uuid_is_local(struct svc_rqst *rqstp)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
nfs_uuid_is_local(&argp->uuid, &nn->local_clients,
+ &nn->local_clients_lock,
net, rqstp->rq_client, THIS_MODULE);
return rpc_success;
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 4a07b8d0837b..3e2d0fde80a7 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -134,9 +134,10 @@ struct nfsd_net {
struct svc_info nfsd_info;
#define nfsd_serv nfsd_info.serv
- struct percpu_ref nfsd_serv_ref;
- struct completion nfsd_serv_confirm_done;
- struct completion nfsd_serv_free_done;
+
+ struct percpu_ref nfsd_net_ref;
+ struct completion nfsd_net_confirm_done;
+ struct completion nfsd_net_free_done;
/*
* clientid and stateid data for construction of net unique COPY
@@ -213,6 +214,7 @@ struct nfsd_net {
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
/* Local clients to be invalidated when net is shut down */
+ spinlock_t local_clients_lock;
struct list_head local_clients;
#endif
};
@@ -223,8 +225,8 @@ struct nfsd_net {
extern bool nfsd_support_version(int vers);
extern unsigned int nfsd_net_id;
-bool nfsd_serv_try_get(struct net *net);
-void nfsd_serv_put(struct net *net);
+bool nfsd_net_try_get(struct net *net);
+void nfsd_net_put(struct net *net);
void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn);
void nfsd_reset_write_verifier(struct nfsd_net *nn);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 95ea4393305b..ce2a71e4904c 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -2217,6 +2217,7 @@ static __net_init int nfsd_net_init(struct net *net)
seqlock_init(&nn->writeverf_lock);
nfsd_proc_stat_init(net);
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+ spin_lock_init(&nn->local_clients_lock);
INIT_LIST_HEAD(&nn->local_clients);
#endif
return 0;
@@ -2234,14 +2235,15 @@ out_export_error:
* nfsd_net_pre_exit - Disconnect localio clients from net namespace
* @net: a network namespace that is about to be destroyed
*
- * This invalidated ->net pointers held by localio clients
+ * This invalidates ->net pointers held by localio clients
* while they can still safely access nn->counter.
*/
static __net_exit void nfsd_net_pre_exit(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- nfs_uuid_invalidate_clients(&nn->local_clients);
+ nfs_localio_invalidate_clients(&nn->local_clients,
+ &nn->local_clients_lock);
}
#endif
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 3f5104ed70bf..9b3d6cff0e1e 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -204,32 +204,32 @@ int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change
return 0;
}
-bool nfsd_serv_try_get(struct net *net) __must_hold(rcu)
+bool nfsd_net_try_get(struct net *net) __must_hold(rcu)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- return (nn && percpu_ref_tryget_live(&nn->nfsd_serv_ref));
+ return (nn && percpu_ref_tryget_live(&nn->nfsd_net_ref));
}
-void nfsd_serv_put(struct net *net) __must_hold(rcu)
+void nfsd_net_put(struct net *net) __must_hold(rcu)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- percpu_ref_put(&nn->nfsd_serv_ref);
+ percpu_ref_put(&nn->nfsd_net_ref);
}
-static void nfsd_serv_done(struct percpu_ref *ref)
+static void nfsd_net_done(struct percpu_ref *ref)
{
- struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_serv_ref);
+ struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_net_ref);
- complete(&nn->nfsd_serv_confirm_done);
+ complete(&nn->nfsd_net_confirm_done);
}
-static void nfsd_serv_free(struct percpu_ref *ref)
+static void nfsd_net_free(struct percpu_ref *ref)
{
- struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_serv_ref);
+ struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_net_ref);
- complete(&nn->nfsd_serv_free_done);
+ complete(&nn->nfsd_net_free_done);
}
/*
@@ -426,6 +426,10 @@ static void nfsd_shutdown_net(struct net *net)
if (!nn->nfsd_net_up)
return;
+
+ percpu_ref_kill_and_confirm(&nn->nfsd_net_ref, nfsd_net_done);
+ wait_for_completion(&nn->nfsd_net_confirm_done);
+
nfsd_export_flush(net);
nfs4_state_shutdown_net(net);
nfsd_reply_cache_shutdown(nn);
@@ -434,7 +438,10 @@ static void nfsd_shutdown_net(struct net *net)
lockd_down(net);
nn->lockd_up = false;
}
- percpu_ref_exit(&nn->nfsd_serv_ref);
+
+ wait_for_completion(&nn->nfsd_net_free_done);
+ percpu_ref_exit(&nn->nfsd_net_ref);
+
nn->nfsd_net_up = false;
nfsd_shutdown_generic();
}
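Editor's note: the nfsd_net_ref changes above follow the standard two-stage percpu_ref shutdown. In outline (generic sketch, not patch code; confirm_done/free_done stand in for the nfsd_net_*_done completions):

	percpu_ref_init(&ref, free_cb, 0, GFP_KERNEL);	/* at startup */

	/* readers: */
	if (percpu_ref_tryget_live(&ref)) {
		/* ... safe to use the protected object ... */
		percpu_ref_put(&ref);
	}

	/* shutdown: */
	percpu_ref_kill_and_confirm(&ref, confirm_cb);	/* refuse new tryget_live() */
	wait_for_completion(&confirm_done);	/* ref switched to atomic mode */
	wait_for_completion(&free_done);	/* last reference dropped */
	percpu_ref_exit(&ref);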
@@ -516,11 +523,6 @@ void nfsd_destroy_serv(struct net *net)
lockdep_assert_held(&nfsd_mutex);
- percpu_ref_kill_and_confirm(&nn->nfsd_serv_ref, nfsd_serv_done);
- wait_for_completion(&nn->nfsd_serv_confirm_done);
- wait_for_completion(&nn->nfsd_serv_free_done);
- /* percpu_ref_exit is called in nfsd_shutdown_net */
-
spin_lock(&nfsd_notifier_lock);
nn->nfsd_serv = NULL;
spin_unlock(&nfsd_notifier_lock);
@@ -621,12 +623,12 @@ int nfsd_create_serv(struct net *net)
if (nn->nfsd_serv)
return 0;
- error = percpu_ref_init(&nn->nfsd_serv_ref, nfsd_serv_free,
+ error = percpu_ref_init(&nn->nfsd_net_ref, nfsd_net_free,
0, GFP_KERNEL);
if (error)
return error;
- init_completion(&nn->nfsd_serv_free_done);
- init_completion(&nn->nfsd_serv_confirm_done);
+ init_completion(&nn->nfsd_net_free_done);
+ init_completion(&nn->nfsd_net_confirm_done);
if (nfsd_max_blksize == 0)
nfsd_max_blksize = nfsd_get_default_max_blksize();
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 1b508f543384..9729f071c5aa 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -206,8 +206,8 @@ static void orangefs_kernel_debug_init(void)
pr_info("%s: overflow 1!\n", __func__);
}
- debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, 0444, debug_dir, k_buffer,
- &kernel_debug_fops);
+ debugfs_create_file_aux_num(ORANGEFS_KMOD_DEBUG_FILE, 0444, debug_dir, k_buffer,
+ 0, &kernel_debug_fops);
}
@@ -306,11 +306,10 @@ static void orangefs_client_debug_init(void)
pr_info("%s: overflow! 2\n", __func__);
}
- client_debug_dentry = debugfs_create_file(ORANGEFS_CLIENT_DEBUG_FILE,
- 0444,
- debug_dir,
- c_buffer,
- &kernel_debug_fops);
+ client_debug_dentry = debugfs_create_file_aux_num(
+ ORANGEFS_CLIENT_DEBUG_FILE,
+ 0444, debug_dir, c_buffer, 1,
+ &kernel_debug_fops);
}
/* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/
@@ -418,8 +417,7 @@ static ssize_t orangefs_debug_write(struct file *file,
* A service operation is required to set a new client-side
* debug mask.
*/
- if (!strcmp(file->f_path.dentry->d_name.name,
- ORANGEFS_KMOD_DEBUG_FILE)) {
+ if (!debugfs_get_aux_num(file)) { // kernel-debug
debug_string_to_mask(buf, &orangefs_gossip_debug_mask, 0);
debug_mask_to_string(&orangefs_gossip_debug_mask, 0);
debug_string = kernel_debug_string;
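Editor's note: the aux-number pattern used above lets several debugfs files share one file_operations and branch on a per-file tag instead of comparing dentry names. A sketch with illustrative names (only the debugfs_* helpers are the ones introduced by this series; handle_* are hypothetical):

	debugfs_create_file_aux_num("kernel-debug", 0444, dir, buf, 0, &shared_fops);
	debugfs_create_file_aux_num("client-debug", 0444, dir, buf, 1, &shared_fops);

	static ssize_t shared_write(struct file *file, const char __user *ubuf,
				    size_t count, loff_t *ppos)
	{
		if (!debugfs_get_aux_num(file))
			return handle_kernel_debug(ubuf, count);	/* aux == 0 */
		return handle_client_debug(ubuf, count);		/* aux == 1 */
	}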
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 785408861c01..6931308876c4 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -817,7 +817,7 @@ EXPORT_SYMBOL_GPL(sysfs_emit_at);
* Returns number of bytes written to @buf.
*/
ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *attr, char *buf,
+ const struct bin_attribute *attr, char *buf,
loff_t off, size_t count)
{
memcpy(buf, attr->private + off, count);