summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c5
-rw-r--r--fs/9p/vfs_inode.c65
-rw-r--r--fs/9p/vfs_inode_dotl.c9
-rw-r--r--fs/autofs/dev-ioctl.c4
-rw-r--r--fs/btrfs/backref.c2
-rw-r--r--fs/btrfs/extent-io-tree.h2
-rw-r--r--fs/btrfs/extent-tree.c14
-rw-r--r--fs/btrfs/free-space-cache.c4
-rw-r--r--fs/btrfs/inode.c18
-rw-r--r--fs/btrfs/super.c30
-rw-r--r--fs/btrfs/sysfs.c8
-rw-r--r--fs/btrfs/tree-log.c8
-rw-r--r--fs/btrfs/volumes.c4
-rw-r--r--fs/ceph/Kconfig2
-rw-r--r--fs/ceph/addr.c23
-rw-r--r--fs/ceph/caps.c12
-rw-r--r--fs/ceph/debugfs.c16
-rw-r--r--fs/ceph/dir.c4
-rw-r--r--fs/ceph/file.c5
-rw-r--r--fs/ceph/mds_client.c184
-rw-r--r--fs/ceph/mds_client.h7
-rw-r--r--fs/ceph/mdsmap.c10
-rw-r--r--fs/ceph/metric.c149
-rw-r--r--fs/ceph/metric.h91
-rw-r--r--fs/ceph/super.c64
-rw-r--r--fs/ceph/super.h6
-rw-r--r--fs/ceph/xattr.c12
-rw-r--r--fs/cifs/connect.c1
-rw-r--r--fs/cifs/smb2inode.c1
-rw-r--r--fs/cifs/smb2pdu.c4
-rw-r--r--fs/coredump.c17
-rw-r--r--fs/exec.c32
-rw-r--r--fs/exfat/balloc.c4
-rw-r--r--fs/exfat/dir.c32
-rw-r--r--fs/exfat/exfat_fs.h14
-rw-r--r--fs/exfat/exfat_raw.h5
-rw-r--r--fs/exfat/fatent.c58
-rw-r--r--fs/exfat/file.c9
-rw-r--r--fs/exfat/inode.c13
-rw-r--r--fs/exfat/misc.c22
-rw-r--r--fs/exfat/namei.c32
-rw-r--r--fs/exfat/super.c48
-rw-r--r--fs/fat/Kconfig2
-rw-r--r--fs/fat/fatent.c3
-rw-r--r--fs/fat/file.c4
-rw-r--r--fs/hugetlbfs/inode.c6
-rw-r--r--fs/io_uring.c539
-rw-r--r--fs/minix/inode.c42
-rw-r--r--fs/minix/itree_common.c8
-rw-r--r--fs/minix/itree_v1.c12
-rw-r--r--fs/minix/itree_v2.c13
-rw-r--r--fs/minix/minix.h1
-rw-r--r--fs/namei.c12
-rw-r--r--fs/nfs/Makefile2
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c2
-rw-r--r--fs/nfs/client.c22
-rw-r--r--fs/nfs/dir.c24
-rw-r--r--fs/nfs/direct.c2
-rw-r--r--fs/nfs/file.c17
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c64
-rw-r--r--fs/nfs/fs_context.c2
-rw-r--r--fs/nfs/inode.c20
-rw-r--r--fs/nfs/nfs42.h24
-rw-r--r--fs/nfs/nfs42proc.c258
-rw-r--r--fs/nfs/nfs42xattr.c1056
-rw-r--r--fs/nfs/nfs42xdr.c438
-rw-r--r--fs/nfs/nfs4_fs.h35
-rw-r--r--fs/nfs/nfs4client.c33
-rw-r--r--fs/nfs/nfs4file.c5
-rw-r--r--fs/nfs/nfs4proc.c239
-rw-r--r--fs/nfs/nfs4super.c10
-rw-r--r--fs/nfs/nfs4trace.h46
-rw-r--r--fs/nfs/nfs4xdr.c39
-rw-r--r--fs/nfs/nfstrace.h3
-rw-r--r--fs/nfs/pnfs.c52
-rw-r--r--fs/nfs/pnfs.h2
-rw-r--r--fs/nilfs2/alloc.c38
-rw-r--r--fs/nilfs2/btree.c42
-rw-r--r--fs/nilfs2/cpfile.c10
-rw-r--r--fs/nilfs2/dat.c14
-rw-r--r--fs/nilfs2/direct.c14
-rw-r--r--fs/nilfs2/gcinode.c2
-rw-r--r--fs/nilfs2/ifile.c4
-rw-r--r--fs/nilfs2/inode.c32
-rw-r--r--fs/nilfs2/ioctl.c37
-rw-r--r--fs/nilfs2/mdt.c2
-rw-r--r--fs/nilfs2/namei.c6
-rw-r--r--fs/nilfs2/nilfs.h18
-rw-r--r--fs/nilfs2/page.c11
-rw-r--r--fs/nilfs2/recovery.c32
-rw-r--r--fs/nilfs2/segbuf.c2
-rw-r--r--fs/nilfs2/segment.c38
-rw-r--r--fs/nilfs2/sufile.c29
-rw-r--r--fs/nilfs2/super.c73
-rw-r--r--fs/nilfs2/sysfs.c29
-rw-r--r--fs/nilfs2/the_nilfs.c85
-rw-r--r--fs/open.c6
-rw-r--r--fs/proc/base.c11
-rw-r--r--fs/proc/task_mmu.c4
-rw-r--r--fs/signalfd.c10
-rw-r--r--fs/ufs/super.c2
-rw-r--r--fs/xfs/libxfs/xfs_sb.c2
-rw-r--r--fs/xfs/xfs_attr_list.c2
-rw-r--r--fs/xfs/xfs_buf_item.c2
-rw-r--r--fs/xfs/xfs_buf_item_recover.c2
-rw-r--r--fs/xfs/xfs_dquot.c2
-rw-r--r--fs/xfs/xfs_export.c2
-rw-r--r--fs/xfs/xfs_inode.c4
-rw-r--r--fs/xfs/xfs_inode_item.c4
-rw-r--r--fs/xfs/xfs_iomap.c2
-rw-r--r--fs/xfs/xfs_log_cil.c2
-rw-r--r--fs/xfs/xfs_log_recover.c2
-rw-r--r--fs/xfs/xfs_refcount_item.c2
-rw-r--r--fs/xfs/xfs_reflink.c2
-rw-r--r--fs/xfs/xfs_sysfs.h6
-rw-r--r--fs/xfs/xfs_trans_ail.c4
116 files changed, 3760 insertions, 918 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 15a99f9c7253..39def020a074 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -500,10 +500,9 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
}
#ifdef CONFIG_9P_FSCACHE
- if (v9ses->fscache) {
+ if (v9ses->fscache)
v9fs_cache_session_put_cookie(v9ses);
- kfree(v9ses->cachetag);
- }
+ kfree(v9ses->cachetag);
#endif
kfree(v9ses->uname);
kfree(v9ses->aname);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index c9255d399917..ae0c38ad1fcb 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -223,8 +223,7 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
struct inode *v9fs_alloc_inode(struct super_block *sb)
{
struct v9fs_inode *v9inode;
- v9inode = (struct v9fs_inode *)kmem_cache_alloc(v9fs_inode_cache,
- GFP_KERNEL);
+ v9inode = kmem_cache_alloc(v9fs_inode_cache, GFP_KERNEL);
if (!v9inode)
return NULL;
#ifdef CONFIG_9P_FSCACHE
@@ -368,59 +367,6 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
return inode;
}
-/*
-static struct v9fs_fid*
-v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry)
-{
- int err;
- int nfid;
- struct v9fs_fid *ret;
- struct v9fs_fcall *fcall;
-
- nfid = v9fs_get_idpool(&v9ses->fidpool);
- if (nfid < 0) {
- eprintk(KERN_WARNING, "no free fids available\n");
- return ERR_PTR(-ENOSPC);
- }
-
- err = v9fs_t_walk(v9ses, fid, nfid, (char *) dentry->d_name.name,
- &fcall);
-
- if (err < 0) {
- if (fcall && fcall->id == RWALK)
- goto clunk_fid;
-
- PRINT_FCALL_ERROR("walk error", fcall);
- v9fs_put_idpool(nfid, &v9ses->fidpool);
- goto error;
- }
-
- kfree(fcall);
- fcall = NULL;
- ret = v9fs_fid_create(v9ses, nfid);
- if (!ret) {
- err = -ENOMEM;
- goto clunk_fid;
- }
-
- err = v9fs_fid_insert(ret, dentry);
- if (err < 0) {
- v9fs_fid_destroy(ret);
- goto clunk_fid;
- }
-
- return ret;
-
-clunk_fid:
- v9fs_t_clunk(v9ses, nfid);
-
-error:
- kfree(fcall);
- return ERR_PTR(err);
-}
-*/
-
-
/**
* v9fs_clear_inode - release an inode
* @inode: inode to release
@@ -1090,7 +1036,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
{
int retval;
struct v9fs_session_info *v9ses;
- struct p9_fid *fid;
+ struct p9_fid *fid = NULL;
struct p9_wstat wstat;
p9_debug(P9_DEBUG_VFS, "\n");
@@ -1100,7 +1046,12 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
retval = -EPERM;
v9ses = v9fs_dentry2v9ses(dentry);
- fid = v9fs_fid_lookup(dentry);
+ if (iattr->ia_valid & ATTR_FILE) {
+ fid = iattr->ia_file->private_data;
+ WARN_ON(!fid);
+ }
+ if (!fid)
+ fid = v9fs_fid_lookup(dentry);
if(IS_ERR(fid))
return PTR_ERR(fid);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 60328b21c5fb..0028eccb665a 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -540,7 +540,7 @@ static int v9fs_mapped_iattr_valid(int iattr_valid)
int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
{
int retval;
- struct p9_fid *fid;
+ struct p9_fid *fid = NULL;
struct p9_iattr_dotl p9attr;
struct inode *inode = d_inode(dentry);
@@ -560,7 +560,12 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
- fid = v9fs_fid_lookup(dentry);
+ if (iattr->ia_valid & ATTR_FILE) {
+ fid = iattr->ia_file->private_data;
+ WARN_ON(!fid);
+ }
+ if (!fid)
+ fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return PTR_ERR(fid);
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index f3a0f412b43b..75105f45c51a 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -20,7 +20,7 @@
* another mount. This situation arises when starting automount(8)
* or other user space daemon which uses direct mounts or offset
* mounts (used for autofs lazy mount/umount of nested mount trees),
- * which have been left busy at at service shutdown.
+ * which have been left busy at service shutdown.
*/
typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *,
@@ -496,7 +496,7 @@ static int autofs_dev_ioctl_askumount(struct file *fp,
* located path is the root of a mount we return 1 along with
* the super magic of the mount or 0 otherwise.
*
- * In both cases the the device number (as returned by
+ * In both cases the device number (as returned by
* new_encode_dev()) is also returned.
*/
static int autofs_dev_ioctl_ismountpoint(struct file *fp,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ea10f7bc99ab..ea1c28ccb44f 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -2303,7 +2303,7 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(
return NULL;
ret->path = btrfs_alloc_path();
- if (!ret) {
+ if (!ret->path) {
kfree(ret);
return NULL;
}
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index f39d47a2d01a..219a09a2b734 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -34,6 +34,8 @@ struct io_failure_record;
*/
#define CHUNK_ALLOCATED EXTENT_DIRTY
#define CHUNK_TRIMMED EXTENT_DEFRAG
+#define CHUNK_STATE_MASK (CHUNK_ALLOCATED | \
+ CHUNK_TRIMMED)
enum {
IO_TREE_FS_PINNED_EXTENTS,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 61ede335f6c3..de6fe176fdfb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
+#include "rcu-string.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -5668,6 +5669,19 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
&start, &end,
CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ /* Check if there are any CHUNK_* bits left */
+ if (start > device->total_bytes) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_warn_in_rcu(fs_info,
+"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
+ start, end - start + 1,
+ rcu_str_deref(device->name),
+ device->total_bytes);
+ mutex_unlock(&fs_info->chunk_mutex);
+ ret = 0;
+ break;
+ }
+
/* Ensure we skip the reserved area in the first 1M */
start = max_t(u64, start, SZ_1M);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6d961e11639e..ef0fd7afb0b1 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2282,7 +2282,7 @@ out:
static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, bool update_stat)
{
- struct btrfs_free_space *left_info;
+ struct btrfs_free_space *left_info = NULL;
struct btrfs_free_space *right_info;
bool merged = false;
u64 offset = info->offset;
@@ -2298,7 +2298,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
if (right_info && rb_prev(&right_info->offset_index))
left_info = rb_entry(rb_prev(&right_info->offset_index),
struct btrfs_free_space, offset_index);
- else
+ else if (!right_info)
left_info = tree_search_offset(ctl, offset - 1, 0, 0);
/* See try_merge_free_space() comment. */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6dc03bab0c9d..51fcd82d41c0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -654,12 +654,18 @@ cont:
page_error_op |
PAGE_END_WRITEBACK);
- for (i = 0; i < nr_pages; i++) {
- WARN_ON(pages[i]->mapping);
- put_page(pages[i]);
+ /*
+ * Ensure we only free the compressed pages if we have
+ * them allocated, as we can still reach here with
+ * inode_need_compress() == false.
+ */
+ if (pages) {
+ for (i = 0; i < nr_pages; i++) {
+ WARN_ON(pages[i]->mapping);
+ put_page(pages[i]);
+ }
+ kfree(pages);
}
- kfree(pages);
-
return 0;
}
}
@@ -6622,7 +6628,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
/* Only regular file could have regular/prealloc extent */
if (!S_ISREG(inode->vfs_inode.i_mode)) {
- ret = -EUCLEAN;
+ err = -EUCLEAN;
btrfs_crit(fs_info,
"regular/prealloc extent found for non-regular inode %llu",
btrfs_ino(inode));
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5a9dc31d95c9..e529ddb35b87 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -517,6 +517,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
char *compress_type;
bool compress_force = false;
enum btrfs_compression_type saved_compress_type;
+ int saved_compress_level;
bool saved_compress_force;
int no_compress = 0;
@@ -598,6 +599,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
info->compress_type : BTRFS_COMPRESS_NONE;
saved_compress_force =
btrfs_test_opt(info, FORCE_COMPRESS);
+ saved_compress_level = info->compress_level;
if (token == Opt_compress ||
token == Opt_compress_force ||
strncmp(args[0].from, "zlib", 4) == 0) {
@@ -642,6 +644,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
no_compress = 0;
} else if (strncmp(args[0].from, "no", 2) == 0) {
compress_type = "no";
+ info->compress_level = 0;
+ info->compress_type = 0;
btrfs_clear_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
compress_force = false;
@@ -662,11 +666,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
*/
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
}
- if ((btrfs_test_opt(info, COMPRESS) &&
- (info->compress_type != saved_compress_type ||
- compress_force != saved_compress_force)) ||
- (!btrfs_test_opt(info, COMPRESS) &&
- no_compress == 1)) {
+ if (no_compress == 1) {
+ btrfs_info(info, "use no compression");
+ } else if ((info->compress_type != saved_compress_type) ||
+ (compress_force != saved_compress_force) ||
+ (info->compress_level != saved_compress_level)) {
btrfs_info(info, "%s %s compression, level %d",
(compress_force) ? "force" : "use",
compress_type, info->compress_level);
@@ -1382,6 +1386,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
const char *compress_type;
+ const char *subvol_name;
if (btrfs_test_opt(info, DEGRADED))
seq_puts(seq, ",degraded");
@@ -1468,8 +1473,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",ref_verify");
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
- seq_puts(seq, ",subvol=");
- seq_dentry(seq, dentry, " \t\n\\");
+ subvol_name = btrfs_get_subvol_name_from_objectid(info,
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ if (!IS_ERR(subvol_name)) {
+ seq_puts(seq, ",subvol=");
+ seq_escape(seq, subvol_name, " \t\n\\");
+ kfree(subvol_name);
+ }
return 0;
}
@@ -1950,6 +1960,12 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
}
out:
+ /*
+ * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS,
+ * since the absence of the flag means it can be toggled off by remount.
+ */
+ *flags |= SB_I_VERSION;
+
wake_up_process(fs_info->transaction_kthread);
btrfs_remount_cleanup(fs_info, old_opts);
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 104c80caaa74..c8df2edafd85 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1565,9 +1565,11 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info)
rbtree_postorder_for_each_entry_safe(qgroup, next,
&fs_info->qgroup_tree, node)
btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
- kobject_del(fs_info->qgroups_kobj);
- kobject_put(fs_info->qgroups_kobj);
- fs_info->qgroups_kobj = NULL;
+ if (fs_info->qgroups_kobj) {
+ kobject_del(fs_info->qgroups_kobj);
+ kobject_put(fs_info->qgroups_kobj);
+ fs_info->qgroups_kobj = NULL;
+ }
}
/* Called when qgroups get initialized, thus there is no need for locking */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ea8136dcf71f..696dd861cc3c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4036,11 +4036,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
fs_info->csum_root,
ds + cs, ds + cs + cl - 1,
&ordered_sums, 0);
- if (ret) {
- btrfs_release_path(dst_path);
- kfree(ins_data);
- return ret;
- }
+ if (ret)
+ break;
}
}
}
@@ -4053,7 +4050,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
* we have to do this after the loop above to avoid changing the
* log tree while trying to change the log tree.
*/
- ret = 0;
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d7670e2a9f39..ee96c5869f57 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4720,6 +4720,10 @@ again:
}
mutex_lock(&fs_info->chunk_mutex);
+ /* Clear all state bits beyond the shrunk device size */
+ clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
+ CHUNK_STATE_MASK);
+
btrfs_device_set_disk_total_bytes(device, new_size);
if (list_empty(&device->post_commit_list))
list_add_tail(&device->post_commit_list,
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index cf235f6eacf9..471e40156065 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -13,7 +13,7 @@ config CEPH_FS
scalable file system designed to provide high performance,
reliable access to petabytes of storage.
- More information at http://ceph.newdream.net/.
+ More information at https://ceph.io/.
If unsure, say N.
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 01ad09733ac7..6ea761c84494 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -862,8 +862,7 @@ static void writepages_finish(struct ceph_osd_request *req)
osd_data = osd_req_op_extent_osd_data(req, 0);
if (osd_data->pages_from_pool)
- mempool_free(osd_data->pages,
- ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
+ mempool_free(osd_data->pages, ceph_wb_pagevec_pool);
else
kfree(osd_data->pages);
ceph_osdc_put_request(req);
@@ -955,10 +954,10 @@ retry:
int num_ops = 0, op_idx;
unsigned i, pvec_pages, max_pages, locked_pages = 0;
struct page **pages = NULL, **data_pages;
- mempool_t *pool = NULL; /* Becomes non-null if mempool used */
struct page *page;
pgoff_t strip_unit_end = 0;
u64 offset = 0, len = 0;
+ bool from_pool = false;
max_pages = wsize >> PAGE_SHIFT;
@@ -1057,16 +1056,16 @@ get_more_pages:
sizeof(*pages),
GFP_NOFS);
if (!pages) {
- pool = fsc->wb_pagevec_pool;
- pages = mempool_alloc(pool, GFP_NOFS);
+ from_pool = true;
+ pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
BUG_ON(!pages);
}
len = 0;
} else if (page->index !=
(offset + len) >> PAGE_SHIFT) {
- if (num_ops >= (pool ? CEPH_OSD_SLAB_OPS :
- CEPH_OSD_MAX_OPS)) {
+ if (num_ops >= (from_pool ? CEPH_OSD_SLAB_OPS :
+ CEPH_OSD_MAX_OPS)) {
redirty_page_for_writepage(wbc, page);
unlock_page(page);
break;
@@ -1161,7 +1160,7 @@ new_request:
offset, len);
osd_req_op_extent_osd_data_pages(req, op_idx,
data_pages, len, 0,
- !!pool, false);
+ from_pool, false);
osd_req_op_extent_update(req, op_idx, len);
len = 0;
@@ -1188,12 +1187,12 @@ new_request:
dout("writepages got pages at %llu~%llu\n", offset, len);
osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
- 0, !!pool, false);
+ 0, from_pool, false);
osd_req_op_extent_update(req, op_idx, len);
BUG_ON(op_idx + 1 != req->r_num_ops);
- pool = NULL;
+ from_pool = false;
if (i < locked_pages) {
BUG_ON(num_ops <= req->r_num_ops);
num_ops -= req->r_num_ops;
@@ -1204,8 +1203,8 @@ new_request:
pages = kmalloc_array(locked_pages, sizeof(*pages),
GFP_NOFS);
if (!pages) {
- pool = fsc->wb_pagevec_pool;
- pages = mempool_alloc(pool, GFP_NOFS);
+ from_pool = true;
+ pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
BUG_ON(!pages);
}
memcpy(pages, data_pages + i,
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 972c13aa4225..55ccccf77cea 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -668,6 +668,7 @@ void ceph_add_cap(struct inode *inode,
spin_lock(&session->s_cap_lock);
list_add_tail(&cap->session_caps, &session->s_caps);
session->s_nr_caps++;
+ atomic64_inc(&mdsc->metric.total_caps);
spin_unlock(&session->s_cap_lock);
} else {
spin_lock(&session->s_cap_lock);
@@ -1161,6 +1162,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
} else {
list_del_init(&cap->session_caps);
session->s_nr_caps--;
+ atomic64_dec(&mdsc->metric.total_caps);
cap->session = NULL;
removed = 1;
}
@@ -4187,10 +4189,8 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
struct ceph_inode_info *ci;
dout("check_delayed_caps\n");
- while (1) {
- spin_lock(&mdsc->cap_delay_lock);
- if (list_empty(&mdsc->cap_delay_list))
- break;
+ spin_lock(&mdsc->cap_delay_lock);
+ while (!list_empty(&mdsc->cap_delay_list)) {
ci = list_first_entry(&mdsc->cap_delay_list,
struct ceph_inode_info,
i_cap_delay_list);
@@ -4200,13 +4200,13 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
list_del_init(&ci->i_cap_delay_list);
inode = igrab(&ci->vfs_inode);
- spin_unlock(&mdsc->cap_delay_lock);
-
if (inode) {
+ spin_unlock(&mdsc->cap_delay_lock);
dout("check_delayed_caps on %p\n", inode);
ceph_check_caps(ci, 0, NULL);
/* avoid calling iput_final() in tick thread */
ceph_async_iput(inode);
+ spin_lock(&mdsc->cap_delay_lock);
}
}
spin_unlock(&mdsc->cap_delay_lock);
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 070ed8481340..97539b497e4c 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -145,7 +145,7 @@ static int metric_show(struct seq_file *s, void *p)
struct ceph_fs_client *fsc = s->private;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_client_metric *m = &mdsc->metric;
- int i, nr_caps = 0;
+ int nr_caps = 0;
s64 total, sum, avg, min, max, sq;
seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n");
@@ -190,17 +190,7 @@ static int metric_show(struct seq_file *s, void *p)
percpu_counter_sum(&m->d_lease_mis),
percpu_counter_sum(&m->d_lease_hit));
- mutex_lock(&mdsc->mutex);
- for (i = 0; i < mdsc->max_sessions; i++) {
- struct ceph_mds_session *s;
-
- s = __ceph_lookup_mds_session(mdsc, i);
- if (!s)
- continue;
- nr_caps += s->s_nr_caps;
- ceph_put_mds_session(s);
- }
- mutex_unlock(&mdsc->mutex);
+ nr_caps = atomic64_read(&m->total_caps);
seq_printf(s, "%-14s%-16d%-16lld%lld\n", "caps", nr_caps,
percpu_counter_sum(&m->i_caps_mis),
percpu_counter_sum(&m->i_caps_hit));
@@ -272,7 +262,7 @@ static int mds_sessions_show(struct seq_file *s, void *ptr)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_auth_client *ac = fsc->client->monc.auth;
struct ceph_options *opt = fsc->client->options;
- int mds = -1;
+ int mds;
mutex_lock(&mdsc->mutex);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 39f5311404b0..060bdcc5ce32 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -930,6 +930,10 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
req->r_num_caps = 2;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ if (as_ctx.pagelist) {
+ req->r_pagelist = as_ctx.pagelist;
+ as_ctx.pagelist = NULL;
+ }
err = ceph_mdsc_do_request(mdsc, dir, req);
if (!err && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 160644ddaeed..d51c3f2fdca0 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1538,6 +1538,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
struct page *pinned_page = NULL;
+ bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
ssize_t ret;
int want, got = 0;
int retry_op = 0, read = 0;
@@ -1546,7 +1547,7 @@ again:
dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
- if (iocb->ki_flags & IOCB_DIRECT)
+ if (direct_lock)
ceph_start_io_direct(inode);
else
ceph_start_io_read(inode);
@@ -1603,7 +1604,7 @@ again:
}
ceph_put_cap_refs(ci, got);
- if (iocb->ki_flags & IOCB_DIRECT)
+ if (direct_lock)
ceph_end_io_direct(inode);
else
ceph_end_io_read(inode);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a50497142e59..4a26862d7667 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1103,8 +1103,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
frag.frag, mds);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
CEPH_MDS_STATE_ACTIVE) {
- if (mode == USE_ANY_MDS &&
- !ceph_mdsmap_is_laggy(mdsc->mdsmap,
+ if (!ceph_mdsmap_is_laggy(mdsc->mdsmap,
mds))
goto out;
}
@@ -1168,7 +1167,7 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
#define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8)
-static void encode_supported_features(void **p, void *end)
+static int encode_supported_features(void **p, void *end)
{
static const size_t count = ARRAY_SIZE(feature_bits);
@@ -1176,16 +1175,64 @@ static void encode_supported_features(void **p, void *end)
size_t i;
size_t size = FEATURE_BYTES(count);
- BUG_ON(*p + 4 + size > end);
+ if (WARN_ON_ONCE(*p + 4 + size > end))
+ return -ERANGE;
+
ceph_encode_32(p, size);
memset(*p, 0, size);
for (i = 0; i < count; i++)
((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8);
*p += size;
} else {
- BUG_ON(*p + 4 > end);
+ if (WARN_ON_ONCE(*p + 4 > end))
+ return -ERANGE;
+
ceph_encode_32(p, 0);
}
+
+ return 0;
+}
+
+static const unsigned char metric_bits[] = CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED;
+#define METRIC_BYTES(cnt) (DIV_ROUND_UP((size_t)metric_bits[cnt - 1] + 1, 64) * 8)
+static int encode_metric_spec(void **p, void *end)
+{
+ static const size_t count = ARRAY_SIZE(metric_bits);
+
+ /* header */
+ if (WARN_ON_ONCE(*p + 2 > end))
+ return -ERANGE;
+
+ ceph_encode_8(p, 1); /* version */
+ ceph_encode_8(p, 1); /* compat */
+
+ if (count > 0) {
+ size_t i;
+ size_t size = METRIC_BYTES(count);
+
+ if (WARN_ON_ONCE(*p + 4 + 4 + size > end))
+ return -ERANGE;
+
+ /* metric spec info length */
+ ceph_encode_32(p, 4 + size);
+
+ /* metric spec */
+ ceph_encode_32(p, size);
+ memset(*p, 0, size);
+ for (i = 0; i < count; i++)
+ ((unsigned char *)(*p))[i / 8] |= BIT(metric_bits[i] % 8);
+ *p += size;
+ } else {
+ if (WARN_ON_ONCE(*p + 4 + 4 > end))
+ return -ERANGE;
+
+ /* metric spec info length */
+ ceph_encode_32(p, 4);
+ /* metric spec */
+ ceph_encode_32(p, 0);
+ }
+
+ return 0;
}
/*
@@ -1203,6 +1250,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
size_t size, count;
void *p, *end;
+ int ret;
const char* metadata[][2] = {
{"hostname", mdsc->nodename},
@@ -1227,12 +1275,19 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
size = FEATURE_BYTES(count);
extra_bytes += 4 + size;
+ /* metric spec */
+ size = 0;
+ count = ARRAY_SIZE(metric_bits);
+ if (count > 0)
+ size = METRIC_BYTES(count);
+ extra_bytes += 2 + 4 + 4 + size;
+
/* Allocate the message */
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
GFP_NOFS, false);
if (!msg) {
pr_err("create_session_msg ENOMEM creating msg\n");
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
p = msg->front.iov_base;
end = p + msg->front.iov_len;
@@ -1245,9 +1300,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
* Serialize client metadata into waiting buffer space, using
* the format that userspace expects for map<string, string>
*
- * ClientSession messages with metadata are v3
+ * ClientSession messages with metadata are v4
*/
- msg->hdr.version = cpu_to_le16(3);
+ msg->hdr.version = cpu_to_le16(4);
msg->hdr.compat_version = cpu_to_le16(1);
/* The write pointer, following the session_head structure */
@@ -1269,7 +1324,20 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
p += val_len;
}
- encode_supported_features(&p, end);
+ ret = encode_supported_features(&p, end);
+ if (ret) {
+ pr_err("encode_supported_features failed!\n");
+ ceph_msg_put(msg);
+ return ERR_PTR(ret);
+ }
+
+ ret = encode_metric_spec(&p, end);
+ if (ret) {
+ pr_err("encode_metric_spec failed!\n");
+ ceph_msg_put(msg);
+ return ERR_PTR(ret);
+ }
+
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
@@ -1297,8 +1365,8 @@ static int __open_session(struct ceph_mds_client *mdsc,
/* send connect message */
msg = create_session_open_msg(mdsc, session->s_seq);
- if (!msg)
- return -ENOMEM;
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
ceph_con_send(&session->s_con, msg);
return 0;
}
@@ -1312,6 +1380,7 @@ static struct ceph_mds_session *
__open_export_target_session(struct ceph_mds_client *mdsc, int target)
{
struct ceph_mds_session *session;
+ int ret;
session = __ceph_lookup_mds_session(mdsc, target);
if (!session) {
@@ -1320,8 +1389,11 @@ __open_export_target_session(struct ceph_mds_client *mdsc, int target)
return session;
}
if (session->s_state == CEPH_MDS_SESSION_NEW ||
- session->s_state == CEPH_MDS_SESSION_CLOSING)
- __open_session(mdsc, session);
+ session->s_state == CEPH_MDS_SESSION_CLOSING) {
+ ret = __open_session(mdsc, session);
+ if (ret)
+ return ERR_PTR(ret);
+ }
return session;
}
@@ -1485,6 +1557,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
cap->session = NULL;
list_del_init(&cap->session_caps);
session->s_nr_caps--;
+ atomic64_dec(&session->s_mdsc->metric.total_caps);
if (cap->queue_release)
__ceph_queue_cap_release(session, cap);
else
@@ -1785,8 +1858,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
/*
* send a session close request
*/
-static int request_close_session(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
+static int request_close_session(struct ceph_mds_session *session)
{
struct ceph_msg *msg;
@@ -1809,7 +1881,7 @@ static int __close_session(struct ceph_mds_client *mdsc,
if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
return 0;
session->s_state = CEPH_MDS_SESSION_CLOSING;
- return request_close_session(mdsc, session);
+ return request_close_session(session);
}
static bool drop_negative_children(struct dentry *dentry)
@@ -2520,7 +2592,12 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
ceph_encode_copy(&p, &ts, sizeof(ts));
}
- BUG_ON(p > end);
+ if (WARN_ON_ONCE(p > end)) {
+ ceph_msg_put(msg);
+ msg = ERR_PTR(-ERANGE);
+ goto out_free2;
+ }
+
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
@@ -2756,7 +2833,9 @@ static void __do_request(struct ceph_mds_client *mdsc,
}
if (session->s_state == CEPH_MDS_SESSION_NEW ||
session->s_state == CEPH_MDS_SESSION_CLOSING) {
- __open_session(mdsc, session);
+ err = __open_session(mdsc, session);
+ if (err)
+ goto out_session;
/* retry the same mds later */
if (random)
req->r_resend_mds = mds;
@@ -3279,8 +3358,10 @@ static void handle_session(struct ceph_mds_session *session,
goto bad;
/* version >= 3, feature bits */
ceph_decode_32_safe(&p, end, len, bad);
- ceph_decode_64_safe(&p, end, features, bad);
- p += len - sizeof(features);
+ if (len) {
+ ceph_decode_64_safe(&p, end, features, bad);
+ p += len - sizeof(features);
+ }
}
mutex_lock(&mdsc->mutex);
@@ -3310,6 +3391,8 @@ static void handle_session(struct ceph_mds_session *session,
session->s_state = CEPH_MDS_SESSION_OPEN;
session->s_features = features;
renewed_caps(mdsc, session, 0);
+ if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features))
+ metric_schedule_delayed(&mdsc->metric);
wake = 1;
if (mdsc->stopping)
__close_session(mdsc, session);
@@ -4263,6 +4346,30 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
ceph_force_reconnect(fsc->sb);
}
+bool check_session_state(struct ceph_mds_session *s)
+{
+ if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
+ dout("resending session close request for mds%d\n",
+ s->s_mds);
+ request_close_session(s);
+ return false;
+ }
+ if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
+ if (s->s_state == CEPH_MDS_SESSION_OPEN) {
+ s->s_state = CEPH_MDS_SESSION_HUNG;
+ pr_info("mds%d hung\n", s->s_mds);
+ }
+ }
+ if (s->s_state == CEPH_MDS_SESSION_NEW ||
+ s->s_state == CEPH_MDS_SESSION_RESTARTING ||
+ s->s_state == CEPH_MDS_SESSION_CLOSED ||
+ s->s_state == CEPH_MDS_SESSION_REJECTED)
+ /* this mds is failed or recovering, just wait */
+ return false;
+
+ return true;
+}
+
/*
* delayed work -- periodically trim expired leases, renew caps with mds
*/
@@ -4283,6 +4390,9 @@ static void delayed_work(struct work_struct *work)
dout("mdsc delayed_work\n");
+ if (mdsc->stopping)
+ return;
+
mutex_lock(&mdsc->mutex);
renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
renew_caps = time_after_eq(jiffies, HZ*renew_interval +
@@ -4294,23 +4404,8 @@ static void delayed_work(struct work_struct *work)
struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
if (!s)
continue;
- if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
- dout("resending session close request for mds%d\n",
- s->s_mds);
- request_close_session(mdsc, s);
- ceph_put_mds_session(s);
- continue;
- }
- if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
- if (s->s_state == CEPH_MDS_SESSION_OPEN) {
- s->s_state = CEPH_MDS_SESSION_HUNG;
- pr_info("mds%d hung\n", s->s_mds);
- }
- }
- if (s->s_state == CEPH_MDS_SESSION_NEW ||
- s->s_state == CEPH_MDS_SESSION_RESTARTING ||
- s->s_state == CEPH_MDS_SESSION_REJECTED) {
- /* this mds is failed or recovering, just wait */
+
+ if (!check_session_state(s)) {
ceph_put_mds_session(s);
continue;
}
@@ -4359,7 +4454,6 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
goto err_mdsc;
}
- fsc->mdsc = mdsc;
init_completion(&mdsc->safe_umount_waiters);
init_waitqueue_head(&mdsc->session_close_wq);
INIT_LIST_HEAD(&mdsc->waiting_for_map);
@@ -4414,6 +4508,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
strscpy(mdsc->nodename, utsname()->nodename,
sizeof(mdsc->nodename));
+
+ fsc->mdsc = mdsc;
return 0;
err_mdsmap:
@@ -4657,7 +4753,16 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
{
dout("stop\n");
- cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+ /*
+ * Make sure the delayed work stopped before releasing
+ * the resources.
+ *
+ * Because the cancel_delayed_work_sync() will only
+ * guarantee that the work finishes executing. But the
+ * delayed work will re-arm itself again after that.
+ */
+ flush_delayed_work(&mdsc->delayed_work);
+
if (mdsc->mdsmap)
ceph_mdsmap_destroy(mdsc->mdsmap);
kfree(mdsc->sessions);
@@ -4680,6 +4785,7 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
ceph_metric_destroy(&mdsc->metric);
+ flush_delayed_work(&mdsc->metric.delayed_work);
fsc->mdsc = NULL;
kfree(mdsc);
dout("mdsc_destroy %p done\n", mdsc);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 5e0c4073a6be..bc9e95937d7c 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -18,6 +18,7 @@
#include <linux/ceph/auth.h>
#include "metric.h"
+#include "super.h"
/* The first 8 bits are reserved for old ceph releases */
enum ceph_feature_type {
@@ -27,8 +28,9 @@ enum ceph_feature_type {
CEPHFS_FEATURE_LAZY_CAP_WANTED,
CEPHFS_FEATURE_MULTI_RECONNECT,
CEPHFS_FEATURE_DELEG_INO,
+ CEPHFS_FEATURE_METRIC_COLLECT,
- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
};
/*
@@ -42,6 +44,7 @@ enum ceph_feature_type {
CEPHFS_FEATURE_LAZY_CAP_WANTED, \
CEPHFS_FEATURE_MULTI_RECONNECT, \
CEPHFS_FEATURE_DELEG_INO, \
+ CEPHFS_FEATURE_METRIC_COLLECT, \
\
CEPHFS_FEATURE_MAX, \
}
@@ -476,6 +479,8 @@ struct ceph_mds_client {
extern const char *ceph_mds_op_name(int op);
+extern bool check_session_state(struct ceph_mds_session *s);
+
extern struct ceph_mds_session *
__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 889627817e52..e4aba6c6d3b5 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -120,7 +120,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
const void *start = *p;
int i, j, n;
int err;
- u8 mdsmap_v, mdsmap_cv;
+ u8 mdsmap_v;
u16 mdsmap_ev;
m = kzalloc(sizeof(*m), GFP_NOFS);
@@ -129,7 +129,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
ceph_decode_need(p, end, 1 + 1, bad);
mdsmap_v = ceph_decode_8(p);
- mdsmap_cv = ceph_decode_8(p);
+ *p += sizeof(u8); /* mdsmap_cv */
if (mdsmap_v >= 4) {
u32 mdsmap_len;
ceph_decode_32_safe(p, end, mdsmap_len, bad);
@@ -174,7 +174,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
u64 global_id;
u32 namelen;
s32 mds, inc, state;
- u64 state_seq;
u8 info_v;
void *info_end = NULL;
struct ceph_entity_addr addr;
@@ -189,9 +188,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
info_v= ceph_decode_8(p);
if (info_v >= 4) {
u32 info_len;
- u8 info_cv;
ceph_decode_need(p, end, 1 + sizeof(u32), bad);
- info_cv = ceph_decode_8(p);
+ *p += sizeof(u8); /* info_cv */
info_len = ceph_decode_32(p);
info_end = *p + info_len;
if (info_end > end)
@@ -210,7 +208,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
mds = ceph_decode_32(p);
inc = ceph_decode_32(p);
state = ceph_decode_32(p);
- state_seq = ceph_decode_64(p);
+ *p += sizeof(u64); /* state_seq */
err = ceph_decode_entity_addr(p, end, &addr);
if (err)
goto corrupt;
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index 9217f35bc2b9..2466b261fba2 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -1,10 +1,150 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/ceph/ceph_debug.h>
#include <linux/types.h>
#include <linux/percpu_counter.h>
#include <linux/math64.h>
#include "metric.h"
+#include "mds_client.h"
+
+static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *s)
+{
+ struct ceph_metric_head *head;
+ struct ceph_metric_cap *cap;
+ struct ceph_metric_read_latency *read;
+ struct ceph_metric_write_latency *write;
+ struct ceph_metric_metadata_latency *meta;
+ struct ceph_client_metric *m = &mdsc->metric;
+ u64 nr_caps = atomic64_read(&m->total_caps);
+ struct ceph_msg *msg;
+ struct timespec64 ts;
+ s64 sum;
+ s32 items = 0;
+ s32 len;
+
+ len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
+ + sizeof(*meta);
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
+ if (!msg) {
+ pr_err("send metrics to mds%d, failed to allocate message\n",
+ s->s_mds);
+ return false;
+ }
+
+ head = msg->front.iov_base;
+
+ /* encode the cap metric */
+ cap = (struct ceph_metric_cap *)(head + 1);
+ cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
+ cap->ver = 1;
+ cap->compat = 1;
+ cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
+ cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit));
+ cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis));
+ cap->total = cpu_to_le64(nr_caps);
+ items++;
+
+ /* encode the read latency metric */
+ read = (struct ceph_metric_read_latency *)(cap + 1);
+ read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
+ read->ver = 1;
+ read->compat = 1;
+ read->data_len = cpu_to_le32(sizeof(*read) - 10);
+ sum = m->read_latency_sum;
+ jiffies_to_timespec64(sum, &ts);
+ read->sec = cpu_to_le32(ts.tv_sec);
+ read->nsec = cpu_to_le32(ts.tv_nsec);
+ items++;
+
+ /* encode the write latency metric */
+ write = (struct ceph_metric_write_latency *)(read + 1);
+ write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
+ write->ver = 1;
+ write->compat = 1;
+ write->data_len = cpu_to_le32(sizeof(*write) - 10);
+ sum = m->write_latency_sum;
+ jiffies_to_timespec64(sum, &ts);
+ write->sec = cpu_to_le32(ts.tv_sec);
+ write->nsec = cpu_to_le32(ts.tv_nsec);
+ items++;
+
+ /* encode the metadata latency metric */
+ meta = (struct ceph_metric_metadata_latency *)(write + 1);
+ meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
+ meta->ver = 1;
+ meta->compat = 1;
+ meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
+ sum = m->metadata_latency_sum;
+ jiffies_to_timespec64(sum, &ts);
+ meta->sec = cpu_to_le32(ts.tv_sec);
+ meta->nsec = cpu_to_le32(ts.tv_nsec);
+ items++;
+
+ put_unaligned_le32(items, &head->num);
+ msg->front.iov_len = len;
+ msg->hdr.version = cpu_to_le16(1);
+ msg->hdr.compat_version = cpu_to_le16(1);
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ dout("client%llu send metrics to mds%d\n",
+ ceph_client_gid(mdsc->fsc->client), s->s_mds);
+ ceph_con_send(&s->s_con, msg);
+
+ return true;
+}
+
+
+static void metric_get_session(struct ceph_mds_client *mdsc)
+{
+ struct ceph_mds_session *s;
+ int i;
+
+ mutex_lock(&mdsc->mutex);
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ s = __ceph_lookup_mds_session(mdsc, i);
+ if (!s)
+ continue;
+
+ /*
+ * Skip it if MDS doesn't support the metric collection,
+ * or the MDS will close the session's socket connection
+ * directly when it get this message.
+ */
+ if (check_session_state(s) &&
+ test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features)) {
+ mdsc->metric.session = s;
+ break;
+ }
+
+ ceph_put_mds_session(s);
+ }
+ mutex_unlock(&mdsc->mutex);
+}
+
+static void metric_delayed_work(struct work_struct *work)
+{
+ struct ceph_client_metric *m =
+ container_of(work, struct ceph_client_metric, delayed_work.work);
+ struct ceph_mds_client *mdsc =
+ container_of(m, struct ceph_mds_client, metric);
+
+ if (mdsc->stopping)
+ return;
+
+ if (!m->session || !check_session_state(m->session)) {
+ if (m->session) {
+ ceph_put_mds_session(m->session);
+ m->session = NULL;
+ }
+ metric_get_session(mdsc);
+ }
+ if (m->session) {
+ ceph_mdsc_send_metrics(mdsc, m->session);
+ metric_schedule_delayed(m);
+ }
+}
int ceph_metric_init(struct ceph_client_metric *m)
{
@@ -22,6 +162,7 @@ int ceph_metric_init(struct ceph_client_metric *m)
if (ret)
goto err_d_lease_mis;
+ atomic64_set(&m->total_caps, 0);
ret = percpu_counter_init(&m->i_caps_hit, 0, GFP_KERNEL);
if (ret)
goto err_i_caps_hit;
@@ -51,6 +192,9 @@ int ceph_metric_init(struct ceph_client_metric *m)
m->total_metadatas = 0;
m->metadata_latency_sum = 0;
+ m->session = NULL;
+ INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work);
+
return 0;
err_i_caps_mis:
@@ -72,6 +216,11 @@ void ceph_metric_destroy(struct ceph_client_metric *m)
percpu_counter_destroy(&m->i_caps_hit);
percpu_counter_destroy(&m->d_lease_mis);
percpu_counter_destroy(&m->d_lease_hit);
+
+ cancel_delayed_work_sync(&m->delayed_work);
+
+ if (m->session)
+ ceph_put_mds_session(m->session);
}
static inline void __update_latency(ktime_t *totalp, ktime_t *lsump,
diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
index ccd81285a450..1d0959d669d7 100644
--- a/fs/ceph/metric.h
+++ b/fs/ceph/metric.h
@@ -6,12 +6,91 @@
#include <linux/percpu_counter.h>
#include <linux/ktime.h>
+extern bool disable_send_metrics;
+
+enum ceph_metric_type {
+ CLIENT_METRIC_TYPE_CAP_INFO,
+ CLIENT_METRIC_TYPE_READ_LATENCY,
+ CLIENT_METRIC_TYPE_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_METADATA_LATENCY,
+ CLIENT_METRIC_TYPE_DENTRY_LEASE,
+
+ CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE,
+};
+
+/*
+ * This will always have the highest metric bit value
+ * as the last element of the array.
+ */
+#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \
+ CLIENT_METRIC_TYPE_CAP_INFO, \
+ CLIENT_METRIC_TYPE_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_METADATA_LATENCY, \
+ \
+ CLIENT_METRIC_TYPE_MAX, \
+}
+
+/* metric caps header */
+struct ceph_metric_cap {
+ __le32 type; /* ceph metric type */
+
+ __u8 ver;
+ __u8 compat;
+
+ __le32 data_len; /* length of sizeof(hit + mis + total) */
+ __le64 hit;
+ __le64 mis;
+ __le64 total;
+} __packed;
+
+/* metric read latency header */
+struct ceph_metric_read_latency {
+ __le32 type; /* ceph metric type */
+
+ __u8 ver;
+ __u8 compat;
+
+ __le32 data_len; /* length of sizeof(sec + nsec) */
+ __le32 sec;
+ __le32 nsec;
+} __packed;
+
+/* metric write latency header */
+struct ceph_metric_write_latency {
+ __le32 type; /* ceph metric type */
+
+ __u8 ver;
+ __u8 compat;
+
+ __le32 data_len; /* length of sizeof(sec + nsec) */
+ __le32 sec;
+ __le32 nsec;
+} __packed;
+
+/* metric metadata latency header */
+struct ceph_metric_metadata_latency {
+ __le32 type; /* ceph metric type */
+
+ __u8 ver;
+ __u8 compat;
+
+ __le32 data_len; /* length of sizeof(sec + nsec) */
+ __le32 sec;
+ __le32 nsec;
+} __packed;
+
+struct ceph_metric_head {
+ __le32 num; /* the number of metrics that will be sent */
+} __packed;
+
/* This is the global metrics */
struct ceph_client_metric {
atomic64_t total_dentries;
struct percpu_counter d_lease_hit;
struct percpu_counter d_lease_mis;
+ atomic64_t total_caps;
struct percpu_counter i_caps_hit;
struct percpu_counter i_caps_mis;
@@ -35,8 +114,20 @@ struct ceph_client_metric {
ktime_t metadata_latency_sq_sum;
ktime_t metadata_latency_min;
ktime_t metadata_latency_max;
+
+ struct ceph_mds_session *session;
+ struct delayed_work delayed_work; /* delayed work */
};
+static inline void metric_schedule_delayed(struct ceph_client_metric *m)
+{
+ if (disable_send_metrics)
+ return;
+
+ /* per second */
+ schedule_delayed_work(&m->delayed_work, round_jiffies_relative(HZ));
+}
+
extern int ceph_metric_init(struct ceph_client_metric *m);
extern void ceph_metric_destroy(struct ceph_client_metric *m);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index c9784eb1159a..7ec0e6d03d10 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -27,6 +27,9 @@
#include <linux/ceph/auth.h>
#include <linux/ceph/debugfs.h>
+static DEFINE_SPINLOCK(ceph_fsc_lock);
+static LIST_HEAD(ceph_fsc_list);
+
/*
* Ceph superblock operations
*
@@ -634,8 +637,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
struct ceph_options *opt)
{
struct ceph_fs_client *fsc;
- int page_count;
- size_t size;
int err;
fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
@@ -683,18 +684,12 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
if (!fsc->cap_wq)
goto fail_inode_wq;
- /* set up mempools */
- err = -ENOMEM;
- page_count = fsc->mount_options->wsize >> PAGE_SHIFT;
- size = sizeof (struct page *) * (page_count ? page_count : 1);
- fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
- if (!fsc->wb_pagevec_pool)
- goto fail_cap_wq;
+ spin_lock(&ceph_fsc_lock);
+ list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list);
+ spin_unlock(&ceph_fsc_lock);
return fsc;
-fail_cap_wq:
- destroy_workqueue(fsc->cap_wq);
fail_inode_wq:
destroy_workqueue(fsc->inode_wq);
fail_client:
@@ -717,12 +712,14 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
{
dout("destroy_fs_client %p\n", fsc);
+ spin_lock(&ceph_fsc_lock);
+ list_del(&fsc->metric_wakeup);
+ spin_unlock(&ceph_fsc_lock);
+
ceph_mdsc_destroy(fsc);
destroy_workqueue(fsc->inode_wq);
destroy_workqueue(fsc->cap_wq);
- mempool_destroy(fsc->wb_pagevec_pool);
-
destroy_mount_options(fsc->mount_options);
ceph_destroy_client(fsc->client);
@@ -741,6 +738,7 @@ struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
struct kmem_cache *ceph_dir_file_cachep;
struct kmem_cache *ceph_mds_request_cachep;
+mempool_t *ceph_wb_pagevec_pool;
static void ceph_inode_init_once(void *foo)
{
@@ -785,6 +783,10 @@ static int __init init_caches(void)
if (!ceph_mds_request_cachep)
goto bad_mds_req;
+ ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10, CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT);
+ if (!ceph_wb_pagevec_pool)
+ goto bad_pagevec_pool;
+
error = ceph_fscache_register();
if (error)
goto bad_fscache;
@@ -793,6 +795,8 @@ static int __init init_caches(void)
bad_fscache:
kmem_cache_destroy(ceph_mds_request_cachep);
+bad_pagevec_pool:
+ mempool_destroy(ceph_wb_pagevec_pool);
bad_mds_req:
kmem_cache_destroy(ceph_dir_file_cachep);
bad_dir_file:
@@ -823,12 +827,13 @@ static void destroy_caches(void)
kmem_cache_destroy(ceph_file_cachep);
kmem_cache_destroy(ceph_dir_file_cachep);
kmem_cache_destroy(ceph_mds_request_cachep);
+ mempool_destroy(ceph_wb_pagevec_pool);
ceph_fscache_unregister();
}
/*
- * ceph_umount_begin - initiate forced umount. Tear down down the
+ * ceph_umount_begin - initiate forced umount. Tear down the
* mount, skipping steps that may hang while waiting for server(s).
*/
static void ceph_umount_begin(struct super_block *sb)
@@ -1282,6 +1287,37 @@ static void __exit exit_ceph(void)
destroy_caches();
}
+static int param_set_metrics(const char *val, const struct kernel_param *kp)
+{
+ struct ceph_fs_client *fsc;
+ int ret;
+
+ ret = param_set_bool(val, kp);
+ if (ret) {
+ pr_err("Failed to parse sending metrics switch value '%s'\n",
+ val);
+ return ret;
+ } else if (!disable_send_metrics) {
+ // wake up all the mds clients
+ spin_lock(&ceph_fsc_lock);
+ list_for_each_entry(fsc, &ceph_fsc_list, metric_wakeup) {
+ metric_schedule_delayed(&fsc->mdsc->metric);
+ }
+ spin_unlock(&ceph_fsc_lock);
+ }
+
+ return 0;
+}
+
+static const struct kernel_param_ops param_ops_metrics = {
+ .set = param_set_metrics,
+ .get = param_get_bool,
+};
+
+bool disable_send_metrics = false;
+module_param_cb(disable_send_metrics, &param_ops_metrics, &disable_send_metrics, 0644);
+MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)");
+
module_init(init_ceph);
module_exit(exit_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 5a6cdd39bc10..4c3c964b1c54 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -101,6 +101,8 @@ struct ceph_mount_options {
struct ceph_fs_client {
struct super_block *sb;
+ struct list_head metric_wakeup;
+
struct ceph_mount_options *mount_options;
struct ceph_client *client;
@@ -116,8 +118,6 @@ struct ceph_fs_client {
struct ceph_mds_client *mdsc;
- /* writeback */
- mempool_t *wb_pagevec_pool;
atomic_long_t writeback_count;
struct workqueue_struct *inode_wq;
@@ -353,7 +353,7 @@ struct ceph_inode_info {
unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
/*
- * Link to the the auth cap's session's s_cap_dirty list. s_cap_dirty
+ * Link to the auth cap's session's s_cap_dirty list. s_cap_dirty
* is protected by the mdsc->cap_dirty_lock, but each individual item
* is also protected by the inode's i_ceph_lock. Walking s_cap_dirty
* requires the mdsc->cap_dirty_lock. List presence for an item can
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 71ee34d160c3..3a733ac33d9b 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -497,10 +497,10 @@ static int __set_xattr(struct ceph_inode_info *ci,
kfree(*newxattr);
*newxattr = NULL;
if (xattr->should_free_val)
- kfree((void *)xattr->val);
+ kfree(xattr->val);
if (update_xattr) {
- kfree((void *)name);
+ kfree(name);
name = xattr->name;
}
ci->i_xattrs.names_size -= xattr->name_len;
@@ -566,9 +566,9 @@ static void __free_xattr(struct ceph_inode_xattr *xattr)
BUG_ON(!xattr);
if (xattr->should_free_name)
- kfree((void *)xattr->name);
+ kfree(xattr->name);
if (xattr->should_free_val)
- kfree((void *)xattr->val);
+ kfree(xattr->val);
kfree(xattr);
}
@@ -582,9 +582,9 @@ static int __remove_xattr(struct ceph_inode_info *ci,
rb_erase(&xattr->node, &ci->i_xattrs.index);
if (xattr->should_free_name)
- kfree((void *)xattr->name);
+ kfree(xattr->name);
if (xattr->should_free_val)
- kfree((void *)xattr->val);
+ kfree(xattr->val);
ci->i_xattrs.names_size -= xattr->name_len;
ci->i_xattrs.vals_size -= xattr->val_len;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 0ad1309e88d3..a275ee399dce 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -4886,6 +4886,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
full_path = build_unc_path_to_root(vol, cifs_sb, !!count);
if (IS_ERR(full_path)) {
rc = PTR_ERR(full_path);
+ full_path = NULL;
break;
}
/* Chase referral */
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index b9db73687eaa..eba01d0908dd 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -115,6 +115,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
vars->oparms.fid = &fid;
vars->oparms.reconnect = false;
vars->oparms.mode = mode;
+ vars->oparms.cifs_sb = cifs_sb;
rqst[num_rqst].rq_iov = &vars->open_iov[0];
rqst[num_rqst].rq_nvec = SMB2_CREATE_IOV_SIZE;
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 24c2ac360591..667d70aa335f 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -3913,7 +3913,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
case MID_RESPONSE_MALFORMED:
credits.value = le16_to_cpu(shdr->CreditRequest);
credits.instance = server->reconnect_instance;
- /* fall through */
+ fallthrough;
default:
rdata->result = -EIO;
}
@@ -4146,7 +4146,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
case MID_RESPONSE_MALFORMED:
credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
credits.instance = server->reconnect_instance;
- /* fall through */
+ fallthrough;
default:
wdata->result = -EIO;
break;
diff --git a/fs/coredump.c b/fs/coredump.c
index 7237f07ff6be..76e7c10edfc0 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -153,10 +153,10 @@ int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
return ret;
}
-static int cn_print_exe_file(struct core_name *cn)
+static int cn_print_exe_file(struct core_name *cn, bool name_only)
{
struct file *exe_file;
- char *pathbuf, *path;
+ char *pathbuf, *path, *ptr;
int ret;
exe_file = get_mm_exe_file(current->mm);
@@ -175,6 +175,11 @@ static int cn_print_exe_file(struct core_name *cn)
goto free_buf;
}
+ if (name_only) {
+ ptr = strrchr(path, '/');
+ if (ptr)
+ path = ptr + 1;
+ }
ret = cn_esc_printf(cn, "%s", path);
free_buf:
@@ -301,12 +306,16 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
utsname()->nodename);
up_read(&uts_sem);
break;
- /* executable */
+ /* executable, could be changed by prctl PR_SET_NAME etc */
case 'e':
err = cn_esc_printf(cn, "%s", current->comm);
break;
+ /* file name of executable */
+ case 'f':
+ err = cn_print_exe_file(cn, true);
+ break;
case 'E':
- err = cn_print_exe_file(cn);
+ err = cn_print_exe_file(cn, false);
break;
/* core limit size */
case 'c':
diff --git a/fs/exec.c b/fs/exec.c
index 3698252719a3..a91003e28eaa 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -141,12 +141,14 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
if (IS_ERR(file))
goto out;
- error = -EINVAL;
- if (!S_ISREG(file_inode(file)->i_mode))
- goto exit;
-
+ /*
+ * may_open() has already checked for this, so it should be
+ * impossible to trip now. But we need to be extra cautious
+ * and check again at the very end too.
+ */
error = -EACCES;
- if (path_noexec(&file->f_path))
+ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
+ path_noexec(&file->f_path)))
goto exit;
fsnotify_open(file);
@@ -215,7 +217,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
* We are doing an exec(). 'current' is the process
* doing the exec and bprm->mm is the new process's mm.
*/
- ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags,
+ ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
&page, NULL, NULL);
if (ret <= 0)
return NULL;
@@ -909,11 +911,14 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
if (IS_ERR(file))
goto out;
+ /*
+ * may_open() has already checked for this, so it should be
+ * impossible to trip now. But we need to be extra cautious
+ * and check again at the very end too.
+ */
err = -EACCES;
- if (!S_ISREG(file_inode(file)->i_mode))
- goto exit;
-
- if (path_noexec(&file->f_path))
+ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
+ path_noexec(&file->f_path)))
goto exit;
err = deny_write_access(file);
@@ -1402,7 +1407,12 @@ int begin_new_exec(struct linux_binprm * bprm)
if (retval)
goto out_unlock;
- set_fs(USER_DS);
+ /*
+ * Ensure that the uaccess routines can actually operate on userspace
+ * pointers:
+ */
+ force_uaccess_begin();
+
me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
PF_NOFREEZE | PF_NO_SETAFFINITY);
flush_thread();
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index 4055eb00ea9b..a987919686c0 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -158,7 +158,7 @@ int exfat_set_bitmap(struct inode *inode, unsigned int clu)
b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
set_bit_le(b, sbi->vol_amap[i]->b_data);
- exfat_update_bh(sb, sbi->vol_amap[i], IS_DIRSYNC(inode));
+ exfat_update_bh(sbi->vol_amap[i], IS_DIRSYNC(inode));
return 0;
}
@@ -180,7 +180,7 @@ void exfat_clear_bitmap(struct inode *inode, unsigned int clu)
b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
clear_bit_le(b, sbi->vol_amap[i]->b_data);
- exfat_update_bh(sb, sbi->vol_amap[i], IS_DIRSYNC(inode));
+ exfat_update_bh(sbi->vol_amap[i], IS_DIRSYNC(inode));
if (opts->discard) {
int ret_discard;
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index 119abf0d8dd6..573659bfbc55 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -470,7 +470,7 @@ int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir,
&ep->dentry.file.access_date,
NULL);
- exfat_update_bh(sb, bh, IS_DIRSYNC(inode));
+ exfat_update_bh(bh, IS_DIRSYNC(inode));
brelse(bh);
ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh, &sector);
@@ -480,7 +480,7 @@ int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir,
exfat_init_stream_entry(ep,
(type == TYPE_FILE) ? ALLOC_FAT_CHAIN : ALLOC_NO_FAT_CHAIN,
start_clu, size);
- exfat_update_bh(sb, bh, IS_DIRSYNC(inode));
+ exfat_update_bh(bh, IS_DIRSYNC(inode));
brelse(bh);
return 0;
@@ -516,7 +516,7 @@ int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir,
}
fep->dentry.file.checksum = cpu_to_le16(chksum);
- exfat_update_bh(sb, fbh, IS_DIRSYNC(inode));
+ exfat_update_bh(fbh, IS_DIRSYNC(inode));
release_fbh:
brelse(fbh);
return ret;
@@ -538,7 +538,7 @@ int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
return -EIO;
ep->dentry.file.num_ext = (unsigned char)(num_entries - 1);
- exfat_update_bh(sb, bh, sync);
+ exfat_update_bh(bh, sync);
brelse(bh);
ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh, &sector);
@@ -547,7 +547,7 @@ int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
ep->dentry.stream.name_len = p_uniname->name_len;
ep->dentry.stream.name_hash = cpu_to_le16(p_uniname->name_hash);
- exfat_update_bh(sb, bh, sync);
+ exfat_update_bh(bh, sync);
brelse(bh);
for (i = EXFAT_FIRST_CLUSTER; i < num_entries; i++) {
@@ -556,7 +556,7 @@ int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
return -EIO;
exfat_init_name_entry(ep, uniname);
- exfat_update_bh(sb, bh, sync);
+ exfat_update_bh(bh, sync);
brelse(bh);
uniname += EXFAT_FILE_NAME_LEN;
}
@@ -580,7 +580,7 @@ int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir,
return -EIO;
exfat_set_entry_type(ep, TYPE_DELETED);
- exfat_update_bh(sb, bh, IS_DIRSYNC(inode));
+ exfat_update_bh(bh, IS_DIRSYNC(inode));
brelse(bh);
}
@@ -604,16 +604,20 @@ void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es)
es->modified = true;
}
-void exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync)
+int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync)
{
- int i;
+ int i, err = 0;
- for (i = 0; i < es->num_bh; i++) {
- if (es->modified)
- exfat_update_bh(es->sb, es->bh[i], sync);
- brelse(es->bh[i]);
- }
+ if (es->modified)
+ err = exfat_update_bhs(es->bh, es->num_bh, sync);
+
+ for (i = 0; i < es->num_bh; i++)
+ if (err)
+ bforget(es->bh[i]);
+ else
+ brelse(es->bh[i]);
kfree(es);
+ return err;
}
static int exfat_walk_fat_chain(struct super_block *sb,
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 75c7bdbeba6d..95d717f8620c 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -13,8 +13,6 @@
#define EXFAT_SUPER_MAGIC 0x2011BAB0UL
#define EXFAT_ROOT_INO 1
-#define EXFAT_SB_DIRTY 0
-
#define EXFAT_CLUSTERS_UNTRACKED (~0u)
/*
@@ -226,7 +224,8 @@ struct exfat_sb_info {
unsigned int num_FAT_sectors; /* num of FAT sectors */
unsigned int root_dir; /* root dir cluster */
unsigned int dentries_per_clu; /* num of dentries per cluster */
- unsigned int vol_flag; /* volume dirty flag */
+ unsigned int vol_flags; /* volume flags */
+ unsigned int vol_flags_persistent; /* volume flags to retain */
struct buffer_head *boot_bh; /* buffer_head of BOOT sector */
unsigned int map_clu; /* allocation bitmap start cluster */
@@ -238,7 +237,6 @@ struct exfat_sb_info {
unsigned int clu_srch_ptr; /* cluster search pointer */
unsigned int used_clusters; /* number of used clusters */
- unsigned long s_state;
struct mutex s_lock; /* superblock lock */
struct exfat_mount_options options;
struct nls_table *nls_io; /* Charset used for input and display */
@@ -383,7 +381,8 @@ static inline int exfat_sector_to_cluster(struct exfat_sb_info *sbi,
}
/* super.c */
-int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag);
+int exfat_set_volume_dirty(struct super_block *sb);
+int exfat_clear_volume_dirty(struct super_block *sb);
/* fatent.c */
#define exfat_get_next_cluster(sb, pclu) exfat_ent_get(sb, *(pclu), pclu)
@@ -463,7 +462,7 @@ struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es,
int num);
struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
struct exfat_chain *p_dir, int entry, unsigned int type);
-void exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync);
+int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync);
int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir);
/* inode.c */
@@ -515,7 +514,8 @@ void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
u8 *tz, __le16 *time, __le16 *date, u8 *time_cs);
u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type);
u32 exfat_calc_chksum32(void *data, int len, u32 chksum, int type);
-void exfat_update_bh(struct super_block *sb, struct buffer_head *bh, int sync);
+void exfat_update_bh(struct buffer_head *bh, int sync);
+int exfat_update_bhs(struct buffer_head **bhs, int nr_bhs, int sync);
void exfat_chain_set(struct exfat_chain *ec, unsigned int dir,
unsigned int size, unsigned char flags);
void exfat_chain_dup(struct exfat_chain *dup, struct exfat_chain *ec);
diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h
index 350ce59cc324..6aec6288e1f2 100644
--- a/fs/exfat/exfat_raw.h
+++ b/fs/exfat/exfat_raw.h
@@ -14,9 +14,8 @@
#define EXFAT_MAX_FILE_LEN 255
-#define VOL_CLEAN 0x0000
-#define VOL_DIRTY 0x0002
-#define ERR_MEDIUM 0x0004
+#define VOLUME_DIRTY 0x0002
+#define MEDIA_FAILURE 0x0004
#define EXFAT_EOF_CLUSTER 0xFFFFFFFFu
#define EXFAT_BAD_CLUSTER 0xFFFFFFF7u
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index 4e5c5c9c0f2d..c3c9afee7418 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -75,7 +75,7 @@ int exfat_ent_set(struct super_block *sb, unsigned int loc,
fat_entry = (__le32 *)&(bh->b_data[off]);
*fat_entry = cpu_to_le32(content);
- exfat_update_bh(sb, bh, sb->s_flags & SB_SYNCHRONOUS);
+ exfat_update_bh(bh, sb->s_flags & SB_SYNCHRONOUS);
exfat_mirror_bh(sb, sec, bh);
brelse(bh);
return 0;
@@ -174,7 +174,6 @@ int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain)
return -EIO;
}
- set_bit(EXFAT_SB_DIRTY, &sbi->s_state);
clu = p_chain->dir;
if (p_chain->flags == ALLOC_NO_FAT_CHAIN) {
@@ -230,21 +229,6 @@ int exfat_find_last_cluster(struct super_block *sb, struct exfat_chain *p_chain,
return 0;
}
-static inline int exfat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
-{
- int i, err = 0;
-
- for (i = 0; i < nr_bhs; i++)
- write_dirty_buffer(bhs[i], 0);
-
- for (i = 0; i < nr_bhs; i++) {
- wait_on_buffer(bhs[i]);
- if (!err && !buffer_uptodate(bhs[i]))
- err = -EIO;
- }
- return err;
-}
-
int exfat_zeroed_cluster(struct inode *dir, unsigned int clu)
{
struct super_block *sb = dir->i_sb;
@@ -266,41 +250,23 @@ int exfat_zeroed_cluster(struct inode *dir, unsigned int clu)
}
/* Zeroing the unused blocks on this cluster */
- n = 0;
while (blknr < last_blknr) {
- bhs[n] = sb_getblk(sb, blknr);
- if (!bhs[n]) {
- err = -ENOMEM;
- goto release_bhs;
- }
- memset(bhs[n]->b_data, 0, sb->s_blocksize);
- exfat_update_bh(sb, bhs[n], 0);
-
- n++;
- blknr++;
-
- if (n == nr_bhs) {
- if (IS_DIRSYNC(dir)) {
- err = exfat_sync_bhs(bhs, n);
- if (err)
- goto release_bhs;
+ for (n = 0; n < nr_bhs && blknr < last_blknr; n++, blknr++) {
+ bhs[n] = sb_getblk(sb, blknr);
+ if (!bhs[n]) {
+ err = -ENOMEM;
+ goto release_bhs;
}
-
- for (i = 0; i < n; i++)
- brelse(bhs[i]);
- n = 0;
+ memset(bhs[n]->b_data, 0, sb->s_blocksize);
}
- }
- if (IS_DIRSYNC(dir)) {
- err = exfat_sync_bhs(bhs, n);
+ err = exfat_update_bhs(bhs, n, IS_DIRSYNC(dir));
if (err)
goto release_bhs;
- }
-
- for (i = 0; i < n; i++)
- brelse(bhs[i]);
+ for (i = 0; i < n; i++)
+ brelse(bhs[i]);
+ }
return 0;
release_bhs:
@@ -358,8 +324,6 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc,
}
}
- set_bit(EXFAT_SB_DIRTY, &sbi->s_state);
-
p_chain->dir = EXFAT_EOF_CLUSTER;
while ((new_clu = exfat_find_free_bitmap(sb, hint_clu)) !=
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index a6a063830edc..f41f523a58ad 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -106,7 +106,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
if (ei->type != TYPE_FILE && ei->type != TYPE_DIR)
return -EPERM;
- exfat_set_vol_flags(sb, VOL_DIRTY);
+ exfat_set_volume_dirty(sb);
num_clusters_new = EXFAT_B_TO_CLU_ROUND_UP(i_size_read(inode), sbi);
num_clusters_phys =
@@ -154,6 +154,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
struct timespec64 ts;
struct exfat_dentry *ep, *ep2;
struct exfat_entry_set_cache *es;
+ int err;
es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry,
ES_ALL_ENTRIES);
@@ -188,7 +189,9 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
}
exfat_update_dir_chksum_with_entry_set(es);
- exfat_free_dentry_set(es, inode_needs_sync(inode));
+ err = exfat_free_dentry_set(es, inode_needs_sync(inode));
+ if (err)
+ return err;
}
/* cut off from the FAT chain */
@@ -217,7 +220,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
if (exfat_free_cluster(inode, &clu))
return -EIO;
- exfat_set_vol_flags(sb, VOL_CLEAN);
+ exfat_clear_volume_dirty(sb);
return 0;
}
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index cf9ca6c4d046..7f90204adef5 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -39,7 +39,7 @@ static int __exfat_write_inode(struct inode *inode, int sync)
if (is_dir && ei->dir.dir == sbi->root_dir && ei->entry == -1)
return 0;
- exfat_set_vol_flags(sb, VOL_DIRTY);
+ exfat_set_volume_dirty(sb);
/* get the directory entry of given file or directory */
es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES);
@@ -77,8 +77,7 @@ static int __exfat_write_inode(struct inode *inode, int sync)
ep2->dentry.stream.size = ep2->dentry.stream.valid_size;
exfat_update_dir_chksum_with_entry_set(es);
- exfat_free_dentry_set(es, sync);
- return 0;
+ return exfat_free_dentry_set(es, sync);
}
int exfat_write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -168,7 +167,7 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
}
if (*clu == EXFAT_EOF_CLUSTER) {
- exfat_set_vol_flags(sb, VOL_DIRTY);
+ exfat_set_volume_dirty(sb);
new_clu.dir = (last_clu == EXFAT_EOF_CLUSTER) ?
EXFAT_EOF_CLUSTER : last_clu + 1;
@@ -222,6 +221,7 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
if (ei->dir.dir != DIR_DELETED && modified) {
struct exfat_dentry *ep;
struct exfat_entry_set_cache *es;
+ int err;
es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry,
ES_ALL_ENTRIES);
@@ -240,8 +240,9 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
ep->dentry.stream.valid_size;
exfat_update_dir_chksum_with_entry_set(es);
- exfat_free_dentry_set(es, inode_needs_sync(inode));
-
+ err = exfat_free_dentry_set(es, inode_needs_sync(inode));
+ if (err)
+ return err;
} /* end of if != DIR_DELETED */
inode->i_blocks +=
diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c
index 17d41f3d3709..d34e6193258d 100644
--- a/fs/exfat/misc.c
+++ b/fs/exfat/misc.c
@@ -163,9 +163,8 @@ u32 exfat_calc_chksum32(void *data, int len, u32 chksum, int type)
return chksum;
}
-void exfat_update_bh(struct super_block *sb, struct buffer_head *bh, int sync)
+void exfat_update_bh(struct buffer_head *bh, int sync)
{
- set_bit(EXFAT_SB_DIRTY, &EXFAT_SB(sb)->s_state);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
@@ -173,6 +172,25 @@ void exfat_update_bh(struct super_block *sb, struct buffer_head *bh, int sync)
sync_dirty_buffer(bh);
}
+int exfat_update_bhs(struct buffer_head **bhs, int nr_bhs, int sync)
+{
+ int i, err = 0;
+
+ for (i = 0; i < nr_bhs; i++) {
+ set_buffer_uptodate(bhs[i]);
+ mark_buffer_dirty(bhs[i]);
+ if (sync)
+ write_dirty_buffer(bhs[i], 0);
+ }
+
+ for (i = 0; i < nr_bhs && sync; i++) {
+ wait_on_buffer(bhs[i]);
+ if (!err && !buffer_uptodate(bhs[i]))
+ err = -EIO;
+ }
+ return err;
+}
+
void exfat_chain_set(struct exfat_chain *ec, unsigned int dir,
unsigned int size, unsigned char flags)
{
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 2b9e21094a96..e73f20f66cb2 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -387,7 +387,7 @@ static int exfat_find_empty_entry(struct inode *inode,
ep->dentry.stream.valid_size = cpu_to_le64(size);
ep->dentry.stream.size = ep->dentry.stream.valid_size;
ep->dentry.stream.flags = p_dir->flags;
- exfat_update_bh(sb, bh, IS_DIRSYNC(inode));
+ exfat_update_bh(bh, IS_DIRSYNC(inode));
brelse(bh);
if (exfat_update_dir_chksum(inode, &(ei->dir),
ei->entry))
@@ -562,10 +562,10 @@ static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
int err;
mutex_lock(&EXFAT_SB(sb)->s_lock);
- exfat_set_vol_flags(sb, VOL_DIRTY);
+ exfat_set_volume_dirty(sb);
err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_FILE,
&info);
- exfat_set_vol_flags(sb, VOL_CLEAN);
+ exfat_clear_volume_dirty(sb);
if (err)
goto unlock;
@@ -834,7 +834,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
num_entries++;
brelse(bh);
- exfat_set_vol_flags(sb, VOL_DIRTY);
+ exfat_set_volume_dirty(sb);
/* update the directory entry */
if (exfat_remove_entries(dir, &cdir, entry, 0, num_entries)) {
err = -EIO;
@@ -843,7 +843,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
/* This doesn't modify ei */
ei->dir.dir = DIR_DELETED;
- exfat_set_vol_flags(sb, VOL_CLEAN);
+ exfat_clear_volume_dirty(sb);
inode_inc_iversion(dir);
dir->i_mtime = dir->i_atime = current_time(dir);
@@ -873,10 +873,10 @@ static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
int err;
mutex_lock(&EXFAT_SB(sb)->s_lock);
- exfat_set_vol_flags(sb, VOL_DIRTY);
+ exfat_set_volume_dirty(sb);
err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_DIR,
&info);
- exfat_set_vol_flags(sb, VOL_CLEAN);
+ exfat_clear_volume_dirty(sb);
if (err)
goto unlock;
@@ -1001,14 +1001,14 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
num_entries++;
brelse(bh);
- exfat_set_vol_flags(sb, VOL_DIRTY);
+ exfat_set_volume_dirty(sb);
err = exfat_remove_entries(dir, &cdir, entry, 0, num_entries);
if (err) {
exfat_err(sb, "failed to exfat_remove_entries : err(%d)", err);
goto unlock;
}
ei->dir.dir = DIR_DELETED;
- exfat_set_vol_flags(sb, VOL_CLEAN);
+ exfat_clear_volume_dirty(sb);
inode_inc_iversion(dir);
dir->i_mtime = dir->i_atime = current_time(dir);
@@ -1071,7 +1071,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
ei->attr |= ATTR_ARCHIVE;
}
- exfat_update_bh(sb, new_bh, sync);
+ exfat_update_bh(new_bh, sync);
brelse(old_bh);
brelse(new_bh);
@@ -1087,7 +1087,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
}
memcpy(epnew, epold, DENTRY_SIZE);
- exfat_update_bh(sb, new_bh, sync);
+ exfat_update_bh(new_bh, sync);
brelse(old_bh);
brelse(new_bh);
@@ -1104,7 +1104,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
epold->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
ei->attr |= ATTR_ARCHIVE;
}
- exfat_update_bh(sb, old_bh, sync);
+ exfat_update_bh(old_bh, sync);
brelse(old_bh);
ret = exfat_init_ext_entry(inode, p_dir, oldentry,
num_new_entries, p_uniname);
@@ -1159,7 +1159,7 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
ei->attr |= ATTR_ARCHIVE;
}
- exfat_update_bh(sb, new_bh, IS_DIRSYNC(inode));
+ exfat_update_bh(new_bh, IS_DIRSYNC(inode));
brelse(mov_bh);
brelse(new_bh);
@@ -1175,7 +1175,7 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
}
memcpy(epnew, epmov, DENTRY_SIZE);
- exfat_update_bh(sb, new_bh, IS_DIRSYNC(inode));
+ exfat_update_bh(new_bh, IS_DIRSYNC(inode));
brelse(mov_bh);
brelse(new_bh);
@@ -1300,7 +1300,7 @@ static int __exfat_rename(struct inode *old_parent_inode,
if (ret)
goto out;
- exfat_set_vol_flags(sb, VOL_DIRTY);
+ exfat_set_volume_dirty(sb);
if (olddir.dir == newdir.dir)
ret = exfat_rename_file(new_parent_inode, &olddir, dentry,
@@ -1355,7 +1355,7 @@ del_out:
*/
new_ei->dir.dir = DIR_DELETED;
}
- exfat_set_vol_flags(sb, VOL_CLEAN);
+ exfat_clear_volume_dirty(sb);
out:
return ret;
}
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 253a92460d52..3b6a1659892f 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -45,9 +45,6 @@ static void exfat_put_super(struct super_block *sb)
struct exfat_sb_info *sbi = EXFAT_SB(sb);
mutex_lock(&sbi->s_lock);
- if (test_and_clear_bit(EXFAT_SB_DIRTY, &sbi->s_state))
- sync_blockdev(sb->s_bdev);
- exfat_set_vol_flags(sb, VOL_CLEAN);
exfat_free_bitmap(sbi);
brelse(sbi->boot_bh);
mutex_unlock(&sbi->s_lock);
@@ -60,13 +57,14 @@ static int exfat_sync_fs(struct super_block *sb, int wait)
struct exfat_sb_info *sbi = EXFAT_SB(sb);
int err = 0;
+ if (!wait)
+ return 0;
+
/* If there are some dirty buffers in the bdev inode */
mutex_lock(&sbi->s_lock);
- if (test_and_clear_bit(EXFAT_SB_DIRTY, &sbi->s_state)) {
- sync_blockdev(sb->s_bdev);
- if (exfat_set_vol_flags(sb, VOL_CLEAN))
- err = -EIO;
- }
+ sync_blockdev(sb->s_bdev);
+ if (exfat_clear_volume_dirty(sb))
+ err = -EIO;
mutex_unlock(&sbi->s_lock);
return err;
}
@@ -98,17 +96,20 @@ static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag)
+static int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flags)
{
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct boot_sector *p_boot = (struct boot_sector *)sbi->boot_bh->b_data;
bool sync;
+ /* retain persistent-flags */
+ new_flags |= sbi->vol_flags_persistent;
+
/* flags are not changed */
- if (sbi->vol_flag == new_flag)
+ if (sbi->vol_flags == new_flags)
return 0;
- sbi->vol_flag = new_flag;
+ sbi->vol_flags = new_flags;
/* skip updating volume dirty flag,
* if this volume has been mounted with read-only
@@ -116,9 +117,9 @@ int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag)
if (sb_rdonly(sb))
return 0;
- p_boot->vol_flags = cpu_to_le16(new_flag);
+ p_boot->vol_flags = cpu_to_le16(new_flags);
- if (new_flag == VOL_DIRTY && !buffer_dirty(sbi->boot_bh))
+ if ((new_flags & VOLUME_DIRTY) && !buffer_dirty(sbi->boot_bh))
sync = true;
else
sync = false;
@@ -131,6 +132,20 @@ int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag)
return 0;
}
+int exfat_set_volume_dirty(struct super_block *sb)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+ return exfat_set_vol_flags(sb, sbi->vol_flags | VOLUME_DIRTY);
+}
+
+int exfat_clear_volume_dirty(struct super_block *sb)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+ return exfat_set_vol_flags(sb, sbi->vol_flags & ~VOLUME_DIRTY);
+}
+
static int exfat_show_options(struct seq_file *m, struct dentry *root)
{
struct super_block *sb = root->d_sb;
@@ -459,7 +474,8 @@ static int exfat_read_boot_sector(struct super_block *sb)
sbi->dentries_per_clu = 1 <<
(sbi->cluster_size_bits - DENTRY_SIZE_BITS);
- sbi->vol_flag = le16_to_cpu(p_boot->vol_flags);
+ sbi->vol_flags = le16_to_cpu(p_boot->vol_flags);
+ sbi->vol_flags_persistent = sbi->vol_flags & (VOLUME_DIRTY | MEDIA_FAILURE);
sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER;
sbi->used_clusters = EXFAT_CLUSTERS_UNTRACKED;
@@ -474,9 +490,9 @@ static int exfat_read_boot_sector(struct super_block *sb)
exfat_err(sb, "bogus data start sector");
return -EINVAL;
}
- if (sbi->vol_flag & VOL_DIRTY)
+ if (sbi->vol_flags & VOLUME_DIRTY)
exfat_warn(sb, "Volume was not properly unmounted. Some data may be corrupt. Please run fsck.");
- if (sbi->vol_flag & ERR_MEDIUM)
+ if (sbi->vol_flags & MEDIA_FAILURE)
exfat_warn(sb, "Medium has reported failures. Some data may be lost.");
/* exFAT file size is limited by a disk volume size */
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
index ca31993dcb47..66532a71e8fd 100644
--- a/fs/fat/Kconfig
+++ b/fs/fat/Kconfig
@@ -41,7 +41,7 @@ config MSDOS_FS
they are compressed; to access compressed MSDOS partitions under
Linux, you can either use the DOS emulator DOSEMU, described in the
DOSEMU-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>, or try dmsdosfs in
+ <https://www.tldp.org/docs.html#howto>, or try dmsdosfs in
<ftp://ibiblio.org/pub/Linux/system/filesystems/dosfs/>. If you
intend to use dosemu with a non-compressed MSDOS partition, say Y
here) and MSDOS floppies. This means that file access becomes
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index bbfe18c07417..f7e3304b7802 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -657,6 +657,9 @@ static void fat_ra_init(struct super_block *sb, struct fatent_ra *ra,
unsigned long ra_pages = sb->s_bdi->ra_pages;
unsigned int reada_blocks;
+ if (fatent->entry >= ent_limit)
+ return;
+
if (ra_pages > sb->s_bdi->io_pages)
ra_pages = rounddown(ra_pages, sb->s_bdi->io_pages);
reada_blocks = ra_pages << (PAGE_SHIFT - sb->s_blocksize_bits + 1);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 42134c58c87e..f9ee27cf4d7c 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -25,9 +25,9 @@ static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
{
u32 attr;
- inode_lock(inode);
+ inode_lock_shared(inode);
attr = fat_make_attrs(inode);
- inode_unlock(inode);
+ inode_unlock_shared(inode);
return put_user(attr, user_attr);
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 523954d00dff..b5c109703daa 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1364,6 +1364,12 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_magic = HUGETLBFS_MAGIC;
sb->s_op = &hugetlbfs_ops;
sb->s_time_gran = 1;
+
+ /*
+ * Due to the special and limited functionality of hugetlbfs, it does
+ * not work well as a stacking filesystem.
+ */
+ sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx));
if (!sb->s_root)
goto out_free;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2a3af95be4ca..dc506b75659c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -508,9 +508,9 @@ struct io_async_msghdr {
struct io_async_rw {
struct iovec fast_iov[UIO_FASTIOV];
- struct iovec *iov;
- ssize_t nr_segs;
- ssize_t size;
+ const struct iovec *free_iovec;
+ struct iov_iter iter;
+ size_t bytes_done;
struct wait_page_queue wpq;
};
@@ -898,6 +898,7 @@ static void io_put_req(struct io_kiocb *req);
static void io_double_put_req(struct io_kiocb *req);
static void __io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
+static void __io_queue_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
@@ -914,9 +915,9 @@ static void io_file_put_work(struct work_struct *work);
static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
struct iovec **iovec, struct iov_iter *iter,
bool needs_lock);
-static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
- struct iovec *iovec, struct iovec *fast_iov,
- struct iov_iter *iter);
+static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
+ const struct iovec *fast_iov,
+ struct iov_iter *iter, bool force);
static struct kmem_cache *req_cachep;
@@ -1107,10 +1108,16 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
}
}
-static void io_req_clean_work(struct io_kiocb *req)
+/*
+ * Returns true if we need to defer file table putting. This can only happen
+ * from the error path with REQ_F_COMP_LOCKED set.
+ */
+static bool io_req_clean_work(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_WORK_INITIALIZED))
- return;
+ return false;
+
+ req->flags &= ~REQ_F_WORK_INITIALIZED;
if (req->work.mm) {
mmdrop(req->work.mm);
@@ -1123,6 +1130,9 @@ static void io_req_clean_work(struct io_kiocb *req)
if (req->work.fs) {
struct fs_struct *fs = req->work.fs;
+ if (req->flags & REQ_F_COMP_LOCKED)
+ return true;
+
spin_lock(&req->work.fs->lock);
if (--fs->users)
fs = NULL;
@@ -1131,7 +1141,8 @@ static void io_req_clean_work(struct io_kiocb *req)
free_fs_struct(fs);
req->work.fs = NULL;
}
- req->flags &= ~REQ_F_WORK_INITIALIZED;
+
+ return false;
}
static void io_prep_async_work(struct io_kiocb *req)
@@ -1179,7 +1190,7 @@ static void io_prep_async_link(struct io_kiocb *req)
io_prep_async_work(cur);
}
-static void __io_queue_async_work(struct io_kiocb *req)
+static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link = io_prep_linked_timeout(req);
@@ -1187,16 +1198,19 @@ static void __io_queue_async_work(struct io_kiocb *req)
trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
&req->work, req->flags);
io_wq_enqueue(ctx->io_wq, &req->work);
-
- if (link)
- io_queue_linked_timeout(link);
+ return link;
}
static void io_queue_async_work(struct io_kiocb *req)
{
+ struct io_kiocb *link;
+
/* init ->work of the whole link before punting */
io_prep_async_link(req);
- __io_queue_async_work(req);
+ link = __io_queue_async_work(req);
+
+ if (link)
+ io_queue_linked_timeout(link);
}
static void io_kill_timeout(struct io_kiocb *req)
@@ -1229,12 +1243,19 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
do {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
+ struct io_kiocb *link;
if (req_need_defer(de->req, de->seq))
break;
list_del_init(&de->list);
/* punt-init is done before queueing for defer */
- __io_queue_async_work(de->req);
+ link = __io_queue_async_work(de->req);
+ if (link) {
+ __io_queue_linked_timeout(link);
+ /* drop submission reference */
+ link->flags |= REQ_F_COMP_LOCKED;
+ io_put_req(link);
+ }
kfree(de);
} while (!list_empty(&ctx->defer_list));
}
@@ -1533,7 +1554,7 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file,
fput(file);
}
-static void io_dismantle_req(struct io_kiocb *req)
+static bool io_dismantle_req(struct io_kiocb *req)
{
io_clean_op(req);
@@ -1541,7 +1562,6 @@ static void io_dismantle_req(struct io_kiocb *req)
kfree(req->io);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
- io_req_clean_work(req);
if (req->flags & REQ_F_INFLIGHT) {
struct io_ring_ctx *ctx = req->ctx;
@@ -1553,15 +1573,15 @@ static void io_dismantle_req(struct io_kiocb *req)
wake_up(&ctx->inflight_wait);
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
}
+
+ return io_req_clean_work(req);
}
-static void __io_free_req(struct io_kiocb *req)
+static void __io_free_req_finish(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx;
+ struct io_ring_ctx *ctx = req->ctx;
- io_dismantle_req(req);
__io_put_req_task(req);
- ctx = req->ctx;
if (likely(!io_is_fallback_req(req)))
kmem_cache_free(req_cachep, req);
else
@@ -1569,6 +1589,39 @@ static void __io_free_req(struct io_kiocb *req)
percpu_ref_put(&ctx->refs);
}
+static void io_req_task_file_table_put(struct callback_head *cb)
+{
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+ struct fs_struct *fs = req->work.fs;
+
+ spin_lock(&req->work.fs->lock);
+ if (--fs->users)
+ fs = NULL;
+ spin_unlock(&req->work.fs->lock);
+ if (fs)
+ free_fs_struct(fs);
+ req->work.fs = NULL;
+ __io_free_req_finish(req);
+}
+
+static void __io_free_req(struct io_kiocb *req)
+{
+ if (!io_dismantle_req(req)) {
+ __io_free_req_finish(req);
+ } else {
+ int ret;
+
+ init_task_work(&req->task_work, io_req_task_file_table_put);
+ ret = task_work_add(req->task, &req->task_work, TWA_RESUME);
+ if (unlikely(ret)) {
+ struct task_struct *tsk;
+
+ tsk = io_wq_get_task(req->ctx->io_wq);
+ task_work_add(tsk, &req->task_work, 0);
+ }
+ }
+}
+
static bool io_link_cancel_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -1598,6 +1651,7 @@ static bool __io_kill_linked_timeout(struct io_kiocb *req)
return false;
list_del_init(&link->link_list);
+ link->flags |= REQ_F_COMP_LOCKED;
wake_ev = io_link_cancel_timeout(link);
req->flags &= ~REQ_F_LINK_TIMEOUT;
return wake_ev;
@@ -1656,6 +1710,7 @@ static void __io_fail_links(struct io_kiocb *req)
trace_io_uring_fail_link(req, link);
io_cqring_fill_event(link, -ECANCELED);
+ link->flags |= REQ_F_COMP_LOCKED;
__io_double_put_req(link);
req->flags &= ~REQ_F_LINK_TIMEOUT;
}
@@ -1710,22 +1765,22 @@ static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
{
struct task_struct *tsk = req->task;
struct io_ring_ctx *ctx = req->ctx;
- int ret, notify = TWA_RESUME;
+ int ret, notify;
/*
- * SQPOLL kernel thread doesn't need notification, just a wakeup.
- * If we're not using an eventfd, then TWA_RESUME is always fine,
- * as we won't have dependencies between request completions for
- * other kernel wait conditions.
+ * SQPOLL kernel thread doesn't need notification, just a wakeup. For
+ * all other cases, use TWA_SIGNAL unconditionally to ensure we're
+ * processing task_work. There's no reliable way to tell if TWA_RESUME
+ * will do the job.
*/
- if (ctx->flags & IORING_SETUP_SQPOLL)
- notify = 0;
- else if (ctx->cq_ev_fd)
+ notify = 0;
+ if (!(ctx->flags & IORING_SETUP_SQPOLL))
notify = TWA_SIGNAL;
ret = task_work_add(tsk, cb, notify);
if (!ret)
wake_up_process(tsk);
+
return ret;
}
@@ -1766,8 +1821,10 @@ static void __io_req_task_submit(struct io_kiocb *req)
static void io_req_task_submit(struct callback_head *cb)
{
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+ struct io_ring_ctx *ctx = req->ctx;
__io_req_task_submit(req);
+ percpu_ref_put(&ctx->refs);
}
static void io_req_task_queue(struct io_kiocb *req)
@@ -1775,6 +1832,7 @@ static void io_req_task_queue(struct io_kiocb *req)
int ret;
init_task_work(&req->task_work, io_req_task_submit);
+ percpu_ref_get(&req->ctx->refs);
ret = io_req_task_work_add(req, &req->task_work);
if (unlikely(ret)) {
@@ -1855,7 +1913,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
req->flags &= ~REQ_F_TASK_PINNED;
}
- io_dismantle_req(req);
+ WARN_ON_ONCE(io_dismantle_req(req));
rb->reqs[rb->to_free++] = req;
if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
__io_req_free_batch_flush(req->ctx, rb);
@@ -2241,7 +2299,7 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error)
ret = io_import_iovec(rw, req, &iovec, &iter, false);
if (ret < 0)
goto end_req;
- ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter);
+ ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
if (!ret)
return true;
kfree(iovec);
@@ -2263,6 +2321,8 @@ static void io_rw_resubmit(struct callback_head *cb)
refcount_inc(&req->refs);
io_queue_async_work(req);
}
+
+ percpu_ref_put(&ctx->refs);
}
#endif
@@ -2275,6 +2335,8 @@ static bool io_rw_reissue(struct io_kiocb *req, long res)
return false;
init_task_work(&req->task_work, io_rw_resubmit);
+ percpu_ref_get(&req->ctx->refs);
+
ret = io_req_task_work_add(req, &req->task_work);
if (!ret)
return true;
@@ -2527,6 +2589,14 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
+ /* add previously done IO, if any */
+ if (req->io && req->io->rw.bytes_done > 0) {
+ if (ret < 0)
+ ret = req->io->rw.bytes_done;
+ else
+ ret += req->io->rw.bytes_done;
+ }
+
if (req->flags & REQ_F_CUR_POS)
req->file->f_pos = kiocb->ki_pos;
if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
@@ -2758,6 +2828,13 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
ssize_t ret;
u8 opcode;
+ if (req->io) {
+ struct io_async_rw *iorw = &req->io->rw;
+
+ *iovec = NULL;
+ return iov_iter_count(&iorw->iter);
+ }
+
opcode = req->opcode;
if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
*iovec = NULL;
@@ -2783,14 +2860,6 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
return ret < 0 ? ret : sqe_len;
}
- if (req->io) {
- struct io_async_rw *iorw = &req->io->rw;
-
- iov_iter_init(iter, rw, iorw->iov, iorw->nr_segs, iorw->size);
- *iovec = NULL;
- return iorw->size;
- }
-
if (req->flags & REQ_F_BUFFER_SELECT) {
ret = io_iov_buffer_select(req, *iovec, needs_lock);
if (!ret) {
@@ -2868,21 +2937,30 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
return ret;
}
-static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
- struct iovec *iovec, struct iovec *fast_iov,
- struct iov_iter *iter)
+static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
+ const struct iovec *fast_iov, struct iov_iter *iter)
{
struct io_async_rw *rw = &req->io->rw;
- rw->nr_segs = iter->nr_segs;
- rw->size = io_size;
+ memcpy(&rw->iter, iter, sizeof(*iter));
+ rw->free_iovec = NULL;
+ rw->bytes_done = 0;
+ /* can only be fixed buffers, no need to do anything */
+ if (iter->type == ITER_BVEC)
+ return;
if (!iovec) {
- rw->iov = rw->fast_iov;
- if (rw->iov != fast_iov)
- memcpy(rw->iov, fast_iov,
+ unsigned iov_off = 0;
+
+ rw->iter.iov = rw->fast_iov;
+ if (iter->iov != fast_iov) {
+ iov_off = iter->iov - fast_iov;
+ rw->iter.iov += iov_off;
+ }
+ if (rw->fast_iov != fast_iov)
+ memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
sizeof(struct iovec) * iter->nr_segs);
} else {
- rw->iov = iovec;
+ rw->free_iovec = iovec;
req->flags |= REQ_F_NEED_CLEANUP;
}
}
@@ -2901,17 +2979,17 @@ static int io_alloc_async_ctx(struct io_kiocb *req)
return __io_alloc_async_ctx(req);
}
-static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
- struct iovec *iovec, struct iovec *fast_iov,
- struct iov_iter *iter)
+static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
+ const struct iovec *fast_iov,
+ struct iov_iter *iter, bool force)
{
- if (!io_op_defs[req->opcode].async_ctx)
+ if (!force && !io_op_defs[req->opcode].async_ctx)
return 0;
if (!req->io) {
if (__io_alloc_async_ctx(req))
return -ENOMEM;
- io_req_map_rw(req, io_size, iovec, fast_iov, iter);
+ io_req_map_rw(req, iovec, fast_iov, iter);
}
return 0;
}
@@ -2919,18 +2997,19 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
static inline int io_rw_prep_async(struct io_kiocb *req, int rw,
bool force_nonblock)
{
- struct io_async_ctx *io = req->io;
- struct iov_iter iter;
+ struct io_async_rw *iorw = &req->io->rw;
ssize_t ret;
- io->rw.iov = io->rw.fast_iov;
+ iorw->iter.iov = iorw->fast_iov;
+ /* reset ->io around the iovec import, we don't want to use it */
req->io = NULL;
- ret = io_import_iovec(rw, req, &io->rw.iov, &iter, !force_nonblock);
- req->io = io;
+ ret = io_import_iovec(rw, req, (struct iovec **) &iorw->iter.iov,
+ &iorw->iter, !force_nonblock);
+ req->io = container_of(iorw, struct io_async_ctx, rw);
if (unlikely(ret < 0))
return ret;
- io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
+ io_req_map_rw(req, iorw->iter.iov, iorw->fast_iov, &iorw->iter);
return 0;
}
@@ -2952,6 +3031,16 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return io_rw_prep_async(req, READ, force_nonblock);
}
+/*
+ * This is our waitqueue callback handler, registered through lock_page_async()
+ * when we initially tried to do the IO with the iocb armed our waitqueue.
+ * This gets called when the page is unlocked, and we generally expect that to
+ * happen when the page IO is completed and the page is now uptodate. This will
+ * queue a task_work based retry of the operation, attempting to copy the data
+ * again. If the latter fails because the page was NOT uptodate, then we will
+ * do a thread based blocking retry of the operation. That's the unexpected
+ * slow path.
+ */
static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
int sync, void *arg)
{
@@ -2965,13 +3054,11 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
if (!wake_page_match(wpq, key))
return 0;
- /* Stop waking things up if the page is locked again */
- if (test_bit(key->bit_nr, &key->page->flags))
- return -1;
-
list_del_init(&wait->entry);
init_task_work(&req->task_work, io_req_task_submit);
+ percpu_ref_get(&req->ctx->refs);
+
/* submit ref gets dropped, acquire a new one */
refcount_inc(&req->refs);
ret = io_req_task_work_add(req, &req->task_work);
@@ -3008,7 +3095,18 @@ static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb,
return -EOPNOTSUPP;
}
-
+/*
+ * This controls whether a given IO request should be armed for async page
+ * based retry. If we return false here, the request is handed to the async
+ * worker threads for retry. If we're doing buffered reads on a regular file,
+ * we prepare a private wait_page_queue entry and retry the operation. This
+ * will either succeed because the page is now uptodate and unlocked, or it
+ * will register a callback when the page is unlocked at IO completion. Through
+ * that callback, io_uring uses task_work to setup a retry of the operation.
+ * That retry will attempt the buffered read again. The retry will generally
+ * succeed, or in rare cases where it fails, we then fall back to using the
+ * async worker threads for a blocking retry.
+ */
static bool io_rw_should_retry(struct io_kiocb *req)
{
struct kiocb *kiocb = &req->rw.kiocb;
@@ -3018,8 +3116,8 @@ static bool io_rw_should_retry(struct io_kiocb *req)
if (req->flags & REQ_F_NOWAIT)
return false;
- /* already tried, or we're doing O_DIRECT */
- if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ))
+ /* Only for buffered IO */
+ if (kiocb->ki_flags & IOCB_DIRECT)
return false;
/*
* just use poll if we can, and don't attempt if the fs doesn't
@@ -3028,13 +3126,6 @@ static bool io_rw_should_retry(struct io_kiocb *req)
if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
return false;
- /*
- * If request type doesn't require req->io to defer in general,
- * we need to allocate it here
- */
- if (!req->io && __io_alloc_async_ctx(req))
- return false;
-
ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq,
io_async_buf_func, req);
if (!ret) {
@@ -3049,7 +3140,10 @@ static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
{
if (req->file->f_op->read_iter)
return call_read_iter(req->file, &req->rw.kiocb, iter);
- return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter);
+ else if (req->file->f_op->read)
+ return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter);
+ else
+ return -EINVAL;
}
static int io_read(struct io_kiocb *req, bool force_nonblock,
@@ -3057,16 +3151,19 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
- struct iov_iter iter;
- size_t iov_count;
+ struct iov_iter __iter, *iter = &__iter;
ssize_t io_size, ret, ret2;
- unsigned long nr_segs;
+ size_t iov_count;
+
+ if (req->io)
+ iter = &req->io->rw.iter;
- ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
+ ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
if (ret < 0)
return ret;
io_size = ret;
req->result = io_size;
+ ret = 0;
/* Ensure we clear previously set non-block flag */
if (!force_nonblock)
@@ -3076,40 +3173,70 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
if (force_nonblock && !io_file_supports_async(req->file, READ))
goto copy_iov;
- iov_count = iov_iter_count(&iter);
- nr_segs = iter.nr_segs;
+ iov_count = iov_iter_count(iter);
ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
if (unlikely(ret))
goto out_free;
- ret2 = io_iter_do_read(req, &iter);
+ ret = io_iter_do_read(req, iter);
- /* Catch -EAGAIN return for forced non-blocking submission */
- if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) {
- kiocb_done(kiocb, ret2, cs);
- } else {
- iter.count = iov_count;
- iter.nr_segs = nr_segs;
-copy_iov:
- ret = io_setup_async_rw(req, io_size, iovec, inline_vecs,
- &iter);
+ if (!ret) {
+ goto done;
+ } else if (ret == -EIOCBQUEUED) {
+ ret = 0;
+ goto out_free;
+ } else if (ret == -EAGAIN) {
+ if (!force_nonblock)
+ goto done;
+ ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
if (ret)
goto out_free;
- /* it's copied and will be cleaned with ->io */
- iovec = NULL;
- /* if we can retry, do so with the callbacks armed */
- if (io_rw_should_retry(req)) {
- ret2 = io_iter_do_read(req, &iter);
- if (ret2 == -EIOCBQUEUED) {
- goto out_free;
- } else if (ret2 != -EAGAIN) {
- kiocb_done(kiocb, ret2, cs);
- goto out_free;
- }
- }
+ return -EAGAIN;
+ } else if (ret < 0) {
+ goto out_free;
+ }
+
+ /* read it all, or we did blocking attempt. no retry. */
+ if (!iov_iter_count(iter) || !force_nonblock ||
+ (req->file->f_flags & O_NONBLOCK))
+ goto done;
+
+ io_size -= ret;
+copy_iov:
+ ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
+ if (ret2) {
+ ret = ret2;
+ goto out_free;
+ }
+ /* it's copied and will be cleaned with ->io */
+ iovec = NULL;
+ /* now use our persistent iterator, if we aren't already */
+ iter = &req->io->rw.iter;
+retry:
+ req->io->rw.bytes_done += ret;
+ /* if we can retry, do so with the callbacks armed */
+ if (!io_rw_should_retry(req)) {
kiocb->ki_flags &= ~IOCB_WAITQ;
return -EAGAIN;
}
+
+ /*
+ * Now retry read with the IOCB_WAITQ parts set in the iocb. If we
+ * get -EIOCBQUEUED, then we'll get a notification when the desired
+ * page gets unlocked. We can also get a partial read here, and if we
+ * do, then just retry at the new offset.
+ */
+ ret = io_iter_do_read(req, iter);
+ if (ret == -EIOCBQUEUED) {
+ ret = 0;
+ goto out_free;
+ } else if (ret > 0 && ret < io_size) {
+ /* we got some bytes, but not all. retry. */
+ goto retry;
+ }
+done:
+ kiocb_done(kiocb, ret, cs);
+ ret = 0;
out_free:
if (iovec)
kfree(iovec);
@@ -3139,12 +3266,14 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
- struct iov_iter iter;
+ struct iov_iter __iter, *iter = &__iter;
size_t iov_count;
ssize_t ret, ret2, io_size;
- unsigned long nr_segs;
- ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
+ if (req->io)
+ iter = &req->io->rw.iter;
+
+ ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
if (ret < 0)
return ret;
io_size = ret;
@@ -3163,8 +3292,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
(req->flags & REQ_F_ISREG))
goto copy_iov;
- iov_count = iov_iter_count(&iter);
- nr_segs = iter.nr_segs;
+ iov_count = iov_iter_count(iter);
ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
if (unlikely(ret))
goto out_free;
@@ -3185,9 +3313,11 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
kiocb->ki_flags |= IOCB_WRITE;
if (req->file->f_op->write_iter)
- ret2 = call_write_iter(req->file, kiocb, &iter);
+ ret2 = call_write_iter(req->file, kiocb, iter);
+ else if (req->file->f_op->write)
+ ret2 = loop_rw_iter(WRITE, req->file, kiocb, iter);
else
- ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
+ ret2 = -EINVAL;
/*
* Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
@@ -3198,16 +3328,10 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
if (!force_nonblock || ret2 != -EAGAIN) {
kiocb_done(kiocb, ret2, cs);
} else {
- iter.count = iov_count;
- iter.nr_segs = nr_segs;
copy_iov:
- ret = io_setup_async_rw(req, io_size, iovec, inline_vecs,
- &iter);
- if (ret)
- goto out_free;
- /* it's copied and will be cleaned with ->io */
- iovec = NULL;
- return -EAGAIN;
+ ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
+ if (!ret)
+ return -EAGAIN;
}
out_free:
if (iovec)
@@ -4488,6 +4612,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
req->result = mask;
init_task_work(&req->task_work, func);
+ percpu_ref_get(&req->ctx->refs);
+
/*
* If this fails, then the task is exiting. When a task exits, the
* work gets canceled, so just cancel this request as well instead
@@ -4526,9 +4652,24 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
return false;
}
-static void io_poll_remove_double(struct io_kiocb *req, void *data)
+static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
{
- struct io_poll_iocb *poll = data;
+ /* pure poll stashes this in ->io, poll driven retry elsewhere */
+ if (req->opcode == IORING_OP_POLL_ADD)
+ return (struct io_poll_iocb *) req->io;
+ return req->apoll->double_poll;
+}
+
+static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
+{
+ if (req->opcode == IORING_OP_POLL_ADD)
+ return &req->poll;
+ return &req->apoll->poll;
+}
+
+static void io_poll_remove_double(struct io_kiocb *req)
+{
+ struct io_poll_iocb *poll = io_poll_get_double(req);
lockdep_assert_held(&req->ctx->completion_lock);
@@ -4548,7 +4689,7 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
{
struct io_ring_ctx *ctx = req->ctx;
- io_poll_remove_double(req, req->io);
+ io_poll_remove_double(req);
req->poll.done = true;
io_cqring_fill_event(req, error ? error : mangle_poll(mask));
io_commit_cqring(ctx);
@@ -4575,18 +4716,20 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
static void io_poll_task_func(struct callback_head *cb)
{
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+ struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *nxt = NULL;
io_poll_task_handler(req, &nxt);
if (nxt)
__io_req_task_submit(nxt);
+ percpu_ref_put(&ctx->refs);
}
static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
int sync, void *key)
{
struct io_kiocb *req = wait->private;
- struct io_poll_iocb *poll = req->apoll->double_poll;
+ struct io_poll_iocb *poll = io_poll_get_single(req);
__poll_t mask = key_to_poll(key);
/* for instances that support it check for an event match first: */
@@ -4600,6 +4743,8 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
done = list_empty(&poll->wait.entry);
if (!done)
list_del_init(&poll->wait.entry);
+ /* make sure double remove sees this as being gone */
+ wait->private = NULL;
spin_unlock(&poll->head->lock);
if (!done)
__io_async_wake(req, poll, mask, io_poll_task_func);
@@ -4675,6 +4820,7 @@ static void io_async_task_func(struct callback_head *cb)
if (io_poll_rewait(req, &apoll->poll)) {
spin_unlock_irq(&ctx->completion_lock);
+ percpu_ref_put(&ctx->refs);
return;
}
@@ -4682,7 +4828,7 @@ static void io_async_task_func(struct callback_head *cb)
if (hash_hashed(&req->hash_node))
hash_del(&req->hash_node);
- io_poll_remove_double(req, apoll->double_poll);
+ io_poll_remove_double(req);
spin_unlock_irq(&ctx->completion_lock);
if (!READ_ONCE(apoll->poll.canceled))
@@ -4690,6 +4836,7 @@ static void io_async_task_func(struct callback_head *cb)
else
__io_req_task_cancel(req, -ECANCELED);
+ percpu_ref_put(&ctx->refs);
kfree(apoll->double_poll);
kfree(apoll);
}
@@ -4791,8 +4938,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
io_async_wake);
- if (ret) {
- io_poll_remove_double(req, apoll->double_poll);
+ if (ret || ipt.error) {
+ io_poll_remove_double(req);
spin_unlock_irq(&ctx->completion_lock);
kfree(apoll->double_poll);
kfree(apoll);
@@ -4824,14 +4971,13 @@ static bool io_poll_remove_one(struct io_kiocb *req)
{
bool do_complete;
+ io_poll_remove_double(req);
+
if (req->opcode == IORING_OP_POLL_ADD) {
- io_poll_remove_double(req, req->io);
do_complete = __io_poll_remove_one(req, &req->poll);
} else {
struct async_poll *apoll = req->apoll;
- io_poll_remove_double(req, apoll->double_poll);
-
/* non-poll requests have submit ref still */
do_complete = __io_poll_remove_one(req, &apoll->poll);
if (do_complete) {
@@ -4845,6 +4991,7 @@ static bool io_poll_remove_one(struct io_kiocb *req)
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(req->ctx);
req->flags |= REQ_F_COMP_LOCKED;
+ req_set_fail_links(req);
io_put_req(req);
}
@@ -5017,6 +5164,23 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+static int __io_timeout_cancel(struct io_kiocb *req)
+{
+ int ret;
+
+ list_del_init(&req->timeout.list);
+
+ ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
+ if (ret == -1)
+ return -EALREADY;
+
+ req_set_fail_links(req);
+ req->flags |= REQ_F_COMP_LOCKED;
+ io_cqring_fill_event(req, -ECANCELED);
+ io_put_req(req);
+ return 0;
+}
+
static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
{
struct io_kiocb *req;
@@ -5024,7 +5188,6 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
if (user_data == req->user_data) {
- list_del_init(&req->timeout.list);
ret = 0;
break;
}
@@ -5033,14 +5196,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
if (ret == -ENOENT)
return ret;
- ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
- if (ret == -1)
- return -EALREADY;
-
- req_set_fail_links(req);
- io_cqring_fill_event(req, -ECANCELED);
- io_put_req(req);
- return 0;
+ return __io_timeout_cancel(req);
}
static int io_timeout_remove_prep(struct io_kiocb *req,
@@ -5481,8 +5637,8 @@ static void __io_clean_op(struct io_kiocb *req)
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
case IORING_OP_WRITE:
- if (io->rw.iov != io->rw.fast_iov)
- kfree(io->rw.iov);
+ if (io->rw.free_iovec)
+ kfree(io->rw.free_iovec);
break;
case IORING_OP_RECVMSG:
case IORING_OP_SENDMSG:
@@ -5917,15 +6073,12 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static void io_queue_linked_timeout(struct io_kiocb *req)
+static void __io_queue_linked_timeout(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx = req->ctx;
-
/*
* If the list is now empty, then our linked request finished before
* we got a chance to setup the timer
*/
- spin_lock_irq(&ctx->completion_lock);
if (!list_empty(&req->link_list)) {
struct io_timeout_data *data = &req->io->timeout;
@@ -5933,6 +6086,14 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
data->mode);
}
+}
+
+static void io_queue_linked_timeout(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock_irq(&ctx->completion_lock);
+ __io_queue_linked_timeout(req);
spin_unlock_irq(&ctx->completion_lock);
/* drop submission reference */
@@ -7837,6 +7998,71 @@ static bool io_wq_files_match(struct io_wq_work *work, void *data)
return work->files == files;
}
+/*
+ * Returns true if 'preq' is the link parent of 'req'
+ */
+static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)
+{
+ struct io_kiocb *link;
+
+ if (!(preq->flags & REQ_F_LINK_HEAD))
+ return false;
+
+ list_for_each_entry(link, &preq->link_list, link_list) {
+ if (link == req)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * We're looking to cancel 'req' because it's holding on to our files, but
+ * 'req' could be a link to another request. See if it is, and cancel that
+ * parent request if so.
+ */
+static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+ struct hlist_node *tmp;
+ struct io_kiocb *preq;
+ bool found = false;
+ int i;
+
+ spin_lock_irq(&ctx->completion_lock);
+ for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
+ struct hlist_head *list;
+
+ list = &ctx->cancel_hash[i];
+ hlist_for_each_entry_safe(preq, tmp, list, hash_node) {
+ found = io_match_link(preq, req);
+ if (found) {
+ io_poll_remove_one(preq);
+ break;
+ }
+ }
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+ return found;
+}
+
+static bool io_timeout_remove_link(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ struct io_kiocb *preq;
+ bool found = false;
+
+ spin_lock_irq(&ctx->completion_lock);
+ list_for_each_entry(preq, &ctx->timeout_list, timeout.list) {
+ found = io_match_link(preq, req);
+ if (found) {
+ __io_timeout_cancel(preq);
+ break;
+ }
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+ return found;
+}
+
static void io_uring_cancel_files(struct io_ring_ctx *ctx,
struct files_struct *files)
{
@@ -7891,6 +8117,9 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
}
} else {
io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
+ /* could be a link, check and remove if it is */
+ if (!io_poll_remove_link(ctx, cancel_req))
+ io_timeout_remove_link(ctx, cancel_req);
io_put_req(cancel_req);
}
@@ -8171,6 +8400,10 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
struct io_rings *rings;
size_t size, sq_array_offset;
+ /* make sure these are sane, as we already accounted them */
+ ctx->sq_entries = p->sq_entries;
+ ctx->cq_entries = p->cq_entries;
+
size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
if (size == SIZE_MAX)
return -EOVERFLOW;
@@ -8187,8 +8420,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
rings->cq_ring_entries = p->cq_entries;
ctx->sq_mask = rings->sq_ring_mask;
ctx->cq_mask = rings->cq_ring_mask;
- ctx->sq_entries = rings->sq_ring_entries;
- ctx->cq_entries = rings->cq_ring_entries;
size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
if (size == SIZE_MAX) {
@@ -8317,6 +8548,16 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
ctx->user = user;
ctx->creds = get_current_cred();
+ /*
+ * Account memory _before_ installing the file descriptor. Once
+ * the descriptor is installed, it can get closed at any time. Also
+ * do this before hitting the general error path, as ring freeing
+ * will un-account as well.
+ */
+ io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries),
+ ACCT_LOCKED);
+ ctx->limit_mem = limit_mem;
+
ret = io_allocate_scq_urings(ctx, p);
if (ret)
goto err;
@@ -8354,14 +8595,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
}
/*
- * Account memory _before_ installing the file descriptor. Once
- * the descriptor is installed, it can get closed at any time.
- */
- io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries),
- ACCT_LOCKED);
- ctx->limit_mem = limit_mem;
-
- /*
* Install ring fd as the very last thing, so we don't risk someone
* having closed it before we finish setup
*/
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 7cb5fd38eb14..7b09a9158e40 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -150,6 +150,25 @@ static int minix_remount (struct super_block * sb, int * flags, char * data)
return 0;
}
+static bool minix_check_superblock(struct super_block *sb)
+{
+ struct minix_sb_info *sbi = minix_sb(sb);
+
+ if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0)
+ return false;
+
+ /*
+ * s_max_size must not exceed the block mapping limitation. This check
+ * is only needed for V1 filesystems, since V2/V3 support an extra level
+ * of indirect blocks which places the limit well above U32_MAX.
+ */
+ if (sbi->s_version == MINIX_V1 &&
+ sb->s_maxbytes > (7 + 512 + 512*512) * BLOCK_SIZE)
+ return false;
+
+ return true;
+}
+
static int minix_fill_super(struct super_block *s, void *data, int silent)
{
struct buffer_head *bh;
@@ -185,7 +204,7 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
sbi->s_zmap_blocks = ms->s_zmap_blocks;
sbi->s_firstdatazone = ms->s_firstdatazone;
sbi->s_log_zone_size = ms->s_log_zone_size;
- sbi->s_max_size = ms->s_max_size;
+ s->s_maxbytes = ms->s_max_size;
s->s_magic = ms->s_magic;
if (s->s_magic == MINIX_SUPER_MAGIC) {
sbi->s_version = MINIX_V1;
@@ -216,7 +235,7 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
sbi->s_zmap_blocks = m3s->s_zmap_blocks;
sbi->s_firstdatazone = m3s->s_firstdatazone;
sbi->s_log_zone_size = m3s->s_log_zone_size;
- sbi->s_max_size = m3s->s_max_size;
+ s->s_maxbytes = m3s->s_max_size;
sbi->s_ninodes = m3s->s_ninodes;
sbi->s_nzones = m3s->s_zones;
sbi->s_dirsize = 64;
@@ -228,11 +247,12 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
} else
goto out_no_fs;
+ if (!minix_check_superblock(s))
+ goto out_illegal_sb;
+
/*
* Allocate the buffer map to keep the superblock small.
*/
- if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0)
- goto out_illegal_sb;
i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh);
map = kzalloc(i, GFP_KERNEL);
if (!map)
@@ -468,6 +488,13 @@ static struct inode *V1_minix_iget(struct inode *inode)
iget_failed(inode);
return ERR_PTR(-EIO);
}
+ if (raw_inode->i_nlinks == 0) {
+ printk("MINIX-fs: deleted inode referenced: %lu\n",
+ inode->i_ino);
+ brelse(bh);
+ iget_failed(inode);
+ return ERR_PTR(-ESTALE);
+ }
inode->i_mode = raw_inode->i_mode;
i_uid_write(inode, raw_inode->i_uid);
i_gid_write(inode, raw_inode->i_gid);
@@ -501,6 +528,13 @@ static struct inode *V2_minix_iget(struct inode *inode)
iget_failed(inode);
return ERR_PTR(-EIO);
}
+ if (raw_inode->i_nlinks == 0) {
+ printk("MINIX-fs: deleted inode referenced: %lu\n",
+ inode->i_ino);
+ brelse(bh);
+ iget_failed(inode);
+ return ERR_PTR(-ESTALE);
+ }
inode->i_mode = raw_inode->i_mode;
i_uid_write(inode, raw_inode->i_uid);
i_gid_write(inode, raw_inode->i_gid);
diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c
index 043c3fdbc8e7..446148792f41 100644
--- a/fs/minix/itree_common.c
+++ b/fs/minix/itree_common.c
@@ -75,6 +75,7 @@ static int alloc_branch(struct inode *inode,
int n = 0;
int i;
int parent = minix_new_block(inode);
+ int err = -ENOSPC;
branch[0].key = cpu_to_block(parent);
if (parent) for (n = 1; n < num; n++) {
@@ -85,6 +86,11 @@ static int alloc_branch(struct inode *inode,
break;
branch[n].key = cpu_to_block(nr);
bh = sb_getblk(inode->i_sb, parent);
+ if (!bh) {
+ minix_free_block(inode, nr);
+ err = -ENOMEM;
+ break;
+ }
lock_buffer(bh);
memset(bh->b_data, 0, bh->b_size);
branch[n].bh = bh;
@@ -103,7 +109,7 @@ static int alloc_branch(struct inode *inode,
bforget(branch[i].bh);
for (i = 0; i < n; i++)
minix_free_block(inode, block_to_cpu(branch[i].key));
- return -ENOSPC;
+ return err;
}
static inline int splice_branch(struct inode *inode,
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 046cc96ee7ad..1fed906042aa 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -29,12 +29,12 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
if (block < 0) {
printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n",
block, inode->i_sb->s_bdev);
- } else if (block >= (minix_sb(inode->i_sb)->s_max_size/BLOCK_SIZE)) {
- if (printk_ratelimit())
- printk("MINIX-fs: block_to_path: "
- "block %ld too big on dev %pg\n",
- block, inode->i_sb->s_bdev);
- } else if (block < 7) {
+ return 0;
+ }
+ if ((u64)block * BLOCK_SIZE >= inode->i_sb->s_maxbytes)
+ return 0;
+
+ if (block < 7) {
offsets[n++] = block;
} else if ((block -= 7) < 512) {
offsets[n++] = 7;
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index f7fc7ecccccc..9d00f31a2d9d 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -32,13 +32,12 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
if (block < 0) {
printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n",
block, sb->s_bdev);
- } else if ((u64)block * (u64)sb->s_blocksize >=
- minix_sb(sb)->s_max_size) {
- if (printk_ratelimit())
- printk("MINIX-fs: block_to_path: "
- "block %ld too big on dev %pg\n",
- block, sb->s_bdev);
- } else if (block < DIRCOUNT) {
+ return 0;
+ }
+ if ((u64)block * (u64)sb->s_blocksize >= sb->s_maxbytes)
+ return 0;
+
+ if (block < DIRCOUNT) {
offsets[n++] = block;
} else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
offsets[n++] = DIRCOUNT;
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index df081e8afcc3..168d45d3de73 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -32,7 +32,6 @@ struct minix_sb_info {
unsigned long s_zmap_blocks;
unsigned long s_firstdatazone;
unsigned long s_log_zone_size;
- unsigned long s_max_size;
int s_dirsize;
int s_namelen;
struct buffer_head ** s_imap;
diff --git a/fs/namei.c b/fs/namei.c
index fde8fe086c09..e99e2a9da0f7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2851,16 +2851,24 @@ static int may_open(const struct path *path, int acc_mode, int flag)
case S_IFDIR:
if (acc_mode & MAY_WRITE)
return -EISDIR;
+ if (acc_mode & MAY_EXEC)
+ return -EACCES;
break;
case S_IFBLK:
case S_IFCHR:
if (!may_open_dev(path))
return -EACCES;
- /*FALLTHRU*/
+ fallthrough;
case S_IFIFO:
case S_IFSOCK:
+ if (acc_mode & MAY_EXEC)
+ return -EACCES;
flag &= ~O_TRUNC;
break;
+ case S_IFREG:
+ if ((acc_mode & MAY_EXEC) && path_noexec(path))
+ return -EACCES;
+ break;
}
error = inode_permission(inode, MAY_OPEN | acc_mode);
@@ -3770,11 +3778,11 @@ exit2:
mnt_drop_write(path.mnt);
exit1:
path_put(&path);
- putname(name);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+ putname(name);
return error;
}
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 2433c3e03cfa..22d11fdc6deb 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -30,7 +30,7 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o
nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o
-nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o
+nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o nfs42xattr.o
obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/
obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index 9fb067a6f7e0..ef9db135c649 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -79,7 +79,7 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
goto out_free_data;
bl_msg = msg->data;
- bl_msg->type = BL_DEVICE_MOUNT,
+ bl_msg->type = BL_DEVICE_MOUNT;
bl_msg->totallen = b->simple.len;
nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index f1ff3076e4a4..4b8cc93913f7 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -50,6 +50,7 @@
#include "nfs.h"
#include "netns.h"
#include "sysfs.h"
+#include "nfs42.h"
#define NFSDBG_FACILITY NFSDBG_CLIENT
@@ -749,7 +750,7 @@ error:
static void nfs_server_set_fsinfo(struct nfs_server *server,
struct nfs_fsinfo *fsinfo)
{
- unsigned long max_rpc_payload;
+ unsigned long max_rpc_payload, raw_max_rpc_payload;
/* Work out a lot of parameters */
if (server->rsize == 0)
@@ -762,7 +763,9 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax)
server->wsize = nfs_block_size(fsinfo->wtmax, NULL);
- max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
+ raw_max_rpc_payload = rpc_max_payload(server->client);
+ max_rpc_payload = nfs_block_size(raw_max_rpc_payload, NULL);
+
if (server->rsize > max_rpc_payload)
server->rsize = max_rpc_payload;
if (server->rsize > NFS_MAX_FILE_IO_SIZE)
@@ -795,6 +798,21 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
server->clone_blksize = fsinfo->clone_blksize;
/* We're airborne Set socket buffersize */
rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
+
+#ifdef CONFIG_NFS_V4_2
+ /*
+ * Defaults until limited by the session parameters.
+ */
+ server->gxasize = min_t(unsigned int, raw_max_rpc_payload,
+ XATTR_SIZE_MAX);
+ server->sxasize = min_t(unsigned int, raw_max_rpc_payload,
+ XATTR_SIZE_MAX);
+ server->lxasize = min_t(unsigned int, raw_max_rpc_payload,
+ nfs42_listxattr_xdrsize(XATTR_LIST_MAX));
+
+ if (fsinfo->xattr_support)
+ server->caps |= NFS_CAP_XATTR;
+#endif
}
/*
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5a331da5f55a..a12f42e7d8c7 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2460,7 +2460,7 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, co
return NULL;
}
-static int nfs_access_get_cached(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res, bool may_block)
+static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res, bool may_block)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_access_entry *cache;
@@ -2533,6 +2533,20 @@ out:
return err;
}
+int nfs_access_get_cached(struct inode *inode, const struct cred *cred, struct
+nfs_access_entry *res, bool may_block)
+{
+ int status;
+
+ status = nfs_access_get_cached_rcu(inode, cred, res);
+ if (status != 0)
+ status = nfs_access_get_cached_locked(inode, cred, res,
+ may_block);
+
+ return status;
+}
+EXPORT_SYMBOL_GPL(nfs_access_get_cached);
+
static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
{
struct nfs_inode *nfsi = NFS_I(inode);
@@ -2647,9 +2661,7 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask)
trace_nfs_access_enter(inode);
- status = nfs_access_get_cached_rcu(inode, cred, &cache);
- if (status != 0)
- status = nfs_access_get_cached(inode, cred, &cache, may_block);
+ status = nfs_access_get_cached(inode, cred, &cache, may_block);
if (status == 0)
goto out_cached;
@@ -2661,6 +2673,10 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask)
* Determine which access bits we want to ask for...
*/
cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND;
+ if (nfs_server_capable(inode, NFS_CAP_XATTR)) {
+ cache.mask |= NFS_ACCESS_XAREAD | NFS_ACCESS_XAWRITE |
+ NFS_ACCESS_XALIST;
+ }
if (S_ISDIR(inode->i_mode))
cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP;
else
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 1b79dd5cf661..2d30a4da49fa 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -896,7 +896,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
*/
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
{
- ssize_t result = -EINVAL, requested;
+ ssize_t result, requested;
size_t count;
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f96367a2463e..63940a7a70be 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -140,6 +140,7 @@ static int
nfs_file_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file_inode(file);
+ errseq_t since;
dprintk("NFS: flush(%pD2)\n", file);
@@ -148,7 +149,9 @@ nfs_file_flush(struct file *file, fl_owner_t id)
return 0;
/* Flush writes to the server and return any errors */
- return nfs_wb_all(inode);
+ since = filemap_sample_wb_err(file->f_mapping);
+ nfs_wb_all(inode);
+ return filemap_check_wb_err(file->f_mapping, since);
}
ssize_t
@@ -587,12 +590,14 @@ static const struct vm_operations_struct nfs_file_vm_ops = {
.page_mkwrite = nfs_vm_page_mkwrite,
};
-static int nfs_need_check_write(struct file *filp, struct inode *inode)
+static int nfs_need_check_write(struct file *filp, struct inode *inode,
+ int error)
{
struct nfs_open_context *ctx;
ctx = nfs_file_open_context(filp);
- if (nfs_ctx_key_to_expire(ctx, inode))
+ if (nfs_error_is_fatal_on_server(error) ||
+ nfs_ctx_key_to_expire(ctx, inode))
return 1;
return 0;
}
@@ -603,6 +608,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(file);
unsigned long written = 0;
ssize_t result;
+ errseq_t since;
+ int error;
result = nfs_key_timeout_notify(file, inode);
if (result)
@@ -627,6 +634,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_pos > i_size_read(inode))
nfs_revalidate_mapping(inode, file->f_mapping);
+ since = filemap_sample_wb_err(file->f_mapping);
nfs_start_io_write(inode);
result = generic_write_checks(iocb, from);
if (result > 0) {
@@ -645,7 +653,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
/* Return error values */
- if (nfs_need_check_write(file, inode)) {
+ error = filemap_check_wb_err(file->f_mapping, since);
+ if (nfs_need_check_write(file, inode, error)) {
int err = nfs_wb_all(inode);
if (err < 0)
result = err;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index de03e440b7ee..965145592750 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -790,6 +790,19 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
}
+static struct nfs4_pnfs_ds *
+ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, int *best_idx)
+{
+ struct pnfs_layout_segment *lseg = pgio->pg_lseg;
+ struct nfs4_pnfs_ds *ds;
+
+ ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx,
+ best_idx);
+ if (ds || !pgio->pg_mirror_idx)
+ return ds;
+ return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx);
+}
+
static void
ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req,
@@ -840,12 +853,11 @@ retry:
goto out_nolseg;
}
- ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx);
+ ds = ff_layout_get_ds_for_read(pgio, &ds_idx);
if (!ds) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
+ pnfs_generic_pg_cleanup(pgio);
/* Sleep for 1 second before retrying */
ssleep(1);
goto retry;
@@ -871,8 +883,6 @@ out_mds:
0, NFS4_MAX_UINT64, IOMODE_READ,
NFS_I(pgio->pg_inode)->layout,
pgio->pg_lseg);
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
pgio->pg_maxretrans = 0;
nfs_pageio_reset_read_mds(pgio);
}
@@ -916,8 +926,7 @@ retry:
if (!ds) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
+ pnfs_generic_pg_cleanup(pgio);
/* Sleep for 1 second before retrying */
ssleep(1);
goto retry;
@@ -939,8 +948,6 @@ out_mds:
0, NFS4_MAX_UINT64, IOMODE_RW,
NFS_I(pgio->pg_inode)->layout,
pgio->pg_lseg);
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
pgio->pg_maxretrans = 0;
nfs_pageio_reset_write_mds(pgio);
pgio->pg_error = -EAGAIN;
@@ -953,8 +960,8 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
nfs_req_openctx(req),
- 0,
- NFS4_MAX_UINT64,
+ req_offset(req),
+ req->wb_bytes,
IOMODE_RW,
false,
GFP_NOFS);
@@ -1028,11 +1035,24 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
}
}
+static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
+{
+ u32 idx = hdr->pgio_mirror_idx + 1;
+ int new_idx = 0;
+
+ if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx + 1, &new_idx))
+ ff_layout_send_layouterror(hdr->lseg);
+ else
+ pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
+ pnfs_read_resend_pnfs(hdr, new_idx);
+}
+
static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
{
struct rpc_task *task = &hdr->task;
pnfs_layoutcommit_inode(hdr->inode, false);
+ pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
dprintk("%s Reset task %5u for i/o through MDS "
@@ -1234,6 +1254,12 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
break;
case NFS4ERR_NXIO:
ff_layout_mark_ds_unreachable(lseg, idx);
+ /*
+ * Don't return the layout if this is a read and we still
+ * have layouts to try
+ */
+ if (opnum == OP_READ)
+ break;
/* Fallthrough */
default:
pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
@@ -1247,7 +1273,6 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
static int ff_layout_read_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
- int new_idx = hdr->pgio_mirror_idx;
int err;
if (task->tk_status < 0) {
@@ -1267,10 +1292,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
- if (ff_layout_choose_best_ds_for_read(hdr->lseg,
- hdr->pgio_mirror_idx + 1,
- &new_idx))
- goto out_layouterror;
set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
@@ -1281,10 +1302,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
}
return 0;
-out_layouterror:
- ff_layout_read_record_layoutstats_done(task, hdr);
- ff_layout_send_layouterror(hdr->lseg);
- hdr->pgio_mirror_idx = new_idx;
out_eagain:
rpc_restart_call_prepare(task);
return -EAGAIN;
@@ -1411,10 +1428,9 @@ static void ff_layout_read_release(void *data)
struct nfs_pgio_header *hdr = data;
ff_layout_read_record_layoutstats_done(&hdr->task, hdr);
- if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) {
- ff_layout_send_layouterror(hdr->lseg);
- pnfs_read_resend_pnfs(hdr);
- } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
+ if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags))
+ ff_layout_resend_pnfs_read(hdr);
+ else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
ff_layout_reset_read(hdr);
pnfs_generic_rw_release(data);
}
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index ccc88be88d6a..66949da0e827 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -982,7 +982,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
/*
* The legacy version 6 binary mount data from userspace has a
* field used only to transport selinux information into the
- * the kernel. To continue to support that functionality we
+ * kernel. To continue to support that functionality we
* have a touch of selinux knowledge here in the NFS code. The
* userspace code converted context=blah to just blah so we are
* converting back to the full string selinux understands.
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0bf1f835de01..aa6493905bbe 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -193,6 +193,7 @@ bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags)
return nfs_check_cache_invalid_not_delegated(inode, flags);
}
+EXPORT_SYMBOL_GPL(nfs_check_cache_invalid);
static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
{
@@ -204,7 +205,8 @@ static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
flags &= ~NFS_INO_INVALID_OTHER;
flags &= ~(NFS_INO_INVALID_CHANGE
| NFS_INO_INVALID_SIZE
- | NFS_INO_REVAL_PAGECACHE);
+ | NFS_INO_REVAL_PAGECACHE
+ | NFS_INO_INVALID_XATTR);
}
if (inode->i_mapping->nrpages == 0)
@@ -233,11 +235,13 @@ static void nfs_zap_caches_locked(struct inode *inode)
| NFS_INO_INVALID_DATA
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_XATTR
| NFS_INO_REVAL_PAGECACHE);
} else
nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_XATTR
| NFS_INO_REVAL_PAGECACHE);
nfs_zap_label_cache_locked(nfsi);
}
@@ -542,6 +546,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
inode->i_gid = fattr->gid;
else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+ if (nfs_server_capable(inode, NFS_CAP_XATTR))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR);
if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
inode->i_blocks = fattr->du.nfs2.blocks;
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
@@ -794,8 +800,10 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
trace_nfs_getattr_enter(inode);
- if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync)
+ if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) {
+ nfs_readdirplus_parent_cache_hit(path->dentry);
goto out_no_update;
+ }
/* Flush out writes to the server in order to update c/mtime. */
if ((request_mask & (STATX_CTIME|STATX_MTIME)) &&
@@ -1375,6 +1383,8 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode_set_iversion_raw(inode, fattr->change_attr);
if (S_ISDIR(inode->i_mode))
nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
+ else if (nfs_server_capable(inode, NFS_CAP_XATTR))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR);
}
/* If we have atomic WCC data, we may update some attributes */
ts = inode->i_ctime;
@@ -1892,7 +1902,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
if (!(have_writers || have_delegation)) {
invalid |= NFS_INO_INVALID_DATA
| NFS_INO_INVALID_ACCESS
- | NFS_INO_INVALID_ACL;
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_XATTR;
/* Force revalidate of all attributes */
save_cache_validity |= NFS_INO_INVALID_CTIME
| NFS_INO_INVALID_MTIME
@@ -2095,6 +2106,9 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
#if IS_ENABLED(CONFIG_NFS_V4)
nfsi->nfs4_acl = NULL;
#endif /* CONFIG_NFS_V4 */
+#ifdef CONFIG_NFS_V4_2
+ nfsi->xattr_cache = NULL;
+#endif
return &nfsi->vfs_inode;
}
EXPORT_SYMBOL_GPL(nfs_alloc_inode);
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index c891af949886..0fe5aacbcfdf 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -6,6 +6,8 @@
#ifndef __LINUX_FS_NFS_NFS4_2_H
#define __LINUX_FS_NFS_NFS4_2_H
+#include <linux/xattr.h>
+
/*
* FIXME: four LAYOUTSTATS calls per compound at most! Do we need to support
* more? Need to consider not to pre-alloc too much for a compound.
@@ -36,5 +38,27 @@ static inline bool nfs42_files_from_same_server(struct file *in,
return nfs4_check_serverowner_major_id(c_in->cl_serverowner,
c_out->cl_serverowner);
}
+
+ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name,
+ void *buf, size_t buflen);
+int nfs42_proc_setxattr(struct inode *inode, const char *name,
+ const void *buf, size_t buflen, int flags);
+ssize_t nfs42_proc_listxattrs(struct inode *inode, void *buf,
+ size_t buflen, u64 *cookiep, bool *eofp);
+int nfs42_proc_removexattr(struct inode *inode, const char *name);
+
+/*
+ * Maximum XDR buffer size needed for a listxattr buffer of buflen size.
+ *
+ * The upper boundary is a buffer with all 1-byte sized attribute names.
+ * They would be 7 bytes long in the eventual buffer ("user.x\0"), and
+ * 8 bytes long XDR-encoded.
+ *
+ * Include the trailing eof word as well.
+ */
+static inline u32 nfs42_listxattr_xdrsize(u32 buflen)
+{
+ return ((buflen / (XATTR_USER_PREFIX_LEN + 2)) * 8) + 4;
+}
#endif /* CONFIG_NFS_V4_2 */
#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index e2ae54b35dfe..142225f0af59 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -17,6 +17,7 @@
#include "nfs4session.h"
#include "internal.h"
#include "delegation.h"
+#include "nfs4trace.h"
#define NFSDBG_FACILITY NFSDBG_PROC
static int nfs42_do_offload_cancel_async(struct file *dst, nfs4_stateid *std);
@@ -714,7 +715,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case 0:
- break;
+ return;
case -NFS4ERR_BADHANDLE:
case -ESTALE:
pnfs_destroy_layout(NFS_I(inode));
@@ -760,6 +761,8 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
case -EOPNOTSUPP:
NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
}
+
+ trace_nfs4_layoutstats(inode, &data->args.stateid, task->tk_status);
}
static void
@@ -882,7 +885,7 @@ nfs42_layouterror_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case 0:
- break;
+ return;
case -NFS4ERR_BADHANDLE:
case -ESTALE:
pnfs_destroy_layout(NFS_I(inode));
@@ -926,6 +929,9 @@ nfs42_layouterror_done(struct rpc_task *task, void *calldata)
case -EOPNOTSUPP:
NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTERROR;
}
+
+ trace_nfs4_layouterror(inode, &data->args.errors[0].stateid,
+ task->tk_status);
}
static void
@@ -1088,3 +1094,251 @@ out_put_src_lock:
nfs_put_lock_context(src_lock);
return err;
}
+
+#define NFS4XATTR_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE)
+
+static int _nfs42_proc_removexattr(struct inode *inode, const char *name)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs42_removexattrargs args = {
+ .fh = NFS_FH(inode),
+ .xattr_name = name,
+ };
+ struct nfs42_removexattrres res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVEXATTR],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ int ret;
+ unsigned long timestamp = jiffies;
+
+ ret = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
+ &res.seq_res, 1);
+ if (!ret)
+ nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0);
+
+ return ret;
+}
+
+static int _nfs42_proc_setxattr(struct inode *inode, const char *name,
+ const void *buf, size_t buflen, int flags)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct page *pages[NFS4XATTR_MAXPAGES];
+ struct nfs42_setxattrargs arg = {
+ .fh = NFS_FH(inode),
+ .xattr_pages = pages,
+ .xattr_len = buflen,
+ .xattr_name = name,
+ .xattr_flags = flags,
+ };
+ struct nfs42_setxattrres res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETXATTR],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+ int ret, np;
+ unsigned long timestamp = jiffies;
+
+ if (buflen > server->sxasize)
+ return -ERANGE;
+
+ if (buflen > 0) {
+ np = nfs4_buf_to_pages_noslab(buf, buflen, arg.xattr_pages);
+ if (np < 0)
+ return np;
+ } else
+ np = 0;
+
+ ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
+ &res.seq_res, 1);
+
+ for (; np > 0; np--)
+ put_page(pages[np - 1]);
+
+ if (!ret)
+ nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0);
+
+ return ret;
+}
+
+static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name,
+ void *buf, size_t buflen)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct page *pages[NFS4XATTR_MAXPAGES] = {};
+ struct nfs42_getxattrargs arg = {
+ .fh = NFS_FH(inode),
+ .xattr_pages = pages,
+ .xattr_len = buflen,
+ .xattr_name = name,
+ };
+ struct nfs42_getxattrres res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETXATTR],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+ int ret, np;
+
+ ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
+ &res.seq_res, 0);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Normally, the caching is done one layer up, but for successful
+ * RPCS, always cache the result here, even if the caller was
+ * just querying the length, or if the reply was too big for
+ * the caller. This avoids a second RPC in the case of the
+ * common query-alloc-retrieve cycle for xattrs.
+ *
+ * Note that xattr_len is always capped to XATTR_SIZE_MAX.
+ */
+
+ nfs4_xattr_cache_add(inode, name, NULL, pages, res.xattr_len);
+
+ if (buflen) {
+ if (res.xattr_len > buflen)
+ return -ERANGE;
+ _copy_from_pages(buf, pages, 0, res.xattr_len);
+ }
+
+ np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE);
+ while (--np >= 0)
+ __free_page(pages[np]);
+
+ return res.xattr_len;
+}
+
+static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf,
+ size_t buflen, u64 *cookiep, bool *eofp)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct page **pages;
+ struct nfs42_listxattrsargs arg = {
+ .fh = NFS_FH(inode),
+ .cookie = *cookiep,
+ };
+ struct nfs42_listxattrsres res = {
+ .eof = false,
+ .xattr_buf = buf,
+ .xattr_len = buflen,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LISTXATTRS],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+ u32 xdrlen;
+ int ret, np;
+
+
+ res.scratch = alloc_page(GFP_KERNEL);
+ if (!res.scratch)
+ return -ENOMEM;
+
+ xdrlen = nfs42_listxattr_xdrsize(buflen);
+ if (xdrlen > server->lxasize)
+ xdrlen = server->lxasize;
+ np = xdrlen / PAGE_SIZE + 1;
+
+ pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL);
+ if (pages == NULL) {
+ __free_page(res.scratch);
+ return -ENOMEM;
+ }
+
+ arg.xattr_pages = pages;
+ arg.count = xdrlen;
+
+ ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
+ &res.seq_res, 0);
+
+ if (ret >= 0) {
+ ret = res.copied;
+ *cookiep = res.cookie;
+ *eofp = res.eof;
+ }
+
+ while (--np >= 0) {
+ if (pages[np])
+ __free_page(pages[np]);
+ }
+
+ __free_page(res.scratch);
+ kfree(pages);
+
+ return ret;
+
+}
+
+ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name,
+ void *buf, size_t buflen)
+{
+ struct nfs4_exception exception = { };
+ ssize_t err;
+
+ do {
+ err = _nfs42_proc_getxattr(inode, name, buf, buflen);
+ if (err >= 0)
+ break;
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
+ &exception);
+ } while (exception.retry);
+
+ return err;
+}
+
+int nfs42_proc_setxattr(struct inode *inode, const char *name,
+ const void *buf, size_t buflen, int flags)
+{
+ struct nfs4_exception exception = { };
+ int err;
+
+ do {
+ err = _nfs42_proc_setxattr(inode, name, buf, buflen, flags);
+ if (!err)
+ break;
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
+ &exception);
+ } while (exception.retry);
+
+ return err;
+}
+
+ssize_t nfs42_proc_listxattrs(struct inode *inode, void *buf,
+ size_t buflen, u64 *cookiep, bool *eofp)
+{
+ struct nfs4_exception exception = { };
+ ssize_t err;
+
+ do {
+ err = _nfs42_proc_listxattrs(inode, buf, buflen,
+ cookiep, eofp);
+ if (err >= 0)
+ break;
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
+ &exception);
+ } while (exception.retry);
+
+ return err;
+}
+
+int nfs42_proc_removexattr(struct inode *inode, const char *name)
+{
+ struct nfs4_exception exception = { };
+ int err;
+
+ do {
+ err = _nfs42_proc_removexattr(inode, name);
+ if (!err)
+ break;
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
+ &exception);
+ } while (exception.retry);
+
+ return err;
+}
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
new file mode 100644
index 000000000000..86777996cfec
--- /dev/null
+++ b/fs/nfs/nfs42xattr.c
@@ -0,0 +1,1056 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2019, 2020 Amazon.com, Inc. or its affiliates. All rights reserved.
+ *
+ * User extended attribute client side cache functions.
+ *
+ * Author: Frank van der Linden <fllinden@amazon.com>
+ */
+#include <linux/errno.h>
+#include <linux/nfs_fs.h>
+#include <linux/hashtable.h>
+#include <linux/refcount.h>
+#include <uapi/linux/xattr.h>
+
+#include "nfs4_fs.h"
+#include "internal.h"
+
+/*
+ * User extended attributes client side caching is implemented by having
+ * a cache structure attached to NFS inodes. This structure is allocated
+ * when needed, and freed when the cache is zapped.
+ *
+ * The cache structure contains as hash table of entries, and a pointer
+ * to a special-cased entry for the listxattr cache.
+ *
+ * Accessing and allocating / freeing the caches is done via reference
+ * counting. The cache entries use a similar refcounting scheme.
+ *
+ * This makes freeing a cache, both from the shrinker and from the
+ * zap cache path, easy. It also means that, in current use cases,
+ * the large majority of inodes will not waste any memory, as they
+ * will never have any user extended attributes assigned to them.
+ *
+ * Attribute entries are hashed in to a simple hash table. They are
+ * also part of an LRU.
+ *
+ * There are three shrinkers.
+ *
+ * Two shrinkers deal with the cache entries themselves: one for
+ * large entries (> PAGE_SIZE), and one for smaller entries. The
+ * shrinker for the larger entries works more aggressively than
+ * those for the smaller entries.
+ *
+ * The other shrinker frees the cache structures themselves.
+ */
+
+/*
+ * 64 buckets is a good default. There is likely no reasonable
+ * workload that uses more than even 64 user extended attributes.
+ * You can certainly add a lot more - but you get what you ask for
+ * in those circumstances.
+ */
+#define NFS4_XATTR_HASH_SIZE 64
+
+#define NFSDBG_FACILITY NFSDBG_XATTRCACHE
+
+struct nfs4_xattr_cache;
+struct nfs4_xattr_entry;
+
+struct nfs4_xattr_bucket {
+ spinlock_t lock;
+ struct hlist_head hlist;
+ struct nfs4_xattr_cache *cache;
+ bool draining;
+};
+
+struct nfs4_xattr_cache {
+ struct kref ref;
+ spinlock_t hash_lock; /* protects hashtable and lru */
+ struct nfs4_xattr_bucket buckets[NFS4_XATTR_HASH_SIZE];
+ struct list_head lru;
+ struct list_head dispose;
+ atomic_long_t nent;
+ spinlock_t listxattr_lock;
+ struct inode *inode;
+ struct nfs4_xattr_entry *listxattr;
+};
+
+struct nfs4_xattr_entry {
+ struct kref ref;
+ struct hlist_node hnode;
+ struct list_head lru;
+ struct list_head dispose;
+ char *xattr_name;
+ void *xattr_value;
+ size_t xattr_size;
+ struct nfs4_xattr_bucket *bucket;
+ uint32_t flags;
+};
+
+#define NFS4_XATTR_ENTRY_EXTVAL 0x0001
+
+/*
+ * LRU list of NFS inodes that have xattr caches.
+ */
+static struct list_lru nfs4_xattr_cache_lru;
+static struct list_lru nfs4_xattr_entry_lru;
+static struct list_lru nfs4_xattr_large_entry_lru;
+
+static struct kmem_cache *nfs4_xattr_cache_cachep;
+
+/*
+ * Hashing helper functions.
+ */
+static void
+nfs4_xattr_hash_init(struct nfs4_xattr_cache *cache)
+{
+ unsigned int i;
+
+ for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) {
+ INIT_HLIST_HEAD(&cache->buckets[i].hlist);
+ spin_lock_init(&cache->buckets[i].lock);
+ cache->buckets[i].cache = cache;
+ cache->buckets[i].draining = false;
+ }
+}
+
+/*
+ * Locking order:
+ * 1. inode i_lock or bucket lock
+ * 2. list_lru lock (taken by list_lru_* functions)
+ */
+
+/*
+ * Wrapper functions to add a cache entry to the right LRU.
+ */
+static bool
+nfs4_xattr_entry_lru_add(struct nfs4_xattr_entry *entry)
+{
+ struct list_lru *lru;
+
+ lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ?
+ &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
+
+ return list_lru_add(lru, &entry->lru);
+}
+
+static bool
+nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry)
+{
+ struct list_lru *lru;
+
+ lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ?
+ &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
+
+ return list_lru_del(lru, &entry->lru);
+}
+
+/*
+ * This function allocates cache entries. They are the normal
+ * extended attribute name/value pairs, but may also be a listxattr
+ * cache. Those allocations use the same entry so that they can be
+ * treated as one by the memory shrinker.
+ *
+ * xattr cache entries are allocated together with names. If the
+ * value fits in to one page with the entry structure and the name,
+ * it will also be part of the same allocation (kmalloc). This is
+ * expected to be the vast majority of cases. Larger allocations
+ * have a value pointer that is allocated separately by kvmalloc.
+ *
+ * Parameters:
+ *
+ * @name: Name of the extended attribute. NULL for listxattr cache
+ * entry.
+ * @value: Value of attribute, or listxattr cache. NULL if the
+ * value is to be copied from pages instead.
+ * @pages: Pages to copy the value from, if not NULL. Passed in to
+ * make it easier to copy the value after an RPC, even if
+ * the value will not be passed up to application (e.g.
+ * for a 'query' getxattr with NULL buffer).
+ * @len: Length of the value. Can be 0 for zero-length attribues.
+ * @value and @pages will be NULL if @len is 0.
+ */
+static struct nfs4_xattr_entry *
+nfs4_xattr_alloc_entry(const char *name, const void *value,
+ struct page **pages, size_t len)
+{
+ struct nfs4_xattr_entry *entry;
+ void *valp;
+ char *namep;
+ size_t alloclen, slen;
+ char *buf;
+ uint32_t flags;
+
+ BUILD_BUG_ON(sizeof(struct nfs4_xattr_entry) +
+ XATTR_NAME_MAX + 1 > PAGE_SIZE);
+
+ alloclen = sizeof(struct nfs4_xattr_entry);
+ if (name != NULL) {
+ slen = strlen(name) + 1;
+ alloclen += slen;
+ } else
+ slen = 0;
+
+ if (alloclen + len <= PAGE_SIZE) {
+ alloclen += len;
+ flags = 0;
+ } else {
+ flags = NFS4_XATTR_ENTRY_EXTVAL;
+ }
+
+ buf = kmalloc(alloclen, GFP_KERNEL_ACCOUNT | GFP_NOFS);
+ if (buf == NULL)
+ return NULL;
+ entry = (struct nfs4_xattr_entry *)buf;
+
+ if (name != NULL) {
+ namep = buf + sizeof(struct nfs4_xattr_entry);
+ memcpy(namep, name, slen);
+ } else {
+ namep = NULL;
+ }
+
+
+ if (flags & NFS4_XATTR_ENTRY_EXTVAL) {
+ valp = kvmalloc(len, GFP_KERNEL_ACCOUNT | GFP_NOFS);
+ if (valp == NULL) {
+ kfree(buf);
+ return NULL;
+ }
+ } else if (len != 0) {
+ valp = buf + sizeof(struct nfs4_xattr_entry) + slen;
+ } else
+ valp = NULL;
+
+ if (valp != NULL) {
+ if (value != NULL)
+ memcpy(valp, value, len);
+ else
+ _copy_from_pages(valp, pages, 0, len);
+ }
+
+ entry->flags = flags;
+ entry->xattr_value = valp;
+ kref_init(&entry->ref);
+ entry->xattr_name = namep;
+ entry->xattr_size = len;
+ entry->bucket = NULL;
+ INIT_LIST_HEAD(&entry->lru);
+ INIT_LIST_HEAD(&entry->dispose);
+ INIT_HLIST_NODE(&entry->hnode);
+
+ return entry;
+}
+
+static void
+nfs4_xattr_free_entry(struct nfs4_xattr_entry *entry)
+{
+ if (entry->flags & NFS4_XATTR_ENTRY_EXTVAL)
+ kvfree(entry->xattr_value);
+ kfree(entry);
+}
+
+static void
+nfs4_xattr_free_entry_cb(struct kref *kref)
+{
+ struct nfs4_xattr_entry *entry;
+
+ entry = container_of(kref, struct nfs4_xattr_entry, ref);
+
+ if (WARN_ON(!list_empty(&entry->lru)))
+ return;
+
+ nfs4_xattr_free_entry(entry);
+}
+
+static void
+nfs4_xattr_free_cache_cb(struct kref *kref)
+{
+ struct nfs4_xattr_cache *cache;
+ int i;
+
+ cache = container_of(kref, struct nfs4_xattr_cache, ref);
+
+ for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) {
+ if (WARN_ON(!hlist_empty(&cache->buckets[i].hlist)))
+ return;
+ cache->buckets[i].draining = false;
+ }
+
+ cache->listxattr = NULL;
+
+ kmem_cache_free(nfs4_xattr_cache_cachep, cache);
+
+}
+
+static struct nfs4_xattr_cache *
+nfs4_xattr_alloc_cache(void)
+{
+ struct nfs4_xattr_cache *cache;
+
+ cache = kmem_cache_alloc(nfs4_xattr_cache_cachep,
+ GFP_KERNEL_ACCOUNT | GFP_NOFS);
+ if (cache == NULL)
+ return NULL;
+
+ kref_init(&cache->ref);
+ atomic_long_set(&cache->nent, 0);
+
+ return cache;
+}
+
+/*
+ * Set the listxattr cache, which is a special-cased cache entry.
+ * The special value ERR_PTR(-ESTALE) is used to indicate that
+ * the cache is being drained - this prevents a new listxattr
+ * cache from being added to what is now a stale cache.
+ */
+static int
+nfs4_xattr_set_listcache(struct nfs4_xattr_cache *cache,
+ struct nfs4_xattr_entry *new)
+{
+ struct nfs4_xattr_entry *old;
+ int ret = 1;
+
+ spin_lock(&cache->listxattr_lock);
+
+ old = cache->listxattr;
+
+ if (old == ERR_PTR(-ESTALE)) {
+ ret = 0;
+ goto out;
+ }
+
+ cache->listxattr = new;
+ if (new != NULL && new != ERR_PTR(-ESTALE))
+ nfs4_xattr_entry_lru_add(new);
+
+ if (old != NULL) {
+ nfs4_xattr_entry_lru_del(old);
+ kref_put(&old->ref, nfs4_xattr_free_entry_cb);
+ }
+out:
+ spin_unlock(&cache->listxattr_lock);
+
+ return ret;
+}
+
+/*
+ * Unlink a cache from its parent inode, clearing out an invalid
+ * cache. Must be called with i_lock held.
+ */
+static struct nfs4_xattr_cache *
+nfs4_xattr_cache_unlink(struct inode *inode)
+{
+ struct nfs_inode *nfsi;
+ struct nfs4_xattr_cache *oldcache;
+
+ nfsi = NFS_I(inode);
+
+ oldcache = nfsi->xattr_cache;
+ if (oldcache != NULL) {
+ list_lru_del(&nfs4_xattr_cache_lru, &oldcache->lru);
+ oldcache->inode = NULL;
+ }
+ nfsi->xattr_cache = NULL;
+ nfsi->cache_validity &= ~NFS_INO_INVALID_XATTR;
+
+ return oldcache;
+
+}
+
+/*
+ * Discard a cache. Called by get_cache() if there was an old,
+ * invalid cache. Can also be called from a shrinker callback.
+ *
+ * The cache is dead, it has already been unlinked from its inode,
+ * and no longer appears on the cache LRU list.
+ *
+ * Mark all buckets as draining, so that no new entries are added. This
+ * could still happen in the unlikely, but possible case that another
+ * thread had grabbed a reference before it was unlinked from the inode,
+ * and is still holding it for an add operation.
+ *
+ * Remove all entries from the LRU lists, so that there is no longer
+ * any way to 'find' this cache. Then, remove the entries from the hash
+ * table.
+ *
+ * At that point, the cache will remain empty and can be freed when the final
+ * reference drops, which is very likely the kref_put at the end of
+ * this function, or the one called immediately afterwards in the
+ * shrinker callback.
+ */
+static void
+nfs4_xattr_discard_cache(struct nfs4_xattr_cache *cache)
+{
+ unsigned int i;
+ struct nfs4_xattr_entry *entry;
+ struct nfs4_xattr_bucket *bucket;
+ struct hlist_node *n;
+
+ nfs4_xattr_set_listcache(cache, ERR_PTR(-ESTALE));
+
+ for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) {
+ bucket = &cache->buckets[i];
+
+ spin_lock(&bucket->lock);
+ bucket->draining = true;
+ hlist_for_each_entry_safe(entry, n, &bucket->hlist, hnode) {
+ nfs4_xattr_entry_lru_del(entry);
+ hlist_del_init(&entry->hnode);
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+ }
+ spin_unlock(&bucket->lock);
+ }
+
+ atomic_long_set(&cache->nent, 0);
+
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+/*
+ * Get a referenced copy of the cache structure. Avoid doing allocs
+ * while holding i_lock. Which means that we do some optimistic allocation,
+ * and might have to free the result in rare cases.
+ *
+ * This function only checks the NFS_INO_INVALID_XATTR cache validity bit
+ * and acts accordingly, replacing the cache when needed. For the read case
+ * (!add), this means that the caller must make sure that the cache
+ * is valid before caling this function. getxattr and listxattr call
+ * revalidate_inode to do this. The attribute cache timeout (for the
+ * non-delegated case) is expected to be dealt with in the revalidate
+ * call.
+ */
+
+static struct nfs4_xattr_cache *
+nfs4_xattr_get_cache(struct inode *inode, int add)
+{
+ struct nfs_inode *nfsi;
+ struct nfs4_xattr_cache *cache, *oldcache, *newcache;
+
+ nfsi = NFS_I(inode);
+
+ cache = oldcache = NULL;
+
+ spin_lock(&inode->i_lock);
+
+ if (nfsi->cache_validity & NFS_INO_INVALID_XATTR)
+ oldcache = nfs4_xattr_cache_unlink(inode);
+ else
+ cache = nfsi->xattr_cache;
+
+ if (cache != NULL)
+ kref_get(&cache->ref);
+
+ spin_unlock(&inode->i_lock);
+
+ if (add && cache == NULL) {
+ newcache = NULL;
+
+ cache = nfs4_xattr_alloc_cache();
+ if (cache == NULL)
+ goto out;
+
+ spin_lock(&inode->i_lock);
+ if (nfsi->cache_validity & NFS_INO_INVALID_XATTR) {
+ /*
+ * The cache was invalidated again. Give up,
+ * since what we want to enter is now likely
+ * outdated anyway.
+ */
+ spin_unlock(&inode->i_lock);
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+ cache = NULL;
+ goto out;
+ }
+
+ /*
+ * Check if someone beat us to it.
+ */
+ if (nfsi->xattr_cache != NULL) {
+ newcache = nfsi->xattr_cache;
+ kref_get(&newcache->ref);
+ } else {
+ kref_get(&cache->ref);
+ nfsi->xattr_cache = cache;
+ cache->inode = inode;
+ list_lru_add(&nfs4_xattr_cache_lru, &cache->lru);
+ }
+
+ spin_unlock(&inode->i_lock);
+
+ /*
+ * If there was a race, throw away the cache we just
+ * allocated, and use the new one allocated by someone
+ * else.
+ */
+ if (newcache != NULL) {
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+ cache = newcache;
+ }
+ }
+
+out:
+ /*
+ * Discard the now orphaned old cache.
+ */
+ if (oldcache != NULL)
+ nfs4_xattr_discard_cache(oldcache);
+
+ return cache;
+}
+
+static inline struct nfs4_xattr_bucket *
+nfs4_xattr_hash_bucket(struct nfs4_xattr_cache *cache, const char *name)
+{
+ return &cache->buckets[jhash(name, strlen(name), 0) &
+ (ARRAY_SIZE(cache->buckets) - 1)];
+}
+
+static struct nfs4_xattr_entry *
+nfs4_xattr_get_entry(struct nfs4_xattr_bucket *bucket, const char *name)
+{
+ struct nfs4_xattr_entry *entry;
+
+ entry = NULL;
+
+ hlist_for_each_entry(entry, &bucket->hlist, hnode) {
+ if (!strcmp(entry->xattr_name, name))
+ break;
+ }
+
+ return entry;
+}
+
+static int
+nfs4_xattr_hash_add(struct nfs4_xattr_cache *cache,
+ struct nfs4_xattr_entry *entry)
+{
+ struct nfs4_xattr_bucket *bucket;
+ struct nfs4_xattr_entry *oldentry = NULL;
+ int ret = 1;
+
+ bucket = nfs4_xattr_hash_bucket(cache, entry->xattr_name);
+ entry->bucket = bucket;
+
+ spin_lock(&bucket->lock);
+
+ if (bucket->draining) {
+ ret = 0;
+ goto out;
+ }
+
+ oldentry = nfs4_xattr_get_entry(bucket, entry->xattr_name);
+ if (oldentry != NULL) {
+ hlist_del_init(&oldentry->hnode);
+ nfs4_xattr_entry_lru_del(oldentry);
+ } else {
+ atomic_long_inc(&cache->nent);
+ }
+
+ hlist_add_head(&entry->hnode, &bucket->hlist);
+ nfs4_xattr_entry_lru_add(entry);
+
+out:
+ spin_unlock(&bucket->lock);
+
+ if (oldentry != NULL)
+ kref_put(&oldentry->ref, nfs4_xattr_free_entry_cb);
+
+ return ret;
+}
+
+static void
+nfs4_xattr_hash_remove(struct nfs4_xattr_cache *cache, const char *name)
+{
+ struct nfs4_xattr_bucket *bucket;
+ struct nfs4_xattr_entry *entry;
+
+ bucket = nfs4_xattr_hash_bucket(cache, name);
+
+ spin_lock(&bucket->lock);
+
+ entry = nfs4_xattr_get_entry(bucket, name);
+ if (entry != NULL) {
+ hlist_del_init(&entry->hnode);
+ nfs4_xattr_entry_lru_del(entry);
+ atomic_long_dec(&cache->nent);
+ }
+
+ spin_unlock(&bucket->lock);
+
+ if (entry != NULL)
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+}
+
+static struct nfs4_xattr_entry *
+nfs4_xattr_hash_find(struct nfs4_xattr_cache *cache, const char *name)
+{
+ struct nfs4_xattr_bucket *bucket;
+ struct nfs4_xattr_entry *entry;
+
+ bucket = nfs4_xattr_hash_bucket(cache, name);
+
+ spin_lock(&bucket->lock);
+
+ entry = nfs4_xattr_get_entry(bucket, name);
+ if (entry != NULL)
+ kref_get(&entry->ref);
+
+ spin_unlock(&bucket->lock);
+
+ return entry;
+}
+
+/*
+ * Entry point to retrieve an entry from the cache.
+ */
+ssize_t nfs4_xattr_cache_get(struct inode *inode, const char *name, char *buf,
+ ssize_t buflen)
+{
+ struct nfs4_xattr_cache *cache;
+ struct nfs4_xattr_entry *entry;
+ ssize_t ret;
+
+ cache = nfs4_xattr_get_cache(inode, 0);
+ if (cache == NULL)
+ return -ENOENT;
+
+ ret = 0;
+ entry = nfs4_xattr_hash_find(cache, name);
+
+ if (entry != NULL) {
+ dprintk("%s: cache hit '%s', len %lu\n", __func__,
+ entry->xattr_name, (unsigned long)entry->xattr_size);
+ if (buflen == 0) {
+ /* Length probe only */
+ ret = entry->xattr_size;
+ } else if (buflen < entry->xattr_size)
+ ret = -ERANGE;
+ else {
+ memcpy(buf, entry->xattr_value, entry->xattr_size);
+ ret = entry->xattr_size;
+ }
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+ } else {
+ dprintk("%s: cache miss '%s'\n", __func__, name);
+ ret = -ENOENT;
+ }
+
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+
+ return ret;
+}
+
+/*
+ * Retrieve a cached list of xattrs from the cache.
+ */
+ssize_t nfs4_xattr_cache_list(struct inode *inode, char *buf, ssize_t buflen)
+{
+ struct nfs4_xattr_cache *cache;
+ struct nfs4_xattr_entry *entry;
+ ssize_t ret;
+
+ cache = nfs4_xattr_get_cache(inode, 0);
+ if (cache == NULL)
+ return -ENOENT;
+
+ spin_lock(&cache->listxattr_lock);
+
+ entry = cache->listxattr;
+
+ if (entry != NULL && entry != ERR_PTR(-ESTALE)) {
+ if (buflen == 0) {
+ /* Length probe only */
+ ret = entry->xattr_size;
+ } else if (entry->xattr_size > buflen)
+ ret = -ERANGE;
+ else {
+ memcpy(buf, entry->xattr_value, entry->xattr_size);
+ ret = entry->xattr_size;
+ }
+ } else {
+ ret = -ENOENT;
+ }
+
+ spin_unlock(&cache->listxattr_lock);
+
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+
+ return ret;
+}
+
+/*
+ * Add an xattr to the cache.
+ *
+ * This also invalidates the xattr list cache.
+ */
+void nfs4_xattr_cache_add(struct inode *inode, const char *name,
+ const char *buf, struct page **pages, ssize_t buflen)
+{
+ struct nfs4_xattr_cache *cache;
+ struct nfs4_xattr_entry *entry;
+
+ dprintk("%s: add '%s' len %lu\n", __func__,
+ name, (unsigned long)buflen);
+
+ cache = nfs4_xattr_get_cache(inode, 1);
+ if (cache == NULL)
+ return;
+
+ entry = nfs4_xattr_alloc_entry(name, buf, pages, buflen);
+ if (entry == NULL)
+ goto out;
+
+ (void)nfs4_xattr_set_listcache(cache, NULL);
+
+ if (!nfs4_xattr_hash_add(cache, entry))
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+
+out:
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+
+/*
+ * Remove an xattr from the cache.
+ *
+ * This also invalidates the xattr list cache.
+ */
+void nfs4_xattr_cache_remove(struct inode *inode, const char *name)
+{
+ struct nfs4_xattr_cache *cache;
+
+ dprintk("%s: remove '%s'\n", __func__, name);
+
+ cache = nfs4_xattr_get_cache(inode, 0);
+ if (cache == NULL)
+ return;
+
+ (void)nfs4_xattr_set_listcache(cache, NULL);
+ nfs4_xattr_hash_remove(cache, name);
+
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+/*
+ * Cache listxattr output, replacing any possible old one.
+ */
+void nfs4_xattr_cache_set_list(struct inode *inode, const char *buf,
+ ssize_t buflen)
+{
+ struct nfs4_xattr_cache *cache;
+ struct nfs4_xattr_entry *entry;
+
+ cache = nfs4_xattr_get_cache(inode, 1);
+ if (cache == NULL)
+ return;
+
+ entry = nfs4_xattr_alloc_entry(NULL, buf, NULL, buflen);
+ if (entry == NULL)
+ goto out;
+
+ /*
+ * This is just there to be able to get to bucket->cache,
+ * which is obviously the same for all buckets, so just
+ * use bucket 0.
+ */
+ entry->bucket = &cache->buckets[0];
+
+ if (!nfs4_xattr_set_listcache(cache, entry))
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+
+out:
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+/*
+ * Zap the entire cache. Called when an inode is evicted.
+ */
+void nfs4_xattr_cache_zap(struct inode *inode)
+{
+ struct nfs4_xattr_cache *oldcache;
+
+ spin_lock(&inode->i_lock);
+ oldcache = nfs4_xattr_cache_unlink(inode);
+ spin_unlock(&inode->i_lock);
+
+ if (oldcache)
+ nfs4_xattr_discard_cache(oldcache);
+}
+
+/*
+ * The entry LRU is shrunk more aggressively than the cache LRU,
+ * by settings @seeks to 1.
+ *
+ * Cache structures are freed only when they've become empty, after
+ * pruning all but one entry.
+ */
+
+static unsigned long nfs4_xattr_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc);
+static unsigned long nfs4_xattr_entry_count(struct shrinker *shrink,
+ struct shrink_control *sc);
+static unsigned long nfs4_xattr_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc);
+static unsigned long nfs4_xattr_entry_scan(struct shrinker *shrink,
+ struct shrink_control *sc);
+
+static struct shrinker nfs4_xattr_cache_shrinker = {
+ .count_objects = nfs4_xattr_cache_count,
+ .scan_objects = nfs4_xattr_cache_scan,
+ .seeks = DEFAULT_SEEKS,
+ .flags = SHRINKER_MEMCG_AWARE,
+};
+
+static struct shrinker nfs4_xattr_entry_shrinker = {
+ .count_objects = nfs4_xattr_entry_count,
+ .scan_objects = nfs4_xattr_entry_scan,
+ .seeks = DEFAULT_SEEKS,
+ .batch = 512,
+ .flags = SHRINKER_MEMCG_AWARE,
+};
+
+static struct shrinker nfs4_xattr_large_entry_shrinker = {
+ .count_objects = nfs4_xattr_entry_count,
+ .scan_objects = nfs4_xattr_entry_scan,
+ .seeks = 1,
+ .batch = 512,
+ .flags = SHRINKER_MEMCG_AWARE,
+};
+
+static enum lru_status
+cache_lru_isolate(struct list_head *item,
+ struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+ struct list_head *dispose = arg;
+ struct inode *inode;
+ struct nfs4_xattr_cache *cache = container_of(item,
+ struct nfs4_xattr_cache, lru);
+
+ if (atomic_long_read(&cache->nent) > 1)
+ return LRU_SKIP;
+
+ /*
+ * If a cache structure is on the LRU list, we know that
+ * its inode is valid. Try to lock it to break the link.
+ * Since we're inverting the lock order here, only try.
+ */
+ inode = cache->inode;
+
+ if (!spin_trylock(&inode->i_lock))
+ return LRU_SKIP;
+
+ kref_get(&cache->ref);
+
+ cache->inode = NULL;
+ NFS_I(inode)->xattr_cache = NULL;
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_XATTR;
+ list_lru_isolate(lru, &cache->lru);
+
+ spin_unlock(&inode->i_lock);
+
+ list_add_tail(&cache->dispose, dispose);
+ return LRU_REMOVED;
+}
+
+static unsigned long
+nfs4_xattr_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ LIST_HEAD(dispose);
+ unsigned long freed;
+ struct nfs4_xattr_cache *cache;
+
+ freed = list_lru_shrink_walk(&nfs4_xattr_cache_lru, sc,
+ cache_lru_isolate, &dispose);
+ while (!list_empty(&dispose)) {
+ cache = list_first_entry(&dispose, struct nfs4_xattr_cache,
+ dispose);
+ list_del_init(&cache->dispose);
+ nfs4_xattr_discard_cache(cache);
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+ }
+
+ return freed;
+}
+
+
+static unsigned long
+nfs4_xattr_cache_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ unsigned long count;
+
+ count = list_lru_count(&nfs4_xattr_cache_lru);
+ return vfs_pressure_ratio(count);
+}
+
+static enum lru_status
+entry_lru_isolate(struct list_head *item,
+ struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+ struct list_head *dispose = arg;
+ struct nfs4_xattr_bucket *bucket;
+ struct nfs4_xattr_cache *cache;
+ struct nfs4_xattr_entry *entry = container_of(item,
+ struct nfs4_xattr_entry, lru);
+
+ bucket = entry->bucket;
+ cache = bucket->cache;
+
+ /*
+ * Unhook the entry from its parent (either a cache bucket
+ * or a cache structure if it's a listxattr buf), so that
+ * it's no longer found. Then add it to the isolate list,
+ * to be freed later.
+ *
+ * In both cases, we're reverting lock order, so use
+ * trylock and skip the entry if we can't get the lock.
+ */
+ if (entry->xattr_name != NULL) {
+ /* Regular cache entry */
+ if (!spin_trylock(&bucket->lock))
+ return LRU_SKIP;
+
+ kref_get(&entry->ref);
+
+ hlist_del_init(&entry->hnode);
+ atomic_long_dec(&cache->nent);
+ list_lru_isolate(lru, &entry->lru);
+
+ spin_unlock(&bucket->lock);
+ } else {
+ /* Listxattr cache entry */
+ if (!spin_trylock(&cache->listxattr_lock))
+ return LRU_SKIP;
+
+ kref_get(&entry->ref);
+
+ cache->listxattr = NULL;
+ list_lru_isolate(lru, &entry->lru);
+
+ spin_unlock(&cache->listxattr_lock);
+ }
+
+ list_add_tail(&entry->dispose, dispose);
+ return LRU_REMOVED;
+}
+
+static unsigned long
+nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ LIST_HEAD(dispose);
+ unsigned long freed;
+ struct nfs4_xattr_entry *entry;
+ struct list_lru *lru;
+
+ lru = (shrink == &nfs4_xattr_large_entry_shrinker) ?
+ &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
+
+ freed = list_lru_shrink_walk(lru, sc, entry_lru_isolate, &dispose);
+
+ while (!list_empty(&dispose)) {
+ entry = list_first_entry(&dispose, struct nfs4_xattr_entry,
+ dispose);
+ list_del_init(&entry->dispose);
+
+ /*
+ * Drop two references: the one that we just grabbed
+ * in entry_lru_isolate, and the one that was set
+ * when the entry was first allocated.
+ */
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+ }
+
+ return freed;
+}
+
+static unsigned long
+nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ unsigned long count;
+ struct list_lru *lru;
+
+ lru = (shrink == &nfs4_xattr_large_entry_shrinker) ?
+ &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
+
+ count = list_lru_count(lru);
+ return vfs_pressure_ratio(count);
+}
+
+
+static void nfs4_xattr_cache_init_once(void *p)
+{
+ struct nfs4_xattr_cache *cache = (struct nfs4_xattr_cache *)p;
+
+ spin_lock_init(&cache->listxattr_lock);
+ atomic_long_set(&cache->nent, 0);
+ nfs4_xattr_hash_init(cache);
+ cache->listxattr = NULL;
+ INIT_LIST_HEAD(&cache->lru);
+ INIT_LIST_HEAD(&cache->dispose);
+}
+
+int __init nfs4_xattr_cache_init(void)
+{
+ int ret = 0;
+
+ nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache",
+ sizeof(struct nfs4_xattr_cache), 0,
+ (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ nfs4_xattr_cache_init_once);
+ if (nfs4_xattr_cache_cachep == NULL)
+ return -ENOMEM;
+
+ ret = list_lru_init_memcg(&nfs4_xattr_large_entry_lru,
+ &nfs4_xattr_large_entry_shrinker);
+ if (ret)
+ goto out4;
+
+ ret = list_lru_init_memcg(&nfs4_xattr_entry_lru,
+ &nfs4_xattr_entry_shrinker);
+ if (ret)
+ goto out3;
+
+ ret = list_lru_init_memcg(&nfs4_xattr_cache_lru,
+ &nfs4_xattr_cache_shrinker);
+ if (ret)
+ goto out2;
+
+ ret = register_shrinker(&nfs4_xattr_cache_shrinker);
+ if (ret)
+ goto out1;
+
+ ret = register_shrinker(&nfs4_xattr_entry_shrinker);
+ if (ret)
+ goto out;
+
+ ret = register_shrinker(&nfs4_xattr_large_entry_shrinker);
+ if (!ret)
+ return 0;
+
+ unregister_shrinker(&nfs4_xattr_entry_shrinker);
+out:
+ unregister_shrinker(&nfs4_xattr_cache_shrinker);
+out1:
+ list_lru_destroy(&nfs4_xattr_cache_lru);
+out2:
+ list_lru_destroy(&nfs4_xattr_entry_lru);
+out3:
+ list_lru_destroy(&nfs4_xattr_large_entry_lru);
+out4:
+ kmem_cache_destroy(nfs4_xattr_cache_cachep);
+
+ return ret;
+}
+
+void nfs4_xattr_cache_exit(void)
+{
+ unregister_shrinker(&nfs4_xattr_entry_shrinker);
+ unregister_shrinker(&nfs4_xattr_cache_shrinker);
+ list_lru_destroy(&nfs4_xattr_entry_lru);
+ list_lru_destroy(&nfs4_xattr_cache_lru);
+ kmem_cache_destroy(nfs4_xattr_cache_cachep);
+}
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index c03f3246d6c5..cc50085e151c 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -169,6 +169,78 @@
decode_clone_maxsz + \
decode_getattr_maxsz)
+/* Not limited by NFS itself, limited by the generic xattr code */
+#define nfs4_xattr_name_maxsz XDR_QUADLEN(XATTR_NAME_MAX)
+
+#define encode_getxattr_maxsz (op_encode_hdr_maxsz + 1 + \
+ nfs4_xattr_name_maxsz)
+#define decode_getxattr_maxsz (op_decode_hdr_maxsz + 1 + 1)
+#define encode_setxattr_maxsz (op_encode_hdr_maxsz + \
+ 1 + nfs4_xattr_name_maxsz + 1)
+#define decode_setxattr_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz)
+#define encode_listxattrs_maxsz (op_encode_hdr_maxsz + 2 + 1)
+#define decode_listxattrs_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1)
+#define encode_removexattr_maxsz (op_encode_hdr_maxsz + 1 + \
+ nfs4_xattr_name_maxsz)
+#define decode_removexattr_maxsz (op_decode_hdr_maxsz + \
+ decode_change_info_maxsz)
+
+#define NFS4_enc_getxattr_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_getxattr_maxsz)
+#define NFS4_dec_getxattr_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_getxattr_maxsz)
+#define NFS4_enc_setxattr_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_setxattr_maxsz)
+#define NFS4_dec_setxattr_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_setxattr_maxsz)
+#define NFS4_enc_listxattrs_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_listxattrs_maxsz)
+#define NFS4_dec_listxattrs_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_listxattrs_maxsz)
+#define NFS4_enc_removexattr_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_removexattr_maxsz)
+#define NFS4_dec_removexattr_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_removexattr_maxsz)
+
+/*
+ * These values specify the maximum amount of data that is not
+ * associated with the extended attribute name or extended
+ * attribute list in the SETXATTR, GETXATTR and LISTXATTR
+ * respectively.
+ */
+const u32 nfs42_maxsetxattr_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ compound_encode_hdr_maxsz +
+ encode_sequence_maxsz +
+ encode_putfh_maxsz + 1 +
+ nfs4_xattr_name_maxsz)
+ * XDR_UNIT);
+
+const u32 nfs42_maxgetxattr_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ compound_decode_hdr_maxsz +
+ decode_sequence_maxsz +
+ decode_putfh_maxsz + 1) * XDR_UNIT);
+
+const u32 nfs42_maxlistxattrs_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ compound_decode_hdr_maxsz +
+ decode_sequence_maxsz +
+ decode_putfh_maxsz + 3) * XDR_UNIT);
+
static void encode_fallocate(struct xdr_stream *xdr,
const struct nfs42_falloc_args *args)
{
@@ -333,6 +405,210 @@ static void encode_layouterror(struct xdr_stream *xdr,
encode_device_error(xdr, &args->errors[0]);
}
+static void encode_setxattr(struct xdr_stream *xdr,
+ const struct nfs42_setxattrargs *arg,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ BUILD_BUG_ON(XATTR_CREATE != SETXATTR4_CREATE);
+ BUILD_BUG_ON(XATTR_REPLACE != SETXATTR4_REPLACE);
+
+ encode_op_hdr(xdr, OP_SETXATTR, decode_setxattr_maxsz, hdr);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(arg->xattr_flags);
+ encode_string(xdr, strlen(arg->xattr_name), arg->xattr_name);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(arg->xattr_len);
+ if (arg->xattr_len)
+ xdr_write_pages(xdr, arg->xattr_pages, 0, arg->xattr_len);
+}
+
+static int decode_setxattr(struct xdr_stream *xdr,
+ struct nfs4_change_info *cinfo)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_SETXATTR);
+ if (status)
+ goto out;
+ status = decode_change_info(xdr, cinfo);
+out:
+ return status;
+}
+
+
+static void encode_getxattr(struct xdr_stream *xdr, const char *name,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_GETXATTR, decode_getxattr_maxsz, hdr);
+ encode_string(xdr, strlen(name), name);
+}
+
+static int decode_getxattr(struct xdr_stream *xdr,
+ struct nfs42_getxattrres *res,
+ struct rpc_rqst *req)
+{
+ int status;
+ __be32 *p;
+ u32 len, rdlen;
+
+ status = decode_op_hdr(xdr, OP_GETXATTR);
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ len = be32_to_cpup(p);
+ if (len > req->rq_rcv_buf.page_len)
+ return -ERANGE;
+
+ res->xattr_len = len;
+
+ if (len > 0) {
+ rdlen = xdr_read_pages(xdr, len);
+ if (rdlen < len)
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static void encode_removexattr(struct xdr_stream *xdr, const char *name,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_REMOVEXATTR, decode_removexattr_maxsz, hdr);
+ encode_string(xdr, strlen(name), name);
+}
+
+
+static int decode_removexattr(struct xdr_stream *xdr,
+ struct nfs4_change_info *cinfo)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_REMOVEXATTR);
+ if (status)
+ goto out;
+
+ status = decode_change_info(xdr, cinfo);
+out:
+ return status;
+}
+
+static void encode_listxattrs(struct xdr_stream *xdr,
+ const struct nfs42_listxattrsargs *arg,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_LISTXATTRS, decode_listxattrs_maxsz + 1, hdr);
+
+ p = reserve_space(xdr, 12);
+ if (unlikely(!p))
+ return;
+
+ p = xdr_encode_hyper(p, arg->cookie);
+ /*
+ * RFC 8276 says to specify the full max length of the LISTXATTRS
+ * XDR reply. Count is set to the XDR length of the names array
+ * plus the EOF marker. So, add the cookie and the names count.
+ */
+ *p = cpu_to_be32(arg->count + 8 + 4);
+}
+
+static int decode_listxattrs(struct xdr_stream *xdr,
+ struct nfs42_listxattrsres *res)
+{
+ int status;
+ __be32 *p;
+ u32 count, len, ulen;
+ size_t left, copied;
+ char *buf;
+
+ status = decode_op_hdr(xdr, OP_LISTXATTRS);
+ if (status) {
+ /*
+ * Special case: for LISTXATTRS, NFS4ERR_TOOSMALL
+ * should be translated to ERANGE.
+ */
+ if (status == -ETOOSMALL)
+ status = -ERANGE;
+ goto out;
+ }
+
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+
+ xdr_decode_hyper(p, &res->cookie);
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ left = res->xattr_len;
+ buf = res->xattr_buf;
+
+ count = be32_to_cpup(p);
+ copied = 0;
+
+ /*
+ * We have asked for enough room to encode the maximum number
+ * of possible attribute names, so everything should fit.
+ *
+ * But, don't rely on that assumption. Just decode entries
+ * until they don't fit anymore, just in case the server did
+ * something odd.
+ */
+ while (count--) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ len = be32_to_cpup(p);
+ if (len > (XATTR_NAME_MAX - XATTR_USER_PREFIX_LEN)) {
+ status = -ERANGE;
+ goto out;
+ }
+
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ return -EIO;
+
+ ulen = len + XATTR_USER_PREFIX_LEN + 1;
+ if (buf) {
+ if (ulen > left) {
+ status = -ERANGE;
+ goto out;
+ }
+
+ memcpy(buf, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+ memcpy(buf + XATTR_USER_PREFIX_LEN, p, len);
+
+ buf[ulen - 1] = 0;
+ buf += ulen;
+ left -= ulen;
+ }
+ copied += ulen;
+ }
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ res->eof = be32_to_cpup(p);
+ res->copied = copied;
+
+out:
+ if (status == -ERANGE && res->xattr_len == XATTR_LIST_MAX)
+ status = -E2BIG;
+
+ return status;
+}
+
/*
* Encode ALLOCATE request
*/
@@ -988,4 +1264,166 @@ out:
return status;
}
+#ifdef CONFIG_NFS_V4_2
+static void nfs4_xdr_enc_setxattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_setxattrargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_setxattr(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_setxattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_setxattrres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, req);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+
+ status = decode_setxattr(xdr, &res->cinfo);
+out:
+ return status;
+}
+
+static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_getxattrargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+ size_t plen;
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_getxattr(xdr, args->xattr_name, &hdr);
+
+ plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX;
+
+ rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen,
+ hdr.replen);
+ req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
+
+ encode_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_getxattr(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr, void *data)
+{
+ struct nfs42_getxattrres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_getxattr(xdr, res, rqstp);
+out:
+ return status;
+}
+
+static void nfs4_xdr_enc_listxattrs(struct rpc_rqst *req,
+ struct xdr_stream *xdr, const void *data)
+{
+ const struct nfs42_listxattrsargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_listxattrs(xdr, args, &hdr);
+
+ rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->count,
+ hdr.replen);
+ req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
+
+ encode_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr, void *data)
+{
+ struct nfs42_listxattrsres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ xdr_set_scratch_buffer(xdr, page_address(res->scratch), PAGE_SIZE);
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_listxattrs(xdr, res);
+out:
+ return status;
+}
+
+static void nfs4_xdr_enc_removexattr(struct rpc_rqst *req,
+ struct xdr_stream *xdr, const void *data)
+{
+ const struct nfs42_removexattrargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_removexattr(xdr, args->xattr_name, &hdr);
+ encode_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_removexattr(struct rpc_rqst *req,
+ struct xdr_stream *xdr, void *data)
+{
+ struct nfs42_removexattrres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, req);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+
+ status = decode_removexattr(xdr, &res->cinfo);
+out:
+ return status;
+}
+#endif
#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 210e590e1f71..0c9505dc852c 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -324,6 +324,13 @@ extern int update_open_stateid(struct nfs4_state *state,
extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
struct nfs_fsinfo *fsinfo);
+extern void nfs4_update_changeattr(struct inode *dir,
+ struct nfs4_change_info *cinfo,
+ unsigned long timestamp,
+ unsigned long cache_validity);
+extern int nfs4_buf_to_pages_noslab(const void *buf, size_t buflen,
+ struct page **pages);
+
#if defined(CONFIG_NFS_V4_1)
extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *);
extern int nfs4_proc_create_session(struct nfs_client *, const struct cred *);
@@ -557,6 +564,12 @@ static inline void nfs4_unregister_sysctl(void)
/* nfs4xdr.c */
extern const struct rpc_procinfo nfs4_procedures[];
+#ifdef CONFIG_NFS_V4_2
+extern const u32 nfs42_maxsetxattr_overhead;
+extern const u32 nfs42_maxgetxattr_overhead;
+extern const u32 nfs42_maxlistxattrs_overhead;
+#endif
+
struct nfs4_mount_data;
/* callback_xdr.c */
@@ -613,12 +626,34 @@ static inline bool nfs4_state_match_open_stateid_other(const struct nfs4_state *
nfs4_stateid_match_other(&state->open_stateid, stateid);
}
+/* nfs42xattr.c */
+#ifdef CONFIG_NFS_V4_2
+extern int __init nfs4_xattr_cache_init(void);
+extern void nfs4_xattr_cache_exit(void);
+extern void nfs4_xattr_cache_add(struct inode *inode, const char *name,
+ const char *buf, struct page **pages,
+ ssize_t buflen);
+extern void nfs4_xattr_cache_remove(struct inode *inode, const char *name);
+extern ssize_t nfs4_xattr_cache_get(struct inode *inode, const char *name,
+ char *buf, ssize_t buflen);
+extern void nfs4_xattr_cache_set_list(struct inode *inode, const char *buf,
+ ssize_t buflen);
+extern ssize_t nfs4_xattr_cache_list(struct inode *inode, char *buf,
+ ssize_t buflen);
+extern void nfs4_xattr_cache_zap(struct inode *inode);
#else
+static inline void nfs4_xattr_cache_zap(struct inode *inode)
+{
+}
+#endif /* CONFIG_NFS_V4_2 */
+
+#else /* CONFIG_NFS_V4 */
#define nfs4_close_state(a, b) do { } while (0)
#define nfs4_close_sync(a, b) do { } while (0)
#define nfs4_state_protect(a, b, c, d) do { } while (0)
#define nfs4_state_protect_write(a, b, c, d) do { } while (0)
+
#endif /* CONFIG_NFS_V4 */
#endif /* __LINUX_FS_NFS_NFS4_FS.H */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 0bd77cc1f639..daacc78a3d48 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -880,7 +880,7 @@ static int nfs4_set_client(struct nfs_server *server,
if (minorversion == 0)
__set_bit(NFS_CS_REUSEPORT, &cl_init.init_flags);
- else if (proto == XPRT_TRANSPORT_TCP)
+ if (proto == XPRT_TRANSPORT_TCP)
cl_init.nconnect = nconnect;
if (server->flags & NFS_MOUNT_NORESVPORT)
@@ -992,6 +992,36 @@ static void nfs4_session_limit_rwsize(struct nfs_server *server)
#endif /* CONFIG_NFS_V4_1 */
}
+/*
+ * Limit xattr sizes using the channel attributes.
+ */
+static void nfs4_session_limit_xasize(struct nfs_server *server)
+{
+#ifdef CONFIG_NFS_V4_2
+ struct nfs4_session *sess;
+ u32 server_gxa_sz;
+ u32 server_sxa_sz;
+ u32 server_lxa_sz;
+
+ if (!nfs4_has_session(server->nfs_client))
+ return;
+
+ sess = server->nfs_client->cl_session;
+
+ server_gxa_sz = sess->fc_attrs.max_resp_sz - nfs42_maxgetxattr_overhead;
+ server_sxa_sz = sess->fc_attrs.max_rqst_sz - nfs42_maxsetxattr_overhead;
+ server_lxa_sz = sess->fc_attrs.max_resp_sz -
+ nfs42_maxlistxattrs_overhead;
+
+ if (server->gxasize > server_gxa_sz)
+ server->gxasize = server_gxa_sz;
+ if (server->sxasize > server_sxa_sz)
+ server->sxasize = server_sxa_sz;
+ if (server->lxasize > server_lxa_sz)
+ server->lxasize = server_lxa_sz;
+#endif
+}
+
static int nfs4_server_common_setup(struct nfs_server *server,
struct nfs_fh *mntfh, bool auth_probe)
{
@@ -1039,6 +1069,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
goto out;
nfs4_session_limit_rwsize(server);
+ nfs4_session_limit_xasize(server);
if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
server->namelen = NFS4_MAXNAMLEN;
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 8e5d6223ddd3..a33970765467 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -110,6 +110,7 @@ static int
nfs4_file_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file_inode(file);
+ errseq_t since;
dprintk("NFS: flush(%pD2)\n", file);
@@ -125,7 +126,9 @@ nfs4_file_flush(struct file *file, fl_owner_t id)
return filemap_fdatawrite(file->f_mapping);
/* Flush writes to the server and return any errors */
- return nfs_wb_all(inode);
+ since = filemap_sample_wb_err(file->f_mapping);
+ nfs_wb_all(inode);
+ return filemap_check_wb_err(file->f_mapping, since);
}
#ifdef CONFIG_NFS_V4_2
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8963062da57e..dbd01548335b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -66,6 +66,7 @@
#include "nfs4idmap.h"
#include "nfs4session.h"
#include "fscache.h"
+#include "nfs42.h"
#include "nfs4trace.h"
@@ -256,6 +257,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
| FATTR4_WORD1_FS_LAYOUT_TYPES,
FATTR4_WORD2_LAYOUT_BLKSIZE
| FATTR4_WORD2_CLONE_BLKSIZE
+ | FATTR4_WORD2_XATTR_SUPPORT
};
const u32 nfs4_fs_locations_bitmap[3] = {
@@ -1173,37 +1175,49 @@ nfs4_dec_nlink_locked(struct inode *inode)
}
static void
-update_changeattr_locked(struct inode *dir, struct nfs4_change_info *cinfo,
+nfs4_update_changeattr_locked(struct inode *inode,
+ struct nfs4_change_info *cinfo,
unsigned long timestamp, unsigned long cache_validity)
{
- struct nfs_inode *nfsi = NFS_I(dir);
+ struct nfs_inode *nfsi = NFS_I(inode);
nfsi->cache_validity |= NFS_INO_INVALID_CTIME
| NFS_INO_INVALID_MTIME
- | NFS_INO_INVALID_DATA
| cache_validity;
- if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(dir)) {
+
+ if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(inode)) {
nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
nfsi->attrtimeo_timestamp = jiffies;
} else {
- nfs_force_lookup_revalidate(dir);
- if (cinfo->before != inode_peek_iversion_raw(dir))
+ if (S_ISDIR(inode->i_mode)) {
+ nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+ nfs_force_lookup_revalidate(inode);
+ } else {
+ if (!NFS_PROTO(inode)->have_delegation(inode,
+ FMODE_READ))
+ nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
+ }
+
+ if (cinfo->before != inode_peek_iversion_raw(inode))
nfsi->cache_validity |= NFS_INO_INVALID_ACCESS |
- NFS_INO_INVALID_ACL;
+ NFS_INO_INVALID_ACL |
+ NFS_INO_INVALID_XATTR;
}
- inode_set_iversion_raw(dir, cinfo->after);
+ inode_set_iversion_raw(inode, cinfo->after);
nfsi->read_cache_jiffies = timestamp;
nfsi->attr_gencount = nfs_inc_attr_generation_counter();
nfsi->cache_validity &= ~NFS_INO_INVALID_CHANGE;
- nfs_fscache_invalidate(dir);
+
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ nfs_fscache_invalidate(inode);
}
-static void
-update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo,
+void
+nfs4_update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo,
unsigned long timestamp, unsigned long cache_validity)
{
spin_lock(&dir->i_lock);
- update_changeattr_locked(dir, cinfo, timestamp, cache_validity);
+ nfs4_update_changeattr_locked(dir, cinfo, timestamp, cache_validity);
spin_unlock(&dir->i_lock);
}
@@ -1356,6 +1370,12 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
NFS4_ACCESS_MODIFY |
NFS4_ACCESS_EXTEND |
NFS4_ACCESS_EXECUTE;
+#ifdef CONFIG_NFS_V4_2
+ if (server->caps & NFS_CAP_XATTR)
+ p->o_arg.access |= NFS4_ACCESS_XAREAD |
+ NFS4_ACCESS_XAWRITE |
+ NFS4_ACCESS_XALIST;
+#endif
}
}
p->o_arg.clientid = server->nfs_client->cl_clientid;
@@ -2653,8 +2673,9 @@ static int _nfs4_proc_open(struct nfs4_opendata *data,
data->file_created = true;
if (data->file_created ||
inode_peek_iversion_raw(dir) != o_res->cinfo.after)
- update_changeattr(dir, &o_res->cinfo,
- o_res->f_attr->time_start, 0);
+ nfs4_update_changeattr(dir, &o_res->cinfo,
+ o_res->f_attr->time_start,
+ NFS_INO_INVALID_DATA);
}
if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
server->caps &= ~NFS_CAP_POSIX_LOCK;
@@ -3756,7 +3777,7 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL)
#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL)
-#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_MODE_UMASK - 1UL)
+#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_XATTR_SUPPORT - 1UL)
static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
{
@@ -4540,7 +4561,8 @@ _nfs4_proc_remove(struct inode *dir, const struct qstr *name, u32 ftype)
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
if (status == 0) {
spin_lock(&dir->i_lock);
- update_changeattr_locked(dir, &res.cinfo, timestamp, 0);
+ nfs4_update_changeattr_locked(dir, &res.cinfo, timestamp,
+ NFS_INO_INVALID_DATA);
/* Removing a directory decrements nlink in the parent */
if (ftype == NF4DIR && dir->i_nlink > 2)
nfs4_dec_nlink_locked(dir);
@@ -4624,8 +4646,9 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
&data->timeout) == -EAGAIN)
return 0;
if (task->tk_status == 0)
- update_changeattr(dir, &res->cinfo,
- res->dir_attr->time_start, 0);
+ nfs4_update_changeattr(dir, &res->cinfo,
+ res->dir_attr->time_start,
+ NFS_INO_INVALID_DATA);
return 1;
}
@@ -4669,16 +4692,18 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
if (task->tk_status == 0) {
if (new_dir != old_dir) {
/* Note: If we moved a directory, nlink will change */
- update_changeattr(old_dir, &res->old_cinfo,
+ nfs4_update_changeattr(old_dir, &res->old_cinfo,
res->old_fattr->time_start,
- NFS_INO_INVALID_OTHER);
- update_changeattr(new_dir, &res->new_cinfo,
+ NFS_INO_INVALID_OTHER |
+ NFS_INO_INVALID_DATA);
+ nfs4_update_changeattr(new_dir, &res->new_cinfo,
res->new_fattr->time_start,
- NFS_INO_INVALID_OTHER);
+ NFS_INO_INVALID_OTHER |
+ NFS_INO_INVALID_DATA);
} else
- update_changeattr(old_dir, &res->old_cinfo,
+ nfs4_update_changeattr(old_dir, &res->old_cinfo,
res->old_fattr->time_start,
- 0);
+ NFS_INO_INVALID_DATA);
}
return 1;
}
@@ -4719,7 +4744,8 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (!status) {
- update_changeattr(dir, &res.cinfo, res.fattr->time_start, 0);
+ nfs4_update_changeattr(dir, &res.cinfo, res.fattr->time_start,
+ NFS_INO_INVALID_DATA);
status = nfs_post_op_update_inode(inode, res.fattr);
if (!status)
nfs_setsecurity(inode, res.fattr, res.label);
@@ -4797,8 +4823,9 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
&data->arg.seq_args, &data->res.seq_res, 1);
if (status == 0) {
spin_lock(&dir->i_lock);
- update_changeattr_locked(dir, &data->res.dir_cinfo,
- data->res.fattr->time_start, 0);
+ nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo,
+ data->res.fattr->time_start,
+ NFS_INO_INVALID_DATA);
/* Creating a directory bumps nlink in the parent */
if (data->arg.ftype == NF4DIR)
nfs4_inc_nlink_locked(dir);
@@ -5531,7 +5558,7 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server)
*/
#define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE)
-static int buf_to_pages_noslab(const void *buf, size_t buflen,
+int nfs4_buf_to_pages_noslab(const void *buf, size_t buflen,
struct page **pages)
{
struct page *newpage, **spages;
@@ -5773,7 +5800,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
return -EOPNOTSUPP;
if (npages > ARRAY_SIZE(pages))
return -ERANGE;
- i = buf_to_pages_noslab(buf, buflen, arg.acl_pages);
+ i = nfs4_buf_to_pages_noslab(buf, buflen, arg.acl_pages);
if (i < 0)
return i;
nfs4_inode_make_writeable(inode);
@@ -5845,8 +5872,6 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,
return ret;
if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
return -ENOENT;
- if (buflen < label.len)
- return -ERANGE;
return 0;
}
@@ -7430,6 +7455,133 @@ nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
#endif
+#ifdef CONFIG_NFS_V4_2
+static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *key, const void *buf,
+ size_t buflen, int flags)
+{
+ struct nfs_access_entry cache;
+ int ret;
+
+ if (!nfs_server_capable(inode, NFS_CAP_XATTR))
+ return -EOPNOTSUPP;
+
+ /*
+ * There is no mapping from the MAY_* flags to the NFS_ACCESS_XA*
+ * flags right now. Handling of xattr operations use the normal
+ * file read/write permissions.
+ *
+ * Just in case the server has other ideas (which RFC 8276 allows),
+ * do a cached access check for the XA* flags to possibly avoid
+ * doing an RPC and getting EACCES back.
+ */
+ if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) {
+ if (!(cache.mask & NFS_ACCESS_XAWRITE))
+ return -EACCES;
+ }
+
+ if (buf == NULL) {
+ ret = nfs42_proc_removexattr(inode, key);
+ if (!ret)
+ nfs4_xattr_cache_remove(inode, key);
+ } else {
+ ret = nfs42_proc_setxattr(inode, key, buf, buflen, flags);
+ if (!ret)
+ nfs4_xattr_cache_add(inode, key, buf, NULL, buflen);
+ }
+
+ return ret;
+}
+
+static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *key, void *buf, size_t buflen)
+{
+ struct nfs_access_entry cache;
+ ssize_t ret;
+
+ if (!nfs_server_capable(inode, NFS_CAP_XATTR))
+ return -EOPNOTSUPP;
+
+ if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) {
+ if (!(cache.mask & NFS_ACCESS_XAREAD))
+ return -EACCES;
+ }
+
+ ret = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (ret)
+ return ret;
+
+ ret = nfs4_xattr_cache_get(inode, key, buf, buflen);
+ if (ret >= 0 || (ret < 0 && ret != -ENOENT))
+ return ret;
+
+ ret = nfs42_proc_getxattr(inode, key, buf, buflen);
+
+ return ret;
+}
+
+static ssize_t
+nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len)
+{
+ u64 cookie;
+ bool eof;
+ ssize_t ret, size;
+ char *buf;
+ size_t buflen;
+ struct nfs_access_entry cache;
+
+ if (!nfs_server_capable(inode, NFS_CAP_XATTR))
+ return 0;
+
+ if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) {
+ if (!(cache.mask & NFS_ACCESS_XALIST))
+ return 0;
+ }
+
+ ret = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (ret)
+ return ret;
+
+ ret = nfs4_xattr_cache_list(inode, list, list_len);
+ if (ret >= 0 || (ret < 0 && ret != -ENOENT))
+ return ret;
+
+ cookie = 0;
+ eof = false;
+ buflen = list_len ? list_len : XATTR_LIST_MAX;
+ buf = list_len ? list : NULL;
+ size = 0;
+
+ while (!eof) {
+ ret = nfs42_proc_listxattrs(inode, buf, buflen,
+ &cookie, &eof);
+ if (ret < 0)
+ return ret;
+
+ if (list_len) {
+ buf += ret;
+ buflen -= ret;
+ }
+ size += ret;
+ }
+
+ if (list_len)
+ nfs4_xattr_cache_set_list(inode, list, size);
+
+ return size;
+}
+
+#else
+
+static ssize_t
+nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len)
+{
+ return 0;
+}
+#endif /* CONFIG_NFS_V4_2 */
+
/*
* nfs_fhget will use either the mounted_on_fileid or the fileid
*/
@@ -10035,7 +10187,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
{
- ssize_t error, error2;
+ ssize_t error, error2, error3;
error = generic_listxattr(dentry, list, size);
if (error < 0)
@@ -10048,7 +10200,17 @@ static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size);
if (error2 < 0)
return error2;
- return error + error2;
+
+ if (list) {
+ list += error2;
+ size -= error2;
+ }
+
+ error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, size);
+ if (error3 < 0)
+ return error3;
+
+ return error + error2 + error3;
}
static const struct inode_operations nfs4_dir_inode_operations = {
@@ -10136,11 +10298,22 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
.set = nfs4_xattr_set_nfs4_acl,
};
+#ifdef CONFIG_NFS_V4_2
+static const struct xattr_handler nfs4_xattr_nfs4_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = nfs4_xattr_get_nfs4_user,
+ .set = nfs4_xattr_set_nfs4_user,
+};
+#endif
+
const struct xattr_handler *nfs4_xattr_handlers[] = {
&nfs4_xattr_nfs4_acl_handler,
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
&nfs4_xattr_nfs4_label_handler,
#endif
+#ifdef CONFIG_NFS_V4_2
+ &nfs4_xattr_nfs4_user_handler,
+#endif
NULL
};
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 1475f932d7da..0c1ab846b83d 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -69,6 +69,7 @@ static void nfs4_evict_inode(struct inode *inode)
pnfs_destroy_layout(NFS_I(inode));
/* First call standard NFS clear_inode() code */
nfs_clear_inode(inode);
+ nfs4_xattr_cache_zap(inode);
}
struct nfs_referral_count {
@@ -268,6 +269,12 @@ static int __init init_nfs_v4(void)
if (err)
goto out1;
+#ifdef CONFIG_NFS_V4_2
+ err = nfs4_xattr_cache_init();
+ if (err)
+ goto out2;
+#endif
+
err = nfs4_register_sysctl();
if (err)
goto out2;
@@ -288,6 +295,9 @@ static void __exit exit_nfs_v4(void)
nfs4_pnfs_v3_ds_connect_unload();
unregister_nfs_version(&nfs_v4);
+#ifdef CONFIG_NFS_V4_2
+ nfs4_xattr_cache_exit();
+#endif
nfs4_unregister_sysctl();
nfs_idmap_quit();
nfs_dns_resolver_destroy();
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 543541173a3d..b4f852d4d099 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -1727,6 +1727,13 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_group_to_gid);
DEFINE_NFS4_IDMAP_EVENT(nfs4_map_uid_to_name);
DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group);
+#ifdef CONFIG_NFS_V4_1
+#define NFS4_LSEG_LAYOUT_STATEID_HASH(lseg) \
+ (lseg ? nfs_stateid_hash(&lseg->pls_layout->plh_stateid) : 0)
+#else
+#define NFS4_LSEG_LAYOUT_STATEID_HASH(lseg) (0)
+#endif
+
DECLARE_EVENT_CLASS(nfs4_read_event,
TP_PROTO(
const struct nfs_pgio_header *hdr,
@@ -1745,6 +1752,8 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
__field(unsigned long, error)
__field(int, stateid_seq)
__field(u32, stateid_hash)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
),
TP_fast_assign(
@@ -1754,6 +1763,7 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
hdr->args.fh : &nfsi->fh;
const struct nfs4_state *state =
hdr->args.context->state;
+ const struct pnfs_layout_segment *lseg = hdr->lseg;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = nfsi->fileid;
@@ -1766,11 +1776,15 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
be32_to_cpu(state->stateid.seqid);
__entry->stateid_hash =
nfs_stateid_hash(&state->stateid);
+ __entry->layoutstateid_seq = lseg ? lseg->pls_seq : 0;
+ __entry->layoutstateid_hash =
+ NFS4_LSEG_LAYOUT_STATEID_HASH(lseg);
),
TP_printk(
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%u res=%u stateid=%d:0x%08x",
+ "offset=%lld count=%u res=%u stateid=%d:0x%08x "
+ "layoutstateid=%d:0x%08x",
-__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1778,7 +1792,8 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
__entry->fhandle,
(long long)__entry->offset,
__entry->arg_count, __entry->res_count,
- __entry->stateid_seq, __entry->stateid_hash
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash
)
);
#define DEFINE_NFS4_READ_EVENT(name) \
@@ -1811,6 +1826,8 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
__field(unsigned long, error)
__field(int, stateid_seq)
__field(u32, stateid_hash)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
),
TP_fast_assign(
@@ -1820,6 +1837,7 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
hdr->args.fh : &nfsi->fh;
const struct nfs4_state *state =
hdr->args.context->state;
+ const struct pnfs_layout_segment *lseg = hdr->lseg;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = nfsi->fileid;
@@ -1832,11 +1850,15 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
be32_to_cpu(state->stateid.seqid);
__entry->stateid_hash =
nfs_stateid_hash(&state->stateid);
+ __entry->layoutstateid_seq = lseg ? lseg->pls_seq : 0;
+ __entry->layoutstateid_hash =
+ NFS4_LSEG_LAYOUT_STATEID_HASH(lseg);
),
TP_printk(
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%u res=%u stateid=%d:0x%08x",
+ "offset=%lld count=%u res=%u stateid=%d:0x%08x "
+ "layoutstateid=%d:0x%08x",
-__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1844,7 +1866,8 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
__entry->fhandle,
(long long)__entry->offset,
__entry->arg_count, __entry->res_count,
- __entry->stateid_seq, __entry->stateid_hash
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash
)
);
@@ -1875,6 +1898,8 @@ DECLARE_EVENT_CLASS(nfs4_commit_event,
__field(unsigned long, error)
__field(loff_t, offset)
__field(u32, count)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
),
TP_fast_assign(
@@ -1882,6 +1907,7 @@ DECLARE_EVENT_CLASS(nfs4_commit_event,
const struct nfs_inode *nfsi = NFS_I(inode);
const struct nfs_fh *fh = data->args.fh ?
data->args.fh : &nfsi->fh;
+ const struct pnfs_layout_segment *lseg = data->lseg;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = nfsi->fileid;
@@ -1889,18 +1915,22 @@ DECLARE_EVENT_CLASS(nfs4_commit_event,
__entry->offset = data->args.offset;
__entry->count = data->args.count;
__entry->error = error < 0 ? -error : 0;
+ __entry->layoutstateid_seq = lseg ? lseg->pls_seq : 0;
+ __entry->layoutstateid_hash =
+ NFS4_LSEG_LAYOUT_STATEID_HASH(lseg);
),
TP_printk(
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%u",
+ "offset=%lld count=%u layoutstateid=%d:0x%08x",
-__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset,
- __entry->count
+ __entry->count,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash
)
);
#define DEFINE_NFS4_COMMIT_EVENT(name) \
@@ -1993,7 +2023,9 @@ TRACE_EVENT(nfs4_layoutget,
DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutcommit);
DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn);
-DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn_on_close);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layouterror);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutstats);
TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_UNKNOWN);
TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_NO_PNFS);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 47817ef0aadb..0b3510f62623 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4166,7 +4166,11 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
return -EIO;
if (len < NFS4_MAXLABELLEN) {
if (label) {
- memcpy(label->label, p, len);
+ if (label->len) {
+ if (label->len < len)
+ return -ERANGE;
+ memcpy(label->label, p, len);
+ }
label->len = len;
label->pi = pi;
label->lfs = lfs;
@@ -4201,6 +4205,26 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str
return status;
}
+static int decode_attr_xattrsupport(struct xdr_stream *xdr, uint32_t *bitmap,
+ uint32_t *res)
+{
+ __be32 *p;
+
+ *res = 0;
+ if (unlikely(bitmap[2] & (FATTR4_WORD2_XATTR_SUPPORT - 1U)))
+ return -EIO;
+ if (likely(bitmap[2] & FATTR4_WORD2_XATTR_SUPPORT)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *res = be32_to_cpup(p);
+ bitmap[2] &= ~FATTR4_WORD2_XATTR_SUPPORT;
+ }
+ dprintk("%s: XATTR support=%s\n", __func__,
+ *res == 0 ? "false" : "true");
+ return 0;
+}
+
static int verify_attr_len(struct xdr_stream *xdr, unsigned int savep, uint32_t attrlen)
{
unsigned int attrwords = XDR_QUADLEN(attrlen);
@@ -4855,6 +4879,11 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
if (status)
goto xdr_error;
+ status = decode_attr_xattrsupport(xdr, bitmap,
+ &fsinfo->xattr_support);
+ if (status)
+ goto xdr_error;
+
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
dprintk("%s: xdr returned %d!\n", __func__, -status);
@@ -5227,7 +5256,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
* The XDR encode routine has set things up so that
* the link text will be copied directly into the
* buffer. We just have to do overflow-checking,
- * and and null-terminate the text (the VFS expects
+ * and null-terminate the text (the VFS expects
* null-termination).
*/
xdr_terminate_string(rcvbuf, len);
@@ -7456,6 +7485,8 @@ static struct {
{ NFS4ERR_SYMLINK, -ELOOP },
{ NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
{ NFS4ERR_DEADLOCK, -EDEADLK },
+ { NFS4ERR_NOXATTR, -ENODATA },
+ { NFS4ERR_XATTR2BIG, -E2BIG },
{ -1, -EIO }
};
@@ -7584,6 +7615,10 @@ const struct rpc_procinfo nfs4_procedures[] = {
PROC42(COPY_NOTIFY, enc_copy_notify, dec_copy_notify),
PROC(LOOKUPP, enc_lookupp, dec_lookupp),
PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror),
+ PROC42(GETXATTR, enc_getxattr, dec_getxattr),
+ PROC42(SETXATTR, enc_setxattr, dec_setxattr),
+ PROC42(LISTXATTRS, enc_listxattrs, dec_listxattrs),
+ PROC42(REMOVEXATTR, enc_removexattr, dec_removexattr),
};
static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 547cec79899f..5a59dcdce0b2 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -59,7 +59,8 @@ TRACE_DEFINE_ENUM(NFS_INO_INVALID_OTHER);
{ NFS_INO_INVALID_CTIME, "INVALID_CTIME" }, \
{ NFS_INO_INVALID_MTIME, "INVALID_MTIME" }, \
{ NFS_INO_INVALID_SIZE, "INVALID_SIZE" }, \
- { NFS_INO_INVALID_OTHER, "INVALID_OTHER" })
+ { NFS_INO_INVALID_OTHER, "INVALID_OTHER" }, \
+ { NFS_INO_INVALID_XATTR, "INVALID_XATTR" })
TRACE_DEFINE_ENUM(NFS_INO_ADVISE_RDPLUS);
TRACE_DEFINE_ENUM(NFS_INO_STALE);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index dd2e14f5875d..40332c758d84 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1226,31 +1226,27 @@ out:
return status;
}
+static bool
+pnfs_layout_segments_returnable(struct pnfs_layout_hdr *lo,
+ enum pnfs_iomode iomode,
+ u32 seq)
+{
+ struct pnfs_layout_range recall_range = {
+ .length = NFS4_MAX_UINT64,
+ .iomode = iomode,
+ };
+ return pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
+ &recall_range, seq) != -EBUSY;
+}
+
/* Return true if layoutreturn is needed */
static bool
pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
{
- struct pnfs_layout_segment *s;
- enum pnfs_iomode iomode;
- u32 seq;
-
if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
return false;
-
- seq = lo->plh_return_seq;
- iomode = lo->plh_return_iomode;
-
- /* Defer layoutreturn until all recalled lsegs are done */
- list_for_each_entry(s, &lo->plh_segs, pls_list) {
- if (seq && pnfs_seqid_is_newer(s->pls_seq, seq))
- continue;
- if (iomode != IOMODE_ANY && s->pls_range.iomode != iomode)
- continue;
- if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
- return false;
- }
-
- return true;
+ return pnfs_layout_segments_returnable(lo, lo->plh_return_iomode,
+ lo->plh_return_seq);
}
static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
@@ -1549,12 +1545,12 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
default:
arg_stateid = &args->stateid;
}
+ trace_nfs4_layoutreturn_on_close(args->inode, &args->stateid, ret);
pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range,
res_stateid);
if (ld_private && ld_private->ops && ld_private->ops->free)
ld_private->ops->free(ld_private);
pnfs_put_layout_hdr(lo);
- trace_nfs4_layoutreturn_on_close(args->inode, 0);
}
bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
@@ -2392,16 +2388,6 @@ out_forget:
return ERR_PTR(-EAGAIN);
}
-static int
-mark_lseg_invalid_or_return(struct pnfs_layout_segment *lseg,
- struct list_head *tmp_list)
-{
- if (!mark_lseg_invalid(lseg, tmp_list))
- return 0;
- pnfs_cache_lseg_for_layoutreturn(lseg->pls_layout, lseg);
- return 1;
-}
-
/**
* pnfs_mark_matching_lsegs_return - Free or return matching layout segments
* @lo: pointer to layout header
@@ -2438,7 +2424,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
lseg, lseg->pls_range.iomode,
lseg->pls_range.offset,
lseg->pls_range.length);
- if (mark_lseg_invalid_or_return(lseg, tmp_list))
+ if (mark_lseg_invalid(lseg, tmp_list))
continue;
remaining++;
set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
@@ -2953,7 +2939,8 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
}
/* Resend all requests through pnfs. */
-void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
+void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr,
+ unsigned int mirror_idx)
{
struct nfs_pageio_descriptor pgio;
@@ -2964,6 +2951,7 @@ void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
nfs_pageio_init_read(&pgio, hdr->inode, false,
hdr->completion_ops);
+ pgio.pg_mirror_idx = mirror_idx;
hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr);
}
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 8e0ada581b92..2661c44c62db 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -311,7 +311,7 @@ int _pnfs_return_layout(struct inode *);
int pnfs_commit_and_return_layout(struct inode *);
void pnfs_ld_write_done(struct nfs_pgio_header *);
void pnfs_ld_read_done(struct nfs_pgio_header *);
-void pnfs_read_resend_pnfs(struct nfs_pgio_header *);
+void pnfs_read_resend_pnfs(struct nfs_pgio_header *, unsigned int mirror_idx);
struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
struct nfs_open_context *ctx,
loff_t pos,
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 235b959fc2b3..adf3bb0a8048 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -613,10 +613,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
lock = nilfs_mdt_bgl_lock(inode, group);
if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "%s (ino=%lu): entry number %llu already freed",
- __func__, inode->i_ino,
- (unsigned long long)req->pr_entry_nr);
+ nilfs_warn(inode->i_sb,
+ "%s (ino=%lu): entry number %llu already freed",
+ __func__, inode->i_ino,
+ (unsigned long long)req->pr_entry_nr);
else
nilfs_palloc_group_desc_add_entries(desc, lock, 1);
@@ -654,10 +654,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
lock = nilfs_mdt_bgl_lock(inode, group);
if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "%s (ino=%lu): entry number %llu already freed",
- __func__, inode->i_ino,
- (unsigned long long)req->pr_entry_nr);
+ nilfs_warn(inode->i_sb,
+ "%s (ino=%lu): entry number %llu already freed",
+ __func__, inode->i_ino,
+ (unsigned long long)req->pr_entry_nr);
else
nilfs_palloc_group_desc_add_entries(desc, lock, 1);
@@ -763,10 +763,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
do {
if (!nilfs_clear_bit_atomic(lock, group_offset,
bitmap)) {
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "%s (ino=%lu): entry number %llu already freed",
- __func__, inode->i_ino,
- (unsigned long long)entry_nrs[j]);
+ nilfs_warn(inode->i_sb,
+ "%s (ino=%lu): entry number %llu already freed",
+ __func__, inode->i_ino,
+ (unsigned long long)entry_nrs[j]);
} else {
n++;
}
@@ -808,10 +808,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
ret = nilfs_palloc_delete_entry_block(inode,
last_nrs[k]);
if (ret && ret != -ENOENT)
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "error %d deleting block that object (entry=%llu, ino=%lu) belongs to",
- ret, (unsigned long long)last_nrs[k],
- inode->i_ino);
+ nilfs_warn(inode->i_sb,
+ "error %d deleting block that object (entry=%llu, ino=%lu) belongs to",
+ ret, (unsigned long long)last_nrs[k],
+ inode->i_ino);
}
desc_kaddr = kmap_atomic(desc_bh->b_page);
@@ -826,9 +826,9 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
if (nfree == nilfs_palloc_entries_per_group(inode)) {
ret = nilfs_palloc_delete_bitmap_block(inode, group);
if (ret && ret != -ENOENT)
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "error %d deleting bitmap block of group=%lu, ino=%lu",
- ret, group, inode->i_ino);
+ nilfs_warn(inode->i_sb,
+ "error %d deleting bitmap block of group=%lu, ino=%lu",
+ ret, group, inode->i_ino);
}
}
return 0;
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 23e043eca237..f42ab57201e7 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -351,10 +351,10 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
(flags & NILFS_BTREE_NODE_ROOT) ||
nchildren < 0 ||
nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) {
- nilfs_msg(inode->i_sb, KERN_CRIT,
- "bad btree node (ino=%lu, blocknr=%llu): level = %d, flags = 0x%x, nchildren = %d",
- inode->i_ino, (unsigned long long)blocknr, level,
- flags, nchildren);
+ nilfs_crit(inode->i_sb,
+ "bad btree node (ino=%lu, blocknr=%llu): level = %d, flags = 0x%x, nchildren = %d",
+ inode->i_ino, (unsigned long long)blocknr, level,
+ flags, nchildren);
ret = 1;
}
return ret;
@@ -381,9 +381,9 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
level >= NILFS_BTREE_LEVEL_MAX ||
nchildren < 0 ||
nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
- nilfs_msg(inode->i_sb, KERN_CRIT,
- "bad btree root (ino=%lu): level = %d, flags = 0x%x, nchildren = %d",
- inode->i_ino, level, flags, nchildren);
+ nilfs_crit(inode->i_sb,
+ "bad btree root (ino=%lu): level = %d, flags = 0x%x, nchildren = %d",
+ inode->i_ino, level, flags, nchildren);
ret = 1;
}
return ret;
@@ -450,10 +450,10 @@ static int nilfs_btree_bad_node(const struct nilfs_bmap *btree,
{
if (unlikely(nilfs_btree_node_get_level(node) != level)) {
dump_stack();
- nilfs_msg(btree->b_inode->i_sb, KERN_CRIT,
- "btree level mismatch (ino=%lu): %d != %d",
- btree->b_inode->i_ino,
- nilfs_btree_node_get_level(node), level);
+ nilfs_crit(btree->b_inode->i_sb,
+ "btree level mismatch (ino=%lu): %d != %d",
+ btree->b_inode->i_ino,
+ nilfs_btree_node_get_level(node), level);
return 1;
}
return 0;
@@ -508,7 +508,7 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
out_no_wait:
if (!buffer_uptodate(bh)) {
- nilfs_msg(btree->b_inode->i_sb, KERN_ERR,
+ nilfs_err(btree->b_inode->i_sb,
"I/O error reading b-tree node block (ino=%lu, blocknr=%llu)",
btree->b_inode->i_ino, (unsigned long long)ptr);
brelse(bh);
@@ -2074,10 +2074,10 @@ static int nilfs_btree_propagate(struct nilfs_bmap *btree,
ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
if (ret < 0) {
if (unlikely(ret == -ENOENT))
- nilfs_msg(btree->b_inode->i_sb, KERN_CRIT,
- "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d",
- btree->b_inode->i_ino,
- (unsigned long long)key, level);
+ nilfs_crit(btree->b_inode->i_sb,
+ "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d",
+ btree->b_inode->i_ino,
+ (unsigned long long)key, level);
goto out;
}
@@ -2114,11 +2114,11 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree,
if (level < NILFS_BTREE_LEVEL_NODE_MIN ||
level >= NILFS_BTREE_LEVEL_MAX) {
dump_stack();
- nilfs_msg(btree->b_inode->i_sb, KERN_WARNING,
- "invalid btree level: %d (key=%llu, ino=%lu, blocknr=%llu)",
- level, (unsigned long long)key,
- btree->b_inode->i_ino,
- (unsigned long long)bh->b_blocknr);
+ nilfs_warn(btree->b_inode->i_sb,
+ "invalid btree level: %d (key=%llu, ino=%lu, blocknr=%llu)",
+ level, (unsigned long long)key,
+ btree->b_inode->i_ino,
+ (unsigned long long)bh->b_blocknr);
return;
}
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 8d41311b5db4..86d4d850d130 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -322,7 +322,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
int ret, ncps, nicps, nss, count, i;
if (unlikely(start == 0 || start > end)) {
- nilfs_msg(cpfile->i_sb, KERN_ERR,
+ nilfs_err(cpfile->i_sb,
"cannot delete checkpoints: invalid range [%llu, %llu)",
(unsigned long long)start, (unsigned long long)end);
return -EINVAL;
@@ -376,7 +376,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
cpfile, cno);
if (ret == 0)
continue;
- nilfs_msg(cpfile->i_sb, KERN_ERR,
+ nilfs_err(cpfile->i_sb,
"error %d deleting checkpoint block",
ret);
break;
@@ -981,12 +981,10 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
int err;
if (cpsize > sb->s_blocksize) {
- nilfs_msg(sb, KERN_ERR,
- "too large checkpoint size: %zu bytes", cpsize);
+ nilfs_err(sb, "too large checkpoint size: %zu bytes", cpsize);
return -EINVAL;
} else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) {
- nilfs_msg(sb, KERN_ERR,
- "too small checkpoint size: %zu bytes", cpsize);
+ nilfs_err(sb, "too small checkpoint size: %zu bytes", cpsize);
return -EINVAL;
}
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 6f4066636be9..8bccdf1158fc 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -340,11 +340,11 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
kaddr = kmap_atomic(entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
- nilfs_msg(dat->i_sb, KERN_CRIT,
- "%s: invalid vblocknr = %llu, [%llu, %llu)",
- __func__, (unsigned long long)vblocknr,
- (unsigned long long)le64_to_cpu(entry->de_start),
- (unsigned long long)le64_to_cpu(entry->de_end));
+ nilfs_crit(dat->i_sb,
+ "%s: invalid vblocknr = %llu, [%llu, %llu)",
+ __func__, (unsigned long long)vblocknr,
+ (unsigned long long)le64_to_cpu(entry->de_start),
+ (unsigned long long)le64_to_cpu(entry->de_end));
kunmap_atomic(kaddr);
brelse(entry_bh);
return -EINVAL;
@@ -471,11 +471,11 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size,
int err;
if (entry_size > sb->s_blocksize) {
- nilfs_msg(sb, KERN_ERR, "too large DAT entry size: %zu bytes",
+ nilfs_err(sb, "too large DAT entry size: %zu bytes",
entry_size);
return -EINVAL;
} else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) {
- nilfs_msg(sb, KERN_ERR, "too small DAT entry size: %zu bytes",
+ nilfs_err(sb, "too small DAT entry size: %zu bytes",
entry_size);
return -EINVAL;
}
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 533e24ea3a88..f353101955e3 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -328,16 +328,18 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
key = nilfs_bmap_data_get_key(bmap, *bh);
if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
- nilfs_msg(bmap->b_inode->i_sb, KERN_CRIT,
- "%s (ino=%lu): invalid key: %llu", __func__,
- bmap->b_inode->i_ino, (unsigned long long)key);
+ nilfs_crit(bmap->b_inode->i_sb,
+ "%s (ino=%lu): invalid key: %llu",
+ __func__,
+ bmap->b_inode->i_ino, (unsigned long long)key);
return -EINVAL;
}
ptr = nilfs_direct_get_ptr(bmap, key);
if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
- nilfs_msg(bmap->b_inode->i_sb, KERN_CRIT,
- "%s (ino=%lu): invalid pointer: %llu", __func__,
- bmap->b_inode->i_ino, (unsigned long long)ptr);
+ nilfs_crit(bmap->b_inode->i_sb,
+ "%s (ino=%lu): invalid pointer: %llu",
+ __func__,
+ bmap->b_inode->i_ino, (unsigned long long)ptr);
return -EINVAL;
}
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index aa3c328ee189..448320496856 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -142,7 +142,7 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
if (!buffer_uptodate(bh)) {
struct inode *inode = bh->b_page->mapping->host;
- nilfs_msg(inode->i_sb, KERN_ERR,
+ nilfs_err(inode->i_sb,
"I/O error reading %s block for GC (ino=%lu, vblocknr=%llu)",
buffer_nilfs_node(bh) ? "node" : "data",
inode->i_ino, (unsigned long long)bh->b_blocknr);
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 4140d232cadc..02727ed3a7c6 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -142,8 +142,8 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
if (unlikely(err))
- nilfs_msg(sb, KERN_WARNING, "error %d reading inode: ino=%lu",
- err, (unsigned long)ino);
+ nilfs_warn(sb, "error %d reading inode: ino=%lu",
+ err, (unsigned long)ino);
return err;
}
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 28009ec54420..745d371d6fea 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -104,10 +104,10 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
* However, the page having this block must
* be locked in this case.
*/
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "%s (ino=%lu): a race condition while inserting a data block at offset=%llu",
- __func__, inode->i_ino,
- (unsigned long long)blkoff);
+ nilfs_warn(inode->i_sb,
+ "%s (ino=%lu): a race condition while inserting a data block at offset=%llu",
+ __func__, inode->i_ino,
+ (unsigned long long)blkoff);
err = 0;
}
nilfs_transaction_abort(inode->i_sb);
@@ -388,7 +388,8 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
failed_after_creation:
clear_nlink(inode);
- unlock_new_inode(inode);
+ if (inode->i_state & I_NEW)
+ unlock_new_inode(inode);
iput(inode); /*
* raw_inode will be deleted through
* nilfs_evict_inode().
@@ -706,9 +707,8 @@ repeat:
goto repeat;
failed:
- nilfs_msg(ii->vfs_inode.i_sb, KERN_WARNING,
- "error %d truncating bmap (ino=%lu)", ret,
- ii->vfs_inode.i_ino);
+ nilfs_warn(ii->vfs_inode.i_sb, "error %d truncating bmap (ino=%lu)",
+ ret, ii->vfs_inode.i_ino);
}
void nilfs_truncate(struct inode *inode)
@@ -919,9 +919,9 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty)
* This will happen when somebody is freeing
* this inode.
*/
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "cannot set file dirty (ino=%lu): the file is being freed",
- inode->i_ino);
+ nilfs_warn(inode->i_sb,
+ "cannot set file dirty (ino=%lu): the file is being freed",
+ inode->i_ino);
spin_unlock(&nilfs->ns_inode_lock);
return -EINVAL; /*
* NILFS_I_DIRTY may remain for
@@ -942,9 +942,9 @@ int __nilfs_mark_inode_dirty(struct inode *inode, int flags)
err = nilfs_load_inode_block(inode, &ibh);
if (unlikely(err)) {
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "cannot mark inode dirty (ino=%lu): error %d loading inode block",
- inode->i_ino, err);
+ nilfs_warn(inode->i_sb,
+ "cannot mark inode dirty (ino=%lu): error %d loading inode block",
+ inode->i_ino, err);
return err;
}
nilfs_update_inode(inode, ibh, flags);
@@ -970,8 +970,8 @@ void nilfs_dirty_inode(struct inode *inode, int flags)
struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
if (is_bad_inode(inode)) {
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "tried to mark bad_inode dirty. ignored.");
+ nilfs_warn(inode->i_sb,
+ "tried to mark bad_inode dirty. ignored.");
dump_stack();
return;
}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 4ba73dbf3e8d..07d26f61f22a 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -569,25 +569,25 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
if (unlikely(ret < 0)) {
if (ret == -ENOENT)
- nilfs_msg(inode->i_sb, KERN_CRIT,
- "%s: invalid virtual block address (%s): ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu",
- __func__, vdesc->vd_flags ? "node" : "data",
- (unsigned long long)vdesc->vd_ino,
- (unsigned long long)vdesc->vd_cno,
- (unsigned long long)vdesc->vd_offset,
- (unsigned long long)vdesc->vd_blocknr,
- (unsigned long long)vdesc->vd_vblocknr);
+ nilfs_crit(inode->i_sb,
+ "%s: invalid virtual block address (%s): ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu",
+ __func__, vdesc->vd_flags ? "node" : "data",
+ (unsigned long long)vdesc->vd_ino,
+ (unsigned long long)vdesc->vd_cno,
+ (unsigned long long)vdesc->vd_offset,
+ (unsigned long long)vdesc->vd_blocknr,
+ (unsigned long long)vdesc->vd_vblocknr);
return ret;
}
if (unlikely(!list_empty(&bh->b_assoc_buffers))) {
- nilfs_msg(inode->i_sb, KERN_CRIT,
- "%s: conflicting %s buffer: ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu",
- __func__, vdesc->vd_flags ? "node" : "data",
- (unsigned long long)vdesc->vd_ino,
- (unsigned long long)vdesc->vd_cno,
- (unsigned long long)vdesc->vd_offset,
- (unsigned long long)vdesc->vd_blocknr,
- (unsigned long long)vdesc->vd_vblocknr);
+ nilfs_crit(inode->i_sb,
+ "%s: conflicting %s buffer: ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu",
+ __func__, vdesc->vd_flags ? "node" : "data",
+ (unsigned long long)vdesc->vd_ino,
+ (unsigned long long)vdesc->vd_cno,
+ (unsigned long long)vdesc->vd_offset,
+ (unsigned long long)vdesc->vd_blocknr,
+ (unsigned long long)vdesc->vd_vblocknr);
brelse(bh);
return -EEXIST;
}
@@ -837,8 +837,7 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
return 0;
failed:
- nilfs_msg(nilfs->ns_sb, KERN_ERR, "error %d preparing GC: %s", ret,
- msg);
+ nilfs_err(nilfs->ns_sb, "error %d preparing GC: %s", ret, msg);
return ret;
}
@@ -947,7 +946,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
if (ret < 0) {
- nilfs_msg(inode->i_sb, KERN_ERR,
+ nilfs_err(inode->i_sb,
"error %d preparing GC: cannot read source blocks",
ret);
} else {
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 700870a92bc4..c0361ce45f62 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -199,7 +199,7 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
out_no_wait:
err = -EIO;
if (!buffer_uptodate(first_bh)) {
- nilfs_msg(inode->i_sb, KERN_ERR,
+ nilfs_err(inode->i_sb,
"I/O error reading meta-data file (ino=%lu, block-offset=%lu)",
inode->i_ino, block);
goto failed_bh;
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 9fe6d4ab74f0..a6ec7961d4f5 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -272,9 +272,9 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
goto out;
if (!inode->i_nlink) {
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "deleting nonexistent file (ino=%lu), %d",
- inode->i_ino, inode->i_nlink);
+ nilfs_warn(inode->i_sb,
+ "deleting nonexistent file (ino=%lu), %d",
+ inode->i_ino, inode->i_nlink);
set_nlink(inode, 1);
}
err = nilfs_delete_entry(de, page);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 42395ba52da6..f8450ee3fd06 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -289,9 +289,8 @@ static inline int nilfs_mark_inode_dirty_sync(struct inode *inode)
/* super.c */
extern struct inode *nilfs_alloc_inode(struct super_block *);
-extern __printf(3, 4)
-void __nilfs_msg(struct super_block *sb, const char *level,
- const char *fmt, ...);
+__printf(2, 3)
+void __nilfs_msg(struct super_block *sb, const char *fmt, ...);
extern __printf(3, 4)
void __nilfs_error(struct super_block *sb, const char *function,
const char *fmt, ...);
@@ -299,7 +298,7 @@ void __nilfs_error(struct super_block *sb, const char *function,
#ifdef CONFIG_PRINTK
#define nilfs_msg(sb, level, fmt, ...) \
- __nilfs_msg(sb, level, fmt, ##__VA_ARGS__)
+ __nilfs_msg(sb, level fmt, ##__VA_ARGS__)
#define nilfs_error(sb, fmt, ...) \
__nilfs_error(sb, __func__, fmt, ##__VA_ARGS__)
@@ -307,7 +306,7 @@ void __nilfs_error(struct super_block *sb, const char *function,
#define nilfs_msg(sb, level, fmt, ...) \
do { \
- no_printk(fmt, ##__VA_ARGS__); \
+ no_printk(level fmt, ##__VA_ARGS__); \
(void)(sb); \
} while (0)
#define nilfs_error(sb, fmt, ...) \
@@ -318,6 +317,15 @@ void __nilfs_error(struct super_block *sb, const char *function,
#endif /* CONFIG_PRINTK */
+#define nilfs_crit(sb, fmt, ...) \
+ nilfs_msg(sb, KERN_CRIT, fmt, ##__VA_ARGS__)
+#define nilfs_err(sb, fmt, ...) \
+ nilfs_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__)
+#define nilfs_warn(sb, fmt, ...) \
+ nilfs_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__)
+#define nilfs_info(sb, fmt, ...) \
+ nilfs_msg(sb, KERN_INFO, fmt, ##__VA_ARGS__)
+
extern struct nilfs_super_block *
nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
extern int nilfs_store_magic_and_option(struct super_block *,
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index d7fc8d369d89..b175f1330408 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -391,9 +391,8 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
BUG_ON(!PageLocked(page));
if (!silent)
- nilfs_msg(sb, KERN_WARNING,
- "discard dirty page: offset=%lld, ino=%lu",
- page_offset(page), inode->i_ino);
+ nilfs_warn(sb, "discard dirty page: offset=%lld, ino=%lu",
+ page_offset(page), inode->i_ino);
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
@@ -409,9 +408,9 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
do {
lock_buffer(bh);
if (!silent)
- nilfs_msg(sb, KERN_WARNING,
- "discard dirty block: blocknr=%llu, size=%zu",
- (u64)bh->b_blocknr, bh->b_size);
+ nilfs_warn(sb,
+ "discard dirty block: blocknr=%llu, size=%zu",
+ (u64)bh->b_blocknr, bh->b_size);
set_mask_bits(&bh->b_state, clear_bits, 0);
unlock_buffer(bh);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 140b663e91c7..0b453ef8fae5 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -51,7 +51,7 @@ static int nilfs_warn_segment_error(struct super_block *sb, int err)
switch (err) {
case NILFS_SEG_FAIL_IO:
- nilfs_msg(sb, KERN_ERR, "I/O error reading segment");
+ nilfs_err(sb, "I/O error reading segment");
return -EIO;
case NILFS_SEG_FAIL_MAGIC:
msg = "Magic number mismatch";
@@ -72,10 +72,10 @@ static int nilfs_warn_segment_error(struct super_block *sb, int err)
msg = "No super root in the last segment";
break;
default:
- nilfs_msg(sb, KERN_ERR, "unrecognized segment error %d", err);
+ nilfs_err(sb, "unrecognized segment error %d", err);
return -EINVAL;
}
- nilfs_msg(sb, KERN_WARNING, "invalid segment: %s", msg);
+ nilfs_warn(sb, "invalid segment: %s", msg);
return -EINVAL;
}
@@ -543,10 +543,10 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
put_page(page);
failed_inode:
- nilfs_msg(sb, KERN_WARNING,
- "error %d recovering data block (ino=%lu, block-offset=%llu)",
- err, (unsigned long)rb->ino,
- (unsigned long long)rb->blkoff);
+ nilfs_warn(sb,
+ "error %d recovering data block (ino=%lu, block-offset=%llu)",
+ err, (unsigned long)rb->ino,
+ (unsigned long long)rb->blkoff);
if (!err2)
err2 = err;
next:
@@ -669,8 +669,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
}
if (nsalvaged_blocks) {
- nilfs_msg(sb, KERN_INFO, "salvaged %lu blocks",
- nsalvaged_blocks);
+ nilfs_info(sb, "salvaged %lu blocks", nsalvaged_blocks);
ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
}
out:
@@ -681,7 +680,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
confused:
err = -EINVAL;
failed:
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"error %d roll-forwarding partial segment at blocknr = %llu",
err, (unsigned long long)pseg_start);
goto out;
@@ -703,8 +702,8 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
set_buffer_dirty(bh);
err = sync_dirty_buffer(bh);
if (unlikely(err))
- nilfs_msg(nilfs->ns_sb, KERN_WARNING,
- "buffer sync write failed during post-cleaning of recovery.");
+ nilfs_warn(nilfs->ns_sb,
+ "buffer sync write failed during post-cleaning of recovery.");
brelse(bh);
}
@@ -739,8 +738,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root);
if (unlikely(err)) {
- nilfs_msg(sb, KERN_ERR,
- "error %d loading the latest checkpoint", err);
+ nilfs_err(sb, "error %d loading the latest checkpoint", err);
return err;
}
@@ -751,8 +749,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri);
if (unlikely(err)) {
- nilfs_msg(sb, KERN_ERR,
- "error %d preparing segment for recovery",
+ nilfs_err(sb, "error %d preparing segment for recovery",
err);
goto failed;
}
@@ -766,8 +763,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
nilfs_detach_log_writer(sb);
if (unlikely(err)) {
- nilfs_msg(sb, KERN_ERR,
- "error %d writing segment for recovery",
+ nilfs_err(sb, "error %d writing segment for recovery",
err);
goto failed;
}
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 20c479b5e41b..1a8729eded8b 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -505,7 +505,7 @@ static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
} while (--segbuf->sb_nbio > 0);
if (unlikely(atomic_read(&segbuf->sb_err) > 0)) {
- nilfs_msg(segbuf->sb_super, KERN_ERR,
+ nilfs_err(segbuf->sb_super,
"I/O error writing log (start-blocknr=%llu, block-count=%lu) in segment %llu",
(unsigned long long)segbuf->sb_pseg_start,
segbuf->sb_sum.nblocks,
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 91b58c897f92..a651e821c2de 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -158,7 +158,7 @@ static int nilfs_prepare_segment_lock(struct super_block *sb,
* it is saved and will be restored on
* nilfs_transaction_commit().
*/
- nilfs_msg(sb, KERN_WARNING, "journal info from a different FS");
+ nilfs_warn(sb, "journal info from a different FS");
save = current->journal_info;
}
if (!ti) {
@@ -1940,9 +1940,9 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
err = nilfs_ifile_get_inode_block(
ifile, ii->vfs_inode.i_ino, &ibh);
if (unlikely(err)) {
- nilfs_msg(sci->sc_super, KERN_WARNING,
- "log writer: error %d getting inode block (ino=%lu)",
- err, ii->vfs_inode.i_ino);
+ nilfs_warn(sci->sc_super,
+ "log writer: error %d getting inode block (ino=%lu)",
+ err, ii->vfs_inode.i_ino);
return err;
}
spin_lock(&nilfs->ns_inode_lock);
@@ -2449,7 +2449,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
if (likely(!err))
break;
- nilfs_msg(sb, KERN_WARNING, "error %d cleaning segments", err);
+ nilfs_warn(sb, "error %d cleaning segments", err);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(sci->sc_interval);
}
@@ -2457,9 +2457,9 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
sci->sc_nfreesegs);
if (ret) {
- nilfs_msg(sb, KERN_WARNING,
- "error %d on discard request, turning discards off for the device",
- ret);
+ nilfs_warn(sb,
+ "error %d on discard request, turning discards off for the device",
+ ret);
nilfs_clear_opt(nilfs, DISCARD);
}
}
@@ -2540,9 +2540,9 @@ static int nilfs_segctor_thread(void *arg)
/* start sync. */
sci->sc_task = current;
wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */
- nilfs_msg(sci->sc_super, KERN_INFO,
- "segctord starting. Construction interval = %lu seconds, CP frequency < %lu seconds",
- sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
+ nilfs_info(sci->sc_super,
+ "segctord starting. Construction interval = %lu seconds, CP frequency < %lu seconds",
+ sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
spin_lock(&sci->sc_state_lock);
loop:
@@ -2616,8 +2616,8 @@ static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
if (IS_ERR(t)) {
int err = PTR_ERR(t);
- nilfs_msg(sci->sc_super, KERN_ERR,
- "error %d creating segctord thread", err);
+ nilfs_err(sci->sc_super, "error %d creating segctord thread",
+ err);
return err;
}
wait_event(sci->sc_wait_task, sci->sc_task != NULL);
@@ -2727,14 +2727,14 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
nilfs_segctor_write_out(sci);
if (!list_empty(&sci->sc_dirty_files)) {
- nilfs_msg(sci->sc_super, KERN_WARNING,
- "disposed unprocessed dirty file(s) when stopping log writer");
+ nilfs_warn(sci->sc_super,
+ "disposed unprocessed dirty file(s) when stopping log writer");
nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
}
if (!list_empty(&sci->sc_iput_queue)) {
- nilfs_msg(sci->sc_super, KERN_WARNING,
- "disposed unprocessed inode(s) in iput queue when stopping log writer");
+ nilfs_warn(sci->sc_super,
+ "disposed unprocessed inode(s) in iput queue when stopping log writer");
nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
}
@@ -2812,8 +2812,8 @@ void nilfs_detach_log_writer(struct super_block *sb)
spin_lock(&nilfs->ns_inode_lock);
if (!list_empty(&nilfs->ns_dirty_files)) {
list_splice_init(&nilfs->ns_dirty_files, &garbage_list);
- nilfs_msg(sb, KERN_WARNING,
- "disposed unprocessed dirty file(s) when detaching log writer");
+ nilfs_warn(sb,
+ "disposed unprocessed dirty file(s) when detaching log writer");
}
spin_unlock(&nilfs->ns_inode_lock);
up_write(&nilfs->ns_segctor_sem);
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index bf3f8f05c89b..42ff67c0c14f 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -171,9 +171,9 @@ int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs,
down_write(&NILFS_MDT(sufile)->mi_sem);
for (seg = segnumv; seg < segnumv + nsegs; seg++) {
if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) {
- nilfs_msg(sufile->i_sb, KERN_WARNING,
- "%s: invalid segment number: %llu",
- __func__, (unsigned long long)*seg);
+ nilfs_warn(sufile->i_sb,
+ "%s: invalid segment number: %llu",
+ __func__, (unsigned long long)*seg);
nerr++;
}
}
@@ -230,9 +230,8 @@ int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
int ret;
if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
- nilfs_msg(sufile->i_sb, KERN_WARNING,
- "%s: invalid segment number: %llu",
- __func__, (unsigned long long)segnum);
+ nilfs_warn(sufile->i_sb, "%s: invalid segment number: %llu",
+ __func__, (unsigned long long)segnum);
return -EINVAL;
}
down_write(&NILFS_MDT(sufile)->mi_sem);
@@ -410,9 +409,8 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (unlikely(!nilfs_segment_usage_clean(su))) {
- nilfs_msg(sufile->i_sb, KERN_WARNING,
- "%s: segment %llu must be clean", __func__,
- (unsigned long long)segnum);
+ nilfs_warn(sufile->i_sb, "%s: segment %llu must be clean",
+ __func__, (unsigned long long)segnum);
kunmap_atomic(kaddr);
return;
}
@@ -468,9 +466,8 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (nilfs_segment_usage_clean(su)) {
- nilfs_msg(sufile->i_sb, KERN_WARNING,
- "%s: segment %llu is already clean",
- __func__, (unsigned long long)segnum);
+ nilfs_warn(sufile->i_sb, "%s: segment %llu is already clean",
+ __func__, (unsigned long long)segnum);
kunmap_atomic(kaddr);
return;
}
@@ -1168,12 +1165,12 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
int err;
if (susize > sb->s_blocksize) {
- nilfs_msg(sb, KERN_ERR,
- "too large segment usage size: %zu bytes", susize);
+ nilfs_err(sb, "too large segment usage size: %zu bytes",
+ susize);
return -EINVAL;
} else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) {
- nilfs_msg(sb, KERN_ERR,
- "too small segment usage size: %zu bytes", susize);
+ nilfs_err(sb, "too small segment usage size: %zu bytes",
+ susize);
return -EINVAL;
}
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 5729ee86da9a..2eee5fb1a882 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -62,19 +62,25 @@ struct kmem_cache *nilfs_btree_path_cache;
static int nilfs_setup_super(struct super_block *sb, int is_mount);
static int nilfs_remount(struct super_block *sb, int *flags, char *data);
-void __nilfs_msg(struct super_block *sb, const char *level, const char *fmt,
- ...)
+void __nilfs_msg(struct super_block *sb, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
+ int level;
va_start(args, fmt);
- vaf.fmt = fmt;
+
+ level = printk_get_level(fmt);
+ vaf.fmt = printk_skip_level(fmt);
vaf.va = &args;
+
if (sb)
- printk("%sNILFS (%s): %pV\n", level, sb->s_id, &vaf);
+ printk("%c%cNILFS (%s): %pV\n",
+ KERN_SOH_ASCII, level, sb->s_id, &vaf);
else
- printk("%sNILFS: %pV\n", level, &vaf);
+ printk("%c%cNILFS: %pV\n",
+ KERN_SOH_ASCII, level, &vaf);
+
va_end(args);
}
@@ -106,7 +112,7 @@ static void nilfs_set_error(struct super_block *sb)
*
* This implements the body of nilfs_error() macro. Normally,
* nilfs_error() should be used. As for sustainable errors such as a
- * single-shot I/O error, nilfs_msg() should be used instead.
+ * single-shot I/O error, nilfs_err() should be used instead.
*
* Callers should not add a trailing newline since this will do it.
*/
@@ -178,8 +184,7 @@ static int nilfs_sync_super(struct super_block *sb, int flag)
}
if (unlikely(err)) {
- nilfs_msg(sb, KERN_ERR, "unable to write superblock: err=%d",
- err);
+ nilfs_err(sb, "unable to write superblock: err=%d", err);
if (err == -EIO && nilfs->ns_sbh[1]) {
/*
* sbp[0] points to newer log than sbp[1],
@@ -249,7 +254,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) {
memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
} else {
- nilfs_msg(sb, KERN_CRIT, "superblock broke");
+ nilfs_crit(sb, "superblock broke");
return NULL;
}
} else if (sbp[1] &&
@@ -359,9 +364,9 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
offset = sb2off & (nilfs->ns_blocksize - 1);
nsbh = sb_getblk(sb, newblocknr);
if (!nsbh) {
- nilfs_msg(sb, KERN_WARNING,
- "unable to move secondary superblock to block %llu",
- (unsigned long long)newblocknr);
+ nilfs_warn(sb,
+ "unable to move secondary superblock to block %llu",
+ (unsigned long long)newblocknr);
ret = -EIO;
goto out;
}
@@ -524,7 +529,7 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
up_read(&nilfs->ns_segctor_sem);
if (unlikely(err)) {
if (err == -ENOENT || err == -EINVAL) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"Invalid checkpoint (checkpoint number=%llu)",
(unsigned long long)cno);
err = -EINVAL;
@@ -622,8 +627,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
err = nilfs_ifile_count_free_inodes(root->ifile,
&nmaxinodes, &nfreeinodes);
if (unlikely(err)) {
- nilfs_msg(sb, KERN_WARNING,
- "failed to count free inodes: err=%d", err);
+ nilfs_warn(sb, "failed to count free inodes: err=%d", err);
if (err == -ERANGE) {
/*
* If nilfs_palloc_count_max_entries() returns
@@ -755,7 +759,7 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
break;
case Opt_snapshot:
if (is_remount) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"\"%s\" option is invalid for remount",
p);
return 0;
@@ -771,8 +775,7 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
nilfs_clear_opt(nilfs, DISCARD);
break;
default:
- nilfs_msg(sb, KERN_ERR,
- "unrecognized mount option \"%s\"", p);
+ nilfs_err(sb, "unrecognized mount option \"%s\"", p);
return 0;
}
}
@@ -808,10 +811,10 @@ static int nilfs_setup_super(struct super_block *sb, int is_mount)
mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
- nilfs_msg(sb, KERN_WARNING, "mounting fs with errors");
+ nilfs_warn(sb, "mounting fs with errors");
#if 0
} else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) {
- nilfs_msg(sb, KERN_WARNING, "maximal mount count reached");
+ nilfs_warn(sb, "maximal mount count reached");
#endif
}
if (!max_mnt_count)
@@ -874,7 +877,7 @@ int nilfs_check_feature_compatibility(struct super_block *sb,
features = le64_to_cpu(sbp->s_feature_incompat) &
~NILFS_FEATURE_INCOMPAT_SUPP;
if (features) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"couldn't mount because of unsupported optional features (%llx)",
(unsigned long long)features);
return -EINVAL;
@@ -882,7 +885,7 @@ int nilfs_check_feature_compatibility(struct super_block *sb,
features = le64_to_cpu(sbp->s_feature_compat_ro) &
~NILFS_FEATURE_COMPAT_RO_SUPP;
if (!sb_rdonly(sb) && features) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"couldn't mount RDWR because of unsupported optional features (%llx)",
(unsigned long long)features);
return -EINVAL;
@@ -901,12 +904,12 @@ static int nilfs_get_root_dentry(struct super_block *sb,
inode = nilfs_iget(sb, root, NILFS_ROOT_INO);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
- nilfs_msg(sb, KERN_ERR, "error %d getting root inode", ret);
+ nilfs_err(sb, "error %d getting root inode", ret);
goto out;
}
if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) {
iput(inode);
- nilfs_msg(sb, KERN_ERR, "corrupt root inode");
+ nilfs_err(sb, "corrupt root inode");
ret = -EINVAL;
goto out;
}
@@ -934,7 +937,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
return ret;
failed_dentry:
- nilfs_msg(sb, KERN_ERR, "error %d getting root dentry", ret);
+ nilfs_err(sb, "error %d getting root dentry", ret);
goto out;
}
@@ -954,7 +957,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
ret = (ret == -ENOENT) ? -EINVAL : ret;
goto out;
} else if (!ret) {
- nilfs_msg(s, KERN_ERR,
+ nilfs_err(s,
"The specified checkpoint is not a snapshot (checkpoint number=%llu)",
(unsigned long long)cno);
ret = -EINVAL;
@@ -963,7 +966,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
ret = nilfs_attach_checkpoint(s, cno, false, &root);
if (ret) {
- nilfs_msg(s, KERN_ERR,
+ nilfs_err(s,
"error %d while loading snapshot (checkpoint number=%llu)",
ret, (unsigned long long)cno);
goto out;
@@ -1060,7 +1063,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
cno = nilfs_last_cno(nilfs);
err = nilfs_attach_checkpoint(sb, cno, true, &fsroot);
if (err) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"error %d while loading last checkpoint (checkpoint number=%llu)",
err, (unsigned long long)cno);
goto failed_unload;
@@ -1122,8 +1125,8 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
err = -EINVAL;
if (!nilfs_valid_fs(nilfs)) {
- nilfs_msg(sb, KERN_WARNING,
- "couldn't remount because the filesystem is in an incomplete recovery state");
+ nilfs_warn(sb,
+ "couldn't remount because the filesystem is in an incomplete recovery state");
goto restore_opts;
}
@@ -1155,9 +1158,9 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
~NILFS_FEATURE_COMPAT_RO_SUPP;
up_read(&nilfs->ns_sem);
if (features) {
- nilfs_msg(sb, KERN_WARNING,
- "couldn't remount RDWR because of unsupported optional features (%llx)",
- (unsigned long long)features);
+ nilfs_warn(sb,
+ "couldn't remount RDWR because of unsupported optional features (%llx)",
+ (unsigned long long)features);
err = -EROFS;
goto restore_opts;
}
@@ -1216,7 +1219,7 @@ static int nilfs_parse_snapshot_option(const char *option,
return 0;
parse_error:
- nilfs_msg(NULL, KERN_ERR, "invalid option \"%s\": %s", option, msg);
+ nilfs_err(NULL, "invalid option \"%s\": %s", option, msg);
return 1;
}
@@ -1319,7 +1322,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
} else if (!sd.cno) {
if (nilfs_tree_is_busy(s->s_root)) {
if ((flags ^ s->s_flags) & SB_RDONLY) {
- nilfs_msg(s, KERN_ERR,
+ nilfs_err(s,
"the device already has a %s mount.",
sb_rdonly(s) ? "read-only" : "read/write");
err = -EBUSY;
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index e60be7bb55b0..303d71430bdd 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -263,8 +263,8 @@ nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr,
err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
up_read(&nilfs->ns_segctor_sem);
if (err < 0) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "unable to get checkpoint stat: err=%d", err);
+ nilfs_err(nilfs->ns_sb, "unable to get checkpoint stat: err=%d",
+ err);
return err;
}
@@ -286,8 +286,8 @@ nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr,
err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
up_read(&nilfs->ns_segctor_sem);
if (err < 0) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "unable to get checkpoint stat: err=%d", err);
+ nilfs_err(nilfs->ns_sb, "unable to get checkpoint stat: err=%d",
+ err);
return err;
}
@@ -405,8 +405,8 @@ nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr,
err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
up_read(&nilfs->ns_segctor_sem);
if (err < 0) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "unable to get segment stat: err=%d", err);
+ nilfs_err(nilfs->ns_sb, "unable to get segment stat: err=%d",
+ err);
return err;
}
@@ -779,15 +779,15 @@ nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr,
err = kstrtouint(skip_spaces(buf), 0, &val);
if (err) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "unable to convert string: err=%d", err);
+ nilfs_err(nilfs->ns_sb, "unable to convert string: err=%d",
+ err);
return err;
}
if (val < NILFS_SB_FREQ) {
val = NILFS_SB_FREQ;
- nilfs_msg(nilfs->ns_sb, KERN_WARNING,
- "superblock update frequency cannot be lesser than 10 seconds");
+ nilfs_warn(nilfs->ns_sb,
+ "superblock update frequency cannot be lesser than 10 seconds");
}
down_write(&nilfs->ns_sem);
@@ -990,8 +990,7 @@ int nilfs_sysfs_create_device_group(struct super_block *sb)
nilfs->ns_dev_subgroups = kzalloc(devgrp_size, GFP_KERNEL);
if (unlikely(!nilfs->ns_dev_subgroups)) {
err = -ENOMEM;
- nilfs_msg(sb, KERN_ERR,
- "unable to allocate memory for device group");
+ nilfs_err(sb, "unable to allocate memory for device group");
goto failed_create_device_group;
}
@@ -1101,15 +1100,13 @@ int __init nilfs_sysfs_init(void)
nilfs_kset = kset_create_and_add(NILFS_ROOT_GROUP_NAME, NULL, fs_kobj);
if (!nilfs_kset) {
err = -ENOMEM;
- nilfs_msg(NULL, KERN_ERR,
- "unable to create sysfs entry: err=%d", err);
+ nilfs_err(NULL, "unable to create sysfs entry: err=%d", err);
goto failed_sysfs_init;
}
err = sysfs_create_group(&nilfs_kset->kobj, &nilfs_feature_attr_group);
if (unlikely(err)) {
- nilfs_msg(NULL, KERN_ERR,
- "unable to create feature group: err=%d", err);
+ nilfs_err(NULL, "unable to create feature group: err=%d", err);
goto cleanup_sysfs_init;
}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 484785cdf96e..221a1cc597f0 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -183,7 +183,7 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
nilfs->ns_cno = nilfs->ns_last_cno + 1;
if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ nilfs_err(nilfs->ns_sb,
"pointed segment number is out of range: segnum=%llu, nsegments=%lu",
(unsigned long long)nilfs->ns_segnum,
nilfs->ns_nsegments);
@@ -210,12 +210,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
int err;
if (!valid_fs) {
- nilfs_msg(sb, KERN_WARNING, "mounting unchecked fs");
+ nilfs_warn(sb, "mounting unchecked fs");
if (s_flags & SB_RDONLY) {
- nilfs_msg(sb, KERN_INFO,
- "recovery required for readonly filesystem");
- nilfs_msg(sb, KERN_INFO,
- "write access will be enabled during recovery");
+ nilfs_info(sb,
+ "recovery required for readonly filesystem");
+ nilfs_info(sb,
+ "write access will be enabled during recovery");
}
}
@@ -230,12 +230,11 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
goto scan_error;
if (!nilfs_valid_sb(sbp[1])) {
- nilfs_msg(sb, KERN_WARNING,
- "unable to fall back to spare super block");
+ nilfs_warn(sb,
+ "unable to fall back to spare super block");
goto scan_error;
}
- nilfs_msg(sb, KERN_INFO,
- "trying rollback from an earlier position");
+ nilfs_info(sb, "trying rollback from an earlier position");
/*
* restore super block with its spare and reconfigure
@@ -248,9 +247,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
/* verify consistency between two super blocks */
blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size);
if (blocksize != nilfs->ns_blocksize) {
- nilfs_msg(sb, KERN_WARNING,
- "blocksize differs between two super blocks (%d != %d)",
- blocksize, nilfs->ns_blocksize);
+ nilfs_warn(sb,
+ "blocksize differs between two super blocks (%d != %d)",
+ blocksize, nilfs->ns_blocksize);
goto scan_error;
}
@@ -269,8 +268,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root);
if (unlikely(err)) {
- nilfs_msg(sb, KERN_ERR, "error %d while loading super root",
- err);
+ nilfs_err(sb, "error %d while loading super root", err);
goto failed;
}
@@ -281,28 +279,28 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
__u64 features;
if (nilfs_test_opt(nilfs, NORECOVERY)) {
- nilfs_msg(sb, KERN_INFO,
- "norecovery option specified, skipping roll-forward recovery");
+ nilfs_info(sb,
+ "norecovery option specified, skipping roll-forward recovery");
goto skip_recovery;
}
features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
~NILFS_FEATURE_COMPAT_RO_SUPP;
if (features) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"couldn't proceed with recovery because of unsupported optional features (%llx)",
(unsigned long long)features);
err = -EROFS;
goto failed_unload;
}
if (really_read_only) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"write access unavailable, cannot proceed");
err = -EROFS;
goto failed_unload;
}
sb->s_flags &= ~SB_RDONLY;
} else if (nilfs_test_opt(nilfs, NORECOVERY)) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"recovery cancelled because norecovery option was specified for a read/write mount");
err = -EINVAL;
goto failed_unload;
@@ -318,12 +316,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
up_write(&nilfs->ns_sem);
if (err) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"error %d updating super block. recovery unfinished.",
err);
goto failed_unload;
}
- nilfs_msg(sb, KERN_INFO, "recovery complete");
+ nilfs_info(sb, "recovery complete");
skip_recovery:
nilfs_clear_recovery_info(&ri);
@@ -331,7 +329,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
return 0;
scan_error:
- nilfs_msg(sb, KERN_ERR, "error %d while searching super root", err);
+ nilfs_err(sb, "error %d while searching super root", err);
goto failed;
failed_unload:
@@ -378,7 +376,7 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
struct nilfs_super_block *sbp)
{
if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ nilfs_err(nilfs->ns_sb,
"unsupported revision (superblock rev.=%d.%d, current rev.=%d.%d). Please check the version of mkfs.nilfs(2).",
le32_to_cpu(sbp->s_rev_level),
le16_to_cpu(sbp->s_minor_rev_level),
@@ -391,13 +389,11 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
if (nilfs->ns_inode_size > nilfs->ns_blocksize) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "too large inode size: %d bytes",
+ nilfs_err(nilfs->ns_sb, "too large inode size: %d bytes",
nilfs->ns_inode_size);
return -EINVAL;
} else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "too small inode size: %d bytes",
+ nilfs_err(nilfs->ns_sb, "too small inode size: %d bytes",
nilfs->ns_inode_size);
return -EINVAL;
}
@@ -406,8 +402,7 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "too short segment: %lu blocks",
+ nilfs_err(nilfs->ns_sb, "too short segment: %lu blocks",
nilfs->ns_blocks_per_segment);
return -EINVAL;
}
@@ -417,7 +412,7 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
le32_to_cpu(sbp->s_r_segments_percentage);
if (nilfs->ns_r_segments_percentage < 1 ||
nilfs->ns_r_segments_percentage > 99) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ nilfs_err(nilfs->ns_sb,
"invalid reserved segments percentage: %lu",
nilfs->ns_r_segments_percentage);
return -EINVAL;
@@ -503,16 +498,16 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
if (!sbp[0]) {
if (!sbp[1]) {
- nilfs_msg(sb, KERN_ERR, "unable to read superblock");
+ nilfs_err(sb, "unable to read superblock");
return -EIO;
}
- nilfs_msg(sb, KERN_WARNING,
- "unable to read primary superblock (blocksize = %d)",
- blocksize);
+ nilfs_warn(sb,
+ "unable to read primary superblock (blocksize = %d)",
+ blocksize);
} else if (!sbp[1]) {
- nilfs_msg(sb, KERN_WARNING,
- "unable to read secondary superblock (blocksize = %d)",
- blocksize);
+ nilfs_warn(sb,
+ "unable to read secondary superblock (blocksize = %d)",
+ blocksize);
}
/*
@@ -534,14 +529,14 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
}
if (!valid[swp]) {
nilfs_release_super_block(nilfs);
- nilfs_msg(sb, KERN_ERR, "couldn't find nilfs on the device");
+ nilfs_err(sb, "couldn't find nilfs on the device");
return -EINVAL;
}
if (!valid[!swp])
- nilfs_msg(sb, KERN_WARNING,
- "broken superblock, retrying with spare superblock (blocksize = %d)",
- blocksize);
+ nilfs_warn(sb,
+ "broken superblock, retrying with spare superblock (blocksize = %d)",
+ blocksize);
if (swp)
nilfs_swap_super_block(nilfs);
@@ -575,7 +570,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
if (!blocksize) {
- nilfs_msg(sb, KERN_ERR, "unable to set blocksize");
+ nilfs_err(sb, "unable to set blocksize");
err = -EINVAL;
goto out;
}
@@ -594,7 +589,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
if (blocksize < NILFS_MIN_BLOCK_SIZE ||
blocksize > NILFS_MAX_BLOCK_SIZE) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"couldn't mount because of unsupported filesystem blocksize %d",
blocksize);
err = -EINVAL;
@@ -604,7 +599,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
if (blocksize < hw_blocksize) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"blocksize %d too small for device (sector-size = %d)",
blocksize, hw_blocksize);
err = -EINVAL;
diff --git a/fs/open.c b/fs/open.c
index c80e9f497e9b..9af548fb841b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -779,12 +779,6 @@ static int do_dentry_open(struct file *f,
return 0;
}
- /* Any file opened for execve()/uselib() has to be a regular file. */
- if (unlikely(f->f_flags & FMODE_EXEC && !S_ISREG(inode->i_mode))) {
- error = -EACCES;
- goto cleanup_file;
- }
-
if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
error = get_write_access(inode);
if (unlikely(error))
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a333caeca291..617db4e0faa0 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -551,8 +551,17 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
{
unsigned long totalpages = totalram_pages() + total_swap_pages;
unsigned long points = 0;
+ long badness;
+
+ badness = oom_badness(task, totalpages);
+ /*
+ * Special case OOM_SCORE_ADJ_MIN for all others scale the
+ * badness value into [0, 2000] range which we have been
+ * exporting for a long time so userspace might depend on it.
+ */
+ if (badness != LONG_MIN)
+ points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3;
- points = oom_badness(task, totalpages) * 1000 / totalpages;
seq_printf(m, "%lu\n", points);
return 0;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index dbda4499a859..5066b0251ed8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -786,7 +786,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
- SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
+ SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
mss->private_hugetlb >> 10, 7);
@@ -816,7 +816,7 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
- seq_printf(m, "THPeligible: %d\n",
+ seq_printf(m, "THPeligible: %d\n",
transparent_hugepage_enabled(vma));
if (arch_pkeys_enabled())
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 44b6845b071c..5b78719be445 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -314,9 +314,10 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
{
sigset_t mask;
- if (sizemask != sizeof(sigset_t) ||
- copy_from_user(&mask, user_mask, sizeof(mask)))
+ if (sizemask != sizeof(sigset_t))
return -EINVAL;
+ if (copy_from_user(&mask, user_mask, sizeof(mask)))
+ return -EFAULT;
return do_signalfd4(ufd, &mask, flags);
}
@@ -325,9 +326,10 @@ SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask,
{
sigset_t mask;
- if (sizemask != sizeof(sigset_t) ||
- copy_from_user(&mask, user_mask, sizeof(mask)))
+ if (sizemask != sizeof(sigset_t))
return -EINVAL;
+ if (copy_from_user(&mask, user_mask, sizeof(mask)))
+ return -EFAULT;
return do_signalfd4(ufd, &mask, 0);
}
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 1da0be667409..e3b69fb280e8 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -101,7 +101,7 @@ static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 gene
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
struct inode *inode;
- if (ino < UFS_ROOTINO || ino > uspi->s_ncg * uspi->s_ipg)
+ if (ino < UFS_ROOTINO || ino > (u64)uspi->s_ncg * uspi->s_ipg)
return ERR_PTR(-ESTALE);
inode = ufs_iget(sb, ino);
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 4df87546bd40..ae9aaf1f34bf 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -600,7 +600,7 @@ xfs_sb_quota_to_disk(
* disk. If neither are active, we should NULL the inode.
*
* In all cases, the separate pquotino must remain 0 because it
- * it beyond the "end" of the valid non-pquotino superblock.
+ * is beyond the "end" of the valid non-pquotino superblock.
*/
if (from->sb_qflags & XFS_GQUOTA_ACCT)
to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index e380bd1a9bfc..50f922cad91a 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -44,7 +44,7 @@ xfs_attr_shortform_compare(const void *a, const void *b)
/*
* Copy out entries of shortform attribute lists for attr_list().
* Shortform attribute lists are not stored in hashval sorted order.
- * If the output buffer is not large enough to hold them all, then we
+ * If the output buffer is not large enough to hold them all, then
* we have to calculate each entries' hashvalue and sort them before
* we can begin returning them to the user.
*/
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 5bb6f22cc11a..408d1b572d3f 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -127,7 +127,7 @@ xfs_buf_item_size_segment(
* stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
* in a single iovec.
*
- * Discontiguous buffers need a format structure per region that that is being
+ * Discontiguous buffers need a format structure per region that is being
* logged. This makes the changes in the buffer appear to log recovery as though
* they came from separate buffers, just like would occur if multiple buffers
* were used instead of a single discontiguous buffer. This enables
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index d480f11e6b00..8f0457d67d77 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -948,7 +948,7 @@ xlog_recover_buf_commit_pass2(
* or inode_cluster_size bytes, whichever is bigger. The inode
* buffers in the log can be a different size if the log was generated
* by an older kernel using unclustered inode buffers or a newer kernel
- * running with a different inode cluster size. Regardless, if the
+ * running with a different inode cluster size. Regardless, if
* the inode buffer size isn't max(blocksize, inode_cluster_size)
* for *our* value of inode_cluster_size, then we need to keep
* the buffer out of the buffer cache so that the buffer won't
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 04dc2be19c3a..bcd73b9c2994 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -807,7 +807,7 @@ xfs_qm_dqget_checks(
}
/*
- * Given the file system, id, and type (UDQUOT/GDQUOT), return a a locked
+ * Given the file system, id, and type (UDQUOT/GDQUOT), return a locked
* dquot, doing an allocation (if requested) as needed.
*/
int
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 5a4b0119143a..465fd9e048d4 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -56,7 +56,7 @@ xfs_fs_encode_fh(
fileid_type = FILEID_INO32_GEN_PARENT;
/*
- * If the the filesystem may contain 64bit inode numbers, we need
+ * If the filesystem may contain 64bit inode numbers, we need
* to use larger file handles that can represent them.
*
* While we only allocate inodes that do not fit into 32 bits any
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 407d6299606d..c06129cffba9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -451,7 +451,7 @@ xfs_lock_inodes(
/*
* Currently supports between 2 and 5 inodes with exclusive locking. We
* support an arbitrary depth of locking here, but absolute limits on
- * inodes depend on the the type of locking and the limits placed by
+ * inodes depend on the type of locking and the limits placed by
* lockdep annotations in xfs_lock_inumorder. These are all checked by
* the asserts.
*/
@@ -3105,7 +3105,7 @@ out_trans_abort:
/*
* xfs_rename_alloc_whiteout()
*
- * Return a referenced, unlinked, unlocked inode that that can be used as a
+ * Return a referenced, unlinked, unlocked inode that can be used as a
* whiteout in a rename transaction. We use a tmpfile inode here so that if we
* crash between allocating the inode and linking it into the rename transaction
* recovery will free the inode and we won't leak it.
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 895f61b2b4f0..6c65938cee1c 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -191,7 +191,7 @@ xfs_inode_item_format_data_fork(
ip->i_df.if_bytes > 0) {
/*
* Round i_bytes up to a word boundary.
- * The underlying memory is guaranteed to
+ * The underlying memory is guaranteed
* to be there by xfs_idata_realloc().
*/
data_bytes = roundup(ip->i_df.if_bytes, 4);
@@ -275,7 +275,7 @@ xfs_inode_item_format_attr_fork(
ip->i_afp->if_bytes > 0) {
/*
* Round i_bytes up to a word boundary.
- * The underlying memory is guaranteed to
+ * The underlying memory is guaranteed
* to be there by xfs_idata_realloc().
*/
data_bytes = roundup(ip->i_afp->if_bytes, 4);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 0e3f62cde375..3abb8b9d6f4c 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -865,7 +865,7 @@ xfs_buffered_write_iomap_begin(
}
/*
- * Search the data fork fork first to look up our source mapping. We
+ * Search the data fork first to look up our source mapping. We
* always need the data fork map, as we have to return it to the
* iomap code so that the higher level write code can read data in to
* perform read-modify-write cycles for unaligned writes.
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 56c32eecffea..b0ef071b3cb5 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -239,7 +239,7 @@ xfs_cil_prepare_item(
* this CIL context and so we need to pin it. If we are replacing the
* old_lv, then remove the space it accounts for and make it the shadow
* buffer for later freeing. In both cases we are now switching to the
- * shadow buffer, so update the the pointer to it appropriately.
+ * shadow buffer, so update the pointer to it appropriately.
*/
if (!old_lv) {
if (lv->lv_item->li_ops->iop_pin)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 52a65a74208f..e2ec91b2d0f4 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1100,7 +1100,7 @@ xlog_verify_head(
*
* Note that xlog_find_tail() clears the blocks at the new head
* (i.e., the records with invalid CRC) if the cycle number
- * matches the the current cycle.
+ * matches the current cycle.
*/
found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
buffer, rhead_blk, rhead, wrapped);
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 7b2c72bc2858..ca93b6488377 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -485,7 +485,7 @@ xfs_cui_item_recover(
* transaction. Normally, any work that needs to be deferred
* gets attached to the same defer_ops that scheduled the
* refcount update. However, we're in log recovery here, so we
- * we use the passed in defer_ops and to finish up any work that
+ * use the passed in defer_ops and to finish up any work that
* doesn't fit. We need to reserve enough blocks to handle a
* full btree split on either end of the refcount range.
*/
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index aac83f9d6107..16098dc42add 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -721,7 +721,7 @@ xfs_reflink_end_cow(
* repeatedly cycles the ILOCK to allocate one transaction per remapped
* extent.
*
- * If we're being called by writeback then the the pages will still
+ * If we're being called by writeback then the pages will still
* have PageWriteback set, which prevents races with reflink remapping
* and truncate. Reflink remapping prevents races with writeback by
* taking the iolock and mmaplock before flushing the pages and
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index e9f810fc6731..43585850f154 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -32,9 +32,11 @@ xfs_sysfs_init(
struct xfs_kobj *parent_kobj,
const char *name)
{
+ struct kobject *parent;
+
+ parent = parent_kobj ? &parent_kobj->kobject : NULL;
init_completion(&kobj->complete);
- return kobject_init_and_add(&kobj->kobject, ktype,
- &parent_kobj->kobject, "%s", name);
+ return kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name);
}
static inline void
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 0c783d339675..dbb69b4bf3ed 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -480,7 +480,7 @@ xfsaild_push(
* inode buffer is locked because we already pushed the
* updates to it as part of inode clustering.
*
- * We do not want to to stop flushing just because lots
+ * We do not want to stop flushing just because lots
* of items are already being flushed, but we need to
* re-try the flushing relatively soon if most of the
* AIL is being flushed.
@@ -515,7 +515,7 @@ xfsaild_push(
/*
* Are there too many items we can't do anything with?
*
- * If we we are skipping too many items because we can't flush
+ * If we are skipping too many items because we can't flush
* them or they are already being flushed, we back off and
* given them time to complete whatever operation is being
* done. i.e. remove pressure from the AIL while we can't make