summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/binfmt_flat.c20
-rw-r--r--fs/btrfs/ctree.c2
-rw-r--r--fs/btrfs/ctree.h6
-rw-r--r--fs/btrfs/disk-io.c1
-rw-r--r--fs/btrfs/extent-tree.c17
-rw-r--r--fs/btrfs/file.c2
-rw-r--r--fs/btrfs/free-space-cache.c2
-rw-r--r--fs/btrfs/inode.c29
-rw-r--r--fs/btrfs/super.c1
-rw-r--r--fs/btrfs/tree-log.c10
-rw-r--r--fs/ceph/caps.c14
-rw-r--r--fs/ceph/debugfs.c4
-rw-r--r--fs/ceph/dir.c31
-rw-r--r--fs/ceph/file.c5
-rw-r--r--fs/ceph/inode.c19
-rw-r--r--fs/ceph/mds_client.h2
-rw-r--r--fs/ceph/quota.c4
-rw-r--r--fs/ceph/super.h73
-rw-r--r--fs/ext4/inode.c2
-rw-r--r--fs/fs-writeback.c103
-rw-r--r--fs/gfs2/log.c31
-rw-r--r--fs/gfs2/trans.c1
-rw-r--r--fs/io-wq.c21
-rw-r--r--fs/io_uring.c99
-rw-r--r--fs/nfsd/nfs4state.c2
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c4
26 files changed, 292 insertions, 213 deletions
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index f2f9086ebe98..b9c658e0548e 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -576,7 +576,7 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
- len = data_len + extra;
+ len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
len = PAGE_ALIGN(len);
realdatastart = vm_mmap(NULL, 0, len,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
@@ -590,7 +590,9 @@ static int load_flat_file(struct linux_binprm *bprm,
vm_munmap(textpos, text_len);
goto err;
}
- datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN);
+ datapos = ALIGN(realdatastart +
+ MAX_SHARED_LIBS * sizeof(unsigned long),
+ FLAT_DATA_ALIGN);
pr_debug("Allocated data+bss+stack (%u bytes): %lx\n",
data_len + bss_len + stack_len, datapos);
@@ -620,7 +622,7 @@ static int load_flat_file(struct linux_binprm *bprm,
memp_size = len;
} else {
- len = text_len + data_len + extra;
+ len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(u32);
len = PAGE_ALIGN(len);
textpos = vm_mmap(NULL, 0, len,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
@@ -635,7 +637,9 @@ static int load_flat_file(struct linux_binprm *bprm,
}
realdatastart = textpos + ntohl(hdr->data_start);
- datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN);
+ datapos = ALIGN(realdatastart +
+ MAX_SHARED_LIBS * sizeof(u32),
+ FLAT_DATA_ALIGN);
reloc = (__be32 __user *)
(datapos + (ntohl(hdr->reloc_start) - text_len));
@@ -652,9 +656,8 @@ static int load_flat_file(struct linux_binprm *bprm,
(text_len + full_data
- sizeof(struct flat_hdr)),
0);
- if (datapos != realdatastart)
- memmove((void *)datapos, (void *)realdatastart,
- full_data);
+ memmove((void *) datapos, (void *) realdatastart,
+ full_data);
#else
/*
* This is used on MMU systems mainly for testing.
@@ -710,7 +713,8 @@ static int load_flat_file(struct linux_binprm *bprm,
if (IS_ERR_VALUE(result)) {
ret = result;
pr_err("Unable to read code+data+bss, errno %d\n", ret);
- vm_munmap(textpos, text_len + data_len + extra);
+ vm_munmap(textpos, text_len + data_len + extra +
+ MAX_SHARED_LIBS * sizeof(u32));
goto err;
}
}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 70e49d8d4f6c..cd1cd673bc0b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -68,7 +68,7 @@ const char *btrfs_super_csum_driver(u16 csum_type)
btrfs_csums[csum_type].name;
}
-size_t __const btrfs_get_num_csums(void)
+size_t __attribute_const__ btrfs_get_num_csums(void)
{
return ARRAY_SIZE(btrfs_csums);
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9c7e466f27a9..9a72896bed2e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2262,7 +2262,7 @@ BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
const char *btrfs_super_csum_driver(u16 csum_type);
-size_t __const btrfs_get_num_csums(void);
+size_t __attribute_const__ btrfs_get_num_csums(void);
/*
@@ -2518,7 +2518,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
- u64 objectid, u64 offset, u64 bytenr);
+ u64 objectid, u64 offset, u64 bytenr, bool strict);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
@@ -2934,7 +2934,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes);
+ u64 *ram_bytes, bool strict);
void __btrfs_del_delalloc_inode(struct btrfs_root *root,
struct btrfs_inode *inode);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9ae25f632157..f6bba7eb1fa1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4551,6 +4551,7 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
cache->io_ctl.inode = NULL;
iput(inode);
}
+ ASSERT(cache->io_ctl.pages == NULL);
btrfs_put_block_group(cache);
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index de6fe176fdfb..5871ef78edba 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2306,7 +2306,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
static noinline int check_committed_ref(struct btrfs_root *root,
struct btrfs_path *path,
- u64 objectid, u64 offset, u64 bytenr)
+ u64 objectid, u64 offset, u64 bytenr,
+ bool strict)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
@@ -2348,9 +2349,13 @@ static noinline int check_committed_ref(struct btrfs_root *root,
btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
goto out;
- /* If extent created before last snapshot => it's definitely shared */
- if (btrfs_extent_generation(leaf, ei) <=
- btrfs_root_last_snapshot(&root->root_item))
+ /*
+ * If extent created before last snapshot => it's shared unless the
+ * snapshot has been deleted. Use the heuristic if strict is false.
+ */
+ if (!strict &&
+ (btrfs_extent_generation(leaf, ei) <=
+ btrfs_root_last_snapshot(&root->root_item)))
goto out;
iref = (struct btrfs_extent_inline_ref *)(ei + 1);
@@ -2375,7 +2380,7 @@ out:
}
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
- u64 bytenr)
+ u64 bytenr, bool strict)
{
struct btrfs_path *path;
int ret;
@@ -2386,7 +2391,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
do {
ret = check_committed_ref(root, path, objectid,
- offset, bytenr);
+ offset, bytenr, strict);
if (ret && ret != -ENOENT)
goto out;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index bb824c7cb7c7..4507c3d09399 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1571,7 +1571,7 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
}
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, false);
if (ret <= 0) {
ret = 0;
if (!nowait)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ef0fd7afb0b1..dc82fd0c80cb 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1186,7 +1186,6 @@ static int __btrfs_wait_cache_io(struct btrfs_root *root,
ret = update_cache_item(trans, root, inode, path, offset,
io_ctl->entries, io_ctl->bitmaps);
out:
- io_ctl_free(io_ctl);
if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
@@ -1347,6 +1346,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* them out later
*/
io_ctl_drop_pages(io_ctl);
+ io_ctl_free(io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
i_size_read(inode) - 1, &cached_state);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 51fcd82d41c0..9570458aa847 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1610,7 +1610,7 @@ next_slot:
goto out_check;
ret = btrfs_cross_ref_exist(root, ino,
found_key.offset -
- extent_offset, disk_bytenr);
+ extent_offset, disk_bytenr, false);
if (ret) {
/*
* ret could be -EIO if the above fails to read
@@ -2161,11 +2161,8 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
u64 bio_offset)
{
struct inode *inode = private_data;
- blk_status_t ret = 0;
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
- BUG_ON(ret); /* -ENOMEM */
- return 0;
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
}
/*
@@ -6953,6 +6950,8 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
* @orig_start: (optional) Return the original file offset of the file extent
* @orig_len: (optional) Return the original on-disk length of the file extent
* @ram_bytes: (optional) Return the ram_bytes of the file extent
+ * @strict: if true, omit optimizations that might force us into unnecessary
+ * cow. e.g., don't trust generation number.
*
* This function will flush ordered extents in the range to ensure proper
* nocow checks for (nowait == false) case.
@@ -6967,7 +6966,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
*/
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes)
+ u64 *ram_bytes, bool strict)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_path *path;
@@ -7045,8 +7044,9 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
* Do the same check as in btrfs_cross_ref_exist but without the
* unnecessary search.
*/
- if (btrfs_file_extent_generation(leaf, fi) <=
- btrfs_root_last_snapshot(&root->root_item))
+ if (!strict &&
+ (btrfs_file_extent_generation(leaf, fi) <=
+ btrfs_root_last_snapshot(&root->root_item)))
goto out;
backref_offset = btrfs_file_extent_offset(leaf, fi);
@@ -7082,7 +7082,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
*/
ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
- key.offset - backref_offset, disk_bytenr);
+ key.offset - backref_offset, disk_bytenr,
+ strict);
if (ret) {
ret = 0;
goto out;
@@ -7303,7 +7304,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
block_start = em->block_start + (start - em->start);
if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes) == 1 &&
+ &orig_block_len, &ram_bytes, false) == 1 &&
btrfs_inc_nocow_writers(fs_info, block_start)) {
struct extent_map *em2;
@@ -7619,10 +7620,8 @@ static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
struct bio *bio, u64 offset)
{
struct inode *inode = private_data;
- blk_status_t ret;
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1);
- BUG_ON(ret); /* -ENOMEM */
- return 0;
+
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1);
}
static void btrfs_end_dio_bio(struct bio *bio)
@@ -10136,7 +10135,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
free_extent_map(em);
em = NULL;
- ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL);
+ ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true);
if (ret < 0) {
goto out;
} else if (ret) {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index e529ddb35b87..25967ecaaf0a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -625,6 +625,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
} else if (strncmp(args[0].from, "lzo", 3) == 0) {
compress_type = "lzo";
info->compress_type = BTRFS_COMPRESS_LZO;
+ info->compress_level = 0;
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 696dd861cc3c..39da9db35278 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3449,11 +3449,13 @@ fail:
btrfs_free_path(path);
out_unlock:
mutex_unlock(&dir->log_mutex);
- if (ret == -ENOSPC) {
+ if (err == -ENOSPC) {
btrfs_set_log_full_commit(trans);
- ret = 0;
- } else if (ret < 0)
- btrfs_abort_transaction(trans, ret);
+ err = 0;
+ } else if (err < 0 && err != -ENOENT) {
+ /* ENOENT can be returned if the entry hasn't been fsynced yet */
+ btrfs_abort_transaction(trans, err);
+ }
btrfs_end_log_trans(root);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 55ccccf77cea..034b3f4fdd3a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
int have = ci->i_snap_caps;
if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
- " (mask %s)\n", ci->vfs_inode.i_ino,
+ dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode),
ceph_cap_string(have),
ceph_cap_string(mask));
return 1;
@@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
if (!__cap_is_valid(cap))
continue;
if ((cap->issued & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
- " (mask %s)\n", ci->vfs_inode.i_ino, cap,
+ dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap,
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch)
@@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
/* does a combination of caps satisfy mask? */
have |= cap->issued;
if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
- " (mask %s)\n", ci->vfs_inode.i_ino,
+ dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode),
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch) {
@@ -2872,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
struct cap_wait cw;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
- cw.ino = inode->i_ino;
+ cw.ino = ceph_ino(inode);
cw.tgid = current->tgid;
cw.need = need;
cw.want = want;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 97539b497e4c..3e3fcda9b276 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -202,7 +202,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
{
struct seq_file *s = p;
- seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
+ seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode),
ceph_cap_string(cap->issued),
ceph_cap_string(cap->implemented));
return 0;
@@ -247,7 +247,7 @@ static int caps_show(struct seq_file *s, void *p)
spin_lock(&mdsc->caps_list_lock);
list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
- seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
+ seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
ceph_cap_string(cw->need),
ceph_cap_string(cw->want));
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 9ff9d10a60ff..d72e4a12bb69 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
dentry, dentry, d_inode(dentry));
ctx->pos = di->offset;
if (!dir_emit(ctx, dentry->d_name.name,
- dentry->d_name.len,
- ceph_translate_ino(dentry->d_sb,
- d_inode(dentry)->i_ino),
+ dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
d_inode(dentry)->i_mode >> 12)) {
dput(dentry);
err = 0;
@@ -324,18 +322,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
/* always start with . and .. */
if (ctx->pos == 0) {
dout("readdir off 0 -> '.'\n");
- if (!dir_emit(ctx, ".", 1,
- ceph_translate_ino(inode->i_sb, inode->i_ino),
+ if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
inode->i_mode >> 12))
return 0;
ctx->pos = 1;
}
if (ctx->pos == 1) {
- ino_t ino = parent_ino(file->f_path.dentry);
+ u64 ino;
+ struct dentry *dentry = file->f_path.dentry;
+
+ spin_lock(&dentry->d_lock);
+ ino = ceph_present_inode(dentry->d_parent->d_inode);
+ spin_unlock(&dentry->d_lock);
+
dout("readdir off 1 -> '..'\n");
- if (!dir_emit(ctx, "..", 2,
- ceph_translate_ino(inode->i_sb, ino),
- inode->i_mode >> 12))
+ if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
return 0;
ctx->pos = 2;
}
@@ -507,9 +508,6 @@ more:
}
for (; i < rinfo->dir_nr; i++) {
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
- struct ceph_vino vino;
- ino_t ino;
- u32 ftype;
BUG_ON(rde->offset < ctx->pos);
@@ -519,13 +517,10 @@ more:
rde->name_len, rde->name, &rde->inode.in);
BUG_ON(!rde->inode.in);
- ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
- vino.ino = le64_to_cpu(rde->inode.in->ino);
- vino.snap = le64_to_cpu(rde->inode.in->snapid);
- ino = ceph_vino_to_ino(vino);
if (!dir_emit(ctx, rde->name, rde->name_len,
- ceph_translate_ino(inode->i_sb, ino), ftype)) {
+ ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
+ le32_to_cpu(rde->inode.in->mode) >> 12)) {
dout("filldir stopping us...\n");
return 0;
}
@@ -1161,7 +1156,7 @@ retry:
if (try_async && op == CEPH_MDS_OP_UNLINK &&
(req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
- dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
+ dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
dentry->d_name.len, dentry->d_name.name,
ceph_cap_string(req->r_dir_caps));
set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 30cd00265181..3f4c993dfc6f 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -630,8 +630,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
} else {
struct dentry *dn;
- dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
- vino.ino, dir->i_ino, dentry->d_name.name);
+ dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
+ vino.ino, ceph_ino(dir), dentry->d_name.name);
ceph_dir_clear_ordered(dir);
ceph_init_inode_acls(inode, as_ctx);
if (inode->i_state & I_NEW) {
@@ -2507,6 +2507,7 @@ const struct file_operations ceph_file_fops = {
.mmap = ceph_mmap,
.fsync = ceph_fsync,
.lock = ceph_lock,
+ .setlease = simple_nosetlease,
.flock = ceph_flock,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 357c937699d5..d163fa96cb40 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work);
*/
static int ceph_set_ino_cb(struct inode *inode, void *data)
{
- ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
- inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ ci->i_vino = *(struct ceph_vino *)data;
+ inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
inode_set_iversion_raw(inode, 0);
return 0;
}
@@ -50,17 +52,14 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
{
struct inode *inode;
- ino_t t = ceph_vino_to_ino(vino);
- inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
+ inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
+ ceph_set_ino_cb, &vino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW)
- dout("get_inode created new inode %p %llx.%llx ino %llx\n",
- inode, ceph_vinop(inode), (u64)inode->i_ino);
- dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
- vino.snap, inode);
+ dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
+ ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
return inode;
}
@@ -2378,7 +2377,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
}
generic_fillattr(inode, stat);
- stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
+ stat->ino = ceph_present_inode(inode);
/*
* btime on newly-allocated inodes is 0, so if this is still set to
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index bc9e95937d7c..658800605bfb 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -372,7 +372,7 @@ struct ceph_quotarealm_inode {
struct cap_wait {
struct list_head list;
- unsigned long ino;
+ u64 ino;
pid_t tgid;
int need;
int want;
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 198ddde5c1e6..cc2c4d40b022 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct super_block *sb = mdsc->fsc->sb;
+ struct inode *root = d_inode(sb->s_root);
if (atomic64_read(&mdsc->quotarealms_count) > 0)
return true;
/* if root is the real CephFS root, we don't have quota realms */
- if (sb->s_root->d_inode &&
- (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
+ if (root && ceph_ino(root) == CEPH_INO_ROOT)
return false;
/* otherwise, we can't know for sure */
return true;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 4c3c964b1c54..a3995ebe0623 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -457,15 +457,7 @@ ceph_vino(const struct inode *inode)
return ceph_inode(inode)->i_vino;
}
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- * i_ino (kernel inode) st_ino (userspace)
- * i386 32 32
- * x86_64+ino32 64 32
- * x86_64 64 64
- */
-static inline u32 ceph_ino_to_ino32(__u64 vino)
+static inline u32 ceph_ino_to_ino32(u64 vino)
{
u32 ino = vino & 0xffffffff;
ino ^= vino >> 32;
@@ -475,34 +467,17 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
}
/*
- * kernel i_ino value
+ * Inode numbers in cephfs are 64 bits, but inode->i_ino is 32-bits on
+ * some arches. We generally do not use this value inside the ceph driver, but
+ * we do want to set it to something, so that generic vfs code has an
+ * appropriate value for tracepoints and the like.
*/
-static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+static inline ino_t ceph_vino_to_ino_t(struct ceph_vino vino)
{
-#if BITS_PER_LONG == 32
- return ceph_ino_to_ino32(vino.ino);
-#else
+ if (sizeof(ino_t) == sizeof(u32))
+ return ceph_ino_to_ino32(vino.ino);
return (ino_t)vino.ino;
-#endif
-}
-
-/*
- * user-visible ino (stat, filldir)
- */
-#if BITS_PER_LONG == 32
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
- return ino;
-}
-#else
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
- if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
- ino = ceph_ino_to_ino32(ino);
- return ino;
}
-#endif
-
/* for printf-style formatting */
#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
@@ -511,11 +486,34 @@ static inline u64 ceph_ino(struct inode *inode)
{
return ceph_inode(inode)->i_vino.ino;
}
+
static inline u64 ceph_snap(struct inode *inode)
{
return ceph_inode(inode)->i_vino.snap;
}
+/**
+ * ceph_present_ino - format an inode number for presentation to userland
+ * @sb: superblock where the inode lives
+ * @ino: inode number to (possibly) convert
+ *
+ * If the user mounted with the ino32 option, then the 64-bit value needs
+ * to be converted to something that can fit inside 32 bits. Note that
+ * internal kernel code never uses this value, so this is entirely for
+ * userland consumption.
+ */
+static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
+{
+ if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
+ return ceph_ino_to_ino32(ino);
+ return ino;
+}
+
+static inline u64 ceph_present_inode(struct inode *inode)
+{
+ return ceph_present_ino(inode->i_sb, ceph_ino(inode));
+}
+
static inline int ceph_ino_compare(struct inode *inode, void *data)
{
struct ceph_vino *pvino = (struct ceph_vino *)data;
@@ -524,11 +522,16 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
ci->i_vino.snap == pvino->snap;
}
+
static inline struct inode *ceph_find_inode(struct super_block *sb,
struct ceph_vino vino)
{
- ino_t t = ceph_vino_to_ino(vino);
- return ilookup5(sb, t, ceph_ino_compare, &vino);
+ /*
+ * NB: The hashval will be run through the fs/inode.c hash function
+ * anyway, so there is no need to squash the inode number down to
+ * 32-bits first. Just use low-order bits on arches with 32-bit long.
+ */
+ return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3a196d81f594..bf596467c234 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4901,7 +4901,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb,
(inode->i_state & I_DIRTY_TIME)) {
struct ext4_inode_info *ei = EXT4_I(inode);
- inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
+ inode->i_state &= ~I_DIRTY_TIME;
spin_unlock(&inode->i_lock);
spin_lock(&ei->i_raw_lock);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a605c3dddabc..149227160ff0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -42,7 +42,6 @@
struct wb_writeback_work {
long nr_pages;
struct super_block *sb;
- unsigned long *older_than_this;
enum writeback_sync_modes sync_mode;
unsigned int tagged_writepages:1;
unsigned int for_kupdate:1;
@@ -144,7 +143,9 @@ static void inode_io_list_del_locked(struct inode *inode,
struct bdi_writeback *wb)
{
assert_spin_locked(&wb->list_lock);
+ assert_spin_locked(&inode->i_lock);
+ inode->i_state &= ~I_SYNC_QUEUED;
list_del_init(&inode->i_io_list);
wb_io_lists_depopulated(wb);
}
@@ -1122,7 +1123,9 @@ void inode_io_list_del(struct inode *inode)
struct bdi_writeback *wb;
wb = inode_to_wb_and_lock_list(inode);
+ spin_lock(&inode->i_lock);
inode_io_list_del_locked(inode, wb);
+ spin_unlock(&inode->i_lock);
spin_unlock(&wb->list_lock);
}
EXPORT_SYMBOL(inode_io_list_del);
@@ -1172,8 +1175,10 @@ void sb_clear_inode_writeback(struct inode *inode)
* the case then the inode must have been redirtied while it was being written
* out and we don't reset its dirtied_when.
*/
-static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
+static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
{
+ assert_spin_locked(&inode->i_lock);
+
if (!list_empty(&wb->b_dirty)) {
struct inode *tail;
@@ -1182,6 +1187,14 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
inode->dirtied_when = jiffies;
}
inode_io_list_move_locked(inode, wb, &wb->b_dirty);
+ inode->i_state &= ~I_SYNC_QUEUED;
+}
+
+static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
+{
+ spin_lock(&inode->i_lock);
+ redirty_tail_locked(inode, wb);
+ spin_unlock(&inode->i_lock);
}
/*
@@ -1220,16 +1233,13 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
#define EXPIRE_DIRTY_ATIME 0x0001
/*
- * Move expired (dirtied before work->older_than_this) dirty inodes from
+ * Move expired (dirtied before dirtied_before) dirty inodes from
* @delaying_queue to @dispatch_queue.
*/
static int move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue,
- int flags,
- struct wb_writeback_work *work)
+ unsigned long dirtied_before)
{
- unsigned long *older_than_this = NULL;
- unsigned long expire_time;
LIST_HEAD(tmp);
struct list_head *pos, *node;
struct super_block *sb = NULL;
@@ -1237,21 +1247,15 @@ static int move_expired_inodes(struct list_head *delaying_queue,
int do_sb_sort = 0;
int moved = 0;
- if ((flags & EXPIRE_DIRTY_ATIME) == 0)
- older_than_this = work->older_than_this;
- else if (!work->for_sync) {
- expire_time = jiffies - (dirtytime_expire_interval * HZ);
- older_than_this = &expire_time;
- }
while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev);
- if (older_than_this &&
- inode_dirtied_after(inode, *older_than_this))
+ if (inode_dirtied_after(inode, dirtied_before))
break;
list_move(&inode->i_io_list, &tmp);
moved++;
- if (flags & EXPIRE_DIRTY_ATIME)
- set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
+ spin_lock(&inode->i_lock);
+ inode->i_state |= I_SYNC_QUEUED;
+ spin_unlock(&inode->i_lock);
if (sb_is_blkdev_sb(inode->i_sb))
continue;
if (sb && sb != inode->i_sb)
@@ -1289,18 +1293,22 @@ out:
* |
* +--> dequeue for IO
*/
-static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
+static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
+ unsigned long dirtied_before)
{
int moved;
+ unsigned long time_expire_jif = dirtied_before;
assert_spin_locked(&wb->list_lock);
list_splice_init(&wb->b_more_io, &wb->b_io);
- moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
+ moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before);
+ if (!work->for_sync)
+ time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
- EXPIRE_DIRTY_ATIME, work);
+ time_expire_jif);
if (moved)
wb_io_lists_populated(wb);
- trace_writeback_queue_io(wb, work, moved);
+ trace_writeback_queue_io(wb, work, dirtied_before, moved);
}
static int write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -1394,7 +1402,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* writeback is not making progress due to locked
* buffers. Skip this inode for now.
*/
- redirty_tail(inode, wb);
+ redirty_tail_locked(inode, wb);
return;
}
@@ -1414,7 +1422,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* retrying writeback of the dirty page/inode
* that cannot be performed immediately.
*/
- redirty_tail(inode, wb);
+ redirty_tail_locked(inode, wb);
}
} else if (inode->i_state & I_DIRTY) {
/*
@@ -1422,10 +1430,11 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* such as delayed allocation during submission or metadata
* updates after data IO completion.
*/
- redirty_tail(inode, wb);
+ redirty_tail_locked(inode, wb);
} else if (inode->i_state & I_DIRTY_TIME) {
inode->dirtied_when = jiffies;
inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
+ inode->i_state &= ~I_SYNC_QUEUED;
} else {
/* The inode is clean. Remove from writeback lists. */
inode_io_list_del_locked(inode, wb);
@@ -1472,18 +1481,14 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
spin_lock(&inode->i_lock);
dirty = inode->i_state & I_DIRTY;
- if (inode->i_state & I_DIRTY_TIME) {
- if ((dirty & I_DIRTY_INODE) ||
- wbc->sync_mode == WB_SYNC_ALL ||
- unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
- unlikely(time_after(jiffies,
- (inode->dirtied_time_when +
- dirtytime_expire_interval * HZ)))) {
- dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
- trace_writeback_lazytime(inode);
- }
- } else
- inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
+ if ((inode->i_state & I_DIRTY_TIME) &&
+ ((dirty & I_DIRTY_INODE) ||
+ wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync ||
+ time_after(jiffies, inode->dirtied_time_when +
+ dirtytime_expire_interval * HZ))) {
+ dirty |= I_DIRTY_TIME;
+ trace_writeback_lazytime(inode);
+ }
inode->i_state &= ~dirty;
/*
@@ -1669,8 +1674,8 @@ static long writeback_sb_inodes(struct super_block *sb,
*/
spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+ redirty_tail_locked(inode, wb);
spin_unlock(&inode->i_lock);
- redirty_tail(inode, wb);
continue;
}
if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
@@ -1811,7 +1816,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
blk_start_plug(&plug);
spin_lock(&wb->list_lock);
if (list_empty(&wb->b_io))
- queue_io(wb, &work);
+ queue_io(wb, &work, jiffies);
__writeback_inodes_wb(wb, &work);
spin_unlock(&wb->list_lock);
blk_finish_plug(&plug);
@@ -1831,7 +1836,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
* takes longer than a dirty_writeback_interval interval, then leave a
* one-second gap.
*
- * older_than_this takes precedence over nr_to_write. So we'll only write back
+ * dirtied_before takes precedence over nr_to_write. So we'll only write back
* all dirty pages if they are all attached to "old" mappings.
*/
static long wb_writeback(struct bdi_writeback *wb,
@@ -1839,14 +1844,11 @@ static long wb_writeback(struct bdi_writeback *wb,
{
unsigned long wb_start = jiffies;
long nr_pages = work->nr_pages;
- unsigned long oldest_jif;
+ unsigned long dirtied_before = jiffies;
struct inode *inode;
long progress;
struct blk_plug plug;
- oldest_jif = jiffies;
- work->older_than_this = &oldest_jif;
-
blk_start_plug(&plug);
spin_lock(&wb->list_lock);
for (;;) {
@@ -1880,14 +1882,14 @@ static long wb_writeback(struct bdi_writeback *wb,
* safe.
*/
if (work->for_kupdate) {
- oldest_jif = jiffies -
+ dirtied_before = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10);
} else if (work->for_background)
- oldest_jif = jiffies;
+ dirtied_before = jiffies;
trace_writeback_start(wb, work);
if (list_empty(&wb->b_io))
- queue_io(wb, work);
+ queue_io(wb, work, dirtied_before);
if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work);
else
@@ -2289,11 +2291,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
inode->i_state |= flags;
/*
- * If the inode is being synced, just update its dirty state.
- * The unlocker will place the inode on the appropriate
- * superblock list, based upon its state.
+ * If the inode is queued for writeback by flush worker, just
+ * update its dirty state. Once the flush worker is done with
+ * the inode it will place it on the appropriate superblock
+ * list, based upon its state.
*/
- if (inode->i_state & I_SYNC)
+ if (inode->i_state & I_SYNC_QUEUED)
goto out_unlock_inode;
/*
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index a58333e3980d..3763c9ff1406 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -902,6 +902,36 @@ static void empty_ail1_list(struct gfs2_sbd *sdp)
}
/**
+ * drain_bd - drain the buf and databuf queue for a failed transaction
+ * @tr: the transaction to drain
+ *
+ * When this is called, we're taking an error exit for a log write that failed
+ * but since we bypassed the after_commit functions, we need to remove the
+ * items from the buf and databuf queue.
+ */
+static void trans_drain(struct gfs2_trans *tr)
+{
+ struct gfs2_bufdata *bd;
+ struct list_head *head;
+
+ if (!tr)
+ return;
+
+ head = &tr->tr_buf;
+ while (!list_empty(head)) {
+ bd = list_first_entry(head, struct gfs2_bufdata, bd_list);
+ list_del_init(&bd->bd_list);
+ kmem_cache_free(gfs2_bufdata_cachep, bd);
+ }
+ head = &tr->tr_databuf;
+ while (!list_empty(head)) {
+ bd = list_first_entry(head, struct gfs2_bufdata, bd_list);
+ list_del_init(&bd->bd_list);
+ kmem_cache_free(gfs2_bufdata_cachep, bd);
+ }
+}
+
+/**
* gfs2_log_flush - flush incore transaction(s)
* @sdp: the filesystem
* @gl: The glock structure to flush. If NULL, flush the whole incore log
@@ -1005,6 +1035,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
out:
if (gfs2_withdrawn(sdp)) {
+ trans_drain(tr);
/**
* If the tr_list is empty, we're withdrawing during a log
* flush that targets a transaction, but the transaction was
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index e1c7eb6eb00a..6d4bf7ea7b3b 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -67,6 +67,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
tr->tr_reserved += gfs2_struct2blk(sdp, revokes);
INIT_LIST_HEAD(&tr->tr_databuf);
INIT_LIST_HEAD(&tr->tr_buf);
+ INIT_LIST_HEAD(&tr->tr_list);
INIT_LIST_HEAD(&tr->tr_ail1_list);
INIT_LIST_HEAD(&tr->tr_ail2_list);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index e92c4724480c..414beb543883 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -925,6 +925,24 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
return match->nr_running && !match->cancel_all;
}
+static inline void io_wqe_remove_pending(struct io_wqe *wqe,
+ struct io_wq_work *work,
+ struct io_wq_work_node *prev)
+{
+ unsigned int hash = io_get_work_hash(work);
+ struct io_wq_work *prev_work = NULL;
+
+ if (io_wq_is_hashed(work) && work == wqe->hash_tail[hash]) {
+ if (prev)
+ prev_work = container_of(prev, struct io_wq_work, list);
+ if (prev_work && io_get_work_hash(prev_work) == hash)
+ wqe->hash_tail[hash] = prev_work;
+ else
+ wqe->hash_tail[hash] = NULL;
+ }
+ wq_list_del(&wqe->work_list, &work->list, prev);
+}
+
static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
struct io_cb_cancel_data *match)
{
@@ -938,8 +956,7 @@ retry:
work = container_of(node, struct io_wq_work, list);
if (!match->fn(work, match->data))
continue;
-
- wq_list_del(&wqe->work_list, node, prev);
+ io_wqe_remove_pending(wqe, work, prev);
spin_unlock_irqrestore(&wqe->lock, flags);
io_run_cancel(work, wqe);
match->nr_pending++;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8a53af8e5fe2..ce69bd9b0838 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1150,7 +1150,7 @@ static void io_prep_async_work(struct io_kiocb *req)
io_req_init_async(req);
if (req->flags & REQ_F_ISREG) {
- if (def->hash_reg_file)
+ if (def->hash_reg_file || (req->ctx->flags & IORING_SETUP_IOPOLL))
io_wq_hash_work(&req->work, file_inode(req->file));
} else {
if (def->unbound_nonreg_file)
@@ -1746,7 +1746,8 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
return __io_req_find_next(req);
}
-static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
+static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb,
+ bool twa_signal_ok)
{
struct task_struct *tsk = req->task;
struct io_ring_ctx *ctx = req->ctx;
@@ -1759,7 +1760,7 @@ static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
* will do the job.
*/
notify = 0;
- if (!(ctx->flags & IORING_SETUP_SQPOLL))
+ if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok)
notify = TWA_SIGNAL;
ret = task_work_add(tsk, cb, notify);
@@ -1819,7 +1820,7 @@ static void io_req_task_queue(struct io_kiocb *req)
init_task_work(&req->task_work, io_req_task_submit);
percpu_ref_get(&req->ctx->refs);
- ret = io_req_task_work_add(req, &req->task_work);
+ ret = io_req_task_work_add(req, &req->task_work, true);
if (unlikely(ret)) {
struct task_struct *tsk;
@@ -2048,6 +2049,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
req = list_first_entry(done, struct io_kiocb, inflight_entry);
if (READ_ONCE(req->result) == -EAGAIN) {
+ req->result = 0;
req->iopoll_completed = 0;
list_move_tail(&req->inflight_entry, &again);
continue;
@@ -2293,22 +2295,6 @@ end_req:
io_req_complete(req, ret);
return false;
}
-
-static void io_rw_resubmit(struct callback_head *cb)
-{
- struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
- struct io_ring_ctx *ctx = req->ctx;
- int err;
-
- err = io_sq_thread_acquire_mm(ctx, req);
-
- if (io_resubmit_prep(req, err)) {
- refcount_inc(&req->refs);
- io_queue_async_work(req);
- }
-
- percpu_ref_put(&ctx->refs);
-}
#endif
static bool io_rw_reissue(struct io_kiocb *req, long res)
@@ -2319,12 +2305,14 @@ static bool io_rw_reissue(struct io_kiocb *req, long res)
if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
return false;
- init_task_work(&req->task_work, io_rw_resubmit);
- percpu_ref_get(&req->ctx->refs);
+ ret = io_sq_thread_acquire_mm(req->ctx, req);
- ret = io_req_task_work_add(req, &req->task_work);
- if (!ret)
+ if (io_resubmit_prep(req, ret)) {
+ refcount_inc(&req->refs);
+ io_queue_async_work(req);
return true;
+ }
+
#endif
return false;
}
@@ -2865,6 +2853,11 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
return iov_iter_count(&req->io->rw.iter);
}
+static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
+{
+ return kiocb->ki_filp->f_mode & FMODE_STREAM ? NULL : &kiocb->ki_pos;
+}
+
/*
* For files that don't have ->read_iter() and ->write_iter(), handle them
* by looping over ->read() or ->write() manually.
@@ -2900,10 +2893,10 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
if (rw == READ) {
nr = file->f_op->read(file, iovec.iov_base,
- iovec.iov_len, &kiocb->ki_pos);
+ iovec.iov_len, io_kiocb_ppos(kiocb));
} else {
nr = file->f_op->write(file, iovec.iov_base,
- iovec.iov_len, &kiocb->ki_pos);
+ iovec.iov_len, io_kiocb_ppos(kiocb));
}
if (iov_iter_is_bvec(iter))
@@ -3044,7 +3037,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
/* submit ref gets dropped, acquire a new one */
refcount_inc(&req->refs);
- ret = io_req_task_work_add(req, &req->task_work);
+ ret = io_req_task_work_add(req, &req->task_work, true);
if (unlikely(ret)) {
struct task_struct *tsk;
@@ -3125,6 +3118,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
if (ret < 0)
return ret;
+ iov_count = iov_iter_count(iter);
io_size = ret;
req->result = io_size;
ret = 0;
@@ -3137,8 +3131,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
if (force_nonblock && !io_file_supports_async(req->file, READ))
goto copy_iov;
- iov_count = iov_iter_count(iter);
- ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
+ ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
if (unlikely(ret))
goto out_free;
@@ -3150,14 +3143,18 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
ret = 0;
goto out_free;
} else if (ret == -EAGAIN) {
- if (!force_nonblock)
+ /* IOPOLL retry should happen for io-wq threads */
+ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
goto done;
+ /* some cases will consume bytes even on error returns */
+ iov_iter_revert(iter, iov_count - iov_iter_count(iter));
ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
if (ret)
goto out_free;
return -EAGAIN;
} else if (ret < 0) {
- goto out_free;
+ /* make sure -ERESTARTSYS -> -EINTR is done */
+ goto done;
}
/* read it all, or we did blocking attempt. no retry. */
@@ -3241,6 +3238,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
if (ret < 0)
return ret;
+ iov_count = iov_iter_count(iter);
io_size = ret;
req->result = io_size;
@@ -3257,8 +3255,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
(req->flags & REQ_F_ISREG))
goto copy_iov;
- iov_count = iov_iter_count(iter);
- ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
+ ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count);
if (unlikely(ret))
goto out_free;
@@ -3291,9 +3288,14 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
ret2 = -EAGAIN;
if (!force_nonblock || ret2 != -EAGAIN) {
+ /* IOPOLL retry should happen for io-wq threads */
+ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
+ goto copy_iov;
kiocb_done(kiocb, ret2, cs);
} else {
copy_iov:
+ /* some cases will consume bytes even on error returns */
+ iov_iter_revert(iter, iov_count - iov_iter_count(iter));
ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
if (!ret)
return -EAGAIN;
@@ -4566,6 +4568,7 @@ struct io_poll_table {
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
__poll_t mask, task_work_func_t func)
{
+ bool twa_signal_ok;
int ret;
/* for instances that support it check for an event match first: */
@@ -4581,12 +4584,20 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
percpu_ref_get(&req->ctx->refs);
/*
+ * If we using the signalfd wait_queue_head for this wakeup, then
+ * it's not safe to use TWA_SIGNAL as we could be recursing on the
+ * tsk->sighand->siglock on doing the wakeup. Should not be needed
+ * either, as the normal wakeup will suffice.
+ */
+ twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh);
+
+ /*
* If this fails, then the task is exiting. When a task exits, the
* work gets canceled, so just cancel this request as well instead
* of executing it. We can't safely execute it anyway, as we may not
* have the needed state needed for it anyway.
*/
- ret = io_req_task_work_add(req, &req->task_work);
+ ret = io_req_task_work_add(req, &req->task_work, twa_signal_ok);
if (unlikely(ret)) {
struct task_struct *tsk;
@@ -4875,12 +4886,20 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
struct async_poll *apoll;
struct io_poll_table ipt;
__poll_t mask, ret;
+ int rw;
if (!req->file || !file_can_poll(req->file))
return false;
if (req->flags & REQ_F_POLLED)
return false;
- if (!def->pollin && !def->pollout)
+ if (def->pollin)
+ rw = READ;
+ else if (def->pollout)
+ rw = WRITE;
+ else
+ return false;
+ /* if we can't nonblock try, then no point in arming a poll handler */
+ if (!io_file_supports_async(req->file, rw))
return false;
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
@@ -7433,9 +7452,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
{
int ret;
- mmgrab(current->mm);
- ctx->sqo_mm = current->mm;
-
if (ctx->flags & IORING_SETUP_SQPOLL) {
ret = -EPERM;
if (!capable(CAP_SYS_ADMIN))
@@ -7480,10 +7496,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
return 0;
err:
io_finish_async(ctx);
- if (ctx->sqo_mm) {
- mmdrop(ctx->sqo_mm);
- ctx->sqo_mm = NULL;
- }
return ret;
}
@@ -8533,6 +8545,9 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
ctx->user = user;
ctx->creds = get_current_cred();
+ mmgrab(current->mm);
+ ctx->sqo_mm = current->mm;
+
/*
* Account memory _before_ installing the file descriptor. Once
* the descriptor is installed, it can get closed at any time. Also
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 49a604b1c6a6..c09a2a4281ec 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4597,6 +4597,8 @@ static bool nfsd_breaker_owns_lease(struct file_lock *fl)
if (!i_am_nfsd())
return NULL;
rqst = kthread_data(current);
+ if (!rqst->rq_lease_breaker)
+ return NULL;
clp = *(rqst->rq_lease_breaker);
return dl->dl_stid.sc_client == clp;
}
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index e15129647e00..b7e222befb08 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -110,9 +110,9 @@ xfs_trans_log_inode(
* to log the timestamps, or will clear already cleared fields in the
* worst case.
*/
- if (inode->i_state & (I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED)) {
+ if (inode->i_state & I_DIRTY_TIME) {
spin_lock(&inode->i_lock);
- inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
+ inode->i_state &= ~I_DIRTY_TIME;
spin_unlock(&inode->i_lock);
}