summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/cache.c8
-rw-r--r--fs/9p/fid.c14
-rw-r--r--fs/9p/v9fs.c8
-rw-r--r--fs/9p/vfs_addr.c14
-rw-r--r--fs/9p/vfs_file.c33
-rw-r--r--fs/9p/vfs_inode.c24
-rw-r--r--fs/9p/vfs_inode_dotl.c11
-rw-r--r--fs/affs/super.c2
-rw-r--r--fs/afs/dir_silly.c4
-rw-r--r--fs/afs/write.c12
-rw-r--r--fs/aio.c6
-rw-r--r--fs/autofs/waitq.c2
-rw-r--r--fs/btrfs/compression.c4
-rw-r--r--fs/btrfs/ctree.c1
-rw-r--r--fs/btrfs/ctree.h2
-rw-r--r--fs/btrfs/dev-replace.c3
-rw-r--r--fs/btrfs/dir-item.c48
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/extent-tree.c1
-rw-r--r--fs/btrfs/file.c19
-rw-r--r--fs/btrfs/inode.c12
-rw-r--r--fs/btrfs/ioctl.c4
-rw-r--r--fs/btrfs/lzo.c36
-rw-r--r--fs/btrfs/tree-log.c79
-rw-r--r--fs/btrfs/volumes.c10
-rw-r--r--fs/btrfs/zlib.c36
-rw-r--r--fs/btrfs/zstd.c27
-rw-r--r--fs/buffer.c4
-rw-r--r--fs/cachefiles/io.c12
-rw-r--r--fs/cachefiles/rdwr.c16
-rw-r--r--fs/ceph/caps.c12
-rw-r--r--fs/ceph/file.c3
-rw-r--r--fs/ceph/inode.c2
-rw-r--r--fs/ceph/locks.c3
-rw-r--r--fs/ceph/mds_client.c17
-rw-r--r--fs/ceph/super.c17
-rw-r--r--fs/ceph/super.h3
-rw-r--r--fs/cifs/file.c4
-rw-r--r--fs/cramfs/inode.c2
-rw-r--r--fs/crypto/bio.c32
-rw-r--r--fs/crypto/fname.c3
-rw-r--r--fs/crypto/fscrypt_private.h16
-rw-r--r--fs/crypto/hkdf.c11
-rw-r--r--fs/crypto/keysetup.c62
-rw-r--r--fs/direct-io.c16
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/super.c3
-rw-r--r--fs/f2fs/compress.c1
-rw-r--r--fs/f2fs/super.c1
-rw-r--r--fs/fat/inode.c11
-rw-r--r--fs/fs-writeback.c5
-rw-r--r--fs/fscache/object.c2
-rw-r--r--fs/fscache/operation.c3
-rw-r--r--fs/fuse/file.c2
-rw-r--r--fs/fuse/fuse_i.h3
-rw-r--r--fs/fuse/inode.c87
-rw-r--r--fs/fuse/virtio_fs.c12
-rw-r--r--fs/gfs2/file.c6
-rw-r--r--fs/hfs/mdb.c2
-rw-r--r--fs/hfsplus/wrapper.c2
-rw-r--r--fs/internal.h11
-rw-r--r--fs/io-wq.c65
-rw-r--r--fs/io-wq.h59
-rw-r--r--fs/io_uring.c1796
-rw-r--r--fs/iomap/direct-io.c59
-rw-r--r--fs/jfs/jfs_metapage.c1
-rw-r--r--fs/jfs/resize.c5
-rw-r--r--fs/jfs/super.c5
-rw-r--r--fs/kernel_read_file.c2
-rw-r--r--fs/kernfs/dir.c9
-rw-r--r--fs/ksmbd/auth.c16
-rw-r--r--fs/ksmbd/connection.c12
-rw-r--r--fs/ksmbd/glob.h2
-rw-r--r--fs/ksmbd/ksmbd_netlink.h2
-rw-r--r--fs/ksmbd/mgmt/user_config.c2
-rw-r--r--fs/ksmbd/mgmt/user_config.h1
-rw-r--r--fs/ksmbd/smb2misc.c149
-rw-r--r--fs/ksmbd/smb2ops.c8
-rw-r--r--fs/ksmbd/smb2pdu.c405
-rw-r--r--fs/ksmbd/smb2pdu.h3
-rw-r--r--fs/ksmbd/smb_common.c12
-rw-r--r--fs/ksmbd/smb_common.h4
-rw-r--r--fs/ksmbd/transport_ipc.c3
-rw-r--r--fs/ksmbd/transport_ipc.h2
-rw-r--r--fs/ksmbd/transport_rdma.c21
-rw-r--r--fs/ksmbd/vfs.c2
-rw-r--r--fs/ksmbd/vfs.h2
-rw-r--r--fs/locks.c161
-rw-r--r--fs/namei.c4
-rw-r--r--fs/netfs/read_helper.c2
-rw-r--r--fs/nfs/blocklayout/dev.c4
-rw-r--r--fs/nfs/direct.c2
-rw-r--r--fs/nfs/file.c9
-rw-r--r--fs/nfs_common/grace.c1
-rw-r--r--fs/nfsd/Kconfig1
-rw-r--r--fs/nfsd/blocklayout.c158
-rw-r--r--fs/nfsd/filecache.c2
-rw-r--r--fs/nfsd/nfs4layouts.c5
-rw-r--r--fs/nfsd/nfs4xdr.c19
-rw-r--r--fs/nfsd/nfsctl.c7
-rw-r--r--fs/nilfs2/ioctl.c2
-rw-r--r--fs/nilfs2/super.c2
-rw-r--r--fs/nilfs2/the_nilfs.c2
-rw-r--r--fs/ntfs/file.c1
-rw-r--r--fs/ntfs/super.c8
-rw-r--r--fs/ntfs3/attrib.c20
-rw-r--r--fs/ntfs3/attrlist.c9
-rw-r--r--fs/ntfs3/bitfunc.c10
-rw-r--r--fs/ntfs3/bitmap.c14
-rw-r--r--fs/ntfs3/debug.h3
-rw-r--r--fs/ntfs3/dir.c30
-rw-r--r--fs/ntfs3/file.c13
-rw-r--r--fs/ntfs3/frecord.c55
-rw-r--r--fs/ntfs3/fslog.c12
-rw-r--r--fs/ntfs3/fsntfs.c77
-rw-r--r--fs/ntfs3/index.c160
-rw-r--r--fs/ntfs3/inode.c161
-rw-r--r--fs/ntfs3/lib/decompress_common.h5
-rw-r--r--fs/ntfs3/lib/lib.h6
-rw-r--r--fs/ntfs3/lznt.c12
-rw-r--r--fs/ntfs3/namei.c24
-rw-r--r--fs/ntfs3/ntfs.h20
-rw-r--r--fs/ntfs3/ntfs_fs.h67
-rw-r--r--fs/ntfs3/record.c3
-rw-r--r--fs/ntfs3/run.c2
-rw-r--r--fs/ntfs3/super.c651
-rw-r--r--fs/ntfs3/upcase.c8
-rw-r--r--fs/ntfs3/xattr.c249
-rw-r--r--fs/ocfs2/alloc.c46
-rw-r--r--fs/ocfs2/suballoc.c22
-rw-r--r--fs/ocfs2/super.c14
-rw-r--r--fs/orangefs/inode.c2
-rw-r--r--fs/orangefs/super.c1
-rw-r--r--fs/overlayfs/dir.c10
-rw-r--r--fs/overlayfs/file.c19
-rw-r--r--fs/pstore/blk.c8
-rw-r--r--fs/quota/quota.c1
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/read_write.c4
-rw-r--r--fs/reiserfs/super.c8
-rw-r--r--fs/squashfs/super.c5
-rw-r--r--fs/sync.c62
-rw-r--r--fs/ubifs/crypto.c1
-rw-r--r--fs/udf/lowlevel.c5
-rw-r--r--fs/udf/super.c9
-rw-r--r--fs/userfaultfd.c12
-rw-r--r--fs/xfs/xfs_file.c2
-rw-r--r--fs/zonefs/super.c2
148 files changed, 2847 insertions, 2845 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index eb2151fb6049..1769a44f4819 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -23,7 +23,7 @@ struct fscache_netfs v9fs_cache_netfs = {
.version = 0,
};
-/**
+/*
* v9fs_random_cachetag - Generate a random tag to be associated
* with a new cache session.
*
@@ -233,7 +233,7 @@ static void v9fs_vfs_readpage_complete(struct page *page, void *data,
unlock_page(page);
}
-/**
+/*
* __v9fs_readpage_from_fscache - read a page from cache
*
* Returns 0 if the pages are in cache and a BIO is submitted,
@@ -268,7 +268,7 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
}
}
-/**
+/*
* __v9fs_readpages_from_fscache - read multiple pages from cache
*
* Returns 0 if the pages are in cache and a BIO is submitted,
@@ -308,7 +308,7 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
}
}
-/**
+/*
* __v9fs_readpage_to_fscache - write a page to the cache
*
*/
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 9d9de62592be..b8863dd0de5c 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -19,18 +19,18 @@
#include "v9fs_vfs.h"
#include "fid.h"
+static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid)
+{
+ hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata);
+}
+
+
/**
* v9fs_fid_add - add a fid to a dentry
* @dentry: dentry that the fid is being added to
* @fid: fid to add
*
*/
-
-static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid)
-{
- hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata);
-}
-
void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
{
spin_lock(&dentry->d_lock);
@@ -67,7 +67,7 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid)
/**
* v9fs_open_fid_add - add an open fid to an inode
- * @dentry: inode that the fid is being added to
+ * @inode: inode that the fid is being added to
* @fid: fid to add
*
*/
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index cdb99507ef33..2e0fa7c932db 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -155,6 +155,7 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root)
/**
* v9fs_parse_options - parse mount options into session structure
* @v9ses: existing v9fs session information
+ * @opts: The mount option string
*
* Return 0 upon success, -ERRNO upon failure.
*/
@@ -542,12 +543,9 @@ extern int v9fs_error_init(void);
static struct kobject *v9fs_kobj;
#ifdef CONFIG_9P_FSCACHE
-/**
- * caches_show - list caches associated with a session
- *
- * Returns the size of buffer written.
+/*
+ * List caches associated with a session
*/
-
static ssize_t caches_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index cce9ace651a2..1c4f1b39cc95 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -30,8 +30,7 @@
/**
* v9fs_fid_readpage - read an entire page in from 9P
- *
- * @fid: fid being read
+ * @data: Opaque pointer to the fid being read
* @page: structure to page
*
*/
@@ -116,6 +115,8 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
/**
* v9fs_release_page - release the private state associated with a page
+ * @page: The page to be released
+ * @gfp: The caller's allocation restrictions
*
* Returns 1 if the page can be released, false otherwise.
*/
@@ -129,9 +130,9 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
/**
* v9fs_invalidate_page - Invalidate a page completely or partially
- *
- * @page: structure to page
- * @offset: offset in the page
+ * @page: The page to be invalidated
+ * @offset: offset of the invalidated region
+ * @length: length of the invalidated region
*/
static void v9fs_invalidate_page(struct page *page, unsigned int offset,
@@ -199,6 +200,8 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
/**
* v9fs_launder_page - Writeback a dirty page
+ * @page: The page to be cleaned up
+ *
* Returns 0 on success.
*/
@@ -219,6 +222,7 @@ static int v9fs_launder_page(struct page *page)
/**
* v9fs_direct_IO - 9P address space operation for direct I/O
* @iocb: target I/O control block
+ * @iter: The data/buffer to use
*
* The presence of v9fs_direct_IO() in the address space ops vector
* allowes open() O_DIRECT flags which would have failed otherwise.
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index aab5e6538660..246235ebdb70 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -359,14 +359,11 @@ out_err:
}
/**
- * v9fs_file_read - read from a file
- * @filp: file pointer to read
- * @udata: user data buffer to read data into
- * @count: size of buffer
- * @offset: offset at which to read data
+ * v9fs_file_read_iter - read from a file
+ * @iocb: The operation parameters
+ * @to: The buffer to read into
*
*/
-
static ssize_t
v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
@@ -388,11 +385,9 @@ v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
/**
- * v9fs_file_write - write to a file
- * @filp: file pointer to write
- * @data: data buffer to write data from
- * @count: size of buffer
- * @offset: offset at which to write data
+ * v9fs_file_write_iter - write to a file
+ * @iocb: The operation parameters
+ * @from: The data to write
*
*/
static ssize_t
@@ -561,11 +556,9 @@ out_unlock:
}
/**
- * v9fs_mmap_file_read - read from a file
- * @filp: file pointer to read
- * @data: user data buffer to read data into
- * @count: size of buffer
- * @offset: offset at which to read data
+ * v9fs_mmap_file_read_iter - read from a file
+ * @iocb: The operation parameters
+ * @to: The buffer to read into
*
*/
static ssize_t
@@ -576,11 +569,9 @@ v9fs_mmap_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
/**
- * v9fs_mmap_file_write - write to a file
- * @filp: file pointer to write
- * @data: data buffer to write data from
- * @count: size of buffer
- * @offset: offset at which to write data
+ * v9fs_mmap_file_write_iter - write to a file
+ * @iocb: The operation parameters
+ * @from: The data to write
*
*/
static ssize_t
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 795706520b5e..08f48b70a741 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -218,7 +218,7 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
/**
* v9fs_alloc_inode - helper function to allocate an inode
- *
+ * @sb: The superblock to allocate the inode from
*/
struct inode *v9fs_alloc_inode(struct super_block *sb)
{
@@ -238,7 +238,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
/**
* v9fs_free_inode - destroy an inode
- *
+ * @inode: The inode to be freed
*/
void v9fs_free_inode(struct inode *inode)
@@ -343,7 +343,7 @@ error:
* v9fs_get_inode - helper function to setup an inode
* @sb: superblock
* @mode: mode to setup inode with
- *
+ * @rdev: The device numbers to set
*/
struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
@@ -369,7 +369,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
}
/**
- * v9fs_clear_inode - release an inode
+ * v9fs_evict_inode - Remove an inode from the inode cache
* @inode: inode to release
*
*/
@@ -665,14 +665,15 @@ error:
/**
* v9fs_vfs_create - VFS hook to create a regular file
+ * @mnt_userns: The user namespace of the mount
+ * @dir: The parent directory
+ * @dentry: The name of file to be created
+ * @mode: The UNIX file mode to set
+ * @excl: True if the file must not yet exist
*
* open(.., O_CREAT) is handled in v9fs_vfs_atomic_open(). This is only called
* for mknod(2).
*
- * @dir: directory inode that is being created
- * @dentry: dentry that is being deleted
- * @mode: create permissions
- *
*/
static int
@@ -696,6 +697,7 @@ v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
/**
* v9fs_vfs_mkdir - VFS mkdir hook to create a directory
+ * @mnt_userns: The user namespace of the mount
* @dir: inode that is being unlinked
* @dentry: dentry that is being unlinked
* @mode: mode for new directory
@@ -900,10 +902,12 @@ int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
/**
* v9fs_vfs_rename - VFS hook to rename an inode
+ * @mnt_userns: The user namespace of the mount
* @old_dir: old dir inode
* @old_dentry: old dentry
* @new_dir: new dir inode
* @new_dentry: new dentry
+ * @flags: RENAME_* flags
*
*/
@@ -1009,6 +1013,7 @@ done:
/**
* v9fs_vfs_getattr - retrieve file metadata
+ * @mnt_userns: The user namespace of the mount
* @path: Object to query
* @stat: metadata structure to populate
* @request_mask: Mask of STATX_xxx flags indicating the caller's interests
@@ -1050,6 +1055,7 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
/**
* v9fs_vfs_setattr - set file metadata
+ * @mnt_userns: The user namespace of the mount
* @dentry: file whose metadata to set
* @iattr: metadata assignment structure
*
@@ -1285,6 +1291,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
/**
* v9fs_vfs_symlink - helper function to create symlinks
+ * @mnt_userns: The user namespace of the mount
* @dir: directory inode containing symlink
* @dentry: dentry for symlink
* @symname: symlink data
@@ -1340,6 +1347,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
/**
* v9fs_vfs_mknod - create a special file
+ * @mnt_userns: The user namespace of the mount
* @dir: inode destination for new link
* @dentry: dentry for file
* @mode: mode for creation
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index e1c0240b51c0..01b9e1281a29 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -37,7 +37,10 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t omode, dev_t rdev);
/**
- * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
+ * v9fs_get_fsgid_for_create - Helper function to get the gid for a new object
+ * @dir_inode: The directory inode
+ *
+ * Helper function to get the gid for creating a
* new file system object. This checks the S_ISGID to determine the owning
* group of the new file system object.
*/
@@ -211,12 +214,13 @@ int v9fs_open_to_dotl_flags(int flags)
/**
* v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
+ * @mnt_userns: The user namespace of the mount
* @dir: directory inode that is being created
* @dentry: dentry that is being deleted
* @omode: create permissions
+ * @excl: True if the file must not yet exist
*
*/
-
static int
v9fs_vfs_create_dotl(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t omode, bool excl)
@@ -361,6 +365,7 @@ err_clunk_old_fid:
/**
* v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
+ * @mnt_userns: The user namespace of the mount
* @dir: inode that is being unlinked
* @dentry: dentry that is being unlinked
* @omode: mode for new directory
@@ -537,6 +542,7 @@ static int v9fs_mapped_iattr_valid(int iattr_valid)
/**
* v9fs_vfs_setattr_dotl - set file metadata
+ * @mnt_userns: The user namespace of the mount
* @dentry: file whose metadata to set
* @iattr: metadata assignment structure
*
@@ -816,6 +822,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
/**
* v9fs_vfs_mknod_dotl - create a special file
+ * @mnt_userns: The user namespace of the mount
* @dir: inode destination for new link
* @dentry: dentry for file
* @omode: mode for creation
diff --git a/fs/affs/super.c b/fs/affs/super.c
index c6c2a513ec92..c609005a9eaa 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -389,7 +389,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
* blocks, we will have to change it.
*/
- size = i_size_read(sb->s_bdev->bd_inode) >> 9;
+ size = bdev_nr_sectors(sb->s_bdev);
pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
affs_set_blocksize(sb, PAGE_SIZE);
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index dae9a57d7ec0..45cfd50a9521 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -86,8 +86,8 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
return afs_do_sync_operation(op);
}
-/**
- * afs_sillyrename - Perform a silly-rename of a dentry
+/*
+ * Perform silly-rename of a dentry.
*
* AFS is stateless and the server doesn't know when the client is holding a
* file open. To prevent application problems when a file is unlinked while
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 2dfe3b3a53d6..8b1d9c2f6bec 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -861,7 +861,8 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
*/
vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
{
- struct page *page = thp_head(vmf->page);
+ struct folio *folio = page_folio(vmf->page);
+ struct page *page = &folio->page;
struct file *file = vmf->vma->vm_file;
struct inode *inode = file_inode(file);
struct afs_vnode *vnode = AFS_FS_I(inode);
@@ -884,7 +885,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
goto out;
#endif
- if (wait_on_page_writeback_killable(page))
+ if (folio_wait_writeback_killable(folio))
goto out;
if (lock_page_killable(page) < 0)
@@ -894,8 +895,8 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
* details the portion of the page we need to write back and we might
* need to redirty the page if there's a problem.
*/
- if (wait_on_page_writeback_killable(page) < 0) {
- unlock_page(page);
+ if (folio_wait_writeback_killable(folio) < 0) {
+ folio_unlock(folio);
goto out;
}
@@ -974,8 +975,7 @@ int afs_launder_page(struct page *page)
iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len);
trace_afs_page_dirty(vnode, tracepoint_string("launder"), page);
- ret = afs_store_data(vnode, &iter, (loff_t)page->index * PAGE_SIZE,
- true);
+ ret = afs_store_data(vnode, &iter, page_offset(page) + f, true);
}
trace_afs_page_dirty(vnode, tracepoint_string("laundered"), page);
diff --git a/fs/aio.c b/fs/aio.c
index 51b08ab01dff..836dc7e48db7 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1417,7 +1417,7 @@ static void aio_remove_iocb(struct aio_kiocb *iocb)
spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}
-static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void aio_complete_rw(struct kiocb *kiocb, long res)
{
struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
@@ -1437,7 +1437,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
}
iocb->ki_res.res = res;
- iocb->ki_res.res2 = res2;
+ iocb->ki_res.res2 = 0;
iocb_put(iocb);
}
@@ -1508,7 +1508,7 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
ret = -EINTR;
fallthrough;
default:
- req->ki_complete(req, ret, 0);
+ req->ki_complete(req, ret);
}
}
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index 16b5fca0626e..54c1f8b8b075 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -358,7 +358,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
qstr.len = strlen(p);
offset = p - name;
}
- qstr.hash = full_name_hash(dentry, name, qstr.len);
+ qstr.hash = full_name_hash(dentry, qstr.name, qstr.len);
if (mutex_lock_interruptible(&sbi->wq_mutex)) {
kfree(name);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 7869ad12bc6e..6c7eb80220ca 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -9,6 +9,7 @@
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
+#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
@@ -172,9 +173,10 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
/* Hash through the page sector by sector */
for (pg_offset = 0; pg_offset < bytes_left;
pg_offset += sectorsize) {
- kaddr = page_address(page);
+ kaddr = kmap_atomic(page);
crypto_shash_digest(shash, kaddr + pg_offset,
sectorsize, csum);
+ kunmap_atomic(kaddr);
if (memcmp(&csum, cb_sum, csum_size) != 0) {
btrfs_print_data_csum_error(inode, disk_start,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 84627cbd5b5b..66290b214f2b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
+#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index dff2c8a3e059..c0cebcf745ce 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3030,7 +3030,7 @@ struct btrfs_dir_item *
btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
- u64 objectid, const char *name, int name_len,
+ u64 index, const char *name, int name_len,
int mod);
struct btrfs_dir_item *
btrfs_search_dir_index_item(struct btrfs_root *root,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index d029be40ea6f..fbb8b4457a72 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -283,8 +283,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
}
- if (i_size_read(bdev->bd_inode) <
- btrfs_device_get_total_bytes(srcdev)) {
+ if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) {
btrfs_err(fs_info,
"target device is smaller than source device!");
ret = -EINVAL;
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index f1274d5c3805..7721ce0c0604 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -190,9 +190,20 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir(
}
/*
- * lookup a directory item based on name. 'dir' is the objectid
- * we're searching in, and 'mod' tells us if you plan on deleting the
- * item (use mod < 0) or changing the options (use mod > 0)
+ * Lookup for a directory item by name.
+ *
+ * @trans: The transaction handle to use. Can be NULL if @mod is 0.
+ * @root: The root of the target tree.
+ * @path: Path to use for the search.
+ * @dir: The inode number (objectid) of the directory.
+ * @name: The name associated to the directory entry we are looking for.
+ * @name_len: The length of the name.
+ * @mod: Used to indicate if the tree search is meant for a read only
+ * lookup, for a modification lookup or for a deletion lookup, so
+ * its value should be 0, 1 or -1, respectively.
+ *
+ * Returns: NULL if the dir item does not exists, an error pointer if an error
+ * happened, or a pointer to a dir item if a dir item exists for the given name.
*/
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -273,27 +284,42 @@ out:
}
/*
- * lookup a directory item based on index. 'dir' is the objectid
- * we're searching in, and 'mod' tells us if you plan on deleting the
- * item (use mod < 0) or changing the options (use mod > 0)
+ * Lookup for a directory index item by name and index number.
*
- * The name is used to make sure the index really points to the name you were
- * looking for.
+ * @trans: The transaction handle to use. Can be NULL if @mod is 0.
+ * @root: The root of the target tree.
+ * @path: Path to use for the search.
+ * @dir: The inode number (objectid) of the directory.
+ * @index: The index number.
+ * @name: The name associated to the directory entry we are looking for.
+ * @name_len: The length of the name.
+ * @mod: Used to indicate if the tree search is meant for a read only
+ * lookup, for a modification lookup or for a deletion lookup, so
+ * its value should be 0, 1 or -1, respectively.
+ *
+ * Returns: NULL if the dir index item does not exists, an error pointer if an
+ * error happened, or a pointer to a dir item if the dir index item exists and
+ * matches the criteria (name and index number).
*/
struct btrfs_dir_item *
btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
- u64 objectid, const char *name, int name_len,
+ u64 index, const char *name, int name_len,
int mod)
{
+ struct btrfs_dir_item *di;
struct btrfs_key key;
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
- key.offset = objectid;
+ key.offset = index;
- return btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ if (di == ERR_PTR(-ENOENT))
+ return NULL;
+
+ return di;
}
struct btrfs_dir_item *
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 355ea88d5c5f..29e7598584c4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3740,7 +3740,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
else if (ret)
return ERR_PTR(ret);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fc3da7585fb7..0ab456cb4bf8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4859,6 +4859,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
out_free_delayed:
btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
+ btrfs_tree_unlock(buf);
free_extent_buffer(buf);
out_free_reserved:
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 7ff577005d0f..a1762363f61f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -734,8 +734,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
if (args->start >= inode->disk_i_size && !args->replace_extent)
modify_tree = 0;
- update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
- root == fs_info->tree_root);
+ update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
while (1) {
recow = 0;
ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@ -2704,14 +2703,16 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
drop_args.bytes_found);
if (ret != -ENOSPC) {
/*
- * When cloning we want to avoid transaction aborts when
- * nothing was done and we are attempting to clone parts
- * of inline extents, in such cases -EOPNOTSUPP is
- * returned by __btrfs_drop_extents() without having
- * changed anything in the file.
+ * The only time we don't want to abort is if we are
+ * attempting to clone a partial inline extent, in which
+ * case we'll get EOPNOTSUPP. However if we aren't
+ * clone we need to abort no matter what, because if we
+ * got EOPNOTSUPP via prealloc then we messed up and
+ * need to abort.
*/
- if (extent_info && !extent_info->is_new_extent &&
- ret && ret != -EOPNOTSUPP)
+ if (ret &&
+ (ret != -EOPNOTSUPP ||
+ (extent_info && extent_info->is_new_extent)))
btrfs_abort_transaction(trans, ret);
break;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 487533c35ddb..954b53a90f04 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6,6 +6,7 @@
#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
+#include <linux/blk-cgroup.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -287,8 +288,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
cur_size = min_t(unsigned long, compressed_size,
PAGE_SIZE);
- kaddr = page_address(cpage);
+ kaddr = kmap_atomic(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
+ kunmap_atomic(kaddr);
i++;
ptr += cur_size;
@@ -8247,7 +8249,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
return dip;
}
-static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
+static void btrfs_submit_direct(const struct iomap_iter *iter,
struct bio *dio_bio, loff_t file_offset)
{
struct inode *inode = iter->inode;
@@ -8277,7 +8279,7 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
}
dio_bio->bi_status = BLK_STS_RESOURCE;
bio_endio(dio_bio);
- return BLK_QC_T_NONE;
+ return;
}
if (!write) {
@@ -8371,15 +8373,13 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
free_extent_map(em);
} while (submit_len > 0);
- return BLK_QC_T_NONE;
+ return;
out_err_em:
free_extent_map(em);
out_err:
dip->dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
-
- return BLK_QC_T_NONE;
}
const struct iomap_ops btrfs_dio_iomap_ops = {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cc61813213d8..36ff713da1b1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1730,7 +1730,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
}
if (!strcmp(sizestr, "max"))
- new_size = device->bdev->bd_inode->i_size;
+ new_size = bdev_nr_bytes(device->bdev);
else {
if (sizestr[0] == '-') {
mod = -1;
@@ -1771,7 +1771,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
ret = -EINVAL;
goto out_finish;
}
- if (new_size > device->bdev->bd_inode->i_size) {
+ if (new_size > bdev_nr_bytes(device->bdev)) {
ret = -EFBIG;
goto out_finish;
}
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index c25dfd1a8a54..3dbe6eb5fda7 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -141,7 +141,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_in = 0;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = page_address(in_page);
+ data_in = kmap(in_page);
/*
* store the size of all chunks of compressed data in
@@ -152,7 +152,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
+ cpage_out = kmap(out_page);
out_offset = LZO_LEN;
tot_out = LZO_LEN;
pages[0] = out_page;
@@ -210,6 +210,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
if (out_len == 0 && tot_in >= len)
break;
+ kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
@@ -221,7 +222,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
+ cpage_out = kmap(out_page);
pages[nr_pages++] = out_page;
pg_bytes_left = PAGE_SIZE;
@@ -243,11 +244,12 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
break;
bytes_left = len - tot_in;
+ kunmap(in_page);
put_page(in_page);
start += PAGE_SIZE;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = page_address(in_page);
+ data_in = kmap(in_page);
in_len = min(bytes_left, PAGE_SIZE);
}
@@ -257,17 +259,22 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
}
/* store the size of all chunks of compressed data */
- sizes_ptr = page_address(pages[0]);
+ sizes_ptr = kmap_local_page(pages[0]);
write_compress_length(sizes_ptr, tot_out);
+ kunmap_local(sizes_ptr);
ret = 0;
*total_out = tot_out;
*total_in = tot_in;
out:
*out_pages = nr_pages;
+ if (out_page)
+ kunmap(out_page);
- if (in_page)
+ if (in_page) {
+ kunmap(in_page);
put_page(in_page);
+ }
return ret;
}
@@ -283,6 +290,7 @@ static void copy_compressed_segment(struct compressed_bio *cb,
u32 orig_in = *cur_in;
while (*cur_in < orig_in + len) {
+ char *kaddr;
struct page *cur_page;
u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
orig_in + len - *cur_in);
@@ -290,9 +298,11 @@ static void copy_compressed_segment(struct compressed_bio *cb,
ASSERT(copy_len);
cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
+ kaddr = kmap(cur_page);
memcpy(dest + *cur_in - orig_in,
- page_address(cur_page) + offset_in_page(*cur_in),
+ kaddr + offset_in_page(*cur_in),
copy_len);
+ kunmap(cur_page);
*cur_in += copy_len;
}
@@ -303,6 +313,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
struct workspace *workspace = list_entry(ws, struct workspace, list);
const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
+ char *kaddr;
int ret;
/* Compressed data length, can be unaligned */
u32 len_in;
@@ -311,7 +322,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
/* Bytes decompressed so far */
u32 cur_out = 0;
- len_in = read_compress_length(page_address(cb->compressed_pages[0]));
+ kaddr = kmap(cb->compressed_pages[0]);
+ len_in = read_compress_length(kaddr);
+ kunmap(cb->compressed_pages[0]);
cur_in += LZO_LEN;
/*
@@ -344,9 +357,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
ASSERT(cur_in / sectorsize ==
(cur_in + LZO_LEN - 1) / sectorsize);
cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
+ kaddr = kmap(cur_page);
ASSERT(cur_page);
- seg_len = read_compress_length(page_address(cur_page) +
- offset_in_page(cur_in));
+ seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
cur_in += LZO_LEN;
/* Copy the compressed segment payload into workspace */
@@ -431,7 +444,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
destlen = min_t(unsigned long, destlen, PAGE_SIZE);
bytes = min_t(unsigned long, destlen, out_len - start_byte);
- kaddr = page_address(dest_page);
+ kaddr = kmap_local_page(dest_page);
memcpy(kaddr, workspace->buf + start_byte, bytes);
/*
@@ -441,6 +454,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
*/
if (bytes < destlen)
memset(kaddr+bytes, 0, destlen-bytes);
+ kunmap_local(kaddr);
out:
return ret;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f7efc26aa82a..b415c5ec03ea 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -939,9 +939,11 @@ out:
}
/*
- * helper function to see if a given name and sequence number found
- * in an inode back reference are already in a directory and correctly
- * point to this inode
+ * See if a given name and sequence number found in an inode back reference are
+ * already in a directory and correctly point to this inode.
+ *
+ * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
+ * exists.
*/
static noinline int inode_in_dir(struct btrfs_root *root,
struct btrfs_path *path,
@@ -950,29 +952,34 @@ static noinline int inode_in_dir(struct btrfs_root *root,
{
struct btrfs_dir_item *di;
struct btrfs_key location;
- int match = 0;
+ int ret = 0;
di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
index, name, name_len, 0);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ } else if (di) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
if (location.objectid != objectid)
goto out;
- } else
+ } else {
goto out;
- btrfs_release_path(path);
+ }
+ btrfs_release_path(path);
di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
- if (di && !IS_ERR(di)) {
- btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
- if (location.objectid != objectid)
- goto out;
- } else
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
goto out;
- match = 1;
+ } else if (di) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ if (location.objectid == objectid)
+ ret = 1;
+ }
out:
btrfs_release_path(path);
- return match;
+ return ret;
}
/*
@@ -1182,7 +1189,9 @@ next:
/* look for a conflicting sequence number */
di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
ref_index, name, namelen, 0);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ return PTR_ERR(di);
+ } else if (di) {
ret = drop_one_dir_item(trans, root, path, dir, di);
if (ret)
return ret;
@@ -1192,7 +1201,9 @@ next:
/* look for a conflicting name */
di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
name, namelen, 0);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ return PTR_ERR(di);
+ } else if (di) {
ret = drop_one_dir_item(trans, root, path, dir, di);
if (ret)
return ret;
@@ -1517,10 +1528,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- /* if we already have a perfect match, we're done */
- if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
- btrfs_ino(BTRFS_I(inode)), ref_index,
- name, namelen)) {
+ ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
+ btrfs_ino(BTRFS_I(inode)), ref_index,
+ name, namelen);
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
/*
* look for a conflicting back reference in the
* metadata. if we find one we have to unlink that name
@@ -1580,6 +1593,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
}
+ /* Else, ret == 1, we already have a perfect match, we're done. */
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
kfree(name);
@@ -1936,8 +1950,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
struct btrfs_key log_key;
struct inode *dir;
u8 log_type;
- int exists;
- int ret = 0;
+ bool exists;
+ int ret;
bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
bool name_added = false;
@@ -1957,12 +1971,12 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
name_len);
btrfs_dir_item_key_to_cpu(eb, di, &log_key);
- exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
- if (exists == 0)
- exists = 1;
- else
- exists = 0;
+ ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
btrfs_release_path(path);
+ if (ret < 0)
+ goto out;
+ exists = (ret == 0);
+ ret = 0;
if (key->type == BTRFS_DIR_ITEM_KEY) {
dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
@@ -1977,7 +1991,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
ret = -EINVAL;
goto out;
}
- if (IS_ERR_OR_NULL(dst_di)) {
+
+ if (IS_ERR(dst_di)) {
+ ret = PTR_ERR(dst_di);
+ goto out;
+ } else if (!dst_di) {
/* we need a sequence number to insert, so we only
* do inserts for the BTRFS_DIR_INDEX_KEY types
*/
@@ -2281,7 +2299,7 @@ again:
dir_key->offset,
name, name_len, 0);
}
- if (!log_di || log_di == ERR_PTR(-ENOENT)) {
+ if (!log_di) {
btrfs_dir_item_key_to_cpu(eb, di, &location);
btrfs_release_path(path);
btrfs_release_path(log_path);
@@ -3540,8 +3558,7 @@ out_unlock:
if (err == -ENOSPC) {
btrfs_set_log_full_commit(trans);
err = 0;
- } else if (err < 0 && err != -ENOENT) {
- /* ENOENT can be returned if the entry hasn't been fsynced yet */
+ } else if (err < 0) {
btrfs_abort_transaction(trans, err);
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2ec3b8ac8fa3..9533f358850e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -508,7 +508,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
}
if (flush)
- filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
+ sync_blockdev(*bdev);
ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
blkdev_put(*bdev, flags);
@@ -1286,7 +1286,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
pgoff_t index;
/* make sure our super fits in the device */
- if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
+ if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
/* make sure our super fits in the page */
@@ -2610,8 +2610,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
device->sector_size = fs_info->sectorsize;
- device->total_bytes = round_down(i_size_read(bdev->bd_inode),
- fs_info->sectorsize);
+ device->total_bytes =
+ round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
@@ -7236,7 +7236,7 @@ static int read_one_dev(struct extent_buffer *leaf,
fill_device_from_item(leaf, dev_item, device);
if (device->bdev) {
- u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
+ u64 max_total_bytes = bdev_nr_bytes(device->bdev);
if (device->total_bytes > max_total_bytes) {
btrfs_err(fs_info,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 8afa90074891..767a0c6c9694 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -126,7 +126,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
+ cpage_out = kmap(out_page);
pages[0] = out_page;
nr_pages = 1;
@@ -148,22 +148,26 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
int i;
for (i = 0; i < in_buf_pages; i++) {
- if (in_page)
+ if (in_page) {
+ kunmap(in_page);
put_page(in_page);
+ }
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
- data_in = page_address(in_page);
+ data_in = kmap(in_page);
memcpy(workspace->buf + i * PAGE_SIZE,
data_in, PAGE_SIZE);
start += PAGE_SIZE;
}
workspace->strm.next_in = workspace->buf;
} else {
- if (in_page)
+ if (in_page) {
+ kunmap(in_page);
put_page(in_page);
+ }
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
- data_in = page_address(in_page);
+ data_in = kmap(in_page);
start += PAGE_SIZE;
workspace->strm.next_in = data_in;
}
@@ -192,6 +196,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
* the stream end if required
*/
if (workspace->strm.avail_out == 0) {
+ kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
@@ -202,7 +207,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
+ cpage_out = kmap(out_page);
pages[nr_pages] = out_page;
nr_pages++;
workspace->strm.avail_out = PAGE_SIZE;
@@ -229,6 +234,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
} else if (workspace->strm.avail_out == 0) {
/* get another page for the stream end */
+ kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
@@ -239,7 +245,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
+ cpage_out = kmap(out_page);
pages[nr_pages] = out_page;
nr_pages++;
workspace->strm.avail_out = PAGE_SIZE;
@@ -258,8 +264,13 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_in = workspace->strm.total_in;
out:
*out_pages = nr_pages;
- if (in_page)
+ if (out_page)
+ kunmap(out_page);
+
+ if (in_page) {
+ kunmap(in_page);
put_page(in_page);
+ }
return ret;
}
@@ -276,7 +287,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
unsigned long buf_start;
struct page **pages_in = cb->compressed_pages;
- data_in = page_address(pages_in[page_in_index]);
+ data_in = kmap(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
workspace->strm.total_in = 0;
@@ -298,6 +309,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
pr_warn("BTRFS: inflateInit failed\n");
+ kunmap(pages_in[page_in_index]);
return -EIO;
}
while (workspace->strm.total_in < srclen) {
@@ -324,13 +336,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->strm.avail_in == 0) {
unsigned long tmp;
-
+ kunmap(pages_in[page_in_index]);
page_in_index++;
if (page_in_index >= total_pages_in) {
data_in = NULL;
break;
}
- data_in = page_address(pages_in[page_in_index]);
+ data_in = kmap(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
workspace->strm.avail_in = min(tmp, PAGE_SIZE);
@@ -342,6 +354,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
ret = 0;
done:
zlib_inflateEnd(&workspace->strm);
+ if (data_in)
+ kunmap(pages_in[page_in_index]);
if (!ret)
zero_fill_bio(cb->orig_bio);
return ret;
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 56dce9f00988..f06b68040352 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -399,7 +399,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* map in the first page of input data */
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = page_address(in_page);
+ workspace->in_buf.src = kmap(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
@@ -411,7 +411,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ workspace->out_buf.dst = kmap(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
@@ -446,6 +446,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
if (workspace->out_buf.pos == workspace->out_buf.size) {
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
+ kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
@@ -457,7 +458,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ workspace->out_buf.dst = kmap(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out,
PAGE_SIZE);
@@ -472,12 +473,13 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* Check if we need more input */
if (workspace->in_buf.pos == workspace->in_buf.size) {
tot_in += PAGE_SIZE;
+ kunmap(in_page);
put_page(in_page);
start += PAGE_SIZE;
len -= PAGE_SIZE;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = page_address(in_page);
+ workspace->in_buf.src = kmap(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
}
@@ -504,6 +506,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
+ kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
@@ -515,7 +518,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ workspace->out_buf.dst = kmap(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
}
@@ -531,8 +534,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
out:
*out_pages = nr_pages;
/* Cleanup */
- if (in_page)
+ if (in_page) {
+ kunmap(in_page);
put_page(in_page);
+ }
+ if (out_page)
+ kunmap(out_page);
return ret;
}
@@ -556,7 +563,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
goto done;
}
- workspace->in_buf.src = page_address(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
@@ -592,14 +599,14 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
break;
if (workspace->in_buf.pos == workspace->in_buf.size) {
- page_in_index++;
+ kunmap(pages_in[page_in_index++]);
if (page_in_index >= total_pages_in) {
workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
srclen -= PAGE_SIZE;
- workspace->in_buf.src = page_address(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
}
@@ -607,6 +614,8 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
ret = 0;
zero_fill_bio(cb->orig_bio);
done:
+ if (workspace->in_buf.src)
+ kunmap(pages_in[page_in_index]);
return ret;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index c615387aedca..46bc589b7a03 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -878,7 +878,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
{
sector_t retval = ~((sector_t)0);
- loff_t sz = i_size_read(bdev->bd_inode);
+ loff_t sz = bdev_nr_bytes(bdev);
if (sz) {
unsigned int sizebits = blksize_bits(size);
@@ -897,7 +897,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh = head;
int uptodate = PageUptodate(page);
- sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
+ sector_t end_block = blkdev_max_block(bdev, size);
do {
if (!buffer_mapped(bh)) {
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index fac2e8e7b533..effe37ef8629 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -37,11 +37,11 @@ static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki)
/*
* Handle completion of a read from the cache.
*/
-static void cachefiles_read_complete(struct kiocb *iocb, long ret, long ret2)
+static void cachefiles_read_complete(struct kiocb *iocb, long ret)
{
struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
- _enter("%ld,%ld", ret, ret2);
+ _enter("%ld", ret);
if (ki->term_func) {
if (ret >= 0)
@@ -139,7 +139,7 @@ static int cachefiles_read(struct netfs_cache_resources *cres,
fallthrough;
default:
ki->was_async = false;
- cachefiles_read_complete(&ki->iocb, ret, 0);
+ cachefiles_read_complete(&ki->iocb, ret);
if (ret > 0)
ret = 0;
break;
@@ -159,12 +159,12 @@ presubmission_error:
/*
* Handle completion of a write to the cache.
*/
-static void cachefiles_write_complete(struct kiocb *iocb, long ret, long ret2)
+static void cachefiles_write_complete(struct kiocb *iocb, long ret)
{
struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
struct inode *inode = file_inode(ki->iocb.ki_filp);
- _enter("%ld,%ld", ret, ret2);
+ _enter("%ld", ret);
/* Tell lockdep we inherited freeze protection from submission thread */
__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
@@ -244,7 +244,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
fallthrough;
default:
ki->was_async = false;
- cachefiles_write_complete(&ki->iocb, ret, 0);
+ cachefiles_write_complete(&ki->iocb, ret);
if (ret > 0)
ret = 0;
break;
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 8ffc40e84a59..fcf4f3b72923 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -25,20 +25,20 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
struct cachefiles_object *object;
struct fscache_retrieval *op = monitor->op;
struct wait_page_key *key = _key;
- struct page *page = wait->private;
+ struct folio *folio = wait->private;
ASSERT(key);
_enter("{%lu},%u,%d,{%p,%u}",
monitor->netfs_page->index, mode, sync,
- key->page, key->bit_nr);
+ key->folio, key->bit_nr);
- if (key->page != page || key->bit_nr != PG_locked)
+ if (key->folio != folio || key->bit_nr != PG_locked)
return 0;
- _debug("--- monitor %p %lx ---", page, page->flags);
+ _debug("--- monitor %p %lx ---", folio, folio->flags);
- if (!PageUptodate(page) && !PageError(page)) {
+ if (!folio_test_uptodate(folio) && !folio_test_error(folio)) {
/* unlocked, not uptodate and not erronous? */
_debug("page probably truncated");
}
@@ -107,7 +107,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
put_page(backpage2);
INIT_LIST_HEAD(&monitor->op_link);
- add_page_wait_queue(backpage, &monitor->monitor);
+ folio_add_wait_queue(page_folio(backpage), &monitor->monitor);
if (trylock_page(backpage)) {
ret = -EIO;
@@ -294,7 +294,7 @@ monitor_backing_page:
get_page(backpage);
monitor->back_page = backpage;
monitor->monitor.private = backpage;
- add_page_wait_queue(backpage, &monitor->monitor);
+ folio_add_wait_queue(page_folio(backpage), &monitor->monitor);
monitor = NULL;
/* but the page may have been read before the monitor was installed, so
@@ -548,7 +548,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
get_page(backpage);
monitor->back_page = backpage;
monitor->monitor.private = backpage;
- add_page_wait_queue(backpage, &monitor->monitor);
+ folio_add_wait_queue(page_folio(backpage), &monitor->monitor);
monitor = NULL;
/* but the page may have been read before the monitor was
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 3e42d0466521..8f537f1d9d1d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2330,7 +2330,6 @@ retry:
int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct ceph_file_info *fi = file->private_data;
struct inode *inode = file->f_mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
u64 flush_tid;
@@ -2365,14 +2364,9 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (err < 0)
ret = err;
- if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) {
- spin_lock(&file->f_lock);
- err = errseq_check_and_advance(&ci->i_meta_err,
- &fi->meta_err);
- spin_unlock(&file->f_lock);
- if (err < 0)
- ret = err;
- }
+ err = file_check_and_advance_wb_err(file);
+ if (err < 0)
+ ret = err;
out:
dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
return ret;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d16fd2d5fd42..b129ea551378 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -233,7 +233,6 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
spin_lock_init(&fi->rw_contexts_lock);
INIT_LIST_HEAD(&fi->rw_contexts);
- fi->meta_err = errseq_sample(&ci->i_meta_err);
fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
return 0;
@@ -1023,7 +1022,7 @@ static void ceph_aio_complete(struct inode *inode,
ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
CEPH_CAP_FILE_RD));
- aio_req->iocb->ki_complete(aio_req->iocb, ret, 0);
+ aio_req->iocb->ki_complete(aio_req->iocb, ret);
ceph_free_cap_flush(aio_req->prealloc_cf);
kfree(aio_req);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 2df1e1284451..1c7574105478 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -541,8 +541,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ceph_fscache_inode_init(ci);
- ci->i_meta_err = 0;
-
return &ci->vfs_inode;
}
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index bdeb271f47d9..d8c31069fbf2 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -302,9 +302,6 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- /* No mandatory locks */
- if (fl->fl_type & LOCK_MAND)
- return -EOPNOTSUPP;
dout("ceph_flock, fl_file: %p\n", fl->fl_file);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 7cad180d6deb..d64413adc0fd 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1493,7 +1493,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
{
struct ceph_mds_request *req;
struct rb_node *p;
- struct ceph_inode_info *ci;
dout("cleanup_session_requests mds%d\n", session->s_mds);
mutex_lock(&mdsc->mutex);
@@ -1502,16 +1501,10 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
struct ceph_mds_request, r_unsafe_item);
pr_warn_ratelimited(" dropping unsafe request %llu\n",
req->r_tid);
- if (req->r_target_inode) {
- /* dropping unsafe change of inode's attributes */
- ci = ceph_inode(req->r_target_inode);
- errseq_set(&ci->i_meta_err, -EIO);
- }
- if (req->r_unsafe_dir) {
- /* dropping unsafe directory operation */
- ci = ceph_inode(req->r_unsafe_dir);
- errseq_set(&ci->i_meta_err, -EIO);
- }
+ if (req->r_target_inode)
+ mapping_set_error(req->r_target_inode->i_mapping, -EIO);
+ if (req->r_unsafe_dir)
+ mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO);
__unregister_request(mdsc, req);
}
/* zero r_attempts, so kick_requests() will re-send requests */
@@ -1678,7 +1671,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
spin_unlock(&mdsc->cap_dirty_lock);
if (dirty_dropped) {
- errseq_set(&ci->i_meta_err, -EIO);
+ mapping_set_error(inode->i_mapping, -EIO);
if (ci->i_wrbuffer_ref_head == 0 &&
ci->i_wr_ref == 0 &&
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9b1b7f4cfdd4..fd8742bae847 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1002,16 +1002,16 @@ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc)
struct ceph_fs_client *new = fc->s_fs_info;
struct ceph_mount_options *fsopt = new->mount_options;
struct ceph_options *opt = new->client->options;
- struct ceph_fs_client *other = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
dout("ceph_compare_super %p\n", sb);
- if (compare_mount_options(fsopt, opt, other)) {
+ if (compare_mount_options(fsopt, opt, fsc)) {
dout("monitor(s)/mount options don't match\n");
return 0;
}
if ((opt->flags & CEPH_OPT_FSID) &&
- ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
+ ceph_fsid_compare(&opt->fsid, &fsc->client->fsid)) {
dout("fsid doesn't match\n");
return 0;
}
@@ -1019,6 +1019,17 @@ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc)
dout("flags differ\n");
return 0;
}
+
+ if (fsc->blocklisted && !ceph_test_mount_opt(fsc, CLEANRECOVER)) {
+ dout("client is blocklisted (and CLEANRECOVER is not set)\n");
+ return 0;
+ }
+
+ if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
+ dout("client has been forcibly unmounted\n");
+ return 0;
+ }
+
return 1;
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index a40eb14c282a..14f951cd5b61 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -429,8 +429,6 @@ struct ceph_inode_info {
#ifdef CONFIG_CEPH_FSCACHE
struct fscache_cookie *fscache;
#endif
- errseq_t i_meta_err;
-
struct inode vfs_inode; /* at end */
};
@@ -774,7 +772,6 @@ struct ceph_file_info {
spinlock_t rw_contexts_lock;
struct list_head rw_contexts;
- errseq_t meta_err;
u32 filp_gen;
atomic_t num_locks;
};
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 13f3182cf796..1b855fcb179e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3184,7 +3184,7 @@ restart_loop:
mutex_unlock(&ctx->aio_mutex);
if (ctx->iocb && ctx->iocb->ki_complete)
- ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0);
+ ctx->iocb->ki_complete(ctx->iocb, ctx->rc);
else
complete(&ctx->done);
}
@@ -3917,7 +3917,7 @@ again:
mutex_unlock(&ctx->aio_mutex);
if (ctx->iocb && ctx->iocb->ki_complete)
- ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0);
+ ctx->iocb->ki_complete(ctx->iocb, ctx->rc);
else
complete(&ctx->done);
}
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 2be65269a987..666aa380011e 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -209,7 +209,7 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
return read_buffers[i] + blk_offset;
}
- devsize = mapping->host->i_size >> PAGE_SHIFT;
+ devsize = bdev_nr_bytes(sb->s_bdev) >> PAGE_SHIFT;
/* Ok, read in BLKS_PER_BUF pages completely first. */
for (i = 0; i < BLKS_PER_BUF; i++) {
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 68a2de6b5a9b..bfc2a5b74ed3 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -1,23 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This contains encryption functions for per-file encryption.
+ * Utility functions for file contents encryption/decryption on
+ * block device-based filesystems.
*
* Copyright (C) 2015, Google, Inc.
* Copyright (C) 2015, Motorola Mobility
- *
- * Written by Michael Halcrow, 2014.
- *
- * Filename encryption additions
- * Uday Savagaonkar, 2014
- * Encryption policy handling additions
- * Ildar Muslukhov, 2014
- * Add fscrypt_pullback_bio_page()
- * Jaegeuk Kim, 2015.
- *
- * This has not yet undergone a rigorous security audit.
- *
- * The usage of AES-XTS should conform to recommendations in NIST
- * Special Publication 800-38E and IEEE P1619/D16.
*/
#include <linux/pagemap.h>
@@ -26,6 +13,21 @@
#include <linux/namei.h>
#include "fscrypt_private.h"
+/**
+ * fscrypt_decrypt_bio() - decrypt the contents of a bio
+ * @bio: the bio to decrypt
+ *
+ * Decrypt the contents of a "read" bio following successful completion of the
+ * underlying disk read. The bio must be reading a whole number of blocks of an
+ * encrypted file directly into the page cache. If the bio is reading the
+ * ciphertext into bounce pages instead of the page cache (for example, because
+ * the file is also compressed, so decompression is required after decryption),
+ * then this function isn't applicable. This function may sleep, so it must be
+ * called from a workqueue rather than from the bio's bi_end_io callback.
+ *
+ * This function sets PG_error on any pages that contain any blocks that failed
+ * to be decrypted. The filesystem must not mark such pages uptodate.
+ */
void fscrypt_decrypt_bio(struct bio *bio)
{
struct bio_vec *bv;
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index eb538c28df94..a9be4bc74a94 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -429,8 +429,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
if (fscrypt_has_encryption_key(dir)) {
if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy,
- iname->len,
- dir->i_sb->s_cop->max_namelen,
+ iname->len, NAME_MAX,
&fname->crypto_buf.len))
return -ENAMETOOLONG;
fname->crypto_buf.name = kmalloc(fname->crypto_buf.len,
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 3fa965eb3336..5b0a9e6478b5 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -20,6 +20,11 @@
#define FSCRYPT_FILE_NONCE_SIZE 16
+/*
+ * Minimum size of an fscrypt master key. Note: a longer key will be required
+ * if ciphers with a 256-bit security strength are used. This is just the
+ * absolute minimum, which applies when only 128-bit encryption is used.
+ */
#define FSCRYPT_MIN_KEY_SIZE 16
#define FSCRYPT_CONTEXT_V1 1
@@ -413,7 +418,11 @@ struct fscrypt_master_key_secret {
*/
struct fscrypt_hkdf hkdf;
- /* Size of the raw key in bytes. Set even if ->raw isn't set. */
+ /*
+ * Size of the raw key in bytes. This remains set even if ->raw was
+ * zeroized due to no longer being needed. I.e. we still remember the
+ * size of the key even if we don't need to remember the key itself.
+ */
u32 size;
/* For v1 policy keys: the raw key. Wiped for v2 policy keys. */
@@ -549,8 +558,9 @@ int __init fscrypt_init_keyring(void);
struct fscrypt_mode {
const char *friendly_name;
const char *cipher_str;
- int keysize;
- int ivsize;
+ int keysize; /* key size in bytes */
+ int security_strength; /* security strength in bytes */
+ int ivsize; /* IV size in bytes */
int logged_impl_name;
enum blk_crypto_mode_num blk_crypto_mode;
};
diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c
index e0ec21055505..7607d18b35fc 100644
--- a/fs/crypto/hkdf.c
+++ b/fs/crypto/hkdf.c
@@ -16,9 +16,14 @@
/*
* HKDF supports any unkeyed cryptographic hash algorithm, but fscrypt uses
- * SHA-512 because it is reasonably secure and efficient; and since it produces
- * a 64-byte digest, deriving an AES-256-XTS key preserves all 64 bytes of
- * entropy from the master key and requires only one iteration of HKDF-Expand.
+ * SHA-512 because it is well-established, secure, and reasonably efficient.
+ *
+ * HKDF-SHA256 was also considered, as its 256-bit security strength would be
+ * sufficient here. A 512-bit security strength is "nice to have", though.
+ * Also, on 64-bit CPUs, SHA-512 is usually just as fast as SHA-256. In the
+ * common case of deriving an AES-256-XTS key (512 bits), that can result in
+ * HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of
+ * SHA-512 causes HKDF-Expand to only need to do one iteration rather than two.
*/
#define HKDF_HMAC_ALG "hmac(sha512)"
#define HKDF_HASHLEN SHA512_DIGEST_SIZE
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index bca9c6658a7c..eede186b04ce 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -19,6 +19,7 @@ struct fscrypt_mode fscrypt_modes[] = {
.friendly_name = "AES-256-XTS",
.cipher_str = "xts(aes)",
.keysize = 64,
+ .security_strength = 32,
.ivsize = 16,
.blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS,
},
@@ -26,12 +27,14 @@ struct fscrypt_mode fscrypt_modes[] = {
.friendly_name = "AES-256-CTS-CBC",
.cipher_str = "cts(cbc(aes))",
.keysize = 32,
+ .security_strength = 32,
.ivsize = 16,
},
[FSCRYPT_MODE_AES_128_CBC] = {
.friendly_name = "AES-128-CBC-ESSIV",
.cipher_str = "essiv(cbc(aes),sha256)",
.keysize = 16,
+ .security_strength = 16,
.ivsize = 16,
.blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV,
},
@@ -39,12 +42,14 @@ struct fscrypt_mode fscrypt_modes[] = {
.friendly_name = "AES-128-CTS-CBC",
.cipher_str = "cts(cbc(aes))",
.keysize = 16,
+ .security_strength = 16,
.ivsize = 16,
},
[FSCRYPT_MODE_ADIANTUM] = {
.friendly_name = "Adiantum",
.cipher_str = "adiantum(xchacha12,aes)",
.keysize = 32,
+ .security_strength = 32,
.ivsize = 32,
.blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM,
},
@@ -117,8 +122,9 @@ err_free_tfm:
/*
* Prepare the crypto transform object or blk-crypto key in @prep_key, given the
- * raw key, encryption mode, and flag indicating which encryption implementation
- * (fs-layer or blk-crypto) will be used.
+ * raw key, encryption mode (@ci->ci_mode), flag indicating which encryption
+ * implementation (fs-layer or blk-crypto) will be used (@ci->ci_inlinecrypt),
+ * and IV generation method (@ci->ci_policy.flags).
*/
int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key, const struct fscrypt_info *ci)
@@ -358,6 +364,45 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
}
/*
+ * Check whether the size of the given master key (@mk) is appropriate for the
+ * encryption settings which a particular file will use (@ci).
+ *
+ * If the file uses a v1 encryption policy, then the master key must be at least
+ * as long as the derived key, as this is a requirement of the v1 KDF.
+ *
+ * Otherwise, the KDF can accept any size key, so we enforce a slightly looser
+ * requirement: we require that the size of the master key be at least the
+ * maximum security strength of any algorithm whose key will be derived from it
+ * (but in practice we only need to consider @ci->ci_mode, since any other
+ * possible subkeys such as DIRHASH and INODE_HASH will never increase the
+ * required key size over @ci->ci_mode). This allows AES-256-XTS keys to be
+ * derived from a 256-bit master key, which is cryptographically sufficient,
+ * rather than requiring a 512-bit master key which is unnecessarily long. (We
+ * still allow 512-bit master keys if the user chooses to use them, though.)
+ */
+static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
+ const struct fscrypt_info *ci)
+{
+ unsigned int min_keysize;
+
+ if (ci->ci_policy.version == FSCRYPT_POLICY_V1)
+ min_keysize = ci->ci_mode->keysize;
+ else
+ min_keysize = ci->ci_mode->security_strength;
+
+ if (mk->mk_secret.size < min_keysize) {
+ fscrypt_warn(NULL,
+ "key with %s %*phN is too short (got %u bytes, need %u+ bytes)",
+ master_key_spec_type(&mk->mk_spec),
+ master_key_spec_len(&mk->mk_spec),
+ (u8 *)&mk->mk_spec.u,
+ mk->mk_secret.size, min_keysize);
+ return false;
+ }
+ return true;
+}
+
+/*
* Find the master key, then set up the inode's actual encryption key.
*
* If the master key is found in the filesystem-level keyring, then the
@@ -422,18 +467,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
goto out_release_key;
}
- /*
- * Require that the master key be at least as long as the derived key.
- * Otherwise, the derived key cannot possibly contain as much entropy as
- * that required by the encryption mode it will be used for. For v1
- * policies it's also required for the KDF to work at all.
- */
- if (mk->mk_secret.size < ci->ci_mode->keysize) {
- fscrypt_warn(NULL,
- "key with %s %*phN is too short (got %u bytes, need %u+ bytes)",
- master_key_spec_type(&mk_spec),
- master_key_spec_len(&mk_spec), (u8 *)&mk_spec.u,
- mk->mk_secret.size, ci->ci_mode->keysize);
+ if (!fscrypt_valid_master_key_size(mk, ci)) {
err = -ENOKEY;
goto out_release_key;
}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b2e86e739d7a..654443558047 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -119,7 +119,6 @@ struct dio {
int flags; /* doesn't change */
int op;
int op_flags;
- blk_qc_t bio_cookie;
struct gendisk *bio_disk;
struct inode *inode;
loff_t i_size; /* i_size when submitted */
@@ -308,7 +307,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
if (ret > 0 && dio->op == REQ_OP_WRITE)
ret = generic_write_sync(dio->iocb, ret);
- dio->iocb->ki_complete(dio->iocb, ret, 0);
+ dio->iocb->ki_complete(dio->iocb, ret);
}
kmem_cache_free(dio_cache, dio);
@@ -438,11 +437,10 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
dio->bio_disk = bio->bi_bdev->bd_disk;
- if (sdio->submit_io) {
+ if (sdio->submit_io)
sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio);
- dio->bio_cookie = BLK_QC_T_NONE;
- } else
- dio->bio_cookie = submit_bio(bio);
+ else
+ submit_bio(bio);
sdio->bio = NULL;
sdio->boundary = 0;
@@ -481,9 +479,7 @@ static struct bio *dio_await_one(struct dio *dio)
__set_current_state(TASK_UNINTERRUPTIBLE);
dio->waiter = current;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true))
- blk_io_schedule();
+ blk_io_schedule();
/* wake up sets us TASK_RUNNING */
spin_lock_irqsave(&dio->bio_lock, flags);
dio->waiter = NULL;
@@ -1214,8 +1210,6 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
} else {
dio->op = REQ_OP_READ;
}
- if (iocb->ki_flags & IOCB_HIPRI)
- dio->op_flags |= REQ_HIPRI;
/*
* For AIO O_(D)SYNC writes we need to defer completions to a workqueue
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ac0e11bbb445..9c5559faacda 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -915,7 +915,7 @@ const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
.read_iter = ext4_file_read_iter,
.write_iter = ext4_file_write_iter,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 88d5d274a868..79b6a0c47f6f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1572,7 +1572,6 @@ static const struct fscrypt_operations ext4_cryptops = {
.set_context = ext4_set_context,
.get_dummy_policy = ext4_get_dummy_policy,
.empty_dir = ext4_empty_dir,
- .max_namelen = EXT4_NAME_LEN,
.has_stable_inodes = ext4_has_stable_inodes,
.get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits,
};
@@ -4474,7 +4473,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto cantfind_ext4;
/* check blocks count against device size */
- blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+ blocks_count = sb_bdev_nr_blocks(sb);
if (blocks_count && ext4_blocks_count(es) > blocks_count) {
ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
"exceeds size of device (%llu blocks)",
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index c1bf9ad4c220..20a083dc9042 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -7,6 +7,7 @@
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
+#include <linux/moduleparam.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/lzo.h>
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 78ebc306ee2b..cf049a042482 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -2976,7 +2976,6 @@ static const struct fscrypt_operations f2fs_cryptops = {
.set_context = f2fs_set_context,
.get_dummy_policy = f2fs_get_dummy_policy,
.empty_dir = f2fs_empty_dir,
- .max_namelen = F2FS_NAME_LEN,
.has_stable_inodes = f2fs_has_stable_inodes,
.get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits,
.get_num_devices = f2fs_get_num_devices,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index de0c9b013a85..a6f1c6d426d1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1536,14 +1536,11 @@ static int fat_read_static_bpb(struct super_block *sb,
struct fat_bios_param_block *bpb)
{
static const char *notdos1x = "This doesn't look like a DOS 1.x volume";
-
+ sector_t bd_sects = bdev_nr_sectors(sb->s_bdev);
struct fat_floppy_defaults *fdefaults = NULL;
int error = -EINVAL;
- sector_t bd_sects;
unsigned i;
- bd_sects = i_size_read(sb->s_bdev->bd_inode) / SECTOR_SIZE;
-
/* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */
if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) {
if (!silent)
@@ -1943,10 +1940,8 @@ int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2)
ret = writeback_inode(i1);
if (!ret && i2)
ret = writeback_inode(i2);
- if (!ret) {
- struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
- ret = filemap_flush(mapping);
- }
+ if (!ret)
+ ret = sync_blockdev_nowait(sb->s_bdev);
return ret;
}
EXPORT_SYMBOL_GPL(fat_flush_inodes);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 81ec192ce067..4124a89a1a5d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1893,7 +1893,8 @@ static long writeback_sb_inodes(struct super_block *sb,
* unplug, so get our IOs out the door before we
* give up the CPU.
*/
- blk_flush_plug(current);
+ if (current->plug)
+ blk_flush_plug(current->plug, false);
cond_resched();
}
@@ -2291,7 +2292,7 @@ void wakeup_flusher_threads(enum wb_reason reason)
* If we are expecting writeback progress we must submit plugged IO.
*/
if (blk_needs_flush_plug(current))
- blk_schedule_flush_plug(current);
+ blk_flush_plug(current->plug, true);
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index f346a78f4bd6..6a675652129b 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -77,7 +77,6 @@ static WORK_STATE(INIT_OBJECT, "INIT", fscache_initialise_object);
static WORK_STATE(PARENT_READY, "PRDY", fscache_parent_ready);
static WORK_STATE(ABORT_INIT, "ABRT", fscache_abort_initialisation);
static WORK_STATE(LOOK_UP_OBJECT, "LOOK", fscache_look_up_object);
-static WORK_STATE(CREATE_OBJECT, "CRTO", fscache_look_up_object);
static WORK_STATE(OBJECT_AVAILABLE, "AVBL", fscache_object_available);
static WORK_STATE(JUMPSTART_DEPS, "JUMP", fscache_jumpstart_dependents);
@@ -907,6 +906,7 @@ static void fscache_dequeue_object(struct fscache_object *object)
* @object: The object to ask about
* @data: The auxiliary data for the object
* @datalen: The size of the auxiliary data
+ * @object_size: The size of the object according to the server.
*
* This function consults the netfs about the coherency state of an object.
* The caller must be holding a ref on cookie->n_active (held by
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 433877107700..e002cdfaf3cc 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -22,7 +22,10 @@ static void fscache_operation_dummy_cancel(struct fscache_operation *op)
/**
* fscache_operation_init - Do basic initialisation of an operation
+ * @cookie: The cookie to operate on
* @op: The operation to initialise
+ * @processor: The function to perform the operation
+ * @cancel: A function to handle operation cancellation
* @release: The release function to assign
*
* Do basic initialisation of an operation. The caller must still set flags,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 11404f8c21c7..e6039f22311b 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -687,7 +687,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
spin_unlock(&fi->lock);
}
- io->iocb->ki_complete(io->iocb, res, 0);
+ io->iocb->ki_complete(io->iocb, res);
}
kref_put(&io->refcnt, fuse_io_release);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 319596df5dc6..f55f9f94b1a4 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1121,6 +1121,9 @@ int fuse_init_fs_context_submount(struct fs_context *fsc);
*/
void fuse_conn_destroy(struct fuse_mount *fm);
+/* Drop the connection and free the fuse mount */
+void fuse_mount_destroy(struct fuse_mount *fm);
+
/**
* Add connection to control filesystem
*/
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 36cd03114b6d..12d49a1914e8 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -457,14 +457,6 @@ static void fuse_send_destroy(struct fuse_mount *fm)
}
}
-static void fuse_put_super(struct super_block *sb)
-{
- struct fuse_mount *fm = get_fuse_mount_super(sb);
-
- fuse_conn_put(fm->fc);
- kfree(fm);
-}
-
static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
{
stbuf->f_type = FUSE_SUPER_MAGIC;
@@ -1003,7 +995,6 @@ static const struct super_operations fuse_super_operations = {
.evict_inode = fuse_evict_inode,
.write_inode = fuse_write_inode,
.drop_inode = generic_delete_inode,
- .put_super = fuse_put_super,
.umount_begin = fuse_umount_begin,
.statfs = fuse_statfs,
.sync_fs = fuse_sync_fs,
@@ -1424,20 +1415,17 @@ static int fuse_get_tree_submount(struct fs_context *fsc)
if (!fm)
return -ENOMEM;
+ fm->fc = fuse_conn_get(fc);
fsc->s_fs_info = fm;
sb = sget_fc(fsc, NULL, set_anon_super_fc);
- if (IS_ERR(sb)) {
- kfree(fm);
+ if (fsc->s_fs_info)
+ fuse_mount_destroy(fm);
+ if (IS_ERR(sb))
return PTR_ERR(sb);
- }
- fm->fc = fuse_conn_get(fc);
/* Initialize superblock, making @mp_fi its root */
err = fuse_fill_super_submount(sb, mp_fi);
if (err) {
- fuse_conn_put(fc);
- kfree(fm);
- sb->s_fs_info = NULL;
deactivate_locked_super(sb);
return err;
}
@@ -1569,8 +1557,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
{
struct fuse_fs_context *ctx = fsc->fs_private;
int err;
- struct fuse_conn *fc;
- struct fuse_mount *fm;
if (!ctx->file || !ctx->rootmode_present ||
!ctx->user_id_present || !ctx->group_id_present)
@@ -1580,42 +1566,18 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
* Require mount to happen from the same user namespace which
* opened /dev/fuse to prevent potential attacks.
*/
- err = -EINVAL;
if ((ctx->file->f_op != &fuse_dev_operations) ||
(ctx->file->f_cred->user_ns != sb->s_user_ns))
- goto err;
+ return -EINVAL;
ctx->fudptr = &ctx->file->private_data;
- fc = kmalloc(sizeof(*fc), GFP_KERNEL);
- err = -ENOMEM;
- if (!fc)
- goto err;
-
- fm = kzalloc(sizeof(*fm), GFP_KERNEL);
- if (!fm) {
- kfree(fc);
- goto err;
- }
-
- fuse_conn_init(fc, fm, sb->s_user_ns, &fuse_dev_fiq_ops, NULL);
- fc->release = fuse_free_conn;
-
- sb->s_fs_info = fm;
-
err = fuse_fill_super_common(sb, ctx);
if (err)
- goto err_put_conn;
+ return err;
/* file->private_data shall be visible on all CPUs after this */
smp_mb();
fuse_send_init(get_fuse_mount_super(sb));
return 0;
-
- err_put_conn:
- fuse_conn_put(fc);
- kfree(fm);
- sb->s_fs_info = NULL;
- err:
- return err;
}
/*
@@ -1637,22 +1599,40 @@ static int fuse_get_tree(struct fs_context *fsc)
{
struct fuse_fs_context *ctx = fsc->fs_private;
struct fuse_dev *fud;
+ struct fuse_conn *fc;
+ struct fuse_mount *fm;
struct super_block *sb;
int err;
+ fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+ if (!fc)
+ return -ENOMEM;
+
+ fm = kzalloc(sizeof(*fm), GFP_KERNEL);
+ if (!fm) {
+ kfree(fc);
+ return -ENOMEM;
+ }
+
+ fuse_conn_init(fc, fm, fsc->user_ns, &fuse_dev_fiq_ops, NULL);
+ fc->release = fuse_free_conn;
+
+ fsc->s_fs_info = fm;
+
if (ctx->fd_present)
ctx->file = fget(ctx->fd);
if (IS_ENABLED(CONFIG_BLOCK) && ctx->is_bdev) {
err = get_tree_bdev(fsc, fuse_fill_super);
- goto out_fput;
+ goto out;
}
/*
* While block dev mount can be initialized with a dummy device fd
* (found by device name), normal fuse mounts can't
*/
+ err = -EINVAL;
if (!ctx->file)
- return -EINVAL;
+ goto out;
/*
* Allow creating a fuse mount with an already initialized fuse
@@ -1668,7 +1648,9 @@ static int fuse_get_tree(struct fs_context *fsc)
} else {
err = get_tree_nodev(fsc, fuse_fill_super);
}
-out_fput:
+out:
+ if (fsc->s_fs_info)
+ fuse_mount_destroy(fm);
if (ctx->file)
fput(ctx->file);
return err;
@@ -1747,17 +1729,25 @@ static void fuse_sb_destroy(struct super_block *sb)
struct fuse_mount *fm = get_fuse_mount_super(sb);
bool last;
- if (fm) {
+ if (sb->s_root) {
last = fuse_mount_remove(fm);
if (last)
fuse_conn_destroy(fm);
}
}
+void fuse_mount_destroy(struct fuse_mount *fm)
+{
+ fuse_conn_put(fm->fc);
+ kfree(fm);
+}
+EXPORT_SYMBOL(fuse_mount_destroy);
+
static void fuse_kill_sb_anon(struct super_block *sb)
{
fuse_sb_destroy(sb);
kill_anon_super(sb);
+ fuse_mount_destroy(get_fuse_mount_super(sb));
}
static struct file_system_type fuse_fs_type = {
@@ -1775,6 +1765,7 @@ static void fuse_kill_sb_blk(struct super_block *sb)
{
fuse_sb_destroy(sb);
kill_block_super(sb);
+ fuse_mount_destroy(get_fuse_mount_super(sb));
}
static struct file_system_type fuseblk_fs_type = {
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 0ad89c6629d7..94fc874f5de7 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1394,12 +1394,13 @@ static void virtio_kill_sb(struct super_block *sb)
bool last;
/* If mount failed, we can still be called without any fc */
- if (fm) {
+ if (sb->s_root) {
last = fuse_mount_remove(fm);
if (last)
virtio_fs_conn_destroy(fm);
}
kill_anon_super(sb);
+ fuse_mount_destroy(fm);
}
static int virtio_fs_test_super(struct super_block *sb,
@@ -1455,19 +1456,14 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
fsc->s_fs_info = fm;
sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc);
- if (fsc->s_fs_info) {
- fuse_conn_put(fc);
- kfree(fm);
- }
+ if (fsc->s_fs_info)
+ fuse_mount_destroy(fm);
if (IS_ERR(sb))
return PTR_ERR(sb);
if (!sb->s_root) {
err = virtio_fs_fill_super(sb, fsc);
if (err) {
- fuse_conn_put(fc);
- kfree(fm);
- sb->s_fs_info = NULL;
deactivate_locked_super(sb);
return err;
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index c559827cb6f9..5436a688157a 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1338,8 +1338,6 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
{
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- if (fl->fl_type & LOCK_MAND)
- return -EOPNOTSUPP;
if (fl->fl_type == F_UNLCK) {
do_unflock(file, fl);
@@ -1353,7 +1351,7 @@ const struct file_operations gfs2_file_fops = {
.llseek = gfs2_llseek,
.read_iter = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.compat_ioctl = gfs2_compat_ioctl,
.mmap = gfs2_mmap,
@@ -1386,7 +1384,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.llseek = gfs2_llseek,
.read_iter = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.compat_ioctl = gfs2_compat_ioctl,
.mmap = gfs2_mmap,
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index cdf0edeeb278..5beb82652435 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -36,7 +36,7 @@ static int hfs_get_last_session(struct super_block *sb,
/* default values */
*start = 0;
- *size = i_size_read(sb->s_bdev->bd_inode) >> 9;
+ *size = bdev_nr_sectors(sb->s_bdev);
if (HFS_SB(sb)->session >= 0) {
struct cdrom_tocentry te;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 0350dc7821bf..51ae6f1eb4a5 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -131,7 +131,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
/* default values */
*start = 0;
- *size = i_size_read(sb->s_bdev->bd_inode) >> 9;
+ *size = bdev_nr_sectors(sb->s_bdev);
if (HFSPLUS_SB(sb)->session >= 0) {
struct cdrom_tocentry te;
diff --git a/fs/internal.h b/fs/internal.h
index 3cd065c8a66b..cdd83d4899bb 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -23,22 +23,11 @@ struct pipe_inode_info;
#ifdef CONFIG_BLOCK
extern void __init bdev_cache_init(void);
-extern int __sync_blockdev(struct block_device *bdev, int wait);
-void iterate_bdevs(void (*)(struct block_device *, void *), void *);
void emergency_thaw_bdev(struct super_block *sb);
#else
static inline void bdev_cache_init(void)
{
}
-
-static inline int __sync_blockdev(struct block_device *bdev, int wait)
-{
- return 0;
-}
-static inline void iterate_bdevs(void (*f)(struct block_device *, void *),
- void *arg)
-{
-}
static inline int emergency_thaw_bdev(struct super_block *sb)
{
return 0;
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 5bf8aa81715e..38b33ad9e8cf 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -140,6 +140,7 @@ static void io_wqe_dec_running(struct io_worker *worker);
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
struct io_wqe_acct *acct,
struct io_cb_cancel_data *match);
+static void create_worker_cb(struct callback_head *cb);
static bool io_worker_get(struct io_worker *worker)
{
@@ -174,12 +175,46 @@ static void io_worker_ref_put(struct io_wq *wq)
complete(&wq->worker_done);
}
+static void io_worker_cancel_cb(struct io_worker *worker)
+{
+ struct io_wqe_acct *acct = io_wqe_get_acct(worker);
+ struct io_wqe *wqe = worker->wqe;
+ struct io_wq *wq = wqe->wq;
+
+ atomic_dec(&acct->nr_running);
+ raw_spin_lock(&worker->wqe->lock);
+ acct->nr_workers--;
+ raw_spin_unlock(&worker->wqe->lock);
+ io_worker_ref_put(wq);
+ clear_bit_unlock(0, &worker->create_state);
+ io_worker_release(worker);
+}
+
+static bool io_task_worker_match(struct callback_head *cb, void *data)
+{
+ struct io_worker *worker;
+
+ if (cb->func != create_worker_cb)
+ return false;
+ worker = container_of(cb, struct io_worker, create_work);
+ return worker == data;
+}
+
static void io_worker_exit(struct io_worker *worker)
{
struct io_wqe *wqe = worker->wqe;
+ struct io_wq *wq = wqe->wq;
- if (refcount_dec_and_test(&worker->ref))
- complete(&worker->ref_done);
+ while (1) {
+ struct callback_head *cb = task_work_cancel_match(wq->task,
+ io_task_worker_match, worker);
+
+ if (!cb)
+ break;
+ io_worker_cancel_cb(worker);
+ }
+
+ io_worker_release(worker);
wait_for_completion(&worker->ref_done);
raw_spin_lock(&wqe->lock);
@@ -253,7 +288,7 @@ static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
pr_warn_once("io-wq is not configured for unbound workers");
raw_spin_lock(&wqe->lock);
- if (acct->nr_workers == acct->max_workers) {
+ if (acct->nr_workers >= acct->max_workers) {
raw_spin_unlock(&wqe->lock);
return true;
}
@@ -323,8 +358,10 @@ static bool io_queue_worker_create(struct io_worker *worker,
init_task_work(&worker->create_work, func);
worker->create_index = acct->index;
- if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+ if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) {
+ clear_bit_unlock(0, &worker->create_state);
return true;
+ }
clear_bit_unlock(0, &worker->create_state);
fail_release:
io_worker_release(worker);
@@ -716,11 +753,8 @@ static void io_workqueue_create(struct work_struct *work)
struct io_worker *worker = container_of(work, struct io_worker, work);
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- if (!io_queue_worker_create(worker, acct, create_worker_cont)) {
- clear_bit_unlock(0, &worker->create_state);
- io_worker_release(worker);
+ if (!io_queue_worker_create(worker, acct, create_worker_cont))
kfree(worker);
- }
}
static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
@@ -1150,17 +1184,9 @@ static void io_wq_exit_workers(struct io_wq *wq)
while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
struct io_worker *worker;
- struct io_wqe_acct *acct;
worker = container_of(cb, struct io_worker, create_work);
- acct = io_wqe_get_acct(worker);
- atomic_dec(&acct->nr_running);
- raw_spin_lock(&worker->wqe->lock);
- acct->nr_workers--;
- raw_spin_unlock(&worker->wqe->lock);
- io_worker_ref_put(wq);
- clear_bit_unlock(0, &worker->create_state);
- io_worker_release(worker);
+ io_worker_cancel_cb(worker);
}
rcu_read_lock();
@@ -1291,15 +1317,18 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count)
rcu_read_lock();
for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
struct io_wqe_acct *acct;
+ raw_spin_lock(&wqe->lock);
for (i = 0; i < IO_WQ_ACCT_NR; i++) {
- acct = &wq->wqes[node]->acct[i];
+ acct = &wqe->acct[i];
prev = max_t(int, acct->max_workers, prev);
if (new_count[i])
acct->max_workers = new_count[i];
new_count[i] = prev;
}
+ raw_spin_unlock(&wqe->lock);
}
rcu_read_unlock();
return 0;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index bf5c4c533760..41bf37674a49 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -29,6 +29,17 @@ struct io_wq_work_list {
struct io_wq_work_node *last;
};
+#define wq_list_for_each(pos, prv, head) \
+ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
+
+#define wq_list_for_each_resume(pos, prv) \
+ for (; pos; prv = pos, pos = (pos)->next)
+
+#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
+#define INIT_WQ_LIST(list) do { \
+ (list)->first = NULL; \
+} while (0)
+
static inline void wq_list_add_after(struct io_wq_work_node *node,
struct io_wq_work_node *pos,
struct io_wq_work_list *list)
@@ -54,6 +65,15 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
}
}
+static inline void wq_list_add_head(struct io_wq_work_node *node,
+ struct io_wq_work_list *list)
+{
+ node->next = list->first;
+ if (!node->next)
+ list->last = node;
+ WRITE_ONCE(list->first, node);
+}
+
static inline void wq_list_cut(struct io_wq_work_list *list,
struct io_wq_work_node *last,
struct io_wq_work_node *prev)
@@ -69,6 +89,31 @@ static inline void wq_list_cut(struct io_wq_work_list *list,
last->next = NULL;
}
+static inline void __wq_list_splice(struct io_wq_work_list *list,
+ struct io_wq_work_node *to)
+{
+ list->last->next = to->next;
+ to->next = list->first;
+ INIT_WQ_LIST(list);
+}
+
+static inline bool wq_list_splice(struct io_wq_work_list *list,
+ struct io_wq_work_node *to)
+{
+ if (!wq_list_empty(list)) {
+ __wq_list_splice(list, to);
+ return true;
+ }
+ return false;
+}
+
+static inline void wq_stack_add_head(struct io_wq_work_node *node,
+ struct io_wq_work_node *stack)
+{
+ node->next = stack->next;
+ stack->next = node;
+}
+
static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work_node *node,
struct io_wq_work_node *prev)
@@ -76,14 +121,14 @@ static inline void wq_list_del(struct io_wq_work_list *list,
wq_list_cut(list, node, prev);
}
-#define wq_list_for_each(pos, prv, head) \
- for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
+static inline
+struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
+{
+ struct io_wq_work_node *node = stack->next;
-#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
-#define INIT_WQ_LIST(list) do { \
- (list)->first = NULL; \
- (list)->last = NULL; \
-} while (0)
+ stack->next = node->next;
+ return node;
+}
struct io_wq_work {
struct io_wq_work_node list;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6b9e70208782..3a4af9799d9a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -103,11 +103,14 @@
#define IORING_MAX_REG_BUFFERS (1U << 14)
-#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
- IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
- IOSQE_BUFFER_SELECT)
+#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
+ IOSQE_IO_HARDLINK | IOSQE_ASYNC)
+
+#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN)
+
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
- REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)
+ REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
+ REQ_F_ASYNC_DATA)
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
@@ -195,8 +198,10 @@ struct io_rings {
};
enum io_uring_cmd_flags {
- IO_URING_F_NONBLOCK = 1,
- IO_URING_F_COMPLETE_DEFER = 2,
+ IO_URING_F_COMPLETE_DEFER = 1,
+ IO_URING_F_UNLOCKED = 2,
+ /* int's last bit, sign checks are usually faster than a bit test */
+ IO_URING_F_NONBLOCK = INT_MIN,
};
struct io_mapped_ubuf {
@@ -305,26 +310,16 @@ struct io_submit_link {
};
struct io_submit_state {
- struct blk_plug plug;
+ /* inline/task_work completion list, under ->uring_lock */
+ struct io_wq_work_node free_list;
+ /* batch completion logic */
+ struct io_wq_work_list compl_reqs;
struct io_submit_link link;
- /*
- * io_kiocb alloc cache
- */
- void *reqs[IO_REQ_CACHE_SIZE];
- unsigned int free_reqs;
-
bool plug_started;
-
- /*
- * Batch completion logic
- */
- struct io_kiocb *compl_reqs[IO_COMPL_BATCH];
- unsigned int compl_nr;
- /* inline/task_work completion list, under ->uring_lock */
- struct list_head free_list;
-
- unsigned int ios_left;
+ bool need_plug;
+ unsigned short submit_nr;
+ struct blk_plug plug;
};
struct io_ring_ctx {
@@ -368,6 +363,7 @@ struct io_ring_ctx {
* uring_lock, and updated through io_uring_register(2)
*/
struct io_rsrc_node *rsrc_node;
+ int rsrc_cached_refs;
struct io_file_table file_table;
unsigned nr_user_files;
unsigned nr_user_bufs;
@@ -384,7 +380,7 @@ struct io_ring_ctx {
} ____cacheline_aligned_in_smp;
/* IRQ completion list, under ->completion_lock */
- struct list_head locked_free_list;
+ struct io_wq_work_list locked_free_list;
unsigned int locked_free_nr;
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
@@ -399,7 +395,6 @@ struct io_ring_ctx {
unsigned cached_cq_tail;
unsigned cq_entries;
struct eventfd_ctx *cq_ev_fd;
- struct wait_queue_head poll_wait;
struct wait_queue_head cq_wait;
unsigned cq_extra;
atomic_t cq_timeouts;
@@ -417,7 +412,7 @@ struct io_ring_ctx {
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
- struct list_head iopoll_list;
+ struct io_wq_work_list iopoll_list;
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
bool poll_multi_queue;
@@ -456,6 +451,8 @@ struct io_ring_ctx {
struct work_struct exit_work;
struct list_head tctx_list;
struct completion ref_comp;
+ u32 iowq_limits[2];
+ bool iowq_limits_set;
};
};
@@ -578,7 +575,6 @@ struct io_sr_msg {
int msg_flags;
int bgid;
size_t len;
- struct io_buffer *kbuf;
};
struct io_open {
@@ -690,11 +686,6 @@ struct io_hardlink {
int flags;
};
-struct io_completion {
- struct file *file;
- u32 cflags;
-};
-
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -708,11 +699,15 @@ struct io_async_msghdr {
struct sockaddr_storage addr;
};
-struct io_async_rw {
- struct iovec fast_iov[UIO_FASTIOV];
- const struct iovec *free_iovec;
+struct io_rw_state {
struct iov_iter iter;
struct iov_iter_state iter_state;
+ struct iovec fast_iov[UIO_FASTIOV];
+};
+
+struct io_async_rw {
+ struct io_rw_state s;
+ const struct iovec *free_iovec;
size_t bytes_done;
struct wait_page_queue wpq;
};
@@ -739,9 +734,9 @@ enum {
REQ_F_CREDS_BIT,
REQ_F_REFCOUNT_BIT,
REQ_F_ARM_LTIMEOUT_BIT,
+ REQ_F_ASYNC_DATA_BIT,
/* keep async read/write and isreg together and in order */
- REQ_F_NOWAIT_READ_BIT,
- REQ_F_NOWAIT_WRITE_BIT,
+ REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
/* not a real bit, just to check we're not overflowing the space */
@@ -782,10 +777,8 @@ enum {
REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
/* caller should reissue async */
REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
- /* supports async reads */
- REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT),
- /* supports async writes */
- REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT),
+ /* supports async reads/writes */
+ REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
/* has creds assigned */
@@ -794,6 +787,8 @@ enum {
REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
/* there is a linked timeout that has to be armed */
REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
+ /* ->async_data allocated */
+ REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
};
struct async_poll {
@@ -850,39 +845,41 @@ struct io_kiocb {
struct io_mkdir mkdir;
struct io_symlink symlink;
struct io_hardlink hardlink;
- /* use only after cleaning per-op data, see io_clean_op() */
- struct io_completion compl;
};
- /* opcode allocated if it needs to store data for async defer */
- void *async_data;
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
-
u16 buf_index;
+ unsigned int flags;
+
+ u64 user_data;
u32 result;
+ u32 cflags;
struct io_ring_ctx *ctx;
- unsigned int flags;
- atomic_t refs;
struct task_struct *task;
- u64 user_data;
- struct io_kiocb *link;
struct percpu_ref *fixed_rsrc_refs;
+ /* store used ubuf, so we can prevent reloading */
+ struct io_mapped_ubuf *imu;
- /* used with ctx->iopoll_list with reads/writes */
- struct list_head inflight_entry;
+ /* used by request caches, completion batching and iopoll */
+ struct io_wq_work_node comp_list;
+ atomic_t refs;
+ struct io_kiocb *link;
struct io_task_work io_task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
+ /* internal polling, see IORING_FEAT_FAST_POLL */
struct async_poll *apoll;
+ /* opcode allocated if it needs to store data for async defer */
+ void *async_data;
struct io_wq_work work;
+ /* custom credentials, valid IFF REQ_F_CREDS is set */
const struct cred *creds;
-
- /* store used ubuf, so we can prevent reloading */
- struct io_mapped_ubuf *imu;
+ /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
+ struct io_buffer *kbuf;
};
struct io_tctx_node {
@@ -900,12 +897,12 @@ struct io_defer_entry {
struct io_op_def {
/* needs req->file assigned */
unsigned needs_file : 1;
+ /* should block plug */
+ unsigned plug : 1;
/* hash wq insertion if file is a regular file */
unsigned hash_reg_file : 1;
/* unbound wq insertion if file is a non-regular file */
unsigned unbound_nonreg_file : 1;
- /* opcode is not supported by this kernel */
- unsigned not_supported : 1;
/* set if opcode supports polled "wait" */
unsigned pollin : 1;
unsigned pollout : 1;
@@ -913,8 +910,8 @@ struct io_op_def {
unsigned buffer_select : 1;
/* do prep async if is going to be punted */
unsigned needs_async_setup : 1;
- /* should block plug */
- unsigned plug : 1;
+ /* opcode is not supported by this kernel */
+ unsigned not_supported : 1;
/* size of async data needed, if any */
unsigned short async_size;
};
@@ -1078,7 +1075,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags);
+ s32 res, u32 cflags);
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req);
static void io_dismantle_req(struct io_kiocb *req);
@@ -1093,7 +1090,7 @@ static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);
static void io_req_task_queue(struct io_kiocb *req);
-static void io_submit_flush_completions(struct io_ring_ctx *ctx);
+static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
static int io_req_prep_async(struct io_kiocb *req);
static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
@@ -1165,6 +1162,12 @@ static inline void req_ref_get(struct io_kiocb *req)
atomic_inc(&req->refs);
}
+static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
+{
+ if (!wq_list_empty(&ctx->submit_state.compl_reqs))
+ __io_submit_flush_completions(ctx);
+}
+
static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
{
if (!(req->flags & REQ_F_REFCOUNT)) {
@@ -1178,13 +1181,52 @@ static inline void io_req_set_refcount(struct io_kiocb *req)
__io_req_set_refcount(req, 1);
}
-static inline void io_req_set_rsrc_node(struct io_kiocb *req)
+#define IO_RSRC_REF_BATCH 100
+
+static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
+ struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
{
- struct io_ring_ctx *ctx = req->ctx;
+ struct percpu_ref *ref = req->fixed_rsrc_refs;
+
+ if (ref) {
+ if (ref == &ctx->rsrc_node->refs)
+ ctx->rsrc_cached_refs++;
+ else
+ percpu_ref_put(ref);
+ }
+}
+
+static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
+{
+ if (req->fixed_rsrc_refs)
+ percpu_ref_put(req->fixed_rsrc_refs);
+}
+
+static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
+{
+ if (ctx->rsrc_cached_refs) {
+ percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
+ ctx->rsrc_cached_refs = 0;
+ }
+}
+static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
+{
+ ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
+ percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
+}
+
+static inline void io_req_set_rsrc_node(struct io_kiocb *req,
+ struct io_ring_ctx *ctx)
+{
if (!req->fixed_rsrc_refs) {
req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
- percpu_ref_get(req->fixed_rsrc_refs);
+ ctx->rsrc_cached_refs--;
+ if (unlikely(ctx->rsrc_cached_refs < 0))
+ io_rsrc_refs_refill(ctx);
}
}
@@ -1217,6 +1259,11 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
return false;
}
+static inline bool req_has_async_data(struct io_kiocb *req)
+{
+ return req->flags & REQ_F_ASYNC_DATA;
+}
+
static inline void req_set_fail(struct io_kiocb *req)
{
req->flags |= REQ_F_FAIL;
@@ -1228,7 +1275,7 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res)
req->result = res;
}
-static void io_ring_ctx_ref_free(struct percpu_ref *ref)
+static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
@@ -1240,7 +1287,7 @@ static inline bool io_is_timeout_noseq(struct io_kiocb *req)
return !req->timeout.off;
}
-static void io_fallback_req_func(struct work_struct *work)
+static __cold void io_fallback_req_func(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
fallback_work.work);
@@ -1253,15 +1300,13 @@ static void io_fallback_req_func(struct work_struct *work)
req->io_task_work.func(req, &locked);
if (locked) {
- if (ctx->submit_state.compl_nr)
- io_submit_flush_completions(ctx);
+ io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
}
percpu_ref_put(&ctx->refs);
-
}
-static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
+static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
int hash_bits;
@@ -1298,7 +1343,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
ctx->flags = p->flags;
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
- init_waitqueue_head(&ctx->poll_wait);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->ref_comp);
xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
@@ -1307,7 +1351,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
init_waitqueue_head(&ctx->cq_wait);
spin_lock_init(&ctx->completion_lock);
spin_lock_init(&ctx->timeout_lock);
- INIT_LIST_HEAD(&ctx->iopoll_list);
+ INIT_WQ_LIST(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
INIT_LIST_HEAD(&ctx->ltimeout_list);
@@ -1316,9 +1360,10 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
init_llist_head(&ctx->rsrc_put_llist);
INIT_LIST_HEAD(&ctx->tctx_list);
- INIT_LIST_HEAD(&ctx->submit_state.free_list);
- INIT_LIST_HEAD(&ctx->locked_free_list);
+ ctx->submit_state.free_list.next = NULL;
+ INIT_WQ_LIST(&ctx->locked_free_list);
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
+ INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
return ctx;
err:
kfree(ctx->dummy_ubuf);
@@ -1346,21 +1391,16 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
return false;
}
-#define FFS_ASYNC_READ 0x1UL
-#define FFS_ASYNC_WRITE 0x2UL
-#ifdef CONFIG_64BIT
-#define FFS_ISREG 0x4UL
-#else
-#define FFS_ISREG 0x0UL
-#endif
-#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
+#define FFS_NOWAIT 0x1UL
+#define FFS_ISREG 0x2UL
+#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
static inline bool io_req_ffs_set(struct io_kiocb *req)
{
- return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
+ return req->flags & REQ_F_FIXED_FILE;
}
-static void io_req_track_inflight(struct io_kiocb *req)
+static inline void io_req_track_inflight(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_INFLIGHT)) {
req->flags |= REQ_F_INFLIGHT;
@@ -1368,11 +1408,6 @@ static void io_req_track_inflight(struct io_kiocb *req)
}
}
-static inline void io_unprep_linked_timeout(struct io_kiocb *req)
-{
- req->flags &= ~REQ_F_LINK_TIMEOUT;
-}
-
static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{
if (WARN_ON_ONCE(!req->link))
@@ -1443,15 +1478,19 @@ static void io_prep_async_link(struct io_kiocb *req)
}
}
-static void io_queue_async_work(struct io_kiocb *req, bool *locked)
+static inline void io_req_add_compl_list(struct io_kiocb *req)
+{
+ struct io_submit_state *state = &req->ctx->submit_state;
+
+ wq_list_add_tail(&req->comp_list, &state->compl_reqs);
+}
+
+static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link = io_prep_linked_timeout(req);
struct io_uring_task *tctx = req->task->io_uring;
- /* must not take the lock, NULL it as a precaution */
- locked = NULL;
-
BUG_ON(!tctx);
BUG_ON(!tctx->io_wq);
@@ -1492,7 +1531,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
}
}
-static void io_queue_deferred(struct io_ring_ctx *ctx)
+static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{
while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
@@ -1506,7 +1545,7 @@ static void io_queue_deferred(struct io_ring_ctx *ctx)
}
}
-static void io_flush_timeouts(struct io_ring_ctx *ctx)
+static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
__must_hold(&ctx->completion_lock)
{
u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
@@ -1539,7 +1578,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
spin_unlock_irq(&ctx->timeout_lock);
}
-static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
+static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
if (ctx->off_timeout_used)
io_flush_timeouts(ctx);
@@ -1609,12 +1648,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
*/
if (wq_has_sleeper(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
- if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
- wake_up(&ctx->sq_data->wait);
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
- if (waitqueue_active(&ctx->poll_wait))
- wake_up_interruptible(&ctx->poll_wait);
}
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
@@ -1628,8 +1663,6 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
}
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
- if (waitqueue_active(&ctx->poll_wait))
- wake_up_interruptible(&ctx->poll_wait);
}
/* Returns true if there are no backlogged entries after the flush */
@@ -1725,7 +1758,7 @@ static inline void io_get_task_refs(int nr)
}
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags)
+ s32 res, u32 cflags)
{
struct io_overflow_cqe *ocqe;
@@ -1753,7 +1786,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
}
static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags)
+ s32 res, u32 cflags)
{
struct io_uring_cqe *cqe;
@@ -1776,13 +1809,13 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data
/* not as hot to bloat with inlining */
static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags)
+ s32 res, u32 cflags)
{
return __io_cqring_fill_event(ctx, user_data, res, cflags);
}
-static void io_req_complete_post(struct io_kiocb *req, long res,
- unsigned int cflags)
+static void io_req_complete_post(struct io_kiocb *req, s32 res,
+ u32 cflags)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -1801,40 +1834,27 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
req->link = NULL;
}
}
+ io_req_put_rsrc(req, ctx);
io_dismantle_req(req);
io_put_task(req->task, 1);
- list_add(&req->inflight_entry, &ctx->locked_free_list);
+ wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
ctx->locked_free_nr++;
- } else {
- if (!percpu_ref_tryget(&ctx->refs))
- req = NULL;
}
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
-
- if (req) {
- io_cqring_ev_posted(ctx);
- percpu_ref_put(&ctx->refs);
- }
-}
-
-static inline bool io_req_needs_clean(struct io_kiocb *req)
-{
- return req->flags & IO_REQ_CLEAN_FLAGS;
+ io_cqring_ev_posted(ctx);
}
-static void io_req_complete_state(struct io_kiocb *req, long res,
- unsigned int cflags)
+static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
+ u32 cflags)
{
- if (io_req_needs_clean(req))
- io_clean_op(req);
req->result = res;
- req->compl.cflags = cflags;
+ req->cflags = cflags;
req->flags |= REQ_F_COMPLETE_INLINE;
}
static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
- long res, unsigned cflags)
+ s32 res, u32 cflags)
{
if (issue_flags & IO_URING_F_COMPLETE_DEFER)
io_req_complete_state(req, res, cflags);
@@ -1842,12 +1862,12 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
io_req_complete_post(req, res, cflags);
}
-static inline void io_req_complete(struct io_kiocb *req, long res)
+static inline void io_req_complete(struct io_kiocb *req, s32 res)
{
__io_req_complete(req, 0, res, 0);
}
-static void io_req_complete_failed(struct io_kiocb *req, long res)
+static void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
req_set_fail(req);
io_req_complete_post(req, res, 0);
@@ -1881,7 +1901,7 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
struct io_submit_state *state)
{
spin_lock(&ctx->completion_lock);
- list_splice_init(&ctx->locked_free_list, &state->free_list);
+ wq_list_splice(&ctx->locked_free_list, &state->free_list);
ctx->locked_free_nr = 0;
spin_unlock(&ctx->completion_lock);
}
@@ -1890,7 +1910,6 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
- int nr;
/*
* If we have more than a batch's worth of requests in our IRQ side
@@ -1899,20 +1918,7 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
*/
if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
io_flush_cached_locked_reqs(ctx, state);
-
- nr = state->free_reqs;
- while (!list_empty(&state->free_list)) {
- struct io_kiocb *req = list_first_entry(&state->free_list,
- struct io_kiocb, inflight_entry);
-
- list_del(&req->inflight_entry);
- state->reqs[nr++] = req;
- if (nr == ARRAY_SIZE(state->reqs))
- break;
- }
-
- state->free_reqs = nr;
- return nr != 0;
+ return !!state->free_list.next;
}
/*
@@ -1921,38 +1927,54 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
* Because of that, io_alloc_req() should be called only under ->uring_lock
* and with extra caution to not get a request that is still worked on.
*/
-static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
struct io_submit_state *state = &ctx->submit_state;
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
+ void *reqs[IO_REQ_ALLOC_BATCH];
+ struct io_kiocb *req;
int ret, i;
- BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);
-
- if (likely(state->free_reqs || io_flush_cached_reqs(ctx)))
- goto got_req;
+ if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
+ return true;
- ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
- state->reqs);
+ ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
/*
* Bulk alloc is all-or-nothing. If we fail to get a batch,
* retry single alloc to be on the safe side.
*/
if (unlikely(ret <= 0)) {
- state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
- if (!state->reqs[0])
- return NULL;
+ reqs[0] = kmem_cache_alloc(req_cachep, gfp);
+ if (!reqs[0])
+ return false;
ret = 1;
}
- for (i = 0; i < ret; i++)
- io_preinit_req(state->reqs[i], ctx);
- state->free_reqs = ret;
-got_req:
- state->free_reqs--;
- return state->reqs[state->free_reqs];
+ percpu_ref_get_many(&ctx->refs, ret);
+ for (i = 0; i < ret; i++) {
+ req = reqs[i];
+
+ io_preinit_req(req, ctx);
+ wq_stack_add_head(&req->comp_list, &state->free_list);
+ }
+ return true;
+}
+
+static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
+{
+ if (unlikely(!ctx->submit_state.free_list.next))
+ return __io_alloc_req_refill(ctx);
+ return true;
+}
+
+static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+{
+ struct io_wq_work_node *node;
+
+ node = wq_stack_extract(&ctx->submit_state.free_list);
+ return container_of(node, struct io_kiocb, comp_list);
}
static inline void io_put_file(struct file *file)
@@ -1961,35 +1983,28 @@ static inline void io_put_file(struct file *file)
fput(file);
}
-static void io_dismantle_req(struct io_kiocb *req)
+static inline void io_dismantle_req(struct io_kiocb *req)
{
unsigned int flags = req->flags;
- if (io_req_needs_clean(req))
+ if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
io_clean_op(req);
if (!(flags & REQ_F_FIXED_FILE))
io_put_file(req->file);
- if (req->fixed_rsrc_refs)
- percpu_ref_put(req->fixed_rsrc_refs);
- if (req->async_data) {
- kfree(req->async_data);
- req->async_data = NULL;
- }
}
-static void __io_free_req(struct io_kiocb *req)
+static __cold void __io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ io_req_put_rsrc(req, ctx);
io_dismantle_req(req);
io_put_task(req->task, 1);
spin_lock(&ctx->completion_lock);
- list_add(&req->inflight_entry, &ctx->locked_free_list);
+ wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
ctx->locked_free_nr++;
spin_unlock(&ctx->completion_lock);
-
- percpu_ref_put(&ctx->refs);
}
static inline void io_remove_next_linked(struct io_kiocb *req)
@@ -2075,47 +2090,45 @@ static bool io_disarm_next(struct io_kiocb *req)
return posted;
}
-static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
+static void __io_req_find_next_prep(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ bool posted;
+
+ spin_lock(&ctx->completion_lock);
+ posted = io_disarm_next(req);
+ if (posted)
+ io_commit_cqring(req->ctx);
+ spin_unlock(&ctx->completion_lock);
+ if (posted)
+ io_cqring_ev_posted(ctx);
+}
+
+static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
struct io_kiocb *nxt;
+ if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
+ return NULL;
/*
* If LINK is set, we have dependent requests in this chain. If we
* didn't fail this request, queue the first one up, moving any other
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (req->flags & IO_DISARM_MASK) {
- struct io_ring_ctx *ctx = req->ctx;
- bool posted;
-
- spin_lock(&ctx->completion_lock);
- posted = io_disarm_next(req);
- if (posted)
- io_commit_cqring(req->ctx);
- spin_unlock(&ctx->completion_lock);
- if (posted)
- io_cqring_ev_posted(ctx);
- }
+ if (unlikely(req->flags & IO_DISARM_MASK))
+ __io_req_find_next_prep(req);
nxt = req->link;
req->link = NULL;
return nxt;
}
-static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
-{
- if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
- return NULL;
- return __io_req_find_next(req);
-}
-
static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
if (!ctx)
return;
if (*locked) {
- if (ctx->submit_state.compl_nr)
- io_submit_flush_completions(ctx);
+ io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
*locked = false;
}
@@ -2132,7 +2145,7 @@ static void tctx_task_work(struct callback_head *cb)
while (1) {
struct io_wq_work_node *node;
- if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
+ if (!tctx->task_list.first && locked)
io_submit_flush_completions(ctx);
spin_lock_irq(&tctx->task_lock);
@@ -2195,8 +2208,9 @@ static void io_req_task_work_add(struct io_kiocb *req)
* will do the job.
*/
notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
- if (!task_work_add(tsk, &tctx->task_work, notify)) {
- wake_up_process(tsk);
+ if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
+ if (notify == TWA_NONE)
+ wake_up_process(tsk);
return;
}
@@ -2274,77 +2288,62 @@ static void io_free_req_work(struct io_kiocb *req, bool *locked)
io_free_req(req);
}
-struct req_batch {
- struct task_struct *task;
- int task_refs;
- int ctx_refs;
-};
-
-static inline void io_init_req_batch(struct req_batch *rb)
+static void io_free_batch_list(struct io_ring_ctx *ctx,
+ struct io_wq_work_node *node)
+ __must_hold(&ctx->uring_lock)
{
- rb->task_refs = 0;
- rb->ctx_refs = 0;
- rb->task = NULL;
-}
+ struct task_struct *task = NULL;
+ int task_refs = 0;
-static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
- struct req_batch *rb)
-{
- if (rb->ctx_refs)
- percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
- if (rb->task)
- io_put_task(rb->task, rb->task_refs);
-}
+ do {
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ comp_list);
-static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
- struct io_submit_state *state)
-{
- io_queue_next(req);
- io_dismantle_req(req);
+ if (unlikely(req->flags & REQ_F_REFCOUNT)) {
+ node = req->comp_list.next;
+ if (!req_ref_put_and_test(req))
+ continue;
+ }
- if (req->task != rb->task) {
- if (rb->task)
- io_put_task(rb->task, rb->task_refs);
- rb->task = req->task;
- rb->task_refs = 0;
- }
- rb->task_refs++;
- rb->ctx_refs++;
+ io_req_put_rsrc_locked(req, ctx);
+ io_queue_next(req);
+ io_dismantle_req(req);
- if (state->free_reqs != ARRAY_SIZE(state->reqs))
- state->reqs[state->free_reqs++] = req;
- else
- list_add(&req->inflight_entry, &state->free_list);
+ if (req->task != task) {
+ if (task)
+ io_put_task(task, task_refs);
+ task = req->task;
+ task_refs = 0;
+ }
+ task_refs++;
+ node = req->comp_list.next;
+ wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
+ } while (node);
+
+ if (task)
+ io_put_task(task, task_refs);
}
-static void io_submit_flush_completions(struct io_ring_ctx *ctx)
+static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
+ struct io_wq_work_node *node, *prev;
struct io_submit_state *state = &ctx->submit_state;
- int i, nr = state->compl_nr;
- struct req_batch rb;
spin_lock(&ctx->completion_lock);
- for (i = 0; i < nr; i++) {
- struct io_kiocb *req = state->compl_reqs[i];
+ wq_list_for_each(node, prev, &state->compl_reqs) {
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ comp_list);
__io_cqring_fill_event(ctx, req->user_data, req->result,
- req->compl.cflags);
+ req->cflags);
}
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
- io_init_req_batch(&rb);
- for (i = 0; i < nr; i++) {
- struct io_kiocb *req = state->compl_reqs[i];
-
- if (req_ref_put_and_test(req))
- io_req_free_batch(&rb, req, &ctx->submit_state);
- }
-
- io_req_free_batch_finish(ctx, &rb);
- state->compl_nr = 0;
+ io_free_batch_list(ctx, state->compl_reqs.first);
+ INIT_WQ_LIST(&state->compl_reqs);
}
/*
@@ -2404,12 +2403,9 @@ static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
{
- struct io_buffer *kbuf;
-
if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
return 0;
- kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
- return io_put_kbuf(req, kbuf);
+ return io_put_kbuf(req, req->kbuf);
}
static inline bool io_run_task_work(void)
@@ -2423,50 +2419,22 @@ static inline bool io_run_task_work(void)
return false;
}
-/*
- * Find and free completed poll iocbs
- */
-static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
- struct list_head *done)
+static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
- struct req_batch rb;
- struct io_kiocb *req;
-
- /* order with ->result store in io_complete_rw_iopoll() */
- smp_rmb();
-
- io_init_req_batch(&rb);
- while (!list_empty(done)) {
- req = list_first_entry(done, struct io_kiocb, inflight_entry);
- list_del(&req->inflight_entry);
-
- __io_cqring_fill_event(ctx, req->user_data, req->result,
- io_put_rw_kbuf(req));
- (*nr_events)++;
-
- if (req_ref_put_and_test(req))
- io_req_free_batch(&rb, req, &ctx->submit_state);
- }
-
- io_commit_cqring(ctx);
- io_cqring_ev_posted_iopoll(ctx);
- io_req_free_batch_finish(ctx, &rb);
-}
-
-static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
- long min)
-{
- struct io_kiocb *req, *tmp;
- LIST_HEAD(done);
- bool spin;
+ struct io_wq_work_node *pos, *start, *prev;
+ unsigned int poll_flags = BLK_POLL_NOSLEEP;
+ DEFINE_IO_COMP_BATCH(iob);
+ int nr_events = 0;
/*
* Only spin for completions if we don't have multiple devices hanging
- * off our complete list, and we're under the requested amount.
+ * off our complete list.
*/
- spin = !ctx->poll_multi_queue && *nr_events < min;
+ if (ctx->poll_multi_queue || force_nonspin)
+ poll_flags |= BLK_POLL_ONESHOT;
- list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
+ wq_list_for_each(pos, start, &ctx->iopoll_list) {
+ struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
struct kiocb *kiocb = &req->rw.kiocb;
int ret;
@@ -2475,47 +2443,62 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
* If we find a request that requires polling, break out
* and complete those lists first, if we have entries there.
*/
- if (READ_ONCE(req->iopoll_completed)) {
- list_move_tail(&req->inflight_entry, &done);
- continue;
- }
- if (!list_empty(&done))
+ if (READ_ONCE(req->iopoll_completed))
break;
- ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
+ ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
if (unlikely(ret < 0))
return ret;
else if (ret)
- spin = false;
+ poll_flags |= BLK_POLL_ONESHOT;
/* iopoll may have completed current req */
- if (READ_ONCE(req->iopoll_completed))
- list_move_tail(&req->inflight_entry, &done);
+ if (!rq_list_empty(iob.req_list) ||
+ READ_ONCE(req->iopoll_completed))
+ break;
}
- if (!list_empty(&done))
- io_iopoll_complete(ctx, nr_events, &done);
+ if (!rq_list_empty(iob.req_list))
+ iob.complete(&iob);
+ else if (!pos)
+ return 0;
+
+ prev = start;
+ wq_list_for_each_resume(pos, prev) {
+ struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
- return 0;
+ /* order with io_complete_rw_iopoll(), e.g. ->result updates */
+ if (!smp_load_acquire(&req->iopoll_completed))
+ break;
+ __io_cqring_fill_event(ctx, req->user_data, req->result,
+ io_put_rw_kbuf(req));
+ nr_events++;
+ }
+
+ if (unlikely(!nr_events))
+ return 0;
+
+ io_commit_cqring(ctx);
+ io_cqring_ev_posted_iopoll(ctx);
+ pos = start ? start->next : ctx->iopoll_list.first;
+ wq_list_cut(&ctx->iopoll_list, prev, start);
+ io_free_batch_list(ctx, pos);
+ return nr_events;
}
/*
* We can't just wait for polled events to come to us, we have to actively
* find and complete them.
*/
-static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
+static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_IOPOLL))
return;
mutex_lock(&ctx->uring_lock);
- while (!list_empty(&ctx->iopoll_list)) {
- unsigned int nr_events = 0;
-
- io_do_iopoll(ctx, &nr_events, 0);
-
+ while (!wq_list_empty(&ctx->iopoll_list)) {
/* let it sleep and repeat later if can't complete a request */
- if (nr_events == 0)
+ if (io_do_iopoll(ctx, true) == 0)
break;
/*
* Ensure we allow local-to-the-cpu processing to take place,
@@ -2562,7 +2545,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
* forever, while the workqueue is stuck trying to acquire the
* very same mutex.
*/
- if (list_empty(&ctx->iopoll_list)) {
+ if (wq_list_empty(&ctx->iopoll_list)) {
u32 tail = ctx->cached_cq_tail;
mutex_unlock(&ctx->uring_lock);
@@ -2571,11 +2554,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
/* some requests don't go through iopoll_list */
if (tail != ctx->cached_cq_tail ||
- list_empty(&ctx->iopoll_list))
+ wq_list_empty(&ctx->iopoll_list))
break;
}
- ret = io_do_iopoll(ctx, &nr_events, min);
- } while (!ret && nr_events < min && !need_resched());
+ ret = io_do_iopoll(ctx, !min);
+ if (ret < 0)
+ break;
+ nr_events += ret;
+ ret = 0;
+ } while (nr_events < min && !need_resched());
out:
mutex_unlock(&ctx->uring_lock);
return ret;
@@ -2600,9 +2587,9 @@ static bool io_resubmit_prep(struct io_kiocb *req)
{
struct io_async_rw *rw = req->async_data;
- if (!rw)
+ if (!req_has_async_data(req))
return !io_req_prep_async(req);
- iov_iter_restore(&rw->iter, &rw->iter_state);
+ iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
return true;
}
@@ -2646,7 +2633,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
if (req->rw.kiocb.ki_flags & IOCB_WRITE)
kiocb_end_write(req);
- if (res != req->result) {
+ if (unlikely(res != req->result)) {
if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
io_rw_should_reissue(req)) {
req->flags |= REQ_F_REISSUE;
@@ -2661,16 +2648,11 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
static void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
unsigned int cflags = io_put_rw_kbuf(req);
- long res = req->result;
+ int res = req->result;
if (*locked) {
- struct io_ring_ctx *ctx = req->ctx;
- struct io_submit_state *state = &ctx->submit_state;
-
io_req_complete_state(req, res, cflags);
- state->compl_reqs[state->compl_nr++] = req;
- if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
- io_submit_flush_completions(ctx);
+ io_req_add_compl_list(req);
} else {
io_req_complete_post(req, res, cflags);
}
@@ -2684,7 +2666,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
__io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req));
}
-static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw(struct kiocb *kiocb, long res)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
@@ -2695,7 +2677,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
io_req_task_work_add(req);
}
-static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
@@ -2706,12 +2688,11 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
req->flags |= REQ_F_REISSUE;
return;
}
+ req->result = res;
}
- WRITE_ONCE(req->result, res);
- /* order with io_iopoll_complete() checking ->result */
- smp_wmb();
- WRITE_ONCE(req->iopoll_completed, 1);
+ /* order with io_iopoll_complete() checking ->iopoll_completed */
+ smp_store_release(&req->iopoll_completed, 1);
}
/*
@@ -2720,13 +2701,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
* find it from a io_do_iopoll() thread before the issuer is done
* accessing the kiocb cookie.
*/
-static void io_iopoll_req_issued(struct io_kiocb *req)
+static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
- const bool in_async = io_wq_current_is_worker();
+ const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
/* workqueue context doesn't hold uring_lock, grab it now */
- if (unlikely(in_async))
+ if (unlikely(needs_lock))
mutex_lock(&ctx->uring_lock);
/*
@@ -2734,23 +2715,15 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
* how we do polling eventually, not spinning if we're on potentially
* different devices.
*/
- if (list_empty(&ctx->iopoll_list)) {
+ if (wq_list_empty(&ctx->iopoll_list)) {
ctx->poll_multi_queue = false;
} else if (!ctx->poll_multi_queue) {
struct io_kiocb *list_req;
- unsigned int queue_num0, queue_num1;
-
- list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
- inflight_entry);
- if (list_req->file != req->file) {
+ list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
+ comp_list);
+ if (list_req->file != req->file)
ctx->poll_multi_queue = true;
- } else {
- queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie);
- queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie);
- if (queue_num0 != queue_num1)
- ctx->poll_multi_queue = true;
- }
}
/*
@@ -2758,11 +2731,11 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
* it to the front so we find it first.
*/
if (READ_ONCE(req->iopoll_completed))
- list_add(&req->inflight_entry, &ctx->iopoll_list);
+ wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
else
- list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
+ wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
- if (unlikely(in_async)) {
+ if (unlikely(needs_lock)) {
/*
* If IORING_SETUP_SQPOLL is enabled, sqes are either handle
* in sq thread task context or in io worker task context. If
@@ -2787,10 +2760,8 @@ static bool io_bdev_nowait(struct block_device *bdev)
* any file. For now, just ensure that anything potentially problematic is done
* inline.
*/
-static bool __io_file_supports_nowait(struct file *file, int rw)
+static bool __io_file_supports_nowait(struct file *file, umode_t mode)
{
- umode_t mode = file_inode(file)->i_mode;
-
if (S_ISBLK(mode)) {
if (IS_ENABLED(CONFIG_BLOCK) &&
io_bdev_nowait(I_BDEV(file->f_mapping->host)))
@@ -2810,28 +2781,32 @@ static bool __io_file_supports_nowait(struct file *file, int rw)
/* any ->read/write should understand O_NONBLOCK */
if (file->f_flags & O_NONBLOCK)
return true;
+ return file->f_mode & FMODE_NOWAIT;
+}
- if (!(file->f_mode & FMODE_NOWAIT))
- return false;
-
- if (rw == READ)
- return file->f_op->read_iter != NULL;
+/*
+ * If we tracked the file through the SCM inflight mechanism, we could support
+ * any file. For now, just ensure that anything potentially problematic is done
+ * inline.
+ */
+static unsigned int io_file_get_flags(struct file *file)
+{
+ umode_t mode = file_inode(file)->i_mode;
+ unsigned int res = 0;
- return file->f_op->write_iter != NULL;
+ if (S_ISREG(mode))
+ res |= FFS_ISREG;
+ if (__io_file_supports_nowait(file, mode))
+ res |= FFS_NOWAIT;
+ return res;
}
-static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
+static inline bool io_file_supports_nowait(struct io_kiocb *req)
{
- if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
- return true;
- else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
- return true;
-
- return __io_file_supports_nowait(req->file, rw);
+ return req->flags & REQ_F_SUPPORT_NOWAIT;
}
-static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- int rw)
+static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
struct kiocb *kiocb = &req->rw.kiocb;
@@ -2839,16 +2814,15 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
unsigned ioprio;
int ret;
- if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
- req->flags |= REQ_F_ISREG;
+ if (!io_req_ffs_set(req))
+ req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
kiocb->ki_pos = READ_ONCE(sqe->off);
if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
req->flags |= REQ_F_CUR_POS;
kiocb->ki_pos = file->f_pos;
}
- kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
- kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
+ kiocb->ki_flags = iocb_flags(file);
ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
if (unlikely(ret))
return ret;
@@ -2859,22 +2833,11 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
* reliably. If not, or it IOCB_NOWAIT is set, don't retry.
*/
if ((kiocb->ki_flags & IOCB_NOWAIT) ||
- ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw)))
+ ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
req->flags |= REQ_F_NOWAIT;
- ioprio = READ_ONCE(sqe->ioprio);
- if (ioprio) {
- ret = ioprio_check_cap(ioprio);
- if (ret)
- return ret;
-
- kiocb->ki_ioprio = ioprio;
- } else
- kiocb->ki_ioprio = get_current_ioprio();
-
if (ctx->flags & IORING_SETUP_IOPOLL) {
- if (!(kiocb->ki_flags & IOCB_DIRECT) ||
- !kiocb->ki_filp->f_op->iopoll)
+ if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
return -EOPNOTSUPP;
kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
@@ -2886,12 +2849,18 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
kiocb->ki_complete = io_complete_rw;
}
- if (req->opcode == IORING_OP_READ_FIXED ||
- req->opcode == IORING_OP_WRITE_FIXED) {
- req->imu = NULL;
- io_req_set_rsrc_node(req);
+ ioprio = READ_ONCE(sqe->ioprio);
+ if (ioprio) {
+ ret = ioprio_check_cap(ioprio);
+ if (ret)
+ return ret;
+
+ kiocb->ki_ioprio = ioprio;
+ } else {
+ kiocb->ki_ioprio = get_current_ioprio();
}
+ req->imu = NULL;
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
req->buf_index = READ_ONCE(sqe->buf_index);
@@ -2915,7 +2884,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
ret = -EINTR;
fallthrough;
default:
- kiocb->ki_complete(kiocb, ret, 0);
+ kiocb->ki_complete(kiocb, ret);
}
}
@@ -2926,7 +2895,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
struct io_async_rw *io = req->async_data;
/* add previously done IO, if any */
- if (io && io->bytes_done > 0) {
+ if (req_has_async_data(req) && io->bytes_done > 0) {
if (ret < 0)
ret = io->bytes_done;
else
@@ -2949,7 +2918,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
struct io_ring_ctx *ctx = req->ctx;
req_set_fail(req);
- if (issue_flags & IO_URING_F_NONBLOCK) {
+ if (issue_flags & IO_URING_F_UNLOCKED) {
mutex_lock(&ctx->uring_lock);
__io_req_complete(req, issue_flags, ret, cflags);
mutex_unlock(&ctx->uring_lock);
@@ -3020,13 +2989,15 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter
static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
{
- struct io_ring_ctx *ctx = req->ctx;
struct io_mapped_ubuf *imu = req->imu;
u16 index, buf_index = req->buf_index;
if (likely(!imu)) {
+ struct io_ring_ctx *ctx = req->ctx;
+
if (unlikely(buf_index >= ctx->nr_user_bufs))
return -EFAULT;
+ io_req_set_rsrc_node(req, ctx);
index = array_index_nospec(buf_index, ctx->nr_user_bufs);
imu = READ_ONCE(ctx->user_bufs[index]);
req->imu = imu;
@@ -3053,10 +3024,11 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
}
static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
- int bgid, struct io_buffer *kbuf,
- bool needs_lock)
+ int bgid, unsigned int issue_flags)
{
+ struct io_buffer *kbuf = req->kbuf;
struct io_buffer *head;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
if (req->flags & REQ_F_BUFFER_SELECTED)
return kbuf;
@@ -3077,34 +3049,32 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
}
if (*len > kbuf->len)
*len = kbuf->len;
+ req->flags |= REQ_F_BUFFER_SELECTED;
+ req->kbuf = kbuf;
} else {
kbuf = ERR_PTR(-ENOBUFS);
}
io_ring_submit_unlock(req->ctx, needs_lock);
-
return kbuf;
}
static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct io_buffer *kbuf;
u16 bgid;
- kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
bgid = req->buf_index;
- kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
+ kbuf = io_buffer_select(req, len, bgid, issue_flags);
if (IS_ERR(kbuf))
return kbuf;
- req->rw.addr = (u64) (unsigned long) kbuf;
- req->flags |= REQ_F_BUFFER_SELECTED;
return u64_to_user_ptr(kbuf->addr);
}
#ifdef CONFIG_COMPAT
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct compat_iovec __user *uiov;
compat_ssize_t clen;
@@ -3120,7 +3090,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
return -EINVAL;
len = clen;
- buf = io_rw_buffer_select(req, &len, needs_lock);
+ buf = io_rw_buffer_select(req, &len, issue_flags);
if (IS_ERR(buf))
return PTR_ERR(buf);
iov[0].iov_base = buf;
@@ -3130,7 +3100,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
#endif
static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
void __user *buf;
@@ -3142,7 +3112,7 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
len = iov[0].iov_len;
if (len < 0)
return -EINVAL;
- buf = io_rw_buffer_select(req, &len, needs_lock);
+ buf = io_rw_buffer_select(req, &len, issue_flags);
if (IS_ERR(buf))
return PTR_ERR(buf);
iov[0].iov_base = buf;
@@ -3151,12 +3121,11 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
}
static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
- bool needs_lock)
+ unsigned int issue_flags)
{
if (req->flags & REQ_F_BUFFER_SELECTED) {
- struct io_buffer *kbuf;
+ struct io_buffer *kbuf = req->kbuf;
- kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
iov[0].iov_len = kbuf->len;
return 0;
@@ -3166,52 +3135,72 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
- return io_compat_import(req, iov, needs_lock);
+ return io_compat_import(req, iov, issue_flags);
#endif
- return __io_iov_buffer_select(req, iov, needs_lock);
+ return __io_iov_buffer_select(req, iov, issue_flags);
}
-static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
- struct iov_iter *iter, bool needs_lock)
+static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
+ struct io_rw_state *s,
+ unsigned int issue_flags)
{
- void __user *buf = u64_to_user_ptr(req->rw.addr);
- size_t sqe_len = req->rw.len;
+ struct iov_iter *iter = &s->iter;
u8 opcode = req->opcode;
+ struct iovec *iovec;
+ void __user *buf;
+ size_t sqe_len;
ssize_t ret;
- if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
- *iovec = NULL;
- return io_import_fixed(req, rw, iter);
- }
+ BUILD_BUG_ON(ERR_PTR(0) != NULL);
+
+ if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED)
+ return ERR_PTR(io_import_fixed(req, rw, iter));
/* buffer index only valid with fixed read/write, or buffer select */
- if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
- return -EINVAL;
+ if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
+ return ERR_PTR(-EINVAL);
+
+ buf = u64_to_user_ptr(req->rw.addr);
+ sqe_len = req->rw.len;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
if (req->flags & REQ_F_BUFFER_SELECT) {
- buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
+ buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
if (IS_ERR(buf))
- return PTR_ERR(buf);
+ return ERR_CAST(buf);
req->rw.len = sqe_len;
}
- ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
- *iovec = NULL;
- return ret;
+ ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
+ return ERR_PTR(ret);
}
+ iovec = s->fast_iov;
if (req->flags & REQ_F_BUFFER_SELECT) {
- ret = io_iov_buffer_select(req, *iovec, needs_lock);
+ ret = io_iov_buffer_select(req, iovec, issue_flags);
if (!ret)
- iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
- *iovec = NULL;
- return ret;
+ iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
+ return ERR_PTR(ret);
}
- return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
+ ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
req->ctx->compat);
+ if (unlikely(ret < 0))
+ return ERR_PTR(ret);
+ return iovec;
+}
+
+static inline int io_import_iovec(int rw, struct io_kiocb *req,
+ struct iovec **iovec, struct io_rw_state *s,
+ unsigned int issue_flags)
+{
+ *iovec = __io_import_iovec(rw, req, s, issue_flags);
+ if (unlikely(IS_ERR(*iovec)))
+ return PTR_ERR(*iovec);
+
+ iov_iter_save_state(&s->iter, &s->iter_state);
+ return 0;
}
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
@@ -3236,7 +3225,8 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
*/
if (kiocb->ki_flags & IOCB_HIPRI)
return -EOPNOTSUPP;
- if (kiocb->ki_flags & IOCB_NOWAIT)
+ if ((kiocb->ki_flags & IOCB_NOWAIT) &&
+ !(kiocb->ki_filp->f_flags & O_NONBLOCK))
return -EAGAIN;
while (iov_iter_count(iter)) {
@@ -3282,7 +3272,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
{
struct io_async_rw *rw = req->async_data;
- memcpy(&rw->iter, iter, sizeof(*iter));
+ memcpy(&rw->s.iter, iter, sizeof(*iter));
rw->free_iovec = iovec;
rw->bytes_done = 0;
/* can only be fixed buffers, no need to do anything */
@@ -3291,33 +3281,36 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
if (!iovec) {
unsigned iov_off = 0;
- rw->iter.iov = rw->fast_iov;
+ rw->s.iter.iov = rw->s.fast_iov;
if (iter->iov != fast_iov) {
iov_off = iter->iov - fast_iov;
- rw->iter.iov += iov_off;
+ rw->s.iter.iov += iov_off;
}
- if (rw->fast_iov != fast_iov)
- memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
+ if (rw->s.fast_iov != fast_iov)
+ memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
sizeof(struct iovec) * iter->nr_segs);
} else {
req->flags |= REQ_F_NEED_CLEANUP;
}
}
-static inline int io_alloc_async_data(struct io_kiocb *req)
+static inline bool io_alloc_async_data(struct io_kiocb *req)
{
WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
- return req->async_data == NULL;
+ if (req->async_data) {
+ req->flags |= REQ_F_ASYNC_DATA;
+ return false;
+ }
+ return true;
}
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
- const struct iovec *fast_iov,
- struct iov_iter *iter, bool force)
+ struct io_rw_state *s, bool force)
{
if (!force && !io_op_defs[req->opcode].needs_async_setup)
return 0;
- if (!req->async_data) {
+ if (!req_has_async_data(req)) {
struct io_async_rw *iorw;
if (io_alloc_async_data(req)) {
@@ -3325,10 +3318,10 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
return -ENOMEM;
}
- io_req_map_rw(req, iovec, fast_iov, iter);
+ io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
iorw = req->async_data;
/* we've copied and mapped the iter, ensure state is saved */
- iov_iter_save_state(&iorw->iter, &iorw->iter_state);
+ iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
}
return 0;
}
@@ -3336,10 +3329,11 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
{
struct io_async_rw *iorw = req->async_data;
- struct iovec *iov = iorw->fast_iov;
+ struct iovec *iov;
int ret;
- ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
+ /* submission path, ->uring_lock should already be taken */
+ ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
if (unlikely(ret < 0))
return ret;
@@ -3347,7 +3341,6 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
iorw->free_iovec = iov;
if (iov)
req->flags |= REQ_F_NEED_CLEANUP;
- iov_iter_save_state(&iorw->iter, &iorw->iter_state);
return 0;
}
@@ -3355,11 +3348,11 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
if (unlikely(!(req->file->f_mode & FMODE_READ)))
return -EBADF;
- return io_prep_rw(req, sqe, READ);
+ return io_prep_rw(req, sqe);
}
/*
- * This is our waitqueue callback handler, registered through lock_page_async()
+ * This is our waitqueue callback handler, registered through __folio_lock_async()
* when we initially tried to do the IO with the iocb armed our waitqueue.
* This gets called when the page is unlocked, and we generally expect that to
* happen when the page IO is completed and the page is now uptodate. This will
@@ -3431,7 +3424,7 @@ static bool io_rw_should_retry(struct io_kiocb *req)
static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
{
- if (req->file->f_op->read_iter)
+ if (likely(req->file->f_op->read_iter))
return call_read_iter(req->file, &req->rw.kiocb, iter);
else if (req->file->f_op->read)
return loop_rw_iter(READ, req, iter);
@@ -3447,43 +3440,40 @@ static bool need_read_all(struct io_kiocb *req)
static int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
- struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct io_rw_state __s, *s = &__s;
+ struct iovec *iovec;
struct kiocb *kiocb = &req->rw.kiocb;
- struct iov_iter __iter, *iter = &__iter;
- struct io_async_rw *rw = req->async_data;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- struct iov_iter_state __state, *state;
+ struct io_async_rw *rw;
ssize_t ret, ret2;
- if (rw) {
- iter = &rw->iter;
- state = &rw->iter_state;
+ if (!req_has_async_data(req)) {
+ ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
+ if (unlikely(ret < 0))
+ return ret;
+ } else {
+ rw = req->async_data;
+ s = &rw->s;
/*
* We come here from an earlier attempt, restore our state to
* match in case it doesn't. It's cheap enough that we don't
* need to make this conditional.
*/
- iov_iter_restore(iter, state);
+ iov_iter_restore(&s->iter, &s->iter_state);
iovec = NULL;
- } else {
- ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
- return ret;
- state = &__state;
- iov_iter_save_state(iter, state);
}
- req->result = iov_iter_count(iter);
+ req->result = iov_iter_count(&s->iter);
- /* Ensure we clear previously set non-block flag */
- if (!force_nonblock)
- kiocb->ki_flags &= ~IOCB_NOWAIT;
- else
+ if (force_nonblock) {
+ /* If the file doesn't support async, just async punt */
+ if (unlikely(!io_file_supports_nowait(req))) {
+ ret = io_setup_async_rw(req, iovec, s, true);
+ return ret ?: -EAGAIN;
+ }
kiocb->ki_flags |= IOCB_NOWAIT;
-
- /* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_nowait(req, READ)) {
- ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
- return ret ?: -EAGAIN;
+ } else {
+ /* Ensure we clear previously set non-block flag */
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
}
ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
@@ -3492,7 +3482,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
return ret;
}
- ret = io_iter_do_read(req, iter);
+ ret = io_iter_do_read(req, &s->iter);
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
@@ -3505,7 +3495,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
ret = 0;
} else if (ret == -EIOCBQUEUED) {
goto out_free;
- } else if (ret <= 0 || ret == req->result || !force_nonblock ||
+ } else if (ret == req->result || ret <= 0 || !force_nonblock ||
(req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
/* read all, failed, already did sync or don't want to retry */
goto done;
@@ -3516,22 +3506,19 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
* untouched in case of error. Restore it and we'll advance it
* manually if we need to.
*/
- iov_iter_restore(iter, state);
+ iov_iter_restore(&s->iter, &s->iter_state);
- ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
+ ret2 = io_setup_async_rw(req, iovec, s, true);
if (ret2)
return ret2;
iovec = NULL;
rw = req->async_data;
+ s = &rw->s;
/*
* Now use our persistent iterator and state, if we aren't already.
* We've restored and mapped the iter to match.
*/
- if (iter != &rw->iter) {
- iter = &rw->iter;
- state = &rw->iter_state;
- }
do {
/*
@@ -3539,11 +3526,11 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
* above or inside this loop. Advance the iter by the bytes
* that were consumed.
*/
- iov_iter_advance(iter, ret);
- if (!iov_iter_count(iter))
+ iov_iter_advance(&s->iter, ret);
+ if (!iov_iter_count(&s->iter))
break;
rw->bytes_done += ret;
- iov_iter_save_state(iter, state);
+ iov_iter_save_state(&s->iter, &s->iter_state);
/* if we can retry, do so with the callbacks armed */
if (!io_rw_should_retry(req)) {
@@ -3557,12 +3544,12 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
* desired page gets unlocked. We can also get a partial read
* here, and if we do, then just retry at the new offset.
*/
- ret = io_iter_do_read(req, iter);
+ ret = io_iter_do_read(req, &s->iter);
if (ret == -EIOCBQUEUED)
return 0;
/* we got some bytes, but not all. retry. */
kiocb->ki_flags &= ~IOCB_WAITQ;
- iov_iter_restore(iter, state);
+ iov_iter_restore(&s->iter, &s->iter_state);
} while (ret > 0);
done:
kiocb_done(kiocb, ret, issue_flags);
@@ -3577,47 +3564,46 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
return -EBADF;
- return io_prep_rw(req, sqe, WRITE);
+ req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file));
+ return io_prep_rw(req, sqe);
}
static int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
- struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct io_rw_state __s, *s = &__s;
+ struct iovec *iovec;
struct kiocb *kiocb = &req->rw.kiocb;
- struct iov_iter __iter, *iter = &__iter;
- struct io_async_rw *rw = req->async_data;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- struct iov_iter_state __state, *state;
ssize_t ret, ret2;
- if (rw) {
- iter = &rw->iter;
- state = &rw->iter_state;
- iov_iter_restore(iter, state);
- iovec = NULL;
- } else {
- ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
+ if (!req_has_async_data(req)) {
+ ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
+ if (unlikely(ret < 0))
return ret;
- state = &__state;
- iov_iter_save_state(iter, state);
+ } else {
+ struct io_async_rw *rw = req->async_data;
+
+ s = &rw->s;
+ iov_iter_restore(&s->iter, &s->iter_state);
+ iovec = NULL;
}
- req->result = iov_iter_count(iter);
+ req->result = iov_iter_count(&s->iter);
- /* Ensure we clear previously set non-block flag */
- if (!force_nonblock)
- kiocb->ki_flags &= ~IOCB_NOWAIT;
- else
- kiocb->ki_flags |= IOCB_NOWAIT;
+ if (force_nonblock) {
+ /* If the file doesn't support async, just async punt */
+ if (unlikely(!io_file_supports_nowait(req)))
+ goto copy_iov;
- /* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_nowait(req, WRITE))
- goto copy_iov;
+ /* file path doesn't support NOWAIT for non-direct_IO */
+ if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
+ (req->flags & REQ_F_ISREG))
+ goto copy_iov;
- /* file path doesn't support NOWAIT for non-direct_IO */
- if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
- (req->flags & REQ_F_ISREG))
- goto copy_iov;
+ kiocb->ki_flags |= IOCB_NOWAIT;
+ } else {
+ /* Ensure we clear previously set non-block flag */
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
+ }
ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
if (unlikely(ret))
@@ -3637,10 +3623,10 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
}
kiocb->ki_flags |= IOCB_WRITE;
- if (req->file->f_op->write_iter)
- ret2 = call_write_iter(req->file, kiocb, iter);
+ if (likely(req->file->f_op->write_iter))
+ ret2 = call_write_iter(req->file, kiocb, &s->iter);
else if (req->file->f_op->write)
- ret2 = loop_rw_iter(WRITE, req, iter);
+ ret2 = loop_rw_iter(WRITE, req, &s->iter);
else
ret2 = -EINVAL;
@@ -3660,14 +3646,14 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
goto done;
if (!force_nonblock || ret2 != -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
- if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
+ if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
goto copy_iov;
done:
kiocb_done(kiocb, ret2, issue_flags);
} else {
copy_iov:
- iov_iter_restore(iter, state);
- ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
+ iov_iter_restore(&s->iter, &s->iter_state);
+ ret = io_setup_async_rw(req, iovec, s, false);
return ret ?: -EAGAIN;
}
out_free:
@@ -3803,7 +3789,7 @@ static int io_mkdirat_prep(struct io_kiocb *req,
return 0;
}
-static int io_mkdirat(struct io_kiocb *req, int issue_flags)
+static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_mkdir *mkd = &req->mkdir;
int ret;
@@ -3852,7 +3838,7 @@ static int io_symlinkat_prep(struct io_kiocb *req,
return 0;
}
-static int io_symlinkat(struct io_kiocb *req, int issue_flags)
+static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_symlink *sl = &req->symlink;
int ret;
@@ -3902,7 +3888,7 @@ static int io_linkat_prep(struct io_kiocb *req,
return 0;
}
-static int io_linkat(struct io_kiocb *req, int issue_flags)
+static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_hardlink *lnk = &req->hardlink;
int ret;
@@ -4321,9 +4307,9 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer *head;
int ret = 0;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- io_ring_submit_lock(ctx, !force_nonblock);
+ io_ring_submit_lock(ctx, needs_lock);
lockdep_assert_held(&ctx->uring_lock);
@@ -4336,7 +4322,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
/* complete before unlock, IOPOLL may need the lock */
__io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, !force_nonblock);
+ io_ring_submit_unlock(ctx, needs_lock);
return 0;
}
@@ -4408,9 +4394,9 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer *head, *list;
int ret = 0;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- io_ring_submit_lock(ctx, !force_nonblock);
+ io_ring_submit_lock(ctx, needs_lock);
lockdep_assert_held(&ctx->uring_lock);
@@ -4426,7 +4412,7 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
__io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, !force_nonblock);
+ io_ring_submit_unlock(ctx, needs_lock);
return 0;
}
@@ -4759,8 +4745,9 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
- kmsg = req->async_data;
- if (!kmsg) {
+ if (req_has_async_data(req)) {
+ kmsg = req->async_data;
+ } else {
ret = io_sendmsg_copy_hdr(req, &iomsg);
if (ret)
return ret;
@@ -4919,23 +4906,16 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
}
static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct io_sr_msg *sr = &req->sr_msg;
- struct io_buffer *kbuf;
- kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
- if (IS_ERR(kbuf))
- return kbuf;
-
- sr->kbuf = kbuf;
- req->flags |= REQ_F_BUFFER_SELECTED;
- return kbuf;
+ return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
}
static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
{
- return io_put_kbuf(req, req->sr_msg.kbuf);
+ return io_put_kbuf(req, req->kbuf);
}
static int io_recvmsg_prep_async(struct io_kiocb *req)
@@ -4983,8 +4963,9 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
- kmsg = req->async_data;
- if (!kmsg) {
+ if (req_has_async_data(req)) {
+ kmsg = req->async_data;
+ } else {
ret = io_recvmsg_copy_hdr(req, &iomsg);
if (ret)
return ret;
@@ -4992,7 +4973,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
}
if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, !force_nonblock);
+ kbuf = io_recv_buffer_select(req, issue_flags);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
@@ -5044,7 +5025,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
return -ENOTSOCK;
if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, !force_nonblock);
+ kbuf = io_recv_buffer_select(req, issue_flags);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
buf = u64_to_user_ptr(kbuf->addr);
@@ -5175,7 +5156,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
int ret;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- if (req->async_data) {
+ if (req_has_async_data(req)) {
io = req->async_data;
} else {
ret = move_addr_to_kernel(req->connect.addr,
@@ -5191,7 +5172,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
ret = __sys_connect_file(req->file, &io->address,
req->connect.addr_len, file_flags);
if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
- if (req->async_data)
+ if (req_has_async_data(req))
return -EAGAIN;
if (io_alloc_async_data(req)) {
ret = -ENOMEM;
@@ -5351,16 +5332,6 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
return !(flags & IORING_CQE_F_MORE);
}
-static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
- __must_hold(&req->ctx->completion_lock)
-{
- bool done;
-
- done = __io_poll_complete(req, mask);
- io_commit_cqring(req->ctx);
- return done;
-}
-
static void io_poll_task_func(struct io_kiocb *req, bool *locked)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -5482,7 +5453,10 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
req_ref_get(req);
poll->wait.private = req;
+
*poll_ptr = poll;
+ if (req->opcode == IORING_OP_POLL_ADD)
+ req->flags |= REQ_F_ASYNC_DATA;
}
pt->nr_entries++;
@@ -5606,17 +5580,13 @@ static int io_arm_poll_handler(struct io_kiocb *req)
struct async_poll *apoll;
struct io_poll_table ipt;
__poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;
- int rw;
- if (!req->file || !file_can_poll(req->file))
- return IO_APOLL_ABORTED;
- if (req->flags & REQ_F_POLLED)
- return IO_APOLL_ABORTED;
if (!def->pollin && !def->pollout)
return IO_APOLL_ABORTED;
+ if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
+ return IO_APOLL_ABORTED;
if (def->pollin) {
- rw = READ;
mask |= POLLIN | POLLRDNORM;
/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
@@ -5624,14 +5594,9 @@ static int io_arm_poll_handler(struct io_kiocb *req)
(req->sr_msg.msg_flags & MSG_ERRQUEUE))
mask &= ~POLLIN;
} else {
- rw = WRITE;
mask |= POLLOUT | POLLWRNORM;
}
- /* if we can't nonblock try, then no point in arming a poll handler */
- if (!io_file_supports_nowait(req, rw))
- return IO_APOLL_ABORTED;
-
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
if (unlikely(!apoll))
return IO_APOLL_ABORTED;
@@ -5692,8 +5657,8 @@ static bool io_poll_remove_one(struct io_kiocb *req)
/*
* Returns true if we found and killed one or more poll requests
*/
-static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
- bool cancel_all)
+static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
+ struct task_struct *tsk, bool cancel_all)
{
struct hlist_node *tmp;
struct io_kiocb *req;
@@ -5847,7 +5812,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
if (mask) { /* no async, we'd stolen it */
ipt.error = 0;
- done = io_poll_complete(req, mask);
+ done = __io_poll_complete(req, mask);
+ io_commit_cqring(req->ctx);
}
spin_unlock(&ctx->completion_lock);
@@ -5923,7 +5889,10 @@ err:
static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
{
- req_set_fail(req);
+ struct io_timeout_data *data = req->async_data;
+
+ if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
+ req_set_fail(req);
io_req_complete_post(req, -ETIME, 0);
}
@@ -6129,7 +6098,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (off && is_timeout_link)
return -EINVAL;
flags = READ_ONCE(sqe->timeout_flags);
- if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK))
+ if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
+ IORING_TIMEOUT_ETIME_SUCCESS))
return -EINVAL;
/* more than one clock specified is invalid, obviously */
if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
@@ -6140,7 +6110,9 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (unlikely(off && !req->ctx->off_timeout_used))
req->ctx->off_timeout_used = true;
- if (!req->async_data && io_alloc_async_data(req))
+ if (WARN_ON_ONCE(req_has_async_data(req)))
+ return -EFAULT;
+ if (io_alloc_async_data(req))
return -ENOMEM;
data = req->async_data;
@@ -6297,6 +6269,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
u64 sqe_addr = req->cancel.addr;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_tctx_node *node;
int ret;
@@ -6305,7 +6278,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
goto done;
/* slow path, try all io-wq's */
- io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_lock(ctx, needs_lock);
ret = -ENOENT;
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
struct io_uring_task *tctx = node->task->io_uring;
@@ -6314,7 +6287,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
if (ret != -ENOENT)
break;
}
- io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_unlock(ctx, needs_lock);
done:
if (ret < 0)
req_set_fail(req);
@@ -6341,6 +6314,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_uring_rsrc_update2 up;
int ret;
@@ -6350,10 +6324,10 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
up.tags = 0;
up.resv = 0;
- io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_lock(ctx, needs_lock);
ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
&up, req->rsrc_update.nr_args);
- io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_unlock(ctx, needs_lock);
if (ret < 0)
req_set_fail(req);
@@ -6449,7 +6423,7 @@ static int io_req_prep_async(struct io_kiocb *req)
{
if (!io_op_defs[req->opcode].needs_async_setup)
return 0;
- if (WARN_ON_ONCE(req->async_data))
+ if (WARN_ON_ONCE(req_has_async_data(req)))
return -EFAULT;
if (io_alloc_async_data(req))
return -EAGAIN;
@@ -6481,68 +6455,39 @@ static u32 io_get_sequence(struct io_kiocb *req)
return seq;
}
-static bool io_drain_req(struct io_kiocb *req)
+static __cold void io_drain_req(struct io_kiocb *req)
{
- struct io_kiocb *pos;
struct io_ring_ctx *ctx = req->ctx;
struct io_defer_entry *de;
int ret;
- u32 seq;
-
- if (req->flags & REQ_F_FAIL) {
- io_req_complete_fail_submit(req);
- return true;
- }
-
- /*
- * If we need to drain a request in the middle of a link, drain the
- * head request and the next request/link after the current link.
- * Considering sequential execution of links, IOSQE_IO_DRAIN will be
- * maintained for every request of our link.
- */
- if (ctx->drain_next) {
- req->flags |= REQ_F_IO_DRAIN;
- ctx->drain_next = false;
- }
- /* not interested in head, start from the first linked */
- io_for_each_link(pos, req->link) {
- if (pos->flags & REQ_F_IO_DRAIN) {
- ctx->drain_next = true;
- req->flags |= REQ_F_IO_DRAIN;
- break;
- }
- }
+ u32 seq = io_get_sequence(req);
/* Still need defer if there is pending req in defer list. */
- if (likely(list_empty_careful(&ctx->defer_list) &&
- !(req->flags & REQ_F_IO_DRAIN))) {
+ if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
+queue:
ctx->drain_active = false;
- return false;
+ io_req_task_queue(req);
+ return;
}
- seq = io_get_sequence(req);
- /* Still a chance to pass the sequence check */
- if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
- return false;
-
ret = io_req_prep_async(req);
- if (ret)
- goto fail;
+ if (ret) {
+fail:
+ io_req_complete_failed(req, ret);
+ return;
+ }
io_prep_async_link(req);
de = kmalloc(sizeof(*de), GFP_KERNEL);
if (!de) {
ret = -ENOMEM;
-fail:
- io_req_complete_failed(req, ret);
- return true;
+ goto fail;
}
spin_lock(&ctx->completion_lock);
if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
spin_unlock(&ctx->completion_lock);
kfree(de);
- io_queue_async_work(req, NULL);
- return true;
+ goto queue;
}
trace_io_uring_defer(ctx, req, req->user_data);
@@ -6550,23 +6495,13 @@ fail:
de->seq = seq;
list_add_tail(&de->list, &ctx->defer_list);
spin_unlock(&ctx->completion_lock);
- return true;
}
static void io_clean_op(struct io_kiocb *req)
{
if (req->flags & REQ_F_BUFFER_SELECTED) {
- switch (req->opcode) {
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_READ:
- kfree((void *)(unsigned long)req->rw.addr);
- break;
- case IORING_OP_RECVMSG:
- case IORING_OP_RECV:
- kfree(req->sr_msg.kbuf);
- break;
- }
+ kfree(req->kbuf);
+ req->kbuf = NULL;
}
if (req->flags & REQ_F_NEED_CLEANUP) {
@@ -6631,17 +6566,19 @@ static void io_clean_op(struct io_kiocb *req)
}
if (req->flags & REQ_F_CREDS)
put_cred(req->creds);
-
+ if (req->flags & REQ_F_ASYNC_DATA) {
+ kfree(req->async_data);
+ req->async_data = NULL;
+ }
req->flags &= ~IO_REQ_CLEAN_FLAGS;
}
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
- struct io_ring_ctx *ctx = req->ctx;
const struct cred *creds = NULL;
int ret;
- if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
+ if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
creds = override_creds(req->creds);
switch (req->opcode) {
@@ -6764,8 +6701,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
if (ret)
return ret;
/* If the op doesn't have a file, we're not polling for it */
- if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
- io_iopoll_req_issued(req);
+ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
+ io_iopoll_req_issued(req, issue_flags);
return 0;
}
@@ -6781,6 +6718,8 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
static void io_wq_submit_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ unsigned int issue_flags = IO_URING_F_UNLOCKED;
+ bool needs_poll = false;
struct io_kiocb *timeout;
int ret = 0;
@@ -6795,23 +6734,42 @@ static void io_wq_submit_work(struct io_wq_work *work)
io_queue_linked_timeout(timeout);
/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
- if (work->flags & IO_WQ_WORK_CANCEL)
- ret = -ECANCELED;
+ if (work->flags & IO_WQ_WORK_CANCEL) {
+ io_req_task_queue_fail(req, -ECANCELED);
+ return;
+ }
- if (!ret) {
- do {
- ret = io_issue_sqe(req, 0);
- /*
- * We can get EAGAIN for polled IO even though we're
- * forcing a sync submission from here, since we can't
- * wait for request slots on the block side.
- */
- if (ret != -EAGAIN)
- break;
- cond_resched();
- } while (1);
+ if (req->flags & REQ_F_FORCE_ASYNC) {
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+ bool opcode_poll = def->pollin || def->pollout;
+
+ if (opcode_poll && file_can_poll(req->file)) {
+ needs_poll = true;
+ issue_flags |= IO_URING_F_NONBLOCK;
+ }
}
+ do {
+ ret = io_issue_sqe(req, issue_flags);
+ if (ret != -EAGAIN)
+ break;
+ /*
+ * We can get EAGAIN for iopolled IO even though we're
+ * forcing a sync submission from here, since we can't
+ * wait for request slots on the block side.
+ */
+ if (!needs_poll) {
+ cond_resched();
+ continue;
+ }
+
+ if (io_arm_poll_handler(req) == IO_APOLL_OK)
+ return;
+ /* aborted or ready, in either case retry blocking */
+ needs_poll = false;
+ issue_flags &= ~IO_URING_F_NONBLOCK;
+ } while (1);
+
/* avoid locking problems by failing it from a clean context */
if (ret)
io_req_task_queue_fail(req, ret);
@@ -6835,12 +6793,7 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file
{
unsigned long file_ptr = (unsigned long) file;
- if (__io_file_supports_nowait(file, READ))
- file_ptr |= FFS_ASYNC_READ;
- if (__io_file_supports_nowait(file, WRITE))
- file_ptr |= FFS_ASYNC_WRITE;
- if (S_ISREG(file_inode(file)->i_mode))
- file_ptr |= FFS_ISREG;
+ file_ptr |= io_file_get_flags(file);
file_slot->file_ptr = file_ptr;
}
@@ -6857,8 +6810,8 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
file = (struct file *) (file_ptr & FFS_MASK);
file_ptr &= ~FFS_MASK;
/* mask in overlapping REQ_F and FFS bits */
- req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
- io_req_set_rsrc_node(req);
+ req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
+ io_req_set_rsrc_node(req, ctx);
return file;
}
@@ -6950,67 +6903,66 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
io_put_req(req);
}
-static void __io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
+ __must_hold(&req->ctx->uring_lock)
+{
+ struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+
+ switch (io_arm_poll_handler(req)) {
+ case IO_APOLL_READY:
+ if (linked_timeout) {
+ io_queue_linked_timeout(linked_timeout);
+ linked_timeout = NULL;
+ }
+ io_req_task_queue(req);
+ break;
+ case IO_APOLL_ABORTED:
+ /*
+ * Queued up for async execution, worker will release
+ * submit reference when the iocb is actually submitted.
+ */
+ io_queue_async_work(req, NULL);
+ break;
+ }
+
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
+}
+
+static inline void __io_queue_sqe(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
struct io_kiocb *linked_timeout;
int ret;
-issue_sqe:
ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
+ if (req->flags & REQ_F_COMPLETE_INLINE) {
+ io_req_add_compl_list(req);
+ return;
+ }
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
if (likely(!ret)) {
- if (req->flags & REQ_F_COMPLETE_INLINE) {
- struct io_ring_ctx *ctx = req->ctx;
- struct io_submit_state *state = &ctx->submit_state;
-
- state->compl_reqs[state->compl_nr++] = req;
- if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
- io_submit_flush_completions(ctx);
- return;
- }
-
linked_timeout = io_prep_linked_timeout(req);
if (linked_timeout)
io_queue_linked_timeout(linked_timeout);
} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
- linked_timeout = io_prep_linked_timeout(req);
-
- switch (io_arm_poll_handler(req)) {
- case IO_APOLL_READY:
- if (linked_timeout)
- io_unprep_linked_timeout(req);
- goto issue_sqe;
- case IO_APOLL_ABORTED:
- /*
- * Queued up for async execution, worker will release
- * submit reference when the iocb is actually submitted.
- */
- io_queue_async_work(req, NULL);
- break;
- }
-
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
+ io_queue_sqe_arm_apoll(req);
} else {
io_req_complete_failed(req, ret);
}
}
-static inline void io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe_fallback(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
- if (unlikely(req->ctx->drain_active) && io_drain_req(req))
- return;
-
- if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
- __io_queue_sqe(req);
- } else if (req->flags & REQ_F_FAIL) {
+ if (req->flags & REQ_F_FAIL) {
io_req_complete_fail_submit(req);
+ } else if (unlikely(req->ctx->drain_active)) {
+ io_drain_req(req);
} else {
int ret = io_req_prep_async(req);
@@ -7021,6 +6973,15 @@ static inline void io_queue_sqe(struct io_kiocb *req)
}
}
+static inline void io_queue_sqe(struct io_kiocb *req)
+ __must_hold(&req->ctx->uring_lock)
+{
+ if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
+ __io_queue_sqe(req);
+ else
+ io_queue_sqe_fallback(req);
+}
+
/*
* Check SQE restrictions (opcode and flags).
*
@@ -7030,9 +6991,6 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
struct io_kiocb *req,
unsigned int sqe_flags)
{
- if (likely(!ctx->restricted))
- return true;
-
if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
return false;
@@ -7047,16 +7005,35 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
return true;
}
+static void io_init_req_drain(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *head = ctx->submit_state.link.head;
+
+ ctx->drain_active = true;
+ if (head) {
+ /*
+ * If we need to drain a request in the middle of a link, drain
+ * the head request and the next request/link after the current
+ * link. Considering sequential execution of links,
+ * IOSQE_IO_DRAIN will be maintained for every request of our
+ * link.
+ */
+ head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
+ ctx->drain_next = true;
+ }
+}
+
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
__must_hold(&ctx->uring_lock)
{
- struct io_submit_state *state;
unsigned int sqe_flags;
- int personality, ret = 0;
+ int personality;
+ u8 opcode;
/* req is partially pre-initialised, see io_preinit_req() */
- req->opcode = READ_ONCE(sqe->opcode);
+ req->opcode = opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags = sqe_flags = READ_ONCE(sqe->flags);
req->user_data = READ_ONCE(sqe->user_data);
@@ -7064,19 +7041,52 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->fixed_rsrc_refs = NULL;
req->task = current;
- /* enforce forwards compatibility on users */
- if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
- return -EINVAL;
- if (unlikely(req->opcode >= IORING_OP_LAST))
+ if (unlikely(opcode >= IORING_OP_LAST)) {
+ req->opcode = 0;
return -EINVAL;
- if (!io_check_restriction(ctx, req, sqe_flags))
- return -EACCES;
+ }
+ if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
+ /* enforce forwards compatibility on users */
+ if (sqe_flags & ~SQE_VALID_FLAGS)
+ return -EINVAL;
+ if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+ !io_op_defs[opcode].buffer_select)
+ return -EOPNOTSUPP;
+ if (sqe_flags & IOSQE_IO_DRAIN)
+ io_init_req_drain(req);
+ }
+ if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
+ if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
+ return -EACCES;
+ /* knock it to the slow queue path, will be drained there */
+ if (ctx->drain_active)
+ req->flags |= REQ_F_FORCE_ASYNC;
+ /* if there is no link, we're at "next" request and need to drain */
+ if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
+ ctx->drain_next = false;
+ ctx->drain_active = true;
+ req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
+ }
+ }
- if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
- !io_op_defs[req->opcode].buffer_select)
- return -EOPNOTSUPP;
- if (unlikely(sqe_flags & IOSQE_IO_DRAIN))
- ctx->drain_active = true;
+ if (io_op_defs[opcode].needs_file) {
+ struct io_submit_state *state = &ctx->submit_state;
+
+ /*
+ * Plug now if we have more than 2 IO left after this, and the
+ * target is potentially a read/write to block based storage.
+ */
+ if (state->need_plug && io_op_defs[opcode].plug) {
+ state->plug_started = true;
+ state->need_plug = false;
+ blk_start_plug_nr_ios(&state->plug, state->submit_nr);
+ }
+
+ req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
+ (sqe_flags & IOSQE_FIXED_FILE));
+ if (unlikely(!req->file))
+ return -EBADF;
+ }
personality = READ_ONCE(sqe->personality);
if (personality) {
@@ -7086,27 +7096,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
get_cred(req->creds);
req->flags |= REQ_F_CREDS;
}
- state = &ctx->submit_state;
- /*
- * Plug now if we have more than 1 IO left after this, and the target
- * is potentially a read/write to block based storage.
- */
- if (!state->plug_started && state->ios_left > 1 &&
- io_op_defs[req->opcode].plug) {
- blk_start_plug(&state->plug);
- state->plug_started = true;
- }
-
- if (io_op_defs[req->opcode].needs_file) {
- req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
- (sqe_flags & IOSQE_FIXED_FILE));
- if (unlikely(!req->file))
- ret = -EBADF;
- }
-
- state->ios_left--;
- return ret;
+ return io_req_prep(req, sqe);
}
static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
@@ -7118,7 +7109,8 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
ret = io_init_req(ctx, req, sqe);
if (unlikely(ret)) {
-fail_req:
+ trace_io_uring_req_failed(sqe, ret);
+
/* fail even hard links since we don't submit */
if (link->head) {
/*
@@ -7141,10 +7133,6 @@ fail_req:
return ret;
}
req_fail_link_node(req, ret);
- } else {
- ret = io_req_prep(req, sqe);
- if (unlikely(ret))
- goto fail_req;
}
/* don't need @sqe from now on */
@@ -7174,33 +7162,32 @@ fail_req:
link->last->link = req;
link->last = req;
+ if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
+ return 0;
/* last request of a link, enqueue the link */
- if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
- link->head = NULL;
- io_queue_sqe(head);
- }
- } else {
- if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- link->head = req;
- link->last = req;
- } else {
- io_queue_sqe(req);
- }
+ link->head = NULL;
+ req = head;
+ } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+ link->head = req;
+ link->last = req;
+ return 0;
}
+ io_queue_sqe(req);
return 0;
}
/*
* Batched submission is done, ensure local IO is flushed out.
*/
-static void io_submit_state_end(struct io_submit_state *state,
- struct io_ring_ctx *ctx)
+static void io_submit_state_end(struct io_ring_ctx *ctx)
{
+ struct io_submit_state *state = &ctx->submit_state;
+
if (state->link.head)
io_queue_sqe(state->link.head);
- if (state->compl_nr)
- io_submit_flush_completions(ctx);
+ /* flush only after queuing links as they can generate completions */
+ io_submit_flush_completions(ctx);
if (state->plug_started)
blk_finish_plug(&state->plug);
}
@@ -7212,7 +7199,8 @@ static void io_submit_state_start(struct io_submit_state *state,
unsigned int max_ios)
{
state->plug_started = false;
- state->ios_left = max_ios;
+ state->need_plug = max_ios > 2;
+ state->submit_nr = max_ios;
/* set only head, no need to init link_last in advance */
state->link.head = NULL;
}
@@ -7264,45 +7252,45 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
__must_hold(&ctx->uring_lock)
{
+ unsigned int entries = io_sqring_entries(ctx);
int submitted = 0;
+ if (unlikely(!entries))
+ return 0;
/* make sure SQ entry isn't read before tail */
- nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
- if (!percpu_ref_tryget_many(&ctx->refs, nr))
- return -EAGAIN;
+ nr = min3(nr, ctx->sq_entries, entries);
io_get_task_refs(nr);
io_submit_state_start(&ctx->submit_state, nr);
- while (submitted < nr) {
+ do {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
- req = io_alloc_req(ctx);
- if (unlikely(!req)) {
+ if (unlikely(!io_alloc_req_refill(ctx))) {
if (!submitted)
submitted = -EAGAIN;
break;
}
+ req = io_alloc_req(ctx);
sqe = io_get_sqe(ctx);
if (unlikely(!sqe)) {
- list_add(&req->inflight_entry, &ctx->submit_state.free_list);
+ wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
break;
}
/* will complete beyond this point, count as submitted */
submitted++;
if (io_submit_sqe(ctx, req, sqe))
break;
- }
+ } while (submitted < nr);
if (unlikely(submitted != nr)) {
int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
int unused = nr - ref_used;
current->io_uring->cached_refs += unused;
- percpu_ref_put_many(&ctx->refs, unused);
}
- io_submit_state_end(&ctx->submit_state, ctx);
+ io_submit_state_end(ctx);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
io_commit_sqring(ctx);
@@ -7341,16 +7329,15 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
- if (!list_empty(&ctx->iopoll_list) || to_submit) {
- unsigned nr_events = 0;
+ if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
const struct cred *creds = NULL;
if (ctx->sq_creds != current_cred())
creds = override_creds(ctx->sq_creds);
mutex_lock(&ctx->uring_lock);
- if (!list_empty(&ctx->iopoll_list))
- io_do_iopoll(ctx, &nr_events, 0);
+ if (!wq_list_empty(&ctx->iopoll_list))
+ io_do_iopoll(ctx, true);
/*
* Don't submit if refs are dying, good for io_uring_register(),
@@ -7370,7 +7357,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
return ret;
}
-static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
+static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
{
struct io_ring_ctx *ctx;
unsigned sq_thread_idle = 0;
@@ -7427,7 +7414,7 @@ static int io_sq_thread(void *data)
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
int ret = __io_sq_thread(ctx, cap_entries);
- if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
+ if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
sqt_spin = true;
}
if (io_run_task_work())
@@ -7448,7 +7435,7 @@ static int io_sq_thread(void *data)
io_ring_set_wakeup_flag(ctx);
if ((ctx->flags & IORING_SETUP_IOPOLL) &&
- !list_empty_careful(&ctx->iopoll_list)) {
+ !wq_list_empty(&ctx->iopoll_list)) {
needs_sched = false;
break;
}
@@ -7624,7 +7611,7 @@ static void io_free_page_table(void **table, size_t size)
kfree(table);
}
-static void **io_alloc_page_table(size_t size)
+static __cold void **io_alloc_page_table(size_t size)
{
unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
size_t init_size = size;
@@ -7653,7 +7640,7 @@ static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
kfree(ref_node);
}
-static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
+static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
struct io_ring_ctx *ctx = node->rsrc_data->ctx;
@@ -7699,10 +7686,13 @@ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_data *data_to_kill)
+ __must_hold(&ctx->uring_lock)
{
WARN_ON_ONCE(!ctx->rsrc_backup_node);
WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
+ io_rsrc_refs_drop(ctx);
+
if (data_to_kill) {
struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
@@ -7730,7 +7720,8 @@ static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}
-static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx)
+static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
+ struct io_ring_ctx *ctx)
{
int ret;
@@ -7786,9 +7777,9 @@ static void io_rsrc_data_free(struct io_rsrc_data *data)
kfree(data);
}
-static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
- u64 __user *utags, unsigned nr,
- struct io_rsrc_data **pdata)
+static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
+ u64 __user *utags, unsigned nr,
+ struct io_rsrc_data **pdata)
{
struct io_rsrc_data *data;
int ret = -ENOMEM;
@@ -8356,12 +8347,12 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
unsigned int issue_flags, u32 slot_index)
{
struct io_ring_ctx *ctx = req->ctx;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
bool needs_switch = false;
struct io_fixed_file *file_slot;
int ret = -EBADF;
- io_ring_submit_lock(ctx, !force_nonblock);
+ io_ring_submit_lock(ctx, needs_lock);
if (file->f_op == &io_uring_fops)
goto err;
ret = -ENXIO;
@@ -8402,7 +8393,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
err:
if (needs_switch)
io_rsrc_node_switch(ctx, ctx->file_data);
- io_ring_submit_unlock(ctx, !force_nonblock);
+ io_ring_submit_unlock(ctx, needs_lock);
if (ret)
fput(file);
return ret;
@@ -8412,11 +8403,12 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
{
unsigned int offset = req->close.file_slot - 1;
struct io_ring_ctx *ctx = req->ctx;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_fixed_file *file_slot;
struct file *file;
int ret, i;
- io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_lock(ctx, needs_lock);
ret = -ENXIO;
if (unlikely(!ctx->file_data))
goto out;
@@ -8442,7 +8434,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
io_rsrc_node_switch(ctx, ctx->file_data);
ret = 0;
out:
- io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_unlock(ctx, needs_lock);
return ret;
}
@@ -8558,8 +8550,8 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
return io_wq_create(concurrency, &data);
}
-static int io_uring_alloc_task_context(struct task_struct *task,
- struct io_ring_ctx *ctx)
+static __cold int io_uring_alloc_task_context(struct task_struct *task,
+ struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx;
int ret;
@@ -8606,8 +8598,8 @@ void __io_uring_free(struct task_struct *tsk)
tsk->io_uring = NULL;
}
-static int io_sq_offload_create(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
+static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
{
int ret;
@@ -9218,29 +9210,25 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
}
}
-static void io_req_cache_free(struct list_head *list)
-{
- struct io_kiocb *req, *nxt;
-
- list_for_each_entry_safe(req, nxt, list, inflight_entry) {
- list_del(&req->inflight_entry);
- kmem_cache_free(req_cachep, req);
- }
-}
-
static void io_req_caches_free(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
+ int nr = 0;
mutex_lock(&ctx->uring_lock);
+ io_flush_cached_locked_reqs(ctx, state);
- if (state->free_reqs) {
- kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
- state->free_reqs = 0;
- }
+ while (state->free_list.next) {
+ struct io_wq_work_node *node;
+ struct io_kiocb *req;
- io_flush_cached_locked_reqs(ctx, state);
- io_req_cache_free(&state->free_list);
+ node = wq_stack_extract(&state->free_list);
+ req = container_of(node, struct io_kiocb, comp_list);
+ kmem_cache_free(req_cachep, req);
+ nr++;
+ }
+ if (nr)
+ percpu_ref_put_many(&ctx->refs, nr);
mutex_unlock(&ctx->uring_lock);
}
@@ -9250,7 +9238,7 @@ static void io_wait_rsrc_data(struct io_rsrc_data *data)
wait_for_completion(&data->done);
}
-static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_sq_thread_finish(ctx);
@@ -9259,6 +9247,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
ctx->mm_account = NULL;
}
+ io_rsrc_refs_drop(ctx);
/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
io_wait_rsrc_data(ctx->buf_data);
io_wait_rsrc_data(ctx->file_data);
@@ -9282,6 +9271,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
if (ctx->rsrc_backup_node)
io_rsrc_node_destroy(ctx->rsrc_backup_node);
flush_delayed_work(&ctx->rsrc_put_work);
+ flush_delayed_work(&ctx->fallback_work);
WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
@@ -9312,7 +9302,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
struct io_ring_ctx *ctx = file->private_data;
__poll_t mask = 0;
- poll_wait(file, &ctx->poll_wait, wait);
+ poll_wait(file, &ctx->cq_wait, wait);
/*
* synchronizes with barrier from wq_has_sleeper call in
* io_commit_cqring
@@ -9359,7 +9349,7 @@ struct io_tctx_exit {
struct io_ring_ctx *ctx;
};
-static void io_tctx_exit_cb(struct callback_head *cb)
+static __cold void io_tctx_exit_cb(struct callback_head *cb)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_exit *work;
@@ -9374,14 +9364,14 @@ static void io_tctx_exit_cb(struct callback_head *cb)
complete(&work->completion);
}
-static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
+static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
return req->ctx == data;
}
-static void io_ring_exit_work(struct work_struct *work)
+static __cold void io_ring_exit_work(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
unsigned long timeout = jiffies + HZ * 60 * 5;
@@ -9410,6 +9400,8 @@ static void io_ring_exit_work(struct work_struct *work)
io_sq_thread_unpark(sqd);
}
+ io_req_caches_free(ctx);
+
if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
/* there is little hope left, don't run it too often */
interval = HZ * 60;
@@ -9436,7 +9428,6 @@ static void io_ring_exit_work(struct work_struct *work)
ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
if (WARN_ON_ONCE(ret))
continue;
- wake_up_process(node->task);
mutex_unlock(&ctx->uring_lock);
wait_for_completion(&exit.completion);
@@ -9450,8 +9441,8 @@ static void io_ring_exit_work(struct work_struct *work)
}
/* Returns true if we found and killed one or more timeouts */
-static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
- bool cancel_all)
+static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
+ struct task_struct *tsk, bool cancel_all)
{
struct io_kiocb *req, *tmp;
int canceled = 0;
@@ -9473,7 +9464,7 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
return canceled != 0;
}
-static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
+static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
unsigned long index;
struct creds *creds;
@@ -9535,8 +9526,9 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
return ret;
}
-static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
- struct task_struct *task, bool cancel_all)
+static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ bool cancel_all)
{
struct io_defer_entry *de;
LIST_HEAD(list);
@@ -9561,7 +9553,7 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
return true;
}
-static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
+static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
{
struct io_tctx_node *node;
enum io_wq_cancel cret;
@@ -9585,9 +9577,9 @@ static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
return ret;
}
-static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
- struct task_struct *task,
- bool cancel_all)
+static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ bool cancel_all)
{
struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
struct io_uring_task *tctx = task ? task->io_uring : NULL;
@@ -9611,7 +9603,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
/* SQPOLL thread does its own polling */
if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
(ctx->sq_data && ctx->sq_data->thread == current)) {
- while (!list_empty_careful(&ctx->iopoll_list)) {
+ while (!wq_list_empty(&ctx->iopoll_list)) {
io_iopoll_try_reap_events(ctx);
ret = true;
}
@@ -9638,7 +9630,16 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
ret = io_uring_alloc_task_context(current, ctx);
if (unlikely(ret))
return ret;
+
tctx = current->io_uring;
+ if (ctx->iowq_limits_set) {
+ unsigned int limits[2] = { ctx->iowq_limits[0],
+ ctx->iowq_limits[1], };
+
+ ret = io_wq_max_workers(tctx->io_wq, limits);
+ if (ret)
+ return ret;
+ }
}
if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
node = kmalloc(sizeof(*node), GFP_KERNEL);
@@ -9677,7 +9678,7 @@ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
/*
* Remove this io_uring_file -> task mapping.
*/
-static void io_uring_del_tctx_node(unsigned long index)
+static __cold void io_uring_del_tctx_node(unsigned long index)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_node *node;
@@ -9700,7 +9701,7 @@ static void io_uring_del_tctx_node(unsigned long index)
kfree(node);
}
-static void io_uring_clean_tctx(struct io_uring_task *tctx)
+static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
{
struct io_wq *wq = tctx->io_wq;
struct io_tctx_node *node;
@@ -9727,7 +9728,7 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
return percpu_counter_sum(&tctx->inflight);
}
-static void io_uring_drop_tctx_refs(struct task_struct *task)
+static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
{
struct io_uring_task *tctx = task->io_uring;
unsigned int refs = tctx->cached_refs;
@@ -9743,7 +9744,8 @@ static void io_uring_drop_tctx_refs(struct task_struct *task)
* Find any io_uring ctx that this task has registered or done IO on, and cancel
* requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation.
*/
-static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
+static __cold void io_uring_cancel_generic(bool cancel_all,
+ struct io_sq_data *sqd)
{
struct io_uring_task *tctx = current->io_uring;
struct io_ring_ctx *ctx;
@@ -9836,7 +9838,7 @@ static void *io_uring_validate_mmap_request(struct file *file,
#ifdef CONFIG_MMU
-static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
size_t sz = vma->vm_end - vma->vm_start;
unsigned long pfn;
@@ -10021,7 +10023,7 @@ out_fput:
}
#ifdef CONFIG_PROC_FS
-static int io_uring_show_cred(struct seq_file *m, unsigned int id,
+static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
const struct cred *cred)
{
struct user_namespace *uns = seq_user_ns(m);
@@ -10053,11 +10055,59 @@ static int io_uring_show_cred(struct seq_file *m, unsigned int id,
return 0;
}
-static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
+static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
+ struct seq_file *m)
{
struct io_sq_data *sq = NULL;
+ struct io_overflow_cqe *ocqe;
+ struct io_rings *r = ctx->rings;
+ unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
+ unsigned int sq_head = READ_ONCE(r->sq.head);
+ unsigned int sq_tail = READ_ONCE(r->sq.tail);
+ unsigned int cq_head = READ_ONCE(r->cq.head);
+ unsigned int cq_tail = READ_ONCE(r->cq.tail);
+ unsigned int sq_entries, cq_entries;
bool has_lock;
- int i;
+ unsigned int i;
+
+ /*
+ * we may get imprecise sqe and cqe info if uring is actively running
+ * since we get cached_sq_head and cached_cq_tail without uring_lock
+ * and sq_tail and cq_head are changed by userspace. But it's ok since
+ * we usually use these info when it is stuck.
+ */
+ seq_printf(m, "SqMask:\t\t0x%x\n", sq_mask);
+ seq_printf(m, "SqHead:\t%u\n", sq_head);
+ seq_printf(m, "SqTail:\t%u\n", sq_tail);
+ seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
+ seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
+ seq_printf(m, "CqHead:\t%u\n", cq_head);
+ seq_printf(m, "CqTail:\t%u\n", cq_tail);
+ seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
+ seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
+ sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
+ for (i = 0; i < sq_entries; i++) {
+ unsigned int entry = i + sq_head;
+ unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
+ struct io_uring_sqe *sqe = &ctx->sq_sqes[sq_idx];
+
+ if (sq_idx > sq_mask)
+ continue;
+ sqe = &ctx->sq_sqes[sq_idx];
+ seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
+ sq_idx, sqe->opcode, sqe->fd, sqe->flags,
+ sqe->user_data);
+ }
+ seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
+ cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
+ for (i = 0; i < cq_entries; i++) {
+ unsigned int entry = i + cq_head;
+ struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
+
+ seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
+ entry & cq_mask, cqe->user_data, cqe->res,
+ cqe->flags);
+ }
/*
* Avoid ABBA deadlock between the seq lock and the io_uring mutex,
@@ -10099,7 +10149,10 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
xa_for_each(&ctx->personalities, index, cred)
io_uring_show_cred(m, index, cred);
}
- seq_printf(m, "PollList:\n");
+ if (has_lock)
+ mutex_unlock(&ctx->uring_lock);
+
+ seq_puts(m, "PollList:\n");
spin_lock(&ctx->completion_lock);
for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
struct hlist_head *list = &ctx->cancel_hash[i];
@@ -10109,12 +10162,20 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
req->task->task_works != NULL);
}
+
+ seq_puts(m, "CqOverflowList:\n");
+ list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
+ struct io_uring_cqe *cqe = &ocqe->cqe;
+
+ seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
+ cqe->user_data, cqe->res, cqe->flags);
+
+ }
+
spin_unlock(&ctx->completion_lock);
- if (has_lock)
- mutex_unlock(&ctx->uring_lock);
}
-static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
+static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
{
struct io_ring_ctx *ctx = f->private_data;
@@ -10138,8 +10199,8 @@ static const struct file_operations io_uring_fops = {
#endif
};
-static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
+static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
{
struct io_rings *rings;
size_t size, sq_array_offset;
@@ -10228,8 +10289,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
return file;
}
-static int io_uring_create(unsigned entries, struct io_uring_params *p,
- struct io_uring_params __user *params)
+static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
+ struct io_uring_params __user *params)
{
struct io_ring_ctx *ctx;
struct file *file;
@@ -10387,7 +10448,8 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
return io_uring_setup(entries, params);
}
-static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
{
struct io_uring_probe *p;
size_t size;
@@ -10443,8 +10505,8 @@ static int io_register_personality(struct io_ring_ctx *ctx)
return id;
}
-static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int nr_args)
+static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned int nr_args)
{
struct io_uring_restriction *res;
size_t size;
@@ -10578,7 +10640,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
return __io_register_rsrc_update(ctx, type, &up, up.nr);
}
-static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
+static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type)
{
struct io_uring_rsrc_register rr;
@@ -10604,8 +10666,8 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
return -EINVAL;
}
-static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
- unsigned len)
+static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned len)
{
struct io_uring_task *tctx = current->io_uring;
cpumask_var_t new_mask;
@@ -10631,7 +10693,7 @@ static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
-static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
+static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx = current->io_uring;
@@ -10641,9 +10703,11 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
return io_wq_cpu_affinity(tctx->io_wq, NULL);
}
-static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
- void __user *arg)
+static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
+ void __user *arg)
+ __must_hold(&ctx->uring_lock)
{
+ struct io_tctx_node *node;
struct io_uring_task *tctx = NULL;
struct io_sq_data *sqd = NULL;
__u32 new_count[2];
@@ -10674,13 +10738,19 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
tctx = current->io_uring;
}
- ret = -EINVAL;
- if (!tctx || !tctx->io_wq)
- goto err;
+ BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
- ret = io_wq_max_workers(tctx->io_wq, new_count);
- if (ret)
- goto err;
+ memcpy(ctx->iowq_limits, new_count, sizeof(new_count));
+ ctx->iowq_limits_set = true;
+
+ ret = -EINVAL;
+ if (tctx && tctx->io_wq) {
+ ret = io_wq_max_workers(tctx->io_wq, new_count);
+ if (ret)
+ goto err;
+ } else {
+ memset(new_count, 0, sizeof(new_count));
+ }
if (sqd) {
mutex_unlock(&sqd->lock);
@@ -10690,6 +10760,22 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
if (copy_to_user(arg, new_count, sizeof(new_count)))
return -EFAULT;
+ /* that's it for SQPOLL, only the SQPOLL task creates requests */
+ if (sqd)
+ return 0;
+
+ /* now propagate the restriction to all registered users */
+ list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
+ struct io_uring_task *tctx = node->task->io_uring;
+
+ if (WARN_ON_ONCE(!tctx->io_wq))
+ continue;
+
+ for (i = 0; i < ARRAY_SIZE(new_count); i++)
+ new_count[i] = ctx->iowq_limits[i];
+ /* ignore errors, it always returns zero anyway */
+ (void)io_wq_max_workers(tctx->io_wq, new_count);
+ }
return 0;
err:
if (sqd) {
@@ -10723,7 +10809,7 @@ static bool io_register_op_must_quiesce(int op)
}
}
-static int io_ctx_quiesce(struct io_ring_ctx *ctx)
+static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx)
{
long ret;
@@ -10738,10 +10824,14 @@ static int io_ctx_quiesce(struct io_ring_ctx *ctx)
*/
mutex_unlock(&ctx->uring_lock);
do {
- ret = wait_for_completion_interruptible(&ctx->ref_comp);
- if (!ret)
+ ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ);
+ if (ret) {
+ ret = min(0L, ret);
break;
+ }
+
ret = io_run_task_work_sig();
+ io_req_caches_free(ctx);
} while (ret >= 0);
mutex_lock(&ctx->uring_lock);
@@ -10972,6 +11062,8 @@ static int __init io_uring_init(void)
/* should fit into one byte */
BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
+ BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
+ BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 4ecd255e0511..811c898125a5 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -38,8 +38,7 @@ struct iomap_dio {
struct {
struct iov_iter *iter;
struct task_struct *waiter;
- struct request_queue *last_queue;
- blk_qc_t cookie;
+ struct bio *poll_bio;
} submit;
/* used for aio completion: */
@@ -49,29 +48,20 @@ struct iomap_dio {
};
};
-int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
-{
- struct request_queue *q = READ_ONCE(kiocb->private);
-
- if (!q)
- return 0;
- return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
-}
-EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
-
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
atomic_inc(&dio->ref);
- if (dio->iocb->ki_flags & IOCB_HIPRI)
+ if (dio->iocb->ki_flags & IOCB_HIPRI) {
bio_set_polled(bio, dio->iocb);
+ dio->submit.poll_bio = bio;
+ }
- dio->submit.last_queue = bdev_get_queue(iter->iomap.bdev);
if (dio->dops && dio->dops->submit_io)
- dio->submit.cookie = dio->dops->submit_io(iter, bio, pos);
+ dio->dops->submit_io(iter, bio, pos);
else
- dio->submit.cookie = submit_bio(bio);
+ submit_bio(bio);
}
ssize_t iomap_dio_complete(struct iomap_dio *dio)
@@ -135,7 +125,7 @@ static void iomap_dio_complete_work(struct work_struct *work)
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
struct kiocb *iocb = dio->iocb;
- iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
+ iocb->ki_complete(iocb, iomap_dio_complete(dio));
}
/*
@@ -164,9 +154,11 @@ static void iomap_dio_bio_end_io(struct bio *bio)
} else if (dio->flags & IOMAP_DIO_WRITE) {
struct inode *inode = file_inode(dio->iocb->ki_filp);
+ WRITE_ONCE(dio->iocb->private, NULL);
INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
} else {
+ WRITE_ONCE(dio->iocb->private, NULL);
iomap_dio_complete_work(&dio->aio.work);
}
}
@@ -282,6 +274,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
if (!iov_iter_count(dio->submit.iter))
goto out;
+ /*
+ * We can only poll for single bio I/Os.
+ */
+ if (need_zeroout ||
+ ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
+ dio->iocb->ki_flags &= ~IOCB_HIPRI;
+
if (need_zeroout) {
/* zero out from the start of the block to the write offset */
pad = pos & (fs_block_size - 1);
@@ -339,6 +338,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
BIO_MAX_VECS);
+ /*
+ * We can only poll for single bio I/Os.
+ */
+ if (nr_pages)
+ dio->iocb->ki_flags &= ~IOCB_HIPRI;
iomap_dio_submit_bio(iter, dio, bio, pos);
pos += n;
} while (nr_pages);
@@ -485,8 +489,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->submit.iter = iter;
dio->submit.waiter = current;
- dio->submit.cookie = BLK_QC_T_NONE;
- dio->submit.last_queue = NULL;
+ dio->submit.poll_bio = NULL;
if (iov_iter_rw(iter) == READ) {
if (iomi.pos >= dio->i_size)
@@ -565,8 +568,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
inode_dio_begin(inode);
blk_start_plug(&plug);
- while ((ret = iomap_iter(&iomi, ops)) > 0)
+ while ((ret = iomap_iter(&iomi, ops)) > 0) {
iomi.processed = iomap_dio_iter(&iomi, dio);
+
+ /*
+ * We can only poll for single bio I/Os.
+ */
+ iocb->ki_flags &= ~IOCB_HIPRI;
+ }
+
blk_finish_plug(&plug);
/*
@@ -592,8 +602,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (dio->flags & IOMAP_DIO_WRITE_FUA)
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
- WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
- WRITE_ONCE(iocb->private, dio->submit.last_queue);
+ WRITE_ONCE(iocb->private, dio->submit.poll_bio);
/*
* We are about to drop our additional submission reference, which
@@ -620,10 +629,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (!READ_ONCE(dio->submit.waiter))
break;
- if (!(iocb->ki_flags & IOCB_HIPRI) ||
- !dio->submit.last_queue ||
- !blk_poll(dio->submit.last_queue,
- dio->submit.cookie, true))
+ if (!dio->submit.poll_bio ||
+ !bio_poll(dio->submit.poll_bio, NULL, 0))
blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 176580f54af9..104ae698443e 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -13,6 +13,7 @@
#include <linux/buffer_head.h>
#include <linux/mempool.h>
#include <linux/seq_file.h>
+#include <linux/writeback.h>
#include "jfs_incore.h"
#include "jfs_superblock.h"
#include "jfs_filsys.h"
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index bde787c354fc..8b9a72ae5efa 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -86,8 +86,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
goto out;
}
- VolumeSize = i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits;
-
+ VolumeSize = sb_bdev_nr_blocks(sb);
if (VolumeSize) {
if (newLVSize > VolumeSize) {
printk(KERN_WARNING "jfs_extendfs: invalid size\n");
@@ -199,7 +198,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
txQuiesce(sb);
/* Reset size of direct inode */
- sbi->direct_inode->i_size = i_size_read(sb->s_bdev->bd_inode);
+ sbi->direct_inode->i_size = bdev_nr_bytes(sb->s_bdev);
if (sbi->mntflag & JFS_INLINELOG) {
/*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 9030aeaf0f88..24cbc9946e01 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -284,8 +284,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
}
case Opt_resize_nosize:
{
- *newLVSize = i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits;
+ *newLVSize = sb_bdev_nr_blocks(sb);
if (*newLVSize == 0)
pr_err("JFS: Cannot determine volume size\n");
break;
@@ -551,7 +550,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
ret = -ENOMEM;
goto out_unload;
}
- inode->i_size = i_size_read(sb->s_bdev->bd_inode);
+ inode->i_size = bdev_nr_bytes(sb->s_bdev);
inode->i_mapping->a_ops = &jfs_metapage_aops;
inode_fake_hash(inode);
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index 87aac4c72c37..1b07550485b9 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -178,7 +178,7 @@ int kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
struct fd f = fdget(fd);
int ret = -EBADF;
- if (!f.file)
+ if (!f.file || !(f.file->f_mode & FMODE_READ))
goto out;
ret = kernel_read_file(f.file, offset, buf, buf_size, file_size, id);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index cfc3ce8b815a..8e0a1378a4b1 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1111,7 +1111,14 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
/* attach dentry and inode */
- if (kn && kernfs_active(kn)) {
+ if (kn) {
+ /* Inactive nodes are invisible to the VFS so don't
+ * create a negative.
+ */
+ if (!kernfs_active(kn)) {
+ up_read(&kernfs_rwsem);
+ return NULL;
+ }
inode = kernfs_get_inode(dir->i_sb, kn);
if (!inode)
inode = ERR_PTR(-ENOMEM);
diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c
index 71c989f1568d..30a92ddc1817 100644
--- a/fs/ksmbd/auth.c
+++ b/fs/ksmbd/auth.c
@@ -298,8 +298,8 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
int blob_len, struct ksmbd_session *sess)
{
char *domain_name;
- unsigned int lm_off, nt_off;
- unsigned short nt_len;
+ unsigned int nt_off, dn_off;
+ unsigned short nt_len, dn_len;
int ret;
if (blob_len < sizeof(struct authenticate_message)) {
@@ -314,15 +314,17 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
return -EINVAL;
}
- lm_off = le32_to_cpu(authblob->LmChallengeResponse.BufferOffset);
nt_off = le32_to_cpu(authblob->NtChallengeResponse.BufferOffset);
nt_len = le16_to_cpu(authblob->NtChallengeResponse.Length);
+ dn_off = le32_to_cpu(authblob->DomainName.BufferOffset);
+ dn_len = le16_to_cpu(authblob->DomainName.Length);
+
+ if (blob_len < (u64)dn_off + dn_len || blob_len < (u64)nt_off + nt_len)
+ return -EINVAL;
/* TODO : use domain name that imported from configuration file */
- domain_name = smb_strndup_from_utf16((const char *)authblob +
- le32_to_cpu(authblob->DomainName.BufferOffset),
- le16_to_cpu(authblob->DomainName.Length), true,
- sess->conn->local_nls);
+ domain_name = smb_strndup_from_utf16((const char *)authblob + dn_off,
+ dn_len, true, sess->conn->local_nls);
if (IS_ERR(domain_name))
return PTR_ERR(domain_name);
diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c
index af086d35398a..b57a0d8a392f 100644
--- a/fs/ksmbd/connection.c
+++ b/fs/ksmbd/connection.c
@@ -61,6 +61,8 @@ struct ksmbd_conn *ksmbd_conn_alloc(void)
conn->local_nls = load_nls_default();
atomic_set(&conn->req_running, 0);
atomic_set(&conn->r_count, 0);
+ conn->total_credits = 1;
+
init_waitqueue_head(&conn->req_running_q);
INIT_LIST_HEAD(&conn->conns_list);
INIT_LIST_HEAD(&conn->sessions);
@@ -296,10 +298,12 @@ int ksmbd_conn_handler_loop(void *p)
pdu_size = get_rfc1002_len(hdr_buf);
ksmbd_debug(CONN, "RFC1002 header %u bytes\n", pdu_size);
- /* make sure we have enough to get to SMB header end */
- if (!ksmbd_pdu_size_has_room(pdu_size)) {
- ksmbd_debug(CONN, "SMB request too short (%u bytes)\n",
- pdu_size);
+ /*
+ * Check if pdu size is valid (min : smb header size,
+ * max : 0x00FFFFFF).
+ */
+ if (pdu_size < __SMB2_HEADER_STRUCTURE_SIZE ||
+ pdu_size > MAX_STREAM_PROT_LEN) {
continue;
}
diff --git a/fs/ksmbd/glob.h b/fs/ksmbd/glob.h
index 49a5a3afa118..5b8f3e0ebdb3 100644
--- a/fs/ksmbd/glob.h
+++ b/fs/ksmbd/glob.h
@@ -12,7 +12,7 @@
#include "unicode.h"
#include "vfs_cache.h"
-#define KSMBD_VERSION "3.1.9"
+#define KSMBD_VERSION "3.4.2"
extern int ksmbd_debug_types;
diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h
index 2fbe2bc1e093..c6718a05d347 100644
--- a/fs/ksmbd/ksmbd_netlink.h
+++ b/fs/ksmbd/ksmbd_netlink.h
@@ -211,6 +211,7 @@ struct ksmbd_tree_disconnect_request {
*/
struct ksmbd_logout_request {
__s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */
+ __u32 account_flags;
};
/*
@@ -317,6 +318,7 @@ enum KSMBD_TREE_CONN_STATUS {
#define KSMBD_USER_FLAG_BAD_UID BIT(2)
#define KSMBD_USER_FLAG_BAD_USER BIT(3)
#define KSMBD_USER_FLAG_GUEST_ACCOUNT BIT(4)
+#define KSMBD_USER_FLAG_DELAY_SESSION BIT(5)
/*
* Share config flags.
diff --git a/fs/ksmbd/mgmt/user_config.c b/fs/ksmbd/mgmt/user_config.c
index d21629ae5c89..1019d3677d55 100644
--- a/fs/ksmbd/mgmt/user_config.c
+++ b/fs/ksmbd/mgmt/user_config.c
@@ -55,7 +55,7 @@ struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp)
void ksmbd_free_user(struct ksmbd_user *user)
{
- ksmbd_ipc_logout_request(user->name);
+ ksmbd_ipc_logout_request(user->name, user->flags);
kfree(user->name);
kfree(user->passkey);
kfree(user);
diff --git a/fs/ksmbd/mgmt/user_config.h b/fs/ksmbd/mgmt/user_config.h
index b2bb074a0150..aff80b029579 100644
--- a/fs/ksmbd/mgmt/user_config.h
+++ b/fs/ksmbd/mgmt/user_config.h
@@ -18,6 +18,7 @@ struct ksmbd_user {
size_t passkey_sz;
char *passkey;
+ unsigned int failed_login_count;
};
static inline bool user_guest(struct ksmbd_user *user)
diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c
index 9aa46bb3e10d..030ca57c3784 100644
--- a/fs/ksmbd/smb2misc.c
+++ b/fs/ksmbd/smb2misc.c
@@ -80,18 +80,21 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = {
};
/*
- * Returns the pointer to the beginning of the data area. Length of the data
- * area and the offset to it (from the beginning of the smb are also returned.
+ * Set length of the data area and the offset to arguments.
+ * if they are invalid, return error.
*/
-static char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
+static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
+ struct smb2_hdr *hdr)
{
+ int ret = 0;
+
*off = 0;
*len = 0;
/* error reqeusts do not have data area */
if (hdr->Status && hdr->Status != STATUS_MORE_PROCESSING_REQUIRED &&
(((struct smb2_err_rsp *)hdr)->StructureSize) == SMB2_ERROR_STRUCTURE_SIZE2_LE)
- return NULL;
+ return ret;
/*
* Following commands have data areas so we have to get the location
@@ -165,69 +168,60 @@ static char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
case SMB2_IOCTL:
*off = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputOffset);
*len = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputCount);
-
break;
default:
ksmbd_debug(SMB, "no length check for command\n");
break;
}
- /*
- * Invalid length or offset probably means data area is invalid, but
- * we have little choice but to ignore the data area in this case.
- */
if (*off > 4096) {
- ksmbd_debug(SMB, "offset %d too large, data area ignored\n",
- *off);
- *len = 0;
- *off = 0;
- } else if (*off < 0) {
- ksmbd_debug(SMB,
- "negative offset %d to data invalid ignore data area\n",
- *off);
- *off = 0;
- *len = 0;
- } else if (*len < 0) {
- ksmbd_debug(SMB,
- "negative data length %d invalid, data area ignored\n",
- *len);
- *len = 0;
- } else if (*len > 128 * 1024) {
- ksmbd_debug(SMB, "data area larger than 128K: %d\n", *len);
- *len = 0;
+ ksmbd_debug(SMB, "offset %d too large\n", *off);
+ ret = -EINVAL;
+ } else if ((u64)*off + *len > MAX_STREAM_PROT_LEN) {
+ ksmbd_debug(SMB, "Request is larger than maximum stream protocol length(%u): %llu\n",
+ MAX_STREAM_PROT_LEN, (u64)*off + *len);
+ ret = -EINVAL;
}
- /* return pointer to beginning of data area, ie offset from SMB start */
- if ((*off != 0) && (*len != 0))
- return (char *)hdr + *off;
- else
- return NULL;
+ return ret;
}
/*
* Calculate the size of the SMB message based on the fixed header
* portion, the number of word parameters and the data portion of the message.
*/
-static unsigned int smb2_calc_size(void *buf)
+static int smb2_calc_size(void *buf, unsigned int *len)
{
struct smb2_pdu *pdu = (struct smb2_pdu *)buf;
struct smb2_hdr *hdr = &pdu->hdr;
- int offset; /* the offset from the beginning of SMB to data area */
- int data_length; /* the length of the variable length data area */
+ unsigned int offset; /* the offset from the beginning of SMB to data area */
+ unsigned int data_length; /* the length of the variable length data area */
+ int ret;
+
/* Structure Size has already been checked to make sure it is 64 */
- int len = le16_to_cpu(hdr->StructureSize);
+ *len = le16_to_cpu(hdr->StructureSize);
/*
* StructureSize2, ie length of fixed parameter area has already
* been checked to make sure it is the correct length.
*/
- len += le16_to_cpu(pdu->StructureSize2);
+ *len += le16_to_cpu(pdu->StructureSize2);
+ /*
+ * StructureSize2 of smb2_lock pdu is set to 48, indicating
+ * the size of smb2 lock request with single smb2_lock_element
+ * regardless of number of locks. Subtract single
+ * smb2_lock_element for correct buffer size check.
+ */
+ if (hdr->Command == SMB2_LOCK)
+ *len -= sizeof(struct smb2_lock_element);
if (has_smb2_data_area[le16_to_cpu(hdr->Command)] == false)
goto calc_size_exit;
- smb2_get_data_area_len(&offset, &data_length, hdr);
- ksmbd_debug(SMB, "SMB2 data length %d offset %d\n", data_length,
+ ret = smb2_get_data_area_len(&offset, &data_length, hdr);
+ if (ret)
+ return ret;
+ ksmbd_debug(SMB, "SMB2 data length %u offset %u\n", data_length,
offset);
if (data_length > 0) {
@@ -237,16 +231,19 @@ static unsigned int smb2_calc_size(void *buf)
* for some commands, typically those with odd StructureSize,
* so we must add one to the calculation.
*/
- if (offset + 1 < len)
+ if (offset + 1 < *len) {
ksmbd_debug(SMB,
- "data area offset %d overlaps SMB2 header %d\n",
- offset + 1, len);
- else
- len = offset + data_length;
+ "data area offset %d overlaps SMB2 header %u\n",
+ offset + 1, *len);
+ return -EINVAL;
+ }
+
+ *len = offset + data_length;
}
+
calc_size_exit:
- ksmbd_debug(SMB, "SMB2 len %d\n", len);
- return len;
+ ksmbd_debug(SMB, "SMB2 len %u\n", *len);
+ return 0;
}
static inline int smb2_query_info_req_len(struct smb2_query_info_req *h)
@@ -287,11 +284,13 @@ static inline int smb2_ioctl_resp_len(struct smb2_ioctl_req *h)
le32_to_cpu(h->MaxOutputResponse);
}
-static int smb2_validate_credit_charge(struct smb2_hdr *hdr)
+static int smb2_validate_credit_charge(struct ksmbd_conn *conn,
+ struct smb2_hdr *hdr)
{
- int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len;
- int credit_charge = le16_to_cpu(hdr->CreditCharge);
+ unsigned int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len;
+ unsigned short credit_charge = le16_to_cpu(hdr->CreditCharge);
void *__hdr = hdr;
+ int ret;
switch (hdr->Command) {
case SMB2_QUERY_INFO:
@@ -313,21 +312,37 @@ static int smb2_validate_credit_charge(struct smb2_hdr *hdr)
req_len = smb2_ioctl_req_len(__hdr);
expect_resp_len = smb2_ioctl_resp_len(__hdr);
break;
- default:
+ case SMB2_CANCEL:
return 0;
+ default:
+ req_len = 1;
+ break;
}
- credit_charge = max(1, credit_charge);
- max_len = max(req_len, expect_resp_len);
+ credit_charge = max_t(unsigned short, credit_charge, 1);
+ max_len = max_t(unsigned int, req_len, expect_resp_len);
calc_credit_num = DIV_ROUND_UP(max_len, SMB2_MAX_BUFFER_SIZE);
if (credit_charge < calc_credit_num) {
- pr_err("Insufficient credit charge, given: %d, needed: %d\n",
- credit_charge, calc_credit_num);
+ ksmbd_debug(SMB, "Insufficient credit charge, given: %d, needed: %d\n",
+ credit_charge, calc_credit_num);
+ return 1;
+ } else if (credit_charge > conn->max_credits) {
+ ksmbd_debug(SMB, "Too large credit charge: %d\n", credit_charge);
return 1;
}
- return 0;
+ spin_lock(&conn->credits_lock);
+ if (credit_charge <= conn->total_credits) {
+ conn->total_credits -= credit_charge;
+ ret = 0;
+ } else {
+ ksmbd_debug(SMB, "Insufficient credits granted, given: %u, granted: %u\n",
+ credit_charge, conn->total_credits);
+ ret = 1;
+ }
+ spin_unlock(&conn->credits_lock);
+ return ret;
}
int ksmbd_smb2_check_message(struct ksmbd_work *work)
@@ -385,24 +400,20 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
}
}
- if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) &&
- smb2_validate_credit_charge(hdr)) {
- work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER);
+ if (smb2_calc_size(hdr, &clc_len))
return 1;
- }
- clc_len = smb2_calc_size(hdr);
if (len != clc_len) {
- /* server can return one byte more due to implied bcc[0] */
+ /* client can return one byte more due to implied bcc[0] */
if (clc_len == len + 1)
- return 0;
+ goto validate_credit;
/*
* Some windows servers (win2016) will pad also the final
* PDU in a compound to 8 bytes.
*/
if (ALIGN(clc_len, 8) == len)
- return 0;
+ goto validate_credit;
/*
* windows client also pad up to 8 bytes when compounding.
@@ -415,12 +426,9 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
"cli req padded more than expected. Length %d not %d for cmd:%d mid:%llu\n",
len, clc_len, command,
le64_to_cpu(hdr->MessageId));
- return 0;
+ goto validate_credit;
}
- if (command == SMB2_LOCK_HE && len == 88)
- return 0;
-
ksmbd_debug(SMB,
"cli req too short, len %d not %d. cmd:%d mid:%llu\n",
len, clc_len, command,
@@ -429,6 +437,13 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
return 1;
}
+validate_credit:
+ if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) &&
+ smb2_validate_credit_charge(work->conn, hdr)) {
+ work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER);
+ return 1;
+ }
+
return 0;
}
diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c
index 197473871aa4..fb6a65d23139 100644
--- a/fs/ksmbd/smb2ops.c
+++ b/fs/ksmbd/smb2ops.c
@@ -187,11 +187,6 @@ static struct smb_version_cmds smb2_0_server_cmds[NUMBER_OF_SMB2_COMMANDS] = {
[SMB2_CHANGE_NOTIFY_HE] = { .proc = smb2_notify},
};
-int init_smb2_0_server(struct ksmbd_conn *conn)
-{
- return -EOPNOTSUPP;
-}
-
/**
* init_smb2_1_server() - initialize a smb server connection with smb2.1
* command dispatcher
@@ -289,6 +284,7 @@ int init_smb3_11_server(struct ksmbd_conn *conn)
void init_smb2_max_read_size(unsigned int sz)
{
+ sz = clamp_val(sz, SMB3_MIN_IOSIZE, SMB3_MAX_IOSIZE);
smb21_server_values.max_read_size = sz;
smb30_server_values.max_read_size = sz;
smb302_server_values.max_read_size = sz;
@@ -297,6 +293,7 @@ void init_smb2_max_read_size(unsigned int sz)
void init_smb2_max_write_size(unsigned int sz)
{
+ sz = clamp_val(sz, SMB3_MIN_IOSIZE, SMB3_MAX_IOSIZE);
smb21_server_values.max_write_size = sz;
smb30_server_values.max_write_size = sz;
smb302_server_values.max_write_size = sz;
@@ -305,6 +302,7 @@ void init_smb2_max_write_size(unsigned int sz)
void init_smb2_max_trans_size(unsigned int sz)
{
+ sz = clamp_val(sz, SMB3_MIN_IOSIZE, SMB3_MAX_IOSIZE);
smb21_server_values.max_trans_size = sz;
smb30_server_values.max_trans_size = sz;
smb302_server_values.max_trans_size = sz;
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index dcf907738610..7e448df3f847 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -236,9 +236,6 @@ int init_smb2_neg_rsp(struct ksmbd_work *work)
if (conn->need_neg == false)
return -EINVAL;
- if (!(conn->dialect >= SMB20_PROT_ID &&
- conn->dialect <= SMB311_PROT_ID))
- return -EINVAL;
rsp_hdr = work->response_buf;
@@ -295,22 +292,6 @@ int init_smb2_neg_rsp(struct ksmbd_work *work)
return 0;
}
-static int smb2_consume_credit_charge(struct ksmbd_work *work,
- unsigned short credit_charge)
-{
- struct ksmbd_conn *conn = work->conn;
- unsigned int rsp_credits = 1;
-
- if (!conn->total_credits)
- return 0;
-
- if (credit_charge > 0)
- rsp_credits = credit_charge;
-
- conn->total_credits -= rsp_credits;
- return rsp_credits;
-}
-
/**
* smb2_set_rsp_credits() - set number of credits in response buffer
* @work: smb work containing smb response buffer
@@ -320,49 +301,43 @@ int smb2_set_rsp_credits(struct ksmbd_work *work)
struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work);
struct smb2_hdr *hdr = ksmbd_resp_buf_next(work);
struct ksmbd_conn *conn = work->conn;
- unsigned short credits_requested = le16_to_cpu(req_hdr->CreditRequest);
- unsigned short credit_charge = 1, credits_granted = 0;
- unsigned short aux_max, aux_credits, min_credits;
- int rsp_credit_charge;
+ unsigned short credits_requested;
+ unsigned short credit_charge, credits_granted = 0;
+ unsigned short aux_max, aux_credits;
- if (hdr->Command == SMB2_CANCEL)
- goto out;
+ if (work->send_no_response)
+ return 0;
- /* get default minimum credits by shifting maximum credits by 4 */
- min_credits = conn->max_credits >> 4;
+ hdr->CreditCharge = req_hdr->CreditCharge;
- if (conn->total_credits >= conn->max_credits) {
+ if (conn->total_credits > conn->max_credits) {
+ hdr->CreditRequest = 0;
pr_err("Total credits overflow: %d\n", conn->total_credits);
- conn->total_credits = min_credits;
+ return -EINVAL;
}
- rsp_credit_charge =
- smb2_consume_credit_charge(work, le16_to_cpu(req_hdr->CreditCharge));
- if (rsp_credit_charge < 0)
- return -EINVAL;
+ credit_charge = max_t(unsigned short,
+ le16_to_cpu(req_hdr->CreditCharge), 1);
+ credits_requested = max_t(unsigned short,
+ le16_to_cpu(req_hdr->CreditRequest), 1);
- hdr->CreditCharge = cpu_to_le16(rsp_credit_charge);
+ /* according to smb2.credits smbtorture, Windows server
+ * 2016 or later grant up to 8192 credits at once.
+ *
+ * TODO: Need to adjuct CreditRequest value according to
+ * current cpu load
+ */
+ aux_credits = credits_requested - 1;
+ if (hdr->Command == SMB2_NEGOTIATE)
+ aux_max = 0;
+ else
+ aux_max = conn->max_credits - credit_charge;
+ aux_credits = min_t(unsigned short, aux_credits, aux_max);
+ credits_granted = credit_charge + aux_credits;
- if (credits_requested > 0) {
- aux_credits = credits_requested - 1;
- aux_max = 32;
- if (hdr->Command == SMB2_NEGOTIATE)
- aux_max = 0;
- aux_credits = (aux_credits < aux_max) ? aux_credits : aux_max;
- credits_granted = aux_credits + credit_charge;
-
- /* if credits granted per client is getting bigger than default
- * minimum credits then we should wrap it up within the limits.
- */
- if ((conn->total_credits + credits_granted) > min_credits)
- credits_granted = min_credits - conn->total_credits;
- /*
- * TODO: Need to adjuct CreditRequest value according to
- * current cpu load
- */
- } else if (conn->total_credits == 0) {
- credits_granted = 1;
- }
+ if (conn->max_credits - conn->total_credits < credits_granted)
+ credits_granted = conn->max_credits -
+ conn->total_credits;
conn->total_credits += credits_granted;
work->credits_granted += credits_granted;
@@ -371,7 +346,6 @@ int smb2_set_rsp_credits(struct ksmbd_work *work)
/* Update CreditRequest in last request */
hdr->CreditRequest = cpu_to_le16(work->credits_granted);
}
-out:
ksmbd_debug(SMB,
"credits: requested[%d] granted[%d] total_granted[%d]\n",
credits_requested, credits_granted,
@@ -475,6 +449,12 @@ bool is_chained_smb2_message(struct ksmbd_work *work)
return false;
}
+ if ((u64)get_rfc1002_len(work->response_buf) + MAX_CIFS_SMALL_BUFFER_SIZE >
+ work->response_sz) {
+ pr_err("next response offset exceeds response buffer size\n");
+ return false;
+ }
+
ksmbd_debug(SMB, "got SMB2 chained command\n");
init_chained_smb2_rsp(work);
return true;
@@ -544,7 +524,7 @@ int smb2_allocate_rsp_buf(struct ksmbd_work *work)
{
struct smb2_hdr *hdr = work->request_buf;
size_t small_sz = MAX_CIFS_SMALL_BUFFER_SIZE;
- size_t large_sz = work->conn->vals->max_trans_size + MAX_SMB2_HDR_SIZE;
+ size_t large_sz = small_sz + work->conn->vals->max_trans_size;
size_t sz = small_sz;
int cmd = le16_to_cpu(hdr->Command);
@@ -1166,13 +1146,6 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
case SMB21_PROT_ID:
init_smb2_1_server(conn);
break;
- case SMB20_PROT_ID:
- rc = init_smb2_0_server(conn);
- if (rc) {
- rsp->hdr.Status = STATUS_NOT_SUPPORTED;
- goto err_out;
- }
- break;
case SMB2X_PROT_ID:
case BAD_PROT_ID:
default:
@@ -1191,11 +1164,9 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
rsp->MaxReadSize = cpu_to_le32(conn->vals->max_read_size);
rsp->MaxWriteSize = cpu_to_le32(conn->vals->max_write_size);
- if (conn->dialect > SMB20_PROT_ID) {
- memcpy(conn->ClientGUID, req->ClientGUID,
- SMB2_CLIENT_GUID_SIZE);
- conn->cli_sec_mode = le16_to_cpu(req->SecurityMode);
- }
+ memcpy(conn->ClientGUID, req->ClientGUID,
+ SMB2_CLIENT_GUID_SIZE);
+ conn->cli_sec_mode = le16_to_cpu(req->SecurityMode);
rsp->StructureSize = cpu_to_le16(65);
rsp->DialectRevision = cpu_to_le16(conn->dialect);
@@ -1286,19 +1257,13 @@ static int generate_preauth_hash(struct ksmbd_work *work)
return 0;
}
-static int decode_negotiation_token(struct ksmbd_work *work,
- struct negotiate_message *negblob)
+static int decode_negotiation_token(struct ksmbd_conn *conn,
+ struct negotiate_message *negblob,
+ size_t sz)
{
- struct ksmbd_conn *conn = work->conn;
- struct smb2_sess_setup_req *req;
- int sz;
-
if (!conn->use_spnego)
return -EINVAL;
- req = work->request_buf;
- sz = le16_to_cpu(req->SecurityBufferLength);
-
if (ksmbd_decode_negTokenInit((char *)negblob, sz, conn)) {
if (ksmbd_decode_negTokenTarg((char *)negblob, sz, conn)) {
conn->auth_mechs |= KSMBD_AUTH_NTLMSSP;
@@ -1310,9 +1275,9 @@ static int decode_negotiation_token(struct ksmbd_work *work,
}
static int ntlm_negotiate(struct ksmbd_work *work,
- struct negotiate_message *negblob)
+ struct negotiate_message *negblob,
+ size_t negblob_len)
{
- struct smb2_sess_setup_req *req = work->request_buf;
struct smb2_sess_setup_rsp *rsp = work->response_buf;
struct challenge_message *chgblob;
unsigned char *spnego_blob = NULL;
@@ -1321,8 +1286,7 @@ static int ntlm_negotiate(struct ksmbd_work *work,
int sz, rc;
ksmbd_debug(SMB, "negotiate phase\n");
- sz = le16_to_cpu(req->SecurityBufferLength);
- rc = ksmbd_decode_ntlmssp_neg_blob(negblob, sz, work->sess);
+ rc = ksmbd_decode_ntlmssp_neg_blob(negblob, negblob_len, work->sess);
if (rc)
return rc;
@@ -1390,12 +1354,23 @@ static struct ksmbd_user *session_user(struct ksmbd_conn *conn,
struct authenticate_message *authblob;
struct ksmbd_user *user;
char *name;
- int sz;
+ unsigned int auth_msg_len, name_off, name_len, secbuf_len;
+ secbuf_len = le16_to_cpu(req->SecurityBufferLength);
+ if (secbuf_len < sizeof(struct authenticate_message)) {
+ ksmbd_debug(SMB, "blob len %d too small\n", secbuf_len);
+ return NULL;
+ }
authblob = user_authblob(conn, req);
- sz = le32_to_cpu(authblob->UserName.BufferOffset);
- name = smb_strndup_from_utf16((const char *)authblob + sz,
- le16_to_cpu(authblob->UserName.Length),
+ name_off = le32_to_cpu(authblob->UserName.BufferOffset);
+ name_len = le16_to_cpu(authblob->UserName.Length);
+ auth_msg_len = le16_to_cpu(req->SecurityBufferOffset) + secbuf_len;
+
+ if (auth_msg_len < (u64)name_off + name_len)
+ return NULL;
+
+ name = smb_strndup_from_utf16((const char *)authblob + name_off,
+ name_len,
true,
conn->local_nls);
if (IS_ERR(name)) {
@@ -1537,11 +1512,9 @@ binding_session:
}
}
- if (conn->dialect > SMB20_PROT_ID) {
- if (!ksmbd_conn_lookup_dialect(conn)) {
- pr_err("fail to verify the dialect\n");
- return -ENOENT;
- }
+ if (!ksmbd_conn_lookup_dialect(conn)) {
+ pr_err("fail to verify the dialect\n");
+ return -ENOENT;
}
return 0;
}
@@ -1623,11 +1596,9 @@ static int krb5_authenticate(struct ksmbd_work *work)
}
}
- if (conn->dialect > SMB20_PROT_ID) {
- if (!ksmbd_conn_lookup_dialect(conn)) {
- pr_err("fail to verify the dialect\n");
- return -ENOENT;
- }
+ if (!ksmbd_conn_lookup_dialect(conn)) {
+ pr_err("fail to verify the dialect\n");
+ return -ENOENT;
}
return 0;
}
@@ -1645,6 +1616,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
struct smb2_sess_setup_rsp *rsp = work->response_buf;
struct ksmbd_session *sess;
struct negotiate_message *negblob;
+ unsigned int negblob_len, negblob_off;
int rc = 0;
ksmbd_debug(SMB, "Received request for session setup\n");
@@ -1725,10 +1697,16 @@ int smb2_sess_setup(struct ksmbd_work *work)
if (sess->state == SMB2_SESSION_EXPIRED)
sess->state = SMB2_SESSION_IN_PROGRESS;
+ negblob_off = le16_to_cpu(req->SecurityBufferOffset);
+ negblob_len = le16_to_cpu(req->SecurityBufferLength);
+ if (negblob_off < (offsetof(struct smb2_sess_setup_req, Buffer) - 4) ||
+ negblob_len < offsetof(struct negotiate_message, NegotiateFlags))
+ return -EINVAL;
+
negblob = (struct negotiate_message *)((char *)&req->hdr.ProtocolId +
- le16_to_cpu(req->SecurityBufferOffset));
+ negblob_off);
- if (decode_negotiation_token(work, negblob) == 0) {
+ if (decode_negotiation_token(conn, negblob, negblob_len) == 0) {
if (conn->mechToken)
negblob = (struct negotiate_message *)conn->mechToken;
}
@@ -1752,7 +1730,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
sess->Preauth_HashValue = NULL;
} else if (conn->preferred_auth_mech == KSMBD_AUTH_NTLMSSP) {
if (negblob->MessageType == NtLmNegotiate) {
- rc = ntlm_negotiate(work, negblob);
+ rc = ntlm_negotiate(work, negblob, negblob_len);
if (rc)
goto out_err;
rsp->hdr.Status =
@@ -1812,9 +1790,30 @@ out_err:
conn->mechToken = NULL;
}
- if (rc < 0 && sess) {
- ksmbd_session_destroy(sess);
- work->sess = NULL;
+ if (rc < 0) {
+ /*
+ * SecurityBufferOffset should be set to zero
+ * in session setup error response.
+ */
+ rsp->SecurityBufferOffset = 0;
+
+ if (sess) {
+ bool try_delay = false;
+
+ /*
+ * To avoid dictionary attacks (repeated session setups rapidly sent) to
+ * connect to server, ksmbd make a delay of a 5 seconds on session setup
+ * failure to make it harder to send enough random connection requests
+ * to break into a server.
+ */
+ if (sess->user && sess->user->flags & KSMBD_USER_FLAG_DELAY_SESSION)
+ try_delay = true;
+
+ ksmbd_session_destroy(sess);
+ work->sess = NULL;
+ if (try_delay)
+ ssleep(5);
+ }
}
return rc;
@@ -3795,6 +3794,24 @@ static int verify_info_level(int info_level)
return 0;
}
+static int smb2_calc_max_out_buf_len(struct ksmbd_work *work,
+ unsigned short hdr2_len,
+ unsigned int out_buf_len)
+{
+ int free_len;
+
+ if (out_buf_len > work->conn->vals->max_trans_size)
+ return -EINVAL;
+
+ free_len = (int)(work->response_sz -
+ (get_rfc1002_len(work->response_buf) + 4)) -
+ hdr2_len;
+ if (free_len < 0)
+ return -EINVAL;
+
+ return min_t(int, out_buf_len, free_len);
+}
+
int smb2_query_dir(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
@@ -3871,9 +3888,13 @@ int smb2_query_dir(struct ksmbd_work *work)
memset(&d_info, 0, sizeof(struct ksmbd_dir_info));
d_info.wptr = (char *)rsp->Buffer;
d_info.rptr = (char *)rsp->Buffer;
- d_info.out_buf_len = (work->response_sz - (get_rfc1002_len(rsp_org) + 4));
- d_info.out_buf_len = min_t(int, d_info.out_buf_len, le32_to_cpu(req->OutputBufferLength)) -
- sizeof(struct smb2_query_directory_rsp);
+ d_info.out_buf_len =
+ smb2_calc_max_out_buf_len(work, 8,
+ le32_to_cpu(req->OutputBufferLength));
+ if (d_info.out_buf_len < 0) {
+ rc = -EINVAL;
+ goto err_out;
+ }
d_info.flags = srch_flag;
/*
@@ -4107,12 +4128,11 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp,
le32_to_cpu(req->Flags));
}
- buf_free_len = work->response_sz -
- (get_rfc1002_len(rsp_org) + 4) -
- sizeof(struct smb2_query_info_rsp);
-
- if (le32_to_cpu(req->OutputBufferLength) < buf_free_len)
- buf_free_len = le32_to_cpu(req->OutputBufferLength);
+ buf_free_len =
+ smb2_calc_max_out_buf_len(work, 8,
+ le32_to_cpu(req->OutputBufferLength));
+ if (buf_free_len < 0)
+ return -EINVAL;
rc = ksmbd_vfs_listxattr(path->dentry, &xattr_list);
if (rc < 0) {
@@ -4423,6 +4443,8 @@ static void get_file_stream_info(struct ksmbd_work *work,
struct path *path = &fp->filp->f_path;
ssize_t xattr_list_len;
int nbytes = 0, streamlen, stream_name_len, next, idx = 0;
+ int buf_free_len;
+ struct smb2_query_info_req *req = ksmbd_req_buf_next(work);
generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp),
&stat);
@@ -4436,6 +4458,12 @@ static void get_file_stream_info(struct ksmbd_work *work,
goto out;
}
+ buf_free_len =
+ smb2_calc_max_out_buf_len(work, 8,
+ le32_to_cpu(req->OutputBufferLength));
+ if (buf_free_len < 0)
+ goto out;
+
while (idx < xattr_list_len) {
stream_name = xattr_list + idx;
streamlen = strlen(stream_name);
@@ -4460,6 +4488,10 @@ static void get_file_stream_info(struct ksmbd_work *work,
streamlen = snprintf(stream_buf, streamlen + 1,
":%s", &stream_name[XATTR_NAME_STREAM_LEN]);
+ next = sizeof(struct smb2_file_stream_info) + streamlen * 2;
+ if (next > buf_free_len)
+ break;
+
file_info = (struct smb2_file_stream_info *)&rsp->Buffer[nbytes];
streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName,
stream_buf, streamlen,
@@ -4470,12 +4502,13 @@ static void get_file_stream_info(struct ksmbd_work *work,
file_info->StreamSize = cpu_to_le64(stream_name_len);
file_info->StreamAllocationSize = cpu_to_le64(stream_name_len);
- next = sizeof(struct smb2_file_stream_info) + streamlen;
nbytes += next;
+ buf_free_len -= next;
file_info->NextEntryOffset = cpu_to_le32(next);
}
- if (!S_ISDIR(stat.mode)) {
+ if (!S_ISDIR(stat.mode) &&
+ buf_free_len >= sizeof(struct smb2_file_stream_info) + 7 * 2) {
file_info = (struct smb2_file_stream_info *)
&rsp->Buffer[nbytes];
streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName,
@@ -5499,7 +5532,6 @@ static int set_file_basic_info(struct ksmbd_file *fp,
struct ksmbd_share_config *share)
{
struct iattr attrs;
- struct timespec64 ctime;
struct file *filp;
struct inode *inode;
struct user_namespace *user_ns;
@@ -5521,13 +5553,11 @@ static int set_file_basic_info(struct ksmbd_file *fp,
attrs.ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
}
- if (file_info->ChangeTime) {
+ attrs.ia_valid |= ATTR_CTIME;
+ if (file_info->ChangeTime)
attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime);
- ctime = attrs.ia_ctime;
- attrs.ia_valid |= ATTR_CTIME;
- } else {
- ctime = inode->i_ctime;
- }
+ else
+ attrs.ia_ctime = inode->i_ctime;
if (file_info->LastWriteTime) {
attrs.ia_mtime = ksmbd_NTtimeToUnix(file_info->LastWriteTime);
@@ -5573,11 +5603,9 @@ static int set_file_basic_info(struct ksmbd_file *fp,
return -EACCES;
inode_lock(inode);
+ inode->i_ctime = attrs.ia_ctime;
+ attrs.ia_valid &= ~ATTR_CTIME;
rc = notify_change(user_ns, dentry, &attrs, NULL);
- if (!rc) {
- inode->i_ctime = ctime;
- mark_inode_dirty(inode);
- }
inode_unlock(inode);
}
return rc;
@@ -6241,8 +6269,7 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work)
(offsetof(struct smb2_write_req, Buffer) - 4)) {
data_buf = (char *)&req->Buffer[0];
} else {
- if ((le16_to_cpu(req->DataOffset) > get_rfc1002_len(req)) ||
- (le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req))) {
+ if ((u64)le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req)) {
pr_err("invalid write data offset %u, smb_len %u\n",
le16_to_cpu(req->DataOffset),
get_rfc1002_len(req));
@@ -6400,8 +6427,7 @@ int smb2_write(struct ksmbd_work *work)
(offsetof(struct smb2_write_req, Buffer) - 4)) {
data_buf = (char *)&req->Buffer[0];
} else {
- if ((le16_to_cpu(req->DataOffset) > get_rfc1002_len(req)) ||
- (le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req))) {
+ if ((u64)le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req)) {
pr_err("invalid write data offset %u, smb_len %u\n",
le16_to_cpu(req->DataOffset),
get_rfc1002_len(req));
@@ -7044,24 +7070,26 @@ out2:
return err;
}
-static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req,
+static int fsctl_copychunk(struct ksmbd_work *work,
+ struct copychunk_ioctl_req *ci_req,
+ unsigned int cnt_code,
+ unsigned int input_count,
+ unsigned long long volatile_id,
+ unsigned long long persistent_id,
struct smb2_ioctl_rsp *rsp)
{
- struct copychunk_ioctl_req *ci_req;
struct copychunk_ioctl_rsp *ci_rsp;
struct ksmbd_file *src_fp = NULL, *dst_fp = NULL;
struct srv_copychunk *chunks;
unsigned int i, chunk_count, chunk_count_written = 0;
unsigned int chunk_size_written = 0;
loff_t total_size_written = 0;
- int ret, cnt_code;
+ int ret = 0;
- cnt_code = le32_to_cpu(req->CntCode);
- ci_req = (struct copychunk_ioctl_req *)&req->Buffer[0];
ci_rsp = (struct copychunk_ioctl_rsp *)&rsp->Buffer[0];
- rsp->VolatileFileId = req->VolatileFileId;
- rsp->PersistentFileId = req->PersistentFileId;
+ rsp->VolatileFileId = cpu_to_le64(volatile_id);
+ rsp->PersistentFileId = cpu_to_le64(persistent_id);
ci_rsp->ChunksWritten =
cpu_to_le32(ksmbd_server_side_copy_max_chunk_count());
ci_rsp->ChunkBytesWritten =
@@ -7071,12 +7099,13 @@ static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req,
chunks = (struct srv_copychunk *)&ci_req->Chunks[0];
chunk_count = le32_to_cpu(ci_req->ChunkCount);
+ if (chunk_count == 0)
+ goto out;
total_size_written = 0;
/* verify the SRV_COPYCHUNK_COPY packet */
if (chunk_count > ksmbd_server_side_copy_max_chunk_count() ||
- le32_to_cpu(req->InputCount) <
- offsetof(struct copychunk_ioctl_req, Chunks) +
+ input_count < offsetof(struct copychunk_ioctl_req, Chunks) +
chunk_count * sizeof(struct srv_copychunk)) {
rsp->hdr.Status = STATUS_INVALID_PARAMETER;
return -EINVAL;
@@ -7097,9 +7126,7 @@ static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req,
src_fp = ksmbd_lookup_foreign_fd(work,
le64_to_cpu(ci_req->ResumeKey[0]));
- dst_fp = ksmbd_lookup_fd_slow(work,
- le64_to_cpu(req->VolatileFileId),
- le64_to_cpu(req->PersistentFileId));
+ dst_fp = ksmbd_lookup_fd_slow(work, volatile_id, persistent_id);
ret = -EINVAL;
if (!src_fp ||
src_fp->persistent_id != le64_to_cpu(ci_req->ResumeKey[1])) {
@@ -7174,8 +7201,8 @@ static __be32 idev_ipv4_address(struct in_device *idev)
}
static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
- struct smb2_ioctl_req *req,
- struct smb2_ioctl_rsp *rsp)
+ struct smb2_ioctl_rsp *rsp,
+ unsigned int out_buf_len)
{
struct network_interface_info_ioctl_rsp *nii_rsp = NULL;
int nbytes = 0;
@@ -7187,6 +7214,12 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
rtnl_lock();
for_each_netdev(&init_net, netdev) {
+ if (out_buf_len <
+ nbytes + sizeof(struct network_interface_info_ioctl_rsp)) {
+ rtnl_unlock();
+ return -ENOSPC;
+ }
+
if (netdev->type == ARPHRD_LOOPBACK)
continue;
@@ -7266,11 +7299,6 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
if (nii_rsp)
nii_rsp->Next = 0;
- if (!nbytes) {
- rsp->hdr.Status = STATUS_BUFFER_TOO_SMALL;
- return -EINVAL;
- }
-
rsp->PersistentFileId = cpu_to_le64(SMB2_NO_FID);
rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID);
return nbytes;
@@ -7278,11 +7306,16 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
static int fsctl_validate_negotiate_info(struct ksmbd_conn *conn,
struct validate_negotiate_info_req *neg_req,
- struct validate_negotiate_info_rsp *neg_rsp)
+ struct validate_negotiate_info_rsp *neg_rsp,
+ unsigned int in_buf_len)
{
int ret = 0;
int dialect;
+ if (in_buf_len < sizeof(struct validate_negotiate_info_req) +
+ le16_to_cpu(neg_req->DialectCount) * sizeof(__le16))
+ return -EINVAL;
+
dialect = ksmbd_lookup_dialect_by_id(neg_req->Dialects,
neg_req->DialectCount);
if (dialect == BAD_PROT_ID || dialect != conn->dialect) {
@@ -7316,7 +7349,7 @@ err_out:
static int fsctl_query_allocated_ranges(struct ksmbd_work *work, u64 id,
struct file_allocated_range_buffer *qar_req,
struct file_allocated_range_buffer *qar_rsp,
- int in_count, int *out_count)
+ unsigned int in_count, unsigned int *out_count)
{
struct ksmbd_file *fp;
loff_t start, length;
@@ -7343,7 +7376,8 @@ static int fsctl_query_allocated_ranges(struct ksmbd_work *work, u64 id,
}
static int fsctl_pipe_transceive(struct ksmbd_work *work, u64 id,
- int out_buf_len, struct smb2_ioctl_req *req,
+ unsigned int out_buf_len,
+ struct smb2_ioctl_req *req,
struct smb2_ioctl_rsp *rsp)
{
struct ksmbd_rpc_command *rpc_resp;
@@ -7457,8 +7491,7 @@ int smb2_ioctl(struct ksmbd_work *work)
{
struct smb2_ioctl_req *req;
struct smb2_ioctl_rsp *rsp, *rsp_org;
- int cnt_code, nbytes = 0;
- int out_buf_len;
+ unsigned int cnt_code, nbytes = 0, out_buf_len, in_buf_len;
u64 id = KSMBD_NO_FID;
struct ksmbd_conn *conn = work->conn;
int ret = 0;
@@ -7486,8 +7519,14 @@ int smb2_ioctl(struct ksmbd_work *work)
}
cnt_code = le32_to_cpu(req->CntCode);
- out_buf_len = le32_to_cpu(req->MaxOutputResponse);
- out_buf_len = min(KSMBD_IPC_MAX_PAYLOAD, out_buf_len);
+ ret = smb2_calc_max_out_buf_len(work, 48,
+ le32_to_cpu(req->MaxOutputResponse));
+ if (ret < 0) {
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ goto out;
+ }
+ out_buf_len = (unsigned int)ret;
+ in_buf_len = le32_to_cpu(req->InputCount);
switch (cnt_code) {
case FSCTL_DFS_GET_REFERRALS:
@@ -7515,6 +7554,7 @@ int smb2_ioctl(struct ksmbd_work *work)
break;
}
case FSCTL_PIPE_TRANSCEIVE:
+ out_buf_len = min_t(u32, KSMBD_IPC_MAX_PAYLOAD, out_buf_len);
nbytes = fsctl_pipe_transceive(work, id, out_buf_len, req, rsp);
break;
case FSCTL_VALIDATE_NEGOTIATE_INFO:
@@ -7523,9 +7563,16 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
+ if (in_buf_len < sizeof(struct validate_negotiate_info_req))
+ return -EINVAL;
+
+ if (out_buf_len < sizeof(struct validate_negotiate_info_rsp))
+ return -EINVAL;
+
ret = fsctl_validate_negotiate_info(conn,
(struct validate_negotiate_info_req *)&req->Buffer[0],
- (struct validate_negotiate_info_rsp *)&rsp->Buffer[0]);
+ (struct validate_negotiate_info_rsp *)&rsp->Buffer[0],
+ in_buf_len);
if (ret < 0)
goto out;
@@ -7534,9 +7581,10 @@ int smb2_ioctl(struct ksmbd_work *work)
rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID);
break;
case FSCTL_QUERY_NETWORK_INTERFACE_INFO:
- nbytes = fsctl_query_iface_info_ioctl(conn, req, rsp);
- if (nbytes < 0)
+ ret = fsctl_query_iface_info_ioctl(conn, rsp, out_buf_len);
+ if (ret < 0)
goto out;
+ nbytes = ret;
break;
case FSCTL_REQUEST_RESUME_KEY:
if (out_buf_len < sizeof(struct resume_key_ioctl_rsp)) {
@@ -7561,15 +7609,33 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
+ if (in_buf_len < sizeof(struct copychunk_ioctl_req)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (out_buf_len < sizeof(struct copychunk_ioctl_rsp)) {
ret = -EINVAL;
goto out;
}
nbytes = sizeof(struct copychunk_ioctl_rsp);
- fsctl_copychunk(work, req, rsp);
+ rsp->VolatileFileId = req->VolatileFileId;
+ rsp->PersistentFileId = req->PersistentFileId;
+ fsctl_copychunk(work,
+ (struct copychunk_ioctl_req *)&req->Buffer[0],
+ le32_to_cpu(req->CntCode),
+ le32_to_cpu(req->InputCount),
+ le64_to_cpu(req->VolatileFileId),
+ le64_to_cpu(req->PersistentFileId),
+ rsp);
break;
case FSCTL_SET_SPARSE:
+ if (in_buf_len < sizeof(struct file_sparse)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
ret = fsctl_set_sparse(work, id,
(struct file_sparse *)&req->Buffer[0]);
if (ret < 0)
@@ -7588,6 +7654,11 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
+ if (in_buf_len < sizeof(struct file_zero_data_information)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
zero_data =
(struct file_zero_data_information *)&req->Buffer[0];
@@ -7607,6 +7678,11 @@ int smb2_ioctl(struct ksmbd_work *work)
break;
}
case FSCTL_QUERY_ALLOCATED_RANGES:
+ if (in_buf_len < sizeof(struct file_allocated_range_buffer)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
ret = fsctl_query_allocated_ranges(work, id,
(struct file_allocated_range_buffer *)&req->Buffer[0],
(struct file_allocated_range_buffer *)&rsp->Buffer[0],
@@ -7647,6 +7723,11 @@ int smb2_ioctl(struct ksmbd_work *work)
struct duplicate_extents_to_file *dup_ext;
loff_t src_off, dst_off, length, cloned;
+ if (in_buf_len < sizeof(struct duplicate_extents_to_file)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
dup_ext = (struct duplicate_extents_to_file *)&req->Buffer[0];
fp_in = ksmbd_lookup_fd_slow(work, dup_ext->VolatileFileHandle,
@@ -7717,6 +7798,8 @@ out:
rsp->hdr.Status = STATUS_OBJECT_NAME_NOT_FOUND;
else if (ret == -EOPNOTSUPP)
rsp->hdr.Status = STATUS_NOT_SUPPORTED;
+ else if (ret == -ENOSPC)
+ rsp->hdr.Status = STATUS_BUFFER_TOO_SMALL;
else if (ret < 0 || rsp->hdr.Status == 0)
rsp->hdr.Status = STATUS_INVALID_PARAMETER;
smb2_set_err_rsp(work);
@@ -8411,20 +8494,18 @@ int smb3_decrypt_req(struct ksmbd_work *work)
struct smb2_hdr *hdr;
unsigned int pdu_length = get_rfc1002_len(buf);
struct kvec iov[2];
- unsigned int buf_data_size = pdu_length + 4 -
+ int buf_data_size = pdu_length + 4 -
sizeof(struct smb2_transform_hdr);
struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf;
int rc = 0;
- if (pdu_length + 4 <
- sizeof(struct smb2_transform_hdr) + sizeof(struct smb2_hdr)) {
+ if (buf_data_size < sizeof(struct smb2_hdr)) {
pr_err("Transform message is too small (%u)\n",
pdu_length);
return -ECONNABORTED;
}
- if (pdu_length + 4 <
- le32_to_cpu(tr_hdr->OriginalMessageSize) + sizeof(struct smb2_transform_hdr)) {
+ if (buf_data_size < le32_to_cpu(tr_hdr->OriginalMessageSize)) {
pr_err("Transform message is broken\n");
return -ECONNABORTED;
}
diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h
index 261825d06391..ff5a2f01d34a 100644
--- a/fs/ksmbd/smb2pdu.h
+++ b/fs/ksmbd/smb2pdu.h
@@ -113,6 +113,8 @@
#define SMB21_DEFAULT_IOSIZE (1024 * 1024)
#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024)
#define SMB3_DEFAULT_TRANS_SIZE (1024 * 1024)
+#define SMB3_MIN_IOSIZE (64 * 1024)
+#define SMB3_MAX_IOSIZE (8 * 1024 * 1024)
/*
* SMB2 Header Definition
@@ -1637,7 +1639,6 @@ struct smb2_posix_info {
} __packed;
/* functions */
-int init_smb2_0_server(struct ksmbd_conn *conn);
void init_smb2_1_server(struct ksmbd_conn *conn);
void init_smb3_0_server(struct ksmbd_conn *conn);
void init_smb3_02_server(struct ksmbd_conn *conn);
diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c
index db8042a173d0..707490ab1f4c 100644
--- a/fs/ksmbd/smb_common.c
+++ b/fs/ksmbd/smb_common.c
@@ -21,7 +21,6 @@ static const char basechars[43] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_-!@#$%";
#define MAGIC_CHAR '~'
#define PERIOD '.'
#define mangle(V) ((char)(basechars[(V) % MANGLE_BASE]))
-#define KSMBD_MIN_SUPPORTED_HEADER_SIZE (sizeof(struct smb2_hdr))
struct smb_protocol {
int index;
@@ -89,7 +88,7 @@ unsigned int ksmbd_server_side_copy_max_total_size(void)
inline int ksmbd_min_protocol(void)
{
- return SMB2_PROT;
+ return SMB21_PROT;
}
inline int ksmbd_max_protocol(void)
@@ -294,11 +293,6 @@ int ksmbd_init_smb_server(struct ksmbd_work *work)
return 0;
}
-bool ksmbd_pdu_size_has_room(unsigned int pdu)
-{
- return (pdu >= KSMBD_MIN_SUPPORTED_HEADER_SIZE - 4);
-}
-
int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level,
struct ksmbd_file *dir,
struct ksmbd_dir_info *d_info,
@@ -433,7 +427,7 @@ int ksmbd_extract_shortname(struct ksmbd_conn *conn, const char *longname,
static int __smb2_negotiate(struct ksmbd_conn *conn)
{
- return (conn->dialect >= SMB20_PROT_ID &&
+ return (conn->dialect >= SMB21_PROT_ID &&
conn->dialect <= SMB311_PROT_ID);
}
@@ -463,7 +457,7 @@ int ksmbd_smb_negotiate_common(struct ksmbd_work *work, unsigned int command)
}
}
- if (command == SMB2_NEGOTIATE_HE) {
+ if (command == SMB2_NEGOTIATE_HE && __smb2_negotiate(conn)) {
ret = smb2_handle_negotiate(work);
init_smb2_neg_rsp(work);
return ret;
diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h
index 994abede27e9..6e79e7577f6b 100644
--- a/fs/ksmbd/smb_common.h
+++ b/fs/ksmbd/smb_common.h
@@ -48,6 +48,8 @@
#define CIFS_DEFAULT_IOSIZE (64 * 1024)
#define MAX_CIFS_SMALL_BUFFER_SIZE 448 /* big enough for most */
+#define MAX_STREAM_PROT_LEN 0x00FFFFFF
+
/* Responses when opening a file. */
#define F_SUPERSEDED 0
#define F_OPENED 1
@@ -493,8 +495,6 @@ int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count);
int ksmbd_init_smb_server(struct ksmbd_work *work);
-bool ksmbd_pdu_size_has_room(unsigned int pdu);
-
struct ksmbd_kstat;
int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work,
int info_level,
diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c
index 44aea33a67fa..1acf1892a466 100644
--- a/fs/ksmbd/transport_ipc.c
+++ b/fs/ksmbd/transport_ipc.c
@@ -601,7 +601,7 @@ int ksmbd_ipc_tree_disconnect_request(unsigned long long session_id,
return ret;
}
-int ksmbd_ipc_logout_request(const char *account)
+int ksmbd_ipc_logout_request(const char *account, int flags)
{
struct ksmbd_ipc_msg *msg;
struct ksmbd_logout_request *req;
@@ -616,6 +616,7 @@ int ksmbd_ipc_logout_request(const char *account)
msg->type = KSMBD_EVENT_LOGOUT_REQUEST;
req = (struct ksmbd_logout_request *)msg->payload;
+ req->account_flags = flags;
strscpy(req->account, account, KSMBD_REQ_MAX_ACCOUNT_NAME_SZ);
ret = ipc_msg_send(msg);
diff --git a/fs/ksmbd/transport_ipc.h b/fs/ksmbd/transport_ipc.h
index 9eacc895ffdb..5e5b90a0c187 100644
--- a/fs/ksmbd/transport_ipc.h
+++ b/fs/ksmbd/transport_ipc.h
@@ -25,7 +25,7 @@ ksmbd_ipc_tree_connect_request(struct ksmbd_session *sess,
struct sockaddr *peer_addr);
int ksmbd_ipc_tree_disconnect_request(unsigned long long session_id,
unsigned long long connect_id);
-int ksmbd_ipc_logout_request(const char *account);
+int ksmbd_ipc_logout_request(const char *account, int flags);
struct ksmbd_share_config_response *
ksmbd_ipc_share_config_request(const char *name);
struct ksmbd_spnego_authen_response *
diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
index 3a7fa23ba850..a2fd5a4d4cd5 100644
--- a/fs/ksmbd/transport_rdma.c
+++ b/fs/ksmbd/transport_rdma.c
@@ -549,6 +549,10 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
switch (recvmsg->type) {
case SMB_DIRECT_MSG_NEGOTIATE_REQ:
+ if (wc->byte_len < sizeof(struct smb_direct_negotiate_req)) {
+ put_empty_recvmsg(t, recvmsg);
+ return;
+ }
t->negotiation_requested = true;
t->full_packet_received = true;
wake_up_interruptible(&t->wait_status);
@@ -556,10 +560,23 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
case SMB_DIRECT_MSG_DATA_TRANSFER: {
struct smb_direct_data_transfer *data_transfer =
(struct smb_direct_data_transfer *)recvmsg->packet;
- int data_length = le32_to_cpu(data_transfer->data_length);
+ unsigned int data_length;
int avail_recvmsg_count, receive_credits;
+ if (wc->byte_len <
+ offsetof(struct smb_direct_data_transfer, padding)) {
+ put_empty_recvmsg(t, recvmsg);
+ return;
+ }
+
+ data_length = le32_to_cpu(data_transfer->data_length);
if (data_length) {
+ if (wc->byte_len < sizeof(struct smb_direct_data_transfer) +
+ (u64)data_length) {
+ put_empty_recvmsg(t, recvmsg);
+ return;
+ }
+
if (t->full_packet_received)
recvmsg->first_segment = true;
@@ -568,7 +585,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
else
t->full_packet_received = true;
- enqueue_reassembly(t, recvmsg, data_length);
+ enqueue_reassembly(t, recvmsg, (int)data_length);
wake_up_interruptible(&t->wait_reassembly_queue);
spin_lock(&t->receive_credit_lock);
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index b41954294d38..835b384b0895 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -1023,7 +1023,7 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp,
int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
struct file_allocated_range_buffer *ranges,
- int in_count, int *out_count)
+ unsigned int in_count, unsigned int *out_count)
{
struct file *f = fp->filp;
struct inode *inode = file_inode(fp->filp);
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
index 7b1dcaa3fbdc..b0d5b8feb4a3 100644
--- a/fs/ksmbd/vfs.h
+++ b/fs/ksmbd/vfs.h
@@ -166,7 +166,7 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp,
struct file_allocated_range_buffer;
int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
struct file_allocated_range_buffer *ranges,
- int in_count, int *out_count);
+ unsigned int in_count, unsigned int *out_count);
int ksmbd_vfs_unlink(struct user_namespace *user_ns,
struct dentry *dir, struct dentry *dentry);
void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat);
diff --git a/fs/locks.c b/fs/locks.c
index 3d6fb4ae847b..0fca9d680978 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2,117 +2,11 @@
/*
* linux/fs/locks.c
*
- * Provide support for fcntl()'s F_GETLK, F_SETLK, and F_SETLKW calls.
- * Doug Evans (dje@spiff.uucp), August 07, 1992
+ * We implement four types of file locks: BSD locks, posix locks, open
+ * file description locks, and leases. For details about BSD locks,
+ * see the flock(2) man page; for details about the other three, see
+ * fcntl(2).
*
- * Deadlock detection added.
- * FIXME: one thing isn't handled yet:
- * - mandatory locks (requires lots of changes elsewhere)
- * Kelly Carmichael (kelly@[142.24.8.65]), September 17, 1994.
- *
- * Miscellaneous edits, and a total rewrite of posix_lock_file() code.
- * Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994
- *
- * Converted file_lock_table to a linked list from an array, which eliminates
- * the limits on how many active file locks are open.
- * Chad Page (pageone@netcom.com), November 27, 1994
- *
- * Removed dependency on file descriptors. dup()'ed file descriptors now
- * get the same locks as the original file descriptors, and a close() on
- * any file descriptor removes ALL the locks on the file for the current
- * process. Since locks still depend on the process id, locks are inherited
- * after an exec() but not after a fork(). This agrees with POSIX, and both
- * BSD and SVR4 practice.
- * Andy Walker (andy@lysaker.kvaerner.no), February 14, 1995
- *
- * Scrapped free list which is redundant now that we allocate locks
- * dynamically with kmalloc()/kfree().
- * Andy Walker (andy@lysaker.kvaerner.no), February 21, 1995
- *
- * Implemented two lock personalities - FL_FLOCK and FL_POSIX.
- *
- * FL_POSIX locks are created with calls to fcntl() and lockf() through the
- * fcntl() system call. They have the semantics described above.
- *
- * FL_FLOCK locks are created with calls to flock(), through the flock()
- * system call, which is new. Old C libraries implement flock() via fcntl()
- * and will continue to use the old, broken implementation.
- *
- * FL_FLOCK locks follow the 4.4 BSD flock() semantics. They are associated
- * with a file pointer (filp). As a result they can be shared by a parent
- * process and its children after a fork(). They are removed when the last
- * file descriptor referring to the file pointer is closed (unless explicitly
- * unlocked).
- *
- * FL_FLOCK locks never deadlock, an existing lock is always removed before
- * upgrading from shared to exclusive (or vice versa). When this happens
- * any processes blocked by the current lock are woken up and allowed to
- * run before the new lock is applied.
- * Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995
- *
- * Removed some race conditions in flock_lock_file(), marked other possible
- * races. Just grep for FIXME to see them.
- * Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996.
- *
- * Addressed Dmitry's concerns. Deadlock checking no longer recursive.
- * Lock allocation changed to GFP_ATOMIC as we can't afford to sleep
- * once we've checked for blocking and deadlocking.
- * Andy Walker (andy@lysaker.kvaerner.no), April 03, 1996.
- *
- * Initial implementation of mandatory locks. SunOS turned out to be
- * a rotten model, so I implemented the "obvious" semantics.
- * See 'Documentation/filesystems/mandatory-locking.rst' for details.
- * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996.
- *
- * Don't allow mandatory locks on mmap()'ed files. Added simple functions to
- * check if a file has mandatory locks, used by mmap(), open() and creat() to
- * see if system call should be rejected. Ref. HP-UX/SunOS/Solaris Reference
- * Manual, Section 2.
- * Andy Walker (andy@lysaker.kvaerner.no), April 09, 1996.
- *
- * Tidied up block list handling. Added '/proc/locks' interface.
- * Andy Walker (andy@lysaker.kvaerner.no), April 24, 1996.
- *
- * Fixed deadlock condition for pathological code that mixes calls to
- * flock() and fcntl().
- * Andy Walker (andy@lysaker.kvaerner.no), April 29, 1996.
- *
- * Allow only one type of locking scheme (FL_POSIX or FL_FLOCK) to be in use
- * for a given file at a time. Changed the CONFIG_LOCK_MANDATORY scheme to
- * guarantee sensible behaviour in the case where file system modules might
- * be compiled with different options than the kernel itself.
- * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
- *
- * Added a couple of missing wake_up() calls. Thanks to Thomas Meckel
- * (Thomas.Meckel@mni.fh-giessen.de) for spotting this.
- * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
- *
- * Changed FL_POSIX locks to use the block list in the same way as FL_FLOCK
- * locks. Changed process synchronisation to avoid dereferencing locks that
- * have already been freed.
- * Andy Walker (andy@lysaker.kvaerner.no), Sep 21, 1996.
- *
- * Made the block list a circular list to minimise searching in the list.
- * Andy Walker (andy@lysaker.kvaerner.no), Sep 25, 1996.
- *
- * Made mandatory locking a mount option. Default is not to allow mandatory
- * locking.
- * Andy Walker (andy@lysaker.kvaerner.no), Oct 04, 1996.
- *
- * Some adaptations for NFS support.
- * Olaf Kirch (okir@monad.swb.de), Dec 1996,
- *
- * Fixed /proc/locks interface so that we can't overrun the buffer we are handed.
- * Andy Walker (andy@lysaker.kvaerner.no), May 12, 1997.
- *
- * Use slab allocator instead of kmalloc/kfree.
- * Use generic list implementation from <linux/list.h>.
- * Sped up posix_locks_deadlock by only considering blocked locks.
- * Matthew Wilcox <willy@debian.org>, March, 2000.
- *
- * Leases and LOCK_MAND
- * Matthew Wilcox <willy@debian.org>, June, 2000.
- * Stephen Rothwell <sfr@canb.auug.org.au>, June, 2000.
*
* Locking conflicts and dependencies:
* If multiple threads attempt to lock the same byte (or flock the same file)
@@ -461,8 +355,6 @@ static void locks_move_blocks(struct file_lock *new, struct file_lock *fl)
}
static inline int flock_translate_cmd(int cmd) {
- if (cmd & LOCK_MAND)
- return cmd & (LOCK_MAND | LOCK_RW);
switch (cmd) {
case LOCK_SH:
return F_RDLCK;
@@ -942,8 +834,6 @@ static bool flock_locks_conflict(struct file_lock *caller_fl,
*/
if (caller_fl->fl_file == sys_fl->fl_file)
return false;
- if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND))
- return false;
return locks_conflict(caller_fl, sys_fl);
}
@@ -2116,11 +2006,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait);
* - %LOCK_SH -- a shared lock.
* - %LOCK_EX -- an exclusive lock.
* - %LOCK_UN -- remove an existing lock.
- * - %LOCK_MAND -- a 'mandatory' flock.
- * This exists to emulate Windows Share Modes.
+ * - %LOCK_MAND -- a 'mandatory' flock. (DEPRECATED)
*
- * %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other
- * processes read and write access respectively.
+ * %LOCK_MAND support has been removed from the kernel.
*/
SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
{
@@ -2137,9 +2025,22 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
cmd &= ~LOCK_NB;
unlock = (cmd == LOCK_UN);
- if (!unlock && !(cmd & LOCK_MAND) &&
- !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
+ if (!unlock && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
+ goto out_putf;
+
+ /*
+ * LOCK_MAND locks were broken for a long time in that they never
+ * conflicted with one another and didn't prevent any sort of open,
+ * read or write activity.
+ *
+ * Just ignore these requests now, to preserve legacy behavior, but
+ * throw a warning to let people know that they don't actually work.
+ */
+ if (cmd & LOCK_MAND) {
+ pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n");
+ error = 0;
goto out_putf;
+ }
lock = flock_make_lock(f.file, cmd, NULL);
if (IS_ERR(lock)) {
@@ -2718,6 +2619,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
struct inode *inode = NULL;
unsigned int fl_pid;
struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
+ int type;
fl_pid = locks_translate_pid(fl, proc_pidns);
/*
@@ -2745,11 +2647,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
seq_printf(f, " %s ",
(inode == NULL) ? "*NOINODE*" : "ADVISORY ");
} else if (IS_FLOCK(fl)) {
- if (fl->fl_type & LOCK_MAND) {
- seq_puts(f, "FLOCK MSNFS ");
- } else {
- seq_puts(f, "FLOCK ADVISORY ");
- }
+ seq_puts(f, "FLOCK ADVISORY ");
} else if (IS_LEASE(fl)) {
if (fl->fl_flags & FL_DELEG)
seq_puts(f, "DELEG ");
@@ -2765,17 +2663,10 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
} else {
seq_puts(f, "UNKNOWN UNKNOWN ");
}
- if (fl->fl_type & LOCK_MAND) {
- seq_printf(f, "%s ",
- (fl->fl_type & LOCK_READ)
- ? (fl->fl_type & LOCK_WRITE) ? "RW " : "READ "
- : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE ");
- } else {
- int type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
+ type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
- seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
- (type == F_RDLCK) ? "READ" : "UNLCK");
- }
+ seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
+ (type == F_RDLCK) ? "READ" : "UNLCK");
if (inode) {
/* userspace relies on this representation of dev_t */
seq_printf(f, "%d %02x:%02x:%lu ", fl_pid,
diff --git a/fs/namei.c b/fs/namei.c
index 1946d9667790..1f9d2187c765 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3076,9 +3076,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
int error = get_write_access(inode);
if (error)
return error;
- /*
- * Refuse to truncate files with mandatory locks held on them.
- */
+
error = security_path_truncate(path);
if (!error) {
error = do_truncate(mnt_userns, path->dentry, 0,
diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index 0b6cd3b8734c..994ec22d4040 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -150,7 +150,7 @@ static void netfs_clear_unread(struct netfs_read_subrequest *subreq)
{
struct iov_iter iter;
- iov_iter_xarray(&iter, WRITE, &subreq->rreq->mapping->i_pages,
+ iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages,
subreq->start + subreq->transferred,
subreq->len - subreq->transferred);
iov_iter_zero(iov_iter_count(&iter), &iter);
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index acb1d22907da..5e56da748b2a 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -252,7 +252,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
d->bdev = bdev;
- d->len = i_size_read(d->bdev->bd_inode);
+ d->len = bdev_nr_bytes(d->bdev);
d->map = bl_map_simple;
printk(KERN_INFO "pNFS: using block device %s\n",
@@ -367,7 +367,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
return PTR_ERR(bdev);
d->bdev = bdev;
- d->len = i_size_read(d->bdev->bd_inode);
+ d->len = bdev_nr_bytes(d->bdev);
d->map = bl_map_simple;
d->pr_key = v->scsi.pr_key;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 2e894fec036b..7a5f287c5391 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -275,7 +275,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
res = (long) dreq->count;
WARN_ON_ONCE(dreq->count < 0);
}
- dreq->iocb->ki_complete(dreq->iocb, res, 0);
+ dreq->iocb->ki_complete(dreq->iocb, res);
}
complete(&dreq->completion);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index aa353fd58240..24e7dccce355 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -843,15 +843,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- /*
- * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of
- * any standard. In principle we might be able to support LOCK_MAND
- * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the
- * NFS code is not set up for it.
- */
- if (fl->fl_type & LOCK_MAND)
- return -EINVAL;
-
if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
is_local = 1;
diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c
index edec45831585..0a9b72685f98 100644
--- a/fs/nfs_common/grace.c
+++ b/fs/nfs_common/grace.c
@@ -42,7 +42,6 @@ EXPORT_SYMBOL_GPL(locks_start_grace);
/**
* locks_end_grace
- * @net: net namespace that this lock manager belongs to
* @lm: who this grace period is for
*
* Call this function to state that the given lock manager is ready to
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 6e9ea4ee0f73..3d1d17256a91 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -109,7 +109,6 @@ config NFSD_SCSILAYOUT
depends on NFSD_V4 && BLOCK
select NFSD_PNFS
select EXPORTFS_BLOCK_OPS
- select SCSI_COMMON
help
This option enables support for the exporting pNFS SCSI layouts
in the kernel's NFS server. The pNFS SCSI layout enables NFS
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index c99dee99a3c1..e5c0982a381d 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -9,9 +9,6 @@
#include <linux/pr.h>
#include <linux/nfsd/debug.h>
-#include <scsi/scsi_proto.h>
-#include <scsi/scsi_common.h>
-#include <scsi/scsi_request.h>
#include "blocklayoutxdr.h"
#include "pnfs.h"
@@ -211,109 +208,6 @@ const struct nfsd4_layout_ops bl_layout_ops = {
#endif /* CONFIG_NFSD_BLOCKLAYOUT */
#ifdef CONFIG_NFSD_SCSILAYOUT
-static int nfsd4_scsi_identify_device(struct block_device *bdev,
- struct pnfs_block_volume *b)
-{
- struct request_queue *q = bdev->bd_disk->queue;
- struct request *rq;
- struct scsi_request *req;
- /*
- * The allocation length (passed in bytes 3 and 4 of the INQUIRY
- * command descriptor block) specifies the number of bytes that have
- * been allocated for the data-in buffer.
- * 252 is the highest one-byte value that is a multiple of 4.
- * 65532 is the highest two-byte value that is a multiple of 4.
- */
- size_t bufflen = 252, maxlen = 65532, len, id_len;
- u8 *buf, *d, type, assoc;
- int retries = 1, error;
-
- if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q)))
- return -EINVAL;
-
-again:
- buf = kzalloc(bufflen, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
- if (IS_ERR(rq)) {
- error = -ENOMEM;
- goto out_free_buf;
- }
- req = scsi_req(rq);
-
- error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
- if (error)
- goto out_put_request;
-
- req->cmd[0] = INQUIRY;
- req->cmd[1] = 1;
- req->cmd[2] = 0x83;
- req->cmd[3] = bufflen >> 8;
- req->cmd[4] = bufflen & 0xff;
- req->cmd_len = COMMAND_SIZE(INQUIRY);
-
- blk_execute_rq(NULL, rq, 1);
- if (req->result) {
- pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
- req->result);
- error = -EIO;
- goto out_put_request;
- }
-
- len = (buf[2] << 8) + buf[3] + 4;
- if (len > bufflen) {
- if (len <= maxlen && retries--) {
- blk_put_request(rq);
- kfree(buf);
- bufflen = len;
- goto again;
- }
- pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n",
- len);
- goto out_put_request;
- }
-
- d = buf + 4;
- for (d = buf + 4; d < buf + len; d += id_len + 4) {
- id_len = d[3];
- type = d[1] & 0xf;
- assoc = (d[1] >> 4) & 0x3;
-
- /*
- * We only care about a EUI-64 and NAA designator types
- * with LU association.
- */
- if (assoc != 0x00)
- continue;
- if (type != 0x02 && type != 0x03)
- continue;
- if (id_len != 8 && id_len != 12 && id_len != 16)
- continue;
-
- b->scsi.code_set = PS_CODE_SET_BINARY;
- b->scsi.designator_type = type == 0x02 ?
- PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA;
- b->scsi.designator_len = id_len;
- memcpy(b->scsi.designator, d + 4, id_len);
-
- /*
- * If we found a 8 or 12 byte descriptor continue on to
- * see if a 16 byte one is available. If we find a
- * 16 byte descriptor we're done.
- */
- if (id_len == 16)
- break;
- }
-
-out_put_request:
- blk_put_request(rq);
-out_free_buf:
- kfree(buf);
- return error;
-}
-
#define NFSD_MDS_PR_KEY 0x0100000000000000ULL
/*
@@ -325,6 +219,31 @@ static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp)
return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id;
}
+static const u8 designator_types[] = {
+ PS_DESIGNATOR_EUI64,
+ PS_DESIGNATOR_NAA,
+};
+
+static int
+nfsd4_block_get_unique_id(struct gendisk *disk, struct pnfs_block_volume *b)
+{
+ int ret, i;
+
+ for (i = 0; i < ARRAY_SIZE(designator_types); i++) {
+ u8 type = designator_types[i];
+
+ ret = disk->fops->get_unique_id(disk, b->scsi.designator, type);
+ if (ret > 0) {
+ b->scsi.code_set = PS_CODE_SET_BINARY;
+ b->scsi.designator_type = type;
+ b->scsi.designator_len = ret;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
static int
nfsd4_block_get_device_info_scsi(struct super_block *sb,
struct nfs4_client *clp,
@@ -333,7 +252,7 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb,
struct pnfs_block_deviceaddr *dev;
struct pnfs_block_volume *b;
const struct pr_ops *ops;
- int error;
+ int ret;
dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
sizeof(struct pnfs_block_volume), GFP_KERNEL);
@@ -347,33 +266,38 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb,
b->type = PNFS_BLOCK_VOLUME_SCSI;
b->scsi.pr_key = nfsd4_scsi_pr_key(clp);
- error = nfsd4_scsi_identify_device(sb->s_bdev, b);
- if (error)
- return error;
+ ret = nfsd4_block_get_unique_id(sb->s_bdev->bd_disk, b);
+ if (ret < 0)
+ goto out_free_dev;
+ ret = -EINVAL;
ops = sb->s_bdev->bd_disk->fops->pr_ops;
if (!ops) {
pr_err("pNFS: device %s does not support PRs.\n",
sb->s_id);
- return -EINVAL;
+ goto out_free_dev;
}
- error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
- if (error) {
+ ret = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
+ if (ret) {
pr_err("pNFS: failed to register key for device %s.\n",
sb->s_id);
- return -EINVAL;
+ goto out_free_dev;
}
- error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
+ ret = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
PR_EXCLUSIVE_ACCESS_REG_ONLY, 0);
- if (error) {
+ if (ret) {
pr_err("pNFS: failed to reserve device %s.\n",
sb->s_id);
- return -EINVAL;
+ goto out_free_dev;
}
return 0;
+
+out_free_dev:
+ kfree(dev);
+ return ret;
}
static __be32
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 7629248fdd53..be3c1aad50ea 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -542,7 +542,7 @@ nfsd_file_close_inode_sync(struct inode *inode)
}
/**
- * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
+ * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
* @inode: inode of the file to attempt to remove
*
* Walk the whole hash bucket, looking for any files that correspond to "inode".
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index a97873f2d22b..6d1b5bb051c5 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -145,8 +145,9 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
#ifdef CONFIG_NFSD_SCSILAYOUT
if (sb->s_export_op->map_blocks &&
sb->s_export_op->commit_blocks &&
- sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops &&
- blk_queue_scsi_passthrough(sb->s_bdev->bd_disk->queue))
+ sb->s_bdev &&
+ sb->s_bdev->bd_disk->fops->pr_ops &&
+ sb->s_bdev->bd_disk->fops->get_unique_id)
exp->ex_layout_types |= 1 << LAYOUT_SCSI;
#endif
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 7abeccb975b2..cf030ebe2827 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3544,15 +3544,18 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
goto fail;
cd->rd_maxcount -= entry_bytes;
/*
- * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
- * let's always let through the first entry, at least:
+ * RFC 3530 14.2.24 describes rd_dircount as only a "hint", and
+ * notes that it could be zero. If it is zero, then the server
+ * should enforce only the rd_maxcount value.
*/
- if (!cd->rd_dircount)
- goto fail;
- name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8;
- if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
- goto fail;
- cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
+ if (cd->rd_dircount) {
+ name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8;
+ if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
+ goto fail;
+ cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
+ if (!cd->rd_dircount)
+ cd->rd_maxcount = 0;
+ }
cd->cookie_offset = cookie_offset;
skip_entry:
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c2c3d9077dc5..070e5dd03e26 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -793,7 +793,10 @@ out_close:
svc_xprt_put(xprt);
}
out_err:
- nfsd_destroy(net);
+ if (!list_empty(&nn->nfsd_serv->sv_permsocks))
+ nn->nfsd_serv->sv_nrthreads--;
+ else
+ nfsd_destroy(net);
return err;
}
@@ -1545,7 +1548,7 @@ static int __init init_nfsd(void)
goto out_free_all;
return 0;
out_free_all:
- unregister_pernet_subsys(&nfsd_net_ops);
+ unregister_filesystem(&nfsd_fs_type);
out_free_exports:
remove_proc_entry("fs/nfs/exports", NULL);
remove_proc_entry("fs/nfs", NULL);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 640ac8fe891e..1d0583cfd970 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1107,7 +1107,7 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
goto out;
ret = -ERANGE;
- if (range[1] > i_size_read(inode->i_sb->s_bdev->bd_inode))
+ if (range[1] > bdev_nr_bytes(inode->i_sb->s_bdev))
goto out;
segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f6b2d280aab5..3134c0e42fd4 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -403,7 +403,7 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize)
int ret;
ret = -ERANGE;
- devsize = i_size_read(sb->s_bdev->bd_inode);
+ devsize = bdev_nr_bytes(sb->s_bdev);
if (newsize > devsize)
goto out;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index c8bfc01da5d7..1bfcb5d3ea48 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -489,7 +489,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
{
struct nilfs_super_block **sbp = nilfs->ns_sbp;
struct buffer_head **sbh = nilfs->ns_sbh;
- u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size);
+ u64 sb2off = NILFS_SB2_OFFSET_BYTES(bdev_nr_bytes(nilfs->ns_bdev));
int valid[2], swp = 0;
sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize,
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index ab4f3362466d..373dbb627657 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -5,6 +5,7 @@
* Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
*/
+#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/gfp.h>
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 0d7e948cb29c..5ae8de09b271 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2772,13 +2772,12 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
ntfs_debug("Set device block size to %i bytes (block size bits %i).",
blocksize, sb->s_blocksize_bits);
/* Determine the size of the device in units of block_size bytes. */
- if (!i_size_read(sb->s_bdev->bd_inode)) {
+ vol->nr_blocks = sb_bdev_nr_blocks(sb);
+ if (!vol->nr_blocks) {
if (!silent)
ntfs_error(sb, "Unable to determine device size.");
goto err_out_now;
}
- vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits;
/* Read the boot sector and return unlocked buffer head to it. */
if (!(bh = read_ntfs_boot_sector(sb, silent))) {
if (!silent)
@@ -2816,8 +2815,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
goto err_out_now;
}
BUG_ON(blocksize != sb->s_blocksize);
- vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits;
+ vol->nr_blocks = sb_bdev_nr_blocks(sb);
ntfs_debug("Changed device block size to %i bytes (block size "
"bits %i) to match volume sector size.",
blocksize, sb->s_blocksize_bits);
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index 34c4cbf7e29b..e8c00dda42ad 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -6,13 +6,9 @@
* TODO: Merge attr_set_size/attr_data_get_block/attr_allocate_frame?
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/hash.h>
-#include <linux/nls.h>
-#include <linux/ratelimit.h>
#include <linux/slab.h>
+#include <linux/kernel.h>
#include "debug.h"
#include "ntfs.h"
@@ -291,7 +287,7 @@ int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr,
if (!rsize) {
/* Empty resident -> Non empty nonresident. */
} else if (!is_data) {
- err = ntfs_sb_write_run(sbi, run, 0, data, rsize);
+ err = ntfs_sb_write_run(sbi, run, 0, data, rsize, 0);
if (err)
goto out2;
} else if (!page) {
@@ -451,11 +447,8 @@ again:
again_1:
align = sbi->cluster_size;
- if (is_ext) {
+ if (is_ext)
align <<= attr_b->nres.c_unit;
- if (is_attr_sparsed(attr_b))
- keep_prealloc = false;
- }
old_valid = le64_to_cpu(attr_b->nres.valid_size);
old_size = le64_to_cpu(attr_b->nres.data_size);
@@ -465,9 +458,6 @@ again_1:
new_alloc = (new_size + align - 1) & ~(u64)(align - 1);
new_alen = new_alloc >> cluster_bits;
- if (keep_prealloc && is_ext)
- keep_prealloc = false;
-
if (keep_prealloc && new_size < old_size) {
attr_b->nres.data_size = cpu_to_le64(new_size);
mi_b->dirty = true;
@@ -529,7 +519,7 @@ add_alloc_in_same_attr_seg:
} else if (pre_alloc == -1) {
pre_alloc = 0;
if (type == ATTR_DATA && !name_len &&
- sbi->options.prealloc) {
+ sbi->options->prealloc) {
CLST new_alen2 = bytes_to_cluster(
sbi, get_pre_allocated(new_size));
pre_alloc = new_alen2 - new_alen;
@@ -1966,7 +1956,7 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size)
return 0;
from = vbo;
- to = (vbo + bytes) < data_size ? (vbo + bytes) : data_size;
+ to = min_t(u64, vbo + bytes, data_size);
memset(Add2Ptr(resident_data(attr_b), from), 0, to - from);
return 0;
}
diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c
index fa32399eb517..bad6d8a849a2 100644
--- a/fs/ntfs3/attrlist.c
+++ b/fs/ntfs3/attrlist.c
@@ -5,10 +5,7 @@
*
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/nls.h>
#include "debug.h"
#include "ntfs.h"
@@ -336,7 +333,7 @@ int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name,
if (attr && attr->non_res) {
err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le,
- al->size);
+ al->size, 0);
if (err)
return err;
al->dirty = false;
@@ -423,7 +420,7 @@ next:
return true;
}
-int al_update(struct ntfs_inode *ni)
+int al_update(struct ntfs_inode *ni, int sync)
{
int err;
struct ATTRIB *attr;
@@ -445,7 +442,7 @@ int al_update(struct ntfs_inode *ni)
memcpy(resident_data(attr), al->le, al->size);
} else {
err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le,
- al->size);
+ al->size, sync);
if (err)
goto out;
diff --git a/fs/ntfs3/bitfunc.c b/fs/ntfs3/bitfunc.c
index ce304d40b5e1..50d838093790 100644
--- a/fs/ntfs3/bitfunc.c
+++ b/fs/ntfs3/bitfunc.c
@@ -5,13 +5,8 @@
*
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
-#include <linux/fs.h>
-#include <linux/nls.h>
+#include <linux/types.h>
-#include "debug.h"
-#include "ntfs.h"
#include "ntfs_fs.h"
#define BITS_IN_SIZE_T (sizeof(size_t) * 8)
@@ -124,8 +119,7 @@ bool are_bits_set(const ulong *lmap, size_t bit, size_t nbits)
pos = nbits & 7;
if (pos) {
- u8 mask = fill_mask[pos];
-
+ mask = fill_mask[pos];
if ((*map & mask) != mask)
return false;
}
diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c
index 831501555009..aa184407520f 100644
--- a/fs/ntfs3/bitmap.c
+++ b/fs/ntfs3/bitmap.c
@@ -10,12 +10,10 @@
*
*/
-#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/nls.h>
+#include <linux/kernel.h>
-#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"
@@ -435,7 +433,7 @@ static void wnd_remove_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len)
;
} else {
n3 = rb_next(&e->count.node);
- max_new_len = len > new_len ? len : new_len;
+ max_new_len = max(len, new_len);
if (!n3) {
wnd->extent_max = max_new_len;
} else {
@@ -731,7 +729,7 @@ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits)
wbits = wnd->bits_last;
tail = wbits - wbit;
- op = tail < bits ? tail : bits;
+ op = min_t(u32, tail, bits);
bh = wnd_map(wnd, iw);
if (IS_ERR(bh)) {
@@ -784,7 +782,7 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits)
wbits = wnd->bits_last;
tail = wbits - wbit;
- op = tail < bits ? tail : bits;
+ op = min_t(u32, tail, bits);
bh = wnd_map(wnd, iw);
if (IS_ERR(bh)) {
@@ -834,7 +832,7 @@ static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits)
wbits = wnd->bits_last;
tail = wbits - wbit;
- op = tail < bits ? tail : bits;
+ op = min_t(u32, tail, bits);
if (wbits != wnd->free_bits[iw]) {
bool ret;
@@ -926,7 +924,7 @@ use_wnd:
wbits = wnd->bits_last;
tail = wbits - wbit;
- op = tail < bits ? tail : bits;
+ op = min_t(u32, tail, bits);
if (wnd->free_bits[iw]) {
bool ret;
diff --git a/fs/ntfs3/debug.h b/fs/ntfs3/debug.h
index 31120569a87b..53ef7489c75f 100644
--- a/fs/ntfs3/debug.h
+++ b/fs/ntfs3/debug.h
@@ -11,6 +11,9 @@
#ifndef _LINUX_NTFS3_DEBUG_H
#define _LINUX_NTFS3_DEBUG_H
+struct super_block;
+struct inode;
+
#ifndef Add2Ptr
#define Add2Ptr(P, I) ((void *)((u8 *)(P) + (I)))
#define PtrOffset(B, O) ((size_t)((size_t)(O) - (size_t)(B)))
diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
index 93f6d485564e..fb438d604040 100644
--- a/fs/ntfs3/dir.c
+++ b/fs/ntfs3/dir.c
@@ -7,10 +7,7 @@
*
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/iversion.h>
#include <linux/nls.h>
#include "debug.h"
@@ -18,30 +15,27 @@
#include "ntfs_fs.h"
/* Convert little endian UTF-16 to NLS string. */
-int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni,
+int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len,
u8 *buf, int buf_len)
{
- int ret, uni_len, warn;
- const __le16 *ip;
+ int ret, warn;
u8 *op;
- struct nls_table *nls = sbi->options.nls;
+ struct nls_table *nls = sbi->options->nls;
static_assert(sizeof(wchar_t) == sizeof(__le16));
if (!nls) {
/* UTF-16 -> UTF-8 */
- ret = utf16s_to_utf8s((wchar_t *)uni->name, uni->len,
- UTF16_LITTLE_ENDIAN, buf, buf_len);
+ ret = utf16s_to_utf8s(name, len, UTF16_LITTLE_ENDIAN, buf,
+ buf_len);
buf[ret] = '\0';
return ret;
}
- ip = uni->name;
op = buf;
- uni_len = uni->len;
warn = 0;
- while (uni_len--) {
+ while (len--) {
u16 ec;
int charlen;
char dump[5];
@@ -52,7 +46,7 @@ int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni,
break;
}
- ec = le16_to_cpu(*ip++);
+ ec = le16_to_cpu(*name++);
charlen = nls->uni2char(ec, op, buf_len);
if (charlen > 0) {
@@ -186,7 +180,7 @@ int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len,
{
int ret, slen;
const u8 *end;
- struct nls_table *nls = sbi->options.nls;
+ struct nls_table *nls = sbi->options->nls;
u16 *uname = uni->name;
static_assert(sizeof(wchar_t) == sizeof(u16));
@@ -301,14 +295,14 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni,
return 0;
/* Skip meta files. Unless option to show metafiles is set. */
- if (!sbi->options.showmeta && ntfs_is_meta_file(sbi, ino))
+ if (!sbi->options->showmeta && ntfs_is_meta_file(sbi, ino))
return 0;
- if (sbi->options.nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN))
+ if (sbi->options->nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN))
return 0;
- name_len = ntfs_utf16_to_nls(sbi, (struct le_str *)&fname->name_len,
- name, PATH_MAX);
+ name_len = ntfs_utf16_to_nls(sbi, fname->name, fname->name_len, name,
+ PATH_MAX);
if (name_len <= 0) {
ntfs_warn(sbi->sb, "failed to convert name for inode %lx.",
ino);
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 424450e77ad5..a3cd3c3f091e 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -8,11 +8,11 @@
*/
#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/compat.h>
#include <linux/falloc.h>
#include <linux/fiemap.h>
-#include <linux/nls.h>
#include "debug.h"
#include "ntfs.h"
@@ -588,8 +588,11 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
truncate_pagecache(inode, vbo_down);
if (!is_sparsed(ni) && !is_compressed(ni)) {
- /* Normal file. */
- err = ntfs_zero_range(inode, vbo, end);
+ /*
+ * Normal file, can't make hole.
+ * TODO: Try to find way to save info about hole.
+ */
+ err = -EOPNOTSUPP;
goto out;
}
@@ -737,7 +740,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
umode_t mode = inode->i_mode;
int err;
- if (sbi->options.no_acs_rules) {
+ if (sbi->options->noacsrules) {
/* "No access rules" - Force any changes of time etc. */
attr->ia_valid |= ATTR_FORCE;
/* and disable for editing some attributes. */
@@ -1185,7 +1188,7 @@ static int ntfs_file_release(struct inode *inode, struct file *file)
int err = 0;
/* If we are last writer on the inode, drop the block reservation. */
- if (sbi->options.prealloc && ((file->f_mode & FMODE_WRITE) &&
+ if (sbi->options->prealloc && ((file->f_mode & FMODE_WRITE) &&
atomic_read(&inode->i_writecount) == 1)) {
ni_lock(ni);
down_write(&ni->file.run_lock);
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 938b12d56ca6..6f47a9c17f89 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -5,11 +5,8 @@
*
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/fiemap.h>
#include <linux/fs.h>
-#include <linux/nls.h>
#include <linux/vmalloc.h>
#include "debug.h"
@@ -708,18 +705,35 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
continue;
mi = ni_find_mi(ni, ino_get(&le->ref));
+ if (!mi) {
+ /* Should never happened, 'cause already checked. */
+ goto bad;
+ }
attr = mi_find_attr(mi, NULL, le->type, le_name(le),
le->name_len, &le->id);
+ if (!attr) {
+ /* Should never happened, 'cause already checked. */
+ goto bad;
+ }
asize = le32_to_cpu(attr->size);
/* Insert into primary record. */
attr_ins = mi_insert_attr(&ni->mi, le->type, le_name(le),
le->name_len, asize,
le16_to_cpu(attr->name_off));
- id = attr_ins->id;
+ if (!attr_ins) {
+ /*
+ * Internal error.
+ * Either no space in primary record (already checked).
+ * Either tried to insert another
+ * non indexed attribute (logic error).
+ */
+ goto bad;
+ }
/* Copy all except id. */
+ id = attr_ins->id;
memcpy(attr_ins, attr, asize);
attr_ins->id = id;
@@ -735,6 +749,10 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
ni->attr_list.dirty = false;
return 0;
+bad:
+ ntfs_inode_err(&ni->vfs_inode, "Internal error");
+ make_bad_inode(&ni->vfs_inode);
+ return -EINVAL;
}
/*
@@ -956,6 +974,13 @@ static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le,
continue;
}
+ /*
+ * Do not try to insert this attribute
+ * if there is no room in record.
+ */
+ if (le32_to_cpu(mi->mrec->used) + asize > sbi->record_size)
+ continue;
+
/* Try to insert attribute into this subrecord. */
attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize,
name_off, svcn, ins_le);
@@ -1451,7 +1476,7 @@ int ni_insert_resident(struct ntfs_inode *ni, u32 data_size,
attr->res.flags = RESIDENT_FLAG_INDEXED;
/* is_attr_indexed(attr)) == true */
- le16_add_cpu(&ni->mi.mrec->hard_links, +1);
+ le16_add_cpu(&ni->mi.mrec->hard_links, 1);
ni->mi.dirty = true;
}
attr->res.res = 0;
@@ -1606,7 +1631,7 @@ struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type,
*le = NULL;
- if (FILE_NAME_POSIX == name_type)
+ if (name_type == FILE_NAME_POSIX)
return NULL;
/* Enumerate all names. */
@@ -1706,18 +1731,16 @@ out:
/*
* ni_parse_reparse
*
- * Buffer is at least 24 bytes.
+ * buffer - memory for reparse buffer header
*/
enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr,
- void *buffer)
+ struct REPARSE_DATA_BUFFER *buffer)
{
const struct REPARSE_DATA_BUFFER *rp = NULL;
u8 bits;
u16 len;
typeof(rp->CompressReparseBuffer) *cmpr;
- static_assert(sizeof(struct REPARSE_DATA_BUFFER) <= 24);
-
/* Try to estimate reparse point. */
if (!attr->non_res) {
rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER));
@@ -1803,6 +1826,9 @@ enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr,
return REPARSE_NONE;
}
+ if (buffer != rp)
+ memcpy(buffer, rp, sizeof(struct REPARSE_DATA_BUFFER));
+
/* Looks like normal symlink. */
return REPARSE_LINK;
}
@@ -2906,9 +2932,8 @@ bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de + 1, de_key_size);
mi_get_ref(&ni->mi, &de->ref);
- if (indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 1)) {
+ if (indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 1))
return false;
- }
}
return true;
@@ -3077,7 +3102,9 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup,
const struct EA_INFO *info;
info = resident_data_ex(attr, sizeof(struct EA_INFO));
- dup->ea_size = info->size_pack;
+ /* If ATTR_EA_INFO exists 'info' can't be NULL. */
+ if (info)
+ dup->ea_size = info->size_pack;
}
}
@@ -3205,7 +3232,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
goto out;
}
- err = al_update(ni);
+ err = al_update(ni, sync);
if (err)
goto out;
}
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index b5853aed0e25..06492f088d60 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -6,12 +6,8 @@
*/
#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/hash.h>
-#include <linux/nls.h>
#include <linux/random.h>
-#include <linux/ratelimit.h>
#include <linux/slab.h>
#include "debug.h"
@@ -2219,7 +2215,7 @@ file_is_valid:
err = ntfs_sb_write_run(log->ni->mi.sbi,
&log->ni->file.run, off, page,
- log->page_size);
+ log->page_size, 0);
if (err)
goto out;
@@ -3710,7 +3706,7 @@ move_data:
if (a_dirty) {
attr = oa->attr;
- err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes);
+ err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes, 0);
if (err)
goto out;
}
@@ -5152,10 +5148,10 @@ end_reply:
ntfs_fix_pre_write(&rh->rhdr, log->page_size);
- err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rh, log->page_size);
+ err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rh, log->page_size, 0);
if (!err)
err = ntfs_sb_write_run(sbi, &log->ni->file.run, log->page_size,
- rh, log->page_size);
+ rh, log->page_size, 0);
kfree(rh);
if (err)
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index 91e3743e1442..4de9acb16968 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -8,7 +8,7 @@
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/nls.h>
+#include <linux/kernel.h>
#include "debug.h"
#include "ntfs.h"
@@ -358,7 +358,7 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len,
enum ALLOCATE_OPT opt)
{
int err;
- CLST alen = 0;
+ CLST alen;
struct super_block *sb = sbi->sb;
size_t alcn, zlen, zeroes, zlcn, zlen2, ztrim, new_zlen;
struct wnd_bitmap *wnd = &sbi->used.bitmap;
@@ -370,27 +370,28 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len,
if (!zlen) {
err = ntfs_refresh_zone(sbi);
if (err)
- goto out;
+ goto up_write;
+
zlen = wnd_zone_len(wnd);
}
if (!zlen) {
ntfs_err(sbi->sb, "no free space to extend mft");
- goto out;
+ err = -ENOSPC;
+ goto up_write;
}
lcn = wnd_zone_bit(wnd);
- alen = zlen > len ? len : zlen;
+ alen = min_t(CLST, len, zlen);
wnd_zone_set(wnd, lcn + alen, zlen - alen);
err = wnd_set_used(wnd, lcn, alen);
- if (err) {
- up_write(&wnd->rw_lock);
- return err;
- }
+ if (err)
+ goto up_write;
+
alcn = lcn;
- goto out;
+ goto space_found;
}
/*
* 'Cause cluster 0 is always used this value means that we should use
@@ -404,49 +405,45 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len,
alen = wnd_find(wnd, len, lcn, BITMAP_FIND_MARK_AS_USED, &alcn);
if (alen)
- goto out;
+ goto space_found;
/* Try to use clusters from MftZone. */
zlen = wnd_zone_len(wnd);
zeroes = wnd_zeroes(wnd);
/* Check too big request */
- if (len > zeroes + zlen || zlen <= NTFS_MIN_MFT_ZONE)
- goto out;
+ if (len > zeroes + zlen || zlen <= NTFS_MIN_MFT_ZONE) {
+ err = -ENOSPC;
+ goto up_write;
+ }
/* How many clusters to cat from zone. */
zlcn = wnd_zone_bit(wnd);
zlen2 = zlen >> 1;
- ztrim = len > zlen ? zlen : (len > zlen2 ? len : zlen2);
- new_zlen = zlen - ztrim;
-
- if (new_zlen < NTFS_MIN_MFT_ZONE) {
- new_zlen = NTFS_MIN_MFT_ZONE;
- if (new_zlen > zlen)
- new_zlen = zlen;
- }
+ ztrim = clamp_val(len, zlen2, zlen);
+ new_zlen = max_t(size_t, zlen - ztrim, NTFS_MIN_MFT_ZONE);
wnd_zone_set(wnd, zlcn, new_zlen);
/* Allocate continues clusters. */
alen = wnd_find(wnd, len, 0,
BITMAP_FIND_MARK_AS_USED | BITMAP_FIND_FULL, &alcn);
-
-out:
- if (alen) {
- err = 0;
- *new_len = alen;
- *new_lcn = alcn;
-
- ntfs_unmap_meta(sb, alcn, alen);
-
- /* Set hint for next requests. */
- if (!(opt & ALLOCATE_MFT))
- sbi->used.next_free_lcn = alcn + alen;
- } else {
+ if (!alen) {
err = -ENOSPC;
+ goto up_write;
}
+space_found:
+ err = 0;
+ *new_len = alen;
+ *new_lcn = alcn;
+
+ ntfs_unmap_meta(sb, alcn, alen);
+
+ /* Set hint for next requests. */
+ if (!(opt & ALLOCATE_MFT))
+ sbi->used.next_free_lcn = alcn + alen;
+up_write:
up_write(&wnd->rw_lock);
return err;
}
@@ -1080,7 +1077,7 @@ int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes,
}
int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run,
- u64 vbo, const void *buf, size_t bytes)
+ u64 vbo, const void *buf, size_t bytes, int sync)
{
struct super_block *sb = sbi->sb;
u8 cluster_bits = sbi->cluster_bits;
@@ -1099,8 +1096,8 @@ int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run,
len = ((u64)clen << cluster_bits) - off;
for (;;) {
- u32 op = len < bytes ? len : bytes;
- int err = ntfs_sb_write(sb, lbo, op, buf, 0);
+ u32 op = min_t(u64, len, bytes);
+ int err = ntfs_sb_write(sb, lbo, op, buf, sync);
if (err)
return err;
@@ -1300,7 +1297,7 @@ int ntfs_get_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo,
nb->off = off = lbo & (blocksize - 1);
for (;;) {
- u32 len32 = len < bytes ? len : bytes;
+ u32 len32 = min_t(u64, len, bytes);
sector_t block = lbo >> sb->s_blocksize_bits;
do {
@@ -2175,7 +2172,7 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi,
/* Write main SDS bucket. */
err = ntfs_sb_write_run(sbi, &ni->file.run, sbi->security.next_off,
- d_security, aligned_sec_size);
+ d_security, aligned_sec_size, 0);
if (err)
goto out;
@@ -2193,7 +2190,7 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi,
/* Write copy SDS bucket. */
err = ntfs_sb_write_run(sbi, &ni->file.run, mirr_off, d_security,
- aligned_sec_size);
+ aligned_sec_size, 0);
if (err)
goto out;
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 0daca9adc54c..6f81e3a49abf 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -8,7 +8,7 @@
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/nls.h>
+#include <linux/kernel.h>
#include "debug.h"
#include "ntfs.h"
@@ -671,138 +671,74 @@ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx,
const struct INDEX_HDR *hdr, const void *key,
size_t key_len, const void *ctx, int *diff)
{
- struct NTFS_DE *e;
+ struct NTFS_DE *e, *found = NULL;
NTFS_CMP_FUNC cmp = indx->cmp;
+ int min_idx = 0, mid_idx, max_idx = 0;
+ int diff2;
+ int table_size = 8;
u32 e_size, e_key_len;
u32 end = le32_to_cpu(hdr->used);
u32 off = le32_to_cpu(hdr->de_off);
+ u16 offs[128];
-#ifdef NTFS3_INDEX_BINARY_SEARCH
- int max_idx = 0, fnd, min_idx;
- int nslots = 64;
- u16 *offs;
-
- if (end > 0x10000)
- goto next;
-
- offs = kmalloc(sizeof(u16) * nslots, GFP_NOFS);
- if (!offs)
- goto next;
+fill_table:
+ if (off + sizeof(struct NTFS_DE) > end)
+ return NULL;
- /* Use binary search algorithm. */
-next1:
- if (off + sizeof(struct NTFS_DE) > end) {
- e = NULL;
- goto out1;
- }
e = Add2Ptr(hdr, off);
e_size = le16_to_cpu(e->size);
- if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) {
- e = NULL;
- goto out1;
- }
-
- if (max_idx >= nslots) {
- u16 *ptr;
- int new_slots = ALIGN(2 * nslots, 8);
-
- ptr = kmalloc(sizeof(u16) * new_slots, GFP_NOFS);
- if (ptr)
- memcpy(ptr, offs, sizeof(u16) * max_idx);
- kfree(offs);
- offs = ptr;
- nslots = new_slots;
- if (!ptr)
- goto next;
- }
-
- /* Store entry table. */
- offs[max_idx] = off;
+ if (e_size < sizeof(struct NTFS_DE) || off + e_size > end)
+ return NULL;
if (!de_is_last(e)) {
+ offs[max_idx] = off;
off += e_size;
- max_idx += 1;
- goto next1;
- }
- /*
- * Table of pointers is created.
- * Use binary search to find entry that is <= to the search value.
- */
- fnd = -1;
- min_idx = 0;
+ max_idx++;
+ if (max_idx < table_size)
+ goto fill_table;
- while (min_idx <= max_idx) {
- int mid_idx = min_idx + ((max_idx - min_idx) >> 1);
- int diff2;
-
- e = Add2Ptr(hdr, offs[mid_idx]);
+ max_idx--;
+ }
- e_key_len = le16_to_cpu(e->key_size);
+binary_search:
+ e_key_len = le16_to_cpu(e->key_size);
- diff2 = (*cmp)(key, key_len, e + 1, e_key_len, ctx);
+ diff2 = (*cmp)(key, key_len, e + 1, e_key_len, ctx);
+ if (diff2 > 0) {
+ if (found) {
+ min_idx = mid_idx + 1;
+ } else {
+ if (de_is_last(e))
+ return NULL;
- if (!diff2) {
- *diff = 0;
- goto out1;
+ max_idx = 0;
+ table_size = min(table_size * 2,
+ (int)ARRAY_SIZE(offs));
+ goto fill_table;
}
-
- if (diff2 < 0) {
+ } else if (diff2 < 0) {
+ if (found)
max_idx = mid_idx - 1;
- fnd = mid_idx;
- if (!fnd)
- break;
- } else {
- min_idx = mid_idx + 1;
- }
- }
+ else
+ max_idx--;
- if (fnd == -1) {
- e = NULL;
- goto out1;
+ found = e;
+ } else {
+ *diff = 0;
+ return e;
}
- *diff = -1;
- e = Add2Ptr(hdr, offs[fnd]);
-
-out1:
- kfree(offs);
-
- return e;
-#endif
-
-next:
- /*
- * Entries index are sorted.
- * Enumerate all entries until we find entry
- * that is <= to the search value.
- */
- if (off + sizeof(struct NTFS_DE) > end)
- return NULL;
-
- e = Add2Ptr(hdr, off);
- e_size = le16_to_cpu(e->size);
-
- if (e_size < sizeof(struct NTFS_DE) || off + e_size > end)
- return NULL;
-
- off += e_size;
-
- e_key_len = le16_to_cpu(e->key_size);
-
- *diff = (*cmp)(key, key_len, e + 1, e_key_len, ctx);
- if (!*diff)
- return e;
+ if (min_idx > max_idx) {
+ *diff = -1;
+ return found;
+ }
- if (*diff <= 0)
- return e;
+ mid_idx = (min_idx + max_idx) >> 1;
+ e = Add2Ptr(hdr, offs[mid_idx]);
- if (de_is_last(e)) {
- *diff = 1;
- return e;
- }
- goto next;
+ goto binary_search;
}
/*
@@ -1136,9 +1072,7 @@ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni,
if (!e)
return -EINVAL;
- if (fnd)
- fnd->root_de = e;
-
+ fnd->root_de = e;
err = 0;
for (;;) {
@@ -1401,7 +1335,7 @@ ok:
static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
CLST *vbn)
{
- int err = -ENOMEM;
+ int err;
struct ntfs_sb_info *sbi = ni->mi.sbi;
struct ATTRIB *bitmap;
struct ATTRIB *alloc;
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index db2a5a4c38e4..a87ab3ad3cd3 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -5,10 +5,8 @@
*
*/
-#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/iversion.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/nls.h>
@@ -49,8 +47,8 @@ static struct inode *ntfs_read_mft(struct inode *inode,
inode->i_op = NULL;
/* Setup 'uid' and 'gid' */
- inode->i_uid = sbi->options.fs_uid;
- inode->i_gid = sbi->options.fs_gid;
+ inode->i_uid = sbi->options->fs_uid;
+ inode->i_gid = sbi->options->fs_gid;
err = mi_init(&ni->mi, sbi, ino);
if (err)
@@ -224,12 +222,9 @@ next_attr:
if (!attr->non_res) {
ni->i_valid = inode->i_size = rsize;
inode_set_bytes(inode, rsize);
- t32 = asize;
- } else {
- t32 = le16_to_cpu(attr->nres.run_off);
}
- mode = S_IFREG | (0777 & sbi->options.fs_fmask_inv);
+ mode = S_IFREG | (0777 & sbi->options->fs_fmask_inv);
if (!attr->non_res) {
ni->ni_flags |= NI_FLAG_RESIDENT;
@@ -272,7 +267,7 @@ next_attr:
goto out;
mode = sb->s_root
- ? (S_IFDIR | (0777 & sbi->options.fs_dmask_inv))
+ ? (S_IFDIR | (0777 & sbi->options->fs_dmask_inv))
: (S_IFDIR | 0777);
goto next_attr;
@@ -315,17 +310,14 @@ next_attr:
rp_fa = ni_parse_reparse(ni, attr, &rp);
switch (rp_fa) {
case REPARSE_LINK:
- if (!attr->non_res) {
- inode->i_size = rsize;
- inode_set_bytes(inode, rsize);
- t32 = asize;
- } else {
- inode->i_size =
- le64_to_cpu(attr->nres.data_size);
- t32 = le16_to_cpu(attr->nres.run_off);
- }
+ /*
+ * Normal symlink.
+ * Assume one unicode symbol == one utf8.
+ */
+ inode->i_size = le16_to_cpu(rp.SymbolicLinkReparseBuffer
+ .PrintNameLength) /
+ sizeof(u16);
- /* Looks like normal symlink. */
ni->i_valid = inode->i_size;
/* Clear directory bit. */
@@ -422,7 +414,7 @@ end_enum:
ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY;
inode->i_op = &ntfs_link_inode_operations;
inode->i_fop = NULL;
- inode_nohighmem(inode); // ??
+ inode_nohighmem(inode);
} else if (S_ISREG(mode)) {
ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY;
inode->i_op = &ntfs_file_inode_operations;
@@ -443,7 +435,7 @@ end_enum:
goto out;
}
- if ((sbi->options.sys_immutable &&
+ if ((sbi->options->sys_immutable &&
(std5->fa & FILE_ATTRIBUTE_SYSTEM)) &&
!S_ISFIFO(mode) && !S_ISSOCK(mode) && !S_ISLNK(mode)) {
inode->i_flags |= S_IMMUTABLE;
@@ -1054,7 +1046,7 @@ int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
if (!ret && i2)
ret = writeback_inode(i2);
if (!ret)
- ret = filemap_flush(sb->s_bdev->bd_inode->i_mapping);
+ ret = sync_blockdev_nowait(sb->s_bdev);
return ret;
}
@@ -1200,9 +1192,13 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
struct REPARSE_DATA_BUFFER *rp = NULL;
bool rp_inserted = false;
+ ni_lock_dir(dir_ni);
+
dir_root = indx_get_root(&dir_ni->dir, dir_ni, NULL, NULL);
- if (!dir_root)
- return ERR_PTR(-EINVAL);
+ if (!dir_root) {
+ err = -EINVAL;
+ goto out1;
+ }
if (S_ISDIR(mode)) {
/* Use parent's directory attributes. */
@@ -1244,7 +1240,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
* }
*/
} else if (S_ISREG(mode)) {
- if (sbi->options.sparse) {
+ if (sbi->options->sparse) {
/* Sparsed regular file, cause option 'sparse'. */
fa = FILE_ATTRIBUTE_SPARSE_FILE |
FILE_ATTRIBUTE_ARCHIVE;
@@ -1486,7 +1482,10 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
asize = ALIGN(SIZEOF_RESIDENT + nsize, 8);
t16 = PtrOffset(rec, attr);
- /* 0x78 - the size of EA + EAINFO to store WSL */
+ /*
+ * Below function 'ntfs_save_wsl_perm' requires 0x78 bytes.
+ * It is good idea to keep extened attributes resident.
+ */
if (asize + t16 + 0x78 + 8 > sbi->record_size) {
CLST alen;
CLST clst = bytes_to_cluster(sbi, nsize);
@@ -1521,14 +1520,14 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
}
asize = SIZEOF_NONRESIDENT + ALIGN(err, 8);
- inode->i_size = nsize;
} else {
attr->res.data_off = SIZEOF_RESIDENT_LE;
attr->res.data_size = cpu_to_le32(nsize);
memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), rp, nsize);
- inode->i_size = nsize;
nsize = 0;
}
+ /* Size of symlink equals the length of input string. */
+ inode->i_size = size;
attr->size = cpu_to_le32(asize);
@@ -1551,6 +1550,9 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
if (err)
goto out6;
+ /* Unlock parent directory before ntfs_init_acl. */
+ ni_unlock(dir_ni);
+
inode->i_generation = le16_to_cpu(rec->seq);
dir->i_mtime = dir->i_ctime = inode->i_atime;
@@ -1562,6 +1564,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
inode->i_op = &ntfs_link_inode_operations;
inode->i_fop = NULL;
inode->i_mapping->a_ops = &ntfs_aops;
+ inode->i_size = size;
+ inode_nohighmem(inode);
} else if (S_ISREG(mode)) {
inode->i_op = &ntfs_file_inode_operations;
inode->i_fop = &ntfs_file_operations;
@@ -1577,7 +1581,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
if (!S_ISLNK(mode) && (sb->s_flags & SB_POSIXACL)) {
err = ntfs_init_acl(mnt_userns, inode, dir);
if (err)
- goto out6;
+ goto out7;
} else
#endif
{
@@ -1586,7 +1590,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
/* Write non resident data. */
if (nsize) {
- err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize);
+ err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize, 0);
if (err)
goto out7;
}
@@ -1607,8 +1611,10 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
out7:
/* Undo 'indx_insert_entry'. */
+ ni_lock_dir(dir_ni);
indx_delete_entry(&dir_ni->dir, dir_ni, new_de + 1,
le16_to_cpu(new_de->key_size), sbi);
+ /* ni_unlock(dir_ni); will be called later. */
out6:
if (rp_inserted)
ntfs_remove_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref);
@@ -1632,8 +1638,10 @@ out2:
kfree(rp);
out1:
- if (err)
+ if (err) {
+ ni_unlock(dir_ni);
return ERR_PTR(err);
+ }
unlock_new_inode(inode);
@@ -1754,15 +1762,15 @@ void ntfs_evict_inode(struct inode *inode)
static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer,
int buflen)
{
- int i, err = 0;
+ int i, err = -EINVAL;
struct ntfs_inode *ni = ntfs_i(inode);
struct super_block *sb = inode->i_sb;
struct ntfs_sb_info *sbi = sb->s_fs_info;
- u64 i_size = inode->i_size;
- u16 nlen = 0;
+ u64 size;
+ u16 ulen = 0;
void *to_free = NULL;
struct REPARSE_DATA_BUFFER *rp;
- struct le_str *uni;
+ const __le16 *uname;
struct ATTRIB *attr;
/* Reparse data present. Try to parse it. */
@@ -1771,68 +1779,64 @@ static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer,
*buffer = 0;
- /* Read into temporal buffer. */
- if (i_size > sbi->reparse.max_size || i_size <= sizeof(u32)) {
- err = -EINVAL;
- goto out;
- }
-
attr = ni_find_attr(ni, NULL, NULL, ATTR_REPARSE, NULL, 0, NULL, NULL);
- if (!attr) {
- err = -EINVAL;
+ if (!attr)
goto out;
- }
if (!attr->non_res) {
- rp = resident_data_ex(attr, i_size);
- if (!rp) {
- err = -EINVAL;
+ rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER));
+ if (!rp)
goto out;
- }
+ size = le32_to_cpu(attr->res.data_size);
} else {
- rp = kmalloc(i_size, GFP_NOFS);
+ size = le64_to_cpu(attr->nres.data_size);
+ rp = NULL;
+ }
+
+ if (size > sbi->reparse.max_size || size <= sizeof(u32))
+ goto out;
+
+ if (!rp) {
+ rp = kmalloc(size, GFP_NOFS);
if (!rp) {
err = -ENOMEM;
goto out;
}
to_free = rp;
- err = ntfs_read_run_nb(sbi, &ni->file.run, 0, rp, i_size, NULL);
+ /* Read into temporal buffer. */
+ err = ntfs_read_run_nb(sbi, &ni->file.run, 0, rp, size, NULL);
if (err)
goto out;
}
- err = -EINVAL;
-
/* Microsoft Tag. */
switch (rp->ReparseTag) {
case IO_REPARSE_TAG_MOUNT_POINT:
/* Mount points and junctions. */
/* Can we use 'Rp->MountPointReparseBuffer.PrintNameLength'? */
- if (i_size <= offsetof(struct REPARSE_DATA_BUFFER,
- MountPointReparseBuffer.PathBuffer))
+ if (size <= offsetof(struct REPARSE_DATA_BUFFER,
+ MountPointReparseBuffer.PathBuffer))
goto out;
- uni = Add2Ptr(rp,
- offsetof(struct REPARSE_DATA_BUFFER,
- MountPointReparseBuffer.PathBuffer) +
- le16_to_cpu(rp->MountPointReparseBuffer
- .PrintNameOffset) -
- 2);
- nlen = le16_to_cpu(rp->MountPointReparseBuffer.PrintNameLength);
+ uname = Add2Ptr(rp,
+ offsetof(struct REPARSE_DATA_BUFFER,
+ MountPointReparseBuffer.PathBuffer) +
+ le16_to_cpu(rp->MountPointReparseBuffer
+ .PrintNameOffset));
+ ulen = le16_to_cpu(rp->MountPointReparseBuffer.PrintNameLength);
break;
case IO_REPARSE_TAG_SYMLINK:
/* FolderSymbolicLink */
/* Can we use 'Rp->SymbolicLinkReparseBuffer.PrintNameLength'? */
- if (i_size <= offsetof(struct REPARSE_DATA_BUFFER,
- SymbolicLinkReparseBuffer.PathBuffer))
+ if (size <= offsetof(struct REPARSE_DATA_BUFFER,
+ SymbolicLinkReparseBuffer.PathBuffer))
goto out;
- uni = Add2Ptr(rp,
- offsetof(struct REPARSE_DATA_BUFFER,
- SymbolicLinkReparseBuffer.PathBuffer) +
- le16_to_cpu(rp->SymbolicLinkReparseBuffer
- .PrintNameOffset) -
- 2);
- nlen = le16_to_cpu(
+ uname = Add2Ptr(
+ rp, offsetof(struct REPARSE_DATA_BUFFER,
+ SymbolicLinkReparseBuffer.PathBuffer) +
+ le16_to_cpu(rp->SymbolicLinkReparseBuffer
+ .PrintNameOffset));
+ ulen = le16_to_cpu(
rp->SymbolicLinkReparseBuffer.PrintNameLength);
break;
@@ -1864,29 +1868,28 @@ static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer,
goto out;
}
if (!IsReparseTagNameSurrogate(rp->ReparseTag) ||
- i_size <= sizeof(struct REPARSE_POINT)) {
+ size <= sizeof(struct REPARSE_POINT)) {
goto out;
}
/* Users tag. */
- uni = Add2Ptr(rp, sizeof(struct REPARSE_POINT) - 2);
- nlen = le16_to_cpu(rp->ReparseDataLength) -
+ uname = Add2Ptr(rp, sizeof(struct REPARSE_POINT));
+ ulen = le16_to_cpu(rp->ReparseDataLength) -
sizeof(struct REPARSE_POINT);
}
/* Convert nlen from bytes to UNICODE chars. */
- nlen >>= 1;
+ ulen >>= 1;
/* Check that name is available. */
- if (!nlen || &uni->name[nlen] > (__le16 *)Add2Ptr(rp, i_size))
+ if (!ulen || uname + ulen > (__le16 *)Add2Ptr(rp, size))
goto out;
/* If name is already zero terminated then truncate it now. */
- if (!uni->name[nlen - 1])
- nlen -= 1;
- uni->len = nlen;
+ if (!uname[ulen - 1])
+ ulen -= 1;
- err = ntfs_utf16_to_nls(sbi, uni, buffer, buflen);
+ err = ntfs_utf16_to_nls(sbi, uname, ulen, buffer, buflen);
if (err < 0)
goto out;
diff --git a/fs/ntfs3/lib/decompress_common.h b/fs/ntfs3/lib/decompress_common.h
index 2d70ae42f1b5..dd7ced000d0e 100644
--- a/fs/ntfs3/lib/decompress_common.h
+++ b/fs/ntfs3/lib/decompress_common.h
@@ -5,6 +5,9 @@
* Copyright (C) 2015 Eric Biggers
*/
+#ifndef _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H
+#define _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H
+
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/types.h>
@@ -336,3 +339,5 @@ static forceinline u8 *lz_copy(u8 *dst, u32 length, u32 offset, const u8 *bufend
return dst;
}
+
+#endif /* _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H */
diff --git a/fs/ntfs3/lib/lib.h b/fs/ntfs3/lib/lib.h
index f508fbad2e71..90309a5ae59c 100644
--- a/fs/ntfs3/lib/lib.h
+++ b/fs/ntfs3/lib/lib.h
@@ -7,6 +7,10 @@
* - linux kernel code style
*/
+#ifndef _LINUX_NTFS3_LIB_LIB_H
+#define _LINUX_NTFS3_LIB_LIB_H
+
+#include <linux/types.h>
/* globals from xpress_decompress.c */
struct xpress_decompressor *xpress_allocate_decompressor(void);
@@ -24,3 +28,5 @@ int lzx_decompress(struct lzx_decompressor *__restrict d,
const void *__restrict compressed_data,
size_t compressed_size, void *__restrict uncompressed_data,
size_t uncompressed_size);
+
+#endif /* _LINUX_NTFS3_LIB_LIB_H */
diff --git a/fs/ntfs3/lznt.c b/fs/ntfs3/lznt.c
index f1f691a67cc4..28f654561f27 100644
--- a/fs/ntfs3/lznt.c
+++ b/fs/ntfs3/lznt.c
@@ -5,13 +5,13 @@
*
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
-#include <linux/fs.h>
-#include <linux/nls.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
#include "debug.h"
-#include "ntfs.h"
#include "ntfs_fs.h"
// clang-format off
@@ -292,7 +292,7 @@ next:
/*
* get_lznt_ctx
* @level: 0 - Standard compression.
- * !0 - Best compression, requires a lot of cpu.
+ * !0 - Best compression, requires a lot of cpu.
*/
struct lznt *get_lznt_ctx(int level)
{
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index e58415d07132..bc741213ad84 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -5,11 +5,7 @@
*
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/iversion.h>
-#include <linux/namei.h>
#include <linux/nls.h>
#include "debug.h"
@@ -99,16 +95,11 @@ static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry,
static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl)
{
- struct ntfs_inode *ni = ntfs_i(dir);
struct inode *inode;
- ni_lock_dir(ni);
-
inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFREG | mode,
0, NULL, 0, NULL);
- ni_unlock(ni);
-
return IS_ERR(inode) ? PTR_ERR(inode) : 0;
}
@@ -120,16 +111,11 @@ static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir,
static int ntfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t rdev)
{
- struct ntfs_inode *ni = ntfs_i(dir);
struct inode *inode;
- ni_lock_dir(ni);
-
inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, mode, rdev,
NULL, 0, NULL);
- ni_unlock(ni);
-
return IS_ERR(inode) ? PTR_ERR(inode) : 0;
}
@@ -200,15 +186,10 @@ static int ntfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
{
u32 size = strlen(symname);
struct inode *inode;
- struct ntfs_inode *ni = ntfs_i(dir);
-
- ni_lock_dir(ni);
inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFLNK | 0777,
0, symname, size, NULL);
- ni_unlock(ni);
-
return IS_ERR(inode) ? PTR_ERR(inode) : 0;
}
@@ -219,15 +200,10 @@ static int ntfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
struct inode *inode;
- struct ntfs_inode *ni = ntfs_i(dir);
-
- ni_lock_dir(ni);
inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFDIR | mode,
0, NULL, 0, NULL);
- ni_unlock(ni);
-
return IS_ERR(inode) ? PTR_ERR(inode) : 0;
}
diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h
index 6bb3e595263b..9cc396b117bf 100644
--- a/fs/ntfs3/ntfs.h
+++ b/fs/ntfs3/ntfs.h
@@ -10,19 +10,27 @@
#ifndef _LINUX_NTFS3_NTFS_H
#define _LINUX_NTFS3_NTFS_H
-/* TODO: Check 4K MFT record and 512 bytes cluster. */
+#include <linux/blkdev.h>
+#include <linux/build_bug.h>
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "debug.h"
-/* Activate this define to use binary search in indexes. */
-#define NTFS3_INDEX_BINARY_SEARCH
+/* TODO: Check 4K MFT record and 512 bytes cluster. */
/* Check each run for marked clusters. */
#define NTFS3_CHECK_FREE_CLST
#define NTFS_NAME_LEN 255
-/* ntfs.sys used 500 maximum links on-disk struct allows up to 0xffff. */
-#define NTFS_LINK_MAX 0x400
-//#define NTFS_LINK_MAX 0xffff
+/*
+ * ntfs.sys used 500 maximum links on-disk struct allows up to 0xffff.
+ * xfstest generic/041 creates 3003 hardlinks.
+ */
+#define NTFS_LINK_MAX 4000
/*
* Activate to use 64 bit clusters instead of 32 bits in ntfs.sys.
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index dc71c59fd445..8aaec7e0804e 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -9,6 +9,37 @@
#ifndef _LINUX_NTFS3_NTFS_FS_H
#define _LINUX_NTFS3_NTFS_FS_H
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/cleancache.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/page-flags.h>
+#include <linux/pagemap.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/time64.h>
+#include <linux/types.h>
+#include <linux/uidgid.h>
+#include <asm/div64.h>
+#include <asm/page.h>
+
+#include "debug.h"
+#include "ntfs.h"
+
+struct dentry;
+struct fiemap_extent_info;
+struct user_namespace;
+struct page;
+struct writeback_control;
+enum utf16_endian;
+
+
#define MINUS_ONE_T ((size_t)(-1))
/* Biggest MFT / smallest cluster */
#define MAXIMUM_BYTES_PER_MFT 4096
@@ -52,6 +83,7 @@
// clang-format on
struct ntfs_mount_options {
+ char *nls_name;
struct nls_table *nls;
kuid_t fs_uid;
@@ -59,19 +91,16 @@ struct ntfs_mount_options {
u16 fs_fmask_inv;
u16 fs_dmask_inv;
- unsigned uid : 1, /* uid was set. */
- gid : 1, /* gid was set. */
- fmask : 1, /* fmask was set. */
- dmask : 1, /* dmask was set. */
- sys_immutable : 1, /* Immutable system files. */
- discard : 1, /* Issue discard requests on deletions. */
- sparse : 1, /* Create sparse files. */
- showmeta : 1, /* Show meta files. */
- nohidden : 1, /* Do not show hidden files. */
- force : 1, /* Rw mount dirty volume. */
- no_acs_rules : 1, /*Exclude acs rules. */
- prealloc : 1 /* Preallocate space when file is growing. */
- ;
+ unsigned fmask : 1; /* fmask was set. */
+ unsigned dmask : 1; /*dmask was set. */
+ unsigned sys_immutable : 1; /* Immutable system files. */
+ unsigned discard : 1; /* Issue discard requests on deletions. */
+ unsigned sparse : 1; /* Create sparse files. */
+ unsigned showmeta : 1; /* Show meta files. */
+ unsigned nohidden : 1; /* Do not show hidden files. */
+ unsigned force : 1; /* RW mount dirty volume. */
+ unsigned noacsrules : 1; /* Exclude acs rules. */
+ unsigned prealloc : 1; /* Preallocate space when file is growing. */
};
/* Special value to unpack and deallocate. */
@@ -182,10 +211,8 @@ struct ntfs_sb_info {
u32 blocks_per_cluster; // cluster_size / sb->s_blocksize
u32 record_size;
- u32 sector_size;
u32 index_size;
- u8 sector_bits;
u8 cluster_bits;
u8 record_bits;
@@ -279,7 +306,7 @@ struct ntfs_sb_info {
#endif
} compress;
- struct ntfs_mount_options options;
+ struct ntfs_mount_options *options;
struct ratelimit_state msg_ratelimit;
};
@@ -436,7 +463,7 @@ bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le);
bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn,
const __le16 *name, size_t name_len,
const struct MFT_REF *ref);
-int al_update(struct ntfs_inode *ni);
+int al_update(struct ntfs_inode *ni, int sync);
static inline size_t al_aligned(size_t size)
{
return (size + 1023) & ~(size_t)1023;
@@ -448,7 +475,7 @@ bool are_bits_set(const ulong *map, size_t bit, size_t nbits);
size_t get_set_bits_ex(const ulong *map, size_t bit, size_t nbits);
/* Globals from dir.c */
-int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni,
+int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len,
u8 *buf, int buf_len);
int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len,
struct cpu_str *uni, u32 max_ulen,
@@ -520,7 +547,7 @@ struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type,
struct ATTR_LIST_ENTRY **entry);
int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa);
enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr,
- void *buffer);
+ struct REPARSE_DATA_BUFFER *buffer);
int ni_write_inode(struct inode *inode, int sync, const char *hint);
#define _ni_write_inode(i, w) ni_write_inode(i, w, __func__)
int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo,
@@ -577,7 +604,7 @@ int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer);
int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes,
const void *buffer, int wait);
int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run,
- u64 vbo, const void *buf, size_t bytes);
+ u64 vbo, const void *buf, size_t bytes, int sync);
struct buffer_head *ntfs_bread_run(struct ntfs_sb_info *sbi,
const struct runs_tree *run, u64 vbo);
int ntfs_read_run_nb(struct ntfs_sb_info *sbi, const struct runs_tree *run,
diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c
index 103705c86772..861e35791506 100644
--- a/fs/ntfs3/record.c
+++ b/fs/ntfs3/record.c
@@ -5,10 +5,7 @@
*
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/nls.h>
#include "debug.h"
#include "ntfs.h"
diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c
index 26ed2b64345e..a8fec651f973 100644
--- a/fs/ntfs3/run.c
+++ b/fs/ntfs3/run.c
@@ -7,10 +7,8 @@
*/
#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/log2.h>
-#include <linux/nls.h>
#include "debug.h"
#include "ntfs.h"
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 55bbc9200a10..29813200c7af 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -23,16 +23,15 @@
*
*/
-#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
#include <linux/fs.h>
-#include <linux/iversion.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/log2.h>
#include <linux/module.h>
#include <linux/nls.h>
-#include <linux/parser.h>
#include <linux/seq_file.h>
#include <linux/statfs.h>
@@ -205,9 +204,11 @@ void *ntfs_put_shared(void *ptr)
return ret;
}
-static inline void clear_mount_options(struct ntfs_mount_options *options)
+static inline void put_mount_options(struct ntfs_mount_options *options)
{
+ kfree(options->nls_name);
unload_nls(options->nls);
+ kfree(options);
}
enum Opt {
@@ -223,218 +224,175 @@ enum Opt {
Opt_nohidden,
Opt_showmeta,
Opt_acl,
- Opt_noatime,
- Opt_nls,
+ Opt_iocharset,
Opt_prealloc,
- Opt_no_acs_rules,
+ Opt_noacsrules,
Opt_err,
};
-static const match_table_t ntfs_tokens = {
- { Opt_uid, "uid=%u" },
- { Opt_gid, "gid=%u" },
- { Opt_umask, "umask=%o" },
- { Opt_dmask, "dmask=%o" },
- { Opt_fmask, "fmask=%o" },
- { Opt_immutable, "sys_immutable" },
- { Opt_discard, "discard" },
- { Opt_force, "force" },
- { Opt_sparse, "sparse" },
- { Opt_nohidden, "nohidden" },
- { Opt_acl, "acl" },
- { Opt_noatime, "noatime" },
- { Opt_showmeta, "showmeta" },
- { Opt_nls, "nls=%s" },
- { Opt_prealloc, "prealloc" },
- { Opt_no_acs_rules, "no_acs_rules" },
- { Opt_err, NULL },
+static const struct fs_parameter_spec ntfs_fs_parameters[] = {
+ fsparam_u32("uid", Opt_uid),
+ fsparam_u32("gid", Opt_gid),
+ fsparam_u32oct("umask", Opt_umask),
+ fsparam_u32oct("dmask", Opt_dmask),
+ fsparam_u32oct("fmask", Opt_fmask),
+ fsparam_flag_no("sys_immutable", Opt_immutable),
+ fsparam_flag_no("discard", Opt_discard),
+ fsparam_flag_no("force", Opt_force),
+ fsparam_flag_no("sparse", Opt_sparse),
+ fsparam_flag_no("hidden", Opt_nohidden),
+ fsparam_flag_no("acl", Opt_acl),
+ fsparam_flag_no("showmeta", Opt_showmeta),
+ fsparam_flag_no("prealloc", Opt_prealloc),
+ fsparam_flag_no("acsrules", Opt_noacsrules),
+ fsparam_string("iocharset", Opt_iocharset),
+ {}
};
-static noinline int ntfs_parse_options(struct super_block *sb, char *options,
- int silent,
- struct ntfs_mount_options *opts)
+/*
+ * Load nls table or if @nls is utf8 then return NULL.
+ */
+static struct nls_table *ntfs_load_nls(char *nls)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
- char nls_name[30];
- struct nls_table *nls;
+ struct nls_table *ret;
- opts->fs_uid = current_uid();
- opts->fs_gid = current_gid();
- opts->fs_fmask_inv = opts->fs_dmask_inv = ~current_umask();
- nls_name[0] = 0;
+ if (!nls)
+ nls = CONFIG_NLS_DEFAULT;
- if (!options)
- goto out;
+ if (strcmp(nls, "utf8") == 0)
+ return NULL;
- while ((p = strsep(&options, ","))) {
- int token;
+ if (strcmp(nls, CONFIG_NLS_DEFAULT) == 0)
+ return load_nls_default();
- if (!*p)
- continue;
+ ret = load_nls(nls);
+ if (ret)
+ return ret;
- token = match_token(p, ntfs_tokens, args);
- switch (token) {
- case Opt_immutable:
- opts->sys_immutable = 1;
- break;
- case Opt_uid:
- if (match_int(&args[0], &option))
- return -EINVAL;
- opts->fs_uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(opts->fs_uid))
- return -EINVAL;
- opts->uid = 1;
- break;
- case Opt_gid:
- if (match_int(&args[0], &option))
- return -EINVAL;
- opts->fs_gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(opts->fs_gid))
- return -EINVAL;
- opts->gid = 1;
- break;
- case Opt_umask:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->fs_fmask_inv = opts->fs_dmask_inv = ~option;
- opts->fmask = opts->dmask = 1;
- break;
- case Opt_dmask:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->fs_dmask_inv = ~option;
- opts->dmask = 1;
- break;
- case Opt_fmask:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->fs_fmask_inv = ~option;
- opts->fmask = 1;
- break;
- case Opt_discard:
- opts->discard = 1;
- break;
- case Opt_force:
- opts->force = 1;
- break;
- case Opt_sparse:
- opts->sparse = 1;
- break;
- case Opt_nohidden:
- opts->nohidden = 1;
- break;
- case Opt_acl:
+ return ERR_PTR(-EINVAL);
+}
+
+static int ntfs_fs_parse_param(struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ struct ntfs_mount_options *opts = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, ntfs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ opts->fs_uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(opts->fs_uid))
+ return invalf(fc, "ntfs3: Invalid value for uid.");
+ break;
+ case Opt_gid:
+ opts->fs_gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(opts->fs_gid))
+ return invalf(fc, "ntfs3: Invalid value for gid.");
+ break;
+ case Opt_umask:
+ if (result.uint_32 & ~07777)
+ return invalf(fc, "ntfs3: Invalid value for umask.");
+ opts->fs_fmask_inv = ~result.uint_32;
+ opts->fs_dmask_inv = ~result.uint_32;
+ opts->fmask = 1;
+ opts->dmask = 1;
+ break;
+ case Opt_dmask:
+ if (result.uint_32 & ~07777)
+ return invalf(fc, "ntfs3: Invalid value for dmask.");
+ opts->fs_dmask_inv = ~result.uint_32;
+ opts->dmask = 1;
+ break;
+ case Opt_fmask:
+ if (result.uint_32 & ~07777)
+ return invalf(fc, "ntfs3: Invalid value for fmask.");
+ opts->fs_fmask_inv = ~result.uint_32;
+ opts->fmask = 1;
+ break;
+ case Opt_immutable:
+ opts->sys_immutable = result.negated ? 0 : 1;
+ break;
+ case Opt_discard:
+ opts->discard = result.negated ? 0 : 1;
+ break;
+ case Opt_force:
+ opts->force = result.negated ? 0 : 1;
+ break;
+ case Opt_sparse:
+ opts->sparse = result.negated ? 0 : 1;
+ break;
+ case Opt_nohidden:
+ opts->nohidden = result.negated ? 1 : 0;
+ break;
+ case Opt_acl:
+ if (!result.negated)
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
- sb->s_flags |= SB_POSIXACL;
- break;
+ fc->sb_flags |= SB_POSIXACL;
#else
- ntfs_err(sb, "support for ACL not compiled in!");
- return -EINVAL;
+ return invalf(fc, "ntfs3: Support for ACL not compiled in!");
#endif
- case Opt_noatime:
- sb->s_flags |= SB_NOATIME;
- break;
- case Opt_showmeta:
- opts->showmeta = 1;
- break;
- case Opt_nls:
- match_strlcpy(nls_name, &args[0], sizeof(nls_name));
- break;
- case Opt_prealloc:
- opts->prealloc = 1;
- break;
- case Opt_no_acs_rules:
- opts->no_acs_rules = 1;
- break;
- default:
- if (!silent)
- ntfs_err(
- sb,
- "Unrecognized mount option \"%s\" or missing value",
- p);
- //return -EINVAL;
- }
- }
-
-out:
- if (!strcmp(nls_name[0] ? nls_name : CONFIG_NLS_DEFAULT, "utf8")) {
- /*
- * For UTF-8 use utf16s_to_utf8s()/utf8s_to_utf16s()
- * instead of NLS.
- */
- nls = NULL;
- } else if (nls_name[0]) {
- nls = load_nls(nls_name);
- if (!nls) {
- ntfs_err(sb, "failed to load \"%s\"", nls_name);
- return -EINVAL;
- }
- } else {
- nls = load_nls_default();
- if (!nls) {
- ntfs_err(sb, "failed to load default nls");
- return -EINVAL;
- }
+ else
+ fc->sb_flags &= ~SB_POSIXACL;
+ break;
+ case Opt_showmeta:
+ opts->showmeta = result.negated ? 0 : 1;
+ break;
+ case Opt_iocharset:
+ kfree(opts->nls_name);
+ opts->nls_name = param->string;
+ param->string = NULL;
+ break;
+ case Opt_prealloc:
+ opts->prealloc = result.negated ? 0 : 1;
+ break;
+ case Opt_noacsrules:
+ opts->noacsrules = result.negated ? 1 : 0;
+ break;
+ default:
+ /* Should not be here unless we forget add case. */
+ return -EINVAL;
}
- opts->nls = nls;
-
return 0;
}
-static int ntfs_remount(struct super_block *sb, int *flags, char *data)
+static int ntfs_fs_reconfigure(struct fs_context *fc)
{
- int err, ro_rw;
+ struct super_block *sb = fc->root->d_sb;
struct ntfs_sb_info *sbi = sb->s_fs_info;
- struct ntfs_mount_options old_opts;
- char *orig_data = kstrdup(data, GFP_KERNEL);
-
- if (data && !orig_data)
- return -ENOMEM;
+ struct ntfs_mount_options *new_opts = fc->fs_private;
+ int ro_rw;
- /* Store original options. */
- memcpy(&old_opts, &sbi->options, sizeof(old_opts));
- clear_mount_options(&sbi->options);
- memset(&sbi->options, 0, sizeof(sbi->options));
-
- err = ntfs_parse_options(sb, data, 0, &sbi->options);
- if (err)
- goto restore_opts;
-
- ro_rw = sb_rdonly(sb) && !(*flags & SB_RDONLY);
+ ro_rw = sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY);
if (ro_rw && (sbi->flags & NTFS_FLAGS_NEED_REPLAY)) {
- ntfs_warn(
- sb,
- "Couldn't remount rw because journal is not replayed. Please umount/remount instead\n");
- err = -EINVAL;
- goto restore_opts;
+ errorf(fc, "ntfs3: Couldn't remount rw because journal is not replayed. Please umount/remount instead\n");
+ return -EINVAL;
}
+ new_opts->nls = ntfs_load_nls(new_opts->nls_name);
+ if (IS_ERR(new_opts->nls)) {
+ new_opts->nls = NULL;
+ errorf(fc, "ntfs3: Cannot load iocharset %s", new_opts->nls_name);
+ return -EINVAL;
+ }
+ if (new_opts->nls != sbi->options->nls)
+ return invalf(fc, "ntfs3: Cannot use different iocharset when remounting!");
+
sync_filesystem(sb);
if (ro_rw && (sbi->volume.flags & VOLUME_FLAG_DIRTY) &&
- !sbi->options.force) {
- ntfs_warn(sb, "volume is dirty and \"force\" flag is not set!");
- err = -EINVAL;
- goto restore_opts;
+ !new_opts->force) {
+ errorf(fc, "ntfs3: Volume is dirty and \"force\" flag is not set!");
+ return -EINVAL;
}
- clear_mount_options(&old_opts);
-
- *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME) |
- SB_NODIRATIME | SB_NOATIME;
- ntfs_info(sb, "re-mounted. Opts: %s", orig_data);
- err = 0;
- goto out;
+ memcpy(sbi->options, new_opts, sizeof(*new_opts));
-restore_opts:
- clear_mount_options(&sbi->options);
- memcpy(&sbi->options, &old_opts, sizeof(old_opts));
-
-out:
- kfree(orig_data);
- return err;
+ return 0;
}
static struct kmem_cache *ntfs_inode_cachep;
@@ -513,8 +471,6 @@ static noinline void put_ntfs(struct ntfs_sb_info *sbi)
xpress_free_decompressor(sbi->compress.xpress);
lzx_free_decompressor(sbi->compress.lzx);
#endif
- clear_mount_options(&sbi->options);
-
kfree(sbi);
}
@@ -525,7 +481,9 @@ static void ntfs_put_super(struct super_block *sb)
/* Mark rw ntfs as clear, if possible. */
ntfs_set_state(sbi, NTFS_DIRTY_CLEAR);
+ put_mount_options(sbi->options);
put_ntfs(sbi);
+ sb->s_fs_info = NULL;
sync_blockdev(sb->s_bdev);
}
@@ -552,23 +510,21 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root)
{
struct super_block *sb = root->d_sb;
struct ntfs_sb_info *sbi = sb->s_fs_info;
- struct ntfs_mount_options *opts = &sbi->options;
+ struct ntfs_mount_options *opts = sbi->options;
struct user_namespace *user_ns = seq_user_ns(m);
- if (opts->uid)
- seq_printf(m, ",uid=%u",
- from_kuid_munged(user_ns, opts->fs_uid));
- if (opts->gid)
- seq_printf(m, ",gid=%u",
- from_kgid_munged(user_ns, opts->fs_gid));
+ seq_printf(m, ",uid=%u",
+ from_kuid_munged(user_ns, opts->fs_uid));
+ seq_printf(m, ",gid=%u",
+ from_kgid_munged(user_ns, opts->fs_gid));
if (opts->fmask)
seq_printf(m, ",fmask=%04o", ~opts->fs_fmask_inv);
if (opts->dmask)
seq_printf(m, ",dmask=%04o", ~opts->fs_dmask_inv);
if (opts->nls)
- seq_printf(m, ",nls=%s", opts->nls->charset);
+ seq_printf(m, ",iocharset=%s", opts->nls->charset);
else
- seq_puts(m, ",nls=utf8");
+ seq_puts(m, ",iocharset=utf8");
if (opts->sys_immutable)
seq_puts(m, ",sys_immutable");
if (opts->discard)
@@ -581,14 +537,12 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",nohidden");
if (opts->force)
seq_puts(m, ",force");
- if (opts->no_acs_rules)
- seq_puts(m, ",no_acs_rules");
+ if (opts->noacsrules)
+ seq_puts(m, ",noacsrules");
if (opts->prealloc)
seq_puts(m, ",prealloc");
if (sb->s_flags & SB_POSIXACL)
seq_puts(m, ",acl");
- if (sb->s_flags & SB_NOATIME)
- seq_puts(m, ",noatime");
return 0;
}
@@ -643,7 +597,6 @@ static const struct super_operations ntfs_sops = {
.statfs = ntfs_statfs,
.show_options = ntfs_show_options,
.sync_fs = ntfs_sync_fs,
- .remount_fs = ntfs_remount,
.write_inode = ntfs3_write_inode,
};
@@ -729,7 +682,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
struct ntfs_sb_info *sbi = sb->s_fs_info;
int err;
u32 mb, gb, boot_sector_size, sct_per_clst, record_size;
- u64 sectors, clusters, fs_size, mlcn, mlcn2;
+ u64 sectors, clusters, mlcn, mlcn2;
struct NTFS_BOOT *boot;
struct buffer_head *bh;
struct MFT_REC *rec;
@@ -787,20 +740,20 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
goto out;
}
- sbi->sector_size = boot_sector_size;
- sbi->sector_bits = blksize_bits(boot_sector_size);
- fs_size = (sectors + 1) << sbi->sector_bits;
+ sbi->volume.size = sectors * boot_sector_size;
- gb = format_size_gb(fs_size, &mb);
+ gb = format_size_gb(sbi->volume.size + boot_sector_size, &mb);
/*
* - Volume formatted and mounted with the same sector size.
* - Volume formatted 4K and mounted as 512.
* - Volume formatted 512 and mounted as 4K.
*/
- if (sbi->sector_size != sector_size) {
- ntfs_warn(sb,
- "Different NTFS' sector size and media sector size");
+ if (boot_sector_size != sector_size) {
+ ntfs_warn(
+ sb,
+ "Different NTFS' sector size (%u) and media sector size (%u)",
+ boot_sector_size, sector_size);
dev_size += sector_size - 1;
}
@@ -810,8 +763,19 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
sbi->mft.lbo = mlcn << sbi->cluster_bits;
sbi->mft.lbo2 = mlcn2 << sbi->cluster_bits;
- if (sbi->cluster_size < sbi->sector_size)
+ /* Compare boot's cluster and sector. */
+ if (sbi->cluster_size < boot_sector_size)
+ goto out;
+
+ /* Compare boot's cluster and media sector. */
+ if (sbi->cluster_size < sector_size) {
+ /* No way to use ntfs_get_block in this case. */
+ ntfs_err(
+ sb,
+ "Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u)",
+ sbi->cluster_size, sector_size);
goto out;
+ }
sbi->cluster_mask = sbi->cluster_size - 1;
sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask;
@@ -836,10 +800,9 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
: (u32)boot->index_size << sbi->cluster_bits;
sbi->volume.ser_num = le64_to_cpu(boot->serial_num);
- sbi->volume.size = sectors << sbi->sector_bits;
/* Warning if RAW volume. */
- if (dev_size < fs_size) {
+ if (dev_size < sbi->volume.size + boot_sector_size) {
u32 mb0, gb0;
gb0 = format_size_gb(dev_size, &mb0);
@@ -883,8 +846,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
rec->total = cpu_to_le32(sbi->record_size);
((struct ATTRIB *)Add2Ptr(rec, ao))->type = ATTR_END;
- if (sbi->cluster_size < PAGE_SIZE)
- sb_set_blocksize(sb, sbi->cluster_size);
+ sb_set_blocksize(sb, min_t(u32, sbi->cluster_size, PAGE_SIZE));
sbi->block_mask = sb->s_blocksize - 1;
sbi->blocks_per_cluster = sbi->cluster_size >> sb->s_blocksize_bits;
@@ -897,9 +859,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
if (clusters >= (1ull << (64 - sbi->cluster_bits)))
sbi->maxbytes = -1;
sbi->maxbytes_sparse = -1;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
#else
/* Maximum size for sparse file. */
sbi->maxbytes_sparse = (1ull << (sbi->cluster_bits + 32)) - 1;
+ sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits;
#endif
err = 0;
@@ -913,14 +877,13 @@ out:
/*
* ntfs_fill_super - Try to mount.
*/
-static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
+static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
int err;
- struct ntfs_sb_info *sbi;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
struct block_device *bdev = sb->s_bdev;
- struct inode *bd_inode = bdev->bd_inode;
- struct request_queue *rq = bdev_get_queue(bdev);
- struct inode *inode = NULL;
+ struct request_queue *rq;
+ struct inode *inode;
struct ntfs_inode *ni;
size_t i, tt;
CLST vcn, lcn, len;
@@ -928,18 +891,11 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
const struct VOLUME_INFO *info;
u32 idx, done, bytes;
struct ATTR_DEF_ENTRY *t;
- u16 *upcase = NULL;
u16 *shared;
- bool is_ro;
struct MFT_REF ref;
ref.high = 0;
- sbi = kzalloc(sizeof(struct ntfs_sb_info), GFP_NOFS);
- if (!sbi)
- return -ENOMEM;
-
- sb->s_fs_info = sbi;
sbi->sb = sb;
sb->s_flags |= SB_NODIRATIME;
sb->s_magic = 0x7366746e; // "ntfs"
@@ -948,41 +904,27 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec
sb->s_xattr = ntfs_xattr_handlers;
- ratelimit_state_init(&sbi->msg_ratelimit, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
-
- err = ntfs_parse_options(sb, data, silent, &sbi->options);
- if (err)
+ sbi->options->nls = ntfs_load_nls(sbi->options->nls_name);
+ if (IS_ERR(sbi->options->nls)) {
+ sbi->options->nls = NULL;
+ errorf(fc, "Cannot load nls %s", sbi->options->nls_name);
+ err = -EINVAL;
goto out;
+ }
- if (!rq || !blk_queue_discard(rq) || !rq->limits.discard_granularity) {
- ;
- } else {
+ rq = bdev_get_queue(bdev);
+ if (blk_queue_discard(rq) && rq->limits.discard_granularity) {
sbi->discard_granularity = rq->limits.discard_granularity;
sbi->discard_granularity_mask_inv =
~(u64)(sbi->discard_granularity - 1);
}
- sb_set_blocksize(sb, PAGE_SIZE);
-
/* Parse boot. */
err = ntfs_init_from_boot(sb, rq ? queue_logical_block_size(rq) : 512,
- bd_inode->i_size);
+ bdev_nr_bytes(bdev));
if (err)
goto out;
-#ifdef CONFIG_NTFS3_64BIT_CLUSTER
- sb->s_maxbytes = MAX_LFS_FILESIZE;
-#else
- sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits;
-#endif
-
- mutex_init(&sbi->compress.mtx_lznt);
-#ifdef CONFIG_NTFS3_LZX_XPRESS
- mutex_init(&sbi->compress.mtx_xpress);
- mutex_init(&sbi->compress.mtx_lzx);
-#endif
-
/*
* Load $Volume. This should be done before $LogFile
* 'cause 'sbi->volume.ni' is used 'ntfs_set_state'.
@@ -991,9 +933,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
ref.seq = cpu_to_le16(MFT_REC_VOL);
inode = ntfs_iget5(sb, &ref, &NAME_VOLUME);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
ntfs_err(sb, "Failed to load $Volume.");
- inode = NULL;
+ err = PTR_ERR(inode);
goto out;
}
@@ -1015,36 +956,33 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
} else {
/* Should we break mounting here? */
//err = -EINVAL;
- //goto out;
+ //goto put_inode_out;
}
attr = ni_find_attr(ni, attr, NULL, ATTR_VOL_INFO, NULL, 0, NULL, NULL);
if (!attr || is_attr_ext(attr)) {
err = -EINVAL;
- goto out;
+ goto put_inode_out;
}
info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO);
if (!info) {
err = -EINVAL;
- goto out;
+ goto put_inode_out;
}
sbi->volume.major_ver = info->major_ver;
sbi->volume.minor_ver = info->minor_ver;
sbi->volume.flags = info->flags;
-
sbi->volume.ni = ni;
- inode = NULL;
/* Load $MFTMirr to estimate recs_mirr. */
ref.low = cpu_to_le32(MFT_REC_MIRR);
ref.seq = cpu_to_le16(MFT_REC_MIRR);
inode = ntfs_iget5(sb, &ref, &NAME_MIRROR);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
ntfs_err(sb, "Failed to load $MFTMirr.");
- inode = NULL;
+ err = PTR_ERR(inode);
goto out;
}
@@ -1058,9 +996,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
ref.seq = cpu_to_le16(MFT_REC_LOG);
inode = ntfs_iget5(sb, &ref, &NAME_LOGFILE);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
ntfs_err(sb, "Failed to load \x24LogFile.");
- inode = NULL;
+ err = PTR_ERR(inode);
goto out;
}
@@ -1068,22 +1005,19 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
err = ntfs_loadlog_and_replay(ni, sbi);
if (err)
- goto out;
+ goto put_inode_out;
iput(inode);
- inode = NULL;
-
- is_ro = sb_rdonly(sbi->sb);
if (sbi->flags & NTFS_FLAGS_NEED_REPLAY) {
- if (!is_ro) {
+ if (!sb_rdonly(sb)) {
ntfs_warn(sb,
"failed to replay log file. Can't mount rw!");
err = -EINVAL;
goto out;
}
} else if (sbi->volume.flags & VOLUME_FLAG_DIRTY) {
- if (!is_ro && !sbi->options.force) {
+ if (!sb_rdonly(sb) && !sbi->options->force) {
ntfs_warn(
sb,
"volume is dirty and \"force\" flag is not set!");
@@ -1098,9 +1032,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
inode = ntfs_iget5(sb, &ref, &NAME_MFT);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
ntfs_err(sb, "Failed to load $MFT.");
- inode = NULL;
+ err = PTR_ERR(inode);
goto out;
}
@@ -1112,11 +1045,11 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
err = wnd_init(&sbi->mft.bitmap, sb, tt);
if (err)
- goto out;
+ goto put_inode_out;
err = ni_load_all_mi(ni);
if (err)
- goto out;
+ goto put_inode_out;
sbi->mft.ni = ni;
@@ -1125,9 +1058,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
ref.seq = cpu_to_le16(MFT_REC_BADCLUST);
inode = ntfs_iget5(sb, &ref, &NAME_BADCLUS);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
ntfs_err(sb, "Failed to load $BadClus.");
- inode = NULL;
+ err = PTR_ERR(inode);
goto out;
}
@@ -1150,18 +1082,15 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
ref.seq = cpu_to_le16(MFT_REC_BITMAP);
inode = ntfs_iget5(sb, &ref, &NAME_BITMAP);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
ntfs_err(sb, "Failed to load $Bitmap.");
- inode = NULL;
+ err = PTR_ERR(inode);
goto out;
}
- ni = ntfs_i(inode);
-
#ifndef CONFIG_NTFS3_64BIT_CLUSTER
if (inode->i_size >> 32) {
err = -EINVAL;
- goto out;
+ goto put_inode_out;
}
#endif
@@ -1169,14 +1098,14 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
tt = sbi->used.bitmap.nbits;
if (inode->i_size < bitmap_size(tt)) {
err = -EINVAL;
- goto out;
+ goto put_inode_out;
}
/* Not necessary. */
sbi->used.bitmap.set_tail = true;
- err = wnd_init(&sbi->used.bitmap, sbi->sb, tt);
+ err = wnd_init(&sbi->used.bitmap, sb, tt);
if (err)
- goto out;
+ goto put_inode_out;
iput(inode);
@@ -1188,23 +1117,22 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
/* Load $AttrDef. */
ref.low = cpu_to_le32(MFT_REC_ATTR);
ref.seq = cpu_to_le16(MFT_REC_ATTR);
- inode = ntfs_iget5(sbi->sb, &ref, &NAME_ATTRDEF);
+ inode = ntfs_iget5(sb, &ref, &NAME_ATTRDEF);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
ntfs_err(sb, "Failed to load $AttrDef -> %d", err);
- inode = NULL;
+ err = PTR_ERR(inode);
goto out;
}
if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY)) {
err = -EINVAL;
- goto out;
+ goto put_inode_out;
}
bytes = inode->i_size;
sbi->def_table = t = kmalloc(bytes, GFP_NOFS);
if (!t) {
err = -ENOMEM;
- goto out;
+ goto put_inode_out;
}
for (done = idx = 0; done < bytes; done += PAGE_SIZE, idx++) {
@@ -1213,7 +1141,7 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
if (IS_ERR(page)) {
err = PTR_ERR(page);
- goto out;
+ goto put_inode_out;
}
memcpy(Add2Ptr(t, done), page_address(page),
min(PAGE_SIZE, tail));
@@ -1221,7 +1149,7 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
if (!idx && ATTR_STD != t->type) {
err = -EINVAL;
- goto out;
+ goto put_inode_out;
}
}
@@ -1254,33 +1182,24 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
ref.seq = cpu_to_le16(MFT_REC_UPCASE);
inode = ntfs_iget5(sb, &ref, &NAME_UPCASE);
if (IS_ERR(inode)) {
+ ntfs_err(sb, "Failed to load $UpCase.");
err = PTR_ERR(inode);
- ntfs_err(sb, "Failed to load \x24LogFile.");
- inode = NULL;
goto out;
}
- ni = ntfs_i(inode);
-
if (inode->i_size != 0x10000 * sizeof(short)) {
err = -EINVAL;
- goto out;
- }
-
- sbi->upcase = upcase = kvmalloc(0x10000 * sizeof(short), GFP_KERNEL);
- if (!upcase) {
- err = -ENOMEM;
- goto out;
+ goto put_inode_out;
}
for (idx = 0; idx < (0x10000 * sizeof(short) >> PAGE_SHIFT); idx++) {
const __le16 *src;
- u16 *dst = Add2Ptr(upcase, idx << PAGE_SHIFT);
+ u16 *dst = Add2Ptr(sbi->upcase, idx << PAGE_SHIFT);
struct page *page = ntfs_map_page(inode->i_mapping, idx);
if (IS_ERR(page)) {
err = PTR_ERR(page);
- goto out;
+ goto put_inode_out;
}
src = page_address(page);
@@ -1294,14 +1213,13 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
ntfs_unmap_page(page);
}
- shared = ntfs_set_shared(upcase, 0x10000 * sizeof(short));
- if (shared && upcase != shared) {
+ shared = ntfs_set_shared(sbi->upcase, 0x10000 * sizeof(short));
+ if (shared && sbi->upcase != shared) {
+ kvfree(sbi->upcase);
sbi->upcase = shared;
- kvfree(upcase);
}
iput(inode);
- inode = NULL;
if (is_ntfs3(sbi)) {
/* Load $Secure. */
@@ -1331,34 +1249,31 @@ load_root:
ref.seq = cpu_to_le16(MFT_REC_ROOT);
inode = ntfs_iget5(sb, &ref, &NAME_ROOT);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
ntfs_err(sb, "Failed to load root.");
- inode = NULL;
+ err = PTR_ERR(inode);
goto out;
}
- ni = ntfs_i(inode);
-
sb->s_root = d_make_root(inode);
-
if (!sb->s_root) {
- err = -EINVAL;
- goto out;
+ err = -ENOMEM;
+ goto put_inode_out;
}
+ fc->fs_private = NULL;
+
return 0;
-out:
+put_inode_out:
iput(inode);
-
- if (sb->s_root) {
- d_drop(sb->s_root);
- sb->s_root = NULL;
- }
-
+out:
+ /*
+ * Free resources here.
+ * ntfs_fs_free will be called with fc->s_fs_info = NULL
+ */
put_ntfs(sbi);
-
sb->s_fs_info = NULL;
+
return err;
}
@@ -1403,7 +1318,7 @@ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len)
if (sbi->flags & NTFS_FLAGS_NODISCARD)
return -EOPNOTSUPP;
- if (!sbi->options.discard)
+ if (!sbi->options->discard)
return -EOPNOTSUPP;
lbo = (u64)lcn << sbi->cluster_bits;
@@ -1428,19 +1343,99 @@ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len)
return err;
}
-static struct dentry *ntfs_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
+static int ntfs_fs_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, ntfs_fill_super);
+}
+
+/*
+ * ntfs_fs_free - Free fs_context.
+ *
+ * Note that this will be called after fill_super and reconfigure
+ * even when they pass. So they have to take pointers if they pass.
+ */
+static void ntfs_fs_free(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
+ struct ntfs_mount_options *opts = fc->fs_private;
+ struct ntfs_sb_info *sbi = fc->s_fs_info;
+
+ if (sbi)
+ put_ntfs(sbi);
+
+ if (opts)
+ put_mount_options(opts);
+}
+
+static const struct fs_context_operations ntfs_context_ops = {
+ .parse_param = ntfs_fs_parse_param,
+ .get_tree = ntfs_fs_get_tree,
+ .reconfigure = ntfs_fs_reconfigure,
+ .free = ntfs_fs_free,
+};
+
+/*
+ * ntfs_init_fs_context - Initialize spi and opts
+ *
+ * This will called when mount/remount. We will first initiliaze
+ * options so that if remount we can use just that.
+ */
+static int ntfs_init_fs_context(struct fs_context *fc)
+{
+ struct ntfs_mount_options *opts;
+ struct ntfs_sb_info *sbi;
+
+ opts = kzalloc(sizeof(struct ntfs_mount_options), GFP_NOFS);
+ if (!opts)
+ return -ENOMEM;
+
+ /* Default options. */
+ opts->fs_uid = current_uid();
+ opts->fs_gid = current_gid();
+ opts->fs_fmask_inv = ~current_umask();
+ opts->fs_dmask_inv = ~current_umask();
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)
+ goto ok;
+
+ sbi = kzalloc(sizeof(struct ntfs_sb_info), GFP_NOFS);
+ if (!sbi)
+ goto free_opts;
+
+ sbi->upcase = kvmalloc(0x10000 * sizeof(short), GFP_KERNEL);
+ if (!sbi->upcase)
+ goto free_sbi;
+
+ ratelimit_state_init(&sbi->msg_ratelimit, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ mutex_init(&sbi->compress.mtx_lznt);
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+ mutex_init(&sbi->compress.mtx_xpress);
+ mutex_init(&sbi->compress.mtx_lzx);
+#endif
+
+ sbi->options = opts;
+ fc->s_fs_info = sbi;
+ok:
+ fc->fs_private = opts;
+ fc->ops = &ntfs_context_ops;
+
+ return 0;
+free_sbi:
+ kfree(sbi);
+free_opts:
+ kfree(opts);
+ return -ENOMEM;
}
// clang-format off
static struct file_system_type ntfs_fs_type = {
- .owner = THIS_MODULE,
- .name = "ntfs3",
- .mount = ntfs_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .owner = THIS_MODULE,
+ .name = "ntfs3",
+ .init_fs_context = ntfs_init_fs_context,
+ .parameters = ntfs_fs_parameters,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
// clang-format on
diff --git a/fs/ntfs3/upcase.c b/fs/ntfs3/upcase.c
index bbeba778237e..b5e8256fd710 100644
--- a/fs/ntfs3/upcase.c
+++ b/fs/ntfs3/upcase.c
@@ -5,13 +5,9 @@
*
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
-#include <linux/module.h>
-#include <linux/nls.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
-#include "debug.h"
-#include "ntfs.h"
#include "ntfs_fs.h"
static inline u16 upcase_unicode_char(const u16 *upcase, u16 chr)
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 7282d85c4ece..afd0ddad826f 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -5,10 +5,7 @@
*
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/nls.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
@@ -78,6 +75,7 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea,
size_t add_bytes, const struct EA_INFO **info)
{
int err;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
struct ATTR_LIST_ENTRY *le = NULL;
struct ATTRIB *attr_info, *attr_ea;
void *ea_p;
@@ -102,10 +100,10 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea,
/* Check Ea limit. */
size = le32_to_cpu((*info)->size);
- if (size > ni->mi.sbi->ea_max_size)
+ if (size > sbi->ea_max_size)
return -EFBIG;
- if (attr_size(attr_ea) > ni->mi.sbi->ea_max_size)
+ if (attr_size(attr_ea) > sbi->ea_max_size)
return -EFBIG;
/* Allocate memory for packed Ea. */
@@ -113,15 +111,16 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea,
if (!ea_p)
return -ENOMEM;
- if (attr_ea->non_res) {
+ if (!size) {
+ ;
+ } else if (attr_ea->non_res) {
struct runs_tree run;
run_init(&run);
err = attr_load_runs(attr_ea, ni, &run, NULL);
if (!err)
- err = ntfs_read_run_nb(ni->mi.sbi, &run, 0, ea_p, size,
- NULL);
+ err = ntfs_read_run_nb(sbi, &run, 0, ea_p, size, NULL);
run_close(&run);
if (err)
@@ -260,7 +259,7 @@ out:
static noinline int ntfs_set_ea(struct inode *inode, const char *name,
size_t name_len, const void *value,
- size_t val_size, int flags, int locked)
+ size_t val_size, int flags)
{
struct ntfs_inode *ni = ntfs_i(inode);
struct ntfs_sb_info *sbi = ni->mi.sbi;
@@ -279,8 +278,7 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
u64 new_sz;
void *p;
- if (!locked)
- ni_lock(ni);
+ ni_lock(ni);
run_init(&ea_run);
@@ -370,21 +368,22 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
new_ea->name[name_len] = 0;
memcpy(new_ea->name + name_len + 1, value, val_size);
new_pack = le16_to_cpu(ea_info.size_pack) + packed_ea_size(new_ea);
-
- /* Should fit into 16 bits. */
- if (new_pack > 0xffff) {
- err = -EFBIG; // -EINVAL?
- goto out;
- }
ea_info.size_pack = cpu_to_le16(new_pack);
-
/* New size of ATTR_EA. */
size += add;
- if (size > sbi->ea_max_size) {
+ ea_info.size = cpu_to_le32(size);
+
+ /*
+ * 1. Check ea_info.size_pack for overflow.
+ * 2. New attibute size must fit value from $AttrDef
+ */
+ if (new_pack > 0xffff || size > sbi->ea_max_size) {
+ ntfs_inode_warn(
+ inode,
+ "The size of extended attributes must not exceed 64KiB");
err = -EFBIG; // -EINVAL?
goto out;
}
- ea_info.size = cpu_to_le32(size);
update_ea:
@@ -444,7 +443,7 @@ update_ea:
/* Delete xattr, ATTR_EA */
ni_remove_attr_le(ni, attr, mi, le);
} else if (attr->non_res) {
- err = ntfs_sb_write_run(sbi, &ea_run, 0, ea_all, size);
+ err = ntfs_sb_write_run(sbi, &ea_run, 0, ea_all, size, 0);
if (err)
goto out;
} else {
@@ -468,8 +467,7 @@ update_ea:
mark_inode_dirty(&ni->vfs_inode);
out:
- if (!locked)
- ni_unlock(ni);
+ ni_unlock(ni);
run_close(&ea_run);
kfree(ea_all);
@@ -478,12 +476,6 @@ out:
}
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
-static inline void ntfs_posix_acl_release(struct posix_acl *acl)
-{
- if (acl && refcount_dec_and_test(&acl->a_refcount))
- kfree(acl);
-}
-
static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns,
struct inode *inode, int type,
int locked)
@@ -521,12 +513,15 @@ static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns,
/* Translate extended attribute to acl. */
if (err >= 0) {
acl = posix_acl_from_xattr(mnt_userns, buf, err);
- if (!IS_ERR(acl))
- set_cached_acl(inode, type, acl);
+ } else if (err == -ENODATA) {
+ acl = NULL;
} else {
- acl = err == -ENODATA ? NULL : ERR_PTR(err);
+ acl = ERR_PTR(err);
}
+ if (!IS_ERR(acl))
+ set_cached_acl(inode, type, acl);
+
__putname(buf);
return acl;
@@ -546,12 +541,13 @@ struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu)
static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns,
struct inode *inode, struct posix_acl *acl,
- int type, int locked)
+ int type)
{
const char *name;
size_t size, name_len;
void *value = NULL;
int err = 0;
+ int flags;
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
@@ -561,22 +557,15 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns,
if (acl) {
umode_t mode = inode->i_mode;
- err = posix_acl_equiv_mode(acl, &mode);
- if (err < 0)
- return err;
+ err = posix_acl_update_mode(mnt_userns, inode, &mode,
+ &acl);
+ if (err)
+ goto out;
if (inode->i_mode != mode) {
inode->i_mode = mode;
mark_inode_dirty(inode);
}
-
- if (!err) {
- /*
- * ACL can be exactly represented in the
- * traditional file mode permission bits.
- */
- acl = NULL;
- }
}
name = XATTR_NAME_POSIX_ACL_ACCESS;
name_len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1;
@@ -594,20 +583,24 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns,
}
if (!acl) {
+ /* Remove xattr if it can be presented via mode. */
size = 0;
value = NULL;
+ flags = XATTR_REPLACE;
} else {
size = posix_acl_xattr_size(acl->a_count);
value = kmalloc(size, GFP_NOFS);
if (!value)
return -ENOMEM;
-
err = posix_acl_to_xattr(mnt_userns, acl, value, size);
if (err < 0)
goto out;
+ flags = 0;
}
- err = ntfs_set_ea(inode, name, name_len, value, size, 0, locked);
+ err = ntfs_set_ea(inode, name, name_len, value, size, flags);
+ if (err == -ENODATA && !size)
+ err = 0; /* Removing non existed xattr. */
if (!err)
set_cached_acl(inode, type, acl);
@@ -623,68 +616,7 @@ out:
int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type)
{
- return ntfs_set_acl_ex(mnt_userns, inode, acl, type, 0);
-}
-
-static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns,
- struct inode *inode, int type, void *buffer,
- size_t size)
-{
- struct posix_acl *acl;
- int err;
-
- if (!(inode->i_sb->s_flags & SB_POSIXACL)) {
- ntfs_inode_warn(inode, "add mount option \"acl\" to use acl");
- return -EOPNOTSUPP;
- }
-
- acl = ntfs_get_acl(inode, type, false);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
-
- if (!acl)
- return -ENODATA;
-
- err = posix_acl_to_xattr(mnt_userns, acl, buffer, size);
- ntfs_posix_acl_release(acl);
-
- return err;
-}
-
-static int ntfs_xattr_set_acl(struct user_namespace *mnt_userns,
- struct inode *inode, int type, const void *value,
- size_t size)
-{
- struct posix_acl *acl;
- int err;
-
- if (!(inode->i_sb->s_flags & SB_POSIXACL)) {
- ntfs_inode_warn(inode, "add mount option \"acl\" to use acl");
- return -EOPNOTSUPP;
- }
-
- if (!inode_owner_or_capable(mnt_userns, inode))
- return -EPERM;
-
- if (!value) {
- acl = NULL;
- } else {
- acl = posix_acl_from_xattr(mnt_userns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
-
- if (acl) {
- err = posix_acl_valid(mnt_userns, acl);
- if (err)
- goto release_and_out;
- }
- }
-
- err = ntfs_set_acl(mnt_userns, inode, acl, type);
-
-release_and_out:
- ntfs_posix_acl_release(acl);
- return err;
+ return ntfs_set_acl_ex(mnt_userns, inode, acl, type);
}
/*
@@ -698,54 +630,27 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *default_acl, *acl;
int err;
- /*
- * TODO: Refactoring lock.
- * ni_lock(dir) ... -> posix_acl_create(dir,...) -> ntfs_get_acl -> ni_lock(dir)
- */
- inode->i_default_acl = NULL;
-
- default_acl = ntfs_get_acl_ex(mnt_userns, dir, ACL_TYPE_DEFAULT, 1);
-
- if (!default_acl || default_acl == ERR_PTR(-EOPNOTSUPP)) {
- inode->i_mode &= ~current_umask();
- err = 0;
- goto out;
- }
-
- if (IS_ERR(default_acl)) {
- err = PTR_ERR(default_acl);
- goto out;
- }
-
- acl = default_acl;
- err = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
- if (err < 0)
- goto out1;
- if (!err) {
- posix_acl_release(acl);
- acl = NULL;
- }
+ err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (err)
+ return err;
- if (!S_ISDIR(inode->i_mode)) {
+ if (default_acl) {
+ err = ntfs_set_acl_ex(mnt_userns, inode, default_acl,
+ ACL_TYPE_DEFAULT);
posix_acl_release(default_acl);
- default_acl = NULL;
+ } else {
+ inode->i_default_acl = NULL;
}
- if (default_acl)
- err = ntfs_set_acl_ex(mnt_userns, inode, default_acl,
- ACL_TYPE_DEFAULT, 1);
-
if (!acl)
inode->i_acl = NULL;
- else if (!err)
- err = ntfs_set_acl_ex(mnt_userns, inode, acl, ACL_TYPE_ACCESS,
- 1);
-
- posix_acl_release(acl);
-out1:
- posix_acl_release(default_acl);
+ else {
+ if (!err)
+ err = ntfs_set_acl_ex(mnt_userns, inode, acl,
+ ACL_TYPE_ACCESS);
+ posix_acl_release(acl);
+ }
-out:
return err;
}
#endif
@@ -772,7 +677,7 @@ int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode)
int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
int mask)
{
- if (ntfs_sb(inode->i_sb)->options.no_acs_rules) {
+ if (ntfs_sb(inode->i_sb)->options->noacsrules) {
/* "No access rules" mode - Allow all changes. */
return 0;
}
@@ -880,23 +785,6 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
goto out;
}
-#ifdef CONFIG_NTFS3_FS_POSIX_ACL
- if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 &&
- !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS,
- sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) ||
- (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 &&
- !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
- sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) {
- /* TODO: init_user_ns? */
- err = ntfs_xattr_get_acl(
- &init_user_ns, inode,
- name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1
- ? ACL_TYPE_ACCESS
- : ACL_TYPE_DEFAULT,
- buffer, size);
- goto out;
- }
-#endif
/* Deal with NTFS extended attribute. */
err = ntfs_get_ea(inode, name, name_len, buffer, size, NULL);
@@ -1009,24 +897,8 @@ set_new_fa:
goto out;
}
-#ifdef CONFIG_NTFS3_FS_POSIX_ACL
- if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 &&
- !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS,
- sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) ||
- (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 &&
- !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
- sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) {
- err = ntfs_xattr_set_acl(
- mnt_userns, inode,
- name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1
- ? ACL_TYPE_ACCESS
- : ACL_TYPE_DEFAULT,
- value, size);
- goto out;
- }
-#endif
/* Deal with NTFS extended attribute. */
- err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0);
+ err = ntfs_set_ea(inode, name, name_len, value, size, flags);
out:
return err;
@@ -1042,28 +914,29 @@ int ntfs_save_wsl_perm(struct inode *inode)
int err;
__le32 value;
+ /* TODO: refactor this, so we don't lock 4 times in ntfs_set_ea */
value = cpu_to_le32(i_uid_read(inode));
err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value,
- sizeof(value), 0, 0);
+ sizeof(value), 0);
if (err)
goto out;
value = cpu_to_le32(i_gid_read(inode));
err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value,
- sizeof(value), 0, 0);
+ sizeof(value), 0);
if (err)
goto out;
value = cpu_to_le32(inode->i_mode);
err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value,
- sizeof(value), 0, 0);
+ sizeof(value), 0);
if (err)
goto out;
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
value = cpu_to_le32(inode->i_rdev);
err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &value,
- sizeof(value), 0, 0);
+ sizeof(value), 0);
if (err)
goto out;
}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f1cc8258d34a..5d9ae17bd443 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7045,7 +7045,7 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
int ocfs2_convert_inline_data_to_extents(struct inode *inode,
struct buffer_head *di_bh)
{
- int ret, i, has_data, num_pages = 0;
+ int ret, has_data, num_pages = 0;
int need_free = 0;
u32 bit_off, num;
handle_t *handle;
@@ -7054,26 +7054,17 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_alloc_context *data_ac = NULL;
- struct page **pages = NULL;
- loff_t end = osb->s_clustersize;
+ struct page *page = NULL;
struct ocfs2_extent_tree et;
int did_quota = 0;
has_data = i_size_read(inode) ? 1 : 0;
if (has_data) {
- pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
- sizeof(struct page *), GFP_NOFS);
- if (pages == NULL) {
- ret = -ENOMEM;
- mlog_errno(ret);
- return ret;
- }
-
ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
if (ret) {
mlog_errno(ret);
- goto free_pages;
+ goto out;
}
}
@@ -7093,7 +7084,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
}
if (has_data) {
- unsigned int page_end;
+ unsigned int page_end = min_t(unsigned, PAGE_SIZE,
+ osb->s_clustersize);
u64 phys;
ret = dquot_alloc_space_nodirty(inode,
@@ -7117,15 +7109,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
*/
block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
- /*
- * Non sparse file systems zero on extend, so no need
- * to do that now.
- */
- if (!ocfs2_sparse_alloc(osb) &&
- PAGE_SIZE < osb->s_clustersize)
- end = PAGE_SIZE;
-
- ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
+ ret = ocfs2_grab_eof_pages(inode, 0, page_end, &page,
+ &num_pages);
if (ret) {
mlog_errno(ret);
need_free = 1;
@@ -7136,20 +7121,15 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
* This should populate the 1st page for us and mark
* it up to date.
*/
- ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
+ ret = ocfs2_read_inline_data(inode, page, di_bh);
if (ret) {
mlog_errno(ret);
need_free = 1;
goto out_unlock;
}
- page_end = PAGE_SIZE;
- if (PAGE_SIZE > osb->s_clustersize)
- page_end = osb->s_clustersize;
-
- for (i = 0; i < num_pages; i++)
- ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
- pages[i], i > 0, &phys);
+ ocfs2_map_and_dirty_page(inode, handle, 0, page_end, page, 0,
+ &phys);
}
spin_lock(&oi->ip_lock);
@@ -7180,8 +7160,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
}
out_unlock:
- if (pages)
- ocfs2_unlock_and_free_pages(pages, num_pages);
+ if (page)
+ ocfs2_unlock_and_free_pages(&page, num_pages);
out_commit:
if (ret < 0 && did_quota)
@@ -7205,8 +7185,6 @@ out_commit:
out:
if (data_ac)
ocfs2_free_alloc_context(data_ac);
-free_pages:
- kfree(pages);
return ret;
}
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8521942f5af2..481017e1dac5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1251,7 +1251,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
{
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
struct journal_head *jh;
- int ret;
+ int ret = 1;
if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
return 0;
@@ -1259,14 +1259,18 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
if (!buffer_jbd(bg_bh))
return 1;
- jh = bh2jh(bg_bh);
- spin_lock(&jh->b_state_lock);
- bg = (struct ocfs2_group_desc *) jh->b_committed_data;
- if (bg)
- ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
- else
- ret = 1;
- spin_unlock(&jh->b_state_lock);
+ jbd_lock_bh_journal_head(bg_bh);
+ if (buffer_jbd(bg_bh)) {
+ jh = bh2jh(bg_bh);
+ spin_lock(&jh->b_state_lock);
+ bg = (struct ocfs2_group_desc *) jh->b_committed_data;
+ if (bg)
+ ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+ else
+ ret = 1;
+ spin_unlock(&jh->b_state_lock);
+ }
+ jbd_unlock_bh_journal_head(bg_bh);
return ret;
}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c86bd4e60e20..5c914ce9b3ac 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2167,11 +2167,17 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
if (ocfs2_clusterinfo_valid(osb)) {
+ /*
+ * ci_stack and ci_cluster in ocfs2_cluster_info may not be null
+ * terminated, so make sure no overflow happens here by using
+ * memcpy. Destination strings will always be null terminated
+ * because osb is allocated using kzalloc.
+ */
osb->osb_stackflags =
OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
- strlcpy(osb->osb_cluster_stack,
+ memcpy(osb->osb_cluster_stack,
OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
- OCFS2_STACK_LABEL_LEN + 1);
+ OCFS2_STACK_LABEL_LEN);
if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
mlog(ML_ERROR,
"couldn't mount because of an invalid "
@@ -2180,9 +2186,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
status = -EINVAL;
goto bail;
}
- strlcpy(osb->osb_cluster_name,
+ memcpy(osb->osb_cluster_name,
OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
- OCFS2_CLUSTER_NAME_LEN + 1);
+ OCFS2_CLUSTER_NAME_LEN);
} else {
/* The empty string is identical with classic tools that
* don't know about s_cluster_info. */
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index c1bb4c4b5d67..e5e3e500ed46 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -10,7 +10,7 @@
* Linux VFS inode operations.
*/
-#include <linux/bvec.h>
+#include <linux/blkdev.h>
#include <linux/fileattr.h>
#include "protocol.h"
#include "orangefs-kernel.h"
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 2f2e430461b2..8bb0a53a658b 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -11,6 +11,7 @@
#include <linux/parser.h>
#include <linux/hashtable.h>
+#include <linux/seq_file.h>
/* a cache for orangefs-inode objects (i.e. orangefs inode private data) */
static struct kmem_cache *orangefs_inode_cache;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 1fefb2b8960e..93c7c267de93 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -1219,9 +1219,13 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir,
goto out_dput;
}
} else {
- if (!d_is_negative(newdentry) &&
- (!new_opaque || !ovl_is_whiteout(newdentry)))
- goto out_dput;
+ if (!d_is_negative(newdentry)) {
+ if (!new_opaque || !ovl_is_whiteout(newdentry))
+ goto out_dput;
+ } else {
+ if (flags & RENAME_EXCHANGE)
+ goto out_dput;
+ }
}
if (olddentry == trap)
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index d081faa55e83..ac461a499882 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -272,14 +272,14 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
kmem_cache_free(ovl_aio_request_cachep, aio_req);
}
-static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2)
+static void ovl_aio_rw_complete(struct kiocb *iocb, long res)
{
struct ovl_aio_req *aio_req = container_of(iocb,
struct ovl_aio_req, iocb);
struct kiocb *orig_iocb = aio_req->orig_iocb;
ovl_aio_cleanup_handler(aio_req);
- orig_iocb->ki_complete(orig_iocb, res, res2);
+ orig_iocb->ki_complete(orig_iocb, res);
}
static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
@@ -296,6 +296,12 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (ret)
return ret;
+ ret = -EINVAL;
+ if (iocb->ki_flags & IOCB_DIRECT &&
+ (!real.file->f_mapping->a_ops ||
+ !real.file->f_mapping->a_ops->direct_IO))
+ goto out_fdput;
+
old_cred = ovl_override_creds(file_inode(file)->i_sb);
if (is_sync_kiocb(iocb)) {
ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
@@ -320,7 +326,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
out:
revert_creds(old_cred);
ovl_file_accessed(file);
-
+out_fdput:
fdput(real);
return ret;
@@ -349,6 +355,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (ret)
goto out_unlock;
+ ret = -EINVAL;
+ if (iocb->ki_flags & IOCB_DIRECT &&
+ (!real.file->f_mapping->a_ops ||
+ !real.file->f_mapping->a_ops->direct_IO))
+ goto out_fdput;
+
if (!ovl_should_sync(OVL_FS(inode->i_sb)))
ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
@@ -384,6 +396,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
}
out:
revert_creds(old_cred);
+out_fdput:
fdput(real);
out_unlock:
diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c
index 04ce58c939a0..5d1fbaffd66a 100644
--- a/fs/pstore/blk.c
+++ b/fs/pstore/blk.c
@@ -205,7 +205,6 @@ static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes,
static int __register_pstore_blk(struct pstore_device_info *dev,
const char *devpath)
{
- struct inode *inode;
int ret = -ENODEV;
lockdep_assert_held(&pstore_blk_lock);
@@ -217,14 +216,13 @@ static int __register_pstore_blk(struct pstore_device_info *dev,
goto err;
}
- inode = file_inode(psblk_file);
- if (!S_ISBLK(inode->i_mode)) {
+ if (!S_ISBLK(file_inode(psblk_file)->i_mode)) {
pr_err("'%s' is not block device!\n", devpath);
goto err_fput;
}
- inode = I_BDEV(psblk_file->f_mapping->host)->bd_inode;
- dev->zone.total_size = i_size_read(inode);
+ dev->zone.total_size =
+ bdev_nr_bytes(I_BDEV(psblk_file->f_mapping->host));
ret = __register_pstore_device(dev);
if (ret)
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 2bcc9a6f1bfc..052f143e2e0e 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -10,6 +10,7 @@
#include <linux/namei.h>
#include <linux/slab.h>
#include <asm/current.h>
+#include <linux/blkdev.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/security.h>
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 65e7e56005b8..e2302342a67f 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -38,6 +38,7 @@
#include <linux/uaccess.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
+#include <linux/seq_file.h>
#include "internal.h"
struct ramfs_mount_opts {
diff --git a/fs/read_write.c b/fs/read_write.c
index af057c57bdc6..0074afa7ecb3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -368,10 +368,6 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
if (unlikely((ssize_t) count < 0))
return -EINVAL;
- /*
- * ranged mandatory locking does not apply to streams - it makes sense
- * only for files where position has a meaning.
- */
if (ppos) {
loff_t pos = *ppos;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 58481f8d63d5..076f9ab94306 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1199,9 +1199,7 @@ static int reiserfs_parse_options(struct super_block *s,
if (!strcmp(arg, "auto")) {
/* From JFS code, to auto-get the size. */
- *blocks =
- i_size_read(s->s_bdev->bd_inode) >> s->
- s_blocksize_bits;
+ *blocks = sb_bdev_nr_blocks(s);
} else {
*blocks = simple_strtoul(arg, &p, 0);
if (*p != '\0') {
@@ -1986,9 +1984,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
* smaller than the filesystem. If the check fails then abort and
* scream, because bad stuff will happen otherwise.
*/
- if (s->s_bdev && s->s_bdev->bd_inode
- && i_size_read(s->s_bdev->bd_inode) <
- sb_block_count(rs) * sb_blocksize(rs)) {
+ if (bdev_nr_bytes(s->s_bdev) < sb_block_count(rs) * sb_blocksize(rs)) {
SWARN(silent, s, "", "Filesystem cannot be "
"mounted because it is bigger than the device");
SWARN(silent, s, "", "You may need to run fsck "
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 60d6951915f4..bb44ff4c5cc6 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -16,6 +16,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
@@ -179,8 +180,8 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
/* Check the filesystem does not extend beyond the end of the
block device */
msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
- if (msblk->bytes_used < 0 || msblk->bytes_used >
- i_size_read(sb->s_bdev->bd_inode))
+ if (msblk->bytes_used < 0 ||
+ msblk->bytes_used > bdev_nr_bytes(sb->s_bdev))
goto failed_mount;
/* Check block size for sanity */
diff --git a/fs/sync.c b/fs/sync.c
index 1373a610dc78..3ce8e2137f31 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -3,6 +3,7 @@
* High-level sync()-related operations
*/
+#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/fs.h>
@@ -22,25 +23,6 @@
SYNC_FILE_RANGE_WAIT_AFTER)
/*
- * Do the filesystem syncing work. For simple filesystems
- * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
- * submit IO for these buffers via __sync_blockdev(). This also speeds up the
- * wait == 1 case since in that case write_inode() functions do
- * sync_dirty_buffer() and thus effectively write one block at a time.
- */
-static int __sync_filesystem(struct super_block *sb, int wait)
-{
- if (wait)
- sync_inodes_sb(sb);
- else
- writeback_inodes_sb(sb, WB_REASON_SYNC);
-
- if (sb->s_op->sync_fs)
- sb->s_op->sync_fs(sb, wait);
- return __sync_blockdev(sb->s_bdev, wait);
-}
-
-/*
* Write out and wait upon all dirty data associated with this
* superblock. Filesystem data as well as the underlying block
* device. Takes the superblock lock.
@@ -61,10 +43,25 @@ int sync_filesystem(struct super_block *sb)
if (sb_rdonly(sb))
return 0;
- ret = __sync_filesystem(sb, 0);
+ /*
+ * Do the filesystem syncing work. For simple filesystems
+ * writeback_inodes_sb(sb) just dirties buffers with inodes so we have
+ * to submit I/O for these buffers via sync_blockdev(). This also
+ * speeds up the wait == 1 case since in that case write_inode()
+ * methods call sync_dirty_buffer() and thus effectively write one block
+ * at a time.
+ */
+ writeback_inodes_sb(sb, WB_REASON_SYNC);
+ if (sb->s_op->sync_fs)
+ sb->s_op->sync_fs(sb, 0);
+ ret = sync_blockdev_nowait(sb->s_bdev);
if (ret < 0)
return ret;
- return __sync_filesystem(sb, 1);
+
+ sync_inodes_sb(sb);
+ if (sb->s_op->sync_fs)
+ sb->s_op->sync_fs(sb, 1);
+ return sync_blockdev(sb->s_bdev);
}
EXPORT_SYMBOL(sync_filesystem);
@@ -81,21 +78,6 @@ static void sync_fs_one_sb(struct super_block *sb, void *arg)
sb->s_op->sync_fs(sb, *(int *)arg);
}
-static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
-{
- filemap_fdatawrite(bdev->bd_inode->i_mapping);
-}
-
-static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
-{
- /*
- * We keep the error status of individual mapping so that
- * applications can catch the writeback error using fsync(2).
- * See filemap_fdatawait_keep_errors() for details.
- */
- filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
-}
-
/*
* Sync everything. We start by waking flusher threads so that most of
* writeback runs on all devices in parallel. Then we sync all inodes reliably
@@ -114,8 +96,8 @@ void ksys_sync(void)
iterate_supers(sync_inodes_one_sb, NULL);
iterate_supers(sync_fs_one_sb, &nowait);
iterate_supers(sync_fs_one_sb, &wait);
- iterate_bdevs(fdatawrite_one_bdev, NULL);
- iterate_bdevs(fdatawait_one_bdev, NULL);
+ sync_bdevs(false);
+ sync_bdevs(true);
if (unlikely(laptop_mode))
laptop_sync_completion();
}
@@ -136,10 +118,10 @@ static void do_sync_work(struct work_struct *work)
*/
iterate_supers(sync_inodes_one_sb, &nowait);
iterate_supers(sync_fs_one_sb, &nowait);
- iterate_bdevs(fdatawrite_one_bdev, NULL);
+ sync_bdevs(false);
iterate_supers(sync_inodes_one_sb, &nowait);
iterate_supers(sync_fs_one_sb, &nowait);
- iterate_bdevs(fdatawrite_one_bdev, NULL);
+ sync_bdevs(false);
printk("Emergency Sync complete\n");
kfree(work);
}
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 22be7aeb96c4..c57b46a352d8 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -82,5 +82,4 @@ const struct fscrypt_operations ubifs_crypt_operations = {
.get_context = ubifs_crypt_get_context,
.set_context = ubifs_crypt_set_context,
.empty_dir = ubifs_crypt_empty_dir,
- .max_namelen = UBIFS_MAX_NLEN,
};
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index f1094cdcd6cd..46d697172197 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -47,8 +47,7 @@ unsigned int udf_get_last_session(struct super_block *sb)
unsigned long udf_get_last_block(struct super_block *sb)
{
- struct block_device *bdev = sb->s_bdev;
- struct cdrom_device_info *cdi = disk_to_cdi(bdev->bd_disk);
+ struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk);
unsigned long lblock = 0;
/*
@@ -56,7 +55,7 @@ unsigned long udf_get_last_block(struct super_block *sb)
* Try using the device size...
*/
if (!cdi || cdrom_get_last_written(cdi, &lblock) || lblock == 0)
- lblock = i_size_read(bdev->bd_inode) >> sb->s_blocksize_bits;
+ lblock = sb_bdev_nr_blocks(sb);
if (lblock)
return lblock - 1;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index b2d7c57d0688..34247fba6df9 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1175,8 +1175,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
struct udf_inode_info *vati;
uint32_t pos;
struct virtualAllocationTable20 *vat20;
- sector_t blocks = i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits;
+ sector_t blocks = sb_bdev_nr_blocks(sb);
udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block);
if (!sbi->s_vat_inode &&
@@ -1838,8 +1837,7 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block,
int ret;
if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
- udf_fixed_to_variable(block) >=
- i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits)
+ udf_fixed_to_variable(block) >= sb_bdev_nr_blocks(sb))
return -EAGAIN;
bh = udf_read_tagged(sb, block, block, &ident);
@@ -1901,8 +1899,7 @@ static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock,
last[last_count++] = *lastblock - 152;
for (i = 0; i < last_count; i++) {
- if (last[i] >= i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits)
+ if (last[i] >= sb_bdev_nr_blocks(sb))
continue;
ret = udf_check_anchor_block(sb, last[i], fileset);
if (ret != -EAGAIN) {
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 003f0d31743e..22bf14ab2d16 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1827,9 +1827,15 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
if (mode_wp && mode_dontwake)
return -EINVAL;
- ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
- uffdio_wp.range.len, mode_wp,
- &ctx->mmap_changing);
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
+ uffdio_wp.range.len, mode_wp,
+ &ctx->mmap_changing);
+ mmput(ctx->mm);
+ } else {
+ return -ESRCH;
+ }
+
if (ret)
return ret;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7aa943edfc02..62e7fbe4e54c 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1452,7 +1452,7 @@ const struct file_operations xfs_file_operations = {
.write_iter = xfs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index ddc346a9df9b..3ce5f47338cb 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -1128,7 +1128,7 @@ static const struct file_operations zonefs_file_operations = {
.write_iter = zonefs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
};
static struct kmem_cache *zonefs_inode_cachep;