diff options
Diffstat (limited to 'fs')
148 files changed, 2847 insertions, 2845 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index eb2151fb6049..1769a44f4819 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -23,7 +23,7 @@ struct fscache_netfs v9fs_cache_netfs = { .version = 0, }; -/** +/* * v9fs_random_cachetag - Generate a random tag to be associated * with a new cache session. * @@ -233,7 +233,7 @@ static void v9fs_vfs_readpage_complete(struct page *page, void *data, unlock_page(page); } -/** +/* * __v9fs_readpage_from_fscache - read a page from cache * * Returns 0 if the pages are in cache and a BIO is submitted, @@ -268,7 +268,7 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) } } -/** +/* * __v9fs_readpages_from_fscache - read multiple pages from cache * * Returns 0 if the pages are in cache and a BIO is submitted, @@ -308,7 +308,7 @@ int __v9fs_readpages_from_fscache(struct inode *inode, } } -/** +/* * __v9fs_readpage_to_fscache - write a page to the cache * */ diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 9d9de62592be..b8863dd0de5c 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -19,18 +19,18 @@ #include "v9fs_vfs.h" #include "fid.h" +static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid) +{ + hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata); +} + + /** * v9fs_fid_add - add a fid to a dentry * @dentry: dentry that the fid is being added to * @fid: fid to add * */ - -static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid) -{ - hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata); -} - void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) { spin_lock(&dentry->d_lock); @@ -67,7 +67,7 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid) /** * v9fs_open_fid_add - add an open fid to an inode - * @dentry: inode that the fid is being added to + * @inode: inode that the fid is being added to * @fid: fid to add * */ diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index cdb99507ef33..2e0fa7c932db 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -155,6 +155,7 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root) /** * v9fs_parse_options - parse mount options into session structure * @v9ses: existing v9fs session information + * @opts: The mount option string * * Return 0 upon success, -ERRNO upon failure. */ @@ -542,12 +543,9 @@ extern int v9fs_error_init(void); static struct kobject *v9fs_kobj; #ifdef CONFIG_9P_FSCACHE -/** - * caches_show - list caches associated with a session - * - * Returns the size of buffer written. +/* + * List caches associated with a session */ - static ssize_t caches_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index cce9ace651a2..1c4f1b39cc95 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -30,8 +30,7 @@ /** * v9fs_fid_readpage - read an entire page in from 9P - * - * @fid: fid being read + * @data: Opaque pointer to the fid being read * @page: structure to page * */ @@ -116,6 +115,8 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping, /** * v9fs_release_page - release the private state associated with a page + * @page: The page to be released + * @gfp: The caller's allocation restrictions * * Returns 1 if the page can be released, false otherwise. */ @@ -129,9 +130,9 @@ static int v9fs_release_page(struct page *page, gfp_t gfp) /** * v9fs_invalidate_page - Invalidate a page completely or partially - * - * @page: structure to page - * @offset: offset in the page + * @page: The page to be invalidated + * @offset: offset of the invalidated region + * @length: length of the invalidated region */ static void v9fs_invalidate_page(struct page *page, unsigned int offset, @@ -199,6 +200,8 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) /** * v9fs_launder_page - Writeback a dirty page + * @page: The page to be cleaned up + * * Returns 0 on success. */ @@ -219,6 +222,7 @@ static int v9fs_launder_page(struct page *page) /** * v9fs_direct_IO - 9P address space operation for direct I/O * @iocb: target I/O control block + * @iter: The data/buffer to use * * The presence of v9fs_direct_IO() in the address space ops vector * allowes open() O_DIRECT flags which would have failed otherwise. diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index aab5e6538660..246235ebdb70 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -359,14 +359,11 @@ out_err: } /** - * v9fs_file_read - read from a file - * @filp: file pointer to read - * @udata: user data buffer to read data into - * @count: size of buffer - * @offset: offset at which to read data + * v9fs_file_read_iter - read from a file + * @iocb: The operation parameters + * @to: The buffer to read into * */ - static ssize_t v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { @@ -388,11 +385,9 @@ v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } /** - * v9fs_file_write - write to a file - * @filp: file pointer to write - * @data: data buffer to write data from - * @count: size of buffer - * @offset: offset at which to write data + * v9fs_file_write_iter - write to a file + * @iocb: The operation parameters + * @from: The data to write * */ static ssize_t @@ -561,11 +556,9 @@ out_unlock: } /** - * v9fs_mmap_file_read - read from a file - * @filp: file pointer to read - * @data: user data buffer to read data into - * @count: size of buffer - * @offset: offset at which to read data + * v9fs_mmap_file_read_iter - read from a file + * @iocb: The operation parameters + * @to: The buffer to read into * */ static ssize_t @@ -576,11 +569,9 @@ v9fs_mmap_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } /** - * v9fs_mmap_file_write - write to a file - * @filp: file pointer to write - * @data: data buffer to write data from - * @count: size of buffer - * @offset: offset at which to write data + * v9fs_mmap_file_write_iter - write to a file + * @iocb: The operation parameters + * @from: The data to write * */ static ssize_t diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 795706520b5e..08f48b70a741 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -218,7 +218,7 @@ v9fs_blank_wstat(struct p9_wstat *wstat) /** * v9fs_alloc_inode - helper function to allocate an inode - * + * @sb: The superblock to allocate the inode from */ struct inode *v9fs_alloc_inode(struct super_block *sb) { @@ -238,7 +238,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) /** * v9fs_free_inode - destroy an inode - * + * @inode: The inode to be freed */ void v9fs_free_inode(struct inode *inode) @@ -343,7 +343,7 @@ error: * v9fs_get_inode - helper function to setup an inode * @sb: superblock * @mode: mode to setup inode with - * + * @rdev: The device numbers to set */ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) @@ -369,7 +369,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) } /** - * v9fs_clear_inode - release an inode + * v9fs_evict_inode - Remove an inode from the inode cache * @inode: inode to release * */ @@ -665,14 +665,15 @@ error: /** * v9fs_vfs_create - VFS hook to create a regular file + * @mnt_userns: The user namespace of the mount + * @dir: The parent directory + * @dentry: The name of file to be created + * @mode: The UNIX file mode to set + * @excl: True if the file must not yet exist * * open(.., O_CREAT) is handled in v9fs_vfs_atomic_open(). This is only called * for mknod(2). * - * @dir: directory inode that is being created - * @dentry: dentry that is being deleted - * @mode: create permissions - * */ static int @@ -696,6 +697,7 @@ v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir, /** * v9fs_vfs_mkdir - VFS mkdir hook to create a directory + * @mnt_userns: The user namespace of the mount * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked * @mode: mode for new directory @@ -900,10 +902,12 @@ int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) /** * v9fs_vfs_rename - VFS hook to rename an inode + * @mnt_userns: The user namespace of the mount * @old_dir: old dir inode * @old_dentry: old dentry * @new_dir: new dir inode * @new_dentry: new dentry + * @flags: RENAME_* flags * */ @@ -1009,6 +1013,7 @@ done: /** * v9fs_vfs_getattr - retrieve file metadata + * @mnt_userns: The user namespace of the mount * @path: Object to query * @stat: metadata structure to populate * @request_mask: Mask of STATX_xxx flags indicating the caller's interests @@ -1050,6 +1055,7 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, /** * v9fs_vfs_setattr - set file metadata + * @mnt_userns: The user namespace of the mount * @dentry: file whose metadata to set * @iattr: metadata assignment structure * @@ -1285,6 +1291,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, /** * v9fs_vfs_symlink - helper function to create symlinks + * @mnt_userns: The user namespace of the mount * @dir: directory inode containing symlink * @dentry: dentry for symlink * @symname: symlink data @@ -1340,6 +1347,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, /** * v9fs_vfs_mknod - create a special file + * @mnt_userns: The user namespace of the mount * @dir: inode destination for new link * @dentry: dentry for file * @mode: mode for creation diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index e1c0240b51c0..01b9e1281a29 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -37,7 +37,10 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev); /** - * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a + * v9fs_get_fsgid_for_create - Helper function to get the gid for a new object + * @dir_inode: The directory inode + * + * Helper function to get the gid for creating a * new file system object. This checks the S_ISGID to determine the owning * group of the new file system object. */ @@ -211,12 +214,13 @@ int v9fs_open_to_dotl_flags(int flags) /** * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. + * @mnt_userns: The user namespace of the mount * @dir: directory inode that is being created * @dentry: dentry that is being deleted * @omode: create permissions + * @excl: True if the file must not yet exist * */ - static int v9fs_vfs_create_dotl(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t omode, bool excl) @@ -361,6 +365,7 @@ err_clunk_old_fid: /** * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory + * @mnt_userns: The user namespace of the mount * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked * @omode: mode for new directory @@ -537,6 +542,7 @@ static int v9fs_mapped_iattr_valid(int iattr_valid) /** * v9fs_vfs_setattr_dotl - set file metadata + * @mnt_userns: The user namespace of the mount * @dentry: file whose metadata to set * @iattr: metadata assignment structure * @@ -816,6 +822,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, /** * v9fs_vfs_mknod_dotl - create a special file + * @mnt_userns: The user namespace of the mount * @dir: inode destination for new link * @dentry: dentry for file * @omode: mode for creation diff --git a/fs/affs/super.c b/fs/affs/super.c index c6c2a513ec92..c609005a9eaa 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -389,7 +389,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) * blocks, we will have to change it. */ - size = i_size_read(sb->s_bdev->bd_inode) >> 9; + size = bdev_nr_sectors(sb->s_bdev); pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size); affs_set_blocksize(sb, PAGE_SIZE); diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index dae9a57d7ec0..45cfd50a9521 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -86,8 +86,8 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode return afs_do_sync_operation(op); } -/** - * afs_sillyrename - Perform a silly-rename of a dentry +/* + * Perform silly-rename of a dentry. * * AFS is stateless and the server doesn't know when the client is holding a * file open. To prevent application problems when a file is unlinked while diff --git a/fs/afs/write.c b/fs/afs/write.c index 2dfe3b3a53d6..8b1d9c2f6bec 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -861,7 +861,8 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) { - struct page *page = thp_head(vmf->page); + struct folio *folio = page_folio(vmf->page); + struct page *page = &folio->page; struct file *file = vmf->vma->vm_file; struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); @@ -884,7 +885,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) goto out; #endif - if (wait_on_page_writeback_killable(page)) + if (folio_wait_writeback_killable(folio)) goto out; if (lock_page_killable(page) < 0) @@ -894,8 +895,8 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) * details the portion of the page we need to write back and we might * need to redirty the page if there's a problem. */ - if (wait_on_page_writeback_killable(page) < 0) { - unlock_page(page); + if (folio_wait_writeback_killable(folio) < 0) { + folio_unlock(folio); goto out; } @@ -974,8 +975,7 @@ int afs_launder_page(struct page *page) iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len); trace_afs_page_dirty(vnode, tracepoint_string("launder"), page); - ret = afs_store_data(vnode, &iter, (loff_t)page->index * PAGE_SIZE, - true); + ret = afs_store_data(vnode, &iter, page_offset(page) + f, true); } trace_afs_page_dirty(vnode, tracepoint_string("laundered"), page); @@ -1417,7 +1417,7 @@ static void aio_remove_iocb(struct aio_kiocb *iocb) spin_unlock_irqrestore(&ctx->ctx_lock, flags); } -static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) +static void aio_complete_rw(struct kiocb *kiocb, long res) { struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw); @@ -1437,7 +1437,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) } iocb->ki_res.res = res; - iocb->ki_res.res2 = res2; + iocb->ki_res.res2 = 0; iocb_put(iocb); } @@ -1508,7 +1508,7 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret) ret = -EINTR; fallthrough; default: - req->ki_complete(req, ret, 0); + req->ki_complete(req, ret); } } diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c index 16b5fca0626e..54c1f8b8b075 100644 --- a/fs/autofs/waitq.c +++ b/fs/autofs/waitq.c @@ -358,7 +358,7 @@ int autofs_wait(struct autofs_sb_info *sbi, qstr.len = strlen(p); offset = p - name; } - qstr.hash = full_name_hash(dentry, name, qstr.len); + qstr.hash = full_name_hash(dentry, qstr.name, qstr.len); if (mutex_lock_interruptible(&sbi->wq_mutex)) { kfree(name); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 7869ad12bc6e..6c7eb80220ca 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -9,6 +9,7 @@ #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> +#include <linux/kthread.h> #include <linux/time.h> #include <linux/init.h> #include <linux/string.h> @@ -172,9 +173,10 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, /* Hash through the page sector by sector */ for (pg_offset = 0; pg_offset < bytes_left; pg_offset += sectorsize) { - kaddr = page_address(page); + kaddr = kmap_atomic(page); crypto_shash_digest(shash, kaddr + pg_offset, sectorsize, csum); + kunmap_atomic(kaddr); if (memcmp(&csum, cb_sum, csum_size) != 0) { btrfs_print_data_csum_error(inode, disk_start, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 84627cbd5b5b..66290b214f2b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/mm.h> +#include <linux/error-injection.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dff2c8a3e059..c0cebcf745ce 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3030,7 +3030,7 @@ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 objectid, const char *name, int name_len, + u64 index, const char *name, int name_len, int mod); struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index d029be40ea6f..fbb8b4457a72 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -283,8 +283,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } - if (i_size_read(bdev->bd_inode) < - btrfs_device_get_total_bytes(srcdev)) { + if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) { btrfs_err(fs_info, "target device is smaller than source device!"); ret = -EINVAL; diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index f1274d5c3805..7721ce0c0604 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -190,9 +190,20 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( } /* - * lookup a directory item based on name. 'dir' is the objectid - * we're searching in, and 'mod' tells us if you plan on deleting the - * item (use mod < 0) or changing the options (use mod > 0) + * Lookup for a directory item by name. + * + * @trans: The transaction handle to use. Can be NULL if @mod is 0. + * @root: The root of the target tree. + * @path: Path to use for the search. + * @dir: The inode number (objectid) of the directory. + * @name: The name associated to the directory entry we are looking for. + * @name_len: The length of the name. + * @mod: Used to indicate if the tree search is meant for a read only + * lookup, for a modification lookup or for a deletion lookup, so + * its value should be 0, 1 or -1, respectively. + * + * Returns: NULL if the dir item does not exists, an error pointer if an error + * happened, or a pointer to a dir item if a dir item exists for the given name. */ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -273,27 +284,42 @@ out: } /* - * lookup a directory item based on index. 'dir' is the objectid - * we're searching in, and 'mod' tells us if you plan on deleting the - * item (use mod < 0) or changing the options (use mod > 0) + * Lookup for a directory index item by name and index number. * - * The name is used to make sure the index really points to the name you were - * looking for. + * @trans: The transaction handle to use. Can be NULL if @mod is 0. + * @root: The root of the target tree. + * @path: Path to use for the search. + * @dir: The inode number (objectid) of the directory. + * @index: The index number. + * @name: The name associated to the directory entry we are looking for. + * @name_len: The length of the name. + * @mod: Used to indicate if the tree search is meant for a read only + * lookup, for a modification lookup or for a deletion lookup, so + * its value should be 0, 1 or -1, respectively. + * + * Returns: NULL if the dir index item does not exists, an error pointer if an + * error happened, or a pointer to a dir item if the dir index item exists and + * matches the criteria (name and index number). */ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 objectid, const char *name, int name_len, + u64 index, const char *name, int name_len, int mod) { + struct btrfs_dir_item *di; struct btrfs_key key; key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; - key.offset = objectid; + key.offset = index; - return btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + if (di == ERR_PTR(-ENOENT)) + return NULL; + + return di; } struct btrfs_dir_item * diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 355ea88d5c5f..29e7598584c4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3740,7 +3740,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, else if (ret) return ERR_PTR(ret); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) + if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fc3da7585fb7..0ab456cb4bf8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4859,6 +4859,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, out_free_delayed: btrfs_free_delayed_extent_op(extent_op); out_free_buf: + btrfs_tree_unlock(buf); free_extent_buffer(buf); out_free_reserved: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7ff577005d0f..a1762363f61f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -734,8 +734,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (args->start >= inode->disk_i_size && !args->replace_extent) modify_tree = 0; - update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || - root == fs_info->tree_root); + update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -2704,14 +2703,16 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, drop_args.bytes_found); if (ret != -ENOSPC) { /* - * When cloning we want to avoid transaction aborts when - * nothing was done and we are attempting to clone parts - * of inline extents, in such cases -EOPNOTSUPP is - * returned by __btrfs_drop_extents() without having - * changed anything in the file. + * The only time we don't want to abort is if we are + * attempting to clone a partial inline extent, in which + * case we'll get EOPNOTSUPP. However if we aren't + * clone we need to abort no matter what, because if we + * got EOPNOTSUPP via prealloc then we messed up and + * need to abort. */ - if (extent_info && !extent_info->is_new_extent && - ret && ret != -EOPNOTSUPP) + if (ret && + (ret != -EOPNOTSUPP || + (extent_info && extent_info->is_new_extent))) btrfs_abort_transaction(trans, ret); break; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 487533c35ddb..954b53a90f04 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6,6 +6,7 @@ #include <crypto/hash.h> #include <linux/kernel.h> #include <linux/bio.h> +#include <linux/blk-cgroup.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -287,8 +288,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_SIZE); - kaddr = page_address(cpage); + kaddr = kmap_atomic(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); + kunmap_atomic(kaddr); i++; ptr += cur_size; @@ -8247,7 +8249,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, return dip; } -static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, +static void btrfs_submit_direct(const struct iomap_iter *iter, struct bio *dio_bio, loff_t file_offset) { struct inode *inode = iter->inode; @@ -8277,7 +8279,7 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, } dio_bio->bi_status = BLK_STS_RESOURCE; bio_endio(dio_bio); - return BLK_QC_T_NONE; + return; } if (!write) { @@ -8371,15 +8373,13 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, free_extent_map(em); } while (submit_len > 0); - return BLK_QC_T_NONE; + return; out_err_em: free_extent_map(em); out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); - - return BLK_QC_T_NONE; } const struct iomap_ops btrfs_dio_iomap_ops = { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cc61813213d8..36ff713da1b1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1730,7 +1730,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, } if (!strcmp(sizestr, "max")) - new_size = device->bdev->bd_inode->i_size; + new_size = bdev_nr_bytes(device->bdev); else { if (sizestr[0] == '-') { mod = -1; @@ -1771,7 +1771,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = -EINVAL; goto out_finish; } - if (new_size > device->bdev->bd_inode->i_size) { + if (new_size > bdev_nr_bytes(device->bdev)) { ret = -EFBIG; goto out_finish; } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index c25dfd1a8a54..3dbe6eb5fda7 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -141,7 +141,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = 0; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = page_address(in_page); + data_in = kmap(in_page); /* * store the size of all chunks of compressed data in @@ -152,7 +152,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); + cpage_out = kmap(out_page); out_offset = LZO_LEN; tot_out = LZO_LEN; pages[0] = out_page; @@ -210,6 +210,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, if (out_len == 0 && tot_in >= len) break; + kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; @@ -221,7 +222,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); + cpage_out = kmap(out_page); pages[nr_pages++] = out_page; pg_bytes_left = PAGE_SIZE; @@ -243,11 +244,12 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, break; bytes_left = len - tot_in; + kunmap(in_page); put_page(in_page); start += PAGE_SIZE; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = page_address(in_page); + data_in = kmap(in_page); in_len = min(bytes_left, PAGE_SIZE); } @@ -257,17 +259,22 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } /* store the size of all chunks of compressed data */ - sizes_ptr = page_address(pages[0]); + sizes_ptr = kmap_local_page(pages[0]); write_compress_length(sizes_ptr, tot_out); + kunmap_local(sizes_ptr); ret = 0; *total_out = tot_out; *total_in = tot_in; out: *out_pages = nr_pages; + if (out_page) + kunmap(out_page); - if (in_page) + if (in_page) { + kunmap(in_page); put_page(in_page); + } return ret; } @@ -283,6 +290,7 @@ static void copy_compressed_segment(struct compressed_bio *cb, u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { + char *kaddr; struct page *cur_page; u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), orig_in + len - *cur_in); @@ -290,9 +298,11 @@ static void copy_compressed_segment(struct compressed_bio *cb, ASSERT(copy_len); cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE]; + kaddr = kmap(cur_page); memcpy(dest + *cur_in - orig_in, - page_address(cur_page) + offset_in_page(*cur_in), + kaddr + offset_in_page(*cur_in), copy_len); + kunmap(cur_page); *cur_in += copy_len; } @@ -303,6 +313,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) struct workspace *workspace = list_entry(ws, struct workspace, list); const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); const u32 sectorsize = fs_info->sectorsize; + char *kaddr; int ret; /* Compressed data length, can be unaligned */ u32 len_in; @@ -311,7 +322,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Bytes decompressed so far */ u32 cur_out = 0; - len_in = read_compress_length(page_address(cb->compressed_pages[0])); + kaddr = kmap(cb->compressed_pages[0]); + len_in = read_compress_length(kaddr); + kunmap(cb->compressed_pages[0]); cur_in += LZO_LEN; /* @@ -344,9 +357,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) ASSERT(cur_in / sectorsize == (cur_in + LZO_LEN - 1) / sectorsize); cur_page = cb->compressed_pages[cur_in / PAGE_SIZE]; + kaddr = kmap(cur_page); ASSERT(cur_page); - seg_len = read_compress_length(page_address(cur_page) + - offset_in_page(cur_in)); + seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); cur_in += LZO_LEN; /* Copy the compressed segment payload into workspace */ @@ -431,7 +444,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, destlen = min_t(unsigned long, destlen, PAGE_SIZE); bytes = min_t(unsigned long, destlen, out_len - start_byte); - kaddr = page_address(dest_page); + kaddr = kmap_local_page(dest_page); memcpy(kaddr, workspace->buf + start_byte, bytes); /* @@ -441,6 +454,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, */ if (bytes < destlen) memset(kaddr+bytes, 0, destlen-bytes); + kunmap_local(kaddr); out: return ret; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f7efc26aa82a..b415c5ec03ea 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -939,9 +939,11 @@ out: } /* - * helper function to see if a given name and sequence number found - * in an inode back reference are already in a directory and correctly - * point to this inode + * See if a given name and sequence number found in an inode back reference are + * already in a directory and correctly point to this inode. + * + * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it + * exists. */ static noinline int inode_in_dir(struct btrfs_root *root, struct btrfs_path *path, @@ -950,29 +952,34 @@ static noinline int inode_in_dir(struct btrfs_root *root, { struct btrfs_dir_item *di; struct btrfs_key location; - int match = 0; + int ret = 0; di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, index, name, name_len, 0); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } else if (di) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); if (location.objectid != objectid) goto out; - } else + } else { goto out; - btrfs_release_path(path); + } + btrfs_release_path(path); di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); - if (di && !IS_ERR(di)) { - btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); - if (location.objectid != objectid) - goto out; - } else + if (IS_ERR(di)) { + ret = PTR_ERR(di); goto out; - match = 1; + } else if (di) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid == objectid) + ret = 1; + } out: btrfs_release_path(path); - return match; + return ret; } /* @@ -1182,7 +1189,9 @@ next: /* look for a conflicting sequence number */ di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), ref_index, name, namelen, 0); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + return PTR_ERR(di); + } else if (di) { ret = drop_one_dir_item(trans, root, path, dir, di); if (ret) return ret; @@ -1192,7 +1201,9 @@ next: /* look for a conflicting name */ di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, namelen, 0); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + return PTR_ERR(di); + } else if (di) { ret = drop_one_dir_item(trans, root, path, dir, di); if (ret) return ret; @@ -1517,10 +1528,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - /* if we already have a perfect match, we're done */ - if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, - name, namelen)) { + ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), + btrfs_ino(BTRFS_I(inode)), ref_index, + name, namelen); + if (ret < 0) { + goto out; + } else if (ret == 0) { /* * look for a conflicting back reference in the * metadata. if we find one we have to unlink that name @@ -1580,6 +1593,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; } + /* Else, ret == 1, we already have a perfect match, we're done. */ ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; kfree(name); @@ -1936,8 +1950,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_key log_key; struct inode *dir; u8 log_type; - int exists; - int ret = 0; + bool exists; + int ret; bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); bool name_added = false; @@ -1957,12 +1971,12 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, name_len); btrfs_dir_item_key_to_cpu(eb, di, &log_key); - exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); - if (exists == 0) - exists = 1; - else - exists = 0; + ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); btrfs_release_path(path); + if (ret < 0) + goto out; + exists = (ret == 0); + ret = 0; if (key->type == BTRFS_DIR_ITEM_KEY) { dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, @@ -1977,7 +1991,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = -EINVAL; goto out; } - if (IS_ERR_OR_NULL(dst_di)) { + + if (IS_ERR(dst_di)) { + ret = PTR_ERR(dst_di); + goto out; + } else if (!dst_di) { /* we need a sequence number to insert, so we only * do inserts for the BTRFS_DIR_INDEX_KEY types */ @@ -2281,7 +2299,7 @@ again: dir_key->offset, name, name_len, 0); } - if (!log_di || log_di == ERR_PTR(-ENOENT)) { + if (!log_di) { btrfs_dir_item_key_to_cpu(eb, di, &location); btrfs_release_path(path); btrfs_release_path(log_path); @@ -3540,8 +3558,7 @@ out_unlock: if (err == -ENOSPC) { btrfs_set_log_full_commit(trans); err = 0; - } else if (err < 0 && err != -ENOENT) { - /* ENOENT can be returned if the entry hasn't been fsynced yet */ + } else if (err < 0) { btrfs_abort_transaction(trans, err); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2ec3b8ac8fa3..9533f358850e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -508,7 +508,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, } if (flush) - filemap_write_and_wait((*bdev)->bd_inode->i_mapping); + sync_blockdev(*bdev); ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); if (ret) { blkdev_put(*bdev, flags); @@ -1286,7 +1286,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev pgoff_t index; /* make sure our super fits in the device */ - if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) + if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); /* make sure our super fits in the page */ @@ -2610,8 +2610,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->io_width = fs_info->sectorsize; device->io_align = fs_info->sectorsize; device->sector_size = fs_info->sectorsize; - device->total_bytes = round_down(i_size_read(bdev->bd_inode), - fs_info->sectorsize); + device->total_bytes = + round_down(bdev_nr_bytes(bdev), fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); @@ -7236,7 +7236,7 @@ static int read_one_dev(struct extent_buffer *leaf, fill_device_from_item(leaf, dev_item, device); if (device->bdev) { - u64 max_total_bytes = i_size_read(device->bdev->bd_inode); + u64 max_total_bytes = bdev_nr_bytes(device->bdev); if (device->total_bytes > max_total_bytes) { btrfs_err(fs_info, diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 8afa90074891..767a0c6c9694 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -126,7 +126,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); + cpage_out = kmap(out_page); pages[0] = out_page; nr_pages = 1; @@ -148,22 +148,26 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, int i; for (i = 0; i < in_buf_pages; i++) { - if (in_page) + if (in_page) { + kunmap(in_page); put_page(in_page); + } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = page_address(in_page); + data_in = kmap(in_page); memcpy(workspace->buf + i * PAGE_SIZE, data_in, PAGE_SIZE); start += PAGE_SIZE; } workspace->strm.next_in = workspace->buf; } else { - if (in_page) + if (in_page) { + kunmap(in_page); put_page(in_page); + } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = page_address(in_page); + data_in = kmap(in_page); start += PAGE_SIZE; workspace->strm.next_in = data_in; } @@ -192,6 +196,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, * the stream end if required */ if (workspace->strm.avail_out == 0) { + kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; @@ -202,7 +207,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); + cpage_out = kmap(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -229,6 +234,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } else if (workspace->strm.avail_out == 0) { /* get another page for the stream end */ + kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; @@ -239,7 +245,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); + cpage_out = kmap(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -258,8 +264,13 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = workspace->strm.total_in; out: *out_pages = nr_pages; - if (in_page) + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); put_page(in_page); + } return ret; } @@ -276,7 +287,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) unsigned long buf_start; struct page **pages_in = cb->compressed_pages; - data_in = page_address(pages_in[page_in_index]); + data_in = kmap(pages_in[page_in_index]); workspace->strm.next_in = data_in; workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); workspace->strm.total_in = 0; @@ -298,6 +309,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { pr_warn("BTRFS: inflateInit failed\n"); + kunmap(pages_in[page_in_index]); return -EIO; } while (workspace->strm.total_in < srclen) { @@ -324,13 +336,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->strm.avail_in == 0) { unsigned long tmp; - + kunmap(pages_in[page_in_index]); page_in_index++; if (page_in_index >= total_pages_in) { data_in = NULL; break; } - data_in = page_address(pages_in[page_in_index]); + data_in = kmap(pages_in[page_in_index]); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; workspace->strm.avail_in = min(tmp, PAGE_SIZE); @@ -342,6 +354,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) ret = 0; done: zlib_inflateEnd(&workspace->strm); + if (data_in) + kunmap(pages_in[page_in_index]); if (!ret) zero_fill_bio(cb->orig_bio); return ret; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 56dce9f00988..f06b68040352 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -399,7 +399,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* map in the first page of input data */ in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = page_address(in_page); + workspace->in_buf.src = kmap(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); @@ -411,7 +411,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + workspace->out_buf.dst = kmap(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -446,6 +446,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; + kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; @@ -457,7 +458,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + workspace->out_buf.dst = kmap(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -472,12 +473,13 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += PAGE_SIZE; + kunmap(in_page); put_page(in_page); start += PAGE_SIZE; len -= PAGE_SIZE; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = page_address(in_page); + workspace->in_buf.src = kmap(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); } @@ -504,6 +506,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; + kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; @@ -515,7 +518,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + workspace->out_buf.dst = kmap(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } @@ -531,8 +534,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, out: *out_pages = nr_pages; /* Cleanup */ - if (in_page) + if (in_page) { + kunmap(in_page); put_page(in_page); + } + if (out_page) + kunmap(out_page); return ret; } @@ -556,7 +563,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } - workspace->in_buf.src = page_address(pages_in[page_in_index]); + workspace->in_buf.src = kmap(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); @@ -592,14 +599,14 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) break; if (workspace->in_buf.pos == workspace->in_buf.size) { - page_in_index++; + kunmap(pages_in[page_in_index++]); if (page_in_index >= total_pages_in) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; - workspace->in_buf.src = page_address(pages_in[page_in_index]); + workspace->in_buf.src = kmap(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } @@ -607,6 +614,8 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) ret = 0; zero_fill_bio(cb->orig_bio); done: + if (workspace->in_buf.src) + kunmap(pages_in[page_in_index]); return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index c615387aedca..46bc589b7a03 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -878,7 +878,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head) static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) { sector_t retval = ~((sector_t)0); - loff_t sz = i_size_read(bdev->bd_inode); + loff_t sz = bdev_nr_bytes(bdev); if (sz) { unsigned int sizebits = blksize_bits(size); @@ -897,7 +897,7 @@ init_page_buffers(struct page *page, struct block_device *bdev, struct buffer_head *head = page_buffers(page); struct buffer_head *bh = head; int uptodate = PageUptodate(page); - sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size); + sector_t end_block = blkdev_max_block(bdev, size); do { if (!buffer_mapped(bh)) { diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index fac2e8e7b533..effe37ef8629 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -37,11 +37,11 @@ static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki) /* * Handle completion of a read from the cache. */ -static void cachefiles_read_complete(struct kiocb *iocb, long ret, long ret2) +static void cachefiles_read_complete(struct kiocb *iocb, long ret) { struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb); - _enter("%ld,%ld", ret, ret2); + _enter("%ld", ret); if (ki->term_func) { if (ret >= 0) @@ -139,7 +139,7 @@ static int cachefiles_read(struct netfs_cache_resources *cres, fallthrough; default: ki->was_async = false; - cachefiles_read_complete(&ki->iocb, ret, 0); + cachefiles_read_complete(&ki->iocb, ret); if (ret > 0) ret = 0; break; @@ -159,12 +159,12 @@ presubmission_error: /* * Handle completion of a write to the cache. */ -static void cachefiles_write_complete(struct kiocb *iocb, long ret, long ret2) +static void cachefiles_write_complete(struct kiocb *iocb, long ret) { struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb); struct inode *inode = file_inode(ki->iocb.ki_filp); - _enter("%ld,%ld", ret, ret2); + _enter("%ld", ret); /* Tell lockdep we inherited freeze protection from submission thread */ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); @@ -244,7 +244,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres, fallthrough; default: ki->was_async = false; - cachefiles_write_complete(&ki->iocb, ret, 0); + cachefiles_write_complete(&ki->iocb, ret); if (ret > 0) ret = 0; break; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 8ffc40e84a59..fcf4f3b72923 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -25,20 +25,20 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, struct cachefiles_object *object; struct fscache_retrieval *op = monitor->op; struct wait_page_key *key = _key; - struct page *page = wait->private; + struct folio *folio = wait->private; ASSERT(key); _enter("{%lu},%u,%d,{%p,%u}", monitor->netfs_page->index, mode, sync, - key->page, key->bit_nr); + key->folio, key->bit_nr); - if (key->page != page || key->bit_nr != PG_locked) + if (key->folio != folio || key->bit_nr != PG_locked) return 0; - _debug("--- monitor %p %lx ---", page, page->flags); + _debug("--- monitor %p %lx ---", folio, folio->flags); - if (!PageUptodate(page) && !PageError(page)) { + if (!folio_test_uptodate(folio) && !folio_test_error(folio)) { /* unlocked, not uptodate and not erronous? */ _debug("page probably truncated"); } @@ -107,7 +107,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object, put_page(backpage2); INIT_LIST_HEAD(&monitor->op_link); - add_page_wait_queue(backpage, &monitor->monitor); + folio_add_wait_queue(page_folio(backpage), &monitor->monitor); if (trylock_page(backpage)) { ret = -EIO; @@ -294,7 +294,7 @@ monitor_backing_page: get_page(backpage); monitor->back_page = backpage; monitor->monitor.private = backpage; - add_page_wait_queue(backpage, &monitor->monitor); + folio_add_wait_queue(page_folio(backpage), &monitor->monitor); monitor = NULL; /* but the page may have been read before the monitor was installed, so @@ -548,7 +548,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, get_page(backpage); monitor->back_page = backpage; monitor->monitor.private = backpage; - add_page_wait_queue(backpage, &monitor->monitor); + folio_add_wait_queue(page_folio(backpage), &monitor->monitor); monitor = NULL; /* but the page may have been read before the monitor was diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 3e42d0466521..8f537f1d9d1d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2330,7 +2330,6 @@ retry: int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); u64 flush_tid; @@ -2365,14 +2364,9 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (err < 0) ret = err; - if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) { - spin_lock(&file->f_lock); - err = errseq_check_and_advance(&ci->i_meta_err, - &fi->meta_err); - spin_unlock(&file->f_lock); - if (err < 0) - ret = err; - } + err = file_check_and_advance_wb_err(file); + if (err < 0) + ret = err; out: dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d16fd2d5fd42..b129ea551378 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -233,7 +233,6 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, spin_lock_init(&fi->rw_contexts_lock); INIT_LIST_HEAD(&fi->rw_contexts); - fi->meta_err = errseq_sample(&ci->i_meta_err); fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); return 0; @@ -1023,7 +1022,7 @@ static void ceph_aio_complete(struct inode *inode, ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR : CEPH_CAP_FILE_RD)); - aio_req->iocb->ki_complete(aio_req->iocb, ret, 0); + aio_req->iocb->ki_complete(aio_req->iocb, ret); ceph_free_cap_flush(aio_req->prealloc_cf); kfree(aio_req); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2df1e1284451..1c7574105478 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -541,8 +541,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ceph_fscache_inode_init(ci); - ci->i_meta_err = 0; - return &ci->vfs_inode; } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index bdeb271f47d9..d8c31069fbf2 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -302,9 +302,6 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; - /* No mandatory locks */ - if (fl->fl_type & LOCK_MAND) - return -EOPNOTSUPP; dout("ceph_flock, fl_file: %p\n", fl->fl_file); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 7cad180d6deb..d64413adc0fd 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1493,7 +1493,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, { struct ceph_mds_request *req; struct rb_node *p; - struct ceph_inode_info *ci; dout("cleanup_session_requests mds%d\n", session->s_mds); mutex_lock(&mdsc->mutex); @@ -1502,16 +1501,10 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, struct ceph_mds_request, r_unsafe_item); pr_warn_ratelimited(" dropping unsafe request %llu\n", req->r_tid); - if (req->r_target_inode) { - /* dropping unsafe change of inode's attributes */ - ci = ceph_inode(req->r_target_inode); - errseq_set(&ci->i_meta_err, -EIO); - } - if (req->r_unsafe_dir) { - /* dropping unsafe directory operation */ - ci = ceph_inode(req->r_unsafe_dir); - errseq_set(&ci->i_meta_err, -EIO); - } + if (req->r_target_inode) + mapping_set_error(req->r_target_inode->i_mapping, -EIO); + if (req->r_unsafe_dir) + mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO); __unregister_request(mdsc, req); } /* zero r_attempts, so kick_requests() will re-send requests */ @@ -1678,7 +1671,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_unlock(&mdsc->cap_dirty_lock); if (dirty_dropped) { - errseq_set(&ci->i_meta_err, -EIO); + mapping_set_error(inode->i_mapping, -EIO); if (ci->i_wrbuffer_ref_head == 0 && ci->i_wr_ref == 0 && diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 9b1b7f4cfdd4..fd8742bae847 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1002,16 +1002,16 @@ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc) struct ceph_fs_client *new = fc->s_fs_info; struct ceph_mount_options *fsopt = new->mount_options; struct ceph_options *opt = new->client->options; - struct ceph_fs_client *other = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); dout("ceph_compare_super %p\n", sb); - if (compare_mount_options(fsopt, opt, other)) { + if (compare_mount_options(fsopt, opt, fsc)) { dout("monitor(s)/mount options don't match\n"); return 0; } if ((opt->flags & CEPH_OPT_FSID) && - ceph_fsid_compare(&opt->fsid, &other->client->fsid)) { + ceph_fsid_compare(&opt->fsid, &fsc->client->fsid)) { dout("fsid doesn't match\n"); return 0; } @@ -1019,6 +1019,17 @@ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc) dout("flags differ\n"); return 0; } + + if (fsc->blocklisted && !ceph_test_mount_opt(fsc, CLEANRECOVER)) { + dout("client is blocklisted (and CLEANRECOVER is not set)\n"); + return 0; + } + + if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { + dout("client has been forcibly unmounted\n"); + return 0; + } + return 1; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index a40eb14c282a..14f951cd5b61 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -429,8 +429,6 @@ struct ceph_inode_info { #ifdef CONFIG_CEPH_FSCACHE struct fscache_cookie *fscache; #endif - errseq_t i_meta_err; - struct inode vfs_inode; /* at end */ }; @@ -774,7 +772,6 @@ struct ceph_file_info { spinlock_t rw_contexts_lock; struct list_head rw_contexts; - errseq_t meta_err; u32 filp_gen; atomic_t num_locks; }; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 13f3182cf796..1b855fcb179e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3184,7 +3184,7 @@ restart_loop: mutex_unlock(&ctx->aio_mutex); if (ctx->iocb && ctx->iocb->ki_complete) - ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0); + ctx->iocb->ki_complete(ctx->iocb, ctx->rc); else complete(&ctx->done); } @@ -3917,7 +3917,7 @@ again: mutex_unlock(&ctx->aio_mutex); if (ctx->iocb && ctx->iocb->ki_complete) - ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0); + ctx->iocb->ki_complete(ctx->iocb, ctx->rc); else complete(&ctx->done); } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 2be65269a987..666aa380011e 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -209,7 +209,7 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, return read_buffers[i] + blk_offset; } - devsize = mapping->host->i_size >> PAGE_SHIFT; + devsize = bdev_nr_bytes(sb->s_bdev) >> PAGE_SHIFT; /* Ok, read in BLKS_PER_BUF pages completely first. */ for (i = 0; i < BLKS_PER_BUF; i++) { diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 68a2de6b5a9b..bfc2a5b74ed3 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -1,23 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 /* - * This contains encryption functions for per-file encryption. + * Utility functions for file contents encryption/decryption on + * block device-based filesystems. * * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility - * - * Written by Michael Halcrow, 2014. - * - * Filename encryption additions - * Uday Savagaonkar, 2014 - * Encryption policy handling additions - * Ildar Muslukhov, 2014 - * Add fscrypt_pullback_bio_page() - * Jaegeuk Kim, 2015. - * - * This has not yet undergone a rigorous security audit. - * - * The usage of AES-XTS should conform to recommendations in NIST - * Special Publication 800-38E and IEEE P1619/D16. */ #include <linux/pagemap.h> @@ -26,6 +13,21 @@ #include <linux/namei.h> #include "fscrypt_private.h" +/** + * fscrypt_decrypt_bio() - decrypt the contents of a bio + * @bio: the bio to decrypt + * + * Decrypt the contents of a "read" bio following successful completion of the + * underlying disk read. The bio must be reading a whole number of blocks of an + * encrypted file directly into the page cache. If the bio is reading the + * ciphertext into bounce pages instead of the page cache (for example, because + * the file is also compressed, so decompression is required after decryption), + * then this function isn't applicable. This function may sleep, so it must be + * called from a workqueue rather than from the bio's bi_end_io callback. + * + * This function sets PG_error on any pages that contain any blocks that failed + * to be decrypted. The filesystem must not mark such pages uptodate. + */ void fscrypt_decrypt_bio(struct bio *bio) { struct bio_vec *bv; diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index eb538c28df94..a9be4bc74a94 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -429,8 +429,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (fscrypt_has_encryption_key(dir)) { if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy, - iname->len, - dir->i_sb->s_cop->max_namelen, + iname->len, NAME_MAX, &fname->crypto_buf.len)) return -ENAMETOOLONG; fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 3fa965eb3336..5b0a9e6478b5 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -20,6 +20,11 @@ #define FSCRYPT_FILE_NONCE_SIZE 16 +/* + * Minimum size of an fscrypt master key. Note: a longer key will be required + * if ciphers with a 256-bit security strength are used. This is just the + * absolute minimum, which applies when only 128-bit encryption is used. + */ #define FSCRYPT_MIN_KEY_SIZE 16 #define FSCRYPT_CONTEXT_V1 1 @@ -413,7 +418,11 @@ struct fscrypt_master_key_secret { */ struct fscrypt_hkdf hkdf; - /* Size of the raw key in bytes. Set even if ->raw isn't set. */ + /* + * Size of the raw key in bytes. This remains set even if ->raw was + * zeroized due to no longer being needed. I.e. we still remember the + * size of the key even if we don't need to remember the key itself. + */ u32 size; /* For v1 policy keys: the raw key. Wiped for v2 policy keys. */ @@ -549,8 +558,9 @@ int __init fscrypt_init_keyring(void); struct fscrypt_mode { const char *friendly_name; const char *cipher_str; - int keysize; - int ivsize; + int keysize; /* key size in bytes */ + int security_strength; /* security strength in bytes */ + int ivsize; /* IV size in bytes */ int logged_impl_name; enum blk_crypto_mode_num blk_crypto_mode; }; diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index e0ec21055505..7607d18b35fc 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -16,9 +16,14 @@ /* * HKDF supports any unkeyed cryptographic hash algorithm, but fscrypt uses - * SHA-512 because it is reasonably secure and efficient; and since it produces - * a 64-byte digest, deriving an AES-256-XTS key preserves all 64 bytes of - * entropy from the master key and requires only one iteration of HKDF-Expand. + * SHA-512 because it is well-established, secure, and reasonably efficient. + * + * HKDF-SHA256 was also considered, as its 256-bit security strength would be + * sufficient here. A 512-bit security strength is "nice to have", though. + * Also, on 64-bit CPUs, SHA-512 is usually just as fast as SHA-256. In the + * common case of deriving an AES-256-XTS key (512 bits), that can result in + * HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of + * SHA-512 causes HKDF-Expand to only need to do one iteration rather than two. */ #define HKDF_HMAC_ALG "hmac(sha512)" #define HKDF_HASHLEN SHA512_DIGEST_SIZE diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index bca9c6658a7c..eede186b04ce 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -19,6 +19,7 @@ struct fscrypt_mode fscrypt_modes[] = { .friendly_name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, + .security_strength = 32, .ivsize = 16, .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS, }, @@ -26,12 +27,14 @@ struct fscrypt_mode fscrypt_modes[] = { .friendly_name = "AES-256-CTS-CBC", .cipher_str = "cts(cbc(aes))", .keysize = 32, + .security_strength = 32, .ivsize = 16, }, [FSCRYPT_MODE_AES_128_CBC] = { .friendly_name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, + .security_strength = 16, .ivsize = 16, .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV, }, @@ -39,12 +42,14 @@ struct fscrypt_mode fscrypt_modes[] = { .friendly_name = "AES-128-CTS-CBC", .cipher_str = "cts(cbc(aes))", .keysize = 16, + .security_strength = 16, .ivsize = 16, }, [FSCRYPT_MODE_ADIANTUM] = { .friendly_name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, + .security_strength = 32, .ivsize = 32, .blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM, }, @@ -117,8 +122,9 @@ err_free_tfm: /* * Prepare the crypto transform object or blk-crypto key in @prep_key, given the - * raw key, encryption mode, and flag indicating which encryption implementation - * (fs-layer or blk-crypto) will be used. + * raw key, encryption mode (@ci->ci_mode), flag indicating which encryption + * implementation (fs-layer or blk-crypto) will be used (@ci->ci_inlinecrypt), + * and IV generation method (@ci->ci_policy.flags). */ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_info *ci) @@ -358,6 +364,45 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, } /* + * Check whether the size of the given master key (@mk) is appropriate for the + * encryption settings which a particular file will use (@ci). + * + * If the file uses a v1 encryption policy, then the master key must be at least + * as long as the derived key, as this is a requirement of the v1 KDF. + * + * Otherwise, the KDF can accept any size key, so we enforce a slightly looser + * requirement: we require that the size of the master key be at least the + * maximum security strength of any algorithm whose key will be derived from it + * (but in practice we only need to consider @ci->ci_mode, since any other + * possible subkeys such as DIRHASH and INODE_HASH will never increase the + * required key size over @ci->ci_mode). This allows AES-256-XTS keys to be + * derived from a 256-bit master key, which is cryptographically sufficient, + * rather than requiring a 512-bit master key which is unnecessarily long. (We + * still allow 512-bit master keys if the user chooses to use them, though.) + */ +static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk, + const struct fscrypt_info *ci) +{ + unsigned int min_keysize; + + if (ci->ci_policy.version == FSCRYPT_POLICY_V1) + min_keysize = ci->ci_mode->keysize; + else + min_keysize = ci->ci_mode->security_strength; + + if (mk->mk_secret.size < min_keysize) { + fscrypt_warn(NULL, + "key with %s %*phN is too short (got %u bytes, need %u+ bytes)", + master_key_spec_type(&mk->mk_spec), + master_key_spec_len(&mk->mk_spec), + (u8 *)&mk->mk_spec.u, + mk->mk_secret.size, min_keysize); + return false; + } + return true; +} + +/* * Find the master key, then set up the inode's actual encryption key. * * If the master key is found in the filesystem-level keyring, then the @@ -422,18 +467,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, goto out_release_key; } - /* - * Require that the master key be at least as long as the derived key. - * Otherwise, the derived key cannot possibly contain as much entropy as - * that required by the encryption mode it will be used for. For v1 - * policies it's also required for the KDF to work at all. - */ - if (mk->mk_secret.size < ci->ci_mode->keysize) { - fscrypt_warn(NULL, - "key with %s %*phN is too short (got %u bytes, need %u+ bytes)", - master_key_spec_type(&mk_spec), - master_key_spec_len(&mk_spec), (u8 *)&mk_spec.u, - mk->mk_secret.size, ci->ci_mode->keysize); + if (!fscrypt_valid_master_key_size(mk, ci)) { err = -ENOKEY; goto out_release_key; } diff --git a/fs/direct-io.c b/fs/direct-io.c index b2e86e739d7a..654443558047 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -119,7 +119,6 @@ struct dio { int flags; /* doesn't change */ int op; int op_flags; - blk_qc_t bio_cookie; struct gendisk *bio_disk; struct inode *inode; loff_t i_size; /* i_size when submitted */ @@ -308,7 +307,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) if (ret > 0 && dio->op == REQ_OP_WRITE) ret = generic_write_sync(dio->iocb, ret); - dio->iocb->ki_complete(dio->iocb, ret, 0); + dio->iocb->ki_complete(dio->iocb, ret); } kmem_cache_free(dio_cache, dio); @@ -438,11 +437,10 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) dio->bio_disk = bio->bi_bdev->bd_disk; - if (sdio->submit_io) { + if (sdio->submit_io) sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio); - dio->bio_cookie = BLK_QC_T_NONE; - } else - dio->bio_cookie = submit_bio(bio); + else + submit_bio(bio); sdio->bio = NULL; sdio->boundary = 0; @@ -481,9 +479,7 @@ static struct bio *dio_await_one(struct dio *dio) __set_current_state(TASK_UNINTERRUPTIBLE); dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); - if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true)) - blk_io_schedule(); + blk_io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); dio->waiter = NULL; @@ -1214,8 +1210,6 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, } else { dio->op = REQ_OP_READ; } - if (iocb->ki_flags & IOCB_HIPRI) - dio->op_flags |= REQ_HIPRI; /* * For AIO O_(D)SYNC writes we need to defer completions to a workqueue diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ac0e11bbb445..9c5559faacda 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -915,7 +915,7 @@ const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, - .iopoll = iomap_dio_iopoll, + .iopoll = iocb_bio_iopoll, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 88d5d274a868..79b6a0c47f6f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1572,7 +1572,6 @@ static const struct fscrypt_operations ext4_cryptops = { .set_context = ext4_set_context, .get_dummy_policy = ext4_get_dummy_policy, .empty_dir = ext4_empty_dir, - .max_namelen = EXT4_NAME_LEN, .has_stable_inodes = ext4_has_stable_inodes, .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits, }; @@ -4474,7 +4473,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto cantfind_ext4; /* check blocks count against device size */ - blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + blocks_count = sb_bdev_nr_blocks(sb); if (blocks_count && ext4_blocks_count(es) > blocks_count) { ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " "exceeds size of device (%llu blocks)", diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index c1bf9ad4c220..20a083dc9042 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/f2fs_fs.h> +#include <linux/moduleparam.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/lzo.h> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 78ebc306ee2b..cf049a042482 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2976,7 +2976,6 @@ static const struct fscrypt_operations f2fs_cryptops = { .set_context = f2fs_set_context, .get_dummy_policy = f2fs_get_dummy_policy, .empty_dir = f2fs_empty_dir, - .max_namelen = F2FS_NAME_LEN, .has_stable_inodes = f2fs_has_stable_inodes, .get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits, .get_num_devices = f2fs_get_num_devices, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index de0c9b013a85..a6f1c6d426d1 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1536,14 +1536,11 @@ static int fat_read_static_bpb(struct super_block *sb, struct fat_bios_param_block *bpb) { static const char *notdos1x = "This doesn't look like a DOS 1.x volume"; - + sector_t bd_sects = bdev_nr_sectors(sb->s_bdev); struct fat_floppy_defaults *fdefaults = NULL; int error = -EINVAL; - sector_t bd_sects; unsigned i; - bd_sects = i_size_read(sb->s_bdev->bd_inode) / SECTOR_SIZE; - /* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */ if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) { if (!silent) @@ -1943,10 +1940,8 @@ int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2) ret = writeback_inode(i1); if (!ret && i2) ret = writeback_inode(i2); - if (!ret) { - struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; - ret = filemap_flush(mapping); - } + if (!ret) + ret = sync_blockdev_nowait(sb->s_bdev); return ret; } EXPORT_SYMBOL_GPL(fat_flush_inodes); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 81ec192ce067..4124a89a1a5d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1893,7 +1893,8 @@ static long writeback_sb_inodes(struct super_block *sb, * unplug, so get our IOs out the door before we * give up the CPU. */ - blk_flush_plug(current); + if (current->plug) + blk_flush_plug(current->plug, false); cond_resched(); } @@ -2291,7 +2292,7 @@ void wakeup_flusher_threads(enum wb_reason reason) * If we are expecting writeback progress we must submit plugged IO. */ if (blk_needs_flush_plug(current)) - blk_schedule_flush_plug(current); + blk_flush_plug(current->plug, true); rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) diff --git a/fs/fscache/object.c b/fs/fscache/object.c index f346a78f4bd6..6a675652129b 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -77,7 +77,6 @@ static WORK_STATE(INIT_OBJECT, "INIT", fscache_initialise_object); static WORK_STATE(PARENT_READY, "PRDY", fscache_parent_ready); static WORK_STATE(ABORT_INIT, "ABRT", fscache_abort_initialisation); static WORK_STATE(LOOK_UP_OBJECT, "LOOK", fscache_look_up_object); -static WORK_STATE(CREATE_OBJECT, "CRTO", fscache_look_up_object); static WORK_STATE(OBJECT_AVAILABLE, "AVBL", fscache_object_available); static WORK_STATE(JUMPSTART_DEPS, "JUMP", fscache_jumpstart_dependents); @@ -907,6 +906,7 @@ static void fscache_dequeue_object(struct fscache_object *object) * @object: The object to ask about * @data: The auxiliary data for the object * @datalen: The size of the auxiliary data + * @object_size: The size of the object according to the server. * * This function consults the netfs about the coherency state of an object. * The caller must be holding a ref on cookie->n_active (held by diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 433877107700..e002cdfaf3cc 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -22,7 +22,10 @@ static void fscache_operation_dummy_cancel(struct fscache_operation *op) /** * fscache_operation_init - Do basic initialisation of an operation + * @cookie: The cookie to operate on * @op: The operation to initialise + * @processor: The function to perform the operation + * @cancel: A function to handle operation cancellation * @release: The release function to assign * * Do basic initialisation of an operation. The caller must still set flags, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 11404f8c21c7..e6039f22311b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -687,7 +687,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) spin_unlock(&fi->lock); } - io->iocb->ki_complete(io->iocb, res, 0); + io->iocb->ki_complete(io->iocb, res); } kref_put(&io->refcnt, fuse_io_release); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 319596df5dc6..f55f9f94b1a4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1121,6 +1121,9 @@ int fuse_init_fs_context_submount(struct fs_context *fsc); */ void fuse_conn_destroy(struct fuse_mount *fm); +/* Drop the connection and free the fuse mount */ +void fuse_mount_destroy(struct fuse_mount *fm); + /** * Add connection to control filesystem */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 36cd03114b6d..12d49a1914e8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -457,14 +457,6 @@ static void fuse_send_destroy(struct fuse_mount *fm) } } -static void fuse_put_super(struct super_block *sb) -{ - struct fuse_mount *fm = get_fuse_mount_super(sb); - - fuse_conn_put(fm->fc); - kfree(fm); -} - static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) { stbuf->f_type = FUSE_SUPER_MAGIC; @@ -1003,7 +995,6 @@ static const struct super_operations fuse_super_operations = { .evict_inode = fuse_evict_inode, .write_inode = fuse_write_inode, .drop_inode = generic_delete_inode, - .put_super = fuse_put_super, .umount_begin = fuse_umount_begin, .statfs = fuse_statfs, .sync_fs = fuse_sync_fs, @@ -1424,20 +1415,17 @@ static int fuse_get_tree_submount(struct fs_context *fsc) if (!fm) return -ENOMEM; + fm->fc = fuse_conn_get(fc); fsc->s_fs_info = fm; sb = sget_fc(fsc, NULL, set_anon_super_fc); - if (IS_ERR(sb)) { - kfree(fm); + if (fsc->s_fs_info) + fuse_mount_destroy(fm); + if (IS_ERR(sb)) return PTR_ERR(sb); - } - fm->fc = fuse_conn_get(fc); /* Initialize superblock, making @mp_fi its root */ err = fuse_fill_super_submount(sb, mp_fi); if (err) { - fuse_conn_put(fc); - kfree(fm); - sb->s_fs_info = NULL; deactivate_locked_super(sb); return err; } @@ -1569,8 +1557,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; int err; - struct fuse_conn *fc; - struct fuse_mount *fm; if (!ctx->file || !ctx->rootmode_present || !ctx->user_id_present || !ctx->group_id_present) @@ -1580,42 +1566,18 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) * Require mount to happen from the same user namespace which * opened /dev/fuse to prevent potential attacks. */ - err = -EINVAL; if ((ctx->file->f_op != &fuse_dev_operations) || (ctx->file->f_cred->user_ns != sb->s_user_ns)) - goto err; + return -EINVAL; ctx->fudptr = &ctx->file->private_data; - fc = kmalloc(sizeof(*fc), GFP_KERNEL); - err = -ENOMEM; - if (!fc) - goto err; - - fm = kzalloc(sizeof(*fm), GFP_KERNEL); - if (!fm) { - kfree(fc); - goto err; - } - - fuse_conn_init(fc, fm, sb->s_user_ns, &fuse_dev_fiq_ops, NULL); - fc->release = fuse_free_conn; - - sb->s_fs_info = fm; - err = fuse_fill_super_common(sb, ctx); if (err) - goto err_put_conn; + return err; /* file->private_data shall be visible on all CPUs after this */ smp_mb(); fuse_send_init(get_fuse_mount_super(sb)); return 0; - - err_put_conn: - fuse_conn_put(fc); - kfree(fm); - sb->s_fs_info = NULL; - err: - return err; } /* @@ -1637,22 +1599,40 @@ static int fuse_get_tree(struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; struct fuse_dev *fud; + struct fuse_conn *fc; + struct fuse_mount *fm; struct super_block *sb; int err; + fc = kmalloc(sizeof(*fc), GFP_KERNEL); + if (!fc) + return -ENOMEM; + + fm = kzalloc(sizeof(*fm), GFP_KERNEL); + if (!fm) { + kfree(fc); + return -ENOMEM; + } + + fuse_conn_init(fc, fm, fsc->user_ns, &fuse_dev_fiq_ops, NULL); + fc->release = fuse_free_conn; + + fsc->s_fs_info = fm; + if (ctx->fd_present) ctx->file = fget(ctx->fd); if (IS_ENABLED(CONFIG_BLOCK) && ctx->is_bdev) { err = get_tree_bdev(fsc, fuse_fill_super); - goto out_fput; + goto out; } /* * While block dev mount can be initialized with a dummy device fd * (found by device name), normal fuse mounts can't */ + err = -EINVAL; if (!ctx->file) - return -EINVAL; + goto out; /* * Allow creating a fuse mount with an already initialized fuse @@ -1668,7 +1648,9 @@ static int fuse_get_tree(struct fs_context *fsc) } else { err = get_tree_nodev(fsc, fuse_fill_super); } -out_fput: +out: + if (fsc->s_fs_info) + fuse_mount_destroy(fm); if (ctx->file) fput(ctx->file); return err; @@ -1747,17 +1729,25 @@ static void fuse_sb_destroy(struct super_block *sb) struct fuse_mount *fm = get_fuse_mount_super(sb); bool last; - if (fm) { + if (sb->s_root) { last = fuse_mount_remove(fm); if (last) fuse_conn_destroy(fm); } } +void fuse_mount_destroy(struct fuse_mount *fm) +{ + fuse_conn_put(fm->fc); + kfree(fm); +} +EXPORT_SYMBOL(fuse_mount_destroy); + static void fuse_kill_sb_anon(struct super_block *sb) { fuse_sb_destroy(sb); kill_anon_super(sb); + fuse_mount_destroy(get_fuse_mount_super(sb)); } static struct file_system_type fuse_fs_type = { @@ -1775,6 +1765,7 @@ static void fuse_kill_sb_blk(struct super_block *sb) { fuse_sb_destroy(sb); kill_block_super(sb); + fuse_mount_destroy(get_fuse_mount_super(sb)); } static struct file_system_type fuseblk_fs_type = { diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 0ad89c6629d7..94fc874f5de7 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1394,12 +1394,13 @@ static void virtio_kill_sb(struct super_block *sb) bool last; /* If mount failed, we can still be called without any fc */ - if (fm) { + if (sb->s_root) { last = fuse_mount_remove(fm); if (last) virtio_fs_conn_destroy(fm); } kill_anon_super(sb); + fuse_mount_destroy(fm); } static int virtio_fs_test_super(struct super_block *sb, @@ -1455,19 +1456,14 @@ static int virtio_fs_get_tree(struct fs_context *fsc) fsc->s_fs_info = fm; sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc); - if (fsc->s_fs_info) { - fuse_conn_put(fc); - kfree(fm); - } + if (fsc->s_fs_info) + fuse_mount_destroy(fm); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { err = virtio_fs_fill_super(sb, fsc); if (err) { - fuse_conn_put(fc); - kfree(fm); - sb->s_fs_info = NULL; deactivate_locked_super(sb); return err; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c559827cb6f9..5436a688157a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1338,8 +1338,6 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) { if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; - if (fl->fl_type & LOCK_MAND) - return -EOPNOTSUPP; if (fl->fl_type == F_UNLCK) { do_unflock(file, fl); @@ -1353,7 +1351,7 @@ const struct file_operations gfs2_file_fops = { .llseek = gfs2_llseek, .read_iter = gfs2_file_read_iter, .write_iter = gfs2_file_write_iter, - .iopoll = iomap_dio_iopoll, + .iopoll = iocb_bio_iopoll, .unlocked_ioctl = gfs2_ioctl, .compat_ioctl = gfs2_compat_ioctl, .mmap = gfs2_mmap, @@ -1386,7 +1384,7 @@ const struct file_operations gfs2_file_fops_nolock = { .llseek = gfs2_llseek, .read_iter = gfs2_file_read_iter, .write_iter = gfs2_file_write_iter, - .iopoll = iomap_dio_iopoll, + .iopoll = iocb_bio_iopoll, .unlocked_ioctl = gfs2_ioctl, .compat_ioctl = gfs2_compat_ioctl, .mmap = gfs2_mmap, diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index cdf0edeeb278..5beb82652435 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -36,7 +36,7 @@ static int hfs_get_last_session(struct super_block *sb, /* default values */ *start = 0; - *size = i_size_read(sb->s_bdev->bd_inode) >> 9; + *size = bdev_nr_sectors(sb->s_bdev); if (HFS_SB(sb)->session >= 0) { struct cdrom_tocentry te; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 0350dc7821bf..51ae6f1eb4a5 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -131,7 +131,7 @@ static int hfsplus_get_last_session(struct super_block *sb, /* default values */ *start = 0; - *size = i_size_read(sb->s_bdev->bd_inode) >> 9; + *size = bdev_nr_sectors(sb->s_bdev); if (HFSPLUS_SB(sb)->session >= 0) { struct cdrom_tocentry te; diff --git a/fs/internal.h b/fs/internal.h index 3cd065c8a66b..cdd83d4899bb 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -23,22 +23,11 @@ struct pipe_inode_info; #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); -extern int __sync_blockdev(struct block_device *bdev, int wait); -void iterate_bdevs(void (*)(struct block_device *, void *), void *); void emergency_thaw_bdev(struct super_block *sb); #else static inline void bdev_cache_init(void) { } - -static inline int __sync_blockdev(struct block_device *bdev, int wait) -{ - return 0; -} -static inline void iterate_bdevs(void (*f)(struct block_device *, void *), - void *arg) -{ -} static inline int emergency_thaw_bdev(struct super_block *sb) { return 0; diff --git a/fs/io-wq.c b/fs/io-wq.c index 5bf8aa81715e..38b33ad9e8cf 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -140,6 +140,7 @@ static void io_wqe_dec_running(struct io_worker *worker); static bool io_acct_cancel_pending_work(struct io_wqe *wqe, struct io_wqe_acct *acct, struct io_cb_cancel_data *match); +static void create_worker_cb(struct callback_head *cb); static bool io_worker_get(struct io_worker *worker) { @@ -174,12 +175,46 @@ static void io_worker_ref_put(struct io_wq *wq) complete(&wq->worker_done); } +static void io_worker_cancel_cb(struct io_worker *worker) +{ + struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = wqe->wq; + + atomic_dec(&acct->nr_running); + raw_spin_lock(&worker->wqe->lock); + acct->nr_workers--; + raw_spin_unlock(&worker->wqe->lock); + io_worker_ref_put(wq); + clear_bit_unlock(0, &worker->create_state); + io_worker_release(worker); +} + +static bool io_task_worker_match(struct callback_head *cb, void *data) +{ + struct io_worker *worker; + + if (cb->func != create_worker_cb) + return false; + worker = container_of(cb, struct io_worker, create_work); + return worker == data; +} + static void io_worker_exit(struct io_worker *worker) { struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = wqe->wq; - if (refcount_dec_and_test(&worker->ref)) - complete(&worker->ref_done); + while (1) { + struct callback_head *cb = task_work_cancel_match(wq->task, + io_task_worker_match, worker); + + if (!cb) + break; + io_worker_cancel_cb(worker); + } + + io_worker_release(worker); wait_for_completion(&worker->ref_done); raw_spin_lock(&wqe->lock); @@ -253,7 +288,7 @@ static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) pr_warn_once("io-wq is not configured for unbound workers"); raw_spin_lock(&wqe->lock); - if (acct->nr_workers == acct->max_workers) { + if (acct->nr_workers >= acct->max_workers) { raw_spin_unlock(&wqe->lock); return true; } @@ -323,8 +358,10 @@ static bool io_queue_worker_create(struct io_worker *worker, init_task_work(&worker->create_work, func); worker->create_index = acct->index; - if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) + if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) { + clear_bit_unlock(0, &worker->create_state); return true; + } clear_bit_unlock(0, &worker->create_state); fail_release: io_worker_release(worker); @@ -716,11 +753,8 @@ static void io_workqueue_create(struct work_struct *work) struct io_worker *worker = container_of(work, struct io_worker, work); struct io_wqe_acct *acct = io_wqe_get_acct(worker); - if (!io_queue_worker_create(worker, acct, create_worker_cont)) { - clear_bit_unlock(0, &worker->create_state); - io_worker_release(worker); + if (!io_queue_worker_create(worker, acct, create_worker_cont)) kfree(worker); - } } static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) @@ -1150,17 +1184,9 @@ static void io_wq_exit_workers(struct io_wq *wq) while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) { struct io_worker *worker; - struct io_wqe_acct *acct; worker = container_of(cb, struct io_worker, create_work); - acct = io_wqe_get_acct(worker); - atomic_dec(&acct->nr_running); - raw_spin_lock(&worker->wqe->lock); - acct->nr_workers--; - raw_spin_unlock(&worker->wqe->lock); - io_worker_ref_put(wq); - clear_bit_unlock(0, &worker->create_state); - io_worker_release(worker); + io_worker_cancel_cb(worker); } rcu_read_lock(); @@ -1291,15 +1317,18 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) rcu_read_lock(); for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; struct io_wqe_acct *acct; + raw_spin_lock(&wqe->lock); for (i = 0; i < IO_WQ_ACCT_NR; i++) { - acct = &wq->wqes[node]->acct[i]; + acct = &wqe->acct[i]; prev = max_t(int, acct->max_workers, prev); if (new_count[i]) acct->max_workers = new_count[i]; new_count[i] = prev; } + raw_spin_unlock(&wqe->lock); } rcu_read_unlock(); return 0; diff --git a/fs/io-wq.h b/fs/io-wq.h index bf5c4c533760..41bf37674a49 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -29,6 +29,17 @@ struct io_wq_work_list { struct io_wq_work_node *last; }; +#define wq_list_for_each(pos, prv, head) \ + for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) + +#define wq_list_for_each_resume(pos, prv) \ + for (; pos; prv = pos, pos = (pos)->next) + +#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) +#define INIT_WQ_LIST(list) do { \ + (list)->first = NULL; \ +} while (0) + static inline void wq_list_add_after(struct io_wq_work_node *node, struct io_wq_work_node *pos, struct io_wq_work_list *list) @@ -54,6 +65,15 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, } } +static inline void wq_list_add_head(struct io_wq_work_node *node, + struct io_wq_work_list *list) +{ + node->next = list->first; + if (!node->next) + list->last = node; + WRITE_ONCE(list->first, node); +} + static inline void wq_list_cut(struct io_wq_work_list *list, struct io_wq_work_node *last, struct io_wq_work_node *prev) @@ -69,6 +89,31 @@ static inline void wq_list_cut(struct io_wq_work_list *list, last->next = NULL; } +static inline void __wq_list_splice(struct io_wq_work_list *list, + struct io_wq_work_node *to) +{ + list->last->next = to->next; + to->next = list->first; + INIT_WQ_LIST(list); +} + +static inline bool wq_list_splice(struct io_wq_work_list *list, + struct io_wq_work_node *to) +{ + if (!wq_list_empty(list)) { + __wq_list_splice(list, to); + return true; + } + return false; +} + +static inline void wq_stack_add_head(struct io_wq_work_node *node, + struct io_wq_work_node *stack) +{ + node->next = stack->next; + stack->next = node; +} + static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work_node *node, struct io_wq_work_node *prev) @@ -76,14 +121,14 @@ static inline void wq_list_del(struct io_wq_work_list *list, wq_list_cut(list, node, prev); } -#define wq_list_for_each(pos, prv, head) \ - for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) +static inline +struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) +{ + struct io_wq_work_node *node = stack->next; -#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) -#define INIT_WQ_LIST(list) do { \ - (list)->first = NULL; \ - (list)->last = NULL; \ -} while (0) + stack->next = node->next; + return node; +} struct io_wq_work { struct io_wq_work_node list; diff --git a/fs/io_uring.c b/fs/io_uring.c index 6b9e70208782..3a4af9799d9a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -103,11 +103,14 @@ #define IORING_MAX_REG_BUFFERS (1U << 14) -#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ - IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ - IOSQE_BUFFER_SELECT) +#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ + IOSQE_IO_HARDLINK | IOSQE_ASYNC) + +#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN) + #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ - REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS) + REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ + REQ_F_ASYNC_DATA) #define IO_TCTX_REFS_CACHE_NR (1U << 10) @@ -195,8 +198,10 @@ struct io_rings { }; enum io_uring_cmd_flags { - IO_URING_F_NONBLOCK = 1, - IO_URING_F_COMPLETE_DEFER = 2, + IO_URING_F_COMPLETE_DEFER = 1, + IO_URING_F_UNLOCKED = 2, + /* int's last bit, sign checks are usually faster than a bit test */ + IO_URING_F_NONBLOCK = INT_MIN, }; struct io_mapped_ubuf { @@ -305,26 +310,16 @@ struct io_submit_link { }; struct io_submit_state { - struct blk_plug plug; + /* inline/task_work completion list, under ->uring_lock */ + struct io_wq_work_node free_list; + /* batch completion logic */ + struct io_wq_work_list compl_reqs; struct io_submit_link link; - /* - * io_kiocb alloc cache - */ - void *reqs[IO_REQ_CACHE_SIZE]; - unsigned int free_reqs; - bool plug_started; - - /* - * Batch completion logic - */ - struct io_kiocb *compl_reqs[IO_COMPL_BATCH]; - unsigned int compl_nr; - /* inline/task_work completion list, under ->uring_lock */ - struct list_head free_list; - - unsigned int ios_left; + bool need_plug; + unsigned short submit_nr; + struct blk_plug plug; }; struct io_ring_ctx { @@ -368,6 +363,7 @@ struct io_ring_ctx { * uring_lock, and updated through io_uring_register(2) */ struct io_rsrc_node *rsrc_node; + int rsrc_cached_refs; struct io_file_table file_table; unsigned nr_user_files; unsigned nr_user_bufs; @@ -384,7 +380,7 @@ struct io_ring_ctx { } ____cacheline_aligned_in_smp; /* IRQ completion list, under ->completion_lock */ - struct list_head locked_free_list; + struct io_wq_work_list locked_free_list; unsigned int locked_free_nr; const struct cred *sq_creds; /* cred used for __io_sq_thread() */ @@ -399,7 +395,6 @@ struct io_ring_ctx { unsigned cached_cq_tail; unsigned cq_entries; struct eventfd_ctx *cq_ev_fd; - struct wait_queue_head poll_wait; struct wait_queue_head cq_wait; unsigned cq_extra; atomic_t cq_timeouts; @@ -417,7 +412,7 @@ struct io_ring_ctx { * For SQPOLL, only the single threaded io_sq_thread() will * manipulate the list, hence no extra locking is needed there. */ - struct list_head iopoll_list; + struct io_wq_work_list iopoll_list; struct hlist_head *cancel_hash; unsigned cancel_hash_bits; bool poll_multi_queue; @@ -456,6 +451,8 @@ struct io_ring_ctx { struct work_struct exit_work; struct list_head tctx_list; struct completion ref_comp; + u32 iowq_limits[2]; + bool iowq_limits_set; }; }; @@ -578,7 +575,6 @@ struct io_sr_msg { int msg_flags; int bgid; size_t len; - struct io_buffer *kbuf; }; struct io_open { @@ -690,11 +686,6 @@ struct io_hardlink { int flags; }; -struct io_completion { - struct file *file; - u32 cflags; -}; - struct io_async_connect { struct sockaddr_storage address; }; @@ -708,11 +699,15 @@ struct io_async_msghdr { struct sockaddr_storage addr; }; -struct io_async_rw { - struct iovec fast_iov[UIO_FASTIOV]; - const struct iovec *free_iovec; +struct io_rw_state { struct iov_iter iter; struct iov_iter_state iter_state; + struct iovec fast_iov[UIO_FASTIOV]; +}; + +struct io_async_rw { + struct io_rw_state s; + const struct iovec *free_iovec; size_t bytes_done; struct wait_page_queue wpq; }; @@ -739,9 +734,9 @@ enum { REQ_F_CREDS_BIT, REQ_F_REFCOUNT_BIT, REQ_F_ARM_LTIMEOUT_BIT, + REQ_F_ASYNC_DATA_BIT, /* keep async read/write and isreg together and in order */ - REQ_F_NOWAIT_READ_BIT, - REQ_F_NOWAIT_WRITE_BIT, + REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, /* not a real bit, just to check we're not overflowing the space */ @@ -782,10 +777,8 @@ enum { REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), - /* supports async reads */ - REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT), - /* supports async writes */ - REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT), + /* supports async reads/writes */ + REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), /* has creds assigned */ @@ -794,6 +787,8 @@ enum { REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), /* there is a linked timeout that has to be armed */ REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), + /* ->async_data allocated */ + REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), }; struct async_poll { @@ -850,39 +845,41 @@ struct io_kiocb { struct io_mkdir mkdir; struct io_symlink symlink; struct io_hardlink hardlink; - /* use only after cleaning per-op data, see io_clean_op() */ - struct io_completion compl; }; - /* opcode allocated if it needs to store data for async defer */ - void *async_data; u8 opcode; /* polled IO has completed */ u8 iopoll_completed; - u16 buf_index; + unsigned int flags; + + u64 user_data; u32 result; + u32 cflags; struct io_ring_ctx *ctx; - unsigned int flags; - atomic_t refs; struct task_struct *task; - u64 user_data; - struct io_kiocb *link; struct percpu_ref *fixed_rsrc_refs; + /* store used ubuf, so we can prevent reloading */ + struct io_mapped_ubuf *imu; - /* used with ctx->iopoll_list with reads/writes */ - struct list_head inflight_entry; + /* used by request caches, completion batching and iopoll */ + struct io_wq_work_node comp_list; + atomic_t refs; + struct io_kiocb *link; struct io_task_work io_task_work; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; + /* internal polling, see IORING_FEAT_FAST_POLL */ struct async_poll *apoll; + /* opcode allocated if it needs to store data for async defer */ + void *async_data; struct io_wq_work work; + /* custom credentials, valid IFF REQ_F_CREDS is set */ const struct cred *creds; - - /* store used ubuf, so we can prevent reloading */ - struct io_mapped_ubuf *imu; + /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ + struct io_buffer *kbuf; }; struct io_tctx_node { @@ -900,12 +897,12 @@ struct io_defer_entry { struct io_op_def { /* needs req->file assigned */ unsigned needs_file : 1; + /* should block plug */ + unsigned plug : 1; /* hash wq insertion if file is a regular file */ unsigned hash_reg_file : 1; /* unbound wq insertion if file is a non-regular file */ unsigned unbound_nonreg_file : 1; - /* opcode is not supported by this kernel */ - unsigned not_supported : 1; /* set if opcode supports polled "wait" */ unsigned pollin : 1; unsigned pollout : 1; @@ -913,8 +910,8 @@ struct io_op_def { unsigned buffer_select : 1; /* do prep async if is going to be punted */ unsigned needs_async_setup : 1; - /* should block plug */ - unsigned plug : 1; + /* opcode is not supported by this kernel */ + unsigned not_supported : 1; /* size of async data needed, if any */ unsigned short async_size; }; @@ -1078,7 +1075,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags); + s32 res, u32 cflags); static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req); static void io_dismantle_req(struct io_kiocb *req); @@ -1093,7 +1090,7 @@ static void __io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); static void io_req_task_queue(struct io_kiocb *req); -static void io_submit_flush_completions(struct io_ring_ctx *ctx); +static void __io_submit_flush_completions(struct io_ring_ctx *ctx); static int io_req_prep_async(struct io_kiocb *req); static int io_install_fixed_file(struct io_kiocb *req, struct file *file, @@ -1165,6 +1162,12 @@ static inline void req_ref_get(struct io_kiocb *req) atomic_inc(&req->refs); } +static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) +{ + if (!wq_list_empty(&ctx->submit_state.compl_reqs)) + __io_submit_flush_completions(ctx); +} + static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) { if (!(req->flags & REQ_F_REFCOUNT)) { @@ -1178,13 +1181,52 @@ static inline void io_req_set_refcount(struct io_kiocb *req) __io_req_set_refcount(req, 1); } -static inline void io_req_set_rsrc_node(struct io_kiocb *req) +#define IO_RSRC_REF_BATCH 100 + +static inline void io_req_put_rsrc_locked(struct io_kiocb *req, + struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) { - struct io_ring_ctx *ctx = req->ctx; + struct percpu_ref *ref = req->fixed_rsrc_refs; + + if (ref) { + if (ref == &ctx->rsrc_node->refs) + ctx->rsrc_cached_refs++; + else + percpu_ref_put(ref); + } +} + +static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx) +{ + if (req->fixed_rsrc_refs) + percpu_ref_put(req->fixed_rsrc_refs); +} + +static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + if (ctx->rsrc_cached_refs) { + percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs); + ctx->rsrc_cached_refs = 0; + } +} +static void io_rsrc_refs_refill(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; + percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); +} + +static inline void io_req_set_rsrc_node(struct io_kiocb *req, + struct io_ring_ctx *ctx) +{ if (!req->fixed_rsrc_refs) { req->fixed_rsrc_refs = &ctx->rsrc_node->refs; - percpu_ref_get(req->fixed_rsrc_refs); + ctx->rsrc_cached_refs--; + if (unlikely(ctx->rsrc_cached_refs < 0)) + io_rsrc_refs_refill(ctx); } } @@ -1217,6 +1259,11 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task, return false; } +static inline bool req_has_async_data(struct io_kiocb *req) +{ + return req->flags & REQ_F_ASYNC_DATA; +} + static inline void req_set_fail(struct io_kiocb *req) { req->flags |= REQ_F_FAIL; @@ -1228,7 +1275,7 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res) req->result = res; } -static void io_ring_ctx_ref_free(struct percpu_ref *ref) +static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); @@ -1240,7 +1287,7 @@ static inline bool io_is_timeout_noseq(struct io_kiocb *req) return !req->timeout.off; } -static void io_fallback_req_func(struct work_struct *work) +static __cold void io_fallback_req_func(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, fallback_work.work); @@ -1253,15 +1300,13 @@ static void io_fallback_req_func(struct work_struct *work) req->io_task_work.func(req, &locked); if (locked) { - if (ctx->submit_state.compl_nr) - io_submit_flush_completions(ctx); + io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); } percpu_ref_put(&ctx->refs); - } -static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) +static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; int hash_bits; @@ -1298,7 +1343,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ctx->flags = p->flags; init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); - init_waitqueue_head(&ctx->poll_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); @@ -1307,7 +1351,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->cq_wait); spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); - INIT_LIST_HEAD(&ctx->iopoll_list); + INIT_WQ_LIST(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); @@ -1316,9 +1360,10 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); init_llist_head(&ctx->rsrc_put_llist); INIT_LIST_HEAD(&ctx->tctx_list); - INIT_LIST_HEAD(&ctx->submit_state.free_list); - INIT_LIST_HEAD(&ctx->locked_free_list); + ctx->submit_state.free_list.next = NULL; + INIT_WQ_LIST(&ctx->locked_free_list); INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); + INIT_WQ_LIST(&ctx->submit_state.compl_reqs); return ctx; err: kfree(ctx->dummy_ubuf); @@ -1346,21 +1391,16 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } -#define FFS_ASYNC_READ 0x1UL -#define FFS_ASYNC_WRITE 0x2UL -#ifdef CONFIG_64BIT -#define FFS_ISREG 0x4UL -#else -#define FFS_ISREG 0x0UL -#endif -#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) +#define FFS_NOWAIT 0x1UL +#define FFS_ISREG 0x2UL +#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) static inline bool io_req_ffs_set(struct io_kiocb *req) { - return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE); + return req->flags & REQ_F_FIXED_FILE; } -static void io_req_track_inflight(struct io_kiocb *req) +static inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; @@ -1368,11 +1408,6 @@ static void io_req_track_inflight(struct io_kiocb *req) } } -static inline void io_unprep_linked_timeout(struct io_kiocb *req) -{ - req->flags &= ~REQ_F_LINK_TIMEOUT; -} - static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) { if (WARN_ON_ONCE(!req->link)) @@ -1443,15 +1478,19 @@ static void io_prep_async_link(struct io_kiocb *req) } } -static void io_queue_async_work(struct io_kiocb *req, bool *locked) +static inline void io_req_add_compl_list(struct io_kiocb *req) +{ + struct io_submit_state *state = &req->ctx->submit_state; + + wq_list_add_tail(&req->comp_list, &state->compl_reqs); +} + +static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; - /* must not take the lock, NULL it as a precaution */ - locked = NULL; - BUG_ON(!tctx); BUG_ON(!tctx->io_wq); @@ -1492,7 +1531,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status) } } -static void io_queue_deferred(struct io_ring_ctx *ctx) +static __cold void io_queue_deferred(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, @@ -1506,7 +1545,7 @@ static void io_queue_deferred(struct io_ring_ctx *ctx) } } -static void io_flush_timeouts(struct io_ring_ctx *ctx) +static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) __must_hold(&ctx->completion_lock) { u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); @@ -1539,7 +1578,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->timeout_lock); } -static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) +static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->off_timeout_used) io_flush_timeouts(ctx); @@ -1609,12 +1648,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) */ if (wq_has_sleeper(&ctx->cq_wait)) wake_up_all(&ctx->cq_wait); - if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) - wake_up(&ctx->sq_data->wait); if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); - if (waitqueue_active(&ctx->poll_wait)) - wake_up_interruptible(&ctx->poll_wait); } static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) @@ -1628,8 +1663,6 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) } if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); - if (waitqueue_active(&ctx->poll_wait)) - wake_up_interruptible(&ctx->poll_wait); } /* Returns true if there are no backlogged entries after the flush */ @@ -1725,7 +1758,7 @@ static inline void io_get_task_refs(int nr) } static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags) + s32 res, u32 cflags) { struct io_overflow_cqe *ocqe; @@ -1753,7 +1786,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, } static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags) + s32 res, u32 cflags) { struct io_uring_cqe *cqe; @@ -1776,13 +1809,13 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data /* not as hot to bloat with inlining */ static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags) + s32 res, u32 cflags) { return __io_cqring_fill_event(ctx, user_data, res, cflags); } -static void io_req_complete_post(struct io_kiocb *req, long res, - unsigned int cflags) +static void io_req_complete_post(struct io_kiocb *req, s32 res, + u32 cflags) { struct io_ring_ctx *ctx = req->ctx; @@ -1801,40 +1834,27 @@ static void io_req_complete_post(struct io_kiocb *req, long res, req->link = NULL; } } + io_req_put_rsrc(req, ctx); io_dismantle_req(req); io_put_task(req->task, 1); - list_add(&req->inflight_entry, &ctx->locked_free_list); + wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; - } else { - if (!percpu_ref_tryget(&ctx->refs)) - req = NULL; } io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - - if (req) { - io_cqring_ev_posted(ctx); - percpu_ref_put(&ctx->refs); - } -} - -static inline bool io_req_needs_clean(struct io_kiocb *req) -{ - return req->flags & IO_REQ_CLEAN_FLAGS; + io_cqring_ev_posted(ctx); } -static void io_req_complete_state(struct io_kiocb *req, long res, - unsigned int cflags) +static inline void io_req_complete_state(struct io_kiocb *req, s32 res, + u32 cflags) { - if (io_req_needs_clean(req)) - io_clean_op(req); req->result = res; - req->compl.cflags = cflags; + req->cflags = cflags; req->flags |= REQ_F_COMPLETE_INLINE; } static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, - long res, unsigned cflags) + s32 res, u32 cflags) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_state(req, res, cflags); @@ -1842,12 +1862,12 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, io_req_complete_post(req, res, cflags); } -static inline void io_req_complete(struct io_kiocb *req, long res) +static inline void io_req_complete(struct io_kiocb *req, s32 res) { __io_req_complete(req, 0, res, 0); } -static void io_req_complete_failed(struct io_kiocb *req, long res) +static void io_req_complete_failed(struct io_kiocb *req, s32 res) { req_set_fail(req); io_req_complete_post(req, res, 0); @@ -1881,7 +1901,7 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, struct io_submit_state *state) { spin_lock(&ctx->completion_lock); - list_splice_init(&ctx->locked_free_list, &state->free_list); + wq_list_splice(&ctx->locked_free_list, &state->free_list); ctx->locked_free_nr = 0; spin_unlock(&ctx->completion_lock); } @@ -1890,7 +1910,6 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; - int nr; /* * If we have more than a batch's worth of requests in our IRQ side @@ -1899,20 +1918,7 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) */ if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) io_flush_cached_locked_reqs(ctx, state); - - nr = state->free_reqs; - while (!list_empty(&state->free_list)) { - struct io_kiocb *req = list_first_entry(&state->free_list, - struct io_kiocb, inflight_entry); - - list_del(&req->inflight_entry); - state->reqs[nr++] = req; - if (nr == ARRAY_SIZE(state->reqs)) - break; - } - - state->free_reqs = nr; - return nr != 0; + return !!state->free_list.next; } /* @@ -1921,38 +1927,54 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) * Because of that, io_alloc_req() should be called only under ->uring_lock * and with extra caution to not get a request that is still worked on. */ -static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) +static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { struct io_submit_state *state = &ctx->submit_state; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + void *reqs[IO_REQ_ALLOC_BATCH]; + struct io_kiocb *req; int ret, i; - BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH); - - if (likely(state->free_reqs || io_flush_cached_reqs(ctx))) - goto got_req; + if (likely(state->free_list.next || io_flush_cached_reqs(ctx))) + return true; - ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH, - state->reqs); + ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); /* * Bulk alloc is all-or-nothing. If we fail to get a batch, * retry single alloc to be on the safe side. */ if (unlikely(ret <= 0)) { - state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); - if (!state->reqs[0]) - return NULL; + reqs[0] = kmem_cache_alloc(req_cachep, gfp); + if (!reqs[0]) + return false; ret = 1; } - for (i = 0; i < ret; i++) - io_preinit_req(state->reqs[i], ctx); - state->free_reqs = ret; -got_req: - state->free_reqs--; - return state->reqs[state->free_reqs]; + percpu_ref_get_many(&ctx->refs, ret); + for (i = 0; i < ret; i++) { + req = reqs[i]; + + io_preinit_req(req, ctx); + wq_stack_add_head(&req->comp_list, &state->free_list); + } + return true; +} + +static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) +{ + if (unlikely(!ctx->submit_state.free_list.next)) + return __io_alloc_req_refill(ctx); + return true; +} + +static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) +{ + struct io_wq_work_node *node; + + node = wq_stack_extract(&ctx->submit_state.free_list); + return container_of(node, struct io_kiocb, comp_list); } static inline void io_put_file(struct file *file) @@ -1961,35 +1983,28 @@ static inline void io_put_file(struct file *file) fput(file); } -static void io_dismantle_req(struct io_kiocb *req) +static inline void io_dismantle_req(struct io_kiocb *req) { unsigned int flags = req->flags; - if (io_req_needs_clean(req)) + if (unlikely(flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); if (!(flags & REQ_F_FIXED_FILE)) io_put_file(req->file); - if (req->fixed_rsrc_refs) - percpu_ref_put(req->fixed_rsrc_refs); - if (req->async_data) { - kfree(req->async_data); - req->async_data = NULL; - } } -static void __io_free_req(struct io_kiocb *req) +static __cold void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + io_req_put_rsrc(req, ctx); io_dismantle_req(req); io_put_task(req->task, 1); spin_lock(&ctx->completion_lock); - list_add(&req->inflight_entry, &ctx->locked_free_list); + wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; spin_unlock(&ctx->completion_lock); - - percpu_ref_put(&ctx->refs); } static inline void io_remove_next_linked(struct io_kiocb *req) @@ -2075,47 +2090,45 @@ static bool io_disarm_next(struct io_kiocb *req) return posted; } -static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) +static void __io_req_find_next_prep(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + bool posted; + + spin_lock(&ctx->completion_lock); + posted = io_disarm_next(req); + if (posted) + io_commit_cqring(req->ctx); + spin_unlock(&ctx->completion_lock); + if (posted) + io_cqring_ev_posted(ctx); +} + +static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) { struct io_kiocb *nxt; + if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) + return NULL; /* * If LINK is set, we have dependent requests in this chain. If we * didn't fail this request, queue the first one up, moving any other * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (req->flags & IO_DISARM_MASK) { - struct io_ring_ctx *ctx = req->ctx; - bool posted; - - spin_lock(&ctx->completion_lock); - posted = io_disarm_next(req); - if (posted) - io_commit_cqring(req->ctx); - spin_unlock(&ctx->completion_lock); - if (posted) - io_cqring_ev_posted(ctx); - } + if (unlikely(req->flags & IO_DISARM_MASK)) + __io_req_find_next_prep(req); nxt = req->link; req->link = NULL; return nxt; } -static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) -{ - if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) - return NULL; - return __io_req_find_next(req); -} - static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) { if (!ctx) return; if (*locked) { - if (ctx->submit_state.compl_nr) - io_submit_flush_completions(ctx); + io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); *locked = false; } @@ -2132,7 +2145,7 @@ static void tctx_task_work(struct callback_head *cb) while (1) { struct io_wq_work_node *node; - if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr) + if (!tctx->task_list.first && locked) io_submit_flush_completions(ctx); spin_lock_irq(&tctx->task_lock); @@ -2195,8 +2208,9 @@ static void io_req_task_work_add(struct io_kiocb *req) * will do the job. */ notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; - if (!task_work_add(tsk, &tctx->task_work, notify)) { - wake_up_process(tsk); + if (likely(!task_work_add(tsk, &tctx->task_work, notify))) { + if (notify == TWA_NONE) + wake_up_process(tsk); return; } @@ -2274,77 +2288,62 @@ static void io_free_req_work(struct io_kiocb *req, bool *locked) io_free_req(req); } -struct req_batch { - struct task_struct *task; - int task_refs; - int ctx_refs; -}; - -static inline void io_init_req_batch(struct req_batch *rb) +static void io_free_batch_list(struct io_ring_ctx *ctx, + struct io_wq_work_node *node) + __must_hold(&ctx->uring_lock) { - rb->task_refs = 0; - rb->ctx_refs = 0; - rb->task = NULL; -} + struct task_struct *task = NULL; + int task_refs = 0; -static void io_req_free_batch_finish(struct io_ring_ctx *ctx, - struct req_batch *rb) -{ - if (rb->ctx_refs) - percpu_ref_put_many(&ctx->refs, rb->ctx_refs); - if (rb->task) - io_put_task(rb->task, rb->task_refs); -} + do { + struct io_kiocb *req = container_of(node, struct io_kiocb, + comp_list); -static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, - struct io_submit_state *state) -{ - io_queue_next(req); - io_dismantle_req(req); + if (unlikely(req->flags & REQ_F_REFCOUNT)) { + node = req->comp_list.next; + if (!req_ref_put_and_test(req)) + continue; + } - if (req->task != rb->task) { - if (rb->task) - io_put_task(rb->task, rb->task_refs); - rb->task = req->task; - rb->task_refs = 0; - } - rb->task_refs++; - rb->ctx_refs++; + io_req_put_rsrc_locked(req, ctx); + io_queue_next(req); + io_dismantle_req(req); - if (state->free_reqs != ARRAY_SIZE(state->reqs)) - state->reqs[state->free_reqs++] = req; - else - list_add(&req->inflight_entry, &state->free_list); + if (req->task != task) { + if (task) + io_put_task(task, task_refs); + task = req->task; + task_refs = 0; + } + task_refs++; + node = req->comp_list.next; + wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); + } while (node); + + if (task) + io_put_task(task, task_refs); } -static void io_submit_flush_completions(struct io_ring_ctx *ctx) +static void __io_submit_flush_completions(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { + struct io_wq_work_node *node, *prev; struct io_submit_state *state = &ctx->submit_state; - int i, nr = state->compl_nr; - struct req_batch rb; spin_lock(&ctx->completion_lock); - for (i = 0; i < nr; i++) { - struct io_kiocb *req = state->compl_reqs[i]; + wq_list_for_each(node, prev, &state->compl_reqs) { + struct io_kiocb *req = container_of(node, struct io_kiocb, + comp_list); __io_cqring_fill_event(ctx, req->user_data, req->result, - req->compl.cflags); + req->cflags); } io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); - io_init_req_batch(&rb); - for (i = 0; i < nr; i++) { - struct io_kiocb *req = state->compl_reqs[i]; - - if (req_ref_put_and_test(req)) - io_req_free_batch(&rb, req, &ctx->submit_state); - } - - io_req_free_batch_finish(ctx, &rb); - state->compl_nr = 0; + io_free_batch_list(ctx, state->compl_reqs.first); + INIT_WQ_LIST(&state->compl_reqs); } /* @@ -2404,12 +2403,9 @@ static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf) static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) { - struct io_buffer *kbuf; - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) return 0; - kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; - return io_put_kbuf(req, kbuf); + return io_put_kbuf(req, req->kbuf); } static inline bool io_run_task_work(void) @@ -2423,50 +2419,22 @@ static inline bool io_run_task_work(void) return false; } -/* - * Find and free completed poll iocbs - */ -static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, - struct list_head *done) +static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) { - struct req_batch rb; - struct io_kiocb *req; - - /* order with ->result store in io_complete_rw_iopoll() */ - smp_rmb(); - - io_init_req_batch(&rb); - while (!list_empty(done)) { - req = list_first_entry(done, struct io_kiocb, inflight_entry); - list_del(&req->inflight_entry); - - __io_cqring_fill_event(ctx, req->user_data, req->result, - io_put_rw_kbuf(req)); - (*nr_events)++; - - if (req_ref_put_and_test(req)) - io_req_free_batch(&rb, req, &ctx->submit_state); - } - - io_commit_cqring(ctx); - io_cqring_ev_posted_iopoll(ctx); - io_req_free_batch_finish(ctx, &rb); -} - -static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, - long min) -{ - struct io_kiocb *req, *tmp; - LIST_HEAD(done); - bool spin; + struct io_wq_work_node *pos, *start, *prev; + unsigned int poll_flags = BLK_POLL_NOSLEEP; + DEFINE_IO_COMP_BATCH(iob); + int nr_events = 0; /* * Only spin for completions if we don't have multiple devices hanging - * off our complete list, and we're under the requested amount. + * off our complete list. */ - spin = !ctx->poll_multi_queue && *nr_events < min; + if (ctx->poll_multi_queue || force_nonspin) + poll_flags |= BLK_POLL_ONESHOT; - list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { + wq_list_for_each(pos, start, &ctx->iopoll_list) { + struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); struct kiocb *kiocb = &req->rw.kiocb; int ret; @@ -2475,47 +2443,62 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, * If we find a request that requires polling, break out * and complete those lists first, if we have entries there. */ - if (READ_ONCE(req->iopoll_completed)) { - list_move_tail(&req->inflight_entry, &done); - continue; - } - if (!list_empty(&done)) + if (READ_ONCE(req->iopoll_completed)) break; - ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); + ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags); if (unlikely(ret < 0)) return ret; else if (ret) - spin = false; + poll_flags |= BLK_POLL_ONESHOT; /* iopoll may have completed current req */ - if (READ_ONCE(req->iopoll_completed)) - list_move_tail(&req->inflight_entry, &done); + if (!rq_list_empty(iob.req_list) || + READ_ONCE(req->iopoll_completed)) + break; } - if (!list_empty(&done)) - io_iopoll_complete(ctx, nr_events, &done); + if (!rq_list_empty(iob.req_list)) + iob.complete(&iob); + else if (!pos) + return 0; + + prev = start; + wq_list_for_each_resume(pos, prev) { + struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); - return 0; + /* order with io_complete_rw_iopoll(), e.g. ->result updates */ + if (!smp_load_acquire(&req->iopoll_completed)) + break; + __io_cqring_fill_event(ctx, req->user_data, req->result, + io_put_rw_kbuf(req)); + nr_events++; + } + + if (unlikely(!nr_events)) + return 0; + + io_commit_cqring(ctx); + io_cqring_ev_posted_iopoll(ctx); + pos = start ? start->next : ctx->iopoll_list.first; + wq_list_cut(&ctx->iopoll_list, prev, start); + io_free_batch_list(ctx, pos); + return nr_events; } /* * We can't just wait for polled events to come to us, we have to actively * find and complete them. */ -static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) +static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_IOPOLL)) return; mutex_lock(&ctx->uring_lock); - while (!list_empty(&ctx->iopoll_list)) { - unsigned int nr_events = 0; - - io_do_iopoll(ctx, &nr_events, 0); - + while (!wq_list_empty(&ctx->iopoll_list)) { /* let it sleep and repeat later if can't complete a request */ - if (nr_events == 0) + if (io_do_iopoll(ctx, true) == 0) break; /* * Ensure we allow local-to-the-cpu processing to take place, @@ -2562,7 +2545,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) * forever, while the workqueue is stuck trying to acquire the * very same mutex. */ - if (list_empty(&ctx->iopoll_list)) { + if (wq_list_empty(&ctx->iopoll_list)) { u32 tail = ctx->cached_cq_tail; mutex_unlock(&ctx->uring_lock); @@ -2571,11 +2554,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) /* some requests don't go through iopoll_list */ if (tail != ctx->cached_cq_tail || - list_empty(&ctx->iopoll_list)) + wq_list_empty(&ctx->iopoll_list)) break; } - ret = io_do_iopoll(ctx, &nr_events, min); - } while (!ret && nr_events < min && !need_resched()); + ret = io_do_iopoll(ctx, !min); + if (ret < 0) + break; + nr_events += ret; + ret = 0; + } while (nr_events < min && !need_resched()); out: mutex_unlock(&ctx->uring_lock); return ret; @@ -2600,9 +2587,9 @@ static bool io_resubmit_prep(struct io_kiocb *req) { struct io_async_rw *rw = req->async_data; - if (!rw) + if (!req_has_async_data(req)) return !io_req_prep_async(req); - iov_iter_restore(&rw->iter, &rw->iter_state); + iov_iter_restore(&rw->s.iter, &rw->s.iter_state); return true; } @@ -2646,7 +2633,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) { if (req->rw.kiocb.ki_flags & IOCB_WRITE) kiocb_end_write(req); - if (res != req->result) { + if (unlikely(res != req->result)) { if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE; @@ -2661,16 +2648,11 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) static void io_req_task_complete(struct io_kiocb *req, bool *locked) { unsigned int cflags = io_put_rw_kbuf(req); - long res = req->result; + int res = req->result; if (*locked) { - struct io_ring_ctx *ctx = req->ctx; - struct io_submit_state *state = &ctx->submit_state; - io_req_complete_state(req, res, cflags); - state->compl_reqs[state->compl_nr++] = req; - if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) - io_submit_flush_completions(ctx); + io_req_add_compl_list(req); } else { io_req_complete_post(req, res, cflags); } @@ -2684,7 +2666,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2, __io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req)); } -static void io_complete_rw(struct kiocb *kiocb, long res, long res2) +static void io_complete_rw(struct kiocb *kiocb, long res) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); @@ -2695,7 +2677,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2) io_req_task_work_add(req); } -static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) +static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); @@ -2706,12 +2688,11 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) req->flags |= REQ_F_REISSUE; return; } + req->result = res; } - WRITE_ONCE(req->result, res); - /* order with io_iopoll_complete() checking ->result */ - smp_wmb(); - WRITE_ONCE(req->iopoll_completed, 1); + /* order with io_iopoll_complete() checking ->iopoll_completed */ + smp_store_release(&req->iopoll_completed, 1); } /* @@ -2720,13 +2701,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) * find it from a io_do_iopoll() thread before the issuer is done * accessing the kiocb cookie. */ -static void io_iopoll_req_issued(struct io_kiocb *req) +static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - const bool in_async = io_wq_current_is_worker(); + const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; /* workqueue context doesn't hold uring_lock, grab it now */ - if (unlikely(in_async)) + if (unlikely(needs_lock)) mutex_lock(&ctx->uring_lock); /* @@ -2734,23 +2715,15 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * how we do polling eventually, not spinning if we're on potentially * different devices. */ - if (list_empty(&ctx->iopoll_list)) { + if (wq_list_empty(&ctx->iopoll_list)) { ctx->poll_multi_queue = false; } else if (!ctx->poll_multi_queue) { struct io_kiocb *list_req; - unsigned int queue_num0, queue_num1; - - list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, - inflight_entry); - if (list_req->file != req->file) { + list_req = container_of(ctx->iopoll_list.first, struct io_kiocb, + comp_list); + if (list_req->file != req->file) ctx->poll_multi_queue = true; - } else { - queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie); - queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie); - if (queue_num0 != queue_num1) - ctx->poll_multi_queue = true; - } } /* @@ -2758,11 +2731,11 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * it to the front so we find it first. */ if (READ_ONCE(req->iopoll_completed)) - list_add(&req->inflight_entry, &ctx->iopoll_list); + wq_list_add_head(&req->comp_list, &ctx->iopoll_list); else - list_add_tail(&req->inflight_entry, &ctx->iopoll_list); + wq_list_add_tail(&req->comp_list, &ctx->iopoll_list); - if (unlikely(in_async)) { + if (unlikely(needs_lock)) { /* * If IORING_SETUP_SQPOLL is enabled, sqes are either handle * in sq thread task context or in io worker task context. If @@ -2787,10 +2760,8 @@ static bool io_bdev_nowait(struct block_device *bdev) * any file. For now, just ensure that anything potentially problematic is done * inline. */ -static bool __io_file_supports_nowait(struct file *file, int rw) +static bool __io_file_supports_nowait(struct file *file, umode_t mode) { - umode_t mode = file_inode(file)->i_mode; - if (S_ISBLK(mode)) { if (IS_ENABLED(CONFIG_BLOCK) && io_bdev_nowait(I_BDEV(file->f_mapping->host))) @@ -2810,28 +2781,32 @@ static bool __io_file_supports_nowait(struct file *file, int rw) /* any ->read/write should understand O_NONBLOCK */ if (file->f_flags & O_NONBLOCK) return true; + return file->f_mode & FMODE_NOWAIT; +} - if (!(file->f_mode & FMODE_NOWAIT)) - return false; - - if (rw == READ) - return file->f_op->read_iter != NULL; +/* + * If we tracked the file through the SCM inflight mechanism, we could support + * any file. For now, just ensure that anything potentially problematic is done + * inline. + */ +static unsigned int io_file_get_flags(struct file *file) +{ + umode_t mode = file_inode(file)->i_mode; + unsigned int res = 0; - return file->f_op->write_iter != NULL; + if (S_ISREG(mode)) + res |= FFS_ISREG; + if (__io_file_supports_nowait(file, mode)) + res |= FFS_NOWAIT; + return res; } -static bool io_file_supports_nowait(struct io_kiocb *req, int rw) +static inline bool io_file_supports_nowait(struct io_kiocb *req) { - if (rw == READ && (req->flags & REQ_F_NOWAIT_READ)) - return true; - else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE)) - return true; - - return __io_file_supports_nowait(req->file, rw); + return req->flags & REQ_F_SUPPORT_NOWAIT; } -static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, - int rw) +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw.kiocb; @@ -2839,16 +2814,15 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, unsigned ioprio; int ret; - if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode)) - req->flags |= REQ_F_ISREG; + if (!io_req_ffs_set(req)) + req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; kiocb->ki_pos = READ_ONCE(sqe->off); if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) { req->flags |= REQ_F_CUR_POS; kiocb->ki_pos = file->f_pos; } - kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); - kiocb->ki_flags = iocb_flags(kiocb->ki_filp); + kiocb->ki_flags = iocb_flags(file); ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); if (unlikely(ret)) return ret; @@ -2859,22 +2833,11 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, * reliably. If not, or it IOCB_NOWAIT is set, don't retry. */ if ((kiocb->ki_flags & IOCB_NOWAIT) || - ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw))) + ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) req->flags |= REQ_F_NOWAIT; - ioprio = READ_ONCE(sqe->ioprio); - if (ioprio) { - ret = ioprio_check_cap(ioprio); - if (ret) - return ret; - - kiocb->ki_ioprio = ioprio; - } else - kiocb->ki_ioprio = get_current_ioprio(); - if (ctx->flags & IORING_SETUP_IOPOLL) { - if (!(kiocb->ki_flags & IOCB_DIRECT) || - !kiocb->ki_filp->f_op->iopoll) + if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) return -EOPNOTSUPP; kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; @@ -2886,12 +2849,18 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_complete = io_complete_rw; } - if (req->opcode == IORING_OP_READ_FIXED || - req->opcode == IORING_OP_WRITE_FIXED) { - req->imu = NULL; - io_req_set_rsrc_node(req); + ioprio = READ_ONCE(sqe->ioprio); + if (ioprio) { + ret = ioprio_check_cap(ioprio); + if (ret) + return ret; + + kiocb->ki_ioprio = ioprio; + } else { + kiocb->ki_ioprio = get_current_ioprio(); } + req->imu = NULL; req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); req->buf_index = READ_ONCE(sqe->buf_index); @@ -2915,7 +2884,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) ret = -EINTR; fallthrough; default: - kiocb->ki_complete(kiocb, ret, 0); + kiocb->ki_complete(kiocb, ret); } } @@ -2926,7 +2895,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_async_rw *io = req->async_data; /* add previously done IO, if any */ - if (io && io->bytes_done > 0) { + if (req_has_async_data(req) && io->bytes_done > 0) { if (ret < 0) ret = io->bytes_done; else @@ -2949,7 +2918,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_ring_ctx *ctx = req->ctx; req_set_fail(req); - if (issue_flags & IO_URING_F_NONBLOCK) { + if (issue_flags & IO_URING_F_UNLOCKED) { mutex_lock(&ctx->uring_lock); __io_req_complete(req, issue_flags, ret, cflags); mutex_unlock(&ctx->uring_lock); @@ -3020,13 +2989,15 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) { - struct io_ring_ctx *ctx = req->ctx; struct io_mapped_ubuf *imu = req->imu; u16 index, buf_index = req->buf_index; if (likely(!imu)) { + struct io_ring_ctx *ctx = req->ctx; + if (unlikely(buf_index >= ctx->nr_user_bufs)) return -EFAULT; + io_req_set_rsrc_node(req, ctx); index = array_index_nospec(buf_index, ctx->nr_user_bufs); imu = READ_ONCE(ctx->user_bufs[index]); req->imu = imu; @@ -3053,10 +3024,11 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) } static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, - int bgid, struct io_buffer *kbuf, - bool needs_lock) + int bgid, unsigned int issue_flags) { + struct io_buffer *kbuf = req->kbuf; struct io_buffer *head; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; if (req->flags & REQ_F_BUFFER_SELECTED) return kbuf; @@ -3077,34 +3049,32 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, } if (*len > kbuf->len) *len = kbuf->len; + req->flags |= REQ_F_BUFFER_SELECTED; + req->kbuf = kbuf; } else { kbuf = ERR_PTR(-ENOBUFS); } io_ring_submit_unlock(req->ctx, needs_lock); - return kbuf; } static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, - bool needs_lock) + unsigned int issue_flags) { struct io_buffer *kbuf; u16 bgid; - kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; bgid = req->buf_index; - kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock); + kbuf = io_buffer_select(req, len, bgid, issue_flags); if (IS_ERR(kbuf)) return kbuf; - req->rw.addr = (u64) (unsigned long) kbuf; - req->flags |= REQ_F_BUFFER_SELECTED; return u64_to_user_ptr(kbuf->addr); } #ifdef CONFIG_COMPAT static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, - bool needs_lock) + unsigned int issue_flags) { struct compat_iovec __user *uiov; compat_ssize_t clen; @@ -3120,7 +3090,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, return -EINVAL; len = clen; - buf = io_rw_buffer_select(req, &len, needs_lock); + buf = io_rw_buffer_select(req, &len, issue_flags); if (IS_ERR(buf)) return PTR_ERR(buf); iov[0].iov_base = buf; @@ -3130,7 +3100,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, #endif static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, - bool needs_lock) + unsigned int issue_flags) { struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); void __user *buf; @@ -3142,7 +3112,7 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, len = iov[0].iov_len; if (len < 0) return -EINVAL; - buf = io_rw_buffer_select(req, &len, needs_lock); + buf = io_rw_buffer_select(req, &len, issue_flags); if (IS_ERR(buf)) return PTR_ERR(buf); iov[0].iov_base = buf; @@ -3151,12 +3121,11 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, } static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, - bool needs_lock) + unsigned int issue_flags) { if (req->flags & REQ_F_BUFFER_SELECTED) { - struct io_buffer *kbuf; + struct io_buffer *kbuf = req->kbuf; - kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; iov[0].iov_base = u64_to_user_ptr(kbuf->addr); iov[0].iov_len = kbuf->len; return 0; @@ -3166,52 +3135,72 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, #ifdef CONFIG_COMPAT if (req->ctx->compat) - return io_compat_import(req, iov, needs_lock); + return io_compat_import(req, iov, issue_flags); #endif - return __io_iov_buffer_select(req, iov, needs_lock); + return __io_iov_buffer_select(req, iov, issue_flags); } -static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, - struct iov_iter *iter, bool needs_lock) +static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, + struct io_rw_state *s, + unsigned int issue_flags) { - void __user *buf = u64_to_user_ptr(req->rw.addr); - size_t sqe_len = req->rw.len; + struct iov_iter *iter = &s->iter; u8 opcode = req->opcode; + struct iovec *iovec; + void __user *buf; + size_t sqe_len; ssize_t ret; - if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - *iovec = NULL; - return io_import_fixed(req, rw, iter); - } + BUILD_BUG_ON(ERR_PTR(0) != NULL); + + if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) + return ERR_PTR(io_import_fixed(req, rw, iter)); /* buffer index only valid with fixed read/write, or buffer select */ - if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)) - return -EINVAL; + if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))) + return ERR_PTR(-EINVAL); + + buf = u64_to_user_ptr(req->rw.addr); + sqe_len = req->rw.len; if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { if (req->flags & REQ_F_BUFFER_SELECT) { - buf = io_rw_buffer_select(req, &sqe_len, needs_lock); + buf = io_rw_buffer_select(req, &sqe_len, issue_flags); if (IS_ERR(buf)) - return PTR_ERR(buf); + return ERR_CAST(buf); req->rw.len = sqe_len; } - ret = import_single_range(rw, buf, sqe_len, *iovec, iter); - *iovec = NULL; - return ret; + ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter); + return ERR_PTR(ret); } + iovec = s->fast_iov; if (req->flags & REQ_F_BUFFER_SELECT) { - ret = io_iov_buffer_select(req, *iovec, needs_lock); + ret = io_iov_buffer_select(req, iovec, issue_flags); if (!ret) - iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len); - *iovec = NULL; - return ret; + iov_iter_init(iter, rw, iovec, 1, iovec->iov_len); + return ERR_PTR(ret); } - return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter, + ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter, req->ctx->compat); + if (unlikely(ret < 0)) + return ERR_PTR(ret); + return iovec; +} + +static inline int io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct io_rw_state *s, + unsigned int issue_flags) +{ + *iovec = __io_import_iovec(rw, req, s, issue_flags); + if (unlikely(IS_ERR(*iovec))) + return PTR_ERR(*iovec); + + iov_iter_save_state(&s->iter, &s->iter_state); + return 0; } static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) @@ -3236,7 +3225,8 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) */ if (kiocb->ki_flags & IOCB_HIPRI) return -EOPNOTSUPP; - if (kiocb->ki_flags & IOCB_NOWAIT) + if ((kiocb->ki_flags & IOCB_NOWAIT) && + !(kiocb->ki_filp->f_flags & O_NONBLOCK)) return -EAGAIN; while (iov_iter_count(iter)) { @@ -3282,7 +3272,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, { struct io_async_rw *rw = req->async_data; - memcpy(&rw->iter, iter, sizeof(*iter)); + memcpy(&rw->s.iter, iter, sizeof(*iter)); rw->free_iovec = iovec; rw->bytes_done = 0; /* can only be fixed buffers, no need to do anything */ @@ -3291,33 +3281,36 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, if (!iovec) { unsigned iov_off = 0; - rw->iter.iov = rw->fast_iov; + rw->s.iter.iov = rw->s.fast_iov; if (iter->iov != fast_iov) { iov_off = iter->iov - fast_iov; - rw->iter.iov += iov_off; + rw->s.iter.iov += iov_off; } - if (rw->fast_iov != fast_iov) - memcpy(rw->fast_iov + iov_off, fast_iov + iov_off, + if (rw->s.fast_iov != fast_iov) + memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off, sizeof(struct iovec) * iter->nr_segs); } else { req->flags |= REQ_F_NEED_CLEANUP; } } -static inline int io_alloc_async_data(struct io_kiocb *req) +static inline bool io_alloc_async_data(struct io_kiocb *req) { WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); - return req->async_data == NULL; + if (req->async_data) { + req->flags |= REQ_F_ASYNC_DATA; + return false; + } + return true; } static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, - const struct iovec *fast_iov, - struct iov_iter *iter, bool force) + struct io_rw_state *s, bool force) { if (!force && !io_op_defs[req->opcode].needs_async_setup) return 0; - if (!req->async_data) { + if (!req_has_async_data(req)) { struct io_async_rw *iorw; if (io_alloc_async_data(req)) { @@ -3325,10 +3318,10 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, return -ENOMEM; } - io_req_map_rw(req, iovec, fast_iov, iter); + io_req_map_rw(req, iovec, s->fast_iov, &s->iter); iorw = req->async_data; /* we've copied and mapped the iter, ensure state is saved */ - iov_iter_save_state(&iorw->iter, &iorw->iter_state); + iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); } return 0; } @@ -3336,10 +3329,11 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, static inline int io_rw_prep_async(struct io_kiocb *req, int rw) { struct io_async_rw *iorw = req->async_data; - struct iovec *iov = iorw->fast_iov; + struct iovec *iov; int ret; - ret = io_import_iovec(rw, req, &iov, &iorw->iter, false); + /* submission path, ->uring_lock should already be taken */ + ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); if (unlikely(ret < 0)) return ret; @@ -3347,7 +3341,6 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw) iorw->free_iovec = iov; if (iov) req->flags |= REQ_F_NEED_CLEANUP; - iov_iter_save_state(&iorw->iter, &iorw->iter_state); return 0; } @@ -3355,11 +3348,11 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (unlikely(!(req->file->f_mode & FMODE_READ))) return -EBADF; - return io_prep_rw(req, sqe, READ); + return io_prep_rw(req, sqe); } /* - * This is our waitqueue callback handler, registered through lock_page_async() + * This is our waitqueue callback handler, registered through __folio_lock_async() * when we initially tried to do the IO with the iocb armed our waitqueue. * This gets called when the page is unlocked, and we generally expect that to * happen when the page IO is completed and the page is now uptodate. This will @@ -3431,7 +3424,7 @@ static bool io_rw_should_retry(struct io_kiocb *req) static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) { - if (req->file->f_op->read_iter) + if (likely(req->file->f_op->read_iter)) return call_read_iter(req->file, &req->rw.kiocb, iter); else if (req->file->f_op->read) return loop_rw_iter(READ, req, iter); @@ -3447,43 +3440,40 @@ static bool need_read_all(struct io_kiocb *req) static int io_read(struct io_kiocb *req, unsigned int issue_flags) { - struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct io_rw_state __s, *s = &__s; + struct iovec *iovec; struct kiocb *kiocb = &req->rw.kiocb; - struct iov_iter __iter, *iter = &__iter; - struct io_async_rw *rw = req->async_data; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - struct iov_iter_state __state, *state; + struct io_async_rw *rw; ssize_t ret, ret2; - if (rw) { - iter = &rw->iter; - state = &rw->iter_state; + if (!req_has_async_data(req)) { + ret = io_import_iovec(READ, req, &iovec, s, issue_flags); + if (unlikely(ret < 0)) + return ret; + } else { + rw = req->async_data; + s = &rw->s; /* * We come here from an earlier attempt, restore our state to * match in case it doesn't. It's cheap enough that we don't * need to make this conditional. */ - iov_iter_restore(iter, state); + iov_iter_restore(&s->iter, &s->iter_state); iovec = NULL; - } else { - ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); - if (ret < 0) - return ret; - state = &__state; - iov_iter_save_state(iter, state); } - req->result = iov_iter_count(iter); + req->result = iov_iter_count(&s->iter); - /* Ensure we clear previously set non-block flag */ - if (!force_nonblock) - kiocb->ki_flags &= ~IOCB_NOWAIT; - else + if (force_nonblock) { + /* If the file doesn't support async, just async punt */ + if (unlikely(!io_file_supports_nowait(req))) { + ret = io_setup_async_rw(req, iovec, s, true); + return ret ?: -EAGAIN; + } kiocb->ki_flags |= IOCB_NOWAIT; - - /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_nowait(req, READ)) { - ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true); - return ret ?: -EAGAIN; + } else { + /* Ensure we clear previously set non-block flag */ + kiocb->ki_flags &= ~IOCB_NOWAIT; } ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result); @@ -3492,7 +3482,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) return ret; } - ret = io_iter_do_read(req, iter); + ret = io_iter_do_read(req, &s->iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; @@ -3505,7 +3495,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ret = 0; } else if (ret == -EIOCBQUEUED) { goto out_free; - } else if (ret <= 0 || ret == req->result || !force_nonblock || + } else if (ret == req->result || ret <= 0 || !force_nonblock || (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { /* read all, failed, already did sync or don't want to retry */ goto done; @@ -3516,22 +3506,19 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) * untouched in case of error. Restore it and we'll advance it * manually if we need to. */ - iov_iter_restore(iter, state); + iov_iter_restore(&s->iter, &s->iter_state); - ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true); + ret2 = io_setup_async_rw(req, iovec, s, true); if (ret2) return ret2; iovec = NULL; rw = req->async_data; + s = &rw->s; /* * Now use our persistent iterator and state, if we aren't already. * We've restored and mapped the iter to match. */ - if (iter != &rw->iter) { - iter = &rw->iter; - state = &rw->iter_state; - } do { /* @@ -3539,11 +3526,11 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) * above or inside this loop. Advance the iter by the bytes * that were consumed. */ - iov_iter_advance(iter, ret); - if (!iov_iter_count(iter)) + iov_iter_advance(&s->iter, ret); + if (!iov_iter_count(&s->iter)) break; rw->bytes_done += ret; - iov_iter_save_state(iter, state); + iov_iter_save_state(&s->iter, &s->iter_state); /* if we can retry, do so with the callbacks armed */ if (!io_rw_should_retry(req)) { @@ -3557,12 +3544,12 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) * desired page gets unlocked. We can also get a partial read * here, and if we do, then just retry at the new offset. */ - ret = io_iter_do_read(req, iter); + ret = io_iter_do_read(req, &s->iter); if (ret == -EIOCBQUEUED) return 0; /* we got some bytes, but not all. retry. */ kiocb->ki_flags &= ~IOCB_WAITQ; - iov_iter_restore(iter, state); + iov_iter_restore(&s->iter, &s->iter_state); } while (ret > 0); done: kiocb_done(kiocb, ret, issue_flags); @@ -3577,47 +3564,46 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (unlikely(!(req->file->f_mode & FMODE_WRITE))) return -EBADF; - return io_prep_rw(req, sqe, WRITE); + req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file)); + return io_prep_rw(req, sqe); } static int io_write(struct io_kiocb *req, unsigned int issue_flags) { - struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct io_rw_state __s, *s = &__s; + struct iovec *iovec; struct kiocb *kiocb = &req->rw.kiocb; - struct iov_iter __iter, *iter = &__iter; - struct io_async_rw *rw = req->async_data; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - struct iov_iter_state __state, *state; ssize_t ret, ret2; - if (rw) { - iter = &rw->iter; - state = &rw->iter_state; - iov_iter_restore(iter, state); - iovec = NULL; - } else { - ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); - if (ret < 0) + if (!req_has_async_data(req)) { + ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags); + if (unlikely(ret < 0)) return ret; - state = &__state; - iov_iter_save_state(iter, state); + } else { + struct io_async_rw *rw = req->async_data; + + s = &rw->s; + iov_iter_restore(&s->iter, &s->iter_state); + iovec = NULL; } - req->result = iov_iter_count(iter); + req->result = iov_iter_count(&s->iter); - /* Ensure we clear previously set non-block flag */ - if (!force_nonblock) - kiocb->ki_flags &= ~IOCB_NOWAIT; - else - kiocb->ki_flags |= IOCB_NOWAIT; + if (force_nonblock) { + /* If the file doesn't support async, just async punt */ + if (unlikely(!io_file_supports_nowait(req))) + goto copy_iov; - /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_nowait(req, WRITE)) - goto copy_iov; + /* file path doesn't support NOWAIT for non-direct_IO */ + if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && + (req->flags & REQ_F_ISREG)) + goto copy_iov; - /* file path doesn't support NOWAIT for non-direct_IO */ - if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && - (req->flags & REQ_F_ISREG)) - goto copy_iov; + kiocb->ki_flags |= IOCB_NOWAIT; + } else { + /* Ensure we clear previously set non-block flag */ + kiocb->ki_flags &= ~IOCB_NOWAIT; + } ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result); if (unlikely(ret)) @@ -3637,10 +3623,10 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) } kiocb->ki_flags |= IOCB_WRITE; - if (req->file->f_op->write_iter) - ret2 = call_write_iter(req->file, kiocb, iter); + if (likely(req->file->f_op->write_iter)) + ret2 = call_write_iter(req->file, kiocb, &s->iter); else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, req, iter); + ret2 = loop_rw_iter(WRITE, req, &s->iter); else ret2 = -EINVAL; @@ -3660,14 +3646,14 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) goto done; if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ - if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN) + if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) goto copy_iov; done: kiocb_done(kiocb, ret2, issue_flags); } else { copy_iov: - iov_iter_restore(iter, state); - ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); + iov_iter_restore(&s->iter, &s->iter_state); + ret = io_setup_async_rw(req, iovec, s, false); return ret ?: -EAGAIN; } out_free: @@ -3803,7 +3789,7 @@ static int io_mkdirat_prep(struct io_kiocb *req, return 0; } -static int io_mkdirat(struct io_kiocb *req, int issue_flags) +static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) { struct io_mkdir *mkd = &req->mkdir; int ret; @@ -3852,7 +3838,7 @@ static int io_symlinkat_prep(struct io_kiocb *req, return 0; } -static int io_symlinkat(struct io_kiocb *req, int issue_flags) +static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) { struct io_symlink *sl = &req->symlink; int ret; @@ -3902,7 +3888,7 @@ static int io_linkat_prep(struct io_kiocb *req, return 0; } -static int io_linkat(struct io_kiocb *req, int issue_flags) +static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) { struct io_hardlink *lnk = &req->hardlink; int ret; @@ -4321,9 +4307,9 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; struct io_buffer *head; int ret = 0; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - io_ring_submit_lock(ctx, !force_nonblock); + io_ring_submit_lock(ctx, needs_lock); lockdep_assert_held(&ctx->uring_lock); @@ -4336,7 +4322,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) /* complete before unlock, IOPOLL may need the lock */ __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, !force_nonblock); + io_ring_submit_unlock(ctx, needs_lock); return 0; } @@ -4408,9 +4394,9 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; struct io_buffer *head, *list; int ret = 0; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - io_ring_submit_lock(ctx, !force_nonblock); + io_ring_submit_lock(ctx, needs_lock); lockdep_assert_held(&ctx->uring_lock); @@ -4426,7 +4412,7 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, !force_nonblock); + io_ring_submit_unlock(ctx, needs_lock); return 0; } @@ -4759,8 +4745,9 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - kmsg = req->async_data; - if (!kmsg) { + if (req_has_async_data(req)) { + kmsg = req->async_data; + } else { ret = io_sendmsg_copy_hdr(req, &iomsg); if (ret) return ret; @@ -4919,23 +4906,16 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, } static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, - bool needs_lock) + unsigned int issue_flags) { struct io_sr_msg *sr = &req->sr_msg; - struct io_buffer *kbuf; - kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock); - if (IS_ERR(kbuf)) - return kbuf; - - sr->kbuf = kbuf; - req->flags |= REQ_F_BUFFER_SELECTED; - return kbuf; + return io_buffer_select(req, &sr->len, sr->bgid, issue_flags); } static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) { - return io_put_kbuf(req, req->sr_msg.kbuf); + return io_put_kbuf(req, req->kbuf); } static int io_recvmsg_prep_async(struct io_kiocb *req) @@ -4983,8 +4963,9 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - kmsg = req->async_data; - if (!kmsg) { + if (req_has_async_data(req)) { + kmsg = req->async_data; + } else { ret = io_recvmsg_copy_hdr(req, &iomsg); if (ret) return ret; @@ -4992,7 +4973,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) } if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, !force_nonblock); + kbuf = io_recv_buffer_select(req, issue_flags); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); @@ -5044,7 +5025,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) return -ENOTSOCK; if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, !force_nonblock); + kbuf = io_recv_buffer_select(req, issue_flags); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); buf = u64_to_user_ptr(kbuf->addr); @@ -5175,7 +5156,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - if (req->async_data) { + if (req_has_async_data(req)) { io = req->async_data; } else { ret = move_addr_to_kernel(req->connect.addr, @@ -5191,7 +5172,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) ret = __sys_connect_file(req->file, &io->address, req->connect.addr_len, file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { - if (req->async_data) + if (req_has_async_data(req)) return -EAGAIN; if (io_alloc_async_data(req)) { ret = -ENOMEM; @@ -5351,16 +5332,6 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask) return !(flags & IORING_CQE_F_MORE); } -static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask) - __must_hold(&req->ctx->completion_lock) -{ - bool done; - - done = __io_poll_complete(req, mask); - io_commit_cqring(req->ctx); - return done; -} - static void io_poll_task_func(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; @@ -5482,7 +5453,10 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake); req_ref_get(req); poll->wait.private = req; + *poll_ptr = poll; + if (req->opcode == IORING_OP_POLL_ADD) + req->flags |= REQ_F_ASYNC_DATA; } pt->nr_entries++; @@ -5606,17 +5580,13 @@ static int io_arm_poll_handler(struct io_kiocb *req) struct async_poll *apoll; struct io_poll_table ipt; __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI; - int rw; - if (!req->file || !file_can_poll(req->file)) - return IO_APOLL_ABORTED; - if (req->flags & REQ_F_POLLED) - return IO_APOLL_ABORTED; if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; + if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED)) + return IO_APOLL_ABORTED; if (def->pollin) { - rw = READ; mask |= POLLIN | POLLRDNORM; /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ @@ -5624,14 +5594,9 @@ static int io_arm_poll_handler(struct io_kiocb *req) (req->sr_msg.msg_flags & MSG_ERRQUEUE)) mask &= ~POLLIN; } else { - rw = WRITE; mask |= POLLOUT | POLLWRNORM; } - /* if we can't nonblock try, then no point in arming a poll handler */ - if (!io_file_supports_nowait(req, rw)) - return IO_APOLL_ABORTED; - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); if (unlikely(!apoll)) return IO_APOLL_ABORTED; @@ -5692,8 +5657,8 @@ static bool io_poll_remove_one(struct io_kiocb *req) /* * Returns true if we found and killed one or more poll requests */ -static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, - bool cancel_all) +static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, + struct task_struct *tsk, bool cancel_all) { struct hlist_node *tmp; struct io_kiocb *req; @@ -5847,7 +5812,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) if (mask) { /* no async, we'd stolen it */ ipt.error = 0; - done = io_poll_complete(req, mask); + done = __io_poll_complete(req, mask); + io_commit_cqring(req->ctx); } spin_unlock(&ctx->completion_lock); @@ -5923,7 +5889,10 @@ err: static void io_req_task_timeout(struct io_kiocb *req, bool *locked) { - req_set_fail(req); + struct io_timeout_data *data = req->async_data; + + if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) + req_set_fail(req); io_req_complete_post(req, -ETIME, 0); } @@ -6129,7 +6098,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (off && is_timeout_link) return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); - if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK)) + if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | + IORING_TIMEOUT_ETIME_SUCCESS)) return -EINVAL; /* more than one clock specified is invalid, obviously */ if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) @@ -6140,7 +6110,9 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(off && !req->ctx->off_timeout_used)) req->ctx->off_timeout_used = true; - if (!req->async_data && io_alloc_async_data(req)) + if (WARN_ON_ONCE(req_has_async_data(req))) + return -EFAULT; + if (io_alloc_async_data(req)) return -ENOMEM; data = req->async_data; @@ -6297,6 +6269,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; u64 sqe_addr = req->cancel.addr; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_tctx_node *node; int ret; @@ -6305,7 +6278,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) goto done; /* slow path, try all io-wq's */ - io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + io_ring_submit_lock(ctx, needs_lock); ret = -ENOENT; list_for_each_entry(node, &ctx->tctx_list, ctx_node) { struct io_uring_task *tctx = node->task->io_uring; @@ -6314,7 +6287,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) if (ret != -ENOENT) break; } - io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + io_ring_submit_unlock(ctx, needs_lock); done: if (ret < 0) req_set_fail(req); @@ -6341,6 +6314,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req, static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_uring_rsrc_update2 up; int ret; @@ -6350,10 +6324,10 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) up.tags = 0; up.resv = 0; - io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + io_ring_submit_lock(ctx, needs_lock); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, req->rsrc_update.nr_args); - io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + io_ring_submit_unlock(ctx, needs_lock); if (ret < 0) req_set_fail(req); @@ -6449,7 +6423,7 @@ static int io_req_prep_async(struct io_kiocb *req) { if (!io_op_defs[req->opcode].needs_async_setup) return 0; - if (WARN_ON_ONCE(req->async_data)) + if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; if (io_alloc_async_data(req)) return -EAGAIN; @@ -6481,68 +6455,39 @@ static u32 io_get_sequence(struct io_kiocb *req) return seq; } -static bool io_drain_req(struct io_kiocb *req) +static __cold void io_drain_req(struct io_kiocb *req) { - struct io_kiocb *pos; struct io_ring_ctx *ctx = req->ctx; struct io_defer_entry *de; int ret; - u32 seq; - - if (req->flags & REQ_F_FAIL) { - io_req_complete_fail_submit(req); - return true; - } - - /* - * If we need to drain a request in the middle of a link, drain the - * head request and the next request/link after the current link. - * Considering sequential execution of links, IOSQE_IO_DRAIN will be - * maintained for every request of our link. - */ - if (ctx->drain_next) { - req->flags |= REQ_F_IO_DRAIN; - ctx->drain_next = false; - } - /* not interested in head, start from the first linked */ - io_for_each_link(pos, req->link) { - if (pos->flags & REQ_F_IO_DRAIN) { - ctx->drain_next = true; - req->flags |= REQ_F_IO_DRAIN; - break; - } - } + u32 seq = io_get_sequence(req); /* Still need defer if there is pending req in defer list. */ - if (likely(list_empty_careful(&ctx->defer_list) && - !(req->flags & REQ_F_IO_DRAIN))) { + if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { +queue: ctx->drain_active = false; - return false; + io_req_task_queue(req); + return; } - seq = io_get_sequence(req); - /* Still a chance to pass the sequence check */ - if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) - return false; - ret = io_req_prep_async(req); - if (ret) - goto fail; + if (ret) { +fail: + io_req_complete_failed(req, ret); + return; + } io_prep_async_link(req); de = kmalloc(sizeof(*de), GFP_KERNEL); if (!de) { ret = -ENOMEM; -fail: - io_req_complete_failed(req, ret); - return true; + goto fail; } spin_lock(&ctx->completion_lock); if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock(&ctx->completion_lock); kfree(de); - io_queue_async_work(req, NULL); - return true; + goto queue; } trace_io_uring_defer(ctx, req, req->user_data); @@ -6550,23 +6495,13 @@ fail: de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); spin_unlock(&ctx->completion_lock); - return true; } static void io_clean_op(struct io_kiocb *req) { if (req->flags & REQ_F_BUFFER_SELECTED) { - switch (req->opcode) { - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - kfree((void *)(unsigned long)req->rw.addr); - break; - case IORING_OP_RECVMSG: - case IORING_OP_RECV: - kfree(req->sr_msg.kbuf); - break; - } + kfree(req->kbuf); + req->kbuf = NULL; } if (req->flags & REQ_F_NEED_CLEANUP) { @@ -6631,17 +6566,19 @@ static void io_clean_op(struct io_kiocb *req) } if (req->flags & REQ_F_CREDS) put_cred(req->creds); - + if (req->flags & REQ_F_ASYNC_DATA) { + kfree(req->async_data); + req->async_data = NULL; + } req->flags &= ~IO_REQ_CLEAN_FLAGS; } static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { - struct io_ring_ctx *ctx = req->ctx; const struct cred *creds = NULL; int ret; - if ((req->flags & REQ_F_CREDS) && req->creds != current_cred()) + if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) creds = override_creds(req->creds); switch (req->opcode) { @@ -6764,8 +6701,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (ret) return ret; /* If the op doesn't have a file, we're not polling for it */ - if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) - io_iopoll_req_issued(req); + if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file) + io_iopoll_req_issued(req, issue_flags); return 0; } @@ -6781,6 +6718,8 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) static void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + unsigned int issue_flags = IO_URING_F_UNLOCKED; + bool needs_poll = false; struct io_kiocb *timeout; int ret = 0; @@ -6795,23 +6734,42 @@ static void io_wq_submit_work(struct io_wq_work *work) io_queue_linked_timeout(timeout); /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ - if (work->flags & IO_WQ_WORK_CANCEL) - ret = -ECANCELED; + if (work->flags & IO_WQ_WORK_CANCEL) { + io_req_task_queue_fail(req, -ECANCELED); + return; + } - if (!ret) { - do { - ret = io_issue_sqe(req, 0); - /* - * We can get EAGAIN for polled IO even though we're - * forcing a sync submission from here, since we can't - * wait for request slots on the block side. - */ - if (ret != -EAGAIN) - break; - cond_resched(); - } while (1); + if (req->flags & REQ_F_FORCE_ASYNC) { + const struct io_op_def *def = &io_op_defs[req->opcode]; + bool opcode_poll = def->pollin || def->pollout; + + if (opcode_poll && file_can_poll(req->file)) { + needs_poll = true; + issue_flags |= IO_URING_F_NONBLOCK; + } } + do { + ret = io_issue_sqe(req, issue_flags); + if (ret != -EAGAIN) + break; + /* + * We can get EAGAIN for iopolled IO even though we're + * forcing a sync submission from here, since we can't + * wait for request slots on the block side. + */ + if (!needs_poll) { + cond_resched(); + continue; + } + + if (io_arm_poll_handler(req) == IO_APOLL_OK) + return; + /* aborted or ready, in either case retry blocking */ + needs_poll = false; + issue_flags &= ~IO_URING_F_NONBLOCK; + } while (1); + /* avoid locking problems by failing it from a clean context */ if (ret) io_req_task_queue_fail(req, ret); @@ -6835,12 +6793,7 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file { unsigned long file_ptr = (unsigned long) file; - if (__io_file_supports_nowait(file, READ)) - file_ptr |= FFS_ASYNC_READ; - if (__io_file_supports_nowait(file, WRITE)) - file_ptr |= FFS_ASYNC_WRITE; - if (S_ISREG(file_inode(file)->i_mode)) - file_ptr |= FFS_ISREG; + file_ptr |= io_file_get_flags(file); file_slot->file_ptr = file_ptr; } @@ -6857,8 +6810,8 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, file = (struct file *) (file_ptr & FFS_MASK); file_ptr &= ~FFS_MASK; /* mask in overlapping REQ_F and FFS bits */ - req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT); - io_req_set_rsrc_node(req); + req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); + io_req_set_rsrc_node(req, ctx); return file; } @@ -6950,67 +6903,66 @@ static void io_queue_linked_timeout(struct io_kiocb *req) io_put_req(req); } -static void __io_queue_sqe(struct io_kiocb *req) +static void io_queue_sqe_arm_apoll(struct io_kiocb *req) + __must_hold(&req->ctx->uring_lock) +{ + struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + + switch (io_arm_poll_handler(req)) { + case IO_APOLL_READY: + if (linked_timeout) { + io_queue_linked_timeout(linked_timeout); + linked_timeout = NULL; + } + io_req_task_queue(req); + break; + case IO_APOLL_ABORTED: + /* + * Queued up for async execution, worker will release + * submit reference when the iocb is actually submitted. + */ + io_queue_async_work(req, NULL); + break; + } + + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); +} + +static inline void __io_queue_sqe(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { struct io_kiocb *linked_timeout; int ret; -issue_sqe: ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); + if (req->flags & REQ_F_COMPLETE_INLINE) { + io_req_add_compl_list(req); + return; + } /* * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ if (likely(!ret)) { - if (req->flags & REQ_F_COMPLETE_INLINE) { - struct io_ring_ctx *ctx = req->ctx; - struct io_submit_state *state = &ctx->submit_state; - - state->compl_reqs[state->compl_nr++] = req; - if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) - io_submit_flush_completions(ctx); - return; - } - linked_timeout = io_prep_linked_timeout(req); if (linked_timeout) io_queue_linked_timeout(linked_timeout); } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { - linked_timeout = io_prep_linked_timeout(req); - - switch (io_arm_poll_handler(req)) { - case IO_APOLL_READY: - if (linked_timeout) - io_unprep_linked_timeout(req); - goto issue_sqe; - case IO_APOLL_ABORTED: - /* - * Queued up for async execution, worker will release - * submit reference when the iocb is actually submitted. - */ - io_queue_async_work(req, NULL); - break; - } - - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); + io_queue_sqe_arm_apoll(req); } else { io_req_complete_failed(req, ret); } } -static inline void io_queue_sqe(struct io_kiocb *req) +static void io_queue_sqe_fallback(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { - if (unlikely(req->ctx->drain_active) && io_drain_req(req)) - return; - - if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) { - __io_queue_sqe(req); - } else if (req->flags & REQ_F_FAIL) { + if (req->flags & REQ_F_FAIL) { io_req_complete_fail_submit(req); + } else if (unlikely(req->ctx->drain_active)) { + io_drain_req(req); } else { int ret = io_req_prep_async(req); @@ -7021,6 +6973,15 @@ static inline void io_queue_sqe(struct io_kiocb *req) } } +static inline void io_queue_sqe(struct io_kiocb *req) + __must_hold(&req->ctx->uring_lock) +{ + if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) + __io_queue_sqe(req); + else + io_queue_sqe_fallback(req); +} + /* * Check SQE restrictions (opcode and flags). * @@ -7030,9 +6991,6 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags) { - if (likely(!ctx->restricted)) - return true; - if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) return false; @@ -7047,16 +7005,35 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, return true; } +static void io_init_req_drain(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *head = ctx->submit_state.link.head; + + ctx->drain_active = true; + if (head) { + /* + * If we need to drain a request in the middle of a link, drain + * the head request and the next request/link after the current + * link. Considering sequential execution of links, + * IOSQE_IO_DRAIN will be maintained for every request of our + * link. + */ + head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC; + ctx->drain_next = true; + } +} + static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) { - struct io_submit_state *state; unsigned int sqe_flags; - int personality, ret = 0; + int personality; + u8 opcode; /* req is partially pre-initialised, see io_preinit_req() */ - req->opcode = READ_ONCE(sqe->opcode); + req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags = sqe_flags = READ_ONCE(sqe->flags); req->user_data = READ_ONCE(sqe->user_data); @@ -7064,19 +7041,52 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->fixed_rsrc_refs = NULL; req->task = current; - /* enforce forwards compatibility on users */ - if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) - return -EINVAL; - if (unlikely(req->opcode >= IORING_OP_LAST)) + if (unlikely(opcode >= IORING_OP_LAST)) { + req->opcode = 0; return -EINVAL; - if (!io_check_restriction(ctx, req, sqe_flags)) - return -EACCES; + } + if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { + /* enforce forwards compatibility on users */ + if (sqe_flags & ~SQE_VALID_FLAGS) + return -EINVAL; + if ((sqe_flags & IOSQE_BUFFER_SELECT) && + !io_op_defs[opcode].buffer_select) + return -EOPNOTSUPP; + if (sqe_flags & IOSQE_IO_DRAIN) + io_init_req_drain(req); + } + if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { + if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) + return -EACCES; + /* knock it to the slow queue path, will be drained there */ + if (ctx->drain_active) + req->flags |= REQ_F_FORCE_ASYNC; + /* if there is no link, we're at "next" request and need to drain */ + if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { + ctx->drain_next = false; + ctx->drain_active = true; + req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC; + } + } - if ((sqe_flags & IOSQE_BUFFER_SELECT) && - !io_op_defs[req->opcode].buffer_select) - return -EOPNOTSUPP; - if (unlikely(sqe_flags & IOSQE_IO_DRAIN)) - ctx->drain_active = true; + if (io_op_defs[opcode].needs_file) { + struct io_submit_state *state = &ctx->submit_state; + + /* + * Plug now if we have more than 2 IO left after this, and the + * target is potentially a read/write to block based storage. + */ + if (state->need_plug && io_op_defs[opcode].plug) { + state->plug_started = true; + state->need_plug = false; + blk_start_plug_nr_ios(&state->plug, state->submit_nr); + } + + req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), + (sqe_flags & IOSQE_FIXED_FILE)); + if (unlikely(!req->file)) + return -EBADF; + } personality = READ_ONCE(sqe->personality); if (personality) { @@ -7086,27 +7096,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, get_cred(req->creds); req->flags |= REQ_F_CREDS; } - state = &ctx->submit_state; - /* - * Plug now if we have more than 1 IO left after this, and the target - * is potentially a read/write to block based storage. - */ - if (!state->plug_started && state->ios_left > 1 && - io_op_defs[req->opcode].plug) { - blk_start_plug(&state->plug); - state->plug_started = true; - } - - if (io_op_defs[req->opcode].needs_file) { - req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), - (sqe_flags & IOSQE_FIXED_FILE)); - if (unlikely(!req->file)) - ret = -EBADF; - } - - state->ios_left--; - return ret; + return io_req_prep(req, sqe); } static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, @@ -7118,7 +7109,8 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_init_req(ctx, req, sqe); if (unlikely(ret)) { -fail_req: + trace_io_uring_req_failed(sqe, ret); + /* fail even hard links since we don't submit */ if (link->head) { /* @@ -7141,10 +7133,6 @@ fail_req: return ret; } req_fail_link_node(req, ret); - } else { - ret = io_req_prep(req, sqe); - if (unlikely(ret)) - goto fail_req; } /* don't need @sqe from now on */ @@ -7174,33 +7162,32 @@ fail_req: link->last->link = req; link->last = req; + if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) + return 0; /* last request of a link, enqueue the link */ - if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { - link->head = NULL; - io_queue_sqe(head); - } - } else { - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - link->head = req; - link->last = req; - } else { - io_queue_sqe(req); - } + link->head = NULL; + req = head; + } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { + link->head = req; + link->last = req; + return 0; } + io_queue_sqe(req); return 0; } /* * Batched submission is done, ensure local IO is flushed out. */ -static void io_submit_state_end(struct io_submit_state *state, - struct io_ring_ctx *ctx) +static void io_submit_state_end(struct io_ring_ctx *ctx) { + struct io_submit_state *state = &ctx->submit_state; + if (state->link.head) io_queue_sqe(state->link.head); - if (state->compl_nr) - io_submit_flush_completions(ctx); + /* flush only after queuing links as they can generate completions */ + io_submit_flush_completions(ctx); if (state->plug_started) blk_finish_plug(&state->plug); } @@ -7212,7 +7199,8 @@ static void io_submit_state_start(struct io_submit_state *state, unsigned int max_ios) { state->plug_started = false; - state->ios_left = max_ios; + state->need_plug = max_ios > 2; + state->submit_nr = max_ios; /* set only head, no need to init link_last in advance */ state->link.head = NULL; } @@ -7264,45 +7252,45 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { + unsigned int entries = io_sqring_entries(ctx); int submitted = 0; + if (unlikely(!entries)) + return 0; /* make sure SQ entry isn't read before tail */ - nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); - if (!percpu_ref_tryget_many(&ctx->refs, nr)) - return -EAGAIN; + nr = min3(nr, ctx->sq_entries, entries); io_get_task_refs(nr); io_submit_state_start(&ctx->submit_state, nr); - while (submitted < nr) { + do { const struct io_uring_sqe *sqe; struct io_kiocb *req; - req = io_alloc_req(ctx); - if (unlikely(!req)) { + if (unlikely(!io_alloc_req_refill(ctx))) { if (!submitted) submitted = -EAGAIN; break; } + req = io_alloc_req(ctx); sqe = io_get_sqe(ctx); if (unlikely(!sqe)) { - list_add(&req->inflight_entry, &ctx->submit_state.free_list); + wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); break; } /* will complete beyond this point, count as submitted */ submitted++; if (io_submit_sqe(ctx, req, sqe)) break; - } + } while (submitted < nr); if (unlikely(submitted != nr)) { int ref_used = (submitted == -EAGAIN) ? 0 : submitted; int unused = nr - ref_used; current->io_uring->cached_refs += unused; - percpu_ref_put_many(&ctx->refs, unused); } - io_submit_state_end(&ctx->submit_state, ctx); + io_submit_state_end(ctx); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); @@ -7341,16 +7329,15 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; - if (!list_empty(&ctx->iopoll_list) || to_submit) { - unsigned nr_events = 0; + if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { const struct cred *creds = NULL; if (ctx->sq_creds != current_cred()) creds = override_creds(ctx->sq_creds); mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->iopoll_list)) - io_do_iopoll(ctx, &nr_events, 0); + if (!wq_list_empty(&ctx->iopoll_list)) + io_do_iopoll(ctx, true); /* * Don't submit if refs are dying, good for io_uring_register(), @@ -7370,7 +7357,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) return ret; } -static void io_sqd_update_thread_idle(struct io_sq_data *sqd) +static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd) { struct io_ring_ctx *ctx; unsigned sq_thread_idle = 0; @@ -7427,7 +7414,7 @@ static int io_sq_thread(void *data) list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { int ret = __io_sq_thread(ctx, cap_entries); - if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) + if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) sqt_spin = true; } if (io_run_task_work()) @@ -7448,7 +7435,7 @@ static int io_sq_thread(void *data) io_ring_set_wakeup_flag(ctx); if ((ctx->flags & IORING_SETUP_IOPOLL) && - !list_empty_careful(&ctx->iopoll_list)) { + !wq_list_empty(&ctx->iopoll_list)) { needs_sched = false; break; } @@ -7624,7 +7611,7 @@ static void io_free_page_table(void **table, size_t size) kfree(table); } -static void **io_alloc_page_table(size_t size) +static __cold void **io_alloc_page_table(size_t size) { unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); size_t init_size = size; @@ -7653,7 +7640,7 @@ static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) kfree(ref_node); } -static void io_rsrc_node_ref_zero(struct percpu_ref *ref) +static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) { struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); struct io_ring_ctx *ctx = node->rsrc_data->ctx; @@ -7699,10 +7686,13 @@ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) static void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill) + __must_hold(&ctx->uring_lock) { WARN_ON_ONCE(!ctx->rsrc_backup_node); WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); + io_rsrc_refs_drop(ctx); + if (data_to_kill) { struct io_rsrc_node *rsrc_node = ctx->rsrc_node; @@ -7730,7 +7720,8 @@ static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) return ctx->rsrc_backup_node ? 0 : -ENOMEM; } -static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) +static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data, + struct io_ring_ctx *ctx) { int ret; @@ -7786,9 +7777,9 @@ static void io_rsrc_data_free(struct io_rsrc_data *data) kfree(data); } -static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put, - u64 __user *utags, unsigned nr, - struct io_rsrc_data **pdata) +static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put, + u64 __user *utags, unsigned nr, + struct io_rsrc_data **pdata) { struct io_rsrc_data *data; int ret = -ENOMEM; @@ -8356,12 +8347,12 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index) { struct io_ring_ctx *ctx = req->ctx; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; bool needs_switch = false; struct io_fixed_file *file_slot; int ret = -EBADF; - io_ring_submit_lock(ctx, !force_nonblock); + io_ring_submit_lock(ctx, needs_lock); if (file->f_op == &io_uring_fops) goto err; ret = -ENXIO; @@ -8402,7 +8393,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, err: if (needs_switch) io_rsrc_node_switch(ctx, ctx->file_data); - io_ring_submit_unlock(ctx, !force_nonblock); + io_ring_submit_unlock(ctx, needs_lock); if (ret) fput(file); return ret; @@ -8412,11 +8403,12 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) { unsigned int offset = req->close.file_slot - 1; struct io_ring_ctx *ctx = req->ctx; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_fixed_file *file_slot; struct file *file; int ret, i; - io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + io_ring_submit_lock(ctx, needs_lock); ret = -ENXIO; if (unlikely(!ctx->file_data)) goto out; @@ -8442,7 +8434,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) io_rsrc_node_switch(ctx, ctx->file_data); ret = 0; out: - io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + io_ring_submit_unlock(ctx, needs_lock); return ret; } @@ -8558,8 +8550,8 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, return io_wq_create(concurrency, &data); } -static int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx) +static __cold int io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx) { struct io_uring_task *tctx; int ret; @@ -8606,8 +8598,8 @@ void __io_uring_free(struct task_struct *tsk) tsk->io_uring = NULL; } -static int io_sq_offload_create(struct io_ring_ctx *ctx, - struct io_uring_params *p) +static __cold int io_sq_offload_create(struct io_ring_ctx *ctx, + struct io_uring_params *p) { int ret; @@ -9218,29 +9210,25 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) } } -static void io_req_cache_free(struct list_head *list) -{ - struct io_kiocb *req, *nxt; - - list_for_each_entry_safe(req, nxt, list, inflight_entry) { - list_del(&req->inflight_entry); - kmem_cache_free(req_cachep, req); - } -} - static void io_req_caches_free(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; + int nr = 0; mutex_lock(&ctx->uring_lock); + io_flush_cached_locked_reqs(ctx, state); - if (state->free_reqs) { - kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); - state->free_reqs = 0; - } + while (state->free_list.next) { + struct io_wq_work_node *node; + struct io_kiocb *req; - io_flush_cached_locked_reqs(ctx, state); - io_req_cache_free(&state->free_list); + node = wq_stack_extract(&state->free_list); + req = container_of(node, struct io_kiocb, comp_list); + kmem_cache_free(req_cachep, req); + nr++; + } + if (nr) + percpu_ref_put_many(&ctx->refs, nr); mutex_unlock(&ctx->uring_lock); } @@ -9250,7 +9238,7 @@ static void io_wait_rsrc_data(struct io_rsrc_data *data) wait_for_completion(&data->done); } -static void io_ring_ctx_free(struct io_ring_ctx *ctx) +static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); @@ -9259,6 +9247,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) ctx->mm_account = NULL; } + io_rsrc_refs_drop(ctx); /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ io_wait_rsrc_data(ctx->buf_data); io_wait_rsrc_data(ctx->file_data); @@ -9282,6 +9271,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) if (ctx->rsrc_backup_node) io_rsrc_node_destroy(ctx->rsrc_backup_node); flush_delayed_work(&ctx->rsrc_put_work); + flush_delayed_work(&ctx->fallback_work); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist)); @@ -9312,7 +9302,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; - poll_wait(file, &ctx->poll_wait, wait); + poll_wait(file, &ctx->cq_wait, wait); /* * synchronizes with barrier from wq_has_sleeper call in * io_commit_cqring @@ -9359,7 +9349,7 @@ struct io_tctx_exit { struct io_ring_ctx *ctx; }; -static void io_tctx_exit_cb(struct callback_head *cb) +static __cold void io_tctx_exit_cb(struct callback_head *cb) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_exit *work; @@ -9374,14 +9364,14 @@ static void io_tctx_exit_cb(struct callback_head *cb) complete(&work->completion); } -static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) +static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); return req->ctx == data; } -static void io_ring_exit_work(struct work_struct *work) +static __cold void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); unsigned long timeout = jiffies + HZ * 60 * 5; @@ -9410,6 +9400,8 @@ static void io_ring_exit_work(struct work_struct *work) io_sq_thread_unpark(sqd); } + io_req_caches_free(ctx); + if (WARN_ON_ONCE(time_after(jiffies, timeout))) { /* there is little hope left, don't run it too often */ interval = HZ * 60; @@ -9436,7 +9428,6 @@ static void io_ring_exit_work(struct work_struct *work) ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); if (WARN_ON_ONCE(ret)) continue; - wake_up_process(node->task); mutex_unlock(&ctx->uring_lock); wait_for_completion(&exit.completion); @@ -9450,8 +9441,8 @@ static void io_ring_exit_work(struct work_struct *work) } /* Returns true if we found and killed one or more timeouts */ -static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, - bool cancel_all) +static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, + struct task_struct *tsk, bool cancel_all) { struct io_kiocb *req, *tmp; int canceled = 0; @@ -9473,7 +9464,7 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, return canceled != 0; } -static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) +static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { unsigned long index; struct creds *creds; @@ -9535,8 +9526,9 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) return ret; } -static bool io_cancel_defer_files(struct io_ring_ctx *ctx, - struct task_struct *task, bool cancel_all) +static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, + struct task_struct *task, + bool cancel_all) { struct io_defer_entry *de; LIST_HEAD(list); @@ -9561,7 +9553,7 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx, return true; } -static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) +static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) { struct io_tctx_node *node; enum io_wq_cancel cret; @@ -9585,9 +9577,9 @@ static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) return ret; } -static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct task_struct *task, - bool cancel_all) +static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, + struct task_struct *task, + bool cancel_all) { struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; struct io_uring_task *tctx = task ? task->io_uring : NULL; @@ -9611,7 +9603,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, /* SQPOLL thread does its own polling */ if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || (ctx->sq_data && ctx->sq_data->thread == current)) { - while (!list_empty_careful(&ctx->iopoll_list)) { + while (!wq_list_empty(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); ret = true; } @@ -9638,7 +9630,16 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) ret = io_uring_alloc_task_context(current, ctx); if (unlikely(ret)) return ret; + tctx = current->io_uring; + if (ctx->iowq_limits_set) { + unsigned int limits[2] = { ctx->iowq_limits[0], + ctx->iowq_limits[1], }; + + ret = io_wq_max_workers(tctx->io_wq, limits); + if (ret) + return ret; + } } if (!xa_load(&tctx->xa, (unsigned long)ctx)) { node = kmalloc(sizeof(*node), GFP_KERNEL); @@ -9677,7 +9678,7 @@ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) /* * Remove this io_uring_file -> task mapping. */ -static void io_uring_del_tctx_node(unsigned long index) +static __cold void io_uring_del_tctx_node(unsigned long index) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; @@ -9700,7 +9701,7 @@ static void io_uring_del_tctx_node(unsigned long index) kfree(node); } -static void io_uring_clean_tctx(struct io_uring_task *tctx) +static __cold void io_uring_clean_tctx(struct io_uring_task *tctx) { struct io_wq *wq = tctx->io_wq; struct io_tctx_node *node; @@ -9727,7 +9728,7 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) return percpu_counter_sum(&tctx->inflight); } -static void io_uring_drop_tctx_refs(struct task_struct *task) +static __cold void io_uring_drop_tctx_refs(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; unsigned int refs = tctx->cached_refs; @@ -9743,7 +9744,8 @@ static void io_uring_drop_tctx_refs(struct task_struct *task) * Find any io_uring ctx that this task has registered or done IO on, and cancel * requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation. */ -static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) +static __cold void io_uring_cancel_generic(bool cancel_all, + struct io_sq_data *sqd) { struct io_uring_task *tctx = current->io_uring; struct io_ring_ctx *ctx; @@ -9836,7 +9838,7 @@ static void *io_uring_validate_mmap_request(struct file *file, #ifdef CONFIG_MMU -static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { size_t sz = vma->vm_end - vma->vm_start; unsigned long pfn; @@ -10021,7 +10023,7 @@ out_fput: } #ifdef CONFIG_PROC_FS -static int io_uring_show_cred(struct seq_file *m, unsigned int id, +static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, const struct cred *cred) { struct user_namespace *uns = seq_user_ns(m); @@ -10053,11 +10055,59 @@ static int io_uring_show_cred(struct seq_file *m, unsigned int id, return 0; } -static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) +static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, + struct seq_file *m) { struct io_sq_data *sq = NULL; + struct io_overflow_cqe *ocqe; + struct io_rings *r = ctx->rings; + unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; + unsigned int sq_head = READ_ONCE(r->sq.head); + unsigned int sq_tail = READ_ONCE(r->sq.tail); + unsigned int cq_head = READ_ONCE(r->cq.head); + unsigned int cq_tail = READ_ONCE(r->cq.tail); + unsigned int sq_entries, cq_entries; bool has_lock; - int i; + unsigned int i; + + /* + * we may get imprecise sqe and cqe info if uring is actively running + * since we get cached_sq_head and cached_cq_tail without uring_lock + * and sq_tail and cq_head are changed by userspace. But it's ok since + * we usually use these info when it is stuck. + */ + seq_printf(m, "SqMask:\t\t0x%x\n", sq_mask); + seq_printf(m, "SqHead:\t%u\n", sq_head); + seq_printf(m, "SqTail:\t%u\n", sq_tail); + seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); + seq_printf(m, "CqMask:\t0x%x\n", cq_mask); + seq_printf(m, "CqHead:\t%u\n", cq_head); + seq_printf(m, "CqTail:\t%u\n", cq_tail); + seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); + seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head); + sq_entries = min(sq_tail - sq_head, ctx->sq_entries); + for (i = 0; i < sq_entries; i++) { + unsigned int entry = i + sq_head; + unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); + struct io_uring_sqe *sqe = &ctx->sq_sqes[sq_idx]; + + if (sq_idx > sq_mask) + continue; + sqe = &ctx->sq_sqes[sq_idx]; + seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n", + sq_idx, sqe->opcode, sqe->fd, sqe->flags, + sqe->user_data); + } + seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); + cq_entries = min(cq_tail - cq_head, ctx->cq_entries); + for (i = 0; i < cq_entries; i++) { + unsigned int entry = i + cq_head; + struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask]; + + seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", + entry & cq_mask, cqe->user_data, cqe->res, + cqe->flags); + } /* * Avoid ABBA deadlock between the seq lock and the io_uring mutex, @@ -10099,7 +10149,10 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) xa_for_each(&ctx->personalities, index, cred) io_uring_show_cred(m, index, cred); } - seq_printf(m, "PollList:\n"); + if (has_lock) + mutex_unlock(&ctx->uring_lock); + + seq_puts(m, "PollList:\n"); spin_lock(&ctx->completion_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { struct hlist_head *list = &ctx->cancel_hash[i]; @@ -10109,12 +10162,20 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, " op=%d, task_works=%d\n", req->opcode, req->task->task_works != NULL); } + + seq_puts(m, "CqOverflowList:\n"); + list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { + struct io_uring_cqe *cqe = &ocqe->cqe; + + seq_printf(m, " user_data=%llu, res=%d, flags=%x\n", + cqe->user_data, cqe->res, cqe->flags); + + } + spin_unlock(&ctx->completion_lock); - if (has_lock) - mutex_unlock(&ctx->uring_lock); } -static void io_uring_show_fdinfo(struct seq_file *m, struct file *f) +static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) { struct io_ring_ctx *ctx = f->private_data; @@ -10138,8 +10199,8 @@ static const struct file_operations io_uring_fops = { #endif }; -static int io_allocate_scq_urings(struct io_ring_ctx *ctx, - struct io_uring_params *p) +static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, + struct io_uring_params *p) { struct io_rings *rings; size_t size, sq_array_offset; @@ -10228,8 +10289,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) return file; } -static int io_uring_create(unsigned entries, struct io_uring_params *p, - struct io_uring_params __user *params) +static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, + struct io_uring_params __user *params) { struct io_ring_ctx *ctx; struct file *file; @@ -10387,7 +10448,8 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, return io_uring_setup(entries, params); } -static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) +static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) { struct io_uring_probe *p; size_t size; @@ -10443,8 +10505,8 @@ static int io_register_personality(struct io_ring_ctx *ctx) return id; } -static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg, - unsigned int nr_args) +static __cold int io_register_restrictions(struct io_ring_ctx *ctx, + void __user *arg, unsigned int nr_args) { struct io_uring_restriction *res; size_t size; @@ -10578,7 +10640,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, return __io_register_rsrc_update(ctx, type, &up, up.nr); } -static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, +static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type) { struct io_uring_rsrc_register rr; @@ -10604,8 +10666,8 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; } -static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, - unsigned len) +static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, + void __user *arg, unsigned len) { struct io_uring_task *tctx = current->io_uring; cpumask_var_t new_mask; @@ -10631,7 +10693,7 @@ static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, return ret; } -static int io_unregister_iowq_aff(struct io_ring_ctx *ctx) +static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; @@ -10641,9 +10703,11 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx) return io_wq_cpu_affinity(tctx->io_wq, NULL); } -static int io_register_iowq_max_workers(struct io_ring_ctx *ctx, - void __user *arg) +static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, + void __user *arg) + __must_hold(&ctx->uring_lock) { + struct io_tctx_node *node; struct io_uring_task *tctx = NULL; struct io_sq_data *sqd = NULL; __u32 new_count[2]; @@ -10674,13 +10738,19 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx, tctx = current->io_uring; } - ret = -EINVAL; - if (!tctx || !tctx->io_wq) - goto err; + BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); - ret = io_wq_max_workers(tctx->io_wq, new_count); - if (ret) - goto err; + memcpy(ctx->iowq_limits, new_count, sizeof(new_count)); + ctx->iowq_limits_set = true; + + ret = -EINVAL; + if (tctx && tctx->io_wq) { + ret = io_wq_max_workers(tctx->io_wq, new_count); + if (ret) + goto err; + } else { + memset(new_count, 0, sizeof(new_count)); + } if (sqd) { mutex_unlock(&sqd->lock); @@ -10690,6 +10760,22 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx, if (copy_to_user(arg, new_count, sizeof(new_count))) return -EFAULT; + /* that's it for SQPOLL, only the SQPOLL task creates requests */ + if (sqd) + return 0; + + /* now propagate the restriction to all registered users */ + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { + struct io_uring_task *tctx = node->task->io_uring; + + if (WARN_ON_ONCE(!tctx->io_wq)) + continue; + + for (i = 0; i < ARRAY_SIZE(new_count); i++) + new_count[i] = ctx->iowq_limits[i]; + /* ignore errors, it always returns zero anyway */ + (void)io_wq_max_workers(tctx->io_wq, new_count); + } return 0; err: if (sqd) { @@ -10723,7 +10809,7 @@ static bool io_register_op_must_quiesce(int op) } } -static int io_ctx_quiesce(struct io_ring_ctx *ctx) +static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx) { long ret; @@ -10738,10 +10824,14 @@ static int io_ctx_quiesce(struct io_ring_ctx *ctx) */ mutex_unlock(&ctx->uring_lock); do { - ret = wait_for_completion_interruptible(&ctx->ref_comp); - if (!ret) + ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ); + if (ret) { + ret = min(0L, ret); break; + } + ret = io_run_task_work_sig(); + io_req_caches_free(ctx); } while (ret >= 0); mutex_lock(&ctx->uring_lock); @@ -10972,6 +11062,8 @@ static int __init io_uring_init(void) /* should fit into one byte */ BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); + BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); + BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 4ecd255e0511..811c898125a5 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -38,8 +38,7 @@ struct iomap_dio { struct { struct iov_iter *iter; struct task_struct *waiter; - struct request_queue *last_queue; - blk_qc_t cookie; + struct bio *poll_bio; } submit; /* used for aio completion: */ @@ -49,29 +48,20 @@ struct iomap_dio { }; }; -int iomap_dio_iopoll(struct kiocb *kiocb, bool spin) -{ - struct request_queue *q = READ_ONCE(kiocb->private); - - if (!q) - return 0; - return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin); -} -EXPORT_SYMBOL_GPL(iomap_dio_iopoll); - static void iomap_dio_submit_bio(const struct iomap_iter *iter, struct iomap_dio *dio, struct bio *bio, loff_t pos) { atomic_inc(&dio->ref); - if (dio->iocb->ki_flags & IOCB_HIPRI) + if (dio->iocb->ki_flags & IOCB_HIPRI) { bio_set_polled(bio, dio->iocb); + dio->submit.poll_bio = bio; + } - dio->submit.last_queue = bdev_get_queue(iter->iomap.bdev); if (dio->dops && dio->dops->submit_io) - dio->submit.cookie = dio->dops->submit_io(iter, bio, pos); + dio->dops->submit_io(iter, bio, pos); else - dio->submit.cookie = submit_bio(bio); + submit_bio(bio); } ssize_t iomap_dio_complete(struct iomap_dio *dio) @@ -135,7 +125,7 @@ static void iomap_dio_complete_work(struct work_struct *work) struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); struct kiocb *iocb = dio->iocb; - iocb->ki_complete(iocb, iomap_dio_complete(dio), 0); + iocb->ki_complete(iocb, iomap_dio_complete(dio)); } /* @@ -164,9 +154,11 @@ static void iomap_dio_bio_end_io(struct bio *bio) } else if (dio->flags & IOMAP_DIO_WRITE) { struct inode *inode = file_inode(dio->iocb->ki_filp); + WRITE_ONCE(dio->iocb->private, NULL); INIT_WORK(&dio->aio.work, iomap_dio_complete_work); queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); } else { + WRITE_ONCE(dio->iocb->private, NULL); iomap_dio_complete_work(&dio->aio.work); } } @@ -282,6 +274,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, if (!iov_iter_count(dio->submit.iter)) goto out; + /* + * We can only poll for single bio I/Os. + */ + if (need_zeroout || + ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) + dio->iocb->ki_flags &= ~IOCB_HIPRI; + if (need_zeroout) { /* zero out from the start of the block to the write offset */ pad = pos & (fs_block_size - 1); @@ -339,6 +338,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); + /* + * We can only poll for single bio I/Os. + */ + if (nr_pages) + dio->iocb->ki_flags &= ~IOCB_HIPRI; iomap_dio_submit_bio(iter, dio, bio, pos); pos += n; } while (nr_pages); @@ -485,8 +489,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->submit.iter = iter; dio->submit.waiter = current; - dio->submit.cookie = BLK_QC_T_NONE; - dio->submit.last_queue = NULL; + dio->submit.poll_bio = NULL; if (iov_iter_rw(iter) == READ) { if (iomi.pos >= dio->i_size) @@ -565,8 +568,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, inode_dio_begin(inode); blk_start_plug(&plug); - while ((ret = iomap_iter(&iomi, ops)) > 0) + while ((ret = iomap_iter(&iomi, ops)) > 0) { iomi.processed = iomap_dio_iter(&iomi, dio); + + /* + * We can only poll for single bio I/Os. + */ + iocb->ki_flags &= ~IOCB_HIPRI; + } + blk_finish_plug(&plug); /* @@ -592,8 +602,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (dio->flags & IOMAP_DIO_WRITE_FUA) dio->flags &= ~IOMAP_DIO_NEED_SYNC; - WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie); - WRITE_ONCE(iocb->private, dio->submit.last_queue); + WRITE_ONCE(iocb->private, dio->submit.poll_bio); /* * We are about to drop our additional submission reference, which @@ -620,10 +629,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!READ_ONCE(dio->submit.waiter)) break; - if (!(iocb->ki_flags & IOCB_HIPRI) || - !dio->submit.last_queue || - !blk_poll(dio->submit.last_queue, - dio->submit.cookie, true)) + if (!dio->submit.poll_bio || + !bio_poll(dio->submit.poll_bio, NULL, 0)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 176580f54af9..104ae698443e 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -13,6 +13,7 @@ #include <linux/buffer_head.h> #include <linux/mempool.h> #include <linux/seq_file.h> +#include <linux/writeback.h> #include "jfs_incore.h" #include "jfs_superblock.h" #include "jfs_filsys.h" diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index bde787c354fc..8b9a72ae5efa 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -86,8 +86,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) goto out; } - VolumeSize = i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits; - + VolumeSize = sb_bdev_nr_blocks(sb); if (VolumeSize) { if (newLVSize > VolumeSize) { printk(KERN_WARNING "jfs_extendfs: invalid size\n"); @@ -199,7 +198,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) txQuiesce(sb); /* Reset size of direct inode */ - sbi->direct_inode->i_size = i_size_read(sb->s_bdev->bd_inode); + sbi->direct_inode->i_size = bdev_nr_bytes(sb->s_bdev); if (sbi->mntflag & JFS_INLINELOG) { /* diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 9030aeaf0f88..24cbc9946e01 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -284,8 +284,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, } case Opt_resize_nosize: { - *newLVSize = i_size_read(sb->s_bdev->bd_inode) >> - sb->s_blocksize_bits; + *newLVSize = sb_bdev_nr_blocks(sb); if (*newLVSize == 0) pr_err("JFS: Cannot determine volume size\n"); break; @@ -551,7 +550,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) ret = -ENOMEM; goto out_unload; } - inode->i_size = i_size_read(sb->s_bdev->bd_inode); + inode->i_size = bdev_nr_bytes(sb->s_bdev); inode->i_mapping->a_ops = &jfs_metapage_aops; inode_fake_hash(inode); mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index 87aac4c72c37..1b07550485b9 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -178,7 +178,7 @@ int kernel_read_file_from_fd(int fd, loff_t offset, void **buf, struct fd f = fdget(fd); int ret = -EBADF; - if (!f.file) + if (!f.file || !(f.file->f_mode & FMODE_READ)) goto out; ret = kernel_read_file(f.file, offset, buf, buf_size, file_size, id); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index cfc3ce8b815a..8e0a1378a4b1 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1111,7 +1111,14 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, kn = kernfs_find_ns(parent, dentry->d_name.name, ns); /* attach dentry and inode */ - if (kn && kernfs_active(kn)) { + if (kn) { + /* Inactive nodes are invisible to the VFS so don't + * create a negative. + */ + if (!kernfs_active(kn)) { + up_read(&kernfs_rwsem); + return NULL; + } inode = kernfs_get_inode(dir->i_sb, kn); if (!inode) inode = ERR_PTR(-ENOMEM); diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c index 71c989f1568d..30a92ddc1817 100644 --- a/fs/ksmbd/auth.c +++ b/fs/ksmbd/auth.c @@ -298,8 +298,8 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, int blob_len, struct ksmbd_session *sess) { char *domain_name; - unsigned int lm_off, nt_off; - unsigned short nt_len; + unsigned int nt_off, dn_off; + unsigned short nt_len, dn_len; int ret; if (blob_len < sizeof(struct authenticate_message)) { @@ -314,15 +314,17 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, return -EINVAL; } - lm_off = le32_to_cpu(authblob->LmChallengeResponse.BufferOffset); nt_off = le32_to_cpu(authblob->NtChallengeResponse.BufferOffset); nt_len = le16_to_cpu(authblob->NtChallengeResponse.Length); + dn_off = le32_to_cpu(authblob->DomainName.BufferOffset); + dn_len = le16_to_cpu(authblob->DomainName.Length); + + if (blob_len < (u64)dn_off + dn_len || blob_len < (u64)nt_off + nt_len) + return -EINVAL; /* TODO : use domain name that imported from configuration file */ - domain_name = smb_strndup_from_utf16((const char *)authblob + - le32_to_cpu(authblob->DomainName.BufferOffset), - le16_to_cpu(authblob->DomainName.Length), true, - sess->conn->local_nls); + domain_name = smb_strndup_from_utf16((const char *)authblob + dn_off, + dn_len, true, sess->conn->local_nls); if (IS_ERR(domain_name)) return PTR_ERR(domain_name); diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index af086d35398a..b57a0d8a392f 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -61,6 +61,8 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) conn->local_nls = load_nls_default(); atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); + conn->total_credits = 1; + init_waitqueue_head(&conn->req_running_q); INIT_LIST_HEAD(&conn->conns_list); INIT_LIST_HEAD(&conn->sessions); @@ -296,10 +298,12 @@ int ksmbd_conn_handler_loop(void *p) pdu_size = get_rfc1002_len(hdr_buf); ksmbd_debug(CONN, "RFC1002 header %u bytes\n", pdu_size); - /* make sure we have enough to get to SMB header end */ - if (!ksmbd_pdu_size_has_room(pdu_size)) { - ksmbd_debug(CONN, "SMB request too short (%u bytes)\n", - pdu_size); + /* + * Check if pdu size is valid (min : smb header size, + * max : 0x00FFFFFF). + */ + if (pdu_size < __SMB2_HEADER_STRUCTURE_SIZE || + pdu_size > MAX_STREAM_PROT_LEN) { continue; } diff --git a/fs/ksmbd/glob.h b/fs/ksmbd/glob.h index 49a5a3afa118..5b8f3e0ebdb3 100644 --- a/fs/ksmbd/glob.h +++ b/fs/ksmbd/glob.h @@ -12,7 +12,7 @@ #include "unicode.h" #include "vfs_cache.h" -#define KSMBD_VERSION "3.1.9" +#define KSMBD_VERSION "3.4.2" extern int ksmbd_debug_types; diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index 2fbe2bc1e093..c6718a05d347 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -211,6 +211,7 @@ struct ksmbd_tree_disconnect_request { */ struct ksmbd_logout_request { __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */ + __u32 account_flags; }; /* @@ -317,6 +318,7 @@ enum KSMBD_TREE_CONN_STATUS { #define KSMBD_USER_FLAG_BAD_UID BIT(2) #define KSMBD_USER_FLAG_BAD_USER BIT(3) #define KSMBD_USER_FLAG_GUEST_ACCOUNT BIT(4) +#define KSMBD_USER_FLAG_DELAY_SESSION BIT(5) /* * Share config flags. diff --git a/fs/ksmbd/mgmt/user_config.c b/fs/ksmbd/mgmt/user_config.c index d21629ae5c89..1019d3677d55 100644 --- a/fs/ksmbd/mgmt/user_config.c +++ b/fs/ksmbd/mgmt/user_config.c @@ -55,7 +55,7 @@ struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp) void ksmbd_free_user(struct ksmbd_user *user) { - ksmbd_ipc_logout_request(user->name); + ksmbd_ipc_logout_request(user->name, user->flags); kfree(user->name); kfree(user->passkey); kfree(user); diff --git a/fs/ksmbd/mgmt/user_config.h b/fs/ksmbd/mgmt/user_config.h index b2bb074a0150..aff80b029579 100644 --- a/fs/ksmbd/mgmt/user_config.h +++ b/fs/ksmbd/mgmt/user_config.h @@ -18,6 +18,7 @@ struct ksmbd_user { size_t passkey_sz; char *passkey; + unsigned int failed_login_count; }; static inline bool user_guest(struct ksmbd_user *user) diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c index 9aa46bb3e10d..030ca57c3784 100644 --- a/fs/ksmbd/smb2misc.c +++ b/fs/ksmbd/smb2misc.c @@ -80,18 +80,21 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = { }; /* - * Returns the pointer to the beginning of the data area. Length of the data - * area and the offset to it (from the beginning of the smb are also returned. + * Set length of the data area and the offset to arguments. + * if they are invalid, return error. */ -static char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) +static int smb2_get_data_area_len(unsigned int *off, unsigned int *len, + struct smb2_hdr *hdr) { + int ret = 0; + *off = 0; *len = 0; /* error reqeusts do not have data area */ if (hdr->Status && hdr->Status != STATUS_MORE_PROCESSING_REQUIRED && (((struct smb2_err_rsp *)hdr)->StructureSize) == SMB2_ERROR_STRUCTURE_SIZE2_LE) - return NULL; + return ret; /* * Following commands have data areas so we have to get the location @@ -165,69 +168,60 @@ static char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) case SMB2_IOCTL: *off = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputOffset); *len = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputCount); - break; default: ksmbd_debug(SMB, "no length check for command\n"); break; } - /* - * Invalid length or offset probably means data area is invalid, but - * we have little choice but to ignore the data area in this case. - */ if (*off > 4096) { - ksmbd_debug(SMB, "offset %d too large, data area ignored\n", - *off); - *len = 0; - *off = 0; - } else if (*off < 0) { - ksmbd_debug(SMB, - "negative offset %d to data invalid ignore data area\n", - *off); - *off = 0; - *len = 0; - } else if (*len < 0) { - ksmbd_debug(SMB, - "negative data length %d invalid, data area ignored\n", - *len); - *len = 0; - } else if (*len > 128 * 1024) { - ksmbd_debug(SMB, "data area larger than 128K: %d\n", *len); - *len = 0; + ksmbd_debug(SMB, "offset %d too large\n", *off); + ret = -EINVAL; + } else if ((u64)*off + *len > MAX_STREAM_PROT_LEN) { + ksmbd_debug(SMB, "Request is larger than maximum stream protocol length(%u): %llu\n", + MAX_STREAM_PROT_LEN, (u64)*off + *len); + ret = -EINVAL; } - /* return pointer to beginning of data area, ie offset from SMB start */ - if ((*off != 0) && (*len != 0)) - return (char *)hdr + *off; - else - return NULL; + return ret; } /* * Calculate the size of the SMB message based on the fixed header * portion, the number of word parameters and the data portion of the message. */ -static unsigned int smb2_calc_size(void *buf) +static int smb2_calc_size(void *buf, unsigned int *len) { struct smb2_pdu *pdu = (struct smb2_pdu *)buf; struct smb2_hdr *hdr = &pdu->hdr; - int offset; /* the offset from the beginning of SMB to data area */ - int data_length; /* the length of the variable length data area */ + unsigned int offset; /* the offset from the beginning of SMB to data area */ + unsigned int data_length; /* the length of the variable length data area */ + int ret; + /* Structure Size has already been checked to make sure it is 64 */ - int len = le16_to_cpu(hdr->StructureSize); + *len = le16_to_cpu(hdr->StructureSize); /* * StructureSize2, ie length of fixed parameter area has already * been checked to make sure it is the correct length. */ - len += le16_to_cpu(pdu->StructureSize2); + *len += le16_to_cpu(pdu->StructureSize2); + /* + * StructureSize2 of smb2_lock pdu is set to 48, indicating + * the size of smb2 lock request with single smb2_lock_element + * regardless of number of locks. Subtract single + * smb2_lock_element for correct buffer size check. + */ + if (hdr->Command == SMB2_LOCK) + *len -= sizeof(struct smb2_lock_element); if (has_smb2_data_area[le16_to_cpu(hdr->Command)] == false) goto calc_size_exit; - smb2_get_data_area_len(&offset, &data_length, hdr); - ksmbd_debug(SMB, "SMB2 data length %d offset %d\n", data_length, + ret = smb2_get_data_area_len(&offset, &data_length, hdr); + if (ret) + return ret; + ksmbd_debug(SMB, "SMB2 data length %u offset %u\n", data_length, offset); if (data_length > 0) { @@ -237,16 +231,19 @@ static unsigned int smb2_calc_size(void *buf) * for some commands, typically those with odd StructureSize, * so we must add one to the calculation. */ - if (offset + 1 < len) + if (offset + 1 < *len) { ksmbd_debug(SMB, - "data area offset %d overlaps SMB2 header %d\n", - offset + 1, len); - else - len = offset + data_length; + "data area offset %d overlaps SMB2 header %u\n", + offset + 1, *len); + return -EINVAL; + } + + *len = offset + data_length; } + calc_size_exit: - ksmbd_debug(SMB, "SMB2 len %d\n", len); - return len; + ksmbd_debug(SMB, "SMB2 len %u\n", *len); + return 0; } static inline int smb2_query_info_req_len(struct smb2_query_info_req *h) @@ -287,11 +284,13 @@ static inline int smb2_ioctl_resp_len(struct smb2_ioctl_req *h) le32_to_cpu(h->MaxOutputResponse); } -static int smb2_validate_credit_charge(struct smb2_hdr *hdr) +static int smb2_validate_credit_charge(struct ksmbd_conn *conn, + struct smb2_hdr *hdr) { - int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len; - int credit_charge = le16_to_cpu(hdr->CreditCharge); + unsigned int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len; + unsigned short credit_charge = le16_to_cpu(hdr->CreditCharge); void *__hdr = hdr; + int ret; switch (hdr->Command) { case SMB2_QUERY_INFO: @@ -313,21 +312,37 @@ static int smb2_validate_credit_charge(struct smb2_hdr *hdr) req_len = smb2_ioctl_req_len(__hdr); expect_resp_len = smb2_ioctl_resp_len(__hdr); break; - default: + case SMB2_CANCEL: return 0; + default: + req_len = 1; + break; } - credit_charge = max(1, credit_charge); - max_len = max(req_len, expect_resp_len); + credit_charge = max_t(unsigned short, credit_charge, 1); + max_len = max_t(unsigned int, req_len, expect_resp_len); calc_credit_num = DIV_ROUND_UP(max_len, SMB2_MAX_BUFFER_SIZE); if (credit_charge < calc_credit_num) { - pr_err("Insufficient credit charge, given: %d, needed: %d\n", - credit_charge, calc_credit_num); + ksmbd_debug(SMB, "Insufficient credit charge, given: %d, needed: %d\n", + credit_charge, calc_credit_num); + return 1; + } else if (credit_charge > conn->max_credits) { + ksmbd_debug(SMB, "Too large credit charge: %d\n", credit_charge); return 1; } - return 0; + spin_lock(&conn->credits_lock); + if (credit_charge <= conn->total_credits) { + conn->total_credits -= credit_charge; + ret = 0; + } else { + ksmbd_debug(SMB, "Insufficient credits granted, given: %u, granted: %u\n", + credit_charge, conn->total_credits); + ret = 1; + } + spin_unlock(&conn->credits_lock); + return ret; } int ksmbd_smb2_check_message(struct ksmbd_work *work) @@ -385,24 +400,20 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) } } - if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) && - smb2_validate_credit_charge(hdr)) { - work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER); + if (smb2_calc_size(hdr, &clc_len)) return 1; - } - clc_len = smb2_calc_size(hdr); if (len != clc_len) { - /* server can return one byte more due to implied bcc[0] */ + /* client can return one byte more due to implied bcc[0] */ if (clc_len == len + 1) - return 0; + goto validate_credit; /* * Some windows servers (win2016) will pad also the final * PDU in a compound to 8 bytes. */ if (ALIGN(clc_len, 8) == len) - return 0; + goto validate_credit; /* * windows client also pad up to 8 bytes when compounding. @@ -415,12 +426,9 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) "cli req padded more than expected. Length %d not %d for cmd:%d mid:%llu\n", len, clc_len, command, le64_to_cpu(hdr->MessageId)); - return 0; + goto validate_credit; } - if (command == SMB2_LOCK_HE && len == 88) - return 0; - ksmbd_debug(SMB, "cli req too short, len %d not %d. cmd:%d mid:%llu\n", len, clc_len, command, @@ -429,6 +437,13 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) return 1; } +validate_credit: + if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) && + smb2_validate_credit_charge(work->conn, hdr)) { + work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER); + return 1; + } + return 0; } diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c index 197473871aa4..fb6a65d23139 100644 --- a/fs/ksmbd/smb2ops.c +++ b/fs/ksmbd/smb2ops.c @@ -187,11 +187,6 @@ static struct smb_version_cmds smb2_0_server_cmds[NUMBER_OF_SMB2_COMMANDS] = { [SMB2_CHANGE_NOTIFY_HE] = { .proc = smb2_notify}, }; -int init_smb2_0_server(struct ksmbd_conn *conn) -{ - return -EOPNOTSUPP; -} - /** * init_smb2_1_server() - initialize a smb server connection with smb2.1 * command dispatcher @@ -289,6 +284,7 @@ int init_smb3_11_server(struct ksmbd_conn *conn) void init_smb2_max_read_size(unsigned int sz) { + sz = clamp_val(sz, SMB3_MIN_IOSIZE, SMB3_MAX_IOSIZE); smb21_server_values.max_read_size = sz; smb30_server_values.max_read_size = sz; smb302_server_values.max_read_size = sz; @@ -297,6 +293,7 @@ void init_smb2_max_read_size(unsigned int sz) void init_smb2_max_write_size(unsigned int sz) { + sz = clamp_val(sz, SMB3_MIN_IOSIZE, SMB3_MAX_IOSIZE); smb21_server_values.max_write_size = sz; smb30_server_values.max_write_size = sz; smb302_server_values.max_write_size = sz; @@ -305,6 +302,7 @@ void init_smb2_max_write_size(unsigned int sz) void init_smb2_max_trans_size(unsigned int sz) { + sz = clamp_val(sz, SMB3_MIN_IOSIZE, SMB3_MAX_IOSIZE); smb21_server_values.max_trans_size = sz; smb30_server_values.max_trans_size = sz; smb302_server_values.max_trans_size = sz; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index dcf907738610..7e448df3f847 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -236,9 +236,6 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) if (conn->need_neg == false) return -EINVAL; - if (!(conn->dialect >= SMB20_PROT_ID && - conn->dialect <= SMB311_PROT_ID)) - return -EINVAL; rsp_hdr = work->response_buf; @@ -295,22 +292,6 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) return 0; } -static int smb2_consume_credit_charge(struct ksmbd_work *work, - unsigned short credit_charge) -{ - struct ksmbd_conn *conn = work->conn; - unsigned int rsp_credits = 1; - - if (!conn->total_credits) - return 0; - - if (credit_charge > 0) - rsp_credits = credit_charge; - - conn->total_credits -= rsp_credits; - return rsp_credits; -} - /** * smb2_set_rsp_credits() - set number of credits in response buffer * @work: smb work containing smb response buffer @@ -320,49 +301,43 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work); struct smb2_hdr *hdr = ksmbd_resp_buf_next(work); struct ksmbd_conn *conn = work->conn; - unsigned short credits_requested = le16_to_cpu(req_hdr->CreditRequest); - unsigned short credit_charge = 1, credits_granted = 0; - unsigned short aux_max, aux_credits, min_credits; - int rsp_credit_charge; + unsigned short credits_requested; + unsigned short credit_charge, credits_granted = 0; + unsigned short aux_max, aux_credits; - if (hdr->Command == SMB2_CANCEL) - goto out; + if (work->send_no_response) + return 0; - /* get default minimum credits by shifting maximum credits by 4 */ - min_credits = conn->max_credits >> 4; + hdr->CreditCharge = req_hdr->CreditCharge; - if (conn->total_credits >= conn->max_credits) { + if (conn->total_credits > conn->max_credits) { + hdr->CreditRequest = 0; pr_err("Total credits overflow: %d\n", conn->total_credits); - conn->total_credits = min_credits; + return -EINVAL; } - rsp_credit_charge = - smb2_consume_credit_charge(work, le16_to_cpu(req_hdr->CreditCharge)); - if (rsp_credit_charge < 0) - return -EINVAL; + credit_charge = max_t(unsigned short, + le16_to_cpu(req_hdr->CreditCharge), 1); + credits_requested = max_t(unsigned short, + le16_to_cpu(req_hdr->CreditRequest), 1); - hdr->CreditCharge = cpu_to_le16(rsp_credit_charge); + /* according to smb2.credits smbtorture, Windows server + * 2016 or later grant up to 8192 credits at once. + * + * TODO: Need to adjuct CreditRequest value according to + * current cpu load + */ + aux_credits = credits_requested - 1; + if (hdr->Command == SMB2_NEGOTIATE) + aux_max = 0; + else + aux_max = conn->max_credits - credit_charge; + aux_credits = min_t(unsigned short, aux_credits, aux_max); + credits_granted = credit_charge + aux_credits; - if (credits_requested > 0) { - aux_credits = credits_requested - 1; - aux_max = 32; - if (hdr->Command == SMB2_NEGOTIATE) - aux_max = 0; - aux_credits = (aux_credits < aux_max) ? aux_credits : aux_max; - credits_granted = aux_credits + credit_charge; - - /* if credits granted per client is getting bigger than default - * minimum credits then we should wrap it up within the limits. - */ - if ((conn->total_credits + credits_granted) > min_credits) - credits_granted = min_credits - conn->total_credits; - /* - * TODO: Need to adjuct CreditRequest value according to - * current cpu load - */ - } else if (conn->total_credits == 0) { - credits_granted = 1; - } + if (conn->max_credits - conn->total_credits < credits_granted) + credits_granted = conn->max_credits - + conn->total_credits; conn->total_credits += credits_granted; work->credits_granted += credits_granted; @@ -371,7 +346,6 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) /* Update CreditRequest in last request */ hdr->CreditRequest = cpu_to_le16(work->credits_granted); } -out: ksmbd_debug(SMB, "credits: requested[%d] granted[%d] total_granted[%d]\n", credits_requested, credits_granted, @@ -475,6 +449,12 @@ bool is_chained_smb2_message(struct ksmbd_work *work) return false; } + if ((u64)get_rfc1002_len(work->response_buf) + MAX_CIFS_SMALL_BUFFER_SIZE > + work->response_sz) { + pr_err("next response offset exceeds response buffer size\n"); + return false; + } + ksmbd_debug(SMB, "got SMB2 chained command\n"); init_chained_smb2_rsp(work); return true; @@ -544,7 +524,7 @@ int smb2_allocate_rsp_buf(struct ksmbd_work *work) { struct smb2_hdr *hdr = work->request_buf; size_t small_sz = MAX_CIFS_SMALL_BUFFER_SIZE; - size_t large_sz = work->conn->vals->max_trans_size + MAX_SMB2_HDR_SIZE; + size_t large_sz = small_sz + work->conn->vals->max_trans_size; size_t sz = small_sz; int cmd = le16_to_cpu(hdr->Command); @@ -1166,13 +1146,6 @@ int smb2_handle_negotiate(struct ksmbd_work *work) case SMB21_PROT_ID: init_smb2_1_server(conn); break; - case SMB20_PROT_ID: - rc = init_smb2_0_server(conn); - if (rc) { - rsp->hdr.Status = STATUS_NOT_SUPPORTED; - goto err_out; - } - break; case SMB2X_PROT_ID: case BAD_PROT_ID: default: @@ -1191,11 +1164,9 @@ int smb2_handle_negotiate(struct ksmbd_work *work) rsp->MaxReadSize = cpu_to_le32(conn->vals->max_read_size); rsp->MaxWriteSize = cpu_to_le32(conn->vals->max_write_size); - if (conn->dialect > SMB20_PROT_ID) { - memcpy(conn->ClientGUID, req->ClientGUID, - SMB2_CLIENT_GUID_SIZE); - conn->cli_sec_mode = le16_to_cpu(req->SecurityMode); - } + memcpy(conn->ClientGUID, req->ClientGUID, + SMB2_CLIENT_GUID_SIZE); + conn->cli_sec_mode = le16_to_cpu(req->SecurityMode); rsp->StructureSize = cpu_to_le16(65); rsp->DialectRevision = cpu_to_le16(conn->dialect); @@ -1286,19 +1257,13 @@ static int generate_preauth_hash(struct ksmbd_work *work) return 0; } -static int decode_negotiation_token(struct ksmbd_work *work, - struct negotiate_message *negblob) +static int decode_negotiation_token(struct ksmbd_conn *conn, + struct negotiate_message *negblob, + size_t sz) { - struct ksmbd_conn *conn = work->conn; - struct smb2_sess_setup_req *req; - int sz; - if (!conn->use_spnego) return -EINVAL; - req = work->request_buf; - sz = le16_to_cpu(req->SecurityBufferLength); - if (ksmbd_decode_negTokenInit((char *)negblob, sz, conn)) { if (ksmbd_decode_negTokenTarg((char *)negblob, sz, conn)) { conn->auth_mechs |= KSMBD_AUTH_NTLMSSP; @@ -1310,9 +1275,9 @@ static int decode_negotiation_token(struct ksmbd_work *work, } static int ntlm_negotiate(struct ksmbd_work *work, - struct negotiate_message *negblob) + struct negotiate_message *negblob, + size_t negblob_len) { - struct smb2_sess_setup_req *req = work->request_buf; struct smb2_sess_setup_rsp *rsp = work->response_buf; struct challenge_message *chgblob; unsigned char *spnego_blob = NULL; @@ -1321,8 +1286,7 @@ static int ntlm_negotiate(struct ksmbd_work *work, int sz, rc; ksmbd_debug(SMB, "negotiate phase\n"); - sz = le16_to_cpu(req->SecurityBufferLength); - rc = ksmbd_decode_ntlmssp_neg_blob(negblob, sz, work->sess); + rc = ksmbd_decode_ntlmssp_neg_blob(negblob, negblob_len, work->sess); if (rc) return rc; @@ -1390,12 +1354,23 @@ static struct ksmbd_user *session_user(struct ksmbd_conn *conn, struct authenticate_message *authblob; struct ksmbd_user *user; char *name; - int sz; + unsigned int auth_msg_len, name_off, name_len, secbuf_len; + secbuf_len = le16_to_cpu(req->SecurityBufferLength); + if (secbuf_len < sizeof(struct authenticate_message)) { + ksmbd_debug(SMB, "blob len %d too small\n", secbuf_len); + return NULL; + } authblob = user_authblob(conn, req); - sz = le32_to_cpu(authblob->UserName.BufferOffset); - name = smb_strndup_from_utf16((const char *)authblob + sz, - le16_to_cpu(authblob->UserName.Length), + name_off = le32_to_cpu(authblob->UserName.BufferOffset); + name_len = le16_to_cpu(authblob->UserName.Length); + auth_msg_len = le16_to_cpu(req->SecurityBufferOffset) + secbuf_len; + + if (auth_msg_len < (u64)name_off + name_len) + return NULL; + + name = smb_strndup_from_utf16((const char *)authblob + name_off, + name_len, true, conn->local_nls); if (IS_ERR(name)) { @@ -1537,11 +1512,9 @@ binding_session: } } - if (conn->dialect > SMB20_PROT_ID) { - if (!ksmbd_conn_lookup_dialect(conn)) { - pr_err("fail to verify the dialect\n"); - return -ENOENT; - } + if (!ksmbd_conn_lookup_dialect(conn)) { + pr_err("fail to verify the dialect\n"); + return -ENOENT; } return 0; } @@ -1623,11 +1596,9 @@ static int krb5_authenticate(struct ksmbd_work *work) } } - if (conn->dialect > SMB20_PROT_ID) { - if (!ksmbd_conn_lookup_dialect(conn)) { - pr_err("fail to verify the dialect\n"); - return -ENOENT; - } + if (!ksmbd_conn_lookup_dialect(conn)) { + pr_err("fail to verify the dialect\n"); + return -ENOENT; } return 0; } @@ -1645,6 +1616,7 @@ int smb2_sess_setup(struct ksmbd_work *work) struct smb2_sess_setup_rsp *rsp = work->response_buf; struct ksmbd_session *sess; struct negotiate_message *negblob; + unsigned int negblob_len, negblob_off; int rc = 0; ksmbd_debug(SMB, "Received request for session setup\n"); @@ -1725,10 +1697,16 @@ int smb2_sess_setup(struct ksmbd_work *work) if (sess->state == SMB2_SESSION_EXPIRED) sess->state = SMB2_SESSION_IN_PROGRESS; + negblob_off = le16_to_cpu(req->SecurityBufferOffset); + negblob_len = le16_to_cpu(req->SecurityBufferLength); + if (negblob_off < (offsetof(struct smb2_sess_setup_req, Buffer) - 4) || + negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) + return -EINVAL; + negblob = (struct negotiate_message *)((char *)&req->hdr.ProtocolId + - le16_to_cpu(req->SecurityBufferOffset)); + negblob_off); - if (decode_negotiation_token(work, negblob) == 0) { + if (decode_negotiation_token(conn, negblob, negblob_len) == 0) { if (conn->mechToken) negblob = (struct negotiate_message *)conn->mechToken; } @@ -1752,7 +1730,7 @@ int smb2_sess_setup(struct ksmbd_work *work) sess->Preauth_HashValue = NULL; } else if (conn->preferred_auth_mech == KSMBD_AUTH_NTLMSSP) { if (negblob->MessageType == NtLmNegotiate) { - rc = ntlm_negotiate(work, negblob); + rc = ntlm_negotiate(work, negblob, negblob_len); if (rc) goto out_err; rsp->hdr.Status = @@ -1812,9 +1790,30 @@ out_err: conn->mechToken = NULL; } - if (rc < 0 && sess) { - ksmbd_session_destroy(sess); - work->sess = NULL; + if (rc < 0) { + /* + * SecurityBufferOffset should be set to zero + * in session setup error response. + */ + rsp->SecurityBufferOffset = 0; + + if (sess) { + bool try_delay = false; + + /* + * To avoid dictionary attacks (repeated session setups rapidly sent) to + * connect to server, ksmbd make a delay of a 5 seconds on session setup + * failure to make it harder to send enough random connection requests + * to break into a server. + */ + if (sess->user && sess->user->flags & KSMBD_USER_FLAG_DELAY_SESSION) + try_delay = true; + + ksmbd_session_destroy(sess); + work->sess = NULL; + if (try_delay) + ssleep(5); + } } return rc; @@ -3795,6 +3794,24 @@ static int verify_info_level(int info_level) return 0; } +static int smb2_calc_max_out_buf_len(struct ksmbd_work *work, + unsigned short hdr2_len, + unsigned int out_buf_len) +{ + int free_len; + + if (out_buf_len > work->conn->vals->max_trans_size) + return -EINVAL; + + free_len = (int)(work->response_sz - + (get_rfc1002_len(work->response_buf) + 4)) - + hdr2_len; + if (free_len < 0) + return -EINVAL; + + return min_t(int, out_buf_len, free_len); +} + int smb2_query_dir(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; @@ -3871,9 +3888,13 @@ int smb2_query_dir(struct ksmbd_work *work) memset(&d_info, 0, sizeof(struct ksmbd_dir_info)); d_info.wptr = (char *)rsp->Buffer; d_info.rptr = (char *)rsp->Buffer; - d_info.out_buf_len = (work->response_sz - (get_rfc1002_len(rsp_org) + 4)); - d_info.out_buf_len = min_t(int, d_info.out_buf_len, le32_to_cpu(req->OutputBufferLength)) - - sizeof(struct smb2_query_directory_rsp); + d_info.out_buf_len = + smb2_calc_max_out_buf_len(work, 8, + le32_to_cpu(req->OutputBufferLength)); + if (d_info.out_buf_len < 0) { + rc = -EINVAL; + goto err_out; + } d_info.flags = srch_flag; /* @@ -4107,12 +4128,11 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, le32_to_cpu(req->Flags)); } - buf_free_len = work->response_sz - - (get_rfc1002_len(rsp_org) + 4) - - sizeof(struct smb2_query_info_rsp); - - if (le32_to_cpu(req->OutputBufferLength) < buf_free_len) - buf_free_len = le32_to_cpu(req->OutputBufferLength); + buf_free_len = + smb2_calc_max_out_buf_len(work, 8, + le32_to_cpu(req->OutputBufferLength)); + if (buf_free_len < 0) + return -EINVAL; rc = ksmbd_vfs_listxattr(path->dentry, &xattr_list); if (rc < 0) { @@ -4423,6 +4443,8 @@ static void get_file_stream_info(struct ksmbd_work *work, struct path *path = &fp->filp->f_path; ssize_t xattr_list_len; int nbytes = 0, streamlen, stream_name_len, next, idx = 0; + int buf_free_len; + struct smb2_query_info_req *req = ksmbd_req_buf_next(work); generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), &stat); @@ -4436,6 +4458,12 @@ static void get_file_stream_info(struct ksmbd_work *work, goto out; } + buf_free_len = + smb2_calc_max_out_buf_len(work, 8, + le32_to_cpu(req->OutputBufferLength)); + if (buf_free_len < 0) + goto out; + while (idx < xattr_list_len) { stream_name = xattr_list + idx; streamlen = strlen(stream_name); @@ -4460,6 +4488,10 @@ static void get_file_stream_info(struct ksmbd_work *work, streamlen = snprintf(stream_buf, streamlen + 1, ":%s", &stream_name[XATTR_NAME_STREAM_LEN]); + next = sizeof(struct smb2_file_stream_info) + streamlen * 2; + if (next > buf_free_len) + break; + file_info = (struct smb2_file_stream_info *)&rsp->Buffer[nbytes]; streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName, stream_buf, streamlen, @@ -4470,12 +4502,13 @@ static void get_file_stream_info(struct ksmbd_work *work, file_info->StreamSize = cpu_to_le64(stream_name_len); file_info->StreamAllocationSize = cpu_to_le64(stream_name_len); - next = sizeof(struct smb2_file_stream_info) + streamlen; nbytes += next; + buf_free_len -= next; file_info->NextEntryOffset = cpu_to_le32(next); } - if (!S_ISDIR(stat.mode)) { + if (!S_ISDIR(stat.mode) && + buf_free_len >= sizeof(struct smb2_file_stream_info) + 7 * 2) { file_info = (struct smb2_file_stream_info *) &rsp->Buffer[nbytes]; streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName, @@ -5499,7 +5532,6 @@ static int set_file_basic_info(struct ksmbd_file *fp, struct ksmbd_share_config *share) { struct iattr attrs; - struct timespec64 ctime; struct file *filp; struct inode *inode; struct user_namespace *user_ns; @@ -5521,13 +5553,11 @@ static int set_file_basic_info(struct ksmbd_file *fp, attrs.ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); } - if (file_info->ChangeTime) { + attrs.ia_valid |= ATTR_CTIME; + if (file_info->ChangeTime) attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime); - ctime = attrs.ia_ctime; - attrs.ia_valid |= ATTR_CTIME; - } else { - ctime = inode->i_ctime; - } + else + attrs.ia_ctime = inode->i_ctime; if (file_info->LastWriteTime) { attrs.ia_mtime = ksmbd_NTtimeToUnix(file_info->LastWriteTime); @@ -5573,11 +5603,9 @@ static int set_file_basic_info(struct ksmbd_file *fp, return -EACCES; inode_lock(inode); + inode->i_ctime = attrs.ia_ctime; + attrs.ia_valid &= ~ATTR_CTIME; rc = notify_change(user_ns, dentry, &attrs, NULL); - if (!rc) { - inode->i_ctime = ctime; - mark_inode_dirty(inode); - } inode_unlock(inode); } return rc; @@ -6241,8 +6269,7 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work) (offsetof(struct smb2_write_req, Buffer) - 4)) { data_buf = (char *)&req->Buffer[0]; } else { - if ((le16_to_cpu(req->DataOffset) > get_rfc1002_len(req)) || - (le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req))) { + if ((u64)le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req)) { pr_err("invalid write data offset %u, smb_len %u\n", le16_to_cpu(req->DataOffset), get_rfc1002_len(req)); @@ -6400,8 +6427,7 @@ int smb2_write(struct ksmbd_work *work) (offsetof(struct smb2_write_req, Buffer) - 4)) { data_buf = (char *)&req->Buffer[0]; } else { - if ((le16_to_cpu(req->DataOffset) > get_rfc1002_len(req)) || - (le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req))) { + if ((u64)le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req)) { pr_err("invalid write data offset %u, smb_len %u\n", le16_to_cpu(req->DataOffset), get_rfc1002_len(req)); @@ -7044,24 +7070,26 @@ out2: return err; } -static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req, +static int fsctl_copychunk(struct ksmbd_work *work, + struct copychunk_ioctl_req *ci_req, + unsigned int cnt_code, + unsigned int input_count, + unsigned long long volatile_id, + unsigned long long persistent_id, struct smb2_ioctl_rsp *rsp) { - struct copychunk_ioctl_req *ci_req; struct copychunk_ioctl_rsp *ci_rsp; struct ksmbd_file *src_fp = NULL, *dst_fp = NULL; struct srv_copychunk *chunks; unsigned int i, chunk_count, chunk_count_written = 0; unsigned int chunk_size_written = 0; loff_t total_size_written = 0; - int ret, cnt_code; + int ret = 0; - cnt_code = le32_to_cpu(req->CntCode); - ci_req = (struct copychunk_ioctl_req *)&req->Buffer[0]; ci_rsp = (struct copychunk_ioctl_rsp *)&rsp->Buffer[0]; - rsp->VolatileFileId = req->VolatileFileId; - rsp->PersistentFileId = req->PersistentFileId; + rsp->VolatileFileId = cpu_to_le64(volatile_id); + rsp->PersistentFileId = cpu_to_le64(persistent_id); ci_rsp->ChunksWritten = cpu_to_le32(ksmbd_server_side_copy_max_chunk_count()); ci_rsp->ChunkBytesWritten = @@ -7071,12 +7099,13 @@ static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req, chunks = (struct srv_copychunk *)&ci_req->Chunks[0]; chunk_count = le32_to_cpu(ci_req->ChunkCount); + if (chunk_count == 0) + goto out; total_size_written = 0; /* verify the SRV_COPYCHUNK_COPY packet */ if (chunk_count > ksmbd_server_side_copy_max_chunk_count() || - le32_to_cpu(req->InputCount) < - offsetof(struct copychunk_ioctl_req, Chunks) + + input_count < offsetof(struct copychunk_ioctl_req, Chunks) + chunk_count * sizeof(struct srv_copychunk)) { rsp->hdr.Status = STATUS_INVALID_PARAMETER; return -EINVAL; @@ -7097,9 +7126,7 @@ static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req, src_fp = ksmbd_lookup_foreign_fd(work, le64_to_cpu(ci_req->ResumeKey[0])); - dst_fp = ksmbd_lookup_fd_slow(work, - le64_to_cpu(req->VolatileFileId), - le64_to_cpu(req->PersistentFileId)); + dst_fp = ksmbd_lookup_fd_slow(work, volatile_id, persistent_id); ret = -EINVAL; if (!src_fp || src_fp->persistent_id != le64_to_cpu(ci_req->ResumeKey[1])) { @@ -7174,8 +7201,8 @@ static __be32 idev_ipv4_address(struct in_device *idev) } static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, - struct smb2_ioctl_req *req, - struct smb2_ioctl_rsp *rsp) + struct smb2_ioctl_rsp *rsp, + unsigned int out_buf_len) { struct network_interface_info_ioctl_rsp *nii_rsp = NULL; int nbytes = 0; @@ -7187,6 +7214,12 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, rtnl_lock(); for_each_netdev(&init_net, netdev) { + if (out_buf_len < + nbytes + sizeof(struct network_interface_info_ioctl_rsp)) { + rtnl_unlock(); + return -ENOSPC; + } + if (netdev->type == ARPHRD_LOOPBACK) continue; @@ -7266,11 +7299,6 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, if (nii_rsp) nii_rsp->Next = 0; - if (!nbytes) { - rsp->hdr.Status = STATUS_BUFFER_TOO_SMALL; - return -EINVAL; - } - rsp->PersistentFileId = cpu_to_le64(SMB2_NO_FID); rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID); return nbytes; @@ -7278,11 +7306,16 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, static int fsctl_validate_negotiate_info(struct ksmbd_conn *conn, struct validate_negotiate_info_req *neg_req, - struct validate_negotiate_info_rsp *neg_rsp) + struct validate_negotiate_info_rsp *neg_rsp, + unsigned int in_buf_len) { int ret = 0; int dialect; + if (in_buf_len < sizeof(struct validate_negotiate_info_req) + + le16_to_cpu(neg_req->DialectCount) * sizeof(__le16)) + return -EINVAL; + dialect = ksmbd_lookup_dialect_by_id(neg_req->Dialects, neg_req->DialectCount); if (dialect == BAD_PROT_ID || dialect != conn->dialect) { @@ -7316,7 +7349,7 @@ err_out: static int fsctl_query_allocated_ranges(struct ksmbd_work *work, u64 id, struct file_allocated_range_buffer *qar_req, struct file_allocated_range_buffer *qar_rsp, - int in_count, int *out_count) + unsigned int in_count, unsigned int *out_count) { struct ksmbd_file *fp; loff_t start, length; @@ -7343,7 +7376,8 @@ static int fsctl_query_allocated_ranges(struct ksmbd_work *work, u64 id, } static int fsctl_pipe_transceive(struct ksmbd_work *work, u64 id, - int out_buf_len, struct smb2_ioctl_req *req, + unsigned int out_buf_len, + struct smb2_ioctl_req *req, struct smb2_ioctl_rsp *rsp) { struct ksmbd_rpc_command *rpc_resp; @@ -7457,8 +7491,7 @@ int smb2_ioctl(struct ksmbd_work *work) { struct smb2_ioctl_req *req; struct smb2_ioctl_rsp *rsp, *rsp_org; - int cnt_code, nbytes = 0; - int out_buf_len; + unsigned int cnt_code, nbytes = 0, out_buf_len, in_buf_len; u64 id = KSMBD_NO_FID; struct ksmbd_conn *conn = work->conn; int ret = 0; @@ -7486,8 +7519,14 @@ int smb2_ioctl(struct ksmbd_work *work) } cnt_code = le32_to_cpu(req->CntCode); - out_buf_len = le32_to_cpu(req->MaxOutputResponse); - out_buf_len = min(KSMBD_IPC_MAX_PAYLOAD, out_buf_len); + ret = smb2_calc_max_out_buf_len(work, 48, + le32_to_cpu(req->MaxOutputResponse)); + if (ret < 0) { + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + goto out; + } + out_buf_len = (unsigned int)ret; + in_buf_len = le32_to_cpu(req->InputCount); switch (cnt_code) { case FSCTL_DFS_GET_REFERRALS: @@ -7515,6 +7554,7 @@ int smb2_ioctl(struct ksmbd_work *work) break; } case FSCTL_PIPE_TRANSCEIVE: + out_buf_len = min_t(u32, KSMBD_IPC_MAX_PAYLOAD, out_buf_len); nbytes = fsctl_pipe_transceive(work, id, out_buf_len, req, rsp); break; case FSCTL_VALIDATE_NEGOTIATE_INFO: @@ -7523,9 +7563,16 @@ int smb2_ioctl(struct ksmbd_work *work) goto out; } + if (in_buf_len < sizeof(struct validate_negotiate_info_req)) + return -EINVAL; + + if (out_buf_len < sizeof(struct validate_negotiate_info_rsp)) + return -EINVAL; + ret = fsctl_validate_negotiate_info(conn, (struct validate_negotiate_info_req *)&req->Buffer[0], - (struct validate_negotiate_info_rsp *)&rsp->Buffer[0]); + (struct validate_negotiate_info_rsp *)&rsp->Buffer[0], + in_buf_len); if (ret < 0) goto out; @@ -7534,9 +7581,10 @@ int smb2_ioctl(struct ksmbd_work *work) rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID); break; case FSCTL_QUERY_NETWORK_INTERFACE_INFO: - nbytes = fsctl_query_iface_info_ioctl(conn, req, rsp); - if (nbytes < 0) + ret = fsctl_query_iface_info_ioctl(conn, rsp, out_buf_len); + if (ret < 0) goto out; + nbytes = ret; break; case FSCTL_REQUEST_RESUME_KEY: if (out_buf_len < sizeof(struct resume_key_ioctl_rsp)) { @@ -7561,15 +7609,33 @@ int smb2_ioctl(struct ksmbd_work *work) goto out; } + if (in_buf_len < sizeof(struct copychunk_ioctl_req)) { + ret = -EINVAL; + goto out; + } + if (out_buf_len < sizeof(struct copychunk_ioctl_rsp)) { ret = -EINVAL; goto out; } nbytes = sizeof(struct copychunk_ioctl_rsp); - fsctl_copychunk(work, req, rsp); + rsp->VolatileFileId = req->VolatileFileId; + rsp->PersistentFileId = req->PersistentFileId; + fsctl_copychunk(work, + (struct copychunk_ioctl_req *)&req->Buffer[0], + le32_to_cpu(req->CntCode), + le32_to_cpu(req->InputCount), + le64_to_cpu(req->VolatileFileId), + le64_to_cpu(req->PersistentFileId), + rsp); break; case FSCTL_SET_SPARSE: + if (in_buf_len < sizeof(struct file_sparse)) { + ret = -EINVAL; + goto out; + } + ret = fsctl_set_sparse(work, id, (struct file_sparse *)&req->Buffer[0]); if (ret < 0) @@ -7588,6 +7654,11 @@ int smb2_ioctl(struct ksmbd_work *work) goto out; } + if (in_buf_len < sizeof(struct file_zero_data_information)) { + ret = -EINVAL; + goto out; + } + zero_data = (struct file_zero_data_information *)&req->Buffer[0]; @@ -7607,6 +7678,11 @@ int smb2_ioctl(struct ksmbd_work *work) break; } case FSCTL_QUERY_ALLOCATED_RANGES: + if (in_buf_len < sizeof(struct file_allocated_range_buffer)) { + ret = -EINVAL; + goto out; + } + ret = fsctl_query_allocated_ranges(work, id, (struct file_allocated_range_buffer *)&req->Buffer[0], (struct file_allocated_range_buffer *)&rsp->Buffer[0], @@ -7647,6 +7723,11 @@ int smb2_ioctl(struct ksmbd_work *work) struct duplicate_extents_to_file *dup_ext; loff_t src_off, dst_off, length, cloned; + if (in_buf_len < sizeof(struct duplicate_extents_to_file)) { + ret = -EINVAL; + goto out; + } + dup_ext = (struct duplicate_extents_to_file *)&req->Buffer[0]; fp_in = ksmbd_lookup_fd_slow(work, dup_ext->VolatileFileHandle, @@ -7717,6 +7798,8 @@ out: rsp->hdr.Status = STATUS_OBJECT_NAME_NOT_FOUND; else if (ret == -EOPNOTSUPP) rsp->hdr.Status = STATUS_NOT_SUPPORTED; + else if (ret == -ENOSPC) + rsp->hdr.Status = STATUS_BUFFER_TOO_SMALL; else if (ret < 0 || rsp->hdr.Status == 0) rsp->hdr.Status = STATUS_INVALID_PARAMETER; smb2_set_err_rsp(work); @@ -8411,20 +8494,18 @@ int smb3_decrypt_req(struct ksmbd_work *work) struct smb2_hdr *hdr; unsigned int pdu_length = get_rfc1002_len(buf); struct kvec iov[2]; - unsigned int buf_data_size = pdu_length + 4 - + int buf_data_size = pdu_length + 4 - sizeof(struct smb2_transform_hdr); struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; int rc = 0; - if (pdu_length + 4 < - sizeof(struct smb2_transform_hdr) + sizeof(struct smb2_hdr)) { + if (buf_data_size < sizeof(struct smb2_hdr)) { pr_err("Transform message is too small (%u)\n", pdu_length); return -ECONNABORTED; } - if (pdu_length + 4 < - le32_to_cpu(tr_hdr->OriginalMessageSize) + sizeof(struct smb2_transform_hdr)) { + if (buf_data_size < le32_to_cpu(tr_hdr->OriginalMessageSize)) { pr_err("Transform message is broken\n"); return -ECONNABORTED; } diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index 261825d06391..ff5a2f01d34a 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -113,6 +113,8 @@ #define SMB21_DEFAULT_IOSIZE (1024 * 1024) #define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024) #define SMB3_DEFAULT_TRANS_SIZE (1024 * 1024) +#define SMB3_MIN_IOSIZE (64 * 1024) +#define SMB3_MAX_IOSIZE (8 * 1024 * 1024) /* * SMB2 Header Definition @@ -1637,7 +1639,6 @@ struct smb2_posix_info { } __packed; /* functions */ -int init_smb2_0_server(struct ksmbd_conn *conn); void init_smb2_1_server(struct ksmbd_conn *conn); void init_smb3_0_server(struct ksmbd_conn *conn); void init_smb3_02_server(struct ksmbd_conn *conn); diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index db8042a173d0..707490ab1f4c 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -21,7 +21,6 @@ static const char basechars[43] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_-!@#$%"; #define MAGIC_CHAR '~' #define PERIOD '.' #define mangle(V) ((char)(basechars[(V) % MANGLE_BASE])) -#define KSMBD_MIN_SUPPORTED_HEADER_SIZE (sizeof(struct smb2_hdr)) struct smb_protocol { int index; @@ -89,7 +88,7 @@ unsigned int ksmbd_server_side_copy_max_total_size(void) inline int ksmbd_min_protocol(void) { - return SMB2_PROT; + return SMB21_PROT; } inline int ksmbd_max_protocol(void) @@ -294,11 +293,6 @@ int ksmbd_init_smb_server(struct ksmbd_work *work) return 0; } -bool ksmbd_pdu_size_has_room(unsigned int pdu) -{ - return (pdu >= KSMBD_MIN_SUPPORTED_HEADER_SIZE - 4); -} - int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, struct ksmbd_file *dir, struct ksmbd_dir_info *d_info, @@ -433,7 +427,7 @@ int ksmbd_extract_shortname(struct ksmbd_conn *conn, const char *longname, static int __smb2_negotiate(struct ksmbd_conn *conn) { - return (conn->dialect >= SMB20_PROT_ID && + return (conn->dialect >= SMB21_PROT_ID && conn->dialect <= SMB311_PROT_ID); } @@ -463,7 +457,7 @@ int ksmbd_smb_negotiate_common(struct ksmbd_work *work, unsigned int command) } } - if (command == SMB2_NEGOTIATE_HE) { + if (command == SMB2_NEGOTIATE_HE && __smb2_negotiate(conn)) { ret = smb2_handle_negotiate(work); init_smb2_neg_rsp(work); return ret; diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h index 994abede27e9..6e79e7577f6b 100644 --- a/fs/ksmbd/smb_common.h +++ b/fs/ksmbd/smb_common.h @@ -48,6 +48,8 @@ #define CIFS_DEFAULT_IOSIZE (64 * 1024) #define MAX_CIFS_SMALL_BUFFER_SIZE 448 /* big enough for most */ +#define MAX_STREAM_PROT_LEN 0x00FFFFFF + /* Responses when opening a file. */ #define F_SUPERSEDED 0 #define F_OPENED 1 @@ -493,8 +495,6 @@ int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count); int ksmbd_init_smb_server(struct ksmbd_work *work); -bool ksmbd_pdu_size_has_room(unsigned int pdu); - struct ksmbd_kstat; int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c index 44aea33a67fa..1acf1892a466 100644 --- a/fs/ksmbd/transport_ipc.c +++ b/fs/ksmbd/transport_ipc.c @@ -601,7 +601,7 @@ int ksmbd_ipc_tree_disconnect_request(unsigned long long session_id, return ret; } -int ksmbd_ipc_logout_request(const char *account) +int ksmbd_ipc_logout_request(const char *account, int flags) { struct ksmbd_ipc_msg *msg; struct ksmbd_logout_request *req; @@ -616,6 +616,7 @@ int ksmbd_ipc_logout_request(const char *account) msg->type = KSMBD_EVENT_LOGOUT_REQUEST; req = (struct ksmbd_logout_request *)msg->payload; + req->account_flags = flags; strscpy(req->account, account, KSMBD_REQ_MAX_ACCOUNT_NAME_SZ); ret = ipc_msg_send(msg); diff --git a/fs/ksmbd/transport_ipc.h b/fs/ksmbd/transport_ipc.h index 9eacc895ffdb..5e5b90a0c187 100644 --- a/fs/ksmbd/transport_ipc.h +++ b/fs/ksmbd/transport_ipc.h @@ -25,7 +25,7 @@ ksmbd_ipc_tree_connect_request(struct ksmbd_session *sess, struct sockaddr *peer_addr); int ksmbd_ipc_tree_disconnect_request(unsigned long long session_id, unsigned long long connect_id); -int ksmbd_ipc_logout_request(const char *account); +int ksmbd_ipc_logout_request(const char *account, int flags); struct ksmbd_share_config_response * ksmbd_ipc_share_config_request(const char *name); struct ksmbd_spnego_authen_response * diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 3a7fa23ba850..a2fd5a4d4cd5 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -549,6 +549,10 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) switch (recvmsg->type) { case SMB_DIRECT_MSG_NEGOTIATE_REQ: + if (wc->byte_len < sizeof(struct smb_direct_negotiate_req)) { + put_empty_recvmsg(t, recvmsg); + return; + } t->negotiation_requested = true; t->full_packet_received = true; wake_up_interruptible(&t->wait_status); @@ -556,10 +560,23 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) case SMB_DIRECT_MSG_DATA_TRANSFER: { struct smb_direct_data_transfer *data_transfer = (struct smb_direct_data_transfer *)recvmsg->packet; - int data_length = le32_to_cpu(data_transfer->data_length); + unsigned int data_length; int avail_recvmsg_count, receive_credits; + if (wc->byte_len < + offsetof(struct smb_direct_data_transfer, padding)) { + put_empty_recvmsg(t, recvmsg); + return; + } + + data_length = le32_to_cpu(data_transfer->data_length); if (data_length) { + if (wc->byte_len < sizeof(struct smb_direct_data_transfer) + + (u64)data_length) { + put_empty_recvmsg(t, recvmsg); + return; + } + if (t->full_packet_received) recvmsg->first_segment = true; @@ -568,7 +585,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) else t->full_packet_received = true; - enqueue_reassembly(t, recvmsg, data_length); + enqueue_reassembly(t, recvmsg, (int)data_length); wake_up_interruptible(&t->wait_reassembly_queue); spin_lock(&t->receive_credit_lock); diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index b41954294d38..835b384b0895 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -1023,7 +1023,7 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp, int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, struct file_allocated_range_buffer *ranges, - int in_count, int *out_count) + unsigned int in_count, unsigned int *out_count) { struct file *f = fp->filp; struct inode *inode = file_inode(fp->filp); diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 7b1dcaa3fbdc..b0d5b8feb4a3 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -166,7 +166,7 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp, struct file_allocated_range_buffer; int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, struct file_allocated_range_buffer *ranges, - int in_count, int *out_count); + unsigned int in_count, unsigned int *out_count); int ksmbd_vfs_unlink(struct user_namespace *user_ns, struct dentry *dir, struct dentry *dentry); void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat); diff --git a/fs/locks.c b/fs/locks.c index 3d6fb4ae847b..0fca9d680978 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2,117 +2,11 @@ /* * linux/fs/locks.c * - * Provide support for fcntl()'s F_GETLK, F_SETLK, and F_SETLKW calls. - * Doug Evans (dje@spiff.uucp), August 07, 1992 + * We implement four types of file locks: BSD locks, posix locks, open + * file description locks, and leases. For details about BSD locks, + * see the flock(2) man page; for details about the other three, see + * fcntl(2). * - * Deadlock detection added. - * FIXME: one thing isn't handled yet: - * - mandatory locks (requires lots of changes elsewhere) - * Kelly Carmichael (kelly@[142.24.8.65]), September 17, 1994. - * - * Miscellaneous edits, and a total rewrite of posix_lock_file() code. - * Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994 - * - * Converted file_lock_table to a linked list from an array, which eliminates - * the limits on how many active file locks are open. - * Chad Page (pageone@netcom.com), November 27, 1994 - * - * Removed dependency on file descriptors. dup()'ed file descriptors now - * get the same locks as the original file descriptors, and a close() on - * any file descriptor removes ALL the locks on the file for the current - * process. Since locks still depend on the process id, locks are inherited - * after an exec() but not after a fork(). This agrees with POSIX, and both - * BSD and SVR4 practice. - * Andy Walker (andy@lysaker.kvaerner.no), February 14, 1995 - * - * Scrapped free list which is redundant now that we allocate locks - * dynamically with kmalloc()/kfree(). - * Andy Walker (andy@lysaker.kvaerner.no), February 21, 1995 - * - * Implemented two lock personalities - FL_FLOCK and FL_POSIX. - * - * FL_POSIX locks are created with calls to fcntl() and lockf() through the - * fcntl() system call. They have the semantics described above. - * - * FL_FLOCK locks are created with calls to flock(), through the flock() - * system call, which is new. Old C libraries implement flock() via fcntl() - * and will continue to use the old, broken implementation. - * - * FL_FLOCK locks follow the 4.4 BSD flock() semantics. They are associated - * with a file pointer (filp). As a result they can be shared by a parent - * process and its children after a fork(). They are removed when the last - * file descriptor referring to the file pointer is closed (unless explicitly - * unlocked). - * - * FL_FLOCK locks never deadlock, an existing lock is always removed before - * upgrading from shared to exclusive (or vice versa). When this happens - * any processes blocked by the current lock are woken up and allowed to - * run before the new lock is applied. - * Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995 - * - * Removed some race conditions in flock_lock_file(), marked other possible - * races. Just grep for FIXME to see them. - * Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996. - * - * Addressed Dmitry's concerns. Deadlock checking no longer recursive. - * Lock allocation changed to GFP_ATOMIC as we can't afford to sleep - * once we've checked for blocking and deadlocking. - * Andy Walker (andy@lysaker.kvaerner.no), April 03, 1996. - * - * Initial implementation of mandatory locks. SunOS turned out to be - * a rotten model, so I implemented the "obvious" semantics. - * See 'Documentation/filesystems/mandatory-locking.rst' for details. - * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996. - * - * Don't allow mandatory locks on mmap()'ed files. Added simple functions to - * check if a file has mandatory locks, used by mmap(), open() and creat() to - * see if system call should be rejected. Ref. HP-UX/SunOS/Solaris Reference - * Manual, Section 2. - * Andy Walker (andy@lysaker.kvaerner.no), April 09, 1996. - * - * Tidied up block list handling. Added '/proc/locks' interface. - * Andy Walker (andy@lysaker.kvaerner.no), April 24, 1996. - * - * Fixed deadlock condition for pathological code that mixes calls to - * flock() and fcntl(). - * Andy Walker (andy@lysaker.kvaerner.no), April 29, 1996. - * - * Allow only one type of locking scheme (FL_POSIX or FL_FLOCK) to be in use - * for a given file at a time. Changed the CONFIG_LOCK_MANDATORY scheme to - * guarantee sensible behaviour in the case where file system modules might - * be compiled with different options than the kernel itself. - * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996. - * - * Added a couple of missing wake_up() calls. Thanks to Thomas Meckel - * (Thomas.Meckel@mni.fh-giessen.de) for spotting this. - * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996. - * - * Changed FL_POSIX locks to use the block list in the same way as FL_FLOCK - * locks. Changed process synchronisation to avoid dereferencing locks that - * have already been freed. - * Andy Walker (andy@lysaker.kvaerner.no), Sep 21, 1996. - * - * Made the block list a circular list to minimise searching in the list. - * Andy Walker (andy@lysaker.kvaerner.no), Sep 25, 1996. - * - * Made mandatory locking a mount option. Default is not to allow mandatory - * locking. - * Andy Walker (andy@lysaker.kvaerner.no), Oct 04, 1996. - * - * Some adaptations for NFS support. - * Olaf Kirch (okir@monad.swb.de), Dec 1996, - * - * Fixed /proc/locks interface so that we can't overrun the buffer we are handed. - * Andy Walker (andy@lysaker.kvaerner.no), May 12, 1997. - * - * Use slab allocator instead of kmalloc/kfree. - * Use generic list implementation from <linux/list.h>. - * Sped up posix_locks_deadlock by only considering blocked locks. - * Matthew Wilcox <willy@debian.org>, March, 2000. - * - * Leases and LOCK_MAND - * Matthew Wilcox <willy@debian.org>, June, 2000. - * Stephen Rothwell <sfr@canb.auug.org.au>, June, 2000. * * Locking conflicts and dependencies: * If multiple threads attempt to lock the same byte (or flock the same file) @@ -461,8 +355,6 @@ static void locks_move_blocks(struct file_lock *new, struct file_lock *fl) } static inline int flock_translate_cmd(int cmd) { - if (cmd & LOCK_MAND) - return cmd & (LOCK_MAND | LOCK_RW); switch (cmd) { case LOCK_SH: return F_RDLCK; @@ -942,8 +834,6 @@ static bool flock_locks_conflict(struct file_lock *caller_fl, */ if (caller_fl->fl_file == sys_fl->fl_file) return false; - if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND)) - return false; return locks_conflict(caller_fl, sys_fl); } @@ -2116,11 +2006,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait); * - %LOCK_SH -- a shared lock. * - %LOCK_EX -- an exclusive lock. * - %LOCK_UN -- remove an existing lock. - * - %LOCK_MAND -- a 'mandatory' flock. - * This exists to emulate Windows Share Modes. + * - %LOCK_MAND -- a 'mandatory' flock. (DEPRECATED) * - * %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other - * processes read and write access respectively. + * %LOCK_MAND support has been removed from the kernel. */ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) { @@ -2137,9 +2025,22 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) cmd &= ~LOCK_NB; unlock = (cmd == LOCK_UN); - if (!unlock && !(cmd & LOCK_MAND) && - !(f.file->f_mode & (FMODE_READ|FMODE_WRITE))) + if (!unlock && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE))) + goto out_putf; + + /* + * LOCK_MAND locks were broken for a long time in that they never + * conflicted with one another and didn't prevent any sort of open, + * read or write activity. + * + * Just ignore these requests now, to preserve legacy behavior, but + * throw a warning to let people know that they don't actually work. + */ + if (cmd & LOCK_MAND) { + pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n"); + error = 0; goto out_putf; + } lock = flock_make_lock(f.file, cmd, NULL); if (IS_ERR(lock)) { @@ -2718,6 +2619,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, struct inode *inode = NULL; unsigned int fl_pid; struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb); + int type; fl_pid = locks_translate_pid(fl, proc_pidns); /* @@ -2745,11 +2647,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, seq_printf(f, " %s ", (inode == NULL) ? "*NOINODE*" : "ADVISORY "); } else if (IS_FLOCK(fl)) { - if (fl->fl_type & LOCK_MAND) { - seq_puts(f, "FLOCK MSNFS "); - } else { - seq_puts(f, "FLOCK ADVISORY "); - } + seq_puts(f, "FLOCK ADVISORY "); } else if (IS_LEASE(fl)) { if (fl->fl_flags & FL_DELEG) seq_puts(f, "DELEG "); @@ -2765,17 +2663,10 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, } else { seq_puts(f, "UNKNOWN UNKNOWN "); } - if (fl->fl_type & LOCK_MAND) { - seq_printf(f, "%s ", - (fl->fl_type & LOCK_READ) - ? (fl->fl_type & LOCK_WRITE) ? "RW " : "READ " - : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE "); - } else { - int type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type; + type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type; - seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" : - (type == F_RDLCK) ? "READ" : "UNLCK"); - } + seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" : + (type == F_RDLCK) ? "READ" : "UNLCK"); if (inode) { /* userspace relies on this representation of dev_t */ seq_printf(f, "%d %02x:%02x:%lu ", fl_pid, diff --git a/fs/namei.c b/fs/namei.c index 1946d9667790..1f9d2187c765 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3076,9 +3076,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp) int error = get_write_access(inode); if (error) return error; - /* - * Refuse to truncate files with mandatory locks held on them. - */ + error = security_path_truncate(path); if (!error) { error = do_truncate(mnt_userns, path->dentry, 0, diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c index 0b6cd3b8734c..994ec22d4040 100644 --- a/fs/netfs/read_helper.c +++ b/fs/netfs/read_helper.c @@ -150,7 +150,7 @@ static void netfs_clear_unread(struct netfs_read_subrequest *subreq) { struct iov_iter iter; - iov_iter_xarray(&iter, WRITE, &subreq->rreq->mapping->i_pages, + iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages, subreq->start + subreq->transferred, subreq->len - subreq->transferred); iov_iter_zero(iov_iter_count(&iter), &iter); diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index acb1d22907da..5e56da748b2a 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -252,7 +252,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, d->bdev = bdev; - d->len = i_size_read(d->bdev->bd_inode); + d->len = bdev_nr_bytes(d->bdev); d->map = bl_map_simple; printk(KERN_INFO "pNFS: using block device %s\n", @@ -367,7 +367,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, return PTR_ERR(bdev); d->bdev = bdev; - d->len = i_size_read(d->bdev->bd_inode); + d->len = bdev_nr_bytes(d->bdev); d->map = bl_map_simple; d->pr_key = v->scsi.pr_key; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 2e894fec036b..7a5f287c5391 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -275,7 +275,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) res = (long) dreq->count; WARN_ON_ONCE(dreq->count < 0); } - dreq->iocb->ki_complete(dreq->iocb, res, 0); + dreq->iocb->ki_complete(dreq->iocb, res); } complete(&dreq->completion); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index aa353fd58240..24e7dccce355 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -843,15 +843,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; - /* - * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of - * any standard. In principle we might be able to support LOCK_MAND - * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the - * NFS code is not set up for it. - */ - if (fl->fl_type & LOCK_MAND) - return -EINVAL; - if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) is_local = 1; diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index edec45831585..0a9b72685f98 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -42,7 +42,6 @@ EXPORT_SYMBOL_GPL(locks_start_grace); /** * locks_end_grace - * @net: net namespace that this lock manager belongs to * @lm: who this grace period is for * * Call this function to state that the given lock manager is ready to diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 6e9ea4ee0f73..3d1d17256a91 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -109,7 +109,6 @@ config NFSD_SCSILAYOUT depends on NFSD_V4 && BLOCK select NFSD_PNFS select EXPORTFS_BLOCK_OPS - select SCSI_COMMON help This option enables support for the exporting pNFS SCSI layouts in the kernel's NFS server. The pNFS SCSI layout enables NFS diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index c99dee99a3c1..e5c0982a381d 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -9,9 +9,6 @@ #include <linux/pr.h> #include <linux/nfsd/debug.h> -#include <scsi/scsi_proto.h> -#include <scsi/scsi_common.h> -#include <scsi/scsi_request.h> #include "blocklayoutxdr.h" #include "pnfs.h" @@ -211,109 +208,6 @@ const struct nfsd4_layout_ops bl_layout_ops = { #endif /* CONFIG_NFSD_BLOCKLAYOUT */ #ifdef CONFIG_NFSD_SCSILAYOUT -static int nfsd4_scsi_identify_device(struct block_device *bdev, - struct pnfs_block_volume *b) -{ - struct request_queue *q = bdev->bd_disk->queue; - struct request *rq; - struct scsi_request *req; - /* - * The allocation length (passed in bytes 3 and 4 of the INQUIRY - * command descriptor block) specifies the number of bytes that have - * been allocated for the data-in buffer. - * 252 is the highest one-byte value that is a multiple of 4. - * 65532 is the highest two-byte value that is a multiple of 4. - */ - size_t bufflen = 252, maxlen = 65532, len, id_len; - u8 *buf, *d, type, assoc; - int retries = 1, error; - - if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q))) - return -EINVAL; - -again: - buf = kzalloc(bufflen, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - rq = blk_get_request(q, REQ_OP_DRV_IN, 0); - if (IS_ERR(rq)) { - error = -ENOMEM; - goto out_free_buf; - } - req = scsi_req(rq); - - error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL); - if (error) - goto out_put_request; - - req->cmd[0] = INQUIRY; - req->cmd[1] = 1; - req->cmd[2] = 0x83; - req->cmd[3] = bufflen >> 8; - req->cmd[4] = bufflen & 0xff; - req->cmd_len = COMMAND_SIZE(INQUIRY); - - blk_execute_rq(NULL, rq, 1); - if (req->result) { - pr_err("pNFS: INQUIRY 0x83 failed with: %x\n", - req->result); - error = -EIO; - goto out_put_request; - } - - len = (buf[2] << 8) + buf[3] + 4; - if (len > bufflen) { - if (len <= maxlen && retries--) { - blk_put_request(rq); - kfree(buf); - bufflen = len; - goto again; - } - pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n", - len); - goto out_put_request; - } - - d = buf + 4; - for (d = buf + 4; d < buf + len; d += id_len + 4) { - id_len = d[3]; - type = d[1] & 0xf; - assoc = (d[1] >> 4) & 0x3; - - /* - * We only care about a EUI-64 and NAA designator types - * with LU association. - */ - if (assoc != 0x00) - continue; - if (type != 0x02 && type != 0x03) - continue; - if (id_len != 8 && id_len != 12 && id_len != 16) - continue; - - b->scsi.code_set = PS_CODE_SET_BINARY; - b->scsi.designator_type = type == 0x02 ? - PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA; - b->scsi.designator_len = id_len; - memcpy(b->scsi.designator, d + 4, id_len); - - /* - * If we found a 8 or 12 byte descriptor continue on to - * see if a 16 byte one is available. If we find a - * 16 byte descriptor we're done. - */ - if (id_len == 16) - break; - } - -out_put_request: - blk_put_request(rq); -out_free_buf: - kfree(buf); - return error; -} - #define NFSD_MDS_PR_KEY 0x0100000000000000ULL /* @@ -325,6 +219,31 @@ static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp) return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id; } +static const u8 designator_types[] = { + PS_DESIGNATOR_EUI64, + PS_DESIGNATOR_NAA, +}; + +static int +nfsd4_block_get_unique_id(struct gendisk *disk, struct pnfs_block_volume *b) +{ + int ret, i; + + for (i = 0; i < ARRAY_SIZE(designator_types); i++) { + u8 type = designator_types[i]; + + ret = disk->fops->get_unique_id(disk, b->scsi.designator, type); + if (ret > 0) { + b->scsi.code_set = PS_CODE_SET_BINARY; + b->scsi.designator_type = type; + b->scsi.designator_len = ret; + return 0; + } + } + + return -EINVAL; +} + static int nfsd4_block_get_device_info_scsi(struct super_block *sb, struct nfs4_client *clp, @@ -333,7 +252,7 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb, struct pnfs_block_deviceaddr *dev; struct pnfs_block_volume *b; const struct pr_ops *ops; - int error; + int ret; dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) + sizeof(struct pnfs_block_volume), GFP_KERNEL); @@ -347,33 +266,38 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb, b->type = PNFS_BLOCK_VOLUME_SCSI; b->scsi.pr_key = nfsd4_scsi_pr_key(clp); - error = nfsd4_scsi_identify_device(sb->s_bdev, b); - if (error) - return error; + ret = nfsd4_block_get_unique_id(sb->s_bdev->bd_disk, b); + if (ret < 0) + goto out_free_dev; + ret = -EINVAL; ops = sb->s_bdev->bd_disk->fops->pr_ops; if (!ops) { pr_err("pNFS: device %s does not support PRs.\n", sb->s_id); - return -EINVAL; + goto out_free_dev; } - error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true); - if (error) { + ret = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true); + if (ret) { pr_err("pNFS: failed to register key for device %s.\n", sb->s_id); - return -EINVAL; + goto out_free_dev; } - error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY, + ret = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY, PR_EXCLUSIVE_ACCESS_REG_ONLY, 0); - if (error) { + if (ret) { pr_err("pNFS: failed to reserve device %s.\n", sb->s_id); - return -EINVAL; + goto out_free_dev; } return 0; + +out_free_dev: + kfree(dev); + return ret; } static __be32 diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 7629248fdd53..be3c1aad50ea 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -542,7 +542,7 @@ nfsd_file_close_inode_sync(struct inode *inode) } /** - * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file + * nfsd_file_close_inode - attempt a delayed close of a nfsd_file * @inode: inode of the file to attempt to remove * * Walk the whole hash bucket, looking for any files that correspond to "inode". diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index a97873f2d22b..6d1b5bb051c5 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -145,8 +145,9 @@ void nfsd4_setup_layout_type(struct svc_export *exp) #ifdef CONFIG_NFSD_SCSILAYOUT if (sb->s_export_op->map_blocks && sb->s_export_op->commit_blocks && - sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops && - blk_queue_scsi_passthrough(sb->s_bdev->bd_disk->queue)) + sb->s_bdev && + sb->s_bdev->bd_disk->fops->pr_ops && + sb->s_bdev->bd_disk->fops->get_unique_id) exp->ex_layout_types |= 1 << LAYOUT_SCSI; #endif } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 7abeccb975b2..cf030ebe2827 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3544,15 +3544,18 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, goto fail; cd->rd_maxcount -= entry_bytes; /* - * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so - * let's always let through the first entry, at least: + * RFC 3530 14.2.24 describes rd_dircount as only a "hint", and + * notes that it could be zero. If it is zero, then the server + * should enforce only the rd_maxcount value. */ - if (!cd->rd_dircount) - goto fail; - name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8; - if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) - goto fail; - cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); + if (cd->rd_dircount) { + name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8; + if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) + goto fail; + cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); + if (!cd->rd_dircount) + cd->rd_maxcount = 0; + } cd->cookie_offset = cookie_offset; skip_entry: diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c2c3d9077dc5..070e5dd03e26 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -793,7 +793,10 @@ out_close: svc_xprt_put(xprt); } out_err: - nfsd_destroy(net); + if (!list_empty(&nn->nfsd_serv->sv_permsocks)) + nn->nfsd_serv->sv_nrthreads--; + else + nfsd_destroy(net); return err; } @@ -1545,7 +1548,7 @@ static int __init init_nfsd(void) goto out_free_all; return 0; out_free_all: - unregister_pernet_subsys(&nfsd_net_ops); + unregister_filesystem(&nfsd_fs_type); out_free_exports: remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 640ac8fe891e..1d0583cfd970 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1107,7 +1107,7 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) goto out; ret = -ERANGE; - if (range[1] > i_size_read(inode->i_sb->s_bdev->bd_inode)) + if (range[1] > bdev_nr_bytes(inode->i_sb->s_bdev)) goto out; segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index f6b2d280aab5..3134c0e42fd4 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -403,7 +403,7 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize) int ret; ret = -ERANGE; - devsize = i_size_read(sb->s_bdev->bd_inode); + devsize = bdev_nr_bytes(sb->s_bdev); if (newsize > devsize) goto out; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index c8bfc01da5d7..1bfcb5d3ea48 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -489,7 +489,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, { struct nilfs_super_block **sbp = nilfs->ns_sbp; struct buffer_head **sbh = nilfs->ns_sbh; - u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size); + u64 sb2off = NILFS_SB2_OFFSET_BYTES(bdev_nr_bytes(nilfs->ns_bdev)); int valid[2], swp = 0; sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize, diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index ab4f3362466d..373dbb627657 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -5,6 +5,7 @@ * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc. */ +#include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/buffer_head.h> #include <linux/gfp.h> diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 0d7e948cb29c..5ae8de09b271 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2772,13 +2772,12 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) ntfs_debug("Set device block size to %i bytes (block size bits %i).", blocksize, sb->s_blocksize_bits); /* Determine the size of the device in units of block_size bytes. */ - if (!i_size_read(sb->s_bdev->bd_inode)) { + vol->nr_blocks = sb_bdev_nr_blocks(sb); + if (!vol->nr_blocks) { if (!silent) ntfs_error(sb, "Unable to determine device size."); goto err_out_now; } - vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >> - sb->s_blocksize_bits; /* Read the boot sector and return unlocked buffer head to it. */ if (!(bh = read_ntfs_boot_sector(sb, silent))) { if (!silent) @@ -2816,8 +2815,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) goto err_out_now; } BUG_ON(blocksize != sb->s_blocksize); - vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >> - sb->s_blocksize_bits; + vol->nr_blocks = sb_bdev_nr_blocks(sb); ntfs_debug("Changed device block size to %i bytes (block size " "bits %i) to match volume sector size.", blocksize, sb->s_blocksize_bits); diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 34c4cbf7e29b..e8c00dda42ad 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -6,13 +6,9 @@ * TODO: Merge attr_set_size/attr_data_get_block/attr_allocate_frame? */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/hash.h> -#include <linux/nls.h> -#include <linux/ratelimit.h> #include <linux/slab.h> +#include <linux/kernel.h> #include "debug.h" #include "ntfs.h" @@ -291,7 +287,7 @@ int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr, if (!rsize) { /* Empty resident -> Non empty nonresident. */ } else if (!is_data) { - err = ntfs_sb_write_run(sbi, run, 0, data, rsize); + err = ntfs_sb_write_run(sbi, run, 0, data, rsize, 0); if (err) goto out2; } else if (!page) { @@ -451,11 +447,8 @@ again: again_1: align = sbi->cluster_size; - if (is_ext) { + if (is_ext) align <<= attr_b->nres.c_unit; - if (is_attr_sparsed(attr_b)) - keep_prealloc = false; - } old_valid = le64_to_cpu(attr_b->nres.valid_size); old_size = le64_to_cpu(attr_b->nres.data_size); @@ -465,9 +458,6 @@ again_1: new_alloc = (new_size + align - 1) & ~(u64)(align - 1); new_alen = new_alloc >> cluster_bits; - if (keep_prealloc && is_ext) - keep_prealloc = false; - if (keep_prealloc && new_size < old_size) { attr_b->nres.data_size = cpu_to_le64(new_size); mi_b->dirty = true; @@ -529,7 +519,7 @@ add_alloc_in_same_attr_seg: } else if (pre_alloc == -1) { pre_alloc = 0; if (type == ATTR_DATA && !name_len && - sbi->options.prealloc) { + sbi->options->prealloc) { CLST new_alen2 = bytes_to_cluster( sbi, get_pre_allocated(new_size)); pre_alloc = new_alen2 - new_alen; @@ -1966,7 +1956,7 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size) return 0; from = vbo; - to = (vbo + bytes) < data_size ? (vbo + bytes) : data_size; + to = min_t(u64, vbo + bytes, data_size); memset(Add2Ptr(resident_data(attr_b), from), 0, to - from); return 0; } diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c index fa32399eb517..bad6d8a849a2 100644 --- a/fs/ntfs3/attrlist.c +++ b/fs/ntfs3/attrlist.c @@ -5,10 +5,7 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/nls.h> #include "debug.h" #include "ntfs.h" @@ -336,7 +333,7 @@ int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, if (attr && attr->non_res) { err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le, - al->size); + al->size, 0); if (err) return err; al->dirty = false; @@ -423,7 +420,7 @@ next: return true; } -int al_update(struct ntfs_inode *ni) +int al_update(struct ntfs_inode *ni, int sync) { int err; struct ATTRIB *attr; @@ -445,7 +442,7 @@ int al_update(struct ntfs_inode *ni) memcpy(resident_data(attr), al->le, al->size); } else { err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le, - al->size); + al->size, sync); if (err) goto out; diff --git a/fs/ntfs3/bitfunc.c b/fs/ntfs3/bitfunc.c index ce304d40b5e1..50d838093790 100644 --- a/fs/ntfs3/bitfunc.c +++ b/fs/ntfs3/bitfunc.c @@ -5,13 +5,8 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> -#include <linux/fs.h> -#include <linux/nls.h> +#include <linux/types.h> -#include "debug.h" -#include "ntfs.h" #include "ntfs_fs.h" #define BITS_IN_SIZE_T (sizeof(size_t) * 8) @@ -124,8 +119,7 @@ bool are_bits_set(const ulong *lmap, size_t bit, size_t nbits) pos = nbits & 7; if (pos) { - u8 mask = fill_mask[pos]; - + mask = fill_mask[pos]; if ((*map & mask) != mask) return false; } diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 831501555009..aa184407520f 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -10,12 +10,10 @@ * */ -#include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/nls.h> +#include <linux/kernel.h> -#include "debug.h" #include "ntfs.h" #include "ntfs_fs.h" @@ -435,7 +433,7 @@ static void wnd_remove_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len) ; } else { n3 = rb_next(&e->count.node); - max_new_len = len > new_len ? len : new_len; + max_new_len = max(len, new_len); if (!n3) { wnd->extent_max = max_new_len; } else { @@ -731,7 +729,7 @@ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) wbits = wnd->bits_last; tail = wbits - wbit; - op = tail < bits ? tail : bits; + op = min_t(u32, tail, bits); bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { @@ -784,7 +782,7 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) wbits = wnd->bits_last; tail = wbits - wbit; - op = tail < bits ? tail : bits; + op = min_t(u32, tail, bits); bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { @@ -834,7 +832,7 @@ static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits) wbits = wnd->bits_last; tail = wbits - wbit; - op = tail < bits ? tail : bits; + op = min_t(u32, tail, bits); if (wbits != wnd->free_bits[iw]) { bool ret; @@ -926,7 +924,7 @@ use_wnd: wbits = wnd->bits_last; tail = wbits - wbit; - op = tail < bits ? tail : bits; + op = min_t(u32, tail, bits); if (wnd->free_bits[iw]) { bool ret; diff --git a/fs/ntfs3/debug.h b/fs/ntfs3/debug.h index 31120569a87b..53ef7489c75f 100644 --- a/fs/ntfs3/debug.h +++ b/fs/ntfs3/debug.h @@ -11,6 +11,9 @@ #ifndef _LINUX_NTFS3_DEBUG_H #define _LINUX_NTFS3_DEBUG_H +struct super_block; +struct inode; + #ifndef Add2Ptr #define Add2Ptr(P, I) ((void *)((u8 *)(P) + (I))) #define PtrOffset(B, O) ((size_t)((size_t)(O) - (size_t)(B))) diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index 93f6d485564e..fb438d604040 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -7,10 +7,7 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/iversion.h> #include <linux/nls.h> #include "debug.h" @@ -18,30 +15,27 @@ #include "ntfs_fs.h" /* Convert little endian UTF-16 to NLS string. */ -int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni, +int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len, u8 *buf, int buf_len) { - int ret, uni_len, warn; - const __le16 *ip; + int ret, warn; u8 *op; - struct nls_table *nls = sbi->options.nls; + struct nls_table *nls = sbi->options->nls; static_assert(sizeof(wchar_t) == sizeof(__le16)); if (!nls) { /* UTF-16 -> UTF-8 */ - ret = utf16s_to_utf8s((wchar_t *)uni->name, uni->len, - UTF16_LITTLE_ENDIAN, buf, buf_len); + ret = utf16s_to_utf8s(name, len, UTF16_LITTLE_ENDIAN, buf, + buf_len); buf[ret] = '\0'; return ret; } - ip = uni->name; op = buf; - uni_len = uni->len; warn = 0; - while (uni_len--) { + while (len--) { u16 ec; int charlen; char dump[5]; @@ -52,7 +46,7 @@ int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni, break; } - ec = le16_to_cpu(*ip++); + ec = le16_to_cpu(*name++); charlen = nls->uni2char(ec, op, buf_len); if (charlen > 0) { @@ -186,7 +180,7 @@ int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len, { int ret, slen; const u8 *end; - struct nls_table *nls = sbi->options.nls; + struct nls_table *nls = sbi->options->nls; u16 *uname = uni->name; static_assert(sizeof(wchar_t) == sizeof(u16)); @@ -301,14 +295,14 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni, return 0; /* Skip meta files. Unless option to show metafiles is set. */ - if (!sbi->options.showmeta && ntfs_is_meta_file(sbi, ino)) + if (!sbi->options->showmeta && ntfs_is_meta_file(sbi, ino)) return 0; - if (sbi->options.nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN)) + if (sbi->options->nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN)) return 0; - name_len = ntfs_utf16_to_nls(sbi, (struct le_str *)&fname->name_len, - name, PATH_MAX); + name_len = ntfs_utf16_to_nls(sbi, fname->name, fname->name_len, name, + PATH_MAX); if (name_len <= 0) { ntfs_warn(sbi->sb, "failed to convert name for inode %lx.", ino); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 424450e77ad5..a3cd3c3f091e 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -8,11 +8,11 @@ */ #include <linux/backing-dev.h> +#include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/compat.h> #include <linux/falloc.h> #include <linux/fiemap.h> -#include <linux/nls.h> #include "debug.h" #include "ntfs.h" @@ -588,8 +588,11 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) truncate_pagecache(inode, vbo_down); if (!is_sparsed(ni) && !is_compressed(ni)) { - /* Normal file. */ - err = ntfs_zero_range(inode, vbo, end); + /* + * Normal file, can't make hole. + * TODO: Try to find way to save info about hole. + */ + err = -EOPNOTSUPP; goto out; } @@ -737,7 +740,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t mode = inode->i_mode; int err; - if (sbi->options.no_acs_rules) { + if (sbi->options->noacsrules) { /* "No access rules" - Force any changes of time etc. */ attr->ia_valid |= ATTR_FORCE; /* and disable for editing some attributes. */ @@ -1185,7 +1188,7 @@ static int ntfs_file_release(struct inode *inode, struct file *file) int err = 0; /* If we are last writer on the inode, drop the block reservation. */ - if (sbi->options.prealloc && ((file->f_mode & FMODE_WRITE) && + if (sbi->options->prealloc && ((file->f_mode & FMODE_WRITE) && atomic_read(&inode->i_writecount) == 1)) { ni_lock(ni); down_write(&ni->file.run_lock); diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 938b12d56ca6..6f47a9c17f89 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -5,11 +5,8 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fiemap.h> #include <linux/fs.h> -#include <linux/nls.h> #include <linux/vmalloc.h> #include "debug.h" @@ -708,18 +705,35 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) continue; mi = ni_find_mi(ni, ino_get(&le->ref)); + if (!mi) { + /* Should never happened, 'cause already checked. */ + goto bad; + } attr = mi_find_attr(mi, NULL, le->type, le_name(le), le->name_len, &le->id); + if (!attr) { + /* Should never happened, 'cause already checked. */ + goto bad; + } asize = le32_to_cpu(attr->size); /* Insert into primary record. */ attr_ins = mi_insert_attr(&ni->mi, le->type, le_name(le), le->name_len, asize, le16_to_cpu(attr->name_off)); - id = attr_ins->id; + if (!attr_ins) { + /* + * Internal error. + * Either no space in primary record (already checked). + * Either tried to insert another + * non indexed attribute (logic error). + */ + goto bad; + } /* Copy all except id. */ + id = attr_ins->id; memcpy(attr_ins, attr, asize); attr_ins->id = id; @@ -735,6 +749,10 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) ni->attr_list.dirty = false; return 0; +bad: + ntfs_inode_err(&ni->vfs_inode, "Internal error"); + make_bad_inode(&ni->vfs_inode); + return -EINVAL; } /* @@ -956,6 +974,13 @@ static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, continue; } + /* + * Do not try to insert this attribute + * if there is no room in record. + */ + if (le32_to_cpu(mi->mrec->used) + asize > sbi->record_size) + continue; + /* Try to insert attribute into this subrecord. */ attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize, name_off, svcn, ins_le); @@ -1451,7 +1476,7 @@ int ni_insert_resident(struct ntfs_inode *ni, u32 data_size, attr->res.flags = RESIDENT_FLAG_INDEXED; /* is_attr_indexed(attr)) == true */ - le16_add_cpu(&ni->mi.mrec->hard_links, +1); + le16_add_cpu(&ni->mi.mrec->hard_links, 1); ni->mi.dirty = true; } attr->res.res = 0; @@ -1606,7 +1631,7 @@ struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type, *le = NULL; - if (FILE_NAME_POSIX == name_type) + if (name_type == FILE_NAME_POSIX) return NULL; /* Enumerate all names. */ @@ -1706,18 +1731,16 @@ out: /* * ni_parse_reparse * - * Buffer is at least 24 bytes. + * buffer - memory for reparse buffer header */ enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, - void *buffer) + struct REPARSE_DATA_BUFFER *buffer) { const struct REPARSE_DATA_BUFFER *rp = NULL; u8 bits; u16 len; typeof(rp->CompressReparseBuffer) *cmpr; - static_assert(sizeof(struct REPARSE_DATA_BUFFER) <= 24); - /* Try to estimate reparse point. */ if (!attr->non_res) { rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER)); @@ -1803,6 +1826,9 @@ enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, return REPARSE_NONE; } + if (buffer != rp) + memcpy(buffer, rp, sizeof(struct REPARSE_DATA_BUFFER)); + /* Looks like normal symlink. */ return REPARSE_LINK; } @@ -2906,9 +2932,8 @@ bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de + 1, de_key_size); mi_get_ref(&ni->mi, &de->ref); - if (indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 1)) { + if (indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 1)) return false; - } } return true; @@ -3077,7 +3102,9 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup, const struct EA_INFO *info; info = resident_data_ex(attr, sizeof(struct EA_INFO)); - dup->ea_size = info->size_pack; + /* If ATTR_EA_INFO exists 'info' can't be NULL. */ + if (info) + dup->ea_size = info->size_pack; } } @@ -3205,7 +3232,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) goto out; } - err = al_update(ni); + err = al_update(ni, sync); if (err) goto out; } diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index b5853aed0e25..06492f088d60 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -6,12 +6,8 @@ */ #include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/hash.h> -#include <linux/nls.h> #include <linux/random.h> -#include <linux/ratelimit.h> #include <linux/slab.h> #include "debug.h" @@ -2219,7 +2215,7 @@ file_is_valid: err = ntfs_sb_write_run(log->ni->mi.sbi, &log->ni->file.run, off, page, - log->page_size); + log->page_size, 0); if (err) goto out; @@ -3710,7 +3706,7 @@ move_data: if (a_dirty) { attr = oa->attr; - err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes); + err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes, 0); if (err) goto out; } @@ -5152,10 +5148,10 @@ end_reply: ntfs_fix_pre_write(&rh->rhdr, log->page_size); - err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rh, log->page_size); + err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rh, log->page_size, 0); if (!err) err = ntfs_sb_write_run(sbi, &log->ni->file.run, log->page_size, - rh, log->page_size); + rh, log->page_size, 0); kfree(rh); if (err) diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 91e3743e1442..4de9acb16968 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -8,7 +8,7 @@ #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/nls.h> +#include <linux/kernel.h> #include "debug.h" #include "ntfs.h" @@ -358,7 +358,7 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, enum ALLOCATE_OPT opt) { int err; - CLST alen = 0; + CLST alen; struct super_block *sb = sbi->sb; size_t alcn, zlen, zeroes, zlcn, zlen2, ztrim, new_zlen; struct wnd_bitmap *wnd = &sbi->used.bitmap; @@ -370,27 +370,28 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, if (!zlen) { err = ntfs_refresh_zone(sbi); if (err) - goto out; + goto up_write; + zlen = wnd_zone_len(wnd); } if (!zlen) { ntfs_err(sbi->sb, "no free space to extend mft"); - goto out; + err = -ENOSPC; + goto up_write; } lcn = wnd_zone_bit(wnd); - alen = zlen > len ? len : zlen; + alen = min_t(CLST, len, zlen); wnd_zone_set(wnd, lcn + alen, zlen - alen); err = wnd_set_used(wnd, lcn, alen); - if (err) { - up_write(&wnd->rw_lock); - return err; - } + if (err) + goto up_write; + alcn = lcn; - goto out; + goto space_found; } /* * 'Cause cluster 0 is always used this value means that we should use @@ -404,49 +405,45 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, alen = wnd_find(wnd, len, lcn, BITMAP_FIND_MARK_AS_USED, &alcn); if (alen) - goto out; + goto space_found; /* Try to use clusters from MftZone. */ zlen = wnd_zone_len(wnd); zeroes = wnd_zeroes(wnd); /* Check too big request */ - if (len > zeroes + zlen || zlen <= NTFS_MIN_MFT_ZONE) - goto out; + if (len > zeroes + zlen || zlen <= NTFS_MIN_MFT_ZONE) { + err = -ENOSPC; + goto up_write; + } /* How many clusters to cat from zone. */ zlcn = wnd_zone_bit(wnd); zlen2 = zlen >> 1; - ztrim = len > zlen ? zlen : (len > zlen2 ? len : zlen2); - new_zlen = zlen - ztrim; - - if (new_zlen < NTFS_MIN_MFT_ZONE) { - new_zlen = NTFS_MIN_MFT_ZONE; - if (new_zlen > zlen) - new_zlen = zlen; - } + ztrim = clamp_val(len, zlen2, zlen); + new_zlen = max_t(size_t, zlen - ztrim, NTFS_MIN_MFT_ZONE); wnd_zone_set(wnd, zlcn, new_zlen); /* Allocate continues clusters. */ alen = wnd_find(wnd, len, 0, BITMAP_FIND_MARK_AS_USED | BITMAP_FIND_FULL, &alcn); - -out: - if (alen) { - err = 0; - *new_len = alen; - *new_lcn = alcn; - - ntfs_unmap_meta(sb, alcn, alen); - - /* Set hint for next requests. */ - if (!(opt & ALLOCATE_MFT)) - sbi->used.next_free_lcn = alcn + alen; - } else { + if (!alen) { err = -ENOSPC; + goto up_write; } +space_found: + err = 0; + *new_len = alen; + *new_lcn = alcn; + + ntfs_unmap_meta(sb, alcn, alen); + + /* Set hint for next requests. */ + if (!(opt & ALLOCATE_MFT)) + sbi->used.next_free_lcn = alcn + alen; +up_write: up_write(&wnd->rw_lock); return err; } @@ -1080,7 +1077,7 @@ int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, } int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, - u64 vbo, const void *buf, size_t bytes) + u64 vbo, const void *buf, size_t bytes, int sync) { struct super_block *sb = sbi->sb; u8 cluster_bits = sbi->cluster_bits; @@ -1099,8 +1096,8 @@ int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, len = ((u64)clen << cluster_bits) - off; for (;;) { - u32 op = len < bytes ? len : bytes; - int err = ntfs_sb_write(sb, lbo, op, buf, 0); + u32 op = min_t(u64, len, bytes); + int err = ntfs_sb_write(sb, lbo, op, buf, sync); if (err) return err; @@ -1300,7 +1297,7 @@ int ntfs_get_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, nb->off = off = lbo & (blocksize - 1); for (;;) { - u32 len32 = len < bytes ? len : bytes; + u32 len32 = min_t(u64, len, bytes); sector_t block = lbo >> sb->s_blocksize_bits; do { @@ -2175,7 +2172,7 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi, /* Write main SDS bucket. */ err = ntfs_sb_write_run(sbi, &ni->file.run, sbi->security.next_off, - d_security, aligned_sec_size); + d_security, aligned_sec_size, 0); if (err) goto out; @@ -2193,7 +2190,7 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi, /* Write copy SDS bucket. */ err = ntfs_sb_write_run(sbi, &ni->file.run, mirr_off, d_security, - aligned_sec_size); + aligned_sec_size, 0); if (err) goto out; diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 0daca9adc54c..6f81e3a49abf 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -8,7 +8,7 @@ #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/nls.h> +#include <linux/kernel.h> #include "debug.h" #include "ntfs.h" @@ -671,138 +671,74 @@ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx, const struct INDEX_HDR *hdr, const void *key, size_t key_len, const void *ctx, int *diff) { - struct NTFS_DE *e; + struct NTFS_DE *e, *found = NULL; NTFS_CMP_FUNC cmp = indx->cmp; + int min_idx = 0, mid_idx, max_idx = 0; + int diff2; + int table_size = 8; u32 e_size, e_key_len; u32 end = le32_to_cpu(hdr->used); u32 off = le32_to_cpu(hdr->de_off); + u16 offs[128]; -#ifdef NTFS3_INDEX_BINARY_SEARCH - int max_idx = 0, fnd, min_idx; - int nslots = 64; - u16 *offs; - - if (end > 0x10000) - goto next; - - offs = kmalloc(sizeof(u16) * nslots, GFP_NOFS); - if (!offs) - goto next; +fill_table: + if (off + sizeof(struct NTFS_DE) > end) + return NULL; - /* Use binary search algorithm. */ -next1: - if (off + sizeof(struct NTFS_DE) > end) { - e = NULL; - goto out1; - } e = Add2Ptr(hdr, off); e_size = le16_to_cpu(e->size); - if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) { - e = NULL; - goto out1; - } - - if (max_idx >= nslots) { - u16 *ptr; - int new_slots = ALIGN(2 * nslots, 8); - - ptr = kmalloc(sizeof(u16) * new_slots, GFP_NOFS); - if (ptr) - memcpy(ptr, offs, sizeof(u16) * max_idx); - kfree(offs); - offs = ptr; - nslots = new_slots; - if (!ptr) - goto next; - } - - /* Store entry table. */ - offs[max_idx] = off; + if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) + return NULL; if (!de_is_last(e)) { + offs[max_idx] = off; off += e_size; - max_idx += 1; - goto next1; - } - /* - * Table of pointers is created. - * Use binary search to find entry that is <= to the search value. - */ - fnd = -1; - min_idx = 0; + max_idx++; + if (max_idx < table_size) + goto fill_table; - while (min_idx <= max_idx) { - int mid_idx = min_idx + ((max_idx - min_idx) >> 1); - int diff2; - - e = Add2Ptr(hdr, offs[mid_idx]); + max_idx--; + } - e_key_len = le16_to_cpu(e->key_size); +binary_search: + e_key_len = le16_to_cpu(e->key_size); - diff2 = (*cmp)(key, key_len, e + 1, e_key_len, ctx); + diff2 = (*cmp)(key, key_len, e + 1, e_key_len, ctx); + if (diff2 > 0) { + if (found) { + min_idx = mid_idx + 1; + } else { + if (de_is_last(e)) + return NULL; - if (!diff2) { - *diff = 0; - goto out1; + max_idx = 0; + table_size = min(table_size * 2, + (int)ARRAY_SIZE(offs)); + goto fill_table; } - - if (diff2 < 0) { + } else if (diff2 < 0) { + if (found) max_idx = mid_idx - 1; - fnd = mid_idx; - if (!fnd) - break; - } else { - min_idx = mid_idx + 1; - } - } + else + max_idx--; - if (fnd == -1) { - e = NULL; - goto out1; + found = e; + } else { + *diff = 0; + return e; } - *diff = -1; - e = Add2Ptr(hdr, offs[fnd]); - -out1: - kfree(offs); - - return e; -#endif - -next: - /* - * Entries index are sorted. - * Enumerate all entries until we find entry - * that is <= to the search value. - */ - if (off + sizeof(struct NTFS_DE) > end) - return NULL; - - e = Add2Ptr(hdr, off); - e_size = le16_to_cpu(e->size); - - if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) - return NULL; - - off += e_size; - - e_key_len = le16_to_cpu(e->key_size); - - *diff = (*cmp)(key, key_len, e + 1, e_key_len, ctx); - if (!*diff) - return e; + if (min_idx > max_idx) { + *diff = -1; + return found; + } - if (*diff <= 0) - return e; + mid_idx = (min_idx + max_idx) >> 1; + e = Add2Ptr(hdr, offs[mid_idx]); - if (de_is_last(e)) { - *diff = 1; - return e; - } - goto next; + goto binary_search; } /* @@ -1136,9 +1072,7 @@ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni, if (!e) return -EINVAL; - if (fnd) - fnd->root_de = e; - + fnd->root_de = e; err = 0; for (;;) { @@ -1401,7 +1335,7 @@ ok: static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, CLST *vbn) { - int err = -ENOMEM; + int err; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *bitmap; struct ATTRIB *alloc; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index db2a5a4c38e4..a87ab3ad3cd3 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -5,10 +5,8 @@ * */ -#include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/iversion.h> #include <linux/mpage.h> #include <linux/namei.h> #include <linux/nls.h> @@ -49,8 +47,8 @@ static struct inode *ntfs_read_mft(struct inode *inode, inode->i_op = NULL; /* Setup 'uid' and 'gid' */ - inode->i_uid = sbi->options.fs_uid; - inode->i_gid = sbi->options.fs_gid; + inode->i_uid = sbi->options->fs_uid; + inode->i_gid = sbi->options->fs_gid; err = mi_init(&ni->mi, sbi, ino); if (err) @@ -224,12 +222,9 @@ next_attr: if (!attr->non_res) { ni->i_valid = inode->i_size = rsize; inode_set_bytes(inode, rsize); - t32 = asize; - } else { - t32 = le16_to_cpu(attr->nres.run_off); } - mode = S_IFREG | (0777 & sbi->options.fs_fmask_inv); + mode = S_IFREG | (0777 & sbi->options->fs_fmask_inv); if (!attr->non_res) { ni->ni_flags |= NI_FLAG_RESIDENT; @@ -272,7 +267,7 @@ next_attr: goto out; mode = sb->s_root - ? (S_IFDIR | (0777 & sbi->options.fs_dmask_inv)) + ? (S_IFDIR | (0777 & sbi->options->fs_dmask_inv)) : (S_IFDIR | 0777); goto next_attr; @@ -315,17 +310,14 @@ next_attr: rp_fa = ni_parse_reparse(ni, attr, &rp); switch (rp_fa) { case REPARSE_LINK: - if (!attr->non_res) { - inode->i_size = rsize; - inode_set_bytes(inode, rsize); - t32 = asize; - } else { - inode->i_size = - le64_to_cpu(attr->nres.data_size); - t32 = le16_to_cpu(attr->nres.run_off); - } + /* + * Normal symlink. + * Assume one unicode symbol == one utf8. + */ + inode->i_size = le16_to_cpu(rp.SymbolicLinkReparseBuffer + .PrintNameLength) / + sizeof(u16); - /* Looks like normal symlink. */ ni->i_valid = inode->i_size; /* Clear directory bit. */ @@ -422,7 +414,7 @@ end_enum: ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY; inode->i_op = &ntfs_link_inode_operations; inode->i_fop = NULL; - inode_nohighmem(inode); // ?? + inode_nohighmem(inode); } else if (S_ISREG(mode)) { ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY; inode->i_op = &ntfs_file_inode_operations; @@ -443,7 +435,7 @@ end_enum: goto out; } - if ((sbi->options.sys_immutable && + if ((sbi->options->sys_immutable && (std5->fa & FILE_ATTRIBUTE_SYSTEM)) && !S_ISFIFO(mode) && !S_ISSOCK(mode) && !S_ISLNK(mode)) { inode->i_flags |= S_IMMUTABLE; @@ -1054,7 +1046,7 @@ int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, if (!ret && i2) ret = writeback_inode(i2); if (!ret) - ret = filemap_flush(sb->s_bdev->bd_inode->i_mapping); + ret = sync_blockdev_nowait(sb->s_bdev); return ret; } @@ -1200,9 +1192,13 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, struct REPARSE_DATA_BUFFER *rp = NULL; bool rp_inserted = false; + ni_lock_dir(dir_ni); + dir_root = indx_get_root(&dir_ni->dir, dir_ni, NULL, NULL); - if (!dir_root) - return ERR_PTR(-EINVAL); + if (!dir_root) { + err = -EINVAL; + goto out1; + } if (S_ISDIR(mode)) { /* Use parent's directory attributes. */ @@ -1244,7 +1240,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, * } */ } else if (S_ISREG(mode)) { - if (sbi->options.sparse) { + if (sbi->options->sparse) { /* Sparsed regular file, cause option 'sparse'. */ fa = FILE_ATTRIBUTE_SPARSE_FILE | FILE_ATTRIBUTE_ARCHIVE; @@ -1486,7 +1482,10 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, asize = ALIGN(SIZEOF_RESIDENT + nsize, 8); t16 = PtrOffset(rec, attr); - /* 0x78 - the size of EA + EAINFO to store WSL */ + /* + * Below function 'ntfs_save_wsl_perm' requires 0x78 bytes. + * It is good idea to keep extened attributes resident. + */ if (asize + t16 + 0x78 + 8 > sbi->record_size) { CLST alen; CLST clst = bytes_to_cluster(sbi, nsize); @@ -1521,14 +1520,14 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, } asize = SIZEOF_NONRESIDENT + ALIGN(err, 8); - inode->i_size = nsize; } else { attr->res.data_off = SIZEOF_RESIDENT_LE; attr->res.data_size = cpu_to_le32(nsize); memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), rp, nsize); - inode->i_size = nsize; nsize = 0; } + /* Size of symlink equals the length of input string. */ + inode->i_size = size; attr->size = cpu_to_le32(asize); @@ -1551,6 +1550,9 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, if (err) goto out6; + /* Unlock parent directory before ntfs_init_acl. */ + ni_unlock(dir_ni); + inode->i_generation = le16_to_cpu(rec->seq); dir->i_mtime = dir->i_ctime = inode->i_atime; @@ -1562,6 +1564,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, inode->i_op = &ntfs_link_inode_operations; inode->i_fop = NULL; inode->i_mapping->a_ops = &ntfs_aops; + inode->i_size = size; + inode_nohighmem(inode); } else if (S_ISREG(mode)) { inode->i_op = &ntfs_file_inode_operations; inode->i_fop = &ntfs_file_operations; @@ -1577,7 +1581,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, if (!S_ISLNK(mode) && (sb->s_flags & SB_POSIXACL)) { err = ntfs_init_acl(mnt_userns, inode, dir); if (err) - goto out6; + goto out7; } else #endif { @@ -1586,7 +1590,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, /* Write non resident data. */ if (nsize) { - err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize); + err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize, 0); if (err) goto out7; } @@ -1607,8 +1611,10 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, out7: /* Undo 'indx_insert_entry'. */ + ni_lock_dir(dir_ni); indx_delete_entry(&dir_ni->dir, dir_ni, new_de + 1, le16_to_cpu(new_de->key_size), sbi); + /* ni_unlock(dir_ni); will be called later. */ out6: if (rp_inserted) ntfs_remove_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref); @@ -1632,8 +1638,10 @@ out2: kfree(rp); out1: - if (err) + if (err) { + ni_unlock(dir_ni); return ERR_PTR(err); + } unlock_new_inode(inode); @@ -1754,15 +1762,15 @@ void ntfs_evict_inode(struct inode *inode) static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer, int buflen) { - int i, err = 0; + int i, err = -EINVAL; struct ntfs_inode *ni = ntfs_i(inode); struct super_block *sb = inode->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; - u64 i_size = inode->i_size; - u16 nlen = 0; + u64 size; + u16 ulen = 0; void *to_free = NULL; struct REPARSE_DATA_BUFFER *rp; - struct le_str *uni; + const __le16 *uname; struct ATTRIB *attr; /* Reparse data present. Try to parse it. */ @@ -1771,68 +1779,64 @@ static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer, *buffer = 0; - /* Read into temporal buffer. */ - if (i_size > sbi->reparse.max_size || i_size <= sizeof(u32)) { - err = -EINVAL; - goto out; - } - attr = ni_find_attr(ni, NULL, NULL, ATTR_REPARSE, NULL, 0, NULL, NULL); - if (!attr) { - err = -EINVAL; + if (!attr) goto out; - } if (!attr->non_res) { - rp = resident_data_ex(attr, i_size); - if (!rp) { - err = -EINVAL; + rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER)); + if (!rp) goto out; - } + size = le32_to_cpu(attr->res.data_size); } else { - rp = kmalloc(i_size, GFP_NOFS); + size = le64_to_cpu(attr->nres.data_size); + rp = NULL; + } + + if (size > sbi->reparse.max_size || size <= sizeof(u32)) + goto out; + + if (!rp) { + rp = kmalloc(size, GFP_NOFS); if (!rp) { err = -ENOMEM; goto out; } to_free = rp; - err = ntfs_read_run_nb(sbi, &ni->file.run, 0, rp, i_size, NULL); + /* Read into temporal buffer. */ + err = ntfs_read_run_nb(sbi, &ni->file.run, 0, rp, size, NULL); if (err) goto out; } - err = -EINVAL; - /* Microsoft Tag. */ switch (rp->ReparseTag) { case IO_REPARSE_TAG_MOUNT_POINT: /* Mount points and junctions. */ /* Can we use 'Rp->MountPointReparseBuffer.PrintNameLength'? */ - if (i_size <= offsetof(struct REPARSE_DATA_BUFFER, - MountPointReparseBuffer.PathBuffer)) + if (size <= offsetof(struct REPARSE_DATA_BUFFER, + MountPointReparseBuffer.PathBuffer)) goto out; - uni = Add2Ptr(rp, - offsetof(struct REPARSE_DATA_BUFFER, - MountPointReparseBuffer.PathBuffer) + - le16_to_cpu(rp->MountPointReparseBuffer - .PrintNameOffset) - - 2); - nlen = le16_to_cpu(rp->MountPointReparseBuffer.PrintNameLength); + uname = Add2Ptr(rp, + offsetof(struct REPARSE_DATA_BUFFER, + MountPointReparseBuffer.PathBuffer) + + le16_to_cpu(rp->MountPointReparseBuffer + .PrintNameOffset)); + ulen = le16_to_cpu(rp->MountPointReparseBuffer.PrintNameLength); break; case IO_REPARSE_TAG_SYMLINK: /* FolderSymbolicLink */ /* Can we use 'Rp->SymbolicLinkReparseBuffer.PrintNameLength'? */ - if (i_size <= offsetof(struct REPARSE_DATA_BUFFER, - SymbolicLinkReparseBuffer.PathBuffer)) + if (size <= offsetof(struct REPARSE_DATA_BUFFER, + SymbolicLinkReparseBuffer.PathBuffer)) goto out; - uni = Add2Ptr(rp, - offsetof(struct REPARSE_DATA_BUFFER, - SymbolicLinkReparseBuffer.PathBuffer) + - le16_to_cpu(rp->SymbolicLinkReparseBuffer - .PrintNameOffset) - - 2); - nlen = le16_to_cpu( + uname = Add2Ptr( + rp, offsetof(struct REPARSE_DATA_BUFFER, + SymbolicLinkReparseBuffer.PathBuffer) + + le16_to_cpu(rp->SymbolicLinkReparseBuffer + .PrintNameOffset)); + ulen = le16_to_cpu( rp->SymbolicLinkReparseBuffer.PrintNameLength); break; @@ -1864,29 +1868,28 @@ static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer, goto out; } if (!IsReparseTagNameSurrogate(rp->ReparseTag) || - i_size <= sizeof(struct REPARSE_POINT)) { + size <= sizeof(struct REPARSE_POINT)) { goto out; } /* Users tag. */ - uni = Add2Ptr(rp, sizeof(struct REPARSE_POINT) - 2); - nlen = le16_to_cpu(rp->ReparseDataLength) - + uname = Add2Ptr(rp, sizeof(struct REPARSE_POINT)); + ulen = le16_to_cpu(rp->ReparseDataLength) - sizeof(struct REPARSE_POINT); } /* Convert nlen from bytes to UNICODE chars. */ - nlen >>= 1; + ulen >>= 1; /* Check that name is available. */ - if (!nlen || &uni->name[nlen] > (__le16 *)Add2Ptr(rp, i_size)) + if (!ulen || uname + ulen > (__le16 *)Add2Ptr(rp, size)) goto out; /* If name is already zero terminated then truncate it now. */ - if (!uni->name[nlen - 1]) - nlen -= 1; - uni->len = nlen; + if (!uname[ulen - 1]) + ulen -= 1; - err = ntfs_utf16_to_nls(sbi, uni, buffer, buflen); + err = ntfs_utf16_to_nls(sbi, uname, ulen, buffer, buflen); if (err < 0) goto out; diff --git a/fs/ntfs3/lib/decompress_common.h b/fs/ntfs3/lib/decompress_common.h index 2d70ae42f1b5..dd7ced000d0e 100644 --- a/fs/ntfs3/lib/decompress_common.h +++ b/fs/ntfs3/lib/decompress_common.h @@ -5,6 +5,9 @@ * Copyright (C) 2015 Eric Biggers */ +#ifndef _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H +#define _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H + #include <linux/string.h> #include <linux/compiler.h> #include <linux/types.h> @@ -336,3 +339,5 @@ static forceinline u8 *lz_copy(u8 *dst, u32 length, u32 offset, const u8 *bufend return dst; } + +#endif /* _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H */ diff --git a/fs/ntfs3/lib/lib.h b/fs/ntfs3/lib/lib.h index f508fbad2e71..90309a5ae59c 100644 --- a/fs/ntfs3/lib/lib.h +++ b/fs/ntfs3/lib/lib.h @@ -7,6 +7,10 @@ * - linux kernel code style */ +#ifndef _LINUX_NTFS3_LIB_LIB_H +#define _LINUX_NTFS3_LIB_LIB_H + +#include <linux/types.h> /* globals from xpress_decompress.c */ struct xpress_decompressor *xpress_allocate_decompressor(void); @@ -24,3 +28,5 @@ int lzx_decompress(struct lzx_decompressor *__restrict d, const void *__restrict compressed_data, size_t compressed_size, void *__restrict uncompressed_data, size_t uncompressed_size); + +#endif /* _LINUX_NTFS3_LIB_LIB_H */ diff --git a/fs/ntfs3/lznt.c b/fs/ntfs3/lznt.c index f1f691a67cc4..28f654561f27 100644 --- a/fs/ntfs3/lznt.c +++ b/fs/ntfs3/lznt.c @@ -5,13 +5,13 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> -#include <linux/fs.h> -#include <linux/nls.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/types.h> #include "debug.h" -#include "ntfs.h" #include "ntfs_fs.h" // clang-format off @@ -292,7 +292,7 @@ next: /* * get_lznt_ctx * @level: 0 - Standard compression. - * !0 - Best compression, requires a lot of cpu. + * !0 - Best compression, requires a lot of cpu. */ struct lznt *get_lznt_ctx(int level) { diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index e58415d07132..bc741213ad84 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -5,11 +5,7 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/iversion.h> -#include <linux/namei.h> #include <linux/nls.h> #include "debug.h" @@ -99,16 +95,11 @@ static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry, static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - struct ntfs_inode *ni = ntfs_i(dir); struct inode *inode; - ni_lock_dir(ni); - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, NULL); - ni_unlock(ni); - return IS_ERR(inode) ? PTR_ERR(inode) : 0; } @@ -120,16 +111,11 @@ static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir, static int ntfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { - struct ntfs_inode *ni = ntfs_i(dir); struct inode *inode; - ni_lock_dir(ni); - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, mode, rdev, NULL, 0, NULL); - ni_unlock(ni); - return IS_ERR(inode) ? PTR_ERR(inode) : 0; } @@ -200,15 +186,10 @@ static int ntfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, { u32 size = strlen(symname); struct inode *inode; - struct ntfs_inode *ni = ntfs_i(dir); - - ni_lock_dir(ni); inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFLNK | 0777, 0, symname, size, NULL); - ni_unlock(ni); - return IS_ERR(inode) ? PTR_ERR(inode) : 0; } @@ -219,15 +200,10 @@ static int ntfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; - struct ntfs_inode *ni = ntfs_i(dir); - - ni_lock_dir(ni); inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFDIR | mode, 0, NULL, 0, NULL); - ni_unlock(ni); - return IS_ERR(inode) ? PTR_ERR(inode) : 0; } diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 6bb3e595263b..9cc396b117bf 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -10,19 +10,27 @@ #ifndef _LINUX_NTFS3_NTFS_H #define _LINUX_NTFS3_NTFS_H -/* TODO: Check 4K MFT record and 512 bytes cluster. */ +#include <linux/blkdev.h> +#include <linux/build_bug.h> +#include <linux/kernel.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/types.h> + +#include "debug.h" -/* Activate this define to use binary search in indexes. */ -#define NTFS3_INDEX_BINARY_SEARCH +/* TODO: Check 4K MFT record and 512 bytes cluster. */ /* Check each run for marked clusters. */ #define NTFS3_CHECK_FREE_CLST #define NTFS_NAME_LEN 255 -/* ntfs.sys used 500 maximum links on-disk struct allows up to 0xffff. */ -#define NTFS_LINK_MAX 0x400 -//#define NTFS_LINK_MAX 0xffff +/* + * ntfs.sys used 500 maximum links on-disk struct allows up to 0xffff. + * xfstest generic/041 creates 3003 hardlinks. + */ +#define NTFS_LINK_MAX 4000 /* * Activate to use 64 bit clusters instead of 32 bits in ntfs.sys. diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index dc71c59fd445..8aaec7e0804e 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -9,6 +9,37 @@ #ifndef _LINUX_NTFS3_NTFS_FS_H #define _LINUX_NTFS3_NTFS_FS_H +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/cleancache.h> +#include <linux/fs.h> +#include <linux/highmem.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/page-flags.h> +#include <linux/pagemap.h> +#include <linux/rbtree.h> +#include <linux/rwsem.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/time64.h> +#include <linux/types.h> +#include <linux/uidgid.h> +#include <asm/div64.h> +#include <asm/page.h> + +#include "debug.h" +#include "ntfs.h" + +struct dentry; +struct fiemap_extent_info; +struct user_namespace; +struct page; +struct writeback_control; +enum utf16_endian; + + #define MINUS_ONE_T ((size_t)(-1)) /* Biggest MFT / smallest cluster */ #define MAXIMUM_BYTES_PER_MFT 4096 @@ -52,6 +83,7 @@ // clang-format on struct ntfs_mount_options { + char *nls_name; struct nls_table *nls; kuid_t fs_uid; @@ -59,19 +91,16 @@ struct ntfs_mount_options { u16 fs_fmask_inv; u16 fs_dmask_inv; - unsigned uid : 1, /* uid was set. */ - gid : 1, /* gid was set. */ - fmask : 1, /* fmask was set. */ - dmask : 1, /* dmask was set. */ - sys_immutable : 1, /* Immutable system files. */ - discard : 1, /* Issue discard requests on deletions. */ - sparse : 1, /* Create sparse files. */ - showmeta : 1, /* Show meta files. */ - nohidden : 1, /* Do not show hidden files. */ - force : 1, /* Rw mount dirty volume. */ - no_acs_rules : 1, /*Exclude acs rules. */ - prealloc : 1 /* Preallocate space when file is growing. */ - ; + unsigned fmask : 1; /* fmask was set. */ + unsigned dmask : 1; /*dmask was set. */ + unsigned sys_immutable : 1; /* Immutable system files. */ + unsigned discard : 1; /* Issue discard requests on deletions. */ + unsigned sparse : 1; /* Create sparse files. */ + unsigned showmeta : 1; /* Show meta files. */ + unsigned nohidden : 1; /* Do not show hidden files. */ + unsigned force : 1; /* RW mount dirty volume. */ + unsigned noacsrules : 1; /* Exclude acs rules. */ + unsigned prealloc : 1; /* Preallocate space when file is growing. */ }; /* Special value to unpack and deallocate. */ @@ -182,10 +211,8 @@ struct ntfs_sb_info { u32 blocks_per_cluster; // cluster_size / sb->s_blocksize u32 record_size; - u32 sector_size; u32 index_size; - u8 sector_bits; u8 cluster_bits; u8 record_bits; @@ -279,7 +306,7 @@ struct ntfs_sb_info { #endif } compress; - struct ntfs_mount_options options; + struct ntfs_mount_options *options; struct ratelimit_state msg_ratelimit; }; @@ -436,7 +463,7 @@ bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le); bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn, const __le16 *name, size_t name_len, const struct MFT_REF *ref); -int al_update(struct ntfs_inode *ni); +int al_update(struct ntfs_inode *ni, int sync); static inline size_t al_aligned(size_t size) { return (size + 1023) & ~(size_t)1023; @@ -448,7 +475,7 @@ bool are_bits_set(const ulong *map, size_t bit, size_t nbits); size_t get_set_bits_ex(const ulong *map, size_t bit, size_t nbits); /* Globals from dir.c */ -int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni, +int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len, u8 *buf, int buf_len); int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len, struct cpu_str *uni, u32 max_ulen, @@ -520,7 +547,7 @@ struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type, struct ATTR_LIST_ENTRY **entry); int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa); enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, - void *buffer); + struct REPARSE_DATA_BUFFER *buffer); int ni_write_inode(struct inode *inode, int sync, const char *hint); #define _ni_write_inode(i, w) ni_write_inode(i, w, __func__) int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, @@ -577,7 +604,7 @@ int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer); int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, const void *buffer, int wait); int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, - u64 vbo, const void *buf, size_t bytes); + u64 vbo, const void *buf, size_t bytes, int sync); struct buffer_head *ntfs_bread_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo); int ntfs_read_run_nb(struct ntfs_sb_info *sbi, const struct runs_tree *run, diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index 103705c86772..861e35791506 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -5,10 +5,7 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/nls.h> #include "debug.h" #include "ntfs.h" diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c index 26ed2b64345e..a8fec651f973 100644 --- a/fs/ntfs3/run.c +++ b/fs/ntfs3/run.c @@ -7,10 +7,8 @@ */ #include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/log2.h> -#include <linux/nls.h> #include "debug.h" #include "ntfs.h" diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 55bbc9200a10..29813200c7af 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -23,16 +23,15 @@ * */ -#include <linux/backing-dev.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/fs.h> -#include <linux/iversion.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/log2.h> #include <linux/module.h> #include <linux/nls.h> -#include <linux/parser.h> #include <linux/seq_file.h> #include <linux/statfs.h> @@ -205,9 +204,11 @@ void *ntfs_put_shared(void *ptr) return ret; } -static inline void clear_mount_options(struct ntfs_mount_options *options) +static inline void put_mount_options(struct ntfs_mount_options *options) { + kfree(options->nls_name); unload_nls(options->nls); + kfree(options); } enum Opt { @@ -223,218 +224,175 @@ enum Opt { Opt_nohidden, Opt_showmeta, Opt_acl, - Opt_noatime, - Opt_nls, + Opt_iocharset, Opt_prealloc, - Opt_no_acs_rules, + Opt_noacsrules, Opt_err, }; -static const match_table_t ntfs_tokens = { - { Opt_uid, "uid=%u" }, - { Opt_gid, "gid=%u" }, - { Opt_umask, "umask=%o" }, - { Opt_dmask, "dmask=%o" }, - { Opt_fmask, "fmask=%o" }, - { Opt_immutable, "sys_immutable" }, - { Opt_discard, "discard" }, - { Opt_force, "force" }, - { Opt_sparse, "sparse" }, - { Opt_nohidden, "nohidden" }, - { Opt_acl, "acl" }, - { Opt_noatime, "noatime" }, - { Opt_showmeta, "showmeta" }, - { Opt_nls, "nls=%s" }, - { Opt_prealloc, "prealloc" }, - { Opt_no_acs_rules, "no_acs_rules" }, - { Opt_err, NULL }, +static const struct fs_parameter_spec ntfs_fs_parameters[] = { + fsparam_u32("uid", Opt_uid), + fsparam_u32("gid", Opt_gid), + fsparam_u32oct("umask", Opt_umask), + fsparam_u32oct("dmask", Opt_dmask), + fsparam_u32oct("fmask", Opt_fmask), + fsparam_flag_no("sys_immutable", Opt_immutable), + fsparam_flag_no("discard", Opt_discard), + fsparam_flag_no("force", Opt_force), + fsparam_flag_no("sparse", Opt_sparse), + fsparam_flag_no("hidden", Opt_nohidden), + fsparam_flag_no("acl", Opt_acl), + fsparam_flag_no("showmeta", Opt_showmeta), + fsparam_flag_no("prealloc", Opt_prealloc), + fsparam_flag_no("acsrules", Opt_noacsrules), + fsparam_string("iocharset", Opt_iocharset), + {} }; -static noinline int ntfs_parse_options(struct super_block *sb, char *options, - int silent, - struct ntfs_mount_options *opts) +/* + * Load nls table or if @nls is utf8 then return NULL. + */ +static struct nls_table *ntfs_load_nls(char *nls) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - char nls_name[30]; - struct nls_table *nls; + struct nls_table *ret; - opts->fs_uid = current_uid(); - opts->fs_gid = current_gid(); - opts->fs_fmask_inv = opts->fs_dmask_inv = ~current_umask(); - nls_name[0] = 0; + if (!nls) + nls = CONFIG_NLS_DEFAULT; - if (!options) - goto out; + if (strcmp(nls, "utf8") == 0) + return NULL; - while ((p = strsep(&options, ","))) { - int token; + if (strcmp(nls, CONFIG_NLS_DEFAULT) == 0) + return load_nls_default(); - if (!*p) - continue; + ret = load_nls(nls); + if (ret) + return ret; - token = match_token(p, ntfs_tokens, args); - switch (token) { - case Opt_immutable: - opts->sys_immutable = 1; - break; - case Opt_uid: - if (match_int(&args[0], &option)) - return -EINVAL; - opts->fs_uid = make_kuid(current_user_ns(), option); - if (!uid_valid(opts->fs_uid)) - return -EINVAL; - opts->uid = 1; - break; - case Opt_gid: - if (match_int(&args[0], &option)) - return -EINVAL; - opts->fs_gid = make_kgid(current_user_ns(), option); - if (!gid_valid(opts->fs_gid)) - return -EINVAL; - opts->gid = 1; - break; - case Opt_umask: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->fs_fmask_inv = opts->fs_dmask_inv = ~option; - opts->fmask = opts->dmask = 1; - break; - case Opt_dmask: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->fs_dmask_inv = ~option; - opts->dmask = 1; - break; - case Opt_fmask: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->fs_fmask_inv = ~option; - opts->fmask = 1; - break; - case Opt_discard: - opts->discard = 1; - break; - case Opt_force: - opts->force = 1; - break; - case Opt_sparse: - opts->sparse = 1; - break; - case Opt_nohidden: - opts->nohidden = 1; - break; - case Opt_acl: + return ERR_PTR(-EINVAL); +} + +static int ntfs_fs_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + struct ntfs_mount_options *opts = fc->fs_private; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, ntfs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + opts->fs_uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(opts->fs_uid)) + return invalf(fc, "ntfs3: Invalid value for uid."); + break; + case Opt_gid: + opts->fs_gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(opts->fs_gid)) + return invalf(fc, "ntfs3: Invalid value for gid."); + break; + case Opt_umask: + if (result.uint_32 & ~07777) + return invalf(fc, "ntfs3: Invalid value for umask."); + opts->fs_fmask_inv = ~result.uint_32; + opts->fs_dmask_inv = ~result.uint_32; + opts->fmask = 1; + opts->dmask = 1; + break; + case Opt_dmask: + if (result.uint_32 & ~07777) + return invalf(fc, "ntfs3: Invalid value for dmask."); + opts->fs_dmask_inv = ~result.uint_32; + opts->dmask = 1; + break; + case Opt_fmask: + if (result.uint_32 & ~07777) + return invalf(fc, "ntfs3: Invalid value for fmask."); + opts->fs_fmask_inv = ~result.uint_32; + opts->fmask = 1; + break; + case Opt_immutable: + opts->sys_immutable = result.negated ? 0 : 1; + break; + case Opt_discard: + opts->discard = result.negated ? 0 : 1; + break; + case Opt_force: + opts->force = result.negated ? 0 : 1; + break; + case Opt_sparse: + opts->sparse = result.negated ? 0 : 1; + break; + case Opt_nohidden: + opts->nohidden = result.negated ? 1 : 0; + break; + case Opt_acl: + if (!result.negated) #ifdef CONFIG_NTFS3_FS_POSIX_ACL - sb->s_flags |= SB_POSIXACL; - break; + fc->sb_flags |= SB_POSIXACL; #else - ntfs_err(sb, "support for ACL not compiled in!"); - return -EINVAL; + return invalf(fc, "ntfs3: Support for ACL not compiled in!"); #endif - case Opt_noatime: - sb->s_flags |= SB_NOATIME; - break; - case Opt_showmeta: - opts->showmeta = 1; - break; - case Opt_nls: - match_strlcpy(nls_name, &args[0], sizeof(nls_name)); - break; - case Opt_prealloc: - opts->prealloc = 1; - break; - case Opt_no_acs_rules: - opts->no_acs_rules = 1; - break; - default: - if (!silent) - ntfs_err( - sb, - "Unrecognized mount option \"%s\" or missing value", - p); - //return -EINVAL; - } - } - -out: - if (!strcmp(nls_name[0] ? nls_name : CONFIG_NLS_DEFAULT, "utf8")) { - /* - * For UTF-8 use utf16s_to_utf8s()/utf8s_to_utf16s() - * instead of NLS. - */ - nls = NULL; - } else if (nls_name[0]) { - nls = load_nls(nls_name); - if (!nls) { - ntfs_err(sb, "failed to load \"%s\"", nls_name); - return -EINVAL; - } - } else { - nls = load_nls_default(); - if (!nls) { - ntfs_err(sb, "failed to load default nls"); - return -EINVAL; - } + else + fc->sb_flags &= ~SB_POSIXACL; + break; + case Opt_showmeta: + opts->showmeta = result.negated ? 0 : 1; + break; + case Opt_iocharset: + kfree(opts->nls_name); + opts->nls_name = param->string; + param->string = NULL; + break; + case Opt_prealloc: + opts->prealloc = result.negated ? 0 : 1; + break; + case Opt_noacsrules: + opts->noacsrules = result.negated ? 1 : 0; + break; + default: + /* Should not be here unless we forget add case. */ + return -EINVAL; } - opts->nls = nls; - return 0; } -static int ntfs_remount(struct super_block *sb, int *flags, char *data) +static int ntfs_fs_reconfigure(struct fs_context *fc) { - int err, ro_rw; + struct super_block *sb = fc->root->d_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; - struct ntfs_mount_options old_opts; - char *orig_data = kstrdup(data, GFP_KERNEL); - - if (data && !orig_data) - return -ENOMEM; + struct ntfs_mount_options *new_opts = fc->fs_private; + int ro_rw; - /* Store original options. */ - memcpy(&old_opts, &sbi->options, sizeof(old_opts)); - clear_mount_options(&sbi->options); - memset(&sbi->options, 0, sizeof(sbi->options)); - - err = ntfs_parse_options(sb, data, 0, &sbi->options); - if (err) - goto restore_opts; - - ro_rw = sb_rdonly(sb) && !(*flags & SB_RDONLY); + ro_rw = sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY); if (ro_rw && (sbi->flags & NTFS_FLAGS_NEED_REPLAY)) { - ntfs_warn( - sb, - "Couldn't remount rw because journal is not replayed. Please umount/remount instead\n"); - err = -EINVAL; - goto restore_opts; + errorf(fc, "ntfs3: Couldn't remount rw because journal is not replayed. Please umount/remount instead\n"); + return -EINVAL; } + new_opts->nls = ntfs_load_nls(new_opts->nls_name); + if (IS_ERR(new_opts->nls)) { + new_opts->nls = NULL; + errorf(fc, "ntfs3: Cannot load iocharset %s", new_opts->nls_name); + return -EINVAL; + } + if (new_opts->nls != sbi->options->nls) + return invalf(fc, "ntfs3: Cannot use different iocharset when remounting!"); + sync_filesystem(sb); if (ro_rw && (sbi->volume.flags & VOLUME_FLAG_DIRTY) && - !sbi->options.force) { - ntfs_warn(sb, "volume is dirty and \"force\" flag is not set!"); - err = -EINVAL; - goto restore_opts; + !new_opts->force) { + errorf(fc, "ntfs3: Volume is dirty and \"force\" flag is not set!"); + return -EINVAL; } - clear_mount_options(&old_opts); - - *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME) | - SB_NODIRATIME | SB_NOATIME; - ntfs_info(sb, "re-mounted. Opts: %s", orig_data); - err = 0; - goto out; + memcpy(sbi->options, new_opts, sizeof(*new_opts)); -restore_opts: - clear_mount_options(&sbi->options); - memcpy(&sbi->options, &old_opts, sizeof(old_opts)); - -out: - kfree(orig_data); - return err; + return 0; } static struct kmem_cache *ntfs_inode_cachep; @@ -513,8 +471,6 @@ static noinline void put_ntfs(struct ntfs_sb_info *sbi) xpress_free_decompressor(sbi->compress.xpress); lzx_free_decompressor(sbi->compress.lzx); #endif - clear_mount_options(&sbi->options); - kfree(sbi); } @@ -525,7 +481,9 @@ static void ntfs_put_super(struct super_block *sb) /* Mark rw ntfs as clear, if possible. */ ntfs_set_state(sbi, NTFS_DIRTY_CLEAR); + put_mount_options(sbi->options); put_ntfs(sbi); + sb->s_fs_info = NULL; sync_blockdev(sb->s_bdev); } @@ -552,23 +510,21 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root) { struct super_block *sb = root->d_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; - struct ntfs_mount_options *opts = &sbi->options; + struct ntfs_mount_options *opts = sbi->options; struct user_namespace *user_ns = seq_user_ns(m); - if (opts->uid) - seq_printf(m, ",uid=%u", - from_kuid_munged(user_ns, opts->fs_uid)); - if (opts->gid) - seq_printf(m, ",gid=%u", - from_kgid_munged(user_ns, opts->fs_gid)); + seq_printf(m, ",uid=%u", + from_kuid_munged(user_ns, opts->fs_uid)); + seq_printf(m, ",gid=%u", + from_kgid_munged(user_ns, opts->fs_gid)); if (opts->fmask) seq_printf(m, ",fmask=%04o", ~opts->fs_fmask_inv); if (opts->dmask) seq_printf(m, ",dmask=%04o", ~opts->fs_dmask_inv); if (opts->nls) - seq_printf(m, ",nls=%s", opts->nls->charset); + seq_printf(m, ",iocharset=%s", opts->nls->charset); else - seq_puts(m, ",nls=utf8"); + seq_puts(m, ",iocharset=utf8"); if (opts->sys_immutable) seq_puts(m, ",sys_immutable"); if (opts->discard) @@ -581,14 +537,12 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",nohidden"); if (opts->force) seq_puts(m, ",force"); - if (opts->no_acs_rules) - seq_puts(m, ",no_acs_rules"); + if (opts->noacsrules) + seq_puts(m, ",noacsrules"); if (opts->prealloc) seq_puts(m, ",prealloc"); if (sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); - if (sb->s_flags & SB_NOATIME) - seq_puts(m, ",noatime"); return 0; } @@ -643,7 +597,6 @@ static const struct super_operations ntfs_sops = { .statfs = ntfs_statfs, .show_options = ntfs_show_options, .sync_fs = ntfs_sync_fs, - .remount_fs = ntfs_remount, .write_inode = ntfs3_write_inode, }; @@ -729,7 +682,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, struct ntfs_sb_info *sbi = sb->s_fs_info; int err; u32 mb, gb, boot_sector_size, sct_per_clst, record_size; - u64 sectors, clusters, fs_size, mlcn, mlcn2; + u64 sectors, clusters, mlcn, mlcn2; struct NTFS_BOOT *boot; struct buffer_head *bh; struct MFT_REC *rec; @@ -787,20 +740,20 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, goto out; } - sbi->sector_size = boot_sector_size; - sbi->sector_bits = blksize_bits(boot_sector_size); - fs_size = (sectors + 1) << sbi->sector_bits; + sbi->volume.size = sectors * boot_sector_size; - gb = format_size_gb(fs_size, &mb); + gb = format_size_gb(sbi->volume.size + boot_sector_size, &mb); /* * - Volume formatted and mounted with the same sector size. * - Volume formatted 4K and mounted as 512. * - Volume formatted 512 and mounted as 4K. */ - if (sbi->sector_size != sector_size) { - ntfs_warn(sb, - "Different NTFS' sector size and media sector size"); + if (boot_sector_size != sector_size) { + ntfs_warn( + sb, + "Different NTFS' sector size (%u) and media sector size (%u)", + boot_sector_size, sector_size); dev_size += sector_size - 1; } @@ -810,8 +763,19 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, sbi->mft.lbo = mlcn << sbi->cluster_bits; sbi->mft.lbo2 = mlcn2 << sbi->cluster_bits; - if (sbi->cluster_size < sbi->sector_size) + /* Compare boot's cluster and sector. */ + if (sbi->cluster_size < boot_sector_size) + goto out; + + /* Compare boot's cluster and media sector. */ + if (sbi->cluster_size < sector_size) { + /* No way to use ntfs_get_block in this case. */ + ntfs_err( + sb, + "Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u)", + sbi->cluster_size, sector_size); goto out; + } sbi->cluster_mask = sbi->cluster_size - 1; sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask; @@ -836,10 +800,9 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, : (u32)boot->index_size << sbi->cluster_bits; sbi->volume.ser_num = le64_to_cpu(boot->serial_num); - sbi->volume.size = sectors << sbi->sector_bits; /* Warning if RAW volume. */ - if (dev_size < fs_size) { + if (dev_size < sbi->volume.size + boot_sector_size) { u32 mb0, gb0; gb0 = format_size_gb(dev_size, &mb0); @@ -883,8 +846,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, rec->total = cpu_to_le32(sbi->record_size); ((struct ATTRIB *)Add2Ptr(rec, ao))->type = ATTR_END; - if (sbi->cluster_size < PAGE_SIZE) - sb_set_blocksize(sb, sbi->cluster_size); + sb_set_blocksize(sb, min_t(u32, sbi->cluster_size, PAGE_SIZE)); sbi->block_mask = sb->s_blocksize - 1; sbi->blocks_per_cluster = sbi->cluster_size >> sb->s_blocksize_bits; @@ -897,9 +859,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, if (clusters >= (1ull << (64 - sbi->cluster_bits))) sbi->maxbytes = -1; sbi->maxbytes_sparse = -1; + sb->s_maxbytes = MAX_LFS_FILESIZE; #else /* Maximum size for sparse file. */ sbi->maxbytes_sparse = (1ull << (sbi->cluster_bits + 32)) - 1; + sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits; #endif err = 0; @@ -913,14 +877,13 @@ out: /* * ntfs_fill_super - Try to mount. */ -static int ntfs_fill_super(struct super_block *sb, void *data, int silent) +static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) { int err; - struct ntfs_sb_info *sbi; + struct ntfs_sb_info *sbi = sb->s_fs_info; struct block_device *bdev = sb->s_bdev; - struct inode *bd_inode = bdev->bd_inode; - struct request_queue *rq = bdev_get_queue(bdev); - struct inode *inode = NULL; + struct request_queue *rq; + struct inode *inode; struct ntfs_inode *ni; size_t i, tt; CLST vcn, lcn, len; @@ -928,18 +891,11 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) const struct VOLUME_INFO *info; u32 idx, done, bytes; struct ATTR_DEF_ENTRY *t; - u16 *upcase = NULL; u16 *shared; - bool is_ro; struct MFT_REF ref; ref.high = 0; - sbi = kzalloc(sizeof(struct ntfs_sb_info), GFP_NOFS); - if (!sbi) - return -ENOMEM; - - sb->s_fs_info = sbi; sbi->sb = sb; sb->s_flags |= SB_NODIRATIME; sb->s_magic = 0x7366746e; // "ntfs" @@ -948,41 +904,27 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec sb->s_xattr = ntfs_xattr_handlers; - ratelimit_state_init(&sbi->msg_ratelimit, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - - err = ntfs_parse_options(sb, data, silent, &sbi->options); - if (err) + sbi->options->nls = ntfs_load_nls(sbi->options->nls_name); + if (IS_ERR(sbi->options->nls)) { + sbi->options->nls = NULL; + errorf(fc, "Cannot load nls %s", sbi->options->nls_name); + err = -EINVAL; goto out; + } - if (!rq || !blk_queue_discard(rq) || !rq->limits.discard_granularity) { - ; - } else { + rq = bdev_get_queue(bdev); + if (blk_queue_discard(rq) && rq->limits.discard_granularity) { sbi->discard_granularity = rq->limits.discard_granularity; sbi->discard_granularity_mask_inv = ~(u64)(sbi->discard_granularity - 1); } - sb_set_blocksize(sb, PAGE_SIZE); - /* Parse boot. */ err = ntfs_init_from_boot(sb, rq ? queue_logical_block_size(rq) : 512, - bd_inode->i_size); + bdev_nr_bytes(bdev)); if (err) goto out; -#ifdef CONFIG_NTFS3_64BIT_CLUSTER - sb->s_maxbytes = MAX_LFS_FILESIZE; -#else - sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits; -#endif - - mutex_init(&sbi->compress.mtx_lznt); -#ifdef CONFIG_NTFS3_LZX_XPRESS - mutex_init(&sbi->compress.mtx_xpress); - mutex_init(&sbi->compress.mtx_lzx); -#endif - /* * Load $Volume. This should be done before $LogFile * 'cause 'sbi->volume.ni' is used 'ntfs_set_state'. @@ -991,9 +933,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_VOL); inode = ntfs_iget5(sb, &ref, &NAME_VOLUME); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $Volume."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1015,36 +956,33 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) } else { /* Should we break mounting here? */ //err = -EINVAL; - //goto out; + //goto put_inode_out; } attr = ni_find_attr(ni, attr, NULL, ATTR_VOL_INFO, NULL, 0, NULL, NULL); if (!attr || is_attr_ext(attr)) { err = -EINVAL; - goto out; + goto put_inode_out; } info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO); if (!info) { err = -EINVAL; - goto out; + goto put_inode_out; } sbi->volume.major_ver = info->major_ver; sbi->volume.minor_ver = info->minor_ver; sbi->volume.flags = info->flags; - sbi->volume.ni = ni; - inode = NULL; /* Load $MFTMirr to estimate recs_mirr. */ ref.low = cpu_to_le32(MFT_REC_MIRR); ref.seq = cpu_to_le16(MFT_REC_MIRR); inode = ntfs_iget5(sb, &ref, &NAME_MIRROR); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $MFTMirr."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1058,9 +996,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_LOG); inode = ntfs_iget5(sb, &ref, &NAME_LOGFILE); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load \x24LogFile."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1068,22 +1005,19 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) err = ntfs_loadlog_and_replay(ni, sbi); if (err) - goto out; + goto put_inode_out; iput(inode); - inode = NULL; - - is_ro = sb_rdonly(sbi->sb); if (sbi->flags & NTFS_FLAGS_NEED_REPLAY) { - if (!is_ro) { + if (!sb_rdonly(sb)) { ntfs_warn(sb, "failed to replay log file. Can't mount rw!"); err = -EINVAL; goto out; } } else if (sbi->volume.flags & VOLUME_FLAG_DIRTY) { - if (!is_ro && !sbi->options.force) { + if (!sb_rdonly(sb) && !sbi->options->force) { ntfs_warn( sb, "volume is dirty and \"force\" flag is not set!"); @@ -1098,9 +1032,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) inode = ntfs_iget5(sb, &ref, &NAME_MFT); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $MFT."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1112,11 +1045,11 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) err = wnd_init(&sbi->mft.bitmap, sb, tt); if (err) - goto out; + goto put_inode_out; err = ni_load_all_mi(ni); if (err) - goto out; + goto put_inode_out; sbi->mft.ni = ni; @@ -1125,9 +1058,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_BADCLUST); inode = ntfs_iget5(sb, &ref, &NAME_BADCLUS); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $BadClus."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1150,18 +1082,15 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_BITMAP); inode = ntfs_iget5(sb, &ref, &NAME_BITMAP); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $Bitmap."); - inode = NULL; + err = PTR_ERR(inode); goto out; } - ni = ntfs_i(inode); - #ifndef CONFIG_NTFS3_64BIT_CLUSTER if (inode->i_size >> 32) { err = -EINVAL; - goto out; + goto put_inode_out; } #endif @@ -1169,14 +1098,14 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) tt = sbi->used.bitmap.nbits; if (inode->i_size < bitmap_size(tt)) { err = -EINVAL; - goto out; + goto put_inode_out; } /* Not necessary. */ sbi->used.bitmap.set_tail = true; - err = wnd_init(&sbi->used.bitmap, sbi->sb, tt); + err = wnd_init(&sbi->used.bitmap, sb, tt); if (err) - goto out; + goto put_inode_out; iput(inode); @@ -1188,23 +1117,22 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) /* Load $AttrDef. */ ref.low = cpu_to_le32(MFT_REC_ATTR); ref.seq = cpu_to_le16(MFT_REC_ATTR); - inode = ntfs_iget5(sbi->sb, &ref, &NAME_ATTRDEF); + inode = ntfs_iget5(sb, &ref, &NAME_ATTRDEF); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $AttrDef -> %d", err); - inode = NULL; + err = PTR_ERR(inode); goto out; } if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY)) { err = -EINVAL; - goto out; + goto put_inode_out; } bytes = inode->i_size; sbi->def_table = t = kmalloc(bytes, GFP_NOFS); if (!t) { err = -ENOMEM; - goto out; + goto put_inode_out; } for (done = idx = 0; done < bytes; done += PAGE_SIZE, idx++) { @@ -1213,7 +1141,7 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) if (IS_ERR(page)) { err = PTR_ERR(page); - goto out; + goto put_inode_out; } memcpy(Add2Ptr(t, done), page_address(page), min(PAGE_SIZE, tail)); @@ -1221,7 +1149,7 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) if (!idx && ATTR_STD != t->type) { err = -EINVAL; - goto out; + goto put_inode_out; } } @@ -1254,33 +1182,24 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_UPCASE); inode = ntfs_iget5(sb, &ref, &NAME_UPCASE); if (IS_ERR(inode)) { + ntfs_err(sb, "Failed to load $UpCase."); err = PTR_ERR(inode); - ntfs_err(sb, "Failed to load \x24LogFile."); - inode = NULL; goto out; } - ni = ntfs_i(inode); - if (inode->i_size != 0x10000 * sizeof(short)) { err = -EINVAL; - goto out; - } - - sbi->upcase = upcase = kvmalloc(0x10000 * sizeof(short), GFP_KERNEL); - if (!upcase) { - err = -ENOMEM; - goto out; + goto put_inode_out; } for (idx = 0; idx < (0x10000 * sizeof(short) >> PAGE_SHIFT); idx++) { const __le16 *src; - u16 *dst = Add2Ptr(upcase, idx << PAGE_SHIFT); + u16 *dst = Add2Ptr(sbi->upcase, idx << PAGE_SHIFT); struct page *page = ntfs_map_page(inode->i_mapping, idx); if (IS_ERR(page)) { err = PTR_ERR(page); - goto out; + goto put_inode_out; } src = page_address(page); @@ -1294,14 +1213,13 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ntfs_unmap_page(page); } - shared = ntfs_set_shared(upcase, 0x10000 * sizeof(short)); - if (shared && upcase != shared) { + shared = ntfs_set_shared(sbi->upcase, 0x10000 * sizeof(short)); + if (shared && sbi->upcase != shared) { + kvfree(sbi->upcase); sbi->upcase = shared; - kvfree(upcase); } iput(inode); - inode = NULL; if (is_ntfs3(sbi)) { /* Load $Secure. */ @@ -1331,34 +1249,31 @@ load_root: ref.seq = cpu_to_le16(MFT_REC_ROOT); inode = ntfs_iget5(sb, &ref, &NAME_ROOT); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load root."); - inode = NULL; + err = PTR_ERR(inode); goto out; } - ni = ntfs_i(inode); - sb->s_root = d_make_root(inode); - if (!sb->s_root) { - err = -EINVAL; - goto out; + err = -ENOMEM; + goto put_inode_out; } + fc->fs_private = NULL; + return 0; -out: +put_inode_out: iput(inode); - - if (sb->s_root) { - d_drop(sb->s_root); - sb->s_root = NULL; - } - +out: + /* + * Free resources here. + * ntfs_fs_free will be called with fc->s_fs_info = NULL + */ put_ntfs(sbi); - sb->s_fs_info = NULL; + return err; } @@ -1403,7 +1318,7 @@ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len) if (sbi->flags & NTFS_FLAGS_NODISCARD) return -EOPNOTSUPP; - if (!sbi->options.discard) + if (!sbi->options->discard) return -EOPNOTSUPP; lbo = (u64)lcn << sbi->cluster_bits; @@ -1428,19 +1343,99 @@ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len) return err; } -static struct dentry *ntfs_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) +static int ntfs_fs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, ntfs_fill_super); +} + +/* + * ntfs_fs_free - Free fs_context. + * + * Note that this will be called after fill_super and reconfigure + * even when they pass. So they have to take pointers if they pass. + */ +static void ntfs_fs_free(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super); + struct ntfs_mount_options *opts = fc->fs_private; + struct ntfs_sb_info *sbi = fc->s_fs_info; + + if (sbi) + put_ntfs(sbi); + + if (opts) + put_mount_options(opts); +} + +static const struct fs_context_operations ntfs_context_ops = { + .parse_param = ntfs_fs_parse_param, + .get_tree = ntfs_fs_get_tree, + .reconfigure = ntfs_fs_reconfigure, + .free = ntfs_fs_free, +}; + +/* + * ntfs_init_fs_context - Initialize spi and opts + * + * This will called when mount/remount. We will first initiliaze + * options so that if remount we can use just that. + */ +static int ntfs_init_fs_context(struct fs_context *fc) +{ + struct ntfs_mount_options *opts; + struct ntfs_sb_info *sbi; + + opts = kzalloc(sizeof(struct ntfs_mount_options), GFP_NOFS); + if (!opts) + return -ENOMEM; + + /* Default options. */ + opts->fs_uid = current_uid(); + opts->fs_gid = current_gid(); + opts->fs_fmask_inv = ~current_umask(); + opts->fs_dmask_inv = ~current_umask(); + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) + goto ok; + + sbi = kzalloc(sizeof(struct ntfs_sb_info), GFP_NOFS); + if (!sbi) + goto free_opts; + + sbi->upcase = kvmalloc(0x10000 * sizeof(short), GFP_KERNEL); + if (!sbi->upcase) + goto free_sbi; + + ratelimit_state_init(&sbi->msg_ratelimit, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + mutex_init(&sbi->compress.mtx_lznt); +#ifdef CONFIG_NTFS3_LZX_XPRESS + mutex_init(&sbi->compress.mtx_xpress); + mutex_init(&sbi->compress.mtx_lzx); +#endif + + sbi->options = opts; + fc->s_fs_info = sbi; +ok: + fc->fs_private = opts; + fc->ops = &ntfs_context_ops; + + return 0; +free_sbi: + kfree(sbi); +free_opts: + kfree(opts); + return -ENOMEM; } // clang-format off static struct file_system_type ntfs_fs_type = { - .owner = THIS_MODULE, - .name = "ntfs3", - .mount = ntfs_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .owner = THIS_MODULE, + .name = "ntfs3", + .init_fs_context = ntfs_init_fs_context, + .parameters = ntfs_fs_parameters, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; // clang-format on diff --git a/fs/ntfs3/upcase.c b/fs/ntfs3/upcase.c index bbeba778237e..b5e8256fd710 100644 --- a/fs/ntfs3/upcase.c +++ b/fs/ntfs3/upcase.c @@ -5,13 +5,9 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> -#include <linux/module.h> -#include <linux/nls.h> +#include <linux/kernel.h> +#include <linux/types.h> -#include "debug.h" -#include "ntfs.h" #include "ntfs_fs.h" static inline u16 upcase_unicode_char(const u16 *upcase, u16 chr) diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 7282d85c4ece..afd0ddad826f 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -5,10 +5,7 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/nls.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> @@ -78,6 +75,7 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, size_t add_bytes, const struct EA_INFO **info) { int err; + struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTR_LIST_ENTRY *le = NULL; struct ATTRIB *attr_info, *attr_ea; void *ea_p; @@ -102,10 +100,10 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, /* Check Ea limit. */ size = le32_to_cpu((*info)->size); - if (size > ni->mi.sbi->ea_max_size) + if (size > sbi->ea_max_size) return -EFBIG; - if (attr_size(attr_ea) > ni->mi.sbi->ea_max_size) + if (attr_size(attr_ea) > sbi->ea_max_size) return -EFBIG; /* Allocate memory for packed Ea. */ @@ -113,15 +111,16 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, if (!ea_p) return -ENOMEM; - if (attr_ea->non_res) { + if (!size) { + ; + } else if (attr_ea->non_res) { struct runs_tree run; run_init(&run); err = attr_load_runs(attr_ea, ni, &run, NULL); if (!err) - err = ntfs_read_run_nb(ni->mi.sbi, &run, 0, ea_p, size, - NULL); + err = ntfs_read_run_nb(sbi, &run, 0, ea_p, size, NULL); run_close(&run); if (err) @@ -260,7 +259,7 @@ out: static noinline int ntfs_set_ea(struct inode *inode, const char *name, size_t name_len, const void *value, - size_t val_size, int flags, int locked) + size_t val_size, int flags) { struct ntfs_inode *ni = ntfs_i(inode); struct ntfs_sb_info *sbi = ni->mi.sbi; @@ -279,8 +278,7 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name, u64 new_sz; void *p; - if (!locked) - ni_lock(ni); + ni_lock(ni); run_init(&ea_run); @@ -370,21 +368,22 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name, new_ea->name[name_len] = 0; memcpy(new_ea->name + name_len + 1, value, val_size); new_pack = le16_to_cpu(ea_info.size_pack) + packed_ea_size(new_ea); - - /* Should fit into 16 bits. */ - if (new_pack > 0xffff) { - err = -EFBIG; // -EINVAL? - goto out; - } ea_info.size_pack = cpu_to_le16(new_pack); - /* New size of ATTR_EA. */ size += add; - if (size > sbi->ea_max_size) { + ea_info.size = cpu_to_le32(size); + + /* + * 1. Check ea_info.size_pack for overflow. + * 2. New attibute size must fit value from $AttrDef + */ + if (new_pack > 0xffff || size > sbi->ea_max_size) { + ntfs_inode_warn( + inode, + "The size of extended attributes must not exceed 64KiB"); err = -EFBIG; // -EINVAL? goto out; } - ea_info.size = cpu_to_le32(size); update_ea: @@ -444,7 +443,7 @@ update_ea: /* Delete xattr, ATTR_EA */ ni_remove_attr_le(ni, attr, mi, le); } else if (attr->non_res) { - err = ntfs_sb_write_run(sbi, &ea_run, 0, ea_all, size); + err = ntfs_sb_write_run(sbi, &ea_run, 0, ea_all, size, 0); if (err) goto out; } else { @@ -468,8 +467,7 @@ update_ea: mark_inode_dirty(&ni->vfs_inode); out: - if (!locked) - ni_unlock(ni); + ni_unlock(ni); run_close(&ea_run); kfree(ea_all); @@ -478,12 +476,6 @@ out: } #ifdef CONFIG_NTFS3_FS_POSIX_ACL -static inline void ntfs_posix_acl_release(struct posix_acl *acl) -{ - if (acl && refcount_dec_and_test(&acl->a_refcount)) - kfree(acl); -} - static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns, struct inode *inode, int type, int locked) @@ -521,12 +513,15 @@ static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns, /* Translate extended attribute to acl. */ if (err >= 0) { acl = posix_acl_from_xattr(mnt_userns, buf, err); - if (!IS_ERR(acl)) - set_cached_acl(inode, type, acl); + } else if (err == -ENODATA) { + acl = NULL; } else { - acl = err == -ENODATA ? NULL : ERR_PTR(err); + acl = ERR_PTR(err); } + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + __putname(buf); return acl; @@ -546,12 +541,13 @@ struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu) static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, - int type, int locked) + int type) { const char *name; size_t size, name_len; void *value = NULL; int err = 0; + int flags; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; @@ -561,22 +557,15 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, if (acl) { umode_t mode = inode->i_mode; - err = posix_acl_equiv_mode(acl, &mode); - if (err < 0) - return err; + err = posix_acl_update_mode(mnt_userns, inode, &mode, + &acl); + if (err) + goto out; if (inode->i_mode != mode) { inode->i_mode = mode; mark_inode_dirty(inode); } - - if (!err) { - /* - * ACL can be exactly represented in the - * traditional file mode permission bits. - */ - acl = NULL; - } } name = XATTR_NAME_POSIX_ACL_ACCESS; name_len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1; @@ -594,20 +583,24 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, } if (!acl) { + /* Remove xattr if it can be presented via mode. */ size = 0; value = NULL; + flags = XATTR_REPLACE; } else { size = posix_acl_xattr_size(acl->a_count); value = kmalloc(size, GFP_NOFS); if (!value) return -ENOMEM; - err = posix_acl_to_xattr(mnt_userns, acl, value, size); if (err < 0) goto out; + flags = 0; } - err = ntfs_set_ea(inode, name, name_len, value, size, 0, locked); + err = ntfs_set_ea(inode, name, name_len, value, size, flags); + if (err == -ENODATA && !size) + err = 0; /* Removing non existed xattr. */ if (!err) set_cached_acl(inode, type, acl); @@ -623,68 +616,7 @@ out: int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type) { - return ntfs_set_acl_ex(mnt_userns, inode, acl, type, 0); -} - -static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns, - struct inode *inode, int type, void *buffer, - size_t size) -{ - struct posix_acl *acl; - int err; - - if (!(inode->i_sb->s_flags & SB_POSIXACL)) { - ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); - return -EOPNOTSUPP; - } - - acl = ntfs_get_acl(inode, type, false); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (!acl) - return -ENODATA; - - err = posix_acl_to_xattr(mnt_userns, acl, buffer, size); - ntfs_posix_acl_release(acl); - - return err; -} - -static int ntfs_xattr_set_acl(struct user_namespace *mnt_userns, - struct inode *inode, int type, const void *value, - size_t size) -{ - struct posix_acl *acl; - int err; - - if (!(inode->i_sb->s_flags & SB_POSIXACL)) { - ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); - return -EOPNOTSUPP; - } - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EPERM; - - if (!value) { - acl = NULL; - } else { - acl = posix_acl_from_xattr(mnt_userns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - err = posix_acl_valid(mnt_userns, acl); - if (err) - goto release_and_out; - } - } - - err = ntfs_set_acl(mnt_userns, inode, acl, type); - -release_and_out: - ntfs_posix_acl_release(acl); - return err; + return ntfs_set_acl_ex(mnt_userns, inode, acl, type); } /* @@ -698,54 +630,27 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *default_acl, *acl; int err; - /* - * TODO: Refactoring lock. - * ni_lock(dir) ... -> posix_acl_create(dir,...) -> ntfs_get_acl -> ni_lock(dir) - */ - inode->i_default_acl = NULL; - - default_acl = ntfs_get_acl_ex(mnt_userns, dir, ACL_TYPE_DEFAULT, 1); - - if (!default_acl || default_acl == ERR_PTR(-EOPNOTSUPP)) { - inode->i_mode &= ~current_umask(); - err = 0; - goto out; - } - - if (IS_ERR(default_acl)) { - err = PTR_ERR(default_acl); - goto out; - } - - acl = default_acl; - err = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (err < 0) - goto out1; - if (!err) { - posix_acl_release(acl); - acl = NULL; - } + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (err) + return err; - if (!S_ISDIR(inode->i_mode)) { + if (default_acl) { + err = ntfs_set_acl_ex(mnt_userns, inode, default_acl, + ACL_TYPE_DEFAULT); posix_acl_release(default_acl); - default_acl = NULL; + } else { + inode->i_default_acl = NULL; } - if (default_acl) - err = ntfs_set_acl_ex(mnt_userns, inode, default_acl, - ACL_TYPE_DEFAULT, 1); - if (!acl) inode->i_acl = NULL; - else if (!err) - err = ntfs_set_acl_ex(mnt_userns, inode, acl, ACL_TYPE_ACCESS, - 1); - - posix_acl_release(acl); -out1: - posix_acl_release(default_acl); + else { + if (!err) + err = ntfs_set_acl_ex(mnt_userns, inode, acl, + ACL_TYPE_ACCESS); + posix_acl_release(acl); + } -out: return err; } #endif @@ -772,7 +677,7 @@ int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode) int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { - if (ntfs_sb(inode->i_sb)->options.no_acs_rules) { + if (ntfs_sb(inode->i_sb)->options->noacsrules) { /* "No access rules" mode - Allow all changes. */ return 0; } @@ -880,23 +785,6 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, goto out; } -#ifdef CONFIG_NTFS3_FS_POSIX_ACL - if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS, - sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) || - (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, - sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) { - /* TODO: init_user_ns? */ - err = ntfs_xattr_get_acl( - &init_user_ns, inode, - name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 - ? ACL_TYPE_ACCESS - : ACL_TYPE_DEFAULT, - buffer, size); - goto out; - } -#endif /* Deal with NTFS extended attribute. */ err = ntfs_get_ea(inode, name, name_len, buffer, size, NULL); @@ -1009,24 +897,8 @@ set_new_fa: goto out; } -#ifdef CONFIG_NTFS3_FS_POSIX_ACL - if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS, - sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) || - (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, - sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) { - err = ntfs_xattr_set_acl( - mnt_userns, inode, - name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 - ? ACL_TYPE_ACCESS - : ACL_TYPE_DEFAULT, - value, size); - goto out; - } -#endif /* Deal with NTFS extended attribute. */ - err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0); + err = ntfs_set_ea(inode, name, name_len, value, size, flags); out: return err; @@ -1042,28 +914,29 @@ int ntfs_save_wsl_perm(struct inode *inode) int err; __le32 value; + /* TODO: refactor this, so we don't lock 4 times in ntfs_set_ea */ value = cpu_to_le32(i_uid_read(inode)); err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value, - sizeof(value), 0, 0); + sizeof(value), 0); if (err) goto out; value = cpu_to_le32(i_gid_read(inode)); err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value, - sizeof(value), 0, 0); + sizeof(value), 0); if (err) goto out; value = cpu_to_le32(inode->i_mode); err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value, - sizeof(value), 0, 0); + sizeof(value), 0); if (err) goto out; if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { value = cpu_to_le32(inode->i_rdev); err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &value, - sizeof(value), 0, 0); + sizeof(value), 0); if (err) goto out; } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f1cc8258d34a..5d9ae17bd443 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7045,7 +7045,7 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct buffer_head *di_bh) { - int ret, i, has_data, num_pages = 0; + int ret, has_data, num_pages = 0; int need_free = 0; u32 bit_off, num; handle_t *handle; @@ -7054,26 +7054,17 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_alloc_context *data_ac = NULL; - struct page **pages = NULL; - loff_t end = osb->s_clustersize; + struct page *page = NULL; struct ocfs2_extent_tree et; int did_quota = 0; has_data = i_size_read(inode) ? 1 : 0; if (has_data) { - pages = kcalloc(ocfs2_pages_per_cluster(osb->sb), - sizeof(struct page *), GFP_NOFS); - if (pages == NULL) { - ret = -ENOMEM; - mlog_errno(ret); - return ret; - } - ret = ocfs2_reserve_clusters(osb, 1, &data_ac); if (ret) { mlog_errno(ret); - goto free_pages; + goto out; } } @@ -7093,7 +7084,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } if (has_data) { - unsigned int page_end; + unsigned int page_end = min_t(unsigned, PAGE_SIZE, + osb->s_clustersize); u64 phys; ret = dquot_alloc_space_nodirty(inode, @@ -7117,15 +7109,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, */ block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); - /* - * Non sparse file systems zero on extend, so no need - * to do that now. - */ - if (!ocfs2_sparse_alloc(osb) && - PAGE_SIZE < osb->s_clustersize) - end = PAGE_SIZE; - - ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); + ret = ocfs2_grab_eof_pages(inode, 0, page_end, &page, + &num_pages); if (ret) { mlog_errno(ret); need_free = 1; @@ -7136,20 +7121,15 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * This should populate the 1st page for us and mark * it up to date. */ - ret = ocfs2_read_inline_data(inode, pages[0], di_bh); + ret = ocfs2_read_inline_data(inode, page, di_bh); if (ret) { mlog_errno(ret); need_free = 1; goto out_unlock; } - page_end = PAGE_SIZE; - if (PAGE_SIZE > osb->s_clustersize) - page_end = osb->s_clustersize; - - for (i = 0; i < num_pages; i++) - ocfs2_map_and_dirty_page(inode, handle, 0, page_end, - pages[i], i > 0, &phys); + ocfs2_map_and_dirty_page(inode, handle, 0, page_end, page, 0, + &phys); } spin_lock(&oi->ip_lock); @@ -7180,8 +7160,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } out_unlock: - if (pages) - ocfs2_unlock_and_free_pages(pages, num_pages); + if (page) + ocfs2_unlock_and_free_pages(&page, num_pages); out_commit: if (ret < 0 && did_quota) @@ -7205,8 +7185,6 @@ out_commit: out: if (data_ac) ocfs2_free_alloc_context(data_ac); -free_pages: - kfree(pages); return ret; } diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 8521942f5af2..481017e1dac5 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1251,7 +1251,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, { struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct journal_head *jh; - int ret; + int ret = 1; if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) return 0; @@ -1259,14 +1259,18 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, if (!buffer_jbd(bg_bh)) return 1; - jh = bh2jh(bg_bh); - spin_lock(&jh->b_state_lock); - bg = (struct ocfs2_group_desc *) jh->b_committed_data; - if (bg) - ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); - else - ret = 1; - spin_unlock(&jh->b_state_lock); + jbd_lock_bh_journal_head(bg_bh); + if (buffer_jbd(bg_bh)) { + jh = bh2jh(bg_bh); + spin_lock(&jh->b_state_lock); + bg = (struct ocfs2_group_desc *) jh->b_committed_data; + if (bg) + ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); + else + ret = 1; + spin_unlock(&jh->b_state_lock); + } + jbd_unlock_bh_journal_head(bg_bh); return ret; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c86bd4e60e20..5c914ce9b3ac 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2167,11 +2167,17 @@ static int ocfs2_initialize_super(struct super_block *sb, } if (ocfs2_clusterinfo_valid(osb)) { + /* + * ci_stack and ci_cluster in ocfs2_cluster_info may not be null + * terminated, so make sure no overflow happens here by using + * memcpy. Destination strings will always be null terminated + * because osb is allocated using kzalloc. + */ osb->osb_stackflags = OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; - strlcpy(osb->osb_cluster_stack, + memcpy(osb->osb_cluster_stack, OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, - OCFS2_STACK_LABEL_LEN + 1); + OCFS2_STACK_LABEL_LEN); if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { mlog(ML_ERROR, "couldn't mount because of an invalid " @@ -2180,9 +2186,9 @@ static int ocfs2_initialize_super(struct super_block *sb, status = -EINVAL; goto bail; } - strlcpy(osb->osb_cluster_name, + memcpy(osb->osb_cluster_name, OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, - OCFS2_CLUSTER_NAME_LEN + 1); + OCFS2_CLUSTER_NAME_LEN); } else { /* The empty string is identical with classic tools that * don't know about s_cluster_info. */ diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index c1bb4c4b5d67..e5e3e500ed46 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -10,7 +10,7 @@ * Linux VFS inode operations. */ -#include <linux/bvec.h> +#include <linux/blkdev.h> #include <linux/fileattr.h> #include "protocol.h" #include "orangefs-kernel.h" diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 2f2e430461b2..8bb0a53a658b 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -11,6 +11,7 @@ #include <linux/parser.h> #include <linux/hashtable.h> +#include <linux/seq_file.h> /* a cache for orangefs-inode objects (i.e. orangefs inode private data) */ static struct kmem_cache *orangefs_inode_cache; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 1fefb2b8960e..93c7c267de93 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1219,9 +1219,13 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, goto out_dput; } } else { - if (!d_is_negative(newdentry) && - (!new_opaque || !ovl_is_whiteout(newdentry))) - goto out_dput; + if (!d_is_negative(newdentry)) { + if (!new_opaque || !ovl_is_whiteout(newdentry)) + goto out_dput; + } else { + if (flags & RENAME_EXCHANGE) + goto out_dput; + } } if (olddentry == trap) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index d081faa55e83..ac461a499882 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -272,14 +272,14 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) kmem_cache_free(ovl_aio_request_cachep, aio_req); } -static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2) +static void ovl_aio_rw_complete(struct kiocb *iocb, long res) { struct ovl_aio_req *aio_req = container_of(iocb, struct ovl_aio_req, iocb); struct kiocb *orig_iocb = aio_req->orig_iocb; ovl_aio_cleanup_handler(aio_req); - orig_iocb->ki_complete(orig_iocb, res, res2); + orig_iocb->ki_complete(orig_iocb, res); } static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) @@ -296,6 +296,12 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret) return ret; + ret = -EINVAL; + if (iocb->ki_flags & IOCB_DIRECT && + (!real.file->f_mapping->a_ops || + !real.file->f_mapping->a_ops->direct_IO)) + goto out_fdput; + old_cred = ovl_override_creds(file_inode(file)->i_sb); if (is_sync_kiocb(iocb)) { ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, @@ -320,7 +326,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) out: revert_creds(old_cred); ovl_file_accessed(file); - +out_fdput: fdput(real); return ret; @@ -349,6 +355,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret) goto out_unlock; + ret = -EINVAL; + if (iocb->ki_flags & IOCB_DIRECT && + (!real.file->f_mapping->a_ops || + !real.file->f_mapping->a_ops->direct_IO)) + goto out_fdput; + if (!ovl_should_sync(OVL_FS(inode->i_sb))) ifl &= ~(IOCB_DSYNC | IOCB_SYNC); @@ -384,6 +396,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) } out: revert_creds(old_cred); +out_fdput: fdput(real); out_unlock: diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 04ce58c939a0..5d1fbaffd66a 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -205,7 +205,6 @@ static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes, static int __register_pstore_blk(struct pstore_device_info *dev, const char *devpath) { - struct inode *inode; int ret = -ENODEV; lockdep_assert_held(&pstore_blk_lock); @@ -217,14 +216,13 @@ static int __register_pstore_blk(struct pstore_device_info *dev, goto err; } - inode = file_inode(psblk_file); - if (!S_ISBLK(inode->i_mode)) { + if (!S_ISBLK(file_inode(psblk_file)->i_mode)) { pr_err("'%s' is not block device!\n", devpath); goto err_fput; } - inode = I_BDEV(psblk_file->f_mapping->host)->bd_inode; - dev->zone.total_size = i_size_read(inode); + dev->zone.total_size = + bdev_nr_bytes(I_BDEV(psblk_file->f_mapping->host)); ret = __register_pstore_device(dev); if (ret) diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 2bcc9a6f1bfc..052f143e2e0e 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -10,6 +10,7 @@ #include <linux/namei.h> #include <linux/slab.h> #include <asm/current.h> +#include <linux/blkdev.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/security.h> diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 65e7e56005b8..e2302342a67f 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -38,6 +38,7 @@ #include <linux/uaccess.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> +#include <linux/seq_file.h> #include "internal.h" struct ramfs_mount_opts { diff --git a/fs/read_write.c b/fs/read_write.c index af057c57bdc6..0074afa7ecb3 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -368,10 +368,6 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t if (unlikely((ssize_t) count < 0)) return -EINVAL; - /* - * ranged mandatory locking does not apply to streams - it makes sense - * only for files where position has a meaning. - */ if (ppos) { loff_t pos = *ppos; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 58481f8d63d5..076f9ab94306 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1199,9 +1199,7 @@ static int reiserfs_parse_options(struct super_block *s, if (!strcmp(arg, "auto")) { /* From JFS code, to auto-get the size. */ - *blocks = - i_size_read(s->s_bdev->bd_inode) >> s-> - s_blocksize_bits; + *blocks = sb_bdev_nr_blocks(s); } else { *blocks = simple_strtoul(arg, &p, 0); if (*p != '\0') { @@ -1986,9 +1984,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) * smaller than the filesystem. If the check fails then abort and * scream, because bad stuff will happen otherwise. */ - if (s->s_bdev && s->s_bdev->bd_inode - && i_size_read(s->s_bdev->bd_inode) < - sb_block_count(rs) * sb_blocksize(rs)) { + if (bdev_nr_bytes(s->s_bdev) < sb_block_count(rs) * sb_blocksize(rs)) { SWARN(silent, s, "", "Filesystem cannot be " "mounted because it is bigger than the device"); SWARN(silent, s, "", "You may need to run fsck " diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 60d6951915f4..bb44ff4c5cc6 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -16,6 +16,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/blkdev.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> @@ -179,8 +180,8 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) /* Check the filesystem does not extend beyond the end of the block device */ msblk->bytes_used = le64_to_cpu(sblk->bytes_used); - if (msblk->bytes_used < 0 || msblk->bytes_used > - i_size_read(sb->s_bdev->bd_inode)) + if (msblk->bytes_used < 0 || + msblk->bytes_used > bdev_nr_bytes(sb->s_bdev)) goto failed_mount; /* Check block size for sanity */ diff --git a/fs/sync.c b/fs/sync.c index 1373a610dc78..3ce8e2137f31 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -3,6 +3,7 @@ * High-level sync()-related operations */ +#include <linux/blkdev.h> #include <linux/kernel.h> #include <linux/file.h> #include <linux/fs.h> @@ -22,25 +23,6 @@ SYNC_FILE_RANGE_WAIT_AFTER) /* - * Do the filesystem syncing work. For simple filesystems - * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to - * submit IO for these buffers via __sync_blockdev(). This also speeds up the - * wait == 1 case since in that case write_inode() functions do - * sync_dirty_buffer() and thus effectively write one block at a time. - */ -static int __sync_filesystem(struct super_block *sb, int wait) -{ - if (wait) - sync_inodes_sb(sb); - else - writeback_inodes_sb(sb, WB_REASON_SYNC); - - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, wait); - return __sync_blockdev(sb->s_bdev, wait); -} - -/* * Write out and wait upon all dirty data associated with this * superblock. Filesystem data as well as the underlying block * device. Takes the superblock lock. @@ -61,10 +43,25 @@ int sync_filesystem(struct super_block *sb) if (sb_rdonly(sb)) return 0; - ret = __sync_filesystem(sb, 0); + /* + * Do the filesystem syncing work. For simple filesystems + * writeback_inodes_sb(sb) just dirties buffers with inodes so we have + * to submit I/O for these buffers via sync_blockdev(). This also + * speeds up the wait == 1 case since in that case write_inode() + * methods call sync_dirty_buffer() and thus effectively write one block + * at a time. + */ + writeback_inodes_sb(sb, WB_REASON_SYNC); + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 0); + ret = sync_blockdev_nowait(sb->s_bdev); if (ret < 0) return ret; - return __sync_filesystem(sb, 1); + + sync_inodes_sb(sb); + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + return sync_blockdev(sb->s_bdev); } EXPORT_SYMBOL(sync_filesystem); @@ -81,21 +78,6 @@ static void sync_fs_one_sb(struct super_block *sb, void *arg) sb->s_op->sync_fs(sb, *(int *)arg); } -static void fdatawrite_one_bdev(struct block_device *bdev, void *arg) -{ - filemap_fdatawrite(bdev->bd_inode->i_mapping); -} - -static void fdatawait_one_bdev(struct block_device *bdev, void *arg) -{ - /* - * We keep the error status of individual mapping so that - * applications can catch the writeback error using fsync(2). - * See filemap_fdatawait_keep_errors() for details. - */ - filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping); -} - /* * Sync everything. We start by waking flusher threads so that most of * writeback runs on all devices in parallel. Then we sync all inodes reliably @@ -114,8 +96,8 @@ void ksys_sync(void) iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &wait); - iterate_bdevs(fdatawrite_one_bdev, NULL); - iterate_bdevs(fdatawait_one_bdev, NULL); + sync_bdevs(false); + sync_bdevs(true); if (unlikely(laptop_mode)) laptop_sync_completion(); } @@ -136,10 +118,10 @@ static void do_sync_work(struct work_struct *work) */ iterate_supers(sync_inodes_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &nowait); - iterate_bdevs(fdatawrite_one_bdev, NULL); + sync_bdevs(false); iterate_supers(sync_inodes_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &nowait); - iterate_bdevs(fdatawrite_one_bdev, NULL); + sync_bdevs(false); printk("Emergency Sync complete\n"); kfree(work); } diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 22be7aeb96c4..c57b46a352d8 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -82,5 +82,4 @@ const struct fscrypt_operations ubifs_crypt_operations = { .get_context = ubifs_crypt_get_context, .set_context = ubifs_crypt_set_context, .empty_dir = ubifs_crypt_empty_dir, - .max_namelen = UBIFS_MAX_NLEN, }; diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index f1094cdcd6cd..46d697172197 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -47,8 +47,7 @@ unsigned int udf_get_last_session(struct super_block *sb) unsigned long udf_get_last_block(struct super_block *sb) { - struct block_device *bdev = sb->s_bdev; - struct cdrom_device_info *cdi = disk_to_cdi(bdev->bd_disk); + struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); unsigned long lblock = 0; /* @@ -56,7 +55,7 @@ unsigned long udf_get_last_block(struct super_block *sb) * Try using the device size... */ if (!cdi || cdrom_get_last_written(cdi, &lblock) || lblock == 0) - lblock = i_size_read(bdev->bd_inode) >> sb->s_blocksize_bits; + lblock = sb_bdev_nr_blocks(sb); if (lblock) return lblock - 1; diff --git a/fs/udf/super.c b/fs/udf/super.c index b2d7c57d0688..34247fba6df9 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1175,8 +1175,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) struct udf_inode_info *vati; uint32_t pos; struct virtualAllocationTable20 *vat20; - sector_t blocks = i_size_read(sb->s_bdev->bd_inode) >> - sb->s_blocksize_bits; + sector_t blocks = sb_bdev_nr_blocks(sb); udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block); if (!sbi->s_vat_inode && @@ -1838,8 +1837,7 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block, int ret; if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && - udf_fixed_to_variable(block) >= - i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits) + udf_fixed_to_variable(block) >= sb_bdev_nr_blocks(sb)) return -EAGAIN; bh = udf_read_tagged(sb, block, block, &ident); @@ -1901,8 +1899,7 @@ static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock, last[last_count++] = *lastblock - 152; for (i = 0; i < last_count; i++) { - if (last[i] >= i_size_read(sb->s_bdev->bd_inode) >> - sb->s_blocksize_bits) + if (last[i] >= sb_bdev_nr_blocks(sb)) continue; ret = udf_check_anchor_block(sb, last[i], fileset); if (ret != -EAGAIN) { diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 003f0d31743e..22bf14ab2d16 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1827,9 +1827,15 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, if (mode_wp && mode_dontwake) return -EINVAL; - ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, - uffdio_wp.range.len, mode_wp, - &ctx->mmap_changing); + if (mmget_not_zero(ctx->mm)) { + ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, + uffdio_wp.range.len, mode_wp, + &ctx->mmap_changing); + mmput(ctx->mm); + } else { + return -ESRCH; + } + if (ret) return ret; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7aa943edfc02..62e7fbe4e54c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1452,7 +1452,7 @@ const struct file_operations xfs_file_operations = { .write_iter = xfs_file_write_iter, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, - .iopoll = iomap_dio_iopoll, + .iopoll = iocb_bio_iopoll, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index ddc346a9df9b..3ce5f47338cb 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1128,7 +1128,7 @@ static const struct file_operations zonefs_file_operations = { .write_iter = zonefs_file_write_iter, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, - .iopoll = iomap_dio_iopoll, + .iopoll = iocb_bio_iopoll, }; static struct kmem_cache *zonefs_inode_cachep; |