Diffstat (limited to 'fs')
125 files changed, 2756 insertions, 1909 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 79df61fe0e59..baf2b152229e 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -152,7 +152,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, const unsigned char **wnames, *uname; int i, n, l, clone, access; struct v9fs_session_info *v9ses; - struct p9_fid *fid, *old_fid = NULL; + struct p9_fid *fid, *old_fid; v9ses = v9fs_dentry2v9ses(dentry); access = v9ses->flags & V9FS_ACCESS_MASK; @@ -194,13 +194,12 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, if (IS_ERR(fid)) return fid; + refcount_inc(&fid->count); v9fs_fid_add(dentry->d_sb->s_root, fid); } /* If we are root ourself just return that */ - if (dentry->d_sb->s_root == dentry) { - refcount_inc(&fid->count); + if (dentry->d_sb->s_root == dentry) return fid; - } /* * Do a multipath walk with attached root. * When walking parent we need to make sure we @@ -212,6 +211,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, fid = ERR_PTR(n); goto err_out; } + old_fid = fid; clone = 1; i = 0; while (i < n) { @@ -221,19 +221,15 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, * walk to ensure none of the patch component change */ fid = p9_client_walk(fid, l, &wnames[i], clone); + /* non-cloning walk will return the same fid */ + if (fid != old_fid) { + p9_client_clunk(old_fid); + old_fid = fid; + } if (IS_ERR(fid)) { - if (old_fid) { - /* - * If we fail, clunk fid which are mapping - * to path component and not the last component - * of the path. - */ - p9_client_clunk(old_fid); - } kfree(wnames); goto err_out; } - old_fid = fid; i += l; clone = 0; } diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index a8f512b44a85..d0833fa69faf 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -58,8 +58,21 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) */ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) { + struct inode *inode = file_inode(file); + struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_fid *fid = file->private_data; + BUG_ON(!fid); + + /* we might need to read from a fid that was opened write-only + * for read-modify-write of page cache, use the writeback fid + * for that */ + if (rreq->origin == NETFS_READ_FOR_WRITE && + (fid->mode & O_ACCMODE) == O_WRONLY) { + fid = v9inode->writeback_fid; + BUG_ON(!fid); + } + refcount_inc(&fid->count); rreq->netfs_priv = fid; return 0; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 419d2f3cf2c2..3d8297714772 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1251,15 +1251,15 @@ static const char *v9fs_vfs_get_link(struct dentry *dentry, return ERR_PTR(-ECHILD); v9ses = v9fs_dentry2v9ses(dentry); - fid = v9fs_fid_lookup(dentry); + if (!v9fs_proto_dotu(v9ses)) + return ERR_PTR(-EBADF); + p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); + fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return ERR_CAST(fid); - if (!v9fs_proto_dotu(v9ses)) - return ERR_PTR(-EBADF); - st = p9_client_stat(fid); p9_client_clunk(fid); if (IS_ERR(st)) diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index d17502a738a9..b6eb1160296c 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -274,6 +274,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (IS_ERR(ofid)) { err = PTR_ERR(ofid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + p9_client_clunk(dfid); goto out; } @@ -285,6 +286,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (err) { 
p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n", err); + p9_client_clunk(dfid); goto error; } err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), @@ -292,6 +294,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", err); + p9_client_clunk(dfid); goto error; } v9fs_invalidate_inode_attr(dir); diff --git a/fs/afs/file.c b/fs/afs/file.c index 42118a4f3383..d1cfb235c4b9 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -375,7 +375,7 @@ static int afs_begin_cache_operation(struct netfs_io_request *rreq) } static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, - struct folio *folio, void **_fsdata) + struct folio **foliop, void **_fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 89630acbc2cc..64dab70d4a4f 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -745,7 +745,8 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); - if (!(query_flags & AT_STATX_DONT_SYNC) && + if (vnode->volume && + !(query_flags & AT_STATX_DONT_SYNC) && !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { key = afs_request_key(vnode->volume->cell); if (IS_ERR(key)) diff --git a/fs/attr.c b/fs/attr.c index dbe996b0dedf..b5b8835ddf15 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -22,7 +22,7 @@ * chown_ok - verify permissions to chown inode * @mnt_userns: user namespace of the mount @inode was found from * @inode: inode to check permissions on - * @uid: uid to chown @inode to + * @ia_vfsuid: uid to chown @inode to * * If the inode has been found through an idmapped mount the user namespace of * the vfsmount must be passed through @mnt_userns. This function will then @@ -31,15 +31,15 @@ * performed on the raw inode simply passs init_user_ns. */ static bool chown_ok(struct user_namespace *mnt_userns, - const struct inode *inode, - kuid_t uid) + const struct inode *inode, vfsuid_t ia_vfsuid) { - kuid_t kuid = i_uid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, inode->i_uid)) + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid()) && + vfsuid_eq(ia_vfsuid, vfsuid)) return true; if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; - if (uid_eq(kuid, INVALID_UID) && + if (!vfsuid_valid(vfsuid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; @@ -49,7 +49,7 @@ static bool chown_ok(struct user_namespace *mnt_userns, * chgrp_ok - verify permissions to chgrp inode * @mnt_userns: user namespace of the mount @inode was found from * @inode: inode to check permissions on - * @gid: gid to chown @inode to + * @ia_vfsgid: gid to chown @inode to * * If the inode has been found through an idmapped mount the user namespace of * the vfsmount must be passed through @mnt_userns. This function will then @@ -58,21 +58,19 @@ static bool chown_ok(struct user_namespace *mnt_userns, * performed on the raw inode simply passs init_user_ns. 
*/ static bool chgrp_ok(struct user_namespace *mnt_userns, - const struct inode *inode, kgid_t gid) + const struct inode *inode, vfsgid_t ia_vfsgid) { - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) { - kgid_t mapped_gid; - - if (gid_eq(gid, inode->i_gid)) + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) { + if (vfsgid_eq(ia_vfsgid, vfsgid)) return true; - mapped_gid = mapped_kgid_fs(mnt_userns, i_user_ns(inode), gid); - if (in_group_p(mapped_gid)) + if (vfsgid_in_group_p(ia_vfsgid)) return true; } if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; - if (gid_eq(kgid, INVALID_GID) && + if (!vfsgid_valid(vfsgid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; @@ -120,28 +118,29 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, goto kill_priv; /* Make sure a caller can chown. */ - if ((ia_valid & ATTR_UID) && !chown_ok(mnt_userns, inode, attr->ia_uid)) + if ((ia_valid & ATTR_UID) && + !chown_ok(mnt_userns, inode, attr->ia_vfsuid)) return -EPERM; /* Make sure caller can chgrp. */ - if ((ia_valid & ATTR_GID) && !chgrp_ok(mnt_userns, inode, attr->ia_gid)) + if ((ia_valid & ATTR_GID) && + !chgrp_ok(mnt_userns, inode, attr->ia_vfsgid)) return -EPERM; /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { - kgid_t mapped_gid; + vfsgid_t vfsgid; if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; if (ia_valid & ATTR_GID) - mapped_gid = mapped_kgid_fs(mnt_userns, - i_user_ns(inode), attr->ia_gid); + vfsgid = attr->ia_vfsgid; else - mapped_gid = i_gid_into_mnt(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); /* Also check the setgid bit! */ - if (!in_group_p(mapped_gid) && + if (!vfsgid_in_group_p(vfsgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } @@ -219,9 +218,7 @@ EXPORT_SYMBOL(inode_newsize_ok); * setattr_copy must be called with i_mutex held. * * setattr_copy updates the inode's metadata with that specified - * in attr on idmapped mounts. If file ownership is changed setattr_copy - * doesn't map ia_uid and ia_gid. It will asssume the caller has already - * provided the intended values. Necessary permission checks to determine + * in attr on idmapped mounts. Necessary permission checks to determine * whether or not the S_ISGID property needs to be removed are performed with * the correct idmapped mount permission helpers. 
* Noticeably missing is inode size update, which is more complex @@ -242,10 +239,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, { unsigned int ia_valid = attr->ia_valid; - if (ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) @@ -254,8 +249,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (!in_group_p(kgid) && + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + if (!vfsgid_in_group_p(vfsgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; @@ -306,9 +301,6 @@ EXPORT_SYMBOL(may_setattr); * retry. Because breaking a delegation may take a long time, the * caller should drop the i_mutex before doing so. * - * If file ownership is changed notify_change() doesn't map ia_uid and - * ia_gid. It will asssume the caller has already provided the intended values. - * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. Also, passing NULL is fine for callers holding @@ -397,23 +389,25 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, * namespace of the superblock. */ if (ia_valid & ATTR_UID && - !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid)) + !vfsuid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns, + attr->ia_vfsuid)) return -EOVERFLOW; if (ia_valid & ATTR_GID && - !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid)) + !vfsgid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns, + attr->ia_vfsgid)) return -EOVERFLOW; /* Don't allow modifications of files with invalid uids or * gids unless those uids & gids are being made valid. 
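The fs/attr.c hunks above route every ownership check through vfsuid_t/vfsgid_t values, that is, ids as seen through the mount's idmapping, instead of comparing raw kuid_t/kgid_t. A minimal standalone model of that idea follows; it is not the kernel API, and struct idmap, uid_into_vfsuid() and the single-range mapping are invented for illustration (it also glosses over the kernel's distinct kuid-vs-vfsuid comparison helpers by expressing fsuid directly in the mount's view):

/* Standalone model of the vfsuid comparison in chown_ok(); hypothetical names. */
#include <stdbool.h>
#include <stdio.h>

typedef unsigned int kuid_t;   /* uid in the filesystem's namespace */
typedef unsigned int vfsuid_t; /* uid as seen through the mount's idmapping */

#define INVALID_VFSUID ((vfsuid_t)-1)

struct idmap { kuid_t first; vfsuid_t mapped_first; unsigned int count; };

/* Translate an on-disk uid through the mount's idmapping, in the spirit of
 * i_uid_into_vfsuid(); unmapped uids come back invalid. */
static vfsuid_t uid_into_vfsuid(const struct idmap *map, kuid_t uid)
{
	if (uid >= map->first && uid < map->first + map->count)
		return map->mapped_first + (uid - map->first);
	return INVALID_VFSUID;
}

/* Owner may "chown" to the same (mapped) owner; the CAP_CHOWN escape
 * hatches from the real code are omitted in this model. */
static bool chown_ok(const struct idmap *map, kuid_t inode_uid,
		     vfsuid_t ia_vfsuid, vfsuid_t fsuid)
{
	vfsuid_t vfsuid = uid_into_vfsuid(map, inode_uid);

	return vfsuid != INVALID_VFSUID && vfsuid == fsuid &&
	       ia_vfsuid == vfsuid;
}

int main(void)
{
	struct idmap map = { .first = 1000, .mapped_first = 0, .count = 1 };

	printf("%d\n", chown_ok(&map, 1000, 0, 0)); /* 1: uid 1000 appears as 0 */
	printf("%d\n", chown_ok(&map, 1001, 0, 0)); /* 0: uid 1001 is unmapped */
	return 0;
}

The point of the type split is that an id seen through the mount can never be compared against an on-disk id by accident; the kernel enforces that with distinct types and helpers, which this model mirrors only loosely.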
*/ if (!(ia_valid & ATTR_UID) && - !uid_valid(i_uid_into_mnt(mnt_userns, inode))) + !vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode))) return -EOVERFLOW; if (!(ia_valid & ATTR_GID) && - !gid_valid(i_gid_into_mnt(mnt_userns, inode))) + !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) return -EOVERFLOW; - error = security_inode_setattr(dentry, attr); + error = security_inode_setattr(mnt_userns, dentry, attr); if (error) return error; error = try_break_deleg(inode, delegated_inode); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 3ac668ace50a..35e0e860cc0b 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -104,6 +104,7 @@ struct btrfs_block_group { unsigned int relocating_repair:1; unsigned int chunk_item_inserted:1; unsigned int zone_is_active:1; + unsigned int zoned_data_reloc_ongoing:1; int disk_cache_state; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0e49b1a0c071..9c21e214d29e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -675,9 +675,8 @@ struct btrfs_fs_info { rwlock_t global_root_lock; struct rb_root global_root_tree; - /* The xarray that holds all the FS roots */ - spinlock_t fs_roots_lock; - struct xarray fs_roots; + spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; /* block group cache stuff */ rwlock_t block_group_cache_lock; @@ -995,10 +994,10 @@ struct btrfs_fs_info { struct btrfs_delayed_root *delayed_root; - /* Extent buffer xarray */ + /* Extent buffer radix tree */ spinlock_t buffer_lock; /* Entries are eb->start / sectorsize */ - struct xarray extent_buffers; + struct radix_tree_root buffer_radix; /* next backup root to be overwritten */ int backup_root_index; @@ -1119,8 +1118,7 @@ enum { */ BTRFS_ROOT_SHAREABLE, BTRFS_ROOT_TRACK_DIRTY, - /* The root is tracked in fs_info::fs_roots */ - BTRFS_ROOT_REGISTERED, + BTRFS_ROOT_IN_RADIX, BTRFS_ROOT_ORPHAN_ITEM_INSERTED, BTRFS_ROOT_DEFRAG_RUNNING, BTRFS_ROOT_FORCE_COW, @@ -1224,10 +1222,10 @@ struct btrfs_root { struct rb_root inode_tree; /* - * Xarray that keeps track of delayed nodes of every inode, protected - * by inode_lock + * radix tree that keeps track of delayed nodes of every inode, + * protected by inode_lock */ - struct xarray delayed_nodes; + struct radix_tree_root delayed_nodes_tree; /* * right now this just gets used so that a root has its own devid * for stat. It may be used for more later @@ -1330,6 +1328,8 @@ struct btrfs_replace_extent_info { * existing extent into a file range. */ bool is_new_extent; + /* Indicate if we should update the inode's mtime and ctime. */ + bool update_times; /* Meaningful only if is_new_extent is true. */ int qgroup_reserved; /* diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 66779ab3ed4a..748bf6b0d860 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -78,7 +78,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( } spin_lock(&root->inode_lock); - node = xa_load(&root->delayed_nodes, ino); + node = radix_tree_lookup(&root->delayed_nodes_tree, ino); if (node) { if (btrfs_inode->delayed_node) { @@ -90,9 +90,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( /* * It's possible that we're racing into the middle of removing - * this node from the xarray. In this case, the refcount + * this node from the radix tree. In this case, the refcount * was zero and it should never go back to one. 
Just return - * NULL like it was never in the xarray at all; our release + * NULL like it was never in the radix at all; our release * function is in the process of removing it. * * Some implementations of refcount_inc refuse to bump the @@ -100,7 +100,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( * here, refcount_inc() may decide to just WARN_ONCE() instead * of actually bumping the refcount. * - * If this node is properly in the xarray, we want to bump the + * If this node is properly in the radix, we want to bump the * refcount twice, once for the inode and once for this get * operation. */ @@ -128,30 +128,36 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( u64 ino = btrfs_ino(btrfs_inode); int ret; - do { - node = btrfs_get_delayed_node(btrfs_inode); - if (node) - return node; +again: + node = btrfs_get_delayed_node(btrfs_inode); + if (node) + return node; - node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); - if (!node) - return ERR_PTR(-ENOMEM); - btrfs_init_delayed_node(node, root, ino); + node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); + if (!node) + return ERR_PTR(-ENOMEM); + btrfs_init_delayed_node(node, root, ino); - /* Cached in the inode and can be accessed */ - refcount_set(&node->refs, 2); + /* cached in the btrfs inode and can be accessed */ + refcount_set(&node->refs, 2); - spin_lock(&root->inode_lock); - ret = xa_insert(&root->delayed_nodes, ino, node, GFP_NOFS); - if (ret) { - spin_unlock(&root->inode_lock); - kmem_cache_free(delayed_node_cache, node); - if (ret != -EBUSY) - return ERR_PTR(ret); - } - } while (ret); + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + kmem_cache_free(delayed_node_cache, node); + return ERR_PTR(ret); + } + + spin_lock(&root->inode_lock); + ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); + if (ret == -EEXIST) { + spin_unlock(&root->inode_lock); + kmem_cache_free(delayed_node_cache, node); + radix_tree_preload_end(); + goto again; + } btrfs_inode->delayed_node = node; spin_unlock(&root->inode_lock); + radix_tree_preload_end(); return node; } @@ -270,7 +276,8 @@ static void __btrfs_release_delayed_node( * back up. We can delete it now. 
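The btrfs_get_or_create_delayed_node() hunk above restores the classic preload/insert/retry shape: allocate the node without holding the lock, try to insert it, and if another task won the race (radix_tree_insert() returning -EEXIST), free our copy and retry the lookup. A compressed userspace sketch of the same race handling, with a direct-indexed table standing in for the radix tree and all names hypothetical:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NNODES 256
struct node { unsigned long ino; int refs; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *table[NNODES]; /* stands in for root->delayed_nodes_tree */

/* Assumes ino < NNODES; a real tree would not have that restriction. */
static struct node *get_or_create(unsigned long ino)
{
	struct node *n;

again:
	pthread_mutex_lock(&lock);
	n = table[ino];
	if (n) {
		n->refs++;	/* found: take a reference and return */
		pthread_mutex_unlock(&lock);
		return n;
	}
	pthread_mutex_unlock(&lock);

	/* Allocate outside the lock, as the kernel code allocates (and
	 * preloads) before taking inode_lock. */
	n = calloc(1, sizeof(*n));
	if (!n)
		return NULL;
	n->ino = ino;
	n->refs = 2;		/* one for the cache, one for the caller */

	pthread_mutex_lock(&lock);
	if (table[ino]) {	/* lost the race: the -EEXIST case above */
		pthread_mutex_unlock(&lock);
		free(n);
		goto again;
	}
	table[ino] = n;
	pthread_mutex_unlock(&lock);
	return n;
}

int main(void)
{
	struct node *a = get_or_create(42);
	struct node *b = get_or_create(42);

	printf("%d %d\n", a == b, a ? a->refs : -1); /* prints: 1 3 */
	return 0;
}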
*/ ASSERT(refcount_read(&delayed_node->refs) == 0); - xa_erase(&root->delayed_nodes, delayed_node->inode_id); + radix_tree_delete(&root->delayed_nodes_tree, + delayed_node->inode_id); spin_unlock(&root->inode_lock); kmem_cache_free(delayed_node_cache, delayed_node); } @@ -1863,35 +1870,34 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) { - unsigned long index = 0; - struct btrfs_delayed_node *delayed_node; + u64 inode_id = 0; struct btrfs_delayed_node *delayed_nodes[8]; + int i, n; while (1) { - int n = 0; - spin_lock(&root->inode_lock); - if (xa_empty(&root->delayed_nodes)) { + n = radix_tree_gang_lookup(&root->delayed_nodes_tree, + (void **)delayed_nodes, inode_id, + ARRAY_SIZE(delayed_nodes)); + if (!n) { spin_unlock(&root->inode_lock); - return; + break; } - xa_for_each_start(&root->delayed_nodes, index, delayed_node, index) { + inode_id = delayed_nodes[n - 1]->inode_id + 1; + for (i = 0; i < n; i++) { /* * Don't increase refs in case the node is dead and * about to be removed from the tree in the loop below */ - if (refcount_inc_not_zero(&delayed_node->refs)) { - delayed_nodes[n] = delayed_node; - n++; - } - if (n >= ARRAY_SIZE(delayed_nodes)) - break; + if (!refcount_inc_not_zero(&delayed_nodes[i]->refs)) + delayed_nodes[i] = NULL; } - index++; spin_unlock(&root->inode_lock); - for (int i = 0; i < n; i++) { + for (i = 0; i < n; i++) { + if (!delayed_nodes[i]) + continue; __btrfs_kill_delayed_node(delayed_nodes[i]); btrfs_release_delayed_node(delayed_nodes[i]); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 89e94ea2fef5..de440ebf5648 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -5,6 +5,7 @@ #include <linux/fs.h> #include <linux/blkdev.h> +#include <linux/radix-tree.h> #include <linux/writeback.h> #include <linux/workqueue.h> #include <linux/kthread.h> @@ -485,7 +486,7 @@ static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info, uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur, fs_info->nodesize); - /* A dirty eb shouldn't disappear from extent_buffers */ + /* A dirty eb shouldn't disappear from buffer_radix */ if (WARN_ON(!eb)) return -EUCLEAN; @@ -1158,7 +1159,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; - xa_init_flags(&root->delayed_nodes, GFP_ATOMIC); + INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); btrfs_init_root_block_rsv(root); @@ -1210,9 +1211,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&root->leak_list); - spin_lock(&fs_info->fs_roots_lock); + spin_lock(&fs_info->fs_roots_radix_lock); list_add_tail(&root->leak_list, &fs_info->allocated_roots); - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); #endif } @@ -1659,11 +1660,12 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, { struct btrfs_root *root; - spin_lock(&fs_info->fs_roots_lock); - root = xa_load(&fs_info->fs_roots, (unsigned long)root_id); + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)root_id); if (root) root = btrfs_grab_root(root); - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); return root; } @@ -1705,14 +1707,20 @@ int btrfs_insert_fs_root(struct 
btrfs_fs_info *fs_info, { int ret; - spin_lock(&fs_info->fs_roots_lock); - ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid, - root, GFP_NOFS); + ret = radix_tree_preload(GFP_NOFS); + if (ret) + return ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); if (ret == 0) { btrfs_grab_root(root); - set_bit(BTRFS_ROOT_REGISTERED, &root->state); + set_bit(BTRFS_ROOT_IN_RADIX, &root->state); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); + radix_tree_preload_end(); return ret; } @@ -2342,9 +2350,9 @@ void btrfs_put_root(struct btrfs_root *root) btrfs_drew_lock_destroy(&root->snapshot_lock); free_root_extent_buffers(root); #ifdef CONFIG_BTRFS_DEBUG - spin_lock(&root->fs_info->fs_roots_lock); + spin_lock(&root->fs_info->fs_roots_radix_lock); list_del_init(&root->leak_list); - spin_unlock(&root->fs_info->fs_roots_lock); + spin_unlock(&root->fs_info->fs_roots_radix_lock); #endif kfree(root); } @@ -2352,21 +2360,28 @@ void btrfs_put_root(struct btrfs_root *root) void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { - struct btrfs_root *root; - unsigned long index = 0; + int ret; + struct btrfs_root *gang[8]; + int i; while (!list_empty(&fs_info->dead_roots)) { - root = list_entry(fs_info->dead_roots.next, - struct btrfs_root, root_list); - list_del(&root->root_list); + gang[0] = list_entry(fs_info->dead_roots.next, + struct btrfs_root, root_list); + list_del(&gang[0]->root_list); - if (test_bit(BTRFS_ROOT_REGISTERED, &root->state)) - btrfs_drop_and_free_fs_root(fs_info, root); - btrfs_put_root(root); + if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) + btrfs_drop_and_free_fs_root(fs_info, gang[0]); + btrfs_put_root(gang[0]); } - xa_for_each(&fs_info->fs_roots, index, root) { - btrfs_drop_and_free_fs_root(fs_info, root); + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) + btrfs_drop_and_free_fs_root(fs_info, gang[i]); } } @@ -3134,8 +3149,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { - xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC); - xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); @@ -3143,7 +3158,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); - spin_lock_init(&fs_info->fs_roots_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->super_lock); @@ -3374,7 +3389,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) /* * btrfs_find_orphan_roots() is responsible for finding all the dead * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load - * them into the fs_info->fs_roots. This must be done before + * them into the fs_info->fs_roots_radix tree. This must be done before * calling btrfs_orphan_cleanup() on the tree root. 
If we don't do it * first, then btrfs_orphan_cleanup() will delete a dead root's orphan * item before the root's tree is deleted - this means that if we unmount @@ -4499,11 +4514,12 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, { bool drop_ref = false; - spin_lock(&fs_info->fs_roots_lock); - xa_erase(&fs_info->fs_roots, (unsigned long)root->root_key.objectid); - if (test_and_clear_bit(BTRFS_ROOT_REGISTERED, &root->state)) + spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_delete(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid); + if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) drop_ref = true; - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); if (BTRFS_FS_ERROR(fs_info)) { ASSERT(root->log_root == NULL); @@ -4519,48 +4535,50 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { - struct btrfs_root *roots[8]; - unsigned long index = 0; - int i; + u64 root_objectid = 0; + struct btrfs_root *gang[8]; + int i = 0; int err = 0; - int grabbed; + unsigned int ret = 0; while (1) { - struct btrfs_root *root; - - spin_lock(&fs_info->fs_roots_lock); - if (!xa_find(&fs_info->fs_roots, &index, ULONG_MAX, XA_PRESENT)) { - spin_unlock(&fs_info->fs_roots_lock); - return err; + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang)); + if (!ret) { + spin_unlock(&fs_info->fs_roots_radix_lock); + break; } + root_objectid = gang[ret - 1]->root_key.objectid + 1; - grabbed = 0; - xa_for_each_start(&fs_info->fs_roots, index, root, index) { - /* Avoid grabbing roots in dead_roots */ - if (btrfs_root_refs(&root->root_item) > 0) - roots[grabbed++] = btrfs_grab_root(root); - if (grabbed >= ARRAY_SIZE(roots)) - break; + for (i = 0; i < ret; i++) { + /* Avoid to grab roots in dead_roots */ + if (btrfs_root_refs(&gang[i]->root_item) == 0) { + gang[i] = NULL; + continue; + } + /* grab all the search result for later use */ + gang[i] = btrfs_grab_root(gang[i]); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); - for (i = 0; i < grabbed; i++) { - if (!roots[i]) + for (i = 0; i < ret; i++) { + if (!gang[i]) continue; - index = roots[i]->root_key.objectid; - err = btrfs_orphan_cleanup(roots[i]); + root_objectid = gang[i]->root_key.objectid; + err = btrfs_orphan_cleanup(gang[i]); if (err) - goto out; - btrfs_put_root(roots[i]); + break; + btrfs_put_root(gang[i]); } - index++; + root_objectid++; } -out: - /* Release the roots that remain uncleaned due to error */ - for (; i < grabbed; i++) { - if (roots[i]) - btrfs_put_root(roots[i]); + /* release the uncleaned roots due to error */ + for (; i < ret; i++) { + if (gang[i]) + btrfs_put_root(gang[i]); } return err; } @@ -4632,6 +4650,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) int ret; set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); + + /* + * We may have the reclaim task running and relocating a data block group, + * in which case it may create delayed iputs. So stop it before we park + * the cleaner kthread otherwise we can get new delayed iputs after + * parking the cleaner, and that can make the async reclaim task to hang + * if it's waiting for delayed iputs to complete, since the cleaner is + * parked and can not run delayed iputs - this will make us hang when + * trying to stop the async reclaim task. 
+ */ + cancel_work_sync(&fs_info->reclaim_bgs_work); /* * We don't want the cleaner to start new transactions, add more delayed * iputs, etc. while we're closing. We can't use kthread_stop() yet @@ -4672,8 +4701,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); - cancel_work_sync(&fs_info->reclaim_bgs_work); - /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4870,28 +4897,31 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info) { - unsigned long index = 0; - int grabbed = 0; - struct btrfs_root *roots[8]; + struct btrfs_root *gang[8]; + u64 root_objectid = 0; + int ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang))) != 0) { + int i; - spin_lock(&fs_info->fs_roots_lock); - while ((grabbed = xa_extract(&fs_info->fs_roots, (void **)roots, index, - ULONG_MAX, 8, XA_PRESENT))) { - for (int i = 0; i < grabbed; i++) - roots[i] = btrfs_grab_root(roots[i]); - spin_unlock(&fs_info->fs_roots_lock); + for (i = 0; i < ret; i++) + gang[i] = btrfs_grab_root(gang[i]); + spin_unlock(&fs_info->fs_roots_radix_lock); - for (int i = 0; i < grabbed; i++) { - if (!roots[i]) + for (i = 0; i < ret; i++) { + if (!gang[i]) continue; - index = roots[i]->root_key.objectid; - btrfs_free_log(NULL, roots[i]); - btrfs_put_root(roots[i]); + root_objectid = gang[i]->root_key.objectid; + btrfs_free_log(NULL, gang[i]); + btrfs_put_root(gang[i]); } - index++; - spin_lock(&fs_info->fs_roots_lock); + root_objectid++; + spin_lock(&fs_info->fs_roots_radix_lock); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log_root_tree(NULL, fs_info); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0867c5cd6e01..a3afc15430ce 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3832,7 +3832,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, block_group->start == fs_info->data_reloc_bg || fs_info->data_reloc_bg == 0); - if (block_group->ro) { + if (block_group->ro || block_group->zoned_data_reloc_ongoing) { ret = 1; goto out; } @@ -3894,8 +3894,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; - if (ret && ffe_ctl->for_data_reloc) + if (ret && ffe_ctl->for_data_reloc && + fs_info->data_reloc_bg == block_group->start) { + /* + * Do not allow further allocations from this block group. + * Compared to increasing the ->ro, setting the + * ->zoned_data_reloc_ongoing flag still allows nocow + * writers to come in. See btrfs_inc_nocow_writers(). + * + * We need to disable an allocation to avoid an allocation of + * regular (non-relocation data) extent. With mix of relocation + * extents and regular extents, we can dispatch WRITE commands + * (for relocation extents) and ZONE APPEND commands (for + * regular extents) at the same time to the same zone, which + * easily break the write pointer. 
+ */ + block_group->zoned_data_reloc_ongoing = 1; fs_info->data_reloc_bg = 0; + } spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); @@ -5813,7 +5829,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) btrfs_qgroup_convert_reserved_meta(root, INT_MAX); btrfs_qgroup_free_meta_all_pertrans(root); - if (test_bit(BTRFS_ROOT_REGISTERED, &root->state)) + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) btrfs_add_dropped_root(trans, root); else btrfs_put_root(root); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8f6b544ae616..f03ab5dbda7a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2966,7 +2966,7 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) } /* - * Find extent buffer for a given bytenr. + * Find extent buffer for a givne bytenr. * * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking * in endio context. @@ -2985,9 +2985,11 @@ static struct extent_buffer *find_extent_buffer_readpage( return (struct extent_buffer *)page->private; } - /* For subpage case, we need to lookup extent buffer xarray */ - eb = xa_load(&fs_info->extent_buffers, - bytenr >> fs_info->sectorsize_bits); + /* For subpage case, we need to lookup buffer radix tree */ + rcu_read_lock(); + eb = radix_tree_lookup(&fs_info->buffer_radix, + bytenr >> fs_info->sectorsize_bits); + rcu_read_unlock(); ASSERT(eb); return eb; } @@ -4435,8 +4437,8 @@ static struct extent_buffer *find_extent_buffer_nolock( struct extent_buffer *eb; rcu_read_lock(); - eb = xa_load(&fs_info->extent_buffers, - start >> fs_info->sectorsize_bits); + eb = radix_tree_lookup(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); return eb; @@ -5241,13 +5243,14 @@ int extent_writepages(struct address_space *mapping, */ btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); ret = extent_write_cache_pages(mapping, wbc, &epd); - btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); ASSERT(ret <= 0); if (ret < 0) { + btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); end_write_bio(&epd, ret); return ret; } flush_write_bio(&epd); + btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); return ret; } @@ -6128,22 +6131,24 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, if (!eb) return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; - - do { - ret = xa_insert(&fs_info->extent_buffers, - start >> fs_info->sectorsize_bits, - eb, GFP_NOFS); - if (ret == -ENOMEM) { - exists = ERR_PTR(ret); +again: + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + exists = ERR_PTR(ret); + goto free_eb; + } + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) goto free_eb; - } - if (ret == -EBUSY) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; - } - } while (ret); - + else + goto again; + } check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); @@ -6318,22 +6323,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - - do { - ret = xa_insert(&fs_info->extent_buffers, - start >> fs_info->sectorsize_bits, - eb, GFP_NOFS); - if (ret == -ENOMEM) { - exists = ERR_PTR(ret); +again: + ret = 
radix_tree_preload(GFP_NOFS); + if (ret) { + exists = ERR_PTR(ret); + goto free_eb; + } + + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) goto free_eb; - } - if (ret == -EBUSY) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; - } - } while (ret); - + else + goto again; + } /* add one reference for the tree */ check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); @@ -6378,8 +6386,10 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); - xa_erase(&fs_info->extent_buffers, - eb->start >> fs_info->sectorsize_bits); + spin_lock(&fs_info->buffer_lock); + radix_tree_delete(&fs_info->buffer_radix, + eb->start >> fs_info->sectorsize_bits); + spin_unlock(&fs_info->buffer_lock); } else { spin_unlock(&eb->refs_lock); } @@ -7324,25 +7334,42 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } +#define GANG_LOOKUP_SIZE 16 static struct extent_buffer *get_next_extent_buffer( struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) { - struct extent_buffer *eb; - unsigned long index; + struct extent_buffer *gang[GANG_LOOKUP_SIZE]; + struct extent_buffer *found = NULL; u64 page_start = page_offset(page); + u64 cur = page_start; ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); lockdep_assert_held(&fs_info->buffer_lock); - xa_for_each_start(&fs_info->extent_buffers, index, eb, - page_start >> fs_info->sectorsize_bits) { - if (in_range(eb->start, page_start, PAGE_SIZE)) - return eb; - else if (eb->start >= page_start + PAGE_SIZE) - /* Already beyond page end */ - return NULL; + while (cur < page_start + PAGE_SIZE) { + int ret; + int i; + + ret = radix_tree_gang_lookup(&fs_info->buffer_radix, + (void **)gang, cur >> fs_info->sectorsize_bits, + min_t(unsigned int, GANG_LOOKUP_SIZE, + PAGE_SIZE / fs_info->nodesize)); + if (ret == 0) + goto out; + for (i = 0; i < ret; i++) { + /* Already beyond page end */ + if (gang[i]->start >= page_start + PAGE_SIZE) + goto out; + /* Found one */ + if (gang[i]->start >= bytenr) { + found = gang[i]; + goto out; + } + } + cur = gang[ret - 1]->start + gang[ret - 1]->len; } - return NULL; +out: + return found; } static int try_release_subpage_extent_buffer(struct page *page) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1fd827b99c1b..9dfde1af8a64 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2323,25 +2323,62 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); - if (ret != BTRFS_NO_LOG_SYNC) { + if (ret == BTRFS_NO_LOG_SYNC) { + ret = btrfs_end_transaction(trans); + goto out; + } + + /* We successfully logged the inode, attempt to sync the log. */ + if (!ret) { + ret = btrfs_sync_log(trans, root, &ctx); if (!ret) { - ret = btrfs_sync_log(trans, root, &ctx); - if (!ret) { - ret = btrfs_end_transaction(trans); - goto out; - } - } - if (!full_sync) { - ret = btrfs_wait_ordered_range(inode, start, len); - if (ret) { - btrfs_end_transaction(trans); - goto out; - } + ret = btrfs_end_transaction(trans); + goto out; } - ret = btrfs_commit_transaction(trans); - } else { + } + + /* + * At this point we need to commit the transaction because we had + * btrfs_need_log_full_commit() or some other error. 
+ * + * If we didn't do a full sync we have to stop the trans handle, wait on + * the ordered extents, start it again and commit the transaction. If + * we attempt to wait on the ordered extents here we could deadlock with + * something like fallocate() that is holding the extent lock trying to + * start a transaction while some other thread is trying to commit the + * transaction while we (fsync) are currently holding the transaction + * open. + */ + if (!full_sync) { ret = btrfs_end_transaction(trans); + if (ret) + goto out; + ret = btrfs_wait_ordered_range(inode, start, len); + if (ret) + goto out; + + /* + * This is safe to use here because we're only interested in + * making sure the transaction that had the ordered extents is + * committed. We aren't waiting on anything past this point, + * we're purely getting the transaction and committing it. + */ + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + + /* + * We committed the transaction and there's no currently + * running transaction, this means everything we care + * about made it to disk and we are done. + */ + if (ret == -ENOENT) + ret = 0; + goto out; + } } + + ret = btrfs_commit_transaction(trans); out: ASSERT(list_empty(&ctx.list)); err = file_check_and_advance_wb_err(file); @@ -2719,7 +2756,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); + if (WARN_ON(ret)) + goto out_trans; trans->block_rsv = rsv; cur_offset = start; @@ -2803,6 +2841,25 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, extent_info->file_offset += replace_len; } + /* + * We are releasing our handle on the transaction, balance the + * dirty pages of the btree inode and flush delayed items, and + * then get a new transaction handle, which may now point to a + * new transaction in case someone else may have committed the + * transaction we used to replace/drop file extent items. So + * bump the inode's iversion and update mtime and ctime except + * if we are called from a dedupe context. This is because a + * power failure/crash may happen after the transaction is + * committed and before we finish replacing/dropping all the + * file extent items we need. 
+ */ + inode_inc_iversion(&inode->vfs_inode); + + if (!extent_info || extent_info->update_times) { + inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode); + inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime; + } + ret = btrfs_update_inode(trans, root, inode); if (ret) break; @@ -2819,7 +2876,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); /* shouldn't happen */ + if (WARN_ON(ret)) + break; trans->block_rsv = rsv; cur_offset = drop_args.drop_end; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 81737eff92f3..d50448bf8eed 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3195,6 +3195,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset, ordered_extent->file_offset + logical_len); + btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); } else { BUG_ON(root == fs_info->tree_root); ret = insert_ordered_extent_file_extent(trans, ordered_extent); @@ -3576,7 +3578,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) u64 last_objectid = 0; int ret = 0, nr_unlink = 0; - /* Bail out if the cleanup is already running. */ if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) return 0; @@ -3659,17 +3660,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * * btrfs_find_orphan_roots() ran before us, which has * found all deleted roots and loaded them into - * fs_info->fs_roots. So here we can find if an + * fs_info->fs_roots_radix. So here we can find if an * orphan item corresponds to a deleted root by looking - * up the root from that xarray. + * up the root from that radix tree. */ - spin_lock(&fs_info->fs_roots_lock); - dead_root = xa_load(&fs_info->fs_roots, - (unsigned long)found_key.objectid); + spin_lock(&fs_info->fs_roots_radix_lock); + dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)found_key.objectid); if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) is_dead_root = 1; - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); if (is_dead_root) { /* prevent this orphan from being found again */ @@ -3909,7 +3910,7 @@ cache_index: * cache. * * This is required for both inode re-read from disk and delayed inode - * in the delayed_nodes xarray. + * in delayed_nodes_tree. */ if (BTRFS_I(inode)->last_trans == fs_info->generation) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, @@ -7679,7 +7680,19 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || em->block_start == EXTENT_MAP_INLINE) { free_extent_map(em); - ret = -ENOTBLK; + /* + * If we are in a NOWAIT context, return -EAGAIN in order to + * fallback to buffered IO. This is not only because we can + * block with buffered IO (no support for NOWAIT semantics at + * the moment) but also to avoid returning short reads to user + * space - this happens if we were able to read some data from + * previous non-compressed extents and then when we fallback to + * buffered IO, at btrfs_file_read_iter() by calling + * filemap_read(), we fail to fault in pages for the read buffer, + * in which case filemap_read() returns a short read (the number + * of bytes previously read is > 0, so it does not return -EFAULT). + */ + ret = (flags & IOMAP_NOWAIT) ? 
-EAGAIN : -ENOTBLK; goto unlock_err; } @@ -9897,6 +9910,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( extent_info.file_offset = file_offset; extent_info.extent_buf = (char *)&stack_fi; extent_info.is_new_extent = true; + extent_info.update_times = true; extent_info.qgroup_reserved = qgroup_released; extent_info.insertions = 0; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 313d9d685adb..33461b4f9c8b 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -45,7 +45,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne start_ns = ktime_get_ns(); down_read_nested(&eb->lock, nest); - eb->lock_owner = current->pid; trace_btrfs_tree_read_lock(eb, start_ns); } @@ -62,7 +61,6 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) int btrfs_try_tree_read_lock(struct extent_buffer *eb) { if (down_read_trylock(&eb->lock)) { - eb->lock_owner = current->pid; trace_btrfs_try_tree_read_lock(eb); return 1; } @@ -90,7 +88,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) void btrfs_tree_read_unlock(struct extent_buffer *eb) { trace_btrfs_tree_read_unlock(eb); - eb->lock_owner = 0; up_read(&eb->lock); } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index c39f8b3a5a4a..a3549d587464 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -344,6 +344,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, int ret; const u64 len = olen_aligned; u64 last_dest_end = destoff; + u64 prev_extent_end = off; ret = -ENOMEM; buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); @@ -363,7 +364,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, key.offset = off; while (1) { - u64 next_key_min_offset = key.offset + 1; struct btrfs_file_extent_item *extent; u64 extent_gen; int type; @@ -431,14 +431,21 @@ process_slot: * The first search might have left us at an extent item that * ends before our target range's start, can happen if we have * holes and NO_HOLES feature enabled. + * + * Subsequent searches may leave us on a file range we have + * processed before - this happens due to a race with ordered + * extent completion for a file range that is outside our source + * range, but that range was part of a file extent item that + * also covered a leading part of our source range. 
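The fs/btrfs/reflink.c hunk above tracks prev_extent_end instead of next_key_min_offset, so that a search which lands on an already-processed file range (per the race described in the added comment) is skipped rather than cloned twice. A toy loop showing just the skip condition, with made-up extent data:

#include <stdio.h>

struct extent { unsigned long long off, len; };

int main(void)
{
	/* The second entry models a search re-returning a range that was
	 * already handled; it must be skipped, not cloned again. */
	struct extent ext[] = { {0, 16}, {0, 16}, {16, 8}, {24, 8} };
	unsigned long long prev_extent_end = 0;

	for (int i = 0; i < 4; i++) {
		if (ext[i].off + ext[i].len <= prev_extent_end)
			continue; /* already processed, as in the hunk above */
		printf("clone [%llu, %llu)\n",
		       ext[i].off, ext[i].off + ext[i].len);
		prev_extent_end = ext[i].off + ext[i].len;
	}
	return 0;
}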
*/ - if (key.offset + datal <= off) { + if (key.offset + datal <= prev_extent_end) { path->slots[0]++; goto process_slot; } else if (key.offset >= off + len) { break; } - next_key_min_offset = key.offset + datal; + + prev_extent_end = key.offset + datal; size = btrfs_item_size(leaf, slot); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), size); @@ -489,6 +496,7 @@ process_slot: clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; clone_info.is_new_extent = false; + clone_info.update_times = !no_time_update; ret = btrfs_replace_file_extents(BTRFS_I(inode), path, drop_start, new_key.offset + datal - 1, &clone_info, &trans); @@ -550,7 +558,7 @@ process_slot: break; btrfs_release_path(path); - key.offset = next_key_min_offset; + key.offset = prev_extent_end; if (fatal_signal_pending(current)) { ret = -EINTR; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fa56890ff81f..c7dea639a56f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -10,6 +10,7 @@ #include <linux/mount.h> #include <linux/xattr.h> #include <linux/posix_acl_xattr.h> +#include <linux/radix-tree.h> #include <linux/vmalloc.h> #include <linux/string.h> #include <linux/compat.h> @@ -127,7 +128,7 @@ struct send_ctx { struct list_head new_refs; struct list_head deleted_refs; - struct xarray name_cache; + struct radix_tree_root name_cache; struct list_head name_cache_list; int name_cache_size; @@ -268,13 +269,14 @@ struct orphan_dir_info { struct name_cache_entry { struct list_head list; /* - * On 32bit kernels, xarray has only 32bit indices, but we need to - * handle 64bit inums. We use the lower 32bit of the 64bit inum to store - * it in the tree. If more than one inum would fall into the same entry, - * we use inum_aliases to store the additional entries. inum_aliases is - * also used to store entries with the same inum but different generations. + * radix_tree has only 32bit entries but we need to handle 64bit inums. + * We use the lower 32bit of the 64bit inum to store it in the tree. If + * more then one inum would fall into the same entry, we use radix_list + * to store the additional entries. radix_list is also used to store + * entries where two entries have the same inum but different + * generations. */ - struct list_head inum_aliases; + struct list_head radix_list; u64 ino; u64 gen; u64 parent_ino; @@ -2024,9 +2026,9 @@ out: } /* - * Insert a name cache entry. On 32bit kernels the xarray index is 32bit, + * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, * so we need to do some special handling in case we have clashes. This function - * takes care of this with the help of name_cache_entry::inum_aliases. + * takes care of this with the help of name_cache_entry::radix_list. * In case of error, nce is kfreed. 
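As the send.c comment above says, the name cache must key 64-bit inode numbers through a 32-bit-indexed radix tree, so entries are stored under the lower 32 bits of the inum and clashes (including same-inum, different-generation entries) are resolved by walking a per-slot list, which is what radix_list is for. A self-contained model of that scheme; the table, NSLOTS and the function names are invented for illustration:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct entry { uint64_t ino; uint64_t gen; struct entry *next; };

#define NSLOTS 1024 /* stand-in for the radix tree, indexed by low bits */
static struct entry *slots[NSLOTS];

static void cache_insert(uint64_t ino, uint64_t gen)
{
	struct entry *e = malloc(sizeof(*e));

	if (!e)
		return;
	e->ino = ino;
	e->gen = gen;
	/* Index by the truncated (low 32 bit) inum, chain collisions. */
	e->next = slots[(uint32_t)ino % NSLOTS];
	slots[(uint32_t)ino % NSLOTS] = e;
}

static struct entry *cache_search(uint64_t ino, uint64_t gen)
{
	for (struct entry *e = slots[(uint32_t)ino % NSLOTS]; e; e = e->next)
		if (e->ino == ino && e->gen == gen) /* full key resolves clashes */
			return e;
	return NULL;
}

int main(void)
{
	cache_insert(5, 1);
	cache_insert(5 + (1ULL << 32), 1); /* same low 32 bits, different inum */

	printf("%d %d\n", cache_search(5, 1) != NULL,
	       cache_search(5 + (1ULL << 32), 1) != NULL); /* prints: 1 1 */
	return 0;
}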
*/ static int name_cache_insert(struct send_ctx *sctx, @@ -2035,7 +2037,8 @@ static int name_cache_insert(struct send_ctx *sctx, int ret = 0; struct list_head *nce_head; - nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino); + nce_head = radix_tree_lookup(&sctx->name_cache, + (unsigned long)nce->ino); if (!nce_head) { nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); if (!nce_head) { @@ -2044,14 +2047,14 @@ static int name_cache_insert(struct send_ctx *sctx, } INIT_LIST_HEAD(nce_head); - ret = xa_insert(&sctx->name_cache, nce->ino, nce_head, GFP_KERNEL); + ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); if (ret < 0) { kfree(nce_head); kfree(nce); return ret; } } - list_add_tail(&nce->inum_aliases, nce_head); + list_add_tail(&nce->radix_list, nce_head); list_add_tail(&nce->list, &sctx->name_cache_list); sctx->name_cache_size++; @@ -2063,14 +2066,15 @@ static void name_cache_delete(struct send_ctx *sctx, { struct list_head *nce_head; - nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino); + nce_head = radix_tree_lookup(&sctx->name_cache, + (unsigned long)nce->ino); if (!nce_head) { btrfs_err(sctx->send_root->fs_info, "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", nce->ino, sctx->name_cache_size); } - list_del(&nce->inum_aliases); + list_del(&nce->radix_list); list_del(&nce->list); sctx->name_cache_size--; @@ -2078,7 +2082,7 @@ static void name_cache_delete(struct send_ctx *sctx, * We may not get to the final release of nce_head if the lookup fails */ if (nce_head && list_empty(nce_head)) { - xa_erase(&sctx->name_cache, (unsigned long)nce->ino); + radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); kfree(nce_head); } } @@ -2089,11 +2093,11 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, struct list_head *nce_head; struct name_cache_entry *cur; - nce_head = xa_load(&sctx->name_cache, (unsigned long)ino); + nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); if (!nce_head) return NULL; - list_for_each_entry(cur, nce_head, inum_aliases) { + list_for_each_entry(cur, nce_head, radix_list) { if (cur->ino == ino && cur->gen == gen) return cur; } @@ -7518,7 +7522,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); - xa_init_flags(&sctx->name_cache, GFP_KERNEL); + INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); INIT_LIST_HEAD(&sctx->name_cache_list); sctx->flags = arg->flags; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b1fdc6a26c76..6627dd7875ee 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -763,6 +763,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, compress_force = false; no_compress++; } else { + btrfs_err(info, "unrecognized compression value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -821,8 +823,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_thread_pool: ret = match_int(&args[0], &intarg); if (ret) { + btrfs_err(info, "unrecognized thread_pool value %s", + args[0].from); goto out; } else if (intarg == 0) { + btrfs_err(info, "invalid value 0 for thread_pool"); ret = -EINVAL; goto out; } @@ -883,8 +888,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_ratio: ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, "unrecognized metadata_ratio value %s", + args[0].from); goto out; + } info->metadata_ratio = intarg; btrfs_info(info, 
"metadata ratio %u", info->metadata_ratio); @@ -901,6 +909,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_and_info(info, DISCARD_ASYNC, "turning on async discard"); } else { + btrfs_err(info, "unrecognized discard mode value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -933,6 +943,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_and_info(info, FREE_SPACE_TREE, "enabling free space tree"); } else { + btrfs_err(info, "unrecognized space_cache value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -1014,8 +1026,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_check_integrity_print_mask: ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, + "unrecognized check_integrity_print_mask value %s", + args[0].from); goto out; + } info->check_integrity_print_mask = intarg; btrfs_info(info, "check_integrity_print_mask 0x%x", info->check_integrity_print_mask); @@ -1030,13 +1046,15 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, goto out; #endif case Opt_fatal_errors: - if (strcmp(args[0].from, "panic") == 0) + if (strcmp(args[0].from, "panic") == 0) { btrfs_set_opt(info->mount_opt, PANIC_ON_FATAL_ERROR); - else if (strcmp(args[0].from, "bug") == 0) + } else if (strcmp(args[0].from, "bug") == 0) { btrfs_clear_opt(info->mount_opt, PANIC_ON_FATAL_ERROR); - else { + } else { + btrfs_err(info, "unrecognized fatal_errors value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -1044,8 +1062,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_commit_interval: intarg = 0; ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, "unrecognized commit_interval value %s", + args[0].from); + ret = -EINVAL; goto out; + } if (intarg == 0) { btrfs_info(info, "using default commit interval %us", @@ -1059,8 +1081,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_rescue: ret = parse_rescue_options(info, args[0].from); - if (ret < 0) + if (ret < 0) { + btrfs_err(info, "unrecognized rescue value %s", + args[0].from); goto out; + } break; #ifdef CONFIG_BTRFS_DEBUG case Opt_fragment_all: @@ -1985,6 +2010,14 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; + /* V1 cache is not supported for subpage mount. 
*/ + if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { + btrfs_warn(fs_info, + "v1 space cache is not supported for page size %lu with sectorsize %u", + PAGE_SIZE, fs_info->sectorsize); + ret = -EINVAL; + goto restore; + } btrfs_remount_begin(fs_info, old_opts, *flags); btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 1591bfa55bcc..d8e56edd6991 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -150,8 +150,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { - unsigned long index; - struct extent_buffer *eb; + struct radix_tree_iter iter; + void **slot; struct btrfs_device *dev, *tmp; if (!fs_info) @@ -163,9 +163,25 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) test_mnt->mnt_sb->s_fs_info = NULL; - xa_for_each(&fs_info->extent_buffers, index, eb) { + spin_lock(&fs_info->buffer_lock); + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { + struct extent_buffer *eb; + + eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); + if (!eb) + continue; + /* Shouldn't happen but that kind of thinking creates CVE's */ + if (radix_tree_exception(eb)) { + if (radix_tree_deref_retry(eb)) + slot = radix_tree_iter_retry(&iter); + continue; + } + slot = radix_tree_iter_resume(slot, &iter); + spin_unlock(&fs_info->buffer_lock); free_extent_buffer_stale(eb); + spin_lock(&fs_info->buffer_lock); } + spin_unlock(&fs_info->buffer_lock); btrfs_mapping_tree_free(&fs_info->mapping_tree); list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, @@ -186,7 +202,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root) if (!root) return; /* Will be freed by btrfs_free_fs_roots */ - if (WARN_ON(test_bit(BTRFS_ROOT_REGISTERED, &root->state))) + if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) return; btrfs_global_root_delete(root); btrfs_put_root(root); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 06c0a958d114..875b801ab3d7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -23,7 +23,7 @@ #include "space-info.h" #include "zoned.h" -#define BTRFS_ROOT_TRANS_TAG XA_MARK_0 +#define BTRFS_ROOT_TRANS_TAG 0 /* * Transaction states and transitions @@ -437,15 +437,15 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, */ smp_wmb(); - spin_lock(&fs_info->fs_roots_lock); + spin_lock(&fs_info->fs_roots_radix_lock); if (root->last_trans == trans->transid && !force) { - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); return 0; } - xa_set_mark(&fs_info->fs_roots, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&fs_info->fs_roots_lock); + radix_tree_tag_set(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); root->last_trans = trans->transid; /* this is pretty tricky. 
We don't want to @@ -487,9 +487,11 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, spin_unlock(&cur_trans->dropped_roots_lock); /* Make sure we don't try to update the root at commit time */ - xa_clear_mark(&fs_info->fs_roots, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); + spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); } int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, @@ -1402,8 +1404,9 @@ void btrfs_add_dead_root(struct btrfs_root *root) static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root; - unsigned long index; + struct btrfs_root *gang[8]; + int i; + int ret; /* * At this point no one can be using this transaction to modify any tree @@ -1411,46 +1414,57 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) */ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); - spin_lock(&fs_info->fs_roots_lock); - xa_for_each_marked(&fs_info->fs_roots, index, root, BTRFS_ROOT_TRANS_TAG) { - int ret; - - /* - * At this point we can neither have tasks logging inodes - * from a root nor trying to commit a log tree. - */ - ASSERT(atomic_read(&root->log_writers) == 0); - ASSERT(atomic_read(&root->log_commit[0]) == 0); - ASSERT(atomic_read(&root->log_commit[1]) == 0); - - xa_clear_mark(&fs_info->fs_roots, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&fs_info->fs_roots_lock); - - btrfs_free_log(trans, root); - ret = btrfs_update_reloc_root(trans, root); - if (ret) - return ret; - - /* See comments in should_cow_block() */ - clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); - smp_mb__after_atomic(); + spin_lock(&fs_info->fs_roots_radix_lock); + while (1) { + ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang), + BTRFS_ROOT_TRANS_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + struct btrfs_root *root = gang[i]; + int ret2; + + /* + * At this point we can neither have tasks logging inodes + * from a root nor trying to commit a log tree. 
+ */ + ASSERT(atomic_read(&root->log_writers) == 0); + ASSERT(atomic_read(&root->log_commit[0]) == 0); + ASSERT(atomic_read(&root->log_commit[1]) == 0); + + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); + + btrfs_free_log(trans, root); + ret2 = btrfs_update_reloc_root(trans, root); + if (ret2) + return ret2; + + /* see comments in should_cow_block() */ + clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); + smp_mb__after_atomic(); + + if (root->commit_root != root->node) { + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); + btrfs_set_root_node(&root->root_item, + root->node); + } - if (root->commit_root != root->node) { - list_add_tail(&root->dirty_list, - &trans->transaction->switch_commits); - btrfs_set_root_node(&root->root_item, root->node); + ret2 = btrfs_update_root(trans, fs_info->tree_root, + &root->root_key, + &root->root_item); + if (ret2) + return ret2; + spin_lock(&fs_info->fs_roots_radix_lock); + btrfs_qgroup_free_meta_all_pertrans(root); } - - ret = btrfs_update_root(trans, fs_info->tree_root, - &root->root_key, &root->root_item); - if (ret) - return ret; - spin_lock(&fs_info->fs_roots_lock); - btrfs_qgroup_free_meta_all_pertrans(root); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); return 0; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 11237a913bee..d99026df6f67 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1735,12 +1735,14 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &mapped_length, &bioc); if (ret || !bioc || mapped_length < PAGE_SIZE) { - btrfs_put_bioc(bioc); - return -EIO; + ret = -EIO; + goto out_put_bioc; } - if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) - return -EINVAL; + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = -EINVAL; + goto out_put_bioc; + } nofs_flag = memalloc_nofs_save(); nmirrors = (int)bioc->num_stripes; @@ -1759,7 +1761,8 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, break; } memalloc_nofs_restore(nofs_flag); - +out_put_bioc: + btrfs_put_bioc(bioc); return ret; } @@ -1885,7 +1888,6 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ { struct btrfs_fs_info *fs_info = block_group->fs_info; struct map_lookup *map; - bool need_zone_finish; int ret = 0; int i; @@ -1942,12 +1944,6 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ } } - /* - * The block group is not fully allocated, so not fully written yet. We - * need to send ZONE_FINISH command to free up an active zone. 
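The read_zone_info() hunk above is the classic single-exit cleanup: two early returns that leaked bioc are replaced by one out_put_bioc: label that every path, success or failure, falls through. The same shape as a freestanding sketch (all names invented):

#include <stdio.h>
#include <stdlib.h>

/* one exit label owns the release, so no early-error path can leak */
static int read_thing(int fail_early)
{
	int ret = 0;
	char *buf = malloc(64);

	if (!buf)
		return -1;
	if (fail_early) {
		ret = -1;	/* was: bare return, leaking buf */
		goto out_free;
	}
	snprintf(buf, 64, "ok");
out_free:
	free(buf);	/* mirrors btrfs_put_bioc() above */
	return ret;
}

int main(void)
{
	return read_thing(1) == -1 ? 0 : 1;
}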
- */ - need_zone_finish = !btrfs_zoned_bg_is_full(block_group); - block_group->zone_is_active = 0; block_group->alloc_offset = block_group->zone_capacity; block_group->free_space_ctl->free_space = 0; @@ -1963,15 +1959,13 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ if (device->zone_info->max_active_zones == 0) continue; - if (need_zone_finish) { - ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, - physical >> SECTOR_SHIFT, - device->zone_info->zone_size >> SECTOR_SHIFT, - GFP_NOFS); + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, + physical >> SECTOR_SHIFT, + device->zone_info->zone_size >> SECTOR_SHIFT, + GFP_NOFS); - if (ret) - return ret; - } + if (ret) + return ret; btrfs_dev_clear_active_zone(device, physical); } @@ -2139,3 +2133,30 @@ bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) factor = div64_u64(used * 100, total); return factor >= fs_info->bg_reclaim_threshold; } + +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length) +{ + struct btrfs_block_group *block_group; + + if (!btrfs_is_zoned(fs_info)) + return; + + block_group = btrfs_lookup_block_group(fs_info, logical); + /* It should be called on a previous data relocation block group. */ + ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)); + + spin_lock(&block_group->lock); + if (!block_group->zoned_data_reloc_ongoing) + goto out; + + /* All relocation extents are written. */ + if (block_group->start + block_group->alloc_offset == logical + length) { + /* Now, release this block group for further allocations. */ + block_group->zoned_data_reloc_ongoing = 0; + } + +out: + spin_unlock(&block_group->lock); + btrfs_put_block_group(block_group); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index bb1a189e11f9..6b2eec99162b 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -77,6 +77,8 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -243,6 +245,9 @@ static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) { return false; } + +static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { } #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index a41ae6efc545..1fee702d5529 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -21,7 +21,8 @@ static int cachefiles_ondemand_fd_release(struct inode *inode, * anon_fd. 
*/ xas_for_each(&xas, req, ULONG_MAX) { - if (req->msg.opcode == CACHEFILES_OP_READ) { + if (req->msg.object_id == object_id && + req->msg.opcode == CACHEFILES_OP_READ) { req->error = -EIO; complete(&req->done); xas_store(&xas, NULL); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6dee88815491..d6e5916138e4 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -63,7 +63,7 @@ (CONGESTION_ON_THRESH(congestion_kb) >> 2)) static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, - struct folio *folio, void **_fsdata); + struct folio **foliop, void **_fsdata); static inline struct ceph_snap_context *page_snap_context(struct page *page) { @@ -1288,18 +1288,19 @@ ceph_find_incompatible(struct page *page) } static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, - struct folio *folio, void **_fsdata) + struct folio **foliop, void **_fsdata) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; - snapc = ceph_find_incompatible(folio_page(folio, 0)); + snapc = ceph_find_incompatible(folio_page(*foliop, 0)); if (snapc) { int r; - folio_unlock(folio); - folio_put(folio); + folio_unlock(*foliop); + folio_put(*foliop); + *foliop = NULL; if (IS_ERR(snapc)) return PTR_ERR(snapc); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 38c930384d41..ac8fd5e7f540 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4377,6 +4377,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s) ihold(inode); dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode)); spin_unlock(&mdsc->cap_dirty_lock); + ceph_wait_on_async_create(inode); ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); iput(inode); spin_lock(&mdsc->cap_dirty_lock); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 1dd995efd5b8..2cfbac8bb965 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -162,6 +162,8 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface) seq_printf(m, "\t\tIPv4: %pI4\n", &ipv4->sin_addr); else if (iface->sockaddr.ss_family == AF_INET6) seq_printf(m, "\t\tIPv6: %pI6\n", &ipv6->sin6_addr); + if (!iface->is_active) + seq_puts(m, "\t\t[for-cleanup]\n"); } static int cifs_debug_files_proc_show(struct seq_file *m, void *v) @@ -221,6 +223,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; + struct cifs_server_iface *iface; int c, i, j; seq_puts(m, @@ -456,11 +459,10 @@ skip_rdma: if (ses->iface_count) seq_printf(m, "\n\n\tServer interfaces: %zu", ses->iface_count); - for (j = 0; j < ses->iface_count; j++) { - struct cifs_server_iface *iface; - - iface = &ses->iface_list[j]; - seq_printf(m, "\n\t%d)", j+1); + j = 0; + list_for_each_entry(iface, &ses->iface_list, + iface_head) { + seq_printf(m, "\n\t%d)", ++j); cifs_dump_iface(m, iface); if (is_ses_using_iface(ses, iface)) seq_puts(m, "\t\t[CONNECTED]\n"); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index e7737166e5b8..a643c84ff1e9 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -80,6 +80,9 @@ #define SMB_DNS_RESOLVE_INTERVAL_MIN 120 #define SMB_DNS_RESOLVE_INTERVAL_DEFAULT 600 +/* smb multichannel query server interfaces interval in seconds */ +#define SMB_INTERFACE_POLL_INTERVAL 600 + /* maximum number of PDUs in one compound */ #define MAX_COMPOUND 5 @@ -933,15 +936,67 @@ static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net) #endif struct cifs_server_iface { + struct 
list_head iface_head; + struct kref refcount; size_t speed; unsigned int rdma_capable : 1; unsigned int rss_capable : 1; + unsigned int is_active : 1; /* unset if non existent */ struct sockaddr_storage sockaddr; }; +/* release iface when last ref is dropped */ +static inline void +release_iface(struct kref *ref) +{ + struct cifs_server_iface *iface = container_of(ref, + struct cifs_server_iface, + refcount); + list_del_init(&iface->iface_head); + kfree(iface); +} + +/* + * compare two interfaces a and b + * return 0 if everything matches. + * return 1 if a has higher link speed, or rdma capable, or rss capable + * return -1 otherwise. + */ +static inline int +iface_cmp(struct cifs_server_iface *a, struct cifs_server_iface *b) +{ + int cmp_ret = 0; + + WARN_ON(!a || !b); + if (a->speed == b->speed) { + if (a->rdma_capable == b->rdma_capable) { + if (a->rss_capable == b->rss_capable) { + cmp_ret = memcmp(&a->sockaddr, &b->sockaddr, + sizeof(a->sockaddr)); + if (!cmp_ret) + return 0; + else if (cmp_ret > 0) + return 1; + else + return -1; + } else if (a->rss_capable > b->rss_capable) + return 1; + else + return -1; + } else if (a->rdma_capable > b->rdma_capable) + return 1; + else + return -1; + } else if (a->speed > b->speed) + return 1; + else + return -1; +} + struct cifs_chan { unsigned int in_reconnect : 1; /* if session setup in progress for this channel */ struct TCP_Server_Info *server; + struct cifs_server_iface *iface; /* interface in use */ __u8 signkey[SMB3_SIGN_KEY_SIZE]; }; @@ -993,7 +1048,7 @@ struct cifs_ses { */ spinlock_t iface_lock; /* ========= begin: protected by iface_lock ======== */ - struct cifs_server_iface *iface_list; + struct list_head iface_list; size_t iface_count; unsigned long iface_last_update; /* jiffies */ /* ========= end: protected by iface_lock ======== */ @@ -1203,6 +1258,7 @@ struct cifs_tcon { #ifdef CONFIG_CIFS_DFS_UPCALL struct list_head ulist; /* cache update list */ #endif + struct delayed_work query_interfaces; /* query interfaces workqueue job */ }; /* diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 3b7366ec03c7..d59aebefa71c 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -636,6 +636,13 @@ cifs_chan_clear_need_reconnect(struct cifs_ses *ses, bool cifs_chan_needs_reconnect(struct cifs_ses *ses, struct TCP_Server_Info *server); +bool +cifs_chan_is_iface_active(struct cifs_ses *ses, + struct TCP_Server_Info *server); +int +cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server); +int +SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon); void extract_unc_hostname(const char *unc, const char **h, size_t *len); int copy_path_name(char *dst, const char *src); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 1849e3411487..386bb523c69e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -145,6 +145,25 @@ requeue_resolve: return rc; } +static void smb2_query_server_interfaces(struct work_struct *work) +{ + int rc; + struct cifs_tcon *tcon = container_of(work, + struct cifs_tcon, + query_interfaces.work); + + /* + * query server network interfaces, in case they change + */ + rc = SMB3_request_interfaces(0, tcon); + if (rc) { + cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", + __func__, rc); + } + + queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, + (SMB_INTERFACE_POLL_INTERVAL * HZ)); +} static void cifs_resolve_server(struct work_struct *work) { @@ -217,7 +236,7 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, bool 
mark_smb_session) { struct TCP_Server_Info *pserver; - struct cifs_ses *ses; + struct cifs_ses *ses, *nses; struct cifs_tcon *tcon; /* @@ -231,7 +250,20 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) { + /* check if iface is still active */ + if (!cifs_chan_is_iface_active(ses, server)) { + /* + * HACK: drop the lock before calling + * cifs_chan_update_iface to avoid deadlock + */ + ses->ses_count++; + spin_unlock(&cifs_tcp_ses_lock); + cifs_chan_update_iface(ses, server); + spin_lock(&cifs_tcp_ses_lock); + ses->ses_count--; + } + spin_lock(&ses->chan_lock); if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server)) goto next_session; @@ -1886,7 +1918,6 @@ void cifs_put_smb_ses(struct cifs_ses *ses) list_del_init(&ses->smb_ses_list); spin_unlock(&cifs_tcp_ses_lock); - spin_lock(&ses->chan_lock); chan_count = ses->chan_count; /* close any extra channels */ @@ -1894,13 +1925,14 @@ void cifs_put_smb_ses(struct cifs_ses *ses) int i; for (i = 1; i < chan_count; i++) { - spin_unlock(&ses->chan_lock); + if (ses->chans[i].iface) { + kref_put(&ses->chans[i].iface->refcount, release_iface); + ses->chans[i].iface = NULL; + } cifs_put_tcp_session(ses->chans[i].server, 0); - spin_lock(&ses->chan_lock); ses->chans[i].server = NULL; } } - spin_unlock(&ses->chan_lock); sesInfoFree(ses); cifs_put_tcp_session(server, 0); @@ -2270,6 +2302,9 @@ cifs_put_tcon(struct cifs_tcon *tcon) list_del_init(&tcon->tcon_list); spin_unlock(&cifs_tcp_ses_lock); + /* cancel polling of interfaces */ + cancel_delayed_work_sync(&tcon->query_interfaces); + if (tcon->use_witness) { int rc; @@ -2507,6 +2542,12 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) tcon->local_lease = ctx->local_lease; INIT_LIST_HEAD(&tcon->pending_opens); + /* schedule query interfaces poll */ + INIT_DELAYED_WORK(&tcon->query_interfaces, + smb2_query_server_interfaces); + queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, + (SMB_INTERFACE_POLL_INTERVAL * HZ)); + spin_lock(&cifs_tcp_ses_lock); list_add(&tcon->tcon_list, &ses->tcon_list); spin_unlock(&cifs_tcp_ses_lock); @@ -3982,10 +4023,16 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, struct nls_table *nls_info) { int rc = -ENOSYS; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; + struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; bool is_binding = false; - spin_lock(&cifs_tcp_ses_lock); + if (server->dstaddr.ss_family == AF_INET6) + scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr); + else + scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr); + if (ses->ses_status != SES_GOOD && ses->ses_status != SES_NEW && ses->ses_status != SES_NEED_RECON) { diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index c69e1240d730..0e84e6fcf8ab 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -75,6 +75,7 @@ sesInfoAlloc(void) INIT_LIST_HEAD(&ret_buf->tcon_list); mutex_init(&ret_buf->session_mutex); spin_lock_init(&ret_buf->iface_lock); + INIT_LIST_HEAD(&ret_buf->iface_list); spin_lock_init(&ret_buf->chan_lock); } return ret_buf; @@ -83,6 +84,8 @@ sesInfoAlloc(void) void sesInfoFree(struct cifs_ses *buf_to_free) { + struct cifs_server_iface *iface = NULL, *niface = NULL; + if (buf_to_free == NULL) { cifs_dbg(FYI, "Null buffer passed to sesInfoFree\n"); return; @@ -96,7 +99,11 @@ 
sesInfoFree(struct cifs_ses *buf_to_free) kfree(buf_to_free->user_name); kfree(buf_to_free->domainName); kfree_sensitive(buf_to_free->auth_key.response); - kfree(buf_to_free->iface_list); + spin_lock(&buf_to_free->iface_lock); + list_for_each_entry_safe(iface, niface, &buf_to_free->iface_list, + iface_head) + kref_put(&iface->refcount, release_iface); + spin_unlock(&buf_to_free->iface_lock); kfree_sensitive(buf_to_free); } diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index d417de354d9d..02c8b2906196 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -58,7 +58,7 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface) spin_lock(&ses->chan_lock); for (i = 0; i < ses->chan_count; i++) { - if (is_server_using_iface(ses->chans[i].server, iface)) { + if (ses->chans[i].iface == iface) { spin_unlock(&ses->chan_lock); return true; } @@ -146,16 +146,24 @@ cifs_chan_needs_reconnect(struct cifs_ses *ses, return CIFS_CHAN_NEEDS_RECONNECT(ses, chan_index); } +bool +cifs_chan_is_iface_active(struct cifs_ses *ses, + struct TCP_Server_Info *server) +{ + unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + + return ses->chans[chan_index].iface && + ses->chans[chan_index].iface->is_active; +} + /* returns number of channels added */ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) { int old_chan_count, new_chan_count; int left; - int i = 0; int rc = 0; int tries = 0; - struct cifs_server_iface *ifaces = NULL; - size_t iface_count; + struct cifs_server_iface *iface = NULL, *niface = NULL; spin_lock(&ses->chan_lock); @@ -185,32 +193,16 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) spin_unlock(&ses->chan_lock); /* - * Make a copy of the iface list at the time and use that - * instead so as to not hold the iface spinlock for opening - * channels - */ - spin_lock(&ses->iface_lock); - iface_count = ses->iface_count; - if (iface_count <= 0) { - spin_unlock(&ses->iface_lock); - cifs_dbg(VFS, "no iface list available to open channels\n"); - return 0; - } - ifaces = kmemdup(ses->iface_list, iface_count*sizeof(*ifaces), - GFP_ATOMIC); - if (!ifaces) { - spin_unlock(&ses->iface_lock); - return 0; - } - spin_unlock(&ses->iface_lock); - - /* * Keep connecting to same, fastest, iface for all channels as * long as its RSS. Try next fastest one if not RSS or channel * creation fails. 
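cifs_try_adding_channels() now walks the live iface list instead of a snapshot, which depends on the rule visible above: pin the current element with a reference, drop iface_lock across the (possibly sleeping) channel setup, then re-take the lock before continuing the walk. A hedged pthreads analogue of that pin-drop-relock step (hand-rolled refcount, not the kernel kref/list API):

#include <pthread.h>
#include <stdio.h>

struct iface {
	int refs;	/* stands in for struct kref */
	int id;
};

static pthread_mutex_t iface_lock = PTHREAD_MUTEX_INITIALIZER;

static void connect_channel(struct iface *i)	/* may sleep: no lock held */
{
	printf("connecting via iface %d\n", i->id);
}

static void try_channel(struct iface *i)
{
	pthread_mutex_lock(&iface_lock);
	i->refs++;			/* pin before unlocking (kref_get) */
	pthread_mutex_unlock(&iface_lock);

	connect_channel(i);		/* safe: the iface cannot be freed */

	pthread_mutex_lock(&iface_lock);
	i->refs--;			/* kref_put(..., release_iface) */
	pthread_mutex_unlock(&iface_lock);
}

int main(void)
{
	struct iface i = { .refs = 1, .id = 0 };

	try_channel(&i);
	return 0;
}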
*/ + spin_lock(&ses->iface_lock); + iface = list_first_entry(&ses->iface_list, struct cifs_server_iface, + iface_head); + spin_unlock(&ses->iface_lock); + while (left > 0) { - struct cifs_server_iface *iface; tries++; if (tries > 3*ses->chan_max) { @@ -219,31 +211,128 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) break; } - iface = &ifaces[i]; - if (is_ses_using_iface(ses, iface) && !iface->rss_capable) { - i = (i+1) % iface_count; - continue; + spin_lock(&ses->iface_lock); + if (!ses->iface_count) { + spin_unlock(&ses->iface_lock); + break; } - rc = cifs_ses_add_channel(cifs_sb, ses, iface); - if (rc) { - cifs_dbg(FYI, "failed to open extra channel on iface#%d rc=%d\n", - i, rc); - i = (i+1) % iface_count; - continue; + list_for_each_entry_safe_from(iface, niface, &ses->iface_list, + iface_head) { + /* skip ifaces that are unusable */ + if (!iface->is_active || + (is_ses_using_iface(ses, iface) && + !iface->rss_capable)) { + continue; + } + + /* take ref before unlock */ + kref_get(&iface->refcount); + + spin_unlock(&ses->iface_lock); + rc = cifs_ses_add_channel(cifs_sb, ses, iface); + spin_lock(&ses->iface_lock); + + if (rc) { + cifs_dbg(VFS, "failed to open extra channel on iface:%pIS rc=%d\n", + &iface->sockaddr, + rc); + kref_put(&iface->refcount, release_iface); + continue; + } + + cifs_dbg(FYI, "successfully opened new channel on iface:%pIS\n", + &iface->sockaddr); + break; } + spin_unlock(&ses->iface_lock); - cifs_dbg(FYI, "successfully opened new channel on iface#%d\n", - i); left--; new_chan_count++; } - kfree(ifaces); return new_chan_count - old_chan_count; } /* + * update the iface for the channel if necessary. + * will return 0 when iface is updated, 1 if removed, 2 otherwise + * Must be called with chan_lock held. + */ +int +cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) +{ + unsigned int chan_index; + struct cifs_server_iface *iface = NULL; + struct cifs_server_iface *old_iface = NULL; + int rc = 0; + + spin_lock(&ses->chan_lock); + chan_index = cifs_ses_get_chan_index(ses, server); + if (!chan_index) { + spin_unlock(&ses->chan_lock); + return 0; + } + + if (ses->chans[chan_index].iface) { + old_iface = ses->chans[chan_index].iface; + if (old_iface->is_active) { + spin_unlock(&ses->chan_lock); + return 1; + } + } + spin_unlock(&ses->chan_lock); + + spin_lock(&ses->iface_lock); + /* then look for a new one */ + list_for_each_entry(iface, &ses->iface_list, iface_head) { + if (!iface->is_active || + (is_ses_using_iface(ses, iface) && + !iface->rss_capable)) { + continue; + } + kref_get(&iface->refcount); + } + + if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) { + rc = 1; + iface = NULL; + cifs_dbg(FYI, "unable to find a suitable iface\n"); + } + + /* now drop the ref to the current iface */ + if (old_iface && iface) { + kref_put(&old_iface->refcount, release_iface); + cifs_dbg(FYI, "replacing iface: %pIS with %pIS\n", + &old_iface->sockaddr, + &iface->sockaddr); + } else if (old_iface) { + kref_put(&old_iface->refcount, release_iface); + cifs_dbg(FYI, "releasing ref to iface: %pIS\n", + &old_iface->sockaddr); + } else { + WARN_ON(!iface); + cifs_dbg(FYI, "adding new iface: %pIS\n", &iface->sockaddr); + } + spin_unlock(&ses->iface_lock); + + spin_lock(&ses->chan_lock); + chan_index = cifs_ses_get_chan_index(ses, server); + ses->chans[chan_index].iface = iface; + + /* No iface is found. 
if secondary chan, drop connection */ + if (!iface && CIFS_SERVER_IS_CHAN(server)) + ses->chans[chan_index].server = NULL; + + spin_unlock(&ses->chan_lock); + + if (!iface && CIFS_SERVER_IS_CHAN(server)) + cifs_put_tcp_session(server, false); + + return rc; +} + +/* * If server is a channel of ses, return the corresponding enclosing * cifs_chan otherwise return NULL. */ @@ -355,6 +444,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, spin_unlock(&ses->chan_lock); goto out; } + chan->iface = iface; ses->chan_count++; atomic_set(&ses->chan_seq, 0); @@ -384,6 +474,14 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, out: if (rc && chan->server) { + /* + * we should avoid race with these delayed works before we + * remove this channel + */ + cancel_delayed_work_sync(&chan->server->echo); + cancel_delayed_work_sync(&chan->server->resolve); + cancel_delayed_work_sync(&chan->server->reconnect); + spin_lock(&ses->chan_lock); /* we rely on all bits beyond chan_count to be clear */ cifs_chan_clear_need_reconnect(ses, chan->server); @@ -394,10 +492,9 @@ out: */ WARN_ON(ses->chan_count < 1); spin_unlock(&ses->chan_lock); - } - if (rc && chan->server) cifs_put_tcp_session(chan->server, 0); + } return rc; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 8543cafdfd34..8802995b2d3d 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -512,73 +512,41 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) static int parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, size_t buf_len, - struct cifs_server_iface **iface_list, - size_t *iface_count) + struct cifs_ses *ses) { struct network_interface_info_ioctl_rsp *p; struct sockaddr_in *addr4; struct sockaddr_in6 *addr6; struct iface_info_ipv4 *p4; struct iface_info_ipv6 *p6; - struct cifs_server_iface *info; + struct cifs_server_iface *info = NULL, *iface = NULL, *niface = NULL; + struct cifs_server_iface tmp_iface; ssize_t bytes_left; size_t next = 0; int nb_iface = 0; - int rc = 0; - - *iface_list = NULL; - *iface_count = 0; - - /* - * Fist pass: count and sanity check - */ + int rc = 0, ret = 0; bytes_left = buf_len; p = buf; - while (bytes_left >= sizeof(*p)) { - nb_iface++; - next = le32_to_cpu(p->Next); - if (!next) { - bytes_left -= sizeof(*p); - break; - } - p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next); - bytes_left -= next; - } - - if (!nb_iface) { - cifs_dbg(VFS, "%s: malformed interface info\n", __func__); - rc = -EINVAL; - goto out; - } - - /* Azure rounds the buffer size up 8, to a 16 byte boundary */ - if ((bytes_left > 8) || p->Next) - cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); - + spin_lock(&ses->iface_lock); /* - * Second pass: extract info to internal structure + * Go through iface_list and do kref_put to remove + * any unused ifaces. ifaces in use will be removed + * when the last user calls a kref_put on it */ - - *iface_list = kcalloc(nb_iface, sizeof(**iface_list), GFP_KERNEL); - if (!*iface_list) { - rc = -ENOMEM; - goto out; + list_for_each_entry_safe(iface, niface, &ses->iface_list, + iface_head) { + iface->is_active = 0; + kref_put(&iface->refcount, release_iface); } + spin_unlock(&ses->iface_lock); - info = *iface_list; - bytes_left = buf_len; - p = buf; while (bytes_left >= sizeof(*p)) { - info->speed = le64_to_cpu(p->LinkSpeed); - info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0; - info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 
1 : 0; - - cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, *iface_count); - cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed); - cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__, - le32_to_cpu(p->Capability)); + memset(&tmp_iface, 0, sizeof(tmp_iface)); + tmp_iface.speed = le64_to_cpu(p->LinkSpeed); + tmp_iface.rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0; + tmp_iface.rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0; switch (p->Family) { /* @@ -587,7 +555,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, * conversion explicit in case either one changes. */ case INTERNETWORK: - addr4 = (struct sockaddr_in *)&info->sockaddr; + addr4 = (struct sockaddr_in *)&tmp_iface.sockaddr; p4 = (struct iface_info_ipv4 *)p->Buffer; addr4->sin_family = AF_INET; memcpy(&addr4->sin_addr, &p4->IPv4Address, 4); @@ -599,7 +567,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, &addr4->sin_addr); break; case INTERNETWORKV6: - addr6 = (struct sockaddr_in6 *)&info->sockaddr; + addr6 = (struct sockaddr_in6 *)&tmp_iface.sockaddr; p6 = (struct iface_info_ipv6 *)p->Buffer; addr6->sin6_family = AF_INET6; memcpy(&addr6->sin6_addr, &p6->IPv6Address, 16); @@ -619,46 +587,96 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, goto next_iface; } - (*iface_count)++; - info++; + /* + * The iface_list is assumed to be sorted by speed. + * Check if the new interface exists in that list. + * NEVER change iface. it could be in use. + * Add a new one instead + */ + spin_lock(&ses->iface_lock); + iface = niface = NULL; + list_for_each_entry_safe(iface, niface, &ses->iface_list, + iface_head) { + ret = iface_cmp(iface, &tmp_iface); + if (!ret) { + /* just get a ref so that it doesn't get picked/freed */ + iface->is_active = 1; + kref_get(&iface->refcount); + spin_unlock(&ses->iface_lock); + goto next_iface; + } else if (ret < 0) { + /* all remaining ifaces are slower */ + kref_get(&iface->refcount); + break; + } + } + spin_unlock(&ses->iface_lock); + + /* no match. 
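parse_server_interfaces() keeps ses->iface_list sorted fastest-first and never edits a matching node in place; a genuinely new interface is linked in ahead of the first slower entry. The list discipline alone, as a small userspace sketch (singly linked for brevity; the kernel code uses list_head plus krefs):

#include <stdio.h>
#include <stdlib.h>

struct iface {
	long speed;
	struct iface *next;
};

/* insert keeping the list sorted fastest-first; existing nodes are
 * never modified, a fresh node goes in front of the first slower one */
static void insert_sorted(struct iface **head, long speed)
{
	struct iface *n = malloc(sizeof(*n));
	struct iface **pos = head;

	if (!n)
		return;
	n->speed = speed;
	while (*pos && (*pos)->speed >= speed)
		pos = &(*pos)->next;
	n->next = *pos;
	*pos = n;
}

int main(void)
{
	struct iface *head = NULL;

	insert_sorted(&head, 1000);
	insert_sorted(&head, 10000);
	for (struct iface *i = head; i; i = i->next)
		printf("%ld\n", i->speed);	/* 10000, then 1000 */
	return 0;
}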
insert the entry in the list */ + info = kmalloc(sizeof(struct cifs_server_iface), + GFP_KERNEL); + if (!info) { + rc = -ENOMEM; + goto out; + } + memcpy(info, &tmp_iface, sizeof(tmp_iface)); + + /* add this new entry to the list */ + kref_init(&info->refcount); + info->is_active = 1; + + cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, ses->iface_count); + cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed); + cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__, + le32_to_cpu(p->Capability)); + + spin_lock(&ses->iface_lock); + if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) { + list_add_tail(&info->iface_head, &iface->iface_head); + kref_put(&iface->refcount, release_iface); + } else + list_add_tail(&info->iface_head, &ses->iface_list); + spin_unlock(&ses->iface_lock); + + ses->iface_count++; + ses->iface_last_update = jiffies; next_iface: + nb_iface++; next = le32_to_cpu(p->Next); - if (!next) + if (!next) { + bytes_left -= sizeof(*p); break; + } p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next); bytes_left -= next; } - if (!*iface_count) { + if (!nb_iface) { + cifs_dbg(VFS, "%s: malformed interface info\n", __func__); rc = -EINVAL; goto out; } -out: - if (rc) { - kfree(*iface_list); - *iface_count = 0; - *iface_list = NULL; - } - return rc; -} + /* Azure rounds the buffer size up 8, to a 16 byte boundary */ + if ((bytes_left > 8) || p->Next) + cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); -static int compare_iface(const void *ia, const void *ib) -{ - const struct cifs_server_iface *a = (struct cifs_server_iface *)ia; - const struct cifs_server_iface *b = (struct cifs_server_iface *)ib; - return a->speed == b->speed ? 0 : (a->speed > b->speed ? -1 : 1); + if (!ses->iface_count) { + rc = -EINVAL; + goto out; + } + +out: + return rc; } -static int +int SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) { int rc; unsigned int ret_data_len = 0; struct network_interface_info_ioctl_rsp *out_buf = NULL; - struct cifs_server_iface *iface_list; - size_t iface_count; struct cifs_ses *ses = tcon->ses; rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, @@ -674,21 +692,10 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) goto out; } - rc = parse_server_interfaces(out_buf, ret_data_len, - &iface_list, &iface_count); + rc = parse_server_interfaces(out_buf, ret_data_len, ses); if (rc) goto out; - /* sort interfaces from fastest to slowest */ - sort(iface_list, iface_count, sizeof(*iface_list), compare_iface, NULL); - - spin_lock(&ses->iface_lock); - kfree(ses->iface_list); - ses->iface_list = iface_list; - ses->iface_count = iface_count; - ses->iface_last_update = jiffies; - spin_unlock(&ses->iface_lock); - out: kfree(out_buf); return rc; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index b515140bad8d..c705de32e225 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -543,6 +543,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, struct TCP_Server_Info *server, unsigned int *total_len) { char *pneg_ctxt; + char *hostname = NULL; unsigned int ctxt_len, neg_context_count; if (*total_len > 200) { @@ -570,16 +571,25 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, *total_len += ctxt_len; pneg_ctxt += ctxt_len; - ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt, - server->hostname); - *total_len += ctxt_len; - pneg_ctxt += ctxt_len; + /* + * secondary channels don't have the hostname field populated + * use the hostname field in the primary channel instead + */ + 
hostname = CIFS_SERVER_IS_CHAN(server) ? + server->primary_server->hostname : server->hostname; + if (hostname && (hostname[0] != 0)) { + ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt, + hostname); + *total_len += ctxt_len; + pneg_ctxt += ctxt_len; + neg_context_count = 3; + } else + neg_context_count = 2; build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt); *total_len += sizeof(struct smb2_posix_neg_context); pneg_ctxt += sizeof(struct smb2_posix_neg_context); - - neg_context_count = 4; + neg_context_count++; if (server->compress_algorithm) { build_compression_ctxt((struct smb2_compression_capabilities_context *) diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index ee92634196a8..1105ce3c80cb 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -9,6 +9,15 @@ menuconfig DLM A general purpose distributed lock manager for kernel or userspace applications. +config DLM_DEPRECATED_API + bool "DLM deprecated API" + depends on DLM + help + Enables deprecated DLM timeout features that will be removed in + later Linux kernel releases. + + If you are unsure, say N. + config DLM_DEBUG bool "DLM debugging" depends on DLM diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile index 3545fdafc6fb..71dab733cf9a 100644 --- a/fs/dlm/Makefile +++ b/fs/dlm/Makefile @@ -9,7 +9,6 @@ dlm-y := ast.o \ member.o \ memory.o \ midcomms.o \ - netlink.o \ lowcomms.o \ plock.o \ rcom.o \ @@ -18,5 +17,6 @@ dlm-y := ast.o \ requestqueue.o \ user.o \ util.o +dlm-$(CONFIG_DLM_DEPRECATED_API) += netlink.o dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index bfac462dd3e8..19ef136f9e4f 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -255,13 +255,13 @@ void dlm_callback_work(struct work_struct *work) if (callbacks[i].flags & DLM_CB_SKIP) { continue; } else if (callbacks[i].flags & DLM_CB_BAST) { - bastfn(lkb->lkb_astparam, callbacks[i].mode); trace_dlm_bast(ls, lkb, callbacks[i].mode); + bastfn(lkb->lkb_astparam, callbacks[i].mode); } else if (callbacks[i].flags & DLM_CB_CAST) { lkb->lkb_lksb->sb_status = callbacks[i].sb_status; lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; + trace_dlm_ast(ls, lkb); castfn(lkb->lkb_astparam); - trace_dlm_ast(ls, lkb, lkb->lkb_lksb); } } diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 42eee2783756..ac8b62106ce0 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -75,8 +75,9 @@ struct dlm_cluster { unsigned int cl_log_info; unsigned int cl_protocol; unsigned int cl_mark; +#ifdef CONFIG_DLM_DEPRECATED_API unsigned int cl_timewarn_cs; - unsigned int cl_waitwarn_us; +#endif unsigned int cl_new_rsb_count; unsigned int cl_recover_callbacks; char cl_cluster_name[DLM_LOCKSPACE_LEN]; @@ -102,8 +103,9 @@ enum { CLUSTER_ATTR_LOG_INFO, CLUSTER_ATTR_PROTOCOL, CLUSTER_ATTR_MARK, +#ifdef CONFIG_DLM_DEPRECATED_API CLUSTER_ATTR_TIMEWARN_CS, - CLUSTER_ATTR_WAITWARN_US, +#endif CLUSTER_ATTR_NEW_RSB_COUNT, CLUSTER_ATTR_RECOVER_CALLBACKS, CLUSTER_ATTR_CLUSTER_NAME, @@ -224,8 +226,9 @@ CLUSTER_ATTR(log_debug, NULL); CLUSTER_ATTR(log_info, NULL); CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running); CLUSTER_ATTR(mark, NULL); +#ifdef CONFIG_DLM_DEPRECATED_API CLUSTER_ATTR(timewarn_cs, dlm_check_zero); -CLUSTER_ATTR(waitwarn_us, NULL); +#endif CLUSTER_ATTR(new_rsb_count, NULL); CLUSTER_ATTR(recover_callbacks, NULL); @@ -240,8 +243,9 @@ static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info, [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol, [CLUSTER_ATTR_MARK] = &cluster_attr_mark, +#ifdef 
CONFIG_DLM_DEPRECATED_API [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs, - [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us, +#endif [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count, [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks, [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name, @@ -432,8 +436,9 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_log_debug = dlm_config.ci_log_debug; cl->cl_log_info = dlm_config.ci_log_info; cl->cl_protocol = dlm_config.ci_protocol; +#ifdef CONFIG_DLM_DEPRECATED_API cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; - cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; +#endif cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count; cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks; memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name, @@ -954,8 +959,9 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_LOG_INFO 1 #define DEFAULT_PROTOCOL DLM_PROTO_TCP #define DEFAULT_MARK 0 +#ifdef CONFIG_DLM_DEPRECATED_API #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ -#define DEFAULT_WAITWARN_US 0 +#endif #define DEFAULT_NEW_RSB_COUNT 128 #define DEFAULT_RECOVER_CALLBACKS 0 #define DEFAULT_CLUSTER_NAME "" @@ -971,8 +977,9 @@ struct dlm_config_info dlm_config = { .ci_log_info = DEFAULT_LOG_INFO, .ci_protocol = DEFAULT_PROTOCOL, .ci_mark = DEFAULT_MARK, +#ifdef CONFIG_DLM_DEPRECATED_API .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, - .ci_waitwarn_us = DEFAULT_WAITWARN_US, +#endif .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT, .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS, .ci_cluster_name = DEFAULT_CLUSTER_NAME diff --git a/fs/dlm/config.h b/fs/dlm/config.h index df92b0a07fc6..55c5f2c13ebd 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -37,8 +37,9 @@ struct dlm_config_info { int ci_log_info; int ci_protocol; int ci_mark; +#ifdef CONFIG_DLM_DEPRECATED_API int ci_timewarn_cs; - int ci_waitwarn_us; +#endif int ci_new_rsb_count; int ci_recover_callbacks; char ci_cluster_name[DLM_LOCKSPACE_LEN]; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 776c3ed519f0..8aca8085d24e 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -145,7 +145,9 @@ struct dlm_args { void (*bastfn) (void *astparam, int mode); int mode; struct dlm_lksb *lksb; +#ifdef CONFIG_DLM_DEPRECATED_API unsigned long timeout; +#endif }; @@ -203,10 +205,20 @@ struct dlm_args { #define DLM_IFL_OVERLAP_UNLOCK 0x00080000 #define DLM_IFL_OVERLAP_CANCEL 0x00100000 #define DLM_IFL_ENDOFLIFE 0x00200000 +#ifdef CONFIG_DLM_DEPRECATED_API #define DLM_IFL_WATCH_TIMEWARN 0x00400000 #define DLM_IFL_TIMEOUT_CANCEL 0x00800000 +#endif #define DLM_IFL_DEADLOCK_CANCEL 0x01000000 #define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ +/* only the least significant 2 bytes of the flags are used in messages; the + * full value is transmitted, but the receive side only sets the 2 LSB bytes. + * + * Even the wireshark dlm dissector evaluates only the lower bytes; since the + * higher bytes are not parsed on the receiver side, we assume they are for + * internal use or reserved.
+ */ #define DLM_IFL_USER 0x00000001 #define DLM_IFL_ORPHAN 0x00000002 @@ -249,10 +261,12 @@ struct dlm_lkb { struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */ struct list_head lkb_wait_reply; /* waiting for remote reply */ struct list_head lkb_ownqueue; /* list of locks for a process */ - struct list_head lkb_time_list; ktime_t lkb_timestamp; - ktime_t lkb_wait_time; + +#ifdef CONFIG_DLM_DEPRECATED_API + struct list_head lkb_time_list; unsigned long lkb_timeout_cs; +#endif struct mutex lkb_cb_mutex; struct work_struct lkb_cb_work; @@ -568,8 +582,10 @@ struct dlm_ls { struct mutex ls_orphans_mutex; struct list_head ls_orphans; +#ifdef CONFIG_DLM_DEPRECATED_API struct mutex ls_timeout_mutex; struct list_head ls_timeout; +#endif spinlock_t ls_new_rsb_spin; int ls_new_rsb_count; @@ -606,8 +622,8 @@ struct dlm_ls { wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ int ls_uevent_result; - struct completion ls_members_done; - int ls_members_result; + struct completion ls_recovery_done; + int ls_recovery_result; struct miscdevice ls_device; @@ -688,7 +704,9 @@ struct dlm_ls { #define LSFL_RCOM_READY 5 #define LSFL_RCOM_WAIT 6 #define LSFL_UEVENT_WAIT 7 +#ifdef CONFIG_DLM_DEPRECATED_API #define LSFL_TIMEWARN 8 +#endif #define LSFL_CB_DELAY 9 #define LSFL_NODIR 10 @@ -741,9 +759,15 @@ static inline int dlm_no_directory(struct dlm_ls *ls) return test_bit(LSFL_NODIR, &ls->ls_flags); } +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_netlink_init(void); void dlm_netlink_exit(void); void dlm_timeout_warn(struct dlm_lkb *lkb); +#else +static inline int dlm_netlink_init(void) { return 0; } +static inline void dlm_netlink_exit(void) { }; +static inline void dlm_timeout_warn(struct dlm_lkb *lkb) { }; +#endif int dlm_plock_init(void); void dlm_plock_exit(void); diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 226822f49d30..dac7eb75dba9 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -296,12 +296,14 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb);); +#ifdef CONFIG_DLM_DEPRECATED_API /* if the operation was a cancel, then return -DLM_ECANCEL, if a timeout caused the cancel then return -ETIMEDOUT */ if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) { lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL; rv = -ETIMEDOUT; } +#endif if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) { lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL; @@ -1210,7 +1212,9 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, kref_init(&lkb->lkb_ref); INIT_LIST_HEAD(&lkb->lkb_ownqueue); INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); +#ifdef CONFIG_DLM_DEPRECATED_API INIT_LIST_HEAD(&lkb->lkb_time_list); +#endif INIT_LIST_HEAD(&lkb->lkb_cb_list); mutex_init(&lkb->lkb_cb_mutex); INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); @@ -1306,6 +1310,13 @@ static inline void hold_lkb(struct dlm_lkb *lkb) kref_get(&lkb->lkb_ref); } +static void unhold_lkb_assert(struct kref *kref) +{ + struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref); + + DLM_ASSERT(false, dlm_print_lkb(lkb);); +} + /* This is called when we need to remove a reference and are certain it's not the last ref. e.g. del_lkb is always called between a find_lkb/put_lkb and is always the inverse of a previous add_lkb. 
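unhold_lkb_assert() above inverts the old check: instead of testing kref_put()'s return value, the release callback itself becomes the assertion, since the callback running at all means the "not the last reference" contract was already broken. A tiny stand-alone analogue (hand-rolled refcount, not the kernel kref API):

#include <assert.h>
#include <stdio.h>

struct obj {
	int refs;
};

static void must_not_release(struct obj *o)
{
	/* reaching here means a caller dropped the last reference on a
	 * path where the API guarantees more references exist */
	assert(0 && "unexpected release of last reference");
}

static void put_obj(struct obj *o, void (*release)(struct obj *))
{
	if (--o->refs == 0)
		release(o);
}

int main(void)
{
	struct obj o = { .refs = 2 };

	put_obj(&o, must_not_release);	/* fine: one reference remains */
	printf("refs left: %d\n", o.refs);
	return 0;
}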
@@ -1313,9 +1324,7 @@ static inline void hold_lkb(struct dlm_lkb *lkb) static inline void unhold_lkb(struct dlm_lkb *lkb) { - int rv; - rv = kref_put(&lkb->lkb_ref, kill_lkb); - DLM_ASSERT(!rv, dlm_print_lkb(lkb);); + kref_put(&lkb->lkb_ref, unhold_lkb_assert); } static void lkb_add_ordered(struct list_head *new, struct list_head *head, @@ -1402,75 +1411,6 @@ static int msg_reply_type(int mstype) return -1; } -static int nodeid_warned(int nodeid, int num_nodes, int *warned) -{ - int i; - - for (i = 0; i < num_nodes; i++) { - if (!warned[i]) { - warned[i] = nodeid; - return 0; - } - if (warned[i] == nodeid) - return 1; - } - return 0; -} - -void dlm_scan_waiters(struct dlm_ls *ls) -{ - struct dlm_lkb *lkb; - s64 us; - s64 debug_maxus = 0; - u32 debug_scanned = 0; - u32 debug_expired = 0; - int num_nodes = 0; - int *warned = NULL; - - if (!dlm_config.ci_waitwarn_us) - return; - - mutex_lock(&ls->ls_waiters_mutex); - - list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { - if (!lkb->lkb_wait_time) - continue; - - debug_scanned++; - - us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time)); - - if (us < dlm_config.ci_waitwarn_us) - continue; - - lkb->lkb_wait_time = 0; - - debug_expired++; - if (us > debug_maxus) - debug_maxus = us; - - if (!num_nodes) { - num_nodes = ls->ls_num_nodes; - warned = kcalloc(num_nodes, sizeof(int), GFP_KERNEL); - } - if (!warned) - continue; - if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned)) - continue; - - log_error(ls, "waitwarn %x %lld %d us check connection to " - "node %d", lkb->lkb_id, (long long)us, - dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid); - } - mutex_unlock(&ls->ls_waiters_mutex); - kfree(warned); - - if (debug_expired) - log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us", - debug_scanned, debug_expired, - dlm_config.ci_waitwarn_us, (long long)debug_maxus); -} - /* add/remove lkb from global waiters list of lkb's waiting for a reply from a remote node */ @@ -1514,7 +1454,6 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) lkb->lkb_wait_count++; lkb->lkb_wait_type = mstype; - lkb->lkb_wait_time = ktime_get(); lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */ hold_lkb(lkb); list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); @@ -1842,6 +1781,7 @@ void dlm_scan_rsbs(struct dlm_ls *ls) } } +#ifdef CONFIG_DLM_DEPRECATED_API static void add_timeout(struct dlm_lkb *lkb) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; @@ -1962,17 +1902,11 @@ void dlm_adjust_timeouts(struct dlm_ls *ls) list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); mutex_unlock(&ls->ls_timeout_mutex); - - if (!dlm_config.ci_waitwarn_us) - return; - - mutex_lock(&ls->ls_waiters_mutex); - list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { - if (ktime_to_us(lkb->lkb_wait_time)) - lkb->lkb_wait_time = ktime_get(); - } - mutex_unlock(&ls->ls_waiters_mutex); } +#else +static void add_timeout(struct dlm_lkb *lkb) { } +static void del_timeout(struct dlm_lkb *lkb) { } +#endif /* lkb is master or local copy */ @@ -2837,12 +2771,20 @@ static void confirm_master(struct dlm_rsb *r, int error) } } +#ifdef CONFIG_DLM_DEPRECATED_API static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, int namelen, unsigned long timeout_cs, void (*ast) (void *astparam), void *astparam, void (*bast) (void *astparam, int mode), struct dlm_args *args) +#else +static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, + int namelen, void 
(*ast)(void *astparam), + void *astparam, + void (*bast)(void *astparam, int mode), + struct dlm_args *args) +#endif { int rv = -EINVAL; @@ -2895,7 +2837,9 @@ static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, args->astfn = ast; args->astparam = astparam; args->bastfn = bast; +#ifdef CONFIG_DLM_DEPRECATED_API args->timeout = timeout_cs; +#endif args->mode = mode; args->lksb = lksb; rv = 0; @@ -2951,7 +2895,9 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb->lkb_lksb = args->lksb; lkb->lkb_lvbptr = args->lksb->sb_lvbptr; lkb->lkb_ownpid = (int) current->pid; +#ifdef CONFIG_DLM_DEPRECATED_API lkb->lkb_timeout_cs = args->timeout; +#endif rv = 0; out: if (rv) @@ -3472,10 +3418,15 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error) goto out; - trace_dlm_lock_start(ls, lkb, mode, flags); + trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags); +#ifdef CONFIG_DLM_DEPRECATED_API error = set_lock_args(mode, lksb, flags, namelen, 0, ast, astarg, bast, &args); +#else + error = set_lock_args(mode, lksb, flags, namelen, ast, astarg, bast, + &args); +#endif if (error) goto out_put; @@ -3487,7 +3438,7 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error == -EINPROGRESS) error = 0; out_put: - trace_dlm_lock_end(ls, lkb, mode, flags, error); + trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error); if (convert || error) __put_lkb(ls, lkb); @@ -5839,9 +5790,14 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) return 0; } +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, uint32_t flags, void *name, unsigned int namelen, unsigned long timeout_cs) +#else +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, + int mode, uint32_t flags, void *name, unsigned int namelen) +#endif { struct dlm_lkb *lkb; struct dlm_args args; @@ -5864,8 +5820,13 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, goto out; } } +#ifdef CONFIG_DLM_DEPRECATED_API error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, fake_astfn, ua, fake_bastfn, &args); +#else + error = set_lock_args(mode, &ua->lksb, flags, namelen, fake_astfn, ua, + fake_bastfn, &args); +#endif if (error) { kfree(ua->lksb.sb_lvbptr); ua->lksb.sb_lvbptr = NULL; @@ -5904,9 +5865,14 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, return error; } +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, uint32_t lkid, char *lvb_in, unsigned long timeout_cs) +#else +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in) +#endif { struct dlm_lkb *lkb; struct dlm_args args; @@ -5941,8 +5907,13 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, ua->bastaddr = ua_tmp->bastaddr; ua->user_lksb = ua_tmp->user_lksb; +#ifdef CONFIG_DLM_DEPRECATED_API error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs, fake_astfn, ua, fake_bastfn, &args); +#else + error = set_lock_args(mode, &ua->lksb, flags, 0, fake_astfn, ua, + fake_bastfn, &args); +#endif if (error) goto out_put; @@ -5966,7 +5937,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, void *name, unsigned int namelen, - unsigned long timeout_cs, uint32_t *lkid) + uint32_t *lkid) { struct dlm_lkb *lkb = NULL, *iter; struct 
dlm_user_args *ua; diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 252a5898f908..a7b6474f009d 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -24,9 +24,15 @@ int dlm_put_lkb(struct dlm_lkb *lkb); void dlm_scan_rsbs(struct dlm_ls *ls); int dlm_lock_recovery_try(struct dlm_ls *ls); void dlm_unlock_recovery(struct dlm_ls *ls); -void dlm_scan_waiters(struct dlm_ls *ls); + +#ifdef CONFIG_DLM_DEPRECATED_API void dlm_scan_timeout(struct dlm_ls *ls); void dlm_adjust_timeouts(struct dlm_ls *ls); +#else +static inline void dlm_scan_timeout(struct dlm_ls *ls) { } +static inline void dlm_adjust_timeouts(struct dlm_ls *ls) { } +#endif + int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len, unsigned int flags, int *r_nodeid, int *result); @@ -41,15 +47,22 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls); int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc); int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc); +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, uint32_t flags, void *name, unsigned int namelen, unsigned long timeout_cs); int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, uint32_t lkid, char *lvb_in, unsigned long timeout_cs); +#else +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, + uint32_t flags, void *name, unsigned int namelen); +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in); +#endif int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, void *name, unsigned int namelen, - unsigned long timeout_cs, uint32_t *lkid); + uint32_t *lkid); int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, uint32_t flags, uint32_t lkid, char *lvb_in); int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 19ed41a5da93..3972f4d86c75 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -275,7 +275,6 @@ static int dlm_scand(void *data) ls->ls_scan_time = jiffies; dlm_scan_rsbs(ls); dlm_scan_timeout(ls); - dlm_scan_waiters(ls); dlm_unlock_recovery(ls); } else { ls->ls_scan_time += HZ; @@ -490,13 +489,28 @@ static int new_lockspace(const char *name, const char *cluster, ls->ls_ops_arg = ops_arg; } - if (flags & DLM_LSFL_TIMEWARN) +#ifdef CONFIG_DLM_DEPRECATED_API + if (flags & DLM_LSFL_TIMEWARN) { + pr_warn_once("===============================================================\n" + "WARNING: the dlm DLM_LSFL_TIMEWARN flag is being deprecated and\n" + " will be removed in v6.2!\n" + " Inclusive DLM_LSFL_TIMEWARN define in UAPI header!\n" + "===============================================================\n"); + set_bit(LSFL_TIMEWARN, &ls->ls_flags); + } /* ls_exflags are forced to match among nodes, and we don't - need to require all nodes to have some flags set */ + * need to require all nodes to have some flags set + */ ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | DLM_LSFL_NEWEXCL)); +#else + /* ls_exflags are forced to match among nodes, and we don't + * need to require all nodes to have some flags set + */ + ls->ls_exflags = (flags & ~(DLM_LSFL_FS | DLM_LSFL_NEWEXCL)); +#endif size = READ_ONCE(dlm_config.ci_rsbtbl_size); ls->ls_rsbtbl_size = size; @@ -527,8 +541,10 @@ static int new_lockspace(const char *name, const char *cluster, mutex_init(&ls->ls_waiters_mutex); 
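The lock.h hunk above shows the compile-out pattern this DLM series uses throughout: with CONFIG_DLM_DEPRECATED_API disabled, empty static inline stubs stand in for the real declarations, so call sites compile unchanged and carry no #ifdef of their own. The same shape reduced to a compilable sketch (CONFIG_FOO is a made-up switch):

#include <stdio.h>

/* #define CONFIG_FOO 1 */	/* flip on to use the real implementation */

#ifdef CONFIG_FOO
void scan_timeout(void);	/* real version, defined elsewhere */
#else
static inline void scan_timeout(void) { }	/* compiled-out stub */
#endif

int main(void)
{
	scan_timeout();		/* call sites never need an #ifdef */
	printf("done\n");
	return 0;
}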
INIT_LIST_HEAD(&ls->ls_orphans); mutex_init(&ls->ls_orphans_mutex); +#ifdef CONFIG_DLM_DEPRECATED_API INIT_LIST_HEAD(&ls->ls_timeout); mutex_init(&ls->ls_timeout_mutex); +#endif INIT_LIST_HEAD(&ls->ls_new_rsb); spin_lock_init(&ls->ls_new_rsb_spin); @@ -548,8 +564,8 @@ static int new_lockspace(const char *name, const char *cluster, init_waitqueue_head(&ls->ls_uevent_wait); ls->ls_uevent_result = 0; - init_completion(&ls->ls_members_done); - ls->ls_members_result = -1; + init_completion(&ls->ls_recovery_done); + ls->ls_recovery_result = -1; mutex_init(&ls->ls_cb_mutex); INIT_LIST_HEAD(&ls->ls_cb_delay); @@ -645,8 +661,9 @@ static int new_lockspace(const char *name, const char *cluster, if (error) goto out_recoverd; - wait_for_completion(&ls->ls_members_done); - error = ls->ls_members_result; + /* wait until recovery is successful or failed */ + wait_for_completion(&ls->ls_recovery_done); + error = ls->ls_recovery_result; if (error) goto out_members; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 19e82f08c0e0..a4e84e8d94c8 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -529,7 +529,7 @@ static void lowcomms_write_space(struct sock *sk) return; if (!test_and_set_bit(CF_CONNECTED, &con->flags)) { - log_print("successful connected to node %d", con->nodeid); + log_print("connected to node %d", con->nodeid); queue_work(send_workqueue, &con->swork); return; } @@ -1931,7 +1931,7 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock, return ret; if (!test_and_set_bit(CF_CONNECTED, &con->flags)) - log_print("successful connected to node %d", con->nodeid); + log_print("connected to node %d", con->nodeid); return 0; } diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 98084e0cfccf..2af2ccfe43a9 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -534,7 +534,11 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) int i, error, neg = 0, low = -1; /* previously removed members that we've not finished removing need to - count as a negative change so the "neg" recovery steps will happen */ + * count as a negative change so the "neg" recovery steps will happen + * + * This functionality must report all member changes to lsops or + * midcomms layer and must never return before doing so. + */ list_for_each_entry(memb, &ls->ls_nodes_gone, list) { log_rinfo(ls, "prev removed member %d", memb->nodeid); @@ -583,19 +587,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) *neg_out = neg; error = ping_members(ls); - /* error -EINTR means that a new recovery action is triggered. - * We ignore this recovery action and let run the new one which might - * have new member configuration. - */ - if (error == -EINTR) - error = 0; - - /* new_lockspace() may be waiting to know if the config - * is good or bad - */ - ls->ls_members_result = error; - complete(&ls->ls_members_done); - log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); return error; } @@ -675,7 +666,16 @@ int dlm_ls_stop(struct dlm_ls *ls) if (!ls->ls_recover_begin) ls->ls_recover_begin = jiffies; - dlm_lsop_recover_prep(ls); + /* call recover_prep ops only once and not multiple times + * for each possible dlm_ls_stop() when recovery is already + * stopped. + * + * If we were able to clear the LSFL_RUNNING bit and it was + * previously set, we know this is the first dlm_ls_stop() call.
+ */ + if (new) + dlm_lsop_recover_prep(ls); + return 0; } diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 0993eebf2060..737f185aad8d 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -29,6 +29,8 @@ struct plock_async_data { struct plock_op { struct list_head list; int done; + /* if lock op got interrupted while waiting dlm_controld reply */ + bool sigint; struct dlm_plock_info info; /* if set indicates async handling */ struct plock_async_data *data; @@ -79,8 +81,7 @@ static void send_op(struct plock_op *op) abandoned waiter. So, we have to insert the unlock-close when the lock call is interrupted. */ -static void do_unlock_close(struct dlm_ls *ls, u64 number, - struct file *file, struct file_lock *fl) +static void do_unlock_close(const struct dlm_plock_info *info) { struct plock_op *op; @@ -89,15 +90,12 @@ static void do_unlock_close(struct dlm_ls *ls, u64 number, return; op->info.optype = DLM_PLOCK_OP_UNLOCK; - op->info.pid = fl->fl_pid; - op->info.fsid = ls->ls_global_id; - op->info.number = number; + op->info.pid = info->pid; + op->info.fsid = info->fsid; + op->info.number = info->number; op->info.start = 0; op->info.end = OFFSET_MAX; - if (fl->fl_lmops && fl->fl_lmops->lm_grant) - op->info.owner = (__u64) fl->fl_pid; - else - op->info.owner = (__u64)(long) fl->fl_owner; + op->info.owner = info->owner; op->info.flags |= DLM_PLOCK_FL_CLOSE; send_op(op); @@ -161,16 +159,24 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, rv = wait_event_interruptible(recv_wq, (op->done != 0)); if (rv == -ERESTARTSYS) { spin_lock(&ops_lock); - list_del(&op->list); + /* recheck under ops_lock if we got a done != 0, + * if so this interrupt case should be ignored + */ + if (op->done != 0) { + spin_unlock(&ops_lock); + goto do_lock_wait; + } + + op->sigint = true; spin_unlock(&ops_lock); - log_print("%s: wait interrupted %x %llx, op removed", + log_debug(ls, "%s: wait interrupted %x %llx pid %d", __func__, ls->ls_global_id, - (unsigned long long)number); - dlm_release_plock_op(op); - do_unlock_close(ls, number, file, fl); + (unsigned long long)number, op->info.pid); goto out; } +do_lock_wait: + WARN_ON(!list_empty(&op->list)); rv = op->info.rv; @@ -378,7 +384,7 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count, spin_lock(&ops_lock); if (!list_empty(&send_list)) { - op = list_entry(send_list.next, struct plock_op, list); + op = list_first_entry(&send_list, struct plock_op, list); if (op->info.flags & DLM_PLOCK_FL_CLOSE) list_del(&op->list); else @@ -425,6 +431,19 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, if (iter->info.fsid == info.fsid && iter->info.number == info.number && iter->info.owner == info.owner) { + if (iter->sigint) { + list_del(&iter->list); + spin_unlock(&ops_lock); + + pr_debug("%s: sigint cleanup %x %llx pid %d", + __func__, iter->info.fsid, + (unsigned long long)iter->info.number, + iter->info.pid); + do_unlock_close(&iter->info); + memcpy(&iter->info, &info, sizeof(info)); + dlm_release_plock_op(iter); + return count; + } list_del_init(&iter->list); memcpy(&iter->info, &info, sizeof(info)); if (iter->data) @@ -443,7 +462,7 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, else wake_up(&recv_wq); } else - log_print("%s: no op %x %llx - may got interrupted?", __func__, + log_print("%s: no op %x %llx", __func__, info.fsid, (unsigned long long)info.number); return count; } diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index a55dfce705dd..e15eb511b04b 
100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -70,6 +70,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) /* * Add or remove nodes from the lockspace's ls_nodes list. + * + * Because we must report all membership changes to the lsops or + * midcomms layer, ls_recover() must not be aborted until + * this is done. */ error = dlm_recover_members(ls, rv, &neg); @@ -239,14 +243,12 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) jiffies_to_msecs(jiffies - start)); mutex_unlock(&ls->ls_recoverd_active); - dlm_lsop_recover_done(ls); return 0; fail: dlm_release_root_list(ls); - log_rinfo(ls, "dlm_recover %llu error %d", - (unsigned long long)rv->seq, error); mutex_unlock(&ls->ls_recoverd_active); + return error; } @@ -257,6 +259,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) static void do_ls_recovery(struct dlm_ls *ls) { struct dlm_recover *rv = NULL; + int error; spin_lock(&ls->ls_recover_lock); rv = ls->ls_recover_args; @@ -266,7 +269,31 @@ static void do_ls_recovery(struct dlm_ls *ls) spin_unlock(&ls->ls_recover_lock); if (rv) { - ls_recover(ls, rv); + error = ls_recover(ls, rv); + switch (error) { + case 0: + ls->ls_recovery_result = 0; + complete(&ls->ls_recovery_done); + + dlm_lsop_recover_done(ls); + break; + case -EINTR: + /* if recovery was interrupted with -EINTR, wait for the next + * ls_recover() iteration, which will hopefully succeed. + */ + log_rinfo(ls, "%s %llu interrupted and should be queued to run again", + __func__, (unsigned long long)rv->seq); + break; + default: + log_rinfo(ls, "%s %llu error %d", __func__, + (unsigned long long)rv->seq, error); + + /* make new_lockspace() aware of the critical error */ + ls->ls_recovery_result = error; + complete(&ls->ls_recovery_done); + break; + } + kfree(rv->nodes); kfree(rv); } diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 1060b24f18d4..99e8f0744513 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -250,6 +250,14 @@ static int device_user_lock(struct dlm_user_proc *proc, goto out; } +#ifdef CONFIG_DLM_DEPRECATED_API + if (params->timeout) + pr_warn_once("========================================================\n" + "WARNING: the lkb timeout feature is being deprecated and\n" + " will be removed in v6.2!\n" + "========================================================\n"); +#endif + ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); if (!ua) goto out; @@ -262,23 +270,34 @@ static int device_user_lock(struct dlm_user_proc *proc, ua->xid = params->xid; if (params->flags & DLM_LKF_CONVERT) { +#ifdef CONFIG_DLM_DEPRECATED_API error = dlm_user_convert(ls, ua, params->mode, params->flags, params->lkid, params->lvb, (unsigned long) params->timeout); +#else + error = dlm_user_convert(ls, ua, + params->mode, params->flags, + params->lkid, params->lvb); +#endif } else if (params->flags & DLM_LKF_ORPHAN) { error = dlm_user_adopt_orphan(ls, ua, params->mode, params->flags, params->name, params->namelen, - (unsigned long) params->timeout, &lkid); if (!error) error = lkid; } else { +#ifdef CONFIG_DLM_DEPRECATED_API error = dlm_user_request(ls, ua, params->mode, params->flags, params->name, params->namelen, (unsigned long) params->timeout); +#else + error = dlm_user_request(ls, ua, + params->mode, params->flags, + params->name, params->namelen); +#endif if (!error) error = ua->lksb.sb_lkid; } diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 19e6c56a9f47..26fa170090b8 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -17,7 +17,7 @@ struct
z_erofs_decompress_req { /* indicate the algorithm will be used for decompression */ unsigned int alg; - bool inplace_io, partial_decoding; + bool inplace_io, partial_decoding, fillgaps; }; struct z_erofs_decompressor { diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fbb037ba326e..fe8ac0e163f7 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -366,42 +366,33 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block) return iomap_bmap(mapping, block, &erofs_iomap_ops); } -static int erofs_prepare_dio(struct kiocb *iocb, struct iov_iter *to) +static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); - loff_t align = iocb->ki_pos | iov_iter_count(to) | - iov_iter_alignment(to); - struct block_device *bdev = inode->i_sb->s_bdev; - unsigned int blksize_mask; - - if (bdev) - blksize_mask = (1 << ilog2(bdev_logical_block_size(bdev))) - 1; - else - blksize_mask = (1 << inode->i_blkbits) - 1; - if (align & blksize_mask) - return -EINVAL; - return 0; -} - -static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ /* no need taking (shared) inode lock since it's a ro filesystem */ if (!iov_iter_count(to)) return 0; #ifdef CONFIG_FS_DAX - if (IS_DAX(iocb->ki_filp->f_mapping->host)) + if (IS_DAX(inode)) return dax_iomap_rw(iocb, to, &erofs_iomap_ops); #endif if (iocb->ki_flags & IOCB_DIRECT) { - int err = erofs_prepare_dio(iocb, to); + struct block_device *bdev = inode->i_sb->s_bdev; + unsigned int blksize_mask; + + if (bdev) + blksize_mask = bdev_logical_block_size(bdev) - 1; + else + blksize_mask = (1 << inode->i_blkbits) - 1; + + if ((iocb->ki_pos | iov_iter_count(to) | + iov_iter_alignment(to)) & blksize_mask) + return -EINVAL; - if (!err) - return iomap_dio_rw(iocb, to, &erofs_iomap_ops, - NULL, 0, NULL, 0); - if (err < 0) - return err; + return iomap_dio_rw(iocb, to, &erofs_iomap_ops, + NULL, 0, NULL, 0); } return filemap_read(iocb, to, 0); } diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 6dca1900c733..2d55569f96ac 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -83,7 +83,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, j = 0; /* 'valid' bounced can only be tested after a complete round */ - if (test_bit(j, bounced)) { + if (!rq->fillgaps && test_bit(j, bounced)) { DBG_BUGON(i < lz4_max_distance_pages); DBG_BUGON(top >= lz4_max_distance_pages); availables[top++] = rq->out[i - lz4_max_distance_pages]; @@ -91,14 +91,18 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, if (page) { __clear_bit(j, bounced); - if (kaddr) { - if (kaddr + PAGE_SIZE == page_address(page)) + if (!PageHighMem(page)) { + if (!i) { + kaddr = page_address(page); + continue; + } + if (kaddr && + kaddr + PAGE_SIZE == page_address(page)) { kaddr += PAGE_SIZE; - else - kaddr = NULL; - } else if (!i) { - kaddr = page_address(page); + continue; + } } + kaddr = NULL; continue; } kaddr = NULL; diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 05a3063cf2bc..5e59b3f523eb 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -143,6 +143,7 @@ again: DBG_BUGON(z_erofs_lzma_head); z_erofs_lzma_head = head; spin_unlock(&z_erofs_lzma_lock); + wake_up_all(&z_erofs_lzma_wq); z_erofs_lzma_max_dictsize = dict_size; mutex_unlock(&lzma_resize_mutex); diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 18e59821c597..ecf28f66b97d 100644 --- a/fs/erofs/dir.c +++ 
b/fs/erofs/dir.c @@ -22,10 +22,9 @@ static void debug_one_dentry(unsigned char d_type, const char *de_name, } static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, - void *dentry_blk, unsigned int *ofs, + void *dentry_blk, struct erofs_dirent *de, unsigned int nameoff, unsigned int maxsize) { - struct erofs_dirent *de = dentry_blk + *ofs; const struct erofs_dirent *end = dentry_blk + nameoff; while (de < end) { @@ -59,9 +58,8 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, /* stopped by some reason */ return 1; ++de; - *ofs += sizeof(struct erofs_dirent); + ctx->pos += sizeof(struct erofs_dirent); } - *ofs = maxsize; return 0; } @@ -90,33 +88,33 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) nameoff = le16_to_cpu(de->nameoff); if (nameoff < sizeof(struct erofs_dirent) || - nameoff >= PAGE_SIZE) { + nameoff >= EROFS_BLKSIZ) { erofs_err(dir->i_sb, "invalid de[0].nameoff %u @ nid %llu", nameoff, EROFS_I(dir)->nid); err = -EFSCORRUPTED; - goto skip_this; + break; } maxsize = min_t(unsigned int, - dirsize - ctx->pos + ofs, PAGE_SIZE); + dirsize - ctx->pos + ofs, EROFS_BLKSIZ); /* search dirents at the arbitrary position */ if (initial) { initial = false; ofs = roundup(ofs, sizeof(struct erofs_dirent)); + ctx->pos = blknr_to_addr(i) + ofs; if (ofs >= nameoff) goto skip_this; } - err = erofs_fill_dentries(dir, ctx, de, &ofs, + err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs, nameoff, maxsize); -skip_this: - ctx->pos = blknr_to_addr(i) + ofs; - if (err) break; +skip_this: + ctx->pos = blknr_to_addr(i) + maxsize; ++i; ofs = 0; } diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 724bb57075f6..5792ca9e0d5e 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2018 HUAWEI, Inc. 
* https://www.huawei.com/ + * Copyright (C) 2022 Alibaba Cloud */ #include "zdata.h" #include "compress.h" @@ -26,6 +27,82 @@ static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) }; +struct z_erofs_bvec_iter { + struct page *bvpage; + struct z_erofs_bvset *bvset; + unsigned int nr, cur; +}; + +static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter) +{ + if (iter->bvpage) + kunmap_local(iter->bvset); + return iter->bvpage; +} + +static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter) +{ + unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec; + /* have to access nextpage in advance, otherwise it will be unmapped */ + struct page *nextpage = iter->bvset->nextpage; + struct page *oldpage; + + DBG_BUGON(!nextpage); + oldpage = z_erofs_bvec_iter_end(iter); + iter->bvpage = nextpage; + iter->bvset = kmap_local_page(nextpage); + iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec); + iter->cur = 0; + return oldpage; +} + +static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvset_inline *bvset, + unsigned int bootstrap_nr, + unsigned int cur) +{ + *iter = (struct z_erofs_bvec_iter) { + .nr = bootstrap_nr, + .bvset = (struct z_erofs_bvset *)bvset, + }; + + while (cur > iter->nr) { + cur -= iter->nr; + z_erofs_bvset_flip(iter); + } + iter->cur = cur; +} + +static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvec *bvec, + struct page **candidate_bvpage) +{ + if (iter->cur == iter->nr) { + if (!*candidate_bvpage) + return -EAGAIN; + + DBG_BUGON(iter->bvset->nextpage); + iter->bvset->nextpage = *candidate_bvpage; + z_erofs_bvset_flip(iter); + + iter->bvset->nextpage = NULL; + *candidate_bvpage = NULL; + } + iter->bvset->bvec[iter->cur++] = *bvec; + return 0; +} + +static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvec *bvec, + struct page **old_bvpage) +{ + if (iter->cur == iter->nr) + *old_bvpage = z_erofs_bvset_flip(iter); + else + *old_bvpage = NULL; + *bvec = iter->bvset->bvec[iter->cur++]; +} + static void z_erofs_destroy_pcluster_pool(void) { int i; @@ -46,7 +123,7 @@ static int z_erofs_create_pcluster_pool(void) for (pcs = pcluster_pool; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { - size = struct_size(a, compressed_pages, pcs->maxpages); + size = struct_size(a, compressed_bvecs, pcs->maxpages); sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages); pcs->slab = kmem_cache_create(pcs->name, size, 0, @@ -150,30 +227,29 @@ int __init z_erofs_init_zip_subsystem(void) return err; } -enum z_erofs_collectmode { - COLLECT_SECONDARY, - COLLECT_PRIMARY, +enum z_erofs_pclustermode { + Z_EROFS_PCLUSTER_INFLIGHT, /* - * The current collection was the tail of an exist chain, in addition - * that the previous processed chained collections are all decided to + * The current pclusters was the tail of an exist chain, in addition + * that the previous processed chained pclusters are all decided to * be hooked up to it. 
- * A new chain will be created for the remaining collections which are - * not processed yet, therefore different from COLLECT_PRIMARY_FOLLOWED, - * the next collection cannot reuse the whole page safely in - * the following scenario: + * A new chain will be created for the remaining pclusters which are + * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED, + * the next pcluster cannot reuse the whole page safely for inplace I/O + * in the following scenario: * ________________________________________________________________ * | tail (partial) page | head (partial) page | - * | (belongs to the next cl) | (belongs to the current cl) | - * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________| + * | (belongs to the next pcl) | (belongs to the current pcl) | + * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________| */ - COLLECT_PRIMARY_HOOKED, + Z_EROFS_PCLUSTER_HOOKED, /* - * a weak form of COLLECT_PRIMARY_FOLLOWED, the difference is that it + * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it * could be dispatched into bypass queue later due to uptodated managed * pages. All related online pages cannot be reused for inplace I/O (or - * pagevec) since it can be directly decoded without I/O submission. + * bvpage) since it can be directly decoded without I/O submission. */ - COLLECT_PRIMARY_FOLLOWED_NOINPLACE, + Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* * The current collection has been linked with the owned chain, and * could also be linked with the remaining collections, which means @@ -184,39 +260,36 @@ enum z_erofs_collectmode { * ________________________________________________________________ * | tail (partial) page | head (partial) page | * | (of the current cl) | (of the previous collection) | - * | PRIMARY_FOLLOWED or | | - * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________| + * | PCLUSTER_FOLLOWED or | | + * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________| * * [ (*) the above page can be used as inplace I/O. 
] */ - COLLECT_PRIMARY_FOLLOWED, + Z_EROFS_PCLUSTER_FOLLOWED, }; struct z_erofs_decompress_frontend { struct inode *const inode; struct erofs_map_blocks map; + struct z_erofs_bvec_iter biter; - struct z_erofs_pagevec_ctor vector; - + struct page *candidate_bvpage; struct z_erofs_pcluster *pcl, *tailpcl; - /* a pointer used to pick up inplace I/O pages */ - struct page **icpage_ptr; z_erofs_next_pcluster_t owned_head; - - enum z_erofs_collectmode mode; + enum z_erofs_pclustermode mode; bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; + + /* a pointer used to pick up inplace I/O pages */ + unsigned int icur; }; #define DECOMPRESS_FRONTEND_INIT(__i) { \ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = COLLECT_PRIMARY_FOLLOWED, .backmost = true } - -static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; -static DEFINE_MUTEX(z_pagemap_global_lock); + .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, enum z_erofs_cache_alloctype type, @@ -231,24 +304,21 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, */ gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; - struct page **pages; - pgoff_t index; + unsigned int i; - if (fe->mode < COLLECT_PRIMARY_FOLLOWED) + if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; - pages = pcl->compressed_pages; - index = pcl->obj.index; - for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) { + for (i = 0; i < pcl->pclusterpages; ++i) { struct page *page; compressed_page_t t; struct page *newpage = NULL; /* the compressed page was loaded before */ - if (READ_ONCE(*pages)) + if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; - page = find_get_page(mc, index); + page = find_get_page(mc, pcl->obj.index + i); if (page) { t = tag_compressed_page_justfound(page); @@ -269,7 +339,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, } } - if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) + if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, + tagptr_cast_ptr(t))) continue; if (page) @@ -283,7 +354,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, * managed cache since it can be moved to the bypass queue instead. */ if (standalone) - fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } /* called by erofs_shrinker to get rid of all compressed_pages */ @@ -300,7 +371,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, * therefore no need to worry about available decompression users. 
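* (while the refcount stays frozen, no new decompression user can take a reference, so the slot scan below is race-free)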
*/ for (i = 0; i < pcl->pclusterpages; ++i) { - struct page *page = pcl->compressed_pages[i]; + struct page *page = pcl->compressed_bvecs[i].page; if (!page) continue; @@ -313,7 +384,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, continue; /* barrier is implied in the following 'unlock_page' */ - WRITE_ONCE(pcl->compressed_pages[i], NULL); + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); detach_page_private(page); unlock_page(page); } @@ -323,56 +394,59 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, int erofs_try_to_free_cached_page(struct page *page) { struct z_erofs_pcluster *const pcl = (void *)page_private(page); - int ret = 0; /* 0 - busy */ + int ret, i; - if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) { - unsigned int i; + if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1)) + return 0; - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - for (i = 0; i < pcl->pclusterpages; ++i) { - if (pcl->compressed_pages[i] == page) { - WRITE_ONCE(pcl->compressed_pages[i], NULL); - ret = 1; - break; - } + ret = 0; + DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + for (i = 0; i < pcl->pclusterpages; ++i) { + if (pcl->compressed_bvecs[i].page == page) { + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); + ret = 1; + break; } - erofs_workgroup_unfreeze(&pcl->obj, 1); - - if (ret) - detach_page_private(page); } + erofs_workgroup_unfreeze(&pcl->obj, 1); + if (ret) + detach_page_private(page); return ret; } -/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, - struct page *page) + struct z_erofs_bvec *bvec) { struct z_erofs_pcluster *const pcl = fe->pcl; - while (fe->icpage_ptr > pcl->compressed_pages) - if (!cmpxchg(--fe->icpage_ptr, NULL, page)) + while (fe->icur > 0) { + if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, + NULL, bvec->page)) { + pcl->compressed_bvecs[fe->icur] = *bvec; return true; + } + } return false; } /* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, - struct page *page, enum z_erofs_page_type type, - bool pvec_safereuse) + struct z_erofs_bvec *bvec, bool exclusive) { int ret; - /* give priority for inplaceio */ - if (fe->mode >= COLLECT_PRIMARY && - type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && - z_erofs_try_inplace_io(fe, page)) - return 0; - - ret = z_erofs_pagevec_enqueue(&fe->vector, page, type, - pvec_safereuse); - fe->pcl->vcnt += (unsigned int)ret; - return ret ? 0 : -EAGAIN; + if (exclusive) { + /* give priority for inplaceio to use file pages first */ + if (z_erofs_try_inplace_io(fe, bvec)) + return 0; + /* otherwise, check if it can be used as a bvpage */ + if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && + !fe->candidate_bvpage) + fe->candidate_bvpage = bvec->page; + } + ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage); + fe->pcl->vcnt += (ret >= 0); + return ret; } static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) @@ -385,7 +459,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) *owned_head) == Z_EROFS_PCLUSTER_NIL) { *owned_head = &pcl->next; /* so we can attach this pcluster to our submission chain. 
*/ - f->mode = COLLECT_PRIMARY_FOLLOWED; + f->mode = Z_EROFS_PCLUSTER_FOLLOWED; return; } @@ -393,66 +467,21 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) * type 2, link to the end of an existing open chain, be careful * that its submission is controlled by the original attached chain. */ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + if (*owned_head != &pcl->next && pcl != f->tailpcl && + cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, *owned_head) == Z_EROFS_PCLUSTER_TAIL) { *owned_head = Z_EROFS_PCLUSTER_TAIL; - f->mode = COLLECT_PRIMARY_HOOKED; + f->mode = Z_EROFS_PCLUSTER_HOOKED; f->tailpcl = NULL; return; } /* type 3, it belongs to a chain, but it isn't the end of the chain */ - f->mode = COLLECT_PRIMARY; + f->mode = Z_EROFS_PCLUSTER_INFLIGHT; } -static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) -{ - struct z_erofs_pcluster *pcl = fe->pcl; - unsigned int length; - - /* to avoid unexpected loop formed by corrupted images */ - if (fe->owned_head == &pcl->next || pcl == fe->tailpcl) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - if (pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - length = READ_ONCE(pcl->length); - if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) { - if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - } else { - unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT; - - if (map->m_flags & EROFS_MAP_FULL_MAPPED) - llen |= Z_EROFS_PCLUSTER_FULL_LENGTH; - - while (llen > length && - length != cmpxchg_relaxed(&pcl->length, length, llen)) { - cpu_relax(); - length = READ_ONCE(pcl->length); - } - } - mutex_lock(&pcl->lock); - /* used to check tail merging loop due to corrupted images */ - if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) - fe->tailpcl = pcl; - - z_erofs_try_to_claim_pcluster(fe); - return 0; -} - -static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { + struct erofs_map_blocks *map = &fe->map; bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; struct erofs_workgroup *grp; @@ -471,14 +500,13 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, atomic_set(&pcl->obj.refcount, 1); pcl->algorithmformat = map->m_algorithmformat; - pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | - (map->m_flags & EROFS_MAP_FULL_MAPPED ? 
- Z_EROFS_PCLUSTER_FULL_LENGTH : 0); + pcl->length = 0; + pcl->partial = true; /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = fe->owned_head; pcl->pageofs_out = map->m_la & ~PAGE_MASK; - fe->mode = COLLECT_PRIMARY_FOLLOWED; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; /* * lock all primary followed works before visible to others @@ -494,7 +522,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, } else { pcl->obj.index = map->m_pa >> PAGE_SHIFT; - grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj); + grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); if (IS_ERR(grp)) { err = PTR_ERR(grp); goto err_out; @@ -520,11 +548,10 @@ err_out: return err; } -static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) { - struct erofs_workgroup *grp; + struct erofs_map_blocks *map = &fe->map; + struct erofs_workgroup *grp = NULL; int ret; DBG_BUGON(fe->pcl); @@ -533,38 +560,35 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); - if (map->m_flags & EROFS_MAP_META) { - if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - goto tailpacking; + if (!(map->m_flags & EROFS_MAP_META)) { + grp = erofs_find_workgroup(fe->inode->i_sb, + map->m_pa >> PAGE_SHIFT); + } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { + DBG_BUGON(1); + return -EFSCORRUPTED; } - grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); if (grp) { fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); + ret = -EEXIST; } else { -tailpacking: - ret = z_erofs_register_pcluster(fe, inode, map); - if (!ret) - goto out; - if (ret != -EEXIST) - return ret; + ret = z_erofs_register_pcluster(fe); } - ret = z_erofs_lookup_pcluster(fe, inode, map); - if (ret) { - erofs_workgroup_put(&fe->pcl->obj); + if (ret == -EEXIST) { + mutex_lock(&fe->pcl->lock); + /* used to check tail merging loop due to corrupted images */ + if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) + fe->tailpcl = fe->pcl; + + z_erofs_try_to_claim_pcluster(fe); + } else if (ret) { return ret; } - -out: - z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS, - fe->pcl->pagevec, fe->pcl->vcnt); + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, + Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); /* since file-backed online pages are traversed in reverse order */ - fe->icpage_ptr = fe->pcl->compressed_pages + - z_erofs_pclusterpages(fe->pcl); + fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } @@ -593,14 +617,19 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) if (!pcl) return false; - z_erofs_pagevec_ctor_exit(&fe->vector, false); + z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); + if (fe->candidate_bvpage) { + DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage)); + fe->candidate_bvpage = NULL; + } + /* * if all pending pages are added, don't hold its reference * any longer if the pcluster isn't hosted by ourselves. 
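* (i.e. the INFLIGHT and HOOKED modes, where the chain that owns the pcluster is responsible for the final put)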
*/ - if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) + if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) erofs_workgroup_put(&pcl->obj); fe->pcl = NULL; @@ -628,11 +657,10 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); - bool tight = true; + bool tight = true, exclusive; enum z_erofs_cache_alloctype cache_strategy; - enum z_erofs_page_type page_type; - unsigned int cur, end, spiltted, index; + unsigned int cur, end, spiltted; int err = 0; /* register locked file pages as online pages in pack */ @@ -653,7 +681,7 @@ repeat: map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); if (err) - goto err_out; + goto out; } else { if (fe->pcl) goto hitted; @@ -663,9 +691,9 @@ repeat: if (!(map->m_flags & EROFS_MAP_MAPPED)) goto hitted; - err = z_erofs_collector_begin(fe, inode, map); + err = z_erofs_collector_begin(fe); if (err) - goto err_out; + goto out; if (z_erofs_is_inline_pcluster(fe->pcl)) { void *mp; @@ -676,11 +704,12 @@ repeat: err = PTR_ERR(mp); erofs_err(inode->i_sb, "failed to get inline page, err %d", err); - goto err_out; + goto out; } get_page(fe->map.buf.page); - WRITE_ONCE(fe->pcl->compressed_pages[0], fe->map.buf.page); - fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, + fe->map.buf.page); + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, @@ -696,10 +725,10 @@ hitted: * Ensure the current partial page belongs to this submit chain rather * than other concurrent submit chains or the noio(bypass) chain since * those chains are handled asynchronously thus the page cannot be used - * for inplace I/O or pagevec (should be processed in strict order.) + * for inplace I/O or bvpage (should be processed in a strict order.) */ - tight &= (fe->mode >= COLLECT_PRIMARY_HOOKED && - fe->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE); + tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED && + fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); cur = end - min_t(unsigned int, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { @@ -707,60 +736,59 @@ hitted: goto next_part; } - /* let's derive page type */ - page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD : - (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : - (tight ? 
Z_EROFS_PAGE_TYPE_EXCLUSIVE : - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED)); - + exclusive = (!cur && (!spiltted || tight)); if (cur) - tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED); + tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); retry: - err = z_erofs_attach_page(fe, page, page_type, - fe->mode >= COLLECT_PRIMARY_FOLLOWED); - /* should allocate an additional short-lived page for pagevec */ - if (err == -EAGAIN) { - struct page *const newpage = - alloc_page(GFP_NOFS | __GFP_NOFAIL); - - set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE); - err = z_erofs_attach_page(fe, newpage, - Z_EROFS_PAGE_TYPE_EXCLUSIVE, true); - if (!err) - goto retry; + err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { + .page = page, + .offset = offset - map->m_la, + .end = end, + }), exclusive); + /* should allocate an additional short-lived page for bvset */ + if (err == -EAGAIN && !fe->candidate_bvpage) { + fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); + set_page_private(fe->candidate_bvpage, + Z_EROFS_SHORTLIVED_PAGE); + goto retry; } - if (err) - goto err_out; - - index = page->index - (map->m_la >> PAGE_SHIFT); - - z_erofs_onlinepage_fixup(page, index, true); + if (err) { + DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); + goto out; + } + z_erofs_onlinepage_split(page); /* bump up the number of spiltted parts of a page */ ++spiltted; - /* also update nr_pages */ - fe->pcl->nr_pages = max_t(pgoff_t, fe->pcl->nr_pages, index + 1); + if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) + fe->pcl->multibases = true; + + if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && + fe->pcl->length == map->m_llen) + fe->pcl->partial = false; + if (fe->pcl->length < offset + end - map->m_la) { + fe->pcl->length = offset + end - map->m_la; + fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; + } next_part: - /* can be used for verification */ + /* shorten the remaining extent to update progress */ map->m_llen = offset + cur - map->m_la; + map->m_flags &= ~EROFS_MAP_FULL_MAPPED; end = cur; if (end > 0) goto repeat; out: + if (err) + z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu", __func__, page, spiltted, map->m_llen); return err; - - /* if some error occurred while processing this page */ -err_out: - SetPageError(page); - goto out; } static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, @@ -783,97 +811,137 @@ static bool z_erofs_page_is_invalidated(struct page *page) return !page->mapping && !z_erofs_is_shortlived_page(page); } -static int z_erofs_decompress_pcluster(struct super_block *sb, - struct z_erofs_pcluster *pcl, - struct page **pagepool) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - unsigned int pclusterpages = z_erofs_pclusterpages(pcl); - struct z_erofs_pagevec_ctor ctor; - unsigned int i, inputsize, outputsize, llen, nr_pages; - struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; - struct page **pages, **compressed_pages, *page; +struct z_erofs_decompress_backend { + struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; + struct super_block *sb; + struct z_erofs_pcluster *pcl; - enum z_erofs_page_type page_type; - bool overlapped, partial; - int err; + /* pages with the longest decompressed length for deduplication */ + struct page **decompressed_pages; + /* pages to keep the compressed data */ + struct page **compressed_pages; - might_sleep(); - DBG_BUGON(!READ_ONCE(pcl->nr_pages)); + struct list_head decompressed_secondary_bvecs; + struct page **pagepool; + unsigned int onstack_used, nr_pages; 
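+ /* onstack_used: onstack_pages[] slots already handed out; + * nr_pages: total decompressed page slots of this pcluster + */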
+}; - mutex_lock(&pcl->lock); - nr_pages = pcl->nr_pages; +struct z_erofs_bvec_item { + struct z_erofs_bvec bvec; + struct list_head list; +}; - if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) { - pages = pages_onstack; - } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES && - mutex_trylock(&z_pagemap_global_lock)) { - pages = z_pagemap_global; - } else { - gfp_t gfp_flags = GFP_KERNEL; +static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, + struct z_erofs_bvec *bvec) +{ + struct z_erofs_bvec_item *item; - if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES) - gfp_flags |= __GFP_NOFAIL; + if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) { + unsigned int pgnr; + struct page *oldpage; - pages = kvmalloc_array(nr_pages, sizeof(struct page *), - gfp_flags); + pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= be->nr_pages); + oldpage = be->decompressed_pages[pgnr]; + be->decompressed_pages[pgnr] = bvec->page; - /* fallback to global pagemap for the lowmem scenario */ - if (!pages) { - mutex_lock(&z_pagemap_global_lock); - pages = z_pagemap_global; - } + if (!oldpage) + return; } - for (i = 0; i < nr_pages; ++i) - pages[i] = NULL; - - err = 0; - z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, - pcl->pagevec, 0); - - for (i = 0; i < pcl->vcnt; ++i) { - unsigned int pagenr; + /* (cold path) one pcluster is requested multiple times */ + item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL); + item->bvec = *bvec; + list_add(&item->list, &be->decompressed_secondary_bvecs); +} - page = z_erofs_pagevec_dequeue(&ctor, &page_type); +static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, + int err) +{ + unsigned int off0 = be->pcl->pageofs_out; + struct list_head *p, *n; + + list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) { + struct z_erofs_bvec_item *bvi; + unsigned int end, cur; + void *dst, *src; + + bvi = container_of(p, struct z_erofs_bvec_item, list); + cur = bvi->bvec.offset < 0 ? 
-bvi->bvec.offset : 0; + end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset, + bvi->bvec.end); + dst = kmap_local_page(bvi->bvec.page); + while (cur < end) { + unsigned int pgnr, scur, len; + + pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= be->nr_pages); + + scur = bvi->bvec.offset + cur - + ((pgnr << PAGE_SHIFT) - off0); + len = min_t(unsigned int, end - cur, PAGE_SIZE - scur); + if (!be->decompressed_pages[pgnr]) { + err = -EFSCORRUPTED; + cur += len; + continue; + } + src = kmap_local_page(be->decompressed_pages[pgnr]); + memcpy(dst + cur, src + scur, len); + kunmap_local(src); + cur += len; + } + kunmap_local(dst); + if (err) + z_erofs_page_mark_eio(bvi->bvec.page); + z_erofs_onlinepage_endio(bvi->bvec.page); + list_del(p); + kfree(bvi); + } +} - /* all pages in pagevec ought to be valid */ - DBG_BUGON(!page); - DBG_BUGON(z_erofs_page_is_invalidated(page)); +static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) +{ + struct z_erofs_pcluster *pcl = be->pcl; + struct z_erofs_bvec_iter biter; + struct page *old_bvpage; + int i; - if (z_erofs_put_shortlivedpage(pagepool, page)) - continue; + z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); + for (i = 0; i < pcl->vcnt; ++i) { + struct z_erofs_bvec bvec; - if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) - pagenr = 0; - else - pagenr = z_erofs_onlinepage_index(page); + z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); - DBG_BUGON(pagenr >= nr_pages); + if (old_bvpage) + z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); - /* - * currently EROFS doesn't support multiref(dedup), - * so here erroring out one multiref page. - */ - if (pages[pagenr]) { - DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); - err = -EFSCORRUPTED; - } - pages[pagenr] = page; + DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); + z_erofs_do_decompressed_bvec(be, &bvec); } - z_erofs_pagevec_ctor_exit(&ctor, true); - overlapped = false; - compressed_pages = pcl->compressed_pages; + old_bvpage = z_erofs_bvec_iter_end(&biter); + if (old_bvpage) + z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); +} +static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, + bool *overlapped) +{ + struct z_erofs_pcluster *pcl = be->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + int i, err = 0; + + *overlapped = false; for (i = 0; i < pclusterpages; ++i) { - unsigned int pagenr; + struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; + struct page *page = bvec->page; - page = compressed_pages[i]; - /* all compressed pages ought to be valid */ - DBG_BUGON(!page); + /* compressed pages ought to be present before decompressing */ + if (!page) { + DBG_BUGON(1); + continue; + } + be->compressed_pages[i] = page; if (z_erofs_is_inline_pcluster(pcl)) { if (!PageUptodate(page)) @@ -883,109 +951,129 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, DBG_BUGON(z_erofs_page_is_invalidated(page)); if (!z_erofs_is_shortlived_page(page)) { - if (erofs_page_is_managed(sbi, page)) { + if (erofs_page_is_managed(EROFS_SB(be->sb), page)) { if (!PageUptodate(page)) err = -EIO; continue; } + z_erofs_do_decompressed_bvec(be, bvec); + *overlapped = true; + } + } - /* - * only if non-head page can be selected - * for inplace decompression - */ - pagenr = z_erofs_onlinepage_index(page); - - DBG_BUGON(pagenr >= nr_pages); - if (pages[pagenr]) { - DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); - err = 
-EFSCORRUPTED; - } - pages[pagenr] = page; + if (err) + return err; + return 0; +} - overlapped = true; - } +static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, + int err) +{ + struct erofs_sb_info *const sbi = EROFS_SB(be->sb); + struct z_erofs_pcluster *pcl = be->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + unsigned int i, inputsize; + int err2; + struct page *page; + bool overlapped; - /* PG_error needs checking for all non-managed pages */ - if (PageError(page)) { - DBG_BUGON(PageUptodate(page)); - err = -EIO; - } + mutex_lock(&pcl->lock); + be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT; + + /* allocate (de)compressed page arrays if cannot be kept on stack */ + be->decompressed_pages = NULL; + be->compressed_pages = NULL; + be->onstack_used = 0; + if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) { + be->decompressed_pages = be->onstack_pages; + be->onstack_used = be->nr_pages; + memset(be->decompressed_pages, 0, + sizeof(struct page *) * be->nr_pages); } + if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES) + be->compressed_pages = be->onstack_pages + be->onstack_used; + + if (!be->decompressed_pages) + be->decompressed_pages = + kvcalloc(be->nr_pages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); + if (!be->compressed_pages) + be->compressed_pages = + kvcalloc(pclusterpages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); + + z_erofs_parse_out_bvecs(be); + err2 = z_erofs_parse_in_bvecs(be, &overlapped); + if (err2) + err = err2; if (err) goto out; - llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT; - if (nr_pages << PAGE_SHIFT >= pcl->pageofs_out + llen) { - outputsize = llen; - partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH); - } else { - outputsize = (nr_pages << PAGE_SHIFT) - pcl->pageofs_out; - partial = true; - } - if (z_erofs_is_inline_pcluster(pcl)) inputsize = pcl->tailpacking_size; else inputsize = pclusterpages * PAGE_SIZE; err = z_erofs_decompress(&(struct z_erofs_decompress_req) { - .sb = sb, - .in = compressed_pages, - .out = pages, + .sb = be->sb, + .in = be->compressed_pages, + .out = be->decompressed_pages, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, .inputsize = inputsize, - .outputsize = outputsize, + .outputsize = pcl->length, .alg = pcl->algorithmformat, .inplace_io = overlapped, - .partial_decoding = partial - }, pagepool); + .partial_decoding = pcl->partial, + .fillgaps = pcl->multibases, + }, be->pagepool); out: /* must handle all compressed pages before actual file pages */ if (z_erofs_is_inline_pcluster(pcl)) { - page = compressed_pages[0]; - WRITE_ONCE(compressed_pages[0], NULL); + page = pcl->compressed_bvecs[0].page; + WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); put_page(page); } else { for (i = 0; i < pclusterpages; ++i) { - page = compressed_pages[i]; + page = pcl->compressed_bvecs[i].page; if (erofs_page_is_managed(sbi, page)) continue; /* recycle all individual short-lived pages */ - (void)z_erofs_put_shortlivedpage(pagepool, page); - WRITE_ONCE(compressed_pages[i], NULL); + (void)z_erofs_put_shortlivedpage(be->pagepool, page); + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } } + if (be->compressed_pages < be->onstack_pages || + be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) + kvfree(be->compressed_pages); + z_erofs_fill_other_copies(be, err); - for (i = 0; i < nr_pages; ++i) { - page = pages[i]; + for (i = 0; i < be->nr_pages; ++i) { + page = be->decompressed_pages[i]; if (!page) continue; 
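/* each remaining slot holds either a short-lived bounce page to recycle or an online file page to mark and complete below */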
DBG_BUGON(z_erofs_page_is_invalidated(page)); /* recycle all individual short-lived pages */ - if (z_erofs_put_shortlivedpage(pagepool, page)) + if (z_erofs_put_shortlivedpage(be->pagepool, page)) continue; - - if (err < 0) - SetPageError(page); - + if (err) + z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); } - if (pages == z_pagemap_global) - mutex_unlock(&z_pagemap_global_lock); - else if (pages != pages_onstack) - kvfree(pages); + if (be->decompressed_pages != be->onstack_pages) + kvfree(be->decompressed_pages); - pcl->nr_pages = 0; + pcl->length = 0; + pcl->partial = true; + pcl->multibases = false; + pcl->bvset.nextpage = NULL; pcl->vcnt = 0; /* pcluster lock MUST be taken before the following line */ @@ -997,22 +1085,25 @@ out: static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, struct page **pagepool) { + struct z_erofs_decompress_backend be = { + .sb = io->sb, + .pagepool = pagepool, + .decompressed_secondary_bvecs = + LIST_HEAD_INIT(be.decompressed_secondary_bvecs), + }; z_erofs_next_pcluster_t owned = io->head; while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) { - struct z_erofs_pcluster *pcl; - - /* no possible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ + /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL); - - /* no possible that 'owned' equals NULL */ + /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */ DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); - pcl = container_of(owned, struct z_erofs_pcluster, next); - owned = READ_ONCE(pcl->next); + be.pcl = container_of(owned, struct z_erofs_pcluster, next); + owned = READ_ONCE(be.pcl->next); - z_erofs_decompress_pcluster(io->sb, pcl, pagepool); - erofs_workgroup_put(&pcl->obj); + z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); + erofs_workgroup_put(&be.pcl->obj); } } @@ -1038,7 +1129,6 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (sync) { if (!atomic_add_return(bios, &io->pending_bios)) complete(&io->u.done); - return; } @@ -1071,7 +1161,7 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, int justfound; repeat: - page = READ_ONCE(pcl->compressed_pages[nr]); + page = READ_ONCE(pcl->compressed_bvecs[nr].page); oldpage = page; if (!page) @@ -1087,7 +1177,7 @@ repeat: * otherwise, it will go inplace I/O path instead. */ if (page->private == Z_EROFS_PREALLOCATED_PAGE) { - WRITE_ONCE(pcl->compressed_pages[nr], page); + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); set_page_private(page, 0); tocache = true; goto out_tocache; @@ -1113,14 +1203,13 @@ repeat: /* the page is still in manage cache */ if (page->mapping == mc) { - WRITE_ONCE(pcl->compressed_pages[nr], page); + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); - ClearPageError(page); if (!PagePrivate(page)) { /* * impossible to be !PagePrivate(page) for * the current restriction as well if - * the page is already in compressed_pages[]. + * the page is already in compressed_bvecs[]. 
*/ DBG_BUGON(!justfound); @@ -1149,7 +1238,8 @@ repeat: put_page(page); out_allocpage: page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { + if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, + oldpage, page)) { erofs_pagepool_add(pagepool, page); cond_resched(); goto repeat; @@ -1186,6 +1276,7 @@ fg_out: q = fgq; init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); + q->eio = false; } q->sb = sb; q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; @@ -1246,26 +1337,25 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) DBG_BUGON(PageUptodate(page)); DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (err) - SetPageError(page); - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { if (!err) SetPageUptodate(page); unlock_page(page); } } + if (err) + q->eio = true; z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); bio_put(bio); } -static void z_erofs_submit_queue(struct super_block *sb, - struct z_erofs_decompress_frontend *f, +static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct page **pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg) { - struct erofs_sb_info *const sbi = EROFS_SB(sb); + struct super_block *sb = f->inode->i_sb; + struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; @@ -1317,7 +1407,7 @@ static void z_erofs_submit_queue(struct super_block *sb, struct page *page; page = pickup_page_for_submission(pcl, i++, pagepool, - MNGD_MAPPING(sbi)); + mc); if (!page) continue; @@ -1369,15 +1459,14 @@ submit_bio_retry: z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); } -static void z_erofs_runqueue(struct super_block *sb, - struct z_erofs_decompress_frontend *f, +static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, struct page **pagepool, bool force_fg) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(sb, f, pagepool, io, &force_fg); + z_erofs_submit_queue(f, pagepool, io, &force_fg); /* handle bypass queue (no i/o pclusters) immediately */ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); @@ -1475,7 +1564,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) (void)z_erofs_collector_end(&f); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(inode->i_sb, &f, &pagepool, + z_erofs_runqueue(&f, &pagepool, z_erofs_get_sync_decompress_policy(sbi, 0)); if (err) @@ -1524,7 +1613,7 @@ static void z_erofs_readahead(struct readahead_control *rac) z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); (void)z_erofs_collector_end(&f); - z_erofs_runqueue(inode->i_sb, &f, &pagepool, + z_erofs_runqueue(&f, &pagepool, z_erofs_get_sync_decompress_policy(sbi, nr_pages)); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&pagepool); diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 58053bb5066f..e7f04c4fbb81 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -7,13 +7,10 @@ #define __EROFS_FS_ZDATA_H #include "internal.h" -#include "zpvec.h" +#include "tagptr.h" #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) -#define Z_EROFS_NR_INLINE_PAGEVECS 3 - -#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 -#define Z_EROFS_PCLUSTER_LENGTH_BIT 1 +#define Z_EROFS_INLINE_BVECS 2 /* * let's leave a type here in case of introducing @@ -21,6 +18,21 @@ */ typedef void 
*z_erofs_next_pcluster_t; +struct z_erofs_bvec { + struct page *page; + int offset; + unsigned int end; +}; + +#define __Z_EROFS_BVSET(name, total) \ +struct name { \ + /* point to the next page which contains the following bvecs */ \ + struct page *nextpage; \ + struct z_erofs_bvec bvec[total]; \ +} +__Z_EROFS_BVSET(z_erofs_bvset,); +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); + /* * Structure fields follow one of the following exclusion rules. * @@ -38,24 +50,21 @@ struct z_erofs_pcluster { /* A: point to next chained pcluster or TAILs */ z_erofs_next_pcluster_t next; - /* A: lower limit of decompressed length and if full length or not */ + /* L: the maximum decompression size of this round */ unsigned int length; + /* L: total number of bvecs */ + unsigned int vcnt; + /* I: page offset of start position of decompression */ unsigned short pageofs_out; /* I: page offset of inline compressed data */ unsigned short pageofs_in; - /* L: maximum relative page index in pagevec[] */ - unsigned short nr_pages; - - /* L: total number of pages in pagevec[] */ - unsigned int vcnt; - union { - /* L: inline a certain number of pagevecs for bootstrap */ - erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS]; + /* L: inline a certain number of bvec for bootstrap */ + struct z_erofs_bvset_inline bvset; /* I: can be used to free the pcluster by RCU. */ struct rcu_head rcu; @@ -72,8 +81,14 @@ struct z_erofs_pcluster { /* I: compression algorithm format */ unsigned char algorithmformat; - /* A: compressed pages (can be cached or inplaced pages) */ - struct page *compressed_pages[]; + /* L: whether partial decompression or not */ + bool partial; + + /* L: indicate several pageofs_outs or not */ + bool multibases; + + /* A: compressed bvecs (can be cached or inplaced pages) */ + struct z_erofs_bvec compressed_bvecs[]; }; /* let's avoid the valid 32-bit kernel addresses */ @@ -94,6 +109,8 @@ struct z_erofs_decompressqueue { struct completion done; struct work_struct work; } u; + + bool eio; }; static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) @@ -108,38 +125,17 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) return pcl->pclusterpages; } -#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 -#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) -#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) - /* - * waiters (aka. 
ongoing_packs): # to unlock the page - * sub-index: 0 - for partial page, >= 1 full page sub-index + * bit 31: I/O error occurred on this page + * bit 0 - 30: remaining parts to complete this page */ -typedef atomic_t z_erofs_onlinepage_t; - -/* type punning */ -union z_erofs_onlinepage_converter { - z_erofs_onlinepage_t *o; - unsigned long *v; -}; - -static inline unsigned int z_erofs_onlinepage_index(struct page *page) -{ - union z_erofs_onlinepage_converter u; - - DBG_BUGON(!PagePrivate(page)); - u.v = &page_private(page); - - return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; -} +#define Z_EROFS_PAGE_EIO (1 << 31) static inline void z_erofs_onlinepage_init(struct page *page) { union { - z_erofs_onlinepage_t o; + atomic_t o; unsigned long v; - /* keep from being unlocked in advance */ } u = { .o = ATOMIC_INIT(1) }; set_page_private(page, u.v); @@ -147,49 +143,36 @@ static inline void z_erofs_onlinepage_init(struct page *page) SetPagePrivate(page); } -static inline void z_erofs_onlinepage_fixup(struct page *page, - uintptr_t index, bool down) +static inline void z_erofs_onlinepage_split(struct page *page) { - union z_erofs_onlinepage_converter u = { .v = &page_private(page) }; - int orig, orig_index, val; - -repeat: - orig = atomic_read(u.o); - orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; - if (orig_index) { - if (!index) - return; + atomic_inc((atomic_t *)&page->private); +} - DBG_BUGON(orig_index != index); - } +static inline void z_erofs_page_mark_eio(struct page *page) +{ + int orig; - val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) | - ((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down); - if (atomic_cmpxchg(u.o, orig, val) != orig) - goto repeat; + do { + orig = atomic_read((atomic_t *)&page->private); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, + orig | Z_EROFS_PAGE_EIO) != orig); } static inline void z_erofs_onlinepage_endio(struct page *page) { - union z_erofs_onlinepage_converter u; unsigned int v; DBG_BUGON(!PagePrivate(page)); - u.v = &page_private(page); - - v = atomic_dec_return(u.o); - if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) { + v = atomic_dec_return((atomic_t *)&page->private); + if (!(v & ~Z_EROFS_PAGE_EIO)) { set_page_private(page, 0); ClearPagePrivate(page); - if (!PageError(page)) + if (!(v & Z_EROFS_PAGE_EIO)) SetPageUptodate(page); unlock_page(page); } - erofs_dbg("%s, page %p value %x", __func__, page, atomic_read(u.o)); } -#define Z_EROFS_VMAP_ONSTACK_PAGES \ - min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U) -#define Z_EROFS_VMAP_GLOBAL_PAGES 2048 +#define Z_EROFS_ONSTACK_PAGES 32 #endif diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h deleted file mode 100644 index b05464f4a808..000000000000 --- a/fs/erofs/zpvec.h +++ /dev/null @@ -1,159 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2018 HUAWEI, Inc. 
- * https://www.huawei.com/ - */ -#ifndef __EROFS_FS_ZPVEC_H -#define __EROFS_FS_ZPVEC_H - -#include "tagptr.h" - -/* page type in pagevec for decompress subsystem */ -enum z_erofs_page_type { - /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ - Z_EROFS_PAGE_TYPE_EXCLUSIVE, - - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, - - Z_EROFS_VLE_PAGE_TYPE_HEAD, - Z_EROFS_VLE_PAGE_TYPE_MAX -}; - -extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0") - __bad_page_type_exclusive(void); - -/* pagevec tagged pointer */ -typedef tagptr2_t erofs_vtptr_t; - -/* pagevec collector */ -struct z_erofs_pagevec_ctor { - struct page *curr, *next; - erofs_vtptr_t *pages; - - unsigned int nr, index; -}; - -static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor, - bool atomic) -{ - if (!ctor->curr) - return; - - if (atomic) - kunmap_atomic(ctor->pages); - else - kunmap(ctor->curr); -} - -static inline struct page * -z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor, - unsigned int nr) -{ - unsigned int index; - - /* keep away from occupied pages */ - if (ctor->next) - return ctor->next; - - for (index = 0; index < nr; ++index) { - const erofs_vtptr_t t = ctor->pages[index]; - const unsigned int tags = tagptr_unfold_tags(t); - - if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE) - return tagptr_unfold_ptr(t); - } - DBG_BUGON(nr >= ctor->nr); - return NULL; -} - -static inline void -z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor, - bool atomic) -{ - struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr); - - z_erofs_pagevec_ctor_exit(ctor, atomic); - - ctor->curr = next; - ctor->next = NULL; - ctor->pages = atomic ? - kmap_atomic(ctor->curr) : kmap(ctor->curr); - - ctor->nr = PAGE_SIZE / sizeof(struct page *); - ctor->index = 0; -} - -static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, - unsigned int nr, - erofs_vtptr_t *pages, - unsigned int i) -{ - ctor->nr = nr; - ctor->curr = ctor->next = NULL; - ctor->pages = pages; - - if (i >= nr) { - i -= nr; - z_erofs_pagevec_ctor_pagedown(ctor, false); - while (i > ctor->nr) { - i -= ctor->nr; - z_erofs_pagevec_ctor_pagedown(ctor, false); - } - } - ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i); - ctor->index = i; -} - -static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, - struct page *page, - enum z_erofs_page_type type, - bool pvec_safereuse) -{ - if (!ctor->next) { - /* some pages cannot be reused as pvec safely without I/O */ - if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && !pvec_safereuse) - type = Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED; - - if (type != Z_EROFS_PAGE_TYPE_EXCLUSIVE && - ctor->index + 1 == ctor->nr) - return false; - } - - if (ctor->index >= ctor->nr) - z_erofs_pagevec_ctor_pagedown(ctor, false); - - /* exclusive page type must be 0 */ - if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL) - __bad_page_type_exclusive(); - - /* should remind that collector->next never equal to 1, 2 */ - if (type == (uintptr_t)ctor->next) { - ctor->next = page; - } - ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type); - return true; -} - -static inline struct page * -z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor, - enum z_erofs_page_type *type) -{ - erofs_vtptr_t t; - - if (ctor->index >= ctor->nr) { - DBG_BUGON(!ctor->next); - z_erofs_pagevec_ctor_pagedown(ctor, true); - } - - t = ctor->pages[ctor->index]; - - *type = tagptr_unfold_tags(t); - - /* should remind that collector->next never equal to 1, 2 */ - if (*type == 
(uintptr_t)ctor->next) - ctor->next = tagptr_unfold_ptr(t); - - ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0); - return tagptr_unfold_ptr(t); -} -#endif diff --git a/fs/exec.c b/fs/exec.c index 0989fb8472a1..778123259e42 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1301,7 +1301,7 @@ int begin_new_exec(struct linux_binprm * bprm) bprm->mm = NULL; #ifdef CONFIG_POSIX_TIMERS - exit_itimers(me->signal); + exit_itimers(me); flush_itimer_signals(); #endif diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 76acc3721951..c6eaf7e9ea74 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -1198,7 +1198,9 @@ static int __exfat_rename(struct inode *old_parent_inode, return -ENOENT; } - exfat_chain_dup(&olddir, &ei->dir); + exfat_chain_set(&olddir, EXFAT_I(old_parent_inode)->start_clu, + EXFAT_B_TO_CLU_ROUND_UP(i_size_read(old_parent_inode), sbi), + EXFAT_I(old_parent_inode)->flags); dentry = ei->entry; ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e6b932219803..7a192e4e7fa9 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1679,14 +1679,14 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) return error; - if (is_quota_modification(inode, iattr)) { + if (is_quota_modification(mnt_userns, inode, iattr)) { error = dquot_initialize(inode); if (error) return error; } - if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || - (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { - error = dquot_transfer(inode, iattr); + if (i_uid_needs_update(mnt_userns, iattr, inode) || + i_gid_needs_update(mnt_userns, iattr, inode)) { + error = dquot_transfer(mnt_userns, inode, iattr); if (error) return error; } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f6a19f6d9f6d..6f475d2e3b18 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1059,9 +1059,10 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_frags_per_group); goto failed_mount; } - if (sbi->s_inodes_per_group > sb->s_blocksize * 8) { + if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || + sbi->s_inodes_per_group > sb->s_blocksize * 8) { ext2_msg(sb, KERN_ERR, - "error: #inodes per group too big: %lu", + "error: invalid #inodes per group: %lu", sbi->s_inodes_per_group); goto failed_mount; } @@ -1071,6 +1072,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - le32_to_cpu(es->s_first_data_block) - 1) / EXT2_BLOCKS_PER_GROUP(sb)) + 1; + if ((u64)sbi->s_groups_count * sbi->s_inodes_per_group != + le32_to_cpu(es->s_inodes_count)) { + ext2_msg(sb, KERN_ERR, "error: invalid #inodes: %u vs computed %llu", + le32_to_cpu(es->s_inodes_count), + (u64)sbi->s_groups_count * sbi->s_inodes_per_group); + goto failed_mount; + } db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) / EXT2_DESC_PER_BLOCK(sb); sbi->s_group_desc = kmalloc_array(db_count, @@ -1490,8 +1498,7 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, len = i_size-off; toread = len; while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? - sb->s_blocksize - offset : toread; + tocopy = min_t(size_t, sb->s_blocksize - offset, toread); tmp_bh.b_state = 0; tmp_bh.b_size = sb->s_blocksize; @@ -1529,8 +1536,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, struct buffer_head *bh; while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? 
- sb->s_blocksize - offset : towrite; + tocopy = min_t(size_t, sb->s_blocksize - offset, towrite); tmp_bh.b_state = 0; tmp_bh.b_size = sb->s_blocksize; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 84c0eb55071d..3dcc1dd1f179 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5350,14 +5350,14 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) return error; - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } - if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || - (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { + if (i_uid_needs_update(mnt_userns, attr, inode) || + i_gid_needs_update(mnt_userns, attr, inode)) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, @@ -5374,7 +5374,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * counts xattr inode references. */ down_read(&EXT4_I(inode)->xattr_sem); - error = dquot_transfer(inode, attr); + error = dquot_transfer(mnt_userns, inode, attr); up_read(&EXT4_I(inode)->xattr_sem); if (error) { @@ -5383,10 +5383,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } /* Update corresponding info in inode so that everything is in * one transaction */ - if (attr->ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (attr->ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); if (unlikely(error)) { diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bd14cef1b08f..d66e37d80a2d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -861,10 +861,8 @@ static void __setattr_copy(struct user_namespace *mnt_userns, { unsigned int ia_valid = attr->ia_valid; - if (ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) @@ -917,17 +915,15 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (err) return err; - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { err = f2fs_dquot_initialize(inode); if (err) return err; } - if ((attr->ia_valid & ATTR_UID && - !uid_eq(attr->ia_uid, inode->i_uid)) || - (attr->ia_valid & ATTR_GID && - !gid_eq(attr->ia_gid, inode->i_gid))) { + if (i_uid_needs_update(mnt_userns, attr, inode) || + i_gid_needs_update(mnt_userns, attr, inode)) { f2fs_lock_op(F2FS_I_SB(inode)); - err = dquot_transfer(inode, attr); + err = dquot_transfer(mnt_userns, inode, attr); if (err) { set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); @@ -938,10 +934,8 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * update uid/gid under lock_op(), so that dquot and inode can * be updated atomically. 
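Back in ext2_fill_super() above, the added checks tie three superblock counters together: inodes-per-group must be at least one inode-table block's worth, must fit a single bitmap block, and the per-group counts must multiply out to the advertised inode total. A standalone sketch of that arithmetic, with a simplified stand-in for the on-disk superblock:

    #include <stdint.h>
    #include <stdio.h>

    struct sb {   /* simplified: all counts already host-endian */
        uint32_t blocks_count, first_data_block, blocks_per_group;
        uint32_t inodes_count, inodes_per_group, inodes_per_block;
        uint32_t blocksize;
    };

    static int sb_valid(const struct sb *s)
    {
        uint32_t groups = (s->blocks_count - s->first_data_block - 1) /
                          s->blocks_per_group + 1;

        if (s->inodes_per_group < s->inodes_per_block ||
            s->inodes_per_group > s->blocksize * 8)  /* one bitmap block */
            return 0;
        /* 64-bit product, so a crafted image cannot overflow the check */
        if ((uint64_t)groups * s->inodes_per_group != s->inodes_count)
            return 0;
        return 1;
    }

    int main(void)
    {
        struct sb ok = { 8192, 1, 8192, 2048, 2048, 8, 1024 };
        puts(sb_valid(&ok) ? "valid" : "invalid");
        return 0;
    }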
*/ - if (attr->ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (attr->ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); f2fs_mark_inode_dirty_sync(inode, true); f2fs_unlock_op(F2FS_I_SB(inode)); } diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c index be599f31d3c4..d84c5f6cc09d 100644 --- a/fs/f2fs/iostat.c +++ b/fs/f2fs/iostat.c @@ -91,8 +91,9 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) unsigned int cnt; struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + unsigned long flags; - spin_lock_bh(&sbi->iostat_lat_lock); + spin_lock_irqsave(&sbi->iostat_lat_lock, flags); for (idx = 0; idx < MAX_IO_TYPE; idx++) { for (io = 0; io < NR_PAGE_TYPE; io++) { cnt = io_lat->bio_cnt[idx][io]; @@ -106,7 +107,7 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) io_lat->bio_cnt[idx][io] = 0; } } - spin_unlock_bh(&sbi->iostat_lat_lock); + spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); trace_f2fs_iostat_latency(sbi, iostat_lat); } @@ -115,14 +116,15 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) { unsigned long long iostat_diff[NR_IO_TYPE]; int i; + unsigned long flags; if (time_is_after_jiffies(sbi->iostat_next_period)) return; /* Need double check under the lock */ - spin_lock_bh(&sbi->iostat_lock); + spin_lock_irqsave(&sbi->iostat_lock, flags); if (time_is_after_jiffies(sbi->iostat_next_period)) { - spin_unlock_bh(&sbi->iostat_lock); + spin_unlock_irqrestore(&sbi->iostat_lock, flags); return; } sbi->iostat_next_period = jiffies + @@ -133,7 +135,7 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) sbi->prev_rw_iostat[i]; sbi->prev_rw_iostat[i] = sbi->rw_iostat[i]; } - spin_unlock_bh(&sbi->iostat_lock); + spin_unlock_irqrestore(&sbi->iostat_lock, flags); trace_f2fs_iostat(sbi, iostat_diff); @@ -145,25 +147,27 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi) struct iostat_lat_info *io_lat = sbi->iostat_io_lat; int i; - spin_lock_bh(&sbi->iostat_lock); + spin_lock_irq(&sbi->iostat_lock); for (i = 0; i < NR_IO_TYPE; i++) { sbi->rw_iostat[i] = 0; sbi->prev_rw_iostat[i] = 0; } - spin_unlock_bh(&sbi->iostat_lock); + spin_unlock_irq(&sbi->iostat_lock); - spin_lock_bh(&sbi->iostat_lat_lock); + spin_lock_irq(&sbi->iostat_lat_lock); memset(io_lat, 0, sizeof(struct iostat_lat_info)); - spin_unlock_bh(&sbi->iostat_lat_lock); + spin_unlock_irq(&sbi->iostat_lat_lock); } void f2fs_update_iostat(struct f2fs_sb_info *sbi, enum iostat_type type, unsigned long long io_bytes) { + unsigned long flags; + if (!sbi->iostat_enable) return; - spin_lock_bh(&sbi->iostat_lock); + spin_lock_irqsave(&sbi->iostat_lock, flags); sbi->rw_iostat[type] += io_bytes; if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO) @@ -172,7 +176,7 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi, if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO) sbi->rw_iostat[APP_READ_IO] += io_bytes; - spin_unlock_bh(&sbi->iostat_lock); + spin_unlock_irqrestore(&sbi->iostat_lock, flags); f2fs_record_iostat(sbi); } @@ -185,6 +189,7 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, struct f2fs_sb_info *sbi = iostat_ctx->sbi; struct iostat_lat_info *io_lat = sbi->iostat_io_lat; int idx; + unsigned long flags; if (!sbi->iostat_enable) return; @@ -202,12 +207,12 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, idx = WRITE_ASYNC_IO; } - 
spin_lock_bh(&sbi->iostat_lat_lock); + spin_lock_irqsave(&sbi->iostat_lat_lock, flags); io_lat->sum_lat[idx][iotype] += ts_diff; io_lat->bio_cnt[idx][iotype]++; if (ts_diff > io_lat->peak_lat[idx][iotype]) io_lat->peak_lat[idx][iotype] = ts_diff; - spin_unlock_bh(&sbi->iostat_lat_lock); + spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); } void iostat_update_and_unbind_ctx(struct bio *bio, int rw) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index c549acb52ac4..bf00d5057abb 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -89,8 +89,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); - if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) - set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) set_inode_flag(inode, FI_INLINE_DENTRY); @@ -107,10 +105,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, f2fs_init_extent_tree(inode, NULL); - stat_inc_inline_xattr(inode); - stat_inc_inline_inode(inode); - stat_inc_inline_dir(inode); - F2FS_I(inode)->i_flags = f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); @@ -127,6 +121,14 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, set_compress_context(inode); } + /* Should enable inline_data after compression set */ + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) + set_inode_flag(inode, FI_INLINE_DATA); + + stat_inc_inline_xattr(inode); + stat_inc_inline_inode(inode); + stat_inc_inline_dir(inode); + f2fs_set_inode_flags(inode); trace_f2fs_new_inode(inode, 0); @@ -325,6 +327,9 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, if (!is_extension_exist(name, ext[i], false)) continue; + /* Do not use inline_data with compression */ + stat_dec_inline_inode(inode); + clear_inode_flag(inode, FI_INLINE_DATA); set_compress_context(inode); return; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 836c79a20afc..cf6f7fc83c08 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1450,7 +1450,9 @@ page_hit: out_err: ClearPageUptodate(page); out_put_err: - f2fs_handle_page_eio(sbi, page->index, NODE); + /* ENOENT comes from read_node_page which is not an error. 
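Two things are going on in the fs/f2fs/iostat.c hunks above: the _bh spinlock variants become irqsave/irqrestore because the counters can now be updated from hard-IRQ completion context, and f2fs_record_iostat() keeps its double-checked time test, a cheap unlocked check followed by a mandatory re-check under the lock. The second pattern in isolation, as a runnable pthread sketch (the irq-safety itself has no userspace analogue):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static time_t next_period;

    static bool try_record(void)
    {
        if (time(NULL) < next_period)       /* unlocked fast path */
            return false;
        pthread_mutex_lock(&lock);
        if (time(NULL) < next_period) {     /* re-check: we may have raced */
            pthread_mutex_unlock(&lock);
            return false;
        }
        next_period = time(NULL) + 3;       /* open the next period */
        pthread_mutex_unlock(&lock);
        return true;                        /* this caller records the stats */
    }

    int main(void)
    {
        printf("%d %d\n", try_record(), try_record());   /* prints: 1 0 */
        return 0;
    }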
*/ + if (err != -ENOENT) + f2fs_handle_page_eio(sbi, page->index, NODE); f2fs_put_page(page, 1); return ERR_PTR(err); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 3cb7f8a43b4d..dcd0a1e35095 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -255,18 +255,18 @@ static int recover_quota_data(struct inode *inode, struct page *page) memset(&attr, 0, sizeof(attr)); - attr.ia_uid = make_kuid(inode->i_sb->s_user_ns, i_uid); - attr.ia_gid = make_kgid(inode->i_sb->s_user_ns, i_gid); + attr.ia_vfsuid = VFSUIDT_INIT(make_kuid(inode->i_sb->s_user_ns, i_uid)); + attr.ia_vfsgid = VFSGIDT_INIT(make_kgid(inode->i_sb->s_user_ns, i_gid)); - if (!uid_eq(attr.ia_uid, inode->i_uid)) + if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&init_user_ns, inode))) attr.ia_valid |= ATTR_UID; - if (!gid_eq(attr.ia_gid, inode->i_gid)) + if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&init_user_ns, inode))) attr.ia_valid |= ATTR_GID; if (!attr.ia_valid) return 0; - err = dquot_transfer(inode, &attr); + err = dquot_transfer(&init_user_ns, inode, &attr); if (err) set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); return err; diff --git a/fs/fat/file.c b/fs/fat/file.c index 3dae3ed60f3a..3e4eb3467cb4 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -90,7 +90,8 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) * out the RO attribute for checking by the security * module, just because it maps to a file mode. */ - err = security_inode_setattr(file->f_path.dentry, &ia); + err = security_inode_setattr(file_mnt_user_ns(file), + file->f_path.dentry, &ia); if (err) goto out_unlock_inode; @@ -516,9 +517,11 @@ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } if (((attr->ia_valid & ATTR_UID) && - (!uid_eq(attr->ia_uid, sbi->options.fs_uid))) || + (!uid_eq(from_vfsuid(mnt_userns, i_user_ns(inode), attr->ia_vfsuid), + sbi->options.fs_uid))) || ((attr->ia_valid & ATTR_GID) && - (!gid_eq(attr->ia_gid, sbi->options.fs_gid))) || + (!gid_eq(from_vfsgid(mnt_userns, i_user_ns(inode), attr->ia_vfsgid), + sbi->options.fs_gid))) || ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~FAT_VALID_MODE))) error = -EPERM; diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 9d3cf0111709..74920826d8f6 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -372,17 +372,22 @@ nomem: return NULL; } +static inline bool fscache_cookie_is_dropped(struct fscache_cookie *cookie) +{ + return READ_ONCE(cookie->state) == FSCACHE_COOKIE_STATE_DROPPED; +} + static void fscache_wait_on_collision(struct fscache_cookie *candidate, struct fscache_cookie *wait_for) { enum fscache_cookie_state *statep = &wait_for->state; - wait_var_event_timeout(statep, READ_ONCE(*statep) == FSCACHE_COOKIE_STATE_DROPPED, + wait_var_event_timeout(statep, fscache_cookie_is_dropped(wait_for), 20 * HZ); - if (READ_ONCE(*statep) != FSCACHE_COOKIE_STATE_DROPPED) { + if (!fscache_cookie_is_dropped(wait_for)) { pr_notice("Potential collision c=%08x old: c=%08x", candidate->debug_id, wait_for->debug_id); - wait_var_event(statep, READ_ONCE(*statep) == FSCACHE_COOKIE_STATE_DROPPED); + wait_var_event(statep, fscache_cookie_is_dropped(wait_for)); } } @@ -517,7 +522,14 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie) } fscache_see_cookie(cookie, fscache_cookie_see_active); - fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE); + spin_lock(&cookie->lock); + if (test_and_clear_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags)) + __fscache_set_cookie_state(cookie, + 
FSCACHE_COOKIE_STATE_INVALIDATING); + else + __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE); + spin_unlock(&cookie->lock); + wake_up_cookie_state(cookie); trace = fscache_access_lookup_cookie_end; out: @@ -752,6 +764,9 @@ again_locked: spin_lock(&cookie->lock); } + if (test_and_clear_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags)) + fscache_end_cookie_access(cookie, fscache_access_invalidate_cookie_end); + switch (state) { case FSCACHE_COOKIE_STATE_RELINQUISHING: fscache_see_cookie(cookie, fscache_cookie_see_relinquish); @@ -1048,6 +1063,9 @@ void __fscache_invalidate(struct fscache_cookie *cookie, return; case FSCACHE_COOKIE_STATE_LOOKING_UP: + __fscache_begin_cookie_access(cookie, fscache_access_invalidate_cookie); + set_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags); + fallthrough; case FSCACHE_COOKIE_STATE_CREATING: spin_unlock(&cookie->lock); _leave(" [look %x]", cookie->inval_counter); diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c index f2aa7dbad766..a058e0136bfe 100644 --- a/fs/fscache/volume.c +++ b/fs/fscache/volume.c @@ -143,7 +143,7 @@ static void fscache_wait_on_volume_collision(struct fscache_volume *candidate, { wait_var_event_timeout(&candidate->flags, !fscache_is_acquire_pending(candidate), 20 * HZ); - if (!fscache_is_acquire_pending(candidate)) { + if (fscache_is_acquire_pending(candidate)) { pr_notice("Potential volume collision new=%08x old=%08x", candidate->debug_id, collidee_debug_id); fscache_stat(&fscache_n_volumes_collision); @@ -182,7 +182,7 @@ static bool fscache_hash_volume(struct fscache_volume *candidate) hlist_bl_add_head(&candidate->hash_link, h); hlist_bl_unlock(h); - if (test_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &candidate->flags)) + if (fscache_is_acquire_pending(candidate)) fscache_wait_on_volume_collision(candidate, collidee_debug_id); return true; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 62408047e8d7..02eb72351b15 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -600,41 +600,79 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) remove_inode_hugepages(inode, offset, LLONG_MAX); } +static void hugetlbfs_zero_partial_page(struct hstate *h, + struct address_space *mapping, + loff_t start, + loff_t end) +{ + pgoff_t idx = start >> huge_page_shift(h); + struct folio *folio; + + folio = filemap_lock_folio(mapping, idx); + if (!folio) + return; + + start = start & ~huge_page_mask(h); + end = end & ~huge_page_mask(h); + if (!end) + end = huge_page_size(h); + + folio_zero_segment(folio, (size_t)start, (size_t)end); + + folio_unlock(folio); + folio_put(folio); +} + static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); + struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); loff_t hpage_size = huge_page_size(h); loff_t hole_start, hole_end; /* - * For hole punch round up the beginning offset of the hole and - * round down the end. + * hole_start and hole_end indicate the full pages within the hole. 
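The two fscache waits above share one shape: wait a bounded time for a state change, and only if the condition still holds afterwards, log the potential collision and fall back to an unbounded wait; the volume.c hunk fixes exactly that "still pending" test, which was inverted. A pthread sketch of the shape, with a condition variable standing in for wait_var_event():

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
    static bool pending = true;

    static void *old_object(void *arg)
    {
        (void)arg;
        sleep(2);                            /* simulate slow teardown */
        pthread_mutex_lock(&lock);
        pending = false;
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        struct timespec to;

        pthread_create(&t, NULL, old_object, NULL);
        clock_gettime(CLOCK_REALTIME, &to);
        to.tv_sec += 1;                      /* bounded wait (kernel: 20 * HZ) */

        pthread_mutex_lock(&lock);
        while (pending && pthread_cond_timedwait(&cond, &lock, &to) == 0)
            ;
        if (pending) {                       /* condition *still* holds */
            puts("potential collision, waiting without timeout");
            while (pending)
                pthread_cond_wait(&cond, &lock);
        }
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);
        return 0;
    }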
*/ hole_start = round_up(offset, hpage_size); hole_end = round_down(offset + len, hpage_size); - if (hole_end > hole_start) { - struct address_space *mapping = inode->i_mapping; - struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); + inode_lock(inode); - inode_lock(inode); + /* protected by i_rwsem */ + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { + inode_unlock(inode); + return -EPERM; + } - /* protected by i_rwsem */ - if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { - inode_unlock(inode); - return -EPERM; - } + i_mmap_lock_write(mapping); + + /* If range starts before first full page, zero partial page. */ + if (offset < hole_start) + hugetlbfs_zero_partial_page(h, mapping, + offset, min(offset + len, hole_start)); - i_mmap_lock_write(mapping); + /* Unmap users of full pages in the hole. */ + if (hole_end > hole_start) { if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, hole_start >> PAGE_SHIFT, hole_end >> PAGE_SHIFT, 0); - i_mmap_unlock_write(mapping); - remove_inode_hugepages(inode, hole_start, hole_end); - inode_unlock(inode); } + /* If range extends beyond last full page, zero partial page. */ + if ((offset + len) > hole_end && (offset + len) > hole_start) + hugetlbfs_zero_partial_page(h, mapping, + hole_end, offset + len); + + i_mmap_unlock_write(mapping); + + /* Remove full pages from the file. */ + if (hole_end > hole_start) + remove_inode_hugepages(inode, hole_start, hole_end); + + inode_unlock(inode); + return 0; } diff --git a/fs/io_uring.c b/fs/io_uring.c index d3ee4fc532fa..e8e769be9ed0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1183,6 +1183,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .needs_async_setup = 1, + .ioprio = 1, .async_size = sizeof(struct io_async_msghdr), }, [IORING_OP_RECVMSG] = { @@ -1191,6 +1192,7 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .buffer_select = 1, .needs_async_setup = 1, + .ioprio = 1, .async_size = sizeof(struct io_async_msghdr), }, [IORING_OP_TIMEOUT] = { @@ -1266,6 +1268,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .audit_skip = 1, + .ioprio = 1, }, [IORING_OP_RECV] = { .needs_file = 1, @@ -1273,6 +1276,7 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .buffer_select = 1, .audit_skip = 1, + .ioprio = 1, }, [IORING_OP_OPENAT2] = { }, @@ -1734,6 +1738,14 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) return; /* + * READV uses fields in `struct io_rw` (len/addr) to stash the selected + * buffer data. However if that buffer is recycled the original request + * data stored in addr is lost. Therefore forbid recycling for now. + */ + if (req->opcode == IORING_OP_READV) + return; + + /* * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear * the flag and hence ensure that bl->head doesn't get incremented. * If the tail has already been incremented, hang on to it. 
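With the hugetlbfs rework above, hole punching now behaves like other filesystems when the range is not huge-page aligned: the partial pages at either edge are zeroed in the page cache and only the whole pages in between are unmapped and removed. The userspace interface is unchanged; a sketch against an ordinary temp file (a real hugetlbfs test would need a mounted hugetlbfs and huge-page-sized offsets):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    int main(void)
    {
        char path[] = "/tmp/holeXXXXXX";
        int fd = mkstemp(path);

        if (fd < 0 || ftruncate(fd, 1 << 20) < 0)
            return 1;
        /* deliberately unaligned range: the filesystem zeroes the partial
         * pages at both edges and deallocates only the full ones between */
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      4097, 123456) < 0)
            perror("fallocate");
        unlink(path);
        close(fd);
        return 0;
    }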
@@ -1975,7 +1987,7 @@ static inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; - atomic_inc(¤t->io_uring->inflight_tracked); + atomic_inc(&req->task->io_uring->inflight_tracked); } } @@ -3437,7 +3449,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) if (unlikely(res != req->cqe.res)) { if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE; + req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; return true; } req_set_fail(req); @@ -3487,7 +3499,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) kiocb_end_write(req); if (unlikely(res != req->cqe.res)) { if (res == -EAGAIN && io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE; + req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; return; } req->cqe.res = res; @@ -4314,18 +4326,19 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret < 0)) return ret; } else { + rw = req->async_data; + s = &rw->s; + /* * Safe and required to re-import if we're using provided * buffers, as we dropped the selected one before retry. */ - if (req->flags & REQ_F_BUFFER_SELECT) { + if (io_do_buffer_select(req)) { ret = io_import_iovec(READ, req, &iovec, s, issue_flags); if (unlikely(ret < 0)) return ret; } - rw = req->async_data; - s = &rw->s; /* * We come here from an earlier attempt, restore our state to * match in case it doesn't. It's cheap enough that we don't @@ -5061,7 +5074,7 @@ static int io_uring_cmd_prep(struct io_kiocb *req, { struct io_uring_cmd *ioucmd = &req->uring_cmd; - if (sqe->rw_flags) + if (sqe->rw_flags || sqe->__pad1) return -EINVAL; ioucmd->cmd = sqe->cmd; ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); @@ -6075,14 +6088,12 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; - if (unlikely(sqe->file_index)) - return -EINVAL; - if (unlikely(sqe->addr2 || sqe->file_index)) + if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); - sr->flags = READ_ONCE(sqe->addr2); + sr->flags = READ_ONCE(sqe->ioprio); if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; @@ -6313,14 +6324,12 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; - if (unlikely(sqe->file_index)) - return -EINVAL; - if (unlikely(sqe->addr2 || sqe->file_index)) + if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); - sr->flags = READ_ONCE(sqe->addr2); + sr->flags = READ_ONCE(sqe->ioprio); if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; @@ -6954,7 +6963,8 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) io_req_complete_failed(req, ret); } -static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events) +static void __io_poll_execute(struct io_kiocb *req, int mask, + __poll_t __maybe_unused events) { req->cqe.res = mask; /* @@ -6963,7 +6973,6 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events) * CPU. We want to avoid pulling in req->apoll->events for that * case. 
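The sendmsg/recvmsg prep fixes above settle where the op-specific send/recv flags live: in sqe->ioprio (these opcodes now advertise .ioprio support), not in addr2. A hedged liburing fragment, assuming liburing 2.2-era headers and a 5.19-level kernel; it is a building block, not a complete program:

    #include <liburing.h>

    /* queue a recv that arms poll first instead of trying an immediate read */
    static void prep_recv_poll_first(struct io_uring *ring, int sockfd,
                                     void *buf, size_t len)
    {
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
            return;                          /* submission queue full */
        io_uring_prep_recv(sqe, sockfd, buf, len, 0);
        /* send/recv(msg) op flags travel in sqe->ioprio */
        sqe->ioprio = IORING_RECVSEND_POLL_FIRST;
    }

IORING_RECVSEND_POLL_FIRST skips the initial attempt and waits for the socket to become readable, which avoids a wasted recv on an empty socket.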
*/ - req->apoll_events = events; if (req->opcode == IORING_OP_POLL_ADD) req->io_task_work.func = io_poll_task_func; else @@ -7114,6 +7123,8 @@ static int __io_arm_poll_handler(struct io_kiocb *req, io_init_poll_iocb(poll, mask, io_poll_wake); poll->file = req->file; + req->apoll_events = poll->events; + ipt->pt._key = mask; ipt->req = req; ipt->error = 0; @@ -7144,8 +7155,11 @@ static int __io_arm_poll_handler(struct io_kiocb *req, if (mask) { /* can't multishot if failed, just queue the event we've got */ - if (unlikely(ipt->error || !ipt->nr_entries)) + if (unlikely(ipt->error || !ipt->nr_entries)) { poll->events |= EPOLLONESHOT; + req->apoll_events |= EPOLLONESHOT; + ipt->error = 0; + } __io_poll_execute(req, mask, poll->events); return 0; } @@ -7207,6 +7221,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) mask |= EPOLLEXCLUSIVE; if (req->flags & REQ_F_POLLED) { apoll = req->apoll; + kfree(apoll->double_poll); } else if (!(issue_flags & IO_URING_F_UNLOCKED) && !list_empty(&ctx->apoll_cache)) { apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, @@ -7392,7 +7407,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return -EINVAL; io_req_set_refcount(req); - req->apoll_events = poll->events = io_poll_parse_events(sqe, flags); + poll->events = io_poll_parse_events(sqe, flags); return 0; } @@ -7405,6 +7420,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ipt.pt._qproc = io_poll_queue_proc; ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events); + if (!ret && ipt.error) + req_set_fail(req); ret = ret ?: ipt.error; if (ret) __io_req_complete(req, issue_flags, ret, 0); @@ -7964,6 +7981,9 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, struct file *file; int ret, fd; + if (!req->ctx->file_data) + return -ENXIO; + for (done = 0; done < req->rsrc_update.nr_args; done++) { if (copy_from_user(&fd, &fds[done], sizeof(fd))) { ret = -EFAULT; @@ -12919,7 +12939,7 @@ static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_ring *br; struct io_uring_buf_reg reg; - struct io_buffer_list *bl; + struct io_buffer_list *bl, *free_bl = NULL; struct page **pages; int nr_pages; @@ -12951,7 +12971,7 @@ static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) return -EEXIST; } else { - bl = kzalloc(sizeof(*bl), GFP_KERNEL); + free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); if (!bl) return -ENOMEM; } @@ -12960,7 +12980,7 @@ static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) struct_size(br, bufs, reg.ring_entries), &nr_pages); if (IS_ERR(pages)) { - kfree(bl); + kfree(free_bl); return PTR_ERR(pages); } diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 1d732fd223d4..332dc9ac47a9 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -95,14 +95,14 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (rc) return rc; - if (is_quota_modification(inode, iattr)) { + if (is_quota_modification(mnt_userns, inode, iattr)) { rc = dquot_initialize(inode); if (rc) return rc; } if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { - rc = dquot_transfer(inode, iattr); + rc = dquot_transfer(mnt_userns, inode, iattr); if (rc) return rc; } diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index e6f4ccc12f49..353f047e783c 100644 --- a/fs/ksmbd/smb2pdu.c 
+++ b/fs/ksmbd/smb2pdu.c @@ -6490,6 +6490,7 @@ int smb2_write(struct ksmbd_work *work) goto out; } + ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags)); if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH) writethrough = true; @@ -6505,10 +6506,6 @@ int smb2_write(struct ksmbd_work *work) data_buf = (char *)(((char *)&req->hdr.ProtocolId) + le16_to_cpu(req->DataOffset)); - ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags)); - if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH) - writethrough = true; - ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n", fp->filp->f_path.dentry, offset, length); err = ksmbd_vfs_write(work, fp, data_buf, length, &offset, @@ -7703,7 +7700,7 @@ int smb2_ioctl(struct ksmbd_work *work) { struct file_zero_data_information *zero_data; struct ksmbd_file *fp; - loff_t off, len; + loff_t off, len, bfz; if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { ksmbd_debug(SMB, @@ -7720,19 +7717,26 @@ int smb2_ioctl(struct ksmbd_work *work) zero_data = (struct file_zero_data_information *)&req->Buffer[0]; - fp = ksmbd_lookup_fd_fast(work, id); - if (!fp) { - ret = -ENOENT; + off = le64_to_cpu(zero_data->FileOffset); + bfz = le64_to_cpu(zero_data->BeyondFinalZero); + if (off > bfz) { + ret = -EINVAL; goto out; } - off = le64_to_cpu(zero_data->FileOffset); - len = le64_to_cpu(zero_data->BeyondFinalZero) - off; + len = bfz - off; + if (len) { + fp = ksmbd_lookup_fd_fast(work, id); + if (!fp) { + ret = -ENOENT; + goto out; + } - ret = ksmbd_vfs_zero_data(work, fp, off, len); - ksmbd_fd_put(work, fp); - if (ret < 0) - goto out; + ret = ksmbd_vfs_zero_data(work, fp, off, len); + ksmbd_fd_put(work, fp); + if (ret < 0) + goto out; + } break; } case FSCTL_QUERY_ALLOCATED_RANGES: @@ -7806,14 +7810,24 @@ int smb2_ioctl(struct ksmbd_work *work) src_off = le64_to_cpu(dup_ext->SourceFileOffset); dst_off = le64_to_cpu(dup_ext->TargetFileOffset); length = le64_to_cpu(dup_ext->ByteCount); - cloned = vfs_clone_file_range(fp_in->filp, src_off, fp_out->filp, - dst_off, length, 0); + /* + * XXX: It is not clear if FSCTL_DUPLICATE_EXTENTS_TO_FILE + * should fall back to vfs_copy_file_range(). This could be + * beneficial when re-exporting nfs/smb mount, but note that + * this can result in partial copy that returns an error status. + * If/when FSCTL_DUPLICATE_EXTENTS_TO_FILE_EX is implemented, + * fall back to vfs_copy_file_range(), should be avoided when + * the flag DUPLICATE_EXTENTS_DATA_EX_SOURCE_ATOMIC is set. + */ + cloned = vfs_clone_file_range(fp_in->filp, src_off, + fp_out->filp, dst_off, length, 0); if (cloned == -EXDEV || cloned == -EOPNOTSUPP) { ret = -EOPNOTSUPP; goto dup_ext_out; } else if (cloned != length) { cloned = vfs_copy_file_range(fp_in->filp, src_off, - fp_out->filp, dst_off, length, 0); + fp_out->filp, dst_off, + length, 0); if (cloned != length) { if (cloned < 0) ret = cloned; diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index d035e060c2f0..35b55ee94fe5 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -5,16 +5,6 @@ * * Author(s): Long Li <longli@microsoft.com>, * Hyunchul Lee <hyc.lee@gmail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
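The zero-data ioctl rework above validates the client-controlled range before doing any work: a FileOffset beyond BeyondFinalZero is rejected with -EINVAL, and a zero-length range succeeds without even looking up the fd. The check in isolation (field names mirror the wire format; a real server would additionally cap the length):

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    /* returns bytes to zero, 0 for a no-op, -EINVAL for a bogus range */
    static int64_t zero_range_len(uint64_t off, uint64_t beyond_final_zero)
    {
        if (off > beyond_final_zero)     /* untrusted input: validate first */
            return -EINVAL;
        return beyond_final_zero - off;  /* may legitimately be zero */
    }

    int main(void)
    {
        printf("%lld\n", (long long)zero_range_len(4096, 8192)); /* 4096 */
        printf("%lld\n", (long long)zero_range_len(8192, 8192)); /* 0 */
        printf("%lld\n", (long long)zero_range_len(8192, 4096)); /* -22 */
        return 0;
    }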
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. */ #define SUBMOD_NAME "smb_direct" diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index 8fef9de787d3..143bba4e4db8 100644 --- a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -230,7 +230,7 @@ static int ksmbd_kthread_fn(void *p) break; } ret = kernel_accept(iface->ksmbd_socket, &client_sk, - O_NONBLOCK); + SOCK_NONBLOCK); mutex_unlock(&iface->sock_release_lock); if (ret) { if (ret == -EAGAIN) diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index dcdd07c6efff..7c849024999f 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -963,7 +963,7 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, */ int ksmbd_vfs_setxattr(struct user_namespace *user_ns, struct dentry *dentry, const char *attr_name, - const void *attr_value, size_t attr_size, int flags) + void *attr_value, size_t attr_size, int flags) { int err; @@ -1015,7 +1015,9 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, len); - return vfs_fallocate(fp->filp, FALLOC_FL_ZERO_RANGE, off, len); + return vfs_fallocate(fp->filp, + FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE, + off, len); } int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, @@ -1046,7 +1048,7 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, *out_count = 0; end = start + length; while (start < end && *out_count < in_count) { - extent_start = f->f_op->llseek(f, start, SEEK_DATA); + extent_start = vfs_llseek(f, start, SEEK_DATA); if (extent_start < 0) { if (extent_start != -ENXIO) ret = (int)extent_start; @@ -1056,7 +1058,7 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, if (extent_start >= end) break; - extent_end = f->f_op->llseek(f, extent_start, SEEK_HOLE); + extent_end = vfs_llseek(f, extent_start, SEEK_HOLE); if (extent_end < 0) { if (extent_end != -ENXIO) ret = (int)extent_end; @@ -1777,6 +1779,10 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, ret = vfs_copy_file_range(src_fp->filp, src_off, dst_fp->filp, dst_off, len, 0); + if (ret == -EOPNOTSUPP || ret == -EXDEV) + ret = generic_copy_file_range(src_fp->filp, src_off, + dst_fp->filp, dst_off, + len, 0); if (ret < 0) return ret; diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 8c37aaf936ab..70da4c0ba7ad 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -109,7 +109,7 @@ ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns, int attr_name_len); int ksmbd_vfs_setxattr(struct user_namespace *user_ns, struct dentry *dentry, const char *attr_name, - const void *attr_value, size_t attr_size, int flags); + void *attr_value, size_t attr_size, int flags); int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns, diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 0a22a2faf552..e1c4617de771 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -176,7 +176,7 @@ nlm_delete_file(struct nlm_file *file) } } -static int nlm_unlock_files(struct nlm_file *file) +static int nlm_unlock_files(struct nlm_file *file, fl_owner_t owner) { struct file_lock lock; @@ -184,6 +184,7 @@ static int nlm_unlock_files(struct nlm_file *file) lock.fl_type = 
F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; + lock.fl_owner = owner; if (file->f_file[O_RDONLY] && vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, &lock, NULL)) goto out_err; @@ -225,7 +226,7 @@ again: if (match(lockhost, host)) { spin_unlock(&flctx->flc_lock); - if (nlm_unlock_files(file)) + if (nlm_unlock_files(file, fl->fl_owner)) return 1; goto again; } @@ -282,11 +283,10 @@ nlm_file_inuse(struct nlm_file *file) static void nlm_close_files(struct nlm_file *file) { - struct file *f; - - for (f = file->f_file[0]; f <= file->f_file[1]; f++) - if (f) - nlmsvc_ops->fclose(f); + if (file->f_file[O_RDONLY]) + nlmsvc_ops->fclose(file->f_file[O_RDONLY]); + if (file->f_file[O_WRONLY]) + nlmsvc_ops->fclose(file->f_file[O_WRONLY]); } /* diff --git a/fs/locks.c b/fs/locks.c index ca28e0e50e56..c266cfdc3291 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -425,21 +425,9 @@ static inline int flock_translate_cmd(int cmd) { } /* Fill in a file_lock structure with an appropriate FLOCK lock. */ -static struct file_lock * -flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl) +static void flock_make_lock(struct file *filp, struct file_lock *fl, int type) { - int type = flock_translate_cmd(cmd); - - if (type < 0) - return ERR_PTR(type); - - if (fl == NULL) { - fl = locks_alloc_lock(); - if (fl == NULL) - return ERR_PTR(-ENOMEM); - } else { - locks_init_lock(fl); - } + locks_init_lock(fl); fl->fl_file = filp; fl->fl_owner = filp; @@ -447,8 +435,6 @@ flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl) fl->fl_flags = FL_FLOCK; fl->fl_type = type; fl->fl_end = OFFSET_MAX; - - return fl; } static int assign_type(struct file_lock *fl, long type) @@ -2097,21 +2083,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait); */ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) { - struct fd f = fdget(fd); - struct file_lock *lock; - int can_sleep, unlock; - int error; - - error = -EBADF; - if (!f.file) - goto out; - - can_sleep = !(cmd & LOCK_NB); - cmd &= ~LOCK_NB; - unlock = (cmd == LOCK_UN); - - if (!unlock && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE))) - goto out_putf; + int can_sleep, error, type; + struct file_lock fl; + struct fd f; /* * LOCK_MAND locks were broken for a long time in that they never @@ -2123,36 +2097,41 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) */ if (cmd & LOCK_MAND) { pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n"); - error = 0; - goto out_putf; + return 0; } - lock = flock_make_lock(f.file, cmd, NULL); - if (IS_ERR(lock)) { - error = PTR_ERR(lock); + type = flock_translate_cmd(cmd & ~LOCK_NB); + if (type < 0) + return type; + + error = -EBADF; + f = fdget(fd); + if (!f.file) + return error; + + if (type != F_UNLCK && !(f.file->f_mode & (FMODE_READ | FMODE_WRITE))) goto out_putf; - } - if (can_sleep) - lock->fl_flags |= FL_SLEEP; + flock_make_lock(f.file, &fl, type); - error = security_file_lock(f.file, lock->fl_type); + error = security_file_lock(f.file, fl.fl_type); if (error) - goto out_free; + goto out_putf; + + can_sleep = !(cmd & LOCK_NB); + if (can_sleep) + fl.fl_flags |= FL_SLEEP; if (f.file->f_op->flock) error = f.file->f_op->flock(f.file, - (can_sleep) ? F_SETLKW : F_SETLK, - lock); + (can_sleep) ? 
F_SETLKW : F_SETLK, + &fl); else - error = locks_lock_file_wait(f.file, lock); - - out_free: - locks_free_lock(lock); + error = locks_lock_file_wait(f.file, &fl); out_putf: fdput(f); - out: + return error; } @@ -2614,7 +2593,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) if (list_empty(&flctx->flc_flock)) return; - flock_make_lock(filp, LOCK_UN, &fl); + flock_make_lock(filp, &fl, F_UNLCK); fl.fl_flags |= FL_CLOSE; if (filp->f_op->flock) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 42f892c5712e..0ce535852151 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -319,8 +319,9 @@ zero_out: * conflicting writes once the folio is grabbed and locked. It is passed a * pointer to the fsdata cookie that gets returned to the VM to be passed to * write_end. It is permitted to sleep. It should return 0 if the request - * should go ahead; unlock the folio and return -EAGAIN to cause the folio to - * be regot; or return an error. + * should go ahead or it may return an error. It may also unlock and put the + * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0 + * will cause the folio to be re-got and the process to be retried. * * The calling netfs must initialise a netfs context contiguous to the vfs * inode before calling this. @@ -348,13 +349,13 @@ retry: if (ctx->ops->check_write_begin) { /* Allow the netfs (eg. ceph) to flush conflicts. */ - ret = ctx->ops->check_write_begin(file, pos, len, folio, _fsdata); + ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata); if (ret < 0) { trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin); - if (ret == -EAGAIN) - goto retry; goto error; } + if (!folio) + goto retry; } if (folio_test_uptodate(folio)) @@ -416,8 +417,10 @@ have_folio_no_wait: error_put: netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); error: - folio_unlock(folio); - folio_put(folio); + if (folio) { + folio_unlock(folio); + folio_put(folio); + } _leave(" = %d", ret); return ret; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c0fdcf8c0032..bb0e84a46d61 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4012,22 +4012,29 @@ static int _nfs4_discover_trunking(struct nfs_server *server, } page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); - if (page == NULL || locations == NULL) - goto out; + if (!locations) + goto out_free; + locations->fattr = nfs_alloc_fattr(); + if (!locations->fattr) + goto out_free_2; status = nfs4_proc_get_locations(server, fhandle, locations, page, cred); if (status) - goto out; + goto out_free_3; for (i = 0; i < locations->nlocations; i++) test_fs_location_for_trunking(&locations->locations[i], clp, server); -out: - if (page) - __free_page(page); +out_free_3: + kfree(locations->fattr); +out_free_2: kfree(locations); +out_free: + __free_page(page); return status; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2540b35ec187..9bab3e9c702a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2753,5 +2753,6 @@ again: goto again; nfs_put_client(clp); + module_put_and_kthread_exit(0); return 0; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 61b2aae81abb..2acea7792bb2 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -470,6 +470,15 @@ nfsd4_decode_fattr4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen, return nfserr_bad_xdr; } } + if (bmval[1] & FATTR4_WORD1_TIME_CREATE) { + struct timespec64 ts; + + /* No 
Linux filesystem supports setting this attribute. */ + bmval[1] &= ~FATTR4_WORD1_TIME_CREATE; + status = nfsd4_decode_nfstime4(argp, &ts); + if (status) + return status; + } if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { u32 set_it; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 847b482155ae..9a8b09afc173 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -465,7 +465,8 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval) (FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL) #define NFSD_WRITEABLE_ATTRS_WORD1 \ (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ - | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) + | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_CREATE \ + | FATTR4_WORD1_TIME_MODIFY_SET) #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #define MAYBE_FATTR4_WORD2_SECURITY_LABEL \ FATTR4_WORD2_SECURITY_LABEL diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 840e3af63a6f..d79db56475d4 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -577,6 +577,7 @@ out_err: ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, u64 dst_pos, u64 count) { + ssize_t ret; /* * Limit copy to 4MB to prevent indefinitely blocking an nfsd @@ -587,7 +588,12 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, * limit like this and pipeline multiple COPY requests. */ count = min_t(u64, count, 1 << 22); - return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0); + ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0); + + if (ret == -EOPNOTSUPP || ret == -EXDEV) + ret = generic_copy_file_range(src, src_pos, dst, dst_pos, + count, 0); + return ret; } __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -1173,6 +1179,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset, nfsd_copy_write_verifier(verf, nn); err2 = filemap_check_wb_err(nf->nf_file->f_mapping, since); + err = nfserrno(err2); break; case -EINVAL: err = nfserr_notsupp; @@ -1180,8 +1187,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset, default: nfsd_reset_write_verifier(nn); trace_nfsd_writeverf_reset(nn, rqstp, err2); + err = nfserrno(err2); } - err = nfserrno(err2); } else nfsd_copy_write_verifier(verf, nn); diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 1344f7d475d3..aecda4fc95f5 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -198,6 +198,9 @@ static inline int nilfs_acl_chmod(struct inode *inode) static inline int nilfs_init_acl(struct inode *inode, struct inode *dir) { + if (S_ISLNK(inode->i_mode)) + return 0; + inode->i_mode &= ~current_umask(); return 0; } diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 4f897e109547..cd7d09a569ff 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -295,12 +295,13 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, const void *data, int data_type, struct inode *dir) { - __u32 marks_mask = 0, marks_ignored_mask = 0; + __u32 marks_mask = 0, marks_ignore_mask = 0; __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS | FANOTIFY_EVENT_FLAGS; const struct path *path = fsnotify_data_path(data, data_type); unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); struct fsnotify_mark *mark; + bool ondir = event_mask & FAN_ONDIR; int type; pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n", @@ -315,19 +316,21 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, return 0; } else if (!(fid_mode & FAN_REPORT_FID)) { 
/* Do we have a directory inode to report? */ - if (!dir && !(event_mask & FS_ISDIR)) + if (!dir && !ondir) return 0; } fsnotify_foreach_iter_mark_type(iter_info, mark, type) { - /* Apply ignore mask regardless of mark's ISDIR flag */ - marks_ignored_mask |= mark->ignored_mask; + /* + * Apply ignore mask depending on event flags in ignore mask. + */ + marks_ignore_mask |= + fsnotify_effective_ignore_mask(mark, ondir, type); /* - * If the event is on dir and this mark doesn't care about - * events on dir, don't send it! + * Send the event depending on event flags in mark mask. */ - if (event_mask & FS_ISDIR && !(mark->mask & FS_ISDIR)) + if (!fsnotify_mask_applicable(mark->mask, ondir, type)) continue; marks_mask |= mark->mask; @@ -336,7 +339,7 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, *match_mask |= 1U << type; } - test_mask = event_mask & marks_mask & ~marks_ignored_mask; + test_mask = event_mask & marks_mask & ~marks_ignore_mask; /* * For dirent modification events (create/delete/move) that do not carry diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 80e0ec95b113..1d9f11255c64 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -499,6 +499,8 @@ static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark) mflags |= FAN_MARK_IGNORED_SURV_MODIFY; if (mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF) mflags |= FAN_MARK_EVICTABLE; + if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) + mflags |= FAN_MARK_IGNORE; return mflags; } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index c2255b440df9..f0e49a406ffa 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1009,10 +1009,10 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, mask &= ~umask; spin_lock(&fsn_mark->lock); oldmask = fsnotify_calc_mask(fsn_mark); - if (!(flags & FAN_MARK_IGNORED_MASK)) { + if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) { fsn_mark->mask &= ~mask; } else { - fsn_mark->ignored_mask &= ~mask; + fsn_mark->ignore_mask &= ~mask; } newmask = fsnotify_calc_mask(fsn_mark); /* @@ -1021,7 +1021,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, * changes to the mask. * Destroy mark when only umask bits remain. */ - *destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask); + *destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask); spin_unlock(&fsn_mark->lock); return oldmask & ~newmask; @@ -1085,15 +1085,24 @@ static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark, unsigned int fan_flags) { bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE); + unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS; bool recalc = false; /* + * When using FAN_MARK_IGNORE for the first time, mark starts using + * independent event flags in ignore mask. After that, trying to + * update the ignore mask with the old FAN_MARK_IGNORED_MASK API + * will result in EEXIST error. + */ + if (ignore == FAN_MARK_IGNORE) + fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS; + + /* * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to * the removal of the FS_MODIFY bit in calculated mask if it was set - * because of an ignored mask that is now going to survive FS_MODIFY. + * because of an ignore mask that is now going to survive FS_MODIFY. 
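Event delivery above reduces to mask arithmetic: report iff event & marks_mask & ~marks_ignore_mask is non-zero, where a mark's ignore mask now only counts against directory events if it carries the on-dir flag. A simplified standalone model (two real flag values shown; the kernel helper additionally special-cases marks still using the legacy FAN_MARK_IGNORED_MASK semantics):

    #include <stdio.h>

    #define FS_OPEN  0x00000020u
    #define FS_ISDIR 0x40000000u

    /* model of fsnotify_effective_ignore_mask() for new-style marks */
    static unsigned effective_ignore(unsigned ignore_mask, int ondir)
    {
        if (ondir && !(ignore_mask & FS_ISDIR))
            return 0;            /* ignore mask without on-dir: void on dirs */
        return ignore_mask;
    }

    int main(void)
    {
        unsigned event = FS_OPEN, marks_mask = FS_OPEN, ignore = FS_OPEN;
        int ondir = 1;

        unsigned test = event & marks_mask & ~effective_ignore(ignore, ondir);
        puts(test ? "deliver" : "suppressed"); /* deliver: no FS_ISDIR in ignore */
        return 0;
    }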
*/ - if ((fan_flags & FAN_MARK_IGNORED_MASK) && - (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && + if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) { fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; if (!(fsn_mark->mask & FS_MODIFY)) @@ -1120,10 +1129,10 @@ static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, bool recalc; spin_lock(&fsn_mark->lock); - if (!(fan_flags & FAN_MARK_IGNORED_MASK)) + if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS)) fsn_mark->mask |= mask; else - fsn_mark->ignored_mask |= mask; + fsn_mark->ignore_mask |= mask; recalc = fsnotify_calc_mask(fsn_mark) & ~fsnotify_conn_mask(fsn_mark->connector); @@ -1187,6 +1196,37 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group) sizeof(struct fanotify_error_event)); } +static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, + unsigned int fan_flags) +{ + /* + * Non evictable mark cannot be downgraded to evictable mark. + */ + if (fan_flags & FAN_MARK_EVICTABLE && + !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) + return -EEXIST; + + /* + * New ignore mask semantics cannot be downgraded to old semantics. + */ + if (fan_flags & FAN_MARK_IGNORED_MASK && + fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) + return -EEXIST; + + /* + * An ignore mask that survives modify could never be downgraded to not + * survive modify. With new FAN_MARK_IGNORE semantics we make that rule + * explicit and return an error when trying to update the ignore mask + * without the original FAN_MARK_IGNORED_SURV_MODIFY value. + */ + if (fan_flags & FAN_MARK_IGNORE && + !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && + fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) + return -EEXIST; + + return 0; +} + static int fanotify_add_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int obj_type, __u32 mask, unsigned int fan_flags, @@ -1208,19 +1248,18 @@ static int fanotify_add_mark(struct fsnotify_group *group, } /* - * Non evictable mark cannot be downgraded to evictable mark. + * Check if requested mark flags conflict with an existing mark flags. */ - if (fan_flags & FAN_MARK_EVICTABLE && - !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) { - ret = -EEXIST; + ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags); + if (ret) goto out; - } /* * Error events are pre-allocated per group, only if strictly * needed (i.e. FAN_FS_ERROR was requested). */ - if (!(fan_flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) { + if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) && + (mask & FAN_FS_ERROR)) { ret = fanotify_group_init_error_pool(group); if (ret) goto out; @@ -1261,10 +1300,10 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, /* * If some other task has this inode open for write we should not add - * an ignored mark, unless that ignored mark is supposed to survive + * an ignore mask, unless that ignore mask is supposed to survive * modification changes anyway. 
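fanotify_may_update_existing_mark() above encodes three one-way doors: once a mark is non-evictable, uses the new ignore-flag semantics, or has a modify-surviving ignore mask, a later fanotify_mark() call cannot quietly weaken it and gets -EEXIST. A compact model; the flag values are illustrative placeholders, not the uapi numbers:

    #include <errno.h>
    #include <stdio.h>

    /* placeholders for the FAN_MARK_* request flags */
    #define REQ_IGNORED_MASK 0x1
    #define REQ_SURV_MODIFY  0x2
    #define REQ_EVICTABLE    0x4
    #define REQ_IGNORE       0x8

    /* placeholders for existing-mark state */
    #define MARK_NO_IREF          0x1
    #define MARK_HAS_IGNORE_FLAGS 0x2
    #define MARK_SURV_MODIFY      0x4

    static int may_update(unsigned mark, unsigned req)
    {
        if ((req & REQ_EVICTABLE) && !(mark & MARK_NO_IREF))
            return -EEXIST;  /* non-evictable cannot become evictable */
        if ((req & REQ_IGNORED_MASK) && (mark & MARK_HAS_IGNORE_FLAGS))
            return -EEXIST;  /* new ignore semantics cannot revert to old */
        if ((req & REQ_IGNORE) && !(req & REQ_SURV_MODIFY) &&
            (mark & MARK_SURV_MODIFY))
            return -EEXIST;  /* surviving-modify cannot be dropped */
        return 0;
    }

    int main(void)
    {
        printf("%d\n", may_update(MARK_SURV_MODIFY, REQ_IGNORE)); /* -17 */
        return 0;
    }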
*/ - if ((flags & FAN_MARK_IGNORED_MASK) && + if ((flags & FANOTIFY_MARK_IGNORE_BITS) && !(flags & FAN_MARK_IGNORED_SURV_MODIFY) && inode_is_open_for_write(inode)) return 0; @@ -1513,8 +1552,16 @@ static int fanotify_test_fid(struct dentry *dentry) return 0; } -static int fanotify_events_supported(struct path *path, __u64 mask) +static int fanotify_events_supported(struct fsnotify_group *group, + struct path *path, __u64 mask, + unsigned int flags) { + unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; + /* Strict validation of events in non-dir inode mask with v5.17+ APIs */ + bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) || + (mask & FAN_RENAME) || + (flags & FAN_MARK_IGNORE); + /* * Some filesystems such as 'proc' acquire unusual locks when opening * files. For them fanotify permission events have high chances of @@ -1526,6 +1573,16 @@ static int fanotify_events_supported(struct path *path, __u64 mask) if (mask & FANOTIFY_PERM_EVENTS && path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM) return -EINVAL; + + /* + * We shouldn't have allowed setting dirent events and the directory + * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode, + * but because we always allowed it, error only when using new APIs. + */ + if (strict_dir_events && mark_type == FAN_MARK_INODE && + !d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS)) + return -ENOTDIR; + return 0; } @@ -1540,7 +1597,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, __kernel_fsid_t __fsid, *fsid = NULL; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; - bool ignored = flags & FAN_MARK_IGNORED_MASK; + unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; + unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS; unsigned int obj_type, fid_mode; u32 umask = 0; int ret; @@ -1569,7 +1627,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, return -EINVAL; } - switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { + switch (mark_cmd) { case FAN_MARK_ADD: case FAN_MARK_REMOVE: if (!mask) @@ -1589,9 +1647,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (mask & ~valid_mask) return -EINVAL; - /* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */ - if (ignored) + + /* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */ + if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK)) + return -EINVAL; + + /* + * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with + * FAN_MARK_IGNORED_MASK. 
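fanotify_events_supported() above makes the dir-only check opt-in by API level: groups created with FAN_REPORT_TARGET_FID, masks containing FAN_RENAME, and FAN_MARK_IGNORE requests now get -ENOTDIR when dir-only event bits are put on a non-directory. A hedged sketch, assuming 5.17+ uapi headers for FAN_REPORT_DFID_NAME_TARGET, a kernel with this series, and enough privilege for fanotify_init():

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/fanotify.h>

    int main(void)
    {
        /* a group opted into the newer API surface */
        int fan = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_DFID_NAME_TARGET, 0);

        if (fan < 0)
            return perror("fanotify_init"), 1;
        /* FAN_CREATE is a dirent event: on a regular file this now fails
         * fast with ENOTDIR instead of arming a mark that can never fire */
        if (fanotify_mark(fan, FAN_MARK_ADD, FAN_CREATE,
                          AT_FDCWD, "/etc/hostname") < 0)
            perror("fanotify_mark");
        return 0;
    }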
+ */ + if (ignore == FAN_MARK_IGNORED_MASK) { mask &= ~FANOTIFY_EVENT_FLAGS; + umask = FANOTIFY_EVENT_FLAGS; + } f = fdget(fanotify_fd); if (unlikely(!f.file)) @@ -1655,7 +1723,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME)) goto fput_and_out; - if (flags & FAN_MARK_FLUSH) { + if (mark_cmd == FAN_MARK_FLUSH) { ret = 0; if (mark_type == FAN_MARK_MOUNT) fsnotify_clear_vfsmount_marks_by_group(group); @@ -1671,8 +1739,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (ret) goto fput_and_out; - if (flags & FAN_MARK_ADD) { - ret = fanotify_events_supported(&path, mask); + if (mark_cmd == FAN_MARK_ADD) { + ret = fanotify_events_supported(group, &path, mask, flags); if (ret) goto path_put_and_out; } @@ -1695,17 +1763,11 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, else mnt = path.mnt; - /* - * FAN_RENAME is not allowed on non-dir (for now). - * We shouldn't have allowed setting any dirent events in mask of - * non-dir, but because we always allowed it, error only if group - * was initialized with the new flag FAN_REPORT_TARGET_FID. - */ - ret = -ENOTDIR; - if (inode && !S_ISDIR(inode->i_mode) && - ((mask & FAN_RENAME) || - ((mask & FANOTIFY_DIRENT_EVENTS) && - FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID)))) + ret = mnt ? -EINVAL : -EISDIR; + /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ + if (mark_cmd == FAN_MARK_ADD && ignore == FAN_MARK_IGNORE && + (mnt || S_ISDIR(inode->i_mode)) && + !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) goto path_put_and_out; /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ @@ -1717,12 +1779,12 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, * events with parent/name info for non-directory. 
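Putting the new ignore API together: FAN_MARK_IGNORE honours event flags inside the ignore mask, and for directory, mount, and filesystem marks it must be paired with FAN_MARK_IGNORED_SURV_MODIFY (the hunk above returns -EISDIR for a directory and -EINVAL for mount/sb marks otherwise). A hedged sketch, assuming 6.0-level uapi headers and sufficient privileges:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/fanotify.h>

    int main(void)
    {
        int fan = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY);

        if (fan < 0)
            return perror("fanotify_init"), 1;
        /* suppress FAN_OPEN on /tmp itself and on its children; dropping
         * FAN_MARK_IGNORED_SURV_MODIFY from this call would get -EISDIR */
        if (fanotify_mark(fan, FAN_MARK_ADD | FAN_MARK_IGNORE |
                               FAN_MARK_IGNORED_SURV_MODIFY,
                          FAN_OPEN | FAN_ONDIR | FAN_EVENT_ON_CHILD,
                          AT_FDCWD, "/tmp") < 0)
            perror("fanotify_mark");
        return 0;
    }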
*/ if ((fid_mode & FAN_REPORT_DIR_FID) && - (flags & FAN_MARK_ADD) && !ignored) + (flags & FAN_MARK_ADD) && !ignore) mask |= FAN_EVENT_ON_CHILD; } /* create/update an inode mark */ - switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) { + switch (mark_cmd) { case FAN_MARK_ADD: if (mark_type == FAN_MARK_MOUNT) ret = fanotify_add_vfsmount_mark(group, mnt, mask, @@ -1800,7 +1862,7 @@ static int __init fanotify_user_setup(void) BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 10); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 59fb40abe33d..55081ae3a6ec 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -113,7 +113,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) return; seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", inode->i_ino, inode->i_sb->s_dev, - mflags, mark->mask, mark->ignored_mask); + mflags, mark->mask, mark->ignore_mask); show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); @@ -121,12 +121,12 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) struct mount *mnt = fsnotify_conn_mount(mark->connector); seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", - mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); + mnt->mnt_id, mflags, mark->mask, mark->ignore_mask); } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_SB) { struct super_block *sb = fsnotify_conn_sb(mark->connector); seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n", - sb->s_dev, mflags, mark->mask, mark->ignored_mask); + sb->s_dev, mflags, mark->mask, mark->ignore_mask); } } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 0b3e74935cb4..7974e91ffe13 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -100,7 +100,7 @@ void fsnotify_sb_delete(struct super_block *sb) * Given an inode, first check if we care what happens to our children. Inotify * and dnotify both tell their parents about events. If we care about any event * on a child we run all of our children and set a dentry flag saying that the - * parent cares. Thus when an event happens on a child it can quickly tell if + * parent cares. Thus when an event happens on a child it can quickly tell * if there is a need to find a parent and send the event to the parent. 
*/ void __fsnotify_update_child_dentry_flags(struct inode *inode) @@ -324,7 +324,8 @@ static int send_to_group(__u32 mask, const void *data, int data_type, struct fsnotify_group *group = NULL; __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); __u32 marks_mask = 0; - __u32 marks_ignored_mask = 0; + __u32 marks_ignore_mask = 0; + bool is_dir = mask & FS_ISDIR; struct fsnotify_mark *mark; int type; @@ -336,7 +337,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type, fsnotify_foreach_iter_mark_type(iter_info, mark, type) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) - mark->ignored_mask = 0; + mark->ignore_mask = 0; } } @@ -344,14 +345,15 @@ static int send_to_group(__u32 mask, const void *data, int data_type, fsnotify_foreach_iter_mark_type(iter_info, mark, type) { group = mark->group; marks_mask |= mark->mask; - marks_ignored_mask |= mark->ignored_mask; + marks_ignore_mask |= + fsnotify_effective_ignore_mask(mark, is_dir, type); } - pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignored_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", - __func__, group, mask, marks_mask, marks_ignored_mask, + pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", + __func__, group, mask, marks_mask, marks_ignore_mask, data, data_type, dir, cookie); - if (!(test_mask & marks_mask & ~marks_ignored_mask)) + if (!(test_mask & marks_mask & ~marks_ignore_mask)) return 0; if (group->ops->handle_event) { @@ -423,7 +425,8 @@ static bool fsnotify_iter_select_report_types( * But is *this mark* watching children? */ if (type == FSNOTIFY_ITER_TYPE_PARENT && - !(mark->mask & FS_EVENT_ON_CHILD)) + !(mark->mask & FS_EVENT_ON_CHILD) && + !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD)) continue; fsnotify_iter_set_report_type(iter_info, type); @@ -532,8 +535,8 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, /* - * If this is a modify event we may need to clear some ignored masks. - * In that case, the object with ignored masks will have the FS_MODIFY + * If this is a modify event we may need to clear some ignore masks. + * In that case, the object with ignore masks will have the FS_MODIFY * event in its mask. * Otherwise, return if none of the marks care about this type of event. 
*/ diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ed42a189faa2..1c4bfdab008d 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -136,7 +136,7 @@ static inline u32 inotify_mask_to_arg(__u32 mask) IN_Q_OVERFLOW); } -/* intofiy userspace file descriptor functions */ +/* inotify userspace file descriptor functions */ static __poll_t inotify_poll(struct file *file, poll_table *wait) { struct fsnotify_group *group = file->private_data; diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index 4de597a83b88..52615e6090e1 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -592,8 +592,12 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, a = (ATTR_RECORD*)((u8*)ctx->attr + le32_to_cpu(ctx->attr->length)); for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { - if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec + - le32_to_cpu(ctx->mrec->bytes_allocated)) + u8 *mrec_end = (u8 *)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_allocated); + u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) + + a->name_length * sizeof(ntfschar); + if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end || + name_end > mrec_end) break; ctx->attr = a; if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 7497cd592258..9c67edd215d5 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1146,7 +1146,7 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (status) return status; - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { status = dquot_initialize(inode); if (status) return status; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 337527571461..740b64238312 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -277,7 +277,6 @@ enum ocfs2_mount_options OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ - OCFS2_MOUNT_NOCLUSTER = 1 << 18, /* No cluster aware filesystem mount */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -673,8 +672,7 @@ static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) static inline int ocfs2_mount_local(struct ocfs2_super *osb) { - return ((osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT) - || (osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER)); + return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); } static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 0b0ae3ebb0cf..da7718cef735 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -252,16 +252,14 @@ static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, int i, ret = -ENOSPC; if ((preferred >= 0) && (preferred < si->si_num_slots)) { - if (!si->si_slots[preferred].sl_valid || - !si->si_slots[preferred].sl_node_num) { + if (!si->si_slots[preferred].sl_valid) { ret = preferred; goto out; } } for(i = 0; i < si->si_num_slots; i++) { - if (!si->si_slots[i].sl_valid || - !si->si_slots[i].sl_node_num) { + if (!si->si_slots[i].sl_valid) { ret = i; break; } @@ -456,30 +454,24 @@ int ocfs2_find_slot(struct ocfs2_super *osb) spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); - if (ocfs2_mount_local(osb)) - /* use slot 0 directly in local mode */ - slot = 0; - else { - /* search for ourselves first 
and take the slot if it already - * exists. Perhaps we need to mark this in a variable for our - * own journal recovery? Possibly not, though we certainly - * need to warn to the user */ - slot = __ocfs2_node_num_to_slot(si, osb->node_num); + /* search for ourselves first and take the slot if it already + * exists. Perhaps we need to mark this in a variable for our + * own journal recovery? Possibly not, though we certainly + * need to warn to the user */ + slot = __ocfs2_node_num_to_slot(si, osb->node_num); + if (slot < 0) { + /* if no slot yet, then just take 1st available + * one. */ + slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); if (slot < 0) { - /* if no slot yet, then just take 1st available - * one. */ - slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); - if (slot < 0) { - spin_unlock(&osb->osb_lock); - mlog(ML_ERROR, "no free slots available!\n"); - status = -EINVAL; - goto bail; - } - } else - printk(KERN_INFO "ocfs2: Slot %d on device (%s) was " - "already allocated to this node!\n", - slot, osb->dev_str); - } + spin_unlock(&osb->osb_lock); + mlog(ML_ERROR, "no free slots available!\n"); + status = -EINVAL; + goto bail; + } + } else + printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already " + "allocated to this node!\n", slot, osb->dev_str); ocfs2_set_slot(si, slot, osb->node_num); osb->slot_num = slot; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index f7298816d8d9..438be028935d 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -172,7 +172,6 @@ enum { Opt_dir_resv_level, Opt_journal_async_commit, Opt_err_cont, - Opt_nocluster, Opt_err, }; @@ -206,7 +205,6 @@ static const match_table_t tokens = { {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_journal_async_commit, "journal_async_commit"}, {Opt_err_cont, "errors=continue"}, - {Opt_nocluster, "nocluster"}, {Opt_err, NULL} }; @@ -618,13 +616,6 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto out; } - tmp = OCFS2_MOUNT_NOCLUSTER; - if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { - ret = -EINVAL; - mlog(ML_ERROR, "Cannot change nocluster option on remount\n"); - goto out; - } - tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | OCFS2_MOUNT_HB_NONE; if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { @@ -865,7 +856,6 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, } if (ocfs2_userspace_stack(osb) && - !(osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && strncmp(osb->osb_cluster_stack, mopt->cluster_stack, OCFS2_STACK_LABEL_LEN)) { mlog(ML_ERROR, @@ -1137,11 +1127,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? 
"writeback" : "ordered"); - if ((osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && - !(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT)) - printk(KERN_NOTICE "ocfs2: The shared device (%s) is mounted " - "without cluster aware mode.\n", osb->dev_str); - atomic_set(&osb->vol_state, VOLUME_MOUNTED); wake_up(&osb->osb_mount_event); @@ -1452,9 +1437,6 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_journal_async_commit: mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; break; - case Opt_nocluster: - mopt->mount_opt |= OCFS2_MOUNT_NOCLUSTER; - break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1566,9 +1548,6 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) seq_printf(s, ",journal_async_commit"); - if (opts & OCFS2_MOUNT_NOCLUSTER) - seq_printf(s, ",nocluster"); - return 0; } diff --git a/fs/open.c b/fs/open.c index 1d57fbde2feb..2790aac66e58 100644 --- a/fs/open.c +++ b/fs/open.c @@ -663,6 +663,42 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) return do_fchmodat(AT_FDCWD, filename, mode); } +/** + * setattr_vfsuid - check and set ia_fsuid attribute + * @kuid: new inode owner + * + * Check whether @kuid is valid and if so generate and set vfsuid_t in + * ia_vfsuid. + * + * Return: true if @kuid is valid, false if not. + */ +static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) +{ + if (!uid_valid(kuid)) + return false; + attr->ia_valid |= ATTR_UID; + attr->ia_vfsuid = VFSUIDT_INIT(kuid); + return true; +} + +/** + * setattr_vfsgid - check and set ia_fsgid attribute + * @kgid: new inode owner + * + * Check whether @kgid is valid and if so generate and set vfsgid_t in + * ia_vfsgid. + * + * Return: true if @kgid is valid, false if not. + */ +static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) +{ + if (!gid_valid(kgid)) + return false; + attr->ia_valid |= ATTR_GID; + attr->ia_vfsgid = VFSGIDT_INIT(kgid); + return true; +} + int chown_common(const struct path *path, uid_t user, gid_t group) { struct user_namespace *mnt_userns, *fs_userns; @@ -678,28 +714,22 @@ int chown_common(const struct path *path, uid_t user, gid_t group) mnt_userns = mnt_user_ns(path->mnt); fs_userns = i_user_ns(inode); - uid = mapped_kuid_user(mnt_userns, fs_userns, uid); - gid = mapped_kgid_user(mnt_userns, fs_userns, gid); retry_deleg: newattrs.ia_valid = ATTR_CTIME; - if (user != (uid_t) -1) { - if (!uid_valid(uid)) - return -EINVAL; - newattrs.ia_valid |= ATTR_UID; - newattrs.ia_uid = uid; - } - if (group != (gid_t) -1) { - if (!gid_valid(gid)) - return -EINVAL; - newattrs.ia_valid |= ATTR_GID; - newattrs.ia_gid = gid; - } + if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid)) + return -EINVAL; + if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) + return -EINVAL; if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; inode_lock(inode); - error = security_path_chown(path, uid, gid); + /* Continue to send actual fs values, not the mount values. 
*/ + error = security_path_chown( + path, + from_vfsuid(mnt_userns, fs_userns, newattrs.ia_vfsuid), + from_vfsgid(mnt_userns, fs_userns, newattrs.ia_vfsgid)); if (!error) error = notify_change(mnt_userns, path->dentry, &newattrs, &delegated_inode); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 714ec569d25b..245e2cb62708 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -331,8 +331,8 @@ int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry, if (!err) { struct iattr attr = { .ia_valid = ATTR_UID | ATTR_GID, - .ia_uid = stat->uid, - .ia_gid = stat->gid, + .ia_vfsuid = VFSUIDT_INIT(stat->uid), + .ia_vfsgid = VFSGIDT_INIT(stat->gid), }; err = ovl_do_notify_change(ofs, upperdentry, &attr); } diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 492eddeb481f..7922b619f6c8 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -454,23 +454,94 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) return res; } +/* + * Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone + * of the POSIX ACLs retrieved from the lower layer to this function to not + * alter the POSIX ACLs for the underlying filesystem. + */ +static void ovl_idmap_posix_acl(struct user_namespace *mnt_userns, + struct posix_acl *acl) +{ + for (unsigned int i = 0; i < acl->a_count; i++) { + vfsuid_t vfsuid; + vfsgid_t vfsgid; + + struct posix_acl_entry *e = &acl->a_entries[i]; + switch (e->e_tag) { + case ACL_USER: + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, e->e_uid); + e->e_uid = vfsuid_into_kuid(vfsuid); + break; + case ACL_GROUP: + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, e->e_gid); + e->e_gid = vfsgid_into_kgid(vfsgid); + break; + } + } +} + +/* + * When the relevant layer is an idmapped mount we need to take the idmapping + * of the layer into account and translate any ACL_{GROUP,USER} values + * according to the idmapped mount. + * + * We cannot alter the ACLs returned from the relevant layer as that would + * alter the cached values filesystem wide for the lower filesystem. Instead we + * can clone the ACLs and then apply the relevant idmapping of the layer. + * + * This is obviously only relevant when idmapped layers are used. + */ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) { struct inode *realinode = ovl_inode_real(inode); - const struct cred *old_cred; - struct posix_acl *acl; + struct posix_acl *acl, *clone; + struct path realpath; if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode)) return NULL; - if (rcu) - return get_cached_acl_rcu(realinode, type); + /* Careful in RCU walk mode */ + ovl_i_path_real(inode, &realpath); + if (!realpath.dentry) { + WARN_ON(!rcu); + return ERR_PTR(-ECHILD); + } - old_cred = ovl_override_creds(inode->i_sb); - acl = get_acl(realinode, type); - revert_creds(old_cred); + if (rcu) { + acl = get_cached_acl_rcu(realinode, type); + } else { + const struct cred *old_cred; + + old_cred = ovl_override_creds(inode->i_sb); + acl = get_acl(realinode, type); + revert_creds(old_cred); + } + /* + * If there are no POSIX ACLs, or we encountered an error, + * or the layer isn't idmapped we don't need to do anything. + */ + if (!is_idmapped_mnt(realpath.mnt) || IS_ERR_OR_NULL(acl)) + return acl; - return acl; + /* + * We only get here if the layer is idmapped. So drop out of RCU path + * walk so we can clone the ACLs. There's no need to release the ACLs + * since get_cached_acl_rcu() doesn't take a reference on the ACLs. 
+ */ + if (rcu) + return ERR_PTR(-ECHILD); + + clone = posix_acl_clone(acl, GFP_KERNEL); + if (!clone) + clone = ERR_PTR(-ENOMEM); + else + ovl_idmap_posix_acl(mnt_user_ns(realpath.mnt), clone); + /* + * Since we're not in RCU path walk we always need to release the + * original ACLs. + */ + posix_acl_release(acl); + return clone; } int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 4f34b7e02eee..6ec815b84d48 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -139,17 +139,7 @@ static inline int ovl_do_notify_change(struct ovl_fs *ofs, struct dentry *upperdentry, struct iattr *attr) { - struct user_namespace *upper_mnt_userns = ovl_upper_mnt_userns(ofs); - struct user_namespace *fs_userns = i_user_ns(d_inode(upperdentry)); - - if (attr->ia_valid & ATTR_UID) - attr->ia_uid = mapped_kuid_user(upper_mnt_userns, - fs_userns, attr->ia_uid); - if (attr->ia_valid & ATTR_GID) - attr->ia_gid = mapped_kgid_user(upper_mnt_userns, - fs_userns, attr->ia_gid); - - return notify_change(upper_mnt_userns, upperdentry, attr, NULL); + return notify_change(ovl_upper_mnt_userns(ofs), upperdentry, attr, NULL); } static inline int ovl_do_rmdir(struct ovl_fs *ofs, @@ -259,7 +249,8 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, value, size, flags); + int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, + (void *)value, size, flags); pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n", dentry, name, min((int)size, 48), value, size, flags, err); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 962d32468eb4..1d17d7b13dcd 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -199,7 +199,7 @@ EXPORT_SYMBOL(posix_acl_alloc); /* * Clone an ACL. */ -static struct posix_acl * +struct posix_acl * posix_acl_clone(const struct posix_acl *acl, gfp_t flags) { struct posix_acl *clone = NULL; @@ -213,6 +213,7 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags) } return clone; } +EXPORT_SYMBOL_GPL(posix_acl_clone); /* * Check if an acl is valid. Returns 0 if it is, or -E... otherwise. 
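The clone-before-remap rule above generalizes beyond overlayfs: any consumer of a cached ACL on an idmapped layer must copy it before touching the ids, which is why posix_acl_clone() is exported by this series. A minimal sketch of that pattern using only the helpers the patch itself uses; the function name remap_acl_for_layer is illustrative, not part of the patch:

/* Sketch only: clone a cached ACL and translate ACL_USER/ACL_GROUP ids
 * through an idmapped layer, mirroring ovl_idmap_posix_acl() above. The
 * caller owns the returned ACL and must posix_acl_release() it; the
 * original cached ACL is left untouched.
 */
static struct posix_acl *remap_acl_for_layer(struct user_namespace *mnt_userns,
					     struct posix_acl *acl)
{
	struct posix_acl *clone;
	unsigned int i;

	/* Never modify @acl in place; the cached copy is shared fs-wide. */
	clone = posix_acl_clone(acl, GFP_KERNEL);
	if (!clone)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < clone->a_count; i++) {
		struct posix_acl_entry *e = &clone->a_entries[i];

		switch (e->e_tag) {
		case ACL_USER:
			e->e_uid = vfsuid_into_kuid(make_vfsuid(mnt_userns,
						&init_user_ns, e->e_uid));
			break;
		case ACL_GROUP:
			e->e_gid = vfsgid_into_kgid(make_vfsgid(mnt_userns,
						&init_user_ns, e->e_gid));
			break;
		}
	}
	return clone;
}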
@@ -361,8 +362,8 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, { const struct posix_acl_entry *pa, *pe, *mask_obj; int found = 0; - kuid_t uid; - kgid_t gid; + vfsuid_t vfsuid; + vfsgid_t vfsgid; want &= MAY_READ | MAY_WRITE | MAY_EXEC; @@ -370,30 +371,28 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, switch(pa->e_tag) { case ACL_USER_OBJ: /* (May have been checked already) */ - uid = i_uid_into_mnt(mnt_userns, inode); - if (uid_eq(uid, current_fsuid())) + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto check_perm; break; case ACL_USER: - uid = mapped_kuid_fs(mnt_userns, - i_user_ns(inode), + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, pa->e_uid); - if (uid_eq(uid, current_fsuid())) + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto mask; break; case ACL_GROUP_OBJ: - gid = i_gid_into_mnt(mnt_userns, inode); - if (in_group_p(gid)) { + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; } break; case ACL_GROUP: - gid = mapped_kgid_fs(mnt_userns, - i_user_ns(inode), + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, pa->e_gid); - if (in_group_p(gid)) { + if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; @@ -699,7 +698,7 @@ int posix_acl_update_mode(struct user_namespace *mnt_userns, return error; if (error == 0) *acl = NULL; - if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) && + if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; @@ -710,46 +709,127 @@ EXPORT_SYMBOL(posix_acl_update_mode); /* * Fix up the uids and gids in posix acl extended attributes in place. 
*/ -static void posix_acl_fix_xattr_userns( - struct user_namespace *to, struct user_namespace *from, - struct user_namespace *mnt_userns, - void *value, size_t size, bool from_user) +static int posix_acl_fix_xattr_common(void *value, size_t size) +{ + struct posix_acl_xattr_header *header = value; + int count; + + if (!header) + return -EINVAL; + if (size < sizeof(struct posix_acl_xattr_header)) + return -EINVAL; + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return -EINVAL; + + count = posix_acl_xattr_count(size); + if (count < 0) + return -EINVAL; + if (count == 0) + return -EINVAL; + + return count; +} + +void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, + void *value, size_t size) { struct posix_acl_xattr_header *header = value; struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; int count; + vfsuid_t vfsuid; + vfsgid_t vfsgid; kuid_t uid; kgid_t gid; - if (!value) + if (no_idmapping(mnt_userns, i_user_ns(inode))) return; - if (size < sizeof(struct posix_acl_xattr_header)) + + count = posix_acl_fix_xattr_common(value, size); + if (count < 0) return; - if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + + for (end = entry + count; entry != end; entry++) { + switch (le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, uid); + entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, + vfsuid_into_kuid(vfsuid))); + break; + case ACL_GROUP: + gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, gid); + entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, + vfsgid_into_kgid(vfsgid))); + break; + default: + break; + } + } +} + +void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, + void *value, size_t size) +{ + struct posix_acl_xattr_header *header = value; + struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; + int count; + vfsuid_t vfsuid; + vfsgid_t vfsgid; + kuid_t uid; + kgid_t gid; + + if (no_idmapping(mnt_userns, i_user_ns(inode))) return; - count = posix_acl_xattr_count(size); + count = posix_acl_fix_xattr_common(value, size); if (count < 0) return; - if (count == 0) + + for (end = entry + count; entry != end; entry++) { + switch (le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsuid = VFSUIDT_INIT(uid); + uid = from_vfsuid(mnt_userns, &init_user_ns, vfsuid); + entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, uid)); + break; + case ACL_GROUP: + gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsgid = VFSGIDT_INIT(gid); + gid = from_vfsgid(mnt_userns, &init_user_ns, vfsgid); + entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, gid)); + break; + default: + break; + } + } +} + +static void posix_acl_fix_xattr_userns( + struct user_namespace *to, struct user_namespace *from, + void *value, size_t size) +{ + struct posix_acl_xattr_header *header = value; + struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; + int count; + kuid_t uid; + kgid_t gid; + + count = posix_acl_fix_xattr_common(value, size); + if (count < 0) return; for (end = entry + count; entry != end; entry++) { switch(le16_to_cpu(entry->e_tag)) { case ACL_USER: uid = make_kuid(from, le32_to_cpu(entry->e_id)); - if (from_user) - uid = mapped_kuid_user(mnt_userns, &init_user_ns, uid); - else - uid = mapped_kuid_fs(mnt_userns, 
&init_user_ns, uid); entry->e_id = cpu_to_le32(from_kuid(to, uid)); break; case ACL_GROUP: gid = make_kgid(from, le32_to_cpu(entry->e_id)); - if (from_user) - gid = mapped_kgid_user(mnt_userns, &init_user_ns, gid); - else - gid = mapped_kgid_fs(mnt_userns, &init_user_ns, gid); entry->e_id = cpu_to_le32(from_kgid(to, gid)); break; default: @@ -758,34 +838,20 @@ static void posix_acl_fix_xattr_userns( } } -void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size) +void posix_acl_fix_xattr_from_user(void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); - - /* Leave ids untouched on non-idmapped mounts. */ - if (no_idmapping(mnt_userns, i_user_ns(inode))) - mnt_userns = &init_user_ns; - if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) + if (user_ns == &init_user_ns) return; - posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value, - size, true); + posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); } -void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size) +void posix_acl_fix_xattr_to_user(void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); - - /* Leave ids untouched on non-idmapped mounts. */ - if (no_idmapping(mnt_userns, i_user_ns(inode))) - mnt_userns = &init_user_ns; - if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) + if (user_ns == &init_user_ns) return; - posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value, - size, false); + posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); } /* diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 09d1307959d0..28966da7834e 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2085,7 +2085,8 @@ EXPORT_SYMBOL(__dquot_transfer); /* Wrapper for transferring ownership of an inode for uid/gid only * Called from FSXXX_setattr() */ -int dquot_transfer(struct inode *inode, struct iattr *iattr) +int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode, + struct iattr *iattr) { struct dquot *transfer_to[MAXQUOTAS] = {}; struct dquot *dquot; @@ -2095,8 +2096,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) if (!dquot_active(inode)) return 0; - if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)){ - dquot = dqget(sb, make_kqid_uid(iattr->ia_uid)); + if (i_uid_needs_update(mnt_userns, iattr, inode)) { + kuid_t kuid = from_vfsuid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsuid); + + dquot = dqget(sb, make_kqid_uid(kuid)); if (IS_ERR(dquot)) { if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); @@ -2106,8 +2110,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) } transfer_to[USRQUOTA] = dquot; } - if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)){ - dquot = dqget(sb, make_kqid_gid(iattr->ia_gid)); + if (i_gid_needs_update(mnt_userns, iattr, inode)) { + kgid_t kgid = from_vfsgid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsgid); + + dquot = dqget(sb, make_kqid_gid(kgid)); if (IS_ERR(dquot)) { if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); diff --git a/fs/read_write.c b/fs/read_write.c index b1b1cdfee9d3..397da0236607 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1263,6 +1263,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, count, fl); file_end_write(out.file); } else { + if (out.file->f_flags & O_NONBLOCK) + fl |= SPLICE_F_NONBLOCK; + retval = 
splice_file_to_pipe(in.file, opipe, &pos, count, fl); } @@ -1397,28 +1400,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, } EXPORT_SYMBOL(generic_copy_file_range); -static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t len, unsigned int flags) -{ - /* - * Although we now allow filesystems to handle cross sb copy, passing - * a file of the wrong filesystem type to filesystem driver can result - * in an attempt to dereference the wrong type of ->private_data, so - * avoid doing that until we really have a good reason. NFS defines - * several different file_system_type structures, but they all end up - * using the same ->copy_file_range() function pointer. - */ - if (file_out->f_op->copy_file_range && - file_out->f_op->copy_file_range == file_in->f_op->copy_file_range) - return file_out->f_op->copy_file_range(file_in, pos_in, - file_out, pos_out, - len, flags); - - return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len, - flags); -} - /* * Performs necessary checks before doing a file copy * @@ -1440,6 +1421,24 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, if (ret) return ret; + /* + * We allow some filesystems to handle cross sb copy, but passing + * a file of the wrong filesystem type to filesystem driver can result + * in an attempt to dereference the wrong type of ->private_data, so + * avoid doing that until we really have a good reason. + * + * nfs and cifs define several different file_system_type structures + * and several different sets of file_operations, but they all end up + * using the same ->copy_file_range() function pointer. + */ + if (file_out->f_op->copy_file_range) { + if (file_in->f_op->copy_file_range != + file_out->f_op->copy_file_range) + return -EXDEV; + } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) { + return -EXDEV; + } + /* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode_out)) return -EPERM; @@ -1505,26 +1504,41 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, file_start_write(file_out); /* - * Try cloning first, this is supported by more file systems, and - * more efficient if both clone and copy are supported (e.g. NFS). + * Cloning is supported by more file systems, so we implement copy on + * same sb using clone, but for filesystems where both clone and copy + * are supported (e.g. nfs,cifs), we only call the copy method. */ + if (file_out->f_op->copy_file_range) { + ret = file_out->f_op->copy_file_range(file_in, pos_in, + file_out, pos_out, + len, flags); + goto done; + } + if (file_in->f_op->remap_file_range && file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) { - loff_t cloned; - - cloned = file_in->f_op->remap_file_range(file_in, pos_in, + ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, min_t(loff_t, MAX_RW_COUNT, len), REMAP_FILE_CAN_SHORTEN); - if (cloned > 0) { - ret = cloned; + if (ret > 0) goto done; - } } - ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len, - flags); - WARN_ON_ONCE(ret == -EOPNOTSUPP); + /* + * We can get here for same sb copy of filesystems that do not implement + * ->copy_file_range() in case filesystem does not support clone or in + * case filesystem supports clone but rejected the clone request (e.g. + * because it was not block aligned). 
+ * + * In both cases, fall back to kernel copy so we are able to maintain a + * consistent story about which filesystems support copy_file_range() + * and which filesystems do not, that will allow userspace tools to + * make consistent decisions w.r.t using copy_file_range(). + */ + ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len, + flags); + done: if (ret > 0) { fsnotify_access(file_in); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 0cffe054b78e..0df48d176732 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -290,7 +290,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block, struct buffer_head *bh; struct item_head *ih, tmp_ih; b_blocknr_t blocknr; - char *p = NULL; + char *p; int chars; int ret; int result; @@ -305,8 +305,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block, result = search_for_position_by_key(inode->i_sb, &key, &path); if (result != POSITION_FOUND) { pathrelse(&path); - if (p) - kunmap(bh_result->b_page); if (result == IO_ERROR) return -EIO; /* @@ -352,8 +350,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block, } pathrelse(&path); - if (p) - kunmap(bh_result->b_page); return ret; } /* requested data are in direct item(s) */ @@ -363,8 +359,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block, * when it is stored in direct item(s) */ pathrelse(&path); - if (p) - kunmap(bh_result->b_page); return -ENOENT; } @@ -396,9 +390,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block, * sure we need to. But, this means the item might move if * kmap schedules */ - if (!p) - p = (char *)kmap(bh_result->b_page); - + p = (char *)kmap(bh_result->b_page); p += offset; memset(p, 0, inode->i_sb->s_blocksize); do { @@ -3284,7 +3276,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, /* must be turned off for recursive notify_change calls */ ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { error = dquot_initialize(inode); if (error) return error; @@ -3367,7 +3359,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, reiserfs_write_unlock(inode->i_sb); if (error) goto out; - error = dquot_transfer(inode, attr); + error = dquot_transfer(mnt_userns, inode, attr); reiserfs_write_lock(inode->i_sb); if (error) { journal_end(&th); diff --git a/fs/remap_range.c b/fs/remap_range.c index e112b5424cdb..881a306ee247 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -71,7 +71,8 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, * Otherwise, make sure the count is also block-aligned, having * already confirmed the starting offsets' block alignment. */ - if (pos_in + count == size_in) { + if (pos_in + count == size_in && + (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) { bcount = ALIGN(size_in, bs) - pos_in; } else { if (!IS_ALIGNED(count, bs)) diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index de7252715b12..81d26abf486f 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -553,7 +553,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) * * Only one instances directory is allowed. * - * The instances directory is special as it allows for mkdir and rmdir to + * The instances directory is special as it allows for mkdir and rmdir
When a mkdir or rmdir is performed, the inode * locks are released and the methods passed in (@mkdir and @rmdir) are * called without locks and with the name of the directory being created diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e943370107d0..de86f5b2859f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -192,17 +192,19 @@ static inline void msg_init(struct uffd_msg *msg) } static inline struct uffd_msg userfault_msg(unsigned long address, + unsigned long real_address, unsigned int flags, unsigned long reason, unsigned int features) { struct uffd_msg msg; + msg_init(&msg); msg.event = UFFD_EVENT_PAGEFAULT; - if (!(features & UFFD_FEATURE_EXACT_ADDRESS)) - address &= PAGE_MASK; - msg.arg.pagefault.address = address; + msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ? + real_address : address; + /* * These flags indicate why the userfault occurred: * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault. @@ -488,8 +490,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason, - ctx->features); + uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags, + reason, ctx->features); uwq.ctx = ctx; uwq.waken = false; diff --git a/fs/xattr.c b/fs/xattr.c index e8dd03e4561e..a1f4998bc6be 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -282,9 +282,15 @@ out: } EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); +static inline bool is_posix_acl_xattr(const char *name) +{ + return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || + (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0); +} + int vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, - const char *name, const void *value, size_t size, int flags) + const char *name, void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; @@ -292,12 +298,16 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { - error = cap_convert_nscap(mnt_userns, dentry, &value, size); + error = cap_convert_nscap(mnt_userns, dentry, + (const void **)&value, size); if (error < 0) return error; size = error; } + if (size && is_posix_acl_xattr(name)) + posix_acl_setxattr_idmapped_mnt(mnt_userns, inode, value, size); + retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size, @@ -431,7 +441,10 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, return ret; } nolsm: - return __vfs_getxattr(dentry, inode, name, value, size); + error = __vfs_getxattr(dentry, inode, name, value, size); + if (error > 0 && is_posix_acl_xattr(name)) + posix_acl_getxattr_idmapped_mnt(mnt_userns, inode, value, size); + return error; } EXPORT_SYMBOL_GPL(vfs_getxattr); @@ -577,8 +590,7 @@ static void setxattr_convert(struct user_namespace *mnt_userns, if (ctx->size && ((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))) - posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d), - ctx->kvalue, ctx->size); + posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size); } int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, @@ -695,8 +707,7 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, if (error > 0) { if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || 
(strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_to_user(mnt_userns, d_inode(d), - ctx->kvalue, error); + posix_acl_fix_xattr_to_user(ctx->kvalue, error); if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 1824f61621a2..224649a76cbb 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -50,7 +50,7 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp); -STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp); +STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args); /* * Internal routines when attribute list is more than one block. @@ -393,16 +393,10 @@ xfs_attr_sf_addname( * It won't fit in the shortform, transform to a leaf block. GROT: * another possible req'mt for a double-split btree op. */ - error = xfs_attr_shortform_to_leaf(args, &attr->xattri_leaf_bp); + error = xfs_attr_shortform_to_leaf(args); if (error) return error; - /* - * Prevent the leaf buffer from being unlocked so that a concurrent AIL - * push cannot grab the half-baked leaf buffer and run into problems - * with the write verifier. - */ - xfs_trans_bhold(args->trans, attr->xattri_leaf_bp); attr->xattri_dela_state = XFS_DAS_LEAF_ADD; out: trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp); @@ -447,11 +441,9 @@ xfs_attr_leaf_addname( /* * Use the leaf buffer we may already hold locked as a result of - * a sf-to-leaf conversion. The held buffer is no longer valid - * after this call, regardless of the result. + * a sf-to-leaf conversion. */ - error = xfs_attr_leaf_try_add(args, attr->xattri_leaf_bp); - attr->xattri_leaf_bp = NULL; + error = xfs_attr_leaf_try_add(args); if (error == -ENOSPC) { error = xfs_attr3_leaf_to_node(args); @@ -497,8 +489,6 @@ xfs_attr_node_addname( struct xfs_da_args *args = attr->xattri_da_args; int error; - ASSERT(!attr->xattri_leaf_bp); - error = xfs_attr_node_addname_find_attr(attr); if (error) return error; @@ -1215,24 +1205,14 @@ xfs_attr_restore_rmt_blk( */ STATIC int xfs_attr_leaf_try_add( - struct xfs_da_args *args, - struct xfs_buf *bp) + struct xfs_da_args *args) { + struct xfs_buf *bp; int error; - /* - * If the caller provided a buffer to us, it is locked and held in - * the transaction because it just did a shortform to leaf conversion. - * Hence we don't need to read it again. Otherwise read in the leaf - * buffer. - */ - if (bp) { - xfs_trans_bhold_release(args->trans, bp); - } else { - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); - if (error) - return error; - } + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + if (error) + return error; /* * Look up the xattr name to set the insertion point for the new xattr. 
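Worth noting: the new is_posix_acl_xattr() helper introduced in fs/xattr.c above could equally replace the open-coded name comparisons that setxattr_convert() and do_getxattr() still carry. A sketch of that consolidation, an editorial suggestion rather than part of this series, reusing only identifiers that appear in the hunks above:

/* Sketch only: route the userspace fixup paths through is_posix_acl_xattr()
 * instead of duplicating the XATTR_NAME_POSIX_ACL_* strcmp() pairs.
 */
static void setxattr_convert(struct user_namespace *mnt_userns,
			     struct dentry *d, struct xattr_ctx *ctx)
{
	if (ctx->size && is_posix_acl_xattr(ctx->kname->name))
		posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size);
}

The corresponding branch in do_getxattr() would shrink the same way, passing the returned size through posix_acl_fix_xattr_to_user() unchanged.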
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index b4a2fc77017e..dfb47fa63c6d 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -515,11 +515,6 @@ struct xfs_attr_intent { */ struct xfs_attri_log_nameval *xattri_nameval; - /* - * Used by xfs_attr_set to hold a leaf buffer across a transaction roll - */ - struct xfs_buf *xattri_leaf_bp; - /* Used to keep track of current state of delayed operation */ enum xfs_delattr_state xattri_dela_state; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 37e7c33f6283..8f47396f8dd2 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -289,6 +289,23 @@ xfs_attr3_leaf_verify_entry( return NULL; } +/* + * Validate an attribute leaf block. + * + * Empty leaf blocks can occur under the following circumstances: + * + * 1. setxattr adds a new extended attribute to a file; + * 2. The file has zero existing attributes; + * 3. The attribute is too large to fit in the attribute fork; + * 4. The attribute is small enough to fit in a leaf block; + * 5. A log flush occurs after committing the transaction that creates + * the (empty) leaf block; and + * 6. The filesystem goes down after the log flush but before the new + * attribute can be committed to the leaf block. + * + * Hence we need to ensure that we don't fail the validation purely + * because the leaf is empty. + */ static xfs_failaddr_t xfs_attr3_leaf_verify( struct xfs_buf *bp) @@ -311,15 +328,6 @@ xfs_attr3_leaf_verify( return fa; /* - * Empty leaf blocks should never occur; they imply the existence of a - * software bug that needs fixing. xfs_repair also flags them as a - * corruption that needs fixing, so we should never let these go to - * disk. - */ - if (ichdr.count == 0) - return __this_address; - - /* * firstused is the block offset of the first name info structure. * Make sure it doesn't go off the block or crash into the header. */ @@ -922,14 +930,10 @@ xfs_attr_shortform_getvalue( return -ENOATTR; } -/* - * Convert from using the shortform to the leaf. On success, return the - * buffer so that we can keep it locked until we're totally done with it. - */ +/* Convert from using the shortform to the leaf format. 
*/ int xfs_attr_shortform_to_leaf( - struct xfs_da_args *args, - struct xfs_buf **leaf_bp) + struct xfs_da_args *args) { struct xfs_inode *dp; struct xfs_attr_shortform *sf; @@ -991,7 +995,6 @@ xfs_attr_shortform_to_leaf( sfe = xfs_attr_sf_nextentry(sfe); } error = 0; - *leaf_bp = bp; out: kmem_free(tmpbuffer); return error; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index efa757f1e912..368f4d9fa1d5 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -49,8 +49,7 @@ void xfs_attr_shortform_create(struct xfs_da_args *args); void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); -int xfs_attr_shortform_to_leaf(struct xfs_da_args *args, - struct xfs_buf **leaf_bp); +int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_sf_removename(struct xfs_da_args *args); int xfs_attr_sf_findname(struct xfs_da_args *args, struct xfs_attr_sf_entry **sfep, diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 135d44133477..5077a7ad5646 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -576,7 +576,7 @@ xfs_attri_item_recover( struct xfs_trans_res tres; struct xfs_attri_log_format *attrp; struct xfs_attri_log_nameval *nv = attrip->attri_nameval; - int error, ret = 0; + int error; int total; int local; struct xfs_attrd_log_item *done_item = NULL; @@ -655,29 +655,32 @@ xfs_attri_item_recover( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - ret = xfs_xattri_finish_update(attr, done_item); - if (ret == -EAGAIN) { - /* There's more work to do, so add it to this transaction */ + error = xfs_xattri_finish_update(attr, done_item); + if (error == -EAGAIN) { + /* + * There's more work to do, so add the intent item to this + * transaction so that we can continue it later. + */ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list); - } else - error = ret; + error = xfs_defer_ops_capture_and_commit(tp, capture_list); + if (error) + goto out_unlock; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_irele(ip); + return 0; + } if (error) { xfs_trans_cancel(tp); goto out_unlock; } error = xfs_defer_ops_capture_and_commit(tp, capture_list); - out_unlock: - if (attr->xattri_leaf_bp) - xfs_buf_relse(attr->xattri_leaf_bp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_irele(ip); out: - if (ret != -EAGAIN) - xfs_attr_free_item(attr); + xfs_attr_free_item(attr); return error; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 52be58372c63..85e1a26c92e8 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -686,6 +686,8 @@ xfs_can_free_eofblocks( * forever. 
*/ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); + if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) + end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) return false; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 5269354b1b69..2609825d53ee 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -440,7 +440,7 @@ xfs_inodegc_queue_all( for_each_online_cpu(cpu) { gc = per_cpu_ptr(mp->m_inodegc, cpu); if (!llist_empty(&gc->list)) - queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); + mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); } } @@ -1841,8 +1841,8 @@ void xfs_inodegc_worker( struct work_struct *work) { - struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc, - work); + struct xfs_inodegc *gc = container_of(to_delayed_work(work), + struct xfs_inodegc, work); struct llist_node *node = llist_del_all(&gc->list); struct xfs_inode *ip, *n; @@ -1862,19 +1862,29 @@ xfs_inodegc_worker( } /* - * Force all currently queued inode inactivation work to run immediately and - * wait for the work to finish. + * Expedite all pending inodegc work to run immediately. This does not wait for + * completion of the work. */ void -xfs_inodegc_flush( +xfs_inodegc_push( struct xfs_mount *mp) { if (!xfs_is_inodegc_enabled(mp)) return; + trace_xfs_inodegc_push(mp, __return_address); + xfs_inodegc_queue_all(mp); +} +/* + * Force all currently queued inode inactivation work to run immediately and + * wait for the work to finish. + */ +void +xfs_inodegc_flush( + struct xfs_mount *mp) +{ + xfs_inodegc_push(mp); trace_xfs_inodegc_flush(mp, __return_address); - - xfs_inodegc_queue_all(mp); flush_workqueue(mp->m_inodegc_wq); } @@ -2014,6 +2024,7 @@ xfs_inodegc_queue( struct xfs_inodegc *gc; int items; unsigned int shrinker_hits; + unsigned long queue_delay = 1; trace_xfs_inode_set_need_inactive(ip); spin_lock(&ip->i_flags_lock); @@ -2025,19 +2036,26 @@ xfs_inodegc_queue( items = READ_ONCE(gc->items); WRITE_ONCE(gc->items, items + 1); shrinker_hits = READ_ONCE(gc->shrinker_hits); - put_cpu_ptr(gc); - if (!xfs_is_inodegc_enabled(mp)) + /* + * We queue the work while holding the current CPU so that the work + * is scheduled to run on this CPU. 
+ */ + if (!xfs_is_inodegc_enabled(mp)) { + put_cpu_ptr(gc); return; - - if (xfs_inodegc_want_queue_work(ip, items)) { - trace_xfs_inodegc_queue(mp, __return_address); - queue_work(mp->m_inodegc_wq, &gc->work); } + if (xfs_inodegc_want_queue_work(ip, items)) + queue_delay = 0; + + trace_xfs_inodegc_queue(mp, __return_address); + mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay); + put_cpu_ptr(gc); + if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { trace_xfs_inodegc_throttle(mp, __return_address); - flush_work(&gc->work); + flush_delayed_work(&gc->work); } } @@ -2054,7 +2072,7 @@ xfs_inodegc_cpu_dead( unsigned int count = 0; dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu); - cancel_work_sync(&dead_gc->work); + cancel_delayed_work_sync(&dead_gc->work); if (llist_empty(&dead_gc->list)) return; @@ -2073,12 +2091,12 @@ xfs_inodegc_cpu_dead( llist_add_batch(first, last, &gc->list); count += READ_ONCE(gc->items); WRITE_ONCE(gc->items, count); - put_cpu_ptr(gc); if (xfs_is_inodegc_enabled(mp)) { trace_xfs_inodegc_queue(mp, __return_address); - queue_work(mp->m_inodegc_wq, &gc->work); + mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0); } + put_cpu_ptr(gc); } /* @@ -2173,7 +2191,7 @@ xfs_inodegc_shrinker_scan( unsigned int h = READ_ONCE(gc->shrinker_hits); WRITE_ONCE(gc->shrinker_hits, h + 1); - queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); + mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); no_items = false; } } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 2e4cfddf8b8e..6cd180721659 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -76,6 +76,7 @@ void xfs_blockgc_stop(struct xfs_mount *mp); void xfs_blockgc_start(struct xfs_mount *mp); void xfs_inodegc_worker(struct work_struct *work); +void xfs_inodegc_push(struct xfs_mount *mp); void xfs_inodegc_flush(struct xfs_mount *mp); void xfs_inodegc_stop(struct xfs_mount *mp); void xfs_inodegc_start(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 52d6f2c7d58b..3e1c62ffa4f7 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -132,6 +132,26 @@ xfs_ilock_attr_map_shared( } /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED, + * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values + * to set in lock_flags. + */ +static inline void +xfs_lock_flags_assert( + uint lock_flags) +{ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); + ASSERT(lock_flags != 0); +} + +/* * In addition to i_rwsem in the VFS inode, the xfs inode contains 2 * multi-reader locks: invalidate_lock and the i_lock. This routine allows * various combinations of the locks to be obtained. @@ -168,18 +188,7 @@ xfs_ilock( { trace_xfs_ilock(ip, lock_flags, _RET_IP_); - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. 
- */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != - (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); + xfs_lock_flags_assert(lock_flags); if (lock_flags & XFS_IOLOCK_EXCL) { down_write_nested(&VFS_I(ip)->i_rwsem, @@ -222,18 +231,7 @@ xfs_ilock_nowait( { trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != - (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); + xfs_lock_flags_assert(lock_flags); if (lock_flags & XFS_IOLOCK_EXCL) { if (!down_write_trylock(&VFS_I(ip)->i_rwsem)) @@ -291,19 +289,7 @@ xfs_iunlock( xfs_inode_t *ip, uint lock_flags) { - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != - (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); - ASSERT(lock_flags != 0); + xfs_lock_flags_assert(lock_flags); if (lock_flags & XFS_IOLOCK_EXCL) up_write(&VFS_I(ip)->i_rwsem); @@ -379,8 +365,8 @@ xfs_isilocked( } if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { - return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem, - (lock_flags & XFS_IOLOCK_SHARED)); + return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock, + (lock_flags & XFS_MMAPLOCK_SHARED)); } if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) { diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 29f5b8b8aca6..a7402f6ea510 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -667,13 +667,15 @@ xfs_setattr_nonsize( uint qflags = 0; if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { - uid = iattr->ia_uid; + uid = from_vfsuid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsuid); qflags |= XFS_QMOPT_UQUOTA; } else { uid = inode->i_uid; } if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { - gid = iattr->ia_gid; + gid = from_vfsgid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsgid); qflags |= XFS_QMOPT_GQUOTA; } else { gid = inode->i_gid; @@ -704,13 +706,13 @@ xfs_setattr_nonsize( * didn't have the inode locked, inode's dquot(s) would have changed * also. 
*/ - if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp) && - !uid_eq(inode->i_uid, iattr->ia_uid)) { + if (XFS_IS_UQUOTA_ON(mp) && + i_uid_needs_update(mnt_userns, iattr, inode)) { ASSERT(udqp); old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } - if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp) && - !gid_eq(inode->i_gid, iattr->ia_gid)) { + if (XFS_IS_GQUOTA_ON(mp) && + i_gid_needs_update(mnt_userns, iattr, inode)) { ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp)); ASSERT(gdqp); old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 1e972f884a81..ae904b21e9cc 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2092,8 +2092,6 @@ xlog_dealloc_log( xlog_in_core_t *iclog, *next_iclog; int i; - xlog_cil_destroy(log); - /* * Cycle all the iclogbuf locks to make sure all log IO completion * is done before we tear down these buffers. @@ -2105,6 +2103,13 @@ xlog_dealloc_log( iclog = iclog->ic_next; } + /* + * Destroy the CIL after waiting for iclog IO completion because an + * iclog EIO error will try to shut down the log, which accesses the + * CIL to wake up the waiters. + */ + xlog_cil_destroy(log); + iclog = log->l_iclog; for (i = 0; i < log->l_iclog_bufs; i++) { next_iclog = iclog->ic_next; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index ba5d42abf66e..d2eaebd85abf 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -61,7 +61,7 @@ struct xfs_error_cfg { */ struct xfs_inodegc { struct llist_head list; - struct work_struct work; + struct delayed_work work; /* approximate count of inodes in the list */ unsigned int items; diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 74ac9ca9e119..392cb39cc10c 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -454,9 +454,12 @@ xfs_qm_scall_getquota( struct xfs_dquot *dqp; int error; - /* Flush inodegc work at the start of a quota reporting scan. */ + /* + * Expedite pending inodegc work at the start of a quota reporting + * scan but don't block waiting for it to complete. + */ if (id == 0) - xfs_inodegc_flush(mp); + xfs_inodegc_push(mp); /* * Try to get the dquot. We don't want it allocated on disk, so don't @@ -498,7 +501,7 @@ xfs_qm_scall_getquota_next( /* Flush inodegc work at the start of a quota reporting scan. */ if (*id == 0) - xfs_inodegc_flush(mp); + xfs_inodegc_push(mp); error = xfs_qm_dqget_next(mp, *id, type, &dqp); if (error) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ed18160e6181..aa977c7ea370 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -797,8 +797,11 @@ xfs_fs_statfs( xfs_extlen_t lsize; int64_t ffree; - /* Wait for whatever inactivations are in progress. */ - xfs_inodegc_flush(mp); + /* + * Expedite background inodegc but don't wait. We do not want to block + * here waiting hours for a billion extent file to be truncated. 
+ */ + xfs_inodegc_push(mp); statp->f_type = XFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1; @@ -1074,7 +1077,7 @@ xfs_inodegc_init_percpu( gc = per_cpu_ptr(mp->m_inodegc, cpu); init_llist_head(&gc->list); gc->items = 0; - INIT_WORK(&gc->work, xfs_inodegc_worker); + INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker); } return 0; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index d32026585c1b..0fa1b7a2918c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -240,6 +240,7 @@ DEFINE_EVENT(xfs_fs_class, name, \ TP_PROTO(struct xfs_mount *mp, void *caller_ip), \ TP_ARGS(mp, caller_ip)) DEFINE_FS_EVENT(xfs_inodegc_flush); +DEFINE_FS_EVENT(xfs_inodegc_push); DEFINE_FS_EVENT(xfs_inodegc_start); DEFINE_FS_EVENT(xfs_inodegc_stop); DEFINE_FS_EVENT(xfs_inodegc_queue); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 053299758deb..f5d8338967cb 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -616,7 +616,7 @@ static int zonefs_inode_setattr(struct user_namespace *mnt_userns, !uid_eq(iattr->ia_uid, inode->i_uid)) || ((iattr->ia_valid & ATTR_GID) && !gid_eq(iattr->ia_gid, inode->i_gid))) { - ret = dquot_transfer(inode, iattr); + ret = dquot_transfer(mnt_userns, inode, iattr); if (ret) return ret; } |
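Across ocfs2, reiserfs, xfs, and zonefs, the quota hunks in this series converge on one calling convention for ->setattr implementations. A minimal sketch of that post-series pattern, assuming a filesystem with no extra locking requirements; example_setattr is an illustrative name, not a function from the patch:

/* Sketch: quota-aware setattr under the mnt_userns-aware quota API. */
static int example_setattr(struct user_namespace *mnt_userns,
			   struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	int error;

	error = setattr_prepare(mnt_userns, dentry, attr);
	if (error)
		return error;

	/* uid/gid changes are now detected via the vfsuid/vfsgid helpers. */
	if (is_quota_modification(mnt_userns, inode, attr)) {
		error = dquot_initialize(inode);
		if (error)
			return error;
	}

	if (i_uid_needs_update(mnt_userns, attr, inode) ||
	    i_gid_needs_update(mnt_userns, attr, inode)) {
		error = dquot_transfer(mnt_userns, inode, attr);
		if (error)
			return error;
	}

	setattr_copy(mnt_userns, inode, attr);
	mark_inode_dirty(inode);
	return 0;
}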