From 8b70deb8ca901d45e2342d654c1e8d306f52e6cb Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Wed, 26 Apr 2023 19:22:21 +0200 Subject: fs/ecryptfs: Replace kmap() with kmap_local_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kmap() has been deprecated in favor of kmap_local_page(). Therefore, replace kmap() with kmap_local_page() in fs/ecryptfs. There are two main problems with kmap(): (1) It comes with an overhead as the mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap’s pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. The tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Obviously, thread locality implies that the kernel virtual addresses returned by kmap_local_page() are only valid in the context of the callers (i.e., they cannot be handed to other threads). The use of kmap_local_page() in fs/ecryptfs does not break the above-mentioned assumption, so it is allowed and preferred. Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Signed-off-by: "Fabio M. De Francesco" Message-Id: <20230426172223.8896-2-fmdefrancesco@gmail.com> Signed-off-by: Christian Brauner --- fs/ecryptfs/crypto.c | 8 ++++---- fs/ecryptfs/read_write.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index c16f0d660cb7..03bd55069d86 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -441,10 +441,10 @@ int ecryptfs_encrypt_page(struct page *page) } lower_offset = lower_offset_for_page(crypt_stat, page); - enc_extent_virt = kmap(enc_extent_page); + enc_extent_virt = kmap_local_page(enc_extent_page); rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset, PAGE_SIZE); - kunmap(enc_extent_page); + kunmap_local(enc_extent_virt); if (rc < 0) { ecryptfs_printk(KERN_ERR, "Error attempting to write lower page; rc = [%d]\n", @@ -490,10 +490,10 @@ int ecryptfs_decrypt_page(struct page *page) BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); lower_offset = lower_offset_for_page(crypt_stat, page); - page_virt = kmap(page); + page_virt = kmap_local_page(page); rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE, ecryptfs_inode); - kunmap(page); + kunmap_local(page_virt); if (rc < 0) { ecryptfs_printk(KERN_ERR, "Error attempting to read lower page; rc = [%d]\n", diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 60bdcaddcbe5..5edf027c8359 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -64,11 +64,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT) + offset_in_page); - virt = kmap(page_for_lower); + virt = kmap_local_page(page_for_lower); rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size); if (rc > 0) rc = 0; - kunmap(page_for_lower); + kunmap_local(virt); return rc; } @@ -253,11 +253,11 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, int rc; offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page); - virt = kmap(page_for_ecryptfs); + virt = kmap_local_page(page_for_ecryptfs); rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode); if (rc > 0) rc = 0; - kunmap(page_for_ecryptfs); + kunmap_local(virt); flush_dcache_page(page_for_ecryptfs); return rc; } -- cgit v1.2.3 From c3c6833ea8114a18d5acf5f9a4d32f864103144c Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Wed, 26 Apr 2023 19:22:22 +0200 Subject: fs/ecryptfs: Use kmap_local_page() in ecryptfs_write() kmap_atomic() is deprecated in favor of kmap_local_page(). Therefore, replace kmap_atomic() with kmap_local_page() in ecryptfs_write(). kmap_atomic() is implemented like kmap_local_page() which also disables page-faults and preemption (the latter only for !PREEMPT_RT kernels). The code within the mapping/un-mapping in ecryptfs_write() does not depend on the above-mentioned side effects so that a mere replacement of the old API with the new one is all that is required (i.e., there is no need to explicitly call pagefault_disable() and/or preempt_disable()). Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Suggested-by: Ira Weiny Signed-off-by: "Fabio M. De Francesco" Message-Id: <20230426172223.8896-3-fmdefrancesco@gmail.com> Signed-off-by: Christian Brauner --- fs/ecryptfs/read_write.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 5edf027c8359..3458f153a588 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -140,7 +140,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, ecryptfs_page_idx, rc); goto out; } - ecryptfs_page_virt = kmap_atomic(ecryptfs_page); + ecryptfs_page_virt = kmap_local_page(ecryptfs_page); /* * pos: where we're now writing, offset: where the request was @@ -163,7 +163,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, (data + data_offset), num_bytes); data_offset += num_bytes; } - kunmap_atomic(ecryptfs_page_virt); + kunmap_local(ecryptfs_page_virt); flush_dcache_page(ecryptfs_page); SetPageUptodate(ecryptfs_page); unlock_page(ecryptfs_page); -- cgit v1.2.3 From e2393b8f3987c5f29dca03aa71d0502f6d721c0d Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Wed, 26 Apr 2023 19:22:23 +0200 Subject: fs/ecryptfs: Use kmap_local_page() in copy_up_encrypted_with_header() kmap_atomic() has been deprecated in favor of kmap_local_page(). Therefore, replace kmap_atomic() with kmap_local_page() in ecryptfs_copy_up_encrypted_with_header(). kmap_atomic() is implemented like a kmap_local_page() which also disables page-faults and preemption (the latter only in !PREEMPT_RT kernels). The kernel virtual addresses returned by these two API are only valid in the context of the callers (i.e., they cannot be handed to other threads). With kmap_local_page() the mappings are per thread and CPU local like in kmap_atomic(); however, they can handle page-faults and can be called from any context (including interrupts). The tasks that call kmap_local_page() can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and are still valid. In ecryptfs_copy_up_encrypted_with_header(), the block of code between the mapping and un-mapping does not depend on the above-mentioned side effects of kmap_aatomic(), so that the mere replacements of the old API with the new one is all that is required (i.e., there is no need to explicitly call pagefault_disable() and/or preempt_disable()). Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Cc: Ira Weiny Signed-off-by: "Fabio M. De Francesco" Message-Id: <20230426172223.8896-4-fmdefrancesco@gmail.com> Signed-off-by: Christian Brauner --- fs/ecryptfs/mmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 373c3e5747e6..cb1e998ce54d 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -125,7 +125,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, /* This is a header extent */ char *page_virt; - page_virt = kmap_atomic(page); + page_virt = kmap_local_page(page); memset(page_virt, 0, PAGE_SIZE); /* TODO: Support more than one header extent */ if (view_extent_num == 0) { @@ -138,7 +138,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, crypt_stat, &written); } - kunmap_atomic(page_virt); + kunmap_local(page_virt); flush_dcache_page(page); if (rc) { printk(KERN_ERR "%s: Error reading xattr " -- cgit v1.2.3 From ba38980add7ffc9e674ada5b4ded4e7d14e76581 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 4 Jun 2023 12:16:06 +0100 Subject: reiserfs: Check the return value from __getblk() __getblk() can return a NULL pointer if we run out of memory or if we try to access beyond the end of the device; check it and handle it appropriately. Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/lkml/CAFcO6XOacq3hscbXevPQP7sXRoYFz34ZdKPYjmd6k5sZuhGFDw@mail.gmail.com/ Tested-by: butt3rflyh4ck Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") # probably introduced in 2002 Acked-by: Edward Shishkin Signed-off-by: Christian Brauner --- fs/reiserfs/journal.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 479aa4a57602..015bfe4e4524 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2326,7 +2326,7 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, int i, j; bh = __getblk(dev, block, bufsize); - if (buffer_uptodate(bh)) + if (!bh || buffer_uptodate(bh)) return (bh); if (block + BUFNR > max_block) { @@ -2336,6 +2336,8 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, j = 1; for (i = 1; i < blocks; i++) { bh = __getblk(dev, block + i, bufsize); + if (!bh) + break; if (buffer_uptodate(bh)) { brelse(bh); break; -- cgit v1.2.3 From 12ee4b66af34f8e72f3b2fd93a946a955efe7c86 Mon Sep 17 00:00:00 2001 From: Ahelenia Ziemiańska Date: Mon, 3 Jul 2023 16:42:13 +0200 Subject: splice: always fsnotify_access(in), fsnotify_modify(out) on success MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current behaviour caused an asymmetry where some write APIs (write, sendfile) would notify the written-to/read-from objects, but splice wouldn't. This affected userspace which uses inotify, most notably coreutils tail -f, to monitor pipes. If the pipe buffer had been filled by a splice-family function: * tail wouldn't know and thus wouldn't service the pipe, and * all writes to the pipe would block because it's full, thus service was denied. (For the particular case of tail -f this could be worked around with ---disable-inotify.) Fixes: 983652c69199 ("splice: report related fsnotify events") Link: https://lore.kernel.org/linux-fsdevel/jbyihkyk5dtaohdwjyivambb2gffyjs3dodpofafnkkunxq7bu@jngkdxx65pux/t/#u Link: https://bugs.debian.org/1039488 Signed-off-by: Ahelenia Ziemiańska Acked-by: Jan Kara Reviewed-by: Amir Goldstein Message-Id: <604ec704d933e0e0121d9e107ce914512e045fad.1688393619.git.nabijaczleweli@nabijaczleweli.xyz> Signed-off-by: Christian Brauner --- fs/splice.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index 004eb1c4ce31..016b0a83df44 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1267,10 +1267,8 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, if ((in->f_flags | out->f_flags) & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; - return splice_pipe_to_pipe(ipipe, opipe, len, flags); - } - - if (ipipe) { + ret = splice_pipe_to_pipe(ipipe, opipe, len, flags); + } else if (ipipe) { if (off_in) return -ESPIPE; if (off_out) { @@ -1295,18 +1293,11 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, ret = do_splice_from(ipipe, out, &offset, len, flags); file_end_write(out); - if (ret > 0) - fsnotify_modify(out); - if (!off_out) out->f_pos = offset; else *off_out = offset; - - return ret; - } - - if (opipe) { + } else if (opipe) { if (off_out) return -ESPIPE; if (off_in) { @@ -1322,18 +1313,25 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, ret = splice_file_to_pipe(in, opipe, &offset, len, flags); - if (ret > 0) - fsnotify_access(in); - if (!off_in) in->f_pos = offset; else *off_in = offset; + } else { + ret = -EINVAL; + } - return ret; + if (ret > 0) { + /* + * Generate modify out before access in: + * do_splice_from() may've already sent modify out, + * and this ensures the events get merged. + */ + fsnotify_modify(out); + fsnotify_access(in); } - return -EINVAL; + return ret; } static long __do_splice(struct file *in, loff_t __user *off_in, -- cgit v1.2.3 From 7f0f1ea069e52d5a16921abd59377a7da6c25149 Mon Sep 17 00:00:00 2001 From: Ahelenia Ziemiańska Date: Mon, 3 Jul 2023 16:42:17 +0200 Subject: splice: fsnotify_access(fd)/fsnotify_modify(fd) in vmsplice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same logic applies here: this can fill up the pipe and pollers that rely on getting IN_MODIFY notifications never wake up. Fixes: 983652c69199 ("splice: report related fsnotify events") Link: https://lore.kernel.org/linux-fsdevel/jbyihkyk5dtaohdwjyivambb2gffyjs3dodpofafnkkunxq7bu@jngkdxx65pux/t/#u Link: https://bugs.debian.org/1039488 Signed-off-by: Ahelenia Ziemiańska Acked-by: Jan Kara Reviewed-by: Amir Goldstein Message-Id: <8d9ad5acb9c5c1dd2376a2ff5da6ac3183115389.1688393619.git.nabijaczleweli@nabijaczleweli.xyz> Signed-off-by: Christian Brauner --- fs/splice.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/splice.c b/fs/splice.c index 016b0a83df44..2cb89f5f9a8a 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1460,6 +1460,9 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter, pipe_unlock(pipe); } + if (ret > 0) + fsnotify_access(file); + return ret; } @@ -1489,8 +1492,10 @@ static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter, if (!ret) ret = iter_to_pipe(iter, pipe, buf_flag); pipe_unlock(pipe); - if (ret > 0) + if (ret > 0) { wakeup_pipe_readers(pipe); + fsnotify_modify(file); + } return ret; } -- cgit v1.2.3 From 576d498e0ac5caff2d9f6312573ab54d98f12d32 Mon Sep 17 00:00:00 2001 From: Ahelenia Ziemiańska Date: Mon, 3 Jul 2023 16:42:21 +0200 Subject: splice: fsnotify_access(in), fsnotify_modify(out) on success in tee MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same logic applies here: this can fill up the pipe, and pollers that rely on getting IN_MODIFY notifications never wake up. Fixes: 983652c69199 ("splice: report related fsnotify events") Link: https://lore.kernel.org/linux-fsdevel/jbyihkyk5dtaohdwjyivambb2gffyjs3dodpofafnkkunxq7bu@jngkdxx65pux/t/#u Link: https://bugs.debian.org/1039488 Signed-off-by: Ahelenia Ziemiańska Acked-by: Jan Kara Reviewed-by: Amir Goldstein Message-Id: <10d76dd8c85017ae3cd047c9b9a32e26daefdaa2.1688393619.git.nabijaczleweli@nabijaczleweli.xyz> Signed-off-by: Christian Brauner --- fs/splice.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/splice.c b/fs/splice.c index 2cb89f5f9a8a..378fedb392ae 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1929,6 +1929,11 @@ long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) } } + if (ret > 0) { + fsnotify_access(in); + fsnotify_modify(out); + } + return ret; } -- cgit v1.2.3 From bccb5c397fbfe348e22c045f5e09c35780edb817 Mon Sep 17 00:00:00 2001 From: Luca Vizzarro Date: Thu, 2 Feb 2023 11:58:31 +0000 Subject: fcntl: Cast commands with int args explicitly According to the fcntl API specification commands that expect an integer, hence not a pointer, always take an int and not long. In order to avoid access to undefined bits, we should explicitly cast the argument to int. Cc: Alexander Viro Cc: Christian Brauner Cc: Jeff Layton Cc: Chuck Lever Cc: Kevin Brodsky Cc: Vincenzo Frascino Cc: Szabolcs Nagy Cc: "Theodore Ts'o" Cc: David Laight Cc: Mark Rutland Cc: linux-fsdevel@vger.kernel.org Cc: linux-morello@op-lists.linaro.org Signed-off-by: Luca Vizzarro Message-Id: <20230414152459.816046-2-Luca.Vizzarro@arm.com> Signed-off-by: Christian Brauner --- fs/fcntl.c | 29 +++++++++++++++-------------- include/linux/fs.h | 2 +- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index b622be119706..e871009f6c88 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -34,7 +34,7 @@ #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) -static int setfl(int fd, struct file * filp, unsigned long arg) +static int setfl(int fd, struct file * filp, unsigned int arg) { struct inode * inode = file_inode(filp); int error = 0; @@ -112,11 +112,11 @@ void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, } EXPORT_SYMBOL(__f_setown); -int f_setown(struct file *filp, unsigned long arg, int force) +int f_setown(struct file *filp, int who, int force) { enum pid_type type; struct pid *pid = NULL; - int who = arg, ret = 0; + int ret = 0; type = PIDTYPE_TGID; if (who < 0) { @@ -317,28 +317,29 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { void __user *argp = (void __user *)arg; + int argi = (int)arg; struct flock flock; long err = -EINVAL; switch (cmd) { case F_DUPFD: - err = f_dupfd(arg, filp, 0); + err = f_dupfd(argi, filp, 0); break; case F_DUPFD_CLOEXEC: - err = f_dupfd(arg, filp, O_CLOEXEC); + err = f_dupfd(argi, filp, O_CLOEXEC); break; case F_GETFD: err = get_close_on_exec(fd) ? FD_CLOEXEC : 0; break; case F_SETFD: err = 0; - set_close_on_exec(fd, arg & FD_CLOEXEC); + set_close_on_exec(fd, argi & FD_CLOEXEC); break; case F_GETFL: err = filp->f_flags; break; case F_SETFL: - err = setfl(fd, filp, arg); + err = setfl(fd, filp, argi); break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ @@ -375,7 +376,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, force_successful_syscall_return(); break; case F_SETOWN: - err = f_setown(filp, arg, 1); + err = f_setown(filp, argi, 1); break; case F_GETOWN_EX: err = f_getown_ex(filp, arg); @@ -391,28 +392,28 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, break; case F_SETSIG: /* arg == 0 restores default behaviour. */ - if (!valid_signal(arg)) { + if (!valid_signal(argi)) { break; } err = 0; - filp->f_owner.signum = arg; + filp->f_owner.signum = argi; break; case F_GETLEASE: err = fcntl_getlease(filp); break; case F_SETLEASE: - err = fcntl_setlease(fd, filp, arg); + err = fcntl_setlease(fd, filp, argi); break; case F_NOTIFY: - err = fcntl_dirnotify(fd, filp, arg); + err = fcntl_dirnotify(fd, filp, argi); break; case F_SETPIPE_SZ: case F_GETPIPE_SZ: - err = pipe_fcntl(filp, cmd, arg); + err = pipe_fcntl(filp, cmd, argi); break; case F_ADD_SEALS: case F_GET_SEALS: - err = memfd_fcntl(filp, cmd, arg); + err = memfd_fcntl(filp, cmd, argi); break; case F_GET_RW_HINT: case F_SET_RW_HINT: diff --git a/include/linux/fs.h b/include/linux/fs.h index 6867512907d6..8e546433c781 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1069,7 +1069,7 @@ extern void fasync_free(struct fasync_struct *); extern void kill_fasync(struct fasync_struct **, int, int); extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force); -extern int f_setown(struct file *filp, unsigned long arg, int force); +extern int f_setown(struct file *filp, int who, int force); extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct fown_struct *fown); -- cgit v1.2.3 From ed5f17f66ef39273dcf83ca89b2f1d52a52b22a5 Mon Sep 17 00:00:00 2001 From: Luca Vizzarro Date: Wed, 1 Feb 2023 15:05:33 +0000 Subject: fs: Pass argument to fcntl_setlease as int The interface for fcntl expects the argument passed for the command F_SETLEASE to be of type int. The current code wrongly treats it as a long. In order to avoid access to undefined bits, we should explicitly cast the argument to int. Cc: Alexander Viro Cc: Christian Brauner Cc: Jeff Layton Cc: Chuck Lever Cc: Trond Myklebust Cc: Anna Schumaker Cc: Kevin Brodsky Cc: Vincenzo Frascino Cc: Szabolcs Nagy Cc: "Theodore Ts'o" Cc: David Laight Cc: Mark Rutland Cc: linux-fsdevel@vger.kernel.org Cc: linux-cifs@vger.kernel.org Cc: linux-nfs@vger.kernel.org Cc: linux-morello@op-lists.linaro.org Signed-off-by: Luca Vizzarro Message-Id: <20230414152459.816046-3-Luca.Vizzarro@arm.com> Signed-off-by: Christian Brauner --- fs/libfs.c | 2 +- fs/locks.c | 20 ++++++++++---------- fs/nfs/nfs4_fs.h | 2 +- fs/nfs/nfs4file.c | 2 +- fs/nfs/nfs4proc.c | 4 ++-- fs/smb/client/cifsfs.c | 2 +- include/linux/filelock.h | 12 ++++++------ include/linux/fs.h | 4 ++-- 8 files changed, 24 insertions(+), 24 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 5b851315eeed..0ea7fedb0b7a 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1269,7 +1269,7 @@ EXPORT_SYMBOL(alloc_anon_inode); * All arguments are ignored and it just returns -EINVAL. */ int -simple_nosetlease(struct file *filp, long arg, struct file_lock **flp, +simple_nosetlease(struct file *filp, int arg, struct file_lock **flp, void **priv) { return -EINVAL; diff --git a/fs/locks.c b/fs/locks.c index df8b26a42524..265b5190db3e 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -438,7 +438,7 @@ static void flock_make_lock(struct file *filp, struct file_lock *fl, int type) fl->fl_end = OFFSET_MAX; } -static int assign_type(struct file_lock *fl, long type) +static int assign_type(struct file_lock *fl, int type) { switch (type) { case F_RDLCK: @@ -549,7 +549,7 @@ static const struct lock_manager_operations lease_manager_ops = { /* * Initialize a lease, use the default lock manager operations */ -static int lease_init(struct file *filp, long type, struct file_lock *fl) +static int lease_init(struct file *filp, int type, struct file_lock *fl) { if (assign_type(fl, type) != 0) return -EINVAL; @@ -567,7 +567,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl) } /* Allocate a file_lock initialised to this type of lease */ -static struct file_lock *lease_alloc(struct file *filp, long type) +static struct file_lock *lease_alloc(struct file *filp, int type) { struct file_lock *fl = locks_alloc_lock(); int error = -ENOMEM; @@ -1666,7 +1666,7 @@ int fcntl_getlease(struct file *filp) * conflict with the lease we're trying to set. */ static int -check_conflicting_open(struct file *filp, const long arg, int flags) +check_conflicting_open(struct file *filp, const int arg, int flags) { struct inode *inode = file_inode(filp); int self_wcount = 0, self_rcount = 0; @@ -1701,7 +1701,7 @@ check_conflicting_open(struct file *filp, const long arg, int flags) } static int -generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv) +generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **priv) { struct file_lock *fl, *my_fl = NULL, *lease; struct inode *inode = file_inode(filp); @@ -1859,7 +1859,7 @@ static int generic_delete_lease(struct file *filp, void *owner) * The (input) flp->fl_lmops->lm_break function is required * by break_lease(). */ -int generic_setlease(struct file *filp, long arg, struct file_lock **flp, +int generic_setlease(struct file *filp, int arg, struct file_lock **flp, void **priv) { struct inode *inode = file_inode(filp); @@ -1906,7 +1906,7 @@ lease_notifier_chain_init(void) } static inline void -setlease_notifier(long arg, struct file_lock *lease) +setlease_notifier(int arg, struct file_lock *lease) { if (arg != F_UNLCK) srcu_notifier_call_chain(&lease_notifier_chain, arg, lease); @@ -1942,7 +1942,7 @@ EXPORT_SYMBOL_GPL(lease_unregister_notifier); * may be NULL if the lm_setup operation doesn't require it. */ int -vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) +vfs_setlease(struct file *filp, int arg, struct file_lock **lease, void **priv) { if (lease) setlease_notifier(arg, *lease); @@ -1953,7 +1953,7 @@ vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) } EXPORT_SYMBOL_GPL(vfs_setlease); -static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) +static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg) { struct file_lock *fl; struct fasync_struct *new; @@ -1988,7 +1988,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) * Note that you also need to call %F_SETSIG to * receive a signal when the lease is broken. */ -int fcntl_setlease(unsigned int fd, struct file *filp, long arg) +int fcntl_setlease(unsigned int fd, struct file *filp, int arg) { if (arg == F_UNLCK) return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4c9f8bd866ab..47c5c1f86d66 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -328,7 +328,7 @@ extern int update_open_stateid(struct nfs4_state *state, const nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode); -extern int nfs4_proc_setlease(struct file *file, long arg, +extern int nfs4_proc_setlease(struct file *file, int arg, struct file_lock **lease, void **priv); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 4aeadd6e1a6d..02788c3c85e5 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -438,7 +438,7 @@ void nfs42_ssc_unregister_ops(void) } #endif /* CONFIG_NFS_V4_2 */ -static int nfs4_setlease(struct file *file, long arg, struct file_lock **lease, +static int nfs4_setlease(struct file *file, int arg, struct file_lock **lease, void **priv) { return nfs4_proc_setlease(file, arg, lease, priv); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e1a886b58354..b4593fbe0114 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7573,7 +7573,7 @@ static int nfs4_delete_lease(struct file *file, void **priv) return generic_setlease(file, F_UNLCK, NULL, priv); } -static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease, +static int nfs4_add_lease(struct file *file, int arg, struct file_lock **lease, void **priv) { struct inode *inode = file_inode(file); @@ -7591,7 +7591,7 @@ static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease, return -EAGAIN; } -int nfs4_proc_setlease(struct file *file, long arg, struct file_lock **lease, +int nfs4_proc_setlease(struct file *file, int arg, struct file_lock **lease, void **priv) { switch (arg) { diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index a4d8b0ea1c8c..6fc8f43b1c9d 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -1077,7 +1077,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence) } static int -cifs_setlease(struct file *file, long arg, struct file_lock **lease, void **priv) +cifs_setlease(struct file *file, int arg, struct file_lock **lease, void **priv) { /* * Note that this is called by vfs setlease with i_lock held to diff --git a/include/linux/filelock.h b/include/linux/filelock.h index efcdd1631d9b..95e868e09e29 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -144,7 +144,7 @@ int fcntl_setlk64(unsigned int, struct file *, unsigned int, struct flock64 *); #endif -int fcntl_setlease(unsigned int fd, struct file *filp, long arg); +int fcntl_setlease(unsigned int fd, struct file *filp, int arg); int fcntl_getlease(struct file *filp); /* fs/locks.c */ @@ -167,8 +167,8 @@ bool vfs_inode_has_locks(struct inode *inode); int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); void lease_get_mtime(struct inode *, struct timespec64 *time); -int generic_setlease(struct file *, long, struct file_lock **, void **priv); -int vfs_setlease(struct file *, long, struct file_lock **, void **); +int generic_setlease(struct file *, int, struct file_lock **, void **priv); +int vfs_setlease(struct file *, int, struct file_lock **, void **); int lease_modify(struct file_lock *, int, struct list_head *); struct notifier_block; @@ -213,7 +213,7 @@ static inline int fcntl_setlk64(unsigned int fd, struct file *file, return -EACCES; } #endif -static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg) +static inline int fcntl_setlease(unsigned int fd, struct file *filp, int arg) { return -EINVAL; } @@ -306,13 +306,13 @@ static inline void lease_get_mtime(struct inode *inode, return; } -static inline int generic_setlease(struct file *filp, long arg, +static inline int generic_setlease(struct file *filp, int arg, struct file_lock **flp, void **priv) { return -EINVAL; } -static inline int vfs_setlease(struct file *filp, long arg, +static inline int vfs_setlease(struct file *filp, int arg, struct file_lock **lease, void **priv) { return -EINVAL; diff --git a/include/linux/fs.h b/include/linux/fs.h index 8e546433c781..11055d00fab8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1799,7 +1799,7 @@ struct file_operations { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); void (*splice_eof)(struct file *file); - int (*setlease)(struct file *, long, struct file_lock **, void **); + int (*setlease)(struct file *, int, struct file_lock **, void **); long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); void (*show_fdinfo)(struct seq_file *m, struct file *f); @@ -2950,7 +2950,7 @@ extern int simple_write_begin(struct file *file, struct address_space *mapping, extern const struct address_space_operations ram_aops; extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); -extern int simple_nosetlease(struct file *, long, struct file_lock **, void **); +extern int simple_nosetlease(struct file *, int, struct file_lock **, void **); extern const struct dentry_operations simple_dentry_operations; extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); -- cgit v1.2.3 From 515c5046650d8f56f93546d3867b64c1906162bc Mon Sep 17 00:00:00 2001 From: Luca Vizzarro Date: Wed, 1 Feb 2023 16:18:53 +0000 Subject: pipe: Pass argument of pipe_fcntl as int The interface for fcntl expects the argument passed for the command F_SETPIPE_SZ to be of type int. The current code wrongly treats it as a long. In order to avoid access to undefined bits, we should explicitly cast the argument to int. Cc: Alexander Viro Cc: Christian Brauner Cc: Jeff Layton Cc: Chuck Lever Cc: Kevin Brodsky Cc: Vincenzo Frascino Cc: Szabolcs Nagy Cc: "Theodore Ts'o" Cc: David Laight Cc: Mark Rutland Cc: linux-fsdevel@vger.kernel.org Cc: linux-morello@op-lists.linaro.org Signed-off-by: Luca Vizzarro Message-Id: <20230414152459.816046-4-Luca.Vizzarro@arm.com> Signed-off-by: Christian Brauner --- fs/pipe.c | 6 +++--- include/linux/pipe_fs_i.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 2d88f73f585a..a388612f0f86 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1236,7 +1236,7 @@ const struct file_operations pipefifo_fops = { * Currently we rely on the pipe array holding a power-of-2 number * of pages. Returns 0 on error. */ -unsigned int round_pipe_size(unsigned long size) +unsigned int round_pipe_size(unsigned int size) { if (size > (1U << 31)) return 0; @@ -1319,7 +1319,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) * Allocate a new array of pipe buffers and copy the info over. Returns the * pipe size if successful, or return -ERROR on error. */ -static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg) { unsigned long user_bufs; unsigned int nr_slots, size; @@ -1387,7 +1387,7 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) return pipe; } -long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg) { struct pipe_inode_info *pipe; long ret; diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 02e0086b10f6..608a9eb86bff 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -269,10 +269,10 @@ bool pipe_is_unprivileged_user(void); /* for F_SETPIPE_SZ and F_GETPIPE_SZ */ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots); -long pipe_fcntl(struct file *, unsigned int, unsigned long arg); +long pipe_fcntl(struct file *, unsigned int, unsigned int arg); struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice); int create_pipe_files(struct file **, int); -unsigned int round_pipe_size(unsigned long size); +unsigned int round_pipe_size(unsigned int size); #endif -- cgit v1.2.3 From f4ae4081e5a871b4c4e2f802ccf73f8d69566073 Mon Sep 17 00:00:00 2001 From: Luca Vizzarro Date: Wed, 15 Feb 2023 15:50:05 +0000 Subject: dnotify: Pass argument of fcntl_dirnotify as int The interface for fcntl expects the argument passed for the command F_DIRNOTIFY to be of type int. The current code wrongly treats it as a long. In order to avoid access to undefined bits, we should explicitly cast the argument to int. Cc: Jan Kara Cc: Amir Goldstein Cc: Alexander Viro Cc: Christian Brauner Cc: Jeff Layton Cc: Chuck Lever Cc: Kevin Brodsky Cc: Vincenzo Frascino Cc: Szabolcs Nagy Cc: "Theodore Ts'o" Cc: David Laight Cc: Mark Rutland Cc: linux-fsdevel@vger.kernel.org Cc: linux-morello@op-lists.linaro.org Acked-by: Jan Kara Signed-off-by: Luca Vizzarro Message-Id: <20230414152459.816046-6-Luca.Vizzarro@arm.com> Signed-off-by: Christian Brauner --- fs/notify/dnotify/dnotify.c | 4 ++-- include/linux/dnotify.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 190aa717fa32..ebdcc25df0f7 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -199,7 +199,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) } /* this conversion is done only at watch creation */ -static __u32 convert_arg(unsigned long arg) +static __u32 convert_arg(unsigned int arg) { __u32 new_mask = FS_EVENT_ON_CHILD; @@ -258,7 +258,7 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark, * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be * attached to the fsnotify_mark. */ -int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) +int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) { struct dnotify_mark *new_dn_mark, *dn_mark; struct fsnotify_mark *new_fsn_mark, *fsn_mark; diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h index b1d26f9f1c9f..9f183a679277 100644 --- a/include/linux/dnotify.h +++ b/include/linux/dnotify.h @@ -30,7 +30,7 @@ struct dnotify_struct { FS_MOVED_FROM | FS_MOVED_TO) extern void dnotify_flush(struct file *, fl_owner_t); -extern int fcntl_dirnotify(int, struct file *, unsigned long); +extern int fcntl_dirnotify(int, struct file *, unsigned int); #else @@ -38,7 +38,7 @@ static inline void dnotify_flush(struct file *filp, fl_owner_t id) { } -static inline int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) +static inline int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) { return -EINVAL; } -- cgit v1.2.3 From 758b492047816a3158d027e9fca660bc5bcf20bf Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Sun, 9 Jul 2023 14:54:51 +0800 Subject: eventfd: prevent underflow for eventfd semaphores For eventfd with flag EFD_SEMAPHORE, when its ctx->count is 0, calling eventfd_ctx_do_read will cause ctx->count to overflow to ULLONG_MAX. An underflow can happen with EFD_SEMAPHORE eventfds in at least the following three subsystems: (1) virt/kvm/eventfd.c (2) drivers/vfio/virqfd.c (3) drivers/virt/acrn/irqfd.c where (2) and (3) are just modeled after (1). An eventfd must be specified for use with the KVM_IRQFD ioctl(). This can also be an EFD_SEMAPHORE eventfd. When the eventfd count is zero or has been decremented to zero an underflow can be triggered when the irqfd is shut down by raising the KVM_IRQFD_FLAG_DEASSIGN flag in the KVM_IRQFD ioctl(): // ctx->count == 0 kvm_vm_ioctl() -> kvm_irqfd() -> kvm_irqfd_deassign() -> irqfd_deactivate() -> irqfd_shutdown() -> eventfd_ctx_remove_wait_queue(&cnt) -> eventfd_ctx_do_read(&cnt) Userspace polling on the eventfd wouldn't notice the underflow because 1 is always returned as the value from eventfd_read() while ctx->count would've underflowed. It's not a huge deal because this should only be happening when the irqfd is shutdown but we should still fix it and avoid the spurious wakeup. Fixes: cb289d6244a3 ("eventfd - allow atomic read and waitqueue remove") Signed-off-by: Wen Yang Cc: Alexander Viro Cc: Jens Axboe Cc: Christian Brauner Cc: Christoph Hellwig Cc: Dylan Yudaken Cc: David Woodhouse Cc: Matthew Wilcox Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Message-Id: [brauner: rewrite commit message and add explanation how this underflow can happen] Signed-off-by: Christian Brauner --- fs/eventfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/eventfd.c b/fs/eventfd.c index 8aa36cd37351..33a918f9566c 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -189,7 +189,7 @@ void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) { lockdep_assert_held(&ctx->wqh.lock); - *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; + *cnt = ((ctx->flags & EFD_SEMAPHORE) && ctx->count) ? 1 : ctx->count; ctx->count -= *cnt; } EXPORT_SYMBOL_GPL(eventfd_ctx_do_read); -- cgit v1.2.3 From 5d1f903f75a80daa4dfb3d84e114ec8ecbf29956 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 12 Jul 2023 20:58:49 +0200 Subject: attr: block mode changes of symlinks Changing the mode of symlinks is meaningless as the vfs doesn't take the mode of a symlink into account during path lookup permission checking. However, the vfs doesn't block mode changes on symlinks. This however, has lead to an untenable mess roughly classifiable into the following two categories: (1) Filesystems that don't implement a i_op->setattr() for symlinks. Such filesystems may or may not know that without i_op->setattr() defined, notify_change() falls back to simple_setattr() causing the inode's mode in the inode cache to be changed. That's a generic issue as this will affect all non-size changing inode attributes including ownership changes. Example: afs (2) Filesystems that fail with EOPNOTSUPP but change the mode of the symlink nonetheless. Some filesystems will happily update the mode of a symlink but still return EOPNOTSUPP. This is the biggest source of confusion for userspace. The EOPNOTSUPP in this case comes from POSIX ACLs. Specifically it comes from filesystems that call posix_acl_chmod(), e.g., btrfs via if (!err && attr->ia_valid & ATTR_MODE) err = posix_acl_chmod(idmap, dentry, inode->i_mode); Filesystems including btrfs don't implement i_op->set_acl() so posix_acl_chmod() will report EOPNOTSUPP. When posix_acl_chmod() is called, most filesystems will have finished updating the inode. Perversely, this has the consequences that this behavior may depend on two kconfig options and mount options: * CONFIG_POSIX_ACL={y,n} * CONFIG_${FSTYPE}_POSIX_ACL={y,n} * Opt_acl, Opt_noacl Example: btrfs, ext4, xfs The only way to change the mode on a symlink currently involves abusing an O_PATH file descriptor in the following manner: fd = openat(-1, "/path/to/link", O_CLOEXEC | O_PATH | O_NOFOLLOW); char path[PATH_MAX]; snprintf(path, sizeof(path), "/proc/self/fd/%d", fd); chmod(path, 0000); But for most major filesystems with POSIX ACL support such as btrfs, ext4, ceph, tmpfs, xfs and others this will fail with EOPNOTSUPP with the mode still updated due to the aforementioned posix_acl_chmod() nonsense. So, given that for all major filesystems this would fail with EOPNOTSUPP and that both glibc (cf. [1]) and musl (cf. [2]) outright block mode changes on symlinks we should just try and block mode changes on symlinks directly in the vfs and have a clean break with this nonsense. If this causes any regressions, we do the next best thing and fix up all filesystems that do return EOPNOTSUPP with the mode updated to not call posix_acl_chmod() on symlinks. But as usual, let's try the clean cut solution first. It's a simple patch that can be easily reverted. Not marking this for backport as I'll do that manually if we're reasonably sure that this works and there are no strong objections. We could block this in chmod_common() but it's more appropriate to do it notify_change() as it will also mean that we catch filesystems that change symlink permissions explicitly or accidently. Similar proposals were floated in the past as in [3] and [4] and again recently in [5]. There's also a couple of bugs about this inconsistency as in [6] and [7]. Link: https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/fchmodat.c;h=99527a3727e44cb8661ee1f743068f108ec93979;hb=HEAD [1] Link: https://git.musl-libc.org/cgit/musl/tree/src/stat/fchmodat.c [2] Link: https://lore.kernel.org/all/20200911065733.GA31579@infradead.org [3] Link: https://sourceware.org/legacy-ml/libc-alpha/2020-02/msg00518.html [4] Link: https://lore.kernel.org/lkml/87lefmbppo.fsf@oldenburg.str.redhat.com [5] Link: https://sourceware.org/legacy-ml/libc-alpha/2020-02/msg00467.html [6] Link: https://sourceware.org/bugzilla/show_bug.cgi?id=14578#c17 [7] Reviewed-by: Aleksa Sarai Reviewed-by: Christoph Hellwig Cc: stable@vger.kernel.org # please backport to all LTSes but not before v6.6-rc2 is tagged Suggested-by: Christoph Hellwig Suggested-by: Florian Weimer Message-Id: <20230712-vfs-chmod-symlinks-v2-1-08cfb92b61dd@kernel.org> Signed-off-by: Christian Brauner --- fs/attr.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/attr.c b/fs/attr.c index d60dc1edb526..87af68bb8ad2 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -394,9 +394,25 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, return error; if ((ia_valid & ATTR_MODE)) { - umode_t amode = attr->ia_mode; + /* + * Don't allow changing the mode of symlinks: + * + * (1) The vfs doesn't take the mode of symlinks into account + * during permission checking. + * (2) This has never worked correctly. Most major filesystems + * did return EOPNOTSUPP due to interactions with POSIX ACLs + * but did still updated the mode of the symlink. + * This inconsistency led system call wrapper providers such + * as libc to block changing the mode of symlinks with + * EOPNOTSUPP already. + * (3) To even do this in the first place one would have to use + * specific file descriptors and quite some effort. + */ + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + /* Flag setting protected by i_mutex */ - if (is_sxid(amode)) + if (is_sxid(attr->ia_mode)) inode->i_flags &= ~S_NOSEC; } -- cgit v1.2.3 From 0d5a4f8f775ff990142cdc810a84eae078589d27 Mon Sep 17 00:00:00 2001 From: Wang Ming Date: Thu, 13 Jul 2023 20:05:42 +0800 Subject: fs: Fix error checking for d_hash_and_lookup() The d_hash_and_lookup() function returns error pointers or NULL. Most incorrect error checks were fixed, but the one in int path_pts() was forgotten. Fixes: eedf265aa003 ("devpts: Make each mount of devpts an independent filesystem.") Signed-off-by: Wang Ming Message-Id: <20230713120555.7025-1-machel@vivo.com> Signed-off-by: Christian Brauner --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index e56ff39a79bc..2bae29ea52ff 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2890,7 +2890,7 @@ int path_pts(struct path *path) dput(path->dentry); path->dentry = parent; child = d_hash_and_lookup(parent, &this); - if (!child) + if (IS_ERR_OR_NULL(child)) return -ENOENT; path->dentry = child; -- cgit v1.2.3 From 05f26f86f4a1f31d5ed2015ac1c89ea9da1d2bb7 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 26 Jul 2023 11:21:35 +0800 Subject: epoll: simplify ep_alloc() The get_current_user() does not fail, and moving it after kzalloc() can simplify the code a bit. Signed-off-by: Zhen Lei Message-Id: <20230726032135.933-1-thunder.leizhen@huaweicloud.com> Signed-off-by: Christian Brauner --- fs/eventpoll.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4b1b3362f697..1d9a71a0c4c1 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -975,15 +975,11 @@ again: static int ep_alloc(struct eventpoll **pep) { - int error; - struct user_struct *user; struct eventpoll *ep; - user = get_current_user(); - error = -ENOMEM; ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (unlikely(!ep)) - goto free_uid; + return -ENOMEM; mutex_init(&ep->mtx); rwlock_init(&ep->lock); @@ -992,16 +988,12 @@ static int ep_alloc(struct eventpoll **pep) INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT_CACHED; ep->ovflist = EP_UNACTIVE_PTR; - ep->user = user; + ep->user = get_current_user(); refcount_set(&ep->refcount, 1); *pep = ep; return 0; - -free_uid: - free_uid(user); - return error; } /* -- cgit v1.2.3 From ee042cdb9f0f8a802fc6497ca921e8863f090f6e Mon Sep 17 00:00:00 2001 From: Zhu Wang Date: Mon, 31 Jul 2023 19:25:33 +0800 Subject: fs/ecryptfs: remove kernel-doc warnings Remove kernel-doc warnings: fs/ecryptfs/mmap.c:270: warning: Excess function parameter 'flags' description in 'ecryptfs_write_begin' Message-ID: <20230731112533.214216-1-wangzhu9@huawei.com> Signed-off-by: Zhu Wang Signed-off-by: Christian Brauner --- fs/ecryptfs/mmap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index cb1e998ce54d..e2483acc4366 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -255,7 +255,6 @@ out: * @mapping: The eCryptfs object * @pos: The file offset at which to start writing * @len: Length of the write - * @flags: Various flags * @pagep: Pointer to return the page * @fsdata: Pointer to return fs data (unused) * -- cgit v1.2.3 From ed192c59f86910f089d061e635f6c1129bb14064 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 27 Jul 2023 13:38:09 +0200 Subject: file: mostly eliminate spurious relocking in __range_close Stock code takes a lock trip for every fd in range, but this can be trivially avoided and real-world consumers do have plenty of already closed cases. Just booting Debian 12 with a debug printk shows: (sh) min 3 max 17 closed 15 empty 0 (sh) min 19 max 63 closed 31 empty 14 (sh) min 4 max 63 closed 0 empty 60 (spawn) min 3 max 63 closed 13 empty 48 (spawn) min 3 max 63 closed 13 empty 48 (mount) min 3 max 17 closed 15 empty 0 (mount) min 19 max 63 closed 32 empty 13 and so on. While here use more idiomatic naming. An avoidable relock is left in place to avoid uglifying the code. The code was not switched to bitmap traversal for the same reason. Tested with ltp kernel/syscalls/close_range Signed-off-by: Mateusz Guzik Message-Id: <20230727113809.800067-1-mjguzik@gmail.com> Signed-off-by: Christian Brauner --- fs/file.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/file.c b/fs/file.c index 7893ea161d77..7bb2cddbd82b 100644 --- a/fs/file.c +++ b/fs/file.c @@ -693,29 +693,30 @@ static inline void __range_cloexec(struct files_struct *cur_fds, spin_unlock(&cur_fds->file_lock); } -static inline void __range_close(struct files_struct *cur_fds, unsigned int fd, +static inline void __range_close(struct files_struct *files, unsigned int fd, unsigned int max_fd) { + struct file *file; unsigned n; - rcu_read_lock(); - n = last_fd(files_fdtable(cur_fds)); - rcu_read_unlock(); + spin_lock(&files->file_lock); + n = last_fd(files_fdtable(files)); max_fd = min(max_fd, n); - while (fd <= max_fd) { - struct file *file; - - spin_lock(&cur_fds->file_lock); - file = pick_file(cur_fds, fd++); - spin_unlock(&cur_fds->file_lock); - + for (; fd <= max_fd; fd++) { + file = pick_file(files, fd); if (file) { - /* found a valid file to close */ - filp_close(file, cur_fds); + spin_unlock(&files->file_lock); + filp_close(file, files); cond_resched(); + spin_lock(&files->file_lock); + } else if (need_resched()) { + spin_unlock(&files->file_lock); + cond_resched(); + spin_lock(&files->file_lock); } } + spin_unlock(&files->file_lock); } /** -- cgit v1.2.3 From 021a160abf62c19aff36c920566efb4f690e964a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 8 Aug 2023 19:26:35 +0200 Subject: fs: use __fput_sync in close(2) close(2) is a special case which guarantees a shallow kernel stack, making delegation to task_work machinery unnecessary. Said delegation is problematic as it involves atomic ops and interrupt masking trips, none of which are cheap on x86-64. Forcing close(2) to do it looks like an oversight in the original work. Moreover presence of CONFIG_RSEQ adds an additional overhead as fput() -> task_work_add(..., TWA_RESUME) -> set_notify_resume() makes the thread returning to userspace land in resume_user_mode_work(), where rseq_handle_notify_resume takes a SMAP round-trip if rseq is enabled for the thread (and it is by default with contemporary glibc). Sample result when benchmarking open1_processes -t 1 from will-it-scale (that's an open + close loop) + tmpfs on /tmp, running on the Sapphire Rapid CPU (ops/s): stock+RSEQ: 1329857 stock-RSEQ: 1421667 (+7%) patched: 1523521 (+14.5% / +7%) (with / without rseq) Patched result is the same regardless of rseq as the codepath is avoided. Signed-off-by: Linus Torvalds Signed-off-by: Christian Brauner --- fs/file_table.c | 5 +---- fs/open.c | 27 ++++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index fc7d677ff5ad..ee21b3da9d08 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -461,11 +461,8 @@ void fput(struct file *file) */ void __fput_sync(struct file *file) { - if (atomic_long_dec_and_test(&file->f_count)) { - struct task_struct *task = current; - BUG_ON(!(task->flags & PF_KTHREAD)); + if (atomic_long_dec_and_test(&file->f_count)) __fput(file); - } } EXPORT_SYMBOL(fput); diff --git a/fs/open.c b/fs/open.c index 0c55c8e7f837..455fdddbe9aa 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1503,7 +1503,7 @@ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) * "id" is the POSIX thread ID. We use the * files pointer for this.. */ -int filp_close(struct file *filp, fl_owner_t id) +static int filp_flush(struct file *filp, fl_owner_t id) { int retval = 0; @@ -1520,10 +1520,18 @@ int filp_close(struct file *filp, fl_owner_t id) dnotify_flush(filp, id); locks_remove_posix(filp, id); } - fput(filp); return retval; } +int filp_close(struct file *filp, fl_owner_t id) +{ + int retval; + + retval = filp_flush(filp, id); + fput(filp); + + return retval; +} EXPORT_SYMBOL(filp_close); /* @@ -1533,7 +1541,20 @@ EXPORT_SYMBOL(filp_close); */ SYSCALL_DEFINE1(close, unsigned int, fd) { - int retval = close_fd(fd); + int retval; + struct file *file; + + file = close_fd_get_file(fd); + if (!file) + return -EBADF; + + retval = filp_flush(file, current->files); + + /* + * We're returning to user space. Don't bother + * with any delayed fput() cases. + */ + __fput_sync(file); /* can't restart close syscall because file table entry was cleared */ if (unlikely(retval == -ERESTARTSYS || -- cgit v1.2.3 From 4352b8cd66e22952210e8205312d2b9ecd70be54 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Aug 2023 09:17:04 -0700 Subject: fs: unexport d_genocide d_genocide is only used by built-in code. Signed-off-by: Christoph Hellwig Message-Id: <20230808161704.1099680-1-hch@lst.de> Signed-off-by: Christian Brauner --- fs/dcache.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 52e6d5fdab6b..3aaf2c7ba110 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3247,8 +3247,6 @@ void d_genocide(struct dentry *parent) d_walk(parent, parent, d_genocide_kill); } -EXPORT_SYMBOL(d_genocide); - void d_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; -- cgit v1.2.3 From d80a8f1b58c2bc8d7c6bfb65401ea4f7ec8cddc2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 8 Aug 2023 07:34:20 -0400 Subject: vfs, security: Fix automount superblock LSM init problem, preventing NFS sb sharing When NFS superblocks are created by automounting, their LSM parameters aren't set in the fs_context struct prior to sget_fc() being called, leading to failure to match existing superblocks. This bug leads to messages like the following appearing in dmesg when fscache is enabled: NFS: Cache volume key already in use (nfs,4.2,2,108,106a8c0,1,,,,100000,100000,2ee,3a98,1d4c,3a98,1) Fix this by adding a new LSM hook to load fc->security for submount creation. Signed-off-by: David Howells Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/165962680944.3334508.6610023900349142034.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165962729225.3357250.14350728846471527137.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/165970659095.2812394.6868894171102318796.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/166133579016.3678898.6283195019480567275.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/217595.1662033775@warthog.procyon.org.uk/ # v5 Fixes: 9bc61ab18b1d ("vfs: Introduce fs_context, switch vfs_kern_mount() to it.") Fixes: 779df6a5480f ("NFS: Ensure security label is set for root inode") Tested-by: Jeff Layton Acked-by: Casey Schaufler Acked-by: "Christian Brauner (Microsoft)" Acked-by: Paul Moore Reviewed-by: Jeff Layton Message-Id: <20230808-master-v9-1-e0ecde888221@kernel.org> Signed-off-by: Christian Brauner --- fs/fs_context.c | 23 ++++++++++++++++++- include/linux/lsm_hook_defs.h | 1 + include/linux/security.h | 6 +++++ security/security.c | 14 ++++++++++++ security/selinux/hooks.c | 22 +++++++++++++++++++ security/smack/smack_lsm.c | 51 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 116 insertions(+), 1 deletion(-) diff --git a/fs/fs_context.c b/fs/fs_context.c index 851214d1d013..375023e40161 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -315,10 +315,31 @@ struct fs_context *fs_context_for_reconfigure(struct dentry *dentry, } EXPORT_SYMBOL(fs_context_for_reconfigure); +/** + * fs_context_for_submount: allocate a new fs_context for a submount + * @type: file_system_type of the new context + * @reference: reference dentry from which to copy relevant info + * + * Allocate a new fs_context suitable for a submount. This also ensures that + * the fc->security object is inherited from @reference (if needed). + */ struct fs_context *fs_context_for_submount(struct file_system_type *type, struct dentry *reference) { - return alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT); + struct fs_context *fc; + int ret; + + fc = alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT); + if (IS_ERR(fc)) + return fc; + + ret = security_fs_context_submount(fc, reference->d_sb); + if (ret) { + put_fs_context(fc); + return ERR_PTR(ret); + } + + return fc; } EXPORT_SYMBOL(fs_context_for_submount); diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 7308a1a7599b..af796986baee 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -54,6 +54,7 @@ LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, struct file *f LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm) LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, struct linux_binprm *bprm) LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, struct linux_binprm *bprm) +LSM_HOOK(int, 0, fs_context_submount, struct fs_context *fc, struct super_block *reference) LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc, struct fs_context *src_sc) LSM_HOOK(int, -ENOPARAM, fs_context_parse_param, struct fs_context *fc, diff --git a/include/linux/security.h b/include/linux/security.h index 32828502f09e..bac98ea18f78 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -293,6 +293,7 @@ int security_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file); int security_bprm_check(struct linux_binprm *bprm); void security_bprm_committing_creds(struct linux_binprm *bprm); void security_bprm_committed_creds(struct linux_binprm *bprm); +int security_fs_context_submount(struct fs_context *fc, struct super_block *reference); int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc); int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param); int security_sb_alloc(struct super_block *sb); @@ -629,6 +630,11 @@ static inline void security_bprm_committed_creds(struct linux_binprm *bprm) { } +static inline int security_fs_context_submount(struct fs_context *fc, + struct super_block *reference) +{ + return 0; +} static inline int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) { diff --git a/security/security.c b/security/security.c index b720424ca37d..549104a447e3 100644 --- a/security/security.c +++ b/security/security.c @@ -1138,6 +1138,20 @@ void security_bprm_committed_creds(struct linux_binprm *bprm) call_void_hook(bprm_committed_creds, bprm); } +/** + * security_fs_context_submount() - Initialise fc->security + * @fc: new filesystem context + * @reference: dentry reference for submount/remount + * + * Fill out the ->security field for a new fs_context. + * + * Return: Returns 0 on success or negative error code on failure. + */ +int security_fs_context_submount(struct fs_context *fc, struct super_block *reference) +{ + return call_int_hook(fs_context_submount, 0, fc, reference); +} + /** * security_fs_context_dup() - Duplicate a fs_context LSM blob * @fc: destination filesystem context diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index d06e350fedee..afd663744041 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2745,6 +2745,27 @@ static int selinux_umount(struct vfsmount *mnt, int flags) FILESYSTEM__UNMOUNT, NULL); } +static int selinux_fs_context_submount(struct fs_context *fc, + struct super_block *reference) +{ + const struct superblock_security_struct *sbsec; + struct selinux_mnt_opts *opts; + + opts = kzalloc(sizeof(*opts), GFP_KERNEL); + if (!opts) + return -ENOMEM; + + sbsec = selinux_superblock(reference); + if (sbsec->flags & FSCONTEXT_MNT) + opts->fscontext_sid = sbsec->sid; + if (sbsec->flags & CONTEXT_MNT) + opts->context_sid = sbsec->mntpoint_sid; + if (sbsec->flags & DEFCONTEXT_MNT) + opts->defcontext_sid = sbsec->def_sid; + fc->security = opts; + return 0; +} + static int selinux_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) { @@ -7182,6 +7203,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = { /* * PUT "CLONING" (ACCESSING + ALLOCATING) HOOKS HERE */ + LSM_HOOK_INIT(fs_context_submount, selinux_fs_context_submount), LSM_HOOK_INIT(fs_context_dup, selinux_fs_context_dup), LSM_HOOK_INIT(fs_context_parse_param, selinux_fs_context_parse_param), LSM_HOOK_INIT(sb_eat_lsm_opts, selinux_sb_eat_lsm_opts), diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 6e270cf3fd30..a8201cf22f20 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -614,6 +614,56 @@ out_opt_err: return -EINVAL; } +/** + * smack_fs_context_submount - Initialise security data for a filesystem context + * @fc: The filesystem context. + * @reference: reference superblock + * + * Returns 0 on success or -ENOMEM on error. + */ +static int smack_fs_context_submount(struct fs_context *fc, + struct super_block *reference) +{ + struct superblock_smack *sbsp; + struct smack_mnt_opts *ctx; + struct inode_smack *isp; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + fc->security = ctx; + + sbsp = smack_superblock(reference); + isp = smack_inode(reference->s_root->d_inode); + + if (sbsp->smk_default) { + ctx->fsdefault = kstrdup(sbsp->smk_default->smk_known, GFP_KERNEL); + if (!ctx->fsdefault) + return -ENOMEM; + } + + if (sbsp->smk_floor) { + ctx->fsfloor = kstrdup(sbsp->smk_floor->smk_known, GFP_KERNEL); + if (!ctx->fsfloor) + return -ENOMEM; + } + + if (sbsp->smk_hat) { + ctx->fshat = kstrdup(sbsp->smk_hat->smk_known, GFP_KERNEL); + if (!ctx->fshat) + return -ENOMEM; + } + + if (isp->smk_flags & SMK_INODE_TRANSMUTE) { + if (sbsp->smk_root) { + ctx->fstransmute = kstrdup(sbsp->smk_root->smk_known, GFP_KERNEL); + if (!ctx->fstransmute) + return -ENOMEM; + } + } + return 0; +} + /** * smack_fs_context_dup - Duplicate the security data on fs_context duplication * @fc: The new filesystem context. @@ -4876,6 +4926,7 @@ static struct security_hook_list smack_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme), LSM_HOOK_INIT(syslog, smack_syslog), + LSM_HOOK_INIT(fs_context_submount, smack_fs_context_submount), LSM_HOOK_INIT(fs_context_dup, smack_fs_context_dup), LSM_HOOK_INIT(fs_context_parse_param, smack_fs_context_parse_param), -- cgit v1.2.3 From 8a237adf213d73671992266eff7437f1b9f40567 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Jun 2023 17:08:15 -0300 Subject: fs/buffer.c: disable per-CPU buffer_head cache for isolated CPUs For certain types of applications (for example PLC software or RAN processing), upon occurrence of an event, it is necessary to complete a certain task in a maximum amount of time (deadline). One way to express this requirement is with a pair of numbers, deadline time and execution time, where: * deadline time: length of time between event and deadline. * execution time: length of time it takes for processing of event to occur on a particular hardware platform (uninterrupted). The particular values depend on use-case. For the case where the realtime application executes in a virtualized guest, an IPI which must be serviced in the host will cause the following sequence of events: 1) VM-exit 2) execution of IPI (and function call) 3) VM-entry Which causes an excess of 50us latency as observed by cyclictest (this violates the latency requirement of vRAN application with 1ms TTI, for example). invalidate_bh_lrus calls an IPI on each CPU that has non empty per-CPU cache: on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1); The performance when using the per-CPU LRU cache is as follows: 42 ns per __find_get_block 68 ns per __find_get_block_slow Given that the main use cases for latency sensitive applications do not involve block I/O (data necessary for program operation is locked in RAM), disable per-CPU buffer_head caches for isolated CPUs. Signed-off-by: Marcelo Tosatti Acked-by: Frederic Weisbecker Message-Id: Signed-off-by: Christian Brauner --- fs/buffer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index bd091329026c..3aa2380a75e1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -49,6 +49,7 @@ #include #include #include +#include #include "internal.h" @@ -1352,7 +1353,7 @@ static void bh_lru_install(struct buffer_head *bh) * failing page migration. * Skip putting upcoming bh into bh_lru until migration is done. */ - if (lru_cache_disabled()) { + if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) { bh_lru_unlock(); return; } @@ -1382,6 +1383,10 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) check_irqs_on(); bh_lru_lock(); + if (cpu_is_isolated(smp_processor_id())) { + bh_lru_unlock(); + return NULL; + } for (i = 0; i < BH_LRU_SIZE; i++) { struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]); -- cgit v1.2.3 From d220efa20bba1ecc3fba3b14b2bf404a1557acd0 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 25 Jun 2023 20:20:47 +0200 Subject: docs: filesystems: idmappings: clarify from where idmappings are taken Let's clarify from where we take idmapping of each type: - caller - filesystem - mount Cc: Jonathan Corbet Cc: Christian Brauner Cc: linux-fsdevel@vger.kernel.org Cc: linux-doc@vger.kernel.org Signed-off-by: Alexander Mikhalitsyn Message-Id: <20230625182047.26854-1-aleksandr.mikhalitsyn@canonical.com> Signed-off-by: Christian Brauner --- Documentation/filesystems/idmappings.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/filesystems/idmappings.rst b/Documentation/filesystems/idmappings.rst index ad6d21640576..f3d168c9f0b9 100644 --- a/Documentation/filesystems/idmappings.rst +++ b/Documentation/filesystems/idmappings.rst @@ -373,6 +373,13 @@ kernel maps the caller's userspace id down into a kernel id according to the caller's idmapping and then maps that kernel id up according to the filesystem's idmapping. +From the implementation point it's worth mentioning how idmappings are represented. +All idmappings are taken from the corresponding user namespace. + + - caller's idmapping (usually taken from ``current_user_ns()``) + - filesystem's idmapping (``sb->s_user_ns``) + - mount's idmapping (``mnt_idmap(vfsmnt)``) + Let's see some examples with caller/filesystem idmapping but without mount idmappings. This will exhibit some problems we can hit. After that we will revisit/reconsider these examples, this time using mount idmappings, to see how -- cgit v1.2.3 From 89cbd4c036ba44eadc495820e72c5c36273abb76 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 11 Aug 2023 09:43:59 +0800 Subject: fs: Fix one kernel-doc comment Fix one kernel-doc comment to silence the warning: fs/read_write.c:88: warning: Function parameter or member 'maxsize' not described in 'generic_file_llseek_size' Signed-off-by: Yang Li Reviewed-by: Randy Dunlap Message-Id: <20230811014359.4960-1-yang.lee@linux.alibaba.com> Signed-off-by: Christian Brauner --- fs/read_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/read_write.c b/fs/read_write.c index b07de77ef126..4771701c896b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -71,7 +71,7 @@ EXPORT_SYMBOL(vfs_setpos); * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek - * @size: max size of this file in file system + * @maxsize: max size of this file in file system * @eof: offset used for SEEK_END position * * This is a variant of generic_file_llseek that allows passing in a custom -- cgit v1.2.3 From 45e0d4b95b65fedd18aa3e1b6571e16a2e6e0aa9 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Fri, 11 Aug 2023 21:48:14 +0200 Subject: vfs: fix up the assert in i_readcount_dec Drops a race where 2 threads could spot a positive value and both proceed to dec to -1, without reporting anything. Signed-off-by: Mateusz Guzik Message-Id: <20230811194814.1612336-1-mjguzik@gmail.com> Signed-off-by: Christian Brauner --- include/linux/fs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 11055d00fab8..57e2a0a9eea1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2607,8 +2607,7 @@ static inline bool inode_is_open_for_write(const struct inode *inode) #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) static inline void i_readcount_dec(struct inode *inode) { - BUG_ON(!atomic_read(&inode->i_readcount)); - atomic_dec(&inode->i_readcount); + BUG_ON(atomic_dec_return(&inode->i_readcount) < 0); } static inline void i_readcount_inc(struct inode *inode) { -- cgit v1.2.3 From 45071e1c2897138f337000e12f2393e769f125b3 Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Sun, 13 Aug 2023 10:23:49 +0200 Subject: init: Add support for rootwait timeout parameter Add an optional timeout arg to 'rootwait' as the maximum time in seconds to wait for the root device to show up before attempting forced mount of the root filesystem. Use case: In case of device mapper usage for the rootfs (e.g. root=/dev/dm-0), if the mapper is not able to create the virtual block for any reason (wrong arguments, bad dm-verity signature, etc), the `rootwait` param causes the kernel to wait forever. It may however be desirable to only wait for a given time and then panic (force mount) to cause device reset. This gives the bootloader a chance to detect the problem and to take some measures, such as marking the booted partition as bad (for A/B case) or entering a recovery mode. In success case, mounting happens as soon as the root device is ready, unlike the existing 'rootdelay' parameter which performs an unconditional pause. Signed-off-by: Loic Poulain Message-Id: <20230813082349.513386-1-loic.poulain@linaro.org> Signed-off-by: Christian Brauner --- Documentation/admin-guide/kernel-parameters.txt | 4 +++ init/do_mounts.c | 38 +++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a1457995fd41..387cf9c2a2c5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5501,6 +5501,10 @@ Useful for devices that are detected asynchronously (e.g. USB and MMC devices). + rootwait= [KNL] Maximum time (in seconds) to wait for root device + to show up before attempting to mount the root + filesystem. + rproc_mem=nn[KMG][@address] [KNL,ARM,CMA] Remoteproc physical memory block. Memory area to be used by remote processor image, diff --git a/init/do_mounts.c b/init/do_mounts.c index 1aa015883519..5dfd30b13f48 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -71,12 +72,37 @@ static int __init rootwait_setup(char *str) { if (*str) return 0; - root_wait = 1; + root_wait = -1; return 1; } __setup("rootwait", rootwait_setup); +static int __init rootwait_timeout_setup(char *str) +{ + int sec; + + if (kstrtoint(str, 0, &sec) || sec < 0) { + pr_warn("ignoring invalid rootwait value\n"); + goto ignore; + } + + if (check_mul_overflow(sec, MSEC_PER_SEC, &root_wait)) { + pr_warn("ignoring excessive rootwait value\n"); + goto ignore; + } + + return 1; + +ignore: + /* Fallback to indefinite wait */ + root_wait = -1; + + return 1; +} + +__setup("rootwait=", rootwait_timeout_setup); + static char * __initdata root_mount_data; static int __init root_data_setup(char *str) { @@ -384,14 +410,22 @@ void __init mount_root(char *root_device_name) /* wait for any asynchronous scanning to complete */ static void __init wait_for_root(char *root_device_name) { + ktime_t end; + if (ROOT_DEV != 0) return; pr_info("Waiting for root device %s...\n", root_device_name); + end = ktime_add_ms(ktime_get_raw(), root_wait); + while (!driver_probe_done() || - early_lookup_bdev(root_device_name, &ROOT_DEV) < 0) + early_lookup_bdev(root_device_name, &ROOT_DEV) < 0) { msleep(5); + if (root_wait > 0 && ktime_after(ktime_get_raw(), end)) + break; + } + async_synchronize_full(); } -- cgit v1.2.3 From b93ec212bcaccf61313ddbf2527319a7585805ee Mon Sep 17 00:00:00 2001 From: "GONG, Ruiqi" Date: Wed, 16 Aug 2023 11:32:10 +0800 Subject: doc: idmappings: fix an error and rephrase a paragraph If the modified paragraph is referring to the idmapping mentioned in the previous paragraph (i.e. `u0:k10000:r10000`), then it is `u0` that the upper idmapset starts with, not `u1000`. Fix this error and rephrase this paragraph a bit to make this reference more explicit. Reported-by: Wang Lei Signed-off-by: GONG, Ruiqi Message-Id: <20230816033210.914262-1-gongruiqi@huaweicloud.com> Signed-off-by: Christian Brauner --- Documentation/filesystems/idmappings.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/idmappings.rst b/Documentation/filesystems/idmappings.rst index f3d168c9f0b9..d095c5838f94 100644 --- a/Documentation/filesystems/idmappings.rst +++ b/Documentation/filesystems/idmappings.rst @@ -146,9 +146,10 @@ For the rest of this document we will prefix all userspace ids with ``u`` and all kernel ids with ``k``. Ranges of idmappings will be prefixed with ``r``. So an idmapping will be written as ``u0:k10000:r10000``. -For example, the id ``u1000`` is an id in the upper idmapset or "userspace -idmapset" starting with ``u1000``. And it is mapped to ``k11000`` which is a -kernel id in the lower idmapset or "kernel idmapset" starting with ``k10000``. +For example, within this idmapping, the id ``u1000`` is an id in the upper +idmapset or "userspace idmapset" starting with ``u0``. And it is mapped to +``k11000`` which is a kernel id in the lower idmapset or "kernel idmapset" +starting with ``k10000``. A kernel id is always created by an idmapping. Such idmappings are associated with user namespaces. Since we mainly care about how idmappings work we're not -- cgit v1.2.3 From ec05b12634ad007f7a37161ecb7cd243c3b1970c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 18 Aug 2023 21:10:03 +0100 Subject: devpts: Fix kernel-doc warnings This documentation has bit-rotted over time. Signed-off-by: "Matthew Wilcox (Oracle)" Message-Id: <20230818201003.2720257-1-willy@infradead.org> Signed-off-by: Christian Brauner --- fs/devpts/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index fe3db0eda8e4..b31dfd79e474 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -534,12 +534,12 @@ void devpts_kill_index(struct pts_fs_info *fsi, int idx) /** * devpts_pty_new -- create a new inode in /dev/pts/ - * @ptmx_inode: inode of the master - * @device: major+minor of the node to be created + * @fsi: Filesystem info for this instance. * @index: used as a name of the node * @priv: what's given back by devpts_get_priv * - * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill. + * The dentry for the created inode is returned. + * Remove it from /dev/pts/ with devpts_pty_kill(). */ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) { @@ -580,7 +580,7 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) /** * devpts_get_priv -- get private data for a slave - * @pts_inode: inode of the slave + * @dentry: dentry of the slave * * Returns whatever was passed as priv in devpts_pty_new for a given inode. */ @@ -593,7 +593,7 @@ void *devpts_get_priv(struct dentry *dentry) /** * devpts_pty_kill -- remove inode form /dev/pts/ - * @inode: inode of the slave to be removed + * @dentry: dentry of the slave to be removed * * This is an inverse operation of devpts_pty_new. */ -- cgit v1.2.3 From 35931eb3945b8d38c31f8e956aee3cf31c52121b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 18 Aug 2023 21:08:24 +0100 Subject: fs: Fix kernel-doc warnings These have a variety of causes and a corresponding variety of solutions. Signed-off-by: "Matthew Wilcox (Oracle)" Message-Id: <20230818200824.2720007-1-willy@infradead.org> Signed-off-by: Christian Brauner --- fs/file.c | 3 ++- fs/fs_context.c | 12 +++++++++--- fs/ioctl.c | 10 +++++++--- fs/kernel_read_file.c | 12 ++++++------ fs/namei.c | 3 +++ fs/open.c | 4 ++-- 6 files changed, 29 insertions(+), 15 deletions(-) diff --git a/fs/file.c b/fs/file.c index 7bb2cddbd82b..1ddf645ca691 100644 --- a/fs/file.c +++ b/fs/file.c @@ -668,7 +668,7 @@ EXPORT_SYMBOL(close_fd); /* for ksys_close() */ /** * last_fd - return last valid index into fd table - * @cur_fds: files struct + * @fdt: File descriptor table. * * Context: Either rcu read lock or files_lock must be held. * @@ -724,6 +724,7 @@ static inline void __range_close(struct files_struct *files, unsigned int fd, * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close + * @flags: CLOSE_RANGE flags. * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. diff --git a/fs/fs_context.c b/fs/fs_context.c index 375023e40161..a48a69caddce 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -162,6 +162,10 @@ EXPORT_SYMBOL(vfs_parse_fs_param); /** * vfs_parse_fs_string - Convenience function to just parse a string. + * @fc: Filesystem context. + * @key: Parameter name. + * @value: Default value. + * @v_size: Maximum number of bytes in the value. */ int vfs_parse_fs_string(struct fs_context *fc, const char *key, const char *value, size_t v_size) @@ -189,7 +193,7 @@ EXPORT_SYMBOL(vfs_parse_fs_string); /** * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data - * @ctx: The superblock configuration to fill in. + * @fc: The superblock configuration to fill in. * @data: The data to parse * * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be @@ -354,7 +358,7 @@ void fc_drop_locked(struct fs_context *fc) static void legacy_fs_context_free(struct fs_context *fc); /** - * vfs_dup_fc_config: Duplicate a filesystem context. + * vfs_dup_fs_context - Duplicate a filesystem context. * @src_fc: The context to copy. */ struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) @@ -400,7 +404,9 @@ EXPORT_SYMBOL(vfs_dup_fs_context); /** * logfc - Log a message to a filesystem context - * @fc: The filesystem context to log to. + * @log: The filesystem context to log to, or NULL to use printk. + * @prefix: A string to prefix the output with, or NULL. + * @level: 'w' for a warning, 'e' for an error. Anything else is a notice. * @fmt: The format of the buffer. */ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...) diff --git a/fs/ioctl.c b/fs/ioctl.c index 5b2481cd4750..d413e0b8f6c2 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -109,9 +109,6 @@ static int ioctl_fibmap(struct file *filp, int __user *p) * Returns 0 on success, -errno on error, 1 if this was the last * extent that will fit in user array. */ -#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) -#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) -#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, u64 phys, u64 len, u32 flags) { @@ -127,6 +124,10 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) return 1; +#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) +#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) +#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) + if (flags & SET_UNKNOWN_FLAGS) flags |= FIEMAP_EXTENT_UNKNOWN; if (flags & SET_NO_UNMOUNTED_IO_FLAGS) @@ -877,6 +878,9 @@ out: #ifdef CONFIG_COMPAT /** * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation + * @file: The file to operate on. + * @cmd: The ioctl command number. + * @arg: The argument to the ioctl. * * This is not normally called as a function, but instead set in struct * file_operations as diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index 5d826274570c..c429c42a6867 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -8,16 +8,16 @@ /** * kernel_read_file() - read file contents into a kernel buffer * - * @file file to read from - * @offset where to start reading from (see below). - * @buf pointer to a "void *" buffer for reading into (if + * @file: file to read from + * @offset: where to start reading from (see below). + * @buf: pointer to a "void *" buffer for reading into (if * *@buf is NULL, a buffer will be allocated, and * @buf_size will be ignored) - * @buf_size size of buf, if already allocated. If @buf not + * @buf_size: size of buf, if already allocated. If @buf not * allocated, this is the largest size to allocate. - * @file_size if non-NULL, the full size of @file will be + * @file_size: if non-NULL, the full size of @file will be * written here. - * @id the kernel_read_file_id identifying the type of + * @id: the kernel_read_file_id identifying the type of * file contents being read (for LSMs to examine) * * @offset must be 0 unless both @buf and @file_size are non-NULL diff --git a/fs/namei.c b/fs/namei.c index 2bae29ea52ff..567ee547492b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -643,6 +643,8 @@ static bool nd_alloc_stack(struct nameidata *nd) /** * path_connected - Verify that a dentry is below mnt.mnt_root + * @mnt: The mountpoint to check. + * @dentry: The dentry to check. * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected. @@ -1083,6 +1085,7 @@ fs_initcall(init_fs_namei_sysctls); /** * may_follow_link - Check symlink following for unsafe situations * @nd: nameidata pathwalk data + * @inode: Used for idmapping. * * In the case of the sysctl_protected_symlinks sysctl being enabled, * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is diff --git a/fs/open.c b/fs/open.c index 455fdddbe9aa..24c1ee83a4b6 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1150,7 +1150,7 @@ EXPORT_SYMBOL_GPL(kernel_file_open); * backing_file_open - open a backing file for kernel internal use * @path: path of the file to open * @flags: open flags - * @path: path of the backing file + * @real_path: path of the backing file * @cred: credentials for open * * Open a backing file for a stackable filesystem (e.g., overlayfs). @@ -1567,7 +1567,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd) } /** - * close_range() - Close all file descriptors in a given range. + * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close -- cgit v1.2.3 From fbaa530e28afd203b0dc6605c7ecc584952bd606 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 18 Aug 2023 15:45:56 +0100 Subject: fs/pipe: remove redundant initialization of pointer buf The pointer buf is being initializated with a value that is never read, it is being re-assigned later on at the pointer where it is being used. The initialization is redundant and can be removed. Cleans up clang scan build warning: fs/pipe.c:492:24: warning: Value stored to 'buf' during its initialization is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Message-Id: <20230818144556.1208082-1-colin.i.king@gmail.com> Signed-off-by: Christian Brauner --- fs/pipe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/pipe.c b/fs/pipe.c index a388612f0f86..34b7bf4d7adb 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -489,7 +489,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) head = pipe->head; if (!pipe_full(head, pipe->tail, pipe->max_usage)) { unsigned int mask = pipe->ring_size - 1; - struct pipe_buffer *buf = &pipe->bufs[head & mask]; + struct pipe_buffer *buf; struct page *page = pipe->tmp_page; int copied; -- cgit v1.2.3 From 8c8e7dba1032c6054540ab6f59ce3f3b9f3c6105 Mon Sep 17 00:00:00 2001 From: Anh Tuan Phan Date: Thu, 17 Aug 2023 23:31:42 +0700 Subject: fs/dcache: Replace printk and WARN_ON by WARN Use WARN instead of printk + WARN_ON as reported from coccinelle: ./fs/dcache.c:1667:1-7: SUGGESTION: printk + WARN_ON can be just WARN Signed-off-by: Anh Tuan Phan Message-Id: <20230817163142.117706-1-tuananhlfc@gmail.com> Signed-off-by: Christian Brauner --- fs/dcache.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 3aaf2c7ba110..25ac74d30bff 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1664,7 +1664,7 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) if (dentry == _data && dentry->d_lockref.count == 1) return D_WALK_CONTINUE; - printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} " + WARN(1, "BUG: Dentry %p{i=%lx,n=%pd} " " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? @@ -1673,7 +1673,6 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); - WARN_ON(1); return D_WALK_CONTINUE; } -- cgit v1.2.3 From 5522d9f7b2e6d21dfa361cc498719b7f5e736b57 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 21 Aug 2023 15:13:22 +0100 Subject: libfs: Convert simple_write_begin and simple_write_end to use a folio Remove a number of implicit calls to compound_head() and various calls to compatibility functions. This is not sufficient to enable support for large folios; generic_perform_write() must be converted first. Signed-off-by: "Matthew Wilcox (Oracle)" Message-Id: <20230821141322.2535459-1-willy@infradead.org> Signed-off-by: Christian Brauner --- fs/libfs.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 0ea7fedb0b7a..d80374ede638 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -548,21 +548,20 @@ int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { - struct page *page; - pgoff_t index; + struct folio *folio; - index = pos >> PAGE_SHIFT; + folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; - - *pagep = page; + *pagep = &folio->page; - if (!PageUptodate(page) && (len != PAGE_SIZE)) { - unsigned from = pos & (PAGE_SIZE - 1); + if (!folio_test_uptodate(folio) && (len != folio_size(folio))) { + size_t from = offset_in_folio(folio, pos); - zero_user_segments(page, 0, from, from + len, PAGE_SIZE); + folio_zero_segments(folio, 0, from, + from + len, folio_size(folio)); } return 0; } @@ -594,17 +593,18 @@ static int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - struct inode *inode = page->mapping->host; + struct folio *folio = page_folio(page); + struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; - /* zero the stale part of the page if we did a short copy */ - if (!PageUptodate(page)) { + /* zero the stale part of the folio if we did a short copy */ + if (!folio_test_uptodate(folio)) { if (copied < len) { - unsigned from = pos & (PAGE_SIZE - 1); + size_t from = offset_in_folio(folio, pos); - zero_user(page, from + copied, len - copied); + folio_zero_range(folio, from + copied, len - copied); } - SetPageUptodate(page); + folio_mark_uptodate(folio); } /* * No need to use i_size_read() here, the i_size @@ -613,9 +613,9 @@ static int simple_write_end(struct file *file, struct address_space *mapping, if (last_pos > inode->i_size) i_size_write(inode, last_pos); - set_page_dirty(page); - unlock_page(page); - put_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); return copied; } -- cgit v1.2.3 From 781ca6027ed7b83b97184a0606b703794da1e4be Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 21 Aug 2023 15:15:41 +0100 Subject: splice: Convert page_cache_pipe_buf_confirm() to use a folio Convert buf->page to a folio once instead of five times. There's only one uptodate bit per folio, not per page, so we lose nothing here. Signed-off-by: "Matthew Wilcox (Oracle)" Message-Id: <20230821141541.2535953-1-willy@infradead.org> Signed-off-by: Christian Brauner --- fs/splice.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index 378fedb392ae..fa4f51055e9f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -120,17 +120,17 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - struct page *page = buf->page; + struct folio *folio = page_folio(buf->page); int err; - if (!PageUptodate(page)) { - lock_page(page); + if (!folio_test_uptodate(folio)) { + folio_lock(folio); /* - * Page got truncated/unhashed. This will cause a 0-byte + * Folio got truncated/unhashed. This will cause a 0-byte * splice, if this is the first page. */ - if (!page->mapping) { + if (!folio->mapping) { err = -ENODATA; goto error; } @@ -138,20 +138,18 @@ static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, /* * Uh oh, read-error from disk. */ - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { err = -EIO; goto error; } - /* - * Page is ok afterall, we are done. - */ - unlock_page(page); + /* Folio is ok after all, we are done */ + folio_unlock(folio); } return 0; error: - unlock_page(page); + folio_unlock(folio); return err; } -- cgit v1.2.3 From a370167fe526123637965f60859a9f1f3e1a58b7 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 17 Aug 2023 17:13:31 +0300 Subject: io_uring: rename kiocb_end_write() local helper This helper does not take a kiocb as input and we want to create a common helper by that name that takes a kiocb as input. Signed-off-by: Amir Goldstein Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Message-Id: <20230817141337.1025891-2-amir73il@gmail.com> Signed-off-by: Christian Brauner --- io_uring/rw.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 1bce2208b65c..749ebd565839 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -220,7 +220,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req) } #endif -static void kiocb_end_write(struct io_kiocb *req) +static void io_req_end_write(struct io_kiocb *req) { /* * Tell lockdep we inherited freeze protection from submission @@ -243,7 +243,7 @@ static void io_req_io_end(struct io_kiocb *req) struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); if (rw->kiocb.ki_flags & IOCB_WRITE) { - kiocb_end_write(req); + io_req_end_write(req); fsnotify_modify(req->file); } else { fsnotify_access(req->file); @@ -313,7 +313,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) struct io_kiocb *req = cmd_to_io_kiocb(rw); if (kiocb->ki_flags & IOCB_WRITE) - kiocb_end_write(req); + io_req_end_write(req); if (unlikely(res != req->cqe.res)) { if (res == -EAGAIN && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; @@ -961,7 +961,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) io->bytes_done += ret2; if (kiocb->ki_flags & IOCB_WRITE) - kiocb_end_write(req); + io_req_end_write(req); return ret ? ret : -EAGAIN; } done: @@ -972,7 +972,7 @@ copy_iov: ret = io_setup_async_rw(req, iovec, s, false); if (!ret) { if (kiocb->ki_flags & IOCB_WRITE) - kiocb_end_write(req); + io_req_end_write(req); return -EAGAIN; } return ret; -- cgit v1.2.3 From f6c05b9e5d9bb8f54c6d00b8070fa93619f62a5e Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 17 Aug 2023 17:13:32 +0300 Subject: fs: add kerneldoc to file_{start,end}_write() helpers and use sb_end_write() instead of open coded version. Signed-off-by: Amir Goldstein Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Message-Id: <20230817141337.1025891-3-amir73il@gmail.com> Signed-off-by: Christian Brauner --- include/linux/fs.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 57e2a0a9eea1..660ff7178caa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2539,6 +2539,13 @@ static inline bool inode_wrong_type(const struct inode *inode, umode_t mode) return (inode->i_mode ^ mode) & S_IFMT; } +/** + * file_start_write - get write access to a superblock for regular file io + * @file: the file we want to write to + * + * This is a variant of sb_start_write() which is a noop on non-regualr file. + * Should be matched with a call to file_end_write(). + */ static inline void file_start_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) @@ -2553,11 +2560,17 @@ static inline bool file_start_write_trylock(struct file *file) return sb_start_write_trylock(file_inode(file)->i_sb); } +/** + * file_end_write - drop write access to a superblock of a regular file + * @file: the file we wrote to + * + * Should be matched with a call to file_start_write(). + */ static inline void file_end_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; - __sb_end_write(file_inode(file)->i_sb, SB_FREEZE_WRITE); + sb_end_write(file_inode(file)->i_sb); } /* -- cgit v1.2.3 From ed0360bbab72b829437b67ebb2f9cfac19f59dfe Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 17 Aug 2023 17:13:33 +0300 Subject: fs: create kiocb_{start,end}_write() helpers aio, io_uring, cachefiles and overlayfs, all open code an ugly variant of file_{start,end}_write() to silence lockdep warnings. Create helpers for this lockdep dance so we can use the helpers in all the callers. Suggested-by: Jan Kara Signed-off-by: Amir Goldstein Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Message-Id: <20230817141337.1025891-4-amir73il@gmail.com> Signed-off-by: Christian Brauner --- include/linux/fs.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index 660ff7178caa..1eac9545a7b4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2573,6 +2573,42 @@ static inline void file_end_write(struct file *file) sb_end_write(file_inode(file)->i_sb); } +/** + * kiocb_start_write - get write access to a superblock for async file io + * @iocb: the io context we want to submit the write with + * + * This is a variant of sb_start_write() for async io submission. + * Should be matched with a call to kiocb_end_write(). + */ +static inline void kiocb_start_write(struct kiocb *iocb) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + sb_start_write(inode->i_sb); + /* + * Fool lockdep by telling it the lock got released so that it + * doesn't complain about the held lock when we return to userspace. + */ + __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); +} + +/** + * kiocb_end_write - drop write access to a superblock after async file io + * @iocb: the io context we sumbitted the write with + * + * Should be matched with a call to kiocb_start_write(). + */ +static inline void kiocb_end_write(struct kiocb *iocb) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + /* + * Tell lockdep we inherited freeze protection from submission thread. + */ + __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); + sb_end_write(inode->i_sb); +} + /* * This is used for regular files where some users -- especially the * currently executed binary in a process, previously handled via -- cgit v1.2.3 From e484fd73f4bdcb00c2188100c2d84e9f3f5c9f7d Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 17 Aug 2023 17:13:34 +0300 Subject: io_uring: use kiocb_{start,end}_write() helpers Use helpers instead of the open coded dance to silence lockdep warnings. Suggested-by: Jan Kara Signed-off-by: Amir Goldstein Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Message-Id: <20230817141337.1025891-5-amir73il@gmail.com> Signed-off-by: Christian Brauner --- io_uring/rw.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 749ebd565839..9581b90cb459 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -222,15 +222,10 @@ static bool io_rw_should_reissue(struct io_kiocb *req) static void io_req_end_write(struct io_kiocb *req) { - /* - * Tell lockdep we inherited freeze protection from submission - * thread. - */ if (req->flags & REQ_F_ISREG) { - struct super_block *sb = file_inode(req->file)->i_sb; + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - __sb_writers_acquired(sb, SB_FREEZE_WRITE); - sb_end_write(sb); + kiocb_end_write(&rw->kiocb); } } @@ -902,18 +897,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) return ret; } - /* - * Open-code file_start_write here to grab freeze protection, - * which will be released by another thread in - * io_complete_rw(). Fool lockdep by telling it the lock got - * released so that it doesn't complain about the held lock when - * we return to userspace. - */ - if (req->flags & REQ_F_ISREG) { - sb_start_write(file_inode(req->file)->i_sb); - __sb_writers_release(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE); - } + if (req->flags & REQ_F_ISREG) + kiocb_start_write(kiocb); kiocb->ki_flags |= IOCB_WRITE; if (likely(req->file->f_op->write_iter)) -- cgit v1.2.3 From 8c3cfa80fd1ee9cd4b249de197f5ca9bbfae0daf Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 17 Aug 2023 17:13:35 +0300 Subject: aio: use kiocb_{start,end}_write() helpers Use helpers instead of the open coded dance to silence lockdep warnings. Suggested-by: Jan Kara Signed-off-by: Amir Goldstein Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Message-Id: <20230817141337.1025891-6-amir73il@gmail.com> Signed-off-by: Christian Brauner --- fs/aio.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 77e33619de40..b3174da80ff6 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1447,13 +1447,8 @@ static void aio_complete_rw(struct kiocb *kiocb, long res) if (kiocb->ki_flags & IOCB_WRITE) { struct inode *inode = file_inode(kiocb->ki_filp); - /* - * Tell lockdep we inherited freeze protection from submission - * thread. - */ if (S_ISREG(inode->i_mode)) - __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); - file_end_write(kiocb->ki_filp); + kiocb_end_write(kiocb); } iocb->ki_res.res = res; @@ -1581,17 +1576,8 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { - /* - * Open-code file_start_write here to grab freeze protection, - * which will be released by another thread in - * aio_complete_rw(). Fool lockdep by telling it the lock got - * released so that it doesn't complain about the held lock when - * we return to userspace. - */ - if (S_ISREG(file_inode(file)->i_mode)) { - sb_start_write(file_inode(file)->i_sb); - __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); - } + if (S_ISREG(file_inode(file)->i_mode)) + kiocb_start_write(req); req->ki_flags |= IOCB_WRITE; aio_rw_done(req, call_write_iter(file, req, &iter)); } -- cgit v1.2.3 From 8f7371268a4ba83e6bf3b73cf7484a2581f56d7b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 17 Aug 2023 17:13:36 +0300 Subject: ovl: use kiocb_{start,end}_write() helpers Use helpers instead of the open coded dance to silence lockdep warnings. Suggested-by: Jan Kara Signed-off-by: Amir Goldstein Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Message-Id: <20230817141337.1025891-7-amir73il@gmail.com> Signed-off-by: Christian Brauner --- fs/overlayfs/file.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 21245b00722a..e4cc401d334d 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -290,10 +290,7 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) if (iocb->ki_flags & IOCB_WRITE) { struct inode *inode = file_inode(orig_iocb->ki_filp); - /* Actually acquired in ovl_write_iter() */ - __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb, - SB_FREEZE_WRITE); - file_end_write(iocb->ki_filp); + kiocb_end_write(iocb); ovl_copyattr(inode); } @@ -409,10 +406,6 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (!aio_req) goto out; - file_start_write(real.file); - /* Pacify lockdep, same trick as done in aio_write() */ - __sb_writers_release(file_inode(real.file)->i_sb, - SB_FREEZE_WRITE); aio_req->fd = real; real.flags = 0; aio_req->orig_iocb = iocb; @@ -420,6 +413,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) aio_req->iocb.ki_flags = ifl; aio_req->iocb.ki_complete = ovl_aio_rw_complete; refcount_set(&aio_req->ref, 2); + kiocb_start_write(&aio_req->iocb); ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); ovl_aio_put(aio_req); if (ret != -EIOCBQUEUED) -- cgit v1.2.3 From e6fa4c728fb671765291cca3a905986612c06b6e Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 17 Aug 2023 17:13:37 +0300 Subject: cachefiles: use kiocb_{start,end}_write() helpers Use helpers instead of the open coded dance to silence lockdep warnings. Suggested-by: Jan Kara Signed-off-by: Amir Goldstein Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Message-Id: <20230817141337.1025891-8-amir73il@gmail.com> Signed-off-by: Christian Brauner --- fs/cachefiles/io.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 175a25fcade8..009d23cd435b 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -259,9 +259,7 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret) _enter("%ld", ret); - /* Tell lockdep we inherited freeze protection from submission thread */ - __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); - __sb_end_write(inode->i_sb, SB_FREEZE_WRITE); + kiocb_end_write(iocb); if (ret < 0) trace_cachefiles_io_error(object, inode, ret, @@ -286,7 +284,6 @@ int __cachefiles_write(struct cachefiles_object *object, { struct cachefiles_cache *cache; struct cachefiles_kiocb *ki; - struct inode *inode; unsigned int old_nofs; ssize_t ret; size_t len = iov_iter_count(iter); @@ -322,19 +319,12 @@ int __cachefiles_write(struct cachefiles_object *object, ki->iocb.ki_complete = cachefiles_write_complete; atomic_long_add(ki->b_writing, &cache->b_writing); - /* Open-code file_start_write here to grab freeze protection, which - * will be released by another thread in aio_complete_rw(). Fool - * lockdep by telling it the lock got released so that it doesn't - * complain about the held lock when we return to userspace. - */ - inode = file_inode(file); - __sb_start_write(inode->i_sb, SB_FREEZE_WRITE); - __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); + kiocb_start_write(&ki->iocb); get_file(ki->iocb.ki_filp); cachefiles_grab_object(object, cachefiles_obj_get_ioreq); - trace_cachefiles_write(object, inode, ki->iocb.ki_pos, len); + trace_cachefiles_write(object, file_inode(file), ki->iocb.ki_pos, len); old_nofs = memalloc_nofs_save(); ret = cachefiles_inject_write_error(); if (ret == 0) -- cgit v1.2.3