summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/xfs.rst16
-rw-r--r--Documentation/filesystems/porting.rst7
-rw-r--r--arch/alpha/kernel/process.c2
-rw-r--r--arch/arc/kernel/process.c2
-rw-r--r--arch/arm/kernel/process.c2
-rw-r--r--arch/arm64/kernel/process.c2
-rw-r--r--arch/csky/Kconfig24
-rw-r--r--arch/csky/abiv1/inc/abi/cacheflush.h1
-rw-r--r--arch/csky/abiv1/inc/abi/ckmmu.h10
-rw-r--r--arch/csky/abiv1/inc/abi/entry.h1
-rw-r--r--arch/csky/abiv1/inc/abi/page.h1
-rw-r--r--arch/csky/abiv1/inc/abi/pgtable-bits.h40
-rw-r--r--arch/csky/abiv1/inc/abi/reg_ops.h1
-rw-r--r--arch/csky/abiv1/inc/abi/regdef.h6
-rw-r--r--arch/csky/abiv1/inc/abi/string.h1
-rw-r--r--arch/csky/abiv1/inc/abi/switch_context.h1
-rw-r--r--arch/csky/abiv1/inc/abi/vdso.h18
-rw-r--r--arch/csky/abiv2/cacheflush.c3
-rw-r--r--arch/csky/abiv2/inc/abi/ckmmu.h44
-rw-r--r--arch/csky/abiv2/inc/abi/entry.h20
-rw-r--r--arch/csky/abiv2/inc/abi/fpu.h1
-rw-r--r--arch/csky/abiv2/inc/abi/page.h1
-rw-r--r--arch/csky/abiv2/inc/abi/pgtable-bits.h37
-rw-r--r--arch/csky/abiv2/inc/abi/reg_ops.h1
-rw-r--r--arch/csky/abiv2/inc/abi/regdef.h6
-rw-r--r--arch/csky/abiv2/inc/abi/switch_context.h1
-rw-r--r--arch/csky/abiv2/inc/abi/vdso.h20
-rw-r--r--arch/csky/abiv2/sysdep.h1
-rw-r--r--arch/csky/include/asm/addrspace.h1
-rw-r--r--arch/csky/include/asm/atomic.h212
-rw-r--r--arch/csky/include/asm/barrier.h83
-rw-r--r--arch/csky/include/asm/bitops.h1
-rw-r--r--arch/csky/include/asm/bug.h3
-rw-r--r--arch/csky/include/asm/cacheflush.h1
-rw-r--r--arch/csky/include/asm/checksum.h1
-rw-r--r--arch/csky/include/asm/clocksource.h8
-rw-r--r--arch/csky/include/asm/cmpxchg.h27
-rw-r--r--arch/csky/include/asm/elf.h1
-rw-r--r--arch/csky/include/asm/fixmap.h1
-rw-r--r--arch/csky/include/asm/ftrace.h1
-rw-r--r--arch/csky/include/asm/futex.h121
-rw-r--r--arch/csky/include/asm/highmem.h1
-rw-r--r--arch/csky/include/asm/io.h1
-rw-r--r--arch/csky/include/asm/memory.h2
-rw-r--r--arch/csky/include/asm/mmu.h1
-rw-r--r--arch/csky/include/asm/mmu_context.h10
-rw-r--r--arch/csky/include/asm/page.h2
-rw-r--r--arch/csky/include/asm/perf_event.h1
-rw-r--r--arch/csky/include/asm/pgalloc.h3
-rw-r--r--arch/csky/include/asm/pgtable.h80
-rw-r--r--arch/csky/include/asm/processor.h3
-rw-r--r--arch/csky/include/asm/ptrace.h1
-rw-r--r--arch/csky/include/asm/segment.h3
-rw-r--r--arch/csky/include/asm/shmparam.h1
-rw-r--r--arch/csky/include/asm/spinlock.h167
-rw-r--r--arch/csky/include/asm/spinlock_types.h10
-rw-r--r--arch/csky/include/asm/string.h1
-rw-r--r--arch/csky/include/asm/switch_to.h1
-rw-r--r--arch/csky/include/asm/syscalls.h1
-rw-r--r--arch/csky/include/asm/thread_info.h2
-rw-r--r--arch/csky/include/asm/tlb.h1
-rw-r--r--arch/csky/include/asm/tlbflush.h1
-rw-r--r--arch/csky/include/asm/traps.h1
-rw-r--r--arch/csky/include/asm/uaccess.h1
-rw-r--r--arch/csky/include/asm/unistd.h1
-rw-r--r--arch/csky/include/asm/vdso.h21
-rw-r--r--arch/csky/include/asm/vdso/clocksource.h9
-rw-r--r--arch/csky/include/asm/vdso/gettimeofday.h114
-rw-r--r--arch/csky/include/asm/vdso/processor.h12
-rw-r--r--arch/csky/include/asm/vdso/vsyscall.h22
-rw-r--r--arch/csky/include/uapi/asm/byteorder.h1
-rw-r--r--arch/csky/include/uapi/asm/perf_regs.h1
-rw-r--r--arch/csky/include/uapi/asm/ptrace.h1
-rw-r--r--arch/csky/include/uapi/asm/sigcontext.h1
-rw-r--r--arch/csky/include/uapi/asm/unistd.h1
-rw-r--r--arch/csky/kernel/Makefile2
-rw-r--r--arch/csky/kernel/atomic.S24
-rw-r--r--arch/csky/kernel/entry.S106
-rw-r--r--arch/csky/kernel/head.S10
-rw-r--r--arch/csky/kernel/perf_event.c4
-rw-r--r--arch/csky/kernel/probes/simulate-insn.c22
-rw-r--r--arch/csky/kernel/process.c2
-rw-r--r--arch/csky/kernel/ptrace.c128
-rw-r--r--arch/csky/kernel/setup.c18
-rw-r--r--arch/csky/kernel/signal.c4
-rw-r--r--arch/csky/kernel/smp.c7
-rw-r--r--arch/csky/kernel/traps.c10
-rw-r--r--arch/csky/kernel/vdso.c127
-rw-r--r--arch/csky/kernel/vdso/.gitignore4
-rw-r--r--arch/csky/kernel/vdso/Makefile72
-rw-r--r--arch/csky/kernel/vdso/note.S12
-rw-r--r--arch/csky/kernel/vdso/rt_sigreturn.S14
-rwxr-xr-xarch/csky/kernel/vdso/so2s.sh5
-rw-r--r--arch/csky/kernel/vdso/vdso.S16
-rw-r--r--arch/csky/kernel/vdso/vdso.lds.S58
-rw-r--r--arch/csky/kernel/vdso/vgettimeofday.c28
-rw-r--r--arch/csky/kernel/vmlinux.lds.S2
-rw-r--r--arch/csky/mm/fault.c388
-rw-r--r--arch/csky/mm/init.c56
-rw-r--r--arch/csky/mm/tlb.c42
-rw-r--r--arch/h8300/kernel/process.c2
-rw-r--r--arch/hexagon/kernel/process.c2
-rw-r--r--arch/ia64/kernel/process.c2
-rw-r--r--arch/m68k/kernel/process.c2
-rw-r--r--arch/microblaze/kernel/process.c2
-rw-r--r--arch/mips/kernel/process.c2
-rw-r--r--arch/nds32/kernel/process.c2
-rw-r--r--arch/nios2/kernel/process.c2
-rw-r--r--arch/openrisc/kernel/process.c2
-rw-r--r--arch/parisc/kernel/process.c2
-rw-r--r--arch/powerpc/kernel/process.c2
-rw-r--r--arch/riscv/Kconfig1
-rw-r--r--arch/riscv/configs/defconfig1
-rw-r--r--arch/riscv/configs/rv32_defconfig1
-rw-r--r--arch/riscv/kernel/process.c2
-rw-r--r--arch/riscv/mm/init.c21
-rw-r--r--arch/s390/kernel/process.c2
-rw-r--r--arch/sh/boards/mach-landisk/gio.c6
-rw-r--r--arch/sh/kernel/process_32.c2
-rw-r--r--arch/sparc/kernel/process_32.c2
-rw-r--r--arch/sparc/kernel/process_64.c2
-rw-r--r--arch/um/kernel/process.c2
-rw-r--r--arch/x86/kernel/process.c2
-rw-r--r--arch/xtensa/kernel/process.c2
-rw-r--r--block/bfq-iosched.c4
-rw-r--r--block/blk-core.c1
-rw-r--r--block/blk-crypto-fallback.c12
-rw-r--r--block/blk-map.c4
-rw-r--r--block/blk-mq-sched.c6
-rw-r--r--block/blk-mq-sched.h1
-rw-r--r--block/blk-pm.h38
-rw-r--r--block/blk-settings.c12
-rw-r--r--block/blk-sysfs.c7
-rw-r--r--block/bounce.c24
-rw-r--r--block/genhd.c4
-rw-r--r--block/ioctl.c21
-rw-r--r--block/kyber-iosched.c33
-rw-r--r--block/mq-deadline.c4
-rw-r--r--drivers/block/loop.c5
-rw-r--r--drivers/block/nbd.c32
-rw-r--r--drivers/block/xen-blkback/blkback.c4
-rw-r--r--drivers/dax/super.c2
-rw-r--r--drivers/i2c/busses/i2c-brcmstb.c2
-rw-r--r--drivers/i2c/busses/i2c-designware-core.h2
-rw-r--r--drivers/i2c/busses/i2c-designware-master.c2
-rw-r--r--drivers/i2c/busses/i2c-exynos5.c8
-rw-r--r--drivers/i2c/busses/i2c-qcom-geni.c34
-rw-r--r--drivers/md/dm-io.c4
-rw-r--r--drivers/md/dm-log-writes.c10
-rw-r--r--drivers/nvme/target/io-cmd-bdev.c8
-rw-r--r--drivers/nvme/target/passthru.c4
-rw-r--r--drivers/rtc/rtc-m41t80.c4
-rw-r--r--drivers/s390/char/vmur.c2
-rw-r--r--drivers/scsi/aic7xxx/aic79xx.h2
-rw-r--r--drivers/scsi/aic7xxx/aic7xxx.h2
-rw-r--r--drivers/scsi/bnx2fc/Kconfig1
-rw-r--r--drivers/scsi/bnx2i/bnx2i_iscsi.c2
-rw-r--r--drivers/scsi/hpsa.c51
-rw-r--r--drivers/scsi/hpsa_cmd.h2
-rw-r--r--drivers/scsi/isci/request.c8
-rw-r--r--drivers/scsi/iscsi_tcp.c9
-rw-r--r--drivers/scsi/libiscsi.c348
-rw-r--r--drivers/scsi/libiscsi_tcp.c86
-rw-r--r--drivers/scsi/mpt3sas/mpt3sas_base.c58
-rw-r--r--drivers/scsi/mpt3sas/mpt3sas_base.h52
-rw-r--r--drivers/scsi/mpt3sas/mpt3sas_ctl.c67
-rw-r--r--drivers/scsi/mpt3sas/mpt3sas_ctl.h22
-rw-r--r--drivers/scsi/mpt3sas/mpt3sas_scsih.c44
-rw-r--r--drivers/scsi/mpt3sas/mpt3sas_trigger_diag.c38
-rw-r--r--drivers/scsi/pmcraid.h6
-rw-r--r--drivers/scsi/qla2xxx/qla_target.c3
-rw-r--r--drivers/scsi/qla4xxx/ql4_os.c2
-rw-r--r--drivers/scsi/scsi_transport_iscsi.c3
-rw-r--r--drivers/scsi/sd.c14
-rw-r--r--drivers/scsi/sd_zbc.c6
-rw-r--r--drivers/scsi/ufs/ufshcd.c6
-rw-r--r--drivers/scsi/ufs/ufshcd.h2
-rw-r--r--drivers/staging/vme/devices/vme_user.c12
-rw-r--r--drivers/target/sbp/sbp_target.c2
-rw-r--r--drivers/target/target_core_iblock.c9
-rw-r--r--drivers/target/target_core_pr.c15
-rw-r--r--drivers/target/target_core_pscsi.c2
-rw-r--r--drivers/target/target_core_transport.c15
-rw-r--r--drivers/target/target_core_user.c189
-rw-r--r--fs/9p/vfs_inode.c21
-rw-r--r--fs/block_dev.c10
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/erofs/data.c4
-rw-r--r--fs/ext4/readpage.c3
-rw-r--r--fs/f2fs/data.c3
-rw-r--r--fs/f2fs/node.c2
-rw-r--r--fs/gfs2/inode.c4
-rw-r--r--fs/inode.c1
-rw-r--r--fs/io-wq.c621
-rw-r--r--fs/io-wq.h35
-rw-r--r--fs/io_uring.c1043
-rw-r--r--fs/iomap/buffered-io.c11
-rw-r--r--fs/jfs/super.c1
-rw-r--r--fs/mpage.c4
-rw-r--r--fs/namespace.c53
-rw-r--r--fs/nfs/blocklayout/blocklayout.c6
-rw-r--r--fs/proc/self.c7
-rw-r--r--fs/proc/thread_self.c7
-rw-r--r--fs/xfs/libxfs/xfs_btree.c12
-rw-r--r--fs/xfs/xfs_aops.c17
-rw-r--r--fs/xfs/xfs_bio_io.c2
-rw-r--r--fs/xfs/xfs_buf.c4
-rw-r--r--fs/xfs/xfs_extent_busy.c14
-rw-r--r--fs/xfs/xfs_sysctl.c35
-rw-r--r--fs/xfs/xfs_trans.c33
-rw-r--r--fs/xfs/xfs_trans.h30
-rw-r--r--include/linux/bio.h7
-rw-r--r--include/linux/blkdev.h1
-rw-r--r--include/linux/blktrace_api.h4
-rw-r--r--include/linux/cpuhotplug.h1
-rw-r--r--include/linux/io_uring.h22
-rw-r--r--include/linux/mount.h1
-rw-r--r--include/linux/net.h3
-rw-r--r--include/linux/sched.h3
-rw-r--r--include/scsi/libiscsi.h6
-rw-r--r--include/target/target_core_backend.h1
-rw-r--r--include/trace/events/bcache.h10
-rw-r--r--include/trace/events/block.h20
-rw-r--r--include/uapi/linux/io_uring.h1
-rw-r--r--kernel/audit_fsnotify.c2
-rw-r--r--kernel/ptrace.c2
-rw-r--r--kernel/signal.c4
-rw-r--r--kernel/trace/blktrace.c20
-rw-r--r--net/ipv4/af_inet.c1
-rw-r--r--net/ipv6/af_inet6.c1
-rw-r--r--net/socket.c10
231 files changed, 3123 insertions, 2793 deletions
diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst
index 6178153d3320..5422407a96d7 100644
--- a/Documentation/admin-guide/xfs.rst
+++ b/Documentation/admin-guide/xfs.rst
@@ -284,6 +284,9 @@ The following sysctls are available for the XFS filesystem:
removes unused preallocation from clean inodes and releases
the unused space back to the free pool.
+ fs.xfs.speculative_cow_prealloc_lifetime
+ This is an alias for speculative_prealloc_lifetime.
+
fs.xfs.error_level (Min: 0 Default: 3 Max: 11)
A volume knob for error reporting when internal errors occur.
This will generate detailed messages & backtraces for filesystem
@@ -356,12 +359,13 @@ The following sysctls are available for the XFS filesystem:
Deprecated Sysctls
==================
-=========================== ================
- Name Removal Schedule
-=========================== ================
-fs.xfs.irix_sgid_inherit September 2025
-fs.xfs.irix_symlink_mode September 2025
-=========================== ================
+=========================================== ================
+ Name Removal Schedule
+=========================================== ================
+fs.xfs.irix_sgid_inherit September 2025
+fs.xfs.irix_symlink_mode September 2025
+fs.xfs.speculative_cow_prealloc_lifetime September 2025
+=========================================== ================
Removed Sysctls
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 633610253f45..0302035781be 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -883,3 +883,10 @@ For bvec based itererators bio_iov_iter_get_pages() now doesn't copy bvecs but
uses the one provided. Anyone issuing kiocb-I/O should ensure that the bvec and
page references stay until I/O has completed, i.e. until ->ki_complete() has
been called or returned with non -EIOCBQUEUED code.
+
+---
+
+**mandatory**
+
+mnt_want_write_file() can now only be paired with mnt_drop_write_file(),
+whereas previously it could be paired with mnt_drop_write() as well.
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 6c71554206cc..5112ab996394 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -249,7 +249,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
childti->pcb.ksp = (unsigned long) childstack;
childti->pcb.flags = 1; /* set FEN, clear everything else */
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
/* kernel thread */
memset(childstack, 0,
sizeof(struct switch_stack) + sizeof(struct pt_regs));
diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c
index 37f724ad5e39..d838d0d57696 100644
--- a/arch/arc/kernel/process.c
+++ b/arch/arc/kernel/process.c
@@ -191,7 +191,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
childksp[0] = 0; /* fp */
childksp[1] = (unsigned long)ret_from_fork; /* blink */
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(c_regs, 0, sizeof(struct pt_regs));
c_callee->r13 = kthread_arg;
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index ee3aee69e444..5199a2bb4111 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -243,7 +243,7 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
thread->cpu_domain = get_domain();
#endif
- if (likely(!(p->flags & PF_KTHREAD))) {
+ if (likely(!(p->flags & (PF_KTHREAD | PF_IO_WORKER)))) {
*childregs = *current_pt_regs();
childregs->ARM_r0 = 0;
if (stack_start)
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 4cc1ccc8d6ab..325c83b1a24d 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -398,7 +398,7 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
ptrauth_thread_init_kernel(p);
- if (likely(!(p->flags & PF_KTHREAD))) {
+ if (likely(!(p->flags & (PF_KTHREAD | PF_IO_WORKER)))) {
*childregs = *current_pt_regs();
childregs->regs[0] = 0;
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 89dd2fcf38fa..34e91224adc3 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -7,7 +7,7 @@ config CSKY
select ARCH_HAS_SYNC_DMA_FOR_CPU
select ARCH_HAS_SYNC_DMA_FOR_DEVICE
select ARCH_USE_BUILTIN_BSWAP
- select ARCH_USE_QUEUED_RWLOCKS if NR_CPUS>2
+ select ARCH_USE_QUEUED_RWLOCKS
select ARCH_WANT_FRAME_POINTERS if !CPU_CK610
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
select COMMON_CLK
@@ -35,6 +35,9 @@ config CSKY
select GENERIC_IRQ_MULTI_HANDLER
select GENERIC_SCHED_CLOCK
select GENERIC_SMP_IDLE_THREAD
+ select GENERIC_TIME_VSYSCALL
+ select GENERIC_VDSO_32
+ select GENERIC_GETTIMEOFDAY
select GX6605S_TIMER if CPU_CK610
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_AUDITSYSCALL
@@ -43,11 +46,14 @@ config CSKY
select HAVE_CONTEXT_TRACKING
select HAVE_VIRT_CPU_ACCOUNTING_GEN
select HAVE_DEBUG_BUGVERBOSE
+ select HAVE_DEBUG_KMEMLEAK
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
+ select HAVE_GENERIC_VDSO
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_ERROR_INJECTION
+ select HAVE_FUTEX_CMPXCHG if FUTEX && SMP
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZO
@@ -192,6 +198,22 @@ config CPU_CK860
endchoice
choice
+ prompt "PAGE OFFSET"
+ default PAGE_OFFSET_80000000
+
+config PAGE_OFFSET_80000000
+ bool "PAGE OFFSET 2G (user:kernel = 2:2)"
+
+config PAGE_OFFSET_A0000000
+ bool "PAGE OFFSET 2.5G (user:kernel = 2.5:1.5)"
+endchoice
+
+config PAGE_OFFSET
+ hex
+ default 0x80000000 if PAGE_OFFSET_80000000
+ default 0xa0000000 if PAGE_OFFSET_A0000000
+choice
+
prompt "C-SKY PMU type"
depends on PERF_EVENTS
depends on CPU_CK807 || CPU_CK810 || CPU_CK860
diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h
index d3e04208d53c..6cab7afae962 100644
--- a/arch/csky/abiv1/inc/abi/cacheflush.h
+++ b/arch/csky/abiv1/inc/abi/cacheflush.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ABI_CSKY_CACHEFLUSH_H
#define __ABI_CSKY_CACHEFLUSH_H
diff --git a/arch/csky/abiv1/inc/abi/ckmmu.h b/arch/csky/abiv1/inc/abi/ckmmu.h
index ba8eb5870835..416b30c57983 100644
--- a/arch/csky/abiv1/inc/abi/ckmmu.h
+++ b/arch/csky/abiv1/inc/abi/ckmmu.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_CKMMUV1_H
#define __ASM_CSKY_CKMMUV1_H
@@ -89,13 +88,14 @@ static inline void tlb_invalid_indexed(void)
cpwcr("cpcr8", 0x02000000);
}
-static inline void setup_pgd(unsigned long pgd, bool kernel)
+static inline void setup_pgd(pgd_t *pgd, int asid)
{
- cpwcr("cpcr29", pgd | BIT(0));
+ cpwcr("cpcr29", __pa(pgd) | BIT(0));
+ write_mmu_entryhi(asid);
}
-static inline unsigned long get_pgd(void)
+static inline pgd_t *get_pgd(void)
{
- return cprcr("cpcr29") & ~BIT(0);
+ return __va(cprcr("cpcr29") & ~BIT(0));
}
#endif /* __ASM_CSKY_CKMMUV1_H */
diff --git a/arch/csky/abiv1/inc/abi/entry.h b/arch/csky/abiv1/inc/abi/entry.h
index 13c23e2c707c..b6a2109b895e 100644
--- a/arch/csky/abiv1/inc/abi/entry.h
+++ b/arch/csky/abiv1/inc/abi/entry.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_ENTRY_H
#define __ASM_CSKY_ENTRY_H
diff --git a/arch/csky/abiv1/inc/abi/page.h b/arch/csky/abiv1/inc/abi/page.h
index c864519117c7..2d2159933b76 100644
--- a/arch/csky/abiv1/inc/abi/page.h
+++ b/arch/csky/abiv1/inc/abi/page.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#include <asm/shmparam.h>
diff --git a/arch/csky/abiv1/inc/abi/pgtable-bits.h b/arch/csky/abiv1/inc/abi/pgtable-bits.h
index d605445aad9a..752c8b3f9194 100644
--- a/arch/csky/abiv1/inc/abi/pgtable-bits.h
+++ b/arch/csky/abiv1/inc/abi/pgtable-bits.h
@@ -1,37 +1,49 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_PGTABLE_BITS_H
#define __ASM_CSKY_PGTABLE_BITS_H
/* implemented in software */
-#define _PAGE_ACCESSED (1<<3)
-#define PAGE_ACCESSED_BIT (3)
-
+#define _PAGE_PRESENT (1<<0)
#define _PAGE_READ (1<<1)
#define _PAGE_WRITE (1<<2)
-#define _PAGE_PRESENT (1<<0)
-
+#define _PAGE_ACCESSED (1<<3)
#define _PAGE_MODIFIED (1<<4)
-#define PAGE_MODIFIED_BIT (4)
/* implemented in hardware */
#define _PAGE_GLOBAL (1<<6)
-
#define _PAGE_VALID (1<<7)
-#define PAGE_VALID_BIT (7)
-
#define _PAGE_DIRTY (1<<8)
-#define PAGE_DIRTY_BIT (8)
#define _PAGE_CACHE (3<<9)
#define _PAGE_UNCACHE (2<<9)
#define _PAGE_SO _PAGE_UNCACHE
-
#define _CACHE_MASK (7<<9)
-#define _CACHE_CACHED (_PAGE_VALID | _PAGE_CACHE)
-#define _CACHE_UNCACHED (_PAGE_VALID | _PAGE_UNCACHE)
+#define _CACHE_CACHED _PAGE_CACHE
+#define _CACHE_UNCACHED _PAGE_UNCACHE
+
+#define _PAGE_PROT_NONE _PAGE_READ
+
+/*
+ * Encode and decode a swap entry
+ *
+ * Format of swap PTE:
+ * bit 0: _PAGE_PRESENT (zero)
+ * bit 1: _PAGE_READ (zero)
+ * bit 2 - 5: swap type[0 - 3]
+ * bit 6: _PAGE_GLOBAL (zero)
+ * bit 7: _PAGE_VALID (zero)
+ * bit 8: swap type[4]
+ * bit 9 - 31: swap offset
+ */
+#define __swp_type(x) ((((x).val >> 2) & 0xf) | \
+ (((x).val >> 4) & 0x10))
+#define __swp_offset(x) ((x).val >> 9)
+#define __swp_entry(type, offset) ((swp_entry_t) { \
+ ((type & 0xf) << 2) | \
+ ((type & 0x10) << 4) | \
+ ((offset) << 9)})
#define HAVE_ARCH_UNMAPPED_AREA
diff --git a/arch/csky/abiv1/inc/abi/reg_ops.h b/arch/csky/abiv1/inc/abi/reg_ops.h
index a153bd3918f7..abd01a243388 100644
--- a/arch/csky/abiv1/inc/abi/reg_ops.h
+++ b/arch/csky/abiv1/inc/abi/reg_ops.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ABI_REG_OPS_H
#define __ABI_REG_OPS_H
diff --git a/arch/csky/abiv1/inc/abi/regdef.h b/arch/csky/abiv1/inc/abi/regdef.h
index 104707fbdcc1..7b386fd67070 100644
--- a/arch/csky/abiv1/inc/abi/regdef.h
+++ b/arch/csky/abiv1/inc/abi/regdef.h
@@ -1,10 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_REGDEF_H
#define __ASM_CSKY_REGDEF_H
+#ifdef __ASSEMBLY__
#define syscallid r1
+#else
+#define syscallid "r1"
+#endif
+
#define regs_syscallid(regs) regs->regs[9]
#define regs_fp(regs) regs->regs[2]
diff --git a/arch/csky/abiv1/inc/abi/string.h b/arch/csky/abiv1/inc/abi/string.h
index 0cd43384f8d2..9d95594b0feb 100644
--- a/arch/csky/abiv1/inc/abi/string.h
+++ b/arch/csky/abiv1/inc/abi/string.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ABI_CSKY_STRING_H
#define __ABI_CSKY_STRING_H
diff --git a/arch/csky/abiv1/inc/abi/switch_context.h b/arch/csky/abiv1/inc/abi/switch_context.h
index 17c82686498e..ec73fd7c9f87 100644
--- a/arch/csky/abiv1/inc/abi/switch_context.h
+++ b/arch/csky/abiv1/inc/abi/switch_context.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ABI_CSKY_PTRACE_H
#define __ABI_CSKY_PTRACE_H
diff --git a/arch/csky/abiv1/inc/abi/vdso.h b/arch/csky/abiv1/inc/abi/vdso.h
index 14352f524f1d..9e6d0a2fdd2b 100644
--- a/arch/csky/abiv1/inc/abi/vdso.h
+++ b/arch/csky/abiv1/inc/abi/vdso.h
@@ -1,17 +1,9 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/uaccess.h>
+#ifndef __ABI_CSKY_VDSO_H
+#define __ABI_CSKY_VDSO_H
-static inline int setup_vdso_page(unsigned short *ptr)
-{
- int err = 0;
+/* movi r1, 127; addi r1, (139 - 127) */
+#define SET_SYSCALL_ID .long 0x20b167f1
- /* movi r1, 127 */
- err |= __put_user(0x67f1, ptr + 0);
- /* addi r1, (139 - 127) */
- err |= __put_user(0x20b1, ptr + 1);
- /* trap 0 */
- err |= __put_user(0x0008, ptr + 2);
-
- return err;
-}
+#endif /* __ABI_CSKY_VDSO_H */
diff --git a/arch/csky/abiv2/cacheflush.c b/arch/csky/abiv2/cacheflush.c
index 790f1ebfba44..39c51399dd81 100644
--- a/arch/csky/abiv2/cacheflush.c
+++ b/arch/csky/abiv2/cacheflush.c
@@ -12,6 +12,9 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
unsigned long addr;
struct page *page;
+ if (!pfn_valid(pte_pfn(*pte)))
+ return;
+
page = pfn_to_page(pte_pfn(*pte));
if (page == ZERO_PAGE(0))
return;
diff --git a/arch/csky/abiv2/inc/abi/ckmmu.h b/arch/csky/abiv2/inc/abi/ckmmu.h
index 73ded7c72482..64215f2380f1 100644
--- a/arch/csky/abiv2/inc/abi/ckmmu.h
+++ b/arch/csky/abiv2/inc/abi/ckmmu.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_CKMMUV2_H
#define __ASM_CSKY_CKMMUV2_H
@@ -78,8 +77,13 @@ static inline void tlb_read(void)
static inline void tlb_invalid_all(void)
{
#ifdef CONFIG_CPU_HAS_TLBI
- asm volatile("tlbi.alls\n":::"memory");
sync_is();
+ asm volatile(
+ "tlbi.alls \n"
+ "sync.i \n"
+ :
+ :
+ : "memory");
#else
mtcr("cr<8, 15>", 0x04000000);
#endif
@@ -88,8 +92,13 @@ static inline void tlb_invalid_all(void)
static inline void local_tlb_invalid_all(void)
{
#ifdef CONFIG_CPU_HAS_TLBI
- asm volatile("tlbi.all\n":::"memory");
sync_is();
+ asm volatile(
+ "tlbi.all \n"
+ "sync.i \n"
+ :
+ :
+ : "memory");
#else
tlb_invalid_all();
#endif
@@ -100,16 +109,31 @@ static inline void tlb_invalid_indexed(void)
mtcr("cr<8, 15>", 0x02000000);
}
-static inline void setup_pgd(unsigned long pgd, bool kernel)
+#define NOP32 ".long 0x4820c400\n"
+
+static inline void setup_pgd(pgd_t *pgd, int asid)
{
- if (kernel)
- mtcr("cr<28, 15>", pgd | BIT(0));
- else
- mtcr("cr<29, 15>", pgd | BIT(0));
+#ifdef CONFIG_CPU_HAS_TLBI
+ sync_is();
+#else
+ mb();
+#endif
+ asm volatile(
+#ifdef CONFIG_CPU_HAS_TLBI
+ "mtcr %1, cr<28, 15> \n"
+#endif
+ "mtcr %1, cr<29, 15> \n"
+ "mtcr %0, cr< 4, 15> \n"
+ ".rept 64 \n"
+ NOP32
+ ".endr \n"
+ :
+ :"r"(asid), "r"(__pa(pgd) | BIT(0))
+ :"memory");
}
-static inline unsigned long get_pgd(void)
+static inline pgd_t *get_pgd(void)
{
- return mfcr("cr<29, 15>") & ~BIT(0);
+ return __va(mfcr("cr<29, 15>") & ~BIT(0));
}
#endif /* __ASM_CSKY_CKMMUV2_H */
diff --git a/arch/csky/abiv2/inc/abi/entry.h b/arch/csky/abiv2/inc/abi/entry.h
index bedcc6f06bba..cca63e699b58 100644
--- a/arch/csky/abiv2/inc/abi/entry.h
+++ b/arch/csky/abiv2/inc/abi/entry.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_ENTRY_H
#define __ASM_CSKY_ENTRY_H
@@ -26,6 +25,9 @@
stw tls, (sp, 0)
stw lr, (sp, 4)
+ RD_MEH lr
+ WR_MEH lr
+
mfcr lr, epc
movi tls, \epc_inc
add lr, tls
@@ -231,6 +233,16 @@
mtcr \rx, cr<8, 15>
.endm
+#ifdef CONFIG_PAGE_OFFSET_80000000
+#define MSA_SET cr<30, 15>
+#define MSA_CLR cr<31, 15>
+#endif
+
+#ifdef CONFIG_PAGE_OFFSET_A0000000
+#define MSA_SET cr<31, 15>
+#define MSA_CLR cr<30, 15>
+#endif
+
.macro SETUP_MMU
/* Init psr and enable ee */
lrw r6, DEFAULT_PSR_VALUE
@@ -281,15 +293,15 @@
* 31 - 29 | 28 - 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0
* BA Reserved SH WA B SO SEC C D V
*/
- mfcr r6, cr<30, 15> /* Get MSA0 */
+ mfcr r6, MSA_SET /* Get MSA */
2:
lsri r6, 29
lsli r6, 29
addi r6, 0x1ce
- mtcr r6, cr<30, 15> /* Set MSA0 */
+ mtcr r6, MSA_SET /* Set MSA */
movi r6, 0
- mtcr r6, cr<31, 15> /* Clr MSA1 */
+ mtcr r6, MSA_CLR /* Clr MSA */
/* enable MMU */
mfcr r6, cr18
diff --git a/arch/csky/abiv2/inc/abi/fpu.h b/arch/csky/abiv2/inc/abi/fpu.h
index 09e2700a3693..aabb79355013 100644
--- a/arch/csky/abiv2/inc/abi/fpu.h
+++ b/arch/csky/abiv2/inc/abi/fpu.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_FPU_H
#define __ASM_CSKY_FPU_H
diff --git a/arch/csky/abiv2/inc/abi/page.h b/arch/csky/abiv2/inc/abi/page.h
index 0a70cb553dca..cf005f13cd15 100644
--- a/arch/csky/abiv2/inc/abi/page.h
+++ b/arch/csky/abiv2/inc/abi/page.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
static inline void clear_user_page(void *addr, unsigned long vaddr,
struct page *page)
diff --git a/arch/csky/abiv2/inc/abi/pgtable-bits.h b/arch/csky/abiv2/inc/abi/pgtable-bits.h
index 137f7932c83b..7e7f389f546f 100644
--- a/arch/csky/abiv2/inc/abi/pgtable-bits.h
+++ b/arch/csky/abiv2/inc/abi/pgtable-bits.h
@@ -1,37 +1,48 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_PGTABLE_BITS_H
#define __ASM_CSKY_PGTABLE_BITS_H
/* implemented in software */
#define _PAGE_ACCESSED (1<<7)
-#define PAGE_ACCESSED_BIT (7)
-
#define _PAGE_READ (1<<8)
#define _PAGE_WRITE (1<<9)
#define _PAGE_PRESENT (1<<10)
-
#define _PAGE_MODIFIED (1<<11)
-#define PAGE_MODIFIED_BIT (11)
/* implemented in hardware */
#define _PAGE_GLOBAL (1<<0)
-
#define _PAGE_VALID (1<<1)
-#define PAGE_VALID_BIT (1)
-
#define _PAGE_DIRTY (1<<2)
-#define PAGE_DIRTY_BIT (2)
#define _PAGE_SO (1<<5)
#define _PAGE_BUF (1<<6)
-
#define _PAGE_CACHE (1<<3)
-
#define _CACHE_MASK _PAGE_CACHE
-#define _CACHE_CACHED (_PAGE_VALID | _PAGE_CACHE | _PAGE_BUF)
-#define _CACHE_UNCACHED (_PAGE_VALID)
+#define _CACHE_CACHED (_PAGE_CACHE | _PAGE_BUF)
+#define _CACHE_UNCACHED (0)
+
+#define _PAGE_PROT_NONE _PAGE_WRITE
+
+/*
+ * Encode and decode a swap entry
+ *
+ * Format of swap PTE:
+ * bit 0: _PAGE_GLOBAL (zero)
+ * bit 1: _PAGE_VALID (zero)
+ * bit 2 - 6: swap type
+ * bit 7 - 8: swap offset[0 - 1]
+ * bit 9: _PAGE_WRITE (zero)
+ * bit 10: _PAGE_PRESENT (zero)
+ * bit 11 - 31: swap offset[2 - 22]
+ */
+#define __swp_type(x) (((x).val >> 2) & 0x1f)
+#define __swp_offset(x) ((((x).val >> 7) & 0x3) | \
+ (((x).val >> 9) & 0x7ffffc))
+#define __swp_entry(type, offset) ((swp_entry_t) { \
+ ((type & 0x1f) << 2) | \
+ ((offset & 0x3) << 7) | \
+ ((offset & 0x7ffffc) << 9)})
#endif /* __ASM_CSKY_PGTABLE_BITS_H */
diff --git a/arch/csky/abiv2/inc/abi/reg_ops.h b/arch/csky/abiv2/inc/abi/reg_ops.h
index ae82c3f26a6b..49ba18a64751 100644
--- a/arch/csky/abiv2/inc/abi/reg_ops.h
+++ b/arch/csky/abiv2/inc/abi/reg_ops.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ABI_REG_OPS_H
#define __ABI_REG_OPS_H
diff --git a/arch/csky/abiv2/inc/abi/regdef.h b/arch/csky/abiv2/inc/abi/regdef.h
index d7328bbc1ce7..0933addbc27b 100644
--- a/arch/csky/abiv2/inc/abi/regdef.h
+++ b/arch/csky/abiv2/inc/abi/regdef.h
@@ -1,10 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_REGDEF_H
#define __ASM_CSKY_REGDEF_H
+#ifdef __ASSEMBLY__
#define syscallid r7
+#else
+#define syscallid "r7"
+#endif
+
#define regs_syscallid(regs) regs->regs[3]
#define regs_fp(regs) regs->regs[4]
diff --git a/arch/csky/abiv2/inc/abi/switch_context.h b/arch/csky/abiv2/inc/abi/switch_context.h
index 73a81245a3b3..5dd5c3f4ee7e 100644
--- a/arch/csky/abiv2/inc/abi/switch_context.h
+++ b/arch/csky/abiv2/inc/abi/switch_context.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ABI_CSKY_PTRACE_H
#define __ABI_CSKY_PTRACE_H
diff --git a/arch/csky/abiv2/inc/abi/vdso.h b/arch/csky/abiv2/inc/abi/vdso.h
index b60d4a070326..40fd10d893ff 100644
--- a/arch/csky/abiv2/inc/abi/vdso.h
+++ b/arch/csky/abiv2/inc/abi/vdso.h
@@ -3,21 +3,7 @@
#ifndef __ABI_CSKY_VDSO_H
#define __ABI_CSKY_VDSO_H
-#include <linux/uaccess.h>
+/* movi r7, 173 */
+#define SET_SYSCALL_ID .long 0x008bea07
-static inline int setup_vdso_page(unsigned short *ptr)
-{
- int err = 0;
-
- /* movi r7, 173 */
- err |= __put_user(0xea07, ptr);
- err |= __put_user(0x008b, ptr+1);
-
- /* trap 0 */
- err |= __put_user(0xc000, ptr+2);
- err |= __put_user(0x2020, ptr+3);
-
- return err;
-}
-
-#endif /* __ABI_CSKY_STRING_H */
+#endif /* __ABI_CSKY_VDSO_H */
diff --git a/arch/csky/abiv2/sysdep.h b/arch/csky/abiv2/sysdep.h
index bbbedfd34777..61abe9201c50 100644
--- a/arch/csky/abiv2/sysdep.h
+++ b/arch/csky/abiv2/sysdep.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __SYSDEP_H
#define __SYSDEP_H
diff --git a/arch/csky/include/asm/addrspace.h b/arch/csky/include/asm/addrspace.h
index d1c2ede692ed..6fc05d44536c 100644
--- a/arch/csky/include/asm/addrspace.h
+++ b/arch/csky/include/asm/addrspace.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_ADDRSPACE_H
#define __ASM_CSKY_ADDRSPACE_H
diff --git a/arch/csky/include/asm/atomic.h b/arch/csky/include/asm/atomic.h
deleted file mode 100644
index e369d73b13e3..000000000000
--- a/arch/csky/include/asm/atomic.h
+++ /dev/null
@@ -1,212 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef __ASM_CSKY_ATOMIC_H
-#define __ASM_CSKY_ATOMIC_H
-
-#include <linux/version.h>
-#include <asm/cmpxchg.h>
-#include <asm/barrier.h>
-
-#ifdef CONFIG_CPU_HAS_LDSTEX
-
-#define __atomic_add_unless __atomic_add_unless
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
-{
- unsigned long tmp, ret;
-
- smp_mb();
-
- asm volatile (
- "1: ldex.w %0, (%3) \n"
- " mov %1, %0 \n"
- " cmpne %0, %4 \n"
- " bf 2f \n"
- " add %0, %2 \n"
- " stex.w %0, (%3) \n"
- " bez %0, 1b \n"
- "2: \n"
- : "=&r" (tmp), "=&r" (ret)
- : "r" (a), "r"(&v->counter), "r"(u)
- : "memory");
-
- if (ret != u)
- smp_mb();
-
- return ret;
-}
-
-#define ATOMIC_OP(op, c_op) \
-static inline void atomic_##op(int i, atomic_t *v) \
-{ \
- unsigned long tmp; \
- \
- asm volatile ( \
- "1: ldex.w %0, (%2) \n" \
- " " #op " %0, %1 \n" \
- " stex.w %0, (%2) \n" \
- " bez %0, 1b \n" \
- : "=&r" (tmp) \
- : "r" (i), "r"(&v->counter) \
- : "memory"); \
-}
-
-#define ATOMIC_OP_RETURN(op, c_op) \
-static inline int atomic_##op##_return(int i, atomic_t *v) \
-{ \
- unsigned long tmp, ret; \
- \
- smp_mb(); \
- asm volatile ( \
- "1: ldex.w %0, (%3) \n" \
- " " #op " %0, %2 \n" \
- " mov %1, %0 \n" \
- " stex.w %0, (%3) \n" \
- " bez %0, 1b \n" \
- : "=&r" (tmp), "=&r" (ret) \
- : "r" (i), "r"(&v->counter) \
- : "memory"); \
- smp_mb(); \
- \
- return ret; \
-}
-
-#define ATOMIC_FETCH_OP(op, c_op) \
-static inline int atomic_fetch_##op(int i, atomic_t *v) \
-{ \
- unsigned long tmp, ret; \
- \
- smp_mb(); \
- asm volatile ( \
- "1: ldex.w %0, (%3) \n" \
- " mov %1, %0 \n" \
- " " #op " %0, %2 \n" \
- " stex.w %0, (%3) \n" \
- " bez %0, 1b \n" \
- : "=&r" (tmp), "=&r" (ret) \
- : "r" (i), "r"(&v->counter) \
- : "memory"); \
- smp_mb(); \
- \
- return ret; \
-}
-
-#else /* CONFIG_CPU_HAS_LDSTEX */
-
-#include <linux/irqflags.h>
-
-#define __atomic_add_unless __atomic_add_unless
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
-{
- unsigned long tmp, ret, flags;
-
- raw_local_irq_save(flags);
-
- asm volatile (
- " ldw %0, (%3) \n"
- " mov %1, %0 \n"
- " cmpne %0, %4 \n"
- " bf 2f \n"
- " add %0, %2 \n"
- " stw %0, (%3) \n"
- "2: \n"
- : "=&r" (tmp), "=&r" (ret)
- : "r" (a), "r"(&v->counter), "r"(u)
- : "memory");
-
- raw_local_irq_restore(flags);
-
- return ret;
-}
-
-#define ATOMIC_OP(op, c_op) \
-static inline void atomic_##op(int i, atomic_t *v) \
-{ \
- unsigned long tmp, flags; \
- \
- raw_local_irq_save(flags); \
- \
- asm volatile ( \
- " ldw %0, (%2) \n" \
- " " #op " %0, %1 \n" \
- " stw %0, (%2) \n" \
- : "=&r" (tmp) \
- : "r" (i), "r"(&v->counter) \
- : "memory"); \
- \
- raw_local_irq_restore(flags); \
-}
-
-#define ATOMIC_OP_RETURN(op, c_op) \
-static inline int atomic_##op##_return(int i, atomic_t *v) \
-{ \
- unsigned long tmp, ret, flags; \
- \
- raw_local_irq_save(flags); \
- \
- asm volatile ( \
- " ldw %0, (%3) \n" \
- " " #op " %0, %2 \n" \
- " stw %0, (%3) \n" \
- " mov %1, %0 \n" \
- : "=&r" (tmp), "=&r" (ret) \
- : "r" (i), "r"(&v->counter) \
- : "memory"); \
- \
- raw_local_irq_restore(flags); \
- \
- return ret; \
-}
-
-#define ATOMIC_FETCH_OP(op, c_op) \
-static inline int atomic_fetch_##op(int i, atomic_t *v) \
-{ \
- unsigned long tmp, ret, flags; \
- \
- raw_local_irq_save(flags); \
- \
- asm volatile ( \
- " ldw %0, (%3) \n" \
- " mov %1, %0 \n" \
- " " #op " %0, %2 \n" \
- " stw %0, (%3) \n" \
- : "=&r" (tmp), "=&r" (ret) \
- : "r" (i), "r"(&v->counter) \
- : "memory"); \
- \
- raw_local_irq_restore(flags); \
- \
- return ret; \
-}
-
-#endif /* CONFIG_CPU_HAS_LDSTEX */
-
-#define atomic_add_return atomic_add_return
-ATOMIC_OP_RETURN(add, +)
-#define atomic_sub_return atomic_sub_return
-ATOMIC_OP_RETURN(sub, -)
-
-#define atomic_fetch_add atomic_fetch_add
-ATOMIC_FETCH_OP(add, +)
-#define atomic_fetch_sub atomic_fetch_sub
-ATOMIC_FETCH_OP(sub, -)
-#define atomic_fetch_and atomic_fetch_and
-ATOMIC_FETCH_OP(and, &)
-#define atomic_fetch_or atomic_fetch_or
-ATOMIC_FETCH_OP(or, |)
-#define atomic_fetch_xor atomic_fetch_xor
-ATOMIC_FETCH_OP(xor, ^)
-
-#define atomic_and atomic_and
-ATOMIC_OP(and, &)
-#define atomic_or atomic_or
-ATOMIC_OP(or, |)
-#define atomic_xor atomic_xor
-ATOMIC_OP(xor, ^)
-
-#undef ATOMIC_FETCH_OP
-#undef ATOMIC_OP_RETURN
-#undef ATOMIC_OP
-
-#include <asm-generic/atomic.h>
-
-#endif /* __ASM_CSKY_ATOMIC_H */
diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h
index a430e7fddf35..84fc600c8b45 100644
--- a/arch/csky/include/asm/barrier.h
+++ b/arch/csky/include/asm/barrier.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_BARRIER_H
#define __ASM_CSKY_BARRIER_H
@@ -8,6 +7,61 @@
#define nop() asm volatile ("nop\n":::"memory")
+#ifdef CONFIG_SMP
+
+/*
+ * bar.brwarws: ordering barrier for all load/store instructions
+ * before/after
+ *
+ * |31|30 26|25 21|20 16|15 10|9 5|4 0|
+ * 1 10000 00000 00000 100001 00001 0 bw br aw ar
+ *
+ * b: before
+ * a: after
+ * r: read
+ * w: write
+ *
+ * Here are all combinations:
+ *
+ * bar.brw
+ * bar.br
+ * bar.bw
+ * bar.arw
+ * bar.ar
+ * bar.aw
+ * bar.brwarw
+ * bar.brarw
+ * bar.bwarw
+ * bar.brwar
+ * bar.brwaw
+ * bar.brar
+ * bar.bwaw
+ */
+#define __bar_brw() asm volatile (".long 0x842cc000\n":::"memory")
+#define __bar_br() asm volatile (".long 0x8424c000\n":::"memory")
+#define __bar_bw() asm volatile (".long 0x8428c000\n":::"memory")
+#define __bar_arw() asm volatile (".long 0x8423c000\n":::"memory")
+#define __bar_ar() asm volatile (".long 0x8421c000\n":::"memory")
+#define __bar_aw() asm volatile (".long 0x8422c000\n":::"memory")
+#define __bar_brwarw() asm volatile (".long 0x842fc000\n":::"memory")
+#define __bar_brarw() asm volatile (".long 0x8427c000\n":::"memory")
+#define __bar_bwarw() asm volatile (".long 0x842bc000\n":::"memory")
+#define __bar_brwar() asm volatile (".long 0x842dc000\n":::"memory")
+#define __bar_brwaw() asm volatile (".long 0x842ec000\n":::"memory")
+#define __bar_brar() asm volatile (".long 0x8425c000\n":::"memory")
+#define __bar_brar() asm volatile (".long 0x8425c000\n":::"memory")
+#define __bar_bwaw() asm volatile (".long 0x842ac000\n":::"memory")
+
+#define __smp_mb() __bar_brwarw()
+#define __smp_rmb() __bar_brar()
+#define __smp_wmb() __bar_bwaw()
+
+#define ACQUIRE_FENCE ".long 0x8427c000\n"
+#define __smp_acquire_fence() __bar_brarw()
+#define __smp_release_fence() __bar_brwaw()
+
+#endif /* CONFIG_SMP */
+
/*
* sync: completion barrier, all sync.xx instructions
* guarantee the last response recieved by bus transaction
@@ -15,31 +69,14 @@
* sync.s: inherit from sync, but also shareable to other cores
* sync.i: inherit from sync, but also flush cpu pipeline
* sync.is: the same with sync.i + sync.s
- *
- * bar.brwarw: ordering barrier for all load/store instructions before it
- * bar.brwarws: ordering barrier for all load/store instructions before it
- * and shareable to other cores
- * bar.brar: ordering barrier for all load instructions before it
- * bar.brars: ordering barrier for all load instructions before it
- * and shareable to other cores
- * bar.bwaw: ordering barrier for all store instructions before it
- * bar.bwaws: ordering barrier for all store instructions before it
- * and shareable to other cores
*/
+#define mb() asm volatile ("sync\n":::"memory")
#ifdef CONFIG_CPU_HAS_CACHEV2
-#define mb() asm volatile ("sync.s\n":::"memory")
-
-#ifdef CONFIG_SMP
-#define __smp_mb() asm volatile ("bar.brwarws\n":::"memory")
-#define __smp_rmb() asm volatile ("bar.brars\n":::"memory")
-#define __smp_wmb() asm volatile ("bar.bwaws\n":::"memory")
-#endif /* CONFIG_SMP */
-
-#define sync_is() asm volatile ("sync.is\n":::"memory")
-
-#else /* !CONFIG_CPU_HAS_CACHEV2 */
-#define mb() asm volatile ("sync\n":::"memory")
+/*
+ * Using three sync.is to prevent speculative PTW
+ */
+#define sync_is() asm volatile ("sync.is\nsync.is\nsync.is\n":::"memory")
#endif
#include <asm-generic/barrier.h>
diff --git a/arch/csky/include/asm/bitops.h b/arch/csky/include/asm/bitops.h
index 43b9838bff63..91818787d860 100644
--- a/arch/csky/include/asm/bitops.h
+++ b/arch/csky/include/asm/bitops.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_BITOPS_H
#define __ASM_CSKY_BITOPS_H
diff --git a/arch/csky/include/asm/bug.h b/arch/csky/include/asm/bug.h
index 33ebd16b9c78..03f1a5f9184a 100644
--- a/arch/csky/include/asm/bug.h
+++ b/arch/csky/include/asm/bug.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_BUG_H
#define __ASM_CSKY_BUG_H
@@ -21,6 +20,8 @@ do { \
struct pt_regs;
void die(struct pt_regs *regs, const char *str);
+void do_trap(struct pt_regs *regs, int signo, int code, unsigned long addr);
+
void show_regs(struct pt_regs *regs);
void show_code(struct pt_regs *regs);
diff --git a/arch/csky/include/asm/cacheflush.h b/arch/csky/include/asm/cacheflush.h
index f0b8f25429a2..d0f9eafe8988 100644
--- a/arch/csky/include/asm/cacheflush.h
+++ b/arch/csky/include/asm/cacheflush.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_CACHEFLUSH_H
#define __ASM_CSKY_CACHEFLUSH_H
diff --git a/arch/csky/include/asm/checksum.h b/arch/csky/include/asm/checksum.h
index 7685824291b1..aa12ef4b9080 100644
--- a/arch/csky/include/asm/checksum.h
+++ b/arch/csky/include/asm/checksum.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_CHECKSUM_H
#define __ASM_CSKY_CHECKSUM_H
diff --git a/arch/csky/include/asm/clocksource.h b/arch/csky/include/asm/clocksource.h
new file mode 100644
index 000000000000..54da0e49efa1
--- /dev/null
+++ b/arch/csky/include/asm/clocksource.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_VDSO_CSKY_CLOCKSOURCE_H
+#define __ASM_VDSO_CSKY_CLOCKSOURCE_H
+
+#include <asm/vdso/clocksource.h>
+
+#endif
diff --git a/arch/csky/include/asm/cmpxchg.h b/arch/csky/include/asm/cmpxchg.h
index 89224530a0ee..dabc8e46ce7b 100644
--- a/arch/csky/include/asm/cmpxchg.h
+++ b/arch/csky/include/asm/cmpxchg.h
@@ -3,12 +3,12 @@
#ifndef __ASM_CSKY_CMPXCHG_H
#define __ASM_CSKY_CMPXCHG_H
-#ifdef CONFIG_CPU_HAS_LDSTEX
+#ifdef CONFIG_SMP
#include <asm/barrier.h>
extern void __bad_xchg(void);
-#define __xchg(new, ptr, size) \
+#define __xchg_relaxed(new, ptr, size) \
({ \
__typeof__(ptr) __ptr = (ptr); \
__typeof__(new) __new = (new); \
@@ -16,7 +16,6 @@ extern void __bad_xchg(void);
unsigned long tmp; \
switch (size) { \
case 4: \
- smp_mb(); \
asm volatile ( \
"1: ldex.w %0, (%3) \n" \
" mov %1, %2 \n" \
@@ -25,7 +24,6 @@ extern void __bad_xchg(void);
: "=&r" (__ret), "=&r" (tmp) \
: "r" (__new), "r"(__ptr) \
:); \
- smp_mb(); \
break; \
default: \
__bad_xchg(); \
@@ -33,9 +31,10 @@ extern void __bad_xchg(void);
__ret; \
})
-#define xchg(ptr, x) (__xchg((x), (ptr), sizeof(*(ptr))))
+#define xchg_relaxed(ptr, x) \
+ (__xchg_relaxed((x), (ptr), sizeof(*(ptr))))
-#define __cmpxchg(ptr, old, new, size) \
+#define __cmpxchg_relaxed(ptr, old, new, size) \
({ \
__typeof__(ptr) __ptr = (ptr); \
__typeof__(new) __new = (new); \
@@ -44,7 +43,6 @@ extern void __bad_xchg(void);
__typeof__(*(ptr)) __ret; \
switch (size) { \
case 4: \
- smp_mb(); \
asm volatile ( \
"1: ldex.w %0, (%3) \n" \
" cmpne %0, %4 \n" \
@@ -56,7 +54,6 @@ extern void __bad_xchg(void);
: "=&r" (__ret), "=&r" (__tmp) \
: "r" (__new), "r"(__ptr), "r"(__old) \
:); \
- smp_mb(); \
break; \
default: \
__bad_xchg(); \
@@ -64,8 +61,18 @@ extern void __bad_xchg(void);
__ret; \
})
-#define cmpxchg(ptr, o, n) \
- (__cmpxchg((ptr), (o), (n), sizeof(*(ptr))))
+#define cmpxchg_relaxed(ptr, o, n) \
+ (__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
+
+#define cmpxchg(ptr, o, n) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ __smp_release_fence(); \
+ __ret = cmpxchg_relaxed(ptr, o, n); \
+ __smp_acquire_fence(); \
+ __ret; \
+})
+
#else
#include <asm-generic/cmpxchg.h>
#endif
diff --git a/arch/csky/include/asm/elf.h b/arch/csky/include/asm/elf.h
index eb2cc5a673b5..48b83e283ed4 100644
--- a/arch/csky/include/asm/elf.h
+++ b/arch/csky/include/asm/elf.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_ELF_H
#define __ASM_CSKY_ELF_H
diff --git a/arch/csky/include/asm/fixmap.h b/arch/csky/include/asm/fixmap.h
index 4b589cc20900..49a77cbbe2a9 100644
--- a/arch/csky/include/asm/fixmap.h
+++ b/arch/csky/include/asm/fixmap.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_FIXMAP_H
#define __ASM_CSKY_FIXMAP_H
diff --git a/arch/csky/include/asm/ftrace.h b/arch/csky/include/asm/ftrace.h
index fae72b0b1374..9b86341731b6 100644
--- a/arch/csky/include/asm/ftrace.h
+++ b/arch/csky/include/asm/ftrace.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_FTRACE_H
#define __ASM_CSKY_FTRACE_H
diff --git a/arch/csky/include/asm/futex.h b/arch/csky/include/asm/futex.h
new file mode 100644
index 000000000000..6cfd312723fa
--- /dev/null
+++ b/arch/csky/include/asm/futex.h
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_CSKY_FUTEX_H
+#define __ASM_CSKY_FUTEX_H
+
+#ifndef CONFIG_SMP
+#include <asm-generic/futex.h>
+#else
+#include <linux/atomic.h>
+#include <linux/futex.h>
+#include <linux/uaccess.h>
+#include <linux/errno.h>
+
+#define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \
+{ \
+ u32 tmp; \
+ \
+ __atomic_pre_full_fence(); \
+ \
+ __asm__ __volatile__ ( \
+ "1: ldex.w %[ov], %[u] \n" \
+ " "insn" \n" \
+ "2: stex.w %[t], %[u] \n" \
+ " bez %[t], 1b \n" \
+ " br 4f \n" \
+ "3: mov %[r], %[e] \n" \
+ "4: \n" \
+ " .section __ex_table,\"a\" \n" \
+ " .balign 4 \n" \
+ " .long 1b, 3b \n" \
+ " .long 2b, 3b \n" \
+ " .previous \n" \
+ : [r] "+r" (ret), [ov] "=&r" (oldval), \
+ [u] "+m" (*uaddr), [t] "=&r" (tmp) \
+ : [op] "Jr" (oparg), [e] "jr" (-EFAULT) \
+ : "memory"); \
+ \
+ __atomic_post_full_fence(); \
+}
+
+static inline int
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
+{
+ int oldval = 0, ret = 0;
+
+ if (!access_ok(uaddr, sizeof(u32)))
+ return -EFAULT;
+
+ switch (op) {
+ case FUTEX_OP_SET:
+ __futex_atomic_op("mov %[t], %[ov]",
+ ret, oldval, uaddr, oparg);
+ break;
+ case FUTEX_OP_ADD:
+ __futex_atomic_op("add %[t], %[ov], %[op]",
+ ret, oldval, uaddr, oparg);
+ break;
+ case FUTEX_OP_OR:
+ __futex_atomic_op("or %[t], %[ov], %[op]",
+ ret, oldval, uaddr, oparg);
+ break;
+ case FUTEX_OP_ANDN:
+ __futex_atomic_op("and %[t], %[ov], %[op]",
+ ret, oldval, uaddr, ~oparg);
+ break;
+ case FUTEX_OP_XOR:
+ __futex_atomic_op("xor %[t], %[ov], %[op]",
+ ret, oldval, uaddr, oparg);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ if (!ret)
+ *oval = oldval;
+
+ return ret;
+}
+
+
+
+static inline int
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+ u32 oldval, u32 newval)
+{
+ int ret = 0;
+ u32 val, tmp;
+
+ if (!access_ok(uaddr, sizeof(u32)))
+ return -EFAULT;
+
+ __atomic_pre_full_fence();
+
+ __asm__ __volatile__ (
+ "1: ldex.w %[v], %[u] \n"
+ " cmpne %[v], %[ov] \n"
+ " bt 4f \n"
+ " mov %[t], %[nv] \n"
+ "2: stex.w %[t], %[u] \n"
+ " bez %[t], 1b \n"
+ " br 4f \n"
+ "3: mov %[r], %[e] \n"
+ "4: \n"
+ " .section __ex_table,\"a\" \n"
+ " .balign 4 \n"
+ " .long 1b, 3b \n"
+ " .long 2b, 3b \n"
+ " .previous \n"
+ : [r] "+r" (ret), [v] "=&r" (val), [u] "+m" (*uaddr),
+ [t] "=&r" (tmp)
+ : [ov] "Jr" (oldval), [nv] "Jr" (newval), [e] "Jr" (-EFAULT)
+ : "memory");
+
+ __atomic_post_full_fence();
+
+ *uval = val;
+ return ret;
+}
+
+#endif /* CONFIG_SMP */
+#endif /* __ASM_CSKY_FUTEX_H */
diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h
index 1f4ed3f4c0d9..1ed810effb3d 100644
--- a/arch/csky/include/asm/highmem.h
+++ b/arch/csky/include/asm/highmem.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_HIGHMEM_H
#define __ASM_CSKY_HIGHMEM_H
diff --git a/arch/csky/include/asm/io.h b/arch/csky/include/asm/io.h
index e909587f24c5..f82654053dc0 100644
--- a/arch/csky/include/asm/io.h
+++ b/arch/csky/include/asm/io.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_IO_H
#define __ASM_CSKY_IO_H
diff --git a/arch/csky/include/asm/memory.h b/arch/csky/include/asm/memory.h
index a65c6759f537..d12179801ae3 100644
--- a/arch/csky/include/asm/memory.h
+++ b/arch/csky/include/asm/memory.h
@@ -10,7 +10,7 @@
#define FIXADDR_TOP _AC(0xffffc000, UL)
#define PKMAP_BASE _AC(0xff800000, UL)
-#define VMALLOC_START _AC(0xc0008000, UL)
+#define VMALLOC_START (PAGE_OFFSET + LOWMEM_LIMIT + (PAGE_SIZE * 8))
#define VMALLOC_END (PKMAP_BASE - (PAGE_SIZE * 2))
#ifdef CONFIG_HAVE_TCM
diff --git a/arch/csky/include/asm/mmu.h b/arch/csky/include/asm/mmu.h
index 26fbb1d15df0..d78321901d06 100644
--- a/arch/csky/include/asm/mmu.h
+++ b/arch/csky/include/asm/mmu.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_MMU_H
#define __ASM_CSKY_MMU_H
diff --git a/arch/csky/include/asm/mmu_context.h b/arch/csky/include/asm/mmu_context.h
index b227d29393a8..95d99b30792c 100644
--- a/arch/csky/include/asm/mmu_context.h
+++ b/arch/csky/include/asm/mmu_context.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_MMU_CONTEXT_H
#define __ASM_CSKY_MMU_CONTEXT_H
@@ -14,12 +13,6 @@
#include <linux/sched.h>
#include <abi/ckmmu.h>
-#define TLBMISS_HANDLER_SETUP_PGD(pgd) \
- setup_pgd(__pa(pgd), false)
-
-#define TLBMISS_HANDLER_SETUP_PGD_KERNEL(pgd) \
- setup_pgd(__pa(pgd), true)
-
#define ASID_MASK ((1 << CONFIG_CPU_ASID_BITS) - 1)
#define cpu_asid(mm) (atomic64_read(&mm->context.asid) & ASID_MASK)
@@ -36,8 +29,7 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
if (prev != next)
check_and_switch_context(next, cpu);
- TLBMISS_HANDLER_SETUP_PGD(next->pgd);
- write_mmu_entryhi(next->context.asid.counter);
+ setup_pgd(next->pgd, next->context.asid.counter);
flush_icache_deferred(next);
}
diff --git a/arch/csky/include/asm/page.h b/arch/csky/include/asm/page.h
index 9b98bf31d57c..3b91fc3cf36f 100644
--- a/arch/csky/include/asm/page.h
+++ b/arch/csky/include/asm/page.h
@@ -24,7 +24,7 @@
* address region. We use them mapping kernel 1GB direct-map address area and
* for more than 1GB of memory we use highmem.
*/
-#define PAGE_OFFSET 0x80000000
+#define PAGE_OFFSET CONFIG_PAGE_OFFSET
#define SSEG_SIZE 0x20000000
#define LOWMEM_LIMIT (SSEG_SIZE * 2)
diff --git a/arch/csky/include/asm/perf_event.h b/arch/csky/include/asm/perf_event.h
index 572093e11001..249905d8a4e8 100644
--- a/arch/csky/include/asm/perf_event.h
+++ b/arch/csky/include/asm/perf_event.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_PERF_EVENT_H
#define __ASM_CSKY_PERF_EVENT_H
diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h
index d58d8146b729..cd211aabbefd 100644
--- a/arch/csky/include/asm/pgalloc.h
+++ b/arch/csky/include/asm/pgalloc.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_PGALLOC_H
#define __ASM_CSKY_PGALLOC_H
@@ -71,7 +70,7 @@ do { \
} while (0)
extern void pagetable_init(void);
-extern void pre_mmu_init(void);
+extern void mmu_init(unsigned long min_pfn, unsigned long max_pfn);
extern void pre_trap_init(void);
#endif /* __ASM_CSKY_PGALLOC_H */
diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h
index 2002cb7f1053..0d60367b6bfa 100644
--- a/arch/csky/include/asm/pgtable.h
+++ b/arch/csky/include/asm/pgtable.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_PGTABLE_H
#define __ASM_CSKY_PGTABLE_H
@@ -14,7 +13,7 @@
#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE-1))
-#define USER_PTRS_PER_PGD (0x80000000UL/PGDIR_SIZE)
+#define USER_PTRS_PER_PGD (PAGE_OFFSET/PGDIR_SIZE)
#define FIRST_USER_ADDRESS 0UL
/*
@@ -34,23 +33,13 @@
#define pmd_page(pmd) (pfn_to_page(pmd_phys(pmd) >> PAGE_SHIFT))
#define pte_clear(mm, addr, ptep) set_pte((ptep), \
- (((unsigned int) addr & PAGE_OFFSET) ? __pte(_PAGE_GLOBAL) : __pte(0)))
+ (((unsigned int) addr >= PAGE_OFFSET) ? __pte(_PAGE_GLOBAL) : __pte(0)))
#define pte_none(pte) (!(pte_val(pte) & ~_PAGE_GLOBAL))
#define pte_present(pte) (pte_val(pte) & _PAGE_PRESENT)
#define pte_pfn(x) ((unsigned long)((x).pte_low >> PAGE_SHIFT))
#define pfn_pte(pfn, prot) __pte(((unsigned long long)(pfn) << PAGE_SHIFT) \
| pgprot_val(prot))
-#define __READABLE (_PAGE_READ | _PAGE_VALID | _PAGE_ACCESSED)
-#define __WRITEABLE (_PAGE_WRITE | _PAGE_DIRTY | _PAGE_MODIFIED)
-
-#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_MODIFIED | \
- _CACHE_MASK)
-
-#define __swp_type(x) (((x).val >> 4) & 0xff)
-#define __swp_offset(x) ((x).val >> 12)
-#define __swp_entry(type, offset) ((swp_entry_t) {((type) << 4) | \
- ((offset) << 12) })
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
@@ -59,41 +48,52 @@
pgprot_val(pgprot))
/*
- * CSKY can't do page protection for execute, and considers that the same like
- * read. Also, write permissions imply read permissions. This is the closest
- * we can get by reasonable means..
+ * C-SKY only has VALID and DIRTY bit in hardware. So we need to use the
+ * two bits emulate PRESENT, READ, WRITE, EXEC, MODIFIED, ACCESSED.
*/
-#define PAGE_NONE __pgprot(_PAGE_PRESENT | _CACHE_CACHED)
-#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \
+#define _PAGE_BASE (_PAGE_PRESENT | _PAGE_ACCESSED)
+
+#define PAGE_NONE __pgprot(_PAGE_PROT_NONE)
+#define PAGE_READ __pgprot(_PAGE_BASE | _PAGE_READ | \
_CACHE_CACHED)
-#define PAGE_COPY __pgprot(_PAGE_PRESENT | _PAGE_READ | _CACHE_CACHED)
-#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_READ | _CACHE_CACHED)
-#define PAGE_KERNEL __pgprot(_PAGE_PRESENT | __READABLE | __WRITEABLE | \
- _PAGE_GLOBAL | _CACHE_CACHED)
-#define PAGE_USERIO __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \
+#define PAGE_WRITE __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_WRITE | \
_CACHE_CACHED)
+#define PAGE_SHARED PAGE_WRITE
+
+#define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_VALID | \
+ _PAGE_WRITE | _PAGE_DIRTY | _PAGE_MODIFIED | \
+ _PAGE_GLOBAL | \
+ _CACHE_CACHED)
+
+#define _PAGE_IOREMAP (_PAGE_BASE | _PAGE_READ | _PAGE_VALID | \
+ _PAGE_WRITE | _PAGE_DIRTY | _PAGE_MODIFIED | \
+ _PAGE_GLOBAL | \
+ _CACHE_UNCACHED | _PAGE_SO)
+
+#define _PAGE_CHG_MASK (~(unsigned long) \
+ (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \
+ _CACHE_MASK | _PAGE_GLOBAL))
-#define _PAGE_IOREMAP \
- (_PAGE_PRESENT | __READABLE | __WRITEABLE | _PAGE_GLOBAL | \
- _CACHE_UNCACHED | _PAGE_SO)
+#define MAX_SWAPFILES_CHECK() \
+ BUILD_BUG_ON(MAX_SWAPFILES_SHIFT != 5)
#define __P000 PAGE_NONE
-#define __P001 PAGE_READONLY
-#define __P010 PAGE_COPY
-#define __P011 PAGE_COPY
-#define __P100 PAGE_READONLY
-#define __P101 PAGE_READONLY
-#define __P110 PAGE_COPY
-#define __P111 PAGE_COPY
+#define __P001 PAGE_READ
+#define __P010 PAGE_READ
+#define __P011 PAGE_READ
+#define __P100 PAGE_READ
+#define __P101 PAGE_READ
+#define __P110 PAGE_READ
+#define __P111 PAGE_READ
#define __S000 PAGE_NONE
-#define __S001 PAGE_READONLY
-#define __S010 PAGE_SHARED
-#define __S011 PAGE_SHARED
-#define __S100 PAGE_READONLY
-#define __S101 PAGE_READONLY
-#define __S110 PAGE_SHARED
-#define __S111 PAGE_SHARED
+#define __S001 PAGE_READ
+#define __S010 PAGE_WRITE
+#define __S011 PAGE_WRITE
+#define __S100 PAGE_READ
+#define __S101 PAGE_READ
+#define __S110 PAGE_WRITE
+#define __S111 PAGE_WRITE
extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
diff --git a/arch/csky/include/asm/processor.h b/arch/csky/include/asm/processor.h
index 4800f6563abb..9e933021fe8e 100644
--- a/arch/csky/include/asm/processor.h
+++ b/arch/csky/include/asm/processor.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_PROCESSOR_H
#define __ASM_CSKY_PROCESSOR_H
@@ -28,7 +27,7 @@ extern struct cpuinfo_csky cpu_data[];
* for a 64 bit kernel expandable to 8192EB, of which the current CSKY
* implementations will "only" be able to use 1TB ...
*/
-#define TASK_SIZE 0x7fff8000UL
+#define TASK_SIZE (PAGE_OFFSET - (PAGE_SIZE * 8))
#ifdef __KERNEL__
#define STACK_TOP TASK_SIZE
diff --git a/arch/csky/include/asm/ptrace.h b/arch/csky/include/asm/ptrace.h
index 91ceb1b454c9..4202aab6df42 100644
--- a/arch/csky/include/asm/ptrace.h
+++ b/arch/csky/include/asm/ptrace.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_PTRACE_H
#define __ASM_CSKY_PTRACE_H
diff --git a/arch/csky/include/asm/segment.h b/arch/csky/include/asm/segment.h
index 79ede9b1a646..589e8321dc14 100644
--- a/arch/csky/include/asm/segment.h
+++ b/arch/csky/include/asm/segment.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_SEGMENT_H
#define __ASM_CSKY_SEGMENT_H
@@ -10,7 +9,7 @@ typedef struct {
#define KERNEL_DS ((mm_segment_t) { 0xFFFFFFFF })
-#define USER_DS ((mm_segment_t) { 0x80000000UL })
+#define USER_DS ((mm_segment_t) { PAGE_OFFSET })
#define get_fs() (current_thread_info()->addr_limit)
#define set_fs(x) (current_thread_info()->addr_limit = (x))
#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg)
diff --git a/arch/csky/include/asm/shmparam.h b/arch/csky/include/asm/shmparam.h
index efafe4c79fed..2fe6cea0dae9 100644
--- a/arch/csky/include/asm/shmparam.h
+++ b/arch/csky/include/asm/shmparam.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_SHMPARAM_H
#define __ASM_CSKY_SHMPARAM_H
diff --git a/arch/csky/include/asm/spinlock.h b/arch/csky/include/asm/spinlock.h
index 7cf3f2b34cea..69f5aa249c5f 100644
--- a/arch/csky/include/asm/spinlock.h
+++ b/arch/csky/include/asm/spinlock.h
@@ -6,8 +6,6 @@
#include <linux/spinlock_types.h>
#include <asm/barrier.h>
-#ifdef CONFIG_QUEUED_RWLOCKS
-
/*
* Ticket-based spin-locking.
*/
@@ -88,169 +86,4 @@ static inline int arch_spin_is_contended(arch_spinlock_t *lock)
#include <asm/qrwlock.h>
-/* See include/linux/spinlock.h */
-#define smp_mb__after_spinlock() smp_mb()
-
-#else /* CONFIG_QUEUED_RWLOCKS */
-
-/*
- * Test-and-set spin-locking.
- */
-static inline void arch_spin_lock(arch_spinlock_t *lock)
-{
- u32 *p = &lock->lock;
- u32 tmp;
-
- asm volatile (
- "1: ldex.w %0, (%1) \n"
- " bnez %0, 1b \n"
- " movi %0, 1 \n"
- " stex.w %0, (%1) \n"
- " bez %0, 1b \n"
- : "=&r" (tmp)
- : "r"(p)
- : "cc");
- smp_mb();
-}
-
-static inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
- smp_mb();
- WRITE_ONCE(lock->lock, 0);
-}
-
-static inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
- u32 *p = &lock->lock;
- u32 tmp;
-
- asm volatile (
- "1: ldex.w %0, (%1) \n"
- " bnez %0, 2f \n"
- " movi %0, 1 \n"
- " stex.w %0, (%1) \n"
- " bez %0, 1b \n"
- " movi %0, 0 \n"
- "2: \n"
- : "=&r" (tmp)
- : "r"(p)
- : "cc");
-
- if (!tmp)
- smp_mb();
-
- return !tmp;
-}
-
-#define arch_spin_is_locked(x) (READ_ONCE((x)->lock) != 0)
-
-/*
- * read lock/unlock/trylock
- */
-static inline void arch_read_lock(arch_rwlock_t *lock)
-{
- u32 *p = &lock->lock;
- u32 tmp;
-
- asm volatile (
- "1: ldex.w %0, (%1) \n"
- " blz %0, 1b \n"
- " addi %0, 1 \n"
- " stex.w %0, (%1) \n"
- " bez %0, 1b \n"
- : "=&r" (tmp)
- : "r"(p)
- : "cc");
- smp_mb();
-}
-
-static inline void arch_read_unlock(arch_rwlock_t *lock)
-{
- u32 *p = &lock->lock;
- u32 tmp;
-
- smp_mb();
- asm volatile (
- "1: ldex.w %0, (%1) \n"
- " subi %0, 1 \n"
- " stex.w %0, (%1) \n"
- " bez %0, 1b \n"
- : "=&r" (tmp)
- : "r"(p)
- : "cc");
-}
-
-static inline int arch_read_trylock(arch_rwlock_t *lock)
-{
- u32 *p = &lock->lock;
- u32 tmp;
-
- asm volatile (
- "1: ldex.w %0, (%1) \n"
- " blz %0, 2f \n"
- " addi %0, 1 \n"
- " stex.w %0, (%1) \n"
- " bez %0, 1b \n"
- " movi %0, 0 \n"
- "2: \n"
- : "=&r" (tmp)
- : "r"(p)
- : "cc");
-
- if (!tmp)
- smp_mb();
-
- return !tmp;
-}
-
-/*
- * write lock/unlock/trylock
- */
-static inline void arch_write_lock(arch_rwlock_t *lock)
-{
- u32 *p = &lock->lock;
- u32 tmp;
-
- asm volatile (
- "1: ldex.w %0, (%1) \n"
- " bnez %0, 1b \n"
- " subi %0, 1 \n"
- " stex.w %0, (%1) \n"
- " bez %0, 1b \n"
- : "=&r" (tmp)
- : "r"(p)
- : "cc");
- smp_mb();
-}
-
-static inline void arch_write_unlock(arch_rwlock_t *lock)
-{
- smp_mb();
- WRITE_ONCE(lock->lock, 0);
-}
-
-static inline int arch_write_trylock(arch_rwlock_t *lock)
-{
- u32 *p = &lock->lock;
- u32 tmp;
-
- asm volatile (
- "1: ldex.w %0, (%1) \n"
- " bnez %0, 2f \n"
- " subi %0, 1 \n"
- " stex.w %0, (%1) \n"
- " bez %0, 1b \n"
- " movi %0, 0 \n"
- "2: \n"
- : "=&r" (tmp)
- : "r"(p)
- : "cc");
-
- if (!tmp)
- smp_mb();
-
- return !tmp;
-}
-
-#endif /* CONFIG_QUEUED_RWLOCKS */
#endif /* __ASM_CSKY_SPINLOCK_H */
diff --git a/arch/csky/include/asm/spinlock_types.h b/arch/csky/include/asm/spinlock_types.h
index 88b82438b182..8ff0f6ff3a00 100644
--- a/arch/csky/include/asm/spinlock_types.h
+++ b/arch/csky/include/asm/spinlock_types.h
@@ -22,16 +22,6 @@ typedef struct {
#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } }
-#ifdef CONFIG_QUEUED_RWLOCKS
#include <asm-generic/qrwlock_types.h>
-#else /* CONFIG_NR_CPUS > 2 */
-
-typedef struct {
- u32 lock;
-} arch_rwlock_t;
-
-#define __ARCH_RW_LOCK_UNLOCKED { 0 }
-
-#endif /* CONFIG_QUEUED_RWLOCKS */
#endif /* __ASM_CSKY_SPINLOCK_TYPES_H */
diff --git a/arch/csky/include/asm/string.h b/arch/csky/include/asm/string.h
index 73142de18355..a0d81e9d6b8f 100644
--- a/arch/csky/include/asm/string.h
+++ b/arch/csky/include/asm/string.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef _CSKY_STRING_MM_H_
#define _CSKY_STRING_MM_H_
diff --git a/arch/csky/include/asm/switch_to.h b/arch/csky/include/asm/switch_to.h
index 35a39e88933d..731e466415e2 100644
--- a/arch/csky/include/asm/switch_to.h
+++ b/arch/csky/include/asm/switch_to.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_SWITCH_TO_H
#define __ASM_CSKY_SWITCH_TO_H
diff --git a/arch/csky/include/asm/syscalls.h b/arch/csky/include/asm/syscalls.h
index 5d48e5e0082e..ea9ce6138b9b 100644
--- a/arch/csky/include/asm/syscalls.h
+++ b/arch/csky/include/asm/syscalls.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_SYSCALLS_H
#define __ASM_CSKY_SYSCALLS_H
diff --git a/arch/csky/include/asm/thread_info.h b/arch/csky/include/asm/thread_info.h
index 21456a3737c2..8c349a8f904d 100644
--- a/arch/csky/include/asm/thread_info.h
+++ b/arch/csky/include/asm/thread_info.h
@@ -1,12 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef _ASM_CSKY_THREAD_INFO_H
#define _ASM_CSKY_THREAD_INFO_H
#ifndef __ASSEMBLY__
-#include <linux/version.h>
#include <asm/types.h>
#include <asm/page.h>
#include <asm/processor.h>
diff --git a/arch/csky/include/asm/tlb.h b/arch/csky/include/asm/tlb.h
index fdff9b8d70c8..3498e65f59f8 100644
--- a/arch/csky/include/asm/tlb.h
+++ b/arch/csky/include/asm/tlb.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_TLB_H
#define __ASM_CSKY_TLB_H
diff --git a/arch/csky/include/asm/tlbflush.h b/arch/csky/include/asm/tlbflush.h
index 6845b0667703..407160b4fde7 100644
--- a/arch/csky/include/asm/tlbflush.h
+++ b/arch/csky/include/asm/tlbflush.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_TLBFLUSH_H
#define __ASM_TLBFLUSH_H
diff --git a/arch/csky/include/asm/traps.h b/arch/csky/include/asm/traps.h
index 1c081805b962..421a4195e2fe 100644
--- a/arch/csky/include/asm/traps.h
+++ b/arch/csky/include/asm/traps.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_TRAPS_H
#define __ASM_CSKY_TRAPS_H
diff --git a/arch/csky/include/asm/uaccess.h b/arch/csky/include/asm/uaccess.h
index 1633ffe5ae15..3dec272e1fa3 100644
--- a/arch/csky/include/asm/uaccess.h
+++ b/arch/csky/include/asm/uaccess.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_UACCESS_H
#define __ASM_CSKY_UACCESS_H
diff --git a/arch/csky/include/asm/unistd.h b/arch/csky/include/asm/unistd.h
index da7a18295615..9cf97de9a26d 100644
--- a/arch/csky/include/asm/unistd.h
+++ b/arch/csky/include/asm/unistd.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#include <uapi/asm/unistd.h>
diff --git a/arch/csky/include/asm/vdso.h b/arch/csky/include/asm/vdso.h
index d963d691f3a1..eb5142f9c564 100644
--- a/arch/csky/include/asm/vdso.h
+++ b/arch/csky/include/asm/vdso.h
@@ -3,10 +3,25 @@
#ifndef __ASM_CSKY_VDSO_H
#define __ASM_CSKY_VDSO_H
-#include <abi/vdso.h>
+#include <linux/types.h>
-struct csky_vdso {
- unsigned short rt_signal_retcode[4];
+#ifndef GENERIC_TIME_VSYSCALL
+struct vdso_data {
};
+#endif
+
+/*
+ * The VDSO symbols are mapped into Linux so we can just use regular symbol
+ * addressing to get their offsets in userspace. The symbols are mapped at an
+ * offset of 0, but since the linker must support setting weak undefined
+ * symbols to the absolute address 0 it also happens to support other low
+ * addresses even when the code model suggests those low addresses would not
+ * otherwise be availiable.
+ */
+#define VDSO_SYMBOL(base, name) \
+({ \
+ extern const char __vdso_##name[]; \
+ (void __user *)((unsigned long)(base) + __vdso_##name); \
+})
#endif /* __ASM_CSKY_VDSO_H */
diff --git a/arch/csky/include/asm/vdso/clocksource.h b/arch/csky/include/asm/vdso/clocksource.h
new file mode 100644
index 000000000000..dfca7b4724b7
--- /dev/null
+++ b/arch/csky/include/asm/vdso/clocksource.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_VDSO_CSKY_CLOCKSOURCE_H
+#define __ASM_VDSO_CSKY_CLOCKSOURCE_H
+
+#define VDSO_ARCH_CLOCKMODES \
+ VDSO_CLOCKMODE_ARCHTIMER
+
+#endif /* __ASM_VDSO_CSKY_CLOCKSOURCE_H */
diff --git a/arch/csky/include/asm/vdso/gettimeofday.h b/arch/csky/include/asm/vdso/gettimeofday.h
new file mode 100644
index 000000000000..6c4f1446944f
--- /dev/null
+++ b/arch/csky/include/asm/vdso/gettimeofday.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_VDSO_CSKY_GETTIMEOFDAY_H
+#define __ASM_VDSO_CSKY_GETTIMEOFDAY_H
+
+#ifndef __ASSEMBLY__
+
+#include <asm/barrier.h>
+#include <asm/unistd.h>
+#include <abi/regdef.h>
+#include <uapi/linux/time.h>
+
+#define VDSO_HAS_CLOCK_GETRES 1
+
+static __always_inline
+int gettimeofday_fallback(struct __kernel_old_timeval *_tv,
+ struct timezone *_tz)
+{
+ register struct __kernel_old_timeval *tv asm("a0") = _tv;
+ register struct timezone *tz asm("a1") = _tz;
+ register long ret asm("a0");
+ register long nr asm(syscallid) = __NR_gettimeofday;
+
+ asm volatile ("trap 0\n"
+ : "=r" (ret)
+ : "r"(tv), "r"(tz), "r"(nr)
+ : "memory");
+
+ return ret;
+}
+
+static __always_inline
+long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
+{
+ register clockid_t clkid asm("a0") = _clkid;
+ register struct __kernel_timespec *ts asm("a1") = _ts;
+ register long ret asm("a0");
+ register long nr asm(syscallid) = __NR_clock_gettime64;
+
+ asm volatile ("trap 0\n"
+ : "=r" (ret)
+ : "r"(clkid), "r"(ts), "r"(nr)
+ : "memory");
+
+ return ret;
+}
+
+static __always_inline
+long clock_gettime32_fallback(clockid_t _clkid, struct old_timespec32 *_ts)
+{
+ register clockid_t clkid asm("a0") = _clkid;
+ register struct old_timespec32 *ts asm("a1") = _ts;
+ register long ret asm("a0");
+ register long nr asm(syscallid) = __NR_clock_gettime;
+
+ asm volatile ("trap 0\n"
+ : "=r" (ret)
+ : "r"(clkid), "r"(ts), "r"(nr)
+ : "memory");
+
+ return ret;
+}
+
+static __always_inline
+int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
+{
+ register clockid_t clkid asm("a0") = _clkid;
+ register struct __kernel_timespec *ts asm("a1") = _ts;
+ register long ret asm("a0");
+ register long nr asm(syscallid) = __NR_clock_getres_time64;
+
+ asm volatile ("trap 0\n"
+ : "=r" (ret)
+ : "r"(clkid), "r"(ts), "r"(nr)
+ : "memory");
+
+ return ret;
+}
+
+static __always_inline
+int clock_getres32_fallback(clockid_t _clkid, struct old_timespec32 *_ts)
+{
+ register clockid_t clkid asm("a0") = _clkid;
+ register struct old_timespec32 *ts asm("a1") = _ts;
+ register long ret asm("a0");
+ register long nr asm(syscallid) = __NR_clock_getres;
+
+ asm volatile ("trap 0\n"
+ : "=r" (ret)
+ : "r"(clkid), "r"(ts), "r"(nr)
+ : "memory");
+
+ return ret;
+}
+
+uint64_t csky_pmu_read_cc(void);
+static __always_inline u64 __arch_get_hw_counter(s32 clock_mode,
+ const struct vdso_data *vd)
+{
+#ifdef CONFIG_CSKY_PMU_V1
+ return csky_pmu_read_cc();
+#else
+ return 0;
+#endif
+}
+
+static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
+{
+ return _vdso_data;
+}
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __ASM_VDSO_CSKY_GETTIMEOFDAY_H */
diff --git a/arch/csky/include/asm/vdso/processor.h b/arch/csky/include/asm/vdso/processor.h
new file mode 100644
index 000000000000..39a6b561d0cc
--- /dev/null
+++ b/arch/csky/include/asm/vdso/processor.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ASM_VDSO_CSKY_PROCESSOR_H
+#define __ASM_VDSO_CSKY_PROCESSOR_H
+
+#ifndef __ASSEMBLY__
+
+#define cpu_relax() barrier()
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_VDSO_CSKY_PROCESSOR_H */
diff --git a/arch/csky/include/asm/vdso/vsyscall.h b/arch/csky/include/asm/vdso/vsyscall.h
new file mode 100644
index 000000000000..c276211a7c4d
--- /dev/null
+++ b/arch/csky/include/asm/vdso/vsyscall.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_VDSO_CSKY_VSYSCALL_H
+#define __ASM_VDSO_CSKY_VSYSCALL_H
+
+#ifndef __ASSEMBLY__
+
+#include <vdso/datapage.h>
+
+extern struct vdso_data *vdso_data;
+
+static __always_inline struct vdso_data *__csky_get_k_vdso_data(void)
+{
+ return vdso_data;
+}
+#define __arch_get_k_vdso_data __csky_get_k_vdso_data
+
+#include <asm-generic/vdso/vsyscall.h>
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __ASM_VDSO_CSKY_VSYSCALL_H */
diff --git a/arch/csky/include/uapi/asm/byteorder.h b/arch/csky/include/uapi/asm/byteorder.h
index d150cd664873..1aedd513b65a 100644
--- a/arch/csky/include/uapi/asm/byteorder.h
+++ b/arch/csky/include/uapi/asm/byteorder.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_BYTEORDER_H
#define __ASM_CSKY_BYTEORDER_H
diff --git a/arch/csky/include/uapi/asm/perf_regs.h b/arch/csky/include/uapi/asm/perf_regs.h
index 49d4e147a559..d0a8ac6a1b77 100644
--- a/arch/csky/include/uapi/asm/perf_regs.h
+++ b/arch/csky/include/uapi/asm/perf_regs.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef _ASM_CSKY_PERF_REGS_H
#define _ASM_CSKY_PERF_REGS_H
diff --git a/arch/csky/include/uapi/asm/ptrace.h b/arch/csky/include/uapi/asm/ptrace.h
index 66b2268e324e..3be9c14334a6 100644
--- a/arch/csky/include/uapi/asm/ptrace.h
+++ b/arch/csky/include/uapi/asm/ptrace.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef _CSKY_PTRACE_H
#define _CSKY_PTRACE_H
diff --git a/arch/csky/include/uapi/asm/sigcontext.h b/arch/csky/include/uapi/asm/sigcontext.h
index 670c020f2cb8..859afb602477 100644
--- a/arch/csky/include/uapi/asm/sigcontext.h
+++ b/arch/csky/include/uapi/asm/sigcontext.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#ifndef __ASM_CSKY_SIGCONTEXT_H
#define __ASM_CSKY_SIGCONTEXT_H
diff --git a/arch/csky/include/uapi/asm/unistd.h b/arch/csky/include/uapi/asm/unistd.h
index ba4018929733..7ff6a2466af1 100644
--- a/arch/csky/include/uapi/asm/unistd.h
+++ b/arch/csky/include/uapi/asm/unistd.h
@@ -1,5 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
#define __ARCH_WANT_STAT64
#define __ARCH_WANT_NEW_STAT
diff --git a/arch/csky/kernel/Makefile b/arch/csky/kernel/Makefile
index 37f37c0e934a..6c0f36010ed0 100644
--- a/arch/csky/kernel/Makefile
+++ b/arch/csky/kernel/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
extra-y := head.o vmlinux.lds
-obj-y += entry.o atomic.o signal.o traps.o irq.o time.o vdso.o
+obj-y += entry.o atomic.o signal.o traps.o irq.o time.o vdso.o vdso/
obj-y += power.o syscall.o syscall_table.o setup.o
obj-y += process.o cpu-probe.o ptrace.o stacktrace.o
obj-y += probes/
diff --git a/arch/csky/kernel/atomic.S b/arch/csky/kernel/atomic.S
index 3821ef9b7567..e73e548f7855 100644
--- a/arch/csky/kernel/atomic.S
+++ b/arch/csky/kernel/atomic.S
@@ -14,6 +14,10 @@
*/
ENTRY(csky_cmpxchg)
USPTOKSP
+
+ RD_MEH a3
+ WR_MEH a3
+
mfcr a3, epc
addi a3, TRAP0_SIZE
@@ -36,11 +40,11 @@ ENTRY(csky_cmpxchg)
2:
sync.is
#else
-1:
+GLOBAL(csky_cmpxchg_ldw)
ldw a3, (a2)
cmpne a0, a3
bt16 3f
-2:
+GLOBAL(csky_cmpxchg_stw)
stw a1, (a2)
3:
#endif
@@ -55,19 +59,3 @@ ENTRY(csky_cmpxchg)
KSPTOUSP
rte
END(csky_cmpxchg)
-
-#ifndef CONFIG_CPU_HAS_LDSTEX
-/*
- * Called from tlbmodified exception
- */
-ENTRY(csky_cmpxchg_fixup)
- mfcr a0, epc
- lrw a1, 2b
- cmpne a1, a0
- bt 1f
- subi a1, (2b - 1b)
- stw a1, (sp, LSAVE_PC)
-1:
- rts
-END(csky_cmpxchg_fixup)
-#endif
diff --git a/arch/csky/kernel/entry.S b/arch/csky/kernel/entry.S
index 5a5cabd076e1..c1bd7a6b4ab6 100644
--- a/arch/csky/kernel/entry.S
+++ b/arch/csky/kernel/entry.S
@@ -13,10 +13,6 @@
#include <asm/page.h>
#include <asm/thread_info.h>
-#define PTE_INDX_MSK 0xffc
-#define PTE_INDX_SHIFT 10
-#define _PGDIR_SHIFT 22
-
.macro zero_fp
#ifdef CONFIG_STACKTRACE
movi r8, 0
@@ -41,108 +37,15 @@
#endif
.endm
-.macro tlbop_begin name, val0, val1, val2
-ENTRY(csky_\name)
- mtcr a3, ss2
- mtcr r6, ss3
- mtcr a2, ss4
-
- RD_PGDR r6
- RD_MEH a3
-#ifdef CONFIG_CPU_HAS_TLBI
- tlbi.vaas a3
- sync.is
-
- btsti a3, 31
- bf 1f
- RD_PGDR_K r6
-1:
-#else
- bgeni a2, 31
- WR_MCIR a2
- bgeni a2, 25
- WR_MCIR a2
-#endif
- bclri r6, 0
- lrw a2, va_pa_offset
- ld.w a2, (a2, 0)
- subu r6, a2
- bseti r6, 31
-
- mov a2, a3
- lsri a2, _PGDIR_SHIFT
- lsli a2, 2
- addu r6, a2
- ldw r6, (r6)
-
- lrw a2, va_pa_offset
- ld.w a2, (a2, 0)
- subu r6, a2
- bseti r6, 31
-
- lsri a3, PTE_INDX_SHIFT
- lrw a2, PTE_INDX_MSK
- and a3, a2
- addu r6, a3
- ldw a3, (r6)
-
- movi a2, (_PAGE_PRESENT | \val0)
- and a3, a2
- cmpne a3, a2
- bt \name
-
- /* First read/write the page, just update the flags */
- ldw a3, (r6)
- bgeni a2, PAGE_VALID_BIT
- bseti a2, PAGE_ACCESSED_BIT
- bseti a2, \val1
- bseti a2, \val2
- or a3, a2
- stw a3, (r6)
-
- /* Some cpu tlb-hardrefill bypass the cache */
-#ifdef CONFIG_CPU_NEED_TLBSYNC
- movi a2, 0x22
- bseti a2, 6
- mtcr r6, cr22
- mtcr a2, cr17
- sync
-#endif
-
- mfcr a3, ss2
- mfcr r6, ss3
- mfcr a2, ss4
- rte
-\name:
- mfcr a3, ss2
- mfcr r6, ss3
- mfcr a2, ss4
+.text
+ENTRY(csky_pagefault)
SAVE_ALL 0
-.endm
-.macro tlbop_end is_write
zero_fp
context_tracking
- RD_MEH a2
- psrset ee, ie
+ psrset ee
mov a0, sp
- movi a1, \is_write
jbsr do_page_fault
jmpi ret_from_exception
-.endm
-
-.text
-
-tlbop_begin tlbinvalidl, _PAGE_READ, PAGE_VALID_BIT, PAGE_ACCESSED_BIT
-tlbop_end 0
-
-tlbop_begin tlbinvalids, _PAGE_WRITE, PAGE_DIRTY_BIT, PAGE_MODIFIED_BIT
-tlbop_end 1
-
-tlbop_begin tlbmodified, _PAGE_WRITE, PAGE_DIRTY_BIT, PAGE_MODIFIED_BIT
-#ifndef CONFIG_CPU_HAS_LDSTEX
-jbsr csky_cmpxchg_fixup
-#endif
-tlbop_end 1
ENTRY(csky_systemcall)
SAVE_ALL TRAP0_SIZE
@@ -314,6 +217,9 @@ ENTRY(csky_trap)
ENTRY(csky_get_tls)
USPTOKSP
+ RD_MEH a0
+ WR_MEH a0
+
/* increase epc for continue */
mfcr a0, epc
addi a0, TRAP0_SIZE
diff --git a/arch/csky/kernel/head.S b/arch/csky/kernel/head.S
index 17ed9d250480..7e3e4f15b052 100644
--- a/arch/csky/kernel/head.S
+++ b/arch/csky/kernel/head.S
@@ -21,10 +21,16 @@ END(_start)
ENTRY(_start_smp_secondary)
SETUP_MMU
- /* copy msa1 from CPU0 */
- lrw r6, secondary_msa1
+#ifdef CONFIG_PAGE_OFFSET_80000000
+ lrw r6, secondary_msa1
ld.w r6, (r6, 0)
mtcr r6, cr<31, 15>
+#endif
+
+ lrw r6, secondary_pgd
+ ld.w r6, (r6, 0)
+ mtcr r6, cr<28, 15>
+ mtcr r6, cr<29, 15>
/* set stack point */
lrw r6, secondary_stack
diff --git a/arch/csky/kernel/perf_event.c b/arch/csky/kernel/perf_event.c
index 1a29f1157449..e5f18420ce64 100644
--- a/arch/csky/kernel/perf_event.c
+++ b/arch/csky/kernel/perf_event.c
@@ -87,7 +87,7 @@ static int csky_pmu_irq;
})
/* cycle counter */
-static uint64_t csky_pmu_read_cc(void)
+uint64_t csky_pmu_read_cc(void)
{
uint32_t lo, hi, tmp;
uint64_t result;
@@ -1319,7 +1319,7 @@ int csky_pmu_device_probe(struct platform_device *pdev,
pr_notice("[perf] PMU request irq fail!\n");
}
- ret = cpuhp_setup_state(CPUHP_AP_PERF_ONLINE, "AP_PERF_ONLINE",
+ ret = cpuhp_setup_state(CPUHP_AP_PERF_CSKY_ONLINE, "AP_PERF_ONLINE",
csky_pmu_starting_cpu,
csky_pmu_dying_cpu);
if (ret) {
diff --git a/arch/csky/kernel/probes/simulate-insn.c b/arch/csky/kernel/probes/simulate-insn.c
index 4e464fed52ec..d6e8d092c9b7 100644
--- a/arch/csky/kernel/probes/simulate-insn.c
+++ b/arch/csky/kernel/probes/simulate-insn.c
@@ -274,9 +274,9 @@ void __kprobes
simulate_bnezad32(u32 opcode, long addr, struct pt_regs *regs)
{
unsigned long tmp = opcode & 0x1f;
- unsigned long val;
+ long val;
- csky_insn_reg_get_val(regs, tmp, &val);
+ csky_insn_reg_get_val(regs, tmp, (unsigned long *)&val);
val -= 1;
@@ -286,7 +286,7 @@ simulate_bnezad32(u32 opcode, long addr, struct pt_regs *regs)
} else
instruction_pointer_set(regs, addr + 4);
- csky_insn_reg_set_val(regs, tmp, val);
+ csky_insn_reg_set_val(regs, tmp, (unsigned long)val);
}
void __kprobes
@@ -297,13 +297,11 @@ simulate_bhsz32(u32 opcode, long addr, struct pt_regs *regs)
csky_insn_reg_get_val(regs, tmp, &val);
- if (val >= 0) {
+ if ((long) val >= 0) {
instruction_pointer_set(regs,
addr + sign_extend32((opcode & 0xffff0000) >> 15, 15));
} else
instruction_pointer_set(regs, addr + 4);
-
- csky_insn_reg_set_val(regs, tmp, val);
}
void __kprobes
@@ -314,13 +312,11 @@ simulate_bhz32(u32 opcode, long addr, struct pt_regs *regs)
csky_insn_reg_get_val(regs, tmp, &val);
- if (val > 0) {
+ if ((long) val > 0) {
instruction_pointer_set(regs,
addr + sign_extend32((opcode & 0xffff0000) >> 15, 15));
} else
instruction_pointer_set(regs, addr + 4);
-
- csky_insn_reg_set_val(regs, tmp, val);
}
void __kprobes
@@ -331,13 +327,11 @@ simulate_blsz32(u32 opcode, long addr, struct pt_regs *regs)
csky_insn_reg_get_val(regs, tmp, &val);
- if (val <= 0) {
+ if ((long) val <= 0) {
instruction_pointer_set(regs,
addr + sign_extend32((opcode & 0xffff0000) >> 15, 15));
} else
instruction_pointer_set(regs, addr + 4);
-
- csky_insn_reg_set_val(regs, tmp, val);
}
void __kprobes
@@ -348,13 +342,11 @@ simulate_blz32(u32 opcode, long addr, struct pt_regs *regs)
csky_insn_reg_get_val(regs, tmp, &val);
- if (val < 0) {
+ if ((long) val < 0) {
instruction_pointer_set(regs,
addr + sign_extend32((opcode & 0xffff0000) >> 15, 15));
} else
instruction_pointer_set(regs, addr + 4);
-
- csky_insn_reg_set_val(regs, tmp, val);
}
void __kprobes
diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c
index 69af6bc87e64..3d0ca22cd0e2 100644
--- a/arch/csky/kernel/process.c
+++ b/arch/csky/kernel/process.c
@@ -49,7 +49,7 @@ int copy_thread(unsigned long clone_flags,
/* setup thread.sp for switch_to !!! */
p->thread.sp = (unsigned long)childstack;
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(childregs, 0, sizeof(struct pt_regs));
childstack->r15 = (unsigned long) ret_from_kernel_thread;
childstack->r10 = kthread_arg;
diff --git a/arch/csky/kernel/ptrace.c b/arch/csky/kernel/ptrace.c
index d822144906ac..0105ac81b432 100644
--- a/arch/csky/kernel/ptrace.c
+++ b/arch/csky/kernel/ptrace.c
@@ -22,6 +22,7 @@
#include <asm/asm-offsets.h>
#include <abi/regdef.h>
+#include <abi/ckmmu.h>
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
@@ -83,7 +84,7 @@ static int gpr_get(struct task_struct *target,
/* Abiv1 regs->tls is fake and we need sync here. */
regs->tls = task_thread_info(target)->tp_value;
- return membuf_write(&to, regs, sizeof(regs));
+ return membuf_write(&to, regs, sizeof(*regs));
}
static int gpr_set(struct task_struct *target,
@@ -343,6 +344,124 @@ asmlinkage void syscall_trace_exit(struct pt_regs *regs)
trace_sys_exit(regs, syscall_get_return_value(current, regs));
}
+#ifdef CONFIG_CPU_CK860
+static void show_iutlb(void)
+{
+ int entry, i;
+ unsigned long flags;
+ unsigned long oldpid;
+ unsigned long entryhi[16], entrylo0[16], entrylo1[16];
+
+ oldpid = read_mmu_entryhi();
+
+ entry = 0x8000;
+
+ local_irq_save(flags);
+
+ for (i = 0; i < 16; i++) {
+ write_mmu_index(entry);
+ tlb_read();
+ entryhi[i] = read_mmu_entryhi();
+ entrylo0[i] = read_mmu_entrylo0();
+ entrylo1[i] = read_mmu_entrylo1();
+
+ entry++;
+ }
+
+ local_irq_restore(flags);
+
+ write_mmu_entryhi(oldpid);
+
+ printk("\n\n\n");
+ for (i = 0; i < 16; i++)
+ printk("iutlb[%d]: entryhi - 0x%lx; entrylo0 - 0x%lx;"
+ " entrylo1 - 0x%lx\n",
+ i, entryhi[i], entrylo0[i], entrylo1[i]);
+ printk("\n\n\n");
+}
+
+static void show_dutlb(void)
+{
+ int entry, i;
+ unsigned long flags;
+ unsigned long oldpid;
+ unsigned long entryhi[16], entrylo0[16], entrylo1[16];
+
+ oldpid = read_mmu_entryhi();
+
+ entry = 0x4000;
+
+ local_irq_save(flags);
+
+ for (i = 0; i < 16; i++) {
+ write_mmu_index(entry);
+ tlb_read();
+ entryhi[i] = read_mmu_entryhi();
+ entrylo0[i] = read_mmu_entrylo0();
+ entrylo1[i] = read_mmu_entrylo1();
+
+ entry++;
+ }
+
+ local_irq_restore(flags);
+
+ write_mmu_entryhi(oldpid);
+
+ printk("\n\n\n");
+ for (i = 0; i < 16; i++)
+ printk("dutlb[%d]: entryhi - 0x%lx; entrylo0 - 0x%lx;"
+ " entrylo1 - 0x%lx\n",
+ i, entryhi[i], entrylo0[i], entrylo1[i]);
+ printk("\n\n\n");
+}
+
+static unsigned long entryhi[1024], entrylo0[1024], entrylo1[1024];
+static void show_jtlb(void)
+{
+ int entry;
+ unsigned long flags;
+ unsigned long oldpid;
+
+ oldpid = read_mmu_entryhi();
+
+ entry = 0;
+
+ local_irq_save(flags);
+ while (entry < 1024) {
+ write_mmu_index(entry);
+ tlb_read();
+ entryhi[entry] = read_mmu_entryhi();
+ entrylo0[entry] = read_mmu_entrylo0();
+ entrylo1[entry] = read_mmu_entrylo1();
+
+ entry++;
+ }
+ local_irq_restore(flags);
+
+ write_mmu_entryhi(oldpid);
+
+ printk("\n\n\n");
+
+ for (entry = 0; entry < 1024; entry++)
+ printk("jtlb[%x]: entryhi - 0x%lx; entrylo0 - 0x%lx;"
+ " entrylo1 - 0x%lx\n",
+ entry, entryhi[entry], entrylo0[entry], entrylo1[entry]);
+ printk("\n\n\n");
+}
+
+static void show_tlb(void)
+{
+ show_iutlb();
+ show_dutlb();
+ show_jtlb();
+}
+#else
+static void show_tlb(void)
+{
+ return;
+}
+#endif
+
void show_regs(struct pt_regs *fp)
{
pr_info("\nCURRENT PROCESS:\n\n");
@@ -363,9 +482,10 @@ void show_regs(struct pt_regs *fp)
pr_info("PC: 0x%08lx (%pS)\n", (long)fp->pc, (void *)fp->pc);
pr_info("LR: 0x%08lx (%pS)\n", (long)fp->lr, (void *)fp->lr);
- pr_info("SP: 0x%08lx\n", (long)fp);
- pr_info("orig_a0: 0x%08lx\n", fp->orig_a0);
+ pr_info("SP: 0x%08lx\n", (long)fp->usp);
pr_info("PSR: 0x%08lx\n", (long)fp->sr);
+ pr_info("orig_a0: 0x%08lx\n", fp->orig_a0);
+ pr_info("PT_REGS: 0x%08lx\n", (long)fp);
pr_info(" a0: 0x%08lx a1: 0x%08lx a2: 0x%08lx a3: 0x%08lx\n",
fp->a0, fp->a1, fp->a2, fp->a3);
@@ -395,5 +515,7 @@ void show_regs(struct pt_regs *fp)
fp->regs[8], fp->regs[9]);
#endif
+ show_tlb();
+
return;
}
diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c
index e4cab16056d6..e93bc6f74432 100644
--- a/arch/csky/kernel/setup.c
+++ b/arch/csky/kernel/setup.c
@@ -45,13 +45,17 @@ static void __init csky_memblock_init(void)
if (size >= lowmem_size) {
max_low_pfn = min_low_pfn + lowmem_size;
+#ifdef CONFIG_PAGE_OFFSET_80000000
write_mmu_msa1(read_mmu_msa0() + SSEG_SIZE);
+#endif
} else if (size > sseg_size) {
max_low_pfn = min_low_pfn + sseg_size;
}
max_zone_pfn[ZONE_NORMAL] = max_low_pfn;
+ mmu_init(min_low_pfn, max_low_pfn);
+
#ifdef CONFIG_HIGHMEM
max_zone_pfn[ZONE_HIGHMEM] = max_pfn;
@@ -101,16 +105,26 @@ void __init setup_arch(char **cmdline_p)
unsigned long va_pa_offset;
EXPORT_SYMBOL(va_pa_offset);
+static inline unsigned long read_mmu_msa(void)
+{
+#ifdef CONFIG_PAGE_OFFSET_80000000
+ return read_mmu_msa0();
+#endif
+
+#ifdef CONFIG_PAGE_OFFSET_A0000000
+ return read_mmu_msa1();
+#endif
+}
+
asmlinkage __visible void __init csky_start(unsigned int unused,
void *dtb_start)
{
/* Clean up bss section */
memset(__bss_start, 0, __bss_stop - __bss_start);
- va_pa_offset = read_mmu_msa0() & ~(SSEG_SIZE - 1);
+ va_pa_offset = read_mmu_msa() & ~(SSEG_SIZE - 1);
pre_trap_init();
- pre_mmu_init();
if (dtb_start == NULL)
early_init_dt_scan(__dtb_start);
diff --git a/arch/csky/kernel/signal.c b/arch/csky/kernel/signal.c
index 37ea64ed3c12..312f046d452d 100644
--- a/arch/csky/kernel/signal.c
+++ b/arch/csky/kernel/signal.c
@@ -134,7 +134,6 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs)
{
struct rt_sigframe *frame;
int err = 0;
- struct csky_vdso *vdso = current->mm->context.vdso;
frame = get_sigframe(ksig, regs, sizeof(*frame));
if (!access_ok(frame, sizeof(*frame)))
@@ -152,7 +151,8 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs)
return -EFAULT;
/* Set up to return from userspace. */
- regs->lr = (unsigned long)(vdso->rt_signal_retcode);
+ regs->lr = (unsigned long)VDSO_SYMBOL(
+ current->mm->context.vdso, rt_sigreturn);
/*
* Set up registers for signal handler.
diff --git a/arch/csky/kernel/smp.c b/arch/csky/kernel/smp.c
index 041d0de6a1b6..0f9f5eef9338 100644
--- a/arch/csky/kernel/smp.c
+++ b/arch/csky/kernel/smp.c
@@ -203,8 +203,8 @@ volatile unsigned int secondary_hint;
volatile unsigned int secondary_hint2;
volatile unsigned int secondary_ccr;
volatile unsigned int secondary_stack;
-
-unsigned long secondary_msa1;
+volatile unsigned int secondary_msa1;
+volatile unsigned int secondary_pgd;
int __cpu_up(unsigned int cpu, struct task_struct *tidle)
{
@@ -216,6 +216,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
secondary_hint2 = mfcr("cr<21, 1>");
secondary_ccr = mfcr("cr18");
secondary_msa1 = read_mmu_msa1();
+ secondary_pgd = mfcr("cr<29, 15>");
/*
* Because other CPUs are in reset status, we must flush data
@@ -262,8 +263,6 @@ void csky_start_secondary(void)
flush_tlb_all();
write_mmu_pagemask(0);
- TLBMISS_HANDLER_SETUP_PGD(swapper_pg_dir);
- TLBMISS_HANDLER_SETUP_PGD_KERNEL(swapper_pg_dir);
#ifdef CONFIG_CPU_HAS_FPU
init_fpu();
diff --git a/arch/csky/kernel/traps.c b/arch/csky/kernel/traps.c
index 959a917c989d..e5fbf8653a21 100644
--- a/arch/csky/kernel/traps.c
+++ b/arch/csky/kernel/traps.c
@@ -39,9 +39,7 @@ asmlinkage void csky_cmpxchg(void);
asmlinkage void csky_get_tls(void);
asmlinkage void csky_irq(void);
-asmlinkage void csky_tlbinvalidl(void);
-asmlinkage void csky_tlbinvalids(void);
-asmlinkage void csky_tlbmodified(void);
+asmlinkage void csky_pagefault(void);
/* Defined in head.S */
asmlinkage void _start_smp_secondary(void);
@@ -66,9 +64,9 @@ void __init trap_init(void)
VEC_INIT(VEC_TRAP3, csky_get_tls);
/* setup MMU TLB exception */
- VEC_INIT(VEC_TLBINVALIDL, csky_tlbinvalidl);
- VEC_INIT(VEC_TLBINVALIDS, csky_tlbinvalids);
- VEC_INIT(VEC_TLBMODIFIED, csky_tlbmodified);
+ VEC_INIT(VEC_TLBINVALIDL, csky_pagefault);
+ VEC_INIT(VEC_TLBINVALIDS, csky_pagefault);
+ VEC_INIT(VEC_TLBMODIFIED, csky_pagefault);
#ifdef CONFIG_CPU_HAS_FPU
init_fpu();
diff --git a/arch/csky/kernel/vdso.c b/arch/csky/kernel/vdso.c
index abc3dbc658d4..16c20d64d165 100644
--- a/arch/csky/kernel/vdso.c
+++ b/arch/csky/kernel/vdso.c
@@ -1,86 +1,107 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
-#include <linux/kernel.h>
-#include <linux/err.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/init.h>
#include <linux/binfmts.h>
#include <linux/elf.h>
-#include <linux/vmalloc.h>
-#include <linux/unistd.h>
-#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <asm/page.h>
+#ifdef GENERIC_TIME_VSYSCALL
+#include <vdso/datapage.h>
+#else
#include <asm/vdso.h>
-#include <asm/cacheflush.h>
+#endif
-static struct page *vdso_page;
+extern char vdso_start[], vdso_end[];
-static int __init init_vdso(void)
-{
- struct csky_vdso *vdso;
- int err = 0;
-
- vdso_page = alloc_page(GFP_KERNEL);
- if (!vdso_page)
- panic("Cannot allocate vdso");
+static unsigned int vdso_pages;
+static struct page **vdso_pagelist;
- vdso = vmap(&vdso_page, 1, 0, PAGE_KERNEL);
- if (!vdso)
- panic("Cannot map vdso");
+/*
+ * The vDSO data page.
+ */
+static union {
+ struct vdso_data data;
+ u8 page[PAGE_SIZE];
+} vdso_data_store __page_aligned_data;
+struct vdso_data *vdso_data = &vdso_data_store.data;
- clear_page(vdso);
-
- err = setup_vdso_page(vdso->rt_signal_retcode);
- if (err)
- panic("Cannot set signal return code, err: %x.", err);
+static int __init vdso_init(void)
+{
+ unsigned int i;
+
+ vdso_pages = (vdso_end - vdso_start) >> PAGE_SHIFT;
+ vdso_pagelist =
+ kcalloc(vdso_pages + 1, sizeof(struct page *), GFP_KERNEL);
+ if (unlikely(vdso_pagelist == NULL)) {
+ pr_err("vdso: pagelist allocation failed\n");
+ return -ENOMEM;
+ }
- dcache_wb_range((unsigned long)vdso, (unsigned long)vdso + 16);
+ for (i = 0; i < vdso_pages; i++) {
+ struct page *pg;
- vunmap(vdso);
+ pg = virt_to_page(vdso_start + (i << PAGE_SHIFT));
+ vdso_pagelist[i] = pg;
+ }
+ vdso_pagelist[i] = virt_to_page(vdso_data);
return 0;
}
-subsys_initcall(init_vdso);
+arch_initcall(vdso_init);
-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+int arch_setup_additional_pages(struct linux_binprm *bprm,
+ int uses_interp)
{
- int ret;
- unsigned long addr;
struct mm_struct *mm = current->mm;
+ unsigned long vdso_base, vdso_len;
+ int ret;
- mmap_write_lock(mm);
+ vdso_len = (vdso_pages + 1) << PAGE_SHIFT;
- addr = get_unmapped_area(NULL, STACK_TOP, PAGE_SIZE, 0, 0);
- if (IS_ERR_VALUE(addr)) {
- ret = addr;
- goto up_fail;
+ mmap_write_lock(mm);
+ vdso_base = get_unmapped_area(NULL, 0, vdso_len, 0, 0);
+ if (IS_ERR_VALUE(vdso_base)) {
+ ret = vdso_base;
+ goto end;
}
- ret = install_special_mapping(
- mm,
- addr,
- PAGE_SIZE,
- VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
- &vdso_page);
- if (ret)
- goto up_fail;
+ /*
+ * Put vDSO base into mm struct. We need to do this before calling
+ * install_special_mapping or the perf counter mmap tracking code
+ * will fail to recognise it as a vDSO (since arch_vma_name fails).
+ */
+ mm->context.vdso = (void *)vdso_base;
+
+ ret =
+ install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT,
+ (VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC),
+ vdso_pagelist);
+
+ if (unlikely(ret)) {
+ mm->context.vdso = NULL;
+ goto end;
+ }
- mm->context.vdso = (void *)addr;
+ vdso_base += (vdso_pages << PAGE_SHIFT);
+ ret = install_special_mapping(mm, vdso_base, PAGE_SIZE,
+ (VM_READ | VM_MAYREAD), &vdso_pagelist[vdso_pages]);
-up_fail:
+ if (unlikely(ret))
+ mm->context.vdso = NULL;
+end:
mmap_write_unlock(mm);
return ret;
}
const char *arch_vma_name(struct vm_area_struct *vma)
{
- if (vma->vm_mm == NULL)
- return NULL;
-
- if (vma->vm_start == (long)vma->vm_mm->context.vdso)
+ if (vma->vm_mm && (vma->vm_start == (long)vma->vm_mm->context.vdso))
return "[vdso]";
- else
- return NULL;
+ if (vma->vm_mm && (vma->vm_start ==
+ (long)vma->vm_mm->context.vdso + PAGE_SIZE))
+ return "[vdso_data]";
+ return NULL;
}
diff --git a/arch/csky/kernel/vdso/.gitignore b/arch/csky/kernel/vdso/.gitignore
new file mode 100644
index 000000000000..3a19def868ec
--- /dev/null
+++ b/arch/csky/kernel/vdso/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+vdso.lds
+*.tmp
+vdso-syms.S
diff --git a/arch/csky/kernel/vdso/Makefile b/arch/csky/kernel/vdso/Makefile
new file mode 100644
index 000000000000..0b6909f10667
--- /dev/null
+++ b/arch/csky/kernel/vdso/Makefile
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
+# the inclusion of generic Makefile.
+ARCH_REL_TYPE_ABS := R_CKCORE_ADDR32|R_CKCORE_JUMP_SLOT
+include $(srctree)/lib/vdso/Makefile
+
+# Symbols present in the vdso
+vdso-syms += rt_sigreturn
+vdso-syms += vgettimeofday
+
+# Files to link into the vdso
+obj-vdso = $(patsubst %, %.o, $(vdso-syms)) note.o
+
+ifneq ($(c-gettimeofday-y),)
+ CFLAGS_vgettimeofday.o += -include $(c-gettimeofday-y)
+endif
+
+ccflags-y := -fno-stack-protector -DBUILD_VDSO32
+
+# Build rules
+targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.lds vdso-dummy.o
+obj-vdso := $(addprefix $(obj)/, $(obj-vdso))
+
+obj-y += vdso.o vdso-syms.o
+CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
+
+# Disable gcov profiling for VDSO code
+GCOV_PROFILE := n
+KCOV_INSTRUMENT := n
+
+# Force dependency
+$(obj)/vdso.o: $(obj)/vdso.so
+
+SYSCFLAGS_vdso.so.dbg = $(c_flags)
+$(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso) FORCE
+ $(call if_changed,vdsold)
+SYSCFLAGS_vdso.so.dbg = -shared -s -Wl,-soname=linux-vdso.so.1 \
+ -Wl,--build-id=sha1 -Wl,--hash-style=both
+
+$(obj)/vdso-syms.S: $(obj)/vdso.so FORCE
+ $(call if_changed,so2s)
+
+# strip rule for the .so file
+$(obj)/%.so: OBJCOPYFLAGS := -S
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
+ $(call if_changed,objcopy)
+
+# actual build commands
+# The DSO images are built using a special linker script
+# Make sure only to export the intended __vdso_xxx symbol offsets.
+quiet_cmd_vdsold = VDSOLD $@
+ cmd_vdsold = $(CC) $(KBUILD_CFLAGS) $(call cc-option, -no-pie) -nostdlib -nostartfiles $(SYSCFLAGS_$(@F)) \
+ -Wl,-T,$(filter-out FORCE,$^) -o $@.tmp && \
+ $(CROSS_COMPILE)objcopy \
+ $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@ && \
+ rm $@.tmp
+
+# Extracts symbol offsets from the VDSO, converting them into an assembly file
+# that contains the same symbols at the same offsets.
+quiet_cmd_so2s = SO2S $@
+ cmd_so2s = $(NM) -D $< | $(srctree)/$(src)/so2s.sh > $@
+
+# install commands for the unstripped file
+quiet_cmd_vdso_install = INSTALL $@
+ cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
+
+vdso.so: $(obj)/vdso.so.dbg
+ @mkdir -p $(MODLIB)/vdso
+ $(call cmd,vdso_install)
+
+vdso_install: vdso.so
diff --git a/arch/csky/kernel/vdso/note.S b/arch/csky/kernel/vdso/note.S
new file mode 100644
index 000000000000..2a956c942211
--- /dev/null
+++ b/arch/csky/kernel/vdso/note.S
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/elfnote.h>
+#include <linux/version.h>
+
+ELFNOTE_START(Linux, 0, "a")
+ .long LINUX_VERSION_CODE
+ELFNOTE_END
diff --git a/arch/csky/kernel/vdso/rt_sigreturn.S b/arch/csky/kernel/vdso/rt_sigreturn.S
new file mode 100644
index 000000000000..0a6bd1216118
--- /dev/null
+++ b/arch/csky/kernel/vdso/rt_sigreturn.S
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/linkage.h>
+#include <asm/unistd.h>
+#include <abi/vdso.h>
+
+ .text
+ENTRY(__vdso_rt_sigreturn)
+ .cfi_startproc
+ .cfi_signal_frame
+ SET_SYSCALL_ID
+ trap 0
+ .cfi_endproc
+ENDPROC(__vdso_rt_sigreturn)
diff --git a/arch/csky/kernel/vdso/so2s.sh b/arch/csky/kernel/vdso/so2s.sh
new file mode 100755
index 000000000000..69da3d529c6d
--- /dev/null
+++ b/arch/csky/kernel/vdso/so2s.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0+
+
+sed 's!\([0-9a-f]*\) T \([a-z0-9_]*\)\(@@LINUX_5.10\)*!.global \2\n.set \2,0x\1!' \
+| grep '^\.'
diff --git a/arch/csky/kernel/vdso/vdso.S b/arch/csky/kernel/vdso/vdso.S
new file mode 100644
index 000000000000..5162ca069494
--- /dev/null
+++ b/arch/csky/kernel/vdso/vdso.S
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+ __PAGE_ALIGNED_DATA
+
+ .globl vdso_start, vdso_end
+ .balign PAGE_SIZE
+vdso_start:
+ .incbin "arch/csky/kernel/vdso/vdso.so"
+ .balign PAGE_SIZE
+vdso_end:
+
+ .previous
diff --git a/arch/csky/kernel/vdso/vdso.lds.S b/arch/csky/kernel/vdso/vdso.lds.S
new file mode 100644
index 000000000000..590a6c79fff7
--- /dev/null
+++ b/arch/csky/kernel/vdso/vdso.lds.S
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <asm/page.h>
+
+OUTPUT_ARCH(csky)
+
+SECTIONS
+{
+ PROVIDE(_vdso_data = . + PAGE_SIZE);
+ . = SIZEOF_HEADERS;
+
+ .hash : { *(.hash) } :text
+ .gnu.hash : { *(.gnu.hash) }
+ .dynsym : { *(.dynsym) }
+ .dynstr : { *(.dynstr) }
+ .gnu.version : { *(.gnu.version) }
+ .gnu.version_d : { *(.gnu.version_d) }
+ .gnu.version_r : { *(.gnu.version_r) }
+
+ .note : { *(.note.*) } :text :note
+ .dynamic : { *(.dynamic) } :text :dynamic
+
+ .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
+ .eh_frame : { KEEP (*(.eh_frame)) } :text
+
+ .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+
+ . = 0x800;
+ .text : { *(.text .text.*) } :text
+
+ .data : {
+ *(.got.plt) *(.got)
+ *(.data .data.* .gnu.linkonce.d.*)
+ *(.dynbss)
+ *(.bss .bss.* .gnu.linkonce.b.*)
+ }
+}
+
+PHDRS
+{
+ text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */
+ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+ note PT_NOTE FLAGS(4); /* PF_R */
+ eh_frame_hdr PT_GNU_EH_FRAME;
+}
+
+VERSION
+{
+ LINUX_5.10 {
+ global:
+ __vdso_rt_sigreturn;
+ __vdso_clock_gettime;
+ __vdso_clock_gettime64;
+ __vdso_gettimeofday;
+ __vdso_clock_getres;
+ local: *;
+ };
+}
diff --git a/arch/csky/kernel/vdso/vgettimeofday.c b/arch/csky/kernel/vdso/vgettimeofday.c
new file mode 100644
index 000000000000..da491832c098
--- /dev/null
+++ b/arch/csky/kernel/vdso/vgettimeofday.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/time.h>
+#include <linux/types.h>
+
+int __vdso_clock_gettime(clockid_t clock,
+ struct old_timespec32 *ts)
+{
+ return __cvdso_clock_gettime32(clock, ts);
+}
+
+int __vdso_clock_gettime64(clockid_t clock,
+ struct __kernel_timespec *ts)
+{
+ return __cvdso_clock_gettime(clock, ts);
+}
+
+int __vdso_gettimeofday(struct __kernel_old_timeval *tv,
+ struct timezone *tz)
+{
+ return __cvdso_gettimeofday(tv, tz);
+}
+
+int __vdso_clock_getres(clockid_t clock_id,
+ struct old_timespec32 *res)
+{
+ return __cvdso_clock_getres_time32(clock_id, res);
+}
diff --git a/arch/csky/kernel/vmlinux.lds.S b/arch/csky/kernel/vmlinux.lds.S
index f03033e17c29..e8b1a4a49798 100644
--- a/arch/csky/kernel/vmlinux.lds.S
+++ b/arch/csky/kernel/vmlinux.lds.S
@@ -33,6 +33,7 @@ SECTIONS
.text : AT(ADDR(.text) - LOAD_OFFSET) {
_text = .;
+ VBR_BASE
IRQENTRY_TEXT
SOFTIRQENTRY_TEXT
TEXT_TEXT
@@ -104,7 +105,6 @@ SECTIONS
EXCEPTION_TABLE(L1_CACHE_BYTES)
BSS_SECTION(L1_CACHE_BYTES, PAGE_SIZE, L1_CACHE_BYTES)
- VBR_BASE
_end = . ;
STABS_DEBUG
diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c
index 081b178b41b1..1482de56f4f7 100644
--- a/arch/csky/mm/fault.c
+++ b/arch/csky/mm/fault.c
@@ -1,29 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
-#include <linux/signal.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/version.h>
-#include <linux/vt_kern.h>
#include <linux/extable.h>
-#include <linux/uaccess.h>
-#include <linux/perf_event.h>
#include <linux/kprobes.h>
-
-#include <asm/hardirq.h>
-#include <asm/mmu_context.h>
-#include <asm/traps.h>
-#include <asm/page.h>
+#include <linux/mmu_context.h>
+#include <linux/perf_event.h>
int fixup_exception(struct pt_regs *regs)
{
@@ -39,180 +20,287 @@ int fixup_exception(struct pt_regs *regs)
return 0;
}
-/*
- * This routine handles page faults. It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
- */
-asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
- unsigned long mmu_meh)
+static inline bool is_write(struct pt_regs *regs)
{
- struct vm_area_struct *vma = NULL;
- struct task_struct *tsk = current;
- struct mm_struct *mm = tsk->mm;
- int si_code;
- int fault;
- unsigned long address = mmu_meh & PAGE_MASK;
+ switch (trap_no(regs)) {
+ case VEC_TLBINVALIDS:
+ return true;
+ case VEC_TLBMODIFIED:
+ return true;
+ }
- if (kprobe_page_fault(regs, tsk->thread.trap_no))
+ return false;
+}
+
+#ifdef CONFIG_CPU_HAS_LDSTEX
+static inline void csky_cmpxchg_fixup(struct pt_regs *regs)
+{
+ return;
+}
+#else
+extern unsigned long csky_cmpxchg_ldw;
+extern unsigned long csky_cmpxchg_stw;
+static inline void csky_cmpxchg_fixup(struct pt_regs *regs)
+{
+ if (trap_no(regs) != VEC_TLBMODIFIED)
return;
- si_code = SEGV_MAPERR;
+ if (instruction_pointer(regs) == csky_cmpxchg_stw)
+ instruction_pointer_set(regs, csky_cmpxchg_ldw);
+ return;
+}
+#endif
+
+static inline void no_context(struct pt_regs *regs, unsigned long addr)
+{
+ current->thread.trap_no = trap_no(regs);
+
+ /* Are we prepared to handle this kernel fault? */
+ if (fixup_exception(regs))
+ return;
-#ifndef CONFIG_CPU_HAS_TLBI
/*
- * We fault-in kernel-space virtual memory on-demand. The
- * 'reference' page table is init_mm.pgd.
- *
- * NOTE! We MUST NOT take any locks for this case. We may
- * be in an interrupt or a critical region, and should
- * only copy the information from the master page table,
- * nothing more.
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice.
*/
- if (unlikely(address >= VMALLOC_START) &&
- unlikely(address <= VMALLOC_END)) {
- /*
- * Synchronize this task's top level page-table
- * with the 'reference' page table.
- *
- * Do _not_ use "tsk" here. We might be inside
- * an interrupt in the middle of a task switch..
- */
- int offset = pgd_index(address);
- pgd_t *pgd, *pgd_k;
- pud_t *pud, *pud_k;
- pmd_t *pmd, *pmd_k;
- pte_t *pte_k;
+ bust_spinlocks(1);
+ pr_alert("Unable to handle kernel paging request at virtual "
+ "addr 0x%08lx, pc: 0x%08lx\n", addr, regs->pc);
+ die(regs, "Oops");
+ do_exit(SIGKILL);
+}
- unsigned long pgd_base;
+static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_fault_t fault)
+{
+ current->thread.trap_no = trap_no(regs);
- pgd_base = (unsigned long)__va(get_pgd());
- pgd = (pgd_t *)pgd_base + offset;
- pgd_k = init_mm.pgd + offset;
+ if (fault & VM_FAULT_OOM) {
+ /*
+ * We ran out of memory, call the OOM killer, and return the userspace
+ * (which will retry the fault, or kill us if we got oom-killed).
+ */
+ if (!user_mode(regs)) {
+ no_context(regs, addr);
+ return;
+ }
+ pagefault_out_of_memory();
+ return;
+ } else if (fault & VM_FAULT_SIGBUS) {
+ /* Kernel mode? Handle exceptions or die */
+ if (!user_mode(regs)) {
+ no_context(regs, addr);
+ return;
+ }
+ do_trap(regs, SIGBUS, BUS_ADRERR, addr);
+ return;
+ }
+ BUG();
+}
- if (!pgd_present(*pgd_k))
- goto no_context;
- set_pgd(pgd, *pgd_k);
+static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
+{
+ /*
+ * Something tried to access memory that isn't in our memory map.
+ * Fix it, but check if it's kernel or user first.
+ */
+ mmap_read_unlock(mm);
+ /* User mode accesses just cause a SIGSEGV */
+ if (user_mode(regs)) {
+ do_trap(regs, SIGSEGV, code, addr);
+ return;
+ }
- pud = (pud_t *)pgd;
- pud_k = (pud_t *)pgd_k;
- if (!pud_present(*pud_k))
- goto no_context;
+ no_context(regs, addr);
+}
- pmd = pmd_offset(pud, address);
- pmd_k = pmd_offset(pud_k, address);
- if (!pmd_present(*pmd_k))
- goto no_context;
- set_pmd(pmd, *pmd_k);
+static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr)
+{
+ pgd_t *pgd, *pgd_k;
+ pud_t *pud, *pud_k;
+ pmd_t *pmd, *pmd_k;
+ pte_t *pte_k;
+ int offset;
- pte_k = pte_offset_kernel(pmd_k, address);
- if (!pte_present(*pte_k))
- goto no_context;
+ /* User mode accesses just cause a SIGSEGV */
+ if (user_mode(regs)) {
+ do_trap(regs, SIGSEGV, code, addr);
return;
}
-#endif
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
/*
- * If we're in an interrupt or have no user
- * context, we must not take the fault..
+ * Synchronize this task's top level page-table
+ * with the 'reference' page table.
+ *
+ * Do _not_ use "tsk" here. We might be inside
+ * an interrupt in the middle of a task switch..
*/
- if (in_atomic() || !mm)
- goto bad_area_nosemaphore;
+ offset = pgd_index(addr);
- mmap_read_lock(mm);
- vma = find_vma(mm, address);
- if (!vma)
- goto bad_area;
- if (vma->vm_start <= address)
- goto good_area;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto bad_area;
- if (expand_stack(vma, address))
- goto bad_area;
- /*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
-good_area:
- si_code = SEGV_ACCERR;
+ pgd = get_pgd() + offset;
+ pgd_k = init_mm.pgd + offset;
- if (write) {
+ if (!pgd_present(*pgd_k)) {
+ no_context(regs, addr);
+ return;
+ }
+ set_pgd(pgd, *pgd_k);
+
+ pud = (pud_t *)pgd;
+ pud_k = (pud_t *)pgd_k;
+ if (!pud_present(*pud_k)) {
+ no_context(regs, addr);
+ return;
+ }
+
+ pmd = pmd_offset(pud, addr);
+ pmd_k = pmd_offset(pud_k, addr);
+ if (!pmd_present(*pmd_k)) {
+ no_context(regs, addr);
+ return;
+ }
+ set_pmd(pmd, *pmd_k);
+
+ pte_k = pte_offset_kernel(pmd_k, addr);
+ if (!pte_present(*pte_k)) {
+ no_context(regs, addr);
+ return;
+ }
+
+ flush_tlb_one(addr);
+}
+
+static inline bool access_error(struct pt_regs *regs, struct vm_area_struct *vma)
+{
+ if (is_write(regs)) {
if (!(vma->vm_flags & VM_WRITE))
- goto bad_area;
+ return true;
} else {
if (unlikely(!vma_is_accessible(vma)))
- goto bad_area;
+ return true;
}
+ return false;
+}
+
+/*
+ * This routine handles page faults. It determines the address and the
+ * problem, and then passes it off to one of the appropriate routines.
+ */
+asmlinkage void do_page_fault(struct pt_regs *regs)
+{
+ struct task_struct *tsk;
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ unsigned long addr = read_mmu_entryhi() & PAGE_MASK;
+ unsigned int flags = FAULT_FLAG_DEFAULT;
+ int code = SEGV_MAPERR;
+ vm_fault_t fault;
+
+ tsk = current;
+ mm = tsk->mm;
+
+ csky_cmpxchg_fixup(regs);
+
+ if (kprobe_page_fault(regs, tsk->thread.trap_no))
+ return;
/*
- * If for any reason at all we couldn't handle the fault,
- * make sure we exit gracefully rather than endlessly redo
- * the fault.
+ * Fault-in kernel-space virtual memory on-demand.
+ * The 'reference' page table is init_mm.pgd.
+ *
+ * NOTE! We MUST NOT take any locks for this case. We may
+ * be in an interrupt or a critical region, and should
+ * only copy the information from the master page table,
+ * nothing more.
*/
- fault = handle_mm_fault(vma, address, write ? FAULT_FLAG_WRITE : 0,
- regs);
- if (unlikely(fault & VM_FAULT_ERROR)) {
- if (fault & VM_FAULT_OOM)
- goto out_of_memory;
- else if (fault & VM_FAULT_SIGBUS)
- goto do_sigbus;
- else if (fault & VM_FAULT_SIGSEGV)
- goto bad_area;
- BUG();
+ if (unlikely((addr >= VMALLOC_START) && (addr <= VMALLOC_END))) {
+ vmalloc_fault(regs, code, addr);
+ return;
}
- mmap_read_unlock(mm);
- return;
+
+ /* Enable interrupts if they were enabled in the parent context. */
+ if (likely(regs->sr & BIT(6)))
+ local_irq_enable();
/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
+ * If we're in an interrupt, have no user context, or are running
+ * in an atomic region, then we must not take the fault.
*/
-bad_area:
- mmap_read_unlock(mm);
-
-bad_area_nosemaphore:
- /* User mode accesses just cause a SIGSEGV */
- if (user_mode(regs)) {
- tsk->thread.trap_no = trap_no(regs);
- force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+ if (unlikely(faulthandler_disabled() || !mm)) {
+ no_context(regs, addr);
return;
}
-no_context:
- tsk->thread.trap_no = trap_no(regs);
+ if (user_mode(regs))
+ flags |= FAULT_FLAG_USER;
- /* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs))
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
+
+ if (is_write(regs))
+ flags |= FAULT_FLAG_WRITE;
+retry:
+ mmap_read_lock(mm);
+ vma = find_vma(mm, addr);
+ if (unlikely(!vma)) {
+ bad_area(regs, mm, code, addr);
+ return;
+ }
+ if (likely(vma->vm_start <= addr))
+ goto good_area;
+ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+ bad_area(regs, mm, code, addr);
return;
+ }
+ if (unlikely(expand_stack(vma, addr))) {
+ bad_area(regs, mm, code, addr);
+ return;
+ }
/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it.
*/
- bust_spinlocks(1);
- pr_alert("Unable to handle kernel paging request at virtual "
- "address 0x%08lx, pc: 0x%08lx\n", address, regs->pc);
- die(regs, "Oops");
+good_area:
+ code = SEGV_ACCERR;
+
+ if (unlikely(access_error(regs, vma))) {
+ bad_area(regs, mm, code, addr);
+ return;
+ }
-out_of_memory:
- tsk->thread.trap_no = trap_no(regs);
+ /*
+ * If for any reason at all we could not handle the fault,
+ * make sure we exit gracefully rather than endlessly redo
+ * the fault.
+ */
+ fault = handle_mm_fault(vma, addr, flags, regs);
/*
- * We ran out of memory, call the OOM killer, and return the userspace
- * (which will retry the fault, or kill us if we got oom-killed).
+ * If we need to retry but a fatal signal is pending, handle the
+ * signal first. We do not need to release the mmap_lock because it
+ * would already be released in __lock_page_or_retry in mm/filemap.c.
*/
- pagefault_out_of_memory();
- return;
+ if (fault_signal_pending(fault, regs)) {
+ if (!user_mode(regs))
+ no_context(regs, addr);
+ return;
+ }
-do_sigbus:
- tsk->thread.trap_no = trap_no(regs);
+ if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) {
+ flags |= FAULT_FLAG_TRIED;
- mmap_read_unlock(mm);
+ /*
+ * No need to mmap_read_unlock(mm) as we would
+ * have already released it in __lock_page_or_retry
+ * in mm/filemap.c.
+ */
+ goto retry;
+ }
- /* Kernel mode? Handle exceptions or die */
- if (!user_mode(regs))
- goto no_context;
+ mmap_read_unlock(mm);
- force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
+ if (unlikely(fault & VM_FAULT_ERROR)) {
+ mm_fault_error(regs, addr, fault);
+ return;
+ }
+ return;
}
diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c
index af627128314f..894050a8ce09 100644
--- a/arch/csky/mm/init.c
+++ b/arch/csky/mm/init.c
@@ -28,9 +28,15 @@
#include <asm/mmu_context.h>
#include <asm/sections.h>
#include <asm/tlb.h>
+#include <asm/cacheflush.h>
+
+#define PTRS_KERN_TABLE \
+ ((PTRS_PER_PGD - USER_PTRS_PER_PGD) * PTRS_PER_PTE)
pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned_bss;
+pte_t kernel_pte_tables[PTRS_KERN_TABLE] __page_aligned_bss;
+
EXPORT_SYMBOL(invalid_pte_table);
unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
__page_aligned_bss;
@@ -80,9 +86,9 @@ void __init mem_init(void)
#ifdef CONFIG_HIGHMEM
unsigned long tmp;
- max_mapnr = highend_pfn;
+ set_max_mapnr(highend_pfn - ARCH_PFN_OFFSET);
#else
- max_mapnr = max_low_pfn;
+ set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET);
#endif
high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);
@@ -104,24 +110,9 @@ void __init mem_init(void)
mem_init_print_info(NULL);
}
-extern char __init_begin[], __init_end[];
-
void free_initmem(void)
{
- unsigned long addr;
-
- addr = (unsigned long) &__init_begin;
-
- while (addr < (unsigned long) &__init_end) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- free_page(addr);
- totalram_pages_inc();
- addr += PAGE_SIZE;
- }
-
- pr_info("Freeing unused kernel memory: %dk freed\n",
- ((unsigned int)&__init_end - (unsigned int)&__init_begin) >> 10);
+ free_initmem_default(-1);
}
void pgd_init(unsigned long *p)
@@ -130,20 +121,35 @@ void pgd_init(unsigned long *p)
for (i = 0; i < PTRS_PER_PGD; i++)
p[i] = __pa(invalid_pte_table);
+
+ flush_tlb_all();
+ local_icache_inv_all(NULL);
}
-void __init pre_mmu_init(void)
+void __init mmu_init(unsigned long min_pfn, unsigned long max_pfn)
{
- /*
- * Setup page-table and enable TLB-hardrefill
- */
+ int i;
+
+ for (i = 0; i < USER_PTRS_PER_PGD; i++)
+ swapper_pg_dir[i].pgd = __pa(invalid_pte_table);
+
+ for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++)
+ swapper_pg_dir[i].pgd =
+ __pa(kernel_pte_tables + (PTRS_PER_PTE * (i - USER_PTRS_PER_PGD)));
+
+ for (i = 0; i < PTRS_KERN_TABLE; i++)
+ set_pte(&kernel_pte_tables[i], __pte(_PAGE_GLOBAL));
+
+ for (i = min_pfn; i < max_pfn; i++)
+ set_pte(&kernel_pte_tables[i - PFN_DOWN(va_pa_offset)], pfn_pte(i, PAGE_KERNEL));
+
flush_tlb_all();
- pgd_init((unsigned long *)swapper_pg_dir);
- TLBMISS_HANDLER_SETUP_PGD(swapper_pg_dir);
- TLBMISS_HANDLER_SETUP_PGD_KERNEL(swapper_pg_dir);
+ local_icache_inv_all(NULL);
/* Setup page mask to 4k */
write_mmu_pagemask(0);
+
+ setup_pgd(swapper_pg_dir, 0);
}
void __init fixrange_init(unsigned long start, unsigned long end,
diff --git a/arch/csky/mm/tlb.c b/arch/csky/mm/tlb.c
index ed1512381112..9234c5e5ceaf 100644
--- a/arch/csky/mm/tlb.c
+++ b/arch/csky/mm/tlb.c
@@ -24,7 +24,13 @@ void flush_tlb_all(void)
void flush_tlb_mm(struct mm_struct *mm)
{
#ifdef CONFIG_CPU_HAS_TLBI
- asm volatile("tlbi.asids %0"::"r"(cpu_asid(mm)));
+ sync_is();
+ asm volatile(
+ "tlbi.asids %0 \n"
+ "sync.i \n"
+ :
+ : "r" (cpu_asid(mm))
+ : "memory");
#else
tlb_invalid_all();
#endif
@@ -53,11 +59,17 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
end &= TLB_ENTRY_SIZE_MASK;
#ifdef CONFIG_CPU_HAS_TLBI
+ sync_is();
while (start < end) {
- asm volatile("tlbi.vas %0"::"r"(start | newpid));
+ asm volatile(
+ "tlbi.vas %0 \n"
+ :
+ : "r" (start | newpid)
+ : "memory");
+
start += 2*PAGE_SIZE;
}
- sync_is();
+ asm volatile("sync.i\n");
#else
{
unsigned long flags, oldpid;
@@ -87,11 +99,17 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
end &= TLB_ENTRY_SIZE_MASK;
#ifdef CONFIG_CPU_HAS_TLBI
+ sync_is();
while (start < end) {
- asm volatile("tlbi.vaas %0"::"r"(start));
+ asm volatile(
+ "tlbi.vaas %0 \n"
+ :
+ : "r" (start)
+ : "memory");
+
start += 2*PAGE_SIZE;
}
- sync_is();
+ asm volatile("sync.i\n");
#else
{
unsigned long flags, oldpid;
@@ -121,8 +139,13 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
addr &= TLB_ENTRY_SIZE_MASK;
#ifdef CONFIG_CPU_HAS_TLBI
- asm volatile("tlbi.vas %0"::"r"(addr | newpid));
sync_is();
+ asm volatile(
+ "tlbi.vas %0 \n"
+ "sync.i \n"
+ :
+ : "r" (addr | newpid)
+ : "memory");
#else
{
int oldpid, idx;
@@ -147,8 +170,13 @@ void flush_tlb_one(unsigned long addr)
addr &= TLB_ENTRY_SIZE_MASK;
#ifdef CONFIG_CPU_HAS_TLBI
- asm volatile("tlbi.vaas %0"::"r"(addr));
sync_is();
+ asm volatile(
+ "tlbi.vaas %0 \n"
+ "sync.i \n"
+ :
+ : "r" (addr)
+ : "memory");
#else
{
int oldpid, idx;
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index bc1364db58fe..46b1342ce515 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -112,7 +112,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
childregs = (struct pt_regs *) (THREAD_SIZE + task_stack_page(p)) - 1;
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(childregs, 0, sizeof(struct pt_regs));
childregs->retpc = (unsigned long) ret_from_kernel_thread;
childregs->er4 = topstk; /* arg */
diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c
index 6a980cba7b29..c61165c99ae0 100644
--- a/arch/hexagon/kernel/process.c
+++ b/arch/hexagon/kernel/process.c
@@ -73,7 +73,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
sizeof(*ss));
ss->lr = (unsigned long)ret_from_fork;
p->thread.switch_sp = ss;
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(childregs, 0, sizeof(struct pt_regs));
/* r24 <- fn, r25 <- arg */
ss->r24 = usp;
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 4ebbfa076a26..7e1a1525e202 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -338,7 +338,7 @@ copy_thread(unsigned long clone_flags, unsigned long user_stack_base,
ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
if (unlikely(!user_stack_base)) {
/* fork_idle() called us */
return 0;
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index 08359a6e058f..da83cc83e791 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -157,7 +157,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
*/
p->thread.fs = get_fs().seg;
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
/* kernel thread */
memset(frame, 0, sizeof(struct fork_frame));
frame->regs.sr = PS_S;
diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c
index 657c2beb665e..62aa237180b6 100644
--- a/arch/microblaze/kernel/process.c
+++ b/arch/microblaze/kernel/process.c
@@ -59,7 +59,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
struct pt_regs *childregs = task_pt_regs(p);
struct thread_info *ti = task_thread_info(p);
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
/* if we're creating a new kernel thread then just zeroing all
* the registers. That's OK for a brand new thread.*/
memset(childregs, 0, sizeof(struct pt_regs));
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index af4c862ec5ff..7efa0d1a4c2b 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -120,7 +120,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
/* Put the stack after the struct pt_regs. */
childksp = (unsigned long) childregs;
p->thread.cp0_status = (read_c0_status() & ~(ST0_CU2|ST0_CU1)) | ST0_KERNEL_CUMASK;
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
/* kernel thread */
unsigned long status = p->thread.cp0_status;
memset(childregs, 0, sizeof(struct pt_regs));
diff --git a/arch/nds32/kernel/process.c b/arch/nds32/kernel/process.c
index e01ad5d17224..c1327e552ec6 100644
--- a/arch/nds32/kernel/process.c
+++ b/arch/nds32/kernel/process.c
@@ -156,7 +156,7 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context));
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(childregs, 0, sizeof(struct pt_regs));
/* kernel thread fn */
p->thread.cpu_context.r6 = stack_start;
diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c
index 50b4eb19a6cc..c5f916ca6845 100644
--- a/arch/nios2/kernel/process.c
+++ b/arch/nios2/kernel/process.c
@@ -109,7 +109,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
struct switch_stack *childstack =
((struct switch_stack *)childregs) - 1;
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(childstack, 0,
sizeof(struct switch_stack) + sizeof(struct pt_regs));
diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c
index 181448f74316..eb62429681fc 100644
--- a/arch/openrisc/kernel/process.c
+++ b/arch/openrisc/kernel/process.c
@@ -174,7 +174,7 @@ copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
sp -= sizeof(struct pt_regs);
kregs = (struct pt_regs *)sp;
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(kregs, 0, sizeof(struct pt_regs));
kregs->gpr[20] = usp; /* fn, kernel thread */
kregs->gpr[22] = arg;
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index fda1c1a6a444..b144fbe29bc1 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -200,7 +200,7 @@ copy_thread(unsigned long clone_flags, unsigned long usp,
extern void * const ret_from_kernel_thread;
extern void * const child_return;
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
/* kernel thread */
memset(cregs, 0, sizeof(struct pt_regs));
if (!usp) /* idle thread */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 924d023dad0a..3231c2df9e26 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1670,7 +1670,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
/* Copy registers */
sp -= sizeof(struct pt_regs);
childregs = (struct pt_regs *) sp;
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
/* kernel thread */
memset(childregs, 0, sizeof(struct pt_regs));
childregs->gpr[1] = sp + sizeof(struct pt_regs);
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index a998babc1237..85d626b8ce5e 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -314,6 +314,7 @@ endchoice
# Common NUMA Features
config NUMA
bool "NUMA Memory Allocation and Scheduler Support"
+ depends on SMP
select GENERIC_ARCH_NUMA
select OF_NUMA
select ARCH_SUPPORTS_NUMA_BALANCING
diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig
index 8c3d1e451703..6c0625aa96c7 100644
--- a/arch/riscv/configs/defconfig
+++ b/arch/riscv/configs/defconfig
@@ -17,6 +17,7 @@ CONFIG_BPF_SYSCALL=y
CONFIG_SOC_SIFIVE=y
CONFIG_SOC_VIRT=y
CONFIG_SMP=y
+CONFIG_HOTPLUG_CPU=y
CONFIG_JUMP_LABEL=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
diff --git a/arch/riscv/configs/rv32_defconfig b/arch/riscv/configs/rv32_defconfig
index 2c2cda6cc1c5..8dd02b842fef 100644
--- a/arch/riscv/configs/rv32_defconfig
+++ b/arch/riscv/configs/rv32_defconfig
@@ -18,6 +18,7 @@ CONFIG_SOC_SIFIVE=y
CONFIG_SOC_VIRT=y
CONFIG_ARCH_RV32I=y
CONFIG_SMP=y
+CONFIG_HOTPLUG_CPU=y
CONFIG_JUMP_LABEL=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index 19f4688f2f36..6f728e731bed 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -124,7 +124,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
struct pt_regs *childregs = task_pt_regs(p);
/* p->thread holds context to be restored by __switch_to() */
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
/* Kernel thread */
memset(childregs, 0, sizeof(struct pt_regs));
childregs->gp = gp_in_global;
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 9dfb8b2dffd6..067583ab1bd7 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -108,34 +108,17 @@ void __init mem_init(void)
void __init setup_bootmem(void)
{
- phys_addr_t mem_start = 0;
- phys_addr_t start, dram_end, end = 0;
phys_addr_t vmlinux_end = __pa_symbol(&_end);
phys_addr_t vmlinux_start = __pa_symbol(&_start);
+ phys_addr_t dram_end = memblock_end_of_DRAM();
phys_addr_t max_mapped_addr = __pa(~(ulong)0);
- u64 i;
-
- /* Find the memory region containing the kernel */
- for_each_mem_range(i, &start, &end) {
- phys_addr_t size = end - start;
- if (!mem_start)
- mem_start = start;
- if (start <= vmlinux_start && vmlinux_end <= end)
- BUG_ON(size == 0);
- }
- /*
- * The maximal physical memory size is -PAGE_OFFSET.
- * Make sure that any memory beyond mem_start + (-PAGE_OFFSET) is removed
- * as it is unusable by kernel.
- */
+ /* The maximal physical memory size is -PAGE_OFFSET. */
memblock_enforce_memory_limit(-PAGE_OFFSET);
/* Reserve from the start of the kernel to the end of the kernel */
memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
- dram_end = memblock_end_of_DRAM();
-
/*
* memblock allocator is not aware of the fact that last 4K bytes of
* the addressable memory can not be mapped because of IS_ERR_VALUE
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 367bd000f6d1..e20bed1ed34a 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -130,7 +130,7 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp,
frame->sf.gprs[9] = (unsigned long)frame;
/* Store access registers to kernel stack of new process. */
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
/* kernel thread */
memset(&frame->childregs, 0, sizeof(struct pt_regs));
frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT |
diff --git a/arch/sh/boards/mach-landisk/gio.c b/arch/sh/boards/mach-landisk/gio.c
index 1c0da99dfc60..ff2200fec29a 100644
--- a/arch/sh/boards/mach-landisk/gio.c
+++ b/arch/sh/boards/mach-landisk/gio.c
@@ -27,11 +27,10 @@ static int openCnt;
static int gio_open(struct inode *inode, struct file *filp)
{
- int minor;
+ int minor = iminor(inode);
int ret = -ENOENT;
preempt_disable();
- minor = MINOR(inode->i_rdev);
if (minor < DEVCOUNT) {
if (openCnt > 0) {
ret = -EALREADY;
@@ -46,9 +45,8 @@ static int gio_open(struct inode *inode, struct file *filp)
static int gio_close(struct inode *inode, struct file *filp)
{
- int minor;
+ int minor = iminor(inode);
- minor = MINOR(inode->i_rdev);
if (minor < DEVCOUNT) {
openCnt--;
}
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index 80a5d1c66a51..1aa508eb0823 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -114,7 +114,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
childregs = task_pt_regs(p);
p->thread.sp = (unsigned long) childregs;
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(childregs, 0, sizeof(struct pt_regs));
p->thread.pc = (unsigned long) ret_from_kernel_thread;
childregs->regs[4] = arg;
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c
index 7649b14d69f8..b91e88058e0c 100644
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -309,7 +309,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
ti->ksp = (unsigned long) new_stack;
p->thread.kregs = childregs;
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
extern int nwindows;
unsigned long psr;
memset(new_stack, 0, STACKFRAME_SZ + TRACEREG_SZ);
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 6f8c7822fc06..7afd0a859a78 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -597,7 +597,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
sizeof(struct sparc_stackf));
t->fpsaved[0] = 0;
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(child_trap_frame, 0, child_stack_sz);
__thread_flag_byte_ptr(t)[TI_FLAG_BYTE_CWP] =
(current_pt_regs()->tstate + 1) & TSTATE_CWP;
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 81d508daf67c..c5011064b5dd 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -157,7 +157,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct * p, unsigned long tls)
{
void (*handler)(void);
- int kthread = current->flags & PF_KTHREAD;
+ int kthread = current->flags & (PF_KTHREAD | PF_IO_WORKER);
int ret = 0;
p->thread = (struct thread_struct) INIT_THREAD;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 145a7ac0c19a..9c214d7085a4 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -161,7 +161,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
#endif
/* Kernel thread ? */
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(childregs, 0, sizeof(struct pt_regs));
kthread_frame_init(frame, sp, arg);
return 0;
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index 397a7de56377..9534ef515d74 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -217,7 +217,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn,
p->thread.sp = (unsigned long)childregs;
- if (!(p->flags & PF_KTHREAD)) {
+ if (!(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
struct pt_regs *regs = current_pt_regs();
unsigned long usp = usp_thread_fn ?
usp_thread_fn : regs->areg[1];
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index b398dde53af9..ec482e6641ff 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -125,6 +125,8 @@
#include <linux/delay.h>
#include <linux/backing-dev.h>
+#include <trace/events/block.h>
+
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
@@ -5621,7 +5623,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
spin_unlock_irq(&bfqd->lock);
- blk_mq_sched_request_inserted(rq);
+ trace_block_rq_insert(rq);
spin_lock_irq(&bfqd->lock);
bfqq = bfq_init_rq(rq);
diff --git a/block/blk-core.c b/block/blk-core.c
index 5e752840b41a..fc60ff208497 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -59,6 +59,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);
DEFINE_IDA(blk_queue_ida);
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index e8327c50d7c9..c176b7af56a7 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -80,6 +80,7 @@ static struct blk_crypto_keyslot {
static struct blk_keyslot_manager blk_crypto_ksm;
static struct workqueue_struct *blk_crypto_wq;
static mempool_t *blk_crypto_bounce_page_pool;
+static struct bio_set crypto_bio_split;
/*
* This is the key we set when evicting a keyslot. This *should* be the all 0's
@@ -224,7 +225,8 @@ static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr)
if (num_sectors < bio_sectors(bio)) {
struct bio *split_bio;
- split_bio = bio_split(bio, num_sectors, GFP_NOIO, NULL);
+ split_bio = bio_split(bio, num_sectors, GFP_NOIO,
+ &crypto_bio_split);
if (!split_bio) {
bio->bi_status = BLK_STS_RESOURCE;
return false;
@@ -538,9 +540,13 @@ static int blk_crypto_fallback_init(void)
prandom_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE);
- err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots);
+ err = bioset_init(&crypto_bio_split, 64, 0, 0);
if (err)
goto out;
+
+ err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots);
+ if (err)
+ goto fail_free_bioset;
err = -ENOMEM;
blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops;
@@ -591,6 +597,8 @@ fail_free_wq:
destroy_workqueue(blk_crypto_wq);
fail_free_ksm:
blk_ksm_destroy(&blk_crypto_ksm);
+fail_free_bioset:
+ bioset_exit(&crypto_bio_split);
out:
return err;
}
diff --git a/block/blk-map.c b/block/blk-map.c
index 21630dccac62..369e204d14d0 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -150,9 +150,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
bmd->is_our_pages = !map_data;
bmd->is_null_mapped = (map_data && map_data->null_mapped);
- nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
- if (nr_pages > BIO_MAX_PAGES)
- nr_pages = BIO_MAX_PAGES;
+ nr_pages = bio_max_segs(DIV_ROUND_UP(offset + len, PAGE_SIZE));
ret = -ENOMEM;
bio = bio_kmalloc(gfp_mask, nr_pages);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index deff4e826e23..ddb65e9e6fd9 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -384,12 +384,6 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
-void blk_mq_sched_request_inserted(struct request *rq)
-{
- trace_block_rq_insert(rq);
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
-
static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
bool has_sched,
struct request *rq)
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 0476360f05f1..5b18ab915c65 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -7,7 +7,6 @@
void blk_mq_sched_assign_ioc(struct request *rq);
-void blk_mq_sched_request_inserted(struct request *rq);
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **merged_request);
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
diff --git a/block/blk-pm.h b/block/blk-pm.h
index a2283cc9f716..8a5a0d4b357f 100644
--- a/block/blk-pm.h
+++ b/block/blk-pm.h
@@ -21,31 +21,6 @@ static inline void blk_pm_mark_last_busy(struct request *rq)
if (rq->q->dev && !(rq->rq_flags & RQF_PM))
pm_runtime_mark_last_busy(rq->q->dev);
}
-
-static inline void blk_pm_requeue_request(struct request *rq)
-{
- lockdep_assert_held(&rq->q->queue_lock);
-
- if (rq->q->dev && !(rq->rq_flags & RQF_PM))
- rq->q->nr_pending--;
-}
-
-static inline void blk_pm_add_request(struct request_queue *q,
- struct request *rq)
-{
- lockdep_assert_held(&q->queue_lock);
-
- if (q->dev && !(rq->rq_flags & RQF_PM))
- q->nr_pending++;
-}
-
-static inline void blk_pm_put_request(struct request *rq)
-{
- lockdep_assert_held(&rq->q->queue_lock);
-
- if (rq->q->dev && !(rq->rq_flags & RQF_PM))
- --rq->q->nr_pending;
-}
#else
static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q)
{
@@ -55,19 +30,6 @@ static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q)
static inline void blk_pm_mark_last_busy(struct request *rq)
{
}
-
-static inline void blk_pm_requeue_request(struct request *rq)
-{
-}
-
-static inline void blk_pm_add_request(struct request_queue *q,
- struct request *rq)
-{
-}
-
-static inline void blk_pm_put_request(struct request *rq)
-{
-}
#endif
#endif /* _BLOCK_BLK_PM_H_ */
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 7dd8be314ac6..b4aa2f37fab6 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -504,6 +504,14 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
}
EXPORT_SYMBOL(blk_queue_io_opt);
+static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lbs)
+{
+ sectors = round_down(sectors, lbs >> SECTOR_SHIFT);
+ if (sectors < PAGE_SIZE >> SECTOR_SHIFT)
+ sectors = PAGE_SIZE >> SECTOR_SHIFT;
+ return sectors;
+}
+
/**
* blk_stack_limits - adjust queue_limits for stacked devices
* @t: the stacking driver limits (top device)
@@ -630,6 +638,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
ret = -1;
}
+ t->max_sectors = blk_round_down_sectors(t->max_sectors, t->logical_block_size);
+ t->max_hw_sectors = blk_round_down_sectors(t->max_hw_sectors, t->logical_block_size);
+ t->max_dev_sectors = blk_round_down_sectors(t->max_dev_sectors, t->logical_block_size);
+
/* Discard alignment and granularity */
if (b->discard_granularity) {
alignment = queue_limit_discard_alignment(b, start);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index ae39c7f3d83d..0f4f0c8a7825 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -434,10 +434,13 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
if (ret < 0)
return ret;
- if (poll_on)
+ if (poll_on) {
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
- else
+ } else {
+ blk_mq_freeze_queue(q);
blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
+ blk_mq_unfreeze_queue(q);
+ }
return ret;
}
diff --git a/block/bounce.c b/block/bounce.c
index fc55314aa426..87983a35079c 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -214,8 +214,7 @@ static void bounce_end_io_read_isa(struct bio *bio)
__bounce_end_io_read(bio, &isa_page_pool);
}
-static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
- struct bio_set *bs)
+static struct bio *bounce_clone_bio(struct bio *bio_src)
{
struct bvec_iter iter;
struct bio_vec bv;
@@ -242,10 +241,12 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
* asking for trouble and would force extra work on
* __bio_clone_fast() anyways.
*/
-
- bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
- if (!bio)
- return NULL;
+ if (bio_is_passthrough(bio_src))
+ bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL,
+ bio_segments(bio_src));
+ else
+ bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src),
+ &bounce_bio_set);
bio->bi_bdev = bio_src->bi_bdev;
if (bio_flagged(bio_src, BIO_REMAPPED))
bio_set_flag(bio, BIO_REMAPPED);
@@ -269,11 +270,11 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
break;
}
- if (bio_crypt_clone(bio, bio_src, gfp_mask) < 0)
+ if (bio_crypt_clone(bio, bio_src, GFP_NOIO) < 0)
goto err_put;
if (bio_integrity(bio_src) &&
- bio_integrity_clone(bio, bio_src, gfp_mask) < 0)
+ bio_integrity_clone(bio, bio_src, GFP_NOIO) < 0)
goto err_put;
bio_clone_blkg_association(bio, bio_src);
@@ -296,7 +297,6 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
unsigned i = 0;
bool bounce = false;
int sectors = 0;
- bool passthrough = bio_is_passthrough(*bio_orig);
bio_for_each_segment(from, *bio_orig, iter) {
if (i++ < BIO_MAX_PAGES)
@@ -307,14 +307,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
if (!bounce)
return;
- if (!passthrough && sectors < bio_sectors(*bio_orig)) {
+ if (!bio_is_passthrough(*bio_orig) &&
+ sectors < bio_sectors(*bio_orig)) {
bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
bio_chain(bio, *bio_orig);
submit_bio_noacct(*bio_orig);
*bio_orig = bio;
}
- bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL :
- &bounce_bio_set);
+ bio = bounce_clone_bio(*bio_orig);
/*
* Bvec table can't be updated by bio_for_each_segment_all(),
diff --git a/block/genhd.c b/block/genhd.c
index 36ff45bbaaaf..fcc530164b5a 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -74,7 +74,7 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size)
return false;
pr_info("%s: detected capacity change from %lld to %lld\n",
- disk->disk_name, size, capacity);
+ disk->disk_name, capacity, size);
/*
* Historically we did not send a uevent for changes to/from an empty
@@ -476,7 +476,7 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action)
struct disk_part_iter piter;
struct block_device *part;
- disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
+ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY_PART0);
while ((part = disk_part_iter_next(&piter)))
kobject_uevent(bdev_kobj(part), action);
disk_part_iter_exit(&piter);
diff --git a/block/ioctl.c b/block/ioctl.c
index d61d652078f4..ff241e663c01 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -81,20 +81,27 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
}
#endif
-static int blkdev_reread_part(struct block_device *bdev)
+static int blkdev_reread_part(struct block_device *bdev, fmode_t mode)
{
- int ret;
+ struct block_device *tmp;
if (!disk_part_scan_enabled(bdev->bd_disk) || bdev_is_partition(bdev))
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
- mutex_lock(&bdev->bd_mutex);
- ret = bdev_disk_changed(bdev, false);
- mutex_unlock(&bdev->bd_mutex);
+ /*
+ * Reopen the device to revalidate the driver state and force a
+ * partition rescan.
+ */
+ mode &= ~FMODE_EXCL;
+ set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
- return ret;
+ tmp = blkdev_get_by_dev(bdev->bd_dev, mode, NULL);
+ if (IS_ERR(tmp))
+ return PTR_ERR(tmp);
+ blkdev_put(tmp, mode);
+ return 0;
}
static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
@@ -498,7 +505,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE;
return 0;
case BLKRRPART:
- return blkdev_reread_part(bdev);
+ return blkdev_reread_part(bdev, mode);
case BLKTRACESTART:
case BLKTRACESTOP:
case BLKTRACETEARDOWN:
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index c25c41d0d061..33d34d69cade 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -13,6 +13,8 @@
#include <linux/module.h>
#include <linux/sbitmap.h>
+#include <trace/events/block.h>
+
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
@@ -353,19 +355,9 @@ static void kyber_timer_fn(struct timer_list *t)
}
}
-static unsigned int kyber_sched_tags_shift(struct request_queue *q)
-{
- /*
- * All of the hardware queues have the same depth, so we can just grab
- * the shift of the first one.
- */
- return q->queue_hw_ctx[0]->sched_tags->bitmap_tags->sb.shift;
-}
-
static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
struct kyber_queue_data *kqd;
- unsigned int shift;
int ret = -ENOMEM;
int i;
@@ -400,9 +392,6 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
kqd->latency_targets[i] = kyber_latency_targets[i];
}
- shift = kyber_sched_tags_shift(q);
- kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
-
return kqd;
err_buckets:
@@ -458,9 +447,19 @@ static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
INIT_LIST_HEAD(&kcq->rq_list[i]);
}
-static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
{
struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
+ struct blk_mq_tags *tags = hctx->sched_tags;
+ unsigned int shift = tags->bitmap_tags->sb.shift;
+
+ kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
+
+ sbitmap_queue_min_shallow_depth(tags->bitmap_tags, kqd->async_depth);
+}
+
+static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
struct kyber_hctx_data *khd;
int i;
@@ -502,8 +501,7 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
khd->batching = 0;
hctx->sched_data = khd;
- sbitmap_queue_min_shallow_depth(hctx->sched_tags->bitmap_tags,
- kqd->async_depth);
+ kyber_depth_updated(hctx);
return 0;
@@ -602,7 +600,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
list_move_tail(&rq->queuelist, head);
sbitmap_set_bit(&khd->kcq_map[sched_domain],
rq->mq_ctx->index_hw[hctx->type]);
- blk_mq_sched_request_inserted(rq);
+ trace_block_rq_insert(rq);
spin_unlock(&kcq->lock);
}
}
@@ -1022,6 +1020,7 @@ static struct elevator_type kyber_sched = {
.completed_request = kyber_completed_request,
.dispatch_request = kyber_dispatch_request,
.has_work = kyber_has_work,
+ .depth_updated = kyber_depth_updated,
},
#ifdef CONFIG_BLK_DEBUG_FS
.queue_debugfs_attrs = kyber_queue_debugfs_attrs,
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index b57470e154c8..f3631a287466 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -18,6 +18,8 @@
#include <linux/rbtree.h>
#include <linux/sbitmap.h>
+#include <trace/events/block.h>
+
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
@@ -496,7 +498,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
if (blk_mq_sched_try_insert_merge(q, rq))
return;
- blk_mq_sched_request_inserted(rq);
+ trace_block_rq_insert(rq);
if (at_head || blk_rq_is_passthrough(rq)) {
if (at_head)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 578fc034db3f..a370cde3ddd4 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -663,7 +663,7 @@ static inline int is_loop_device(struct file *file)
{
struct inode *i = file->f_mapping->host;
- return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
+ return i && S_ISBLK(i->i_mode) && imajor(i) == LOOP_MAJOR;
}
static int loop_validate_file(struct file *file, struct block_device *bdev)
@@ -1212,6 +1212,9 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
goto out_unlock;
}
+ if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags))
+ blk_queue_write_cache(lo->lo_queue, false, false);
+
/* freeze request queue during the transition */
blk_mq_freeze_queue(lo->lo_queue);
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 8b9622eb0a21..4ff71b579cfc 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -78,8 +78,7 @@ struct link_dead_args {
#define NBD_RT_HAS_PID_FILE 3
#define NBD_RT_HAS_CONFIG_REF 4
#define NBD_RT_BOUND 5
-#define NBD_RT_DESTROY_ON_DISCONNECT 6
-#define NBD_RT_DISCONNECT_ON_CLOSE 7
+#define NBD_RT_DISCONNECT_ON_CLOSE 6
#define NBD_DESTROY_ON_DISCONNECT 0
#define NBD_DISCONNECT_REQUESTED 1
@@ -1904,12 +1903,21 @@ again:
if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
- set_bit(NBD_RT_DESTROY_ON_DISCONNECT,
- &config->runtime_flags);
- set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
- put_dev = true;
+ /*
+ * We have 1 ref to keep the device around, and then 1
+ * ref for our current operation here, which will be
+ * inherited by the config. If we already have
+ * DESTROY_ON_DISCONNECT set then we know we don't have
+ * that extra ref already held so we don't need the
+ * put_dev.
+ */
+ if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
+ &nbd->flags))
+ put_dev = true;
} else {
- clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
+ if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
+ &nbd->flags))
+ refcount_inc(&nbd->refs);
}
if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
@@ -2080,15 +2088,13 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
- if (!test_and_set_bit(NBD_RT_DESTROY_ON_DISCONNECT,
- &config->runtime_flags))
+ if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
+ &nbd->flags))
put_dev = true;
- set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
} else {
- if (test_and_clear_bit(NBD_RT_DESTROY_ON_DISCONNECT,
- &config->runtime_flags))
+ if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
+ &nbd->flags))
refcount_inc(&nbd->refs);
- clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
}
if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index da16121140ca..1cdf09ff67b6 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -1326,9 +1326,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
pages[i]->page,
seg[i].nsec << 9,
seg[i].offset) == 0)) {
-
- int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES);
- bio = bio_alloc(GFP_KERNEL, nr_iovecs);
+ bio = bio_alloc(GFP_KERNEL, bio_max_segs(nseg - i));
if (unlikely(bio == NULL))
goto fail_put_bio;
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index cadbd0a1a1ef..5fa6ae9dbc8b 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -480,7 +480,7 @@ static void dax_free_inode(struct inode *inode)
kfree(dax_dev->host);
dax_dev->host = NULL;
if (inode->i_rdev)
- ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev));
+ ida_simple_remove(&dax_minor_ida, iminor(inode));
kmem_cache_free(dax_cache, dax_dev);
}
diff --git a/drivers/i2c/busses/i2c-brcmstb.c b/drivers/i2c/busses/i2c-brcmstb.c
index d4e0a0f6732a..ba766d24219e 100644
--- a/drivers/i2c/busses/i2c-brcmstb.c
+++ b/drivers/i2c/busses/i2c-brcmstb.c
@@ -316,7 +316,7 @@ static int brcmstb_send_i2c_cmd(struct brcmstb_i2c_dev *dev,
goto cmd_out;
}
- if ((CMD_RD || CMD_WR) &&
+ if ((cmd == CMD_RD || cmd == CMD_WR) &&
bsc_readl(dev, iic_enable) & BSC_IIC_EN_NOACK_MASK) {
rc = -EREMOTEIO;
dev_dbg(dev->device, "controller received NOACK intr for %s\n",
diff --git a/drivers/i2c/busses/i2c-designware-core.h b/drivers/i2c/busses/i2c-designware-core.h
index 85307cfa7109..5392b82f68a4 100644
--- a/drivers/i2c/busses/i2c-designware-core.h
+++ b/drivers/i2c/busses/i2c-designware-core.h
@@ -38,6 +38,8 @@
#define DW_IC_CON_TX_EMPTY_CTRL BIT(8)
#define DW_IC_CON_RX_FIFO_FULL_HLD_CTRL BIT(9)
+#define DW_IC_DATA_CMD_DAT GENMASK(7, 0)
+
/*
* Registers offset
*/
diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c
index d6425ad6e6a3..dd27b9dbe931 100644
--- a/drivers/i2c/busses/i2c-designware-master.c
+++ b/drivers/i2c/busses/i2c-designware-master.c
@@ -432,7 +432,7 @@ i2c_dw_read(struct dw_i2c_dev *dev)
regmap_read(dev->map, DW_IC_DATA_CMD, &tmp);
/* Ensure length byte is a valid value */
if (flags & I2C_M_RECV_LEN &&
- tmp <= I2C_SMBUS_BLOCK_MAX && tmp > 0) {
+ (tmp & DW_IC_DATA_CMD_DAT) <= I2C_SMBUS_BLOCK_MAX && tmp > 0) {
len = i2c_dw_recv_len(dev, tmp);
}
*buf++ = tmp;
diff --git a/drivers/i2c/busses/i2c-exynos5.c b/drivers/i2c/busses/i2c-exynos5.c
index 20a9881a0d6c..5ac30d95650c 100644
--- a/drivers/i2c/busses/i2c-exynos5.c
+++ b/drivers/i2c/busses/i2c-exynos5.c
@@ -606,6 +606,7 @@ static void exynos5_i2c_message_start(struct exynos5_i2c *i2c, int stop)
u32 i2c_ctl;
u32 int_en = 0;
u32 i2c_auto_conf = 0;
+ u32 i2c_addr = 0;
u32 fifo_ctl;
unsigned long flags;
unsigned short trig_lvl;
@@ -640,7 +641,12 @@ static void exynos5_i2c_message_start(struct exynos5_i2c *i2c, int stop)
int_en |= HSI2C_INT_TX_ALMOSTEMPTY_EN;
}
- writel(HSI2C_SLV_ADDR_MAS(i2c->msg->addr), i2c->regs + HSI2C_ADDR);
+ i2c_addr = HSI2C_SLV_ADDR_MAS(i2c->msg->addr);
+
+ if (i2c->op_clock >= I2C_MAX_FAST_MODE_PLUS_FREQ)
+ i2c_addr |= HSI2C_MASTER_ID(MASTER_ID(i2c->adap.nr));
+
+ writel(i2c_addr, i2c->regs + HSI2C_ADDR);
writel(fifo_ctl, i2c->regs + HSI2C_FIFO_CTL);
writel(i2c_ctl, i2c->regs + HSI2C_CTL);
diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c
index c3f584795911..214b4c913a13 100644
--- a/drivers/i2c/busses/i2c-qcom-geni.c
+++ b/drivers/i2c/busses/i2c-qcom-geni.c
@@ -375,32 +375,6 @@ static void geni_i2c_tx_msg_cleanup(struct geni_i2c_dev *gi2c,
}
}
-static void geni_i2c_stop_xfer(struct geni_i2c_dev *gi2c)
-{
- int ret;
- u32 geni_status;
- struct i2c_msg *cur;
-
- /* Resume device, as runtime suspend can happen anytime during transfer */
- ret = pm_runtime_get_sync(gi2c->se.dev);
- if (ret < 0) {
- dev_err(gi2c->se.dev, "Failed to resume device: %d\n", ret);
- return;
- }
-
- geni_status = readl_relaxed(gi2c->se.base + SE_GENI_STATUS);
- if (geni_status & M_GENI_CMD_ACTIVE) {
- cur = gi2c->cur;
- geni_i2c_abort_xfer(gi2c);
- if (cur->flags & I2C_M_RD)
- geni_i2c_rx_msg_cleanup(gi2c, cur);
- else
- geni_i2c_tx_msg_cleanup(gi2c, cur);
- }
-
- pm_runtime_put_sync_suspend(gi2c->se.dev);
-}
-
static int geni_i2c_rx_one_msg(struct geni_i2c_dev *gi2c, struct i2c_msg *msg,
u32 m_param)
{
@@ -676,13 +650,6 @@ static int geni_i2c_remove(struct platform_device *pdev)
return 0;
}
-static void geni_i2c_shutdown(struct platform_device *pdev)
-{
- struct geni_i2c_dev *gi2c = platform_get_drvdata(pdev);
-
- geni_i2c_stop_xfer(gi2c);
-}
-
static int __maybe_unused geni_i2c_runtime_suspend(struct device *dev)
{
int ret;
@@ -747,7 +714,6 @@ MODULE_DEVICE_TABLE(of, geni_i2c_dt_match);
static struct platform_driver geni_i2c_driver = {
.probe = geni_i2c_probe,
.remove = geni_i2c_remove,
- .shutdown = geni_i2c_shutdown,
.driver = {
.name = "geni_i2c",
.pm = &geni_i2c_pm_ops,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 4312007d2d34..2d3cda0acacb 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -341,8 +341,8 @@ static void do_region(int op, int op_flags, unsigned region,
num_bvecs = 1;
break;
default:
- num_bvecs = min_t(int, BIO_MAX_PAGES,
- dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
+ num_bvecs = bio_max_segs(dm_sector_div_up(remaining,
+ (PAGE_SIZE >> SECTOR_SHIFT)));
}
bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, &io->client->bios);
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index e3d35c6c9f71..57882654ffee 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -264,15 +264,14 @@ static int write_inline_data(struct log_writes_c *lc, void *entry,
size_t entrylen, void *data, size_t datalen,
sector_t sector)
{
- int num_pages, bio_pages, pg_datalen, pg_sectorlen, i;
+ int bio_pages, pg_datalen, pg_sectorlen, i;
struct page *page;
struct bio *bio;
size_t ret;
void *ptr;
while (datalen) {
- num_pages = ALIGN(datalen, PAGE_SIZE) >> PAGE_SHIFT;
- bio_pages = min(num_pages, BIO_MAX_PAGES);
+ bio_pages = bio_max_segs(DIV_ROUND_UP(datalen, PAGE_SIZE));
atomic_inc(&lc->io_blocks);
@@ -364,7 +363,7 @@ static int log_one_block(struct log_writes_c *lc,
goto out;
atomic_inc(&lc->io_blocks);
- bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt, BIO_MAX_PAGES));
+ bio = bio_alloc(GFP_KERNEL, bio_max_segs(block->vec_cnt));
if (!bio) {
DMERR("Couldn't alloc log bio");
goto error;
@@ -386,7 +385,8 @@ static int log_one_block(struct log_writes_c *lc,
if (ret != block->vecs[i].bv_len) {
atomic_inc(&lc->io_blocks);
submit_bio(bio);
- bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt - i, BIO_MAX_PAGES));
+ bio = bio_alloc(GFP_KERNEL,
+ bio_max_segs(block->vec_cnt - i));
if (!bio) {
DMERR("Couldn't alloc log bio");
goto error;
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 3d9a5d3ed9cd..9a8b3726a37c 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -185,7 +185,7 @@ static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio,
}
bip = bio_integrity_alloc(bio, GFP_NOIO,
- min_t(unsigned int, req->metadata_sg_cnt, BIO_MAX_PAGES));
+ bio_max_segs(req->metadata_sg_cnt));
if (IS_ERR(bip)) {
pr_err("Unable to allocate bio_integrity_payload\n");
return PTR_ERR(bip);
@@ -225,7 +225,7 @@ static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio,
static void nvmet_bdev_execute_rw(struct nvmet_req *req)
{
- int sg_cnt = req->sg_cnt;
+ unsigned int sg_cnt = req->sg_cnt;
struct bio *bio;
struct scatterlist *sg;
struct blk_plug plug;
@@ -262,7 +262,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
bio = &req->b.inline_bio;
bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
} else {
- bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
+ bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt));
}
bio_set_dev(bio, req->ns->bdev);
bio->bi_iter.bi_sector = sector;
@@ -289,7 +289,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
}
}
- bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
+ bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt));
bio_set_dev(bio, req->ns->bdev);
bio->bi_iter.bi_sector = sector;
bio->bi_opf = op;
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index f50c7b2bf21c..26c587ccd152 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -26,7 +26,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req)
struct nvme_ctrl *pctrl = ctrl->subsys->passthru_ctrl;
u16 status = NVME_SC_SUCCESS;
struct nvme_id_ctrl *id;
- int max_hw_sectors;
+ unsigned int max_hw_sectors;
int page_shift;
id = kzalloc(sizeof(*id), GFP_KERNEL);
@@ -198,7 +198,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq)
bio = &req->p.inline_bio;
bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
} else {
- bio = bio_alloc(GFP_KERNEL, min(req->sg_cnt, BIO_MAX_PAGES));
+ bio = bio_alloc(GFP_KERNEL, bio_max_segs(req->sg_cnt));
bio->bi_end_io = bio_put;
}
bio->bi_opf = req_op(rq);
diff --git a/drivers/rtc/rtc-m41t80.c b/drivers/rtc/rtc-m41t80.c
index cce5b5284658..89128fc29ccc 100644
--- a/drivers/rtc/rtc-m41t80.c
+++ b/drivers/rtc/rtc-m41t80.c
@@ -785,7 +785,7 @@ static long wdt_unlocked_ioctl(struct file *file, unsigned int cmd,
*/
static int wdt_open(struct inode *inode, struct file *file)
{
- if (MINOR(inode->i_rdev) == WATCHDOG_MINOR) {
+ if (iminor(inode) == WATCHDOG_MINOR) {
mutex_lock(&m41t80_rtc_mutex);
if (test_and_set_bit(0, &wdt_is_open)) {
mutex_unlock(&m41t80_rtc_mutex);
@@ -809,7 +809,7 @@ static int wdt_open(struct inode *inode, struct file *file)
*/
static int wdt_release(struct inode *inode, struct file *file)
{
- if (MINOR(inode->i_rdev) == WATCHDOG_MINOR)
+ if (iminor(inode) == WATCHDOG_MINOR)
clear_bit(0, &wdt_is_open);
return 0;
}
diff --git a/drivers/s390/char/vmur.c b/drivers/s390/char/vmur.c
index 1bbf27b98cf6..68f49e2e964c 100644
--- a/drivers/s390/char/vmur.c
+++ b/drivers/s390/char/vmur.c
@@ -681,7 +681,7 @@ static int ur_open(struct inode *inode, struct file *file)
* We treat the minor number as the devno of the ur device
* to find in the driver tree.
*/
- devno = MINOR(file_inode(file)->i_rdev);
+ devno = iminor(file_inode(file));
urd = urdev_get_from_devno(devno);
if (!urd) {
diff --git a/drivers/scsi/aic7xxx/aic79xx.h b/drivers/scsi/aic7xxx/aic79xx.h
index 77e1d6bb59a3..59e9321815c8 100644
--- a/drivers/scsi/aic7xxx/aic79xx.h
+++ b/drivers/scsi/aic7xxx/aic79xx.h
@@ -1175,7 +1175,7 @@ struct ahd_softc {
uint8_t tqinfifonext;
/*
- * Cached verson of the hs_mailbox so we can avoid
+ * Cached version of the hs_mailbox so we can avoid
* pausing the sequencer during mailbox updates.
*/
uint8_t hs_mailbox;
diff --git a/drivers/scsi/aic7xxx/aic7xxx.h b/drivers/scsi/aic7xxx/aic7xxx.h
index 11a09798e6b5..9bc755a0a2d3 100644
--- a/drivers/scsi/aic7xxx/aic7xxx.h
+++ b/drivers/scsi/aic7xxx/aic7xxx.h
@@ -896,8 +896,6 @@ union ahc_bus_softc {
typedef void (*ahc_bus_intr_t)(struct ahc_softc *);
typedef int (*ahc_bus_chip_init_t)(struct ahc_softc *);
-typedef int (*ahc_bus_suspend_t)(struct ahc_softc *);
-typedef int (*ahc_bus_resume_t)(struct ahc_softc *);
typedef void ahc_callback_t (void *);
struct ahc_softc {
diff --git a/drivers/scsi/bnx2fc/Kconfig b/drivers/scsi/bnx2fc/Kconfig
index 3cf7e08df809..ecdc0f0f4f4e 100644
--- a/drivers/scsi/bnx2fc/Kconfig
+++ b/drivers/scsi/bnx2fc/Kconfig
@@ -5,6 +5,7 @@ config SCSI_BNX2X_FCOE
depends on (IPV6 || IPV6=n)
depends on LIBFC
depends on LIBFCOE
+ depends on MMU
select NETDEVICES
select ETHERNET
select NET_VENDOR_BROADCOM
diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c
index fdd446765311..1e6d8f62ea3c 100644
--- a/drivers/scsi/bnx2i/bnx2i_iscsi.c
+++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c
@@ -1171,10 +1171,8 @@ static void bnx2i_cleanup_task(struct iscsi_task *task)
bnx2i_send_cmd_cleanup_req(hba, task->dd_data);
spin_unlock_bh(&conn->session->back_lock);
- spin_unlock_bh(&conn->session->frwd_lock);
wait_for_completion_timeout(&bnx2i_conn->cmd_cleanup_cmpl,
msecs_to_jiffies(ISCSI_CMD_CLEANUP_TIMEOUT));
- spin_lock_bh(&conn->session->frwd_lock);
spin_lock_bh(&conn->session->back_lock);
}
bnx2i_iscsi_unmap_sg_list(task->dd_data);
diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index 337d3aa91945..38369766511c 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -1151,7 +1151,10 @@ static void __enqueue_cmd_and_start_io(struct ctlr_info *h,
{
dial_down_lockup_detection_during_fw_flash(h, c);
atomic_inc(&h->commands_outstanding);
- if (c->device)
+ /*
+ * Check to see if the command is being retried.
+ */
+ if (c->device && !c->retry_pending)
atomic_inc(&c->device->commands_outstanding);
reply_queue = h->reply_map[raw_smp_processor_id()];
@@ -5567,7 +5570,8 @@ static inline void hpsa_cmd_partial_init(struct ctlr_info *h, int index,
}
static int hpsa_ioaccel_submit(struct ctlr_info *h,
- struct CommandList *c, struct scsi_cmnd *cmd)
+ struct CommandList *c, struct scsi_cmnd *cmd,
+ bool retry)
{
struct hpsa_scsi_dev_t *dev = cmd->device->hostdata;
int rc = IO_ACCEL_INELIGIBLE;
@@ -5584,18 +5588,22 @@ static int hpsa_ioaccel_submit(struct ctlr_info *h,
cmd->host_scribble = (unsigned char *) c;
if (dev->offload_enabled) {
- hpsa_cmd_init(h, c->cmdindex, c);
+ hpsa_cmd_init(h, c->cmdindex, c); /* Zeroes out all fields */
c->cmd_type = CMD_SCSI;
c->scsi_cmd = cmd;
c->device = dev;
+ if (retry) /* Resubmit but do not increment device->commands_outstanding. */
+ c->retry_pending = true;
rc = hpsa_scsi_ioaccel_raid_map(h, c);
if (rc < 0) /* scsi_dma_map failed. */
rc = SCSI_MLQUEUE_HOST_BUSY;
} else if (dev->hba_ioaccel_enabled) {
- hpsa_cmd_init(h, c->cmdindex, c);
+ hpsa_cmd_init(h, c->cmdindex, c); /* Zeroes out all fields */
c->cmd_type = CMD_SCSI;
c->scsi_cmd = cmd;
c->device = dev;
+ if (retry) /* Resubmit but do not increment device->commands_outstanding. */
+ c->retry_pending = true;
rc = hpsa_scsi_ioaccel_direct_map(h, c);
if (rc < 0) /* scsi_dma_map failed. */
rc = SCSI_MLQUEUE_HOST_BUSY;
@@ -5628,7 +5636,8 @@ static void hpsa_command_resubmit_worker(struct work_struct *work)
if (c2->error_data.serv_response ==
IOACCEL2_STATUS_SR_TASK_COMP_SET_FULL) {
- rc = hpsa_ioaccel_submit(h, c, cmd);
+ /* Resubmit with the retry_pending flag set. */
+ rc = hpsa_ioaccel_submit(h, c, cmd, true);
if (rc == 0)
return;
if (rc == SCSI_MLQUEUE_HOST_BUSY) {
@@ -5644,6 +5653,15 @@ static void hpsa_command_resubmit_worker(struct work_struct *work)
}
}
hpsa_cmd_partial_init(c->h, c->cmdindex, c);
+ /*
+ * Here we have not come in though queue_command, so we
+ * can set the retry_pending flag to true for a driver initiated
+ * retry attempt (I.E. not a SML retry).
+ * I.E. We are submitting a driver initiated retry.
+ * Note: hpsa_ciss_submit does not zero out the command fields like
+ * ioaccel submit does.
+ */
+ c->retry_pending = true;
if (hpsa_ciss_submit(c->h, c, cmd, dev)) {
/*
* If we get here, it means dma mapping failed. Try
@@ -5706,11 +5724,16 @@ static int hpsa_scsi_queue_command(struct Scsi_Host *sh, struct scsi_cmnd *cmd)
/*
* Call alternate submit routine for I/O accelerated commands.
* Retries always go down the normal I/O path.
+ * Note: If cmd->retries is non-zero, then this is a SML
+ * initiated retry and not a driver initiated retry.
+ * This command has been obtained from cmd_tagged_alloc
+ * and is therefore a brand-new command.
*/
if (likely(cmd->retries == 0 &&
!blk_rq_is_passthrough(cmd->request) &&
h->acciopath_status)) {
- rc = hpsa_ioaccel_submit(h, c, cmd);
+ /* Submit with the retry_pending flag unset. */
+ rc = hpsa_ioaccel_submit(h, c, cmd, false);
if (rc == 0)
return 0;
if (rc == SCSI_MLQUEUE_HOST_BUSY) {
@@ -6105,6 +6128,7 @@ return_reset_status:
* at init, and managed by cmd_tagged_alloc() and cmd_tagged_free() using the
* block request tag as an index into a table of entries. cmd_tagged_free() is
* the complement, although cmd_free() may be called instead.
+ * This function is only called for new requests from queue_command.
*/
static struct CommandList *cmd_tagged_alloc(struct ctlr_info *h,
struct scsi_cmnd *scmd)
@@ -6139,8 +6163,14 @@ static struct CommandList *cmd_tagged_alloc(struct ctlr_info *h,
}
atomic_inc(&c->refcount);
-
hpsa_cmd_partial_init(h, idx, c);
+
+ /*
+ * This is a new command obtained from queue_command so
+ * there have not been any driver initiated retry attempts.
+ */
+ c->retry_pending = false;
+
return c;
}
@@ -6208,6 +6238,13 @@ static struct CommandList *cmd_alloc(struct ctlr_info *h)
}
hpsa_cmd_partial_init(h, i, c);
c->device = NULL;
+
+ /*
+ * cmd_alloc is for "internal" commands and they are never
+ * retried.
+ */
+ c->retry_pending = false;
+
return c;
}
diff --git a/drivers/scsi/hpsa_cmd.h b/drivers/scsi/hpsa_cmd.h
index 46df2e3ff89b..d126bb877250 100644
--- a/drivers/scsi/hpsa_cmd.h
+++ b/drivers/scsi/hpsa_cmd.h
@@ -448,7 +448,7 @@ struct CommandList {
*/
struct hpsa_scsi_dev_t *phys_disk;
- int abort_pending;
+ bool retry_pending;
struct hpsa_scsi_dev_t *device;
atomic_t refcount; /* Must be last to avoid memset in hpsa_cmd_init() */
} __aligned(COMMANDLIST_ALIGNMENT);
diff --git a/drivers/scsi/isci/request.c b/drivers/scsi/isci/request.c
index bee16850b236..58e62162882f 100644
--- a/drivers/scsi/isci/request.c
+++ b/drivers/scsi/isci/request.c
@@ -1480,8 +1480,6 @@ static enum sci_status
stp_request_pio_await_h2d_completion_tc_event(struct isci_request *ireq,
u32 completion_code)
{
- enum sci_status status = SCI_SUCCESS;
-
switch (SCU_GET_COMPLETION_TL_STATUS(completion_code)) {
case SCU_MAKE_COMPLETION_STATUS(SCU_TASK_DONE_GOOD):
ireq->scu_status = SCU_TASK_DONE_GOOD;
@@ -1500,7 +1498,7 @@ stp_request_pio_await_h2d_completion_tc_event(struct isci_request *ireq,
break;
}
- return status;
+ return SCI_SUCCESS;
}
static enum sci_status
@@ -2152,8 +2150,6 @@ static enum sci_status stp_request_udma_await_tc_event(struct isci_request *ireq
static enum sci_status atapi_raw_completion(struct isci_request *ireq, u32 completion_code,
enum sci_base_request_states next)
{
- enum sci_status status = SCI_SUCCESS;
-
switch (SCU_GET_COMPLETION_TL_STATUS(completion_code)) {
case SCU_MAKE_COMPLETION_STATUS(SCU_TASK_DONE_GOOD):
ireq->scu_status = SCU_TASK_DONE_GOOD;
@@ -2172,7 +2168,7 @@ static enum sci_status atapi_raw_completion(struct isci_request *ireq, u32 compl
break;
}
- return status;
+ return SCI_SUCCESS;
}
static enum sci_status atapi_data_tc_completion_handler(struct isci_request *ireq,
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index a9ce6298b935..dd33ce0e3737 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -847,6 +847,7 @@ iscsi_sw_tcp_session_create(struct iscsi_endpoint *ep, uint16_t cmds_max,
struct iscsi_session *session;
struct iscsi_sw_tcp_host *tcp_sw_host;
struct Scsi_Host *shost;
+ int rc;
if (ep) {
printk(KERN_ERR "iscsi_tcp: invalid ep %p.\n", ep);
@@ -864,6 +865,11 @@ iscsi_sw_tcp_session_create(struct iscsi_endpoint *ep, uint16_t cmds_max,
shost->max_channel = 0;
shost->max_cmd_len = SCSI_MAX_VARLEN_CDB_SIZE;
+ rc = iscsi_host_get_max_scsi_cmds(shost, cmds_max);
+ if (rc < 0)
+ goto free_host;
+ shost->can_queue = rc;
+
if (iscsi_host_add(shost, NULL))
goto free_host;
@@ -878,7 +884,6 @@ iscsi_sw_tcp_session_create(struct iscsi_endpoint *ep, uint16_t cmds_max,
tcp_sw_host = iscsi_host_priv(shost);
tcp_sw_host->session = session;
- shost->can_queue = session->scsi_cmds_max;
if (iscsi_tcp_r2tpool_alloc(session))
goto remove_session;
return cls_session;
@@ -981,7 +986,7 @@ static struct scsi_host_template iscsi_sw_tcp_sht = {
.name = "iSCSI Initiator over TCP/IP",
.queuecommand = iscsi_queuecommand,
.change_queue_depth = scsi_change_queue_depth,
- .can_queue = ISCSI_DEF_XMIT_CMDS_MAX - 1,
+ .can_queue = ISCSI_TOTAL_CMDS_MAX,
.sg_tablesize = 4096,
.max_sectors = 0xFFFF,
.cmd_per_lun = ISCSI_DEF_CMD_PER_LUN,
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 4e668aafbcca..7ad11e42306d 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -523,16 +523,6 @@ static void iscsi_complete_task(struct iscsi_task *task, int state)
WARN_ON_ONCE(task->state == ISCSI_TASK_FREE);
task->state = state;
- spin_lock_bh(&conn->taskqueuelock);
- if (!list_empty(&task->running)) {
- pr_debug_once("%s while task on list", __func__);
- list_del_init(&task->running);
- }
- spin_unlock_bh(&conn->taskqueuelock);
-
- if (conn->task == task)
- conn->task = NULL;
-
if (READ_ONCE(conn->ping_task) == task)
WRITE_ONCE(conn->ping_task, NULL);
@@ -564,11 +554,41 @@ void iscsi_complete_scsi_task(struct iscsi_task *task,
}
EXPORT_SYMBOL_GPL(iscsi_complete_scsi_task);
+/*
+ * Must be called with back and frwd lock
+ */
+static bool cleanup_queued_task(struct iscsi_task *task)
+{
+ struct iscsi_conn *conn = task->conn;
+ bool early_complete = false;
+
+ /* Bad target might have completed task while it was still running */
+ if (task->state == ISCSI_TASK_COMPLETED)
+ early_complete = true;
+
+ if (!list_empty(&task->running)) {
+ list_del_init(&task->running);
+ /*
+ * If it's on a list but still running, this could be from
+ * a bad target sending a rsp early, cleanup from a TMF, or
+ * session recovery.
+ */
+ if (task->state == ISCSI_TASK_RUNNING ||
+ task->state == ISCSI_TASK_COMPLETED)
+ __iscsi_put_task(task);
+ }
+
+ if (conn->task == task) {
+ conn->task = NULL;
+ __iscsi_put_task(task);
+ }
+
+ return early_complete;
+}
/*
- * session back_lock must be held and if not called for a task that is
- * still pending or from the xmit thread, then xmit thread must
- * be suspended.
+ * session frwd lock must be held and if not called for a task that is still
+ * pending or from the xmit thread, then xmit thread must be suspended
*/
static void fail_scsi_task(struct iscsi_task *task, int err)
{
@@ -576,14 +596,11 @@ static void fail_scsi_task(struct iscsi_task *task, int err)
struct scsi_cmnd *sc;
int state;
- /*
- * if a command completes and we get a successful tmf response
- * we will hit this because the scsi eh abort code does not take
- * a ref to the task.
- */
- sc = task->sc;
- if (!sc)
+ spin_lock_bh(&conn->session->back_lock);
+ if (cleanup_queued_task(task)) {
+ spin_unlock_bh(&conn->session->back_lock);
return;
+ }
if (task->state == ISCSI_TASK_PENDING) {
/*
@@ -598,11 +615,9 @@ static void fail_scsi_task(struct iscsi_task *task, int err)
else
state = ISCSI_TASK_ABRT_TMF;
+ sc = task->sc;
sc->result = err << 16;
scsi_set_resid(sc, scsi_bufflen(sc));
-
- /* regular RX path uses back_lock */
- spin_lock_bh(&conn->session->back_lock);
iscsi_complete_task(task, state);
spin_unlock_bh(&conn->session->back_lock);
}
@@ -748,9 +763,7 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
if (session->tt->xmit_task(task))
goto free_task;
} else {
- spin_lock_bh(&conn->taskqueuelock);
list_add_tail(&task->running, &conn->mgmtqueue);
- spin_unlock_bh(&conn->taskqueuelock);
iscsi_conn_queue_work(conn);
}
@@ -1411,31 +1424,61 @@ static int iscsi_check_cmdsn_window_closed(struct iscsi_conn *conn)
return 0;
}
-static int iscsi_xmit_task(struct iscsi_conn *conn)
+static int iscsi_xmit_task(struct iscsi_conn *conn, struct iscsi_task *task,
+ bool was_requeue)
{
- struct iscsi_task *task = conn->task;
int rc;
- if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx))
- return -ENODATA;
-
spin_lock_bh(&conn->session->back_lock);
- if (conn->task == NULL) {
+
+ if (!conn->task) {
+ /* Take a ref so we can access it after xmit_task() */
+ __iscsi_get_task(task);
+ } else {
+ /* Already have a ref from when we failed to send it last call */
+ conn->task = NULL;
+ }
+
+ /*
+ * If this was a requeue for a R2T we have an extra ref on the task in
+ * case a bad target sends a cmd rsp before we have handled the task.
+ */
+ if (was_requeue)
+ __iscsi_put_task(task);
+
+ /*
+ * Do this after dropping the extra ref because if this was a requeue
+ * it's removed from that list and cleanup_queued_task would miss it.
+ */
+ if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) {
+ /*
+ * Save the task and ref in case we weren't cleaning up this
+ * task and get woken up again.
+ */
+ conn->task = task;
spin_unlock_bh(&conn->session->back_lock);
return -ENODATA;
}
- __iscsi_get_task(task);
spin_unlock_bh(&conn->session->back_lock);
+
spin_unlock_bh(&conn->session->frwd_lock);
rc = conn->session->tt->xmit_task(task);
spin_lock_bh(&conn->session->frwd_lock);
if (!rc) {
/* done with this task */
task->last_xfer = jiffies;
- conn->task = NULL;
}
/* regular RX path uses back_lock */
spin_lock(&conn->session->back_lock);
+ if (rc && task->state == ISCSI_TASK_RUNNING) {
+ /*
+ * get an extra ref that is released next time we access it
+ * as conn->task above.
+ */
+ __iscsi_get_task(task);
+ conn->task = task;
+ }
+
__iscsi_put_task(task);
spin_unlock(&conn->session->back_lock);
return rc;
@@ -1445,9 +1488,7 @@ static int iscsi_xmit_task(struct iscsi_conn *conn)
* iscsi_requeue_task - requeue task to run from session workqueue
* @task: task to requeue
*
- * LLDs that need to run a task from the session workqueue should call
- * this. The session frwd_lock must be held. This should only be called
- * by software drivers.
+ * Callers must have taken a ref to the task that is going to be requeued.
*/
void iscsi_requeue_task(struct iscsi_task *task)
{
@@ -1457,11 +1498,18 @@ void iscsi_requeue_task(struct iscsi_task *task)
* this may be on the requeue list already if the xmit_task callout
* is handling the r2ts while we are adding new ones
*/
- spin_lock_bh(&conn->taskqueuelock);
- if (list_empty(&task->running))
+ spin_lock_bh(&conn->session->frwd_lock);
+ if (list_empty(&task->running)) {
list_add_tail(&task->running, &conn->requeue);
- spin_unlock_bh(&conn->taskqueuelock);
+ } else {
+ /*
+ * Don't need the extra ref since it's already requeued and
+ * has a ref.
+ */
+ iscsi_put_task(task);
+ }
iscsi_conn_queue_work(conn);
+ spin_unlock_bh(&conn->session->frwd_lock);
}
EXPORT_SYMBOL_GPL(iscsi_requeue_task);
@@ -1487,7 +1535,7 @@ static int iscsi_data_xmit(struct iscsi_conn *conn)
}
if (conn->task) {
- rc = iscsi_xmit_task(conn);
+ rc = iscsi_xmit_task(conn, conn->task, false);
if (rc)
goto done;
}
@@ -1497,54 +1545,41 @@ static int iscsi_data_xmit(struct iscsi_conn *conn)
* only have one nop-out as a ping from us and targets should not
* overflow us with nop-ins
*/
- spin_lock_bh(&conn->taskqueuelock);
check_mgmt:
while (!list_empty(&conn->mgmtqueue)) {
- conn->task = list_entry(conn->mgmtqueue.next,
- struct iscsi_task, running);
- list_del_init(&conn->task->running);
- spin_unlock_bh(&conn->taskqueuelock);
- if (iscsi_prep_mgmt_task(conn, conn->task)) {
+ task = list_entry(conn->mgmtqueue.next, struct iscsi_task,
+ running);
+ list_del_init(&task->running);
+ if (iscsi_prep_mgmt_task(conn, task)) {
/* regular RX path uses back_lock */
spin_lock_bh(&conn->session->back_lock);
- __iscsi_put_task(conn->task);
+ __iscsi_put_task(task);
spin_unlock_bh(&conn->session->back_lock);
- conn->task = NULL;
- spin_lock_bh(&conn->taskqueuelock);
continue;
}
- rc = iscsi_xmit_task(conn);
+ rc = iscsi_xmit_task(conn, task, false);
if (rc)
goto done;
- spin_lock_bh(&conn->taskqueuelock);
}
/* process pending command queue */
while (!list_empty(&conn->cmdqueue)) {
- conn->task = list_entry(conn->cmdqueue.next, struct iscsi_task,
- running);
- list_del_init(&conn->task->running);
- spin_unlock_bh(&conn->taskqueuelock);
+ task = list_entry(conn->cmdqueue.next, struct iscsi_task,
+ running);
+ list_del_init(&task->running);
if (conn->session->state == ISCSI_STATE_LOGGING_OUT) {
- fail_scsi_task(conn->task, DID_IMM_RETRY);
- spin_lock_bh(&conn->taskqueuelock);
+ fail_scsi_task(task, DID_IMM_RETRY);
continue;
}
- rc = iscsi_prep_scsi_cmd_pdu(conn->task);
+ rc = iscsi_prep_scsi_cmd_pdu(task);
if (rc) {
- if (rc == -ENOMEM || rc == -EACCES) {
- spin_lock_bh(&conn->taskqueuelock);
- list_add_tail(&conn->task->running,
- &conn->cmdqueue);
- conn->task = NULL;
- spin_unlock_bh(&conn->taskqueuelock);
- goto done;
- } else
- fail_scsi_task(conn->task, DID_ABORT);
- spin_lock_bh(&conn->taskqueuelock);
+ if (rc == -ENOMEM || rc == -EACCES)
+ fail_scsi_task(task, DID_IMM_RETRY);
+ else
+ fail_scsi_task(task, DID_ABORT);
continue;
}
- rc = iscsi_xmit_task(conn);
+ rc = iscsi_xmit_task(conn, task, false);
if (rc)
goto done;
/*
@@ -1552,7 +1587,6 @@ check_mgmt:
* we need to check the mgmt queue for nops that need to
* be sent to aviod starvation
*/
- spin_lock_bh(&conn->taskqueuelock);
if (!list_empty(&conn->mgmtqueue))
goto check_mgmt;
}
@@ -1566,21 +1600,17 @@ check_mgmt:
task = list_entry(conn->requeue.next, struct iscsi_task,
running);
+
if (iscsi_check_tmf_restrictions(task, ISCSI_OP_SCSI_DATA_OUT))
break;
- conn->task = task;
- list_del_init(&conn->task->running);
- conn->task->state = ISCSI_TASK_RUNNING;
- spin_unlock_bh(&conn->taskqueuelock);
- rc = iscsi_xmit_task(conn);
+ list_del_init(&task->running);
+ rc = iscsi_xmit_task(conn, task, true);
if (rc)
goto done;
- spin_lock_bh(&conn->taskqueuelock);
if (!list_empty(&conn->mgmtqueue))
goto check_mgmt;
}
- spin_unlock_bh(&conn->taskqueuelock);
spin_unlock_bh(&conn->session->frwd_lock);
return -ENODATA;
@@ -1746,9 +1776,7 @@ int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)
goto prepd_reject;
}
} else {
- spin_lock_bh(&conn->taskqueuelock);
list_add_tail(&task->running, &conn->cmdqueue);
- spin_unlock_bh(&conn->taskqueuelock);
iscsi_conn_queue_work(conn);
}
@@ -1855,27 +1883,39 @@ static int iscsi_exec_task_mgmt_fn(struct iscsi_conn *conn,
}
/*
- * Fail commands. session lock held and recv side suspended and xmit
- * thread flushed
+ * Fail commands. session frwd lock held and xmit thread flushed.
*/
static void fail_scsi_tasks(struct iscsi_conn *conn, u64 lun, int error)
{
+ struct iscsi_session *session = conn->session;
struct iscsi_task *task;
int i;
- for (i = 0; i < conn->session->cmds_max; i++) {
- task = conn->session->cmds[i];
+ spin_lock_bh(&session->back_lock);
+ for (i = 0; i < session->cmds_max; i++) {
+ task = session->cmds[i];
if (!task->sc || task->state == ISCSI_TASK_FREE)
continue;
if (lun != -1 && lun != task->sc->device->lun)
continue;
- ISCSI_DBG_SESSION(conn->session,
+ __iscsi_get_task(task);
+ spin_unlock_bh(&session->back_lock);
+
+ ISCSI_DBG_SESSION(session,
"failing sc %p itt 0x%x state %d\n",
task->sc, task->itt, task->state);
fail_scsi_task(task, error);
+
+ spin_unlock_bh(&session->frwd_lock);
+ iscsi_put_task(task);
+ spin_lock_bh(&session->frwd_lock);
+
+ spin_lock_bh(&session->back_lock);
}
+
+ spin_unlock_bh(&session->back_lock);
}
/**
@@ -1953,6 +1993,7 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc)
ISCSI_DBG_EH(session, "scsi cmd %p timedout\n", sc);
spin_lock_bh(&session->frwd_lock);
+ spin_lock(&session->back_lock);
task = (struct iscsi_task *)sc->SCp.ptr;
if (!task) {
/*
@@ -1960,8 +2001,11 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc)
* so let timeout code complete it now.
*/
rc = BLK_EH_DONE;
+ spin_unlock(&session->back_lock);
goto done;
}
+ __iscsi_get_task(task);
+ spin_unlock(&session->back_lock);
if (session->state != ISCSI_STATE_LOGGED_IN) {
/*
@@ -2020,6 +2064,7 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc)
goto done;
}
+ spin_lock(&session->back_lock);
for (i = 0; i < conn->session->cmds_max; i++) {
running_task = conn->session->cmds[i];
if (!running_task->sc || running_task == task ||
@@ -2052,10 +2097,12 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc)
"last xfer %lu/%lu. Last check %lu.\n",
task->last_xfer, running_task->last_xfer,
task->last_timeout);
+ spin_unlock(&session->back_lock);
rc = BLK_EH_RESET_TIMER;
goto done;
}
}
+ spin_unlock(&session->back_lock);
/* Assumes nop timeout is shorter than scsi cmd timeout */
if (task->have_checked_conn)
@@ -2077,9 +2124,12 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc)
rc = BLK_EH_RESET_TIMER;
done:
- if (task)
- task->last_timeout = jiffies;
spin_unlock_bh(&session->frwd_lock);
+
+ if (task) {
+ task->last_timeout = jiffies;
+ iscsi_put_task(task);
+ }
ISCSI_DBG_EH(session, "return %s\n", rc == BLK_EH_RESET_TIMER ?
"timer reset" : "shutdown or nh");
return rc;
@@ -2187,15 +2237,20 @@ int iscsi_eh_abort(struct scsi_cmnd *sc)
conn->eh_abort_cnt++;
age = session->age;
+ spin_lock(&session->back_lock);
task = (struct iscsi_task *)sc->SCp.ptr;
- ISCSI_DBG_EH(session, "aborting [sc %p itt 0x%x]\n",
- sc, task->itt);
-
- /* task completed before time out */
- if (!task->sc) {
+ if (!task || !task->sc) {
+ /* task completed before time out */
ISCSI_DBG_EH(session, "sc completed while abort in progress\n");
- goto success;
+
+ spin_unlock(&session->back_lock);
+ spin_unlock_bh(&session->frwd_lock);
+ mutex_unlock(&session->eh_mutex);
+ return SUCCESS;
}
+ ISCSI_DBG_EH(session, "aborting [sc %p itt 0x%x]\n", sc, task->itt);
+ __iscsi_get_task(task);
+ spin_unlock(&session->back_lock);
if (task->state == ISCSI_TASK_PENDING) {
fail_scsi_task(task, DID_ABORT);
@@ -2257,6 +2312,7 @@ success:
success_unlocked:
ISCSI_DBG_EH(session, "abort success [sc %p itt 0x%x]\n",
sc, task->itt);
+ iscsi_put_task(task);
mutex_unlock(&session->eh_mutex);
return SUCCESS;
@@ -2265,6 +2321,7 @@ failed:
failed_unlocked:
ISCSI_DBG_EH(session, "abort failed [sc %p itt 0x%x]\n", sc,
task ? task->itt : 0);
+ iscsi_put_task(task);
mutex_unlock(&session->eh_mutex);
return FAILED;
}
@@ -2591,6 +2648,56 @@ void iscsi_pool_free(struct iscsi_pool *q)
}
EXPORT_SYMBOL_GPL(iscsi_pool_free);
+int iscsi_host_get_max_scsi_cmds(struct Scsi_Host *shost,
+ uint16_t requested_cmds_max)
+{
+ int scsi_cmds, total_cmds = requested_cmds_max;
+
+check:
+ if (!total_cmds)
+ total_cmds = ISCSI_DEF_XMIT_CMDS_MAX;
+ /*
+ * The iscsi layer needs some tasks for nop handling and tmfs,
+ * so the cmds_max must at least be greater than ISCSI_MGMT_CMDS_MAX
+ * + 1 command for scsi IO.
+ */
+ if (total_cmds < ISCSI_TOTAL_CMDS_MIN) {
+ printk(KERN_ERR "iscsi: invalid max cmds of %d. Must be a power of two that is at least %d.\n",
+ total_cmds, ISCSI_TOTAL_CMDS_MIN);
+ return -EINVAL;
+ }
+
+ if (total_cmds > ISCSI_TOTAL_CMDS_MAX) {
+ printk(KERN_INFO "iscsi: invalid max cmds of %d. Must be a power of 2 less than or equal to %d. Using %d.\n",
+ requested_cmds_max, ISCSI_TOTAL_CMDS_MAX,
+ ISCSI_TOTAL_CMDS_MAX);
+ total_cmds = ISCSI_TOTAL_CMDS_MAX;
+ }
+
+ if (!is_power_of_2(total_cmds)) {
+ total_cmds = rounddown_pow_of_two(total_cmds);
+ if (total_cmds < ISCSI_TOTAL_CMDS_MIN) {
+ printk(KERN_ERR "iscsi: invalid max cmds of %d. Must be a power of 2 greater than %d.\n", requested_cmds_max, ISCSI_TOTAL_CMDS_MIN);
+ return -EINVAL;
+ }
+
+ printk(KERN_INFO "iscsi: invalid max cmds %d. Must be a power of 2. Rounding max cmds down to %d.\n",
+ requested_cmds_max, total_cmds);
+ }
+
+ scsi_cmds = total_cmds - ISCSI_MGMT_CMDS_MAX;
+ if (shost->can_queue && scsi_cmds > shost->can_queue) {
+ total_cmds = shost->can_queue;
+
+ printk(KERN_INFO "iscsi: requested max cmds %u is higher than driver limit. Using driver limit %u\n",
+ requested_cmds_max, shost->can_queue);
+ goto check;
+ }
+
+ return scsi_cmds;
+}
+EXPORT_SYMBOL_GPL(iscsi_host_get_max_scsi_cmds);
+
/**
* iscsi_host_add - add host to system
* @shost: scsi host
@@ -2681,8 +2788,6 @@ void iscsi_host_remove(struct Scsi_Host *shost)
flush_signals(current);
scsi_remove_host(shost);
- if (ihost->workq)
- destroy_workqueue(ihost->workq);
}
EXPORT_SYMBOL_GPL(iscsi_host_remove);
@@ -2690,6 +2795,9 @@ void iscsi_host_free(struct Scsi_Host *shost)
{
struct iscsi_host *ihost = shost_priv(shost);
+ if (ihost->workq)
+ destroy_workqueue(ihost->workq);
+
kfree(ihost->netdev);
kfree(ihost->hwaddress);
kfree(ihost->initiatorname);
@@ -2743,7 +2851,7 @@ iscsi_session_setup(struct iscsi_transport *iscsit, struct Scsi_Host *shost,
struct iscsi_host *ihost = shost_priv(shost);
struct iscsi_session *session;
struct iscsi_cls_session *cls_session;
- int cmd_i, scsi_cmds, total_cmds = cmds_max;
+ int cmd_i, scsi_cmds;
unsigned long flags;
spin_lock_irqsave(&ihost->lock, flags);
@@ -2754,37 +2862,9 @@ iscsi_session_setup(struct iscsi_transport *iscsit, struct Scsi_Host *shost,
ihost->num_sessions++;
spin_unlock_irqrestore(&ihost->lock, flags);
- if (!total_cmds)
- total_cmds = ISCSI_DEF_XMIT_CMDS_MAX;
- /*
- * The iscsi layer needs some tasks for nop handling and tmfs,
- * so the cmds_max must at least be greater than ISCSI_MGMT_CMDS_MAX
- * + 1 command for scsi IO.
- */
- if (total_cmds < ISCSI_TOTAL_CMDS_MIN) {
- printk(KERN_ERR "iscsi: invalid can_queue of %d. can_queue "
- "must be a power of two that is at least %d.\n",
- total_cmds, ISCSI_TOTAL_CMDS_MIN);
+ scsi_cmds = iscsi_host_get_max_scsi_cmds(shost, cmds_max);
+ if (scsi_cmds < 0)
goto dec_session_count;
- }
-
- if (total_cmds > ISCSI_TOTAL_CMDS_MAX) {
- printk(KERN_ERR "iscsi: invalid can_queue of %d. can_queue "
- "must be a power of 2 less than or equal to %d.\n",
- cmds_max, ISCSI_TOTAL_CMDS_MAX);
- total_cmds = ISCSI_TOTAL_CMDS_MAX;
- }
-
- if (!is_power_of_2(total_cmds)) {
- printk(KERN_ERR "iscsi: invalid can_queue of %d. can_queue "
- "must be a power of 2.\n", total_cmds);
- total_cmds = rounddown_pow_of_two(total_cmds);
- if (total_cmds < ISCSI_TOTAL_CMDS_MIN)
- goto dec_session_count;
- printk(KERN_INFO "iscsi: Rounding can_queue to %d.\n",
- total_cmds);
- }
- scsi_cmds = total_cmds - ISCSI_MGMT_CMDS_MAX;
cls_session = iscsi_alloc_session(shost, iscsit,
sizeof(struct iscsi_session) +
@@ -2800,7 +2880,7 @@ iscsi_session_setup(struct iscsi_transport *iscsit, struct Scsi_Host *shost,
session->lu_reset_timeout = 15;
session->abort_timeout = 10;
session->scsi_cmds_max = scsi_cmds;
- session->cmds_max = total_cmds;
+ session->cmds_max = scsi_cmds + ISCSI_MGMT_CMDS_MAX;
session->queued_cmdsn = session->cmdsn = initial_cmdsn;
session->exp_cmdsn = initial_cmdsn + 1;
session->max_cmdsn = initial_cmdsn + 1;
@@ -2919,7 +2999,6 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size,
INIT_LIST_HEAD(&conn->mgmtqueue);
INIT_LIST_HEAD(&conn->cmdqueue);
INIT_LIST_HEAD(&conn->requeue);
- spin_lock_init(&conn->taskqueuelock);
INIT_WORK(&conn->xmitwork, iscsi_xmitworker);
/* allocate login_task used for the login/text sequences */
@@ -3085,10 +3164,16 @@ fail_mgmt_tasks(struct iscsi_session *session, struct iscsi_conn *conn)
ISCSI_DBG_SESSION(conn->session,
"failing mgmt itt 0x%x state %d\n",
task->itt, task->state);
+
+ spin_lock_bh(&session->back_lock);
+ if (cleanup_queued_task(task)) {
+ spin_unlock_bh(&session->back_lock);
+ continue;
+ }
+
state = ISCSI_TASK_ABRT_SESS_RECOV;
if (task->state == ISCSI_TASK_PENDING)
state = ISCSI_TASK_COMPLETED;
- spin_lock_bh(&session->back_lock);
iscsi_complete_task(task, state);
spin_unlock_bh(&session->back_lock);
}
@@ -3189,6 +3274,13 @@ int iscsi_conn_bind(struct iscsi_cls_session *cls_session,
spin_unlock_bh(&session->frwd_lock);
/*
+ * The target could have reduced it's window size between logins, so
+ * we have to reset max/exp cmdsn so we can see the new values.
+ */
+ spin_lock_bh(&session->back_lock);
+ session->max_cmdsn = session->exp_cmdsn = session->cmdsn + 1;
+ spin_unlock_bh(&session->back_lock);
+ /*
* Unblock xmitworker(), Login Phase will pass through.
*/
clear_bit(ISCSI_SUSPEND_BIT, &conn->suspend_rx);
diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c
index 83f14b2c8804..2e9ffe3d1a55 100644
--- a/drivers/scsi/libiscsi_tcp.c
+++ b/drivers/scsi/libiscsi_tcp.c
@@ -524,48 +524,79 @@ static int iscsi_tcp_data_in(struct iscsi_conn *conn, struct iscsi_task *task)
/**
* iscsi_tcp_r2t_rsp - iSCSI R2T Response processing
* @conn: iscsi connection
- * @task: scsi command task
+ * @hdr: PDU header
*/
-static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task)
+static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_hdr *hdr)
{
struct iscsi_session *session = conn->session;
- struct iscsi_tcp_task *tcp_task = task->dd_data;
- struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
- struct iscsi_r2t_rsp *rhdr = (struct iscsi_r2t_rsp *)tcp_conn->in.hdr;
+ struct iscsi_tcp_task *tcp_task;
+ struct iscsi_tcp_conn *tcp_conn;
+ struct iscsi_r2t_rsp *rhdr;
struct iscsi_r2t_info *r2t;
- int r2tsn = be32_to_cpu(rhdr->r2tsn);
+ struct iscsi_task *task;
u32 data_length;
u32 data_offset;
+ int r2tsn;
int rc;
+ spin_lock(&session->back_lock);
+ task = iscsi_itt_to_ctask(conn, hdr->itt);
+ if (!task) {
+ spin_unlock(&session->back_lock);
+ return ISCSI_ERR_BAD_ITT;
+ } else if (task->sc->sc_data_direction != DMA_TO_DEVICE) {
+ spin_unlock(&session->back_lock);
+ return ISCSI_ERR_PROTO;
+ }
+ /*
+ * A bad target might complete the cmd before we have handled R2Ts
+ * so get a ref to the task that will be dropped in the xmit path.
+ */
+ if (task->state != ISCSI_TASK_RUNNING) {
+ spin_unlock(&session->back_lock);
+ /* Let the path that got the early rsp complete it */
+ return 0;
+ }
+ task->last_xfer = jiffies;
+ __iscsi_get_task(task);
+
+ tcp_conn = conn->dd_data;
+ rhdr = (struct iscsi_r2t_rsp *)tcp_conn->in.hdr;
+ /* fill-in new R2T associated with the task */
+ iscsi_update_cmdsn(session, (struct iscsi_nopin *)rhdr);
+ spin_unlock(&session->back_lock);
+
if (tcp_conn->in.datalen) {
iscsi_conn_printk(KERN_ERR, conn,
"invalid R2t with datalen %d\n",
tcp_conn->in.datalen);
- return ISCSI_ERR_DATALEN;
+ rc = ISCSI_ERR_DATALEN;
+ goto put_task;
}
+ tcp_task = task->dd_data;
+ r2tsn = be32_to_cpu(rhdr->r2tsn);
if (tcp_task->exp_datasn != r2tsn){
ISCSI_DBG_TCP(conn, "task->exp_datasn(%d) != rhdr->r2tsn(%d)\n",
tcp_task->exp_datasn, r2tsn);
- return ISCSI_ERR_R2TSN;
+ rc = ISCSI_ERR_R2TSN;
+ goto put_task;
}
- /* fill-in new R2T associated with the task */
- iscsi_update_cmdsn(session, (struct iscsi_nopin*)rhdr);
-
- if (!task->sc || session->state != ISCSI_STATE_LOGGED_IN) {
+ if (session->state != ISCSI_STATE_LOGGED_IN) {
iscsi_conn_printk(KERN_INFO, conn,
"dropping R2T itt %d in recovery.\n",
task->itt);
- return 0;
+ rc = 0;
+ goto put_task;
}
data_length = be32_to_cpu(rhdr->data_length);
if (data_length == 0) {
iscsi_conn_printk(KERN_ERR, conn,
"invalid R2T with zero data len\n");
- return ISCSI_ERR_DATALEN;
+ rc = ISCSI_ERR_DATALEN;
+ goto put_task;
}
if (data_length > session->max_burst)
@@ -579,7 +610,8 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task)
"invalid R2T with data len %u at offset %u "
"and total length %d\n", data_length,
data_offset, task->sc->sdb.length);
- return ISCSI_ERR_DATALEN;
+ rc = ISCSI_ERR_DATALEN;
+ goto put_task;
}
spin_lock(&tcp_task->pool2queue);
@@ -589,7 +621,8 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task)
"Target has sent more R2Ts than it "
"negotiated for or driver has leaked.\n");
spin_unlock(&tcp_task->pool2queue);
- return ISCSI_ERR_PROTO;
+ rc = ISCSI_ERR_PROTO;
+ goto put_task;
}
r2t->exp_statsn = rhdr->statsn;
@@ -607,6 +640,10 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task)
iscsi_requeue_task(task);
return 0;
+
+put_task:
+ iscsi_put_task(task);
+ return rc;
}
/*
@@ -730,20 +767,11 @@ iscsi_tcp_hdr_dissect(struct iscsi_conn *conn, struct iscsi_hdr *hdr)
rc = iscsi_complete_pdu(conn, hdr, NULL, 0);
break;
case ISCSI_OP_R2T:
- spin_lock(&conn->session->back_lock);
- task = iscsi_itt_to_ctask(conn, hdr->itt);
- spin_unlock(&conn->session->back_lock);
- if (!task)
- rc = ISCSI_ERR_BAD_ITT;
- else if (ahslen)
+ if (ahslen) {
rc = ISCSI_ERR_AHSLEN;
- else if (task->sc->sc_data_direction == DMA_TO_DEVICE) {
- task->last_xfer = jiffies;
- spin_lock(&conn->session->frwd_lock);
- rc = iscsi_tcp_r2t_rsp(conn, task);
- spin_unlock(&conn->session->frwd_lock);
- } else
- rc = ISCSI_ERR_PROTO;
+ break;
+ }
+ rc = iscsi_tcp_r2t_rsp(conn, hdr);
break;
case ISCSI_OP_LOGIN_RSP:
case ISCSI_OP_TEXT_RSP:
diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c
index f5582c8e77c9..ac066f86bb14 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.c
@@ -3648,25 +3648,16 @@ _base_get_msix_index(struct MPT3SAS_ADAPTER *ioc,
base_mod64(atomic64_add_return(1,
&ioc->total_io_cnt), ioc->reply_queue_count) : 0;
- return ioc->cpu_msix_table[raw_smp_processor_id()];
-}
+ if (scmd && ioc->shost->nr_hw_queues > 1) {
+ u32 tag = blk_mq_unique_tag(scmd->request);
-/**
- * _base_sdev_nr_inflight_request -get number of inflight requests
- * of a request queue.
- * @q: request_queue object
- *
- * returns number of inflight request of a request queue.
- */
-inline unsigned long
-_base_sdev_nr_inflight_request(struct request_queue *q)
-{
- struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[0];
+ return blk_mq_unique_tag_to_hwq(tag) +
+ ioc->high_iops_queues;
+ }
- return atomic_read(&hctx->nr_active);
+ return ioc->cpu_msix_table[raw_smp_processor_id()];
}
-
/**
* _base_get_high_iops_msix_index - get the msix index of
* high iops queues
@@ -3686,7 +3677,8 @@ _base_get_high_iops_msix_index(struct MPT3SAS_ADAPTER *ioc,
* reply queues in terms of batch count 16 when outstanding
* IOs on the target device is >=8.
*/
- if (_base_sdev_nr_inflight_request(scmd->device->request_queue) >
+
+ if (atomic_read(&scmd->device->device_busy) >
MPT3SAS_DEVICE_HIGH_IOPS_DEPTH)
return base_mod64((
atomic64_add_return(1, &ioc->high_iops_outstanding) /
@@ -3739,8 +3731,23 @@ mpt3sas_base_get_smid_scsiio(struct MPT3SAS_ADAPTER *ioc, u8 cb_idx,
struct scsi_cmnd *scmd)
{
struct scsiio_tracker *request = scsi_cmd_priv(scmd);
- unsigned int tag = scmd->request->tag;
u16 smid;
+ u32 tag, unique_tag;
+
+ unique_tag = blk_mq_unique_tag(scmd->request);
+ tag = blk_mq_unique_tag_to_tag(unique_tag);
+
+ /*
+ * Store hw queue number corresponding to the tag.
+ * This hw queue number is used later to determine
+ * the unique_tag using the logic below. This unique_tag
+ * is used to retrieve the scmd pointer corresponding
+ * to tag using scsi_host_find_tag() API.
+ *
+ * tag = smid - 1;
+ * unique_tag = ioc->io_queue_num[tag] << BLK_MQ_UNIQUE_TAG_BITS | tag;
+ */
+ ioc->io_queue_num[tag] = blk_mq_unique_tag_to_hwq(unique_tag);
smid = tag + 1;
request->cb_idx = cb_idx;
@@ -3831,6 +3838,7 @@ mpt3sas_base_free_smid(struct MPT3SAS_ADAPTER *ioc, u16 smid)
mpt3sas_base_clear_st(ioc, st);
_base_recovery_check(ioc);
+ ioc->io_queue_num[smid - 1] = 0;
return;
}
@@ -5362,6 +5370,9 @@ _base_release_memory_pools(struct MPT3SAS_ADAPTER *ioc)
kfree(ioc->chain_lookup);
ioc->chain_lookup = NULL;
}
+
+ kfree(ioc->io_queue_num);
+ ioc->io_queue_num = NULL;
}
/**
@@ -5641,7 +5652,8 @@ _base_allocate_memory_pools(struct MPT3SAS_ADAPTER *ioc)
reply_post_free_sz = ioc->reply_post_queue_depth *
sizeof(Mpi2DefaultReplyDescriptor_t);
rdpq_sz = reply_post_free_sz * RDPQ_MAX_INDEX_IN_ONE_CHUNK;
- if (_base_is_controller_msix_enabled(ioc) && !ioc->rdpq_array_enable)
+ if ((_base_is_controller_msix_enabled(ioc) && !ioc->rdpq_array_enable)
+ || (ioc->reply_queue_count < RDPQ_MAX_INDEX_IN_ONE_CHUNK))
rdpq_sz = reply_post_free_sz * ioc->reply_queue_count;
ret = base_alloc_rdpq_dma_pool(ioc, rdpq_sz);
if (ret == -EAGAIN) {
@@ -5772,6 +5784,11 @@ _base_allocate_memory_pools(struct MPT3SAS_ADAPTER *ioc)
ioc_info(ioc, "internal(0x%p): depth(%d), start smid(%d)\n",
ioc->internal,
ioc->internal_depth, ioc->internal_smid));
+
+ ioc->io_queue_num = kcalloc(ioc->scsiio_depth,
+ sizeof(u16), GFP_KERNEL);
+ if (!ioc->io_queue_num)
+ goto out;
/*
* The number of NVMe page sized blocks needed is:
* (((sg_tablesize * 8) - 1) / (page_size - 8)) + 1
@@ -8174,8 +8191,11 @@ mpt3sas_base_hard_reset_handler(struct MPT3SAS_ADAPTER *ioc,
ioc_state = mpt3sas_base_get_iocstate(ioc, 0);
if ((ioc_state & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT ||
(ioc_state & MPI2_IOC_STATE_MASK) ==
- MPI2_IOC_STATE_COREDUMP)
+ MPI2_IOC_STATE_COREDUMP) {
is_fault = 1;
+ ioc->htb_rel.trigger_info_dwords[1] =
+ (ioc_state & MPI2_DOORBELL_DATA_MASK);
+ }
}
_base_pre_reset_handler(ioc);
mpt3sas_wait_for_commands_to_complete(ioc);
diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h b/drivers/scsi/mpt3sas/mpt3sas_base.h
index 2def7a340616..315aee6ef86f 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.h
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.h
@@ -77,8 +77,8 @@
#define MPT3SAS_DRIVER_NAME "mpt3sas"
#define MPT3SAS_AUTHOR "Avago Technologies <MPT-FusionLinux.pdl@avagotech.com>"
#define MPT3SAS_DESCRIPTION "LSI MPT Fusion SAS 3.0 Device Driver"
-#define MPT3SAS_DRIVER_VERSION "36.100.00.00"
-#define MPT3SAS_MAJOR_VERSION 36
+#define MPT3SAS_DRIVER_VERSION "37.100.00.00"
+#define MPT3SAS_MAJOR_VERSION 37
#define MPT3SAS_MINOR_VERSION 100
#define MPT3SAS_BUILD_VERSION 0
#define MPT3SAS_RELEASE_VERSION 00
@@ -1073,6 +1073,50 @@ struct hba_port {
#define MULTIPATH_DISABLED_PORT_ID 0xFF
+/**
+ * struct htb_rel_query - diagnostic buffer release reason
+ * @unique_id - unique id associated with this buffer.
+ * @buffer_rel_condition - Release condition ioctl/sysfs/reset
+ * @reserved
+ * @trigger_type - Master/Event/scsi/MPI
+ * @trigger_info_dwords - Data Correspondig to trigger type
+ */
+struct htb_rel_query {
+ u16 buffer_rel_condition;
+ u16 reserved;
+ u32 trigger_type;
+ u32 trigger_info_dwords[2];
+};
+
+/* Buffer_rel_condition bit fields */
+
+/* Bit 0 - Diag Buffer not Released */
+#define MPT3_DIAG_BUFFER_NOT_RELEASED (0x00)
+/* Bit 0 - Diag Buffer Released */
+#define MPT3_DIAG_BUFFER_RELEASED (0x01)
+
+/*
+ * Bit 1 - Diag Buffer Released by IOCTL,
+ * This bit is valid only if Bit 0 is one
+ */
+#define MPT3_DIAG_BUFFER_REL_IOCTL (0x02 | MPT3_DIAG_BUFFER_RELEASED)
+
+/*
+ * Bit 2 - Diag Buffer Released by Trigger,
+ * This bit is valid only if Bit 0 is one
+ */
+#define MPT3_DIAG_BUFFER_REL_TRIGGER (0x04 | MPT3_DIAG_BUFFER_RELEASED)
+
+/*
+ * Bit 3 - Diag Buffer Released by SysFs,
+ * This bit is valid only if Bit 0 is one
+ */
+#define MPT3_DIAG_BUFFER_REL_SYSFS (0x08 | MPT3_DIAG_BUFFER_RELEASED)
+
+/* DIAG RESET Master trigger flags */
+#define MPT_DIAG_RESET_ISSUED_BY_DRIVER 0x00000000
+#define MPT_DIAG_RESET_ISSUED_BY_USER 0x00000001
+
typedef void (*MPT3SAS_FLUSH_RUNNING_CMDS)(struct MPT3SAS_ADAPTER *ioc);
/**
* struct MPT3SAS_ADAPTER - per adapter struct
@@ -1439,6 +1483,7 @@ struct MPT3SAS_ADAPTER {
spinlock_t scsi_lookup_lock;
int pending_io_count;
wait_queue_head_t reset_wq;
+ u16 *io_queue_num;
/* PCIe SGL */
struct dma_pool *pcie_sgl_dma_pool;
@@ -1529,6 +1574,8 @@ struct MPT3SAS_ADAPTER {
u32 diagnostic_flags[MPI2_DIAG_BUF_TYPE_COUNT];
u32 ring_buffer_offset;
u32 ring_buffer_sz;
+ struct htb_rel_query htb_rel;
+ u8 reset_from_user;
u8 is_warpdrive;
u8 is_mcpu_endpoint;
u8 hide_ir_msg;
@@ -1565,6 +1612,7 @@ struct mpt3sas_debugfs_buffer {
};
#define MPT_DRV_SUPPORT_BITMAP_MEMMOVE 0x00000001
+#define MPT_DRV_SUPPORT_BITMAP_ADDNLQUERY 0x00000002
typedef u8 (*MPT_CALLBACK)(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index,
u32 reply);
diff --git a/drivers/scsi/mpt3sas/mpt3sas_ctl.c b/drivers/scsi/mpt3sas/mpt3sas_ctl.c
index c8a0ce18f2c5..44f9a05db94e 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_ctl.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_ctl.c
@@ -479,6 +479,8 @@ void mpt3sas_ctl_pre_reset_handler(struct MPT3SAS_ADAPTER *ioc)
ioc_info(ioc,
"%s: Releasing the trace buffer due to adapter reset.",
__func__);
+ ioc->htb_rel.buffer_rel_condition =
+ MPT3_DIAG_BUFFER_REL_TRIGGER;
mpt3sas_send_diag_release(ioc, i, &issue_reset);
}
}
@@ -1334,6 +1336,7 @@ _ctl_do_reset(struct MPT3SAS_ADAPTER *ioc, void __user *arg)
dctlprintk(ioc, ioc_info(ioc, "%s: enter\n",
__func__));
+ ioc->reset_from_user = 1;
retval = mpt3sas_base_hard_reset_handler(ioc, FORCE_BIG_HAMMER);
ioc_info(ioc,
"Ioctl: host reset: %s\n", ((!retval) ? "SUCCESS" : "FAILED"));
@@ -1687,6 +1690,9 @@ _ctl_diag_register_2(struct MPT3SAS_ADAPTER *ioc,
request_data = ioc->diag_buffer[buffer_type];
request_data_sz = diag_register->requested_buffer_size;
ioc->unique_id[buffer_type] = diag_register->unique_id;
+ /* Reset ioc variables used for additional query commands */
+ ioc->reset_from_user = 0;
+ memset(&ioc->htb_rel, 0, sizeof(struct htb_rel_query));
ioc->diag_buffer_status[buffer_type] &=
MPT3_DIAG_BUFFER_IS_DRIVER_ALLOCATED;
memcpy(ioc->product_specific[buffer_type],
@@ -2469,7 +2475,61 @@ _ctl_diag_read_buffer(struct MPT3SAS_ADAPTER *ioc, void __user *arg)
return rc;
}
+/**
+ * _ctl_addnl_diag_query - query relevant info associated with diag buffers
+ * @ioc: per adapter object
+ * @arg: user space buffer containing ioctl content
+ *
+ * The application will send only unique_id. Driver will
+ * inspect unique_id first, if valid, fill the details related to cause
+ * for diag buffer release.
+ */
+static long
+_ctl_addnl_diag_query(struct MPT3SAS_ADAPTER *ioc, void __user *arg)
+{
+ struct mpt3_addnl_diag_query karg;
+ u32 buffer_type = 0;
+ if (copy_from_user(&karg, arg, sizeof(karg))) {
+ pr_err("%s: failure at %s:%d/%s()!\n",
+ ioc->name, __FILE__, __LINE__, __func__);
+ return -EFAULT;
+ }
+ dctlprintk(ioc, ioc_info(ioc, "%s\n", __func__));
+ if (karg.unique_id == 0) {
+ ioc_err(ioc, "%s: unique_id is(0x%08x)\n",
+ __func__, karg.unique_id);
+ return -EPERM;
+ }
+ buffer_type = _ctl_diag_get_bufftype(ioc, karg.unique_id);
+ if (buffer_type == MPT3_DIAG_UID_NOT_FOUND) {
+ ioc_err(ioc, "%s: buffer with unique_id(0x%08x) not found\n",
+ __func__, karg.unique_id);
+ return -EPERM;
+ }
+ memset(&karg.buffer_rel_condition, 0, sizeof(struct htb_rel_query));
+ if ((ioc->diag_buffer_status[buffer_type] &
+ MPT3_DIAG_BUFFER_IS_REGISTERED) == 0) {
+ ioc_info(ioc, "%s: buffer_type(0x%02x) is not registered\n",
+ __func__, buffer_type);
+ goto out;
+ }
+ if ((ioc->diag_buffer_status[buffer_type] &
+ MPT3_DIAG_BUFFER_IS_RELEASED) == 0) {
+ ioc_err(ioc, "%s: buffer_type(0x%02x) is not released\n",
+ __func__, buffer_type);
+ return -EPERM;
+ }
+ memcpy(&karg.buffer_rel_condition, &ioc->htb_rel,
+ sizeof(struct htb_rel_query));
+out:
+ if (copy_to_user(arg, &karg, sizeof(struct mpt3_addnl_diag_query))) {
+ ioc_err(ioc, "%s: unable to write mpt3_addnl_diag_query data @ %p\n",
+ __func__, arg);
+ return -EFAULT;
+ }
+ return 0;
+}
#ifdef CONFIG_COMPAT
/**
@@ -2533,7 +2593,7 @@ _ctl_ioctl_main(struct file *file, unsigned int cmd, void __user *arg,
struct MPT3SAS_ADAPTER *ioc;
struct mpt3_ioctl_header ioctl_header;
enum block_state state;
- long ret = -EINVAL;
+ long ret = -ENOIOCTLCMD;
/* get IOCTL header */
if (copy_from_user(&ioctl_header, (char __user *)arg,
@@ -2643,6 +2703,10 @@ _ctl_ioctl_main(struct file *file, unsigned int cmd, void __user *arg,
if (_IOC_SIZE(cmd) == sizeof(struct mpt3_diag_read_buffer))
ret = _ctl_diag_read_buffer(ioc, arg);
break;
+ case MPT3ADDNLDIAGQUERY:
+ if (_IOC_SIZE(cmd) == sizeof(struct mpt3_addnl_diag_query))
+ ret = _ctl_addnl_diag_query(ioc, arg);
+ break;
default:
dctlprintk(ioc,
ioc_info(ioc, "unsupported ioctl opcode(0x%08x)\n",
@@ -3425,6 +3489,7 @@ host_trace_buffer_enable_store(struct device *cdev,
MPT3_DIAG_BUFFER_IS_RELEASED))
goto out;
ioc_info(ioc, "releasing host trace buffer\n");
+ ioc->htb_rel.buffer_rel_condition = MPT3_DIAG_BUFFER_REL_SYSFS;
mpt3sas_send_diag_release(ioc, MPI2_DIAG_BUF_TYPE_TRACE,
&issue_reset);
}
diff --git a/drivers/scsi/mpt3sas/mpt3sas_ctl.h b/drivers/scsi/mpt3sas/mpt3sas_ctl.h
index 0f7aa4ddade0..d2ccdafb8df2 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_ctl.h
+++ b/drivers/scsi/mpt3sas/mpt3sas_ctl.h
@@ -94,6 +94,8 @@
struct mpt3_diag_query)
#define MPT3DIAGREADBUFFER _IOWR(MPT3_MAGIC_NUMBER, 30, \
struct mpt3_diag_read_buffer)
+#define MPT3ADDNLDIAGQUERY _IOWR(MPT3_MAGIC_NUMBER, 32, \
+ struct mpt3_addnl_diag_query)
/* Trace Buffer default UniqueId */
#define MPT2DIAGBUFFUNIQUEID (0x07075900)
@@ -430,4 +432,24 @@ struct mpt3_diag_read_buffer {
uint32_t diagnostic_data[1];
};
+/**
+ * struct mpt3_addnl_diag_query - diagnostic buffer release reason
+ * @hdr - generic header
+ * @unique_id - unique id associated with this buffer.
+ * @buffer_rel_condition - Release condition ioctl/sysfs/reset
+ * @reserved1
+ * @trigger_type - Master/Event/scsi/MPI
+ * @trigger_info_dwords - Data Correspondig to trigger type
+ * @reserved2
+ */
+struct mpt3_addnl_diag_query {
+ struct mpt3_ioctl_header hdr;
+ uint32_t unique_id;
+ uint16_t buffer_rel_condition;
+ uint16_t reserved1;
+ uint32_t trigger_type;
+ uint32_t trigger_info_dwords[2];
+ uint32_t reserved2[2];
+};
+
#endif /* MPT3SAS_CTL_H_INCLUDED */
diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
index c8b09a81834d..ffca03064797 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
@@ -54,6 +54,7 @@
#include <linux/interrupt.h>
#include <linux/aer.h>
#include <linux/raid_class.h>
+#include <linux/blk-mq-pci.h>
#include <asm/unaligned.h>
#include "mpt3sas_base.h"
@@ -168,6 +169,11 @@ MODULE_PARM_DESC(multipath_on_hba,
"\t SAS 2.0 & SAS 3.0 HBA - This will be disabled,\n\t\t"
"\t SAS 3.5 HBA - This will be enabled)");
+static int host_tagset_enable = 1;
+module_param(host_tagset_enable, int, 0444);
+MODULE_PARM_DESC(host_tagset_enable,
+ "Shared host tagset enable/disable Default: enable(1)");
+
/* raid transport support */
static struct raid_template *mpt3sas_raid_template;
static struct raid_template *mpt2sas_raid_template;
@@ -1743,10 +1749,12 @@ mpt3sas_scsih_scsi_lookup_get(struct MPT3SAS_ADAPTER *ioc, u16 smid)
struct scsi_cmnd *scmd = NULL;
struct scsiio_tracker *st;
Mpi25SCSIIORequest_t *mpi_request;
+ u16 tag = smid - 1;
if (smid > 0 &&
smid <= ioc->scsiio_depth - INTERNAL_SCSIIO_CMDS_COUNT) {
- u32 unique_tag = smid - 1;
+ u32 unique_tag =
+ ioc->io_queue_num[tag] << BLK_MQ_UNIQUE_TAG_BITS | tag;
mpi_request = mpt3sas_base_get_msg_frame(ioc, smid);
@@ -11599,6 +11607,22 @@ scsih_scan_finished(struct Scsi_Host *shost, unsigned long time)
return 1;
}
+/**
+ * scsih_map_queues - map reply queues with request queues
+ * @shost: SCSI host pointer
+ */
+static int scsih_map_queues(struct Scsi_Host *shost)
+{
+ struct MPT3SAS_ADAPTER *ioc =
+ (struct MPT3SAS_ADAPTER *)shost->hostdata;
+
+ if (ioc->shost->nr_hw_queues == 1)
+ return 0;
+
+ return blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT],
+ ioc->pdev, ioc->high_iops_queues);
+}
+
/* shost template for SAS 2.0 HBA devices */
static struct scsi_host_template mpt2sas_driver_template = {
.module = THIS_MODULE,
@@ -11666,6 +11690,7 @@ static struct scsi_host_template mpt3sas_driver_template = {
.sdev_attrs = mpt3sas_dev_attrs,
.track_queue_depth = 1,
.cmd_size = sizeof(struct scsiio_tracker),
+ .map_queues = scsih_map_queues,
};
/* raid transport support for SAS 3.0 HBA devices */
@@ -11922,6 +11947,8 @@ _scsih_probe(struct pci_dev *pdev, const struct pci_device_id *id)
* Enable MEMORY MOVE support flag.
*/
ioc->drv_support_bitmap |= MPT_DRV_SUPPORT_BITMAP_MEMMOVE;
+ /* Enable ADDITIONAL QUERY support flag. */
+ ioc->drv_support_bitmap |= MPT_DRV_SUPPORT_BITMAP_ADDNLQUERY;
ioc->enable_sdev_max_qd = enable_sdev_max_qd;
@@ -12028,6 +12055,21 @@ _scsih_probe(struct pci_dev *pdev, const struct pci_device_id *id)
} else
ioc->hide_drives = 0;
+ shost->host_tagset = 0;
+ shost->nr_hw_queues = 1;
+
+ if (ioc->is_gen35_ioc && ioc->reply_queue_count > 1 &&
+ host_tagset_enable && ioc->smp_affinity_enable) {
+
+ shost->host_tagset = 1;
+ shost->nr_hw_queues =
+ ioc->reply_queue_count - ioc->high_iops_queues;
+
+ dev_info(&ioc->pdev->dev,
+ "Max SCSIIO MPT commands: %d shared with nr_hw_queues = %d\n",
+ shost->can_queue, shost->nr_hw_queues);
+ }
+
rv = scsi_add_host(shost, &pdev->dev);
if (rv) {
ioc_err(ioc, "failure at %s:%d/%s()!\n",
diff --git a/drivers/scsi/mpt3sas/mpt3sas_trigger_diag.c b/drivers/scsi/mpt3sas/mpt3sas_trigger_diag.c
index 8ec9bab20ec4..d9b7d0ee25b0 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_trigger_diag.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_trigger_diag.c
@@ -132,6 +132,35 @@ mpt3sas_process_trigger_data(struct MPT3SAS_ADAPTER *ioc,
&issue_reset);
}
+ ioc->htb_rel.buffer_rel_condition = MPT3_DIAG_BUFFER_REL_TRIGGER;
+ if (event_data) {
+ ioc->htb_rel.trigger_type = event_data->trigger_type;
+ switch (event_data->trigger_type) {
+ case MPT3SAS_TRIGGER_SCSI:
+ memcpy(&ioc->htb_rel.trigger_info_dwords,
+ &event_data->u.scsi,
+ sizeof(struct SL_WH_SCSI_TRIGGER_T));
+ break;
+ case MPT3SAS_TRIGGER_MPI:
+ memcpy(&ioc->htb_rel.trigger_info_dwords,
+ &event_data->u.mpi,
+ sizeof(struct SL_WH_MPI_TRIGGER_T));
+ break;
+ case MPT3SAS_TRIGGER_MASTER:
+ ioc->htb_rel.trigger_info_dwords[0] =
+ event_data->u.master.MasterData;
+ break;
+ case MPT3SAS_TRIGGER_EVENT:
+ memcpy(&ioc->htb_rel.trigger_info_dwords,
+ &event_data->u.event,
+ sizeof(struct SL_WH_EVENT_TRIGGER_T));
+ break;
+ default:
+ ioc_err(ioc, "%d - Is not a valid Trigger type\n",
+ event_data->trigger_type);
+ break;
+ }
+ }
_mpt3sas_raise_sigio(ioc, event_data);
dTriggerDiagPrintk(ioc, ioc_info(ioc, "%s: exit\n",
@@ -201,9 +230,14 @@ mpt3sas_trigger_master(struct MPT3SAS_ADAPTER *ioc, u32 trigger_bitmask)
event_data.u.master.MasterData = trigger_bitmask;
if (trigger_bitmask & MASTER_TRIGGER_FW_FAULT ||
- trigger_bitmask & MASTER_TRIGGER_ADAPTER_RESET)
+ trigger_bitmask & MASTER_TRIGGER_ADAPTER_RESET) {
+ ioc->htb_rel.trigger_type = MPT3SAS_TRIGGER_MASTER;
+ ioc->htb_rel.trigger_info_dwords[0] = trigger_bitmask;
+ if (ioc->reset_from_user)
+ ioc->htb_rel.trigger_info_dwords[1] =
+ MPT_DIAG_RESET_ISSUED_BY_USER;
_mpt3sas_raise_sigio(ioc, &event_data);
- else
+ } else
mpt3sas_send_trigger_data_event(ioc, &event_data);
out:
diff --git a/drivers/scsi/pmcraid.h b/drivers/scsi/pmcraid.h
index 15c962108075..6d36debde18e 100644
--- a/drivers/scsi/pmcraid.h
+++ b/drivers/scsi/pmcraid.h
@@ -244,7 +244,7 @@ struct pmcraid_ioarcb {
__u8 hrrq_id;
__u8 cdb[PMCRAID_MAX_CDB_LEN];
struct pmcraid_ioarcb_add_data add_data;
-} __attribute__((packed, aligned(PMCRAID_IOARCB_ALIGNMENT)));
+};
/* well known resource handle values */
#define PMCRAID_IOA_RES_HANDLE 0xffffffff
@@ -1040,8 +1040,8 @@ struct pmcraid_passthrough_ioctl_buffer {
struct pmcraid_ioctl_header ioctl_header;
struct pmcraid_ioarcb ioarcb;
struct pmcraid_ioasa ioasa;
- u8 request_buffer[1];
-} __attribute__ ((packed));
+ u8 request_buffer[];
+} __attribute__ ((packed, aligned(PMCRAID_IOARCB_ALIGNMENT)));
/*
* keys to differentiate between driver handled IOCTLs and passthrough
diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c
index 0d09480b66cd..c48daf52725d 100644
--- a/drivers/scsi/qla2xxx/qla_target.c
+++ b/drivers/scsi/qla2xxx/qla_target.c
@@ -981,8 +981,7 @@ void qlt_free_session_done(struct work_struct *work)
int rc;
if (!own ||
- (own &&
- (own->iocb.u.isp24.status_subcode == ELS_PLOGI))) {
+ (own->iocb.u.isp24.status_subcode == ELS_PLOGI)) {
rc = qla2x00_post_async_logout_work(vha, sess,
NULL);
if (rc != QLA_SUCCESS)
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index a4b014e1cd8c..7bd9a4a04ad5 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -841,7 +841,7 @@ static int __qla4xxx_is_chap_active(struct device *dev, void *data)
sess = cls_session->dd_data;
ddb_entry = sess->dd_data;
- if (iscsi_session_chkready(cls_session))
+ if (iscsi_is_session_online(cls_session))
goto exit_is_chap_active;
if (ddb_entry->chap_tbl_idx == *chap_tbl_idx)
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 2e68c0a87698..969d24d580e2 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -1701,10 +1701,8 @@ static const char *iscsi_session_state_name(int state)
int iscsi_session_chkready(struct iscsi_cls_session *session)
{
- unsigned long flags;
int err;
- spin_lock_irqsave(&session->lock, flags);
switch (session->state) {
case ISCSI_SESSION_LOGGED_IN:
err = 0;
@@ -1719,7 +1717,6 @@ int iscsi_session_chkready(struct iscsi_cls_session *session)
err = DID_NO_CONNECT << 16;
break;
}
- spin_unlock_irqrestore(&session->lock, flags);
return err;
}
EXPORT_SYMBOL_GPL(iscsi_session_chkready);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index a3d2d4bc4a3d..ed0b1bb99f08 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -707,9 +707,9 @@ static int sd_sec_submit(void *data, u16 spsp, u8 secp, void *buffer,
put_unaligned_be16(spsp, &cdb[2]);
put_unaligned_be32(len, &cdb[6]);
- ret = scsi_execute_req(sdev, cdb,
- send ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
- buffer, len, NULL, SD_TIMEOUT, sdkp->max_retries, NULL);
+ ret = scsi_execute(sdev, cdb, send ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
+ buffer, len, NULL, NULL, SD_TIMEOUT, sdkp->max_retries, 0,
+ RQF_PM, NULL);
return ret <= 0 ? ret : -EIO;
}
#endif /* CONFIG_BLK_SED_OPAL */
@@ -3379,10 +3379,12 @@ static int sd_probe(struct device *dev)
sdp->type != TYPE_RBC)
goto out;
-#ifndef CONFIG_BLK_DEV_ZONED
- if (sdp->type == TYPE_ZBC)
+ if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) && sdp->type == TYPE_ZBC) {
+ sdev_printk(KERN_WARNING, sdp,
+ "Unsupported ZBC host-managed device.\n");
goto out;
-#endif
+ }
+
SCSI_LOG_HLQUEUE(3, sdev_printk(KERN_INFO, sdp,
"sd_probe\n"));
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 03adb39293c2..ee558675eab4 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -704,6 +704,7 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp)
unsigned int nr_zones = sdkp->rev_nr_zones;
u32 max_append;
int ret = 0;
+ unsigned int flags;
/*
* For all zoned disks, initialize zone append emulation data if not
@@ -736,16 +737,19 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp)
disk->queue->nr_zones == nr_zones)
goto unlock;
+ flags = memalloc_noio_save();
sdkp->zone_blocks = zone_blocks;
sdkp->nr_zones = nr_zones;
- sdkp->rev_wp_offset = kvcalloc(nr_zones, sizeof(u32), GFP_NOIO);
+ sdkp->rev_wp_offset = kvcalloc(nr_zones, sizeof(u32), GFP_KERNEL);
if (!sdkp->rev_wp_offset) {
ret = -ENOMEM;
+ memalloc_noio_restore(flags);
goto unlock;
}
ret = blk_revalidate_disk_zones(disk, sd_zbc_revalidate_zones_cb);
+ memalloc_noio_restore(flags);
kvfree(sdkp->rev_wp_offset);
sdkp->rev_wp_offset = NULL;
diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 721f55db181f..77161750c9fb 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -451,6 +451,8 @@ static void ufshcd_print_evt(struct ufs_hba *hba, u32 id,
if (!found)
dev_err(hba->dev, "No record of %s\n", err_name);
+ else
+ dev_err(hba->dev, "%s: total cnt=%llu\n", err_name, e->cnt);
}
static void ufshcd_print_evt_hist(struct ufs_hba *hba)
@@ -1866,7 +1868,7 @@ static ssize_t ufshcd_clkgate_delay_show(struct device *dev,
{
struct ufs_hba *hba = dev_get_drvdata(dev);
- return snprintf(buf, PAGE_SIZE, "%lu\n", hba->clk_gating.delay_ms);
+ return sysfs_emit(buf, "%lu\n", hba->clk_gating.delay_ms);
}
static ssize_t ufshcd_clkgate_delay_store(struct device *dev,
@@ -1889,7 +1891,7 @@ static ssize_t ufshcd_clkgate_enable_show(struct device *dev,
{
struct ufs_hba *hba = dev_get_drvdata(dev);
- return snprintf(buf, PAGE_SIZE, "%d\n", hba->clk_gating.is_enabled);
+ return sysfs_emit(buf, "%d\n", hba->clk_gating.is_enabled);
}
static ssize_t ufshcd_clkgate_enable_store(struct device *dev,
diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index ee61f821f75d..18e56c1c1b30 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -570,7 +570,7 @@ enum ufshcd_quirks {
/*
* This quirk allows only sg entries aligned with page size.
*/
- UFSHCD_QUIRK_ALIGN_SG_WITH_PAGE_SIZE = 1 << 13,
+ UFSHCD_QUIRK_ALIGN_SG_WITH_PAGE_SIZE = 1 << 14,
};
enum ufshcd_caps {
diff --git a/drivers/staging/vme/devices/vme_user.c b/drivers/staging/vme/devices/vme_user.c
index 35d7260e2271..e3fa38bd7f12 100644
--- a/drivers/staging/vme/devices/vme_user.c
+++ b/drivers/staging/vme/devices/vme_user.c
@@ -175,7 +175,7 @@ static ssize_t buffer_from_user(unsigned int minor, const char __user *buf,
static ssize_t vme_user_read(struct file *file, char __user *buf, size_t count,
loff_t *ppos)
{
- unsigned int minor = MINOR(file_inode(file)->i_rdev);
+ unsigned int minor = iminor(file_inode(file));
ssize_t retval;
size_t image_size;
@@ -218,7 +218,7 @@ static ssize_t vme_user_read(struct file *file, char __user *buf, size_t count,
static ssize_t vme_user_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- unsigned int minor = MINOR(file_inode(file)->i_rdev);
+ unsigned int minor = iminor(file_inode(file));
ssize_t retval;
size_t image_size;
@@ -260,7 +260,7 @@ static ssize_t vme_user_write(struct file *file, const char __user *buf,
static loff_t vme_user_llseek(struct file *file, loff_t off, int whence)
{
- unsigned int minor = MINOR(file_inode(file)->i_rdev);
+ unsigned int minor = iminor(file_inode(file));
size_t image_size;
loff_t res;
@@ -294,7 +294,7 @@ static int vme_user_ioctl(struct inode *inode, struct file *file,
struct vme_slave slave;
struct vme_irq_id irq_req;
unsigned long copied;
- unsigned int minor = MINOR(inode->i_rdev);
+ unsigned int minor = iminor(inode);
int retval;
dma_addr_t pci_addr;
void __user *argp = (void __user *)arg;
@@ -412,7 +412,7 @@ vme_user_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
int ret;
struct inode *inode = file_inode(file);
- unsigned int minor = MINOR(inode->i_rdev);
+ unsigned int minor = iminor(inode);
mutex_lock(&image[minor].mutex);
ret = vme_user_ioctl(inode, file, cmd, arg);
@@ -481,7 +481,7 @@ static int vme_user_master_mmap(unsigned int minor, struct vm_area_struct *vma)
static int vme_user_mmap(struct file *file, struct vm_area_struct *vma)
{
- unsigned int minor = MINOR(file_inode(file)->i_rdev);
+ unsigned int minor = iminor(file_inode(file));
if (type[minor] == MASTER_MINOR)
return vme_user_master_mmap(minor, vma);
diff --git a/drivers/target/sbp/sbp_target.c b/drivers/target/sbp/sbp_target.c
index e4a9b9fe3dfb..2a6165febd3b 100644
--- a/drivers/target/sbp/sbp_target.c
+++ b/drivers/target/sbp/sbp_target.c
@@ -1006,7 +1006,7 @@ static void tgt_agent_fetch_work(struct work_struct *work)
agent->state = AGENT_STATE_SUSPENDED;
spin_unlock_bh(&agent->lock);
- };
+ }
}
static struct sbp_target_agent *sbp_target_agent_register(
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 8ed93fd205c7..ee3d52061281 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -315,10 +315,8 @@ iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num, int op,
* Only allocate as many vector entries as the bio code allows us to,
* we'll loop later on until we have handled the whole request.
*/
- if (sg_num > BIO_MAX_PAGES)
- sg_num = BIO_MAX_PAGES;
-
- bio = bio_alloc_bioset(GFP_NOIO, sg_num, &ib_dev->ibd_bio_set);
+ bio = bio_alloc_bioset(GFP_NOIO, bio_max_segs(sg_num),
+ &ib_dev->ibd_bio_set);
if (!bio) {
pr_err("Unable to allocate memory for bio\n");
return NULL;
@@ -638,8 +636,7 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio,
return -ENODEV;
}
- bip = bio_integrity_alloc(bio, GFP_NOIO,
- min_t(unsigned int, cmd->t_prot_nents, BIO_MAX_PAGES));
+ bip = bio_integrity_alloc(bio, GFP_NOIO, bio_max_segs(cmd->t_prot_nents));
if (IS_ERR(bip)) {
pr_err("Unable to allocate bio_integrity_payload\n");
return PTR_ERR(bip);
diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c
index 14db5e568f22..d4cc43afe05b 100644
--- a/drivers/target/target_core_pr.c
+++ b/drivers/target/target_core_pr.c
@@ -3739,6 +3739,7 @@ core_scsi3_pri_read_keys(struct se_cmd *cmd)
spin_unlock(&dev->t10_pr.registration_lock);
put_unaligned_be32(add_len, &buf[4]);
+ target_set_cmd_data_length(cmd, 8 + add_len);
transport_kunmap_data_sg(cmd);
@@ -3757,7 +3758,7 @@ core_scsi3_pri_read_reservation(struct se_cmd *cmd)
struct t10_pr_registration *pr_reg;
unsigned char *buf;
u64 pr_res_key;
- u32 add_len = 16; /* Hardcoded to 16 when a reservation is held. */
+ u32 add_len = 0;
if (cmd->data_length < 8) {
pr_err("PRIN SA READ_RESERVATIONS SCSI Data Length: %u"
@@ -3775,8 +3776,9 @@ core_scsi3_pri_read_reservation(struct se_cmd *cmd)
pr_reg = dev->dev_pr_res_holder;
if (pr_reg) {
/*
- * Set the hardcoded Additional Length
+ * Set the Additional Length to 16 when a reservation is held
*/
+ add_len = 16;
put_unaligned_be32(add_len, &buf[4]);
if (cmd->data_length < 22)
@@ -3812,6 +3814,8 @@ core_scsi3_pri_read_reservation(struct se_cmd *cmd)
(pr_reg->pr_res_type & 0x0f);
}
+ target_set_cmd_data_length(cmd, 8 + add_len);
+
err:
spin_unlock(&dev->dev_reservation_lock);
transport_kunmap_data_sg(cmd);
@@ -3830,7 +3834,7 @@ core_scsi3_pri_report_capabilities(struct se_cmd *cmd)
struct se_device *dev = cmd->se_dev;
struct t10_reservation *pr_tmpl = &dev->t10_pr;
unsigned char *buf;
- u16 add_len = 8; /* Hardcoded to 8. */
+ u16 len = 8; /* Hardcoded to 8. */
if (cmd->data_length < 6) {
pr_err("PRIN SA REPORT_CAPABILITIES SCSI Data Length:"
@@ -3842,7 +3846,7 @@ core_scsi3_pri_report_capabilities(struct se_cmd *cmd)
if (!buf)
return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
- put_unaligned_be16(add_len, &buf[0]);
+ put_unaligned_be16(len, &buf[0]);
buf[2] |= 0x10; /* CRH: Compatible Reservation Hanlding bit. */
buf[2] |= 0x08; /* SIP_C: Specify Initiator Ports Capable bit */
buf[2] |= 0x04; /* ATP_C: All Target Ports Capable bit */
@@ -3871,6 +3875,8 @@ core_scsi3_pri_report_capabilities(struct se_cmd *cmd)
buf[4] |= 0x02; /* PR_TYPE_WRITE_EXCLUSIVE */
buf[5] |= 0x01; /* PR_TYPE_EXCLUSIVE_ACCESS_ALLREG */
+ target_set_cmd_data_length(cmd, len);
+
transport_kunmap_data_sg(cmd);
return 0;
@@ -4031,6 +4037,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd)
* Set ADDITIONAL_LENGTH
*/
put_unaligned_be32(add_len, &buf[4]);
+ target_set_cmd_data_length(cmd, 8 + add_len);
transport_kunmap_data_sg(cmd);
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 33770e5808ce..3cbc074992bc 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -881,7 +881,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
if (!bio) {
new_bio:
- nr_vecs = min_t(int, BIO_MAX_PAGES, nr_pages);
+ nr_vecs = bio_max_segs(nr_pages);
nr_pages -= nr_vecs;
/*
* Calls bio_kmalloc() and sets bio->bi_end_io()
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 93ea17cbad79..5ecb9f18a53d 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -879,11 +879,9 @@ void target_complete_cmd(struct se_cmd *cmd, u8 scsi_status)
}
EXPORT_SYMBOL(target_complete_cmd);
-void target_complete_cmd_with_length(struct se_cmd *cmd, u8 scsi_status, int length)
+void target_set_cmd_data_length(struct se_cmd *cmd, int length)
{
- if ((scsi_status == SAM_STAT_GOOD ||
- cmd->se_cmd_flags & SCF_TREAT_READ_AS_NORMAL) &&
- length < cmd->data_length) {
+ if (length < cmd->data_length) {
if (cmd->se_cmd_flags & SCF_UNDERFLOW_BIT) {
cmd->residual_count += cmd->data_length - length;
} else {
@@ -893,6 +891,15 @@ void target_complete_cmd_with_length(struct se_cmd *cmd, u8 scsi_status, int len
cmd->data_length = length;
}
+}
+EXPORT_SYMBOL(target_set_cmd_data_length);
+
+void target_complete_cmd_with_length(struct se_cmd *cmd, u8 scsi_status, int length)
+{
+ if (scsi_status == SAM_STAT_GOOD ||
+ cmd->se_cmd_flags & SCF_TREAT_READ_AS_NORMAL) {
+ target_set_cmd_data_length(cmd, length);
+ }
target_complete_cmd(cmd, scsi_status);
}
diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index a5991df23581..bf73cd5f4b04 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -1566,6 +1566,88 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
return &udev->se_dev;
}
+static void tcmu_dev_call_rcu(struct rcu_head *p)
+{
+ struct se_device *dev = container_of(p, struct se_device, rcu_head);
+ struct tcmu_dev *udev = TCMU_DEV(dev);
+
+ kfree(udev->uio_info.name);
+ kfree(udev->name);
+ kfree(udev);
+}
+
+static int tcmu_check_and_free_pending_cmd(struct tcmu_cmd *cmd)
+{
+ if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) {
+ kmem_cache_free(tcmu_cmd_cache, cmd);
+ return 0;
+ }
+ return -EINVAL;
+}
+
+static void tcmu_blocks_release(struct radix_tree_root *blocks,
+ int start, int end)
+{
+ int i;
+ struct page *page;
+
+ for (i = start; i < end; i++) {
+ page = radix_tree_delete(blocks, i);
+ if (page) {
+ __free_page(page);
+ atomic_dec(&global_db_count);
+ }
+ }
+}
+
+static void tcmu_remove_all_queued_tmr(struct tcmu_dev *udev)
+{
+ struct tcmu_tmr *tmr, *tmp;
+
+ list_for_each_entry_safe(tmr, tmp, &udev->tmr_queue, queue_entry) {
+ list_del_init(&tmr->queue_entry);
+ kfree(tmr);
+ }
+}
+
+static void tcmu_dev_kref_release(struct kref *kref)
+{
+ struct tcmu_dev *udev = container_of(kref, struct tcmu_dev, kref);
+ struct se_device *dev = &udev->se_dev;
+ struct tcmu_cmd *cmd;
+ bool all_expired = true;
+ int i;
+
+ vfree(udev->mb_addr);
+ udev->mb_addr = NULL;
+
+ spin_lock_bh(&timed_out_udevs_lock);
+ if (!list_empty(&udev->timedout_entry))
+ list_del(&udev->timedout_entry);
+ spin_unlock_bh(&timed_out_udevs_lock);
+
+ /* Upper layer should drain all requests before calling this */
+ mutex_lock(&udev->cmdr_lock);
+ idr_for_each_entry(&udev->commands, cmd, i) {
+ if (tcmu_check_and_free_pending_cmd(cmd) != 0)
+ all_expired = false;
+ }
+ /* There can be left over TMR cmds. Remove them. */
+ tcmu_remove_all_queued_tmr(udev);
+ if (!list_empty(&udev->qfull_queue))
+ all_expired = false;
+ idr_destroy(&udev->commands);
+ WARN_ON(!all_expired);
+
+ tcmu_blocks_release(&udev->data_blocks, 0, udev->dbi_max + 1);
+ bitmap_free(udev->data_bitmap);
+ mutex_unlock(&udev->cmdr_lock);
+
+ pr_debug("dev_kref_release\n");
+
+ call_rcu(&dev->rcu_head, tcmu_dev_call_rcu);
+}
+
static void run_qfull_queue(struct tcmu_dev *udev, bool fail)
{
struct tcmu_cmd *tcmu_cmd, *tmp_cmd;
@@ -1678,6 +1760,25 @@ static struct page *tcmu_try_get_block_page(struct tcmu_dev *udev, uint32_t dbi)
return page;
}
+static void tcmu_vma_open(struct vm_area_struct *vma)
+{
+ struct tcmu_dev *udev = vma->vm_private_data;
+
+ pr_debug("vma_open\n");
+
+ kref_get(&udev->kref);
+}
+
+static void tcmu_vma_close(struct vm_area_struct *vma)
+{
+ struct tcmu_dev *udev = vma->vm_private_data;
+
+ pr_debug("vma_close\n");
+
+ /* release ref from tcmu_vma_open */
+ kref_put(&udev->kref, tcmu_dev_kref_release);
+}
+
static vm_fault_t tcmu_vma_fault(struct vm_fault *vmf)
{
struct tcmu_dev *udev = vmf->vma->vm_private_data;
@@ -1716,6 +1817,8 @@ static vm_fault_t tcmu_vma_fault(struct vm_fault *vmf)
}
static const struct vm_operations_struct tcmu_vm_ops = {
+ .open = tcmu_vma_open,
+ .close = tcmu_vma_close,
.fault = tcmu_vma_fault,
};
@@ -1732,6 +1835,8 @@ static int tcmu_mmap(struct uio_info *info, struct vm_area_struct *vma)
if (vma_pages(vma) != (udev->ring_size >> PAGE_SHIFT))
return -EINVAL;
+ tcmu_vma_open(vma);
+
return 0;
}
@@ -1744,93 +1849,12 @@ static int tcmu_open(struct uio_info *info, struct inode *inode)
return -EBUSY;
udev->inode = inode;
- kref_get(&udev->kref);
pr_debug("open\n");
return 0;
}
-static void tcmu_dev_call_rcu(struct rcu_head *p)
-{
- struct se_device *dev = container_of(p, struct se_device, rcu_head);
- struct tcmu_dev *udev = TCMU_DEV(dev);
-
- kfree(udev->uio_info.name);
- kfree(udev->name);
- kfree(udev);
-}
-
-static int tcmu_check_and_free_pending_cmd(struct tcmu_cmd *cmd)
-{
- if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) {
- kmem_cache_free(tcmu_cmd_cache, cmd);
- return 0;
- }
- return -EINVAL;
-}
-
-static void tcmu_blocks_release(struct radix_tree_root *blocks,
- int start, int end)
-{
- int i;
- struct page *page;
-
- for (i = start; i < end; i++) {
- page = radix_tree_delete(blocks, i);
- if (page) {
- __free_page(page);
- atomic_dec(&global_db_count);
- }
- }
-}
-
-static void tcmu_remove_all_queued_tmr(struct tcmu_dev *udev)
-{
- struct tcmu_tmr *tmr, *tmp;
-
- list_for_each_entry_safe(tmr, tmp, &udev->tmr_queue, queue_entry) {
- list_del_init(&tmr->queue_entry);
- kfree(tmr);
- }
-}
-
-static void tcmu_dev_kref_release(struct kref *kref)
-{
- struct tcmu_dev *udev = container_of(kref, struct tcmu_dev, kref);
- struct se_device *dev = &udev->se_dev;
- struct tcmu_cmd *cmd;
- bool all_expired = true;
- int i;
-
- vfree(udev->mb_addr);
- udev->mb_addr = NULL;
-
- spin_lock_bh(&timed_out_udevs_lock);
- if (!list_empty(&udev->timedout_entry))
- list_del(&udev->timedout_entry);
- spin_unlock_bh(&timed_out_udevs_lock);
-
- /* Upper layer should drain all requests before calling this */
- mutex_lock(&udev->cmdr_lock);
- idr_for_each_entry(&udev->commands, cmd, i) {
- if (tcmu_check_and_free_pending_cmd(cmd) != 0)
- all_expired = false;
- }
- /* There can be left over TMR cmds. Remove them. */
- tcmu_remove_all_queued_tmr(udev);
- if (!list_empty(&udev->qfull_queue))
- all_expired = false;
- idr_destroy(&udev->commands);
- WARN_ON(!all_expired);
-
- tcmu_blocks_release(&udev->data_blocks, 0, udev->dbi_max + 1);
- bitmap_free(udev->data_bitmap);
- mutex_unlock(&udev->cmdr_lock);
-
- call_rcu(&dev->rcu_head, tcmu_dev_call_rcu);
-}
-
static int tcmu_release(struct uio_info *info, struct inode *inode)
{
struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info);
@@ -1838,8 +1862,7 @@ static int tcmu_release(struct uio_info *info, struct inode *inode)
clear_bit(TCMU_DEV_BIT_OPEN, &udev->flags);
pr_debug("close\n");
- /* release ref from open */
- kref_put(&udev->kref, tcmu_dev_kref_release);
+
return 0;
}
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 648eb4c4cf7f..8d97f0b45e9c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1139,9 +1139,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
struct super_block *sb, unsigned int flags)
{
umode_t mode;
- char ext[32];
- char tag_name[14];
- unsigned int i_nlink;
struct v9fs_session_info *v9ses = sb->s_fs_info;
struct v9fs_inode *v9inode = V9FS_I(inode);
@@ -1159,18 +1156,18 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
inode->i_gid = stat->n_gid;
}
if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) {
- if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) {
+ if (v9fs_proto_dotu(v9ses)) {
+ unsigned int i_nlink;
/*
- * Hadlink support got added later to
- * to the .u extension. So there can be
- * server out there that doesn't support
- * this even with .u extension. So check
- * for non NULL stat->extension
+ * Hadlink support got added later to the .u extension.
+ * So there can be a server out there that doesn't
+ * support this even with .u extension. That would
+ * just leave us with stat->extension being an empty
+ * string, though.
*/
- strlcpy(ext, stat->extension, sizeof(ext));
/* HARDLINKCOUNT %u */
- sscanf(ext, "%13s %u", tag_name, &i_nlink);
- if (!strncmp(tag_name, "HARDLINKCOUNT", 13))
+ if (sscanf(stat->extension,
+ " HARDLINKCOUNT %u", &i_nlink) == 1)
set_nlink(inode, i_nlink);
}
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 91ff9bfd7c1a..4aa1f88d5bf8 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -221,7 +221,7 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
static ssize_t
__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
- int nr_pages)
+ unsigned int nr_pages)
{
struct file *file = iocb->ki_filp;
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
@@ -355,8 +355,8 @@ static void blkdev_bio_end_io(struct bio *bio)
}
}
-static ssize_t
-__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
+static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ unsigned int nr_pages)
{
struct file *file = iocb->ki_filp;
struct inode *inode = bdev_file_inode(file);
@@ -486,7 +486,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
static ssize_t
blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
- int nr_pages;
+ unsigned int nr_pages;
if (!iov_iter_count(iter))
return 0;
@@ -495,7 +495,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
- return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES));
+ return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
}
static __init int blkdev_init(void)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 0957e1bb8eb2..b61491bf3166 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -695,7 +695,7 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
if (ret)
goto out;
sector = start_sector << (sdio->blkbits - 9);
- nr_pages = min(sdio->pages_in_io, BIO_MAX_PAGES);
+ nr_pages = bio_max_segs(sdio->pages_in_io);
BUG_ON(nr_pages <= 0);
dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
sdio->boundary = 0;
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index ea4f693bee22..f88851c5c250 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -215,10 +215,8 @@ submit_bio_retry:
/* max # of continuous pages */
if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE))
nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE);
- if (nblocks > BIO_MAX_PAGES)
- nblocks = BIO_MAX_PAGES;
- bio = bio_alloc(GFP_NOIO, nblocks);
+ bio = bio_alloc(GFP_NOIO, bio_max_segs(nblocks));
bio->bi_end_io = erofs_readendio;
bio_set_dev(bio, sb->s_bdev);
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index f014c5e473a9..3db923403505 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -371,8 +371,7 @@ int ext4_mpage_readpages(struct inode *inode,
* bio_alloc will _always_ be able to allocate a bio if
* __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
*/
- bio = bio_alloc(GFP_KERNEL,
- min_t(int, nr_pages, BIO_MAX_PAGES));
+ bio = bio_alloc(GFP_KERNEL, bio_max_segs(nr_pages));
fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
GFP_KERNEL);
ext4_set_bio_post_read_ctx(bio, inode, page->index);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index b9721c8f116c..7c95818639a6 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -969,8 +969,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
unsigned int post_read_steps = 0;
bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL,
- min_t(int, nr_pages, BIO_MAX_PAGES),
- &f2fs_bioset);
+ bio_max_segs(nr_pages), &f2fs_bioset);
if (!bio)
return ERR_PTR(-ENOMEM);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index a8a0fb890e8d..4b0e2e3c2c88 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -2747,7 +2747,7 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
sum_entry = &sum->entries[0];
for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
- nrpages = min(last_offset - i, BIO_MAX_PAGES);
+ nrpages = bio_max_segs(last_offset - i);
/* readahead node pages */
f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 68376fa03880..c9775d5c6594 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -491,8 +491,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
di = (struct gfs2_dinode *)dibh->b_data;
gfs2_dinode_out(ip, di);
- di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev));
- di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev));
+ di->di_major = cpu_to_be32(imajor(&ip->i_inode));
+ di->di_minor = cpu_to_be32(iminor(&ip->i_inode));
di->__pad1 = 0;
di->__pad2 = 0;
di->__pad3 = 0;
diff --git a/fs/inode.c b/fs/inode.c
index 6dba963d3f6d..a047ab306f9a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -142,6 +142,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
atomic_set(&inode->i_count, 1);
inode->i_op = &empty_iops;
inode->i_fop = &no_open_fops;
+ inode->i_ino = 0;
inode->__i_nlink = 1;
inode->i_opflags = 0;
if (sb->s_xattr)
diff --git a/fs/io-wq.c b/fs/io-wq.c
index c36bbcd823ce..44e20248805a 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -13,13 +13,9 @@
#include <linux/sched/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
-#include <linux/kthread.h>
#include <linux/rculist_nulls.h>
-#include <linux/fs_struct.h>
-#include <linux/task_work.h>
-#include <linux/blk-cgroup.h>
-#include <linux/audit.h>
#include <linux/cpu.h>
+#include <linux/tracehook.h>
#include "../kernel/sched/sched.h"
#include "io-wq.h"
@@ -36,7 +32,6 @@ enum {
enum {
IO_WQ_BIT_EXIT = 0, /* wq exiting */
- IO_WQ_BIT_ERROR = 1, /* error on setup */
};
enum {
@@ -57,14 +52,12 @@ struct io_worker {
struct io_wq_work *cur_work;
spinlock_t lock;
- struct rcu_head rcu;
- struct mm_struct *mm;
-#ifdef CONFIG_BLK_CGROUP
- struct cgroup_subsys_state *blkcg_css;
-#endif
const struct cred *cur_creds;
const struct cred *saved_creds;
- struct nsproxy *restore_nsproxy;
+
+ struct completion ref_done;
+
+ struct rcu_head rcu;
};
#if BITS_PER_LONG == 64
@@ -93,7 +86,6 @@ struct io_wqe {
struct {
raw_spinlock_t lock;
struct io_wq_work_list work_list;
- unsigned long hash_map;
unsigned flags;
} ____cacheline_aligned_in_smp;
@@ -103,6 +95,8 @@ struct io_wqe {
struct hlist_nulls_head free_list;
struct list_head all_list;
+ struct wait_queue_entry wait;
+
struct io_wq *wq;
struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
};
@@ -119,12 +113,15 @@ struct io_wq {
struct task_struct *manager;
struct user_struct *user;
+
+ struct io_wq_hash *hash;
+
refcount_t refs;
struct completion done;
struct hlist_node cpuhp_node;
- refcount_t use_refs;
+ pid_t task_pid;
};
static enum cpuhp_state io_wq_online;
@@ -137,62 +134,7 @@ static bool io_worker_get(struct io_worker *worker)
static void io_worker_release(struct io_worker *worker)
{
if (refcount_dec_and_test(&worker->ref))
- wake_up_process(worker->task);
-}
-
-/*
- * Note: drops the wqe->lock if returning true! The caller must re-acquire
- * the lock in that case. Some callers need to restart handling if this
- * happens, so we can't just re-acquire the lock on behalf of the caller.
- */
-static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
-{
- bool dropped_lock = false;
-
- if (worker->saved_creds) {
- revert_creds(worker->saved_creds);
- worker->cur_creds = worker->saved_creds = NULL;
- }
-
- if (current->files) {
- __acquire(&wqe->lock);
- raw_spin_unlock_irq(&wqe->lock);
- dropped_lock = true;
-
- task_lock(current);
- current->files = NULL;
- current->nsproxy = worker->restore_nsproxy;
- task_unlock(current);
- }
-
- if (current->fs)
- current->fs = NULL;
-
- /*
- * If we have an active mm, we need to drop the wq lock before unusing
- * it. If we do, return true and let the caller retry the idle loop.
- */
- if (worker->mm) {
- if (!dropped_lock) {
- __acquire(&wqe->lock);
- raw_spin_unlock_irq(&wqe->lock);
- dropped_lock = true;
- }
- __set_current_state(TASK_RUNNING);
- kthread_unuse_mm(worker->mm);
- mmput(worker->mm);
- worker->mm = NULL;
- }
-
-#ifdef CONFIG_BLK_CGROUP
- if (worker->blkcg_css) {
- kthread_associate_blkcg(NULL);
- worker->blkcg_css = NULL;
- }
-#endif
- if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
- return dropped_lock;
+ complete(&worker->ref_done);
}
static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
@@ -204,9 +146,10 @@ static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
return &wqe->acct[IO_WQ_ACCT_BOUND];
}
-static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe,
- struct io_worker *worker)
+static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker)
{
+ struct io_wqe *wqe = worker->wqe;
+
if (worker->flags & IO_WORKER_F_BOUND)
return &wqe->acct[IO_WQ_ACCT_BOUND];
@@ -216,39 +159,36 @@ static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe,
static void io_worker_exit(struct io_worker *worker)
{
struct io_wqe *wqe = worker->wqe;
- struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
+ struct io_wqe_acct *acct = io_wqe_get_acct(worker);
+ unsigned flags;
- /*
- * If we're not at zero, someone else is holding a brief reference
- * to the worker. Wait for that to go away.
- */
- set_current_state(TASK_INTERRUPTIBLE);
- if (!refcount_dec_and_test(&worker->ref))
- schedule();
- __set_current_state(TASK_RUNNING);
+ if (refcount_dec_and_test(&worker->ref))
+ complete(&worker->ref_done);
+ wait_for_completion(&worker->ref_done);
preempt_disable();
current->flags &= ~PF_IO_WORKER;
- if (worker->flags & IO_WORKER_F_RUNNING)
+ flags = worker->flags;
+ worker->flags = 0;
+ if (flags & IO_WORKER_F_RUNNING)
atomic_dec(&acct->nr_running);
- if (!(worker->flags & IO_WORKER_F_BOUND))
- atomic_dec(&wqe->wq->user->processes);
worker->flags = 0;
preempt_enable();
+ if (worker->saved_creds) {
+ revert_creds(worker->saved_creds);
+ worker->cur_creds = worker->saved_creds = NULL;
+ }
+
raw_spin_lock_irq(&wqe->lock);
- hlist_nulls_del_rcu(&worker->nulls_node);
+ if (flags & IO_WORKER_F_FREE)
+ hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
- if (__io_worker_unuse(wqe, worker)) {
- __release(&wqe->lock);
- raw_spin_lock_irq(&wqe->lock);
- }
acct->nr_workers--;
raw_spin_unlock_irq(&wqe->lock);
kfree_rcu(worker, rcu);
- if (refcount_dec_and_test(&wqe->wq->refs))
- complete(&wqe->wq->done);
+ io_wq_put(wqe->wq);
}
static inline bool io_wqe_run_queue(struct io_wqe *wqe)
@@ -306,33 +246,27 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
wake_up_process(wqe->wq->manager);
}
-static void io_wqe_inc_running(struct io_wqe *wqe, struct io_worker *worker)
+static void io_wqe_inc_running(struct io_worker *worker)
{
- struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
+ struct io_wqe_acct *acct = io_wqe_get_acct(worker);
atomic_inc(&acct->nr_running);
}
-static void io_wqe_dec_running(struct io_wqe *wqe, struct io_worker *worker)
+static void io_wqe_dec_running(struct io_worker *worker)
__must_hold(wqe->lock)
{
- struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
+ struct io_wqe_acct *acct = io_wqe_get_acct(worker);
+ struct io_wqe *wqe = worker->wqe;
if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe))
io_wqe_wake_worker(wqe, acct);
}
-static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
+static void io_worker_start(struct io_worker *worker)
{
- allow_kernel_signal(SIGINT);
-
- current->flags |= PF_IO_WORKER;
- current->fs = NULL;
- current->files = NULL;
-
worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
- worker->restore_nsproxy = current->nsproxy;
- io_wqe_inc_running(wqe, worker);
+ io_wqe_inc_running(worker);
}
/*
@@ -357,19 +291,17 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
if (worker_bound != work_bound) {
- io_wqe_dec_running(wqe, worker);
+ io_wqe_dec_running(worker);
if (work_bound) {
worker->flags |= IO_WORKER_F_BOUND;
wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
- atomic_dec(&wqe->wq->user->processes);
} else {
worker->flags &= ~IO_WORKER_F_BOUND;
wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--;
- atomic_inc(&wqe->wq->user->processes);
}
- io_wqe_inc_running(wqe, worker);
+ io_wqe_inc_running(worker);
}
}
@@ -380,15 +312,17 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
* retry the loop in that case (we changed task state), we don't regrab
* the lock if we return success.
*/
-static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
+static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
__must_hold(wqe->lock)
{
if (!(worker->flags & IO_WORKER_F_FREE)) {
worker->flags |= IO_WORKER_F_FREE;
hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
}
-
- return __io_worker_unuse(wqe, worker);
+ if (worker->saved_creds) {
+ revert_creds(worker->saved_creds);
+ worker->cur_creds = worker->saved_creds = NULL;
+ }
}
static inline unsigned int io_get_work_hash(struct io_wq_work *work)
@@ -396,14 +330,31 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work)
return work->flags >> IO_WQ_HASH_SHIFT;
}
+static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
+{
+ struct io_wq *wq = wqe->wq;
+
+ spin_lock(&wq->hash->wait.lock);
+ if (list_empty(&wqe->wait.entry)) {
+ __add_wait_queue(&wq->hash->wait, &wqe->wait);
+ if (!test_bit(hash, &wq->hash->map)) {
+ __set_current_state(TASK_RUNNING);
+ list_del_init(&wqe->wait.entry);
+ }
+ }
+ spin_unlock(&wq->hash->wait.lock);
+}
+
static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
__must_hold(wqe->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work, *tail;
- unsigned int hash;
+ unsigned int stall_hash = -1U;
wq_list_for_each(node, prev, &wqe->work_list) {
+ unsigned int hash;
+
work = container_of(node, struct io_wq_work, list);
/* not hashed, can run anytime */
@@ -412,111 +363,60 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
return work;
}
- /* hashed, can run if not already running */
hash = io_get_work_hash(work);
- if (!(wqe->hash_map & BIT(hash))) {
- wqe->hash_map |= BIT(hash);
- /* all items with this hash lie in [work, tail] */
- tail = wqe->hash_tail[hash];
+ /* all items with this hash lie in [work, tail] */
+ tail = wqe->hash_tail[hash];
+
+ /* hashed, can run if not already running */
+ if (!test_and_set_bit(hash, &wqe->wq->hash->map)) {
wqe->hash_tail[hash] = NULL;
wq_list_cut(&wqe->work_list, &tail->list, prev);
return work;
}
+ if (stall_hash == -1U)
+ stall_hash = hash;
+ /* fast forward to a next hash, for-each will fix up @prev */
+ node = &tail->list;
}
- return NULL;
-}
-
-static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
-{
- if (worker->mm) {
- kthread_unuse_mm(worker->mm);
- mmput(worker->mm);
- worker->mm = NULL;
+ if (stall_hash != -1U) {
+ raw_spin_unlock(&wqe->lock);
+ io_wait_on_hash(wqe, stall_hash);
+ raw_spin_lock(&wqe->lock);
}
- if (mmget_not_zero(work->identity->mm)) {
- kthread_use_mm(work->identity->mm);
- worker->mm = work->identity->mm;
- return;
- }
-
- /* failed grabbing mm, ensure work gets cancelled */
- work->flags |= IO_WQ_WORK_CANCEL;
+ return NULL;
}
-static inline void io_wq_switch_blkcg(struct io_worker *worker,
- struct io_wq_work *work)
+static void io_flush_signals(void)
{
-#ifdef CONFIG_BLK_CGROUP
- if (!(work->flags & IO_WQ_WORK_BLKCG))
- return;
- if (work->identity->blkcg_css != worker->blkcg_css) {
- kthread_associate_blkcg(work->identity->blkcg_css);
- worker->blkcg_css = work->identity->blkcg_css;
+ if (unlikely(test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))) {
+ if (current->task_works)
+ task_work_run();
+ clear_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL);
}
-#endif
}
static void io_wq_switch_creds(struct io_worker *worker,
struct io_wq_work *work)
{
- const struct cred *old_creds = override_creds(work->identity->creds);
+ const struct cred *old_creds = override_creds(work->creds);
- worker->cur_creds = work->identity->creds;
+ worker->cur_creds = work->creds;
if (worker->saved_creds)
put_cred(old_creds); /* creds set by previous switch */
else
worker->saved_creds = old_creds;
}
-static void io_impersonate_work(struct io_worker *worker,
- struct io_wq_work *work)
-{
- if ((work->flags & IO_WQ_WORK_FILES) &&
- current->files != work->identity->files) {
- task_lock(current);
- current->files = work->identity->files;
- current->nsproxy = work->identity->nsproxy;
- task_unlock(current);
- if (!work->identity->files) {
- /* failed grabbing files, ensure work gets cancelled */
- work->flags |= IO_WQ_WORK_CANCEL;
- }
- }
- if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs)
- current->fs = work->identity->fs;
- if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm)
- io_wq_switch_mm(worker, work);
- if ((work->flags & IO_WQ_WORK_CREDS) &&
- worker->cur_creds != work->identity->creds)
- io_wq_switch_creds(worker, work);
- if (work->flags & IO_WQ_WORK_FSIZE)
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize;
- else if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
- io_wq_switch_blkcg(worker, work);
-#ifdef CONFIG_AUDIT
- current->loginuid = work->identity->loginuid;
- current->sessionid = work->identity->sessionid;
-#endif
-}
-
static void io_assign_current_work(struct io_worker *worker,
struct io_wq_work *work)
{
if (work) {
- /* flush pending signals before assigning new work */
- if (signal_pending(current))
- flush_signals(current);
+ io_flush_signals();
cond_resched();
}
-#ifdef CONFIG_AUDIT
- current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET);
- current->sessionid = AUDIT_SID_UNSET;
-#endif
-
spin_lock_irq(&worker->lock);
worker->cur_work = work;
spin_unlock_irq(&worker->lock);
@@ -550,6 +450,7 @@ get_next:
if (!work)
break;
io_assign_current_work(worker, work);
+ __set_current_state(TASK_RUNNING);
/* handle a whole dependent link */
do {
@@ -557,7 +458,8 @@ get_next:
unsigned int hash = io_get_work_hash(work);
next_hashed = wq_next_work(work);
- io_impersonate_work(worker, work);
+ if (work->creds && worker->cur_creds != work->creds)
+ io_wq_switch_creds(worker, work);
wq->do_work(work);
io_assign_current_work(worker, NULL);
@@ -572,8 +474,10 @@ get_next:
io_wqe_enqueue(wqe, linked);
if (hash != -1U && !next_hashed) {
+ clear_bit(hash, &wq->hash->map);
+ if (wq_has_sleeper(&wq->hash->wait))
+ wake_up(&wq->hash->wait);
raw_spin_lock_irq(&wqe->lock);
- wqe->hash_map &= ~BIT_ULL(hash);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
/* skip unnecessary unlock-lock wqe->lock */
if (!work)
@@ -592,27 +496,23 @@ static int io_wqe_worker(void *data)
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
- io_worker_start(wqe, worker);
+ io_worker_start(worker);
while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
set_current_state(TASK_INTERRUPTIBLE);
loop:
raw_spin_lock_irq(&wqe->lock);
if (io_wqe_run_queue(wqe)) {
- __set_current_state(TASK_RUNNING);
io_worker_handle_work(worker);
goto loop;
}
- /* drops the lock on success, retry */
- if (__io_worker_idle(wqe, worker)) {
- __release(&wqe->lock);
- goto loop;
- }
+ __io_worker_idle(wqe, worker);
raw_spin_unlock_irq(&wqe->lock);
- if (signal_pending(current))
- flush_signals(current);
+ io_flush_signals();
if (schedule_timeout(WORKER_IDLE_TIMEOUT))
continue;
+ if (fatal_signal_pending(current))
+ break;
/* timed out, exit unless we're the fixed worker */
if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
!(worker->flags & IO_WORKER_F_FIXED))
@@ -636,15 +536,16 @@ loop:
*/
void io_wq_worker_running(struct task_struct *tsk)
{
- struct io_worker *worker = kthread_data(tsk);
- struct io_wqe *wqe = worker->wqe;
+ struct io_worker *worker = tsk->pf_io_worker;
+ if (!worker)
+ return;
if (!(worker->flags & IO_WORKER_F_UP))
return;
if (worker->flags & IO_WORKER_F_RUNNING)
return;
worker->flags |= IO_WORKER_F_RUNNING;
- io_wqe_inc_running(wqe, worker);
+ io_wqe_inc_running(worker);
}
/*
@@ -654,9 +555,10 @@ void io_wq_worker_running(struct task_struct *tsk)
*/
void io_wq_worker_sleeping(struct task_struct *tsk)
{
- struct io_worker *worker = kthread_data(tsk);
- struct io_wqe *wqe = worker->wqe;
+ struct io_worker *worker = tsk->pf_io_worker;
+ if (!worker)
+ return;
if (!(worker->flags & IO_WORKER_F_UP))
return;
if (!(worker->flags & IO_WORKER_F_RUNNING))
@@ -664,32 +566,27 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
worker->flags &= ~IO_WORKER_F_RUNNING;
- raw_spin_lock_irq(&wqe->lock);
- io_wqe_dec_running(wqe, worker);
- raw_spin_unlock_irq(&wqe->lock);
+ raw_spin_lock_irq(&worker->wqe->lock);
+ io_wqe_dec_running(worker);
+ raw_spin_unlock_irq(&worker->wqe->lock);
}
-static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
+static int task_thread(void *data, int index)
{
+ struct io_worker *worker = data;
+ struct io_wqe *wqe = worker->wqe;
struct io_wqe_acct *acct = &wqe->acct[index];
- struct io_worker *worker;
+ struct io_wq *wq = wqe->wq;
+ char buf[TASK_COMM_LEN];
- worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
- if (!worker)
- return false;
+ sprintf(buf, "iou-wrk-%d", wq->task_pid);
+ set_task_comm(current, buf);
- refcount_set(&worker->ref, 1);
- worker->nulls_node.pprev = NULL;
- worker->wqe = wqe;
- spin_lock_init(&worker->lock);
+ current->pf_io_worker = worker;
+ worker->task = current;
- worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node,
- "io_wqe_worker-%d/%d", index, wqe->node);
- if (IS_ERR(worker->task)) {
- kfree(worker);
- return false;
- }
- kthread_bind_mask(worker->task, cpumask_of_node(wqe->node));
+ set_cpus_allowed_ptr(current, cpumask_of_node(wqe->node));
+ current->flags |= PF_NO_SETAFFINITY;
raw_spin_lock_irq(&wqe->lock);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
@@ -702,11 +599,63 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
acct->nr_workers++;
raw_spin_unlock_irq(&wqe->lock);
- if (index == IO_WQ_ACCT_UNBOUND)
- atomic_inc(&wq->user->processes);
+ io_wqe_worker(data);
+ do_exit(0);
+}
+
+static int task_thread_bound(void *data)
+{
+ return task_thread(data, IO_WQ_ACCT_BOUND);
+}
+
+static int task_thread_unbound(void *data)
+{
+ return task_thread(data, IO_WQ_ACCT_UNBOUND);
+}
+
+pid_t io_wq_fork_thread(int (*fn)(void *), void *arg)
+{
+ unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
+ CLONE_IO|SIGCHLD;
+ struct kernel_clone_args args = {
+ .flags = ((lower_32_bits(flags) | CLONE_VM |
+ CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .stack = (unsigned long)fn,
+ .stack_size = (unsigned long)arg,
+ };
+
+ return kernel_clone(&args);
+}
+
+static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
+{
+ struct io_worker *worker;
+ pid_t pid;
+
+ __set_current_state(TASK_RUNNING);
+
+ worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
+ if (!worker)
+ return false;
+
+ refcount_set(&worker->ref, 1);
+ worker->nulls_node.pprev = NULL;
+ worker->wqe = wqe;
+ spin_lock_init(&worker->lock);
+ init_completion(&worker->ref_done);
refcount_inc(&wq->refs);
- wake_up_process(worker->task);
+
+ if (index == IO_WQ_ACCT_BOUND)
+ pid = io_wq_fork_thread(task_thread_bound, worker);
+ else
+ pid = io_wq_fork_thread(task_thread_unbound, worker);
+ if (pid < 0) {
+ io_wq_put(wq);
+ kfree(worker);
+ return false;
+ }
return true;
}
@@ -752,93 +701,57 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data)
return false;
}
-/*
- * Manager thread. Tasked with creating new workers, if we need them.
- */
-static int io_wq_manager(void *data)
+static void io_wq_check_workers(struct io_wq *wq)
{
- struct io_wq *wq = data;
int node;
- /* create fixed workers */
- refcount_set(&wq->refs, 1);
for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
+ bool fork_worker[2] = { false, false };
+
if (!node_online(node))
continue;
- if (create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
- continue;
- set_bit(IO_WQ_BIT_ERROR, &wq->state);
- set_bit(IO_WQ_BIT_EXIT, &wq->state);
- goto out;
- }
-
- complete(&wq->done);
-
- while (!kthread_should_stop()) {
- if (current->task_works)
- task_work_run();
-
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
- bool fork_worker[2] = { false, false };
-
- if (!node_online(node))
- continue;
-
- raw_spin_lock_irq(&wqe->lock);
- if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
- fork_worker[IO_WQ_ACCT_BOUND] = true;
- if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
- fork_worker[IO_WQ_ACCT_UNBOUND] = true;
- raw_spin_unlock_irq(&wqe->lock);
- if (fork_worker[IO_WQ_ACCT_BOUND])
- create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
- if (fork_worker[IO_WQ_ACCT_UNBOUND])
- create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND);
- }
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(HZ);
- }
-
- if (current->task_works)
- task_work_run();
-out:
- if (refcount_dec_and_test(&wq->refs)) {
- complete(&wq->done);
- return 0;
- }
- /* if ERROR is set and we get here, we have workers to wake */
- if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
- rcu_read_lock();
- for_each_node(node)
- io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
- rcu_read_unlock();
+ raw_spin_lock_irq(&wqe->lock);
+ if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
+ fork_worker[IO_WQ_ACCT_BOUND] = true;
+ if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
+ fork_worker[IO_WQ_ACCT_UNBOUND] = true;
+ raw_spin_unlock_irq(&wqe->lock);
+ if (fork_worker[IO_WQ_ACCT_BOUND])
+ create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
+ if (fork_worker[IO_WQ_ACCT_UNBOUND])
+ create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND);
}
- return 0;
}
-static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
- struct io_wq_work *work)
+/*
+ * Manager thread. Tasked with creating new workers, if we need them.
+ */
+static int io_wq_manager(void *data)
{
- bool free_worker;
-
- if (!(work->flags & IO_WQ_WORK_UNBOUND))
- return true;
- if (atomic_read(&acct->nr_running))
- return true;
+ struct io_wq *wq = data;
+ char buf[TASK_COMM_LEN];
- rcu_read_lock();
- free_worker = !hlist_nulls_empty(&wqe->free_list);
- rcu_read_unlock();
- if (free_worker)
- return true;
+ sprintf(buf, "iou-mgr-%d", wq->task_pid);
+ set_task_comm(current, buf);
+ current->flags |= PF_IO_WORKER;
+ wq->manager = current;
- if (atomic_read(&wqe->wq->user->processes) >= acct->max_workers &&
- !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
- return false;
+ complete(&wq->done);
- return true;
+ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ io_wq_check_workers(wq);
+ schedule_timeout(HZ);
+ if (fatal_signal_pending(current))
+ set_bit(IO_WQ_BIT_EXIT, &wq->state);
+ } while (!test_bit(IO_WQ_BIT_EXIT, &wq->state));
+
+ io_wq_check_workers(wq);
+ wq->manager = NULL;
+ io_wq_put(wq);
+ do_exit(0);
}
static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
@@ -872,20 +785,37 @@ append:
wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
}
+static int io_wq_fork_manager(struct io_wq *wq)
+{
+ int ret;
+
+ if (wq->manager)
+ return 0;
+
+ clear_bit(IO_WQ_BIT_EXIT, &wq->state);
+ refcount_inc(&wq->refs);
+ current->flags |= PF_IO_WORKER;
+ ret = io_wq_fork_thread(io_wq_manager, wq);
+ current->flags &= ~PF_IO_WORKER;
+ if (ret >= 0) {
+ wait_for_completion(&wq->done);
+ return 0;
+ }
+
+ io_wq_put(wq);
+ return ret;
+}
+
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
int work_flags;
unsigned long flags;
- /*
- * Do early check to see if we need a new unbound worker, and if we do,
- * if we're allowed to do so. This isn't 100% accurate as there's a
- * gap between this check and incrementing the value, but that's OK.
- * It's close enough to not be an issue, fork() has the same delay.
- */
- if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
- io_run_cancel(work, wqe);
+ /* Can only happen if manager creation fails after exec */
+ if (unlikely(io_wq_fork_manager(wqe->wq))) {
+ work->flags |= IO_WQ_WORK_CANCEL;
+ wqe->wq->do_work(work);
return;
}
@@ -939,7 +869,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work &&
match->fn(worker->cur_work, match->data)) {
- send_sig(SIGINT, worker->task, 1);
+ set_notify_signal(worker->task);
match->nr_running++;
}
spin_unlock_irqrestore(&worker->lock, flags);
@@ -1043,6 +973,24 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
return IO_WQ_CANCEL_NOTFOUND;
}
+static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
+ int sync, void *key)
+{
+ struct io_wqe *wqe = container_of(wait, struct io_wqe, wait);
+ int ret;
+
+ list_del_init(&wait->entry);
+
+ rcu_read_lock();
+ ret = io_wqe_activate_free_worker(wqe);
+ rcu_read_unlock();
+
+ if (!ret)
+ wake_up_process(wqe->wq->manager);
+
+ return 1;
+}
+
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
{
int ret = -ENOMEM, node;
@@ -1063,12 +1011,11 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
if (ret)
goto err_wqes;
+ refcount_inc(&data->hash->refs);
+ wq->hash = data->hash;
wq->free_work = data->free_work;
wq->do_work = data->do_work;
- /* caller must already hold a reference to this */
- wq->user = data->user;
-
ret = -ENOMEM;
for_each_node(node) {
struct io_wqe *wqe;
@@ -1083,11 +1030,11 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
wqe->node = alloc_node;
wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
- if (wq->user) {
- wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
+ wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
task_rlimit(current, RLIMIT_NPROC);
- }
atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
+ wqe->wait.func = io_wqe_hash_wake;
+ INIT_LIST_HEAD(&wqe->wait.entry);
wqe->wq = wq;
raw_spin_lock_init(&wqe->lock);
INIT_WQ_LIST(&wqe->work_list);
@@ -1095,23 +1042,16 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
INIT_LIST_HEAD(&wqe->all_list);
}
+ wq->task_pid = current->pid;
init_completion(&wq->done);
+ refcount_set(&wq->refs, 1);
- wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
- if (!IS_ERR(wq->manager)) {
- wake_up_process(wq->manager);
- wait_for_completion(&wq->done);
- if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
- ret = -ENOMEM;
- goto err;
- }
- refcount_set(&wq->use_refs, 1);
- reinit_completion(&wq->done);
+ ret = io_wq_fork_manager(wq);
+ if (!ret)
return wq;
- }
- ret = PTR_ERR(wq->manager);
- complete(&wq->done);
+ io_wq_put(wq);
+ io_wq_put_hash(data->hash);
err:
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
for_each_node(node)
@@ -1123,15 +1063,7 @@ err_wq:
return ERR_PTR(ret);
}
-bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
-{
- if (data->free_work != wq->free_work || data->do_work != wq->do_work)
- return false;
-
- return refcount_inc_not_zero(&wq->use_refs);
-}
-
-static void __io_wq_destroy(struct io_wq *wq)
+static void io_wq_destroy(struct io_wq *wq)
{
int node;
@@ -1139,30 +1071,31 @@ static void __io_wq_destroy(struct io_wq *wq)
set_bit(IO_WQ_BIT_EXIT, &wq->state);
if (wq->manager)
- kthread_stop(wq->manager);
+ wake_up_process(wq->manager);
rcu_read_lock();
for_each_node(node)
io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
rcu_read_unlock();
- wait_for_completion(&wq->done);
+ spin_lock_irq(&wq->hash->wait.lock);
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
- for_each_node(node)
- kfree(wq->wqes[node]);
+ list_del_init(&wqe->wait.entry);
+ kfree(wqe);
+ }
+ spin_unlock_irq(&wq->hash->wait.lock);
+ io_wq_put_hash(wq->hash);
kfree(wq->wqes);
kfree(wq);
-}
-void io_wq_destroy(struct io_wq *wq)
-{
- if (refcount_dec_and_test(&wq->use_refs))
- __io_wq_destroy(wq);
}
-struct task_struct *io_wq_get_task(struct io_wq *wq)
+void io_wq_put(struct io_wq *wq)
{
- return wq->manager;
+ if (refcount_dec_and_test(&wq->refs))
+ io_wq_destroy(wq);
}
static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 096f1021018e..b6ca12b60c35 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -1,6 +1,7 @@
#ifndef INTERNAL_IO_WQ_H
#define INTERNAL_IO_WQ_H
+#include <linux/refcount.h>
#include <linux/io_uring.h>
struct io_wq;
@@ -11,13 +12,6 @@ enum {
IO_WQ_WORK_UNBOUND = 4,
IO_WQ_WORK_CONCURRENT = 16,
- IO_WQ_WORK_FILES = 32,
- IO_WQ_WORK_FS = 64,
- IO_WQ_WORK_MM = 128,
- IO_WQ_WORK_CREDS = 256,
- IO_WQ_WORK_BLKCG = 512,
- IO_WQ_WORK_FSIZE = 1024,
-
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
@@ -85,7 +79,7 @@ static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work {
struct io_wq_work_node list;
- struct io_identity *identity;
+ const struct cred *creds;
unsigned flags;
};
@@ -100,20 +94,32 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
typedef void (io_wq_work_fn)(struct io_wq_work *);
-struct io_wq_data {
- struct user_struct *user;
+struct io_wq_hash {
+ refcount_t refs;
+ unsigned long map;
+ struct wait_queue_head wait;
+};
+static inline void io_wq_put_hash(struct io_wq_hash *hash)
+{
+ if (refcount_dec_and_test(&hash->refs))
+ kfree(hash);
+}
+
+struct io_wq_data {
+ struct io_wq_hash *hash;
io_wq_work_fn *do_work;
free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
-bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
-void io_wq_destroy(struct io_wq *wq);
+void io_wq_put(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val);
+pid_t io_wq_fork_thread(int (*fn)(void *), void *arg);
+
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
return work->flags & IO_WQ_WORK_HASHED;
@@ -124,8 +130,6 @@ typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data, bool cancel_all);
-struct task_struct *io_wq_get_task(struct io_wq *wq);
-
#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
@@ -140,6 +144,7 @@ static inline void io_wq_worker_running(struct task_struct *tsk)
static inline bool io_wq_current_is_worker(void)
{
- return in_task() && (current->flags & PF_IO_WORKER);
+ return in_task() && (current->flags & PF_IO_WORKER) &&
+ current->pf_io_worker;
}
#endif
diff --git a/fs/io_uring.c b/fs/io_uring.c
index c9f5f295c2ac..4a088581b0f2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -57,7 +57,6 @@
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
-#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/bvec.h>
#include <linux/net.h>
@@ -254,6 +253,11 @@ struct io_restriction {
bool registered;
};
+enum {
+ IO_SQ_THREAD_SHOULD_STOP = 0,
+ IO_SQ_THREAD_SHOULD_PARK,
+};
+
struct io_sq_data {
refcount_t refs;
struct mutex lock;
@@ -267,6 +271,13 @@ struct io_sq_data {
struct wait_queue_head wait;
unsigned sq_thread_idle;
+ int sq_cpu;
+ pid_t task_pid;
+
+ unsigned long state;
+ struct completion startup;
+ struct completion completion;
+ struct completion exited;
};
#define IO_IOPOLL_BATCH 8
@@ -323,12 +334,12 @@ struct io_ring_ctx {
struct {
unsigned int flags;
unsigned int compat: 1;
- unsigned int limit_mem: 1;
unsigned int cq_overflow_flushed: 1;
unsigned int drain_next: 1;
unsigned int eventfd_async: 1;
unsigned int restricted: 1;
unsigned int sqo_dead: 1;
+ unsigned int sqo_exec: 1;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
@@ -350,6 +361,9 @@ struct io_ring_ctx {
unsigned cached_cq_overflow;
unsigned long sq_check_overflow;
+ /* hashed buffered write serialization */
+ struct io_wq_hash *hash_map;
+
struct list_head defer_list;
struct list_head timeout_list;
struct list_head cq_overflow_list;
@@ -366,22 +380,14 @@ struct io_ring_ctx {
struct io_rings *rings;
- /* IO offload */
- struct io_wq *io_wq;
-
/*
- * For SQPOLL usage - we hold a reference to the parent task, so we
- * have access to the ->files
+ * For SQPOLL usage
*/
struct task_struct *sqo_task;
/* Only used for accounting purposes */
struct mm_struct *mm_account;
-#ifdef CONFIG_BLK_CGROUP
- struct cgroup_subsys_state *sqo_blkcg_css;
-#endif
-
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
@@ -401,13 +407,6 @@ struct io_ring_ctx {
struct user_struct *user;
- const struct cred *creds;
-
-#ifdef CONFIG_AUDIT
- kuid_t loginuid;
- unsigned int sessionid;
-#endif
-
struct completion ref_comp;
struct completion sq_thread_comp;
@@ -456,6 +455,11 @@ struct io_ring_ctx {
struct io_restriction restrictions;
+ /* exit task_work */
+ struct callback_head *exit_task_work;
+
+ struct wait_queue_head hash_wait;
+
/* Keep this last, we don't need it for the fast path */
struct work_struct exit_work;
};
@@ -838,7 +842,6 @@ struct io_op_def {
unsigned plug : 1;
/* size of async data needed, if any */
unsigned short async_size;
- unsigned work_flags;
};
static const struct io_op_def io_op_defs[] = {
@@ -851,7 +854,6 @@ static const struct io_op_def io_op_defs[] = {
.needs_async_data = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_WRITEV] = {
.needs_file = 1,
@@ -861,12 +863,9 @@ static const struct io_op_def io_op_defs[] = {
.needs_async_data = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FSIZE,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
- .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
@@ -874,7 +873,6 @@ static const struct io_op_def io_op_defs[] = {
.pollin = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
[IORING_OP_WRITE_FIXED] = {
.needs_file = 1,
@@ -883,8 +881,6 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
- IO_WQ_WORK_MM,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
@@ -893,7 +889,6 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_POLL_REMOVE] = {},
[IORING_OP_SYNC_FILE_RANGE] = {
.needs_file = 1,
- .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_SENDMSG] = {
.needs_file = 1,
@@ -901,8 +896,6 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
},
[IORING_OP_RECVMSG] = {
.needs_file = 1,
@@ -911,29 +904,23 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
},
[IORING_OP_TIMEOUT] = {
.needs_async_data = 1,
.async_size = sizeof(struct io_timeout_data),
- .work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_TIMEOUT_REMOVE] = {
/* used by timeout updates' prep() */
- .work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_ACCEPT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES,
},
[IORING_OP_ASYNC_CANCEL] = {},
[IORING_OP_LINK_TIMEOUT] = {
.needs_async_data = 1,
.async_size = sizeof(struct io_timeout_data),
- .work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_CONNECT] = {
.needs_file = 1,
@@ -941,26 +928,14 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_connect),
- .work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
- .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE,
- },
- [IORING_OP_OPENAT] = {
- .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS | IO_WQ_WORK_MM,
- },
- [IORING_OP_CLOSE] = {
- .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG,
- },
- [IORING_OP_FILES_UPDATE] = {
- .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM,
- },
- [IORING_OP_STATX] = {
- .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM |
- IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
},
+ [IORING_OP_OPENAT] = {},
+ [IORING_OP_CLOSE] = {},
+ [IORING_OP_FILES_UPDATE] = {},
+ [IORING_OP_STATX] = {},
[IORING_OP_READ] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
@@ -968,7 +943,6 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_WRITE] = {
.needs_file = 1,
@@ -976,42 +950,31 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FSIZE,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
- .work_flags = IO_WQ_WORK_BLKCG,
- },
- [IORING_OP_MADVISE] = {
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
+ [IORING_OP_MADVISE] = {},
[IORING_OP_SEND] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_RECV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_OPENAT2] = {
- .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
- IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
- .work_flags = IO_WQ_WORK_FILES,
},
[IORING_OP_SPLICE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
- .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_PROVIDE_BUFFERS] = {},
[IORING_OP_REMOVE_BUFFERS] = {},
@@ -1023,19 +986,14 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_SHUTDOWN] = {
.needs_file = 1,
},
- [IORING_OP_RENAMEAT] = {
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
- IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
- },
- [IORING_OP_UNLINKAT] = {
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
- IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
- },
+ [IORING_OP_RENAMEAT] = {},
+ [IORING_OP_UNLINKAT] = {},
};
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
struct files_struct *files);
+static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx);
static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
struct io_ring_ctx *ctx);
@@ -1126,161 +1084,18 @@ static bool io_match_task(struct io_kiocb *head,
continue;
if (req->file && req->file->f_op == &io_uring_fops)
return true;
- if ((req->work.flags & IO_WQ_WORK_FILES) &&
- req->work.identity->files == files)
+ if (req->task->files == files)
return true;
}
return false;
}
-static void io_sq_thread_drop_mm_files(void)
-{
- struct files_struct *files = current->files;
- struct mm_struct *mm = current->mm;
-
- if (mm) {
- kthread_unuse_mm(mm);
- mmput(mm);
- current->mm = NULL;
- }
- if (files) {
- struct nsproxy *nsproxy = current->nsproxy;
-
- task_lock(current);
- current->files = NULL;
- current->nsproxy = NULL;
- task_unlock(current);
- put_files_struct(files);
- put_nsproxy(nsproxy);
- }
-}
-
-static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx)
-{
- if (!current->files) {
- struct files_struct *files;
- struct nsproxy *nsproxy;
-
- task_lock(ctx->sqo_task);
- files = ctx->sqo_task->files;
- if (!files) {
- task_unlock(ctx->sqo_task);
- return -EOWNERDEAD;
- }
- atomic_inc(&files->count);
- get_nsproxy(ctx->sqo_task->nsproxy);
- nsproxy = ctx->sqo_task->nsproxy;
- task_unlock(ctx->sqo_task);
-
- task_lock(current);
- current->files = files;
- current->nsproxy = nsproxy;
- task_unlock(current);
- }
- return 0;
-}
-
-static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
-{
- struct mm_struct *mm;
-
- if (current->mm)
- return 0;
-
- task_lock(ctx->sqo_task);
- mm = ctx->sqo_task->mm;
- if (unlikely(!mm || !mmget_not_zero(mm)))
- mm = NULL;
- task_unlock(ctx->sqo_task);
-
- if (mm) {
- kthread_use_mm(mm);
- return 0;
- }
-
- return -EFAULT;
-}
-
-static int __io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
-{
- const struct io_op_def *def = &io_op_defs[req->opcode];
- int ret;
-
- if (def->work_flags & IO_WQ_WORK_MM) {
- ret = __io_sq_thread_acquire_mm(ctx);
- if (unlikely(ret))
- return ret;
- }
-
- if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) {
- ret = __io_sq_thread_acquire_files(ctx);
- if (unlikely(ret))
- return ret;
- }
-
- return 0;
-}
-
-static inline int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
-{
- if (!(ctx->flags & IORING_SETUP_SQPOLL))
- return 0;
- return __io_sq_thread_acquire_mm_files(ctx, req);
-}
-
-static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
- struct cgroup_subsys_state **cur_css)
-
-{
-#ifdef CONFIG_BLK_CGROUP
- /* puts the old one when swapping */
- if (*cur_css != ctx->sqo_blkcg_css) {
- kthread_associate_blkcg(ctx->sqo_blkcg_css);
- *cur_css = ctx->sqo_blkcg_css;
- }
-#endif
-}
-
-static void io_sq_thread_unassociate_blkcg(void)
-{
-#ifdef CONFIG_BLK_CGROUP
- kthread_associate_blkcg(NULL);
-#endif
-}
-
static inline void req_set_fail_links(struct io_kiocb *req)
{
if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
req->flags |= REQ_F_FAIL_LINK;
}
-/*
- * None of these are dereferenced, they are simply used to check if any of
- * them have changed. If we're under current and check they are still the
- * same, we're fine to grab references to them for actual out-of-line use.
- */
-static void io_init_identity(struct io_identity *id)
-{
- id->files = current->files;
- id->mm = current->mm;
-#ifdef CONFIG_BLK_CGROUP
- rcu_read_lock();
- id->blkcg_css = blkcg_css();
- rcu_read_unlock();
-#endif
- id->creds = current_cred();
- id->nsproxy = current->nsproxy;
- id->fs = current->fs;
- id->fsize = rlimit(RLIMIT_FSIZE);
-#ifdef CONFIG_AUDIT
- id->loginuid = current->loginuid;
- id->sessionid = current->sessionid;
-#endif
- refcount_set(&id->count, 1);
-}
-
static inline void __io_req_init_async(struct io_kiocb *req)
{
memset(&req->work, 0, sizeof(req->work));
@@ -1293,17 +1108,10 @@ static inline void __io_req_init_async(struct io_kiocb *req)
*/
static inline void io_req_init_async(struct io_kiocb *req)
{
- struct io_uring_task *tctx = current->io_uring;
-
if (req->flags & REQ_F_WORK_INITIALIZED)
return;
__io_req_init_async(req);
-
- /* Grab a ref if this isn't our static identity */
- req->work.identity = tctx->identity;
- if (tctx->identity != &tctx->__identity)
- refcount_inc(&req->work.identity->count);
}
static void io_ring_ctx_ref_free(struct percpu_ref *ref)
@@ -1388,40 +1196,14 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
return false;
}
-static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req)
-{
- if (req->work.identity == &tctx->__identity)
- return;
- if (refcount_dec_and_test(&req->work.identity->count))
- kfree(req->work.identity);
-}
-
static void io_req_clean_work(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_WORK_INITIALIZED))
return;
- if (req->work.flags & IO_WQ_WORK_MM)
- mmdrop(req->work.identity->mm);
-#ifdef CONFIG_BLK_CGROUP
- if (req->work.flags & IO_WQ_WORK_BLKCG)
- css_put(req->work.identity->blkcg_css);
-#endif
- if (req->work.flags & IO_WQ_WORK_CREDS)
- put_cred(req->work.identity->creds);
- if (req->work.flags & IO_WQ_WORK_FS) {
- struct fs_struct *fs = req->work.identity->fs;
-
- spin_lock(&req->work.identity->fs->lock);
- if (--fs->users)
- fs = NULL;
- spin_unlock(&req->work.identity->fs->lock);
- if (fs)
- free_fs_struct(fs);
- }
- if (req->work.flags & IO_WQ_WORK_FILES) {
- put_files_struct(req->work.identity->files);
- put_nsproxy(req->work.identity->nsproxy);
+ if (req->work.creds) {
+ put_cred(req->work.creds);
+ req->work.creds = NULL;
}
if (req->flags & REQ_F_INFLIGHT) {
struct io_ring_ctx *ctx = req->ctx;
@@ -1437,54 +1219,6 @@ static void io_req_clean_work(struct io_kiocb *req)
}
req->flags &= ~REQ_F_WORK_INITIALIZED;
- req->work.flags &= ~(IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FS |
- IO_WQ_WORK_CREDS | IO_WQ_WORK_FILES);
- io_put_identity(req->task->io_uring, req);
-}
-
-/*
- * Create a private copy of io_identity, since some fields don't match
- * the current context.
- */
-static bool io_identity_cow(struct io_kiocb *req)
-{
- struct io_uring_task *tctx = current->io_uring;
- const struct cred *creds = NULL;
- struct io_identity *id;
-
- if (req->work.flags & IO_WQ_WORK_CREDS)
- creds = req->work.identity->creds;
-
- id = kmemdup(req->work.identity, sizeof(*id), GFP_KERNEL);
- if (unlikely(!id)) {
- req->work.flags |= IO_WQ_WORK_CANCEL;
- return false;
- }
-
- /*
- * We can safely just re-init the creds we copied Either the field
- * matches the current one, or we haven't grabbed it yet. The only
- * exception is ->creds, through registered personalities, so handle
- * that one separately.
- */
- io_init_identity(id);
- if (creds)
- id->creds = creds;
-
- /* add one for this request */
- refcount_inc(&id->count);
-
- /* drop tctx and req identity references, if needed */
- if (tctx->identity != &tctx->__identity &&
- refcount_dec_and_test(&tctx->identity->count))
- kfree(tctx->identity);
- if (req->work.identity != &tctx->__identity &&
- refcount_dec_and_test(&req->work.identity->count))
- kfree(req->work.identity);
-
- req->work.identity = id;
- tctx->identity = id;
- return true;
}
static void io_req_track_inflight(struct io_kiocb *req)
@@ -1501,79 +1235,6 @@ static void io_req_track_inflight(struct io_kiocb *req)
}
}
-static bool io_grab_identity(struct io_kiocb *req)
-{
- const struct io_op_def *def = &io_op_defs[req->opcode];
- struct io_identity *id = req->work.identity;
-
- if (def->work_flags & IO_WQ_WORK_FSIZE) {
- if (id->fsize != rlimit(RLIMIT_FSIZE))
- return false;
- req->work.flags |= IO_WQ_WORK_FSIZE;
- }
-#ifdef CONFIG_BLK_CGROUP
- if (!(req->work.flags & IO_WQ_WORK_BLKCG) &&
- (def->work_flags & IO_WQ_WORK_BLKCG)) {
- rcu_read_lock();
- if (id->blkcg_css != blkcg_css()) {
- rcu_read_unlock();
- return false;
- }
- /*
- * This should be rare, either the cgroup is dying or the task
- * is moving cgroups. Just punt to root for the handful of ios.
- */
- if (css_tryget_online(id->blkcg_css))
- req->work.flags |= IO_WQ_WORK_BLKCG;
- rcu_read_unlock();
- }
-#endif
- if (!(req->work.flags & IO_WQ_WORK_CREDS)) {
- if (id->creds != current_cred())
- return false;
- get_cred(id->creds);
- req->work.flags |= IO_WQ_WORK_CREDS;
- }
-#ifdef CONFIG_AUDIT
- if (!uid_eq(current->loginuid, id->loginuid) ||
- current->sessionid != id->sessionid)
- return false;
-#endif
- if (!(req->work.flags & IO_WQ_WORK_FS) &&
- (def->work_flags & IO_WQ_WORK_FS)) {
- if (current->fs != id->fs)
- return false;
- spin_lock(&id->fs->lock);
- if (!id->fs->in_exec) {
- id->fs->users++;
- req->work.flags |= IO_WQ_WORK_FS;
- } else {
- req->work.flags |= IO_WQ_WORK_CANCEL;
- }
- spin_unlock(&current->fs->lock);
- }
- if (!(req->work.flags & IO_WQ_WORK_FILES) &&
- (def->work_flags & IO_WQ_WORK_FILES) &&
- !(req->flags & REQ_F_NO_FILE_TABLE)) {
- if (id->files != current->files ||
- id->nsproxy != current->nsproxy)
- return false;
- atomic_inc(&id->files->count);
- get_nsproxy(id->nsproxy);
- req->work.flags |= IO_WQ_WORK_FILES;
- io_req_track_inflight(req);
- }
- if (!(req->work.flags & IO_WQ_WORK_MM) &&
- (def->work_flags & IO_WQ_WORK_MM)) {
- if (id->mm != current->mm)
- return false;
- mmgrab(id->mm);
- req->work.flags |= IO_WQ_WORK_MM;
- }
-
- return true;
-}
-
static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
@@ -1591,17 +1252,8 @@ static void io_prep_async_work(struct io_kiocb *req)
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
-
- /* if we fail grabbing identity, we must COW, regrab, and retry */
- if (io_grab_identity(req))
- return;
-
- if (!io_identity_cow(req))
- return;
-
- /* can't fail at this point */
- if (!io_grab_identity(req))
- WARN_ON(1);
+ if (!req->work.creds)
+ req->work.creds = get_current_cred();
}
static void io_prep_async_link(struct io_kiocb *req)
@@ -1616,10 +1268,14 @@ static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link = io_prep_linked_timeout(req);
+ struct io_uring_task *tctx = req->task->io_uring;
+
+ BUG_ON(!tctx);
+ BUG_ON(!tctx->io_wq);
trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
&req->work, req->flags);
- io_wq_enqueue(ctx->io_wq, &req->work);
+ io_wq_enqueue(tctx->io_wq, &req->work);
return link;
}
@@ -2313,11 +1969,14 @@ static int io_req_task_work_add(struct io_kiocb *req)
static void io_req_task_work_add_fallback(struct io_kiocb *req,
task_work_func_t cb)
{
- struct task_struct *tsk = io_wq_get_task(req->ctx->io_wq);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct callback_head *head;
init_task_work(&req->task_work, cb);
- task_work_add(tsk, &req->task_work, TWA_NONE);
- wake_up_process(tsk);
+ do {
+ head = READ_ONCE(ctx->exit_task_work);
+ req->task_work.next = head;
+ } while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head);
}
static void __io_req_task_cancel(struct io_kiocb *req, int error)
@@ -2351,15 +2010,11 @@ static void __io_req_task_submit(struct io_kiocb *req)
/* ctx stays valid until unlock, even if we drop all ours ctx->refs */
mutex_lock(&ctx->uring_lock);
- if (!ctx->sqo_dead && !(current->flags & PF_EXITING) &&
- !io_sq_thread_acquire_mm_files(ctx, req))
+ if (!ctx->sqo_dead && !(current->flags & PF_EXITING) && !current->in_execve)
__io_queue_sqe(req);
else
__io_req_task_cancel(req, -EFAULT);
mutex_unlock(&ctx->uring_lock);
-
- if (ctx->flags & IORING_SETUP_SQPOLL)
- io_sq_thread_drop_mm_files();
}
static void io_req_task_submit(struct callback_head *cb)
@@ -2823,7 +2478,6 @@ static bool io_rw_reissue(struct io_kiocb *req)
{
#ifdef CONFIG_BLOCK
umode_t mode = file_inode(req->file)->i_mode;
- int ret;
if (!S_ISBLK(mode) && !S_ISREG(mode))
return false;
@@ -2839,9 +2493,7 @@ static bool io_rw_reissue(struct io_kiocb *req)
lockdep_assert_held(&req->ctx->uring_lock);
- ret = io_sq_thread_acquire_mm_files(req->ctx, req);
-
- if (!ret && io_resubmit_prep(req)) {
+ if (io_resubmit_prep(req)) {
refcount_inc(&req->refs);
io_queue_async_work(req);
return true;
@@ -5946,12 +5598,15 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data)
return req->user_data == (unsigned long) data;
}
-static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
+static int io_async_cancel_one(struct io_uring_task *tctx, void *sqe_addr)
{
enum io_wq_cancel cancel_ret;
int ret = 0;
- cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
+ if (!tctx->io_wq)
+ return -ENOENT;
+
+ cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, sqe_addr, false);
switch (cancel_ret) {
case IO_WQ_CANCEL_OK:
ret = 0;
@@ -5974,7 +5629,8 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
unsigned long flags;
int ret;
- ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
+ ret = io_async_cancel_one(req->task->io_uring,
+ (void *) (unsigned long) sqe_addr);
if (ret != -ENOENT) {
spin_lock_irqsave(&ctx->completion_lock, flags);
goto done;
@@ -6563,10 +6219,9 @@ static void __io_queue_sqe(struct io_kiocb *req)
const struct cred *old_creds = NULL;
int ret;
- if ((req->flags & REQ_F_WORK_INITIALIZED) &&
- (req->work.flags & IO_WQ_WORK_CREDS) &&
- req->work.identity->creds != current_cred())
- old_creds = override_creds(req->work.identity->creds);
+ if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
+ req->work.creds != current_cred())
+ old_creds = override_creds(req->work.creds);
ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
@@ -6684,9 +6339,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (unlikely(req->opcode >= IORING_OP_LAST))
return -EINVAL;
- if (unlikely(io_sq_thread_acquire_mm_files(ctx, req)))
- return -EFAULT;
-
if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
return -EACCES;
@@ -6696,17 +6348,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
id = READ_ONCE(sqe->personality);
if (id) {
- struct io_identity *iod;
-
- iod = idr_find(&ctx->personality_idr, id);
- if (unlikely(!iod))
- return -EINVAL;
- refcount_inc(&iod->count);
-
__io_req_init_async(req);
- get_cred(iod->creds);
- req->work.identity = iod;
- req->work.flags |= IO_WQ_WORK_CREDS;
+ req->work.creds = idr_find(&ctx->personality_idr, id);
+ if (unlikely(!req->work.creds))
+ return -EINVAL;
+ get_cred(req->work.creds);
}
state = &ctx->submit_state;
@@ -7008,71 +6654,97 @@ static void io_sqd_init_new(struct io_sq_data *sqd)
io_sqd_update_thread_idle(sqd);
}
+static bool io_sq_thread_should_stop(struct io_sq_data *sqd)
+{
+ return test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+}
+
+static bool io_sq_thread_should_park(struct io_sq_data *sqd)
+{
+ return test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+}
+
+static void io_sq_thread_parkme(struct io_sq_data *sqd)
+{
+ for (;;) {
+ /*
+ * TASK_PARKED is a special state; we must serialize against
+ * possible pending wakeups to avoid store-store collisions on
+ * task->state.
+ *
+ * Such a collision might possibly result in the task state
+ * changin from TASK_PARKED and us failing the
+ * wait_task_inactive() in kthread_park().
+ */
+ set_special_state(TASK_PARKED);
+ if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state))
+ break;
+
+ /*
+ * Thread is going to call schedule(), do not preempt it,
+ * or the caller of kthread_park() may spend more time in
+ * wait_task_inactive().
+ */
+ preempt_disable();
+ complete(&sqd->completion);
+ schedule_preempt_disabled();
+ preempt_enable();
+ }
+ __set_current_state(TASK_RUNNING);
+}
+
static int io_sq_thread(void *data)
{
- struct cgroup_subsys_state *cur_css = NULL;
- struct files_struct *old_files = current->files;
- struct nsproxy *old_nsproxy = current->nsproxy;
- const struct cred *old_cred = NULL;
struct io_sq_data *sqd = data;
struct io_ring_ctx *ctx;
unsigned long timeout = 0;
+ char buf[TASK_COMM_LEN];
DEFINE_WAIT(wait);
- task_lock(current);
- current->files = NULL;
- current->nsproxy = NULL;
- task_unlock(current);
+ sprintf(buf, "iou-sqp-%d", sqd->task_pid);
+ set_task_comm(current, buf);
+ sqd->thread = current;
+ current->pf_io_worker = NULL;
+
+ if (sqd->sq_cpu != -1)
+ set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
+ else
+ set_cpus_allowed_ptr(current, cpu_online_mask);
+ current->flags |= PF_NO_SETAFFINITY;
+
+ complete(&sqd->completion);
- while (!kthread_should_stop()) {
+ wait_for_completion(&sqd->startup);
+
+ while (!io_sq_thread_should_stop(sqd)) {
int ret;
bool cap_entries, sqt_spin, needs_sched;
/*
* Any changes to the sqd lists are synchronized through the
- * kthread parking. This synchronizes the thread vs users,
+ * thread parking. This synchronizes the thread vs users,
* the users are synchronized on the sqd->ctx_lock.
*/
- if (kthread_should_park()) {
- kthread_parkme();
- /*
- * When sq thread is unparked, in case the previous park operation
- * comes from io_put_sq_data(), which means that sq thread is going
- * to be stopped, so here needs to have a check.
- */
- if (kthread_should_stop())
- break;
+ if (io_sq_thread_should_park(sqd)) {
+ io_sq_thread_parkme(sqd);
+ continue;
}
-
if (unlikely(!list_empty(&sqd->ctx_new_list))) {
io_sqd_init_new(sqd);
timeout = jiffies + sqd->sq_thread_idle;
}
-
+ if (fatal_signal_pending(current))
+ break;
sqt_spin = false;
cap_entries = !list_is_singular(&sqd->ctx_list);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
- if (current->cred != ctx->creds) {
- if (old_cred)
- revert_creds(old_cred);
- old_cred = override_creds(ctx->creds);
- }
- io_sq_thread_associate_blkcg(ctx, &cur_css);
-#ifdef CONFIG_AUDIT
- current->loginuid = ctx->loginuid;
- current->sessionid = ctx->sessionid;
-#endif
-
ret = __io_sq_thread(ctx, cap_entries);
if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
sqt_spin = true;
-
- io_sq_thread_drop_mm_files();
}
if (sqt_spin || !time_after(jiffies, timeout)) {
io_run_task_work();
- io_sq_thread_drop_mm_files();
cond_resched();
if (sqt_spin)
timeout = jiffies + sqd->sq_thread_idle;
@@ -7093,7 +6765,7 @@ static int io_sq_thread(void *data)
}
}
- if (needs_sched && !kthread_should_park()) {
+ if (needs_sched && !io_sq_thread_should_park(sqd)) {
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_set_wakeup_flag(ctx);
@@ -7106,22 +6778,25 @@ static int io_sq_thread(void *data)
timeout = jiffies + sqd->sq_thread_idle;
}
- io_run_task_work();
- io_sq_thread_drop_mm_files();
-
- if (cur_css)
- io_sq_thread_unassociate_blkcg();
- if (old_cred)
- revert_creds(old_cred);
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+ io_uring_cancel_sqpoll(ctx);
- task_lock(current);
- current->files = old_files;
- current->nsproxy = old_nsproxy;
- task_unlock(current);
+ io_run_task_work();
- kthread_parkme();
+ /*
+ * Clear thread under lock so that concurrent parks work correctly
+ */
+ complete_all(&sqd->completion);
+ mutex_lock(&sqd->lock);
+ sqd->thread = NULL;
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+ ctx->sqo_exec = 1;
+ io_ring_set_wakeup_flag(ctx);
+ }
+ mutex_unlock(&sqd->lock);
- return 0;
+ complete(&sqd->exited);
+ do_exit(0);
}
struct io_wait_queue {
@@ -7413,20 +7088,74 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
return 0;
}
+static void io_sq_thread_unpark(struct io_sq_data *sqd)
+ __releases(&sqd->lock)
+{
+ if (!sqd->thread)
+ return;
+ if (sqd->thread == current)
+ return;
+ clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+ wake_up_state(sqd->thread, TASK_PARKED);
+ mutex_unlock(&sqd->lock);
+}
+
+static bool io_sq_thread_park(struct io_sq_data *sqd)
+ __acquires(&sqd->lock)
+{
+ if (sqd->thread == current)
+ return true;
+ mutex_lock(&sqd->lock);
+ if (!sqd->thread) {
+ mutex_unlock(&sqd->lock);
+ return false;
+ }
+ set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+ wake_up_process(sqd->thread);
+ wait_for_completion(&sqd->completion);
+ return true;
+}
+
+static void io_sq_thread_stop(struct io_sq_data *sqd)
+{
+ if (!sqd->thread)
+ return;
+
+ set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+ WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state));
+ wake_up_process(sqd->thread);
+ wait_for_completion(&sqd->exited);
+}
+
static void io_put_sq_data(struct io_sq_data *sqd)
{
if (refcount_dec_and_test(&sqd->refs)) {
- /*
- * The park is a bit of a work-around, without it we get
- * warning spews on shutdown with SQPOLL set and affinity
- * set to a single CPU.
- */
+ io_sq_thread_stop(sqd);
+ kfree(sqd);
+ }
+}
+
+static void io_sq_thread_finish(struct io_ring_ctx *ctx)
+{
+ struct io_sq_data *sqd = ctx->sq_data;
+
+ if (sqd) {
+ complete(&sqd->startup);
if (sqd->thread) {
- kthread_park(sqd->thread);
- kthread_stop(sqd->thread);
+ wait_for_completion(&ctx->sq_thread_comp);
+ io_sq_thread_park(sqd);
}
- kfree(sqd);
+ mutex_lock(&sqd->ctx_lock);
+ list_del(&ctx->sqd_list);
+ io_sqd_update_thread_idle(sqd);
+ mutex_unlock(&sqd->ctx_lock);
+
+ if (sqd->thread)
+ io_sq_thread_unpark(sqd);
+
+ io_put_sq_data(sqd);
+ ctx->sq_data = NULL;
}
}
@@ -7473,68 +7202,12 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
mutex_init(&sqd->ctx_lock);
mutex_init(&sqd->lock);
init_waitqueue_head(&sqd->wait);
+ init_completion(&sqd->startup);
+ init_completion(&sqd->completion);
+ init_completion(&sqd->exited);
return sqd;
}
-static void io_sq_thread_unpark(struct io_sq_data *sqd)
- __releases(&sqd->lock)
-{
- if (!sqd->thread)
- return;
- kthread_unpark(sqd->thread);
- mutex_unlock(&sqd->lock);
-}
-
-static void io_sq_thread_park(struct io_sq_data *sqd)
- __acquires(&sqd->lock)
-{
- if (!sqd->thread)
- return;
- mutex_lock(&sqd->lock);
- kthread_park(sqd->thread);
-}
-
-static void io_sq_thread_stop(struct io_ring_ctx *ctx)
-{
- struct io_sq_data *sqd = ctx->sq_data;
-
- if (sqd) {
- if (sqd->thread) {
- /*
- * We may arrive here from the error branch in
- * io_sq_offload_create() where the kthread is created
- * without being waked up, thus wake it up now to make
- * sure the wait will complete.
- */
- wake_up_process(sqd->thread);
- wait_for_completion(&ctx->sq_thread_comp);
-
- io_sq_thread_park(sqd);
- }
-
- mutex_lock(&sqd->ctx_lock);
- list_del(&ctx->sqd_list);
- io_sqd_update_thread_idle(sqd);
- mutex_unlock(&sqd->ctx_lock);
-
- if (sqd->thread)
- io_sq_thread_unpark(sqd);
-
- io_put_sq_data(sqd);
- ctx->sq_data = NULL;
- }
-}
-
-static void io_finish_async(struct io_ring_ctx *ctx)
-{
- io_sq_thread_stop(ctx);
-
- if (ctx->io_wq) {
- io_wq_destroy(ctx->io_wq);
- ctx->io_wq = NULL;
- }
-}
-
#if defined(CONFIG_UNIX)
/*
* Ensure the UNIX gc is aware of our file set, so we are certain that
@@ -7561,7 +7234,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
skb->sk = sk;
nr_files = 0;
- fpl->user = get_uid(ctx->user);
+ fpl->user = get_uid(current_user());
for (i = 0; i < nr; i++) {
struct file *file = io_file_from_index(ctx, i + offset);
@@ -8093,54 +7766,34 @@ static struct io_wq_work *io_free_work(struct io_wq_work *work)
return req ? &req->work : NULL;
}
-static int io_init_wq_offload(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
+static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
{
+ struct io_wq_hash *hash;
struct io_wq_data data;
- struct fd f;
- struct io_ring_ctx *ctx_attach;
unsigned int concurrency;
- int ret = 0;
-
- data.user = ctx->user;
- data.free_work = io_free_work;
- data.do_work = io_wq_submit_work;
-
- if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
- /* Do QD, or 4 * CPUS, whatever is smallest */
- concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
- ctx->io_wq = io_wq_create(concurrency, &data);
- if (IS_ERR(ctx->io_wq)) {
- ret = PTR_ERR(ctx->io_wq);
- ctx->io_wq = NULL;
- }
- return ret;
+ hash = ctx->hash_map;
+ if (!hash) {
+ hash = kzalloc(sizeof(*hash), GFP_KERNEL);
+ if (!hash)
+ return ERR_PTR(-ENOMEM);
+ refcount_set(&hash->refs, 1);
+ init_waitqueue_head(&hash->wait);
+ ctx->hash_map = hash;
}
- f = fdget(p->wq_fd);
- if (!f.file)
- return -EBADF;
-
- if (f.file->f_op != &io_uring_fops) {
- ret = -EINVAL;
- goto out_fput;
- }
+ data.hash = hash;
+ data.free_work = io_free_work;
+ data.do_work = io_wq_submit_work;
- ctx_attach = f.file->private_data;
- /* @io_wq is protected by holding the fd */
- if (!io_wq_get(ctx_attach->io_wq, &data)) {
- ret = -EINVAL;
- goto out_fput;
- }
+ /* Do QD, or 4 * CPUS, whatever is smallest */
+ concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
- ctx->io_wq = ctx_attach->io_wq;
-out_fput:
- fdput(f);
- return ret;
+ return io_wq_create(concurrency, &data);
}
-static int io_uring_alloc_task_context(struct task_struct *task)
+static int io_uring_alloc_task_context(struct task_struct *task,
+ struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx;
int ret;
@@ -8155,13 +7808,19 @@ static int io_uring_alloc_task_context(struct task_struct *task)
return ret;
}
+ tctx->io_wq = io_init_wq_offload(ctx);
+ if (IS_ERR(tctx->io_wq)) {
+ ret = PTR_ERR(tctx->io_wq);
+ percpu_counter_destroy(&tctx->inflight);
+ kfree(tctx);
+ return ret;
+ }
+
xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait);
tctx->last = NULL;
atomic_set(&tctx->in_idle, 0);
tctx->sqpoll = false;
- io_init_identity(&tctx->__identity);
- tctx->identity = &tctx->__identity;
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
@@ -8175,19 +7834,49 @@ void __io_uring_free(struct task_struct *tsk)
struct io_uring_task *tctx = tsk->io_uring;
WARN_ON_ONCE(!xa_empty(&tctx->xa));
- WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1);
- if (tctx->identity != &tctx->__identity)
- kfree(tctx->identity);
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
tsk->io_uring = NULL;
}
+static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx)
+{
+ int ret;
+
+ clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+ reinit_completion(&sqd->completion);
+ ctx->sqo_dead = ctx->sqo_exec = 0;
+ sqd->task_pid = current->pid;
+ current->flags |= PF_IO_WORKER;
+ ret = io_wq_fork_thread(io_sq_thread, sqd);
+ current->flags &= ~PF_IO_WORKER;
+ if (ret < 0) {
+ sqd->thread = NULL;
+ return ret;
+ }
+ wait_for_completion(&sqd->completion);
+ return io_uring_alloc_task_context(sqd->thread, ctx);
+}
+
static int io_sq_offload_create(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
int ret;
+ /* Retain compatibility with failing for an invalid attach attempt */
+ if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
+ IORING_SETUP_ATTACH_WQ) {
+ struct fd f;
+
+ f = fdget(p->wq_fd);
+ if (!f.file)
+ return -ENXIO;
+ if (f.file->f_op != &io_uring_fops) {
+ fdput(f);
+ return -EINVAL;
+ }
+ fdput(f);
+ }
if (ctx->flags & IORING_SETUP_SQPOLL) {
struct io_sq_data *sqd;
@@ -8213,7 +7902,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
ctx->sq_thread_idle = HZ;
if (sqd->thread)
- goto done;
+ return 0;
if (p->flags & IORING_SETUP_SQ_AFF) {
int cpu = p->sq_thread_cpu;
@@ -8224,18 +7913,21 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
if (!cpu_online(cpu))
goto err;
- sqd->thread = kthread_create_on_cpu(io_sq_thread, sqd,
- cpu, "io_uring-sq");
+ sqd->sq_cpu = cpu;
} else {
- sqd->thread = kthread_create(io_sq_thread, sqd,
- "io_uring-sq");
+ sqd->sq_cpu = -1;
}
- if (IS_ERR(sqd->thread)) {
- ret = PTR_ERR(sqd->thread);
+
+ sqd->task_pid = current->pid;
+ current->flags |= PF_IO_WORKER;
+ ret = io_wq_fork_thread(io_sq_thread, sqd);
+ current->flags &= ~PF_IO_WORKER;
+ if (ret < 0) {
sqd->thread = NULL;
goto err;
}
- ret = io_uring_alloc_task_context(sqd->thread);
+ wait_for_completion(&sqd->completion);
+ ret = io_uring_alloc_task_context(sqd->thread, ctx);
if (ret)
goto err;
} else if (p->flags & IORING_SETUP_SQ_AFF) {
@@ -8244,14 +7936,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
goto err;
}
-done:
- ret = io_init_wq_offload(ctx, p);
- if (ret)
- goto err;
-
return 0;
err:
- io_finish_async(ctx);
+ io_sq_thread_finish(ctx);
return ret;
}
@@ -8259,8 +7946,8 @@ static void io_sq_offload_start(struct io_ring_ctx *ctx)
{
struct io_sq_data *sqd = ctx->sq_data;
- if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread)
- wake_up_process(sqd->thread);
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ complete(&sqd->startup);
}
static inline void __io_unaccount_mem(struct user_struct *user,
@@ -8290,7 +7977,7 @@ static inline int __io_account_mem(struct user_struct *user,
static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
- if (ctx->limit_mem)
+ if (ctx->user)
__io_unaccount_mem(ctx->user, nr_pages);
if (ctx->mm_account)
@@ -8301,7 +7988,7 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
int ret;
- if (ctx->limit_mem) {
+ if (ctx->user) {
ret = __io_account_mem(ctx->user, nr_pages);
if (ret)
return ret;
@@ -8730,21 +8417,14 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
mutex_lock(&ctx->uring_lock);
mutex_unlock(&ctx->uring_lock);
- io_finish_async(ctx);
+ io_sq_thread_finish(ctx);
io_sqe_buffers_unregister(ctx);
- if (ctx->sqo_task) {
- put_task_struct(ctx->sqo_task);
- ctx->sqo_task = NULL;
+ if (ctx->mm_account) {
mmdrop(ctx->mm_account);
ctx->mm_account = NULL;
}
-#ifdef CONFIG_BLK_CGROUP
- if (ctx->sqo_blkcg_css)
- css_put(ctx->sqo_blkcg_css);
-#endif
-
mutex_lock(&ctx->uring_lock);
io_sqe_files_unregister(ctx);
mutex_unlock(&ctx->uring_lock);
@@ -8764,8 +8444,9 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
- put_cred(ctx->creds);
io_req_caches_free(ctx, NULL);
+ if (ctx->hash_map)
+ io_wq_put_hash(ctx->hash_map);
kfree(ctx->cancel_hash);
kfree(ctx);
}
@@ -8812,13 +8493,11 @@ static int io_uring_fasync(int fd, struct file *file, int on)
static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
- struct io_identity *iod;
+ const struct cred *creds;
- iod = idr_remove(&ctx->personality_idr, id);
- if (iod) {
- put_cred(iod->creds);
- if (refcount_dec_and_test(&iod->count))
- kfree(iod);
+ creds = idr_remove(&ctx->personality_idr, id);
+ if (creds) {
+ put_cred(creds);
return 0;
}
@@ -8833,6 +8512,28 @@ static int io_remove_personalities(int id, void *p, void *data)
return 0;
}
+static void io_run_ctx_fallback(struct io_ring_ctx *ctx)
+{
+ struct callback_head *work, *head, *next;
+
+ do {
+ do {
+ head = NULL;
+ work = READ_ONCE(ctx->exit_task_work);
+ } while (cmpxchg(&ctx->exit_task_work, work, head) != work);
+
+ if (!work)
+ break;
+
+ do {
+ next = work->next;
+ work->func(work);
+ work = next;
+ cond_resched();
+ } while (work);
+ } while (1);
+}
+
static void io_ring_exit_work(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
@@ -8846,17 +8547,11 @@ static void io_ring_exit_work(struct work_struct *work)
*/
do {
io_uring_try_cancel_requests(ctx, NULL, NULL);
+ io_run_ctx_fallback(ctx);
} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
io_ring_ctx_free(ctx);
}
-static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
-{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-
- return req->ctx == data;
-}
-
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
mutex_lock(&ctx->uring_lock);
@@ -8875,9 +8570,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
io_kill_timeouts(ctx, NULL, NULL);
io_poll_remove_all(ctx, NULL, NULL);
- if (ctx->io_wq)
- io_wq_cancel_cb(ctx->io_wq, io_cancel_ctx_cb, ctx, true);
-
/* if we failed setting up the ctx, we might not have any rings */
io_iopoll_try_reap_events(ctx);
@@ -8956,13 +8648,14 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct files_struct *files)
{
struct io_task_cancel cancel = { .task = task, .files = files, };
+ struct io_uring_task *tctx = current->io_uring;
while (1) {
enum io_wq_cancel cret;
bool ret = false;
- if (ctx->io_wq) {
- cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb,
+ if (tctx && tctx->io_wq) {
+ cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
&cancel, true);
ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
}
@@ -9045,12 +8738,15 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
struct files_struct *files)
{
struct task_struct *task = current;
+ bool did_park = false;
if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
io_disable_sqo_submit(ctx);
- task = ctx->sq_data->thread;
- atomic_inc(&task->io_uring->in_idle);
- io_sq_thread_park(ctx->sq_data);
+ did_park = io_sq_thread_park(ctx->sq_data);
+ if (did_park) {
+ task = ctx->sq_data->thread;
+ atomic_inc(&task->io_uring->in_idle);
+ }
}
io_cancel_defer_files(ctx, task, files);
@@ -9059,7 +8755,7 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
if (!files)
io_uring_try_cancel_requests(ctx, task, NULL);
- if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
+ if (did_park) {
atomic_dec(&task->io_uring->in_idle);
io_sq_thread_unpark(ctx->sq_data);
}
@@ -9074,7 +8770,7 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
int ret;
if (unlikely(!tctx)) {
- ret = io_uring_alloc_task_context(current);
+ ret = io_uring_alloc_task_context(current, ctx);
if (unlikely(ret))
return ret;
tctx = current->io_uring;
@@ -9144,8 +8840,13 @@ void __io_uring_files_cancel(struct files_struct *files)
io_uring_cancel_task_requests(file->private_data, files);
atomic_dec(&tctx->in_idle);
- if (files)
+ if (files) {
io_uring_remove_task_files(tctx);
+ if (tctx->io_wq) {
+ io_wq_put(tctx->io_wq);
+ tctx->io_wq = NULL;
+ }
+ }
}
static s64 tctx_inflight(struct io_uring_task *tctx)
@@ -9155,14 +8856,17 @@ static s64 tctx_inflight(struct io_uring_task *tctx)
static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
{
+ struct io_sq_data *sqd = ctx->sq_data;
struct io_uring_task *tctx;
s64 inflight;
DEFINE_WAIT(wait);
- if (!ctx->sq_data)
+ if (!sqd)
return;
- tctx = ctx->sq_data->thread->io_uring;
io_disable_sqo_submit(ctx);
+ if (!io_sq_thread_park(sqd))
+ return;
+ tctx = ctx->sq_data->thread->io_uring;
atomic_inc(&tctx->in_idle);
do {
@@ -9183,6 +8887,7 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
finish_wait(&tctx->wait, &wait);
} while (1);
atomic_dec(&tctx->in_idle);
+ io_sq_thread_unpark(sqd);
}
/*
@@ -9236,11 +8941,17 @@ static int io_uring_flush(struct file *file, void *data)
struct io_uring_task *tctx = current->io_uring;
struct io_ring_ctx *ctx = file->private_data;
+ /* Ignore helper thread files exit */
+ if (current->flags & PF_IO_WORKER)
+ return 0;
+
if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
io_uring_cancel_task_requests(ctx, NULL);
io_req_caches_free(ctx, current);
}
+ io_run_ctx_fallback(ctx);
+
if (!tctx)
return 0;
@@ -9439,6 +9150,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (ctx->flags & IORING_SETUP_SQPOLL) {
io_cqring_overflow_flush(ctx, false, NULL, NULL);
+ if (unlikely(ctx->sqo_exec)) {
+ ret = io_sq_thread_fork(ctx->sq_data, ctx);
+ if (ret)
+ goto out;
+ ctx->sqo_exec = 0;
+ }
ret = -EOWNERDEAD;
if (unlikely(ctx->sqo_dead))
goto out;
@@ -9495,8 +9212,7 @@ out_fput:
#ifdef CONFIG_PROC_FS
static int io_uring_show_cred(int id, void *p, void *data)
{
- struct io_identity *iod = p;
- const struct cred *cred = iod->creds;
+ const struct cred *cred = p;
struct seq_file *m = data;
struct user_namespace *uns = seq_user_ns(m);
struct group_info *gi;
@@ -9541,8 +9257,11 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
*/
has_lock = mutex_trylock(&ctx->uring_lock);
- if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL))
+ if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
sq = ctx->sq_data;
+ if (!sq->thread)
+ sq = NULL;
+ }
seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
@@ -9702,7 +9421,6 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
static int io_uring_create(unsigned entries, struct io_uring_params *p,
struct io_uring_params __user *params)
{
- struct user_struct *user = NULL;
struct io_ring_ctx *ctx;
struct file *file;
int ret;
@@ -9744,22 +9462,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
p->cq_entries = 2 * p->sq_entries;
}
- user = get_uid(current_user());
-
ctx = io_ring_ctx_alloc(p);
- if (!ctx) {
- free_uid(user);
+ if (!ctx)
return -ENOMEM;
- }
ctx->compat = in_compat_syscall();
- ctx->limit_mem = !capable(CAP_IPC_LOCK);
- ctx->user = user;
- ctx->creds = get_current_cred();
-#ifdef CONFIG_AUDIT
- ctx->loginuid = current->loginuid;
- ctx->sessionid = current->sessionid;
-#endif
- ctx->sqo_task = get_task_struct(current);
+ if (!capable(CAP_IPC_LOCK))
+ ctx->user = get_uid(current_user());
+ ctx->sqo_task = current;
/*
* This is just grabbed for accounting purposes. When a process exits,
@@ -9770,24 +9479,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
mmgrab(current->mm);
ctx->mm_account = current->mm;
-#ifdef CONFIG_BLK_CGROUP
- /*
- * The sq thread will belong to the original cgroup it was inited in.
- * If the cgroup goes offline (e.g. disabling the io controller), then
- * issued bios will be associated with the closest cgroup later in the
- * block layer.
- */
- rcu_read_lock();
- ctx->sqo_blkcg_css = blkcg_css();
- ret = css_tryget_online(ctx->sqo_blkcg_css);
- rcu_read_unlock();
- if (!ret) {
- /* don't init against a dying cgroup, have the user try again */
- ctx->sqo_blkcg_css = NULL;
- ret = -ENODEV;
- goto err;
- }
-#endif
ret = io_allocate_scq_urings(ctx, p);
if (ret)
goto err;
@@ -9821,7 +9512,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
- IORING_FEAT_EXT_ARG;
+ IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
@@ -9927,21 +9618,15 @@ out:
static int io_register_personality(struct io_ring_ctx *ctx)
{
- struct io_identity *id;
+ const struct cred *creds;
int ret;
- id = kmalloc(sizeof(*id), GFP_KERNEL);
- if (unlikely(!id))
- return -ENOMEM;
-
- io_init_identity(id);
- id->creds = get_current_cred();
+ creds = get_current_cred();
- ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL);
- if (ret < 0) {
- put_cred(id->creds);
- kfree(id);
- }
+ ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
+ USHRT_MAX, GFP_KERNEL);
+ if (ret < 0)
+ put_cred(creds);
return ret;
}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 16a1e82e3aeb..7ffcd7ef33d4 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -278,14 +278,14 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (!is_contig || bio_full(ctx->bio, plen)) {
gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
gfp_t orig_gfp = gfp;
- int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
if (ctx->bio)
submit_bio(ctx->bio);
if (ctx->rac) /* same as readahead_gfp_mask */
gfp |= __GFP_NORETRY | __GFP_NOWARN;
- ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs));
+ ctx->bio = bio_alloc(gfp, bio_max_segs(nr_vecs));
/*
* If the bio_alloc fails, try it again for a single page to
* avoid having to deal with partial page reads. This emulates
@@ -1459,13 +1459,6 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
goto redirty;
/*
- * Given that we do not allow direct reclaim to call us, we should
- * never be called in a recursive filesystem reclaim context.
- */
- if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
- goto redirty;
-
- /*
* Is this page beyond the end of the file?
*
* The page index is less than the end_index, adjust the end_offset
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index b2dc4d1f9dcc..1f0ffabbde56 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -551,7 +551,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
ret = -ENOMEM;
goto out_unload;
}
- inode->i_ino = 0;
inode->i_size = i_size_read(sb->s_bdev->bd_inode);
inode->i_mapping->a_ops = &jfs_metapage_aops;
inode_fake_hash(inode);
diff --git a/fs/mpage.c b/fs/mpage.c
index 830e6cc2a9e7..961234d68779 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -304,9 +304,7 @@ alloc_new:
goto out;
}
args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- min_t(int, args->nr_pages,
- BIO_MAX_PAGES),
- gfp);
+ bio_max_segs(args->nr_pages), gfp);
if (args->bio == NULL)
goto confused;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index 062fe984a697..56bb5a5fdc0d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -381,50 +381,36 @@ int mnt_want_write(struct vfsmount *m)
EXPORT_SYMBOL_GPL(mnt_want_write);
/**
- * mnt_clone_write - get write access to a mount
- * @mnt: the mount on which to take a write
- *
- * This is effectively like mnt_want_write, except
- * it must only be used to take an extra write reference
- * on a mountpoint that we already know has a write reference
- * on it. This allows some optimisation.
- *
- * After finished, mnt_drop_write must be called as usual to
- * drop the reference.
- */
-int mnt_clone_write(struct vfsmount *mnt)
-{
- /* superblock may be r/o */
- if (__mnt_is_readonly(mnt))
- return -EROFS;
- preempt_disable();
- mnt_inc_writers(real_mount(mnt));
- preempt_enable();
- return 0;
-}
-EXPORT_SYMBOL_GPL(mnt_clone_write);
-
-/**
* __mnt_want_write_file - get write access to a file's mount
* @file: the file who's mount on which to take a write
*
- * This is like __mnt_want_write, but it takes a file and can
- * do some optimisations if the file is open for write already
+ * This is like __mnt_want_write, but if the file is already open for writing it
+ * skips incrementing mnt_writers (since the open file already has a reference)
+ * and instead only does the check for emergency r/o remounts. This must be
+ * paired with __mnt_drop_write_file.
*/
int __mnt_want_write_file(struct file *file)
{
- if (!(file->f_mode & FMODE_WRITER))
- return __mnt_want_write(file->f_path.mnt);
- else
- return mnt_clone_write(file->f_path.mnt);
+ if (file->f_mode & FMODE_WRITER) {
+ /*
+ * Superblock may have become readonly while there are still
+ * writable fd's, e.g. due to a fs error with errors=remount-ro
+ */
+ if (__mnt_is_readonly(file->f_path.mnt))
+ return -EROFS;
+ return 0;
+ }
+ return __mnt_want_write(file->f_path.mnt);
}
/**
* mnt_want_write_file - get write access to a file's mount
* @file: the file who's mount on which to take a write
*
- * This is like mnt_want_write, but it takes a file and can
- * do some optimisations if the file is open for write already
+ * This is like mnt_want_write, but if the file is already open for writing it
+ * skips incrementing mnt_writers (since the open file already has a reference)
+ * and instead only does the freeze protection and the check for emergency r/o
+ * remounts. This must be paired with mnt_drop_write_file.
*/
int mnt_want_write_file(struct file *file)
{
@@ -470,7 +456,8 @@ EXPORT_SYMBOL_GPL(mnt_drop_write);
void __mnt_drop_write_file(struct file *file)
{
- __mnt_drop_write(file->f_path.mnt);
+ if (!(file->f_mode & FMODE_WRITER))
+ __mnt_drop_write(file->f_path.mnt);
}
void mnt_drop_write_file(struct file *file)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 1a96ce28efb0..fe860c538747 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -115,13 +115,13 @@ bl_submit_bio(struct bio *bio)
return NULL;
}
-static struct bio *
-bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
+static struct bio *bl_alloc_init_bio(unsigned int npg,
+ struct block_device *bdev, sector_t disk_sector,
bio_end_io_t end_io, struct parallel_io *par)
{
struct bio *bio;
- npg = min(npg, BIO_MAX_PAGES);
+ npg = bio_max_segs(npg);
bio = bio_alloc(GFP_NOIO, npg);
if (bio) {
bio->bi_iter.bi_sector = disk_sector;
diff --git a/fs/proc/self.c b/fs/proc/self.c
index a4012154e109..72cd69bcaf4a 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -16,13 +16,6 @@ static const char *proc_self_get_link(struct dentry *dentry,
pid_t tgid = task_tgid_nr_ns(current, ns);
char *name;
- /*
- * Not currently supported. Once we can inherit all of struct pid,
- * we can allow this.
- */
- if (current->flags & PF_IO_WORKER)
- return ERR_PTR(-EOPNOTSUPP);
-
if (!tgid)
return ERR_PTR(-ENOENT);
/* max length of unsigned int in decimal + NULL term */
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index d56681d86d28..a553273fbd41 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -17,13 +17,6 @@ static const char *proc_thread_self_get_link(struct dentry *dentry,
pid_t pid = task_pid_nr_ns(current, ns);
char *name;
- /*
- * Not currently supported. Once we can inherit all of struct pid,
- * we can allow this.
- */
- if (current->flags & PF_IO_WORKER)
- return ERR_PTR(-EOPNOTSUPP);
-
if (!pid)
return ERR_PTR(-ENOENT);
name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index b56ff451adce..5b6fcb9b44e2 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2805,7 +2805,7 @@ xfs_btree_split_worker(
struct xfs_btree_split_args *args = container_of(work,
struct xfs_btree_split_args, work);
unsigned long pflags;
- unsigned long new_pflags = PF_MEMALLOC_NOFS;
+ unsigned long new_pflags = 0;
/*
* we are in a transaction context here, but may also be doing work
@@ -2817,12 +2817,20 @@ xfs_btree_split_worker(
new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
current_set_flags_nested(&pflags, new_pflags);
+ xfs_trans_set_context(args->cur->bc_tp);
args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
args->key, args->curp, args->stat);
- complete(args->done);
+ xfs_trans_clear_context(args->cur->bc_tp);
current_restore_flags_nested(&pflags, new_pflags);
+
+ /*
+ * Do not access args after complete() has run here. We don't own args
+ * and the owner may run and free args before we return here.
+ */
+ complete(args->done);
+
}
/*
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 4304c6416fbb..b4186d666157 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -62,7 +62,7 @@ xfs_setfilesize_trans_alloc(
* We hand off the transaction to the completion thread now, so
* clear the flag here.
*/
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ xfs_trans_clear_context(tp);
return 0;
}
@@ -125,7 +125,7 @@ xfs_setfilesize_ioend(
* thus we need to mark ourselves as being in a transaction manually.
* Similarly for freeze protection.
*/
- current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ xfs_trans_set_context(tp);
__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
/* we abort the update if there was an IO error */
@@ -568,6 +568,12 @@ xfs_vm_writepage(
{
struct xfs_writepage_ctx wpc = { };
+ if (WARN_ON_ONCE(current->journal_info)) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+
return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops);
}
@@ -578,6 +584,13 @@ xfs_vm_writepages(
{
struct xfs_writepage_ctx wpc = { };
+ /*
+ * Writing back data in a transaction context can result in recursive
+ * transactions. This is bad, so issue a warning and get out of here.
+ */
+ if (WARN_ON_ONCE(current->journal_info))
+ return 0;
+
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
index e2148f2d5d6b..17f36db2f792 100644
--- a/fs/xfs/xfs_bio_io.c
+++ b/fs/xfs/xfs_bio_io.c
@@ -6,7 +6,7 @@
static inline unsigned int bio_max_vecs(unsigned int count)
{
- return min_t(unsigned, howmany(count, PAGE_SIZE), BIO_MAX_PAGES);
+ return bio_max_segs(howmany(count, PAGE_SIZE));
}
int
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index f6e5235df7c9..37a1d12762d8 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1480,7 +1480,7 @@ xfs_buf_ioapply_map(
int op)
{
int page_index;
- int total_nr_pages = bp->b_page_count;
+ unsigned int total_nr_pages = bp->b_page_count;
int nr_pages;
struct bio *bio;
sector_t sector = bp->b_maps[map].bm_bn;
@@ -1505,7 +1505,7 @@ xfs_buf_ioapply_map(
next_chunk:
atomic_inc(&bp->b_io_remaining);
- nr_pages = min(total_nr_pages, BIO_MAX_PAGES);
+ nr_pages = bio_max_segs(total_nr_pages);
bio = bio_alloc(GFP_NOIO, nr_pages);
bio_set_dev(bio, bp->b_target->bt_bdev);
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 3991e59cfd18..ef17c1f6db32 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -344,7 +344,6 @@ xfs_extent_busy_trim(
ASSERT(*len > 0);
spin_lock(&args->pag->pagb_lock);
-restart:
fbno = *bno;
flen = *len;
rbp = args->pag->pagb_tree.rb_node;
@@ -363,19 +362,6 @@ restart:
continue;
}
- /*
- * If this is a metadata allocation, try to reuse the busy
- * extent instead of trimming the allocation.
- */
- if (!(args->datatype & XFS_ALLOC_USERDATA) &&
- !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
- if (!xfs_extent_busy_update_extent(args->mp, args->pag,
- busyp, fbno, flen,
- false))
- goto restart;
- continue;
- }
-
if (bbno <= fbno) {
/* start overlap */
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 145e06c47744..546a6cd96729 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -51,7 +51,7 @@ xfs_panic_mask_proc_handler(
#endif /* CONFIG_PROC_FS */
STATIC int
-xfs_deprecate_irix_sgid_inherit_proc_handler(
+xfs_deprecated_dointvec_minmax(
struct ctl_table *ctl,
int write,
void *buffer,
@@ -59,24 +59,8 @@ xfs_deprecate_irix_sgid_inherit_proc_handler(
loff_t *ppos)
{
if (write) {
- printk_once(KERN_WARNING
- "XFS: " "%s sysctl option is deprecated.\n",
- ctl->procname);
- }
- return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
-}
-
-STATIC int
-xfs_deprecate_irix_symlink_mode_proc_handler(
- struct ctl_table *ctl,
- int write,
- void *buffer,
- size_t *lenp,
- loff_t *ppos)
-{
- if (write) {
- printk_once(KERN_WARNING
- "XFS: " "%s sysctl option is deprecated.\n",
+ printk_ratelimited(KERN_WARNING
+ "XFS: %s sysctl option is deprecated.\n",
ctl->procname);
}
return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
@@ -88,7 +72,7 @@ static struct ctl_table xfs_table[] = {
.data = &xfs_params.sgid_inherit.val,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = xfs_deprecate_irix_sgid_inherit_proc_handler,
+ .proc_handler = xfs_deprecated_dointvec_minmax,
.extra1 = &xfs_params.sgid_inherit.min,
.extra2 = &xfs_params.sgid_inherit.max
},
@@ -97,7 +81,7 @@ static struct ctl_table xfs_table[] = {
.data = &xfs_params.symlink_mode.val,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = xfs_deprecate_irix_symlink_mode_proc_handler,
+ .proc_handler = xfs_deprecated_dointvec_minmax,
.extra1 = &xfs_params.symlink_mode.min,
.extra2 = &xfs_params.symlink_mode.max
},
@@ -201,6 +185,15 @@ static struct ctl_table xfs_table[] = {
.extra1 = &xfs_params.blockgc_timer.min,
.extra2 = &xfs_params.blockgc_timer.max,
},
+ {
+ .procname = "speculative_cow_prealloc_lifetime",
+ .data = &xfs_params.blockgc_timer.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = xfs_deprecated_dointvec_minmax,
+ .extra1 = &xfs_params.blockgc_timer.min,
+ .extra2 = &xfs_params.blockgc_timer.max,
+ },
/* please keep this the last entry */
#ifdef CONFIG_PROC_FS
{
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 44f72c09c203..b22a09e9daee 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -72,6 +72,7 @@ xfs_trans_free(
xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
trace_xfs_trans_free(tp, _RET_IP_);
+ xfs_trans_clear_context(tp);
if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
sb_end_intwrite(tp->t_mountp->m_super);
xfs_trans_free_dqinfo(tp);
@@ -123,7 +124,8 @@ xfs_trans_dup(
ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
tp->t_rtx_res = tp->t_rtx_res_used;
- ntp->t_pflags = tp->t_pflags;
+
+ xfs_trans_switch_context(tp, ntp);
/* move deferred ops over to the new tp */
xfs_defer_move(ntp, tp);
@@ -157,9 +159,6 @@ xfs_trans_reserve(
int error = 0;
bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
- /* Mark this thread as being in a transaction */
- current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
/*
* Attempt to reserve the needed disk blocks by decrementing
* the number needed from the number available. This will
@@ -167,10 +166,8 @@ xfs_trans_reserve(
*/
if (blocks > 0) {
error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
- if (error != 0) {
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ if (error != 0)
return -ENOSPC;
- }
tp->t_blk_res += blocks;
}
@@ -244,9 +241,6 @@ undo_blocks:
xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
tp->t_blk_res = 0;
}
-
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
return error;
}
@@ -260,6 +254,7 @@ xfs_trans_alloc(
struct xfs_trans **tpp)
{
struct xfs_trans *tp;
+ bool want_retry = true;
int error;
/*
@@ -267,9 +262,11 @@ xfs_trans_alloc(
* GFP_NOFS allocation context so that we avoid lockdep false positives
* by doing GFP_KERNEL allocations inside sb_start_intwrite().
*/
+retry:
tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
if (!(flags & XFS_TRANS_NO_WRITECOUNT))
sb_start_intwrite(mp->m_super);
+ xfs_trans_set_context(tp);
/*
* Zero-reservation ("empty") transactions can't modify anything, so
@@ -289,7 +286,9 @@ xfs_trans_alloc(
tp->t_firstblock = NULLFSBLOCK;
error = xfs_trans_reserve(tp, resp, blocks, rtextents);
- if (error == -ENOSPC) {
+ if (error == -ENOSPC && want_retry) {
+ xfs_trans_cancel(tp);
+
/*
* We weren't able to reserve enough space for the transaction.
* Flush the other speculative space allocations to free space.
@@ -297,8 +296,11 @@ xfs_trans_alloc(
* other locks.
*/
error = xfs_blockgc_free_space(mp, NULL);
- if (!error)
- error = xfs_trans_reserve(tp, resp, blocks, rtextents);
+ if (error)
+ return error;
+
+ want_retry = false;
+ goto retry;
}
if (error) {
xfs_trans_cancel(tp);
@@ -893,7 +895,6 @@ __xfs_trans_commit(
xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free(tp);
/*
@@ -925,7 +926,6 @@ out_unreserve:
xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
}
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free_items(tp, !!error);
xfs_trans_free(tp);
@@ -985,9 +985,6 @@ xfs_trans_cancel(
tp->t_ticket = NULL;
}
- /* mark this thread as no longer being in a transaction */
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
xfs_trans_free_items(tp, dirty);
xfs_trans_free(tp);
}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 8b03fbfe9a1b..9dd745cf77c9 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -281,4 +281,34 @@ int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp,
struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force,
struct xfs_trans **tpp);
+static inline void
+xfs_trans_set_context(
+ struct xfs_trans *tp)
+{
+ ASSERT(current->journal_info == NULL);
+ tp->t_pflags = memalloc_nofs_save();
+ current->journal_info = tp;
+}
+
+static inline void
+xfs_trans_clear_context(
+ struct xfs_trans *tp)
+{
+ if (current->journal_info == tp) {
+ memalloc_nofs_restore(tp->t_pflags);
+ current->journal_info = NULL;
+ }
+}
+
+static inline void
+xfs_trans_switch_context(
+ struct xfs_trans *old_tp,
+ struct xfs_trans *new_tp)
+{
+ ASSERT(current->journal_info == old_tp);
+ new_tp->t_pflags = old_tp->t_pflags;
+ old_tp->t_pflags = 0;
+ current->journal_info = new_tp;
+}
+
#endif /* __XFS_TRANS_H__ */
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5b468f2242ff..983ed2fe7c85 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -20,7 +20,12 @@
#define BIO_BUG_ON
#endif
-#define BIO_MAX_PAGES 256
+#define BIO_MAX_PAGES 256U
+
+static inline unsigned int bio_max_segs(unsigned int nr_segs)
+{
+ return min(nr_segs, BIO_MAX_PAGES);
+}
#define bio_prio(bio) (bio)->bi_ioprio
#define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 69035e9f632b..c032cfe133c7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -462,7 +462,6 @@ struct request_queue {
#ifdef CONFIG_PM
struct device *dev;
enum rpm_status rpm_status;
- unsigned int nr_pending;
#endif
/*
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 05556573b896..a083e15df608 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -23,8 +23,6 @@ struct blk_trace {
u32 pid;
u32 dev;
struct dentry *dir;
- struct dentry *dropped_file;
- struct dentry *msg_file;
struct list_head running_list;
atomic_t dropped;
};
@@ -119,7 +117,7 @@ struct compat_blk_user_trace_setup {
#endif
-extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes);
+void blk_fill_rwbs(char *rwbs, unsigned int op);
static inline sector_t blk_rq_trace_sector(struct request *rq)
{
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 9129ba231423..f14adb882338 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -186,6 +186,7 @@ enum cpuhp_state {
CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE,
CPUHP_AP_PERF_POWERPC_HV_GPCI_ONLINE,
+ CPUHP_AP_PERF_CSKY_ONLINE,
CPUHP_AP_WATCHDOG_ONLINE,
CPUHP_AP_WORKQUEUE_ONLINE,
CPUHP_AP_RCUTREE_ONLINE,
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 2eb6d19de336..51ede771cd99 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -5,23 +5,6 @@
#include <linux/sched.h>
#include <linux/xarray.h>
-struct io_identity {
- struct files_struct *files;
- struct mm_struct *mm;
-#ifdef CONFIG_BLK_CGROUP
- struct cgroup_subsys_state *blkcg_css;
-#endif
- const struct cred *creds;
- struct nsproxy *nsproxy;
- struct fs_struct *fs;
- unsigned long fsize;
-#ifdef CONFIG_AUDIT
- kuid_t loginuid;
- unsigned int sessionid;
-#endif
- refcount_t count;
-};
-
struct io_wq_work_node {
struct io_wq_work_node *next;
};
@@ -36,9 +19,8 @@ struct io_uring_task {
struct xarray xa;
struct wait_queue_head wait;
struct file *last;
+ void *io_wq;
struct percpu_counter inflight;
- struct io_identity __identity;
- struct io_identity *identity;
atomic_t in_idle;
bool sqpoll;
@@ -61,7 +43,7 @@ static inline void io_uring_task_cancel(void)
}
static inline void io_uring_files_cancel(struct files_struct *files)
{
- if (current->io_uring && !xa_empty(&current->io_uring->xa))
+ if (current->io_uring)
__io_uring_files_cancel(files);
}
static inline void io_uring_free(struct task_struct *tsk)
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 161f4419db6c..5d92a7e1a742 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -86,7 +86,6 @@ struct path;
extern int mnt_want_write(struct vfsmount *mnt);
extern int mnt_want_write_file(struct file *file);
-extern int mnt_clone_write(struct vfsmount *mnt);
extern void mnt_drop_write(struct vfsmount *mnt);
extern void mnt_drop_write_file(struct file *file);
extern void mntput(struct vfsmount *mnt);
diff --git a/include/linux/net.h b/include/linux/net.h
index 9e2324efc26a..ba736b457a06 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -42,8 +42,6 @@ struct net;
#define SOCK_PASSCRED 3
#define SOCK_PASSSEC 4
-#define PROTO_CMSG_DATA_ONLY 0x0001
-
#ifndef ARCH_HAS_SOCKET_TYPES
/**
* enum sock_type - Socket types
@@ -138,7 +136,6 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
struct proto_ops {
int family;
- unsigned int flags;
struct module *owner;
int (*release) (struct socket *sock);
int (*bind) (struct socket *sock,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26f499810dfa..ef00bb22164c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -895,6 +895,9 @@ struct task_struct {
/* CLONE_CHILD_CLEARTID: */
int __user *clear_child_tid;
+ /* PF_IO_WORKER */
+ void *pf_io_worker;
+
u64 utime;
u64 stime;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
index b3bbd10eb3f0..02f966e9358f 100644
--- a/include/scsi/libiscsi.h
+++ b/include/scsi/libiscsi.h
@@ -187,7 +187,7 @@ struct iscsi_conn {
struct iscsi_task *task; /* xmit task in progress */
/* xmit */
- spinlock_t taskqueuelock; /* protects the next three lists */
+ /* items must be added/deleted under frwd lock */
struct list_head mgmtqueue; /* mgmt (control) xmit queue */
struct list_head cmdqueue; /* data-path cmd queue */
struct list_head requeue; /* tasks needing another run */
@@ -332,7 +332,7 @@ struct iscsi_session {
* cmdsn, queued_cmdsn *
* session resources: *
* - cmdpool kfifo_out , *
- * - mgmtpool, */
+ * - mgmtpool, queues */
spinlock_t back_lock; /* protects cmdsn_exp *
* cmdsn_max, *
* cmdpool kfifo_in */
@@ -395,6 +395,8 @@ extern struct Scsi_Host *iscsi_host_alloc(struct scsi_host_template *sht,
extern void iscsi_host_remove(struct Scsi_Host *shost);
extern void iscsi_host_free(struct Scsi_Host *shost);
extern int iscsi_target_alloc(struct scsi_target *starget);
+extern int iscsi_host_get_max_scsi_cmds(struct Scsi_Host *shost,
+ uint16_t requested_cmds_max);
/*
* session management
diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h
index 6336780d83a7..ce2fba49c95d 100644
--- a/include/target/target_core_backend.h
+++ b/include/target/target_core_backend.h
@@ -72,6 +72,7 @@ int transport_backend_register(const struct target_backend_ops *);
void target_backend_unregister(const struct target_backend_ops *);
void target_complete_cmd(struct se_cmd *, u8);
+void target_set_cmd_data_length(struct se_cmd *, int);
void target_complete_cmd_with_length(struct se_cmd *, u8, int);
void transport_copy_sense_to_cmd(struct se_cmd *, unsigned char *);
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index e41c611d6d3b..899fdacf57b9 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -28,7 +28,7 @@ DECLARE_EVENT_CLASS(bcache_request,
__entry->sector = bio->bi_iter.bi_sector;
__entry->orig_sector = bio->bi_iter.bi_sector - 16;
__entry->nr_sector = bio->bi_iter.bi_size >> 9;
- blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
),
TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)",
@@ -102,7 +102,7 @@ DECLARE_EVENT_CLASS(bcache_bio,
__entry->dev = bio_dev(bio);
__entry->sector = bio->bi_iter.bi_sector;
__entry->nr_sector = bio->bi_iter.bi_size >> 9;
- blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
),
TP_printk("%d,%d %s %llu + %u",
@@ -137,7 +137,7 @@ TRACE_EVENT(bcache_read,
__entry->dev = bio_dev(bio);
__entry->sector = bio->bi_iter.bi_sector;
__entry->nr_sector = bio->bi_iter.bi_size >> 9;
- blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
__entry->cache_hit = hit;
__entry->bypass = bypass;
),
@@ -168,7 +168,7 @@ TRACE_EVENT(bcache_write,
__entry->inode = inode;
__entry->sector = bio->bi_iter.bi_sector;
__entry->nr_sector = bio->bi_iter.bi_size >> 9;
- blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
__entry->writeback = writeback;
__entry->bypass = bypass;
),
@@ -238,7 +238,7 @@ TRACE_EVENT(bcache_journal_write,
__entry->sector = bio->bi_iter.bi_sector;
__entry->nr_sector = bio->bi_iter.bi_size >> 9;
__entry->nr_keys = keys;
- blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
),
TP_printk("%d,%d %s %llu + %u keys %u",
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 0d782663a005..cc5ab96a7471 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -89,7 +89,7 @@ TRACE_EVENT(block_rq_requeue,
__entry->sector = blk_rq_trace_sector(rq);
__entry->nr_sector = blk_rq_trace_nr_sectors(rq);
- blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
+ blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
__get_str(cmd)[0] = '\0';
),
@@ -133,7 +133,7 @@ TRACE_EVENT(block_rq_complete,
__entry->nr_sector = nr_bytes >> 9;
__entry->error = error;
- blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes);
+ blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
__get_str(cmd)[0] = '\0';
),
@@ -166,7 +166,7 @@ DECLARE_EVENT_CLASS(block_rq,
__entry->nr_sector = blk_rq_trace_nr_sectors(rq);
__entry->bytes = blk_rq_bytes(rq);
- blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
+ blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
__get_str(cmd)[0] = '\0';
memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
),
@@ -196,7 +196,7 @@ DEFINE_EVENT(block_rq, block_rq_insert,
/**
* block_rq_issue - issue pending block IO request operation to device driver
- * @rq: block IO operation operation request
+ * @rq: block IO operation request
*
* Called when block operation request @rq from queue @q is sent to a
* device driver for processing.
@@ -210,7 +210,7 @@ DEFINE_EVENT(block_rq, block_rq_issue,
/**
* block_rq_merge - merge request with another one in the elevator
- * @rq: block IO operation operation request
+ * @rq: block IO operation request
*
* Called when block operation request @rq from queue @q is merged to another
* request queued in the elevator.
@@ -249,7 +249,7 @@ TRACE_EVENT(block_bio_complete,
__entry->sector = bio->bi_iter.bi_sector;
__entry->nr_sector = bio_sectors(bio);
__entry->error = blk_status_to_errno(bio->bi_status);
- blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
),
TP_printk("%d,%d %s %llu + %u [%d]",
@@ -276,7 +276,7 @@ DECLARE_EVENT_CLASS(block_bio,
__entry->dev = bio_dev(bio);
__entry->sector = bio->bi_iter.bi_sector;
__entry->nr_sector = bio_sectors(bio);
- blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
),
@@ -433,7 +433,7 @@ TRACE_EVENT(block_split,
__entry->dev = bio_dev(bio);
__entry->sector = bio->bi_iter.bi_sector;
__entry->new_sector = new_sector;
- blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
),
@@ -474,7 +474,7 @@ TRACE_EVENT(block_bio_remap,
__entry->nr_sector = bio_sectors(bio);
__entry->old_dev = dev;
__entry->old_sector = from;
- blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
),
TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
@@ -518,7 +518,7 @@ TRACE_EVENT(block_rq_remap,
__entry->old_dev = dev;
__entry->old_sector = from;
__entry->nr_bios = blk_rq_count_bios(rq);
- blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
+ blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
),
TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u",
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index ac4e1738a9af..2514eb6b1cf2 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -262,6 +262,7 @@ struct io_uring_params {
#define IORING_FEAT_POLL_32BITS (1U << 6)
#define IORING_FEAT_SQPOLL_NONFIXED (1U << 7)
#define IORING_FEAT_EXT_ARG (1U << 8)
+#define IORING_FEAT_NATIVE_WORKERS (1U << 9)
/*
* io_uring_register(2) opcodes and arguments
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 5b3f01da172b..60739d5e3373 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -84,7 +84,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
dentry = kern_path_locked(pathname, &path);
if (IS_ERR(dentry))
- return (void *)dentry; /* returning an error */
+ return ERR_CAST(dentry); /* returning an error */
inode = path.dentry->d_inode;
inode_unlock(inode);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 61db50f7ca86..821cf1723814 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -375,7 +375,7 @@ static int ptrace_attach(struct task_struct *task, long request,
audit_ptrace(task);
retval = -EPERM;
- if (unlikely(task->flags & PF_KTHREAD))
+ if (unlikely(task->flags & (PF_KTHREAD | PF_IO_WORKER)))
goto out;
if (same_thread_group(task, current))
goto out;
diff --git a/kernel/signal.c b/kernel/signal.c
index 5ad8566534e7..ba4d1ef39a9e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -91,7 +91,7 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
return true;
/* Only allow kernel generated signals to this kthread */
- if (unlikely((t->flags & PF_KTHREAD) &&
+ if (unlikely((t->flags & (PF_KTHREAD | PF_IO_WORKER)) &&
(handler == SIG_KTHREAD_KERNEL) && !force))
return true;
@@ -1096,7 +1096,7 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
/*
* Skip useless siginfo allocation for SIGKILL and kernel threads.
*/
- if ((sig == SIGKILL) || (t->flags & PF_KTHREAD))
+ if ((sig == SIGKILL) || (t->flags & (PF_KTHREAD | PF_IO_WORKER)))
goto out_set;
/*
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c286c13bd31a..c221e4c3f625 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -312,8 +312,6 @@ record_it:
static void blk_trace_free(struct blk_trace *bt)
{
- debugfs_remove(bt->msg_file);
- debugfs_remove(bt->dropped_file);
relay_close(bt->rchan);
debugfs_remove(bt->dir);
free_percpu(bt->sequence);
@@ -545,10 +543,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
INIT_LIST_HEAD(&bt->running_list);
ret = -EIO;
- bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
- &blk_dropped_fops);
-
- bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
+ debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
+ debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
bt->rchan = relay_open("trace", dir, buts->buf_size,
buts->buf_nr, &blk_relay_callbacks, bt);
@@ -1868,7 +1864,17 @@ void blk_trace_remove_sysfs(struct device *dev)
#ifdef CONFIG_EVENT_TRACING
-void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes)
+/**
+ * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string.
+ * @rwbs: buffer to be filled
+ * @op: REQ_OP_XXX for the tracepoint
+ *
+ * Description:
+ * Maps the REQ_OP_XXX to character and fills the buffer provided by the
+ * caller with resulting string.
+ *
+ **/
+void blk_fill_rwbs(char *rwbs, unsigned int op)
{
int i = 0;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index a02ce89b56b5..1355e6c0d567 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1021,7 +1021,6 @@ static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
- .flags = PROTO_CMSG_DATA_ONLY,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 1fb75f01756c..802f5111805a 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -665,7 +665,6 @@ int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
const struct proto_ops inet6_stream_ops = {
.family = PF_INET6,
- .flags = PROTO_CMSG_DATA_ONLY,
.owner = THIS_MODULE,
.release = inet6_release,
.bind = inet6_bind,
diff --git a/net/socket.c b/net/socket.c
index 23c7842389de..84a8049c2b09 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2413,10 +2413,6 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg,
unsigned int flags)
{
- /* disallow ancillary data requests from this path */
- if (msg->msg_control || msg->msg_controllen)
- return -EINVAL;
-
return ____sys_sendmsg(sock, msg, flags, NULL, 0);
}
@@ -2625,12 +2621,6 @@ long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
struct user_msghdr __user *umsg,
struct sockaddr __user *uaddr, unsigned int flags)
{
- if (msg->msg_control || msg->msg_controllen) {
- /* disallow ancillary data reqs unless cmsg is plain data */
- if (!(sock->ops->flags & PROTO_CMSG_DATA_ONLY))
- return -EINVAL;
- }
-
return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0);
}