From 23d67a54857a768acdb0804cdd6037c324a50ecd Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:00 -0500 Subject: seccomp: Migrate to use SYSCALL_WORK flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SECCOMP, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-5-krisman@collabora.com --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 32083db7a2a2..bc5b1090f415 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1625,7 +1625,7 @@ static void copy_seccomp(struct task_struct *p) * to manually enable the seccomp thread flag here. */ if (p->seccomp.mode != SECCOMP_MODE_DISABLED) - set_tsk_thread_flag(p, TIF_SECCOMP); + set_task_syscall_work(p, SECCOMP); #endif } -- cgit v1.2.3 From 64c19ba29b66e98af9306b4a7525fb22c895d252 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:02 -0500 Subject: ptrace: Migrate to use SYSCALL_TRACE flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SYSCALL_TRACE, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-7-krisman@collabora.com --- include/asm-generic/syscall.h | 15 ++++++++------- include/linux/entry-common.h | 10 ++++++---- include/linux/thread_info.h | 2 ++ include/linux/tracehook.h | 17 +++++++++-------- kernel/entry/common.c | 4 ++-- kernel/fork.c | 2 +- kernel/ptrace.c | 6 +++--- 7 files changed, 31 insertions(+), 25 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index 524d8e68ff5e..ed94e5658d0c 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -43,7 +43,7 @@ int syscall_get_nr(struct task_struct *task, struct pt_regs *regs); * @regs: task_pt_regs() of @task * * It's only valid to call this when @task is stopped for system - * call exit tracing (due to TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT), + * call exit tracing (due to %SYSCALL_WORK_SYSCALL_TRACE or TIF_SYSCALL_AUDIT), * after tracehook_report_syscall_entry() returned nonzero to prevent * the system call from taking place. * @@ -63,7 +63,7 @@ void syscall_rollback(struct task_struct *task, struct pt_regs *regs); * Returns 0 if the system call succeeded, or -ERRORCODE if it failed. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ long syscall_get_error(struct task_struct *task, struct pt_regs *regs); @@ -76,7 +76,7 @@ long syscall_get_error(struct task_struct *task, struct pt_regs *regs); * This value is meaningless if syscall_get_error() returned nonzero. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs); @@ -93,7 +93,7 @@ long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs); * code; the user sees a failed system call with this errno code. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, int error, long val); @@ -108,7 +108,7 @@ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, * @args[0], and so on. * * It's only valid to call this when @task is stopped for tracing on - * entry to a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args); @@ -123,7 +123,7 @@ void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, * The first argument gets value @args[0], and so on. * * It's only valid to call this when @task is stopped for tracing on - * entry to a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, const unsigned long *args); @@ -135,7 +135,8 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, * Returns the AUDIT_ARCH_* based on the system call convention in use. * * It's only valid to call this when @task is stopped on entry to a system - * call, due to %TIF_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or %SYSCALL_WORK_SECCOMP. + * call, due to %SYSCALL_WORK_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or + * %SYSCALL_WORK_SECCOMP. * * Architectures which permit CONFIG_HAVE_ARCH_SECCOMP_FILTER must * provide an implementation of this. diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 2a01eee2dbba..ae426ab9c372 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -41,7 +41,7 @@ #endif #define SYSCALL_ENTER_WORK \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + (_TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_EMU | \ ARCH_SYSCALL_ENTER_WORK) @@ -53,12 +53,14 @@ #endif #define SYSCALL_EXIT_WORK \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + (_TIF_SYSCALL_AUDIT | \ ARCH_SYSCALL_EXIT_WORK) #define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ - SYSCALL_WORK_SYSCALL_TRACEPOINT) -#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT) + SYSCALL_WORK_SYSCALL_TRACEPOINT | \ + SYSCALL_WORK_SYSCALL_TRACE) +#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ + SYSCALL_WORK_SYSCALL_TRACE) /* * TIF flags handled in exit_to_user_mode_loop() diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index c232043c12d3..761a4590d554 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -38,10 +38,12 @@ enum { enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, + SYSCALL_WORK_BIT_SYSCALL_TRACE, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) +#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #include diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index f7d82e4fafd6..3f20368afe9e 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -83,11 +83,12 @@ static inline int ptrace_report_syscall(struct pt_regs *regs, * tracehook_report_syscall_entry - task is about to attempt a system call * @regs: user register state of current task * - * This will be called if %TIF_SYSCALL_TRACE or %TIF_SYSCALL_EMU have been set, - * when the current task has just entered the kernel for a system call. - * Full user register state is available here. Changing the values - * in @regs can affect the system call number and arguments to be tried. - * It is safe to block here, preventing the system call from beginning. + * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or + * %TIF_SYSCALL_EMU have been set, when the current task has just + * entered the kernel for a system call. Full user register state is + * available here. Changing the values in @regs can affect the system + * call number and arguments to be tried. It is safe to block here, + * preventing the system call from beginning. * * Returns zero normally, or nonzero if the calling arch code should abort * the system call. That must prevent normal entry so no system call is @@ -109,15 +110,15 @@ static inline __must_check int tracehook_report_syscall_entry( * @regs: user register state of current task * @step: nonzero if simulating single-step or block-step * - * This will be called if %TIF_SYSCALL_TRACE has been set, when the - * current task has just finished an attempted system call. Full + * This will be called if %SYSCALL_WORK_SYSCALL_TRACE has been set, when + * the current task has just finished an attempted system call. Full * user register state is available here. It is safe to block here, * preventing signals from being processed. * * If @step is nonzero, this report is also in lieu of the normal * trap that would follow the system call instruction because * user_enable_block_step() or user_enable_single_step() was used. - * In this case, %TIF_SYSCALL_TRACE might not be set. + * In this case, %SYSCALL_WORK_SYSCALL_TRACE might not be set. * * Called without locks, just before checking for pending signals. */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index f651967847ec..917328a9edaa 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -47,7 +47,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, long ret = 0; /* Handle ptrace */ - if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { + if (work & SYSCALL_WORK_SYSCALL_TRACE || ti_work & _TIF_SYSCALL_EMU) { ret = arch_syscall_enter_tracehook(regs); if (ret || (ti_work & _TIF_SYSCALL_EMU)) return -1L; @@ -237,7 +237,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, trace_sys_exit(regs, syscall_get_return_value(current, regs)); step = report_single_step(ti_work); - if (step || ti_work & _TIF_SYSCALL_TRACE) + if (step || work & SYSCALL_WORK_SYSCALL_TRACE) arch_syscall_exit_tracehook(regs, step); } diff --git a/kernel/fork.c b/kernel/fork.c index bc5b1090f415..99f68c20f2ff 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2158,7 +2158,7 @@ static __latent_entropy struct task_struct *copy_process( * child regardless of CLONE_PTRACE. */ user_disable_single_step(p); - clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); + clear_task_syscall_work(p, SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 43d6179508d6..55a2bc3186a7 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -117,7 +117,7 @@ void __ptrace_unlink(struct task_struct *child) const struct cred *old_cred; BUG_ON(!child->ptrace); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + clear_task_syscall_work(child, SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif @@ -812,9 +812,9 @@ static int ptrace_resume(struct task_struct *child, long request, return -EIO; if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + set_task_syscall_work(child, SYSCALL_TRACE); else - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + clear_task_syscall_work(child, SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) -- cgit v1.2.3 From 64eb35f701f04b30706e21d1b02636b5d31a37d2 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:03 -0500 Subject: ptrace: Migrate TIF_SYSCALL_EMU to use SYSCALL_WORK flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SYSCALL_EMU, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-8-krisman@collabora.com --- include/linux/entry-common.h | 8 ++------ include/linux/thread_info.h | 2 ++ include/linux/tracehook.h | 2 +- kernel/entry/common.c | 19 ++++++++++--------- kernel/fork.c | 4 ++-- kernel/ptrace.c | 10 +++++----- 6 files changed, 22 insertions(+), 23 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index ae426ab9c372..b30f82bed92b 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -13,10 +13,6 @@ * Define dummy _TIF work flags if not defined by the architecture or for * disabled functionality. */ -#ifndef _TIF_SYSCALL_EMU -# define _TIF_SYSCALL_EMU (0) -#endif - #ifndef _TIF_SYSCALL_AUDIT # define _TIF_SYSCALL_AUDIT (0) #endif @@ -42,7 +38,6 @@ #define SYSCALL_ENTER_WORK \ (_TIF_SYSCALL_AUDIT | \ - _TIF_SYSCALL_EMU | \ ARCH_SYSCALL_ENTER_WORK) /* @@ -58,7 +53,8 @@ #define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ SYSCALL_WORK_SYSCALL_TRACEPOINT | \ - SYSCALL_WORK_SYSCALL_TRACE) + SYSCALL_WORK_SYSCALL_TRACE | \ + SYSCALL_WORK_SYSCALL_EMU) #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 761a4590d554..85b8a4216168 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -39,11 +39,13 @@ enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, SYSCALL_WORK_BIT_SYSCALL_TRACE, + SYSCALL_WORK_BIT_SYSCALL_EMU, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) +#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) #include diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 3f20368afe9e..54b925224a13 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -84,7 +84,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs, * @regs: user register state of current task * * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or - * %TIF_SYSCALL_EMU have been set, when the current task has just + * %SYSCALL_WORK_SYSCALL_EMU have been set, when the current task has just * entered the kernel for a system call. Full user register state is * available here. Changing the values in @regs can affect the system * call number and arguments to be tried. It is safe to block here, diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 917328a9edaa..90533f34ea99 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -47,9 +47,9 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, long ret = 0; /* Handle ptrace */ - if (work & SYSCALL_WORK_SYSCALL_TRACE || ti_work & _TIF_SYSCALL_EMU) { + if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { ret = arch_syscall_enter_tracehook(regs); - if (ret || (ti_work & _TIF_SYSCALL_EMU)) + if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) return -1L; } @@ -208,21 +208,22 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) } #ifndef _TIF_SINGLESTEP -static inline bool report_single_step(unsigned long ti_work) +static inline bool report_single_step(unsigned long work) { return false; } #else /* - * If TIF_SYSCALL_EMU is set, then the only reason to report is when + * If SYSCALL_EMU is set, then the only reason to report is when * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall * instruction has been already reported in syscall_enter_from_user_mode(). */ -#define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU) - -static inline bool report_single_step(unsigned long ti_work) +static inline bool report_single_step(unsigned long work) { - return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP; + if (!(work & SYSCALL_WORK_SYSCALL_EMU)) + return false; + + return !!(current_thread_info()->flags & _TIF_SINGLESTEP); } #endif @@ -236,7 +237,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) trace_sys_exit(regs, syscall_get_return_value(current, regs)); - step = report_single_step(ti_work); + step = report_single_step(work); if (step || work & SYSCALL_WORK_SYSCALL_TRACE) arch_syscall_exit_tracehook(regs, step); } diff --git a/kernel/fork.c b/kernel/fork.c index 99f68c20f2ff..02b689a23457 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2159,8 +2159,8 @@ static __latent_entropy struct task_struct *copy_process( */ user_disable_single_step(p); clear_task_syscall_work(p, SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU - clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); +#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) + clear_task_syscall_work(p, SYSCALL_EMU); #endif clear_tsk_latency_tracing(p); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 55a2bc3186a7..237bcd6d255c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -118,8 +118,8 @@ void __ptrace_unlink(struct task_struct *child) BUG_ON(!child->ptrace); clear_task_syscall_work(child, SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); +#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) + clear_task_syscall_work(child, SYSCALL_EMU); #endif child->parent = child->real_parent; @@ -816,11 +816,11 @@ static int ptrace_resume(struct task_struct *child, long request, else clear_task_syscall_work(child, SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU +#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) - set_tsk_thread_flag(child, TIF_SYSCALL_EMU); + set_task_syscall_work(child, SYSCALL_EMU); else - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); + clear_task_syscall_work(child, SYSCALL_EMU); #endif if (is_singleblock(request)) { -- cgit v1.2.3 From 1446e1df9eb183fdf81c3f0715402f1d7595d4cb Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 27 Nov 2020 14:32:34 -0500 Subject: kernel: Implement selective syscall userspace redirection Introduce a mechanism to quickly disable/enable syscall handling for a specific process and redirect to userspace via SIGSYS. This is useful for processes with parts that require syscall redirection and parts that don't, but who need to perform this boundary crossing really fast, without paying the cost of a system call to reconfigure syscall handling on each boundary transition. This is particularly important for Windows games running over Wine. The proposed interface looks like this: prctl(PR_SET_SYSCALL_USER_DISPATCH, , , , [selector]) The range [,+) is a part of the process memory map that is allowed to by-pass the redirection code and dispatch syscalls directly, such that in fast paths a process doesn't need to disable the trap nor the kernel has to check the selector. This is essential to return from SIGSYS to a blocked area without triggering another SIGSYS from rt_sigreturn. selector is an optional pointer to a char-sized userspace memory region that has a key switch for the mechanism. This key switch is set to either PR_SYS_DISPATCH_ON, PR_SYS_DISPATCH_OFF to enable and disable the redirection without calling the kernel. The feature is meant to be set per-thread and it is disabled on fork/clone/execv. Internally, this doesn't add overhead to the syscall hot path, and it requires very little per-architecture support. I avoided using seccomp, even though it duplicates some functionality, due to previous feedback that maybe it shouldn't mix with seccomp since it is not a security mechanism. And obviously, this should never be considered a security mechanism, since any part of the program can by-pass it by using the syscall dispatcher. For the sysinfo benchmark, which measures the overhead added to executing a native syscall that doesn't require interception, the overhead using only the direct dispatcher region to issue syscalls is pretty much irrelevant. The overhead of using the selector goes around 40ns for a native (unredirected) syscall in my system, and it is (as expected) dominated by the supervisor-mode user-address access. In fact, with SMAP off, the overhead is consistently less than 5ns on my test box. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Acked-by: Kees Cook Link: https://lore.kernel.org/r/20201127193238.821364-4-krisman@collabora.com --- fs/exec.c | 3 + include/linux/sched.h | 2 + include/linux/syscall_user_dispatch.h | 40 +++++++++++++ include/linux/thread_info.h | 2 + include/uapi/linux/prctl.h | 5 ++ kernel/entry/Makefile | 2 +- kernel/entry/common.h | 7 +++ kernel/entry/syscall_user_dispatch.c | 104 ++++++++++++++++++++++++++++++++++ kernel/fork.c | 1 + kernel/sys.c | 5 ++ 10 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 include/linux/syscall_user_dispatch.h create mode 100644 kernel/entry/common.h create mode 100644 kernel/entry/syscall_user_dispatch.c (limited to 'kernel/fork.c') diff --git a/fs/exec.c b/fs/exec.c index 547a2390baf5..aee36e5733ce 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -1302,6 +1303,8 @@ int begin_new_exec(struct linux_binprm * bprm) flush_thread(); me->personality &= ~bprm->per_clear; + clear_syscall_work_syscall_user_dispatch(me); + /* * We have to apply CLOEXEC before we change whether the process is * dumpable (in setup_new_exec) to avoid a race with a process in userspace diff --git a/include/linux/sched.h b/include/linux/sched.h index 063cd120b459..5a24a033b3f8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -965,6 +966,7 @@ struct task_struct { unsigned int sessionid; #endif struct seccomp seccomp; + struct syscall_user_dispatch syscall_dispatch; /* Thread group tracking: */ u64 parent_exec_id; diff --git a/include/linux/syscall_user_dispatch.h b/include/linux/syscall_user_dispatch.h new file mode 100644 index 000000000000..a0ae443fb7df --- /dev/null +++ b/include/linux/syscall_user_dispatch.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Collabora Ltd. + */ +#ifndef _SYSCALL_USER_DISPATCH_H +#define _SYSCALL_USER_DISPATCH_H + +#include + +#ifdef CONFIG_GENERIC_ENTRY + +struct syscall_user_dispatch { + char __user *selector; + unsigned long offset; + unsigned long len; + bool on_dispatch; +}; + +int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector); + +#define clear_syscall_work_syscall_user_dispatch(tsk) \ + clear_task_syscall_work(tsk, SYSCALL_USER_DISPATCH) + +#else +struct syscall_user_dispatch {}; + +static inline int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector) +{ + return -EINVAL; +} + +static inline void clear_syscall_work_syscall_user_dispatch(struct task_struct *tsk) +{ +} + +#endif /* CONFIG_GENERIC_ENTRY */ + +#endif /* _SYSCALL_USER_DISPATCH_H */ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ca80a214df09..c8a974cead73 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -42,6 +42,7 @@ enum syscall_work_bit { SYSCALL_WORK_BIT_SYSCALL_TRACE, SYSCALL_WORK_BIT_SYSCALL_EMU, SYSCALL_WORK_BIT_SYSCALL_AUDIT, + SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) @@ -49,6 +50,7 @@ enum syscall_work_bit { #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) #define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) +#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH) #endif #include diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 7f0827705c9a..90deb41c8a34 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -247,4 +247,9 @@ struct prctl_mm_map { #define PR_SET_IO_FLUSHER 57 #define PR_GET_IO_FLUSHER 58 +/* Dispatch syscalls to a userspace handler */ +#define PR_SET_SYSCALL_USER_DISPATCH 59 +# define PR_SYS_DISPATCH_OFF 0 +# define PR_SYS_DISPATCH_ON 1 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index 34c8a3f1c735..095c775e001e 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -9,5 +9,5 @@ KCOV_INSTRUMENT := n CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong CFLAGS_common.o += -fno-stack-protector -obj-$(CONFIG_GENERIC_ENTRY) += common.o +obj-$(CONFIG_GENERIC_ENTRY) += common.o syscall_user_dispatch.o obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o diff --git a/kernel/entry/common.h b/kernel/entry/common.h new file mode 100644 index 000000000000..f6e6d02f07fe --- /dev/null +++ b/kernel/entry/common.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _COMMON_H +#define _COMMON_H + +bool syscall_user_dispatch(struct pt_regs *regs); + +#endif diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c new file mode 100644 index 000000000000..b0338a5625d9 --- /dev/null +++ b/kernel/entry/syscall_user_dispatch.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Collabora Ltd. + */ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "common.h" + +static void trigger_sigsys(struct pt_regs *regs) +{ + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGSYS; + info.si_code = SYS_USER_DISPATCH; + info.si_call_addr = (void __user *)KSTK_EIP(current); + info.si_errno = 0; + info.si_arch = syscall_get_arch(current); + info.si_syscall = syscall_get_nr(current, regs); + + force_sig_info(&info); +} + +bool syscall_user_dispatch(struct pt_regs *regs) +{ + struct syscall_user_dispatch *sd = ¤t->syscall_dispatch; + char state; + + if (likely(instruction_pointer(regs) - sd->offset < sd->len)) + return false; + + if (unlikely(arch_syscall_is_vdso_sigreturn(regs))) + return false; + + if (likely(sd->selector)) { + /* + * access_ok() is performed once, at prctl time, when + * the selector is loaded by userspace. + */ + if (unlikely(__get_user(state, sd->selector))) + do_exit(SIGSEGV); + + if (likely(state == PR_SYS_DISPATCH_OFF)) + return false; + + if (state != PR_SYS_DISPATCH_ON) + do_exit(SIGSYS); + } + + sd->on_dispatch = true; + syscall_rollback(current, regs); + trigger_sigsys(regs); + + return true; +} + +int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector) +{ + switch (mode) { + case PR_SYS_DISPATCH_OFF: + if (offset || len || selector) + return -EINVAL; + break; + case PR_SYS_DISPATCH_ON: + /* + * Validate the direct dispatcher region just for basic + * sanity against overflow and a 0-sized dispatcher + * region. If the user is able to submit a syscall from + * an address, that address is obviously valid. + */ + if (offset && offset + len <= offset) + return -EINVAL; + + if (selector && !access_ok(selector, sizeof(*selector))) + return -EFAULT; + + break; + default: + return -EINVAL; + } + + current->syscall_dispatch.selector = selector; + current->syscall_dispatch.offset = offset; + current->syscall_dispatch.len = len; + current->syscall_dispatch.on_dispatch = false; + + if (mode == PR_SYS_DISPATCH_ON) + set_syscall_work(SYSCALL_USER_DISPATCH); + else + clear_syscall_work(SYSCALL_USER_DISPATCH); + + return 0; +} diff --git a/kernel/fork.c b/kernel/fork.c index 02b689a23457..4a5ecb41f440 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -906,6 +906,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); set_task_stack_end_magic(tsk); + clear_syscall_work_syscall_user_dispatch(tsk); #ifdef CONFIG_STACKPROTECTOR tsk->stack_canary = get_random_canary(); diff --git a/kernel/sys.c b/kernel/sys.c index a730c03ee607..51f00fe20e4d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -2530,6 +2531,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER; break; + case PR_SET_SYSCALL_USER_DISPATCH: + error = set_syscall_user_dispatch(arg2, arg3, arg4, + (char __user *) arg5); + break; default: error = -EINVAL; break; -- cgit v1.2.3