From 7c9f8861e6c9c839f913e49b98c3854daca18f27 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 22 Apr 2008 16:38:23 -0500 Subject: stackprotector: use canary at end of stack to indicate overruns at oops time (Updated with a common max-stack-used checker that knows about the canary, as suggested by Joe Perches) Use a canary at the end of the stack to clearly indicate at oops time whether the stack has ever overflowed. This is a very simple implementation with a couple of drawbacks: 1) a thread may legitimately use exactly up to the last word on the stack -- but the chances of doing this and then oopsing later seem slim 2) it's possible that the stack usage isn't dense enough that the canary location could get skipped over -- but the worst that happens is that we don't flag the overrun -- though this happens fairly often in my testing :( With the code in place, an intentionally-bloated stack oops might do: BUG: unable to handle kernel paging request at ffff8103f84cc680 IP: [] update_curr+0x9a/0xa8 PGD 8063 PUD 0 Thread overran stack or stack corrupted Oops: 0000 [1] SMP CPU 0 ... ... unless the stack overrun is so bad that it corrupts some other thread. Signed-off-by: Eric Sandeen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index cfa222a91539..a964ed945094 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5748,12 +5748,7 @@ void sched_show_task(struct task_struct *p) printk(KERN_CONT " %016lx ", thread_saved_pc(p)); #endif #ifdef CONFIG_DEBUG_STACK_USAGE - { - unsigned long *n = end_of_stack(p); - while (!*n) - n++; - free = (unsigned long)n - (unsigned long)end_of_stack(p); - } + free = stack_not_used(p); #endif printk(KERN_CONT "%5lu %5d %6d\n", free, task_pid_nr(p), task_pid_nr(p->real_parent)); -- cgit v1.2.3 From 01e3eb82278bf45221fc38b391bc5ee0f6a314d6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 Jan 2009 13:00:50 +0100 Subject: Revert "sched: improve preempt debugging" This reverts commit 7317d7b87edb41a9135e30be1ec3f7ef817c53dd. This has been reported (and bisected) by Alexey Zaytsev and Kamalesh Babulal to produce annoying warnings during bootup on both x86 and powerpc. kernel_locked() is not a valid test in IRQ context (we update the BKL's ->lock_depth and the preempt count separately and non-atomically), so we cannot put it into the generic preempt debugging checks which can run in IRQ contexts too. Reported-and-bisected-by: Alexey Zaytsev Reported-and-bisected-by: Kamalesh Babulal Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8be2c13b50d0..3b630d882660 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4440,7 +4440,7 @@ void __kprobes sub_preempt_count(int val) /* * Underflow? */ - if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked()))) + if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) return; /* * Is the spinlock portion underflowing? 
-- cgit v1.2.3 From 5add95d4f7cf08f6f62510f19576992912387501 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:08 +0100 Subject: [CVE-2009-0029] System call wrappers part 06 Signed-off-by: Heiko Carstens --- kernel/sched.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8be2c13b50d0..1a0fdfa5ddf9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5126,7 +5126,7 @@ int can_nice(const struct task_struct *p, const int nice) * sys_setpriority is a more generic, but much slower function that * does similar things. */ -asmlinkage long sys_nice(int increment) +SYSCALL_DEFINE1(nice, int, increment) { long nice, retval; @@ -5433,8 +5433,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) * @policy: new policy. * @param: structure containing the new RT priority. */ -asmlinkage long -sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, + struct sched_param __user *, param) { /* negative values for policy are not valid */ if (policy < 0) @@ -5448,7 +5448,7 @@ sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) * @pid: the pid in question. * @param: structure containing the new RT priority. */ -asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) +SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) { return do_sched_setscheduler(pid, -1, param); } @@ -5457,7 +5457,7 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) * sys_sched_getscheduler - get the policy (scheduling class) of a thread * @pid: the pid in question. */ -asmlinkage long sys_sched_getscheduler(pid_t pid) +SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) { struct task_struct *p; int retval; @@ -5482,7 +5482,7 @@ asmlinkage long sys_sched_getscheduler(pid_t pid) * @pid: the pid in question. * @param: structure containing the RT priority. */ -asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) +SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) { struct sched_param lp; struct task_struct *p; @@ -5600,8 +5600,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to the new cpu mask */ -asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, - unsigned long __user *user_mask_ptr) +SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) { cpumask_var_t new_mask; int retval; @@ -5648,8 +5648,8 @@ out_unlock: * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask */ -asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, - unsigned long __user *user_mask_ptr) +SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) { int ret; cpumask_var_t mask; @@ -5678,7 +5678,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, * This function yields the current CPU to other tasks. If there are no * other threads running on this CPU then this function will return. 
*/ -asmlinkage long sys_sched_yield(void) +SYSCALL_DEFINE0(sched_yield) { struct rq *rq = this_rq_lock(); @@ -5819,7 +5819,7 @@ long __sched io_schedule_timeout(long timeout) * this syscall returns the maximum rt_priority that can be used * by a given scheduling class. */ -asmlinkage long sys_sched_get_priority_max(int policy) +SYSCALL_DEFINE1(sched_get_priority_max, int, policy) { int ret = -EINVAL; @@ -5844,7 +5844,7 @@ asmlinkage long sys_sched_get_priority_max(int policy) * this syscall returns the minimum rt_priority that can be used * by a given scheduling class. */ -asmlinkage long sys_sched_get_priority_min(int policy) +SYSCALL_DEFINE1(sched_get_priority_min, int, policy) { int ret = -EINVAL; -- cgit v1.2.3 From 754fe8d297bfae7b77f7ce866e2fb0c5fb186506 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:09 +0100 Subject: [CVE-2009-0029] System call wrappers part 07 Signed-off-by: Heiko Carstens --- kernel/exit.c | 8 ++++---- kernel/kexec.c | 5 ++--- kernel/sched.c | 4 ++-- kernel/signal.c | 2 +- kernel/sys.c | 7 ++++--- net/socket.c | 2 +- 6 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/exit.c b/kernel/exit.c index fac9b040af2c..08895df0eab3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1141,7 +1141,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code) EXPORT_SYMBOL(complete_and_exit); -asmlinkage long sys_exit(int error_code) +SYSCALL_DEFINE1(exit, int, error_code) { do_exit((error_code&0xff)<<8); } @@ -1182,7 +1182,7 @@ do_group_exit(int exit_code) * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ -asmlinkage long sys_exit_group(int error_code) +SYSCALL_DEFINE1(exit_group, int, error_code) { do_group_exit((error_code & 0xff) << 8); /* NOTREACHED */ @@ -1795,8 +1795,8 @@ asmlinkage long sys_waitid(int which, pid_t upid, return ret; } -asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr, - int options, struct rusage __user *ru) +SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, + int, options, struct rusage __user *, ru) { struct pid *pid = NULL; enum pid_type type; diff --git a/kernel/kexec.c b/kernel/kexec.c index 3fb855ad6aa0..8a6d7b08864e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -934,9 +934,8 @@ struct kimage *kexec_crash_image; static DEFINE_MUTEX(kexec_mutex); -asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, - struct kexec_segment __user *segments, - unsigned long flags) +SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, + struct kexec_segment __user *, segments, unsigned long, flags) { struct kimage **dest_image, *image; int result; diff --git a/kernel/sched.c b/kernel/sched.c index 1a0fdfa5ddf9..65c02037b052 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5869,8 +5869,8 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) * this syscall writes the default timeslice value of a given process * into the user-space timespec buffer. A value of '0' means infinity. */ -asmlinkage -long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) +SYSCALL_DEFINE4(sched_rr_get_interval, pid_t, pid, + struct timespec __user *, interval) { struct task_struct *p; unsigned int time_slice; diff --git a/kernel/signal.c b/kernel/signal.c index 3fe08eaa5dea..41f32e08615e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1961,7 +1961,7 @@ EXPORT_SYMBOL(unblock_all_signals); * System call entry points. 
*/ -asmlinkage long sys_restart_syscall(void) +SYSCALL_DEFINE0(restart_syscall) { struct restart_block *restart = &current_thread_info()->restart_block; return restart->fn(restart); diff --git a/kernel/sys.c b/kernel/sys.c index cbe4502c28a1..39b192b40034 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -143,7 +143,7 @@ out: return error; } -asmlinkage long sys_setpriority(int which, int who, int niceval) +SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) { struct task_struct *g, *p; struct user_struct *user; @@ -208,7 +208,7 @@ out: * has been offset by 20 (ie it returns 40..1 instead of -20..19) * to stay compatible. */ -asmlinkage long sys_getpriority(int which, int who) +SYSCALL_DEFINE2(getpriority, int, which, int, who) { struct task_struct *g, *p; struct user_struct *user; @@ -355,7 +355,8 @@ EXPORT_SYMBOL_GPL(kernel_power_off); * * reboot doesn't sync: do that yourself before calling this. */ -asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user * arg) +SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, + void __user *, arg) { char buffer[256]; diff --git a/net/socket.c b/net/socket.c index 06603d73c411..cc9b666e58f6 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1789,7 +1789,7 @@ out_put: * Shutdown a socket. */ -asmlinkage long sys_shutdown(int fd, int how) +SYSCALL_DEFINE2(shutdown, int, fd, int, how) { int err, fput_needed; struct socket *sock; -- cgit v1.2.3 From 17da2bd90abf428523de0fb98f7075e00e3ed42e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:10 +0100 Subject: [CVE-2009-0029] System call wrappers part 08 Signed-off-by: Heiko Carstens --- kernel/exit.c | 7 +++---- kernel/fork.c | 2 +- kernel/futex.c | 6 +++--- kernel/module.c | 10 ++++------ kernel/sched.c | 2 +- kernel/signal.c | 18 +++++++----------- 6 files changed, 19 insertions(+), 26 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/exit.c b/kernel/exit.c index 08895df0eab3..f80dec3f1875 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1754,9 +1754,8 @@ end: return retval; } -asmlinkage long sys_waitid(int which, pid_t upid, - struct siginfo __user *infop, int options, - struct rusage __user *ru) +SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, + infop, int, options, struct rusage __user *, ru) { struct pid *pid = NULL; enum pid_type type; @@ -1833,7 +1832,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, * sys_waitpid() remains for compatibility. waitpid() should be * implemented by calling sys_wait4() from libc.a. 
*/ -asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options) +SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { return sys_wait4(pid, stat_addr, options, NULL); } diff --git a/kernel/fork.c b/kernel/fork.c index 1d68f1255dd8..8eb37d38c6a4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -901,7 +901,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) clear_freeze_flag(p); } -asmlinkage long sys_set_tid_address(int __user *tidptr) +SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; diff --git a/kernel/futex.c b/kernel/futex.c index 002aa189eb09..e86931d8d4e9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1978,9 +1978,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, } -asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, - struct timespec __user *utime, u32 __user *uaddr2, - u32 val3) +SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + struct timespec __user *, utime, u32 __user *, uaddr2, + u32, val3) { struct timespec ts; ktime_t t, *tp = NULL; diff --git a/kernel/module.c b/kernel/module.c index c9332c90d5a0..e8b51d41dd72 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -743,8 +743,8 @@ static void wait_for_zero_refcount(struct module *mod) mutex_lock(&module_mutex); } -asmlinkage long -sys_delete_module(const char __user *name_user, unsigned int flags) +SYSCALL_DEFINE2(delete_module, const char __user *, name_user, + unsigned int, flags) { struct module *mod; char name[MODULE_NAME_LEN]; @@ -2296,10 +2296,8 @@ static noinline struct module *load_module(void __user *umod, } /* This is where the real work happens */ -asmlinkage long -sys_init_module(void __user *umod, - unsigned long len, - const char __user *uargs) +SYSCALL_DEFINE3(init_module, void __user *, umod, + unsigned long, len, const char __user *, uargs) { struct module *mod; int ret = 0; diff --git a/kernel/sched.c b/kernel/sched.c index 65c02037b052..eb1931eef587 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5869,7 +5869,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) * this syscall writes the default timeslice value of a given process * into the user-space timespec buffer. A value of '0' means infinity. 
*/ -SYSCALL_DEFINE4(sched_rr_get_interval, pid_t, pid, +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, struct timespec __user *, interval) { struct task_struct *p; diff --git a/kernel/signal.c b/kernel/signal.c index 41f32e08615e..278cc8737f17 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2014,8 +2014,8 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) return error; } -asmlinkage long -sys_rt_sigprocmask(int how, sigset_t __user *set, sigset_t __user *oset, size_t sigsetsize) +SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, + sigset_t __user *, oset, size_t, sigsetsize) { int error = -EINVAL; sigset_t old_set, new_set; @@ -2074,8 +2074,7 @@ out: return error; } -asmlinkage long -sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize) +SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) { return do_sigpending(set, sigsetsize); } @@ -2146,11 +2145,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) #endif -asmlinkage long -sys_rt_sigtimedwait(const sigset_t __user *uthese, - siginfo_t __user *uinfo, - const struct timespec __user *uts, - size_t sigsetsize) +SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, + siginfo_t __user *, uinfo, const struct timespec __user *, uts, + size_t, sigsetsize) { int ret, sig; sigset_t these; @@ -2223,8 +2220,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, return ret; } -asmlinkage long -sys_kill(pid_t pid, int sig) +SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct siginfo info; -- cgit v1.2.3 From 98a4826b99bc4bcc34c604b2fc4fcf4d771600ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Jan 2009 10:56:32 +0100 Subject: sched: fix bandwidth validation for UID grouping Impact: make rt-limit tunables work again Mark Glines reported: > I've got an issue on x86-64 where I can't configure the system to allow > RT tasks for a non-root user. > > In 2.6.26.5, I was able to do the following to set things up nicely: > echo 450000 >/sys/kernel/uids/0/cpu_rt_runtime > echo 450000 >/sys/kernel/uids/1000/cpu_rt_runtime > > Seems like every value I try to echo into the /sys files returns EINVAL. For UID grouping we initialize the root group with infinite bandwidth which by default is actually more than the global limit, therefore the bandwidth check always fails. Because the root group is a phantom group (for UID grouping) we cannot runtime adjust it, therefore we let it reflect the global bandwidth settings. Reported-by: Mark Glines Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3b630d882660..ed62d1cee05c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9050,6 +9050,13 @@ static int tg_schedulable(struct task_group *tg, void *data) runtime = d->rt_runtime; } +#ifdef CONFIG_USER_SCHED + if (tg == &root_task_group) { + period = global_rt_period(); + runtime = global_rt_runtime(); + } +#endif + /* * Cannot have more runtime than the period. */ -- cgit v1.2.3 From cce7ade803699463ecc62a065ca522004f7ccb3d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Jan 2009 14:53:37 +0100 Subject: sched: SCHED_IDLE weight change Increase the SCHED_IDLE weight from 2 to 3, this gives much more stable vruntime numbers. 
time advanced in 100ms: weight=2 64765.988352 67012.881408 88501.412352 weight=3 35496.181411 34130.971298 35497.411573 Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ed62d1cee05c..6acfb3c2398b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1323,8 +1323,8 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) * slice expiry etc. */ -#define WEIGHT_IDLEPRIO 2 -#define WMULT_IDLEPRIO (1 << 31) +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 /* * Nice levels are multiplicative, with a gentle 10% change for every -- cgit v1.2.3 From d942fb6c7d391baba3dddb566eb735fbf3df8528 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 26 Jan 2009 17:56:17 +0100 Subject: sched: fix sync wakeups Pawel Dziekonski reported that the openssl benchmark and his quantum chemistry application both show slowdowns due to the scheduler under-parallelizing execution. The reason are pipe wakeups still doing 'sync' wakeups which overrides the normal buddy wakeup logic - even if waker and wakee are loosely coupled. Fix an inversion of logic in the buddy wakeup code. Reported-by: Pawel Dziekonski Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ kernel/sched_fair.c | 11 ++--------- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 52bbf1c842a8..770b1f9ebe14 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2266,6 +2266,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!sched_feat(SYNC_WAKEUPS)) sync = 0; + if (!sync && (current->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost)) + sync = 1; + #ifdef CONFIG_SMP if (sched_feat(LB_WAKEUP_UPDATE)) { struct sched_domain *sd; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5cc1c162044f..fdc417504681 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1179,20 +1179,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, int idx, unsigned long load, unsigned long this_load, unsigned int imbalance) { - struct task_struct *curr = this_rq->curr; - struct task_group *tg; unsigned long tl = this_load; unsigned long tl_per_task; + struct task_group *tg; unsigned long weight; int balanced; if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; - if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || - p->se.avg_overlap > sysctl_sched_migration_cost)) - sync = 0; - /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -1419,9 +1414,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (!sched_feat(WAKEUP_PREEMPT)) return; - if (sched_feat(WAKEUP_OVERLAP) && (sync || - (se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost))) { + if (sched_feat(WAKEUP_OVERLAP) && sync) { resched_task(curr); return; } -- cgit v1.2.3 From 1596e29773eadd96b0a5fc6e736afa52394cafda Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 28 Jan 2009 14:51:38 +0100 Subject: sched: symmetric sync vs avg_overlap Reinstate the weakening of the sync hint if set. This yields a more symmetric usage of avg_overlap. 
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 770b1f9ebe14..242d0d47a70d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2266,9 +2266,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!sched_feat(SYNC_WAKEUPS)) sync = 0; - if (!sync && (current->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost)) - sync = 1; + if (!sync) { + if (current->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost) + sync = 1; + } else { + if (current->se.avg_overlap >= sysctl_sched_migration_cost || + p->se.avg_overlap >= sysctl_sched_migration_cost) + sync = 0; + } #ifdef CONFIG_SMP if (sched_feat(LB_WAKEUP_UPDATE)) { -- cgit v1.2.3 From 483b4ee60edbefdfbff0dd538fb81f368d9e7c0d Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 4 Feb 2009 11:59:44 -0800 Subject: sched: fix nohz load balancer on cpu offline Christian Borntraeger reports: > After a logical cpu offline, even on a complete idle system, there > is one cpu with full ticks. It turns out that nohz.cpu_mask has the > offlined cpu still set. > > In select_nohz_load_balancer() we check if the system is completely > idle to turn off load balancing. We compare cpu_online_map with > nohz.cpu_mask. Since cpu_online_map is updated on cpu unplug, > but nohz.cpu_mask is not, the check fails and the scheduler believes > that we need an "idle load balancer" even on a fully idle system. > Since the ilb cpu does not deactivate the timer tick this breaks NOHZ. Fix the select_nohz_load_balancer() to not set the nohz.cpu_mask while a cpu is going offline. Reported-by: Christian Borntraeger Signed-off-by: Suresh Siddha Tested-by: Christian Borntraeger Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 242d0d47a70d..e1fc67d0674c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3890,19 +3890,24 @@ int select_nohz_load_balancer(int stop_tick) int cpu = smp_processor_id(); if (stop_tick) { - cpumask_set_cpu(cpu, nohz.cpu_mask); cpu_rq(cpu)->in_nohz_recently = 1; - /* - * If we are going offline and still the leader, give up! - */ - if (!cpu_active(cpu) && - atomic_read(&nohz.load_balancer) == cpu) { + if (!cpu_active(cpu)) { + if (atomic_read(&nohz.load_balancer) != cpu) + return 0; + + /* + * If we are going offline and still the leader, + * give up! + */ if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) BUG(); + return 0; } + cpumask_set_cpu(cpu, nohz.cpu_mask); + /* time for ilb owner also to sleep */ if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) -- cgit v1.2.3 From 777c6c5f1f6e757ae49ecca2ed72d6b1f523c007 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 4 Feb 2009 15:12:14 -0800 Subject: wait: prevent exclusive waiter starvation With exclusive waiters, every process woken up through the wait queue must ensure that the next waiter down the line is woken when it has finished. Interruptible waiters don't do that when aborting due to a signal. And if an aborting waiter is concurrently woken up through the waitqueue, no one will ever wake up the next waiter. 
This has been observed with __wait_on_bit_lock() used by lock_page_killable(): the first contender on the queue was aborting when the actual lock holder woke it up concurrently. The aborted contender didn't acquire the lock and therefore never did an unlock followed by waking up the next waiter. Add abort_exclusive_wait() which removes the process' wait descriptor from the waitqueue, iff still queued, or wakes up the next waiter otherwise. It does so under the waitqueue lock. Racing with a wake up means the aborting process is either already woken (removed from the queue) and will wake up the next waiter, or it will remove itself from the queue and the concurrent wake up will apply to the next waiter after it. Use abort_exclusive_wait() in __wait_event_interruptible_exclusive() and __wait_on_bit_lock() when they were interrupted by other means than a wake up through the queue. [akpm@linux-foundation.org: coding-style fixes] Reported-by: Chris Mason Signed-off-by: Johannes Weiner Mentored-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Matthew Wilcox Cc: Chuck Lever Cc: Nick Piggin Cc: Ingo Molnar Cc: ["after some testing"] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 11 ++++++++-- kernel/sched.c | 4 ++-- kernel/wait.c | 59 +++++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 63 insertions(+), 11 deletions(-) (limited to 'kernel/sched.c') diff --git a/include/linux/wait.h b/include/linux/wait.h index ef609f842fac..a210ede73b56 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -132,6 +132,8 @@ static inline void __remove_wait_queue(wait_queue_head_t *head, list_del(&old->task_list); } +void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int sync, void *key); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); extern void __wake_up_locked(wait_queue_head_t *q, unsigned int mode); extern void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); @@ -333,16 +335,19 @@ do { \ for (;;) { \ prepare_to_wait_exclusive(&wq, &__wait, \ TASK_INTERRUPTIBLE); \ - if (condition) \ + if (condition) { \ + finish_wait(&wq, &__wait); \ break; \ + } \ if (!signal_pending(current)) { \ schedule(); \ continue; \ } \ ret = -ERESTARTSYS; \ + abort_exclusive_wait(&wq, &__wait, \ + TASK_INTERRUPTIBLE, NULL); \ break; \ } \ - finish_wait(&wq, &__wait); \ } while (0) #define wait_event_interruptible_exclusive(wq, condition) \ @@ -431,6 +436,8 @@ extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state); void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); +void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, + unsigned int mode, void *key); int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); diff --git a/kernel/sched.c b/kernel/sched.c index 242d0d47a70d..8ee437a5ec1d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4697,8 +4697,8 @@ EXPORT_SYMBOL(default_wake_function); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. 
*/ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key) +void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int sync, void *key) { wait_queue_t *curr, *next; diff --git a/kernel/wait.c b/kernel/wait.c index cd87131f2fc2..42a2dbc181c8 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -91,6 +91,15 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait_exclusive); +/* + * finish_wait - clean up after waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + */ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; @@ -117,6 +126,39 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(finish_wait); +/* + * abort_exclusive_wait - abort exclusive waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * @state: runstate of the waiter to be woken + * @key: key to identify a wait bit queue or %NULL + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + * + * Wakes up the next waiter if the caller is concurrently + * woken up through the queue. + * + * This prevents waiter starvation where an exclusive waiter + * aborts and is woken up concurrently and noone wakes up + * the next waiter. + */ +void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, + unsigned int mode, void *key) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + spin_lock_irqsave(&q->lock, flags); + if (!list_empty(&wait->task_list)) + list_del_init(&wait->task_list); + else if (waitqueue_active(q)) + __wake_up_common(q, mode, 1, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(abort_exclusive_wait); + int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); @@ -177,17 +219,20 @@ int __sched __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, int (*action)(void *), unsigned mode) { - int ret = 0; - do { + int ret; + prepare_to_wait_exclusive(wq, &q->wait, mode); - if (test_bit(q->key.bit_nr, q->key.flags)) { - if ((ret = (*action)(q->key.flags))) - break; - } + if (!test_bit(q->key.bit_nr, q->key.flags)) + continue; + ret = action(q->key.flags); + if (!ret) + continue; + abort_exclusive_wait(wq, &q->wait, mode, &q->key); + return ret; } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); finish_wait(wq, &q->wait); - return ret; + return 0; } EXPORT_SYMBOL(__wait_on_bit_lock); -- cgit v1.2.3 From fc631c82e1734d718ff0832558f64c8f5d185f26 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Feb 2009 14:27:17 +0100 Subject: sched: revert recent sync wakeup changes Intel reported a 10% regression (mysql+sysbench) on a 16-way machine with these patches: 1596e29: sched: symmetric sync vs avg_overlap d942fb6: sched: fix sync wakeups Revert them. 
Reported-by: "Zhang, Yanmin" Bisected-by: Lin Ming Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ---------- kernel/sched_fair.c | 11 +++++++++-- 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e1fc67d0674c..f11c02b86c73 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2266,16 +2266,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!sched_feat(SYNC_WAKEUPS)) sync = 0; - if (!sync) { - if (current->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) - sync = 1; - } else { - if (current->se.avg_overlap >= sysctl_sched_migration_cost || - p->se.avg_overlap >= sysctl_sched_migration_cost) - sync = 0; - } - #ifdef CONFIG_SMP if (sched_feat(LB_WAKEUP_UPDATE)) { struct sched_domain *sd; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a7e50ba185ac..0566f2a03c42 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1191,15 +1191,20 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, int idx, unsigned long load, unsigned long this_load, unsigned int imbalance) { + struct task_struct *curr = this_rq->curr; + struct task_group *tg; unsigned long tl = this_load; unsigned long tl_per_task; - struct task_group *tg; unsigned long weight; int balanced; if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; + if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || + p->se.avg_overlap > sysctl_sched_migration_cost)) + sync = 0; + /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -1426,7 +1431,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (!sched_feat(WAKEUP_PREEMPT)) return; - if (sched_feat(WAKEUP_OVERLAP) && sync) { + if (sched_feat(WAKEUP_OVERLAP) && (sync || + (se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost))) { resched_task(curr); return; } -- cgit v1.2.3 From a0490fa35dc0022ef95f64802e8edf18c411c790 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 Feb 2009 11:35:40 +0100 Subject: sched: cpu hotplug fix rq_attach_root() does a kfree() with the runqueue lock held. That's not a very wise move, fix it. 
Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index c1d0ed360088..410eec404133 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6944,20 +6944,26 @@ static void free_rootdomain(struct root_domain *rd) static void rq_attach_root(struct rq *rq, struct root_domain *rd) { + struct root_domain *old_rd = NULL; unsigned long flags; spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - struct root_domain *old_rd = rq->rd; + old_rd = rq->rd; if (cpumask_test_cpu(rq->cpu, old_rd->online)) set_rq_offline(rq); cpumask_clear_cpu(rq->cpu, old_rd->span); - if (atomic_dec_and_test(&old_rd->refcount)) - free_rootdomain(old_rd); + /* + * If we dont want to free the old_rt yet then + * set old_rd to NULL to skip the freeing later + * in this function: + */ + if (!atomic_dec_and_test(&old_rd->refcount)) + old_rd = NULL; } atomic_inc(&rd->refcount); @@ -6968,6 +6974,9 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); + + if (old_rd) + free_rootdomain(old_rd); } static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) -- cgit v1.2.3 From b36128c830a8f5bd7d4981f5b0b69950f5928ee6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: alloc_percpu: change percpu_ptr to per_cpu_ptr Impact: cleanup There are two allocated per-cpu accessor macros with almost identical spelling. The original and far more popular is per_cpu_ptr (44 files), so change over the other 4 files. tj: kill percpu_ptr() and update UP too Signed-off-by: Rusty Russell Cc: mingo@redhat.com Cc: lenb@kernel.org Cc: cpufreq@vger.kernel.org Signed-off-by: Tejun Heo --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +- drivers/acpi/processor_perflib.c | 4 ++-- include/linux/percpu.h | 23 +++++++++++------------ kernel/sched.c | 6 +++--- kernel/stop_machine.c | 2 +- 5 files changed, 18 insertions(+), 19 deletions(-) (limited to 'kernel/sched.c') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 4b1c319d30c3..22590cf688ae 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (!data) return -ENOMEM; - data->acpi_data = percpu_ptr(acpi_perf_data, cpu); + data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); per_cpu(drv_data, cpu) = data; if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index 9cc769b587ff..68fd3d292799 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -516,12 +516,12 @@ int acpi_processor_preregister_performance( continue; } - if (!performance || !percpu_ptr(performance, i)) { + if (!performance || !per_cpu_ptr(performance, i)) { retval = -EINVAL; continue; } - pr->performance = percpu_ptr(performance, i); + pr->performance = per_cpu_ptr(performance, i); cpumask_set_cpu(i, pr->performance->shared_cpu_map); if (acpi_processor_get_psd(pr)) { retval = -EINVAL; diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 3577ffd90d45..c80cfe1260ec 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -81,23 +81,13 @@ struct percpu_data { }; #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) -/* - * Use this to get to a 
cpu's version of the per-cpu object dynamically - * allocated. Non-atomic access to the current CPU's version should - * probably be combined with get_cpu()/put_cpu(). - */ -#define percpu_ptr(ptr, cpu) \ -({ \ - struct percpu_data *__p = __percpu_disguise(ptr); \ - (__typeof__(ptr))__p->ptrs[(cpu)]; \ -}) extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask); extern void percpu_free(void *__pdata); #else /* CONFIG_SMP */ -#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) +#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) { @@ -122,6 +112,15 @@ static inline void percpu_free(void *__pdata) cpu_possible_map) #define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type)) #define free_percpu(ptr) percpu_free((ptr)) -#define per_cpu_ptr(ptr, cpu) percpu_ptr((ptr), (cpu)) +/* + * Use this to get to a cpu's version of the per-cpu object dynamically + * allocated. Non-atomic access to the current CPU's version should + * probably be combined with get_cpu()/put_cpu(). + */ +#define per_cpu_ptr(ptr, cpu) \ +({ \ + struct percpu_data *__p = __percpu_disguise(ptr); \ + (__typeof__(ptr))__p->ptrs[(cpu)]; \ +}) #endif /* __LINUX_PERCPU_H */ diff --git a/kernel/sched.c b/kernel/sched.c index fc17fd91ab57..9d30ac956328 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9472,7 +9472,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; #ifndef CONFIG_64BIT @@ -9491,7 +9491,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); #ifndef CONFIG_64BIT /* @@ -9587,7 +9587,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) ca = task_ca(tsk); for (; ca; ca = ca->parent) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 0cd415ee62a2..74541ca49536 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) * doesn't hit this CPU until we're ready. */ get_cpu(); for_each_online_cpu(i) { - sm_work = percpu_ptr(stop_machine_work, i); + sm_work = per_cpu_ptr(stop_machine_work, i); INIT_WORK(sm_work, stop_cpu); queue_work_on(i, stop_machine_wq, sm_work); } -- cgit v1.2.3 From cac64d00c256e65776d575e82aaf540632b66178 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 25 Feb 2009 09:59:26 -0800 Subject: sched_rt: don't start timer when rt bandwidth disabled Impact: fix incorrect condition check No need to start rt bandwidth timer when rt bandwidth is disabled. If this timer starts, it may stop at sched_rt_period_timer() on the first time. 
Signed-off-by: Hiroshi Shimamoto Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 410eec404133..c3baa9653d1d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) -- cgit v1.2.3 From 54e991242850edc8c53f71fa5aa3ba7a93ce38f5 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Fri, 27 Feb 2009 15:13:54 +0530 Subject: sched: don't allow setuid to succeed if the user does not have rt bandwidth Impact: fix hung task with certain (non-default) rt-limit settings Corey Hickey reported that on using setuid to change the uid of a rt process, the process would be unkillable and not be running. This is because there was no rt runtime for that user group. Add in a check to see if a user can attach an rt task to its task group. On failure, return EINVAL, which is also returned in CONFIG_CGROUP_SCHED. Reported-by: Corey Hickey Signed-off-by: Dhaval Giani Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ kernel/sched.c | 13 +++++++++++-- kernel/sys.c | 31 ++++++++++++++++++++----------- kernel/user.c | 18 ++++++++++++++++++ 4 files changed, 53 insertions(+), 13 deletions(-) (limited to 'kernel/sched.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8981e52c714f..8c216e057c94 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2291,9 +2291,13 @@ extern long sched_group_rt_runtime(struct task_group *tg); extern int sched_group_set_rt_period(struct task_group *tg, long rt_period_us); extern long sched_group_rt_period(struct task_group *tg); +extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); #endif #endif +extern int task_can_switch_user(struct user_struct *up, + struct task_struct *tsk); + #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { diff --git a/kernel/sched.c b/kernel/sched.c index c3baa9653d1d..8e2558c2ba67 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9224,6 +9224,16 @@ static int sched_rt_global_constraints(void) return ret; } + +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +{ + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + return 0; + + return 1; +} + #else /* !CONFIG_RT_GROUP_SCHED */ static int sched_rt_global_constraints(void) { @@ -9317,8 +9327,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk) { #ifdef CONFIG_RT_GROUP_SCHED - /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) + if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ diff --git a/kernel/sys.c b/kernel/sys.c index f145c415bc16..37f458e6882a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -559,7 +559,7 @@ error: abort_creds(new); return retval; } - + /* * change the user struct in a credentials set to match the new UID */ @@ -571,6 +571,11 @@ static int set_user(struct cred *new) if (!new_user) return -EAGAIN; + if (!task_can_switch_user(new_user, 
current)) { + free_uid(new_user); + return -EINVAL; + } + if (atomic_read(&new_user->processes) >= current->signal->rlim[RLIMIT_NPROC].rlim_cur && new_user != INIT_USER) { @@ -631,10 +636,11 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) goto error; } - retval = -EAGAIN; - if (new->uid != old->uid && set_user(new) < 0) - goto error; - + if (new->uid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } if (ruid != (uid_t) -1 || (euid != (uid_t) -1 && euid != old->uid)) new->suid = new->euid; @@ -680,9 +686,10 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) retval = -EPERM; if (capable(CAP_SETUID)) { new->suid = new->uid = uid; - if (uid != old->uid && set_user(new) < 0) { - retval = -EAGAIN; - goto error; + if (uid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; } } else if (uid != old->uid && uid != new->suid) { goto error; @@ -734,11 +741,13 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) goto error; } - retval = -EAGAIN; if (ruid != (uid_t) -1) { new->uid = ruid; - if (ruid != old->uid && set_user(new) < 0) - goto error; + if (ruid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } } if (euid != (uid_t) -1) new->euid = euid; diff --git a/kernel/user.c b/kernel/user.c index 3551ac742395..6a9b696128c8 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -362,6 +362,24 @@ static void free_user(struct user_struct *up, unsigned long flags) #endif +#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED) +/* + * We need to check if a setuid can take place. This function should be called + * before successfully completing the setuid. + */ +int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) +{ + + return sched_rt_can_attach(up->tg, tsk); + +} +#else +int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) +{ + return 1; +} +#endif + /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). -- cgit v1.2.3
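
The three [CVE-2009-0029] wrapper series above all apply the same mechanical transformation: a definition of the form "asmlinkage long sys_foo(type arg, ...)" becomes "SYSCALL_DEFINEn(foo, type1, arg1, ...)", so that the exported entry point can take each argument as a plain long and explicitly cast it back to the declared type, protecting 64-bit architectures from 32-bit values that arrive without proper sign extension. What follows is a deliberately simplified, userspace-only sketch of how such a macro can expand; it is not the kernel's actual SYSCALL_DEFINEx machinery from include/linux/syscalls.h, and the demo_* names are invented for this illustration.

/*
 * Simplified, userspace-only sketch of the SYSCALL_DEFINEn idea used in
 * the wrapper conversions above: the externally visible entry point takes
 * every argument as a long and casts it back to the declared type before
 * calling the real body.  Illustration only -- the real kernel macros are
 * considerably more involved, and the demo_* names are invented here.
 */
#include <stdio.h>

#define DEMO_SYSCALL_DEFINE2(name, t1, a1, t2, a2)              \
        static long demo_do_##name(t1 a1, t2 a2);               \
        long demo_sys_##name(long a1, long a2)                  \
        {                                                       \
                /* arguments arrive as longs and are cast back */ \
                return demo_do_##name((t1)a1, (t2)a2);          \
        }                                                       \
        static long demo_do_##name(t1 a1, t2 a2)

/* Rough analogue of SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) */
DEMO_SYSCALL_DEFINE2(kill, int, pid, int, sig)
{
        printf("kill(pid=%d, sig=%d)\n", pid, sig);
        return 0;
}

int main(void)
{
        /* Callers always go through the long-taking wrapper, just as the
         * syscall dispatch code would. */
        return (int)demo_sys_kill(1234L, 9L);
}

In the kernel's real macros the wrapper keeps (or aliases) the original sys_* symbol name, so the syscall tables and in-kernel callers referenced by the patches above stay untouched; only the definition site changes.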