From 1ed1328792ff46e4bb86a3d7f7be2971f4549f6c Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 16 Sep 2015 12:53:17 -0400
Subject: sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem

Note: This commit was originally committed as d59cfc09c32a but got
reverted by 0c986253b939 due to the performance regression from the
percpu_rwsem write down/up operations added to the cgroup task
migration path.  percpu_rwsem changes which alleviate the performance
issue are pending for the v4.4-rc1 merge window.  Re-apply.

The cgroup side of threadgroup locking uses signal_struct->group_rwsem
to synchronize against threadgroup changes.  This per-process rwsem
adds small overhead to the thread creation, exit and exec paths, forces
cgroup code paths to do a lock-verify-unlock-retry dance in a couple of
places and makes it impossible to atomically perform operations across
multiple processes.

This patch replaces signal_struct->group_rwsem with a global
percpu_rwsem, cgroup_threadgroup_rwsem, which is cheaper on the reader
side and contained in cgroups proper.  The conversion is one-to-one.

This does make the writer side heavier and lowers the granularity;
however, cgroup process migration is a fairly cold path, we do want to
optimize thread operations over it, and cgroup migration operations
don't take enough time for the lower granularity to matter.

Signed-off-by: Tejun Heo
Link: http://lkml.kernel.org/g/55F8097A.7000206@de.ibm.com
Cc: Ingo Molnar
Cc: Peter Zijlstra
---
 include/linux/init_task.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux/init_task.h')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e38681f4912d..d0b380ee7d67 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -25,13 +25,6 @@ extern struct files_struct init_files;
 extern struct fs_struct init_fs;
 
-#ifdef CONFIG_CGROUPS
-#define INIT_GROUP_RWSEM(sig) \
-        .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem),
-#else
-#define INIT_GROUP_RWSEM(sig)
-#endif
-
 #ifdef CONFIG_CPUSETS
 #define INIT_CPUSET_SEQ(tsk) \
         .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq),
@@ -64,7 +57,6 @@ extern struct fs_struct init_fs;
         INIT_PREV_CPUTIME(sig) \
         .cred_guard_mutex = \
                 __MUTEX_INITIALIZER(sig.cred_guard_mutex), \
-        INIT_GROUP_RWSEM(sig) \
 }
 
 extern struct nsproxy init_nsproxy;
--
cgit v1.2.3
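
As a rough sketch of the locking pattern the commit above describes -- not the
patch itself -- a global percpu_rwsem puts the cheap per-CPU read lock on the
hot thread-lifecycle paths and the expensive global write lock on the cold
migration path.  The example_* names below are hypothetical;
DEFINE_STATIC_PERCPU_RWSEM() is assumed to be available (older trees declare a
struct percpu_rw_semaphore and call percpu_init_rwsem() instead).

/*
 * Illustrative sketch only: per-CPU read side for threadgroup changes,
 * global write side for cgroup process migration.
 */
#include <linux/percpu-rwsem.h>

static DEFINE_STATIC_PERCPU_RWSEM(example_threadgroup_rwsem);

/* Hot path: fork/exit/exec take the cheap per-CPU read lock. */
static void example_threadgroup_change_begin(void)
{
        percpu_down_read(&example_threadgroup_rwsem);
}

static void example_threadgroup_change_end(void)
{
        percpu_up_read(&example_threadgroup_rwsem);
}

/* Cold path: migration excludes all threadgroup changes at once. */
static void example_migrate_threadgroup(void)
{
        percpu_down_write(&example_threadgroup_rwsem);
        /* ... move every task of the process to the target cgroup ... */
        percpu_up_write(&example_threadgroup_rwsem);
}
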
From d5c373eb5610686162ff50429f63f4c00c554799 Mon Sep 17 00:00:00 2001
From: Jason Low
Date: Wed, 14 Oct 2015 12:07:55 -0700
Subject: posix_cpu_timer: Convert cputimer->running to bool

In the next patch in this series, a new field 'checking_timer' will be
added to 'struct thread_group_cputimer'.  Both this and the existing
'running' integer field are just used as boolean values.  To save space
in the structure, we can make both of these fields booleans.

This is a preparatory patch to convert the existing running integer
field to a boolean.

Suggested-by: George Spelvin
Signed-off-by: Jason Low
Reviewed-by: George Spelvin
Cc: Oleg Nesterov
Cc: Paul E. McKenney
Cc: Frederic Weisbecker
Cc: Davidlohr Bueso
Cc: Steven Rostedt
Cc: hideaki.kimura@hpe.com
Cc: terry.rudd@hpe.com
Cc: scott.norton@hpe.com
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/1444849677-29330-4-git-send-email-jason.low2@hp.com
Signed-off-by: Thomas Gleixner
---
 include/linux/init_task.h      | 2 +-
 include/linux/sched.h          | 6 +++---
 kernel/fork.c                  | 2 +-
 kernel/time/posix-cpu-timers.c | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux/init_task.h')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e38681f4912d..c43b80f3f875 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -59,7 +59,7 @@ extern struct fs_struct init_fs;
         .rlim = INIT_RLIMITS, \
         .cputimer = { \
                 .cputime_atomic = INIT_CPUTIME_ATOMIC, \
-                .running = 0, \
+                .running = false, \
         }, \
         INIT_PREV_CPUTIME(sig) \
         .cred_guard_mutex = \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b7b9501b41af..6c8504ade2ba 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -617,15 +617,15 @@ struct task_cputime_atomic {
 /**
  * struct thread_group_cputimer - thread group interval timer counts
  * @cputime_atomic:	atomic thread group interval timers.
- * @running:		non-zero when there are timers running and
- *			@cputime receives updates.
+ * @running:		true when there are timers running and
+ *			@cputime_atomic receives updates.
  *
  * This structure contains the version of task_cputime, above, that is
  * used for thread group CPU timer calculations.
  */
 struct thread_group_cputimer {
 	struct task_cputime_atomic cputime_atomic;
-	int running;
+	bool running;
 };
 
 #include
diff --git a/kernel/fork.c b/kernel/fork.c
index 2845623fb582..6ac894244d39 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1101,7 +1101,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
 	cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 	if (cpu_limit != RLIM_INFINITY) {
 		sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
-		sig->cputimer.running = 1;
+		sig->cputimer.running = true;
 	}
 
 	/* The timer lists. */
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 6f6e252ec761..2d58153074d9 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -249,7 +249,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 		 * but barriers are not required because update_gt_cputime()
 		 * can handle concurrent updates.
 		 */
-		WRITE_ONCE(cputimer->running, 1);
+		WRITE_ONCE(cputimer->running, true);
 	}
 	sample_cputime_atomic(times, &cputimer->cputime_atomic);
 }
@@ -918,7 +918,7 @@ static inline void stop_process_timers(struct signal_struct *sig)
 	struct thread_group_cputimer *cputimer = &sig->cputimer;
 
 	/* Turn off cputimer->running. This is done without locking. */
-	WRITE_ONCE(cputimer->running, 0);
+	WRITE_ONCE(cputimer->running, false);
 }
 
 static u32 onecputick;
--
cgit v1.2.3
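
A minimal sketch of the lockless flag pattern the patch above relies on -- not
the kernel's exact code.  The writer flips the bool with WRITE_ONCE() and
readers sample it with READ_ONCE(); a stale value only delays the slow path,
it never corrupts state.  The example_* names are hypothetical.

/* Illustrative sketch only, assuming kernel-style READ_ONCE/WRITE_ONCE. */
#include <linux/compiler.h>
#include <linux/types.h>

struct example_cputimer {
        bool running;   /* written without locking, as in the patch */
};

static void example_start(struct example_cputimer *t)
{
        WRITE_ONCE(t->running, true);
}

static void example_stop(struct example_cputimer *t)
{
        /* Turn the timer off.  Done without locking. */
        WRITE_ONCE(t->running, false);
}

static bool example_is_running(struct example_cputimer *t)
{
        return READ_ONCE(t->running);
}
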
From c8d75aa47dd585c9538a8205e9bb9847e12cfb84 Mon Sep 17 00:00:00 2001
From: Jason Low
Date: Wed, 14 Oct 2015 12:07:56 -0700
Subject: posix_cpu_timer: Reduce unnecessary sighand lock contention

It was found while running a database workload on large systems that
significant time was spent trying to acquire the sighand lock.

The issue was that whenever an itimer expired, many threads ended up
simultaneously trying to send the signal.  Most of the time, nothing
happened after acquiring the sighand lock because another thread had
already sent the signal and updated the "next expire" time.  The
fastpath_timer_check() didn't help much since the "next expire" time
was only updated after the threads had exited fastpath_timer_check().

This patch addresses this by having the thread_group_cputimer structure
maintain a boolean to signify when a thread in the group is already
checking for process wide timers, and adds extra logic in the fastpath
to check the boolean.

Signed-off-by: Jason Low
Reviewed-by: Oleg Nesterov
Reviewed-by: George Spelvin
Cc: Paul E. McKenney
Cc: Frederic Weisbecker
Cc: Davidlohr Bueso
Cc: Steven Rostedt
Cc: hideaki.kimura@hpe.com
Cc: terry.rudd@hpe.com
Cc: scott.norton@hpe.com
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/1444849677-29330-5-git-send-email-jason.low2@hp.com
Signed-off-by: Thomas Gleixner
---
 include/linux/init_task.h      |  1 +
 include/linux/sched.h          |  3 +++
 kernel/time/posix-cpu-timers.c | 26 ++++++++++++++++++++++++--
 3 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'include/linux/init_task.h')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index c43b80f3f875..810a34f60424 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -60,6 +60,7 @@ extern struct fs_struct init_fs;
         .cputimer = { \
                 .cputime_atomic = INIT_CPUTIME_ATOMIC, \
                 .running = false, \
+                .checking_timer = false, \
         }, \
         INIT_PREV_CPUTIME(sig) \
         .cred_guard_mutex = \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6c8504ade2ba..f87559df5b75 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -619,6 +619,8 @@ struct task_cputime_atomic {
  * @cputime_atomic:	atomic thread group interval timers.
  * @running:		true when there are timers running and
  *			@cputime_atomic receives updates.
+ * @checking_timer:	true when a thread in the group is in the
+ *			process of checking for thread group timers.
  *
  * This structure contains the version of task_cputime, above, that is
  * used for thread group CPU timer calculations.
@@ -626,6 +628,7 @@ struct task_cputime_atomic {
 struct thread_group_cputimer {
 	struct task_cputime_atomic cputime_atomic;
 	bool running;
+	bool checking_timer;
 };
 
 #include
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 2d58153074d9..f5e86d282d52 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -975,6 +975,12 @@ static void check_process_timers(struct task_struct *tsk,
 	if (!READ_ONCE(tsk->signal->cputimer.running))
 		return;
 
+	/*
+	 * Signify that a thread is checking for process timers.
+	 * Write access to this field is protected by the sighand lock.
+	 */
+	sig->cputimer.checking_timer = true;
+
 	/*
 	 * Collect the current process totals.
 	 */
@@ -1029,6 +1035,8 @@ static void check_process_timers(struct task_struct *tsk,
 		sig->cputime_expires.sched_exp = sched_expires;
 	if (task_cputime_zero(&sig->cputime_expires))
 		stop_process_timers(sig);
+
+	sig->cputimer.checking_timer = false;
 }
 
 /*
@@ -1142,8 +1150,22 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	}
 
 	sig = tsk->signal;
-	/* Check if cputimer is running. This is accessed without locking. */
-	if (READ_ONCE(sig->cputimer.running)) {
+	/*
+	 * Check if thread group timers expired when the cputimer is
+	 * running and no other thread in the group is already checking
+	 * for thread group cputimers.  These fields are read without the
+	 * sighand lock.  However, this is fine because this is meant to
+	 * be a fastpath heuristic to determine whether we should try to
+	 * acquire the sighand lock to check/handle timers.
+	 *
+	 * In the worst case scenario, if 'running' or 'checking_timer' gets
+	 * set but the current thread doesn't see the change yet, we'll wait
+	 * until the next thread in the group gets a scheduler interrupt to
+	 * handle the timer.  This isn't an issue in practice because these
+	 * types of delays with signals actually getting sent are expected.
+	 */
+	if (READ_ONCE(sig->cputimer.running) &&
+	    !READ_ONCE(sig->cputimer.checking_timer)) {
 		struct task_cputime group_sample;
 
 		sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
--
cgit v1.2.3
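
A condensed sketch of how the two flags in the patch above interact -- names
and structure are hypothetical, not the kernel's exact code.  Only one thread
at a time ends up doing the expensive process-wide check; the others see
checking_timer == true in the fastpath and skip taking the sighand lock.

/* Illustrative sketch only, assuming kernel-style READ_ONCE/WRITE_ONCE. */
#include <linux/compiler.h>
#include <linux/types.h>

struct example_group_timer {
        bool running;           /* any process-wide timers armed?      */
        bool checking_timer;    /* a thread is already checking them   */
};

/* Slow path, called with the (hypothetical) sighand lock held. */
static void example_check_process_timers(struct example_group_timer *t)
{
        if (!READ_ONCE(t->running))
                return;

        t->checking_timer = true;       /* writes protected by sighand lock */
        /* ... walk the process-wide timers, rearm or fire them ... */
        t->checking_timer = false;
}

/* Lockless fastpath heuristic run from the scheduler tick. */
static bool example_fastpath_timer_check(struct example_group_timer *t)
{
        /*
         * Racy by design: a missed update only postpones the check to the
         * next tick taken by some thread in the group.
         */
        return READ_ONCE(t->running) && !READ_ONCE(t->checking_timer);
}
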