From 9c44bc03fff44ff04237a7d92e35304a0e50c331 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:04 +0200 Subject: softlockup: allow panic on lockup allow users to configure the softlockup detector to generate a panic instead of a warning message. high-availability systems might opt for this strict method (combined with panic_timeout= boot option/sysctl), instead of generating softlockup warnings ad infinitum. also, automated tests work better if the system reboots reliably (into a safe kernel) in case of a lockup. The full spectrum of configurability is supported: boot option, sysctl option and Kconfig option. it's default-disabled. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5395a6176f4b..71f5972dc48e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -294,7 +294,8 @@ extern void softlockup_tick(void); extern void spawn_softlockup_task(void); extern void touch_softlockup_watchdog(void); extern void touch_all_softlockup_watchdogs(void); -extern unsigned long softlockup_thresh; +extern unsigned int softlockup_panic; +extern unsigned long softlockup_thresh; extern unsigned long sysctl_hung_task_check_count; extern unsigned long sysctl_hung_task_timeout_secs; extern unsigned long sysctl_hung_task_warnings; -- cgit v1.2.3 From 9383d9679056e6cc4e7ff70f31da945a268238f4 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Mon, 12 May 2008 21:21:14 +0200 Subject: softlockup: fix softlockup_thresh unaligned access and disable detection at runtime Fix unaligned access errors when setting softlockup_thresh on 64 bit platforms. Allow softlockup detection to be disabled by setting softlockup_thresh <= 0. Detect that boot time softlockup detection has been disabled earlier in softlockup_tick. Signed-off-by: Dimitri Sivanich Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 2 +- kernel/softlockup.c | 12 ++++++++++-- kernel/sysctl.c | 9 +++++---- 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 71f5972dc48e..ea26221644e2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -295,10 +295,10 @@ extern void spawn_softlockup_task(void); extern void touch_softlockup_watchdog(void); extern void touch_all_softlockup_watchdogs(void); extern unsigned int softlockup_panic; -extern unsigned long softlockup_thresh; extern unsigned long sysctl_hung_task_check_count; extern unsigned long sysctl_hung_task_timeout_secs; extern unsigned long sysctl_hung_task_warnings; +extern int softlockup_thresh; #else static inline void softlockup_tick(void) { diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 78e0ad21cb0c..a3a0b239b7f7 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -25,7 +25,7 @@ static DEFINE_PER_CPU(unsigned long, print_timestamp); static DEFINE_PER_CPU(struct task_struct *, watchdog_task); static int __read_mostly did_panic; -unsigned long __read_mostly softlockup_thresh = 60; +int __read_mostly softlockup_thresh = 60; /* * Should we panic (and reboot, if panic_timeout= is set) when a @@ -94,6 +94,14 @@ void softlockup_tick(void) struct pt_regs *regs = get_irq_regs(); unsigned long now; + /* Is detection switched off? */ + if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) { + /* Be sure we don't false trigger if switched back on */ + if (touch_timestamp) + per_cpu(touch_timestamp, this_cpu) = 0; + return; + } + if (touch_timestamp == 0) { touch_softlockup_watchdog(); return; @@ -104,7 +112,7 @@ void softlockup_tick(void) /* report at most once a second */ if ((print_timestamp >= touch_timestamp && print_timestamp < (touch_timestamp + 1)) || - did_panic || !per_cpu(watchdog_task, this_cpu)) { + did_panic) { return; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2d3b388c402d..31c19a79738d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -84,12 +84,13 @@ extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; /* Constants used for minimum and maximum */ -#if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM) +#ifdef CONFIG_HIGHMEM static int one = 1; #endif #ifdef CONFIG_DETECT_SOFTLOCKUP static int sixty = 60; +static int neg_one = -1; #endif #ifdef CONFIG_MMU @@ -742,11 +743,11 @@ static struct ctl_table kern_table[] = { .ctl_name = CTL_UNNUMBERED, .procname = "softlockup_thresh", .data = &softlockup_thresh, - .maxlen = sizeof(unsigned long), + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_dointvec_minmax, .strategy = &sysctl_intvec, - .extra1 = &one, + .extra1 = &neg_one, .extra2 = &sixty, }, { -- cgit v1.2.3 From 1b427c153a08fdbc092c2bdbf845b92fda58d857 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 18 Jul 2008 14:01:39 +0200 Subject: sched: fix build error, provide partition_sched_domains() unconditionally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit provide an empty partition_sched_domains() definition for the UP case: include/linux/cpuset.h: In function ‘rebuild_sched_domains': include/linux/cpuset.h:163: error: implicit declaration of function ‘partition_sched_domains' Signed-off-by: Ingo Molnar --- include/linux/sched.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1941d8b5cf11..26da921530fe 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -824,7 +824,16 @@ extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, struct sched_domain_attr *dattr_new); extern int arch_reinit_sched_domains(void); -#endif /* CONFIG_SMP */ +#else /* CONFIG_SMP */ + +struct sched_domain_attr; + +static inline void +partition_sched_domains(int ndoms_new, cpumask_t *doms_new, + struct sched_domain_attr *dattr_new) +{ +} +#endif /* !CONFIG_SMP */ struct io_context; /* See blkdev.h */ #define NGROUPS_SMALL 32 -- cgit v1.2.3 From 8b05c7e6e159d2f33c9275281b8b909a89eb7c5d Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 23 Jul 2008 21:26:53 -0700 Subject: add a helper function to test if an object is on the stack lib/debugobjects.c has a function to test if an object is on the stack. The block layer and ide needs it (they need to avoid DMA from/to stack buffers). This patch moves the function to include/linux/sched.h so that everyone can use it. lib/debugobjects.c uses current->stack but this patch uses a task_stack_page() accessor, which is a preferable way to access the stack. Signed-off-by: FUJITA Tomonori Cc: Christoph Lameter Cc: Andy Whitcroft Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 7 +++++++ lib/debugobjects.c | 4 +--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index dc7e592c473a..6aca4a16e377 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1983,6 +1983,13 @@ static inline unsigned long *end_of_stack(struct task_struct *p) #endif +static inline int object_is_on_stack(void *obj) +{ + void *stack = task_stack_page(current); + + return (obj >= stack) && (obj < (stack + THREAD_SIZE)); +} + extern void thread_info_cache_init(void); /* set thread flags in other task's structures diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 85b18d79be89..f86196390cfd 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -226,15 +226,13 @@ debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state), static void debug_object_is_on_stack(void *addr, int onstack) { - void *stack = current->stack; int is_on_stack; static int limit; if (limit > 4) return; - is_on_stack = (addr >= stack && addr < (stack + THREAD_SIZE)); - + is_on_stack = object_is_on_stack(addr); if (is_on_stack == onstack) return; -- cgit v1.2.3 From 364d3c13c17f45da6d638011078d4c4d3070d719 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:36 -0700 Subject: ptrace: give more respect to SIGKILL ptrace_stop() has some complicated checks to prevent the scheduling in the TASK_TRACED state with the pending SIGKILL, but these checks are racy, and they depend on arch_ptrace_stop_needed(). This patch assumes that the traced task should die asap if it was killed by SIGKILL, in that case schedule()->signal_pending_state() has no reason to ignore the TASK_WAKEKILL part of TASK_TRACED, and we can kill this nasty special case. Note: do_exit()->ptrace_notify() is special, the killed task can already dequeue SIGKILL at this point. Another indication that fatal_signal_pending() is not exactly right. Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6aca4a16e377..79e749dbf81e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2054,9 +2054,6 @@ static inline int signal_pending_state(long state, struct task_struct *p) if (!signal_pending(p)) return 0; - if (state & (__TASK_STOPPED | __TASK_TRACED)) - return 0; - return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } -- cgit v1.2.3 From 7b34e4283c685f5cc6ba6d30e939906eee0d4bcf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:37 -0700 Subject: introduce PF_KTHREAD flag Introduce the new PF_KTHREAD flag to mark the kernel threads. It is set by INIT_TASK() and copied to the forked childs (we could set it in kthreadd() along with PF_NOFREEZE instead). daemonize() was changed as well. In that case testing of PF_KTHREAD is racy, but daemonize() is hopeless anyway. This flag is cleared in do_execve(), before search_binary_handler(). Probably not the best place, we can do this in exec_mmap() or in start_thread(), or clear it along with PF_FORKNOEXEC. But I think this doesn't matter in practice, and if do_execve() fails kthread should die soon. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 + include/linux/init_task.h | 2 +- include/linux/sched.h | 1 + kernel/exit.c | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/fs/exec.c b/fs/exec.c index af249af4ccab..cd2e8c9b1249 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1326,6 +1326,7 @@ int do_execve(char * filename, if (retval < 0) goto out; + current->flags &= ~PF_KTHREAD; retval = search_binary_handler(bprm,regs); if (retval >= 0) { /* execve success */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 93c45acf249a..021d8e720c79 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -122,7 +122,7 @@ extern struct group_info init_groups; .state = 0, \ .stack = &init_thread_info, \ .usage = ATOMIC_INIT(2), \ - .flags = 0, \ + .flags = PF_KTHREAD, \ .lock_depth = -1, \ .prio = MAX_PRIO-20, \ .static_prio = MAX_PRIO-20, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 79e749dbf81e..eec64a4adb9d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1483,6 +1483,7 @@ static inline void put_task_struct(struct task_struct *t) #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ +#define PF_KTHREAD 0x00000020 /* I am a kernel thread */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ diff --git a/kernel/exit.c b/kernel/exit.c index a7799d8a6404..28a44a2612dc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -430,7 +430,7 @@ void daemonize(const char *name, ...) * We don't want to have TIF_FREEZE set if the system-wide hibernation * or suspend transition begins right now. */ - current->flags |= PF_NOFREEZE; + current->flags |= (PF_NOFREEZE | PF_KTHREAD); if (current->nsproxy != &init_nsproxy) { get_nsproxy(&init_nsproxy); -- cgit v1.2.3 From 246bb0b1deb29726990620d8b5e55ca29f331362 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:38 -0700 Subject: kill PF_BORROWED_MM in favour of PF_KTHREAD Kill PF_BORROWED_MM. Change use_mm/unuse_mm to not play with ->flags, and do s/PF_BORROWED_MM/PF_KTHREAD/ for a couple of other users. No functional changes yet. But this allows us to do further fixes/cleanups. oom_kill/ptrace/etc often check "p->mm != NULL" to filter out the kthreads, this is wrong because of use_mm(). The problem with PF_BORROWED_MM is that we need task_lock() to avoid races. With this patch we can check PF_KTHREAD directly, or use a simple lockless helper: /* The result must not be dereferenced !!! */ struct mm_struct *__get_task_mm(struct task_struct *tsk) { if (tsk->flags & PF_KTHREAD) return NULL; return tsk->mm; } Note also ecard_task(). It runs with ->mm != NULL, but it's the kernel thread without PF_BORROWED_MM. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 2 -- include/linux/sched.h | 3 +-- kernel/fork.c | 4 ++-- 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux/sched.h') diff --git a/fs/aio.c b/fs/aio.c index 0fb3117ddd93..0051fd94b44e 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -586,7 +586,6 @@ static void use_mm(struct mm_struct *mm) struct task_struct *tsk = current; task_lock(tsk); - tsk->flags |= PF_BORROWED_MM; active_mm = tsk->active_mm; atomic_inc(&mm->mm_count); tsk->mm = mm; @@ -610,7 +609,6 @@ static void unuse_mm(struct mm_struct *mm) struct task_struct *tsk = current; task_lock(tsk); - tsk->flags &= ~PF_BORROWED_MM; tsk->mm = NULL; /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); diff --git a/include/linux/sched.h b/include/linux/sched.h index eec64a4adb9d..0560999eb1db 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1483,7 +1483,6 @@ static inline void put_task_struct(struct task_struct *t) #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ -#define PF_KTHREAD 0x00000020 /* I am a kernel thread */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ @@ -1497,7 +1496,7 @@ static inline void put_task_struct(struct task_struct *t) #define PF_KSWAPD 0x00040000 /* I am kswapd */ #define PF_SWAPOFF 0x00080000 /* I am in swapoff */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ -#define PF_BORROWED_MM 0x00200000 /* I am a kthread doing use_mm */ +#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ diff --git a/kernel/fork.c b/kernel/fork.c index 228f80c9155a..eeaec6893b0d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -474,7 +474,7 @@ EXPORT_SYMBOL_GPL(mmput); /** * get_task_mm - acquire a reference to the task's mm * - * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning + * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning * this kernel workthread has transiently adopted a user mm with use_mm, * to do its AIO) is not set and if so returns a reference to it, after * bumping up the use count. User must release the mm via mmput() @@ -487,7 +487,7 @@ struct mm_struct *get_task_mm(struct task_struct *task) task_lock(task); mm = task->mm; if (mm) { - if (task->flags & PF_BORROWED_MM) + if (task->flags & PF_KTHREAD) mm = NULL; else atomic_inc(&mm->mm_users); -- cgit v1.2.3 From 19b0cfcca41dd772065671ad0584e1cea0f3fd13 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jul 2008 01:48:35 -0700 Subject: pidns: remove now unused kill_proc function This function operated on a pid_t to kill a task, which is no longer valid in a containerized system. It has finally lost all its users and we can safely remove it from the tree. Signed-off-by: Pavel Emelyanov Cc: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - kernel/signal.c | 12 ------------ 2 files changed, 13 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0560999eb1db..134cb5cb506c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1800,7 +1800,6 @@ extern void force_sig(int, struct task_struct *); extern void force_sig_specific(int, struct task_struct *); extern int send_sig(int, struct task_struct *, int); extern void zap_other_threads(struct task_struct *p); -extern int kill_proc(pid_t, int, int); extern struct sigqueue *sigqueue_alloc(void); extern void sigqueue_free(struct sigqueue *); extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group); diff --git a/kernel/signal.c b/kernel/signal.c index 5c7b7eaa0dc6..82c3545596c5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1228,17 +1228,6 @@ int kill_pid(struct pid *pid, int sig, int priv) } EXPORT_SYMBOL(kill_pid); -int -kill_proc(pid_t pid, int sig, int priv) -{ - int ret; - - rcu_read_lock(); - ret = kill_pid_info(sig, __si_special(priv), find_pid(pid)); - rcu_read_unlock(); - return ret; -} - /* * These functions support sending signals using preallocated sigqueue * structures. This is needed "because realtime applications cannot @@ -1906,7 +1895,6 @@ EXPORT_SYMBOL(recalc_sigpending); EXPORT_SYMBOL_GPL(dequeue_signal); EXPORT_SYMBOL(flush_signals); EXPORT_SYMBOL(force_sig); -EXPORT_SYMBOL(kill_proc); EXPORT_SYMBOL(ptrace_notify); EXPORT_SYMBOL(send_sig); EXPORT_SYMBOL(send_sig_info); -- cgit v1.2.3 From e49859e71e0318b564de1546bdc30fab738f9deb Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jul 2008 01:48:36 -0700 Subject: pidns: remove now unused find_pid function. This one had the only users so far - the kill_proc, which is removed, so drop this (invalid in namespaced world) call too. And of course - erase all references on it from comments. Signed-off-by: Pavel Emelyanov Cc: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pid.h | 4 +--- include/linux/sched.h | 2 +- kernel/pid.c | 8 +------- 3 files changed, 3 insertions(+), 11 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/pid.h b/include/linux/pid.h index 6f084b9e2c40..ff1b2a5814d4 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -48,7 +48,7 @@ enum pid_type */ struct upid { - /* Try to keep pid_chain in the same cacheline as nr for find_pid */ + /* Try to keep pid_chain in the same cacheline as nr for find_vpid */ int nr; struct pid_namespace *ns; struct hlist_node pid_chain; @@ -105,14 +105,12 @@ extern struct pid_namespace init_pid_ns; * or rcu_read_lock() held. * * find_pid_ns() finds the pid in the namespace specified - * find_pid() find the pid by its global id, i.e. in the init namespace * find_vpid() finr the pid by its virtual id, i.e. in the current namespace * * see also find_task_by_pid() set in include/linux/sched.h */ extern struct pid *find_pid_ns(int nr, struct pid_namespace *ns); extern struct pid *find_vpid(int nr); -extern struct pid *find_pid(int nr); /* * Lookup a PID in the hash table, and return with it's count elevated. diff --git a/include/linux/sched.h b/include/linux/sched.h index 134cb5cb506c..182da1550fad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1718,7 +1718,7 @@ extern struct pid_namespace init_pid_ns; * find_task_by_pid(): * finds a task by its global pid * - * see also find_pid() etc in include/linux/pid.h + * see also find_vpid() etc in include/linux/pid.h */ extern struct task_struct *find_task_by_pid_type_ns(int type, int pid, diff --git a/kernel/pid.c b/kernel/pid.c index 753fd90d9ec1..064e76afa507 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -309,12 +309,6 @@ struct pid *find_vpid(int nr) } EXPORT_SYMBOL_GPL(find_vpid); -struct pid *find_pid(int nr) -{ - return find_pid_ns(nr, &init_pid_ns); -} -EXPORT_SYMBOL_GPL(find_pid); - /* * attach_pid() must be called with the tasklist_lock write-held. */ @@ -483,7 +477,7 @@ EXPORT_SYMBOL(task_session_nr_ns); /* * Used by proc to find the first pid that is greater then or equal to nr. * - * If there is a pid at nr this function is exactly the same as find_pid. + * If there is a pid at nr this function is exactly the same as find_pid_ns. */ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) { -- cgit v1.2.3 From dbda0de52618d13d1b927c7ba7bb839cfddc4e8c Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jul 2008 01:48:37 -0700 Subject: pidns: remove find_task_by_pid, unused for a long time It seems to me that it was a mistake marking this function as deprecated and scheduling it for removal, rather than resolutely removing it after the last caller's death. Anyway - better late, then never. Signed-off-by: Pavel Emelyanov Cc: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/feature-removal-schedule.txt | 18 ------------------ include/linux/pid.h | 2 +- include/linux/sched.h | 6 ------ 3 files changed, 1 insertion(+), 25 deletions(-) (limited to 'include/linux/sched.h') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 09c4a1efb8e3..721c71b86e06 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -138,24 +138,6 @@ Who: Kay Sievers --------------------------- -What: find_task_by_pid -When: 2.6.26 -Why: With pid namespaces, calling this funciton will return the - wrong task when called from inside a namespace. - - The best way to save a task pid and find a task by this - pid later, is to find this task's struct pid pointer (or get - it directly from the task) and call pid_task() later. - - If someone really needs to get a task by its pid_t, then - he most likely needs the find_task_by_vpid() to get the - task from the same namespace as the current task is in, but - this may be not so in general. - -Who: Pavel Emelyanov - ---------------------------- - What: ACPI procfs interface When: July 2008 Why: ACPI sysfs conversion should be finished by January 2008. diff --git a/include/linux/pid.h b/include/linux/pid.h index ff1b2a5814d4..22921ac4cfd9 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -107,7 +107,7 @@ extern struct pid_namespace init_pid_ns; * find_pid_ns() finds the pid in the namespace specified * find_vpid() finr the pid by its virtual id, i.e. in the current namespace * - * see also find_task_by_pid() set in include/linux/sched.h + * see also find_task_by_vpid() set in include/linux/sched.h */ extern struct pid *find_pid_ns(int nr, struct pid_namespace *ns); extern struct pid *find_vpid(int nr); diff --git a/include/linux/sched.h b/include/linux/sched.h index 182da1550fad..354ef478a80d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1715,8 +1715,6 @@ extern struct pid_namespace init_pid_ns; * finds a task by its pid in the specified namespace * find_task_by_vpid(): * finds a task by its virtual pid - * find_task_by_pid(): - * finds a task by its global pid * * see also find_vpid() etc in include/linux/pid.h */ @@ -1724,10 +1722,6 @@ extern struct pid_namespace init_pid_ns; extern struct task_struct *find_task_by_pid_type_ns(int type, int pid, struct pid_namespace *ns); -static inline struct task_struct *__deprecated find_task_by_pid(pid_t nr) -{ - return find_task_by_pid_type_ns(PIDTYPE_PID, nr, &init_pid_ns); -} extern struct task_struct *find_task_by_vpid(pid_t nr); extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); -- cgit v1.2.3 From 49b5cf34727a6c1be1568ab28e89a2d9a6bf51e0 Mon Sep 17 00:00:00 2001 From: Jonathan Lim Date: Fri, 25 Jul 2008 01:48:40 -0700 Subject: accounting: account for user time when updating memory integrals Adapt acct_update_integrals() to include user time when calculating the time difference. The units of acct_rss_mem1 and acct_vm_mem1 are also changed from pages-jiffies to pages-usecs to avoid calling jiffies_to_usecs() in xacct_add_tsk() which might overflow. Signed-off-by: Jonathan Lim Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- kernel/sched.c | 2 ++ kernel/tsacct.c | 21 ++++++++++++++------- 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 354ef478a80d..af780f299c7c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1257,7 +1257,7 @@ struct task_struct { #if defined(CONFIG_TASK_XACCT) u64 acct_rss_mem1; /* accumulated rss usage */ u64 acct_vm_mem1; /* accumulated virtual memory usage */ - cputime_t acct_stimexpd;/* stime since last update */ + cputime_t acct_timexpd; /* stime + utime since last update */ #endif #ifdef CONFIG_CPUSETS nodemask_t mems_allowed; diff --git a/kernel/sched.c b/kernel/sched.c index 6acf749d3336..0047bd9b96aa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4046,6 +4046,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime) cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); + /* Account for user time used */ + acct_update_integrals(p); } /* diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 4ab1b584961b..1da6990af8e0 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -84,9 +84,9 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) { struct mm_struct *mm; - /* convert pages-jiffies to Mbyte-usec */ - stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB; - stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB; + /* convert pages-usec to Mbyte-usec */ + stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; + stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ @@ -118,12 +118,19 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) void acct_update_integrals(struct task_struct *tsk) { if (likely(tsk->mm)) { - long delta = cputime_to_jiffies( - cputime_sub(tsk->stime, tsk->acct_stimexpd)); + cputime_t time, dtime; + struct timeval value; + u64 delta; + + time = tsk->stime + tsk->utime; + dtime = cputime_sub(time, tsk->acct_timexpd); + jiffies_to_timeval(cputime_to_jiffies(dtime), &value); + delta = value.tv_sec; + delta = delta * USEC_PER_SEC + value.tv_usec; if (delta == 0) return; - tsk->acct_stimexpd = tsk->stime; + tsk->acct_timexpd = time; tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; } @@ -135,7 +142,7 @@ void acct_update_integrals(struct task_struct *tsk) */ void acct_clear_integrals(struct task_struct *tsk) { - tsk->acct_stimexpd = 0; + tsk->acct_timexpd = 0; tsk->acct_rss_mem1 = 0; tsk->acct_vm_mem1 = 0; } -- cgit v1.2.3 From 297c5d92634c809cef23d73e7b2556f2528ff7e2 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 25 Jul 2008 01:48:49 -0700 Subject: task IO accounting: provide distinct tgid/tid I/O statistics Report per-thread I/O statistics in /proc/pid/task/tid/io and aggregate parent I/O statistics in /proc/pid/io. This approach follows the same model used to account per-process and per-thread CPU times. As a practial application, this allows for example to quickly find the top I/O consumer when a process spawns many child threads that perform the actual I/O work, because the aggregated I/O statistics can always be found in /proc/pid/io. [ Oleg Nesterov points out that we should check that the task is still alive before we iterate over the threads, but also says that we can do that fixup on top of this later. - Linus ] Acked-by: Balbir Singh Signed-off-by: Andrea Righi Cc: Matt Heaton Cc: Shailabh Nagar Acked-by-with-comments: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 86 ++++++++++++++++++++++++++++++++++++++++++--------- include/linux/sched.h | 4 +++ kernel/exit.c | 27 ++++++++++++++++ kernel/fork.c | 6 ++++ 4 files changed, 108 insertions(+), 15 deletions(-) (limited to 'include/linux/sched.h') diff --git a/fs/proc/base.c b/fs/proc/base.c index 58c3e6a8e15e..a891fe4cb43b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2376,29 +2376,82 @@ static int proc_base_fill_cache(struct file *filp, void *dirent, } #ifdef CONFIG_TASK_IO_ACCOUNTING -static int proc_pid_io_accounting(struct task_struct *task, char *buffer) -{ +static int do_io_accounting(struct task_struct *task, char *buffer, int whole) +{ + u64 rchar, wchar, syscr, syscw; + struct task_io_accounting ioac; + + if (!whole) { + rchar = task->rchar; + wchar = task->wchar; + syscr = task->syscr; + syscw = task->syscw; + memcpy(&ioac, &task->ioac, sizeof(ioac)); + } else { + unsigned long flags; + struct task_struct *t = task; + rchar = wchar = syscr = syscw = 0; + memset(&ioac, 0, sizeof(ioac)); + + rcu_read_lock(); + do { + rchar += t->rchar; + wchar += t->wchar; + syscr += t->syscr; + syscw += t->syscw; + + ioac.read_bytes += t->ioac.read_bytes; + ioac.write_bytes += t->ioac.write_bytes; + ioac.cancelled_write_bytes += + t->ioac.cancelled_write_bytes; + t = next_thread(t); + } while (t != task); + rcu_read_unlock(); + + if (lock_task_sighand(task, &flags)) { + struct signal_struct *sig = task->signal; + + rchar += sig->rchar; + wchar += sig->wchar; + syscr += sig->syscr; + syscw += sig->syscw; + + ioac.read_bytes += sig->ioac.read_bytes; + ioac.write_bytes += sig->ioac.write_bytes; + ioac.cancelled_write_bytes += + sig->ioac.cancelled_write_bytes; + + unlock_task_sighand(task, &flags); + } + } + return sprintf(buffer, -#ifdef CONFIG_TASK_XACCT "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" "syscw: %llu\n" -#endif "read_bytes: %llu\n" "write_bytes: %llu\n" "cancelled_write_bytes: %llu\n", -#ifdef CONFIG_TASK_XACCT - (unsigned long long)task->rchar, - (unsigned long long)task->wchar, - (unsigned long long)task->syscr, - (unsigned long long)task->syscw, -#endif - (unsigned long long)task->ioac.read_bytes, - (unsigned long long)task->ioac.write_bytes, - (unsigned long long)task->ioac.cancelled_write_bytes); + (unsigned long long)rchar, + (unsigned long long)wchar, + (unsigned long long)syscr, + (unsigned long long)syscw, + (unsigned long long)ioac.read_bytes, + (unsigned long long)ioac.write_bytes, + (unsigned long long)ioac.cancelled_write_bytes); +} + +static int proc_tid_io_accounting(struct task_struct *task, char *buffer) +{ + return do_io_accounting(task, buffer, 0); } -#endif + +static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) +{ + return do_io_accounting(task, buffer, 1); +} +#endif /* CONFIG_TASK_IO_ACCOUNTING */ /* * Thread groups @@ -2470,7 +2523,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, pid_io_accounting), + INF("io", S_IRUGO, tgid_io_accounting), #endif }; @@ -2797,6 +2850,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), #endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + INF("io", S_IRUGO, tid_io_accounting), +#endif }; static int proc_tid_base_readdir(struct file * filp, diff --git a/include/linux/sched.h b/include/linux/sched.h index af780f299c7c..d22ffe06d0eb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -506,6 +506,10 @@ struct signal_struct { unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; +#ifdef CONFIG_TASK_XACCT + u64 rchar, wchar, syscr, syscw; +#endif + struct task_io_accounting ioac; /* * Cumulative ns of scheduled CPU time for dead threads in the diff --git a/kernel/exit.c b/kernel/exit.c index 8a4d4d12e294..ad933bb29ec7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -120,6 +120,18 @@ static void __exit_signal(struct task_struct *tsk) sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); +#ifdef CONFIG_TASK_XACCT + sig->rchar += tsk->rchar; + sig->wchar += tsk->wchar; + sig->syscr += tsk->syscr; + sig->syscw += tsk->syscw; +#endif /* CONFIG_TASK_XACCT */ +#ifdef CONFIG_TASK_IO_ACCOUNTING + sig->ioac.read_bytes += tsk->ioac.read_bytes; + sig->ioac.write_bytes += tsk->ioac.write_bytes; + sig->ioac.cancelled_write_bytes += + tsk->ioac.cancelled_write_bytes; +#endif /* CONFIG_TASK_IO_ACCOUNTING */ sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -1366,6 +1378,21 @@ static int wait_task_zombie(struct task_struct *p, int options, psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; +#ifdef CONFIG_TASK_XACCT + psig->rchar += p->rchar + sig->rchar; + psig->wchar += p->wchar + sig->wchar; + psig->syscr += p->syscr + sig->syscr; + psig->syscw += p->syscw + sig->syscw; +#endif /* CONFIG_TASK_XACCT */ +#ifdef CONFIG_TASK_IO_ACCOUNTING + psig->ioac.read_bytes += + p->ioac.read_bytes + sig->ioac.read_bytes; + psig->ioac.write_bytes += + p->ioac.write_bytes + sig->ioac.write_bytes; + psig->ioac.cancelled_write_bytes += + p->ioac.cancelled_write_bytes + + sig->ioac.cancelled_write_bytes; +#endif /* CONFIG_TASK_IO_ACCOUNTING */ spin_unlock_irq(&p->parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 813d5c89b9d5..b99d73e971a4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -812,6 +812,12 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; +#ifdef CONFIG_TASK_XACCT + sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0; +#endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + memset(&sig->ioac, 0, sizeof(sig->ioac)); +#endif sig->sum_sched_runtime = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); -- cgit v1.2.3 From 873b47717732c2f33a4b14de02571a4295a02f0c Mon Sep 17 00:00:00 2001 From: Keika Kobayashi Date: Fri, 25 Jul 2008 01:48:52 -0700 Subject: per-task-delay-accounting: add memory reclaim delay Sometimes, application responses become bad under heavy memory load. Applications take a bit time to reclaim memory. The statistics, how long memory reclaim takes, will be useful to measure memory usage. This patch adds accounting memory reclaim to per-task-delay-accounting for accounting the time of do_try_to_free_pages(). - When System is under low memory load, memory reclaim may not occur. $ free total used free shared buffers cached Mem: 8197800 1577300 6620500 0 4808 1516724 -/+ buffers/cache: 55768 8142032 Swap: 16386292 0 16386292 $ vmstat 1 procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu---- r b swpd free buff cache si so bi bo in cs us sy id wa 0 0 0 5069748 10612 3014060 0 0 0 0 3 26 0 0 100 0 0 0 0 5069748 10612 3014060 0 0 0 0 4 22 0 0 100 0 0 0 0 5069748 10612 3014060 0 0 0 0 3 18 0 0 100 0 Measure the time of tar command. $ ls -s test.dat 1501472 test.dat $ time tar cvf test.tar test.dat real 0m13.388s user 0m0.116s sys 0m5.304s $ ./delayget -d -p CPU count real total virtual total delay total 428 5528345500 5477116080 62749891 IO count delay total 338 8078977189 SWAP count delay total 0 0 RECLAIM count delay total 0 0 - When system is under heavy memory load memory reclaim may occur. $ vmstat 1 procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu---- r b swpd free buff cache si so bi bo in cs us sy id wa 0 0 7159032 49724 1812 3012 0 0 0 0 3 24 0 0 100 0 0 0 7159032 49724 1812 3012 0 0 0 0 4 24 0 0 100 0 0 0 7159032 49848 1812 3012 0 0 0 0 3 22 0 0 100 0 In this case, one process uses more 8G memory by execution of malloc() and memset(). $ time tar cvf test.tar test.dat real 1m38.563s <- increased by 85 sec user 0m0.140s sys 0m7.060s $ ./delayget -d -p CPU count real total virtual total delay total 9021 7140446250 7315277975 923201824 IO count delay total 8965 90466349669 SWAP count delay total 3 21036367 RECLAIM count delay total 740 61011951153 In the later case, the value of RECLAIM is increasing. So, taskstats can show how much memory reclaim influences TAT. Signed-off-by: Keika Kobayashi Acked-by: Balbir Singh Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 19 +++++++++++++++++++ include/linux/sched.h | 4 ++++ kernel/delayacct.c | 13 +++++++++++++ mm/vmscan.c | 5 +++++ 4 files changed, 41 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index ab94bc083558..f352f06fa063 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -39,6 +39,8 @@ extern void __delayacct_blkio_start(void); extern void __delayacct_blkio_end(void); extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); extern __u64 __delayacct_blkio_ticks(struct task_struct *); +extern void __delayacct_freepages_start(void); +extern void __delayacct_freepages_end(void); static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) { @@ -107,6 +109,18 @@ static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) return 0; } +static inline void delayacct_freepages_start(void) +{ + if (current->delays) + __delayacct_freepages_start(); +} + +static inline void delayacct_freepages_end(void) +{ + if (current->delays) + __delayacct_freepages_end(); +} + #else static inline void delayacct_set_flag(int flag) {} @@ -129,6 +143,11 @@ static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) { return 0; } static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) { return 0; } +static inline void delayacct_freepages_start(void) +{} +static inline void delayacct_freepages_end(void) +{} + #endif /* CONFIG_TASK_DELAY_ACCT */ #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index d22ffe06d0eb..42036ffe6b00 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -672,6 +672,10 @@ struct task_delay_info { /* io operations performed */ u32 swapin_count; /* total count of the number of swapin block */ /* io operations performed */ + + struct timespec freepages_start, freepages_end; + u64 freepages_delay; /* wait for memory reclaim */ + u32 freepages_count; /* total count of memory reclaim */ }; #endif /* CONFIG_TASK_DELAY_ACCT */ diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 10e43fd8b721..84b6782a2ce4 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -165,3 +165,16 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) return ret; } +void __delayacct_freepages_start(void) +{ + delayacct_start(¤t->delays->freepages_start); +} + +void __delayacct_freepages_end(void) +{ + delayacct_end(¤t->delays->freepages_start, + ¤t->delays->freepages_end, + ¤t->delays->freepages_delay, + ¤t->delays->freepages_count); +} + diff --git a/mm/vmscan.c b/mm/vmscan.c index 967d30ccd92b..26672c6cd3ce 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -1316,6 +1317,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zone *zone; enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); + delayacct_freepages_start(); + if (scan_global_lru(sc)) count_vm_event(ALLOCSTALL); /* @@ -1396,6 +1399,8 @@ out: } else mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); + delayacct_freepages_end(); + return ret; } -- cgit v1.2.3 From 16d69265b930f7e2fa9eea381715696f780718f4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 25 Jul 2008 19:44:36 -0700 Subject: uninline arch_pick_mmap_layout() Fix this, on avr32: include/linux/utsname.h:35, from init/main.c:20: include/linux/sched.h: In function 'arch_pick_mmap_layout': include/linux/sched.h:2149: error: implicit declaration of function 'PAGE_ALIGN' Reported-by: Adrian Bunk Cc: Haavard Skinnemoen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 9 --------- mm/util.c | 10 ++++++++++ 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 42036ffe6b00..3260a5c42b91 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2139,16 +2139,7 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ -#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT extern void arch_pick_mmap_layout(struct mm_struct *mm); -#else -static inline void arch_pick_mmap_layout(struct mm_struct *mm) -{ - mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; -} -#endif #ifdef CONFIG_TRACING extern void diff --git a/mm/util.c b/mm/util.c index 8f18683825bc..0efd83097ecf 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1,3 +1,4 @@ +#include #include #include #include @@ -136,3 +137,12 @@ char *strndup_user(const char __user *s, long n) return p; } EXPORT_SYMBOL(strndup_user); + +#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT +void arch_pick_mmap_layout(struct mm_struct *mm) +{ + mm->mmap_base = TASK_UNMAPPED_BASE; + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; +} +#endif -- cgit v1.2.3 From 7babe8db99d305340cf4828ce1f5a1481d5622ef Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Fri, 25 Jul 2008 19:45:11 -0700 Subject: Full conversion to early_initcall() interface, remove old interface A previous patch added the early_initcall(), to allow a cleaner hooking of pre-SMP initcalls. Now we remove the older interface, converting all existing users to the new one. [akpm@linux-foundation.org: cleanups] [akpm@linux-foundation.org: build fix] [kosaki.motohiro@jp.fujitsu.com: warning fix] [kosaki.motohiro@jp.fujitsu.com: warning fix] Signed-off-by: Eduard - Gabriel Munteanu Cc: Tom Zanussi Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 9 --------- include/linux/smp.h | 5 ----- init/main.c | 23 +---------------------- kernel/sched.c | 5 ++++- kernel/smp.c | 4 +++- kernel/softirq.c | 3 ++- kernel/softlockup.c | 25 ++++++++++++++++++++++--- 7 files changed, 32 insertions(+), 42 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3260a5c42b91..adb8077dc463 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -292,7 +292,6 @@ extern void sched_show_task(struct task_struct *p); #ifdef CONFIG_DETECT_SOFTLOCKUP extern void softlockup_tick(void); -extern void spawn_softlockup_task(void); extern void touch_softlockup_watchdog(void); extern void touch_all_softlockup_watchdogs(void); extern unsigned int softlockup_panic; @@ -2222,14 +2221,6 @@ static inline void inc_syscw(struct task_struct *tsk) } #endif -#ifdef CONFIG_SMP -void migration_init(void); -#else -static inline void migration_init(void) -{ -} -#endif - #ifndef TASK_SIZE_OF #define TASK_SIZE_OF(tsk) TASK_SIZE #endif diff --git a/include/linux/smp.h b/include/linux/smp.h index 48262f86c969..66484d4a8459 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -74,15 +74,10 @@ void __smp_call_function_single(int cpuid, struct call_single_data *data); #ifdef CONFIG_USE_GENERIC_SMP_HELPERS void generic_smp_call_function_single_interrupt(void); void generic_smp_call_function_interrupt(void); -void init_call_single_data(void); void ipi_call_lock(void); void ipi_call_unlock(void); void ipi_call_lock_irq(void); void ipi_call_unlock_irq(void); -#else -static inline void init_call_single_data(void) -{ -} #endif /* diff --git a/init/main.c b/init/main.c index b6fec08dbbef..20fdc9884b77 100644 --- a/init/main.c +++ b/init/main.c @@ -774,16 +774,7 @@ static void __init do_basic_setup(void) do_initcalls(); } -static int __initdata nosoftlockup; - -static int __init nosoftlockup_setup(char *str) -{ - nosoftlockup = 1; - return 1; -} -__setup("nosoftlockup", nosoftlockup_setup); - -static void __init __do_pre_smp_initcalls(void) +static void __init do_pre_smp_initcalls(void) { initcall_t *call; @@ -791,17 +782,6 @@ static void __init __do_pre_smp_initcalls(void) do_one_initcall(*call); } -static void __init do_pre_smp_initcalls(void) -{ - extern int spawn_ksoftirqd(void); - - init_call_single_data(); - migration_init(); - spawn_ksoftirqd(); - if (!nosoftlockup) - spawn_softlockup_task(); -} - static void run_init_process(char *init_filename) { argv_init[0] = init_filename; @@ -873,7 +853,6 @@ static int __init kernel_init(void * unused) smp_prepare_cpus(setup_max_cpus); - __do_pre_smp_initcalls(); do_pre_smp_initcalls(); smp_init(); diff --git a/kernel/sched.c b/kernel/sched.c index 0047bd9b96aa..fde1a1026359 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6389,7 +6389,7 @@ static struct notifier_block __cpuinitdata migration_notifier = { .priority = 10 }; -void __init migration_init(void) +static int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; @@ -6399,7 +6399,10 @@ void __init migration_init(void) BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); + + return err; } +early_initcall(migration_init); #endif #ifdef CONFIG_SMP diff --git a/kernel/smp.c b/kernel/smp.c index 462c785ca1ee..96fc7c0edc59 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -33,7 +33,7 @@ struct call_single_queue { spinlock_t lock; }; -void __cpuinit init_call_single_data(void) +static int __cpuinit init_call_single_data(void) { int i; @@ -43,7 +43,9 @@ void __cpuinit init_call_single_data(void) spin_lock_init(&q->lock); INIT_LIST_HEAD(&q->list); } + return 0; } +early_initcall(init_call_single_data); static void csd_flag_wait(struct call_single_data *data) { diff --git a/kernel/softirq.c b/kernel/softirq.c index f6b03d56c2bf..c506f266a6b9 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -630,7 +630,7 @@ static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; -__init int spawn_ksoftirqd(void) +static __init int spawn_ksoftirqd(void) { void *cpu = (void *)(long)smp_processor_id(); int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); @@ -640,6 +640,7 @@ __init int spawn_ksoftirqd(void) register_cpu_notifier(&cpu_nfb); return 0; } +early_initcall(spawn_ksoftirqd); #ifdef CONFIG_SMP /* diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 7bd8d1aadd5d..b75b492fbfcf 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -338,14 +338,33 @@ static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; -__init void spawn_softlockup_task(void) +static int __initdata nosoftlockup; + +static int __init nosoftlockup_setup(char *str) +{ + nosoftlockup = 1; + return 1; +} +__setup("nosoftlockup", nosoftlockup_setup); + +static int __init spawn_softlockup_task(void) { void *cpu = (void *)(long)smp_processor_id(); - int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + int err; - BUG_ON(err == NOTIFY_BAD); + if (nosoftlockup) + return 0; + + err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + if (err == NOTIFY_BAD) { + BUG(); + return 1; + } cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + + return 0; } +early_initcall(spawn_softlockup_task); -- cgit v1.2.3 From 2b2a1ff64afbadac842bbc58c5166962cf4f7664 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:54 -0700 Subject: tracehook: death This moves the ptrace logic in task death (exit_notify) into tracehook.h inlines. Some code is rearranged slightly to make things nicer. There is no change, only cleanup. There is one hook called with the tasklist_lock write-locked, as ptrace needs. There is also a new hook called after exit_state changes and without locks. This is a better place for tracing work to be in the future, since it doesn't delay the whole system with locking. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- include/linux/tracehook.h | 52 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/exit.c | 26 ++++++++---------------- kernel/signal.c | 10 ++++++--- 4 files changed, 69 insertions(+), 21 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index adb8077dc463..a95d84d0da95 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1796,7 +1796,7 @@ extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_ extern int kill_pgrp(struct pid *pid, int sig, int priv); extern int kill_pid(struct pid *pid, int sig, int priv); extern int kill_proc_info(int, struct siginfo *, pid_t); -extern void do_notify_parent(struct task_struct *, int); +extern int do_notify_parent(struct task_struct *, int); extern void force_sig(int, struct task_struct *); extern void force_sig_specific(int, struct task_struct *); extern int send_sig(int, struct task_struct *, int); diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 6dc428dd2f38..4c50e1b57349 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -471,4 +471,56 @@ static inline int tracehook_notify_jctl(int notify, int why) return notify || (current->ptrace & PT_PTRACED); } +/** + * tracehook_notify_death - task is dead, ready to notify parent + * @task: @current task now exiting + * @death_cookie: value to pass to tracehook_report_death() + * @group_dead: nonzero if this was the last thread in the group to die + * + * Return the signal number to send our parent with do_notify_parent(), or + * zero to send no signal and leave a zombie, or -1 to self-reap right now. + * + * Called with write_lock_irq(&tasklist_lock) held. + */ +static inline int tracehook_notify_death(struct task_struct *task, + void **death_cookie, int group_dead) +{ + if (task->exit_signal == -1) + return task->ptrace ? SIGCHLD : -1; + + /* + * If something other than our normal parent is ptracing us, then + * send it a SIGCHLD instead of honoring exit_signal. exit_signal + * only has special meaning to our real parent. + */ + if (thread_group_empty(task) && !ptrace_reparented(task)) + return task->exit_signal; + + return task->ptrace ? SIGCHLD : 0; +} + +/** + * tracehook_report_death - task is dead and ready to be reaped + * @task: @current task now exiting + * @signal: signal number sent to parent, or 0 or -1 + * @death_cookie: value passed back from tracehook_notify_death() + * @group_dead: nonzero if this was the last thread in the group to die + * + * Thread has just become a zombie or is about to self-reap. If positive, + * @signal is the signal number just sent to the parent (usually %SIGCHLD). + * If @signal is -1, this thread will self-reap. If @signal is 0, this is + * a delayed_group_leader() zombie. The @death_cookie was passed back by + * tracehook_notify_death(). + * + * If normal reaping is not inhibited, @task->exit_state might be changing + * in parallel. + * + * Called without locks. + */ +static inline void tracehook_report_death(struct task_struct *task, + int signal, void *death_cookie, + int group_dead) +{ +} + #endif /* */ diff --git a/kernel/exit.c b/kernel/exit.c index da28745f7c38..6cdf60712bd2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -885,7 +885,8 @@ static void forget_original_parent(struct task_struct *father) */ static void exit_notify(struct task_struct *tsk, int group_dead) { - int state; + int signal; + void *cookie; /* * This does two things: @@ -922,22 +923,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead) !capable(CAP_KILL)) tsk->exit_signal = SIGCHLD; - /* If something other than our normal parent is ptracing us, then - * send it a SIGCHLD instead of honoring exit_signal. exit_signal - * only has special meaning to our real parent. - */ - if (!task_detached(tsk) && thread_group_empty(tsk)) { - int signal = ptrace_reparented(tsk) ? - SIGCHLD : tsk->exit_signal; - do_notify_parent(tsk, signal); - } else if (tsk->ptrace) { - do_notify_parent(tsk, SIGCHLD); - } + signal = tracehook_notify_death(tsk, &cookie, group_dead); + if (signal > 0) + signal = do_notify_parent(tsk, signal); - state = EXIT_ZOMBIE; - if (task_detached(tsk) && likely(!tsk->ptrace)) - state = EXIT_DEAD; - tsk->exit_state = state; + tsk->exit_state = signal < 0 ? EXIT_DEAD : EXIT_ZOMBIE; /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && @@ -947,8 +937,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead) write_unlock_irq(&tasklist_lock); + tracehook_report_death(tsk, signal, cookie, group_dead); + /* If the process is dead, release it - nobody will wait for it */ - if (state == EXIT_DEAD) + if (signal < 0) release_task(tsk); } diff --git a/kernel/signal.c b/kernel/signal.c index e9e699f4b1bd..0e862d3130ff 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1326,9 +1326,11 @@ static inline void __wake_up_parent(struct task_struct *p, /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. + * + * Returns -1 if our parent ignored us and so we've switched to + * self-reaping, or else @sig. */ - -void do_notify_parent(struct task_struct *tsk, int sig) +int do_notify_parent(struct task_struct *tsk, int sig) { struct siginfo info; unsigned long flags; @@ -1399,12 +1401,14 @@ void do_notify_parent(struct task_struct *tsk, int sig) */ tsk->exit_signal = -1; if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) - sig = 0; + sig = -1; } if (valid_signal(sig) && sig > 0) __group_send_sig_info(sig, &info, tsk->parent); __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); + + return sig; } static void do_notify_parent_cldstop(struct task_struct *tsk, int why) -- cgit v1.2.3 From 85ba2d862e521375a8ee01526c5c46b1f24bb4af Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:58 -0700 Subject: tracehook: wait_task_inactive This extends wait_task_inactive() with a new argument so it can be used in a "soft" mode where it will check for the task changing state unexpectedly and back off. There is no change to existing callers. This lays the groundwork to allow robust, noninvasive tracing that can try to sample a blocked thread but back off safely if it wakes up. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 4 ++-- include/linux/sched.h | 8 ++++++-- kernel/kthread.c | 2 +- kernel/ptrace.c | 2 +- kernel/sched.c | 29 +++++++++++++++++++++++++++-- 5 files changed, 37 insertions(+), 8 deletions(-) (limited to 'include/linux/sched.h') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 19d4493c6193..fc8f3509df27 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2626,7 +2626,7 @@ pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task) /* * make sure the task is off any CPU */ - wait_task_inactive(task); + wait_task_inactive(task, 0); /* more to come... */ @@ -4774,7 +4774,7 @@ recheck: UNPROTECT_CTX(ctx, flags); - wait_task_inactive(task); + wait_task_inactive(task, 0); PROTECT_CTX(ctx, flags); diff --git a/include/linux/sched.h b/include/linux/sched.h index a95d84d0da95..f59318a0099b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1882,9 +1882,13 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP -extern void wait_task_inactive(struct task_struct * p); +extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else -#define wait_task_inactive(p) do { } while (0) +static inline unsigned long wait_task_inactive(struct task_struct *p, + long match_state) +{ + return 1; +} #endif #define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) diff --git a/kernel/kthread.c b/kernel/kthread.c index 6111c27491b1..96cff2f8710b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -176,7 +176,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) return; } /* Must have done schedule() in kthread() before we set_task_cpu */ - wait_task_inactive(k); + wait_task_inactive(k, 0); set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); k->rt.nr_cpus_allowed = 1; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 8392a9da6450..082b3fcb32a0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -107,7 +107,7 @@ int ptrace_check_attach(struct task_struct *child, int kill) read_unlock(&tasklist_lock); if (!ret && !kill) - wait_task_inactive(child); + ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; /* All systems go.. */ return ret; diff --git a/kernel/sched.c b/kernel/sched.c index fde1a1026359..0236958addcb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1867,16 +1867,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) /* * wait_task_inactive - wait for a thread to unschedule. * + * If @match_state is nonzero, it's the @p->state value just checked and + * not expected to change. If it changes, i.e. @p might have woken up, + * then return zero. When we succeed in waiting for @p to be off its CPU, + * we return a positive number (its total switch count). If a second call + * a short while later returns the same number, the caller can be sure that + * @p has remained unscheduled the whole time. + * * The caller must ensure that the task *will* unschedule sometime soon, * else this function might spin for a *long* time. This function can't * be called with interrupts off, or it may introduce deadlock with * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ -void wait_task_inactive(struct task_struct *p) +unsigned long wait_task_inactive(struct task_struct *p, long match_state) { unsigned long flags; int running, on_rq; + unsigned long ncsw; struct rq *rq; for (;;) { @@ -1899,8 +1907,11 @@ void wait_task_inactive(struct task_struct *p) * return false if the runqueue has changed and p * is actually now running somewhere else! */ - while (task_running(rq, p)) + while (task_running(rq, p)) { + if (match_state && unlikely(p->state != match_state)) + return 0; cpu_relax(); + } /* * Ok, time to look more closely! We need the rq @@ -1910,8 +1921,20 @@ void wait_task_inactive(struct task_struct *p) rq = task_rq_lock(p, &flags); running = task_running(rq, p); on_rq = p->se.on_rq; + ncsw = 0; + if (!match_state || p->state == match_state) { + ncsw = p->nivcsw + p->nvcsw; + if (unlikely(!ncsw)) + ncsw = 1; + } task_rq_unlock(rq, &flags); + /* + * If it changed from the expected state, bail out now. + */ + if (unlikely(!ncsw)) + break; + /* * Was it really running after all now that we * checked with the proper locks actually held? @@ -1944,6 +1967,8 @@ void wait_task_inactive(struct task_struct *p) */ break; } + + return ncsw; } /*** -- cgit v1.2.3 From 5995477ab7f3522c497c9c4a1c55373e9d655574 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sun, 27 Jul 2008 17:29:15 +0200 Subject: task IO accounting: improve code readability Put all i/o statistics in struct proc_io_accounting and use inline functions to initialize and increment statistics, removing a lot of single variable assignments. This also reduces the kernel size as following (with CONFIG_TASK_XACCT=y and CONFIG_TASK_IO_ACCOUNTING=y). text data bss dec hex filename 11651 0 0 11651 2d83 kernel/exit.o.before 11619 0 0 11619 2d63 kernel/exit.o.after 10886 132 136 11154 2b92 kernel/fork.o.before 10758 132 136 11026 2b12 kernel/fork.o.after 3082029 807968 4818600 8708597 84e1f5 vmlinux.o.before 3081869 807968 4818600 8708437 84e155 vmlinux.o.after Signed-off-by: Andrea Righi Acked-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/proc/base.c | 57 ++++++++++------------------------ include/linux/sched.h | 19 ++++-------- include/linux/task_io_accounting.h | 27 ++++++++++++++-- include/linux/task_io_accounting_ops.h | 56 +++++++++++++++++++++++++++------ kernel/exit.c | 30 ++---------------- kernel/fork.c | 15 ++------- kernel/tsacct.c | 14 ++++----- 7 files changed, 104 insertions(+), 114 deletions(-) (limited to 'include/linux/sched.h') diff --git a/fs/proc/base.c b/fs/proc/base.c index e74308bdabd3..3d94906c7aa8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -2402,44 +2403,17 @@ static int proc_base_fill_cache(struct file *filp, void *dirent, #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, char *buffer, int whole) { - u64 rchar, wchar, syscr, syscw; - struct task_io_accounting ioac; - - rchar = task->rchar; - wchar = task->wchar; - syscr = task->syscr; - syscw = task->syscw; - memcpy(&ioac, &task->ioac, sizeof(ioac)); - - if (whole) { - unsigned long flags; - - if (lock_task_sighand(task, &flags)) { - struct signal_struct *sig = task->signal; - struct task_struct *t = task; - - rchar += sig->rchar; - wchar += sig->wchar; - syscr += sig->syscr; - syscw += sig->syscw; - - ioac.read_bytes += sig->ioac.read_bytes; - ioac.write_bytes += sig->ioac.write_bytes; - ioac.cancelled_write_bytes += - sig->ioac.cancelled_write_bytes; - while_each_thread(task, t) { - rchar += t->rchar; - wchar += t->wchar; - syscr += t->syscr; - syscw += t->syscw; - - ioac.read_bytes += t->ioac.read_bytes; - ioac.write_bytes += t->ioac.write_bytes; - ioac.cancelled_write_bytes += - t->ioac.cancelled_write_bytes; - } - unlock_task_sighand(task, &flags); - } + struct proc_io_accounting acct = task->ioac; + unsigned long flags; + + if (whole && lock_task_sighand(task, &flags)) { + struct task_struct *t = task; + + task_io_accounting_add(&acct, &task->signal->ioac); + while_each_thread(task, t) + task_io_accounting_add(&acct, &t->ioac); + + unlock_task_sighand(task, &flags); } return sprintf(buffer, "rchar: %llu\n" @@ -2449,9 +2423,10 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) "read_bytes: %llu\n" "write_bytes: %llu\n" "cancelled_write_bytes: %llu\n", - rchar, wchar, syscr, syscw, - ioac.read_bytes, ioac.write_bytes, - ioac.cancelled_write_bytes); + acct.chr.rchar, acct.chr.wchar, + acct.chr.syscr, acct.chr.syscw, + acct.blk.read_bytes, acct.blk.write_bytes, + acct.blk.cancelled_write_bytes); } static int proc_tid_io_accounting(struct task_struct *task, char *buffer) diff --git a/include/linux/sched.h b/include/linux/sched.h index f59318a0099b..034c1ca6b332 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -505,10 +505,7 @@ struct signal_struct { unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; -#ifdef CONFIG_TASK_XACCT - u64 rchar, wchar, syscr, syscw; -#endif - struct task_io_accounting ioac; + struct proc_io_accounting ioac; /* * Cumulative ns of scheduled CPU time for dead threads in the @@ -1256,11 +1253,7 @@ struct task_struct { unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. */ -#ifdef CONFIG_TASK_XACCT -/* i/o counters(bytes read/written, #syscalls */ - u64 rchar, wchar, syscr, syscw; -#endif - struct task_io_accounting ioac; + struct proc_io_accounting ioac; #if defined(CONFIG_TASK_XACCT) u64 acct_rss_mem1; /* accumulated rss usage */ u64 acct_vm_mem1; /* accumulated virtual memory usage */ @@ -2190,22 +2183,22 @@ extern long sched_group_rt_period(struct task_group *tg); #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { - tsk->rchar += amt; + tsk->ioac.chr.rchar += amt; } static inline void add_wchar(struct task_struct *tsk, ssize_t amt) { - tsk->wchar += amt; + tsk->ioac.chr.wchar += amt; } static inline void inc_syscr(struct task_struct *tsk) { - tsk->syscr++; + tsk->ioac.chr.syscr++; } static inline void inc_syscw(struct task_struct *tsk) { - tsk->syscw++; + tsk->ioac.chr.syscw++; } #else static inline void add_rchar(struct task_struct *tsk, ssize_t amt) diff --git a/include/linux/task_io_accounting.h b/include/linux/task_io_accounting.h index 44d00e9cceea..165390f8b936 100644 --- a/include/linux/task_io_accounting.h +++ b/include/linux/task_io_accounting.h @@ -1,5 +1,5 @@ /* - * task_io_accounting: a structure which is used for recording a single task's + * proc_io_accounting: a structure which is used for recording a single task's * IO statistics. * * Don't include this header file directly - it is designed to be dragged in via @@ -8,6 +8,22 @@ * Blame akpm@osdl.org for all this. */ +#ifdef CONFIG_TASK_XACCT +struct task_chr_io_accounting { + /* bytes read */ + u64 rchar; + /* bytes written */ + u64 wchar; + /* # of read syscalls */ + u64 syscr; + /* # of write syscalls */ + u64 syscw; +}; +#else /* CONFIG_TASK_XACCT */ +struct task_chr_io_accounting { +}; +#endif /* CONFIG_TASK_XACCT */ + #ifdef CONFIG_TASK_IO_ACCOUNTING struct task_io_accounting { /* @@ -31,7 +47,12 @@ struct task_io_accounting { */ u64 cancelled_write_bytes; }; -#else +#else /* CONFIG_TASK_IO_ACCOUNTING */ struct task_io_accounting { }; -#endif +#endif /* CONFIG_TASK_IO_ACCOUNTING */ + +struct proc_io_accounting { + struct task_chr_io_accounting chr; + struct task_io_accounting blk; +}; diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h index ff46c6fad79d..e6f958ebe97f 100644 --- a/include/linux/task_io_accounting_ops.h +++ b/include/linux/task_io_accounting_ops.h @@ -9,7 +9,7 @@ #ifdef CONFIG_TASK_IO_ACCOUNTING static inline void task_io_account_read(size_t bytes) { - current->ioac.read_bytes += bytes; + current->ioac.blk.read_bytes += bytes; } /* @@ -18,12 +18,12 @@ static inline void task_io_account_read(size_t bytes) */ static inline unsigned long task_io_get_inblock(const struct task_struct *p) { - return p->ioac.read_bytes >> 9; + return p->ioac.blk.read_bytes >> 9; } static inline void task_io_account_write(size_t bytes) { - current->ioac.write_bytes += bytes; + current->ioac.blk.write_bytes += bytes; } /* @@ -32,17 +32,25 @@ static inline void task_io_account_write(size_t bytes) */ static inline unsigned long task_io_get_oublock(const struct task_struct *p) { - return p->ioac.write_bytes >> 9; + return p->ioac.blk.write_bytes >> 9; } static inline void task_io_account_cancelled_write(size_t bytes) { - current->ioac.cancelled_write_bytes += bytes; + current->ioac.blk.cancelled_write_bytes += bytes; } -static inline void task_io_accounting_init(struct task_struct *tsk) +static inline void task_io_accounting_init(struct proc_io_accounting *ioac) { - memset(&tsk->ioac, 0, sizeof(tsk->ioac)); + memset(ioac, 0, sizeof(*ioac)); +} + +static inline void task_blk_io_accounting_add(struct proc_io_accounting *dst, + struct proc_io_accounting *src) +{ + dst->blk.read_bytes += src->blk.read_bytes; + dst->blk.write_bytes += src->blk.write_bytes; + dst->blk.cancelled_write_bytes += src->blk.cancelled_write_bytes; } #else @@ -69,9 +77,37 @@ static inline void task_io_account_cancelled_write(size_t bytes) { } -static inline void task_io_accounting_init(struct task_struct *tsk) +static inline void task_io_accounting_init(struct proc_io_accounting *ioac) +{ +} + +static inline void task_blk_io_accounting_add(struct proc_io_accounting *dst, + struct proc_io_accounting *src) { } -#endif /* CONFIG_TASK_IO_ACCOUNTING */ -#endif /* __TASK_IO_ACCOUNTING_OPS_INCLUDED */ +#endif /* CONFIG_TASK_IO_ACCOUNTING */ + +#ifdef CONFIG_TASK_XACCT +static inline void task_chr_io_accounting_add(struct proc_io_accounting *dst, + struct proc_io_accounting *src) +{ + dst->chr.rchar += src->chr.rchar; + dst->chr.wchar += src->chr.wchar; + dst->chr.syscr += src->chr.syscr; + dst->chr.syscw += src->chr.syscw; +} +#else +static inline void task_chr_io_accounting_add(struct proc_io_accounting *dst, + struct proc_io_accounting *src) +{ +} +#endif /* CONFIG_TASK_XACCT */ + +static inline void task_io_accounting_add(struct proc_io_accounting *dst, + struct proc_io_accounting *src) +{ + task_chr_io_accounting_add(dst, src); + task_blk_io_accounting_add(dst, src); +} +#endif /* __TASK_IO_ACCOUNTING_OPS_INCLUDED */ diff --git a/kernel/exit.c b/kernel/exit.c index 0caf590548a0..eb4d6470d1d0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -121,18 +121,7 @@ static void __exit_signal(struct task_struct *tsk) sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); -#ifdef CONFIG_TASK_XACCT - sig->rchar += tsk->rchar; - sig->wchar += tsk->wchar; - sig->syscr += tsk->syscr; - sig->syscw += tsk->syscw; -#endif /* CONFIG_TASK_XACCT */ -#ifdef CONFIG_TASK_IO_ACCOUNTING - sig->ioac.read_bytes += tsk->ioac.read_bytes; - sig->ioac.write_bytes += tsk->ioac.write_bytes; - sig->ioac.cancelled_write_bytes += - tsk->ioac.cancelled_write_bytes; -#endif /* CONFIG_TASK_IO_ACCOUNTING */ + task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -1363,21 +1352,8 @@ static int wait_task_zombie(struct task_struct *p, int options, psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; -#ifdef CONFIG_TASK_XACCT - psig->rchar += p->rchar + sig->rchar; - psig->wchar += p->wchar + sig->wchar; - psig->syscr += p->syscr + sig->syscr; - psig->syscw += p->syscw + sig->syscw; -#endif /* CONFIG_TASK_XACCT */ -#ifdef CONFIG_TASK_IO_ACCOUNTING - psig->ioac.read_bytes += - p->ioac.read_bytes + sig->ioac.read_bytes; - psig->ioac.write_bytes += - p->ioac.write_bytes + sig->ioac.write_bytes; - psig->ioac.cancelled_write_bytes += - p->ioac.cancelled_write_bytes + - sig->ioac.cancelled_write_bytes; -#endif /* CONFIG_TASK_IO_ACCOUNTING */ + task_io_accounting_add(&psig->ioac, &p->ioac); + task_io_accounting_add(&psig->ioac, &sig->ioac); spin_unlock_irq(&p->parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 5e050c1317c4..8214ba7c8bb1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -806,12 +806,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; -#ifdef CONFIG_TASK_XACCT - sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0; -#endif -#ifdef CONFIG_TASK_IO_ACCOUNTING - memset(&sig->ioac, 0, sizeof(sig->ioac)); -#endif + task_io_accounting_init(&sig->ioac); sig->sum_sched_runtime = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); @@ -994,13 +989,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->last_switch_timestamp = 0; #endif -#ifdef CONFIG_TASK_XACCT - p->rchar = 0; /* I/O counter: bytes read */ - p->wchar = 0; /* I/O counter: bytes written */ - p->syscr = 0; /* I/O counter: read syscalls */ - p->syscw = 0; /* I/O counter: write syscalls */ -#endif - task_io_accounting_init(p); + task_io_accounting_init(&p->ioac); acct_clear_integrals(p); p->it_virt_expires = cputime_zero; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 3da47ccdc5e5..f9cd2561689c 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -94,14 +94,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; mmput(mm); } - stats->read_char = p->rchar; - stats->write_char = p->wchar; - stats->read_syscalls = p->syscr; - stats->write_syscalls = p->syscw; + stats->read_char = p->ioac.chr.rchar; + stats->write_char = p->ioac.chr.wchar; + stats->read_syscalls = p->ioac.chr.syscr; + stats->write_syscalls = p->ioac.chr.syscw; #ifdef CONFIG_TASK_IO_ACCOUNTING - stats->read_bytes = p->ioac.read_bytes; - stats->write_bytes = p->ioac.write_bytes; - stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; + stats->read_bytes = p->ioac.blk.read_bytes; + stats->write_bytes = p->ioac.blk.write_bytes; + stats->cancelled_write_bytes = p->ioac.blk.cancelled_write_bytes; #else stats->read_bytes = 0; stats->write_bytes = 0; -- cgit v1.2.3 From 940389b8afad6495211614c13eb91ef7001773ec Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 28 Jul 2008 00:48:12 +0200 Subject: task IO accounting: move all IO statistics in struct task_io_accounting Simplify the code of include/linux/task_io_accounting.h. It is also more reasonable to have all the task i/o-related statistics in a single struct (task_io_accounting). Signed-off-by: Andrea Righi Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/proc/base.c | 10 +++---- include/linux/sched.h | 12 ++++----- include/linux/task_io_accounting.h | 17 ++---------- include/linux/task_io_accounting_ops.h | 48 +++++++++++++++++----------------- kernel/tsacct.c | 14 +++++----- 5 files changed, 44 insertions(+), 57 deletions(-) (limited to 'include/linux/sched.h') diff --git a/fs/proc/base.c b/fs/proc/base.c index 3d94906c7aa8..01ed610f9b87 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2403,7 +2403,7 @@ static int proc_base_fill_cache(struct file *filp, void *dirent, #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, char *buffer, int whole) { - struct proc_io_accounting acct = task->ioac; + struct task_io_accounting acct = task->ioac; unsigned long flags; if (whole && lock_task_sighand(task, &flags)) { @@ -2423,10 +2423,10 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) "read_bytes: %llu\n" "write_bytes: %llu\n" "cancelled_write_bytes: %llu\n", - acct.chr.rchar, acct.chr.wchar, - acct.chr.syscr, acct.chr.syscw, - acct.blk.read_bytes, acct.blk.write_bytes, - acct.blk.cancelled_write_bytes); + acct.rchar, acct.wchar, + acct.syscr, acct.syscw, + acct.read_bytes, acct.write_bytes, + acct.cancelled_write_bytes); } static int proc_tid_io_accounting(struct task_struct *task, char *buffer) diff --git a/include/linux/sched.h b/include/linux/sched.h index 034c1ca6b332..5270d449ff9d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -505,7 +505,7 @@ struct signal_struct { unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; - struct proc_io_accounting ioac; + struct task_io_accounting ioac; /* * Cumulative ns of scheduled CPU time for dead threads in the @@ -1253,7 +1253,7 @@ struct task_struct { unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. */ - struct proc_io_accounting ioac; + struct task_io_accounting ioac; #if defined(CONFIG_TASK_XACCT) u64 acct_rss_mem1; /* accumulated rss usage */ u64 acct_vm_mem1; /* accumulated virtual memory usage */ @@ -2183,22 +2183,22 @@ extern long sched_group_rt_period(struct task_group *tg); #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { - tsk->ioac.chr.rchar += amt; + tsk->ioac.rchar += amt; } static inline void add_wchar(struct task_struct *tsk, ssize_t amt) { - tsk->ioac.chr.wchar += amt; + tsk->ioac.wchar += amt; } static inline void inc_syscr(struct task_struct *tsk) { - tsk->ioac.chr.syscr++; + tsk->ioac.syscr++; } static inline void inc_syscw(struct task_struct *tsk) { - tsk->ioac.chr.syscw++; + tsk->ioac.syscw++; } #else static inline void add_rchar(struct task_struct *tsk, ssize_t amt) diff --git a/include/linux/task_io_accounting.h b/include/linux/task_io_accounting.h index 165390f8b936..5e88afc9a2fb 100644 --- a/include/linux/task_io_accounting.h +++ b/include/linux/task_io_accounting.h @@ -1,5 +1,5 @@ /* - * proc_io_accounting: a structure which is used for recording a single task's + * task_io_accounting: a structure which is used for recording a single task's * IO statistics. * * Don't include this header file directly - it is designed to be dragged in via @@ -8,8 +8,8 @@ * Blame akpm@osdl.org for all this. */ +struct task_io_accounting { #ifdef CONFIG_TASK_XACCT -struct task_chr_io_accounting { /* bytes read */ u64 rchar; /* bytes written */ @@ -18,14 +18,9 @@ struct task_chr_io_accounting { u64 syscr; /* # of write syscalls */ u64 syscw; -}; -#else /* CONFIG_TASK_XACCT */ -struct task_chr_io_accounting { -}; #endif /* CONFIG_TASK_XACCT */ #ifdef CONFIG_TASK_IO_ACCOUNTING -struct task_io_accounting { /* * The number of bytes which this task has caused to be read from * storage. @@ -46,13 +41,5 @@ struct task_io_accounting { * information loss in doing that. */ u64 cancelled_write_bytes; -}; -#else /* CONFIG_TASK_IO_ACCOUNTING */ -struct task_io_accounting { -}; #endif /* CONFIG_TASK_IO_ACCOUNTING */ - -struct proc_io_accounting { - struct task_chr_io_accounting chr; - struct task_io_accounting blk; }; diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h index e6f958ebe97f..4d090f9ee608 100644 --- a/include/linux/task_io_accounting_ops.h +++ b/include/linux/task_io_accounting_ops.h @@ -9,7 +9,7 @@ #ifdef CONFIG_TASK_IO_ACCOUNTING static inline void task_io_account_read(size_t bytes) { - current->ioac.blk.read_bytes += bytes; + current->ioac.read_bytes += bytes; } /* @@ -18,12 +18,12 @@ static inline void task_io_account_read(size_t bytes) */ static inline unsigned long task_io_get_inblock(const struct task_struct *p) { - return p->ioac.blk.read_bytes >> 9; + return p->ioac.read_bytes >> 9; } static inline void task_io_account_write(size_t bytes) { - current->ioac.blk.write_bytes += bytes; + current->ioac.write_bytes += bytes; } /* @@ -32,25 +32,25 @@ static inline void task_io_account_write(size_t bytes) */ static inline unsigned long task_io_get_oublock(const struct task_struct *p) { - return p->ioac.blk.write_bytes >> 9; + return p->ioac.write_bytes >> 9; } static inline void task_io_account_cancelled_write(size_t bytes) { - current->ioac.blk.cancelled_write_bytes += bytes; + current->ioac.cancelled_write_bytes += bytes; } -static inline void task_io_accounting_init(struct proc_io_accounting *ioac) +static inline void task_io_accounting_init(struct task_io_accounting *ioac) { memset(ioac, 0, sizeof(*ioac)); } -static inline void task_blk_io_accounting_add(struct proc_io_accounting *dst, - struct proc_io_accounting *src) +static inline void task_blk_io_accounting_add(struct task_io_accounting *dst, + struct task_io_accounting *src) { - dst->blk.read_bytes += src->blk.read_bytes; - dst->blk.write_bytes += src->blk.write_bytes; - dst->blk.cancelled_write_bytes += src->blk.cancelled_write_bytes; + dst->read_bytes += src->read_bytes; + dst->write_bytes += src->write_bytes; + dst->cancelled_write_bytes += src->cancelled_write_bytes; } #else @@ -77,35 +77,35 @@ static inline void task_io_account_cancelled_write(size_t bytes) { } -static inline void task_io_accounting_init(struct proc_io_accounting *ioac) +static inline void task_io_accounting_init(struct task_io_accounting *ioac) { } -static inline void task_blk_io_accounting_add(struct proc_io_accounting *dst, - struct proc_io_accounting *src) +static inline void task_blk_io_accounting_add(struct task_io_accounting *dst, + struct task_io_accounting *src) { } #endif /* CONFIG_TASK_IO_ACCOUNTING */ #ifdef CONFIG_TASK_XACCT -static inline void task_chr_io_accounting_add(struct proc_io_accounting *dst, - struct proc_io_accounting *src) +static inline void task_chr_io_accounting_add(struct task_io_accounting *dst, + struct task_io_accounting *src) { - dst->chr.rchar += src->chr.rchar; - dst->chr.wchar += src->chr.wchar; - dst->chr.syscr += src->chr.syscr; - dst->chr.syscw += src->chr.syscw; + dst->rchar += src->rchar; + dst->wchar += src->wchar; + dst->syscr += src->syscr; + dst->syscw += src->syscw; } #else -static inline void task_chr_io_accounting_add(struct proc_io_accounting *dst, - struct proc_io_accounting *src) +static inline void task_chr_io_accounting_add(struct task_io_accounting *dst, + struct task_io_accounting *src) { } #endif /* CONFIG_TASK_XACCT */ -static inline void task_io_accounting_add(struct proc_io_accounting *dst, - struct proc_io_accounting *src) +static inline void task_io_accounting_add(struct task_io_accounting *dst, + struct task_io_accounting *src) { task_chr_io_accounting_add(dst, src); task_blk_io_accounting_add(dst, src); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index f9cd2561689c..8ebcd8532dfb 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -94,14 +94,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; mmput(mm); } - stats->read_char = p->ioac.chr.rchar; - stats->write_char = p->ioac.chr.wchar; - stats->read_syscalls = p->ioac.chr.syscr; - stats->write_syscalls = p->ioac.chr.syscw; + stats->read_char = p->ioac.rchar; + stats->write_char = p->ioac.wchar; + stats->read_syscalls = p->ioac.syscr; + stats->write_syscalls = p->ioac.syscw; #ifdef CONFIG_TASK_IO_ACCOUNTING - stats->read_bytes = p->ioac.blk.read_bytes; - stats->write_bytes = p->ioac.blk.write_bytes; - stats->cancelled_write_bytes = p->ioac.blk.cancelled_write_bytes; + stats->read_bytes = p->ioac.read_bytes; + stats->write_bytes = p->ioac.write_bytes; + stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; #else stats->read_bytes = 0; stats->write_bytes = 0; -- cgit v1.2.3 From e4e4e534faa3c2be4e165ce414f44b76ada7208c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 14 Apr 2008 08:50:02 +0200 Subject: sched clock: revert various sched_clock() changes Found an interactivity problem on a quad core test-system - simple CPU loops would occasionally delay the system un an unacceptable way. After much debugging with Peter Zijlstra it turned out that the problem is caused by the string of sched_clock() changes - they caused the CPU clock to jump backwards a bit - which confuses the scheduler arithmetics. (which is unsigned for performance reasons) So revert: # c300ba2: sched_clock: and multiplier for TSC to gtod drift # c0c8773: sched_clock: only update deltas with local reads. # af52a90: sched_clock: stop maximum check on NO HZ # f7cce27: sched_clock: widen the max and min time This solves the interactivity problems. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Mike Galbraith --- include/linux/sched.h | 17 +------- kernel/sched_clock.c | 109 ++++++----------------------------------------- kernel/time/tick-sched.c | 2 - 3 files changed, 14 insertions(+), 114 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5270d449ff9d..ea436bc1a0e2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1572,28 +1572,13 @@ static inline void sched_clock_idle_sleep_event(void) static inline void sched_clock_idle_wakeup_event(u64 delta_ns) { } - -#ifdef CONFIG_NO_HZ -static inline void sched_clock_tick_stop(int cpu) -{ -} - -static inline void sched_clock_tick_start(int cpu) -{ -} -#endif - -#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +#else extern void sched_clock_init(void); extern u64 sched_clock_cpu(int cpu); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); -#ifdef CONFIG_NO_HZ -extern void sched_clock_tick_stop(int cpu); -extern void sched_clock_tick_start(int cpu); #endif -#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 5a2dc7d8fd98..9a7844158ae8 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -44,11 +44,6 @@ unsigned long long __attribute__((weak)) sched_clock(void) #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -#define MULTI_SHIFT 15 -/* Max is double, Min is 1/2 */ -#define MAX_MULTI (2LL << MULTI_SHIFT) -#define MIN_MULTI (1LL << (MULTI_SHIFT-1)) - struct sched_clock_data { /* * Raw spinlock - this is a special case: this might be called @@ -62,10 +57,6 @@ struct sched_clock_data { u64 tick_raw; u64 tick_gtod; u64 clock; - s64 multi; -#ifdef CONFIG_NO_HZ - int check_max; -#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); @@ -97,53 +88,18 @@ void sched_clock_init(void) scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; - scd->multi = 1 << MULTI_SHIFT; -#ifdef CONFIG_NO_HZ - scd->check_max = 1; -#endif } sched_clock_running = 1; } -#ifdef CONFIG_NO_HZ -/* - * The dynamic ticks makes the delta jiffies inaccurate. This - * prevents us from checking the maximum time update. - * Disable the maximum check during stopped ticks. - */ -void sched_clock_tick_stop(int cpu) -{ - struct sched_clock_data *scd = cpu_sdc(cpu); - - scd->check_max = 0; -} - -void sched_clock_tick_start(int cpu) -{ - struct sched_clock_data *scd = cpu_sdc(cpu); - - scd->check_max = 1; -} - -static int check_max(struct sched_clock_data *scd) -{ - return scd->check_max; -} -#else -static int check_max(struct sched_clock_data *scd) -{ - return 1; -} -#endif /* CONFIG_NO_HZ */ - /* * update the percpu scd from the raw @now value * * - filter out backward motion * - use jiffies to generate a min,max window to clip the raw values */ -static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time) +static void __update_sched_clock(struct sched_clock_data *scd, u64 now) { unsigned long now_jiffies = jiffies; long delta_jiffies = now_jiffies - scd->tick_jiffies; @@ -152,31 +108,16 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim s64 delta = now - scd->prev_raw; WARN_ON_ONCE(!irqs_disabled()); - - /* - * At schedule tick the clock can be just under the gtod. We don't - * want to push it too prematurely. - */ - min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC); - if (min_clock > TICK_NSEC) - min_clock -= TICK_NSEC / 2; + min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; if (unlikely(delta < 0)) { clock++; goto out; } - /* - * The clock must stay within a jiffie of the gtod. - * But since we may be at the start of a jiffy or the end of one - * we add another jiffy buffer. - */ - max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; - - delta *= scd->multi; - delta >>= MULTI_SHIFT; + max_clock = min_clock + TICK_NSEC; - if (unlikely(clock + delta > max_clock) && check_max(scd)) { + if (unlikely(clock + delta > max_clock)) { if (clock < max_clock) clock = max_clock; else @@ -189,12 +130,9 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim if (unlikely(clock < min_clock)) clock = min_clock; - if (time) - *time = clock; - else { - scd->prev_raw = now; - scd->clock = clock; - } + scd->prev_raw = now; + scd->tick_jiffies = now_jiffies; + scd->clock = clock; } static void lock_double_clock(struct sched_clock_data *data1, @@ -238,26 +176,21 @@ u64 sched_clock_cpu(int cpu) now -= scd->tick_gtod; __raw_spin_unlock(&my_scd->lock); - - __update_sched_clock(scd, now, &clock); - - __raw_spin_unlock(&scd->lock); - } else { __raw_spin_lock(&scd->lock); - __update_sched_clock(scd, now, NULL); - clock = scd->clock; - __raw_spin_unlock(&scd->lock); } + __update_sched_clock(scd, now); + clock = scd->clock; + + __raw_spin_unlock(&scd->lock); + return clock; } void sched_clock_tick(void) { struct sched_clock_data *scd = this_scd(); - unsigned long now_jiffies = jiffies; - s64 mult, delta_gtod, delta_raw; u64 now, now_gtod; if (unlikely(!sched_clock_running)) @@ -269,29 +202,14 @@ void sched_clock_tick(void) now = sched_clock(); __raw_spin_lock(&scd->lock); - __update_sched_clock(scd, now, NULL); + __update_sched_clock(scd, now); /* * update tick_gtod after __update_sched_clock() because that will * already observe 1 new jiffy; adding a new tick_gtod to that would * increase the clock 2 jiffies. */ - delta_gtod = now_gtod - scd->tick_gtod; - delta_raw = now - scd->tick_raw; - - if ((long)delta_raw > 0) { - mult = delta_gtod << MULTI_SHIFT; - do_div(mult, delta_raw); - scd->multi = mult; - if (scd->multi > MAX_MULTI) - scd->multi = MAX_MULTI; - else if (scd->multi < MIN_MULTI) - scd->multi = MIN_MULTI; - } else - scd->multi = 1 << MULTI_SHIFT; - scd->tick_raw = now; scd->tick_gtod = now_gtod; - scd->tick_jiffies = now_jiffies; __raw_spin_unlock(&scd->lock); } @@ -321,7 +239,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) __raw_spin_lock(&scd->lock); scd->prev_raw = now; scd->clock += delta_ns; - scd->multi = 1 << MULTI_SHIFT; __raw_spin_unlock(&scd->lock); touch_softlockup_watchdog(); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 825b4c00fe44..f5da526424a9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -289,7 +289,6 @@ void tick_nohz_stop_sched_tick(int inidle) ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; rcu_enter_nohz(); - sched_clock_tick_stop(cpu); } /* @@ -392,7 +391,6 @@ void tick_nohz_restart_sched_tick(void) select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); - sched_clock_tick_start(cpu); cpu_clear(cpu, nohz_cpu_mask); /* -- cgit v1.2.3 From c1955a3d4762e7a9bf84035eb3c4886a900f0d15 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Aug 2008 08:59:03 +0200 Subject: sched_clock: delay using sched_clock() Some arch's can't handle sched_clock() being called too early - delay this until sched_clock_init() has been called. Reported-by: Bill Gatliff Signed-off-by: Peter Zijlstra Tested-by: Nishanth Aravamudan CC: Russell King - ARM Linux Signed-off-by: Ingo Molnar --- include/linux/sched.h | 14 +++----------- kernel/sched_clock.c | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 13 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index ea436bc1a0e2..5850bfb968a8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1551,16 +1551,10 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) extern unsigned long long sched_clock(void); -#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static inline void sched_clock_init(void) -{ -} - -static inline u64 sched_clock_cpu(int cpu) -{ - return sched_clock(); -} +extern void sched_clock_init(void); +extern u64 sched_clock_cpu(int cpu); +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK static inline void sched_clock_tick(void) { } @@ -1573,8 +1567,6 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns) { } #else -extern void sched_clock_init(void); -extern u64 sched_clock_cpu(int cpu); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 074edc989379..204991a0bfa7 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -42,6 +42,8 @@ unsigned long long __attribute__((weak)) sched_clock(void) return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); } +static __read_mostly int sched_clock_running; + #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK struct sched_clock_data { @@ -70,8 +72,6 @@ static inline struct sched_clock_data *cpu_sdc(int cpu) return &per_cpu(sched_clock_data, cpu); } -static __read_mostly int sched_clock_running; - void sched_clock_init(void) { u64 ktime_now = ktime_to_ns(ktime_get()); @@ -248,6 +248,21 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); +#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ + +void sched_clock_init(void) +{ + sched_clock_running = 1; +} + +u64 sched_clock_cpu(int cpu) +{ + if (unlikely(!sched_clock_running)) + return 0; + + return sched_clock(); +} + #endif unsigned long long cpu_clock(int cpu) -- cgit v1.2.3