From 9c44bc03fff44ff04237a7d92e35304a0e50c331 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:21:04 +0200
Subject: softlockup: allow panic on lockup

allow users to configure the softlockup detector to generate a panic
instead of a warning message.

high-availability systems might opt for this strict method (combined
with panic_timeout= boot option/sysctl), instead of generating
softlockup warnings ad infinitum.

also, automated tests work better if the system reboots reliably (into
a safe kernel) in case of a lockup.

The full spectrum of configurability is supported: boot option, sysctl
option and Kconfig option.

it's default-disabled.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/sched.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5395a6176f4b..71f5972dc48e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -294,7 +294,8 @@ extern void softlockup_tick(void);
 extern void spawn_softlockup_task(void);
 extern void touch_softlockup_watchdog(void);
 extern void touch_all_softlockup_watchdogs(void);
-extern unsigned long  softlockup_thresh;
+extern unsigned int  softlockup_panic;
+extern unsigned long softlockup_thresh;
 extern unsigned long sysctl_hung_task_check_count;
 extern unsigned long sysctl_hung_task_timeout_secs;
 extern unsigned long sysctl_hung_task_warnings;
-- 
cgit v1.2.3


From 9383d9679056e6cc4e7ff70f31da945a268238f4 Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Mon, 12 May 2008 21:21:14 +0200
Subject: softlockup: fix softlockup_thresh unaligned access and disable
 detection at runtime

Fix unaligned access errors when setting softlockup_thresh on
64 bit platforms.

Allow softlockup detection to be disabled by setting
softlockup_thresh <= 0.

Detect that boot time softlockup detection has been disabled
earlier in softlockup_tick.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/sched.h |  2 +-
 kernel/softlockup.c   | 12 ++++++++++--
 kernel/sysctl.c       |  9 +++++----
 3 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 71f5972dc48e..ea26221644e2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -295,10 +295,10 @@ extern void spawn_softlockup_task(void);
 extern void touch_softlockup_watchdog(void);
 extern void touch_all_softlockup_watchdogs(void);
 extern unsigned int  softlockup_panic;
-extern unsigned long softlockup_thresh;
 extern unsigned long sysctl_hung_task_check_count;
 extern unsigned long sysctl_hung_task_timeout_secs;
 extern unsigned long sysctl_hung_task_warnings;
+extern int softlockup_thresh;
 #else
 static inline void softlockup_tick(void)
 {
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 78e0ad21cb0c..a3a0b239b7f7 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -25,7 +25,7 @@ static DEFINE_PER_CPU(unsigned long, print_timestamp);
 static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
 
 static int __read_mostly did_panic;
-unsigned long __read_mostly softlockup_thresh = 60;
+int __read_mostly softlockup_thresh = 60;
 
 /*
  * Should we panic (and reboot, if panic_timeout= is set) when a
@@ -94,6 +94,14 @@ void softlockup_tick(void)
 	struct pt_regs *regs = get_irq_regs();
 	unsigned long now;
 
+	/* Is detection switched off? */
+	if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) {
+		/* Be sure we don't false trigger if switched back on */
+		if (touch_timestamp)
+			per_cpu(touch_timestamp, this_cpu) = 0;
+		return;
+	}
+
 	if (touch_timestamp == 0) {
 		touch_softlockup_watchdog();
 		return;
@@ -104,7 +112,7 @@ void softlockup_tick(void)
 	/* report at most once a second */
 	if ((print_timestamp >= touch_timestamp &&
 			print_timestamp < (touch_timestamp + 1)) ||
-			did_panic || !per_cpu(watchdog_task, this_cpu)) {
+			did_panic) {
 		return;
 	}
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2d3b388c402d..31c19a79738d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -84,12 +84,13 @@ extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
 
 /* Constants used for minimum and  maximum */
-#if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM)
+#ifdef CONFIG_HIGHMEM
 static int one = 1;
 #endif
 
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 static int sixty = 60;
+static int neg_one = -1;
 #endif
 
 #ifdef CONFIG_MMU
@@ -742,11 +743,11 @@ static struct ctl_table kern_table[] = {
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "softlockup_thresh",
 		.data		= &softlockup_thresh,
-		.maxlen		= sizeof(unsigned long),
+		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_minmax,
+		.proc_handler	= &proc_dointvec_minmax,
 		.strategy	= &sysctl_intvec,
-		.extra1		= &one,
+		.extra1		= &neg_one,
 		.extra2		= &sixty,
 	},
 	{
-- 
cgit v1.2.3


From 1b427c153a08fdbc092c2bdbf845b92fda58d857 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 18 Jul 2008 14:01:39 +0200
Subject: sched: fix build error, provide partition_sched_domains()
 unconditionally
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

provide an empty partition_sched_domains() definition for the UP case:

 include/linux/cpuset.h: In function ‘rebuild_sched_domains':
 include/linux/cpuset.h:163: error: implicit declaration of function ‘partition_sched_domains'

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1941d8b5cf11..26da921530fe 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -824,7 +824,16 @@ extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 				    struct sched_domain_attr *dattr_new);
 extern int arch_reinit_sched_domains(void);
 
-#endif	/* CONFIG_SMP */
+#else /* CONFIG_SMP */
+
+struct sched_domain_attr;
+
+static inline void
+partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+			struct sched_domain_attr *dattr_new)
+{
+}
+#endif	/* !CONFIG_SMP */
 
 struct io_context;			/* See blkdev.h */
 #define NGROUPS_SMALL		32
-- 
cgit v1.2.3


From 8b05c7e6e159d2f33c9275281b8b909a89eb7c5d Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Wed, 23 Jul 2008 21:26:53 -0700
Subject: add a helper function to test if an object is on the stack

lib/debugobjects.c has a function to test if an object is on the stack.
The block layer and ide needs it (they need to avoid DMA from/to stack
buffers).  This patch moves the function to include/linux/sched.h so that
everyone can use it.

lib/debugobjects.c uses current->stack but this patch uses a
task_stack_page() accessor, which is a preferable way to access the stack.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 7 +++++++
 lib/debugobjects.c    | 4 +---
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index dc7e592c473a..6aca4a16e377 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1983,6 +1983,13 @@ static inline unsigned long *end_of_stack(struct task_struct *p)
 
 #endif
 
+static inline int object_is_on_stack(void *obj)
+{
+	void *stack = task_stack_page(current);
+
+	return (obj >= stack) && (obj < (stack + THREAD_SIZE));
+}
+
 extern void thread_info_cache_init(void);
 
 /* set thread flags in other task's structures
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 85b18d79be89..f86196390cfd 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -226,15 +226,13 @@ debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state),
 
 static void debug_object_is_on_stack(void *addr, int onstack)
 {
-	void *stack = current->stack;
 	int is_on_stack;
 	static int limit;
 
 	if (limit > 4)
 		return;
 
-	is_on_stack = (addr >= stack && addr < (stack + THREAD_SIZE));
-
+	is_on_stack = object_is_on_stack(addr);
 	if (is_on_stack == onstack)
 		return;
 
-- 
cgit v1.2.3


From 364d3c13c17f45da6d638011078d4c4d3070d719 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:36 -0700
Subject: ptrace: give more respect to SIGKILL

ptrace_stop() has some complicated checks to prevent the scheduling in the
TASK_TRACED state with the pending SIGKILL, but these checks are racy, and
they depend on arch_ptrace_stop_needed().

This patch assumes that the traced task should die asap if it was killed by
SIGKILL, in that case schedule()->signal_pending_state() has no reason to
ignore the TASK_WAKEKILL part of TASK_TRACED, and we can kill this nasty
special case.

Note: do_exit()->ptrace_notify() is special, the killed task can already
dequeue SIGKILL at this point. Another indication that fatal_signal_pending()
is not exactly right.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6aca4a16e377..79e749dbf81e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2054,9 +2054,6 @@ static inline int signal_pending_state(long state, struct task_struct *p)
 	if (!signal_pending(p))
 		return 0;
 
-	if (state & (__TASK_STOPPED | __TASK_TRACED))
-		return 0;
-
 	return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
 }
 
-- 
cgit v1.2.3


From 7b34e4283c685f5cc6ba6d30e939906eee0d4bcf Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:37 -0700
Subject: introduce PF_KTHREAD flag

Introduce the new PF_KTHREAD flag to mark the kernel threads.  It is set
by INIT_TASK() and copied to the forked childs (we could set it in
kthreadd() along with PF_NOFREEZE instead).

daemonize() was changed as well.  In that case testing of PF_KTHREAD is
racy, but daemonize() is hopeless anyway.

This flag is cleared in do_execve(), before search_binary_handler().
Probably not the best place, we can do this in exec_mmap() or in
start_thread(), or clear it along with PF_FORKNOEXEC.  But I think this
doesn't matter in practice, and if do_execve() fails kthread should die
soon.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c                 | 1 +
 include/linux/init_task.h | 2 +-
 include/linux/sched.h     | 1 +
 kernel/exit.c             | 2 +-
 4 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/fs/exec.c b/fs/exec.c
index af249af4ccab..cd2e8c9b1249 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1326,6 +1326,7 @@ int do_execve(char * filename,
 	if (retval < 0)
 		goto out;
 
+	current->flags &= ~PF_KTHREAD;
 	retval = search_binary_handler(bprm,regs);
 	if (retval >= 0) {
 		/* execve success */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 93c45acf249a..021d8e720c79 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -122,7 +122,7 @@ extern struct group_info init_groups;
 	.state		= 0,						\
 	.stack		= &init_thread_info,				\
 	.usage		= ATOMIC_INIT(2),				\
-	.flags		= 0,						\
+	.flags		= PF_KTHREAD,					\
 	.lock_depth	= -1,						\
 	.prio		= MAX_PRIO-20,					\
 	.static_prio	= MAX_PRIO-20,					\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 79e749dbf81e..eec64a4adb9d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1483,6 +1483,7 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
 #define PF_VCPU		0x00000010	/* I'm a virtual CPU */
+#define PF_KTHREAD	0x00000020	/* I am a kernel thread */
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
 #define PF_DUMPCORE	0x00000200	/* dumped core */
diff --git a/kernel/exit.c b/kernel/exit.c
index a7799d8a6404..28a44a2612dc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -430,7 +430,7 @@ void daemonize(const char *name, ...)
 	 * We don't want to have TIF_FREEZE set if the system-wide hibernation
 	 * or suspend transition begins right now.
 	 */
-	current->flags |= PF_NOFREEZE;
+	current->flags |= (PF_NOFREEZE | PF_KTHREAD);
 
 	if (current->nsproxy != &init_nsproxy) {
 		get_nsproxy(&init_nsproxy);
-- 
cgit v1.2.3


From 246bb0b1deb29726990620d8b5e55ca29f331362 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:38 -0700
Subject: kill PF_BORROWED_MM in favour of PF_KTHREAD

Kill PF_BORROWED_MM.  Change use_mm/unuse_mm to not play with ->flags, and
do s/PF_BORROWED_MM/PF_KTHREAD/ for a couple of other users.

No functional changes yet.  But this allows us to do further
fixes/cleanups.

oom_kill/ptrace/etc often check "p->mm != NULL" to filter out the
kthreads, this is wrong because of use_mm().  The problem with
PF_BORROWED_MM is that we need task_lock() to avoid races.  With this
patch we can check PF_KTHREAD directly, or use a simple lockless helper:

	/* The result must not be dereferenced !!! */
	struct mm_struct *__get_task_mm(struct task_struct *tsk)
	{
		if (tsk->flags & PF_KTHREAD)
			return NULL;
		return tsk->mm;
	}

Note also ecard_task().  It runs with ->mm != NULL, but it's the kernel
thread without PF_BORROWED_MM.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/aio.c              | 2 --
 include/linux/sched.h | 3 +--
 kernel/fork.c         | 4 ++--
 3 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/fs/aio.c b/fs/aio.c
index 0fb3117ddd93..0051fd94b44e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -586,7 +586,6 @@ static void use_mm(struct mm_struct *mm)
 	struct task_struct *tsk = current;
 
 	task_lock(tsk);
-	tsk->flags |= PF_BORROWED_MM;
 	active_mm = tsk->active_mm;
 	atomic_inc(&mm->mm_count);
 	tsk->mm = mm;
@@ -610,7 +609,6 @@ static void unuse_mm(struct mm_struct *mm)
 	struct task_struct *tsk = current;
 
 	task_lock(tsk);
-	tsk->flags &= ~PF_BORROWED_MM;
 	tsk->mm = NULL;
 	/* active_mm is still 'mm' */
 	enter_lazy_tlb(mm, tsk);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index eec64a4adb9d..0560999eb1db 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1483,7 +1483,6 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
 #define PF_VCPU		0x00000010	/* I'm a virtual CPU */
-#define PF_KTHREAD	0x00000020	/* I am a kernel thread */
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
 #define PF_DUMPCORE	0x00000200	/* dumped core */
@@ -1497,7 +1496,7 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_KSWAPD	0x00040000	/* I am kswapd */
 #define PF_SWAPOFF	0x00080000	/* I am in swapoff */
 #define PF_LESS_THROTTLE 0x00100000	/* Throttle me less: I clean memory */
-#define PF_BORROWED_MM	0x00200000	/* I am a kthread doing use_mm */
+#define PF_KTHREAD	0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE	0x00400000	/* randomize virtual address space */
 #define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
 #define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
diff --git a/kernel/fork.c b/kernel/fork.c
index 228f80c9155a..eeaec6893b0d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -474,7 +474,7 @@ EXPORT_SYMBOL_GPL(mmput);
 /**
  * get_task_mm - acquire a reference to the task's mm
  *
- * Returns %NULL if the task has no mm.  Checks PF_BORROWED_MM (meaning
+ * Returns %NULL if the task has no mm.  Checks PF_KTHREAD (meaning
  * this kernel workthread has transiently adopted a user mm with use_mm,
  * to do its AIO) is not set and if so returns a reference to it, after
  * bumping up the use count.  User must release the mm via mmput()
@@ -487,7 +487,7 @@ struct mm_struct *get_task_mm(struct task_struct *task)
 	task_lock(task);
 	mm = task->mm;
 	if (mm) {
-		if (task->flags & PF_BORROWED_MM)
+		if (task->flags & PF_KTHREAD)
 			mm = NULL;
 		else
 			atomic_inc(&mm->mm_users);
-- 
cgit v1.2.3


From 19b0cfcca41dd772065671ad0584e1cea0f3fd13 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 25 Jul 2008 01:48:35 -0700
Subject: pidns: remove now unused kill_proc function

This function operated on a pid_t to kill a task, which is no longer valid
in a containerized system.

It has finally lost all its users and we can safely remove it from the
tree.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  1 -
 kernel/signal.c       | 12 ------------
 2 files changed, 13 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0560999eb1db..134cb5cb506c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1800,7 +1800,6 @@ extern void force_sig(int, struct task_struct *);
 extern void force_sig_specific(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
 extern void zap_other_threads(struct task_struct *p);
-extern int kill_proc(pid_t, int, int);
 extern struct sigqueue *sigqueue_alloc(void);
 extern void sigqueue_free(struct sigqueue *);
 extern int send_sigqueue(struct sigqueue *,  struct task_struct *, int group);
diff --git a/kernel/signal.c b/kernel/signal.c
index 5c7b7eaa0dc6..82c3545596c5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1228,17 +1228,6 @@ int kill_pid(struct pid *pid, int sig, int priv)
 }
 EXPORT_SYMBOL(kill_pid);
 
-int
-kill_proc(pid_t pid, int sig, int priv)
-{
-	int ret;
-
-	rcu_read_lock();
-	ret = kill_pid_info(sig, __si_special(priv), find_pid(pid));
-	rcu_read_unlock();
-	return ret;
-}
-
 /*
  * These functions support sending signals using preallocated sigqueue
  * structures.  This is needed "because realtime applications cannot
@@ -1906,7 +1895,6 @@ EXPORT_SYMBOL(recalc_sigpending);
 EXPORT_SYMBOL_GPL(dequeue_signal);
 EXPORT_SYMBOL(flush_signals);
 EXPORT_SYMBOL(force_sig);
-EXPORT_SYMBOL(kill_proc);
 EXPORT_SYMBOL(ptrace_notify);
 EXPORT_SYMBOL(send_sig);
 EXPORT_SYMBOL(send_sig_info);
-- 
cgit v1.2.3


From e49859e71e0318b564de1546bdc30fab738f9deb Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 25 Jul 2008 01:48:36 -0700
Subject: pidns: remove now unused find_pid function.

This one had the only users so far - the kill_proc, which is removed, so
drop this (invalid in namespaced world) call too.

And of course - erase all references on it from comments.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pid.h   | 4 +---
 include/linux/sched.h | 2 +-
 kernel/pid.c          | 8 +-------
 3 files changed, 3 insertions(+), 11 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 6f084b9e2c40..ff1b2a5814d4 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -48,7 +48,7 @@ enum pid_type
  */
 
 struct upid {
-	/* Try to keep pid_chain in the same cacheline as nr for find_pid */
+	/* Try to keep pid_chain in the same cacheline as nr for find_vpid */
 	int nr;
 	struct pid_namespace *ns;
 	struct hlist_node pid_chain;
@@ -105,14 +105,12 @@ extern struct pid_namespace init_pid_ns;
  * or rcu_read_lock() held.
  *
  * find_pid_ns() finds the pid in the namespace specified
- * find_pid() find the pid by its global id, i.e. in the init namespace
  * find_vpid() finr the pid by its virtual id, i.e. in the current namespace
  *
  * see also find_task_by_pid() set in include/linux/sched.h
  */
 extern struct pid *find_pid_ns(int nr, struct pid_namespace *ns);
 extern struct pid *find_vpid(int nr);
-extern struct pid *find_pid(int nr);
 
 /*
  * Lookup a PID in the hash table, and return with it's count elevated.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 134cb5cb506c..182da1550fad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1718,7 +1718,7 @@ extern struct pid_namespace init_pid_ns;
  * find_task_by_pid():
  *      finds a task by its global pid
  *
- * see also find_pid() etc in include/linux/pid.h
+ * see also find_vpid() etc in include/linux/pid.h
  */
 
 extern struct task_struct *find_task_by_pid_type_ns(int type, int pid,
diff --git a/kernel/pid.c b/kernel/pid.c
index 753fd90d9ec1..064e76afa507 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -309,12 +309,6 @@ struct pid *find_vpid(int nr)
 }
 EXPORT_SYMBOL_GPL(find_vpid);
 
-struct pid *find_pid(int nr)
-{
-	return find_pid_ns(nr, &init_pid_ns);
-}
-EXPORT_SYMBOL_GPL(find_pid);
-
 /*
  * attach_pid() must be called with the tasklist_lock write-held.
  */
@@ -483,7 +477,7 @@ EXPORT_SYMBOL(task_session_nr_ns);
 /*
  * Used by proc to find the first pid that is greater then or equal to nr.
  *
- * If there is a pid at nr this function is exactly the same as find_pid.
+ * If there is a pid at nr this function is exactly the same as find_pid_ns.
  */
 struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 {
-- 
cgit v1.2.3


From dbda0de52618d13d1b927c7ba7bb839cfddc4e8c Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 25 Jul 2008 01:48:37 -0700
Subject: pidns: remove find_task_by_pid, unused for a long time

It seems to me that it was a mistake marking this function as deprecated
and scheduling it for removal, rather than resolutely removing it after
the last caller's death.

Anyway - better late, then never.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/feature-removal-schedule.txt | 18 ------------------
 include/linux/pid.h                        |  2 +-
 include/linux/sched.h                      |  6 ------
 3 files changed, 1 insertion(+), 25 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 09c4a1efb8e3..721c71b86e06 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -138,24 +138,6 @@ Who:	Kay Sievers <kay.sievers@suse.de>
 
 ---------------------------
 
-What:	find_task_by_pid
-When:	2.6.26
-Why:	With pid namespaces, calling this funciton will return the
-	wrong task when called from inside a namespace.
-
-	The best way to save a task pid and find a task by this
-	pid later, is to find this task's struct pid pointer (or get
-	it directly from the task) and call pid_task() later.
-
-	If someone really needs to get a task by its pid_t, then
-	he most likely needs the find_task_by_vpid() to get the
-	task from the same namespace as the current task is in, but
-	this may be not so in general.
-
-Who:	Pavel Emelyanov <xemul@openvz.org>
-
----------------------------
-
 What:	ACPI procfs interface
 When:	July 2008
 Why:	ACPI sysfs conversion should be finished by January 2008.
diff --git a/include/linux/pid.h b/include/linux/pid.h
index ff1b2a5814d4..22921ac4cfd9 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -107,7 +107,7 @@ extern struct pid_namespace init_pid_ns;
  * find_pid_ns() finds the pid in the namespace specified
  * find_vpid() finr the pid by its virtual id, i.e. in the current namespace
  *
- * see also find_task_by_pid() set in include/linux/sched.h
+ * see also find_task_by_vpid() set in include/linux/sched.h
  */
 extern struct pid *find_pid_ns(int nr, struct pid_namespace *ns);
 extern struct pid *find_vpid(int nr);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 182da1550fad..354ef478a80d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1715,8 +1715,6 @@ extern struct pid_namespace init_pid_ns;
  *      finds a task by its pid in the specified namespace
  * find_task_by_vpid():
  *      finds a task by its virtual pid
- * find_task_by_pid():
- *      finds a task by its global pid
  *
  * see also find_vpid() etc in include/linux/pid.h
  */
@@ -1724,10 +1722,6 @@ extern struct pid_namespace init_pid_ns;
 extern struct task_struct *find_task_by_pid_type_ns(int type, int pid,
 		struct pid_namespace *ns);
 
-static inline struct task_struct *__deprecated find_task_by_pid(pid_t nr)
-{
-	return find_task_by_pid_type_ns(PIDTYPE_PID, nr, &init_pid_ns);
-}
 extern struct task_struct *find_task_by_vpid(pid_t nr);
 extern struct task_struct *find_task_by_pid_ns(pid_t nr,
 		struct pid_namespace *ns);
-- 
cgit v1.2.3


From 49b5cf34727a6c1be1568ab28e89a2d9a6bf51e0 Mon Sep 17 00:00:00 2001
From: Jonathan Lim <jlim@sgi.com>
Date: Fri, 25 Jul 2008 01:48:40 -0700
Subject: accounting: account for user time when updating memory integrals

Adapt acct_update_integrals() to include user time when calculating the time
difference.  The units of acct_rss_mem1 and acct_vm_mem1 are also changed from
pages-jiffies to pages-usecs to avoid calling jiffies_to_usecs() in
xacct_add_tsk() which might overflow.

Signed-off-by: Jonathan Lim <jlim@sgi.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  2 +-
 kernel/sched.c        |  2 ++
 kernel/tsacct.c       | 21 ++++++++++++++-------
 3 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 354ef478a80d..af780f299c7c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1257,7 +1257,7 @@ struct task_struct {
 #if defined(CONFIG_TASK_XACCT)
 	u64 acct_rss_mem1;	/* accumulated rss usage */
 	u64 acct_vm_mem1;	/* accumulated virtual memory usage */
-	cputime_t acct_stimexpd;/* stime since last update */
+	cputime_t acct_timexpd;	/* stime + utime since last update */
 #endif
 #ifdef CONFIG_CPUSETS
 	nodemask_t mems_allowed;
diff --git a/kernel/sched.c b/kernel/sched.c
index 6acf749d3336..0047bd9b96aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4046,6 +4046,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
 		cpustat->nice = cputime64_add(cpustat->nice, tmp);
 	else
 		cpustat->user = cputime64_add(cpustat->user, tmp);
+	/* Account for user time used */
+	acct_update_integrals(p);
 }
 
 /*
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 4ab1b584961b..1da6990af8e0 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -84,9 +84,9 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 {
 	struct mm_struct *mm;
 
-	/* convert pages-jiffies to Mbyte-usec */
-	stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
-	stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
+	/* convert pages-usec to Mbyte-usec */
+	stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
+	stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
 	mm = get_task_mm(p);
 	if (mm) {
 		/* adjust to KB unit */
@@ -118,12 +118,19 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 void acct_update_integrals(struct task_struct *tsk)
 {
 	if (likely(tsk->mm)) {
-		long delta = cputime_to_jiffies(
-			cputime_sub(tsk->stime, tsk->acct_stimexpd));
+		cputime_t time, dtime;
+		struct timeval value;
+		u64 delta;
+
+		time = tsk->stime + tsk->utime;
+		dtime = cputime_sub(time, tsk->acct_timexpd);
+		jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
+		delta = value.tv_sec;
+		delta = delta * USEC_PER_SEC + value.tv_usec;
 
 		if (delta == 0)
 			return;
-		tsk->acct_stimexpd = tsk->stime;
+		tsk->acct_timexpd = time;
 		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
 		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
 	}
@@ -135,7 +142,7 @@ void acct_update_integrals(struct task_struct *tsk)
  */
 void acct_clear_integrals(struct task_struct *tsk)
 {
-	tsk->acct_stimexpd = 0;
+	tsk->acct_timexpd = 0;
 	tsk->acct_rss_mem1 = 0;
 	tsk->acct_vm_mem1 = 0;
 }
-- 
cgit v1.2.3


From 297c5d92634c809cef23d73e7b2556f2528ff7e2 Mon Sep 17 00:00:00 2001
From: Andrea Righi <righi.andrea@gmail.com>
Date: Fri, 25 Jul 2008 01:48:49 -0700
Subject: task IO accounting: provide distinct tgid/tid I/O statistics

Report per-thread I/O statistics in /proc/pid/task/tid/io and aggregate
parent I/O statistics in /proc/pid/io.  This approach follows the same
model used to account per-process and per-thread CPU times.

As a practial application, this allows for example to quickly find the top
I/O consumer when a process spawns many child threads that perform the
actual I/O work, because the aggregated I/O statistics can always be found
in /proc/pid/io.

[ Oleg Nesterov points out that we should check that the task is still
  alive before we iterate over the threads, but also says that we can do
  that fixup on top of this later.  - Linus ]

Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Andrea Righi <righi.andrea@gmail.com>
Cc: Matt Heaton <matt@hostmonster.com>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Acked-by-with-comments: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c        | 86 ++++++++++++++++++++++++++++++++++++++++++---------
 include/linux/sched.h |  4 +++
 kernel/exit.c         | 27 ++++++++++++++++
 kernel/fork.c         |  6 ++++
 4 files changed, 108 insertions(+), 15 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 58c3e6a8e15e..a891fe4cb43b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2376,29 +2376,82 @@ static int proc_base_fill_cache(struct file *filp, void *dirent,
 }
 
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-static int proc_pid_io_accounting(struct task_struct *task, char *buffer)
-{
+static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
+{
+	u64 rchar, wchar, syscr, syscw;
+	struct task_io_accounting ioac;
+
+	if (!whole) {
+		rchar = task->rchar;
+		wchar = task->wchar;
+		syscr = task->syscr;
+		syscw = task->syscw;
+		memcpy(&ioac, &task->ioac, sizeof(ioac));
+	} else {
+		unsigned long flags;
+		struct task_struct *t = task;
+		rchar = wchar = syscr = syscw = 0;
+		memset(&ioac, 0, sizeof(ioac));
+
+		rcu_read_lock();
+		do {
+			rchar += t->rchar;
+			wchar += t->wchar;
+			syscr += t->syscr;
+			syscw += t->syscw;
+
+			ioac.read_bytes += t->ioac.read_bytes;
+			ioac.write_bytes += t->ioac.write_bytes;
+			ioac.cancelled_write_bytes +=
+					t->ioac.cancelled_write_bytes;
+			t = next_thread(t);
+		} while (t != task);
+		rcu_read_unlock();
+
+		if (lock_task_sighand(task, &flags)) {
+			struct signal_struct *sig = task->signal;
+
+			rchar += sig->rchar;
+			wchar += sig->wchar;
+			syscr += sig->syscr;
+			syscw += sig->syscw;
+
+			ioac.read_bytes += sig->ioac.read_bytes;
+			ioac.write_bytes += sig->ioac.write_bytes;
+			ioac.cancelled_write_bytes +=
+					sig->ioac.cancelled_write_bytes;
+
+			unlock_task_sighand(task, &flags);
+		}
+	}
+
 	return sprintf(buffer,
-#ifdef CONFIG_TASK_XACCT
 			"rchar: %llu\n"
 			"wchar: %llu\n"
 			"syscr: %llu\n"
 			"syscw: %llu\n"
-#endif
 			"read_bytes: %llu\n"
 			"write_bytes: %llu\n"
 			"cancelled_write_bytes: %llu\n",
-#ifdef CONFIG_TASK_XACCT
-			(unsigned long long)task->rchar,
-			(unsigned long long)task->wchar,
-			(unsigned long long)task->syscr,
-			(unsigned long long)task->syscw,
-#endif
-			(unsigned long long)task->ioac.read_bytes,
-			(unsigned long long)task->ioac.write_bytes,
-			(unsigned long long)task->ioac.cancelled_write_bytes);
+			(unsigned long long)rchar,
+			(unsigned long long)wchar,
+			(unsigned long long)syscr,
+			(unsigned long long)syscw,
+			(unsigned long long)ioac.read_bytes,
+			(unsigned long long)ioac.write_bytes,
+			(unsigned long long)ioac.cancelled_write_bytes);
+}
+
+static int proc_tid_io_accounting(struct task_struct *task, char *buffer)
+{
+	return do_io_accounting(task, buffer, 0);
 }
-#endif
+
+static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
+{
+	return do_io_accounting(task, buffer, 1);
+}
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
 
 /*
  * Thread groups
@@ -2470,7 +2523,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter),
 #endif
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-	INF("io",	S_IRUGO, pid_io_accounting),
+	INF("io",	S_IRUGO, tgid_io_accounting),
 #endif
 };
 
@@ -2797,6 +2850,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_FAULT_INJECTION
 	REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
 #endif
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+	INF("io",	S_IRUGO, tid_io_accounting),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file * filp,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index af780f299c7c..d22ffe06d0eb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -506,6 +506,10 @@ struct signal_struct {
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 	unsigned long inblock, oublock, cinblock, coublock;
+#ifdef CONFIG_TASK_XACCT
+	u64 rchar, wchar, syscr, syscw;
+#endif
+	struct task_io_accounting ioac;
 
 	/*
 	 * Cumulative ns of scheduled CPU time for dead threads in the
diff --git a/kernel/exit.c b/kernel/exit.c
index 8a4d4d12e294..ad933bb29ec7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -120,6 +120,18 @@ static void __exit_signal(struct task_struct *tsk)
 		sig->nivcsw += tsk->nivcsw;
 		sig->inblock += task_io_get_inblock(tsk);
 		sig->oublock += task_io_get_oublock(tsk);
+#ifdef CONFIG_TASK_XACCT
+		sig->rchar += tsk->rchar;
+		sig->wchar += tsk->wchar;
+		sig->syscr += tsk->syscr;
+		sig->syscw += tsk->syscw;
+#endif /* CONFIG_TASK_XACCT */
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+		sig->ioac.read_bytes += tsk->ioac.read_bytes;
+		sig->ioac.write_bytes += tsk->ioac.write_bytes;
+		sig->ioac.cancelled_write_bytes +=
+					tsk->ioac.cancelled_write_bytes;
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
 		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 		sig = NULL; /* Marker for below. */
 	}
@@ -1366,6 +1378,21 @@ static int wait_task_zombie(struct task_struct *p, int options,
 		psig->coublock +=
 			task_io_get_oublock(p) +
 			sig->oublock + sig->coublock;
+#ifdef CONFIG_TASK_XACCT
+		psig->rchar += p->rchar + sig->rchar;
+		psig->wchar += p->wchar + sig->wchar;
+		psig->syscr += p->syscr + sig->syscr;
+		psig->syscw += p->syscw + sig->syscw;
+#endif /* CONFIG_TASK_XACCT */
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+		psig->ioac.read_bytes +=
+			p->ioac.read_bytes + sig->ioac.read_bytes;
+		psig->ioac.write_bytes +=
+			p->ioac.write_bytes + sig->ioac.write_bytes;
+		psig->ioac.cancelled_write_bytes +=
+				p->ioac.cancelled_write_bytes +
+				sig->ioac.cancelled_write_bytes;
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
 		spin_unlock_irq(&p->parent->sighand->siglock);
 	}
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 813d5c89b9d5..b99d73e971a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -812,6 +812,12 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
+#ifdef CONFIG_TASK_XACCT
+	sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0;
+#endif
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+	memset(&sig->ioac, 0, sizeof(sig->ioac));
+#endif
 	sig->sum_sched_runtime = 0;
 	INIT_LIST_HEAD(&sig->cpu_timers[0]);
 	INIT_LIST_HEAD(&sig->cpu_timers[1]);
-- 
cgit v1.2.3


From 873b47717732c2f33a4b14de02571a4295a02f0c Mon Sep 17 00:00:00 2001
From: Keika Kobayashi <kobayashi.kk@ncos.nec.co.jp>
Date: Fri, 25 Jul 2008 01:48:52 -0700
Subject: per-task-delay-accounting: add memory reclaim delay

Sometimes, application responses become bad under heavy memory load.
Applications take a bit time to reclaim memory.  The statistics, how long
memory reclaim takes, will be useful to measure memory usage.

This patch adds accounting memory reclaim to per-task-delay-accounting for
accounting the time of do_try_to_free_pages().

<i.e>

- When System is under low memory load,
  memory reclaim may not occur.

$ free
             total       used       free     shared    buffers     cached
Mem:       8197800    1577300    6620500          0       4808    1516724
-/+ buffers/cache:      55768    8142032
Swap:     16386292          0   16386292

$ vmstat 1
procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 0  0      0 5069748  10612 3014060    0    0     0     0    3   26  0  0 100  0
 0  0      0 5069748  10612 3014060    0    0     0     0    4   22  0  0 100  0
 0  0      0 5069748  10612 3014060    0    0     0     0    3   18  0  0 100  0

Measure the time of tar command.

$ ls -s test.dat
1501472 test.dat

$ time tar cvf test.tar test.dat
real    0m13.388s
user    0m0.116s
sys     0m5.304s

$ ./delayget -d -p <pid>
CPU             count     real total  virtual total    delay total
                  428     5528345500     5477116080       62749891
IO              count    delay total
                  338     8078977189
SWAP            count    delay total
                    0              0
RECLAIM         count    delay total
                    0              0

- When system is under heavy memory load
  memory reclaim may occur.

$ vmstat 1
procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 0  0 7159032  49724   1812   3012    0    0     0     0    3   24  0  0 100  0
 0  0 7159032  49724   1812   3012    0    0     0     0    4   24  0  0 100  0
 0  0 7159032  49848   1812   3012    0    0     0     0    3   22  0  0 100  0

In this case, one process uses more 8G memory
by execution of malloc() and memset().

$ time tar cvf test.tar test.dat
real    1m38.563s        <-  increased by 85 sec
user    0m0.140s
sys     0m7.060s

$ ./delayget -d -p <pid>
CPU             count     real total  virtual total    delay total
                 9021     7140446250     7315277975      923201824
IO              count    delay total
                 8965    90466349669
SWAP            count    delay total
                    3       21036367
RECLAIM         count    delay total
                  740    61011951153

In the later case, the value of RECLAIM is increasing.
So, taskstats can show how much memory reclaim influences TAT.

Signed-off-by: Keika Kobayashi <kobayashi.kk@ncos.nec.co.jp>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujistu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/delayacct.h | 19 +++++++++++++++++++
 include/linux/sched.h     |  4 ++++
 kernel/delayacct.c        | 13 +++++++++++++
 mm/vmscan.c               |  5 +++++
 4 files changed, 41 insertions(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index ab94bc083558..f352f06fa063 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -39,6 +39,8 @@ extern void __delayacct_blkio_start(void);
 extern void __delayacct_blkio_end(void);
 extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *);
 extern __u64 __delayacct_blkio_ticks(struct task_struct *);
+extern void __delayacct_freepages_start(void);
+extern void __delayacct_freepages_end(void);
 
 static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
 {
@@ -107,6 +109,18 @@ static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
 	return 0;
 }
 
+static inline void delayacct_freepages_start(void)
+{
+	if (current->delays)
+		__delayacct_freepages_start();
+}
+
+static inline void delayacct_freepages_end(void)
+{
+	if (current->delays)
+		__delayacct_freepages_end();
+}
+
 #else
 static inline void delayacct_set_flag(int flag)
 {}
@@ -129,6 +143,11 @@ static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
 { return 0; }
 static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
 { return 0; }
+static inline void delayacct_freepages_start(void)
+{}
+static inline void delayacct_freepages_end(void)
+{}
+
 #endif /* CONFIG_TASK_DELAY_ACCT */
 
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d22ffe06d0eb..42036ffe6b00 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -672,6 +672,10 @@ struct task_delay_info {
 				/* io operations performed */
 	u32 swapin_count;	/* total count of the number of swapin block */
 				/* io operations performed */
+
+	struct timespec freepages_start, freepages_end;
+	u64 freepages_delay;	/* wait for memory reclaim */
+	u32 freepages_count;	/* total count of memory reclaim */
 };
 #endif	/* CONFIG_TASK_DELAY_ACCT */
 
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 10e43fd8b721..84b6782a2ce4 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -165,3 +165,16 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
 	return ret;
 }
 
+void __delayacct_freepages_start(void)
+{
+	delayacct_start(&current->delays->freepages_start);
+}
+
+void __delayacct_freepages_end(void)
+{
+	delayacct_end(&current->delays->freepages_start,
+			&current->delays->freepages_end,
+			&current->delays->freepages_delay,
+			&current->delays->freepages_count);
+}
+
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 967d30ccd92b..26672c6cd3ce 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -38,6 +38,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/memcontrol.h>
+#include <linux/delayacct.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -1316,6 +1317,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	struct zone *zone;
 	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
 
+	delayacct_freepages_start();
+
 	if (scan_global_lru(sc))
 		count_vm_event(ALLOCSTALL);
 	/*
@@ -1396,6 +1399,8 @@ out:
 	} else
 		mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
 
+	delayacct_freepages_end();
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From 16d69265b930f7e2fa9eea381715696f780718f4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 25 Jul 2008 19:44:36 -0700
Subject: uninline arch_pick_mmap_layout()

Fix this, on avr32:

  include/linux/utsname.h:35,
                   from init/main.c:20:
  include/linux/sched.h: In function 'arch_pick_mmap_layout':
  include/linux/sched.h:2149: error: implicit declaration of function 'PAGE_ALIGN'

Reported-by: Adrian Bunk <bunk@kernel.org>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  9 ---------
 mm/util.c             | 10 ++++++++++
 2 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 42036ffe6b00..3260a5c42b91 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2139,16 +2139,7 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
-#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
 extern void arch_pick_mmap_layout(struct mm_struct *mm);
-#else
-static inline void arch_pick_mmap_layout(struct mm_struct *mm)
-{
-	mm->mmap_base = TASK_UNMAPPED_BASE;
-	mm->get_unmapped_area = arch_get_unmapped_area;
-	mm->unmap_area = arch_unmap_area;
-}
-#endif
 
 #ifdef CONFIG_TRACING
 extern void
diff --git a/mm/util.c b/mm/util.c
index 8f18683825bc..0efd83097ecf 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,3 +1,4 @@
+#include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/module.h>
@@ -136,3 +137,12 @@ char *strndup_user(const char __user *s, long n)
 	return p;
 }
 EXPORT_SYMBOL(strndup_user);
+
+#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+	mm->mmap_base = TASK_UNMAPPED_BASE;
+	mm->get_unmapped_area = arch_get_unmapped_area;
+	mm->unmap_area = arch_unmap_area;
+}
+#endif
-- 
cgit v1.2.3


From 7babe8db99d305340cf4828ce1f5a1481d5622ef Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Fri, 25 Jul 2008 19:45:11 -0700
Subject: Full conversion to early_initcall() interface, remove old interface

A previous patch added the early_initcall(), to allow a cleaner hooking of
pre-SMP initcalls.  Now we remove the older interface, converting all
existing users to the new one.

[akpm@linux-foundation.org: cleanups]
[akpm@linux-foundation.org: build fix]
[kosaki.motohiro@jp.fujitsu.com: warning fix]
[kosaki.motohiro@jp.fujitsu.com: warning fix]
Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  9 ---------
 include/linux/smp.h   |  5 -----
 init/main.c           | 23 +----------------------
 kernel/sched.c        |  5 ++++-
 kernel/smp.c          |  4 +++-
 kernel/softirq.c      |  3 ++-
 kernel/softlockup.c   | 25 ++++++++++++++++++++++---
 7 files changed, 32 insertions(+), 42 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3260a5c42b91..adb8077dc463 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -292,7 +292,6 @@ extern void sched_show_task(struct task_struct *p);
 
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 extern void softlockup_tick(void);
-extern void spawn_softlockup_task(void);
 extern void touch_softlockup_watchdog(void);
 extern void touch_all_softlockup_watchdogs(void);
 extern unsigned int  softlockup_panic;
@@ -2222,14 +2221,6 @@ static inline void inc_syscw(struct task_struct *tsk)
 }
 #endif
 
-#ifdef CONFIG_SMP
-void migration_init(void);
-#else
-static inline void migration_init(void)
-{
-}
-#endif
-
 #ifndef TASK_SIZE_OF
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 48262f86c969..66484d4a8459 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -74,15 +74,10 @@ void __smp_call_function_single(int cpuid, struct call_single_data *data);
 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
 void generic_smp_call_function_single_interrupt(void);
 void generic_smp_call_function_interrupt(void);
-void init_call_single_data(void);
 void ipi_call_lock(void);
 void ipi_call_unlock(void);
 void ipi_call_lock_irq(void);
 void ipi_call_unlock_irq(void);
-#else
-static inline void init_call_single_data(void)
-{
-}
 #endif
 
 /*
diff --git a/init/main.c b/init/main.c
index b6fec08dbbef..20fdc9884b77 100644
--- a/init/main.c
+++ b/init/main.c
@@ -774,16 +774,7 @@ static void __init do_basic_setup(void)
 	do_initcalls();
 }
 
-static int __initdata nosoftlockup;
-
-static int __init nosoftlockup_setup(char *str)
-{
-	nosoftlockup = 1;
-	return 1;
-}
-__setup("nosoftlockup", nosoftlockup_setup);
-
-static void __init __do_pre_smp_initcalls(void)
+static void __init do_pre_smp_initcalls(void)
 {
 	initcall_t *call;
 
@@ -791,17 +782,6 @@ static void __init __do_pre_smp_initcalls(void)
 		do_one_initcall(*call);
 }
 
-static void __init do_pre_smp_initcalls(void)
-{
-	extern int spawn_ksoftirqd(void);
-
-	init_call_single_data();
-	migration_init();
-	spawn_ksoftirqd();
-	if (!nosoftlockup)
-		spawn_softlockup_task();
-}
-
 static void run_init_process(char *init_filename)
 {
 	argv_init[0] = init_filename;
@@ -873,7 +853,6 @@ static int __init kernel_init(void * unused)
 
 	smp_prepare_cpus(setup_max_cpus);
 
-	__do_pre_smp_initcalls();
 	do_pre_smp_initcalls();
 
 	smp_init();
diff --git a/kernel/sched.c b/kernel/sched.c
index 0047bd9b96aa..fde1a1026359 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6389,7 +6389,7 @@ static struct notifier_block __cpuinitdata migration_notifier = {
 	.priority = 10
 };
 
-void __init migration_init(void)
+static int __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
@@ -6399,7 +6399,10 @@ void __init migration_init(void)
 	BUG_ON(err == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
+
+	return err;
 }
+early_initcall(migration_init);
 #endif
 
 #ifdef CONFIG_SMP
diff --git a/kernel/smp.c b/kernel/smp.c
index 462c785ca1ee..96fc7c0edc59 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -33,7 +33,7 @@ struct call_single_queue {
 	spinlock_t lock;
 };
 
-void __cpuinit init_call_single_data(void)
+static int __cpuinit init_call_single_data(void)
 {
 	int i;
 
@@ -43,7 +43,9 @@ void __cpuinit init_call_single_data(void)
 		spin_lock_init(&q->lock);
 		INIT_LIST_HEAD(&q->list);
 	}
+	return 0;
 }
+early_initcall(init_call_single_data);
 
 static void csd_flag_wait(struct call_single_data *data)
 {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f6b03d56c2bf..c506f266a6b9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -630,7 +630,7 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
 	.notifier_call = cpu_callback
 };
 
-__init int spawn_ksoftirqd(void)
+static __init int spawn_ksoftirqd(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
@@ -640,6 +640,7 @@ __init int spawn_ksoftirqd(void)
 	register_cpu_notifier(&cpu_nfb);
 	return 0;
 }
+early_initcall(spawn_ksoftirqd);
 
 #ifdef CONFIG_SMP
 /*
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 7bd8d1aadd5d..b75b492fbfcf 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -338,14 +338,33 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
 	.notifier_call = cpu_callback
 };
 
-__init void spawn_softlockup_task(void)
+static int __initdata nosoftlockup;
+
+static int __init nosoftlockup_setup(char *str)
+{
+	nosoftlockup = 1;
+	return 1;
+}
+__setup("nosoftlockup", nosoftlockup_setup);
+
+static int __init spawn_softlockup_task(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
-	int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	int err;
 
-	BUG_ON(err == NOTIFY_BAD);
+	if (nosoftlockup)
+		return 0;
+
+	err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	if (err == NOTIFY_BAD) {
+		BUG();
+		return 1;
+	}
 	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
 	register_cpu_notifier(&cpu_nfb);
 
 	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+
+	return 0;
 }
+early_initcall(spawn_softlockup_task);
-- 
cgit v1.2.3


From 2b2a1ff64afbadac842bbc58c5166962cf4f7664 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:54 -0700
Subject: tracehook: death

This moves the ptrace logic in task death (exit_notify) into tracehook.h
inlines.  Some code is rearranged slightly to make things nicer.  There is
no change, only cleanup.

There is one hook called with the tasklist_lock write-locked, as ptrace
needs.  There is also a new hook called after exit_state changes and
without locks.  This is a better place for tracing work to be in the
future, since it doesn't delay the whole system with locking.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h     |  2 +-
 include/linux/tracehook.h | 52 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/exit.c             | 26 ++++++++----------------
 kernel/signal.c           | 10 ++++++---
 4 files changed, 69 insertions(+), 21 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index adb8077dc463..a95d84d0da95 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1796,7 +1796,7 @@ extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_
 extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
-extern void do_notify_parent(struct task_struct *, int);
+extern int do_notify_parent(struct task_struct *, int);
 extern void force_sig(int, struct task_struct *);
 extern void force_sig_specific(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 6dc428dd2f38..4c50e1b57349 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -471,4 +471,56 @@ static inline int tracehook_notify_jctl(int notify, int why)
 	return notify || (current->ptrace & PT_PTRACED);
 }
 
+/**
+ * tracehook_notify_death - task is dead, ready to notify parent
+ * @task:		@current task now exiting
+ * @death_cookie:	value to pass to tracehook_report_death()
+ * @group_dead:		nonzero if this was the last thread in the group to die
+ *
+ * Return the signal number to send our parent with do_notify_parent(), or
+ * zero to send no signal and leave a zombie, or -1 to self-reap right now.
+ *
+ * Called with write_lock_irq(&tasklist_lock) held.
+ */
+static inline int tracehook_notify_death(struct task_struct *task,
+					 void **death_cookie, int group_dead)
+{
+	if (task->exit_signal == -1)
+		return task->ptrace ? SIGCHLD : -1;
+
+	/*
+	 * If something other than our normal parent is ptracing us, then
+	 * send it a SIGCHLD instead of honoring exit_signal.  exit_signal
+	 * only has special meaning to our real parent.
+	 */
+	if (thread_group_empty(task) && !ptrace_reparented(task))
+		return task->exit_signal;
+
+	return task->ptrace ? SIGCHLD : 0;
+}
+
+/**
+ * tracehook_report_death - task is dead and ready to be reaped
+ * @task:		@current task now exiting
+ * @signal:		signal number sent to parent, or 0 or -1
+ * @death_cookie:	value passed back from tracehook_notify_death()
+ * @group_dead:		nonzero if this was the last thread in the group to die
+ *
+ * Thread has just become a zombie or is about to self-reap.  If positive,
+ * @signal is the signal number just sent to the parent (usually %SIGCHLD).
+ * If @signal is -1, this thread will self-reap.  If @signal is 0, this is
+ * a delayed_group_leader() zombie.  The @death_cookie was passed back by
+ * tracehook_notify_death().
+ *
+ * If normal reaping is not inhibited, @task->exit_state might be changing
+ * in parallel.
+ *
+ * Called without locks.
+ */
+static inline void tracehook_report_death(struct task_struct *task,
+					  int signal, void *death_cookie,
+					  int group_dead)
+{
+}
+
 #endif	/* <linux/tracehook.h> */
diff --git a/kernel/exit.c b/kernel/exit.c
index da28745f7c38..6cdf60712bd2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -885,7 +885,8 @@ static void forget_original_parent(struct task_struct *father)
  */
 static void exit_notify(struct task_struct *tsk, int group_dead)
 {
-	int state;
+	int signal;
+	void *cookie;
 
 	/*
 	 * This does two things:
@@ -922,22 +923,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	    !capable(CAP_KILL))
 		tsk->exit_signal = SIGCHLD;
 
-	/* If something other than our normal parent is ptracing us, then
-	 * send it a SIGCHLD instead of honoring exit_signal.  exit_signal
-	 * only has special meaning to our real parent.
-	 */
-	if (!task_detached(tsk) && thread_group_empty(tsk)) {
-		int signal = ptrace_reparented(tsk) ?
-				SIGCHLD : tsk->exit_signal;
-		do_notify_parent(tsk, signal);
-	} else if (tsk->ptrace) {
-		do_notify_parent(tsk, SIGCHLD);
-	}
+	signal = tracehook_notify_death(tsk, &cookie, group_dead);
+	if (signal > 0)
+		signal = do_notify_parent(tsk, signal);
 
-	state = EXIT_ZOMBIE;
-	if (task_detached(tsk) && likely(!tsk->ptrace))
-		state = EXIT_DEAD;
-	tsk->exit_state = state;
+	tsk->exit_state = signal < 0 ? EXIT_DEAD : EXIT_ZOMBIE;
 
 	/* mt-exec, de_thread() is waiting for us */
 	if (thread_group_leader(tsk) &&
@@ -947,8 +937,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 
 	write_unlock_irq(&tasklist_lock);
 
+	tracehook_report_death(tsk, signal, cookie, group_dead);
+
 	/* If the process is dead, release it - nobody will wait for it */
-	if (state == EXIT_DEAD)
+	if (signal < 0)
 		release_task(tsk);
 }
 
diff --git a/kernel/signal.c b/kernel/signal.c
index e9e699f4b1bd..0e862d3130ff 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1326,9 +1326,11 @@ static inline void __wake_up_parent(struct task_struct *p,
 /*
  * Let a parent know about the death of a child.
  * For a stopped/continued status change, use do_notify_parent_cldstop instead.
+ *
+ * Returns -1 if our parent ignored us and so we've switched to
+ * self-reaping, or else @sig.
  */
-
-void do_notify_parent(struct task_struct *tsk, int sig)
+int do_notify_parent(struct task_struct *tsk, int sig)
 {
 	struct siginfo info;
 	unsigned long flags;
@@ -1399,12 +1401,14 @@ void do_notify_parent(struct task_struct *tsk, int sig)
 		 */
 		tsk->exit_signal = -1;
 		if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
-			sig = 0;
+			sig = -1;
 	}
 	if (valid_signal(sig) && sig > 0)
 		__group_send_sig_info(sig, &info, tsk->parent);
 	__wake_up_parent(tsk, tsk->parent);
 	spin_unlock_irqrestore(&psig->siglock, flags);
+
+	return sig;
 }
 
 static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
-- 
cgit v1.2.3


From 85ba2d862e521375a8ee01526c5c46b1f24bb4af Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:58 -0700
Subject: tracehook: wait_task_inactive

This extends wait_task_inactive() with a new argument so it can be used in
a "soft" mode where it will check for the task changing state unexpectedly
and back off.  There is no change to existing callers.  This lays the
groundwork to allow robust, noninvasive tracing that can try to sample a
blocked thread but back off safely if it wakes up.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/perfmon.c |  4 ++--
 include/linux/sched.h      |  8 ++++++--
 kernel/kthread.c           |  2 +-
 kernel/ptrace.c            |  2 +-
 kernel/sched.c             | 29 +++++++++++++++++++++++++++--
 5 files changed, 37 insertions(+), 8 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 19d4493c6193..fc8f3509df27 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2626,7 +2626,7 @@ pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task)
 	/*
 	 * make sure the task is off any CPU
 	 */
-	wait_task_inactive(task);
+	wait_task_inactive(task, 0);
 
 	/* more to come... */
 
@@ -4774,7 +4774,7 @@ recheck:
 
 		UNPROTECT_CTX(ctx, flags);
 
-		wait_task_inactive(task);
+		wait_task_inactive(task, 0);
 
 		PROTECT_CTX(ctx, flags);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a95d84d0da95..f59318a0099b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1882,9 +1882,13 @@ extern void set_task_comm(struct task_struct *tsk, char *from);
 extern char *get_task_comm(char *to, struct task_struct *tsk);
 
 #ifdef CONFIG_SMP
-extern void wait_task_inactive(struct task_struct * p);
+extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
 #else
-#define wait_task_inactive(p)	do { } while (0)
+static inline unsigned long wait_task_inactive(struct task_struct *p,
+					       long match_state)
+{
+	return 1;
+}
 #endif
 
 #define next_task(p)	list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 6111c27491b1..96cff2f8710b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -176,7 +176,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
 		return;
 	}
 	/* Must have done schedule() in kthread() before we set_task_cpu */
-	wait_task_inactive(k);
+	wait_task_inactive(k, 0);
 	set_task_cpu(k, cpu);
 	k->cpus_allowed = cpumask_of_cpu(cpu);
 	k->rt.nr_cpus_allowed = 1;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8392a9da6450..082b3fcb32a0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -107,7 +107,7 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 	read_unlock(&tasklist_lock);
 
 	if (!ret && !kill)
-		wait_task_inactive(child);
+		ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
 
 	/* All systems go.. */
 	return ret;
diff --git a/kernel/sched.c b/kernel/sched.c
index fde1a1026359..0236958addcb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1867,16 +1867,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change.  If it changes, i.e. @p might have woken up,
+ * then return zero.  When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count).  If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
  * The caller must ensure that the task *will* unschedule sometime soon,
  * else this function might spin for a *long* time. This function can't
  * be called with interrupts off, or it may introduce deadlock with
  * smp_call_function() if an IPI is sent by the same process we are
  * waiting to become inactive.
  */
-void wait_task_inactive(struct task_struct *p)
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 {
 	unsigned long flags;
 	int running, on_rq;
+	unsigned long ncsw;
 	struct rq *rq;
 
 	for (;;) {
@@ -1899,8 +1907,11 @@ void wait_task_inactive(struct task_struct *p)
 		 * return false if the runqueue has changed and p
 		 * is actually now running somewhere else!
 		 */
-		while (task_running(rq, p))
+		while (task_running(rq, p)) {
+			if (match_state && unlikely(p->state != match_state))
+				return 0;
 			cpu_relax();
+		}
 
 		/*
 		 * Ok, time to look more closely! We need the rq
@@ -1910,8 +1921,20 @@ void wait_task_inactive(struct task_struct *p)
 		rq = task_rq_lock(p, &flags);
 		running = task_running(rq, p);
 		on_rq = p->se.on_rq;
+		ncsw = 0;
+		if (!match_state || p->state == match_state) {
+			ncsw = p->nivcsw + p->nvcsw;
+			if (unlikely(!ncsw))
+				ncsw = 1;
+		}
 		task_rq_unlock(rq, &flags);
 
+		/*
+		 * If it changed from the expected state, bail out now.
+		 */
+		if (unlikely(!ncsw))
+			break;
+
 		/*
 		 * Was it really running after all now that we
 		 * checked with the proper locks actually held?
@@ -1944,6 +1967,8 @@ void wait_task_inactive(struct task_struct *p)
 		 */
 		break;
 	}
+
+	return ncsw;
 }
 
 /***
-- 
cgit v1.2.3


From 5995477ab7f3522c497c9c4a1c55373e9d655574 Mon Sep 17 00:00:00 2001
From: Andrea Righi <righi.andrea@gmail.com>
Date: Sun, 27 Jul 2008 17:29:15 +0200
Subject: task IO accounting: improve code readability

Put all i/o statistics in struct proc_io_accounting and use inline functions to
initialize and increment statistics, removing a lot of single variable
assignments.

This also reduces the kernel size as following (with CONFIG_TASK_XACCT=y and
CONFIG_TASK_IO_ACCOUNTING=y).

    text    data     bss     dec     hex filename
   11651       0       0   11651    2d83 kernel/exit.o.before
   11619       0       0   11619    2d63 kernel/exit.o.after
   10886     132     136   11154    2b92 kernel/fork.o.before
   10758     132     136   11026    2b12 kernel/fork.o.after

 3082029  807968 4818600 8708597  84e1f5 vmlinux.o.before
 3081869  807968 4818600 8708437  84e155 vmlinux.o.after

Signed-off-by: Andrea Righi <righi.andrea@gmail.com>
Acked-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c                         | 57 ++++++++++------------------------
 include/linux/sched.h                  | 19 ++++--------
 include/linux/task_io_accounting.h     | 27 ++++++++++++++--
 include/linux/task_io_accounting_ops.h | 56 +++++++++++++++++++++++++++------
 kernel/exit.c                          | 30 ++----------------
 kernel/fork.c                          | 15 ++-------
 kernel/tsacct.c                        | 14 ++++-----
 7 files changed, 104 insertions(+), 114 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index e74308bdabd3..3d94906c7aa8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -53,6 +53,7 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/init.h>
 #include <linux/capability.h>
 #include <linux/file.h>
@@ -2402,44 +2403,17 @@ static int proc_base_fill_cache(struct file *filp, void *dirent,
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
 {
-	u64 rchar, wchar, syscr, syscw;
-	struct task_io_accounting ioac;
-
-	rchar = task->rchar;
-	wchar = task->wchar;
-	syscr = task->syscr;
-	syscw = task->syscw;
-	memcpy(&ioac, &task->ioac, sizeof(ioac));
-
-	if (whole) {
-		unsigned long flags;
-
-		if (lock_task_sighand(task, &flags)) {
-			struct signal_struct *sig = task->signal;
-			struct task_struct *t = task;
-
-			rchar += sig->rchar;
-			wchar += sig->wchar;
-			syscr += sig->syscr;
-			syscw += sig->syscw;
-
-			ioac.read_bytes += sig->ioac.read_bytes;
-			ioac.write_bytes += sig->ioac.write_bytes;
-			ioac.cancelled_write_bytes +=
-					sig->ioac.cancelled_write_bytes;
-			while_each_thread(task, t) {
-				rchar += t->rchar;
-				wchar += t->wchar;
-				syscr += t->syscr;
-				syscw += t->syscw;
-
-				ioac.read_bytes += t->ioac.read_bytes;
-				ioac.write_bytes += t->ioac.write_bytes;
-				ioac.cancelled_write_bytes +=
-					t->ioac.cancelled_write_bytes;
-			}
-			unlock_task_sighand(task, &flags);
-		}
+	struct proc_io_accounting acct = task->ioac;
+	unsigned long flags;
+
+	if (whole && lock_task_sighand(task, &flags)) {
+		struct task_struct *t = task;
+
+		task_io_accounting_add(&acct, &task->signal->ioac);
+		while_each_thread(task, t)
+			task_io_accounting_add(&acct, &t->ioac);
+
+		unlock_task_sighand(task, &flags);
 	}
 	return sprintf(buffer,
 			"rchar: %llu\n"
@@ -2449,9 +2423,10 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
 			"read_bytes: %llu\n"
 			"write_bytes: %llu\n"
 			"cancelled_write_bytes: %llu\n",
-			rchar, wchar, syscr, syscw,
-			ioac.read_bytes, ioac.write_bytes,
-			ioac.cancelled_write_bytes);
+			acct.chr.rchar, acct.chr.wchar,
+			acct.chr.syscr, acct.chr.syscw,
+			acct.blk.read_bytes, acct.blk.write_bytes,
+			acct.blk.cancelled_write_bytes);
 }
 
 static int proc_tid_io_accounting(struct task_struct *task, char *buffer)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f59318a0099b..034c1ca6b332 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -505,10 +505,7 @@ struct signal_struct {
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 	unsigned long inblock, oublock, cinblock, coublock;
-#ifdef CONFIG_TASK_XACCT
-	u64 rchar, wchar, syscr, syscw;
-#endif
-	struct task_io_accounting ioac;
+	struct proc_io_accounting ioac;
 
 	/*
 	 * Cumulative ns of scheduled CPU time for dead threads in the
@@ -1256,11 +1253,7 @@ struct task_struct {
 
 	unsigned long ptrace_message;
 	siginfo_t *last_siginfo; /* For ptrace use.  */
-#ifdef CONFIG_TASK_XACCT
-/* i/o counters(bytes read/written, #syscalls */
-	u64 rchar, wchar, syscr, syscw;
-#endif
-	struct task_io_accounting ioac;
+	struct proc_io_accounting ioac;
 #if defined(CONFIG_TASK_XACCT)
 	u64 acct_rss_mem1;	/* accumulated rss usage */
 	u64 acct_vm_mem1;	/* accumulated virtual memory usage */
@@ -2190,22 +2183,22 @@ extern long sched_group_rt_period(struct task_group *tg);
 #ifdef CONFIG_TASK_XACCT
 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
 {
-	tsk->rchar += amt;
+	tsk->ioac.chr.rchar += amt;
 }
 
 static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
 {
-	tsk->wchar += amt;
+	tsk->ioac.chr.wchar += amt;
 }
 
 static inline void inc_syscr(struct task_struct *tsk)
 {
-	tsk->syscr++;
+	tsk->ioac.chr.syscr++;
 }
 
 static inline void inc_syscw(struct task_struct *tsk)
 {
-	tsk->syscw++;
+	tsk->ioac.chr.syscw++;
 }
 #else
 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
diff --git a/include/linux/task_io_accounting.h b/include/linux/task_io_accounting.h
index 44d00e9cceea..165390f8b936 100644
--- a/include/linux/task_io_accounting.h
+++ b/include/linux/task_io_accounting.h
@@ -1,5 +1,5 @@
 /*
- * task_io_accounting: a structure which is used for recording a single task's
+ * proc_io_accounting: a structure which is used for recording a single task's
  * IO statistics.
  *
  * Don't include this header file directly - it is designed to be dragged in via
@@ -8,6 +8,22 @@
  * Blame akpm@osdl.org for all this.
  */
 
+#ifdef CONFIG_TASK_XACCT
+struct task_chr_io_accounting {
+	/* bytes read */
+	u64 rchar;
+	/*  bytes written */
+	u64 wchar;
+	/* # of read syscalls */
+	u64 syscr;
+	/* # of write syscalls */
+	u64 syscw;
+};
+#else /* CONFIG_TASK_XACCT */
+struct task_chr_io_accounting {
+};
+#endif /* CONFIG_TASK_XACCT */
+
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 struct task_io_accounting {
 	/*
@@ -31,7 +47,12 @@ struct task_io_accounting {
 	 */
 	u64 cancelled_write_bytes;
 };
-#else
+#else /* CONFIG_TASK_IO_ACCOUNTING */
 struct task_io_accounting {
 };
-#endif
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
+
+struct proc_io_accounting {
+	struct task_chr_io_accounting chr;
+	struct task_io_accounting blk;
+};
diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h
index ff46c6fad79d..e6f958ebe97f 100644
--- a/include/linux/task_io_accounting_ops.h
+++ b/include/linux/task_io_accounting_ops.h
@@ -9,7 +9,7 @@
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 static inline void task_io_account_read(size_t bytes)
 {
-	current->ioac.read_bytes += bytes;
+	current->ioac.blk.read_bytes += bytes;
 }
 
 /*
@@ -18,12 +18,12 @@ static inline void task_io_account_read(size_t bytes)
  */
 static inline unsigned long task_io_get_inblock(const struct task_struct *p)
 {
-	return p->ioac.read_bytes >> 9;
+	return p->ioac.blk.read_bytes >> 9;
 }
 
 static inline void task_io_account_write(size_t bytes)
 {
-	current->ioac.write_bytes += bytes;
+	current->ioac.blk.write_bytes += bytes;
 }
 
 /*
@@ -32,17 +32,25 @@ static inline void task_io_account_write(size_t bytes)
  */
 static inline unsigned long task_io_get_oublock(const struct task_struct *p)
 {
-	return p->ioac.write_bytes >> 9;
+	return p->ioac.blk.write_bytes >> 9;
 }
 
 static inline void task_io_account_cancelled_write(size_t bytes)
 {
-	current->ioac.cancelled_write_bytes += bytes;
+	current->ioac.blk.cancelled_write_bytes += bytes;
 }
 
-static inline void task_io_accounting_init(struct task_struct *tsk)
+static inline void task_io_accounting_init(struct proc_io_accounting *ioac)
 {
-	memset(&tsk->ioac, 0, sizeof(tsk->ioac));
+	memset(ioac, 0, sizeof(*ioac));
+}
+
+static inline void task_blk_io_accounting_add(struct proc_io_accounting *dst,
+						struct proc_io_accounting *src)
+{
+	dst->blk.read_bytes += src->blk.read_bytes;
+	dst->blk.write_bytes += src->blk.write_bytes;
+	dst->blk.cancelled_write_bytes += src->blk.cancelled_write_bytes;
 }
 
 #else
@@ -69,9 +77,37 @@ static inline void task_io_account_cancelled_write(size_t bytes)
 {
 }
 
-static inline void task_io_accounting_init(struct task_struct *tsk)
+static inline void task_io_accounting_init(struct proc_io_accounting *ioac)
+{
+}
+
+static inline void task_blk_io_accounting_add(struct proc_io_accounting *dst,
+						struct proc_io_accounting *src)
 {
 }
 
-#endif		/* CONFIG_TASK_IO_ACCOUNTING */
-#endif		/* __TASK_IO_ACCOUNTING_OPS_INCLUDED */
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
+
+#ifdef CONFIG_TASK_XACCT
+static inline void task_chr_io_accounting_add(struct proc_io_accounting *dst,
+						struct proc_io_accounting *src)
+{
+	dst->chr.rchar += src->chr.rchar;
+	dst->chr.wchar += src->chr.wchar;
+	dst->chr.syscr += src->chr.syscr;
+	dst->chr.syscw += src->chr.syscw;
+}
+#else
+static inline void task_chr_io_accounting_add(struct proc_io_accounting *dst,
+						struct proc_io_accounting *src)
+{
+}
+#endif /* CONFIG_TASK_XACCT */
+
+static inline void task_io_accounting_add(struct proc_io_accounting *dst,
+						struct proc_io_accounting *src)
+{
+	task_chr_io_accounting_add(dst, src);
+	task_blk_io_accounting_add(dst, src);
+}
+#endif /* __TASK_IO_ACCOUNTING_OPS_INCLUDED */
diff --git a/kernel/exit.c b/kernel/exit.c
index 0caf590548a0..eb4d6470d1d0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -121,18 +121,7 @@ static void __exit_signal(struct task_struct *tsk)
 		sig->nivcsw += tsk->nivcsw;
 		sig->inblock += task_io_get_inblock(tsk);
 		sig->oublock += task_io_get_oublock(tsk);
-#ifdef CONFIG_TASK_XACCT
-		sig->rchar += tsk->rchar;
-		sig->wchar += tsk->wchar;
-		sig->syscr += tsk->syscr;
-		sig->syscw += tsk->syscw;
-#endif /* CONFIG_TASK_XACCT */
-#ifdef CONFIG_TASK_IO_ACCOUNTING
-		sig->ioac.read_bytes += tsk->ioac.read_bytes;
-		sig->ioac.write_bytes += tsk->ioac.write_bytes;
-		sig->ioac.cancelled_write_bytes +=
-					tsk->ioac.cancelled_write_bytes;
-#endif /* CONFIG_TASK_IO_ACCOUNTING */
+		task_io_accounting_add(&sig->ioac, &tsk->ioac);
 		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 		sig = NULL; /* Marker for below. */
 	}
@@ -1363,21 +1352,8 @@ static int wait_task_zombie(struct task_struct *p, int options,
 		psig->coublock +=
 			task_io_get_oublock(p) +
 			sig->oublock + sig->coublock;
-#ifdef CONFIG_TASK_XACCT
-		psig->rchar += p->rchar + sig->rchar;
-		psig->wchar += p->wchar + sig->wchar;
-		psig->syscr += p->syscr + sig->syscr;
-		psig->syscw += p->syscw + sig->syscw;
-#endif /* CONFIG_TASK_XACCT */
-#ifdef CONFIG_TASK_IO_ACCOUNTING
-		psig->ioac.read_bytes +=
-			p->ioac.read_bytes + sig->ioac.read_bytes;
-		psig->ioac.write_bytes +=
-			p->ioac.write_bytes + sig->ioac.write_bytes;
-		psig->ioac.cancelled_write_bytes +=
-				p->ioac.cancelled_write_bytes +
-				sig->ioac.cancelled_write_bytes;
-#endif /* CONFIG_TASK_IO_ACCOUNTING */
+		task_io_accounting_add(&psig->ioac, &p->ioac);
+		task_io_accounting_add(&psig->ioac, &sig->ioac);
 		spin_unlock_irq(&p->parent->sighand->siglock);
 	}
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 5e050c1317c4..8214ba7c8bb1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -806,12 +806,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
-#ifdef CONFIG_TASK_XACCT
-	sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0;
-#endif
-#ifdef CONFIG_TASK_IO_ACCOUNTING
-	memset(&sig->ioac, 0, sizeof(sig->ioac));
-#endif
+	task_io_accounting_init(&sig->ioac);
 	sig->sum_sched_runtime = 0;
 	INIT_LIST_HEAD(&sig->cpu_timers[0]);
 	INIT_LIST_HEAD(&sig->cpu_timers[1]);
@@ -994,13 +989,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->last_switch_timestamp = 0;
 #endif
 
-#ifdef CONFIG_TASK_XACCT
-	p->rchar = 0;		/* I/O counter: bytes read */
-	p->wchar = 0;		/* I/O counter: bytes written */
-	p->syscr = 0;		/* I/O counter: read syscalls */
-	p->syscw = 0;		/* I/O counter: write syscalls */
-#endif
-	task_io_accounting_init(p);
+	task_io_accounting_init(&p->ioac);
 	acct_clear_integrals(p);
 
 	p->it_virt_expires = cputime_zero;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 3da47ccdc5e5..f9cd2561689c 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -94,14 +94,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 		stats->hiwater_vm    = mm->hiwater_vm * PAGE_SIZE / KB;
 		mmput(mm);
 	}
-	stats->read_char	= p->rchar;
-	stats->write_char	= p->wchar;
-	stats->read_syscalls	= p->syscr;
-	stats->write_syscalls	= p->syscw;
+	stats->read_char	= p->ioac.chr.rchar;
+	stats->write_char	= p->ioac.chr.wchar;
+	stats->read_syscalls	= p->ioac.chr.syscr;
+	stats->write_syscalls	= p->ioac.chr.syscw;
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-	stats->read_bytes	= p->ioac.read_bytes;
-	stats->write_bytes	= p->ioac.write_bytes;
-	stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes;
+	stats->read_bytes	= p->ioac.blk.read_bytes;
+	stats->write_bytes	= p->ioac.blk.write_bytes;
+	stats->cancelled_write_bytes = p->ioac.blk.cancelled_write_bytes;
 #else
 	stats->read_bytes	= 0;
 	stats->write_bytes	= 0;
-- 
cgit v1.2.3


From 940389b8afad6495211614c13eb91ef7001773ec Mon Sep 17 00:00:00 2001
From: Andrea Righi <righi.andrea@gmail.com>
Date: Mon, 28 Jul 2008 00:48:12 +0200
Subject: task IO accounting: move all IO statistics in struct
 task_io_accounting

Simplify the code of include/linux/task_io_accounting.h.

It is also more reasonable to have all the task i/o-related statistics in a
single struct (task_io_accounting).

Signed-off-by: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c                         | 10 +++----
 include/linux/sched.h                  | 12 ++++-----
 include/linux/task_io_accounting.h     | 17 ++----------
 include/linux/task_io_accounting_ops.h | 48 +++++++++++++++++-----------------
 kernel/tsacct.c                        | 14 +++++-----
 5 files changed, 44 insertions(+), 57 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3d94906c7aa8..01ed610f9b87 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2403,7 +2403,7 @@ static int proc_base_fill_cache(struct file *filp, void *dirent,
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
 {
-	struct proc_io_accounting acct = task->ioac;
+	struct task_io_accounting acct = task->ioac;
 	unsigned long flags;
 
 	if (whole && lock_task_sighand(task, &flags)) {
@@ -2423,10 +2423,10 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
 			"read_bytes: %llu\n"
 			"write_bytes: %llu\n"
 			"cancelled_write_bytes: %llu\n",
-			acct.chr.rchar, acct.chr.wchar,
-			acct.chr.syscr, acct.chr.syscw,
-			acct.blk.read_bytes, acct.blk.write_bytes,
-			acct.blk.cancelled_write_bytes);
+			acct.rchar, acct.wchar,
+			acct.syscr, acct.syscw,
+			acct.read_bytes, acct.write_bytes,
+			acct.cancelled_write_bytes);
 }
 
 static int proc_tid_io_accounting(struct task_struct *task, char *buffer)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 034c1ca6b332..5270d449ff9d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -505,7 +505,7 @@ struct signal_struct {
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 	unsigned long inblock, oublock, cinblock, coublock;
-	struct proc_io_accounting ioac;
+	struct task_io_accounting ioac;
 
 	/*
 	 * Cumulative ns of scheduled CPU time for dead threads in the
@@ -1253,7 +1253,7 @@ struct task_struct {
 
 	unsigned long ptrace_message;
 	siginfo_t *last_siginfo; /* For ptrace use.  */
-	struct proc_io_accounting ioac;
+	struct task_io_accounting ioac;
 #if defined(CONFIG_TASK_XACCT)
 	u64 acct_rss_mem1;	/* accumulated rss usage */
 	u64 acct_vm_mem1;	/* accumulated virtual memory usage */
@@ -2183,22 +2183,22 @@ extern long sched_group_rt_period(struct task_group *tg);
 #ifdef CONFIG_TASK_XACCT
 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
 {
-	tsk->ioac.chr.rchar += amt;
+	tsk->ioac.rchar += amt;
 }
 
 static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
 {
-	tsk->ioac.chr.wchar += amt;
+	tsk->ioac.wchar += amt;
 }
 
 static inline void inc_syscr(struct task_struct *tsk)
 {
-	tsk->ioac.chr.syscr++;
+	tsk->ioac.syscr++;
 }
 
 static inline void inc_syscw(struct task_struct *tsk)
 {
-	tsk->ioac.chr.syscw++;
+	tsk->ioac.syscw++;
 }
 #else
 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
diff --git a/include/linux/task_io_accounting.h b/include/linux/task_io_accounting.h
index 165390f8b936..5e88afc9a2fb 100644
--- a/include/linux/task_io_accounting.h
+++ b/include/linux/task_io_accounting.h
@@ -1,5 +1,5 @@
 /*
- * proc_io_accounting: a structure which is used for recording a single task's
+ * task_io_accounting: a structure which is used for recording a single task's
  * IO statistics.
  *
  * Don't include this header file directly - it is designed to be dragged in via
@@ -8,8 +8,8 @@
  * Blame akpm@osdl.org for all this.
  */
 
+struct task_io_accounting {
 #ifdef CONFIG_TASK_XACCT
-struct task_chr_io_accounting {
 	/* bytes read */
 	u64 rchar;
 	/*  bytes written */
@@ -18,14 +18,9 @@ struct task_chr_io_accounting {
 	u64 syscr;
 	/* # of write syscalls */
 	u64 syscw;
-};
-#else /* CONFIG_TASK_XACCT */
-struct task_chr_io_accounting {
-};
 #endif /* CONFIG_TASK_XACCT */
 
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-struct task_io_accounting {
 	/*
 	 * The number of bytes which this task has caused to be read from
 	 * storage.
@@ -46,13 +41,5 @@ struct task_io_accounting {
 	 * information loss in doing that.
 	 */
 	u64 cancelled_write_bytes;
-};
-#else /* CONFIG_TASK_IO_ACCOUNTING */
-struct task_io_accounting {
-};
 #endif /* CONFIG_TASK_IO_ACCOUNTING */
-
-struct proc_io_accounting {
-	struct task_chr_io_accounting chr;
-	struct task_io_accounting blk;
 };
diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h
index e6f958ebe97f..4d090f9ee608 100644
--- a/include/linux/task_io_accounting_ops.h
+++ b/include/linux/task_io_accounting_ops.h
@@ -9,7 +9,7 @@
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 static inline void task_io_account_read(size_t bytes)
 {
-	current->ioac.blk.read_bytes += bytes;
+	current->ioac.read_bytes += bytes;
 }
 
 /*
@@ -18,12 +18,12 @@ static inline void task_io_account_read(size_t bytes)
  */
 static inline unsigned long task_io_get_inblock(const struct task_struct *p)
 {
-	return p->ioac.blk.read_bytes >> 9;
+	return p->ioac.read_bytes >> 9;
 }
 
 static inline void task_io_account_write(size_t bytes)
 {
-	current->ioac.blk.write_bytes += bytes;
+	current->ioac.write_bytes += bytes;
 }
 
 /*
@@ -32,25 +32,25 @@ static inline void task_io_account_write(size_t bytes)
  */
 static inline unsigned long task_io_get_oublock(const struct task_struct *p)
 {
-	return p->ioac.blk.write_bytes >> 9;
+	return p->ioac.write_bytes >> 9;
 }
 
 static inline void task_io_account_cancelled_write(size_t bytes)
 {
-	current->ioac.blk.cancelled_write_bytes += bytes;
+	current->ioac.cancelled_write_bytes += bytes;
 }
 
-static inline void task_io_accounting_init(struct proc_io_accounting *ioac)
+static inline void task_io_accounting_init(struct task_io_accounting *ioac)
 {
 	memset(ioac, 0, sizeof(*ioac));
 }
 
-static inline void task_blk_io_accounting_add(struct proc_io_accounting *dst,
-						struct proc_io_accounting *src)
+static inline void task_blk_io_accounting_add(struct task_io_accounting *dst,
+						struct task_io_accounting *src)
 {
-	dst->blk.read_bytes += src->blk.read_bytes;
-	dst->blk.write_bytes += src->blk.write_bytes;
-	dst->blk.cancelled_write_bytes += src->blk.cancelled_write_bytes;
+	dst->read_bytes += src->read_bytes;
+	dst->write_bytes += src->write_bytes;
+	dst->cancelled_write_bytes += src->cancelled_write_bytes;
 }
 
 #else
@@ -77,35 +77,35 @@ static inline void task_io_account_cancelled_write(size_t bytes)
 {
 }
 
-static inline void task_io_accounting_init(struct proc_io_accounting *ioac)
+static inline void task_io_accounting_init(struct task_io_accounting *ioac)
 {
 }
 
-static inline void task_blk_io_accounting_add(struct proc_io_accounting *dst,
-						struct proc_io_accounting *src)
+static inline void task_blk_io_accounting_add(struct task_io_accounting *dst,
+						struct task_io_accounting *src)
 {
 }
 
 #endif /* CONFIG_TASK_IO_ACCOUNTING */
 
 #ifdef CONFIG_TASK_XACCT
-static inline void task_chr_io_accounting_add(struct proc_io_accounting *dst,
-						struct proc_io_accounting *src)
+static inline void task_chr_io_accounting_add(struct task_io_accounting *dst,
+						struct task_io_accounting *src)
 {
-	dst->chr.rchar += src->chr.rchar;
-	dst->chr.wchar += src->chr.wchar;
-	dst->chr.syscr += src->chr.syscr;
-	dst->chr.syscw += src->chr.syscw;
+	dst->rchar += src->rchar;
+	dst->wchar += src->wchar;
+	dst->syscr += src->syscr;
+	dst->syscw += src->syscw;
 }
 #else
-static inline void task_chr_io_accounting_add(struct proc_io_accounting *dst,
-						struct proc_io_accounting *src)
+static inline void task_chr_io_accounting_add(struct task_io_accounting *dst,
+						struct task_io_accounting *src)
 {
 }
 #endif /* CONFIG_TASK_XACCT */
 
-static inline void task_io_accounting_add(struct proc_io_accounting *dst,
-						struct proc_io_accounting *src)
+static inline void task_io_accounting_add(struct task_io_accounting *dst,
+						struct task_io_accounting *src)
 {
 	task_chr_io_accounting_add(dst, src);
 	task_blk_io_accounting_add(dst, src);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index f9cd2561689c..8ebcd8532dfb 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -94,14 +94,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 		stats->hiwater_vm    = mm->hiwater_vm * PAGE_SIZE / KB;
 		mmput(mm);
 	}
-	stats->read_char	= p->ioac.chr.rchar;
-	stats->write_char	= p->ioac.chr.wchar;
-	stats->read_syscalls	= p->ioac.chr.syscr;
-	stats->write_syscalls	= p->ioac.chr.syscw;
+	stats->read_char	= p->ioac.rchar;
+	stats->write_char	= p->ioac.wchar;
+	stats->read_syscalls	= p->ioac.syscr;
+	stats->write_syscalls	= p->ioac.syscw;
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-	stats->read_bytes	= p->ioac.blk.read_bytes;
-	stats->write_bytes	= p->ioac.blk.write_bytes;
-	stats->cancelled_write_bytes = p->ioac.blk.cancelled_write_bytes;
+	stats->read_bytes	= p->ioac.read_bytes;
+	stats->write_bytes	= p->ioac.write_bytes;
+	stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes;
 #else
 	stats->read_bytes	= 0;
 	stats->write_bytes	= 0;
-- 
cgit v1.2.3


From e4e4e534faa3c2be4e165ce414f44b76ada7208c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 14 Apr 2008 08:50:02 +0200
Subject: sched clock: revert various sched_clock() changes

Found an interactivity problem on a quad core test-system - simple
CPU loops would occasionally delay the system un an unacceptable way.

After much debugging with Peter Zijlstra it turned out that the problem
is caused by the string of sched_clock() changes - they caused the CPU
clock to jump backwards a bit - which confuses the scheduler arithmetics.

(which is unsigned for performance reasons)

So revert:

 # c300ba2: sched_clock: and multiplier for TSC to gtod drift
 # c0c8773: sched_clock: only update deltas with local reads.
 # af52a90: sched_clock: stop maximum check on NO HZ
 # f7cce27: sched_clock: widen the max and min time

This solves the interactivity problems.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Mike Galbraith <efault@gmx.de>
---
 include/linux/sched.h    |  17 +-------
 kernel/sched_clock.c     | 109 ++++++-----------------------------------------
 kernel/time/tick-sched.c |   2 -
 3 files changed, 14 insertions(+), 114 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5270d449ff9d..ea436bc1a0e2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1572,28 +1572,13 @@ static inline void sched_clock_idle_sleep_event(void)
 static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
 {
 }
-
-#ifdef CONFIG_NO_HZ
-static inline void sched_clock_tick_stop(int cpu)
-{
-}
-
-static inline void sched_clock_tick_start(int cpu)
-{
-}
-#endif
-
-#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+#else
 extern void sched_clock_init(void);
 extern u64 sched_clock_cpu(int cpu);
 extern void sched_clock_tick(void);
 extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
-#ifdef CONFIG_NO_HZ
-extern void sched_clock_tick_stop(int cpu);
-extern void sched_clock_tick_start(int cpu);
 #endif
-#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 5a2dc7d8fd98..9a7844158ae8 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -44,11 +44,6 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 
-#define MULTI_SHIFT 15
-/* Max is double, Min is 1/2 */
-#define MAX_MULTI (2LL << MULTI_SHIFT)
-#define MIN_MULTI (1LL << (MULTI_SHIFT-1))
-
 struct sched_clock_data {
 	/*
 	 * Raw spinlock - this is a special case: this might be called
@@ -62,10 +57,6 @@ struct sched_clock_data {
 	u64			tick_raw;
 	u64			tick_gtod;
 	u64			clock;
-	s64			multi;
-#ifdef CONFIG_NO_HZ
-	int			check_max;
-#endif
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
@@ -97,53 +88,18 @@ void sched_clock_init(void)
 		scd->tick_raw = 0;
 		scd->tick_gtod = ktime_now;
 		scd->clock = ktime_now;
-		scd->multi = 1 << MULTI_SHIFT;
-#ifdef CONFIG_NO_HZ
-		scd->check_max = 1;
-#endif
 	}
 
 	sched_clock_running = 1;
 }
 
-#ifdef CONFIG_NO_HZ
-/*
- * The dynamic ticks makes the delta jiffies inaccurate. This
- * prevents us from checking the maximum time update.
- * Disable the maximum check during stopped ticks.
- */
-void sched_clock_tick_stop(int cpu)
-{
-	struct sched_clock_data *scd = cpu_sdc(cpu);
-
-	scd->check_max = 0;
-}
-
-void sched_clock_tick_start(int cpu)
-{
-	struct sched_clock_data *scd = cpu_sdc(cpu);
-
-	scd->check_max = 1;
-}
-
-static int check_max(struct sched_clock_data *scd)
-{
-	return scd->check_max;
-}
-#else
-static int check_max(struct sched_clock_data *scd)
-{
-	return 1;
-}
-#endif /* CONFIG_NO_HZ */
-
 /*
  * update the percpu scd from the raw @now value
  *
  *  - filter out backward motion
  *  - use jiffies to generate a min,max window to clip the raw values
  */
-static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time)
+static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
 {
 	unsigned long now_jiffies = jiffies;
 	long delta_jiffies = now_jiffies - scd->tick_jiffies;
@@ -152,31 +108,16 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim
 	s64 delta = now - scd->prev_raw;
 
 	WARN_ON_ONCE(!irqs_disabled());
-
-	/*
-	 * At schedule tick the clock can be just under the gtod. We don't
-	 * want to push it too prematurely.
-	 */
-	min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC);
-	if (min_clock > TICK_NSEC)
-		min_clock -= TICK_NSEC / 2;
+	min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
 
 	if (unlikely(delta < 0)) {
 		clock++;
 		goto out;
 	}
 
-	/*
-	 * The clock must stay within a jiffie of the gtod.
-	 * But since we may be at the start of a jiffy or the end of one
-	 * we add another jiffy buffer.
-	 */
-	max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC;
-
-	delta *= scd->multi;
-	delta >>= MULTI_SHIFT;
+	max_clock = min_clock + TICK_NSEC;
 
-	if (unlikely(clock + delta > max_clock) && check_max(scd)) {
+	if (unlikely(clock + delta > max_clock)) {
 		if (clock < max_clock)
 			clock = max_clock;
 		else
@@ -189,12 +130,9 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim
 	if (unlikely(clock < min_clock))
 		clock = min_clock;
 
-	if (time)
-		*time = clock;
-	else {
-		scd->prev_raw = now;
-		scd->clock = clock;
-	}
+	scd->prev_raw = now;
+	scd->tick_jiffies = now_jiffies;
+	scd->clock = clock;
 }
 
 static void lock_double_clock(struct sched_clock_data *data1,
@@ -238,26 +176,21 @@ u64 sched_clock_cpu(int cpu)
 		now -= scd->tick_gtod;
 
 		__raw_spin_unlock(&my_scd->lock);
-
-		__update_sched_clock(scd, now, &clock);
-
-		__raw_spin_unlock(&scd->lock);
-
 	} else {
 		__raw_spin_lock(&scd->lock);
-		__update_sched_clock(scd, now, NULL);
-		clock = scd->clock;
-		__raw_spin_unlock(&scd->lock);
 	}
 
+	__update_sched_clock(scd, now);
+	clock = scd->clock;
+
+	__raw_spin_unlock(&scd->lock);
+
 	return clock;
 }
 
 void sched_clock_tick(void)
 {
 	struct sched_clock_data *scd = this_scd();
-	unsigned long now_jiffies = jiffies;
-	s64 mult, delta_gtod, delta_raw;
 	u64 now, now_gtod;
 
 	if (unlikely(!sched_clock_running))
@@ -269,29 +202,14 @@ void sched_clock_tick(void)
 	now = sched_clock();
 
 	__raw_spin_lock(&scd->lock);
-	__update_sched_clock(scd, now, NULL);
+	__update_sched_clock(scd, now);
 	/*
 	 * update tick_gtod after __update_sched_clock() because that will
 	 * already observe 1 new jiffy; adding a new tick_gtod to that would
 	 * increase the clock 2 jiffies.
 	 */
-	delta_gtod = now_gtod - scd->tick_gtod;
-	delta_raw = now - scd->tick_raw;
-
-	if ((long)delta_raw > 0) {
-		mult = delta_gtod << MULTI_SHIFT;
-		do_div(mult, delta_raw);
-		scd->multi = mult;
-		if (scd->multi > MAX_MULTI)
-			scd->multi = MAX_MULTI;
-		else if (scd->multi < MIN_MULTI)
-			scd->multi = MIN_MULTI;
-	} else
-		scd->multi = 1 << MULTI_SHIFT;
-
 	scd->tick_raw = now;
 	scd->tick_gtod = now_gtod;
-	scd->tick_jiffies = now_jiffies;
 	__raw_spin_unlock(&scd->lock);
 }
 
@@ -321,7 +239,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 	__raw_spin_lock(&scd->lock);
 	scd->prev_raw = now;
 	scd->clock += delta_ns;
-	scd->multi = 1 << MULTI_SHIFT;
 	__raw_spin_unlock(&scd->lock);
 
 	touch_softlockup_watchdog();
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 825b4c00fe44..f5da526424a9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -289,7 +289,6 @@ void tick_nohz_stop_sched_tick(int inidle)
 			ts->tick_stopped = 1;
 			ts->idle_jiffies = last_jiffies;
 			rcu_enter_nohz();
-			sched_clock_tick_stop(cpu);
 		}
 
 		/*
@@ -392,7 +391,6 @@ void tick_nohz_restart_sched_tick(void)
 	select_nohz_load_balancer(0);
 	now = ktime_get();
 	tick_do_update_jiffies64(now);
-	sched_clock_tick_start(cpu);
 	cpu_clear(cpu, nohz_cpu_mask);
 
 	/*
-- 
cgit v1.2.3


From c1955a3d4762e7a9bf84035eb3c4886a900f0d15 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 11 Aug 2008 08:59:03 +0200
Subject: sched_clock: delay using sched_clock()

Some arch's can't handle sched_clock() being called too early - delay
this until sched_clock_init() has been called.

Reported-by: Bill Gatliff <bgat@billgatliff.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Nishanth Aravamudan <nacc@us.ibm.com>
CC: Russell King - ARM Linux <linux@arm.linux.org.uk>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 14 +++-----------
 kernel/sched_clock.c  | 19 +++++++++++++++++--
 2 files changed, 20 insertions(+), 13 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ea436bc1a0e2..5850bfb968a8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1551,16 +1551,10 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 
 extern unsigned long long sched_clock(void);
 
-#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static inline void sched_clock_init(void)
-{
-}
-
-static inline u64 sched_clock_cpu(int cpu)
-{
-	return sched_clock();
-}
+extern void sched_clock_init(void);
+extern u64 sched_clock_cpu(int cpu);
 
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 static inline void sched_clock_tick(void)
 {
 }
@@ -1573,8 +1567,6 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
 {
 }
 #else
-extern void sched_clock_init(void);
-extern u64 sched_clock_cpu(int cpu);
 extern void sched_clock_tick(void);
 extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 074edc989379..204991a0bfa7 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -42,6 +42,8 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
 }
 
+static __read_mostly int sched_clock_running;
+
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 
 struct sched_clock_data {
@@ -70,8 +72,6 @@ static inline struct sched_clock_data *cpu_sdc(int cpu)
 	return &per_cpu(sched_clock_data, cpu);
 }
 
-static __read_mostly int sched_clock_running;
-
 void sched_clock_init(void)
 {
 	u64 ktime_now = ktime_to_ns(ktime_get());
@@ -248,6 +248,21 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
+#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+
+void sched_clock_init(void)
+{
+	sched_clock_running = 1;
+}
+
+u64 sched_clock_cpu(int cpu)
+{
+	if (unlikely(!sched_clock_running))
+		return 0;
+
+	return sched_clock();
+}
+
 #endif
 
 unsigned long long cpu_clock(int cpu)
-- 
cgit v1.2.3