From c1dcc927dae01dfd4904ee82ce2c00b50eab6dc3 Mon Sep 17 00:00:00 2001
From: Soren Brinkmann <soren.brinkmann@xilinx.com>
Date: Tue, 26 Nov 2013 17:04:50 -0800
Subject: clocksource: cadence_ttc: Fix mutex taken inside interrupt context

When the kernel is compiled with:
CONFIG_HIGH_RES_TIMERS=n
CONFIG_HZ_PERIODIC=y
CONFIG_DEBUG_ATOMIC_SLEEP=y

The following WARN appears:

WARNING: CPU: 1 PID: 0 at linux/kernel/mutex.c:856 mutex_trylock+0x70/0x1fc()
DEBUG_LOCKS_WARN_ON(in_interrupt())
Modules linked in:
CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.12.0-xilinx-dirty #93
[<c0014a78>] (unwind_backtrace+0x0/0x11c) from [<c0011b6c>] (show_stack+0x10/0x14)
[<c0011b6c>] (show_stack+0x10/0x14) from [<c039120c>] (dump_stack+0x7c/0xc0)
[<c039120c>] (dump_stack+0x7c/0xc0) from [<c001fda4>] (warn_slowpath_common+0x60/0x84)
[<c001fda4>] (warn_slowpath_common+0x60/0x84) from [<c001fe48>] (warn_slowpath_fmt+0x2c/0x3c)
[<c001fe48>] (warn_slowpath_fmt+0x2c/0x3c) from [<c0392658>] (mutex_trylock+0x70/0x1fc)
[<c0392658>] (mutex_trylock+0x70/0x1fc) from [<c02dfc08>] (clk_prepare_lock+0xc/0xe4)
[<c02dfc08>] (clk_prepare_lock+0xc/0xe4) from [<c02e099c>] (clk_get_rate+0xc/0x44)
[<c02e099c>] (clk_get_rate+0xc/0x44) from [<c02d0394>] (ttc_set_mode+0x34/0x78)
[<c02d0394>] (ttc_set_mode+0x34/0x78) from [<c005f794>] (clockevents_set_mode+0x28/0x5c)
[<c005f794>] (clockevents_set_mode+0x28/0x5c) from [<c00607fc>] (tick_broadcast_on_off+0x190/0x1c0)
[<c00607fc>] (tick_broadcast_on_off+0x190/0x1c0) from [<c005f168>] (clockevents_notify+0x58/0x1ac)
[<c005f168>] (clockevents_notify+0x58/0x1ac) from [<c02b99dc>] (cpuidle_setup_broadcast_timer+0x20/0x24)
[<c02b99dc>] (cpuidle_setup_broadcast_timer+0x20/0x24) from [<c006cd04>] (generic_smp_call_function_single_interrupt+0xe0/0x130)
[<c006cd04>] (generic_smp_call_function_single_interrupt+0xe0/0x130) from [<c00138c8>] (handle_IPI+0x88/0x118)
[<c00138c8>] (handle_IPI+0x88/0x118) from [<c0008504>] (gic_handle_irq+0x58/0x60)
[<c0008504>] (gic_handle_irq+0x58/0x60) from [<c0012644>] (__irq_svc+0x44/0x78)
Exception stack(0xef099fa0 to 0xef099fe8)
9fa0: 00000001 ef092100 00000000 ef092100 ef098000 00000015 c0399f2c c0579d74
9fc0: 0000406a 413fc090 00000000 00000000 00000000 ef099fe8 c00666ec c000f46c
9fe0: 20000113 ffffffff
[<c0012644>] (__irq_svc+0x44/0x78) from [<c000f46c>] (arch_cpu_idle+0x34/0x3c)
[<c000f46c>] (arch_cpu_idle+0x34/0x3c) from [<c0053980>] (cpu_startup_entry+0xa8/0x10c)
[<c0053980>] (cpu_startup_entry+0xa8/0x10c) from [<000085a4>] (0x85a4)

We are in interrupt context (IPI) and ttc_set_mode() calls clk_get_rate(),
which in turn ends up taking a mutex. Even if that does not hang, it is a
potential kernel deadlock.

It is not allowed to call clk_get_rate() from interrupt context. To
avoid such calls the timer input frequency is stored in the driver's
data struct which makes it accessible to the driver in any context.

[dlezcano] completed the changelog with the WARN trace and added a more
detailed description. Tested on Zynq ZC702.

Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Tested-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Soren Brinkmann <soren.brinkmann@xilinx.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/clocksource/cadence_ttc_timer.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/drivers/clocksource/cadence_ttc_timer.c b/drivers/clocksource/cadence_ttc_timer.c
index b2bb3a4bc205..a92350b55d32 100644
--- a/drivers/clocksource/cadence_ttc_timer.c
+++ b/drivers/clocksource/cadence_ttc_timer.c
@@ -67,11 +67,13 @@
  * struct ttc_timer - This definition defines local timer structure
  *
  * @base_addr:	Base address of timer
+ * @freq:	Timer input clock frequency
  * @clk:	Associated clock source
  * @clk_rate_change_nb	Notifier block for clock rate changes
  */
 struct ttc_timer {
 	void __iomem *base_addr;
+	unsigned long freq;
 	struct clk *clk;
 	struct notifier_block clk_rate_change_nb;
 };
@@ -196,9 +198,8 @@ static void ttc_set_mode(enum clock_event_mode mode,
 
 	switch (mode) {
 	case CLOCK_EVT_MODE_PERIODIC:
-		ttc_set_interval(timer,
-				DIV_ROUND_CLOSEST(clk_get_rate(ttce->ttc.clk),
-					PRESCALE * HZ));
+		ttc_set_interval(timer, DIV_ROUND_CLOSEST(ttce->ttc.freq,
+						PRESCALE * HZ));
 		break;
 	case CLOCK_EVT_MODE_ONESHOT:
 	case CLOCK_EVT_MODE_UNUSED:
@@ -273,6 +274,8 @@ static void __init ttc_setup_clocksource(struct clk *clk, void __iomem *base)
 		return;
 	}
 
+	ttccs->ttc.freq = clk_get_rate(ttccs->ttc.clk);
+
 	ttccs->ttc.clk_rate_change_nb.notifier_call =
 		ttc_rate_change_clocksource_cb;
 	ttccs->ttc.clk_rate_change_nb.next = NULL;
@@ -298,16 +301,14 @@ static void __init ttc_setup_clocksource(struct clk *clk, void __iomem *base)
 	__raw_writel(CNT_CNTRL_RESET,
 		     ttccs->ttc.base_addr + TTC_CNT_CNTRL_OFFSET);
 
-	err = clocksource_register_hz(&ttccs->cs,
-			clk_get_rate(ttccs->ttc.clk) / PRESCALE);
+	err = clocksource_register_hz(&ttccs->cs, ttccs->ttc.freq / PRESCALE);
 	if (WARN_ON(err)) {
 		kfree(ttccs);
 		return;
 	}
 
 	ttc_sched_clock_val_reg = base + TTC_COUNT_VAL_OFFSET;
-	setup_sched_clock(ttc_sched_clock_read, 16,
-			clk_get_rate(ttccs->ttc.clk) / PRESCALE);
+	setup_sched_clock(ttc_sched_clock_read, 16, ttccs->ttc.freq / PRESCALE);
 }
 
 static int ttc_rate_change_clockevent_cb(struct notifier_block *nb,
@@ -334,6 +335,9 @@ static int ttc_rate_change_clockevent_cb(struct notifier_block *nb,
 				ndata->new_rate / PRESCALE);
 		local_irq_restore(flags);
 
+		/* update cached frequency */
+		ttc->freq = ndata->new_rate;
+
 		/* fall through */
 	}
 	case PRE_RATE_CHANGE:
@@ -367,6 +371,7 @@ static void __init ttc_setup_clockevent(struct clk *clk,
 	if (clk_notifier_register(ttcce->ttc.clk,
 				&ttcce->ttc.clk_rate_change_nb))
 		pr_warn("Unable to register clock notifier.\n");
+	ttcce->ttc.freq = clk_get_rate(ttcce->ttc.clk);
 
 	ttcce->ttc.base_addr = base;
 	ttcce->ce.name = "ttc_clockevent";
@@ -396,7 +401,7 @@ static void __init ttc_setup_clockevent(struct clk *clk,
 	}
 
 	clockevents_config_and_register(&ttcce->ce,
-			clk_get_rate(ttcce->ttc.clk) / PRESCALE, 1, 0xfffe);
+			ttcce->ttc.freq / PRESCALE, 1, 0xfffe);
 }
 
 /**
-- 
cgit v1.2.3


From 9722c2dac708e9468cc0dc30218ef76946ffbc9d Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Mon, 6 Jan 2014 11:39:12 +0000
Subject: sched: Calculate effective load even if local weight is 0

Thomas Hellstrom bisected a regression where erratic 3D performance is
experienced on virtual machines as measured by glxgears. It identified
commit 58d081b5 ("sched/numa: Avoid overloading CPUs on a preferred NUMA
node") as the problem which had modified the behaviour of effective_load.

Effective load calculates the difference to the system-wide load if a
scheduling entity was moved to another CPU. The task group is not heavier
as a result of the move but overall system load can increase/decrease as a
result of the change. Commit 58d081b5 ("sched/numa: Avoid overloading CPUs
on a preferred NUMA node") changed effective_load to make it suitable for
calculating if a particular NUMA node was compute overloaded. To reduce
the cost of the function, it assumed that a current sched entity weight
of 0 was uninteresting but that is not the case.

wake_affine() uses a weight of 0 for sync wakeups on the grounds that it
is assuming the waking task will sleep and not contribute to load in the
near future. In this case, we still want to calculate the effective load
of the sched entity hierarchy. As effective_load() is no longer used by
task_numa_compare() since commit fb13c7ee ("sched/numa: Use a system-wide
search to find swap/migration candidates"), this patch simply restores the
historical behaviour.

Reported-and-tested-by: Thomas Hellstrom <thellstrom@vmware.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
[ Wrote changelog ]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20140106113912.GC6178@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c7395d97e4cb..e64b0794060e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3923,7 +3923,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
 
-	if (!tg->parent || !wl)	/* the trivial, non-cgroup case */
+	if (!tg->parent)	/* the trivial, non-cgroup case */
 		return wl;
 
 	for_each_sched_entity(se) {
-- 
cgit v1.2.3