summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2011-04-18 12:04:30 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2011-04-18 12:04:30 +1000
commitbe839f909ce32f83a384db93415b8730c9aa72ae (patch)
treeb579432cd73a32ff0ee80f3ddbedf0fba8af9700
parent9bd02ee4b08d49da5112f240ae6e046f7beb87a1 (diff)
parent7d92e647a604becab4df58de947008e43a9990c0 (diff)
Merge remote-tracking branch 'tip/auto-latest'
-rw-r--r--arch/alpha/kernel/smp.c3
-rw-r--r--arch/alpha/kernel/time.c3
-rw-r--r--arch/arm/kernel/smp.c5
-rw-r--r--arch/blackfin/kernel/time-ts.c35
-rw-r--r--arch/blackfin/mach-common/smp.c3
-rw-r--r--arch/cris/arch-v32/kernel/smp.c13
-rw-r--r--arch/ia64/kernel/cyclone.c6
-rw-r--r--arch/ia64/kernel/irq_ia64.c2
-rw-r--r--arch/ia64/kernel/time.c9
-rw-r--r--arch/ia64/sn/kernel/sn2/timer.c6
-rw-r--r--arch/ia64/xen/irq_xen.c10
-rw-r--r--arch/m32r/kernel/smp.c4
-rw-r--r--arch/microblaze/kernel/timer.c6
-rw-r--r--arch/mips/alchemy/common/time.c3
-rw-r--r--arch/mips/cavium-octeon/csrc-octeon.c3
-rw-r--r--arch/mips/cavium-octeon/smp.c2
-rw-r--r--arch/mips/include/asm/time.h6
-rw-r--r--arch/mips/jz4740/time.c3
-rw-r--r--arch/mips/kernel/cevt-txx9.c3
-rw-r--r--arch/mips/kernel/csrc-bcm1480.c3
-rw-r--r--arch/mips/kernel/csrc-ioasic.c4
-rw-r--r--arch/mips/kernel/csrc-powertv.c35
-rw-r--r--arch/mips/kernel/csrc-r4k.c4
-rw-r--r--arch/mips/kernel/csrc-sb1250.c3
-rw-r--r--arch/mips/kernel/i8253.c5
-rw-r--r--arch/mips/kernel/smtc.c2
-rw-r--r--arch/mips/loongson/common/cs5536/cs5536_mfgpt.c5
-rw-r--r--arch/mips/mti-malta/malta-int.c2
-rw-r--r--arch/mips/pmc-sierra/yosemite/smp.c4
-rw-r--r--arch/mips/sgi-ip27/ip27-irq.c2
-rw-r--r--arch/mips/sgi-ip27/ip27-timer.c3
-rw-r--r--arch/mips/sibyte/bcm1480/smp.c7
-rw-r--r--arch/mips/sibyte/sb1250/smp.c7
-rw-r--r--arch/mn10300/kernel/smp.c5
-rw-r--r--arch/parisc/kernel/smp.c5
-rw-r--r--arch/powerpc/kernel/smp.c4
-rw-r--r--arch/s390/kernel/smp.c6
-rw-r--r--arch/sh/kernel/smp.c2
-rw-r--r--arch/sparc/include/asm/topology_64.h6
-rw-r--r--arch/sparc/kernel/smp_32.c4
-rw-r--r--arch/sparc/kernel/smp_64.c1
-rw-r--r--arch/tile/kernel/smp.c6
-rw-r--r--arch/um/kernel/smp.c2
-rw-r--r--arch/x86/Kconfig43
-rw-r--r--arch/x86/Kconfig.cpu16
-rw-r--r--arch/x86/Makefile_32.cpu2
-rw-r--r--arch/x86/include/asm/cpufeature.h13
-rw-r--r--arch/x86/include/asm/mach_traps.h18
-rw-r--r--arch/x86/include/asm/mmzone_64.h23
-rw-r--r--arch/x86/include/asm/module.h2
-rw-r--r--arch/x86/include/asm/olpc_ofw.h9
-rw-r--r--arch/x86/include/asm/percpu.h27
-rw-r--r--arch/x86/include/asm/probe_roms.h8
-rw-r--r--arch/x86/include/asm/setup.h2
-rw-r--r--arch/x86/include/asm/topology.h1
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/apb_timer.c10
-rw-r--r--arch/x86/kernel/apic/apic.c2
-rw-r--r--arch/x86/kernel/apic/numaq_32.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig4
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c7
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c2
-rw-r--r--arch/x86/kernel/head32.c1
-rw-r--r--arch/x86/kernel/i8253.c6
-rw-r--r--arch/x86/kernel/kvmclock.c6
-rw-r--r--arch/x86/kernel/mpparse.c6
-rw-r--r--arch/x86/kernel/probe_roms.c (renamed from arch/x86/kernel/probe_roms_32.c)101
-rw-r--r--arch/x86/kernel/process.c4
-rw-r--r--arch/x86/kernel/reboot.c24
-rw-r--r--arch/x86/kernel/smp.c5
-rw-r--r--arch/x86/kernel/smpboot.c4
-rw-r--r--arch/x86/kernel/x86_init.c2
-rw-r--r--arch/x86/lguest/boot.c4
-rw-r--r--arch/x86/mm/numa_32.c268
-rw-r--r--arch/x86/mm/numa_64.c131
-rw-r--r--arch/x86/mm/srat_32.c1
-rw-r--r--arch/x86/platform/olpc/Makefile4
-rw-r--r--arch/x86/platform/olpc/olpc.c51
-rw-r--r--arch/x86/platform/olpc/olpc_dt.c19
-rw-r--r--arch/x86/platform/uv/uv_time.c6
-rw-r--r--arch/x86/xen/smp.c5
-rw-r--r--arch/x86/xen/time.c6
-rw-r--r--drivers/acpi/processor_throttling.c32
-rw-r--r--drivers/char/hpet.c6
-rw-r--r--drivers/clocksource/cyclone.c10
-rw-r--r--include/linux/clocksource.h2
-rw-r--r--include/linux/mutex.h2
-rw-r--r--include/linux/sched.h50
-rw-r--r--init/Kconfig5
-rw-r--r--kernel/cpuset.c2
-rw-r--r--kernel/mutex-debug.c2
-rw-r--r--kernel/mutex-debug.h2
-rw-r--r--kernel/mutex.c2
-rw-r--r--kernel/mutex.h2
-rw-r--r--kernel/sched.c1597
-rw-r--r--kernel/sched_debug.c2
-rw-r--r--kernel/sched_fair.c60
-rw-r--r--kernel/sched_features.h6
-rw-r--r--kernel/sched_idletask.c2
-rw-r--r--kernel/sched_rt.c54
-rw-r--r--kernel/sched_stoptask.c5
-rw-r--r--tools/perf/Documentation/perf-script.txt52
-rw-r--r--tools/perf/builtin-script.c171
-rw-r--r--tools/perf/util/probe-finder.c72
-rw-r--r--tools/perf/util/probe-finder.h2
105 files changed, 1502 insertions, 1739 deletions
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 42aa078a5e4d..5a621c6d22ab 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -585,8 +585,7 @@ handle_ipi(struct pt_regs *regs)
switch (which) {
case IPI_RESCHEDULE:
- /* Reschedule callback. Everything to be done
- is done by the interrupt return path. */
+ scheduler_ipi();
break;
case IPI_CALL_FUNC:
diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c
index a58e84f1a63b..d73d400cc9ed 100644
--- a/arch/alpha/kernel/time.c
+++ b/arch/alpha/kernel/time.c
@@ -374,8 +374,7 @@ static struct clocksource clocksource_rpcc = {
static inline void register_rpcc_clocksource(long cycle_freq)
{
- clocksource_calc_mult_shift(&clocksource_rpcc, cycle_freq, 4);
- clocksource_register(&clocksource_rpcc);
+ clocksource_register_hz(&clocksource_rpcc, cycle_freq);
}
#else /* !CONFIG_SMP */
static inline void register_rpcc_clocksource(long cycle_freq)
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index a0ad0540761f..fcf3c8e25601 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -567,10 +567,7 @@ asmlinkage void __exception_irq_entry do_IPI(int ipinr, struct pt_regs *regs)
break;
case IPI_RESCHEDULE:
- /*
- * nothing more to do - eveything is
- * done on the interrupt return path
- */
+ scheduler_ipi();
break;
case IPI_CALL_FUNC:
diff --git a/arch/blackfin/kernel/time-ts.c b/arch/blackfin/kernel/time-ts.c
index cdb4beb6bc8f..9e9b60d969dc 100644
--- a/arch/blackfin/kernel/time-ts.c
+++ b/arch/blackfin/kernel/time-ts.c
@@ -23,29 +23,6 @@
#include <asm/gptimers.h>
#include <asm/nmi.h>
-/* Accelerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- * basic equation:
- * ns = cycles / (freq / ns_per_sec)
- * ns = cycles * (ns_per_sec / freq)
- * ns = cycles * (10^9 / (cpu_khz * 10^3))
- * ns = cycles * (10^6 / cpu_khz)
- *
- * Then we use scaling math (suggested by george@mvista.com) to get:
- * ns = cycles * (10^6 * SC / cpu_khz) / SC
- * ns = cycles * cyc2ns_scale / SC
- *
- * And since SC is a constant power of two, we can convert the div
- * into a shift.
- *
- * We can use khz divisor instead of mhz to keep a better precision, since
- * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- * (mathieu.desnoyers@polymtl.ca)
- *
- * -johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
-
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
#if defined(CONFIG_CYCLES_CLOCKSOURCE)
@@ -63,7 +40,6 @@ static struct clocksource bfin_cs_cycles = {
.rating = 400,
.read = bfin_read_cycles,
.mask = CLOCKSOURCE_MASK(64),
- .shift = CYC2NS_SCALE_FACTOR,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -75,10 +51,7 @@ static inline unsigned long long bfin_cs_cycles_sched_clock(void)
static int __init bfin_cs_cycles_init(void)
{
- bfin_cs_cycles.mult = \
- clocksource_hz2mult(get_cclk(), bfin_cs_cycles.shift);
-
- if (clocksource_register(&bfin_cs_cycles))
+ if (clocksource_register_hz(&bfin_cs_cycles, get_cclk()))
panic("failed to register clocksource");
return 0;
@@ -111,7 +84,6 @@ static struct clocksource bfin_cs_gptimer0 = {
.rating = 350,
.read = bfin_read_gptimer0,
.mask = CLOCKSOURCE_MASK(32),
- .shift = CYC2NS_SCALE_FACTOR,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -125,10 +97,7 @@ static int __init bfin_cs_gptimer0_init(void)
{
setup_gptimer0();
- bfin_cs_gptimer0.mult = \
- clocksource_hz2mult(get_sclk(), bfin_cs_gptimer0.shift);
-
- if (clocksource_register(&bfin_cs_gptimer0))
+ if (clocksource_register_hz(&bfin_cs_gptimer0, get_sclk()))
panic("failed to register clocksource");
return 0;
diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c
index 8bce5ed031e4..1fbd94c44457 100644
--- a/arch/blackfin/mach-common/smp.c
+++ b/arch/blackfin/mach-common/smp.c
@@ -177,6 +177,9 @@ static irqreturn_t ipi_handler_int1(int irq, void *dev_instance)
while (msg_queue->count) {
msg = &msg_queue->ipi_message[msg_queue->head];
switch (msg->type) {
+ case BFIN_IPI_RESCHEDULE:
+ scheduler_ipi();
+ break;
case BFIN_IPI_CALL_FUNC:
spin_unlock_irqrestore(&msg_queue->lock, flags);
ipi_call_function(cpu, msg);
diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c
index 4c9e3e1ba5d1..66cc75657e2f 100644
--- a/arch/cris/arch-v32/kernel/smp.c
+++ b/arch/cris/arch-v32/kernel/smp.c
@@ -342,15 +342,18 @@ irqreturn_t crisv32_ipi_interrupt(int irq, void *dev_id)
ipi = REG_RD(intr_vect, irq_regs[smp_processor_id()], rw_ipi);
+ if (ipi.vector & IPI_SCHEDULE) {
+ scheduler_ipi();
+ }
if (ipi.vector & IPI_CALL) {
- func(info);
+ func(info);
}
if (ipi.vector & IPI_FLUSH_TLB) {
- if (flush_mm == FLUSH_ALL)
- __flush_tlb_all();
- else if (flush_vma == FLUSH_ALL)
+ if (flush_mm == FLUSH_ALL)
+ __flush_tlb_all();
+ else if (flush_vma == FLUSH_ALL)
__flush_tlb_mm(flush_mm);
- else
+ else
__flush_tlb_page(flush_vma, flush_addr);
}
diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c
index 1b811c61bdc6..f64097b5118a 100644
--- a/arch/ia64/kernel/cyclone.c
+++ b/arch/ia64/kernel/cyclone.c
@@ -31,8 +31,6 @@ static struct clocksource clocksource_cyclone = {
.rating = 300,
.read = read_cyclone,
.mask = (1LL << 40) - 1,
- .mult = 0, /*to be calculated*/
- .shift = 16,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -118,9 +116,7 @@ int __init init_cyclone_clock(void)
/* initialize last tick */
cyclone_mc = cyclone_timer;
clocksource_cyclone.fsys_mmio = cyclone_timer;
- clocksource_cyclone.mult = clocksource_hz2mult(CYCLONE_TIMER_FREQ,
- clocksource_cyclone.shift);
- clocksource_register(&clocksource_cyclone);
+ clocksource_register_hz(&clocksource_cyclone, CYCLONE_TIMER_FREQ);
return 0;
}
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index 5b704740f160..782c3a357f24 100644
--- a/arch/ia64/kernel/irq_ia64.c
+++ b/arch/ia64/kernel/irq_ia64.c
@@ -31,6 +31,7 @@
#include <linux/irq.h>
#include <linux/ratelimit.h>
#include <linux/acpi.h>
+#include <linux/sched.h>
#include <asm/delay.h>
#include <asm/intrinsics.h>
@@ -496,6 +497,7 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
smp_local_flush_tlb();
kstat_incr_irqs_this_cpu(irq, desc);
} else if (unlikely(IS_RESCHEDULE(vector))) {
+ scheduler_ipi();
kstat_incr_irqs_this_cpu(irq, desc);
} else {
ia64_setreg(_IA64_REG_CR_TPR, vector);
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 156ad803d5b7..04440cc09b40 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -73,8 +73,6 @@ static struct clocksource clocksource_itc = {
.rating = 350,
.read = itc_get_cycles,
.mask = CLOCKSOURCE_MASK(64),
- .mult = 0, /*to be calculated*/
- .shift = 16,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
#ifdef CONFIG_PARAVIRT
.resume = paravirt_clocksource_resume,
@@ -365,11 +363,8 @@ ia64_init_itm (void)
ia64_cpu_local_tick();
if (!itc_clocksource) {
- /* Sort out mult/shift values: */
- clocksource_itc.mult =
- clocksource_hz2mult(local_cpu_data->itc_freq,
- clocksource_itc.shift);
- clocksource_register(&clocksource_itc);
+ clocksource_register_hz(&clocksource_itc,
+ local_cpu_data->itc_freq);
itc_clocksource = &clocksource_itc;
}
}
diff --git a/arch/ia64/sn/kernel/sn2/timer.c b/arch/ia64/sn/kernel/sn2/timer.c
index 21d6f09e3447..c34efda122e1 100644
--- a/arch/ia64/sn/kernel/sn2/timer.c
+++ b/arch/ia64/sn/kernel/sn2/timer.c
@@ -33,8 +33,6 @@ static struct clocksource clocksource_sn2 = {
.rating = 450,
.read = read_sn2,
.mask = (1LL << 55) - 1,
- .mult = 0,
- .shift = 10,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -57,9 +55,7 @@ ia64_sn_udelay (unsigned long usecs)
void __init sn_timer_init(void)
{
clocksource_sn2.fsys_mmio = RTC_COUNTER_ADDR;
- clocksource_sn2.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
- clocksource_sn2.shift);
- clocksource_register(&clocksource_sn2);
+ clocksource_register_hz(&clocksource_sn2, sn_rtc_cycles_per_second);
ia64_udelay = &ia64_sn_udelay;
}
diff --git a/arch/ia64/xen/irq_xen.c b/arch/ia64/xen/irq_xen.c
index 108bb858acf2..b279e142c633 100644
--- a/arch/ia64/xen/irq_xen.c
+++ b/arch/ia64/xen/irq_xen.c
@@ -92,6 +92,8 @@ static unsigned short saved_irq_cnt;
static int xen_slab_ready;
#ifdef CONFIG_SMP
+#include <linux/sched.h>
+
/* Dummy stub. Though we may check XEN_RESCHEDULE_VECTOR before __do_IRQ,
* it ends up to issue several memory accesses upon percpu data and
* thus adds unnecessary traffic to other paths.
@@ -99,7 +101,13 @@ static int xen_slab_ready;
static irqreturn_t
xen_dummy_handler(int irq, void *dev_id)
{
+ return IRQ_HANDLED;
+}
+static irqreturn_t
+xen_resched_handler(int irq, void *dev_id)
+{
+ scheduler_ipi();
return IRQ_HANDLED;
}
@@ -110,7 +118,7 @@ static struct irqaction xen_ipi_irqaction = {
};
static struct irqaction xen_resched_irqaction = {
- .handler = xen_dummy_handler,
+ .handler = xen_resched_handler,
.flags = IRQF_DISABLED,
.name = "resched"
};
diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c
index 31cef20b2996..fc10b39893d4 100644
--- a/arch/m32r/kernel/smp.c
+++ b/arch/m32r/kernel/smp.c
@@ -122,8 +122,6 @@ void smp_send_reschedule(int cpu_id)
*
* Description: This routine executes on CPU which received
* 'RESCHEDULE_IPI'.
- * Rescheduling is processed at the exit of interrupt
- * operation.
*
* Born on Date: 2002.02.05
*
@@ -138,7 +136,7 @@ void smp_send_reschedule(int cpu_id)
*==========================================================================*/
void smp_reschedule_interrupt(void)
{
- /* nothing to do */
+ scheduler_ipi();
}
/*==========================================================================*
diff --git a/arch/microblaze/kernel/timer.c b/arch/microblaze/kernel/timer.c
index d8a214f11ac2..e5550ce4e0eb 100644
--- a/arch/microblaze/kernel/timer.c
+++ b/arch/microblaze/kernel/timer.c
@@ -217,16 +217,12 @@ static struct clocksource clocksource_microblaze = {
.rating = 300,
.read = microblaze_read,
.mask = CLOCKSOURCE_MASK(32),
- .shift = 8, /* I can shift it */
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
static int __init microblaze_clocksource_init(void)
{
- clocksource_microblaze.mult =
- clocksource_hz2mult(timer_clock_freq,
- clocksource_microblaze.shift);
- if (clocksource_register(&clocksource_microblaze))
+ if (clocksource_register_hz(&clocksource_microblaze, timer_clock_freq))
panic("failed to register clocksource");
/* stop timer1 */
diff --git a/arch/mips/alchemy/common/time.c b/arch/mips/alchemy/common/time.c
index 2aecb2fdf982..d5da6adbf634 100644
--- a/arch/mips/alchemy/common/time.c
+++ b/arch/mips/alchemy/common/time.c
@@ -141,8 +141,7 @@ static int __init alchemy_time_init(unsigned int m2int)
goto cntr_err;
/* register counter1 clocksource and event device */
- clocksource_set_clock(&au1x_counter1_clocksource, 32768);
- clocksource_register(&au1x_counter1_clocksource);
+ clocksource_register_hz(&au1x_counter1_clocksource, 32768);
cd->shift = 32;
cd->mult = div_sc(32768, NSEC_PER_SEC, cd->shift);
diff --git a/arch/mips/cavium-octeon/csrc-octeon.c b/arch/mips/cavium-octeon/csrc-octeon.c
index 26bf71130bf8..29d56afbb02d 100644
--- a/arch/mips/cavium-octeon/csrc-octeon.c
+++ b/arch/mips/cavium-octeon/csrc-octeon.c
@@ -105,8 +105,7 @@ unsigned long long notrace sched_clock(void)
void __init plat_time_init(void)
{
clocksource_mips.rating = 300;
- clocksource_set_clock(&clocksource_mips, octeon_get_clock_rate());
- clocksource_register(&clocksource_mips);
+ clocksource_register_hz(&clocksource_mips, octeon_get_clock_rate());
}
static u64 octeon_udelay_factor;
diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c
index ba78b21cc8d0..76923eeb58b9 100644
--- a/arch/mips/cavium-octeon/smp.c
+++ b/arch/mips/cavium-octeon/smp.c
@@ -44,6 +44,8 @@ static irqreturn_t mailbox_interrupt(int irq, void *dev_id)
if (action & SMP_CALL_FUNCTION)
smp_call_function_interrupt();
+ if (action & SMP_RESCHEDULE_YOURSELF)
+ scheduler_ipi();
/* Check if we've been told to flush the icache */
if (action & SMP_ICACHE_FLUSH)
diff --git a/arch/mips/include/asm/time.h b/arch/mips/include/asm/time.h
index c7f1bfef1574..bc14447e69b5 100644
--- a/arch/mips/include/asm/time.h
+++ b/arch/mips/include/asm/time.h
@@ -84,12 +84,6 @@ static inline int init_mips_clocksource(void)
#endif
}
-static inline void clocksource_set_clock(struct clocksource *cs,
- unsigned int clock)
-{
- clocksource_calc_mult_shift(cs, clock, 4);
-}
-
static inline void clockevent_set_clock(struct clock_event_device *cd,
unsigned int clock)
{
diff --git a/arch/mips/jz4740/time.c b/arch/mips/jz4740/time.c
index fe01678d94fd..03dfd4e0da4e 100644
--- a/arch/mips/jz4740/time.c
+++ b/arch/mips/jz4740/time.c
@@ -121,8 +121,7 @@ void __init plat_time_init(void)
clockevents_register_device(&jz4740_clockevent);
- clocksource_set_clock(&jz4740_clocksource, clk_rate);
- ret = clocksource_register(&jz4740_clocksource);
+ ret = clocksource_register_hz(&jz4740_clocksource, clk_rate);
if (ret)
printk(KERN_ERR "Failed to register clocksource: %d\n", ret);
diff --git a/arch/mips/kernel/cevt-txx9.c b/arch/mips/kernel/cevt-txx9.c
index 0b7377361e22..f0ab92a1b057 100644
--- a/arch/mips/kernel/cevt-txx9.c
+++ b/arch/mips/kernel/cevt-txx9.c
@@ -51,8 +51,7 @@ void __init txx9_clocksource_init(unsigned long baseaddr,
{
struct txx9_tmr_reg __iomem *tmrptr;
- clocksource_set_clock(&txx9_clocksource.cs, TIMER_CLK(imbusclk));
- clocksource_register(&txx9_clocksource.cs);
+ clocksource_register_hz(&txx9_clocksource.cs, TIMER_CLK(imbusclk));
tmrptr = ioremap(baseaddr, sizeof(struct txx9_tmr_reg));
__raw_writel(TCR_BASE, &tmrptr->tcr);
diff --git a/arch/mips/kernel/csrc-bcm1480.c b/arch/mips/kernel/csrc-bcm1480.c
index 51489f8a825e..f96f99c794a3 100644
--- a/arch/mips/kernel/csrc-bcm1480.c
+++ b/arch/mips/kernel/csrc-bcm1480.c
@@ -49,6 +49,5 @@ void __init sb1480_clocksource_init(void)
plldiv = G_BCM1480_SYS_PLL_DIV(__raw_readq(IOADDR(A_SCD_SYSTEM_CFG)));
zbbus = ((plldiv >> 1) * 50000000) + ((plldiv & 1) * 25000000);
- clocksource_set_clock(cs, zbbus);
- clocksource_register(cs);
+ clocksource_register_hz(cs, zbbus);
}
diff --git a/arch/mips/kernel/csrc-ioasic.c b/arch/mips/kernel/csrc-ioasic.c
index 23da108506b0..46bd7fa98d6c 100644
--- a/arch/mips/kernel/csrc-ioasic.c
+++ b/arch/mips/kernel/csrc-ioasic.c
@@ -59,7 +59,5 @@ void __init dec_ioasic_clocksource_init(void)
printk(KERN_INFO "I/O ASIC clock frequency %dHz\n", freq);
clocksource_dec.rating = 200 + freq / 10000000;
- clocksource_set_clock(&clocksource_dec, freq);
-
- clocksource_register(&clocksource_dec);
+ clocksource_register_hz(&clocksource_dec, freq);
}
diff --git a/arch/mips/kernel/csrc-powertv.c b/arch/mips/kernel/csrc-powertv.c
index a27c16c8690e..2e7c5232da8d 100644
--- a/arch/mips/kernel/csrc-powertv.c
+++ b/arch/mips/kernel/csrc-powertv.c
@@ -78,9 +78,7 @@ static void __init powertv_c0_hpt_clocksource_init(void)
clocksource_mips.rating = 200 + mips_hpt_frequency / 10000000;
- clocksource_set_clock(&clocksource_mips, mips_hpt_frequency);
-
- clocksource_register(&clocksource_mips);
+ clocksource_register_hz(&clocksource_mips, mips_hpt_frequency);
}
/**
@@ -130,43 +128,16 @@ static struct clocksource clocksource_tim_c = {
/**
* powertv_tim_c_clocksource_init - set up a clock source for the TIM_C clock
*
- * The hard part here is coming up with a constant k and shift s such that
- * the 48-bit TIM_C value multiplied by k doesn't overflow and that value,
- * when shifted right by s, yields the corresponding number of nanoseconds.
* We know that TIM_C counts at 27 MHz/8, so each cycle corresponds to
- * 1 / (27,000,000/8) seconds. Multiply that by a billion and you get the
- * number of nanoseconds. Since the TIM_C value has 48 bits and the math is
- * done in 64 bits, avoiding an overflow means that k must be less than
- * 64 - 48 = 16 bits.
+ * 1 / (27,000,000/8) seconds.
*/
static void __init powertv_tim_c_clocksource_init(void)
{
- int prescale;
- unsigned long dividend;
- unsigned long k;
- int s;
- const int max_k_bits = (64 - 48) - 1;
- const unsigned long billion = 1000000000;
const unsigned long counts_per_second = 27000000 / 8;
- prescale = BITS_PER_LONG - ilog2(billion) - 1;
- dividend = billion << prescale;
- k = dividend / counts_per_second;
- s = ilog2(k) - max_k_bits;
-
- if (s < 0)
- s = prescale;
-
- else {
- k >>= s;
- s += prescale;
- }
-
- clocksource_tim_c.mult = k;
- clocksource_tim_c.shift = s;
clocksource_tim_c.rating = 200;
- clocksource_register(&clocksource_tim_c);
+ clocksource_register_hz(&clocksource_tim_c, counts_per_second);
tim_c = (struct tim_c *) asic_reg_addr(tim_ch);
}
diff --git a/arch/mips/kernel/csrc-r4k.c b/arch/mips/kernel/csrc-r4k.c
index e95a3cd48eea..decd1fa38d55 100644
--- a/arch/mips/kernel/csrc-r4k.c
+++ b/arch/mips/kernel/csrc-r4k.c
@@ -30,9 +30,7 @@ int __init init_r4k_clocksource(void)
/* Calculate a somewhat reasonable rating value */
clocksource_mips.rating = 200 + mips_hpt_frequency / 10000000;
- clocksource_set_clock(&clocksource_mips, mips_hpt_frequency);
-
- clocksource_register(&clocksource_mips);
+ clocksource_register_hz(&clocksource_mips, mips_hpt_frequency);
return 0;
}
diff --git a/arch/mips/kernel/csrc-sb1250.c b/arch/mips/kernel/csrc-sb1250.c
index d14d3d1907fa..e9606d907685 100644
--- a/arch/mips/kernel/csrc-sb1250.c
+++ b/arch/mips/kernel/csrc-sb1250.c
@@ -65,6 +65,5 @@ void __init sb1250_clocksource_init(void)
IOADDR(A_SCD_TIMER_REGISTER(SB1250_HPT_NUM,
R_SCD_TIMER_CFG)));
- clocksource_set_clock(cs, V_SCD_TIMER_FREQ);
- clocksource_register(cs);
+ clocksource_register_hz(cs, V_SCD_TIMER_FREQ);
}
diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c
index 2392a7a296d4..9fadd17888d9 100644
--- a/arch/mips/kernel/i8253.c
+++ b/arch/mips/kernel/i8253.c
@@ -196,8 +196,6 @@ static struct clocksource clocksource_pit = {
.rating = 110,
.read = pit_read,
.mask = CLOCKSOURCE_MASK(32),
- .mult = 0,
- .shift = 20,
};
static int __init init_pit_clocksource(void)
@@ -205,7 +203,6 @@ static int __init init_pit_clocksource(void)
if (num_possible_cpus() > 1) /* PIT does not scale! */
return 0;
- clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
- return clocksource_register(&clocksource_pit);
+ return clocksource_register_hz(&clocksource_pit, CLOCK_TICK_RATE);
}
arch_initcall(init_pit_clocksource);
diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c
index 5a88cc4ccd5a..cedac4633741 100644
--- a/arch/mips/kernel/smtc.c
+++ b/arch/mips/kernel/smtc.c
@@ -929,7 +929,7 @@ static void post_direct_ipi(int cpu, struct smtc_ipi *pipi)
static void ipi_resched_interrupt(void)
{
- /* Return from interrupt should be enough to cause scheduler check */
+ scheduler_ipi();
}
static void ipi_call_interrupt(void)
diff --git a/arch/mips/loongson/common/cs5536/cs5536_mfgpt.c b/arch/mips/loongson/common/cs5536/cs5536_mfgpt.c
index 8c807c965199..0cb1b9760e34 100644
--- a/arch/mips/loongson/common/cs5536/cs5536_mfgpt.c
+++ b/arch/mips/loongson/common/cs5536/cs5536_mfgpt.c
@@ -201,8 +201,6 @@ static struct clocksource clocksource_mfgpt = {
.rating = 120, /* Functional for real use, but not desired */
.read = mfgpt_read,
.mask = CLOCKSOURCE_MASK(32),
- .mult = 0,
- .shift = 22,
};
int __init init_mfgpt_clocksource(void)
@@ -210,8 +208,7 @@ int __init init_mfgpt_clocksource(void)
if (num_possible_cpus() > 1) /* MFGPT does not scale! */
return 0;
- clocksource_mfgpt.mult = clocksource_hz2mult(MFGPT_TICK_RATE, 22);
- return clocksource_register(&clocksource_mfgpt);
+ return clocksource_register_hz(&clocksource_mfgpt, MFGPT_TICK_RATE);
}
arch_initcall(init_mfgpt_clocksource);
diff --git a/arch/mips/mti-malta/malta-int.c b/arch/mips/mti-malta/malta-int.c
index 9027061f0ead..7d93e6fbfa5a 100644
--- a/arch/mips/mti-malta/malta-int.c
+++ b/arch/mips/mti-malta/malta-int.c
@@ -309,6 +309,8 @@ static void ipi_call_dispatch(void)
static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id)
{
+ scheduler_ipi();
+
return IRQ_HANDLED;
}
diff --git a/arch/mips/pmc-sierra/yosemite/smp.c b/arch/mips/pmc-sierra/yosemite/smp.c
index efc9e889b349..2608752898c0 100644
--- a/arch/mips/pmc-sierra/yosemite/smp.c
+++ b/arch/mips/pmc-sierra/yosemite/smp.c
@@ -55,6 +55,8 @@ void titan_mailbox_irq(void)
if (status & 0x2)
smp_call_function_interrupt();
+ if (status & 0x4)
+ scheduler_ipi();
break;
case 1:
@@ -63,6 +65,8 @@ void titan_mailbox_irq(void)
if (status & 0x2)
smp_call_function_interrupt();
+ if (status & 0x4)
+ scheduler_ipi();
break;
}
}
diff --git a/arch/mips/sgi-ip27/ip27-irq.c b/arch/mips/sgi-ip27/ip27-irq.c
index 0a04603d577c..b18b04e48577 100644
--- a/arch/mips/sgi-ip27/ip27-irq.c
+++ b/arch/mips/sgi-ip27/ip27-irq.c
@@ -147,8 +147,10 @@ static void ip27_do_irq_mask0(void)
#ifdef CONFIG_SMP
if (pend0 & (1UL << CPU_RESCHED_A_IRQ)) {
LOCAL_HUB_CLR_INTR(CPU_RESCHED_A_IRQ);
+ scheduler_ipi();
} else if (pend0 & (1UL << CPU_RESCHED_B_IRQ)) {
LOCAL_HUB_CLR_INTR(CPU_RESCHED_B_IRQ);
+ scheduler_ipi();
} else if (pend0 & (1UL << CPU_CALL_A_IRQ)) {
LOCAL_HUB_CLR_INTR(CPU_CALL_A_IRQ);
smp_call_function_interrupt();
diff --git a/arch/mips/sgi-ip27/ip27-timer.c b/arch/mips/sgi-ip27/ip27-timer.c
index 3f810c9cbf83..ef74f3267f91 100644
--- a/arch/mips/sgi-ip27/ip27-timer.c
+++ b/arch/mips/sgi-ip27/ip27-timer.c
@@ -163,8 +163,7 @@ static void __init hub_rt_clocksource_init(void)
{
struct clocksource *cs = &hub_rt_clocksource;
- clocksource_set_clock(cs, CYCLES_PER_SEC);
- clocksource_register(cs);
+ clocksource_register_hz(cs, CYCLES_PER_SEC);
}
void __init plat_time_init(void)
diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c
index 47b347c992ea..d667875be564 100644
--- a/arch/mips/sibyte/bcm1480/smp.c
+++ b/arch/mips/sibyte/bcm1480/smp.c
@@ -20,6 +20,7 @@
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/kernel_stat.h>
+#include <linux/sched.h>
#include <asm/mmu_context.h>
#include <asm/io.h>
@@ -189,10 +190,8 @@ void bcm1480_mailbox_interrupt(void)
/* Clear the mailbox to clear the interrupt */
__raw_writeq(((u64)action)<<48, mailbox_0_clear_regs[cpu]);
- /*
- * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the
- * interrupt will do the reschedule for us
- */
+ if (action & SMP_RESCHEDULE_YOURSELF)
+ scheduler_ipi();
if (action & SMP_CALL_FUNCTION)
smp_call_function_interrupt();
diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c
index c00a5cb1128d..38e7f6bd7922 100644
--- a/arch/mips/sibyte/sb1250/smp.c
+++ b/arch/mips/sibyte/sb1250/smp.c
@@ -21,6 +21,7 @@
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <linux/kernel_stat.h>
+#include <linux/sched.h>
#include <asm/mmu_context.h>
#include <asm/io.h>
@@ -177,10 +178,8 @@ void sb1250_mailbox_interrupt(void)
/* Clear the mailbox to clear the interrupt */
____raw_writeq(((u64)action) << 48, mailbox_clear_regs[cpu]);
- /*
- * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the
- * interrupt will do the reschedule for us
- */
+ if (action & SMP_RESCHEDULE_YOURSELF)
+ scheduler_ipi();
if (action & SMP_CALL_FUNCTION)
smp_call_function_interrupt();
diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c
index 226c826a2194..83fb27912231 100644
--- a/arch/mn10300/kernel/smp.c
+++ b/arch/mn10300/kernel/smp.c
@@ -494,14 +494,11 @@ void smp_send_stop(void)
* @irq: The interrupt number.
* @dev_id: The device ID.
*
- * We need do nothing here, since the scheduling will be effected on our way
- * back through entry.S.
- *
* Returns IRQ_HANDLED to indicate we handled the interrupt successfully.
*/
static irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
{
- /* do nothing */
+ scheduler_ipi();
return IRQ_HANDLED;
}
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 69d63d354ef0..828305f19cff 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -155,10 +155,7 @@ ipi_interrupt(int irq, void *dev_id)
case IPI_RESCHEDULE:
smp_debug(100, KERN_DEBUG "CPU%d IPI_RESCHEDULE\n", this_cpu);
- /*
- * Reschedule callback. Everything to be
- * done is done by the interrupt return path.
- */
+ scheduler_ipi();
break;
case IPI_CALL_FUNC:
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index cbdbb14be4b0..9f9c204bef69 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -116,7 +116,7 @@ void smp_message_recv(int msg)
generic_smp_call_function_interrupt();
break;
case PPC_MSG_RESCHEDULE:
- /* we notice need_resched on exit */
+ scheduler_ipi();
break;
case PPC_MSG_CALL_FUNC_SINGLE:
generic_smp_call_function_single_interrupt();
@@ -146,7 +146,7 @@ static irqreturn_t call_function_action(int irq, void *data)
static irqreturn_t reschedule_action(int irq, void *data)
{
- /* we just need the return path side effect of checking need_resched */
+ scheduler_ipi();
return IRQ_HANDLED;
}
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 63a97db83f96..63c7d9ff220d 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -165,12 +165,12 @@ static void do_ext_call_interrupt(unsigned int ext_int_code,
kstat_cpu(smp_processor_id()).irqs[EXTINT_IPI]++;
/*
* handle bit signal external calls
- *
- * For the ec_schedule signal we have to do nothing. All the work
- * is done automatically when we return from the interrupt.
*/
bits = xchg(&S390_lowcore.ext_call_fast, 0);
+ if (test_bit(ec_schedule, &bits))
+ scheduler_ipi();
+
if (test_bit(ec_call_function, &bits))
generic_smp_call_function_interrupt();
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index 509b36b45115..6207561ea34a 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -20,6 +20,7 @@
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/interrupt.h>
+#include <linux/sched.h>
#include <asm/atomic.h>
#include <asm/processor.h>
#include <asm/system.h>
@@ -323,6 +324,7 @@ void smp_message_recv(unsigned int msg)
generic_smp_call_function_interrupt();
break;
case SMP_MSG_RESCHEDULE:
+ scheduler_ipi();
break;
case SMP_MSG_FUNCTION_SINGLE:
generic_smp_call_function_single_interrupt();
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index 1c79f32734a0..8b9c556d630b 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -65,6 +65,10 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
#define smt_capable() (sparc64_multi_core)
#endif /* CONFIG_SMP */
-#define cpu_coregroup_mask(cpu) (&cpu_core_map[cpu])
+extern cpumask_t cpu_core_map[NR_CPUS];
+static inline const struct cpumask *cpu_coregroup_mask(int cpu)
+{
+ return &cpu_core_map[cpu];
+}
#endif /* _ASM_SPARC64_TOPOLOGY_H */
diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index 91c10fb70858..f95690c167b6 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -125,7 +125,9 @@ struct linux_prom_registers smp_penguin_ctable __cpuinitdata = { 0 };
void smp_send_reschedule(int cpu)
{
- /* See sparc64 */
+ /*
+ * XXX missing reschedule IPI, see scheduler_ipi()
+ */
}
void smp_send_stop(void)
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 3e94a8c23238..9478da7fdb3e 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1368,6 +1368,7 @@ void smp_send_reschedule(int cpu)
void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs)
{
clear_softint(1 << irq);
+ scheduler_ipi();
}
/* This is a nop because we capture all other cpus
diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c
index a4293102ef81..c52224d5ed45 100644
--- a/arch/tile/kernel/smp.c
+++ b/arch/tile/kernel/smp.c
@@ -189,12 +189,8 @@ void flush_icache_range(unsigned long start, unsigned long end)
/* Called when smp_send_reschedule() triggers IRQ_RESCHEDULE. */
static irqreturn_t handle_reschedule_ipi(int irq, void *token)
{
- /*
- * Nothing to do here; when we return from interrupt, the
- * rescheduling will occur there. But do bump the interrupt
- * profiler count in the meantime.
- */
__get_cpu_var(irq_stat).irq_resched_count++;
+ scheduler_ipi();
return IRQ_HANDLED;
}
diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c
index 106bf27e2a9a..eefb107d2d73 100644
--- a/arch/um/kernel/smp.c
+++ b/arch/um/kernel/smp.c
@@ -173,7 +173,7 @@ void IPI_handler(int cpu)
break;
case 'R':
- set_tsk_need_resched(current);
+ scheduler_ipi();
break;
case 'S':
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc6c53a95bfd..0945ed8057f0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -365,17 +365,6 @@ config X86_UV
# Following is an alphabetically sorted list of 32 bit extended platforms
# Please maintain the alphabetic order if and when there are additions
-config X86_ELAN
- bool "AMD Elan"
- depends on X86_32
- depends on X86_EXTENDED_PLATFORM
- ---help---
- Select this for an AMD Elan processor.
-
- Do not use this option for K6/Athlon/Opteron processors!
-
- If unsure, choose "PC-compatible" instead.
-
config X86_INTEL_CE
bool "CE4100 TV platform"
depends on PCI
@@ -1223,6 +1212,10 @@ config HAVE_ARCH_BOOTMEM
def_bool y
depends on X86_32 && NUMA
+config HAVE_ARCH_ALLOC_REMAP
+ def_bool y
+ depends on X86_32 && NUMA
+
config ARCH_HAVE_MEMORY_PRESENT
def_bool y
depends on X86_32 && DISCONTIGMEM
@@ -1231,13 +1224,9 @@ config NEED_NODE_MEMMAP_SIZE
def_bool y
depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
-config HAVE_ARCH_ALLOC_REMAP
- def_bool y
- depends on X86_32 && NUMA
-
config ARCH_FLATMEM_ENABLE
def_bool y
- depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA
+ depends on X86_32 && !NUMA
config ARCH_DISCONTIGMEM_ENABLE
def_bool y
@@ -1247,20 +1236,16 @@ config ARCH_DISCONTIGMEM_DEFAULT
def_bool y
depends on NUMA && X86_32
-config ARCH_PROC_KCORE_TEXT
- def_bool y
- depends on X86_64 && PROC_KCORE
-
-config ARCH_SPARSEMEM_DEFAULT
- def_bool y
- depends on X86_64
-
config ARCH_SPARSEMEM_ENABLE
def_bool y
depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
select SPARSEMEM_STATIC if X86_32
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
+config ARCH_SPARSEMEM_DEFAULT
+ def_bool y
+ depends on X86_64
+
config ARCH_SELECT_MEMORY_MODEL
def_bool y
depends on ARCH_SPARSEMEM_ENABLE
@@ -1269,6 +1254,10 @@ config ARCH_MEMORY_PROBE
def_bool X86_64
depends on MEMORY_HOTPLUG
+config ARCH_PROC_KCORE_TEXT
+ def_bool y
+ depends on X86_64 && PROC_KCORE
+
config ILLEGAL_POINTER_VALUE
hex
default 0 if X86_32
@@ -1703,10 +1692,6 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE
def_bool y
depends on MEMORY_HOTPLUG
-config HAVE_ARCH_EARLY_PFN_TO_NID
- def_bool X86_64
- depends on NUMA
-
config USE_PERCPU_NUMA_NODE_ID
def_bool y
depends on NUMA
@@ -2076,7 +2061,7 @@ config OLPC
depends on !X86_PAE
select GPIOLIB
select OF
- select OF_PROMTREE if PROC_DEVICETREE
+ select OF_PROMTREE
---help---
Add support for detecting the unique features of the OLPC
XO hardware.
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index d161e939df62..6a7cfdf8ff69 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -1,6 +1,4 @@
# Put here option for CPU selection and depending optimization
-if !X86_ELAN
-
choice
prompt "Processor family"
default M686 if X86_32
@@ -203,6 +201,14 @@ config MWINCHIP3D
stores for this CPU, which can increase performance of some
operations.
+config MELAN
+ bool "AMD Elan"
+ depends on X86_32
+ ---help---
+ Select this for an AMD Elan processor.
+
+ Do not use this option for K6/Athlon/Opteron processors!
+
config MGEODEGX1
bool "GeodeGX1"
depends on X86_32
@@ -292,8 +298,6 @@ config X86_GENERIC
This is really intended for distributors who need more
generic optimizations.
-endif
-
#
# Define implied options from the CPU selection here
config X86_INTERNODE_CACHE_SHIFT
@@ -312,7 +316,7 @@ config X86_L1_CACHE_SHIFT
int
default "7" if MPENTIUM4 || MPSC
default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
- default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
+ default "4" if MELAN || M486 || M386 || MGEODEGX1
default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
config X86_XADD
@@ -358,7 +362,7 @@ config X86_POPAD_OK
config X86_ALIGNMENT_16
def_bool y
- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
config X86_INTEL_USERCOPY
def_bool y
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index f2ee1abb1df9..86cee7b749e1 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -37,7 +37,7 @@ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=
$(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
# AMD Elan support
-cflags-$(CONFIG_X86_ELAN) += -march=i486
+cflags-$(CONFIG_MELAN) += -march=i486
# Geode GX1 support
cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 91f3e087cf21..50c0d30e676d 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -207,8 +207,7 @@ extern const char * const x86_power_flags[32];
#define test_cpu_cap(c, bit) \
test_bit(bit, (unsigned long *)((c)->x86_capability))
-#define cpu_has(c, bit) \
- (__builtin_constant_p(bit) && \
+#define REQUIRED_MASK_BIT_SET(bit) \
( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
(((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
(((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
@@ -218,10 +217,16 @@ extern const char * const x86_power_flags[32];
(((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
(((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \
(((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \
- (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \
- ? 1 : \
+ (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )
+
+#define cpu_has(c, bit) \
+ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
test_cpu_cap(c, bit))
+#define this_cpu_has(bit) \
+ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
+ x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
+
#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h
index 72a8b52e7dfd..7775debde112 100644
--- a/arch/x86/include/asm/mach_traps.h
+++ b/arch/x86/include/asm/mach_traps.h
@@ -8,6 +8,7 @@
#include <asm/mc146818rtc.h>
#define NMI_REASON_PORT 0x61
+#define NMI_ENABLE_PORT 0x70 /* Real-Time Clock Address Register as well */
#define NMI_REASON_SERR 0x80
#define NMI_REASON_IOCHK 0x40
@@ -30,12 +31,19 @@ static inline void reassert_nmi(void)
old_reg = current_lock_cmos_reg();
else
lock_cmos(0); /* register doesn't matter here */
- outb(0x8f, 0x70);
- inb(0x71); /* dummy */
- outb(0x0f, 0x70);
- inb(0x71); /* dummy */
+
+ /*
+ * This will cause the NMI output to transition low
+ * then high if there are any pending NMI sources. The
+ * CPU's NMI input logic will then register a new NMI.
+ */
+ outb(0x8f, NMI_ENABLE_PORT);
+ inb(0x71); /* dummy */
+ outb(0x0f, NMI_ENABLE_PORT);
+ inb(0x71); /* dummy */
+
if (old_reg >= 0)
- outb(old_reg, 0x70);
+ outb(old_reg, NMI_ENABLE_PORT);
else
unlock_cmos();
}
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index 288b96f815a6..b3f88d7867c7 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -4,36 +4,13 @@
#ifndef _ASM_X86_MMZONE_64_H
#define _ASM_X86_MMZONE_64_H
-
#ifdef CONFIG_NUMA
#include <linux/mmdebug.h>
-
#include <asm/smp.h>
-/* Simple perfect hash to map physical addresses to node numbers */
-struct memnode {
- int shift;
- unsigned int mapsize;
- s16 *map;
- s16 embedded_map[64 - 8];
-} ____cacheline_aligned; /* total size = 128 bytes */
-extern struct memnode memnode;
-#define memnode_shift memnode.shift
-#define memnodemap memnode.map
-#define memnodemapsize memnode.mapsize
-
extern struct pglist_data *node_data[];
-static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
-{
- unsigned nid;
- VIRTUAL_BUG_ON(!memnodemap);
- nid = memnodemap[addr >> memnode_shift];
- VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
- return nid;
-}
-
#define NODE_DATA(nid) (node_data[nid])
#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index 67763c5d8b4e..9eae7752ae9b 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -35,7 +35,7 @@
#define MODULE_PROC_FAMILY "K7 "
#elif defined CONFIG_MK8
#define MODULE_PROC_FAMILY "K8 "
-#elif defined CONFIG_X86_ELAN
+#elif defined CONFIG_MELAN
#define MODULE_PROC_FAMILY "ELAN "
#elif defined CONFIG_MCRUSOE
#define MODULE_PROC_FAMILY "CRUSOE "
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index c5d3a5abbb9f..24487712e0b1 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -26,15 +26,12 @@ extern void setup_olpc_ofw_pgd(void);
/* check if OFW was detected during boot */
extern bool olpc_ofw_present(void);
+extern void olpc_dt_build_devicetree(void);
+
#else /* !CONFIG_OLPC */
static inline void olpc_ofw_detect(void) { }
static inline void setup_olpc_ofw_pgd(void) { }
-#endif /* !CONFIG_OLPC */
-
-#ifdef CONFIG_OF_PROMTREE
-extern void olpc_dt_build_devicetree(void);
-#else
static inline void olpc_dt_build_devicetree(void) { }
-#endif
+#endif /* !CONFIG_OLPC */
#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index d475b4398d8b..76042d981596 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -542,6 +542,33 @@ do { \
old__; \
})
+static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr,
+ const unsigned long __percpu *addr)
+{
+ unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
+
+ return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0;
+}
+
+static inline int x86_this_cpu_variable_test_bit(int nr,
+ const unsigned long __percpu *addr)
+{
+ int oldbit;
+
+ asm volatile("bt "__percpu_arg(2)",%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit)
+ : "m" (*(unsigned long *)addr), "Ir" (nr));
+
+ return oldbit;
+}
+
+#define x86_this_cpu_test_bit(nr, addr) \
+ (__builtin_constant_p((nr)) \
+ ? x86_this_cpu_constant_test_bit((nr), (addr)) \
+ : x86_this_cpu_variable_test_bit((nr), (addr)))
+
+
#include <asm-generic/percpu.h>
/* We can use this directly for local CPU (faster). */
diff --git a/arch/x86/include/asm/probe_roms.h b/arch/x86/include/asm/probe_roms.h
new file mode 100644
index 000000000000..4950a0b1d09c
--- /dev/null
+++ b/arch/x86/include/asm/probe_roms.h
@@ -0,0 +1,8 @@
+#ifndef _PROBE_ROMS_H_
+#define _PROBE_ROMS_H_
+struct pci_dev;
+
+extern void __iomem *pci_map_biosrom(struct pci_dev *pdev);
+extern void pci_unmap_biosrom(void __iomem *rom);
+extern size_t pci_biosrom_size(struct pci_dev *pdev);
+#endif
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index db8aa19a08a2..03d3a32ace20 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -104,10 +104,10 @@ void *extend_brk(size_t size, size_t align);
type *name; \
RESERVE_BRK(name, sizeof(type) * entries)
+extern void probe_roms(void);
#ifdef __i386__
void __init i386_start_kernel(void);
-extern void probe_roms(void);
#else
void __init x86_64_start_kernel(char *real_mode);
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 910a7084f7f2..8dba76972fd7 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -95,7 +95,6 @@ extern void setup_node_to_cpumask_map(void);
#ifdef CONFIG_X86_32
extern unsigned long node_start_pfn[];
extern unsigned long node_end_pfn[];
-extern unsigned long node_remap_size[];
#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
# define SD_CACHE_NICE_TRIES 1
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 7338ef2218bc..758b379be4c8 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -36,7 +36,7 @@ obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o ldt.o dumpstack.o
obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
-obj-$(CONFIG_X86_32) += probe_roms_32.o
+obj-y += probe_roms.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index cd1ffed4ee22..289e92862fd9 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -177,7 +177,6 @@ static struct clocksource clocksource_apbt = {
.rating = APBT_CLOCKSOURCE_RATING,
.read = apbt_read_clocksource,
.mask = APBT_MASK,
- .shift = APBT_SHIFT,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.resume = apbt_restart_clocksource,
};
@@ -543,14 +542,7 @@ static int apbt_clocksource_register(void)
if (t1 == apbt_read_clocksource(&clocksource_apbt))
panic("APBT counter not counting. APBT disabled\n");
- /*
- * initialize and register APBT clocksource
- * convert that to ns/clock cycle
- * mult = (ns/c) * 2^APBT_SHIFT
- */
- clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
- (unsigned long) apbt_freq, APBT_SHIFT);
- clocksource_register(&clocksource_apbt);
+ clocksource_register_khz(&clocksource_apbt, (u32)apbt_freq*1000);
return 0;
}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index fabf01eff771..2bc503bf9e99 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -505,7 +505,7 @@ static void __cpuinit setup_APIC_timer(void)
{
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
- if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) {
+ if (this_cpu_has(X86_FEATURE_ARAT)) {
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
/* Make LAPIC timer preferrable over percpu HPET */
lapic_clockevent.rating = 150;
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 6273eee5134b..0aced70815f0 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -93,10 +93,6 @@ static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
node_end_pfn[node]);
memory_present(node, node_start_pfn[node], node_end_pfn[node]);
-
- node_remap_size[node] = node_memmap_size_bytes(node,
- node_start_pfn[node],
- node_end_pfn[node]);
}
/*
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index 870e6cc6ad28..0ab9b22557c4 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -43,7 +43,7 @@ config X86_ACPI_CPUFREQ
config ELAN_CPUFREQ
tristate "AMD Elan SC400 and SC410"
select CPU_FREQ_TABLE
- depends on X86_ELAN
+ depends on MELAN
---help---
This adds the CPUFreq driver for AMD Elan SC400 and SC410
processors.
@@ -59,7 +59,7 @@ config ELAN_CPUFREQ
config SC520_CPUFREQ
tristate "AMD Elan SC520"
select CPU_FREQ_TABLE
- depends on X86_ELAN
+ depends on MELAN
---help---
This adds the CPUFreq driver for AMD Elan SC520 processor.
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 6f8c5e9da97f..6b0f4cde7a22 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -355,7 +355,6 @@ static void notify_thresholds(__u64 msr_val)
static void intel_thermal_interrupt(void)
{
__u64 msr_val;
- struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
@@ -367,19 +366,19 @@ static void intel_thermal_interrupt(void)
CORE_LEVEL) != 0)
mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
- if (cpu_has(c, X86_FEATURE_PLN))
+ if (this_cpu_has(X86_FEATURE_PLN))
if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
CORE_LEVEL) != 0)
mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
- if (cpu_has(c, X86_FEATURE_PTS)) {
+ if (this_cpu_has(X86_FEATURE_PTS)) {
rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
PACKAGE_LEVEL) != 0)
mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
- if (cpu_has(c, X86_FEATURE_PLN))
+ if (this_cpu_has(X86_FEATURE_PLN))
if (therm_throt_process(msr_val &
PACKAGE_THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index c2520e178d32..8ff882fdb1c0 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -468,7 +468,7 @@ static struct p4_event_bind p4_event_bind_map[] = {
.opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
.escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
.escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
+ P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_X87_ASSIST] = {
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index d6d6bb361931..3bb08509a7a1 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -23,7 +23,6 @@
static void __init i386_default_early_setup(void)
{
/* Initialize 32bit specific setup functions */
- x86_init.resources.probe_roms = probe_roms;
x86_init.resources.reserve_resources = i386_reserve_resources;
x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 2dfd31597443..212fe6590aab 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -188,8 +188,6 @@ static struct clocksource pit_cs = {
.rating = 110,
.read = pit_read,
.mask = CLOCKSOURCE_MASK(32),
- .mult = 0,
- .shift = 20,
};
static int __init init_pit_clocksource(void)
@@ -205,9 +203,7 @@ static int __init init_pit_clocksource(void)
pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
return 0;
- pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift);
-
- return clocksource_register(&pit_cs);
+ return clocksource_register_hz(&pit_cs, CLOCK_TICK_RATE);
}
arch_initcall(init_pit_clocksource);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f98d3eafe07a..6389a6bca11b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -26,8 +26,6 @@
#include <asm/x86_init.h>
#include <asm/reboot.h>
-#define KVM_SCALE 22
-
static int kvmclock = 1;
static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
@@ -120,8 +118,6 @@ static struct clocksource kvm_clock = {
.read = kvm_clock_get_cycles,
.rating = 400,
.mask = CLOCKSOURCE_MASK(64),
- .mult = 1 << KVM_SCALE,
- .shift = KVM_SCALE,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -203,7 +199,7 @@ void __init kvmclock_init(void)
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
kvm_get_preset_lpj();
- clocksource_register(&kvm_clock);
+ clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
pv_info.paravirt_enabled = 1;
pv_info.name = "KVM";
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 5a532ce646bf..6f9bfffb2720 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -715,17 +715,15 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
}
}
-static int
+static int __init
check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
{
- int ret = 0;
-
if (!mpc_new_phys || count <= mpc_new_length) {
WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
return -1;
}
- return ret;
+ return 0;
}
#else /* CONFIG_X86_IO_APIC */
static
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms.c
index 071e7fea42e5..ba0a4cce53be 100644
--- a/arch/x86/kernel/probe_roms_32.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -73,6 +73,107 @@ static struct resource video_rom_resource = {
.flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
};
+/* does this oprom support the given pci device, or any of the devices
+ * that the driver supports?
+ */
+static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
+{
+ struct pci_driver *drv = pdev->driver;
+ const struct pci_device_id *id;
+
+ if (pdev->vendor == vendor && pdev->device == device)
+ return true;
+
+ for (id = drv ? drv->id_table : NULL; id && id->vendor; id++)
+ if (id->vendor == vendor && id->device == device)
+ break;
+
+ return id && id->vendor;
+}
+
+static bool probe_list(struct pci_dev *pdev, unsigned short vendor,
+ const unsigned char *rom_list)
+{
+ unsigned short device;
+
+ do {
+ if (probe_kernel_address(rom_list, device) != 0)
+ device = 0;
+
+ if (device && match_id(pdev, vendor, device))
+ break;
+
+ rom_list += 2;
+ } while (device);
+
+ return !!device;
+}
+
+static struct resource *find_oprom(struct pci_dev *pdev)
+{
+ struct resource *oprom = NULL;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) {
+ struct resource *res = &adapter_rom_resources[i];
+ unsigned short offset, vendor, device, list, rev;
+ const unsigned char *rom;
+
+ if (res->end == 0)
+ break;
+
+ rom = isa_bus_to_virt(res->start);
+ if (probe_kernel_address(rom + 0x18, offset) != 0)
+ continue;
+
+ if (probe_kernel_address(rom + offset + 0x4, vendor) != 0)
+ continue;
+
+ if (probe_kernel_address(rom + offset + 0x6, device) != 0)
+ continue;
+
+ if (match_id(pdev, vendor, device)) {
+ oprom = res;
+ break;
+ }
+
+ if (probe_kernel_address(rom + offset + 0x8, list) == 0 &&
+ probe_kernel_address(rom + offset + 0xc, rev) == 0 &&
+ rev >= 3 && list &&
+ probe_list(pdev, vendor, rom + offset + list)) {
+ oprom = res;
+ break;
+ }
+ }
+
+ return oprom;
+}
+
+void *pci_map_biosrom(struct pci_dev *pdev)
+{
+ struct resource *oprom = find_oprom(pdev);
+
+ if (!oprom)
+ return NULL;
+
+ return ioremap(oprom->start, resource_size(oprom));
+}
+EXPORT_SYMBOL(pci_map_biosrom);
+
+void pci_unmap_biosrom(void __iomem *image)
+{
+ iounmap(image);
+}
+EXPORT_SYMBOL(pci_unmap_biosrom);
+
+size_t pci_biosrom_size(struct pci_dev *pdev)
+{
+ struct resource *oprom = find_oprom(pdev);
+
+ return oprom ? resource_size(oprom) : 0;
+}
+EXPORT_SYMBOL(pci_biosrom_size);
+
#define ROMSIGNATURE 0xaa55
static int __init romsignature(const unsigned char *rom)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d46cbe46b7ab..88a90a977f8e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -449,7 +449,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
if (!need_resched()) {
- if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
+ if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
__monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -465,7 +465,7 @@ static void mwait_idle(void)
if (!need_resched()) {
trace_power_start(POWER_CSTATE, 1, smp_processor_id());
trace_cpu_idle(1, smp_processor_id());
- if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
+ if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
__monitor((void *)&current_thread_info()->flags, 0, 0);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 08c44b08bf5b..0c016f727695 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -36,7 +36,7 @@ EXPORT_SYMBOL(pm_power_off);
static const struct desc_ptr no_idt = {};
static int reboot_mode;
-enum reboot_type reboot_type = BOOT_KBD;
+enum reboot_type reboot_type = BOOT_ACPI;
int reboot_force;
#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
@@ -478,9 +478,24 @@ void __attribute__((weak)) mach_reboot_fixups(void)
{
}
+/*
+ * Windows compatible x86 hardware expects the following on reboot:
+ *
+ * 1) If the FADT has the ACPI reboot register flag set, try it
+ * 2) If still alive, write to the keyboard controller
+ * 3) If still alive, write to the ACPI reboot register again
+ * 4) If still alive, write to the keyboard controller again
+ *
+ * If the machine is still alive at this stage, it gives up. We default to
+ * following the same pattern, except that if we're still alive after (4) we'll
+ * try to force a triple fault and then cycle between hitting the keyboard
+ * controller and doing that
+ */
static void native_machine_emergency_restart(void)
{
int i;
+ int attempt = 0;
+ int orig_reboot_type = reboot_type;
if (reboot_emergency)
emergency_vmx_disable_all();
@@ -502,6 +517,13 @@ static void native_machine_emergency_restart(void)
outb(0xfe, 0x64); /* pulse reset low */
udelay(50);
}
+ if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
+ attempt = 1;
+ reboot_type = BOOT_ACPI;
+ } else {
+ reboot_type = BOOT_TRIPLE;
+ }
+ break;
case BOOT_TRIPLE:
load_idt(&no_idt);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 513deac7228d..013e7eba83bb 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -194,14 +194,13 @@ static void native_stop_other_cpus(int wait)
}
/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
+ * Reschedule call back.
*/
void smp_reschedule_interrupt(struct pt_regs *regs)
{
ack_APIC_irq();
inc_irq_stat(irq_resched_count);
+ scheduler_ipi();
/*
* KVM uses this interrupt to force a cpu out of guest mode
*/
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8ed8908cc9f7..5a927576b568 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1355,9 +1355,9 @@ static inline void mwait_play_dead(void)
void *mwait_ptr;
struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
- if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)))
+ if (!this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c))
return;
- if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH))
+ if (!this_cpu_has(X86_FEATURE_CLFLSH))
return;
if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
return;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index c11514e9128b..6eee0828e327 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -35,7 +35,7 @@ void iommu_shutdown_noop(void) { }
struct x86_init_ops x86_init __initdata = {
.resources = {
- .probe_roms = x86_init_noop,
+ .probe_roms = probe_roms,
.reserve_resources = reserve_standard_io_resources,
.memory_setup = default_machine_specific_memory_setup,
},
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 1cd608973ce5..4e0068ead6b4 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -913,8 +913,6 @@ static struct clocksource lguest_clock = {
.rating = 200,
.read = lguest_clock_read,
.mask = CLOCKSOURCE_MASK(64),
- .mult = 1 << 22,
- .shift = 22,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -997,7 +995,7 @@ static void lguest_time_init(void)
/* Set up the timer interrupt (0) to go to our simple timer routine */
irq_set_handler(0, lguest_time_irq);
- clocksource_register(&lguest_clock);
+ clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
/* We can't set cpumask in the initializer: damn C limitations! Set it
* here and register our timer device. */
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index bde3906420df..c757c0a3b529 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -104,13 +104,9 @@ extern unsigned long highend_pfn, highstart_pfn;
#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
-unsigned long node_remap_size[MAX_NUMNODES];
static void *node_remap_start_vaddr[MAX_NUMNODES];
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-static unsigned long kva_start_pfn;
-static unsigned long kva_pages;
-
int __cpuinit numa_cpu_node(int cpu)
{
return apic->x86_32_numa_cpu_node(cpu);
@@ -129,7 +125,6 @@ int __init get_memcfg_numa_flat(void)
node_end_pfn[0] = max_pfn;
memblock_x86_register_active_regions(0, 0, max_pfn);
memory_present(0, 0, max_pfn);
- node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
/* Indicate there is one node available. */
nodes_clear(node_online_map);
@@ -164,9 +159,8 @@ static void __init allocate_pgdat(int nid)
{
char buf[16];
- if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
- NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
- else {
+ NODE_DATA(nid) = alloc_remap(nid, ALIGN(sizeof(pg_data_t), PAGE_SIZE));
+ if (!NODE_DATA(nid)) {
unsigned long pgdat_phys;
pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT,
max_pfn_mapped<<PAGE_SHIFT,
@@ -182,25 +176,38 @@ static void __init allocate_pgdat(int nid)
}
/*
- * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
- * virtual address space (KVA) is reserved and portions of nodes are mapped
- * using it. This is to allow node-local memory to be allocated for
- * structures that would normally require ZONE_NORMAL. The memory is
- * allocated with alloc_remap() and callers should be prepared to allocate
- * from the bootmem allocator instead.
+ * Remap memory allocator
*/
static unsigned long node_remap_start_pfn[MAX_NUMNODES];
static void *node_remap_end_vaddr[MAX_NUMNODES];
static void *node_remap_alloc_vaddr[MAX_NUMNODES];
-static unsigned long node_remap_offset[MAX_NUMNODES];
+/**
+ * alloc_remap - Allocate remapped memory
+ * @nid: NUMA node to allocate memory from
+ * @size: The size of allocation
+ *
+ * Allocate @size bytes from the remap area of NUMA node @nid. The
+ * size of the remap area is predetermined by init_alloc_remap() and
+ * only the callers considered there should call this function. For
+ * more info, please read the comment on top of init_alloc_remap().
+ *
+ * The caller must be ready to handle allocation failure from this
+ * function and fall back to regular memory allocator in such cases.
+ *
+ * CONTEXT:
+ * Single CPU early boot context.
+ *
+ * RETURNS:
+ * Pointer to the allocated memory on success, %NULL on failure.
+ */
void *alloc_remap(int nid, unsigned long size)
{
void *allocation = node_remap_alloc_vaddr[nid];
size = ALIGN(size, L1_CACHE_BYTES);
- if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
+ if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
return NULL;
node_remap_alloc_vaddr[nid] += size;
@@ -209,26 +216,6 @@ void *alloc_remap(int nid, unsigned long size)
return allocation;
}
-static void __init remap_numa_kva(void)
-{
- void *vaddr;
- unsigned long pfn;
- int node;
-
- for_each_online_node(node) {
- printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
- for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
- vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
- printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
- (unsigned long)vaddr,
- node_remap_start_pfn[node] + pfn);
- set_pmd_pfn((ulong) vaddr,
- node_remap_start_pfn[node] + pfn,
- PAGE_KERNEL_LARGE);
- }
- }
-}
-
#ifdef CONFIG_HIBERNATION
/**
* resume_map_numa_kva - add KVA mapping to the temporary page tables created
@@ -240,15 +227,16 @@ void resume_map_numa_kva(pgd_t *pgd_base)
int node;
for_each_online_node(node) {
- unsigned long start_va, start_pfn, size, pfn;
+ unsigned long start_va, start_pfn, nr_pages, pfn;
start_va = (unsigned long)node_remap_start_vaddr[node];
start_pfn = node_remap_start_pfn[node];
- size = node_remap_size[node];
+ nr_pages = (node_remap_end_vaddr[node] -
+ node_remap_start_vaddr[node]) >> PAGE_SHIFT;
printk(KERN_DEBUG "%s: node %d\n", __func__, node);
- for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) {
+ for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
pgd_t *pgd = pgd_base + pgd_index(vaddr);
pud_t *pud = pud_offset(pgd, vaddr);
@@ -264,132 +252,102 @@ void resume_map_numa_kva(pgd_t *pgd_base)
}
#endif
-static __init unsigned long calculate_numa_remap_pages(void)
+/**
+ * init_alloc_remap - Initialize remap allocator for a NUMA node
+ * @nid: NUMA node to initizlie remap allocator for
+ *
+ * NUMA nodes may end up without any lowmem. As allocating pgdat and
+ * memmap on a different node with lowmem is inefficient, a special
+ * remap allocator is implemented which can be used by alloc_remap().
+ *
+ * For each node, the amount of memory which will be necessary for
+ * pgdat and memmap is calculated and two memory areas of the size are
+ * allocated - one in the node and the other in lowmem; then, the area
+ * in the node is remapped to the lowmem area.
+ *
+ * As pgdat and memmap must be allocated in lowmem anyway, this
+ * doesn't waste lowmem address space; however, the actual lowmem
+ * which gets remapped over is wasted. The amount shouldn't be
+ * problematic on machines this feature will be used.
+ *
+ * Initialization failure isn't fatal. alloc_remap() is used
+ * opportunistically and the callers will fall back to other memory
+ * allocation mechanisms on failure.
+ */
+static __init void init_alloc_remap(int nid)
{
- int nid;
- unsigned long size, reserve_pages = 0;
+ unsigned long size, pfn;
+ u64 node_pa, remap_pa;
+ void *remap_va;
- for_each_online_node(nid) {
- u64 node_kva_target;
- u64 node_kva_final;
-
- /*
- * The acpi/srat node info can show hot-add memroy zones
- * where memory could be added but not currently present.
- */
- printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
- nid, node_start_pfn[nid], node_end_pfn[nid]);
- if (node_start_pfn[nid] > max_pfn)
- continue;
- if (!node_end_pfn[nid])
- continue;
- if (node_end_pfn[nid] > max_pfn)
- node_end_pfn[nid] = max_pfn;
-
- /* ensure the remap includes space for the pgdat. */
- size = node_remap_size[nid] + sizeof(pg_data_t);
-
- /* convert size to large (pmd size) pages, rounding up */
- size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
- /* now the roundup is correct, convert to PAGE_SIZE pages */
- size = size * PTRS_PER_PTE;
-
- node_kva_target = round_down(node_end_pfn[nid] - size,
- PTRS_PER_PTE);
- node_kva_target <<= PAGE_SHIFT;
- do {
- node_kva_final = memblock_find_in_range(node_kva_target,
- ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
- ((u64)size)<<PAGE_SHIFT,
- LARGE_PAGE_BYTES);
- node_kva_target -= LARGE_PAGE_BYTES;
- } while (node_kva_final == MEMBLOCK_ERROR &&
- (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
-
- if (node_kva_final == MEMBLOCK_ERROR)
- panic("Can not get kva ram\n");
-
- node_remap_size[nid] = size;
- node_remap_offset[nid] = reserve_pages;
- reserve_pages += size;
- printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
- " node %d at %llx\n",
- size, nid, node_kva_final>>PAGE_SHIFT);
-
- /*
- * prevent kva address below max_low_pfn want it on system
- * with less memory later.
- * layout will be: KVA address , KVA RAM
- *
- * we are supposed to only record the one less then max_low_pfn
- * but we could have some hole in high memory, and it will only
- * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
- * to use it as free.
- * So memblock_x86_reserve_range here, hope we don't run out of that array
- */
- memblock_x86_reserve_range(node_kva_final,
- node_kva_final+(((u64)size)<<PAGE_SHIFT),
- "KVA RAM");
-
- node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
- }
- printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
- reserve_pages);
- return reserve_pages;
-}
+ /*
+ * The acpi/srat node info can show hot-add memroy zones where
+ * memory could be added but not currently present.
+ */
+ printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
+ nid, node_start_pfn[nid], node_end_pfn[nid]);
+ if (node_start_pfn[nid] > max_pfn)
+ return;
+ if (!node_end_pfn[nid])
+ return;
+ if (node_end_pfn[nid] > max_pfn)
+ node_end_pfn[nid] = max_pfn;
-static void init_remap_allocator(int nid)
-{
- node_remap_start_vaddr[nid] = pfn_to_kaddr(
- kva_start_pfn + node_remap_offset[nid]);
- node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
- (node_remap_size[nid] * PAGE_SIZE);
- node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
- ALIGN(sizeof(pg_data_t), PAGE_SIZE);
-
- printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
- (ulong) node_remap_start_vaddr[nid],
- (ulong) node_remap_end_vaddr[nid]);
+ /* calculate the necessary space aligned to large page size */
+ size = node_memmap_size_bytes(nid, node_start_pfn[nid],
+ min(node_end_pfn[nid], max_pfn));
+ size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+ size = ALIGN(size, LARGE_PAGE_BYTES);
+
+ /* allocate node memory and the lowmem remap area */
+ node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT,
+ (u64)node_end_pfn[nid] << PAGE_SHIFT,
+ size, LARGE_PAGE_BYTES);
+ if (node_pa == MEMBLOCK_ERROR) {
+ pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
+ size, nid);
+ return;
+ }
+ memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM");
+
+ remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
+ max_low_pfn << PAGE_SHIFT,
+ size, LARGE_PAGE_BYTES);
+ if (remap_pa == MEMBLOCK_ERROR) {
+ pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
+ size, nid);
+ memblock_x86_free_range(node_pa, node_pa + size);
+ return;
+ }
+ memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG");
+ remap_va = phys_to_virt(remap_pa);
+
+ /* perform actual remap */
+ for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
+ set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
+ (node_pa >> PAGE_SHIFT) + pfn,
+ PAGE_KERNEL_LARGE);
+
+ /* initialize remap allocator parameters */
+ node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
+ node_remap_start_vaddr[nid] = remap_va;
+ node_remap_end_vaddr[nid] = remap_va + size;
+ node_remap_alloc_vaddr[nid] = remap_va;
+
+ printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
+ nid, node_pa, node_pa + size, remap_va, remap_va + size);
}
void __init initmem_init(void)
{
int nid;
- long kva_target_pfn;
-
- /*
- * When mapping a NUMA machine we allocate the node_mem_map arrays
- * from node local memory. They are then mapped directly into KVA
- * between zone normal and vmalloc space. Calculate the size of
- * this space and use it to adjust the boundary between ZONE_NORMAL
- * and ZONE_HIGHMEM.
- */
get_memcfg_numa();
numa_init_array();
- kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
-
- kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
- do {
- kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT,
- max_low_pfn<<PAGE_SHIFT,
- kva_pages<<PAGE_SHIFT,
- PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
- kva_target_pfn -= PTRS_PER_PTE;
- } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn);
-
- if (kva_start_pfn == MEMBLOCK_ERROR)
- panic("Can not get kva space\n");
-
- printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
- kva_start_pfn, max_low_pfn);
- printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
+ for_each_online_node(nid)
+ init_alloc_remap(nid);
- /* avoid clash with initrd */
- memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT,
- (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
- "KVA PG");
#ifdef CONFIG_HIGHMEM
highstart_pfn = highend_pfn = max_pfn;
if (max_pfn > max_low_pfn)
@@ -409,12 +367,8 @@ void __init initmem_init(void)
printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
(ulong) pfn_to_kaddr(max_low_pfn));
- for_each_online_node(nid) {
- init_remap_allocator(nid);
-
+ for_each_online_node(nid)
allocate_pgdat(nid);
- }
- remap_numa_kva();
printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
(ulong) pfn_to_kaddr(highstart_pfn));
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index e8c00cc72033..13f5b068e8c2 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -28,125 +28,10 @@ EXPORT_SYMBOL(node_data);
nodemask_t numa_nodes_parsed __initdata;
-struct memnode memnode;
-
-static unsigned long __initdata nodemap_addr;
-static unsigned long __initdata nodemap_size;
-
static struct numa_meminfo numa_meminfo __initdata;
-
static int numa_distance_cnt;
static u8 *numa_distance;
-/*
- * Given a shift value, try to populate memnodemap[]
- * Returns :
- * 1 if OK
- * 0 if memnodmap[] too small (of shift too small)
- * -1 if node overlap or lost ram (shift too big)
- */
-static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
-{
- unsigned long addr, end;
- int i, res = -1;
-
- memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
- for (i = 0; i < mi->nr_blks; i++) {
- addr = mi->blk[i].start;
- end = mi->blk[i].end;
- if (addr >= end)
- continue;
- if ((end >> shift) >= memnodemapsize)
- return 0;
- do {
- if (memnodemap[addr >> shift] != NUMA_NO_NODE)
- return -1;
- memnodemap[addr >> shift] = mi->blk[i].nid;
- addr += (1UL << shift);
- } while (addr < end);
- res = 1;
- }
- return res;
-}
-
-static int __init allocate_cachealigned_memnodemap(void)
-{
- unsigned long addr;
-
- memnodemap = memnode.embedded_map;
- if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
- return 0;
-
- addr = 0x8000;
- nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
- nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
- nodemap_size, L1_CACHE_BYTES);
- if (nodemap_addr == MEMBLOCK_ERROR) {
- printk(KERN_ERR
- "NUMA: Unable to allocate Memory to Node hash map\n");
- nodemap_addr = nodemap_size = 0;
- return -1;
- }
- memnodemap = phys_to_virt(nodemap_addr);
- memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
-
- printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
- nodemap_addr, nodemap_addr + nodemap_size);
- return 0;
-}
-
-/*
- * The LSB of all start and end addresses in the node map is the value of the
- * maximum possible shift.
- */
-static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
-{
- int i, nodes_used = 0;
- unsigned long start, end;
- unsigned long bitfield = 0, memtop = 0;
-
- for (i = 0; i < mi->nr_blks; i++) {
- start = mi->blk[i].start;
- end = mi->blk[i].end;
- if (start >= end)
- continue;
- bitfield |= start;
- nodes_used++;
- if (end > memtop)
- memtop = end;
- }
- if (nodes_used <= 1)
- i = 63;
- else
- i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
- memnodemapsize = (memtop >> i)+1;
- return i;
-}
-
-static int __init compute_hash_shift(const struct numa_meminfo *mi)
-{
- int shift;
-
- shift = extract_lsb_from_nodes(mi);
- if (allocate_cachealigned_memnodemap())
- return -1;
- printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
- shift);
-
- if (populate_memnodemap(mi, shift) != 1) {
- printk(KERN_INFO "Your memory is not aligned you need to "
- "rebuild your kernel with a bigger NODEMAPSIZE "
- "shift=%d\n", shift);
- return -1;
- }
- return shift;
-}
-
-int __meminit __early_pfn_to_nid(unsigned long pfn)
-{
- return phys_to_nid(pfn << PAGE_SHIFT);
-}
-
static void * __init early_node_mem(int nodeid, unsigned long start,
unsigned long end, unsigned long size,
unsigned long align)
@@ -270,7 +155,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
nodedata_phys + pgdat_size - 1);
- nid = phys_to_nid(nodedata_phys);
+ nid = early_pfn_to_nid(nodedata_phys >> PAGE_SHIFT);
if (nid != nodeid)
printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
@@ -527,12 +412,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
if (WARN_ON(nodes_empty(node_possible_map)))
return -EINVAL;
- memnode_shift = compute_hash_shift(mi);
- if (memnode_shift < 0) {
- printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
- return -EINVAL;
- }
-
for (i = 0; i < mi->nr_blks; i++)
memblock_x86_register_active_regions(mi->blk[i].nid,
mi->blk[i].start >> PAGE_SHIFT,
@@ -626,17 +505,13 @@ static int __init numa_init(int (*init_func)(void))
void __init initmem_init(void)
{
- int ret;
-
if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
- ret = numa_init(x86_acpi_numa_init);
- if (!ret)
+ if (!numa_init(x86_acpi_numa_init))
return;
#endif
#ifdef CONFIG_AMD_NUMA
- ret = numa_init(amd_numa_init);
- if (!ret)
+ if (!numa_init(amd_numa_init))
return;
#endif
}
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 364f36bdfad8..ae20046a9e98 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -278,7 +278,6 @@ int __init get_memcfg_from_srat(void)
unsigned long end = min(node_end_pfn[nid], max_pfn);
memory_present(nid, start, end);
- node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
}
return 1;
out_fail:
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
index c2a8cab65e5d..81c5e2165c24 100644
--- a/arch/x86/platform/olpc/Makefile
+++ b/arch/x86/platform/olpc/Makefile
@@ -1,4 +1,2 @@
-obj-$(CONFIG_OLPC) += olpc.o
+obj-$(CONFIG_OLPC) += olpc.o olpc_ofw.o olpc_dt.o
obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
-obj-$(CONFIG_OLPC) += olpc_ofw.o
-obj-$(CONFIG_OF_PROMTREE) += olpc_dt.o
diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c
index edaf3fe8dc5e..0060fd59ea00 100644
--- a/arch/x86/platform/olpc/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -18,6 +18,7 @@
#include <linux/io.h>
#include <linux/string.h>
#include <linux/platform_device.h>
+#include <linux/of.h>
#include <asm/geode.h>
#include <asm/setup.h>
@@ -187,41 +188,43 @@ err:
}
EXPORT_SYMBOL_GPL(olpc_ec_cmd);
-static bool __init check_ofw_architecture(void)
+static bool __init check_ofw_architecture(struct device_node *root)
{
- size_t propsize;
- char olpc_arch[5];
- const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 };
- void *res[] = { &propsize };
+ const char *olpc_arch;
+ int propsize;
- if (olpc_ofw("getprop", args, res)) {
- printk(KERN_ERR "ofw: getprop call failed!\n");
- return false;
- }
+ olpc_arch = of_get_property(root, "architecture", &propsize);
return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0;
}
-static u32 __init get_board_revision(void)
+static u32 __init get_board_revision(struct device_node *root)
{
- size_t propsize;
- __be32 rev;
- const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 };
- void *res[] = { &propsize };
-
- if (olpc_ofw("getprop", args, res) || propsize != 4) {
- printk(KERN_ERR "ofw: getprop call failed!\n");
- return cpu_to_be32(0);
- }
- return be32_to_cpu(rev);
+ int propsize;
+ const __be32 *rev;
+
+ rev = of_get_property(root, "board-revision-int", &propsize);
+ if (propsize != 4)
+ return 0;
+
+ return be32_to_cpu(*rev);
}
static bool __init platform_detect(void)
{
- if (!check_ofw_architecture())
+ struct device_node *root = of_find_node_by_path("/");
+ bool success;
+
+ if (!root)
return false;
- olpc_platform_info.flags |= OLPC_F_PRESENT;
- olpc_platform_info.boardrev = get_board_revision();
- return true;
+
+ success = check_ofw_architecture(root);
+ if (success) {
+ olpc_platform_info.boardrev = get_board_revision(root);
+ olpc_platform_info.flags |= OLPC_F_PRESENT;
+ }
+
+ of_node_put(root);
+ return success;
}
static int __init add_xo1_platform_devices(void)
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
index 044bda5b3174..d39f63d017d2 100644
--- a/arch/x86/platform/olpc/olpc_dt.c
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -19,7 +19,9 @@
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/of.h>
+#include <linux/of_platform.h>
#include <linux/of_pdt.h>
+#include <asm/olpc.h>
#include <asm/olpc_ofw.h>
static phandle __init olpc_dt_getsibling(phandle node)
@@ -180,3 +182,20 @@ void __init olpc_dt_build_devicetree(void)
pr_info("PROM DT: Built device tree with %u bytes of memory.\n",
prom_early_allocated);
}
+
+/* A list of DT node/bus matches that we want to expose as platform devices */
+static struct of_device_id __initdata of_ids[] = {
+ { .compatible = "olpc,xo1-battery" },
+ { .compatible = "olpc,xo1-dcon" },
+ { .compatible = "olpc,xo1-rtc" },
+ {},
+};
+
+static int __init olpc_create_platform_devices(void)
+{
+ if (machine_is_olpc())
+ return of_platform_bus_probe(NULL, of_ids, NULL);
+ else
+ return 0;
+}
+device_initcall(olpc_create_platform_devices);
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 9daf5d1af9f1..0eb90184515f 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -40,7 +40,6 @@ static struct clocksource clocksource_uv = {
.rating = 400,
.read = uv_read_rtc,
.mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
- .shift = 10,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -372,14 +371,11 @@ static __init int uv_rtc_setup_clock(void)
if (!is_uv_system())
return -ENODEV;
- clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
- clocksource_uv.shift);
-
/* If single blade, prefer tsc */
if (uv_num_possible_blades() == 1)
clocksource_uv.rating = 250;
- rc = clocksource_register(&clocksource_uv);
+ rc = clocksource_register_hz(&clocksource_uv, sn_rtc_cycles_per_second);
if (rc)
printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
else
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 30612441ed99..762b46ab14d5 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -46,13 +46,12 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
+ * Reschedule call back.
*/
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
{
inc_irq_stat(irq_resched_count);
+ scheduler_ipi();
return IRQ_HANDLED;
}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 2e2d370a47b1..c532d280ce36 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -26,8 +26,6 @@
#include "xen-ops.h"
-#define XEN_SHIFT 22
-
/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP 100000
#define NS_PER_TICK (1000000000LL / HZ)
@@ -211,8 +209,6 @@ static struct clocksource xen_clocksource __read_mostly = {
.rating = 400,
.read = xen_clocksource_get_cycles,
.mask = ~0,
- .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
- .shift = XEN_SHIFT,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -448,7 +444,7 @@ static __init void xen_time_init(void)
int cpu = smp_processor_id();
struct timespec tp;
- clocksource_register(&xen_clocksource);
+ clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
/* Successfully turned off 100Hz tick, so we have the
diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c
index ad3501739563..605a2954ef17 100644
--- a/drivers/acpi/processor_throttling.c
+++ b/drivers/acpi/processor_throttling.c
@@ -710,20 +710,14 @@ static int acpi_processor_get_throttling_fadt(struct acpi_processor *pr)
}
#ifdef CONFIG_X86
-static int acpi_throttling_rdmsr(struct acpi_processor *pr,
- u64 *value)
+static int acpi_throttling_rdmsr(u64 *value)
{
- struct cpuinfo_x86 *c;
u64 msr_high, msr_low;
- unsigned int cpu;
u64 msr = 0;
int ret = -1;
- cpu = pr->id;
- c = &cpu_data(cpu);
-
- if ((c->x86_vendor != X86_VENDOR_INTEL) ||
- !cpu_has(c, X86_FEATURE_ACPI)) {
+ if ((this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_INTEL) ||
+ !this_cpu_has(X86_FEATURE_ACPI)) {
printk(KERN_ERR PREFIX
"HARDWARE addr space,NOT supported yet\n");
} else {
@@ -738,18 +732,13 @@ static int acpi_throttling_rdmsr(struct acpi_processor *pr,
return ret;
}
-static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value)
+static int acpi_throttling_wrmsr(u64 value)
{
- struct cpuinfo_x86 *c;
- unsigned int cpu;
int ret = -1;
u64 msr;
- cpu = pr->id;
- c = &cpu_data(cpu);
-
- if ((c->x86_vendor != X86_VENDOR_INTEL) ||
- !cpu_has(c, X86_FEATURE_ACPI)) {
+ if ((this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_INTEL) ||
+ !this_cpu_has(X86_FEATURE_ACPI)) {
printk(KERN_ERR PREFIX
"HARDWARE addr space,NOT supported yet\n");
} else {
@@ -761,15 +750,14 @@ static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value)
return ret;
}
#else
-static int acpi_throttling_rdmsr(struct acpi_processor *pr,
- u64 *value)
+static int acpi_throttling_rdmsr(u64 *value)
{
printk(KERN_ERR PREFIX
"HARDWARE addr space,NOT supported yet\n");
return -1;
}
-static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value)
+static int acpi_throttling_wrmsr(u64 value)
{
printk(KERN_ERR PREFIX
"HARDWARE addr space,NOT supported yet\n");
@@ -801,7 +789,7 @@ static int acpi_read_throttling_status(struct acpi_processor *pr,
ret = 0;
break;
case ACPI_ADR_SPACE_FIXED_HARDWARE:
- ret = acpi_throttling_rdmsr(pr, value);
+ ret = acpi_throttling_rdmsr(value);
break;
default:
printk(KERN_ERR PREFIX "Unknown addr space %d\n",
@@ -834,7 +822,7 @@ static int acpi_write_throttling_state(struct acpi_processor *pr,
ret = 0;
break;
case ACPI_ADR_SPACE_FIXED_HARDWARE:
- ret = acpi_throttling_wrmsr(pr, value);
+ ret = acpi_throttling_wrmsr(value);
break;
default:
printk(KERN_ERR PREFIX "Unknown addr space %d\n",
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 7066e801b9d3..051474c65b78 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -84,8 +84,6 @@ static struct clocksource clocksource_hpet = {
.rating = 250,
.read = read_hpet,
.mask = CLOCKSOURCE_MASK(64),
- .mult = 0, /* to be calculated */
- .shift = 10,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
static struct clocksource *hpet_clocksource;
@@ -934,9 +932,7 @@ int hpet_alloc(struct hpet_data *hdp)
if (!hpet_clocksource) {
hpet_mctr = (void __iomem *)&hpetp->hp_hpet->hpet_mc;
CLKSRC_FSYS_MMIO_SET(clocksource_hpet.fsys_mmio, hpet_mctr);
- clocksource_hpet.mult = clocksource_hz2mult(hpetp->hp_tick_freq,
- clocksource_hpet.shift);
- clocksource_register(&clocksource_hpet);
+ clocksource_register_hz(&clocksource_hpet, hpetp->hp_tick_freq);
hpetp->hp_clocksource = &clocksource_hpet;
hpet_clocksource = &clocksource_hpet;
}
diff --git a/drivers/clocksource/cyclone.c b/drivers/clocksource/cyclone.c
index 64e528e8bfa6..72f811f73e9c 100644
--- a/drivers/clocksource/cyclone.c
+++ b/drivers/clocksource/cyclone.c
@@ -29,8 +29,6 @@ static struct clocksource clocksource_cyclone = {
.rating = 250,
.read = read_cyclone,
.mask = CYCLONE_TIMER_MASK,
- .mult = 10,
- .shift = 0,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -108,12 +106,8 @@ static int __init init_cyclone_clocksource(void)
}
cyclone_ptr = cyclone_timer;
- /* sort out mult/shift values: */
- clocksource_cyclone.shift = 22;
- clocksource_cyclone.mult = clocksource_hz2mult(CYCLONE_TIMER_FREQ,
- clocksource_cyclone.shift);
-
- return clocksource_register(&clocksource_cyclone);
+ return clocksource_register_hz(&clocksource_cyclone,
+ CYCLONE_TIMER_FREQ);
}
arch_initcall(init_cyclone_clocksource);
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index c37b21ad5a3b..94c1f38e922a 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -161,7 +161,7 @@ struct clocksource {
/*
* First part of structure is read mostly
*/
- char *name;
+ const char *name;
struct list_head list;
int rating;
cycle_t (*read)(struct clocksource *cs);
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 94b48bd40dd7..c75471db576e 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -51,7 +51,7 @@ struct mutex {
spinlock_t wait_lock;
struct list_head wait_list;
#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
- struct thread_info *owner;
+ struct task_struct *owner;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
const char *name;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 18d63cea2848..171ba24b08a7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -360,7 +360,7 @@ extern signed long schedule_timeout_interruptible(signed long timeout);
extern signed long schedule_timeout_killable(signed long timeout);
extern signed long schedule_timeout_uninterruptible(signed long timeout);
asmlinkage void schedule(void);
-extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
+extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner);
struct nsproxy;
struct user_namespace;
@@ -868,6 +868,7 @@ static inline int sd_power_saving_flags(void)
struct sched_group {
struct sched_group *next; /* Must be a circular list */
+ atomic_t ref;
/*
* CPU power of this group, SCHED_LOAD_SCALE being max power for a
@@ -882,9 +883,6 @@ struct sched_group {
* NOTE: this field is variable length. (Allocated dynamically
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
- *
- * It is also be embedded into static data structures at build
- * time. (See 'struct static_sched_group' in kernel/sched.c)
*/
unsigned long cpumask[0];
};
@@ -894,17 +892,6 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
return to_cpumask(sg->cpumask);
}
-enum sched_domain_level {
- SD_LV_NONE = 0,
- SD_LV_SIBLING,
- SD_LV_MC,
- SD_LV_BOOK,
- SD_LV_CPU,
- SD_LV_NODE,
- SD_LV_ALLNODES,
- SD_LV_MAX
-};
-
struct sched_domain_attr {
int relax_domain_level;
};
@@ -913,6 +900,8 @@ struct sched_domain_attr {
.relax_domain_level = -1, \
}
+extern int sched_domain_level_max;
+
struct sched_domain {
/* These fields must be setup */
struct sched_domain *parent; /* top domain must be null terminated */
@@ -930,7 +919,7 @@ struct sched_domain {
unsigned int forkexec_idx;
unsigned int smt_gain;
int flags; /* See SD_* */
- enum sched_domain_level level;
+ int level;
/* Runtime fields. */
unsigned long last_balance; /* init to jiffies. units in jiffies */
@@ -973,6 +962,10 @@ struct sched_domain {
#ifdef CONFIG_SCHED_DEBUG
char *name;
#endif
+ union {
+ void *private; /* used during construction */
+ struct rcu_head rcu; /* used during destruction */
+ };
unsigned int span_weight;
/*
@@ -981,9 +974,6 @@ struct sched_domain {
* NOTE: this field is variable length. (Allocated dynamically
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
- *
- * It is also be embedded into static data structures at build
- * time. (See 'struct static_sched_domain' in kernel/sched.c)
*/
unsigned long span[0];
};
@@ -1048,8 +1038,12 @@ struct sched_domain;
#define WF_FORK 0x02 /* child wakeup after fork */
#define ENQUEUE_WAKEUP 1
-#define ENQUEUE_WAKING 2
-#define ENQUEUE_HEAD 4
+#define ENQUEUE_HEAD 2
+#ifdef CONFIG_SMP
+#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
+#else
+#define ENQUEUE_WAKING 0
+#endif
#define DEQUEUE_SLEEP 1
@@ -1067,12 +1061,11 @@ struct sched_class {
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
- int (*select_task_rq)(struct rq *rq, struct task_struct *p,
- int sd_flag, int flags);
+ int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
void (*post_schedule) (struct rq *this_rq);
- void (*task_waking) (struct rq *this_rq, struct task_struct *task);
+ void (*task_waking) (struct task_struct *task);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);
void (*set_cpus_allowed)(struct task_struct *p,
@@ -1200,10 +1193,10 @@ struct task_struct {
int lock_depth; /* BKL lock depth */
#ifdef CONFIG_SMP
-#ifdef __ARCH_WANT_UNLOCKED_CTXSW
- int oncpu;
-#endif
+ struct task_struct *wake_entry;
+ int on_cpu;
#endif
+ int on_rq;
int prio, static_prio, normal_prio;
unsigned int rt_priority;
@@ -1274,6 +1267,7 @@ struct task_struct {
/* Revert to default priority/policy when forking */
unsigned sched_reset_on_fork:1;
+ unsigned sched_contributes_to_load:1;
pid_t pid;
pid_t tgid;
@@ -2192,8 +2186,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from);
extern char *get_task_comm(char *to, struct task_struct *tsk);
#ifdef CONFIG_SMP
+void scheduler_ipi(void);
extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
#else
+static inline void scheduler_ipi(void) { }
static inline unsigned long wait_task_inactive(struct task_struct *p,
long match_state)
{
diff --git a/init/Kconfig b/init/Kconfig
index 0edda616640f..94bf28dc9220 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -827,6 +827,11 @@ config SCHED_AUTOGROUP
desktop applications. Task group autogeneration is currently based
upon task session.
+config SCHED_TTWU_QUEUE
+ bool
+ depends on !SPARC32
+ default y
+
config MM_OWNER
bool
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 33eee16addb8..2bb8c2e98fff 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1159,7 +1159,7 @@ int current_cpuset_is_being_rebound(void)
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
- if (val < -1 || val >= SD_LV_MAX)
+ if (val < -1 || val >= sched_domain_level_max)
return -EINVAL;
#endif
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index ec815a960b5d..73da83aff418 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock)
return;
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
- DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
+ DEBUG_LOCKS_WARN_ON(lock->owner != current);
DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
mutex_clear_owner(lock);
}
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 57d527a16f9d..0799fd3e4cfa 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
static inline void mutex_set_owner(struct mutex *lock)
{
- lock->owner = current_thread_info();
+ lock->owner = current;
}
static inline void mutex_clear_owner(struct mutex *lock)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index c4195fa98900..fe4706cb0c5b 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -160,7 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
*/
for (;;) {
- struct thread_info *owner;
+ struct task_struct *owner;
/*
* If we own the BKL, then don't spin. The owner of
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 67578ca48f94..4115fbf83b12 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -19,7 +19,7 @@
#ifdef CONFIG_SMP
static inline void mutex_set_owner(struct mutex *lock)
{
- lock->owner = current_thread_info();
+ lock->owner = current;
}
static inline void mutex_clear_owner(struct mutex *lock)
diff --git a/kernel/sched.c b/kernel/sched.c
index 312f8b95c2d4..bc31f4ee2d98 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -231,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
#endif
/*
- * sched_domains_mutex serializes calls to arch_init_sched_domains,
+ * sched_domains_mutex serializes calls to init_sched_domains,
* detach_destroy_domains and partition_sched_domains.
*/
static DEFINE_MUTEX(sched_domains_mutex);
@@ -312,6 +312,9 @@ struct cfs_rq {
u64 exec_clock;
u64 min_vruntime;
+#ifndef CONFIG_64BIT
+ u64 min_vruntime_copy;
+#endif
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
@@ -417,6 +420,7 @@ struct rt_rq {
*/
struct root_domain {
atomic_t refcount;
+ struct rcu_head rcu;
cpumask_var_t span;
cpumask_var_t online;
@@ -553,6 +557,10 @@ struct rq {
unsigned int ttwu_count;
unsigned int ttwu_local;
#endif
+
+#ifdef CONFIG_SMP
+ struct task_struct *wake_list;
+#endif
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -571,7 +579,7 @@ static inline int cpu_of(struct rq *rq)
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
- rcu_read_lock_sched_held() || \
+ rcu_read_lock_held() || \
lockdep_is_held(&sched_domains_mutex))
/*
@@ -596,7 +604,7 @@ static inline int cpu_of(struct rq *rq)
* Return the group to which this tasks belongs.
*
* We use task_subsys_state_check() and extend the RCU verification
- * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
* holds that lock for each task it moves into the cgroup. Therefore
* by holding that lock, we pin the task to the current cgroup.
*/
@@ -606,7 +614,7 @@ static inline struct task_group *task_group(struct task_struct *p)
struct cgroup_subsys_state *css;
css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&task_rq(p)->lock));
+ lockdep_is_held(&p->pi_lock));
tg = container_of(css, struct task_group, css);
return autogroup_task_group(p, tg);
@@ -838,18 +846,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
return rq->curr == p;
}
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
+#ifdef CONFIG_SMP
+ return p->on_cpu;
+#else
return task_current(rq, p);
+#endif
}
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
+#ifdef CONFIG_SMP
+ /*
+ * We can optimise this out completely for !SMP, because the
+ * SMP rebalancing from interrupt is the only thing that cares
+ * here.
+ */
+ next->on_cpu = 1;
+#endif
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
+#ifdef CONFIG_SMP
+ /*
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ */
+ smp_wmb();
+ prev->on_cpu = 0;
+#endif
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
rq->lock.owner = current;
@@ -865,15 +894,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
}
#else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline int task_running(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
- return p->oncpu;
-#else
- return task_current(rq, p);
-#endif
-}
-
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
@@ -882,7 +902,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
* SMP rebalancing from interrupt is the only thing that cares
* here.
*/
- next->oncpu = 1;
+ next->on_cpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
raw_spin_unlock_irq(&rq->lock);
@@ -895,12 +915,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
/*
- * After ->oncpu is cleared, the task can be moved to a different CPU.
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
* We must ensure this doesn't happen until the switch is completely
* finished.
*/
smp_wmb();
- prev->oncpu = 0;
+ prev->on_cpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
local_irq_enable();
@@ -909,23 +929,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
- * Check whether the task is waking, we use this to synchronize ->cpus_allowed
- * against ttwu().
- */
-static inline int task_is_waking(struct task_struct *p)
-{
- return unlikely(p->state == TASK_WAKING);
-}
-
-/*
- * __task_rq_lock - lock the runqueue a given task resides on.
- * Must be called interrupts disabled.
+ * __task_rq_lock - lock the rq @p resides on.
*/
static inline struct rq *__task_rq_lock(struct task_struct *p)
__acquires(rq->lock)
{
struct rq *rq;
+ lockdep_assert_held(&p->pi_lock);
+
for (;;) {
rq = task_rq(p);
raw_spin_lock(&rq->lock);
@@ -936,22 +948,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
}
/*
- * task_rq_lock - lock the runqueue a given task resides on and disable
- * interrupts. Note the ordering: we can safely lookup the task_rq without
- * explicitly disabling preemption.
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
*/
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+ __acquires(p->pi_lock)
__acquires(rq->lock)
{
struct rq *rq;
for (;;) {
- local_irq_save(*flags);
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
rq = task_rq(p);
raw_spin_lock(&rq->lock);
if (likely(rq == task_rq(p)))
return rq;
- raw_spin_unlock_irqrestore(&rq->lock, *flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
}
@@ -961,10 +973,13 @@ static void __task_rq_unlock(struct rq *rq)
raw_spin_unlock(&rq->lock);
}
-static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
__releases(rq->lock)
+ __releases(p->pi_lock)
{
- raw_spin_unlock_irqrestore(&rq->lock, *flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
/*
@@ -1773,7 +1788,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
update_rq_clock(rq);
sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, flags);
- p->se.on_rq = 1;
}
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1781,7 +1795,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
update_rq_clock(rq);
sched_info_dequeued(p);
p->sched_class->dequeue_task(rq, p, flags);
- p->se.on_rq = 0;
}
/*
@@ -2116,7 +2129,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back clock update.
*/
- if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
+ if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
rq->skip_clock_update = 1;
}
@@ -2162,6 +2175,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
*/
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+
+#ifdef CONFIG_LOCKDEP
+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(&task_rq(p)->lock)));
+#endif
#endif
trace_sched_migrate_task(p, new_cpu);
@@ -2182,19 +2200,6 @@ struct migration_arg {
static int migration_cpu_stop(void *data);
/*
- * The task's runqueue lock must be held.
- * Returns true if you have to wait for migration thread.
- */
-static bool migrate_task(struct task_struct *p, struct rq *rq)
-{
- /*
- * If the task is not on a runqueue (and not running), then
- * the next wake-up will properly place the task.
- */
- return p->se.on_rq || task_running(rq, p);
-}
-
-/*
* wait_task_inactive - wait for a thread to unschedule.
*
* If @match_state is nonzero, it's the @p->state value just checked and
@@ -2251,11 +2256,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
rq = task_rq_lock(p, &flags);
trace_sched_wait_task(p);
running = task_running(rq, p);
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
/*
* If it changed from the expected state, bail out now.
@@ -2330,7 +2335,7 @@ EXPORT_SYMBOL_GPL(kick_process);
#ifdef CONFIG_SMP
/*
- * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ * ->cpus_allowed is protected by both rq->lock and p->pi_lock
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
@@ -2363,12 +2368,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
}
/*
- * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
*/
static inline
-int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
{
- int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
+ int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -2394,27 +2399,60 @@ static void update_avg(u64 *avg, u64 sample)
}
#endif
-static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
- bool is_sync, bool is_migrate, bool is_local,
- unsigned long en_flags)
+static void
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
+#ifdef CONFIG_SCHEDSTATS
+ struct rq *rq = this_rq();
+
+#ifdef CONFIG_SMP
+ int this_cpu = smp_processor_id();
+
+ if (cpu == this_cpu) {
+ schedstat_inc(rq, ttwu_local);
+ schedstat_inc(p, se.statistics.nr_wakeups_local);
+ } else {
+ struct sched_domain *sd;
+
+ schedstat_inc(p, se.statistics.nr_wakeups_remote);
+ for_each_domain(this_cpu, sd) {
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+ schedstat_inc(sd, ttwu_wake_remote);
+ break;
+ }
+ }
+ }
+#endif /* CONFIG_SMP */
+
+ schedstat_inc(rq, ttwu_count);
schedstat_inc(p, se.statistics.nr_wakeups);
- if (is_sync)
+
+ if (wake_flags & WF_SYNC)
schedstat_inc(p, se.statistics.nr_wakeups_sync);
- if (is_migrate)
+
+ if (cpu != task_cpu(p))
schedstat_inc(p, se.statistics.nr_wakeups_migrate);
- if (is_local)
- schedstat_inc(p, se.statistics.nr_wakeups_local);
- else
- schedstat_inc(p, se.statistics.nr_wakeups_remote);
+#endif /* CONFIG_SCHEDSTATS */
+}
+
+static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+{
activate_task(rq, p, en_flags);
+ p->on_rq = 1;
+
+ /* if a worker is waking up, notify workqueue */
+ if (p->flags & PF_WQ_WORKER)
+ wq_worker_waking_up(p, cpu_of(rq));
}
-static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
- int wake_flags, bool success)
+/*
+ * Mark the task runnable and perform wakeup-preemption.
+ */
+static void
+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
- trace_sched_wakeup(p, success);
+ trace_sched_wakeup(p, true);
check_preempt_curr(rq, p, wake_flags);
p->state = TASK_RUNNING;
@@ -2433,9 +2471,99 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
rq->idle_stamp = 0;
}
#endif
- /* if a worker is waking up, notify workqueue */
- if ((p->flags & PF_WQ_WORKER) && success)
- wq_worker_waking_up(p, cpu_of(rq));
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+#ifdef CONFIG_SMP
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible--;
+#endif
+
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+ ttwu_do_wakeup(rq, p, wake_flags);
+}
+
+/*
+ * Called in case the task @p isn't fully descheduled from its runqueue,
+ * in this case we must do a remote wakeup. Its a 'light' wakeup though,
+ * since all we need to do is flip p->state to TASK_RUNNING, since
+ * the task is still ->on_rq.
+ */
+static int ttwu_remote(struct task_struct *p, int wake_flags)
+{
+ struct rq *rq;
+ int ret = 0;
+
+ rq = __task_rq_lock(p);
+ if (p->on_rq) {
+ ttwu_do_wakeup(rq, p, wake_flags);
+ ret = 1;
+ }
+ __task_rq_unlock(rq);
+
+ return ret;
+}
+
+#ifdef CONFIG_SMP
+static void sched_ttwu_pending(void)
+{
+ struct rq *rq = this_rq();
+ struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+ if (!list)
+ return;
+
+ raw_spin_lock(&rq->lock);
+
+ while (list) {
+ struct task_struct *p = list;
+ list = list->wake_entry;
+ ttwu_do_activate(rq, p, 0);
+ }
+
+ raw_spin_unlock(&rq->lock);
+}
+
+void scheduler_ipi(void)
+{
+ sched_ttwu_pending();
+}
+
+static void ttwu_queue_remote(struct task_struct *p, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *next = rq->wake_list;
+
+ for (;;) {
+ struct task_struct *old = next;
+
+ p->wake_entry = next;
+ next = cmpxchg(&rq->wake_list, old, p);
+ if (next == old)
+ break;
+ }
+
+ if (!next)
+ smp_send_reschedule(cpu);
+}
+#endif
+
+static void ttwu_queue(struct task_struct *p, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE)
+ if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+ ttwu_queue_remote(p, cpu);
+ return;
+ }
+#endif
+
+ raw_spin_lock(&rq->lock);
+ ttwu_do_activate(rq, p, 0);
+ raw_spin_unlock(&rq->lock);
}
/**
@@ -2453,92 +2581,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
* Returns %true if @p was woken up, %false if it was already running
* or @state didn't match @p's state.
*/
-static int try_to_wake_up(struct task_struct *p, unsigned int state,
- int wake_flags)
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
- int cpu, orig_cpu, this_cpu, success = 0;
unsigned long flags;
- unsigned long en_flags = ENQUEUE_WAKEUP;
- struct rq *rq;
-
- this_cpu = get_cpu();
+ int cpu, success = 0;
smp_wmb();
- rq = task_rq_lock(p, &flags);
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
if (!(p->state & state))
goto out;
- if (p->se.on_rq)
- goto out_running;
-
+ success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
- orig_cpu = cpu;
-#ifdef CONFIG_SMP
- if (unlikely(task_running(rq, p)))
- goto out_activate;
+ if (p->on_rq && ttwu_remote(p, wake_flags))
+ goto stat;
+#ifdef CONFIG_SMP
/*
- * In order to handle concurrent wakeups and release the rq->lock
- * we put the task in TASK_WAKING state.
- *
- * First fix up the nr_uninterruptible count:
+ * If the owning (remote) cpu is still in the middle of schedule() with
+ * this task as prev, wait until its done referencing the task.
*/
- if (task_contributes_to_load(p)) {
- if (likely(cpu_online(orig_cpu)))
- rq->nr_uninterruptible--;
- else
- this_rq()->nr_uninterruptible--;
- }
- p->state = TASK_WAKING;
-
- if (p->sched_class->task_waking) {
- p->sched_class->task_waking(rq, p);
- en_flags |= ENQUEUE_WAKING;
+ while (p->on_cpu) {
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ /*
+ * If called from interrupt context we could have landed in the
+ * middle of schedule(), in this case we should take care not
+ * to spin on ->on_cpu if p is current, since that would
+ * deadlock.
+ */
+ if (p == current) {
+ ttwu_queue(p, cpu);
+ goto stat;
+ }
+#endif
+ cpu_relax();
}
-
- cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
- if (cpu != orig_cpu)
- set_task_cpu(p, cpu);
- __task_rq_unlock(rq);
-
- rq = cpu_rq(cpu);
- raw_spin_lock(&rq->lock);
-
/*
- * We migrated the task without holding either rq->lock, however
- * since the task is not on the task list itself, nobody else
- * will try and migrate the task, hence the rq should match the
- * cpu we just moved it to.
+ * Pairs with the smp_wmb() in finish_lock_switch().
*/
- WARN_ON(task_cpu(p) != cpu);
- WARN_ON(p->state != TASK_WAKING);
+ smp_rmb();
-#ifdef CONFIG_SCHEDSTATS
- schedstat_inc(rq, ttwu_count);
- if (cpu == this_cpu)
- schedstat_inc(rq, ttwu_local);
- else {
- struct sched_domain *sd;
- for_each_domain(this_cpu, sd) {
- if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- schedstat_inc(sd, ttwu_wake_remote);
- break;
- }
- }
- }
-#endif /* CONFIG_SCHEDSTATS */
+ p->sched_contributes_to_load = !!task_contributes_to_load(p);
+ p->state = TASK_WAKING;
-out_activate:
+ if (p->sched_class->task_waking)
+ p->sched_class->task_waking(p);
+
+ cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+ if (task_cpu(p) != cpu)
+ set_task_cpu(p, cpu);
#endif /* CONFIG_SMP */
- ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
- cpu == this_cpu, en_flags);
- success = 1;
-out_running:
- ttwu_post_activation(p, rq, wake_flags, success);
+
+ ttwu_queue(p, cpu);
+stat:
+ ttwu_stat(p, cpu, wake_flags);
out:
- task_rq_unlock(rq, &flags);
- put_cpu();
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
return success;
}
@@ -2547,31 +2647,34 @@ out:
* try_to_wake_up_local - try to wake up a local task with rq lock held
* @p: the thread to be awakened
*
- * Put @p on the run-queue if it's not already there. The caller must
+ * Put @p on the run-queue if it's not already there. The caller must
* ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task. this_rq() stays locked over invocation.
+ * the current task.
*/
static void try_to_wake_up_local(struct task_struct *p)
{
struct rq *rq = task_rq(p);
- bool success = false;
BUG_ON(rq != this_rq());
BUG_ON(p == current);
lockdep_assert_held(&rq->lock);
+ if (!raw_spin_trylock(&p->pi_lock)) {
+ raw_spin_unlock(&rq->lock);
+ raw_spin_lock(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+ }
+
if (!(p->state & TASK_NORMAL))
- return;
+ goto out;
- if (!p->se.on_rq) {
- if (likely(!task_running(rq, p))) {
- schedstat_inc(rq, ttwu_count);
- schedstat_inc(rq, ttwu_local);
- }
- ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
- success = true;
- }
- ttwu_post_activation(p, rq, 0, success);
+ if (!p->on_rq)
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+
+ ttwu_do_wakeup(rq, p, 0);
+ ttwu_stat(p, smp_processor_id(), 0);
+out:
+ raw_spin_unlock(&p->pi_lock);
}
/**
@@ -2604,19 +2707,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
*/
static void __sched_fork(struct task_struct *p)
{
+ p->on_rq = 0;
+
+ p->se.on_rq = 0;
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+ INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
INIT_LIST_HEAD(&p->rt.run_list);
- p->se.on_rq = 0;
- INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2628,6 +2733,7 @@ static void __sched_fork(struct task_struct *p)
*/
void sched_fork(struct task_struct *p, int clone_flags)
{
+ unsigned long flags;
int cpu = get_cpu();
__sched_fork(p);
@@ -2678,16 +2784,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
*
* Silence PROVE_RCU.
*/
- rcu_read_lock();
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
set_task_cpu(p, cpu);
- rcu_read_unlock();
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
- p->oncpu = 0;
+#if defined(CONFIG_SMP)
+ p->on_cpu = 0;
#endif
#ifdef CONFIG_PREEMPT
/* Want to start with kernel preemption disabled. */
@@ -2711,37 +2817,27 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
unsigned long flags;
struct rq *rq;
- int cpu __maybe_unused = get_cpu();
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_SMP
- rq = task_rq_lock(p, &flags);
- p->state = TASK_WAKING;
-
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
- *
- * We set TASK_WAKING so that select_task_rq() can drop rq->lock
- * without people poking at ->cpus_allowed.
*/
- cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
- set_task_cpu(p, cpu);
-
- p->state = TASK_RUNNING;
- task_rq_unlock(rq, &flags);
+ set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
#endif
- rq = task_rq_lock(p, &flags);
+ rq = __task_rq_lock(p);
activate_task(rq, p, 0);
- trace_sched_wakeup_new(p, 1);
+ p->on_rq = 1;
+ trace_sched_wakeup_new(p, true);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken)
p->sched_class->task_woken(rq, p);
#endif
- task_rq_unlock(rq, &flags);
- put_cpu();
+ task_rq_unlock(rq, p, &flags);
}
#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -3450,27 +3546,22 @@ void sched_exec(void)
{
struct task_struct *p = current;
unsigned long flags;
- struct rq *rq;
int dest_cpu;
- rq = task_rq_lock(p, &flags);
- dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
if (dest_cpu == smp_processor_id())
goto unlock;
- /*
- * select_task_rq() can race against ->cpus_allowed
- */
- if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
- likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
+ if (likely(cpu_active(dest_cpu))) {
struct migration_arg arg = { p, dest_cpu };
- task_rq_unlock(rq, &flags);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
return;
}
unlock:
- task_rq_unlock(rq, &flags);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
#endif
@@ -3507,7 +3598,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
rq = task_rq_lock(p, &flags);
ns = do_task_delta_exec(p, rq);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
return ns;
}
@@ -3525,7 +3616,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
rq = task_rq_lock(p, &flags);
ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
return ns;
}
@@ -3549,7 +3640,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
rq = task_rq_lock(p, &flags);
thread_group_cputime(p, &totals);
ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
return ns;
}
@@ -4035,7 +4126,7 @@ static inline void schedule_debug(struct task_struct *prev)
static void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- if (prev->se.on_rq)
+ if (prev->on_rq)
update_rq_clock(rq);
prev->sched_class->put_prev_task(rq, prev);
}
@@ -4097,11 +4188,13 @@ need_resched:
if (unlikely(signal_pending_state(prev->state, prev))) {
prev->state = TASK_RUNNING;
} else {
+ deactivate_task(rq, prev, DEQUEUE_SLEEP);
+ prev->on_rq = 0;
+
/*
- * If a worker is going to sleep, notify and
- * ask workqueue whether it wants to wake up a
- * task to maintain concurrency. If so, wake
- * up the task.
+ * If a worker went to sleep, notify and ask workqueue
+ * whether it wants to wake up a task to maintain
+ * concurrency.
*/
if (prev->flags & PF_WQ_WORKER) {
struct task_struct *to_wakeup;
@@ -4110,11 +4203,10 @@ need_resched:
if (to_wakeup)
try_to_wake_up_local(to_wakeup);
}
- deactivate_task(rq, prev, DEQUEUE_SLEEP);
/*
- * If we are going to sleep and we have plugged IO queued, make
- * sure to submit it to avoid deadlocks.
+ * If we are going to sleep and we have plugged IO
+ * queued, make sure to submit it to avoid deadlocks.
*/
if (blk_needs_flush_plug(prev)) {
raw_spin_unlock(&rq->lock);
@@ -4161,70 +4253,53 @@ need_resched:
EXPORT_SYMBOL(schedule);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
- */
-int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
-{
- unsigned int cpu;
- struct rq *rq;
- if (!sched_feat(OWNER_SPIN))
- return 0;
+static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+{
+ bool ret = false;
-#ifdef CONFIG_DEBUG_PAGEALLOC
- /*
- * Need to access the cpu field knowing that
- * DEBUG_PAGEALLOC could have unmapped it if
- * the mutex owner just released it and exited.
- */
- if (probe_kernel_address(&owner->cpu, cpu))
- return 0;
-#else
- cpu = owner->cpu;
-#endif
+ rcu_read_lock();
+ if (lock->owner != owner)
+ goto fail;
/*
- * Even if the access succeeded (likely case),
- * the cpu field may no longer be valid.
+ * Ensure we emit the owner->on_cpu, dereference _after_ checking
+ * lock->owner still matches owner, if that fails, owner might
+ * point to free()d memory, if it still matches, the rcu_read_lock()
+ * ensures the memory stays valid.
*/
- if (cpu >= nr_cpumask_bits)
- return 0;
+ barrier();
- /*
- * We need to validate that we can do a
- * get_cpu() and that we have the percpu area.
- */
- if (!cpu_online(cpu))
- return 0;
+ ret = owner->on_cpu;
+fail:
+ rcu_read_unlock();
- rq = cpu_rq(cpu);
+ return ret;
+}
- for (;;) {
- /*
- * Owner changed, break to re-assess state.
- */
- if (lock->owner != owner) {
- /*
- * If the lock has switched to a different owner,
- * we likely have heavy contention. Return 0 to quit
- * optimistic spinning and not contend further:
- */
- if (lock->owner)
- return 0;
- break;
- }
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+{
+ if (!sched_feat(OWNER_SPIN))
+ return 0;
- /*
- * Is that owner really running on that cpu?
- */
- if (task_thread_info(rq->curr) != owner || need_resched())
+ while (owner_running(lock, owner)) {
+ if (need_resched())
return 0;
arch_mutex_cpu_relax();
}
+ /*
+ * If the owner changed to another task there is likely
+ * heavy contention, stop spinning.
+ */
+ if (lock->owner)
+ return 0;
+
return 1;
}
#endif
@@ -4684,19 +4759,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- unsigned long flags;
int oldprio, on_rq, running;
struct rq *rq;
const struct sched_class *prev_class;
BUG_ON(prio < 0 || prio > MAX_PRIO);
- rq = task_rq_lock(p, &flags);
+ rq = __task_rq_lock(p);
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
prev_class = p->sched_class;
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
dequeue_task(rq, p, 0);
@@ -4716,7 +4790,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
check_class_changed(rq, p, prev_class, oldprio);
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
}
#endif
@@ -4744,7 +4818,7 @@ void set_user_nice(struct task_struct *p, long nice)
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
if (on_rq)
dequeue_task(rq, p, 0);
@@ -4764,7 +4838,7 @@ void set_user_nice(struct task_struct *p, long nice)
resched_task(rq->curr);
}
out_unlock:
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
}
EXPORT_SYMBOL(set_user_nice);
@@ -4878,8 +4952,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
static void
__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
{
- BUG_ON(p->se.on_rq);
-
p->policy = policy;
p->rt_priority = prio;
p->normal_prio = normal_prio(p);
@@ -4994,20 +5066,17 @@ recheck:
/*
* make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
- */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- /*
+ *
* To be able to change p->policy safely, the appropriate
* runqueue lock must be held.
*/
- rq = __task_rq_lock(p);
+ rq = task_rq_lock(p, &flags);
/*
* Changing the policy of the stop threads its a very bad idea
*/
if (p == rq->stop) {
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);
return -EINVAL;
}
@@ -5031,8 +5100,7 @@ recheck:
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);
return -EPERM;
}
}
@@ -5041,11 +5109,10 @@ recheck:
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);
goto recheck;
}
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
deactivate_task(rq, p, 0);
@@ -5064,8 +5131,7 @@ recheck:
activate_task(rq, p, 0);
check_class_changed(rq, p, prev_class, oldprio);
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);
rt_mutex_adjust_pi(p);
@@ -5316,7 +5382,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
struct task_struct *p;
unsigned long flags;
- struct rq *rq;
int retval;
get_online_cpus();
@@ -5331,9 +5396,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
if (retval)
goto out_unlock;
- rq = task_rq_lock(p, &flags);
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
- task_rq_unlock(rq, &flags);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
rcu_read_unlock();
@@ -5658,7 +5723,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
rq = task_rq_lock(p, &flags);
time_slice = p->sched_class->get_rr_interval(rq, p);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
rcu_read_unlock();
jiffies_to_timespec(time_slice, &t);
@@ -5776,8 +5841,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
rcu_read_unlock();
rq->curr = rq->idle = idle;
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
- idle->oncpu = 1;
+#if defined(CONFIG_SMP)
+ idle->on_cpu = 1;
#endif
raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -5881,18 +5946,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
unsigned int dest_cpu;
int ret = 0;
- /*
- * Serialize against TASK_WAKING so that ttwu() and wunt() can
- * drop the rq->lock and still rely on ->cpus_allowed.
- */
-again:
- while (task_is_waking(p))
- cpu_relax();
rq = task_rq_lock(p, &flags);
- if (task_is_waking(p)) {
- task_rq_unlock(rq, &flags);
- goto again;
- }
if (!cpumask_intersects(new_mask, cpu_active_mask)) {
ret = -EINVAL;
@@ -5917,16 +5971,16 @@ again:
goto out;
dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (migrate_task(p, rq)) {
+ if (p->on_rq) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
tlb_migrate_finish(p->mm);
return 0;
}
out:
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
return ret;
}
@@ -5954,6 +6008,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
rq_src = cpu_rq(src_cpu);
rq_dest = cpu_rq(dest_cpu);
+ raw_spin_lock(&p->pi_lock);
double_rq_lock(rq_src, rq_dest);
/* Already moved. */
if (task_cpu(p) != src_cpu)
@@ -5966,7 +6021,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
* If we're not on a rq, the next wake-up will ensure we're
* placed properly.
*/
- if (p->se.on_rq) {
+ if (p->on_rq) {
deactivate_task(rq_src, p, 0);
set_task_cpu(p, dest_cpu);
activate_task(rq_dest, p, 0);
@@ -5976,6 +6031,7 @@ done:
ret = 1;
fail:
double_rq_unlock(rq_src, rq_dest);
+ raw_spin_unlock(&p->pi_lock);
return ret;
}
@@ -6316,6 +6372,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DYING:
+ sched_ttwu_pending();
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
@@ -6394,6 +6451,8 @@ early_initcall(migration_init);
#ifdef CONFIG_SMP
+static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
+
#ifdef CONFIG_SCHED_DEBUG
static __read_mostly int sched_domain_debug_enabled;
@@ -6489,7 +6548,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
- cpumask_var_t groupmask;
int level = 0;
if (!sched_domain_debug_enabled)
@@ -6502,20 +6560,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
- if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
- printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
- return;
- }
-
for (;;) {
- if (sched_domain_debug_one(sd, cpu, level, groupmask))
+ if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
break;
level++;
sd = sd->parent;
if (!sd)
break;
}
- free_cpumask_var(groupmask);
}
#else /* !CONFIG_SCHED_DEBUG */
# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6572,12 +6624,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
return 1;
}
-static void free_rootdomain(struct root_domain *rd)
+static void free_rootdomain(struct rcu_head *rcu)
{
- synchronize_sched();
+ struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
cpupri_cleanup(&rd->cpupri);
-
free_cpumask_var(rd->rto_mask);
free_cpumask_var(rd->online);
free_cpumask_var(rd->span);
@@ -6618,7 +6669,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
raw_spin_unlock_irqrestore(&rq->lock, flags);
if (old_rd)
- free_rootdomain(old_rd);
+ call_rcu_sched(&old_rd->rcu, free_rootdomain);
}
static int init_rootdomain(struct root_domain *rd)
@@ -6669,6 +6720,25 @@ static struct root_domain *alloc_rootdomain(void)
return rd;
}
+static void free_sched_domain(struct rcu_head *rcu)
+{
+ struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+ if (atomic_dec_and_test(&sd->groups->ref))
+ kfree(sd->groups);
+ kfree(sd);
+}
+
+static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+{
+ call_rcu(&sd->rcu, free_sched_domain);
+}
+
+static void destroy_sched_domains(struct sched_domain *sd, int cpu)
+{
+ for (; sd; sd = sd->parent)
+ destroy_sched_domain(sd, cpu);
+}
+
/*
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
@@ -6679,9 +6749,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
- for (tmp = sd; tmp; tmp = tmp->parent)
- tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
-
/* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) {
struct sched_domain *parent = tmp->parent;
@@ -6692,12 +6759,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
tmp->parent = parent->parent;
if (parent->parent)
parent->parent->child = tmp;
+ destroy_sched_domain(parent, cpu);
} else
tmp = tmp->parent;
}
if (sd && sd_degenerate(sd)) {
+ tmp = sd;
sd = sd->parent;
+ destroy_sched_domain(tmp, cpu);
if (sd)
sd->child = NULL;
}
@@ -6705,7 +6775,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
sched_domain_debug(sd, cpu);
rq_attach_root(rq, rd);
+ tmp = rq->sd;
rcu_assign_pointer(rq->sd, sd);
+ destroy_sched_domains(tmp, cpu);
}
/* cpus with isolated domains */
@@ -6721,56 +6793,6 @@ static int __init isolated_cpu_setup(char *str)
__setup("isolcpus=", isolated_cpu_setup);
-/*
- * init_sched_build_groups takes the cpumask we wish to span, and a pointer
- * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- * (due to the fact that we keep track of groups covered with a struct cpumask).
- *
- * init_sched_build_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_power to 0.
- */
-static void
-init_sched_build_groups(const struct cpumask *span,
- const struct cpumask *cpu_map,
- int (*group_fn)(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg,
- struct cpumask *tmpmask),
- struct cpumask *covered, struct cpumask *tmpmask)
-{
- struct sched_group *first = NULL, *last = NULL;
- int i;
-
- cpumask_clear(covered);
-
- for_each_cpu(i, span) {
- struct sched_group *sg;
- int group = group_fn(i, cpu_map, &sg, tmpmask);
- int j;
-
- if (cpumask_test_cpu(i, covered))
- continue;
-
- cpumask_clear(sched_group_cpus(sg));
- sg->cpu_power = 0;
-
- for_each_cpu(j, span) {
- if (group_fn(j, cpu_map, NULL, tmpmask) != group)
- continue;
-
- cpumask_set_cpu(j, covered);
- cpumask_set_cpu(j, sched_group_cpus(sg));
- }
- if (!first)
- first = sg;
- if (last)
- last->next = sg;
- last = sg;
- }
- last->next = first;
-}
-
#define SD_NODES_PER_DOMAIN 16
#ifdef CONFIG_NUMA
@@ -6841,311 +6863,125 @@ static void sched_domain_node_span(int node, struct cpumask *span)
cpumask_or(span, span, cpumask_of_node(next_node));
}
}
+
+static const struct cpumask *cpu_node_mask(int cpu)
+{
+ lockdep_assert_held(&sched_domains_mutex);
+
+ sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
+
+ return sched_domains_tmpmask;
+}
+
+static const struct cpumask *cpu_allnodes_mask(int cpu)
+{
+ return cpu_possible_mask;
+}
#endif /* CONFIG_NUMA */
-int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
+static const struct cpumask *cpu_cpu_mask(int cpu)
+{
+ return cpumask_of_node(cpu_to_node(cpu));
+}
-/*
- * The cpus mask in sched_group and sched_domain hangs off the end.
- *
- * ( See the the comments in include/linux/sched.h:struct sched_group
- * and struct sched_domain. )
- */
-struct static_sched_group {
- struct sched_group sg;
- DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
-};
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
-struct static_sched_domain {
- struct sched_domain sd;
- DECLARE_BITMAP(span, CONFIG_NR_CPUS);
+struct sd_data {
+ struct sched_domain **__percpu sd;
+ struct sched_group **__percpu sg;
};
struct s_data {
-#ifdef CONFIG_NUMA
- int sd_allnodes;
- cpumask_var_t domainspan;
- cpumask_var_t covered;
- cpumask_var_t notcovered;
-#endif
- cpumask_var_t nodemask;
- cpumask_var_t this_sibling_map;
- cpumask_var_t this_core_map;
- cpumask_var_t this_book_map;
- cpumask_var_t send_covered;
- cpumask_var_t tmpmask;
- struct sched_group **sched_group_nodes;
+ struct sched_domain ** __percpu sd;
struct root_domain *rd;
};
enum s_alloc {
- sa_sched_groups = 0,
sa_rootdomain,
- sa_tmpmask,
- sa_send_covered,
- sa_this_book_map,
- sa_this_core_map,
- sa_this_sibling_map,
- sa_nodemask,
- sa_sched_group_nodes,
-#ifdef CONFIG_NUMA
- sa_notcovered,
- sa_covered,
- sa_domainspan,
-#endif
+ sa_sd,
+ sa_sd_storage,
sa_none,
};
-/*
- * SMT sched-domains:
- */
-#ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
-
-static int
-cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *unused)
-{
- if (sg)
- *sg = &per_cpu(sched_groups, cpu).sg;
- return cpu;
-}
-#endif /* CONFIG_SCHED_SMT */
+struct sched_domain_topology_level;
-/*
- * multi-core sched-domains:
- */
-#ifdef CONFIG_SCHED_MC
-static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
+typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
+typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-static int
-cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *mask)
-{
- int group;
-#ifdef CONFIG_SCHED_SMT
- cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- group = cpumask_first(mask);
-#else
- group = cpu;
-#endif
- if (sg)
- *sg = &per_cpu(sched_group_core, group).sg;
- return group;
-}
-#endif /* CONFIG_SCHED_MC */
+struct sched_domain_topology_level {
+ sched_domain_init_f init;
+ sched_domain_mask_f mask;
+ struct sd_data data;
+};
/*
- * book sched-domains:
+ * Assumes the sched_domain tree is fully constructed
*/
-#ifdef CONFIG_SCHED_BOOK
-static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
-
-static int
-cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *mask)
+static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
{
- int group = cpu;
-#ifdef CONFIG_SCHED_MC
- cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
- group = cpumask_first(mask);
-#elif defined(CONFIG_SCHED_SMT)
- cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- group = cpumask_first(mask);
-#endif
- if (sg)
- *sg = &per_cpu(sched_group_book, group).sg;
- return group;
-}
-#endif /* CONFIG_SCHED_BOOK */
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+ struct sched_domain *child = sd->child;
-static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
+ if (child)
+ cpu = cpumask_first(sched_domain_span(child));
-static int
-cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *mask)
-{
- int group;
-#ifdef CONFIG_SCHED_BOOK
- cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
- group = cpumask_first(mask);
-#elif defined(CONFIG_SCHED_MC)
- cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
- group = cpumask_first(mask);
-#elif defined(CONFIG_SCHED_SMT)
- cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- group = cpumask_first(mask);
-#else
- group = cpu;
-#endif
if (sg)
- *sg = &per_cpu(sched_group_phys, group).sg;
- return group;
+ *sg = *per_cpu_ptr(sdd->sg, cpu);
+
+ return cpu;
}
-#ifdef CONFIG_NUMA
/*
- * The init_sched_build_groups can't handle what we want to do with node
- * groups, so roll our own. Now each node has its own list of groups which
- * gets dynamically allocated.
+ * build_sched_groups takes the cpumask we wish to span, and a pointer
+ * to a function which identifies what group(along with sched group) a CPU
+ * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
+ * (due to the fact that we keep track of groups covered with a struct cpumask).
+ *
+ * build_sched_groups will build a circular linked list of the groups
+ * covered by the given span, and will set each group's ->cpumask correctly,
+ * and ->cpu_power to 0.
*/
-static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
-static struct sched_group ***sched_group_nodes_bycpu;
-
-static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
-
-static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg,
- struct cpumask *nodemask)
-{
- int group;
-
- cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
- group = cpumask_first(nodemask);
-
- if (sg)
- *sg = &per_cpu(sched_group_allnodes, group).sg;
- return group;
-}
-
-static void init_numa_sched_groups_power(struct sched_group *group_head)
-{
- struct sched_group *sg = group_head;
- int j;
-
- if (!sg)
- return;
- do {
- for_each_cpu(j, sched_group_cpus(sg)) {
- struct sched_domain *sd;
-
- sd = &per_cpu(phys_domains, j).sd;
- if (j != group_first_cpu(sd->groups)) {
- /*
- * Only add "power" once for each
- * physical package.
- */
- continue;
- }
-
- sg->cpu_power += sd->groups->cpu_power;
- }
- sg = sg->next;
- } while (sg != group_head);
-}
-
-static int build_numa_sched_groups(struct s_data *d,
- const struct cpumask *cpu_map, int num)
+static void
+build_sched_groups(struct sched_domain *sd)
{
- struct sched_domain *sd;
- struct sched_group *sg, *prev;
- int n, j;
-
- cpumask_clear(d->covered);
- cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
- if (cpumask_empty(d->nodemask)) {
- d->sched_group_nodes[num] = NULL;
- goto out;
- }
-
- sched_domain_node_span(num, d->domainspan);
- cpumask_and(d->domainspan, d->domainspan, cpu_map);
-
- sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, num);
- if (!sg) {
- printk(KERN_WARNING "Can not alloc domain group for node %d\n",
- num);
- return -ENOMEM;
- }
- d->sched_group_nodes[num] = sg;
-
- for_each_cpu(j, d->nodemask) {
- sd = &per_cpu(node_domains, j).sd;
- sd->groups = sg;
- }
-
- sg->cpu_power = 0;
- cpumask_copy(sched_group_cpus(sg), d->nodemask);
- sg->next = sg;
- cpumask_or(d->covered, d->covered, d->nodemask);
+ struct sched_group *first = NULL, *last = NULL;
+ struct sd_data *sdd = sd->private;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered;
+ int i;
- prev = sg;
- for (j = 0; j < nr_node_ids; j++) {
- n = (num + j) % nr_node_ids;
- cpumask_complement(d->notcovered, d->covered);
- cpumask_and(d->tmpmask, d->notcovered, cpu_map);
- cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
- if (cpumask_empty(d->tmpmask))
- break;
- cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
- if (cpumask_empty(d->tmpmask))
- continue;
- sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, num);
- if (!sg) {
- printk(KERN_WARNING
- "Can not alloc domain group for node %d\n", j);
- return -ENOMEM;
- }
- sg->cpu_power = 0;
- cpumask_copy(sched_group_cpus(sg), d->tmpmask);
- sg->next = prev->next;
- cpumask_or(d->covered, d->covered, d->tmpmask);
- prev->next = sg;
- prev = sg;
- }
-out:
- return 0;
-}
-#endif /* CONFIG_NUMA */
+ lockdep_assert_held(&sched_domains_mutex);
+ covered = sched_domains_tmpmask;
-#ifdef CONFIG_NUMA
-/* Free memory allocated for various sched_group structures */
-static void free_sched_groups(const struct cpumask *cpu_map,
- struct cpumask *nodemask)
-{
- int cpu, i;
+ cpumask_clear(covered);
- for_each_cpu(cpu, cpu_map) {
- struct sched_group **sched_group_nodes
- = sched_group_nodes_bycpu[cpu];
+ for_each_cpu(i, span) {
+ struct sched_group *sg;
+ int group = get_group(i, sdd, &sg);
+ int j;
- if (!sched_group_nodes)
+ if (cpumask_test_cpu(i, covered))
continue;
- for (i = 0; i < nr_node_ids; i++) {
- struct sched_group *oldsg, *sg = sched_group_nodes[i];
+ cpumask_clear(sched_group_cpus(sg));
+ sg->cpu_power = 0;
- cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
- if (cpumask_empty(nodemask))
+ for_each_cpu(j, span) {
+ if (get_group(j, sdd, NULL) != group)
continue;
- if (sg == NULL)
- continue;
- sg = sg->next;
-next_sg:
- oldsg = sg;
- sg = sg->next;
- kfree(oldsg);
- if (oldsg != sched_group_nodes[i])
- goto next_sg;
+ cpumask_set_cpu(j, covered);
+ cpumask_set_cpu(j, sched_group_cpus(sg));
}
- kfree(sched_group_nodes);
- sched_group_nodes_bycpu[cpu] = NULL;
+
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
}
+ last->next = first;
}
-#else /* !CONFIG_NUMA */
-static void free_sched_groups(const struct cpumask *cpu_map,
- struct cpumask *nodemask)
-{
-}
-#endif /* CONFIG_NUMA */
/*
* Initialize sched groups cpu_power.
@@ -7159,11 +6995,6 @@ static void free_sched_groups(const struct cpumask *cpu_map,
*/
static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{
- struct sched_domain *child;
- struct sched_group *group;
- long power;
- int weight;
-
WARN_ON(!sd || !sd->groups);
if (cpu != group_first_cpu(sd->groups))
@@ -7171,36 +7002,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
- child = sd->child;
-
- sd->groups->cpu_power = 0;
-
- if (!child) {
- power = SCHED_LOAD_SCALE;
- weight = cpumask_weight(sched_domain_span(sd));
- /*
- * SMT siblings share the power of a single core.
- * Usually multiple threads get a better yield out of
- * that one core than a single thread would have,
- * reflect that in sd->smt_gain.
- */
- if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
- power *= sd->smt_gain;
- power /= weight;
- power >>= SCHED_LOAD_SHIFT;
- }
- sd->groups->cpu_power += power;
- return;
- }
-
- /*
- * Add cpu_power of each child group to this groups cpu_power.
- */
- group = child->groups;
- do {
- sd->groups->cpu_power += group->cpu_power;
- group = group->next;
- } while (group != child->groups);
+ update_group_power(sd, cpu);
}
/*
@@ -7214,15 +7016,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
# define SD_INIT_NAME(sd, type) do { } while (0)
#endif
-#define SD_INIT(sd, type) sd_init_##type(sd)
-
-#define SD_INIT_FUNC(type) \
-static noinline void sd_init_##type(struct sched_domain *sd) \
-{ \
- memset(sd, 0, sizeof(*sd)); \
- *sd = SD_##type##_INIT; \
- sd->level = SD_LV_##type; \
- SD_INIT_NAME(sd, type); \
+#define SD_INIT_FUNC(type) \
+static noinline struct sched_domain * \
+sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
+{ \
+ struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
+ *sd = SD_##type##_INIT; \
+ SD_INIT_NAME(sd, type); \
+ sd->private = &tl->data; \
+ return sd; \
}
SD_INIT_FUNC(CPU)
@@ -7241,13 +7043,14 @@ SD_INIT_FUNC(CPU)
#endif
static int default_relax_domain_level = -1;
+int sched_domain_level_max;
static int __init setup_relax_domain_level(char *str)
{
unsigned long val;
val = simple_strtoul(str, NULL, 0);
- if (val < SD_LV_MAX)
+ if (val < sched_domain_level_max)
default_relax_domain_level = val;
return 1;
@@ -7275,37 +7078,20 @@ static void set_domain_attribute(struct sched_domain *sd,
}
}
+static void __sdt_free(const struct cpumask *cpu_map);
+static int __sdt_alloc(const struct cpumask *cpu_map);
+
static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
const struct cpumask *cpu_map)
{
switch (what) {
- case sa_sched_groups:
- free_sched_groups(cpu_map, d->tmpmask); /* fall through */
- d->sched_group_nodes = NULL;
case sa_rootdomain:
- free_rootdomain(d->rd); /* fall through */
- case sa_tmpmask:
- free_cpumask_var(d->tmpmask); /* fall through */
- case sa_send_covered:
- free_cpumask_var(d->send_covered); /* fall through */
- case sa_this_book_map:
- free_cpumask_var(d->this_book_map); /* fall through */
- case sa_this_core_map:
- free_cpumask_var(d->this_core_map); /* fall through */
- case sa_this_sibling_map:
- free_cpumask_var(d->this_sibling_map); /* fall through */
- case sa_nodemask:
- free_cpumask_var(d->nodemask); /* fall through */
- case sa_sched_group_nodes:
-#ifdef CONFIG_NUMA
- kfree(d->sched_group_nodes); /* fall through */
- case sa_notcovered:
- free_cpumask_var(d->notcovered); /* fall through */
- case sa_covered:
- free_cpumask_var(d->covered); /* fall through */
- case sa_domainspan:
- free_cpumask_var(d->domainspan); /* fall through */
-#endif
+ if (!atomic_read(&d->rd->refcount))
+ free_rootdomain(&d->rd->rcu); /* fall through */
+ case sa_sd:
+ free_percpu(d->sd); /* fall through */
+ case sa_sd_storage:
+ __sdt_free(cpu_map); /* fall through */
case sa_none:
break;
}
@@ -7314,308 +7100,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
const struct cpumask *cpu_map)
{
-#ifdef CONFIG_NUMA
- if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
- return sa_none;
- if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
- return sa_domainspan;
- if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
- return sa_covered;
- /* Allocate the per-node list of sched groups */
- d->sched_group_nodes = kcalloc(nr_node_ids,
- sizeof(struct sched_group *), GFP_KERNEL);
- if (!d->sched_group_nodes) {
- printk(KERN_WARNING "Can not alloc sched group node list\n");
- return sa_notcovered;
- }
- sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
-#endif
- if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
- return sa_sched_group_nodes;
- if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
- return sa_nodemask;
- if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
- return sa_this_sibling_map;
- if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
- return sa_this_core_map;
- if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
- return sa_this_book_map;
- if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
- return sa_send_covered;
+ memset(d, 0, sizeof(*d));
+
+ if (__sdt_alloc(cpu_map))
+ return sa_sd_storage;
+ d->sd = alloc_percpu(struct sched_domain *);
+ if (!d->sd)
+ return sa_sd_storage;
d->rd = alloc_rootdomain();
- if (!d->rd) {
- printk(KERN_WARNING "Cannot alloc root domain\n");
- return sa_tmpmask;
- }
+ if (!d->rd)
+ return sa_sd;
return sa_rootdomain;
}
-static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
+/*
+ * NULL the sd_data elements we've used to build the sched_domain and
+ * sched_group structure so that the subsequent __free_domain_allocs()
+ * will not free the data we're using.
+ */
+static void claim_allocations(int cpu, struct sched_domain *sd)
{
- struct sched_domain *sd = NULL;
-#ifdef CONFIG_NUMA
- struct sched_domain *parent;
-
- d->sd_allnodes = 0;
- if (cpumask_weight(cpu_map) >
- SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
- sd = &per_cpu(allnodes_domains, i).sd;
- SD_INIT(sd, ALLNODES);
- set_domain_attribute(sd, attr);
- cpumask_copy(sched_domain_span(sd), cpu_map);
- cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
- d->sd_allnodes = 1;
- }
- parent = sd;
-
- sd = &per_cpu(node_domains, i).sd;
- SD_INIT(sd, NODE);
- set_domain_attribute(sd, attr);
- sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
- sd->parent = parent;
- if (parent)
- parent->child = sd;
- cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
-#endif
- return sd;
-}
+ struct sd_data *sdd = sd->private;
+ struct sched_group *sg = sd->groups;
-static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *parent, int i)
-{
- struct sched_domain *sd;
- sd = &per_cpu(phys_domains, i).sd;
- SD_INIT(sd, CPU);
- set_domain_attribute(sd, attr);
- cpumask_copy(sched_domain_span(sd), d->nodemask);
- sd->parent = parent;
- if (parent)
- parent->child = sd;
- cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
- return sd;
-}
+ WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+ *per_cpu_ptr(sdd->sd, cpu) = NULL;
-static struct sched_domain *__build_book_sched_domain(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *parent, int i)
-{
- struct sched_domain *sd = parent;
-#ifdef CONFIG_SCHED_BOOK
- sd = &per_cpu(book_domains, i).sd;
- SD_INIT(sd, BOOK);
- set_domain_attribute(sd, attr);
- cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
- sd->parent = parent;
- parent->child = sd;
- cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
-#endif
- return sd;
+ if (cpu == cpumask_first(sched_group_cpus(sg))) {
+ WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
+ *per_cpu_ptr(sdd->sg, cpu) = NULL;
+ }
}
-static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *parent, int i)
+#ifdef CONFIG_SCHED_SMT
+static const struct cpumask *cpu_smt_mask(int cpu)
{
- struct sched_domain *sd = parent;
-#ifdef CONFIG_SCHED_MC
- sd = &per_cpu(core_domains, i).sd;
- SD_INIT(sd, MC);
- set_domain_attribute(sd, attr);
- cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
- sd->parent = parent;
- parent->child = sd;
- cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
-#endif
- return sd;
+ return topology_thread_cpumask(cpu);
}
-
-static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *parent, int i)
-{
- struct sched_domain *sd = parent;
-#ifdef CONFIG_SCHED_SMT
- sd = &per_cpu(cpu_domains, i).sd;
- SD_INIT(sd, SIBLING);
- set_domain_attribute(sd, attr);
- cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
- sd->parent = parent;
- parent->child = sd;
- cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
#endif
- return sd;
-}
-static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
- const struct cpumask *cpu_map, int cpu)
-{
- switch (l) {
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
- case SD_LV_SIBLING: /* set up CPU (sibling) groups */
- cpumask_and(d->this_sibling_map, cpu_map,
- topology_thread_cpumask(cpu));
- if (cpu == cpumask_first(d->this_sibling_map))
- init_sched_build_groups(d->this_sibling_map, cpu_map,
- &cpu_to_cpu_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_SIBLING, cpu_smt_mask, },
#endif
#ifdef CONFIG_SCHED_MC
- case SD_LV_MC: /* set up multi-core groups */
- cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
- if (cpu == cpumask_first(d->this_core_map))
- init_sched_build_groups(d->this_core_map, cpu_map,
- &cpu_to_core_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_MC, cpu_coregroup_mask, },
#endif
#ifdef CONFIG_SCHED_BOOK
- case SD_LV_BOOK: /* set up book groups */
- cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
- if (cpu == cpumask_first(d->this_book_map))
- init_sched_build_groups(d->this_book_map, cpu_map,
- &cpu_to_book_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_BOOK, cpu_book_mask, },
#endif
- case SD_LV_CPU: /* set up physical groups */
- cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
- if (!cpumask_empty(d->nodemask))
- init_sched_build_groups(d->nodemask, cpu_map,
- &cpu_to_phys_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_CPU, cpu_cpu_mask, },
#ifdef CONFIG_NUMA
- case SD_LV_ALLNODES:
- init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_NODE, cpu_node_mask, },
+ { sd_init_ALLNODES, cpu_allnodes_mask, },
#endif
- default:
- break;
+ { NULL, },
+};
+
+static struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+static int __sdt_alloc(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for (tl = sched_domain_topology; tl->init; tl++) {
+ struct sd_data *sdd = &tl->data;
+
+ sdd->sd = alloc_percpu(struct sched_domain *);
+ if (!sdd->sd)
+ return -ENOMEM;
+
+ sdd->sg = alloc_percpu(struct sched_group *);
+ if (!sdd->sg)
+ return -ENOMEM;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sd)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sd, j) = sd;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sg)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sg, j) = sg;
+ }
+ }
+
+ return 0;
+}
+
+static void __sdt_free(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for (tl = sched_domain_topology; tl->init; tl++) {
+ struct sd_data *sdd = &tl->data;
+
+ for_each_cpu(j, cpu_map) {
+ kfree(*per_cpu_ptr(sdd->sd, j));
+ kfree(*per_cpu_ptr(sdd->sg, j));
+ }
+ free_percpu(sdd->sd);
+ free_percpu(sdd->sg);
+ }
+}
+
+struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+ struct s_data *d, const struct cpumask *cpu_map,
+ struct sched_domain_attr *attr, struct sched_domain *child,
+ int cpu)
+{
+ struct sched_domain *sd = tl->init(tl, cpu);
+ if (!sd)
+ return child;
+
+ set_domain_attribute(sd, attr);
+ cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+ if (child) {
+ sd->level = child->level + 1;
+ sched_domain_level_max = max(sched_domain_level_max, sd->level);
+ child->parent = sd;
}
+ sd->child = child;
+
+ return sd;
}
/*
* Build sched domains for a given set of cpus and attach the sched domains
* to the individual cpus
*/
-static int __build_sched_domains(const struct cpumask *cpu_map,
- struct sched_domain_attr *attr)
+static int build_sched_domains(const struct cpumask *cpu_map,
+ struct sched_domain_attr *attr)
{
enum s_alloc alloc_state = sa_none;
- struct s_data d;
struct sched_domain *sd;
- int i;
-#ifdef CONFIG_NUMA
- d.sd_allnodes = 0;
-#endif
+ struct s_data d;
+ int i, ret = -ENOMEM;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
if (alloc_state != sa_rootdomain)
goto error;
- alloc_state = sa_sched_groups;
-
- /*
- * Set up domains for cpus specified by the cpu_map.
- */
- for_each_cpu(i, cpu_map) {
- cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
- cpu_map);
-
- sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
- sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
- sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
- sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
- sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
- }
+ /* Set up domains for cpus specified by the cpu_map. */
for_each_cpu(i, cpu_map) {
- build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
- build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
- build_sched_groups(&d, SD_LV_MC, cpu_map, i);
- }
+ struct sched_domain_topology_level *tl;
- /* Set up physical groups */
- for (i = 0; i < nr_node_ids; i++)
- build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
+ sd = NULL;
+ for (tl = sched_domain_topology; tl->init; tl++)
+ sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
-#ifdef CONFIG_NUMA
- /* Set up node groups */
- if (d.sd_allnodes)
- build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
+ while (sd->child)
+ sd = sd->child;
- for (i = 0; i < nr_node_ids; i++)
- if (build_numa_sched_groups(&d, cpu_map, i))
- goto error;
-#endif
-
- /* Calculate CPU power for physical packages and nodes */
-#ifdef CONFIG_SCHED_SMT
- for_each_cpu(i, cpu_map) {
- sd = &per_cpu(cpu_domains, i).sd;
- init_sched_groups_power(i, sd);
- }
-#endif
-#ifdef CONFIG_SCHED_MC
- for_each_cpu(i, cpu_map) {
- sd = &per_cpu(core_domains, i).sd;
- init_sched_groups_power(i, sd);
- }
-#endif
-#ifdef CONFIG_SCHED_BOOK
- for_each_cpu(i, cpu_map) {
- sd = &per_cpu(book_domains, i).sd;
- init_sched_groups_power(i, sd);
+ *per_cpu_ptr(d.sd, i) = sd;
}
-#endif
+ /* Build the groups for the domains */
for_each_cpu(i, cpu_map) {
- sd = &per_cpu(phys_domains, i).sd;
- init_sched_groups_power(i, sd);
- }
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ sd->span_weight = cpumask_weight(sched_domain_span(sd));
+ get_group(i, sd->private, &sd->groups);
+ atomic_inc(&sd->groups->ref);
-#ifdef CONFIG_NUMA
- for (i = 0; i < nr_node_ids; i++)
- init_numa_sched_groups_power(d.sched_group_nodes[i]);
+ if (i != cpumask_first(sched_domain_span(sd)))
+ continue;
- if (d.sd_allnodes) {
- struct sched_group *sg;
+ build_sched_groups(sd);
+ }
+ }
+
+ /* Calculate CPU power for physical packages and nodes */
+ for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ if (!cpumask_test_cpu(i, cpu_map))
+ continue;
- cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
- d.tmpmask);
- init_numa_sched_groups_power(sg);
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ claim_allocations(i, sd);
+ init_sched_groups_power(i, sd);
+ }
}
-#endif
/* Attach the domains */
+ rcu_read_lock();
for_each_cpu(i, cpu_map) {
-#ifdef CONFIG_SCHED_SMT
- sd = &per_cpu(cpu_domains, i).sd;
-#elif defined(CONFIG_SCHED_MC)
- sd = &per_cpu(core_domains, i).sd;
-#elif defined(CONFIG_SCHED_BOOK)
- sd = &per_cpu(book_domains, i).sd;
-#else
- sd = &per_cpu(phys_domains, i).sd;
-#endif
+ sd = *per_cpu_ptr(d.sd, i);
cpu_attach_domain(sd, d.rd, i);
}
+ rcu_read_unlock();
- d.sched_group_nodes = NULL; /* don't free this we still need it */
- __free_domain_allocs(&d, sa_tmpmask, cpu_map);
- return 0;
-
+ ret = 0;
error:
__free_domain_allocs(&d, alloc_state, cpu_map);
- return -ENOMEM;
-}
-
-static int build_sched_domains(const struct cpumask *cpu_map)
-{
- return __build_sched_domains(cpu_map, NULL);
+ return ret;
}
static cpumask_var_t *doms_cur; /* current sched domains */
@@ -7670,7 +7360,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
* For now this just excludes isolated cpus, but could be used to
* exclude other special cases in the future.
*/
-static int arch_init_sched_domains(const struct cpumask *cpu_map)
+static int init_sched_domains(const struct cpumask *cpu_map)
{
int err;
@@ -7681,32 +7371,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
doms_cur = &fallback_doms;
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
dattr_cur = NULL;
- err = build_sched_domains(doms_cur[0]);
+ err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();
return err;
}
-static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
- struct cpumask *tmpmask)
-{
- free_sched_groups(cpu_map, tmpmask);
-}
-
/*
* Detach sched domains from a group of cpus specified in cpu_map
* These cpus will now be attached to the NULL domain
*/
static void detach_destroy_domains(const struct cpumask *cpu_map)
{
- /* Save because hotplug lock held. */
- static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
int i;
+ rcu_read_lock();
for_each_cpu(i, cpu_map)
cpu_attach_domain(NULL, &def_root_domain, i);
- synchronize_sched();
- arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
+ rcu_read_unlock();
}
/* handle null as "default" */
@@ -7795,8 +7477,7 @@ match1:
goto match2;
}
/* no match - add a new doms_new */
- __build_sched_domains(doms_new[i],
- dattr_new ? dattr_new + i : NULL);
+ build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
match2:
;
}
@@ -7815,7 +7496,7 @@ match2:
}
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static void arch_reinit_sched_domains(void)
+static void reinit_sched_domains(void)
{
get_online_cpus();
@@ -7848,7 +7529,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
else
sched_mc_power_savings = level;
- arch_reinit_sched_domains();
+ reinit_sched_domains();
return count;
}
@@ -7967,14 +7648,9 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
-#if defined(CONFIG_NUMA)
- sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
- GFP_KERNEL);
- BUG_ON(sched_group_nodes_bycpu == NULL);
-#endif
get_online_cpus();
mutex_lock(&sched_domains_mutex);
- arch_init_sched_domains(cpu_active_mask);
+ init_sched_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -8281,6 +7957,7 @@ void __init sched_init(void)
/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
+ zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
#ifdef CONFIG_NO_HZ
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -8340,7 +8017,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
int old_prio = p->prio;
int on_rq;
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
if (on_rq)
deactivate_task(rq, p, 0);
__setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8683,7 +8360,7 @@ void sched_move_task(struct task_struct *tsk)
rq = task_rq_lock(tsk, &flags);
running = task_current(rq, tsk);
- on_rq = tsk->se.on_rq;
+ on_rq = tsk->on_rq;
if (on_rq)
dequeue_task(rq, tsk, 0);
@@ -8702,7 +8379,7 @@ void sched_move_task(struct task_struct *tsk)
if (on_rq)
enqueue_task(rq, tsk, 0);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, tsk, &flags);
}
#endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 7bacd83a4158..3669bec6e130 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
read_lock_irqsave(&tasklist_lock, flags);
do_each_thread(g, p) {
- if (!p->se.on_rq || task_cpu(p) != rq_cpu)
+ if (!p->on_rq || task_cpu(p) != rq_cpu)
continue;
print_task(m, rq, p);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6fa833ab2cb8..87445931a179 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -358,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
}
cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+#ifndef CONFIG_64BIT
+ smp_wmb();
+ cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
}
/*
@@ -1372,12 +1376,25 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
#ifdef CONFIG_SMP
-static void task_waking_fair(struct rq *rq, struct task_struct *p)
+static void task_waking_fair(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 min_vruntime;
- se->vruntime -= cfs_rq->min_vruntime;
+#ifndef CONFIG_64BIT
+ u64 min_vruntime_copy;
+
+ do {
+ min_vruntime_copy = cfs_rq->min_vruntime_copy;
+ smp_rmb();
+ min_vruntime = cfs_rq->min_vruntime;
+ } while (min_vruntime != min_vruntime_copy);
+#else
+ min_vruntime = cfs_rq->min_vruntime;
+#endif
+
+ se->vruntime -= min_vruntime;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1622,6 +1639,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
*/
+ rcu_read_lock();
for_each_domain(target, sd) {
if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
break;
@@ -1641,6 +1659,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
break;
}
+ rcu_read_unlock();
return target;
}
@@ -1657,7 +1676,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
* preempt must be disabled.
*/
static int
-select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
@@ -1673,6 +1692,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
new_cpu = prev_cpu;
}
+ rcu_read_lock();
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
continue;
@@ -1723,9 +1743,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
if (affine_sd) {
if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
- return select_idle_sibling(p, cpu);
- else
- return select_idle_sibling(p, prev_cpu);
+ prev_cpu = cpu;
+
+ new_cpu = select_idle_sibling(p, prev_cpu);
+ goto unlock;
}
while (sd) {
@@ -1766,6 +1787,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
}
/* while loop will break here if sd == NULL */
}
+unlock:
+ rcu_read_unlock();
return new_cpu;
}
@@ -1789,10 +1812,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
* This is especially important for buddies when the leftmost
* task is higher priority than the buddy.
*/
- if (unlikely(se->load.weight != NICE_0_LOAD))
- gran = calc_delta_fair(gran, se);
-
- return gran;
+ return calc_delta_fair(gran, se);
}
/*
@@ -2648,7 +2668,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
/*
* Only siblings can have significantly less than SCHED_LOAD_SCALE
*/
- if (sd->level != SD_LV_SIBLING)
+ if (!(sd->flags & SD_SHARE_CPUPOWER))
return 0;
/*
@@ -3465,6 +3485,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
raw_spin_unlock(&this_rq->lock);
update_shares(this_cpu);
+ rcu_read_lock();
for_each_domain(this_cpu, sd) {
unsigned long interval;
int balance = 1;
@@ -3486,6 +3507,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
break;
}
}
+ rcu_read_unlock();
raw_spin_lock(&this_rq->lock);
@@ -3534,6 +3556,7 @@ static int active_load_balance_cpu_stop(void *data)
double_lock_balance(busiest_rq, target_rq);
/* Search for an sd spanning us and the target CPU. */
+ rcu_read_lock();
for_each_domain(target_cpu, sd) {
if ((sd->flags & SD_LOAD_BALANCE) &&
cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3549,6 +3572,7 @@ static int active_load_balance_cpu_stop(void *data)
else
schedstat_inc(sd, alb_failed);
}
+ rcu_read_unlock();
double_unlock_balance(busiest_rq, target_rq);
out_unlock:
busiest_rq->active_balance = 0;
@@ -3675,6 +3699,7 @@ static int find_new_ilb(int cpu)
{
struct sched_domain *sd;
struct sched_group *ilb_group;
+ int ilb = nr_cpu_ids;
/*
* Have idle load balancer selection from semi-idle packages only
@@ -3690,20 +3715,25 @@ static int find_new_ilb(int cpu)
if (cpumask_weight(nohz.idle_cpus_mask) < 2)
goto out_done;
+ rcu_read_lock();
for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
ilb_group = sd->groups;
do {
- if (is_semi_idle_group(ilb_group))
- return cpumask_first(nohz.grp_idle_mask);
+ if (is_semi_idle_group(ilb_group)) {
+ ilb = cpumask_first(nohz.grp_idle_mask);
+ goto unlock;
+ }
ilb_group = ilb_group->next;
} while (ilb_group != sd->groups);
}
+unlock:
+ rcu_read_unlock();
out_done:
- return nr_cpu_ids;
+ return ilb;
}
#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
static inline int find_new_ilb(int call_cpu)
@@ -3848,6 +3878,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
update_shares(cpu);
+ rcu_read_lock();
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
@@ -3893,6 +3924,7 @@ out:
if (!balance)
break;
}
+ rcu_read_unlock();
/*
* next_balance will be updated only when there is a need.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 68e69acc29b9..be40f7371ee1 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1)
* Decrement CPU power based on irq activity
*/
SCHED_FEAT(NONIRQ_POWER, 1)
+
+/*
+ * Queue remote wakeups on the target CPU and process them
+ * using the scheduler IPI. Reduces rq->lock contention/bounces.
+ */
+SCHED_FEAT(TTWU_QUEUE, 1)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a776a6396427..0a51882534ea 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -7,7 +7,7 @@
#ifdef CONFIG_SMP
static int
-select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
+select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index e7cebdc65f82..19ecb3127379 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -977,13 +977,23 @@ static void yield_task_rt(struct rq *rq)
static int find_lowest_rq(struct task_struct *task);
static int
-select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
{
+ struct task_struct *curr;
+ struct rq *rq;
+ int cpu;
+
if (sd_flag != SD_BALANCE_WAKE)
return smp_processor_id();
+ cpu = task_cpu(p);
+ rq = cpu_rq(cpu);
+
+ rcu_read_lock();
+ curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+
/*
- * If the current task is an RT task, then
+ * If the current task on @p's runqueue is an RT task, then
* try to see if we can wake this RT task up on another
* runqueue. Otherwise simply start this RT task
* on its current runqueue.
@@ -997,21 +1007,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
* lock?
*
* For equal prio tasks, we just let the scheduler sort it out.
+ *
+ * Otherwise, just let it ride on the affined RQ and the
+ * post-schedule router will push the preempted task away
+ *
+ * This test is optimistic, if we get it wrong the load-balancer
+ * will have to sort it out.
*/
- if (unlikely(rt_task(rq->curr)) &&
- (rq->curr->rt.nr_cpus_allowed < 2 ||
- rq->curr->prio < p->prio) &&
+ if (curr && unlikely(rt_task(curr)) &&
+ (curr->rt.nr_cpus_allowed < 2 ||
+ curr->prio < p->prio) &&
(p->rt.nr_cpus_allowed > 1)) {
- int cpu = find_lowest_rq(p);
+ int target = find_lowest_rq(p);
- return (cpu == -1) ? task_cpu(p) : cpu;
+ if (target != -1)
+ cpu = target;
}
+ rcu_read_unlock();
- /*
- * Otherwise, just let it ride on the affined RQ and the
- * post-schedule router will push the preempted task away
- */
- return task_cpu(p);
+ return cpu;
}
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
@@ -1136,7 +1150,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
* The previous task needs to be made eligible for pushing
* if it is still active
*/
- if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
+ if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
@@ -1287,7 +1301,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
!cpumask_test_cpu(lowest_rq->cpu,
&task->cpus_allowed) ||
task_running(rq, task) ||
- !task->se.on_rq)) {
+ !task->on_rq)) {
raw_spin_unlock(&lowest_rq->lock);
lowest_rq = NULL;
@@ -1321,7 +1335,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(task_current(rq, p));
BUG_ON(p->rt.nr_cpus_allowed <= 1);
- BUG_ON(!p->se.on_rq);
+ BUG_ON(!p->on_rq);
BUG_ON(!rt_task(p));
return p;
@@ -1467,7 +1481,7 @@ static int pull_rt_task(struct rq *this_rq)
*/
if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
WARN_ON(p == src_rq->curr);
- WARN_ON(!p->se.on_rq);
+ WARN_ON(!p->on_rq);
/*
* There's a chance that p is higher in priority
@@ -1538,7 +1552,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
* Update the migration status of the RQ if we have an RT task
* which is running AND changing its weight value.
*/
- if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
+ if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
struct rq *rq = task_rq(p);
if (!task_current(rq, p)) {
@@ -1608,7 +1622,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
* we may need to handle the pulling of RT tasks
* now.
*/
- if (p->se.on_rq && !rq->rt.rt_nr_running)
+ if (p->on_rq && !rq->rt.rt_nr_running)
pull_rt_task(rq);
}
@@ -1638,7 +1652,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
* If that current running task is also an RT task
* then see if we can move to another run queue.
*/
- if (p->se.on_rq && rq->curr != p) {
+ if (p->on_rq && rq->curr != p) {
#ifdef CONFIG_SMP
if (rq->rt.overloaded && push_rt_task(rq) &&
/* Don't resched if we changed runqueues */
@@ -1657,7 +1671,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
- if (!p->se.on_rq)
+ if (!p->on_rq)
return;
if (rq->curr == p) {
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 1ba2bd40fdac..6f437632afab 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -9,8 +9,7 @@
#ifdef CONFIG_SMP
static int
-select_task_rq_stop(struct rq *rq, struct task_struct *p,
- int sd_flag, int flags)
+select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
{
return task_cpu(p); /* stop tasks as never migrate */
}
@@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
{
struct task_struct *stop = rq->stop;
- if (stop && stop->se.on_rq)
+ if (stop && stop->on_rq)
return stop;
return NULL;
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 66f040b30729..86c87e214b11 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -113,13 +113,61 @@ OPTIONS
Do various checks like samples ordering and lost events.
-f::
---fields
+--fields::
Comma separated list of fields to print. Options are:
comm, tid, pid, time, cpu, event, trace, sym. Field
- list must be prepended with the type, trace, sw or hw,
+ list can be prepended with the type, trace, sw or hw,
to indicate to which event type the field list applies.
e.g., -f sw:comm,tid,time,sym and -f trace:time,cpu,trace
+ perf script -f <fields>
+
+ is equivalent to:
+
+ perf script -f trace:<fields> -f sw:<fields> -f hw:<fields>
+
+ i.e., the specified fields apply to all event types if the type string
+ is not given.
+
+ The arguments are processed in the order received. A later usage can
+ reset a prior request. e.g.:
+
+ -f trace: -f comm,tid,time,sym
+
+ The first -f suppresses trace events (field list is ""), but then the
+ second invocation sets the fields to comm,tid,time,sym. In this case a
+ warning is given to the user:
+
+ "Overriding previous field request for all events."
+
+ Alternativey, consider the order:
+
+ -f comm,tid,time,sym -f trace:
+
+ The first -f sets the fields for all events and the second -f
+ suppresses trace events. The user is given a warning message about
+ the override, and the result of the above is that only S/W and H/W
+ events are displayed with the given fields.
+
+ For the 'wildcard' option if a user selected field is invalid for an
+ event type, a message is displayed to the user that the option is
+ ignored for that type. For example:
+
+ $ perf script -f comm,tid,trace
+ 'trace' not valid for hardware events. Ignoring.
+ 'trace' not valid for software events. Ignoring.
+
+ Alternatively, if the type is given an invalid field is specified it
+ is an error. For example:
+
+ perf script -v -f sw:comm,tid,trace
+ 'trace' not valid for software events.
+
+ At this point usage is displayed, and perf-script exits.
+
+ Finally, a user may not set fields to none for all event types.
+ i.e., -f "" is not allowed.
+
-k::
--vmlinux=<file>::
vmlinux pathname
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index ac574ea23917..6cf811acc41b 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -49,23 +49,52 @@ struct output_option {
};
/* default set to maintain compatibility with current format */
-static u64 output_fields[PERF_TYPE_MAX] = {
- [PERF_TYPE_HARDWARE] = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | \
- PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | \
- PERF_OUTPUT_EVNAME | PERF_OUTPUT_SYM,
-
- [PERF_TYPE_SOFTWARE] = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | \
- PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | \
- PERF_OUTPUT_EVNAME | PERF_OUTPUT_SYM,
-
- [PERF_TYPE_TRACEPOINT] = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | \
- PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | \
- PERF_OUTPUT_EVNAME | PERF_OUTPUT_TRACE,
+static struct {
+ bool user_set;
+ u64 fields;
+ u64 invalid_fields;
+} output[PERF_TYPE_MAX] = {
+
+ [PERF_TYPE_HARDWARE] = {
+ .user_set = false,
+
+ .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID |
+ PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
+ PERF_OUTPUT_EVNAME | PERF_OUTPUT_SYM,
+
+ .invalid_fields = PERF_OUTPUT_TRACE,
+ },
+
+ [PERF_TYPE_SOFTWARE] = {
+ .user_set = false,
+
+ .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID |
+ PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
+ PERF_OUTPUT_EVNAME | PERF_OUTPUT_SYM,
+
+ .invalid_fields = PERF_OUTPUT_TRACE,
+ },
+
+ [PERF_TYPE_TRACEPOINT] = {
+ .user_set = false,
+
+ .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID |
+ PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
+ PERF_OUTPUT_EVNAME | PERF_OUTPUT_TRACE,
+ },
};
-static bool output_set_by_user;
+static bool output_set_by_user(void)
+{
+ int j;
+ for (j = 0; j < PERF_TYPE_MAX; ++j) {
+ if (output[j].user_set)
+ return true;
+ }
+ return false;
+}
-#define PRINT_FIELD(x) (output_fields[attr->type] & PERF_OUTPUT_##x)
+#define PRINT_FIELD(x) (output[attr->type].fields & PERF_OUTPUT_##x)
static int perf_session__check_attr(struct perf_session *session,
struct perf_event_attr *attr)
@@ -168,7 +197,7 @@ static void process_event(union perf_event *event __unused,
{
struct perf_event_attr *attr = &evsel->attr;
- if (output_fields[attr->type] == 0)
+ if (output[attr->type].fields == 0)
return;
if (perf_session__check_attr(session, attr) < 0)
@@ -451,6 +480,7 @@ static int parse_output_fields(const struct option *opt __used,
{
char *tok;
int i, imax = sizeof(all_output_options) / sizeof(struct output_option);
+ int j;
int rc = 0;
char *str = strdup(arg);
int type = -1;
@@ -458,52 +488,95 @@ static int parse_output_fields(const struct option *opt __used,
if (!str)
return -ENOMEM;
- tok = strtok(str, ":");
- if (!tok) {
- fprintf(stderr,
- "Invalid field string - not prepended with type.");
- return -EINVAL;
- }
-
- /* first word should state which event type user
- * is specifying the fields
+ /* first word can state for which event type the user is specifying
+ * the fields. If no type exists, the specified fields apply to all
+ * event types found in the file minus the invalid fields for a type.
*/
- if (!strcmp(tok, "hw"))
- type = PERF_TYPE_HARDWARE;
- else if (!strcmp(tok, "sw"))
- type = PERF_TYPE_SOFTWARE;
- else if (!strcmp(tok, "trace"))
- type = PERF_TYPE_TRACEPOINT;
- else {
- fprintf(stderr, "Invalid event type in field string.");
- return -EINVAL;
+ tok = strchr(str, ':');
+ if (tok) {
+ *tok = '\0';
+ tok++;
+ if (!strcmp(str, "hw"))
+ type = PERF_TYPE_HARDWARE;
+ else if (!strcmp(str, "sw"))
+ type = PERF_TYPE_SOFTWARE;
+ else if (!strcmp(str, "trace"))
+ type = PERF_TYPE_TRACEPOINT;
+ else {
+ fprintf(stderr, "Invalid event type in field string.\n");
+ return -EINVAL;
+ }
+
+ if (output[type].user_set)
+ pr_warning("Overriding previous field request for %s events.\n",
+ event_type(type));
+
+ output[type].fields = 0;
+ output[type].user_set = true;
+
+ } else {
+ tok = str;
+ if (strlen(str) == 0) {
+ fprintf(stderr,
+ "Cannot set fields to 'none' for all event types.\n");
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (output_set_by_user())
+ pr_warning("Overriding previous field request for all events.\n");
+
+ for (j = 0; j < PERF_TYPE_MAX; ++j) {
+ output[j].fields = 0;
+ output[j].user_set = true;
+ }
}
- output_fields[type] = 0;
- while (1) {
- tok = strtok(NULL, ",");
- if (!tok)
- break;
+ tok = strtok(tok, ",");
+ while (tok) {
for (i = 0; i < imax; ++i) {
- if (strcmp(tok, all_output_options[i].str) == 0) {
- output_fields[type] |= all_output_options[i].field;
+ if (strcmp(tok, all_output_options[i].str) == 0)
break;
- }
}
if (i == imax) {
- fprintf(stderr, "Invalid field requested.");
+ fprintf(stderr, "Invalid field requested.\n");
rc = -EINVAL;
- break;
+ goto out;
}
- }
- if (output_fields[type] == 0) {
- pr_debug("No fields requested for %s type. "
- "Events will not be displayed\n", event_type(type));
+ if (type == -1) {
+ /* add user option to all events types for
+ * which it is valid
+ */
+ for (j = 0; j < PERF_TYPE_MAX; ++j) {
+ if (output[j].invalid_fields & all_output_options[i].field) {
+ pr_warning("\'%s\' not valid for %s events. Ignoring.\n",
+ all_output_options[i].str, event_type(j));
+ } else
+ output[j].fields |= all_output_options[i].field;
+ }
+ } else {
+ if (output[type].invalid_fields & all_output_options[i].field) {
+ fprintf(stderr, "\'%s\' not valid for %s events.\n",
+ all_output_options[i].str, event_type(type));
+
+ rc = -EINVAL;
+ goto out;
+ }
+ output[type].fields |= all_output_options[i].field;
+ }
+
+ tok = strtok(NULL, ",");
}
- output_set_by_user = true;
+ if (type >= 0) {
+ if (output[type].fields == 0) {
+ pr_debug("No fields requested for %s type. "
+ "Events will not be displayed.\n", event_type(type));
+ }
+ }
+out:
free(str);
return rc;
}
@@ -1020,7 +1093,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __used)
struct stat perf_stat;
int input;
- if (output_set_by_user) {
+ if (output_set_by_user()) {
fprintf(stderr,
"custom fields not supported for generated scripts");
return -1;
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index b7c85ce466a1..a7c7145a8d4f 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -1471,6 +1471,38 @@ static int find_probe_point_by_func(struct probe_finder *pf)
return _param.retval;
}
+struct pubname_callback_param {
+ char *function;
+ char *file;
+ Dwarf_Die *cu_die;
+ Dwarf_Die *sp_die;
+ int found;
+};
+
+static int pubname_search_cb(Dwarf *dbg, Dwarf_Global *gl, void *data)
+{
+ struct pubname_callback_param *param = data;
+
+ if (dwarf_offdie(dbg, gl->die_offset, param->sp_die)) {
+ if (dwarf_tag(param->sp_die) != DW_TAG_subprogram)
+ return DWARF_CB_OK;
+
+ if (die_compare_name(param->sp_die, param->function)) {
+ if (!dwarf_offdie(dbg, gl->cu_offset, param->cu_die))
+ return DWARF_CB_OK;
+
+ if (param->file &&
+ strtailcmp(param->file, dwarf_decl_file(param->sp_die)))
+ return DWARF_CB_OK;
+
+ param->found = 1;
+ return DWARF_CB_ABORT;
+ }
+ }
+
+ return DWARF_CB_OK;
+}
+
/* Find probe points from debuginfo */
static int find_probes(int fd, struct probe_finder *pf)
{
@@ -1498,6 +1530,27 @@ static int find_probes(int fd, struct probe_finder *pf)
off = 0;
line_list__init(&pf->lcache);
+
+ /* Fastpath: lookup by function name from .debug_pubnames section */
+ if (pp->function) {
+ struct pubname_callback_param pubname_param = {
+ .function = pp->function,
+ .file = pp->file,
+ .cu_die = &pf->cu_die,
+ .sp_die = &pf->sp_die,
+ };
+ struct dwarf_callback_param probe_param = {
+ .data = pf,
+ };
+
+ dwarf_getpubnames(dbg, pubname_search_cb, &pubname_param, 0);
+ if (pubname_param.found) {
+ ret = probe_point_search_cb(&pf->sp_die, &probe_param);
+ if (ret)
+ goto found;
+ }
+ }
+
/* Loop on CUs (Compilation Unit) */
while (!dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL)) {
/* Get the DIE(Debugging Information Entry) of this CU */
@@ -1525,6 +1578,8 @@ static int find_probes(int fd, struct probe_finder *pf)
}
off = noff;
}
+
+found:
line_list__free(&pf->lcache);
if (dwfl)
dwfl_end(dwfl);
@@ -1946,6 +2001,22 @@ int find_line_range(int fd, struct line_range *lr)
return -EBADF;
}
+ /* Fastpath: lookup by function name from .debug_pubnames section */
+ if (lr->function) {
+ struct pubname_callback_param pubname_param = {
+ .function = lr->function, .file = lr->file,
+ .cu_die = &lf.cu_die, .sp_die = &lf.sp_die, .found = 0};
+ struct dwarf_callback_param line_range_param = {
+ .data = (void *)&lf, .retval = 0};
+
+ dwarf_getpubnames(dbg, pubname_search_cb, &pubname_param, 0);
+ if (pubname_param.found) {
+ line_range_search_cb(&lf.sp_die, &line_range_param);
+ if (lf.found)
+ goto found;
+ }
+ }
+
/* Loop on CUs (Compilation Unit) */
while (!lf.found && ret >= 0) {
if (dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL) != 0)
@@ -1974,6 +2045,7 @@ int find_line_range(int fd, struct line_range *lr)
off = noff;
}
+found:
/* Store comp_dir */
if (lf.found) {
comp_dir = cu_get_comp_dir(&lf.cu_die);
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h
index beaefc3c1223..605730a366db 100644
--- a/tools/perf/util/probe-finder.h
+++ b/tools/perf/util/probe-finder.h
@@ -49,6 +49,7 @@ struct probe_finder {
Dwarf_Addr addr; /* Address */
const char *fname; /* Real file name */
Dwarf_Die cu_die; /* Current CU */
+ Dwarf_Die sp_die;
struct list_head lcache; /* Line cache for lazy match */
/* For variable searching */
@@ -83,6 +84,7 @@ struct line_finder {
int lno_s; /* Start line number */
int lno_e; /* End line number */
Dwarf_Die cu_die; /* Current CU */
+ Dwarf_Die sp_die;
int found;
};