diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2011-04-18 12:04:30 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2011-04-18 12:04:30 +1000 |
commit | be839f909ce32f83a384db93415b8730c9aa72ae (patch) | |
tree | b579432cd73a32ff0ee80f3ddbedf0fba8af9700 | |
parent | 9bd02ee4b08d49da5112f240ae6e046f7beb87a1 (diff) | |
parent | 7d92e647a604becab4df58de947008e43a9990c0 (diff) |
Merge remote-tracking branch 'tip/auto-latest'
105 files changed, 1502 insertions, 1739 deletions
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 42aa078a5e4d..5a621c6d22ab 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -585,8 +585,7 @@ handle_ipi(struct pt_regs *regs) switch (which) { case IPI_RESCHEDULE: - /* Reschedule callback. Everything to be done - is done by the interrupt return path. */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index a58e84f1a63b..d73d400cc9ed 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ -374,8 +374,7 @@ static struct clocksource clocksource_rpcc = { static inline void register_rpcc_clocksource(long cycle_freq) { - clocksource_calc_mult_shift(&clocksource_rpcc, cycle_freq, 4); - clocksource_register(&clocksource_rpcc); + clocksource_register_hz(&clocksource_rpcc, cycle_freq); } #else /* !CONFIG_SMP */ static inline void register_rpcc_clocksource(long cycle_freq) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index a0ad0540761f..fcf3c8e25601 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -567,10 +567,7 @@ asmlinkage void __exception_irq_entry do_IPI(int ipinr, struct pt_regs *regs) break; case IPI_RESCHEDULE: - /* - * nothing more to do - eveything is - * done on the interrupt return path - */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/blackfin/kernel/time-ts.c b/arch/blackfin/kernel/time-ts.c index cdb4beb6bc8f..9e9b60d969dc 100644 --- a/arch/blackfin/kernel/time-ts.c +++ b/arch/blackfin/kernel/time-ts.c @@ -23,29 +23,6 @@ #include <asm/gptimers.h> #include <asm/nmi.h> -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ - -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ #if defined(CONFIG_CYCLES_CLOCKSOURCE) @@ -63,7 +40,6 @@ static struct clocksource bfin_cs_cycles = { .rating = 400, .read = bfin_read_cycles, .mask = CLOCKSOURCE_MASK(64), - .shift = CYC2NS_SCALE_FACTOR, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -75,10 +51,7 @@ static inline unsigned long long bfin_cs_cycles_sched_clock(void) static int __init bfin_cs_cycles_init(void) { - bfin_cs_cycles.mult = \ - clocksource_hz2mult(get_cclk(), bfin_cs_cycles.shift); - - if (clocksource_register(&bfin_cs_cycles)) + if (clocksource_register_hz(&bfin_cs_cycles, get_cclk())) panic("failed to register clocksource"); return 0; @@ -111,7 +84,6 @@ static struct clocksource bfin_cs_gptimer0 = { .rating = 350, .read = bfin_read_gptimer0, .mask = CLOCKSOURCE_MASK(32), - .shift = CYC2NS_SCALE_FACTOR, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -125,10 +97,7 @@ static int __init bfin_cs_gptimer0_init(void) { setup_gptimer0(); - bfin_cs_gptimer0.mult = \ - clocksource_hz2mult(get_sclk(), bfin_cs_gptimer0.shift); - - if (clocksource_register(&bfin_cs_gptimer0)) + if (clocksource_register_hz(&bfin_cs_gptimer0, get_sclk())) panic("failed to register clocksource"); return 0; diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index 8bce5ed031e4..1fbd94c44457 100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c @@ -177,6 +177,9 @@ static irqreturn_t ipi_handler_int1(int irq, void *dev_instance) while (msg_queue->count) { msg = &msg_queue->ipi_message[msg_queue->head]; switch (msg->type) { + case BFIN_IPI_RESCHEDULE: + scheduler_ipi(); + break; case BFIN_IPI_CALL_FUNC: spin_unlock_irqrestore(&msg_queue->lock, flags); ipi_call_function(cpu, msg); diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index 4c9e3e1ba5d1..66cc75657e2f 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -342,15 +342,18 @@ irqreturn_t crisv32_ipi_interrupt(int irq, void *dev_id) ipi = REG_RD(intr_vect, irq_regs[smp_processor_id()], rw_ipi); + if (ipi.vector & IPI_SCHEDULE) { + scheduler_ipi(); + } if (ipi.vector & IPI_CALL) { - func(info); + func(info); } if (ipi.vector & IPI_FLUSH_TLB) { - if (flush_mm == FLUSH_ALL) - __flush_tlb_all(); - else if (flush_vma == FLUSH_ALL) + if (flush_mm == FLUSH_ALL) + __flush_tlb_all(); + else if (flush_vma == FLUSH_ALL) __flush_tlb_mm(flush_mm); - else + else __flush_tlb_page(flush_vma, flush_addr); } diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c index 1b811c61bdc6..f64097b5118a 100644 --- a/arch/ia64/kernel/cyclone.c +++ b/arch/ia64/kernel/cyclone.c @@ -31,8 +31,6 @@ static struct clocksource clocksource_cyclone = { .rating = 300, .read = read_cyclone, .mask = (1LL << 40) - 1, - .mult = 0, /*to be calculated*/ - .shift = 16, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -118,9 +116,7 @@ int __init init_cyclone_clock(void) /* initialize last tick */ cyclone_mc = cyclone_timer; clocksource_cyclone.fsys_mmio = cyclone_timer; - clocksource_cyclone.mult = clocksource_hz2mult(CYCLONE_TIMER_FREQ, - clocksource_cyclone.shift); - clocksource_register(&clocksource_cyclone); + clocksource_register_hz(&clocksource_cyclone, CYCLONE_TIMER_FREQ); return 0; } diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 5b704740f160..782c3a357f24 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -31,6 +31,7 @@ #include <linux/irq.h> #include <linux/ratelimit.h> #include <linux/acpi.h> +#include <linux/sched.h> #include <asm/delay.h> #include <asm/intrinsics.h> @@ -496,6 +497,7 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) smp_local_flush_tlb(); kstat_incr_irqs_this_cpu(irq, desc); } else if (unlikely(IS_RESCHEDULE(vector))) { + scheduler_ipi(); kstat_incr_irqs_this_cpu(irq, desc); } else { ia64_setreg(_IA64_REG_CR_TPR, vector); diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 156ad803d5b7..04440cc09b40 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -73,8 +73,6 @@ static struct clocksource clocksource_itc = { .rating = 350, .read = itc_get_cycles, .mask = CLOCKSOURCE_MASK(64), - .mult = 0, /*to be calculated*/ - .shift = 16, .flags = CLOCK_SOURCE_IS_CONTINUOUS, #ifdef CONFIG_PARAVIRT .resume = paravirt_clocksource_resume, @@ -365,11 +363,8 @@ ia64_init_itm (void) ia64_cpu_local_tick(); if (!itc_clocksource) { - /* Sort out mult/shift values: */ - clocksource_itc.mult = - clocksource_hz2mult(local_cpu_data->itc_freq, - clocksource_itc.shift); - clocksource_register(&clocksource_itc); + clocksource_register_hz(&clocksource_itc, + local_cpu_data->itc_freq); itc_clocksource = &clocksource_itc; } } diff --git a/arch/ia64/sn/kernel/sn2/timer.c b/arch/ia64/sn/kernel/sn2/timer.c index 21d6f09e3447..c34efda122e1 100644 --- a/arch/ia64/sn/kernel/sn2/timer.c +++ b/arch/ia64/sn/kernel/sn2/timer.c @@ -33,8 +33,6 @@ static struct clocksource clocksource_sn2 = { .rating = 450, .read = read_sn2, .mask = (1LL << 55) - 1, - .mult = 0, - .shift = 10, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -57,9 +55,7 @@ ia64_sn_udelay (unsigned long usecs) void __init sn_timer_init(void) { clocksource_sn2.fsys_mmio = RTC_COUNTER_ADDR; - clocksource_sn2.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, - clocksource_sn2.shift); - clocksource_register(&clocksource_sn2); + clocksource_register_hz(&clocksource_sn2, sn_rtc_cycles_per_second); ia64_udelay = &ia64_sn_udelay; } diff --git a/arch/ia64/xen/irq_xen.c b/arch/ia64/xen/irq_xen.c index 108bb858acf2..b279e142c633 100644 --- a/arch/ia64/xen/irq_xen.c +++ b/arch/ia64/xen/irq_xen.c @@ -92,6 +92,8 @@ static unsigned short saved_irq_cnt; static int xen_slab_ready; #ifdef CONFIG_SMP +#include <linux/sched.h> + /* Dummy stub. Though we may check XEN_RESCHEDULE_VECTOR before __do_IRQ, * it ends up to issue several memory accesses upon percpu data and * thus adds unnecessary traffic to other paths. @@ -99,7 +101,13 @@ static int xen_slab_ready; static irqreturn_t xen_dummy_handler(int irq, void *dev_id) { + return IRQ_HANDLED; +} +static irqreturn_t +xen_resched_handler(int irq, void *dev_id) +{ + scheduler_ipi(); return IRQ_HANDLED; } @@ -110,7 +118,7 @@ static struct irqaction xen_ipi_irqaction = { }; static struct irqaction xen_resched_irqaction = { - .handler = xen_dummy_handler, + .handler = xen_resched_handler, .flags = IRQF_DISABLED, .name = "resched" }; diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c index 31cef20b2996..fc10b39893d4 100644 --- a/arch/m32r/kernel/smp.c +++ b/arch/m32r/kernel/smp.c @@ -122,8 +122,6 @@ void smp_send_reschedule(int cpu_id) * * Description: This routine executes on CPU which received * 'RESCHEDULE_IPI'. - * Rescheduling is processed at the exit of interrupt - * operation. * * Born on Date: 2002.02.05 * @@ -138,7 +136,7 @@ void smp_send_reschedule(int cpu_id) *==========================================================================*/ void smp_reschedule_interrupt(void) { - /* nothing to do */ + scheduler_ipi(); } /*==========================================================================* diff --git a/arch/microblaze/kernel/timer.c b/arch/microblaze/kernel/timer.c index d8a214f11ac2..e5550ce4e0eb 100644 --- a/arch/microblaze/kernel/timer.c +++ b/arch/microblaze/kernel/timer.c @@ -217,16 +217,12 @@ static struct clocksource clocksource_microblaze = { .rating = 300, .read = microblaze_read, .mask = CLOCKSOURCE_MASK(32), - .shift = 8, /* I can shift it */ .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static int __init microblaze_clocksource_init(void) { - clocksource_microblaze.mult = - clocksource_hz2mult(timer_clock_freq, - clocksource_microblaze.shift); - if (clocksource_register(&clocksource_microblaze)) + if (clocksource_register_hz(&clocksource_microblaze, timer_clock_freq)) panic("failed to register clocksource"); /* stop timer1 */ diff --git a/arch/mips/alchemy/common/time.c b/arch/mips/alchemy/common/time.c index 2aecb2fdf982..d5da6adbf634 100644 --- a/arch/mips/alchemy/common/time.c +++ b/arch/mips/alchemy/common/time.c @@ -141,8 +141,7 @@ static int __init alchemy_time_init(unsigned int m2int) goto cntr_err; /* register counter1 clocksource and event device */ - clocksource_set_clock(&au1x_counter1_clocksource, 32768); - clocksource_register(&au1x_counter1_clocksource); + clocksource_register_hz(&au1x_counter1_clocksource, 32768); cd->shift = 32; cd->mult = div_sc(32768, NSEC_PER_SEC, cd->shift); diff --git a/arch/mips/cavium-octeon/csrc-octeon.c b/arch/mips/cavium-octeon/csrc-octeon.c index 26bf71130bf8..29d56afbb02d 100644 --- a/arch/mips/cavium-octeon/csrc-octeon.c +++ b/arch/mips/cavium-octeon/csrc-octeon.c @@ -105,8 +105,7 @@ unsigned long long notrace sched_clock(void) void __init plat_time_init(void) { clocksource_mips.rating = 300; - clocksource_set_clock(&clocksource_mips, octeon_get_clock_rate()); - clocksource_register(&clocksource_mips); + clocksource_register_hz(&clocksource_mips, octeon_get_clock_rate()); } static u64 octeon_udelay_factor; diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c index ba78b21cc8d0..76923eeb58b9 100644 --- a/arch/mips/cavium-octeon/smp.c +++ b/arch/mips/cavium-octeon/smp.c @@ -44,6 +44,8 @@ static irqreturn_t mailbox_interrupt(int irq, void *dev_id) if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); /* Check if we've been told to flush the icache */ if (action & SMP_ICACHE_FLUSH) diff --git a/arch/mips/include/asm/time.h b/arch/mips/include/asm/time.h index c7f1bfef1574..bc14447e69b5 100644 --- a/arch/mips/include/asm/time.h +++ b/arch/mips/include/asm/time.h @@ -84,12 +84,6 @@ static inline int init_mips_clocksource(void) #endif } -static inline void clocksource_set_clock(struct clocksource *cs, - unsigned int clock) -{ - clocksource_calc_mult_shift(cs, clock, 4); -} - static inline void clockevent_set_clock(struct clock_event_device *cd, unsigned int clock) { diff --git a/arch/mips/jz4740/time.c b/arch/mips/jz4740/time.c index fe01678d94fd..03dfd4e0da4e 100644 --- a/arch/mips/jz4740/time.c +++ b/arch/mips/jz4740/time.c @@ -121,8 +121,7 @@ void __init plat_time_init(void) clockevents_register_device(&jz4740_clockevent); - clocksource_set_clock(&jz4740_clocksource, clk_rate); - ret = clocksource_register(&jz4740_clocksource); + ret = clocksource_register_hz(&jz4740_clocksource, clk_rate); if (ret) printk(KERN_ERR "Failed to register clocksource: %d\n", ret); diff --git a/arch/mips/kernel/cevt-txx9.c b/arch/mips/kernel/cevt-txx9.c index 0b7377361e22..f0ab92a1b057 100644 --- a/arch/mips/kernel/cevt-txx9.c +++ b/arch/mips/kernel/cevt-txx9.c @@ -51,8 +51,7 @@ void __init txx9_clocksource_init(unsigned long baseaddr, { struct txx9_tmr_reg __iomem *tmrptr; - clocksource_set_clock(&txx9_clocksource.cs, TIMER_CLK(imbusclk)); - clocksource_register(&txx9_clocksource.cs); + clocksource_register_hz(&txx9_clocksource.cs, TIMER_CLK(imbusclk)); tmrptr = ioremap(baseaddr, sizeof(struct txx9_tmr_reg)); __raw_writel(TCR_BASE, &tmrptr->tcr); diff --git a/arch/mips/kernel/csrc-bcm1480.c b/arch/mips/kernel/csrc-bcm1480.c index 51489f8a825e..f96f99c794a3 100644 --- a/arch/mips/kernel/csrc-bcm1480.c +++ b/arch/mips/kernel/csrc-bcm1480.c @@ -49,6 +49,5 @@ void __init sb1480_clocksource_init(void) plldiv = G_BCM1480_SYS_PLL_DIV(__raw_readq(IOADDR(A_SCD_SYSTEM_CFG))); zbbus = ((plldiv >> 1) * 50000000) + ((plldiv & 1) * 25000000); - clocksource_set_clock(cs, zbbus); - clocksource_register(cs); + clocksource_register_hz(cs, zbbus); } diff --git a/arch/mips/kernel/csrc-ioasic.c b/arch/mips/kernel/csrc-ioasic.c index 23da108506b0..46bd7fa98d6c 100644 --- a/arch/mips/kernel/csrc-ioasic.c +++ b/arch/mips/kernel/csrc-ioasic.c @@ -59,7 +59,5 @@ void __init dec_ioasic_clocksource_init(void) printk(KERN_INFO "I/O ASIC clock frequency %dHz\n", freq); clocksource_dec.rating = 200 + freq / 10000000; - clocksource_set_clock(&clocksource_dec, freq); - - clocksource_register(&clocksource_dec); + clocksource_register_hz(&clocksource_dec, freq); } diff --git a/arch/mips/kernel/csrc-powertv.c b/arch/mips/kernel/csrc-powertv.c index a27c16c8690e..2e7c5232da8d 100644 --- a/arch/mips/kernel/csrc-powertv.c +++ b/arch/mips/kernel/csrc-powertv.c @@ -78,9 +78,7 @@ static void __init powertv_c0_hpt_clocksource_init(void) clocksource_mips.rating = 200 + mips_hpt_frequency / 10000000; - clocksource_set_clock(&clocksource_mips, mips_hpt_frequency); - - clocksource_register(&clocksource_mips); + clocksource_register_hz(&clocksource_mips, mips_hpt_frequency); } /** @@ -130,43 +128,16 @@ static struct clocksource clocksource_tim_c = { /** * powertv_tim_c_clocksource_init - set up a clock source for the TIM_C clock * - * The hard part here is coming up with a constant k and shift s such that - * the 48-bit TIM_C value multiplied by k doesn't overflow and that value, - * when shifted right by s, yields the corresponding number of nanoseconds. * We know that TIM_C counts at 27 MHz/8, so each cycle corresponds to - * 1 / (27,000,000/8) seconds. Multiply that by a billion and you get the - * number of nanoseconds. Since the TIM_C value has 48 bits and the math is - * done in 64 bits, avoiding an overflow means that k must be less than - * 64 - 48 = 16 bits. + * 1 / (27,000,000/8) seconds. */ static void __init powertv_tim_c_clocksource_init(void) { - int prescale; - unsigned long dividend; - unsigned long k; - int s; - const int max_k_bits = (64 - 48) - 1; - const unsigned long billion = 1000000000; const unsigned long counts_per_second = 27000000 / 8; - prescale = BITS_PER_LONG - ilog2(billion) - 1; - dividend = billion << prescale; - k = dividend / counts_per_second; - s = ilog2(k) - max_k_bits; - - if (s < 0) - s = prescale; - - else { - k >>= s; - s += prescale; - } - - clocksource_tim_c.mult = k; - clocksource_tim_c.shift = s; clocksource_tim_c.rating = 200; - clocksource_register(&clocksource_tim_c); + clocksource_register_hz(&clocksource_tim_c, counts_per_second); tim_c = (struct tim_c *) asic_reg_addr(tim_ch); } diff --git a/arch/mips/kernel/csrc-r4k.c b/arch/mips/kernel/csrc-r4k.c index e95a3cd48eea..decd1fa38d55 100644 --- a/arch/mips/kernel/csrc-r4k.c +++ b/arch/mips/kernel/csrc-r4k.c @@ -30,9 +30,7 @@ int __init init_r4k_clocksource(void) /* Calculate a somewhat reasonable rating value */ clocksource_mips.rating = 200 + mips_hpt_frequency / 10000000; - clocksource_set_clock(&clocksource_mips, mips_hpt_frequency); - - clocksource_register(&clocksource_mips); + clocksource_register_hz(&clocksource_mips, mips_hpt_frequency); return 0; } diff --git a/arch/mips/kernel/csrc-sb1250.c b/arch/mips/kernel/csrc-sb1250.c index d14d3d1907fa..e9606d907685 100644 --- a/arch/mips/kernel/csrc-sb1250.c +++ b/arch/mips/kernel/csrc-sb1250.c @@ -65,6 +65,5 @@ void __init sb1250_clocksource_init(void) IOADDR(A_SCD_TIMER_REGISTER(SB1250_HPT_NUM, R_SCD_TIMER_CFG))); - clocksource_set_clock(cs, V_SCD_TIMER_FREQ); - clocksource_register(cs); + clocksource_register_hz(cs, V_SCD_TIMER_FREQ); } diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c index 2392a7a296d4..9fadd17888d9 100644 --- a/arch/mips/kernel/i8253.c +++ b/arch/mips/kernel/i8253.c @@ -196,8 +196,6 @@ static struct clocksource clocksource_pit = { .rating = 110, .read = pit_read, .mask = CLOCKSOURCE_MASK(32), - .mult = 0, - .shift = 20, }; static int __init init_pit_clocksource(void) @@ -205,7 +203,6 @@ static int __init init_pit_clocksource(void) if (num_possible_cpus() > 1) /* PIT does not scale! */ return 0; - clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); - return clocksource_register(&clocksource_pit); + return clocksource_register_hz(&clocksource_pit, CLOCK_TICK_RATE); } arch_initcall(init_pit_clocksource); diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c index 5a88cc4ccd5a..cedac4633741 100644 --- a/arch/mips/kernel/smtc.c +++ b/arch/mips/kernel/smtc.c @@ -929,7 +929,7 @@ static void post_direct_ipi(int cpu, struct smtc_ipi *pipi) static void ipi_resched_interrupt(void) { - /* Return from interrupt should be enough to cause scheduler check */ + scheduler_ipi(); } static void ipi_call_interrupt(void) diff --git a/arch/mips/loongson/common/cs5536/cs5536_mfgpt.c b/arch/mips/loongson/common/cs5536/cs5536_mfgpt.c index 8c807c965199..0cb1b9760e34 100644 --- a/arch/mips/loongson/common/cs5536/cs5536_mfgpt.c +++ b/arch/mips/loongson/common/cs5536/cs5536_mfgpt.c @@ -201,8 +201,6 @@ static struct clocksource clocksource_mfgpt = { .rating = 120, /* Functional for real use, but not desired */ .read = mfgpt_read, .mask = CLOCKSOURCE_MASK(32), - .mult = 0, - .shift = 22, }; int __init init_mfgpt_clocksource(void) @@ -210,8 +208,7 @@ int __init init_mfgpt_clocksource(void) if (num_possible_cpus() > 1) /* MFGPT does not scale! */ return 0; - clocksource_mfgpt.mult = clocksource_hz2mult(MFGPT_TICK_RATE, 22); - return clocksource_register(&clocksource_mfgpt); + return clocksource_register_hz(&clocksource_mfgpt, MFGPT_TICK_RATE); } arch_initcall(init_mfgpt_clocksource); diff --git a/arch/mips/mti-malta/malta-int.c b/arch/mips/mti-malta/malta-int.c index 9027061f0ead..7d93e6fbfa5a 100644 --- a/arch/mips/mti-malta/malta-int.c +++ b/arch/mips/mti-malta/malta-int.c @@ -309,6 +309,8 @@ static void ipi_call_dispatch(void) static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) { + scheduler_ipi(); + return IRQ_HANDLED; } diff --git a/arch/mips/pmc-sierra/yosemite/smp.c b/arch/mips/pmc-sierra/yosemite/smp.c index efc9e889b349..2608752898c0 100644 --- a/arch/mips/pmc-sierra/yosemite/smp.c +++ b/arch/mips/pmc-sierra/yosemite/smp.c @@ -55,6 +55,8 @@ void titan_mailbox_irq(void) if (status & 0x2) smp_call_function_interrupt(); + if (status & 0x4) + scheduler_ipi(); break; case 1: @@ -63,6 +65,8 @@ void titan_mailbox_irq(void) if (status & 0x2) smp_call_function_interrupt(); + if (status & 0x4) + scheduler_ipi(); break; } } diff --git a/arch/mips/sgi-ip27/ip27-irq.c b/arch/mips/sgi-ip27/ip27-irq.c index 0a04603d577c..b18b04e48577 100644 --- a/arch/mips/sgi-ip27/ip27-irq.c +++ b/arch/mips/sgi-ip27/ip27-irq.c @@ -147,8 +147,10 @@ static void ip27_do_irq_mask0(void) #ifdef CONFIG_SMP if (pend0 & (1UL << CPU_RESCHED_A_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_RESCHED_A_IRQ); + scheduler_ipi(); } else if (pend0 & (1UL << CPU_RESCHED_B_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_RESCHED_B_IRQ); + scheduler_ipi(); } else if (pend0 & (1UL << CPU_CALL_A_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_CALL_A_IRQ); smp_call_function_interrupt(); diff --git a/arch/mips/sgi-ip27/ip27-timer.c b/arch/mips/sgi-ip27/ip27-timer.c index 3f810c9cbf83..ef74f3267f91 100644 --- a/arch/mips/sgi-ip27/ip27-timer.c +++ b/arch/mips/sgi-ip27/ip27-timer.c @@ -163,8 +163,7 @@ static void __init hub_rt_clocksource_init(void) { struct clocksource *cs = &hub_rt_clocksource; - clocksource_set_clock(cs, CYCLES_PER_SEC); - clocksource_register(cs); + clocksource_register_hz(cs, CYCLES_PER_SEC); } void __init plat_time_init(void) diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c index 47b347c992ea..d667875be564 100644 --- a/arch/mips/sibyte/bcm1480/smp.c +++ b/arch/mips/sibyte/bcm1480/smp.c @@ -20,6 +20,7 @@ #include <linux/delay.h> #include <linux/smp.h> #include <linux/kernel_stat.h> +#include <linux/sched.h> #include <asm/mmu_context.h> #include <asm/io.h> @@ -189,10 +190,8 @@ void bcm1480_mailbox_interrupt(void) /* Clear the mailbox to clear the interrupt */ __raw_writeq(((u64)action)<<48, mailbox_0_clear_regs[cpu]); - /* - * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the - * interrupt will do the reschedule for us - */ + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c index c00a5cb1128d..38e7f6bd7922 100644 --- a/arch/mips/sibyte/sb1250/smp.c +++ b/arch/mips/sibyte/sb1250/smp.c @@ -21,6 +21,7 @@ #include <linux/interrupt.h> #include <linux/smp.h> #include <linux/kernel_stat.h> +#include <linux/sched.h> #include <asm/mmu_context.h> #include <asm/io.h> @@ -177,10 +178,8 @@ void sb1250_mailbox_interrupt(void) /* Clear the mailbox to clear the interrupt */ ____raw_writeq(((u64)action) << 48, mailbox_clear_regs[cpu]); - /* - * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the - * interrupt will do the reschedule for us - */ + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c index 226c826a2194..83fb27912231 100644 --- a/arch/mn10300/kernel/smp.c +++ b/arch/mn10300/kernel/smp.c @@ -494,14 +494,11 @@ void smp_send_stop(void) * @irq: The interrupt number. * @dev_id: The device ID. * - * We need do nothing here, since the scheduling will be effected on our way - * back through entry.S. - * * Returns IRQ_HANDLED to indicate we handled the interrupt successfully. */ static irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id) { - /* do nothing */ + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 69d63d354ef0..828305f19cff 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -155,10 +155,7 @@ ipi_interrupt(int irq, void *dev_id) case IPI_RESCHEDULE: smp_debug(100, KERN_DEBUG "CPU%d IPI_RESCHEDULE\n", this_cpu); - /* - * Reschedule callback. Everything to be - * done is done by the interrupt return path. - */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index cbdbb14be4b0..9f9c204bef69 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -116,7 +116,7 @@ void smp_message_recv(int msg) generic_smp_call_function_interrupt(); break; case PPC_MSG_RESCHEDULE: - /* we notice need_resched on exit */ + scheduler_ipi(); break; case PPC_MSG_CALL_FUNC_SINGLE: generic_smp_call_function_single_interrupt(); @@ -146,7 +146,7 @@ static irqreturn_t call_function_action(int irq, void *data) static irqreturn_t reschedule_action(int irq, void *data) { - /* we just need the return path side effect of checking need_resched */ + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 63a97db83f96..63c7d9ff220d 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -165,12 +165,12 @@ static void do_ext_call_interrupt(unsigned int ext_int_code, kstat_cpu(smp_processor_id()).irqs[EXTINT_IPI]++; /* * handle bit signal external calls - * - * For the ec_schedule signal we have to do nothing. All the work - * is done automatically when we return from the interrupt. */ bits = xchg(&S390_lowcore.ext_call_fast, 0); + if (test_bit(ec_schedule, &bits)) + scheduler_ipi(); + if (test_bit(ec_call_function, &bits)) generic_smp_call_function_interrupt(); diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 509b36b45115..6207561ea34a 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -20,6 +20,7 @@ #include <linux/module.h> #include <linux/cpu.h> #include <linux/interrupt.h> +#include <linux/sched.h> #include <asm/atomic.h> #include <asm/processor.h> #include <asm/system.h> @@ -323,6 +324,7 @@ void smp_message_recv(unsigned int msg) generic_smp_call_function_interrupt(); break; case SMP_MSG_RESCHEDULE: + scheduler_ipi(); break; case SMP_MSG_FUNCTION_SINGLE: generic_smp_call_function_single_interrupt(); diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index 1c79f32734a0..8b9c556d630b 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -65,6 +65,10 @@ static inline int pcibus_to_node(struct pci_bus *pbus) #define smt_capable() (sparc64_multi_core) #endif /* CONFIG_SMP */ -#define cpu_coregroup_mask(cpu) (&cpu_core_map[cpu]) +extern cpumask_t cpu_core_map[NR_CPUS]; +static inline const struct cpumask *cpu_coregroup_mask(int cpu) +{ + return &cpu_core_map[cpu]; +} #endif /* _ASM_SPARC64_TOPOLOGY_H */ diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index 91c10fb70858..f95690c167b6 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -125,7 +125,9 @@ struct linux_prom_registers smp_penguin_ctable __cpuinitdata = { 0 }; void smp_send_reschedule(int cpu) { - /* See sparc64 */ + /* + * XXX missing reschedule IPI, see scheduler_ipi() + */ } void smp_send_stop(void) diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 3e94a8c23238..9478da7fdb3e 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1368,6 +1368,7 @@ void smp_send_reschedule(int cpu) void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs) { clear_softint(1 << irq); + scheduler_ipi(); } /* This is a nop because we capture all other cpus diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c index a4293102ef81..c52224d5ed45 100644 --- a/arch/tile/kernel/smp.c +++ b/arch/tile/kernel/smp.c @@ -189,12 +189,8 @@ void flush_icache_range(unsigned long start, unsigned long end) /* Called when smp_send_reschedule() triggers IRQ_RESCHEDULE. */ static irqreturn_t handle_reschedule_ipi(int irq, void *token) { - /* - * Nothing to do here; when we return from interrupt, the - * rescheduling will occur there. But do bump the interrupt - * profiler count in the meantime. - */ __get_cpu_var(irq_stat).irq_resched_count++; + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c index 106bf27e2a9a..eefb107d2d73 100644 --- a/arch/um/kernel/smp.c +++ b/arch/um/kernel/smp.c @@ -173,7 +173,7 @@ void IPI_handler(int cpu) break; case 'R': - set_tsk_need_resched(current); + scheduler_ipi(); break; case 'S': diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc6c53a95bfd..0945ed8057f0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -365,17 +365,6 @@ config X86_UV # Following is an alphabetically sorted list of 32 bit extended platforms # Please maintain the alphabetic order if and when there are additions -config X86_ELAN - bool "AMD Elan" - depends on X86_32 - depends on X86_EXTENDED_PLATFORM - ---help--- - Select this for an AMD Elan processor. - - Do not use this option for K6/Athlon/Opteron processors! - - If unsure, choose "PC-compatible" instead. - config X86_INTEL_CE bool "CE4100 TV platform" depends on PCI @@ -1223,6 +1212,10 @@ config HAVE_ARCH_BOOTMEM def_bool y depends on X86_32 && NUMA +config HAVE_ARCH_ALLOC_REMAP + def_bool y + depends on X86_32 && NUMA + config ARCH_HAVE_MEMORY_PRESENT def_bool y depends on X86_32 && DISCONTIGMEM @@ -1231,13 +1224,9 @@ config NEED_NODE_MEMMAP_SIZE def_bool y depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) -config HAVE_ARCH_ALLOC_REMAP - def_bool y - depends on X86_32 && NUMA - config ARCH_FLATMEM_ENABLE def_bool y - depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA + depends on X86_32 && !NUMA config ARCH_DISCONTIGMEM_ENABLE def_bool y @@ -1247,20 +1236,16 @@ config ARCH_DISCONTIGMEM_DEFAULT def_bool y depends on NUMA && X86_32 -config ARCH_PROC_KCORE_TEXT - def_bool y - depends on X86_64 && PROC_KCORE - -config ARCH_SPARSEMEM_DEFAULT - def_bool y - depends on X86_64 - config ARCH_SPARSEMEM_ENABLE def_bool y depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 +config ARCH_SPARSEMEM_DEFAULT + def_bool y + depends on X86_64 + config ARCH_SELECT_MEMORY_MODEL def_bool y depends on ARCH_SPARSEMEM_ENABLE @@ -1269,6 +1254,10 @@ config ARCH_MEMORY_PROBE def_bool X86_64 depends on MEMORY_HOTPLUG +config ARCH_PROC_KCORE_TEXT + def_bool y + depends on X86_64 && PROC_KCORE + config ILLEGAL_POINTER_VALUE hex default 0 if X86_32 @@ -1703,10 +1692,6 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE def_bool y depends on MEMORY_HOTPLUG -config HAVE_ARCH_EARLY_PFN_TO_NID - def_bool X86_64 - depends on NUMA - config USE_PERCPU_NUMA_NODE_ID def_bool y depends on NUMA @@ -2076,7 +2061,7 @@ config OLPC depends on !X86_PAE select GPIOLIB select OF - select OF_PROMTREE if PROC_DEVICETREE + select OF_PROMTREE ---help--- Add support for detecting the unique features of the OLPC XO hardware. diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index d161e939df62..6a7cfdf8ff69 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -1,6 +1,4 @@ # Put here option for CPU selection and depending optimization -if !X86_ELAN - choice prompt "Processor family" default M686 if X86_32 @@ -203,6 +201,14 @@ config MWINCHIP3D stores for this CPU, which can increase performance of some operations. +config MELAN + bool "AMD Elan" + depends on X86_32 + ---help--- + Select this for an AMD Elan processor. + + Do not use this option for K6/Athlon/Opteron processors! + config MGEODEGX1 bool "GeodeGX1" depends on X86_32 @@ -292,8 +298,6 @@ config X86_GENERIC This is really intended for distributors who need more generic optimizations. -endif - # # Define implied options from the CPU selection here config X86_INTERNODE_CACHE_SHIFT @@ -312,7 +316,7 @@ config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU - default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 + default "4" if MELAN || M486 || M386 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX config X86_XADD @@ -358,7 +362,7 @@ config X86_POPAD_OK config X86_ALIGNMENT_16 def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 config X86_INTEL_USERCOPY def_bool y diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index f2ee1abb1df9..86cee7b749e1 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -37,7 +37,7 @@ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march= $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) # AMD Elan support -cflags-$(CONFIG_X86_ELAN) += -march=i486 +cflags-$(CONFIG_MELAN) += -march=i486 # Geode GX1 support cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 91f3e087cf21..50c0d30e676d 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -207,8 +207,7 @@ extern const char * const x86_power_flags[32]; #define test_cpu_cap(c, bit) \ test_bit(bit, (unsigned long *)((c)->x86_capability)) -#define cpu_has(c, bit) \ - (__builtin_constant_p(bit) && \ +#define REQUIRED_MASK_BIT_SET(bit) \ ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \ (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \ (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \ @@ -218,10 +217,16 @@ extern const char * const x86_power_flags[32]; (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \ (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \ - (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \ - ? 1 : \ + (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) + +#define cpu_has(c, bit) \ + (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ test_cpu_cap(c, bit)) +#define this_cpu_has(bit) \ + (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ + x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability)) + #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h index 72a8b52e7dfd..7775debde112 100644 --- a/arch/x86/include/asm/mach_traps.h +++ b/arch/x86/include/asm/mach_traps.h @@ -8,6 +8,7 @@ #include <asm/mc146818rtc.h> #define NMI_REASON_PORT 0x61 +#define NMI_ENABLE_PORT 0x70 /* Real-Time Clock Address Register as well */ #define NMI_REASON_SERR 0x80 #define NMI_REASON_IOCHK 0x40 @@ -30,12 +31,19 @@ static inline void reassert_nmi(void) old_reg = current_lock_cmos_reg(); else lock_cmos(0); /* register doesn't matter here */ - outb(0x8f, 0x70); - inb(0x71); /* dummy */ - outb(0x0f, 0x70); - inb(0x71); /* dummy */ + + /* + * This will cause the NMI output to transition low + * then high if there are any pending NMI sources. The + * CPU's NMI input logic will then register a new NMI. + */ + outb(0x8f, NMI_ENABLE_PORT); + inb(0x71); /* dummy */ + outb(0x0f, NMI_ENABLE_PORT); + inb(0x71); /* dummy */ + if (old_reg >= 0) - outb(old_reg, 0x70); + outb(old_reg, NMI_ENABLE_PORT); else unlock_cmos(); } diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h index 288b96f815a6..b3f88d7867c7 100644 --- a/arch/x86/include/asm/mmzone_64.h +++ b/arch/x86/include/asm/mmzone_64.h @@ -4,36 +4,13 @@ #ifndef _ASM_X86_MMZONE_64_H #define _ASM_X86_MMZONE_64_H - #ifdef CONFIG_NUMA #include <linux/mmdebug.h> - #include <asm/smp.h> -/* Simple perfect hash to map physical addresses to node numbers */ -struct memnode { - int shift; - unsigned int mapsize; - s16 *map; - s16 embedded_map[64 - 8]; -} ____cacheline_aligned; /* total size = 128 bytes */ -extern struct memnode memnode; -#define memnode_shift memnode.shift -#define memnodemap memnode.map -#define memnodemapsize memnode.mapsize - extern struct pglist_data *node_data[]; -static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) -{ - unsigned nid; - VIRTUAL_BUG_ON(!memnodemap); - nid = memnodemap[addr >> memnode_shift]; - VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); - return nid; -} - #define NODE_DATA(nid) (node_data[nid]) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 67763c5d8b4e..9eae7752ae9b 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -35,7 +35,7 @@ #define MODULE_PROC_FAMILY "K7 " #elif defined CONFIG_MK8 #define MODULE_PROC_FAMILY "K8 " -#elif defined CONFIG_X86_ELAN +#elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE #define MODULE_PROC_FAMILY "CRUSOE " diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h index c5d3a5abbb9f..24487712e0b1 100644 --- a/arch/x86/include/asm/olpc_ofw.h +++ b/arch/x86/include/asm/olpc_ofw.h @@ -26,15 +26,12 @@ extern void setup_olpc_ofw_pgd(void); /* check if OFW was detected during boot */ extern bool olpc_ofw_present(void); +extern void olpc_dt_build_devicetree(void); + #else /* !CONFIG_OLPC */ static inline void olpc_ofw_detect(void) { } static inline void setup_olpc_ofw_pgd(void) { } -#endif /* !CONFIG_OLPC */ - -#ifdef CONFIG_OF_PROMTREE -extern void olpc_dt_build_devicetree(void); -#else static inline void olpc_dt_build_devicetree(void) { } -#endif +#endif /* !CONFIG_OLPC */ #endif /* _ASM_X86_OLPC_OFW_H */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index d475b4398d8b..76042d981596 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -542,6 +542,33 @@ do { \ old__; \ }) +static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr, + const unsigned long __percpu *addr) +{ + unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG; + + return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0; +} + +static inline int x86_this_cpu_variable_test_bit(int nr, + const unsigned long __percpu *addr) +{ + int oldbit; + + asm volatile("bt "__percpu_arg(2)",%1\n\t" + "sbb %0,%0" + : "=r" (oldbit) + : "m" (*(unsigned long *)addr), "Ir" (nr)); + + return oldbit; +} + +#define x86_this_cpu_test_bit(nr, addr) \ + (__builtin_constant_p((nr)) \ + ? x86_this_cpu_constant_test_bit((nr), (addr)) \ + : x86_this_cpu_variable_test_bit((nr), (addr))) + + #include <asm-generic/percpu.h> /* We can use this directly for local CPU (faster). */ diff --git a/arch/x86/include/asm/probe_roms.h b/arch/x86/include/asm/probe_roms.h new file mode 100644 index 000000000000..4950a0b1d09c --- /dev/null +++ b/arch/x86/include/asm/probe_roms.h @@ -0,0 +1,8 @@ +#ifndef _PROBE_ROMS_H_ +#define _PROBE_ROMS_H_ +struct pci_dev; + +extern void __iomem *pci_map_biosrom(struct pci_dev *pdev); +extern void pci_unmap_biosrom(void __iomem *rom); +extern size_t pci_biosrom_size(struct pci_dev *pdev); +#endif diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index db8aa19a08a2..03d3a32ace20 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -104,10 +104,10 @@ void *extend_brk(size_t size, size_t align); type *name; \ RESERVE_BRK(name, sizeof(type) * entries) +extern void probe_roms(void); #ifdef __i386__ void __init i386_start_kernel(void); -extern void probe_roms(void); #else void __init x86_64_start_kernel(char *real_mode); diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 910a7084f7f2..8dba76972fd7 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -95,7 +95,6 @@ extern void setup_node_to_cpumask_map(void); #ifdef CONFIG_X86_32 extern unsigned long node_start_pfn[]; extern unsigned long node_end_pfn[]; -extern unsigned long node_remap_size[]; #define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid]) # define SD_CACHE_NICE_TRIES 1 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 7338ef2218bc..758b379be4c8 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -36,7 +36,7 @@ obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o -obj-$(CONFIG_X86_32) += probe_roms_32.o +obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index cd1ffed4ee22..289e92862fd9 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -177,7 +177,6 @@ static struct clocksource clocksource_apbt = { .rating = APBT_CLOCKSOURCE_RATING, .read = apbt_read_clocksource, .mask = APBT_MASK, - .shift = APBT_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = apbt_restart_clocksource, }; @@ -543,14 +542,7 @@ static int apbt_clocksource_register(void) if (t1 == apbt_read_clocksource(&clocksource_apbt)) panic("APBT counter not counting. APBT disabled\n"); - /* - * initialize and register APBT clocksource - * convert that to ns/clock cycle - * mult = (ns/c) * 2^APBT_SHIFT - */ - clocksource_apbt.mult = div_sc(MSEC_PER_SEC, - (unsigned long) apbt_freq, APBT_SHIFT); - clocksource_register(&clocksource_apbt); + clocksource_register_khz(&clocksource_apbt, (u32)apbt_freq*1000); return 0; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index fabf01eff771..2bc503bf9e99 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -505,7 +505,7 @@ static void __cpuinit setup_APIC_timer(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); - if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) { + if (this_cpu_has(X86_FEATURE_ARAT)) { lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; /* Make LAPIC timer preferrable over percpu HPET */ lapic_clockevent.rating = 150; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 6273eee5134b..0aced70815f0 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -93,10 +93,6 @@ static inline void numaq_register_node(int node, struct sys_cfg_data *scd) node_end_pfn[node]); memory_present(node, node_start_pfn[node], node_end_pfn[node]); - - node_remap_size[node] = node_memmap_size_bytes(node, - node_start_pfn[node], - node_end_pfn[node]); } /* diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index 870e6cc6ad28..0ab9b22557c4 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -43,7 +43,7 @@ config X86_ACPI_CPUFREQ config ELAN_CPUFREQ tristate "AMD Elan SC400 and SC410" select CPU_FREQ_TABLE - depends on X86_ELAN + depends on MELAN ---help--- This adds the CPUFreq driver for AMD Elan SC400 and SC410 processors. @@ -59,7 +59,7 @@ config ELAN_CPUFREQ config SC520_CPUFREQ tristate "AMD Elan SC520" select CPU_FREQ_TABLE - depends on X86_ELAN + depends on MELAN ---help--- This adds the CPUFreq driver for AMD Elan SC520 processor. diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 6f8c5e9da97f..6b0f4cde7a22 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -355,7 +355,6 @@ static void notify_thresholds(__u64 msr_val) static void intel_thermal_interrupt(void) { __u64 msr_val; - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); @@ -367,19 +366,19 @@ static void intel_thermal_interrupt(void) CORE_LEVEL) != 0) mce_log_therm_throt_event(CORE_THROTTLED | msr_val); - if (cpu_has(c, X86_FEATURE_PLN)) + if (this_cpu_has(X86_FEATURE_PLN)) if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, CORE_LEVEL) != 0) mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); - if (cpu_has(c, X86_FEATURE_PTS)) { + if (this_cpu_has(X86_FEATURE_PTS)) { rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, PACKAGE_LEVEL) != 0) mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); - if (cpu_has(c, X86_FEATURE_PLN)) + if (this_cpu_has(X86_FEATURE_PLN)) if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index c2520e178d32..8ff882fdb1c0 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -468,7 +468,7 @@ static struct p4_event_bind p4_event_bind_map[] = { .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_X87_ASSIST] = { diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index d6d6bb361931..3bb08509a7a1 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -23,7 +23,6 @@ static void __init i386_default_early_setup(void) { /* Initialize 32bit specific setup functions */ - x86_init.resources.probe_roms = probe_roms; x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 2dfd31597443..212fe6590aab 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -188,8 +188,6 @@ static struct clocksource pit_cs = { .rating = 110, .read = pit_read, .mask = CLOCKSOURCE_MASK(32), - .mult = 0, - .shift = 20, }; static int __init init_pit_clocksource(void) @@ -205,9 +203,7 @@ static int __init init_pit_clocksource(void) pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) return 0; - pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift); - - return clocksource_register(&pit_cs); + return clocksource_register_hz(&pit_cs, CLOCK_TICK_RATE); } arch_initcall(init_pit_clocksource); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index f98d3eafe07a..6389a6bca11b 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -26,8 +26,6 @@ #include <asm/x86_init.h> #include <asm/reboot.h> -#define KVM_SCALE 22 - static int kvmclock = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; @@ -120,8 +118,6 @@ static struct clocksource kvm_clock = { .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), - .mult = 1 << KVM_SCALE, - .shift = KVM_SCALE, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -203,7 +199,7 @@ void __init kvmclock_init(void) machine_ops.crash_shutdown = kvm_crash_shutdown; #endif kvm_get_preset_lpj(); - clocksource_register(&kvm_clock); + clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.paravirt_enabled = 1; pv_info.name = "KVM"; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 5a532ce646bf..6f9bfffb2720 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -715,17 +715,15 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) } } -static int +static int __init check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) { - int ret = 0; - if (!mpc_new_phys || count <= mpc_new_length) { WARN(1, "update_mptable: No spare slots (length: %x)\n", count); return -1; } - return ret; + return 0; } #else /* CONFIG_X86_IO_APIC */ static diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms.c index 071e7fea42e5..ba0a4cce53be 100644 --- a/arch/x86/kernel/probe_roms_32.c +++ b/arch/x86/kernel/probe_roms.c @@ -73,6 +73,107 @@ static struct resource video_rom_resource = { .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM }; +/* does this oprom support the given pci device, or any of the devices + * that the driver supports? + */ +static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device) +{ + struct pci_driver *drv = pdev->driver; + const struct pci_device_id *id; + + if (pdev->vendor == vendor && pdev->device == device) + return true; + + for (id = drv ? drv->id_table : NULL; id && id->vendor; id++) + if (id->vendor == vendor && id->device == device) + break; + + return id && id->vendor; +} + +static bool probe_list(struct pci_dev *pdev, unsigned short vendor, + const unsigned char *rom_list) +{ + unsigned short device; + + do { + if (probe_kernel_address(rom_list, device) != 0) + device = 0; + + if (device && match_id(pdev, vendor, device)) + break; + + rom_list += 2; + } while (device); + + return !!device; +} + +static struct resource *find_oprom(struct pci_dev *pdev) +{ + struct resource *oprom = NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) { + struct resource *res = &adapter_rom_resources[i]; + unsigned short offset, vendor, device, list, rev; + const unsigned char *rom; + + if (res->end == 0) + break; + + rom = isa_bus_to_virt(res->start); + if (probe_kernel_address(rom + 0x18, offset) != 0) + continue; + + if (probe_kernel_address(rom + offset + 0x4, vendor) != 0) + continue; + + if (probe_kernel_address(rom + offset + 0x6, device) != 0) + continue; + + if (match_id(pdev, vendor, device)) { + oprom = res; + break; + } + + if (probe_kernel_address(rom + offset + 0x8, list) == 0 && + probe_kernel_address(rom + offset + 0xc, rev) == 0 && + rev >= 3 && list && + probe_list(pdev, vendor, rom + offset + list)) { + oprom = res; + break; + } + } + + return oprom; +} + +void *pci_map_biosrom(struct pci_dev *pdev) +{ + struct resource *oprom = find_oprom(pdev); + + if (!oprom) + return NULL; + + return ioremap(oprom->start, resource_size(oprom)); +} +EXPORT_SYMBOL(pci_map_biosrom); + +void pci_unmap_biosrom(void __iomem *image) +{ + iounmap(image); +} +EXPORT_SYMBOL(pci_unmap_biosrom); + +size_t pci_biosrom_size(struct pci_dev *pdev) +{ + struct resource *oprom = find_oprom(pdev); + + return oprom ? resource_size(oprom) : 0; +} +EXPORT_SYMBOL(pci_biosrom_size); + #define ROMSIGNATURE 0xaa55 static int __init romsignature(const unsigned char *rom) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index d46cbe46b7ab..88a90a977f8e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -449,7 +449,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { if (!need_resched()) { - if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); __monitor((void *)¤t_thread_info()->flags, 0, 0); @@ -465,7 +465,7 @@ static void mwait_idle(void) if (!need_resched()) { trace_power_start(POWER_CSTATE, 1, smp_processor_id()); trace_cpu_idle(1, smp_processor_id()); - if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); __monitor((void *)¤t_thread_info()->flags, 0, 0); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 08c44b08bf5b..0c016f727695 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -36,7 +36,7 @@ EXPORT_SYMBOL(pm_power_off); static const struct desc_ptr no_idt = {}; static int reboot_mode; -enum reboot_type reboot_type = BOOT_KBD; +enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) @@ -478,9 +478,24 @@ void __attribute__((weak)) mach_reboot_fixups(void) { } +/* + * Windows compatible x86 hardware expects the following on reboot: + * + * 1) If the FADT has the ACPI reboot register flag set, try it + * 2) If still alive, write to the keyboard controller + * 3) If still alive, write to the ACPI reboot register again + * 4) If still alive, write to the keyboard controller again + * + * If the machine is still alive at this stage, it gives up. We default to + * following the same pattern, except that if we're still alive after (4) we'll + * try to force a triple fault and then cycle between hitting the keyboard + * controller and doing that + */ static void native_machine_emergency_restart(void) { int i; + int attempt = 0; + int orig_reboot_type = reboot_type; if (reboot_emergency) emergency_vmx_disable_all(); @@ -502,6 +517,13 @@ static void native_machine_emergency_restart(void) outb(0xfe, 0x64); /* pulse reset low */ udelay(50); } + if (attempt == 0 && orig_reboot_type == BOOT_ACPI) { + attempt = 1; + reboot_type = BOOT_ACPI; + } else { + reboot_type = BOOT_TRIPLE; + } + break; case BOOT_TRIPLE: load_idt(&no_idt); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 513deac7228d..013e7eba83bb 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -194,14 +194,13 @@ static void native_stop_other_cpus(int wait) } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. */ void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + scheduler_ipi(); /* * KVM uses this interrupt to force a cpu out of guest mode */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8ed8908cc9f7..5a927576b568 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1355,9 +1355,9 @@ static inline void mwait_play_dead(void) void *mwait_ptr; struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info); - if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c))) + if (!this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)) return; - if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH)) + if (!this_cpu_has(X86_FEATURE_CLFLSH)) return; if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) return; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index c11514e9128b..6eee0828e327 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -35,7 +35,7 @@ void iommu_shutdown_noop(void) { } struct x86_init_ops x86_init __initdata = { .resources = { - .probe_roms = x86_init_noop, + .probe_roms = probe_roms, .reserve_resources = reserve_standard_io_resources, .memory_setup = default_machine_specific_memory_setup, }, diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 1cd608973ce5..4e0068ead6b4 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -913,8 +913,6 @@ static struct clocksource lguest_clock = { .rating = 200, .read = lguest_clock_read, .mask = CLOCKSOURCE_MASK(64), - .mult = 1 << 22, - .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -997,7 +995,7 @@ static void lguest_time_init(void) /* Set up the timer interrupt (0) to go to our simple timer routine */ irq_set_handler(0, lguest_time_irq); - clocksource_register(&lguest_clock); + clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); /* We can't set cpumask in the initializer: damn C limitations! Set it * here and register our timer device. */ diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index bde3906420df..c757c0a3b529 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -104,13 +104,9 @@ extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) -unsigned long node_remap_size[MAX_NUMNODES]; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -static unsigned long kva_start_pfn; -static unsigned long kva_pages; - int __cpuinit numa_cpu_node(int cpu) { return apic->x86_32_numa_cpu_node(cpu); @@ -129,7 +125,6 @@ int __init get_memcfg_numa_flat(void) node_end_pfn[0] = max_pfn; memblock_x86_register_active_regions(0, 0, max_pfn); memory_present(0, 0, max_pfn); - node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); /* Indicate there is one node available. */ nodes_clear(node_online_map); @@ -164,9 +159,8 @@ static void __init allocate_pgdat(int nid) { char buf[16]; - if (node_has_online_mem(nid) && node_remap_start_vaddr[nid]) - NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; - else { + NODE_DATA(nid) = alloc_remap(nid, ALIGN(sizeof(pg_data_t), PAGE_SIZE)); + if (!NODE_DATA(nid)) { unsigned long pgdat_phys; pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT, max_pfn_mapped<<PAGE_SHIFT, @@ -182,25 +176,38 @@ static void __init allocate_pgdat(int nid) } /* - * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel - * virtual address space (KVA) is reserved and portions of nodes are mapped - * using it. This is to allow node-local memory to be allocated for - * structures that would normally require ZONE_NORMAL. The memory is - * allocated with alloc_remap() and callers should be prepared to allocate - * from the bootmem allocator instead. + * Remap memory allocator */ static unsigned long node_remap_start_pfn[MAX_NUMNODES]; static void *node_remap_end_vaddr[MAX_NUMNODES]; static void *node_remap_alloc_vaddr[MAX_NUMNODES]; -static unsigned long node_remap_offset[MAX_NUMNODES]; +/** + * alloc_remap - Allocate remapped memory + * @nid: NUMA node to allocate memory from + * @size: The size of allocation + * + * Allocate @size bytes from the remap area of NUMA node @nid. The + * size of the remap area is predetermined by init_alloc_remap() and + * only the callers considered there should call this function. For + * more info, please read the comment on top of init_alloc_remap(). + * + * The caller must be ready to handle allocation failure from this + * function and fall back to regular memory allocator in such cases. + * + * CONTEXT: + * Single CPU early boot context. + * + * RETURNS: + * Pointer to the allocated memory on success, %NULL on failure. + */ void *alloc_remap(int nid, unsigned long size) { void *allocation = node_remap_alloc_vaddr[nid]; size = ALIGN(size, L1_CACHE_BYTES); - if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) + if (!allocation || (allocation + size) > node_remap_end_vaddr[nid]) return NULL; node_remap_alloc_vaddr[nid] += size; @@ -209,26 +216,6 @@ void *alloc_remap(int nid, unsigned long size) return allocation; } -static void __init remap_numa_kva(void) -{ - void *vaddr; - unsigned long pfn; - int node; - - for_each_online_node(node) { - printk(KERN_DEBUG "remap_numa_kva: node %d\n", node); - for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { - vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT); - printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n", - (unsigned long)vaddr, - node_remap_start_pfn[node] + pfn); - set_pmd_pfn((ulong) vaddr, - node_remap_start_pfn[node] + pfn, - PAGE_KERNEL_LARGE); - } - } -} - #ifdef CONFIG_HIBERNATION /** * resume_map_numa_kva - add KVA mapping to the temporary page tables created @@ -240,15 +227,16 @@ void resume_map_numa_kva(pgd_t *pgd_base) int node; for_each_online_node(node) { - unsigned long start_va, start_pfn, size, pfn; + unsigned long start_va, start_pfn, nr_pages, pfn; start_va = (unsigned long)node_remap_start_vaddr[node]; start_pfn = node_remap_start_pfn[node]; - size = node_remap_size[node]; + nr_pages = (node_remap_end_vaddr[node] - + node_remap_start_vaddr[node]) >> PAGE_SHIFT; printk(KERN_DEBUG "%s: node %d\n", __func__, node); - for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) { + for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) { unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); pgd_t *pgd = pgd_base + pgd_index(vaddr); pud_t *pud = pud_offset(pgd, vaddr); @@ -264,132 +252,102 @@ void resume_map_numa_kva(pgd_t *pgd_base) } #endif -static __init unsigned long calculate_numa_remap_pages(void) +/** + * init_alloc_remap - Initialize remap allocator for a NUMA node + * @nid: NUMA node to initizlie remap allocator for + * + * NUMA nodes may end up without any lowmem. As allocating pgdat and + * memmap on a different node with lowmem is inefficient, a special + * remap allocator is implemented which can be used by alloc_remap(). + * + * For each node, the amount of memory which will be necessary for + * pgdat and memmap is calculated and two memory areas of the size are + * allocated - one in the node and the other in lowmem; then, the area + * in the node is remapped to the lowmem area. + * + * As pgdat and memmap must be allocated in lowmem anyway, this + * doesn't waste lowmem address space; however, the actual lowmem + * which gets remapped over is wasted. The amount shouldn't be + * problematic on machines this feature will be used. + * + * Initialization failure isn't fatal. alloc_remap() is used + * opportunistically and the callers will fall back to other memory + * allocation mechanisms on failure. + */ +static __init void init_alloc_remap(int nid) { - int nid; - unsigned long size, reserve_pages = 0; + unsigned long size, pfn; + u64 node_pa, remap_pa; + void *remap_va; - for_each_online_node(nid) { - u64 node_kva_target; - u64 node_kva_final; - - /* - * The acpi/srat node info can show hot-add memroy zones - * where memory could be added but not currently present. - */ - printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", - nid, node_start_pfn[nid], node_end_pfn[nid]); - if (node_start_pfn[nid] > max_pfn) - continue; - if (!node_end_pfn[nid]) - continue; - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - - /* ensure the remap includes space for the pgdat. */ - size = node_remap_size[nid] + sizeof(pg_data_t); - - /* convert size to large (pmd size) pages, rounding up */ - size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; - /* now the roundup is correct, convert to PAGE_SIZE pages */ - size = size * PTRS_PER_PTE; - - node_kva_target = round_down(node_end_pfn[nid] - size, - PTRS_PER_PTE); - node_kva_target <<= PAGE_SHIFT; - do { - node_kva_final = memblock_find_in_range(node_kva_target, - ((u64)node_end_pfn[nid])<<PAGE_SHIFT, - ((u64)size)<<PAGE_SHIFT, - LARGE_PAGE_BYTES); - node_kva_target -= LARGE_PAGE_BYTES; - } while (node_kva_final == MEMBLOCK_ERROR && - (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid])); - - if (node_kva_final == MEMBLOCK_ERROR) - panic("Can not get kva ram\n"); - - node_remap_size[nid] = size; - node_remap_offset[nid] = reserve_pages; - reserve_pages += size; - printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of" - " node %d at %llx\n", - size, nid, node_kva_final>>PAGE_SHIFT); - - /* - * prevent kva address below max_low_pfn want it on system - * with less memory later. - * layout will be: KVA address , KVA RAM - * - * we are supposed to only record the one less then max_low_pfn - * but we could have some hole in high memory, and it will only - * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide - * to use it as free. - * So memblock_x86_reserve_range here, hope we don't run out of that array - */ - memblock_x86_reserve_range(node_kva_final, - node_kva_final+(((u64)size)<<PAGE_SHIFT), - "KVA RAM"); - - node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT; - } - printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", - reserve_pages); - return reserve_pages; -} + /* + * The acpi/srat node info can show hot-add memroy zones where + * memory could be added but not currently present. + */ + printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", + nid, node_start_pfn[nid], node_end_pfn[nid]); + if (node_start_pfn[nid] > max_pfn) + return; + if (!node_end_pfn[nid]) + return; + if (node_end_pfn[nid] > max_pfn) + node_end_pfn[nid] = max_pfn; -static void init_remap_allocator(int nid) -{ - node_remap_start_vaddr[nid] = pfn_to_kaddr( - kva_start_pfn + node_remap_offset[nid]); - node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + - (node_remap_size[nid] * PAGE_SIZE); - node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + - ALIGN(sizeof(pg_data_t), PAGE_SIZE); - - printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, - (ulong) node_remap_start_vaddr[nid], - (ulong) node_remap_end_vaddr[nid]); + /* calculate the necessary space aligned to large page size */ + size = node_memmap_size_bytes(nid, node_start_pfn[nid], + min(node_end_pfn[nid], max_pfn)); + size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); + size = ALIGN(size, LARGE_PAGE_BYTES); + + /* allocate node memory and the lowmem remap area */ + node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, + (u64)node_end_pfn[nid] << PAGE_SHIFT, + size, LARGE_PAGE_BYTES); + if (node_pa == MEMBLOCK_ERROR) { + pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", + size, nid); + return; + } + memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); + + remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, + max_low_pfn << PAGE_SHIFT, + size, LARGE_PAGE_BYTES); + if (remap_pa == MEMBLOCK_ERROR) { + pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", + size, nid); + memblock_x86_free_range(node_pa, node_pa + size); + return; + } + memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG"); + remap_va = phys_to_virt(remap_pa); + + /* perform actual remap */ + for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE) + set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT), + (node_pa >> PAGE_SHIFT) + pfn, + PAGE_KERNEL_LARGE); + + /* initialize remap allocator parameters */ + node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; + node_remap_start_vaddr[nid] = remap_va; + node_remap_end_vaddr[nid] = remap_va + size; + node_remap_alloc_vaddr[nid] = remap_va; + + printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n", + nid, node_pa, node_pa + size, remap_va, remap_va + size); } void __init initmem_init(void) { int nid; - long kva_target_pfn; - - /* - * When mapping a NUMA machine we allocate the node_mem_map arrays - * from node local memory. They are then mapped directly into KVA - * between zone normal and vmalloc space. Calculate the size of - * this space and use it to adjust the boundary between ZONE_NORMAL - * and ZONE_HIGHMEM. - */ get_memcfg_numa(); numa_init_array(); - kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); - - kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); - do { - kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT, - max_low_pfn<<PAGE_SHIFT, - kva_pages<<PAGE_SHIFT, - PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT; - kva_target_pfn -= PTRS_PER_PTE; - } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn); - - if (kva_start_pfn == MEMBLOCK_ERROR) - panic("Can not get kva space\n"); - - printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", - kva_start_pfn, max_low_pfn); - printk(KERN_INFO "max_pfn = %lx\n", max_pfn); + for_each_online_node(nid) + init_alloc_remap(nid); - /* avoid clash with initrd */ - memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT, - (kva_start_pfn + kva_pages)<<PAGE_SHIFT, - "KVA PG"); #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) @@ -409,12 +367,8 @@ void __init initmem_init(void) printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", (ulong) pfn_to_kaddr(max_low_pfn)); - for_each_online_node(nid) { - init_remap_allocator(nid); - + for_each_online_node(nid) allocate_pgdat(nid); - } - remap_numa_kva(); printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index e8c00cc72033..13f5b068e8c2 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -28,125 +28,10 @@ EXPORT_SYMBOL(node_data); nodemask_t numa_nodes_parsed __initdata; -struct memnode memnode; - -static unsigned long __initdata nodemap_addr; -static unsigned long __initdata nodemap_size; - static struct numa_meminfo numa_meminfo __initdata; - static int numa_distance_cnt; static u8 *numa_distance; -/* - * Given a shift value, try to populate memnodemap[] - * Returns : - * 1 if OK - * 0 if memnodmap[] too small (of shift too small) - * -1 if node overlap or lost ram (shift too big) - */ -static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift) -{ - unsigned long addr, end; - int i, res = -1; - - memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); - for (i = 0; i < mi->nr_blks; i++) { - addr = mi->blk[i].start; - end = mi->blk[i].end; - if (addr >= end) - continue; - if ((end >> shift) >= memnodemapsize) - return 0; - do { - if (memnodemap[addr >> shift] != NUMA_NO_NODE) - return -1; - memnodemap[addr >> shift] = mi->blk[i].nid; - addr += (1UL << shift); - } while (addr < end); - res = 1; - } - return res; -} - -static int __init allocate_cachealigned_memnodemap(void) -{ - unsigned long addr; - - memnodemap = memnode.embedded_map; - if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) - return 0; - - addr = 0x8000; - nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); - nodemap_addr = memblock_find_in_range(addr, get_max_mapped(), - nodemap_size, L1_CACHE_BYTES); - if (nodemap_addr == MEMBLOCK_ERROR) { - printk(KERN_ERR - "NUMA: Unable to allocate Memory to Node hash map\n"); - nodemap_addr = nodemap_size = 0; - return -1; - } - memnodemap = phys_to_virt(nodemap_addr); - memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP"); - - printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", - nodemap_addr, nodemap_addr + nodemap_size); - return 0; -} - -/* - * The LSB of all start and end addresses in the node map is the value of the - * maximum possible shift. - */ -static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi) -{ - int i, nodes_used = 0; - unsigned long start, end; - unsigned long bitfield = 0, memtop = 0; - - for (i = 0; i < mi->nr_blks; i++) { - start = mi->blk[i].start; - end = mi->blk[i].end; - if (start >= end) - continue; - bitfield |= start; - nodes_used++; - if (end > memtop) - memtop = end; - } - if (nodes_used <= 1) - i = 63; - else - i = find_first_bit(&bitfield, sizeof(unsigned long)*8); - memnodemapsize = (memtop >> i)+1; - return i; -} - -static int __init compute_hash_shift(const struct numa_meminfo *mi) -{ - int shift; - - shift = extract_lsb_from_nodes(mi); - if (allocate_cachealigned_memnodemap()) - return -1; - printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", - shift); - - if (populate_memnodemap(mi, shift) != 1) { - printk(KERN_INFO "Your memory is not aligned you need to " - "rebuild your kernel with a bigger NODEMAPSIZE " - "shift=%d\n", shift); - return -1; - } - return shift; -} - -int __meminit __early_pfn_to_nid(unsigned long pfn) -{ - return phys_to_nid(pfn << PAGE_SHIFT); -} - static void * __init early_node_mem(int nodeid, unsigned long start, unsigned long end, unsigned long size, unsigned long align) @@ -270,7 +155,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA"); printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, nodedata_phys + pgdat_size - 1); - nid = phys_to_nid(nodedata_phys); + nid = early_pfn_to_nid(nodedata_phys >> PAGE_SHIFT); if (nid != nodeid) printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); @@ -527,12 +412,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) if (WARN_ON(nodes_empty(node_possible_map))) return -EINVAL; - memnode_shift = compute_hash_shift(mi); - if (memnode_shift < 0) { - printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n"); - return -EINVAL; - } - for (i = 0; i < mi->nr_blks; i++) memblock_x86_register_active_regions(mi->blk[i].nid, mi->blk[i].start >> PAGE_SHIFT, @@ -626,17 +505,13 @@ static int __init numa_init(int (*init_func)(void)) void __init initmem_init(void) { - int ret; - if (!numa_off) { #ifdef CONFIG_ACPI_NUMA - ret = numa_init(x86_acpi_numa_init); - if (!ret) + if (!numa_init(x86_acpi_numa_init)) return; #endif #ifdef CONFIG_AMD_NUMA - ret = numa_init(amd_numa_init); - if (!ret) + if (!numa_init(amd_numa_init)) return; #endif } diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index 364f36bdfad8..ae20046a9e98 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -278,7 +278,6 @@ int __init get_memcfg_from_srat(void) unsigned long end = min(node_end_pfn[nid], max_pfn); memory_present(nid, start, end); - node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); } return 1; out_fail: diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile index c2a8cab65e5d..81c5e2165c24 100644 --- a/arch/x86/platform/olpc/Makefile +++ b/arch/x86/platform/olpc/Makefile @@ -1,4 +1,2 @@ -obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_OLPC) += olpc.o olpc_ofw.o olpc_dt.o obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o -obj-$(CONFIG_OLPC) += olpc_ofw.o -obj-$(CONFIG_OF_PROMTREE) += olpc_dt.o diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index edaf3fe8dc5e..0060fd59ea00 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -18,6 +18,7 @@ #include <linux/io.h> #include <linux/string.h> #include <linux/platform_device.h> +#include <linux/of.h> #include <asm/geode.h> #include <asm/setup.h> @@ -187,41 +188,43 @@ err: } EXPORT_SYMBOL_GPL(olpc_ec_cmd); -static bool __init check_ofw_architecture(void) +static bool __init check_ofw_architecture(struct device_node *root) { - size_t propsize; - char olpc_arch[5]; - const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 }; - void *res[] = { &propsize }; + const char *olpc_arch; + int propsize; - if (olpc_ofw("getprop", args, res)) { - printk(KERN_ERR "ofw: getprop call failed!\n"); - return false; - } + olpc_arch = of_get_property(root, "architecture", &propsize); return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0; } -static u32 __init get_board_revision(void) +static u32 __init get_board_revision(struct device_node *root) { - size_t propsize; - __be32 rev; - const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; - void *res[] = { &propsize }; - - if (olpc_ofw("getprop", args, res) || propsize != 4) { - printk(KERN_ERR "ofw: getprop call failed!\n"); - return cpu_to_be32(0); - } - return be32_to_cpu(rev); + int propsize; + const __be32 *rev; + + rev = of_get_property(root, "board-revision-int", &propsize); + if (propsize != 4) + return 0; + + return be32_to_cpu(*rev); } static bool __init platform_detect(void) { - if (!check_ofw_architecture()) + struct device_node *root = of_find_node_by_path("/"); + bool success; + + if (!root) return false; - olpc_platform_info.flags |= OLPC_F_PRESENT; - olpc_platform_info.boardrev = get_board_revision(); - return true; + + success = check_ofw_architecture(root); + if (success) { + olpc_platform_info.boardrev = get_board_revision(root); + olpc_platform_info.flags |= OLPC_F_PRESENT; + } + + of_node_put(root); + return success; } static int __init add_xo1_platform_devices(void) diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index 044bda5b3174..d39f63d017d2 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -19,7 +19,9 @@ #include <linux/kernel.h> #include <linux/bootmem.h> #include <linux/of.h> +#include <linux/of_platform.h> #include <linux/of_pdt.h> +#include <asm/olpc.h> #include <asm/olpc_ofw.h> static phandle __init olpc_dt_getsibling(phandle node) @@ -180,3 +182,20 @@ void __init olpc_dt_build_devicetree(void) pr_info("PROM DT: Built device tree with %u bytes of memory.\n", prom_early_allocated); } + +/* A list of DT node/bus matches that we want to expose as platform devices */ +static struct of_device_id __initdata of_ids[] = { + { .compatible = "olpc,xo1-battery" }, + { .compatible = "olpc,xo1-dcon" }, + { .compatible = "olpc,xo1-rtc" }, + {}, +}; + +static int __init olpc_create_platform_devices(void) +{ + if (machine_is_olpc()) + return of_platform_bus_probe(NULL, of_ids, NULL); + else + return 0; +} +device_initcall(olpc_create_platform_devices); diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 9daf5d1af9f1..0eb90184515f 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -40,7 +40,6 @@ static struct clocksource clocksource_uv = { .rating = 400, .read = uv_read_rtc, .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, - .shift = 10, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -372,14 +371,11 @@ static __init int uv_rtc_setup_clock(void) if (!is_uv_system()) return -ENODEV; - clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, - clocksource_uv.shift); - /* If single blade, prefer tsc */ if (uv_num_possible_blades() == 1) clocksource_uv.rating = 250; - rc = clocksource_register(&clocksource_uv); + rc = clocksource_register_hz(&clocksource_uv, sn_rtc_cycles_per_second); if (rc) printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); else diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 30612441ed99..762b46ab14d5 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -46,13 +46,12 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. */ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) { inc_irq_stat(irq_resched_count); + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 2e2d370a47b1..c532d280ce36 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -26,8 +26,6 @@ #include "xen-ops.h" -#define XEN_SHIFT 22 - /* Xen may fire a timer up to this many ns early */ #define TIMER_SLOP 100000 #define NS_PER_TICK (1000000000LL / HZ) @@ -211,8 +209,6 @@ static struct clocksource xen_clocksource __read_mostly = { .rating = 400, .read = xen_clocksource_get_cycles, .mask = ~0, - .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */ - .shift = XEN_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -448,7 +444,7 @@ static __init void xen_time_init(void) int cpu = smp_processor_id(); struct timespec tp; - clocksource_register(&xen_clocksource); + clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC); if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { /* Successfully turned off 100Hz tick, so we have the diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index ad3501739563..605a2954ef17 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -710,20 +710,14 @@ static int acpi_processor_get_throttling_fadt(struct acpi_processor *pr) } #ifdef CONFIG_X86 -static int acpi_throttling_rdmsr(struct acpi_processor *pr, - u64 *value) +static int acpi_throttling_rdmsr(u64 *value) { - struct cpuinfo_x86 *c; u64 msr_high, msr_low; - unsigned int cpu; u64 msr = 0; int ret = -1; - cpu = pr->id; - c = &cpu_data(cpu); - - if ((c->x86_vendor != X86_VENDOR_INTEL) || - !cpu_has(c, X86_FEATURE_ACPI)) { + if ((this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_INTEL) || + !this_cpu_has(X86_FEATURE_ACPI)) { printk(KERN_ERR PREFIX "HARDWARE addr space,NOT supported yet\n"); } else { @@ -738,18 +732,13 @@ static int acpi_throttling_rdmsr(struct acpi_processor *pr, return ret; } -static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value) +static int acpi_throttling_wrmsr(u64 value) { - struct cpuinfo_x86 *c; - unsigned int cpu; int ret = -1; u64 msr; - cpu = pr->id; - c = &cpu_data(cpu); - - if ((c->x86_vendor != X86_VENDOR_INTEL) || - !cpu_has(c, X86_FEATURE_ACPI)) { + if ((this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_INTEL) || + !this_cpu_has(X86_FEATURE_ACPI)) { printk(KERN_ERR PREFIX "HARDWARE addr space,NOT supported yet\n"); } else { @@ -761,15 +750,14 @@ static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value) return ret; } #else -static int acpi_throttling_rdmsr(struct acpi_processor *pr, - u64 *value) +static int acpi_throttling_rdmsr(u64 *value) { printk(KERN_ERR PREFIX "HARDWARE addr space,NOT supported yet\n"); return -1; } -static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value) +static int acpi_throttling_wrmsr(u64 value) { printk(KERN_ERR PREFIX "HARDWARE addr space,NOT supported yet\n"); @@ -801,7 +789,7 @@ static int acpi_read_throttling_status(struct acpi_processor *pr, ret = 0; break; case ACPI_ADR_SPACE_FIXED_HARDWARE: - ret = acpi_throttling_rdmsr(pr, value); + ret = acpi_throttling_rdmsr(value); break; default: printk(KERN_ERR PREFIX "Unknown addr space %d\n", @@ -834,7 +822,7 @@ static int acpi_write_throttling_state(struct acpi_processor *pr, ret = 0; break; case ACPI_ADR_SPACE_FIXED_HARDWARE: - ret = acpi_throttling_wrmsr(pr, value); + ret = acpi_throttling_wrmsr(value); break; default: printk(KERN_ERR PREFIX "Unknown addr space %d\n", diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 7066e801b9d3..051474c65b78 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -84,8 +84,6 @@ static struct clocksource clocksource_hpet = { .rating = 250, .read = read_hpet, .mask = CLOCKSOURCE_MASK(64), - .mult = 0, /* to be calculated */ - .shift = 10, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static struct clocksource *hpet_clocksource; @@ -934,9 +932,7 @@ int hpet_alloc(struct hpet_data *hdp) if (!hpet_clocksource) { hpet_mctr = (void __iomem *)&hpetp->hp_hpet->hpet_mc; CLKSRC_FSYS_MMIO_SET(clocksource_hpet.fsys_mmio, hpet_mctr); - clocksource_hpet.mult = clocksource_hz2mult(hpetp->hp_tick_freq, - clocksource_hpet.shift); - clocksource_register(&clocksource_hpet); + clocksource_register_hz(&clocksource_hpet, hpetp->hp_tick_freq); hpetp->hp_clocksource = &clocksource_hpet; hpet_clocksource = &clocksource_hpet; } diff --git a/drivers/clocksource/cyclone.c b/drivers/clocksource/cyclone.c index 64e528e8bfa6..72f811f73e9c 100644 --- a/drivers/clocksource/cyclone.c +++ b/drivers/clocksource/cyclone.c @@ -29,8 +29,6 @@ static struct clocksource clocksource_cyclone = { .rating = 250, .read = read_cyclone, .mask = CYCLONE_TIMER_MASK, - .mult = 10, - .shift = 0, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -108,12 +106,8 @@ static int __init init_cyclone_clocksource(void) } cyclone_ptr = cyclone_timer; - /* sort out mult/shift values: */ - clocksource_cyclone.shift = 22; - clocksource_cyclone.mult = clocksource_hz2mult(CYCLONE_TIMER_FREQ, - clocksource_cyclone.shift); - - return clocksource_register(&clocksource_cyclone); + return clocksource_register_hz(&clocksource_cyclone, + CYCLONE_TIMER_FREQ); } arch_initcall(init_cyclone_clocksource); diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index c37b21ad5a3b..94c1f38e922a 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -161,7 +161,7 @@ struct clocksource { /* * First part of structure is read mostly */ - char *name; + const char *name; struct list_head list; int rating; cycle_t (*read)(struct clocksource *cs); diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 94b48bd40dd7..c75471db576e 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -51,7 +51,7 @@ struct mutex { spinlock_t wait_lock; struct list_head wait_list; #if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) - struct thread_info *owner; + struct task_struct *owner; #endif #ifdef CONFIG_DEBUG_MUTEXES const char *name; diff --git a/include/linux/sched.h b/include/linux/sched.h index 18d63cea2848..171ba24b08a7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -360,7 +360,7 @@ extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); -extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); +extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; struct user_namespace; @@ -868,6 +868,7 @@ static inline int sd_power_saving_flags(void) struct sched_group { struct sched_group *next; /* Must be a circular list */ + atomic_t ref; /* * CPU power of this group, SCHED_LOAD_SCALE being max power for a @@ -882,9 +883,6 @@ struct sched_group { * NOTE: this field is variable length. (Allocated dynamically * by attaching extra space to the end of the structure, * depending on how many CPUs the kernel has booted up with) - * - * It is also be embedded into static data structures at build - * time. (See 'struct static_sched_group' in kernel/sched.c) */ unsigned long cpumask[0]; }; @@ -894,17 +892,6 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) return to_cpumask(sg->cpumask); } -enum sched_domain_level { - SD_LV_NONE = 0, - SD_LV_SIBLING, - SD_LV_MC, - SD_LV_BOOK, - SD_LV_CPU, - SD_LV_NODE, - SD_LV_ALLNODES, - SD_LV_MAX -}; - struct sched_domain_attr { int relax_domain_level; }; @@ -913,6 +900,8 @@ struct sched_domain_attr { .relax_domain_level = -1, \ } +extern int sched_domain_level_max; + struct sched_domain { /* These fields must be setup */ struct sched_domain *parent; /* top domain must be null terminated */ @@ -930,7 +919,7 @@ struct sched_domain { unsigned int forkexec_idx; unsigned int smt_gain; int flags; /* See SD_* */ - enum sched_domain_level level; + int level; /* Runtime fields. */ unsigned long last_balance; /* init to jiffies. units in jiffies */ @@ -973,6 +962,10 @@ struct sched_domain { #ifdef CONFIG_SCHED_DEBUG char *name; #endif + union { + void *private; /* used during construction */ + struct rcu_head rcu; /* used during destruction */ + }; unsigned int span_weight; /* @@ -981,9 +974,6 @@ struct sched_domain { * NOTE: this field is variable length. (Allocated dynamically * by attaching extra space to the end of the structure, * depending on how many CPUs the kernel has booted up with) - * - * It is also be embedded into static data structures at build - * time. (See 'struct static_sched_domain' in kernel/sched.c) */ unsigned long span[0]; }; @@ -1048,8 +1038,12 @@ struct sched_domain; #define WF_FORK 0x02 /* child wakeup after fork */ #define ENQUEUE_WAKEUP 1 -#define ENQUEUE_WAKING 2 -#define ENQUEUE_HEAD 4 +#define ENQUEUE_HEAD 2 +#ifdef CONFIG_SMP +#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ +#else +#define ENQUEUE_WAKING 0 +#endif #define DEQUEUE_SLEEP 1 @@ -1067,12 +1061,11 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct rq *rq, struct task_struct *p, - int sd_flag, int flags); + int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); - void (*task_waking) (struct rq *this_rq, struct task_struct *task); + void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, @@ -1200,10 +1193,10 @@ struct task_struct { int lock_depth; /* BKL lock depth */ #ifdef CONFIG_SMP -#ifdef __ARCH_WANT_UNLOCKED_CTXSW - int oncpu; -#endif + struct task_struct *wake_entry; + int on_cpu; #endif + int on_rq; int prio, static_prio, normal_prio; unsigned int rt_priority; @@ -1274,6 +1267,7 @@ struct task_struct { /* Revert to default priority/policy when forking */ unsigned sched_reset_on_fork:1; + unsigned sched_contributes_to_load:1; pid_t pid; pid_t tgid; @@ -2192,8 +2186,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP +void scheduler_ipi(void); extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else +static inline void scheduler_ipi(void) { } static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state) { diff --git a/init/Kconfig b/init/Kconfig index 0edda616640f..94bf28dc9220 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -827,6 +827,11 @@ config SCHED_AUTOGROUP desktop applications. Task group autogeneration is currently based upon task session. +config SCHED_TTWU_QUEUE + bool + depends on !SPARC32 + default y + config MM_OWNER bool diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 33eee16addb8..2bb8c2e98fff 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1159,7 +1159,7 @@ int current_cpuset_is_being_rebound(void) static int update_relax_domain_level(struct cpuset *cs, s64 val) { #ifdef CONFIG_SMP - if (val < -1 || val >= SD_LV_MAX) + if (val < -1 || val >= sched_domain_level_max) return -EINVAL; #endif diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index ec815a960b5d..73da83aff418 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock) return; DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); + DEBUG_LOCKS_WARN_ON(lock->owner != current); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); mutex_clear_owner(lock); } diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index 57d527a16f9d..0799fd3e4cfa 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current_thread_info(); + lock->owner = current; } static inline void mutex_clear_owner(struct mutex *lock) diff --git a/kernel/mutex.c b/kernel/mutex.c index c4195fa98900..fe4706cb0c5b 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -160,7 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ for (;;) { - struct thread_info *owner; + struct task_struct *owner; /* * If we own the BKL, then don't spin. The owner of diff --git a/kernel/mutex.h b/kernel/mutex.h index 67578ca48f94..4115fbf83b12 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h @@ -19,7 +19,7 @@ #ifdef CONFIG_SMP static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current_thread_info(); + lock->owner = current; } static inline void mutex_clear_owner(struct mutex *lock) diff --git a/kernel/sched.c b/kernel/sched.c index 312f8b95c2d4..bc31f4ee2d98 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -231,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) #endif /* - * sched_domains_mutex serializes calls to arch_init_sched_domains, + * sched_domains_mutex serializes calls to init_sched_domains, * detach_destroy_domains and partition_sched_domains. */ static DEFINE_MUTEX(sched_domains_mutex); @@ -312,6 +312,9 @@ struct cfs_rq { u64 exec_clock; u64 min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif struct rb_root tasks_timeline; struct rb_node *rb_leftmost; @@ -417,6 +420,7 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; + struct rcu_head rcu; cpumask_var_t span; cpumask_var_t online; @@ -553,6 +557,10 @@ struct rq { unsigned int ttwu_count; unsigned int ttwu_local; #endif + +#ifdef CONFIG_SMP + struct task_struct *wake_list; +#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -571,7 +579,7 @@ static inline int cpu_of(struct rq *rq) #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ - rcu_read_lock_sched_held() || \ + rcu_read_lock_held() || \ lockdep_is_held(&sched_domains_mutex)) /* @@ -596,7 +604,7 @@ static inline int cpu_of(struct rq *rq) * Return the group to which this tasks belongs. * * We use task_subsys_state_check() and extend the RCU verification - * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() + * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() * holds that lock for each task it moves into the cgroup. Therefore * by holding that lock, we pin the task to the current cgroup. */ @@ -606,7 +614,7 @@ static inline struct task_group *task_group(struct task_struct *p) struct cgroup_subsys_state *css; css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&task_rq(p)->lock)); + lockdep_is_held(&p->pi_lock)); tg = container_of(css, struct task_group, css); return autogroup_task_group(p, tg); @@ -838,18 +846,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p) return rq->curr == p; } -#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline int task_running(struct rq *rq, struct task_struct *p) { +#ifdef CONFIG_SMP + return p->on_cpu; +#else return task_current(rq, p); +#endif } +#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif } static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->on_cpu = 0; +#endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ rq->lock.owner = current; @@ -865,15 +894,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline int task_running(struct rq *rq, struct task_struct *p) -{ -#ifdef CONFIG_SMP - return p->oncpu; -#else - return task_current(rq, p); -#endif -} - static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { #ifdef CONFIG_SMP @@ -882,7 +902,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) * SMP rebalancing from interrupt is the only thing that cares * here. */ - next->oncpu = 1; + next->on_cpu = 1; #endif #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW raw_spin_unlock_irq(&rq->lock); @@ -895,12 +915,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { #ifdef CONFIG_SMP /* - * After ->oncpu is cleared, the task can be moved to a different CPU. + * After ->on_cpu is cleared, the task can be moved to a different CPU. * We must ensure this doesn't happen until the switch is completely * finished. */ smp_wmb(); - prev->oncpu = 0; + prev->on_cpu = 0; #endif #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); @@ -909,23 +929,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* - * Check whether the task is waking, we use this to synchronize ->cpus_allowed - * against ttwu(). - */ -static inline int task_is_waking(struct task_struct *p) -{ - return unlikely(p->state == TASK_WAKING); -} - -/* - * __task_rq_lock - lock the runqueue a given task resides on. - * Must be called interrupts disabled. + * __task_rq_lock - lock the rq @p resides on. */ static inline struct rq *__task_rq_lock(struct task_struct *p) __acquires(rq->lock) { struct rq *rq; + lockdep_assert_held(&p->pi_lock); + for (;;) { rq = task_rq(p); raw_spin_lock(&rq->lock); @@ -936,22 +948,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) } /* - * task_rq_lock - lock the runqueue a given task resides on and disable - * interrupts. Note the ordering: we can safely lookup the task_rq without - * explicitly disabling preemption. + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. */ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) __acquires(rq->lock) { struct rq *rq; for (;;) { - local_irq_save(*flags); + raw_spin_lock_irqsave(&p->pi_lock, *flags); rq = task_rq(p); raw_spin_lock(&rq->lock); if (likely(rq == task_rq(p))) return rq; - raw_spin_unlock_irqrestore(&rq->lock, *flags); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } } @@ -961,10 +973,13 @@ static void __task_rq_unlock(struct rq *rq) raw_spin_unlock(&rq->lock); } -static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) __releases(rq->lock) + __releases(p->pi_lock) { - raw_spin_unlock_irqrestore(&rq->lock, *flags); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } /* @@ -1773,7 +1788,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) update_rq_clock(rq); sched_info_queued(p); p->sched_class->enqueue_task(rq, p, flags); - p->se.on_rq = 1; } static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) @@ -1781,7 +1795,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) update_rq_clock(rq); sched_info_dequeued(p); p->sched_class->dequeue_task(rq, p, flags); - p->se.on_rq = 0; } /* @@ -2116,7 +2129,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) + if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) rq->skip_clock_update = 1; } @@ -2162,6 +2175,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) */ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); + +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock))); +#endif #endif trace_sched_migrate_task(p, new_cpu); @@ -2182,19 +2200,6 @@ struct migration_arg { static int migration_cpu_stop(void *data); /* - * The task's runqueue lock must be held. - * Returns true if you have to wait for migration thread. - */ -static bool migrate_task(struct task_struct *p, struct rq *rq) -{ - /* - * If the task is not on a runqueue (and not running), then - * the next wake-up will properly place the task. - */ - return p->se.on_rq || task_running(rq, p); -} - -/* * wait_task_inactive - wait for a thread to unschedule. * * If @match_state is nonzero, it's the @p->state value just checked and @@ -2251,11 +2256,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) rq = task_rq_lock(p, &flags); trace_sched_wait_task(p); running = task_running(rq, p); - on_rq = p->se.on_rq; + on_rq = p->on_rq; ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); /* * If it changed from the expected state, bail out now. @@ -2330,7 +2335,7 @@ EXPORT_SYMBOL_GPL(kick_process); #ifdef CONFIG_SMP /* - * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. + * ->cpus_allowed is protected by both rq->lock and p->pi_lock */ static int select_fallback_rq(int cpu, struct task_struct *p) { @@ -2363,12 +2368,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } /* - * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ static inline -int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) { - int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); + int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -2394,27 +2399,60 @@ static void update_avg(u64 *avg, u64 sample) } #endif -static inline void ttwu_activate(struct task_struct *p, struct rq *rq, - bool is_sync, bool is_migrate, bool is_local, - unsigned long en_flags) +static void +ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { +#ifdef CONFIG_SCHEDSTATS + struct rq *rq = this_rq(); + +#ifdef CONFIG_SMP + int this_cpu = smp_processor_id(); + + if (cpu == this_cpu) { + schedstat_inc(rq, ttwu_local); + schedstat_inc(p, se.statistics.nr_wakeups_local); + } else { + struct sched_domain *sd; + + schedstat_inc(p, se.statistics.nr_wakeups_remote); + for_each_domain(this_cpu, sd) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { + schedstat_inc(sd, ttwu_wake_remote); + break; + } + } + } +#endif /* CONFIG_SMP */ + + schedstat_inc(rq, ttwu_count); schedstat_inc(p, se.statistics.nr_wakeups); - if (is_sync) + + if (wake_flags & WF_SYNC) schedstat_inc(p, se.statistics.nr_wakeups_sync); - if (is_migrate) + + if (cpu != task_cpu(p)) schedstat_inc(p, se.statistics.nr_wakeups_migrate); - if (is_local) - schedstat_inc(p, se.statistics.nr_wakeups_local); - else - schedstat_inc(p, se.statistics.nr_wakeups_remote); +#endif /* CONFIG_SCHEDSTATS */ +} + +static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) +{ activate_task(rq, p, en_flags); + p->on_rq = 1; + + /* if a worker is waking up, notify workqueue */ + if (p->flags & PF_WQ_WORKER) + wq_worker_waking_up(p, cpu_of(rq)); } -static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, - int wake_flags, bool success) +/* + * Mark the task runnable and perform wakeup-preemption. + */ +static void +ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { - trace_sched_wakeup(p, success); + trace_sched_wakeup(p, true); check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@ -2433,9 +2471,99 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, rq->idle_stamp = 0; } #endif - /* if a worker is waking up, notify workqueue */ - if ((p->flags & PF_WQ_WORKER) && success) - wq_worker_waking_up(p, cpu_of(rq)); +} + +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +{ +#ifdef CONFIG_SMP + if (p->sched_contributes_to_load) + rq->nr_uninterruptible--; +#endif + + ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); + ttwu_do_wakeup(rq, p, wake_flags); +} + +/* + * Called in case the task @p isn't fully descheduled from its runqueue, + * in this case we must do a remote wakeup. Its a 'light' wakeup though, + * since all we need to do is flip p->state to TASK_RUNNING, since + * the task is still ->on_rq. + */ +static int ttwu_remote(struct task_struct *p, int wake_flags) +{ + struct rq *rq; + int ret = 0; + + rq = __task_rq_lock(p); + if (p->on_rq) { + ttwu_do_wakeup(rq, p, wake_flags); + ret = 1; + } + __task_rq_unlock(rq); + + return ret; +} + +#ifdef CONFIG_SMP +static void sched_ttwu_pending(void) +{ + struct rq *rq = this_rq(); + struct task_struct *list = xchg(&rq->wake_list, NULL); + + if (!list) + return; + + raw_spin_lock(&rq->lock); + + while (list) { + struct task_struct *p = list; + list = list->wake_entry; + ttwu_do_activate(rq, p, 0); + } + + raw_spin_unlock(&rq->lock); +} + +void scheduler_ipi(void) +{ + sched_ttwu_pending(); +} + +static void ttwu_queue_remote(struct task_struct *p, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct task_struct *next = rq->wake_list; + + for (;;) { + struct task_struct *old = next; + + p->wake_entry = next; + next = cmpxchg(&rq->wake_list, old, p); + if (next == old) + break; + } + + if (!next) + smp_send_reschedule(cpu); +} +#endif + +static void ttwu_queue(struct task_struct *p, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + +#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE) + if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { + ttwu_queue_remote(p, cpu); + return; + } +#endif + + raw_spin_lock(&rq->lock); + ttwu_do_activate(rq, p, 0); + raw_spin_unlock(&rq->lock); } /** @@ -2453,92 +2581,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, * Returns %true if @p was woken up, %false if it was already running * or @state didn't match @p's state. */ -static int try_to_wake_up(struct task_struct *p, unsigned int state, - int wake_flags) +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { - int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; - unsigned long en_flags = ENQUEUE_WAKEUP; - struct rq *rq; - - this_cpu = get_cpu(); + int cpu, success = 0; smp_wmb(); - rq = task_rq_lock(p, &flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); if (!(p->state & state)) goto out; - if (p->se.on_rq) - goto out_running; - + success = 1; /* we're going to change ->state */ cpu = task_cpu(p); - orig_cpu = cpu; -#ifdef CONFIG_SMP - if (unlikely(task_running(rq, p))) - goto out_activate; + if (p->on_rq && ttwu_remote(p, wake_flags)) + goto stat; +#ifdef CONFIG_SMP /* - * In order to handle concurrent wakeups and release the rq->lock - * we put the task in TASK_WAKING state. - * - * First fix up the nr_uninterruptible count: + * If the owning (remote) cpu is still in the middle of schedule() with + * this task as prev, wait until its done referencing the task. */ - if (task_contributes_to_load(p)) { - if (likely(cpu_online(orig_cpu))) - rq->nr_uninterruptible--; - else - this_rq()->nr_uninterruptible--; - } - p->state = TASK_WAKING; - - if (p->sched_class->task_waking) { - p->sched_class->task_waking(rq, p); - en_flags |= ENQUEUE_WAKING; + while (p->on_cpu) { +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + /* + * If called from interrupt context we could have landed in the + * middle of schedule(), in this case we should take care not + * to spin on ->on_cpu if p is current, since that would + * deadlock. + */ + if (p == current) { + ttwu_queue(p, cpu); + goto stat; + } +#endif + cpu_relax(); } - - cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) - set_task_cpu(p, cpu); - __task_rq_unlock(rq); - - rq = cpu_rq(cpu); - raw_spin_lock(&rq->lock); - /* - * We migrated the task without holding either rq->lock, however - * since the task is not on the task list itself, nobody else - * will try and migrate the task, hence the rq should match the - * cpu we just moved it to. + * Pairs with the smp_wmb() in finish_lock_switch(). */ - WARN_ON(task_cpu(p) != cpu); - WARN_ON(p->state != TASK_WAKING); + smp_rmb(); -#ifdef CONFIG_SCHEDSTATS - schedstat_inc(rq, ttwu_count); - if (cpu == this_cpu) - schedstat_inc(rq, ttwu_local); - else { - struct sched_domain *sd; - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd, ttwu_wake_remote); - break; - } - } - } -#endif /* CONFIG_SCHEDSTATS */ + p->sched_contributes_to_load = !!task_contributes_to_load(p); + p->state = TASK_WAKING; -out_activate: + if (p->sched_class->task_waking) + p->sched_class->task_waking(p); + + cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + if (task_cpu(p) != cpu) + set_task_cpu(p, cpu); #endif /* CONFIG_SMP */ - ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, - cpu == this_cpu, en_flags); - success = 1; -out_running: - ttwu_post_activation(p, rq, wake_flags, success); + + ttwu_queue(p, cpu); +stat: + ttwu_stat(p, cpu, wake_flags); out: - task_rq_unlock(rq, &flags); - put_cpu(); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); return success; } @@ -2547,31 +2647,34 @@ out: * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened * - * Put @p on the run-queue if it's not already there. The caller must + * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task. this_rq() stays locked over invocation. + * the current task. */ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - bool success = false; BUG_ON(rq != this_rq()); BUG_ON(p == current); lockdep_assert_held(&rq->lock); + if (!raw_spin_trylock(&p->pi_lock)) { + raw_spin_unlock(&rq->lock); + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + } + if (!(p->state & TASK_NORMAL)) - return; + goto out; - if (!p->se.on_rq) { - if (likely(!task_running(rq, p))) { - schedstat_inc(rq, ttwu_count); - schedstat_inc(rq, ttwu_local); - } - ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); - success = true; - } - ttwu_post_activation(p, rq, 0, success); + if (!p->on_rq) + ttwu_activate(rq, p, ENQUEUE_WAKEUP); + + ttwu_do_wakeup(rq, p, 0); + ttwu_stat(p, smp_processor_id(), 0); +out: + raw_spin_unlock(&p->pi_lock); } /** @@ -2604,19 +2707,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) */ static void __sched_fork(struct task_struct *p) { + p->on_rq = 0; + + p->se.on_rq = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; + INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif INIT_LIST_HEAD(&p->rt.run_list); - p->se.on_rq = 0; - INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -2628,6 +2733,7 @@ static void __sched_fork(struct task_struct *p) */ void sched_fork(struct task_struct *p, int clone_flags) { + unsigned long flags; int cpu = get_cpu(); __sched_fork(p); @@ -2678,16 +2784,16 @@ void sched_fork(struct task_struct *p, int clone_flags) * * Silence PROVE_RCU. */ - rcu_read_lock(); + raw_spin_lock_irqsave(&p->pi_lock, flags); set_task_cpu(p, cpu); - rcu_read_unlock(); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - p->oncpu = 0; +#if defined(CONFIG_SMP) + p->on_cpu = 0; #endif #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. */ @@ -2711,37 +2817,27 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; - int cpu __maybe_unused = get_cpu(); + raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_SMP - rq = task_rq_lock(p, &flags); - p->state = TASK_WAKING; - /* * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug - * - * We set TASK_WAKING so that select_task_rq() can drop rq->lock - * without people poking at ->cpus_allowed. */ - cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); - set_task_cpu(p, cpu); - - p->state = TASK_RUNNING; - task_rq_unlock(rq, &flags); + set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); #endif - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); activate_task(rq, p, 0); - trace_sched_wakeup_new(p, 1); + p->on_rq = 1; + trace_sched_wakeup_new(p, true); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_woken) p->sched_class->task_woken(rq, p); #endif - task_rq_unlock(rq, &flags); - put_cpu(); + task_rq_unlock(rq, p, &flags); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -3450,27 +3546,22 @@ void sched_exec(void) { struct task_struct *p = current; unsigned long flags; - struct rq *rq; int dest_cpu; - rq = task_rq_lock(p, &flags); - dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); + raw_spin_lock_irqsave(&p->pi_lock, flags); + dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); if (dest_cpu == smp_processor_id()) goto unlock; - /* - * select_task_rq() can race against ->cpus_allowed - */ - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && - likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { + if (likely(cpu_active(dest_cpu))) { struct migration_arg arg = { p, dest_cpu }; - task_rq_unlock(rq, &flags); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); return; } unlock: - task_rq_unlock(rq, &flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); } #endif @@ -3507,7 +3598,7 @@ unsigned long long task_delta_exec(struct task_struct *p) rq = task_rq_lock(p, &flags); ns = do_task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -3525,7 +3616,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) rq = task_rq_lock(p, &flags); ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -3549,7 +3640,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p) rq = task_rq_lock(p, &flags); thread_group_cputime(p, &totals); ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -4035,7 +4126,7 @@ static inline void schedule_debug(struct task_struct *prev) static void put_prev_task(struct rq *rq, struct task_struct *prev) { - if (prev->se.on_rq) + if (prev->on_rq) update_rq_clock(rq); prev->sched_class->put_prev_task(rq, prev); } @@ -4097,11 +4188,13 @@ need_resched: if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; } else { + deactivate_task(rq, prev, DEQUEUE_SLEEP); + prev->on_rq = 0; + /* - * If a worker is going to sleep, notify and - * ask workqueue whether it wants to wake up a - * task to maintain concurrency. If so, wake - * up the task. + * If a worker went to sleep, notify and ask workqueue + * whether it wants to wake up a task to maintain + * concurrency. */ if (prev->flags & PF_WQ_WORKER) { struct task_struct *to_wakeup; @@ -4110,11 +4203,10 @@ need_resched: if (to_wakeup) try_to_wake_up_local(to_wakeup); } - deactivate_task(rq, prev, DEQUEUE_SLEEP); /* - * If we are going to sleep and we have plugged IO queued, make - * sure to submit it to avoid deadlocks. + * If we are going to sleep and we have plugged IO + * queued, make sure to submit it to avoid deadlocks. */ if (blk_needs_flush_plug(prev)) { raw_spin_unlock(&rq->lock); @@ -4161,70 +4253,53 @@ need_resched: EXPORT_SYMBOL(schedule); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. - */ -int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) -{ - unsigned int cpu; - struct rq *rq; - if (!sched_feat(OWNER_SPIN)) - return 0; +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ + bool ret = false; -#ifdef CONFIG_DEBUG_PAGEALLOC - /* - * Need to access the cpu field knowing that - * DEBUG_PAGEALLOC could have unmapped it if - * the mutex owner just released it and exited. - */ - if (probe_kernel_address(&owner->cpu, cpu)) - return 0; -#else - cpu = owner->cpu; -#endif + rcu_read_lock(); + if (lock->owner != owner) + goto fail; /* - * Even if the access succeeded (likely case), - * the cpu field may no longer be valid. + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * lock->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. */ - if (cpu >= nr_cpumask_bits) - return 0; + barrier(); - /* - * We need to validate that we can do a - * get_cpu() and that we have the percpu area. - */ - if (!cpu_online(cpu)) - return 0; + ret = owner->on_cpu; +fail: + rcu_read_unlock(); - rq = cpu_rq(cpu); + return ret; +} - for (;;) { - /* - * Owner changed, break to re-assess state. - */ - if (lock->owner != owner) { - /* - * If the lock has switched to a different owner, - * we likely have heavy contention. Return 0 to quit - * optimistic spinning and not contend further: - */ - if (lock->owner) - return 0; - break; - } +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ + if (!sched_feat(OWNER_SPIN)) + return 0; - /* - * Is that owner really running on that cpu? - */ - if (task_thread_info(rq->curr) != owner || need_resched()) + while (owner_running(lock, owner)) { + if (need_resched()) return 0; arch_mutex_cpu_relax(); } + /* + * If the owner changed to another task there is likely + * heavy contention, stop spinning. + */ + if (lock->owner) + return 0; + return 1; } #endif @@ -4684,19 +4759,18 @@ EXPORT_SYMBOL(sleep_on_timeout); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - unsigned long flags; int oldprio, on_rq, running; struct rq *rq; const struct sched_class *prev_class; BUG_ON(prio < 0 || prio > MAX_PRIO); - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; - on_rq = p->se.on_rq; + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) dequeue_task(rq, p, 0); @@ -4716,7 +4790,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); check_class_changed(rq, p, prev_class, oldprio); - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); } #endif @@ -4744,7 +4818,7 @@ void set_user_nice(struct task_struct *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - on_rq = p->se.on_rq; + on_rq = p->on_rq; if (on_rq) dequeue_task(rq, p, 0); @@ -4764,7 +4838,7 @@ void set_user_nice(struct task_struct *p, long nice) resched_task(rq->curr); } out_unlock: - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); } EXPORT_SYMBOL(set_user_nice); @@ -4878,8 +4952,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) static void __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) { - BUG_ON(p->se.on_rq); - p->policy = policy; p->rt_priority = prio; p->normal_prio = normal_prio(p); @@ -4994,20 +5066,17 @@ recheck: /* * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: - */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - /* + * * To be able to change p->policy safely, the appropriate * runqueue lock must be held. */ - rq = __task_rq_lock(p); + rq = task_rq_lock(p, &flags); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return -EINVAL; } @@ -5031,8 +5100,7 @@ recheck: if (rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return -EPERM; } } @@ -5041,11 +5109,10 @@ recheck: /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); goto recheck; } - on_rq = p->se.on_rq; + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) deactivate_task(rq, p, 0); @@ -5064,8 +5131,7 @@ recheck: activate_task(rq, p, 0); check_class_changed(rq, p, prev_class, oldprio); - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); rt_mutex_adjust_pi(p); @@ -5316,7 +5382,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; unsigned long flags; - struct rq *rq; int retval; get_online_cpus(); @@ -5331,9 +5396,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) if (retval) goto out_unlock; - rq = task_rq_lock(p, &flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); - task_rq_unlock(rq, &flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); @@ -5658,7 +5723,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, rq = task_rq_lock(p, &flags); time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); rcu_read_unlock(); jiffies_to_timespec(time_slice, &t); @@ -5776,8 +5841,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - idle->oncpu = 1; +#if defined(CONFIG_SMP) + idle->on_cpu = 1; #endif raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -5881,18 +5946,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) unsigned int dest_cpu; int ret = 0; - /* - * Serialize against TASK_WAKING so that ttwu() and wunt() can - * drop the rq->lock and still rely on ->cpus_allowed. - */ -again: - while (task_is_waking(p)) - cpu_relax(); rq = task_rq_lock(p, &flags); - if (task_is_waking(p)) { - task_rq_unlock(rq, &flags); - goto again; - } if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; @@ -5917,16 +5971,16 @@ again: goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (migrate_task(p, rq)) { + if (p->on_rq) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; } out: - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); return ret; } @@ -5954,6 +6008,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) rq_src = cpu_rq(src_cpu); rq_dest = cpu_rq(dest_cpu); + raw_spin_lock(&p->pi_lock); double_rq_lock(rq_src, rq_dest); /* Already moved. */ if (task_cpu(p) != src_cpu) @@ -5966,7 +6021,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * If we're not on a rq, the next wake-up will ensure we're * placed properly. */ - if (p->se.on_rq) { + if (p->on_rq) { deactivate_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); activate_task(rq_dest, p, 0); @@ -5976,6 +6031,7 @@ done: ret = 1; fail: double_rq_unlock(rq_src, rq_dest); + raw_spin_unlock(&p->pi_lock); return ret; } @@ -6316,6 +6372,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) #ifdef CONFIG_HOTPLUG_CPU case CPU_DYING: + sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { @@ -6394,6 +6451,8 @@ early_initcall(migration_init); #ifdef CONFIG_SMP +static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ + #ifdef CONFIG_SCHED_DEBUG static __read_mostly int sched_domain_debug_enabled; @@ -6489,7 +6548,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, static void sched_domain_debug(struct sched_domain *sd, int cpu) { - cpumask_var_t groupmask; int level = 0; if (!sched_domain_debug_enabled) @@ -6502,20 +6560,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { - printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); - return; - } - for (;;) { - if (sched_domain_debug_one(sd, cpu, level, groupmask)) + if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) break; level++; sd = sd->parent; if (!sd) break; } - free_cpumask_var(groupmask); } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) @@ -6572,12 +6624,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } -static void free_rootdomain(struct root_domain *rd) +static void free_rootdomain(struct rcu_head *rcu) { - synchronize_sched(); + struct root_domain *rd = container_of(rcu, struct root_domain, rcu); cpupri_cleanup(&rd->cpupri); - free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->online); free_cpumask_var(rd->span); @@ -6618,7 +6669,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) raw_spin_unlock_irqrestore(&rq->lock, flags); if (old_rd) - free_rootdomain(old_rd); + call_rcu_sched(&old_rd->rcu, free_rootdomain); } static int init_rootdomain(struct root_domain *rd) @@ -6669,6 +6720,25 @@ static struct root_domain *alloc_rootdomain(void) return rd; } +static void free_sched_domain(struct rcu_head *rcu) +{ + struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); + if (atomic_dec_and_test(&sd->groups->ref)) + kfree(sd->groups); + kfree(sd); +} + +static void destroy_sched_domain(struct sched_domain *sd, int cpu) +{ + call_rcu(&sd->rcu, free_sched_domain); +} + +static void destroy_sched_domains(struct sched_domain *sd, int cpu) +{ + for (; sd; sd = sd->parent) + destroy_sched_domain(sd, cpu); +} + /* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. @@ -6679,9 +6749,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; - for (tmp = sd; tmp; tmp = tmp->parent) - tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); - /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { struct sched_domain *parent = tmp->parent; @@ -6692,12 +6759,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp->parent = parent->parent; if (parent->parent) parent->parent->child = tmp; + destroy_sched_domain(parent, cpu); } else tmp = tmp->parent; } if (sd && sd_degenerate(sd)) { + tmp = sd; sd = sd->parent; + destroy_sched_domain(tmp, cpu); if (sd) sd->child = NULL; } @@ -6705,7 +6775,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); + tmp = rq->sd; rcu_assign_pointer(rq->sd, sd); + destroy_sched_domains(tmp, cpu); } /* cpus with isolated domains */ @@ -6721,56 +6793,6 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); -/* - * init_sched_build_groups takes the cpumask we wish to span, and a pointer - * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids - * (due to the fact that we keep track of groups covered with a struct cpumask). - * - * init_sched_build_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. - */ -static void -init_sched_build_groups(const struct cpumask *span, - const struct cpumask *cpu_map, - int (*group_fn)(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, - struct cpumask *tmpmask), - struct cpumask *covered, struct cpumask *tmpmask) -{ - struct sched_group *first = NULL, *last = NULL; - int i; - - cpumask_clear(covered); - - for_each_cpu(i, span) { - struct sched_group *sg; - int group = group_fn(i, cpu_map, &sg, tmpmask); - int j; - - if (cpumask_test_cpu(i, covered)) - continue; - - cpumask_clear(sched_group_cpus(sg)); - sg->cpu_power = 0; - - for_each_cpu(j, span) { - if (group_fn(j, cpu_map, NULL, tmpmask) != group) - continue; - - cpumask_set_cpu(j, covered); - cpumask_set_cpu(j, sched_group_cpus(sg)); - } - if (!first) - first = sg; - if (last) - last->next = sg; - last = sg; - } - last->next = first; -} - #define SD_NODES_PER_DOMAIN 16 #ifdef CONFIG_NUMA @@ -6841,311 +6863,125 @@ static void sched_domain_node_span(int node, struct cpumask *span) cpumask_or(span, span, cpumask_of_node(next_node)); } } + +static const struct cpumask *cpu_node_mask(int cpu) +{ + lockdep_assert_held(&sched_domains_mutex); + + sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); + + return sched_domains_tmpmask; +} + +static const struct cpumask *cpu_allnodes_mask(int cpu) +{ + return cpu_possible_mask; +} #endif /* CONFIG_NUMA */ -int sched_smt_power_savings = 0, sched_mc_power_savings = 0; +static const struct cpumask *cpu_cpu_mask(int cpu) +{ + return cpumask_of_node(cpu_to_node(cpu)); +} -/* - * The cpus mask in sched_group and sched_domain hangs off the end. - * - * ( See the the comments in include/linux/sched.h:struct sched_group - * and struct sched_domain. ) - */ -struct static_sched_group { - struct sched_group sg; - DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); -}; +int sched_smt_power_savings = 0, sched_mc_power_savings = 0; -struct static_sched_domain { - struct sched_domain sd; - DECLARE_BITMAP(span, CONFIG_NR_CPUS); +struct sd_data { + struct sched_domain **__percpu sd; + struct sched_group **__percpu sg; }; struct s_data { -#ifdef CONFIG_NUMA - int sd_allnodes; - cpumask_var_t domainspan; - cpumask_var_t covered; - cpumask_var_t notcovered; -#endif - cpumask_var_t nodemask; - cpumask_var_t this_sibling_map; - cpumask_var_t this_core_map; - cpumask_var_t this_book_map; - cpumask_var_t send_covered; - cpumask_var_t tmpmask; - struct sched_group **sched_group_nodes; + struct sched_domain ** __percpu sd; struct root_domain *rd; }; enum s_alloc { - sa_sched_groups = 0, sa_rootdomain, - sa_tmpmask, - sa_send_covered, - sa_this_book_map, - sa_this_core_map, - sa_this_sibling_map, - sa_nodemask, - sa_sched_group_nodes, -#ifdef CONFIG_NUMA - sa_notcovered, - sa_covered, - sa_domainspan, -#endif + sa_sd, + sa_sd_storage, sa_none, }; -/* - * SMT sched-domains: - */ -#ifdef CONFIG_SCHED_SMT -static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_groups); - -static int -cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *unused) -{ - if (sg) - *sg = &per_cpu(sched_groups, cpu).sg; - return cpu; -} -#endif /* CONFIG_SCHED_SMT */ +struct sched_domain_topology_level; -/* - * multi-core sched-domains: - */ -#ifdef CONFIG_SCHED_MC -static DEFINE_PER_CPU(struct static_sched_domain, core_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); +typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); +typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); -static int -cpu_to_core_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *mask) -{ - int group; -#ifdef CONFIG_SCHED_SMT - cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); - group = cpumask_first(mask); -#else - group = cpu; -#endif - if (sg) - *sg = &per_cpu(sched_group_core, group).sg; - return group; -} -#endif /* CONFIG_SCHED_MC */ +struct sched_domain_topology_level { + sched_domain_init_f init; + sched_domain_mask_f mask; + struct sd_data data; +}; /* - * book sched-domains: + * Assumes the sched_domain tree is fully constructed */ -#ifdef CONFIG_SCHED_BOOK -static DEFINE_PER_CPU(struct static_sched_domain, book_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); - -static int -cpu_to_book_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *mask) +static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) { - int group = cpu; -#ifdef CONFIG_SCHED_MC - cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); - group = cpumask_first(mask); -#elif defined(CONFIG_SCHED_SMT) - cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); - group = cpumask_first(mask); -#endif - if (sg) - *sg = &per_cpu(sched_group_book, group).sg; - return group; -} -#endif /* CONFIG_SCHED_BOOK */ + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); + struct sched_domain *child = sd->child; -static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); + if (child) + cpu = cpumask_first(sched_domain_span(child)); -static int -cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *mask) -{ - int group; -#ifdef CONFIG_SCHED_BOOK - cpumask_and(mask, cpu_book_mask(cpu), cpu_map); - group = cpumask_first(mask); -#elif defined(CONFIG_SCHED_MC) - cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); - group = cpumask_first(mask); -#elif defined(CONFIG_SCHED_SMT) - cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); - group = cpumask_first(mask); -#else - group = cpu; -#endif if (sg) - *sg = &per_cpu(sched_group_phys, group).sg; - return group; + *sg = *per_cpu_ptr(sdd->sg, cpu); + + return cpu; } -#ifdef CONFIG_NUMA /* - * The init_sched_build_groups can't handle what we want to do with node - * groups, so roll our own. Now each node has its own list of groups which - * gets dynamically allocated. + * build_sched_groups takes the cpumask we wish to span, and a pointer + * to a function which identifies what group(along with sched group) a CPU + * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids + * (due to the fact that we keep track of groups covered with a struct cpumask). + * + * build_sched_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_power to 0. */ -static DEFINE_PER_CPU(struct static_sched_domain, node_domains); -static struct sched_group ***sched_group_nodes_bycpu; - -static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); - -static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, - struct cpumask *nodemask) -{ - int group; - - cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); - group = cpumask_first(nodemask); - - if (sg) - *sg = &per_cpu(sched_group_allnodes, group).sg; - return group; -} - -static void init_numa_sched_groups_power(struct sched_group *group_head) -{ - struct sched_group *sg = group_head; - int j; - - if (!sg) - return; - do { - for_each_cpu(j, sched_group_cpus(sg)) { - struct sched_domain *sd; - - sd = &per_cpu(phys_domains, j).sd; - if (j != group_first_cpu(sd->groups)) { - /* - * Only add "power" once for each - * physical package. - */ - continue; - } - - sg->cpu_power += sd->groups->cpu_power; - } - sg = sg->next; - } while (sg != group_head); -} - -static int build_numa_sched_groups(struct s_data *d, - const struct cpumask *cpu_map, int num) +static void +build_sched_groups(struct sched_domain *sd) { - struct sched_domain *sd; - struct sched_group *sg, *prev; - int n, j; - - cpumask_clear(d->covered); - cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); - if (cpumask_empty(d->nodemask)) { - d->sched_group_nodes[num] = NULL; - goto out; - } - - sched_domain_node_span(num, d->domainspan); - cpumask_and(d->domainspan, d->domainspan, cpu_map); - - sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, num); - if (!sg) { - printk(KERN_WARNING "Can not alloc domain group for node %d\n", - num); - return -ENOMEM; - } - d->sched_group_nodes[num] = sg; - - for_each_cpu(j, d->nodemask) { - sd = &per_cpu(node_domains, j).sd; - sd->groups = sg; - } - - sg->cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), d->nodemask); - sg->next = sg; - cpumask_or(d->covered, d->covered, d->nodemask); + struct sched_group *first = NULL, *last = NULL; + struct sd_data *sdd = sd->private; + const struct cpumask *span = sched_domain_span(sd); + struct cpumask *covered; + int i; - prev = sg; - for (j = 0; j < nr_node_ids; j++) { - n = (num + j) % nr_node_ids; - cpumask_complement(d->notcovered, d->covered); - cpumask_and(d->tmpmask, d->notcovered, cpu_map); - cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); - if (cpumask_empty(d->tmpmask)) - break; - cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); - if (cpumask_empty(d->tmpmask)) - continue; - sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, num); - if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", j); - return -ENOMEM; - } - sg->cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), d->tmpmask); - sg->next = prev->next; - cpumask_or(d->covered, d->covered, d->tmpmask); - prev->next = sg; - prev = sg; - } -out: - return 0; -} -#endif /* CONFIG_NUMA */ + lockdep_assert_held(&sched_domains_mutex); + covered = sched_domains_tmpmask; -#ifdef CONFIG_NUMA -/* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const struct cpumask *cpu_map, - struct cpumask *nodemask) -{ - int cpu, i; + cpumask_clear(covered); - for_each_cpu(cpu, cpu_map) { - struct sched_group **sched_group_nodes - = sched_group_nodes_bycpu[cpu]; + for_each_cpu(i, span) { + struct sched_group *sg; + int group = get_group(i, sdd, &sg); + int j; - if (!sched_group_nodes) + if (cpumask_test_cpu(i, covered)) continue; - for (i = 0; i < nr_node_ids; i++) { - struct sched_group *oldsg, *sg = sched_group_nodes[i]; + cpumask_clear(sched_group_cpus(sg)); + sg->cpu_power = 0; - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(nodemask)) + for_each_cpu(j, span) { + if (get_group(j, sdd, NULL) != group) continue; - if (sg == NULL) - continue; - sg = sg->next; -next_sg: - oldsg = sg; - sg = sg->next; - kfree(oldsg); - if (oldsg != sched_group_nodes[i]) - goto next_sg; + cpumask_set_cpu(j, covered); + cpumask_set_cpu(j, sched_group_cpus(sg)); } - kfree(sched_group_nodes); - sched_group_nodes_bycpu[cpu] = NULL; + + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; } + last->next = first; } -#else /* !CONFIG_NUMA */ -static void free_sched_groups(const struct cpumask *cpu_map, - struct cpumask *nodemask) -{ -} -#endif /* CONFIG_NUMA */ /* * Initialize sched groups cpu_power. @@ -7159,11 +6995,6 @@ static void free_sched_groups(const struct cpumask *cpu_map, */ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { - struct sched_domain *child; - struct sched_group *group; - long power; - int weight; - WARN_ON(!sd || !sd->groups); if (cpu != group_first_cpu(sd->groups)) @@ -7171,36 +7002,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); - child = sd->child; - - sd->groups->cpu_power = 0; - - if (!child) { - power = SCHED_LOAD_SCALE; - weight = cpumask_weight(sched_domain_span(sd)); - /* - * SMT siblings share the power of a single core. - * Usually multiple threads get a better yield out of - * that one core than a single thread would have, - * reflect that in sd->smt_gain. - */ - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - power *= sd->smt_gain; - power /= weight; - power >>= SCHED_LOAD_SHIFT; - } - sd->groups->cpu_power += power; - return; - } - - /* - * Add cpu_power of each child group to this groups cpu_power. - */ - group = child->groups; - do { - sd->groups->cpu_power += group->cpu_power; - group = group->next; - } while (group != child->groups); + update_group_power(sd, cpu); } /* @@ -7214,15 +7016,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) # define SD_INIT_NAME(sd, type) do { } while (0) #endif -#define SD_INIT(sd, type) sd_init_##type(sd) - -#define SD_INIT_FUNC(type) \ -static noinline void sd_init_##type(struct sched_domain *sd) \ -{ \ - memset(sd, 0, sizeof(*sd)); \ - *sd = SD_##type##_INIT; \ - sd->level = SD_LV_##type; \ - SD_INIT_NAME(sd, type); \ +#define SD_INIT_FUNC(type) \ +static noinline struct sched_domain * \ +sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ +{ \ + struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ + *sd = SD_##type##_INIT; \ + SD_INIT_NAME(sd, type); \ + sd->private = &tl->data; \ + return sd; \ } SD_INIT_FUNC(CPU) @@ -7241,13 +7043,14 @@ SD_INIT_FUNC(CPU) #endif static int default_relax_domain_level = -1; +int sched_domain_level_max; static int __init setup_relax_domain_level(char *str) { unsigned long val; val = simple_strtoul(str, NULL, 0); - if (val < SD_LV_MAX) + if (val < sched_domain_level_max) default_relax_domain_level = val; return 1; @@ -7275,37 +7078,20 @@ static void set_domain_attribute(struct sched_domain *sd, } } +static void __sdt_free(const struct cpumask *cpu_map); +static int __sdt_alloc(const struct cpumask *cpu_map); + static void __free_domain_allocs(struct s_data *d, enum s_alloc what, const struct cpumask *cpu_map) { switch (what) { - case sa_sched_groups: - free_sched_groups(cpu_map, d->tmpmask); /* fall through */ - d->sched_group_nodes = NULL; case sa_rootdomain: - free_rootdomain(d->rd); /* fall through */ - case sa_tmpmask: - free_cpumask_var(d->tmpmask); /* fall through */ - case sa_send_covered: - free_cpumask_var(d->send_covered); /* fall through */ - case sa_this_book_map: - free_cpumask_var(d->this_book_map); /* fall through */ - case sa_this_core_map: - free_cpumask_var(d->this_core_map); /* fall through */ - case sa_this_sibling_map: - free_cpumask_var(d->this_sibling_map); /* fall through */ - case sa_nodemask: - free_cpumask_var(d->nodemask); /* fall through */ - case sa_sched_group_nodes: -#ifdef CONFIG_NUMA - kfree(d->sched_group_nodes); /* fall through */ - case sa_notcovered: - free_cpumask_var(d->notcovered); /* fall through */ - case sa_covered: - free_cpumask_var(d->covered); /* fall through */ - case sa_domainspan: - free_cpumask_var(d->domainspan); /* fall through */ -#endif + if (!atomic_read(&d->rd->refcount)) + free_rootdomain(&d->rd->rcu); /* fall through */ + case sa_sd: + free_percpu(d->sd); /* fall through */ + case sa_sd_storage: + __sdt_free(cpu_map); /* fall through */ case sa_none: break; } @@ -7314,308 +7100,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) { -#ifdef CONFIG_NUMA - if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) - return sa_none; - if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) - return sa_domainspan; - if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) - return sa_covered; - /* Allocate the per-node list of sched groups */ - d->sched_group_nodes = kcalloc(nr_node_ids, - sizeof(struct sched_group *), GFP_KERNEL); - if (!d->sched_group_nodes) { - printk(KERN_WARNING "Can not alloc sched group node list\n"); - return sa_notcovered; - } - sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; -#endif - if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) - return sa_sched_group_nodes; - if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) - return sa_nodemask; - if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) - return sa_this_sibling_map; - if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) - return sa_this_core_map; - if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) - return sa_this_book_map; - if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) - return sa_send_covered; + memset(d, 0, sizeof(*d)); + + if (__sdt_alloc(cpu_map)) + return sa_sd_storage; + d->sd = alloc_percpu(struct sched_domain *); + if (!d->sd) + return sa_sd_storage; d->rd = alloc_rootdomain(); - if (!d->rd) { - printk(KERN_WARNING "Cannot alloc root domain\n"); - return sa_tmpmask; - } + if (!d->rd) + return sa_sd; return sa_rootdomain; } -static struct sched_domain *__build_numa_sched_domains(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) +/* + * NULL the sd_data elements we've used to build the sched_domain and + * sched_group structure so that the subsequent __free_domain_allocs() + * will not free the data we're using. + */ +static void claim_allocations(int cpu, struct sched_domain *sd) { - struct sched_domain *sd = NULL; -#ifdef CONFIG_NUMA - struct sched_domain *parent; - - d->sd_allnodes = 0; - if (cpumask_weight(cpu_map) > - SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { - sd = &per_cpu(allnodes_domains, i).sd; - SD_INIT(sd, ALLNODES); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), cpu_map); - cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); - d->sd_allnodes = 1; - } - parent = sd; - - sd = &per_cpu(node_domains, i).sd; - SD_INIT(sd, NODE); - set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); - sd->parent = parent; - if (parent) - parent->child = sd; - cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); -#endif - return sd; -} + struct sd_data *sdd = sd->private; + struct sched_group *sg = sd->groups; -static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd; - sd = &per_cpu(phys_domains, i).sd; - SD_INIT(sd, CPU); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), d->nodemask); - sd->parent = parent; - if (parent) - parent->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); - return sd; -} + WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); + *per_cpu_ptr(sdd->sd, cpu) = NULL; -static struct sched_domain *__build_book_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd = parent; -#ifdef CONFIG_SCHED_BOOK - sd = &per_cpu(book_domains, i).sd; - SD_INIT(sd, BOOK); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); - sd->parent = parent; - parent->child = sd; - cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); -#endif - return sd; + if (cpu == cpumask_first(sched_group_cpus(sg))) { + WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); + *per_cpu_ptr(sdd->sg, cpu) = NULL; + } } -static struct sched_domain *__build_mc_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) +#ifdef CONFIG_SCHED_SMT +static const struct cpumask *cpu_smt_mask(int cpu) { - struct sched_domain *sd = parent; -#ifdef CONFIG_SCHED_MC - sd = &per_cpu(core_domains, i).sd; - SD_INIT(sd, MC); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); - sd->parent = parent; - parent->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); -#endif - return sd; + return topology_thread_cpumask(cpu); } - -static struct sched_domain *__build_smt_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd = parent; -#ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i).sd; - SD_INIT(sd, SIBLING); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); - sd->parent = parent; - parent->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); #endif - return sd; -} -static void build_sched_groups(struct s_data *d, enum sched_domain_level l, - const struct cpumask *cpu_map, int cpu) -{ - switch (l) { +/* + * Topology list, bottom-up. + */ +static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT - case SD_LV_SIBLING: /* set up CPU (sibling) groups */ - cpumask_and(d->this_sibling_map, cpu_map, - topology_thread_cpumask(cpu)); - if (cpu == cpumask_first(d->this_sibling_map)) - init_sched_build_groups(d->this_sibling_map, cpu_map, - &cpu_to_cpu_group, - d->send_covered, d->tmpmask); - break; + { sd_init_SIBLING, cpu_smt_mask, }, #endif #ifdef CONFIG_SCHED_MC - case SD_LV_MC: /* set up multi-core groups */ - cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); - if (cpu == cpumask_first(d->this_core_map)) - init_sched_build_groups(d->this_core_map, cpu_map, - &cpu_to_core_group, - d->send_covered, d->tmpmask); - break; + { sd_init_MC, cpu_coregroup_mask, }, #endif #ifdef CONFIG_SCHED_BOOK - case SD_LV_BOOK: /* set up book groups */ - cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); - if (cpu == cpumask_first(d->this_book_map)) - init_sched_build_groups(d->this_book_map, cpu_map, - &cpu_to_book_group, - d->send_covered, d->tmpmask); - break; + { sd_init_BOOK, cpu_book_mask, }, #endif - case SD_LV_CPU: /* set up physical groups */ - cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); - if (!cpumask_empty(d->nodemask)) - init_sched_build_groups(d->nodemask, cpu_map, - &cpu_to_phys_group, - d->send_covered, d->tmpmask); - break; + { sd_init_CPU, cpu_cpu_mask, }, #ifdef CONFIG_NUMA - case SD_LV_ALLNODES: - init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, - d->send_covered, d->tmpmask); - break; + { sd_init_NODE, cpu_node_mask, }, + { sd_init_ALLNODES, cpu_allnodes_mask, }, #endif - default: - break; + { NULL, }, +}; + +static struct sched_domain_topology_level *sched_domain_topology = default_topology; + +static int __sdt_alloc(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for (tl = sched_domain_topology; tl->init; tl++) { + struct sd_data *sdd = &tl->data; + + sdd->sd = alloc_percpu(struct sched_domain *); + if (!sdd->sd) + return -ENOMEM; + + sdd->sg = alloc_percpu(struct sched_group *); + if (!sdd->sg) + return -ENOMEM; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + struct sched_group *sg; + + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sd) + return -ENOMEM; + + *per_cpu_ptr(sdd->sd, j) = sd; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sg) + return -ENOMEM; + + *per_cpu_ptr(sdd->sg, j) = sg; + } + } + + return 0; +} + +static void __sdt_free(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for (tl = sched_domain_topology; tl->init; tl++) { + struct sd_data *sdd = &tl->data; + + for_each_cpu(j, cpu_map) { + kfree(*per_cpu_ptr(sdd->sd, j)); + kfree(*per_cpu_ptr(sdd->sg, j)); + } + free_percpu(sdd->sd); + free_percpu(sdd->sg); + } +} + +struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, + struct s_data *d, const struct cpumask *cpu_map, + struct sched_domain_attr *attr, struct sched_domain *child, + int cpu) +{ + struct sched_domain *sd = tl->init(tl, cpu); + if (!sd) + return child; + + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); + if (child) { + sd->level = child->level + 1; + sched_domain_level_max = max(sched_domain_level_max, sd->level); + child->parent = sd; } + sd->child = child; + + return sd; } /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static int __build_sched_domains(const struct cpumask *cpu_map, - struct sched_domain_attr *attr) +static int build_sched_domains(const struct cpumask *cpu_map, + struct sched_domain_attr *attr) { enum s_alloc alloc_state = sa_none; - struct s_data d; struct sched_domain *sd; - int i; -#ifdef CONFIG_NUMA - d.sd_allnodes = 0; -#endif + struct s_data d; + int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; - alloc_state = sa_sched_groups; - - /* - * Set up domains for cpus specified by the cpu_map. - */ - for_each_cpu(i, cpu_map) { - cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), - cpu_map); - - sd = __build_numa_sched_domains(&d, cpu_map, attr, i); - sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); - } + /* Set up domains for cpus specified by the cpu_map. */ for_each_cpu(i, cpu_map) { - build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); - build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); - build_sched_groups(&d, SD_LV_MC, cpu_map, i); - } + struct sched_domain_topology_level *tl; - /* Set up physical groups */ - for (i = 0; i < nr_node_ids; i++) - build_sched_groups(&d, SD_LV_CPU, cpu_map, i); + sd = NULL; + for (tl = sched_domain_topology; tl->init; tl++) + sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); -#ifdef CONFIG_NUMA - /* Set up node groups */ - if (d.sd_allnodes) - build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); + while (sd->child) + sd = sd->child; - for (i = 0; i < nr_node_ids; i++) - if (build_numa_sched_groups(&d, cpu_map, i)) - goto error; -#endif - - /* Calculate CPU power for physical packages and nodes */ -#ifdef CONFIG_SCHED_SMT - for_each_cpu(i, cpu_map) { - sd = &per_cpu(cpu_domains, i).sd; - init_sched_groups_power(i, sd); - } -#endif -#ifdef CONFIG_SCHED_MC - for_each_cpu(i, cpu_map) { - sd = &per_cpu(core_domains, i).sd; - init_sched_groups_power(i, sd); - } -#endif -#ifdef CONFIG_SCHED_BOOK - for_each_cpu(i, cpu_map) { - sd = &per_cpu(book_domains, i).sd; - init_sched_groups_power(i, sd); + *per_cpu_ptr(d.sd, i) = sd; } -#endif + /* Build the groups for the domains */ for_each_cpu(i, cpu_map) { - sd = &per_cpu(phys_domains, i).sd; - init_sched_groups_power(i, sd); - } + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + sd->span_weight = cpumask_weight(sched_domain_span(sd)); + get_group(i, sd->private, &sd->groups); + atomic_inc(&sd->groups->ref); -#ifdef CONFIG_NUMA - for (i = 0; i < nr_node_ids; i++) - init_numa_sched_groups_power(d.sched_group_nodes[i]); + if (i != cpumask_first(sched_domain_span(sd))) + continue; - if (d.sd_allnodes) { - struct sched_group *sg; + build_sched_groups(sd); + } + } + + /* Calculate CPU power for physical packages and nodes */ + for (i = nr_cpumask_bits-1; i >= 0; i--) { + if (!cpumask_test_cpu(i, cpu_map)) + continue; - cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, - d.tmpmask); - init_numa_sched_groups_power(sg); + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + claim_allocations(i, sd); + init_sched_groups_power(i, sd); + } } -#endif /* Attach the domains */ + rcu_read_lock(); for_each_cpu(i, cpu_map) { -#ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i).sd; -#elif defined(CONFIG_SCHED_MC) - sd = &per_cpu(core_domains, i).sd; -#elif defined(CONFIG_SCHED_BOOK) - sd = &per_cpu(book_domains, i).sd; -#else - sd = &per_cpu(phys_domains, i).sd; -#endif + sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); } + rcu_read_unlock(); - d.sched_group_nodes = NULL; /* don't free this we still need it */ - __free_domain_allocs(&d, sa_tmpmask, cpu_map); - return 0; - + ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); - return -ENOMEM; -} - -static int build_sched_domains(const struct cpumask *cpu_map) -{ - return __build_sched_domains(cpu_map, NULL); + return ret; } static cpumask_var_t *doms_cur; /* current sched domains */ @@ -7670,7 +7360,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. */ -static int arch_init_sched_domains(const struct cpumask *cpu_map) +static int init_sched_domains(const struct cpumask *cpu_map) { int err; @@ -7681,32 +7371,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) doms_cur = &fallback_doms; cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); dattr_cur = NULL; - err = build_sched_domains(doms_cur[0]); + err = build_sched_domains(doms_cur[0], NULL); register_sched_domain_sysctl(); return err; } -static void arch_destroy_sched_domains(const struct cpumask *cpu_map, - struct cpumask *tmpmask) -{ - free_sched_groups(cpu_map, tmpmask); -} - /* * Detach sched domains from a group of cpus specified in cpu_map * These cpus will now be attached to the NULL domain */ static void detach_destroy_domains(const struct cpumask *cpu_map) { - /* Save because hotplug lock held. */ - static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); int i; + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); - synchronize_sched(); - arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); + rcu_read_unlock(); } /* handle null as "default" */ @@ -7795,8 +7477,7 @@ match1: goto match2; } /* no match - add a new doms_new */ - __build_sched_domains(doms_new[i], - dattr_new ? dattr_new + i : NULL); + build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); match2: ; } @@ -7815,7 +7496,7 @@ match2: } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -static void arch_reinit_sched_domains(void) +static void reinit_sched_domains(void) { get_online_cpus(); @@ -7848,7 +7529,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) else sched_mc_power_savings = level; - arch_reinit_sched_domains(); + reinit_sched_domains(); return count; } @@ -7967,14 +7648,9 @@ void __init sched_init_smp(void) alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); -#if defined(CONFIG_NUMA) - sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), - GFP_KERNEL); - BUG_ON(sched_group_nodes_bycpu == NULL); -#endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(cpu_active_mask); + init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); @@ -8281,6 +7957,7 @@ void __init sched_init(void) /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP + zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); #ifdef CONFIG_NO_HZ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); @@ -8340,7 +8017,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) int old_prio = p->prio; int on_rq; - on_rq = p->se.on_rq; + on_rq = p->on_rq; if (on_rq) deactivate_task(rq, p, 0); __setscheduler(rq, p, SCHED_NORMAL, 0); @@ -8683,7 +8360,7 @@ void sched_move_task(struct task_struct *tsk) rq = task_rq_lock(tsk, &flags); running = task_current(rq, tsk); - on_rq = tsk->se.on_rq; + on_rq = tsk->on_rq; if (on_rq) dequeue_task(rq, tsk, 0); @@ -8702,7 +8379,7 @@ void sched_move_task(struct task_struct *tsk) if (on_rq) enqueue_task(rq, tsk, 0); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, tsk, &flags); } #endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 7bacd83a4158..3669bec6e130 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, p) { - if (!p->se.on_rq || task_cpu(p) != rq_cpu) + if (!p->on_rq || task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6fa833ab2cb8..87445931a179 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -358,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) } cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); +#ifndef CONFIG_64BIT + smp_wmb(); + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif } /* @@ -1372,12 +1376,25 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP -static void task_waking_fair(struct rq *rq, struct task_struct *p) +static void task_waking_fair(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 min_vruntime; - se->vruntime -= cfs_rq->min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; + + do { + min_vruntime_copy = cfs_rq->min_vruntime_copy; + smp_rmb(); + min_vruntime = cfs_rq->min_vruntime; + } while (min_vruntime != min_vruntime_copy); +#else + min_vruntime = cfs_rq->min_vruntime; +#endif + + se->vruntime -= min_vruntime; } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1622,6 +1639,7 @@ static int select_idle_sibling(struct task_struct *p, int target) /* * Otherwise, iterate the domains and find an elegible idle cpu. */ + rcu_read_lock(); for_each_domain(target, sd) { if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) break; @@ -1641,6 +1659,7 @@ static int select_idle_sibling(struct task_struct *p, int target) cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) break; } + rcu_read_unlock(); return target; } @@ -1657,7 +1676,7 @@ static int select_idle_sibling(struct task_struct *p, int target) * preempt must be disabled. */ static int -select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); @@ -1673,6 +1692,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ new_cpu = prev_cpu; } + rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) continue; @@ -1723,9 +1743,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ if (affine_sd) { if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) - return select_idle_sibling(p, cpu); - else - return select_idle_sibling(p, prev_cpu); + prev_cpu = cpu; + + new_cpu = select_idle_sibling(p, prev_cpu); + goto unlock; } while (sd) { @@ -1766,6 +1787,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ } /* while loop will break here if sd == NULL */ } +unlock: + rcu_read_unlock(); return new_cpu; } @@ -1789,10 +1812,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se) * This is especially important for buddies when the leftmost * task is higher priority than the buddy. */ - if (unlikely(se->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, se); - - return gran; + return calc_delta_fair(gran, se); } /* @@ -2648,7 +2668,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /* * Only siblings can have significantly less than SCHED_LOAD_SCALE */ - if (sd->level != SD_LV_SIBLING) + if (!(sd->flags & SD_SHARE_CPUPOWER)) return 0; /* @@ -3465,6 +3485,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) raw_spin_unlock(&this_rq->lock); update_shares(this_cpu); + rcu_read_lock(); for_each_domain(this_cpu, sd) { unsigned long interval; int balance = 1; @@ -3486,6 +3507,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) break; } } + rcu_read_unlock(); raw_spin_lock(&this_rq->lock); @@ -3534,6 +3556,7 @@ static int active_load_balance_cpu_stop(void *data) double_lock_balance(busiest_rq, target_rq); /* Search for an sd spanning us and the target CPU. */ + rcu_read_lock(); for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) @@ -3549,6 +3572,7 @@ static int active_load_balance_cpu_stop(void *data) else schedstat_inc(sd, alb_failed); } + rcu_read_unlock(); double_unlock_balance(busiest_rq, target_rq); out_unlock: busiest_rq->active_balance = 0; @@ -3675,6 +3699,7 @@ static int find_new_ilb(int cpu) { struct sched_domain *sd; struct sched_group *ilb_group; + int ilb = nr_cpu_ids; /* * Have idle load balancer selection from semi-idle packages only @@ -3690,20 +3715,25 @@ static int find_new_ilb(int cpu) if (cpumask_weight(nohz.idle_cpus_mask) < 2) goto out_done; + rcu_read_lock(); for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { ilb_group = sd->groups; do { - if (is_semi_idle_group(ilb_group)) - return cpumask_first(nohz.grp_idle_mask); + if (is_semi_idle_group(ilb_group)) { + ilb = cpumask_first(nohz.grp_idle_mask); + goto unlock; + } ilb_group = ilb_group->next; } while (ilb_group != sd->groups); } +unlock: + rcu_read_unlock(); out_done: - return nr_cpu_ids; + return ilb; } #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) @@ -3848,6 +3878,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) update_shares(cpu); + rcu_read_lock(); for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; @@ -3893,6 +3924,7 @@ out: if (!balance) break; } + rcu_read_unlock(); /* * next_balance will be updated only when there is a need. diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 68e69acc29b9..be40f7371ee1 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1) * Decrement CPU power based on irq activity */ SCHED_FEAT(NONIRQ_POWER, 1) + +/* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ +SCHED_FEAT(TTWU_QUEUE, 1) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index a776a6396427..0a51882534ea 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -7,7 +7,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index e7cebdc65f82..19ecb3127379 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -977,13 +977,23 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) { + struct task_struct *curr; + struct rq *rq; + int cpu; + if (sd_flag != SD_BALANCE_WAKE) return smp_processor_id(); + cpu = task_cpu(p); + rq = cpu_rq(cpu); + + rcu_read_lock(); + curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + /* - * If the current task is an RT task, then + * If the current task on @p's runqueue is an RT task, then * try to see if we can wake this RT task up on another * runqueue. Otherwise simply start this RT task * on its current runqueue. @@ -997,21 +1007,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) * lock? * * For equal prio tasks, we just let the scheduler sort it out. + * + * Otherwise, just let it ride on the affined RQ and the + * post-schedule router will push the preempted task away + * + * This test is optimistic, if we get it wrong the load-balancer + * will have to sort it out. */ - if (unlikely(rt_task(rq->curr)) && - (rq->curr->rt.nr_cpus_allowed < 2 || - rq->curr->prio < p->prio) && + if (curr && unlikely(rt_task(curr)) && + (curr->rt.nr_cpus_allowed < 2 || + curr->prio < p->prio) && (p->rt.nr_cpus_allowed > 1)) { - int cpu = find_lowest_rq(p); + int target = find_lowest_rq(p); - return (cpu == -1) ? task_cpu(p) : cpu; + if (target != -1) + cpu = target; } + rcu_read_unlock(); - /* - * Otherwise, just let it ride on the affined RQ and the - * post-schedule router will push the preempted task away - */ - return task_cpu(p); + return cpu; } static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) @@ -1136,7 +1150,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1287,7 +1301,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || task_running(rq, task) || - !task->se.on_rq)) { + !task->on_rq)) { raw_spin_unlock(&lowest_rq->lock); lowest_rq = NULL; @@ -1321,7 +1335,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->rt.nr_cpus_allowed <= 1); - BUG_ON(!p->se.on_rq); + BUG_ON(!p->on_rq); BUG_ON(!rt_task(p)); return p; @@ -1467,7 +1481,7 @@ static int pull_rt_task(struct rq *this_rq) */ if (p && (p->prio < this_rq->rt.highest_prio.curr)) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->se.on_rq); + WARN_ON(!p->on_rq); /* * There's a chance that p is higher in priority @@ -1538,7 +1552,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, * Update the migration status of the RQ if we have an RT task * which is running AND changing its weight value. */ - if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { + if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { struct rq *rq = task_rq(p); if (!task_current(rq, p)) { @@ -1608,7 +1622,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (p->se.on_rq && !rq->rt.rt_nr_running) + if (p->on_rq && !rq->rt.rt_nr_running) pull_rt_task(rq); } @@ -1638,7 +1652,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * If that current running task is also an RT task * then see if we can move to another run queue. */ - if (p->se.on_rq && rq->curr != p) { + if (p->on_rq && rq->curr != p) { #ifdef CONFIG_SMP if (rq->rt.overloaded && push_rt_task(rq) && /* Don't resched if we changed runqueues */ @@ -1657,7 +1671,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) static void prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->se.on_rq) + if (!p->on_rq) return; if (rq->curr == p) { diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 1ba2bd40fdac..6f437632afab 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -9,8 +9,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_stop(struct rq *rq, struct task_struct *p, - int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* stop tasks as never migrate */ } @@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) { struct task_struct *stop = rq->stop; - if (stop && stop->se.on_rq) + if (stop && stop->on_rq) return stop; return NULL; diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 66f040b30729..86c87e214b11 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -113,13 +113,61 @@ OPTIONS Do various checks like samples ordering and lost events. -f:: ---fields +--fields:: Comma separated list of fields to print. Options are: comm, tid, pid, time, cpu, event, trace, sym. Field - list must be prepended with the type, trace, sw or hw, + list can be prepended with the type, trace, sw or hw, to indicate to which event type the field list applies. e.g., -f sw:comm,tid,time,sym and -f trace:time,cpu,trace + perf script -f <fields> + + is equivalent to: + + perf script -f trace:<fields> -f sw:<fields> -f hw:<fields> + + i.e., the specified fields apply to all event types if the type string + is not given. + + The arguments are processed in the order received. A later usage can + reset a prior request. e.g.: + + -f trace: -f comm,tid,time,sym + + The first -f suppresses trace events (field list is ""), but then the + second invocation sets the fields to comm,tid,time,sym. In this case a + warning is given to the user: + + "Overriding previous field request for all events." + + Alternativey, consider the order: + + -f comm,tid,time,sym -f trace: + + The first -f sets the fields for all events and the second -f + suppresses trace events. The user is given a warning message about + the override, and the result of the above is that only S/W and H/W + events are displayed with the given fields. + + For the 'wildcard' option if a user selected field is invalid for an + event type, a message is displayed to the user that the option is + ignored for that type. For example: + + $ perf script -f comm,tid,trace + 'trace' not valid for hardware events. Ignoring. + 'trace' not valid for software events. Ignoring. + + Alternatively, if the type is given an invalid field is specified it + is an error. For example: + + perf script -v -f sw:comm,tid,trace + 'trace' not valid for software events. + + At this point usage is displayed, and perf-script exits. + + Finally, a user may not set fields to none for all event types. + i.e., -f "" is not allowed. + -k:: --vmlinux=<file>:: vmlinux pathname diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index ac574ea23917..6cf811acc41b 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -49,23 +49,52 @@ struct output_option { }; /* default set to maintain compatibility with current format */ -static u64 output_fields[PERF_TYPE_MAX] = { - [PERF_TYPE_HARDWARE] = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | \ - PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | \ - PERF_OUTPUT_EVNAME | PERF_OUTPUT_SYM, - - [PERF_TYPE_SOFTWARE] = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | \ - PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | \ - PERF_OUTPUT_EVNAME | PERF_OUTPUT_SYM, - - [PERF_TYPE_TRACEPOINT] = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | \ - PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | \ - PERF_OUTPUT_EVNAME | PERF_OUTPUT_TRACE, +static struct { + bool user_set; + u64 fields; + u64 invalid_fields; +} output[PERF_TYPE_MAX] = { + + [PERF_TYPE_HARDWARE] = { + .user_set = false, + + .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | + PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | + PERF_OUTPUT_EVNAME | PERF_OUTPUT_SYM, + + .invalid_fields = PERF_OUTPUT_TRACE, + }, + + [PERF_TYPE_SOFTWARE] = { + .user_set = false, + + .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | + PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | + PERF_OUTPUT_EVNAME | PERF_OUTPUT_SYM, + + .invalid_fields = PERF_OUTPUT_TRACE, + }, + + [PERF_TYPE_TRACEPOINT] = { + .user_set = false, + + .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | + PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | + PERF_OUTPUT_EVNAME | PERF_OUTPUT_TRACE, + }, }; -static bool output_set_by_user; +static bool output_set_by_user(void) +{ + int j; + for (j = 0; j < PERF_TYPE_MAX; ++j) { + if (output[j].user_set) + return true; + } + return false; +} -#define PRINT_FIELD(x) (output_fields[attr->type] & PERF_OUTPUT_##x) +#define PRINT_FIELD(x) (output[attr->type].fields & PERF_OUTPUT_##x) static int perf_session__check_attr(struct perf_session *session, struct perf_event_attr *attr) @@ -168,7 +197,7 @@ static void process_event(union perf_event *event __unused, { struct perf_event_attr *attr = &evsel->attr; - if (output_fields[attr->type] == 0) + if (output[attr->type].fields == 0) return; if (perf_session__check_attr(session, attr) < 0) @@ -451,6 +480,7 @@ static int parse_output_fields(const struct option *opt __used, { char *tok; int i, imax = sizeof(all_output_options) / sizeof(struct output_option); + int j; int rc = 0; char *str = strdup(arg); int type = -1; @@ -458,52 +488,95 @@ static int parse_output_fields(const struct option *opt __used, if (!str) return -ENOMEM; - tok = strtok(str, ":"); - if (!tok) { - fprintf(stderr, - "Invalid field string - not prepended with type."); - return -EINVAL; - } - - /* first word should state which event type user - * is specifying the fields + /* first word can state for which event type the user is specifying + * the fields. If no type exists, the specified fields apply to all + * event types found in the file minus the invalid fields for a type. */ - if (!strcmp(tok, "hw")) - type = PERF_TYPE_HARDWARE; - else if (!strcmp(tok, "sw")) - type = PERF_TYPE_SOFTWARE; - else if (!strcmp(tok, "trace")) - type = PERF_TYPE_TRACEPOINT; - else { - fprintf(stderr, "Invalid event type in field string."); - return -EINVAL; + tok = strchr(str, ':'); + if (tok) { + *tok = '\0'; + tok++; + if (!strcmp(str, "hw")) + type = PERF_TYPE_HARDWARE; + else if (!strcmp(str, "sw")) + type = PERF_TYPE_SOFTWARE; + else if (!strcmp(str, "trace")) + type = PERF_TYPE_TRACEPOINT; + else { + fprintf(stderr, "Invalid event type in field string.\n"); + return -EINVAL; + } + + if (output[type].user_set) + pr_warning("Overriding previous field request for %s events.\n", + event_type(type)); + + output[type].fields = 0; + output[type].user_set = true; + + } else { + tok = str; + if (strlen(str) == 0) { + fprintf(stderr, + "Cannot set fields to 'none' for all event types.\n"); + rc = -EINVAL; + goto out; + } + + if (output_set_by_user()) + pr_warning("Overriding previous field request for all events.\n"); + + for (j = 0; j < PERF_TYPE_MAX; ++j) { + output[j].fields = 0; + output[j].user_set = true; + } } - output_fields[type] = 0; - while (1) { - tok = strtok(NULL, ","); - if (!tok) - break; + tok = strtok(tok, ","); + while (tok) { for (i = 0; i < imax; ++i) { - if (strcmp(tok, all_output_options[i].str) == 0) { - output_fields[type] |= all_output_options[i].field; + if (strcmp(tok, all_output_options[i].str) == 0) break; - } } if (i == imax) { - fprintf(stderr, "Invalid field requested."); + fprintf(stderr, "Invalid field requested.\n"); rc = -EINVAL; - break; + goto out; } - } - if (output_fields[type] == 0) { - pr_debug("No fields requested for %s type. " - "Events will not be displayed\n", event_type(type)); + if (type == -1) { + /* add user option to all events types for + * which it is valid + */ + for (j = 0; j < PERF_TYPE_MAX; ++j) { + if (output[j].invalid_fields & all_output_options[i].field) { + pr_warning("\'%s\' not valid for %s events. Ignoring.\n", + all_output_options[i].str, event_type(j)); + } else + output[j].fields |= all_output_options[i].field; + } + } else { + if (output[type].invalid_fields & all_output_options[i].field) { + fprintf(stderr, "\'%s\' not valid for %s events.\n", + all_output_options[i].str, event_type(type)); + + rc = -EINVAL; + goto out; + } + output[type].fields |= all_output_options[i].field; + } + + tok = strtok(NULL, ","); } - output_set_by_user = true; + if (type >= 0) { + if (output[type].fields == 0) { + pr_debug("No fields requested for %s type. " + "Events will not be displayed.\n", event_type(type)); + } + } +out: free(str); return rc; } @@ -1020,7 +1093,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __used) struct stat perf_stat; int input; - if (output_set_by_user) { + if (output_set_by_user()) { fprintf(stderr, "custom fields not supported for generated scripts"); return -1; diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index b7c85ce466a1..a7c7145a8d4f 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -1471,6 +1471,38 @@ static int find_probe_point_by_func(struct probe_finder *pf) return _param.retval; } +struct pubname_callback_param { + char *function; + char *file; + Dwarf_Die *cu_die; + Dwarf_Die *sp_die; + int found; +}; + +static int pubname_search_cb(Dwarf *dbg, Dwarf_Global *gl, void *data) +{ + struct pubname_callback_param *param = data; + + if (dwarf_offdie(dbg, gl->die_offset, param->sp_die)) { + if (dwarf_tag(param->sp_die) != DW_TAG_subprogram) + return DWARF_CB_OK; + + if (die_compare_name(param->sp_die, param->function)) { + if (!dwarf_offdie(dbg, gl->cu_offset, param->cu_die)) + return DWARF_CB_OK; + + if (param->file && + strtailcmp(param->file, dwarf_decl_file(param->sp_die))) + return DWARF_CB_OK; + + param->found = 1; + return DWARF_CB_ABORT; + } + } + + return DWARF_CB_OK; +} + /* Find probe points from debuginfo */ static int find_probes(int fd, struct probe_finder *pf) { @@ -1498,6 +1530,27 @@ static int find_probes(int fd, struct probe_finder *pf) off = 0; line_list__init(&pf->lcache); + + /* Fastpath: lookup by function name from .debug_pubnames section */ + if (pp->function) { + struct pubname_callback_param pubname_param = { + .function = pp->function, + .file = pp->file, + .cu_die = &pf->cu_die, + .sp_die = &pf->sp_die, + }; + struct dwarf_callback_param probe_param = { + .data = pf, + }; + + dwarf_getpubnames(dbg, pubname_search_cb, &pubname_param, 0); + if (pubname_param.found) { + ret = probe_point_search_cb(&pf->sp_die, &probe_param); + if (ret) + goto found; + } + } + /* Loop on CUs (Compilation Unit) */ while (!dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL)) { /* Get the DIE(Debugging Information Entry) of this CU */ @@ -1525,6 +1578,8 @@ static int find_probes(int fd, struct probe_finder *pf) } off = noff; } + +found: line_list__free(&pf->lcache); if (dwfl) dwfl_end(dwfl); @@ -1946,6 +2001,22 @@ int find_line_range(int fd, struct line_range *lr) return -EBADF; } + /* Fastpath: lookup by function name from .debug_pubnames section */ + if (lr->function) { + struct pubname_callback_param pubname_param = { + .function = lr->function, .file = lr->file, + .cu_die = &lf.cu_die, .sp_die = &lf.sp_die, .found = 0}; + struct dwarf_callback_param line_range_param = { + .data = (void *)&lf, .retval = 0}; + + dwarf_getpubnames(dbg, pubname_search_cb, &pubname_param, 0); + if (pubname_param.found) { + line_range_search_cb(&lf.sp_die, &line_range_param); + if (lf.found) + goto found; + } + } + /* Loop on CUs (Compilation Unit) */ while (!lf.found && ret >= 0) { if (dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL) != 0) @@ -1974,6 +2045,7 @@ int find_line_range(int fd, struct line_range *lr) off = noff; } +found: /* Store comp_dir */ if (lf.found) { comp_dir = cu_get_comp_dir(&lf.cu_die); diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h index beaefc3c1223..605730a366db 100644 --- a/tools/perf/util/probe-finder.h +++ b/tools/perf/util/probe-finder.h @@ -49,6 +49,7 @@ struct probe_finder { Dwarf_Addr addr; /* Address */ const char *fname; /* Real file name */ Dwarf_Die cu_die; /* Current CU */ + Dwarf_Die sp_die; struct list_head lcache; /* Line cache for lazy match */ /* For variable searching */ @@ -83,6 +84,7 @@ struct line_finder { int lno_s; /* Start line number */ int lno_e; /* End line number */ Dwarf_Die cu_die; /* Current CU */ + Dwarf_Die sp_die; int found; }; |