diff options
Diffstat (limited to 'arch')
128 files changed, 1639 insertions, 2153 deletions
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 42aa078a5e4d..5a621c6d22ab 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -585,8 +585,7 @@ handle_ipi(struct pt_regs *regs) switch (which) { case IPI_RESCHEDULE: - /* Reschedule callback. Everything to be done - is done by the interrupt return path. */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index 918e8e0b72ff..818e74ed45dc 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ -375,8 +375,7 @@ static struct clocksource clocksource_rpcc = { static inline void register_rpcc_clocksource(long cycle_freq) { - clocksource_calc_mult_shift(&clocksource_rpcc, cycle_freq, 4); - clocksource_register(&clocksource_rpcc); + clocksource_register_hz(&clocksource_rpcc, cycle_freq); } #else /* !CONFIG_SMP */ static inline void register_rpcc_clocksource(long cycle_freq) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 99f625a3142c..d439a8f4c078 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -567,10 +567,7 @@ asmlinkage void __exception_irq_entry do_IPI(int ipinr, struct pt_regs *regs) break; case IPI_RESCHEDULE: - /* - * nothing more to do - eveything is - * done on the interrupt return path - */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/blackfin/kernel/time-ts.c b/arch/blackfin/kernel/time-ts.c index cdb4beb6bc8f..9e9b60d969dc 100644 --- a/arch/blackfin/kernel/time-ts.c +++ b/arch/blackfin/kernel/time-ts.c @@ -23,29 +23,6 @@ #include <asm/gptimers.h> #include <asm/nmi.h> -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ - -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ #if defined(CONFIG_CYCLES_CLOCKSOURCE) @@ -63,7 +40,6 @@ static struct clocksource bfin_cs_cycles = { .rating = 400, .read = bfin_read_cycles, .mask = CLOCKSOURCE_MASK(64), - .shift = CYC2NS_SCALE_FACTOR, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -75,10 +51,7 @@ static inline unsigned long long bfin_cs_cycles_sched_clock(void) static int __init bfin_cs_cycles_init(void) { - bfin_cs_cycles.mult = \ - clocksource_hz2mult(get_cclk(), bfin_cs_cycles.shift); - - if (clocksource_register(&bfin_cs_cycles)) + if (clocksource_register_hz(&bfin_cs_cycles, get_cclk())) panic("failed to register clocksource"); return 0; @@ -111,7 +84,6 @@ static struct clocksource bfin_cs_gptimer0 = { .rating = 350, .read = bfin_read_gptimer0, .mask = CLOCKSOURCE_MASK(32), - .shift = CYC2NS_SCALE_FACTOR, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -125,10 +97,7 @@ static int __init bfin_cs_gptimer0_init(void) { setup_gptimer0(); - bfin_cs_gptimer0.mult = \ - clocksource_hz2mult(get_sclk(), bfin_cs_gptimer0.shift); - - if (clocksource_register(&bfin_cs_gptimer0)) + if (clocksource_register_hz(&bfin_cs_gptimer0, get_sclk())) panic("failed to register clocksource"); return 0; diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index 0b12a53fe11a..c0ac8e643361 100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c @@ -178,6 +178,9 @@ static irqreturn_t ipi_handler_int1(int irq, void *dev_instance) while (msg_queue->count) { msg = &msg_queue->ipi_message[msg_queue->head]; switch (msg->type) { + case BFIN_IPI_RESCHEDULE: + scheduler_ipi(); + break; case BFIN_IPI_CALL_FUNC: spin_unlock_irqrestore(&msg_queue->lock, flags); ipi_call_function(cpu, msg); diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index 4c9e3e1ba5d1..66cc75657e2f 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -342,15 +342,18 @@ irqreturn_t crisv32_ipi_interrupt(int irq, void *dev_id) ipi = REG_RD(intr_vect, irq_regs[smp_processor_id()], rw_ipi); + if (ipi.vector & IPI_SCHEDULE) { + scheduler_ipi(); + } if (ipi.vector & IPI_CALL) { - func(info); + func(info); } if (ipi.vector & IPI_FLUSH_TLB) { - if (flush_mm == FLUSH_ALL) - __flush_tlb_all(); - else if (flush_vma == FLUSH_ALL) + if (flush_mm == FLUSH_ALL) + __flush_tlb_all(); + else if (flush_vma == FLUSH_ALL) __flush_tlb_mm(flush_mm); - else + else __flush_tlb_page(flush_vma, flush_addr); } diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c index 1b811c61bdc6..f64097b5118a 100644 --- a/arch/ia64/kernel/cyclone.c +++ b/arch/ia64/kernel/cyclone.c @@ -31,8 +31,6 @@ static struct clocksource clocksource_cyclone = { .rating = 300, .read = read_cyclone, .mask = (1LL << 40) - 1, - .mult = 0, /*to be calculated*/ - .shift = 16, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -118,9 +116,7 @@ int __init init_cyclone_clock(void) /* initialize last tick */ cyclone_mc = cyclone_timer; clocksource_cyclone.fsys_mmio = cyclone_timer; - clocksource_cyclone.mult = clocksource_hz2mult(CYCLONE_TIMER_FREQ, - clocksource_cyclone.shift); - clocksource_register(&clocksource_cyclone); + clocksource_register_hz(&clocksource_cyclone, CYCLONE_TIMER_FREQ); return 0; } diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 5b704740f160..782c3a357f24 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -31,6 +31,7 @@ #include <linux/irq.h> #include <linux/ratelimit.h> #include <linux/acpi.h> +#include <linux/sched.h> #include <asm/delay.h> #include <asm/intrinsics.h> @@ -496,6 +497,7 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) smp_local_flush_tlb(); kstat_incr_irqs_this_cpu(irq, desc); } else if (unlikely(IS_RESCHEDULE(vector))) { + scheduler_ipi(); kstat_incr_irqs_this_cpu(irq, desc); } else { ia64_setreg(_IA64_REG_CR_TPR, vector); diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 156ad803d5b7..04440cc09b40 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -73,8 +73,6 @@ static struct clocksource clocksource_itc = { .rating = 350, .read = itc_get_cycles, .mask = CLOCKSOURCE_MASK(64), - .mult = 0, /*to be calculated*/ - .shift = 16, .flags = CLOCK_SOURCE_IS_CONTINUOUS, #ifdef CONFIG_PARAVIRT .resume = paravirt_clocksource_resume, @@ -365,11 +363,8 @@ ia64_init_itm (void) ia64_cpu_local_tick(); if (!itc_clocksource) { - /* Sort out mult/shift values: */ - clocksource_itc.mult = - clocksource_hz2mult(local_cpu_data->itc_freq, - clocksource_itc.shift); - clocksource_register(&clocksource_itc); + clocksource_register_hz(&clocksource_itc, + local_cpu_data->itc_freq); itc_clocksource = &clocksource_itc; } } diff --git a/arch/ia64/sn/kernel/sn2/timer.c b/arch/ia64/sn/kernel/sn2/timer.c index 21d6f09e3447..c34efda122e1 100644 --- a/arch/ia64/sn/kernel/sn2/timer.c +++ b/arch/ia64/sn/kernel/sn2/timer.c @@ -33,8 +33,6 @@ static struct clocksource clocksource_sn2 = { .rating = 450, .read = read_sn2, .mask = (1LL << 55) - 1, - .mult = 0, - .shift = 10, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -57,9 +55,7 @@ ia64_sn_udelay (unsigned long usecs) void __init sn_timer_init(void) { clocksource_sn2.fsys_mmio = RTC_COUNTER_ADDR; - clocksource_sn2.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, - clocksource_sn2.shift); - clocksource_register(&clocksource_sn2); + clocksource_register_hz(&clocksource_sn2, sn_rtc_cycles_per_second); ia64_udelay = &ia64_sn_udelay; } diff --git a/arch/ia64/xen/irq_xen.c b/arch/ia64/xen/irq_xen.c index 108bb858acf2..b279e142c633 100644 --- a/arch/ia64/xen/irq_xen.c +++ b/arch/ia64/xen/irq_xen.c @@ -92,6 +92,8 @@ static unsigned short saved_irq_cnt; static int xen_slab_ready; #ifdef CONFIG_SMP +#include <linux/sched.h> + /* Dummy stub. Though we may check XEN_RESCHEDULE_VECTOR before __do_IRQ, * it ends up to issue several memory accesses upon percpu data and * thus adds unnecessary traffic to other paths. @@ -99,7 +101,13 @@ static int xen_slab_ready; static irqreturn_t xen_dummy_handler(int irq, void *dev_id) { + return IRQ_HANDLED; +} +static irqreturn_t +xen_resched_handler(int irq, void *dev_id) +{ + scheduler_ipi(); return IRQ_HANDLED; } @@ -110,7 +118,7 @@ static struct irqaction xen_ipi_irqaction = { }; static struct irqaction xen_resched_irqaction = { - .handler = xen_dummy_handler, + .handler = xen_resched_handler, .flags = IRQF_DISABLED, .name = "resched" }; diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c index 31cef20b2996..fc10b39893d4 100644 --- a/arch/m32r/kernel/smp.c +++ b/arch/m32r/kernel/smp.c @@ -122,8 +122,6 @@ void smp_send_reschedule(int cpu_id) * * Description: This routine executes on CPU which received * 'RESCHEDULE_IPI'. - * Rescheduling is processed at the exit of interrupt - * operation. * * Born on Date: 2002.02.05 * @@ -138,7 +136,7 @@ void smp_send_reschedule(int cpu_id) *==========================================================================*/ void smp_reschedule_interrupt(void) { - /* nothing to do */ + scheduler_ipi(); } /*==========================================================================* diff --git a/arch/microblaze/kernel/timer.c b/arch/microblaze/kernel/timer.c index d8a214f11ac2..e5550ce4e0eb 100644 --- a/arch/microblaze/kernel/timer.c +++ b/arch/microblaze/kernel/timer.c @@ -217,16 +217,12 @@ static struct clocksource clocksource_microblaze = { .rating = 300, .read = microblaze_read, .mask = CLOCKSOURCE_MASK(32), - .shift = 8, /* I can shift it */ .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static int __init microblaze_clocksource_init(void) { - clocksource_microblaze.mult = - clocksource_hz2mult(timer_clock_freq, - clocksource_microblaze.shift); - if (clocksource_register(&clocksource_microblaze)) + if (clocksource_register_hz(&clocksource_microblaze, timer_clock_freq)) panic("failed to register clocksource"); /* stop timer1 */ diff --git a/arch/mips/alchemy/common/time.c b/arch/mips/alchemy/common/time.c index 2aecb2fdf982..d5da6adbf634 100644 --- a/arch/mips/alchemy/common/time.c +++ b/arch/mips/alchemy/common/time.c @@ -141,8 +141,7 @@ static int __init alchemy_time_init(unsigned int m2int) goto cntr_err; /* register counter1 clocksource and event device */ - clocksource_set_clock(&au1x_counter1_clocksource, 32768); - clocksource_register(&au1x_counter1_clocksource); + clocksource_register_hz(&au1x_counter1_clocksource, 32768); cd->shift = 32; cd->mult = div_sc(32768, NSEC_PER_SEC, cd->shift); diff --git a/arch/mips/cavium-octeon/csrc-octeon.c b/arch/mips/cavium-octeon/csrc-octeon.c index 26bf71130bf8..29d56afbb02d 100644 --- a/arch/mips/cavium-octeon/csrc-octeon.c +++ b/arch/mips/cavium-octeon/csrc-octeon.c @@ -105,8 +105,7 @@ unsigned long long notrace sched_clock(void) void __init plat_time_init(void) { clocksource_mips.rating = 300; - clocksource_set_clock(&clocksource_mips, octeon_get_clock_rate()); - clocksource_register(&clocksource_mips); + clocksource_register_hz(&clocksource_mips, octeon_get_clock_rate()); } static u64 octeon_udelay_factor; diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c index ba78b21cc8d0..76923eeb58b9 100644 --- a/arch/mips/cavium-octeon/smp.c +++ b/arch/mips/cavium-octeon/smp.c @@ -44,6 +44,8 @@ static irqreturn_t mailbox_interrupt(int irq, void *dev_id) if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); /* Check if we've been told to flush the icache */ if (action & SMP_ICACHE_FLUSH) diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index 7622ccf75076..1881b316ca45 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -20,16 +20,18 @@ #define WORD_INSN ".word" #endif -#define JUMP_LABEL(key, label) \ - do { \ - asm goto("1:\tnop\n\t" \ - "nop\n\t" \ - ".pushsection __jump_table, \"a\"\n\t" \ - WORD_INSN " 1b, %l[" #label "], %0\n\t" \ - ".popsection\n\t" \ - : : "i" (key) : : label); \ - } while (0) - +static __always_inline bool arch_static_branch(struct jump_label_key *key) +{ + asm goto("1:\tnop\n\t" + "nop\n\t" + ".pushsection __jump_table, \"aw\"\n\t" + WORD_INSN " 1b, %l[l_yes], %0\n\t" + ".popsection\n\t" + : : "i" (key) : : l_yes); + return false; +l_yes: + return true; +} #endif /* __KERNEL__ */ diff --git a/arch/mips/include/asm/time.h b/arch/mips/include/asm/time.h index c7f1bfef1574..bc14447e69b5 100644 --- a/arch/mips/include/asm/time.h +++ b/arch/mips/include/asm/time.h @@ -84,12 +84,6 @@ static inline int init_mips_clocksource(void) #endif } -static inline void clocksource_set_clock(struct clocksource *cs, - unsigned int clock) -{ - clocksource_calc_mult_shift(cs, clock, 4); -} - static inline void clockevent_set_clock(struct clock_event_device *cd, unsigned int clock) { diff --git a/arch/mips/jz4740/time.c b/arch/mips/jz4740/time.c index eaa853a54af6..f83c2dd07a27 100644 --- a/arch/mips/jz4740/time.c +++ b/arch/mips/jz4740/time.c @@ -121,8 +121,7 @@ void __init plat_time_init(void) clockevents_register_device(&jz4740_clockevent); - clocksource_set_clock(&jz4740_clocksource, clk_rate); - ret = clocksource_register(&jz4740_clocksource); + ret = clocksource_register_hz(&jz4740_clocksource, clk_rate); if (ret) printk(KERN_ERR "Failed to register clocksource: %d\n", ret); diff --git a/arch/mips/kernel/cevt-txx9.c b/arch/mips/kernel/cevt-txx9.c index 0b7377361e22..f0ab92a1b057 100644 --- a/arch/mips/kernel/cevt-txx9.c +++ b/arch/mips/kernel/cevt-txx9.c @@ -51,8 +51,7 @@ void __init txx9_clocksource_init(unsigned long baseaddr, { struct txx9_tmr_reg __iomem *tmrptr; - clocksource_set_clock(&txx9_clocksource.cs, TIMER_CLK(imbusclk)); - clocksource_register(&txx9_clocksource.cs); + clocksource_register_hz(&txx9_clocksource.cs, TIMER_CLK(imbusclk)); tmrptr = ioremap(baseaddr, sizeof(struct txx9_tmr_reg)); __raw_writel(TCR_BASE, &tmrptr->tcr); diff --git a/arch/mips/kernel/csrc-bcm1480.c b/arch/mips/kernel/csrc-bcm1480.c index 51489f8a825e..f96f99c794a3 100644 --- a/arch/mips/kernel/csrc-bcm1480.c +++ b/arch/mips/kernel/csrc-bcm1480.c @@ -49,6 +49,5 @@ void __init sb1480_clocksource_init(void) plldiv = G_BCM1480_SYS_PLL_DIV(__raw_readq(IOADDR(A_SCD_SYSTEM_CFG))); zbbus = ((plldiv >> 1) * 50000000) + ((plldiv & 1) * 25000000); - clocksource_set_clock(cs, zbbus); - clocksource_register(cs); + clocksource_register_hz(cs, zbbus); } diff --git a/arch/mips/kernel/csrc-ioasic.c b/arch/mips/kernel/csrc-ioasic.c index 23da108506b0..46bd7fa98d6c 100644 --- a/arch/mips/kernel/csrc-ioasic.c +++ b/arch/mips/kernel/csrc-ioasic.c @@ -59,7 +59,5 @@ void __init dec_ioasic_clocksource_init(void) printk(KERN_INFO "I/O ASIC clock frequency %dHz\n", freq); clocksource_dec.rating = 200 + freq / 10000000; - clocksource_set_clock(&clocksource_dec, freq); - - clocksource_register(&clocksource_dec); + clocksource_register_hz(&clocksource_dec, freq); } diff --git a/arch/mips/kernel/csrc-powertv.c b/arch/mips/kernel/csrc-powertv.c index a27c16c8690e..2e7c5232da8d 100644 --- a/arch/mips/kernel/csrc-powertv.c +++ b/arch/mips/kernel/csrc-powertv.c @@ -78,9 +78,7 @@ static void __init powertv_c0_hpt_clocksource_init(void) clocksource_mips.rating = 200 + mips_hpt_frequency / 10000000; - clocksource_set_clock(&clocksource_mips, mips_hpt_frequency); - - clocksource_register(&clocksource_mips); + clocksource_register_hz(&clocksource_mips, mips_hpt_frequency); } /** @@ -130,43 +128,16 @@ static struct clocksource clocksource_tim_c = { /** * powertv_tim_c_clocksource_init - set up a clock source for the TIM_C clock * - * The hard part here is coming up with a constant k and shift s such that - * the 48-bit TIM_C value multiplied by k doesn't overflow and that value, - * when shifted right by s, yields the corresponding number of nanoseconds. * We know that TIM_C counts at 27 MHz/8, so each cycle corresponds to - * 1 / (27,000,000/8) seconds. Multiply that by a billion and you get the - * number of nanoseconds. Since the TIM_C value has 48 bits and the math is - * done in 64 bits, avoiding an overflow means that k must be less than - * 64 - 48 = 16 bits. + * 1 / (27,000,000/8) seconds. */ static void __init powertv_tim_c_clocksource_init(void) { - int prescale; - unsigned long dividend; - unsigned long k; - int s; - const int max_k_bits = (64 - 48) - 1; - const unsigned long billion = 1000000000; const unsigned long counts_per_second = 27000000 / 8; - prescale = BITS_PER_LONG - ilog2(billion) - 1; - dividend = billion << prescale; - k = dividend / counts_per_second; - s = ilog2(k) - max_k_bits; - - if (s < 0) - s = prescale; - - else { - k >>= s; - s += prescale; - } - - clocksource_tim_c.mult = k; - clocksource_tim_c.shift = s; clocksource_tim_c.rating = 200; - clocksource_register(&clocksource_tim_c); + clocksource_register_hz(&clocksource_tim_c, counts_per_second); tim_c = (struct tim_c *) asic_reg_addr(tim_ch); } diff --git a/arch/mips/kernel/csrc-r4k.c b/arch/mips/kernel/csrc-r4k.c index e95a3cd48eea..decd1fa38d55 100644 --- a/arch/mips/kernel/csrc-r4k.c +++ b/arch/mips/kernel/csrc-r4k.c @@ -30,9 +30,7 @@ int __init init_r4k_clocksource(void) /* Calculate a somewhat reasonable rating value */ clocksource_mips.rating = 200 + mips_hpt_frequency / 10000000; - clocksource_set_clock(&clocksource_mips, mips_hpt_frequency); - - clocksource_register(&clocksource_mips); + clocksource_register_hz(&clocksource_mips, mips_hpt_frequency); return 0; } diff --git a/arch/mips/kernel/csrc-sb1250.c b/arch/mips/kernel/csrc-sb1250.c index d14d3d1907fa..e9606d907685 100644 --- a/arch/mips/kernel/csrc-sb1250.c +++ b/arch/mips/kernel/csrc-sb1250.c @@ -65,6 +65,5 @@ void __init sb1250_clocksource_init(void) IOADDR(A_SCD_TIMER_REGISTER(SB1250_HPT_NUM, R_SCD_TIMER_CFG))); - clocksource_set_clock(cs, V_SCD_TIMER_FREQ); - clocksource_register(cs); + clocksource_register_hz(cs, V_SCD_TIMER_FREQ); } diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c index 2392a7a296d4..9fadd17888d9 100644 --- a/arch/mips/kernel/i8253.c +++ b/arch/mips/kernel/i8253.c @@ -196,8 +196,6 @@ static struct clocksource clocksource_pit = { .rating = 110, .read = pit_read, .mask = CLOCKSOURCE_MASK(32), - .mult = 0, - .shift = 20, }; static int __init init_pit_clocksource(void) @@ -205,7 +203,6 @@ static int __init init_pit_clocksource(void) if (num_possible_cpus() > 1) /* PIT does not scale! */ return 0; - clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); - return clocksource_register(&clocksource_pit); + return clocksource_register_hz(&clocksource_pit, CLOCK_TICK_RATE); } arch_initcall(init_pit_clocksource); diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c index 5a88cc4ccd5a..cedac4633741 100644 --- a/arch/mips/kernel/smtc.c +++ b/arch/mips/kernel/smtc.c @@ -929,7 +929,7 @@ static void post_direct_ipi(int cpu, struct smtc_ipi *pipi) static void ipi_resched_interrupt(void) { - /* Return from interrupt should be enough to cause scheduler check */ + scheduler_ipi(); } static void ipi_call_interrupt(void) diff --git a/arch/mips/loongson/common/cs5536/cs5536_mfgpt.c b/arch/mips/loongson/common/cs5536/cs5536_mfgpt.c index 8c807c965199..0cb1b9760e34 100644 --- a/arch/mips/loongson/common/cs5536/cs5536_mfgpt.c +++ b/arch/mips/loongson/common/cs5536/cs5536_mfgpt.c @@ -201,8 +201,6 @@ static struct clocksource clocksource_mfgpt = { .rating = 120, /* Functional for real use, but not desired */ .read = mfgpt_read, .mask = CLOCKSOURCE_MASK(32), - .mult = 0, - .shift = 22, }; int __init init_mfgpt_clocksource(void) @@ -210,8 +208,7 @@ int __init init_mfgpt_clocksource(void) if (num_possible_cpus() > 1) /* MFGPT does not scale! */ return 0; - clocksource_mfgpt.mult = clocksource_hz2mult(MFGPT_TICK_RATE, 22); - return clocksource_register(&clocksource_mfgpt); + return clocksource_register_hz(&clocksource_mfgpt, MFGPT_TICK_RATE); } arch_initcall(init_mfgpt_clocksource); diff --git a/arch/mips/mti-malta/malta-int.c b/arch/mips/mti-malta/malta-int.c index e85c977328da..1d36c511a7a5 100644 --- a/arch/mips/mti-malta/malta-int.c +++ b/arch/mips/mti-malta/malta-int.c @@ -308,6 +308,8 @@ static void ipi_call_dispatch(void) static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) { + scheduler_ipi(); + return IRQ_HANDLED; } diff --git a/arch/mips/pmc-sierra/yosemite/smp.c b/arch/mips/pmc-sierra/yosemite/smp.c index efc9e889b349..2608752898c0 100644 --- a/arch/mips/pmc-sierra/yosemite/smp.c +++ b/arch/mips/pmc-sierra/yosemite/smp.c @@ -55,6 +55,8 @@ void titan_mailbox_irq(void) if (status & 0x2) smp_call_function_interrupt(); + if (status & 0x4) + scheduler_ipi(); break; case 1: @@ -63,6 +65,8 @@ void titan_mailbox_irq(void) if (status & 0x2) smp_call_function_interrupt(); + if (status & 0x4) + scheduler_ipi(); break; } } diff --git a/arch/mips/sgi-ip27/ip27-irq.c b/arch/mips/sgi-ip27/ip27-irq.c index 0a04603d577c..b18b04e48577 100644 --- a/arch/mips/sgi-ip27/ip27-irq.c +++ b/arch/mips/sgi-ip27/ip27-irq.c @@ -147,8 +147,10 @@ static void ip27_do_irq_mask0(void) #ifdef CONFIG_SMP if (pend0 & (1UL << CPU_RESCHED_A_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_RESCHED_A_IRQ); + scheduler_ipi(); } else if (pend0 & (1UL << CPU_RESCHED_B_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_RESCHED_B_IRQ); + scheduler_ipi(); } else if (pend0 & (1UL << CPU_CALL_A_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_CALL_A_IRQ); smp_call_function_interrupt(); diff --git a/arch/mips/sgi-ip27/ip27-timer.c b/arch/mips/sgi-ip27/ip27-timer.c index 3f810c9cbf83..ef74f3267f91 100644 --- a/arch/mips/sgi-ip27/ip27-timer.c +++ b/arch/mips/sgi-ip27/ip27-timer.c @@ -163,8 +163,7 @@ static void __init hub_rt_clocksource_init(void) { struct clocksource *cs = &hub_rt_clocksource; - clocksource_set_clock(cs, CYCLES_PER_SEC); - clocksource_register(cs); + clocksource_register_hz(cs, CYCLES_PER_SEC); } void __init plat_time_init(void) diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c index 47b347c992ea..d667875be564 100644 --- a/arch/mips/sibyte/bcm1480/smp.c +++ b/arch/mips/sibyte/bcm1480/smp.c @@ -20,6 +20,7 @@ #include <linux/delay.h> #include <linux/smp.h> #include <linux/kernel_stat.h> +#include <linux/sched.h> #include <asm/mmu_context.h> #include <asm/io.h> @@ -189,10 +190,8 @@ void bcm1480_mailbox_interrupt(void) /* Clear the mailbox to clear the interrupt */ __raw_writeq(((u64)action)<<48, mailbox_0_clear_regs[cpu]); - /* - * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the - * interrupt will do the reschedule for us - */ + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c index c00a5cb1128d..38e7f6bd7922 100644 --- a/arch/mips/sibyte/sb1250/smp.c +++ b/arch/mips/sibyte/sb1250/smp.c @@ -21,6 +21,7 @@ #include <linux/interrupt.h> #include <linux/smp.h> #include <linux/kernel_stat.h> +#include <linux/sched.h> #include <asm/mmu_context.h> #include <asm/io.h> @@ -177,10 +178,8 @@ void sb1250_mailbox_interrupt(void) /* Clear the mailbox to clear the interrupt */ ____raw_writeq(((u64)action) << 48, mailbox_clear_regs[cpu]); - /* - * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the - * interrupt will do the reschedule for us - */ + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c index 226c826a2194..83fb27912231 100644 --- a/arch/mn10300/kernel/smp.c +++ b/arch/mn10300/kernel/smp.c @@ -494,14 +494,11 @@ void smp_send_stop(void) * @irq: The interrupt number. * @dev_id: The device ID. * - * We need do nothing here, since the scheduling will be effected on our way - * back through entry.S. - * * Returns IRQ_HANDLED to indicate we handled the interrupt successfully. */ static irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id) { - /* do nothing */ + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 69d63d354ef0..828305f19cff 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -155,10 +155,7 @@ ipi_interrupt(int irq, void *dev_id) case IPI_RESCHEDULE: smp_debug(100, KERN_DEBUG "CPU%d IPI_RESCHEDULE\n", this_cpu); - /* - * Reschedule callback. Everything to be - * done is done by the interrupt return path. - */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 87517ab6d365..a5c945814803 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -118,7 +118,7 @@ void smp_message_recv(int msg) generic_smp_call_function_interrupt(); break; case PPC_MSG_RESCHEDULE: - /* we notice need_resched on exit */ + scheduler_ipi(); break; case PPC_MSG_CALL_FUNC_SINGLE: generic_smp_call_function_single_interrupt(); @@ -148,7 +148,7 @@ static irqreturn_t call_function_action(int irq, void *data) static irqreturn_t reschedule_action(int irq, void *data) { - /* we just need the return path side effect of checking need_resched */ + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index dc7671ff6d78..ff2d2371b2e9 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -88,6 +88,7 @@ config S390 select HAVE_KERNEL_XZ select HAVE_GET_USER_PAGES_FAST select HAVE_ARCH_MUTEX_CPU_RELAX + select HAVE_ARCH_JUMP_LABEL if !MARCH_G5 select ARCH_INLINE_SPIN_TRYLOCK select ARCH_INLINE_SPIN_TRYLOCK_BH select ARCH_INLINE_SPIN_LOCK diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h new file mode 100644 index 000000000000..95a6cf2b5b67 --- /dev/null +++ b/arch/s390/include/asm/jump_label.h @@ -0,0 +1,37 @@ +#ifndef _ASM_S390_JUMP_LABEL_H +#define _ASM_S390_JUMP_LABEL_H + +#include <linux/types.h> + +#define JUMP_LABEL_NOP_SIZE 6 + +#ifdef CONFIG_64BIT +#define ASM_PTR ".quad" +#define ASM_ALIGN ".balign 8" +#else +#define ASM_PTR ".long" +#define ASM_ALIGN ".balign 4" +#endif + +static __always_inline bool arch_static_branch(struct jump_label_key *key) +{ + asm goto("0: brcl 0,0\n" + ".pushsection __jump_table, \"aw\"\n" + ASM_ALIGN "\n" + ASM_PTR " 0b, %l[label], %0\n" + ".popsection\n" + : : "X" (key) : : label); + return false; +label: + return true; +} + +typedef unsigned long jump_label_t; + +struct jump_entry { + jump_label_t code; + jump_label_t target; + jump_label_t key; +}; + +#endif diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 64230bc392fa..5ff15dacb571 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -23,7 +23,7 @@ CFLAGS_sysinfo.o += -Iinclude/math-emu -Iarch/s390/math-emu -w obj-y := bitmap.o traps.o time.o process.o base.o early.o setup.o \ processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o \ s390_ext.o debug.o irq.o ipl.o dis.o diag.o mem_detect.o \ - vdso.o vtime.o sysinfo.o nmi.o sclp.o + vdso.o vtime.o sysinfo.o nmi.o sclp.o jump_label.o obj-y += $(if $(CONFIG_64BIT),entry64.o,entry.o) obj-y += $(if $(CONFIG_64BIT),reipl64.o,reipl.o) diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c new file mode 100644 index 000000000000..44cc06bedf77 --- /dev/null +++ b/arch/s390/kernel/jump_label.c @@ -0,0 +1,59 @@ +/* + * Jump label s390 support + * + * Copyright IBM Corp. 2011 + * Author(s): Jan Glauber <jang@linux.vnet.ibm.com> + */ +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/stop_machine.h> +#include <linux/jump_label.h> +#include <asm/ipl.h> + +#ifdef HAVE_JUMP_LABEL + +struct insn { + u16 opcode; + s32 offset; +} __packed; + +struct insn_args { + unsigned long *target; + struct insn *insn; + ssize_t size; +}; + +static int __arch_jump_label_transform(void *data) +{ + struct insn_args *args = data; + int rc; + + rc = probe_kernel_write(args->target, args->insn, args->size); + WARN_ON_ONCE(rc < 0); + return 0; +} + +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ + struct insn_args args; + struct insn insn; + + if (type == JUMP_LABEL_ENABLE) { + /* brcl 15,offset */ + insn.opcode = 0xc0f4; + insn.offset = (entry->target - entry->code) >> 1; + } else { + /* brcl 0,0 */ + insn.opcode = 0xc004; + insn.offset = 0; + } + + args.target = (void *) entry->code; + args.insn = &insn; + args.size = JUMP_LABEL_NOP_SIZE; + + stop_machine(__arch_jump_label_transform, &args, NULL); +} + +#endif diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 85cd6ef89995..fed71dcaa0d6 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -165,12 +165,12 @@ static void do_ext_call_interrupt(unsigned int ext_int_code, kstat_cpu(smp_processor_id()).irqs[EXTINT_IPI]++; /* * handle bit signal external calls - * - * For the ec_schedule signal we have to do nothing. All the work - * is done automatically when we return from the interrupt. */ bits = xchg(&S390_lowcore.ext_call_fast, 0); + if (test_bit(ec_schedule, &bits)) + scheduler_ipi(); + if (test_bit(ec_call_function, &bits)) generic_smp_call_function_interrupt(); diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 509b36b45115..6207561ea34a 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -20,6 +20,7 @@ #include <linux/module.h> #include <linux/cpu.h> #include <linux/interrupt.h> +#include <linux/sched.h> #include <asm/atomic.h> #include <asm/processor.h> #include <asm/system.h> @@ -323,6 +324,7 @@ void smp_message_recv(unsigned int msg) generic_smp_call_function_interrupt(); break; case SMP_MSG_RESCHEDULE: + scheduler_ipi(); break; case SMP_MSG_FUNCTION_SINGLE: generic_smp_call_function_single_interrupt(); diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h index 427d4684e0d2..fc73a82366f8 100644 --- a/arch/sparc/include/asm/jump_label.h +++ b/arch/sparc/include/asm/jump_label.h @@ -7,17 +7,20 @@ #define JUMP_LABEL_NOP_SIZE 4 -#define JUMP_LABEL(key, label) \ - do { \ - asm goto("1:\n\t" \ - "nop\n\t" \ - "nop\n\t" \ - ".pushsection __jump_table, \"a\"\n\t"\ - ".align 4\n\t" \ - ".word 1b, %l[" #label "], %c0\n\t" \ - ".popsection \n\t" \ - : : "i" (key) : : label);\ - } while (0) +static __always_inline bool arch_static_branch(struct jump_label_key *key) +{ + asm goto("1:\n\t" + "nop\n\t" + "nop\n\t" + ".pushsection __jump_table, \"aw\"\n\t" + ".align 4\n\t" + ".word 1b, %l[l_yes], %c0\n\t" + ".popsection \n\t" + : : "i" (key) : : l_yes); + return false; +l_yes: + return true; +} #endif /* __KERNEL__ */ diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index 1c79f32734a0..8b9c556d630b 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -65,6 +65,10 @@ static inline int pcibus_to_node(struct pci_bus *pbus) #define smt_capable() (sparc64_multi_core) #endif /* CONFIG_SMP */ -#define cpu_coregroup_mask(cpu) (&cpu_core_map[cpu]) +extern cpumask_t cpu_core_map[NR_CPUS]; +static inline const struct cpumask *cpu_coregroup_mask(int cpu) +{ + return &cpu_core_map[cpu]; +} #endif /* _ASM_SPARC64_TOPOLOGY_H */ diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index 2805949e3a14..f11b425b1359 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -127,7 +127,9 @@ struct linux_prom_registers smp_penguin_ctable __cpuinitdata = { 0 }; void smp_send_reschedule(int cpu) { - /* See sparc64 */ + /* + * XXX missing reschedule IPI, see scheduler_ipi() + */ } void smp_send_stop(void) diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 3e94a8c23238..9478da7fdb3e 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1368,6 +1368,7 @@ void smp_send_reschedule(int cpu) void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs) { clear_softint(1 << irq); + scheduler_ipi(); } /* This is a nop because we capture all other cpus diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c index a4293102ef81..c52224d5ed45 100644 --- a/arch/tile/kernel/smp.c +++ b/arch/tile/kernel/smp.c @@ -189,12 +189,8 @@ void flush_icache_range(unsigned long start, unsigned long end) /* Called when smp_send_reschedule() triggers IRQ_RESCHEDULE. */ static irqreturn_t handle_reschedule_ipi(int irq, void *token) { - /* - * Nothing to do here; when we return from interrupt, the - * rescheduling will occur there. But do bump the interrupt - * profiler count in the meantime. - */ __get_cpu_var(irq_stat).irq_resched_count++; + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c index 106bf27e2a9a..eefb107d2d73 100644 --- a/arch/um/kernel/smp.c +++ b/arch/um/kernel/smp.c @@ -173,7 +173,7 @@ void IPI_handler(int cpu) break; case 'R': - set_tsk_need_resched(current); + scheduler_ipi(); break; case 'S': diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a76fb16fee9b..9bb2adca0728 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -365,17 +365,6 @@ config X86_UV # Following is an alphabetically sorted list of 32 bit extended platforms # Please maintain the alphabetic order if and when there are additions -config X86_ELAN - bool "AMD Elan" - depends on X86_32 - depends on X86_EXTENDED_PLATFORM - ---help--- - Select this for an AMD Elan processor. - - Do not use this option for K6/Athlon/Opteron processors! - - If unsure, choose "PC-compatible" instead. - config X86_INTEL_CE bool "CE4100 TV platform" depends on PCI @@ -1174,7 +1163,7 @@ comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" config AMD_NUMA def_bool y prompt "Old style AMD Opteron NUMA detection" - depends on X86_64 && NUMA && PCI + depends on NUMA && PCI ---help--- Enable AMD NUMA node topology detection. You should say Y here if you have a multi processor AMD system. This uses an old method to @@ -1201,7 +1190,7 @@ config NODES_SPAN_OTHER_NODES config NUMA_EMU bool "NUMA emulation" - depends on X86_64 && NUMA + depends on NUMA ---help--- Enable NUMA emulation. A flat machine will be split into virtual nodes when booted with "numa=fake=N", where N is the @@ -1223,6 +1212,10 @@ config HAVE_ARCH_BOOTMEM def_bool y depends on X86_32 && NUMA +config HAVE_ARCH_ALLOC_REMAP + def_bool y + depends on X86_32 && NUMA + config ARCH_HAVE_MEMORY_PRESENT def_bool y depends on X86_32 && DISCONTIGMEM @@ -1231,13 +1224,9 @@ config NEED_NODE_MEMMAP_SIZE def_bool y depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) -config HAVE_ARCH_ALLOC_REMAP - def_bool y - depends on X86_32 && NUMA - config ARCH_FLATMEM_ENABLE def_bool y - depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA + depends on X86_32 && !NUMA config ARCH_DISCONTIGMEM_ENABLE def_bool y @@ -1247,20 +1236,16 @@ config ARCH_DISCONTIGMEM_DEFAULT def_bool y depends on NUMA && X86_32 -config ARCH_PROC_KCORE_TEXT - def_bool y - depends on X86_64 && PROC_KCORE - -config ARCH_SPARSEMEM_DEFAULT - def_bool y - depends on X86_64 - config ARCH_SPARSEMEM_ENABLE def_bool y depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 +config ARCH_SPARSEMEM_DEFAULT + def_bool y + depends on X86_64 + config ARCH_SELECT_MEMORY_MODEL def_bool y depends on ARCH_SPARSEMEM_ENABLE @@ -1269,6 +1254,10 @@ config ARCH_MEMORY_PROBE def_bool X86_64 depends on MEMORY_HOTPLUG +config ARCH_PROC_KCORE_TEXT + def_bool y + depends on X86_64 && PROC_KCORE + config ILLEGAL_POINTER_VALUE hex default 0 if X86_32 @@ -1703,10 +1692,6 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE def_bool y depends on MEMORY_HOTPLUG -config HAVE_ARCH_EARLY_PFN_TO_NID - def_bool X86_64 - depends on NUMA - config USE_PERCPU_NUMA_NODE_ID def_bool y depends on NUMA @@ -2076,7 +2061,7 @@ config OLPC depends on !X86_PAE select GPIOLIB select OF - select OF_PROMTREE if PROC_DEVICETREE + select OF_PROMTREE ---help--- Add support for detecting the unique features of the OLPC XO hardware. diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index d161e939df62..6a7cfdf8ff69 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -1,6 +1,4 @@ # Put here option for CPU selection and depending optimization -if !X86_ELAN - choice prompt "Processor family" default M686 if X86_32 @@ -203,6 +201,14 @@ config MWINCHIP3D stores for this CPU, which can increase performance of some operations. +config MELAN + bool "AMD Elan" + depends on X86_32 + ---help--- + Select this for an AMD Elan processor. + + Do not use this option for K6/Athlon/Opteron processors! + config MGEODEGX1 bool "GeodeGX1" depends on X86_32 @@ -292,8 +298,6 @@ config X86_GENERIC This is really intended for distributors who need more generic optimizations. -endif - # # Define implied options from the CPU selection here config X86_INTERNODE_CACHE_SHIFT @@ -312,7 +316,7 @@ config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU - default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 + default "4" if MELAN || M486 || M386 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX config X86_XADD @@ -358,7 +362,7 @@ config X86_POPAD_OK config X86_ALIGNMENT_16 def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 config X86_INTEL_USERCOPY def_bool y diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index f2ee1abb1df9..86cee7b749e1 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -37,7 +37,7 @@ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march= $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) # AMD Elan support -cflags-$(CONFIG_X86_ELAN) += -march=i486 +cflags-$(CONFIG_MELAN) += -march=i486 # Geode GX1 support cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 12e0e7dd869c..416d865eae39 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -183,8 +183,6 @@ static inline void disable_acpi(void) { } #define ARCH_HAS_POWER_INIT 1 -struct bootnode; - #ifdef CONFIG_ACPI_NUMA extern int acpi_numa; extern int x86_acpi_numa_init(void); diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 13009d1af99a..bf535f947e8c 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -4,7 +4,6 @@ #include <linux/types.h> #include <linux/stddef.h> #include <linux/stringify.h> -#include <linux/jump_label.h> #include <asm/asm.h> /* @@ -191,12 +190,4 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_smp(void *addr, const void *opcode, size_t len); extern void text_poke_smp_batch(struct text_poke_param *params, int n); -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) -#define IDEAL_NOP_SIZE_5 5 -extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; -extern void arch_init_ideal_nop5(void); -#else -static inline void arch_init_ideal_nop5(void) {} -#endif - #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 331682231bb4..67f87f257611 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -11,7 +11,6 @@ struct amd_nb_bus_dev_range { extern const struct pci_device_id amd_nb_misc_ids[]; extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; -struct bootnode; extern bool early_is_amd_nb(u32 value); extern int amd_cache_northbridges(void); diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 2b7d573be549..a0c46f061210 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -363,7 +363,12 @@ struct apic { */ int (*x86_32_early_logical_apicid)(int cpu); - /* determine CPU -> NUMA node mapping */ + /* + * Optional method called from setup_local_APIC() after logical + * apicid is guaranteed to be known to initialize apicid -> node + * mapping if NUMA initialization hasn't done so already. Don't + * add new users. + */ int (*x86_32_numa_cpu_node)(int cpu); #endif }; @@ -537,8 +542,6 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) return cpuid_apic >> index_msb; } -extern int default_x86_32_numa_cpu_node(int cpu); - #endif static inline unsigned int diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 91f3e087cf21..50c0d30e676d 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -207,8 +207,7 @@ extern const char * const x86_power_flags[32]; #define test_cpu_cap(c, bit) \ test_bit(bit, (unsigned long *)((c)->x86_capability)) -#define cpu_has(c, bit) \ - (__builtin_constant_p(bit) && \ +#define REQUIRED_MASK_BIT_SET(bit) \ ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \ (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \ (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \ @@ -218,10 +217,16 @@ extern const char * const x86_power_flags[32]; (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \ (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \ - (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \ - ? 1 : \ + (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) + +#define cpu_has(c, bit) \ + (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ test_cpu_cap(c, bit)) +#define this_cpu_has(bit) \ + (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ + x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability)) + #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index 057099e5faba..0bdb0c54d9a1 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -69,22 +69,18 @@ #define MAX_DMA_CHANNELS 8 -#ifdef CONFIG_X86_32 - -/* The maximum address that we can perform a DMA transfer to on this platform */ -#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000) - -#else - /* 16MB ISA DMA zone */ #define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) /* 4GB broken PCI/AGP hardware bus master zone */ #define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) +#ifdef CONFIG_X86_32 +/* The maximum address that we can perform a DMA transfer to on this platform */ +#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000) +#else /* Compat define for old dma zone */ #define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT)) - #endif /* 8237 DMA controllers */ diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 574dbc22893a..a32b18ce6ead 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -5,20 +5,25 @@ #include <linux/types.h> #include <asm/nops.h> +#include <asm/asm.h> #define JUMP_LABEL_NOP_SIZE 5 -# define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" - -# define JUMP_LABEL(key, label) \ - do { \ - asm goto("1:" \ - JUMP_LABEL_INITIAL_NOP \ - ".pushsection __jump_table, \"aw\" \n\t"\ - _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \ - ".popsection \n\t" \ - : : "i" (key) : : label); \ - } while (0) +#define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" + +static __always_inline bool arch_static_branch(struct jump_label_key *key) +{ + asm goto("1:" + JUMP_LABEL_INITIAL_NOP + ".pushsection __jump_table, \"aw\" \n\t" + _ASM_ALIGN "\n\t" + _ASM_PTR "1b, %l[l_yes], %c0 \n\t" + ".popsection \n\t" + : : "i" (key) : : l_yes); + return false; +l_yes: + return true; +} #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h index 72a8b52e7dfd..7775debde112 100644 --- a/arch/x86/include/asm/mach_traps.h +++ b/arch/x86/include/asm/mach_traps.h @@ -8,6 +8,7 @@ #include <asm/mc146818rtc.h> #define NMI_REASON_PORT 0x61 +#define NMI_ENABLE_PORT 0x70 /* Real-Time Clock Address Register as well */ #define NMI_REASON_SERR 0x80 #define NMI_REASON_IOCHK 0x40 @@ -30,12 +31,19 @@ static inline void reassert_nmi(void) old_reg = current_lock_cmos_reg(); else lock_cmos(0); /* register doesn't matter here */ - outb(0x8f, 0x70); - inb(0x71); /* dummy */ - outb(0x0f, 0x70); - inb(0x71); /* dummy */ + + /* + * This will cause the NMI output to transition low + * then high if there are any pending NMI sources. The + * CPU's NMI input logic will then register a new NMI. + */ + outb(0x8f, NMI_ENABLE_PORT); + inb(0x71); /* dummy */ + outb(0x0f, NMI_ENABLE_PORT); + inb(0x71); /* dummy */ + if (old_reg >= 0) - outb(old_reg, 0x70); + outb(old_reg, NMI_ENABLE_PORT); else unlock_cmos(); } diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index eb16e94ae04f..021979a6e23f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -142,8 +142,6 @@ static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} static inline void enable_p5_mce(void) {} #endif -extern void (*x86_mce_decode_callback)(struct mce *m); - void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, mce_dev); diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 91df7c51806c..5e83a416eca8 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -13,31 +13,11 @@ extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) #include <asm/numaq.h> -/* summit or generic arch */ -#include <asm/srat.h> - -extern int get_memcfg_numa_flat(void); -/* - * This allows any one NUMA architecture to be compiled - * for, and still fall back to the flat function if it - * fails. - */ -static inline void get_memcfg_numa(void) -{ - - if (get_memcfg_numaq()) - return; - if (get_memcfg_from_srat()) - return; - get_memcfg_numa_flat(); -} extern void resume_map_numa_kva(pgd_t *pgd); #else /* !CONFIG_NUMA */ -#define get_memcfg_numa get_memcfg_numa_flat - static inline void resume_map_numa_kva(pgd_t *pgd) {} #endif /* CONFIG_NUMA */ diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h index 288b96f815a6..b3f88d7867c7 100644 --- a/arch/x86/include/asm/mmzone_64.h +++ b/arch/x86/include/asm/mmzone_64.h @@ -4,36 +4,13 @@ #ifndef _ASM_X86_MMZONE_64_H #define _ASM_X86_MMZONE_64_H - #ifdef CONFIG_NUMA #include <linux/mmdebug.h> - #include <asm/smp.h> -/* Simple perfect hash to map physical addresses to node numbers */ -struct memnode { - int shift; - unsigned int mapsize; - s16 *map; - s16 embedded_map[64 - 8]; -} ____cacheline_aligned; /* total size = 128 bytes */ -extern struct memnode memnode; -#define memnode_shift memnode.shift -#define memnodemap memnode.map -#define memnodemapsize memnode.mapsize - extern struct pglist_data *node_data[]; -static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) -{ - unsigned nid; - VIRTUAL_BUG_ON(!memnodemap); - nid = memnodemap[addr >> memnode_shift]; - VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); - return nid; -} - #define NODE_DATA(nid) (node_data[nid]) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 67763c5d8b4e..9eae7752ae9b 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -35,7 +35,7 @@ #define MODULE_PROC_FAMILY "K7 " #elif defined CONFIG_MK8 #define MODULE_PROC_FAMILY "K8 " -#elif defined CONFIG_X86_ELAN +#elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE #define MODULE_PROC_FAMILY "CRUSOE " diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h index af788496020b..405b4032a60b 100644 --- a/arch/x86/include/asm/nops.h +++ b/arch/x86/include/asm/nops.h @@ -1,7 +1,13 @@ #ifndef _ASM_X86_NOPS_H #define _ASM_X86_NOPS_H -/* Define nops for use with alternative() */ +/* + * Define nops for use with alternative() and for tracing. + * + * *_NOP5_ATOMIC must be a single instruction. + */ + +#define NOP_DS_PREFIX 0x3e /* generic versions from gas 1: nop @@ -13,14 +19,15 @@ 6: leal 0x00000000(%esi),%esi 7: leal 0x00000000(,%esi,1),%esi */ -#define GENERIC_NOP1 ".byte 0x90\n" -#define GENERIC_NOP2 ".byte 0x89,0xf6\n" -#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n" -#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n" -#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4 -#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n" -#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n" -#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7 +#define GENERIC_NOP1 0x90 +#define GENERIC_NOP2 0x89,0xf6 +#define GENERIC_NOP3 0x8d,0x76,0x00 +#define GENERIC_NOP4 0x8d,0x74,0x26,0x00 +#define GENERIC_NOP5 GENERIC_NOP1,GENERIC_NOP4 +#define GENERIC_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00 +#define GENERIC_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00 +#define GENERIC_NOP8 GENERIC_NOP1,GENERIC_NOP7 +#define GENERIC_NOP5_ATOMIC NOP_DS_PREFIX,GENERIC_NOP4 /* Opteron 64bit nops 1: nop @@ -29,13 +36,14 @@ 4: osp osp osp nop */ #define K8_NOP1 GENERIC_NOP1 -#define K8_NOP2 ".byte 0x66,0x90\n" -#define K8_NOP3 ".byte 0x66,0x66,0x90\n" -#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n" -#define K8_NOP5 K8_NOP3 K8_NOP2 -#define K8_NOP6 K8_NOP3 K8_NOP3 -#define K8_NOP7 K8_NOP4 K8_NOP3 -#define K8_NOP8 K8_NOP4 K8_NOP4 +#define K8_NOP2 0x66,K8_NOP1 +#define K8_NOP3 0x66,K8_NOP2 +#define K8_NOP4 0x66,K8_NOP3 +#define K8_NOP5 K8_NOP3,K8_NOP2 +#define K8_NOP6 K8_NOP3,K8_NOP3 +#define K8_NOP7 K8_NOP4,K8_NOP3 +#define K8_NOP8 K8_NOP4,K8_NOP4 +#define K8_NOP5_ATOMIC 0x66,K8_NOP4 /* K7 nops uses eax dependencies (arbitrary choice) @@ -47,13 +55,14 @@ 7: leal 0x00000000(,%eax,1),%eax */ #define K7_NOP1 GENERIC_NOP1 -#define K7_NOP2 ".byte 0x8b,0xc0\n" -#define K7_NOP3 ".byte 0x8d,0x04,0x20\n" -#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n" -#define K7_NOP5 K7_NOP4 ASM_NOP1 -#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n" -#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n" -#define K7_NOP8 K7_NOP7 ASM_NOP1 +#define K7_NOP2 0x8b,0xc0 +#define K7_NOP3 0x8d,0x04,0x20 +#define K7_NOP4 0x8d,0x44,0x20,0x00 +#define K7_NOP5 K7_NOP4,K7_NOP1 +#define K7_NOP6 0x8d,0x80,0,0,0,0 +#define K7_NOP7 0x8D,0x04,0x05,0,0,0,0 +#define K7_NOP8 K7_NOP7,K7_NOP1 +#define K7_NOP5_ATOMIC NOP_DS_PREFIX,K7_NOP4 /* P6 nops uses eax dependencies (Intel-recommended choice) @@ -69,52 +78,65 @@ There is kernel code that depends on this. */ #define P6_NOP1 GENERIC_NOP1 -#define P6_NOP2 ".byte 0x66,0x90\n" -#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n" -#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n" -#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n" -#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n" -#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n" -#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n" +#define P6_NOP2 0x66,0x90 +#define P6_NOP3 0x0f,0x1f,0x00 +#define P6_NOP4 0x0f,0x1f,0x40,0 +#define P6_NOP5 0x0f,0x1f,0x44,0x00,0 +#define P6_NOP6 0x66,0x0f,0x1f,0x44,0x00,0 +#define P6_NOP7 0x0f,0x1f,0x80,0,0,0,0 +#define P6_NOP8 0x0f,0x1f,0x84,0x00,0,0,0,0 +#define P6_NOP5_ATOMIC P6_NOP5 + +#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n" #if defined(CONFIG_MK7) -#define ASM_NOP1 K7_NOP1 -#define ASM_NOP2 K7_NOP2 -#define ASM_NOP3 K7_NOP3 -#define ASM_NOP4 K7_NOP4 -#define ASM_NOP5 K7_NOP5 -#define ASM_NOP6 K7_NOP6 -#define ASM_NOP7 K7_NOP7 -#define ASM_NOP8 K7_NOP8 +#define ASM_NOP1 _ASM_MK_NOP(K7_NOP1) +#define ASM_NOP2 _ASM_MK_NOP(K7_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(K7_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(K7_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(K7_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(K7_NOP6) +#define ASM_NOP7 _ASM_MK_NOP(K7_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(K7_NOP8) +#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K7_NOP5_ATOMIC) #elif defined(CONFIG_X86_P6_NOP) -#define ASM_NOP1 P6_NOP1 -#define ASM_NOP2 P6_NOP2 -#define ASM_NOP3 P6_NOP3 -#define ASM_NOP4 P6_NOP4 -#define ASM_NOP5 P6_NOP5 -#define ASM_NOP6 P6_NOP6 -#define ASM_NOP7 P6_NOP7 -#define ASM_NOP8 P6_NOP8 +#define ASM_NOP1 _ASM_MK_NOP(P6_NOP1) +#define ASM_NOP2 _ASM_MK_NOP(P6_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(P6_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(P6_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(P6_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(P6_NOP6) +#define ASM_NOP7 _ASM_MK_NOP(P6_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(P6_NOP8) +#define ASM_NOP5_ATOMIC _ASM_MK_NOP(P6_NOP5_ATOMIC) #elif defined(CONFIG_X86_64) -#define ASM_NOP1 K8_NOP1 -#define ASM_NOP2 K8_NOP2 -#define ASM_NOP3 K8_NOP3 -#define ASM_NOP4 K8_NOP4 -#define ASM_NOP5 K8_NOP5 -#define ASM_NOP6 K8_NOP6 -#define ASM_NOP7 K8_NOP7 -#define ASM_NOP8 K8_NOP8 +#define ASM_NOP1 _ASM_MK_NOP(K8_NOP1) +#define ASM_NOP2 _ASM_MK_NOP(K8_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(K8_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(K8_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(K8_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(K8_NOP6) +#define ASM_NOP7 _ASM_MK_NOP(K8_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(K8_NOP8) +#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K8_NOP5_ATOMIC) #else -#define ASM_NOP1 GENERIC_NOP1 -#define ASM_NOP2 GENERIC_NOP2 -#define ASM_NOP3 GENERIC_NOP3 -#define ASM_NOP4 GENERIC_NOP4 -#define ASM_NOP5 GENERIC_NOP5 -#define ASM_NOP6 GENERIC_NOP6 -#define ASM_NOP7 GENERIC_NOP7 -#define ASM_NOP8 GENERIC_NOP8 +#define ASM_NOP1 _ASM_MK_NOP(GENERIC_NOP1) +#define ASM_NOP2 _ASM_MK_NOP(GENERIC_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(GENERIC_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(GENERIC_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(GENERIC_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(GENERIC_NOP6) +#define ASM_NOP7 _ASM_MK_NOP(GENERIC_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(GENERIC_NOP8) +#define ASM_NOP5_ATOMIC _ASM_MK_NOP(GENERIC_NOP5_ATOMIC) #endif #define ASM_NOP_MAX 8 +#define NOP_ATOMIC5 (ASM_NOP_MAX+1) /* Entry for the 5-byte atomic NOP */ + +#ifndef __ASSEMBLY__ +extern const unsigned char * const *ideal_nops; +extern void arch_init_ideal_nops(void); +#endif #endif /* _ASM_X86_NOPS_H */ diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index a50fc9f493b3..bfacd2ccf651 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -1,12 +1,24 @@ #ifndef _ASM_X86_NUMA_H #define _ASM_X86_NUMA_H +#include <linux/nodemask.h> + #include <asm/topology.h> #include <asm/apicdef.h> #ifdef CONFIG_NUMA #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) +#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) + +/* + * Too small node sizes may confuse the VM badly. Usually they + * result from BIOS bugs. So dont recognize nodes as standalone + * NUMA entities that have less than this amount of RAM listed: + */ +#define NODE_MIN_SIZE (4*1024*1024) + +extern int numa_off; /* * __apicid_to_node[] stores the raw mapping between physical apicid and @@ -17,15 +29,27 @@ * numa_cpu_node(). */ extern s16 __apicid_to_node[MAX_LOCAL_APIC]; +extern nodemask_t numa_nodes_parsed __initdata; + +extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); +extern void __init numa_set_distance(int from, int to, int distance); static inline void set_apicid_to_node(int apicid, s16 node) { __apicid_to_node[apicid] = node; } + +extern int __cpuinit numa_cpu_node(int cpu); + #else /* CONFIG_NUMA */ static inline void set_apicid_to_node(int apicid, s16 node) { } + +static inline int numa_cpu_node(int cpu) +{ + return NUMA_NO_NODE; +} #endif /* CONFIG_NUMA */ #ifdef CONFIG_X86_32 @@ -37,14 +61,12 @@ static inline void set_apicid_to_node(int apicid, s16 node) #ifdef CONFIG_NUMA extern void __cpuinit numa_set_node(int cpu, int node); extern void __cpuinit numa_clear_node(int cpu); -extern void __init numa_init_array(void); extern void __init init_cpu_to_node(void); extern void __cpuinit numa_add_cpu(int cpu); extern void __cpuinit numa_remove_cpu(int cpu); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } -static inline void numa_init_array(void) { } static inline void init_cpu_to_node(void) { } static inline void numa_add_cpu(int cpu) { } static inline void numa_remove_cpu(int cpu) { } @@ -54,4 +76,10 @@ static inline void numa_remove_cpu(int cpu) { } void debug_cpumask_set_cpu(int cpu, int node, bool enable); #endif +#ifdef CONFIG_NUMA_EMU +#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) +void numa_emu_cmdline(char *); +#endif /* CONFIG_NUMA_EMU */ + #endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index c6beed1ef103..e7d6b8254742 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h @@ -1,16 +1,6 @@ #ifndef _ASM_X86_NUMA_32_H #define _ASM_X86_NUMA_32_H -extern int numa_off; - -extern int pxm_to_nid(int pxm); - -#ifdef CONFIG_NUMA -extern int __cpuinit numa_cpu_node(int cpu); -#else /* CONFIG_NUMA */ -static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } -#endif /* CONFIG_NUMA */ - #ifdef CONFIG_HIGHMEM extern void set_highmem_pages_init(void); #else diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 344eb1790b46..0c05f7ae46e8 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -1,42 +1,6 @@ #ifndef _ASM_X86_NUMA_64_H #define _ASM_X86_NUMA_64_H -#include <linux/nodemask.h> - -struct bootnode { - u64 start; - u64 end; -}; - -#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) - -extern int numa_off; - extern unsigned long numa_free_all_bootmem(void); -extern void setup_node_bootmem(int nodeid, unsigned long start, - unsigned long end); - -#ifdef CONFIG_NUMA -/* - * Too small node sizes may confuse the VM badly. Usually they - * result from BIOS bugs. So dont recognize nodes as standalone - * NUMA entities that have less than this amount of RAM listed: - */ -#define NODE_MIN_SIZE (4*1024*1024) - -extern nodemask_t numa_nodes_parsed __initdata; - -extern int __cpuinit numa_cpu_node(int cpu); -extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); -extern void __init numa_set_distance(int from, int to, int distance); - -#ifdef CONFIG_NUMA_EMU -#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) -#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -void numa_emu_cmdline(char *); -#endif /* CONFIG_NUMA_EMU */ -#else -static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } -#endif #endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h index 37c516545ec8..c3b3c322fd87 100644 --- a/arch/x86/include/asm/numaq.h +++ b/arch/x86/include/asm/numaq.h @@ -29,7 +29,7 @@ #ifdef CONFIG_X86_NUMAQ extern int found_numaq; -extern int get_memcfg_numaq(void); +extern int numaq_numa_init(void); extern int pci_numaq_init(void); extern void *xquad_portio; @@ -166,11 +166,6 @@ struct sys_cfg_data { void numaq_tsc_disable(void); -#else -static inline int get_memcfg_numaq(void) -{ - return 0; -} #endif /* CONFIG_X86_NUMAQ */ #endif /* _ASM_X86_NUMAQ_H */ diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h index c5d3a5abbb9f..24487712e0b1 100644 --- a/arch/x86/include/asm/olpc_ofw.h +++ b/arch/x86/include/asm/olpc_ofw.h @@ -26,15 +26,12 @@ extern void setup_olpc_ofw_pgd(void); /* check if OFW was detected during boot */ extern bool olpc_ofw_present(void); +extern void olpc_dt_build_devicetree(void); + #else /* !CONFIG_OLPC */ static inline void olpc_ofw_detect(void) { } static inline void setup_olpc_ofw_pgd(void) { } -#endif /* !CONFIG_OLPC */ - -#ifdef CONFIG_OF_PROMTREE -extern void olpc_dt_build_devicetree(void); -#else static inline void olpc_dt_build_devicetree(void) { } -#endif +#endif /* !CONFIG_OLPC */ #endif /* _ASM_X86_OLPC_OFW_H */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index d475b4398d8b..53278b0dfdf6 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -517,7 +517,7 @@ do { \ typeof(o2) __o2 = o2; \ typeof(o2) __n2 = n2; \ typeof(o2) __dummy; \ - alternative_io("call this_cpu_cmpxchg16b_emu\n\t" P6_NOP4, \ + alternative_io("call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP4, \ "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \ X86_FEATURE_CX16, \ ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \ @@ -542,6 +542,33 @@ do { \ old__; \ }) +static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr, + const unsigned long __percpu *addr) +{ + unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG; + + return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0; +} + +static inline int x86_this_cpu_variable_test_bit(int nr, + const unsigned long __percpu *addr) +{ + int oldbit; + + asm volatile("bt "__percpu_arg(2)",%1\n\t" + "sbb %0,%0" + : "=r" (oldbit) + : "m" (*(unsigned long *)addr), "Ir" (nr)); + + return oldbit; +} + +#define x86_this_cpu_test_bit(nr, addr) \ + (__builtin_constant_p((nr)) \ + ? x86_this_cpu_constant_test_bit((nr), (addr)) \ + : x86_this_cpu_variable_test_bit((nr), (addr))) + + #include <asm-generic/percpu.h> /* We can use this directly for local CPU (faster). */ diff --git a/arch/x86/include/asm/probe_roms.h b/arch/x86/include/asm/probe_roms.h new file mode 100644 index 000000000000..4950a0b1d09c --- /dev/null +++ b/arch/x86/include/asm/probe_roms.h @@ -0,0 +1,8 @@ +#ifndef _PROBE_ROMS_H_ +#define _PROBE_ROMS_H_ +struct pci_dev; + +extern void __iomem *pci_map_biosrom(struct pci_dev *pdev); +extern void pci_unmap_biosrom(void __iomem *rom); +extern size_t pci_biosrom_size(struct pci_dev *pdev); +#endif diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index db8aa19a08a2..03d3a32ace20 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -104,10 +104,10 @@ void *extend_brk(size_t size, size_t align); type *name; \ RESERVE_BRK(name, sizeof(type) * entries) +extern void probe_roms(void); #ifdef __i386__ void __init i386_start_kernel(void); -extern void probe_roms(void); #else void __init x86_64_start_kernel(char *real_mode); diff --git a/arch/x86/include/asm/srat.h b/arch/x86/include/asm/srat.h deleted file mode 100644 index b508d639d1a7..000000000000 --- a/arch/x86/include/asm/srat.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Some of the code in this file has been gleaned from the 64 bit - * discontigmem support code base. - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to Pat Gaughen <gone@us.ibm.com> - */ - -#ifndef _ASM_X86_SRAT_H -#define _ASM_X86_SRAT_H - -#ifdef CONFIG_ACPI_NUMA -extern int get_memcfg_from_srat(void); -#else -static inline int get_memcfg_from_srat(void) -{ - return 0; -} -#endif - -#endif /* _ASM_X86_SRAT_H */ diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 12569e691ce3..c2ff2a1d845e 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -303,24 +303,81 @@ static inline void native_wbinvd(void) #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else -#define read_cr0() (native_read_cr0()) -#define write_cr0(x) (native_write_cr0(x)) -#define read_cr2() (native_read_cr2()) -#define write_cr2(x) (native_write_cr2(x)) -#define read_cr3() (native_read_cr3()) -#define write_cr3(x) (native_write_cr3(x)) -#define read_cr4() (native_read_cr4()) -#define read_cr4_safe() (native_read_cr4_safe()) -#define write_cr4(x) (native_write_cr4(x)) -#define wbinvd() (native_wbinvd()) + +static inline unsigned long read_cr0(void) +{ + return native_read_cr0(); +} + +static inline void write_cr0(unsigned long x) +{ + native_write_cr0(x); +} + +static inline unsigned long read_cr2(void) +{ + return native_read_cr2(); +} + +static inline void write_cr2(unsigned long x) +{ + native_write_cr2(x); +} + +static inline unsigned long read_cr3(void) +{ + return native_read_cr3(); +} + +static inline void write_cr3(unsigned long x) +{ + native_write_cr3(x); +} + +static inline unsigned long read_cr4(void) +{ + return native_read_cr4(); +} + +static inline unsigned long read_cr4_safe(void) +{ + return native_read_cr4_safe(); +} + +static inline void write_cr4(unsigned long x) +{ + native_write_cr4(x); +} + +static inline void wbinvd(void) +{ + native_wbinvd(); +} + #ifdef CONFIG_X86_64 -#define read_cr8() (native_read_cr8()) -#define write_cr8(x) (native_write_cr8(x)) -#define load_gs_index native_load_gs_index + +static inline unsigned long read_cr8(void) +{ + return native_read_cr8(); +} + +static inline void write_cr8(unsigned long x) +{ + native_write_cr8(x); +} + +static inline void load_gs_index(unsigned selector) +{ + native_load_gs_index(selector); +} + #endif /* Clear the 'TS' bit */ -#define clts() (native_clts()) +static inline void clts(void) +{ + native_clts(); +} #endif/* CONFIG_PARAVIRT */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 910a7084f7f2..c00692476e9f 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -93,19 +93,11 @@ extern void setup_node_to_cpumask_map(void); #define pcibus_to_node(bus) __pcibus_to_node(bus) #ifdef CONFIG_X86_32 -extern unsigned long node_start_pfn[]; -extern unsigned long node_end_pfn[]; -extern unsigned long node_remap_size[]; -#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid]) - # define SD_CACHE_NICE_TRIES 1 # define SD_IDLE_IDX 1 - #else - # define SD_CACHE_NICE_TRIES 2 # define SD_IDLE_IDX 2 - #endif /* sched_domains SD_NODE_INIT for NUMA machines */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 7338ef2218bc..758b379be4c8 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -36,7 +36,7 @@ obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o -obj-$(CONFIG_X86_32) += probe_roms_32.o +obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 4a234677e213..c0501ea6b634 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -67,17 +67,30 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt); #define DPRINTK(fmt, args...) if (debug_alternative) \ printk(KERN_DEBUG fmt, args) +/* + * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes + * that correspond to that nop. Getting from one nop to the next, we + * add to the array the offset that is equal to the sum of all sizes of + * nops preceding the one we are after. + * + * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the + * nice symmetry of sizes of the previous nops. + */ #if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) -/* Use inline assembly to define this because the nops are defined - as inline assembly strings in the include files and we cannot - get them easily into strings. */ -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: " - GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 - GENERIC_NOP7 GENERIC_NOP8 - "\t.previous"); -extern const unsigned char intelnops[]; -static const unsigned char *const __initconst_or_module -intel_nops[ASM_NOP_MAX+1] = { +static const unsigned char intelnops[] = +{ + GENERIC_NOP1, + GENERIC_NOP2, + GENERIC_NOP3, + GENERIC_NOP4, + GENERIC_NOP5, + GENERIC_NOP6, + GENERIC_NOP7, + GENERIC_NOP8, + GENERIC_NOP5_ATOMIC +}; +static const unsigned char * const intel_nops[ASM_NOP_MAX+2] = +{ NULL, intelnops, intelnops + 1, @@ -87,17 +100,25 @@ intel_nops[ASM_NOP_MAX+1] = { intelnops + 1 + 2 + 3 + 4 + 5, intelnops + 1 + 2 + 3 + 4 + 5 + 6, intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif #ifdef K8_NOP1 -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: " - K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 - K8_NOP7 K8_NOP8 - "\t.previous"); -extern const unsigned char k8nops[]; -static const unsigned char *const __initconst_or_module -k8_nops[ASM_NOP_MAX+1] = { +static const unsigned char k8nops[] = +{ + K8_NOP1, + K8_NOP2, + K8_NOP3, + K8_NOP4, + K8_NOP5, + K8_NOP6, + K8_NOP7, + K8_NOP8, + K8_NOP5_ATOMIC +}; +static const unsigned char * const k8_nops[ASM_NOP_MAX+2] = +{ NULL, k8nops, k8nops + 1, @@ -107,17 +128,25 @@ k8_nops[ASM_NOP_MAX+1] = { k8nops + 1 + 2 + 3 + 4 + 5, k8nops + 1 + 2 + 3 + 4 + 5 + 6, k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif #if defined(K7_NOP1) && !defined(CONFIG_X86_64) -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: " - K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 - K7_NOP7 K7_NOP8 - "\t.previous"); -extern const unsigned char k7nops[]; -static const unsigned char *const __initconst_or_module -k7_nops[ASM_NOP_MAX+1] = { +static const unsigned char k7nops[] = +{ + K7_NOP1, + K7_NOP2, + K7_NOP3, + K7_NOP4, + K7_NOP5, + K7_NOP6, + K7_NOP7, + K7_NOP8, + K7_NOP5_ATOMIC +}; +static const unsigned char * const k7_nops[ASM_NOP_MAX+2] = +{ NULL, k7nops, k7nops + 1, @@ -127,17 +156,25 @@ k7_nops[ASM_NOP_MAX+1] = { k7nops + 1 + 2 + 3 + 4 + 5, k7nops + 1 + 2 + 3 + 4 + 5 + 6, k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif #ifdef P6_NOP1 -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: " - P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 - P6_NOP7 P6_NOP8 - "\t.previous"); -extern const unsigned char p6nops[]; -static const unsigned char *const __initconst_or_module -p6_nops[ASM_NOP_MAX+1] = { +static const unsigned char __initconst_or_module p6nops[] = +{ + P6_NOP1, + P6_NOP2, + P6_NOP3, + P6_NOP4, + P6_NOP5, + P6_NOP6, + P6_NOP7, + P6_NOP8, + P6_NOP5_ATOMIC +}; +static const unsigned char * const p6_nops[ASM_NOP_MAX+2] = +{ NULL, p6nops, p6nops + 1, @@ -147,47 +184,65 @@ p6_nops[ASM_NOP_MAX+1] = { p6nops + 1 + 2 + 3 + 4 + 5, p6nops + 1 + 2 + 3 + 4 + 5 + 6, p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif +/* Initialize these to a safe default */ #ifdef CONFIG_X86_64 +const unsigned char * const *ideal_nops = p6_nops; +#else +const unsigned char * const *ideal_nops = intel_nops; +#endif -extern char __vsyscall_0; -static const unsigned char *const *__init_or_module find_nop_table(void) +void __init arch_init_ideal_nops(void) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_has(X86_FEATURE_NOPL)) - return p6_nops; - else - return k8_nops; -} - -#else /* CONFIG_X86_64 */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + /* + * Due to a decoder implementation quirk, some + * specific Intel CPUs actually perform better with + * the "k8_nops" than with the SDM-recommended NOPs. + */ + if (boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model >= 0x0f && + boot_cpu_data.x86_model != 0x1c && + boot_cpu_data.x86_model != 0x26 && + boot_cpu_data.x86_model != 0x27 && + boot_cpu_data.x86_model < 0x30) { + ideal_nops = k8_nops; + } else if (boot_cpu_has(X86_FEATURE_NOPL)) { + ideal_nops = p6_nops; + } else { +#ifdef CONFIG_X86_64 + ideal_nops = k8_nops; +#else + ideal_nops = intel_nops; +#endif + } -static const unsigned char *const *__init_or_module find_nop_table(void) -{ - if (boot_cpu_has(X86_FEATURE_K8)) - return k8_nops; - else if (boot_cpu_has(X86_FEATURE_K7)) - return k7_nops; - else if (boot_cpu_has(X86_FEATURE_NOPL)) - return p6_nops; - else - return intel_nops; + default: +#ifdef CONFIG_X86_64 + ideal_nops = k8_nops; +#else + if (boot_cpu_has(X86_FEATURE_K8)) + ideal_nops = k8_nops; + else if (boot_cpu_has(X86_FEATURE_K7)) + ideal_nops = k7_nops; + else + ideal_nops = intel_nops; +#endif + } } -#endif /* CONFIG_X86_64 */ - /* Use this to add nops to a buffer, then text_poke the whole buffer. */ static void __init_or_module add_nops(void *insns, unsigned int len) { - const unsigned char *const *noptable = find_nop_table(); - while (len > 0) { unsigned int noplen = len; if (noplen > ASM_NOP_MAX) noplen = ASM_NOP_MAX; - memcpy(insns, noptable[noplen], noplen); + memcpy(insns, ideal_nops[noplen], noplen); insns += noplen; len -= noplen; } @@ -195,6 +250,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; +extern char __vsyscall_0; void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. @@ -678,29 +734,3 @@ void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) wrote_text = 0; __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); } - -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) - -#ifdef CONFIG_X86_64 -unsigned char ideal_nop5[5] = { 0x66, 0x66, 0x66, 0x66, 0x90 }; -#else -unsigned char ideal_nop5[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 }; -#endif - -void __init arch_init_ideal_nop5(void) -{ - /* - * There is no good nop for all x86 archs. This selection - * algorithm should be unified with the one in find_nop_table(), - * but this should be good enough for now. - * - * For cases other than the ones below, use the safe (as in - * always functional) defaults above. - */ -#ifdef CONFIG_X86_64 - /* Don't use these on 32 bits due to broken virtualizers */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - memcpy(ideal_nop5, p6_nops[5], 5); -#endif -} -#endif diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index cd1ffed4ee22..289e92862fd9 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -177,7 +177,6 @@ static struct clocksource clocksource_apbt = { .rating = APBT_CLOCKSOURCE_RATING, .read = apbt_read_clocksource, .mask = APBT_MASK, - .shift = APBT_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = apbt_restart_clocksource, }; @@ -543,14 +542,7 @@ static int apbt_clocksource_register(void) if (t1 == apbt_read_clocksource(&clocksource_apbt)) panic("APBT counter not counting. APBT disabled\n"); - /* - * initialize and register APBT clocksource - * convert that to ns/clock cycle - * mult = (ns/c) * 2^APBT_SHIFT - */ - clocksource_apbt.mult = div_sc(MSEC_PER_SEC, - (unsigned long) apbt_freq, APBT_SHIFT); - clocksource_register(&clocksource_apbt); + clocksource_register_khz(&clocksource_apbt, (u32)apbt_freq*1000); return 0; } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 73fb469908c6..3d2661ca6542 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -30,6 +30,22 @@ #include <asm/amd_nb.h> #include <asm/x86_init.h> +/* + * Using 512M as goal, in case kexec will load kernel_big + * that will do the on-position decompress, and could overlap with + * with the gart aperture that is used. + * Sequence: + * kernel_small + * ==> kexec (with kdump trigger path or gart still enabled) + * ==> kernel_small (gart area become e820_reserved) + * ==> kexec (with kdump trigger path or gart still enabled) + * ==> kerne_big (uncompressed size will be big than 64M or 128M) + * So don't use 512M below as gart iommu, leave the space for kernel + * code for safe. + */ +#define GART_MIN_ADDR (512ULL << 20) +#define GART_MAX_ADDR (1ULL << 32) + int gart_iommu_aperture; int gart_iommu_aperture_disabled __initdata; int gart_iommu_aperture_allowed __initdata; @@ -70,21 +86,9 @@ static u32 __init allocate_aperture(void) * memory. Unfortunately we cannot move it up because that would * make the IOMMU useless. */ - /* - * using 512M as goal, in case kexec will load kernel_big - * that will do the on position decompress, and could overlap with - * that position with gart that is used. - * sequende: - * kernel_small - * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) - * ==> kernel_small(gart area become e820_reserved) - * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) - * ==> kerne_big (uncompressed size will be big than 64M or 128M) - * so don't use 512M below as gart iommu, leave the space for kernel - * code for safe - */ - addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20); - if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) { + addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, + aper_size, aper_size); + if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) { printk(KERN_ERR "Cannot allocate aperture memory hole (%lx,%uK)\n", addr, aper_size>>10); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index fabf01eff771..f92a8e5d1e21 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -505,7 +505,7 @@ static void __cpuinit setup_APIC_timer(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); - if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) { + if (this_cpu_has(X86_FEATURE_ARAT)) { lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; /* Make LAPIC timer preferrable over percpu HPET */ lapic_clockevent.rating = 150; @@ -1237,6 +1237,17 @@ void __cpuinit setup_local_APIC(void) /* always use the value from LDR */ early_per_cpu(x86_cpu_to_logical_apicid, cpu) = logical_smp_processor_id(); + + /* + * Some NUMA implementations (NUMAQ) don't initialize apicid to + * node mapping during NUMA init. Now that logical apicid is + * guaranteed to be known, give it another chance. This is already + * a bit too late - percpu allocation has already happened without + * proper NUMA affinity. + */ + if (apic->x86_32_numa_cpu_node) + set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu), + apic->x86_32_numa_cpu_node(cpu)); #endif /* @@ -1812,30 +1823,41 @@ void smp_spurious_interrupt(struct pt_regs *regs) */ void smp_error_interrupt(struct pt_regs *regs) { - u32 v, v1; + u32 v0, v1; + u32 i = 0; + static const char * const error_interrupt_reason[] = { + "Send CS error", /* APIC Error Bit 0 */ + "Receive CS error", /* APIC Error Bit 1 */ + "Send accept error", /* APIC Error Bit 2 */ + "Receive accept error", /* APIC Error Bit 3 */ + "Redirectable IPI", /* APIC Error Bit 4 */ + "Send illegal vector", /* APIC Error Bit 5 */ + "Received illegal vector", /* APIC Error Bit 6 */ + "Illegal register address", /* APIC Error Bit 7 */ + }; exit_idle(); irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ - v = apic_read(APIC_ESR); + v0 = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); v1 = apic_read(APIC_ESR); ack_APIC_irq(); atomic_inc(&irq_err_count); - /* - * Here is what the APIC error bits mean: - * 0: Send CS error - * 1: Receive CS error - * 2: Send accept error - * 3: Receive accept error - * 4: Reserved - * 5: Send illegal vector - * 6: Received illegal vector - * 7: Illegal register address - */ - pr_debug("APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); + apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)", + smp_processor_id(), v0 , v1); + + v1 = v1 & 0xff; + while (v1) { + if (v1 & 0x1) + apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); + i++; + v1 >>= 1; + }; + + apic_printk(APIC_DEBUG, KERN_CONT "\n"); + irq_exit(); } @@ -2003,21 +2025,6 @@ void default_init_apic_ldr(void) apic_write(APIC_LDR, val); } -#ifdef CONFIG_X86_32 -int default_x86_32_numa_cpu_node(int cpu) -{ -#ifdef CONFIG_NUMA - int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); - - if (apicid != BAD_APICID) - return __apicid_to_node[apicid]; - return NUMA_NO_NODE; -#else - return 0; -#endif -} -#endif - /* * Power management */ diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index f1baa2dc087a..775b82bc655c 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -119,14 +119,6 @@ static void noop_apic_write(u32 reg, u32 v) WARN_ON_ONCE(cpu_has_apic && !disable_apic); } -#ifdef CONFIG_X86_32 -static int noop_x86_32_numa_cpu_node(int cpu) -{ - /* we're always on node 0 */ - return 0; -} -#endif - struct apic apic_noop = { .name = "noop", .probe = noop_probe, @@ -195,6 +187,5 @@ struct apic apic_noop = { #ifdef CONFIG_X86_32 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, - .x86_32_numa_cpu_node = noop_x86_32_numa_cpu_node, #endif }; diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 541a2e431659..d84ac5a584b5 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -253,5 +253,4 @@ struct apic apic_bigsmp = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, - .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, }; diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 3e9de4854c5b..70533de5bd29 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -510,11 +510,6 @@ static void es7000_setup_apic_routing(void) nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); } -static int es7000_numa_cpu_node(int cpu) -{ - return 0; -} - static int es7000_cpu_present_to_apicid(int mps_cpu) { if (!mps_cpu) @@ -688,7 +683,6 @@ struct apic __refdata apic_es7000_cluster = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = es7000_early_logical_apicid, - .x86_32_numa_cpu_node = es7000_numa_cpu_node, }; struct apic __refdata apic_es7000 = { @@ -752,5 +746,4 @@ struct apic __refdata apic_es7000 = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = es7000_early_logical_apicid, - .x86_32_numa_cpu_node = es7000_numa_cpu_node, }; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 6273eee5134b..30f13319e24b 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -48,8 +48,6 @@ #include <asm/e820.h> #include <asm/ipi.h> -#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) - int found_numaq; /* @@ -79,31 +77,20 @@ int quad_local_to_mp_bus_id[NR_CPUS/4][4]; static inline void numaq_register_node(int node, struct sys_cfg_data *scd) { struct eachquadmem *eq = scd->eq + node; + u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20; + u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20; + int ret; - node_set_online(node); - - /* Convert to pages */ - node_start_pfn[node] = - MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size); - - node_end_pfn[node] = - MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); - - memblock_x86_register_active_regions(node, node_start_pfn[node], - node_end_pfn[node]); - - memory_present(node, node_start_pfn[node], node_end_pfn[node]); - - node_remap_size[node] = node_memmap_size_bytes(node, - node_start_pfn[node], - node_end_pfn[node]); + node_set(node, numa_nodes_parsed); + ret = numa_add_memblk(node, start, end); + BUG_ON(ret < 0); } /* * Function: smp_dump_qct() * * Description: gets memory layout from the quad config table. This - * function also updates node_online_map with the nodes (quads) present. + * function also updates numa_nodes_parsed with the nodes (quads) present. */ static void __init smp_dump_qct(void) { @@ -112,7 +99,6 @@ static void __init smp_dump_qct(void) scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR); - nodes_clear(node_online_map); for_each_node(node) { if (scd->quads_present31_0 & (1 << node)) numaq_register_node(node, scd); @@ -282,14 +268,14 @@ static __init void early_check_numaq(void) } } -int __init get_memcfg_numaq(void) +int __init numaq_numa_init(void) { early_check_numaq(); if (!found_numaq) - return 0; + return -ENOENT; smp_dump_qct(); - return 1; + return 0; } #define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER) diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index fc84c7b61108..6541e471fd91 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -172,7 +172,6 @@ struct apic apic_default = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, - .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, }; extern struct apic apic_numaq; diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index e4b8059b414a..35bcd7d995a1 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -551,5 +551,4 @@ struct apic apic_summit = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = summit_early_logical_apicid, - .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, }; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3385ea26f684..ff1ae9b6464d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -105,20 +105,6 @@ static int cpu_missing; ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); -static int default_decode_mce(struct notifier_block *nb, unsigned long val, - void *data) -{ - pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n"); - pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n"); - - return NOTIFY_STOP; -} - -static struct notifier_block mce_dec_nb = { - .notifier_call = default_decode_mce, - .priority = -1, -}; - /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL @@ -212,6 +198,8 @@ void mce_log(struct mce *mce) static void print_mce(struct mce *m) { + int ret = 0; + pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); @@ -239,7 +227,11 @@ static void print_mce(struct mce *m) * Print out human-readable details about the MCE error, * (if the CPU has an implementation for that) */ - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + if (ret == NOTIFY_STOP) + return; + + pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ @@ -590,7 +582,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { mce_log(&m); atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); - add_taint(TAINT_MACHINE_CHECK); } /* @@ -1722,8 +1713,6 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { - atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); - mcheck_intel_therm_init(); return 0; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 6f8c5e9da97f..1b62c64ce294 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -187,8 +187,6 @@ static int therm_throt_process(bool new_event, int event, int level) this_cpu, level == CORE_LEVEL ? "Core" : "Package", state->count); - - add_taint(TAINT_MACHINE_CHECK); return 1; } if (old_event) { @@ -355,7 +353,6 @@ static void notify_thresholds(__u64 msr_val) static void intel_thermal_interrupt(void) { __u64 msr_val; - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); @@ -367,19 +364,19 @@ static void intel_thermal_interrupt(void) CORE_LEVEL) != 0) mce_log_therm_throt_event(CORE_THROTTLED | msr_val); - if (cpu_has(c, X86_FEATURE_PLN)) + if (this_cpu_has(X86_FEATURE_PLN)) if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, CORE_LEVEL) != 0) mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); - if (cpu_has(c, X86_FEATURE_PTS)) { + if (this_cpu_has(X86_FEATURE_PTS)) { rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, PACKAGE_LEVEL) != 0) mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); - if (cpu_has(c, X86_FEATURE_PLN)) + if (this_cpu_has(X86_FEATURE_PLN)) if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, @@ -393,7 +390,6 @@ static void unexpected_thermal_interrupt(void) { printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", smp_processor_id()); - add_taint(TAINT_MACHINE_CHECK); } static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e638689279d3..0de6b2b31f61 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -31,6 +31,7 @@ #include <asm/nmi.h> #include <asm/compat.h> #include <asm/smp.h> +#include <asm/alternative.h> #if 0 #undef wrmsrl @@ -363,12 +364,18 @@ again: return new_raw_count; } -/* using X86_FEATURE_PERFCTR_CORE to later implement ALTERNATIVE() here */ static inline int x86_pmu_addr_offset(int index) { - if (boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) - return index << 1; - return index; + int offset; + + /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */ + alternative_io(ASM_NOP2, + "shll $1, %%eax", + X86_FEATURE_PERFCTR_CORE, + "=a" (offset), + "a" (index)); + + return offset; } static inline unsigned int x86_pmu_config_addr(int index) diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index cf4e369cea67..fe29c1d2219e 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -96,12 +96,14 @@ static __initconst const u64 amd_hw_cache_event_ids */ static const u64 amd_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ }; static u64 amd_pmu_event_map(int hw_event) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 447a28de6f09..41178c826c48 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -36,7 +36,7 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; -static struct event_constraint intel_core_event_constraints[] = +static struct event_constraint intel_core_event_constraints[] __read_mostly = { INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -47,7 +47,7 @@ static struct event_constraint intel_core_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_core2_event_constraints[] = +static struct event_constraint intel_core2_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -70,7 +70,7 @@ static struct event_constraint intel_core2_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_nehalem_event_constraints[] = +static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -86,19 +86,19 @@ static struct event_constraint intel_nehalem_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct extra_reg intel_nehalem_extra_regs[] = +static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = { INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), EVENT_EXTRA_END }; -static struct event_constraint intel_nehalem_percore_constraints[] = +static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly = { INTEL_EVENT_CONSTRAINT(0xb7, 0), EVENT_CONSTRAINT_END }; -static struct event_constraint intel_westmere_event_constraints[] = +static struct event_constraint intel_westmere_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -110,7 +110,7 @@ static struct event_constraint intel_westmere_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_snb_event_constraints[] = +static struct event_constraint intel_snb_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -123,21 +123,21 @@ static struct event_constraint intel_snb_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct extra_reg intel_westmere_extra_regs[] = +static struct extra_reg intel_westmere_extra_regs[] __read_mostly = { INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff), EVENT_EXTRA_END }; -static struct event_constraint intel_westmere_percore_constraints[] = +static struct event_constraint intel_westmere_percore_constraints[] __read_mostly = { INTEL_EVENT_CONSTRAINT(0xb7, 0), INTEL_EVENT_CONSTRAINT(0xbb, 0), EVENT_CONSTRAINT_END }; -static struct event_constraint intel_gen_event_constraints[] = +static struct event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -1440,6 +1440,11 @@ static __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; + if (ebx & 0x40) { /* * Erratum AAJ80 detected, we work it around by using @@ -1480,6 +1485,12 @@ static __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; x86_pmu.extra_regs = intel_westmere_extra_regs; + + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; + pr_cont("Westmere events, "); break; @@ -1491,6 +1502,12 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_events; + + /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1; + pr_cont("SandyBridge events, "); break; diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index e93fcd55fae1..ead584fb6a7d 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -468,7 +468,7 @@ static struct p4_event_bind p4_event_bind_map[] = { .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_X87_ASSIST] = { @@ -912,8 +912,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); @@ -1197,7 +1196,7 @@ static __init int p4_pmu_init(void) { unsigned int low, high; - /* If we get stripped -- indexig fails */ + /* If we get stripped -- indexing fails */ BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); rdmsr(MSR_IA32_MISC_ENABLE, low, high); diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index a93742a57468..0ba15a6cc57e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -260,9 +260,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) return mod_code_status; } -static unsigned char *ftrace_nop_replace(void) +static const unsigned char *ftrace_nop_replace(void) { - return ideal_nop5; + return ideal_nops[NOP_ATOMIC5]; } static int diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index d6d6bb361931..3bb08509a7a1 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -23,7 +23,6 @@ static void __init i386_default_early_setup(void) { /* Initialize 32bit specific setup functions */ - x86_init.resources.probe_roms = probe_roms; x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 2dfd31597443..212fe6590aab 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -188,8 +188,6 @@ static struct clocksource pit_cs = { .rating = 110, .read = pit_read, .mask = CLOCKSOURCE_MASK(32), - .mult = 0, - .shift = 20, }; static int __init init_pit_clocksource(void) @@ -205,9 +203,7 @@ static int __init init_pit_clocksource(void) pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) return 0; - pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift); - - return clocksource_register(&pit_cs); + return clocksource_register_hz(&pit_cs, CLOCK_TICK_RATE); } arch_initcall(init_pit_clocksource); diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 961b6b30ba90..3fee346ef545 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -34,7 +34,7 @@ void arch_jump_label_transform(struct jump_entry *entry, code.offset = entry->target - (entry->code + JUMP_LABEL_NOP_SIZE); } else - memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE); + memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); get_online_cpus(); mutex_lock(&text_mutex); text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); @@ -44,7 +44,8 @@ void arch_jump_label_transform(struct jump_entry *entry, void arch_jump_label_text_poke_early(jump_label_t addr) { - text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE); + text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5], + JUMP_LABEL_NOP_SIZE); } #endif diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index f98d3eafe07a..6389a6bca11b 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -26,8 +26,6 @@ #include <asm/x86_init.h> #include <asm/reboot.h> -#define KVM_SCALE 22 - static int kvmclock = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; @@ -120,8 +118,6 @@ static struct clocksource kvm_clock = { .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), - .mult = 1 << KVM_SCALE, - .shift = KVM_SCALE, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -203,7 +199,7 @@ void __init kvmclock_init(void) machine_ops.crash_shutdown = kvm_crash_shutdown; #endif kvm_get_preset_lpj(); - clocksource_register(&kvm_clock); + clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.paravirt_enabled = 1; pv_info.name = "KVM"; diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index ab23f1ad4bf1..52f256f2cc81 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -24,6 +24,7 @@ #include <linux/bug.h> #include <linux/mm.h> #include <linux/gfp.h> +#include <linux/jump_label.h> #include <asm/system.h> #include <asm/page.h> diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 5a532ce646bf..6f9bfffb2720 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -715,17 +715,15 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) } } -static int +static int __init check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) { - int ret = 0; - if (!mpc_new_phys || count <= mpc_new_length) { WARN(1, "update_mptable: No spare slots (length: %x)\n", count); return -1; } - return ret; + return 0; } #else /* CONFIG_X86_IO_APIC */ static diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms.c index 071e7fea42e5..ba0a4cce53be 100644 --- a/arch/x86/kernel/probe_roms_32.c +++ b/arch/x86/kernel/probe_roms.c @@ -73,6 +73,107 @@ static struct resource video_rom_resource = { .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM }; +/* does this oprom support the given pci device, or any of the devices + * that the driver supports? + */ +static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device) +{ + struct pci_driver *drv = pdev->driver; + const struct pci_device_id *id; + + if (pdev->vendor == vendor && pdev->device == device) + return true; + + for (id = drv ? drv->id_table : NULL; id && id->vendor; id++) + if (id->vendor == vendor && id->device == device) + break; + + return id && id->vendor; +} + +static bool probe_list(struct pci_dev *pdev, unsigned short vendor, + const unsigned char *rom_list) +{ + unsigned short device; + + do { + if (probe_kernel_address(rom_list, device) != 0) + device = 0; + + if (device && match_id(pdev, vendor, device)) + break; + + rom_list += 2; + } while (device); + + return !!device; +} + +static struct resource *find_oprom(struct pci_dev *pdev) +{ + struct resource *oprom = NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) { + struct resource *res = &adapter_rom_resources[i]; + unsigned short offset, vendor, device, list, rev; + const unsigned char *rom; + + if (res->end == 0) + break; + + rom = isa_bus_to_virt(res->start); + if (probe_kernel_address(rom + 0x18, offset) != 0) + continue; + + if (probe_kernel_address(rom + offset + 0x4, vendor) != 0) + continue; + + if (probe_kernel_address(rom + offset + 0x6, device) != 0) + continue; + + if (match_id(pdev, vendor, device)) { + oprom = res; + break; + } + + if (probe_kernel_address(rom + offset + 0x8, list) == 0 && + probe_kernel_address(rom + offset + 0xc, rev) == 0 && + rev >= 3 && list && + probe_list(pdev, vendor, rom + offset + list)) { + oprom = res; + break; + } + } + + return oprom; +} + +void *pci_map_biosrom(struct pci_dev *pdev) +{ + struct resource *oprom = find_oprom(pdev); + + if (!oprom) + return NULL; + + return ioremap(oprom->start, resource_size(oprom)); +} +EXPORT_SYMBOL(pci_map_biosrom); + +void pci_unmap_biosrom(void __iomem *image) +{ + iounmap(image); +} +EXPORT_SYMBOL(pci_unmap_biosrom); + +size_t pci_biosrom_size(struct pci_dev *pdev) +{ + struct resource *oprom = find_oprom(pdev); + + return oprom ? resource_size(oprom) : 0; +} +EXPORT_SYMBOL(pci_biosrom_size); + #define ROMSIGNATURE 0xaa55 static int __init romsignature(const unsigned char *rom) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index d46cbe46b7ab..88a90a977f8e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -449,7 +449,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { if (!need_resched()) { - if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); __monitor((void *)¤t_thread_info()->flags, 0, 0); @@ -465,7 +465,7 @@ static void mwait_idle(void) if (!need_resched()) { trace_power_start(POWER_CSTATE, 1, smp_processor_id()); trace_cpu_idle(1, smp_processor_id()); - if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); __monitor((void *)¤t_thread_info()->flags, 0, 0); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 08c44b08bf5b..0c016f727695 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -36,7 +36,7 @@ EXPORT_SYMBOL(pm_power_off); static const struct desc_ptr no_idt = {}; static int reboot_mode; -enum reboot_type reboot_type = BOOT_KBD; +enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) @@ -478,9 +478,24 @@ void __attribute__((weak)) mach_reboot_fixups(void) { } +/* + * Windows compatible x86 hardware expects the following on reboot: + * + * 1) If the FADT has the ACPI reboot register flag set, try it + * 2) If still alive, write to the keyboard controller + * 3) If still alive, write to the ACPI reboot register again + * 4) If still alive, write to the keyboard controller again + * + * If the machine is still alive at this stage, it gives up. We default to + * following the same pattern, except that if we're still alive after (4) we'll + * try to force a triple fault and then cycle between hitting the keyboard + * controller and doing that + */ static void native_machine_emergency_restart(void) { int i; + int attempt = 0; + int orig_reboot_type = reboot_type; if (reboot_emergency) emergency_vmx_disable_all(); @@ -502,6 +517,13 @@ static void native_machine_emergency_restart(void) outb(0xfe, 0x64); /* pulse reset low */ udelay(50); } + if (attempt == 0 && orig_reboot_type == BOOT_ACPI) { + attempt = 1; + reboot_type = BOOT_ACPI; + } else { + reboot_type = BOOT_TRIPLE; + } + break; case BOOT_TRIPLE: load_idt(&no_idt); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cab4f24e2177..605e5ae19c7f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -691,8 +691,6 @@ early_param("reservelow", parse_reservelow); void __init setup_arch(char **cmdline_p) { - unsigned long flags; - #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); @@ -1040,9 +1038,7 @@ void __init setup_arch(char **cmdline_p) mcheck_init(); - local_irq_save(flags); - arch_init_ideal_nop5(); - local_irq_restore(flags); + arch_init_ideal_nops(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 513deac7228d..013e7eba83bb 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -194,14 +194,13 @@ static void native_stop_other_cpus(int wait) } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. */ void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + scheduler_ipi(); /* * KVM uses this interrupt to force a cpu out of guest mode */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c2871d3c71b6..a3c430bdfb60 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1332,9 +1332,9 @@ static inline void mwait_play_dead(void) void *mwait_ptr; struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info); - if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c))) + if (!this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)) return; - if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH)) + if (!this_cpu_has(X86_FEATURE_CLFLSH)) return; if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) return; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index c11514e9128b..6eee0828e327 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -35,7 +35,7 @@ void iommu_shutdown_noop(void) { } struct x86_init_ops x86_init __initdata = { .resources = { - .probe_roms = x86_init_noop, + .probe_roms = probe_roms, .reserve_resources = reserve_standard_io_resources, .memory_setup = default_machine_specific_memory_setup, }, diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 083c95d8a87d..db832fd65ecb 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -913,8 +913,6 @@ static struct clocksource lguest_clock = { .rating = 200, .read = lguest_clock_read, .mask = CLOCKSOURCE_MASK(64), - .mult = 1 << 22, - .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -998,7 +996,7 @@ static void lguest_time_init(void) lguest_setup_irq(0); irq_set_handler(0, lguest_time_irq); - clocksource_register(&lguest_clock); + clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); /* We can't set cpumask in the initializer: damn C limitations! Set it * here and register our timer device. */ diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 75ef61e35e38..2a560bb2573b 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -49,7 +49,7 @@ ENTRY(memcpy) jb .Lhandle_tail /* - * We check whether memory false dependece could occur, + * We check whether memory false dependence could occur, * then jump to corresponding copy mode. */ cmp %dil, %sil diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 3e608edf9958..3d11327c9ab4 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -23,8 +23,8 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o -obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o -obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o +obj-$(CONFIG_AMD_NUMA) += amdtopology.o +obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology.c index 0919c26820d4..5247d01329ca 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/nodemask.h> #include <linux/memblock.h> +#include <linux/bootmem.h> #include <asm/io.h> #include <linux/pci_ids.h> @@ -69,10 +70,10 @@ static __init void early_get_boot_cpu_id(void) int __init amd_numa_init(void) { - unsigned long start = PFN_PHYS(0); - unsigned long end = PFN_PHYS(max_pfn); + u64 start = PFN_PHYS(0); + u64 end = PFN_PHYS(max_pfn); unsigned numnodes; - unsigned long prevbase; + u64 prevbase; int i, j, nb; u32 nodeid, reg; unsigned int bits, cores, apicid_base; @@ -95,7 +96,7 @@ int __init amd_numa_init(void) prevbase = 0; for (i = 0; i < 8; i++) { - unsigned long base, limit; + u64 base, limit; base = read_pci_config(0, nb, 1, 0x40 + i*8); limit = read_pci_config(0, nb, 1, 0x44 + i*8); @@ -107,18 +108,18 @@ int __init amd_numa_init(void) continue; } if (nodeid >= numnodes) { - pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, + pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid, base, limit); continue; } if (!limit) { - pr_info("Skipping node entry %d (base %lx)\n", + pr_info("Skipping node entry %d (base %Lx)\n", i, base); continue; } if ((base >> 8) & 3 || (limit >> 8) & 3) { - pr_err("Node %d using interleaving mode %lx/%lx\n", + pr_err("Node %d using interleaving mode %Lx/%Lx\n", nodeid, (base >> 8) & 3, (limit >> 8) & 3); return -EINVAL; } @@ -150,19 +151,19 @@ int __init amd_numa_init(void) continue; } if (limit < base) { - pr_err("Node %d bogus settings %lx-%lx.\n", + pr_err("Node %d bogus settings %Lx-%Lx.\n", nodeid, base, limit); continue; } /* Could sort here, but pun for now. Should not happen anyroads. */ if (prevbase > base) { - pr_err("Node map not sorted %lx,%lx\n", + pr_err("Node map not sorted %Lx,%Lx\n", prevbase, base); return -EINVAL; } - pr_info("Node %d MemBase %016lx Limit %016lx\n", + pr_info("Node %d MemBase %016Lx Limit %016Lx\n", nodeid, base, limit); prevbase = base; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 80088f994193..2cde0a34bed6 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -716,6 +716,7 @@ void __init paging_init(void) * NOTE: at this point the bootmem allocator is fully available. */ olpc_dt_build_devicetree(); + sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); zone_sizes_init(); } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 794233587287..0404bb3a077e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -679,14 +679,6 @@ int arch_add_memory(int nid, u64 start, u64 size) } EXPORT_SYMBOL_GPL(arch_add_memory); -#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) -int memory_add_physaddr_to_nid(u64 start) -{ - return 0; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - #endif /* CONFIG_MEMORY_HOTPLUG */ static struct kcore_list kcore_vsyscall; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0369843511dc..be1ef574ce9a 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -91,13 +91,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return (__force void __iomem *)phys_to_virt(phys_addr); /* - * Check if the request spans more than any BAR in the iomem resource - * tree. - */ - WARN_ONCE(iomem_map_sanity_check(phys_addr, size), - KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); - - /* * Don't allow anybody to remap normal RAM that we're using.. */ last_pfn = last_addr >> PAGE_SHIFT; @@ -170,6 +163,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, ret_addr = (void __iomem *) (vaddr + offset); mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); + /* + * Check if the request spans more than any BAR in the iomem resource + * tree. + */ + WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size), + KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); + return ret_addr; err_free_area: free_vm_area(area); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 745258dfc4dc..f5510d889a22 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -1,11 +1,39 @@ /* Common code for 32 and 64-bit NUMA */ -#include <linux/topology.h> -#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/init.h> #include <linux/bootmem.h> -#include <asm/numa.h> +#include <linux/memblock.h> +#include <linux/mmzone.h> +#include <linux/ctype.h> +#include <linux/module.h> +#include <linux/nodemask.h> +#include <linux/sched.h> +#include <linux/topology.h> + +#include <asm/e820.h> +#include <asm/proto.h> +#include <asm/dma.h> #include <asm/acpi.h> +#include <asm/amd_nb.h> + +#include "numa_internal.h" int __initdata numa_off; +nodemask_t numa_nodes_parsed __initdata; + +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); + +static struct numa_meminfo numa_meminfo +#ifndef CONFIG_MEMORY_HOTPLUG +__initdata +#endif +; + +static int numa_distance_cnt; +static u8 *numa_distance; static __init int numa_setup(char *opt) { @@ -32,6 +60,15 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; +int __cpuinit numa_cpu_node(int cpu) +{ + int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + + if (apicid != BAD_APICID) + return __apicid_to_node[apicid]; + return NUMA_NO_NODE; +} + cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; EXPORT_SYMBOL(node_to_cpumask_map); @@ -95,6 +132,407 @@ void __init setup_node_to_cpumask_map(void) pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); } +static int __init numa_add_memblk_to(int nid, u64 start, u64 end, + struct numa_meminfo *mi) +{ + /* ignore zero length blks */ + if (start == end) + return 0; + + /* whine about and ignore invalid blks */ + if (start > end || nid < 0 || nid >= MAX_NUMNODES) { + pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", + nid, start, end); + return 0; + } + + if (mi->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: too many memblk ranges\n"); + return -EINVAL; + } + + mi->blk[mi->nr_blks].start = start; + mi->blk[mi->nr_blks].end = end; + mi->blk[mi->nr_blks].nid = nid; + mi->nr_blks++; + return 0; +} + +/** + * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo + * @idx: Index of memblk to remove + * @mi: numa_meminfo to remove memblk from + * + * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and + * decrementing @mi->nr_blks. + */ +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) +{ + mi->nr_blks--; + memmove(&mi->blk[idx], &mi->blk[idx + 1], + (mi->nr_blks - idx) * sizeof(mi->blk[0])); +} + +/** + * numa_add_memblk - Add one numa_memblk to numa_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the default numa_meminfo. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_add_memblk(int nid, u64 start, u64 end) +{ + return numa_add_memblk_to(nid, start, end, &numa_meminfo); +} + +/* Initialize NODE_DATA for a node on the local memory */ +static void __init setup_node_data(int nid, u64 start, u64 end) +{ + const u64 nd_low = PFN_PHYS(MAX_DMA_PFN); + const u64 nd_high = PFN_PHYS(max_pfn_mapped); + const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); + bool remapped = false; + u64 nd_pa; + void *nd; + int tnid; + + /* + * Don't confuse VM with a node that doesn't have the + * minimum amount of memory: + */ + if (end && (end - start) < NODE_MIN_SIZE) + return; + + /* initialize remap allocator before aligning to ZONE_ALIGN */ + init_alloc_remap(nid, start, end); + + start = roundup(start, ZONE_ALIGN); + + printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n", + nid, start, end); + + /* + * Allocate node data. Try remap allocator first, node-local + * memory and then any node. Never allocate in DMA zone. + */ + nd = alloc_remap(nid, nd_size); + if (nd) { + nd_pa = __pa(nd); + remapped = true; + } else { + nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, + nd_size, SMP_CACHE_BYTES); + if (nd_pa == MEMBLOCK_ERROR) + nd_pa = memblock_find_in_range(nd_low, nd_high, + nd_size, SMP_CACHE_BYTES); + if (nd_pa == MEMBLOCK_ERROR) { + pr_err("Cannot find %zu bytes in node %d\n", + nd_size, nid); + return; + } + memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); + nd = __va(nd_pa); + } + + /* report and initialize */ + printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n", + nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : ""); + tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); + if (!remapped && tnid != nid) + printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); + + node_data[nid] = nd; + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + NODE_DATA(nid)->node_id = nid; + NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; + NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; + + node_set_online(nid); +} + +/** + * numa_cleanup_meminfo - Cleanup a numa_meminfo + * @mi: numa_meminfo to clean up + * + * Sanitize @mi by merging and removing unncessary memblks. Also check for + * conflicts and clear unused memblks. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_cleanup_meminfo(struct numa_meminfo *mi) +{ + const u64 low = 0; + const u64 high = PFN_PHYS(max_pfn); + int i, j, k; + + /* first, trim all entries */ + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + /* make sure all blocks are inside the limits */ + bi->start = max(bi->start, low); + bi->end = min(bi->end, high); + + /* and there's no empty block */ + if (bi->start >= bi->end) + numa_remove_memblk_from(i--, mi); + } + + /* merge neighboring / overlapping entries */ + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + for (j = i + 1; j < mi->nr_blks; j++) { + struct numa_memblk *bj = &mi->blk[j]; + u64 start, end; + + /* + * See whether there are overlapping blocks. Whine + * about but allow overlaps of the same nid. They + * will be merged below. + */ + if (bi->end > bj->start && bi->start < bj->end) { + if (bi->nid != bj->nid) { + pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", + bi->nid, bi->start, bi->end, + bj->nid, bj->start, bj->end); + return -EINVAL; + } + pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", + bi->nid, bi->start, bi->end, + bj->start, bj->end); + } + + /* + * Join together blocks on the same node, holes + * between which don't overlap with memory on other + * nodes. + */ + if (bi->nid != bj->nid) + continue; + start = min(bi->start, bj->start); + end = max(bi->end, bj->end); + for (k = 0; k < mi->nr_blks; k++) { + struct numa_memblk *bk = &mi->blk[k]; + + if (bi->nid == bk->nid) + continue; + if (start < bk->end && end > bk->start) + break; + } + if (k < mi->nr_blks) + continue; + printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n", + bi->nid, bi->start, bi->end, bj->start, bj->end, + start, end); + bi->start = start; + bi->end = end; + numa_remove_memblk_from(j--, mi); + } + } + + /* clear unused ones */ + for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { + mi->blk[i].start = mi->blk[i].end = 0; + mi->blk[i].nid = NUMA_NO_NODE; + } + + return 0; +} + +/* + * Set nodes, which have memory in @mi, in *@nodemask. + */ +static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, + const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mi->blk); i++) + if (mi->blk[i].start != mi->blk[i].end && + mi->blk[i].nid != NUMA_NO_NODE) + node_set(mi->blk[i].nid, *nodemask); +} + +/** + * numa_reset_distance - Reset NUMA distance table + * + * The current table is freed. The next numa_set_distance() call will + * create a new one. + */ +void __init numa_reset_distance(void) +{ + size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); + + /* numa_distance could be 1LU marking allocation failure, test cnt */ + if (numa_distance_cnt) + memblock_x86_free_range(__pa(numa_distance), + __pa(numa_distance) + size); + numa_distance_cnt = 0; + numa_distance = NULL; /* enable table creation */ +} + +static int __init numa_alloc_distance(void) +{ + nodemask_t nodes_parsed; + size_t size; + int i, j, cnt = 0; + u64 phys; + + /* size the new table and allocate it */ + nodes_parsed = numa_nodes_parsed; + numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); + + for_each_node_mask(i, nodes_parsed) + cnt = i; + cnt++; + size = cnt * cnt * sizeof(numa_distance[0]); + + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), + size, PAGE_SIZE); + if (phys == MEMBLOCK_ERROR) { + pr_warning("NUMA: Warning: can't allocate distance table!\n"); + /* don't retry until explicitly reset */ + numa_distance = (void *)1LU; + return -ENOMEM; + } + memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); + + numa_distance = __va(phys); + numa_distance_cnt = cnt; + + /* fill with the default distances */ + for (i = 0; i < cnt; i++) + for (j = 0; j < cnt; j++) + numa_distance[i * cnt + j] = i == j ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); + + return 0; +} + +/** + * numa_set_distance - Set NUMA distance from one NUMA to another + * @from: the 'from' node to set distance + * @to: the 'to' node to set distance + * @distance: NUMA distance + * + * Set the distance from node @from to @to to @distance. If distance table + * doesn't exist, one which is large enough to accommodate all the currently + * known nodes will be created. + * + * If such table cannot be allocated, a warning is printed and further + * calls are ignored until the distance table is reset with + * numa_reset_distance(). + * + * If @from or @to is higher than the highest known node at the time of + * table creation or @distance doesn't make sense, the call is ignored. + * This is to allow simplification of specific NUMA config implementations. + */ +void __init numa_set_distance(int from, int to, int distance) +{ + if (!numa_distance && numa_alloc_distance() < 0) + return; + + if (from >= numa_distance_cnt || to >= numa_distance_cnt) { + printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + if ((u8)distance != distance || + (from == to && distance != LOCAL_DISTANCE)) { + pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + numa_distance[from * numa_distance_cnt + to] = distance; +} + +int __node_distance(int from, int to) +{ + if (from >= numa_distance_cnt || to >= numa_distance_cnt) + return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; + return numa_distance[from * numa_distance_cnt + to]; +} +EXPORT_SYMBOL(__node_distance); + +/* + * Sanity check to catch more bad NUMA configurations (they are amazingly + * common). Make sure the nodes cover all memory. + */ +static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) +{ + u64 numaram, e820ram; + int i; + + numaram = 0; + for (i = 0; i < mi->nr_blks; i++) { + u64 s = mi->blk[i].start >> PAGE_SHIFT; + u64 e = mi->blk[i].end >> PAGE_SHIFT; + numaram += e - s; + numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); + if ((s64)numaram < 0) + numaram = 0; + } + + e820ram = max_pfn - (memblock_x86_hole_size(0, + PFN_PHYS(max_pfn)) >> PAGE_SHIFT); + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ + if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { + printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n", + (numaram << PAGE_SHIFT) >> 20, + (e820ram << PAGE_SHIFT) >> 20); + return false; + } + return true; +} + +static int __init numa_register_memblks(struct numa_meminfo *mi) +{ + int i, nid; + + /* Account for nodes with cpus and no memory */ + node_possible_map = numa_nodes_parsed; + numa_nodemask_from_meminfo(&node_possible_map, mi); + if (WARN_ON(nodes_empty(node_possible_map))) + return -EINVAL; + + for (i = 0; i < mi->nr_blks; i++) + memblock_x86_register_active_regions(mi->blk[i].nid, + mi->blk[i].start >> PAGE_SHIFT, + mi->blk[i].end >> PAGE_SHIFT); + + /* for out of order entries */ + sort_node_map(); + if (!numa_meminfo_cover_memory(mi)) + return -EINVAL; + + /* Finally register nodes. */ + for_each_node_mask(nid, node_possible_map) { + u64 start = PFN_PHYS(max_pfn); + u64 end = 0; + + for (i = 0; i < mi->nr_blks; i++) { + if (nid != mi->blk[i].nid) + continue; + start = min(mi->blk[i].start, start); + end = max(mi->blk[i].end, end); + } + + if (start < end) + setup_node_data(nid, start, end); + } + + return 0; +} + /* * There are unfortunately some poorly designed mainboards around that * only connect memory to a single CPU. This breaks the 1:1 cpu->node @@ -102,7 +540,7 @@ void __init setup_node_to_cpumask_map(void) * as the number of CPUs is not known yet. We round robin the existing * nodes. */ -void __init numa_init_array(void) +static void __init numa_init_array(void) { int rr, i; @@ -117,6 +555,95 @@ void __init numa_init_array(void) } } +static int __init numa_init(int (*init_func)(void)) +{ + int i; + int ret; + + for (i = 0; i < MAX_LOCAL_APIC; i++) + set_apicid_to_node(i, NUMA_NO_NODE); + + nodes_clear(numa_nodes_parsed); + nodes_clear(node_possible_map); + nodes_clear(node_online_map); + memset(&numa_meminfo, 0, sizeof(numa_meminfo)); + remove_all_active_ranges(); + numa_reset_distance(); + + ret = init_func(); + if (ret < 0) + return ret; + ret = numa_cleanup_meminfo(&numa_meminfo); + if (ret < 0) + return ret; + + numa_emulation(&numa_meminfo, numa_distance_cnt); + + ret = numa_register_memblks(&numa_meminfo); + if (ret < 0) + return ret; + + for (i = 0; i < nr_cpu_ids; i++) { + int nid = early_cpu_to_node(i); + + if (nid == NUMA_NO_NODE) + continue; + if (!node_online(nid)) + numa_clear_node(i); + } + numa_init_array(); + return 0; +} + +/** + * dummy_numa_init - Fallback dummy NUMA init + * + * Used if there's no underlying NUMA architecture, NUMA initialization + * fails, or NUMA is disabled on the command line. + * + * Must online at least one node and add memory blocks that cover all + * allowed memory. This function must not fail. + */ +static int __init dummy_numa_init(void) +{ + printk(KERN_INFO "%s\n", + numa_off ? "NUMA turned off" : "No NUMA configuration found"); + printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n", + 0LLU, PFN_PHYS(max_pfn)); + + node_set(0, numa_nodes_parsed); + numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); + + return 0; +} + +/** + * x86_numa_init - Initialize NUMA + * + * Try each configured NUMA initialization method until one succeeds. The + * last fallback is dummy single node config encomapssing whole memory and + * never fails. + */ +void __init x86_numa_init(void) +{ + if (!numa_off) { +#ifdef CONFIG_X86_NUMAQ + if (!numa_init(numaq_numa_init)) + return; +#endif +#ifdef CONFIG_ACPI_NUMA + if (!numa_init(x86_acpi_numa_init)) + return; +#endif +#ifdef CONFIG_AMD_NUMA + if (!numa_init(amd_numa_init)) + return; +#endif + } + + numa_init(dummy_numa_init); +} + static __init int find_near_online_node(int node) { int n, val; @@ -282,3 +809,18 @@ const struct cpumask *cpumask_of_node(int node) EXPORT_SYMBOL(cpumask_of_node); #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +#ifdef CONFIG_MEMORY_HOTPLUG +int memory_add_physaddr_to_nid(u64 start) +{ + struct numa_meminfo *mi = &numa_meminfo; + int nid = mi->blk[0].nid; + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].start <= start && mi->blk[i].end > start) + nid = mi->blk[i].nid; + return nid; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index bde3906420df..849a975d3fa0 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -22,39 +22,11 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include <linux/mm.h> #include <linux/bootmem.h> #include <linux/memblock.h> -#include <linux/mmzone.h> -#include <linux/highmem.h> -#include <linux/initrd.h> -#include <linux/nodemask.h> #include <linux/module.h> -#include <linux/kexec.h> -#include <linux/pfn.h> -#include <linux/swap.h> -#include <linux/acpi.h> - -#include <asm/e820.h> -#include <asm/setup.h> -#include <asm/mmzone.h> -#include <asm/bios_ebda.h> -#include <asm/proto.h> - -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); - -/* - * numa interface - we expect the numa architecture specific code to have - * populated the following initialisation. - * - * 1) node_online_map - the map of all nodes configured (online) in the system - * 2) node_start_pfn - the starting page frame number for a node - * 3) node_end_pfn - the ending page fram number for a node - */ -unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; -unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; +#include "numa_internal.h" #ifdef CONFIG_DISCONTIGMEM /* @@ -99,108 +71,46 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, } #endif -extern unsigned long find_max_low_pfn(void); extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) -unsigned long node_remap_size[MAX_NUMNODES]; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -static unsigned long kva_start_pfn; -static unsigned long kva_pages; - -int __cpuinit numa_cpu_node(int cpu) -{ - return apic->x86_32_numa_cpu_node(cpu); -} - -/* - * FLAT - support for basic PC memory model with discontig enabled, essentially - * a single node with all available processors in it with a flat - * memory map. - */ -int __init get_memcfg_numa_flat(void) -{ - printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); - - node_start_pfn[0] = 0; - node_end_pfn[0] = max_pfn; - memblock_x86_register_active_regions(0, 0, max_pfn); - memory_present(0, 0, max_pfn); - node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); - - /* Indicate there is one node available. */ - nodes_clear(node_online_map); - node_set_online(0); - return 1; -} - -/* - * Find the highest page frame number we have available for the node - */ -static void __init propagate_e820_map_node(int nid) -{ - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - /* - * if a user has given mem=XXXX, then we need to make sure - * that the node _starts_ before that, too, not just ends - */ - if (node_start_pfn[nid] > max_pfn) - node_start_pfn[nid] = max_pfn; - BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]); -} - -/* - * Allocate memory for the pg_data_t for this node via a crude pre-bootmem - * method. For node zero take this from the bottom of memory, for - * subsequent nodes place them at node_remap_start_vaddr which contains - * node local data in physically node local memory. See setup_memory() - * for details. - */ -static void __init allocate_pgdat(int nid) -{ - char buf[16]; - - if (node_has_online_mem(nid) && node_remap_start_vaddr[nid]) - NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; - else { - unsigned long pgdat_phys; - pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT, - max_pfn_mapped<<PAGE_SHIFT, - sizeof(pg_data_t), - PAGE_SIZE); - NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT)); - memset(buf, 0, sizeof(buf)); - sprintf(buf, "NODE_DATA %d", nid); - memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); - } - printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", - nid, (unsigned long)NODE_DATA(nid)); -} - /* - * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel - * virtual address space (KVA) is reserved and portions of nodes are mapped - * using it. This is to allow node-local memory to be allocated for - * structures that would normally require ZONE_NORMAL. The memory is - * allocated with alloc_remap() and callers should be prepared to allocate - * from the bootmem allocator instead. + * Remap memory allocator */ static unsigned long node_remap_start_pfn[MAX_NUMNODES]; static void *node_remap_end_vaddr[MAX_NUMNODES]; static void *node_remap_alloc_vaddr[MAX_NUMNODES]; -static unsigned long node_remap_offset[MAX_NUMNODES]; +/** + * alloc_remap - Allocate remapped memory + * @nid: NUMA node to allocate memory from + * @size: The size of allocation + * + * Allocate @size bytes from the remap area of NUMA node @nid. The + * size of the remap area is predetermined by init_alloc_remap() and + * only the callers considered there should call this function. For + * more info, please read the comment on top of init_alloc_remap(). + * + * The caller must be ready to handle allocation failure from this + * function and fall back to regular memory allocator in such cases. + * + * CONTEXT: + * Single CPU early boot context. + * + * RETURNS: + * Pointer to the allocated memory on success, %NULL on failure. + */ void *alloc_remap(int nid, unsigned long size) { void *allocation = node_remap_alloc_vaddr[nid]; size = ALIGN(size, L1_CACHE_BYTES); - if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) + if (!allocation || (allocation + size) > node_remap_end_vaddr[nid]) return NULL; node_remap_alloc_vaddr[nid] += size; @@ -209,26 +119,6 @@ void *alloc_remap(int nid, unsigned long size) return allocation; } -static void __init remap_numa_kva(void) -{ - void *vaddr; - unsigned long pfn; - int node; - - for_each_online_node(node) { - printk(KERN_DEBUG "remap_numa_kva: node %d\n", node); - for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { - vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT); - printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n", - (unsigned long)vaddr, - node_remap_start_pfn[node] + pfn); - set_pmd_pfn((ulong) vaddr, - node_remap_start_pfn[node] + pfn, - PAGE_KERNEL_LARGE); - } - } -} - #ifdef CONFIG_HIBERNATION /** * resume_map_numa_kva - add KVA mapping to the temporary page tables created @@ -240,15 +130,16 @@ void resume_map_numa_kva(pgd_t *pgd_base) int node; for_each_online_node(node) { - unsigned long start_va, start_pfn, size, pfn; + unsigned long start_va, start_pfn, nr_pages, pfn; start_va = (unsigned long)node_remap_start_vaddr[node]; start_pfn = node_remap_start_pfn[node]; - size = node_remap_size[node]; + nr_pages = (node_remap_end_vaddr[node] - + node_remap_start_vaddr[node]) >> PAGE_SHIFT; printk(KERN_DEBUG "%s: node %d\n", __func__, node); - for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) { + for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) { unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); pgd_t *pgd = pgd_base + pgd_index(vaddr); pud_t *pud = pud_offset(pgd, vaddr); @@ -264,132 +155,89 @@ void resume_map_numa_kva(pgd_t *pgd_base) } #endif -static __init unsigned long calculate_numa_remap_pages(void) +/** + * init_alloc_remap - Initialize remap allocator for a NUMA node + * @nid: NUMA node to initizlie remap allocator for + * + * NUMA nodes may end up without any lowmem. As allocating pgdat and + * memmap on a different node with lowmem is inefficient, a special + * remap allocator is implemented which can be used by alloc_remap(). + * + * For each node, the amount of memory which will be necessary for + * pgdat and memmap is calculated and two memory areas of the size are + * allocated - one in the node and the other in lowmem; then, the area + * in the node is remapped to the lowmem area. + * + * As pgdat and memmap must be allocated in lowmem anyway, this + * doesn't waste lowmem address space; however, the actual lowmem + * which gets remapped over is wasted. The amount shouldn't be + * problematic on machines this feature will be used. + * + * Initialization failure isn't fatal. alloc_remap() is used + * opportunistically and the callers will fall back to other memory + * allocation mechanisms on failure. + */ +void __init init_alloc_remap(int nid, u64 start, u64 end) { - int nid; - unsigned long size, reserve_pages = 0; - - for_each_online_node(nid) { - u64 node_kva_target; - u64 node_kva_final; - - /* - * The acpi/srat node info can show hot-add memroy zones - * where memory could be added but not currently present. - */ - printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", - nid, node_start_pfn[nid], node_end_pfn[nid]); - if (node_start_pfn[nid] > max_pfn) - continue; - if (!node_end_pfn[nid]) - continue; - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - - /* ensure the remap includes space for the pgdat. */ - size = node_remap_size[nid] + sizeof(pg_data_t); - - /* convert size to large (pmd size) pages, rounding up */ - size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; - /* now the roundup is correct, convert to PAGE_SIZE pages */ - size = size * PTRS_PER_PTE; - - node_kva_target = round_down(node_end_pfn[nid] - size, - PTRS_PER_PTE); - node_kva_target <<= PAGE_SHIFT; - do { - node_kva_final = memblock_find_in_range(node_kva_target, - ((u64)node_end_pfn[nid])<<PAGE_SHIFT, - ((u64)size)<<PAGE_SHIFT, - LARGE_PAGE_BYTES); - node_kva_target -= LARGE_PAGE_BYTES; - } while (node_kva_final == MEMBLOCK_ERROR && - (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid])); - - if (node_kva_final == MEMBLOCK_ERROR) - panic("Can not get kva ram\n"); - - node_remap_size[nid] = size; - node_remap_offset[nid] = reserve_pages; - reserve_pages += size; - printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of" - " node %d at %llx\n", - size, nid, node_kva_final>>PAGE_SHIFT); - - /* - * prevent kva address below max_low_pfn want it on system - * with less memory later. - * layout will be: KVA address , KVA RAM - * - * we are supposed to only record the one less then max_low_pfn - * but we could have some hole in high memory, and it will only - * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide - * to use it as free. - * So memblock_x86_reserve_range here, hope we don't run out of that array - */ - memblock_x86_reserve_range(node_kva_final, - node_kva_final+(((u64)size)<<PAGE_SHIFT), - "KVA RAM"); - - node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT; - } - printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", - reserve_pages); - return reserve_pages; -} + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long end_pfn = end >> PAGE_SHIFT; + unsigned long size, pfn; + u64 node_pa, remap_pa; + void *remap_va; -static void init_remap_allocator(int nid) -{ - node_remap_start_vaddr[nid] = pfn_to_kaddr( - kva_start_pfn + node_remap_offset[nid]); - node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + - (node_remap_size[nid] * PAGE_SIZE); - node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + - ALIGN(sizeof(pg_data_t), PAGE_SIZE); - - printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, - (ulong) node_remap_start_vaddr[nid], - (ulong) node_remap_end_vaddr[nid]); + /* + * The acpi/srat node info can show hot-add memroy zones where + * memory could be added but not currently present. + */ + printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", + nid, start_pfn, end_pfn); + + /* calculate the necessary space aligned to large page size */ + size = node_memmap_size_bytes(nid, start_pfn, end_pfn); + size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); + size = ALIGN(size, LARGE_PAGE_BYTES); + + /* allocate node memory and the lowmem remap area */ + node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES); + if (node_pa == MEMBLOCK_ERROR) { + pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", + size, nid); + return; + } + memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); + + remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, + max_low_pfn << PAGE_SHIFT, + size, LARGE_PAGE_BYTES); + if (remap_pa == MEMBLOCK_ERROR) { + pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", + size, nid); + memblock_x86_free_range(node_pa, node_pa + size); + return; + } + memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG"); + remap_va = phys_to_virt(remap_pa); + + /* perform actual remap */ + for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE) + set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT), + (node_pa >> PAGE_SHIFT) + pfn, + PAGE_KERNEL_LARGE); + + /* initialize remap allocator parameters */ + node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; + node_remap_start_vaddr[nid] = remap_va; + node_remap_end_vaddr[nid] = remap_va + size; + node_remap_alloc_vaddr[nid] = remap_va; + + printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n", + nid, node_pa, node_pa + size, remap_va, remap_va + size); } void __init initmem_init(void) { - int nid; - long kva_target_pfn; - - /* - * When mapping a NUMA machine we allocate the node_mem_map arrays - * from node local memory. They are then mapped directly into KVA - * between zone normal and vmalloc space. Calculate the size of - * this space and use it to adjust the boundary between ZONE_NORMAL - * and ZONE_HIGHMEM. - */ - - get_memcfg_numa(); - numa_init_array(); - - kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); + x86_numa_init(); - kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); - do { - kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT, - max_low_pfn<<PAGE_SHIFT, - kva_pages<<PAGE_SHIFT, - PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT; - kva_target_pfn -= PTRS_PER_PTE; - } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn); - - if (kva_start_pfn == MEMBLOCK_ERROR) - panic("Can not get kva space\n"); - - printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", - kva_start_pfn, max_low_pfn); - printk(KERN_INFO "max_pfn = %lx\n", max_pfn); - - /* avoid clash with initrd */ - memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT, - (kva_start_pfn + kva_pages)<<PAGE_SHIFT, - "KVA PG"); #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) @@ -409,51 +257,9 @@ void __init initmem_init(void) printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", (ulong) pfn_to_kaddr(max_low_pfn)); - for_each_online_node(nid) { - init_remap_allocator(nid); - - allocate_pgdat(nid); - } - remap_numa_kva(); printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); - for_each_online_node(nid) - propagate_e820_map_node(nid); - - for_each_online_node(nid) { - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); - NODE_DATA(nid)->node_id = nid; - } setup_bootmem_allocator(); } - -#ifdef CONFIG_MEMORY_HOTPLUG -static int paddr_to_nid(u64 addr) -{ - int nid; - unsigned long pfn = PFN_DOWN(addr); - - for_each_node(nid) - if (node_start_pfn[nid] <= pfn && - pfn < node_end_pfn[nid]) - return nid; - - return -1; -} - -/* - * This function is used to ask node id BEFORE memmap and mem_section's - * initialization (pfn_to_nid() can't be used yet). - * If _PXM is not defined on ACPI's DSDT, node id must be found by this. - */ -int memory_add_physaddr_to_nid(u64 addr) -{ - int nid = paddr_to_nid(addr); - return (nid >= 0) ? nid : 0; -} - -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 85b52fc03084..dd27f401f0a0 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -2,646 +2,13 @@ * Generic VM initialization for x86-64 NUMA setups. * Copyright 2002,2003 Andi Kleen, SuSE Labs. */ -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/init.h> #include <linux/bootmem.h> -#include <linux/memblock.h> -#include <linux/mmzone.h> -#include <linux/ctype.h> -#include <linux/module.h> -#include <linux/nodemask.h> -#include <linux/sched.h> -#include <linux/acpi.h> - -#include <asm/e820.h> -#include <asm/proto.h> -#include <asm/dma.h> -#include <asm/acpi.h> -#include <asm/amd_nb.h> #include "numa_internal.h" -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); - -nodemask_t numa_nodes_parsed __initdata; - -struct memnode memnode; - -static unsigned long __initdata nodemap_addr; -static unsigned long __initdata nodemap_size; - -static struct numa_meminfo numa_meminfo __initdata; - -static int numa_distance_cnt; -static u8 *numa_distance; - -/* - * Given a shift value, try to populate memnodemap[] - * Returns : - * 1 if OK - * 0 if memnodmap[] too small (of shift too small) - * -1 if node overlap or lost ram (shift too big) - */ -static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift) -{ - unsigned long addr, end; - int i, res = -1; - - memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); - for (i = 0; i < mi->nr_blks; i++) { - addr = mi->blk[i].start; - end = mi->blk[i].end; - if (addr >= end) - continue; - if ((end >> shift) >= memnodemapsize) - return 0; - do { - if (memnodemap[addr >> shift] != NUMA_NO_NODE) - return -1; - memnodemap[addr >> shift] = mi->blk[i].nid; - addr += (1UL << shift); - } while (addr < end); - res = 1; - } - return res; -} - -static int __init allocate_cachealigned_memnodemap(void) -{ - unsigned long addr; - - memnodemap = memnode.embedded_map; - if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) - return 0; - - addr = 0x8000; - nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); - nodemap_addr = memblock_find_in_range(addr, get_max_mapped(), - nodemap_size, L1_CACHE_BYTES); - if (nodemap_addr == MEMBLOCK_ERROR) { - printk(KERN_ERR - "NUMA: Unable to allocate Memory to Node hash map\n"); - nodemap_addr = nodemap_size = 0; - return -1; - } - memnodemap = phys_to_virt(nodemap_addr); - memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP"); - - printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", - nodemap_addr, nodemap_addr + nodemap_size); - return 0; -} - -/* - * The LSB of all start and end addresses in the node map is the value of the - * maximum possible shift. - */ -static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi) -{ - int i, nodes_used = 0; - unsigned long start, end; - unsigned long bitfield = 0, memtop = 0; - - for (i = 0; i < mi->nr_blks; i++) { - start = mi->blk[i].start; - end = mi->blk[i].end; - if (start >= end) - continue; - bitfield |= start; - nodes_used++; - if (end > memtop) - memtop = end; - } - if (nodes_used <= 1) - i = 63; - else - i = find_first_bit(&bitfield, sizeof(unsigned long)*8); - memnodemapsize = (memtop >> i)+1; - return i; -} - -static int __init compute_hash_shift(const struct numa_meminfo *mi) -{ - int shift; - - shift = extract_lsb_from_nodes(mi); - if (allocate_cachealigned_memnodemap()) - return -1; - printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", - shift); - - if (populate_memnodemap(mi, shift) != 1) { - printk(KERN_INFO "Your memory is not aligned you need to " - "rebuild your kernel with a bigger NODEMAPSIZE " - "shift=%d\n", shift); - return -1; - } - return shift; -} - -int __meminit __early_pfn_to_nid(unsigned long pfn) -{ - return phys_to_nid(pfn << PAGE_SHIFT); -} - -static void * __init early_node_mem(int nodeid, unsigned long start, - unsigned long end, unsigned long size, - unsigned long align) -{ - unsigned long mem; - - /* - * put it on high as possible - * something will go with NODE_DATA - */ - if (start < (MAX_DMA_PFN<<PAGE_SHIFT)) - start = MAX_DMA_PFN<<PAGE_SHIFT; - if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) && - end > (MAX_DMA32_PFN<<PAGE_SHIFT)) - start = MAX_DMA32_PFN<<PAGE_SHIFT; - mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align); - if (mem != MEMBLOCK_ERROR) - return __va(mem); - - /* extend the search scope */ - end = max_pfn_mapped << PAGE_SHIFT; - start = MAX_DMA_PFN << PAGE_SHIFT; - mem = memblock_find_in_range(start, end, size, align); - if (mem != MEMBLOCK_ERROR) - return __va(mem); - - printk(KERN_ERR "Cannot find %lu bytes in node %d\n", - size, nodeid); - - return NULL; -} - -static int __init numa_add_memblk_to(int nid, u64 start, u64 end, - struct numa_meminfo *mi) -{ - /* ignore zero length blks */ - if (start == end) - return 0; - - /* whine about and ignore invalid blks */ - if (start > end || nid < 0 || nid >= MAX_NUMNODES) { - pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", - nid, start, end); - return 0; - } - - if (mi->nr_blks >= NR_NODE_MEMBLKS) { - pr_err("NUMA: too many memblk ranges\n"); - return -EINVAL; - } - - mi->blk[mi->nr_blks].start = start; - mi->blk[mi->nr_blks].end = end; - mi->blk[mi->nr_blks].nid = nid; - mi->nr_blks++; - return 0; -} - -/** - * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo - * @idx: Index of memblk to remove - * @mi: numa_meminfo to remove memblk from - * - * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and - * decrementing @mi->nr_blks. - */ -void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) -{ - mi->nr_blks--; - memmove(&mi->blk[idx], &mi->blk[idx + 1], - (mi->nr_blks - idx) * sizeof(mi->blk[0])); -} - -/** - * numa_add_memblk - Add one numa_memblk to numa_meminfo - * @nid: NUMA node ID of the new memblk - * @start: Start address of the new memblk - * @end: End address of the new memblk - * - * Add a new memblk to the default numa_meminfo. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int __init numa_add_memblk(int nid, u64 start, u64 end) -{ - return numa_add_memblk_to(nid, start, end, &numa_meminfo); -} - -/* Initialize bootmem allocator for a node */ -void __init -setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) -{ - unsigned long start_pfn, last_pfn, nodedata_phys; - const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); - int nid; - - if (!end) - return; - - /* - * Don't confuse VM with a node that doesn't have the - * minimum amount of memory: - */ - if (end && (end - start) < NODE_MIN_SIZE) - return; - - start = roundup(start, ZONE_ALIGN); - - printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid, - start, end); - - start_pfn = start >> PAGE_SHIFT; - last_pfn = end >> PAGE_SHIFT; - - node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, - SMP_CACHE_BYTES); - if (node_data[nodeid] == NULL) - return; - nodedata_phys = __pa(node_data[nodeid]); - memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA"); - printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, - nodedata_phys + pgdat_size - 1); - nid = phys_to_nid(nodedata_phys); - if (nid != nodeid) - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); - - memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); - NODE_DATA(nodeid)->node_id = nodeid; - NODE_DATA(nodeid)->node_start_pfn = start_pfn; - NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; - - node_set_online(nodeid); -} - -/** - * numa_cleanup_meminfo - Cleanup a numa_meminfo - * @mi: numa_meminfo to clean up - * - * Sanitize @mi by merging and removing unncessary memblks. Also check for - * conflicts and clear unused memblks. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int __init numa_cleanup_meminfo(struct numa_meminfo *mi) -{ - const u64 low = 0; - const u64 high = (u64)max_pfn << PAGE_SHIFT; - int i, j, k; - - for (i = 0; i < mi->nr_blks; i++) { - struct numa_memblk *bi = &mi->blk[i]; - - /* make sure all blocks are inside the limits */ - bi->start = max(bi->start, low); - bi->end = min(bi->end, high); - - /* and there's no empty block */ - if (bi->start >= bi->end) { - numa_remove_memblk_from(i--, mi); - continue; - } - - for (j = i + 1; j < mi->nr_blks; j++) { - struct numa_memblk *bj = &mi->blk[j]; - unsigned long start, end; - - /* - * See whether there are overlapping blocks. Whine - * about but allow overlaps of the same nid. They - * will be merged below. - */ - if (bi->end > bj->start && bi->start < bj->end) { - if (bi->nid != bj->nid) { - pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", - bi->nid, bi->start, bi->end, - bj->nid, bj->start, bj->end); - return -EINVAL; - } - pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", - bi->nid, bi->start, bi->end, - bj->start, bj->end); - } - - /* - * Join together blocks on the same node, holes - * between which don't overlap with memory on other - * nodes. - */ - if (bi->nid != bj->nid) - continue; - start = max(min(bi->start, bj->start), low); - end = min(max(bi->end, bj->end), high); - for (k = 0; k < mi->nr_blks; k++) { - struct numa_memblk *bk = &mi->blk[k]; - - if (bi->nid == bk->nid) - continue; - if (start < bk->end && end > bk->start) - break; - } - if (k < mi->nr_blks) - continue; - printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", - bi->nid, bi->start, bi->end, bj->start, bj->end, - start, end); - bi->start = start; - bi->end = end; - numa_remove_memblk_from(j--, mi); - } - } - - for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { - mi->blk[i].start = mi->blk[i].end = 0; - mi->blk[i].nid = NUMA_NO_NODE; - } - - return 0; -} - -/* - * Set nodes, which have memory in @mi, in *@nodemask. - */ -static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, - const struct numa_meminfo *mi) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mi->blk); i++) - if (mi->blk[i].start != mi->blk[i].end && - mi->blk[i].nid != NUMA_NO_NODE) - node_set(mi->blk[i].nid, *nodemask); -} - -/** - * numa_reset_distance - Reset NUMA distance table - * - * The current table is freed. The next numa_set_distance() call will - * create a new one. - */ -void __init numa_reset_distance(void) -{ - size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); - - /* numa_distance could be 1LU marking allocation failure, test cnt */ - if (numa_distance_cnt) - memblock_x86_free_range(__pa(numa_distance), - __pa(numa_distance) + size); - numa_distance_cnt = 0; - numa_distance = NULL; /* enable table creation */ -} - -static int __init numa_alloc_distance(void) -{ - nodemask_t nodes_parsed; - size_t size; - int i, j, cnt = 0; - u64 phys; - - /* size the new table and allocate it */ - nodes_parsed = numa_nodes_parsed; - numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); - - for_each_node_mask(i, nodes_parsed) - cnt = i; - cnt++; - size = cnt * cnt * sizeof(numa_distance[0]); - - phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT, - size, PAGE_SIZE); - if (phys == MEMBLOCK_ERROR) { - pr_warning("NUMA: Warning: can't allocate distance table!\n"); - /* don't retry until explicitly reset */ - numa_distance = (void *)1LU; - return -ENOMEM; - } - memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); - - numa_distance = __va(phys); - numa_distance_cnt = cnt; - - /* fill with the default distances */ - for (i = 0; i < cnt; i++) - for (j = 0; j < cnt; j++) - numa_distance[i * cnt + j] = i == j ? - LOCAL_DISTANCE : REMOTE_DISTANCE; - printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); - - return 0; -} - -/** - * numa_set_distance - Set NUMA distance from one NUMA to another - * @from: the 'from' node to set distance - * @to: the 'to' node to set distance - * @distance: NUMA distance - * - * Set the distance from node @from to @to to @distance. If distance table - * doesn't exist, one which is large enough to accommodate all the currently - * known nodes will be created. - * - * If such table cannot be allocated, a warning is printed and further - * calls are ignored until the distance table is reset with - * numa_reset_distance(). - * - * If @from or @to is higher than the highest known node at the time of - * table creation or @distance doesn't make sense, the call is ignored. - * This is to allow simplification of specific NUMA config implementations. - */ -void __init numa_set_distance(int from, int to, int distance) -{ - if (!numa_distance && numa_alloc_distance() < 0) - return; - - if (from >= numa_distance_cnt || to >= numa_distance_cnt) { - printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", - from, to, distance); - return; - } - - if ((u8)distance != distance || - (from == to && distance != LOCAL_DISTANCE)) { - pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", - from, to, distance); - return; - } - - numa_distance[from * numa_distance_cnt + to] = distance; -} - -int __node_distance(int from, int to) -{ - if (from >= numa_distance_cnt || to >= numa_distance_cnt) - return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; - return numa_distance[from * numa_distance_cnt + to]; -} -EXPORT_SYMBOL(__node_distance); - -/* - * Sanity check to catch more bad NUMA configurations (they are amazingly - * common). Make sure the nodes cover all memory. - */ -static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) -{ - unsigned long numaram, e820ram; - int i; - - numaram = 0; - for (i = 0; i < mi->nr_blks; i++) { - unsigned long s = mi->blk[i].start >> PAGE_SHIFT; - unsigned long e = mi->blk[i].end >> PAGE_SHIFT; - numaram += e - s; - numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); - if ((long)numaram < 0) - numaram = 0; - } - - e820ram = max_pfn - (memblock_x86_hole_size(0, - max_pfn << PAGE_SHIFT) >> PAGE_SHIFT); - /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ - if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { - printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n", - (numaram << PAGE_SHIFT) >> 20, - (e820ram << PAGE_SHIFT) >> 20); - return false; - } - return true; -} - -static int __init numa_register_memblks(struct numa_meminfo *mi) -{ - int i, nid; - - /* Account for nodes with cpus and no memory */ - node_possible_map = numa_nodes_parsed; - numa_nodemask_from_meminfo(&node_possible_map, mi); - if (WARN_ON(nodes_empty(node_possible_map))) - return -EINVAL; - - memnode_shift = compute_hash_shift(mi); - if (memnode_shift < 0) { - printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n"); - return -EINVAL; - } - - for (i = 0; i < mi->nr_blks; i++) - memblock_x86_register_active_regions(mi->blk[i].nid, - mi->blk[i].start >> PAGE_SHIFT, - mi->blk[i].end >> PAGE_SHIFT); - - /* for out of order entries */ - sort_node_map(); - if (!numa_meminfo_cover_memory(mi)) - return -EINVAL; - - /* Finally register nodes. */ - for_each_node_mask(nid, node_possible_map) { - u64 start = (u64)max_pfn << PAGE_SHIFT; - u64 end = 0; - - for (i = 0; i < mi->nr_blks; i++) { - if (nid != mi->blk[i].nid) - continue; - start = min(mi->blk[i].start, start); - end = max(mi->blk[i].end, end); - } - - if (start < end) - setup_node_bootmem(nid, start, end); - } - - return 0; -} - -/** - * dummy_numma_init - Fallback dummy NUMA init - * - * Used if there's no underlying NUMA architecture, NUMA initialization - * fails, or NUMA is disabled on the command line. - * - * Must online at least one node and add memory blocks that cover all - * allowed memory. This function must not fail. - */ -static int __init dummy_numa_init(void) -{ - printk(KERN_INFO "%s\n", - numa_off ? "NUMA turned off" : "No NUMA configuration found"); - printk(KERN_INFO "Faking a node at %016lx-%016lx\n", - 0LU, max_pfn << PAGE_SHIFT); - - node_set(0, numa_nodes_parsed); - numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); - - return 0; -} - -static int __init numa_init(int (*init_func)(void)) -{ - int i; - int ret; - - for (i = 0; i < MAX_LOCAL_APIC; i++) - set_apicid_to_node(i, NUMA_NO_NODE); - - nodes_clear(numa_nodes_parsed); - nodes_clear(node_possible_map); - nodes_clear(node_online_map); - memset(&numa_meminfo, 0, sizeof(numa_meminfo)); - remove_all_active_ranges(); - numa_reset_distance(); - - ret = init_func(); - if (ret < 0) - return ret; - ret = numa_cleanup_meminfo(&numa_meminfo); - if (ret < 0) - return ret; - - numa_emulation(&numa_meminfo, numa_distance_cnt); - - ret = numa_register_memblks(&numa_meminfo); - if (ret < 0) - return ret; - - for (i = 0; i < nr_cpu_ids; i++) { - int nid = early_cpu_to_node(i); - - if (nid == NUMA_NO_NODE) - continue; - if (!node_online(nid)) - numa_clear_node(i); - } - numa_init_array(); - return 0; -} - void __init initmem_init(void) { - int ret; - - if (!numa_off) { -#ifdef CONFIG_ACPI_NUMA - ret = numa_init(x86_acpi_numa_init); - if (!ret) - return; -#endif -#ifdef CONFIG_AMD_NUMA - ret = numa_init(amd_numa_init); - if (!ret) - return; -#endif - } - - numa_init(dummy_numa_init); + x86_numa_init(); } unsigned long __init numa_free_all_bootmem(void) @@ -656,12 +23,3 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } - -int __cpuinit numa_cpu_node(int cpu) -{ - int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); - - if (apicid != BAD_APICID) - return __apicid_to_node[apicid]; - return NUMA_NO_NODE; -} diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index de84cc140379..d0ed086b6247 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -5,6 +5,7 @@ #include <linux/errno.h> #include <linux/topology.h> #include <linux/memblock.h> +#include <linux/bootmem.h> #include <asm/dma.h> #include "numa_internal.h" @@ -84,7 +85,13 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, nr_nodes = MAX_NUMNODES; } - size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes; + /* + * Calculate target node size. x86_32 freaks on __udivdi3() so do + * the division in ulong number of pages and convert back. + */ + size = max_addr - addr - memblock_x86_hole_size(addr, max_addr); + size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); + /* * Calculate the number of big nodes that can be allocated as a result * of consolidating the remainder. @@ -226,7 +233,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, */ while (nodes_weight(physnode_mask)) { for_each_node_mask(i, physnode_mask) { - u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; + u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); u64 start, limit, end; int phys_blk; @@ -298,7 +305,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) { static struct numa_meminfo ei __initdata; static struct numa_meminfo pi __initdata; - const u64 max_addr = max_pfn << PAGE_SHIFT; + const u64 max_addr = PFN_PHYS(max_pfn); u8 *phys_dist = NULL; size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); int max_emu_nid, dfl_phys_nid; @@ -342,8 +349,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) if (numa_dist_cnt) { u64 phys; - phys = memblock_find_in_range(0, - (u64)max_pfn_mapped << PAGE_SHIFT, + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), phys_size, PAGE_SIZE); if (phys == MEMBLOCK_ERROR) { pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h index ef2d97377d7c..7178c3afe05e 100644 --- a/arch/x86/mm/numa_internal.h +++ b/arch/x86/mm/numa_internal.h @@ -19,6 +19,14 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); int __init numa_cleanup_meminfo(struct numa_meminfo *mi); void __init numa_reset_distance(void); +void __init x86_numa_init(void); + +#ifdef CONFIG_X86_64 +static inline void init_alloc_remap(int nid, u64 start, u64 end) { } +#else +void __init init_alloc_remap(int nid, u64 start, u64 end); +#endif + #ifdef CONFIG_NUMA_EMU void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat.c index 8e9d3394f6d4..81dbfdeb080d 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat.c @@ -26,8 +26,6 @@ int acpi_numa __initdata; -static struct bootnode nodes_add[MAX_NUMNODES]; - static __init int setup_node(int pxm) { return acpi_map_pxm_to_node(pxm); @@ -37,7 +35,6 @@ static __init void bad_srat(void) { printk(KERN_ERR "SRAT: SRAT not used.\n"); acpi_numa = -1; - memset(nodes_add, 0, sizeof(nodes_add)); } static __init inline int srat_disabled(void) @@ -131,73 +128,17 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) pxm, apic_id, node); } -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG static inline int save_add_info(void) {return 1;} #else static inline int save_add_info(void) {return 0;} #endif -/* - * Update nodes_add[] - * This code supports one contiguous hot add area per node - */ -static void __init -update_nodes_add(int node, unsigned long start, unsigned long end) -{ - unsigned long s_pfn = start >> PAGE_SHIFT; - unsigned long e_pfn = end >> PAGE_SHIFT; - int changed = 0; - struct bootnode *nd = &nodes_add[node]; - - /* I had some trouble with strange memory hotadd regions breaking - the boot. Be very strict here and reject anything unexpected. - If you want working memory hotadd write correct SRATs. - - The node size check is a basic sanity check to guard against - mistakes */ - if ((signed long)(end - start) < NODE_MIN_SIZE) { - printk(KERN_ERR "SRAT: Hotplug area too small\n"); - return; - } - - /* This check might be a bit too strict, but I'm keeping it for now. */ - if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) { - printk(KERN_ERR - "SRAT: Hotplug area %lu -> %lu has existing memory\n", - s_pfn, e_pfn); - return; - } - - /* Looks good */ - - if (nd->start == nd->end) { - nd->start = start; - nd->end = end; - changed = 1; - } else { - if (nd->start == end) { - nd->start = start; - changed = 1; - } - if (nd->end == start) { - nd->end = end; - changed = 1; - } - if (!changed) - printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); - } - - if (changed) { - node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", - nd->start, nd->end); - } -} /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ void __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { - unsigned long start, end; + u64 start, end; int node, pxm; if (srat_disabled()) @@ -226,11 +167,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) return; } - printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, + printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, start, end); - - if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) - update_nodes_add(node, start, end); } void __init acpi_numa_arch_fixup(void) {} @@ -244,17 +182,3 @@ int __init x86_acpi_numa_init(void) return ret; return srat_disabled() ? -EINVAL : 0; } - -#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) -int memory_add_physaddr_to_nid(u64 start) -{ - int i, ret = 0; - - for_each_node(i) - if (nodes_add[i].start <= start && nodes_add[i].end > start) - ret = i; - - return ret; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c deleted file mode 100644 index 364f36bdfad8..000000000000 --- a/arch/x86/mm/srat_32.c +++ /dev/null @@ -1,288 +0,0 @@ -/* - * Some of the code in this file has been gleaned from the 64 bit - * discontigmem support code base. - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to Pat Gaughen <gone@us.ibm.com> - */ -#include <linux/mm.h> -#include <linux/bootmem.h> -#include <linux/memblock.h> -#include <linux/mmzone.h> -#include <linux/acpi.h> -#include <linux/nodemask.h> -#include <asm/srat.h> -#include <asm/topology.h> -#include <asm/smp.h> -#include <asm/e820.h> - -/* - * proximity macros and definitions - */ -#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ -#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ -#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) -#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) -/* bitmap length; _PXM is at most 255 */ -#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) -static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ - -#define MAX_CHUNKS_PER_NODE 3 -#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) -struct node_memory_chunk_s { - unsigned long start_pfn; - unsigned long end_pfn; - u8 pxm; // proximity domain of node - u8 nid; // which cnode contains this chunk? - u8 bank; // which mem bank on this node -}; -static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; - -static int __initdata num_memory_chunks; /* total number of memory chunks */ -static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC]; - -int acpi_numa __initdata; - -static __init void bad_srat(void) -{ - printk(KERN_ERR "SRAT: SRAT not used.\n"); - acpi_numa = -1; - num_memory_chunks = 0; -} - -static __init inline int srat_disabled(void) -{ - return numa_off || acpi_numa < 0; -} - -/* Identify CPU proximity domains */ -void __init -acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) -{ - if (srat_disabled()) - return; - if (cpu_affinity->header.length != - sizeof(struct acpi_srat_cpu_affinity)) { - bad_srat(); - return; - } - - if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0) - return; /* empty entry */ - - /* mark this node as "seen" in node bitmap */ - BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); - - /* don't need to check apic_id here, because it is always 8 bits */ - apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; - - printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", - cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo); -} - -/* - * Identify memory proximity domains and hot-remove capabilities. - * Fill node memory chunk list structure. - */ -void __init -acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) -{ - unsigned long long paddr, size; - unsigned long start_pfn, end_pfn; - u8 pxm; - struct node_memory_chunk_s *p, *q, *pend; - - if (srat_disabled()) - return; - if (memory_affinity->header.length != - sizeof(struct acpi_srat_mem_affinity)) { - bad_srat(); - return; - } - - if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0) - return; /* empty entry */ - - pxm = memory_affinity->proximity_domain & 0xff; - - /* mark this node as "seen" in node bitmap */ - BMAP_SET(pxm_bitmap, pxm); - - /* calculate info for memory chunk structure */ - paddr = memory_affinity->base_address; - size = memory_affinity->length; - - start_pfn = paddr >> PAGE_SHIFT; - end_pfn = (paddr + size) >> PAGE_SHIFT; - - - if (num_memory_chunks >= MAXCHUNKS) { - printk(KERN_WARNING "Too many mem chunks in SRAT." - " Ignoring %lld MBytes at %llx\n", - size/(1024*1024), paddr); - return; - } - - /* Insertion sort based on base address */ - pend = &node_memory_chunk[num_memory_chunks]; - for (p = &node_memory_chunk[0]; p < pend; p++) { - if (start_pfn < p->start_pfn) - break; - } - if (p < pend) { - for (q = pend; q >= p; q--) - *(q + 1) = *q; - } - p->start_pfn = start_pfn; - p->end_pfn = end_pfn; - p->pxm = pxm; - - num_memory_chunks++; - - printk(KERN_DEBUG "Memory range %08lx to %08lx" - " in proximity domain %02x %s\n", - start_pfn, end_pfn, - pxm, - ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? - "enabled and removable" : "enabled" ) ); -} - -/* Callback for SLIT parsing */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ -} - -void acpi_numa_arch_fixup(void) -{ -} -/* - * The SRAT table always lists ascending addresses, so can always - * assume that the first "start" address that you see is the real - * start of the node, and that the current "end" address is after - * the previous one. - */ -static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) -{ - /* - * Only add present memory as told by the e820. - * There is no guarantee from the SRAT that the memory it - * enumerates is present at boot time because it represents - * *possible* memory hotplug areas the same as normal RAM. - */ - if (memory_chunk->start_pfn >= max_pfn) { - printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", - memory_chunk->start_pfn, memory_chunk->end_pfn); - return -1; - } - if (memory_chunk->nid != nid) - return -1; - - if (!node_has_online_mem(nid)) - node_start_pfn[nid] = memory_chunk->start_pfn; - - if (node_start_pfn[nid] > memory_chunk->start_pfn) - node_start_pfn[nid] = memory_chunk->start_pfn; - - if (node_end_pfn[nid] < memory_chunk->end_pfn) - node_end_pfn[nid] = memory_chunk->end_pfn; - - return 0; -} - -int __init get_memcfg_from_srat(void) -{ - int i, j, nid; - - if (srat_disabled()) - goto out_fail; - - if (acpi_numa_init() < 0) - goto out_fail; - - if (num_memory_chunks == 0) { - printk(KERN_DEBUG - "could not find any ACPI SRAT memory areas.\n"); - goto out_fail; - } - - /* Calculate total number of nodes in system from PXM bitmap and create - * a set of sequential node IDs starting at zero. (ACPI doesn't seem - * to specify the range of _PXM values.) - */ - /* - * MCD - we no longer HAVE to number nodes sequentially. PXM domain - * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically - * 32, so we will continue numbering them in this manner until MAX_NUMNODES - * approaches MAX_PXM_DOMAINS for i386. - */ - nodes_clear(node_online_map); - for (i = 0; i < MAX_PXM_DOMAINS; i++) { - if (BMAP_TEST(pxm_bitmap, i)) { - int nid = acpi_map_pxm_to_node(i); - node_set_online(nid); - } - } - BUG_ON(num_online_nodes() == 0); - - /* set cnode id in memory chunk structure */ - for (i = 0; i < num_memory_chunks; i++) - node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); - - printk(KERN_DEBUG "pxm bitmap: "); - for (i = 0; i < sizeof(pxm_bitmap); i++) { - printk(KERN_CONT "%02x ", pxm_bitmap[i]); - } - printk(KERN_CONT "\n"); - printk(KERN_DEBUG "Number of logical nodes in system = %d\n", - num_online_nodes()); - printk(KERN_DEBUG "Number of memory chunks in system = %d\n", - num_memory_chunks); - - for (i = 0; i < MAX_LOCAL_APIC; i++) - set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i])); - - for (j = 0; j < num_memory_chunks; j++){ - struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; - printk(KERN_DEBUG - "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", - j, chunk->nid, chunk->start_pfn, chunk->end_pfn); - if (node_read_chunk(chunk->nid, chunk)) - continue; - - memblock_x86_register_active_regions(chunk->nid, chunk->start_pfn, - min(chunk->end_pfn, max_pfn)); - } - /* for out of order entries in SRAT */ - sort_node_map(); - - for_each_online_node(nid) { - unsigned long start = node_start_pfn[nid]; - unsigned long end = min(node_end_pfn[nid], max_pfn); - - memory_present(nid, start, end); - node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); - } - return 1; -out_fail: - printk(KERN_DEBUG "failed to get NUMA memory information from SRAT" - " table\n"); - return 0; -} diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 275dbc19e2cf..7000e74b3087 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -194,7 +194,7 @@ static unsigned long __init mrst_calibrate_tsc(void) return 0; } -void __init mrst_time_init(void) +static void __init mrst_time_init(void) { sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); switch (mrst_timer_options) { @@ -216,7 +216,7 @@ void __init mrst_time_init(void) apbt_time_init(); } -void __cpuinit mrst_arch_setup(void) +static void __cpuinit mrst_arch_setup(void) { if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile index c2a8cab65e5d..81c5e2165c24 100644 --- a/arch/x86/platform/olpc/Makefile +++ b/arch/x86/platform/olpc/Makefile @@ -1,4 +1,2 @@ -obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_OLPC) += olpc.o olpc_ofw.o olpc_dt.o obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o -obj-$(CONFIG_OLPC) += olpc_ofw.o -obj-$(CONFIG_OF_PROMTREE) += olpc_dt.o diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index edaf3fe8dc5e..0060fd59ea00 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -18,6 +18,7 @@ #include <linux/io.h> #include <linux/string.h> #include <linux/platform_device.h> +#include <linux/of.h> #include <asm/geode.h> #include <asm/setup.h> @@ -187,41 +188,43 @@ err: } EXPORT_SYMBOL_GPL(olpc_ec_cmd); -static bool __init check_ofw_architecture(void) +static bool __init check_ofw_architecture(struct device_node *root) { - size_t propsize; - char olpc_arch[5]; - const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 }; - void *res[] = { &propsize }; + const char *olpc_arch; + int propsize; - if (olpc_ofw("getprop", args, res)) { - printk(KERN_ERR "ofw: getprop call failed!\n"); - return false; - } + olpc_arch = of_get_property(root, "architecture", &propsize); return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0; } -static u32 __init get_board_revision(void) +static u32 __init get_board_revision(struct device_node *root) { - size_t propsize; - __be32 rev; - const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; - void *res[] = { &propsize }; - - if (olpc_ofw("getprop", args, res) || propsize != 4) { - printk(KERN_ERR "ofw: getprop call failed!\n"); - return cpu_to_be32(0); - } - return be32_to_cpu(rev); + int propsize; + const __be32 *rev; + + rev = of_get_property(root, "board-revision-int", &propsize); + if (propsize != 4) + return 0; + + return be32_to_cpu(*rev); } static bool __init platform_detect(void) { - if (!check_ofw_architecture()) + struct device_node *root = of_find_node_by_path("/"); + bool success; + + if (!root) return false; - olpc_platform_info.flags |= OLPC_F_PRESENT; - olpc_platform_info.boardrev = get_board_revision(); - return true; + + success = check_ofw_architecture(root); + if (success) { + olpc_platform_info.boardrev = get_board_revision(root); + olpc_platform_info.flags |= OLPC_F_PRESENT; + } + + of_node_put(root); + return success; } static int __init add_xo1_platform_devices(void) diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index 044bda5b3174..d39f63d017d2 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -19,7 +19,9 @@ #include <linux/kernel.h> #include <linux/bootmem.h> #include <linux/of.h> +#include <linux/of_platform.h> #include <linux/of_pdt.h> +#include <asm/olpc.h> #include <asm/olpc_ofw.h> static phandle __init olpc_dt_getsibling(phandle node) @@ -180,3 +182,20 @@ void __init olpc_dt_build_devicetree(void) pr_info("PROM DT: Built device tree with %u bytes of memory.\n", prom_early_allocated); } + +/* A list of DT node/bus matches that we want to expose as platform devices */ +static struct of_device_id __initdata of_ids[] = { + { .compatible = "olpc,xo1-battery" }, + { .compatible = "olpc,xo1-dcon" }, + { .compatible = "olpc,xo1-rtc" }, + {}, +}; + +static int __init olpc_create_platform_devices(void) +{ + if (machine_is_olpc()) + return of_platform_bus_probe(NULL, of_ids, NULL); + else + return 0; +} +device_initcall(olpc_create_platform_devices); diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 9daf5d1af9f1..0eb90184515f 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -40,7 +40,6 @@ static struct clocksource clocksource_uv = { .rating = 400, .read = uv_read_rtc, .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, - .shift = 10, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -372,14 +371,11 @@ static __init int uv_rtc_setup_clock(void) if (!is_uv_system()) return -ENODEV; - clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, - clocksource_uv.shift); - /* If single blade, prefer tsc */ if (uv_num_possible_blades() == 1) clocksource_uv.rating = 250; - rc = clocksource_register(&clocksource_uv); + rc = clocksource_register_hz(&clocksource_uv, sn_rtc_cycles_per_second); if (rc) printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); else diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 30612441ed99..762b46ab14d5 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -46,13 +46,12 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. */ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) { inc_irq_stat(irq_resched_count); + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 2e2d370a47b1..c532d280ce36 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -26,8 +26,6 @@ #include "xen-ops.h" -#define XEN_SHIFT 22 - /* Xen may fire a timer up to this many ns early */ #define TIMER_SLOP 100000 #define NS_PER_TICK (1000000000LL / HZ) @@ -211,8 +209,6 @@ static struct clocksource xen_clocksource __read_mostly = { .rating = 400, .read = xen_clocksource_get_cycles, .mask = ~0, - .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */ - .shift = XEN_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -448,7 +444,7 @@ static __init void xen_time_init(void) int cpu = smp_processor_id(); struct timespec tp; - clocksource_register(&xen_clocksource); + clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC); if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { /* Successfully turned off 100Hz tick, so we have the |