From d221938c049f4845da13c8593132595a6b9222a8 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy
Date: Fri, 25 Jan 2008 21:08:01 +0100
Subject: cpu-hotplug: refcount based cpu hotplug

This patch implements a refcount + waitqueue based model for cpu-hotplug.

A thread which wants to prevent cpu-hotplug bumps up a global refcount,
and a thread which wants to perform a cpu-hotplug operation blocks until
the global refcount goes to zero. Readers, if any, during an ongoing
cpu-hotplug operation are blocked until the cpu-hotplug operation is over.

Signed-off-by: Gautham R Shenoy
Signed-off-by: Paul Jackson [for !CONFIG_HOTPLUG_CPU]
Signed-off-by: Ingo Molnar
---
 init/main.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'init/main.c')

diff --git a/init/main.c b/init/main.c
index 80b04b6c5157..f287ca5862b9 100644
--- a/init/main.c
+++ b/init/main.c
@@ -607,6 +607,7 @@ asmlinkage void __init start_kernel(void)
 	vfs_caches_init_early();
 	cpuset_init_early();
 	mem_init();
+	cpu_hotplug_init();
 	kmem_cache_init();
 	setup_per_cpu_pageset();
 	numa_policy_init();
--
cgit v1.2.3
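The scheme is easiest to see as code. Below is a simplified sketch of the reader and writer sides, assuming the kernel's get_online_cpus()/put_online_cpus() naming; the real implementation lands in kernel/cpu.c, which this log does not show, and additionally handles writer recursion and readers that arrive mid-operation:

/* Sketch only -- not the actual kernel/cpu.c code. */
static struct {
	struct task_struct *active_writer;
	struct mutex lock;		/* protects refcount */
	int refcount;			/* number of active readers */
} cpu_hotplug;

void __init cpu_hotplug_init(void)
{
	cpu_hotplug.active_writer = NULL;
	mutex_init(&cpu_hotplug.lock);
	cpu_hotplug.refcount = 0;
}

/* Reader side: pin the cpu maps for the duration of a critical section. */
void get_online_cpus(void)
{
	mutex_lock(&cpu_hotplug.lock);	/* blocks while a writer holds it */
	cpu_hotplug.refcount++;
	mutex_unlock(&cpu_hotplug.lock);
}

void put_online_cpus(void)
{
	mutex_lock(&cpu_hotplug.lock);
	if (!--cpu_hotplug.refcount && cpu_hotplug.active_writer)
		wake_up_process(cpu_hotplug.active_writer);
	mutex_unlock(&cpu_hotplug.lock);
}

/* Writer side: bracket cpu_up()/cpu_down(). */
static void cpu_hotplug_begin(void)
{
	mutex_lock(&cpu_hotplug.lock);
	cpu_hotplug.active_writer = current;
	while (cpu_hotplug.refcount) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		/* drop the lock so existing readers can drain */
		mutex_unlock(&cpu_hotplug.lock);
		schedule();
		mutex_lock(&cpu_hotplug.lock);
	}
}

static void cpu_hotplug_done(void)
{
	cpu_hotplug.active_writer = NULL;
	mutex_unlock(&cpu_hotplug.lock);
}

New readers block on the mutex from cpu_hotplug_begin() until cpu_hotplug_done(), which gives exactly the behaviour the commit message describes: readers are cheap in the common case, and a hotplug operation waits for the refcount to drain while holding new readers off. The init/main.c hunk above merely wires cpu_hotplug_init() into start_kernel().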
From b32ef636a59aad12f9f9b5dc34c93222842c58ba Mon Sep 17 00:00:00 2001
From: "travis@sgi.com"
Date: Wed, 30 Jan 2008 13:32:51 +0100
Subject: percpu: use a kconfig variable to signal arch specific percpu setup

The use of __GENERIC_PER_CPU is a bit problematic since arches may want
to run their own percpu setup while using the generic percpu definitions.
Replace it with a kconfig variable.

Cc: Rusty Russell
Cc: Andi Kleen
Signed-off-by: Christoph Lameter
Signed-off-by: Mike Travis
Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 arch/ia64/Kconfig            | 3 +++
 arch/powerpc/Kconfig         | 3 +++
 arch/sparc64/Kconfig         | 3 +++
 arch/x86/Kconfig             | 3 +++
 include/asm-generic/percpu.h | 1 -
 include/asm-s390/percpu.h    | 2 --
 include/asm-x86/percpu_32.h  | 2 --
 init/main.c                  | 4 ++--
 8 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'init/main.c')

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 4a81b7fb191a..5a41e75ae1fe 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -80,6 +80,9 @@ config GENERIC_TIME_VSYSCALL
 	bool
 	default y
 
+config ARCH_SETS_UP_PER_CPU_AREA
+	def_bool y
+
 config DMI
 	bool
 	default y
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c17a194beb0e..fb85f6b72fcf 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -42,6 +42,9 @@ config GENERIC_HARDIRQS
 	bool
 	default y
 
+config ARCH_SETS_UP_PER_CPU_AREA
+	def_bool PPC64
+
 config IRQ_PER_CPU
 	bool
 	default y
diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
index 1e25bce0366d..26f5791baa33 100644
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -66,6 +66,9 @@ config AUDIT_ARCH
 	bool
 	default y
 
+config ARCH_SETS_UP_PER_CPU_AREA
+	def_bool y
+
 config ARCH_NO_VIRT_TO_BUS
 	def_bool y
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1de28e850aa4..dc80a9f1f303 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -97,6 +97,9 @@ config GENERIC_TIME_VSYSCALL
 	bool
 	default X86_64
 
+config ARCH_SETS_UP_PER_CPU_AREA
+	def_bool X86_64
+
 config ARCH_SUPPORTS_OPROFILE
 	bool
 	default y
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index d85172e9ed45..b5e53b9ab1f7 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -3,7 +3,6 @@
 #include <linux/compiler.h>
 #include <linux/threads.h>
 
-#define __GENERIC_PER_CPU
 
 #ifdef CONFIG_SMP
 extern unsigned long __per_cpu_offset[NR_CPUS];
diff --git a/include/asm-s390/percpu.h b/include/asm-s390/percpu.h
index 545857e64443..f94d0d3cdb2f 100644
--- a/include/asm-s390/percpu.h
+++ b/include/asm-s390/percpu.h
@@ -4,8 +4,6 @@
 #include <linux/compiler.h>
 #include <asm/lowcore.h>
 
-#define __GENERIC_PER_CPU
-
 /*
  * s390 uses its own implementation for per cpu data, the offset of
  * the cpu local data area is cached in the cpu's lowcore memory.
diff --git a/include/asm-x86/percpu_32.h b/include/asm-x86/percpu_32.h
index a7ebd436f3cc..3949586bf94e 100644
--- a/include/asm-x86/percpu_32.h
+++ b/include/asm-x86/percpu_32.h
@@ -41,8 +41,6 @@
  *    PER_CPU(cpu_gdt_descr, %ebx)
  */
 #ifdef CONFIG_SMP
-/* Same as generic implementation except for optimized local access. */
-#define __GENERIC_PER_CPU
 
 /* This is used for other cpus to find our section. */
 extern unsigned long __per_cpu_offset[];
diff --git a/init/main.c b/init/main.c
index f287ca5862b9..3f8aba291ed3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -363,7 +363,7 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { }
 #else
 
-#ifdef __GENERIC_PER_CPU
+#ifndef CONFIG_ARCH_SETS_UP_PER_CPU_AREA
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 
 EXPORT_SYMBOL(__per_cpu_offset);
@@ -384,7 +384,7 @@ static void __init setup_per_cpu_areas(void)
 		ptr += size;
 	}
 }
-#endif /* !__GENERIC_PER_CPU */
+#endif /* CONFIG_ARCH_SETS_UP_CPU_AREA */
 
 /* Called by boot processor to activate the rest. */
 static void __init smp_init(void)
--
cgit v1.2.3
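The contract behind the new kconfig symbol: an architecture that sets ARCH_SETS_UP_PER_CPU_AREA compiles out the generic setup_per_cpu_areas() in init/main.c and must supply its own. A hypothetical opt-in looks like this; both fragments below are illustrative sketches, not taken from the patch, and the NUMA-aware allocation loosely mirrors what x86_64 does:

config ARCH_SETS_UP_PER_CPU_AREA
	def_bool y

/* arch/<arch>/kernel/setup.c -- sketch of an arch-private replacement */
void __init setup_per_cpu_areas(void)
{
	unsigned long size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
	char *ptr;
	int cpu;

	for_each_possible_cpu(cpu) {
		/* place each CPU's copy on that CPU's home node */
		ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(cpu)),
					       size);
		__per_cpu_offset[cpu] = ptr - __per_cpu_start;
		memcpy(ptr, __per_cpu_start,
		       __per_cpu_end - __per_cpu_start);
	}
}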
From ca74a6f84e68b44867022f4a4f3ec17c087c864e Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Wed, 30 Jan 2008 13:33:17 +0100
Subject: x86: optimize lock prefix switching to run less frequently

On VMs implemented using JITs that cache translated code, changing the
lock prefixes is a quite costly operation that forces the JIT to throw
away and retranslate a lot of code.

Previously an SMP kernel would rewrite the locks once for each CPU,
which is quite unnecessary.

This patch changes the code to never switch at boot in the normal case
(an SMP kernel booting with >1 CPU), or to switch only once (an SMP
kernel booting on a UP system).

This makes a significant difference in boot-up performance on AMD
SimNow! I expect it to be a little faster on native systems too,
because an SMP switch does a lot of text_poke()s, each of which
synchronizes the pipeline.

v1->v2: Rename max_cpus
v1->v2: Fix off by one in UP check (Thomas Gleixner)

Signed-off-by: Andi Kleen
Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 arch/x86/kernel/alternative.c | 16 ++++++++++++++--
 include/linux/smp.h           |  2 ++
 init/main.c                   | 16 ++++++++--------
 3 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'init/main.c')

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index cdc43242da92..318a4f9b7ece 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -273,6 +273,7 @@ struct smp_alt_module {
 };
 static LIST_HEAD(smp_alt_modules);
 static DEFINE_SPINLOCK(smp_alt);
+static int smp_mode = 1;	/* protected by smp_alt */
 
 void alternatives_smp_module_add(struct module *mod, char *name,
 				 void *locks, void *locks_end,
@@ -354,7 +355,14 @@ void alternatives_smp_switch(int smp)
 	BUG_ON(!smp && (num_online_cpus() > 1));
 
 	spin_lock_irqsave(&smp_alt, flags);
-	if (smp) {
+
+	/*
+	 * Avoid unnecessary switches because it forces JIT based VMs to
+	 * throw away all cached translations, which can be quite costly.
+	 */
+	if (smp == smp_mode) {
+		/* nothing */
+	} else if (smp) {
 		printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
 		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
 		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
@@ -369,6 +377,7 @@ void alternatives_smp_switch(int smp)
 		alternatives_smp_unlock(mod->locks, mod->locks_end,
 					mod->text, mod->text_end);
 	}
+	smp_mode = smp;
 	spin_unlock_irqrestore(&smp_alt, flags);
 }
 
@@ -441,7 +450,10 @@ void __init alternative_instructions(void)
 		alternatives_smp_module_add(NULL, "core kernel",
 					    __smp_locks, __smp_locks_end,
 					    _text, _etext);
-		alternatives_smp_switch(0);
+
+		/* Only switch to UP mode if we don't immediately boot others */
+		if (num_possible_cpus() == 1 || setup_max_cpus <= 1)
+			alternatives_smp_switch(0);
 	}
 #endif
 	apply_paravirt(__parainstructions, __parainstructions_end);
diff --git a/include/linux/smp.h b/include/linux/smp.h
index c25e66bcecf3..55232ccf9cfd 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -78,6 +78,8 @@ int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait);
  */
 void smp_prepare_boot_cpu(void);
 
+extern unsigned int setup_max_cpus;
+
 #else /* !SMP */
 
 /*
diff --git a/init/main.c b/init/main.c
index 3f8aba291ed3..5843fe996703 100644
--- a/init/main.c
+++ b/init/main.c
@@ -128,7 +128,7 @@ static char *ramdisk_execute_command;
 
 #ifdef CONFIG_SMP
 /* Setup configured maximum number of CPUs to activate */
-static unsigned int __initdata max_cpus = NR_CPUS;
+unsigned int __initdata setup_max_cpus = NR_CPUS;
 
 /*
  * Setup routine for controlling SMP activation
@@ -146,7 +146,7 @@ static inline void disable_ioapic_setup(void) {};
 
 static int __init nosmp(char *str)
 {
-	max_cpus = 0;
+	setup_max_cpus = 0;
 	disable_ioapic_setup();
 	return 0;
 }
@@ -155,8 +155,8 @@ early_param("nosmp", nosmp);
 
 static int __init maxcpus(char *str)
 {
-	get_option(&str, &max_cpus);
-	if (max_cpus == 0)
+	get_option(&str, &setup_max_cpus);
+	if (setup_max_cpus == 0)
 		disable_ioapic_setup();
 
 	return 0;
@@ -164,7 +164,7 @@ static int __init maxcpus(char *str)
 early_param("maxcpus", maxcpus);
 #else
-#define max_cpus NR_CPUS
+#define setup_max_cpus NR_CPUS
 #endif
 
 /*
@@ -393,7 +393,7 @@ static void __init smp_init(void)
 
 	/* FIXME: This should be done in userspace --RR */
 	for_each_present_cpu(cpu) {
-		if (num_online_cpus() >= max_cpus)
+		if (num_online_cpus() >= setup_max_cpus)
 			break;
 		if (!cpu_online(cpu))
 			cpu_up(cpu);
@@ -401,7 +401,7 @@ static void __init smp_init(void)
 
 	/* Any cleanup work */
 	printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
-	smp_cpus_done(max_cpus);
+	smp_cpus_done(setup_max_cpus);
 }
 #endif
 
@@ -824,7 +824,7 @@ static int __init kernel_init(void * unused)
 	__set_special_pids(1, 1);
 	cad_pid = task_pid(current);
 
-	smp_prepare_cpus(max_cpus);
+	smp_prepare_cpus(setup_max_cpus);
 
 	do_pre_smp_initcalls();
--
cgit v1.2.3
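The effect of the cached smp_mode is easiest to see as a call sequence; requests for the mode the kernel is already in now fall through without doing any text_poke() work (illustrative, not from the patch):

/* SMP kernel: smp_mode starts at 1, so the boot-time state is "SMP" */
alternatives_smp_switch(1);	/* already SMP: smp == smp_mode, no-op  */
alternatives_smp_switch(0);	/* last other CPU went down: patch once */
alternatives_smp_switch(0);	/* repeated request: no-op              */
alternatives_smp_switch(1);	/* a CPU comes back online: patch once  */

Combined with the num_possible_cpus()/setup_max_cpus check in alternative_instructions(), an SMP kernel booting on SMP hardware never rewrites the lock prefixes at all.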
From dd5af90a7f3d79e04b7eace9a98644dbf2038f4d Mon Sep 17 00:00:00 2001
From: Mike Travis
Date: Wed, 30 Jan 2008 13:33:32 +0100
Subject: x86/non-x86: percpu, node ids, apic ids x86.git fixup

Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 arch/x86/Kconfig             |  2 +-
 include/asm-generic/percpu.h | 12 ++----------
 init/main.c                  |  4 ++--
 kernel/module.c              |  8 ++++++++
 4 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'init/main.c')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f0887d12a5bb..8e1b33c5405f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -97,7 +97,7 @@ config GENERIC_TIME_VSYSCALL
 	bool
 	default X86_64
 
-config ARCH_SETS_UP_PER_CPU_AREA
+config HAVE_SETUP_PER_CPU_AREA
 	def_bool X86_64
 
 config ARCH_SUPPORTS_OPROFILE
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index c41b1a731129..4b8d31cda1a0 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -47,7 +47,7 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
 #endif
 
 /*
- * A percpu variable may point to a discarded reghions. The following are
+ * A percpu variable may point to a discarded regions. The following are
  * established ways to produce a usable pointer from the percpu variable
  * offset.
  */
@@ -59,18 +59,10 @@
 	(*SHIFT_PERCPU_PTR(&per_cpu_var(var), __my_cpu_offset))
 
-#ifdef CONFIG_ARCH_SETS_UP_PER_CPU_AREA
+#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
 extern void setup_per_cpu_areas(void);
 #endif
 
-/* A macro to avoid #include hell... */
-#define percpu_modcopy(pcpudst, src, size)			\
-do {								\
-	unsigned int __i;					\
-	for_each_possible_cpu(__i)				\
-		memcpy((pcpudst)+per_cpu_offset(__i),		\
-		       (src), (size));				\
-} while (0)
 
 #else /* ! SMP */
 
 #define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu_var(var)))
diff --git a/init/main.c b/init/main.c
index 5843fe996703..3316dffe3e57 100644
--- a/init/main.c
+++ b/init/main.c
@@ -363,7 +363,7 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { }
 #else
 
-#ifndef CONFIG_ARCH_SETS_UP_PER_CPU_AREA
+#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 
 EXPORT_SYMBOL(__per_cpu_offset);
@@ -384,7 +384,7 @@ static void __init setup_per_cpu_areas(void)
 		ptr += size;
 	}
 }
-#endif /* CONFIG_ARCH_SETS_UP_CPU_AREA */
+#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
 
 /* Called by boot processor to activate the rest. */
 static void __init smp_init(void)
diff --git a/kernel/module.c b/kernel/module.c
index f6a4e721fd49..bd60278ee703 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -430,6 +430,14 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
 	return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
 }
 
+static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		memcpy(pcpudest + per_cpu_offset(cpu), from, size);
+}
+
 static int percpu_modinit(void)
 {
 	pcpu_num_used = 2;
--
cgit v1.2.3
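Moving percpu_modcopy() out of the header works because its only user is the module loader: after reserving a per-cpu region for a module, load_module() replicates the module's .data.percpu section image into every possible CPU's copy. Roughly (paraphrased from kernel/module.c of this era; the call site itself is not shown in the hunk):

/* inside load_module(), once the per-cpu region has been allocated */
if (pcpuindex)
	percpu_modcopy(mod->percpu,
		       (void *)sechdrs[pcpuindex].sh_addr,
		       sechdrs[pcpuindex].sh_size);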
- */ - setup_clear_cpu_cap(X86_FEATURE_PSE); -#endif } /* Make sure %fs is initialized properly in idle threads */ diff --git a/arch/x86/mm/pageattr_32.c b/arch/x86/mm/pageattr_32.c index 9cf2fea54eb5..dd49b16b3a0e 100644 --- a/arch/x86/mm/pageattr_32.c +++ b/arch/x86/mm/pageattr_32.c @@ -61,13 +61,17 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) static int split_large_page(pte_t *kpte, unsigned long address) { pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte)); + gfp_t gfp_flags = GFP_KERNEL; unsigned long flags; unsigned long addr; pte_t *pbase, *tmp; struct page *base; int i, level; - base = alloc_pages(GFP_KERNEL, 0); +#ifdef CONFIG_DEBUG_PAGEALLOC + gfp_flags = GFP_ATOMIC; +#endif + base = alloc_pages(gfp_flags, 0); if (!base) return -ENOMEM; @@ -218,6 +222,12 @@ void kernel_map_pages(struct page *page, int numpages, int enable) numpages * PAGE_SIZE); } + /* + * If page allocator is not up yet then do not call c_p_a(): + */ + if (!debug_pagealloc_enabled) + return; + /* * the return value is ignored - the calls cannot fail, * large pages are disabled at boot time. diff --git a/include/asm-x86/cacheflush.h b/include/asm-x86/cacheflush.h index 9411a2d3f19c..fccb563e2305 100644 --- a/include/asm-x86/cacheflush.h +++ b/include/asm-x86/cacheflush.h @@ -29,11 +29,6 @@ int change_page_attr(struct page *page, int numpages, pgprot_t prot); int change_page_attr_addr(unsigned long addr, int numpages, pgprot_t prot); void clflush_cache_range(void *addr, int size); -#ifdef CONFIG_DEBUG_PAGEALLOC -/* internal debugging function */ -void kernel_map_pages(struct page *page, int numpages, int enable); -#endif - #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void); #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 3c22d971afa7..1bba6789a50a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1118,9 +1118,21 @@ static inline void vm_stat_account(struct mm_struct *mm, } #endif /* CONFIG_PROC_FS */ -#ifndef CONFIG_DEBUG_PAGEALLOC +#ifdef CONFIG_DEBUG_PAGEALLOC +extern int debug_pagealloc_enabled; + +extern void kernel_map_pages(struct page *page, int numpages, int enable); + +static inline void enable_debug_pagealloc(void) +{ + debug_pagealloc_enabled = 1; +} +#else static inline void kernel_map_pages(struct page *page, int numpages, int enable) {} +static inline void enable_debug_pagealloc(void) +{ +} #endif extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk); diff --git a/init/main.c b/init/main.c index 3316dffe3e57..cb81ed116f62 100644 --- a/init/main.c +++ b/init/main.c @@ -318,6 +318,10 @@ static int __init unknown_bootoption(char *param, char *val) return 0; } +#ifdef CONFIG_DEBUG_PAGEALLOC +int __read_mostly debug_pagealloc_enabled = 0; +#endif + static int __init init_setup(char *str) { unsigned int i; @@ -552,6 +556,7 @@ asmlinkage void __init start_kernel(void) preempt_disable(); build_all_zonelists(); page_alloc_init(); + enable_debug_pagealloc(); printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); parse_early_param(); parse_args("Booting kernel", static_command_line, __start___param, -- cgit v1.2.3