From bbc9e2f452d9c4b166d1f9a78d941d80173312fe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:39 +0100 Subject: x86: Unify cpu/apicid <-> NUMA node mapping between 32 and 64bit The mapping between cpu/apicid and node is done via apicid_to_node[] on 64bit and apicid_2_node[] + apic->x86_32_numa_cpu_node() on 32bit. This difference makes it difficult to further unify 32 and 64bit NUMA handling. This patch unifies it by replacing both apicid_to_node[] and apicid_2_node[] with __apicid_to_node[] array, which is accessed by two accessors - set_apicid_to_node() and numa_cpu_node(). On 64bit, numa_cpu_node() always consults __apicid_to_node[] directly while 32bit goes through apic->numa_cpu_node() method to allow apic implementations to override it. srat_detect_node() for amd cpus contains workaround for broken NUMA configuration which assumes relationship between APIC ID, HT node ID and NUMA topology. Leave it to access __apicid_to_node[] directly as mapping through CPU might result in undesirable behavior change. The comment is reformatted and updated to note the ugliness. Signed-off-by: Tejun Heo Reviewed-by: Pekka Enberg Cc: eric.dumazet@gmail.com Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-14-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar Cc: David Rientjes --- arch/x86/include/asm/numa.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'arch/x86/include/asm/numa.h') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 27da400d3138..5e01c768a575 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -1,5 +1,33 @@ +#ifndef _ASM_X86_NUMA_H +#define _ASM_X86_NUMA_H + +#include + +#ifdef CONFIG_NUMA +/* + * __apicid_to_node[] stores the raw mapping between physical apicid and + * node and is used to initialize cpu_to_node mapping. + * + * The mapping may be overridden by apic->numa_cpu_node() on 32bit and thus + * should be accessed by the accessors - set_apicid_to_node() and + * numa_cpu_node(). + */ +extern s16 __apicid_to_node[MAX_LOCAL_APIC]; + +static inline void set_apicid_to_node(int apicid, s16 node) +{ + __apicid_to_node[apicid] = node; +} +#else /* CONFIG_NUMA */ +static inline void set_apicid_to_node(int apicid, s16 node) +{ +} +#endif /* CONFIG_NUMA */ + #ifdef CONFIG_X86_32 # include "numa_32.h" #else # include "numa_64.h" #endif + +#endif /* _ASM_X86_NUMA_H */ -- cgit v1.2.3 From 645a79195f66eb68ef3ab2b21d9829ac3aa085a9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:40 +0100 Subject: x86: Unify CPU -> NUMA node mapping between 32 and 64bit Unlike 64bit, 32bit has been using its own cpu_to_node_map[] for CPU -> NUMA node mapping. Replace it with early_percpu variable x86_cpu_to_node_map and share the mapping code with 64bit. * USE_PERCPU_NUMA_NODE_ID is now enabled for 32bit too. * x86_cpu_to_node_map and numa_set/clear_node() are moved from numa_64 to numa. For now, on 32bit, x86_cpu_to_node_map is initialized with 0 instead of NUMA_NO_NODE. This is to avoid introducing unexpected behavior change and will be updated once init path is unified. * srat_detect_node() is now enabled for x86_32 too. It calls numa_set_node() and initializes the mapping making explicit cpu_to_node_map[] updates from map/unmap_cpu_to_node() unnecessary. Signed-off-by: Tejun Heo Cc: eric.dumazet@gmail.com Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: penberg@kernel.org Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-15-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar Cc: David Rientjes --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/numa.h | 8 +++++ arch/x86/include/asm/numa_64.h | 4 --- arch/x86/include/asm/topology.h | 17 ---------- arch/x86/kernel/acpi/boot.c | 5 --- arch/x86/kernel/cpu/amd.c | 4 +-- arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/setup_percpu.c | 4 +-- arch/x86/kernel/smpboot.c | 6 ---- arch/x86/mm/numa.c | 72 ++++++++++++++++++++++++++++++++++++++++- arch/x86/mm/numa_64.c | 65 ------------------------------------- 11 files changed, 85 insertions(+), 104 deletions(-) (limited to 'arch/x86/include/asm/numa.h') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d5ed94d30aad..95c36c474766 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1705,7 +1705,7 @@ config HAVE_ARCH_EARLY_PFN_TO_NID depends on NUMA config USE_PERCPU_NUMA_NODE_ID - def_bool X86_64 + def_bool y depends on NUMA menu "Power management and ACPI options" diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 5e01c768a575..2b21fff9f655 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -30,4 +30,12 @@ static inline void set_apicid_to_node(int apicid, s16 node) # include "numa_64.h" #endif +#ifdef CONFIG_NUMA +extern void __cpuinit numa_set_node(int cpu, int node); +extern void __cpuinit numa_clear_node(int cpu); +#else /* CONFIG_NUMA */ +static inline void numa_set_node(int cpu, int node) { } +static inline void numa_clear_node(int cpu) { } +#endif /* CONFIG_NUMA */ + #endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 4982a9c08c2f..6ead9f361bda 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -30,8 +30,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, extern void __init init_cpu_to_node(void); extern int __cpuinit numa_cpu_node(int cpu); -extern void __cpuinit numa_set_node(int cpu, int node); -extern void __cpuinit numa_clear_node(int cpu); extern void __cpuinit numa_add_cpu(int cpu); extern void __cpuinit numa_remove_cpu(int cpu); @@ -43,8 +41,6 @@ void numa_emu_cmdline(char *); #else static inline void init_cpu_to_node(void) { } static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } -static inline void numa_set_node(int cpu, int node) { } -static inline void numa_clear_node(int cpu) { } static inline void numa_add_cpu(int cpu, int node) { } static inline void numa_remove_cpu(int cpu) { } #endif diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 21899cc31e52..b101c17861f5 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -47,21 +47,6 @@ #include -#ifdef CONFIG_X86_32 - -/* Mappings between logical cpu number and node number */ -extern int cpu_to_node_map[]; - -/* Returns the number of the node containing CPU 'cpu' */ -static inline int __cpu_to_node(int cpu) -{ - return cpu_to_node_map[cpu]; -} -#define early_cpu_to_node __cpu_to_node -#define cpu_to_node __cpu_to_node - -#else /* CONFIG_X86_64 */ - /* Mappings between logical cpu number and node number */ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); @@ -84,8 +69,6 @@ static inline int early_cpu_to_node(int cpu) #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -#endif /* CONFIG_X86_64 */ - /* Mappings between node number and cpus on that node. */ extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a7bca59ec595..a2c512175395 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -590,12 +590,7 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) if (nid == -1 || !node_online(nid)) return; set_apicid_to_node(physid, nid); -#ifdef CONFIG_X86_64 numa_set_node(cpu, nid); -#else /* CONFIG_X86_32 */ - cpu_to_node_map[cpu] = nid; -#endif - #endif } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 3cce8f2bb2e1..77858fd64620 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -233,7 +233,7 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) } #endif -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA /* * To workaround broken NUMA config. Read the comment in * srat_detect_node(). @@ -338,7 +338,7 @@ EXPORT_SYMBOL_GPL(amd_get_nb_id); static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA int cpu = smp_processor_id(); int node; unsigned apicid = c->apicid; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 6052004bf4f4..df86bc8c859d 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -276,7 +276,7 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA unsigned node; int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index b5147f00f0ef..71f4727da373 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -233,6 +233,7 @@ void __init setup_per_cpu_areas(void) per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack_union.irq_stack, cpu) + IRQ_STACK_SIZE - 64; +#endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = early_per_cpu_map(x86_cpu_to_node_map, cpu); @@ -245,7 +246,6 @@ void __init setup_per_cpu_areas(void) * So set them all (boot cpu and all APs). */ set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); -#endif #endif /* * Up to this point, the boot CPU has been using .init.data @@ -263,7 +263,7 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_X86_32 early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL; #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) +#ifdef CONFIG_NUMA early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b7cfce535cb0..2c203822424f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -133,16 +133,11 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); atomic_t init_deasserted; #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) -/* which node each logical CPU is on */ -int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; -EXPORT_SYMBOL(cpu_to_node_map); - /* set up a mapping between cpu and node. */ static void map_cpu_to_node(int cpu, int node) { printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node); cpumask_set_cpu(cpu, node_to_cpumask_map[node]); - cpu_to_node_map[cpu] = node; } /* undo a mapping between cpu and node. */ @@ -153,7 +148,6 @@ static void unmap_cpu_to_node(int cpu) printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu); for (node = 0; node < MAX_NUMNODES; node++) cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); - cpu_to_node_map[cpu] = 0; } #else /* !(CONFIG_NUMA && CONFIG_X86_32) */ #define map_cpu_to_node(cpu, node) ({}) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 480b3571c8b1..187810be3d6c 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -35,6 +35,44 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; EXPORT_SYMBOL(node_to_cpumask_map); +/* + * Map cpu index to node index + */ +#ifdef CONFIG_X86_32 +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, 0); +#else +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); +#endif +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); + +void __cpuinit numa_set_node(int cpu, int node) +{ + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + + /* early setting, no percpu area yet */ + if (cpu_to_node_map) { + cpu_to_node_map[cpu] = node; + return; + } + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { + printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); + dump_stack(); + return; + } +#endif + per_cpu(x86_cpu_to_node_map, cpu) = node; + + if (node != NUMA_NO_NODE) + set_cpu_numa_node(cpu, node); +} + +void __cpuinit numa_clear_node(int cpu) +{ + numa_set_node(cpu, NUMA_NO_NODE); +} + /* * Allocate node_to_cpumask_map based on number of available nodes * Requires node_possible_map to be valid. @@ -62,6 +100,37 @@ void __init setup_node_to_cpumask_map(void) } #ifdef CONFIG_DEBUG_PER_CPU_MAPS + +int __cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) { + printk(KERN_WARNING + "cpu_to_node(%d): usage too early!\n", cpu); + dump_stack(); + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +EXPORT_SYMBOL(__cpu_to_node); + +/* + * Same function as cpu_to_node() but used if called before the + * per_cpu areas are setup. + */ +int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + + if (!cpu_possible(cpu)) { + printk(KERN_WARNING + "early_cpu_to_node(%d): no per_cpu area!\n", cpu); + dump_stack(); + return NUMA_NO_NODE; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} + /* * Returns a pointer to the bitmask of CPUs on Node 'node'. */ @@ -84,4 +153,5 @@ const struct cpumask *cpumask_of_node(int node) return node_to_cpumask_map[node]; } EXPORT_SYMBOL(cpumask_of_node); -#endif + +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 1e1026f61a5a..f5459400644d 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -29,12 +29,6 @@ struct memnode memnode; static unsigned long __initdata nodemap_addr; static unsigned long __initdata nodemap_size; -/* - * Map cpu index to node index - */ -DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); - /* * Given a shift value, try to populate memnodemap[] * Returns : @@ -732,34 +726,6 @@ int __cpuinit numa_cpu_node(int cpu) return NUMA_NO_NODE; } -void __cpuinit numa_set_node(int cpu, int node) -{ - int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); - - /* early setting, no percpu area yet */ - if (cpu_to_node_map) { - cpu_to_node_map[cpu] = node; - return; - } - -#ifdef CONFIG_DEBUG_PER_CPU_MAPS - if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { - printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); - dump_stack(); - return; - } -#endif - per_cpu(x86_cpu_to_node_map, cpu) = node; - - if (node != NUMA_NO_NODE) - set_cpu_numa_node(cpu, node); -} - -void __cpuinit numa_clear_node(int cpu) -{ - numa_set_node(cpu, NUMA_NO_NODE); -} - #ifndef CONFIG_DEBUG_PER_CPU_MAPS #ifndef CONFIG_NUMA_EMU @@ -887,37 +853,6 @@ void __cpuinit numa_remove_cpu(int cpu) { numa_set_cpumask(cpu, 0); } - -int __cpu_to_node(int cpu) -{ - if (early_per_cpu_ptr(x86_cpu_to_node_map)) { - printk(KERN_WARNING - "cpu_to_node(%d): usage too early!\n", cpu); - dump_stack(); - return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - } - return per_cpu(x86_cpu_to_node_map, cpu); -} -EXPORT_SYMBOL(__cpu_to_node); - -/* - * Same function as cpu_to_node() but used if called before the - * per_cpu areas are setup. - */ -int early_cpu_to_node(int cpu) -{ - if (early_per_cpu_ptr(x86_cpu_to_node_map)) - return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - - if (!cpu_possible(cpu)) { - printk(KERN_WARNING - "early_cpu_to_node(%d): no per_cpu area!\n", cpu); - dump_stack(); - return NUMA_NO_NODE; - } - return per_cpu(x86_cpu_to_node_map, cpu); -} - /* * --------- end of debug versions of the numa functions --------- */ -- cgit v1.2.3 From de2d9445f1627830ed2ebd00ee9d851986c940b5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:41 +0100 Subject: x86: Unify node_to_cpumask_map handling between 32 and 64bit x86_32 has been managing node_to_cpumask_map explicitly from map_cpu_to_node() and friends in a rather ugly way. With previous changes, it's now possible to share the code with 64bit. * When CONFIG_NUMA_EMU is disabled, numa_add/remove_cpu() are implemented in numa.c and shared by 32 and 64bit. CONFIG_NUMA_EMU versions still live in numa_64.c. NUMA_EMU's dependency on 64bit is planned to be removed and the above should go away together. * identify_cpu() now calls numa_add_cpu() for 32bit too. This makes the explicit mask management from map_cpu_to_node() unnecessary. * The whole x86_32 specific map_cpu_to_node() chunk is no longer necessary. Dropped. Signed-off-by: Tejun Heo Reviewed-by: Pekka Enberg Cc: eric.dumazet@gmail.com Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-16-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar Cc: David Rientjes Cc: Shaohui Zheng --- arch/x86/include/asm/numa.h | 9 +++++ arch/x86/include/asm/numa_32.h | 1 - arch/x86/include/asm/numa_64.h | 4 --- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/smpboot.c | 47 -------------------------- arch/x86/mm/numa.c | 64 +++++++++++++++++++++++++++++++++-- arch/x86/mm/numa_64.c | 75 +++++++++--------------------------------- 7 files changed, 87 insertions(+), 115 deletions(-) (limited to 'arch/x86/include/asm/numa.h') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 2b21fff9f655..d3964b28b128 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_NUMA_H #define _ASM_X86_NUMA_H +#include #include #ifdef CONFIG_NUMA @@ -33,9 +34,17 @@ static inline void set_apicid_to_node(int apicid, s16 node) #ifdef CONFIG_NUMA extern void __cpuinit numa_set_node(int cpu, int node); extern void __cpuinit numa_clear_node(int cpu); +extern void __cpuinit numa_add_cpu(int cpu); +extern void __cpuinit numa_remove_cpu(int cpu); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } +static inline void numa_add_cpu(int cpu) { } +static inline void numa_remove_cpu(int cpu) { } #endif /* CONFIG_NUMA */ +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable); +#endif + #endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index cdf8043d7a1a..bc66031afa1f 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h @@ -4,7 +4,6 @@ extern int numa_off; extern int pxm_to_nid(int pxm); -extern void numa_remove_cpu(int cpu); #ifdef CONFIG_NUMA extern int __cpuinit numa_cpu_node(int apicid); diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 6ead9f361bda..123f1856101c 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -30,8 +30,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, extern void __init init_cpu_to_node(void); extern int __cpuinit numa_cpu_node(int cpu); -extern void __cpuinit numa_add_cpu(int cpu); -extern void __cpuinit numa_remove_cpu(int cpu); #ifdef CONFIG_NUMA_EMU #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) @@ -41,8 +39,6 @@ void numa_emu_cmdline(char *); #else static inline void init_cpu_to_node(void) { } static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } -static inline void numa_add_cpu(int cpu, int node) { } -static inline void numa_remove_cpu(int cpu) { } #endif #endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1d59834396bd..a2559c3ed500 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -869,7 +869,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) select_idle_routine(c); -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2c203822424f..522b2173888a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -132,49 +132,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); atomic_t init_deasserted; -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) -/* set up a mapping between cpu and node. */ -static void map_cpu_to_node(int cpu, int node) -{ - printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node); - cpumask_set_cpu(cpu, node_to_cpumask_map[node]); -} - -/* undo a mapping between cpu and node. */ -static void unmap_cpu_to_node(int cpu) -{ - int node; - - printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu); - for (node = 0; node < MAX_NUMNODES; node++) - cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); -} -#else /* !(CONFIG_NUMA && CONFIG_X86_32) */ -#define map_cpu_to_node(cpu, node) ({}) -#define unmap_cpu_to_node(cpu) ({}) -#endif - -#ifdef CONFIG_X86_32 -static void map_cpu_to_logical_apicid(void) -{ - int cpu = smp_processor_id(); - int node; - - node = numa_cpu_node(cpu); - if (!node_online(node)) - node = first_online_node; - - map_cpu_to_node(cpu, node); -} - -void numa_remove_cpu(int cpu) -{ - unmap_cpu_to_node(cpu); -} -#else -#define map_cpu_to_logical_apicid() do {} while (0) -#endif - /* * Report back to the Boot Processor. * Running on AP. @@ -242,7 +199,6 @@ static void __cpuinit smp_callin(void) apic->smp_callin_clear_local_apic(); setup_local_APIC(); end_local_APIC_setup(); - map_cpu_to_logical_apicid(); /* * Need to setup vector mappings before we enable interrupts. @@ -943,7 +899,6 @@ static __init void disable_smp(void) physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); else physid_set_mask_of_physid(0, &phys_cpu_present_map); - map_cpu_to_logical_apicid(); cpumask_set_cpu(0, cpu_sibling_mask(0)); cpumask_set_cpu(0, cpu_core_mask(0)); } @@ -1120,8 +1075,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) end_local_APIC_setup(); - map_cpu_to_logical_apicid(); - if (apic->setup_portio_remap) apic->setup_portio_remap(); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 187810be3d6c..75abecb614c9 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -99,7 +99,21 @@ void __init setup_node_to_cpumask_map(void) pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); } -#ifdef CONFIG_DEBUG_PER_CPU_MAPS +#ifndef CONFIG_DEBUG_PER_CPU_MAPS + +# ifndef CONFIG_NUMA_EMU +void __cpuinit numa_add_cpu(int cpu) +{ + cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} +# endif /* !CONFIG_NUMA_EMU */ + +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ int __cpu_to_node(int cpu) { @@ -131,6 +145,52 @@ int early_cpu_to_node(int cpu) return per_cpu(x86_cpu_to_node_map, cpu); } +struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) +{ + int node = early_cpu_to_node(cpu); + struct cpumask *mask; + char buf[64]; + + mask = node_to_cpumask_map[node]; + if (!mask) { + pr_err("node_to_cpumask_map[%i] NULL\n", node); + dump_stack(); + return NULL; + } + + cpulist_scnprintf(buf, sizeof(buf), mask); + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + enable ? "numa_add_cpu" : "numa_remove_cpu", + cpu, node, buf); + return mask; +} + +# ifndef CONFIG_NUMA_EMU +static void __cpuinit numa_set_cpumask(int cpu, int enable) +{ + struct cpumask *mask; + + mask = debug_cpumask_set_cpu(cpu, enable); + if (!mask) + return; + + if (enable) + cpumask_set_cpu(cpu, mask); + else + cpumask_clear_cpu(cpu, mask); +} + +void __cpuinit numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, 1); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, 0); +} +# endif /* !CONFIG_NUMA_EMU */ + /* * Returns a pointer to the bitmask of CPUs on Node 'node'. */ @@ -154,4 +214,4 @@ const struct cpumask *cpumask_of_node(int node) } EXPORT_SYMBOL(cpumask_of_node); -#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index f5459400644d..14664f58a759 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -726,19 +726,18 @@ int __cpuinit numa_cpu_node(int cpu) return NUMA_NO_NODE; } -#ifndef CONFIG_DEBUG_PER_CPU_MAPS - -#ifndef CONFIG_NUMA_EMU -void __cpuinit numa_add_cpu(int cpu) -{ - cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); -} - -void __cpuinit numa_remove_cpu(int cpu) -{ - cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); -} -#else +/* + * UGLINESS AHEAD: Currently, CONFIG_NUMA_EMU is 64bit only and makes use + * of 64bit specific data structures. The distinction is artificial and + * should be removed. numa_{add|remove}_cpu() are implemented in numa.c + * for both 32 and 64bit when CONFIG_NUMA_EMU is disabled but here when + * enabled. + * + * NUMA emulation is planned to be made generic and the following and other + * related code should be moved to numa.c. + */ +#ifdef CONFIG_NUMA_EMU +# ifndef CONFIG_DEBUG_PER_CPU_MAPS void __cpuinit numa_add_cpu(int cpu) { unsigned long addr; @@ -778,47 +777,7 @@ void __cpuinit numa_remove_cpu(int cpu) for_each_online_node(i) cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); } -#endif /* !CONFIG_NUMA_EMU */ - -#else /* CONFIG_DEBUG_PER_CPU_MAPS */ -static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) -{ - int node = early_cpu_to_node(cpu); - struct cpumask *mask; - char buf[64]; - - mask = node_to_cpumask_map[node]; - if (!mask) { - pr_err("node_to_cpumask_map[%i] NULL\n", node); - dump_stack(); - return NULL; - } - - cpulist_scnprintf(buf, sizeof(buf), mask); - printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", - enable ? "numa_add_cpu" : "numa_remove_cpu", - cpu, node, buf); - return mask; -} - -/* - * --------- debug versions of the numa functions --------- - */ -#ifndef CONFIG_NUMA_EMU -static void __cpuinit numa_set_cpumask(int cpu, int enable) -{ - struct cpumask *mask; - - mask = debug_cpumask_set_cpu(cpu, enable); - if (!mask) - return; - - if (enable) - cpumask_set_cpu(cpu, mask); - else - cpumask_clear_cpu(cpu, mask); -} -#else +# else /* !CONFIG_DEBUG_PER_CPU_MAPS */ static void __cpuinit numa_set_cpumask(int cpu, int enable) { int node = early_cpu_to_node(cpu); @@ -842,7 +801,6 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable) cpumask_clear_cpu(cpu, mask); } } -#endif /* CONFIG_NUMA_EMU */ void __cpuinit numa_add_cpu(int cpu) { @@ -853,8 +811,5 @@ void __cpuinit numa_remove_cpu(int cpu) { numa_set_cpumask(cpu, 0); } -/* - * --------- end of debug versions of the numa functions --------- - */ - -#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ +# endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ +#endif /* CONFIG_NUMA_EMU */ -- cgit v1.2.3 From 8db78cc4b4048e3add40bca1bc3e55057c319256 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:42 +0100 Subject: x86: Unify NUMA initialization between 32 and 64bit Now that everything else is unified, NUMA initialization can be unified too. * numa_init_array() and init_cpu_to_node() are moved from numa_64 to numa. * numa_32::initmem_init() is updated to call numa_init_array() and setup_arch() to call init_cpu_to_node() on 32bit too. * x86_cpu_to_node_map is now initialized to NUMA_NO_NODE on 32bit too. This is safe now as numa_init_array() will initialize it early during boot. This makes NUMA mapping fully initialized before setup_per_cpu_areas() on 32bit too and thus makes the first percpu chunk which contains all the static variables and some of dynamic area allocated with NUMA affinity correctly considered. Signed-off-by: Tejun Heo Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-17-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar Reported-by: Eric Dumazet Reviewed-by: Pekka Enberg --- arch/x86/include/asm/numa.h | 4 +++ arch/x86/include/asm/numa_64.h | 3 -- arch/x86/kernel/setup.c | 2 -- arch/x86/mm/numa.c | 76 +++++++++++++++++++++++++++++++++++++++--- arch/x86/mm/numa_32.c | 1 + arch/x86/mm/numa_64.c | 75 ----------------------------------------- 6 files changed, 77 insertions(+), 84 deletions(-) (limited to 'arch/x86/include/asm/numa.h') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index d3964b28b128..26fc6e2dd0fb 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -34,11 +34,15 @@ static inline void set_apicid_to_node(int apicid, s16 node) #ifdef CONFIG_NUMA extern void __cpuinit numa_set_node(int cpu, int node); extern void __cpuinit numa_clear_node(int cpu); +extern void __init numa_init_array(void); +extern void __init init_cpu_to_node(void); extern void __cpuinit numa_add_cpu(int cpu); extern void __cpuinit numa_remove_cpu(int cpu); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } +static inline void numa_init_array(void) { } +static inline void init_cpu_to_node(void) { } static inline void numa_add_cpu(int cpu) { } static inline void numa_remove_cpu(int cpu) { } #endif /* CONFIG_NUMA */ diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 123f1856101c..2819afa33632 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -13,7 +13,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks, #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) -extern void numa_init_array(void); extern int numa_off; extern unsigned long numa_free_all_bootmem(void); @@ -28,7 +27,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, */ #define NODE_MIN_SIZE (4*1024*1024) -extern void __init init_cpu_to_node(void); extern int __cpuinit numa_cpu_node(int cpu); #ifdef CONFIG_NUMA_EMU @@ -37,7 +35,6 @@ extern int __cpuinit numa_cpu_node(int cpu); void numa_emu_cmdline(char *); #endif /* CONFIG_NUMA_EMU */ #else -static inline void init_cpu_to_node(void) { } static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d3cfe26c0252..12023412fdf7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1040,9 +1040,7 @@ void __init setup_arch(char **cmdline_p) prefill_possible_map(); -#ifdef CONFIG_X86_64 init_cpu_to_node(); -#endif init_apic_mappings(); ioapic_and_gsi_init(); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 75abecb614c9..bf60715bd1b7 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -38,11 +38,7 @@ EXPORT_SYMBOL(node_to_cpumask_map); /* * Map cpu index to node index */ -#ifdef CONFIG_X86_32 -DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, 0); -#else DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); -#endif EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); void __cpuinit numa_set_node(int cpu, int node) @@ -99,6 +95,78 @@ void __init setup_node_to_cpumask_map(void) pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); } +/* + * There are unfortunately some poorly designed mainboards around that + * only connect memory to a single CPU. This breaks the 1:1 cpu->node + * mapping. To avoid this fill in the mapping for all possible CPUs, + * as the number of CPUs is not known yet. We round robin the existing + * nodes. + */ +void __init numa_init_array(void) +{ + int rr, i; + + rr = first_node(node_online_map); + for (i = 0; i < nr_cpu_ids; i++) { + if (early_cpu_to_node(i) != NUMA_NO_NODE) + continue; + numa_set_node(i, rr); + rr = next_node(rr, node_online_map); + if (rr == MAX_NUMNODES) + rr = first_node(node_online_map); + } +} + +static __init int find_near_online_node(int node) +{ + int n, val; + int min_val = INT_MAX; + int best_node = -1; + + for_each_online_node(n) { + val = node_distance(node, n); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + return best_node; +} + +/* + * Setup early cpu_to_node. + * + * Populate cpu_to_node[] only if x86_cpu_to_apicid[], + * and apicid_to_node[] tables have valid entries for a CPU. + * This means we skip cpu_to_node[] initialisation for NUMA + * emulation and faking node case (when running a kernel compiled + * for NUMA on a non NUMA box), which is OK as cpu_to_node[] + * is already initialized in a round robin manner at numa_init_array, + * prior to this call, and this initialization is good enough + * for the fake NUMA cases. + * + * Called before the per_cpu areas are setup. + */ +void __init init_cpu_to_node(void) +{ + int cpu; + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); + + BUG_ON(cpu_to_apicid == NULL); + + for_each_possible_cpu(cpu) { + int node = numa_cpu_node(cpu); + + if (node == NUMA_NO_NODE) + continue; + if (!node_online(node)) + node = find_near_online_node(node); + numa_set_node(cpu, node); + } +} + #ifndef CONFIG_DEBUG_PER_CPU_MAPS # ifndef CONFIG_NUMA_EMU diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 8d91d227be09..505bb04654b5 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -367,6 +367,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, */ get_memcfg_numa(); + numa_init_array(); kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 14664f58a759..f548fbf75f44 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -224,28 +224,6 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) node_set_online(nodeid); } -/* - * There are unfortunately some poorly designed mainboards around that - * only connect memory to a single CPU. This breaks the 1:1 cpu->node - * mapping. To avoid this fill in the mapping for all possible CPUs, - * as the number of CPUs is not known yet. We round robin the existing - * nodes. - */ -void __init numa_init_array(void) -{ - int rr, i; - - rr = first_node(node_online_map); - for (i = 0; i < nr_cpu_ids; i++) { - if (early_cpu_to_node(i) != NUMA_NO_NODE) - continue; - numa_set_node(i, rr); - rr = next_node(rr, node_online_map); - if (rr == MAX_NUMNODES) - rr = first_node(node_online_map); - } -} - #ifdef CONFIG_NUMA_EMU /* Numa emulation */ static struct bootnode nodes[MAX_NUMNODES] __initdata; @@ -664,59 +642,6 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } -#ifdef CONFIG_NUMA - -static __init int find_near_online_node(int node) -{ - int n, val; - int min_val = INT_MAX; - int best_node = -1; - - for_each_online_node(n) { - val = node_distance(node, n); - - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - return best_node; -} - -/* - * Setup early cpu_to_node. - * - * Populate cpu_to_node[] only if x86_cpu_to_apicid[], - * and apicid_to_node[] tables have valid entries for a CPU. - * This means we skip cpu_to_node[] initialisation for NUMA - * emulation and faking node case (when running a kernel compiled - * for NUMA on a non NUMA box), which is OK as cpu_to_node[] - * is already initialized in a round robin manner at numa_init_array, - * prior to this call, and this initialization is good enough - * for the fake NUMA cases. - * - * Called before the per_cpu areas are setup. - */ -void __init init_cpu_to_node(void) -{ - int cpu; - u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); - - BUG_ON(cpu_to_apicid == NULL); - - for_each_possible_cpu(cpu) { - int node = numa_cpu_node(cpu); - - if (node == NUMA_NO_NODE) - continue; - if (!node_online(node)) - node = find_near_online_node(node); - numa_set_node(cpu, node); - } -} -#endif - int __cpuinit numa_cpu_node(int cpu) { int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); -- cgit v1.2.3 From ef396ec96c1a8ffd2b0bc67f1f79c7274de02b95 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:07 +0100 Subject: x86-64, NUMA: Factor out memblk handling into numa_{add|register}_memblk() Factor out memblk handling from srat_64.c into two functions in numa_64.c. This patch doesn't introduce any behavior change. The next patch will make all init methods use these functions. - v2: Fixed build failure on 32bit due to misplaced NR_NODE_MEMBLKS. Reported by Ingo. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/include/asm/acpi.h | 1 - arch/x86/include/asm/numa.h | 3 ++ arch/x86/include/asm/numa_64.h | 2 + arch/x86/mm/numa_64.c | 109 +++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/srat_64.c | 96 ++---------------------------------- 5 files changed, 117 insertions(+), 94 deletions(-) (limited to 'arch/x86/include/asm/numa.h') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 446a5b9a1d5f..12bd1fdd206b 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -187,7 +187,6 @@ struct bootnode; extern int acpi_numa; extern int x86_acpi_numa_init(void); extern int acpi_scan_nodes(void); -#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) #ifdef CONFIG_NUMA_EMU extern void acpi_fake_nodes(const struct bootnode *fake_nodes, diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 26fc6e2dd0fb..3d4dab43c994 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -5,6 +5,9 @@ #include #ifdef CONFIG_NUMA + +#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) + /* * __apicid_to_node[] stores the raw mapping between physical apicid and * node and is used to initialize cpu_to_node mapping. diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index d3a45147d09b..3306a2b99ece 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -32,6 +32,8 @@ extern nodemask_t mem_nodes_parsed __initdata; extern struct bootnode numa_nodes[MAX_NUMNODES] __initdata; extern int __cpuinit numa_cpu_node(int cpu); +extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); +extern int __init numa_register_memblks(void); #ifdef CONFIG_NUMA_EMU #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 82ee3083b094..a1d702d2584c 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -33,6 +33,10 @@ struct memnode memnode; static unsigned long __initdata nodemap_addr; static unsigned long __initdata nodemap_size; +static int num_node_memblks __initdata; +static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; +static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; + struct bootnode numa_nodes[MAX_NUMNODES] __initdata; /* @@ -184,6 +188,43 @@ static void * __init early_node_mem(int nodeid, unsigned long start, return NULL; } +static __init int conflicting_memblks(unsigned long start, unsigned long end) +{ + int i; + for (i = 0; i < num_node_memblks; i++) { + struct bootnode *nd = &node_memblk_range[i]; + if (nd->start == nd->end) + continue; + if (nd->end > start && nd->start < end) + return memblk_nodeid[i]; + if (nd->end == end && nd->start == start) + return memblk_nodeid[i]; + } + return -1; +} + +int __init numa_add_memblk(int nid, u64 start, u64 end) +{ + int i; + + i = conflicting_memblks(start, end); + if (i == nid) { + printk(KERN_WARNING "NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", + nid, start, end, numa_nodes[i].start, numa_nodes[i].end); + } else if (i >= 0) { + printk(KERN_ERR "NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", + nid, start, end, i, + numa_nodes[i].start, numa_nodes[i].end); + return -EINVAL; + } + + node_memblk_range[num_node_memblks].start = start; + node_memblk_range[num_node_memblks].end = end; + memblk_nodeid[num_node_memblks] = nid; + num_node_memblks++; + return 0; +} + static __init void cutoff_node(int i, unsigned long start, unsigned long end) { struct bootnode *nd = &numa_nodes[i]; @@ -246,6 +287,71 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) node_set_online(nodeid); } +int __init numa_register_memblks(void) +{ + int i; + + /* + * Join together blocks on the same node, holes between + * which don't overlap with memory on other nodes. + */ + for (i = 0; i < num_node_memblks; ++i) { + int j, k; + + for (j = i + 1; j < num_node_memblks; ++j) { + unsigned long start, end; + + if (memblk_nodeid[i] != memblk_nodeid[j]) + continue; + start = min(node_memblk_range[i].end, + node_memblk_range[j].end); + end = max(node_memblk_range[i].start, + node_memblk_range[j].start); + for (k = 0; k < num_node_memblks; ++k) { + if (memblk_nodeid[i] == memblk_nodeid[k]) + continue; + if (start < node_memblk_range[k].end && + end > node_memblk_range[k].start) + break; + } + if (k < num_node_memblks) + continue; + start = min(node_memblk_range[i].start, + node_memblk_range[j].start); + end = max(node_memblk_range[i].end, + node_memblk_range[j].end); + printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", + memblk_nodeid[i], + node_memblk_range[i].start, + node_memblk_range[i].end, + node_memblk_range[j].start, + node_memblk_range[j].end, + start, end); + node_memblk_range[i].start = start; + node_memblk_range[i].end = end; + k = --num_node_memblks - j; + memmove(memblk_nodeid + j, memblk_nodeid + j+1, + k * sizeof(*memblk_nodeid)); + memmove(node_memblk_range + j, node_memblk_range + j+1, + k * sizeof(*node_memblk_range)); + --j; + } + } + + memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, + memblk_nodeid); + if (memnode_shift < 0) { + printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n"); + return -EINVAL; + } + + for (i = 0; i < num_node_memblks; i++) + memblock_x86_register_active_regions(memblk_nodeid[i], + node_memblk_range[i].start >> PAGE_SHIFT, + node_memblk_range[i].end >> PAGE_SHIFT); + return 0; +} + #ifdef CONFIG_NUMA_EMU /* Numa emulation */ static struct bootnode nodes[MAX_NUMNODES] __initdata; @@ -653,6 +759,9 @@ void __init initmem_init(void) nodes_clear(mem_nodes_parsed); nodes_clear(node_possible_map); nodes_clear(node_online_map); + num_node_memblks = 0; + memset(node_memblk_range, 0, sizeof(node_memblk_range)); + memset(memblk_nodeid, 0, sizeof(memblk_nodeid)); memset(numa_nodes, 0, sizeof(numa_nodes)); if (numa_init[i]() < 0) diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 82b1087963a2..341b37193c76 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -30,30 +30,11 @@ static struct acpi_table_slit *acpi_slit; static struct bootnode nodes_add[MAX_NUMNODES]; -static int num_node_memblks __initdata; -static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; -static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; - static __init int setup_node(int pxm) { return acpi_map_pxm_to_node(pxm); } -static __init int conflicting_memblks(unsigned long start, unsigned long end) -{ - int i; - for (i = 0; i < num_node_memblks; i++) { - struct bootnode *nd = &node_memblk_range[i]; - if (nd->start == nd->end) - continue; - if (nd->end > start && nd->start < end) - return memblk_nodeid[i]; - if (nd->end == end && nd->start == start) - return memblk_nodeid[i]; - } - return -1; -} - static __init void bad_srat(void) { int i; @@ -233,7 +214,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) struct bootnode *nd; unsigned long start, end; int node, pxm; - int i; if (srat_disabled()) return; @@ -255,16 +235,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) bad_srat(); return; } - i = conflicting_memblks(start, end); - if (i == node) { - printk(KERN_WARNING - "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", - pxm, start, end, numa_nodes[i].start, numa_nodes[i].end); - } else if (i >= 0) { - printk(KERN_ERR - "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n", - pxm, start, end, node_to_pxm(i), - numa_nodes[i].start, numa_nodes[i].end); + + if (numa_add_memblk(node, start, end) < 0) { bad_srat(); return; } @@ -285,11 +257,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) } } else update_nodes_add(node, start, end); - - node_memblk_range[num_node_memblks].start = start; - node_memblk_range[num_node_memblks].end = end; - memblk_nodeid[num_node_memblks] = node; - num_node_memblks++; } /* Sanity check to catch more bad SRATs (they are amazingly common). @@ -341,68 +308,11 @@ int __init acpi_scan_nodes(void) if (acpi_numa <= 0) return -1; - /* - * Join together blocks on the same node, holes between - * which don't overlap with memory on other nodes. - */ - for (i = 0; i < num_node_memblks; ++i) { - int j, k; - - for (j = i + 1; j < num_node_memblks; ++j) { - unsigned long start, end; - - if (memblk_nodeid[i] != memblk_nodeid[j]) - continue; - start = min(node_memblk_range[i].end, - node_memblk_range[j].end); - end = max(node_memblk_range[i].start, - node_memblk_range[j].start); - for (k = 0; k < num_node_memblks; ++k) { - if (memblk_nodeid[i] == memblk_nodeid[k]) - continue; - if (start < node_memblk_range[k].end && - end > node_memblk_range[k].start) - break; - } - if (k < num_node_memblks) - continue; - start = min(node_memblk_range[i].start, - node_memblk_range[j].start); - end = max(node_memblk_range[i].end, - node_memblk_range[j].end); - printk(KERN_INFO "SRAT: Node %d " - "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", - memblk_nodeid[i], - node_memblk_range[i].start, - node_memblk_range[i].end, - node_memblk_range[j].start, - node_memblk_range[j].end, - start, end); - node_memblk_range[i].start = start; - node_memblk_range[i].end = end; - k = --num_node_memblks - j; - memmove(memblk_nodeid + j, memblk_nodeid + j+1, - k * sizeof(*memblk_nodeid)); - memmove(node_memblk_range + j, node_memblk_range + j+1, - k * sizeof(*node_memblk_range)); - --j; - } - } - - memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, - memblk_nodeid); - if (memnode_shift < 0) { - printk(KERN_ERR - "SRAT: No NUMA node hash function found. Contact maintainer\n"); + if (numa_register_memblks() < 0) { bad_srat(); return -1; } - for (i = 0; i < num_node_memblks; i++) - memblock_x86_register_active_regions(memblk_nodeid[i], - node_memblk_range[i].start >> PAGE_SHIFT, - node_memblk_range[i].end >> PAGE_SHIFT); - /* for out of order entries in SRAT */ sort_node_map(); if (!nodes_cover_memory(numa_nodes)) { -- cgit v1.2.3 From 7a6c6547825a2324faa76cff856db11d78de075e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 20 Apr 2011 19:19:13 -0700 Subject: x86, numa: Fix cpu nodemasks for NUMA emulation and CONFIG_DEBUG_PER_CPU_MAPS The cpu<->node mappings under CONFIG_DEBUG_PER_CPU_MAPS=y when NUMA emulation is enabled is currently broken because it does not iterate through every emulated node and bind cpus that have affinity to it. NUMA emulation should bind each cpu to every local node to accurately represent the true NUMA topology of the underlying machine. debug_cpumask_set_cpu() needs to be fixed at the same time so that the debugging information that it emits shows the new cpumask of the node being assigned when the cpu is being added or removed. It can now take responsibility of setting or clearing the cpu itself to remove the need for duplicate code. Also change its last parameter, "enable", to have the correct bool type since it can only be true or false. -v2: Fix the return statements, by Kosaki Motohiro Acked-and-Tested-by: KOSAKI Motohiro Signed-off-by: David Rientjes Cc: Andreas Herrmann Cc: Tejun Heo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.DEB.2.00.1104201918470.12634@chino.kir.corp.google.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/numa.h | 2 +- arch/x86/mm/numa.c | 31 +++++++++++++------------------ arch/x86/mm/numa_emulation.c | 20 ++++++-------------- 3 files changed, 20 insertions(+), 33 deletions(-) (limited to 'arch/x86/include/asm/numa.h') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 3d4dab43c994..a50fc9f493b3 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -51,7 +51,7 @@ static inline void numa_remove_cpu(int cpu) { } #endif /* CONFIG_NUMA */ #ifdef CONFIG_DEBUG_PER_CPU_MAPS -struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable); +void debug_cpumask_set_cpu(int cpu, int node, bool enable); #endif #endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 9559d360fde7..745258dfc4dc 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -213,53 +213,48 @@ int early_cpu_to_node(int cpu) return per_cpu(x86_cpu_to_node_map, cpu); } -struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) +void debug_cpumask_set_cpu(int cpu, int node, bool enable) { - int node = early_cpu_to_node(cpu); struct cpumask *mask; char buf[64]; if (node == NUMA_NO_NODE) { /* early_cpu_to_node() already emits a warning and trace */ - return NULL; + return; } mask = node_to_cpumask_map[node]; if (!mask) { pr_err("node_to_cpumask_map[%i] NULL\n", node); dump_stack(); - return NULL; + return; } + if (enable) + cpumask_set_cpu(cpu, mask); + else + cpumask_clear_cpu(cpu, mask); + cpulist_scnprintf(buf, sizeof(buf), mask); printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); - return mask; + return; } # ifndef CONFIG_NUMA_EMU -static void __cpuinit numa_set_cpumask(int cpu, int enable) +static void __cpuinit numa_set_cpumask(int cpu, bool enable) { - struct cpumask *mask; - - mask = debug_cpumask_set_cpu(cpu, enable); - if (!mask) - return; - - if (enable) - cpumask_set_cpu(cpu, mask); - else - cpumask_clear_cpu(cpu, mask); + debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); } void __cpuinit numa_add_cpu(int cpu) { - numa_set_cpumask(cpu, 1); + numa_set_cpumask(cpu, true); } void __cpuinit numa_remove_cpu(int cpu) { - numa_set_cpumask(cpu, 0); + numa_set_cpumask(cpu, false); } # endif /* !CONFIG_NUMA_EMU */ diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index ad091e4cff17..de84cc140379 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -454,10 +454,9 @@ void __cpuinit numa_remove_cpu(int cpu) cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); } #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ -static void __cpuinit numa_set_cpumask(int cpu, int enable) +static void __cpuinit numa_set_cpumask(int cpu, bool enable) { - struct cpumask *mask; - int nid, physnid, i; + int nid, physnid; nid = early_cpu_to_node(cpu); if (nid == NUMA_NO_NODE) { @@ -467,28 +466,21 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable) physnid = emu_nid_to_phys[nid]; - for_each_online_node(i) { + for_each_online_node(nid) { if (emu_nid_to_phys[nid] != physnid) continue; - mask = debug_cpumask_set_cpu(cpu, enable); - if (!mask) - return; - - if (enable) - cpumask_set_cpu(cpu, mask); - else - cpumask_clear_cpu(cpu, mask); + debug_cpumask_set_cpu(cpu, nid, enable); } } void __cpuinit numa_add_cpu(int cpu) { - numa_set_cpumask(cpu, 1); + numa_set_cpumask(cpu, true); } void __cpuinit numa_remove_cpu(int cpu) { - numa_set_cpumask(cpu, 0); + numa_set_cpumask(cpu, false); } #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -- cgit v1.2.3