From a25b9316841c5afa226f8f70a457861b35276a92 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Jan 2013 13:24:30 -0800 Subject: x86, mm: Make DEBUG_VIRTUAL work earlier in boot The KVM code has some repeated bugs in it around use of __pa() on per-cpu data. Those data are not in an area on which using __pa() is valid. However, they are also called early enough in boot that __vmalloc_start_set is not set, and thus the CONFIG_DEBUG_VIRTUAL debugging does not catch them. This adds a check to also verify __pa() calls against max_low_pfn, which we can use earler in boot than is_vmalloc_addr(). However, if we are super-early in boot, max_low_pfn=0 and this will trip on every call, so also make sure that max_low_pfn is set before we try to use it. With this patch applied, CONFIG_DEBUG_VIRTUAL will actually catch the bug I was chasing (and fix later in this series). I'd love to find a generic way so that any __pa() call on percpu areas could do a BUG_ON(), but there don't appear to be any nice and easy ways to check if an address is a percpu one. Anybody have ideas on a way to do this? Signed-off-by: Dave Hansen Link: http://lkml.kernel.org/r/20130122212430.F46F8159@kernel.stglabs.ibm.com Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm/numa.c') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 2d125be1bae9..76604eb9e4b0 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -219,7 +219,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end) */ nd = alloc_remap(nid, nd_size); if (nd) { - nd_pa = __pa(nd); + nd_pa = __phys_addr_nodebug(nd); remapped = true; } else { nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); -- cgit v1.2.3 From 1e9209edc71b851d81f0316ca03a0e6335c0ef9a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 27 Jan 2013 01:18:21 +0100 Subject: x86/numa: Use __pa_nodebug() instead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... and fix the following warning: arch/x86/mm/numa.c: In function ‘setup_node_data’: arch/x86/mm/numa.c:222:3: warning: passing argument 1 of ‘__phys_addr_nodebug’ makes integer from pointer without a cast Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Link: http://lkml.kernel.org/r/1359245901-8512-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/mm/numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm/numa.c') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 76604eb9e4b0..b2313c6739f5 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -219,7 +219,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end) */ nd = alloc_remap(nid, nd_size); if (nd) { - nd_pa = __phys_addr_nodebug(nd); + nd_pa = __pa_nodebug(nd); remapped = true; } else { nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); -- cgit v1.2.3 From f03574f2d5b2d6229dcdf2d322848065f72953c7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 30 Jan 2013 16:56:16 -0800 Subject: x86-32, mm: Rip out x86_32 NUMA remapping code This code was an optimization for 32-bit NUMA systems. It has probably been the cause of a number of subtle bugs over the years, although the conditions to excite them would have been hard to trigger. Essentially, we remap part of the kernel linear mapping area, and then sometimes part of that area gets freed back in to the bootmem allocator. 
If those pages get used by kernel data structures (say mem_map[] or a dentry), there's no big deal. But, if anyone ever tried to use the linear mapping for these pages _and_ cared about their physical address, bad things happen. For instance, say you passed __GFP_ZERO to the page allocator and then happened to get handed one of these pages, it zero the remapped page, but it would make a pte to the _old_ page. There are probably a hundred other ways that it could screw with things. We don't need to hang on to performance optimizations for these old boxes any more. All my 32-bit NUMA systems are long dead and buried, and I probably had access to more than most people. This code is causing real things to break today: https://lkml.org/lkml/2013/1/9/376 I looked in to actually fixing this, but it requires surgery to way too much brittle code, as well as stuff like per_cpu_ptr_to_phys(). [ hpa: Cc: this for -stable, since it is a memory corruption issue. However, an alternative is to simply mark NUMA as depends BROKEN rather than EXPERIMENTAL in the X86_32 subclause... ] Link: http://lkml.kernel.org/r/20130131005616.1C79F411@kernel.stglabs.ibm.com Signed-off-by: H. Peter Anvin Cc: --- arch/x86/Kconfig | 4 -- arch/x86/mm/numa.c | 3 - arch/x86/mm/numa_32.c | 161 -------------------------------------------- arch/x86/mm/numa_internal.h | 6 -- 4 files changed, 174 deletions(-) (limited to 'arch/x86/mm/numa.c') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 79795af59810..108efcb21c9e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1253,10 +1253,6 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. -config HAVE_ARCH_ALLOC_REMAP - def_bool y - depends on X86_32 && NUMA - config ARCH_HAVE_MEMORY_PRESENT def_bool y depends on X86_32 && DISCONTIGMEM diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index b2313c6739f5..61c2b6f5ff88 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -205,9 +205,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end) if (end && (end - start) < NODE_MIN_SIZE) return; - /* initialize remap allocator before aligning to ZONE_ALIGN */ - init_alloc_remap(nid, start, end); - start = roundup(start, ZONE_ALIGN); printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 534255a36b6b..73a6d7395bd3 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -73,167 +73,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, extern unsigned long highend_pfn, highstart_pfn; -#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) - -static void *node_remap_start_vaddr[MAX_NUMNODES]; -void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - -/* - * Remap memory allocator - */ -static unsigned long node_remap_start_pfn[MAX_NUMNODES]; -static void *node_remap_end_vaddr[MAX_NUMNODES]; -static void *node_remap_alloc_vaddr[MAX_NUMNODES]; - -/** - * alloc_remap - Allocate remapped memory - * @nid: NUMA node to allocate memory from - * @size: The size of allocation - * - * Allocate @size bytes from the remap area of NUMA node @nid. The - * size of the remap area is predetermined by init_alloc_remap() and - * only the callers considered there should call this function. For - * more info, please read the comment on top of init_alloc_remap(). 
- * - * The caller must be ready to handle allocation failure from this - * function and fall back to regular memory allocator in such cases. - * - * CONTEXT: - * Single CPU early boot context. - * - * RETURNS: - * Pointer to the allocated memory on success, %NULL on failure. - */ -void *alloc_remap(int nid, unsigned long size) -{ - void *allocation = node_remap_alloc_vaddr[nid]; - - size = ALIGN(size, L1_CACHE_BYTES); - - if (!allocation || (allocation + size) > node_remap_end_vaddr[nid]) - return NULL; - - node_remap_alloc_vaddr[nid] += size; - memset(allocation, 0, size); - - return allocation; -} - -#ifdef CONFIG_HIBERNATION -/** - * resume_map_numa_kva - add KVA mapping to the temporary page tables created - * during resume from hibernation - * @pgd_base - temporary resume page directory - */ -void resume_map_numa_kva(pgd_t *pgd_base) -{ - int node; - - for_each_online_node(node) { - unsigned long start_va, start_pfn, nr_pages, pfn; - - start_va = (unsigned long)node_remap_start_vaddr[node]; - start_pfn = node_remap_start_pfn[node]; - nr_pages = (node_remap_end_vaddr[node] - - node_remap_start_vaddr[node]) >> PAGE_SHIFT; - - printk(KERN_DEBUG "%s: node %d\n", __func__, node); - - for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) { - unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); - pgd_t *pgd = pgd_base + pgd_index(vaddr); - pud_t *pud = pud_offset(pgd, vaddr); - pmd_t *pmd = pmd_offset(pud, vaddr); - - set_pmd(pmd, pfn_pmd(start_pfn + pfn, - PAGE_KERNEL_LARGE_EXEC)); - - printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n", - __func__, vaddr, start_pfn + pfn); - } - } -} -#endif - -/** - * init_alloc_remap - Initialize remap allocator for a NUMA node - * @nid: NUMA node to initizlie remap allocator for - * - * NUMA nodes may end up without any lowmem. As allocating pgdat and - * memmap on a different node with lowmem is inefficient, a special - * remap allocator is implemented which can be used by alloc_remap(). - * - * For each node, the amount of memory which will be necessary for - * pgdat and memmap is calculated and two memory areas of the size are - * allocated - one in the node and the other in lowmem; then, the area - * in the node is remapped to the lowmem area. - * - * As pgdat and memmap must be allocated in lowmem anyway, this - * doesn't waste lowmem address space; however, the actual lowmem - * which gets remapped over is wasted. The amount shouldn't be - * problematic on machines this feature will be used. - * - * Initialization failure isn't fatal. alloc_remap() is used - * opportunistically and the callers will fall back to other memory - * allocation mechanisms on failure. - */ -void __init init_alloc_remap(int nid, u64 start, u64 end) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long end_pfn = end >> PAGE_SHIFT; - unsigned long size, pfn; - u64 node_pa, remap_pa; - void *remap_va; - - /* - * The acpi/srat node info can show hot-add memroy zones where - * memory could be added but not currently present. 
- */ - printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", - nid, start_pfn, end_pfn); - - /* calculate the necessary space aligned to large page size */ - size = node_memmap_size_bytes(nid, start_pfn, end_pfn); - size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); - size = ALIGN(size, LARGE_PAGE_BYTES); - - /* allocate node memory and the lowmem remap area */ - node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES); - if (!node_pa) { - pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", - size, nid); - return; - } - memblock_reserve(node_pa, size); - - remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, - max_low_pfn << PAGE_SHIFT, - size, LARGE_PAGE_BYTES); - if (!remap_pa) { - pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", - size, nid); - memblock_free(node_pa, size); - return; - } - memblock_reserve(remap_pa, size); - remap_va = phys_to_virt(remap_pa); - - /* perform actual remap */ - for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE) - set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT), - (node_pa >> PAGE_SHIFT) + pfn, - PAGE_KERNEL_LARGE); - - /* initialize remap allocator parameters */ - node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; - node_remap_start_vaddr[nid] = remap_va; - node_remap_end_vaddr[nid] = remap_va + size; - node_remap_alloc_vaddr[nid] = remap_va; - - printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n", - nid, node_pa, node_pa + size, remap_va, remap_va + size); -} - void __init initmem_init(void) { x86_numa_init(); diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h index 7178c3afe05e..ad86ec91e640 100644 --- a/arch/x86/mm/numa_internal.h +++ b/arch/x86/mm/numa_internal.h @@ -21,12 +21,6 @@ void __init numa_reset_distance(void); void __init x86_numa_init(void); -#ifdef CONFIG_X86_64 -static inline void init_alloc_remap(int nid, u64 start, u64 end) { } -#else -void __init init_alloc_remap(int nid, u64 start, u64 end); -#endif - #ifdef CONFIG_NUMA_EMU void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt); -- cgit v1.2.3 From 07f4207a305c834f528d08428df4531744e25678 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 31 Jan 2013 14:00:48 -0800 Subject: x86-32, mm: Remove reference to alloc_remap() We have removed the remap allocator for x86-32, and x86-64 never had it (and doesn't need it). Remove residual reference to it. Reported-by: Yinghai Lu Signed-off-by: H. Peter Anvin Cc: Dave Hansen Cc: Link: http://lkml.kernel.org/r/CAE9FiQVn6_QZi3fNQ-JHYiR-7jeDJ5hT0SyT_%2BzVvfOj=PzF3w@mail.gmail.com --- arch/x86/mm/numa.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'arch/x86/mm/numa.c') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 61c2b6f5ff88..8504f3698753 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -193,7 +193,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) static void __init setup_node_data(int nid, u64 start, u64 end) { const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); - bool remapped = false; u64 nd_pa; void *nd; int tnid; @@ -211,28 +210,22 @@ static void __init setup_node_data(int nid, u64 start, u64 end) nid, start, end - 1); /* - * Allocate node data. Try remap allocator first, node-local - * memory and then any node. Never allocate in DMA zone. + * Allocate node data. Try node-local memory and then any node. + * Never allocate in DMA zone. 
*/ - nd = alloc_remap(nid, nd_size); - if (nd) { - nd_pa = __pa_nodebug(nd); - remapped = true; - } else { - nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); - if (!nd_pa) { - pr_err("Cannot find %zu bytes in node %d\n", - nd_size, nid); - return; - } - nd = __va(nd_pa); + nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); + if (!nd_pa) { + pr_err("Cannot find %zu bytes in node %d\n", + nd_size, nid); + return; } + nd = __va(nd_pa); /* report and initialize */ - printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]%s\n", - nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : ""); + printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]\n", + nd_pa, nd_pa + nd_size - 1); tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); - if (!remapped && tnid != nid) + if (tnid != nid) printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); node_data[nid] = nd; -- cgit v1.2.3 From c4c605246452d0e578945ea95a8e72877e97e8c6 Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Fri, 22 Feb 2013 16:33:24 -0800 Subject: cpu_hotplug: clear apicid to node when the cpu is hotremoved When a cpu is hotpluged, we call acpi_map_cpu2node() in _acpi_map_lsapic() to store the cpu's node and apicid's node. But we don't clear the cpu's node in acpi_unmap_lsapic() when this cpu is hotremoved. If the node is also hotremoved, we will get the following messages: kernel BUG at include/linux/gfp.h:329! invalid opcode: 0000 [#1] SMP Modules linked in: ebtable_nat ebtables ipt_MASQUERADE iptable_nat nf_nat xt_CHECKSUM iptable_mangle bridge stp llc sunrpc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables binfmt_misc dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun uinput iTCO_wdt iTCO_vendor_support coretemp kvm_intel kvm crc32c_intel microcode pcspkr i2c_i801 i2c_core lpc_ich mfd_core ioatdma e1000e i7core_edac edac_core sg acpi_memhotplug igb dca sd_mod crc_t10dif megaraid_sas mptsas mptscsih mptbase scsi_transport_sas scsi_mod Pid: 3126, comm: init Not tainted 3.6.0-rc3-tangchen-hostbridge+ #13 FUJITSU-SV PRIMEQUEST 1800E/SB RIP: 0010:[] [] allocate_slab+0x28d/0x300 RSP: 0018:ffff88078a049cf8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000001 RDI: 0000000000000246 RBP: ffff88078a049d38 R08: 00000000000040d0 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000b5f R12: 00000000000052d0 R13: ffff8807c1417300 R14: 0000000000030038 R15: 0000000000000003 FS: 00007fa9b1b44700(0000) GS:ffff8807c3800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007fa9b09acca0 CR3: 000000078b855000 CR4: 00000000000007e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process init (pid: 3126, threadinfo ffff88078a048000, task ffff8807bb6f2650) Call Trace: new_slab+0x30/0x1b0 __slab_alloc+0x358/0x4c0 kmem_cache_alloc_node_trace+0xb4/0x1e0 alloc_fair_sched_group+0xd0/0x1b0 sched_create_group+0x3e/0x110 sched_autogroup_create_attach+0x4d/0x180 sys_setsid+0xd4/0xf0 system_call_fastpath+0x16/0x1b Code: 89 c4 e9 73 fe ff ff 31 c0 89 de 48 c7 c7 45 de 9e 81 44 89 45 c8 e8 22 05 4b 00 85 db 44 8b 45 c8 0f 89 4f ff ff ff 0f 0b eb fe <0f> 0b 90 eb fd 0f 0b eb fe 89 de 48 c7 c7 45 de 9e 81 31 c0 44 RIP [] allocate_slab+0x28d/0x300 RSP ---[ end trace adf84c90f3fea3e5 ]--- The reason is that the cpu's node is not 
NUMA_NO_NODE, we will call alloc_pages_exact_node() to alloc memory on the node, but the node is offlined. If the node is onlined, we still need cpu's node. For example: a task on the cpu is sleeped when the cpu is hotremoved. We will choose another cpu to run this task when it is waked up. If we know the cpu's node, we will choose the cpu on the same node first. So we should clear cpu-to-node mapping when the node is offlined. This patch only clears apicid-to-node mapping when the cpu is hotremoved. [akpm@linux-foundation.org: fix section error] Signed-off-by: Wen Congyang Signed-off-by: Tang Chen Cc: Yasuaki Ishimatsu Cc: David Rientjes Cc: Jiang Liu Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/acpi/boot.c | 4 ++++ arch/x86/mm/numa.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm/numa.c') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index cfc755dc1607..230c8ea878e5 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -696,6 +696,10 @@ EXPORT_SYMBOL(acpi_map_lsapic); int acpi_unmap_lsapic(int cpu) { +#ifdef CONFIG_ACPI_NUMA + set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE); +#endif + per_cpu(x86_cpu_to_apicid, cpu) = -1; set_cpu_present(cpu, false); num_processors--; diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 8504f3698753..f22680bf69ec 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -56,7 +56,7 @@ early_param("numa", numa_setup); /* * apicid, cpu, node mappings */ -s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { +s16 __apicid_to_node[MAX_LOCAL_APIC] = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; -- cgit v1.2.3 From e13fe8695c57fed678877a9f3f8e99fc637ff4fb Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Fri, 22 Feb 2013 16:33:31 -0800 Subject: cpu-hotplug,memory-hotplug: clear cpu_to_node() when offlining the node When the node is offlined, there is no memory/cpu on the node. If a sleep task runs on a cpu of this node, it will be migrated to the cpu on the other node. So we can clear cpu-to-node mapping. [akpm@linux-foundation.org: numa_clear_node() and numa_set_node() can no longer be __cpuinit] Signed-off-by: Wen Congyang Signed-off-by: Tang Chen Cc: Yasuaki Ishimatsu Cc: David Rientjes Cc: Jiang Liu Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/numa.h | 4 ++-- arch/x86/mm/numa.c | 4 ++-- mm/memory_hotplug.c | 30 +++++++++++++++++++++++++++++- 3 files changed, 33 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm/numa.c') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 52560a2038e1..1b99ee5c9f00 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -57,8 +57,8 @@ static inline int numa_cpu_node(int cpu) #endif #ifdef CONFIG_NUMA -extern void __cpuinit numa_set_node(int cpu, int node); -extern void __cpuinit numa_clear_node(int cpu); +extern void numa_set_node(int cpu, int node); +extern void numa_clear_node(int cpu); extern void __init init_cpu_to_node(void); extern void __cpuinit numa_add_cpu(int cpu); extern void __cpuinit numa_remove_cpu(int cpu); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index f22680bf69ec..a713d081b59c 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -78,7 +78,7 @@ EXPORT_SYMBOL(node_to_cpumask_map); DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); -void __cpuinit numa_set_node(int cpu, int node) +void numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); @@ -101,7 +101,7 @@ void __cpuinit numa_set_node(int cpu, int node) set_cpu_numa_node(cpu, node); } -void __cpuinit numa_clear_node(int cpu) +void numa_clear_node(int cpu) { numa_set_node(cpu, NUMA_NO_NODE); } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 17e1447077ab..a9072a16a160 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1705,6 +1705,34 @@ static int check_cpu_on_node(void *data) return 0; } +static void unmap_cpu_on_node(void *data) +{ +#ifdef CONFIG_ACPI_NUMA + struct pglist_data *pgdat = data; + int cpu; + + for_each_possible_cpu(cpu) + if (cpu_to_node(cpu) == pgdat->node_id) + numa_clear_node(cpu); +#endif +} + +static int check_and_unmap_cpu_on_node(void *data) +{ + int ret = check_cpu_on_node(data); + + if (ret) + return ret; + + /* + * the node will be offlined when we come here, so we can clear + * the cpu_to_node() now. + */ + + unmap_cpu_on_node(data); + return 0; +} + /* offline the node if all memory sections of this node are removed */ void try_offline_node(int nid) { @@ -1731,7 +1759,7 @@ void try_offline_node(int nid) return; } - if (stop_machine(check_cpu_on_node, pgdat, NULL)) + if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL)) return; /* -- cgit v1.2.3 From 4d59a75125d5a4717e57e9fc62c64b3d346e603e Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Fri, 22 Feb 2013 16:33:35 -0800 Subject: x86: get pg_data_t's memory from other node During the implementation of SRAT support, we met a problem. In setup_arch(), we have the following call series: 1) memblock is ready; 2) some functions use memblock to allocate memory; 3) parse ACPI tables, such as SRAT. Before 3), we don't know which memory is hotpluggable, and as a result, we cannot prevent memblock from allocating hotpluggable memory. So, in 2), there could be some hotpluggable memory allocated by memblock. Now, we are trying to parse SRAT earlier, before memblock is ready. But I think we need more investigation on this topic. So in this v5, I dropped all the SRAT support, and v5 is just the same as v3, and it is based on 3.8-rc3. As we planned, we will support getting info from SRAT without users' participation at last. 
And we will post another patch-set to do so. And also, I think for now, we can add this boot option as the first step of supporting movable node. Since Linux cannot migrate the direct mapped pages, the only way for now is to limit the whole node containing only movable memory. Using SRAT is one way. But even if we can use SRAT, users still need an interface to enable/disable this functionality if they don't want to loose their NUMA performance. So I think, a user interface is always needed. For now, users can disable this functionality by not specifying the boot option. Later, we will post SRAT support, and add another option value "movablecore_map=acpi" to using SRAT. This patch: If system can create movable node which all memory of the node is allocated as ZONE_MOVABLE, setup_node_data() cannot allocate memory for the node's pg_data_t. So, use memblock_alloc_try_nid() instead of memblock_alloc_nid() to retry when the first allocation fails. Signed-off-by: Yasuaki Ishimatsu Signed-off-by: Lai Jiangshan Signed-off-by: Tang Chen Signed-off-by: Jiang Liu Cc: Wu Jianguo Cc: Wen Congyang Cc: Mel Gorman Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/numa.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm/numa.c') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index a713d081b59c..e3963f52aaea 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -213,10 +213,9 @@ static void __init setup_node_data(int nid, u64 start, u64 end) * Allocate node data. Try node-local memory and then any node. * Never allocate in DMA zone. */ - nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); + nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); if (!nd_pa) { - pr_err("Cannot find %zu bytes in node %d\n", - nd_size, nid); + pr_err("Cannot find %zu bytes in any node\n", nd_size); return; } nd = __va(nd_pa); -- cgit v1.2.3 From e8d1955258091e4c92d5a975ebd7fd8a98f5d30f Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Fri, 22 Feb 2013 16:33:44 -0800 Subject: acpi, memory-hotplug: parse SRAT before memblock is ready On linux, the pages used by kernel could not be migrated. As a result, if a memory range is used by kernel, it cannot be hot-removed. So if we want to hot-remove memory, we should prevent kernel from using it. The way now used to prevent this is specify a memory range by movablemem_map boot option and set it as ZONE_MOVABLE. But when the system is booting, memblock will allocate memory, and reserve the memory for kernel. And before we parse SRAT, and know the node memory ranges, memblock is working. And it may allocate memory in ranges to be set as ZONE_MOVABLE. This memory can be used by kernel, and never be freed. So, let's parse SRAT before memblock is called first. And it is early enough. The first call of memblock_find_in_range_node() is in: setup_arch() |-->setup_real_mode() so, this patch add a function early_parse_srat() to parse SRAT, and call it before setup_real_mode() is called. NOTE: 1) early_parse_srat() is called before numa_init(), and has initialized numa_meminfo. So DO NOT clear numa_nodes_parsed in numa_init() and DO NOT zero numa_meminfo in numa_init(), otherwise we will lose memory numa info. 2) I don't know why using count of memory affinities parsed from SRAT as a return value in original acpi_numa_init(). 
So I add a static variable srat_mem_cnt to remember this count and use it as the return value of the new acpi_numa_init() [mhocko@suse.cz: parse SRAT before memblock is ready fix] Signed-off-by: Tang Chen Reviewed-by: Wen Congyang Cc: KOSAKI Motohiro Cc: Jiang Liu Cc: Jianguo Wu Cc: Kamezawa Hiroyuki Cc: Lai Jiangshan Cc: Wu Jianguo Cc: Yasuaki Ishimatsu Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Len Brown Cc: "Brown, Len" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/setup.c | 13 +++++++++---- arch/x86/mm/numa.c | 6 ++++-- drivers/acpi/numa.c | 23 +++++++++++++---------- include/linux/acpi.h | 8 ++++++++ 4 files changed, 34 insertions(+), 16 deletions(-) (limited to 'arch/x86/mm/numa.c') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 915f5efefcf5..9c857f05cef0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1056,6 +1056,15 @@ void __init setup_arch(char **cmdline_p) setup_bios_corruption_check(); #endif + /* + * In the memory hotplug case, the kernel needs info from SRAT to + * determine which memory is hotpluggable before allocating memory + * using memblock. + */ + acpi_boot_table_init(); + early_acpi_boot_init(); + early_parse_srat(); + #ifdef CONFIG_X86_32 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", (max_pfn_mapped< Date: Fri, 22 Feb 2013 15:11:47 -0800 Subject: x86/mm/numa: Don't check if node is NUMA_NO_NODE If we aren't debugging per_cpu maps, the cpu's node is stored in per_cpu variable numa_node. If `node' is NUMA_NO_NODE, it means the caller wants to clear the cpu's node. So we should also call set_cpu_numa_node() in this case. Signed-off-by: Wen Congyang Cc: Len Brown Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: "H. Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/mm/numa.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/mm/numa.c') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 2d125be1bae9..21d02f0d7a2c 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -97,8 +97,7 @@ void __cpuinit numa_set_node(int cpu, int node) #endif per_cpu(x86_cpu_to_node_map, cpu) = node; - if (node != NUMA_NO_NODE) - set_cpu_numa_node(cpu, node); + set_cpu_numa_node(cpu, node); } void __cpuinit numa_clear_node(int cpu) -- cgit v1.2.3 From 20e6926dcbafa1b361f1c29d967688be14b6ca4b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 1 Mar 2013 14:51:27 -0800 Subject: x86, ACPI, mm: Revert movablemem_map support Tim found: WARNING: at arch/x86/kernel/smpboot.c:324 topology_sane.isra.2+0x6f/0x80() Hardware name: S2600CP sched: CPU #1's llc-sibling CPU #0 is not on the same node! [node: 1 != 0]. Ignoring dependency. smpboot: Booting Node 1, Processors #1 Modules linked in: Pid: 0, comm: swapper/1 Not tainted 3.9.0-0-generic #1 Call Trace: set_cpu_sibling_map+0x279/0x449 start_secondary+0x11d/0x1e5 Don Morris reproduced on a HP z620 workstation, and bisected it to commit e8d195525809 ("acpi, memory-hotplug: parse SRAT before memblock is ready") It turns out movable_map has some problems, and it breaks several things 1. numa_init is called several times, NOT just for srat. so those nodes_clear(numa_nodes_parsed) memset(&numa_meminfo, 0, sizeof(numa_meminfo)) can not be just removed. Need to consider sequence is: numaq, srat, amd, dummy. and make fall back path working. 2. simply split acpi_numa_init to early_parse_srat. a. 
that early_parse_srat is NOT called for ia64, so you break ia64. b. for (i = 0; i < MAX_LOCAL_APIC; i++) set_apicid_to_node(i, NUMA_NO_NODE) still left in numa_init. So it will just clear result from early_parse_srat. it should be moved before that.... c. it breaks ACPI_TABLE_OVERIDE...as the acpi table scan is moved early before override from INITRD is settled. 3. that patch TITLE is total misleading, there is NO x86 in the title, but it changes critical x86 code. It caused x86 guys did not pay attention to find the problem early. Those patches really should be routed via tip/x86/mm. 4. after that commit, following range can not use movable ram: a. real_mode code.... well..funny, legacy Node0 [0,1M) could be hot-removed? b. initrd... it will be freed after booting, so it could be on movable... c. crashkernel for kdump...: looks like we can not put kdump kernel above 4G anymore. d. init_mem_mapping: can not put page table high anymore. e. initmem_init: vmemmap can not be high local node anymore. That is not good. If node is hotplugable, the mem related range like page table and vmemmap could be on the that node without problem and should be on that node. We have workaround patch that could fix some problems, but some can not be fixed. So just remove that offending commit and related ones including: f7210e6c4ac7 ("mm/memblock.c: use CONFIG_HAVE_MEMBLOCK_NODE_MAP to protect movablecore_map in memblock_overlaps_region().") 01a178a94e8e ("acpi, memory-hotplug: support getting hotplug info from SRAT") 27168d38fa20 ("acpi, memory-hotplug: extend movablemem_map ranges to the end of node") e8d195525809 ("acpi, memory-hotplug: parse SRAT before memblock is ready") fb06bc8e5f42 ("page_alloc: bootmem limit with movablecore_map") 42f47e27e761 ("page_alloc: make movablemem_map have higher priority") 6981ec31146c ("page_alloc: introduce zone_movable_limit[] to keep movable limit for nodes") 34b71f1e04fc ("page_alloc: add movable_memmap kernel parameter") 4d59a75125d5 ("x86: get pg_data_t's memory from other node") Later we should have patches that will make sure kernel put page table and vmemmap on local node ram instead of push them down to node0. Also need to find way to put other kernel used ram to local node ram. Reported-by: Tim Gardner Reported-by: Don Morris Bisected-by: Don Morris Tested-by: Don Morris Signed-off-by: Yinghai Lu Cc: Tony Luck Cc: Thomas Renninger Cc: Tejun Heo Cc: Tang Chen Cc: Yasuaki Ishimatsu Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 36 ----- arch/x86/kernel/setup.c | 13 +- arch/x86/mm/numa.c | 11 +- arch/x86/mm/srat.c | 125 +--------------- drivers/acpi/numa.c | 23 ++- include/linux/acpi.h | 8 - include/linux/memblock.h | 2 - include/linux/mm.h | 18 --- mm/memblock.c | 50 ------- mm/page_alloc.c | 285 +----------------------------------- 10 files changed, 27 insertions(+), 544 deletions(-) (limited to 'arch/x86/mm/numa.c') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e567af39ee34..3a54fca730c0 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1645,42 +1645,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. that the amount of memory usable for all allocations is not too small. - movablemem_map=acpi - [KNL,X86,IA-64,PPC] This parameter is similar to - memmap except it specifies the memory map of - ZONE_MOVABLE. 
- This option inform the kernel to use Hot Pluggable bit - in flags from SRAT from ACPI BIOS to determine which - memory devices could be hotplugged. The corresponding - memory ranges will be set as ZONE_MOVABLE. - NOTE: Whatever node the kernel resides in will always - be un-hotpluggable. - - movablemem_map=nn[KMG]@ss[KMG] - [KNL,X86,IA-64,PPC] This parameter is similar to - memmap except it specifies the memory map of - ZONE_MOVABLE. - If user specifies memory ranges, the info in SRAT will - be ingored. And it works like the following: - - If more ranges are all within one node, then from - lowest ss to the end of the node will be ZONE_MOVABLE. - - If a range is within a node, then from ss to the end - of the node will be ZONE_MOVABLE. - - If a range covers two or more nodes, then from ss to - the end of the 1st node will be ZONE_MOVABLE, and all - the rest nodes will only have ZONE_MOVABLE. - If memmap is specified at the same time, the - movablemem_map will be limited within the memmap - areas. If kernelcore or movablecore is also specified, - movablemem_map will have higher priority to be - satisfied. So the administrator should be careful that - the amount of movablemem_map areas are not too large. - Otherwise kernel won't have enough memory to start. - NOTE: We don't stop users specifying the node the - kernel resides in as hotpluggable so that this - option can be used as a workaround of firmware - bugs. - MTD_Partition= [MTD] Format: ,,, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e89acdf6b77b..84d32855f65c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1056,15 +1056,6 @@ void __init setup_arch(char **cmdline_p) setup_bios_corruption_check(); #endif - /* - * In the memory hotplug case, the kernel needs info from SRAT to - * determine which memory is hotpluggable before allocating memory - * using memblock. - */ - acpi_boot_table_init(); - early_acpi_boot_init(); - early_parse_srat(); - #ifdef CONFIG_X86_32 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", (max_pfn_mapped<cnt; i++) { - if (end <= rgn->regions[i].base || - start >= rgn->regions[i].base + - rgn->regions[i].size) - continue; - - /* - * If the memory range overlaps the memory reserved by - * memblock, then the kernel resides in this node. - */ - node_set(node, movablemem_map.numa_nodes_kernel); - - goto out; - } - - /* - * If the kernel resides in this node, then the whole node - * should not be hotpluggable. - */ - if (node_isset(node, movablemem_map.numa_nodes_kernel)) - goto out; - - insert_movablemem_map(start_pfn, end_pfn); - - /* - * numa_nodes_hotplug nodemask represents which nodes are put - * into movablemem_map.map[]. - */ - node_set(node, movablemem_map.numa_nodes_hotplug); - goto out; - } - - /* - * For movablemem_map=nn[KMG]@ss[KMG]: - * - * SRAT: |_____| |_____| |_________| |_________| ...... - * node id: 0 1 1 2 - * user specified: |__| |___| - * movablemem_map: |___| |_________| |______| ...... - * - * Using movablemem_map, we can prevent memblock from allocating memory - * on ZONE_MOVABLE at boot time. - * - * NOTE: In this case, SRAT info will be ingored. - */ - overlap = movablemem_map_overlap(start_pfn, end_pfn); - if (overlap >= 0) { - /* - * If part of this range is in movablemem_map, we need to - * add the range after it to extend the range to the end - * of the node, because from the min address specified to - * the end of the node will be ZONE_MOVABLE. 
- */ - start_pfn = max(start_pfn, - movablemem_map.map[overlap].start_pfn); - insert_movablemem_map(start_pfn, end_pfn); - - /* - * Set the nodemask, so that if the address range on one node - * is not continuse, we can add the subsequent ranges on the - * same node into movablemem_map. - */ - node_set(node, movablemem_map.numa_nodes_hotplug); - } else { - if (node_isset(node, movablemem_map.numa_nodes_hotplug)) - /* - * Insert the range if we already have movable ranges - * on the same node. - */ - insert_movablemem_map(start_pfn, end_pfn); - } -out: - return; -} -#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -static inline void -handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable) -{ -} -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ int __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { u64 start, end; - u32 hotpluggable; int node, pxm; if (srat_disabled()) @@ -269,8 +154,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) goto out_err_bad_srat; if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) goto out_err; - hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE; - if (hotpluggable && !save_add_info()) + if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) goto out_err; start = ma->base_address; @@ -290,12 +174,9 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n", + printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", node, pxm, - (unsigned long long) start, (unsigned long long) end - 1, - hotpluggable ? "Hot Pluggable": ""); - - handle_movablemem(node, start, end, hotpluggable); + (unsigned long long) start, (unsigned long long) end - 1); return 0; out_err_bad_srat: diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 59844ee149be..33e609f63585 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -282,10 +282,10 @@ acpi_table_parse_srat(enum acpi_srat_type id, handler, max_entries); } -static int srat_mem_cnt; - -void __init early_parse_srat(void) +int __init acpi_numa_init(void) { + int cnt = 0; + /* * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= * SRAT cpu entries could have different order with that in MADT. 
@@ -295,24 +295,21 @@ void __init early_parse_srat(void) /* SRAT: Static Resource Affinity Table */ if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, - acpi_parse_x2apic_affinity, 0); + acpi_parse_x2apic_affinity, 0); acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, - acpi_parse_processor_affinity, 0); - srat_mem_cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, - acpi_parse_memory_affinity, - NR_NODE_MEMBLKS); + acpi_parse_processor_affinity, 0); + cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, + acpi_parse_memory_affinity, + NR_NODE_MEMBLKS); } -} -int __init acpi_numa_init(void) -{ /* SLIT: System Locality Information Table */ acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); acpi_numa_arch_fixup(); - if (srat_mem_cnt < 0) - return srat_mem_cnt; + if (cnt < 0) + return cnt; else if (!parsed_numa_memblks) return -ENOENT; return 0; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index f46cfd73a553..bcbdd7484e58 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -485,14 +485,6 @@ static inline bool acpi_driver_match_device(struct device *dev, #endif /* !CONFIG_ACPI */ -#ifdef CONFIG_ACPI_NUMA -void __init early_parse_srat(void); -#else -static inline void early_parse_srat(void) -{ -} -#endif - #ifdef CONFIG_ACPI void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state, u32 pm1a_ctrl, u32 pm1b_ctrl)); diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 3e5ecb2d790e..f388203db7e8 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -42,7 +42,6 @@ struct memblock { extern struct memblock memblock; extern int memblock_debug; -extern struct movablemem_map movablemem_map; #define memblock_dbg(fmt, ...) \ if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) @@ -61,7 +60,6 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size); void memblock_trim_memory(phys_addr_t align); #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, unsigned long *out_end_pfn, int *out_nid); diff --git a/include/linux/mm.h b/include/linux/mm.h index e7c3f9a0111a..1ede55f292c2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1333,24 +1333,6 @@ extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); extern void sparse_memory_present_with_active_regions(int nid); -#define MOVABLEMEM_MAP_MAX MAX_NUMNODES -struct movablemem_entry { - unsigned long start_pfn; /* start pfn of memory segment */ - unsigned long end_pfn; /* end pfn of memory segment (exclusive) */ -}; - -struct movablemem_map { - bool acpi; /* true if using SRAT info */ - int nr_map; - struct movablemem_entry map[MOVABLEMEM_MAP_MAX]; - nodemask_t numa_nodes_hotplug; /* on which nodes we specify memory */ - nodemask_t numa_nodes_kernel; /* on which nodes kernel resides in */ -}; - -extern void __init insert_movablemem_map(unsigned long start_pfn, - unsigned long end_pfn); -extern int __init movablemem_map_overlap(unsigned long start_pfn, - unsigned long end_pfn); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ diff --git a/mm/memblock.c b/mm/memblock.c index 1bcd9b970564..b8d9147e5c08 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -92,58 +92,9 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, * * Find @size free area aligned to @align in the specified range and node. 
* - * If we have CONFIG_HAVE_MEMBLOCK_NODE_MAP defined, we need to check if the - * memory we found if not in hotpluggable ranges. - * * RETURNS: * Found address on success, %0 on failure. */ -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, - phys_addr_t end, phys_addr_t size, - phys_addr_t align, int nid) -{ - phys_addr_t this_start, this_end, cand; - u64 i; - int curr = movablemem_map.nr_map - 1; - - /* pump up @end */ - if (end == MEMBLOCK_ALLOC_ACCESSIBLE) - end = memblock.current_limit; - - /* avoid allocating the first page */ - start = max_t(phys_addr_t, start, PAGE_SIZE); - end = max(start, end); - - for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { - this_start = clamp(this_start, start, end); - this_end = clamp(this_end, start, end); - -restart: - if (this_end <= this_start || this_end < size) - continue; - - for (; curr >= 0; curr--) { - if ((movablemem_map.map[curr].start_pfn << PAGE_SHIFT) - < this_end) - break; - } - - cand = round_down(this_end - size, align); - if (curr >= 0 && - cand < movablemem_map.map[curr].end_pfn << PAGE_SHIFT) { - this_end = movablemem_map.map[curr].start_pfn - << PAGE_SHIFT; - goto restart; - } - - if (cand >= this_start) - return cand; - } - - return 0; -} -#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align, int nid) @@ -172,7 +123,6 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, } return 0; } -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ /** * memblock_find_in_range - find free area in given range diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0dade3f18f7d..8fcced7823fa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -202,18 +202,11 @@ static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -/* Movable memory ranges, will also be used by memblock subsystem. */ -struct movablemem_map movablemem_map = { - .acpi = false, - .nr_map = 0, -}; - static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; static unsigned long __initdata required_kernelcore; static unsigned long __initdata required_movablecore; static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; -static unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES]; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -4412,77 +4405,6 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); } -/** - * sanitize_zone_movable_limit - Sanitize the zone_movable_limit array. - * - * zone_movable_limit is initialized as 0. This function will try to get - * the first ZONE_MOVABLE pfn of each node from movablemem_map, and - * assigne them to zone_movable_limit. - * zone_movable_limit[nid] == 0 means no limit for the node. 
- * - * Note: Each range is represented as [start_pfn, end_pfn) - */ -static void __meminit sanitize_zone_movable_limit(void) -{ - int map_pos = 0, i, nid; - unsigned long start_pfn, end_pfn; - - if (!movablemem_map.nr_map) - return; - - /* Iterate all ranges from minimum to maximum */ - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { - /* - * If we have found lowest pfn of ZONE_MOVABLE of the node - * specified by user, just go on to check next range. - */ - if (zone_movable_limit[nid]) - continue; - -#ifdef CONFIG_ZONE_DMA - /* Skip DMA memory. */ - if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA]) - start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA]; -#endif - -#ifdef CONFIG_ZONE_DMA32 - /* Skip DMA32 memory. */ - if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA32]) - start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA32]; -#endif - -#ifdef CONFIG_HIGHMEM - /* Skip lowmem if ZONE_MOVABLE is highmem. */ - if (zone_movable_is_highmem() && - start_pfn < arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]) - start_pfn = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]; -#endif - - if (start_pfn >= end_pfn) - continue; - - while (map_pos < movablemem_map.nr_map) { - if (end_pfn <= movablemem_map.map[map_pos].start_pfn) - break; - - if (start_pfn >= movablemem_map.map[map_pos].end_pfn) { - map_pos++; - continue; - } - - /* - * The start_pfn of ZONE_MOVABLE is either the minimum - * pfn specified by movablemem_map, or 0, which means - * the node has no ZONE_MOVABLE. - */ - zone_movable_limit[nid] = max(start_pfn, - movablemem_map.map[map_pos].start_pfn); - - break; - } - } -} - #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, @@ -4500,6 +4422,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, return zholes_size[zone_type]; } + #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, @@ -4941,19 +4864,12 @@ static void __init find_zone_movable_pfns_for_nodes(void) required_kernelcore = max(required_kernelcore, corepages); } - /* - * If neither kernelcore/movablecore nor movablemem_map is specified, - * there is no ZONE_MOVABLE. But if movablemem_map is specified, the - * start pfn of ZONE_MOVABLE has been stored in zone_movable_limit[]. - */ - if (!required_kernelcore) { - if (movablemem_map.nr_map) - memcpy(zone_movable_pfn, zone_movable_limit, - sizeof(zone_movable_pfn)); + /* If kernelcore was not specified, there is no ZONE_MOVABLE */ + if (!required_kernelcore) goto out; - } /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ + find_usable_zone_for_movable(); usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; restart: @@ -4981,24 +4897,10 @@ restart: for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { unsigned long size_pages; - /* - * Find more memory for kernelcore in - * [zone_movable_pfn[nid], zone_movable_limit[nid]). 
- */ start_pfn = max(start_pfn, zone_movable_pfn[nid]); if (start_pfn >= end_pfn) continue; - if (zone_movable_limit[nid]) { - end_pfn = min(end_pfn, zone_movable_limit[nid]); - /* No range left for kernelcore in this node */ - if (start_pfn >= end_pfn) { - zone_movable_pfn[nid] = - zone_movable_limit[nid]; - break; - } - } - /* Account for what is only usable for kernelcore */ if (start_pfn < usable_startpfn) { unsigned long kernel_pages; @@ -5058,12 +4960,12 @@ restart: if (usable_nodes && required_kernelcore > usable_nodes) goto restart; -out: /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ for (nid = 0; nid < MAX_NUMNODES; nid++) zone_movable_pfn[nid] = roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); +out: /* restore the node_state */ node_states[N_MEMORY] = saved_node_state; } @@ -5126,8 +5028,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) /* Find the PFNs that ZONE_MOVABLE begins at in each node */ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); - find_usable_zone_for_movable(); - sanitize_zone_movable_limit(); find_zone_movable_pfns_for_nodes(); /* Print out the zone ranges */ @@ -5211,181 +5111,6 @@ static int __init cmdline_parse_movablecore(char *p) early_param("kernelcore", cmdline_parse_kernelcore); early_param("movablecore", cmdline_parse_movablecore); -/** - * movablemem_map_overlap() - Check if a range overlaps movablemem_map.map[]. - * @start_pfn: start pfn of the range to be checked - * @end_pfn: end pfn of the range to be checked (exclusive) - * - * This function checks if a given memory range [start_pfn, end_pfn) overlaps - * the movablemem_map.map[] array. - * - * Return: index of the first overlapped element in movablemem_map.map[] - * or -1 if they don't overlap each other. - */ -int __init movablemem_map_overlap(unsigned long start_pfn, - unsigned long end_pfn) -{ - int overlap; - - if (!movablemem_map.nr_map) - return -1; - - for (overlap = 0; overlap < movablemem_map.nr_map; overlap++) - if (start_pfn < movablemem_map.map[overlap].end_pfn) - break; - - if (overlap == movablemem_map.nr_map || - end_pfn <= movablemem_map.map[overlap].start_pfn) - return -1; - - return overlap; -} - -/** - * insert_movablemem_map - Insert a memory range in to movablemem_map.map. - * @start_pfn: start pfn of the range - * @end_pfn: end pfn of the range - * - * This function will also merge the overlapped ranges, and sort the array - * by start_pfn in monotonic increasing order. - */ -void __init insert_movablemem_map(unsigned long start_pfn, - unsigned long end_pfn) -{ - int pos, overlap; - - /* - * pos will be at the 1st overlapped range, or the position - * where the element should be inserted. - */ - for (pos = 0; pos < movablemem_map.nr_map; pos++) - if (start_pfn <= movablemem_map.map[pos].end_pfn) - break; - - /* If there is no overlapped range, just insert the element. */ - if (pos == movablemem_map.nr_map || - end_pfn < movablemem_map.map[pos].start_pfn) { - /* - * If pos is not the end of array, we need to move all - * the rest elements backward. 
- */ - if (pos < movablemem_map.nr_map) - memmove(&movablemem_map.map[pos+1], - &movablemem_map.map[pos], - sizeof(struct movablemem_entry) * - (movablemem_map.nr_map - pos)); - movablemem_map.map[pos].start_pfn = start_pfn; - movablemem_map.map[pos].end_pfn = end_pfn; - movablemem_map.nr_map++; - return; - } - - /* overlap will be at the last overlapped range */ - for (overlap = pos + 1; overlap < movablemem_map.nr_map; overlap++) - if (end_pfn < movablemem_map.map[overlap].start_pfn) - break; - - /* - * If there are more ranges overlapped, we need to merge them, - * and move the rest elements forward. - */ - overlap--; - movablemem_map.map[pos].start_pfn = min(start_pfn, - movablemem_map.map[pos].start_pfn); - movablemem_map.map[pos].end_pfn = max(end_pfn, - movablemem_map.map[overlap].end_pfn); - - if (pos != overlap && overlap + 1 != movablemem_map.nr_map) - memmove(&movablemem_map.map[pos+1], - &movablemem_map.map[overlap+1], - sizeof(struct movablemem_entry) * - (movablemem_map.nr_map - overlap - 1)); - - movablemem_map.nr_map -= overlap - pos; -} - -/** - * movablemem_map_add_region - Add a memory range into movablemem_map. - * @start: physical start address of range - * @end: physical end address of range - * - * This function transform the physical address into pfn, and then add the - * range into movablemem_map by calling insert_movablemem_map(). - */ -static void __init movablemem_map_add_region(u64 start, u64 size) -{ - unsigned long start_pfn, end_pfn; - - /* In case size == 0 or start + size overflows */ - if (start + size <= start) - return; - - if (movablemem_map.nr_map >= ARRAY_SIZE(movablemem_map.map)) { - pr_err("movablemem_map: too many entries;" - " ignoring [mem %#010llx-%#010llx]\n", - (unsigned long long) start, - (unsigned long long) (start + size - 1)); - return; - } - - start_pfn = PFN_DOWN(start); - end_pfn = PFN_UP(start + size); - insert_movablemem_map(start_pfn, end_pfn); -} - -/* - * cmdline_parse_movablemem_map - Parse boot option movablemem_map. - * @p: The boot option of the following format: - * movablemem_map=nn[KMG]@ss[KMG] - * - * This option sets the memory range [ss, ss+nn) to be used as movable memory. - * - * Return: 0 on success or -EINVAL on failure. - */ -static int __init cmdline_parse_movablemem_map(char *p) -{ - char *oldp; - u64 start_at, mem_size; - - if (!p) - goto err; - - if (!strcmp(p, "acpi")) - movablemem_map.acpi = true; - - /* - * If user decide to use info from BIOS, all the other user specified - * ranges will be ingored. - */ - if (movablemem_map.acpi) { - if (movablemem_map.nr_map) { - memset(movablemem_map.map, 0, - sizeof(struct movablemem_entry) - * movablemem_map.nr_map); - movablemem_map.nr_map = 0; - } - return 0; - } - - oldp = p; - mem_size = memparse(p, &p); - if (p == oldp) - goto err; - - if (*p == '@') { - oldp = ++p; - start_at = memparse(p, &p); - if (p == oldp || *p != '\0') - goto err; - - movablemem_map_add_region(start_at, mem_size); - return 0; - } -err: - return -EINVAL; -} -early_param("movablemem_map", cmdline_parse_movablemem_map); - #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ /** -- cgit v1.2.3