From 3fcfab16c5b86eaa3db3a9a31adba550c5b67141 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 Oct 2006 23:28:16 -0700 Subject: [PATCH] separate bdi congestion functions from queue congestion functions Separate out the concept of "queue congestion" from "backing-dev congestion". Congestion is a backing-dev concept, not a queue concept. The blk_* congestion functions are retained, as wrappers around the core backing-dev congestion functions. This proper layering is needed so that NFS can cleanly use the congestion functions, and so that CONFIG_BLOCK=n actually links. Cc: "Thomas Maier" Cc: "Jens Axboe" Cc: Trond Myklebust Cc: David Howells Cc: Peter Osterlund Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 40db96a655d0..afee38f04d84 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -1050,7 +1051,7 @@ nofail_alloc: if (page) goto got_pg; if (gfp_mask & __GFP_NOFAIL) { - blk_congestion_wait(WRITE, HZ/50); + congestion_wait(WRITE, HZ/50); goto nofail_alloc; } } @@ -1113,7 +1114,7 @@ rebalance: do_retry = 1; } if (do_retry) { - blk_congestion_wait(WRITE, HZ/50); + congestion_wait(WRITE, HZ/50); goto rebalance; } -- cgit v1.2.3 From 6220ec7844fda2686496013a66b5b9169976b991 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 Oct 2006 23:29:05 -0700 Subject: [PATCH] highest_possible_node_id() linkage fix Qooting Adrian: - net/sunrpc/svc.c uses highest_possible_node_id() - include/linux/nodemask.h says highest_possible_node_id() is out-of-line #if MAX_NUMNODES > 1 - the out-of-line highest_possible_node_id() is in lib/cpumask.c - lib/Makefile: lib-$(CONFIG_SMP) += cpumask.o CONFIG_ARCH_DISCONTIGMEM_ENABLE=y, CONFIG_SMP=n, CONFIG_SUNRPC=y -> highest_possible_node_id() is used in net/sunrpc/svc.c CONFIG_NODES_SHIFT defined and > 0 -> include/linux/numa.h: MAX_NUMNODES > 1 -> compile error The bug is not present on architectures where ARCH_DISCONTIGMEM_ENABLE depends on NUMA (but m32r isn't the only affected architecture). So move the function into page_alloc.c Cc: Adrian Bunk Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/cpumask.c | 16 ---------------- mm/page_alloc.c | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/lib/cpumask.c b/lib/cpumask.c index 7a2a73f88d59..3a67dc5ada7d 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -43,19 +43,3 @@ int __any_online_cpu(const cpumask_t *mask) return cpu; } EXPORT_SYMBOL(__any_online_cpu); - -#if MAX_NUMNODES > 1 -/* - * Find the highest possible node id. - */ -int highest_possible_node_id(void) -{ - unsigned int node; - unsigned int highest = 0; - - for_each_node_mask(node, node_possible_map) - highest = node; - return highest; -} -EXPORT_SYMBOL(highest_possible_node_id); -#endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index afee38f04d84..ebd425c2e2a7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3120,3 +3120,19 @@ unsigned long page_to_pfn(struct page *page) EXPORT_SYMBOL(pfn_to_page); EXPORT_SYMBOL(page_to_pfn); #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ + +#if MAX_NUMNODES > 1 +/* + * Find the highest possible node id. + */ +int highest_possible_node_id(void) +{ + unsigned int node; + unsigned int highest = 0; + + for_each_node_mask(node, node_possible_map) + highest = node; + return highest; +} +EXPORT_SYMBOL(highest_possible_node_id); +#endif -- cgit v1.2.3 From 7516795739bd53175629b90fab0ad488d7a6a9f7 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Sat, 21 Oct 2006 10:24:14 -0700 Subject: [PATCH] Reintroduce NODES_SPAN_OTHER_NODES for powerpc Reintroduce NODES_SPAN_OTHER_NODES for powerpc Revert "[PATCH] Remove SPAN_OTHER_NODES config definition" This reverts commit f62859bb6871c5e4a8e591c60befc8caaf54db8c. Revert "[PATCH] mm: remove arch independent NODES_SPAN_OTHER_NODES" This reverts commit a94b3ab7eab4edcc9b2cb474b188f774c331adf7. Also update the comments to indicate that this is still required and where its used. Signed-off-by: Andy Whitcroft Cc: Paul Mackerras Cc: Mike Kravetz Cc: Benjamin Herrenschmidt Acked-by: Mel Gorman Acked-by: Will Schmidt Cc: Christoph Lameter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/Kconfig | 9 +++++++++ arch/powerpc/configs/pseries_defconfig | 1 + include/linux/mmzone.h | 6 ++++++ mm/page_alloc.c | 2 ++ 4 files changed, 18 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 8b6910465578..2bd9b7fb0f6c 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -751,6 +751,15 @@ config ARCH_MEMORY_PROBE def_bool y depends on MEMORY_HOTPLUG +# Some NUMA nodes have memory ranges that span +# other nodes. Even though a pfn is valid and +# between a node's start and end pfns, it may not +# reside on that node. See memmap_init_zone() +# for details. +config NODES_SPAN_OTHER_NODES + def_bool y + depends on NEED_MULTIPLE_NODES + config PPC_64K_PAGES bool "64k page size" depends on PPC64 diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 9828663652e9..d2833c1a1f3d 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -184,6 +184,7 @@ CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_MIGRATION=y CONFIG_RESOURCES_64BIT=y CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y +CONFIG_NODES_SPAN_OTHER_NODES=y # CONFIG_PPC_64K_PAGES is not set CONFIG_SCHED_SMT=y CONFIG_PROC_DEVICETREE=y diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 59855b8718a0..ed0762b283a9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -674,6 +674,12 @@ void sparse_init(void); #define sparse_index_init(_sec, _nid) do {} while (0) #endif /* CONFIG_SPARSEMEM */ +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +#define early_pfn_in_nid(pfn, nid) (early_pfn_to_nid(pfn) == (nid)) +#else +#define early_pfn_in_nid(pfn, nid) (1) +#endif + #ifndef early_pfn_valid #define early_pfn_valid(pfn) (1) #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ebd425c2e2a7..f5fc45472d5c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1689,6 +1689,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, for (pfn = start_pfn; pfn < end_pfn; pfn++) { if (!early_pfn_valid(pfn)) continue; + if (!early_pfn_in_nid(pfn, nid)) + continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); init_page_count(page); -- cgit v1.2.3 From 3bb1a852ab6c9cdf211a2f4a2f502340c8c38eca Mon Sep 17 00:00:00 2001 From: Martin Bligh Date: Sat, 28 Oct 2006 10:38:24 -0700 Subject: [PATCH] vmscan: Fix temp_priority race The temp_priority field in zone is racy, as we can walk through a reclaim path, and just before we copy it into prev_priority, it can be overwritten (say with DEF_PRIORITY) by another reclaimer. The same bug is contained in both try_to_free_pages and balance_pgdat, but it is fixed slightly differently. In balance_pgdat, we keep a separate priority record per zone in a local array. In try_to_free_pages there is no need to do this, as the priority level is the same for all zones that we reclaim from. Impact of this bug is that temp_priority is copied into prev_priority, and setting this artificially high causes reclaimers to set distress artificially low. They then fail to reclaim mapped pages, when they are, in fact, under severe memory pressure (their priority may be as low as 0). This causes the OOM killer to fire incorrectly. From: Andrew Morton __zone_reclaim() isn't modifying zone->prev_priority. But zone->prev_priority is used in the decision whether or not to bring mapped pages onto the inactive list. Hence there's a risk here that __zone_reclaim() will fail because zone->prev_priority ir large (ie: low urgency) and lots of mapped pages end up stuck on the active list. Fix that up by decreasing (ie making more urgent) zone->prev_priority as __zone_reclaim() scans the zone's pages. This bug perhaps explains why ZONE_RECLAIM_PRIORITY was created. It should be possible to remove that now, and to just start out at DEF_PRIORITY? Cc: Nick Piggin Cc: Christoph Lameter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 +----- mm/page_alloc.c | 2 +- mm/vmscan.c | 55 +++++++++++++++++++++++++++++++++++++------------- mm/vmstat.c | 2 -- 4 files changed, 43 insertions(+), 22 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ed0762b283a9..e06683e2bea3 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -218,13 +218,9 @@ struct zone { * under - it drives the swappiness decision: whether to unmap mapped * pages. * - * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to free_pages == pages_high. - * - * Access to both these fields is quite racy even on uniprocessor. But + * Access to both this field is quite racy even on uniprocessor. But * it is expected to average out OK. */ - int temp_priority; int prev_priority; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f5fc45472d5c..ecf853b5e30e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2407,7 +2407,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, zone->zone_pgdat = pgdat; zone->free_pages = 0; - zone->temp_priority = zone->prev_priority = DEF_PRIORITY; + zone->prev_priority = DEF_PRIORITY; zone_pcp_init(zone); INIT_LIST_HEAD(&zone->active_list); diff --git a/mm/vmscan.c b/mm/vmscan.c index f05527bf792b..b32560ead5c0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -723,6 +723,20 @@ done: return nr_reclaimed; } +/* + * We are about to scan this zone at a certain priority level. If that priority + * level is smaller (ie: more urgent) than the previous priority, then note + * that priority level within the zone. This is done so that when the next + * process comes in to scan this zone, it will immediately start out at this + * priority level rather than having to build up its own scanning priority. + * Here, this priority affects only the reclaim-mapped threshold. + */ +static inline void note_zone_scanning_priority(struct zone *zone, int priority) +{ + if (priority < zone->prev_priority) + zone->prev_priority = priority; +} + static inline int zone_is_near_oom(struct zone *zone) { return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3; @@ -972,9 +986,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones, if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->temp_priority = priority; - if (zone->prev_priority > priority) - zone->prev_priority = priority; + note_zone_scanning_priority(zone, priority); if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ @@ -1024,7 +1036,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->temp_priority = DEF_PRIORITY; lru_pages += zone->nr_active + zone->nr_inactive; } @@ -1065,13 +1076,22 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) if (!sc.all_unreclaimable) ret = 1; out: + /* + * Now that we've scanned all the zones at this priority level, note + * that level within the zone so that the next thread which performs + * scanning of this zone will immediately start out at this priority + * level. This affects only the decision whether or not to bring + * mapped pages onto the inactive list. + */ + if (priority < 0) + priority = 0; for (i = 0; zones[i] != 0; i++) { struct zone *zone = zones[i]; if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->prev_priority = zone->temp_priority; + zone->prev_priority = priority; } return ret; } @@ -1111,6 +1131,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = vm_swappiness, }; + /* + * temp_priority is used to remember the scanning priority at which + * this zone was successfully refilled to free_pages == pages_high. + */ + int temp_priority[MAX_NR_ZONES]; loop_again: total_scanned = 0; @@ -1118,11 +1143,8 @@ loop_again: sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); - for (i = 0; i < pgdat->nr_zones; i++) { - struct zone *zone = pgdat->node_zones + i; - - zone->temp_priority = DEF_PRIORITY; - } + for (i = 0; i < pgdat->nr_zones; i++) + temp_priority[i] = DEF_PRIORITY; for (priority = DEF_PRIORITY; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ @@ -1183,10 +1205,9 @@ scan: if (!zone_watermark_ok(zone, order, zone->pages_high, end_zone, 0)) all_zones_ok = 0; - zone->temp_priority = priority; - if (zone->prev_priority > priority) - zone->prev_priority = priority; + temp_priority[i] = priority; sc.nr_scanned = 0; + note_zone_scanning_priority(zone, priority); nr_reclaimed += shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, @@ -1226,10 +1247,15 @@ scan: break; } out: + /* + * Note within each zone the priority level at which this zone was + * brought into a happy state. So that the next thread which scans this + * zone will start out at that priority level. + */ for (i = 0; i < pgdat->nr_zones; i++) { struct zone *zone = pgdat->node_zones + i; - zone->prev_priority = zone->temp_priority; + zone->prev_priority = temp_priority[i]; } if (!all_zones_ok) { cond_resched(); @@ -1614,6 +1640,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ priority = ZONE_RECLAIM_PRIORITY; do { + note_zone_scanning_priority(zone, priority); nr_reclaimed += shrink_zone(priority, zone, &sc); priority--; } while (priority >= 0 && nr_reclaimed < nr_pages); diff --git a/mm/vmstat.c b/mm/vmstat.c index 45b124e012f5..8614e8f6743b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -587,11 +587,9 @@ static int zoneinfo_show(struct seq_file *m, void *arg) seq_printf(m, "\n all_unreclaimable: %u" "\n prev_priority: %i" - "\n temp_priority: %i" "\n start_pfn: %lu", zone->all_unreclaimable, zone->prev_priority, - zone->temp_priority, zone->zone_start_pfn); spin_unlock_irqrestore(&zone->lock, flags); seq_putc(m, '\n'); -- cgit v1.2.3 From 0c6cb974636dd29681b03f8eb0ae227decab01fb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Sat, 28 Oct 2006 10:38:59 -0700 Subject: [PATCH] Calculation fix for memory holes beyong the end of physical memory absent_pages_in_range() made the assumption that users of the arch-independent zone-sizing API would not care about holes beyound the end of physical memory. This was not the case and was "fixed" in a patch called "Account for holes that are outside the range of physical memory". However, when given a range that started before a hole in "real" memory and ended beyond the end of memory, it would get the result wrong. The bug is in mainline but a patch is below. It has been tested successfully on a number of machines and architectures. Additional credit to Keith Mannthey for discovering the problem, helping identify the correct fix and confirming it Worked For Him. Signed-off-by: Mel Gorman Cc: keith mannthey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ecf853b5e30e..b55bb358b832 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2261,7 +2261,7 @@ unsigned long __init __absent_pages_in_range(int nid, /* Account for ranges past physical memory on this node */ if (range_end_pfn > prev_end_pfn) - hole_pages = range_end_pfn - + hole_pages += range_end_pfn - max(range_start_pfn, prev_end_pfn); return hole_pages; -- cgit v1.2.3 From 941c7105dc4f4961727acc518e18e00b9a03cbf3 Mon Sep 17 00:00:00 2001 From: nkalmala Date: Thu, 2 Nov 2006 22:07:04 -0800 Subject: [PATCH] mm: un-needed add-store operation wastes a few bytes Un-needed add-store operation wastes a few bytes. 8 bytes wasted with -O2, on a ppc. Signed-off-by: nkalmala Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b55bb358b832..bf2f6cff1d6a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -853,7 +853,7 @@ again: pcp = &zone_pcp(zone, cpu)->pcp[cold]; local_irq_save(flags); if (!pcp->count) { - pcp->count += rmqueue_bulk(zone, 0, + pcp->count = rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); if (unlikely(!pcp->count)) goto failed; -- cgit v1.2.3 From 1abbfb412b1610ec3a7ec0164108cee01191d9f5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 23 Nov 2006 12:01:41 +0000 Subject: [PATCH] x86_64: fix bad page state in process 'swapper' find_min_pfn_for_node() and find_min_pfn_with_active_regions() both depend on a sorted early_node_map[]. However, sort_node_map() is being called after fin_min_pfn_with_active_regions() in free_area_init_nodes(). In most cases, this is ok, but on at least one x86_64, the SRAT table caused the E820 ranges to be registered out of order. This gave the wrong values for the min PFN range resulting in some pages not being initialised. This patch sorts the early_node_map in find_min_pfn_for_node(). It has been boot tested on x86, x86_64, ppc64 and ia64. Signed-off-by: Mel Gorman Acked-by: Andre Noll Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bf2f6cff1d6a..aa6fcc7ca66f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2612,6 +2612,9 @@ unsigned long __init find_min_pfn_for_node(unsigned long nid) { int i; + /* Regions in the early_node_map can be in any order */ + sort_node_map(); + /* Assuming a sorted map, the first range found has the starting pfn */ for_each_active_range_index_in_nid(i, nid) return early_node_map[i].start_pfn; @@ -2680,9 +2683,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); } - /* Regions in the early_node_map can be in any order */ - sort_node_map(); - /* Print out the zone ranges */ printk("Zone PFN ranges:\n"); for (i = 0; i < MAX_NR_ZONES; i++) -- cgit v1.2.3