From 03668b3ceb0c7a95e09f1b6169f5270ffc1a19f6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 9 Aug 2010 17:18:54 -0700 Subject: oom: avoid oom killer for lowmem allocations If memory has been depleted in lowmem zones even with the protection afforded to it by /proc/sys/vm/lowmem_reserve_ratio, it is unlikely that killing current users will help. The memory is either reclaimable (or migratable) already, in which case we should not invoke the oom killer at all, or it is pinned by an application for I/O. Killing such an application may leave the hardware in an unspecified state and there is no guarantee that it will be able to make a timely exit. Lowmem allocations are now failed in oom conditions when __GFP_NOFAIL is not used so that the task can perhaps recover or try again later. Previously, the heuristic provided some protection for those tasks with CAP_SYS_RAWIO, but this is no longer necessary since we will not be killing tasks for the purposes of ISA allocations. high_zoneidx is gfp_zone(gfp_flags), meaning that ZONE_NORMAL will be the default for all allocations that are not __GFP_DMA, __GFP_DMA32, __GFP_HIGHMEM, and __GFP_MOVABLE on kernels configured to support those flags. Testing for high_zoneidx being less than ZONE_NORMAL will only return true for allocations that have either __GFP_DMA or __GFP_DMA32. Acked-by: KOSAKI Motohiro Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9bd339eb04c6..527f73e4c63f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1759,6 +1759,9 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* The OOM killer will not help higher order allocs */ if (order > PAGE_ALLOC_COSTLY_ORDER) goto out; + /* The OOM killer does not needlessly kill tasks for lowmem */ + if (high_zoneidx < ZONE_NORMAL) + goto out; /* * GFP_THISNODE contains __GFP_NORETRY and we never hit this. * Sanity check for bare calls of __GFP_THISNODE, not real OOM. @@ -2052,15 +2055,23 @@ rebalance: if (page) goto got_pg; - /* - * The OOM killer does not trigger for high-order - * ~__GFP_NOFAIL allocations so if no progress is being - * made, there are no other options and retrying is - * unlikely to help. - */ - if (order > PAGE_ALLOC_COSTLY_ORDER && - !(gfp_mask & __GFP_NOFAIL)) - goto nopage; + if (!(gfp_mask & __GFP_NOFAIL)) { + /* + * The oom killer is not called for high-order + * allocations that may fail, so if no progress + * is being made, there are no other options and + * retrying is unlikely to help. + */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto nopage; + /* + * The oom killer is not called for lowmem + * allocations to prevent needlessly killing + * innocent tasks. + */ + if (high_zoneidx < ZONE_NORMAL) + goto nopage; + } goto restart; } -- cgit v1.2.3 From ff321feac22313cf53ffceb69224b09ac19ff22b Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 9 Aug 2010 17:18:57 -0700 Subject: mm: rename try_set_zone_oom() to try_set_zonelist_oom() We have been used naming try_set_zone_oom and clear_zonelist_oom. The role of functions is to lock of zonelist for preventing parallel OOM. So clear_zonelist_oom makes sense but try_set_zone_oome is rather awkward and unmatched with clear_zonelist_oom. Let's change it with try_set_zonelist_oom. Signed-off-by: Minchan Kim Acked-by: David Rientjes Reviewed-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 2 +- mm/oom_kill.c | 4 ++-- mm/page_alloc.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/oom.h b/include/linux/oom.h index 1a8e407a1a53..9d7c34a741e7 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -25,7 +25,7 @@ enum oom_constraint { CONSTRAINT_MEMCG, }; -extern int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_flags); +extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2f37f6113d9e..c29cf00d9084 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -549,7 +549,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); * if a parallel OOM killing is already taking place that includes a zone in * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. */ -int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) +int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) { struct zoneref *z; struct zone *zone; @@ -566,7 +566,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { /* * Lock each zone in the zonelist under zone_scan_lock so a - * parallel invocation of try_set_zone_oom() doesn't succeed + * parallel invocation of try_set_zonelist_oom() doesn't succeed * when it shouldn't. */ zone_set_flag(zone, ZONE_OOM_LOCKED); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 527f73e4c63f..33c6b4c1277b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1738,7 +1738,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct page *page; /* Acquire the OOM killer lock for the zones in zonelist */ - if (!try_set_zone_oom(zonelist, gfp_mask)) { + if (!try_set_zonelist_oom(zonelist, gfp_mask)) { schedule_timeout_uninterruptible(1); return NULL; } -- cgit v1.2.3 From 25edde0332916ae706ccf83de688be57bcc844b7 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 9 Aug 2010 17:19:27 -0700 Subject: vmscan: kill prev_priority completely Since 2.6.28 zone->prev_priority is unused. Then it can be removed safely. It reduce stack usage slightly. Now I have to say that I'm sorry. 2 years ago, I thought prev_priority can be integrate again, it's useful. but four (or more) times trying haven't got good performance number. Thus I give up such approach. The rest of this changelog is notes on prev_priority and why it existed in the first place and why it might be not necessary any more. This information is based heavily on discussions between Andrew Morton, Rik van Riel and Kosaki Motohiro who is heavily quotes from. Historically prev_priority was important because it determined when the VM would start unmapping PTE pages. i.e. there are no balances of note within the VM, Anon vs File and Mapped vs Unmapped. Without prev_priority, there is a potential risk of unnecessarily increasing minor faults as a large amount of read activity of use-once pages could push mapped pages to the end of the LRU and get unmapped. There is no proof this is still a problem but currently it is not considered to be. Active files are not deactivated if the active file list is smaller than the inactive list reducing the liklihood that file-mapped pages are being pushed off the LRU and referenced executable pages are kept on the active list to avoid them getting pushed out by read activity. Even if it is a problem, prev_priority prev_priority wouldn't works nowadays. First of all, current vmscan still a lot of UP centric code. it expose some weakness on some dozens CPUs machine. I think we need more and more improvement. The problem is, current vmscan mix up per-system-pressure, per-zone-pressure and per-task-pressure a bit. example, prev_priority try to boost priority to other concurrent priority. but if the another task have mempolicy restriction, it is unnecessary, but also makes wrong big latency and exceeding reclaim. per-task based priority + prev_priority adjustment make the emulation of per-system pressure. but it have two issue 1) too rough and brutal emulation 2) we need per-zone pressure, not per-system. Another example, currently DEF_PRIORITY is 12. it mean the lru rotate about 2 cycle (1/4096 + 1/2048 + 1/1024 + .. + 1) before invoking OOM-Killer. but if 10,0000 thrreads enter DEF_PRIORITY reclaim at the same time, the system have higher memory pressure than priority==0 (1/4096*10,000 > 2). prev_priority can't solve such multithreads workload issue. In other word, prev_priority concept assume the sysmtem don't have lots threads." Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: Johannes Weiner Reviewed-by: Rik van Riel Cc: Dave Chinner Cc: Chris Mason Cc: Nick Piggin Cc: Rik van Riel Cc: Johannes Weiner Cc: Christoph Hellwig Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Andrea Arcangeli Cc: Michael Rubin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ---- include/linux/mmzone.h | 15 ------------ mm/memcontrol.c | 31 ------------------------- mm/page_alloc.c | 2 -- mm/vmscan.c | 57 ---------------------------------------------- mm/vmstat.c | 2 -- 6 files changed, 112 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9411d32840b0..9f1afd361583 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -98,11 +98,6 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem, /* * For memory reclaim. */ -extern int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem); -extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, - int priority); -extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, - int priority); int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg); int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg); unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9ed9c459b14c..6e6e62648a4d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -347,21 +347,6 @@ struct zone { /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; - /* - * prev_priority holds the scanning priority for this zone. It is - * defined as the scanning priority at which we achieved our reclaim - * target at the previous try_to_free_pages() or balance_pgdat() - * invocation. - * - * We use prev_priority as a measure of how much stress page reclaim is - * under - it drives the swappiness decision: whether to unmap mapped - * pages. - * - * Access to both this field is quite racy even on uniprocessor. But - * it is expected to average out OK. - */ - int prev_priority; - /* * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on * this zone's LRU. Maintained by the pageout code. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 20a8193a7af8..31abd1c2c0c5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -211,8 +211,6 @@ struct mem_cgroup { */ spinlock_t reclaim_param_lock; - int prev_priority; /* for recording reclaim priority */ - /* * While reclaiming in a hierarchy, we cache the last child we * reclaimed from. @@ -858,35 +856,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) return ret; } -/* - * prev_priority control...this will be used in memory reclaim path. - */ -int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) -{ - int prev_priority; - - spin_lock(&mem->reclaim_param_lock); - prev_priority = mem->prev_priority; - spin_unlock(&mem->reclaim_param_lock); - - return prev_priority; -} - -void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) -{ - spin_lock(&mem->reclaim_param_lock); - if (priority < mem->prev_priority) - mem->prev_priority = priority; - spin_unlock(&mem->reclaim_param_lock); -} - -void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) -{ - spin_lock(&mem->reclaim_param_lock); - mem->prev_priority = priority; - spin_unlock(&mem->reclaim_param_lock); -} - static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) { unsigned long active; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 33c6b4c1277b..a9649f4b261e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4100,8 +4100,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_seqlock_init(zone); zone->zone_pgdat = pgdat; - zone->prev_priority = DEF_PRIORITY; - zone_pcp_init(zone); for_each_lru(l) { INIT_LIST_HEAD(&zone->lru[l].list); diff --git a/mm/vmscan.c b/mm/vmscan.c index b7a4e6a3cf89..594eba8a44c0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1289,20 +1289,6 @@ done: return nr_reclaimed; } -/* - * We are about to scan this zone at a certain priority level. If that priority - * level is smaller (ie: more urgent) than the previous priority, then note - * that priority level within the zone. This is done so that when the next - * process comes in to scan this zone, it will immediately start out at this - * priority level rather than having to build up its own scanning priority. - * Here, this priority affects only the reclaim-mapped threshold. - */ -static inline void note_zone_scanning_priority(struct zone *zone, int priority) -{ - if (priority < zone->prev_priority) - zone->prev_priority = priority; -} - /* * This moves pages from the active list to the inactive list. * @@ -1766,17 +1752,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, if (scanning_global_lru(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - note_zone_scanning_priority(zone, priority); - if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ - } else { - /* - * Ignore cpuset limitation here. We just want to reduce - * # of used pages by us regardless of memory shortage. - */ - mem_cgroup_note_reclaim_priority(sc->mem_cgroup, - priority); } shrink_zone(priority, zone, sc); @@ -1877,17 +1854,6 @@ out: if (priority < 0) priority = 0; - if (scanning_global_lru(sc)) { - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - - zone->prev_priority = priority; - } - } else - mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); - delayacct_freepages_end(); put_mems_allowed(); @@ -2053,22 +2019,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .order = order, .mem_cgroup = NULL, }; - /* - * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to - * free_pages == high_wmark_pages(zone). - */ - int temp_priority[MAX_NR_ZONES]; - loop_again: total_scanned = 0; sc.nr_reclaimed = 0; sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); - for (i = 0; i < pgdat->nr_zones; i++) - temp_priority[i] = DEF_PRIORITY; - for (priority = DEF_PRIORITY; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; @@ -2136,9 +2092,7 @@ loop_again: if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; - temp_priority[i] = priority; sc.nr_scanned = 0; - note_zone_scanning_priority(zone, priority); nid = pgdat->node_id; zid = zone_idx(zone); @@ -2211,16 +2165,6 @@ loop_again: break; } out: - /* - * Note within each zone the priority level at which this zone was - * brought into a happy state. So that the next thread which scans this - * zone will start out at that priority level. - */ - for (i = 0; i < pgdat->nr_zones; i++) { - struct zone *zone = pgdat->node_zones + i; - - zone->prev_priority = temp_priority[i]; - } if (!all_zones_ok) { cond_resched(); @@ -2639,7 +2583,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ priority = ZONE_RECLAIM_PRIORITY; do { - note_zone_scanning_priority(zone, priority); shrink_zone(priority, zone, &sc); priority--; } while (priority >= 0 && sc.nr_reclaimed < nr_pages); diff --git a/mm/vmstat.c b/mm/vmstat.c index 15a14b16e176..f389168f9a83 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -853,11 +853,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, } seq_printf(m, "\n all_unreclaimable: %u" - "\n prev_priority: %i" "\n start_pfn: %lu" "\n inactive_ratio: %u", zone->all_unreclaimable, - zone->prev_priority, zone->zone_start_pfn, zone->inactive_ratio); seq_putc(m, '\n'); -- cgit v1.2.3