From c6f37f12197ac3bd2e5a35f2f0e195ae63d437de Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 24 May 2009 22:16:31 +0200 Subject: PM/Suspend: Do not shrink memory before suspend Remove the shrinking of memory from the suspend-to-RAM code, where it is not really necessary. Signed-off-by: Rafael J. Wysocki Acked-by: Nigel Cunningham Acked-by: Wu Fengguang --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index d254306562cd..95c08a8cc2ba 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2056,7 +2056,7 @@ unsigned long global_lru_pages(void) + global_page_state(NR_INACTIVE_FILE); } -#ifdef CONFIG_PM +#ifdef CONFIG_HIBERNATION /* * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages * from LRU lists system-wide, for given pass and priority. @@ -2196,7 +2196,7 @@ out: return sc.nr_reclaimed; } -#endif +#endif /* CONFIG_HIBERNATION */ /* It's optimal to keep kswapds on the same CPUs as their memory, but not required for correctness. So if the last cpu in a node goes -- cgit v1.2.3 From 78dc583d3ab43115579cb5f3f7bd12e3548dd5a5 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 16 Jun 2009 15:31:40 -0700 Subject: vmscan: low order lumpy reclaim also should use PAGEOUT_IO_SYNC Commit 33c120ed2843090e2bd316de1588b8bf8b96cbde ("more aggressively use lumpy reclaim") increased how aggressive lumpy reclaim was by isolating both active and inactive pages for asynchronous lumpy reclaim on costly-high-order pages and for cheap-high-order when memory pressure is high. However, if the system is under heavy pressure and there are dirty pages, asynchronous IO may not be sufficient to reclaim a suitable page in time. This patch causes the caller to enter synchronous lumpy reclaim for costly-high-order pages and for cheap-high-order pages when under memory pressure. Minchan.kim@gmail.com said: Andy added synchronous lumpy reclaim with c661b078fd62abe06fd11fab4ac5e4eeafe26b6d. At that time, lumpy reclaim is not agressive. His intension is just for high-order users.(above PAGE_ALLOC_COSTLY_ORDER). After some time, Rik added aggressive lumpy reclaim with 33c120ed2843090e2bd316de1588b8bf8b96cbde. His intention was to do lumpy reclaim when high-order users and trouble getting a small set of contiguous pages. So we also have to add synchronous pageout for small set of contiguous pages. Cc: Lee Schermerhorn Cc: Andy Whitcroft Acked-by: Peter Zijlstra Cc: Rik van Riel Signed-off-by: KOSAKI Motohiro Reviewed-by: Minchan Kim Reviewed-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 95c08a8cc2ba..a6b7d14812e6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1061,6 +1061,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_scanned = 0; unsigned long nr_reclaimed = 0; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + int lumpy_reclaim = 0; + + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. + * + * We use the same threshold as pageout congestion_wait below. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + lumpy_reclaim = 1; + else if (sc->order && priority < DEF_PRIORITY - 2) + lumpy_reclaim = 1; pagevec_init(&pvec, 1); @@ -1073,19 +1086,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_freed; unsigned long nr_active; unsigned int count[NR_LRU_LISTS] = { 0, }; - int mode = ISOLATE_INACTIVE; - - /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. - * - * We use the same threshold as pageout congestion_wait below. - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - mode = ISOLATE_BOTH; - else if (sc->order && priority < DEF_PRIORITY - 2) - mode = ISOLATE_BOTH; + int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; nr_taken = sc->isolate_pages(sc->swap_cluster_max, &page_list, &nr_scan, sc->order, mode, @@ -1122,7 +1123,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, * but that should be acceptable to the caller */ if (nr_freed < nr_taken && !current_is_kswapd() && - sc->order > PAGE_ALLOC_COSTLY_ORDER) { + lumpy_reclaim) { congestion_wait(WRITE, HZ/10); /* -- cgit v1.2.3 From 418589663d6011de9006425b6c5721e1544fb47a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:12 -0700 Subject: page allocator: use allocation flags as an index to the zone watermark ALLOC_WMARK_MIN, ALLOC_WMARK_LOW and ALLOC_WMARK_HIGH determin whether pages_min, pages_low or pages_high is used as the zone watermark when allocating the pages. Two branches in the allocator hotpath determine which watermark to use. This patch uses the flags as an array index into a watermark array that is indexed with WMARK_* defines accessed via helpers. All call sites that use zone->pages_* are updated to use the helpers for accessing the values and the array offsets for setting. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 11 +++++----- Documentation/vm/balance | 18 ++++++++-------- arch/m32r/mm/discontig.c | 6 +++--- include/linux/mmzone.h | 16 +++++++++++++- mm/page_alloc.c | 51 +++++++++++++++++++++++---------------------- mm/vmscan.c | 39 ++++++++++++++++++---------------- mm/vmstat.c | 6 +++--- 7 files changed, 83 insertions(+), 64 deletions(-) (limited to 'mm/vmscan.c') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 6fab2dcbb4d3..0ea5adbc5b16 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -233,8 +233,8 @@ These protections are added to score to judge whether this zone should be used for page allocation or should be reclaimed. In this example, if normal pages (index=2) are required to this DMA zone and -pages_high is used for watermark, the kernel judges this zone should not be -used because pages_free(1355) is smaller than watermark + protection[2] +watermark[WMARK_HIGH] is used for watermark, the kernel judges this zone should +not be used because pages_free(1355) is smaller than watermark + protection[2] (4 + 2004 = 2008). If this protection value is 0, this zone would be used for normal page requirement. If requirement is DMA zone(index=0), protection[0] (=0) is used. @@ -280,9 +280,10 @@ The default value is 65536. min_free_kbytes: This is used to force the Linux VM to keep a minimum number -of kilobytes free. The VM uses this number to compute a pages_min -value for each lowmem zone in the system. Each lowmem zone gets -a number of reserved free pages based proportionally on its size. +of kilobytes free. The VM uses this number to compute a +watermark[WMARK_MIN] value for each lowmem zone in the system. +Each lowmem zone gets a number of reserved free pages based +proportionally on its size. Some minimal amount of memory is needed to satisfy PF_MEMALLOC allocations; if you set this to lower than 1024KB, your system will diff --git a/Documentation/vm/balance b/Documentation/vm/balance index bd3d31bc4915..c46e68cf9344 100644 --- a/Documentation/vm/balance +++ b/Documentation/vm/balance @@ -75,15 +75,15 @@ Page stealing from process memory and shm is done if stealing the page would alleviate memory pressure on any zone in the page's node that has fallen below its watermark. -pages_min/pages_low/pages_high/low_on_memory/zone_wake_kswapd: These are -per-zone fields, used to determine when a zone needs to be balanced. When -the number of pages falls below pages_min, the hysteric field low_on_memory -gets set. This stays set till the number of free pages becomes pages_high. -When low_on_memory is set, page allocation requests will try to free some -pages in the zone (providing GFP_WAIT is set in the request). Orthogonal -to this, is the decision to poke kswapd to free some zone pages. That -decision is not hysteresis based, and is done when the number of free -pages is below pages_low; in which case zone_wake_kswapd is also set. +watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These +are per-zone fields, used to determine when a zone needs to be balanced. When +the number of pages falls below watermark[WMARK_MIN], the hysteric field +low_on_memory gets set. This stays set till the number of free pages becomes +watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will +try to free some pages in the zone (providing GFP_WAIT is set in the request). +Orthogonal to this, is the decision to poke kswapd to free some zone pages. +That decision is not hysteresis based, and is done when the number of free +pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set. (Good) Ideas that I have heard: diff --git a/arch/m32r/mm/discontig.c b/arch/m32r/mm/discontig.c index 7daf897292cf..b7a78ad429b7 100644 --- a/arch/m32r/mm/discontig.c +++ b/arch/m32r/mm/discontig.c @@ -154,9 +154,9 @@ unsigned long __init zone_sizes_init(void) * Use all area of internal RAM. * see __alloc_pages() */ - NODE_DATA(1)->node_zones->pages_min = 0; - NODE_DATA(1)->node_zones->pages_low = 0; - NODE_DATA(1)->node_zones->pages_high = 0; + NODE_DATA(1)->node_zones->watermark[WMARK_MIN] = 0; + NODE_DATA(1)->node_zones->watermark[WMARK_LOW] = 0; + NODE_DATA(1)->node_zones->watermark[WMARK_HIGH] = 0; return holes; } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0aa4445b0b8a..dd8487f0442f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -163,6 +163,17 @@ static inline int is_unevictable_lru(enum lru_list l) #endif } +enum zone_watermarks { + WMARK_MIN, + WMARK_LOW, + WMARK_HIGH, + NR_WMARK +}; + +#define min_wmark_pages(z) (z->watermark[WMARK_MIN]) +#define low_wmark_pages(z) (z->watermark[WMARK_LOW]) +#define high_wmark_pages(z) (z->watermark[WMARK_HIGH]) + struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ @@ -275,7 +286,10 @@ struct zone_reclaim_stat { struct zone { /* Fields commonly accessed by the page allocator */ - unsigned long pages_min, pages_low, pages_high; + + /* zone watermarks, access with *_wmark_pages(zone) macros */ + unsigned long watermark[NR_WMARK]; + /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8485735fc690..abe26003124d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1150,10 +1150,15 @@ failed: return NULL; } -#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ -#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ -#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ -#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ +/* The ALLOC_WMARK bits are used as an index to zone->watermark */ +#define ALLOC_WMARK_MIN WMARK_MIN +#define ALLOC_WMARK_LOW WMARK_LOW +#define ALLOC_WMARK_HIGH WMARK_HIGH +#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ + +/* Mask to get the watermark bits */ +#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) + #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ @@ -1440,14 +1445,10 @@ zonelist_scan: !cpuset_zone_allowed_softwall(zone, gfp_mask)) goto try_next_zone; + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { unsigned long mark; - if (alloc_flags & ALLOC_WMARK_MIN) - mark = zone->pages_min; - else if (alloc_flags & ALLOC_WMARK_LOW) - mark = zone->pages_low; - else - mark = zone->pages_high; + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, order, mark, classzone_idx, alloc_flags)) { if (!zone_reclaim_mode || @@ -1959,7 +1960,7 @@ static unsigned int nr_free_zone_pages(int offset) for_each_zone_zonelist(zone, z, zonelist, offset) { unsigned long size = zone->present_pages; - unsigned long high = zone->pages_high; + unsigned long high = high_wmark_pages(zone); if (size > high) sum += size - high; } @@ -2096,9 +2097,9 @@ void show_free_areas(void) "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), - K(zone->pages_min), - K(zone->pages_low), - K(zone->pages_high), + K(min_wmark_pages(zone)), + K(low_wmark_pages(zone)), + K(high_wmark_pages(zone)), K(zone_page_state(zone, NR_ACTIVE_ANON)), K(zone_page_state(zone, NR_INACTIVE_ANON)), K(zone_page_state(zone, NR_ACTIVE_FILE)), @@ -2702,8 +2703,8 @@ static inline unsigned long wait_table_bits(unsigned long size) /* * Mark a number of pageblocks as MIGRATE_RESERVE. The number - * of blocks reserved is based on zone->pages_min. The memory within the - * reserve will tend to store contiguous free pages. Setting min_free_kbytes + * of blocks reserved is based on min_wmark_pages(zone). The memory within + * the reserve will tend to store contiguous free pages. Setting min_free_kbytes * higher will lead to a bigger reserve which will get freed as contiguous * blocks as reclaim kicks in */ @@ -2716,7 +2717,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) /* Get the start pfn, end pfn and the number of blocks to reserve */ start_pfn = zone->zone_start_pfn; end_pfn = start_pfn + zone->spanned_pages; - reserve = roundup(zone->pages_min, pageblock_nr_pages) >> + reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { @@ -4319,8 +4320,8 @@ static void calculate_totalreserve_pages(void) max = zone->lowmem_reserve[j]; } - /* we treat pages_high as reserved pages. */ - max += zone->pages_high; + /* we treat the high watermark as reserved pages. */ + max += high_wmark_pages(zone); if (max > zone->present_pages) max = zone->present_pages; @@ -4400,7 +4401,7 @@ void setup_per_zone_pages_min(void) * need highmem pages, so cap pages_min to a small * value here. * - * The (pages_high-pages_low) and (pages_low-pages_min) + * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) * deltas controls asynch page reclaim, and so should * not be capped for highmem. */ @@ -4411,17 +4412,17 @@ void setup_per_zone_pages_min(void) min_pages = SWAP_CLUSTER_MAX; if (min_pages > 128) min_pages = 128; - zone->pages_min = min_pages; + zone->watermark[WMARK_MIN] = min_pages; } else { /* * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->pages_min = tmp; + zone->watermark[WMARK_MIN] = tmp; } - zone->pages_low = zone->pages_min + (tmp >> 2); - zone->pages_high = zone->pages_min + (tmp >> 1); + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -4566,7 +4567,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * whenever sysctl_lowmem_reserve_ratio changes. * * The reserve ratio obviously has absolutely no relation with the - * pages_min watermarks. The lowmem reserve ratio can only make sense + * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, diff --git a/mm/vmscan.c b/mm/vmscan.c index a6b7d14812e6..e5245d051647 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1401,7 +1401,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, free = zone_page_state(zone, NR_FREE_PAGES); /* If we have very few page cache pages, force-scan anon pages. */ - if (unlikely(file + free <= zone->pages_high)) { + if (unlikely(file + free <= high_wmark_pages(zone))) { percent[0] = 100; percent[1] = 0; return; @@ -1533,11 +1533,13 @@ static void shrink_zone(int priority, struct zone *zone, * try to reclaim pages from zones which will satisfy the caller's allocation * request. * - * We reclaim from a zone even if that zone is over pages_high. Because: + * We reclaim from a zone even if that zone is over high_wmark_pages(zone). + * Because: * a) The caller may be trying to free *extra* pages to satisfy a higher-order * allocation or - * b) The zones may be over pages_high but they must go *over* pages_high to - * satisfy the `incremental min' zone defense algorithm. + * b) The target zone may be at high_wmark_pages(zone) but the lower zones + * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' + * zone defense algorithm. * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. @@ -1743,7 +1745,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, /* * For kswapd, balance_pgdat() will work across all this node's zones until - * they are all at pages_high. + * they are all at high_wmark_pages(zone). * * Returns the number of pages which were actually freed. * @@ -1756,11 +1758,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, * the zone for when the problem goes away. * * kswapd scans the zones in the highmem->normal->dma direction. It skips - * zones which have free_pages > pages_high, but once a zone is found to have - * free_pages <= pages_high, we scan that zone and the lower zones regardless - * of the number of free pages in the lower zones. This interoperates with - * the page allocator fallback scheme to ensure that aging of pages is balanced - * across the zones. + * zones which have free_pages > high_wmark_pages(zone), but once a zone is + * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the + * lower zones regardless of the number of free pages in the lower zones. This + * interoperates with the page allocator fallback scheme to ensure that aging + * of pages is balanced across the zones. */ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) { @@ -1781,7 +1783,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) }; /* * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to free_pages == pages_high. + * this zone was successfully refilled to + * free_pages == high_wmark_pages(zone). */ int temp_priority[MAX_NR_ZONES]; @@ -1826,8 +1829,8 @@ loop_again: shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); - if (!zone_watermark_ok(zone, order, zone->pages_high, - 0, 0)) { + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), 0, 0)) { end_zone = i; break; } @@ -1861,8 +1864,8 @@ loop_again: priority != DEF_PRIORITY) continue; - if (!zone_watermark_ok(zone, order, zone->pages_high, - end_zone, 0)) + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), end_zone, 0)) all_zones_ok = 0; temp_priority[i] = priority; sc.nr_scanned = 0; @@ -1871,8 +1874,8 @@ loop_again: * We put equal pressure on every zone, unless one * zone has way too many pages free already. */ - if (!zone_watermark_ok(zone, order, 8*zone->pages_high, - end_zone, 0)) + if (!zone_watermark_ok(zone, order, + 8*high_wmark_pages(zone), end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, @@ -2038,7 +2041,7 @@ void wakeup_kswapd(struct zone *zone, int order) return; pgdat = zone->zone_pgdat; - if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) return; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; diff --git a/mm/vmstat.c b/mm/vmstat.c index 74d66dba0cbe..415110772c73 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -714,9 +714,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n spanned %lu" "\n present %lu", zone_page_state(zone, NR_FREE_PAGES), - zone->pages_min, - zone->pages_low, - zone->pages_high, + min_wmark_pages(zone), + low_wmark_pages(zone), + high_wmark_pages(zone), zone->pages_scanned, zone->lru[LRU_ACTIVE_ANON].nr_scan, zone->lru[LRU_INACTIVE_ANON].nr_scan, -- cgit v1.2.3 From 56e49d218890f49b0057710a4b6fef31f5ffbfec Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 16 Jun 2009 15:32:28 -0700 Subject: vmscan: evict use-once pages first When the file LRU lists are dominated by streaming IO pages, evict those pages first, before considering evicting other pages. This should be safe from deadlocks or performance problems because only three things can happen to an inactive file page: 1) referenced twice and promoted to the active list 2) evicted by the pageout code 3) under IO, after which it will get evicted or promoted The pages freed in this way can either be reused for streaming IO, or allocated for something else. If the pages are used for streaming IO, this pageout pattern continues. Otherwise, we will fall back to the normal pageout pattern. Signed-off-by: Rik van Riel Reported-by: Elladan Cc: KOSAKI Motohiro Cc: Peter Zijlstra Cc: Lee Schermerhorn Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 +++++++ mm/memcontrol.c | 11 +++++++++++ mm/vmscan.c | 38 +++++++++++++++++++++++++++++++++++++- 3 files changed, 55 insertions(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 25b9ca93d232..45add35dda1b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -94,6 +94,7 @@ extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority); int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg); +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg); unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru); @@ -239,6 +240,12 @@ mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) return 1; } +static inline int +mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) +{ + return 1; +} + static inline unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 78eb8552818b..70db6e0a5eec 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -570,6 +570,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) return 0; } +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) +{ + unsigned long active; + unsigned long inactive; + + inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); + active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); + + return (active > inactive); +} + unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru) diff --git a/mm/vmscan.c b/mm/vmscan.c index e5245d051647..9673437a5457 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1351,12 +1351,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) return low; } +static int inactive_file_is_low_global(struct zone *zone) +{ + unsigned long active, inactive; + + active = zone_page_state(zone, NR_ACTIVE_FILE); + inactive = zone_page_state(zone, NR_INACTIVE_FILE); + + return (active > inactive); +} + +/** + * inactive_file_is_low - check if file pages need to be deactivated + * @zone: zone to check + * @sc: scan control of this context + * + * When the system is doing streaming IO, memory pressure here + * ensures that active file pages get deactivated, until more + * than half of the file pages are on the inactive list. + * + * Once we get to that situation, protect the system's working + * set from being evicted by disabling active file page aging. + * + * This uses a different ratio than the anonymous pages, because + * the page cache uses a use-once replacement algorithm. + */ +static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) +{ + int low; + + if (scanning_global_lru(sc)) + low = inactive_file_is_low_global(zone); + else + low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); + return low; +} + static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct zone *zone, struct scan_control *sc, int priority) { int file = is_file_lru(lru); - if (lru == LRU_ACTIVE_FILE) { + if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; } -- cgit v1.2.3 From 6e08a369ee10b361ac1cdcdf4fabd420fd08beb3 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:29 -0700 Subject: vmscan: cleanup the scan batching code The vmscan batching logic is twisting. Move it into a standalone function nr_scan_try_batch() and document it. No behavior change. Signed-off-by: Wu Fengguang Acked-by: Rik van Riel Cc: Nick Piggin Cc: Christoph Lameter Acked-by: Peter Zijlstra Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 ++-- mm/page_alloc.c | 2 +- mm/vmscan.c | 39 ++++++++++++++++++++++++++++----------- mm/vmstat.c | 8 ++++---- 4 files changed, 35 insertions(+), 18 deletions(-) (limited to 'mm/vmscan.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index dd8487f0442f..db976b9f8791 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -334,9 +334,9 @@ struct zone { /* Fields commonly accessed by the page reclaim scanner */ spinlock_t lru_lock; - struct { + struct zone_lru { struct list_head list; - unsigned long nr_scan; + unsigned long nr_saved_scan; /* accumulated for batching */ } lru[NR_LRU_LISTS]; struct zone_reclaim_stat reclaim_stat; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 131655cdb6b2..e5b8f628d166 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3657,7 +3657,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_pcp_init(zone); for_each_lru(l) { INIT_LIST_HEAD(&zone->lru[l].list); - zone->lru[l].nr_scan = 0; + zone->lru[l].nr_saved_scan = 0; } zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; diff --git a/mm/vmscan.c b/mm/vmscan.c index 9673437a5457..d4da097533ce 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1492,6 +1492,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, percent[1] = 100 - percent[0]; } +/* + * Smallish @nr_to_scan's are deposited in @nr_saved_scan, + * until we collected @swap_cluster_max pages to scan. + */ +static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, + unsigned long *nr_saved_scan, + unsigned long swap_cluster_max) +{ + unsigned long nr; + + *nr_saved_scan += nr_to_scan; + nr = *nr_saved_scan; + + if (nr >= swap_cluster_max) + *nr_saved_scan = 0; + else + nr = 0; + + return nr; +} /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. @@ -1517,14 +1537,11 @@ static void shrink_zone(int priority, struct zone *zone, scan >>= priority; scan = (scan * percent[file]) / 100; } - if (scanning_global_lru(sc)) { - zone->lru[l].nr_scan += scan; - nr[l] = zone->lru[l].nr_scan; - if (nr[l] >= swap_cluster_max) - zone->lru[l].nr_scan = 0; - else - nr[l] = 0; - } else + if (scanning_global_lru(sc)) + nr[l] = nr_scan_try_batch(scan, + &zone->lru[l].nr_saved_scan, + swap_cluster_max); + else nr[l] = scan; } @@ -2124,11 +2141,11 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, l == LRU_ACTIVE_FILE)) continue; - zone->lru[l].nr_scan += (lru_pages >> prio) + 1; - if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { + zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; + if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { unsigned long nr_to_scan; - zone->lru[l].nr_scan = 0; + zone->lru[l].nr_saved_scan = 0; nr_to_scan = min(nr_pages, lru_pages); nr_reclaimed += shrink_list(l, nr_to_scan, zone, sc, prio); diff --git a/mm/vmstat.c b/mm/vmstat.c index 415110772c73..84c055556911 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -718,10 +718,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, low_wmark_pages(zone), high_wmark_pages(zone), zone->pages_scanned, - zone->lru[LRU_ACTIVE_ANON].nr_scan, - zone->lru[LRU_INACTIVE_ANON].nr_scan, - zone->lru[LRU_ACTIVE_FILE].nr_scan, - zone->lru[LRU_INACTIVE_FILE].nr_scan, + zone->lru[LRU_ACTIVE_ANON].nr_saved_scan, + zone->lru[LRU_INACTIVE_ANON].nr_saved_scan, + zone->lru[LRU_ACTIVE_FILE].nr_saved_scan, + zone->lru[LRU_INACTIVE_FILE].nr_saved_scan, zone->spanned_pages, zone->present_pages); -- cgit v1.2.3 From af166777cf451f0373b952ce6766dc1c25385686 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:31 -0700 Subject: vmscan: ZVC updates in shrink_active_list() can be done once This effectively lifts the unit of updates to nr_inactive_* and pgdeactivate from PAGEVEC_SIZE=14 to SWAP_CLUSTER_MAX=32, or MAX_ORDER_NR_PAGES=1024 for reclaim_zone(). Cc: Johannes Weiner Acked-by: Rik van Riel Reviewed-by: Minchan Kim Signed-off-by: Wu Fengguang Cc: Peter Zijlstra Cc: Christoph Lameter Cc: KOSAKI Motohiro Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index d4da097533ce..7592d8eb1148 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1223,7 +1223,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct scan_control *sc, int priority, int file) { unsigned long pgmoved; - int pgdeactivate = 0; unsigned long pgscanned; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_inactive); @@ -1252,7 +1251,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); spin_unlock_irq(&zone->lru_lock); - pgmoved = 0; + pgmoved = 0; /* count referenced (mapping) mapped pages */ while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); @@ -1286,7 +1285,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, */ reclaim_stat->recent_rotated[!!file] += pgmoved; - pgmoved = 0; + pgmoved = 0; /* count pages moved to inactive list */ while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); @@ -1299,10 +1298,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, mem_cgroup_add_lru_list(page, lru); pgmoved++; if (!pagevec_add(&pvec, page)) { - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); spin_unlock_irq(&zone->lru_lock); - pgdeactivate += pgmoved; - pgmoved = 0; if (buffer_heads_over_limit) pagevec_strip(&pvec); __pagevec_release(&pvec); @@ -1310,9 +1306,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } } __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); - pgdeactivate += pgmoved; __count_zone_vm_events(PGREFILL, zone, pgscanned); - __count_vm_events(PGDEACTIVATE, pgdeactivate); + __count_vm_events(PGDEACTIVATE, pgmoved); spin_unlock_irq(&zone->lru_lock); if (buffer_heads_over_limit) pagevec_strip(&pvec); -- cgit v1.2.3 From 69c854817566db82c362797b4a6521d0b00fe1d8 Mon Sep 17 00:00:00 2001 From: MinChan Kim Date: Tue, 16 Jun 2009 15:32:44 -0700 Subject: vmscan: prevent shrinking of active anon lru list in case of no swap space V3 shrink_zone() can deactivate active anon pages even if we don't have a swap device. Many embedded products don't have a swap device. So the deactivation of anon pages is unnecessary. This patch prevents unnecessary deactivation of anon lru pages. But, it don't prevent aging of anon pages to swap out. Signed-off-by: Minchan Kim Acked-by: KOSAKI Motohiro Cc: Johannes Weiner Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 7592d8eb1148..879d034930c4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1570,7 +1570,7 @@ static void shrink_zone(int priority, struct zone *zone, * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_anon_is_low(zone, sc)) + if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); throttle_vm_writeout(sc->gfp_mask); -- cgit v1.2.3 From 6837765963f1723e80ca97b1fae660f3a60d77df Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 16 Jun 2009 15:32:51 -0700 Subject: mm: remove CONFIG_UNEVICTABLE_LRU config option Currently, nobody wants to turn UNEVICTABLE_LRU off. Thus this configurability is unnecessary. Signed-off-by: KOSAKI Motohiro Cc: Johannes Weiner Cc: Andi Kleen Acked-by: Minchan Kim Cc: David Woodhouse Cc: Matt Mackall Cc: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 4 ---- fs/proc/meminfo.c | 4 ---- fs/proc/page.c | 2 -- include/linux/mmzone.h | 13 ------------- include/linux/page-flags.h | 16 +--------------- include/linux/pagemap.h | 12 ------------ include/linux/rmap.h | 7 ------- include/linux/swap.h | 19 ------------------- include/linux/vmstat.h | 2 -- kernel/sysctl.c | 2 -- mm/Kconfig | 14 +------------- mm/internal.h | 6 ------ mm/mlock.c | 22 ---------------------- mm/page_alloc.c | 9 --------- mm/rmap.c | 3 +-- mm/vmscan.c | 17 ----------------- mm/vmstat.c | 4 ---- 17 files changed, 3 insertions(+), 153 deletions(-) (limited to 'mm/vmscan.c') diff --git a/drivers/base/node.c b/drivers/base/node.c index 40b809742a1c..91d4087b4039 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -72,10 +72,8 @@ static ssize_t node_read_meminfo(struct sys_device * dev, "Node %d Inactive(anon): %8lu kB\n" "Node %d Active(file): %8lu kB\n" "Node %d Inactive(file): %8lu kB\n" -#ifdef CONFIG_UNEVICTABLE_LRU "Node %d Unevictable: %8lu kB\n" "Node %d Mlocked: %8lu kB\n" -#endif #ifdef CONFIG_HIGHMEM "Node %d HighTotal: %8lu kB\n" "Node %d HighFree: %8lu kB\n" @@ -105,10 +103,8 @@ static ssize_t node_read_meminfo(struct sys_device * dev, nid, K(node_page_state(nid, NR_INACTIVE_ANON)), nid, K(node_page_state(nid, NR_ACTIVE_FILE)), nid, K(node_page_state(nid, NR_INACTIVE_FILE)), -#ifdef CONFIG_UNEVICTABLE_LRU nid, K(node_page_state(nid, NR_UNEVICTABLE)), nid, K(node_page_state(nid, NR_MLOCK)), -#endif #ifdef CONFIG_HIGHMEM nid, K(i.totalhigh), nid, K(i.freehigh), diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index c6b0302af4c4..d5c410d47fae 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -64,10 +64,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "Inactive(anon): %8lu kB\n" "Active(file): %8lu kB\n" "Inactive(file): %8lu kB\n" -#ifdef CONFIG_UNEVICTABLE_LRU "Unevictable: %8lu kB\n" "Mlocked: %8lu kB\n" -#endif #ifdef CONFIG_HIGHMEM "HighTotal: %8lu kB\n" "HighFree: %8lu kB\n" @@ -109,10 +107,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(pages[LRU_INACTIVE_ANON]), K(pages[LRU_ACTIVE_FILE]), K(pages[LRU_INACTIVE_FILE]), -#ifdef CONFIG_UNEVICTABLE_LRU K(pages[LRU_UNEVICTABLE]), K(global_page_state(NR_MLOCK)), -#endif #ifdef CONFIG_HIGHMEM K(i.totalhigh), K(i.freehigh), diff --git a/fs/proc/page.c b/fs/proc/page.c index 9d926bd279a4..2707c6c7a20f 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -172,10 +172,8 @@ static u64 get_uflags(struct page *page) u |= kpf_copy_bit(k, KPF_SWAPCACHE, PG_swapcache); u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); -#ifdef CONFIG_UNEVICTABLE_LRU u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); -#endif #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index db976b9f8791..889598537370 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -83,13 +83,8 @@ enum zone_stat_item { NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ -#ifdef CONFIG_UNEVICTABLE_LRU NR_UNEVICTABLE, /* " " " " " */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ -#else - NR_UNEVICTABLE = NR_ACTIVE_FILE, /* avoid compiler errors in dead code */ - NR_MLOCK = NR_ACTIVE_FILE, -#endif NR_ANON_PAGES, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ @@ -132,11 +127,7 @@ enum lru_list { LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, -#ifdef CONFIG_UNEVICTABLE_LRU LRU_UNEVICTABLE, -#else - LRU_UNEVICTABLE = LRU_ACTIVE_FILE, /* avoid compiler errors in dead code */ -#endif NR_LRU_LISTS }; @@ -156,11 +147,7 @@ static inline int is_active_lru(enum lru_list l) static inline int is_unevictable_lru(enum lru_list l) { -#ifdef CONFIG_UNEVICTABLE_LRU return (l == LRU_UNEVICTABLE); -#else - return 0; -#endif } enum zone_watermarks { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 62214c7d2d93..d6792f88a176 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -95,9 +95,7 @@ enum pageflags { PG_reclaim, /* To be reclaimed asap */ PG_buddy, /* Page is free, on buddy lists */ PG_swapbacked, /* Page is backed by RAM/swap */ -#ifdef CONFIG_UNEVICTABLE_LRU PG_unevictable, /* Page is "unevictable" */ -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT PG_mlocked, /* Page is vma mlocked */ #endif @@ -248,14 +246,8 @@ PAGEFLAG_FALSE(SwapCache) SETPAGEFLAG_NOOP(SwapCache) CLEARPAGEFLAG_NOOP(SwapCache) #endif -#ifdef CONFIG_UNEVICTABLE_LRU PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) TESTCLEARFLAG(Unevictable, unevictable) -#else -PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable) - SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable) - __CLEARPAGEFLAG_NOOP(Unevictable) -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT #define MLOCK_PAGES 1 @@ -382,12 +374,6 @@ static inline void __ClearPageTail(struct page *page) #endif /* !PAGEFLAGS_EXTENDED */ -#ifdef CONFIG_UNEVICTABLE_LRU -#define __PG_UNEVICTABLE (1 << PG_unevictable) -#else -#define __PG_UNEVICTABLE 0 -#endif - #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT #define __PG_MLOCKED (1 << PG_mlocked) #else @@ -403,7 +389,7 @@ static inline void __ClearPageTail(struct page *page) 1 << PG_private | 1 << PG_private_2 | \ 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - __PG_UNEVICTABLE | __PG_MLOCKED) + 1 << PG_unevictable | __PG_MLOCKED) /* * Flags checked when a page is prepped for return by the page allocator. diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 34da5230faab..aec3252afcf5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -22,9 +22,7 @@ enum mapping_flags { AS_EIO = __GFP_BITS_SHIFT + 0, /* IO error on async write */ AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */ AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ -#ifdef CONFIG_UNEVICTABLE_LRU AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ -#endif }; static inline void mapping_set_error(struct address_space *mapping, int error) @@ -37,8 +35,6 @@ static inline void mapping_set_error(struct address_space *mapping, int error) } } -#ifdef CONFIG_UNEVICTABLE_LRU - static inline void mapping_set_unevictable(struct address_space *mapping) { set_bit(AS_UNEVICTABLE, &mapping->flags); @@ -55,14 +51,6 @@ static inline int mapping_unevictable(struct address_space *mapping) return test_bit(AS_UNEVICTABLE, &mapping->flags); return !!mapping; } -#else -static inline void mapping_set_unevictable(struct address_space *mapping) { } -static inline void mapping_clear_unevictable(struct address_space *mapping) { } -static inline int mapping_unevictable(struct address_space *mapping) -{ - return 0; -} -#endif static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b35bc0e19cd9..619379a1dd98 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -105,18 +105,11 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); */ int page_mkclean(struct page *); -#ifdef CONFIG_UNEVICTABLE_LRU /* * called in munlock()/munmap() path to check for other vmas holding * the page mlocked. */ int try_to_munlock(struct page *); -#else -static inline int try_to_munlock(struct page *page) -{ - return 0; /* a.k.a. SWAP_SUCCESS */ -} -#endif #else /* !CONFIG_MMU */ diff --git a/include/linux/swap.h b/include/linux/swap.h index d476aad3ff57..f30c06908f09 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -235,7 +235,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU extern int page_evictable(struct page *page, struct vm_area_struct *vma); extern void scan_mapping_unevictable_pages(struct address_space *); @@ -244,24 +243,6 @@ extern int scan_unevictable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); extern int scan_unevictable_register_node(struct node *node); extern void scan_unevictable_unregister_node(struct node *node); -#else -static inline int page_evictable(struct page *page, - struct vm_area_struct *vma) -{ - return 1; -} - -static inline void scan_mapping_unevictable_pages(struct address_space *mapping) -{ -} - -static inline int scan_unevictable_register_node(struct node *node) -{ - return 0; -} - -static inline void scan_unevictable_unregister_node(struct node *node) { } -#endif extern int kswapd_run(int nid); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 524cd1b28ecb..ff4696c6dce3 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -41,7 +41,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, #endif -#ifdef CONFIG_UNEVICTABLE_LRU UNEVICTABLE_PGCULLED, /* culled to noreclaim list */ UNEVICTABLE_PGSCANNED, /* scanned for reclaimability */ UNEVICTABLE_PGRESCUED, /* rescued from noreclaim list */ @@ -50,7 +49,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, UNEVICTABLE_PGCLEARED, /* on COW, page truncate */ UNEVICTABLE_PGSTRANDED, /* unable to isolate on unlock */ UNEVICTABLE_MLOCKFREED, -#endif NR_VM_EVENT_ITEMS }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0e51a35a4486..2ccee08f92f1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1325,7 +1325,6 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif -#ifdef CONFIG_UNEVICTABLE_LRU { .ctl_name = CTL_UNNUMBERED, .procname = "scan_unevictable_pages", @@ -1334,7 +1333,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &scan_unevictable_handler, }, -#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt diff --git a/mm/Kconfig b/mm/Kconfig index 71830ba7b986..97d2c88b745e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -203,25 +203,13 @@ config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS -config UNEVICTABLE_LRU - bool "Add LRU list to track non-evictable pages" - default y - help - Keeps unevictable pages off of the active and inactive pageout - lists, so kswapd will not waste CPU time or have its balancing - algorithms thrown off by scanning these pages. Selecting this - will use one page flag and increase the code size a little, - say Y unless you know what you are doing. - - See Documentation/vm/unevictable-lru.txt for more information. - config HAVE_MLOCK bool default y if MMU=y config HAVE_MLOCKED_PAGE_BIT bool - default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y + default y if HAVE_MLOCK=y config MMU_NOTIFIER bool diff --git a/mm/internal.h b/mm/internal.h index b4ac332e8072..f02c7508068d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -73,7 +73,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU /* * unevictable_migrate_page() called only from migrate_page_copy() to * migrate unevictable flag to new page. @@ -85,11 +84,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old) if (TestClearPageUnevictable(old)) SetPageUnevictable(new); } -#else -static inline void unevictable_migrate_page(struct page *new, struct page *old) -{ -} -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT /* diff --git a/mm/mlock.c b/mm/mlock.c index ac130433c7d3..45eb650b9654 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -31,7 +31,6 @@ int can_do_mlock(void) } EXPORT_SYMBOL(can_do_mlock); -#ifdef CONFIG_UNEVICTABLE_LRU /* * Mlocked pages are marked with PageMlocked() flag for efficient testing * in vmscan and, possibly, the fault path; and to support semi-accurate @@ -261,27 +260,6 @@ static int __mlock_posix_error_return(long retval) return retval; } -#else /* CONFIG_UNEVICTABLE_LRU */ - -/* - * Just make pages present if VM_LOCKED. No-op if unlocking. - */ -static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - int mlock) -{ - if (mlock && (vma->vm_flags & VM_LOCKED)) - return make_pages_present(start, end); - return 0; -} - -static inline int __mlock_posix_error_return(long retval) -{ - return 0; -} - -#endif /* CONFIG_UNEVICTABLE_LRU */ - /** * mlock_vma_pages_range() - mlock pages in specified vma range. * @vma - the vma containing the specfied address range diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 00e293734fc9..c95a77cd581b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2077,19 +2077,14 @@ void show_free_areas(void) printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" " inactive_file:%lu" -//TODO: check/adjust line lengths -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lu" -#endif " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_INACTIVE_FILE), -#ifdef CONFIG_UNEVICTABLE_LRU global_page_state(NR_UNEVICTABLE), -#endif global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), @@ -2113,9 +2108,7 @@ void show_free_areas(void) " inactive_anon:%lukB" " active_file:%lukB" " inactive_file:%lukB" -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lukB" -#endif " present:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" @@ -2129,9 +2122,7 @@ void show_free_areas(void) K(zone_page_state(zone, NR_INACTIVE_ANON)), K(zone_page_state(zone, NR_ACTIVE_FILE)), K(zone_page_state(zone, NR_INACTIVE_FILE)), -#ifdef CONFIG_UNEVICTABLE_LRU K(zone_page_state(zone, NR_UNEVICTABLE)), -#endif K(zone->present_pages), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? "yes" : "no") diff --git a/mm/rmap.c b/mm/rmap.c index 23122af32611..316c9d6930ad 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1202,7 +1202,6 @@ int try_to_unmap(struct page *page, int migration) return ret; } -#ifdef CONFIG_UNEVICTABLE_LRU /** * try_to_munlock - try to munlock a page * @page: the page to be munlocked @@ -1226,4 +1225,4 @@ int try_to_munlock(struct page *page) else return try_to_unmap_file(page, 1, 0); } -#endif + diff --git a/mm/vmscan.c b/mm/vmscan.c index 879d034930c4..2c4b945b011f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -514,7 +514,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) * * lru_lock must not be held, interrupts must be enabled. */ -#ifdef CONFIG_UNEVICTABLE_LRU void putback_lru_page(struct page *page) { int lru; @@ -568,20 +567,6 @@ redo: put_page(page); /* drop ref from isolate */ } -#else /* CONFIG_UNEVICTABLE_LRU */ - -void putback_lru_page(struct page *page) -{ - int lru; - VM_BUG_ON(PageLRU(page)); - - lru = !!TestClearPageActive(page) + page_is_file_cache(page); - lru_cache_add_lru(page, lru); - put_page(page); -} -#endif /* CONFIG_UNEVICTABLE_LRU */ - - /* * shrink_page_list() returns the number of reclaimed pages */ @@ -2470,7 +2455,6 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU /* * page_evictable - test whether a page is evictable * @page: the page to test @@ -2717,4 +2701,3 @@ void scan_unevictable_unregister_node(struct node *node) sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); } -#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 1e151cf6bf86..1e3aa8139f22 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -629,10 +629,8 @@ static const char * const vmstat_text[] = { "nr_active_anon", "nr_inactive_file", "nr_active_file", -#ifdef CONFIG_UNEVICTABLE_LRU "nr_unevictable", "nr_mlock", -#endif "nr_anon_pages", "nr_mapped", "nr_file_pages", @@ -687,7 +685,6 @@ static const char * const vmstat_text[] = { "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", #endif -#ifdef CONFIG_UNEVICTABLE_LRU "unevictable_pgs_culled", "unevictable_pgs_scanned", "unevictable_pgs_rescued", @@ -697,7 +694,6 @@ static const char * const vmstat_text[] = { "unevictable_pgs_stranded", "unevictable_pgs_mlockfreed", #endif -#endif }; static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, -- cgit v1.2.3 From cb4b86ba47bb0937b71fb825b3ed88adf7a190f0 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 16 Jun 2009 15:32:52 -0700 Subject: mm: add swap cache interface for swap reference In a following patch, the usage of swap cache is recorded into swap_map. This patch is for necessary interface changes to do that. 2 interfaces: - swapcache_prepare() - swapcache_free() are added for allocating/freeing refcnt from swap-cache to existing swap entries. But implementation itself is not changed under this patch. At adding swapcache_free(), memcg's hook code is moved under swapcache_free(). This is better than using scattered hooks. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Daisuke Nishimura Acked-by: Balbir Singh Cc: Hugh Dickins Cc: Johannes Weiner Cc: Li Zefan Cc: Dhaval Giani Cc: YAMAMOTO Takashi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 7 +++++++ mm/shmem.c | 2 +- mm/swap_state.c | 11 +++++------ mm/swapfile.c | 19 +++++++++++++++++++ mm/vmscan.c | 3 +-- 5 files changed, 33 insertions(+), 9 deletions(-) (limited to 'mm/vmscan.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index f30c06908f09..259e96c150ef 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -282,8 +282,10 @@ extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); extern swp_entry_t get_swap_page_of_type(int); extern int swap_duplicate(swp_entry_t); +extern int swapcache_prepare(swp_entry_t); extern int valid_swaphandles(swp_entry_t, unsigned long *); extern void swap_free(swp_entry_t); +extern void swapcache_free(swp_entry_t, struct page *page); extern int free_swap_and_cache(swp_entry_t); extern int swap_type_of(dev_t, sector_t, struct block_device **); extern unsigned int count_swap_pages(int, int); @@ -352,11 +354,16 @@ static inline void show_swap_cache_info(void) #define free_swap_and_cache(swp) is_migration_entry(swp) #define swap_duplicate(swp) is_migration_entry(swp) +#define swapcache_prepare(swp) is_migration_entry(swp) static inline void swap_free(swp_entry_t swp) { } +static inline void swapcache_free(swp_entry_t swp, struct page *page) +{ +} + static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { diff --git a/mm/shmem.c b/mm/shmem.c index 0132fbd45a23..47ab19182287 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1097,7 +1097,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) shmem_swp_unmap(entry); unlock: spin_unlock(&info->lock); - swap_free(swap); + swapcache_free(swap, NULL); redirty: set_page_dirty(page); if (wbc->for_reclaim) diff --git a/mm/swap_state.c b/mm/swap_state.c index 1416e7e9e02d..19bdf3017a9e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -162,11 +162,11 @@ int add_to_swap(struct page *page) return 1; case -EEXIST: /* Raced with "speculative" read_swap_cache_async */ - swap_free(entry); + swapcache_free(entry, NULL); continue; default: /* -ENOMEM radix-tree allocation failure */ - swap_free(entry); + swapcache_free(entry, NULL); return 0; } } @@ -188,8 +188,7 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&swapper_space.tree_lock); - mem_cgroup_uncharge_swapcache(page, entry); - swap_free(entry); + swapcache_free(entry, page); page_cache_release(page); } @@ -293,7 +292,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Swap entry may have been freed since our caller observed it. */ - if (!swap_duplicate(entry)) + if (!swapcache_prepare(entry)) break; /* @@ -317,7 +316,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } ClearPageSwapBacked(new_page); __clear_page_locked(new_page); - swap_free(entry); + swapcache_free(entry, NULL); } while (err != -ENOMEM); if (new_page) diff --git a/mm/swapfile.c b/mm/swapfile.c index 312fafe0ab6e..3187079903fd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -509,6 +509,16 @@ void swap_free(swp_entry_t entry) } } +/* + * Called after dropping swapcache to decrease refcnt to swap entries. + */ +void swapcache_free(swp_entry_t entry, struct page *page) +{ + if (page) + mem_cgroup_uncharge_swapcache(page, entry); + return swap_free(entry); +} + /* * How many references to page are currently swapped out? */ @@ -1979,6 +1989,15 @@ bad_file: goto out; } +/* + * Called when allocating swap cache for exising swap entry, + */ +int swapcache_prepare(swp_entry_t entry) +{ + return swap_duplicate(entry); +} + + struct swap_info_struct * get_swap_info_struct(unsigned type) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 2c4b945b011f..52339dd7bf85 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -470,8 +470,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_swapcache(page, swap); - swap_free(swap); + swapcache_free(swap, page); } else { __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); -- cgit v1.2.3 From 6fe6b7e35785e3232ffe7f81d3893f1316710a02 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:33:05 -0700 Subject: vmscan: report vm_flags in page_referenced() Collect vma->vm_flags of the VMAs that actually referenced the page. This is preparing for more informed reclaim heuristics, eg. to protect executable file pages more aggressively. For now only the VM_EXEC bit will be used by the caller. Thanks to Johannes, Peter and Minchan for all the good tips. Acked-by: Peter Zijlstra Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Reviewed-by: Johannes Weiner Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 5 +++-- mm/rmap.c | 37 ++++++++++++++++++++++++++----------- mm/vmscan.c | 7 +++++-- 3 files changed, 34 insertions(+), 15 deletions(-) (limited to 'mm/vmscan.c') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 619379a1dd98..216d024f830d 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -83,7 +83,8 @@ static inline void page_dup_rmap(struct page *page, struct vm_area_struct *vma, /* * Called from mm/vmscan.c to handle paging out */ -int page_referenced(struct page *, int is_locked, struct mem_cgroup *cnt); +int page_referenced(struct page *, int is_locked, + struct mem_cgroup *cnt, unsigned long *vm_flags); int try_to_unmap(struct page *, int ignore_refs); /* @@ -117,7 +118,7 @@ int try_to_munlock(struct page *); #define anon_vma_prepare(vma) (0) #define anon_vma_link(vma) do {} while (0) -#define page_referenced(page,l,cnt) TestClearPageReferenced(page) +#define page_referenced(page, locked, cnt, flags) TestClearPageReferenced(page) #define try_to_unmap(page, refs) SWAP_FAIL static inline int page_mkclean(struct page *page) diff --git a/mm/rmap.c b/mm/rmap.c index 316c9d6930ad..c9ccc1a72dc3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -333,7 +333,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) * repeatedly from either page_referenced_anon or page_referenced_file. */ static int page_referenced_one(struct page *page, - struct vm_area_struct *vma, unsigned int *mapcount) + struct vm_area_struct *vma, + unsigned int *mapcount, + unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -381,11 +383,14 @@ out_unmap: (*mapcount)--; pte_unmap_unlock(pte, ptl); out: + if (referenced) + *vm_flags |= vma->vm_flags; return referenced; } static int page_referenced_anon(struct page *page, - struct mem_cgroup *mem_cont) + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { unsigned int mapcount; struct anon_vma *anon_vma; @@ -405,7 +410,8 @@ static int page_referenced_anon(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, &mapcount); + referenced += page_referenced_one(page, vma, + &mapcount, vm_flags); if (!mapcount) break; } @@ -418,6 +424,7 @@ static int page_referenced_anon(struct page *page, * page_referenced_file - referenced check for object-based rmap * @page: the page we're checking references on. * @mem_cont: target memory controller + * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * For an object-based mapped page, find all the places it is mapped and * check/clear the referenced flag. This is done by following the page->mapping @@ -427,7 +434,8 @@ static int page_referenced_anon(struct page *page, * This function is only called from page_referenced for object-based pages. */ static int page_referenced_file(struct page *page, - struct mem_cgroup *mem_cont) + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { unsigned int mapcount; struct address_space *mapping = page->mapping; @@ -467,7 +475,8 @@ static int page_referenced_file(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, &mapcount); + referenced += page_referenced_one(page, vma, + &mapcount, vm_flags); if (!mapcount) break; } @@ -481,29 +490,35 @@ static int page_referenced_file(struct page *page, * @page: the page to test * @is_locked: caller holds lock on the page * @mem_cont: target memory controller + * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * Quick test_and_clear_referenced for all mappings to a page, * returns the number of ptes which referenced the page. */ -int page_referenced(struct page *page, int is_locked, - struct mem_cgroup *mem_cont) +int page_referenced(struct page *page, + int is_locked, + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { int referenced = 0; if (TestClearPageReferenced(page)) referenced++; + *vm_flags = 0; if (page_mapped(page) && page->mapping) { if (PageAnon(page)) - referenced += page_referenced_anon(page, mem_cont); + referenced += page_referenced_anon(page, mem_cont, + vm_flags); else if (is_locked) - referenced += page_referenced_file(page, mem_cont); + referenced += page_referenced_file(page, mem_cont, + vm_flags); else if (!trylock_page(page)) referenced++; else { if (page->mapping) - referenced += - page_referenced_file(page, mem_cont); + referenced += page_referenced_file(page, + mem_cont, vm_flags); unlock_page(page); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 52339dd7bf85..6be2068f61c8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -577,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct pagevec freed_pvec; int pgactivate = 0; unsigned long nr_reclaimed = 0; + unsigned long vm_flags; cond_resched(); @@ -627,7 +628,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; } - referenced = page_referenced(page, 1, sc->mem_cgroup); + referenced = page_referenced(page, 1, + sc->mem_cgroup, &vm_flags); /* In active use or really unfreeable? Activate it. */ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced && page_mapping_inuse(page)) @@ -1208,6 +1210,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, { unsigned long pgmoved; unsigned long pgscanned; + unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_inactive); struct page *page; @@ -1248,7 +1251,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, /* page_referenced clears PageReferenced */ if (page_mapping_inuse(page) && - page_referenced(page, 0, sc->mem_cgroup)) + page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) pgmoved++; list_add(&page->lru, &l_inactive); -- cgit v1.2.3 From 8cab4754d24a0f2e05920170c845bd84472814c6 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:33:12 -0700 Subject: vmscan: make mapped executable pages the first class citizen Protect referenced PROT_EXEC mapped pages from being deactivated. PROT_EXEC(or its internal presentation VM_EXEC) pages normally belong to some currently running executables and their linked libraries, they shall really be cached aggressively to provide good user experiences. Thanks to Johannes Weiner for the advice to reuse the VMA walk in page_referenced() to get the PROT_EXEC bit. [more details] ( The consequences of this patch will have to be discussed together with Rik van Riel's recent patch "vmscan: evict use-once pages first". ) ( Some of the good points and insights are taken into this changelog. Thanks to all the involved people for the great LKML discussions. ) the problem =========== For a typical desktop, the most precious working set is composed of *actively accessed* (1) memory mapped executables (2) and their anonymous pages (3) and other files (4) and the dcache/icache/.. slabs while the least important data are (5) infrequently used or use-once files For a typical desktop, one major problem is busty and large amount of (5) use-once files flushing out the working set. Inside the working set, (4) dcache/icache have already been too sticky ;-) So we only have to care (2) anonymous and (1)(3) file pages. anonymous pages =============== Anonymous pages are effectively immune to the streaming IO attack, because we now have separate file/anon LRU lists. When the use-once files crowd into the file LRU, the list's "quality" is significantly lowered. Therefore the scan balance policy in get_scan_ratio() will choose to scan the (low quality) file LRU much more frequently than the anon LRU. file pages ========== Rik proposed to *not* scan the active file LRU when the inactive list grows larger than active list. This guarantees that when there are use-once streaming IO, and the working set is not too large(so that active_size < inactive_size), the active file LRU will *not* be scanned at all. So the not-too-large working set can be well protected. But there are also situations where the file working set is a bit large so that (active_size >= inactive_size), or the streaming IOs are not purely use-once. In these cases, the active list will be scanned slowly. Because the current shrink_active_list() policy is to deactivate active pages regardless of their referenced bits. The deactivated pages become susceptible to the streaming IO attack: the inactive list could be scanned fast (500MB / 50MBps = 10s) so that the deactivated pages don't have enough time to get re-referenced. Because a user tend to switch between windows in intervals from seconds to minutes. This patch holds mapped executable pages in the active list as long as they are referenced during each full scan of the active list. Because the active list is normally scanned much slower, they get longer grace time (eg. 100s) for further references, which better matches the pace of user operations. Therefore this patch greatly prolongs the in-cache time of executable code, when there are moderate memory pressures. before patch: guaranteed to be cached if reference intervals < I after patch: guaranteed to be cached if reference intervals < I+A (except when randomly reclaimed by the lumpy reclaim) where A = time to fully scan the active file LRU I = time to fully scan the inactive file LRU Note that normally A >> I. side effects ============ This patch is safe in general, it restores the pre-2.6.28 mmap() behavior but in a much smaller and well targeted scope. One may worry about some one to abuse the PROT_EXEC heuristic. But as Andrew Morton stated, there are other tricks to getting that sort of boost. Another concern is the PROT_EXEC mapped pages growing large in rare cases, and therefore hurting reclaim efficiency. But a sane application targeted for large audience will never use PROT_EXEC for data mappings. If some home made application tries to abuse that bit, it shall be aware of the consequences. If it is abused to scale of 2/3 total memory, it gains nothing but overheads. benchmarks ========== 1) memory tight desktop 1.1) brief summary - clock time and major faults are reduced by 50%; - pswpin numbers are reduced to ~1/3. That means X desktop responsiveness is doubled under high memory/swap pressure. 1.2) test scenario - nfsroot gnome desktop with 512M physical memory - run some programs, and switch between the existing windows after starting each new program. 1.3) progress timing (seconds) before after programs 0.02 0.02 N xeyes 0.75 0.76 N firefox 2.02 1.88 N nautilus 3.36 3.17 N nautilus --browser 5.26 4.89 N gthumb 7.12 6.47 N gedit 9.22 8.16 N xpdf /usr/share/doc/shared-mime-info/shared-mime-info-spec.pdf 13.58 12.55 N xterm 15.87 14.57 N mlterm 18.63 17.06 N gnome-terminal 21.16 18.90 N urxvt 26.24 23.48 N gnome-system-monitor 28.72 26.52 N gnome-help 32.15 29.65 N gnome-dictionary 39.66 36.12 N /usr/games/sol 43.16 39.27 N /usr/games/gnometris 48.65 42.56 N /usr/games/gnect 53.31 47.03 N /usr/games/gtali 58.60 52.05 N /usr/games/iagno 65.77 55.42 N /usr/games/gnotravex 70.76 61.47 N /usr/games/mahjongg 76.15 67.11 N /usr/games/gnome-sudoku 86.32 75.15 N /usr/games/glines 92.21 79.70 N /usr/games/glchess 103.79 88.48 N /usr/games/gnomine 113.84 96.51 N /usr/games/gnotski 124.40 102.19 N /usr/games/gnibbles 137.41 114.93 N /usr/games/gnobots2 155.53 125.02 N /usr/games/blackjack 179.85 135.11 N /usr/games/same-gnome 224.49 154.50 N /usr/bin/gnome-window-properties 248.44 162.09 N /usr/bin/gnome-default-applications-properties 282.62 173.29 N /usr/bin/gnome-at-properties 323.72 188.21 N /usr/bin/gnome-typing-monitor 363.99 199.93 N /usr/bin/gnome-at-visual 394.21 206.95 N /usr/bin/gnome-sound-properties 435.14 224.49 N /usr/bin/gnome-at-mobility 463.05 234.11 N /usr/bin/gnome-keybinding-properties 503.75 248.59 N /usr/bin/gnome-about-me 554.00 276.27 N /usr/bin/gnome-display-properties 615.48 304.39 N /usr/bin/gnome-network-preferences 693.03 342.01 N /usr/bin/gnome-mouse-properties 759.90 388.58 N /usr/bin/gnome-appearance-properties 937.90 508.47 N /usr/bin/gnome-control-center 1109.75 587.57 N /usr/bin/gnome-keyboard-properties 1399.05 758.16 N : oocalc 1524.64 830.03 N : oodraw 1684.31 900.03 N : ooimpress 1874.04 993.91 N : oomath 2115.12 1081.89 N : ooweb 2369.02 1161.99 N : oowriter Note that the last ": oo*" commands are actually commented out. 1.4) vmstat numbers (some relevant ones are marked with *) before after nr_free_pages 1293 3898 nr_inactive_anon 59956 53460 nr_active_anon 26815 30026 nr_inactive_file 2657 3218 nr_active_file 2019 2806 nr_unevictable 4 4 nr_mlock 4 4 nr_anon_pages 26706 27859 *nr_mapped 3542 4469 nr_file_pages 72232 67681 nr_dirty 1 0 nr_writeback 123 19 nr_slab_reclaimable 3375 3534 nr_slab_unreclaimable 11405 10665 nr_page_table_pages 8106 7864 nr_unstable 0 0 nr_bounce 0 0 *nr_vmscan_write 394776 230839 nr_writeback_temp 0 0 numa_hit 6843353 3318676 numa_miss 0 0 numa_foreign 0 0 numa_interleave 1719 1719 numa_local 6843353 3318676 numa_other 0 0 *pgpgin 5954683 2057175 *pgpgout 1578276 922744 *pswpin 1486615 512238 *pswpout 394568 230685 pgalloc_dma 277432 56602 pgalloc_dma32 6769477 3310348 pgalloc_normal 0 0 pgalloc_movable 0 0 pgfree 7048396 3371118 pgactivate 2036343 1471492 pgdeactivate 2189691 1612829 pgfault 3702176 3100702 *pgmajfault 452116 201343 pgrefill_dma 12185 7127 pgrefill_dma32 334384 653703 pgrefill_normal 0 0 pgrefill_movable 0 0 pgsteal_dma 74214 22179 pgsteal_dma32 3334164 1638029 pgsteal_normal 0 0 pgsteal_movable 0 0 pgscan_kswapd_dma 1081421 1216199 pgscan_kswapd_dma32 58979118 46002810 pgscan_kswapd_normal 0 0 pgscan_kswapd_movable 0 0 pgscan_direct_dma 2015438 1086109 pgscan_direct_dma32 55787823 36101597 pgscan_direct_normal 0 0 pgscan_direct_movable 0 0 pginodesteal 3461 7281 slabs_scanned 564864 527616 kswapd_steal 2889797 1448082 kswapd_inodesteal 14827 14835 pageoutrun 43459 21562 allocstall 9653 4032 pgrotated 384216 228631 1.5) free numbers at the end of the tests before patch: total used free shared buffers cached Mem: 474 467 7 0 0 236 -/+ buffers/cache: 230 243 Swap: 1023 418 605 after patch: total used free shared buffers cached Mem: 474 457 16 0 0 236 -/+ buffers/cache: 221 253 Swap: 1023 404 619 2) memory flushing in a file server 2.1) brief summary The number of major faults from 50 to 3 during 10% cache hot reads. That means this patch successfully stops major faults when the active file list is slowly scanned when there are partially cache hot streaming IO. 2.2) test scenario Do 100000 pread(size=110 pages, offset=(i*100) pages), where 10% of the pages will be activated: for i in `seq 0 100 10000000`; do echo $i 110; done > pattern-hot-10 iotrace.rb --load pattern-hot-10 --play /b/sparse vmmon nr_mapped nr_active_file nr_inactive_file pgmajfault pgdeactivate pgfree and monitor /proc/vmstat during the time. The test box has 2G memory. I carried out tests on fresh booted console as well as X desktop, and fetched the vmstat numbers on (1) begin: shortly after the big read IO starts; (2) end: just before the big read IO stops; (3) restore: the big read IO stops and the zsh working set restored (4) restore X: after IO, switch back and forth between the urxvt and firefox windows to restore their working set. 2.3) console mode results nr_mapped nr_active_file nr_inactive_file pgmajfault pgdeactivate pgfree 2.6.29 VM_EXEC protection ON: begin: 2481 2237 8694 630 0 574299 end: 275 231976 233914 633 776271 20933042 restore: 370 232154 234524 691 777183 20958453 2.6.29 VM_EXEC protection ON (second run): begin: 2434 2237 8493 629 0 574195 end: 284 231970 233536 632 771918 20896129 restore: 399 232218 234789 690 774526 20957909 2.6.30-rc4-mm VM_EXEC protection OFF: begin: 2479 2344 9659 210 0 579643 end: 284 232010 234142 260 772776 20917184 restore: 379 232159 234371 301 774888 20967849 The above console numbers show that - The startup pgmajfault of 2.6.30-rc4-mm is merely 1/3 that of 2.6.29. I'd attribute that improvement to the mmap readahead improvements :-) - The pgmajfault increment during the file copy is 633-630=3 vs 260-210=50. That's a huge improvement - which means with the VM_EXEC protection logic, active mmap pages is pretty safe even under partially cache hot streaming IO. - when active:inactive file lru size reaches 1:1, their scan rates is 1:20.8 under 10% cache hot IO. (computed with formula Dpgdeactivate:Dpgfree) That roughly means the active mmap pages get 20.8 more chances to get re-referenced to stay in memory. - The absolute nr_mapped drops considerably to 1/9 during the big IO, and the dropped pages are mostly inactive ones. The patch has almost no impact in this aspect, that means it won't unnecessarily increase memory pressure. (In contrast, your 20% mmap protection ratio will keep them all, and therefore eliminate the extra 41 major faults to restore working set of zsh etc.) The iotrace.rb read throughput is 151.194384MB/s 284.198252s 100001x 450560b --load pattern-hot-10 --play /b/sparse which means the inactive list is rotated at the speed of 250MB/s, so a full scan of which takes about 3.5 seconds, while a full scan of active file list takes about 77 seconds. 2.4) X mode results We can reach roughly the same conclusions for X desktop: nr_mapped nr_active_file nr_inactive_file pgmajfault pgdeactivate pgfree 2.6.30-rc4-mm VM_EXEC protection ON: begin: 9740 8920 64075 561 0 678360 end: 768 218254 220029 565 798953 21057006 restore: 857 218543 220987 606 799462 21075710 restore X: 2414 218560 225344 797 799462 21080795 2.6.30-rc4-mm VM_EXEC protection OFF: begin: 9368 5035 26389 554 0 633391 end: 770 218449 221230 661 646472 17832500 restore: 1113 218466 220978 710 649881 17905235 restore X: 2687 218650 225484 947 802700 21083584 - the absolute nr_mapped drops considerably (to 1/13 of the original size) during the streaming IO. - the delta of pgmajfault is 3 vs 107 during IO, or 236 vs 393 during the whole process. Cc: Elladan Cc: Nick Piggin Cc: Andi Kleen Cc: Christoph Lameter Acked-by: Rik van Riel Acked-by: Peter Zijlstra Acked-by: KOSAKI Motohiro Reviewed-by: Johannes Weiner Reviewed-by: Minchan Kim Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 52 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 7 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 6be2068f61c8..1024979d6589 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1212,6 +1212,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, unsigned long pgscanned; unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ + LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; struct pagevec pvec; @@ -1251,28 +1252,42 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, /* page_referenced clears PageReferenced */ if (page_mapping_inuse(page) && - page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) + page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { pgmoved++; + /* + * Identify referenced, file-backed active pages and + * give them one more trip around the active list. So + * that executable code get better chances to stay in + * memory under moderate memory pressure. Anon pages + * are not likely to be evicted by use-once streaming + * IO, plus JVM can create lots of anon VM_EXEC pages, + * so we ignore them here. + */ + if ((vm_flags & VM_EXEC) && !PageAnon(page)) { + list_add(&page->lru, &l_active); + continue; + } + } list_add(&page->lru, &l_inactive); } /* - * Move the pages to the [file or anon] inactive list. + * Move pages back to the lru list. */ pagevec_init(&pvec, 1); - lru = LRU_BASE + file * LRU_FILE; spin_lock_irq(&zone->lru_lock); /* - * Count referenced pages from currently used mappings as - * rotated, even though they are moved to the inactive list. - * This helps balance scan pressure between file and anonymous - * pages in get_scan_ratio. + * Count referenced pages from currently used mappings as rotated, + * even though only some of them are actually re-activated. This + * helps balance scan pressure between file and anonymous pages in + * get_scan_ratio. */ reclaim_stat->recent_rotated[!!file] += pgmoved; pgmoved = 0; /* count pages moved to inactive list */ + lru = LRU_BASE + file * LRU_FILE; while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); @@ -1295,6 +1310,29 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); __count_zone_vm_events(PGREFILL, zone, pgscanned); __count_vm_events(PGDEACTIVATE, pgmoved); + + pgmoved = 0; /* count pages moved back to active list */ + lru = LRU_ACTIVE + file * LRU_FILE; + while (!list_empty(&l_active)) { + page = lru_to_page(&l_active); + prefetchw_prev_lru_page(page, &l_active, flags); + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + VM_BUG_ON(!PageActive(page)); + + list_move(&page->lru, &zone->lru[lru].list); + mem_cgroup_add_lru_list(page, lru); + pgmoved++; + if (!pagevec_add(&pvec, page)) { + spin_unlock_irq(&zone->lru_lock); + if (buffer_heads_over_limit) + pagevec_strip(&pvec); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + spin_unlock_irq(&zone->lru_lock); if (buffer_heads_over_limit) pagevec_strip(&pvec); -- cgit v1.2.3 From 3eb4140f0389bdada022d5e8efd88504ad30df14 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:33:13 -0700 Subject: vmscan: merge duplicate code in shrink_active_list() The "move pages to active list" and "move pages to inactive list" code blocks are mostly identical and can be served by a function. Thanks to Andrew Morton for pointing this out. Note that buffer_heads_over_limit check will also be carried out for re-activated pages, which is slightly different from pre-2.6.28 kernels. Also, Rik's "vmscan: evict use-once pages first" patch could totally stop scans of active file list when memory pressure is low. So the net effect could be, the number of buffer heads is now more likely to grow large. However that's fine according to Johannes' comments: I don't think that this could be harmful. We just preserve the buffer mappings of what we consider the working set and with low memory pressure, as you say, this set is not big. As to stripping of reactivated pages: the only pages we re-activate for now are those VM_EXEC mapped ones. Since we don't expect IO from or to these pages, removing the buffer mappings in case they grow too large should be okay, I guess. Cc: Pekka Enberg Acked-by: Peter Zijlstra Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Reviewed-by: Johannes Weiner Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 95 +++++++++++++++++++++++++++---------------------------------- 1 file changed, 42 insertions(+), 53 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 1024979d6589..f3a55d1f9ab7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1204,6 +1204,43 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) * But we had to alter page->flags anyway. */ +static void move_active_pages_to_lru(struct zone *zone, + struct list_head *list, + enum lru_list lru) +{ + unsigned long pgmoved = 0; + struct pagevec pvec; + struct page *page; + + pagevec_init(&pvec, 1); + + while (!list_empty(list)) { + page = lru_to_page(list); + prefetchw_prev_lru_page(page, list, flags); + + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + + VM_BUG_ON(!PageActive(page)); + if (!is_active_lru(lru)) + ClearPageActive(page); /* we are de-activating */ + + list_move(&page->lru, &zone->lru[lru].list); + mem_cgroup_add_lru_list(page, lru); + pgmoved++; + + if (!pagevec_add(&pvec, page) || list_empty(list)) { + spin_unlock_irq(&zone->lru_lock); + if (buffer_heads_over_limit) + pagevec_strip(&pvec); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + if (!is_active_lru(lru)) + __count_vm_events(PGDEACTIVATE, pgmoved); +} static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct scan_control *sc, int priority, int file) @@ -1215,8 +1252,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct pagevec pvec; - enum lru_list lru; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); lru_add_drain(); @@ -1233,6 +1268,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } reclaim_stat->recent_scanned[!!file] += pgmoved; + __count_zone_vm_events(PGREFILL, zone, pgscanned); if (file) __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); else @@ -1275,8 +1311,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, /* * Move pages back to the lru list. */ - pagevec_init(&pvec, 1); - spin_lock_irq(&zone->lru_lock); /* * Count referenced pages from currently used mappings as rotated, @@ -1286,57 +1320,12 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, */ reclaim_stat->recent_rotated[!!file] += pgmoved; - pgmoved = 0; /* count pages moved to inactive list */ - lru = LRU_BASE + file * LRU_FILE; - while (!list_empty(&l_inactive)) { - page = lru_to_page(&l_inactive); - prefetchw_prev_lru_page(page, &l_inactive, flags); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - ClearPageActive(page); - - list_move(&page->lru, &zone->lru[lru].list); - mem_cgroup_add_lru_list(page, lru); - pgmoved++; - if (!pagevec_add(&pvec, page)) { - spin_unlock_irq(&zone->lru_lock); - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); - } - } - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); - __count_zone_vm_events(PGREFILL, zone, pgscanned); - __count_vm_events(PGDEACTIVATE, pgmoved); - - pgmoved = 0; /* count pages moved back to active list */ - lru = LRU_ACTIVE + file * LRU_FILE; - while (!list_empty(&l_active)) { - page = lru_to_page(&l_active); - prefetchw_prev_lru_page(page, &l_active, flags); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - - list_move(&page->lru, &zone->lru[lru].list); - mem_cgroup_add_lru_list(page, lru); - pgmoved++; - if (!pagevec_add(&pvec, page)) { - spin_unlock_irq(&zone->lru_lock); - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); - } - } - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + move_active_pages_to_lru(zone, &l_active, + LRU_ACTIVE + file * LRU_FILE); + move_active_pages_to_lru(zone, &l_inactive, + LRU_BASE + file * LRU_FILE); spin_unlock_irq(&zone->lru_lock); - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - pagevec_release(&pvec); } static int inactive_anon_is_low_global(struct zone *zone) -- cgit v1.2.3 From 9198e96c06744517e3b18fce8be6db61e96a3227 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Tue, 16 Jun 2009 15:33:15 -0700 Subject: vmscan: handle may_swap more strictly Commit 2e2e425989080cc534fc0fca154cae515f971cf5 ("vmscan,memcg: reintroduce sc->may_swap) add may_swap flag and handle it at get_scan_ratio(). But the result of get_scan_ratio() is ignored when priority == 0, so anon lru is scanned even if may_swap == 0 or nr_swap_pages == 0. IMHO, this is not an expected behavior. As for memcg especially, because of this behavior many and many pages are swapped-out just in vain when oom is invoked by mem+swap limit. This patch is for handling may_swap flag more strictly. Signed-off-by: Daisuke Nishimura Reviewed-by: KOSAKI Motohiro Cc: Minchan Kim Cc: Johannes Weiner Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index f3a55d1f9ab7..057e44b97aa1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1430,13 +1430,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, unsigned long ap, fp; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || (nr_swap_pages <= 0)) { - percent[0] = 0; - percent[1] = 100; - return; - } - anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + @@ -1534,15 +1527,22 @@ static void shrink_zone(int priority, struct zone *zone, enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; unsigned long swap_cluster_max = sc->swap_cluster_max; + int noswap = 0; - get_scan_ratio(zone, sc, percent); + /* If we have no swap space, do not bother scanning anon pages. */ + if (!sc->may_swap || (nr_swap_pages <= 0)) { + noswap = 1; + percent[0] = 0; + percent[1] = 100; + } else + get_scan_ratio(zone, sc, percent); for_each_evictable_lru(l) { int file = is_file_lru(l); unsigned long scan; scan = zone_nr_pages(zone, sc, l); - if (priority) { + if (priority || noswap) { scan >>= priority; scan = (scan * percent[file]) / 100; } -- cgit v1.2.3 From 90afa5de6f3fa89a733861e843377302479fcf7e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:33:20 -0700 Subject: vmscan: properly account for the number of page cache pages zone_reclaim() can reclaim A bug was brought to my attention against a distro kernel but it affects mainline and I believe problems like this have been reported in various guises on the mailing lists although I don't have specific examples at the moment. The reported problem was that malloc() stalled for a long time (minutes in some cases) if a large tmpfs mount was occupying a large percentage of memory overall. The pages did not get cleaned or reclaimed by zone_reclaim() because the zone_reclaim_mode was unsuitable, but the lists are uselessly scanned frequencly making the CPU spin at near 100%. This patchset intends to address that bug and bring the behaviour of zone_reclaim() more in line with expectations which were noticed during investigation. It is based on top of mmotm and takes advantage of Kosaki's work with respect to zone_reclaim(). Patch 1 fixes the heuristics that zone_reclaim() uses to determine if the scan should go ahead. The broken heuristic is what was causing the malloc() stall as it uselessly scanned the LRU constantly. Currently, zone_reclaim is assuming zone_reclaim_mode is 1 and historically it could not deal with tmpfs pages at all. This fixes up the heuristic so that an unnecessary scan is more likely to be correctly avoided. Patch 2 notes that zone_reclaim() returning a failure automatically means the zone is marked full. This is not always true. It could have failed because the GFP mask or zone_reclaim_mode were unsuitable. Patch 3 introduces a counter zreclaim_failed that will increment each time the zone_reclaim scan-avoidance heuristics fail. If that counter is rapidly increasing, then zone_reclaim_mode should be set to 0 as a temporarily resolution and a bug reported because the scan-avoidance heuristic is still broken. This patch: On NUMA machines, the administrator can configure zone_reclaim_mode that is a more targetted form of direct reclaim. On machines with large NUMA distances for example, a zone_reclaim_mode defaults to 1 meaning that clean unmapped pages will be reclaimed if the zone watermarks are not being met. There is a heuristic that determines if the scan is worthwhile but the problem is that the heuristic is not being properly applied and is basically assuming zone_reclaim_mode is 1 if it is enabled. The lack of proper detection can manfiest as high CPU usage as the LRU list is scanned uselessly. Historically, once enabled it was depending on NR_FILE_PAGES which may include swapcache pages that the reclaim_mode cannot deal with. Patch vmscan-change-the-number-of-the-unmapped-files-in-zone-reclaim.patch by Kosaki Motohiro noted that zone_page_state(zone, NR_FILE_PAGES) included pages that were not file-backed such as swapcache and made a calculation based on the inactive, active and mapped files. This is far superior when zone_reclaim==1 but if RECLAIM_SWAP is set, then NR_FILE_PAGES is a reasonable starting figure. This patch alters how zone_reclaim() works out how many pages it might be able to reclaim given the current reclaim_mode. If RECLAIM_SWAP is set in the reclaim_mode it will either consider NR_FILE_PAGES as potential candidates or else use NR_{IN}ACTIVE}_PAGES-NR_FILE_MAPPED to discount swapcache and other non-file-backed pages. If RECLAIM_WRITE is not set, then NR_FILE_DIRTY number of pages are not candidates. If RECLAIM_SWAP is not set, then NR_FILE_MAPPED are not. [kosaki.motohiro@jp.fujitsu.com: Estimate unmapped pages minus tmpfs pages] [fengguang.wu@intel.com: Fix underflow problem in Kosaki's estimate] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Acked-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: Wu Fengguang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 12 +++++++---- mm/vmscan.c | 52 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 53 insertions(+), 11 deletions(-) (limited to 'mm/vmscan.c') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 0ea5adbc5b16..c4de6359d440 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -315,10 +315,14 @@ min_unmapped_ratio: This is available only on NUMA kernels. -A percentage of the total pages in each zone. Zone reclaim will only -occur if more than this percentage of pages are file backed and unmapped. -This is to insure that a minimal amount of local pages is still available for -file I/O even if the node is overallocated. +This is a percentage of the total pages in each zone. Zone reclaim will +only occur if more than this percentage of pages are in a state that +zone_reclaim_mode allows to be reclaimed. + +If zone_reclaim_mode has the value 4 OR'd, then the percentage is compared +against all file-backed unmapped pages including swapcache pages and tmpfs +files. Otherwise, only unmapped pages backed by normal files but not tmpfs +files and similar are considered. The default is 1 percent. diff --git a/mm/vmscan.c b/mm/vmscan.c index 057e44b97aa1..79a98d98ed33 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2356,6 +2356,48 @@ int sysctl_min_unmapped_ratio = 1; */ int sysctl_min_slab_ratio = 5; +static inline unsigned long zone_unmapped_file_pages(struct zone *zone) +{ + unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); + unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + + zone_page_state(zone, NR_ACTIVE_FILE); + + /* + * It's possible for there to be more file mapped pages than + * accounted for by the pages on the file LRU lists because + * tmpfs pages accounted for as ANON can also be FILE_MAPPED + */ + return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; +} + +/* Work out how many page cache pages we can reclaim in this reclaim_mode */ +static long zone_pagecache_reclaimable(struct zone *zone) +{ + long nr_pagecache_reclaimable; + long delta = 0; + + /* + * If RECLAIM_SWAP is set, then all file pages are considered + * potentially reclaimable. Otherwise, we have to worry about + * pages like swapcache and zone_unmapped_file_pages() provides + * a better estimate + */ + if (zone_reclaim_mode & RECLAIM_SWAP) + nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); + else + nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); + + /* If we can't clean pages, remove dirty pages from consideration */ + if (!(zone_reclaim_mode & RECLAIM_WRITE)) + delta += zone_page_state(zone, NR_FILE_DIRTY); + + /* Watch for any possible underflows due to delta */ + if (unlikely(delta > nr_pagecache_reclaimable)) + delta = nr_pagecache_reclaimable; + + return nr_pagecache_reclaimable - delta; +} + /* * Try to free up some pages from this zone through reclaim. */ @@ -2390,9 +2432,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) > - zone->min_unmapped_pages) { + if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { /* * Free memory by calling shrink zone with increasing * priorities until we have enough memory freed. @@ -2450,10 +2490,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * if less than a specified percentage of the zone is used by * unmapped file backed pages. */ - if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages - && zone_page_state(zone, NR_SLAB_RECLAIMABLE) - <= zone->min_slab_pages) + if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && + zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) return 0; if (zone_is_all_unreclaimable(zone)) -- cgit v1.2.3 From fa5e084e43eb14c14942027e1e2e894aeed96097 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:33:22 -0700 Subject: vmscan: do not unconditionally treat zones that fail zone_reclaim() as full On NUMA machines, the administrator can configure zone_reclaim_mode that is a more targetted form of direct reclaim. On machines with large NUMA distances for example, a zone_reclaim_mode defaults to 1 meaning that clean unmapped pages will be reclaimed if the zone watermarks are not being met. The problem is that zone_reclaim() failing at all means the zone gets marked full. This can cause situations where a zone is usable, but is being skipped because it has been considered full. Take a situation where a large tmpfs mount is occuping a large percentage of memory overall. The pages do not get cleaned or reclaimed by zone_reclaim(), but the zone gets marked full and the zonelist cache considers them not worth trying in the future. This patch makes zone_reclaim() return more fine-grained information about what occured when zone_reclaim() failued. The zone only gets marked full if it really is unreclaimable. If it's a case that the scan did not occur or if enough pages were not reclaimed with the limited reclaim_mode, then the zone is simply skipped. There is a side-effect to this patch. Currently, if zone_reclaim() successfully reclaimed SWAP_CLUSTER_MAX, an allocation attempt would go ahead. With this patch applied, zone watermarks are rechecked after zone_reclaim() does some work. This bug was introduced by commit 9276b1bc96a132f4068fdee00983c532f43d3a26 ("memory page_alloc zonelist caching speedup") way back in 2.6.19 when the zonelist_cache was introduced. It was not intended that zone_reclaim() aggressively consider the zone to be full when it failed as full direct reclaim can still be an option. Due to the age of the bug, it should be considered a -stable candidate. Signed-off-by: Mel Gorman Reviewed-by: Wu Fengguang Reviewed-by: Rik van Riel Reviewed-by: KOSAKI Motohiro Cc: Christoph Lameter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 4 ++++ mm/page_alloc.c | 26 ++++++++++++++++++++++---- mm/vmscan.c | 11 ++++++----- 3 files changed, 32 insertions(+), 9 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/internal.h b/mm/internal.h index f02c7508068d..f290c4db528b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -259,4 +259,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int flags, struct page **pages, struct vm_area_struct **vmas); +#define ZONE_RECLAIM_NOSCAN -2 +#define ZONE_RECLAIM_FULL -1 +#define ZONE_RECLAIM_SOME 0 +#define ZONE_RECLAIM_SUCCESS 1 #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6407cbfccd77..2f457a756d46 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1462,15 +1462,33 @@ zonelist_scan: BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { unsigned long mark; + int ret; + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; - if (!zone_watermark_ok(zone, order, mark, - classzone_idx, alloc_flags)) { - if (!zone_reclaim_mode || - !zone_reclaim(zone, gfp_mask, order)) + if (zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) + goto try_this_zone; + + if (zone_reclaim_mode == 0) + goto this_zone_full; + + ret = zone_reclaim(zone, gfp_mask, order); + switch (ret) { + case ZONE_RECLAIM_NOSCAN: + /* did not scan */ + goto try_next_zone; + case ZONE_RECLAIM_FULL: + /* scanned but unreclaimable */ + goto this_zone_full; + default: + /* did we reclaim enough */ + if (!zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) goto this_zone_full; } } +try_this_zone: page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask, migratetype); if (page) diff --git a/mm/vmscan.c b/mm/vmscan.c index 79a98d98ed33..16c82a868e2b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2492,16 +2492,16 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) - return 0; + return ZONE_RECLAIM_FULL; if (zone_is_all_unreclaimable(zone)) - return 0; + return ZONE_RECLAIM_FULL; /* * Do not scan if the allocation should not be delayed. */ if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) - return 0; + return ZONE_RECLAIM_NOSCAN; /* * Only run zone reclaim on the local zone or on zones that do not @@ -2511,10 +2511,11 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ node_id = zone_to_nid(zone); if (node_state(node_id, N_CPU) && node_id != numa_node_id()) - return 0; + return ZONE_RECLAIM_NOSCAN; if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) - return 0; + return ZONE_RECLAIM_NOSCAN; + ret = __zone_reclaim(zone, gfp_mask, order); zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); -- cgit v1.2.3 From 24cf72518c79cdcda486ed26074ff8151291cf65 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:33:23 -0700 Subject: vmscan: count the number of times zone_reclaim() scans and fails On NUMA machines, the administrator can configure zone_reclaim_mode that is a more targetted form of direct reclaim. On machines with large NUMA distances for example, a zone_reclaim_mode defaults to 1 meaning that clean unmapped pages will be reclaimed if the zone watermarks are not being met. There is a heuristic that determines if the scan is worthwhile but it is possible that the heuristic will fail and the CPU gets tied up scanning uselessly. Detecting the situation requires some guesswork and experimentation so this patch adds a counter "zreclaim_failed" to /proc/vmstat. If during high CPU utilisation this counter is increasing rapidly, then the resolution to the problem may be to set /proc/sys/vm/zone_reclaim_mode to 0. [akpm@linux-foundation.org: name things consistently] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Wu Fengguang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 3 +++ mm/vmscan.c | 3 +++ mm/vmstat.c | 3 +++ 3 files changed, 9 insertions(+) (limited to 'mm/vmscan.c') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index ff4696c6dce3..81a97cf8f0a0 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -36,6 +36,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGSTEAL), FOR_ALL_ZONES(PGSCAN_KSWAPD), FOR_ALL_ZONES(PGSCAN_DIRECT), +#ifdef CONFIG_NUMA + PGSCAN_ZONE_RECLAIM_FAILED, +#endif PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL, PAGEOUTRUN, ALLOCSTALL, PGROTATED, #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/vmscan.c b/mm/vmscan.c index 16c82a868e2b..3018ad756133 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2519,6 +2519,9 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) ret = __zone_reclaim(zone, gfp_mask, order); zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); + if (!ret) + count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); + return ret; } #endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 1e3aa8139f22..138bed53706e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -673,6 +673,9 @@ static const char * const vmstat_text[] = { TEXTS_FOR_ZONES("pgscan_kswapd") TEXTS_FOR_ZONES("pgscan_direct") +#ifdef CONFIG_NUMA + "zone_reclaim_failed", +#endif "pginodesteal", "slabs_scanned", "kswapd_steal", -- cgit v1.2.3 From ee993b135ec75a93bd5c45e636bb210d2975159b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 16 Jun 2009 15:33:24 -0700 Subject: mm: fix lumpy reclaim lru handling at isolate_lru_pages At lumpy reclaim, a page failed to be taken by __isolate_lru_page() can be pushed back to "src" list by list_move(). But the page may not be from "src" list. This pushes the page back to wrong LRU. And list_move() itself is unnecessary because the page is not on top of LRU. Then, leave it as it is if __isolate_lru_page() fails. Reviewed-by: Minchan Kim Reviewed-by: KOSAKI Motohiro Acked-by: Mel Gorman Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 3018ad756133..4139aa52b941 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -929,18 +929,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) continue; - switch (__isolate_lru_page(cursor_page, mode, file)) { - case 0: + if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); nr_taken++; scan++; - break; - - case -EBUSY: - /* else it is being freed elsewhere */ - list_move(&cursor_page->lru, src); - default: - break; /* ! on LRU or wrong list */ } } } -- cgit v1.2.3 From 2ffebca6aa7e1687905c842dd8c5c1e811e574e7 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 17 Jun 2009 16:27:21 -0700 Subject: memcg: fix lru rotation in isolate_pages Try to fix memcg's lru rotation sanity: make memcg use the same logic as the global LRU does. Now, at __isolate_lru_page() retruns -EBUSY, the page is rotated to the tail of LRU in global LRU's isolate LRU pages. But in memcg, it's not handled. This makes memcg do the same behavior as global LRU and rotate LRU in the page is busy. Signed-off-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: Mel Gorman Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 13 ++++++++++++- mm/vmscan.c | 4 +++- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6ceb6f2dbac2..e2fa20dadf40 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -649,6 +649,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, int zid = zone_idx(z); struct mem_cgroup_per_zone *mz; int lru = LRU_FILE * !!file + !!active; + int ret; BUG_ON(!mem_cont); mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); @@ -666,9 +667,19 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, continue; scan++; - if (__isolate_lru_page(page, mode, file) == 0) { + ret = __isolate_lru_page(page, mode, file); + switch (ret) { + case 0: list_move(&page->lru, dst); + mem_cgroup_del_lru(page); nr_taken++; + break; + case -EBUSY: + /* we don't affect global LRU but rotate in our LRU */ + mem_cgroup_rotate_lru_list(page, page_lru(page)); + break; + default: + break; } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 4139aa52b941..e8fa2d9eb212 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -837,7 +837,6 @@ int __isolate_lru_page(struct page *page, int mode, int file) */ ClearPageLRU(page); ret = 0; - mem_cgroup_del_lru(page); } return ret; @@ -885,12 +884,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, switch (__isolate_lru_page(page, mode, file)) { case 0: list_move(&page->lru, dst); + mem_cgroup_del_lru(page); nr_taken++; break; case -EBUSY: /* else it is being freed elsewhere */ list_move(&page->lru, src); + mem_cgroup_rotate_lru_list(page, page_lru(page)); continue; default: @@ -931,6 +932,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, continue; if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); + mem_cgroup_del_lru(page); nr_taken++; scan++; } -- cgit v1.2.3 From cb4cbcf6b3cf79f80c157afdc8dd8221643d8481 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 23 Jun 2009 08:57:55 +0900 Subject: mm: fix incorrect page removal from LRU The isolated page is "cursor_page" not "page". This could cause LRU list corruption under memory pressure, caught by CONFIG_DEBUG_LIST. Reported-by: Ingo Molnar Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Balbir Singh Tested-by: Daisuke Nishimura Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index e8fa2d9eb212..54155268dfca 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -932,7 +932,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, continue; if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); - mem_cgroup_del_lru(page); + mem_cgroup_del_lru(cursor_page); nr_taken++; scan++; } -- cgit v1.2.3