From d778df51c09264076fe0208c099ef7d428f21790 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 22 Feb 2013 16:32:12 -0800 Subject: mm: vmscan: save work scanning (almost) empty LRU lists In certain cases (kswapd reclaim, memcg target reclaim), a fixed minimum amount of pages is scanned from the LRU lists on each iteration, to make progress. Do not make this minimum bigger than the respective LRU list size, however, and save some busy work trying to isolate and reclaim pages that are not there. Empty LRU lists are quite common with memory cgroups in NUMA environments because there exists a set of LRU lists for each zone for each memory cgroup, while the memory of a single cgroup is expected to stay on just one node. The number of expected empty LRU lists is thus memcgs * (nodes - 1) * lru types Each attempt to reclaim from an empty LRU list does expensive size comparisons between lists, acquires the zone's lru lock etc. Avoid that. Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Acked-by: Mel Gorman Reviewed-by: Michal Hocko Cc: Hugh Dickins Cc: Satoru Moriya Cc: Simon Jeons Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/swap.h') diff --git a/include/linux/swap.h b/include/linux/swap.h index 68df9c17fbbb..8c66486a8ca8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -156,7 +156,7 @@ enum { SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ }; -#define SWAP_CLUSTER_MAX 32 +#define SWAP_CLUSTER_MAX 32UL #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX /* -- cgit v1.2.3 From 33806f06da654092182410d974b6d3c5396ea3eb Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 22 Feb 2013 16:34:37 -0800 Subject: swap: make each swap partition have one address_space When I use several fast SSD to do swap, swapper_space.tree_lock is heavily contended. This makes each swap partition have one address_space to reduce the lock contention. There is an array of address_space for swap. The swap entry type is the index to the array. In my test with 3 SSD, this increases the swapout throughput 20%. [akpm@linux-foundation.org: revert unneeded change to __add_to_swap_cache] Signed-off-by: Shaohua Li Cc: Hugh Dickins Acked-by: Rik van Riel Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 4 ++-- include/linux/swap.h | 9 +++++---- mm/memcontrol.c | 4 ++-- mm/mincore.c | 5 +++-- mm/swap.c | 9 +++++++-- mm/swap_state.c | 55 ++++++++++++++++++++++++++++++++++++---------------- mm/swapfile.c | 5 +++-- mm/util.c | 10 +++++++--- 8 files changed, 67 insertions(+), 34 deletions(-) (limited to 'include/linux/swap.h') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index c3dac611c3c0..1efaaa19c4f3 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) * sysctl_overcommit_ratio / 100) + total_swap_pages; cached = global_page_state(NR_FILE_PAGES) - - total_swapcache_pages - i.bufferram; + total_swapcache_pages() - i.bufferram; if (cached < 0) cached = 0; @@ -109,7 +109,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeram), K(i.bufferram), K(cached), - K(total_swapcache_pages), + K(total_swapcache_pages()), K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), K(pages[LRU_ACTIVE_ANON]), diff --git a/include/linux/swap.h b/include/linux/swap.h index 8c66486a8ca8..235c039892ee 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -8,7 +8,7 @@ #include #include #include - +#include #include #include @@ -330,8 +330,9 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *, sector_t *); /* linux/mm/swap_state.c */ -extern struct address_space swapper_space; -#define total_swapcache_pages swapper_space.nrpages +extern struct address_space swapper_spaces[]; +#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) +extern unsigned long total_swapcache_pages(void); extern void show_swap_cache_info(void); extern int add_to_swap(struct page *); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); @@ -382,7 +383,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) #define nr_swap_pages 0L #define total_swap_pages 0L -#define total_swapcache_pages 0UL +#define total_swapcache_pages() 0UL #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c878b1c69510..f85861531f22 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6307,7 +6307,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * Because lookup_swap_cache() updates some statistics counter, * we call find_get_page() with swapper_space directly. */ - page = find_get_page(&swapper_space, ent.val); + page = find_get_page(swap_address_space(ent), ent.val); if (do_swap_account) entry->val = ent.val; @@ -6348,7 +6348,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, swp_entry_t swap = radix_to_swp_entry(page); if (do_swap_account) *entry = swap; - page = find_get_page(&swapper_space, swap.val); + page = find_get_page(swap_address_space(swap), swap.val); } #endif return page; diff --git a/mm/mincore.c b/mm/mincore.c index 936b4cee8cb1..da2be56a7b8f 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) /* shmem/tmpfs may return swap: account for swapcache page too. */ if (radix_tree_exceptional_entry(page)) { swp_entry_t swap = radix_to_swp_entry(page); - page = find_get_page(&swapper_space, swap.val); + page = find_get_page(swap_address_space(swap), swap.val); } #endif if (page) { @@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } else { #ifdef CONFIG_SWAP pgoff = entry.val; - *vec = mincore_page(&swapper_space, pgoff); + *vec = mincore_page(swap_address_space(entry), + pgoff); #else WARN_ON(1); *vec = 1; diff --git a/mm/swap.c b/mm/swap.c index 6310dc2008ff..8a529a01e8fc 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag); void __init swap_setup(void) { unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); - #ifdef CONFIG_SWAP - bdi_init(swapper_space.backing_dev_info); + int i; + + bdi_init(swapper_spaces[0].backing_dev_info); + for (i = 0; i < MAX_SWAPFILES; i++) { + spin_lock_init(&swapper_spaces[i].tree_lock); + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); + } #endif /* Use a smaller cluster for small-memory machines */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 0cb36fb1f61c..8d6644c5d0cc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -36,12 +36,12 @@ static struct backing_dev_info swap_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, }; -struct address_space swapper_space = { - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), - .a_ops = &swap_aops, - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), - .backing_dev_info = &swap_backing_dev_info, +struct address_space swapper_spaces[MAX_SWAPFILES] = { + [0 ... MAX_SWAPFILES - 1] = { + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), + .a_ops = &swap_aops, + .backing_dev_info = &swap_backing_dev_info, + } }; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) @@ -53,9 +53,19 @@ static struct { unsigned long find_total; } swap_cache_info; +unsigned long total_swapcache_pages(void) +{ + int i; + unsigned long ret = 0; + + for (i = 0; i < MAX_SWAPFILES; i++) + ret += swapper_spaces[i].nrpages; + return ret; +} + void show_swap_cache_info(void) { - printk("%lu pages in swap cache\n", total_swapcache_pages); + printk("%lu pages in swap cache\n", total_swapcache_pages()); printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", swap_cache_info.add_total, swap_cache_info.del_total, swap_cache_info.find_success, swap_cache_info.find_total); @@ -70,6 +80,7 @@ void show_swap_cache_info(void) static int __add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; + struct address_space *address_space; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageSwapCache(page)); @@ -79,14 +90,16 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry) SetPageSwapCache(page); set_page_private(page, entry.val); - spin_lock_irq(&swapper_space.tree_lock); - error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); + address_space = swap_address_space(entry); + spin_lock_irq(&address_space->tree_lock); + error = radix_tree_insert(&address_space->page_tree, + entry.val, page); if (likely(!error)) { - total_swapcache_pages++; + address_space->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(add_total); } - spin_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&address_space->tree_lock); if (unlikely(error)) { /* @@ -122,14 +135,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { + swp_entry_t entry; + struct address_space *address_space; + VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageSwapCache(page)); VM_BUG_ON(PageWriteback(page)); - radix_tree_delete(&swapper_space.page_tree, page_private(page)); + entry.val = page_private(page); + address_space = swap_address_space(entry); + radix_tree_delete(&address_space->page_tree, page_private(page)); set_page_private(page, 0); ClearPageSwapCache(page); - total_swapcache_pages--; + address_space->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); } @@ -195,12 +213,14 @@ int add_to_swap(struct page *page) void delete_from_swap_cache(struct page *page) { swp_entry_t entry; + struct address_space *address_space; entry.val = page_private(page); - spin_lock_irq(&swapper_space.tree_lock); + address_space = swap_address_space(entry); + spin_lock_irq(&address_space->tree_lock); __delete_from_swap_cache(page); - spin_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&address_space->tree_lock); swapcache_free(entry, page); page_cache_release(page); @@ -263,7 +283,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) { struct page *page; - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), entry.val); if (page) INC_CACHE_INFO(find_success); @@ -290,7 +310,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ - found_page = find_get_page(&swapper_space, entry.val); + found_page = find_get_page(swap_address_space(entry), + entry.val); if (found_page) break; diff --git a/mm/swapfile.c b/mm/swapfile.c index e97a0e5aea91..e51864e6fe8b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -79,7 +79,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) struct page *page; int ret = 0; - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), entry.val); if (!page) return 0; /* @@ -699,7 +699,8 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), + entry.val); if (page && !trylock_page(page)) { page_cache_release(page); page = NULL; diff --git a/mm/util.c b/mm/util.c index 16a73195a37b..ab1424dbe2e6 100644 --- a/mm/util.c +++ b/mm/util.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "internal.h" @@ -389,9 +390,12 @@ struct address_space *page_mapping(struct page *page) VM_BUG_ON(PageSlab(page)); #ifdef CONFIG_SWAP - if (unlikely(PageSwapCache(page))) - mapping = &swapper_space; - else + if (unlikely(PageSwapCache(page))) { + swp_entry_t entry; + + entry.val = page_private(page); + mapping = swap_address_space(entry); + } else #endif if ((unsigned long)mapping & PAGE_MAPPING_ANON) mapping = NULL; -- cgit v1.2.3 From ec8acf20afb8534ed511f6613dd2226b9e301010 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 22 Feb 2013 16:34:38 -0800 Subject: swap: add per-partition lock for swapfile swap_lock is heavily contended when I test swap to 3 fast SSD (even slightly slower than swap to 2 such SSD). The main contention comes from swap_info_get(). This patch tries to fix the gap with adding a new per-partition lock. Global data like nr_swapfiles, total_swap_pages, least_priority and swap_list are still protected by swap_lock. nr_swap_pages is an atomic now, it can be changed without swap_lock. In theory, it's possible get_swap_page() finds no swap pages but actually there are free swap pages. But sounds not a big problem. Accessing partition specific data (like scan_swap_map and so on) is only protected by swap_info_struct.lock. Changing swap_info_struct.flags need hold swap_lock and swap_info_struct.lock, because scan_scan_map() will check it. read the flags is ok with either the locks hold. If both swap_lock and swap_info_struct.lock must be hold, we always hold the former first to avoid deadlock. swap_entry_free() can change swap_list. To delete that code, we add a new highest_priority_index. Whenever get_swap_page() is called, we check it. If it's valid, we use it. It's a pity get_swap_page() still holds swap_lock(). But in practice, swap_lock() isn't heavily contended in my test with this patch (or I can say there are other much more heavier bottlenecks like TLB flush). And BTW, looks get_swap_page() doesn't really need the lock. We never free swap_info[] and we check SWAP_WRITEOK flag. The only risk without the lock is we could swapout to some low priority swap, but we can quickly recover after several rounds of swap, so sounds not a big deal to me. But I'd prefer to fix this if it's a real problem. "swap: make each swap partition have one address_space" improved the swapout speed from 1.7G/s to 2G/s. This patch further improves the speed to 2.3G/s, so around 15% improvement. It's a multi-process test, so TLB flush isn't the biggest bottleneck before the patches. [arnd@arndb.de: fix it for nommu] [hughd@google.com: add missing unlock] [minchan@kernel.org: get rid of lockdep whinge on sys_swapon] Signed-off-by: Shaohua Li Cc: Hugh Dickins Cc: Rik van Riel Cc: Minchan Kim Cc: Greg Kroah-Hartman Cc: Seth Jennings Cc: Konrad Rzeszutek Wilk Cc: Xiao Guangrong Cc: Dan Magenheimer Cc: Stephen Rothwell Signed-off-by: Arnd Bergmann Signed-off-by: Hugh Dickins Signed-off-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/mm/init_32.c | 2 +- arch/tile/mm/pgtable.c | 2 +- include/linux/swap.h | 32 ++++++++-- mm/mmap.c | 2 +- mm/nommu.c | 2 +- mm/swap_state.c | 3 +- mm/swapfile.c | 154 +++++++++++++++++++++++++++++++++--------------- mm/vmscan.c | 8 +-- 8 files changed, 145 insertions(+), 60 deletions(-) (limited to 'include/linux/swap.h') diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index dde85ef1c56d..48e0c030e8f5 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -57,7 +57,7 @@ void show_mem(unsigned int filter) printk("Mem-info:\n"); show_free_areas(filter); printk("Free swap: %6ldkB\n", - nr_swap_pages << (PAGE_SHIFT-10)); + get_nr_swap_pages() << (PAGE_SHIFT-10)); printk("%ld pages of RAM\n", totalram_pages); printk("%ld free pages\n", nr_free_pages()); } diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c index de0de0c0e8a1..b3b4972c2451 100644 --- a/arch/tile/mm/pgtable.c +++ b/arch/tile/mm/pgtable.c @@ -61,7 +61,7 @@ void show_mem(unsigned int filter) global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE), global_page_state(NR_FILE_PAGES), - nr_swap_pages); + get_nr_swap_pages()); for_each_zone(zone) { unsigned long flags, order, total = 0, largest_order = -1; diff --git a/include/linux/swap.h b/include/linux/swap.h index 235c039892ee..a3e22d357e91 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -202,6 +202,18 @@ struct swap_info_struct { unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ atomic_t frontswap_pages; /* frontswap pages in-use counter */ #endif + spinlock_t lock; /* + * protect map scan related fields like + * swap_map, lowest_bit, highest_bit, + * inuse_pages, cluster_next, + * cluster_nr, lowest_alloc and + * highest_alloc. other fields are only + * changed at swapon/swapoff, so are + * protected by swap_lock. changing + * flags need hold this lock and + * swap_lock. If both locks need hold, + * hold swap_lock first. + */ }; struct swap_list_t { @@ -209,9 +221,6 @@ struct swap_list_t { int next; /* swapfile to be used next */ }; -/* Swap 50% full? Release swapcache more aggressively.. */ -#define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) - /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; @@ -347,8 +356,20 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t, struct vm_area_struct *vma, unsigned long addr); /* linux/mm/swapfile.c */ -extern long nr_swap_pages; +extern atomic_long_t nr_swap_pages; extern long total_swap_pages; + +/* Swap 50% full? Release swapcache more aggressively.. */ +static inline bool vm_swap_full(void) +{ + return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages; +} + +static inline long get_nr_swap_pages(void) +{ + return atomic_long_read(&nr_swap_pages); +} + extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); extern swp_entry_t get_swap_page_of_type(int); @@ -381,9 +402,10 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) #else /* CONFIG_SWAP */ -#define nr_swap_pages 0L +#define get_nr_swap_pages() 0L #define total_swap_pages 0L #define total_swapcache_pages() 0UL +#define vm_swap_full() 0 #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) diff --git a/mm/mmap.c b/mm/mmap.c index 44bb4d869884..28416f6b8dd5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -144,7 +144,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) */ free -= global_page_state(NR_SHMEM); - free += nr_swap_pages; + free += get_nr_swap_pages(); /* * Any slabs which are created with the diff --git a/mm/nommu.c b/mm/nommu.c index 18c1b932e2c4..87854a55829d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1907,7 +1907,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) */ free -= global_page_state(NR_SHMEM); - free += nr_swap_pages; + free += get_nr_swap_pages(); /* * Any slabs which are created with the diff --git a/mm/swap_state.c b/mm/swap_state.c index 8d6644c5d0cc..7efcf1525921 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -69,7 +69,8 @@ void show_swap_cache_info(void) printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", swap_cache_info.add_total, swap_cache_info.del_total, swap_cache_info.find_success, swap_cache_info.find_total); - printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); + printk("Free swap = %ldkB\n", + get_nr_swap_pages() << (PAGE_SHIFT - 10)); printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); } diff --git a/mm/swapfile.c b/mm/swapfile.c index e51864e6fe8b..9b51266413cd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -47,9 +47,11 @@ static sector_t map_swap_entry(swp_entry_t, struct block_device**); DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; -long nr_swap_pages; +atomic_long_t nr_swap_pages; +/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; static int least_priority; +static atomic_t highest_priority_index = ATOMIC_INIT(-1); static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; @@ -223,7 +225,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, si->lowest_alloc = si->max; si->highest_alloc = 0; } - spin_unlock(&swap_lock); + spin_unlock(&si->lock); /* * If seek is expensive, start searching for new cluster from @@ -242,7 +244,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, if (si->swap_map[offset]) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { - spin_lock(&swap_lock); + spin_lock(&si->lock); offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; @@ -263,7 +265,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, if (si->swap_map[offset]) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { - spin_lock(&swap_lock); + spin_lock(&si->lock); offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; @@ -277,7 +279,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } offset = scan_base; - spin_lock(&swap_lock); + spin_lock(&si->lock); si->cluster_nr = SWAPFILE_CLUSTER - 1; si->lowest_alloc = 0; } @@ -293,9 +295,9 @@ checks: /* reuse swap entry of cache-only swap if not busy. */ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; - spin_unlock(&swap_lock); + spin_unlock(&si->lock); swap_was_freed = __try_to_reclaim_swap(si, offset); - spin_lock(&swap_lock); + spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ if (swap_was_freed) goto checks; @@ -335,13 +337,13 @@ checks: si->lowest_alloc <= last_in_cluster) last_in_cluster = si->lowest_alloc - 1; si->flags |= SWP_DISCARDING; - spin_unlock(&swap_lock); + spin_unlock(&si->lock); if (offset < last_in_cluster) discard_swap_cluster(si, offset, last_in_cluster - offset + 1); - spin_lock(&swap_lock); + spin_lock(&si->lock); si->lowest_alloc = 0; si->flags &= ~SWP_DISCARDING; @@ -355,10 +357,10 @@ checks: * could defer that delay until swap_writepage, * but it's easier to keep this self-contained. */ - spin_unlock(&swap_lock); + spin_unlock(&si->lock); wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), wait_for_discard, TASK_UNINTERRUPTIBLE); - spin_lock(&swap_lock); + spin_lock(&si->lock); } else { /* * Note pages allocated by racing tasks while @@ -374,14 +376,14 @@ checks: return offset; scan: - spin_unlock(&swap_lock); + spin_unlock(&si->lock); while (++offset <= si->highest_bit) { if (!si->swap_map[offset]) { - spin_lock(&swap_lock); + spin_lock(&si->lock); goto checks; } if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { - spin_lock(&swap_lock); + spin_lock(&si->lock); goto checks; } if (unlikely(--latency_ration < 0)) { @@ -392,11 +394,11 @@ scan: offset = si->lowest_bit; while (++offset < scan_base) { if (!si->swap_map[offset]) { - spin_lock(&swap_lock); + spin_lock(&si->lock); goto checks; } if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { - spin_lock(&swap_lock); + spin_lock(&si->lock); goto checks; } if (unlikely(--latency_ration < 0)) { @@ -404,7 +406,7 @@ scan: latency_ration = LATENCY_LIMIT; } } - spin_lock(&swap_lock); + spin_lock(&si->lock); no_page: si->flags -= SWP_SCANNING; @@ -417,13 +419,34 @@ swp_entry_t get_swap_page(void) pgoff_t offset; int type, next; int wrapped = 0; + int hp_index; spin_lock(&swap_lock); - if (nr_swap_pages <= 0) + if (atomic_long_read(&nr_swap_pages) <= 0) goto noswap; - nr_swap_pages--; + atomic_long_dec(&nr_swap_pages); for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { + hp_index = atomic_xchg(&highest_priority_index, -1); + /* + * highest_priority_index records current highest priority swap + * type which just frees swap entries. If its priority is + * higher than that of swap_list.next swap type, we use it. It + * isn't protected by swap_lock, so it can be an invalid value + * if the corresponding swap type is swapoff. We double check + * the flags here. It's even possible the swap type is swapoff + * and swapon again and its priority is changed. In such rare + * case, low prority swap type might be used, but eventually + * high priority swap will be used after several rounds of + * swap. + */ + if (hp_index != -1 && hp_index != type && + swap_info[type]->prio < swap_info[hp_index]->prio && + (swap_info[hp_index]->flags & SWP_WRITEOK)) { + type = hp_index; + swap_list.next = type; + } + si = swap_info[type]; next = si->next; if (next < 0 || @@ -432,22 +455,29 @@ swp_entry_t get_swap_page(void) wrapped++; } - if (!si->highest_bit) + spin_lock(&si->lock); + if (!si->highest_bit) { + spin_unlock(&si->lock); continue; - if (!(si->flags & SWP_WRITEOK)) + } + if (!(si->flags & SWP_WRITEOK)) { + spin_unlock(&si->lock); continue; + } swap_list.next = next; + + spin_unlock(&swap_lock); /* This is called for allocating swap entry for cache */ offset = scan_swap_map(si, SWAP_HAS_CACHE); - if (offset) { - spin_unlock(&swap_lock); + spin_unlock(&si->lock); + if (offset) return swp_entry(type, offset); - } + spin_lock(&swap_lock); next = swap_list.next; } - nr_swap_pages++; + atomic_long_inc(&nr_swap_pages); noswap: spin_unlock(&swap_lock); return (swp_entry_t) {0}; @@ -459,19 +489,19 @@ swp_entry_t get_swap_page_of_type(int type) struct swap_info_struct *si; pgoff_t offset; - spin_lock(&swap_lock); si = swap_info[type]; + spin_lock(&si->lock); if (si && (si->flags & SWP_WRITEOK)) { - nr_swap_pages--; + atomic_long_dec(&nr_swap_pages); /* This is called for allocating swap entry, not cache */ offset = scan_swap_map(si, 1); if (offset) { - spin_unlock(&swap_lock); + spin_unlock(&si->lock); return swp_entry(type, offset); } - nr_swap_pages++; + atomic_long_inc(&nr_swap_pages); } - spin_unlock(&swap_lock); + spin_unlock(&si->lock); return (swp_entry_t) {0}; } @@ -493,7 +523,7 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) goto bad_offset; if (!p->swap_map[offset]) goto bad_free; - spin_lock(&swap_lock); + spin_lock(&p->lock); return p; bad_free: @@ -511,6 +541,27 @@ out: return NULL; } +/* + * This swap type frees swap entry, check if it is the highest priority swap + * type which just frees swap entry. get_swap_page() uses + * highest_priority_index to search highest priority swap type. The + * swap_info_struct.lock can't protect us if there are multiple swap types + * active, so we use atomic_cmpxchg. + */ +static void set_highest_priority_index(int type) +{ + int old_hp_index, new_hp_index; + + do { + old_hp_index = atomic_read(&highest_priority_index); + if (old_hp_index != -1 && + swap_info[old_hp_index]->prio >= swap_info[type]->prio) + break; + new_hp_index = type; + } while (atomic_cmpxchg(&highest_priority_index, + old_hp_index, new_hp_index) != old_hp_index); +} + static unsigned char swap_entry_free(struct swap_info_struct *p, swp_entry_t entry, unsigned char usage) { @@ -553,10 +604,8 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, p->lowest_bit = offset; if (offset > p->highest_bit) p->highest_bit = offset; - if (swap_list.next >= 0 && - p->prio > swap_info[swap_list.next]->prio) - swap_list.next = p->type; - nr_swap_pages++; + set_highest_priority_index(p->type); + atomic_long_inc(&nr_swap_pages); p->inuse_pages--; frontswap_invalidate_page(p->type, offset); if (p->flags & SWP_BLKDEV) { @@ -581,7 +630,7 @@ void swap_free(swp_entry_t entry) p = swap_info_get(entry); if (p) { swap_entry_free(p, entry, 1); - spin_unlock(&swap_lock); + spin_unlock(&p->lock); } } @@ -598,7 +647,7 @@ void swapcache_free(swp_entry_t entry, struct page *page) count = swap_entry_free(p, entry, SWAP_HAS_CACHE); if (page) mem_cgroup_uncharge_swapcache(page, entry, count != 0); - spin_unlock(&swap_lock); + spin_unlock(&p->lock); } } @@ -617,7 +666,7 @@ int page_swapcount(struct page *page) p = swap_info_get(entry); if (p) { count = swap_count(p->swap_map[swp_offset(entry)]); - spin_unlock(&swap_lock); + spin_unlock(&p->lock); } return count; } @@ -706,7 +755,7 @@ int free_swap_and_cache(swp_entry_t entry) page = NULL; } } - spin_unlock(&swap_lock); + spin_unlock(&p->lock); } if (page) { /* @@ -804,11 +853,13 @@ unsigned int count_swap_pages(int type, int free) if ((unsigned int)type < nr_swapfiles) { struct swap_info_struct *sis = swap_info[type]; + spin_lock(&sis->lock); if (sis->flags & SWP_WRITEOK) { n = sis->pages; if (free) n -= sis->inuse_pages; } + spin_unlock(&sis->lock); } spin_unlock(&swap_lock); return n; @@ -1457,7 +1508,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, p->swap_map = swap_map; frontswap_map_set(p, frontswap_map); p->flags |= SWP_WRITEOK; - nr_swap_pages += p->pages; + atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; /* insert swap space into swap_list: */ @@ -1479,15 +1530,19 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned long *frontswap_map) { spin_lock(&swap_lock); + spin_lock(&p->lock); _enable_swap_info(p, prio, swap_map, frontswap_map); frontswap_init(p->type); + spin_unlock(&p->lock); spin_unlock(&swap_lock); } static void reinsert_swap_info(struct swap_info_struct *p) { spin_lock(&swap_lock); + spin_lock(&p->lock); _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); + spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -1547,14 +1602,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) /* just pick something that's safe... */ swap_list.next = swap_list.head; } + spin_lock(&p->lock); if (p->prio < 0) { for (i = p->next; i >= 0; i = swap_info[i]->next) swap_info[i]->prio = p->prio--; least_priority++; } - nr_swap_pages -= p->pages; + atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; p->flags &= ~SWP_WRITEOK; + spin_unlock(&p->lock); spin_unlock(&swap_lock); set_current_oom_origin(); @@ -1573,14 +1630,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_lock(&swapon_mutex); spin_lock(&swap_lock); + spin_lock(&p->lock); drain_mmlist(); /* wait for anyone still in scan_swap_map */ p->highest_bit = 0; /* cuts scans short */ while (p->flags >= SWP_SCANNING) { + spin_unlock(&p->lock); spin_unlock(&swap_lock); schedule_timeout_uninterruptible(1); spin_lock(&swap_lock); + spin_lock(&p->lock); } swap_file = p->swap_file; @@ -1590,6 +1650,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->swap_map = NULL; p->flags = 0; frontswap_invalidate_area(type); + spin_unlock(&p->lock); spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); vfree(swap_map); @@ -1795,6 +1856,7 @@ static struct swap_info_struct *alloc_swap_info(void) p->flags = SWP_USED; p->next = -1; spin_unlock(&swap_lock); + spin_lock_init(&p->lock); return p; } @@ -2117,7 +2179,7 @@ void si_swapinfo(struct sysinfo *val) if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) nr_to_be_unused += si->inuse_pages; } - val->freeswap = nr_swap_pages + nr_to_be_unused; + val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; spin_unlock(&swap_lock); } @@ -2150,7 +2212,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) p = swap_info[type]; offset = swp_offset(entry); - spin_lock(&swap_lock); + spin_lock(&p->lock); if (unlikely(offset >= p->max)) goto unlock_out; @@ -2185,7 +2247,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) p->swap_map[offset] = count | has_cache; unlock_out: - spin_unlock(&swap_lock); + spin_unlock(&p->lock); out: return err; @@ -2310,7 +2372,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) } if (!page) { - spin_unlock(&swap_lock); + spin_unlock(&si->lock); return -ENOMEM; } @@ -2358,7 +2420,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) list_add_tail(&page->lru, &head->lru); page = NULL; /* now it's attached, don't free it */ out: - spin_unlock(&swap_lock); + spin_unlock(&si->lock); outer: if (page) __free_page(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index a68fa20269d9..b7d8015a6d54 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1684,7 +1684,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || (nr_swap_pages <= 0)) { + if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { scan_balance = SCAN_FILE; goto out; } @@ -1933,7 +1933,7 @@ static inline bool should_continue_reclaim(struct zone *zone, */ pages_for_compaction = (2UL << sc->order); inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); - if (nr_swap_pages > 0) + if (get_nr_swap_pages() > 0) inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); if (sc->nr_reclaimed < pages_for_compaction && inactive_lru_pages > pages_for_compaction) @@ -3085,7 +3085,7 @@ unsigned long global_reclaimable_pages(void) nr = global_page_state(NR_ACTIVE_FILE) + global_page_state(NR_INACTIVE_FILE); - if (nr_swap_pages > 0) + if (get_nr_swap_pages() > 0) nr += global_page_state(NR_ACTIVE_ANON) + global_page_state(NR_INACTIVE_ANON); @@ -3099,7 +3099,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone) nr = zone_page_state(zone, NR_ACTIVE_FILE) + zone_page_state(zone, NR_INACTIVE_FILE); - if (nr_swap_pages > 0) + if (get_nr_swap_pages() > 0) nr += zone_page_state(zone, NR_ACTIVE_ANON) + zone_page_state(zone, NR_INACTIVE_ANON); -- cgit v1.2.3 From ebec3862fd6eefe8301aa55ed2e30c685d831842 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Fri, 22 Feb 2013 16:35:43 -0800 Subject: mm: fix return type for functions nr_free_*_pages Currently, the amount of RAM that functions nr_free_*_pages return is held in unsigned int. But in machines with big memory (exceeding 16TB), the amount may be incorrect because of overflow, so fix it. Signed-off-by: Zhang Yanfei Cc: Simon Horman Cc: Julian Anastasov Cc: David Miller Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Cc: Mel Gorman Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 4 ++-- mm/page_alloc.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux/swap.h') diff --git a/include/linux/swap.h b/include/linux/swap.h index a3e22d357e91..8a15f38ebc5c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -225,8 +225,8 @@ struct swap_list_t { extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; extern unsigned long dirty_balance_reserve; -extern unsigned int nr_free_buffer_pages(void); -extern unsigned int nr_free_pagecache_pages(void); +extern unsigned long nr_free_buffer_pages(void); +extern unsigned long nr_free_pagecache_pages(void); /* Definition of global_page_state not available yet */ #define nr_free_pages() global_page_state(NR_FREE_PAGES) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 159f81577774..276140654305 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2813,13 +2813,13 @@ void free_pages_exact(void *virt, size_t size) } EXPORT_SYMBOL(free_pages_exact); -static unsigned int nr_free_zone_pages(int offset) +static unsigned long nr_free_zone_pages(int offset) { struct zoneref *z; struct zone *zone; /* Just pick one node, since fallback list is circular */ - unsigned int sum = 0; + unsigned long sum = 0; struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); @@ -2836,7 +2836,7 @@ static unsigned int nr_free_zone_pages(int offset) /* * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL */ -unsigned int nr_free_buffer_pages(void) +unsigned long nr_free_buffer_pages(void) { return nr_free_zone_pages(gfp_zone(GFP_USER)); } @@ -2845,7 +2845,7 @@ EXPORT_SYMBOL_GPL(nr_free_buffer_pages); /* * Amount of free RAM allocatable within all zones */ -unsigned int nr_free_pagecache_pages(void) +unsigned long nr_free_pagecache_pages(void) { return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); } -- cgit v1.2.3 From b21e0b90ccb99a377bce0167fed1e881bb5065d7 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Fri, 22 Feb 2013 16:35:48 -0800 Subject: vmscan: change type of vm_total_pages to unsigned long This variable is calculated from nr_free_pagecache_pages so change its type to unsigned long. Signed-off-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- mm/vmscan.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/swap.h') diff --git a/include/linux/swap.h b/include/linux/swap.h index 8a15f38ebc5c..2818a123f3ea 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -275,7 +275,7 @@ extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; extern int remove_mapping(struct address_space *mapping, struct page *page); -extern long vm_total_pages; +extern unsigned long vm_total_pages; #ifdef CONFIG_NUMA extern int zone_reclaim_mode; diff --git a/mm/vmscan.c b/mm/vmscan.c index 606d0bb46091..88c5fed8b9a4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -128,7 +128,7 @@ struct scan_control { * From 0 .. 100. Higher means more swappy. */ int vm_swappiness = 60; -long vm_total_pages; /* The total number of pages which the VM controls */ +unsigned long vm_total_pages; /* The total number of pages which the VM controls */ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); -- cgit v1.2.3