diff options
Diffstat (limited to 'mm/swap_state.c')
-rw-r--r-- | mm/swap_state.c | 396 |
1 files changed, 330 insertions, 66 deletions
diff --git a/mm/swap_state.c b/mm/swap_state.c index 539b8885e3d1..05b6803f0cce 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -19,6 +19,7 @@ #include <linux/migrate.h> #include <linux/vmalloc.h> #include <linux/swap_slots.h> +#include <linux/huge_mm.h> #include <asm/pgtable.h> @@ -36,8 +37,28 @@ static const struct address_space_operations swap_aops = { struct address_space *swapper_spaces[MAX_SWAPFILES]; static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; +bool swap_vma_readahead = true; + +#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) +#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) +#define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK +#define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK) + +#define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK) +#define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT) +#define SWAP_RA_ADDR(v) ((v) & PAGE_MASK) + +#define SWAP_RA_VAL(addr, win, hits) \ + (((addr) & PAGE_MASK) | \ + (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \ + ((hits) & SWAP_RA_HITS_MASK)) + +/* Initial readahead hits is 4 to start up with a small window */ +#define GET_SWAP_RA_VAL(vma) \ + (atomic_long_read(&(vma)->swap_readahead_info) ? : 4) #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) +#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0) static struct { unsigned long add_total; @@ -90,39 +111,46 @@ void show_swap_cache_info(void) */ int __add_to_swap_cache(struct page *page, swp_entry_t entry) { - int error; + int error, i, nr = hpage_nr_pages(page); struct address_space *address_space; + pgoff_t idx = swp_offset(entry); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - get_page(page); + page_ref_add(page, nr); SetPageSwapCache(page); - set_page_private(page, entry.val); address_space = swap_address_space(entry); spin_lock_irq(&address_space->tree_lock); - error = radix_tree_insert(&address_space->page_tree, - swp_offset(entry), page); - if (likely(!error)) { - address_space->nrpages++; - __inc_node_page_state(page, NR_FILE_PAGES); - INC_CACHE_INFO(add_total); + for (i = 0; i < nr; i++) { + set_page_private(page + i, entry.val + i); + error = radix_tree_insert(&address_space->page_tree, + idx + i, page + i); + if (unlikely(error)) + break; } - spin_unlock_irq(&address_space->tree_lock); - - if (unlikely(error)) { + if (likely(!error)) { + address_space->nrpages += nr; + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); + ADD_CACHE_INFO(add_total, nr); + } else { /* * Only the context which have set SWAP_HAS_CACHE flag * would call add_to_swap_cache(). * So add_to_swap_cache() doesn't returns -EEXIST. */ VM_BUG_ON(error == -EEXIST); - set_page_private(page, 0UL); + set_page_private(page + i, 0UL); + while (i--) { + radix_tree_delete(&address_space->page_tree, idx + i); + set_page_private(page + i, 0UL); + } ClearPageSwapCache(page); - put_page(page); + page_ref_sub(page, nr); } + spin_unlock_irq(&address_space->tree_lock); return error; } @@ -132,7 +160,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { int error; - error = radix_tree_maybe_preload(gfp_mask); + error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page)); if (!error) { error = __add_to_swap_cache(page, entry); radix_tree_preload_end(); @@ -146,8 +174,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { - swp_entry_t entry; struct address_space *address_space; + int i, nr = hpage_nr_pages(page); + swp_entry_t entry; + pgoff_t idx; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); @@ -155,12 +185,15 @@ void __delete_from_swap_cache(struct page *page) entry.val = page_private(page); address_space = swap_address_space(entry); - radix_tree_delete(&address_space->page_tree, swp_offset(entry)); - set_page_private(page, 0); + idx = swp_offset(entry); + for (i = 0; i < nr; i++) { + radix_tree_delete(&address_space->page_tree, idx + i); + set_page_private(page + i, 0); + } ClearPageSwapCache(page); - address_space->nrpages--; - __dec_node_page_state(page, NR_FILE_PAGES); - INC_CACHE_INFO(del_total); + address_space->nrpages -= nr; + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + ADD_CACHE_INFO(del_total, nr); } /** @@ -170,7 +203,7 @@ void __delete_from_swap_cache(struct page *page) * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. */ -int add_to_swap(struct page *page, struct list_head *list) +int add_to_swap(struct page *page) { swp_entry_t entry; int err; @@ -178,20 +211,12 @@ int add_to_swap(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageUptodate(page), page); - entry = get_swap_page(); + entry = get_swap_page(page); if (!entry.val) return 0; - if (mem_cgroup_try_charge_swap(page, entry)) { - swapcache_free(entry); - return 0; - } - - if (unlikely(PageTransHuge(page))) - if (unlikely(split_huge_page_to_list(page, list))) { - swapcache_free(entry); - return 0; - } + if (mem_cgroup_try_charge_swap(page, entry)) + goto fail; /* * Radix-tree node allocations from PF_MEMALLOC contexts could @@ -206,17 +231,30 @@ int add_to_swap(struct page *page, struct list_head *list) */ err = add_to_swap_cache(page, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - - if (!err) { - return 1; - } else { /* -ENOMEM radix-tree allocation failure */ + /* -ENOMEM radix-tree allocation failure */ + if (err) /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ - swapcache_free(entry); - return 0; - } + goto fail; + /* + * Normally the page will be dirtied in unmap because its pte should be + * dirty. A special case is MADV_FREE page. The page'e pte could have + * dirty bit cleared but the page's SwapBacked bit is still set because + * clearing the dirty bit and SwapBacked bit has no lock protected. For + * such page, unmap will not set dirty bit for it, so page reclaim will + * not write the page out. This can cause data corruption when the page + * is swap in later. Always setting the dirty bit for the page solves + * the problem. + */ + set_page_dirty(page); + + return 1; + +fail: + put_swap_page(page, entry); + return 0; } /* @@ -237,8 +275,8 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&address_space->tree_lock); - swapcache_free(entry); - put_page(page); + put_swap_page(page, entry); + page_ref_sub(page, hpage_nr_pages(page)); } /* @@ -289,19 +327,36 @@ void free_pages_and_swap_cache(struct page **pages, int nr) * lock getting page table operations atomic even if we drop the page * lock before returning. */ -struct page * lookup_swap_cache(swp_entry_t entry) +struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, + unsigned long addr) { struct page *page; + unsigned long ra_info; + int win, hits, readahead; page = find_get_page(swap_address_space(entry), swp_offset(entry)); + INC_CACHE_INFO(find_total); if (page) { INC_CACHE_INFO(find_success); - if (TestClearPageReadahead(page)) - atomic_inc(&swapin_readahead_hits); + if (unlikely(PageTransCompound(page))) + return page; + readahead = TestClearPageReadahead(page); + if (vma) { + ra_info = GET_SWAP_RA_VAL(vma); + win = SWAP_RA_WIN(ra_info); + hits = SWAP_RA_HITS(ra_info); + if (readahead) + hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); + atomic_long_set(&vma->swap_readahead_info, + SWAP_RA_VAL(addr, win, hits)); + } + if (readahead) { + count_vm_event(SWAP_RA_HIT); + if (!vma) + atomic_inc(&swapin_readahead_hits); + } } - - INC_CACHE_INFO(find_total); return page; } @@ -389,7 +444,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ - swapcache_free(entry); + put_swap_page(new_page, entry); } while (err != -ENOMEM); if (new_page) @@ -404,34 +459,32 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * the swap entry is no longer in use. */ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr) + struct vm_area_struct *vma, unsigned long addr, bool do_poll) { bool page_was_allocated; struct page *retpage = __read_swap_cache_async(entry, gfp_mask, vma, addr, &page_was_allocated); if (page_was_allocated) - swap_readpage(retpage); + swap_readpage(retpage, do_poll); return retpage; } -static unsigned long swapin_nr_pages(unsigned long offset) +static unsigned int __swapin_nr_pages(unsigned long prev_offset, + unsigned long offset, + int hits, + int max_pages, + int prev_win) { - static unsigned long prev_offset; - unsigned int pages, max_pages, last_ra; - static atomic_t last_readahead_pages; - - max_pages = 1 << READ_ONCE(page_cluster); - if (max_pages <= 1) - return 1; + unsigned int pages, last_ra; /* * This heuristic has been found to work well on both sequential and * random loads, swapping to hard disk or to SSD: please don't ask * what the "+ 2" means, it just happens to work well, that's all. */ - pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; + pages = hits + 2; if (pages == 2) { /* * We can have no readahead hits to judge by: but must not get @@ -440,7 +493,6 @@ static unsigned long swapin_nr_pages(unsigned long offset) */ if (offset != prev_offset + 1 && offset != prev_offset - 1) pages = 1; - prev_offset = offset; } else { unsigned int roundup = 4; while (roundup < pages) @@ -452,9 +504,28 @@ static unsigned long swapin_nr_pages(unsigned long offset) pages = max_pages; /* Don't shrink readahead too fast */ - last_ra = atomic_read(&last_readahead_pages) / 2; + last_ra = prev_win / 2; if (pages < last_ra) pages = last_ra; + + return pages; +} + +static unsigned long swapin_nr_pages(unsigned long offset) +{ + static unsigned long prev_offset; + unsigned int hits, pages, max_pages; + static atomic_t last_readahead_pages; + + max_pages = 1 << READ_ONCE(page_cluster); + if (max_pages <= 1) + return 1; + + hits = atomic_xchg(&swapin_readahead_hits, 0); + pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages, + atomic_read(&last_readahead_pages)); + if (!hits) + prev_offset = offset; atomic_set(&last_readahead_pages, pages); return pages; @@ -488,11 +559,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, unsigned long start_offset, end_offset; unsigned long mask; struct blk_plug plug; + bool do_poll = true, page_allocated; mask = swapin_nr_pages(offset) - 1; if (!mask) goto skip; + do_poll = false; /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; end_offset = offset | mask; @@ -502,19 +575,26 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ - page = read_swap_cache_async(swp_entry(swp_type(entry), offset), - gfp_mask, vma, addr); + page = __read_swap_cache_async( + swp_entry(swp_type(entry), offset), + gfp_mask, vma, addr, &page_allocated); if (!page) continue; - if (offset != entry_offset) - SetPageReadahead(page); + if (page_allocated) { + swap_readpage(page, false); + if (offset != entry_offset && + likely(!PageTransCompound(page))) { + SetPageReadahead(page); + count_vm_event(SWAP_RA); + } + } put_page(page); } blk_finish_plug(&plug); lru_add_drain(); /* Push any new pages onto the LRU now */ skip: - return read_swap_cache_async(entry, gfp_mask, vma, addr); + return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll); } int init_swap_address_space(unsigned int type, unsigned long nr_pages) @@ -551,3 +631,187 @@ void exit_swap_address_space(unsigned int type) synchronize_rcu(); kvfree(spaces); } + +static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, + unsigned long faddr, + unsigned long lpfn, + unsigned long rpfn, + unsigned long *start, + unsigned long *end) +{ + *start = max3(lpfn, PFN_DOWN(vma->vm_start), + PFN_DOWN(faddr & PMD_MASK)); + *end = min3(rpfn, PFN_DOWN(vma->vm_end), + PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); +} + +struct page *swap_readahead_detect(struct vm_fault *vmf, + struct vma_swap_readahead *swap_ra) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long swap_ra_info; + struct page *page; + swp_entry_t entry; + unsigned long faddr, pfn, fpfn; + unsigned long start, end; + pte_t *pte; + unsigned int max_win, hits, prev_win, win, left; +#ifndef CONFIG_64BIT + pte_t *tpte; +#endif + + max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster), + SWAP_RA_ORDER_CEILING); + if (max_win == 1) { + swap_ra->win = 1; + return NULL; + } + + faddr = vmf->address; + entry = pte_to_swp_entry(vmf->orig_pte); + if ((unlikely(non_swap_entry(entry)))) + return NULL; + page = lookup_swap_cache(entry, vma, faddr); + if (page) + return page; + + fpfn = PFN_DOWN(faddr); + swap_ra_info = GET_SWAP_RA_VAL(vma); + pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info)); + prev_win = SWAP_RA_WIN(swap_ra_info); + hits = SWAP_RA_HITS(swap_ra_info); + swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits, + max_win, prev_win); + atomic_long_set(&vma->swap_readahead_info, + SWAP_RA_VAL(faddr, win, 0)); + + if (win == 1) + return NULL; + + /* Copy the PTEs because the page table may be unmapped */ + if (fpfn == pfn + 1) + swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end); + else if (pfn == fpfn + 1) + swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1, + &start, &end); + else { + left = (win - 1) / 2; + swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left, + &start, &end); + } + swap_ra->nr_pte = end - start; + swap_ra->offset = fpfn - start; + pte = vmf->pte - swap_ra->offset; +#ifdef CONFIG_64BIT + swap_ra->ptes = pte; +#else + tpte = swap_ra->ptes; + for (pfn = start; pfn != end; pfn++) + *tpte++ = *pte++; +#endif + + return NULL; +} + +struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, + struct vm_fault *vmf, + struct vma_swap_readahead *swap_ra) +{ + struct blk_plug plug; + struct vm_area_struct *vma = vmf->vma; + struct page *page; + pte_t *pte, pentry; + swp_entry_t entry; + unsigned int i; + bool page_allocated; + + if (swap_ra->win == 1) + goto skip; + + blk_start_plug(&plug); + for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte; + i++, pte++) { + pentry = *pte; + if (pte_none(pentry)) + continue; + if (pte_present(pentry)) + continue; + entry = pte_to_swp_entry(pentry); + if (unlikely(non_swap_entry(entry))) + continue; + page = __read_swap_cache_async(entry, gfp_mask, vma, + vmf->address, &page_allocated); + if (!page) + continue; + if (page_allocated) { + swap_readpage(page, false); + if (i != swap_ra->offset && + likely(!PageTransCompound(page))) { + SetPageReadahead(page); + count_vm_event(SWAP_RA); + } + } + put_page(page); + } + blk_finish_plug(&plug); + lru_add_drain(); +skip: + return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, + swap_ra->win == 1); +} + +#ifdef CONFIG_SYSFS +static ssize_t vma_ra_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", swap_vma_readahead ? "true" : "false"); +} +static ssize_t vma_ra_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) + swap_vma_readahead = true; + else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) + swap_vma_readahead = false; + else + return -EINVAL; + + return count; +} +static struct kobj_attribute vma_ra_enabled_attr = + __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show, + vma_ra_enabled_store); + +static struct attribute *swap_attrs[] = { + &vma_ra_enabled_attr.attr, + NULL, +}; + +static struct attribute_group swap_attr_group = { + .attrs = swap_attrs, +}; + +static int __init swap_init_sysfs(void) +{ + int err; + struct kobject *swap_kobj; + + swap_kobj = kobject_create_and_add("swap", mm_kobj); + if (!swap_kobj) { + pr_err("failed to create swap kobject\n"); + return -ENOMEM; + } + err = sysfs_create_group(swap_kobj, &swap_attr_group); + if (err) { + pr_err("failed to register swap group\n"); + goto delete_obj; + } + return 0; + +delete_obj: + kobject_put(swap_kobj); + return err; +} +subsys_initcall(swap_init_sysfs); +#endif |