From fee7e49d45149fba60156f5b59014f764d3e3728 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 6 Jan 2015 13:00:05 -0800 Subject: mm: propagate error from stack expansion even for guard page Jay Foad reports that the address sanitizer test (asan) sometimes gets confused by a stack pointer that ends up being outside the stack vma that is reported by /proc/maps. This happens due to an interaction between RLIMIT_STACK and the guard page: when we do the guard page check, we ignore the potential error from the stack expansion, which effectively results in a missing guard page, since the expected stack expansion won't have been done. And since /proc/maps explicitly ignores the guard page (commit d7824370e263: "mm: fix up some user-visible effects of the stack guard page"), the stack pointer ends up being outside the reported stack area. This is the minimal patch: it just propagates the error. It also effectively makes the guard page part of the stack limit, which in turn measn that the actual real stack is one page less than the stack limit. Let's see if anybody notices. We could teach acct_stack_growth() to allow an extra page for a grow-up/grow-down stack in the rlimit test, but I don't want to add more complexity if it isn't needed. Reported-and-tested-by: Jay Foad Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index ca920d1fd314..d7e497e98f46 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2593,7 +2593,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo if (prev && prev->vm_end == address) return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; - expand_downwards(vma, address - PAGE_SIZE); + return expand_downwards(vma, address - PAGE_SIZE); } if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { struct vm_area_struct *next = vma->vm_next; @@ -2602,7 +2602,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo if (next && next->vm_start == address + PAGE_SIZE) return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; - expand_upwards(vma, address + PAGE_SIZE); + return expand_upwards(vma, address + PAGE_SIZE); } return 0; } -- cgit v1.2.3 From 2d6d7f98284648c5ed113fe22a132148950b140f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 8 Jan 2015 14:32:18 -0800 Subject: mm: protect set_page_dirty() from ongoing truncation Tejun, while reviewing the code, spotted the following race condition between the dirtying and truncation of a page: __set_page_dirty_nobuffers() __delete_from_page_cache() if (TestSetPageDirty(page)) page->mapping = NULL if (PageDirty()) dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); if (page->mapping) account_page_dirtied(page) __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); which results in an imbalance of NR_FILE_DIRTY and BDI_RECLAIMABLE. Dirtiers usually lock out truncation, either by holding the page lock directly, or in case of zap_pte_range(), by pinning the mapcount with the page table lock held. The notable exception to this rule, though, is do_wp_page(), for which this race exists. However, do_wp_page() already waits for a locked page to unlock before setting the dirty bit, in order to prevent a race where clear_page_dirty() misses the page bit in the presence of dirty ptes. Upgrade that wait to a fully locked set_page_dirty() to also cover the situation explained above. Afterwards, the code in set_page_dirty() dealing with a truncation race is no longer needed. Remove it. Reported-by: Tejun Heo Signed-off-by: Johannes Weiner Acked-by: Kirill A. Shutemov Reviewed-by: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 1 - mm/memory.c | 27 +++++++++++++++++---------- mm/page-writeback.c | 43 ++++++++++++------------------------------- 3 files changed, 29 insertions(+), 42 deletions(-) (limited to 'mm/memory.c') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a219be961c0a..00048339c23e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -177,7 +177,6 @@ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data); int do_writepages(struct address_space *mapping, struct writeback_control *wbc); -void set_page_dirty_balance(struct page *page); void writeback_set_ratelimit(void); void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); diff --git a/mm/memory.c b/mm/memory.c index d7e497e98f46..c6565f00fb38 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2137,17 +2137,24 @@ reuse: if (!dirty_page) return ret; - /* - * Yes, Virginia, this is actually required to prevent a race - * with clear_page_dirty_for_io() from clearing the page dirty - * bit after it clear all dirty ptes, but before a racing - * do_wp_page installs a dirty pte. - * - * do_shared_fault is protected similarly. - */ if (!page_mkwrite) { - wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page); + struct address_space *mapping; + int dirtied; + + lock_page(dirty_page); + dirtied = set_page_dirty(dirty_page); + VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page); + mapping = dirty_page->mapping; + unlock_page(dirty_page); + + if (dirtied && mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + /* file_update_time outside page_lock */ if (vma->vm_file) file_update_time(vma->vm_file); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d5d81f5384d1..6f4335238e33 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1541,16 +1541,6 @@ pause: bdi_start_background_writeback(bdi); } -void set_page_dirty_balance(struct page *page) -{ - if (set_page_dirty(page)) { - struct address_space *mapping = page_mapping(page); - - if (mapping) - balance_dirty_pages_ratelimited(mapping); - } -} - static DEFINE_PER_CPU(int, bdp_ratelimits); /* @@ -2123,32 +2113,25 @@ EXPORT_SYMBOL(account_page_dirtied); * page dirty in that case, but not all the buffers. This is a "bottom-up" * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. * - * Most callers have locked the page, which pins the address_space in memory. - * But zap_pte_range() does not lock the page, however in that case the - * mapping is pinned by the vma's ->vm_file reference. - * - * We take care to handle the case where the page was truncated from the - * mapping by re-checking page_mapping() inside tree_lock. + * The caller must ensure this doesn't race with truncation. Most will simply + * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and + * the pte lock held, which also locks out truncation. */ int __set_page_dirty_nobuffers(struct page *page) { if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); - struct address_space *mapping2; unsigned long flags; if (!mapping) return 1; spin_lock_irqsave(&mapping->tree_lock, flags); - mapping2 = page_mapping(page); - if (mapping2) { /* Race with truncate? */ - BUG_ON(mapping2 != mapping); - WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - } + BUG_ON(page_mapping(page) != mapping); + WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); + account_page_dirtied(page, mapping); + radix_tree_tag_set(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); spin_unlock_irqrestore(&mapping->tree_lock, flags); if (mapping->host) { /* !PageAnon && !swapper_space */ @@ -2305,12 +2288,10 @@ int clear_page_dirty_for_io(struct page *page) /* * We carefully synchronise fault handlers against * installing a dirty pte and marking the page dirty - * at this point. We do this by having them hold the - * page lock at some point after installing their - * pte, but before marking the page dirty. - * Pages are always locked coming in here, so we get - * the desired exclusion. See mm/memory.c:do_wp_page() - * for more comments. + * at this point. We do this by having them hold the + * page lock while dirtying the page, and pages are + * always locked coming in here, so we get the desired + * exclusion. */ if (TestClearPageDirty(page)) { dec_zone_page_state(page, NR_FILE_DIRTY); -- cgit v1.2.3 From 721c21c17ab958abf19a8fc611c3bd4743680e38 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 12 Jan 2015 19:10:55 +0000 Subject: mm: mmu_gather: use tlb->end != 0 only for TLB invalidation When batching up address ranges for TLB invalidation, we check tlb->end != 0 to indicate that some pages have actually been unmapped. As of commit f045bbb9fa1b ("mmu_gather: fix over-eager tlb_flush_mmu_free() calling"), we use the same check for freeing these pages in order to avoid a performance regression where we call free_pages_and_swap_cache even when no pages are actually queued up. Unfortunately, the range could have been reset (tlb->end = 0) by tlb_end_vma, which has been shown to cause memory leaks on arm64. Furthermore, investigation into these leaks revealed that the fullmm case on task exit no longer invalidates the TLB, by virtue of tlb->end == 0 (in 3.18, need_flush would have been set). This patch resolves the problem by reverting commit f045bbb9fa1b, using instead tlb->local.nr as the predicate for page freeing in tlb_flush_mmu_free and ensuring that tlb->end is initialised to a non-zero value in the fullmm case. Tested-by: Mark Langsdorf Tested-by: Dave Hansen Signed-off-by: Will Deacon Signed-off-by: Linus Torvalds --- include/asm-generic/tlb.h | 8 ++++++-- mm/memory.c | 8 ++++---- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'mm/memory.c') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 08848050922e..db284bff29dc 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -136,8 +136,12 @@ static inline void __tlb_adjust_range(struct mmu_gather *tlb, static inline void __tlb_reset_range(struct mmu_gather *tlb) { - tlb->start = TASK_SIZE; - tlb->end = 0; + if (tlb->fullmm) { + tlb->start = tlb->end = ~0; + } else { + tlb->start = TASK_SIZE; + tlb->end = 0; + } } /* diff --git a/mm/memory.c b/mm/memory.c index c6565f00fb38..54f3a9b00956 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -235,6 +235,9 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { + if (!tlb->end) + return; + tlb_flush(tlb); mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end); #ifdef CONFIG_HAVE_RCU_TABLE_FREE @@ -247,7 +250,7 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; - for (batch = &tlb->local; batch; batch = batch->next) { + for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { free_pages_and_swap_cache(batch->pages, batch->nr); batch->nr = 0; } @@ -256,9 +259,6 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb) void tlb_flush_mmu(struct mmu_gather *tlb) { - if (!tlb->end) - return; - tlb_flush_mmu_tlbonly(tlb); tlb_flush_mmu_free(tlb); } -- cgit v1.2.3