From 1c290f642101e64f379e38ea0361d097c08e824d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:07 -0800 Subject: mm: sanitize page->mapping for tail pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't define meaning of page->mapping for tail pages. Currently it's always NULL, which can be inconsistent with head page and potentially lead to problems. Let's poison the pointer to catch all illigal uses. page_rmapping(), page_mapping() and page_anon_vma() are changed to look on head page. The only illegal use I've caught so far is __GPF_COMP pages from sound subsystem, mapped with PTEs. do_shared_fault() is changed to use page_rmapping() instead of direct access to fault_page->mapping. Signed-off-by: Kirill A. Shutemov Reviewed-by: Jérôme Glisse Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm/util.c') diff --git a/mm/util.c b/mm/util.c index 2d28f7930043..8acb936a52c8 100644 --- a/mm/util.c +++ b/mm/util.c @@ -386,7 +386,9 @@ struct anon_vma *page_anon_vma(struct page *page) struct address_space *page_mapping(struct page *page) { - unsigned long mapping; + struct address_space *mapping; + + page = compound_head(page); /* This happens if someone calls flush_dcache_page on slab page */ if (unlikely(PageSlab(page))) @@ -399,10 +401,10 @@ struct address_space *page_mapping(struct page *page) return swap_address_space(entry); } - mapping = (unsigned long)page->mapping; - if (mapping & PAGE_MAPPING_FLAGS) + mapping = page->mapping; + if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) return NULL; - return page->mapping; + return mapping; } int overcommit_ratio_handler(struct ctl_table *table, int write, -- cgit v1.2.3 From b20ce5e03b936be077463015661dcf52be274e5b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:54:37 -0800 Subject: mm: prepare page_referenced() and page_idle to new THP refcounting Both page_referenced() and page_idle_clear_pte_refs_one() assume that THP can only be mapped with PMD, so there's no reason to look on PTEs for PageTransHuge() pages. That's no true anymore: THP can be mapped with PTEs too. The patch removes PageTransHuge() test from the functions and opencode page table check. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Kirill A. Shutemov Cc: Vladimir Davydov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Naoya Horiguchi Cc: Sasha Levin Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 5 --- include/linux/mm.h | 23 ++++++---- mm/huge_memory.c | 73 ++++++++---------------------- mm/page_idle.c | 65 +++++++++++++++++++++++---- mm/rmap.c | 117 +++++++++++++++++++++++++++++++++--------------- mm/util.c | 14 ++++++ 6 files changed, 185 insertions(+), 112 deletions(-) (limited to 'mm/util.c') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7aec5ee9cfdf..72cd942edb22 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -48,11 +48,6 @@ enum transparent_hugepage_flag { #endif }; -extern pmd_t *page_check_address_pmd(struct page *page, - struct mm_struct *mm, - unsigned long address, - spinlock_t **ptl); - #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<_mapcount, -1); } +int __page_mapcount(struct page *page); + static inline int page_mapcount(struct page *page) { - int ret; VM_BUG_ON_PAGE(PageSlab(page), page); - ret = atomic_read(&page->_mapcount) + 1; - if (PageCompound(page)) { - page = compound_head(page); - ret += atomic_read(compound_mapcount_ptr(page)) + 1; - if (PageDoubleMap(page)) - ret--; - } - return ret; + if (unlikely(PageCompound(page))) + return __page_mapcount(page); + return atomic_read(&page->_mapcount) + 1; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int total_mapcount(struct page *page); +#else +static inline int total_mapcount(struct page *page) +{ + return page_mapcount(page); } +#endif static inline int page_count(struct page *page) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f283cb7c480e..ab544b145b52 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1649,46 +1649,6 @@ bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, return false; } -/* - * This function returns whether a given @page is mapped onto the @address - * in the virtual space of @mm. - * - * When it's true, this function returns *pmd with holding the page table lock - * and passing it back to the caller via @ptl. - * If it's false, returns NULL without holding the page table lock. - */ -pmd_t *page_check_address_pmd(struct page *page, - struct mm_struct *mm, - unsigned long address, - spinlock_t **ptl) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - - if (address & ~HPAGE_PMD_MASK) - return NULL; - - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - return NULL; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return NULL; - pmd = pmd_offset(pud, address); - - *ptl = pmd_lock(mm, pmd); - if (!pmd_present(*pmd)) - goto unlock; - if (pmd_page(*pmd) != page) - goto unlock; - if (pmd_trans_huge(*pmd)) - return pmd; -unlock: - spin_unlock(*ptl); - return NULL; -} - #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) int hugepage_madvise(struct vm_area_struct *vma, @@ -3097,20 +3057,6 @@ static void unfreeze_page(struct anon_vma *anon_vma, struct page *page) } } -static int total_mapcount(struct page *page) -{ - int i, ret; - - ret = compound_mapcount(page); - for (i = 0; i < HPAGE_PMD_NR; i++) - ret += atomic_read(&page[i]._mapcount) + 1; - - if (PageDoubleMap(page)) - ret -= HPAGE_PMD_NR; - - return ret; -} - static int __split_huge_page_tail(struct page *head, int tail, struct lruvec *lruvec, struct list_head *list) { @@ -3211,6 +3157,25 @@ static void __split_huge_page(struct page *page, struct list_head *list) } } +int total_mapcount(struct page *page) +{ + int i, ret; + + VM_BUG_ON_PAGE(PageTail(page), page); + + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) + 1; + + ret = compound_mapcount(page); + if (PageHuge(page)) + return ret; + for (i = 0; i < HPAGE_PMD_NR; i++) + ret += atomic_read(&page[i]._mapcount) + 1; + if (PageDoubleMap(page)) + ret -= HPAGE_PMD_NR; + return ret; +} + /* * This function splits huge page into normal pages. @page can point to any * subpage of huge page to split. Split doesn't change the position of @page. diff --git a/mm/page_idle.c b/mm/page_idle.c index 1c245d9027e3..2c553ba969f8 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -56,23 +56,70 @@ static int page_idle_clear_pte_refs_one(struct page *page, { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; + pgd_t *pgd; + pud_t *pud; pmd_t *pmd; pte_t *pte; bool referenced = false; - if (unlikely(PageTransHuge(page))) { - pmd = page_check_address_pmd(page, mm, addr, &ptl); - if (pmd) { - referenced = pmdp_clear_young_notify(vma, addr, pmd); + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return SWAP_AGAIN; + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + return SWAP_AGAIN; + pmd = pmd_offset(pud, addr); + + if (pmd_trans_huge(*pmd)) { + ptl = pmd_lock(mm, pmd); + if (!pmd_present(*pmd)) + goto unlock_pmd; + if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); + goto map_pte; } + + if (pmd_page(*pmd) != page) + goto unlock_pmd; + + referenced = pmdp_clear_young_notify(vma, addr, pmd); + spin_unlock(ptl); + goto found; +unlock_pmd: + spin_unlock(ptl); + return SWAP_AGAIN; } else { - pte = page_check_address(page, mm, addr, &ptl, 0); - if (pte) { - referenced = ptep_clear_young_notify(vma, addr, pte); - pte_unmap_unlock(pte, ptl); - } + pmd_t pmde = *pmd; + + barrier(); + if (!pmd_present(pmde) || pmd_trans_huge(pmde)) + return SWAP_AGAIN; + + } +map_pte: + pte = pte_offset_map(pmd, addr); + if (!pte_present(*pte)) { + pte_unmap(pte); + return SWAP_AGAIN; } + + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + + if (!pte_present(*pte)) { + pte_unmap_unlock(pte, ptl); + return SWAP_AGAIN; + } + + /* THP can be referenced by any subpage */ + if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) { + pte_unmap_unlock(pte, ptl); + return SWAP_AGAIN; + } + + referenced = ptep_clear_young_notify(vma, addr, pte); + pte_unmap_unlock(pte, ptl); +found: if (referenced) { clear_page_idle(page); /* diff --git a/mm/rmap.c b/mm/rmap.c index 31d8866fb562..6127c00b2262 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -814,58 +814,105 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, spinlock_t *ptl; int referenced = 0; struct page_referenced_arg *pra = arg; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; - if (unlikely(PageTransHuge(page))) { - pmd_t *pmd; - - /* - * rmap might return false positives; we must filter - * these out using page_check_address_pmd(). - */ - pmd = page_check_address_pmd(page, mm, address, &ptl); - if (!pmd) + if (unlikely(PageHuge(page))) { + /* when pud is not present, pte will be NULL */ + pte = huge_pte_offset(mm, address); + if (!pte) return SWAP_AGAIN; - if (vma->vm_flags & VM_LOCKED) { + ptl = huge_pte_lockptr(page_hstate(page), mm, pte); + goto check_pte; + } + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return SWAP_AGAIN; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return SWAP_AGAIN; + pmd = pmd_offset(pud, address); + + if (pmd_trans_huge(*pmd)) { + int ret = SWAP_AGAIN; + + ptl = pmd_lock(mm, pmd); + if (!pmd_present(*pmd)) + goto unlock_pmd; + if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); + goto map_pte; + } + + if (pmd_page(*pmd) != page) + goto unlock_pmd; + + if (vma->vm_flags & VM_LOCKED) { pra->vm_flags |= VM_LOCKED; - return SWAP_FAIL; /* To break the loop */ + ret = SWAP_FAIL; /* To break the loop */ + goto unlock_pmd; } if (pmdp_clear_flush_young_notify(vma, address, pmd)) referenced++; spin_unlock(ptl); + goto found; +unlock_pmd: + spin_unlock(ptl); + return ret; } else { - pte_t *pte; + pmd_t pmde = *pmd; - /* - * rmap might return false positives; we must filter - * these out using page_check_address(). - */ - pte = page_check_address(page, mm, address, &ptl, 0); - if (!pte) + barrier(); + if (!pmd_present(pmde) || pmd_trans_huge(pmde)) return SWAP_AGAIN; + } +map_pte: + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) { + pte_unmap(pte); + return SWAP_AGAIN; + } - if (vma->vm_flags & VM_LOCKED) { - pte_unmap_unlock(pte, ptl); - pra->vm_flags |= VM_LOCKED; - return SWAP_FAIL; /* To break the loop */ - } + ptl = pte_lockptr(mm, pmd); +check_pte: + spin_lock(ptl); - if (ptep_clear_flush_young_notify(vma, address, pte)) { - /* - * Don't treat a reference through a sequentially read - * mapping as such. If the page has been used in - * another mapping, we will catch it; if this other - * mapping is already gone, the unmap path will have - * set PG_referenced or activated the page. - */ - if (likely(!(vma->vm_flags & VM_SEQ_READ))) - referenced++; - } + if (!pte_present(*pte)) { + pte_unmap_unlock(pte, ptl); + return SWAP_AGAIN; + } + + /* THP can be referenced by any subpage */ + if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) { + pte_unmap_unlock(pte, ptl); + return SWAP_AGAIN; + } + + if (vma->vm_flags & VM_LOCKED) { pte_unmap_unlock(pte, ptl); + pra->vm_flags |= VM_LOCKED; + return SWAP_FAIL; /* To break the loop */ } + if (ptep_clear_flush_young_notify(vma, address, pte)) { + /* + * Don't treat a reference through a sequentially read + * mapping as such. If the page has been used in + * another mapping, we will catch it; if this other + * mapping is already gone, the unmap path will have + * set PG_referenced or activated the page. + */ + if (likely(!(vma->vm_flags & VM_SEQ_READ))) + referenced++; + } + pte_unmap_unlock(pte, ptl); + +found: if (referenced) clear_page_idle(page); if (test_and_clear_page_young(page)) @@ -912,7 +959,7 @@ int page_referenced(struct page *page, int ret; int we_locked = 0; struct page_referenced_arg pra = { - .mapcount = page_mapcount(page), + .mapcount = total_mapcount(page), .memcg = memcg, }; struct rmap_walk_control rwc = { diff --git a/mm/util.c b/mm/util.c index 8acb936a52c8..6d1f9200f74e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -407,6 +407,20 @@ struct address_space *page_mapping(struct page *page) return mapping; } +/* Slow path of page_mapcount() for compound pages */ +int __page_mapcount(struct page *page) +{ + int ret; + + ret = atomic_read(&page->_mapcount) + 1; + page = compound_head(page); + ret += atomic_read(compound_mapcount_ptr(page)) + 1; + if (PageDoubleMap(page)) + ret--; + return ret; +} +EXPORT_SYMBOL_GPL(__page_mapcount); + int overcommit_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) -- cgit v1.2.3 From a3b609ef9f8b1dbfe97034ccad6cd3fe71fbe7ab Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 20 Jan 2016 15:01:05 -0800 Subject: proc read mm's {arg,env}_{start,end} with mmap semaphore taken. Only functions doing more than one read are modified. Consumeres happened to deal with possibly changing data, but it does not seem like a good thing to rely on. Signed-off-by: Mateusz Guzik Acked-by: Cyrill Gorcunov Cc: Alexey Dobriyan Cc: Jarod Wilson Cc: Jan Stancek Cc: Al Viro Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 13 ++++++++++--- mm/util.c | 16 ++++++++++++---- 2 files changed, 22 insertions(+), 7 deletions(-) (limited to 'mm/util.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index e665097c1da5..4f764c2ac1a5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -953,6 +953,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, unsigned long src = *ppos; int ret = 0; struct mm_struct *mm = file->private_data; + unsigned long env_start, env_end; if (!mm) return 0; @@ -964,19 +965,25 @@ static ssize_t environ_read(struct file *file, char __user *buf, ret = 0; if (!atomic_inc_not_zero(&mm->mm_users)) goto free; + + down_read(&mm->mmap_sem); + env_start = mm->env_start; + env_end = mm->env_end; + up_read(&mm->mmap_sem); + while (count > 0) { size_t this_len, max_len; int retval; - if (src >= (mm->env_end - mm->env_start)) + if (src >= (env_end - env_start)) break; - this_len = mm->env_end - (mm->env_start + src); + this_len = env_end - (env_start + src); max_len = min_t(size_t, PAGE_SIZE, count); this_len = min(max_len, this_len); - retval = access_remote_vm(mm, (mm->env_start + src), + retval = access_remote_vm(mm, (env_start + src), page, this_len, 0); if (retval <= 0) { diff --git a/mm/util.c b/mm/util.c index 6d1f9200f74e..c108a6542d05 100644 --- a/mm/util.c +++ b/mm/util.c @@ -476,17 +476,25 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) int res = 0; unsigned int len; struct mm_struct *mm = get_task_mm(task); + unsigned long arg_start, arg_end, env_start, env_end; if (!mm) goto out; if (!mm->arg_end) goto out_mm; /* Shh! No looking before we're done */ - len = mm->arg_end - mm->arg_start; + down_read(&mm->mmap_sem); + arg_start = mm->arg_start; + arg_end = mm->arg_end; + env_start = mm->env_start; + env_end = mm->env_end; + up_read(&mm->mmap_sem); + + len = arg_end - arg_start; if (len > buflen) len = buflen; - res = access_process_vm(task, mm->arg_start, buffer, len, 0); + res = access_process_vm(task, arg_start, buffer, len, 0); /* * If the nul at the end of args has been overwritten, then @@ -497,10 +505,10 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) if (len < res) { res = len; } else { - len = mm->env_end - mm->env_start; + len = env_end - env_start; if (len > buflen - res) len = buflen - res; - res += access_process_vm(task, mm->env_start, + res += access_process_vm(task, env_start, buffer+res, len, 0); res = strnlen(buffer, res); } -- cgit v1.2.3