From 9977f0f164d46613288e0b5778eae500dfe06f31 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Tue, 12 Feb 2013 13:46:20 -0800 Subject: mm: don't overwrite mm->def_flags in do_mlockall() With commit 8e72033f2a48 ("thp: make MADV_HUGEPAGE check for mm->def_flags") the VM_NOHUGEPAGE flag may be set on s390 in mm->def_flags for certain processes, to prevent future thp mappings. This would be overwritten by do_mlockall(), which sets it back to 0 with an optional VM_LOCKED flag set. To fix this, instead of overwriting mm->def_flags in do_mlockall(), only the VM_LOCKED flag should be set or cleared. Signed-off-by: Gerald Schaefer Reported-by: Vivek Goyal Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/mlock.c') diff --git a/mm/mlock.c b/mm/mlock.c index f0b9ce572fc7..c9bd528b01d2 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -517,11 +517,11 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) static int do_mlockall(int flags) { struct vm_area_struct * vma, * prev = NULL; - unsigned int def_flags = 0; if (flags & MCL_FUTURE) - def_flags = VM_LOCKED; - current->mm->def_flags = def_flags; + current->mm->def_flags |= VM_LOCKED; + else + current->mm->def_flags &= ~VM_LOCKED; if (flags == MCL_FUTURE) goto out; -- cgit v1.2.3 From bebeb3d68b24bb4132d452c5707fe321208bcbcd Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Fri, 22 Feb 2013 16:32:37 -0800 Subject: mm: introduce mm_populate() for populating new vmas When creating new mappings using the MAP_POPULATE / MAP_LOCKED flags (or with MCL_FUTURE in effect), we want to populate the pages within the newly created vmas. This may take a while as we may have to read pages from disk, so ideally we want to do this outside of the write-locked mmap_sem region. This change introduces mm_populate(), which is used to defer populating such mappings until after the mmap_sem write lock has been released. This is implemented as a generalization of the former do_mlock_pages(), which accomplished the same task but was using during mlock() / mlockall(). Signed-off-by: Michel Lespinasse Reported-by: Andy Lutomirski Acked-by: Rik van Riel Tested-by: Andy Lutomirski Cc: Greg Ungerer Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 6 +++++- include/linux/mm.h | 18 +++++++++++++++--- ipc/shm.c | 12 +++++++----- mm/mlock.c | 17 +++++++++++------ mm/mmap.c | 20 +++++++++++++++----- mm/nommu.c | 5 ++++- mm/util.c | 6 +++++- 7 files changed, 62 insertions(+), 22 deletions(-) (limited to 'mm/mlock.c') diff --git a/fs/aio.c b/fs/aio.c index 71f613cf4a85..82eec7c7b4bb 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -103,6 +103,7 @@ static int aio_setup_ring(struct kioctx *ctx) unsigned nr_events = ctx->max_reqs; unsigned long size; int nr_pages; + bool populate; /* Compensate for the ring buffer's head/tail overlap entry */ nr_events += 2; /* 1 is required, 2 for good luck */ @@ -129,7 +130,8 @@ static int aio_setup_ring(struct kioctx *ctx) down_write(&ctx->mm->mmap_sem); info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, 0); + MAP_ANONYMOUS|MAP_PRIVATE, 0, + &populate); if (IS_ERR((void *)info->mmap_base)) { up_write(&ctx->mm->mmap_sem); info->mmap_size = 0; @@ -147,6 +149,8 @@ static int aio_setup_ring(struct kioctx *ctx) aio_free_ring(ctx); return -EAGAIN; } + if (populate) + mm_populate(info->mmap_base, info->mmap_size); ctx->user_id = info->mmap_base; diff --git a/include/linux/mm.h b/include/linux/mm.h index 9d9dcc35d6a1..da0a0fe970c2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1474,11 +1474,23 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff); -extern unsigned long do_mmap_pgoff(struct file *, unsigned long, - unsigned long, unsigned long, - unsigned long, unsigned long); +extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, unsigned long flags, + unsigned long pgoff, bool *populate); extern int do_munmap(struct mm_struct *, unsigned long, size_t); +#ifdef CONFIG_MMU +extern int __mm_populate(unsigned long addr, unsigned long len, + int ignore_errors); +static inline void mm_populate(unsigned long addr, unsigned long len) +{ + /* Ignore errors */ + (void) __mm_populate(addr, len, 1); +} +#else +static inline void mm_populate(unsigned long addr, unsigned long len) {} +#endif + /* These take the mm semaphore themselves */ extern unsigned long vm_brk(unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); diff --git a/ipc/shm.c b/ipc/shm.c index 4fa6d8fee730..9f047ba69e62 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -967,11 +967,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, unsigned long flags; unsigned long prot; int acc_mode; - unsigned long user_addr; struct ipc_namespace *ns; struct shm_file_data *sfd; struct path path; fmode_t f_mode; + bool populate = false; err = -EINVAL; if (shmid < 0) @@ -1070,13 +1070,15 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, goto invalid; } - user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0); - *raddr = user_addr; + addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); + *raddr = addr; err = 0; - if (IS_ERR_VALUE(user_addr)) - err = (long)user_addr; + if (IS_ERR_VALUE(addr)) + err = (long)addr; invalid: up_write(¤t->mm->mmap_sem); + if (populate) + mm_populate(addr, size); out_fput: fput(file); diff --git a/mm/mlock.c b/mm/mlock.c index c9bd528b01d2..a296a49865df 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -416,7 +416,14 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } -static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) +/* + * __mm_populate - populate and/or mlock pages within a range of address space. + * + * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap + * flags. VMAs must be already marked with the desired vm_flags, and + * mmap_sem must not be held. + */ +int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) { struct mm_struct *mm = current->mm; unsigned long end, nstart, nend; @@ -498,7 +505,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) error = do_mlock(start, len, 1); up_write(¤t->mm->mmap_sem); if (!error) - error = do_mlock_pages(start, len, 0); + error = __mm_populate(start, len, 0); return error; } @@ -564,10 +571,8 @@ SYSCALL_DEFINE1(mlockall, int, flags) capable(CAP_IPC_LOCK)) ret = do_mlockall(flags); up_write(¤t->mm->mmap_sem); - if (!ret && (flags & MCL_CURRENT)) { - /* Ignore errors */ - do_mlock_pages(0, TASK_SIZE, 1); - } + if (!ret && (flags & MCL_CURRENT)) + mm_populate(0, TASK_SIZE); out: return ret; } diff --git a/mm/mmap.c b/mm/mmap.c index 09da0b264982..9b12e3047a86 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1154,12 +1154,15 @@ static inline unsigned long round_hint_to_min(unsigned long hint) unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long pgoff) + unsigned long flags, unsigned long pgoff, + bool *populate) { struct mm_struct * mm = current->mm; struct inode *inode; vm_flags_t vm_flags; + *populate = false; + /* * Does the application expect PROT_READ to imply PROT_EXEC? * @@ -1280,7 +1283,12 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } } - return mmap_region(file, addr, len, flags, vm_flags, pgoff); + addr = mmap_region(file, addr, len, flags, vm_flags, pgoff); + if (!IS_ERR_VALUE(addr) && + ((vm_flags & VM_LOCKED) || + (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) + *populate = true; + return addr; } SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, @@ -1531,10 +1539,12 @@ out: vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { - if (!mlock_vma_pages_range(vma, addr, addr + len)) + if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current->mm))) mm->locked_vm += (len >> PAGE_SHIFT); - } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) - make_pages_present(addr, addr + len); + else + vma->vm_flags &= ~VM_LOCKED; + } if (file) uprobe_mmap(vma); diff --git a/mm/nommu.c b/mm/nommu.c index b20db4e22263..7296a5a280e7 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1250,7 +1250,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long pgoff) + unsigned long pgoff, + bool *populate) { struct vm_area_struct *vma; struct vm_region *region; @@ -1260,6 +1261,8 @@ unsigned long do_mmap_pgoff(struct file *file, kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); + *populate = false; + /* decide whether we should attempt the mapping, and if so what sort of * mapping */ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, diff --git a/mm/util.c b/mm/util.c index c55e26b17d93..13467e043e9e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -355,12 +355,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, { unsigned long ret; struct mm_struct *mm = current->mm; + bool populate; ret = security_mmap_file(file, prot, flag); if (!ret) { down_write(&mm->mmap_sem); - ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff); + ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, + &populate); up_write(&mm->mmap_sem); + if (!IS_ERR_VALUE(ret) && populate) + mm_populate(ret, len); } return ret; } -- cgit v1.2.3 From cea10a19b7972a1954c4a2d05a7de8db48b444fb Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Fri, 22 Feb 2013 16:32:44 -0800 Subject: mm: directly use __mlock_vma_pages_range() in find_extend_vma() In find_extend_vma(), we don't need mlock_vma_pages_range() to verify the vma type - we know we're working with a stack. So, we can call directly into __mlock_vma_pages_range(), and remove the last make_pages_present() call site. Note that we don't use mm_populate() here, so we can't release the mmap_sem while allocating new stack pages. This is deemed acceptable, because the stack vmas grow by a bounded number of pages at a time, and these are anon pages so we don't have to read from disk to populate them. Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Tested-by: Andy Lutomirski Cc: Greg Ungerer Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/internal.h | 4 ++-- mm/memory.c | 24 ----------------------- mm/mlock.c | 57 +++--------------------------------------------------- mm/mmap.c | 10 ++++------ 5 files changed, 9 insertions(+), 87 deletions(-) (limited to 'mm/mlock.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8332f3069fe3..0c34d3486816 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1035,7 +1035,6 @@ static inline int fixup_user_fault(struct task_struct *tsk, } #endif -extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, int write); diff --git a/mm/internal.h b/mm/internal.h index 9ba21100ebf3..1c0c4cc0fcf7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -162,8 +162,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent); #ifdef CONFIG_MMU -extern long mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end); +extern long __mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *nonblocking); extern void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); static inline void munlock_vma_pages_all(struct vm_area_struct *vma) diff --git a/mm/memory.c b/mm/memory.c index 0abd07097ec6..7837ceacf090 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3824,30 +3824,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ -int make_pages_present(unsigned long addr, unsigned long end) -{ - int ret, len, write; - struct vm_area_struct * vma; - - vma = find_vma(current->mm, addr); - if (!vma) - return -ENOMEM; - /* - * We want to touch writable mappings with a write fault in order - * to break COW, except for shared mappings because these don't COW - * and we would not want to dirty them for nothing. - */ - write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; - BUG_ON(addr >= end); - BUG_ON(end > vma->vm_end); - len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; - ret = get_user_pages(current, current->mm, addr, - len, write, 0, NULL, NULL); - if (ret < 0) - return ret; - return ret == len ? 0 : -EFAULT; -} - #if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) diff --git a/mm/mlock.c b/mm/mlock.c index a296a49865df..569400a5d079 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -155,9 +155,8 @@ void munlock_vma_page(struct page *page) * * vma->vm_mm->mmap_sem must be held for at least read. */ -static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - int *nonblocking) +long __mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *nonblocking) { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; @@ -202,56 +201,6 @@ static int __mlock_posix_error_return(long retval) return retval; } -/** - * mlock_vma_pages_range() - mlock pages in specified vma range. - * @vma - the vma containing the specfied address range - * @start - starting address in @vma to mlock - * @end - end address [+1] in @vma to mlock - * - * For mmap()/mremap()/expansion of mlocked vma. - * - * return 0 on success for "normal" vmas. - * - * return number of pages [> 0] to be removed from locked_vm on success - * of "special" vmas. - */ -long mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - int nr_pages = (end - start) / PAGE_SIZE; - BUG_ON(!(vma->vm_flags & VM_LOCKED)); - - /* - * filter unlockable vmas - */ - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - goto no_mlock; - - if (!((vma->vm_flags & VM_DONTEXPAND) || - is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current->mm))) { - - __mlock_vma_pages_range(vma, start, end, NULL); - - /* Hide errors from mmap() and other callers */ - return 0; - } - - /* - * User mapped kernel pages or huge pages: - * make these pages present to populate the ptes, but - * fall thru' to reset VM_LOCKED--no need to unlock, and - * return nr_pages so these don't get counted against task's - * locked limit. huge pages are already counted against - * locked vm limit. - */ - make_pages_present(start, end); - -no_mlock: - vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */ - return nr_pages; /* error or pages NOT mlocked */ -} - /* * munlock_vma_pages_range() - munlock all pages in the vma range.' * @vma - vma containing range to be munlock()ed. @@ -303,7 +252,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * * Filters out "special" vmas -- VM_LOCKED never gets set for these, and * munlock is a no-op. However, for some special vmas, we go ahead and - * populate the ptes via make_pages_present(). + * populate the ptes. * * For vmas that pass the filters, merge/split as appropriate. */ diff --git a/mm/mmap.c b/mm/mmap.c index d6bc39e939ab..8826c77513a9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2204,9 +2204,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) return vma; if (!prev || expand_stack(prev, addr)) return NULL; - if (prev->vm_flags & VM_LOCKED) { - mlock_vma_pages_range(prev, addr, prev->vm_end); - } + if (prev->vm_flags & VM_LOCKED) + __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); return prev; } #else @@ -2232,9 +2231,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; - if (vma->vm_flags & VM_LOCKED) { - mlock_vma_pages_range(vma, addr, start); - } + if (vma->vm_flags & VM_LOCKED) + __mlock_vma_pages_range(vma, addr, start, NULL); return vma; } #endif -- cgit v1.2.3 From 1869305009857cdeaabe6283bcdc2359c5784543 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Fri, 22 Feb 2013 16:32:46 -0800 Subject: mm: introduce VM_POPULATE flag to better deal with racy userspace programs The vm_populate() code populates user mappings without constantly holding the mmap_sem. This makes it susceptible to racy userspace programs: the user mappings may change while vm_populate() is running, and in this case vm_populate() may end up populating the new mapping instead of the old one. In order to reduce the possibility of userspace getting surprised by this behavior, this change introduces the VM_POPULATE vma flag which gets set on vmas we want vm_populate() to work on. This way vm_populate() may still end up populating the new mapping after such a race, but only if the new mapping is also one that the user has requested (using MAP_SHARED, MAP_LOCKED or mlock) to be populated. Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Tested-by: Andy Lutomirski Cc: Greg Ungerer Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + include/linux/mman.h | 4 +++- mm/fremap.c | 12 ++++++++++-- mm/mlock.c | 19 ++++++++++--------- mm/mmap.c | 4 +--- 5 files changed, 25 insertions(+), 15 deletions(-) (limited to 'mm/mlock.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0c34d3486816..9a5fcdeaa3a0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -87,6 +87,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ +#define VM_POPULATE 0x00001000 #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ diff --git a/include/linux/mman.h b/include/linux/mman.h index 9aa863da287f..61c7a87e5d2b 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -79,6 +79,8 @@ calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | - _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); + ((flags & MAP_LOCKED) ? (VM_LOCKED | VM_POPULATE) : 0) | + (((flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE) ? + VM_POPULATE : 0); } #endif /* _LINUX_MMAN_H */ diff --git a/mm/fremap.c b/mm/fremap.c index 503a72387087..0cd4c11488ed 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -204,8 +204,10 @@ get_write_lock: unsigned long addr; struct file *file = get_file(vma->vm_file); - addr = mmap_region(file, start, size, - vma->vm_flags, pgoff); + vm_flags = vma->vm_flags; + if (!(flags & MAP_NONBLOCK)) + vm_flags |= VM_POPULATE; + addr = mmap_region(file, start, size, vm_flags, pgoff); fput(file); if (IS_ERR_VALUE(addr)) { err = addr; @@ -224,6 +226,12 @@ get_write_lock: mutex_unlock(&mapping->i_mmap_mutex); } + if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) { + if (!has_write_lock) + goto get_write_lock; + vma->vm_flags |= VM_POPULATE; + } + if (vma->vm_flags & VM_LOCKED) { /* * drop PG_Mlocked flag for over-mapped range diff --git a/mm/mlock.c b/mm/mlock.c index 569400a5d079..d6378feb2950 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -340,9 +340,9 @@ static int do_mlock(unsigned long start, size_t len, int on) /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - newflags = vma->vm_flags | VM_LOCKED; - if (!on) - newflags &= ~VM_LOCKED; + newflags = vma->vm_flags & ~VM_LOCKED; + if (on) + newflags |= VM_LOCKED | VM_POPULATE; tmp = vma->vm_end; if (tmp > end) @@ -402,7 +402,8 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) * range with the first VMA. Also, skip undesirable VMA types. */ nend = min(end, vma->vm_end); - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) != + VM_POPULATE) continue; if (nstart < vma->vm_start) nstart = vma->vm_start; @@ -475,18 +476,18 @@ static int do_mlockall(int flags) struct vm_area_struct * vma, * prev = NULL; if (flags & MCL_FUTURE) - current->mm->def_flags |= VM_LOCKED; + current->mm->def_flags |= VM_LOCKED | VM_POPULATE; else - current->mm->def_flags &= ~VM_LOCKED; + current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE); if (flags == MCL_FUTURE) goto out; for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { vm_flags_t newflags; - newflags = vma->vm_flags | VM_LOCKED; - if (!(flags & MCL_CURRENT)) - newflags &= ~VM_LOCKED; + newflags = vma->vm_flags & ~VM_LOCKED; + if (flags & MCL_CURRENT) + newflags |= VM_LOCKED | VM_POPULATE; /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); diff --git a/mm/mmap.c b/mm/mmap.c index 8826c77513a9..39a3944e1658 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1306,9 +1306,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } addr = mmap_region(file, addr, len, vm_flags, pgoff); - if (!IS_ERR_VALUE(addr) && - ((vm_flags & VM_LOCKED) || - (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) + if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE)) *populate = true; return addr; } -- cgit v1.2.3 From 4805b02e90187c68d8f4e3305c3482b797e35809 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 22 Feb 2013 16:35:20 -0800 Subject: mm/mlock.c: document scary-looking stack expansion mlock chain The fact that mlock calls get_user_pages, and get_user_pages might call mlock when expanding a stack looks like a potential recursion. However, mlock makes sure the requested range is already contained within a vma, so no stack expansion will actually happen from mlock. Should this ever change: the stack expansion mlocks only the newly expanded range and so will not result in recursive expansion. Signed-off-by: Johannes Weiner Reported-by: Al Viro Cc: Hugh Dickins Acked-by: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm/mlock.c') diff --git a/mm/mlock.c b/mm/mlock.c index d6378feb2950..38db3b094105 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -185,6 +185,10 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) gup_flags |= FOLL_FORCE; + /* + * We made sure addr is within a VMA, so the following will + * not result in a stack expansion that recurses back here. + */ return __get_user_pages(current, mm, addr, nr_pages, gup_flags, NULL, NULL, nonblocking); } -- cgit v1.2.3 From 28a35716d317980ae9bc2ff2f84c33a3cda9e884 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Fri, 22 Feb 2013 16:35:55 -0800 Subject: mm: use long type for page counts in mm_populate() and get_user_pages() Use long type for page counts in mm_populate() so as to avoid integer overflow when running the following test code: int main(void) { void *p = mmap(NULL, 0x100000000000, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0); printf("p: %p\n", p); mlockall(MCL_CURRENT); printf("done\n"); return 0; } Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 +++--- include/linux/mm.h | 15 ++++++++------- mm/hugetlb.c | 12 ++++++------ mm/memory.c | 18 +++++++++--------- mm/mlock.c | 4 ++-- mm/nommu.c | 15 ++++++++------- 6 files changed, 36 insertions(+), 34 deletions(-) (limited to 'mm/mlock.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0c80d3f57a5b..eedc334fb6f5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -43,9 +43,9 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, #endif int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); -int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, - struct page **, struct vm_area_struct **, - unsigned long *, int *, int, unsigned int flags); +long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, + struct page **, struct vm_area_struct **, + unsigned long *, unsigned long *, long, unsigned int); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *); void __unmap_hugepage_range_final(struct mmu_gather *tlb, diff --git a/include/linux/mm.h b/include/linux/mm.h index 97da0302cf51..87b0ef253607 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1013,13 +1013,14 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void * extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, int write); -int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas, - int *nonblocking); -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, int write, int force, - struct page **pages, struct vm_area_struct **vmas); +long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int foll_flags, struct page **pages, + struct vm_area_struct **vmas, int *nonblocking); +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas); int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); struct kvec; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e14a8c79a1eb..cdb64e4d238a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2920,14 +2920,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, return NULL; } -int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *position, int *length, int i, - unsigned int flags) +long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, + struct page **pages, struct vm_area_struct **vmas, + unsigned long *position, unsigned long *nr_pages, + long i, unsigned int flags) { unsigned long pfn_offset; unsigned long vaddr = *position; - int remainder = *length; + unsigned long remainder = *nr_pages; struct hstate *h = hstate_vma(vma); spin_lock(&mm->page_table_lock); @@ -2997,7 +2997,7 @@ same_page: } } spin_unlock(&mm->page_table_lock); - *length = remainder; + *nr_pages = remainder; *position = vaddr; return i ? i : -EFAULT; diff --git a/mm/memory.c b/mm/memory.c index 7bd22a621817..bc929dbad215 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1677,15 +1677,15 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. */ -int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, unsigned int gup_flags, - struct page **pages, struct vm_area_struct **vmas, - int *nonblocking) +long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *nonblocking) { - int i; + long i; unsigned long vm_flags; - if (nr_pages <= 0) + if (!nr_pages) return 0; VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); @@ -1981,9 +1981,9 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, * * See also get_user_pages_fast, for performance critical applications. */ -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, int write, int force, - struct page **pages, struct vm_area_struct **vmas) +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, int write, + int force, struct page **pages, struct vm_area_struct **vmas) { int flags = FOLL_TOUCH; diff --git a/mm/mlock.c b/mm/mlock.c index 38db3b094105..e6638f565d42 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -160,7 +160,7 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; - int nr_pages = (end - start) / PAGE_SIZE; + unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; VM_BUG_ON(start & ~PAGE_MASK); @@ -382,7 +382,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) unsigned long end, nstart, nend; struct vm_area_struct *vma = NULL; int locked = 0; - int ret = 0; + long ret = 0; VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(len != PAGE_ALIGN(len)); diff --git a/mm/nommu.c b/mm/nommu.c index 87854a55829d..6ab706608492 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -140,10 +140,10 @@ unsigned int kobjsize(const void *objp) return PAGE_SIZE << compound_order(page); } -int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas, - int *retry) +long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int foll_flags, struct page **pages, + struct vm_area_struct **vmas, int *nonblocking) { struct vm_area_struct *vma; unsigned long vm_flags; @@ -190,9 +190,10 @@ finish_or_fault: * slab page or a secondary page from a compound page * - don't permit access to VMAs that don't support it, such as I/O mappings */ -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, int write, int force, - struct page **pages, struct vm_area_struct **vmas) +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) { int flags = 0; -- cgit v1.2.3 From ff6a6da60b894d008f704fbeb5bc596f9994b16e Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Wed, 27 Feb 2013 17:02:44 -0800 Subject: mm: accelerate munlock() treatment of THP pages munlock_vma_pages_range() was always incrementing addresses by PAGE_SIZE at a time. When munlocking THP pages (or the huge zero page), this resulted in taking the mm->page_table_lock 512 times in a row. We can do better by making use of the page_mask returned by follow_page_mask (for the huge zero page case), or the size of the page munlock_vma_page() operated on (for the true THP page case). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 2 +- mm/mlock.c | 34 +++++++++++++++++++++++----------- 2 files changed, 24 insertions(+), 12 deletions(-) (limited to 'mm/mlock.c') diff --git a/mm/internal.h b/mm/internal.h index 1c0c4cc0fcf7..8562de0a5197 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -195,7 +195,7 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma, * must be called with vma's mmap_sem held for read or write, and page locked. */ extern void mlock_vma_page(struct page *page); -extern void munlock_vma_page(struct page *page); +extern unsigned int munlock_vma_page(struct page *page); /* * Clear the page's PageMlocked(). This can be useful in a situation where diff --git a/mm/mlock.c b/mm/mlock.c index e6638f565d42..1c5e33fce639 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -102,13 +102,16 @@ void mlock_vma_page(struct page *page) * can't isolate the page, we leave it for putback_lru_page() and vmscan * [page_referenced()/try_to_unmap()] to deal with. */ -void munlock_vma_page(struct page *page) +unsigned int munlock_vma_page(struct page *page) { + unsigned int page_mask = 0; + BUG_ON(!PageLocked(page)); if (TestClearPageMlocked(page)) { - mod_zone_page_state(page_zone(page), NR_MLOCK, - -hpage_nr_pages(page)); + unsigned int nr_pages = hpage_nr_pages(page); + mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + page_mask = nr_pages - 1; if (!isolate_lru_page(page)) { int ret = SWAP_AGAIN; @@ -141,6 +144,8 @@ void munlock_vma_page(struct page *page) count_vm_event(UNEVICTABLE_PGMUNLOCKED); } } + + return page_mask; } /** @@ -159,7 +164,6 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *nonblocking) { struct mm_struct *mm = vma->vm_mm; - unsigned long addr = start; unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; @@ -189,7 +193,7 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma, * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. */ - return __get_user_pages(current, mm, addr, nr_pages, gup_flags, + return __get_user_pages(current, mm, start, nr_pages, gup_flags, NULL, NULL, nonblocking); } @@ -226,13 +230,12 @@ static int __mlock_posix_error_return(long retval) void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - unsigned long addr; - - lru_add_drain(); vma->vm_flags &= ~VM_LOCKED; - for (addr = start; addr < end; addr += PAGE_SIZE) { + while (start < end) { struct page *page; + unsigned int page_mask, page_increm; + /* * Although FOLL_DUMP is intended for get_dump_page(), * it just so happens that its special treatment of the @@ -240,13 +243,22 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * suits munlock very well (and if somehow an abnormal page * has sneaked into the range, we won't oops here: great). */ - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); + page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, + &page_mask); if (page && !IS_ERR(page)) { lock_page(page); - munlock_vma_page(page); + lru_add_drain(); + /* + * Any THP page found by follow_page_mask() may have + * gotten split before reaching munlock_vma_page(), + * so we need to recompute the page_mask here. + */ + page_mask = munlock_vma_page(page); unlock_page(page); put_page(page); } + page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); + start += page_increm * PAGE_SIZE; cond_resched(); } } -- cgit v1.2.3