From 95042f9eb78a8d9a17455e2ef263f2f310ecef15 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 12 Apr 2011 14:15:51 -0700 Subject: vm: fix mlock() on stack guard page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 53a7706d5ed8 ("mlock: do not hold mmap_sem for extended periods of time") changed mlock() to care about the exact number of pages that __get_user_pages() had brought it. Before, it would only care about errors. And that doesn't work, because we also handled one page specially in __mlock_vma_pages_range(), namely the stack guard page. So when that case was handled, the number of pages that the function returned was off by one. In particular, it could be zero, and then the caller would end up not making any progress at all. Rather than try to fix up that off-by-one error for the mlock case specially, this just moves the logic to handle the stack guard page into__get_user_pages() itself, thus making all the counts come out right automatically. Reported-by: Robert Święcki Cc: Hugh Dickins Cc: Oleg Nesterov Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/mlock.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'mm/mlock.c') diff --git a/mm/mlock.c b/mm/mlock.c index 2689a08c79af..6b55e3efe0df 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -135,13 +135,6 @@ void munlock_vma_page(struct page *page) } } -static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) -{ - return (vma->vm_flags & VM_GROWSDOWN) && - (vma->vm_start == addr) && - !vma_stack_continue(vma->vm_prev, addr); -} - /** * __mlock_vma_pages_range() - mlock a range of pages in the vma. * @vma: target vma @@ -188,12 +181,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED) gup_flags |= FOLL_MLOCK; - /* We don't try to access the guard page of a stack vma */ - if (stack_guard_page(vma, start)) { - addr += PAGE_SIZE; - nr_pages--; - } - return __get_user_pages(current, mm, addr, nr_pages, gup_flags, NULL, NULL, nonblocking); } -- cgit v1.2.3 From a1fde08c74e90accd62d4cfdbf580d2ede938fe7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 4 May 2011 21:30:28 -0700 Subject: VM: skip the stack guard page lookup in get_user_pages only for mlock The logic in __get_user_pages() used to skip the stack guard page lookup whenever the caller wasn't interested in seeing what the actual page was. But Michel Lespinasse points out that there are cases where we don't care about the physical page itself (so 'pages' may be NULL), but do want to make sure a page is mapped into the virtual address space. So using the existence of the "pages" array as an indication of whether to look up the guard page or not isn't actually so great, and we really should just use the FOLL_MLOCK bit. But because that bit was only set for the VM_LOCKED case (and not all vma's necessarily have it, even for mlock()), we couldn't do that originally. Fix that by moving the VM_LOCKED check deeper into the call-chain, which actually simplifies many things. Now mlock() gets simpler, and we can also check for FOLL_MLOCK in __get_user_pages() and the code ends up much more straightforward. Reported-and-reviewed-by: Michel Lespinasse Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/memory.c | 7 +++---- mm/mlock.c | 5 +---- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'mm/mlock.c') diff --git a/mm/memory.c b/mm/memory.c index 607098d47e74..27f425378112 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1359,7 +1359,7 @@ split_fallthrough: */ mark_page_accessed(page); } - if (flags & FOLL_MLOCK) { + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE @@ -1552,10 +1552,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } /* - * If we don't actually want the page itself, - * and it's the stack guard page, just skip it. + * For mlock, just skip the stack guard page. */ - if (!pages && stack_guard_page(vma, start)) + if ((gup_flags & FOLL_MLOCK) && stack_guard_page(vma, start)) goto next_page; do { diff --git a/mm/mlock.c b/mm/mlock.c index 6b55e3efe0df..516b2c2ddd5a 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -162,7 +162,7 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(end > vma->vm_end); VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - gup_flags = FOLL_TOUCH; + gup_flags = FOLL_TOUCH | FOLL_MLOCK; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW @@ -178,9 +178,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) gup_flags |= FOLL_FORCE; - if (vma->vm_flags & VM_LOCKED) - gup_flags |= FOLL_MLOCK; - return __get_user_pages(current, mm, addr, nr_pages, gup_flags, NULL, NULL, nonblocking); } -- cgit v1.2.3 From ca16d140af91febe25daeb9e032bf8bd46b8c31f Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 26 May 2011 19:16:19 +0900 Subject: mm: don't access vm_flags as 'int' The type of vma->vm_flags is 'unsigned long'. Neither 'int' nor 'unsigned int'. This patch fixes such misuse. Signed-off-by: KOSAKI Motohiro [ Changed to use a typedef - we'll extend it to cover more cases later, since there has been discussion about making it a 64-bit type.. - Linus ] Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 3 ++- fs/proc/task_mmu.c | 2 +- include/linux/hugetlb.h | 6 +++--- include/linux/hugetlb_inline.h | 2 +- include/linux/mm.h | 6 +++--- include/linux/mm_types.h | 4 +++- ipc/shm.c | 2 +- mm/fremap.c | 2 +- mm/hugetlb.c | 4 ++-- mm/memory.c | 2 +- mm/mlock.c | 8 ++++---- mm/mmap.c | 8 ++++---- 12 files changed, 26 insertions(+), 23 deletions(-) (limited to 'mm/mlock.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index e7a035781b7d..7aafeb8fa300 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -921,7 +921,8 @@ static int can_do_hugetlb_shm(void) return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); } -struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, +struct file *hugetlb_file_setup(const char *name, size_t size, + vm_flags_t acctflag, struct user_struct **user, int creat_flags) { int error = -ENOMEM; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2c9db29ea358..db15935fa757 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -211,7 +211,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; - int flags = vma->vm_flags; + vm_flags_t flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; unsigned long start, end; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 943c76b3d4bb..cf8931e1dd9d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -41,7 +41,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, - int acctflags); + vm_flags_t vm_flags); void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); int dequeue_hwpoisoned_huge_page(struct page *page); void copy_huge_page(struct page *dst, struct page *src); @@ -168,7 +168,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) extern const struct file_operations hugetlbfs_file_operations; extern const struct vm_operations_struct hugetlb_vm_ops; -struct file *hugetlb_file_setup(const char *name, size_t size, int acct, +struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, struct user_struct **user, int creat_flags); int hugetlb_get_quota(struct address_space *mapping, long delta); void hugetlb_put_quota(struct address_space *mapping, long delta); @@ -192,7 +192,7 @@ static inline void set_file_hugepages(struct file *file) #define is_file_hugepages(file) 0 #define set_file_hugepages(file) BUG() static inline struct file *hugetlb_file_setup(const char *name, size_t size, - int acctflag, struct user_struct **user, int creat_flags) + vm_flags_t acctflag, struct user_struct **user, int creat_flags) { return ERR_PTR(-ENOSYS); } diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 6931489a5c14..2bb681fbeb35 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -7,7 +7,7 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) { - return vma->vm_flags & VM_HUGETLB; + return !!(vma->vm_flags & VM_HUGETLB); } #else diff --git a/include/linux/mm.h b/include/linux/mm.h index 8eb969ebf904..fb8e814f78dc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -165,12 +165,12 @@ extern pgprot_t protection_map[16]; */ static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) { - return (vma->vm_flags & VM_PFN_AT_MMAP); + return !!(vma->vm_flags & VM_PFN_AT_MMAP); } static inline int is_pfn_mapping(struct vm_area_struct *vma) { - return (vma->vm_flags & VM_PFNMAP); + return !!(vma->vm_flags & VM_PFNMAP); } /* @@ -1432,7 +1432,7 @@ extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long flag, unsigned long pgoff); extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, - unsigned int vm_flags, unsigned long pgoff); + vm_flags_t vm_flags, unsigned long pgoff); static inline unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 071d459e866b..6fe96c19f85e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -102,6 +102,8 @@ struct page { #endif }; +typedef unsigned long __nocast vm_flags_t; + /* * A region containing a mapping of a non-memory backed file under NOMMU * conditions. These are held in a global tree and are pinned by the VMAs that @@ -109,7 +111,7 @@ struct page { */ struct vm_region { struct rb_node vm_rb; /* link in global region tree */ - unsigned long vm_flags; /* VMA vm_flags */ + vm_flags_t vm_flags; /* VMA vm_flags */ unsigned long vm_start; /* start address of region */ unsigned long vm_end; /* region initialised to here */ unsigned long vm_top; /* region allocated to here */ diff --git a/ipc/shm.c b/ipc/shm.c index 729acb7e3148..ab3385a21b27 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -347,7 +347,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) struct file * file; char name[13]; int id; - int acctflag = 0; + vm_flags_t acctflag = 0; if (size < SHMMIN || size > ns->shm_ctlmax) return -EINVAL; diff --git a/mm/fremap.c b/mm/fremap.c index 7f4123056e06..b8e0e2d468af 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -224,7 +224,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, /* * drop PG_Mlocked flag for over-mapped range */ - unsigned int saved_flags = vma->vm_flags; + vm_flags_t saved_flags = vma->vm_flags; munlock_vma_pages_range(vma, start, start + size); vma->vm_flags = saved_flags; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5fd68b95c671..f33bb319b73f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2833,7 +2833,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, - int acctflag) + vm_flags_t vm_flags) { long ret, chg; struct hstate *h = hstate_inode(inode); @@ -2843,7 +2843,7 @@ int hugetlb_reserve_pages(struct inode *inode, * attempt will be made for VM_NORESERVE to allocate a page * and filesystem quota without using reserves */ - if (acctflag & VM_NORESERVE) + if (vm_flags & VM_NORESERVE) return 0; /* diff --git a/mm/memory.c b/mm/memory.c index b73f677f0bb1..fc24f7d788bd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -730,7 +730,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, add_taint(TAINT_BAD_PAGE); } -static inline int is_cow_mapping(unsigned int flags) +static inline int is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } diff --git a/mm/mlock.c b/mm/mlock.c index 516b2c2ddd5a..048260c4e02e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -307,13 +307,13 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * For vmas that pass the filters, merge/split as appropriate. */ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, unsigned int newflags) + unsigned long start, unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; pgoff_t pgoff; int nr_pages; int ret = 0; - int lock = newflags & VM_LOCKED; + int lock = !!(newflags & VM_LOCKED); if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) @@ -385,7 +385,7 @@ static int do_mlock(unsigned long start, size_t len, int on) prev = vma; for (nstart = start ; ; ) { - unsigned int newflags; + vm_flags_t newflags; /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ @@ -524,7 +524,7 @@ static int do_mlockall(int flags) goto out; for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { - unsigned int newflags; + vm_flags_t newflags; newflags = vma->vm_flags | VM_LOCKED; if (!(flags & MCL_CURRENT)) diff --git a/mm/mmap.c b/mm/mmap.c index ac2631b7477f..bbdc9af5e117 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -960,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, { struct mm_struct * mm = current->mm; struct inode *inode; - unsigned int vm_flags; + vm_flags_t vm_flags; int error; unsigned long reqprot = prot; @@ -1165,7 +1165,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) */ int vma_wants_writenotify(struct vm_area_struct *vma) { - unsigned int vm_flags = vma->vm_flags; + vm_flags_t vm_flags = vma->vm_flags; /* If it was private or non-writable, the write bit is already clear */ if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) @@ -1193,7 +1193,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma) * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. */ -static inline int accountable_mapping(struct file *file, unsigned int vm_flags) +static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) { /* * hugetlb has its own accounting separate from the core VM @@ -1207,7 +1207,7 @@ static inline int accountable_mapping(struct file *file, unsigned int vm_flags) unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, - unsigned int vm_flags, unsigned long pgoff) + vm_flags_t vm_flags, unsigned long pgoff) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; -- cgit v1.2.3