From b9d7e6ae82da124dc9c579fe1061264ef2a69407 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 28 Oct 2006 10:38:41 -0700 Subject: [PATCH] hugetlb: fix size=4G parsing On 32-bit machines, mount -t hugetlbfs -o size=4G gave a 0GB filesystem, size=5G gave a 1GB filesystem etc: there's no point in masking size with HPAGE_MASK just before shifting its lower bits away, and since HPAGE_MASK is a UL, that removed all the higher bits of the unsigned long long size. Signed-off-by: Hugh Dickins Cc: Adam Litke Cc: David Gibson Cc: "Chen, Kenneth W" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/hugetlbfs/inode.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 4ee3f006b861..0b23b963bb44 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -624,7 +624,6 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) do_div(size, 100); rest++; } - size &= HPAGE_MASK; pconfig->nr_blocks = (size >> HPAGE_SHIFT); value = rest; } else if (!strcmp(opt,"nr_inodes")) { -- cgit v1.2.3 From 856fc29505556cf263f3dcda2533cf3766c14ab6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 28 Oct 2006 10:38:43 -0700 Subject: [PATCH] hugetlb: fix prio_tree unit hugetlb_vmtruncate_list was misconverted to prio_tree: its prio_tree is in units of PAGE_SIZE (PAGE_CACHE_SIZE) like any other, not HPAGE_SIZE (whereas its radix_tree is kept in units of HPAGE_SIZE, otherwise slots would be absurdly sparse). At first I thought the error benign, just calling __unmap_hugepage_range on more vmas than necessary; but on 32-bit machines, when the prio_tree is searched correctly, it happens to ensure the v_offset calculation won't overflow. As it stood, when truncating at or beyond 4GB, it was liable to discard pages COWed from lower offsets; or even to clear pmd entries of preceding vmas, triggering exit_mmap's BUG_ON(nr_ptes). Signed-off-by: Hugh Dickins Cc: Adam Litke Cc: David Gibson Cc: "Chen, Kenneth W" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'fs/hugetlbfs/inode.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0b23b963bb44..0bea6a619e10 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -271,26 +271,24 @@ static void hugetlbfs_drop_inode(struct inode *inode) hugetlbfs_forget_inode(inode); } -/* - * h_pgoff is in HPAGE_SIZE units. - * vma->vm_pgoff is in PAGE_SIZE units. - */ static inline void -hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff) +hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) { struct vm_area_struct *vma; struct prio_tree_iter iter; - vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) { - unsigned long h_vm_pgoff; + vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) { unsigned long v_offset; - h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT); - v_offset = (h_pgoff - h_vm_pgoff) << HPAGE_SHIFT; /* - * Is this VMA fully outside the truncation point? + * Can the expression below overflow on 32-bit arches? + * No, because the prio_tree returns us only those vmas + * which overlap the truncated area starting at pgoff, + * and no vma on a 32-bit arch can span beyond the 4GB. */ - if (h_vm_pgoff >= h_pgoff) + if (vma->vm_pgoff < pgoff) + v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT; + else v_offset = 0; __unmap_hugepage_range(vma, @@ -303,14 +301,14 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff) */ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) { - unsigned long pgoff; + pgoff_t pgoff; struct address_space *mapping = inode->i_mapping; if (offset > inode->i_size) return -EINVAL; BUG_ON(offset & ~HPAGE_MASK); - pgoff = offset >> HPAGE_SHIFT; + pgoff = offset >> PAGE_SHIFT; inode->i_size = offset; spin_lock(&mapping->i_mmap_lock); -- cgit v1.2.3 From 68589bc353037f233fe510ad9ff432338c95db66 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 14 Nov 2006 02:03:32 -0800 Subject: [PATCH] hugetlb: prepare_hugepage_range check offset too (David:) If hugetlbfs_file_mmap() returns a failure to do_mmap_pgoff() - for example, because the given file offset is not hugepage aligned - then do_mmap_pgoff will go to the unmap_and_free_vma backout path. But at this stage the vma hasn't been marked as hugepage, and the backout path will call unmap_region() on it. That will eventually call down to the non-hugepage version of unmap_page_range(). On ppc64, at least, that will cause serious problems if there are any existing hugepage pagetable entries in the vicinity - for example if there are any other hugepage mappings under the same PUD. unmap_page_range() will trigger a bad_pud() on the hugepage pud entries. I suspect this will also cause bad problems on ia64, though I don't have a machine to test it on. (Hugh:) prepare_hugepage_range() should check file offset alignment when it checks virtual address and length, to stop MAP_FIXED with a bad huge offset from unmapping before it fails further down. PowerPC should apply the same prepare_hugepage_range alignment checks as ia64 and all the others do. Then none of the alignment checks in hugetlbfs_file_mmap are required (nor is the check for too small a mapping); but even so, move up setting of VM_HUGETLB and add a comment to warn of what David Gibson discovered - if hugetlbfs_file_mmap fails before setting it, do_mmap_pgoff's unmap_region when unwinding from error will go the non-huge way, which may cause bad behaviour on architectures (powerpc and ia64) which segregate their huge mappings into a separate region of the address space. Signed-off-by: Hugh Dickins Cc: "Luck, Tony" Cc: "David S. Miller" Acked-by: Adam Litke Acked-by: David Gibson Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/hugetlbpage.c | 4 +++- arch/powerpc/mm/hugetlbpage.c | 8 ++++++-- fs/hugetlbfs/inode.c | 21 ++++++++------------- include/linux/hugetlb.h | 10 +++++++--- mm/mmap.c | 2 +- 5 files changed, 25 insertions(+), 20 deletions(-) (limited to 'fs/hugetlbfs/inode.c') diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index eee5c1cfbe32..f3a9585e98a8 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -70,8 +70,10 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr) * Don't actually need to do any preparation, but need to make sure * the address is in the right region. */ -int prepare_hugepage_range(unsigned long addr, unsigned long len) +int prepare_hugepage_range(unsigned long addr, unsigned long len, pgoff_t pgoff) { + if (pgoff & (~HPAGE_MASK >> PAGE_SHIFT)) + return -EINVAL; if (len & ~HPAGE_MASK) return -EINVAL; if (addr & ~HPAGE_MASK) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index fd68b74c07c3..506d89768d45 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -491,11 +491,15 @@ static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas) return 0; } -int prepare_hugepage_range(unsigned long addr, unsigned long len) +int prepare_hugepage_range(unsigned long addr, unsigned long len, pgoff_t pgoff) { int err = 0; - if ( (addr+len) < addr ) + if (pgoff & (~HPAGE_MASK >> PAGE_SHIFT)) + return -EINVAL; + if (len & ~HPAGE_MASK) + return -EINVAL; + if (addr & ~HPAGE_MASK) return -EINVAL; if (addr < 0x100000000UL) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0bea6a619e10..7f4756963d05 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -62,24 +62,19 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) loff_t len, vma_len; int ret; - if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1)) - return -EINVAL; - - if (vma->vm_start & ~HPAGE_MASK) - return -EINVAL; - - if (vma->vm_end & ~HPAGE_MASK) - return -EINVAL; - - if (vma->vm_end - vma->vm_start < HPAGE_SIZE) - return -EINVAL; + /* + * vma alignment has already been checked by prepare_hugepage_range. + * If you add any error returns here, do so after setting VM_HUGETLB, + * so is_vm_hugetlb_page tests below unmap_region go the right way + * when do_mmap_pgoff unwinds (may be important on powerpc and ia64). + */ + vma->vm_flags |= VM_HUGETLB | VM_RESERVED; + vma->vm_ops = &hugetlb_vm_ops; vma_len = (loff_t)(vma->vm_end - vma->vm_start); mutex_lock(&inode->i_mutex); file_accessed(file); - vma->vm_flags |= VM_HUGETLB | VM_RESERVED; - vma->vm_ops = &hugetlb_vm_ops; ret = -ENOMEM; len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5081d27bfa27..ace64e57e17f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -60,8 +60,11 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr, * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. */ -static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) +static inline int prepare_hugepage_range(unsigned long addr, unsigned long len, + pgoff_t pgoff) { + if (pgoff & (~HPAGE_MASK >> PAGE_SHIFT)) + return -EINVAL; if (len & ~HPAGE_MASK) return -EINVAL; if (addr & ~HPAGE_MASK) @@ -69,7 +72,8 @@ static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) return 0; } #else -int prepare_hugepage_range(unsigned long addr, unsigned long len); +int prepare_hugepage_range(unsigned long addr, unsigned long len, + pgoff_t pgoff); #endif #ifndef ARCH_HAS_SETCLEAR_HUGE_PTE @@ -107,7 +111,7 @@ static inline unsigned long hugetlb_total_pages(void) #define hugetlb_report_meminfo(buf) 0 #define hugetlb_report_node_meminfo(n, buf) 0 #define follow_huge_pmd(mm, addr, pmd, write) NULL -#define prepare_hugepage_range(addr, len) (-EINVAL) +#define prepare_hugepage_range(addr,len,pgoff) (-EINVAL) #define pmd_huge(x) 0 #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) diff --git a/mm/mmap.c b/mm/mmap.c index 497e502dfd6b..bdace87d7c01 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1379,7 +1379,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, * Check if the given range is hugepage aligned, and * can be made suitable for hugepages. */ - ret = prepare_hugepage_range(addr, len); + ret = prepare_hugepage_range(addr, len, pgoff); } else { /* * Ensure that a normal request is not falling in a -- cgit v1.2.3