From b9d7e6ae82da124dc9c579fe1061264ef2a69407 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 28 Oct 2006 10:38:41 -0700
Subject: [PATCH] hugetlb: fix size=4G parsing

On 32-bit machines, mount -t hugetlbfs -o size=4G gave a 0GB filesystem,
size=5G gave a 1GB filesystem etc: there's no point in masking size with
HPAGE_MASK just before shifting its lower bits away, and since HPAGE_MASK is a
UL, that removed all the higher bits of the unsigned long long size.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Adam Litke <agl@us.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/hugetlbfs/inode.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/hugetlbfs/inode.c')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 4ee3f006b861..0b23b963bb44 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -624,7 +624,6 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 				do_div(size, 100);
 				rest++;
 			}
-			size &= HPAGE_MASK;
 			pconfig->nr_blocks = (size >> HPAGE_SHIFT);
 			value = rest;
 		} else if (!strcmp(opt,"nr_inodes")) {
-- 
cgit v1.2.3


From 856fc29505556cf263f3dcda2533cf3766c14ab6 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 28 Oct 2006 10:38:43 -0700
Subject: [PATCH] hugetlb: fix prio_tree unit

hugetlb_vmtruncate_list was misconverted to prio_tree: its prio_tree is in
units of PAGE_SIZE (PAGE_CACHE_SIZE) like any other, not HPAGE_SIZE (whereas
its radix_tree is kept in units of HPAGE_SIZE, otherwise slots would be
absurdly sparse).

At first I thought the error benign, just calling __unmap_hugepage_range on
more vmas than necessary; but on 32-bit machines, when the prio_tree is
searched correctly, it happens to ensure the v_offset calculation won't
overflow.  As it stood, when truncating at or beyond 4GB, it was liable to
discard pages COWed from lower offsets; or even to clear pmd entries of
preceding vmas, triggering exit_mmap's BUG_ON(nr_ptes).

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Adam Litke <agl@us.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/hugetlbfs/inode.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

(limited to 'fs/hugetlbfs/inode.c')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 0b23b963bb44..0bea6a619e10 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -271,26 +271,24 @@ static void hugetlbfs_drop_inode(struct inode *inode)
 		hugetlbfs_forget_inode(inode);
 }
 
-/*
- * h_pgoff is in HPAGE_SIZE units.
- * vma->vm_pgoff is in PAGE_SIZE units.
- */
 static inline void
-hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
+hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff)
 {
 	struct vm_area_struct *vma;
 	struct prio_tree_iter iter;
 
-	vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) {
-		unsigned long h_vm_pgoff;
+	vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) {
 		unsigned long v_offset;
 
-		h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
-		v_offset = (h_pgoff - h_vm_pgoff) << HPAGE_SHIFT;
 		/*
-		 * Is this VMA fully outside the truncation point?
+		 * Can the expression below overflow on 32-bit arches?
+		 * No, because the prio_tree returns us only those vmas
+		 * which overlap the truncated area starting at pgoff,
+		 * and no vma on a 32-bit arch can span beyond the 4GB.
 		 */
-		if (h_vm_pgoff >= h_pgoff)
+		if (vma->vm_pgoff < pgoff)
+			v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
+		else
 			v_offset = 0;
 
 		__unmap_hugepage_range(vma,
@@ -303,14 +301,14 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
  */
 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 {
-	unsigned long pgoff;
+	pgoff_t pgoff;
 	struct address_space *mapping = inode->i_mapping;
 
 	if (offset > inode->i_size)
 		return -EINVAL;
 
 	BUG_ON(offset & ~HPAGE_MASK);
-	pgoff = offset >> HPAGE_SHIFT;
+	pgoff = offset >> PAGE_SHIFT;
 
 	inode->i_size = offset;
 	spin_lock(&mapping->i_mmap_lock);
-- 
cgit v1.2.3


From 68589bc353037f233fe510ad9ff432338c95db66 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Tue, 14 Nov 2006 02:03:32 -0800
Subject: [PATCH] hugetlb: prepare_hugepage_range check offset too

(David:)

If hugetlbfs_file_mmap() returns a failure to do_mmap_pgoff() - for example,
because the given file offset is not hugepage aligned - then do_mmap_pgoff
will go to the unmap_and_free_vma backout path.

But at this stage the vma hasn't been marked as hugepage, and the backout path
will call unmap_region() on it.  That will eventually call down to the
non-hugepage version of unmap_page_range().  On ppc64, at least, that will
cause serious problems if there are any existing hugepage pagetable entries in
the vicinity - for example if there are any other hugepage mappings under the
same PUD.  unmap_page_range() will trigger a bad_pud() on the hugepage pud
entries.  I suspect this will also cause bad problems on ia64, though I don't
have a machine to test it on.

(Hugh:)

prepare_hugepage_range() should check file offset alignment when it checks
virtual address and length, to stop MAP_FIXED with a bad huge offset from
unmapping before it fails further down.  PowerPC should apply the same
prepare_hugepage_range alignment checks as ia64 and all the others do.

Then none of the alignment checks in hugetlbfs_file_mmap are required (nor
is the check for too small a mapping); but even so, move up setting of
VM_HUGETLB and add a comment to warn of what David Gibson discovered - if
hugetlbfs_file_mmap fails before setting it, do_mmap_pgoff's unmap_region
when unwinding from error will go the non-huge way, which may cause bad
behaviour on architectures (powerpc and ia64) which segregate their huge
mappings into a separate region of the address space.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Acked-by: Adam Litke <agl@us.ibm.com>
Acked-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/mm/hugetlbpage.c    |  4 +++-
 arch/powerpc/mm/hugetlbpage.c |  8 ++++++--
 fs/hugetlbfs/inode.c          | 21 ++++++++-------------
 include/linux/hugetlb.h       | 10 +++++++---
 mm/mmap.c                     |  2 +-
 5 files changed, 25 insertions(+), 20 deletions(-)

(limited to 'fs/hugetlbfs/inode.c')

diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index eee5c1cfbe32..f3a9585e98a8 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -70,8 +70,10 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr)
  * Don't actually need to do any preparation, but need to make sure
  * the address is in the right region.
  */
-int prepare_hugepage_range(unsigned long addr, unsigned long len)
+int prepare_hugepage_range(unsigned long addr, unsigned long len, pgoff_t pgoff)
 {
+	if (pgoff & (~HPAGE_MASK >> PAGE_SHIFT))
+		return -EINVAL;
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
 	if (addr & ~HPAGE_MASK)
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index fd68b74c07c3..506d89768d45 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -491,11 +491,15 @@ static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
 	return 0;
 }
 
-int prepare_hugepage_range(unsigned long addr, unsigned long len)
+int prepare_hugepage_range(unsigned long addr, unsigned long len, pgoff_t pgoff)
 {
 	int err = 0;
 
-	if ( (addr+len) < addr )
+	if (pgoff & (~HPAGE_MASK >> PAGE_SHIFT))
+		return -EINVAL;
+	if (len & ~HPAGE_MASK)
+		return -EINVAL;
+	if (addr & ~HPAGE_MASK)
 		return -EINVAL;
 
 	if (addr < 0x100000000UL)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 0bea6a619e10..7f4756963d05 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -62,24 +62,19 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	loff_t len, vma_len;
 	int ret;
 
-	if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1))
-		return -EINVAL;
-
-	if (vma->vm_start & ~HPAGE_MASK)
-		return -EINVAL;
-
-	if (vma->vm_end & ~HPAGE_MASK)
-		return -EINVAL;
-
-	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
-		return -EINVAL;
+	/*
+	 * vma alignment has already been checked by prepare_hugepage_range.
+	 * If you add any error returns here, do so after setting VM_HUGETLB,
+	 * so is_vm_hugetlb_page tests below unmap_region go the right way
+	 * when do_mmap_pgoff unwinds (may be important on powerpc and ia64).
+	 */
+	vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
+	vma->vm_ops = &hugetlb_vm_ops;
 
 	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
 
 	mutex_lock(&inode->i_mutex);
 	file_accessed(file);
-	vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
-	vma->vm_ops = &hugetlb_vm_ops;
 
 	ret = -ENOMEM;
 	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5081d27bfa27..ace64e57e17f 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -60,8 +60,11 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
  * If the arch doesn't supply something else, assume that hugepage
  * size aligned regions are ok without further preparation.
  */
-static inline int prepare_hugepage_range(unsigned long addr, unsigned long len)
+static inline int prepare_hugepage_range(unsigned long addr, unsigned long len,
+						pgoff_t pgoff)
 {
+	if (pgoff & (~HPAGE_MASK >> PAGE_SHIFT))
+		return -EINVAL;
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
 	if (addr & ~HPAGE_MASK)
@@ -69,7 +72,8 @@ static inline int prepare_hugepage_range(unsigned long addr, unsigned long len)
 	return 0;
 }
 #else
-int prepare_hugepage_range(unsigned long addr, unsigned long len);
+int prepare_hugepage_range(unsigned long addr, unsigned long len,
+						pgoff_t pgoff);
 #endif
 
 #ifndef ARCH_HAS_SETCLEAR_HUGE_PTE
@@ -107,7 +111,7 @@ static inline unsigned long hugetlb_total_pages(void)
 #define hugetlb_report_meminfo(buf)		0
 #define hugetlb_report_node_meminfo(n, buf)	0
 #define follow_huge_pmd(mm, addr, pmd, write)	NULL
-#define prepare_hugepage_range(addr, len)	(-EINVAL)
+#define prepare_hugepage_range(addr,len,pgoff)	(-EINVAL)
 #define pmd_huge(x)	0
 #define is_hugepage_only_range(mm, addr, len)	0
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
diff --git a/mm/mmap.c b/mm/mmap.c
index 497e502dfd6b..bdace87d7c01 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1379,7 +1379,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 		 * Check if the given range is hugepage aligned, and
 		 * can be made suitable for hugepages.
 		 */
-		ret = prepare_hugepage_range(addr, len);
+		ret = prepare_hugepage_range(addr, len, pgoff);
 	} else {
 		/*
 		 * Ensure that a normal request is not falling in a
-- 
cgit v1.2.3