From 7cb93181629c613ee2b8f4ffe3446f8003074842 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Wed, 30 Jul 2008 02:18:26 +0900
Subject: mm/hugetlb.c must #include <asm/io.h>

This patch fixes the following build error on sh caused by
commit aa888a74977a8f2120ae9332376e179c39a6b07d
(hugetlb: support larger than MAX_ORDER):

<--  snip  -->

...
  CC      mm/hugetlb.o
/home/bunk/linux/kernel-2.6/git/linux-2.6/mm/hugetlb.c: In function 'alloc_bootmem_huge_page':
/home/bunk/linux/kernel-2.6/git/linux-2.6/mm/hugetlb.c:958: error: implicit declaration of function 'virt_to_phys'
make[2]: *** [mm/hugetlb.o] Error 1

<--  snip  -->

Reported-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 mm/hugetlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3be79dc18c5c..b3c78640b629 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -16,7 +16,7 @@
 #include <linux/mutex.h>
 #include <linux/bootmem.h>
 #include <linux/sysfs.h>
-
+#include <asm/io.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
-- 
cgit v1.2.3


From 0ef89d25d3e390dfa7c46772907951744a4067dc Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 31 Jul 2008 00:07:30 -0700
Subject: mm/hugetlb: don't crash when HPAGE_SHIFT is 0

Some platforms decide whether they support huge pages at boot time.  On
these, such as powerpc, HPAGE_SHIFT is a variable, not a constant, and is
set to 0 when there is no such support.

The patches that introduced support for multiple huge page sizes broke
that, causing the kernel to crash at boot time on machines such as POWER3,
which lack support for multiple page sizes.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d237a02eb228..28a2980ee435 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1283,7 +1283,12 @@ module_exit(hugetlb_exit);
 
 static int __init hugetlb_init(void)
 {
-	BUILD_BUG_ON(HPAGE_SHIFT == 0);
+	/* Some platform decide whether they support huge pages at boot
+	 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
+	 * there is no such support
+	 */
+	if (HPAGE_SHIFT == 0)
+		return 0;
 
 	if (!size_to_hstate(default_hstate_size)) {
 		default_hstate_size = HPAGE_SIZE;
-- 
cgit v1.2.3


From d6606683a5e3dac35cb979c7195f54ed827567bd Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 6 Aug 2008 12:04:54 -0700
Subject: Revert duplicate "mm/hugetlb.c must #include <asm/io.h>"

This reverts commit 7cb93181629c613ee2b8f4ffe3446f8003074842, since we
did that patch twice, and the problem was already fixed earlier by
78a34ae29bf1c9df62a5bd0f0798b6c62a54d520.

Reported-by: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 28a2980ee435..757ca983fd99 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -17,7 +17,7 @@
 #include <linux/mutex.h>
 #include <linux/bootmem.h>
 #include <linux/sysfs.h>
-#include <asm/io.h>
+
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/io.h>
-- 
cgit v1.2.3


From caff3a2c333e11a794308bd9a875a09b94fee24a Mon Sep 17 00:00:00 2001
From: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Date: Tue, 12 Aug 2008 15:08:38 -0700
Subject: hugetlb: call arch_prepare_hugepage() for surplus pages

The s390 software large page emulation implements shared page tables by
using page->index of the first tail page from a compound large page to
store page table information.  This is set up in arch_prepare_hugepage(),
which is called from alloc_fresh_huge_page_node().

A similar call to arch_prepare_hugepage() is missing for surplus large
pages that are allocated in alloc_buddy_huge_page(), which breaks the
software emulation mode for (surplus) large pages on s390.  This patch
adds the missing call to arch_prepare_hugepage().  It will have no effect
on other architectures where arch_prepare_hugepage() is a nop.

Also, use the correct order in the error path in alloc_fresh_huge_page_node().

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Acked-by: Nick Piggin <npiggin@suse.de>
Acked-by: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 757ca983fd99..92155db888b9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -565,7 +565,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 		huge_page_order(h));
 	if (page) {
 		if (arch_prepare_hugepage(page)) {
-			__free_pages(page, HUGETLB_PAGE_ORDER);
+			__free_pages(page, huge_page_order(h));
 			return NULL;
 		}
 		prep_new_huge_page(h, page, nid);
@@ -665,6 +665,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 					__GFP_REPEAT|__GFP_NOWARN,
 					huge_page_order(h));
 
+	if (page && arch_prepare_hugepage(page)) {
+		__free_pages(page, huge_page_order(h));
+		return NULL;
+	}
+
 	spin_lock(&hugetlb_lock);
 	if (page) {
 		/*
-- 
cgit v1.2.3


From 57303d80175e10056bf51206f9961d586f02f967 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Tue, 12 Aug 2008 15:08:47 -0700
Subject: hugetlbfs: allocate structures for reservation tracking outside of
 spinlocks

In the normal case, hugetlbfs reserves hugepages at map time so that the
pages exist for future faults.  A struct file_region is used to track when
reservations have been consumed and where.  These file_regions are
allocated as necessary with kmalloc(), which can sleep, while the
mm->page_table_lock is held.  This is wrong and triggers a may-sleep warning
when PREEMPT is enabled.

Updates to the underlying file_region are done in two phases.  The first
phase prepares the region for the change, allocating any necessary memory,
without actually making the change.  The second phase actually commits the
change.  This patch makes use of this by checking the reservations before
the page_table_lock is taken; triggering any necessary allocations.  This
may then be safely repeated within the locks without any allocations being
required.

Credit to Mel Gorman for diagnosing this failure and initial versions of
the patch.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Tested-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 44 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 92155db888b9..4c97c174e2e1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1942,6 +1942,15 @@ retry:
 			lock_page(page);
 	}
 
+	/*
+	 * If we are going to COW a private mapping later, we examine the
+	 * pending reservations for this page now. This will ensure that
+	 * any allocations necessary to record that reservation occur outside
+	 * the spinlock.
+	 */
+	if (write_access && !(vma->vm_flags & VM_SHARED))
+		vma_needs_reservation(h, vma, address);
+
 	spin_lock(&mm->page_table_lock);
 	size = i_size_read(mapping->host) >> huge_page_shift(h);
 	if (idx >= size)
@@ -1978,6 +1987,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t *ptep;
 	pte_t entry;
 	int ret;
+	struct page *pagecache_page = NULL;
 	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 	struct hstate *h = hstate_vma(vma);
 
@@ -2000,19 +2010,35 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	ret = 0;
 
+	/*
+	 * If we are going to COW the mapping later, we examine the pending
+	 * reservations for this page now. This will ensure that any
+	 * allocations necessary to record that reservation occur outside the
+	 * spinlock. For private mappings, we also lookup the pagecache
+	 * page now as it is used to determine if a reservation has been
+	 * consumed.
+	 */
+	if (write_access && !pte_write(entry)) {
+		vma_needs_reservation(h, vma, address);
+
+		if (!(vma->vm_flags & VM_SHARED))
+			pagecache_page = hugetlbfs_pagecache_page(h,
+								vma, address);
+	}
+
 	spin_lock(&mm->page_table_lock);
 	/* Check for a racing update before calling hugetlb_cow */
 	if (likely(pte_same(entry, huge_ptep_get(ptep))))
-		if (write_access && !pte_write(entry)) {
-			struct page *page;
-			page = hugetlbfs_pagecache_page(h, vma, address);
-			ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
-			if (page) {
-				unlock_page(page);
-				put_page(page);
-			}
-		}
+		if (write_access && !pte_write(entry))
+			ret = hugetlb_cow(mm, vma, address, ptep, entry,
+							pagecache_page);
 	spin_unlock(&mm->page_table_lock);
+
+	if (pagecache_page) {
+		unlock_page(pagecache_page);
+		put_page(pagecache_page);
+	}
+
 	mutex_unlock(&hugetlb_instantiation_mutex);
 
 	return ret;
-- 
cgit v1.2.3


From 2b26736c88db85c038e04c2306d0745553e69602 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Tue, 12 Aug 2008 15:08:49 -0700
Subject: allocate structures for reservation tracking in hugetlbfs outside of
 spinlocks v2

[Andrew, this should replace the previous version which did not check
the returns from the region prepare for errors.  This has been tested by
us and Gerald and it looks good.

Bah, while reviewing the locking based on your previous email I spotted
that we need to check the return from the vma_needs_reservation call for
allocation errors.  Here is an updated patch to correct this.  This passes
testing here.]

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Tested-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4c97c174e2e1..67a71191136e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1949,7 +1949,10 @@ retry:
 	 * the spinlock.
 	 */
 	if (write_access && !(vma->vm_flags & VM_SHARED))
-		vma_needs_reservation(h, vma, address);
+		if (vma_needs_reservation(h, vma, address) < 0) {
+			ret = VM_FAULT_OOM;
+			goto backout_unlocked;
+		}
 
 	spin_lock(&mm->page_table_lock);
 	size = i_size_read(mapping->host) >> huge_page_shift(h);
@@ -1976,6 +1979,7 @@ out:
 
 backout:
 	spin_unlock(&mm->page_table_lock);
+backout_unlocked:
 	unlock_page(page);
 	put_page(page);
 	goto out;
@@ -2004,8 +2008,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	entry = huge_ptep_get(ptep);
 	if (huge_pte_none(entry)) {
 		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
-		mutex_unlock(&hugetlb_instantiation_mutex);
-		return ret;
+		goto out_unlock;
 	}
 
 	ret = 0;
@@ -2019,7 +2022,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * consumed.
 	 */
 	if (write_access && !pte_write(entry)) {
-		vma_needs_reservation(h, vma, address);
+		if (vma_needs_reservation(h, vma, address) < 0) {
+			ret = VM_FAULT_OOM;
+			goto out_unlock;
+		}
 
 		if (!(vma->vm_flags & VM_SHARED))
 			pagecache_page = hugetlbfs_pagecache_page(h,
@@ -2039,6 +2045,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		put_page(pagecache_page);
 	}
 
+out_unlock:
 	mutex_unlock(&hugetlb_instantiation_mutex);
 
 	return ret;
-- 
cgit v1.2.3