From 51007004f44c9588d70ffb77e1f52479bd5b0e37 Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@tilera.com>
Date: Tue, 27 Mar 2012 15:40:20 -0400
Subject: arch/tile: use interrupt critical sections less

In general we want to avoid ever touching memory while within an
interrupt critical section, since the page fault path goes through
a different path from the hypervisor when in an interrupt critical
section, and we carefully decided with tilegx that we didn't need
to support this path in the kernel.  (On tilepro we did implement
that path as part of supporting atomic instructions in software.)

In practice we always need to touch the kernel stack, since that's
where we store the interrupt state before releasing the critical
section, but this change cleans up a few things.  The IRQ_ENABLE
macro is split up so that when we want to enable interrupts in a
deferred way (e.g. for cpu_idle or for interrupt return) we can
read the per-cpu enable mask before entering the critical section.
The cache-migration code is changed to use interrupt masking instead
of interrupt critical sections.  And, the interrupt-entry code is
changed so that we defer loading "tp" from per-cpu data until after
we have released the interrupt critical section.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 arch/tile/mm/init.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/tile/mm/init.c')

diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index 6a9d20ddc34f..1e4633520b35 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -444,6 +444,7 @@ static pgd_t pgtables[PTRS_PER_PGD]
  */
 static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 {
+	unsigned long long irqmask;
 	unsigned long address, pfn;
 	pmd_t *pmd;
 	pte_t *pte;
@@ -633,10 +634,13 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 	 *  - install pgtables[] as the real page table
 	 *  - flush the TLB so the new page table takes effect
 	 */
+	irqmask = interrupt_mask_save_mask();
+	interrupt_mask_set_mask(-1ULL);
 	rc = flush_and_install_context(__pa(pgtables),
 				       init_pgprot((unsigned long)pgtables),
 				       __get_cpu_var(current_asid),
 				       cpumask_bits(my_cpu_mask));
+	interrupt_mask_restore_mask(irqmask);
 	BUG_ON(rc != 0);
 
 	/* Copy the page table back to the normal swapper_pg_dir. */
-- 
cgit v1.2.3


From d5d14ed6f2db7287a5088e1350cf422bf72140b3 Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@tilera.com>
Date: Thu, 29 Mar 2012 13:58:43 -0400
Subject: arch/tile: Allow tilegx to build with either 16K or 64K page size

This change introduces new flags for the hv_install_context()
API that passes a page table pointer to the hypervisor.  Clients
can explicitly request 4K, 16K, or 64K small pages when they
install a new context.  In practice, the page size is fixed at
kernel compile time and the same size is always requested every
time a new page table is installed.

The <hv/hypervisor.h> header changes so that it provides more abstract
macros for managing "page" things like PFNs and page tables.  For
example there is now a HV_DEFAULT_PAGE_SIZE_SMALL instead of the old
HV_PAGE_SIZE_SMALL.  The various PFN routines have been eliminated and
only PA- or PTFN-based ones remain (since PTFNs are always expressed
in fixed 2KB "page" size).  The page-table management macros are
renamed with a leading underscore and take page-size arguments with
the presumption that clients will use those macros in some single
place to provide the "real" macros they will use themselves.

I happened to notice the old hv_set_caching() API was totally broken
(it assumed 4KB pages) so I changed it so it would nominally work
correctly with other page sizes.

Tag modules with the page size so you can't load a module built with
a conflicting page size.  (And add a test for SMP while we're at it.)

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 arch/tile/Kconfig                    |  25 ++++
 arch/tile/include/asm/Kbuild         |   1 -
 arch/tile/include/asm/mmu.h          |   2 +-
 arch/tile/include/asm/mmu_context.h  |   8 +-
 arch/tile/include/asm/module.h       |  40 +++++++
 arch/tile/include/asm/page.h         |  13 ++-
 arch/tile/include/asm/pgalloc.h      |  92 +++++++++++----
 arch/tile/include/asm/pgtable.h      |  10 +-
 arch/tile/include/asm/pgtable_32.h   |  14 ++-
 arch/tile/include/asm/pgtable_64.h   |  28 +++--
 arch/tile/include/hv/drv_xgbe_intf.h |   2 +-
 arch/tile/include/hv/hypervisor.h    | 214 ++++++++++++++++++++---------------
 arch/tile/kernel/head_32.S           |   8 +-
 arch/tile/kernel/head_64.S           |  22 ++--
 arch/tile/kernel/machine_kexec.c     |   7 +-
 arch/tile/kernel/setup.c             |   8 +-
 arch/tile/kernel/smp.c               |   2 +-
 arch/tile/lib/memcpy_tile64.c        |   8 +-
 arch/tile/mm/init.c                  |  11 +-
 arch/tile/mm/pgtable.c               |  27 ++---
 20 files changed, 345 insertions(+), 197 deletions(-)
 create mode 100644 arch/tile/include/asm/module.h

(limited to 'arch/tile/mm/init.c')

diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 74239dd77e06..38c3957e0b40 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -139,6 +139,31 @@ config NR_CPUS
 	  smaller kernel memory footprint results from using a smaller
 	  value on chips with fewer tiles.
 
+if TILEGX
+
+choice
+	prompt "Kernel page size"
+	default PAGE_SIZE_64KB
+	help
+	  This lets you select the page size of the kernel.  For best
+	  performance on memory-intensive applications, a page size of 64KB
+	  is recommended.  For workloads involving many small files, many
+	  connections, etc., it may be better to select 16KB, which uses
+	  memory more efficiently at some cost in TLB performance.
+
+	  Note that this option is TILE-Gx specific; currently
+	  TILEPro page size is set by rebuilding the hypervisor.
+
+config PAGE_SIZE_16KB
+	bool "16KB"
+
+config PAGE_SIZE_64KB
+	bool "64KB"
+
+endchoice
+
+endif
+
 source "kernel/time/Kconfig"
 
 source "kernel/Kconfig.hz"
diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild
index 0bb42642343a..6b2e681695ec 100644
--- a/arch/tile/include/asm/Kbuild
+++ b/arch/tile/include/asm/Kbuild
@@ -21,7 +21,6 @@ generic-y += ipcbuf.h
 generic-y += irq_regs.h
 generic-y += kdebug.h
 generic-y += local.h
-generic-y += module.h
 generic-y += msgbuf.h
 generic-y += mutex.h
 generic-y += param.h
diff --git a/arch/tile/include/asm/mmu.h b/arch/tile/include/asm/mmu.h
index 92f94c77b6e4..e2c789096795 100644
--- a/arch/tile/include/asm/mmu.h
+++ b/arch/tile/include/asm/mmu.h
@@ -21,7 +21,7 @@ struct mm_context {
 	 * Written under the mmap_sem semaphore; read without the
 	 * semaphore but atomically, but it is conservatively set.
 	 */
-	unsigned int priority_cached;
+	unsigned long priority_cached;
 };
 
 typedef struct mm_context mm_context_t;
diff --git a/arch/tile/include/asm/mmu_context.h b/arch/tile/include/asm/mmu_context.h
index 15fb24641120..37f0b741dee7 100644
--- a/arch/tile/include/asm/mmu_context.h
+++ b/arch/tile/include/asm/mmu_context.h
@@ -30,11 +30,15 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	return 0;
 }
 
-/* Note that arch/tile/kernel/head.S also calls hv_install_context() */
+/*
+ * Note that arch/tile/kernel/head_NN.S and arch/tile/mm/migrate_NN.S
+ * also call hv_install_context().
+ */
 static inline void __install_page_table(pgd_t *pgdir, int asid, pgprot_t prot)
 {
 	/* FIXME: DIRECTIO should not always be set. FIXME. */
-	int rc = hv_install_context(__pa(pgdir), prot, asid, HV_CTX_DIRECTIO);
+	int rc = hv_install_context(__pa(pgdir), prot, asid,
+				    HV_CTX_DIRECTIO | CTX_PAGE_FLAG);
 	if (rc < 0)
 		panic("hv_install_context failed: %d", rc);
 }
diff --git a/arch/tile/include/asm/module.h b/arch/tile/include/asm/module.h
new file mode 100644
index 000000000000..44ed07ccd3d2
--- /dev/null
+++ b/arch/tile/include/asm/module.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _ASM_TILE_MODULE_H
+#define _ASM_TILE_MODULE_H
+
+#include <arch/chip.h>
+
+#include <asm-generic/module.h>
+
+/* We can't use modules built with different page sizes. */
+#if defined(CONFIG_PAGE_SIZE_16KB)
+# define MODULE_PGSZ " 16KB"
+#elif defined(CONFIG_PAGE_SIZE_64KB)
+# define MODULE_PGSZ " 64KB"
+#else
+# define MODULE_PGSZ ""
+#endif
+
+/* We don't really support no-SMP so tag if someone tries. */
+#ifdef CONFIG_SMP
+#define MODULE_NOSMP ""
+#else
+#define MODULE_NOSMP " nosmp"
+#endif
+
+#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_NOSMP
+
+#endif /* _ASM_TILE_MODULE_H */
diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h
index db93518fac03..c750943f961e 100644
--- a/arch/tile/include/asm/page.h
+++ b/arch/tile/include/asm/page.h
@@ -20,8 +20,17 @@
 #include <arch/chip.h>
 
 /* PAGE_SHIFT and HPAGE_SHIFT determine the page sizes. */
-#define PAGE_SHIFT	HV_LOG2_PAGE_SIZE_SMALL
-#define HPAGE_SHIFT	HV_LOG2_PAGE_SIZE_LARGE
+#if defined(CONFIG_PAGE_SIZE_16KB)
+#define PAGE_SHIFT	14
+#define CTX_PAGE_FLAG	HV_CTX_PG_SM_16K
+#elif defined(CONFIG_PAGE_SIZE_64KB)
+#define PAGE_SHIFT	16
+#define CTX_PAGE_FLAG	HV_CTX_PG_SM_64K
+#else
+#define PAGE_SHIFT	HV_LOG2_DEFAULT_PAGE_SIZE_SMALL
+#define CTX_PAGE_FLAG	0
+#endif
+#define HPAGE_SHIFT	HV_LOG2_DEFAULT_PAGE_SIZE_LARGE
 
 #define PAGE_SIZE	(_AC(1, UL) << PAGE_SHIFT)
 #define HPAGE_SIZE	(_AC(1, UL) << HPAGE_SHIFT)
diff --git a/arch/tile/include/asm/pgalloc.h b/arch/tile/include/asm/pgalloc.h
index e919c0bdc22d..1b902508b664 100644
--- a/arch/tile/include/asm/pgalloc.h
+++ b/arch/tile/include/asm/pgalloc.h
@@ -19,24 +19,24 @@
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <asm/fixmap.h>
+#include <asm/page.h>
 #include <hv/hypervisor.h>
 
 /* Bits for the size of the second-level page table. */
-#define L2_KERNEL_PGTABLE_SHIFT \
-  (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL + HV_LOG2_PTE_SIZE)
+#define L2_KERNEL_PGTABLE_SHIFT _HV_LOG2_L2_SIZE(HPAGE_SHIFT, PAGE_SHIFT)
+
+/* How big is a kernel L2 page table? */
+#define L2_KERNEL_PGTABLE_SIZE (1UL << L2_KERNEL_PGTABLE_SHIFT)
 
 /* We currently allocate user L2 page tables by page (unlike kernel L2s). */
-#if L2_KERNEL_PGTABLE_SHIFT < HV_LOG2_PAGE_SIZE_SMALL
-#define L2_USER_PGTABLE_SHIFT HV_LOG2_PAGE_SIZE_SMALL
+#if L2_KERNEL_PGTABLE_SHIFT < PAGE_SHIFT
+#define L2_USER_PGTABLE_SHIFT PAGE_SHIFT
 #else
 #define L2_USER_PGTABLE_SHIFT L2_KERNEL_PGTABLE_SHIFT
 #endif
 
 /* How many pages do we need, as an "order", for a user L2 page table? */
-#define L2_USER_PGTABLE_ORDER (L2_USER_PGTABLE_SHIFT - HV_LOG2_PAGE_SIZE_SMALL)
-
-/* How big is a kernel L2 page table? */
-#define L2_KERNEL_PGTABLE_SIZE (1 << L2_KERNEL_PGTABLE_SHIFT)
+#define L2_USER_PGTABLE_ORDER (L2_USER_PGTABLE_SHIFT - PAGE_SHIFT)
 
 static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 {
@@ -50,14 +50,14 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 static inline void pmd_populate_kernel(struct mm_struct *mm,
 				       pmd_t *pmd, pte_t *ptep)
 {
-	set_pmd(pmd, ptfn_pmd(__pa(ptep) >> HV_LOG2_PAGE_TABLE_ALIGN,
+	set_pmd(pmd, ptfn_pmd(HV_CPA_TO_PTFN(__pa(ptep)),
 			      __pgprot(_PAGE_PRESENT)));
 }
 
 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 				pgtable_t page)
 {
-	set_pmd(pmd, ptfn_pmd(HV_PFN_TO_PTFN(page_to_pfn(page)),
+	set_pmd(pmd, ptfn_pmd(HV_CPA_TO_PTFN(PFN_PHYS(page_to_pfn(page))),
 			      __pgprot(_PAGE_PRESENT)));
 }
 
@@ -68,8 +68,20 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
 
-extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address);
-extern void pte_free(struct mm_struct *mm, struct page *pte);
+extern pgtable_t pgtable_alloc_one(struct mm_struct *mm, unsigned long address,
+				   int order);
+extern void pgtable_free(struct mm_struct *mm, struct page *pte, int order);
+
+static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
+				      unsigned long address)
+{
+	return pgtable_alloc_one(mm, address, L2_USER_PGTABLE_ORDER);
+}
+
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
+{
+	pgtable_free(mm, pte, L2_USER_PGTABLE_ORDER);
+}
 
 #define pmd_pgtable(pmd) pmd_page(pmd)
 
@@ -85,8 +97,13 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 	pte_free(mm, virt_to_page(pte));
 }
 
-extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
-			   unsigned long address);
+extern void __pgtable_free_tlb(struct mmu_gather *tlb, struct page *pte,
+			       unsigned long address, int order);
+static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
+				  unsigned long address)
+{
+	__pgtable_free_tlb(tlb, pte, address, L2_USER_PGTABLE_ORDER);
+}
 
 #define check_pgt_cache()	do { } while (0)
 
@@ -104,19 +121,44 @@ void shatter_pmd(pmd_t *pmd);
 void shatter_huge_page(unsigned long addr);
 
 #ifdef __tilegx__
-/* We share a single page allocator for both L1 and L2 page tables. */
-#if HV_L1_SIZE != HV_L2_SIZE
-# error Rework assumption that L1 and L2 page tables are same size.
-#endif
-#define L1_USER_PGTABLE_ORDER L2_USER_PGTABLE_ORDER
+
 #define pud_populate(mm, pud, pmd) \
   pmd_populate_kernel((mm), (pmd_t *)(pud), (pte_t *)(pmd))
-#define pmd_alloc_one(mm, addr) \
-  ((pmd_t *)page_to_virt(pte_alloc_one((mm), (addr))))
-#define pmd_free(mm, pmdp) \
-  pte_free((mm), virt_to_page(pmdp))
-#define __pmd_free_tlb(tlb, pmdp, address) \
-  __pte_free_tlb((tlb), virt_to_page(pmdp), (address))
+
+/* Bits for the size of the L1 (intermediate) page table. */
+#define L1_KERNEL_PGTABLE_SHIFT _HV_LOG2_L1_SIZE(HPAGE_SHIFT)
+
+/* How big is a kernel L2 page table? */
+#define L1_KERNEL_PGTABLE_SIZE (1UL << L1_KERNEL_PGTABLE_SHIFT)
+
+/* We currently allocate L1 page tables by page. */
+#if L1_KERNEL_PGTABLE_SHIFT < PAGE_SHIFT
+#define L1_USER_PGTABLE_SHIFT PAGE_SHIFT
+#else
+#define L1_USER_PGTABLE_SHIFT L1_KERNEL_PGTABLE_SHIFT
 #endif
 
+/* How many pages do we need, as an "order", for an L1 page table? */
+#define L1_USER_PGTABLE_ORDER (L1_USER_PGTABLE_SHIFT - PAGE_SHIFT)
+
+static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+	struct page *p = pgtable_alloc_one(mm, address, L1_USER_PGTABLE_ORDER);
+	return (pmd_t *)page_to_virt(p);
+}
+
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmdp)
+{
+	pgtable_free(mm, virt_to_page(pmdp), L1_USER_PGTABLE_ORDER);
+}
+
+static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
+				  unsigned long address)
+{
+	__pgtable_free_tlb(tlb, virt_to_page(pmdp), address,
+			   L1_USER_PGTABLE_ORDER);
+}
+
+#endif /* __tilegx__ */
+
 #endif /* _ASM_TILE_PGALLOC_H */
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index ec907d4dbd7a..319f4826d972 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -27,8 +27,10 @@
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
+#include <linux/pfn.h>
 #include <asm/processor.h>
 #include <asm/fixmap.h>
+#include <asm/page.h>
 
 struct mm_struct;
 struct vm_area_struct;
@@ -162,7 +164,7 @@ extern void set_page_homes(void);
   (pgprot_t) { ((oldprot).val & ~_PAGE_ALL) | (newprot).val }
 
 /* Just setting the PFN to zero suffices. */
-#define pte_pgprot(x) hv_pte_set_pfn((x), 0)
+#define pte_pgprot(x) hv_pte_set_pa((x), 0)
 
 /*
  * For PTEs and PDEs, we must clear the Present bit first when
@@ -262,7 +264,7 @@ static inline int pte_none(pte_t pte)
 
 static inline unsigned long pte_pfn(pte_t pte)
 {
-	return hv_pte_get_pfn(pte);
+	return PFN_DOWN(hv_pte_get_pa(pte));
 }
 
 /* Set or get the remote cache cpu in a pgprot with remote caching. */
@@ -271,7 +273,7 @@ extern int get_remote_cache_cpu(pgprot_t prot);
 
 static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot)
 {
-	return hv_pte_set_pfn(prot, pfn);
+	return hv_pte_set_pa(prot, PFN_PHYS(pfn));
 }
 
 /* Support for priority mappings. */
@@ -471,7 +473,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
  * OK for pte_lockptr(), since we just end up with potentially one
  * lock being used for several pte_t arrays.
  */
-#define pmd_page(pmd) pfn_to_page(HV_PTFN_TO_PFN(pmd_ptfn(pmd)))
+#define pmd_page(pmd) pfn_to_page(PFN_DOWN(HV_PTFN_TO_CPA(pmd_ptfn(pmd))))
 
 static inline void pmd_clear(pmd_t *pmdp)
 {
diff --git a/arch/tile/include/asm/pgtable_32.h b/arch/tile/include/asm/pgtable_32.h
index 27e20f6844a8..4ce4a7a99c24 100644
--- a/arch/tile/include/asm/pgtable_32.h
+++ b/arch/tile/include/asm/pgtable_32.h
@@ -20,11 +20,12 @@
  * The level-1 index is defined by the huge page size.  A PGD is composed
  * of PTRS_PER_PGD pgd_t's and is the top level of the page table.
  */
-#define PGDIR_SHIFT	HV_LOG2_PAGE_SIZE_LARGE
-#define PGDIR_SIZE	HV_PAGE_SIZE_LARGE
+#define PGDIR_SHIFT	HPAGE_SHIFT
+#define PGDIR_SIZE	HPAGE_SIZE
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
-#define PTRS_PER_PGD	(1 << (32 - PGDIR_SHIFT))
-#define SIZEOF_PGD	(PTRS_PER_PGD * sizeof(pgd_t))
+#define PTRS_PER_PGD	_HV_L1_ENTRIES(HPAGE_SHIFT)
+#define PGD_INDEX(va)	_HV_L1_INDEX(va, HPAGE_SHIFT)
+#define SIZEOF_PGD	_HV_L1_SIZE(HPAGE_SHIFT)
 
 /*
  * The level-2 index is defined by the difference between the huge
@@ -33,8 +34,9 @@
  * Note that the hypervisor docs use PTE for what we call pte_t, so
  * this nomenclature is somewhat confusing.
  */
-#define PTRS_PER_PTE (1 << (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL))
-#define SIZEOF_PTE	(PTRS_PER_PTE * sizeof(pte_t))
+#define PTRS_PER_PTE	_HV_L2_ENTRIES(HPAGE_SHIFT, PAGE_SHIFT)
+#define PTE_INDEX(va)	_HV_L2_INDEX(va, HPAGE_SHIFT, PAGE_SHIFT)
+#define SIZEOF_PTE	_HV_L2_SIZE(HPAGE_SHIFT, PAGE_SHIFT)
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/tile/include/asm/pgtable_64.h b/arch/tile/include/asm/pgtable_64.h
index e105f3ada655..2492fa5478e7 100644
--- a/arch/tile/include/asm/pgtable_64.h
+++ b/arch/tile/include/asm/pgtable_64.h
@@ -21,17 +21,19 @@
 #define PGDIR_SIZE	HV_L1_SPAN
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 #define PTRS_PER_PGD	HV_L0_ENTRIES
-#define SIZEOF_PGD	(PTRS_PER_PGD * sizeof(pgd_t))
+#define PGD_INDEX(va)	HV_L0_INDEX(va)
+#define SIZEOF_PGD	HV_L0_SIZE
 
 /*
  * The level-1 index is defined by the huge page size.  A PMD is composed
  * of PTRS_PER_PMD pgd_t's and is the middle level of the page table.
  */
-#define PMD_SHIFT	HV_LOG2_PAGE_SIZE_LARGE
-#define PMD_SIZE	HV_PAGE_SIZE_LARGE
+#define PMD_SHIFT	HPAGE_SHIFT
+#define PMD_SIZE	HPAGE_SIZE
 #define PMD_MASK	(~(PMD_SIZE-1))
-#define PTRS_PER_PMD	(1 << (PGDIR_SHIFT - PMD_SHIFT))
-#define SIZEOF_PMD	(PTRS_PER_PMD * sizeof(pmd_t))
+#define PTRS_PER_PMD	_HV_L1_ENTRIES(HPAGE_SHIFT)
+#define PMD_INDEX(va)	_HV_L1_INDEX(va, HPAGE_SHIFT)
+#define SIZEOF_PMD	_HV_L1_SIZE(HPAGE_SHIFT)
 
 /*
  * The level-2 index is defined by the difference between the huge
@@ -40,17 +42,19 @@
  * Note that the hypervisor docs use PTE for what we call pte_t, so
  * this nomenclature is somewhat confusing.
  */
-#define PTRS_PER_PTE (1 << (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL))
-#define SIZEOF_PTE	(PTRS_PER_PTE * sizeof(pte_t))
+#define PTRS_PER_PTE	_HV_L2_ENTRIES(HPAGE_SHIFT, PAGE_SHIFT)
+#define PTE_INDEX(va)	_HV_L2_INDEX(va, HPAGE_SHIFT, PAGE_SHIFT)
+#define SIZEOF_PTE	_HV_L2_SIZE(HPAGE_SHIFT, PAGE_SHIFT)
 
 /*
- * Align the vmalloc area to an L2 page table, and leave a guard page
- * at the beginning and end.  The vmalloc code also puts in an internal
+ * Align the vmalloc area to an L2 page table.  Omit guard pages at
+ * the beginning and end for simplicity (particularly in the per-cpu
+ * memory allocation code).  The vmalloc code puts in an internal
  * guard page between each allocation.
  */
 #define _VMALLOC_END	HUGE_VMAP_BASE
-#define VMALLOC_END	(_VMALLOC_END - PAGE_SIZE)
-#define VMALLOC_START	(_VMALLOC_START + PAGE_SIZE)
+#define VMALLOC_END	_VMALLOC_END
+#define VMALLOC_START	_VMALLOC_START
 
 #define HUGE_VMAP_END	(HUGE_VMAP_BASE + PGDIR_SIZE)
 
@@ -98,7 +102,7 @@ static inline int pud_bad(pud_t pud)
  * A pud_t points to a pmd_t array.  Since we can have multiple per
  * page, we don't have a one-to-one mapping of pud_t's to pages.
  */
-#define pud_page(pud) pfn_to_page(HV_PTFN_TO_PFN(pud_ptfn(pud)))
+#define pud_page(pud) pfn_to_page(PFN_DOWN(HV_PTFN_TO_CPA(pud_ptfn(pud))))
 
 static inline unsigned long pud_index(unsigned long address)
 {
diff --git a/arch/tile/include/hv/drv_xgbe_intf.h b/arch/tile/include/hv/drv_xgbe_intf.h
index f13188ac281a..2a20b266d944 100644
--- a/arch/tile/include/hv/drv_xgbe_intf.h
+++ b/arch/tile/include/hv/drv_xgbe_intf.h
@@ -460,7 +460,7 @@ typedef void* lepp_comp_t;
  *  linux's "MAX_SKB_FRAGS", and presumably over-estimates by one, for
  *  our page size of exactly 65536.  We add one for a "body" fragment.
  */
-#define LEPP_MAX_FRAGS (65536 / HV_PAGE_SIZE_SMALL + 2 + 1)
+#define LEPP_MAX_FRAGS (65536 / HV_DEFAULT_PAGE_SIZE_SMALL + 2 + 1)
 
 /** Total number of bytes needed for an lepp_tso_cmd_t. */
 #define LEPP_TSO_CMD_SIZE(num_frags, header_size) \
diff --git a/arch/tile/include/hv/hypervisor.h b/arch/tile/include/hv/hypervisor.h
index df74223944b5..f27871775b7a 100644
--- a/arch/tile/include/hv/hypervisor.h
+++ b/arch/tile/include/hv/hypervisor.h
@@ -17,8 +17,8 @@
  * The hypervisor's public API.
  */
 
-#ifndef _TILE_HV_H
-#define _TILE_HV_H
+#ifndef _HV_HV_H
+#define _HV_HV_H
 
 #include <arch/chip.h>
 
@@ -42,25 +42,29 @@
  */
 #define HV_L1_SPAN (__HV_SIZE_ONE << HV_LOG2_L1_SPAN)
 
-/** The log2 of the size of small pages, in bytes. This value should
- * be verified at runtime by calling hv_sysconf(HV_SYSCONF_PAGE_SIZE_SMALL).
+/** The log2 of the initial size of small pages, in bytes.
+ * See HV_DEFAULT_PAGE_SIZE_SMALL.
  */
-#define HV_LOG2_PAGE_SIZE_SMALL 16
+#define HV_LOG2_DEFAULT_PAGE_SIZE_SMALL 16
 
-/** The size of small pages, in bytes. This value should be verified
+/** The initial size of small pages, in bytes. This value should be verified
  * at runtime by calling hv_sysconf(HV_SYSCONF_PAGE_SIZE_SMALL).
+ * It may also be modified when installing a new context.
  */
-#define HV_PAGE_SIZE_SMALL (__HV_SIZE_ONE << HV_LOG2_PAGE_SIZE_SMALL)
+#define HV_DEFAULT_PAGE_SIZE_SMALL \
+  (__HV_SIZE_ONE << HV_LOG2_DEFAULT_PAGE_SIZE_SMALL)
 
-/** The log2 of the size of large pages, in bytes. This value should be
- * verified at runtime by calling hv_sysconf(HV_SYSCONF_PAGE_SIZE_LARGE).
+/** The log2 of the initial size of large pages, in bytes.
+ * See HV_DEFAULT_PAGE_SIZE_LARGE.
  */
-#define HV_LOG2_PAGE_SIZE_LARGE 24
+#define HV_LOG2_DEFAULT_PAGE_SIZE_LARGE 24
 
-/** The size of large pages, in bytes. This value should be verified
+/** The initial size of large pages, in bytes. This value should be verified
  * at runtime by calling hv_sysconf(HV_SYSCONF_PAGE_SIZE_LARGE).
+ * It may also be modified when installing a new context.
  */
-#define HV_PAGE_SIZE_LARGE (__HV_SIZE_ONE << HV_LOG2_PAGE_SIZE_LARGE)
+#define HV_DEFAULT_PAGE_SIZE_LARGE \
+  (__HV_SIZE_ONE << HV_LOG2_DEFAULT_PAGE_SIZE_LARGE)
 
 /** The log2 of the granularity at which page tables must be aligned;
  *  in other words, the CPA for a page table must have this many zero
@@ -401,7 +405,13 @@ typedef enum {
    *  that the temperature has hit an upper limit and is no longer being
    *  accurately tracked.
    */
-  HV_SYSCONF_BOARD_TEMP      = 6
+  HV_SYSCONF_BOARD_TEMP      = 6,
+
+  /** Legal page size bitmask for hv_install_context().
+   * For example, if 16KB and 64KB small pages are supported,
+   * it would return "HV_CTX_PG_SM_16K | HV_CTX_PG_SM_64K".
+   */
+  HV_SYSCONF_VALID_PAGE_SIZES = 7,
 
 } HV_SysconfQuery;
 
@@ -654,6 +664,12 @@ void hv_set_rtc(HV_RTCTime time);
  *  new page table does not need to contain any mapping for the
  *  hv_install_context address itself.
  *
+ *  At most one HV_CTX_PG_SM_* flag may be specified in "flags";
+ *  if multiple flags are specified, HV_EINVAL is returned.
+ *  Specifying none of the flags results in using the default page size.
+ *  All cores participating in a given client must request the same
+ *  page size, or the results are undefined.
+ *
  * @param page_table Root of the page table.
  * @param access PTE providing info on how to read the page table.  This
  *   value must be consistent between multiple tiles sharing a page table,
@@ -672,6 +688,11 @@ int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,
 #define HV_CTX_DIRECTIO     0x1   /**< Direct I/O requests are accepted from
                                        PL0. */
 
+#define HV_CTX_PG_SM_4K     0x10  /**< Use 4K small pages, if available. */
+#define HV_CTX_PG_SM_16K    0x20  /**< Use 16K small pages, if available. */
+#define HV_CTX_PG_SM_64K    0x40  /**< Use 64K small pages, if available. */
+#define HV_CTX_PG_SM_MASK   0xf0  /**< Mask of all possible small pages. */
+
 #ifndef __ASSEMBLER__
 
 /** Value returned from hv_inquire_context(). */
@@ -1248,11 +1269,14 @@ HV_Errno hv_set_command_line(HV_VirtAddr buf, int length);
  * with the existing priority pages) or "red/black" (if they don't).
  * The bitmask provides information on which parts of the cache
  * have been used for pinned pages so far on this tile; if (1 << N)
- * appears in the bitmask, that indicates that a page has been marked
- * "priority" whose PFN equals N, mod 8.
+ * appears in the bitmask, that indicates that a 4KB region of the
+ * cache starting at (N * 4KB) is in use by a "priority" page.
+ * The portion of cache used by a particular page can be computed
+ * by taking the page's PA, modulo CHIP_L2_CACHE_SIZE(), and setting
+ * all the "4KB" bits corresponding to the actual page size.
  * @param bitmask A bitmap of priority page set values
  */
-void hv_set_caching(unsigned int bitmask);
+void hv_set_caching(unsigned long bitmask);
 
 
 /** Zero out a specified number of pages.
@@ -1884,15 +1908,6 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
                                               of word */
 #define HV_PTE_PTFN_BITS             29  /**< Number of bits in a PTFN */
 
-/** Position of the PFN field within the PTE (subset of the PTFN). */
-#define HV_PTE_INDEX_PFN (HV_PTE_INDEX_PTFN + (HV_LOG2_PAGE_SIZE_SMALL - \
-                                               HV_LOG2_PAGE_TABLE_ALIGN))
-
-/** Length of the PFN field within the PTE (subset of the PTFN). */
-#define HV_PTE_INDEX_PFN_BITS (HV_PTE_INDEX_PTFN_BITS - \
-                               (HV_LOG2_PAGE_SIZE_SMALL - \
-                                HV_LOG2_PAGE_TABLE_ALIGN))
-
 /*
  * Legal values for the PTE's mode field
  */
@@ -2245,40 +2260,11 @@ hv_pte_set_mode(HV_PTE pte, unsigned int val)
  *
  * This field contains the upper bits of the CPA (client physical
  * address) of the target page; the complete CPA is this field with
- * HV_LOG2_PAGE_SIZE_SMALL zero bits appended to it.
+ * HV_LOG2_PAGE_TABLE_ALIGN zero bits appended to it.
  *
- * For PTEs in a level-1 page table where the Page bit is set, the
- * CPA must be aligned modulo the large page size.
- */
-static __inline unsigned int
-hv_pte_get_pfn(const HV_PTE pte)
-{
-  return pte.val >> HV_PTE_INDEX_PFN;
-}
-
-
-/** Set the page frame number into a PTE.  See hv_pte_get_pfn. */
-static __inline HV_PTE
-hv_pte_set_pfn(HV_PTE pte, unsigned int val)
-{
-  /*
-   * Note that the use of "PTFN" in the next line is intentional; we
-   * don't want any garbage lower bits left in that field.
-   */
-  pte.val &= ~(((1ULL << HV_PTE_PTFN_BITS) - 1) << HV_PTE_INDEX_PTFN);
-  pte.val |= (__hv64) val << HV_PTE_INDEX_PFN;
-  return pte;
-}
-
-/** Get the page table frame number from the PTE.
- *
- * This field contains the upper bits of the CPA (client physical
- * address) of the target page table; the complete CPA is this field with
- * with HV_PAGE_TABLE_ALIGN zero bits appended to it.
- *
- * For PTEs in a level-1 page table when the Page bit is not set, the
- * CPA must be aligned modulo the sticter of HV_PAGE_TABLE_ALIGN and
- * the level-2 page table size.
+ * For all PTEs in the lowest-level page table, and for all PTEs with
+ * the Page bit set in all page tables, the CPA must be aligned modulo
+ * the relevant page size.
  */
 static __inline unsigned long
 hv_pte_get_ptfn(const HV_PTE pte)
@@ -2286,7 +2272,6 @@ hv_pte_get_ptfn(const HV_PTE pte)
   return pte.val >> HV_PTE_INDEX_PTFN;
 }
 
-
 /** Set the page table frame number into a PTE.  See hv_pte_get_ptfn. */
 static __inline HV_PTE
 hv_pte_set_ptfn(HV_PTE pte, unsigned long val)
@@ -2296,6 +2281,20 @@ hv_pte_set_ptfn(HV_PTE pte, unsigned long val)
   return pte;
 }
 
+/** Get the client physical address from the PTE.  See hv_pte_set_ptfn. */
+static __inline HV_PhysAddr
+hv_pte_get_pa(const HV_PTE pte)
+{
+  return (__hv64) hv_pte_get_ptfn(pte) << HV_LOG2_PAGE_TABLE_ALIGN;
+}
+
+/** Set the client physical address into a PTE.  See hv_pte_get_ptfn. */
+static __inline HV_PTE
+hv_pte_set_pa(HV_PTE pte, HV_PhysAddr pa)
+{
+  return hv_pte_set_ptfn(pte, pa >> HV_LOG2_PAGE_TABLE_ALIGN);
+}
+
 
 /** Get the remote tile caching this page.
  *
@@ -2331,28 +2330,20 @@ hv_pte_set_lotar(HV_PTE pte, unsigned int val)
 
 #endif  /* !__ASSEMBLER__ */
 
-/** Converts a client physical address to a pfn. */
-#define HV_CPA_TO_PFN(p) ((p) >> HV_LOG2_PAGE_SIZE_SMALL)
-
-/** Converts a pfn to a client physical address. */
-#define HV_PFN_TO_CPA(p) (((HV_PhysAddr)(p)) << HV_LOG2_PAGE_SIZE_SMALL)
-
 /** Converts a client physical address to a ptfn. */
 #define HV_CPA_TO_PTFN(p) ((p) >> HV_LOG2_PAGE_TABLE_ALIGN)
 
 /** Converts a ptfn to a client physical address. */
 #define HV_PTFN_TO_CPA(p) (((HV_PhysAddr)(p)) << HV_LOG2_PAGE_TABLE_ALIGN)
 
-/** Converts a ptfn to a pfn. */
-#define HV_PTFN_TO_PFN(p) \
-  ((p) >> (HV_LOG2_PAGE_SIZE_SMALL - HV_LOG2_PAGE_TABLE_ALIGN))
-
-/** Converts a pfn to a ptfn. */
-#define HV_PFN_TO_PTFN(p) \
-  ((p) << (HV_LOG2_PAGE_SIZE_SMALL - HV_LOG2_PAGE_TABLE_ALIGN))
-
 #if CHIP_VA_WIDTH() > 32
 
+/*
+ * Note that we currently do not allow customizing the page size
+ * of the L0 pages, but fix them at 4GB, so we do not use the
+ * "_HV_xxx" nomenclature for the L0 macros.
+ */
+
 /** Log number of HV_PTE entries in L0 page table */
 #define HV_LOG2_L0_ENTRIES (CHIP_VA_WIDTH() - HV_LOG2_L1_SPAN)
 
@@ -2382,69 +2373,104 @@ hv_pte_set_lotar(HV_PTE pte, unsigned int val)
 #endif /* CHIP_VA_WIDTH() > 32 */
 
 /** Log number of HV_PTE entries in L1 page table */
-#define HV_LOG2_L1_ENTRIES (HV_LOG2_L1_SPAN - HV_LOG2_PAGE_SIZE_LARGE)
+#define _HV_LOG2_L1_ENTRIES(log2_page_size_large) \
+  (HV_LOG2_L1_SPAN - log2_page_size_large)
 
 /** Number of HV_PTE entries in L1 page table */
-#define HV_L1_ENTRIES (1 << HV_LOG2_L1_ENTRIES)
+#define _HV_L1_ENTRIES(log2_page_size_large) \
+  (1 << _HV_LOG2_L1_ENTRIES(log2_page_size_large))
 
 /** Log size of L1 page table in bytes */
-#define HV_LOG2_L1_SIZE (HV_LOG2_PTE_SIZE + HV_LOG2_L1_ENTRIES)
+#define _HV_LOG2_L1_SIZE(log2_page_size_large) \
+  (HV_LOG2_PTE_SIZE + _HV_LOG2_L1_ENTRIES(log2_page_size_large))
 
 /** Size of L1 page table in bytes */
-#define HV_L1_SIZE (1 << HV_LOG2_L1_SIZE)
+#define _HV_L1_SIZE(log2_page_size_large) \
+  (1 << _HV_LOG2_L1_SIZE(log2_page_size_large))
 
 /** Log number of HV_PTE entries in level-2 page table */
-#define HV_LOG2_L2_ENTRIES (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL)
+#define _HV_LOG2_L2_ENTRIES(log2_page_size_large, log2_page_size_small) \
+  (log2_page_size_large - log2_page_size_small)
 
 /** Number of HV_PTE entries in level-2 page table */
-#define HV_L2_ENTRIES (1 << HV_LOG2_L2_ENTRIES)
+#define _HV_L2_ENTRIES(log2_page_size_large, log2_page_size_small) \
+  (1 << _HV_LOG2_L2_ENTRIES(log2_page_size_large, log2_page_size_small))
 
 /** Log size of level-2 page table in bytes */
-#define HV_LOG2_L2_SIZE (HV_LOG2_PTE_SIZE + HV_LOG2_L2_ENTRIES)
+#define _HV_LOG2_L2_SIZE(log2_page_size_large, log2_page_size_small) \
+  (HV_LOG2_PTE_SIZE + \
+   _HV_LOG2_L2_ENTRIES(log2_page_size_large, log2_page_size_small))
 
 /** Size of level-2 page table in bytes */
-#define HV_L2_SIZE (1 << HV_LOG2_L2_SIZE)
+#define _HV_L2_SIZE(log2_page_size_large, log2_page_size_small) \
+  (1 << _HV_LOG2_L2_SIZE(log2_page_size_large, log2_page_size_small))
 
 #ifdef __ASSEMBLER__
 
 #if CHIP_VA_WIDTH() > 32
 
 /** Index in L1 for a specific VA */
-#define HV_L1_INDEX(va) \
-  (((va) >> HV_LOG2_PAGE_SIZE_LARGE) & (HV_L1_ENTRIES - 1))
+#define _HV_L1_INDEX(va, log2_page_size_large) \
+  (((va) >> log2_page_size_large) & (_HV_L1_ENTRIES(log2_page_size_large) - 1))
 
 #else /* CHIP_VA_WIDTH() > 32 */
 
 /** Index in L1 for a specific VA */
-#define HV_L1_INDEX(va) \
-  (((va) >> HV_LOG2_PAGE_SIZE_LARGE))
+#define _HV_L1_INDEX(va, log2_page_size_large) \
+  (((va) >> log2_page_size_large))
 
 #endif /* CHIP_VA_WIDTH() > 32 */
 
 /** Index in level-2 page table for a specific VA */
-#define HV_L2_INDEX(va) \
-  (((va) >> HV_LOG2_PAGE_SIZE_SMALL) & (HV_L2_ENTRIES - 1))
+#define _HV_L2_INDEX(va, log2_page_size_large, log2_page_size_small) \
+  (((va) >> log2_page_size_small) & \
+   (_HV_L2_ENTRIES(log2_page_size_large, log2_page_size_small) - 1))
 
 #else /* __ASSEMBLER __ */
 
 #if CHIP_VA_WIDTH() > 32
 
 /** Index in L1 for a specific VA */
-#define HV_L1_INDEX(va) \
-  (((HV_VirtAddr)(va) >> HV_LOG2_PAGE_SIZE_LARGE) & (HV_L1_ENTRIES - 1))
+#define _HV_L1_INDEX(va, log2_page_size_large) \
+  (((HV_VirtAddr)(va) >> log2_page_size_large) & \
+   (_HV_L1_ENTRIES(log2_page_size_large) - 1))
 
 #else /* CHIP_VA_WIDTH() > 32 */
 
 /** Index in L1 for a specific VA */
-#define HV_L1_INDEX(va) \
-  (((HV_VirtAddr)(va) >> HV_LOG2_PAGE_SIZE_LARGE))
+#define _HV_L1_INDEX(va, log2_page_size_large) \
+  (((HV_VirtAddr)(va) >> log2_page_size_large))
 
 #endif /* CHIP_VA_WIDTH() > 32 */
 
 /** Index in level-2 page table for a specific VA */
-#define HV_L2_INDEX(va) \
-  (((HV_VirtAddr)(va) >> HV_LOG2_PAGE_SIZE_SMALL) & (HV_L2_ENTRIES - 1))
+#define _HV_L2_INDEX(va, log2_page_size_large, log2_page_size_small) \
+  (((HV_VirtAddr)(va) >> log2_page_size_small) & \
+   (_HV_L2_ENTRIES(log2_page_size_large, log2_page_size_small) - 1))
 
 #endif /* __ASSEMBLER __ */
 
-#endif /* _TILE_HV_H */
+/** Position of the PFN field within the PTE (subset of the PTFN). */
+#define _HV_PTE_INDEX_PFN(log2_page_size) \
+  (HV_PTE_INDEX_PTFN + (log2_page_size - HV_LOG2_PAGE_TABLE_ALIGN))
+
+/** Length of the PFN field within the PTE (subset of the PTFN). */
+#define _HV_PTE_INDEX_PFN_BITS(log2_page_size) \
+  (HV_PTE_INDEX_PTFN_BITS - (log2_page_size - HV_LOG2_PAGE_TABLE_ALIGN))
+
+/** Converts a client physical address to a pfn. */
+#define _HV_CPA_TO_PFN(p, log2_page_size) ((p) >> log2_page_size)
+
+/** Converts a pfn to a client physical address. */
+#define _HV_PFN_TO_CPA(p, log2_page_size) \
+  (((HV_PhysAddr)(p)) << log2_page_size)
+
+/** Converts a ptfn to a pfn. */
+#define _HV_PTFN_TO_PFN(p, log2_page_size) \
+  ((p) >> (log2_page_size - HV_LOG2_PAGE_TABLE_ALIGN))
+
+/** Converts a pfn to a ptfn. */
+#define _HV_PFN_TO_PTFN(p, log2_page_size) \
+  ((p) << (log2_page_size - HV_LOG2_PAGE_TABLE_ALIGN))
+
+#endif /* _HV_HV_H */
diff --git a/arch/tile/kernel/head_32.S b/arch/tile/kernel/head_32.S
index 1a39b7c1c87e..f71bfeeaf1a9 100644
--- a/arch/tile/kernel/head_32.S
+++ b/arch/tile/kernel/head_32.S
@@ -69,7 +69,7 @@ ENTRY(_start)
 	}
 	{
 	  moveli lr, lo16(1f)
-	  move r5, zero
+	  moveli r5, CTX_PAGE_FLAG
 	}
 	{
 	  auli lr, lr, ha16(1f)
@@ -141,11 +141,11 @@ ENTRY(empty_zero_page)
 
 	.macro PTE va, cpa, bits1, no_org=0
 	.ifeq \no_org
-	.org swapper_pg_dir + HV_L1_INDEX(\va) * HV_PTE_SIZE
+	.org swapper_pg_dir + PGD_INDEX(\va) * HV_PTE_SIZE
 	.endif
 	.word HV_PTE_PAGE | HV_PTE_DIRTY | HV_PTE_PRESENT | HV_PTE_ACCESSED | \
 	      (HV_PTE_MODE_CACHE_NO_L3 << HV_PTE_INDEX_MODE)
-	.word (\bits1) | (HV_CPA_TO_PFN(\cpa) << (HV_PTE_INDEX_PFN - 32))
+	.word (\bits1) | (HV_CPA_TO_PTFN(\cpa) << (HV_PTE_INDEX_PTFN - 32))
 	.endm
 
 __PAGE_ALIGNED_DATA
@@ -166,7 +166,7 @@ ENTRY(swapper_pg_dir)
 	/* The true text VAs are mapped as VA = PA + MEM_SV_INTRPT */
 	PTE MEM_SV_INTRPT, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
 			      (1 << (HV_PTE_INDEX_EXECUTABLE - 32))
-	.org swapper_pg_dir + HV_L1_SIZE
+	.org swapper_pg_dir + PGDIR_SIZE
 	END(swapper_pg_dir)
 
 	/*
diff --git a/arch/tile/kernel/head_64.S b/arch/tile/kernel/head_64.S
index 6bc3a932fe45..f9a2734f7b82 100644
--- a/arch/tile/kernel/head_64.S
+++ b/arch/tile/kernel/head_64.S
@@ -114,7 +114,7 @@ ENTRY(_start)
 	  shl16insli r0, r0, hw0(swapper_pg_dir - PAGE_OFFSET)
 	}
 	{
-	  move r3, zero
+	  moveli r3, CTX_PAGE_FLAG
 	  j hv_install_context
 	}
 1:
@@ -210,19 +210,19 @@ ENTRY(empty_zero_page)
 	.macro PTE cpa, bits1
 	.quad HV_PTE_PAGE | HV_PTE_DIRTY | HV_PTE_PRESENT | HV_PTE_ACCESSED |\
 	      HV_PTE_GLOBAL | (HV_PTE_MODE_CACHE_NO_L3 << HV_PTE_INDEX_MODE) |\
-	      (\bits1) | (HV_CPA_TO_PFN(\cpa) << HV_PTE_INDEX_PFN)
+	      (\bits1) | (HV_CPA_TO_PTFN(\cpa) << HV_PTE_INDEX_PTFN)
 	.endm
 
 __PAGE_ALIGNED_DATA
 	.align PAGE_SIZE
 ENTRY(swapper_pg_dir)
-	.org swapper_pg_dir + HV_L0_INDEX(PAGE_OFFSET) * HV_PTE_SIZE
+	.org swapper_pg_dir + PGD_INDEX(PAGE_OFFSET) * HV_PTE_SIZE
 .Lsv_data_pmd:
 	.quad 0  /* PTE temp_data_pmd - PAGE_OFFSET, 0 */
-	.org swapper_pg_dir + HV_L0_INDEX(MEM_SV_START) * HV_PTE_SIZE
+	.org swapper_pg_dir + PGD_INDEX(MEM_SV_START) * HV_PTE_SIZE
 .Lsv_code_pmd:
 	.quad 0  /* PTE temp_code_pmd - PAGE_OFFSET, 0 */
-	.org swapper_pg_dir + HV_L0_SIZE
+	.org swapper_pg_dir + SIZEOF_PGD
 	END(swapper_pg_dir)
 
 	.align HV_PAGE_TABLE_ALIGN
@@ -233,11 +233,11 @@ ENTRY(temp_data_pmd)
 	 * permissions later.
 	 */
 	.set addr, 0
-	.rept HV_L1_ENTRIES
+	.rept PTRS_PER_PMD
 	PTE addr, HV_PTE_READABLE | HV_PTE_WRITABLE
-	.set addr, addr + HV_PAGE_SIZE_LARGE
+	.set addr, addr + HPAGE_SIZE
 	.endr
-	.org temp_data_pmd + HV_L1_SIZE
+	.org temp_data_pmd + SIZEOF_PMD
 	END(temp_data_pmd)
 
 	.align HV_PAGE_TABLE_ALIGN
@@ -248,11 +248,11 @@ ENTRY(temp_code_pmd)
 	 * permissions later.
 	 */
 	.set addr, 0
-	.rept HV_L1_ENTRIES
+	.rept PTRS_PER_PMD
 	PTE addr, HV_PTE_READABLE | HV_PTE_EXECUTABLE
-	.set addr, addr + HV_PAGE_SIZE_LARGE
+	.set addr, addr + HPAGE_SIZE
 	.endr
-	.org temp_code_pmd + HV_L1_SIZE
+	.org temp_code_pmd + SIZEOF_PMD
 	END(temp_code_pmd)
 
 	/*
diff --git a/arch/tile/kernel/machine_kexec.c b/arch/tile/kernel/machine_kexec.c
index 6255f2eab112..b0fa37c1a521 100644
--- a/arch/tile/kernel/machine_kexec.c
+++ b/arch/tile/kernel/machine_kexec.c
@@ -251,6 +251,7 @@ static void setup_quasi_va_is_pa(void)
 void machine_kexec(struct kimage *image)
 {
 	void *reboot_code_buffer;
+	pte_t *ptep;
 	void (*rnk)(unsigned long, void *, unsigned long)
 		__noreturn;
 
@@ -266,8 +267,10 @@ void machine_kexec(struct kimage *image)
 	 */
 	homecache_change_page_home(image->control_code_page, 0,
 				   smp_processor_id());
-	reboot_code_buffer = vmap(&image->control_code_page, 1, 0,
-				  __pgprot(_PAGE_KERNEL | _PAGE_EXECUTABLE));
+	reboot_code_buffer = page_address(image->control_code_page);
+	BUG_ON(reboot_code_buffer == NULL);
+	ptep = virt_to_pte(NULL, (unsigned long)reboot_code_buffer);
+	__set_pte(ptep, pte_mkexec(*ptep));
 	memcpy(reboot_code_buffer, relocate_new_kernel,
 	       relocate_new_kernel_size);
 	__flush_icache_range(
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index bff23f476110..32948e21113a 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -1396,13 +1396,13 @@ void __init setup_per_cpu_areas(void)
 		for (i = 0; i < size; i += PAGE_SIZE, ++pfn, ++pg) {
 
 			/* Update the vmalloc mapping and page home. */
-			pte_t *ptep =
-				virt_to_pte(NULL, (unsigned long)ptr + i);
+			unsigned long addr = (unsigned long)ptr + i;
+			pte_t *ptep = virt_to_pte(NULL, addr);
 			pte_t pte = *ptep;
 			BUG_ON(pfn != pte_pfn(pte));
 			pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_TILE_L3);
 			pte = set_remote_cache_cpu(pte, cpu);
-			set_pte(ptep, pte);
+			set_pte_at(&init_mm, addr, ptep, pte);
 
 			/* Update the lowmem mapping for consistency. */
 			lowmem_va = (unsigned long)pfn_to_kaddr(pfn);
@@ -1415,7 +1415,7 @@ void __init setup_per_cpu_areas(void)
 				BUG_ON(pte_huge(*ptep));
 			}
 			BUG_ON(pfn != pte_pfn(*ptep));
-			set_pte(ptep, pte);
+			set_pte_at(&init_mm, lowmem_va, ptep, pte);
 		}
 	}
 
diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c
index 91da0f721958..cbc73a8b8fe1 100644
--- a/arch/tile/kernel/smp.c
+++ b/arch/tile/kernel/smp.c
@@ -203,7 +203,7 @@ void __init ipi_init(void)
 		if (hv_get_ipi_pte(tile, KERNEL_PL, &pte) != 0)
 			panic("Failed to initialize IPI for cpu %d\n", cpu);
 
-		offset = hv_pte_get_pfn(pte) << PAGE_SHIFT;
+		offset = PFN_PHYS(pte_pfn(pte));
 		ipi_mappings[cpu] = ioremap_prot(offset, PAGE_SIZE, pte);
 	}
 #endif
diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c
index b2fe15e01075..3bc4b4e40d93 100644
--- a/arch/tile/lib/memcpy_tile64.c
+++ b/arch/tile/lib/memcpy_tile64.c
@@ -160,7 +160,7 @@ retry_source:
 			break;
 		if (get_remote_cache_cpu(src_pte) == smp_processor_id())
 			break;
-		src_page = pfn_to_page(hv_pte_get_pfn(src_pte));
+		src_page = pfn_to_page(pte_pfn(src_pte));
 		get_page(src_page);
 		if (pte_val(src_pte) != pte_val(*src_ptep)) {
 			put_page(src_page);
@@ -168,7 +168,7 @@ retry_source:
 		}
 		if (pte_huge(src_pte)) {
 			/* Adjust the PTE to correspond to a small page */
-			int pfn = hv_pte_get_pfn(src_pte);
+			int pfn = pte_pfn(src_pte);
 			pfn += (((unsigned long)source & (HPAGE_SIZE-1))
 				>> PAGE_SHIFT);
 			src_pte = pfn_pte(pfn, src_pte);
@@ -188,7 +188,7 @@ retry_dest:
 			put_page(src_page);
 			break;
 		}
-		dst_page = pfn_to_page(hv_pte_get_pfn(dst_pte));
+		dst_page = pfn_to_page(pte_pfn(dst_pte));
 		if (dst_page == src_page) {
 			/*
 			 * Source and dest are on the same page; this
@@ -206,7 +206,7 @@ retry_dest:
 		}
 		if (pte_huge(dst_pte)) {
 			/* Adjust the PTE to correspond to a small page */
-			int pfn = hv_pte_get_pfn(dst_pte);
+			int pfn = pte_pfn(dst_pte);
 			pfn += (((unsigned long)dest & (HPAGE_SIZE-1))
 				>> PAGE_SHIFT);
 			dst_pte = pfn_pte(pfn, dst_pte);
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index 1e4633520b35..c04fbfd93fc5 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -82,7 +82,7 @@ static int num_l2_ptes[MAX_NUMNODES];
 
 static void init_prealloc_ptes(int node, int pages)
 {
-	BUG_ON(pages & (HV_L2_ENTRIES-1));
+	BUG_ON(pages & (PTRS_PER_PTE - 1));
 	if (pages) {
 		num_l2_ptes[node] = pages;
 		l2_ptes[node] = __alloc_bootmem(pages * sizeof(pte_t),
@@ -131,14 +131,9 @@ static void __init assign_pte(pmd_t *pmd, pte_t *page_table)
 
 #ifdef __tilegx__
 
-#if HV_L1_SIZE != HV_L2_SIZE
-# error Rework assumption that L1 and L2 page tables are same size.
-#endif
-
-/* Since pmd_t arrays and pte_t arrays are the same size, just use casts. */
 static inline pmd_t *alloc_pmd(void)
 {
-	return (pmd_t *)alloc_pte();
+	return __alloc_bootmem(L1_KERNEL_PGTABLE_SIZE, HV_PAGE_TABLE_ALIGN, 0);
 }
 
 static inline void assign_pmd(pud_t *pud, pmd_t *pmd)
@@ -811,7 +806,7 @@ void __init paging_init(void)
 	 * changing init_mm once we get up and running, and there's no
 	 * need for e.g. vmalloc_sync_all().
 	 */
-	BUILD_BUG_ON(pgd_index(VMALLOC_START) != pgd_index(VMALLOC_END));
+	BUILD_BUG_ON(pgd_index(VMALLOC_START) != pgd_index(VMALLOC_END - 1));
 	pud = pud_offset(pgd_base + pgd_index(VMALLOC_START), VMALLOC_START);
 	assign_pmd(pud, alloc_pmd());
 #endif
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 2410aa899b3e..3d7074347e6d 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -289,13 +289,12 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 
 #define L2_USER_PGTABLE_PAGES (1 << L2_USER_PGTABLE_ORDER)
 
-struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
+struct page *pgtable_alloc_one(struct mm_struct *mm, unsigned long address,
+			       int order)
 {
 	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO;
 	struct page *p;
-#if L2_USER_PGTABLE_ORDER > 0
 	int i;
-#endif
 
 #ifdef CONFIG_HIGHPTE
 	flags |= __GFP_HIGHMEM;
@@ -305,17 +304,15 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 	if (p == NULL)
 		return NULL;
 
-#if L2_USER_PGTABLE_ORDER > 0
 	/*
 	 * Make every page have a page_count() of one, not just the first.
 	 * We don't use __GFP_COMP since it doesn't look like it works
 	 * correctly with tlb_remove_page().
 	 */
-	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
+	for (i = 1; i < order; ++i) {
 		init_page_count(p+i);
 		inc_zone_page_state(p+i, NR_PAGETABLE);
 	}
-#endif
 
 	pgtable_page_ctor(p);
 	return p;
@@ -326,28 +323,28 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
  * process).  We have to correct whatever pte_alloc_one() did before
  * returning the pages to the allocator.
  */
-void pte_free(struct mm_struct *mm, struct page *p)
+void pgtable_free(struct mm_struct *mm, struct page *p, int order)
 {
 	int i;
 
 	pgtable_page_dtor(p);
 	__free_page(p);
 
-	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
+	for (i = 1; i < order; ++i) {
 		__free_page(p+i);
 		dec_zone_page_state(p+i, NR_PAGETABLE);
 	}
 }
 
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
-		    unsigned long address)
+void __pgtable_free_tlb(struct mmu_gather *tlb, struct page *pte,
+			unsigned long address, int order)
 {
 	int i;
 
 	pgtable_page_dtor(pte);
 	tlb_remove_page(tlb, pte);
 
-	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
+	for (i = 1; i < order; ++i) {
 		tlb_remove_page(tlb, pte + i);
 		dec_zone_page_state(pte + i, NR_PAGETABLE);
 	}
@@ -490,7 +487,7 @@ void set_pte(pte_t *ptep, pte_t pte)
 /* Can this mm load a PTE with cached_priority set? */
 static inline int mm_is_priority_cached(struct mm_struct *mm)
 {
-	return mm->context.priority_cached;
+	return mm->context.priority_cached != 0;
 }
 
 /*
@@ -500,8 +497,8 @@ static inline int mm_is_priority_cached(struct mm_struct *mm)
 void start_mm_caching(struct mm_struct *mm)
 {
 	if (!mm_is_priority_cached(mm)) {
-		mm->context.priority_cached = -1U;
-		hv_set_caching(-1U);
+		mm->context.priority_cached = -1UL;
+		hv_set_caching(-1UL);
 	}
 }
 
@@ -516,7 +513,7 @@ void start_mm_caching(struct mm_struct *mm)
  * Presumably we'll come back later and have more luck and clear
  * the value then; for now we'll just keep the cache marked for priority.
  */
-static unsigned int update_priority_cached(struct mm_struct *mm)
+static unsigned long update_priority_cached(struct mm_struct *mm)
 {
 	if (mm->context.priority_cached && down_write_trylock(&mm->mmap_sem)) {
 		struct vm_area_struct *vm;
-- 
cgit v1.2.3


From 621b19551507c8fd9d721f4038509c5bb155a983 Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@tilera.com>
Date: Sun, 1 Apr 2012 14:04:21 -0400
Subject: arch/tile: support multiple huge page sizes dynamically

This change adds support for a new "super" bit in the PTE, using the new
arch_make_huge_pte() method.  The Tilera hypervisor sees the bit set at a
given level of the page table and gangs together 4, 16, or 64 consecutive
pages from that level of the hierarchy to create a larger TLB entry.

One extra "super" page size can be specified at each of the three levels
of the page table hierarchy on tilegx, using the "hugepagesz" argument
on the boot command line.  A new hypervisor API is added to allow Linux
to tell the hypervisor how many PTEs to gang together at each level of
the page table.

To allow pre-allocating huge pages larger than the buddy allocator can
handle, this change modifies the Tilera bootmem support to put all of
memory on tilegx platforms into bootmem.

As part of this change I eliminate the vestigial CONFIG_HIGHPTE support,
which never worked anyway, and eliminate the hv_page_size() API in favor
of the standard vma_kernel_pagesize() API.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 arch/tile/Kconfig                 |   8 ++
 arch/tile/include/asm/hugetlb.h   |  21 +++
 arch/tile/include/asm/page.h      |   5 +-
 arch/tile/include/asm/pgtable.h   |  12 +-
 arch/tile/include/asm/tlbflush.h  |  17 +--
 arch/tile/include/hv/hypervisor.h |  70 +++++++++-
 arch/tile/kernel/hvglue.lds       |   3 +-
 arch/tile/kernel/proc.c           |   1 +
 arch/tile/kernel/setup.c          | 161 ++++++++++++++-------
 arch/tile/kernel/tlb.c            |  11 +-
 arch/tile/mm/fault.c              |   2 +-
 arch/tile/mm/homecache.c          |   1 +
 arch/tile/mm/hugetlbpage.c        | 285 +++++++++++++++++++++++++++++---------
 arch/tile/mm/init.c               |   4 +
 arch/tile/mm/pgtable.c            |  13 --
 15 files changed, 456 insertions(+), 158 deletions(-)

(limited to 'arch/tile/mm/init.c')

diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 38c3957e0b40..cc5664286a1c 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -47,6 +47,14 @@ config NEED_PER_CPU_PAGE_FIRST_CHUNK
 config SYS_SUPPORTS_HUGETLBFS
 	def_bool y
 
+# Support for additional huge page sizes besides HPAGE_SIZE.
+# The software support is currently only present in the TILE-Gx
+# hypervisor. TILEPro in any case does not support page sizes
+# larger than the default HPAGE_SIZE.
+config HUGETLB_SUPER_PAGES
+	depends on HUGETLB_PAGE && TILEGX
+	def_bool y
+
 config GENERIC_CLOCKEVENTS
 	def_bool y
 
diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/hugetlb.h
index d396d1805163..b2042380a5aa 100644
--- a/arch/tile/include/asm/hugetlb.h
+++ b/arch/tile/include/asm/hugetlb.h
@@ -106,4 +106,25 @@ static inline void arch_release_hugepage(struct page *page)
 {
 }
 
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
+				       struct page *page, int writable)
+{
+	size_t pagesize = huge_page_size(hstate_vma(vma));
+	if (pagesize != PUD_SIZE && pagesize != PMD_SIZE)
+		entry = pte_mksuper(entry);
+	return entry;
+}
+#define arch_make_huge_pte arch_make_huge_pte
+
+/* Sizes to scale up page size for PTEs with HV_PTE_SUPER bit. */
+enum {
+	HUGE_SHIFT_PGDIR = 0,
+	HUGE_SHIFT_PMD = 1,
+	HUGE_SHIFT_PAGE = 2,
+	HUGE_SHIFT_ENTRIES
+};
+extern int huge_shift[HUGE_SHIFT_ENTRIES];
+#endif
+
 #endif /* _ASM_TILE_HUGETLB_H */
diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h
index c750943f961e..9d9131e5c552 100644
--- a/arch/tile/include/asm/page.h
+++ b/arch/tile/include/asm/page.h
@@ -87,8 +87,7 @@ typedef HV_PTE pgprot_t;
 /*
  * User L2 page tables are managed as one L2 page table per page,
  * because we use the page allocator for them.  This keeps the allocation
- * simple and makes it potentially useful to implement HIGHPTE at some point.
- * However, it's also inefficient, since L2 page tables are much smaller
+ * simple, but it's also inefficient, since L2 page tables are much smaller
  * than pages (currently 2KB vs 64KB).  So we should revisit this.
  */
 typedef struct page *pgtable_t;
@@ -137,7 +136,7 @@ static inline __attribute_const__ int get_order(unsigned long size)
 
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
 
-#define HUGE_MAX_HSTATE		2
+#define HUGE_MAX_HSTATE		6
 
 #ifdef CONFIG_HUGETLB_PAGE
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index 319f4826d972..73b1a4c9ad03 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -71,6 +71,7 @@ extern void set_page_homes(void);
 
 #define _PAGE_PRESENT           HV_PTE_PRESENT
 #define _PAGE_HUGE_PAGE         HV_PTE_PAGE
+#define _PAGE_SUPER_PAGE        HV_PTE_SUPER
 #define _PAGE_READABLE          HV_PTE_READABLE
 #define _PAGE_WRITABLE          HV_PTE_WRITABLE
 #define _PAGE_EXECUTABLE        HV_PTE_EXECUTABLE
@@ -87,6 +88,7 @@ extern void set_page_homes(void);
 #define _PAGE_ALL (\
   _PAGE_PRESENT | \
   _PAGE_HUGE_PAGE | \
+  _PAGE_SUPER_PAGE | \
   _PAGE_READABLE | \
   _PAGE_WRITABLE | \
   _PAGE_EXECUTABLE | \
@@ -197,6 +199,7 @@ static inline void __pte_clear(pte_t *ptep)
 #define pte_write hv_pte_get_writable
 #define pte_exec hv_pte_get_executable
 #define pte_huge hv_pte_get_page
+#define pte_super hv_pte_get_super
 #define pte_rdprotect hv_pte_clear_readable
 #define pte_exprotect hv_pte_clear_executable
 #define pte_mkclean hv_pte_clear_dirty
@@ -209,6 +212,7 @@ static inline void __pte_clear(pte_t *ptep)
 #define pte_mkyoung hv_pte_set_accessed
 #define pte_mkwrite hv_pte_set_writable
 #define pte_mkhuge hv_pte_set_page
+#define pte_mksuper hv_pte_set_super
 
 #define pte_special(pte) 0
 #define pte_mkspecial(pte) (pte)
@@ -338,13 +342,8 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
  */
 #define pgd_offset_k(address) pgd_offset(&init_mm, address)
 
-#if defined(CONFIG_HIGHPTE)
-extern pte_t *pte_offset_map(pmd_t *, unsigned long address);
-#define pte_unmap(pte) kunmap_atomic(pte)
-#else
 #define pte_offset_map(dir, address) pte_offset_kernel(dir, address)
 #define pte_unmap(pte) do { } while (0)
-#endif
 
 /* Clear a non-executable kernel PTE and flush it from the TLB. */
 #define kpte_clear_flush(ptep, vaddr)		\
@@ -537,7 +536,8 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
 /* Support /proc/NN/pgtable API. */
 struct seq_file;
 int arch_proc_pgtable_show(struct seq_file *m, struct mm_struct *mm,
-			   unsigned long vaddr, pte_t *ptep, void **datap);
+			   unsigned long vaddr, unsigned long pagesize,
+			   pte_t *ptep, void **datap);
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/tile/include/asm/tlbflush.h b/arch/tile/include/asm/tlbflush.h
index 96199d214fb8..dcf91b25a1e5 100644
--- a/arch/tile/include/asm/tlbflush.h
+++ b/arch/tile/include/asm/tlbflush.h
@@ -38,16 +38,11 @@ DECLARE_PER_CPU(int, current_asid);
 /* The hypervisor tells us what ASIDs are available to us. */
 extern int min_asid, max_asid;
 
-static inline unsigned long hv_page_size(const struct vm_area_struct *vma)
-{
-	return (vma->vm_flags & VM_HUGETLB) ? HPAGE_SIZE : PAGE_SIZE;
-}
-
 /* Pass as vma pointer for non-executable mapping, if no vma available. */
-#define FLUSH_NONEXEC ((const struct vm_area_struct *)-1UL)
+#define FLUSH_NONEXEC ((struct vm_area_struct *)-1UL)
 
 /* Flush a single user page on this cpu. */
-static inline void local_flush_tlb_page(const struct vm_area_struct *vma,
+static inline void local_flush_tlb_page(struct vm_area_struct *vma,
 					unsigned long addr,
 					unsigned long page_size)
 {
@@ -60,7 +55,7 @@ static inline void local_flush_tlb_page(const struct vm_area_struct *vma,
 }
 
 /* Flush range of user pages on this cpu. */
-static inline void local_flush_tlb_pages(const struct vm_area_struct *vma,
+static inline void local_flush_tlb_pages(struct vm_area_struct *vma,
 					 unsigned long addr,
 					 unsigned long page_size,
 					 unsigned long len)
@@ -117,10 +112,10 @@ extern void flush_tlb_all(void);
 extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
 extern void flush_tlb_current_task(void);
 extern void flush_tlb_mm(struct mm_struct *);
-extern void flush_tlb_page(const struct vm_area_struct *, unsigned long);
-extern void flush_tlb_page_mm(const struct vm_area_struct *,
+extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
+extern void flush_tlb_page_mm(struct vm_area_struct *,
 			      struct mm_struct *, unsigned long);
-extern void flush_tlb_range(const struct vm_area_struct *,
+extern void flush_tlb_range(struct vm_area_struct *,
 			    unsigned long start, unsigned long end);
 
 #define flush_tlb()     flush_tlb_current_task()
diff --git a/arch/tile/include/hv/hypervisor.h b/arch/tile/include/hv/hypervisor.h
index f27871775b7a..85e5cab4c2f0 100644
--- a/arch/tile/include/hv/hypervisor.h
+++ b/arch/tile/include/hv/hypervisor.h
@@ -66,6 +66,22 @@
 #define HV_DEFAULT_PAGE_SIZE_LARGE \
   (__HV_SIZE_ONE << HV_LOG2_DEFAULT_PAGE_SIZE_LARGE)
 
+#if CHIP_VA_WIDTH() > 32
+
+/** The log2 of the initial size of jumbo pages, in bytes.
+ * See HV_DEFAULT_PAGE_SIZE_JUMBO.
+ */
+#define HV_LOG2_DEFAULT_PAGE_SIZE_JUMBO 32
+
+/** The initial size of jumbo pages, in bytes. This value should
+ * be verified at runtime by calling hv_sysconf(HV_SYSCONF_PAGE_SIZE_JUMBO).
+ * It may also be modified when installing a new context.
+ */
+#define HV_DEFAULT_PAGE_SIZE_JUMBO \
+  (__HV_SIZE_ONE << HV_LOG2_DEFAULT_PAGE_SIZE_JUMBO)
+
+#endif
+
 /** The log2 of the granularity at which page tables must be aligned;
  *  in other words, the CPA for a page table must have this many zero
  *  bits at the bottom of the address.
@@ -284,8 +300,11 @@
 #define HV_DISPATCH_GET_IPI_PTE                   56
 #endif
 
+/** hv_set_pte_super_shift */
+#define HV_DISPATCH_SET_PTE_SUPER_SHIFT           57
+
 /** One more than the largest dispatch value */
-#define _HV_DISPATCH_END                          57
+#define _HV_DISPATCH_END                          58
 
 
 #ifndef __ASSEMBLER__
@@ -413,6 +432,11 @@ typedef enum {
    */
   HV_SYSCONF_VALID_PAGE_SIZES = 7,
 
+  /** The size of jumbo pages, in bytes.
+   * If no jumbo pages are available, zero will be returned.
+   */
+  HV_SYSCONF_PAGE_SIZE_JUMBO = 8,
+
 } HV_SysconfQuery;
 
 /** Offset to subtract from returned Kelvin temperature to get degrees
@@ -695,6 +719,29 @@ int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,
 
 #ifndef __ASSEMBLER__
 
+
+/** Set the number of pages ganged together by HV_PTE_SUPER at a
+ * particular level of the page table.
+ *
+ * The current TILE-Gx hardware only supports powers of four
+ * (i.e. log2_count must be a multiple of two), and the requested
+ * "super" page size must be less than the span of the next level in
+ * the page table.  The largest size that can be requested is 64GB.
+ *
+ * The shift value is initially "0" for all page table levels,
+ * indicating that the HV_PTE_SUPER bit is effectively ignored.
+ *
+ * If you change the count from one non-zero value to another, the
+ * hypervisor will flush the entire TLB and TSB to avoid confusion.
+ *
+ * @param level Page table level (0, 1, or 2)
+ * @param log2_count Base-2 log of the number of pages to gang together,
+ * i.e. how much to shift left the base page size for the super page size.
+ * @return Zero on success, or a hypervisor error code on failure.
+ */
+int hv_set_pte_super_shift(int level, int log2_count);
+
+
 /** Value returned from hv_inquire_context(). */
 typedef struct
 {
@@ -1891,8 +1938,9 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
 #define HV_PTE_INDEX_USER            10  /**< Page is user-accessible */
 #define HV_PTE_INDEX_ACCESSED        11  /**< Page has been accessed */
 #define HV_PTE_INDEX_DIRTY           12  /**< Page has been written */
-                                         /*   Bits 13-15 are reserved for
+                                         /*   Bits 13-14 are reserved for
                                               future use. */
+#define HV_PTE_INDEX_SUPER           15  /**< Pages ganged together for TLB */
 #define HV_PTE_INDEX_MODE            16  /**< Page mode; see HV_PTE_MODE_xxx */
 #define HV_PTE_MODE_BITS              3  /**< Number of bits in mode */
 #define HV_PTE_INDEX_CLIENT2         19  /**< Page client state 2 */
@@ -1987,7 +2035,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
 
 /** Does this PTE map a page?
  *
- * If this bit is set in the level-1 page table, the entry should be
+ * If this bit is set in a level-0 page table, the entry should be
+ * interpreted as a level-2 page table entry mapping a jumbo page.
+ *
+ * If this bit is set in a level-1 page table, the entry should be
  * interpreted as a level-2 page table entry mapping a large page.
  *
  * This bit should not be modified by the client while PRESENT is set, as
@@ -1997,6 +2048,18 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
  */
 #define HV_PTE_PAGE                  (__HV_PTE_ONE << HV_PTE_INDEX_PAGE)
 
+/** Does this PTE implicitly reference multiple pages?
+ *
+ * If this bit is set in the page table (either in the level-2 page table,
+ * or in a higher level page table in conjunction with the PAGE bit)
+ * then the PTE specifies a range of contiguous pages, not a single page.
+ * The hv_set_pte_super_shift() allows you to specify the count for
+ * each level of the page table.
+ *
+ * Note: this bit is not supported on TILEPro systems.
+ */
+#define HV_PTE_SUPER                 (__HV_PTE_ONE << HV_PTE_INDEX_SUPER)
+
 /** Is this a global (non-ASID) mapping?
  *
  * If this bit is set, the translations established by this PTE will
@@ -2215,6 +2278,7 @@ hv_pte_clear_##name(HV_PTE pte)                                 \
  */
 _HV_BIT(present,         PRESENT)
 _HV_BIT(page,            PAGE)
+_HV_BIT(super,           SUPER)
 _HV_BIT(client0,         CLIENT0)
 _HV_BIT(client1,         CLIENT1)
 _HV_BIT(client2,         CLIENT2)
diff --git a/arch/tile/kernel/hvglue.lds b/arch/tile/kernel/hvglue.lds
index 2b7cd0a659a9..d44c5a67a1ed 100644
--- a/arch/tile/kernel/hvglue.lds
+++ b/arch/tile/kernel/hvglue.lds
@@ -55,4 +55,5 @@ hv_store_mapping = TEXT_OFFSET + 0x106a0;
 hv_inquire_realpa = TEXT_OFFSET + 0x106c0;
 hv_flush_all = TEXT_OFFSET + 0x106e0;
 hv_get_ipi_pte = TEXT_OFFSET + 0x10700;
-hv_glue_internals = TEXT_OFFSET + 0x10720;
+hv_set_pte_super_shift = TEXT_OFFSET + 0x10720;
+hv_glue_internals = TEXT_OFFSET + 0x10740;
diff --git a/arch/tile/kernel/proc.c b/arch/tile/kernel/proc.c
index 446a7f52cc11..dafc447b5125 100644
--- a/arch/tile/kernel/proc.c
+++ b/arch/tile/kernel/proc.c
@@ -22,6 +22,7 @@
 #include <linux/proc_fs.h>
 #include <linux/sysctl.h>
 #include <linux/hardirq.h>
+#include <linux/hugetlb.h>
 #include <linux/mman.h>
 #include <asm/unaligned.h>
 #include <asm/pgtable.h>
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index 32948e21113a..445c220eae51 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -28,6 +28,7 @@
 #include <linux/highmem.h>
 #include <linux/smp.h>
 #include <linux/timex.h>
+#include <linux/hugetlb.h>
 #include <asm/setup.h>
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
@@ -49,9 +50,6 @@ char chip_model[64] __write_once;
 struct pglist_data node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
-/* We only create bootmem data on node 0. */
-static bootmem_data_t __initdata node0_bdata;
-
 /* Information on the NUMA nodes that we compute early */
 unsigned long __cpuinitdata node_start_pfn[MAX_NUMNODES];
 unsigned long __cpuinitdata node_end_pfn[MAX_NUMNODES];
@@ -518,37 +516,96 @@ static void __init setup_memory(void)
 #endif
 }
 
-static void __init setup_bootmem_allocator(void)
+/*
+ * On 32-bit machines, we only put bootmem on the low controller,
+ * since PAs > 4GB can't be used in bootmem.  In principle one could
+ * imagine, e.g., multiple 1 GB controllers all of which could support
+ * bootmem, but in practice using controllers this small isn't a
+ * particularly interesting scenario, so we just keep it simple and
+ * use only the first controller for bootmem on 32-bit machines.
+ */
+static inline int node_has_bootmem(int nid)
 {
-	unsigned long bootmap_size, first_alloc_pfn, last_alloc_pfn;
+#ifdef CONFIG_64BIT
+	return 1;
+#else
+	return nid == 0;
+#endif
+}
 
-	/* Provide a node 0 bdata. */
-	NODE_DATA(0)->bdata = &node0_bdata;
+static inline unsigned long alloc_bootmem_pfn(int nid,
+					      unsigned long size,
+					      unsigned long goal)
+{
+	void *kva = __alloc_bootmem_node(NODE_DATA(nid), size,
+					 PAGE_SIZE, goal);
+	unsigned long pfn = kaddr_to_pfn(kva);
+	BUG_ON(goal && PFN_PHYS(pfn) != goal);
+	return pfn;
+}
 
-#ifdef CONFIG_PCI
-	/* Don't let boot memory alias the PCI region. */
-	last_alloc_pfn = min(max_low_pfn, pci_reserve_start_pfn);
+static void __init setup_bootmem_allocator_node(int i)
+{
+	unsigned long start, end, mapsize, mapstart;
+
+	if (node_has_bootmem(i)) {
+		NODE_DATA(i)->bdata = &bootmem_node_data[i];
+	} else {
+		/* Share controller zero's bdata for now. */
+		NODE_DATA(i)->bdata = &bootmem_node_data[0];
+		return;
+	}
+
+	/* Skip up to after the bss in node 0. */
+	start = (i == 0) ? min_low_pfn : node_start_pfn[i];
+
+	/* Only lowmem, if we're a HIGHMEM build. */
+#ifdef CONFIG_HIGHMEM
+	end = node_lowmem_end_pfn[i];
 #else
-	last_alloc_pfn = max_low_pfn;
+	end = node_end_pfn[i];
 #endif
 
-	/*
-	 * Initialize the boot-time allocator (with low memory only):
-	 * The first argument says where to put the bitmap, and the
-	 * second says where the end of allocatable memory is.
-	 */
-	bootmap_size = init_bootmem(min_low_pfn, last_alloc_pfn);
+	/* No memory here. */
+	if (end == start)
+		return;
+
+	/* Figure out where the bootmem bitmap is located. */
+	mapsize = bootmem_bootmap_pages(end - start);
+	if (i == 0) {
+		/* Use some space right before the heap on node 0. */
+		mapstart = start;
+		start += mapsize;
+	} else {
+		/* Allocate bitmap on node 0 to avoid page table issues. */
+		mapstart = alloc_bootmem_pfn(0, PFN_PHYS(mapsize), 0);
+	}
 
+	/* Initialize a node. */
+	init_bootmem_node(NODE_DATA(i), mapstart, start, end);
+
+	/* Free all the space back into the allocator. */
+	free_bootmem(PFN_PHYS(start), PFN_PHYS(end - start));
+
+#if defined(CONFIG_PCI)
 	/*
-	 * Let the bootmem allocator use all the space we've given it
-	 * except for its own bitmap.
+	 * Throw away any memory aliased by the PCI region.  FIXME: this
+	 * is a temporary hack to work around bug 10502, and needs to be
+	 * fixed properly.
 	 */
-	first_alloc_pfn = min_low_pfn + PFN_UP(bootmap_size);
-	if (first_alloc_pfn >= last_alloc_pfn)
-		early_panic("Not enough memory on controller 0 for bootmem\n");
+	if (pci_reserve_start_pfn < end && pci_reserve_end_pfn > start)
+		reserve_bootmem(PFN_PHYS(pci_reserve_start_pfn),
+				PFN_PHYS(pci_reserve_end_pfn -
+					 pci_reserve_start_pfn),
+				BOOTMEM_EXCLUSIVE);
+#endif
+}
 
-	free_bootmem(PFN_PHYS(first_alloc_pfn),
-		     PFN_PHYS(last_alloc_pfn - first_alloc_pfn));
+static void __init setup_bootmem_allocator(void)
+{
+	int i;
+	for (i = 0; i < MAX_NUMNODES; ++i)
+		setup_bootmem_allocator_node(i);
 
 #ifdef CONFIG_KEXEC
 	if (crashk_res.start != crashk_res.end)
@@ -579,14 +636,6 @@ static int __init percpu_size(void)
 	return size;
 }
 
-static inline unsigned long alloc_bootmem_pfn(int size, unsigned long goal)
-{
-	void *kva = __alloc_bootmem(size, PAGE_SIZE, goal);
-	unsigned long pfn = kaddr_to_pfn(kva);
-	BUG_ON(goal && PFN_PHYS(pfn) != goal);
-	return pfn;
-}
-
 static void __init zone_sizes_init(void)
 {
 	unsigned long zones_size[MAX_NR_ZONES] = { 0 };
@@ -624,21 +673,22 @@ static void __init zone_sizes_init(void)
 		 * though, there'll be no lowmem, so we just alloc_bootmem
 		 * the memmap.  There will be no percpu memory either.
 		 */
-		if (__pfn_to_highbits(start) == 0) {
-			/* In low PAs, allocate via bootmem. */
+		if (i != 0 && cpu_isset(i, isolnodes)) {
+			node_memmap_pfn[i] =
+				alloc_bootmem_pfn(0, memmap_size, 0);
+			BUG_ON(node_percpu[i] != 0);
+		} else if (node_has_bootmem(start)) {
 			unsigned long goal = 0;
 			node_memmap_pfn[i] =
-				alloc_bootmem_pfn(memmap_size, goal);
+				alloc_bootmem_pfn(i, memmap_size, 0);
 			if (kdata_huge)
 				goal = PFN_PHYS(lowmem_end) - node_percpu[i];
 			if (node_percpu[i])
 				node_percpu_pfn[i] =
-				    alloc_bootmem_pfn(node_percpu[i], goal);
-		} else if (cpu_isset(i, isolnodes)) {
-			node_memmap_pfn[i] = alloc_bootmem_pfn(memmap_size, 0);
-			BUG_ON(node_percpu[i] != 0);
+					alloc_bootmem_pfn(i, node_percpu[i],
+							  goal);
 		} else {
-			/* In high PAs, just reserve some pages. */
+			/* In non-bootmem zones, just reserve some pages. */
 			node_memmap_pfn[i] = node_free_pfn[i];
 			node_free_pfn[i] += PFN_UP(memmap_size);
 			if (!kdata_huge) {
@@ -662,16 +712,9 @@ static void __init zone_sizes_init(void)
 		zones_size[ZONE_NORMAL] = end - start;
 #endif
 
-		/*
-		 * Everyone shares node 0's bootmem allocator, but
-		 * we use alloc_remap(), above, to put the actual
-		 * struct page array on the individual controllers,
-		 * which is most of the data that we actually care about.
-		 * We can't place bootmem allocators on the other
-		 * controllers since the bootmem allocator can only
-		 * operate on 32-bit physical addresses.
-		 */
-		NODE_DATA(i)->bdata = NODE_DATA(0)->bdata;
+		/* Take zone metadata from controller 0 if we're isolnode. */
+		if (node_isset(i, isolnodes))
+			NODE_DATA(i)->bdata = &bootmem_node_data[0];
 
 		free_area_init_node(i, zones_size, start, NULL);
 		printk(KERN_DEBUG "  Normal zone: %ld per-cpu pages\n",
@@ -854,6 +897,22 @@ subsys_initcall(topology_init);
 
 #endif /* CONFIG_NUMA */
 
+/*
+ * Initialize hugepage support on this cpu.  We do this on all cores
+ * early in boot: before argument parsing for the boot cpu, and after
+ * argument parsing but before the init functions run on the secondaries.
+ * So the values we set up here in the hypervisor may be overridden on
+ * the boot cpu as arguments are parsed.
+ */
+static __cpuinit void init_super_pages(void)
+{
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+	int i;
+	for (i = 0; i < HUGE_SHIFT_ENTRIES; ++i)
+		hv_set_pte_super_shift(i, huge_shift[i]);
+#endif
+}
+
 /**
  * setup_cpu() - Do all necessary per-cpu, tile-specific initialization.
  * @boot: Is this the boot cpu?
@@ -908,6 +967,8 @@ void __cpuinit setup_cpu(int boot)
 	/* Reset the network state on this cpu. */
 	reset_network_state();
 #endif
+
+	init_super_pages();
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/tile/kernel/tlb.c b/arch/tile/kernel/tlb.c
index a5f241c24cac..3fd54d5bbd4c 100644
--- a/arch/tile/kernel/tlb.c
+++ b/arch/tile/kernel/tlb.c
@@ -15,6 +15,7 @@
 
 #include <linux/cpumask.h>
 #include <linux/module.h>
+#include <linux/hugetlb.h>
 #include <asm/tlbflush.h>
 #include <asm/homecache.h>
 #include <hv/hypervisor.h>
@@ -49,25 +50,25 @@ void flush_tlb_current_task(void)
 	flush_tlb_mm(current->mm);
 }
 
-void flush_tlb_page_mm(const struct vm_area_struct *vma, struct mm_struct *mm,
+void flush_tlb_page_mm(struct vm_area_struct *vma, struct mm_struct *mm,
 		       unsigned long va)
 {
-	unsigned long size = hv_page_size(vma);
+	unsigned long size = vma_kernel_pagesize(vma);
 	int cache = (vma->vm_flags & VM_EXEC) ? HV_FLUSH_EVICT_L1I : 0;
 	flush_remote(0, cache, mm_cpumask(mm),
 		     va, size, size, mm_cpumask(mm), NULL, 0);
 }
 
-void flush_tlb_page(const struct vm_area_struct *vma, unsigned long va)
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
 {
 	flush_tlb_page_mm(vma, vma->vm_mm, va);
 }
 EXPORT_SYMBOL(flush_tlb_page);
 
-void flush_tlb_range(const struct vm_area_struct *vma,
+void flush_tlb_range(struct vm_area_struct *vma,
 		     unsigned long start, unsigned long end)
 {
-	unsigned long size = hv_page_size(vma);
+	unsigned long size = vma_kernel_pagesize(vma);
 	struct mm_struct *mm = vma->vm_mm;
 	int cache = (vma->vm_flags & VM_EXEC) ? HV_FLUSH_EVICT_L1I : 0;
 	flush_remote(0, cache, mm_cpumask(mm), start, end - start, size,
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 22e58f51ed23..54f18fc25ed0 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -187,7 +187,7 @@ static pgd_t *get_current_pgd(void)
 	HV_Context ctx = hv_inquire_context();
 	unsigned long pgd_pfn = ctx.page_table >> PAGE_SHIFT;
 	struct page *pgd_page = pfn_to_page(pgd_pfn);
-	BUG_ON(PageHighMem(pgd_page));   /* oops, HIGHPTE? */
+	BUG_ON(PageHighMem(pgd_page));
 	return (pgd_t *) __va(ctx.page_table);
 }
 
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
index 499f73770b05..dbcbdf7b8aa8 100644
--- a/arch/tile/mm/homecache.c
+++ b/arch/tile/mm/homecache.c
@@ -30,6 +30,7 @@
 #include <linux/cache.h>
 #include <linux/smp.h>
 #include <linux/module.h>
+#include <linux/hugetlb.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index 42cfcba4e1ef..812e2d037972 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -27,85 +27,161 @@
 #include <linux/mman.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
+#include <asm/setup.h>
+
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+
+/*
+ * Provide an additional huge page size (in addition to the regular default
+ * huge page size) if no "hugepagesz" arguments are specified.
+ * Note that it must be smaller than the default huge page size so
+ * that it's possible to allocate them on demand from the buddy allocator.
+ * You can change this to 64K (on a 16K build), 256K, 1M, or 4M,
+ * or not define it at all.
+ */
+#define ADDITIONAL_HUGE_SIZE (1024 * 1024UL)
+
+/* "Extra" page-size multipliers, one per level of the page table. */
+int huge_shift[HUGE_SHIFT_ENTRIES] = {
+#ifdef ADDITIONAL_HUGE_SIZE
+#define ADDITIONAL_HUGE_SHIFT __builtin_ctzl(ADDITIONAL_HUGE_SIZE / PAGE_SIZE)
+	[HUGE_SHIFT_PAGE] = ADDITIONAL_HUGE_SHIFT
+#endif
+};
+
+/*
+ * This routine is a hybrid of pte_alloc_map() and pte_alloc_kernel().
+ * It assumes that L2 PTEs are never in HIGHMEM (we don't support that).
+ * It locks the user pagetable, and bumps up the mm->nr_ptes field,
+ * but otherwise allocate the page table using the kernel versions.
+ */
+static pte_t *pte_alloc_hugetlb(struct mm_struct *mm, pmd_t *pmd,
+				unsigned long address)
+{
+	pte_t *new;
+
+	if (pmd_none(*pmd)) {
+		new = pte_alloc_one_kernel(mm, address);
+		if (!new)
+			return NULL;
+
+		smp_wmb(); /* See comment in __pte_alloc */
+
+		spin_lock(&mm->page_table_lock);
+		if (likely(pmd_none(*pmd))) {  /* Has another populated it ? */
+			mm->nr_ptes++;
+			pmd_populate_kernel(mm, pmd, new);
+			new = NULL;
+		} else
+			VM_BUG_ON(pmd_trans_splitting(*pmd));
+		spin_unlock(&mm->page_table_lock);
+		if (new)
+			pte_free_kernel(mm, new);
+	}
+
+	return pte_offset_kernel(pmd, address);
+}
+#endif
 
 pte_t *huge_pte_alloc(struct mm_struct *mm,
 		      unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgd;
 	pud_t *pud;
-	pte_t *pte = NULL;
 
-	/* We do not yet support multiple huge page sizes. */
-	BUG_ON(sz != PMD_SIZE);
+	addr &= -sz;   /* Mask off any low bits in the address. */
 
 	pgd = pgd_offset(mm, addr);
 	pud = pud_alloc(mm, pgd, addr);
-	if (pud)
-		pte = (pte_t *) pmd_alloc(mm, pud, addr);
-	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
 
-	return pte;
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+	if (sz >= PGDIR_SIZE) {
+		BUG_ON(sz != PGDIR_SIZE &&
+		       sz != PGDIR_SIZE << huge_shift[HUGE_SHIFT_PGDIR]);
+		return (pte_t *)pud;
+	} else {
+		pmd_t *pmd = pmd_alloc(mm, pud, addr);
+		if (sz >= PMD_SIZE) {
+			BUG_ON(sz != PMD_SIZE &&
+			       sz != (PMD_SIZE << huge_shift[HUGE_SHIFT_PMD]));
+			return (pte_t *)pmd;
+		}
+		else {
+			if (sz != PAGE_SIZE << huge_shift[HUGE_SHIFT_PAGE])
+				panic("Unexpected page size %#lx\n", sz);
+			return pte_alloc_hugetlb(mm, pmd, addr);
+		}
+	}
+#else
+	BUG_ON(sz != PMD_SIZE);
+	return (pte_t *) pmd_alloc(mm, pud, addr);
+#endif
 }
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+static pte_t *get_pte(pte_t *base, int index, int level)
 {
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd = NULL;
-
-	pgd = pgd_offset(mm, addr);
-	if (pgd_present(*pgd)) {
-		pud = pud_offset(pgd, addr);
-		if (pud_present(*pud))
-			pmd = pmd_offset(pud, addr);
+	pte_t *ptep = base + index;
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+	if (!pte_present(*ptep) && huge_shift[level] != 0) {
+		unsigned long mask = -1UL << huge_shift[level];
+		pte_t *super_ptep = base + (index & mask);
+		pte_t pte = *super_ptep;
+		if (pte_present(pte) && pte_super(pte))
+			ptep = super_ptep;
 	}
-	return (pte_t *) pmd;
+#endif
+	return ptep;
 }
 
-#ifdef HUGETLB_TEST
-struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
-			      int write)
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
-	unsigned long start = address;
-	int length = 1;
-	int nr;
-	struct page *page;
-	struct vm_area_struct *vma;
-
-	vma = find_vma(mm, addr);
-	if (!vma || !is_vm_hugetlb_page(vma))
-		return ERR_PTR(-EINVAL);
-
-	pte = huge_pte_offset(mm, address);
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+	pte_t *pte;
+#endif
 
-	/* hugetlb should be locked, and hence, prefaulted */
-	WARN_ON(!pte || pte_none(*pte));
+	/* Get the top-level page table entry. */
+	pgd = (pgd_t *)get_pte((pte_t *)mm->pgd, pgd_index(addr), 0);
+	if (!pgd_present(*pgd))
+		return NULL;
 
-	page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+	/* We don't have four levels. */
+	pud = pud_offset(pgd, addr);
+#ifndef __PAGETABLE_PUD_FOLDED
+# error support fourth page table level
+#endif
 
-	WARN_ON(!PageHead(page));
+	/* Check for an L0 huge PTE, if we have three levels. */
+#ifndef __PAGETABLE_PMD_FOLDED
+	if (pud_huge(*pud))
+		return (pte_t *)pud;
 
-	return page;
-}
-
-int pmd_huge(pmd_t pmd)
-{
-	return 0;
-}
+	pmd = (pmd_t *)get_pte((pte_t *)pud_page_vaddr(*pud),
+			       pmd_index(addr), 1);
+	if (!pmd_present(*pmd))
+		return NULL;
+#else
+	pmd = pmd_offset(pud, addr);
+#endif
 
-int pud_huge(pud_t pud)
-{
-	return 0;
-}
+	/* Check for an L1 huge PTE. */
+	if (pmd_huge(*pmd))
+		return (pte_t *)pmd;
+
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+	/* Check for an L2 huge PTE. */
+	pte = get_pte((pte_t *)pmd_page_vaddr(*pmd), pte_index(addr), 2);
+	if (!pte_present(*pte))
+		return NULL;
+	if (pte_super(*pte))
+		return pte;
+#endif
 
-struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-			     pmd_t *pmd, int write)
-{
 	return NULL;
 }
 
-#else
-
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
 			      int write)
 {
@@ -149,8 +225,6 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 	return 0;
 }
 
-#endif
-
 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
 		unsigned long addr, unsigned long len,
@@ -322,21 +396,102 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		return hugetlb_get_unmapped_area_topdown(file, addr, len,
 				pgoff, flags);
 }
+#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */
 
-static __init int setup_hugepagesz(char *opt)
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+static __init int __setup_hugepagesz(unsigned long ps)
 {
-	unsigned long ps = memparse(opt, &opt);
-	if (ps == PMD_SIZE) {
-		hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
-	} else if (ps == PUD_SIZE) {
-		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+	int log_ps = __builtin_ctzl(ps);
+	int level, base_shift;
+
+	if ((1UL << log_ps) != ps || (log_ps & 1) != 0) {
+		pr_warn("Not enabling %ld byte huge pages;"
+			" must be a power of four.\n", ps);
+		return -EINVAL;
+	}
+
+	if (ps > 64*1024*1024*1024UL) {
+		pr_warn("Not enabling %ld MB huge pages;"
+			" largest legal value is 64 GB .\n", ps >> 20);
+		return -EINVAL;
+	} else if (ps >= PUD_SIZE) {
+		static long hv_jpage_size;
+		if (hv_jpage_size == 0)
+			hv_jpage_size = hv_sysconf(HV_SYSCONF_PAGE_SIZE_JUMBO);
+		if (hv_jpage_size != PUD_SIZE) {
+			pr_warn("Not enabling >= %ld MB huge pages:"
+				" hypervisor reports size %ld\n",
+				PUD_SIZE >> 20, hv_jpage_size);
+			return -EINVAL;
+		}
+		level = 0;
+		base_shift = PUD_SHIFT;
+	} else if (ps >= PMD_SIZE) {
+		level = 1;
+		base_shift = PMD_SHIFT;
+	} else if (ps > PAGE_SIZE) {
+		level = 2;
+		base_shift = PAGE_SHIFT;
 	} else {
-		pr_err("hugepagesz: Unsupported page size %lu M\n",
-			ps >> 20);
-		return 0;
+		pr_err("hugepagesz: huge page size %ld too small\n", ps);
+		return -EINVAL;
 	}
-	return 1;
+
+	if (log_ps != base_shift) {
+		int shift_val = log_ps - base_shift;
+		if (huge_shift[level] != 0) {
+			int old_shift = base_shift + huge_shift[level];
+			pr_warn("Not enabling %ld MB huge pages;"
+				" already have size %ld MB.\n",
+				ps >> 20, (1UL << old_shift) >> 20);
+			return -EINVAL;
+		}
+		if (hv_set_pte_super_shift(level, shift_val) != 0) {
+			pr_warn("Not enabling %ld MB huge pages;"
+				" no hypervisor support.\n", ps >> 20);
+			return -EINVAL;
+		}
+		printk(KERN_DEBUG "Enabled %ld MB huge pages\n", ps >> 20);
+		huge_shift[level] = shift_val;
+	}
+
+	hugetlb_add_hstate(log_ps - PAGE_SHIFT);
+
+	return 0;
+}
+
+static bool saw_hugepagesz;
+
+static __init int setup_hugepagesz(char *opt)
+{
+	if (!saw_hugepagesz) {
+		saw_hugepagesz = true;
+		memset(huge_shift, 0, sizeof(huge_shift));
+	}
+	return __setup_hugepagesz(memparse(opt, NULL));
 }
 __setup("hugepagesz=", setup_hugepagesz);
 
-#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
+#ifdef ADDITIONAL_HUGE_SIZE
+/*
+ * Provide an additional huge page size if no "hugepagesz" args are given.
+ * In that case, all the cores have properly set up their hv super_shift
+ * already, but we need to notify the hugetlb code to enable the
+ * new huge page size from the Linux point of view.
+ */
+static __init int add_default_hugepagesz(void)
+{
+	if (!saw_hugepagesz) {
+		BUILD_BUG_ON(ADDITIONAL_HUGE_SIZE >= PMD_SIZE ||
+			     ADDITIONAL_HUGE_SIZE <= PAGE_SIZE);
+		BUILD_BUG_ON((PAGE_SIZE << ADDITIONAL_HUGE_SHIFT) !=
+			     ADDITIONAL_HUGE_SIZE);
+		BUILD_BUG_ON(ADDITIONAL_HUGE_SHIFT & 1);
+		hugetlb_add_hstate(ADDITIONAL_HUGE_SHIFT);
+	}
+	return 0;
+}
+arch_initcall(add_default_hugepagesz);
+#endif
+
+#endif /* CONFIG_HUGETLB_SUPER_PAGES */
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index c04fbfd93fc5..630dd2ce2afe 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -698,6 +698,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 #endif /* CONFIG_HIGHMEM */
 
 
+#ifndef CONFIG_64BIT
 static void __init init_free_pfn_range(unsigned long start, unsigned long end)
 {
 	unsigned long pfn;
@@ -770,6 +771,7 @@ static void __init set_non_bootmem_pages_init(void)
 		init_free_pfn_range(start, end);
 	}
 }
+#endif
 
 /*
  * paging_init() sets up the page tables - note that all of lowmem is
@@ -858,8 +860,10 @@ void __init mem_init(void)
 	/* this will put all bootmem onto the freelists */
 	totalram_pages += free_all_bootmem();
 
+#ifndef CONFIG_64BIT
 	/* count all remaining LOWMEM and give all HIGHMEM to page allocator */
 	set_non_bootmem_pages_init();
+#endif
 
 	codesize =  (unsigned long)&_etext - (unsigned long)&_text;
 	datasize =  (unsigned long)&_end - (unsigned long)&_sdata;
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 3d7074347e6d..345edfed9fcd 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -132,15 +132,6 @@ void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
 	set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
 }
 
-#if defined(CONFIG_HIGHPTE)
-pte_t *_pte_offset_map(pmd_t *dir, unsigned long address)
-{
-	pte_t *pte = kmap_atomic(pmd_page(*dir)) +
-		(pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN) & ~PAGE_MASK;
-	return &pte[pte_index(address)];
-}
-#endif
-
 /**
  * shatter_huge_page() - ensure a given address is mapped by a small page.
  *
@@ -296,10 +287,6 @@ struct page *pgtable_alloc_one(struct mm_struct *mm, unsigned long address,
 	struct page *p;
 	int i;
 
-#ifdef CONFIG_HIGHPTE
-	flags |= __GFP_HIGHMEM;
-#endif
-
 	p = alloc_pages(flags, L2_USER_PGTABLE_ORDER);
 	if (p == NULL)
 		return NULL;
-- 
cgit v1.2.3