summaryrefslogtreecommitdiff
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/fault.c5
-rw-r--r--arch/x86/mm/init.c42
-rw-r--r--arch/x86/mm/kmmio.c25
-rw-r--r--arch/x86/mm/mmap.c21
-rw-r--r--arch/x86/mm/pageattr.c8
-rw-r--r--arch/x86/mm/pat.c16
-rw-r--r--arch/x86/mm/pgtable.c8
-rw-r--r--arch/x86/mm/pti.c1
-rw-r--r--arch/x86/mm/tlb.c205
9 files changed, 154 insertions, 177 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index db1c042e9853..b9123c497e0a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -16,6 +16,7 @@
#include <linux/prefetch.h> /* prefetchw */
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
+#include <linux/mm_types.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
@@ -999,7 +1000,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, u32 *pkey, unsigned int fault)
+ unsigned long address, u32 *pkey, vm_fault_t fault)
{
if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
no_context(regs, error_code, address, 0, 0);
@@ -1213,7 +1214,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
- int fault, major = 0;
+ vm_fault_t fault, major = 0;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
u32 pkey;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 74b157ac078d..7a8fc26c1115 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -4,6 +4,8 @@
#include <linux/swap.h>
#include <linux/memblock.h>
#include <linux/bootmem.h> /* for max_low_pfn */
+#include <linux/swapfile.h>
+#include <linux/swapops.h>
#include <asm/set_memory.h>
#include <asm/e820/api.h>
@@ -97,15 +99,22 @@ __ref void *alloc_low_pages(unsigned int num)
}
if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
- unsigned long ret;
- if (min_pfn_mapped >= max_pfn_mapped)
- panic("alloc_low_pages: ran out of memory");
- ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+ unsigned long ret = 0;
+
+ if (min_pfn_mapped < max_pfn_mapped) {
+ ret = memblock_find_in_range(
+ min_pfn_mapped << PAGE_SHIFT,
max_pfn_mapped << PAGE_SHIFT,
PAGE_SIZE * num , PAGE_SIZE);
+ }
+ if (ret)
+ memblock_reserve(ret, PAGE_SIZE * num);
+ else if (can_use_brk_pgt)
+ ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE));
+
if (!ret)
panic("alloc_low_pages: can not alloc memory");
- memblock_reserve(ret, PAGE_SIZE * num);
+
pfn = ret >> PAGE_SHIFT;
} else {
pfn = pgt_buf_end;
@@ -911,3 +920,26 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
__cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
__pte2cachemode_tbl[entry] = cache;
}
+
+#ifdef CONFIG_SWAP
+unsigned long max_swapfile_size(void)
+{
+ unsigned long pages;
+
+ pages = generic_max_swapfile_size();
+
+ if (boot_cpu_has_bug(X86_BUG_L1TF)) {
+ /* Limit the swap file size to MAX_PA/2 for L1TF workaround */
+ unsigned long long l1tf_limit = l1tf_pfn_limit();
+ /*
+ * We encode swap offsets also with 3 bits below those for pfn
+ * which makes the usable limit higher.
+ */
+#if CONFIG_PGTABLE_LEVELS > 2
+ l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT;
+#endif
+ pages = min_t(unsigned long long, l1tf_limit, pages);
+ }
+ return pages;
+}
+#endif
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 7c8686709636..79eb55ce69a9 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -126,24 +126,29 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
{
+ pmd_t new_pmd;
pmdval_t v = pmd_val(*pmd);
if (clear) {
- *old = v & _PAGE_PRESENT;
- v &= ~_PAGE_PRESENT;
- } else /* presume this has been called with clear==true previously */
- v |= *old;
- set_pmd(pmd, __pmd(v));
+ *old = v;
+ new_pmd = pmd_mknotpresent(*pmd);
+ } else {
+ /* Presume this has been called with clear==true previously */
+ new_pmd = __pmd(*old);
+ }
+ set_pmd(pmd, new_pmd);
}
static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
{
pteval_t v = pte_val(*pte);
if (clear) {
- *old = v & _PAGE_PRESENT;
- v &= ~_PAGE_PRESENT;
- } else /* presume this has been called with clear==true previously */
- v |= *old;
- set_pte_atomic(pte, __pte(v));
+ *old = v;
+ /* Nothing should care about address */
+ pte_clear(&init_mm, 0, pte);
+ } else {
+ /* Presume this has been called with clear==true previously */
+ set_pte_atomic(pte, __pte(*old));
+ }
}
static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 48c591251600..1e95d57760cf 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -240,3 +240,24 @@ int valid_mmap_phys_addr_range(unsigned long pfn, size_t count)
return phys_addr_valid(addr + count - 1);
}
+
+/*
+ * Only allow root to set high MMIO mappings to PROT_NONE.
+ * This prevents an unpriv. user to set them to PROT_NONE and invert
+ * them, then pointing to valid memory for L1TF speculation.
+ *
+ * Note: for locked down kernels may want to disable the root override.
+ */
+bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
+{
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return true;
+ if (!__pte_needs_invert(pgprot_val(prot)))
+ return true;
+ /* If it's real memory always allow */
+ if (pfn_valid(pfn))
+ return true;
+ if (pfn >= l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN))
+ return false;
+ return true;
+}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 0a74996a1149..8d6c34fe49be 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1015,8 +1015,8 @@ static long populate_pmd(struct cpa_data *cpa,
pmd = pmd_offset(pud, start);
- set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
- massage_pgprot(pmd_pgprot)));
+ set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
+ canon_pgprot(pmd_pgprot))));
start += PMD_SIZE;
cpa->pfn += PMD_SIZE >> PAGE_SHIFT;
@@ -1088,8 +1088,8 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
* Map everything starting from the Gb boundary, possibly with 1G pages
*/
while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
- set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
- massage_pgprot(pud_pgprot)));
+ set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
+ canon_pgprot(pud_pgprot))));
start += PUD_SIZE;
cpa->pfn += PUD_SIZE >> PAGE_SHIFT;
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 1555bd7d3449..3d0c83ef6aab 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -512,6 +512,17 @@ static int free_ram_pages_type(u64 start, u64 end)
return 0;
}
+static u64 sanitize_phys(u64 address)
+{
+ /*
+ * When changing the memtype for pages containing poison allow
+ * for a "decoy" virtual address (bit 63 clear) passed to
+ * set_memory_X(). __pa() on a "decoy" address results in a
+ * physical address with bit 63 set.
+ */
+ return address & __PHYSICAL_MASK;
+}
+
/*
* req_type typically has one of the:
* - _PAGE_CACHE_MODE_WB
@@ -533,6 +544,8 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
int is_range_ram;
int err = 0;
+ start = sanitize_phys(start);
+ end = sanitize_phys(end);
BUG_ON(start >= end); /* end is exclusive */
if (!pat_enabled()) {
@@ -609,6 +622,9 @@ int free_memtype(u64 start, u64 end)
if (!pat_enabled())
return 0;
+ start = sanitize_phys(start);
+ end = sanitize_phys(end);
+
/* Low ISA region is always mapped WB. No need to track */
if (x86_platform.is_untracked_pat_range(start, end))
return 0;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 3ef095c70ae3..e848a4811785 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -63,7 +63,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
pgtable_page_dtor(pte);
paravirt_release_pte(page_to_pfn(pte));
- tlb_remove_table(tlb, pte);
+ paravirt_tlb_remove_table(tlb, pte);
}
#if CONFIG_PGTABLE_LEVELS > 2
@@ -79,21 +79,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
tlb->need_flush_all = 1;
#endif
pgtable_pmd_page_dtor(page);
- tlb_remove_table(tlb, page);
+ paravirt_tlb_remove_table(tlb, page);
}
#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
- tlb_remove_table(tlb, virt_to_page(pud));
+ paravirt_tlb_remove_table(tlb, virt_to_page(pud));
}
#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
- tlb_remove_table(tlb, virt_to_page(p4d));
+ paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index d58b4aba9510..31341ae7309f 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -45,6 +45,7 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
+#include <asm/sections.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 752dbf4e0e50..9517d1b2a281 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,7 +7,6 @@
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
-#include <linux/gfp.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -186,11 +185,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
- bool need_flush;
- u16 new_asid;
/*
* NB: The scheduler will call us with prev == next when switching
@@ -244,41 +240,20 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
next->context.ctx_id);
/*
- * Even in lazy TLB mode, the CPU should stay set in the
- * mm_cpumask. The TLB shootdown code can figure out from
- * from cpu_tlbstate.is_lazy whether or not to send an IPI.
+ * We don't currently support having a real mm loaded without
+ * our cpu set in mm_cpumask(). We have all the bookkeeping
+ * in place to figure out whether we would need to flush
+ * if our cpu were cleared in mm_cpumask(), but we don't
+ * currently use it.
*/
if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
- /*
- * If the CPU is not in lazy TLB mode, we are just switching
- * from one thread in a process to another thread in the same
- * process. No TLB flush required.
- */
- if (!was_lazy)
- return;
-
- /*
- * Read the tlb_gen to check whether a flush is needed.
- * If the TLB is up to date, just use it.
- * The barrier synchronizes with the tlb_gen increment in
- * the TLB shootdown code.
- */
- smp_mb();
- next_tlb_gen = atomic64_read(&next->context.tlb_gen);
- if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
- next_tlb_gen)
- return;
-
- /*
- * TLB contents went out of date while we were in lazy
- * mode. Fall through to the TLB switching code below.
- */
- new_asid = prev_asid;
- need_flush = true;
+ return;
} else {
+ u16 new_asid;
+ bool need_flush;
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
/*
@@ -329,41 +304,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
- }
- if (need_flush) {
- this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
- this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- load_new_mm_cr3(next->pgd, new_asid, true);
+ if (need_flush) {
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+ load_new_mm_cr3(next->pgd, new_asid, true);
+
+ /*
+ * NB: This gets called via leave_mm() in the idle path
+ * where RCU functions differently. Tracing normally
+ * uses RCU, so we need to use the _rcuidle variant.
+ *
+ * (There is no good reason for this. The idle code should
+ * be rearranged to call this before rcu_idle_enter().)
+ */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ } else {
+ /* The new ASID is already up to date. */
+ load_new_mm_cr3(next->pgd, new_asid, false);
+
+ /* See above wrt _rcuidle. */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
+ }
/*
- * NB: This gets called via leave_mm() in the idle path
- * where RCU functions differently. Tracing normally
- * uses RCU, so we need to use the _rcuidle variant.
- *
- * (There is no good reason for this. The idle code should
- * be rearranged to call this before rcu_idle_enter().)
+ * Record last user mm's context id, so we can avoid
+ * flushing branch buffer with IBPB if we switch back
+ * to the same user.
*/
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- } else {
- /* The new ASID is already up to date. */
- load_new_mm_cr3(next->pgd, new_asid, false);
+ if (next != &init_mm)
+ this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
- /* See above wrt _rcuidle. */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
+ this_cpu_write(cpu_tlbstate.loaded_mm, next);
+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
}
- /*
- * Record last user mm's context id, so we can avoid
- * flushing branch buffer with IBPB if we switch back
- * to the same user.
- */
- if (next != &init_mm)
- this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
-
- this_cpu_write(cpu_tlbstate.loaded_mm, next);
- this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
-
load_mm_cr4(next);
switch_ldt(real_prev, next);
}
@@ -386,7 +361,20 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
- this_cpu_write(cpu_tlbstate.is_lazy, true);
+ if (tlb_defer_switch_to_init_mm()) {
+ /*
+ * There's a significant optimization that may be possible
+ * here. We have accurate enough TLB flush tracking that we
+ * don't need to maintain coherence of TLB per se when we're
+ * lazy. We do, however, need to maintain coherence of
+ * paging-structure caches. We could, in principle, leave our
+ * old mm loaded and only switch to init_mm when
+ * tlb_remove_page() happens.
+ */
+ this_cpu_write(cpu_tlbstate.is_lazy, true);
+ } else {
+ switch_mm(NULL, &init_mm, NULL);
+ }
}
/*
@@ -473,9 +461,6 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
- *
- * This should be rare, with native_flush_tlb_others skipping
- * IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
@@ -582,9 +567,6 @@ static void flush_tlb_func_remote(void *info)
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
- cpumask_var_t lazymask;
- unsigned int cpu;
-
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -608,6 +590,8 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
* that UV should be updated so that smp_call_function_many(),
* etc, are optimal on UV.
*/
+ unsigned int cpu;
+
cpu = smp_processor_id();
cpumask = uv_flush_tlb_others(cpumask, info);
if (cpumask)
@@ -615,29 +599,8 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
(void *)info, 1);
return;
}
-
- /*
- * A temporary cpumask is used in order to skip sending IPIs
- * to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm).
- * If the allocation fails, simply IPI every CPU in mm_cpumask.
- */
- if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) {
- smp_call_function_many(cpumask, flush_tlb_func_remote,
- (void *)info, 1);
- return;
- }
-
- cpumask_copy(lazymask, cpumask);
-
- for_each_cpu(cpu, lazymask) {
- if (per_cpu(cpu_tlbstate.is_lazy, cpu))
- cpumask_clear_cpu(cpu, lazymask);
- }
-
- smp_call_function_many(lazymask, flush_tlb_func_remote,
+ smp_call_function_many(cpumask, flush_tlb_func_remote,
(void *)info, 1);
-
- free_cpumask_var(lazymask);
}
/*
@@ -690,68 +653,6 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
put_cpu();
}
-void tlb_flush_remove_tables_local(void *arg)
-{
- struct mm_struct *mm = arg;
-
- if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm &&
- this_cpu_read(cpu_tlbstate.is_lazy)) {
- /*
- * We're in lazy mode. We need to at least flush our
- * paging-structure cache to avoid speculatively reading
- * garbage into our TLB. Since switching to init_mm is barely
- * slower than a minimal flush, just switch to init_mm.
- */
- switch_mm_irqs_off(NULL, &init_mm, NULL);
- }
-}
-
-static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm,
- struct cpumask *lazy_cpus)
-{
- int cpu;
-
- for_each_cpu(cpu, mm_cpumask(mm)) {
- if (!per_cpu(cpu_tlbstate.is_lazy, cpu))
- cpumask_set_cpu(cpu, lazy_cpus);
- }
-}
-
-void tlb_flush_remove_tables(struct mm_struct *mm)
-{
- int cpu = get_cpu();
- cpumask_var_t lazy_cpus;
-
- if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
- put_cpu();
- return;
- }
-
- if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) {
- /*
- * If the cpumask allocation fails, do a brute force flush
- * on all the CPUs that have this mm loaded.
- */
- smp_call_function_many(mm_cpumask(mm),
- tlb_flush_remove_tables_local, (void *)mm, 1);
- put_cpu();
- return;
- }
-
- /*
- * CPUs with !is_lazy either received a TLB flush IPI while the user
- * pages in this address range were unmapped, or have context switched
- * and reloaded %CR3 since then.
- *
- * Shootdown IPIs at page table freeing time only need to be sent to
- * CPUs that may have out of date TLB contents.
- */
- mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus);
- smp_call_function_many(lazy_cpus,
- tlb_flush_remove_tables_local, (void *)mm, 1);
- free_cpumask_var(lazy_cpus);
- put_cpu();
-}
static void do_flush_tlb_all(void *info)
{