Diffstat (limited to 'arch/x86/mm')
-rw-r--r--   arch/x86/mm/debug_pagetables.c |  1
-rw-r--r--   arch/x86/mm/extable.c          | 40
-rw-r--r--   arch/x86/mm/fault.c            | 36
-rw-r--r--   arch/x86/mm/init.c             |  7
-rw-r--r--   arch/x86/mm/ioremap.c          |  5
-rw-r--r--   arch/x86/mm/mem_encrypt_amd.c  | 10
-rw-r--r--   arch/x86/mm/pat/memtype.c      | 12
-rw-r--r--   arch/x86/mm/pat/set_memory.c   |  5
-rw-r--r--   arch/x86/mm/tlb.c              | 55
9 files changed, 135 insertions, 36 deletions
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index 092ea436c7e6..b43301cb2a80 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -71,6 +71,5 @@ static void __exit pt_dump_debug_exit(void)

 module_init(pt_dump_debug_init);
 module_exit(pt_dump_debug_exit);
-MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
 MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 60814e110a54..271dcb2deabc 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -130,10 +130,36 @@ static bool ex_handler_fprestore(const struct exception_table_entry *fixup,
 	return true;
 }

+/*
+ * On x86-64, we end up being imprecise with 'access_ok()', and allow
+ * non-canonical user addresses to make the range comparisons simpler,
+ * and to not have to worry about LAM being enabled.
+ *
+ * In fact, we allow up to one page of "slop" at the sign boundary,
+ * which means that we can do access_ok() by just checking the sign
+ * of the pointer for the common case of having a small access size.
+ */
+static bool gp_fault_address_ok(unsigned long fault_address)
+{
+#ifdef CONFIG_X86_64
+	/* Is it in the "user space" part of the non-canonical space? */
+	if (valid_user_address(fault_address))
+		return true;
+
+	/* .. or just above it? */
+	fault_address -= PAGE_SIZE;
+	if (valid_user_address(fault_address))
+		return true;
+#endif
+	return false;
+}
+
 static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
-			       struct pt_regs *regs, int trapnr)
+			       struct pt_regs *regs, int trapnr,
+			       unsigned long fault_address)
 {
-	WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
+	WARN_ONCE(trapnr == X86_TRAP_GP && !gp_fault_address_ok(fault_address),
+		  "General protection fault in user access. Non-canonical address?");
 	return ex_handler_default(fixup, regs);
 }

@@ -189,10 +215,12 @@ static bool ex_handler_imm_reg(const struct exception_table_entry *fixup,
 }

 static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup,
-				 struct pt_regs *regs, int trapnr, int reg, int imm)
+				 struct pt_regs *regs, int trapnr,
+				 unsigned long fault_address,
+				 int reg, int imm)
 {
 	regs->cx = imm * regs->cx + *pt_regs_nr(regs, reg);
-	return ex_handler_uaccess(fixup, regs, trapnr);
+	return ex_handler_uaccess(fixup, regs, trapnr, fault_address);
 }

 int ex_get_fixup_type(unsigned long ip)
@@ -238,7 +266,7 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
 	case EX_TYPE_FAULT_MCE_SAFE:
 		return ex_handler_fault(e, regs, trapnr);
 	case EX_TYPE_UACCESS:
-		return ex_handler_uaccess(e, regs, trapnr);
+		return ex_handler_uaccess(e, regs, trapnr, fault_addr);
 	case EX_TYPE_COPY:
 		return ex_handler_copy(e, regs, trapnr);
 	case EX_TYPE_CLEAR_FS:
@@ -269,7 +297,7 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
 	case EX_TYPE_FAULT_SGX:
 		return ex_handler_sgx(e, regs, trapnr);
 	case EX_TYPE_UCOPY_LEN:
-		return ex_handler_ucopy_len(e, regs, trapnr, reg, imm);
+		return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm);
 	case EX_TYPE_ZEROPAD:
 		return ex_handler_zeropad(e, regs, fault_addr);
 	}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a498ae1fbe66..e4399983c50c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -19,6 +19,7 @@
 #include <linux/uaccess.h>		/* faulthandler_disabled()	*/
 #include <linux/efi.h>			/* efi_crash_gracefully_on_page_fault()*/
 #include <linux/mm_types.h>
+#include <linux/mm.h>			/* find_and_lock_vma() */

 #include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
@@ -1333,6 +1334,38 @@ void do_user_addr_fault(struct pt_regs *regs,
 	}
 #endif

+#ifdef CONFIG_PER_VMA_LOCK
+	if (!(flags & FAULT_FLAG_USER))
+		goto lock_mmap;
+
+	vma = lock_vma_under_rcu(mm, address);
+	if (!vma)
+		goto lock_mmap;
+
+	if (unlikely(access_error(error_code, vma))) {
+		vma_end_read(vma);
+		goto lock_mmap;
+	}
+	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
+	vma_end_read(vma);
+
+	if (!(fault & VM_FAULT_RETRY)) {
+		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+		goto done;
+	}
+	count_vm_vma_lock_event(VMA_LOCK_RETRY);
+
+	/* Quick path to respond to signals */
+	if (fault_signal_pending(fault, regs)) {
+		if (!user_mode(regs))
+			kernelmode_fixup_or_oops(regs, error_code, address,
+						 SIGBUS, BUS_ADRERR,
+						 ARCH_DEFAULT_PKEY);
+		return;
+	}
+lock_mmap:
+#endif /* CONFIG_PER_VMA_LOCK */
+
 	/*
 	 * Kernel-mode access to the user address space should only occur
 	 * on well-defined single instructions listed in the exception
@@ -1433,6 +1466,9 @@ good_area:
 	}

 	mmap_read_unlock(mm);
+#ifdef CONFIG_PER_VMA_LOCK
+done:
+#endif
 	if (likely(!(fault & VM_FAULT_ERROR)))
 		return;

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index cb258f58fdc8..3cdac0f0055d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -806,7 +806,7 @@ void __init poking_init(void)
 	BUG_ON(!poking_mm);

 	/* Xen PV guests need the PGD to be pinned. */
-	paravirt_arch_dup_mmap(NULL, poking_mm);
+	paravirt_enter_mmap(poking_mm);

 	/*
 	 * Randomize the poking address, but make sure that the following page
@@ -1048,6 +1048,11 @@ __visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
 };

+#ifdef CONFIG_ADDRESS_MASKING
+DEFINE_PER_CPU(u64, tlbstate_untag_mask);
+EXPORT_PER_CPU_SYMBOL(tlbstate_untag_mask);
+#endif
+
 void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
 {
 	/* entry 0 MUST be WB (hardwired to speed up translations) */
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 6453fbaedb08..aa7d279321ea 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -116,6 +116,11 @@ static void __ioremap_check_other(resource_size_t addr, struct ioremap_desc *des
 	if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
 		return;

+	if (x86_platform.hyper.is_private_mmio(addr)) {
+		desc->flags |= IORES_MAP_ENCRYPTED;
+		return;
+	}
+
 	if (!IS_ENABLED(CONFIG_EFI))
 		return;

diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 9c4d8dbcb129..e0b51c09109f 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -513,10 +513,14 @@ void __init mem_encrypt_free_decrypted_mem(void)
 	npages = (vaddr_end - vaddr) >> PAGE_SHIFT;

 	/*
-	 * The unused memory range was mapped decrypted, change the encryption
-	 * attribute from decrypted to encrypted before freeing it.
+	 * If the unused memory range was mapped decrypted, change the encryption
+	 * attribute from decrypted to encrypted before freeing it. Base the
+	 * re-encryption on the same condition used for the decryption in
+	 * sme_postprocess_startup(). Higher level abstractions, such as
+	 * CC_ATTR_MEM_ENCRYPT, aren't necessarily equivalent in a Hyper-V VM
+	 * using vTOM, where sme_me_mask is always zero.
 	 */
-	if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
+	if (sme_me_mask) {
 		r = set_memory_encrypted(vaddr, npages);
 		if (r) {
 			pr_warn("failed to free unused decrypted pages\n");
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 46a00aa858b6..de10800cd4dd 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -1073,11 +1073,15 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 }

 /*
- * untrack_pfn_moved is called, while mremapping a pfnmap for a new region,
- * with the old vma after its pfnmap page table has been removed. The new
- * vma has a new pfnmap to the same pfn & cache type with VM_PAT set.
+ * untrack_pfn_clear is called if the following situation fits:
+ *
+ * 1) while mremapping a pfnmap for a new region, with the old vma after
+ *    its pfnmap page table has been removed. The new vma has a new pfnmap
+ *    to the same pfn & cache type with VM_PAT set.
+ * 2) while duplicating vm area, the new vma fails to copy the pgtable from
+ *    old vma.
  */
-void untrack_pfn_moved(struct vm_area_struct *vma)
+void untrack_pfn_clear(struct vm_area_struct *vma)
 {
 	vm_flags_clear(vma, VM_PAT);
 }
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 356758b7d4b4..7159cf787613 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -234,7 +234,7 @@ within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
  * take full advantage of the the limited (s32) immediate addressing range (2G)
  * of x86_64.
  *
- * See Documentation/x86/x86_64/mm.rst for more detail.
+ * See Documentation/arch/x86/x86_64/mm.rst for more detail.
  */
 static inline unsigned long highmap_start_pfn(void)
 {
@@ -2175,9 +2175,6 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)

 static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
 {
-	if (hv_is_isolation_supported())
-		return hv_set_mem_host_visibility(addr, numpages, !enc);
-
 	if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
 		return __set_memory_enc_pgtable(addr, numpages, enc);

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 92d73ccede70..267acf27480a 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -154,26 +154,30 @@ static inline u16 user_pcid(u16 asid)
 	return ret;
 }

-static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam)
 {
+	unsigned long cr3 = __sme_pa(pgd) | lam;
+
 	if (static_cpu_has(X86_FEATURE_PCID)) {
-		return __sme_pa(pgd) | kern_pcid(asid);
+		VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+		cr3 |= kern_pcid(asid);
 	} else {
 		VM_WARN_ON_ONCE(asid != 0);
-		return __sme_pa(pgd);
 	}
+
+	return cr3;
 }

-static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid,
+					      unsigned long lam)
 {
-	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
 	/*
 	 * Use boot_cpu_has() instead of this_cpu_has() as this function
 	 * might be called during early boot. This should work even after
 	 * boot because all CPU's the have same capabilities:
 	 */
 	VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
-	return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
+	return build_cr3(pgd, asid, lam) | CR3_NOFLUSH;
 }

 /*
@@ -274,15 +278,16 @@ static inline void invalidate_user_asid(u16 asid)
 		(unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
 }

-static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
+			    bool need_flush)
 {
 	unsigned long new_mm_cr3;

 	if (need_flush) {
 		invalidate_user_asid(new_asid);
-		new_mm_cr3 = build_cr3(pgdir, new_asid);
+		new_mm_cr3 = build_cr3(pgdir, new_asid, lam);
 	} else {
-		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid, lam);
 	}

 	/*
@@ -491,6 +496,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	unsigned long new_lam = mm_lam_cr3_mask(next);
 	bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
@@ -520,7 +526,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	 * isn't free.
 	 */
 #ifdef CONFIG_DEBUG_VM
-	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
+	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid,
+						   tlbstate_lam_cr3_mask()))) {
 		/*
 		 * If we were to BUG here, we'd be very likely to kill
 		 * the system so hard that we don't see the call trace.
@@ -552,10 +559,16 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	 * instruction.
 	 */
 	if (real_prev == next) {
+		/* Not actually switching mm's */
 		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
 			   next->context.ctx_id);

 		/*
+		 * If this races with another thread that enables lam, 'new_lam'
+		 * might not match tlbstate_lam_cr3_mask().
+		 */
+
+		/*
 		 * Even in lazy TLB mode, the CPU should stay set in the
 		 * mm_cpumask. The TLB shootdown code can figure out from
 		 * cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
@@ -622,15 +635,16 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		barrier();
 	}

+	set_tlbstate_lam_mode(next);
 	if (need_flush) {
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-		load_new_mm_cr3(next->pgd, new_asid, true);
+		load_new_mm_cr3(next->pgd, new_asid, new_lam, true);

 		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 	} else {
 		/* The new ASID is already up to date. */
-		load_new_mm_cr3(next->pgd, new_asid, false);
+		load_new_mm_cr3(next->pgd, new_asid, new_lam, false);

 		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
 	}
@@ -691,6 +705,10 @@ void initialize_tlbstate_and_flush(void)
 	/* Assert that CR3 already references the right mm. */
 	WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));

+	/* LAM expected to be disabled */
+	WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
+	WARN_ON(mm_lam_cr3_mask(mm));
+
 	/*
 	 * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
 	 * doesn't work like other CR4 bits because it can only be set from
@@ -699,8 +717,8 @@ void initialize_tlbstate_and_flush(void)
 	WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
 		!(cr4_read_shadow() & X86_CR4_PCIDE));

-	/* Force ASID 0 and force a TLB flush. */
-	write_cr3(build_cr3(mm->pgd, 0));
+	/* Disable LAM, force ASID 0 and force a TLB flush. */
+	write_cr3(build_cr3(mm->pgd, 0, 0));

 	/* Reinitialize tlbstate. */
 	this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
@@ -708,6 +726,7 @@ void initialize_tlbstate_and_flush(void)
 	this_cpu_write(cpu_tlbstate.next_asid, 1);
 	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
 	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
+	set_tlbstate_lam_mode(mm);

 	for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
 		this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
@@ -925,7 +944,7 @@ void flush_tlb_multi(const struct cpumask *cpumask,
 }

 /*
- * See Documentation/x86/tlb.rst for details. We choose 33
+ * See Documentation/arch/x86/tlb.rst for details. We choose 33
  * because it is large enough to cover the vast majority (at
  * least 95%) of allocations, and is small enough that we are
  * confident it will not cause too much overhead. Each single
@@ -1071,8 +1090,10 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
  */
 unsigned long __get_current_cr3_fast(void)
 {
-	unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
-		this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+	unsigned long cr3 =
+		build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
+			  this_cpu_read(cpu_tlbstate.loaded_mm_asid),
+			  tlbstate_lam_cr3_mask());

 	/* For now, be very restrictive about when this can be called. */
 	VM_WARN_ON(in_nmi() || preemptible());
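Editor's note: the extable.c hunk above relies on the idea that a lax access_ok() only needs the sign of the pointer plus one page of "slop" at the user/kernel boundary. The stand-alone C sketch below mirrors the shape of that check; it is illustrative only, and PAGE_SIZE, USER_PTR_MAX and this simplified valid_user_address() are assumptions standing in for the kernel's definitions, not the actual implementation.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096ULL
/* Assumed stand-in for the top of a 47-bit user address range. */
#define USER_PTR_MAX	((1ULL << 47) - PAGE_SIZE)

/* Simplified: a plausible user pointer is non-negative and below the limit. */
static bool valid_user_address(uint64_t addr)
{
	return (int64_t)addr >= 0 && addr <= USER_PTR_MAX;
}

/* Mirrors the structure of gp_fault_address_ok(): allow one page of slop. */
static bool gp_fault_address_ok(uint64_t fault_address)
{
	if (valid_user_address(fault_address))
		return true;
	return valid_user_address(fault_address - PAGE_SIZE);
}

int main(void)
{
	/* Just past the user limit: accepted via the one-page slop. */
	printf("%d\n", gp_fault_address_ok(1ULL << 47));
	/* A kernel-half address: rejected, so the WARN_ONCE still fires. */
	printf("%d\n", gp_fault_address_ok(0xffff800000000000ULL));
	return 0;
}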
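Editor's note: the essence of the reworked build_cr3() in tlb.c is that CR3 is now assembled from three independent pieces: the PGD physical address, the PCID in the low bits, and the LAM enable bits near the top of the register. The user-space sketch below reproduces that composition with the architectural bit positions spelled out; kern_pcid() here is a simplified assumption (the real helper also accounts for PTI) and the PGD address is a made-up value.

#include <stdint.h>
#include <stdio.h>

#define X86_CR3_LAM_U57	(1ULL << 61)	/* hardware ignores user bits 62:57 */
#define X86_CR3_LAM_U48	(1ULL << 62)	/* hardware ignores user bits 62:48 */
#define CR3_NOFLUSH	(1ULL << 63)	/* don't flush the TLB on this write */

/* Simplified assumption: kernel PCIDs start at 1 so PCID 0 stays unused. */
static uint64_t kern_pcid(uint16_t asid)
{
	return asid + 1;
}

/* Compose CR3 the way the patched build_cr3() does: pgd | lam | pcid. */
static uint64_t build_cr3(uint64_t pgd_pa, uint16_t asid, uint64_t lam)
{
	return pgd_pa | lam | kern_pcid(asid);
}

int main(void)
{
	uint64_t cr3 = build_cr3(0x1234000, 5, X86_CR3_LAM_U57);

	printf("cr3 = %#llx\n", (unsigned long long)cr3);
	/* The no-flush variant just sets bit 63 on top of the same value. */
	printf("cr3 (noflush) = %#llx\n", (unsigned long long)(cr3 | CR3_NOFLUSH));
	return 0;
}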
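Editor's note: the new per-CPU tlbstate_untag_mask added in init.c exists so that pointer checks can strip LAM metadata bits cheaply. The following is a rough illustration of what such an untag mask does, assuming the LAM_U57 layout (tag in bits 62:57); it is a stand-alone sketch, not the kernel's untagged_addr() implementation.

#include <stdint.h>
#include <stdio.h>

/* Assumed LAM_U57 untag mask: clear metadata bits 62:57, keep the rest. */
#define LAM_U57_UNTAG_MASK	(~0x7e00000000000000ULL)

/* Strip the tag bits so range checks see the real linear address. */
static uint64_t untag_addr(uint64_t addr, uint64_t untag_mask)
{
	return addr & untag_mask;
}

int main(void)
{
	uint64_t tagged = 0x7e00001234567890ULL;	/* tag 0x3f in bits 62:57 */

	/* Prints 0x1234567890: the tag is gone, the address bits survive. */
	printf("%#llx\n", (unsigned long long)untag_addr(tagged, LAM_U57_UNTAG_MASK));
	return 0;
}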