Diffstat (limited to 'arch/x86/mm')
-rw-r--r--   arch/x86/mm/Makefile                 |   3
-rw-r--r--   arch/x86/mm/amdtopology.c            |  34
-rw-r--r--   arch/x86/mm/debug_pagetables.c       |   4
-rw-r--r--   arch/x86/mm/dump_pagetables.c        |   4
-rw-r--r--   arch/x86/mm/extable.c                |  78
-rw-r--r--   arch/x86/mm/fault.c                  |  46
-rw-r--r--   arch/x86/mm/ident_map.c              |  23
-rw-r--r--   arch/x86/mm/maccess.c                |  10
-rw-r--r--   arch/x86/mm/mem_encrypt.c            |  55
-rw-r--r--   arch/x86/mm/mem_encrypt_identity.c   | 108
-rw-r--r--   arch/x86/mm/numa.c                   |  21
-rw-r--r--   arch/x86/mm/pat/memtype.c            |   9
-rw-r--r--   arch/x86/mm/pat/set_memory.c         |  43
-rw-r--r--   arch/x86/mm/pgtable.c                |   4
-rw-r--r--   arch/x86/mm/tlb.c                    |  10
15 files changed, 253 insertions, 199 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index c80febc44cd2..428048e73bd2 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -16,6 +16,7 @@ KASAN_SANITIZE_pgprot.o := n
 KCSAN_SANITIZE := n
 # Avoid recursion by not calling KMSAN hooks for CEA code.
 KMSAN_SANITIZE_cpu_entry_area.o := n
+KMSAN_SANITIZE_mem_encrypt_identity.o := n
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_mem_encrypt.o = -pg
@@ -60,7 +61,7 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
 obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
-obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
+obj-$(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) += pti.o
 obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o
 obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index b3ca7d23e4b0..9332b36a1091 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -54,13 +54,11 @@ static __init int find_northbridge(void)
 int __init amd_numa_init(void)
 {
- u64 start = PFN_PHYS(0);
+ unsigned int numnodes, cores, apicid;
+ u64 prevbase, start = PFN_PHYS(0);
 u64 end = PFN_PHYS(max_pfn);
- unsigned numnodes;
- u64 prevbase;
- int i, j, nb;
 u32 nodeid, reg;
- unsigned int bits, cores, apicid_base;
+ int i, j, nb;
 
 if (!early_pci_allowed())
 return -EINVAL;
@@ -158,26 +156,18 @@ int __init amd_numa_init(void)
 return -ENOENT;
 
 /*
- * We seem to have valid NUMA configuration. Map apicids to nodes
- * using the coreid bits from early_identify_cpu.
+ * We seem to have valid NUMA configuration. Map apicids to nodes
+ * using the size of the core domain in the APIC space.
 */
- bits = boot_cpu_data.x86_coreid_bits;
- cores = 1 << bits;
- apicid_base = 0;
+ cores = topology_get_domain_size(TOPO_CORE_DOMAIN);
 
- /*
- * get boot-time SMP configuration:
- */
- early_get_smp_config();
+ apicid = boot_cpu_physical_apicid;
+ if (apicid > 0)
+ pr_info("BSP APIC ID: %02x\n", apicid);
 
- if (boot_cpu_physical_apicid > 0) {
- pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
- apicid_base = boot_cpu_physical_apicid;
+ for_each_node_mask(i, numa_nodes_parsed) {
+ for (j = 0; j < cores; j++, apicid++)
+ set_apicid_to_node(apicid, i);
 }
-
- for_each_node_mask(i, numa_nodes_parsed)
- for (j = apicid_base; j < cores + apicid_base; j++)
- set_apicid_to_node((i << bits) + j, i);
-
 return 0;
 }
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index b43301cb2a80..ae5c213a1cb0 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -22,7 +22,7 @@ static int ptdump_curknl_show(struct seq_file *m, void *v)
 
 DEFINE_SHOW_ATTRIBUTE(ptdump_curknl);
 
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
 static int ptdump_curusr_show(struct seq_file *m, void *v)
 {
 if (current->mm->pgd)
@@ -54,7 +54,7 @@ static int __init pt_dump_debug_init(void)
 debugfs_create_file("current_kernel", 0400, dir, NULL, &ptdump_curknl_fops);
 
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
 debugfs_create_file("current_user", 0400, dir, NULL, &ptdump_curusr_fops);
 #endif
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e1b599ecbbc2..b7b88c1d91ec 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -408,7 +408,7 @@ void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
 bool user)
 {
 pgd_t *pgd = mm->pgd;
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
 if (user && boot_cpu_has(X86_FEATURE_PTI))
 pgd = kernel_to_user_pgdp(pgd);
 #endif
@@ -418,7 +418,7 @@ EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
 
 void ptdump_walk_user_pgd_level_checkwx(void)
 {
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
 pgd_t *pgd = INIT_PGD;
 
 if (!(__supported_pte_mask & _PAGE_NX) ||
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 271dcb2deabc..b522933bfa56 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -6,6 +6,7 @@
 #include <xen/xen.h>
 
 #include <asm/fpu/api.h>
+#include <asm/fred.h>
 #include <asm/sev.h>
 #include <asm/traps.h>
 #include <asm/kdebug.h>
@@ -223,6 +224,79 @@ static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup,
 return ex_handler_uaccess(fixup, regs, trapnr, fault_address);
 }
 
+#ifdef CONFIG_X86_FRED
+static bool ex_handler_eretu(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, unsigned long error_code)
+{
+ struct pt_regs *uregs = (struct pt_regs *)(regs->sp - offsetof(struct pt_regs, orig_ax));
+ unsigned short ss = uregs->ss;
+ unsigned short cs = uregs->cs;
+
+ /*
+ * Move the NMI bit from the invalid stack frame, which caused ERETU
+ * to fault, to the fault handler's stack frame, thus to unblock NMI
+ * with the fault handler's ERETS instruction ASAP if NMI is blocked.
+ */
+ regs->fred_ss.nmi = uregs->fred_ss.nmi;
+
+ /*
+ * Sync event information to uregs, i.e., the ERETU return frame, but
+ * is it safe to write to the ERETU return frame which is just above
+ * current event stack frame?
+ *
+ * The RSP used by FRED to push a stack frame is not the value in %rsp,
+ * it is calculated from %rsp with the following 2 steps:
+ * 1) RSP = %rsp - (IA32_FRED_CONFIG & 0x1c0)  // Reserve N*64 bytes
+ * 2) RSP = RSP & ~0x3f                        // Align to a 64-byte cache line
+ * when an event delivery doesn't trigger a stack level change.
+ *
+ * Here is an example with N*64 (N=1) bytes reserved:
+ *
+ *  64-byte cache line ==>  ______________
+ *                         |___Reserved___|
+ *                         |__Event_data__|
+ *                         |_____SS_______|
+ *                         |_____RSP______|
+ *                         |_____FLAGS____|
+ *                         |_____CS_______|
+ *                         |_____IP_______|
+ *  64-byte cache line ==> |__Error_code__| <== ERETU return frame
+ *                         |______________|
+ *                         |______________|
+ *                         |______________|
+ *                         |______________|
+ *                         |______________|
+ *                         |______________|
+ *                         |______________|
+ *  64-byte cache line ==> |______________| <== RSP after step 1) and 2)
+ *                         |___Reserved___|
+ *                         |__Event_data__|
+ *                         |_____SS_______|
+ *                         |_____RSP______|
+ *                         |_____FLAGS____|
+ *                         |_____CS_______|
+ *                         |_____IP_______|
+ *  64-byte cache line ==> |__Error_code__| <== ERETS return frame
+ *
+ * Thus a new FRED stack frame will always be pushed below a previous
+ * FRED stack frame ((N*64) bytes may be reserved between), and it is
+ * safe to write to a previous FRED stack frame as they never overlap.
+ */
+ fred_info(uregs)->edata = fred_event_data(regs);
+ uregs->ssx = regs->ssx;
+ uregs->fred_ss.ss = ss;
+ /* The NMI bit was moved away above */
+ uregs->fred_ss.nmi = 0;
+ uregs->csx = regs->csx;
+ uregs->fred_cs.sl = 0;
+ uregs->fred_cs.wfe = 0;
+ uregs->cs = cs;
+ uregs->orig_ax = error_code;
+
+ return ex_handler_default(fixup, regs);
+}
+#endif
+
 int ex_get_fixup_type(unsigned long ip)
 {
 const struct exception_table_entry *e = search_exception_tables(ip);
@@ -300,6 +374,10 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
 return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm);
 case EX_TYPE_ZEROPAD:
 return ex_handler_zeropad(e, regs, fault_addr);
+#ifdef CONFIG_X86_FRED
+ case EX_TYPE_ERETU:
+ return ex_handler_eretu(e, regs, error_code);
+#endif
 }
 BUG();
 }
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 679b09cfe241..402e08f6b7ec 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -34,6 +34,8 @@
 #include <asm/kvm_para.h> /* kvm_handle_async_pf */
 #include <asm/vdso.h> /* fixup_vdso_exception() */
 #include <asm/irq_stack.h>
+#include <asm/fred.h>
+#include <asm/sev.h> /* snp_dump_hva_rmpentry() */
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/exceptions.h>
@@ -547,6 +549,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
 !(error_code & X86_PF_PROT) ? "not-present page" :
 (error_code & X86_PF_RSVD) ? "reserved bit violation" :
 (error_code & X86_PF_PK) ? "protection keys violation" :
+ (error_code & X86_PF_RMP) ? "RMP violation" :
 "permissions violation");
 
 if (!(error_code & X86_PF_USER) && user_mode(regs)) {
@@ -579,6 +582,9 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
 }
 
 dump_pagetable(address);
+
+ if (error_code & X86_PF_RMP)
+ snp_dump_hva_rmpentry(address);
 }
 
 static noinline void
@@ -798,15 +804,6 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 show_opcodes(regs, loglvl);
 }
 
-/*
- * The (legacy) vsyscall page is the long page in the kernel portion
- * of the address space that has user-accessible permissions.
- */
-static bool is_vsyscall_vaddr(unsigned long vaddr)
-{
- return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
-}
-
 static void
 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 unsigned long address, u32 pkey, int si_code)
 {
@@ -1302,21 +1299,14 @@ void do_user_addr_fault(struct pt_regs *regs,
 return;
 }
 
- /*
- * It's safe to allow irq's after cr2 has been saved and the
- * vmalloc fault has been handled.
- *
- * User-mode registers count as a user access even for any
- * potential system fault or CPU buglet:
- */
- if (user_mode(regs)) {
- local_irq_enable();
- flags |= FAULT_FLAG_USER;
- } else {
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_enable();
+ /* Legacy check - remove this after verifying that it doesn't trigger */
+ if (WARN_ON_ONCE(!(regs->flags & X86_EFLAGS_IF))) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
 }
+ local_irq_enable();
+
 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
 /*
@@ -1332,6 +1322,14 @@
 if (error_code & X86_PF_INSTR)
 flags |= FAULT_FLAG_INSTRUCTION;
 
+ /*
+ * We set FAULT_FLAG_USER based on the register state, not
+ * based on X86_PF_USER. User space accesses that cause
+ * system page faults are still user accesses.
+ */
+ if (user_mode(regs))
+ flags |= FAULT_FLAG_USER;
+
 #ifdef CONFIG_X86_64
 /*
 * Faults in the vsyscall page might need emulation. The
@@ -1518,8 +1516,10 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code,
 
 DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
 {
- unsigned long address = read_cr2();
 irqentry_state_t state;
+ unsigned long address;
+
+ address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2();
 
 prefetchw(&current->mm->mmap_lock);
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index 968d7005f4a7..f50cc210a981 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -26,18 +26,31 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
 for (; addr < end; addr = next) {
 pud_t *pud = pud_page + pud_index(addr);
 pmd_t *pmd;
+ bool use_gbpage;
 
 next = (addr & PUD_MASK) + PUD_SIZE;
 if (next > end)
 next = end;
 
- if (info->direct_gbpages) {
- pud_t pudval;
+ /* if this is already a gbpage, this portion is already mapped */
+ if (pud_large(*pud))
+ continue;
+
+ /* Is using a gbpage allowed? */
+ use_gbpage = info->direct_gbpages;
 
- if (pud_present(*pud))
- continue;
+ /* Don't use gbpage if it maps more than the requested region. */
+ /* at the begining: */
+ use_gbpage &= ((addr & ~PUD_MASK) == 0);
+ /* ... or at the end: */
+ use_gbpage &= ((next & ~PUD_MASK) == 0);
+
+ /* Never overwrite existing mappings */
+ use_gbpage &= !pud_present(*pud);
+
+ if (use_gbpage) {
+ pud_t pudval;
 
- addr &= PUD_MASK;
 pudval = __pud((addr - info->offset) | info->page_flag);
 set_pud(pud, pudval);
 continue;
diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c
index 6993f026adec..42115ac079cf 100644
--- a/arch/x86/mm/maccess.c
+++ b/arch/x86/mm/maccess.c
@@ -3,6 +3,8 @@
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
 
+#include <asm/vsyscall.h>
+
 #ifdef CONFIG_X86_64
 bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
 {
@@ -16,6 +18,14 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
 return false;
 
 /*
+ * Reading from the vsyscall page may cause an unhandled fault in
+ * certain cases. Though it is at an address above TASK_SIZE_MAX, it is
+ * usually considered as a user space address.
+ */
+ if (is_vsyscall_vaddr(vaddr))
+ return false;
+
+ /*
 * Allow everything during early boot before 'x86_virt_bits'
 * is initialized. Needed for instruction decoding in early
 * exception handlers.
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index c290c55b632b..6f3b3e028718 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -14,6 +14,8 @@
 #include <linux/mem_encrypt.h>
 #include <linux/virtio_anchor.h>
 
+#include <asm/sev.h>
+
 /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
 bool force_dma_unencrypted(struct device *dev)
 {
@@ -42,38 +44,45 @@ bool force_dma_unencrypted(struct device *dev)
 
 static void print_mem_encrypt_feature_info(void)
 {
- pr_info("Memory Encryption Features active:");
-
- if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
- pr_cont(" Intel TDX\n");
- return;
- }
+ pr_info("Memory Encryption Features active: ");
 
- pr_cont(" AMD");
+ switch (cc_vendor) {
+ case CC_VENDOR_INTEL:
+ pr_cont("Intel TDX\n");
+ break;
+ case CC_VENDOR_AMD:
+ pr_cont("AMD");
 
- /* Secure Memory Encryption */
- if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
+ /* Secure Memory Encryption */
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
 /*
 * SME is mutually exclusive with any of the SEV
 * features below.
- */
- pr_cont(" SME\n");
- return;
- }
+ */
+ pr_cont(" SME\n");
+ return;
+ }
 
- /* Secure Encrypted Virtualization */
- if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
- pr_cont(" SEV");
+ /* Secure Encrypted Virtualization */
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ pr_cont(" SEV");
+
+ /* Encrypted Register State */
+ if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
+ pr_cont(" SEV-ES");
 
- /* Encrypted Register State */
- if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
- pr_cont(" SEV-ES");
+ /* Secure Nested Paging */
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ pr_cont(" SEV-SNP");
 
- /* Secure Nested Paging */
- if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
- pr_cont(" SEV-SNP");
+ pr_cont("\n");
 
- pr_cont("\n");
+ sev_show_status();
+
+ break;
+ default:
+ pr_cont("Unknown\n");
+ }
 }
 
 /* Architecture __weak replacement functions */
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index d73aeb16417f..64b5005d49e5 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -41,9 +41,9 @@
 #include <linux/mem_encrypt.h>
 #include <linux/cc_platform.h>
 
+#include <asm/init.h>
 #include <asm/setup.h>
 #include <asm/sections.h>
-#include <asm/cmdline.h>
 #include <asm/coco.h>
 #include <asm/sev.h>
 
@@ -95,11 +95,7 @@ struct sme_populate_pgd_data {
 */
 static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch");
 
-static char sme_cmdline_arg[] __initdata = "mem_encrypt";
-static char sme_cmdline_on[] __initdata = "on";
-static char sme_cmdline_off[] __initdata = "off";
-
-static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
+static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd)
 {
 unsigned long pgd_start, pgd_end, pgd_size;
 pgd_t *pgd_p;
@@ -114,7 +110,7 @@ static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
 memset(pgd_p, 0, pgd_size);
 }
 
-static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
+static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
 {
 pgd_t *pgd;
 p4d_t *p4d;
@@ -151,7 +147,7 @@ static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
 return pud;
 }
 
-static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
+static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
 {
 pud_t *pud;
 pmd_t *pmd;
@@ -167,7 +163,7 @@ static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
 set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags));
 }
 
-static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
+static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd)
 {
 pud_t *pud;
 pmd_t *pmd;
@@ -193,7 +189,7 @@ static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
 set_pte(pte, __pte(ppd->paddr | ppd->pte_flags));
 }
 
-static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
+static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
 {
 while (ppd->vaddr < ppd->vaddr_end) {
 sme_populate_pgd_large(ppd);
@@ -203,7 +199,7 @@ static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
 }
 }
 
-static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
+static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
 {
 while (ppd->vaddr < ppd->vaddr_end) {
 sme_populate_pgd(ppd);
@@ -213,7 +209,7 @@ static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
 }
 }
 
-static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
+static void __head __sme_map_range(struct sme_populate_pgd_data *ppd,
 pmdval_t pmd_flags, pteval_t pte_flags)
 {
 unsigned long vaddr_end;
@@ -237,22 +233,22 @@ static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
 __sme_map_range_pte(ppd);
 }
 
-static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
+static void __head sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
 {
 __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
 }
 
-static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
+static void __head sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
 {
 __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
 }
 
-static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
+static void __head sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
 {
 __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
 }
 
-static unsigned long __init sme_pgtable_calc(unsigned long len)
+static unsigned long __head sme_pgtable_calc(unsigned long len)
 {
 unsigned long entries = 0, tables = 0;
 
@@ -289,7 +285,7 @@ static unsigned long __init sme_pgtable_calc(unsigned long len)
 return entries + tables;
 }
 
-void __init sme_encrypt_kernel(struct boot_params *bp)
+void __head sme_encrypt_kernel(struct boot_params *bp)
 {
 unsigned long workarea_start, workarea_end, workarea_len;
 unsigned long execute_start, execute_end, execute_len;
@@ -305,7 +301,8 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
 * instrumentation or checking boot_cpu_data in the cc_platform_has()
 * function.
 */
- if (!sme_get_me_mask() || sev_status & MSR_AMD64_SEV_ENABLED)
+ if (!sme_get_me_mask() ||
+ RIP_REL_REF(sev_status) & MSR_AMD64_SEV_ENABLED)
 return;
 
 /*
@@ -323,9 +320,8 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
 * memory from being cached.
 */
 
- /* Physical addresses gives us the identity mapped virtual addresses */
- kernel_start = __pa_symbol(_text);
- kernel_end = ALIGN(__pa_symbol(_end), PMD_SIZE);
+ kernel_start = (unsigned long)RIP_REL_REF(_text);
+ kernel_end = ALIGN((unsigned long)RIP_REL_REF(_end), PMD_SIZE);
 kernel_len = kernel_end - kernel_start;
 
 initrd_start = 0;
@@ -343,14 +339,6 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
 #endif
 
 /*
- * We're running identity mapped, so we must obtain the address to the
- * SME encryption workarea using rip-relative addressing.
- */
- asm ("lea sme_workarea(%%rip), %0"
- : "=r" (workarea_start)
- : "p" (sme_workarea));
-
- /*
 * Calculate required number of workarea bytes needed:
 * executable encryption area size:
 * stack page (PAGE_SIZE)
 * encryption routine page (PAGE_SIZE)
 * intermediate copy buffer (PMD_SIZE)
 * pagetable structures for the encryption of the kernel
 * pagetable structures for workarea (in case not currently mapped)
 */
- execute_start = workarea_start;
+ execute_start = workarea_start = (unsigned long)RIP_REL_REF(sme_workarea);
 execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE;
 execute_len = execute_end - execute_start;
 
@@ -502,14 +490,11 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
 native_write_cr3(__native_read_cr3());
 }
 
-void __init sme_enable(struct boot_params *bp)
+void __head sme_enable(struct boot_params *bp)
 {
- const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
 unsigned int eax, ebx, ecx, edx;
 unsigned long feature_mask;
- bool active_by_default;
 unsigned long me_mask;
- char buffer[16];
 bool snp;
 u64 msr;
 
@@ -543,15 +528,18 @@ void __init sme_enable(struct boot_params *bp)
 me_mask = 1UL << (ebx & 0x3f);
 
 /* Check the SEV MSR whether SEV or SME is enabled */
- sev_status = __rdmsr(MSR_AMD64_SEV);
- feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
+ RIP_REL_REF(sev_status) = msr = __rdmsr(MSR_AMD64_SEV);
+ feature_mask = (msr & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
 
 /* The SEV-SNP CC blob should never be present unless SEV-SNP is enabled. */
- if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ if (snp && !(msr & MSR_AMD64_SEV_SNP_ENABLED))
 snp_abort();
 
 /* Check if memory encryption is enabled */
 if (feature_mask == AMD_SME_BIT) {
+ if (!(bp->hdr.xloadflags & XLF_MEM_ENCRYPTION))
+ return;
+
 /*
 * No SME if Hypervisor bit is set. This check is here to
 * prevent a guest from trying to enable SME. For running as a
@@ -571,48 +559,10 @@ void __init sme_enable(struct boot_params *bp)
 msr = __rdmsr(MSR_AMD64_SYSCFG);
 if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
 return;
- } else {
- /* SEV state cannot be controlled by a command line option */
- sme_me_mask = me_mask;
- goto out;
 }
 
- /*
- * Fixups have not been applied to phys_base yet and we're running
- * identity mapped, so we must obtain the address to the SME command
- * line argument data using rip-relative addressing.
- */
- asm ("lea sme_cmdline_arg(%%rip), %0"
- : "=r" (cmdline_arg)
- : "p" (sme_cmdline_arg));
- asm ("lea sme_cmdline_on(%%rip), %0"
- : "=r" (cmdline_on)
- : "p" (sme_cmdline_on));
- asm ("lea sme_cmdline_off(%%rip), %0"
- : "=r" (cmdline_off)
- : "p" (sme_cmdline_off));
-
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
- active_by_default = true;
- else
- active_by_default = false;
-
- cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
- ((u64)bp->ext_cmd_line_ptr << 32));
-
- if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0)
- return;
-
- if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
- sme_me_mask = me_mask;
- else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
- sme_me_mask = 0;
- else
- sme_me_mask = active_by_default ? me_mask : 0;
-out:
- if (sme_me_mask) {
- physical_mask &= ~sme_me_mask;
- cc_vendor = CC_VENDOR_AMD;
- cc_set_mask(sme_me_mask);
- }
+ RIP_REL_REF(sme_me_mask) = me_mask;
+ physical_mask &= ~me_mask;
+ cc_vendor = CC_VENDOR_AMD;
+ cc_set_mask(me_mask);
 }
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index adc497b93f03..65e9a6e391c0 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -934,7 +934,7 @@ static int __init cmp_memblk(const void *a, const void *b)
 const struct numa_memblk *ma = *(const struct numa_memblk **)a;
 const struct numa_memblk *mb = *(const struct numa_memblk **)b;
 
- return ma->start - mb->start;
+ return (ma->start > mb->start) - (ma->start < mb->start);
 }
 
 static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;
@@ -944,14 +944,12 @@ static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;
 * @start: address to begin fill
 * @end: address to end fill
 *
- * Find and extend numa_meminfo memblks to cover the @start-@end
- * physical address range, such that the first memblk includes
- * @start, the last memblk includes @end, and any gaps in between
- * are filled.
+ * Find and extend numa_meminfo memblks to cover the physical
+ * address range @start-@end
 *
 * RETURNS:
 * 0 : Success
- * NUMA_NO_MEMBLK : No memblk exists in @start-@end range
+ * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
 */
 int __init numa_fill_memblks(u64 start, u64 end)
@@ -963,17 +961,14 @@ int __init numa_fill_memblks(u64 start, u64 end)
 
 /*
 * Create a list of pointers to numa_meminfo memblks that
- * overlap start, end. Exclude (start == bi->end) since
- * end addresses in both a CFMWS range and a memblk range
- * are exclusive.
- *
- * This list of pointers is used to make in-place changes
- * that fill out the numa_meminfo memblks.
+ * overlap start, end. The list is used to make in-place
+ * changes that fill out the numa_meminfo memblks.
 */
 for (int i = 0; i < mi->nr_blks; i++) {
 struct numa_memblk *bi = &mi->blk[i];
 
- if (start < bi->end && end >= bi->start) {
+ if (memblock_addrs_overlap(start, end - start, bi->start,
+ bi->end - bi->start)) {
 blk[count] = &mi->blk[i];
 count++;
 }
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 0904d7e8e126..0d72183b5dd0 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -240,6 +240,8 @@ void pat_cpu_init(void)
 }
 
 wrmsrl(MSR_IA32_CR_PAT, pat_msr_val);
+
+ __flush_tlb_all();
 }
 
 /**
@@ -296,13 +298,8 @@ void __init pat_bp_init(void)
 /*
 * Xen PV doesn't allow to set PAT MSR, but all cache modes are
 * supported.
- * When running as TDX guest setting the PAT MSR won't work either
- * due to the requirement to set CR0.CD when doing so. Rely on
- * firmware to have set the PAT MSR correctly.
 */
- if (pat_disabled ||
- cpu_feature_enabled(X86_FEATURE_XENPV) ||
- cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
+ if (pat_disabled || cpu_feature_enabled(X86_FEATURE_XENPV)) {
 init_cache_modes(pat_msr_val);
 return;
 }
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index e9b448d1b1b7..e5b2985a7c51 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -755,10 +755,14 @@ pmd_t *lookup_pmd_address(unsigned long address)
 * areas on 32-bit NUMA systems. The percpu areas can
 * end up in this kind of memory, for instance.
 *
- * This could be optimized, but it is only intended to be
- * used at initialization time, and keeping it
- * unoptimized should increase the testing coverage for
- * the more obscure platforms.
+ * Note that as long as the PTEs are well-formed with correct PFNs, this
+ * works without checking the PRESENT bit in the leaf PTE. This is unlike
+ * the similar vmalloc_to_page() and derivatives. Callers may depend on
+ * this behavior.
+ *
+ * This could be optimized, but it is only used in paths that are not perf
+ * sensitive, and keeping it unoptimized should increase the testing coverage
+ * for the more obscure platforms.
 */
 phys_addr_t slow_virt_to_phys(void *__virt_addr)
 {
@@ -2041,17 +2045,12 @@ int set_mce_nospec(unsigned long pfn)
 return rc;
 }
 
-static int set_memory_p(unsigned long *addr, int numpages)
-{
- return change_page_attr_set(addr, numpages, __pgprot(_PAGE_PRESENT), 0);
-}
-
 /* Restore full speculative operation to the pfn. */
 int clear_mce_nospec(unsigned long pfn)
 {
 unsigned long addr = (unsigned long) pfn_to_kaddr(pfn);
 
- return set_memory_p(&addr, 1);
+ return set_memory_p(addr, 1);
 }
 EXPORT_SYMBOL_GPL(clear_mce_nospec);
 #endif /* CONFIG_X86_64 */
@@ -2104,6 +2103,11 @@ int set_memory_np_noalias(unsigned long addr, int numpages)
 CPA_NO_CHECK_ALIAS, NULL);
 }
 
+int set_memory_p(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
+}
+
 int set_memory_4k(unsigned long addr, int numpages)
 {
 return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
@@ -2153,7 +2157,7 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
 
 /* Notify hypervisor that we are about to set/clr encryption attribute. */
 if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc))
- return -EIO;
+ goto vmm_fail;
 
 ret = __change_page_attr_set_clr(&cpa, 1);
 
@@ -2166,13 +2170,20 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
 */
 cpa_flush(&cpa, 0);
 
+ if (ret)
+ return ret;
+
 /* Notify hypervisor that we have successfully set/clr encryption attribute. */
- if (!ret) {
- if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc))
- ret = -EIO;
- }
+ if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc))
+ goto vmm_fail;
 
- return ret;
+ return 0;
+
+vmm_fail:
+ WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s.\n",
+ (void *)addr, numpages, enc ? "private" : "shared");
+
+ return -EIO;
 }
 
 static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 0cbc1b8e8e3d..cceb779d882d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -293,7 +293,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 for (i = 0; i < PREALLOCATED_PMDS; i++)
 mop_up_one_pmd(mm, &pgdp[i]);
 
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
 if (!boot_cpu_has(X86_FEATURE_PTI))
 return;
@@ -325,7 +325,7 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 }
 }
 
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
 static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
 pgd_t *k_pgd, pmd_t *pmds[])
 {
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5768d386efab..4af930947380 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -89,10 +89,10 @@
 #define CR3_HW_ASID_BITS 12
 
 /*
- * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
+ * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
 * user/kernel switches
 */
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
 # define PTI_CONSUMED_PCID_BITS 1
 #else
 # define PTI_CONSUMED_PCID_BITS 0
@@ -114,7 +114,7 @@ static inline u16 kern_pcid(u16 asid)
 {
 VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
 
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
 /*
 * Make sure that the dynamic ASID space does not conflict with the
 * bit we are using to switch between user and kernel ASIDs.
@@ -149,7 +149,7 @@ static inline u16 kern_pcid(u16 asid)
 static inline u16 user_pcid(u16 asid)
 {
 u16 ret = kern_pcid(asid);
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
 ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
 #endif
 return ret;
@@ -262,7 +262,7 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 static inline void invalidate_user_asid(u16 asid)
 {
 /* There is no user ASID if address space separation is off */
- if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+ if (!IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
 return;
 
 /*