diff options
Diffstat (limited to 'arch/x86')
267 files changed, 7379 insertions, 5030 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7422db409770..66bfabae8814 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -102,6 +102,7 @@ config X86 select ARCH_HAS_DEBUG_WX select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO @@ -128,7 +129,8 @@ config X86 select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_HUGE_PMD_SHARE select ARCH_WANT_LD_ORPHAN_WARN - select ARCH_WANT_OPTIMIZE_VMEMMAP if X86_64 + select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP if X86_64 + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if X86_64 select ARCH_WANTS_THP_SWAP if X86_64 select ARCH_HAS_PARANOID_L1D_FLUSH select BUILDTIME_TABLE_SORT @@ -1308,44 +1310,8 @@ config X86_REBOOTFIXUPS Say N otherwise. config MICROCODE - bool "CPU microcode loading support" - default y + def_bool y depends on CPU_SUP_AMD || CPU_SUP_INTEL - help - If you say Y here, you will be able to update the microcode on - Intel and AMD processors. The Intel support is for the IA32 family, - e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, Xeon etc. The - AMD support is for families 0x10 and later. You will obviously need - the actual microcode binary data itself which is not shipped with - the Linux kernel. - - The preferred method to load microcode from a detached initrd is described - in Documentation/arch/x86/microcode.rst. For that you need to enable - CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the - initrd for microcode blobs. - - In addition, you can build the microcode into the kernel. For that you - need to add the vendor-supplied microcode to the CONFIG_EXTRA_FIRMWARE - config option. - -config MICROCODE_INTEL - bool "Intel microcode loading support" - depends on CPU_SUP_INTEL && MICROCODE - default MICROCODE - help - This options enables microcode patch loading support for Intel - processors. - - For the current Intel microcode data package go to - <https://downloadcenter.intel.com> and search for - 'Linux Processor Microcode Data File'. - -config MICROCODE_AMD - bool "AMD microcode loading support" - depends on CPU_SUP_AMD && MICROCODE - help - If you select this option, microcode patch loading support for AMD - processors will be enabled. config MICROCODE_LATE_LOADING bool "Late microcode loading (DANGEROUS)" @@ -1849,6 +1815,11 @@ config CC_HAS_IBT (CC_IS_CLANG && CLANG_VERSION >= 140000)) && \ $(as-instr,endbr64) +config X86_CET + def_bool n + help + CET features configured (Shadow stack or IBT) + config X86_KERNEL_IBT prompt "Indirect Branch Tracking" def_bool y @@ -1856,6 +1827,7 @@ config X86_KERNEL_IBT # https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f depends on !LD_IS_LLD || LLD_VERSION >= 140000 select OBJTOOL + select X86_CET help Build the kernel with support for Indirect Branch Tracking, a hardware support course-grain forward-edge Control Flow Integrity @@ -1949,12 +1921,31 @@ config X86_SGX If unsure, say N. +config X86_USER_SHADOW_STACK + bool "X86 userspace shadow stack" + depends on AS_WRUSS + depends on X86_64 + select ARCH_USES_HIGH_VMA_FLAGS + select X86_CET + help + Shadow stack protection is a hardware feature that detects function + return address corruption. This helps mitigate ROP attacks. + Applications must be enabled to use it, and old userspace does not + get protection "for free". + + CPUs supporting shadow stacks were first released in 2020. + + See Documentation/arch/x86/shstk.rst for more information. + + If unsure, say N. + config EFI bool "EFI runtime service support" depends on ACPI select UCS2_STRING select EFI_RUNTIME_WRAPPERS select ARCH_USE_MEMREMAP_PROT + select EFI_RUNTIME_MAP if KEXEC_CORE help This enables the kernel to use EFI runtime services that are available (such as the EFI variable services). @@ -2030,7 +2021,6 @@ config EFI_MAX_FAKE_MEM config EFI_RUNTIME_MAP bool "Export EFI runtime maps to sysfs" if EXPERT depends on EFI - default KEXEC_CORE help Export EFI runtime memory regions to /sys/firmware/efi/runtime-map. That memory map is required by the 2nd kernel to set up EFI virtual @@ -2040,88 +2030,37 @@ config EFI_RUNTIME_MAP source "kernel/Kconfig.hz" -config KEXEC - bool "kexec system call" - select KEXEC_CORE - help - kexec is a system call that implements the ability to shutdown your - current kernel, and to start another kernel. It is like a reboot - but it is independent of the system firmware. And like a reboot - you can start any kernel with it, not just Linux. - - The name comes from the similarity to the exec system call. +config ARCH_SUPPORTS_KEXEC + def_bool y - It is an ongoing process to be certain the hardware in a machine - is properly shutdown, so do not be surprised if this code does not - initially work for you. As of this writing the exact hardware - interface is strongly in flux, so no good recommendation can be - made. +config ARCH_SUPPORTS_KEXEC_FILE + def_bool X86_64 && CRYPTO && CRYPTO_SHA256 -config KEXEC_FILE - bool "kexec file based system call" - select KEXEC_CORE +config ARCH_SELECTS_KEXEC_FILE + def_bool y + depends on KEXEC_FILE select HAVE_IMA_KEXEC if IMA - depends on X86_64 - depends on CRYPTO=y - depends on CRYPTO_SHA256=y - help - This is new version of kexec system call. This system call is - file based and takes file descriptors as system call argument - for kernel and initramfs as opposed to list of segments as - accepted by previous system call. -config ARCH_HAS_KEXEC_PURGATORY +config ARCH_SUPPORTS_KEXEC_PURGATORY def_bool KEXEC_FILE -config KEXEC_SIG - bool "Verify kernel signature during kexec_file_load() syscall" - depends on KEXEC_FILE - help - - This option makes the kexec_file_load() syscall check for a valid - signature of the kernel image. The image can still be loaded without - a valid signature unless you also enable KEXEC_SIG_FORCE, though if - there's a signature that we can check, then it must be valid. +config ARCH_SUPPORTS_KEXEC_SIG + def_bool y - In addition to this option, you need to enable signature - verification for the corresponding kernel image type being - loaded in order for this to work. +config ARCH_SUPPORTS_KEXEC_SIG_FORCE + def_bool y -config KEXEC_SIG_FORCE - bool "Require a valid signature in kexec_file_load() syscall" - depends on KEXEC_SIG - help - This option makes kernel signature verification mandatory for - the kexec_file_load() syscall. +config ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG + def_bool y -config KEXEC_BZIMAGE_VERIFY_SIG - bool "Enable bzImage signature verification support" - depends on KEXEC_SIG - depends on SIGNED_PE_FILE_VERIFICATION - select SYSTEM_TRUSTED_KEYRING - help - Enable bzImage signature verification support. +config ARCH_SUPPORTS_KEXEC_JUMP + def_bool y -config CRASH_DUMP - bool "kernel crash dumps" - depends on X86_64 || (X86_32 && HIGHMEM) - help - Generate crash dump after being started by kexec. - This should be normally only set in special crash dump kernels - which are loaded in the main kernel with kexec-tools into - a specially reserved region and then later executed after - a crash by kdump/kexec. The crash dump kernel must be compiled - to a memory address not used by the main kernel or BIOS using - PHYSICAL_START, or it must be built as a relocatable image - (CONFIG_RELOCATABLE=y). - For more details see Documentation/admin-guide/kdump/kdump.rst +config ARCH_SUPPORTS_CRASH_DUMP + def_bool X86_64 || (X86_32 && HIGHMEM) -config KEXEC_JUMP - bool "kexec jump" - depends on KEXEC && HIBERNATION - help - Jump between original kernel and kexeced kernel and invoke - code in physical address mode via KEXEC +config ARCH_SUPPORTS_CRASH_HOTPLUG + def_bool y config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP) @@ -2593,6 +2532,13 @@ config CPU_IBRS_ENTRY This mitigates both spectre_v2 and retbleed at great cost to performance. +config CPU_SRSO + bool "Mitigate speculative RAS overflow on AMD" + depends on CPU_SUP_AMD && X86_64 && RETHUNK + default y + help + Enable the SRSO mitigation needed on AMD Zen1-4 machines. + config SLS bool "Mitigate Straight-Line-Speculation" depends on CC_HAS_SLS && X86_64 @@ -2603,15 +2549,31 @@ config SLS against straight line speculation. The kernel image might be slightly larger. +config GDS_FORCE_MITIGATION + bool "Force GDS Mitigation" + depends on CPU_SUP_INTEL + default n + help + Gather Data Sampling (GDS) is a hardware vulnerability which allows + unprivileged speculative access to data which was previously stored in + vector registers. + + This option is equivalent to setting gather_data_sampling=force on the + command line. The microcode mitigation is used if present, otherwise + AVX is disabled as a mitigation. On affected systems that are missing + the microcode any userspace code that unconditionally uses AVX will + break with this option set. + + Setting this option on systems not vulnerable to GDS has no effect. + + If in doubt, say N. + endif config ARCH_HAS_ADD_PAGES def_bool y depends on ARCH_ENABLE_MEMORY_HOTPLUG -config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE - def_bool y - menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler index b88f784cb02e..8ad41da301e5 100644 --- a/arch/x86/Kconfig.assembler +++ b/arch/x86/Kconfig.assembler @@ -24,3 +24,8 @@ config AS_GFNI def_bool $(as-instr,vgf2p8mulb %xmm0$(comma)%xmm1$(comma)%xmm2) help Supported by binutils >= 2.30 and LLVM integrated assembler + +config AS_WRUSS + def_bool $(as-instr,wrussq %rax$(comma)(%rbx)) + help + Supported by binutils >= 2.31 and LLVM integrated assembler diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 95315d3474a2..5bfe5caaa444 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -335,9 +335,5 @@ define archhelp echo ' bzdisk/fdimage*/hdimage/isoimage also accept:' echo ' FDARGS="..." arguments for the booted kernel' echo ' FDINITRD=file initrd for the booted kernel' - echo '' - echo ' kvm_guest.config - Enable Kconfig items for running this kernel as a KVM guest' - echo ' xen.config - Enable Kconfig items for running this kernel as a Xen guest' - echo ' x86_debug.config - Enable tip tree debugging options for testing' endef diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 40d2ff503079..71fc531b95b4 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -74,6 +74,11 @@ LDFLAGS_vmlinux += -z noexecstack ifeq ($(CONFIG_LD_IS_BFD),y) LDFLAGS_vmlinux += $(call ld-option,--no-warn-rwx-segments) endif +ifeq ($(CONFIG_EFI_STUB),y) +# ensure that the static EFI stub library will be pulled in, even if it is +# never referenced explicitly from the startup code +LDFLAGS_vmlinux += -u efi_pe_entry +endif LDFLAGS_vmlinux += -T hostprogs := mkpiggy diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S index 4ca70bf93dc0..f4e22ef774ab 100644 --- a/arch/x86/boot/compressed/efi_mixed.S +++ b/arch/x86/boot/compressed/efi_mixed.S @@ -26,8 +26,8 @@ * When booting in 64-bit mode on 32-bit EFI firmware, startup_64_mixed_mode() * is the first thing that runs after switching to long mode. Depending on * whether the EFI handover protocol or the compat entry point was used to - * enter the kernel, it will either branch to the 64-bit EFI handover - * entrypoint at offset 0x390 in the image, or to the 64-bit EFI PE/COFF + * enter the kernel, it will either branch to the common 64-bit EFI stub + * entrypoint efi_stub_entry() directly, or via the 64-bit EFI PE/COFF * entrypoint efi_pe_entry(). In the former case, the bootloader must provide a * struct bootparams pointer as the third argument, so the presence of such a * pointer is used to disambiguate. @@ -37,21 +37,23 @@ * | efi32_pe_entry |---->| | | +-----------+--+ * +------------------+ | | +------+----------------+ | * | startup_32 |---->| startup_64_mixed_mode | | - * +------------------+ | | +------+----------------+ V - * | efi32_stub_entry |---->| | | +------------------+ - * +------------------+ +------------+ +---->| efi64_stub_entry | - * +-------------+----+ - * +------------+ +----------+ | - * | startup_64 |<----| efi_main |<--------------+ - * +------------+ +----------+ + * +------------------+ | | +------+----------------+ | + * | efi32_stub_entry |---->| | | | + * +------------------+ +------------+ | | + * V | + * +------------+ +----------------+ | + * | startup_64 |<----| efi_stub_entry |<--------+ + * +------------+ +----------------+ */ SYM_FUNC_START(startup_64_mixed_mode) lea efi32_boot_args(%rip), %rdx mov 0(%rdx), %edi mov 4(%rdx), %esi +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL mov 8(%rdx), %edx // saved bootparams pointer test %edx, %edx - jnz efi64_stub_entry + jnz efi_stub_entry +#endif /* * efi_pe_entry uses MS calling convention, which requires 32 bytes of * shadow space on the stack even if all arguments are passed in @@ -138,6 +140,28 @@ SYM_FUNC_START(__efi64_thunk) SYM_FUNC_END(__efi64_thunk) .code32 +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL +SYM_FUNC_START(efi32_stub_entry) + call 1f +1: popl %ecx + + /* Clear BSS */ + xorl %eax, %eax + leal (_bss - 1b)(%ecx), %edi + leal (_ebss - 1b)(%ecx), %ecx + subl %edi, %ecx + shrl $2, %ecx + cld + rep stosl + + add $0x4, %esp /* Discard return address */ + popl %ecx + popl %edx + popl %esi + jmp efi32_entry +SYM_FUNC_END(efi32_stub_entry) +#endif + /* * EFI service pointer must be in %edi. * @@ -218,7 +242,7 @@ SYM_FUNC_END(efi_enter32) * stub may still exit and return to the firmware using the Exit() EFI boot * service.] */ -SYM_FUNC_START(efi32_entry) +SYM_FUNC_START_LOCAL(efi32_entry) call 1f 1: pop %ebx @@ -245,10 +269,6 @@ SYM_FUNC_START(efi32_entry) jmp startup_32 SYM_FUNC_END(efi32_entry) -#define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime) -#define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol) -#define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base) - /* * efi_status_t efi32_pe_entry(efi_handle_t image_handle, * efi_system_table_32_t *sys_table) @@ -256,8 +276,6 @@ SYM_FUNC_END(efi32_entry) SYM_FUNC_START(efi32_pe_entry) pushl %ebp movl %esp, %ebp - pushl %eax // dummy push to allocate loaded_image - pushl %ebx // save callee-save registers pushl %edi @@ -266,48 +284,8 @@ SYM_FUNC_START(efi32_pe_entry) movl $0x80000003, %eax // EFI_UNSUPPORTED jnz 2f - call 1f -1: pop %ebx - - /* Get the loaded image protocol pointer from the image handle */ - leal -4(%ebp), %eax - pushl %eax // &loaded_image - leal (loaded_image_proto - 1b)(%ebx), %eax - pushl %eax // pass the GUID address - pushl 8(%ebp) // pass the image handle - - /* - * Note the alignment of the stack frame. - * sys_table - * handle <-- 16-byte aligned on entry by ABI - * return address - * frame pointer - * loaded_image <-- local variable - * saved %ebx <-- 16-byte aligned here - * saved %edi - * &loaded_image - * &loaded_image_proto - * handle <-- 16-byte aligned for call to handle_protocol - */ - - movl 12(%ebp), %eax // sys_table - movl ST32_boottime(%eax), %eax // sys_table->boottime - call *BS32_handle_protocol(%eax) // sys_table->boottime->handle_protocol - addl $12, %esp // restore argument space - testl %eax, %eax - jnz 2f - movl 8(%ebp), %ecx // image_handle movl 12(%ebp), %edx // sys_table - movl -4(%ebp), %esi // loaded_image - movl LI32_image_base(%esi), %esi // loaded_image->image_base - leal (startup_32 - 1b)(%ebx), %ebp // runtime address of startup_32 - /* - * We need to set the image_offset variable here since startup_32() will - * use it before we get to the 64-bit efi_pe_entry() in C code. - */ - subl %esi, %ebp // calculate image_offset - movl %ebp, (image_offset - 1b)(%ebx) // save image_offset xorl %esi, %esi jmp efi32_entry // pass %ecx, %edx, %esi // no other registers remain live @@ -318,14 +296,13 @@ SYM_FUNC_START(efi32_pe_entry) RET SYM_FUNC_END(efi32_pe_entry) - .section ".rodata" - /* EFI loaded image protocol GUID */ - .balign 4 -SYM_DATA_START_LOCAL(loaded_image_proto) - .long 0x5b1b31a1 - .word 0x9562, 0x11d2 - .byte 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b -SYM_DATA_END(loaded_image_proto) +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL + .org efi32_stub_entry + 0x200 + .code64 +SYM_FUNC_START_NOALIGN(efi64_stub_entry) + jmp efi_handover_entry +SYM_FUNC_END(efi64_stub_entry) +#endif .data .balign 8 diff --git a/arch/x86/boot/compressed/error.c b/arch/x86/boot/compressed/error.c index 5313c5cb2b80..19a8251de506 100644 --- a/arch/x86/boot/compressed/error.c +++ b/arch/x86/boot/compressed/error.c @@ -7,7 +7,7 @@ #include "misc.h" #include "error.h" -void warn(char *m) +void warn(const char *m) { error_putstr("\n\n"); error_putstr(m); diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h index 86fe33b93715..31f9e080d61a 100644 --- a/arch/x86/boot/compressed/error.h +++ b/arch/x86/boot/compressed/error.h @@ -4,7 +4,7 @@ #include <linux/compiler.h> -void warn(char *m); +void warn(const char *m); void error(char *m) __noreturn; void panic(const char *fmt, ...) __noreturn __cold; diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 987ae727cf9f..1cfe9802a42f 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -84,19 +84,6 @@ SYM_FUNC_START(startup_32) #ifdef CONFIG_RELOCATABLE leal startup_32@GOTOFF(%edx), %ebx - -#ifdef CONFIG_EFI_STUB -/* - * If we were loaded via the EFI LoadImage service, startup_32() will be at an - * offset to the start of the space allocated for the image. efi_pe_entry() will - * set up image_offset to tell us where the image actually starts, so that we - * can use the full available buffer. - * image_offset = startup_32 - image_base - * Otherwise image_offset will be zero and has no effect on the calculations. - */ - subl image_offset@GOTOFF(%edx), %ebx -#endif - movl BP_kernel_alignment(%esi), %eax decl %eax addl %eax, %ebx @@ -150,17 +137,6 @@ SYM_FUNC_START(startup_32) jmp *%eax SYM_FUNC_END(startup_32) -#ifdef CONFIG_EFI_STUB -SYM_FUNC_START(efi32_stub_entry) - add $0x4, %esp - movl 8(%esp), %esi /* save boot_params pointer */ - call efi_main - /* efi_main returns the possibly relocated address of startup_32 */ - jmp *%eax -SYM_FUNC_END(efi32_stub_entry) -SYM_FUNC_ALIAS(efi_stub_entry, efi32_stub_entry) -#endif - .text SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) @@ -179,13 +155,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) */ /* push arguments for extract_kernel: */ - pushl output_len@GOTOFF(%ebx) /* decompressed length, end of relocs */ pushl %ebp /* output address */ - pushl input_len@GOTOFF(%ebx) /* input_len */ - leal input_data@GOTOFF(%ebx), %eax - pushl %eax /* input_data */ - leal boot_heap@GOTOFF(%ebx), %eax - pushl %eax /* heap area */ pushl %esi /* real mode pointer */ call extract_kernel /* returns kernel entry point in %eax */ addl $24, %esp @@ -213,8 +183,6 @@ SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end) */ .bss .balign 4 -boot_heap: - .fill BOOT_HEAP_SIZE, 1, 0 boot_stack: .fill BOOT_STACK_SIZE, 1, 0 boot_stack_end: diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 03c4328a88cb..bf4a10a5794f 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -146,19 +146,6 @@ SYM_FUNC_START(startup_32) #ifdef CONFIG_RELOCATABLE movl %ebp, %ebx - -#ifdef CONFIG_EFI_STUB -/* - * If we were loaded via the EFI LoadImage service, startup_32 will be at an - * offset to the start of the space allocated for the image. efi_pe_entry will - * set up image_offset to tell us where the image actually starts, so that we - * can use the full available buffer. - * image_offset = startup_32 - image_base - * Otherwise image_offset will be zero and has no effect on the calculations. - */ - subl rva(image_offset)(%ebp), %ebx -#endif - movl BP_kernel_alignment(%esi), %eax decl %eax addl %eax, %ebx @@ -294,17 +281,6 @@ SYM_FUNC_START(startup_32) lret SYM_FUNC_END(startup_32) -#if IS_ENABLED(CONFIG_EFI_MIXED) && IS_ENABLED(CONFIG_EFI_HANDOVER_PROTOCOL) - .org 0x190 -SYM_FUNC_START(efi32_stub_entry) - add $0x4, %esp /* Discard return address */ - popl %ecx - popl %edx - popl %esi - jmp efi32_entry -SYM_FUNC_END(efi32_stub_entry) -#endif - .code64 .org 0x200 SYM_CODE_START(startup_64) @@ -346,20 +322,6 @@ SYM_CODE_START(startup_64) /* Start with the delta to where the kernel will run at. */ #ifdef CONFIG_RELOCATABLE leaq startup_32(%rip) /* - $startup_32 */, %rbp - -#ifdef CONFIG_EFI_STUB -/* - * If we were loaded via the EFI LoadImage service, startup_32 will be at an - * offset to the start of the space allocated for the image. efi_pe_entry will - * set up image_offset to tell us where the image actually starts, so that we - * can use the full available buffer. - * image_offset = startup_32 - image_base - * Otherwise image_offset will be zero and has no effect on the calculations. - */ - movl image_offset(%rip), %eax - subq %rax, %rbp -#endif - movl BP_kernel_alignment(%rsi), %eax decl %eax addq %rax, %rbp @@ -398,10 +360,6 @@ SYM_CODE_START(startup_64) * For the trampoline, we need the top page table to reside in lower * memory as we don't have a way to load 64-bit values into CR3 in * 32-bit mode. - * - * We go though the trampoline even if we don't have to: if we're - * already in a desired paging mode. This way the trampoline code gets - * tested on every boot. */ /* Make sure we have GDT with 32-bit code segment */ @@ -416,10 +374,14 @@ SYM_CODE_START(startup_64) lretq .Lon_kernel_cs: + /* + * RSI holds a pointer to a boot_params structure provided by the + * loader, and this needs to be preserved across C function calls. So + * move it into a callee saved register. + */ + movq %rsi, %r15 - pushq %rsi call load_stage1_idt - popq %rsi #ifdef CONFIG_AMD_MEM_ENCRYPT /* @@ -430,63 +392,24 @@ SYM_CODE_START(startup_64) * CPUID instructions being issued, so go ahead and do that now via * sev_enable(), which will also handle the rest of the SEV-related * detection/setup to ensure that has been done in advance of any dependent - * code. + * code. Pass the boot_params pointer as the first argument. */ - pushq %rsi - movq %rsi, %rdi /* real mode address */ + movq %r15, %rdi call sev_enable - popq %rsi #endif /* - * paging_prepare() sets up the trampoline and checks if we need to - * enable 5-level paging. + * configure_5level_paging() updates the number of paging levels using + * a trampoline in 32-bit addressable memory if the current number does + * not match the desired number. * - * paging_prepare() returns a two-quadword structure which lands - * into RDX:RAX: - * - Address of the trampoline is returned in RAX. - * - Non zero RDX means trampoline needs to enable 5-level - * paging. - * - * RSI holds real mode data and needs to be preserved across - * this function call. - */ - pushq %rsi - movq %rsi, %rdi /* real mode address */ - call paging_prepare - popq %rsi - - /* Save the trampoline address in RCX */ - movq %rax, %rcx - - /* - * Load the address of trampoline_return() into RDI. - * It will be used by the trampoline to return to the main code. + * Pass the boot_params pointer as the first argument. The second + * argument is the relocated address of the page table to use instead + * of the page table in trampoline memory (if required). */ - leaq trampoline_return(%rip), %rdi - - /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ - pushq $__KERNEL32_CS - leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax - pushq %rax - lretq -trampoline_return: - /* Restore the stack, the 32-bit trampoline uses its own stack */ - leaq rva(boot_stack_end)(%rbx), %rsp - - /* - * cleanup_trampoline() would restore trampoline memory. - * - * RDI is address of the page table to use instead of page table - * in trampoline memory (if required). - * - * RSI holds real mode data and needs to be preserved across - * this function call. - */ - pushq %rsi - leaq rva(top_pgtable)(%rbx), %rdi - call cleanup_trampoline - popq %rsi + movq %r15, %rdi + leaq rva(top_pgtable)(%rbx), %rsi + call configure_5level_paging /* Zero EFLAGS */ pushq $0 @@ -496,7 +419,6 @@ trampoline_return: * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. */ - pushq %rsi leaq (_bss-8)(%rip), %rsi leaq rva(_bss-8)(%rbx), %rdi movl $(_bss - startup_32), %ecx @@ -504,7 +426,6 @@ trampoline_return: std rep movsq cld - popq %rsi /* * The GDT may get overwritten either during the copy we just did or @@ -523,21 +444,6 @@ trampoline_return: jmp *%rax SYM_CODE_END(startup_64) -#ifdef CONFIG_EFI_STUB -#ifdef CONFIG_EFI_HANDOVER_PROTOCOL - .org 0x390 -#endif -SYM_FUNC_START(efi64_stub_entry) - and $~0xf, %rsp /* realign the stack */ - movq %rdx, %rbx /* save boot_params pointer */ - call efi_main - movq %rbx,%rsi - leaq rva(startup_64)(%rax), %rax - jmp *%rax -SYM_FUNC_END(efi64_stub_entry) -SYM_FUNC_ALIAS(efi_stub_entry, efi64_stub_entry) -#endif - .text SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) @@ -551,128 +457,122 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) shrq $3, %rcx rep stosq - pushq %rsi call load_stage2_idt /* Pass boot_params to initialize_identity_maps() */ - movq (%rsp), %rdi + movq %r15, %rdi call initialize_identity_maps - popq %rsi /* * Do the extraction, and jump to the new kernel.. */ - pushq %rsi /* Save the real mode argument */ - movq %rsi, %rdi /* real mode address */ - leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ - leaq input_data(%rip), %rdx /* input_data */ - movl input_len(%rip), %ecx /* input_len */ - movq %rbp, %r8 /* output target address */ - movl output_len(%rip), %r9d /* decompressed length, end of relocs */ + /* pass struct boot_params pointer and output target address */ + movq %r15, %rdi + movq %rbp, %rsi call extract_kernel /* returns kernel entry point in %rax */ - popq %rsi /* * Jump to the decompressed kernel. */ + movq %r15, %rsi jmp *%rax SYM_FUNC_END(.Lrelocated) - .code32 /* - * This is the 32-bit trampoline that will be copied over to low memory. + * This is the 32-bit trampoline that will be copied over to low memory. It + * will be called using the ordinary 64-bit calling convention from code + * running in 64-bit mode. * - * RDI contains the return address (might be above 4G). - * ECX contains the base address of the trampoline memory. - * Non zero RDX means trampoline needs to enable 5-level paging. + * Return address is at the top of the stack (might be above 4G). + * The first argument (EDI) contains the address of the temporary PGD level + * page table in 32-bit addressable memory which will be programmed into + * register CR3. */ + .section ".rodata", "a", @progbits SYM_CODE_START(trampoline_32bit_src) - /* Set up data and stack segments */ - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %ss + /* + * Preserve callee save 64-bit registers on the stack: this is + * necessary because the architecture does not guarantee that GPRs will + * retain their full 64-bit values across a 32-bit mode switch. + */ + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + /* Preserve top half of RSP in a legacy mode GPR to avoid truncation */ + movq %rsp, %rbx + shrq $32, %rbx + + /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ + pushq $__KERNEL32_CS + leaq 0f(%rip), %rax + pushq %rax + lretq - /* Set up new stack */ - leal TRAMPOLINE_32BIT_STACK_END(%ecx), %esp + /* + * The 32-bit code below will do a far jump back to long mode and end + * up here after reconfiguring the number of paging levels. First, the + * stack pointer needs to be restored to its full 64-bit value before + * the callee save register contents can be popped from the stack. + */ +.Lret: + shlq $32, %rbx + orq %rbx, %rsp + + /* Restore the preserved 64-bit registers */ + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + retq + .code32 +0: /* Disable paging */ movl %cr0, %eax btrl $X86_CR0_PG_BIT, %eax movl %eax, %cr0 - /* Check what paging mode we want to be in after the trampoline */ - testl %edx, %edx - jz 1f - - /* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */ - movl %cr4, %eax - testl $X86_CR4_LA57, %eax - jnz 3f - jmp 2f -1: - /* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */ - movl %cr4, %eax - testl $X86_CR4_LA57, %eax - jz 3f -2: /* Point CR3 to the trampoline's new top level page table */ - leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax - movl %eax, %cr3 -3: + movl %edi, %cr3 + /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */ - pushl %ecx - pushl %edx movl $MSR_EFER, %ecx rdmsr btsl $_EFER_LME, %eax /* Avoid writing EFER if no change was made (for TDX guest) */ jc 1f wrmsr -1: popl %edx - popl %ecx - -#ifdef CONFIG_X86_MCE - /* - * Preserve CR4.MCE if the kernel will enable #MC support. - * Clearing MCE may fault in some environments (that also force #MC - * support). Any machine check that occurs before #MC support is fully - * configured will crash the system regardless of the CR4.MCE value set - * here. - */ - movl %cr4, %eax - andl $X86_CR4_MCE, %eax -#else - movl $0, %eax -#endif - - /* Enable PAE and LA57 (if required) paging modes */ - orl $X86_CR4_PAE, %eax - testl %edx, %edx - jz 1f - orl $X86_CR4_LA57, %eax 1: + /* Toggle CR4.LA57 */ + movl %cr4, %eax + btcl $X86_CR4_LA57_BIT, %eax movl %eax, %cr4 - /* Calculate address of paging_enabled() once we are executing in the trampoline */ - leal .Lpaging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax - - /* Prepare the stack for far return to Long Mode */ - pushl $__KERNEL_CS - pushl %eax - /* Enable paging again. */ movl %cr0, %eax btsl $X86_CR0_PG_BIT, %eax movl %eax, %cr0 - lret + /* + * Return to the 64-bit calling code using LJMP rather than LRET, to + * avoid the need for a 32-bit addressable stack. The destination + * address will be adjusted after the template code is copied into a + * 32-bit addressable buffer. + */ +.Ljmp: ljmpl $__KERNEL_CS, $(.Lret - trampoline_32bit_src) SYM_CODE_END(trampoline_32bit_src) - .code64 -SYM_FUNC_START_LOCAL_NOALIGN(.Lpaging_enabled) - /* Return from the trampoline */ - jmp *%rdi -SYM_FUNC_END(.Lpaging_enabled) +/* + * This symbol is placed right after trampoline_32bit_src() so its address can + * be used to infer the size of the trampoline code. + */ +SYM_DATA(trampoline_ljmp_imm_offset, .word .Ljmp + 1 - trampoline_32bit_src) /* * The trampoline code has a size limit. @@ -681,7 +581,7 @@ SYM_FUNC_END(.Lpaging_enabled) */ .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE - .code32 + .text SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode) /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ 1: @@ -726,8 +626,6 @@ SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end) */ .bss .balign 4 -SYM_DATA_LOCAL(boot_heap, .fill BOOT_HEAP_SIZE, 1, 0) - SYM_DATA_START_LOCAL(boot_stack) .fill BOOT_STACK_SIZE, 1, 0 .balign 16 diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index bcc956c17872..08f93b0401bb 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -59,6 +59,14 @@ static void *alloc_pgt_page(void *context) return NULL; } + /* Consumed more tables than expected? */ + if (pages->pgt_buf_offset == BOOT_PGT_SIZE_WARN) { + debug_putstr("pgt_buf running low in " __FILE__ "\n"); + debug_putstr("Need to raise BOOT_PGT_SIZE?\n"); + debug_putaddr(pages->pgt_buf_offset); + debug_putaddr(pages->pgt_buf_size); + } + entry = pages->pgt_buf + pages->pgt_buf_offset; pages->pgt_buf_offset += PAGE_SIZE; diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c index 6debb816e83d..3cdf94b41456 100644 --- a/arch/x86/boot/compressed/idt_64.c +++ b/arch/x86/boot/compressed/idt_64.c @@ -63,7 +63,14 @@ void load_stage2_idt(void) set_idt_entry(X86_TRAP_PF, boot_page_fault); #ifdef CONFIG_AMD_MEM_ENCRYPT - set_idt_entry(X86_TRAP_VC, boot_stage2_vc); + /* + * Clear the second stage #VC handler in case guest types + * needing #VC have not been detected. + */ + if (sev_status & BIT(1)) + set_idt_entry(X86_TRAP_VC, boot_stage2_vc); + else + set_idt_entry(X86_TRAP_VC, NULL); #endif load_boot_idt(&boot_idt_desc); diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 94b7abcf624b..f711f2a85862 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -330,6 +330,33 @@ static size_t parse_elf(void *output) return ehdr.e_entry - LOAD_PHYSICAL_ADDR; } +const unsigned long kernel_total_size = VO__end - VO__text; + +static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4); + +extern unsigned char input_data[]; +extern unsigned int input_len, output_len; + +unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, + void (*error)(char *x)) +{ + unsigned long entry; + + if (!free_mem_ptr) { + free_mem_ptr = (unsigned long)boot_heap; + free_mem_end_ptr = (unsigned long)boot_heap + sizeof(boot_heap); + } + + if (__decompress(input_data, input_len, NULL, NULL, outbuf, output_len, + NULL, error) < 0) + return ULONG_MAX; + + entry = parse_elf(outbuf); + handle_relocations(outbuf, output_len, virt_addr); + + return entry; +} + /* * The compressed kernel image (ZO), has been moved so that its position * is against the end of the buffer used to hold the uncompressed kernel @@ -347,14 +374,10 @@ static size_t parse_elf(void *output) * |-------uncompressed kernel image---------| * */ -asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, - unsigned char *input_data, - unsigned long input_len, - unsigned char *output, - unsigned long output_len) +asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) { - const unsigned long kernel_total_size = VO__end - VO__text; unsigned long virt_addr = LOAD_PHYSICAL_ADDR; + memptr heap = (memptr)boot_heap; unsigned long needed_size; size_t entry_offset; @@ -412,7 +435,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, * entries. This ensures the full mapped area is usable RAM * and doesn't include any reserved areas. */ - needed_size = max(output_len, kernel_total_size); + needed_size = max_t(unsigned long, output_len, kernel_total_size); #ifdef CONFIG_X86_64 needed_size = ALIGN(needed_size, MIN_KERNEL_ALIGN); #endif @@ -443,7 +466,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, #ifdef CONFIG_X86_64 if (heap > 0x3fffffffffffUL) error("Destination address too large"); - if (virt_addr + max(output_len, kernel_total_size) > KERNEL_IMAGE_SIZE) + if (virt_addr + needed_size > KERNEL_IMAGE_SIZE) error("Destination virtual address is beyond the kernel mapping area"); #else if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff)) @@ -461,10 +484,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, accept_memory(__pa(output), __pa(output) + needed_size); } - __decompress(input_data, input_len, NULL, NULL, output, output_len, - NULL, error); - entry_offset = parse_elf(output); - handle_relocations(output, output_len, virt_addr); + entry_offset = decompress_kernel(output, virt_addr, error); debug_putstr("done.\nBooting the kernel (entry_offset: 0x"); debug_puthex(entry_offset); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 964fe903a1cd..cc70d3fb9049 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -179,9 +179,7 @@ static inline int count_immovable_mem_regions(void) { return 0; } #endif /* ident_map_64.c */ -#ifdef CONFIG_X86_5LEVEL extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d; -#endif extern void kernel_add_identity_map(unsigned long start, unsigned long end); /* Used by PAGE_KERN* macros: */ diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h index cc9b2529a086..6d595abe06b3 100644 --- a/arch/x86/boot/compressed/pgtable.h +++ b/arch/x86/boot/compressed/pgtable.h @@ -3,18 +3,16 @@ #define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE) -#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0 - #define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE -#define TRAMPOLINE_32BIT_CODE_SIZE 0x80 - -#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE +#define TRAMPOLINE_32BIT_CODE_SIZE 0xA0 #ifndef __ASSEMBLER__ extern unsigned long *trampoline_32bit; -extern void trampoline_32bit_src(void *return_ptr); +extern void trampoline_32bit_src(void *trampoline, bool enable_5lvl); + +extern const u16 trampoline_ljmp_imm_offset; #endif /* __ASSEMBLER__ */ #endif /* BOOT_COMPRESSED_PAGETABLE_H */ diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 2ac12ff4111b..7939eb6e6ce9 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -16,11 +16,6 @@ unsigned int __section(".data") pgdir_shift = 39; unsigned int __section(".data") ptrs_per_p4d = 1; #endif -struct paging_config { - unsigned long trampoline_start; - unsigned long l5_required; -}; - /* Buffer to preserve trampoline memory */ static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; @@ -29,7 +24,7 @@ static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; * purposes. * * Avoid putting the pointer into .bss as it will be cleared between - * paging_prepare() and extract_kernel(). + * configure_5level_paging() and extract_kernel(). */ unsigned long *trampoline_32bit __section(".data"); @@ -106,12 +101,13 @@ static unsigned long find_trampoline_placement(void) return bios_start - TRAMPOLINE_32BIT_SIZE; } -struct paging_config paging_prepare(void *rmode) +asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable) { - struct paging_config paging_config = {}; + void (*toggle_la57)(void *cr3); + bool l5_required = false; /* Initialize boot_params. Required for cmdline_find_option_bool(). */ - boot_params = rmode; + boot_params = bp; /* * Check if LA57 is desired and supported. @@ -129,12 +125,22 @@ struct paging_config paging_prepare(void *rmode) !cmdline_find_option_bool("no5lvl") && native_cpuid_eax(0) >= 7 && (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) { - paging_config.l5_required = 1; + l5_required = true; + + /* Initialize variables for 5-level paging */ + __pgtable_l5_enabled = 1; + pgdir_shift = 48; + ptrs_per_p4d = 512; } - paging_config.trampoline_start = find_trampoline_placement(); + /* + * The trampoline will not be used if the paging mode is already set to + * the desired one. + */ + if (l5_required == !!(native_read_cr4() & X86_CR4_LA57)) + return; - trampoline_32bit = (unsigned long *)paging_config.trampoline_start; + trampoline_32bit = (unsigned long *)find_trampoline_placement(); /* Preserve trampoline memory */ memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE); @@ -143,32 +149,32 @@ struct paging_config paging_prepare(void *rmode) memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE); /* Copy trampoline code in place */ - memcpy(trampoline_32bit + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long), + toggle_la57 = memcpy(trampoline_32bit + + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long), &trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE); /* + * Avoid the need for a stack in the 32-bit trampoline code, by using + * LJMP rather than LRET to return back to long mode. LJMP takes an + * immediate absolute address, which needs to be adjusted based on the + * placement of the trampoline. + */ + *(u32 *)((u8 *)toggle_la57 + trampoline_ljmp_imm_offset) += + (unsigned long)toggle_la57; + + /* * The code below prepares page table in trampoline memory. * * The new page table will be used by trampoline code for switching * from 4- to 5-level paging or vice versa. - * - * If switching is not required, the page table is unused: trampoline - * code wouldn't touch CR3. - */ - - /* - * We are not going to use the page table in trampoline memory if we - * are already in the desired paging mode. */ - if (paging_config.l5_required == !!(native_read_cr4() & X86_CR4_LA57)) - goto out; - if (paging_config.l5_required) { + if (l5_required) { /* * For 4- to 5-level paging transition, set up current CR3 as * the first and the only entry in a new top-level page table. */ - trampoline_32bit[TRAMPOLINE_32BIT_PGTABLE_OFFSET] = __native_read_cr3() | _PAGE_TABLE_NOENC; + *trampoline_32bit = __native_read_cr3() | _PAGE_TABLE_NOENC; } else { unsigned long src; @@ -181,38 +187,17 @@ struct paging_config paging_prepare(void *rmode) * may be above 4G. */ src = *(unsigned long *)__native_read_cr3() & PAGE_MASK; - memcpy(trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long), - (void *)src, PAGE_SIZE); + memcpy(trampoline_32bit, (void *)src, PAGE_SIZE); } -out: - return paging_config; -} - -void cleanup_trampoline(void *pgtable) -{ - void *trampoline_pgtable; - - trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long); + toggle_la57(trampoline_32bit); /* - * Move the top level page table out of trampoline memory, - * if it's there. + * Move the top level page table out of trampoline memory. */ - if ((void *)__native_read_cr3() == trampoline_pgtable) { - memcpy(pgtable, trampoline_pgtable, PAGE_SIZE); - native_write_cr3((unsigned long)pgtable); - } + memcpy(pgtable, trampoline_32bit, PAGE_SIZE); + native_write_cr3((unsigned long)pgtable); /* Restore trampoline memory */ memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE); - - /* Initialize variables for 5-level paging */ -#ifdef CONFIG_X86_5LEVEL - if (__read_cr4() & X86_CR4_LA57) { - __pgtable_l5_enabled = 1; - pgdir_shift = 48; - ptrs_per_p4d = 512; - } -#endif } diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 09dc8c187b3c..dc8c876fbd8f 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -365,22 +365,27 @@ static void enforce_vmpl0(void) * by the guest kernel. As and when a new feature is implemented in the * guest kernel, a corresponding bit should be added to the mask. */ -#define SNP_FEATURES_PRESENT (0) +#define SNP_FEATURES_PRESENT MSR_AMD64_SNP_DEBUG_SWAP + +u64 snp_get_unsupported_features(u64 status) +{ + if (!(status & MSR_AMD64_SEV_SNP_ENABLED)) + return 0; + + return status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT; +} void snp_check_features(void) { u64 unsupported; - if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) - return; - /* * Terminate the boot if hypervisor has enabled any feature lacking * guest side implementation. Pass on the unsupported features mask through * EXIT_INFO_2 of the GHCB protocol so that those features can be reported * as part of the guest boot failure. */ - unsupported = sev_status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT; + unsupported = snp_get_unsupported_features(sev_status); if (unsupported) { if (ghcb_version < 2 || (!boot_ghcb && !early_setup_ghcb())) sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); @@ -390,32 +395,22 @@ void snp_check_features(void) } } -void sev_enable(struct boot_params *bp) +/* + * sev_check_cpu_support - Check for SEV support in the CPU capabilities + * + * Returns < 0 if SEV is not supported, otherwise the position of the + * encryption bit in the page table descriptors. + */ +static int sev_check_cpu_support(void) { unsigned int eax, ebx, ecx, edx; - struct msr m; - bool snp; - - /* - * bp->cc_blob_address should only be set by boot/compressed kernel. - * Initialize it to 0 to ensure that uninitialized values from - * buggy bootloaders aren't propagated. - */ - if (bp) - bp->cc_blob_address = 0; - - /* - * Setup/preliminary detection of SNP. This will be sanity-checked - * against CPUID/MSR values later. - */ - snp = snp_init(bp); /* Check for the SME/SEV support leaf */ eax = 0x80000000; ecx = 0; native_cpuid(&eax, &ebx, &ecx, &edx); if (eax < 0x8000001f) - return; + return -ENODEV; /* * Check for the SME/SEV feature: @@ -429,7 +424,48 @@ void sev_enable(struct boot_params *bp) ecx = 0; native_cpuid(&eax, &ebx, &ecx, &edx); /* Check whether SEV is supported */ - if (!(eax & BIT(1))) { + if (!(eax & BIT(1))) + return -ENODEV; + + return ebx & 0x3f; +} + +void sev_enable(struct boot_params *bp) +{ + struct msr m; + int bitpos; + bool snp; + + /* + * bp->cc_blob_address should only be set by boot/compressed kernel. + * Initialize it to 0 to ensure that uninitialized values from + * buggy bootloaders aren't propagated. + */ + if (bp) + bp->cc_blob_address = 0; + + /* + * Do an initial SEV capability check before snp_init() which + * loads the CPUID page and the same checks afterwards are done + * without the hypervisor and are trustworthy. + * + * If the HV fakes SEV support, the guest will crash'n'burn + * which is good enough. + */ + + if (sev_check_cpu_support() < 0) + return; + + /* + * Setup/preliminary detection of SNP. This will be sanity-checked + * against CPUID/MSR values later. + */ + snp = snp_init(bp); + + /* Now repeat the checks with the SNP CPUID table. */ + + bitpos = sev_check_cpu_support(); + if (bitpos < 0) { if (snp) error("SEV-SNP support indicated by CC blob, but not CPUID."); return; @@ -461,7 +497,24 @@ void sev_enable(struct boot_params *bp) if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) error("SEV-SNP supported indicated by CC blob, but not SEV status MSR."); - sme_me_mask = BIT_ULL(ebx & 0x3f); + sme_me_mask = BIT_ULL(bitpos); +} + +/* + * sev_get_status - Retrieve the SEV status mask + * + * Returns 0 if the CPU is not SEV capable, otherwise the value of the + * AMD64_SEV MSR. + */ +u64 sev_get_status(void) +{ + struct msr m; + + if (sev_check_cpu_support() < 0) + return 0; + + boot_rdmsr(MSR_AMD64_SEV, &m); + return m.q; } /* Search for Confidential Computing blob in the EFI config table. */ diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 3cf34912abfe..1b411bbf3cb0 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -33,7 +33,6 @@ CONFIG_HYPERVISOR_GUEST=y CONFIG_PARAVIRT=y CONFIG_NR_CPUS=8 CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y -CONFIG_MICROCODE_AMD=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y CONFIG_X86_CHECK_BIOS_CORRUPTION=y @@ -245,7 +244,7 @@ CONFIG_QUOTA=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set CONFIG_QFMT_V2=y -CONFIG_AUTOFS4_FS=y +CONFIG_AUTOFS_FS=y CONFIG_ISO9660_FS=y CONFIG_JOLIET=y CONFIG_ZISOFS=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 27759236fd60..409e9182bd29 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -31,7 +31,6 @@ CONFIG_SMP=y CONFIG_HYPERVISOR_GUEST=y CONFIG_PARAVIRT=y CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y -CONFIG_MICROCODE_AMD=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y CONFIG_NUMA=y @@ -242,7 +241,7 @@ CONFIG_QUOTA=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set CONFIG_QFMT_V2=y -CONFIG_AUTOFS4_FS=y +CONFIG_AUTOFS_FS=y CONFIG_ISO9660_FS=y CONFIG_JOLIET=y CONFIG_ZISOFS=y diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index a5b0cb3efeba..39d6a62ac627 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -229,10 +229,9 @@ static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) return (struct crypto_aes_ctx *)ALIGN(addr, align); } -static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx, +static int aes_set_key_common(struct crypto_aes_ctx *ctx, const u8 *in_key, unsigned int key_len) { - struct crypto_aes_ctx *ctx = aes_ctx(raw_ctx); int err; if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 && @@ -253,7 +252,8 @@ static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx, static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { - return aes_set_key_common(tfm, crypto_tfm_ctx(tfm), in_key, key_len); + return aes_set_key_common(aes_ctx(crypto_tfm_ctx(tfm)), in_key, + key_len); } static void aesni_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) @@ -285,8 +285,7 @@ static void aesni_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) static int aesni_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int len) { - return aes_set_key_common(crypto_skcipher_tfm(tfm), - crypto_skcipher_ctx(tfm), key, len); + return aes_set_key_common(aes_ctx(crypto_skcipher_ctx(tfm)), key, len); } static int ecb_encrypt(struct skcipher_request *req) @@ -627,8 +626,7 @@ static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce)); - return aes_set_key_common(crypto_aead_tfm(aead), - &ctx->aes_key_expanded, key, key_len) ?: + return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?: rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); } @@ -893,14 +891,13 @@ static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, keylen /= 2; /* first half of xts-key is for crypt */ - err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx, - key, keylen); + err = aes_set_key_common(aes_ctx(ctx->raw_crypt_ctx), key, keylen); if (err) return err; /* second half of xts-key is for tweak */ - return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx, - key + keylen, keylen); + return aes_set_key_common(aes_ctx(ctx->raw_tweak_ctx), key + keylen, + keylen); } static int xts_crypt(struct skcipher_request *req, bool encrypt) @@ -1150,8 +1147,7 @@ static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key, { struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead); - return aes_set_key_common(crypto_aead_tfm(aead), - &ctx->aes_key_expanded, key, key_len) ?: + return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?: rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); } diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 91f6818884fa..43606de22511 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -285,7 +285,15 @@ SYM_FUNC_END(__switch_to_asm) */ .pushsection .text, "ax" SYM_CODE_START(ret_from_fork_asm) - UNWIND_HINT_REGS + /* + * This is the start of the kernel stack; even through there's a + * register set at the top, the regset isn't necessarily coherent + * (consider kthreads) and one cannot unwind further. + * + * This ensures stack unwinds of kernel threads terminate in a known + * good state. + */ + UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR // copy_thread CALL_DEPTH_ACCOUNT @@ -295,6 +303,12 @@ SYM_CODE_START(ret_from_fork_asm) movq %r12, %rcx /* fn_arg */ call ret_from_fork + /* + * Set the stack state to what is expected for the target function + * -- at this point the register set should be a valid user set + * and unwind should work normally. + */ + UNWIND_HINT_REGS jmp swapgs_restore_regs_and_return_to_usermode SYM_CODE_END(ret_from_fork_asm) .popsection diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index bc0a3c941b35..2d0b1bd866ea 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -456,3 +456,4 @@ 449 i386 futex_waitv sys_futex_waitv 450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node 451 i386 cachestat sys_cachestat +452 i386 fchmodat2 sys_fchmodat2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 227538b0ce80..1d6eee30eceb 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -373,6 +373,8 @@ 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common cachestat sys_cachestat +452 common fchmodat2 sys_fchmodat2 +453 64 map_shadow_stack sys_map_shadow_stack # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 11a5c68d1218..7645730dc228 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -299,8 +299,8 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) /* Round the lowest possible end address up to a PMD boundary. */ end = (start + len + PMD_SIZE - 1) & PMD_MASK; - if (end >= TASK_SIZE_MAX) - end = TASK_SIZE_MAX; + if (end >= DEFAULT_MAP_WINDOW) + end = DEFAULT_MAP_WINDOW; end -= len; if (end > start) { diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 371014802191..6911c5399d02 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -156,8 +156,8 @@ perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width) * count to the generic event atomically: */ prev_raw_count = local64_read(&hwc->prev_count); - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) + if (!local64_try_cmpxchg(&hwc->prev_count, + &prev_raw_count, new_raw_count)) return 0; /* @@ -247,11 +247,33 @@ int forward_event_to_ibs(struct perf_event *event) return -ENOENT; } +/* + * Grouping of IBS events is not possible since IBS can have only + * one event active at any point in time. + */ +static int validate_group(struct perf_event *event) +{ + struct perf_event *sibling; + + if (event->group_leader == event) + return 0; + + if (event->group_leader->pmu == event->pmu) + return -EINVAL; + + for_each_sibling_event(sibling, event->group_leader) { + if (sibling->pmu == event->pmu) + return -EINVAL; + } + return 0; +} + static int perf_ibs_init(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs; u64 max_cnt, config; + int ret; perf_ibs = get_ibs_pmu(event->attr.type); if (!perf_ibs) @@ -265,6 +287,10 @@ static int perf_ibs_init(struct perf_event *event) if (config & ~perf_ibs->config_mask) return -EINVAL; + ret = validate_group(event); + if (ret) + return ret; + if (hwc->sample_period) { if (config & perf_ibs->cnt_mask) /* raw max_cnt may not be set */ @@ -702,38 +728,63 @@ static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2) return op_data2->data_src_lo; } -static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2, - union ibs_op_data3 *op_data3, - struct perf_sample_data *data) +#define L(x) (PERF_MEM_S(LVL, x) | PERF_MEM_S(LVL, HIT)) +#define LN(x) PERF_MEM_S(LVLNUM, x) +#define REM PERF_MEM_S(REMOTE, REMOTE) +#define HOPS(x) PERF_MEM_S(HOPS, x) + +static u64 g_data_src[8] = { + [IBS_DATA_SRC_LOC_CACHE] = L(L3) | L(REM_CCE1) | LN(ANY_CACHE) | HOPS(0), + [IBS_DATA_SRC_DRAM] = L(LOC_RAM) | LN(RAM), + [IBS_DATA_SRC_REM_CACHE] = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1), + [IBS_DATA_SRC_IO] = L(IO) | LN(IO), +}; + +#define RMT_NODE_BITS (1 << IBS_DATA_SRC_DRAM) +#define RMT_NODE_APPLICABLE(x) (RMT_NODE_BITS & (1 << x)) + +static u64 g_zen4_data_src[32] = { + [IBS_DATA_SRC_EXT_LOC_CACHE] = L(L3) | LN(L3), + [IBS_DATA_SRC_EXT_NEAR_CCX_CACHE] = L(REM_CCE1) | LN(ANY_CACHE) | REM | HOPS(0), + [IBS_DATA_SRC_EXT_DRAM] = L(LOC_RAM) | LN(RAM), + [IBS_DATA_SRC_EXT_FAR_CCX_CACHE] = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1), + [IBS_DATA_SRC_EXT_PMEM] = LN(PMEM), + [IBS_DATA_SRC_EXT_IO] = L(IO) | LN(IO), + [IBS_DATA_SRC_EXT_EXT_MEM] = LN(CXL), +}; + +#define ZEN4_RMT_NODE_BITS ((1 << IBS_DATA_SRC_EXT_DRAM) | \ + (1 << IBS_DATA_SRC_EXT_PMEM) | \ + (1 << IBS_DATA_SRC_EXT_EXT_MEM)) +#define ZEN4_RMT_NODE_APPLICABLE(x) (ZEN4_RMT_NODE_BITS & (1 << x)) + +static __u64 perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2, + union ibs_op_data3 *op_data3, + struct perf_sample_data *data) { union perf_mem_data_src *data_src = &data->data_src; u8 ibs_data_src = perf_ibs_data_src(op_data2); data_src->mem_lvl = 0; + data_src->mem_lvl_num = 0; /* * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached * memory accesses. So, check DcUcMemAcc bit early. */ - if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) { - data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT; - return; - } + if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) + return L(UNC) | LN(UNC); /* L1 Hit */ - if (op_data3->dc_miss == 0) { - data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; - return; - } + if (op_data3->dc_miss == 0) + return L(L1) | LN(L1); /* L2 Hit */ if (op_data3->l2_miss == 0) { /* Erratum #1293 */ if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF || - !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) { - data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; - return; - } + !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) + return L(L2) | LN(L2); } /* @@ -743,82 +794,36 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2, if (data_src->mem_op != PERF_MEM_OP_LOAD) goto check_mab; - /* L3 Hit */ if (ibs_caps & IBS_CAPS_ZEN4) { - if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE) { - data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; - return; - } - } else { - if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) { - data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_REM_CCE1 | - PERF_MEM_LVL_HIT; - return; - } - } + u64 val = g_zen4_data_src[ibs_data_src]; - /* A peer cache in a near CCX */ - if (ibs_caps & IBS_CAPS_ZEN4 && - ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE) { - data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT; - return; - } + if (!val) + goto check_mab; - /* A peer cache in a far CCX */ - if (ibs_caps & IBS_CAPS_ZEN4) { - if (ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) { - data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT; - return; + /* HOPS_1 because IBS doesn't provide remote socket detail */ + if (op_data2->rmt_node && ZEN4_RMT_NODE_APPLICABLE(ibs_data_src)) { + if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM) + val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1); + else + val |= REM | HOPS(1); } - } else { - if (ibs_data_src == IBS_DATA_SRC_REM_CACHE) { - data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT; - return; - } - } - /* DRAM */ - if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM) { - if (op_data2->rmt_node == 0) - data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT; - else - data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT; - return; - } + return val; + } else { + u64 val = g_data_src[ibs_data_src]; - /* PMEM */ - if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_PMEM) { - data_src->mem_lvl_num = PERF_MEM_LVLNUM_PMEM; - if (op_data2->rmt_node) { - data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; - /* IBS doesn't provide Remote socket detail */ - data_src->mem_hops = PERF_MEM_HOPS_1; - } - return; - } + if (!val) + goto check_mab; - /* Extension Memory */ - if (ibs_caps & IBS_CAPS_ZEN4 && - ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) { - data_src->mem_lvl_num = PERF_MEM_LVLNUM_CXL; - if (op_data2->rmt_node) { - data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; - /* IBS doesn't provide Remote socket detail */ - data_src->mem_hops = PERF_MEM_HOPS_1; + /* HOPS_1 because IBS doesn't provide remote socket detail */ + if (op_data2->rmt_node && RMT_NODE_APPLICABLE(ibs_data_src)) { + if (ibs_data_src == IBS_DATA_SRC_DRAM) + val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1); + else + val |= REM | HOPS(1); } - return; - } - /* IO */ - if (ibs_data_src == IBS_DATA_SRC_EXT_IO) { - data_src->mem_lvl = PERF_MEM_LVL_IO; - data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO; - if (op_data2->rmt_node) { - data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; - /* IBS doesn't provide Remote socket detail */ - data_src->mem_hops = PERF_MEM_HOPS_1; - } - return; + return val; } check_mab: @@ -829,12 +834,11 @@ check_mab: * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set * MAB only when IBS fails to provide DataSrc. */ - if (op_data3->dc_miss_no_mab_alloc) { - data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT; - return; - } + if (op_data3->dc_miss_no_mab_alloc) + return L(LFB) | LN(LFB); - data_src->mem_lvl = PERF_MEM_LVL_NA; + /* Don't set HIT with NA */ + return PERF_MEM_S(LVL, NA) | LN(NA); } static bool perf_ibs_cache_hit_st_valid(void) @@ -924,7 +928,9 @@ static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data, union ibs_op_data2 *op_data2, union ibs_op_data3 *op_data3) { - perf_ibs_get_mem_lvl(op_data2, op_data3, data); + union perf_mem_data_src *data_src = &data->data_src; + + data_src->val |= perf_ibs_get_mem_lvl(op_data2, op_data3, data); perf_ibs_get_mem_snoop(op_data2, data); perf_ibs_get_tlb_lvl(op_data3, data); perf_ibs_get_mem_lock(op_data3, data); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 9d248703cbdd..185f902e5f28 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -129,13 +129,11 @@ u64 x86_perf_event_update(struct perf_event *event) * exchange a new raw count - then add that new-prev delta * count to the generic event atomically: */ -again: prev_raw_count = local64_read(&hwc->prev_count); - rdpmcl(hwc->event_base_rdpmc, new_raw_count); - - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) - goto again; + do { + rdpmcl(hwc->event_base_rdpmc, new_raw_count); + } while (!local64_try_cmpxchg(&hwc->prev_count, + &prev_raw_count, new_raw_count)); /* * Now we have the new raw value and have updated the prev @@ -2168,7 +2166,6 @@ static int __init init_hw_perf_events(void) hybrid_pmu->pmu = pmu; hybrid_pmu->pmu.type = -1; hybrid_pmu->pmu.attr_update = x86_pmu.attr_update; - hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS; hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE; err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name, diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2a284ba951b7..fa355d3658a6 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2129,6 +2129,17 @@ static struct extra_reg intel_grt_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +EVENT_ATTR_STR(topdown-retiring, td_retiring_cmt, "event=0x72,umask=0x0"); +EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec_cmt, "event=0x73,umask=0x0"); + +static struct attribute *cmt_events_attrs[] = { + EVENT_PTR(td_fe_bound_tnt), + EVENT_PTR(td_retiring_cmt), + EVENT_PTR(td_bad_spec_cmt), + EVENT_PTR(td_be_bound_tnt), + NULL +}; + static struct extra_reg intel_cmt_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff3ffffffffffull, RSP_0), @@ -4847,6 +4858,8 @@ PMU_FORMAT_ATTR(ldlat, "config1:0-15"); PMU_FORMAT_ATTR(frontend, "config1:0-23"); +PMU_FORMAT_ATTR(snoop_rsp, "config1:0-63"); + static struct attribute *intel_arch3_formats_attr[] = { &format_attr_event.attr, &format_attr_umask.attr, @@ -4877,6 +4890,13 @@ static struct attribute *slm_format_attr[] = { NULL }; +static struct attribute *cmt_format_attr[] = { + &format_attr_offcore_rsp.attr, + &format_attr_ldlat.attr, + &format_attr_snoop_rsp.attr, + NULL +}; + static struct attribute *skl_format_attr[] = { &format_attr_frontend.attr, NULL, @@ -5656,7 +5676,6 @@ static struct attribute *adl_hybrid_extra_attr[] = { NULL }; -PMU_FORMAT_ATTR_SHOW(snoop_rsp, "config1:0-63"); FORMAT_ATTR_HYBRID(snoop_rsp, hybrid_small); static struct attribute *mtl_hybrid_extra_attr_rtm[] = { @@ -6174,7 +6193,7 @@ __init int intel_pmu_init(void) name = "Tremont"; break; - case INTEL_FAM6_ALDERLAKE_N: + case INTEL_FAM6_ATOM_GRACEMONT: x86_pmu.mid_ack = true; memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -6204,6 +6223,37 @@ __init int intel_pmu_init(void) name = "gracemont"; break; + case INTEL_FAM6_ATOM_CRESTMONT: + case INTEL_FAM6_ATOM_CRESTMONT_X: + x86_pmu.mid_ack = true; + memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_grt_pebs_event_constraints; + x86_pmu.extra_regs = intel_cmt_extra_regs; + + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.lbr_pt_coexist = true; + x86_pmu.pebs_block = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_INSTR_LATENCY; + + intel_pmu_pebs_data_source_cmt(); + x86_pmu.pebs_latency_data = mtl_latency_data_small; + x86_pmu.get_event_constraints = cmt_get_event_constraints; + x86_pmu.limit_period = spr_limit_period; + td_attr = cmt_events_attrs; + mem_attr = grt_mem_attrs; + extra_attr = cmt_format_attr; + pr_cont("Crestmont events, "); + name = "crestmont"; + break; + case INTEL_FAM6_WESTMERE: case INTEL_FAM6_WESTMERE_EP: case INTEL_FAM6_WESTMERE_EX: diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 835862c548cc..96fffb2d521d 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -365,13 +365,11 @@ static void cstate_pmu_event_update(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; u64 prev_raw_count, new_raw_count; -again: prev_raw_count = local64_read(&hwc->prev_count); - new_raw_count = cstate_pmu_read_counter(event); - - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) - goto again; + do { + new_raw_count = cstate_pmu_read_counter(event); + } while (!local64_try_cmpxchg(&hwc->prev_count, + &prev_raw_count, new_raw_count)); local64_add(new_raw_count - prev_raw_count, &event->count); } @@ -671,6 +669,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &glm_cstates), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &glm_cstates), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &glm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates), @@ -686,7 +685,6 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_cstates), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_cstates), diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index df88576d6b2a..eb8dd8b8a1e8 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -144,7 +144,7 @@ void __init intel_pmu_pebs_data_source_adl(void) __intel_pmu_pebs_data_source_grt(data_source); } -static void __init intel_pmu_pebs_data_source_cmt(u64 *data_source) +static void __init __intel_pmu_pebs_data_source_cmt(u64 *data_source) { data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD); data_source[0x08] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); @@ -164,7 +164,12 @@ void __init intel_pmu_pebs_data_source_mtl(void) data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source; memcpy(data_source, pebs_data_source, sizeof(pebs_data_source)); - intel_pmu_pebs_data_source_cmt(data_source); + __intel_pmu_pebs_data_source_cmt(data_source); +} + +void __init intel_pmu_pebs_data_source_cmt(void) +{ + __intel_pmu_pebs_data_source_cmt(pebs_data_source); } static u64 precise_store_data(u64 status) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index bc226603ef3e..69043e02e8a7 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1858,7 +1858,6 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_uncore_init), @@ -1867,6 +1866,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &spr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_uncore_init), {}, }; MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match); diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index d49e90dc04a4..8250f0f59c2b 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1502,7 +1502,7 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool pci_dev_put(ubox_dev); - return err ? pcibios_err_to_errno(err) : 0; + return pcibios_err_to_errno(err); } int snbep_uncore_pci_init(void) @@ -6474,8 +6474,18 @@ void spr_uncore_cpu_init(void) type = uncore_find_type_by_id(uncore_msr_uncores, UNCORE_SPR_CHA); if (type) { + /* + * The value from the discovery table (stored in the type->num_boxes + * of UNCORE_SPR_CHA) is incorrect on some SPR variants because of a + * firmware bug. Using the value from SPR_MSR_UNC_CBO_CONFIG to replace it. + */ rdmsrl(SPR_MSR_UNC_CBO_CONFIG, num_cbo); - type->num_boxes = num_cbo; + /* + * The MSR doesn't work on the EMR XCC, but the firmware bug doesn't impact + * the EMR XCC. Don't let the value from the MSR replace the existing value. + */ + if (num_cbo) + type->num_boxes = num_cbo; } spr_uncore_iio_free_running.num_boxes = uncore_type_max_boxes(uncore_msr_uncores, UNCORE_SPR_IIO); } diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 0feaaa571303..9e237b30f017 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -106,7 +106,7 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_ROCKETLAKE: case INTEL_FAM6_ALDERLAKE: case INTEL_FAM6_ALDERLAKE_L: - case INTEL_FAM6_ALDERLAKE_N: + case INTEL_FAM6_ATOM_GRACEMONT: case INTEL_FAM6_RAPTORLAKE: case INTEL_FAM6_RAPTORLAKE_P: case INTEL_FAM6_RAPTORLAKE_S: @@ -244,12 +244,10 @@ static void msr_event_update(struct perf_event *event) s64 delta; /* Careful, an NMI might modify the previous event value: */ -again: prev = local64_read(&event->hw.prev_count); - now = msr_read_counter(event); - - if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev) - goto again; + do { + now = msr_read_counter(event); + } while (!local64_try_cmpxchg(&event->hw.prev_count, &prev, now)); delta = now - prev; if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) { diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index d6de4487348c..c8ba2be7585d 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1606,6 +1606,8 @@ void intel_pmu_pebs_data_source_grt(void); void intel_pmu_pebs_data_source_mtl(void); +void intel_pmu_pebs_data_source_cmt(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 52e6e7ed4f78..1579429846cc 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -804,7 +804,7 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &model_skl), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &model_spr), X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &model_spr), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &model_skl), diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 1fbda2f94184..97bfe5f0531f 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -86,14 +86,14 @@ static void hv_apic_write(u32 reg, u32 val) } } -static void hv_apic_eoi_write(u32 reg, u32 val) +static void hv_apic_eoi_write(void) { struct hv_vp_assist_page *hvp = hv_vp_assist_page[smp_processor_id()]; if (hvp && (xchg(&hvp->apic_assist, 0) & 0x1)) return; - wrmsr(HV_X64_MSR_EOI, val, 0); + wrmsr(HV_X64_MSR_EOI, APIC_EOI_ACK, 0); } static bool cpu_is_self(int cpu) @@ -107,7 +107,6 @@ static bool cpu_is_self(int cpu) static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, bool exclude_self) { - struct hv_send_ipi_ex **arg; struct hv_send_ipi_ex *ipi_arg; unsigned long flags; int nr_bank = 0; @@ -117,9 +116,8 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, return false; local_irq_save(flags); - arg = (struct hv_send_ipi_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); + ipi_arg = *this_cpu_ptr(hyperv_pcpu_input_arg); - ipi_arg = *arg; if (unlikely(!ipi_arg)) goto ipi_mask_ex_done; @@ -177,8 +175,11 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector, (exclude_self && weight == 1 && cpumask_test_cpu(this_cpu, mask))) return true; - if (!hv_hypercall_pg) - return false; + /* A fully enlightened TDX VM uses GHCI rather than hv_hypercall_pg. */ + if (!hv_hypercall_pg) { + if (ms_hyperv.paravisor_present || !hv_isolation_type_tdx()) + return false; + } if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) return false; @@ -231,9 +232,15 @@ static bool __send_ipi_one(int cpu, int vector) trace_hyperv_send_ipi_one(cpu, vector); - if (!hv_hypercall_pg || (vp == VP_INVAL)) + if (vp == VP_INVAL) return false; + /* A fully enlightened TDX VM uses GHCI rather than hv_hypercall_pg. */ + if (!hv_hypercall_pg) { + if (ms_hyperv.paravisor_present || !hv_isolation_type_tdx()) + return false; + } + if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) return false; @@ -288,12 +295,12 @@ void __init hv_apic_init(void) */ orig_apic = *apic; - apic->send_IPI = hv_send_ipi; - apic->send_IPI_mask = hv_send_ipi_mask; - apic->send_IPI_mask_allbutself = hv_send_ipi_mask_allbutself; - apic->send_IPI_allbutself = hv_send_ipi_allbutself; - apic->send_IPI_all = hv_send_ipi_all; - apic->send_IPI_self = hv_send_ipi_self; + apic_update_callback(send_IPI, hv_send_ipi); + apic_update_callback(send_IPI_mask, hv_send_ipi_mask); + apic_update_callback(send_IPI_mask_allbutself, hv_send_ipi_mask_allbutself); + apic_update_callback(send_IPI_allbutself, hv_send_ipi_allbutself); + apic_update_callback(send_IPI_all, hv_send_ipi_all); + apic_update_callback(send_IPI_self, hv_send_ipi_self); } if (ms_hyperv.hints & HV_X64_APIC_ACCESS_RECOMMENDED) { @@ -310,12 +317,12 @@ void __init hv_apic_init(void) * lazy EOI when available, but the same accessor works for * both xapic and x2apic because the field layout is the same. */ - apic_set_eoi_write(hv_apic_eoi_write); + apic_update_callback(eoi, hv_apic_eoi_write); if (!x2apic_enabled()) { - apic->read = hv_apic_read; - apic->write = hv_apic_write; - apic->icr_write = hv_apic_icr_write; - apic->icr_read = hv_apic_icr_read; + apic_update_callback(read, hv_apic_read); + apic_update_callback(write, hv_apic_write); + apic_update_callback(icr_write, hv_apic_icr_write); + apic_update_callback(icr_read, hv_apic_icr_read); } } } diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 6c04b52f139b..783ed339f341 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -14,10 +14,12 @@ #include <asm/apic.h> #include <asm/desc.h> #include <asm/sev.h> +#include <asm/ibt.h> #include <asm/hypervisor.h> #include <asm/hyperv-tlfs.h> #include <asm/mshyperv.h> #include <asm/idtentry.h> +#include <asm/set_memory.h> #include <linux/kexec.h> #include <linux/version.h> #include <linux/vmalloc.h> @@ -51,7 +53,7 @@ static int hyperv_init_ghcb(void) void *ghcb_va; void **ghcb_base; - if (!hv_isolation_type_snp()) + if (!ms_hyperv.paravisor_present || !hv_isolation_type_snp()) return 0; if (!hv_ghcb_pg) @@ -79,7 +81,7 @@ static int hyperv_init_ghcb(void) static int hv_cpu_init(unsigned int cpu) { union hv_vp_assist_msr_contents msr = { 0 }; - struct hv_vp_assist_page **hvp = &hv_vp_assist_page[cpu]; + struct hv_vp_assist_page **hvp; int ret; ret = hv_common_cpu_init(cpu); @@ -89,6 +91,7 @@ static int hv_cpu_init(unsigned int cpu) if (!hv_vp_assist_page) return 0; + hvp = &hv_vp_assist_page[cpu]; if (hv_root_partition) { /* * For root partition we get the hypervisor provided VP assist @@ -106,8 +109,21 @@ static int hv_cpu_init(unsigned int cpu) * in hv_cpu_die(), otherwise a CPU may not be stopped in the * case of CPU offlining and the VM will hang. */ - if (!*hvp) + if (!*hvp) { *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); + + /* + * Hyper-V should never specify a VM that is a Confidential + * VM and also running in the root partition. Root partition + * is blocked to run in Confidential VM. So only decrypt assist + * page in non-root partition here. + */ + if (*hvp && !ms_hyperv.paravisor_present && hv_isolation_type_snp()) { + WARN_ON_ONCE(set_memory_decrypted((unsigned long)(*hvp), 1)); + memset(*hvp, 0, PAGE_SIZE); + } + } + if (*hvp) msr.pfn = vmalloc_to_pfn(*hvp); @@ -161,7 +177,7 @@ static inline bool hv_reenlightenment_available(void) DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_reenlightenment) { - ack_APIC_irq(); + apic_eoi(); inc_irq_stat(irq_hv_reenlightenment_count); schedule_delayed_work(&hv_reenlightenment_work, HZ/10); } @@ -378,6 +394,36 @@ static void __init hv_get_partition_id(void) local_irq_restore(flags); } +static u8 __init get_vtl(void) +{ + u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS; + struct hv_get_vp_registers_input *input; + struct hv_get_vp_registers_output *output; + unsigned long flags; + u64 ret; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = (struct hv_get_vp_registers_output *)input; + + memset(input, 0, struct_size(input, element, 1)); + input->header.partitionid = HV_PARTITION_ID_SELF; + input->header.vpindex = HV_VP_INDEX_SELF; + input->header.inputvtl = 0; + input->element[0].name0 = HV_X64_REGISTER_VSM_VP_STATUS; + + ret = hv_do_hypercall(control, input, output); + if (hv_result_success(ret)) { + ret = output->as64.low & HV_X64_VTL_MASK; + } else { + pr_err("Failed to get VTL(%lld) and set VTL to zero by default.\n", ret); + ret = 0; + } + + local_irq_restore(flags); + return ret; +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -398,14 +444,24 @@ void __init hyperv_init(void) if (hv_common_init()) return; - hv_vp_assist_page = kcalloc(num_possible_cpus(), - sizeof(*hv_vp_assist_page), GFP_KERNEL); + /* + * The VP assist page is useless to a TDX guest: the only use we + * would have for it is lazy EOI, which can not be used with TDX. + */ + if (hv_isolation_type_tdx()) + hv_vp_assist_page = NULL; + else + hv_vp_assist_page = kcalloc(num_possible_cpus(), + sizeof(*hv_vp_assist_page), + GFP_KERNEL); if (!hv_vp_assist_page) { ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; - goto common_free; + + if (!hv_isolation_type_tdx()) + goto common_free; } - if (hv_isolation_type_snp()) { + if (ms_hyperv.paravisor_present && hv_isolation_type_snp()) { /* Negotiate GHCB Version. */ if (!hv_ghcb_negotiate_protocol()) hv_ghcb_terminate(SEV_TERM_SET_GEN, @@ -425,12 +481,32 @@ void __init hyperv_init(void) * Setup the hypercall page and enable hypercalls. * 1. Register the guest ID * 2. Enable the hypercall and register the hypercall page + * + * A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg: + * when the hypercall input is a page, such a VM must pass a decrypted + * page to Hyper-V, e.g. hv_post_message() uses the per-CPU page + * hyperv_pcpu_input_arg, which is decrypted if no paravisor is present. + * + * A TDX VM with the paravisor uses hv_hypercall_pg for most hypercalls, + * which are handled by the paravisor and the VM must use an encrypted + * input page: in such a VM, the hyperv_pcpu_input_arg is encrypted and + * used in the hypercalls, e.g. see hv_mark_gpa_visibility() and + * hv_arch_irq_unmask(). Such a VM uses TDX GHCI for two hypercalls: + * 1. HVCALL_SIGNAL_EVENT: see vmbus_set_event() and _hv_do_fast_hypercall8(). + * 2. HVCALL_POST_MESSAGE: the input page must be a decrypted page, i.e. + * hv_post_message() in such a VM can't use the encrypted hyperv_pcpu_input_arg; + * instead, hv_post_message() uses the post_msg_page, which is decrypted + * in such a VM and is only used in such a VM. */ guest_id = hv_generate_guest_id(LINUX_VERSION_CODE); wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); - /* Hyper-V requires to write guest os id via ghcb in SNP IVM. */ - hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id); + /* With the paravisor, the VM must also write the ID via GHCB/GHCI */ + hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id); + + /* A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg */ + if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present) + goto skip_hypercall_pg_init; hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX, @@ -471,6 +547,27 @@ void __init hyperv_init(void) wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); } +skip_hypercall_pg_init: + /* + * Some versions of Hyper-V that provide IBT in guest VMs have a bug + * in that there's no ENDBR64 instruction at the entry to the + * hypercall page. Because hypercalls are invoked via an indirect call + * to the hypercall page, all hypercall attempts fail when IBT is + * enabled, and Linux panics. For such buggy versions, disable IBT. + * + * Fixed versions of Hyper-V always provide ENDBR64 on the hypercall + * page, so if future Linux kernel versions enable IBT for 32-bit + * builds, additional hypercall page hackery will be required here + * to provide an ENDBR32. + */ +#ifdef CONFIG_X86_KERNEL_IBT + if (cpu_feature_enabled(X86_FEATURE_IBT) && + *(u32 *)hv_hypercall_pg != gen_endbr()) { + setup_clear_cpu_cap(X86_FEATURE_IBT); + pr_warn("Hyper-V: Disabling IBT because of Hyper-V bug\n"); + } +#endif + /* * hyperv_init() is called before LAPIC is initialized: see * apic_intr_mode_init() -> x86_platform.apic_post_init() and @@ -506,11 +603,15 @@ void __init hyperv_init(void) /* Query the VMs extended capability once, so that it can be cached. */ hv_query_ext_cap(0); + /* Find the VTL */ + if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) + ms_hyperv.vtl = get_vtl(); + return; clean_guest_os_id: wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); - hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); + hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); cpuhp_remove_state(cpuhp); free_ghcb_page: free_percpu(hv_ghcb_pg); @@ -531,7 +632,7 @@ void hyperv_cleanup(void) /* Reset our OS id */ wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); - hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); + hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); /* * Reset hypercall page reference before reset the page, @@ -594,6 +695,9 @@ bool hv_is_hyperv_initialized(void) if (x86_hyper_type != X86_HYPER_MS_HYPERV) return false; + /* A TDX VM with no paravisor uses TDX GHCI call rather than hv_hypercall_pg */ + if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present) + return true; /* * Verify that earlier initialization succeeded by checking * that the hypercall page is setup diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c index 91cfe698bde0..737d6f7a6155 100644 --- a/arch/x86/hyperv/hv_spinlock.c +++ b/arch/x86/hyperv/hv_spinlock.c @@ -20,7 +20,7 @@ static bool __initdata hv_pvspin = true; static void hv_qlock_kick(int cpu) { - apic->send_IPI(cpu, X86_PLATFORM_IPI_VECTOR); + __apic_send_IPI(cpu, X86_PLATFORM_IPI_VECTOR); } static void hv_qlock_wait(u8 *byte, u8 val) diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 85d38b9f3586..36a562218010 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -25,6 +25,10 @@ void __init hv_vtl_init_platform(void) x86_init.irqs.pre_vector_init = x86_init_noop; x86_init.timers.timer_init = x86_init_noop; + /* Avoid searching for BIOS MP tables */ + x86_init.mpparse.find_smp_config = x86_init_noop; + x86_init.mpparse.get_smp_config = x86_init_uint_noop; + x86_platform.get_wallclock = get_rtc_noop; x86_platform.set_wallclock = set_rtc_noop; x86_platform.get_nmi_reason = hv_get_nmi_reason; @@ -222,7 +226,7 @@ static int __init hv_vtl_early_init(void) "Please add 'noxsave' to the kernel command line.\n"); real_mode_header = &hv_vtl_real_mode_header; - apic->wakeup_secondary_cpu_64 = hv_vtl_wakeup_secondary_cpu; + apic_update_callback(wakeup_secondary_cpu_64, hv_vtl_wakeup_secondary_cpu); return 0; } diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index 14f46ad2ca64..8c6bf07f7d2b 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -18,6 +18,11 @@ #include <asm/mshyperv.h> #include <asm/hypervisor.h> #include <asm/mtrr.h> +#include <asm/io_apic.h> +#include <asm/realmode.h> +#include <asm/e820/api.h> +#include <asm/desc.h> +#include <uapi/asm/vmx.h> #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -56,8 +61,10 @@ union hv_ghcb { } hypercall; } __packed __aligned(HV_HYP_PAGE_SIZE); +/* Only used in an SNP VM with the paravisor */ static u16 hv_ghcb_version __ro_after_init; +/* Functions only used in an SNP VM with the paravisor go here. */ u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size) { union hv_ghcb *hv_ghcb; @@ -175,7 +182,7 @@ bool hv_ghcb_negotiate_protocol(void) return true; } -void hv_ghcb_msr_write(u64 msr, u64 value) +static void hv_ghcb_msr_write(u64 msr, u64 value) { union hv_ghcb *hv_ghcb; void **ghcb_base; @@ -203,9 +210,8 @@ void hv_ghcb_msr_write(u64 msr, u64 value) local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(hv_ghcb_msr_write); -void hv_ghcb_msr_read(u64 msr, u64 *value) +static void hv_ghcb_msr_read(u64 msr, u64 *value) { union hv_ghcb *hv_ghcb; void **ghcb_base; @@ -235,7 +241,217 @@ void hv_ghcb_msr_read(u64 msr, u64 *value) | ((u64)lower_32_bits(hv_ghcb->ghcb.save.rdx) << 32); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(hv_ghcb_msr_read); + +/* Only used in a fully enlightened SNP VM, i.e. without the paravisor */ +static u8 ap_start_input_arg[PAGE_SIZE] __bss_decrypted __aligned(PAGE_SIZE); +static u8 ap_start_stack[PAGE_SIZE] __aligned(PAGE_SIZE); +static DEFINE_PER_CPU(struct sev_es_save_area *, hv_sev_vmsa); + +/* Functions only used in an SNP VM without the paravisor go here. */ + +#define hv_populate_vmcb_seg(seg, gdtr_base) \ +do { \ + if (seg.selector) { \ + seg.base = 0; \ + seg.limit = HV_AP_SEGMENT_LIMIT; \ + seg.attrib = *(u16 *)(gdtr_base + seg.selector + 5); \ + seg.attrib = (seg.attrib & 0xFF) | ((seg.attrib >> 4) & 0xF00); \ + } \ +} while (0) \ + +static int snp_set_vmsa(void *va, bool vmsa) +{ + u64 attrs; + + /* + * Running at VMPL0 allows the kernel to change the VMSA bit for a page + * using the RMPADJUST instruction. However, for the instruction to + * succeed it must target the permissions of a lesser privileged + * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST + * instruction in the AMD64 APM Volume 3). + */ + attrs = 1; + if (vmsa) + attrs |= RMPADJUST_VMSA_PAGE_BIT; + + return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); +} + +static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) +{ + int err; + + err = snp_set_vmsa(vmsa, false); + if (err) + pr_err("clear VMSA page failed (%u), leaking page\n", err); + else + free_page((unsigned long)vmsa); +} + +int hv_snp_boot_ap(int cpu, unsigned long start_ip) +{ + struct sev_es_save_area *vmsa = (struct sev_es_save_area *) + __get_free_page(GFP_KERNEL | __GFP_ZERO); + struct sev_es_save_area *cur_vmsa; + struct desc_ptr gdtr; + u64 ret, retry = 5; + struct hv_enable_vp_vtl *start_vp_input; + unsigned long flags; + + if (!vmsa) + return -ENOMEM; + + native_store_gdt(&gdtr); + + vmsa->gdtr.base = gdtr.address; + vmsa->gdtr.limit = gdtr.size; + + asm volatile("movl %%es, %%eax;" : "=a" (vmsa->es.selector)); + hv_populate_vmcb_seg(vmsa->es, vmsa->gdtr.base); + + asm volatile("movl %%cs, %%eax;" : "=a" (vmsa->cs.selector)); + hv_populate_vmcb_seg(vmsa->cs, vmsa->gdtr.base); + + asm volatile("movl %%ss, %%eax;" : "=a" (vmsa->ss.selector)); + hv_populate_vmcb_seg(vmsa->ss, vmsa->gdtr.base); + + asm volatile("movl %%ds, %%eax;" : "=a" (vmsa->ds.selector)); + hv_populate_vmcb_seg(vmsa->ds, vmsa->gdtr.base); + + vmsa->efer = native_read_msr(MSR_EFER); + + asm volatile("movq %%cr4, %%rax;" : "=a" (vmsa->cr4)); + asm volatile("movq %%cr3, %%rax;" : "=a" (vmsa->cr3)); + asm volatile("movq %%cr0, %%rax;" : "=a" (vmsa->cr0)); + + vmsa->xcr0 = 1; + vmsa->g_pat = HV_AP_INIT_GPAT_DEFAULT; + vmsa->rip = (u64)secondary_startup_64_no_verify; + vmsa->rsp = (u64)&ap_start_stack[PAGE_SIZE]; + + /* + * Set the SNP-specific fields for this VMSA: + * VMPL level + * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits) + */ + vmsa->vmpl = 0; + vmsa->sev_features = sev_status >> 2; + + ret = snp_set_vmsa(vmsa, true); + if (!ret) { + pr_err("RMPADJUST(%llx) failed: %llx\n", (u64)vmsa, ret); + free_page((u64)vmsa); + return ret; + } + + local_irq_save(flags); + start_vp_input = (struct hv_enable_vp_vtl *)ap_start_input_arg; + memset(start_vp_input, 0, sizeof(*start_vp_input)); + start_vp_input->partition_id = -1; + start_vp_input->vp_index = cpu; + start_vp_input->target_vtl.target_vtl = ms_hyperv.vtl; + *(u64 *)&start_vp_input->vp_context = __pa(vmsa) | 1; + + do { + ret = hv_do_hypercall(HVCALL_START_VP, + start_vp_input, NULL); + } while (hv_result(ret) == HV_STATUS_TIME_OUT && retry--); + + local_irq_restore(flags); + + if (!hv_result_success(ret)) { + pr_err("HvCallStartVirtualProcessor failed: %llx\n", ret); + snp_cleanup_vmsa(vmsa); + vmsa = NULL; + } + + cur_vmsa = per_cpu(hv_sev_vmsa, cpu); + /* Free up any previous VMSA page */ + if (cur_vmsa) + snp_cleanup_vmsa(cur_vmsa); + + /* Record the current VMSA page */ + per_cpu(hv_sev_vmsa, cpu) = vmsa; + + return ret; +} + +#else +static inline void hv_ghcb_msr_write(u64 msr, u64 value) {} +static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {} +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + +#ifdef CONFIG_INTEL_TDX_GUEST +static void hv_tdx_msr_write(u64 msr, u64 val) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = EXIT_REASON_MSR_WRITE, + .r12 = msr, + .r13 = val, + }; + + u64 ret = __tdx_hypercall(&args); + + WARN_ONCE(ret, "Failed to emulate MSR write: %lld\n", ret); +} + +static void hv_tdx_msr_read(u64 msr, u64 *val) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = EXIT_REASON_MSR_READ, + .r12 = msr, + }; + + u64 ret = __tdx_hypercall_ret(&args); + + if (WARN_ONCE(ret, "Failed to emulate MSR read: %lld\n", ret)) + *val = 0; + else + *val = args.r11; +} + +u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2) +{ + struct tdx_hypercall_args args = { }; + + args.r10 = control; + args.rdx = param1; + args.r8 = param2; + + (void)__tdx_hypercall_ret(&args); + + return args.r11; +} + +#else +static inline void hv_tdx_msr_write(u64 msr, u64 value) {} +static inline void hv_tdx_msr_read(u64 msr, u64 *value) {} +#endif /* CONFIG_INTEL_TDX_GUEST */ + +#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) +void hv_ivm_msr_write(u64 msr, u64 value) +{ + if (!ms_hyperv.paravisor_present) + return; + + if (hv_isolation_type_tdx()) + hv_tdx_msr_write(msr, value); + else if (hv_isolation_type_snp()) + hv_ghcb_msr_write(msr, value); +} + +void hv_ivm_msr_read(u64 msr, u64 *value) +{ + if (!ms_hyperv.paravisor_present) + return; + + if (hv_isolation_type_tdx()) + hv_tdx_msr_read(msr, value); + else if (hv_isolation_type_snp()) + hv_ghcb_msr_read(msr, value); +} /* * hv_mark_gpa_visibility - Set pages visible to host via hvcall. @@ -247,7 +463,7 @@ EXPORT_SYMBOL_GPL(hv_ghcb_msr_read); static int hv_mark_gpa_visibility(u16 count, const u64 pfn[], enum hv_mem_host_visibility visibility) { - struct hv_gpa_range_for_visibility **input_pcpu, *input; + struct hv_gpa_range_for_visibility *input; u16 pages_processed; u64 hv_status; unsigned long flags; @@ -263,9 +479,8 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[], } local_irq_save(flags); - input_pcpu = (struct hv_gpa_range_for_visibility **) - this_cpu_ptr(hyperv_pcpu_input_arg); - input = *input_pcpu; + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + if (unlikely(!input)) { local_irq_restore(flags); return -EINVAL; @@ -359,13 +574,34 @@ static bool hv_is_private_mmio(u64 addr) void __init hv_vtom_init(void) { + enum hv_isolation_type type = hv_get_isolation_type(); + + switch (type) { + case HV_ISOLATION_TYPE_VBS: + fallthrough; /* * By design, a VM using vTOM doesn't see the SEV setting, * so SEV initialization is bypassed and sev_status isn't set. * Set it here to indicate a vTOM VM. + * + * Note: if CONFIG_AMD_MEM_ENCRYPT is not set, sev_status is + * defined as 0ULL, to which we can't assigned a value. */ - sev_status = MSR_AMD64_SNP_VTOM; - cc_vendor = CC_VENDOR_AMD; +#ifdef CONFIG_AMD_MEM_ENCRYPT + case HV_ISOLATION_TYPE_SNP: + sev_status = MSR_AMD64_SNP_VTOM; + cc_vendor = CC_VENDOR_AMD; + break; +#endif + + case HV_ISOLATION_TYPE_TDX: + cc_vendor = CC_VENDOR_INTEL; + break; + + default: + panic("hv_vtom_init: unsupported isolation type %d\n", type); + } + cc_set_mask(ms_hyperv.shared_gpa_boundary); physical_mask &= ms_hyperv.shared_gpa_boundary - 1; @@ -378,7 +614,7 @@ void __init hv_vtom_init(void) mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); } -#endif /* CONFIG_AMD_MEM_ENCRYPT */ +#endif /* defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) */ enum hv_isolation_type hv_get_isolation_type(void) { @@ -406,10 +642,20 @@ bool hv_is_isolation_supported(void) DEFINE_STATIC_KEY_FALSE(isolation_type_snp); /* - * hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based + * hv_isolation_type_snp - Check if the system runs in an AMD SEV-SNP based * isolation VM. */ bool hv_isolation_type_snp(void) { return static_branch_unlikely(&isolation_type_snp); } + +DEFINE_STATIC_KEY_FALSE(isolation_type_tdx); +/* + * hv_isolation_type_tdx - Check if the system runs in an Intel TDX based + * isolated VM. + */ +bool hv_isolation_type_tdx(void) +{ + return static_branch_unlikely(&isolation_type_tdx); +} diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index 8460bd35e10c..1cc113200ff5 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -61,7 +61,6 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus, const struct flush_tlb_info *info) { int cpu, vcpu, gva_n, max_gvas; - struct hv_tlb_flush **flush_pcpu; struct hv_tlb_flush *flush; u64 status; unsigned long flags; @@ -74,10 +73,7 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus, local_irq_save(flags); - flush_pcpu = (struct hv_tlb_flush **) - this_cpu_ptr(hyperv_pcpu_input_arg); - - flush = *flush_pcpu; + flush = *this_cpu_ptr(hyperv_pcpu_input_arg); if (unlikely(!flush)) { local_irq_restore(flags); @@ -178,17 +174,13 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus, const struct flush_tlb_info *info) { int nr_bank = 0, max_gvas, gva_n; - struct hv_tlb_flush_ex **flush_pcpu; struct hv_tlb_flush_ex *flush; u64 status; if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) return HV_STATUS_INVALID_PARAMETER; - flush_pcpu = (struct hv_tlb_flush_ex **) - this_cpu_ptr(hyperv_pcpu_input_arg); - - flush = *flush_pcpu; + flush = *this_cpu_ptr(hyperv_pcpu_input_arg); if (info->mm) { /* diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c index 5d70968c8538..9dc259fa322e 100644 --- a/arch/x86/hyperv/nested.c +++ b/arch/x86/hyperv/nested.c @@ -19,7 +19,6 @@ int hyperv_flush_guest_mapping(u64 as) { - struct hv_guest_mapping_flush **flush_pcpu; struct hv_guest_mapping_flush *flush; u64 status; unsigned long flags; @@ -30,10 +29,7 @@ int hyperv_flush_guest_mapping(u64 as) local_irq_save(flags); - flush_pcpu = (struct hv_guest_mapping_flush **) - this_cpu_ptr(hyperv_pcpu_input_arg); - - flush = *flush_pcpu; + flush = *this_cpu_ptr(hyperv_pcpu_input_arg); if (unlikely(!flush)) { local_irq_restore(flags); @@ -90,7 +86,6 @@ EXPORT_SYMBOL_GPL(hyperv_fill_flush_guest_mapping_list); int hyperv_flush_guest_mapping_range(u64 as, hyperv_fill_flush_list_func fill_flush_list_func, void *data) { - struct hv_guest_mapping_flush_list **flush_pcpu; struct hv_guest_mapping_flush_list *flush; u64 status; unsigned long flags; @@ -102,10 +97,8 @@ int hyperv_flush_guest_mapping_range(u64 as, local_irq_save(flags); - flush_pcpu = (struct hv_guest_mapping_flush_list **) - this_cpu_ptr(hyperv_pcpu_input_arg); + flush = *this_cpu_ptr(hyperv_pcpu_input_arg); - flush = *flush_pcpu; if (unlikely(!flush)) { local_irq_restore(flags); goto fault; diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 8eb74cf386db..c8a7fc23f63c 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -6,7 +6,7 @@ * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org> */ -#include <acpi/pdc_intel.h> +#include <acpi/proc_cap_intel.h> #include <asm/numa.h> #include <asm/fixmap.h> @@ -15,6 +15,7 @@ #include <asm/mpspec.h> #include <asm/x86_init.h> #include <asm/cpufeature.h> +#include <asm/irq_vectors.h> #ifdef CONFIG_ACPI_APEI # include <asm/pgtable_types.h> @@ -31,6 +32,7 @@ extern int acpi_skip_timer_override; extern int acpi_use_timer_override; extern int acpi_fix_pin2_polarity; extern int acpi_disable_cmcff; +extern bool acpi_int_src_ovr[NR_IRQS_LEGACY]; extern u8 acpi_sci_flags; extern u32 acpi_sci_override_gsi; @@ -100,23 +102,31 @@ static inline bool arch_has_acpi_pdc(void) c->x86_vendor == X86_VENDOR_CENTAUR); } -static inline void arch_acpi_set_pdc_bits(u32 *buf) +static inline void arch_acpi_set_proc_cap_bits(u32 *cap) { struct cpuinfo_x86 *c = &cpu_data(0); - buf[2] |= ACPI_PDC_C_CAPABILITY_SMP; + *cap |= ACPI_PROC_CAP_C_CAPABILITY_SMP; + + /* Enable coordination with firmware's _TSD info */ + *cap |= ACPI_PROC_CAP_SMP_T_SWCOORD; if (cpu_has(c, X86_FEATURE_EST)) - buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP; + *cap |= ACPI_PROC_CAP_EST_CAPABILITY_SWSMP; if (cpu_has(c, X86_FEATURE_ACPI)) - buf[2] |= ACPI_PDC_T_FFH; + *cap |= ACPI_PROC_CAP_T_FFH; + + if (cpu_has(c, X86_FEATURE_HWP)) + *cap |= ACPI_PROC_CAP_COLLAB_PROC_PERF; /* - * If mwait/monitor is unsupported, C2/C3_FFH will be disabled + * If mwait/monitor is unsupported, C_C1_FFH and + * C2/C3_FFH will be disabled. */ - if (!cpu_has(c, X86_FEATURE_MWAIT)) - buf[2] &= ~(ACPI_PDC_C_C2C3_FFH); + if (!cpu_has(c, X86_FEATURE_MWAIT) || + boot_option_idle_override == IDLE_NOMWAIT) + *cap &= ~(ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH); } static inline bool acpi_has_cpu_in_madt(void) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 98c32aa5963a..5af4ec1a0f71 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -3,6 +3,7 @@ #define _ASM_X86_APIC_H #include <linux/cpumask.h> +#include <linux/static_call.h> #include <asm/alternative.h> #include <asm/cpufeature.h> @@ -40,11 +41,9 @@ #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) -extern void generic_apic_probe(void); +extern void x86_32_probe_apic(void); #else -static inline void generic_apic_probe(void) -{ -} +static inline void x86_32_probe_apic(void) { } #endif #ifdef CONFIG_X86_LOCAL_APIC @@ -52,7 +51,7 @@ static inline void generic_apic_probe(void) extern int apic_verbosity; extern int local_apic_timer_c2_ok; -extern int disable_apic; +extern bool apic_is_disabled; extern unsigned int lapic_timer_period; extern int cpuid_to_apicid[]; @@ -66,20 +65,6 @@ enum apic_intr_mode_id { APIC_SYMMETRIC_IO_NO_ROUTING }; -#ifdef CONFIG_SMP -extern void __inquire_remote_apic(int apicid); -#else /* CONFIG_SMP */ -static inline void __inquire_remote_apic(int apicid) -{ -} -#endif /* CONFIG_SMP */ - -static inline void default_inquire_remote_apic(int apicid) -{ - if (apic_verbosity >= APIC_DEBUG) - __inquire_remote_apic(apicid); -} - /* * With 82489DX we can't rely on apic feature bit * retrieved via cpuid but still have to deal with @@ -90,7 +75,7 @@ static inline void default_inquire_remote_apic(int apicid) */ static inline bool apic_from_smp_config(void) { - return smp_found_config && !disable_apic; + return smp_found_config && !apic_is_disabled; } /* @@ -114,8 +99,11 @@ static inline u32 native_apic_mem_read(u32 reg) return *((volatile u32 *)(APIC_BASE + reg)); } -extern void native_apic_wait_icr_idle(void); -extern u32 native_safe_apic_wait_icr_idle(void); +static inline void native_apic_mem_eoi(void) +{ + native_apic_mem_write(APIC_EOI, APIC_EOI_ACK); +} + extern void native_apic_icr_write(u32 low, u32 id); extern u64 native_apic_icr_read(void); @@ -149,12 +137,12 @@ extern void setup_secondary_APIC_clock(void); extern void lapic_update_tsc_freq(void); #ifdef CONFIG_X86_64 -static inline int apic_force_enable(unsigned long addr) +static inline bool apic_force_enable(unsigned long addr) { - return -1; + return false; } #else -extern int apic_force_enable(unsigned long addr); +extern bool apic_force_enable(unsigned long addr); #endif extern void apic_ap_setup(void); @@ -207,7 +195,7 @@ static inline void native_apic_msr_write(u32 reg, u32 v) wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0); } -static inline void native_apic_msr_eoi_write(u32 reg, u32 v) +static inline void native_apic_msr_eoi(void) { __wrmsr(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0); } @@ -223,18 +211,6 @@ static inline u32 native_apic_msr_read(u32 reg) return (u32)msr; } -static inline void native_x2apic_wait_icr_idle(void) -{ - /* no need to wait for icr idle in x2apic */ - return; -} - -static inline u32 native_safe_x2apic_wait_icr_idle(void) -{ - /* no need to wait for icr idle in x2apic */ - return 0; -} - static inline void native_x2apic_icr_write(u32 low, u32 id) { wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); @@ -261,7 +237,7 @@ static inline int x2apic_enabled(void) #else /* !CONFIG_X86_X2APIC */ static inline void x2apic_setup(void) { } static inline int x2apic_enabled(void) { return 0; } - +static inline u32 native_apic_msr_read(u32 reg) { BUG(); } #define x2apic_mode (0) #define x2apic_supported() (0) #endif /* !CONFIG_X86_X2APIC */ @@ -280,8 +256,8 @@ struct irq_data; */ struct apic { /* Hotpath functions first */ - void (*eoi_write)(u32 reg, u32 v); - void (*native_eoi_write)(u32 reg, u32 v); + void (*eoi)(void); + void (*native_eoi)(void); void (*write)(u32 reg, u32 v); u32 (*read)(u32 reg); @@ -296,10 +272,11 @@ struct apic { void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); - u32 disable_esr; - enum apic_delivery_modes delivery_mode; - bool dest_mode_logical; + + u32 disable_esr : 1, + dest_mode_logical : 1, + x2apic_set_max_apicid : 1; u32 (*calc_dest_apicid)(unsigned int cpu); @@ -307,19 +284,18 @@ struct apic { u64 (*icr_read)(void); void (*icr_write)(u32 low, u32 high); + /* The limit of the APIC ID space. */ + u32 max_apic_id; + /* Probe, setup and smpboot functions */ int (*probe)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); - int (*apic_id_valid)(u32 apicid); - int (*apic_id_registered)(void); + bool (*apic_id_registered)(void); bool (*check_apicid_used)(physid_mask_t *map, int apicid); void (*init_apic_ldr)(void); void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); - void (*setup_apic_routing)(void); int (*cpu_present_to_apicid)(int mps_cpu); - void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); - int (*check_phys_apicid_present)(int phys_apicid); int (*phys_pkg_id)(int cpuid_apic, int index_msb); u32 (*get_apic_id)(unsigned long x); @@ -330,24 +306,26 @@ struct apic { /* wakeup secondary CPU using 64-bit wakeup point */ int (*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip); - void (*inquire_remote_apic)(int apicid); - -#ifdef CONFIG_X86_32 - /* - * Called very early during boot from get_smp_config(). It should - * return the logical apicid. x86_[bios]_cpu_to_apicid is - * initialized before this function is called. - * - * If logical apicid can't be determined that early, the function - * may return BAD_APICID. Logical apicid will be configured after - * init_apic_ldr() while bringing up CPUs. Note that NUMA affinity - * won't be applied properly during early boot in this case. - */ - int (*x86_32_early_logical_apicid)(int cpu); -#endif char *name; }; +struct apic_override { + void (*eoi)(void); + void (*native_eoi)(void); + void (*write)(u32 reg, u32 v); + u32 (*read)(u32 reg); + void (*send_IPI)(int cpu, int vector); + void (*send_IPI_mask)(const struct cpumask *mask, int vector); + void (*send_IPI_mask_allbutself)(const struct cpumask *msk, int vec); + void (*send_IPI_allbutself)(int vector); + void (*send_IPI_all)(int vector); + void (*send_IPI_self)(int vector); + u64 (*icr_read)(void); + void (*icr_write)(u32 low, u32 high); + int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); + int (*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip); +}; + /* * Pointer to the local APIC driver in use on this system (there's * always just one such driver in use - the kernel decides via an @@ -383,43 +361,111 @@ extern int lapic_can_unplug_cpu(void); #endif #ifdef CONFIG_X86_LOCAL_APIC +extern struct apic_override __x86_apic_override; -static inline u32 apic_read(u32 reg) +void __init apic_setup_apic_calls(void); +void __init apic_install_driver(struct apic *driver); + +#define apic_update_callback(_callback, _fn) { \ + __x86_apic_override._callback = _fn; \ + apic->_callback = _fn; \ + static_call_update(apic_call_##_callback, _fn); \ + pr_info("APIC: %s() replaced with %ps()\n", #_callback, _fn); \ +} + +#define DECLARE_APIC_CALL(__cb) \ + DECLARE_STATIC_CALL(apic_call_##__cb, *apic->__cb) + +DECLARE_APIC_CALL(eoi); +DECLARE_APIC_CALL(native_eoi); +DECLARE_APIC_CALL(icr_read); +DECLARE_APIC_CALL(icr_write); +DECLARE_APIC_CALL(read); +DECLARE_APIC_CALL(send_IPI); +DECLARE_APIC_CALL(send_IPI_mask); +DECLARE_APIC_CALL(send_IPI_mask_allbutself); +DECLARE_APIC_CALL(send_IPI_allbutself); +DECLARE_APIC_CALL(send_IPI_all); +DECLARE_APIC_CALL(send_IPI_self); +DECLARE_APIC_CALL(wait_icr_idle); +DECLARE_APIC_CALL(wakeup_secondary_cpu); +DECLARE_APIC_CALL(wakeup_secondary_cpu_64); +DECLARE_APIC_CALL(write); + +static __always_inline u32 apic_read(u32 reg) +{ + return static_call(apic_call_read)(reg); +} + +static __always_inline void apic_write(u32 reg, u32 val) +{ + static_call(apic_call_write)(reg, val); +} + +static __always_inline void apic_eoi(void) +{ + static_call(apic_call_eoi)(); +} + +static __always_inline void apic_native_eoi(void) { - return apic->read(reg); + static_call(apic_call_native_eoi)(); } -static inline void apic_write(u32 reg, u32 val) +static __always_inline u64 apic_icr_read(void) { - apic->write(reg, val); + return static_call(apic_call_icr_read)(); } -static inline void apic_eoi(void) +static __always_inline void apic_icr_write(u32 low, u32 high) { - apic->eoi_write(APIC_EOI, APIC_EOI_ACK); + static_call(apic_call_icr_write)(low, high); } -static inline u64 apic_icr_read(void) +static __always_inline void __apic_send_IPI(int cpu, int vector) { - return apic->icr_read(); + static_call(apic_call_send_IPI)(cpu, vector); } -static inline void apic_icr_write(u32 low, u32 high) +static __always_inline void __apic_send_IPI_mask(const struct cpumask *mask, int vector) { - apic->icr_write(low, high); + static_call_mod(apic_call_send_IPI_mask)(mask, vector); } -static inline void apic_wait_icr_idle(void) +static __always_inline void __apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { - apic->wait_icr_idle(); + static_call(apic_call_send_IPI_mask_allbutself)(mask, vector); } -static inline u32 safe_apic_wait_icr_idle(void) +static __always_inline void __apic_send_IPI_allbutself(int vector) { - return apic->safe_wait_icr_idle(); + static_call(apic_call_send_IPI_allbutself)(vector); } -extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)); +static __always_inline void __apic_send_IPI_all(int vector) +{ + static_call(apic_call_send_IPI_all)(vector); +} + +static __always_inline void __apic_send_IPI_self(int vector) +{ + static_call_mod(apic_call_send_IPI_self)(vector); +} + +static __always_inline void apic_wait_icr_idle(void) +{ + static_call_cond(apic_call_wait_icr_idle)(); +} + +static __always_inline u32 safe_apic_wait_icr_idle(void) +{ + return apic->safe_wait_icr_idle ? apic->safe_wait_icr_idle() : 0; +} + +static __always_inline bool apic_id_valid(u32 apic_id) +{ + return apic_id <= apic->max_apic_id; +} #else /* CONFIG_X86_LOCAL_APIC */ @@ -430,22 +476,16 @@ static inline u64 apic_icr_read(void) { return 0; } static inline void apic_icr_write(u32 low, u32 high) { } static inline void apic_wait_icr_idle(void) { } static inline u32 safe_apic_wait_icr_idle(void) { return 0; } -static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {} +static inline void apic_set_eoi_cb(void (*eoi)(void)) {} +static inline void apic_native_eoi(void) { WARN_ON_ONCE(1); } +static inline void apic_setup_apic_calls(void) { } + +#define apic_update_callback(_callback, _fn) do { } while (0) #endif /* CONFIG_X86_LOCAL_APIC */ extern void apic_ack_irq(struct irq_data *data); -static inline void ack_APIC_irq(void) -{ - /* - * ack_APIC_irq() actually gets compiled as a single instruction - * ... yummie. - */ - apic_eoi(); -} - - static inline bool lapic_vector_set_in_irr(unsigned int vector) { u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); @@ -475,10 +515,6 @@ extern void generic_bigsmp_probe(void); #include <asm/smp.h> -#define APIC_DFR_VALUE (APIC_DFR_FLAT) - -DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); - extern struct apic apic_noop; static inline unsigned int read_apic_id(void) @@ -490,12 +526,14 @@ static inline unsigned int read_apic_id(void) #ifdef CONFIG_X86_64 typedef int (*wakeup_cpu_handler)(int apicid, unsigned long start_eip); -extern void acpi_wake_cpu_handler_update(wakeup_cpu_handler handler); +extern int default_acpi_madt_oem_check(char *, char *); +extern void x86_64_probe_apic(void); +#else +static inline int default_acpi_madt_oem_check(char *a, char *b) { return 0; } +static inline void x86_64_probe_apic(void) { } #endif extern int default_apic_id_valid(u32 apicid); -extern int default_acpi_madt_oem_check(char *, char *); -extern void default_setup_apic_routing(void); extern u32 apic_default_calc_apicid(unsigned int cpu); extern u32 apic_flat_calc_apicid(unsigned int cpu); @@ -503,9 +541,12 @@ extern u32 apic_flat_calc_apicid(unsigned int cpu); extern bool default_check_apicid_used(physid_mask_t *map, int apicid); extern void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap); extern int default_cpu_present_to_apicid(int mps_cpu); -extern int default_check_phys_apicid_present(int phys_apicid); -#endif /* CONFIG_X86_LOCAL_APIC */ +#else /* CONFIG_X86_LOCAL_APIC */ + +static inline unsigned int read_apic_id(void) { return 0; } + +#endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_SMP void apic_smt_update(void); diff --git a/arch/x86/include/asm/audit.h b/arch/x86/include/asm/audit.h index 36aec57ea7a3..fa918f01333e 100644 --- a/arch/x86/include/asm/audit.h +++ b/arch/x86/include/asm/audit.h @@ -4,4 +4,11 @@ int ia32_classify_syscall(unsigned int syscall); +extern unsigned ia32_dir_class[]; +extern unsigned ia32_write_class[]; +extern unsigned ia32_read_class[]; +extern unsigned ia32_chattr_class[]; +extern unsigned ia32_signal_class[]; + + #endif /* _ASM_X86_AUDIT_H */ diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 9191280d9ea3..b3a7cfb0d99e 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -40,26 +40,51 @@ #ifdef CONFIG_X86_64 # define BOOT_STACK_SIZE 0x4000 +/* + * Used by decompressor's startup_32() to allocate page tables for identity + * mapping of the 4G of RAM in 4-level paging mode: + * - 1 level4 table; + * - 1 level3 table; + * - 4 level2 table that maps everything with 2M pages; + * + * The additional level5 table needed for 5-level paging is allocated from + * trampoline_32bit memory. + */ # define BOOT_INIT_PGT_SIZE (6*4096) -# ifdef CONFIG_RANDOMIZE_BASE + /* - * Assuming all cross the 512GB boundary: - * 1 page for level4 - * (2+2)*4 pages for kernel, param, cmd_line, and randomized kernel - * 2 pages for first 2M (video RAM: CONFIG_X86_VERBOSE_BOOTUP). - * Total is 19 pages. + * Total number of page tables kernel_add_identity_map() can allocate, + * including page tables consumed by startup_32(). + * + * Worst-case scenario: + * - 5-level paging needs 1 level5 table; + * - KASLR needs to map kernel, boot_params, cmdline and randomized kernel, + * assuming all of them cross 256T boundary: + * + 4*2 level4 table; + * + 4*2 level3 table; + * + 4*2 level2 table; + * - X86_VERBOSE_BOOTUP needs to map the first 2M (video RAM): + * + 1 level4 table; + * + 1 level3 table; + * + 1 level2 table; + * Total: 28 tables + * + * Add 4 spare table in case decompressor touches anything beyond what is + * accounted above. Warn if it happens. */ -# ifdef CONFIG_X86_VERBOSE_BOOTUP -# define BOOT_PGT_SIZE (19*4096) -# else /* !CONFIG_X86_VERBOSE_BOOTUP */ -# define BOOT_PGT_SIZE (17*4096) -# endif -# else /* !CONFIG_RANDOMIZE_BASE */ -# define BOOT_PGT_SIZE BOOT_INIT_PGT_SIZE -# endif +# define BOOT_PGT_SIZE_WARN (28*4096) +# define BOOT_PGT_SIZE (32*4096) #else /* !CONFIG_X86_64 */ # define BOOT_STACK_SIZE 0x1000 #endif +#ifndef __ASSEMBLY__ +extern unsigned int output_len; +extern const unsigned long kernel_total_size; + +unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, + void (*error)(char *x)); +#endif + #endif /* _ASM_X86_BOOT_H */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index cb8ca46213be..58cb9495e40f 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -14,7 +14,7 @@ * Defines x86 CPU feature bits */ #define NCAPINTS 21 /* N 32-bit words worth of info */ -#define NBUGINTS 1 /* N 32-bit bug flags */ +#define NBUGINTS 2 /* N 32-bit bug flags */ /* * Note: If the comment begins with a quoted string, that string is used @@ -198,7 +198,6 @@ #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ #define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ #define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ -#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */ @@ -308,6 +307,11 @@ #define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */ #define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */ #define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */ +#define X86_FEATURE_USER_SHSTK (11*32+23) /* Shadow stack support for user mode applications */ + +#define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */ +#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */ +#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ @@ -380,6 +384,7 @@ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ #define X86_FEATURE_WAITPKG (16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */ #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_SHSTK (16*32+ 7) /* "" Shadow stack */ #define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ #define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ #define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */ @@ -434,6 +439,7 @@ #define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ #define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */ #define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ +#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* AMD SEV-ES full debug state swap support */ /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */ #define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */ @@ -442,6 +448,10 @@ #define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */ #define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* "" SMM_CTL MSR is not present */ +#define X86_FEATURE_SBPB (20*32+27) /* "" Selective Branch Prediction Barrier */ +#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* "" MSR_PRED_CMD[IBPB] flushes all branch type predictions */ +#define X86_FEATURE_SRSO_NO (20*32+29) /* "" CPU is not affected by SRSO */ + /* * BUG word(s) */ @@ -483,5 +493,9 @@ #define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */ #define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ #define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */ +#define X86_BUG_GDS X86_BUG(30) /* CPU is affected by Gather Data Sampling */ +/* BUG word 2 */ +#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ +#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index fafe9be7a6f4..702d93fdd10e 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -105,6 +105,18 @@ # define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31)) #endif +#ifdef CONFIG_X86_USER_SHADOW_STACK +#define DISABLE_USER_SHSTK 0 +#else +#define DISABLE_USER_SHSTK (1 << (X86_FEATURE_USER_SHSTK & 31)) +#endif + +#ifdef CONFIG_X86_KERNEL_IBT +#define DISABLE_IBT 0 +#else +#define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -120,7 +132,7 @@ #define DISABLED_MASK9 (DISABLE_SGX) #define DISABLED_MASK10 0 #define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ - DISABLE_CALL_DEPTH_TRACKING) + DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) #define DISABLED_MASK12 (DISABLE_LAM) #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 @@ -128,7 +140,7 @@ #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ DISABLE_ENQCMD) #define DISABLED_MASK17 0 -#define DISABLED_MASK18 0 +#define DISABLED_MASK18 (DISABLE_IBT) #define DISABLED_MASK19 0 #define DISABLED_MASK20 0 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21) diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h index b8f1dc0761e4..9931e4c7d73f 100644 --- a/arch/x86/include/asm/div64.h +++ b/arch/x86/include/asm/div64.h @@ -71,6 +71,12 @@ static inline u64 mul_u32_u32(u32 a, u32 b) } #define mul_u32_u32 mul_u32_u32 +/* + * __div64_32() is never called on x86, so prevent the + * generic definition from getting built. + */ +#define __div64_32 + #else # include <asm-generic/div64.h> diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 8b4be7cecdb8..c4555b269a1b 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -90,21 +90,9 @@ static inline void efi_fpu_end(void) } #ifdef CONFIG_X86_32 -#define arch_efi_call_virt_setup() \ -({ \ - efi_fpu_begin(); \ - firmware_restrict_branch_speculation_start(); \ -}) - -#define arch_efi_call_virt_teardown() \ -({ \ - firmware_restrict_branch_speculation_end(); \ - efi_fpu_end(); \ -}) - +#define EFI_X86_KERNEL_ALLOC_LIMIT (SZ_512M - 1) #else /* !CONFIG_X86_32 */ - -#define EFI_LOADER_SIGNATURE "EL64" +#define EFI_X86_KERNEL_ALLOC_LIMIT EFI_ALLOC_LIMIT extern asmlinkage u64 __efi_call(void *fp, ...); @@ -115,14 +103,6 @@ extern bool efi_disable_ibt_for_runtime; __efi_call(__VA_ARGS__); \ }) -#define arch_efi_call_virt_setup() \ -({ \ - efi_sync_low_kernel_mappings(); \ - efi_fpu_begin(); \ - firmware_restrict_branch_speculation_start(); \ - efi_enter_mm(); \ -}) - #undef arch_efi_call_virt #define arch_efi_call_virt(p, f, args...) ({ \ u64 ret, ibt = ibt_save(efi_disable_ibt_for_runtime); \ @@ -131,13 +111,6 @@ extern bool efi_disable_ibt_for_runtime; ret; \ }) -#define arch_efi_call_virt_teardown() \ -({ \ - efi_leave_mm(); \ - firmware_restrict_branch_speculation_end(); \ - efi_fpu_end(); \ -}) - #ifdef CONFIG_KASAN /* * CONFIG_KASAN may redefine memset to __memset. __memset function is present @@ -167,8 +140,8 @@ extern void efi_delete_dummy_variable(void); extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr); extern void efi_free_boot_services(void); -void efi_enter_mm(void); -void efi_leave_mm(void); +void arch_efi_call_virt_setup(void); +void arch_efi_call_virt_teardown(void); /* kexec external ABI */ struct efi_setup_data { @@ -218,6 +191,8 @@ efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size, #ifdef CONFIG_EFI_MIXED +#define EFI_ALLOC_LIMIT (efi_is_64bit() ? ULONG_MAX : U32_MAX) + #define ARCH_HAS_EFISTUB_WRAPPERS static inline bool efi_is_64bit(void) diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 117903881fe4..ce8f50192ae3 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -92,6 +92,7 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, static __always_inline void arch_exit_to_user_mode(void) { mds_user_clear_cpu_buffers(); + amd_clear_divider(); } #define arch_exit_to_user_mode arch_exit_to_user_mode diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index b475d9a582b8..31089b851c4f 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -82,6 +82,15 @@ static inline void fpregs_unlock(void) preempt_enable(); } +/* + * FPU state gets lazily restored before returning to userspace. So when in the + * kernel, the valid FPU state may be kept in the buffer. This function will force + * restore all the fpu state to the registers early if needed, and lock them from + * being automatically saved/restored. Then FPU state can be modified safely in the + * registers, before unlocking with fpregs_unlock(). + */ +void fpregs_lock_and_load(void); + #ifdef CONFIG_X86_DEBUG_FPU extern void fpregs_assert_state_consistent(void); #else diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h index 4f928d6a367b..697b77e96025 100644 --- a/arch/x86/include/asm/fpu/regset.h +++ b/arch/x86/include/asm/fpu/regset.h @@ -7,11 +7,12 @@ #include <linux/regset.h> -extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active; +extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active, + ssp_active; extern user_regset_get2_fn fpregs_get, xfpregs_get, fpregs_soft_get, - xstateregs_get; + xstateregs_get, ssp_get; extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, - xstateregs_set; + xstateregs_set, ssp_set; /* * xstateregs_active == regset_fpregs_active. Please refer to the comment diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index 78fcde7b1f07..ca6e5e5f16b2 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -11,7 +11,8 @@ extern void save_fpregs_to_fpstate(struct fpu *fpu); extern void fpu__drop(struct fpu *fpu); -extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal); +extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, + unsigned long shstk_addr); extern void fpu_flush_thread(void); /* diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 7f6d858ff47a..eb810074f1e7 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -115,8 +115,8 @@ enum xfeature { XFEATURE_PT_UNIMPLEMENTED_SO_FAR, XFEATURE_PKRU, XFEATURE_PASID, - XFEATURE_RSRVD_COMP_11, - XFEATURE_RSRVD_COMP_12, + XFEATURE_CET_USER, + XFEATURE_CET_KERNEL_UNUSED, XFEATURE_RSRVD_COMP_13, XFEATURE_RSRVD_COMP_14, XFEATURE_LBR, @@ -138,6 +138,8 @@ enum xfeature { #define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR) #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) #define XFEATURE_MASK_PASID (1 << XFEATURE_PASID) +#define XFEATURE_MASK_CET_USER (1 << XFEATURE_CET_USER) +#define XFEATURE_MASK_CET_KERNEL (1 << XFEATURE_CET_KERNEL_UNUSED) #define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) #define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG) #define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA) @@ -253,6 +255,16 @@ struct pkru_state { } __packed; /* + * State component 11 is Control-flow Enforcement user states + */ +struct cet_user_state { + /* user control-flow settings */ + u64 user_cet; + /* user shadow stack pointer */ + u64 user_ssp; +}; + +/* * State component 15: Architectural LBR configuration state. * The size of Arch LBR state depends on the number of LBRs (lbr_depth). */ diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index cd3dd170e23a..d4427b88ee12 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -50,7 +50,8 @@ #define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA /* All currently supported supervisor features */ -#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID) +#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \ + XFEATURE_MASK_CET_USER) /* * A supervisor state component may not always contain valuable information, @@ -77,7 +78,8 @@ * Unsupported supervisor features. When a supervisor feature in this mask is * supported in the future, move it to the supported supervisor feature mask. */ -#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT) +#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \ + XFEATURE_MASK_CET_KERNEL) /* All supervisor states including supported and unsupported states. */ #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index d465ece58151..551829884734 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -97,10 +97,10 @@ extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data); extern void lock_vector_lock(void); extern void unlock_vector_lock(void); #ifdef CONFIG_SMP -extern void send_cleanup_vector(struct irq_cfg *); +extern void vector_schedule_cleanup(struct irq_cfg *); extern void irq_complete_move(struct irq_cfg *cfg); #else -static inline void send_cleanup_vector(struct irq_cfg *c) { } +static inline void vector_schedule_cleanup(struct irq_cfg *c) { } static inline void irq_complete_move(struct irq_cfg *c) { } #endif diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index cea95dcd27c2..2ff26f53cd62 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -169,7 +169,8 @@ enum hv_isolation_type { HV_ISOLATION_TYPE_NONE = 0, HV_ISOLATION_TYPE_VBS = 1, - HV_ISOLATION_TYPE_SNP = 2 + HV_ISOLATION_TYPE_SNP = 2, + HV_ISOLATION_TYPE_TDX = 3 }; /* Hyper-V specific model specific registers (MSRs) */ @@ -301,6 +302,13 @@ enum hv_isolation_type { #define HV_X64_MSR_TIME_REF_COUNT HV_REGISTER_TIME_REF_COUNT #define HV_X64_MSR_REFERENCE_TSC HV_REGISTER_REFERENCE_TSC +/* + * Registers are only accessible via HVCALL_GET_VP_REGISTERS hvcall and + * there is not associated MSR address. + */ +#define HV_X64_REGISTER_VSM_VP_STATUS 0x000D0003 +#define HV_X64_VTL_MASK GENMASK(3, 0) + /* Hyper-V memory host visibility */ enum hv_mem_host_visibility { VMBUS_PAGE_NOT_VISIBLE = 0, diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index b241af4ce9b4..05fd175cec7d 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -614,7 +614,7 @@ DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault); #endif /* #CP */ -#ifdef CONFIG_X86_KERNEL_IBT +#ifdef CONFIG_X86_CET DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_CP, exc_control_protection); #endif @@ -648,7 +648,6 @@ DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI_VECTOR, sysvec_x86_platform_ipi); #ifdef CONFIG_SMP DECLARE_IDTENTRY(RESCHEDULE_VECTOR, sysvec_reschedule_ipi); -DECLARE_IDTENTRY_SYSVEC(IRQ_MOVE_CLEANUP_VECTOR, sysvec_irq_move_cleanup); DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR, sysvec_reboot); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, sysvec_call_function_single); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR, sysvec_call_function); diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index b3af2d45bbbb..5fcd85fd64fd 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -98,8 +98,6 @@ #define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */ #define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */ -#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ - #define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */ #define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */ @@ -112,21 +110,24 @@ #define INTEL_FAM6_GRANITERAPIDS_X 0xAD #define INTEL_FAM6_GRANITERAPIDS_D 0xAE +/* "Hybrid" Processors (P-Core/E-Core) */ + +#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ + #define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */ #define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */ -#define INTEL_FAM6_ALDERLAKE_N 0xBE -#define INTEL_FAM6_RAPTORLAKE 0xB7 +#define INTEL_FAM6_RAPTORLAKE 0xB7 /* Raptor Cove / Enhanced Gracemont */ #define INTEL_FAM6_RAPTORLAKE_P 0xBA #define INTEL_FAM6_RAPTORLAKE_S 0xBF #define INTEL_FAM6_METEORLAKE 0xAC #define INTEL_FAM6_METEORLAKE_L 0xAA -#define INTEL_FAM6_LUNARLAKE_M 0xBD - #define INTEL_FAM6_ARROWLAKE 0xC6 +#define INTEL_FAM6_LUNARLAKE_M 0xBD + /* "Small Core" Processors (Atom/E-Core) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ @@ -154,9 +155,10 @@ #define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */ #define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */ -#define INTEL_FAM6_SIERRAFOREST_X 0xAF +#define INTEL_FAM6_ATOM_GRACEMONT 0xBE /* Alderlake N */ -#define INTEL_FAM6_GRANDRIDGE 0xB6 +#define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */ +#define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */ /* Xeon Phi */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index e9025640f634..76238842406a 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -35,9 +35,6 @@ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ -#define ARCH_HAS_IOREMAP_WC -#define ARCH_HAS_IOREMAP_WT - #include <linux/string.h> #include <linux/compiler.h> #include <linux/cc_platform.h> @@ -212,8 +209,6 @@ void memset_io(volatile void __iomem *, int, size_t); #define memcpy_toio memcpy_toio #define memset_io memset_io -#include <asm-generic/iomap.h> - /* * ISA space is 'always mapped' on a typical x86 system, no need to * explicitly ioremap() it. The fact that the ISA IO space is mapped diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 437aa8d00e53..51c782600e02 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -109,8 +109,8 @@ extern int mp_irq_entries; /* MP IRQ source entries */ extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; -/* 1 if "noapic" boot option passed */ -extern int skip_ioapic_setup; +/* True if "noapic" boot option passed */ +extern bool ioapic_is_disabled; /* 1 if "noapic" boot option passed */ extern int noioapicquirk; @@ -129,7 +129,7 @@ extern unsigned long io_apic_irqs; * assignment of PCI IRQ's. */ #define io_apic_assign_pci_irqs \ - (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs) + (mp_irq_entries && !ioapic_is_disabled && io_apic_irqs) struct irq_cfg; extern void ioapic_insert_resources(void); @@ -179,6 +179,7 @@ extern void print_IO_APICs(void); #define IO_APIC_IRQ(x) 0 #define io_apic_assign_pci_irqs 0 #define setup_ioapic_ids_from_mpc x86_init_noop +#define nr_ioapics (0) static inline void ioapic_insert_resources(void) { } static inline int arch_early_ioapic_init(void) { return 0; } static inline void print_IO_APICs(void) {} diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 29e083b92813..836c170d3087 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -42,7 +42,7 @@ extern void init_ISA_irqs(void); #ifdef CONFIG_X86_LOCAL_APIC void arch_trigger_cpumask_backtrace(const struct cpumask *mask, - bool exclude_self); + int exclude_cpu); #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace #endif diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 43dcb9284208..3a19904c2db6 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -35,13 +35,6 @@ */ #define FIRST_EXTERNAL_VECTOR 0x20 -/* - * Reserve the lowest usable vector (and hence lowest priority) 0x20 for - * triggering cleanup after irq migration. 0x21-0x2f will still be used - * for device interrupts. - */ -#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR - #define IA32_SYSCALL_VECTOR 0x80 /* diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 5b77bbc28f96..c9f6a6c5de3c 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -205,10 +205,26 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image); #endif #endif -typedef void crash_vmclear_fn(void); -extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; extern void kdump_nmi_shootdown_cpus(void); +#ifdef CONFIG_CRASH_HOTPLUG +void arch_crash_handle_hotplug_event(struct kimage *image); +#define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event + +#ifdef CONFIG_HOTPLUG_CPU +int arch_crash_hotplug_cpu_support(void); +#define crash_hotplug_cpu_support arch_crash_hotplug_cpu_support +#endif + +#ifdef CONFIG_MEMORY_HOTPLUG +int arch_crash_hotplug_memory_support(void); +#define crash_hotplug_memory_support arch_crash_hotplug_memory_support +#endif + +unsigned int arch_crash_get_elfcorehdr_size(void); +#define crash_get_elfcorehdr_size arch_crash_get_elfcorehdr_size +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_KEXEC_H */ diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 13bc212cd4bc..e3054e3e46d5 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -37,6 +37,7 @@ KVM_X86_OP(get_segment) KVM_X86_OP(get_cpl) KVM_X86_OP(set_segment) KVM_X86_OP(get_cs_db_l_bits) +KVM_X86_OP(is_valid_cr0) KVM_X86_OP(set_cr0) KVM_X86_OP_OPTIONAL(post_set_cr3) KVM_X86_OP(is_valid_cr4) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 28bd38303d70..1a4def36d5bb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -288,13 +288,13 @@ struct kvm_kernel_irq_routing_entry; * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page * also includes TDP pages) to determine whether or not a page can be used in * the given MMU context. This is a subset of the overall kvm_cpu_role to - * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating - * 2 bytes per gfn instead of 4 bytes per gfn. + * minimize the size of kvm_memory_slot.arch.gfn_write_track, i.e. allows + * allocating 2 bytes per gfn instead of 4 bytes per gfn. * * Upper-level shadow pages having gptes are tracked for write-protection via - * gfn_track. As above, gfn_track is a 16 bit counter, so KVM must not create - * more than 2^16-1 upper-level shadow pages at a single gfn, otherwise - * gfn_track will overflow and explosions will ensure. + * gfn_write_track. As above, gfn_write_track is a 16 bit counter, so KVM must + * not create more than 2^16-1 upper-level shadow pages at a single gfn, + * otherwise gfn_write_track will overflow and explosions will ensue. * * A unique shadow page (SP) for a gfn is created if and only if an existing SP * cannot be reused. The ability to reuse a SP is tracked by its role, which @@ -746,7 +746,6 @@ struct kvm_vcpu_arch { u64 smi_count; bool at_instruction_boundary; bool tpr_access_reporting; - bool xsaves_enabled; bool xfd_no_write_intercept; u64 ia32_xss; u64 microcode_version; @@ -831,6 +830,25 @@ struct kvm_vcpu_arch { struct kvm_cpuid_entry2 *cpuid_entries; struct kvm_hypervisor_cpuid kvm_cpuid; + /* + * FIXME: Drop this macro and use KVM_NR_GOVERNED_FEATURES directly + * when "struct kvm_vcpu_arch" is no longer defined in an + * arch/x86/include/asm header. The max is mostly arbitrary, i.e. + * can be increased as necessary. + */ +#define KVM_MAX_NR_GOVERNED_FEATURES BITS_PER_LONG + + /* + * Track whether or not the guest is allowed to use features that are + * governed by KVM, where "governed" means KVM needs to manage state + * and/or explicitly enable the feature in hardware. Typically, but + * not always, governed features can be used by the guest if and only + * if both KVM and userspace want to expose the feature to the guest. + */ + struct { + DECLARE_BITMAP(enabled, KVM_MAX_NR_GOVERNED_FEATURES); + } governed_features; + u64 reserved_gpa_bits; int maxphyaddr; @@ -1005,7 +1023,7 @@ struct kvm_lpage_info { struct kvm_arch_memory_slot { struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES]; struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; - unsigned short *gfn_track[KVM_PAGE_TRACK_MAX]; + unsigned short *gfn_write_track; }; /* @@ -1247,8 +1265,9 @@ struct kvm_arch { * create an NX huge page (without hanging the guest). */ struct list_head possible_nx_huge_pages; - struct kvm_page_track_notifier_node mmu_sp_tracker; +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING struct kvm_page_track_notifier_head track_notifier_head; +#endif /* * Protects marking pages unsync during page faults, as TDP MMU page * faults only take mmu_lock for read. For simplicity, the unsync @@ -1566,9 +1585,10 @@ struct kvm_x86_ops { void (*set_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); + bool (*is_valid_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*post_set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); - bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr0); + bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); @@ -1654,8 +1674,8 @@ struct kvm_x86_ops { u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu); u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu); - void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier); + void (*write_tsc_offset)(struct kvm_vcpu *vcpu); + void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu); /* * Retrieve somewhat arbitrary exit information. Intended to @@ -1794,8 +1814,8 @@ static inline struct kvm *kvm_arch_alloc_vm(void) #define __KVM_HAVE_ARCH_VM_FREE void kvm_arch_free_vm(struct kvm *kvm); -#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB -static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS +static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { if (kvm_x86_ops.flush_remote_tlbs && !static_call(kvm_x86_flush_remote_tlbs)(kvm)) @@ -1804,6 +1824,8 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) return -ENOTSUPP; } +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE + #define kvm_arch_pmi_in_guest(vcpu) \ ((vcpu) && (vcpu)->arch.handling_intr_from_guest) @@ -1832,7 +1854,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot); -void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index eb186bc57f6a..3d040741044b 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -2,11 +2,9 @@ #ifndef _ASM_X86_KVM_PAGE_TRACK_H #define _ASM_X86_KVM_PAGE_TRACK_H -enum kvm_page_track_mode { - KVM_PAGE_TRACK_WRITE, - KVM_PAGE_TRACK_MAX, -}; +#include <linux/kvm_types.h> +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING /* * The notifier represented by @kvm_page_track_notifier_node is linked into * the head which will be notified when guest is triggering the track event. @@ -26,54 +24,39 @@ struct kvm_page_track_notifier_node { * It is called when guest is writing the write-tracked page * and write emulation is finished at that time. * - * @vcpu: the vcpu where the write access happened. * @gpa: the physical address written by guest. * @new: the data was written to the address. * @bytes: the written length. * @node: this node */ - void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes, struct kvm_page_track_notifier_node *node); + void (*track_write)(gpa_t gpa, const u8 *new, int bytes, + struct kvm_page_track_notifier_node *node); + /* - * It is called when memory slot is being moved or removed - * users can drop write-protection for the pages in that memory slot + * Invoked when a memory region is removed from the guest. Or in KVM + * terms, when a memslot is deleted. * - * @kvm: the kvm where memory slot being moved or removed - * @slot: the memory slot being moved or removed - * @node: this node + * @gfn: base gfn of the region being removed + * @nr_pages: number of pages in the to-be-removed region + * @node: this node */ - void (*track_flush_slot)(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_page_track_notifier_node *node); + void (*track_remove_region)(gfn_t gfn, unsigned long nr_pages, + struct kvm_page_track_notifier_node *node); }; -int kvm_page_track_init(struct kvm *kvm); -void kvm_page_track_cleanup(struct kvm *kvm); +int kvm_page_track_register_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n); +void kvm_page_track_unregister_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n); -bool kvm_page_track_write_tracking_enabled(struct kvm *kvm); -int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot); - -void kvm_page_track_free_memslot(struct kvm_memory_slot *slot); -int kvm_page_track_create_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot, - unsigned long npages); - -void kvm_slot_page_track_add_page(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode); -void kvm_slot_page_track_remove_page(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode); -bool kvm_slot_page_track_is_active(struct kvm *kvm, - const struct kvm_memory_slot *slot, - gfn_t gfn, enum kvm_page_track_mode mode); +int kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn); +int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn); +#else +/* + * Allow defining a node in a structure even if page tracking is disabled, e.g. + * to play nice with testing headers via direct inclusion from the command line. + */ +struct kvm_page_track_notifier_node {}; +#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */ -void -kvm_page_track_register_notifier(struct kvm *kvm, - struct kvm_page_track_notifier_node *n); -void -kvm_page_track_unregister_notifier(struct kvm *kvm, - struct kvm_page_track_notifier_node *n); -void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes); -void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot); #endif diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 0953aa32a324..5ff49fd67732 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -8,6 +8,14 @@ #undef notrace #define notrace __attribute__((no_instrument_function)) +#ifdef CONFIG_64BIT +/* + * The generic version tends to create spurious ENDBR instructions under + * certain conditions. + */ +#define _THIS_IP_ ({ unsigned long __here; asm ("lea 0(%%rip), %0" : "=r" (__here)); __here; }) +#endif + #ifdef CONFIG_X86_32 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) #endif /* CONFIG_X86_32 */ @@ -21,7 +29,7 @@ #define FUNCTION_PADDING #endif -#if (CONFIG_FUNCTION_ALIGNMENT > 8) && !defined(__DISABLE_EXPORTS) && !defined(BULID_VDSO) +#if (CONFIG_FUNCTION_ALIGNMENT > 8) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) # define __FUNC_ALIGN __ALIGN; FUNCTION_PADDING #else # define __FUNC_ALIGN __ALIGN diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 56d4ef604b91..635132a12778 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -127,8 +127,8 @@ static inline long local_cmpxchg(local_t *l, long old, long new) static inline bool local_try_cmpxchg(local_t *l, long *old, long new) { - typeof(l->a.counter) *__old = (typeof(l->a.counter) *) old; - return try_cmpxchg_local(&l->a.counter, __old, new); + return try_cmpxchg_local(&l->a.counter, + (typeof(l->a.counter) *) old, new); } /* Always has a lock prefix */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 7f97a8a97e24..473b16d73b47 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -50,8 +50,8 @@ void __init sme_enable(struct boot_params *bp); int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size); int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); -void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, - bool enc); +void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, + unsigned long size, bool enc); void __init mem_encrypt_free_decrypted_mem(void); @@ -85,7 +85,7 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; static inline int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } static inline void __init -early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) {} +early_set_mem_enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool enc) {} static inline void mem_encrypt_free_decrypted_mem(void) { } diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 320566a0443d..bbbe9d744977 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -2,137 +2,77 @@ #ifndef _ASM_X86_MICROCODE_H #define _ASM_X86_MICROCODE_H -#include <asm/cpu.h> -#include <linux/earlycpio.h> -#include <linux/initrd.h> - -struct ucode_patch { - struct list_head plist; - void *data; /* Intel uses only this one */ - unsigned int size; - u32 patch_id; - u16 equiv_cpu; -}; - -extern struct list_head microcode_cache; - struct cpu_signature { unsigned int sig; unsigned int pf; unsigned int rev; }; -struct device; - -enum ucode_state { - UCODE_OK = 0, - UCODE_NEW, - UCODE_UPDATED, - UCODE_NFOUND, - UCODE_ERROR, -}; - -struct microcode_ops { - enum ucode_state (*request_microcode_fw) (int cpu, struct device *); - - void (*microcode_fini_cpu) (int cpu); - - /* - * The generic 'microcode_core' part guarantees that - * the callbacks below run on a target cpu when they - * are being called. - * See also the "Synchronization" section in microcode_core.c. - */ - enum ucode_state (*apply_microcode) (int cpu); - int (*collect_cpu_info) (int cpu, struct cpu_signature *csig); -}; - struct ucode_cpu_info { struct cpu_signature cpu_sig; void *mc; }; -extern struct ucode_cpu_info ucode_cpu_info[]; -struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa); - -#ifdef CONFIG_MICROCODE_INTEL -extern struct microcode_ops * __init init_intel_microcode(void); -#else -static inline struct microcode_ops * __init init_intel_microcode(void) -{ - return NULL; -} -#endif /* CONFIG_MICROCODE_INTEL */ -#ifdef CONFIG_MICROCODE_AMD -extern struct microcode_ops * __init init_amd_microcode(void); -extern void __exit exit_amd_microcode(void); +#ifdef CONFIG_MICROCODE +void load_ucode_bsp(void); +void load_ucode_ap(void); +void microcode_bsp_resume(void); #else -static inline struct microcode_ops * __init init_amd_microcode(void) -{ - return NULL; -} -static inline void __exit exit_amd_microcode(void) {} +static inline void load_ucode_bsp(void) { } +static inline void load_ucode_ap(void) { } +static inline void microcode_bsp_resume(void) { } #endif -#define MAX_UCODE_COUNT 128 +#ifdef CONFIG_CPU_SUP_INTEL +/* Intel specific microcode defines. Public for IFS */ +struct microcode_header_intel { + unsigned int hdrver; + unsigned int rev; + unsigned int date; + unsigned int sig; + unsigned int cksum; + unsigned int ldrver; + unsigned int pf; + unsigned int datasize; + unsigned int totalsize; + unsigned int metasize; + unsigned int reserved[2]; +}; -#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) -#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') -#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') -#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') -#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') -#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') -#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') +struct microcode_intel { + struct microcode_header_intel hdr; + unsigned int bits[]; +}; -#define CPUID_IS(a, b, c, ebx, ecx, edx) \ - (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c)))) +#define DEFAULT_UCODE_DATASIZE (2000) +#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) +#define MC_HEADER_TYPE_MICROCODE 1 +#define MC_HEADER_TYPE_IFS 2 -/* - * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. - * x86_cpuid_vendor() gets vendor id for BSP. - * - * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify - * coding, we still use x86_cpuid_vendor() to get vendor id for AP. - * - * x86_cpuid_vendor() gets vendor information directly from CPUID. - */ -static inline int x86_cpuid_vendor(void) +static inline int intel_microcode_get_datasize(struct microcode_header_intel *hdr) { - u32 eax = 0x00000000; - u32 ebx, ecx = 0, edx; - - native_cpuid(&eax, &ebx, &ecx, &edx); - - if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) - return X86_VENDOR_INTEL; - - if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) - return X86_VENDOR_AMD; - - return X86_VENDOR_UNKNOWN; + return hdr->datasize ? : DEFAULT_UCODE_DATASIZE; } -static inline unsigned int x86_cpuid_family(void) +static inline u32 intel_get_microcode_revision(void) { - u32 eax = 0x00000001; - u32 ebx, ecx = 0, edx; + u32 rev, dummy; + + native_wrmsrl(MSR_IA32_UCODE_REV, 0); - native_cpuid(&eax, &ebx, &ecx, &edx); + /* As documented in the SDM: Do a CPUID 1 here */ + native_cpuid_eax(1); - return x86_family(eax); + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev); + + return rev; } -#ifdef CONFIG_MICROCODE -extern void __init load_ucode_bsp(void); -extern void load_ucode_ap(void); -void reload_early_microcode(unsigned int cpu); -extern bool initrd_gone; -void microcode_bsp_resume(void); -#else -static inline void __init load_ucode_bsp(void) { } -static inline void load_ucode_ap(void) { } -static inline void reload_early_microcode(unsigned int cpu) { } -static inline void microcode_bsp_resume(void) { } -#endif +void show_ucode_info_early(void); + +#else /* CONFIG_CPU_SUP_INTEL */ +static inline void show_ucode_info_early(void) { } +#endif /* !CONFIG_CPU_SUP_INTEL */ #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h deleted file mode 100644 index e6662adf3af4..000000000000 --- a/arch/x86/include/asm/microcode_amd.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_MICROCODE_AMD_H -#define _ASM_X86_MICROCODE_AMD_H - -#include <asm/microcode.h> - -#define UCODE_MAGIC 0x00414d44 -#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 -#define UCODE_UCODE_TYPE 0x00000001 - -#define SECTION_HDR_SIZE 8 -#define CONTAINER_HDR_SZ 12 - -struct equiv_cpu_entry { - u32 installed_cpu; - u32 fixed_errata_mask; - u32 fixed_errata_compare; - u16 equiv_cpu; - u16 res; -} __attribute__((packed)); - -struct microcode_header_amd { - u32 data_code; - u32 patch_id; - u16 mc_patch_data_id; - u8 mc_patch_data_len; - u8 init_flag; - u32 mc_patch_data_checksum; - u32 nb_dev_id; - u32 sb_dev_id; - u16 processor_rev_id; - u8 nb_rev_id; - u8 sb_rev_id; - u8 bios_api_rev; - u8 reserved1[3]; - u32 match_reg[8]; -} __attribute__((packed)); - -struct microcode_amd { - struct microcode_header_amd hdr; - unsigned int mpb[]; -}; - -#define PATCH_MAX_SIZE (3 * PAGE_SIZE) - -#ifdef CONFIG_MICROCODE_AMD -extern void __init load_ucode_amd_bsp(unsigned int family); -extern void load_ucode_amd_ap(unsigned int family); -extern int __init save_microcode_in_initrd_amd(unsigned int family); -void reload_ucode_amd(unsigned int cpu); -#else -static inline void __init load_ucode_amd_bsp(unsigned int family) {} -static inline void load_ucode_amd_ap(unsigned int family) {} -static inline int __init -save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } -static inline void reload_ucode_amd(unsigned int cpu) {} -#endif -#endif /* _ASM_X86_MICROCODE_AMD_H */ diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h deleted file mode 100644 index f1fa979e05bf..000000000000 --- a/arch/x86/include/asm/microcode_intel.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_MICROCODE_INTEL_H -#define _ASM_X86_MICROCODE_INTEL_H - -#include <asm/microcode.h> - -struct microcode_header_intel { - unsigned int hdrver; - unsigned int rev; - unsigned int date; - unsigned int sig; - unsigned int cksum; - unsigned int ldrver; - unsigned int pf; - unsigned int datasize; - unsigned int totalsize; - unsigned int metasize; - unsigned int reserved[2]; -}; - -struct microcode_intel { - struct microcode_header_intel hdr; - unsigned int bits[]; -}; - -/* microcode format is extended from prescott processors */ -struct extended_signature { - unsigned int sig; - unsigned int pf; - unsigned int cksum; -}; - -struct extended_sigtable { - unsigned int count; - unsigned int cksum; - unsigned int reserved[3]; - struct extended_signature sigs[]; -}; - -#define DEFAULT_UCODE_DATASIZE (2000) -#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) -#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) -#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) -#define MC_HEADER_TYPE_MICROCODE 1 -#define MC_HEADER_TYPE_IFS 2 - -#define get_totalsize(mc) \ - (((struct microcode_intel *)mc)->hdr.datasize ? \ - ((struct microcode_intel *)mc)->hdr.totalsize : \ - DEFAULT_UCODE_TOTALSIZE) - -#define get_datasize(mc) \ - (((struct microcode_intel *)mc)->hdr.datasize ? \ - ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) - -#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) - -static inline u32 intel_get_microcode_revision(void) -{ - u32 rev, dummy; - - native_wrmsrl(MSR_IA32_UCODE_REV, 0); - - /* As documented in the SDM: Do a CPUID 1 here */ - native_cpuid_eax(1); - - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev); - - return rev; -} - -#ifdef CONFIG_MICROCODE_INTEL -extern void __init load_ucode_intel_bsp(void); -extern void load_ucode_intel_ap(void); -extern void show_ucode_info_early(void); -extern int __init save_microcode_in_initrd_intel(void); -void reload_ucode_intel(void); -#else -static inline __init void load_ucode_intel_bsp(void) {} -static inline void load_ucode_intel_ap(void) {} -static inline void show_ucode_info_early(void) {} -static inline int __init save_microcode_in_initrd_intel(void) { return -EINVAL; } -static inline void reload_ucode_intel(void) {} -#endif - -#endif /* _ASM_X86_MICROCODE_INTEL_H */ diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h new file mode 100644 index 000000000000..12b820259b9f --- /dev/null +++ b/arch/x86/include/asm/mman.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_MMAN_H__ +#define __ASM_MMAN_H__ + +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +#define arch_calc_vm_prot_bits(prot, key) ( \ + ((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \ + ((key) & 0x2 ? VM_PKEY_BIT1 : 0) | \ + ((key) & 0x4 ? VM_PKEY_BIT2 : 0) | \ + ((key) & 0x8 ? VM_PKEY_BIT3 : 0)) +#endif + +#include <uapi/asm/mman.h> + +#endif /* __ASM_MMAN_H__ */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 1d29dc791f5a..416901d406f8 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -186,6 +186,8 @@ do { \ #else #define deactivate_mm(tsk, mm) \ do { \ + if (!tsk->vfork_done) \ + shstk_free(tsk); \ load_gs_index(0); \ loadsegment(fs, 0); \ } while (0) diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index e90ac7e9ae2c..f46df8349e86 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -23,8 +23,6 @@ extern int pic_mode; #define MAX_IRQ_SOURCES 256 -extern unsigned int def_to_bigsmp; - #else /* CONFIG_X86_64: */ #define MAX_MP_BUSSES 256 @@ -41,7 +39,6 @@ extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); extern unsigned int boot_cpu_physical_apicid; extern u8 boot_cpu_apic_version; -extern unsigned long mp_lapic_addr; #ifdef CONFIG_X86_LOCAL_APIC extern int smp_found_config; @@ -76,7 +73,7 @@ static inline void e820__memblock_alloc_reserved_mpc_new(void) { } #define default_get_smp_config x86_init_uint_noop #endif -int generic_processor_info(int apicid, int version); +int generic_processor_info(int apicid); #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC) @@ -87,13 +84,7 @@ struct physid_mask { typedef struct physid_mask physid_mask_t; #define physid_set(physid, map) set_bit(physid, (map).mask) -#define physid_clear(physid, map) clear_bit(physid, (map).mask) #define physid_isset(physid, map) test_bit(physid, (map).mask) -#define physid_test_and_set(physid, map) \ - test_and_set_bit(physid, (map).mask) - -#define physids_and(dst, src1, src2) \ - bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC) #define physids_or(dst, src1, src2) \ bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC) @@ -101,29 +92,9 @@ typedef struct physid_mask physid_mask_t; #define physids_clear(map) \ bitmap_zero((map).mask, MAX_LOCAL_APIC) -#define physids_complement(dst, src) \ - bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC) - #define physids_empty(map) \ bitmap_empty((map).mask, MAX_LOCAL_APIC) -#define physids_equal(map1, map2) \ - bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC) - -#define physids_weight(map) \ - bitmap_weight((map).mask, MAX_LOCAL_APIC) - -#define physids_shift_right(d, s, n) \ - bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC) - -#define physids_shift_left(d, s, n) \ - bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC) - -static inline unsigned long physids_coerce(physid_mask_t *map) -{ - return map->mask[0]; -} - static inline void physids_promote(unsigned long physids, physid_mask_t *map) { physids_clear(*map); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 88d9ef98e087..033b53f993c6 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -5,7 +5,7 @@ #include <linux/types.h> #include <linux/nmi.h> #include <linux/msi.h> -#include <asm/io.h> +#include <linux/io.h> #include <asm/hyperv-tlfs.h> #include <asm/nospec-branch.h> #include <asm/paravirt.h> @@ -26,6 +26,7 @@ union hv_ghcb; DECLARE_STATIC_KEY_FALSE(isolation_type_snp); +DECLARE_STATIC_KEY_FALSE(isolation_type_tdx); typedef int (*hyperv_fill_flush_list_func)( struct hv_guest_mapping_flush_list *flush, @@ -40,6 +41,7 @@ static inline unsigned char hv_get_nmi_reason(void) #if IS_ENABLED(CONFIG_HYPERV) extern int hyperv_init_cpuhp; +extern bool hyperv_paravisor_present; extern void *hv_hypercall_pg; @@ -47,10 +49,25 @@ extern u64 hv_current_partition_id; extern union hv_ghcb * __percpu *hv_ghcb_pg; +bool hv_isolation_type_snp(void); +bool hv_isolation_type_tdx(void); +u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2); + +/* + * DEFAULT INIT GPAT and SEGMENT LIMIT value in struct VMSA + * to start AP in enlightened SEV guest. + */ +#define HV_AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL +#define HV_AP_SEGMENT_LIMIT 0xffffffff + int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); +/* + * If the hypercall involves no input or output parameters, the hypervisor + * ignores the corresponding GPA pointer. + */ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) { u64 input_address = input ? virt_to_phys(input) : 0; @@ -58,6 +75,19 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) u64 hv_status; #ifdef CONFIG_X86_64 + if (hv_isolation_type_tdx() && !hyperv_paravisor_present) + return hv_tdx_hypercall(control, input_address, output_address); + + if (hv_isolation_type_snp() && !hyperv_paravisor_present) { + __asm__ __volatile__("mov %4, %%r8\n" + "vmmcall" + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (input_address) + : "r" (output_address) + : "cc", "memory", "r8", "r9", "r10", "r11"); + return hv_status; + } + if (!hv_hypercall_pg) return U64_MAX; @@ -101,7 +131,16 @@ static inline u64 _hv_do_fast_hypercall8(u64 control, u64 input1) u64 hv_status; #ifdef CONFIG_X86_64 - { + if (hv_isolation_type_tdx() && !hyperv_paravisor_present) + return hv_tdx_hypercall(control, input1, 0); + + if (hv_isolation_type_snp() && !hyperv_paravisor_present) { + __asm__ __volatile__( + "vmmcall" + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (input1) + :: "cc", "r8", "r9", "r10", "r11"); + } else { __asm__ __volatile__(CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input1) @@ -146,7 +185,17 @@ static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2) u64 hv_status; #ifdef CONFIG_X86_64 - { + if (hv_isolation_type_tdx() && !hyperv_paravisor_present) + return hv_tdx_hypercall(control, input1, input2); + + if (hv_isolation_type_snp() && !hyperv_paravisor_present) { + __asm__ __volatile__("mov %4, %%r8\n" + "vmmcall" + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (input1) + : "r" (input2) + : "cc", "r8", "r9", "r10", "r11"); + } else { __asm__ __volatile__("mov %4, %%r8\n" CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, @@ -225,20 +274,24 @@ int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector, int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry); #ifdef CONFIG_AMD_MEM_ENCRYPT -void hv_ghcb_msr_write(u64 msr, u64 value); -void hv_ghcb_msr_read(u64 msr, u64 *value); bool hv_ghcb_negotiate_protocol(void); void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason); -void hv_vtom_init(void); +int hv_snp_boot_ap(int cpu, unsigned long start_ip); #else -static inline void hv_ghcb_msr_write(u64 msr, u64 value) {} -static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {} static inline bool hv_ghcb_negotiate_protocol(void) { return false; } static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {} -static inline void hv_vtom_init(void) {} +static inline int hv_snp_boot_ap(int cpu, unsigned long start_ip) { return 0; } #endif -extern bool hv_isolation_type_snp(void); +#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) +void hv_vtom_init(void); +void hv_ivm_msr_write(u64 msr, u64 value); +void hv_ivm_msr_read(u64 msr, u64 *value); +#else +static inline void hv_vtom_init(void) {} +static inline void hv_ivm_msr_write(u64 msr, u64 value) {} +static inline void hv_ivm_msr_read(u64 msr, u64 *value) {} +#endif static inline bool hv_is_synic_reg(unsigned int reg) { diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3aedae61af4f..1d111350197f 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -57,6 +57,7 @@ #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ +#define PRED_CMD_SBPB BIT(7) /* Selective Branch Prediction Barrier */ #define MSR_PPIN_CTL 0x0000004e #define MSR_PPIN 0x0000004f @@ -155,6 +156,15 @@ * Not susceptible to Post-Barrier * Return Stack Buffer Predictions. */ +#define ARCH_CAP_GDS_CTRL BIT(25) /* + * CPU is vulnerable to Gather + * Data Sampling (GDS) and + * has controls for mitigation. + */ +#define ARCH_CAP_GDS_NO BIT(26) /* + * CPU is not vulnerable to Gather + * Data Sampling (GDS). + */ #define ARCH_CAP_XAPIC_DISABLE BIT(21) /* * IA32_XAPIC_DISABLE_STATUS MSR @@ -178,6 +188,8 @@ #define RNGDS_MITG_DIS BIT(0) /* SRBDS support */ #define RTM_ALLOW BIT(1) /* TSX development mode */ #define FB_CLEAR_DIS BIT(3) /* CPU Fill buffer clear disable */ +#define GDS_MITG_DIS BIT(4) /* Disable GDS mitigation */ +#define GDS_MITG_LOCKED BIT(5) /* GDS mitigation locked */ #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 @@ -545,6 +557,7 @@ #define MSR_AMD64_DE_CFG 0xc0011029 #define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1 #define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE BIT_ULL(MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT) +#define MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT 9 #define MSR_AMD64_BU_CFG2 0xc001102a #define MSR_AMD64_IBSFETCHCTL 0xc0011030 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 1a65cf4acb2b..c55cc243592e 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -211,7 +211,8 @@ * eventually turn into it's own annotation. */ .macro VALIDATE_UNRET_END -#if defined(CONFIG_NOINSTR_VALIDATION) && defined(CONFIG_CPU_UNRET_ENTRY) +#if defined(CONFIG_NOINSTR_VALIDATION) && \ + (defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)) ANNOTATE_RETPOLINE_SAFE nop #endif @@ -271,9 +272,9 @@ .endm #ifdef CONFIG_CPU_UNRET_ENTRY -#define CALL_ZEN_UNTRAIN_RET "call zen_untrain_ret" +#define CALL_UNTRAIN_RET "call entry_untrain_ret" #else -#define CALL_ZEN_UNTRAIN_RET "" +#define CALL_UNTRAIN_RET "" #endif /* @@ -281,7 +282,7 @@ * return thunk isn't mapped into the userspace tables (then again, AMD * typically has NO_MELTDOWN). * - * While zen_untrain_ret() doesn't clobber anything but requires stack, + * While retbleed_untrain_ret() doesn't clobber anything but requires stack, * entry_ibpb() will clobber AX, CX, DX. * * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point @@ -289,21 +290,32 @@ */ .macro UNTRAIN_RET #if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ - defined(CONFIG_CALL_DEPTH_TRACKING) + defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO) VALIDATE_UNRET_END ALTERNATIVE_3 "", \ - CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ + CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH #endif .endm +.macro UNTRAIN_RET_VM +#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ + defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO) + VALIDATE_UNRET_END + ALTERNATIVE_3 "", \ + CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ + "call entry_ibpb", X86_FEATURE_IBPB_ON_VMEXIT, \ + __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH +#endif +.endm + .macro UNTRAIN_RET_FROM_CALL #if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ defined(CONFIG_CALL_DEPTH_TRACKING) VALIDATE_UNRET_END ALTERNATIVE_3 "", \ - CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ + CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ __stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH #endif @@ -330,15 +342,24 @@ extern retpoline_thunk_t __x86_indirect_thunk_array[]; extern retpoline_thunk_t __x86_indirect_call_thunk_array[]; extern retpoline_thunk_t __x86_indirect_jump_thunk_array[]; +#ifdef CONFIG_RETHUNK extern void __x86_return_thunk(void); -extern void zen_untrain_ret(void); +#else +static inline void __x86_return_thunk(void) {} +#endif + +extern void retbleed_return_thunk(void); +extern void srso_return_thunk(void); +extern void srso_alias_return_thunk(void); + +extern void retbleed_untrain_ret(void); +extern void srso_untrain_ret(void); +extern void srso_alias_untrain_ret(void); + +extern void entry_untrain_ret(void); extern void entry_ibpb(void); -#ifdef CONFIG_CALL_THUNKS extern void (*x86_return_thunk)(void); -#else -#define x86_return_thunk (&__x86_return_thunk) -#endif #ifdef CONFIG_CALL_DEPTH_TRACKING extern void __x86_return_skl(void); @@ -465,9 +486,6 @@ enum ssb_mitigation { SPEC_STORE_BYPASS_SECCOMP, }; -extern char __indirect_thunk_start[]; -extern char __indirect_thunk_end[]; - static __always_inline void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) { @@ -479,11 +497,11 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) : "memory"); } +extern u64 x86_pred_cmd; + static inline void indirect_branch_prediction_barrier(void) { - u64 val = PRED_CMD_IBPB; - - alternative_msr_write(MSR_IA32_PRED_CMD, val, X86_FEATURE_USE_IBPB); + alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_USE_IBPB); } /* The Intel SPEC CTRL MSR base value cache */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index b49778664d2b..6c8ff12140ae 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -739,6 +739,7 @@ static __always_inline unsigned long arch_local_irq_save(void) ".popsection") extern void default_banner(void); +void native_pv_lock_init(void) __init; #else /* __ASSEMBLY__ */ @@ -778,6 +779,12 @@ extern void default_banner(void); #endif /* __ASSEMBLY__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop + +#ifndef __ASSEMBLY__ +static inline void native_pv_lock_init(void) +{ +} +#endif #endif /* !CONFIG_PARAVIRT */ #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5700bb337987..d6ad98ca1288 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -125,9 +125,15 @@ extern pmdval_t early_pmd_flags; * The following only work if pte_present() is true. * Undefined behaviour if not.. */ -static inline int pte_dirty(pte_t pte) +static inline bool pte_dirty(pte_t pte) { - return pte_flags(pte) & _PAGE_DIRTY; + return pte_flags(pte) & _PAGE_DIRTY_BITS; +} + +static inline bool pte_shstk(pte_t pte) +{ + return cpu_feature_enabled(X86_FEATURE_SHSTK) && + (pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY; } static inline int pte_young(pte_t pte) @@ -135,9 +141,16 @@ static inline int pte_young(pte_t pte) return pte_flags(pte) & _PAGE_ACCESSED; } -static inline int pmd_dirty(pmd_t pmd) +static inline bool pmd_dirty(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_DIRTY_BITS; +} + +static inline bool pmd_shstk(pmd_t pmd) { - return pmd_flags(pmd) & _PAGE_DIRTY; + return cpu_feature_enabled(X86_FEATURE_SHSTK) && + (pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) == + (_PAGE_DIRTY | _PAGE_PSE); } #define pmd_young pmd_young @@ -146,9 +159,9 @@ static inline int pmd_young(pmd_t pmd) return pmd_flags(pmd) & _PAGE_ACCESSED; } -static inline int pud_dirty(pud_t pud) +static inline bool pud_dirty(pud_t pud) { - return pud_flags(pud) & _PAGE_DIRTY; + return pud_flags(pud) & _PAGE_DIRTY_BITS; } static inline int pud_young(pud_t pud) @@ -158,7 +171,27 @@ static inline int pud_young(pud_t pud) static inline int pte_write(pte_t pte) { - return pte_flags(pte) & _PAGE_RW; + /* + * Shadow stack pages are logically writable, but do not have + * _PAGE_RW. Check for them separately from _PAGE_RW itself. + */ + return (pte_flags(pte) & _PAGE_RW) || pte_shstk(pte); +} + +#define pmd_write pmd_write +static inline int pmd_write(pmd_t pmd) +{ + /* + * Shadow stack pages are logically writable, but do not have + * _PAGE_RW. Check for them separately from _PAGE_RW itself. + */ + return (pmd_flags(pmd) & _PAGE_RW) || pmd_shstk(pmd); +} + +#define pud_write pud_write +static inline int pud_write(pud_t pud) +{ + return pud_flags(pud) & _PAGE_RW; } static inline int pte_huge(pte_t pte) @@ -185,6 +218,8 @@ static inline int pte_special(pte_t pte) static inline u64 protnone_mask(u64 val); +#define PFN_PTE_SHIFT PAGE_SHIFT + static inline unsigned long pte_pfn(pte_t pte) { phys_addr_t pfn = pte_val(pte); @@ -290,9 +325,63 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) return native_make_pte(v & ~clear); } +/* + * Write protection operations can result in Dirty=1,Write=0 PTEs. But in the + * case of X86_FEATURE_USER_SHSTK, these PTEs denote shadow stack memory. So + * when creating dirty, write-protected memory, a software bit is used: + * _PAGE_BIT_SAVED_DIRTY. The following functions take a PTE and transition the + * Dirty bit to SavedDirty, and vice-vesra. + * + * This shifting is only done if needed. In the case of shifting + * Dirty->SavedDirty, the condition is if the PTE is Write=0. In the case of + * shifting SavedDirty->Dirty, the condition is Write=1. + */ +static inline pgprotval_t mksaveddirty_shift(pgprotval_t v) +{ + pgprotval_t cond = (~v >> _PAGE_BIT_RW) & 1; + + v |= ((v >> _PAGE_BIT_DIRTY) & cond) << _PAGE_BIT_SAVED_DIRTY; + v &= ~(cond << _PAGE_BIT_DIRTY); + + return v; +} + +static inline pgprotval_t clear_saveddirty_shift(pgprotval_t v) +{ + pgprotval_t cond = (v >> _PAGE_BIT_RW) & 1; + + v |= ((v >> _PAGE_BIT_SAVED_DIRTY) & cond) << _PAGE_BIT_DIRTY; + v &= ~(cond << _PAGE_BIT_SAVED_DIRTY); + + return v; +} + +static inline pte_t pte_mksaveddirty(pte_t pte) +{ + pteval_t v = native_pte_val(pte); + + v = mksaveddirty_shift(v); + return native_make_pte(v); +} + +static inline pte_t pte_clear_saveddirty(pte_t pte) +{ + pteval_t v = native_pte_val(pte); + + v = clear_saveddirty_shift(v); + return native_make_pte(v); +} + static inline pte_t pte_wrprotect(pte_t pte) { - return pte_clear_flags(pte, _PAGE_RW); + pte = pte_clear_flags(pte, _PAGE_RW); + + /* + * Blindly clearing _PAGE_RW might accidentally create + * a shadow stack PTE (Write=0,Dirty=1). Move the hardware + * dirty value to the software bit, if present. + */ + return pte_mksaveddirty(pte); } #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP @@ -330,7 +419,7 @@ static inline pte_t pte_clear_uffd_wp(pte_t pte) static inline pte_t pte_mkclean(pte_t pte) { - return pte_clear_flags(pte, _PAGE_DIRTY); + return pte_clear_flags(pte, _PAGE_DIRTY_BITS); } static inline pte_t pte_mkold(pte_t pte) @@ -345,7 +434,16 @@ static inline pte_t pte_mkexec(pte_t pte) static inline pte_t pte_mkdirty(pte_t pte) { - return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + pte = pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + + return pte_mksaveddirty(pte); +} + +static inline pte_t pte_mkwrite_shstk(pte_t pte) +{ + pte = pte_clear_flags(pte, _PAGE_RW); + + return pte_set_flags(pte, _PAGE_DIRTY); } static inline pte_t pte_mkyoung(pte_t pte) @@ -353,11 +451,15 @@ static inline pte_t pte_mkyoung(pte_t pte) return pte_set_flags(pte, _PAGE_ACCESSED); } -static inline pte_t pte_mkwrite(pte_t pte) +static inline pte_t pte_mkwrite_novma(pte_t pte) { return pte_set_flags(pte, _PAGE_RW); } +struct vm_area_struct; +pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma); +#define pte_mkwrite pte_mkwrite + static inline pte_t pte_mkhuge(pte_t pte) { return pte_set_flags(pte, _PAGE_PSE); @@ -402,9 +504,34 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) return native_make_pmd(v & ~clear); } +/* See comments above mksaveddirty_shift() */ +static inline pmd_t pmd_mksaveddirty(pmd_t pmd) +{ + pmdval_t v = native_pmd_val(pmd); + + v = mksaveddirty_shift(v); + return native_make_pmd(v); +} + +/* See comments above mksaveddirty_shift() */ +static inline pmd_t pmd_clear_saveddirty(pmd_t pmd) +{ + pmdval_t v = native_pmd_val(pmd); + + v = clear_saveddirty_shift(v); + return native_make_pmd(v); +} + static inline pmd_t pmd_wrprotect(pmd_t pmd) { - return pmd_clear_flags(pmd, _PAGE_RW); + pmd = pmd_clear_flags(pmd, _PAGE_RW); + + /* + * Blindly clearing _PAGE_RW might accidentally create + * a shadow stack PMD (RW=0, Dirty=1). Move the hardware + * dirty value to the software bit. + */ + return pmd_mksaveddirty(pmd); } #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP @@ -431,12 +558,21 @@ static inline pmd_t pmd_mkold(pmd_t pmd) static inline pmd_t pmd_mkclean(pmd_t pmd) { - return pmd_clear_flags(pmd, _PAGE_DIRTY); + return pmd_clear_flags(pmd, _PAGE_DIRTY_BITS); } static inline pmd_t pmd_mkdirty(pmd_t pmd) { - return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + pmd = pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + + return pmd_mksaveddirty(pmd); +} + +static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd) +{ + pmd = pmd_clear_flags(pmd, _PAGE_RW); + + return pmd_set_flags(pmd, _PAGE_DIRTY); } static inline pmd_t pmd_mkdevmap(pmd_t pmd) @@ -454,11 +590,14 @@ static inline pmd_t pmd_mkyoung(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_ACCESSED); } -static inline pmd_t pmd_mkwrite(pmd_t pmd) +static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_RW); } +pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); +#define pmd_mkwrite pmd_mkwrite + static inline pud_t pud_set_flags(pud_t pud, pudval_t set) { pudval_t v = native_pud_val(pud); @@ -473,6 +612,24 @@ static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) return native_make_pud(v & ~clear); } +/* See comments above mksaveddirty_shift() */ +static inline pud_t pud_mksaveddirty(pud_t pud) +{ + pudval_t v = native_pud_val(pud); + + v = mksaveddirty_shift(v); + return native_make_pud(v); +} + +/* See comments above mksaveddirty_shift() */ +static inline pud_t pud_clear_saveddirty(pud_t pud) +{ + pudval_t v = native_pud_val(pud); + + v = clear_saveddirty_shift(v); + return native_make_pud(v); +} + static inline pud_t pud_mkold(pud_t pud) { return pud_clear_flags(pud, _PAGE_ACCESSED); @@ -480,17 +637,26 @@ static inline pud_t pud_mkold(pud_t pud) static inline pud_t pud_mkclean(pud_t pud) { - return pud_clear_flags(pud, _PAGE_DIRTY); + return pud_clear_flags(pud, _PAGE_DIRTY_BITS); } static inline pud_t pud_wrprotect(pud_t pud) { - return pud_clear_flags(pud, _PAGE_RW); + pud = pud_clear_flags(pud, _PAGE_RW); + + /* + * Blindly clearing _PAGE_RW might accidentally create + * a shadow stack PUD (RW=0, Dirty=1). Move the hardware + * dirty value to the software bit. + */ + return pud_mksaveddirty(pud); } static inline pud_t pud_mkdirty(pud_t pud) { - return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + pud = pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + + return pud_mksaveddirty(pud); } static inline pud_t pud_mkdevmap(pud_t pud) @@ -510,7 +676,9 @@ static inline pud_t pud_mkyoung(pud_t pud) static inline pud_t pud_mkwrite(pud_t pud) { - return pud_set_flags(pud, _PAGE_RW); + pud = pud_set_flags(pud, _PAGE_RW); + + return pud_clear_saveddirty(pud); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY @@ -627,6 +795,7 @@ static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pteval_t val = pte_val(pte), oldval = val; + pte_t pte_result; /* * Chop off the NX bit (if present), and add the NX portion of @@ -635,17 +804,54 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) val &= _PAGE_CHG_MASK; val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK; val = flip_protnone_guard(oldval, val, PTE_PFN_MASK); - return __pte(val); + + pte_result = __pte(val); + + /* + * To avoid creating Write=0,Dirty=1 PTEs, pte_modify() needs to avoid: + * 1. Marking Write=0 PTEs Dirty=1 + * 2. Marking Dirty=1 PTEs Write=0 + * + * The first case cannot happen because the _PAGE_CHG_MASK will filter + * out any Dirty bit passed in newprot. Handle the second case by + * going through the mksaveddirty exercise. Only do this if the old + * value was Write=1 to avoid doing this on Shadow Stack PTEs. + */ + if (oldval & _PAGE_RW) + pte_result = pte_mksaveddirty(pte_result); + else + pte_result = pte_clear_saveddirty(pte_result); + + return pte_result; } static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { pmdval_t val = pmd_val(pmd), oldval = val; + pmd_t pmd_result; - val &= _HPAGE_CHG_MASK; + val &= (_HPAGE_CHG_MASK & ~_PAGE_DIRTY); val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK); - return __pmd(val); + + pmd_result = __pmd(val); + + /* + * To avoid creating Write=0,Dirty=1 PMDs, pte_modify() needs to avoid: + * 1. Marking Write=0 PMDs Dirty=1 + * 2. Marking Dirty=1 PMDs Write=0 + * + * The first case cannot happen because the _PAGE_CHG_MASK will filter + * out any Dirty bit passed in newprot. Handle the second case by + * going through the mksaveddirty exercise. Only do this if the old + * value was Write=1 to avoid doing this on Shadow Stack PTEs. + */ + if (oldval & _PAGE_RW) + pmd_result = pmd_mksaveddirty(pmd_result); + else + pmd_result = pmd_clear_saveddirty(pmd_result); + + return pmd_result; } /* @@ -829,7 +1035,14 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * (Currently stuck as a macro because of indirect forward reference * to linux/mm.h:page_to_nid()) */ -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) +#define mk_pte(page, pgprot) \ +({ \ + pgprot_t __pgprot = pgprot; \ + \ + WARN_ON_ONCE((pgprot_val(__pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == \ + _PAGE_DIRTY); \ + pfn_pte(page_to_pfn(page), __pgprot); \ +}) static inline int pmd_bad(pmd_t pmd) { @@ -1020,24 +1233,17 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) return res; } -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - page_table_check_pte_set(mm, addr, ptep, pte); - set_pte(ptep, pte); -} - static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, addr, pmdp, pmd); + page_table_check_pmd_set(mm, pmdp, pmd); set_pmd(pmdp, pmd); } static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(mm, addr, pudp, pud); + page_table_check_pud_set(mm, pudp, pud); native_set_pud(pudp, pud); } @@ -1068,7 +1274,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); - page_table_check_pte_clear(mm, addr, pte); + page_table_check_pte_clear(mm, pte); return pte; } @@ -1084,7 +1290,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, * care about updates and native needs no locking */ pte = native_local_ptep_get_and_clear(ptep); - page_table_check_pte_clear(mm, addr, pte); + page_table_check_pte_clear(mm, pte); } else { pte = ptep_get_and_clear(mm, addr, ptep); } @@ -1095,7 +1301,17 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte); + /* + * Avoid accidentally creating shadow stack PTEs + * (Write=0,Dirty=1). Use cmpxchg() to prevent races with + * the hardware setting Dirty=1. + */ + pte_t old_pte, new_pte; + + old_pte = READ_ONCE(*ptep); + do { + new_pte = pte_wrprotect(old_pte); + } while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte)); } #define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0) @@ -1121,19 +1337,13 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); -#define pmd_write pmd_write -static inline int pmd_write(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_RW; -} - #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { pmd_t pmd = native_pmdp_get_and_clear(pmdp); - page_table_check_pmd_clear(mm, addr, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } @@ -1144,7 +1354,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, { pud_t pud = native_pudp_get_and_clear(pudp); - page_table_check_pud_clear(mm, addr, pud); + page_table_check_pud_clear(mm, pud); return pud; } @@ -1153,13 +1363,17 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); -} + /* + * Avoid accidentally creating shadow stack PTEs + * (Write=0,Dirty=1). Use cmpxchg() to prevent races with + * the hardware setting Dirty=1. + */ + pmd_t old_pmd, new_pmd; -#define pud_write pud_write -static inline int pud_write(pud_t pud) -{ - return pud_flags(pud) & _PAGE_RW; + old_pmd = READ_ONCE(*pmdp); + do { + new_pmd = pmd_wrprotect(old_pmd); + } while (!try_cmpxchg((long *)pmdp, (long *)&old_pmd, *(long *)&new_pmd)); } #ifndef pmdp_establish @@ -1167,7 +1381,7 @@ static inline int pud_write(pud_t pud) static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pmdp, pmd); } else { @@ -1292,6 +1506,11 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { } +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ +} static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { @@ -1412,6 +1631,11 @@ static inline bool __pte_access_permitted(unsigned long pteval, bool write) { unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; + /* + * Write=0,Dirty=1 PTEs are shadow stack, which the kernel + * shouldn't generally allow access to, but since they + * are already Write=0, the below logic covers both cases. + */ if (write) need_pte_bits |= _PAGE_RW; @@ -1453,6 +1677,12 @@ static inline bool arch_has_hw_pte_young(void) return true; } +#define arch_check_zapped_pte arch_check_zapped_pte +void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte); + +#define arch_check_zapped_pmd arch_check_zapped_pmd +void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd); + #ifdef CONFIG_XEN_PV #define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young static inline bool arch_has_hw_nonleaf_pmd_young(void) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index ba3e2554799a..0b748ee16b3d 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -21,7 +21,8 @@ #define _PAGE_BIT_SOFTW2 10 /* " */ #define _PAGE_BIT_SOFTW3 11 /* " */ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ -#define _PAGE_BIT_SOFTW4 58 /* available for programmer */ +#define _PAGE_BIT_SOFTW4 57 /* available for programmer */ +#define _PAGE_BIT_SOFTW5 58 /* available for programmer */ #define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */ #define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */ #define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */ @@ -34,6 +35,13 @@ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 +#ifdef CONFIG_X86_64 +#define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW5 /* Saved Dirty bit */ +#else +/* Shared with _PAGE_BIT_UFFD_WP which is not supported on 32 bit */ +#define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW2 /* Saved Dirty bit */ +#endif + /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL @@ -117,6 +125,18 @@ #define _PAGE_SOFTW4 (_AT(pteval_t, 0)) #endif +/* + * The hardware requires shadow stack to be Write=0,Dirty=1. However, + * there are valid cases where the kernel might create read-only PTEs that + * are dirty (e.g., fork(), mprotect(), uffd-wp(), soft-dirty tracking). In + * this case, the _PAGE_SAVED_DIRTY bit is used instead of the HW-dirty bit, + * to avoid creating a wrong "shadow stack" PTEs. Such PTEs have + * (Write=0,SavedDirty=1,Dirty=0) set. + */ +#define _PAGE_SAVED_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SAVED_DIRTY) + +#define _PAGE_DIRTY_BITS (_PAGE_DIRTY | _PAGE_SAVED_DIRTY) + #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) /* @@ -125,11 +145,12 @@ * instance, and is *not* included in this mask since * pte_modify() does modify it. */ -#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ - _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC | \ - _PAGE_UFFD_WP) -#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) +#define _COMMON_PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ + _PAGE_SPECIAL | _PAGE_ACCESSED | \ + _PAGE_DIRTY_BITS | _PAGE_SOFT_DIRTY | \ + _PAGE_DEVMAP | _PAGE_ENC | _PAGE_UFFD_WP) +#define _PAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PAT) +#define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE) /* * The cache modes defined here are used to translate between pure SW usage @@ -188,14 +209,22 @@ enum page_cache_mode { #define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G) #define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G) + +/* + * Page tables needs to have Write=1 in order for any lower PTEs to be + * writable. This includes shadow stack memory (Write=0, Dirty=1) + */ #define _KERNPG_TABLE_NOENC (__PP|__RW| 0|___A| 0|___D| 0| 0) #define _KERNPG_TABLE (__PP|__RW| 0|___A| 0|___D| 0| 0| _ENC) #define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0) #define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC) -#define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX|___D| 0|___G) -#define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0|___D| 0|___G) + +#define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX| 0| 0|___G) +#define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0| 0| 0|___G) +#define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G) +#define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G) #define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC) -#define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX|___D| 0|___G) +#define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX| 0| 0|___G) #define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G) #define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW| 0|___A| 0|___D|_PSE|___G) #define __PAGE_KERNEL_WP (__PP|__RW| 0|___A|__NX|___D| 0|___G| __WP) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d46300e94f85..0086920cda06 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -28,6 +28,7 @@ struct vm86; #include <asm/unwind_hints.h> #include <asm/vmxfeatures.h> #include <asm/vdso/processor.h> +#include <asm/shstk.h> #include <linux/personality.h> #include <linux/cache.h> @@ -190,7 +191,6 @@ static inline unsigned long long l1tf_pfn_limit(void) } extern void early_cpu_init(void); -extern void identify_boot_cpu(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); @@ -475,6 +475,13 @@ struct thread_struct { */ u32 pkru; +#ifdef CONFIG_X86_USER_SHADOW_STACK + unsigned long features; + unsigned long features_locked; + + struct thread_shstk shstk; +#endif + /* Floating point and extended processor state */ struct fpu fpu; /* @@ -586,7 +593,6 @@ extern char ignore_fpu_irq; #define HAVE_ARCH_PICK_MMAP_LAYOUT 1 #define ARCH_HAS_PREFETCHW -#define ARCH_HAS_SPINLOCK_PREFETCH #ifdef CONFIG_X86_32 # define BASE_PREFETCH "" @@ -620,11 +626,6 @@ static __always_inline void prefetchw(const void *x) "m" (*(const char *)x)); } -static inline void spin_lock_prefetch(const void *x) -{ - prefetchw(x); -} - #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ TOP_OF_KERNEL_STACK_PADDING) @@ -682,9 +683,15 @@ extern u16 get_llc_id(unsigned int cpu); #ifdef CONFIG_CPU_SUP_AMD extern u32 amd_get_nodes_per_socket(void); extern u32 amd_get_highest_perf(void); +extern bool cpu_has_ibpb_brtype_microcode(void); +extern void amd_clear_divider(void); +extern void amd_check_microcode(void); #else static inline u32 amd_get_nodes_per_socket(void) { return 0; } static inline u32 amd_get_highest_perf(void) { return 0; } +static inline bool cpu_has_ibpb_brtype_microcode(void) { return false; } +static inline void amd_clear_divider(void) { } +static inline void amd_check_microcode(void) { } #endif extern unsigned long arch_align_stack(unsigned long sp); @@ -727,4 +734,6 @@ bool arch_is_platform_page(u64 paddr); #define arch_is_platform_page arch_is_platform_page #endif +extern bool gds_ucode_mitigated(void); + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index d87451df480b..cde8357bb226 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -74,8 +74,6 @@ static inline bool vcpu_is_preempted(long cpu) */ DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key); -void native_pv_lock_init(void) __init; - /* * Shortcut for the queued_spin_lock_slowpath() function that allows * virt to hijack it. @@ -103,10 +101,7 @@ static inline bool virt_spin_lock(struct qspinlock *lock) return true; } -#else -static inline void native_pv_lock_init(void) -{ -} + #endif /* CONFIG_PARAVIRT */ #include <asm-generic/qspinlock.h> diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 42b17cf10b10..85b6e3609cb9 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -4,6 +4,8 @@ #include <asm/ibt.h> +void __lockfunc __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked); + /* * For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit * registers. For i386, however, only 1 32-bit register needs to be saved diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 9177b4354c3f..6536873f8fc0 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,7 +25,14 @@ void __noreturn machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 +#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) +typedef void (cpu_emergency_virt_cb)(void); +void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); +void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_disable_virtualization(void); +#else +static inline void cpu_emergency_disable_virtualization(void) {} +#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 7fa611216417..4b081e0d3306 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -2,12 +2,7 @@ #ifndef _ASM_X86_RMWcc #define _ASM_X86_RMWcc -/* This counts to 12. Any more, it will return 13th argument. */ -#define __RMWcc_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n -#define RMWcc_ARGS(X...) __RMWcc_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) - -#define __RMWcc_CONCAT(a, b) a ## b -#define RMWcc_CONCAT(a, b) __RMWcc_CONCAT(a, b) +#include <linux/args.h> #define __CLOBBERS_MEM(clb...) "memory", ## clb @@ -48,7 +43,7 @@ cc_label: c = true; \ #define GEN_UNARY_RMWcc_3(op, var, cc) \ GEN_UNARY_RMWcc_4(op, var, cc, "%[var]") -#define GEN_UNARY_RMWcc(X...) RMWcc_CONCAT(GEN_UNARY_RMWcc_, RMWcc_ARGS(X))(X) +#define GEN_UNARY_RMWcc(X...) CONCATENATE(GEN_UNARY_RMWcc_, COUNT_ARGS(X))(X) #define GEN_BINARY_RMWcc_6(op, var, cc, vcon, _val, arg0) \ __GEN_RMWcc(op " %[val], " arg0, var, cc, \ @@ -57,7 +52,7 @@ cc_label: c = true; \ #define GEN_BINARY_RMWcc_5(op, var, cc, vcon, val) \ GEN_BINARY_RMWcc_6(op, var, cc, vcon, val, "%[var]") -#define GEN_BINARY_RMWcc(X...) RMWcc_CONCAT(GEN_BINARY_RMWcc_, RMWcc_ARGS(X))(X) +#define GEN_BINARY_RMWcc(X...) CONCATENATE(GEN_BINARY_RMWcc_, COUNT_ARGS(X))(X) #define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, cc, clobbers...) \ __GEN_RMWcc(op " %[var]\n\t" suffix, var, cc, \ diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index a6e8373a5170..3fa87e5e11ab 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -2,8 +2,6 @@ #ifndef _ASM_X86_SECTIONS_H #define _ASM_X86_SECTIONS_H -#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed - #include <asm-generic/sections.h> #include <asm/extable.h> @@ -18,20 +16,4 @@ extern char __end_of_kernel_reserve[]; extern unsigned long _brk_start, _brk_end; -static inline bool arch_is_kernel_initmem_freed(unsigned long addr) -{ - /* - * If _brk_start has not been cleared, brk allocation is incomplete, - * and we can not make assumptions about its use. - */ - if (_brk_start) - return 0; - - /* - * After brk allocation is complete, space between _brk_end and _end - * is available for allocation. - */ - return addr >= _brk_end && addr < (unsigned long)&_end; -} - #endif /* _ASM_X86_SECTIONS_H */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 794f69625780..9d6411c65920 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -56,7 +56,7 @@ #define GDT_ENTRY_INVALID_SEG 0 -#ifdef CONFIG_X86_32 +#if defined(CONFIG_X86_32) && !defined(BUILD_VDSO32_64) /* * The layout of the per-CPU GDT under Linux: * diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 66c806784c52..5b4a1ce3d368 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -164,6 +164,7 @@ static __always_inline void sev_es_nmi_complete(void) __sev_es_nmi_complete(); } extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd); +extern void sev_enable(struct boot_params *bp); static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { @@ -210,12 +211,15 @@ bool snp_init(struct boot_params *bp); void __init __noreturn snp_abort(void); int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio); void snp_accept_memory(phys_addr_t start, phys_addr_t end); +u64 snp_get_unsupported_features(u64 status); +u64 sev_get_status(void); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; } static inline void sev_es_nmi_complete(void) { } static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } +static inline void sev_enable(struct boot_params *bp) { } static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; } static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; } static inline void setup_ghcb(void) { } @@ -235,6 +239,8 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in } static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } +static inline u64 snp_get_unsupported_features(u64 status) { return 0; } +static inline u64 sev_get_status(void) { return 0; } #endif #endif diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h new file mode 100644 index 000000000000..42fee8959df7 --- /dev/null +++ b/arch/x86/include/asm/shstk.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SHSTK_H +#define _ASM_X86_SHSTK_H + +#ifndef __ASSEMBLY__ +#include <linux/types.h> + +struct task_struct; +struct ksignal; + +#ifdef CONFIG_X86_USER_SHADOW_STACK +struct thread_shstk { + u64 base; + u64 size; +}; + +long shstk_prctl(struct task_struct *task, int option, unsigned long arg2); +void reset_thread_features(void); +unsigned long shstk_alloc_thread_stack(struct task_struct *p, unsigned long clone_flags, + unsigned long stack_size); +void shstk_free(struct task_struct *p); +int setup_signal_shadow_stack(struct ksignal *ksig); +int restore_signal_shadow_stack(void); +#else +static inline long shstk_prctl(struct task_struct *task, int option, + unsigned long arg2) { return -EINVAL; } +static inline void reset_thread_features(void) {} +static inline unsigned long shstk_alloc_thread_stack(struct task_struct *p, + unsigned long clone_flags, + unsigned long stack_size) { return 0; } +static inline void shstk_free(struct task_struct *p) {} +static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; } +static inline int restore_signal_shadow_stack(void) { return 0; } +#endif /* CONFIG_X86_USER_SHADOW_STACK */ + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_SHSTK_H */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 600cf25dbfc6..ad98dd1d9cfb 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -22,10 +22,6 @@ DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid); -DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) -DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid); -#endif struct task_struct; @@ -132,11 +128,8 @@ void smp_kick_mwait_play_dead(void); void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); -void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle); bool smp_park_other_cpus_in_init(void); - -void smp_store_boot_cpu_info(void); void smp_store_cpu_info(int id); asmlinkage __visible void smp_reboot_interrupt(void); @@ -186,13 +179,6 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu) extern unsigned disabled_cpus; -#ifdef CONFIG_X86_LOCAL_APIC -extern int hard_smp_processor_id(void); - -#else /* CONFIG_X86_LOCAL_APIC */ -#define hard_smp_processor_id() 0 -#endif /* CONFIG_X86_LOCAL_APIC */ - #ifdef CONFIG_DEBUG_NMI_SELFTEST extern void nmi_selftest(void); #else diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index de48d1389936..d6cd9344f6c7 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -202,6 +202,19 @@ static inline void clwb(volatile void *__p) : [pax] "a" (p)); } +#ifdef CONFIG_X86_USER_SHADOW_STACK +static inline int write_user_shstk_64(u64 __user *addr, u64 val) +{ + asm_volatile_goto("1: wrussq %[val], (%[addr])\n" + _ASM_EXTABLE(1b, %l[fail]) + :: [addr] "r" (addr), [val] "r" (val) + :: fail); + return 0; +fail: + return -EFAULT; +} +#endif /* CONFIG_X86_USER_SHADOW_STACK */ + #define nop() asm volatile ("nop") static inline void serialize(void) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index e7c7379d6ac7..19bf955b67e0 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -288,6 +288,7 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_ #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) +#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) struct vmcb_seg { u16 selector; @@ -345,7 +346,7 @@ struct vmcb_save_area { u64 last_excp_from; u64 last_excp_to; u8 reserved_0x298[72]; - u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ + u64 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ } __packed; /* Save area definition for SEV-ES and SEV-SNP guests */ @@ -512,7 +513,7 @@ struct ghcb { } __packed; -#define EXPECTED_VMCB_SAVE_AREA_SIZE 740 +#define EXPECTED_VMCB_SAVE_AREA_SIZE 744 #define EXPECTED_GHCB_SAVE_AREA_SIZE 1032 #define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1648 #define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 80450e1d5385..25726893c6f4 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -3,6 +3,7 @@ #define _ASM_X86_TLBFLUSH_H #include <linux/mm_types.h> +#include <linux/mmu_notifier.h> #include <linux/sched.h> #include <asm/processor.h> @@ -253,6 +254,18 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false); } +static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) +{ + bool should_defer = false; + + /* If remote CPUs need to be flushed then defer batch the flush */ + if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) + should_defer = true; + put_cpu(); + + return should_defer; +} + static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) { /* @@ -264,11 +277,18 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) return atomic64_inc_return(&mm->context.tlb_gen); } -static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, - struct mm_struct *mm) +static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, + struct mm_struct *mm, + unsigned long uaddr) { inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); +} + +static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) +{ + flush_tlb_mm(mm); } extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); @@ -286,7 +306,8 @@ static inline bool pte_flags_need_flush(unsigned long oldflags, const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED; const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 | - _PAGE_SOFTW3 | _PAGE_SOFTW4; + _PAGE_SOFTW3 | _PAGE_SOFTW4 | + _PAGE_SAVED_DIRTY; const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT | _PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 | diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index caf41c4869a0..3235ba1e5b06 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -136,10 +136,11 @@ static inline int topology_max_smt_threads(void) return __max_smt_threads; } +#include <linux/cpu_smt.h> + int topology_update_package_map(unsigned int apicid, unsigned int cpu); int topology_update_die_map(unsigned int dieid, unsigned int cpu); int topology_phys_to_logical_pkg(unsigned int pkg); -bool topology_smt_supported(void); extern struct cpumask __cpu_primary_thread_mask; #define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask) @@ -162,7 +163,6 @@ static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } static inline int topology_max_die_per_package(void) { return 1; } static inline int topology_max_smt_threads(void) { return 1; } static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } -static inline bool topology_smt_supported(void) { return false; } #endif /* !CONFIG_SMP */ static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h index 10b1de500ab1..afa524325e55 100644 --- a/arch/x86/include/asm/trap_pf.h +++ b/arch/x86/include/asm/trap_pf.h @@ -11,6 +11,7 @@ * bit 3 == 1: use of reserved bit detected * bit 4 == 1: fault was an instruction fetch * bit 5 == 1: protection keys block access + * bit 6 == 1: shadow stack access fault * bit 15 == 1: SGX MMU page-fault */ enum x86_pf_error_code { @@ -20,6 +21,7 @@ enum x86_pf_error_code { X86_PF_RSVD = 1 << 3, X86_PF_INSTR = 1 << 4, X86_PF_PK = 1 << 5, + X86_PF_SHSTK = 1 << 6, X86_PF_SGX = 1 << 15, }; diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 47ecfff2c83d..b1c9cea6ba88 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -18,7 +18,8 @@ void __init trap_init(void); asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs); #endif -extern bool ibt_selftest(void); +extern int ibt_selftest(void); +extern int ibt_selftest_noendbr(void); #ifdef CONFIG_X86_F00F_BUG /* For handling the FOOF bug */ @@ -47,4 +48,16 @@ void __noreturn handle_stack_overflow(struct pt_regs *regs, struct stack_info *info); #endif +static inline void cond_local_irq_enable(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); +} + +static inline void cond_local_irq_disable(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_disable(); +} + #endif /* _ASM_X86_TRAPS_H */ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 81b826d3b753..f2c02e4469cc 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -116,7 +116,7 @@ copy_user_generic(void *to, const void *from, unsigned long len) "2:\n" _ASM_EXTABLE_UA(1b, 2b) :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT - : : "memory", "rax", "r8", "r9", "r10", "r11"); + : : "memory", "rax"); clac(); return len; } diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 1b6455f881f9..6989b824fd32 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -10,6 +10,7 @@ * Copyright (c) Russ Anderson <rja@sgi.com> */ +#include <linux/efi.h> #include <linux/rtc.h> /* @@ -115,7 +116,8 @@ struct uv_arch_type_entry { struct uv_systab { char signature[4]; /* must be UV_SYSTAB_SIG */ u32 revision; /* distinguish different firmware revs */ - u64 function; /* BIOS runtime callback function ptr */ + u64 (__efiapi *function)(enum uv_bios_cmd, ...); + /* BIOS runtime callback function ptr */ u32 size; /* systab size (starting with _VERSION_UV4) */ struct { u32 type:8; /* type of entry */ diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h deleted file mode 100644 index 3b12e6b99412..000000000000 --- a/arch/x86/include/asm/virtext.h +++ /dev/null @@ -1,154 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* CPU virtualization extensions handling - * - * This should carry the code for handling CPU virtualization extensions - * that needs to live in the kernel core. - * - * Author: Eduardo Habkost <ehabkost@redhat.com> - * - * Copyright (C) 2008, Red Hat Inc. - * - * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc. - */ -#ifndef _ASM_X86_VIRTEX_H -#define _ASM_X86_VIRTEX_H - -#include <asm/processor.h> - -#include <asm/vmx.h> -#include <asm/svm.h> -#include <asm/tlbflush.h> - -/* - * VMX functions: - */ - -static inline int cpu_has_vmx(void) -{ - unsigned long ecx = cpuid_ecx(1); - return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ -} - - -/** - * cpu_vmxoff() - Disable VMX on the current CPU - * - * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) - * - * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to - * atomically track post-VMXON state, e.g. this may be called in NMI context. - * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. - * faults are guaranteed to be due to the !post-VMXON check unless the CPU is - * magically in RM, VM86, compat mode, or at CPL>0. - */ -static inline int cpu_vmxoff(void) -{ - asm_volatile_goto("1: vmxoff\n\t" - _ASM_EXTABLE(1b, %l[fault]) - ::: "cc", "memory" : fault); - - cr4_clear_bits(X86_CR4_VMXE); - return 0; - -fault: - cr4_clear_bits(X86_CR4_VMXE); - return -EIO; -} - -static inline int cpu_vmx_enabled(void) -{ - return __read_cr4() & X86_CR4_VMXE; -} - -/** Disable VMX if it is enabled on the current CPU - * - * You shouldn't call this if cpu_has_vmx() returns 0. - */ -static inline void __cpu_emergency_vmxoff(void) -{ - if (cpu_vmx_enabled()) - cpu_vmxoff(); -} - -/** Disable VMX if it is supported and enabled on the current CPU - */ -static inline void cpu_emergency_vmxoff(void) -{ - if (cpu_has_vmx()) - __cpu_emergency_vmxoff(); -} - - - - -/* - * SVM functions: - */ - -/** Check if the CPU has SVM support - * - * You can use the 'msg' arg to get a message describing the problem, - * if the function returns zero. Simply pass NULL if you are not interested - * on the messages; gcc should take care of not generating code for - * the messages on this case. - */ -static inline int cpu_has_svm(const char **msg) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) { - if (msg) - *msg = "not amd or hygon"; - return 0; - } - - if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) { - if (msg) - *msg = "can't execute cpuid_8000000a"; - return 0; - } - - if (!boot_cpu_has(X86_FEATURE_SVM)) { - if (msg) - *msg = "svm not available"; - return 0; - } - return 1; -} - - -/** Disable SVM on the current CPU - * - * You should call this only if cpu_has_svm() returned true. - */ -static inline void cpu_svm_disable(void) -{ - uint64_t efer; - - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); - if (efer & EFER_SVME) { - /* - * Force GIF=1 prior to disabling SVM to ensure INIT and NMI - * aren't blocked, e.g. if a fatal error occurred between CLGI - * and STGI. Note, STGI may #UD if SVM is disabled from NMI - * context between reading EFER and executing STGI. In that - * case, GIF must already be set, otherwise the NMI would have - * been blocked, so just eat the fault. - */ - asm_volatile_goto("1: stgi\n\t" - _ASM_EXTABLE(1b, %l[fault]) - ::: "memory" : fault); -fault: - wrmsrl(MSR_EFER, efer & ~EFER_SVME); - } -} - -/** Makes sure SVM is disabled, if it is supported on the CPU - */ -static inline void cpu_emergency_svm_disable(void) -{ - if (cpu_has_svm(NULL)) - cpu_svm_disable(); -} - -#endif /* _ASM_X86_VIRTEX_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 0d02c4aafa6f..0e73616b82f3 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -71,7 +71,7 @@ #define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING) #define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING) #define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX) -#define SECONDARY_EXEC_XSAVES VMCS_CONTROL_BIT(XSAVES) +#define SECONDARY_EXEC_ENABLE_XSAVES VMCS_CONTROL_BIT(XSAVES) #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC) #define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA) #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index fa9ec20783fa..85e63d58c074 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -295,7 +295,10 @@ static inline unsigned long bfn_to_local_pfn(unsigned long mfn) /* VIRT <-> MACHINE conversion */ #define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v)))) -#define virt_to_pfn(v) (PFN_DOWN(__pa(v))) +static inline unsigned long virt_to_pfn(const void *v) +{ + return PFN_DOWN(__pa(v)); +} #define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h index 77a2d19cc990..abde0f44df57 100644 --- a/arch/x86/include/asm/xen/swiotlb-xen.h +++ b/arch/x86/include/asm/xen/swiotlb-xen.h @@ -2,12 +2,6 @@ #ifndef _ASM_X86_SWIOTLB_XEN_H #define _ASM_X86_SWIOTLB_XEN_H -#ifdef CONFIG_SWIOTLB_XEN -extern int pci_xen_swiotlb_init_late(void); -#else -static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; } -#endif - int xen_swiotlb_fixup(void *buf, unsigned long nslabs); int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, unsigned int address_bits, diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index 775dbd3aff73..46cdc941f958 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -3,14 +3,10 @@ #define _ASM_X86_MMAN_H #define MAP_32BIT 0x40 /* only give out 32bit addresses */ +#define MAP_ABOVE4G 0x80 /* only map above 4GB */ -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -#define arch_calc_vm_prot_bits(prot, key) ( \ - ((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \ - ((key) & 0x2 ? VM_PKEY_BIT1 : 0) | \ - ((key) & 0x4 ? VM_PKEY_BIT2 : 0) | \ - ((key) & 0x8 ? VM_PKEY_BIT3 : 0)) -#endif +/* Flags for map_shadow_stack(2) */ +#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */ #include <asm-generic/mman.h> diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index e8d7ebbca1a4..384e2cc6ac19 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -23,9 +23,21 @@ #define ARCH_MAP_VDSO_32 0x2002 #define ARCH_MAP_VDSO_64 0x2003 +/* Don't use 0x3001-0x3004 because of old glibcs */ + #define ARCH_GET_UNTAG_MASK 0x4001 #define ARCH_ENABLE_TAGGED_ADDR 0x4002 #define ARCH_GET_MAX_TAG_BITS 0x4003 #define ARCH_FORCE_TAGGED_SVA 0x4004 +#define ARCH_SHSTK_ENABLE 0x5001 +#define ARCH_SHSTK_DISABLE 0x5002 +#define ARCH_SHSTK_LOCK 0x5003 +#define ARCH_SHSTK_UNLOCK 0x5004 +#define ARCH_SHSTK_STATUS 0x5005 + +/* ARCH_SHSTK_ features bits */ +#define ARCH_SHSTK_SHSTK (1ULL << 0) +#define ARCH_SHSTK_WRSS (1ULL << 1) + #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4070a01c11b7..3269a0e23d3a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -33,11 +33,10 @@ KCSAN_SANITIZE := n KMSAN_SANITIZE_head$(BITS).o := n KMSAN_SANITIZE_nmi.o := n -# If instrumentation of this dir is enabled, boot hangs during first second. -# Probably could be more selective here, but note that files related to irqs, -# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to -# non-deterministic coverage. -KCOV_INSTRUMENT := n +# If instrumentation of the following files is enabled, boot hangs during +# first second. +KCOV_INSTRUMENT_head$(BITS).o := n +KCOV_INSTRUMENT_sev.o := n CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace @@ -49,6 +48,7 @@ obj-y += process_$(BITS).o signal.o signal_$(BITS).o obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o +obj-$(CONFIG_X86_KERNEL_IBT) += ibt_selftest.o obj-y += setup.o x86_init.o i8259.o irqinit.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o @@ -145,6 +145,10 @@ obj-$(CONFIG_CFI_CLANG) += cfi.o obj-$(CONFIG_CALL_THUNKS) += callthunks.o +obj-$(CONFIG_X86_CET) += cet.o + +obj-$(CONFIG_X86_USER_SHADOW_STACK) += shstk.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 21b542a6866c..2a0ea38955df 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -52,6 +52,7 @@ int acpi_lapic; int acpi_ioapic; int acpi_strict; int acpi_disable_cmcff; +bool acpi_int_src_ovr[NR_IRQS_LEGACY]; /* ACPI SCI override configuration */ u8 acpi_sci_flags __initdata; @@ -169,7 +170,6 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) */ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) { - unsigned int ver = 0; int cpu; if (id >= MAX_LOCAL_APIC) { @@ -182,10 +182,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) return -EINVAL; } - if (boot_cpu_physical_apicid != -1U) - ver = boot_cpu_apic_version; - - cpu = generic_processor_info(id, ver); + cpu = generic_processor_info(id); if (cpu >= 0) early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; @@ -239,7 +236,7 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) * to not preallocating memory for all NR_CPUS * when we use CPU hotplug. */ - if (!apic->apic_id_valid(apic_id)) { + if (!apic_id_valid(apic_id)) { if (enabled) pr_warn("x2apic entry ignored\n"); return 0; @@ -588,6 +585,9 @@ acpi_parse_int_src_ovr(union acpi_subtable_headers * header, acpi_table_print_madt_entry(&header->common); + if (intsrc->source_irq < NR_IRQS_LEGACY) + acpi_int_src_ovr[intsrc->source_irq] = true; + if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) { acpi_sci_ioapic_setup(intsrc->source_irq, intsrc->inti_flags & ACPI_MADT_POLARITY_MASK, @@ -1178,7 +1178,7 @@ static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, acpi_mp_wake_mailbox_paddr = mp_wake->base_address; - acpi_wake_cpu_handler_update(acpi_wakeup_cpu); + apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu); return 0; } @@ -1275,7 +1275,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) /* * if "noapic" boot option, don't look for IO-APICs */ - if (skip_ioapic_setup) { + if (ioapic_is_disabled) { pr_info("Skipping IOAPIC probe due to 'noapic' option.\n"); return -ENODEV; } diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 2dcf3a06af09..a5ead6a6d233 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -687,10 +687,6 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) #ifdef CONFIG_RETHUNK -#ifdef CONFIG_CALL_THUNKS -void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk; -#endif - /* * Rewrite the compiler generated return thunk tail-calls. * @@ -1531,6 +1527,7 @@ static noinline void __init int3_selftest(void) static __initdata int __alt_reloc_selftest_addr; +extern void __init __alt_reloc_selftest(void *arg); __visible noinline void __init __alt_reloc_selftest(void *arg) { WARN_ON(arg != &__alt_reloc_selftest_addr); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 035a3db5330b..356de955e78d 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -24,6 +24,8 @@ #define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5 #define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8 #define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8 +#define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a +#define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507 #define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 @@ -39,6 +41,7 @@ #define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4 #define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 #define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc +#define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4 #define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4 /* Protect the PCI config register pairs used for SMN. */ @@ -56,6 +59,8 @@ static const struct pci_device_id amd_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) }, {} }; @@ -85,6 +90,8 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) }, {} }; @@ -106,6 +113,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) }, {} }; diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index a6fcaf16cdbf..2ee867d796d9 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -7,7 +7,7 @@ # In particualr, smp_apic_timer_interrupt() is called in random places. KCOV_INSTRUMENT := n -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o init.o obj-y += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index af49e24b46a4..760adac3d1a8 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -63,6 +63,8 @@ #include <asm/irq_regs.h> #include <asm/cpu.h> +#include "local.h" + unsigned int num_processors; unsigned disabled_cpus; @@ -74,11 +76,6 @@ EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); u8 boot_cpu_apic_version __ro_after_init; /* - * The highest APIC ID seen during enumeration. - */ -static unsigned int max_physical_apicid; - -/* * Bitmask of physically existing CPUs: */ physid_mask_t phys_cpu_present_map; @@ -104,26 +101,20 @@ static bool virt_ext_dest_id __ro_after_init; /* For parallel bootup. */ unsigned long apic_mmio_base __ro_after_init; +static inline bool apic_accessible(void) +{ + return x2apic_mode || apic_mmio_base; +} + /* * Map cpu index to physical APIC ID */ DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); -DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID); DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); #ifdef CONFIG_X86_32 - -/* - * On x86_32, the mapping between cpu and logical apicid may vary - * depending on apic in use. The following early percpu variable is - * used for the mapping. This is where the behaviors of x86_64 and 32 - * actually diverge. Let's keep it ugly for now. - */ -DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID); - /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase __ro_after_init; @@ -179,8 +170,8 @@ static __init int setup_apicpmtimer(char *s) __setup("apicpmtimer", setup_apicpmtimer); #endif -unsigned long mp_lapic_addr __ro_after_init; -int disable_apic __ro_after_init; +static unsigned long mp_lapic_addr __ro_after_init; +bool apic_is_disabled __ro_after_init; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ static int disable_apic_timer __initdata; /* Local APIC timer works in C2 */ @@ -206,8 +197,6 @@ unsigned int lapic_timer_period = 0; static void apic_pm_activate(void); -static unsigned long apic_phys __ro_after_init; - /* * Get the LAPIC version */ @@ -247,31 +236,7 @@ static int modern_apic(void) */ static void __init apic_disable(void) { - pr_info("APIC: switched to apic NOOP\n"); - apic = &apic_noop; -} - -void native_apic_wait_icr_idle(void) -{ - while (apic_read(APIC_ICR) & APIC_ICR_BUSY) - cpu_relax(); -} - -u32 native_safe_apic_wait_icr_idle(void) -{ - u32 send_status; - int timeout; - - timeout = 0; - do { - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - if (!send_status) - break; - inc_irq_stat(icr_read_retry_count); - udelay(100); - } while (timeout++ < 1000); - - return send_status; + apic_install_driver(&apic_noop); } void native_apic_icr_write(u32 low, u32 id) @@ -537,7 +502,7 @@ static int lapic_timer_set_oneshot(struct clock_event_device *evt) static void lapic_timer_broadcast(const struct cpumask *mask) { #ifdef CONFIG_SMP - apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR); + __apic_send_IPI_mask(mask, LOCAL_TIMER_VECTOR); #endif } @@ -810,7 +775,7 @@ bool __init apic_needs_pit(void) return true; /* Is there an APIC at all or is it disabled? */ - if (!boot_cpu_has(X86_FEATURE_APIC) || disable_apic) + if (!boot_cpu_has(X86_FEATURE_APIC) || apic_is_disabled) return true; /* @@ -1110,7 +1075,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt) { struct pt_regs *old_regs = set_irq_regs(regs); - ack_APIC_irq(); + apic_eoi(); trace_local_timer_entry(LOCAL_TIMER_VECTOR); local_apic_timer_interrupt(); trace_local_timer_exit(LOCAL_TIMER_VECTOR); @@ -1134,8 +1099,7 @@ void clear_local_APIC(void) int maxlvt; u32 v; - /* APIC hasn't been mapped yet */ - if (!x2apic_mode && !apic_phys) + if (!apic_accessible()) return; maxlvt = lapic_get_maxlvt(); @@ -1225,8 +1189,7 @@ void apic_soft_disable(void) */ void disable_local_APIC(void) { - /* APIC hasn't been mapped yet */ - if (!x2apic_mode && !apic_phys) + if (!apic_accessible()) return; apic_soft_disable(); @@ -1299,7 +1262,7 @@ enum apic_intr_mode_id apic_intr_mode __ro_after_init; static int __init __apic_intr_mode_select(void) { /* Check kernel option */ - if (disable_apic) { + if (apic_is_disabled) { pr_info("APIC disabled via kernel command line\n"); return APIC_PIC; } @@ -1308,7 +1271,7 @@ static int __init __apic_intr_mode_select(void) #ifdef CONFIG_X86_64 /* On 64-bit, the APIC must be integrated, Check local APIC only */ if (!boot_cpu_has(X86_FEATURE_APIC)) { - disable_apic = 1; + apic_is_disabled = true; pr_info("APIC disabled by BIOS\n"); return APIC_PIC; } @@ -1317,16 +1280,15 @@ static int __init __apic_intr_mode_select(void) /* Neither 82489DX nor integrated APIC ? */ if (!boot_cpu_has(X86_FEATURE_APIC) && !smp_found_config) { - disable_apic = 1; + apic_is_disabled = true; return APIC_PIC; } /* If the BIOS pretends there is an integrated APIC ? */ if (!boot_cpu_has(X86_FEATURE_APIC) && APIC_INTEGRATED(boot_cpu_apic_version)) { - disable_apic = 1; - pr_err(FW_BUG "Local APIC %d not detected, force emulation\n", - boot_cpu_physical_apicid); + apic_is_disabled = true; + pr_err(FW_BUG "Local APIC not detected, force emulation\n"); return APIC_PIC; } #endif @@ -1347,12 +1309,6 @@ static int __init __apic_intr_mode_select(void) pr_info("APIC: SMP mode deactivated\n"); return APIC_SYMMETRIC_IO_NO_ROUTING; } - - if (read_apic_id() != boot_cpu_physical_apicid) { - panic("Boot APIC ID in local APIC unexpected (%d vs %d)", - read_apic_id(), boot_cpu_physical_apicid); - /* Or can we switch back to PIC here? */ - } #endif return APIC_SYMMETRIC_IO; @@ -1439,7 +1395,9 @@ void __init apic_intr_mode_init(void) break; } - default_setup_apic_routing(); + x86_64_probe_apic(); + + x86_32_install_bigsmp(); if (x86_platform.apic_post_init) x86_platform.apic_post_init(); @@ -1521,7 +1479,7 @@ static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) * per set bit. */ for_each_set_bit(bit, isr->map, APIC_IR_BITS) - ack_APIC_irq(); + apic_eoi(); return true; } @@ -1533,7 +1491,7 @@ static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) * interrupt from previous kernel might still have ISR bit set. * * Most probably by now the CPU has serviced that pending interrupt and it - * might not have done the ack_APIC_irq() because it thought, interrupt + * might not have done the apic_eoi() because it thought, interrupt * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear * the ISR bit and cpu thinks it has already serviced the interrupt. Hence * a vector might get locked. It was noticed for timer irq (vector @@ -1567,7 +1525,7 @@ static void setup_local_APIC(void) int cpu = smp_processor_id(); unsigned int value; - if (disable_apic) { + if (apic_is_disabled) { disable_ioapic_support(); return; } @@ -1589,36 +1547,18 @@ static void setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - /* - * Double-check whether this APIC is really registered. - * This is meaningless in clustered apic mode, so we skip it. - */ - BUG_ON(!apic->apic_id_registered()); + /* Validate that the APIC is registered if required */ + BUG_ON(apic->apic_id_registered && !apic->apic_id_registered()); /* * Intel recommends to set DFR, LDR and TPR before enabling * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... + * document number 292116). + * + * Except for APICs which operate in physical destination mode. */ - apic->init_apic_ldr(); - -#ifdef CONFIG_X86_32 - if (apic->dest_mode_logical) { - int logical_apicid, ldr_apicid; - - /* - * APIC LDR is initialized. If logical_apicid mapping was - * initialized during get_smp_config(), make sure it matches - * the actual value. - */ - logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); - if (logical_apicid != BAD_APICID) - WARN_ON(logical_apicid != ldr_apicid); - /* Always use the value from LDR. */ - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; - } -#endif + if (apic->init_apic_ldr) + apic->init_apic_ldr(); /* * Set Task Priority to 'accept all except vectors 0-31'. An APIC @@ -1691,7 +1631,7 @@ static void setup_local_APIC(void) * TODO: set up through-local-APIC from through-I/O-APIC? --macro */ value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; - if (!cpu && (pic_mode || !value || skip_ioapic_setup)) { + if (!cpu && (pic_mode || !value || ioapic_is_disabled)) { value = APIC_DM_EXTINT; apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu); } else { @@ -1748,6 +1688,25 @@ void apic_ap_setup(void) end_local_APIC_setup(); } +static __init void cpu_set_boot_apic(void); + +static __init void apic_read_boot_cpu_id(bool x2apic) +{ + /* + * This can be invoked from check_x2apic() before the APIC has been + * selected. But that code knows for sure that the BIOS enabled + * X2APIC. + */ + if (x2apic) { + boot_cpu_physical_apicid = native_apic_msr_read(APIC_ID); + boot_cpu_apic_version = GET_APIC_VERSION(native_apic_msr_read(APIC_LVR)); + } else { + boot_cpu_physical_apicid = read_apic_id(); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); + } + cpu_set_boot_apic(); +} + #ifdef CONFIG_X86_X2APIC int x2apic_mode; EXPORT_SYMBOL_GPL(x2apic_mode); @@ -1847,6 +1806,8 @@ void x2apic_setup(void) __x2apic_enable(); } +static __init void apic_set_fixmap(void); + static __init void x2apic_disable(void) { u32 x2apic_id, state = x2apic_state; @@ -1867,7 +1828,7 @@ static __init void x2apic_disable(void) } __x2apic_disable(); - register_lapic_address(mp_lapic_addr); + apic_set_fixmap(); } static __init void x2apic_enable(void) @@ -1928,6 +1889,7 @@ void __init check_x2apic(void) x2apic_state = X2APIC_ON_LOCKED; else x2apic_state = X2APIC_ON; + apic_read_boot_cpu_id(true); } else if (!boot_cpu_has(X86_FEATURE_X2APIC)) { x2apic_state = X2APIC_DISABLED; } @@ -1943,7 +1905,7 @@ void __init check_x2apic(void) pr_err("Kernel does not support x2APIC, please recompile with CONFIG_X86_X2APIC.\n"); pr_err("Disabling APIC, expect reduced performance and functionality.\n"); - disable_apic = 1; + apic_is_disabled = true; setup_clear_cpu_cap(X86_FEATURE_APIC); } @@ -1956,7 +1918,7 @@ void __init enable_IR_x2apic(void) unsigned long flags; int ret, ir_stat; - if (skip_ioapic_setup) { + if (ioapic_is_disabled) { pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n"); return; } @@ -1994,19 +1956,19 @@ void __init enable_IR_x2apic(void) * On AMD64 we trust the BIOS - if it says no APIC it is likely * not correctly set up (usually the APIC timer won't work etc.) */ -static int __init detect_init_APIC(void) +static bool __init detect_init_APIC(void) { if (!boot_cpu_has(X86_FEATURE_APIC)) { pr_info("No local APIC present\n"); - return -1; + return false; } - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - return 0; + register_lapic_address(APIC_DEFAULT_PHYS_BASE); + return true; } #else -static int __init apic_verify(void) +static bool __init apic_verify(unsigned long addr) { u32 features, h, l; @@ -2017,28 +1979,28 @@ static int __init apic_verify(void) features = cpuid_edx(1); if (!(features & (1 << X86_FEATURE_APIC))) { pr_warn("Could not enable APIC!\n"); - return -1; + return false; } set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; /* The BIOS may have set up the APIC at some other address */ if (boot_cpu_data.x86 >= 6) { rdmsr(MSR_IA32_APICBASE, l, h); if (l & MSR_IA32_APICBASE_ENABLE) - mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + addr = l & MSR_IA32_APICBASE_BASE; } + register_lapic_address(addr); pr_info("Found and enabled local APIC!\n"); - return 0; + return true; } -int __init apic_force_enable(unsigned long addr) +bool __init apic_force_enable(unsigned long addr) { u32 h, l; - if (disable_apic) - return -1; + if (apic_is_disabled) + return false; /* * Some BIOSes disable the local APIC in the APIC_BASE @@ -2055,17 +2017,17 @@ int __init apic_force_enable(unsigned long addr) enabled_via_apicbase = 1; } } - return apic_verify(); + return apic_verify(addr); } /* * Detect and initialize APIC */ -static int __init detect_init_APIC(void) +static bool __init detect_init_APIC(void) { /* Disabled by kernel option? */ - if (disable_apic) - return -1; + if (apic_is_disabled) + return false; switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: @@ -2092,22 +2054,22 @@ static int __init detect_init_APIC(void) if (!force_enable_local_apic) { pr_info("Local APIC disabled by BIOS -- " "you can enable it with \"lapic\"\n"); - return -1; + return false; } - if (apic_force_enable(APIC_DEFAULT_PHYS_BASE)) - return -1; + if (!apic_force_enable(APIC_DEFAULT_PHYS_BASE)) + return false; } else { - if (apic_verify()) - return -1; + if (!apic_verify(APIC_DEFAULT_PHYS_BASE)) + return false; } apic_pm_activate(); - return 0; + return true; no_apic: pr_info("No local APIC present or hardware disabled\n"); - return -1; + return false; } #endif @@ -2116,64 +2078,38 @@ no_apic: */ void __init init_apic_mappings(void) { - unsigned int new_apicid; - if (apic_validate_deadline_timer()) pr_info("TSC deadline timer available\n"); - if (x2apic_mode) { - boot_cpu_physical_apicid = read_apic_id(); + if (x2apic_mode) return; - } - /* If no local APIC can be found return early */ - if (!smp_found_config && detect_init_APIC()) { - /* lets NOP'ify apic operations */ - pr_info("APIC: disable apic facility\n"); - apic_disable(); - } else { - apic_phys = mp_lapic_addr; - - /* - * If the system has ACPI MADT tables or MP info, the LAPIC - * address is already registered. - */ - if (!acpi_lapic && !smp_found_config) - register_lapic_address(apic_phys); + if (!smp_found_config) { + if (!detect_init_APIC()) { + pr_info("APIC: disable apic facility\n"); + apic_disable(); + } + num_processors = 1; } +} - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - new_apicid = read_apic_id(); - if (boot_cpu_physical_apicid != new_apicid) { - boot_cpu_physical_apicid = new_apicid; - /* - * yeah -- we lie about apic_version - * in case if apic was disabled via boot option - * but it's not a problem for SMP compiled kernel - * since apic_intr_mode_select is prepared for such - * a case and disable smp mode - */ - boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); - } +static __init void apic_set_fixmap(void) +{ + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + apic_mmio_base = APIC_BASE; + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + apic_mmio_base, mp_lapic_addr); + apic_read_boot_cpu_id(false); } void __init register_lapic_address(unsigned long address) { + /* This should only happen once */ + WARN_ON_ONCE(mp_lapic_addr); mp_lapic_addr = address; - if (!x2apic_mode) { - set_fixmap_nocache(FIX_APIC_BASE, address); - apic_mmio_base = APIC_BASE; - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, address); - } - if (boot_cpu_physical_apicid == -1U) { - boot_cpu_physical_apicid = read_apic_id(); - boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); - } + if (!x2apic_mode) + apic_set_fixmap(); } /* @@ -2210,7 +2146,7 @@ static noinline void handle_spurious_interrupt(u8 vector) if (v & (1 << (vector & 0x1f))) { pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n", vector, smp_processor_id()); - ack_APIC_irq(); + apic_eoi(); } else { pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n", vector, smp_processor_id()); @@ -2261,7 +2197,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt) if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); v = apic_read(APIC_ESR); - ack_APIC_irq(); + apic_eoi(); atomic_inc(&irq_err_count); apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x", @@ -2446,54 +2382,43 @@ static int allocate_logical_cpuid(int apicid) return nr_logical_cpuids++; } -int generic_processor_info(int apicid, int version) +static void cpu_update_apic(int cpu, int apicid) { - int cpu, max = nr_cpu_ids; - bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, - phys_cpu_present_map); +#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) + early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; +#endif + set_cpu_possible(cpu, true); + physid_set(apicid, phys_cpu_present_map); + set_cpu_present(cpu, true); + num_processors++; - /* - * boot_cpu_physical_apicid is designed to have the apicid - * returned by read_apic_id(), i.e, the apicid of the - * currently booting-up processor. However, on some platforms, - * it is temporarily modified by the apicid reported as BSP - * through MP table. Concretely: - * - * - arch/x86/kernel/mpparse.c: MP_processor_info() - * - arch/x86/mm/amdtopology.c: amd_numa_init() - * - * This function is executed with the modified - * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel - * parameter doesn't work to disable APs on kdump 2nd kernel. - * - * Since fixing handling of boot_cpu_physical_apicid requires - * another discussion and tests on each platform, we leave it - * for now and here we use read_apic_id() directly in this - * function, generic_processor_info(). - */ - if (disabled_cpu_apicid != BAD_APICID && - disabled_cpu_apicid != read_apic_id() && - disabled_cpu_apicid == apicid) { - int thiscpu = num_processors + disabled_cpus; + if (system_state != SYSTEM_BOOTING) + cpu_mark_primary_thread(cpu, apicid); +} - pr_warn("APIC: Disabling requested cpu." - " Processor %d/0x%x ignored.\n", thiscpu, apicid); +static __init void cpu_set_boot_apic(void) +{ + cpuid_to_apicid[0] = boot_cpu_physical_apicid; + cpu_update_apic(0, boot_cpu_physical_apicid); + x86_32_probe_bigsmp_early(); +} - disabled_cpus++; - return -ENODEV; - } +int generic_processor_info(int apicid) +{ + int cpu, max = nr_cpu_ids; - /* - * If boot cpu has not been detected yet, then only allow upto - * nr_cpu_ids - 1 processors and keep one slot free for boot cpu - */ - if (!boot_cpu_detected && num_processors >= nr_cpu_ids - 1 && - apicid != boot_cpu_physical_apicid) { - int thiscpu = max + disabled_cpus - 1; + /* The boot CPU must be set before MADT/MPTABLE parsing happens */ + if (cpuid_to_apicid[0] == BAD_APICID) + panic("Boot CPU APIC not registered yet\n"); - pr_warn("APIC: NR_CPUS/possible_cpus limit of %i almost" - " reached. Keeping one slot for boot cpu." - " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); + if (apicid == boot_cpu_physical_apicid) + return 0; + + if (disabled_cpu_apicid == apicid) { + int thiscpu = num_processors + disabled_cpus; + + pr_warn("APIC: Disabling requested cpu. Processor %d/0x%x ignored.\n", + thiscpu, apicid); disabled_cpus++; return -ENODEV; @@ -2509,66 +2434,16 @@ int generic_processor_info(int apicid, int version) return -EINVAL; } - if (apicid == boot_cpu_physical_apicid) { - /* - * x86_bios_cpu_apicid is required to have processors listed - * in same order as logical cpu numbers. Hence the first - * entry is BSP, and so on. - * boot_cpu_init() already hold bit 0 in cpu_present_mask - * for BSP. - */ - cpu = 0; - - /* Logical cpuid 0 is reserved for BSP. */ - cpuid_to_apicid[0] = apicid; - } else { - cpu = allocate_logical_cpuid(apicid); - if (cpu < 0) { - disabled_cpus++; - return -EINVAL; - } - } - - /* - * Validate version - */ - if (version == 0x0) { - pr_warn("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", - cpu, apicid); - version = 0x10; - } - - if (version != boot_cpu_apic_version) { - pr_warn("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", - boot_cpu_apic_version, cpu, version); + cpu = allocate_logical_cpuid(apicid); + if (cpu < 0) { + disabled_cpus++; + return -EINVAL; } - if (apicid > max_physical_apicid) - max_physical_apicid = apicid; - -#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) - early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; - early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; -#endif -#ifdef CONFIG_X86_32 - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = - apic->x86_32_early_logical_apicid(cpu); -#endif - set_cpu_possible(cpu, true); - physid_set(apicid, phys_cpu_present_map); - set_cpu_present(cpu, true); - num_processors++; - - if (system_state != SYSTEM_BOOTING) - cpu_mark_primary_thread(cpu, apicid); - + cpu_update_apic(cpu, apicid); return cpu; } -int hard_smp_processor_id(void) -{ - return read_apic_id(); -} void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, bool dmar) @@ -2610,47 +2485,10 @@ u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid) } EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); -#ifdef CONFIG_X86_64 -void __init acpi_wake_cpu_handler_update(wakeup_cpu_handler handler) -{ - struct apic **drv; - - for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) - (*drv)->wakeup_secondary_cpu_64 = handler; -} -#endif - -/* - * Override the generic EOI implementation with an optimized version. - * Only called during early boot when only one CPU is active and with - * interrupts disabled, so we know this does not race with actual APIC driver - * use. - */ -void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) -{ - struct apic **drv; - - for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { - /* Should happen once for each apic */ - WARN_ON((*drv)->eoi_write == eoi_write); - (*drv)->native_eoi_write = (*drv)->eoi_write; - (*drv)->eoi_write = eoi_write; - } -} - static void __init apic_bsp_up_setup(void) { #ifdef CONFIG_X86_64 apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid)); -#else - /* - * Hack: In case of kdump, after a crash, kernel might be booting - * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid - * might be zero if read from MP tables. Get it from LAPIC. - */ -# ifdef CONFIG_CRASH_DUMP - boot_cpu_physical_apicid = read_apic_id(); -# endif #endif physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); } @@ -2919,7 +2757,7 @@ int apic_is_clustered_box(void) */ static int __init setup_disableapic(char *arg) { - disable_apic = 1; + apic_is_disabled = true; setup_clear_cpu_cap(X86_FEATURE_APIC); return 0; } @@ -2956,11 +2794,11 @@ early_param("nolapic_timer", parse_nolapic_timer); static int __init apic_set_verbosity(char *arg) { if (!arg) { -#ifdef CONFIG_X86_64 - skip_ioapic_setup = 0; + if (IS_ENABLED(CONFIG_X86_32)) + return -EINVAL; + + ioapic_is_disabled = false; return 0; -#endif - return -EINVAL; } if (strcmp("debug", arg) == 0) @@ -2981,11 +2819,11 @@ early_param("apic", apic_set_verbosity); static int __init lapic_insert_resource(void) { - if (!apic_phys) + if (!apic_mmio_base) return -1; /* Put local APIC into the resource map. */ - lapic_resource.start = apic_phys; + lapic_resource.start = apic_mmio_base; lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; insert_resource(&iomem_resource, &lapic_resource); diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c index 02b4839478b1..7bc5d9bf59cd 100644 --- a/arch/x86/kernel/apic/apic_common.c +++ b/arch/x86/kernel/apic/apic_common.c @@ -6,6 +6,8 @@ #include <linux/irq.h> #include <asm/apic.h> +#include "local.h" + u32 apic_default_calc_apicid(unsigned int cpu) { return per_cpu(x86_cpu_to_apicid, cpu); @@ -29,18 +31,27 @@ void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) int default_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) - return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); + return (int)per_cpu(x86_cpu_to_apicid, mps_cpu); else return BAD_APICID; } EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid); -int default_check_phys_apicid_present(int phys_apicid) +bool default_apic_id_registered(void) { - return physid_isset(phys_apicid, phys_cpu_present_map); + return physid_isset(read_apic_id(), phys_cpu_present_map); } -int default_apic_id_valid(u32 apicid) +/* + * Set up the logical destination ID when the APIC operates in logical + * destination mode. + */ +void default_init_apic_ldr(void) { - return (apicid < 255); + unsigned long val; + + apic_write(APIC_DFR, APIC_DFR_FLAT); + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); + apic_write(APIC_LDR, val); } diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 8f72b4351c9f..032a84e2c3cc 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -28,26 +28,6 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 1; } -/* - * Set up the logical destination ID. - * - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ -void flat_init_apic_ldr(void) -{ - unsigned long val; - unsigned long num, id; - - num = smp_processor_id(); - id = 1UL << num; - apic_write(APIC_DFR, APIC_DFR_FLAT); - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - val |= SET_APIC_LOGICAL_ID(id); - apic_write(APIC_LDR, val); -} - static void _flat_send_IPI_mask(unsigned long mask, int vector) { unsigned long flags; @@ -86,16 +66,6 @@ static u32 set_apic_id(unsigned int id) return (id & 0xFF) << 24; } -static unsigned int read_xapic_id(void) -{ - return flat_get_apic_id(apic_read(APIC_ID)); -} - -static int flat_apic_id_registered(void) -{ - return physid_isset(read_xapic_id(), phys_cpu_present_map); -} - static int flat_phys_pkg_id(int initial_apic_id, int index_msb) { return initial_apic_id >> index_msb; @@ -110,23 +80,18 @@ static struct apic apic_flat __ro_after_init = { .name = "flat", .probe = flat_probe, .acpi_madt_oem_check = flat_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = flat_apic_id_registered, + .apic_id_registered = default_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, .disable_esr = 0, - .check_apicid_used = NULL, - .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, + .init_apic_ldr = default_init_apic_ldr, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = flat_phys_pkg_id, + .max_apic_id = 0xFE, .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, @@ -139,15 +104,13 @@ static struct apic apic_flat __ro_after_init = { .send_IPI_all = default_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .inquire_remote_apic = default_inquire_remote_apic, - .read = native_apic_mem_read, .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + .wait_icr_idle = apic_mem_wait_icr_idle, + .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, }; /* @@ -178,22 +141,9 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static void physflat_init_apic_ldr(void) -{ - /* - * LDR and DFR are not involved in physflat mode, rather: - * "In physical destination mode, the destination processor is - * specified by its local APIC ID [...]." (Intel SDM, 10.6.2.1) - */ -} - static int physflat_probe(void) { - if (apic == &apic_physflat || num_possible_cpus() > 8 || - jailhouse_paravirt()) - return 1; - - return 0; + return apic == &apic_physflat || num_possible_cpus() > 8 || jailhouse_paravirt(); } static struct apic apic_physflat __ro_after_init = { @@ -201,8 +151,7 @@ static struct apic apic_physflat __ro_after_init = { .name = "physical flat", .probe = physflat_probe, .acpi_madt_oem_check = physflat_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = flat_apic_id_registered, + .apic_id_registered = default_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, @@ -210,14 +159,11 @@ static struct apic apic_physflat __ro_after_init = { .disable_esr = 0, .check_apicid_used = NULL, - .init_apic_ldr = physflat_init_apic_ldr, .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = flat_phys_pkg_id, + .max_apic_id = 0xFE, .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, @@ -230,15 +176,13 @@ static struct apic apic_physflat __ro_after_init = { .send_IPI_all = default_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .inquire_remote_apic = default_inquire_remote_apic, - .read = native_apic_mem_read, .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + .wait_icr_idle = apic_mem_wait_icr_idle, + .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, }; /* diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index fe78319e0f7a..966d7cf10b95 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -8,92 +8,42 @@ * Though in case if apic is disabled (for some reason) we try * to not uglify the caller's code and allow to call (some) apic routines * like self-ipi, etc... + * + * FIXME: Remove this gunk. The above argument which was intentionally left + * in place is silly to begin with because none of the callbacks except for + * APIC::read/write() have a WARN_ON_ONCE() in them. Sigh... */ #include <linux/cpumask.h> #include <linux/thread_info.h> #include <asm/apic.h> -static void noop_init_apic_ldr(void) { } static void noop_send_IPI(int cpu, int vector) { } static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { } static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { } static void noop_send_IPI_allbutself(int vector) { } static void noop_send_IPI_all(int vector) { } static void noop_send_IPI_self(int vector) { } -static void noop_apic_wait_icr_idle(void) { } static void noop_apic_icr_write(u32 low, u32 id) { } - -static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) -{ - return -1; -} - -static u32 noop_safe_apic_wait_icr_idle(void) -{ - return 0; -} - -static u64 noop_apic_icr_read(void) -{ - return 0; -} - -static int noop_phys_pkg_id(int cpuid_apic, int index_msb) -{ - return 0; -} - -static unsigned int noop_get_apic_id(unsigned long x) -{ - return 0; -} - -static int noop_probe(void) -{ - /* - * NOOP apic should not ever be - * enabled via probe routine - */ - return 0; -} - -static int noop_apic_id_registered(void) -{ - /* - * if we would be really "pedantic" - * we should pass read_apic_id() here - * but since NOOP suppose APIC ID = 0 - * lets save a few cycles - */ - return physid_isset(0, phys_cpu_present_map); -} +static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) { return -1; } +static u64 noop_apic_icr_read(void) { return 0; } +static int noop_phys_pkg_id(int cpuid_apic, int index_msb) { return 0; } +static unsigned int noop_get_apic_id(unsigned long x) { return 0; } +static void noop_apic_eoi(void) { } static u32 noop_apic_read(u32 reg) { - WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !apic_is_disabled); return 0; } -static void noop_apic_write(u32 reg, u32 v) -{ - WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); -} - -#ifdef CONFIG_X86_32 -static int noop_x86_32_early_logical_apicid(int cpu) +static void noop_apic_write(u32 reg, u32 val) { - return BAD_APICID; + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !apic_is_disabled); } -#endif struct apic apic_noop __ro_after_init = { .name = "noop", - .probe = noop_probe, - .acpi_madt_oem_check = NULL, - - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = noop_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, @@ -101,18 +51,13 @@ struct apic apic_noop __ro_after_init = { .disable_esr = 0, .check_apicid_used = default_check_apicid_used, - .init_apic_ldr = noop_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = physid_set_mask_of_physid, - - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = noop_phys_pkg_id, + .max_apic_id = 0xFE, .get_apic_id = noop_get_apic_id, - .set_apic_id = NULL, .calc_dest_apicid = apic_flat_calc_apicid, @@ -125,17 +70,9 @@ struct apic apic_noop __ro_after_init = { .wakeup_secondary_cpu = noop_wakeup_secondary_cpu, - .inquire_remote_apic = NULL, - .read = noop_apic_read, .write = noop_apic_write, - .eoi_write = noop_apic_write, + .eoi = noop_apic_eoi, .icr_read = noop_apic_icr_read, .icr_write = noop_apic_icr_write, - .wait_icr_idle = noop_apic_wait_icr_idle, - .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, - -#ifdef CONFIG_X86_32 - .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, -#endif }; diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index a54d817eb4b6..63f3d7be9dc7 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -56,17 +56,6 @@ static u32 numachip2_set_apic_id(unsigned int id) return id << 24; } -static int numachip_apic_id_valid(u32 apicid) -{ - /* Trust what bootloader passes in MADT */ - return 1; -} - -static int numachip_apic_id_registered(void) -{ - return 1; -} - static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) { return initial_apic_id >> index_msb; @@ -228,38 +217,20 @@ static int numachip2_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 1; } -/* APIC IPIs are queued */ -static void numachip_apic_wait_icr_idle(void) -{ -} - -/* APIC NMI IPIs are queued */ -static u32 numachip_safe_apic_wait_icr_idle(void) -{ - return 0; -} - static const struct apic apic_numachip1 __refconst = { .name = "NumaConnect system", .probe = numachip1_probe, .acpi_madt_oem_check = numachip1_acpi_madt_oem_check, - .apic_id_valid = numachip_apic_id_valid, - .apic_id_registered = numachip_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, - .check_apicid_used = NULL, - .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = numachip_phys_pkg_id, + .max_apic_id = UINT_MAX, .get_apic_id = numachip1_get_apic_id, .set_apic_id = numachip1_set_apic_id, @@ -273,15 +244,12 @@ static const struct apic apic_numachip1 __refconst = { .send_IPI_self = numachip_send_IPI_self, .wakeup_secondary_cpu = numachip_wakeup_secondary, - .inquire_remote_apic = NULL, /* REMRD not supported */ .read = native_apic_mem_read, .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = numachip_apic_wait_icr_idle, - .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle, }; apic_driver(apic_numachip1); @@ -290,23 +258,16 @@ static const struct apic apic_numachip2 __refconst = { .name = "NumaConnect2 system", .probe = numachip2_probe, .acpi_madt_oem_check = numachip2_acpi_madt_oem_check, - .apic_id_valid = numachip_apic_id_valid, - .apic_id_registered = numachip_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, - .check_apicid_used = NULL, - .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = numachip_phys_pkg_id, + .max_apic_id = UINT_MAX, .get_apic_id = numachip2_get_apic_id, .set_apic_id = numachip2_set_apic_id, @@ -320,15 +281,12 @@ static const struct apic apic_numachip2 __refconst = { .send_IPI_self = numachip_send_IPI_self, .wakeup_secondary_cpu = numachip_wakeup_secondary, - .inquire_remote_apic = NULL, /* REMRD not supported */ .read = native_apic_mem_read, .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = numachip_apic_wait_icr_idle, - .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle, }; apic_driver(apic_numachip2); diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 77555f66c14d..0e5535add4b5 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -18,56 +18,17 @@ static unsigned bigsmp_get_apic_id(unsigned long x) return (x >> 24) & 0xFF; } -static int bigsmp_apic_id_registered(void) -{ - return 1; -} - static bool bigsmp_check_apicid_used(physid_mask_t *map, int apicid) { return false; } -static int bigsmp_early_logical_apicid(int cpu) -{ - /* on bigsmp, logical apicid is the same as physical */ - return early_per_cpu(x86_cpu_to_apicid, cpu); -} - -/* - * bigsmp enables physical destination mode - * and doesn't use LDR and DFR - */ -static void bigsmp_init_apic_ldr(void) -{ -} - -static void bigsmp_setup_apic_routing(void) -{ - printk(KERN_INFO - "Enabling APIC mode: Physflat. Using %d I/O APICs\n", - nr_ioapics); -} - -static int bigsmp_cpu_present_to_apicid(int mps_cpu) -{ - if (mps_cpu < nr_cpu_ids) - return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); - - return BAD_APICID; -} - static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ physids_promote(0xFFL, retmap); } -static int bigsmp_check_phys_apicid_present(int phys_apicid) -{ - return 1; -} - static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) { return cpuid_apic >> index_msb; @@ -111,21 +72,13 @@ static const struct dmi_system_id bigsmp_dmi_table[] = { static int probe_bigsmp(void) { - if (def_to_bigsmp) - dmi_bigsmp = 1; - else - dmi_check_system(bigsmp_dmi_table); - - return dmi_bigsmp; + return dmi_check_system(bigsmp_dmi_table); } static struct apic apic_bigsmp __ro_after_init = { .name = "bigsmp", .probe = probe_bigsmp, - .acpi_madt_oem_check = NULL, - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = bigsmp_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, @@ -133,14 +86,11 @@ static struct apic apic_bigsmp __ro_after_init = { .disable_esr = 1, .check_apicid_used = bigsmp_check_apicid_used, - .init_apic_ldr = bigsmp_init_apic_ldr, .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, - .setup_apic_routing = bigsmp_setup_apic_routing, - .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, - .apicid_to_cpu_present = physid_set_mask_of_physid, - .check_phys_apicid_present = bigsmp_check_phys_apicid_present, + .cpu_present_to_apicid = default_cpu_present_to_apicid, .phys_pkg_id = bigsmp_phys_pkg_id, + .max_apic_id = 0xFE, .get_apic_id = bigsmp_get_apic_id, .set_apic_id = NULL, @@ -153,37 +103,24 @@ static struct apic apic_bigsmp __ro_after_init = { .send_IPI_all = bigsmp_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .inquire_remote_apic = default_inquire_remote_apic, - .read = native_apic_mem_read, .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - - .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, + .wait_icr_idle = apic_mem_wait_icr_idle, + .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, }; -void __init generic_bigsmp_probe(void) +bool __init apic_bigsmp_possible(bool cmdline_override) { - unsigned int cpu; - - if (!probe_bigsmp()) - return; - - apic = &apic_bigsmp; - - for_each_possible_cpu(cpu) { - if (early_per_cpu(x86_cpu_to_logical_apicid, - cpu) == BAD_APICID) - continue; - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = - bigsmp_early_logical_apicid(cpu); - } + return apic == &apic_bigsmp || !cmdline_override; +} - pr_info("Overriding APIC driver with %s\n", apic_bigsmp.name); +void __init apic_bigsmp_force(void) +{ + if (apic != &apic_bigsmp) + apic_install_driver(&apic_bigsmp); } apic_driver(apic_bigsmp); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 34a992e275ef..45af535c44a0 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -21,6 +21,8 @@ #include <linux/init.h> #include <linux/delay.h> +#include "local.h" + #ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh) { @@ -31,12 +33,12 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) #ifdef arch_trigger_cpumask_backtrace static void nmi_raise_cpu_backtrace(cpumask_t *mask) { - apic->send_IPI_mask(mask, NMI_VECTOR); + __apic_send_IPI_mask(mask, NMI_VECTOR); } -void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu) { - nmi_trigger_cpumask_backtrace(mask, exclude_self, + nmi_trigger_cpumask_backtrace(mask, exclude_cpu, nmi_raise_cpu_backtrace); } diff --git a/arch/x86/kernel/apic/init.c b/arch/x86/kernel/apic/init.c new file mode 100644 index 000000000000..821e2e536f19 --- /dev/null +++ b/arch/x86/kernel/apic/init.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-only +#define pr_fmt(fmt) "APIC: " fmt + +#include <asm/apic.h> + +#include "local.h" + +/* + * Use DEFINE_STATIC_CALL_NULL() to avoid having to provide stub functions + * for each callback. The callbacks are setup during boot and all except + * wait_icr_idle() must be initialized before usage. The IPI wrappers + * use static_call() and not static_call_cond() to catch any fails. + */ +#define DEFINE_APIC_CALL(__cb) \ + DEFINE_STATIC_CALL_NULL(apic_call_##__cb, *apic->__cb) + +DEFINE_APIC_CALL(eoi); +DEFINE_APIC_CALL(native_eoi); +DEFINE_APIC_CALL(icr_read); +DEFINE_APIC_CALL(icr_write); +DEFINE_APIC_CALL(read); +DEFINE_APIC_CALL(send_IPI); +DEFINE_APIC_CALL(send_IPI_mask); +DEFINE_APIC_CALL(send_IPI_mask_allbutself); +DEFINE_APIC_CALL(send_IPI_allbutself); +DEFINE_APIC_CALL(send_IPI_all); +DEFINE_APIC_CALL(send_IPI_self); +DEFINE_APIC_CALL(wait_icr_idle); +DEFINE_APIC_CALL(wakeup_secondary_cpu); +DEFINE_APIC_CALL(wakeup_secondary_cpu_64); +DEFINE_APIC_CALL(write); + +EXPORT_STATIC_CALL_TRAMP_GPL(apic_call_send_IPI_mask); +EXPORT_STATIC_CALL_TRAMP_GPL(apic_call_send_IPI_self); + +/* The container for function call overrides */ +struct apic_override __x86_apic_override __initdata; + +#define apply_override(__cb) \ + if (__x86_apic_override.__cb) \ + apic->__cb = __x86_apic_override.__cb + +static __init void restore_override_callbacks(void) +{ + apply_override(eoi); + apply_override(native_eoi); + apply_override(write); + apply_override(read); + apply_override(send_IPI); + apply_override(send_IPI_mask); + apply_override(send_IPI_mask_allbutself); + apply_override(send_IPI_allbutself); + apply_override(send_IPI_all); + apply_override(send_IPI_self); + apply_override(icr_read); + apply_override(icr_write); + apply_override(wakeup_secondary_cpu); + apply_override(wakeup_secondary_cpu_64); +} + +#define update_call(__cb) \ + static_call_update(apic_call_##__cb, *apic->__cb) + +static __init void update_static_calls(void) +{ + update_call(eoi); + update_call(native_eoi); + update_call(write); + update_call(read); + update_call(send_IPI); + update_call(send_IPI_mask); + update_call(send_IPI_mask_allbutself); + update_call(send_IPI_allbutself); + update_call(send_IPI_all); + update_call(send_IPI_self); + update_call(icr_read); + update_call(icr_write); + update_call(wait_icr_idle); + update_call(wakeup_secondary_cpu); + update_call(wakeup_secondary_cpu_64); +} + +void __init apic_setup_apic_calls(void) +{ + /* Ensure that the default APIC has native_eoi populated */ + apic->native_eoi = apic->eoi; + update_static_calls(); + pr_info("Static calls initialized\n"); +} + +void __init apic_install_driver(struct apic *driver) +{ + if (apic == driver) + return; + + apic = driver; + + if (IS_ENABLED(CONFIG_X86_X2APIC) && apic->x2apic_set_max_apicid) + apic->max_apic_id = x2apic_max_apicid; + + /* Copy the original eoi() callback as KVM/HyperV might overwrite it */ + if (!apic->native_eoi) + apic->native_eoi = apic->eoi; + + /* Apply any already installed callback overrides */ + restore_override_callbacks(); + update_static_calls(); + + pr_info("Switched APIC routing to: %s\n", driver->name); +} diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4241dc243aa8..00da6cf6b07d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -178,7 +178,7 @@ int mp_bus_id_to_type[MAX_MP_BUSSES]; DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); -int skip_ioapic_setup; +bool ioapic_is_disabled __ro_after_init; /** * disable_ioapic_support() - disables ioapic support at runtime @@ -189,7 +189,7 @@ void disable_ioapic_support(void) noioapicquirk = 1; noioapicreroute = -1; #endif - skip_ioapic_setup = 1; + ioapic_is_disabled = true; } static int __init parse_noapic(char *str) @@ -831,7 +831,7 @@ static int __acpi_get_override_irq(u32 gsi, bool *trigger, bool *polarity) { int ioapic, pin, idx; - if (skip_ioapic_setup) + if (ioapic_is_disabled) return -1; ioapic = mp_find_ioapic(gsi); @@ -1366,7 +1366,7 @@ void __init enable_IO_APIC(void) int i8259_apic, i8259_pin; int apic, pin; - if (skip_ioapic_setup) + if (ioapic_is_disabled) nr_ioapics = 0; if (!nr_legacy_irqs() || !nr_ioapics) @@ -1511,13 +1511,9 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) physid_set(i, phys_id_present_map); ioapics[ioapic_idx].mp_config.apicid = i; } else { - physid_mask_t tmp; - apic->apicid_to_cpu_present(mpc_ioapic_id(ioapic_idx), - &tmp); - apic_printk(APIC_VERBOSE, "Setting %d in the " - "phys_id_present_map\n", - mpc_ioapic_id(ioapic_idx)); - physids_or(phys_id_present_map, phys_id_present_map, tmp); + apic_printk(APIC_VERBOSE, "Setting %d in the phys_id_present_map\n", + mpc_ioapic_id(ioapic_idx)); + physid_set(mpc_ioapic_id(ioapic_idx), phys_id_present_map); } /* @@ -1827,7 +1823,7 @@ static void ioapic_ack_level(struct irq_data *irq_data) * We must acknowledge the irq before we move it or the acknowledge will * not propagate properly. */ - ack_APIC_irq(); + apic_eoi(); /* * Tail end of clearing remote IRR bit (either by delivering the EOI @@ -2050,7 +2046,7 @@ static void unmask_lapic_irq(struct irq_data *data) static void ack_lapic_irq(struct irq_data *data) { - ack_APIC_irq(); + apic_eoi(); } static struct irq_chip lapic_chip __read_mostly = { @@ -2095,7 +2091,7 @@ static inline void __init unlock_ExtINT_logic(void) entry0 = ioapic_read_entry(apic, pin); clear_IO_APIC_pin(apic, pin); - apic_id = hard_smp_processor_id(); + apic_id = read_apic_id(); memset(&entry1, 0, sizeof(entry1)); entry1.dest_mode_logical = true; @@ -2399,7 +2395,7 @@ void __init setup_IO_APIC(void) { int ioapic; - if (skip_ioapic_setup || !nr_ioapics) + if (ioapic_is_disabled || !nr_ioapics) return; io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL; @@ -2546,7 +2542,7 @@ static int io_apic_get_unique_id(int ioapic, int apic_id) apic_id = i; } - apic->apicid_to_cpu_present(apic_id, &tmp); + physid_set_mask_of_physid(apic_id, &tmp); physids_or(apic_id_map, apic_id_map, tmp); if (reg_00.bits.ID != apic_id) { @@ -2715,7 +2711,7 @@ void __init io_apic_init_mappings(void) "address found in MPTABLE, " "disabling IO/APIC support!\n"); smp_found_config = 0; - skip_ioapic_setup = 1; + ioapic_is_disabled = true; goto fake_ioapic_page; } #endif diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 2a6509e8c840..a44ba7209ef3 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/cpumask.h> +#include <linux/delay.h> #include <linux/smp.h> + #include <asm/io_apic.h> #include "local.h" @@ -52,9 +54,9 @@ void apic_send_IPI_allbutself(unsigned int vector) return; if (static_branch_likely(&apic_use_ipi_shorthand)) - apic->send_IPI_allbutself(vector); + __apic_send_IPI_allbutself(vector); else - apic->send_IPI_mask_allbutself(cpu_online_mask, vector); + __apic_send_IPI_mask_allbutself(cpu_online_mask, vector); } /* @@ -68,12 +70,12 @@ void native_smp_send_reschedule(int cpu) WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu); return; } - apic->send_IPI(cpu, RESCHEDULE_VECTOR); + __apic_send_IPI(cpu, RESCHEDULE_VECTOR); } void native_send_call_func_single_ipi(int cpu) { - apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR); + __apic_send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR); } void native_send_call_func_ipi(const struct cpumask *mask) @@ -85,14 +87,14 @@ void native_send_call_func_ipi(const struct cpumask *mask) goto sendmask; if (cpumask_test_cpu(cpu, mask)) - apic->send_IPI_all(CALL_FUNCTION_VECTOR); + __apic_send_IPI_all(CALL_FUNCTION_VECTOR); else if (num_online_cpus() > 1) - apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR); + __apic_send_IPI_allbutself(CALL_FUNCTION_VECTOR); return; } sendmask: - apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR); + __apic_send_IPI_mask(mask, CALL_FUNCTION_VECTOR); } #endif /* CONFIG_SMP */ @@ -102,74 +104,77 @@ static inline int __prepare_ICR2(unsigned int mask) return SET_XAPIC_DEST_FIELD(mask); } -static inline void __xapic_wait_icr_idle(void) +u32 apic_mem_wait_icr_idle_timeout(void) +{ + int cnt; + + for (cnt = 0; cnt < 1000; cnt++) { + if (!(apic_read(APIC_ICR) & APIC_ICR_BUSY)) + return 0; + inc_irq_stat(icr_read_retry_count); + udelay(100); + } + return APIC_ICR_BUSY; +} + +void apic_mem_wait_icr_idle(void) { while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY) cpu_relax(); } -void __default_send_IPI_shortcut(unsigned int shortcut, int vector) +/* + * This is safe against interruption because it only writes the lower 32 + * bits of the APIC_ICR register. The destination field is ignored for + * short hand IPIs. + * + * wait_icr_idle() + * write(ICR2, dest) + * NMI + * wait_icr_idle() + * write(ICR) + * wait_icr_idle() + * write(ICR) + * + * This function does not need to disable interrupts as there is no ICR2 + * interaction. The memory write is direct except when the machine is + * affected by the 11AP Pentium erratum, which turns the plain write into + * an XCHG operation. + */ +static void __default_send_IPI_shortcut(unsigned int shortcut, int vector) { /* - * Subtle. In the case of the 'never do double writes' workaround - * we have to lock out interrupts to be safe. As we don't care - * of the value read we use an atomic rmw access to avoid costly - * cli/sti. Otherwise we use an even cheaper single atomic write - * to the APIC. - */ - unsigned int cfg; - - /* - * Wait for idle. + * Wait for the previous ICR command to complete. Use + * safe_apic_wait_icr_idle() for the NMI vector as there have been + * issues where otherwise the system hangs when the panic CPU tries + * to stop the others before launching the kdump kernel. */ if (unlikely(vector == NMI_VECTOR)) - safe_apic_wait_icr_idle(); + apic_mem_wait_icr_idle_timeout(); else - __xapic_wait_icr_idle(); + apic_mem_wait_icr_idle(); - /* - * No need to touch the target chip field. Also the destination - * mode is ignored when a shorthand is used. - */ - cfg = __prepare_ICR(shortcut, vector, 0); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - native_apic_mem_write(APIC_ICR, cfg); + /* Destination field (ICR2) and the destination mode are ignored */ + native_apic_mem_write(APIC_ICR, __prepare_ICR(shortcut, vector, 0)); } /* * This is used to send an IPI with no shorthand notation (the destination is * specified in bits 56 to 63 of the ICR). */ -void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) +void __default_send_IPI_dest_field(unsigned int dest_mask, int vector, + unsigned int dest_mode) { - unsigned long cfg; - - /* - * Wait for idle. - */ + /* See comment in __default_send_IPI_shortcut() */ if (unlikely(vector == NMI_VECTOR)) - safe_apic_wait_icr_idle(); + apic_mem_wait_icr_idle_timeout(); else - __xapic_wait_icr_idle(); + apic_mem_wait_icr_idle(); - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(mask); - native_apic_mem_write(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector, dest); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - native_apic_mem_write(APIC_ICR, cfg); + /* Set the IPI destination field in the ICR */ + native_apic_mem_write(APIC_ICR2, __prepare_ICR2(dest_mask)); + /* Send it with the proper destination mode */ + native_apic_mem_write(APIC_ICR, __prepare_ICR(0, vector, dest_mode)); } void default_send_IPI_single_phys(int cpu, int vector) @@ -184,18 +189,13 @@ void default_send_IPI_single_phys(int cpu, int vector) void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector) { - unsigned long query_cpu; unsigned long flags; + unsigned long cpu; - /* - * Hack. The clustered APIC addressing mode doesn't allow us to send - * to an arbitrary mask, so I do a unicast to each CPU instead. - * - mbligh - */ local_irq_save(flags); - for_each_cpu(query_cpu, mask) { + for_each_cpu(cpu, mask) { __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, - query_cpu), vector, APIC_DEST_PHYSICAL); + cpu), vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } @@ -203,18 +203,15 @@ void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector) void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, int vector) { - unsigned int this_cpu = smp_processor_id(); - unsigned int query_cpu; + unsigned int cpu, this_cpu = smp_processor_id(); unsigned long flags; - /* See Hack comment above */ - local_irq_save(flags); - for_each_cpu(query_cpu, mask) { - if (query_cpu == this_cpu) + for_each_cpu(cpu, mask) { + if (cpu == this_cpu) continue; __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, - query_cpu), vector, APIC_DEST_PHYSICAL); + cpu), vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } @@ -224,7 +221,7 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, */ void default_send_IPI_single(int cpu, int vector) { - apic->send_IPI_mask(cpumask_of(cpu), vector); + __apic_send_IPI_mask(cpumask_of(cpu), vector); } void default_send_IPI_allbutself(int vector) @@ -243,50 +240,32 @@ void default_send_IPI_self(int vector) } #ifdef CONFIG_X86_32 - -void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, - int vector) +void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector) { unsigned long flags; - unsigned int query_cpu; - - /* - * Hack. The clustered APIC addressing mode doesn't allow us to send - * to an arbitrary mask, so I do a unicasts to each CPU instead. This - * should be modified to do 1 message per cluster ID - mbligh - */ + unsigned int cpu; local_irq_save(flags); - for_each_cpu(query_cpu, mask) - __default_send_IPI_dest_field( - early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, APIC_DEST_LOGICAL); + for_each_cpu(cpu, mask) + __default_send_IPI_dest_field(1U << cpu, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector) { + unsigned int cpu, this_cpu = smp_processor_id(); unsigned long flags; - unsigned int query_cpu; - unsigned int this_cpu = smp_processor_id(); - - /* See Hack comment above */ local_irq_save(flags); - for_each_cpu(query_cpu, mask) { - if (query_cpu == this_cpu) + for_each_cpu(cpu, mask) { + if (cpu == this_cpu) continue; - __default_send_IPI_dest_field( - early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, APIC_DEST_LOGICAL); - } + __default_send_IPI_dest_field(1U << cpu, vector, APIC_DEST_LOGICAL); + } local_irq_restore(flags); } -/* - * This is only used on smaller machines. - */ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) { unsigned long mask = cpumask_bits(cpumask)[0]; @@ -301,7 +280,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) local_irq_restore(flags); } -/* must come after the send_IPI functions above for inlining */ +#ifdef CONFIG_SMP static int convert_apicid_to_cpu(int apic_id) { int i; @@ -320,7 +299,7 @@ int safe_smp_processor_id(void) if (!boot_cpu_has(X86_FEATURE_APIC)) return 0; - apicid = hard_smp_processor_id(); + apicid = read_apic_id(); if (apicid == BAD_APICID) return 0; @@ -329,3 +308,4 @@ int safe_smp_processor_id(void) return cpuid >= 0 ? cpuid : 0; } #endif +#endif diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h index a997d849509a..ec219c659c7d 100644 --- a/arch/x86/kernel/apic/local.h +++ b/arch/x86/kernel/apic/local.h @@ -13,18 +13,16 @@ #include <asm/irq_vectors.h> #include <asm/apic.h> -/* APIC flat 64 */ -void flat_init_apic_ldr(void); - /* X2APIC */ -int x2apic_apic_id_valid(u32 apicid); -int x2apic_apic_id_registered(void); void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); unsigned int x2apic_get_apic_id(unsigned long id); u32 x2apic_set_apic_id(unsigned int id); int x2apic_phys_pkg_id(int initial_apicid, int index_msb); + +void x2apic_send_IPI_all(int vector); +void x2apic_send_IPI_allbutself(int vector); void x2apic_send_IPI_self(int vector); -void __x2apic_send_IPI_shorthand(int vector, u32 which); +extern u32 x2apic_max_apicid; /* IPI */ @@ -46,7 +44,10 @@ static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector, return icr; } -void __default_send_IPI_shortcut(unsigned int shortcut, int vector); +void default_init_apic_ldr(void); + +void apic_mem_wait_icr_idle(void); +u32 apic_mem_wait_icr_idle_timeout(void); /* * This is used to send an IPI with no shorthand notation (the destination is @@ -62,8 +63,23 @@ void default_send_IPI_allbutself(int vector); void default_send_IPI_all(int vector); void default_send_IPI_self(int vector); +bool default_apic_id_registered(void); + #ifdef CONFIG_X86_32 void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_logical(const struct cpumask *mask, int vector); +void x86_32_probe_bigsmp_early(void); +void x86_32_install_bigsmp(void); +#else +static inline void x86_32_probe_bigsmp_early(void) { } +static inline void x86_32_install_bigsmp(void) { } +#endif + +#ifdef CONFIG_X86_BIGSMP +bool apic_bigsmp_possible(bool cmdline_selected); +void apic_bigsmp_force(void); +#else +static inline bool apic_bigsmp_possible(bool cmdline_selected) { return false; }; +static inline void apic_bigsmp_force(void) { } #endif diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 35d5b8fb18ef..6b6b711678fe 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -269,7 +269,7 @@ static const struct msi_parent_ops x86_vector_msi_parent_ops = { struct irq_domain * __init native_create_pci_msi_domain(void) { - if (disable_apic) + if (apic_is_disabled) return NULL; x86_vector_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index a61f642b1b90..9a06df6cdd68 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -10,46 +10,14 @@ #include <linux/errno.h> #include <linux/smp.h> +#include <xen/xen.h> + #include <asm/io_apic.h> #include <asm/apic.h> #include <asm/acpi.h> #include "local.h" -static int default_x86_32_early_logical_apicid(int cpu) -{ - return 1 << cpu; -} - -static void setup_apic_flat_routing(void) -{ -#ifdef CONFIG_X86_IO_APIC - printk(KERN_INFO - "Enabling APIC mode: Flat. Using %d I/O APICs\n", - nr_ioapics); -#endif -} - -static int default_apic_id_registered(void) -{ - return physid_isset(read_apic_id(), phys_cpu_present_map); -} - -/* - * Set up the logical destination ID. Intel recommends to set DFR, LDR and - * TPR before enabling an APIC. See e.g. "AP-388 82489DX User's Manual" - * (Intel document number 292116). - */ -static void default_init_apic_ldr(void) -{ - unsigned long val; - - apic_write(APIC_DFR, APIC_DFR_VALUE); - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); - apic_write(APIC_LDR, val); -} - static int default_phys_pkg_id(int cpuid_apic, int index_msb) { return cpuid_apic >> index_msb; @@ -65,8 +33,6 @@ static struct apic apic_default __ro_after_init = { .name = "default", .probe = probe_default, - .acpi_madt_oem_check = NULL, - .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, @@ -77,14 +43,11 @@ static struct apic apic_default __ro_after_init = { .check_apicid_used = default_check_apicid_used, .init_apic_ldr = default_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, - .setup_apic_routing = setup_apic_flat_routing, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = physid_set_mask_of_physid, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = default_phys_pkg_id, + .max_apic_id = 0xFE, .get_apic_id = default_get_apic_id, - .set_apic_id = NULL, .calc_dest_apicid = apic_flat_calc_apicid, @@ -95,17 +58,13 @@ static struct apic apic_default __ro_after_init = { .send_IPI_all = default_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .inquire_remote_apic = default_inquire_remote_apic, - .read = native_apic_mem_read, .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - - .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, + .wait_icr_idle = apic_mem_wait_icr_idle, + .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, }; apic_driver(apic_default); @@ -123,7 +82,7 @@ static int __init parse_apic(char *arg) for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { if (!strcmp((*drv)->name, arg)) { - apic = *drv; + apic_install_driver(*drv); cmdline_apic = 1; return 0; } @@ -134,49 +93,43 @@ static int __init parse_apic(char *arg) } early_param("apic", parse_apic); -void __init default_setup_apic_routing(void) +void __init x86_32_probe_bigsmp_early(void) { - int version = boot_cpu_apic_version; + if (nr_cpu_ids <= 8 || xen_pv_domain()) + return; - if (num_possible_cpus() > 8) { + if (IS_ENABLED(CONFIG_X86_BIGSMP)) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: - if (!APIC_XAPIC(version)) { - def_to_bigsmp = 0; + if (!APIC_XAPIC(boot_cpu_apic_version)) break; - } /* P4 and above */ fallthrough; case X86_VENDOR_HYGON: case X86_VENDOR_AMD: - def_to_bigsmp = 1; + if (apic_bigsmp_possible(cmdline_apic)) + return; + break; } } + pr_info("Limiting to 8 possible CPUs\n"); + set_nr_cpu_ids(8); +} -#ifdef CONFIG_X86_BIGSMP - /* - * This is used to switch to bigsmp mode when - * - There is no apic= option specified by the user - * - generic_apic_probe() has chosen apic_default as the sub_arch - * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support - */ - - if (!cmdline_apic && apic == &apic_default) - generic_bigsmp_probe(); -#endif - - if (apic->setup_apic_routing) - apic->setup_apic_routing(); +void __init x86_32_install_bigsmp(void) +{ + if (nr_cpu_ids > 8 && !xen_pv_domain()) + apic_bigsmp_force(); } -void __init generic_apic_probe(void) +void __init x86_32_probe_apic(void) { if (!cmdline_apic) { struct apic **drv; for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { if ((*drv)->probe()) { - apic = *drv; + apic_install_driver(*drv); break; } } @@ -184,26 +137,4 @@ void __init generic_apic_probe(void) if (drv == __apicdrivers_end) panic("Didn't find an APIC driver"); } - printk(KERN_INFO "Using APIC driver %s\n", apic->name); -} - -/* This function can switch the APIC even after the initial ->probe() */ -int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ - struct apic **drv; - - for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { - if (!(*drv)->acpi_madt_oem_check) - continue; - if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) - continue; - - if (!cmdline_apic) { - apic = *drv; - printk(KERN_INFO "Switched to APIC driver `%s'.\n", - apic->name); - } - return 1; - } - return 0; } diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index c46720f185c0..ecdf0c4121e1 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -13,10 +13,8 @@ #include "local.h" -/* - * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. - */ -void __init default_setup_apic_routing(void) +/* Select the appropriate APIC driver */ +void __init x86_64_probe_apic(void) { struct apic **drv; @@ -24,11 +22,7 @@ void __init default_setup_apic_routing(void) for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { if ((*drv)->probe && (*drv)->probe()) { - if (apic != *drv) { - apic = *drv; - pr_info("Switched APIC routing to %s.\n", - apic->name); - } + apic_install_driver(*drv); break; } } @@ -40,11 +34,7 @@ int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) { - if (apic != *drv) { - apic = *drv; - pr_info("Setting APIC routing to %s.\n", - apic->name); - } + apic_install_driver(*drv); return 1; } } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index c1efebd27e6c..319448d87b99 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -44,7 +44,18 @@ static cpumask_var_t vector_searchmask; static struct irq_chip lapic_controller; static struct irq_matrix *vector_matrix; #ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct hlist_head, cleanup_list); + +static void vector_cleanup_callback(struct timer_list *tmr); + +struct vector_cleanup { + struct hlist_head head; + struct timer_list timer; +}; + +static DEFINE_PER_CPU(struct vector_cleanup, vector_cleanup) = { + .head = HLIST_HEAD_INIT, + .timer = __TIMER_INITIALIZER(vector_cleanup_callback, TIMER_PINNED), +}; #endif void lock_vector_lock(void) @@ -536,7 +547,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, struct irq_data *irqd; int i, err, node; - if (disable_apic) + if (apic_is_disabled) return -ENXIO; /* @@ -680,7 +691,7 @@ static int x86_vector_select(struct irq_domain *d, struct irq_fwspec *fwspec, * if IRQ remapping is enabled. APIC IDs above 15 bits are * only permitted if IRQ remapping is enabled, so check that. */ - if (apic->apic_id_valid(32768)) + if (apic_id_valid(32768)) return 0; return x86_fwspec_is_ioapic(fwspec) || x86_fwspec_is_hpet(fwspec); @@ -841,10 +852,21 @@ void lapic_online(void) this_cpu_write(vector_irq[vector], __setup_vector_irq(vector)); } +static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr); + void lapic_offline(void) { + struct vector_cleanup *cl = this_cpu_ptr(&vector_cleanup); + lock_vector_lock(); + + /* In case the vector cleanup timer has not expired */ + __vector_cleanup(cl, false); + irq_matrix_offline(vector_matrix); + WARN_ON_ONCE(try_to_del_timer_sync(&cl->timer) < 0); + WARN_ON_ONCE(!hlist_empty(&cl->head)); + unlock_vector_lock(); } @@ -876,7 +898,7 @@ static int apic_retrigger_irq(struct irq_data *irqd) unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); - apic->send_IPI(apicd->cpu, apicd->vector); + __apic_send_IPI(apicd->cpu, apicd->vector); raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -885,7 +907,7 @@ static int apic_retrigger_irq(struct irq_data *irqd) void apic_ack_irq(struct irq_data *irqd) { irq_move_irq(irqd); - ack_APIC_irq(); + apic_eoi(); } void apic_ack_edge(struct irq_data *irqd) @@ -934,62 +956,98 @@ static void free_moved_vector(struct apic_chip_data *apicd) apicd->move_in_progress = 0; } -DEFINE_IDTENTRY_SYSVEC(sysvec_irq_move_cleanup) +static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) { - struct hlist_head *clhead = this_cpu_ptr(&cleanup_list); struct apic_chip_data *apicd; struct hlist_node *tmp; + bool rearm = false; - ack_APIC_irq(); - /* Prevent vectors vanishing under us */ - raw_spin_lock(&vector_lock); + lockdep_assert_held(&vector_lock); - hlist_for_each_entry_safe(apicd, tmp, clhead, clist) { + hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) { unsigned int irr, vector = apicd->prev_vector; /* * Paranoia: Check if the vector that needs to be cleaned - * up is registered at the APICs IRR. If so, then this is - * not the best time to clean it up. Clean it up in the - * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR - * to this CPU. IRQ_MOVE_CLEANUP_VECTOR is the lowest - * priority external vector, so on return from this - * interrupt the device interrupt will happen first. + * up is registered at the APICs IRR. That's clearly a + * hardware issue if the vector arrived on the old target + * _after_ interrupts were disabled above. Keep @apicd + * on the list and schedule the timer again to give the CPU + * a chance to handle the pending interrupt. + * + * Do not check IRR when called from lapic_offline(), because + * fixup_irqs() was just called to scan IRR for set bits and + * forward them to new destination CPUs via IPIs. */ - irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + irr = check_irr ? apic_read(APIC_IRR + (vector / 32 * 0x10)) : 0; if (irr & (1U << (vector % 32))) { - apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); + pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq); + rearm = true; continue; } free_moved_vector(apicd); } - raw_spin_unlock(&vector_lock); + /* + * Must happen under vector_lock to make the timer_pending() check + * in __vector_schedule_cleanup() race free against the rearm here. + */ + if (rearm) + mod_timer(&cl->timer, jiffies + 1); +} + +static void vector_cleanup_callback(struct timer_list *tmr) +{ + struct vector_cleanup *cl = container_of(tmr, typeof(*cl), timer); + + /* Prevent vectors vanishing under us */ + raw_spin_lock_irq(&vector_lock); + __vector_cleanup(cl, true); + raw_spin_unlock_irq(&vector_lock); } -static void __send_cleanup_vector(struct apic_chip_data *apicd) +static void __vector_schedule_cleanup(struct apic_chip_data *apicd) { - unsigned int cpu; + unsigned int cpu = apicd->prev_cpu; raw_spin_lock(&vector_lock); apicd->move_in_progress = 0; - cpu = apicd->prev_cpu; if (cpu_online(cpu)) { - hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, cpu)); - apic->send_IPI(cpu, IRQ_MOVE_CLEANUP_VECTOR); + struct vector_cleanup *cl = per_cpu_ptr(&vector_cleanup, cpu); + + hlist_add_head(&apicd->clist, &cl->head); + + /* + * The lockless timer_pending() check is safe here. If it + * returns true, then the callback will observe this new + * apic data in the hlist as everything is serialized by + * vector lock. + * + * If it returns false then the timer is either not armed + * or the other CPU executes the callback, which again + * would be blocked on vector lock. Rearming it in the + * latter case makes it fire for nothing. + * + * This is also safe against the callback rearming the timer + * because that's serialized via vector lock too. + */ + if (!timer_pending(&cl->timer)) { + cl->timer.expires = jiffies + 1; + add_timer_on(&cl->timer, cpu); + } } else { apicd->prev_vector = 0; } raw_spin_unlock(&vector_lock); } -void send_cleanup_vector(struct irq_cfg *cfg) +void vector_schedule_cleanup(struct irq_cfg *cfg) { struct apic_chip_data *apicd; apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg); if (apicd->move_in_progress) - __send_cleanup_vector(apicd); + __vector_schedule_cleanup(apicd); } void irq_complete_move(struct irq_cfg *cfg) @@ -1007,7 +1065,7 @@ void irq_complete_move(struct irq_cfg *cfg) * on the same CPU. */ if (apicd->cpu == smp_processor_id()) - __send_cleanup_vector(apicd); + __vector_schedule_cleanup(apicd); } /* @@ -1150,7 +1208,7 @@ static void __init print_local_APIC(void *dummy) u64 icr; pr_debug("printing local APIC contents on CPU#%d/%d:\n", - smp_processor_id(), hard_smp_processor_id()); + smp_processor_id(), read_apic_id()); v = apic_read(APIC_ID); pr_info("... APIC ID: %08x (%01x)\n", v, read_apic_id()); v = apic_read(APIC_LVR); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index b2b2b7f3e03f..affbff65e497 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -83,16 +83,6 @@ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); } -static void x2apic_send_IPI_allbutself(int vector) -{ - __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT); -} - -static void x2apic_send_IPI_all(int vector) -{ - __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC); -} - static u32 x2apic_calc_apicid(unsigned int cpu) { return x86_cpu_to_logical_apicid[cpu]; @@ -236,8 +226,6 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .name = "cluster x2apic", .probe = x2apic_cluster_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .apic_id_valid = x2apic_apic_id_valid, - .apic_id_registered = x2apic_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, @@ -247,12 +235,11 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .check_apicid_used = NULL, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = x2apic_phys_pkg_id, + .max_apic_id = UINT_MAX, + .x2apic_set_max_apicid = true, .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, @@ -265,15 +252,11 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_self = x2apic_send_IPI_self, - .inquire_remote_apic = NULL, - .read = native_apic_msr_read, .write = native_apic_msr_write, - .eoi_write = native_apic_msr_eoi_write, + .eoi = native_apic_msr_eoi, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, - .wait_icr_idle = native_x2apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; apic_driver(apic_x2apic_cluster); diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 896bc41cb2ba..788cdb4ee394 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -8,11 +8,13 @@ int x2apic_phys; static struct apic apic_x2apic_phys; -static u32 x2apic_max_apicid __ro_after_init; +u32 x2apic_max_apicid __ro_after_init = UINT_MAX; void __init x2apic_set_max_apicid(u32 apicid) { x2apic_max_apicid = apicid; + if (apic->x2apic_set_max_apicid) + apic->max_apic_id = apicid; } static int __init set_x2apic_phys_mode(char *arg) @@ -81,43 +83,28 @@ static void __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); } -static void x2apic_send_IPI_allbutself(int vector) +static void __x2apic_send_IPI_shorthand(int vector, u32 which) { - __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT); -} + unsigned long cfg = __prepare_ICR(which, vector, 0); -static void x2apic_send_IPI_all(int vector) -{ - __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); + native_x2apic_icr_write(cfg, 0); } -static void init_x2apic_ldr(void) +void x2apic_send_IPI_allbutself(int vector) { + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT); } -static int x2apic_phys_probe(void) -{ - if (!x2apic_mode) - return 0; - - if (x2apic_phys || x2apic_fadt_phys()) - return 1; - - return apic == &apic_x2apic_phys; -} - -/* Common x2apic functions, also used by x2apic_cluster */ -int x2apic_apic_id_valid(u32 apicid) +void x2apic_send_IPI_all(int vector) { - if (x2apic_max_apicid && apicid > x2apic_max_apicid) - return 0; - - return 1; + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC); } -int x2apic_apic_id_registered(void) +void x2apic_send_IPI_self(int vector) { - return 1; + apic_write(APIC_SELF_IPI, vector); } void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) @@ -126,13 +113,15 @@ void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) native_x2apic_icr_write(cfg, apicid); } -void __x2apic_send_IPI_shorthand(int vector, u32 which) +static int x2apic_phys_probe(void) { - unsigned long cfg = __prepare_ICR(which, vector, 0); + if (!x2apic_mode) + return 0; - /* x2apic MSRs are special and need a special fence: */ - weak_wrmsr_fence(); - native_x2apic_icr_write(cfg, 0); + if (x2apic_phys || x2apic_fadt_phys()) + return 1; + + return apic == &apic_x2apic_phys; } unsigned int x2apic_get_apic_id(unsigned long id) @@ -150,33 +139,22 @@ int x2apic_phys_pkg_id(int initial_apicid, int index_msb) return initial_apicid >> index_msb; } -void x2apic_send_IPI_self(int vector) -{ - apic_write(APIC_SELF_IPI, vector); -} - static struct apic apic_x2apic_phys __ro_after_init = { .name = "physical x2apic", .probe = x2apic_phys_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .apic_id_valid = x2apic_apic_id_valid, - .apic_id_registered = x2apic_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, - .check_apicid_used = NULL, - .init_apic_ldr = init_x2apic_ldr, - .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = x2apic_phys_pkg_id, + .max_apic_id = UINT_MAX, + .x2apic_set_max_apicid = true, .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, @@ -189,15 +167,11 @@ static struct apic apic_x2apic_phys __ro_after_init = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_self = x2apic_send_IPI_self, - .inquire_remote_apic = NULL, - .read = native_apic_msr_read, .write = native_apic_msr_write, - .eoi_write = native_apic_msr_eoi_write, + .eoi = native_apic_msr_eoi, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, - .wait_icr_idle = native_x2apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; apic_driver(apic_x2apic_phys); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d9384d5b4b8e..205cee567629 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -25,6 +25,8 @@ #include <asm/uv/uv.h> #include <asm/apic.h> +#include "local.h" + static enum uv_system_type uv_system_type; static int uv_hubbed_system; static int uv_hubless_system; @@ -294,8 +296,7 @@ static void __init early_get_apic_socketid_shift(void) static void __init uv_stringify(int len, char *to, char *from) { - /* Relies on 'to' being NULL chars so result will be NULL terminated */ - strncpy(to, from, len-1); + strscpy(to, from, len); /* Trim trailing spaces */ (void)strim(to); @@ -778,30 +779,6 @@ static void uv_send_IPI_all(int vector) uv_send_IPI_mask(cpu_online_mask, vector); } -static int uv_apic_id_valid(u32 apicid) -{ - return 1; -} - -static int uv_apic_id_registered(void) -{ - return 1; -} - -static void uv_init_apic_ldr(void) -{ -} - -static u32 apic_uv_calc_apicid(unsigned int cpu) -{ - return apic_default_calc_apicid(cpu); -} - -static unsigned int x2apic_get_apic_id(unsigned long id) -{ - return id; -} - static u32 set_apic_id(unsigned int id) { return id; @@ -817,11 +794,6 @@ static int uv_phys_pkg_id(int initial_apicid, int index_msb) return uv_read_apic_id() >> index_msb; } -static void uv_send_IPI_self(int vector) -{ - apic_write(APIC_SELF_IPI, vector); -} - static int uv_probe(void) { return apic == &apic_x2apic_uv_x; @@ -832,45 +804,35 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .name = "UV large system", .probe = uv_probe, .acpi_madt_oem_check = uv_acpi_madt_oem_check, - .apic_id_valid = uv_apic_id_valid, - .apic_id_registered = uv_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, - .check_apicid_used = NULL, - .init_apic_ldr = uv_init_apic_ldr, - .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = uv_phys_pkg_id, + .max_apic_id = UINT_MAX, .get_apic_id = x2apic_get_apic_id, .set_apic_id = set_apic_id, - .calc_dest_apicid = apic_uv_calc_apicid, + .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = uv_send_IPI_one, .send_IPI_mask = uv_send_IPI_mask, .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, .send_IPI_allbutself = uv_send_IPI_allbutself, .send_IPI_all = uv_send_IPI_all, - .send_IPI_self = uv_send_IPI_self, + .send_IPI_self = x2apic_send_IPI_self, .wakeup_secondary_cpu = uv_wakeup_secondary, - .inquire_remote_apic = NULL, .read = native_apic_msr_read, .write = native_apic_msr_write, - .eoi_write = native_apic_msr_eoi_write, + .eoi = native_apic_msr_eoi, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, - .wait_icr_idle = native_x2apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3 @@ -1013,7 +975,7 @@ static void __init calc_mmioh_map(enum mmioh_arch index, /* One (UV2) mapping */ if (index == UV2_MMIOH) { - strncpy(id, "MMIOH", sizeof(id)); + strscpy(id, "MMIOH", sizeof(id)); max_io = max_pnode; mapped = 0; goto map_exit; @@ -1571,7 +1533,7 @@ static void __init build_socket_tables(void) { struct uv_gam_range_entry *gre = uv_gre_table; int nums, numn, nump; - int cpu, i, lnid; + int i, lnid, apicid; int minsock = _min_socket; int maxsock = _max_socket; int minpnode = _min_pnode; @@ -1622,15 +1584,14 @@ static void __init build_socket_tables(void) /* Set socket -> node values: */ lnid = NUMA_NO_NODE; - for_each_possible_cpu(cpu) { - int nid = cpu_to_node(cpu); - int apicid, sockid; + for (apicid = 0; apicid < ARRAY_SIZE(__apicid_to_node); apicid++) { + int nid = __apicid_to_node[apicid]; + int sockid; - if (lnid == nid) + if ((nid == NUMA_NO_NODE) || (lnid == nid)) continue; lnid = nid; - apicid = per_cpu(x86_cpu_to_apicid, cpu); sockid = apicid >> uv_cpuid.socketid_shift; if (_socket_to_node[sockid - minsock] == SOCK_EMPTY) @@ -1845,7 +1806,7 @@ static void __init uv_system_init_hub(void) /* Initialize per CPU info: */ for_each_possible_cpu(cpu) { - int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + int apicid = per_cpu(x86_cpu_to_apicid, cpu); unsigned short bid; unsigned short pnode; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index c6c15ce1952f..5934ee5bc087 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -239,12 +239,6 @@ extern int (*console_blank_hook)(int); #endif /* - * The apm_bios device is one of the misc char devices. - * This is its minor number. - */ -#define APM_MINOR_DEV 134 - -/* * Various options can be changed at boot time as follows: * (We allow underscores for compatibility with the modules code) * apm=on/off enable/disable APM diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c index 44c3601cfdc4..190c120f4285 100644 --- a/arch/x86/kernel/audit_64.c +++ b/arch/x86/kernel/audit_64.c @@ -63,11 +63,6 @@ int audit_classify_syscall(int abi, unsigned syscall) static int __init audit_classes_init(void) { #ifdef CONFIG_IA32_EMULATION - extern __u32 ia32_dir_class[]; - extern __u32 ia32_write_class[]; - extern __u32 ia32_read_class[]; - extern __u32 ia32_chattr_class[]; - extern __u32 ia32_signal_class[]; audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class); audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class); audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class); diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c new file mode 100644 index 000000000000..d2c732a34e5d --- /dev/null +++ b/arch/x86/kernel/cet.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/ptrace.h> +#include <asm/bugs.h> +#include <asm/traps.h> + +enum cp_error_code { + CP_EC = (1 << 15) - 1, + + CP_RET = 1, + CP_IRET = 2, + CP_ENDBR = 3, + CP_RSTRORSSP = 4, + CP_SETSSBSY = 5, + + CP_ENCL = 1 << 15, +}; + +static const char cp_err[][10] = { + [0] = "unknown", + [1] = "near ret", + [2] = "far/iret", + [3] = "endbranch", + [4] = "rstorssp", + [5] = "setssbsy", +}; + +static const char *cp_err_string(unsigned long error_code) +{ + unsigned int cpec = error_code & CP_EC; + + if (cpec >= ARRAY_SIZE(cp_err)) + cpec = 0; + return cp_err[cpec]; +} + +static void do_unexpected_cp(struct pt_regs *regs, unsigned long error_code) +{ + WARN_ONCE(1, "Unexpected %s #CP, error_code: %s\n", + user_mode(regs) ? "user mode" : "kernel mode", + cp_err_string(error_code)); +} + +static DEFINE_RATELIMIT_STATE(cpf_rate, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + +static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code) +{ + struct task_struct *tsk; + unsigned long ssp; + + /* + * An exception was just taken from userspace. Since interrupts are disabled + * here, no scheduling should have messed with the registers yet and they + * will be whatever is live in userspace. So read the SSP before enabling + * interrupts so locking the fpregs to do it later is not required. + */ + rdmsrl(MSR_IA32_PL3_SSP, ssp); + + cond_local_irq_enable(regs); + + tsk = current; + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = X86_TRAP_CP; + + /* Ratelimit to prevent log spamming. */ + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + __ratelimit(&cpf_rate)) { + pr_emerg("%s[%d] control protection ip:%lx sp:%lx ssp:%lx error:%lx(%s)%s", + tsk->comm, task_pid_nr(tsk), + regs->ip, regs->sp, ssp, error_code, + cp_err_string(error_code), + error_code & CP_ENCL ? " in enclave" : ""); + print_vma_addr(KERN_CONT " in ", regs->ip); + pr_cont("\n"); + } + + force_sig_fault(SIGSEGV, SEGV_CPERR, (void __user *)0); + cond_local_irq_disable(regs); +} + +static __ro_after_init bool ibt_fatal = true; + +static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) +{ + if ((error_code & CP_EC) != CP_ENDBR) { + do_unexpected_cp(regs, error_code); + return; + } + + if (unlikely(regs->ip == (unsigned long)&ibt_selftest_noendbr)) { + regs->ax = 0; + return; + } + + pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs)); + if (!ibt_fatal) { + printk(KERN_DEFAULT CUT_HERE); + __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL); + return; + } + BUG(); +} + +static int __init ibt_setup(char *str) +{ + if (!strcmp(str, "off")) + setup_clear_cpu_cap(X86_FEATURE_IBT); + + if (!strcmp(str, "warn")) + ibt_fatal = false; + + return 1; +} + +__setup("ibt=", ibt_setup); + +DEFINE_IDTENTRY_ERRORCODE(exc_control_protection) +{ + if (user_mode(regs)) { + if (cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + do_user_cp_fault(regs, error_code); + else + do_unexpected_cp(regs, error_code); + } else { + if (cpu_feature_enabled(X86_FEATURE_IBT)) + do_kernel_cp_fault(regs, error_code); + else + do_unexpected_cp(regs, error_code); + } +} diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index 485441b7f030..bfeb18fad63f 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -51,7 +51,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback) * will block the interrupt whose vector is lower than * HYPERVISOR_CALLBACK_VECTOR. */ - ack_APIC_irq(); + apic_eoi(); inc_irq_stat(irq_hv_callback_count); if (acrn_intr_handler) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 571abf808ea3..dd8379d84445 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -27,11 +27,6 @@ #include "cpu.h" -static const int amd_erratum_383[]; -static const int amd_erratum_400[]; -static const int amd_erratum_1054[]; -static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); - /* * nodes_per_socket: Stores the number of nodes per socket. * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX @@ -39,6 +34,83 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); */ static u32 nodes_per_socket = 1; +/* + * AMD errata checking + * + * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or + * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that + * have an OSVW id assigned, which it takes as first argument. Both take a + * variable number of family-specific model-stepping ranges created by + * AMD_MODEL_RANGE(). + * + * Example: + * + * const int amd_erratum_319[] = + * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), + * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), + * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); + */ + +#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } +#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } +#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ + ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) +#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) +#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) +#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) + +static const int amd_erratum_400[] = + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), + AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); + +static const int amd_erratum_383[] = + AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); + +/* #1054: Instructions Retired Performance Counter May Be Inaccurate */ +static const int amd_erratum_1054[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); + +static const int amd_zenbleed[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf), + AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf), + AMD_MODEL_RANGE(0x17, 0x90, 0x0, 0x91, 0xf), + AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf)); + +static const int amd_div0[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf), + AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf)); + +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) +{ + int osvw_id = *erratum++; + u32 range; + u32 ms; + + if (osvw_id >= 0 && osvw_id < 65536 && + cpu_has(cpu, X86_FEATURE_OSVW)) { + u64 osvw_len; + + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); + if (osvw_id < osvw_len) { + u64 osvw_bits; + + rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), + osvw_bits); + return osvw_bits & (1ULL << (osvw_id & 0x3f)); + } + } + + /* OSVW unavailable or ID unknown, match family-model-stepping range */ + ms = (cpu->x86_model << 4) | cpu->x86_stepping; + while ((range = *erratum++)) + if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && + (ms >= AMD_MODEL_RANGE_START(range)) && + (ms <= AMD_MODEL_RANGE_END(range))) + return true; + + return false; +} + static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { u32 gprs[8] = { 0 }; @@ -916,6 +988,47 @@ static void init_amd_zn(struct cpuinfo_x86 *c) } } +static bool cpu_has_zenbleed_microcode(void) +{ + u32 good_rev = 0; + + switch (boot_cpu_data.x86_model) { + case 0x30 ... 0x3f: good_rev = 0x0830107a; break; + case 0x60 ... 0x67: good_rev = 0x0860010b; break; + case 0x68 ... 0x6f: good_rev = 0x08608105; break; + case 0x70 ... 0x7f: good_rev = 0x08701032; break; + case 0xa0 ... 0xaf: good_rev = 0x08a00008; break; + + default: + return false; + break; + } + + if (boot_cpu_data.microcode < good_rev) + return false; + + return true; +} + +static void zenbleed_check(struct cpuinfo_x86 *c) +{ + if (!cpu_has_amd_erratum(c, amd_zenbleed)) + return; + + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) + return; + + if (!cpu_has(c, X86_FEATURE_AVX)) + return; + + if (!cpu_has_zenbleed_microcode()) { + pr_notice_once("Zenbleed: please update your microcode for the most optimal fix\n"); + msr_set_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT); + } else { + msr_clear_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT); + } +} + static void init_amd(struct cpuinfo_x86 *c) { early_init_amd(c); @@ -934,7 +1047,7 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_FSRS); /* get apicid instead of initial apic id from cpuid */ - c->apicid = hard_smp_processor_id(); + c->apicid = read_apic_id(); /* K6s reports MCEs but don't actually have all the MSRs */ if (c->x86 < 6) @@ -1020,6 +1133,13 @@ static void init_amd(struct cpuinfo_x86 *c) if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && cpu_has(c, X86_FEATURE_AUTOIBRS)) WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS)); + + zenbleed_check(c); + + if (cpu_has_amd_erratum(c, amd_div0)) { + pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); + setup_force_cpu_bug(X86_BUG_DIV0); + } } #ifdef CONFIG_X86_32 @@ -1115,73 +1235,6 @@ static const struct cpu_dev amd_cpu_dev = { cpu_dev_register(amd_cpu_dev); -/* - * AMD errata checking - * - * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or - * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that - * have an OSVW id assigned, which it takes as first argument. Both take a - * variable number of family-specific model-stepping ranges created by - * AMD_MODEL_RANGE(). - * - * Example: - * - * const int amd_erratum_319[] = - * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), - * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), - * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); - */ - -#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } -#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } -#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ - ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) -#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) -#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) -#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) - -static const int amd_erratum_400[] = - AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), - AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); - -static const int amd_erratum_383[] = - AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); - -/* #1054: Instructions Retired Performance Counter May Be Inaccurate */ -static const int amd_erratum_1054[] = - AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); - -static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) -{ - int osvw_id = *erratum++; - u32 range; - u32 ms; - - if (osvw_id >= 0 && osvw_id < 65536 && - cpu_has(cpu, X86_FEATURE_OSVW)) { - u64 osvw_len; - - rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); - if (osvw_id < osvw_len) { - u64 osvw_bits; - - rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), - osvw_bits); - return osvw_bits & (1ULL << (osvw_id & 0x3f)); - } - } - - /* OSVW unavailable or ID unknown, match family-model-stepping range */ - ms = (cpu->x86_model << 4) | cpu->x86_stepping; - while ((range = *erratum++)) - if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && - (ms >= AMD_MODEL_RANGE_START(range)) && - (ms <= AMD_MODEL_RANGE_END(range))) - return true; - - return false; -} - static DEFINE_PER_CPU_READ_MOSTLY(unsigned long[4], amd_dr_addr_mask); static unsigned int amd_msr_dr_addr_masks[] = { @@ -1235,3 +1288,45 @@ u32 amd_get_highest_perf(void) return 255; } EXPORT_SYMBOL_GPL(amd_get_highest_perf); + +static void zenbleed_check_cpu(void *unused) +{ + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + + zenbleed_check(c); +} + +void amd_check_microcode(void) +{ + on_each_cpu(zenbleed_check_cpu, NULL, 1); +} + +bool cpu_has_ibpb_brtype_microcode(void) +{ + switch (boot_cpu_data.x86) { + /* Zen1/2 IBPB flushes branch type predictions too. */ + case 0x17: + return boot_cpu_has(X86_FEATURE_AMD_IBPB); + case 0x19: + /* Poke the MSR bit on Zen3/4 to check its presence. */ + if (!wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) { + setup_force_cpu_cap(X86_FEATURE_SBPB); + return true; + } else { + return false; + } + default: + return false; + } +} + +/* + * Issue a DIV 0/1 insn to clear any division data from previous DIV + * operations. + */ +void noinstr amd_clear_divider(void) +{ + asm volatile(ALTERNATIVE("", "div %2\n\t", X86_BUG_DIV0) + :: "a" (0), "d" (0), "r" (1)); +} +EXPORT_SYMBOL_GPL(amd_clear_divider); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 9e2a91830f72..f081d26616ac 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -47,6 +47,8 @@ static void __init taa_select_mitigation(void); static void __init mmio_select_mitigation(void); static void __init srbds_select_mitigation(void); static void __init l1d_flush_select_mitigation(void); +static void __init srso_select_mitigation(void); +static void __init gds_select_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -56,8 +58,13 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); DEFINE_PER_CPU(u64, x86_spec_ctrl_current); EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); +u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; +EXPORT_SYMBOL_GPL(x86_pred_cmd); + static DEFINE_MUTEX(spec_ctrl_mutex); +void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk; + /* Update SPEC_CTRL MSR and its cached copy unconditionally */ static void update_spec_ctrl(u64 val) { @@ -160,6 +167,13 @@ void __init cpu_select_mitigations(void) md_clear_select_mitigation(); srbds_select_mitigation(); l1d_flush_select_mitigation(); + + /* + * srso_select_mitigation() depends and must run after + * retbleed_select_mitigation(). + */ + srso_select_mitigation(); + gds_select_mitigation(); } /* @@ -646,6 +660,149 @@ static int __init l1d_flush_parse_cmdline(char *str) early_param("l1d_flush", l1d_flush_parse_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "GDS: " fmt + +enum gds_mitigations { + GDS_MITIGATION_OFF, + GDS_MITIGATION_UCODE_NEEDED, + GDS_MITIGATION_FORCE, + GDS_MITIGATION_FULL, + GDS_MITIGATION_FULL_LOCKED, + GDS_MITIGATION_HYPERVISOR, +}; + +#if IS_ENABLED(CONFIG_GDS_FORCE_MITIGATION) +static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE; +#else +static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL; +#endif + +static const char * const gds_strings[] = { + [GDS_MITIGATION_OFF] = "Vulnerable", + [GDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [GDS_MITIGATION_FORCE] = "Mitigation: AVX disabled, no microcode", + [GDS_MITIGATION_FULL] = "Mitigation: Microcode", + [GDS_MITIGATION_FULL_LOCKED] = "Mitigation: Microcode (locked)", + [GDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", +}; + +bool gds_ucode_mitigated(void) +{ + return (gds_mitigation == GDS_MITIGATION_FULL || + gds_mitigation == GDS_MITIGATION_FULL_LOCKED); +} +EXPORT_SYMBOL_GPL(gds_ucode_mitigated); + +void update_gds_msr(void) +{ + u64 mcu_ctrl_after; + u64 mcu_ctrl; + + switch (gds_mitigation) { + case GDS_MITIGATION_OFF: + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + mcu_ctrl |= GDS_MITG_DIS; + break; + case GDS_MITIGATION_FULL_LOCKED: + /* + * The LOCKED state comes from the boot CPU. APs might not have + * the same state. Make sure the mitigation is enabled on all + * CPUs. + */ + case GDS_MITIGATION_FULL: + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + mcu_ctrl &= ~GDS_MITG_DIS; + break; + case GDS_MITIGATION_FORCE: + case GDS_MITIGATION_UCODE_NEEDED: + case GDS_MITIGATION_HYPERVISOR: + return; + }; + + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + + /* + * Check to make sure that the WRMSR value was not ignored. Writes to + * GDS_MITG_DIS will be ignored if this processor is locked but the boot + * processor was not. + */ + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after); + WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after); +} + +static void __init gds_select_mitigation(void) +{ + u64 mcu_ctrl; + + if (!boot_cpu_has_bug(X86_BUG_GDS)) + return; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + gds_mitigation = GDS_MITIGATION_HYPERVISOR; + goto out; + } + + if (cpu_mitigations_off()) + gds_mitigation = GDS_MITIGATION_OFF; + /* Will verify below that mitigation _can_ be disabled */ + + /* No microcode */ + if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) { + if (gds_mitigation == GDS_MITIGATION_FORCE) { + /* + * This only needs to be done on the boot CPU so do it + * here rather than in update_gds_msr() + */ + setup_clear_cpu_cap(X86_FEATURE_AVX); + pr_warn("Microcode update needed! Disabling AVX as mitigation.\n"); + } else { + gds_mitigation = GDS_MITIGATION_UCODE_NEEDED; + } + goto out; + } + + /* Microcode has mitigation, use it */ + if (gds_mitigation == GDS_MITIGATION_FORCE) + gds_mitigation = GDS_MITIGATION_FULL; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + if (mcu_ctrl & GDS_MITG_LOCKED) { + if (gds_mitigation == GDS_MITIGATION_OFF) + pr_warn("Mitigation locked. Disable failed.\n"); + + /* + * The mitigation is selected from the boot CPU. All other CPUs + * _should_ have the same state. If the boot CPU isn't locked + * but others are then update_gds_msr() will WARN() of the state + * mismatch. If the boot CPU is locked update_gds_msr() will + * ensure the other CPUs have the mitigation enabled. + */ + gds_mitigation = GDS_MITIGATION_FULL_LOCKED; + } + + update_gds_msr(); +out: + pr_info("%s\n", gds_strings[gds_mitigation]); +} + +static int __init gds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_GDS)) + return 0; + + if (!strcmp(str, "off")) + gds_mitigation = GDS_MITIGATION_OFF; + else if (!strcmp(str, "force")) + gds_mitigation = GDS_MITIGATION_FORCE; + + return 0; +} +early_param("gather_data_sampling", gds_parse_cmdline); + +#undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt enum spectre_v1_mitigation { @@ -885,6 +1042,9 @@ do_cmd_auto: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); + if (IS_ENABLED(CONFIG_RETHUNK)) + x86_return_thunk = retbleed_return_thunk; + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) pr_err(RETBLEED_UNTRAIN_MSG); @@ -894,6 +1054,7 @@ do_cmd_auto: case RETBLEED_MITIGATION_IBPB: setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); mitigate_smt = true; break; @@ -1150,19 +1311,21 @@ spectre_v2_user_select_mitigation(void) } /* - * If no STIBP, enhanced IBRS is enabled, or SMT impossible, STIBP + * If no STIBP, Intel enhanced IBRS is enabled, or SMT impossible, STIBP * is not required. * - * Enhanced IBRS also protects against cross-thread branch target + * Intel's Enhanced IBRS also protects against cross-thread branch target * injection in user-mode as the IBRS bit remains always set which * implicitly enables cross-thread protections. However, in legacy IBRS * mode, the IBRS bit is set only on kernel entry and cleared on return - * to userspace. This disables the implicit cross-thread protection, - * so allow for STIBP to be selected in that case. + * to userspace. AMD Automatic IBRS also does not protect userspace. + * These modes therefore disable the implicit cross-thread protection, + * so allow for STIBP to be selected in those cases. */ if (!boot_cpu_has(X86_FEATURE_STIBP) || !smt_possible || - spectre_v2_in_eibrs_mode(spectre_v2_enabled)) + (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && + !boot_cpu_has(X86_FEATURE_AUTOIBRS))) return; /* @@ -2186,6 +2349,170 @@ static int __init l1tf_cmdline(char *str) early_param("l1tf", l1tf_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "Speculative Return Stack Overflow: " fmt + +enum srso_mitigation { + SRSO_MITIGATION_NONE, + SRSO_MITIGATION_MICROCODE, + SRSO_MITIGATION_SAFE_RET, + SRSO_MITIGATION_IBPB, + SRSO_MITIGATION_IBPB_ON_VMEXIT, +}; + +enum srso_mitigation_cmd { + SRSO_CMD_OFF, + SRSO_CMD_MICROCODE, + SRSO_CMD_SAFE_RET, + SRSO_CMD_IBPB, + SRSO_CMD_IBPB_ON_VMEXIT, +}; + +static const char * const srso_strings[] = { + [SRSO_MITIGATION_NONE] = "Vulnerable", + [SRSO_MITIGATION_MICROCODE] = "Mitigation: microcode", + [SRSO_MITIGATION_SAFE_RET] = "Mitigation: safe RET", + [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB", + [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only" +}; + +static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; +static enum srso_mitigation_cmd srso_cmd __ro_after_init = SRSO_CMD_SAFE_RET; + +static int __init srso_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + srso_cmd = SRSO_CMD_OFF; + else if (!strcmp(str, "microcode")) + srso_cmd = SRSO_CMD_MICROCODE; + else if (!strcmp(str, "safe-ret")) + srso_cmd = SRSO_CMD_SAFE_RET; + else if (!strcmp(str, "ibpb")) + srso_cmd = SRSO_CMD_IBPB; + else if (!strcmp(str, "ibpb-vmexit")) + srso_cmd = SRSO_CMD_IBPB_ON_VMEXIT; + else + pr_err("Ignoring unknown SRSO option (%s).", str); + + return 0; +} +early_param("spec_rstack_overflow", srso_parse_cmdline); + +#define SRSO_NOTICE "WARNING: See https://kernel.org/doc/html/latest/admin-guide/hw-vuln/srso.html for mitigation options." + +static void __init srso_select_mitigation(void) +{ + bool has_microcode; + + if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off()) + goto pred_cmd; + + /* + * The first check is for the kernel running as a guest in order + * for guests to verify whether IBPB is a viable mitigation. + */ + has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) || cpu_has_ibpb_brtype_microcode(); + if (!has_microcode) { + pr_warn("IBPB-extending microcode not applied!\n"); + pr_warn(SRSO_NOTICE); + } else { + /* + * Enable the synthetic (even if in a real CPUID leaf) + * flags for guests. + */ + setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); + + /* + * Zen1/2 with SMT off aren't vulnerable after the right + * IBPB microcode has been applied. + */ + if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { + setup_force_cpu_cap(X86_FEATURE_SRSO_NO); + return; + } + } + + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { + if (has_microcode) { + pr_err("Retbleed IBPB mitigation enabled, using same for SRSO\n"); + srso_mitigation = SRSO_MITIGATION_IBPB; + goto pred_cmd; + } + } + + switch (srso_cmd) { + case SRSO_CMD_OFF: + return; + + case SRSO_CMD_MICROCODE: + if (has_microcode) { + srso_mitigation = SRSO_MITIGATION_MICROCODE; + pr_warn(SRSO_NOTICE); + } + break; + + case SRSO_CMD_SAFE_RET: + if (IS_ENABLED(CONFIG_CPU_SRSO)) { + /* + * Enable the return thunk for generated code + * like ftrace, static_call, etc. + */ + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_UNRET); + + if (boot_cpu_data.x86 == 0x19) { + setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); + x86_return_thunk = srso_alias_return_thunk; + } else { + setup_force_cpu_cap(X86_FEATURE_SRSO); + x86_return_thunk = srso_return_thunk; + } + srso_mitigation = SRSO_MITIGATION_SAFE_RET; + } else { + pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); + goto pred_cmd; + } + break; + + case SRSO_CMD_IBPB: + if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + if (has_microcode) { + setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + srso_mitigation = SRSO_MITIGATION_IBPB; + } + } else { + pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + goto pred_cmd; + } + break; + + case SRSO_CMD_IBPB_ON_VMEXIT: + if (IS_ENABLED(CONFIG_CPU_SRSO)) { + if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); + srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; + } + } else { + pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); + goto pred_cmd; + } + break; + + default: + break; + } + + pr_info("%s%s\n", srso_strings[srso_mitigation], (has_microcode ? "" : ", no microcode")); + +pred_cmd: + if ((boot_cpu_has(X86_FEATURE_SRSO_NO) || srso_cmd == SRSO_CMD_OFF) && + boot_cpu_has(X86_FEATURE_SBPB)) + x86_pred_cmd = PRED_CMD_SBPB; +} + +#undef pr_fmt #define pr_fmt(fmt) fmt #ifdef CONFIG_SYSFS @@ -2294,7 +2621,8 @@ static ssize_t mmio_stale_data_show_state(char *buf) static char *stibp_state(void) { - if (spectre_v2_in_eibrs_mode(spectre_v2_enabled)) + if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && + !boot_cpu_has(X86_FEATURE_AUTOIBRS)) return ""; switch (spectre_v2_user_stibp) { @@ -2382,6 +2710,21 @@ static ssize_t retbleed_show_state(char *buf) return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]); } +static ssize_t srso_show_state(char *buf) +{ + if (boot_cpu_has(X86_FEATURE_SRSO_NO)) + return sysfs_emit(buf, "Mitigation: SMT disabled\n"); + + return sysfs_emit(buf, "%s%s\n", + srso_strings[srso_mitigation], + (cpu_has_ibpb_brtype_microcode() ? "" : ", no microcode")); +} + +static ssize_t gds_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -2431,6 +2774,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_RETBLEED: return retbleed_show_state(buf); + case X86_BUG_SRSO: + return srso_show_state(buf); + + case X86_BUG_GDS: + return gds_show_state(buf); + default: break; } @@ -2495,4 +2844,14 @@ ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, cha { return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED); } + +ssize_t cpu_show_spec_rstack_overflow(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SRSO); +} + +ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_GDS); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 52683fddafaf..382d4e6b848d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -59,7 +59,6 @@ #include <asm/cacheinfo.h> #include <asm/memtype.h> #include <asm/microcode.h> -#include <asm/microcode_intel.h> #include <asm/intel-family.h> #include <asm/cpu_device_id.h> #include <asm/uv/uv.h> @@ -588,27 +587,43 @@ __noendbr void ibt_restore(u64 save) static __always_inline void setup_cet(struct cpuinfo_x86 *c) { - u64 msr = CET_ENDBR_EN; + bool user_shstk, kernel_ibt; - if (!HAS_KERNEL_IBT || - !cpu_feature_enabled(X86_FEATURE_IBT)) + if (!IS_ENABLED(CONFIG_X86_CET)) return; - wrmsrl(MSR_IA32_S_CET, msr); + kernel_ibt = HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT); + user_shstk = cpu_feature_enabled(X86_FEATURE_SHSTK) && + IS_ENABLED(CONFIG_X86_USER_SHADOW_STACK); + + if (!kernel_ibt && !user_shstk) + return; + + if (user_shstk) + set_cpu_cap(c, X86_FEATURE_USER_SHSTK); + + if (kernel_ibt) + wrmsrl(MSR_IA32_S_CET, CET_ENDBR_EN); + else + wrmsrl(MSR_IA32_S_CET, 0); + cr4_set_bits(X86_CR4_CET); - if (!ibt_selftest()) { + if (kernel_ibt && ibt_selftest()) { pr_err("IBT selftest: Failed!\n"); wrmsrl(MSR_IA32_S_CET, 0); setup_clear_cpu_cap(X86_FEATURE_IBT); - return; } } __noendbr void cet_disable(void) { - if (cpu_feature_enabled(X86_FEATURE_IBT)) - wrmsrl(MSR_IA32_S_CET, 0); + if (!(cpu_feature_enabled(X86_FEATURE_IBT) || + cpu_feature_enabled(X86_FEATURE_SHSTK))) + return; + + wrmsrl(MSR_IA32_S_CET, 0); + wrmsrl(MSR_IA32_U_CET, 0); } /* @@ -1250,6 +1265,10 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define RETBLEED BIT(3) /* CPU is affected by SMT (cross-thread) return predictions */ #define SMT_RSB BIT(4) +/* CPU is affected by SRSO */ +#define SRSO BIT(5) +/* CPU is affected by GDS */ +#define GDS BIT(6) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), @@ -1261,28 +1280,31 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), - VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED), - VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), - VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), - VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS), + VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS), + VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED), - VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS), + VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS), VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), - VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS), VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS), VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), - VULNBL_AMD(0x17, RETBLEED | SMT_RSB), + VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO), VULNBL_HYGON(0x18, RETBLEED | SMT_RSB), + VULNBL_AMD(0x19, SRSO), {} }; @@ -1406,6 +1428,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (cpu_matches(cpu_vuln_blacklist, SMT_RSB)) setup_force_cpu_bug(X86_BUG_SMT_RSB); + if (!cpu_has(c, X86_FEATURE_SRSO_NO)) { + if (cpu_matches(cpu_vuln_blacklist, SRSO)) + setup_force_cpu_bug(X86_BUG_SRSO); + } + + /* + * Check if CPU is vulnerable to GDS. If running in a virtual machine on + * an affected processor, the VMM may have disabled the use of GATHER by + * disabling AVX2. The only way to do this in HW is to clear XCR0[2], + * which means that AVX will be disabled. + */ + if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) && + boot_cpu_has(X86_FEATURE_AVX)) + setup_force_cpu_bug(X86_BUG_GDS); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; @@ -1470,6 +1507,9 @@ static void __init cpu_parse_early_param(void) if (cmdline_find_option_bool(boot_command_line, "noxsaves")) setup_clear_cpu_cap(X86_FEATURE_XSAVES); + if (cmdline_find_option_bool(boot_command_line, "nousershstk")) + setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK); + arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); if (arglen <= 0) return; @@ -1937,7 +1977,7 @@ void enable_sep_cpu(void) } #endif -void __init identify_boot_cpu(void) +static __init void identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) @@ -1962,6 +2002,8 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); update_srbds_msr(); + if (boot_cpu_has_bug(X86_BUG_GDS)) + update_gds_msr(); tsx_ap_init(); } @@ -2276,8 +2318,7 @@ void store_cpu_caps(struct cpuinfo_x86 *curr_info) * @prev_info: CPU capabilities stored before an update. * * The microcode loader calls this upon late microcode load to recheck features, - * only when microcode has been updated. Caller holds microcode_mutex and CPU - * hotplug lock. + * only when microcode has been updated. Caller holds and CPU hotplug lock. * * Return: None */ @@ -2287,6 +2328,8 @@ void microcode_check(struct cpuinfo_x86 *prev_info) perf_check_microcode(); + amd_check_microcode(); + store_cpu_caps(&curr_info); if (!memcmp(&prev_info->x86_capability, &curr_info.x86_capability, @@ -2317,7 +2360,7 @@ void __init arch_cpu_finalize_init(void) * identify_boot_cpu() initialized SMT support information, let the * core code know. */ - cpu_smt_check_topology(); + cpu_smt_set_num_threads(smp_num_siblings, smp_num_siblings); if (!IS_ENABLED(CONFIG_SMP)) { pr_info("CPU: "); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 1c44630d4789..1dcd7d4e38ef 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -83,6 +83,7 @@ void cpu_select_mitigations(void); extern void x86_spec_ctrl_setup_ap(void); extern void update_srbds_msr(void); +extern void update_gds_msr(void); extern enum spectre_v2_mitigation spectre_v2_enabled; diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index f6748c8bd647..e462c1d3800a 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -81,6 +81,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, + { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, {} }; diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 5a2962c492d3..defdc594be14 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -8,6 +8,7 @@ */ #include <linux/io.h> +#include <asm/apic.h> #include <asm/cpu.h> #include <asm/smp.h> #include <asm/numa.h> @@ -300,7 +301,7 @@ static void init_hygon(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* get apicid instead of initial apic id from cpuid */ - c->apicid = hard_smp_processor_id(); + c->apicid = read_apic_id(); /* * XXX someone from Hygon needs to confirm this DTRT diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 1c4639588ff9..be4045628fd3 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -20,7 +20,7 @@ #include <asm/bugs.h> #include <asm/cpu.h> #include <asm/intel-family.h> -#include <asm/microcode_intel.h> +#include <asm/microcode.h> #include <asm/hwcap2.h> #include <asm/elf.h> #include <asm/cpu_device_id.h> @@ -184,180 +184,6 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) return false; } -int intel_cpu_collect_info(struct ucode_cpu_info *uci) -{ - unsigned int val[2]; - unsigned int family, model; - struct cpu_signature csig = { 0 }; - unsigned int eax, ebx, ecx, edx; - - memset(uci, 0, sizeof(*uci)); - - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - csig.sig = eax; - - family = x86_family(eax); - model = x86_model(eax); - - if (model >= 5 || family > 6) { - /* get processor flags from MSR 0x17 */ - native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - csig.pf = 1 << ((val[1] >> 18) & 7); - } - - csig.rev = intel_get_microcode_revision(); - - uci->cpu_sig = csig; - - return 0; -} -EXPORT_SYMBOL_GPL(intel_cpu_collect_info); - -/* - * Returns 1 if update has been found, 0 otherwise. - */ -int intel_find_matching_signature(void *mc, unsigned int csig, int cpf) -{ - struct microcode_header_intel *mc_hdr = mc; - struct extended_sigtable *ext_hdr; - struct extended_signature *ext_sig; - int i; - - if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) - return 1; - - /* Look for ext. headers: */ - if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE) - return 0; - - ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE; - ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; - - for (i = 0; i < ext_hdr->count; i++) { - if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) - return 1; - ext_sig++; - } - return 0; -} -EXPORT_SYMBOL_GPL(intel_find_matching_signature); - -/** - * intel_microcode_sanity_check() - Sanity check microcode file. - * @mc: Pointer to the microcode file contents. - * @print_err: Display failure reason if true, silent if false. - * @hdr_type: Type of file, i.e. normal microcode file or In Field Scan file. - * Validate if the microcode header type matches with the type - * specified here. - * - * Validate certain header fields and verify if computed checksum matches - * with the one specified in the header. - * - * Return: 0 if the file passes all the checks, -EINVAL if any of the checks - * fail. - */ -int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type) -{ - unsigned long total_size, data_size, ext_table_size; - struct microcode_header_intel *mc_header = mc; - struct extended_sigtable *ext_header = NULL; - u32 sum, orig_sum, ext_sigcount = 0, i; - struct extended_signature *ext_sig; - - total_size = get_totalsize(mc_header); - data_size = get_datasize(mc_header); - - if (data_size + MC_HEADER_SIZE > total_size) { - if (print_err) - pr_err("Error: bad microcode data file size.\n"); - return -EINVAL; - } - - if (mc_header->ldrver != 1 || mc_header->hdrver != hdr_type) { - if (print_err) - pr_err("Error: invalid/unknown microcode update format. Header type %d\n", - mc_header->hdrver); - return -EINVAL; - } - - ext_table_size = total_size - (MC_HEADER_SIZE + data_size); - if (ext_table_size) { - u32 ext_table_sum = 0; - u32 *ext_tablep; - - if (ext_table_size < EXT_HEADER_SIZE || - ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - if (print_err) - pr_err("Error: truncated extended signature table.\n"); - return -EINVAL; - } - - ext_header = mc + MC_HEADER_SIZE + data_size; - if (ext_table_size != exttable_size(ext_header)) { - if (print_err) - pr_err("Error: extended signature table size mismatch.\n"); - return -EFAULT; - } - - ext_sigcount = ext_header->count; - - /* - * Check extended table checksum: the sum of all dwords that - * comprise a valid table must be 0. - */ - ext_tablep = (u32 *)ext_header; - - i = ext_table_size / sizeof(u32); - while (i--) - ext_table_sum += ext_tablep[i]; - - if (ext_table_sum) { - if (print_err) - pr_warn("Bad extended signature table checksum, aborting.\n"); - return -EINVAL; - } - } - - /* - * Calculate the checksum of update data and header. The checksum of - * valid update data and header including the extended signature table - * must be 0. - */ - orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / sizeof(u32); - while (i--) - orig_sum += ((u32 *)mc)[i]; - - if (orig_sum) { - if (print_err) - pr_err("Bad microcode data checksum, aborting.\n"); - return -EINVAL; - } - - if (!ext_table_size) - return 0; - - /* - * Check extended signature checksum: 0 => valid. - */ - for (i = 0; i < ext_sigcount; i++) { - ext_sig = (void *)ext_header + EXT_HEADER_SIZE + - EXT_SIGNATURE_SIZE * i; - - sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - - (ext_sig->sig + ext_sig->pf + ext_sig->cksum); - if (sum) { - if (print_err) - pr_err("Bad extended signature checksum, aborting.\n"); - return -EINVAL; - } - } - return 0; -} -EXPORT_SYMBOL_GPL(intel_microcode_sanity_check); - static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index 3b8476158236..e4c3ba91321c 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -206,7 +206,7 @@ static int intel_epb_offline(unsigned int cpu) static const struct x86_cpu_id intel_epb_normal[] = { X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, ENERGY_PERF_BIAS_NORMAL_POWERSAVE), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, ENERGY_PERF_BIAS_NORMAL_POWERSAVE), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, ENERGY_PERF_BIAS_NORMAL_POWERSAVE), diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 5e74610b39e7..c267f43de39e 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -759,7 +759,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) inc_irq_stat(irq_deferred_error_count); deferred_error_int_vector(); trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); - ack_APIC_irq(); + apic_eoi(); } /* @@ -1261,10 +1261,10 @@ static void __threshold_remove_blocks(struct threshold_bank *b) struct threshold_block *pos = NULL; struct threshold_block *tmp = NULL; - kobject_del(b->kobj); + kobject_put(b->kobj); list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj) - kobject_del(&pos->kobj); + kobject_put(b->kobj); } static void threshold_remove_bank(struct threshold_bank *bank) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 89e2aab5d34d..6f35f724cc14 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -843,6 +843,26 @@ static noinstr bool quirk_skylake_repmov(void) } /* + * Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption + * errors. This means mce_gather_info() will not save the "ip" and "cs" registers. + * + * However, the context is still valid, so save the "cs" register for later use. + * + * The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV. + * + * The Instruction Fetch Unit is at MCA bank 1 for all affected systems. + */ +static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_regs *regs) +{ + if (bank != 1) + return; + if (!(m->status & MCI_STATUS_POISON)) + return; + + m->cs = regs->cs; +} + +/* * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ @@ -861,6 +881,9 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo if (mce_flags.snb_ifu_quirk) quirk_sandybridge_ifu(i, m, regs); + if (mce_flags.zen_ifu_quirk) + quirk_zen_ifu(i, m, regs); + m->bank = i; if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) { mce_read_aux(m, i); @@ -1608,6 +1631,13 @@ static void __start_timer(struct timer_list *t, unsigned long interval) local_irq_restore(flags); } +static void mc_poll_banks_default(void) +{ + machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); +} + +void (*mc_poll_banks)(void) = mc_poll_banks_default; + static void mce_timer_fn(struct timer_list *t) { struct timer_list *cpu_t = this_cpu_ptr(&mce_timer); @@ -1618,7 +1648,7 @@ static void mce_timer_fn(struct timer_list *t) iv = __this_cpu_read(mce_next_interval); if (mce_available(this_cpu_ptr(&cpu_info))) { - machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); + mc_poll_banks(); if (mce_intel_cmci_poll()) { iv = mce_adjust_timer(iv); @@ -1842,6 +1872,9 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 0x15 && c->x86_model <= 0xf) mce_flags.overflow_recov = 1; + if (c->x86 >= 0x17 && c->x86 <= 0x1A) + mce_flags.zen_ifu_quirk = 1; + } if (c->x86_vendor == X86_VENDOR_INTEL) { diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 12cf2e7ca33c..4d8d4bcf915d 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -270,8 +270,7 @@ static void __maybe_unused raise_mce(struct mce *m) mce_irq_ipi, NULL, 0); preempt_enable(); } else if (m->inject_flags & MCJ_NMI_BROADCAST) - apic->send_IPI_mask(mce_inject_cpumask, - NMI_VECTOR); + __apic_send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); } start = jiffies; while (!cpumask_empty(mce_inject_cpumask)) { diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 95275a5e57e0..f5323551c1a9 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -56,6 +56,13 @@ static DEFINE_PER_CPU(int, cmci_backoff_cnt); */ static DEFINE_RAW_SPINLOCK(cmci_discover_lock); +/* + * On systems that do support CMCI but it's disabled, polling for MCEs can + * cause the same event to be reported multiple times because IA32_MCi_STATUS + * is shared by the same package. + */ +static DEFINE_SPINLOCK(cmci_poll_lock); + #define CMCI_THRESHOLD 1 #define CMCI_POLL_INTERVAL (30 * HZ) #define CMCI_STORM_INTERVAL (HZ) @@ -426,12 +433,22 @@ void cmci_disable_bank(int bank) raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } +/* Bank polling function when CMCI is disabled. */ +static void cmci_mc_poll_banks(void) +{ + spin_lock(&cmci_poll_lock); + machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); + spin_unlock(&cmci_poll_lock); +} + void intel_init_cmci(void) { int banks; - if (!cmci_supported(&banks)) + if (!cmci_supported(&banks)) { + mc_poll_banks = cmci_mc_poll_banks; return; + } mce_threshold_vector = intel_threshold_interrupt; cmci_discover(banks); diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index d2412ce2d312..bcf1b3c66c9c 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -157,6 +157,9 @@ struct mce_vendor_flags { */ smca : 1, + /* Zen IFU quirk */ + zen_ifu_quirk : 1, + /* AMD-style error thresholding banks present. */ amd_threshold : 1, @@ -172,7 +175,7 @@ struct mce_vendor_flags { /* Skylake, Cascade Lake, Cooper Lake REP;MOVS* quirk */ skx_repmov_quirk : 1, - __reserved_0 : 56; + __reserved_0 : 55; }; extern struct mce_vendor_flags mce_flags; @@ -274,4 +277,5 @@ static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) return 0; } +extern void (*mc_poll_banks)(void); #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index 6a059a035021..ef4e7bb5fd88 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -27,5 +27,5 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_threshold) inc_irq_stat(irq_threshold_count); mce_threshold_vector(); trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); - ack_APIC_irq(); + apic_eoi(); } diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile index 34098d48c48f..193d98b33a0a 100644 --- a/arch/x86/kernel/cpu/microcode/Makefile +++ b/arch/x86/kernel/cpu/microcode/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only microcode-y := core.o obj-$(CONFIG_MICROCODE) += microcode.o -microcode-$(CONFIG_MICROCODE_INTEL) += intel.o -microcode-$(CONFIG_MICROCODE_AMD) += amd.o +microcode-$(CONFIG_CPU_SUP_INTEL) += intel.o +microcode-$(CONFIG_CPU_SUP_AMD) += amd.o diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 87208e46f7ed..bbd1dc38ea03 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -29,13 +29,53 @@ #include <linux/kernel.h> #include <linux/pci.h> -#include <asm/microcode_amd.h> #include <asm/microcode.h> #include <asm/processor.h> #include <asm/setup.h> #include <asm/cpu.h> #include <asm/msr.h> +#include "internal.h" + +#define UCODE_MAGIC 0x00414d44 +#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 +#define UCODE_UCODE_TYPE 0x00000001 + +#define SECTION_HDR_SIZE 8 +#define CONTAINER_HDR_SZ 12 + +struct equiv_cpu_entry { + u32 installed_cpu; + u32 fixed_errata_mask; + u32 fixed_errata_compare; + u16 equiv_cpu; + u16 res; +} __packed; + +struct microcode_header_amd { + u32 data_code; + u32 patch_id; + u16 mc_patch_data_id; + u8 mc_patch_data_len; + u8 init_flag; + u32 mc_patch_data_checksum; + u32 nb_dev_id; + u32 sb_dev_id; + u16 processor_rev_id; + u8 nb_rev_id; + u8 sb_rev_id; + u8 bios_api_rev; + u8 reserved1[3]; + u32 match_reg[8]; +} __packed; + +struct microcode_amd { + struct microcode_header_amd hdr; + unsigned int mpb[]; +}; + +#define PATCH_MAX_SIZE (3 * PAGE_SIZE) + static struct equiv_cpu_table { unsigned int num_entries; struct equiv_cpu_entry *entry; @@ -56,9 +96,6 @@ struct cont_desc { static u32 ucode_new_rev; -/* One blob per node. */ -static u8 amd_ucode_patch[MAX_NUMNODES][PATCH_MAX_SIZE]; - /* * Microcode patch container file is prepended to the initrd in cpio * format. See Documentation/arch/x86/microcode.rst @@ -415,20 +452,17 @@ static int __apply_microcode_amd(struct microcode_amd *mc) * * Returns true if container found (sets @desc), false otherwise. */ -static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch) +static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size) { struct cont_desc desc = { 0 }; - u8 (*patch)[PATCH_MAX_SIZE]; struct microcode_amd *mc; u32 rev, dummy, *new_rev; bool ret = false; #ifdef CONFIG_X86_32 new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); - patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); #else new_rev = &ucode_new_rev; - patch = &amd_ucode_patch[0]; #endif desc.cpuid_1_eax = cpuid_1_eax; @@ -452,9 +486,6 @@ static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size, boo if (!__apply_microcode_amd(mc)) { *new_rev = mc->hdr.patch_id; ret = true; - - if (save_patch) - memcpy(patch, mc, min_t(u32, desc.psize, PATCH_MAX_SIZE)); } return ret; @@ -507,7 +538,7 @@ static void find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret = cp; } -void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) +static void apply_ucode_from_containers(unsigned int cpuid_1_eax) { struct cpio_data cp = { }; @@ -515,42 +546,12 @@ void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) if (!(cp.data && cp.size)) return; - early_apply_microcode(cpuid_1_eax, cp.data, cp.size, true); + early_apply_microcode(cpuid_1_eax, cp.data, cp.size); } -void load_ucode_amd_ap(unsigned int cpuid_1_eax) +void load_ucode_amd_early(unsigned int cpuid_1_eax) { - struct microcode_amd *mc; - struct cpio_data cp; - u32 *new_rev, rev, dummy; - - if (IS_ENABLED(CONFIG_X86_32)) { - mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); - new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); - } else { - mc = (struct microcode_amd *)amd_ucode_patch; - new_rev = &ucode_new_rev; - } - - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - - /* - * Check whether a new patch has been saved already. Also, allow application of - * the same revision in order to pick up SMT-thread-specific configuration even - * if the sibling SMT thread already has an up-to-date revision. - */ - if (*new_rev && rev <= mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) { - *new_rev = mc->hdr.patch_id; - return; - } - } - - find_blobs_in_containers(cpuid_1_eax, &cp); - if (!(cp.data && cp.size)) - return; - - early_apply_microcode(cpuid_1_eax, cp.data, cp.size, false); + return apply_ucode_from_containers(cpuid_1_eax); } static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); @@ -578,23 +579,6 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) return 0; } -void reload_ucode_amd(unsigned int cpu) -{ - u32 rev, dummy __always_unused; - struct microcode_amd *mc; - - mc = (struct microcode_amd *)amd_ucode_patch[cpu_to_node(cpu)]; - - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - - if (rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) { - ucode_new_rev = mc->hdr.patch_id; - pr_info("reload patch_level=0x%08x\n", ucode_new_rev); - } - } -} - /* * a small, trivial cache of per-family ucode patches */ @@ -655,6 +639,28 @@ static struct ucode_patch *find_patch(unsigned int cpu) return cache_find_patch(equiv_id); } +void reload_ucode_amd(unsigned int cpu) +{ + u32 rev, dummy __always_unused; + struct microcode_amd *mc; + struct ucode_patch *p; + + p = find_patch(cpu); + if (!p) + return; + + mc = p->data; + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + if (rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) { + ucode_new_rev = mc->hdr.patch_id; + pr_info("reload patch_level=0x%08x\n", ucode_new_rev); + } + } +} + static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -875,9 +881,6 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz continue; ret = UCODE_NEW; - - memset(&amd_ucode_patch[nid], 0, PATCH_MAX_SIZE); - memcpy(&amd_ucode_patch[nid], p->data, min_t(u32, p->size, PATCH_MAX_SIZE)); } return ret; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 3afcf3de0dd4..6cc7a2c181da 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -31,15 +31,14 @@ #include <linux/fs.h> #include <linux/mm.h> -#include <asm/microcode_intel.h> #include <asm/cpu_device_id.h> -#include <asm/microcode_amd.h> #include <asm/perf_event.h> -#include <asm/microcode.h> #include <asm/processor.h> #include <asm/cmdline.h> #include <asm/setup.h> +#include "internal.h" + #define DRIVER_VERSION "2.2" static struct microcode_ops *microcode_ops; @@ -54,15 +53,12 @@ LIST_HEAD(microcode_cache); * * All non cpu-hotplug-callback call sites use: * - * - microcode_mutex to synchronize with each other; * - cpus_read_lock/unlock() to synchronize with * the cpu-hotplug-callback call sites. * * We guarantee that only a single cpu is being * updated at any particular moment of time. */ -static DEFINE_MUTEX(microcode_mutex); - struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; struct cpu_info_ctx { @@ -172,7 +168,7 @@ void __init load_ucode_bsp(void) if (intel) load_ucode_intel_bsp(); else - load_ucode_amd_bsp(cpuid_1_eax); + load_ucode_amd_early(cpuid_1_eax); } static bool check_loader_disabled_ap(void) @@ -200,7 +196,7 @@ void load_ucode_ap(void) break; case X86_VENDOR_AMD: if (x86_family(cpuid_1_eax) >= 0x10) - load_ucode_amd_ap(cpuid_1_eax); + load_ucode_amd_early(cpuid_1_eax); break; default: break; @@ -298,7 +294,7 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) #endif } -void reload_early_microcode(unsigned int cpu) +static void reload_early_microcode(unsigned int cpu) { int vendor, family; @@ -488,10 +484,7 @@ static ssize_t reload_store(struct device *dev, if (tmp_ret != UCODE_NEW) goto put; - mutex_lock(µcode_mutex); ret = microcode_reload_late(); - mutex_unlock(µcode_mutex); - put: cpus_read_unlock(); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 467cf37ea90a..94dd6af9c963 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -10,15 +10,7 @@ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> * H Peter Anvin" <hpa@zytor.com> */ - -/* - * This needs to be before all headers so that pr_debug in printk.h doesn't turn - * printk calls into no_printk(). - * - *#define DEBUG - */ #define pr_fmt(fmt) "microcode: " fmt - #include <linux/earlycpio.h> #include <linux/firmware.h> #include <linux/uaccess.h> @@ -30,13 +22,14 @@ #include <linux/uio.h> #include <linux/mm.h> -#include <asm/microcode_intel.h> #include <asm/intel-family.h> #include <asm/processor.h> #include <asm/tlbflush.h> #include <asm/setup.h> #include <asm/msr.h> +#include "internal.h" + static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; /* Current microcode patch used in early patching on the APs. */ @@ -45,6 +38,208 @@ static struct microcode_intel *intel_ucode_patch; /* last level cache size per core */ static int llc_size_per_core; +/* microcode format is extended from prescott processors */ +struct extended_signature { + unsigned int sig; + unsigned int pf; + unsigned int cksum; +}; + +struct extended_sigtable { + unsigned int count; + unsigned int cksum; + unsigned int reserved[3]; + struct extended_signature sigs[]; +}; + +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) +#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) +#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) + +static inline unsigned int get_totalsize(struct microcode_header_intel *hdr) +{ + return hdr->datasize ? hdr->totalsize : DEFAULT_UCODE_TOTALSIZE; +} + +static inline unsigned int exttable_size(struct extended_sigtable *et) +{ + return et->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE; +} + +int intel_cpu_collect_info(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + unsigned int family, model; + struct cpu_signature csig = { 0 }; + unsigned int eax, ebx, ecx, edx; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + family = x86_family(eax); + model = x86_model(eax); + + if (model >= 5 || family > 6) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + + csig.rev = intel_get_microcode_revision(); + + uci->cpu_sig = csig; + + return 0; +} +EXPORT_SYMBOL_GPL(intel_cpu_collect_info); + +/* + * Returns 1 if update has been found, 0 otherwise. + */ +int intel_find_matching_signature(void *mc, unsigned int csig, int cpf) +{ + struct microcode_header_intel *mc_hdr = mc; + struct extended_sigtable *ext_hdr; + struct extended_signature *ext_sig; + int i; + + if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) + return 1; + + /* Look for ext. headers: */ + if (get_totalsize(mc_hdr) <= intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE) + return 0; + + ext_hdr = mc + intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE; + ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; + + for (i = 0; i < ext_hdr->count; i++) { + if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) + return 1; + ext_sig++; + } + return 0; +} +EXPORT_SYMBOL_GPL(intel_find_matching_signature); + +/** + * intel_microcode_sanity_check() - Sanity check microcode file. + * @mc: Pointer to the microcode file contents. + * @print_err: Display failure reason if true, silent if false. + * @hdr_type: Type of file, i.e. normal microcode file or In Field Scan file. + * Validate if the microcode header type matches with the type + * specified here. + * + * Validate certain header fields and verify if computed checksum matches + * with the one specified in the header. + * + * Return: 0 if the file passes all the checks, -EINVAL if any of the checks + * fail. + */ +int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type) +{ + unsigned long total_size, data_size, ext_table_size; + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + u32 sum, orig_sum, ext_sigcount = 0, i; + struct extended_signature *ext_sig; + + total_size = get_totalsize(mc_header); + data_size = intel_microcode_get_datasize(mc_header); + + if (data_size + MC_HEADER_SIZE > total_size) { + if (print_err) + pr_err("Error: bad microcode data file size.\n"); + return -EINVAL; + } + + if (mc_header->ldrver != 1 || mc_header->hdrver != hdr_type) { + if (print_err) + pr_err("Error: invalid/unknown microcode update format. Header type %d\n", + mc_header->hdrver); + return -EINVAL; + } + + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + u32 ext_table_sum = 0; + u32 *ext_tablep; + + if (ext_table_size < EXT_HEADER_SIZE || + ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + if (print_err) + pr_err("Error: truncated extended signature table.\n"); + return -EINVAL; + } + + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + if (print_err) + pr_err("Error: extended signature table size mismatch.\n"); + return -EFAULT; + } + + ext_sigcount = ext_header->count; + + /* + * Check extended table checksum: the sum of all dwords that + * comprise a valid table must be 0. + */ + ext_tablep = (u32 *)ext_header; + + i = ext_table_size / sizeof(u32); + while (i--) + ext_table_sum += ext_tablep[i]; + + if (ext_table_sum) { + if (print_err) + pr_warn("Bad extended signature table checksum, aborting.\n"); + return -EINVAL; + } + } + + /* + * Calculate the checksum of update data and header. The checksum of + * valid update data and header including the extended signature table + * must be 0. + */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / sizeof(u32); + while (i--) + orig_sum += ((u32 *)mc)[i]; + + if (orig_sum) { + if (print_err) + pr_err("Bad microcode data checksum, aborting.\n"); + return -EINVAL; + } + + if (!ext_table_size) + return 0; + + /* + * Check extended signature checksum: 0 => valid. + */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; + + sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + if (print_err) + pr_err("Bad extended signature checksum, aborting.\n"); + return -EINVAL; + } + } + return 0; +} +EXPORT_SYMBOL_GPL(intel_microcode_sanity_check); + /* * Returns 1 if update has been found, 0 otherwise. */ @@ -202,86 +397,6 @@ next: return patch; } -static void show_saved_mc(void) -{ -#ifdef DEBUG - int i = 0, j; - unsigned int sig, pf, rev, total_size, data_size, date; - struct ucode_cpu_info uci; - struct ucode_patch *p; - - if (list_empty(µcode_cache)) { - pr_debug("no microcode data saved.\n"); - return; - } - - intel_cpu_collect_info(&uci); - - sig = uci.cpu_sig.sig; - pf = uci.cpu_sig.pf; - rev = uci.cpu_sig.rev; - pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); - - list_for_each_entry(p, µcode_cache, plist) { - struct microcode_header_intel *mc_saved_header; - struct extended_sigtable *ext_header; - struct extended_signature *ext_sig; - int ext_sigcount; - - mc_saved_header = (struct microcode_header_intel *)p->data; - - sig = mc_saved_header->sig; - pf = mc_saved_header->pf; - rev = mc_saved_header->rev; - date = mc_saved_header->date; - - total_size = get_totalsize(mc_saved_header); - data_size = get_datasize(mc_saved_header); - - pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n", - i++, sig, pf, rev, total_size, - date & 0xffff, - date >> 24, - (date >> 16) & 0xff); - - /* Look for ext. headers: */ - if (total_size <= data_size + MC_HEADER_SIZE) - continue; - - ext_header = (void *)mc_saved_header + data_size + MC_HEADER_SIZE; - ext_sigcount = ext_header->count; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; - - for (j = 0; j < ext_sigcount; j++) { - sig = ext_sig->sig; - pf = ext_sig->pf; - - pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n", - j, sig, pf); - - ext_sig++; - } - } -#endif -} - -/* - * Save this microcode patch. It will be loaded early when a CPU is - * hot-added or resumes. - */ -static void save_mc_for_early(struct ucode_cpu_info *uci, u8 *mc, unsigned int size) -{ - /* Synchronization during CPU hotplug. */ - static DEFINE_MUTEX(x86_cpu_microcode_mutex); - - mutex_lock(&x86_cpu_microcode_mutex); - - save_microcode_patch(uci, mc, size); - show_saved_mc(); - - mutex_unlock(&x86_cpu_microcode_mutex); -} - static bool load_builtin_intel_microcode(struct cpio_data *cp) { unsigned int eax = 1, ebx, ecx = 0, edx; @@ -428,9 +543,6 @@ int __init save_microcode_in_initrd_intel(void) intel_cpu_collect_info(&uci); scan_microcode(cp.data, cp.size, &uci, true); - - show_saved_mc(); - return 0; } @@ -701,12 +813,8 @@ static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter) vfree(uci->mc); uci->mc = (struct microcode_intel *)new_mc; - /* - * If early loading microcode is supported, save this mc into - * permanent memory. So it will be loaded early when a CPU is hot added - * or resumes. - */ - save_mc_for_early(uci, new_mc, new_mc_size); + /* Save for CPU hotplug */ + save_microcode_patch(uci, new_mc, new_mc_size); pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h new file mode 100644 index 000000000000..bf883aa71233 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_MICROCODE_INTERNAL_H +#define _X86_MICROCODE_INTERNAL_H + +#include <linux/earlycpio.h> +#include <linux/initrd.h> + +#include <asm/cpu.h> +#include <asm/microcode.h> + +struct ucode_patch { + struct list_head plist; + void *data; /* Intel uses only this one */ + unsigned int size; + u32 patch_id; + u16 equiv_cpu; +}; + +extern struct list_head microcode_cache; + +struct device; + +enum ucode_state { + UCODE_OK = 0, + UCODE_NEW, + UCODE_UPDATED, + UCODE_NFOUND, + UCODE_ERROR, +}; + +struct microcode_ops { + enum ucode_state (*request_microcode_fw)(int cpu, struct device *dev); + + void (*microcode_fini_cpu)(int cpu); + + /* + * The generic 'microcode_core' part guarantees that + * the callbacks below run on a target cpu when they + * are being called. + * See also the "Synchronization" section in microcode_core.c. + */ + enum ucode_state (*apply_microcode)(int cpu); + int (*collect_cpu_info)(int cpu, struct cpu_signature *csig); +}; + +extern struct ucode_cpu_info ucode_cpu_info[]; +struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa); + +#define MAX_UCODE_COUNT 128 + +#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) +#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') +#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') +#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') +#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') +#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') +#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') + +#define CPUID_IS(a, b, c, ebx, ecx, edx) \ + (!(((ebx) ^ (a)) | ((edx) ^ (b)) | ((ecx) ^ (c)))) + +/* + * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. + * x86_cpuid_vendor() gets vendor id for BSP. + * + * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify + * coding, we still use x86_cpuid_vendor() to get vendor id for AP. + * + * x86_cpuid_vendor() gets vendor information directly from CPUID. + */ +static inline int x86_cpuid_vendor(void) +{ + u32 eax = 0x00000000; + u32 ebx, ecx = 0, edx; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) + return X86_VENDOR_INTEL; + + if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) + return X86_VENDOR_AMD; + + return X86_VENDOR_UNKNOWN; +} + +static inline unsigned int x86_cpuid_family(void) +{ + u32 eax = 0x00000001; + u32 ebx, ecx = 0, edx; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + return x86_family(eax); +} + +extern bool initrd_gone; + +#ifdef CONFIG_CPU_SUP_AMD +void load_ucode_amd_bsp(unsigned int family); +void load_ucode_amd_ap(unsigned int family); +void load_ucode_amd_early(unsigned int cpuid_1_eax); +int save_microcode_in_initrd_amd(unsigned int family); +void reload_ucode_amd(unsigned int cpu); +struct microcode_ops *init_amd_microcode(void); +void exit_amd_microcode(void); +#else /* CONFIG_CPU_SUP_AMD */ +static inline void load_ucode_amd_bsp(unsigned int family) { } +static inline void load_ucode_amd_ap(unsigned int family) { } +static inline void load_ucode_amd_early(unsigned int family) { } +static inline int save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } +static inline void reload_ucode_amd(unsigned int cpu) { } +static inline struct microcode_ops *init_amd_microcode(void) { return NULL; } +static inline void exit_amd_microcode(void) { } +#endif /* !CONFIG_CPU_SUP_AMD */ + +#ifdef CONFIG_CPU_SUP_INTEL +void load_ucode_intel_bsp(void); +void load_ucode_intel_ap(void); +int save_microcode_in_initrd_intel(void); +void reload_ucode_intel(void); +struct microcode_ops *init_intel_microcode(void); +#else /* CONFIG_CPU_SUP_INTEL */ +static inline void load_ucode_intel_bsp(void) { } +static inline void load_ucode_intel_ap(void) { } +static inline int save_microcode_in_initrd_intel(void) { return -EINVAL; } +static inline void reload_ucode_intel(void) { } +static inline struct microcode_ops *init_intel_microcode(void) { return NULL; } +#endif /* !CONFIG_CPU_SUP_INTEL */ + +#endif /* _X86_MICROCODE_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index c7969e806c64..e6bba12c759c 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -32,6 +32,7 @@ #include <asm/nmi.h> #include <clocksource/hyperv_timer.h> #include <asm/numa.h> +#include <asm/svm.h> /* Is Linux running as the root partition? */ bool hv_root_partition; @@ -39,6 +40,10 @@ bool hv_root_partition; bool hv_nested; struct ms_hyperv_info ms_hyperv; +/* Used in modules via hv_do_hypercall(): see arch/x86/include/asm/mshyperv.h */ +bool hyperv_paravisor_present __ro_after_init; +EXPORT_SYMBOL_GPL(hyperv_paravisor_present); + #if IS_ENABLED(CONFIG_HYPERV) static inline unsigned int hv_get_nested_reg(unsigned int reg) { @@ -65,8 +70,8 @@ u64 hv_get_non_nested_register(unsigned int reg) { u64 value; - if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) - hv_ghcb_msr_read(reg, &value); + if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) + hv_ivm_msr_read(reg, &value); else rdmsrl(reg, value); return value; @@ -75,8 +80,8 @@ EXPORT_SYMBOL_GPL(hv_get_non_nested_register); void hv_set_non_nested_register(unsigned int reg, u64 value) { - if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) { - hv_ghcb_msr_write(reg, value); + if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) { + hv_ivm_msr_write(reg, value); /* Write proxy bit via wrmsl instruction */ if (hv_is_sint_reg(reg)) @@ -119,7 +124,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) vmbus_handler(); if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED) - ack_APIC_irq(); + apic_eoi(); set_irq_regs(old_regs); } @@ -147,7 +152,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) if (hv_stimer0_handler) hv_stimer0_handler(); add_interrupt_randomness(HYPERV_STIMER0_VECTOR); - ack_APIC_irq(); + apic_eoi(); set_irq_regs(old_regs); } @@ -295,6 +300,15 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus) native_smp_prepare_cpus(max_cpus); + /* + * Override wakeup_secondary_cpu_64 callback for SEV-SNP + * enlightened guest. + */ + if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) { + apic->wakeup_secondary_cpu_64 = hv_snp_boot_ap; + return; + } + #ifdef CONFIG_X86_64 for_each_present_cpu(i) { if (i == 0) @@ -313,6 +327,26 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus) } #endif +/* + * When a fully enlightened TDX VM runs on Hyper-V, the firmware sets the + * HW_REDUCED flag: refer to acpi_tb_create_local_fadt(). Consequently ttyS0 + * interrupts can't work because request_irq() -> ... -> irq_to_desc() returns + * NULL for ttyS0. This happens because mp_config_acpi_legacy_irqs() sees a + * nr_legacy_irqs() of 0, so it doesn't initialize the array 'mp_irqs[]', and + * later setup_IO_APIC_irqs() -> find_irq_entry() fails to find the legacy irqs + * from the array and hence doesn't create the necessary irq description info. + * + * Clone arch/x86/kernel/acpi/boot.c: acpi_generic_reduced_hw_init() here, + * except don't change 'legacy_pic', which keeps its default value + * 'default_legacy_pic'. This way, mp_config_acpi_legacy_irqs() sees a non-zero + * nr_legacy_irqs() and eventually serial console interrupts works properly. + */ +static void __init reduced_hw_init(void) +{ + x86_init.timers.timer_init = x86_init_noop; + x86_init.irqs.pre_vector_init = x86_init_noop; +} + static void __init ms_hyperv_init_platform(void) { int hv_max_functions_eax; @@ -399,11 +433,33 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.shared_gpa_boundary = BIT_ULL(ms_hyperv.shared_gpa_boundary_bits); + hyperv_paravisor_present = !!ms_hyperv.paravisor_present; + pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n", ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b); - if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) + + if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) { static_branch_enable(&isolation_type_snp); + } else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX) { + static_branch_enable(&isolation_type_tdx); + + /* A TDX VM must use x2APIC and doesn't use lazy EOI. */ + ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED; + + if (!ms_hyperv.paravisor_present) { + /* To be supported: more work is required. */ + ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE; + + /* HV_REGISTER_CRASH_CTL is unsupported. */ + ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; + + /* Don't trust Hyper-V's TLB-flushing hypercalls. */ + ms_hyperv.hints &= ~HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; + + x86_init.acpi.reduced_hw_early_init = reduced_hw_init; + } + } } if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) { @@ -473,7 +529,7 @@ static void __init ms_hyperv_init_platform(void) #if IS_ENABLED(CONFIG_HYPERV) if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) || - (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)) + ms_hyperv.paravisor_present) hv_vtom_init(); /* * Setup the hook to get control post apic initialization. @@ -497,7 +553,8 @@ static void __init ms_hyperv_init_platform(void) # ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; - if (hv_root_partition) + if (hv_root_partition || + (!ms_hyperv.paravisor_present && hv_isolation_type_snp())) smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus; # endif @@ -560,6 +617,22 @@ static bool __init ms_hyperv_msi_ext_dest_id(void) return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE; } +#ifdef CONFIG_AMD_MEM_ENCRYPT +static void hv_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* RAX and CPL are already in the GHCB */ + ghcb_set_rcx(ghcb, regs->cx); + ghcb_set_rdx(ghcb, regs->dx); + ghcb_set_r8(ghcb, regs->r8); +} + +static bool hv_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* No checking of the return state needed */ + return true; +} +#endif + const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, @@ -567,4 +640,8 @@ const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .init.x2apic_available = ms_hyperv_x2apic_available, .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id, .init.init_platform = ms_hyperv_init_platform, +#ifdef CONFIG_AMD_MEM_ENCRYPT + .runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare, + .runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish, +#endif }; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 099b6f0d96bd..31c0e68f6227 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -4,6 +4,8 @@ #include <linux/string.h> #include <linux/seq_file.h> #include <linux/cpufreq.h> +#include <asm/prctl.h> +#include <linux/proc_fs.h> #include "cpu.h" @@ -175,3 +177,24 @@ const struct seq_operations cpuinfo_op = { .stop = c_stop, .show = show_cpuinfo, }; + +#ifdef CONFIG_X86_USER_SHADOW_STACK +static void dump_x86_features(struct seq_file *m, unsigned long features) +{ + if (features & ARCH_SHSTK_SHSTK) + seq_puts(m, "shstk "); + if (features & ARCH_SHSTK_WRSS) + seq_puts(m, "wrss "); +} + +void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task) +{ + seq_puts(m, "x86_Thread_features:\t"); + dump_x86_features(m, task->thread.features); + seq_putc(m, '\n'); + + seq_puts(m, "x86_Thread_features_locked:\t"); + dump_x86_features(m, task->thread.features_locked); + seq_putc(m, '\n'); +} +#endif /* CONFIG_X86_USER_SHADOW_STACK */ diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 458cb7419502..8f559eeae08e 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -45,7 +45,21 @@ static u64 prefetch_disable_bits; */ static unsigned int pseudo_lock_major; static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); -static struct class *pseudo_lock_class; + +static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) +{ + const struct rdtgroup *rdtgrp; + + rdtgrp = dev_get_drvdata(dev); + if (mode) + *mode = 0600; + return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name); +} + +static const struct class pseudo_lock_class = { + .name = "pseudo_lock", + .devnode = pseudo_lock_devnode, +}; /** * get_prefetch_disable_bits - prefetch disable bits of supported platforms @@ -1353,7 +1367,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) &pseudo_measure_fops); } - dev = device_create(pseudo_lock_class, NULL, + dev = device_create(&pseudo_lock_class, NULL, MKDEV(pseudo_lock_major, new_minor), rdtgrp, "%s", rdtgrp->kn->name); @@ -1383,7 +1397,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) goto out; out_device: - device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); + device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); out_debugfs: debugfs_remove_recursive(plr->debugfs_dir); pseudo_lock_minor_release(new_minor); @@ -1424,7 +1438,7 @@ void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) pseudo_lock_cstates_relax(plr); debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); - device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); + device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); pseudo_lock_minor_release(plr->minor); free: @@ -1560,16 +1574,6 @@ static const struct file_operations pseudo_lock_dev_fops = { .mmap = pseudo_lock_dev_mmap, }; -static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) -{ - const struct rdtgroup *rdtgrp; - - rdtgrp = dev_get_drvdata(dev); - if (mode) - *mode = 0600; - return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name); -} - int rdt_pseudo_lock_init(void) { int ret; @@ -1580,21 +1584,18 @@ int rdt_pseudo_lock_init(void) pseudo_lock_major = ret; - pseudo_lock_class = class_create("pseudo_lock"); - if (IS_ERR(pseudo_lock_class)) { - ret = PTR_ERR(pseudo_lock_class); + ret = class_register(&pseudo_lock_class); + if (ret) { unregister_chrdev(pseudo_lock_major, "pseudo_lock"); return ret; } - pseudo_lock_class->devnode = pseudo_lock_devnode; return 0; } void rdt_pseudo_lock_release(void) { - class_destroy(pseudo_lock_class); - pseudo_lock_class = NULL; + class_unregister(&pseudo_lock_class); unregister_chrdev(pseudo_lock_major, "pseudo_lock"); pseudo_lock_major = 0; } diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index c3e37eaec8ec..7aaa3652e31d 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -204,6 +204,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file) continue; xa_erase(&vepc->page_array, index); + cond_resched(); } /* @@ -222,6 +223,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file) list_add_tail(&epc_page->list, &secs_pages); xa_erase(&vepc->page_array, index); + cond_resched(); } /* @@ -243,6 +245,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file) if (sgx_vepc_free_page(epc_page)) list_add_tail(&epc_page->list, &secs_pages); + cond_resched(); } if (!list_empty(&secs_pages)) diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index bdc0d5539b57..dae436253de4 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -40,7 +40,6 @@ #include <asm/processor.h> #include <asm/msr.h> -static struct class *cpuid_class; static enum cpuhp_state cpuhp_cpuid_state; struct cpuid_regs_done { @@ -124,26 +123,31 @@ static const struct file_operations cpuid_fops = { .open = cpuid_open, }; +static char *cpuid_devnode(const struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); +} + +static const struct class cpuid_class = { + .name = "cpuid", + .devnode = cpuid_devnode, +}; + static int cpuid_device_create(unsigned int cpu) { struct device *dev; - dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, + dev = device_create(&cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, "cpu%d", cpu); return PTR_ERR_OR_ZERO(dev); } static int cpuid_device_destroy(unsigned int cpu) { - device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); + device_destroy(&cpuid_class, MKDEV(CPUID_MAJOR, cpu)); return 0; } -static char *cpuid_devnode(const struct device *dev, umode_t *mode) -{ - return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); -} - static int __init cpuid_init(void) { int err; @@ -154,12 +158,9 @@ static int __init cpuid_init(void) CPUID_MAJOR); return -EBUSY; } - cpuid_class = class_create("cpuid"); - if (IS_ERR(cpuid_class)) { - err = PTR_ERR(cpuid_class); + err = class_register(&cpuid_class); + if (err) goto out_chrdev; - } - cpuid_class->devnode = cpuid_devnode; err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/cpuid:online", cpuid_device_create, cpuid_device_destroy); @@ -170,7 +171,7 @@ static int __init cpuid_init(void) return 0; out_class: - class_destroy(cpuid_class); + class_unregister(&cpuid_class); out_chrdev: __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); return err; @@ -180,7 +181,7 @@ module_init(cpuid_init); static void __exit cpuid_exit(void) { cpuhp_remove_state(cpuhp_cpuid_state); - class_destroy(cpuid_class); + class_unregister(&cpuid_class); __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); } module_exit(cpuid_exit); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index cdd92ab43cda..c92d88680dbf 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -48,27 +48,6 @@ struct crash_memmap_data { unsigned int type; }; -/* - * This is used to VMCLEAR all VMCSs loaded on the - * processor. And when loading kvm_intel module, the - * callback function pointer will be assigned. - * - * protected by rcu. - */ -crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; -EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); - -static inline void cpu_crash_vmclear_loaded_vmcss(void) -{ - crash_vmclear_fn *do_vmclear_operation = NULL; - - rcu_read_lock(); - do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss); - if (do_vmclear_operation) - do_vmclear_operation(); - rcu_read_unlock(); -} - #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) @@ -76,11 +55,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) crash_save_cpu(regs, cpu); /* - * VMCLEAR VMCSs loaded on all cpus if needed. - */ - cpu_crash_vmclear_loaded_vmcss(); - - /* * Disable Intel PT to stop its logging */ cpu_emergency_stop_pt(); @@ -133,11 +107,6 @@ void native_machine_crash_shutdown(struct pt_regs *regs) crash_smp_send_stop(); - /* - * VMCLEAR VMCSs loaded on this cpu if needed. - */ - cpu_crash_vmclear_loaded_vmcss(); - cpu_emergency_disable_virtualization(); /* @@ -158,8 +127,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) crash_save_cpu(regs, safe_smp_processor_id()); } -#ifdef CONFIG_KEXEC_FILE - +#if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG) static int get_nr_ram_ranges_callback(struct resource *res, void *arg) { unsigned int *nr_ranges = arg; @@ -231,7 +199,7 @@ static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) /* Prepare elf headers. Return addr and size */ static int prepare_elf_headers(struct kimage *image, void **addr, - unsigned long *sz) + unsigned long *sz, unsigned long *nr_mem_ranges) { struct crash_mem *cmem; int ret; @@ -249,6 +217,9 @@ static int prepare_elf_headers(struct kimage *image, void **addr, if (ret) goto out; + /* Return the computed number of memory ranges, for hotplug usage */ + *nr_mem_ranges = cmem->nr_ranges; + /* By default prepare 64bit headers */ ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz); @@ -256,7 +227,9 @@ out: vfree(cmem); return ret; } +#endif +#ifdef CONFIG_KEXEC_FILE static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) { unsigned int nr_e820_entries; @@ -371,18 +344,42 @@ out: int crash_load_segments(struct kimage *image) { int ret; + unsigned long pnum = 0; struct kexec_buf kbuf = { .image = image, .buf_min = 0, .buf_max = ULONG_MAX, .top_down = false }; /* Prepare elf headers and add a segment */ - ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz); + ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz, &pnum); if (ret) return ret; - image->elf_headers = kbuf.buffer; - image->elf_headers_sz = kbuf.bufsz; + image->elf_headers = kbuf.buffer; + image->elf_headers_sz = kbuf.bufsz; + kbuf.memsz = kbuf.bufsz; + +#ifdef CONFIG_CRASH_HOTPLUG + /* + * The elfcorehdr segment size accounts for VMCOREINFO, kernel_map, + * maximum CPUs and maximum memory ranges. + */ + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) + pnum = 2 + CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES; + else + pnum += 2 + CONFIG_NR_CPUS_DEFAULT; + + if (pnum < (unsigned long)PN_XNUM) { + kbuf.memsz = pnum * sizeof(Elf64_Phdr); + kbuf.memsz += sizeof(Elf64_Ehdr); + + image->elfcorehdr_index = image->nr_segments; + + /* Mark as usable to crash kernel, else crash kernel fails on boot */ + image->elf_headers_sz = kbuf.memsz; + } else { + pr_err("number of Phdrs %lu exceeds max\n", pnum); + } +#endif - kbuf.memsz = kbuf.bufsz; kbuf.buf_align = ELF_CORE_HEADER_ALIGN; kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; ret = kexec_add_buffer(&kbuf); @@ -395,3 +392,103 @@ int crash_load_segments(struct kimage *image) return ret; } #endif /* CONFIG_KEXEC_FILE */ + +#ifdef CONFIG_CRASH_HOTPLUG + +#undef pr_fmt +#define pr_fmt(fmt) "crash hp: " fmt + +/* These functions provide the value for the sysfs crash_hotplug nodes */ +#ifdef CONFIG_HOTPLUG_CPU +int arch_crash_hotplug_cpu_support(void) +{ + return crash_check_update_elfcorehdr(); +} +#endif + +#ifdef CONFIG_MEMORY_HOTPLUG +int arch_crash_hotplug_memory_support(void) +{ + return crash_check_update_elfcorehdr(); +} +#endif + +unsigned int arch_crash_get_elfcorehdr_size(void) +{ + unsigned int sz; + + /* kernel_map, VMCOREINFO and maximum CPUs */ + sz = 2 + CONFIG_NR_CPUS_DEFAULT; + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) + sz += CONFIG_CRASH_MAX_MEMORY_RANGES; + sz *= sizeof(Elf64_Phdr); + return sz; +} + +/** + * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes + * @image: a pointer to kexec_crash_image + * + * Prepare the new elfcorehdr and replace the existing elfcorehdr. + */ +void arch_crash_handle_hotplug_event(struct kimage *image) +{ + void *elfbuf = NULL, *old_elfcorehdr; + unsigned long nr_mem_ranges; + unsigned long mem, memsz; + unsigned long elfsz = 0; + + /* + * As crash_prepare_elf64_headers() has already described all + * possible CPUs, there is no need to update the elfcorehdr + * for additional CPU changes. + */ + if ((image->file_mode || image->elfcorehdr_updated) && + ((image->hp_action == KEXEC_CRASH_HP_ADD_CPU) || + (image->hp_action == KEXEC_CRASH_HP_REMOVE_CPU))) + return; + + /* + * Create the new elfcorehdr reflecting the changes to CPU and/or + * memory resources. + */ + if (prepare_elf_headers(image, &elfbuf, &elfsz, &nr_mem_ranges)) { + pr_err("unable to create new elfcorehdr"); + goto out; + } + + /* + * Obtain address and size of the elfcorehdr segment, and + * check it against the new elfcorehdr buffer. + */ + mem = image->segment[image->elfcorehdr_index].mem; + memsz = image->segment[image->elfcorehdr_index].memsz; + if (elfsz > memsz) { + pr_err("update elfcorehdr elfsz %lu > memsz %lu", + elfsz, memsz); + goto out; + } + + /* + * Copy new elfcorehdr over the old elfcorehdr at destination. + */ + old_elfcorehdr = kmap_local_page(pfn_to_page(mem >> PAGE_SHIFT)); + if (!old_elfcorehdr) { + pr_err("mapping elfcorehdr segment failed\n"); + goto out; + } + + /* + * Temporarily invalidate the crash image while the + * elfcorehdr is updated. + */ + xchg(&kexec_crash_image, NULL); + memcpy_flushcache(old_elfcorehdr, elfbuf, elfsz); + xchg(&kexec_crash_image, image); + kunmap_local(old_elfcorehdr); + pr_debug("updated elfcorehdr\n"); + +out: + vfree(elfbuf); +} +#endif diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 28da5dd83fc0..87d38f17ff5c 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -128,16 +128,15 @@ static void __init dtb_setup_hpet(void) static void __init dtb_cpu_setup(void) { struct device_node *dn; - u32 apic_id, version; + u32 apic_id; - version = GET_APIC_VERSION(apic_read(APIC_LVR)); for_each_of_cpu_node(dn) { apic_id = of_get_cpu_hwid(dn, 0); if (apic_id == ~0U) { pr_warn("%pOF: missing local APIC ID\n", dn); continue; } - generic_processor_info(apic_id, version); + generic_processor_info(apic_id); } } @@ -158,19 +157,15 @@ static void __init dtb_lapic_setup(void) /* Did the boot loader setup the local APIC ? */ if (!boot_cpu_has(X86_FEATURE_APIC)) { - if (apic_force_enable(lapic_addr)) + /* Try force enabling, which registers the APIC address */ + if (!apic_force_enable(lapic_addr)) return; - } - smp_found_config = 1; - if (of_property_read_bool(dn, "intel,virtual-wire-mode")) { - pr_info("Virtual Wire compatibility mode.\n"); - pic_mode = 0; } else { - pr_info("IMCR and PIC compatibility mode.\n"); - pic_mode = 1; + register_lapic_address(lapic_addr); } - - register_lapic_address(lapic_addr); + smp_found_config = 1; + pic_mode = !of_property_read_bool(dn, "intel,virtual-wire-mode"); + pr_info("%s compatibility mode.\n", pic_mode ? "IMCR and PIC" : "Virtual Wire"); } #endif /* CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h index af5cbdd9bd29..f6d856bd50bc 100644 --- a/arch/x86/kernel/fpu/context.h +++ b/arch/x86/kernel/fpu/context.h @@ -19,8 +19,7 @@ * FPU state for a task MUST let the rest of the kernel know that the * FPU registers are no longer valid for this task. * - * Either one of these invalidation functions is enough. Invalidate - * a resource you control: CPU if using the CPU for something else + * Invalidate a resource you control: CPU if using the CPU for something else * (with preemption disabled), FPU for the current task, or a task that * is prevented from running by the current task. */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1015af1ae562..a86d37052a64 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -552,8 +552,36 @@ static inline void fpu_inherit_perms(struct fpu *dst_fpu) } } +/* A passed ssp of zero will not cause any update */ +static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) +{ +#ifdef CONFIG_X86_USER_SHADOW_STACK + struct cet_user_state *xstate; + + /* If ssp update is not needed. */ + if (!ssp) + return 0; + + xstate = get_xsave_addr(&dst->thread.fpu.fpstate->regs.xsave, + XFEATURE_CET_USER); + + /* + * If there is a non-zero ssp, then 'dst' must be configured with a shadow + * stack and the fpu state should be up to date since it was just copied + * from the parent in fpu_clone(). So there must be a valid non-init CET + * state location in the buffer. + */ + if (WARN_ON_ONCE(!xstate)) + return 1; + + xstate->user_ssp = (u64)ssp; +#endif + return 0; +} + /* Clone current's FPU state on fork */ -int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal) +int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, + unsigned long ssp) { struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; @@ -613,6 +641,12 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal) if (use_xsave()) dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID; + /* + * Update shadow stack pointer, in case it changed during clone. + */ + if (update_fpu_shstk(dst, ssp)) + return 1; + trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); @@ -679,7 +713,7 @@ static void fpu_reset_fpregs(void) struct fpu *fpu = ¤t->thread.fpu; fpregs_lock(); - fpu__drop(fpu); + __fpu_invalidate_fpregs_state(fpu); /* * This does not change the actual hardware registers. It just * resets the memory image and sets TIF_NEED_FPU_LOAD so a @@ -753,6 +787,24 @@ void switch_fpu_return(void) } EXPORT_SYMBOL_GPL(switch_fpu_return); +void fpregs_lock_and_load(void) +{ + /* + * fpregs_lock() only disables preemption (mostly). So modifying state + * in an interrupt could screw up some in progress fpregs operation. + * Warn about it. + */ + WARN_ON_ONCE(!irq_fpu_usable()); + WARN_ON_ONCE(current->flags & PF_KTHREAD); + + fpregs_lock(); + + fpregs_assert_state_consistent(); + + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + fpregs_restore_userregs(); +} + #ifdef CONFIG_X86_DEBUG_FPU /* * If current FPU state according to its tracking (loaded FPU context on this diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 6d056b68f4ed..6bc1eb2a21bd 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -8,6 +8,7 @@ #include <asm/fpu/api.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> +#include <asm/prctl.h> #include "context.h" #include "internal.h" @@ -174,6 +175,86 @@ out: return ret; } +#ifdef CONFIG_X86_USER_SHADOW_STACK +int ssp_active(struct task_struct *target, const struct user_regset *regset) +{ + if (target->thread.features & ARCH_SHSTK_SHSTK) + return regset->n; + + return 0; +} + +int ssp_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + struct fpu *fpu = &target->thread.fpu; + struct cet_user_state *cetregs; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -ENODEV; + + sync_fpstate(fpu); + cetregs = get_xsave_addr(&fpu->fpstate->regs.xsave, XFEATURE_CET_USER); + if (WARN_ON(!cetregs)) { + /* + * This shouldn't ever be NULL because shadow stack was + * verified to be enabled above. This means + * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so + * XFEATURE_CET_USER should not be in the init state. + */ + return -ENODEV; + } + + return membuf_write(&to, (unsigned long *)&cetregs->user_ssp, + sizeof(cetregs->user_ssp)); +} + +int ssp_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct xregs_state *xsave = &fpu->fpstate->regs.xsave; + struct cet_user_state *cetregs; + unsigned long user_ssp; + int r; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !ssp_active(target, regset)) + return -ENODEV; + + if (pos != 0 || count != sizeof(user_ssp)) + return -EINVAL; + + r = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_ssp, 0, -1); + if (r) + return r; + + /* + * Some kernel instructions (IRET, etc) can cause exceptions in the case + * of disallowed CET register values. Just prevent invalid values. + */ + if (user_ssp >= TASK_SIZE_MAX || !IS_ALIGNED(user_ssp, 8)) + return -EINVAL; + + fpu_force_restore(fpu); + + cetregs = get_xsave_addr(xsave, XFEATURE_CET_USER); + if (WARN_ON(!cetregs)) { + /* + * This shouldn't ever be NULL because shadow stack was + * verified to be enabled above. This means + * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so + * XFEATURE_CET_USER should not be in the init state. + */ + return -ENODEV; + } + + cetregs->user_ssp = user_ssp; + return 0; +} +#endif /* CONFIG_X86_USER_SHADOW_STACK */ + #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0bab497c9436..cadf68737e6b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -39,26 +39,26 @@ */ static const char *xfeature_names[] = { - "x87 floating point registers" , - "SSE registers" , - "AVX registers" , - "MPX bounds registers" , - "MPX CSR" , - "AVX-512 opmask" , - "AVX-512 Hi256" , - "AVX-512 ZMM_Hi256" , - "Processor Trace (unused)" , + "x87 floating point registers", + "SSE registers", + "AVX registers", + "MPX bounds registers", + "MPX CSR", + "AVX-512 opmask", + "AVX-512 Hi256", + "AVX-512 ZMM_Hi256", + "Processor Trace (unused)", "Protection Keys User registers", "PASID state", - "unknown xstate feature" , - "unknown xstate feature" , - "unknown xstate feature" , - "unknown xstate feature" , - "unknown xstate feature" , - "unknown xstate feature" , - "AMX Tile config" , - "AMX Tile data" , - "unknown xstate feature" , + "Control-flow User registers", + "Control-flow Kernel registers (unused)", + "unknown xstate feature", + "unknown xstate feature", + "unknown xstate feature", + "unknown xstate feature", + "AMX Tile config", + "AMX Tile data", + "unknown xstate feature", }; static unsigned short xsave_cpuid_features[] __initdata = { @@ -71,8 +71,9 @@ static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, - [XFEATURE_PKRU] = X86_FEATURE_PKU, + [XFEATURE_PKRU] = X86_FEATURE_OSPKE, [XFEATURE_PASID] = X86_FEATURE_ENQCMD, + [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, }; @@ -276,6 +277,7 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); print_xstate_feature(XFEATURE_MASK_PKRU); print_xstate_feature(XFEATURE_MASK_PASID); + print_xstate_feature(XFEATURE_MASK_CET_USER); print_xstate_feature(XFEATURE_MASK_XTILE_CFG); print_xstate_feature(XFEATURE_MASK_XTILE_DATA); } @@ -344,6 +346,7 @@ static __init void os_xrstor_booting(struct xregs_state *xstate) XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_PASID | \ + XFEATURE_MASK_CET_USER | \ XFEATURE_MASK_XTILE) /* @@ -446,14 +449,15 @@ static void __init __xstate_dump_leaves(void) } \ } while (0) -#define XCHECK_SZ(sz, nr, nr_macro, __struct) do { \ - if ((nr == nr_macro) && \ - WARN_ONCE(sz != sizeof(__struct), \ - "%s: struct is %zu bytes, cpu state %d bytes\n", \ - __stringify(nr_macro), sizeof(__struct), sz)) { \ +#define XCHECK_SZ(sz, nr, __struct) ({ \ + if (WARN_ONCE(sz != sizeof(__struct), \ + "[%s]: struct is %zu bytes, cpu state %d bytes\n", \ + xfeature_names[nr], sizeof(__struct), sz)) { \ __xstate_dump_leaves(); \ } \ -} while (0) + true; \ +}) + /** * check_xtile_data_against_struct - Check tile data state size. @@ -527,36 +531,28 @@ static bool __init check_xstate_against_struct(int nr) * Ask the CPU for the size of the state. */ int sz = xfeature_size(nr); + /* * Match each CPU state with the corresponding software * structure. */ - XCHECK_SZ(sz, nr, XFEATURE_YMM, struct ymmh_struct); - XCHECK_SZ(sz, nr, XFEATURE_BNDREGS, struct mpx_bndreg_state); - XCHECK_SZ(sz, nr, XFEATURE_BNDCSR, struct mpx_bndcsr_state); - XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state); - XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); - XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); - XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); - XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state); - XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg); - - /* The tile data size varies between implementations. */ - if (nr == XFEATURE_XTILE_DATA) - check_xtile_data_against_struct(sz); - - /* - * Make *SURE* to add any feature numbers in below if - * there are "holes" in the xsave state component - * numbers. - */ - if ((nr < XFEATURE_YMM) || - (nr >= XFEATURE_MAX) || - (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || - ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) { + switch (nr) { + case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct); + case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state); + case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state); + case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state); + case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state); + case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state); + case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state); + case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); + case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); + case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); + case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; + default: XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); return false; } + return true; } @@ -882,6 +878,13 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) goto out_disable; } + /* + * CPU capabilities initialization runs before FPU init. So + * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely + * functional, set the feature bit so depending code works. + */ + setup_force_cpu_cap(X86_FEATURE_OSXSAVE); + print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", fpu_kernel_cfg.max_features, diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index c5b9289837dc..ea6995920b7a 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -51,7 +51,9 @@ SYM_CODE_START_NOALIGN(startup_64) * for us. These identity mapped page tables map all of the * kernel pages and possibly all of memory. * - * %rsi holds a physical pointer to real_mode_data. + * %RSI holds the physical address of the boot_params structure + * provided by the bootloader. Preserve it in %R15 so C function calls + * will not clobber it. * * We come here either directly from a 64bit bootloader, or from * arch/x86/boot/compressed/head_64.S. @@ -62,6 +64,7 @@ SYM_CODE_START_NOALIGN(startup_64) * compiled to run at we first fixup the physical addresses in our page * tables and then reload them. */ + mov %rsi, %r15 /* Set up the stack for verify_cpu() */ leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp @@ -75,9 +78,7 @@ SYM_CODE_START_NOALIGN(startup_64) shrq $32, %rdx wrmsr - pushq %rsi call startup_64_setup_env - popq %rsi /* Now switch to __KERNEL_CS so IRET works reliably */ pushq $__KERNEL_CS @@ -93,12 +94,10 @@ SYM_CODE_START_NOALIGN(startup_64) * Activate SEV/SME memory encryption if supported/enabled. This needs to * be done now, since this also includes setup of the SEV-SNP CPUID table, * which needs to be done before any CPUID instructions are executed in - * subsequent code. + * subsequent code. Pass the boot_params pointer as the first argument. */ - movq %rsi, %rdi - pushq %rsi + movq %r15, %rdi call sme_enable - popq %rsi #endif /* Sanitize CPU configuration */ @@ -111,9 +110,8 @@ SYM_CODE_START_NOALIGN(startup_64) * programmed into CR3. */ leaq _text(%rip), %rdi - pushq %rsi + movq %r15, %rsi call __startup_64 - popq %rsi /* Form the CR3 value being sure to include the CR3 modifier */ addq $(early_top_pgt - __START_KERNEL_map), %rax @@ -127,8 +125,6 @@ SYM_CODE_START(secondary_startup_64) * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded a mapped page table. * - * %rsi holds a physical pointer to real_mode_data. - * * We come here either from startup_64 (using physical addresses) * or from trampoline.S (using virtual addresses). * @@ -153,6 +149,9 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR + /* Clear %R15 which holds the boot_params pointer on the boot CPU */ + xorq %r15, %r15 + /* * Retrieve the modifier (SME encryption mask if SME is active) to be * added to the initial pgdir entry that will be programmed into CR3. @@ -199,13 +198,9 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) * hypervisor could lie about the C-bit position to perform a ROP * attack on the guest by writing to the unencrypted stack and wait for * the next RET instruction. - * %rsi carries pointer to realmode data and is callee-clobbered. Save - * and restore it. */ - pushq %rsi movq %rax, %rdi call sev_verify_cbit - popq %rsi /* * Switch to new page-table @@ -365,9 +360,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) wrmsr /* Setup and Load IDT */ - pushq %rsi call early_setup_idt - popq %rsi /* Check if nx is implemented */ movl $0x80000001, %eax @@ -403,9 +396,8 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) pushq $0 popfq - /* rsi is pointer to real mode structure with interesting info. - pass it to C */ - movq %rsi, %rdi + /* Pass the boot_params pointer as first argument */ + movq %r15, %rdi .Ljump_to_C_code: /* diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index c8eb1ac5125a..1648aa0204d9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -421,7 +421,7 @@ static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc) * the IO_APIC has been initialized. */ hc->cpu = boot_cpu_data.cpu_index; - strncpy(hc->name, "hpet", sizeof(hc->name)); + strscpy(hc->name, "hpet", sizeof(hc->name)); hpet_init_clockevent(hc, 50); hc->evt.tick_resume = hpet_clkevt_legacy_resume; diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 4d8aff05a509..30a55207c000 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -231,9 +231,7 @@ struct irq_chip i8259A_chip = { }; static char irq_trigger[2]; -/** - * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ - */ +/* ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ */ static void restore_ELCR(char *trigger) { outb(trigger[0], PIC_ELCR1); diff --git a/arch/x86/kernel/ibt_selftest.S b/arch/x86/kernel/ibt_selftest.S new file mode 100644 index 000000000000..c43c4ed28a9c --- /dev/null +++ b/arch/x86/kernel/ibt_selftest.S @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> +#include <linux/objtool.h> +#include <asm/nospec-branch.h> + +SYM_CODE_START(ibt_selftest_noendbr) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + /* #CP handler sets %ax to 0 */ + RET +SYM_CODE_END(ibt_selftest_noendbr) + +SYM_FUNC_START(ibt_selftest) + lea ibt_selftest_noendbr(%rip), %rax + ANNOTATE_RETPOLINE_SAFE + jmp *%rax +SYM_FUNC_END(ibt_selftest) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index a58c6bc1cd68..b786d48f5a0f 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -107,7 +107,7 @@ static const __initconst struct idt_data def_idts[] = { ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), #endif -#ifdef CONFIG_X86_KERNEL_IBT +#ifdef CONFIG_X86_CET INTG(X86_TRAP_CP, asm_exc_control_protection), #endif @@ -131,7 +131,6 @@ static const __initconst struct idt_data apic_idts[] = { INTG(RESCHEDULE_VECTOR, asm_sysvec_reschedule_ipi), INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function), INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single), - INTG(IRQ_MOVE_CLEANUP_VECTOR, asm_sysvec_irq_move_cleanup), INTG(REBOOT_VECTOR, asm_sysvec_reboot), #endif diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 9f668d2f3d11..11761c124545 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -49,7 +49,7 @@ void ack_bad_irq(unsigned int irq) * completely. * But only ack when the APIC is enabled -AK */ - ack_APIC_irq(); + apic_eoi(); } #define irq_stats(x) (&per_cpu(irq_stat, x)) @@ -256,7 +256,7 @@ DEFINE_IDTENTRY_IRQ(common_interrupt) if (likely(!IS_ERR_OR_NULL(desc))) { handle_irq(desc, regs); } else { - ack_APIC_irq(); + apic_eoi(); if (desc == VECTOR_UNUSED) { pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n", @@ -280,7 +280,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi) { struct pt_regs *old_regs = set_irq_regs(regs); - ack_APIC_irq(); + apic_eoi(); trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); inc_irq_stat(x86_platform_ipis); if (x86_platform_ipi_callback) @@ -310,7 +310,7 @@ EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); */ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi) { - ack_APIC_irq(); + apic_eoi(); inc_irq_stat(kvm_posted_intr_ipis); } @@ -319,7 +319,7 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi) */ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi) { - ack_APIC_irq(); + apic_eoi(); inc_irq_stat(kvm_posted_intr_wakeup_ipis); kvm_posted_intr_wakeup_handler(); } @@ -329,7 +329,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi) */ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi) { - ack_APIC_irq(); + apic_eoi(); inc_irq_stat(kvm_posted_intr_nested_ipis); } #endif @@ -401,6 +401,6 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) inc_irq_stat(irq_thermal_count); smp_thermal_vector(); trace_thermal_apic_exit(THERMAL_APIC_VECTOR); - ack_APIC_irq(); + apic_eoi(); } #endif diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 890d4778cd35..b0a24deab4a1 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -16,7 +16,7 @@ #ifdef CONFIG_X86_LOCAL_APIC DEFINE_IDTENTRY_SYSVEC(sysvec_irq_work) { - ack_APIC_irq(); + apic_eoi(); trace_irq_work_entry(IRQ_WORK_VECTOR); inc_irq_stat(apic_irq_work_irqs); irq_work_run(); @@ -28,7 +28,7 @@ void arch_irq_work_raise(void) if (!arch_irq_work_has_interrupt()) return; - apic->send_IPI_self(IRQ_WORK_VECTOR); + __apic_send_IPI_self(IRQ_WORK_VECTOR); apic_wait_icr_idle(); } #endif diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 4eb8f2d19a87..578d16fc040f 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -101,10 +101,8 @@ static void __init jailhouse_get_smp_config(unsigned int early) register_lapic_address(0xfee00000); - for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++) { - generic_processor_info(setup_data.v1.cpu_ids[cpu], - boot_cpu_apic_version); - } + for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++) + generic_processor_info(setup_data.v1.cpu_ids[cpu]); smp_found_config = 1; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index f7f6042eb7e6..e8babebad7b8 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -45,6 +45,7 @@ #include <linux/vmalloc.h> #include <linux/pgtable.h> #include <linux/set_memory.h> +#include <linux/cfi.h> #include <asm/text-patching.h> #include <asm/cacheflush.h> @@ -293,7 +294,40 @@ static int can_probe(unsigned long paddr) #endif addr += insn.length; } + if (IS_ENABLED(CONFIG_CFI_CLANG)) { + /* + * The compiler generates the following instruction sequence + * for indirect call checks and cfi.c decodes this; + * + *Â movl -<id>, %r10d ; 6 bytes + * addl -4(%reg), %r10d ; 4 bytes + * je .Ltmp1 ; 2 bytes + * ud2 ; <- regs->ip + * .Ltmp1: + * + * Also, these movl and addl are used for showing expected + * type. So those must not be touched. + */ + __addr = recover_probed_instruction(buf, addr); + if (!__addr) + return 0; + + if (insn_decode_kernel(&insn, (void *)__addr) < 0) + return 0; + + if (insn.opcode.value == 0xBA) + offset = 12; + else if (insn.opcode.value == 0x3) + offset = 6; + else + goto out; + + /* This movl/addl is used for decoding CFI. */ + if (is_cfi_trap(addr + offset)) + return 0; + } +out: return (addr == paddr); } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 57b0037d0a99..517821b48391 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -226,7 +226,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real) } /* Check whether insn is indirect jump */ -static int __insn_is_indirect_jump(struct insn *insn) +static int insn_is_indirect_jump(struct insn *insn) { return ((insn->opcode.bytes[0] == 0xff && (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ @@ -260,26 +260,6 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) return (start <= target && target <= start + len); } -static int insn_is_indirect_jump(struct insn *insn) -{ - int ret = __insn_is_indirect_jump(insn); - -#ifdef CONFIG_RETPOLINE - /* - * Jump to x86_indirect_thunk_* is treated as an indirect jump. - * Note that even with CONFIG_RETPOLINE=y, the kernel compiled with - * older gcc may use indirect jump. So we add this check instead of - * replace indirect-jump check. - */ - if (!ret) - ret = insn_jump_into_range(insn, - (unsigned long)__indirect_thunk_start, - (unsigned long)__indirect_thunk_end - - (unsigned long)__indirect_thunk_start); -#endif - return ret; -} - /* Decode whole function to ensure any instructions don't jump into target */ static int can_optimize(unsigned long paddr) { @@ -334,9 +314,21 @@ static int can_optimize(unsigned long paddr) /* Recover address */ insn.kaddr = (void *)addr; insn.next_byte = (void *)(addr + insn.length); - /* Check any instructions don't jump into target */ - if (insn_is_indirect_jump(&insn) || - insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE, + /* + * Check any instructions don't jump into target, indirectly or + * directly. + * + * The indirect case is present to handle a code with jump + * tables. When the kernel uses retpolines, the check should in + * theory additionally look for jumps to indirect thunks. + * However, the kernel built with retpolines or IBT has jump + * tables disabled so the check can be skipped altogether. + */ + if (!IS_ENABLED(CONFIG_RETPOLINE) && + !IS_ENABLED(CONFIG_X86_KERNEL_IBT) && + insn_is_indirect_jump(&insn)) + return 0; + if (insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE, DISP32_SIZE)) return 0; addr += insn.length; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 1cceac5984da..b8ab9ee5896c 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -291,7 +291,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) struct pt_regs *old_regs = set_irq_regs(regs); u32 token; - ack_APIC_irq(); + apic_eoi(); inc_irq_stat(irq_hv_callback_count); @@ -332,7 +332,7 @@ static void kvm_register_steal_time(void) static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; -static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) +static notrace __maybe_unused void kvm_guest_apic_eoi_write(void) { /** * This relies on __test_and_clear_bit to modify the memory @@ -343,7 +343,7 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) */ if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi))) return; - apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK); + apic_native_eoi(); } static void kvm_guest_cpu_init(void) @@ -622,10 +622,10 @@ late_initcall(setup_efi_kvm_sev_migration); /* * Set the IPI entry points */ -static void kvm_setup_pv_ipi(void) +static __init void kvm_setup_pv_ipi(void) { - apic->send_IPI_mask = kvm_send_ipi_mask; - apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; + apic_update_callback(send_IPI_mask, kvm_send_ipi_mask); + apic_update_callback(send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself); pr_info("setup PV IPIs\n"); } @@ -825,7 +825,7 @@ static void __init kvm_guest_init(void) } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) - apic_set_eoi_write(kvm_guest_apic_eoi_write); + apic_update_callback(eoi, kvm_guest_apic_eoi_write); if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { static_branch_enable(&kvm_async_pf_enabled); @@ -966,10 +966,8 @@ static void __init kvm_init_platform(void) * Ensure that _bss_decrypted section is marked as decrypted in the * shared pages list. */ - nr_pages = DIV_ROUND_UP(__end_bss_decrypted - __start_bss_decrypted, - PAGE_SIZE); early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted, - nr_pages, 0); + __end_bss_decrypted - __start_bss_decrypted, 0); /* * If not booted using EFI, enable Live migration support. diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index fed721f90116..b223922248e9 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -48,7 +48,6 @@ static int __init mpf_checksum(unsigned char *mp, int len) static void __init MP_processor_info(struct mpc_cpu *m) { - int apicid; char *bootup_cpu = ""; if (!(m->cpuflag & CPU_ENABLED)) { @@ -56,15 +55,11 @@ static void __init MP_processor_info(struct mpc_cpu *m) return; } - apicid = m->apicid; - - if (m->cpuflag & CPU_BOOTPROCESSOR) { + if (m->cpuflag & CPU_BOOTPROCESSOR) bootup_cpu = " (Bootup-CPU)"; - boot_cpu_physical_apicid = m->apicid; - } pr_info("Processor #%d%s\n", m->apicid, bootup_cpu); - generic_processor_info(apicid, m->apicver); + generic_processor_info(m->apicid); } #ifdef CONFIG_X86_IO_APIC @@ -380,11 +375,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) int i; /* - * local APIC has default address - */ - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* * 2 CPUs, numbered 0 & 1. */ processor.type = MP_PROCESSOR; @@ -525,10 +515,8 @@ void __init default_get_smp_config(unsigned int early) */ if (mpf->feature1) { if (early) { - /* - * local APIC has default address - */ - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + /* Local APIC has default address */ + register_lapic_address(APIC_DEFAULT_PHYS_BASE); goto out; } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 7bb17d37db01..e17c16c54a37 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -39,7 +39,6 @@ #include <asm/cpufeature.h> #include <asm/msr.h> -static struct class *msr_class; static enum cpuhp_state cpuhp_msr_state; enum allow_write_msrs { @@ -235,26 +234,31 @@ static const struct file_operations msr_fops = { .compat_ioctl = msr_ioctl, }; +static char *msr_devnode(const struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); +} + +static const struct class msr_class = { + .name = "msr", + .devnode = msr_devnode, +}; + static int msr_device_create(unsigned int cpu) { struct device *dev; - dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, + dev = device_create(&msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, "msr%d", cpu); return PTR_ERR_OR_ZERO(dev); } static int msr_device_destroy(unsigned int cpu) { - device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); + device_destroy(&msr_class, MKDEV(MSR_MAJOR, cpu)); return 0; } -static char *msr_devnode(const struct device *dev, umode_t *mode) -{ - return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); -} - static int __init msr_init(void) { int err; @@ -263,12 +267,9 @@ static int __init msr_init(void) pr_err("unable to get major %d for msr\n", MSR_MAJOR); return -EBUSY; } - msr_class = class_create("msr"); - if (IS_ERR(msr_class)) { - err = PTR_ERR(msr_class); + err = class_register(&msr_class); + if (err) goto out_chrdev; - } - msr_class->devnode = msr_devnode; err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/msr:online", msr_device_create, msr_device_destroy); @@ -278,7 +279,7 @@ static int __init msr_init(void) return 0; out_class: - class_destroy(msr_class); + class_unregister(&msr_class); out_chrdev: __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); return err; @@ -288,7 +289,7 @@ module_init(msr_init); static void __exit msr_exit(void) { cpuhp_remove_state(cpuhp_msr_state); - class_destroy(msr_class); + class_unregister(&msr_class); __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); } module_exit(msr_exit) diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index a1a96df3dff1..e93a8545c74d 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -75,7 +75,7 @@ static void __init test_nmi_ipi(struct cpumask *mask) /* sync above data before sending NMI */ wmb(); - apic->send_IPI_mask(mask, NMI_VECTOR); + __apic_send_IPI_mask(mask, NMI_VECTOR); /* Don't wait longer than a second */ timeout = USEC_PER_SEC; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index ac10b46c5832..975f98d5eee5 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -75,10 +75,16 @@ DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); void __init native_pv_lock_init(void) { - if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) + if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && + !boot_cpu_has(X86_FEATURE_HYPERVISOR)) static_branch_disable(&virt_spin_lock_key); } +static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + tlb_remove_page(tlb, table); +} + unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len) { @@ -295,8 +301,7 @@ struct paravirt_patch_template pv_ops = { .mmu.flush_tlb_kernel = native_flush_tlb_global, .mmu.flush_tlb_one_user = native_flush_tlb_one_user, .mmu.flush_tlb_multi = native_flush_tlb_multi, - .mmu.tlb_remove_table = - (void (*)(struct mmu_gather *, void *))tlb_remove_page, + .mmu.tlb_remove_table = native_tlb_remove_table, .mmu.exit_mmap = paravirt_nop, .mmu.notify_page_enc_status_changed = paravirt_nop, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index de6be0a3965e..f323d83e40a7 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -72,9 +72,15 @@ static inline void __init pci_swiotlb_detect(void) #endif /* CONFIG_SWIOTLB */ #ifdef CONFIG_SWIOTLB_XEN +static bool xen_swiotlb_enabled(void) +{ + return xen_initial_domain() || x86_swiotlb_enable || + (IS_ENABLED(CONFIG_XEN_PCIDEV_FRONTEND) && xen_pv_pci_possible); +} + static void __init pci_xen_swiotlb_init(void) { - if (!xen_initial_domain() && !x86_swiotlb_enable) + if (!xen_swiotlb_enabled()) return; x86_swiotlb_enable = true; x86_swiotlb_flags |= SWIOTLB_ANY; @@ -83,27 +89,6 @@ static void __init pci_xen_swiotlb_init(void) if (IS_ENABLED(CONFIG_PCI)) pci_request_acs(); } - -int pci_xen_swiotlb_init_late(void) -{ - if (dma_ops == &xen_swiotlb_dma_ops) - return 0; - - /* we can work with the default swiotlb */ - if (!io_tlb_default_mem.nslabs) { - int rc = swiotlb_init_late(swiotlb_size_or_default(), - GFP_KERNEL, xen_swiotlb_fixup); - if (rc < 0) - return rc; - } - - /* XXX: this switches the dma ops under live devices! */ - dma_ops = &xen_swiotlb_dma_ops; - if (IS_ENABLED(CONFIG_PCI)) - pci_request_acs(); - return 0; -} -EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late); #else static inline void __init pci_xen_swiotlb_init(void) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 72015dba72ab..9f0909142a0a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -51,6 +51,7 @@ #include <asm/unwind.h> #include <asm/tdx.h> #include <asm/mmu_context.h> +#include <asm/shstk.h> #include "process.h" @@ -122,6 +123,7 @@ void exit_thread(struct task_struct *tsk) free_vm86(t); + shstk_free(tsk); fpu__drop(fpu); } @@ -162,6 +164,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) struct inactive_task_frame *frame; struct fork_frame *fork_frame; struct pt_regs *childregs; + unsigned long new_ssp; int ret = 0; childregs = task_pt_regs(p); @@ -199,7 +202,16 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) frame->flags = X86_EFLAGS_FIXED; #endif - fpu_clone(p, clone_flags, args->fn); + /* + * Allocate a new shadow stack for thread if needed. If shadow stack, + * is disabled, new_ssp will remain 0, and fpu_clone() will know not to + * update it. + */ + new_ssp = shstk_alloc_thread_stack(p, clone_flags, args->stack_size); + if (IS_ERR_VALUE(new_ssp)) + return PTR_ERR((void *)new_ssp); + + fpu_clone(p, clone_flags, args->fn, new_ssp); /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { @@ -245,6 +257,13 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP))) io_bitmap_share(p); + /* + * If copy_thread() if failing, don't leak the shadow stack possibly + * allocated in shstk_alloc_thread_stack() above. + */ + if (ret) + shstk_free(p); + return ret; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3d181c16a2f6..33b268747bb7 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -515,6 +515,8 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, load_gs_index(__USER_DS); } + reset_thread_features(); + loadsegment(fs, 0); loadsegment(es, _ds); loadsegment(ds, _ds); @@ -894,6 +896,12 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) else return put_user(LAM_U57_BITS, (unsigned long __user *)arg2); #endif + case ARCH_SHSTK_ENABLE: + case ARCH_SHSTK_DISABLE: + case ARCH_SHSTK_LOCK: + case ARCH_SHSTK_UNLOCK: + case ARCH_SHSTK_STATUS: + return shstk_prctl(task, option, arg2); default: ret = -EINVAL; break; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index dfaa270a7cc9..095f04bdabdc 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -58,6 +58,7 @@ enum x86_regset_64 { REGSET64_FP, REGSET64_IOPERM, REGSET64_XSTATE, + REGSET64_SSP, }; #define REGSET_GENERAL \ @@ -1267,6 +1268,17 @@ static struct user_regset x86_64_regsets[] __ro_after_init = { .active = ioperm_active, .regset_get = ioperm_get }, +#ifdef CONFIG_X86_USER_SHADOW_STACK + [REGSET64_SSP] = { + .core_note_type = NT_X86_SHSTK, + .n = 1, + .size = sizeof(u64), + .align = sizeof(u64), + .active = ssp_active, + .regset_get = ssp_get, + .set = ssp_set + }, +#endif }; static const struct user_regset_view user_x86_64_view = { diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 3adbe97015c1..830425e6d38e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -22,7 +22,6 @@ #include <asm/reboot_fixups.h> #include <asm/reboot.h> #include <asm/pci_x86.h> -#include <asm/virtext.h> #include <asm/cpu.h> #include <asm/nmi.h> #include <asm/smp.h> @@ -530,9 +529,54 @@ static inline void kb_wait(void) static inline void nmi_shootdown_cpus_on_restart(void); +#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) +/* RCU-protected callback to disable virtualization prior to reboot. */ +static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; + +void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) +{ + if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback))) + return; + + rcu_assign_pointer(cpu_emergency_virt_callback, callback); +} +EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback); + +void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) +{ + if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback)) + return; + + rcu_assign_pointer(cpu_emergency_virt_callback, NULL); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback); + +/* + * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during + * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if + * GIF=0, i.e. if the crash occurred between CLGI and STGI. + */ +void cpu_emergency_disable_virtualization(void) +{ + cpu_emergency_virt_cb *callback; + + /* + * IRQs must be disabled as KVM enables virtualization in hardware via + * function call IPIs, i.e. IRQs need to be disabled to guarantee + * virtualization stays disabled. + */ + lockdep_assert_irqs_disabled(); + + rcu_read_lock(); + callback = rcu_dereference(cpu_emergency_virt_callback); + if (callback) + callback(); + rcu_read_unlock(); +} + static void emergency_reboot_disable_virtualization(void) { - /* Just make sure we won't change CPUs while doing this */ local_irq_disable(); /* @@ -545,7 +589,7 @@ static void emergency_reboot_disable_virtualization(void) * Do the NMI shootdown even if virtualization is off on _this_ CPU, as * other CPUs may have virtualization enabled. */ - if (cpu_has_vmx() || cpu_has_svm(NULL)) { + if (rcu_access_pointer(cpu_emergency_virt_callback)) { /* Safely force _this_ CPU out of VMX/SVM operation. */ cpu_emergency_disable_virtualization(); @@ -553,7 +597,9 @@ static void emergency_reboot_disable_virtualization(void) nmi_shootdown_cpus_on_restart(); } } - +#else +static void emergency_reboot_disable_virtualization(void) { } +#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ void __attribute__((weak)) mach_reboot_fixups(void) { @@ -787,21 +833,9 @@ void machine_crash_shutdown(struct pt_regs *regs) } #endif - /* This is the CPU performing the emergency shutdown work. */ int crashing_cpu = -1; -/* - * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during - * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if - * GIF=0, i.e. if the crash occurred between CLGI and STGI. - */ -void cpu_emergency_disable_virtualization(void) -{ - cpu_emergency_vmxoff(); - cpu_emergency_svm_disable(); -} - #if defined(CONFIG_SMP) static nmi_shootdown_cb shootdown_callback; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index fd975a4a5200..b9145a63da77 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -114,7 +114,6 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 /* CPU data as detected by the assembly code in head_32.S */ struct cpuinfo_x86 new_cpu_data; -unsigned int def_to_bigsmp; struct apm_info apm_info; EXPORT_SYMBOL(apm_info); @@ -1018,9 +1017,11 @@ void __init setup_arch(char **cmdline_p) x86_report_nx(); + apic_setup_apic_calls(); + if (acpi_mps_check()) { #ifdef CONFIG_X86_LOCAL_APIC - disable_apic = 1; + apic_is_disabled = true; #endif setup_clear_cpu_cap(X86_FEATURE_APIC); } @@ -1253,7 +1254,7 @@ void __init setup_arch(char **cmdline_p) map_vsyscall(); - generic_apic_probe(); + x86_32_probe_apic(); early_quirks(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index c242dc47e9cb..2c97bf7b56ae 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -181,15 +181,9 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_X86_LOCAL_APIC per_cpu(x86_cpu_to_apicid, cpu) = early_per_cpu_map(x86_cpu_to_apicid, cpu); - per_cpu(x86_bios_cpu_apicid, cpu) = - early_per_cpu_map(x86_bios_cpu_apicid, cpu); per_cpu(x86_cpu_to_acpiid, cpu) = early_per_cpu_map(x86_cpu_to_acpiid, cpu); #endif -#ifdef CONFIG_X86_32 - per_cpu(x86_cpu_to_logical_apicid, cpu) = - early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); -#endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = early_per_cpu_map(x86_cpu_to_node_map, cpu); @@ -214,12 +208,8 @@ void __init setup_per_cpu_areas(void) /* indicate the early static arrays will soon be gone */ #ifdef CONFIG_X86_LOCAL_APIC early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; - early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; early_per_cpu_ptr(x86_cpu_to_acpiid) = NULL; #endif -#ifdef CONFIG_X86_32 - early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL; -#endif #ifdef CONFIG_NUMA early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 1ee7bed453de..2787826d9f60 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -1089,7 +1089,7 @@ static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip) return ret; } -void snp_set_wakeup_secondary_cpu(void) +void __init snp_set_wakeup_secondary_cpu(void) { if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) return; @@ -1099,7 +1099,7 @@ void snp_set_wakeup_secondary_cpu(void) * required method to start APs under SNP. If the hypervisor does * not support AP creation, then no APs will be started. */ - apic->wakeup_secondary_cpu = wakeup_cpu_via_vmgexit; + apic_update_callback(wakeup_secondary_cpu, wakeup_cpu_via_vmgexit); } int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh) @@ -1575,6 +1575,9 @@ static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, long val, *reg = vc_insn_get_rm(ctxt); enum es_result ret; + if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) + return ES_VMM_ERROR; + if (!reg) return ES_DECODE_FAILED; @@ -1612,6 +1615,9 @@ static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, struct sev_es_runtime_data *data = this_cpu_read(runtime_data); long *reg = vc_insn_get_rm(ctxt); + if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) + return ES_VMM_ERROR; + if (!reg) return ES_DECODE_FAILED; diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c new file mode 100644 index 000000000000..fd689921a1db --- /dev/null +++ b/arch/x86/kernel/shstk.c @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * shstk.c - Intel shadow stack support + * + * Copyright (c) 2021, Intel Corporation. + * Yu-cheng Yu <yu-cheng.yu@intel.com> + */ + +#include <linux/sched.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/sched/signal.h> +#include <linux/compat.h> +#include <linux/sizes.h> +#include <linux/user.h> +#include <linux/syscalls.h> +#include <asm/msr.h> +#include <asm/fpu/xstate.h> +#include <asm/fpu/types.h> +#include <asm/shstk.h> +#include <asm/special_insns.h> +#include <asm/fpu/api.h> +#include <asm/prctl.h> + +#define SS_FRAME_SIZE 8 + +static bool features_enabled(unsigned long features) +{ + return current->thread.features & features; +} + +static void features_set(unsigned long features) +{ + current->thread.features |= features; +} + +static void features_clr(unsigned long features) +{ + current->thread.features &= ~features; +} + +/* + * Create a restore token on the shadow stack. A token is always 8-byte + * and aligned to 8. + */ +static int create_rstor_token(unsigned long ssp, unsigned long *token_addr) +{ + unsigned long addr; + + /* Token must be aligned */ + if (!IS_ALIGNED(ssp, 8)) + return -EINVAL; + + addr = ssp - SS_FRAME_SIZE; + + /* + * SSP is aligned, so reserved bits and mode bit are a zero, just mark + * the token 64-bit. + */ + ssp |= BIT(0); + + if (write_user_shstk_64((u64 __user *)addr, (u64)ssp)) + return -EFAULT; + + if (token_addr) + *token_addr = addr; + + return 0; +} + +/* + * VM_SHADOW_STACK will have a guard page. This helps userspace protect + * itself from attacks. The reasoning is as follows: + * + * The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The + * INCSSP instruction can increment the shadow stack pointer. It is the + * shadow stack analog of an instruction like: + * + * addq $0x80, %rsp + * + * However, there is one important difference between an ADD on %rsp + * and INCSSP. In addition to modifying SSP, INCSSP also reads from the + * memory of the first and last elements that were "popped". It can be + * thought of as acting like this: + * + * READ_ONCE(ssp); // read+discard top element on stack + * ssp += nr_to_pop * 8; // move the shadow stack + * READ_ONCE(ssp-8); // read+discard last popped stack element + * + * The maximum distance INCSSP can move the SSP is 2040 bytes, before + * it would read the memory. Therefore a single page gap will be enough + * to prevent any operation from shifting the SSP to an adjacent stack, + * since it would have to land in the gap at least once, causing a + * fault. + */ +static unsigned long alloc_shstk(unsigned long addr, unsigned long size, + unsigned long token_offset, bool set_res_tok) +{ + int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G; + struct mm_struct *mm = current->mm; + unsigned long mapped_addr, unused; + + if (addr) + flags |= MAP_FIXED_NOREPLACE; + + mmap_write_lock(mm); + mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags, + VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); + mmap_write_unlock(mm); + + if (!set_res_tok || IS_ERR_VALUE(mapped_addr)) + goto out; + + if (create_rstor_token(mapped_addr + token_offset, NULL)) { + vm_munmap(mapped_addr, size); + return -EINVAL; + } + +out: + return mapped_addr; +} + +static unsigned long adjust_shstk_size(unsigned long size) +{ + if (size) + return PAGE_ALIGN(size); + + return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G)); +} + +static void unmap_shadow_stack(u64 base, u64 size) +{ + int r; + + r = vm_munmap(base, size); + + /* + * mmap_write_lock_killable() failed with -EINTR. This means + * the process is about to die and have it's MM cleaned up. + * This task shouldn't ever make it back to userspace. In this + * case it is ok to leak a shadow stack, so just exit out. + */ + if (r == -EINTR) + return; + + /* + * For all other types of vm_munmap() failure, either the + * system is out of memory or there is bug. + */ + WARN_ON_ONCE(r); +} + +static int shstk_setup(void) +{ + struct thread_shstk *shstk = ¤t->thread.shstk; + unsigned long addr, size; + + /* Already enabled */ + if (features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + /* Also not supported for 32 bit and x32 */ + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_32bit_syscall()) + return -EOPNOTSUPP; + + size = adjust_shstk_size(0); + addr = alloc_shstk(0, size, 0, false); + if (IS_ERR_VALUE(addr)) + return PTR_ERR((void *)addr); + + fpregs_lock_and_load(); + wrmsrl(MSR_IA32_PL3_SSP, addr + size); + wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN); + fpregs_unlock(); + + shstk->base = addr; + shstk->size = size; + features_set(ARCH_SHSTK_SHSTK); + + return 0; +} + +void reset_thread_features(void) +{ + memset(¤t->thread.shstk, 0, sizeof(struct thread_shstk)); + current->thread.features = 0; + current->thread.features_locked = 0; +} + +unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags, + unsigned long stack_size) +{ + struct thread_shstk *shstk = &tsk->thread.shstk; + unsigned long addr, size; + + /* + * If shadow stack is not enabled on the new thread, skip any + * switch to a new shadow stack. + */ + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + /* + * For CLONE_VM, except vfork, the child needs a separate shadow + * stack. + */ + if ((clone_flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM) + return 0; + + size = adjust_shstk_size(stack_size); + addr = alloc_shstk(0, size, 0, false); + if (IS_ERR_VALUE(addr)) + return addr; + + shstk->base = addr; + shstk->size = size; + + return addr + size; +} + +static unsigned long get_user_shstk_addr(void) +{ + unsigned long long ssp; + + fpregs_lock_and_load(); + + rdmsrl(MSR_IA32_PL3_SSP, ssp); + + fpregs_unlock(); + + return ssp; +} + +#define SHSTK_DATA_BIT BIT(63) + +static int put_shstk_data(u64 __user *addr, u64 data) +{ + if (WARN_ON_ONCE(data & SHSTK_DATA_BIT)) + return -EINVAL; + + /* + * Mark the high bit so that the sigframe can't be processed as a + * return address. + */ + if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT)) + return -EFAULT; + return 0; +} + +static int get_shstk_data(unsigned long *data, unsigned long __user *addr) +{ + unsigned long ldata; + + if (unlikely(get_user(ldata, addr))) + return -EFAULT; + + if (!(ldata & SHSTK_DATA_BIT)) + return -EINVAL; + + *data = ldata & ~SHSTK_DATA_BIT; + + return 0; +} + +static int shstk_push_sigframe(unsigned long *ssp) +{ + unsigned long target_ssp = *ssp; + + /* Token must be aligned */ + if (!IS_ALIGNED(target_ssp, 8)) + return -EINVAL; + + *ssp -= SS_FRAME_SIZE; + if (put_shstk_data((void __user *)*ssp, target_ssp)) + return -EFAULT; + + return 0; +} + +static int shstk_pop_sigframe(unsigned long *ssp) +{ + struct vm_area_struct *vma; + unsigned long token_addr; + bool need_to_check_vma; + int err = 1; + + /* + * It is possible for the SSP to be off the end of a shadow stack by 4 + * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes + * before it, it might be this case, so check that the address being + * read is actually shadow stack. + */ + if (!IS_ALIGNED(*ssp, 8)) + return -EINVAL; + + need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp; + + if (need_to_check_vma) + mmap_read_lock_killable(current->mm); + + err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp); + if (unlikely(err)) + goto out_err; + + if (need_to_check_vma) { + vma = find_vma(current->mm, *ssp); + if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) { + err = -EFAULT; + goto out_err; + } + + mmap_read_unlock(current->mm); + } + + /* Restore SSP aligned? */ + if (unlikely(!IS_ALIGNED(token_addr, 8))) + return -EINVAL; + + /* SSP in userspace? */ + if (unlikely(token_addr >= TASK_SIZE_MAX)) + return -EINVAL; + + *ssp = token_addr; + + return 0; +out_err: + if (need_to_check_vma) + mmap_read_unlock(current->mm); + return err; +} + +int setup_signal_shadow_stack(struct ksignal *ksig) +{ + void __user *restorer = ksig->ka.sa.sa_restorer; + unsigned long ssp; + int err; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + if (!restorer) + return -EINVAL; + + ssp = get_user_shstk_addr(); + if (unlikely(!ssp)) + return -EINVAL; + + err = shstk_push_sigframe(&ssp); + if (unlikely(err)) + return err; + + /* Push restorer address */ + ssp -= SS_FRAME_SIZE; + err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer); + if (unlikely(err)) + return -EFAULT; + + fpregs_lock_and_load(); + wrmsrl(MSR_IA32_PL3_SSP, ssp); + fpregs_unlock(); + + return 0; +} + +int restore_signal_shadow_stack(void) +{ + unsigned long ssp; + int err; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + ssp = get_user_shstk_addr(); + if (unlikely(!ssp)) + return -EINVAL; + + err = shstk_pop_sigframe(&ssp); + if (unlikely(err)) + return err; + + fpregs_lock_and_load(); + wrmsrl(MSR_IA32_PL3_SSP, ssp); + fpregs_unlock(); + + return 0; +} + +void shstk_free(struct task_struct *tsk) +{ + struct thread_shstk *shstk = &tsk->thread.shstk; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !features_enabled(ARCH_SHSTK_SHSTK)) + return; + + /* + * When fork() with CLONE_VM fails, the child (tsk) already has a + * shadow stack allocated, and exit_thread() calls this function to + * free it. In this case the parent (current) and the child share + * the same mm struct. + */ + if (!tsk->mm || tsk->mm != current->mm) + return; + + unmap_shadow_stack(shstk->base, shstk->size); +} + +static int wrss_control(bool enable) +{ + u64 msrval; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -EOPNOTSUPP; + + /* + * Only enable WRSS if shadow stack is enabled. If shadow stack is not + * enabled, WRSS will already be disabled, so don't bother clearing it + * when disabling. + */ + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return -EPERM; + + /* Already enabled/disabled? */ + if (features_enabled(ARCH_SHSTK_WRSS) == enable) + return 0; + + fpregs_lock_and_load(); + rdmsrl(MSR_IA32_U_CET, msrval); + + if (enable) { + features_set(ARCH_SHSTK_WRSS); + msrval |= CET_WRSS_EN; + } else { + features_clr(ARCH_SHSTK_WRSS); + if (!(msrval & CET_WRSS_EN)) + goto unlock; + + msrval &= ~CET_WRSS_EN; + } + + wrmsrl(MSR_IA32_U_CET, msrval); + +unlock: + fpregs_unlock(); + + return 0; +} + +static int shstk_disable(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -EOPNOTSUPP; + + /* Already disabled? */ + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + fpregs_lock_and_load(); + /* Disable WRSS too when disabling shadow stack */ + wrmsrl(MSR_IA32_U_CET, 0); + wrmsrl(MSR_IA32_PL3_SSP, 0); + fpregs_unlock(); + + shstk_free(current); + features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS); + + return 0; +} + +SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags) +{ + bool set_tok = flags & SHADOW_STACK_SET_TOKEN; + unsigned long aligned_size; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -EOPNOTSUPP; + + if (flags & ~SHADOW_STACK_SET_TOKEN) + return -EINVAL; + + /* If there isn't space for a token */ + if (set_tok && size < 8) + return -ENOSPC; + + if (addr && addr < SZ_4G) + return -ERANGE; + + /* + * An overflow would result in attempting to write the restore token + * to the wrong location. Not catastrophic, but just return the right + * error code and block it. + */ + aligned_size = PAGE_ALIGN(size); + if (aligned_size < size) + return -EOVERFLOW; + + return alloc_shstk(addr, aligned_size, size, set_tok); +} + +long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) +{ + unsigned long features = arg2; + + if (option == ARCH_SHSTK_STATUS) { + return put_user(task->thread.features, (unsigned long __user *)arg2); + } + + if (option == ARCH_SHSTK_LOCK) { + task->thread.features_locked |= features; + return 0; + } + + /* Only allow via ptrace */ + if (task != current) { + if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) { + task->thread.features_locked &= ~features; + return 0; + } + return -EINVAL; + } + + /* Do not allow to change locked features */ + if (features & task->thread.features_locked) + return -EPERM; + + /* Only support enabling/disabling one feature at a time. */ + if (hweight_long(features) > 1) + return -EINVAL; + + if (option == ARCH_SHSTK_DISABLE) { + if (features & ARCH_SHSTK_WRSS) + return wrss_control(false); + if (features & ARCH_SHSTK_SHSTK) + return shstk_disable(); + return -EINVAL; + } + + /* Handle ARCH_SHSTK_ENABLE */ + if (features & ARCH_SHSTK_SHSTK) + return shstk_setup(); + if (features & ARCH_SHSTK_WRSS) + return wrss_control(true); + return -EINVAL; +} diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cfeec3ee877e..65fe2094da59 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -40,6 +40,7 @@ #include <asm/syscall.h> #include <asm/sigframe.h> #include <asm/signal.h> +#include <asm/shstk.h> static inline int is_ia32_compat_frame(struct ksignal *ksig) { diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 9027fc088f97..c12624bc82a3 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -402,7 +402,7 @@ Efault: */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); -static_assert(NSIGSEGV == 9); +static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 13a1e6083837..cacf2ede6217 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -175,6 +175,9 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp); uc_flags = frame_uc_flags(regs); + if (setup_signal_shadow_stack(ksig)) + return -EFAULT; + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; @@ -260,6 +263,9 @@ SYSCALL_DEFINE0(rt_sigreturn) if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; + if (restore_signal_shadow_stack()) + goto badframe; + if (restore_altstack(&frame->uc.uc_stack)) goto badframe; @@ -403,7 +409,7 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); -static_assert(NSIGSEGV == 9); +static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 7eb18ca7bd45..6eb06d001bcc 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -135,7 +135,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) */ DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) { - ack_APIC_irq(); + apic_eoi(); cpu_emergency_disable_virtualization(); stop_this_cpu(NULL); } @@ -237,7 +237,7 @@ static void native_stop_other_cpus(int wait) pr_emerg("Shutting down cpus with NMI\n"); for_each_cpu(cpu, &cpus_stop_mask) - apic->send_IPI(cpu, NMI_VECTOR); + __apic_send_IPI(cpu, NMI_VECTOR); } /* * Don't wait longer than 10 ms if the caller didn't @@ -268,7 +268,7 @@ done: */ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi) { - ack_APIC_irq(); + apic_eoi(); trace_reschedule_entry(RESCHEDULE_VECTOR); inc_irq_stat(irq_resched_count); scheduler_ipi(); @@ -277,7 +277,7 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi) DEFINE_IDTENTRY_SYSVEC(sysvec_call_function) { - ack_APIC_irq(); + apic_eoi(); trace_call_function_entry(CALL_FUNCTION_VECTOR); inc_irq_stat(irq_call_count); generic_smp_call_function_interrupt(); @@ -286,7 +286,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_call_function) DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single) { - ack_APIC_irq(); + apic_eoi(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); inc_irq_stat(irq_call_count); generic_smp_call_function_single_interrupt(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e1aa2cd7734b..48e040618731 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -327,14 +327,6 @@ static void notrace start_secondary(void *unused) } /** - * topology_smt_supported - Check whether SMT is supported by the CPUs - */ -bool topology_smt_supported(void) -{ - return smp_num_siblings > 1; -} - -/** * topology_phys_to_logical_pkg - Map a physical package id to a logical * @phys_pkg: The physical package id to map * @@ -422,7 +414,7 @@ found: return 0; } -void __init smp_store_boot_cpu_info(void) +static void __init smp_store_boot_cpu_info(void) { int id = 0; /* CPU 0 */ struct cpuinfo_x86 *c = &cpu_data(id); @@ -587,7 +579,6 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) } -#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC) static inline int x86_sched_itmt_flags(void) { return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0; @@ -611,7 +602,14 @@ static int x86_cluster_flags(void) return cpu_cluster_flags() | x86_sched_itmt_flags(); } #endif -#endif + +static int x86_die_flags(void) +{ + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + return x86_sched_itmt_flags(); + + return 0; +} /* * Set if a package/die has multiple NUMA nodes inside. @@ -632,14 +630,9 @@ static void __init build_sched_topology(void) }; #endif #ifdef CONFIG_SCHED_CLUSTER - /* - * For now, skip the cluster domain on Hybrid. - */ - if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { - x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) - }; - } + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) + }; #endif #ifdef CONFIG_SCHED_MC x86_topology[i++] = (struct sched_domain_topology_level){ @@ -653,7 +646,7 @@ static void __init build_sched_topology(void) */ if (!x86_has_numa_in_package) { x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_cpu_mask, SD_INIT_NAME(DIE) + cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(DIE) }; } @@ -774,44 +767,6 @@ static void impress_friends(void) pr_debug("Before bogocount - setting activated=1\n"); } -void __inquire_remote_apic(int apicid) -{ - unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; - const char * const names[] = { "ID", "VERSION", "SPIV" }; - int timeout; - u32 status; - - pr_info("Inquiring remote APIC 0x%x...\n", apicid); - - for (i = 0; i < ARRAY_SIZE(regs); i++) { - pr_info("... APIC 0x%x %s: ", apicid, names[i]); - - /* - * Wait for idle. - */ - status = safe_apic_wait_icr_idle(); - if (status) - pr_cont("a previous APIC delivery may have failed\n"); - - apic_icr_write(APIC_DM_REMRD | regs[i], apicid); - - timeout = 0; - do { - udelay(100); - status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; - } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); - - switch (status) { - case APIC_ICR_RR_VALID: - status = apic_read(APIC_RRR); - pr_cont("%08x\n", status); - break; - default: - pr_cont("failed\n"); - } - } -} - /* * The Multiprocessor Specification 1.4 (1997) example code suggests * that there should be a 10ms delay between the BSP asserting INIT @@ -1102,9 +1057,8 @@ int native_kick_ap(unsigned int cpu, struct task_struct *tidle) pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); - if (apicid == BAD_APICID || - !physid_isset(apicid, phys_cpu_present_map) || - !apic->apic_id_valid(apicid)) { + if (apicid == BAD_APICID || !physid_isset(apicid, phys_cpu_present_map) || + !apic_id_valid(apicid)) { pr_err("%s: bad cpu %d\n", __func__, cpu); return -EINVAL; } @@ -1187,58 +1141,6 @@ static __init void disable_smp(void) cpumask_set_cpu(0, topology_die_cpumask(0)); } -/* - * Various sanity checks. - */ -static void __init smp_sanity_check(void) -{ - preempt_disable(); - -#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32) - if (def_to_bigsmp && nr_cpu_ids > 8) { - unsigned int cpu; - unsigned nr; - - pr_warn("More than 8 CPUs detected - skipping them\n" - "Use CONFIG_X86_BIGSMP\n"); - - nr = 0; - for_each_present_cpu(cpu) { - if (nr >= 8) - set_cpu_present(cpu, false); - nr++; - } - - nr = 0; - for_each_possible_cpu(cpu) { - if (nr >= 8) - set_cpu_possible(cpu, false); - nr++; - } - - set_nr_cpu_ids(8); - } -#endif - - if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { - pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n", - hard_smp_processor_id()); - - physid_set(hard_smp_processor_id(), phys_cpu_present_map); - } - - /* - * Should not be necessary because the MP table should list the boot - * CPU too, but we do it for the sake of robustness anyway. - */ - if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) { - pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n", - boot_cpu_physical_apicid); - physid_set(hard_smp_processor_id(), phys_cpu_present_map); - } - preempt_enable(); -} - static void __init smp_cpu_index_default(void) { int i; @@ -1298,8 +1200,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) { smp_prepare_cpus_common(); - smp_sanity_check(); - switch (apic_intr_mode) { case APIC_PIC: case APIC_VIRTUAL_WIRE_NO_CONFIG: @@ -1356,7 +1256,7 @@ bool smp_park_other_cpus_in_init(void) if (this_cpu) return false; - for_each_present_cpu(cpu) { + for_each_cpu_and(cpu, &cpus_booted_once_mask, cpu_present_mask) { if (cpu == this_cpu) continue; apicid = apic->cpu_present_to_apicid(cpu); @@ -1435,24 +1335,6 @@ __init void prefill_possible_map(void) { int i, possible; - /* No boot processor was found in mptable or ACPI MADT */ - if (!num_processors) { - if (boot_cpu_has(X86_FEATURE_APIC)) { - int apicid = boot_cpu_physical_apicid; - int cpu = hard_smp_processor_id(); - - pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu); - - /* Make sure boot cpu is enumerated */ - if (apic->cpu_present_to_apicid(0) == BAD_APICID && - apic->apic_id_valid(apicid)) - generic_processor_info(apicid, boot_cpu_apic_version); - } - - if (!num_processors) - num_processors = 1; - } - i = setup_max_cpus ?: 1; if (setup_possible_cpus == -1) { possible = num_processors; @@ -1614,9 +1496,7 @@ void play_dead_common(void) idle_task_exit(); cpuhp_ap_report_dead(); - /* - * With physical CPU hotplug, we should halt the cpu - */ + local_irq_disable(); } diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index b70670a98597..77a9316da435 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -186,6 +186,19 @@ EXPORT_SYMBOL_GPL(arch_static_call_transform); */ bool __static_call_fixup(void *tramp, u8 op, void *dest) { + unsigned long addr = (unsigned long)tramp; + /* + * Not all .return_sites are a static_call trampoline (most are not). + * Check if the 3 bytes after the return are still kernel text, if not, + * then this definitely is not a trampoline and we need not worry + * further. + * + * This avoids the memcmp() below tripping over pagefaults etc.. + */ + if (((addr >> PAGE_SHIFT) != ((addr + 7) >> PAGE_SHIFT)) && + !kernel_text_address(addr + 7)) + return false; + if (memcmp(tramp+5, tramp_ud, 3)) { /* Not a trampoline site, not our problem. */ return false; diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 8cc653ffdccd..c783aeb37dce 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -193,7 +193,11 @@ get_unmapped_area: info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = PAGE_SIZE; + if (!in_32bit_syscall() && (flags & MAP_ABOVE4G)) + info.low_limit = SZ_4G; + else + info.low_limit = PAGE_SIZE; + info.high_limit = get_mmap_base(0); /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 58b1f208eff5..c876f1d36a81 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -77,18 +77,6 @@ DECLARE_BITMAP(system_vectors, NR_VECTORS); -static inline void cond_local_irq_enable(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void cond_local_irq_disable(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); -} - __always_inline int is_valid_bugaddr(unsigned long addr) { if (addr < TASK_SIZE_MAX) @@ -213,81 +201,6 @@ DEFINE_IDTENTRY(exc_overflow) do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL); } -#ifdef CONFIG_X86_KERNEL_IBT - -static __ro_after_init bool ibt_fatal = true; - -extern void ibt_selftest_ip(void); /* code label defined in asm below */ - -enum cp_error_code { - CP_EC = (1 << 15) - 1, - - CP_RET = 1, - CP_IRET = 2, - CP_ENDBR = 3, - CP_RSTRORSSP = 4, - CP_SETSSBSY = 5, - - CP_ENCL = 1 << 15, -}; - -DEFINE_IDTENTRY_ERRORCODE(exc_control_protection) -{ - if (!cpu_feature_enabled(X86_FEATURE_IBT)) { - pr_err("Unexpected #CP\n"); - BUG(); - } - - if (WARN_ON_ONCE(user_mode(regs) || (error_code & CP_EC) != CP_ENDBR)) - return; - - if (unlikely(regs->ip == (unsigned long)&ibt_selftest_ip)) { - regs->ax = 0; - return; - } - - pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs)); - if (!ibt_fatal) { - printk(KERN_DEFAULT CUT_HERE); - __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL); - return; - } - BUG(); -} - -/* Must be noinline to ensure uniqueness of ibt_selftest_ip. */ -noinline bool ibt_selftest(void) -{ - unsigned long ret; - - asm (" lea ibt_selftest_ip(%%rip), %%rax\n\t" - ANNOTATE_RETPOLINE_SAFE - " jmp *%%rax\n\t" - "ibt_selftest_ip:\n\t" - UNWIND_HINT_FUNC - ANNOTATE_NOENDBR - " nop\n\t" - - : "=a" (ret) : : "memory"); - - return !ret; -} - -static int __init ibt_setup(char *str) -{ - if (!strcmp(str, "off")) - setup_clear_cpu_cap(X86_FEATURE_IBT); - - if (!strcmp(str, "warn")) - ibt_fatal = false; - - return 1; -} - -__setup("ibt=", ibt_setup); - -#endif /* CONFIG_X86_KERNEL_IBT */ - #ifdef CONFIG_X86_F00F_BUG void handle_invalid_op(struct pt_regs *regs) #else @@ -697,9 +610,10 @@ static bool try_fixup_enqcmd_gp(void) } static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr, - unsigned long error_code, const char *str) + unsigned long error_code, const char *str, + unsigned long address) { - if (fixup_exception(regs, trapnr, error_code, 0)) + if (fixup_exception(regs, trapnr, error_code, address)) return true; current->thread.error_code = error_code; @@ -759,7 +673,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) goto exit; } - if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc)) + if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc, 0)) goto exit; if (error_code) @@ -1357,17 +1271,20 @@ DEFINE_IDTENTRY(exc_device_not_available) #define VE_FAULT_STR "VE fault" -static void ve_raise_fault(struct pt_regs *regs, long error_code) +static void ve_raise_fault(struct pt_regs *regs, long error_code, + unsigned long address) { if (user_mode(regs)) { gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR); return; } - if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR)) + if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, + VE_FAULT_STR, address)) { return; + } - die_addr(VE_FAULT_STR, regs, error_code, 0); + die_addr(VE_FAULT_STR, regs, error_code, address); } /* @@ -1431,7 +1348,7 @@ DEFINE_IDTENTRY(exc_virtualization_exception) * it successfully, treat it as #GP(0) and handle it. */ if (!tdx_handle_virt_exception(regs, &ve)) - ve_raise_fault(regs, 0); + ve_raise_fault(regs, 0, ve.gla); cond_local_irq_disable(regs); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3425c6a943e4..15f97c0abc9d 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1258,7 +1258,7 @@ static void __init check_system_tsc_reliable(void) if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && boot_cpu_has(X86_FEATURE_TSC_ADJUST) && - nr_online_nodes <= 2) + nr_online_nodes <= 4) tsc_disable_clocksource_watchdog(); } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 03c885d3640f..f15fb71f280e 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -133,18 +133,30 @@ SECTIONS KPROBES_TEXT SOFTIRQENTRY_TEXT #ifdef CONFIG_RETPOLINE - __indirect_thunk_start = .; - *(.text.__x86.*) - __indirect_thunk_end = .; + *(.text..__x86.indirect_thunk) + *(.text..__x86.return_thunk) #endif STATIC_CALL_TEXT ALIGN_ENTRY_TEXT_BEGIN +#ifdef CONFIG_CPU_SRSO + *(.text..__x86.rethunk_untrain) +#endif + ENTRY_TEXT + +#ifdef CONFIG_CPU_SRSO + /* + * See the comment above srso_alias_untrain_ret()'s + * definition. + */ + . = srso_alias_untrain_ret | (1 << 2) | (1 << 8) | (1 << 14) | (1 << 20); + *(.text..__x86.rethunk_safe) +#endif ALIGN_ENTRY_TEXT_END *(.gnu.warning) - } :text =0xcccc + } :text = 0xcccccccc /* End of text section, which should occupy whole number of pages */ _etext = .; @@ -509,7 +521,24 @@ INIT_PER_CPU(irq_stack_backing_store); #endif #ifdef CONFIG_RETHUNK -. = ASSERT((__x86_return_thunk & 0x3f) == 0, "__x86_return_thunk not cacheline-aligned"); +. = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned"); +. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned"); +#endif + +#ifdef CONFIG_CPU_SRSO +/* + * GNU ld cannot do XOR until 2.41. + * https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=f6f78318fca803c4907fb8d7f6ded8295f1947b1 + * + * LLVM lld cannot do XOR until lld-17. + * https://github.com/llvm/llvm-project/commit/fae96104d4378166cbe5c875ef8ed808a356f3fb + * + * Instead do: (A | B) - (A & B) in order to compute the XOR + * of the two function addresses: + */ +. = ASSERT(((ABSOLUTE(srso_alias_untrain_ret) | srso_alias_safe_ret) - + (ABSOLUTE(srso_alias_untrain_ret) & srso_alias_safe_ret)) == ((1 << 2) | (1 << 8) | (1 << 14) | (1 << 20)), + "SRSO function pair won't alias"); #endif #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 796cfaa46bfa..65e96b76c423 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -129,7 +129,7 @@ static void __init vsmp_cap_cpus(void) static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) { - return hard_smp_processor_id() >> index_msb; + return read_apic_id() >> index_msb; } static void vsmp_apic_post_init(void) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 89ca7f4c1464..ed90f148140d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -101,7 +101,7 @@ config X86_SGX_KVM config KVM_AMD tristate "KVM for AMD processors support" - depends on KVM + depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON) help Provides support for KVM on AMD processors equipped with the AMD-V (SVM) extensions. @@ -138,6 +138,19 @@ config KVM_XEN If in doubt, say "N". +config KVM_PROVE_MMU + bool "Prove KVM MMU correctness" + depends on DEBUG_KERNEL + depends on KVM + depends on EXPERT + help + Enables runtime assertions in KVM's MMU that are too costly to enable + in anything remotely resembling a production environment, e.g. this + gates code that verifies a to-be-freed page table doesn't have any + present SPTEs. + + If in doubt, say "N". + config KVM_EXTERNAL_WRITE_TRACKING bool diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7f4d13383cf2..0544e30b4946 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> +#include "linux/lockdep.h" #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> @@ -84,6 +85,18 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( struct kvm_cpuid_entry2 *e; int i; + /* + * KVM has a semi-arbitrary rule that querying the guest's CPUID model + * with IRQs disabled is disallowed. The CPUID model can legitimately + * have over one hundred entries, i.e. the lookup is slow, and IRQs are + * typically disabled in KVM only when KVM is in a performance critical + * path, e.g. the core VM-Enter/VM-Exit run loop. Nothing will break + * if this rule is violated, this assertion is purely to flag potential + * performance issues. If this fires, consider moving the lookup out + * of the hotpath, e.g. by caching information during CPUID updates. + */ + lockdep_assert_irqs_enabled(); + for (i = 0; i < nent; i++) { e = &entries[i]; @@ -312,6 +325,27 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; + bool allow_gbpages; + + BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES > KVM_MAX_NR_GOVERNED_FEATURES); + bitmap_zero(vcpu->arch.governed_features.enabled, + KVM_MAX_NR_GOVERNED_FEATURES); + + /* + * If TDP is enabled, let the guest use GBPAGES if they're supported in + * hardware. The hardware page walker doesn't let KVM disable GBPAGES, + * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA + * walk for performance and complexity reasons. Not to mention KVM + * _can't_ solve the problem because GVA->GPA walks aren't visible to + * KVM once a TDP translation is installed. Mimic hardware behavior so + * that KVM's is at least consistent, i.e. doesn't randomly inject #PF. + * If TDP is disabled, honor *only* guest CPUID as KVM has full control + * and can install smaller shadow pages if the host lacks 1GiB support. + */ + allow_gbpages = tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) : + guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); + if (allow_gbpages) + kvm_governed_feature_set(vcpu, X86_FEATURE_GBPAGES); best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { @@ -647,7 +681,8 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, - F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) + F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) | + F(AMX_COMPLEX) ); kvm_cpu_cap_mask(CPUID_D_1_EAX, @@ -729,6 +764,9 @@ void kvm_set_cpu_caps(void) F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ ); + if (cpu_feature_enabled(X86_FEATURE_SRSO_NO)) + kvm_cpu_cap_set(X86_FEATURE_SRSO_NO); + kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX, F(PERFMON_V2) ); @@ -1151,6 +1189,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) cpuid_entry_override(entry, CPUID_8000_0001_EDX); cpuid_entry_override(entry, CPUID_8000_0001_ECX); break; + case 0x80000005: + /* Pass host L1 cache and TLB info. */ + break; case 0x80000006: /* Drop reserved bits, pass host L2 cache and TLB info. */ entry->edx &= ~GENMASK(17, 16); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index b1658c0de847..284fa4704553 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -232,4 +232,50 @@ static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, return vcpu->arch.pv_cpuid.features & (1u << kvm_feature); } +enum kvm_governed_features { +#define KVM_GOVERNED_FEATURE(x) KVM_GOVERNED_##x, +#include "governed_features.h" + KVM_NR_GOVERNED_FEATURES +}; + +static __always_inline int kvm_governed_feature_index(unsigned int x86_feature) +{ + switch (x86_feature) { +#define KVM_GOVERNED_FEATURE(x) case x: return KVM_GOVERNED_##x; +#include "governed_features.h" + default: + return -1; + } +} + +static __always_inline bool kvm_is_governed_feature(unsigned int x86_feature) +{ + return kvm_governed_feature_index(x86_feature) >= 0; +} + +static __always_inline void kvm_governed_feature_set(struct kvm_vcpu *vcpu, + unsigned int x86_feature) +{ + BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature)); + + __set_bit(kvm_governed_feature_index(x86_feature), + vcpu->arch.governed_features.enabled); +} + +static __always_inline void kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu, + unsigned int x86_feature) +{ + if (kvm_cpu_cap_has(x86_feature) && guest_cpuid_has(vcpu, x86_feature)) + kvm_governed_feature_set(vcpu, x86_feature); +} + +static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu, + unsigned int x86_feature) +{ + BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature)); + + return test_bit(kvm_governed_feature_index(x86_feature), + vcpu->arch.governed_features.enabled); +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 936a397a08cd..2673cd5c46cb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1799,13 +1799,11 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) op->addr.mem, &op->val, op->bytes); - break; case OP_MEM_STR: return segmented_write(ctxt, op->addr.mem, op->data, op->bytes * op->count); - break; case OP_XMM: kvm_write_sse_reg(op->addr.xmm, &op->vec_val); break; diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h new file mode 100644 index 000000000000..423a73395c10 --- /dev/null +++ b/arch/x86/kvm/governed_features.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(KVM_GOVERNED_FEATURE) || defined(KVM_GOVERNED_X86_FEATURE) +BUILD_BUG() +#endif + +#define KVM_GOVERNED_X86_FEATURE(x) KVM_GOVERNED_FEATURE(X86_FEATURE_##x) + +KVM_GOVERNED_X86_FEATURE(GBPAGES) +KVM_GOVERNED_X86_FEATURE(XSAVES) +KVM_GOVERNED_X86_FEATURE(VMX) +KVM_GOVERNED_X86_FEATURE(NRIPS) +KVM_GOVERNED_X86_FEATURE(TSCRATEMSR) +KVM_GOVERNED_X86_FEATURE(V_VMSAVE_VMLOAD) +KVM_GOVERNED_X86_FEATURE(LBRV) +KVM_GOVERNED_X86_FEATURE(PAUSEFILTER) +KVM_GOVERNED_X86_FEATURE(PFTHRESHOLD) +KVM_GOVERNED_X86_FEATURE(VGIF) +KVM_GOVERNED_X86_FEATURE(VNMI) + +#undef KVM_GOVERNED_X86_FEATURE +#undef KVM_GOVERNED_FEATURE diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index b28fd020066f..7c2dac6824e2 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1293,7 +1293,6 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_VP_ASSIST_PAGE: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_APIC_ACCESS_AVAILABLE; - break; case HV_X64_MSR_TSC_FREQUENCY: case HV_X64_MSR_APIC_FREQUENCY: return hv_vcpu->cpuid_cache.features_eax & diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index ab65f3a47dfd..be7aeb9b8ea3 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -213,7 +213,6 @@ struct x86_emulate_ops { bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only); - bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 113ca9661ab2..dcd60b39e794 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -376,7 +376,8 @@ void kvm_recalculate_apic_map(struct kvm *kvm) struct kvm_vcpu *vcpu; unsigned long i; u32 max_id = 255; /* enough space for any xAPIC ID */ - bool xapic_id_mismatch = false; + bool xapic_id_mismatch; + int r; /* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */ if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN) @@ -386,9 +387,14 @@ void kvm_recalculate_apic_map(struct kvm *kvm) "Dirty APIC map without an in-kernel local APIC"); mutex_lock(&kvm->arch.apic_map_lock); + +retry: /* - * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map - * (if clean) or the APIC registers (if dirty). + * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map (if clean) + * or the APIC registers (if dirty). Note, on retry the map may have + * not yet been marked dirty by whatever task changed a vCPU's x2APIC + * ID, i.e. the map may still show up as in-progress. In that case + * this task still needs to retry and complete its calculation. */ if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty, DIRTY, UPDATE_IN_PROGRESS) == CLEAN) { @@ -397,6 +403,15 @@ void kvm_recalculate_apic_map(struct kvm *kvm) return; } + /* + * Reset the mismatch flag between attempts so that KVM does the right + * thing if a vCPU changes its xAPIC ID, but do NOT reset max_id, i.e. + * keep max_id strictly increasing. Disallowing max_id from shrinking + * ensures KVM won't get stuck in an infinite loop, e.g. if the vCPU + * with the highest x2APIC ID is toggling its APIC on and off. + */ + xapic_id_mismatch = false; + kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_apic_present(vcpu)) max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); @@ -415,9 +430,15 @@ void kvm_recalculate_apic_map(struct kvm *kvm) if (!kvm_apic_present(vcpu)) continue; - if (kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch)) { + r = kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch); + if (r) { kvfree(new); new = NULL; + if (r == -E2BIG) { + cond_resched(); + goto retry; + } + goto out; } @@ -637,16 +658,22 @@ bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) *max_irr = -1; for (i = vec = 0; i <= 7; i++, vec += 32) { + u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10); + + irr_val = *p_irr; pir_val = READ_ONCE(pir[i]); - irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10)); + if (pir_val) { + pir_val = xchg(&pir[i], 0); + prev_irr_val = irr_val; - irr_val |= xchg(&pir[i], 0); - *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val; - if (prev_irr_val != irr_val) { - max_updated_irr = - __fls(irr_val ^ prev_irr_val) + vec; - } + do { + irr_val = prev_irr_val | pir_val; + } while (prev_irr_val != irr_val && + !try_cmpxchg(p_irr, &prev_irr_val, irr_val)); + + if (prev_irr_val != irr_val) + max_updated_irr = __fls(irr_val ^ prev_irr_val) + vec; } if (irr_val) *max_irr = __fls(irr_val) + vec; @@ -660,8 +687,11 @@ EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr) { struct kvm_lapic *apic = vcpu->arch.apic; + bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr); - return __kvm_apic_update_irr(pir, apic->regs, max_irr); + if (unlikely(!apic->apicv_active && irr_updated)) + apic->irr_pending = true; + return irr_updated; } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 92d5a1924fc1..253fb2093d5d 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -121,6 +121,8 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu); +void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, + int bytes); static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index ec169f5c7dce..e1d011c67cc6 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -25,6 +25,7 @@ #include "kvm_cache_regs.h" #include "smm.h" #include "kvm_emulate.h" +#include "page_track.h" #include "cpuid.h" #include "spte.h" @@ -53,7 +54,7 @@ #include <asm/io.h> #include <asm/set_memory.h> #include <asm/vmx.h> -#include <asm/kvm_page_track.h> + #include "trace.h" extern bool itlb_multihit_kvm_mitigation; @@ -115,11 +116,6 @@ static int max_huge_page_level __read_mostly; static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; -#ifdef MMU_DEBUG -bool dbg = 0; -module_param(dbg, bool, 0644); -#endif - #define PTE_PREFETCH_NUM 8 #include <trace/events/kvm.h> @@ -278,16 +274,12 @@ static inline bool kvm_available_flush_remote_tlbs_range(void) return kvm_x86_ops.flush_remote_tlbs_range; } -void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, - gfn_t nr_pages) +int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { - int ret = -EOPNOTSUPP; + if (!kvm_x86_ops.flush_remote_tlbs_range) + return -EOPNOTSUPP; - if (kvm_x86_ops.flush_remote_tlbs_range) - ret = static_call(kvm_x86_flush_remote_tlbs_range)(kvm, start_gfn, - nr_pages); - if (ret) - kvm_flush_remote_tlbs(kvm); + return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages); } static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index); @@ -490,7 +482,7 @@ retry: */ static void mmu_spte_set(u64 *sptep, u64 new_spte) { - WARN_ON(is_shadow_present_pte(*sptep)); + WARN_ON_ONCE(is_shadow_present_pte(*sptep)); __set_spte(sptep, new_spte); } @@ -502,7 +494,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; - WARN_ON(!is_shadow_present_pte(new_spte)); + WARN_ON_ONCE(!is_shadow_present_pte(new_spte)); check_spte_writable_invariants(new_spte); if (!is_shadow_present_pte(old_spte)) { @@ -515,7 +507,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) else old_spte = __update_clear_spte_slow(sptep, new_spte); - WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); + WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); return old_spte; } @@ -597,7 +589,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) * by a refcounted page, the refcount is elevated. */ page = kvm_pfn_to_refcounted_page(pfn); - WARN_ON(page && !page_count(page)); + WARN_ON_ONCE(page && !page_count(page)); if (is_accessed_spte(old_spte)) kvm_set_pfn_accessed(pfn); @@ -812,7 +804,7 @@ static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot, for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->disallow_lpage += count; - WARN_ON(linfo->disallow_lpage < 0); + WARN_ON_ONCE(linfo->disallow_lpage < 0); } } @@ -839,8 +831,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) /* the non-leaf shadow pages are keeping readonly. */ if (sp->role.level > PG_LEVEL_4K) - return kvm_slot_page_track_add_page(kvm, slot, gfn, - KVM_PAGE_TRACK_WRITE); + return __kvm_write_track_add_gfn(kvm, slot, gfn); kvm_mmu_gfn_disallow_lpage(slot, gfn); @@ -886,8 +877,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); if (sp->role.level > PG_LEVEL_4K) - return kvm_slot_page_track_remove_page(kvm, slot, gfn, - KVM_PAGE_TRACK_WRITE); + return __kvm_write_track_remove_gfn(kvm, slot, gfn); kvm_mmu_gfn_allow_lpage(slot, gfn); } @@ -941,10 +931,8 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, int count = 0; if (!rmap_head->val) { - rmap_printk("%p %llx 0->1\n", spte, *spte); rmap_head->val = (unsigned long)spte; } else if (!(rmap_head->val & 1)) { - rmap_printk("%p %llx 1->many\n", spte, *spte); desc = kvm_mmu_memory_cache_alloc(cache); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; @@ -953,7 +941,6 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, rmap_head->val = (unsigned long)desc | 1; ++count; } else { - rmap_printk("%p %llx many->many\n", spte, *spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); count = desc->tail_count + desc->spte_count; @@ -973,7 +960,8 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, return count; } -static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, +static void pte_list_desc_remove_entry(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, struct pte_list_desc *desc, int i) { struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); @@ -984,7 +972,7 @@ static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, * when adding an entry and the previous head is full, and heads are * removed (this flow) when they become empty. */ - BUG_ON(j < 0); + KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm); /* * Replace the to-be-freed SPTE with the last valid entry from the head @@ -1009,35 +997,34 @@ static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, mmu_free_pte_list_desc(head_desc); } -static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) +static void pte_list_remove(struct kvm *kvm, u64 *spte, + struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; int i; - if (!rmap_head->val) { - pr_err("%s: %p 0->BUG\n", __func__, spte); - BUG(); - } else if (!(rmap_head->val & 1)) { - rmap_printk("%p 1->0\n", spte); - if ((u64 *)rmap_head->val != spte) { - pr_err("%s: %p 1->BUG\n", __func__, spte); - BUG(); - } + if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm)) + return; + + if (!(rmap_head->val & 1)) { + if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm)) + return; + rmap_head->val = 0; } else { - rmap_printk("%p many->many\n", spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); while (desc) { for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { - pte_list_desc_remove_entry(rmap_head, desc, i); + pte_list_desc_remove_entry(kvm, rmap_head, + desc, i); return; } } desc = desc->more; } - pr_err("%s: %p many->many\n", __func__, spte); - BUG(); + + KVM_BUG_ON_DATA_CORRUPTION(true, kvm); } } @@ -1045,7 +1032,7 @@ static void kvm_zap_one_rmap_spte(struct kvm *kvm, struct kvm_rmap_head *rmap_head, u64 *sptep) { mmu_spte_clear_track_bits(kvm, sptep); - pte_list_remove(sptep, rmap_head); + pte_list_remove(kvm, sptep, rmap_head); } /* Return true if at least one SPTE was zapped, false otherwise */ @@ -1120,7 +1107,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) slot = __gfn_to_memslot(slots, gfn); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - pte_list_remove(spte, rmap_head); + pte_list_remove(kvm, spte, rmap_head); } /* @@ -1212,7 +1199,7 @@ static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) struct kvm_mmu_page *sp; sp = sptep_to_sp(sptep); - WARN_ON(sp->role.level == PG_LEVEL_4K); + WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); @@ -1241,8 +1228,6 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) !(pt_protect && is_mmu_writable_spte(spte))) return false; - rmap_printk("spte %p %llx\n", sptep, *sptep); - if (pt_protect) spte &= ~shadow_mmu_writable_mask; spte = spte & ~PT_WRITABLE_MASK; @@ -1267,9 +1252,7 @@ static bool spte_clear_dirty(u64 *sptep) { u64 spte = *sptep; - rmap_printk("spte %p %llx\n", sptep, *sptep); - - MMU_WARN_ON(!spte_ad_enabled(spte)); + KVM_MMU_WARN_ON(!spte_ad_enabled(spte)); spte &= ~shadow_dirty_mask; return mmu_spte_update(sptep, spte); } @@ -1475,14 +1458,11 @@ static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, u64 new_spte; kvm_pfn_t new_pfn; - WARN_ON(pte_huge(pte)); + WARN_ON_ONCE(pte_huge(pte)); new_pfn = pte_pfn(pte); restart: for_each_rmap_spte(rmap_head, &iter, sptep) { - rmap_printk("spte %p %llx gfn %llx (%d)\n", - sptep, *sptep, gfn, level); - need_flush = true; if (pte_write(pte)) { @@ -1588,7 +1568,7 @@ static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, range->start, range->end - 1, &iterator) ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn, - iterator.level, range->pte); + iterator.level, range->arg.pte); return ret; } @@ -1710,21 +1690,19 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return young; } -#ifdef MMU_DEBUG -static int is_empty_shadow_page(u64 *spt) +static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp) { - u64 *pos; - u64 *end; +#ifdef CONFIG_KVM_PROVE_MMU + int i; - for (pos = spt, end = pos + SPTE_ENT_PER_PAGE; pos != end; pos++) - if (is_shadow_present_pte(*pos)) { - printk(KERN_ERR "%s: %p %llx\n", __func__, - pos, *pos); - return 0; - } - return 1; -} + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { + if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i]))) + pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free", + sp->spt[i], &sp->spt[i], + kvm_mmu_page_get_gfn(sp, i)); + } #endif +} /* * This value is the sum of all of the kvm instances's @@ -1752,7 +1730,8 @@ static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) { - MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); + kvm_mmu_check_sptes_at_free(sp); + hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); @@ -1775,16 +1754,16 @@ static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, pte_list_add(cache, parent_pte, &sp->parent_ptes); } -static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, +static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *parent_pte) { - pte_list_remove(parent_pte, &sp->parent_ptes); + pte_list_remove(kvm, parent_pte, &sp->parent_ptes); } -static void drop_parent_pte(struct kvm_mmu_page *sp, +static void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *parent_pte) { - mmu_page_remove_parent_pte(sp, parent_pte); + mmu_page_remove_parent_pte(kvm, sp, parent_pte); mmu_spte_clear_no_track(parent_pte); } @@ -1840,7 +1819,7 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx) { --sp->unsync_children; - WARN_ON((int)sp->unsync_children < 0); + WARN_ON_ONCE((int)sp->unsync_children < 0); __clear_bit(idx, sp->unsync_child_bitmap); } @@ -1898,7 +1877,7 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - WARN_ON(!sp->unsync); + WARN_ON_ONCE(!sp->unsync); trace_kvm_mmu_sync_page(sp); sp->unsync = 0; --kvm->stat.mmu_unsync; @@ -2073,11 +2052,11 @@ static int mmu_pages_first(struct kvm_mmu_pages *pvec, if (pvec->nr == 0) return 0; - WARN_ON(pvec->page[0].idx != INVALID_INDEX); + WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX); sp = pvec->page[0].sp; level = sp->role.level; - WARN_ON(level == PG_LEVEL_4K); + WARN_ON_ONCE(level == PG_LEVEL_4K); parents->parent[level-2] = sp; @@ -2099,7 +2078,7 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents) if (!sp) return; - WARN_ON(idx == INVALID_INDEX); + WARN_ON_ONCE(idx == INVALID_INDEX); clear_unsync_child_bit(sp, idx); level++; } while (!sp->unsync_children); @@ -2220,7 +2199,7 @@ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, if (ret < 0) break; - WARN_ON(!list_empty(&invalid_list)); + WARN_ON_ONCE(!list_empty(&invalid_list)); if (ret > 0) kvm_flush_remote_tlbs(kvm); } @@ -2499,7 +2478,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (child->role.access == direct_access) return; - drop_parent_pte(child, sptep); + drop_parent_pte(vcpu->kvm, child, sptep); kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep); } } @@ -2517,7 +2496,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, drop_spte(kvm, spte); } else { child = spte_to_child_sp(pte); - drop_parent_pte(child, spte); + drop_parent_pte(kvm, child, spte); /* * Recursively zap nested TDP SPs, parentless SPs are @@ -2548,13 +2527,13 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm, return zapped; } -static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp) +static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *sptep; struct rmap_iterator iter; while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) - drop_parent_pte(sp, sptep); + drop_parent_pte(kvm, sp, sptep); } static int mmu_zap_unsync_children(struct kvm *kvm, @@ -2593,7 +2572,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, ++kvm->stat.mmu_shadow_zapped; *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); - kvm_mmu_unlink_parents(sp); + kvm_mmu_unlink_parents(kvm, sp); /* Zapping children means active_mmu_pages has become unstable. */ list_unstable = *nr_zapped; @@ -2675,7 +2654,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, kvm_flush_remote_tlbs(kvm); list_for_each_entry_safe(sp, nsp, invalid_list, link) { - WARN_ON(!sp->role.invalid || sp->root_count); + WARN_ON_ONCE(!sp->role.invalid || sp->root_count); kvm_mmu_free_shadow_page(sp); } } @@ -2775,12 +2754,9 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) LIST_HEAD(invalid_list); int r; - pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; write_lock(&kvm->mmu_lock); for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { - pgprintk("%s: gfn %llx role %x\n", __func__, gfn, - sp->role.word); r = 1; kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } @@ -2831,7 +2807,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, * track machinery is used to write-protect upper-level shadow pages, * i.e. this guards the role.level == 4K assertion below! */ - if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_gfn_is_write_tracked(kvm, slot, gfn)) return -EPERM; /* @@ -2873,7 +2849,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, continue; } - WARN_ON(sp->role.level != PG_LEVEL_4K); + WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K); kvm_unsync_page(kvm, sp); } if (locked) @@ -2938,9 +2914,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, bool prefetch = !fault || fault->prefetch; bool write_fault = fault && fault->write; - pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, - *sptep, write_fault, gfn); - if (unlikely(is_noslot_pfn(pfn))) { vcpu->stat.pf_mmio_spte_created++; mark_mmio_spte(vcpu, sptep, gfn, pte_access); @@ -2957,11 +2930,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, u64 pte = *sptep; child = spte_to_child_sp(pte); - drop_parent_pte(child, sptep); + drop_parent_pte(vcpu->kvm, child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { - pgprintk("hfn old %llx new %llx\n", - spte_to_pfn(*sptep), pfn); drop_spte(vcpu->kvm, sptep); flush = true; } else @@ -2986,8 +2957,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, if (flush) kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level); - pgprintk("%s: setting spte %llx\n", __func__, *sptep); - if (!was_rmapped) { WARN_ON_ONCE(ret == RET_PF_SPURIOUS); rmap_add(vcpu, slot, sptep, gfn, pte_access); @@ -3033,7 +3002,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *spte, *start = NULL; int i; - WARN_ON(!sp->role.direct); + WARN_ON_ONCE(!sp->role.direct); i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; @@ -3574,12 +3543,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (!VALID_PAGE(*root_hpa)) return; - /* - * The "root" may be a special root, e.g. a PAE entry, treat it as a - * SPTE to ensure any non-PA bits are dropped. - */ - sp = spte_to_child_sp(*root_hpa); - if (WARN_ON(!sp)) + sp = root_to_sp(*root_hpa); + if (WARN_ON_ONCE(!sp)) return; if (is_tdp_mmu_page(sp)) @@ -3624,7 +3589,9 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, &invalid_list); if (free_active_root) { - if (to_shadow_page(mmu->root.hpa)) { + if (kvm_mmu_is_dummy_root(mmu->root.hpa)) { + /* Nothing to cleanup for dummy roots. */ + } else if (root_to_sp(mmu->root.hpa)) { mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list); } else if (mmu->pae_root) { for (i = 0; i < 4; ++i) { @@ -3648,6 +3615,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) { unsigned long roots_to_free = 0; + struct kvm_mmu_page *sp; hpa_t root_hpa; int i; @@ -3662,8 +3630,8 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) if (!VALID_PAGE(root_hpa)) continue; - if (!to_shadow_page(root_hpa) || - to_shadow_page(root_hpa)->role.guest_mode) + sp = root_to_sp(root_hpa); + if (!sp || sp->role.guest_mode) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); } @@ -3671,19 +3639,6 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) } EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots); - -static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) -{ - int ret = 0; - - if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - ret = 1; - } - - return ret; -} - static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant, u8 level) { @@ -3821,8 +3776,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu); root_gfn = root_pgd >> PAGE_SHIFT; - if (mmu_check_root(vcpu, root_gfn)) - return 1; + if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { + mmu->root.hpa = kvm_mmu_get_dummy_root(); + return 0; + } /* * On SVM, reading PDPTRs might access guest memory, which might fault @@ -3834,8 +3791,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (!(pdptrs[i] & PT_PRESENT_MASK)) continue; - if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT)) - return 1; + if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT)) + pdptrs[i] = 0; } } @@ -4002,7 +3959,7 @@ static bool is_unsync_root(hpa_t root) { struct kvm_mmu_page *sp; - if (!VALID_PAGE(root)) + if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root)) return false; /* @@ -4018,7 +3975,7 @@ static bool is_unsync_root(hpa_t root) * requirement isn't satisfied. */ smp_rmb(); - sp = to_shadow_page(root); + sp = root_to_sp(root); /* * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the @@ -4048,11 +4005,12 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu->root.hpa; - sp = to_shadow_page(root); if (!is_unsync_root(root)) return; + sp = root_to_sp(root); + write_lock(&vcpu->kvm->mmu_lock); mmu_sync_children(vcpu, sp, true); write_unlock(&vcpu->kvm->mmu_lock); @@ -4194,7 +4152,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) return RET_PF_EMULATE; reserved = get_mmio_spte(vcpu, addr, &spte); - if (WARN_ON(reserved)) + if (WARN_ON_ONCE(reserved)) return -EINVAL; if (is_mmio_spte(spte)) { @@ -4232,7 +4190,7 @@ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, * guest is writing the page which is write tracked which can * not be fixed by page fault handler. */ - if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn)) return true; return false; @@ -4382,7 +4340,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, static bool is_page_fault_stale(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa); + struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa); /* Special roots, e.g. pae_root, are not backed by shadow pages. */ if (sp && is_obsolete_sp(vcpu->kvm, sp)) @@ -4407,6 +4365,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault { int r; + /* Dummy roots are used only for shadowing bad guest roots. */ + if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) + return RET_PF_RETRY; + if (page_fault_handle_page_track(vcpu, fault)) return RET_PF_EMULATE; @@ -4443,8 +4405,6 @@ out_unlock: static int nonpaging_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code); - /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ fault->max_level = PG_LEVEL_2M; return direct_page_fault(vcpu, fault); @@ -4562,9 +4522,19 @@ static void nonpaging_init_context(struct kvm_mmu *context) static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, union kvm_mmu_page_role role) { - return (role.direct || pgd == root->pgd) && - VALID_PAGE(root->hpa) && - role.word == to_shadow_page(root->hpa)->role.word; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(root->hpa)) + return false; + + if (!role.direct && pgd != root->pgd) + return false; + + sp = root_to_sp(root->hpa); + if (WARN_ON_ONCE(!sp)) + return false; + + return role.word == sp->role.word; } /* @@ -4634,11 +4604,10 @@ static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu, gpa_t new_pgd, union kvm_mmu_page_role new_role) { /* - * For now, limit the caching to 64-bit hosts+VMs in order to avoid - * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs - * later if necessary. + * Limit reuse to 64-bit hosts+VMs without "special" roots in order to + * avoid having to deal with PDPTEs and other complexities. */ - if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa)) + if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa)) kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT); if (VALID_PAGE(mmu->root.hpa)) @@ -4684,9 +4653,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) * If this is a direct root page, it doesn't have a write flooding * count. Otherwise, clear the write flooding count. */ - if (!new_role.direct) - __clear_sp_write_flooding_count( - to_shadow_page(vcpu->arch.mmu->root.hpa)); + if (!new_role.direct) { + struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa); + + if (!WARN_ON_ONCE(!sp)) + __clear_sp_write_flooding_count(sp); + } } EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); @@ -4808,28 +4780,13 @@ static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, } } -static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu) -{ - /* - * If TDP is enabled, let the guest use GBPAGES if they're supported in - * hardware. The hardware page walker doesn't let KVM disable GBPAGES, - * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA - * walk for performance and complexity reasons. Not to mention KVM - * _can't_ solve the problem because GVA->GPA walks aren't visible to - * KVM once a TDP translation is installed. Mimic hardware behavior so - * that KVM's is at least consistent, i.e. doesn't randomly inject #PF. - */ - return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) : - guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); -} - static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { __reset_rsvds_bits_mask(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, context->cpu_role.base.level, is_efer_nx(context), - guest_can_use_gbpages(vcpu), + guest_can_use(vcpu, X86_FEATURE_GBPAGES), is_cr4_pse(context), guest_cpuid_is_amd_or_hygon(vcpu)); } @@ -4906,7 +4863,8 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->root_role.level, context->root_role.efer_nx, - guest_can_use_gbpages(vcpu), is_pse, is_amd); + guest_can_use(vcpu, X86_FEATURE_GBPAGES), + is_pse, is_amd); if (!shadow_me_mask) return; @@ -5467,8 +5425,8 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) * physical address properties) in a single VM would require tracking * all relevant CPUID information in kvm_mmu_page_role. That is very * undesirable as it would increase the memory requirements for - * gfn_track (see struct kvm_mmu_page_role comments). For now that - * problem is swept under the rug; KVM's CPUID API is horrific and + * gfn_write_track (see struct kvm_mmu_page_role comments). For now + * that problem is swept under the rug; KVM's CPUID API is horrific and * it's all but impossible to solve it without introducing a new API. */ vcpu->arch.root_mmu.root_role.word = 0; @@ -5531,9 +5489,9 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu->kvm; kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL); - WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa)); + WARN_ON_ONCE(VALID_PAGE(vcpu->arch.root_mmu.root.hpa)); kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); - WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa)); + WARN_ON_ONCE(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa)); vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); } @@ -5546,16 +5504,21 @@ static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa) /* * When freeing obsolete roots, treat roots as obsolete if they don't - * have an associated shadow page. This does mean KVM will get false + * have an associated shadow page, as it's impossible to determine if + * such roots are fresh or stale. This does mean KVM will get false * positives and free roots that don't strictly need to be freed, but * such false positives are relatively rare: * - * (a) only PAE paging and nested NPT has roots without shadow pages + * (a) only PAE paging and nested NPT have roots without shadow pages + * (or any shadow paging flavor with a dummy root, see note below) * (b) remote reloads due to a memslot update obsoletes _all_ roots * (c) KVM doesn't track previous roots for PAE paging, and the guest * is unlikely to zap an in-use PGD. + * + * Note! Dummy roots are unique in that they are obsoleted by memslot + * _creation_! See also FNAME(fetch). */ - sp = to_shadow_page(root_hpa); + sp = root_to_sp(root_hpa); return !sp || is_obsolete_sp(kvm, sp); } @@ -5634,9 +5597,6 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, { unsigned offset, pte_size, misaligned; - pgprintk("misaligned: gpa %llx bytes %d role %x\n", - gpa, bytes, sp->role.word); - offset = offset_in_page(gpa); pte_size = sp->role.has_4_byte_gpte ? 4 : 8; @@ -5684,9 +5644,8 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) return spte; } -static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes, - struct kvm_page_track_notifier_node *node) +void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, + int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; @@ -5702,8 +5661,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) return; - pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); - write_lock(&vcpu->kvm->mmu_lock); gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); @@ -5742,7 +5699,18 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err int r, emulation_type = EMULTYPE_PF; bool direct = vcpu->arch.mmu->root_role.direct; - if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) + /* + * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP + * checks when emulating instructions that triggers implicit access. + * WARN if hardware generates a fault with an error code that collides + * with the KVM-defined value. Clear the flag and continue on, i.e. + * don't terminate the VM, as KVM can't possibly be relying on a flag + * that KVM doesn't know about. + */ + if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS)) + error_code &= ~PFERR_IMPLICIT_ACCESS; + + if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY; r = RET_PF_INVALID; @@ -6099,7 +6067,7 @@ restart: * pages. Skip the bogus page, otherwise we'll get stuck in an * infinite loop if the page gets put back on the list (again). */ - if (WARN_ON(sp->role.invalid)) + if (WARN_ON_ONCE(sp->role.invalid)) continue; /* @@ -6199,16 +6167,8 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } -static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot, - struct kvm_page_track_notifier_node *node) -{ - kvm_mmu_zap_all_fast(kvm); -} - int kvm_mmu_init_vm(struct kvm *kvm) { - struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; int r; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); @@ -6222,10 +6182,6 @@ int kvm_mmu_init_vm(struct kvm *kvm) return r; } - node->track_write = kvm_mmu_pte_write; - node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; - kvm_page_track_register_notifier(kvm, node); - kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache; kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO; @@ -6246,10 +6202,6 @@ static void mmu_free_vm_memory_caches(struct kvm *kvm) void kvm_mmu_uninit_vm(struct kvm *kvm) { - struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; - - kvm_page_track_unregister_notifier(kvm, node); - if (tdp_mmu_enabled) kvm_mmu_uninit_tdp_mmu(kvm); @@ -6670,7 +6622,7 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, */ if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true)) - kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + kvm_flush_remote_tlbs_memslot(kvm, slot); } void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, @@ -6689,20 +6641,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, } } -void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, - const struct kvm_memory_slot *memslot) -{ - /* - * All current use cases for flushing the TLBs for a specific memslot - * related to dirty logging, and many do the TLB flush out of mmu_lock. - * The interaction between the various operations on memslot must be - * serialized by slots_locks to ensure the TLB flush from one operation - * is observed by any other operation on the same memslot. - */ - lockdep_assert_held(&kvm->slots_lock); - kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages); -} - void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot) { @@ -6732,7 +6670,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, */ } -void kvm_mmu_zap_all(struct kvm *kvm) +static void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); @@ -6741,7 +6679,7 @@ void kvm_mmu_zap_all(struct kvm *kvm) write_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { - if (WARN_ON(sp->role.invalid)) + if (WARN_ON_ONCE(sp->role.invalid)) continue; if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) goto restart; @@ -6757,9 +6695,20 @@ restart: write_unlock(&kvm->mmu_lock); } +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ + kvm_mmu_zap_all(kvm); +} + +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_mmu_zap_all_fast(kvm); +} + void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { - WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); + WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); gen &= MMIO_SPTE_GEN_MASK; @@ -6862,7 +6811,7 @@ static void mmu_destroy_caches(void) static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) { if (nx_hugepage_mitigation_hard_disabled) - return sprintf(buffer, "never\n"); + return sysfs_emit(buffer, "never\n"); return param_get_bool(buffer, kp); } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index d39af5639ce9..b102014e2c60 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -6,18 +6,10 @@ #include <linux/kvm_host.h> #include <asm/kvm_host.h> -#undef MMU_DEBUG - -#ifdef MMU_DEBUG -extern bool dbg; - -#define pgprintk(x...) do { if (dbg) printk(x); } while (0) -#define rmap_printk(fmt, args...) do { if (dbg) printk("%s: " fmt, __func__, ## args); } while (0) -#define MMU_WARN_ON(x) WARN_ON(x) +#ifdef CONFIG_KVM_PROVE_MMU +#define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x) #else -#define pgprintk(x...) do { } while (0) -#define rmap_printk(x...) do { } while (0) -#define MMU_WARN_ON(x) do { } while (0) +#define KVM_MMU_WARN_ON(x) BUILD_BUG_ON_INVALID(x) #endif /* Page table builder macros common to shadow (host) PTEs and guest PTEs. */ @@ -44,6 +36,16 @@ extern bool dbg; #define INVALID_PAE_ROOT 0 #define IS_VALID_PAE_ROOT(x) (!!(x)) +static inline hpa_t kvm_mmu_get_dummy_root(void) +{ + return my_zero_pfn(0) << PAGE_SHIFT; +} + +static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page) +{ + return is_zero_pfn(shadow_page >> PAGE_SHIFT); +} + typedef u64 __rcu *tdp_ptep_t; struct kvm_mmu_page { @@ -170,9 +172,6 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, int min_level); -void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, - gfn_t nr_pages); - /* Flush the given page (huge or not) of guest memory. */ static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level) { diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 0a2ac438d647..c87da11f3a04 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -12,13 +12,13 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/lockdep.h> #include <linux/kvm_host.h> #include <linux/rculist.h> -#include <asm/kvm_page_track.h> - #include "mmu.h" #include "mmu_internal.h" +#include "page_track.h" bool kvm_page_track_write_tracking_enabled(struct kvm *kvm) { @@ -28,103 +28,64 @@ bool kvm_page_track_write_tracking_enabled(struct kvm *kvm) void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) { - int i; - - for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { - kvfree(slot->arch.gfn_track[i]); - slot->arch.gfn_track[i] = NULL; - } + kvfree(slot->arch.gfn_write_track); + slot->arch.gfn_write_track = NULL; } -int kvm_page_track_create_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot, - unsigned long npages) +static int __kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot, + unsigned long npages) { - int i; - - for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { - if (i == KVM_PAGE_TRACK_WRITE && - !kvm_page_track_write_tracking_enabled(kvm)) - continue; - - slot->arch.gfn_track[i] = - __vcalloc(npages, sizeof(*slot->arch.gfn_track[i]), - GFP_KERNEL_ACCOUNT); - if (!slot->arch.gfn_track[i]) - goto track_free; - } + const size_t size = sizeof(*slot->arch.gfn_write_track); - return 0; + if (!slot->arch.gfn_write_track) + slot->arch.gfn_write_track = __vcalloc(npages, size, + GFP_KERNEL_ACCOUNT); -track_free: - kvm_page_track_free_memslot(slot); - return -ENOMEM; + return slot->arch.gfn_write_track ? 0 : -ENOMEM; } -static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode) +int kvm_page_track_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, + unsigned long npages) { - if (mode < 0 || mode >= KVM_PAGE_TRACK_MAX) - return false; + if (!kvm_page_track_write_tracking_enabled(kvm)) + return 0; - return true; + return __kvm_page_track_write_tracking_alloc(slot, npages); } int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot) { - unsigned short *gfn_track; - - if (slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE]) - return 0; - - gfn_track = __vcalloc(slot->npages, sizeof(*gfn_track), - GFP_KERNEL_ACCOUNT); - if (gfn_track == NULL) - return -ENOMEM; - - slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE] = gfn_track; - return 0; + return __kvm_page_track_write_tracking_alloc(slot, slot->npages); } -static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode, short count) +static void update_gfn_write_track(struct kvm_memory_slot *slot, gfn_t gfn, + short count) { int index, val; index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K); - val = slot->arch.gfn_track[mode][index]; + val = slot->arch.gfn_write_track[index]; - if (WARN_ON(val + count < 0 || val + count > USHRT_MAX)) + if (WARN_ON_ONCE(val + count < 0 || val + count > USHRT_MAX)) return; - slot->arch.gfn_track[mode][index] += count; + slot->arch.gfn_write_track[index] += count; } -/* - * add guest page to the tracking pool so that corresponding access on that - * page will be intercepted. - * - * It should be called under the protection both of mmu-lock and kvm->srcu - * or kvm->slots_lock. - * - * @kvm: the guest instance we are interested in. - * @slot: the @gfn belongs to. - * @gfn: the guest page. - * @mode: tracking mode, currently only write track is supported. - */ -void kvm_slot_page_track_add_page(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode) +void __kvm_write_track_add_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn) { + lockdep_assert_held_write(&kvm->mmu_lock); - if (WARN_ON(!page_track_mode_is_valid(mode))) - return; + lockdep_assert_once(lockdep_is_held(&kvm->slots_lock) || + srcu_read_lock_held(&kvm->srcu)); - if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE && - !kvm_page_track_write_tracking_enabled(kvm))) + if (KVM_BUG_ON(!kvm_page_track_write_tracking_enabled(kvm), kvm)) return; - update_gfn_track(slot, gfn, mode, 1); + update_gfn_write_track(slot, gfn, 1); /* * new track stops large page mapping for the @@ -132,37 +93,22 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, */ kvm_mmu_gfn_disallow_lpage(slot, gfn); - if (mode == KVM_PAGE_TRACK_WRITE) - if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) - kvm_flush_remote_tlbs(kvm); + if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) + kvm_flush_remote_tlbs(kvm); } -EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page); -/* - * remove the guest page from the tracking pool which stops the interception - * of corresponding access on that page. It is the opposed operation of - * kvm_slot_page_track_add_page(). - * - * It should be called under the protection both of mmu-lock and kvm->srcu - * or kvm->slots_lock. - * - * @kvm: the guest instance we are interested in. - * @slot: the @gfn belongs to. - * @gfn: the guest page. - * @mode: tracking mode, currently only write track is supported. - */ -void kvm_slot_page_track_remove_page(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode) +void __kvm_write_track_remove_gfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn) { - if (WARN_ON(!page_track_mode_is_valid(mode))) - return; + lockdep_assert_held_write(&kvm->mmu_lock); - if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE && - !kvm_page_track_write_tracking_enabled(kvm))) + lockdep_assert_once(lockdep_is_held(&kvm->slots_lock) || + srcu_read_lock_held(&kvm->srcu)); + + if (KVM_BUG_ON(!kvm_page_track_write_tracking_enabled(kvm), kvm)) return; - update_gfn_track(slot, gfn, mode, -1); + update_gfn_write_track(slot, gfn, -1); /* * allow large page mapping for the tracked page @@ -170,31 +116,26 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, */ kvm_mmu_gfn_allow_lpage(slot, gfn); } -EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page); /* * check if the corresponding access on the specified guest page is tracked. */ -bool kvm_slot_page_track_is_active(struct kvm *kvm, - const struct kvm_memory_slot *slot, - gfn_t gfn, enum kvm_page_track_mode mode) +bool kvm_gfn_is_write_tracked(struct kvm *kvm, + const struct kvm_memory_slot *slot, gfn_t gfn) { int index; - if (WARN_ON(!page_track_mode_is_valid(mode))) - return false; - if (!slot) return false; - if (mode == KVM_PAGE_TRACK_WRITE && - !kvm_page_track_write_tracking_enabled(kvm)) + if (!kvm_page_track_write_tracking_enabled(kvm)) return false; index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K); - return !!READ_ONCE(slot->arch.gfn_track[mode][index]); + return !!READ_ONCE(slot->arch.gfn_write_track[index]); } +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING void kvm_page_track_cleanup(struct kvm *kvm) { struct kvm_page_track_notifier_head *head; @@ -216,17 +157,22 @@ int kvm_page_track_init(struct kvm *kvm) * register the notifier so that event interception for the tracked guest * pages can be received. */ -void -kvm_page_track_register_notifier(struct kvm *kvm, - struct kvm_page_track_notifier_node *n) +int kvm_page_track_register_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n) { struct kvm_page_track_notifier_head *head; + if (!kvm || kvm->mm != current->mm) + return -ESRCH; + + kvm_get_kvm(kvm); + head = &kvm->arch.track_notifier_head; write_lock(&kvm->mmu_lock); hlist_add_head_rcu(&n->node, &head->track_notifier_list); write_unlock(&kvm->mmu_lock); + return 0; } EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier); @@ -234,9 +180,8 @@ EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier); * stop receiving the event interception. It is the opposed operation of * kvm_page_track_register_notifier(). */ -void -kvm_page_track_unregister_notifier(struct kvm *kvm, - struct kvm_page_track_notifier_node *n) +void kvm_page_track_unregister_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n) { struct kvm_page_track_notifier_head *head; @@ -246,6 +191,8 @@ kvm_page_track_unregister_notifier(struct kvm *kvm, hlist_del_rcu(&n->node); write_unlock(&kvm->mmu_lock); synchronize_srcu(&head->track_srcu); + + kvm_put_kvm(kvm); } EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier); @@ -256,34 +203,30 @@ EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier); * The node should figure out if the written page is the one that node is * interested in by itself. */ -void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes) +void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes) { struct kvm_page_track_notifier_head *head; struct kvm_page_track_notifier_node *n; int idx; - head = &vcpu->kvm->arch.track_notifier_head; + head = &kvm->arch.track_notifier_head; if (hlist_empty(&head->track_notifier_list)) return; idx = srcu_read_lock(&head->track_srcu); hlist_for_each_entry_srcu(n, &head->track_notifier_list, node, - srcu_read_lock_held(&head->track_srcu)) + srcu_read_lock_held(&head->track_srcu)) if (n->track_write) - n->track_write(vcpu, gpa, new, bytes, n); + n->track_write(gpa, new, bytes, n); srcu_read_unlock(&head->track_srcu, idx); } /* - * Notify the node that memory slot is being removed or moved so that it can - * drop write-protection for the pages in the memory slot. - * - * The node should figure out it has any write-protected pages in this slot - * by itself. + * Notify external page track nodes that a memory region is being removed from + * the VM, e.g. so that users can free any associated metadata. */ -void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot) { struct kvm_page_track_notifier_head *head; struct kvm_page_track_notifier_node *n; @@ -296,8 +239,69 @@ void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot) idx = srcu_read_lock(&head->track_srcu); hlist_for_each_entry_srcu(n, &head->track_notifier_list, node, - srcu_read_lock_held(&head->track_srcu)) - if (n->track_flush_slot) - n->track_flush_slot(kvm, slot, n); + srcu_read_lock_held(&head->track_srcu)) + if (n->track_remove_region) + n->track_remove_region(slot->base_gfn, slot->npages, n); srcu_read_unlock(&head->track_srcu, idx); } + +/* + * add guest page to the tracking pool so that corresponding access on that + * page will be intercepted. + * + * @kvm: the guest instance we are interested in. + * @gfn: the guest page. + */ +int kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + int idx; + + idx = srcu_read_lock(&kvm->srcu); + + slot = gfn_to_memslot(kvm, gfn); + if (!slot) { + srcu_read_unlock(&kvm->srcu, idx); + return -EINVAL; + } + + write_lock(&kvm->mmu_lock); + __kvm_write_track_add_gfn(kvm, slot, gfn); + write_unlock(&kvm->mmu_lock); + + srcu_read_unlock(&kvm->srcu, idx); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_write_track_add_gfn); + +/* + * remove the guest page from the tracking pool which stops the interception + * of corresponding access on that page. + * + * @kvm: the guest instance we are interested in. + * @gfn: the guest page. + */ +int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + int idx; + + idx = srcu_read_lock(&kvm->srcu); + + slot = gfn_to_memslot(kvm, gfn); + if (!slot) { + srcu_read_unlock(&kvm->srcu, idx); + return -EINVAL; + } + + write_lock(&kvm->mmu_lock); + __kvm_write_track_remove_gfn(kvm, slot, gfn); + write_unlock(&kvm->mmu_lock); + + srcu_read_unlock(&kvm->srcu, idx); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_write_track_remove_gfn); +#endif diff --git a/arch/x86/kvm/mmu/page_track.h b/arch/x86/kvm/mmu/page_track.h new file mode 100644 index 000000000000..d4d72ed999b1 --- /dev/null +++ b/arch/x86/kvm/mmu/page_track.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_PAGE_TRACK_H +#define __KVM_X86_PAGE_TRACK_H + +#include <linux/kvm_host.h> + +#include <asm/kvm_page_track.h> + + +bool kvm_page_track_write_tracking_enabled(struct kvm *kvm); +int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot); + +void kvm_page_track_free_memslot(struct kvm_memory_slot *slot); +int kvm_page_track_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, + unsigned long npages); + +void __kvm_write_track_add_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn); +void __kvm_write_track_remove_gfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn); + +bool kvm_gfn_is_write_tracked(struct kvm *kvm, + const struct kvm_memory_slot *slot, gfn_t gfn); + +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING +int kvm_page_track_init(struct kvm *kvm); +void kvm_page_track_cleanup(struct kvm *kvm); + +void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes); +void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot); + +static inline bool kvm_page_track_has_external_user(struct kvm *kvm) +{ + return !hlist_empty(&kvm->arch.track_notifier_head.track_notifier_list); +} +#else +static inline int kvm_page_track_init(struct kvm *kvm) { return 0; } +static inline void kvm_page_track_cleanup(struct kvm *kvm) { } + +static inline void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, + const u8 *new, int bytes) { } +static inline void kvm_page_track_delete_slot(struct kvm *kvm, + struct kvm_memory_slot *slot) { } + +static inline bool kvm_page_track_has_external_user(struct kvm *kvm) { return false; } + +#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */ + +static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes) +{ + __kvm_page_track_write(vcpu->kvm, gpa, new, bytes); + + kvm_mmu_track_write(vcpu, gpa, new, bytes); +} + +#endif /* __KVM_X86_PAGE_TRACK_H */ diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 0662e0278e70..c85255073f67 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -338,7 +338,6 @@ retry_walk: } #endif walker->max_level = walker->level; - ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu))); /* * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging @@ -348,9 +347,21 @@ retry_walk: nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK; pte_access = ~0; + + /* + * Queue a page fault for injection if this assertion fails, as callers + * assume that walker.fault contains sane info on a walk failure. I.e. + * avoid making the situation worse by inducing even worse badness + * between when the assertion fails and when KVM kicks the vCPU out to + * userspace (because the VM is bugged). + */ + if (KVM_BUG_ON(is_long_mode(vcpu) && !is_pae(vcpu), vcpu->kvm)) + goto error; + ++walker->level; do { + struct kvm_memory_slot *slot; unsigned long host_addr; pt_access = pte_access; @@ -381,7 +392,11 @@ retry_walk: if (unlikely(real_gpa == INVALID_GPA)) return 0; - host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa), + slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa)); + if (!kvm_is_visible_memslot(slot)) + goto error; + + host_addr = gfn_to_hva_memslot_prot(slot, gpa_to_gfn(real_gpa), &walker->pte_writable[walker->level - 1]); if (unlikely(kvm_is_error_hva(host_addr))) goto error; @@ -456,9 +471,6 @@ retry_walk: goto retry_walk; } - pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __func__, (u64)pte, walker->pte_access, - walker->pt_access[walker->level - 1]); return 1; error: @@ -529,8 +541,6 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) return false; - pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); @@ -638,8 +648,19 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (FNAME(gpte_changed)(vcpu, gw, top_level)) goto out_gpte_changed; - if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) + if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) + goto out_gpte_changed; + + /* + * Load a new root and retry the faulting instruction in the extremely + * unlikely scenario that the guest root gfn became visible between + * loading a dummy root and handling the resulting page fault, e.g. if + * userspace create a memslot in the interim. + */ + if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) { + kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu); goto out_gpte_changed; + } for_each_shadow_entry(vcpu, fault->addr, it) { gfn_t table_gfn; @@ -758,7 +779,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault struct guest_walker walker; int r; - pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code); WARN_ON_ONCE(fault->is_tdp); /* @@ -773,7 +793,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * The page is not mapped by the guest. Let the guest handle it. */ if (!r) { - pgprintk("%s: guest page fault\n", __func__); if (!fault->prefetch) kvm_inject_emulated_page_fault(vcpu, &walker.fault); @@ -837,7 +856,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) { int offset = 0; - WARN_ON(sp->role.level != PG_LEVEL_4K); + WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K); if (PTTYPE == 32) offset = sp->role.quadrant << SPTE_LEVEL_BITS; diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index cf2c6426a6fc..4a599130e9c9 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -61,7 +61,7 @@ static u64 generation_mmio_spte_mask(u64 gen) { u64 mask; - WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); + WARN_ON_ONCE(gen & ~MMIO_SPTE_GEN_MASK); mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK; mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK; @@ -221,8 +221,6 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * shadow pages and unsync'ing pages is not allowed. */ if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) { - pgprintk("%s: found shadow page for %llx, marking ro\n", - __func__, gfn); wrprot = true; pte_access &= ~ACC_WRITE_MASK; spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); @@ -242,7 +240,7 @@ out: if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) { /* Enforced by kvm_mmu_hugepage_adjust. */ - WARN_ON(level > PG_LEVEL_4K); + WARN_ON_ONCE(level > PG_LEVEL_4K); mark_page_dirty_in_slot(vcpu->kvm, slot, gfn); } diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 1279db2eab44..a129951c9a88 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -3,6 +3,7 @@ #ifndef KVM_X86_MMU_SPTE_H #define KVM_X86_MMU_SPTE_H +#include "mmu.h" #include "mmu_internal.h" /* @@ -236,6 +237,18 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) return to_shadow_page(__pa(sptep)); } +static inline struct kvm_mmu_page *root_to_sp(hpa_t root) +{ + if (kvm_mmu_is_dummy_root(root)) + return NULL; + + /* + * The "root" may be a special root, e.g. a PAE entry, treat it as a + * SPTE to ensure any non-PA bits are dropped. + */ + return spte_to_child_sp(root); +} + static inline bool is_mmio_spte(u64 spte) { return (spte & shadow_mmio_mask) == shadow_mmio_value && @@ -265,13 +278,13 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) static inline bool spte_ad_enabled(u64 spte) { - MMU_WARN_ON(!is_shadow_present_pte(spte)); + KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED; } static inline bool spte_ad_need_write_protect(u64 spte) { - MMU_WARN_ON(!is_shadow_present_pte(spte)); + KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); /* * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0', * and non-TDP SPTEs will never set these bits. Optimize for 64-bit @@ -282,13 +295,13 @@ static inline bool spte_ad_need_write_protect(u64 spte) static inline u64 spte_shadow_accessed_mask(u64 spte) { - MMU_WARN_ON(!is_shadow_present_pte(spte)); + KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; } static inline u64 spte_shadow_dirty_mask(u64 spte) { - MMU_WARN_ON(!is_shadow_present_pte(spte)); + KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; } diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index d2eb0d4f8710..bd30ebfb2f2c 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -39,13 +39,14 @@ void tdp_iter_restart(struct tdp_iter *iter) void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, int min_level, gfn_t next_last_level_gfn) { - int root_level = root->role.level; - - WARN_ON(root_level < 1); - WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); + if (WARN_ON_ONCE(!root || (root->role.level < 1) || + (root->role.level > PT64_ROOT_MAX_LEVEL))) { + iter->valid = false; + return; + } iter->next_last_level_gfn = next_last_level_gfn; - iter->root_level = root_level; + iter->root_level = root->role.level; iter->min_level = min_level; iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt; iter->as_id = kvm_mmu_page_as_id(root); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 512163d52194..6c63f2d1675f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -475,9 +475,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, bool is_leaf = is_present && is_last_spte(new_spte, level); bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); - WARN_ON(level > PT64_ROOT_MAX_LEVEL); - WARN_ON(level < PG_LEVEL_4K); - WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); + WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL); + WARN_ON_ONCE(level < PG_LEVEL_4K); + WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); /* * If this warning were to trigger it would indicate that there was a @@ -522,9 +522,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * impact the guest since both the former and current SPTEs * are nonpresent. */ - if (WARN_ON(!is_mmio_spte(old_spte) && - !is_mmio_spte(new_spte) && - !is_removed_spte(new_spte))) + if (WARN_ON_ONCE(!is_mmio_spte(old_spte) && + !is_mmio_spte(new_spte) && + !is_removed_spte(new_spte))) pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" "should not be replaced with another,\n" "different nonpresent SPTE, unless one or both\n" @@ -661,7 +661,7 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, * should be used. If operating under the MMU lock in write mode, the * use of the removed SPTE should not be necessary. */ - WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte)); + WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte)); old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); @@ -689,7 +689,7 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, else #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ - for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end) + for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end) /* * Yield if the MMU lock is contended or this thread needs to return control @@ -709,7 +709,7 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter, bool flush, bool shared) { - WARN_ON(iter->yielded); + WARN_ON_ONCE(iter->yielded); /* Ensure forward progress has been made before yielding. */ if (iter->next_last_level_gfn == iter->yielded_gfn) @@ -728,7 +728,7 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, rcu_read_lock(); - WARN_ON(iter->gfn > iter->next_last_level_gfn); + WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn); iter->yielded = true; } @@ -1241,7 +1241,7 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte; /* Huge pages aren't expected to be modified without first being zapped. */ - WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end); + WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end); if (iter->level != PG_LEVEL_4K || !is_shadow_present_pte(iter->old_spte)) @@ -1255,9 +1255,9 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, */ tdp_mmu_iter_set_spte(kvm, iter, 0); - if (!pte_write(range->pte)) { + if (!pte_write(range->arg.pte)) { new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, - pte_pfn(range->pte)); + pte_pfn(range->arg.pte)); tdp_mmu_iter_set_spte(kvm, iter, new_spte); } @@ -1548,8 +1548,8 @@ retry: if (!is_shadow_present_pte(iter.old_spte)) continue; - MMU_WARN_ON(kvm_ad_enabled() && - spte_ad_need_write_protect(iter.old_spte)); + KVM_MMU_WARN_ON(kvm_ad_enabled() && + spte_ad_need_write_protect(iter.old_spte)); if (!(iter.old_spte & dbit)) continue; @@ -1600,6 +1600,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, shadow_dirty_mask; struct tdp_iter iter; + lockdep_assert_held_write(&kvm->mmu_lock); + rcu_read_lock(); tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), @@ -1607,8 +1609,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, if (!mask) break; - MMU_WARN_ON(kvm_ad_enabled() && - spte_ad_need_write_protect(iter.old_spte)); + KVM_MMU_WARN_ON(kvm_ad_enabled() && + spte_ad_need_write_protect(iter.old_spte)); if (iter.level > PG_LEVEL_4K || !(mask & (1UL << (iter.gfn - gfn)))) @@ -1646,7 +1648,6 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, { struct kvm_mmu_page *root; - lockdep_assert_held_write(&kvm->mmu_lock); for_each_tdp_mmu_root(kvm, root, slot->as_id) clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); } diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index bf653df86112..edb89b51b383 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -382,9 +382,6 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) struct kvm_x86_pmu_event_filter *filter; struct kvm *kvm = pmc->vcpu->kvm; - if (!static_call(kvm_x86_pmu_hw_event_available)(pmc)) - return false; - filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); if (!filter) return true; @@ -398,6 +395,7 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) static bool pmc_event_is_allowed(struct kvm_pmc *pmc) { return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) && + static_call(kvm_x86_pmu_hw_event_available)(pmc) && check_pmu_event_filter(pmc); } diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 56cbdb24400a..b81650678375 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -43,6 +43,7 @@ enum kvm_only_cpuid_leafs { /* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */ #define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4) #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) +#define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8) #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) /* CPUID level 0x80000007 (EDX). */ diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index cfc8ab773025..2092db892d7d 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -791,6 +791,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) int ret = 0; unsigned long flags; struct amd_svm_iommu_ir *ir; + u64 entry; /** * In some cases, the existing irte is updated and re-set, @@ -824,6 +825,18 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) ir->data = pi->ir_data; spin_lock_irqsave(&svm->ir_list_lock, flags); + + /* + * Update the target pCPU for IOMMU doorbells if the vCPU is running. + * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM + * will update the pCPU info when the vCPU awkened and/or scheduled in. + * See also avic_vcpu_load(). + */ + entry = READ_ONCE(*(svm->avic_physical_id_cache)); + if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) + amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK, + true, pi->ir_data); + list_add(&ir->node, &svm->ir_list); spin_unlock_irqrestore(&svm->ir_list_lock, flags); out: @@ -986,10 +999,11 @@ static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) { int ret = 0; - unsigned long flags; struct amd_svm_iommu_ir *ir; struct vcpu_svm *svm = to_svm(vcpu); + lockdep_assert_held(&svm->ir_list_lock); + if (!kvm_arch_has_assigned_device(vcpu->kvm)) return 0; @@ -997,19 +1011,15 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) * Here, we go through the per-vcpu ir_list to update all existing * interrupt remapping table entry targeting this vcpu. */ - spin_lock_irqsave(&svm->ir_list_lock, flags); - if (list_empty(&svm->ir_list)) - goto out; + return 0; list_for_each_entry(ir, &svm->ir_list, node) { ret = amd_iommu_update_ga(cpu, r, ir->data); if (ret) - break; + return ret; } -out: - spin_unlock_irqrestore(&svm->ir_list_lock, flags); - return ret; + return 0; } void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -1017,6 +1027,7 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) u64 entry; int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); + unsigned long flags; lockdep_assert_preemption_disabled(); @@ -1033,6 +1044,15 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (kvm_vcpu_is_blocking(vcpu)) return; + /* + * Grab the per-vCPU interrupt remapping lock even if the VM doesn't + * _currently_ have assigned devices, as that can change. Holding + * ir_list_lock ensures that either svm_ir_list_add() will consume + * up-to-date entry information, or that this task will wait until + * svm_ir_list_add() completes to set the new target pCPU. + */ + spin_lock_irqsave(&svm->ir_list_lock, flags); + entry = READ_ONCE(*(svm->avic_physical_id_cache)); WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); @@ -1042,25 +1062,48 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) WRITE_ONCE(*(svm->avic_physical_id_cache), entry); avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); + + spin_unlock_irqrestore(&svm->ir_list_lock, flags); } void avic_vcpu_put(struct kvm_vcpu *vcpu) { u64 entry; struct vcpu_svm *svm = to_svm(vcpu); + unsigned long flags; lockdep_assert_preemption_disabled(); + /* + * Note, reading the Physical ID entry outside of ir_list_lock is safe + * as only the pCPU that has loaded (or is loading) the vCPU is allowed + * to modify the entry, and preemption is disabled. I.e. the vCPU + * can't be scheduled out and thus avic_vcpu_{put,load}() can't run + * recursively. + */ entry = READ_ONCE(*(svm->avic_physical_id_cache)); /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) return; + /* + * Take and hold the per-vCPU interrupt remapping lock while updating + * the Physical ID entry even though the lock doesn't protect against + * multiple writers (see above). Holding ir_list_lock ensures that + * either svm_ir_list_add() will consume up-to-date entry information, + * or that this task will wait until svm_ir_list_add() completes to + * mark the vCPU as not running. + */ + spin_lock_irqsave(&svm->ir_list_lock, flags); + avic_update_iommu_vcpu_affinity(vcpu, -1, 0); entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); + + spin_unlock_irqrestore(&svm->ir_list_lock, flags); + } void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 96936ddf1b3c..dd496c9e5f91 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -107,7 +107,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm) { - if (!svm->v_vmload_vmsave_enabled) + if (!guest_can_use(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD)) return true; if (!nested_npt_enabled(svm)) @@ -552,6 +552,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 bool new_vmcb12 = false; struct vmcb *vmcb01 = svm->vmcb01.ptr; struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; + struct kvm_vcpu *vcpu = &svm->vcpu; nested_vmcb02_compute_g_pat(svm); @@ -577,18 +578,18 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 vmcb_mark_dirty(vmcb02, VMCB_DT); } - kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); + kvm_set_rflags(vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); - svm_set_efer(&svm->vcpu, svm->nested.save.efer); + svm_set_efer(vcpu, svm->nested.save.efer); - svm_set_cr0(&svm->vcpu, svm->nested.save.cr0); - svm_set_cr4(&svm->vcpu, svm->nested.save.cr4); + svm_set_cr0(vcpu, svm->nested.save.cr0); + svm_set_cr4(vcpu, svm->nested.save.cr4); svm->vcpu.arch.cr2 = vmcb12->save.cr2; - kvm_rax_write(&svm->vcpu, vmcb12->save.rax); - kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp); - kvm_rip_write(&svm->vcpu, vmcb12->save.rip); + kvm_rax_write(vcpu, vmcb12->save.rax); + kvm_rsp_write(vcpu, vmcb12->save.rsp); + kvm_rip_write(vcpu, vmcb12->save.rip); /* In case we don't even reach vcpu_run, the fields are not updated */ vmcb02->save.rax = vmcb12->save.rax; @@ -602,7 +603,8 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 vmcb_mark_dirty(vmcb02, VMCB_DR); } - if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { + if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) && + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { /* * Reserved bits of DEBUGCTL are ignored. Be consistent with * svm_set_msr's definition of reserved bits. @@ -658,7 +660,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. */ - if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK)) + if (guest_can_use(vcpu, X86_FEATURE_VGIF) && + (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK)) int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); else int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); @@ -695,10 +698,9 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; - if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { - WARN_ON(!svm->tsc_scaling_enabled); + if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) && + svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) nested_svm_update_tsc_ratio_msr(vcpu); - } vmcb02->control.int_ctl = (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | @@ -717,7 +719,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, * what a nrips=0 CPU would do (L1 is responsible for advancing RIP * prior to injecting the event). */ - if (svm->nrips_enabled) + if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) vmcb02->control.next_rip = svm->nested.ctl.next_rip; else if (boot_cpu_has(X86_FEATURE_NRIPS)) vmcb02->control.next_rip = vmcb12_rip; @@ -727,7 +729,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, svm->soft_int_injected = true; svm->soft_int_csbase = vmcb12_csbase; svm->soft_int_old_rip = vmcb12_rip; - if (svm->nrips_enabled) + if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) svm->soft_int_next_rip = svm->nested.ctl.next_rip; else svm->soft_int_next_rip = vmcb12_rip; @@ -735,15 +737,21 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.virt_ext = vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK; - if (svm->lbrv_enabled) + if (guest_can_use(vcpu, X86_FEATURE_LBRV)) vmcb02->control.virt_ext |= (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK); if (!nested_vmcb_needs_vls_intercept(svm)) vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - pause_count12 = svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0; - pause_thresh12 = svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0; + if (guest_can_use(vcpu, X86_FEATURE_PAUSEFILTER)) + pause_count12 = svm->nested.ctl.pause_filter_count; + else + pause_count12 = 0; + if (guest_can_use(vcpu, X86_FEATURE_PFTHRESHOLD)) + pause_thresh12 = svm->nested.ctl.pause_filter_thresh; + else + pause_thresh12 = 0; if (kvm_pause_in_guest(svm->vcpu.kvm)) { /* use guest values since host doesn't intercept PAUSE */ vmcb02->control.pause_filter_count = pause_count12; @@ -1027,7 +1035,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (vmcb12->control.exit_code != SVM_EXIT_ERR) nested_save_pending_event_to_vmcb12(svm, vmcb12); - if (svm->nrips_enabled) + if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) vmcb12->control.next_rip = vmcb02->control.next_rip; vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; @@ -1066,7 +1074,8 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (!nested_exit_on_intr(svm)) kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { + if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) && + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { svm_copy_lbrs(vmcb12, vmcb02); svm_update_lbrv(vcpu); } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { @@ -1101,10 +1110,10 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); } - if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { - WARN_ON(!svm->tsc_scaling_enabled); + if (kvm_caps.has_tsc_control && + vcpu->arch.tsc_scaling_ratio != vcpu->arch.l1_tsc_scaling_ratio) { vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; - __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); + svm_write_tsc_multiplier(vcpu); } svm->nested.ctl.nested_cr3 = 0; @@ -1537,7 +1546,7 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu) vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio, svm->tsc_ratio_msr); - __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); + svm_write_tsc_multiplier(vcpu); } /* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */ diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 07756b7348ae..b9a0a939d59f 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -23,6 +23,7 @@ #include <asm/pkru.h> #include <asm/trapnr.h> #include <asm/fpu/xcr.h> +#include <asm/debugreg.h> #include "mmu.h" #include "x86.h" @@ -54,9 +55,14 @@ module_param_named(sev, sev_enabled, bool, 0444); /* enable/disable SEV-ES support */ static bool sev_es_enabled = true; module_param_named(sev_es, sev_es_enabled, bool, 0444); + +/* enable/disable SEV-ES DebugSwap support */ +static bool sev_es_debug_swap_enabled = true; +module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); #else #define sev_enabled false #define sev_es_enabled false +#define sev_es_debug_swap_enabled false #endif /* CONFIG_KVM_AMD_SEV */ static u8 sev_enc_bit; @@ -606,6 +612,9 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xss = svm->vcpu.arch.ia32_xss; save->dr6 = svm->vcpu.arch.dr6; + if (sev_es_debug_swap_enabled) + save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP; + pr_debug("Virtual Machine Save Area (VMSA):\n"); print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); @@ -619,6 +628,11 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, struct vcpu_svm *svm = to_svm(vcpu); int ret; + if (vcpu->guest_debug) { + pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported"); + return -EINVAL; + } + /* Perform some pre-encryption checks against the VMSA */ ret = sev_es_sync_vmsa(svm); if (ret) @@ -1725,7 +1739,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) * Note, the source is not required to have the same number of * vCPUs as the destination when migrating a vanilla SEV VM. */ - src_vcpu = kvm_get_vcpu(dst_kvm, i); + src_vcpu = kvm_get_vcpu(src_kvm, i); src_svm = to_svm(src_vcpu); /* @@ -2171,7 +2185,7 @@ void __init sev_hardware_setup(void) bool sev_es_supported = false; bool sev_supported = false; - if (!sev_enabled || !npt_enabled) + if (!sev_enabled || !npt_enabled || !nrips) goto out; /* @@ -2256,6 +2270,9 @@ out: sev_enabled = sev_supported; sev_es_enabled = sev_es_supported; + if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) || + !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP)) + sev_es_debug_swap_enabled = false; #endif } @@ -2417,15 +2434,18 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) */ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); - vcpu->arch.regs[VCPU_REGS_RAX] = ghcb_get_rax_if_valid(ghcb); - vcpu->arch.regs[VCPU_REGS_RBX] = ghcb_get_rbx_if_valid(ghcb); - vcpu->arch.regs[VCPU_REGS_RCX] = ghcb_get_rcx_if_valid(ghcb); - vcpu->arch.regs[VCPU_REGS_RDX] = ghcb_get_rdx_if_valid(ghcb); - vcpu->arch.regs[VCPU_REGS_RSI] = ghcb_get_rsi_if_valid(ghcb); + BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap)); + memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap)); + + vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm, ghcb); + vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm, ghcb); + vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm, ghcb); + vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm, ghcb); + vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm, ghcb); - svm->vmcb->save.cpl = ghcb_get_cpl_if_valid(ghcb); + svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm, ghcb); - if (ghcb_xcr0_is_valid(ghcb)) { + if (kvm_ghcb_xcr0_is_valid(svm)) { vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb); kvm_update_cpuid_runtime(vcpu); } @@ -2436,84 +2456,88 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) control->exit_code_hi = upper_32_bits(exit_code); control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb); control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb); + svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm, ghcb); /* Clear the valid entries fields */ memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } +static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) +{ + return (((u64)control->exit_code_hi) << 32) | control->exit_code; +} + static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { - struct kvm_vcpu *vcpu; - struct ghcb *ghcb; + struct vmcb_control_area *control = &svm->vmcb->control; + struct kvm_vcpu *vcpu = &svm->vcpu; u64 exit_code; u64 reason; - ghcb = svm->sev_es.ghcb; - /* * Retrieve the exit code now even though it may not be marked valid * as it could help with debugging. */ - exit_code = ghcb_get_sw_exit_code(ghcb); + exit_code = kvm_ghcb_get_sw_exit_code(control); /* Only GHCB Usage code 0 is supported */ - if (ghcb->ghcb_usage) { + if (svm->sev_es.ghcb->ghcb_usage) { reason = GHCB_ERR_INVALID_USAGE; goto vmgexit_err; } reason = GHCB_ERR_MISSING_INPUT; - if (!ghcb_sw_exit_code_is_valid(ghcb) || - !ghcb_sw_exit_info_1_is_valid(ghcb) || - !ghcb_sw_exit_info_2_is_valid(ghcb)) + if (!kvm_ghcb_sw_exit_code_is_valid(svm) || + !kvm_ghcb_sw_exit_info_1_is_valid(svm) || + !kvm_ghcb_sw_exit_info_2_is_valid(svm)) goto vmgexit_err; - switch (ghcb_get_sw_exit_code(ghcb)) { + switch (exit_code) { case SVM_EXIT_READ_DR7: break; case SVM_EXIT_WRITE_DR7: - if (!ghcb_rax_is_valid(ghcb)) + if (!kvm_ghcb_rax_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_RDTSC: break; case SVM_EXIT_RDPMC: - if (!ghcb_rcx_is_valid(ghcb)) + if (!kvm_ghcb_rcx_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_CPUID: - if (!ghcb_rax_is_valid(ghcb) || - !ghcb_rcx_is_valid(ghcb)) + if (!kvm_ghcb_rax_is_valid(svm) || + !kvm_ghcb_rcx_is_valid(svm)) goto vmgexit_err; - if (ghcb_get_rax(ghcb) == 0xd) - if (!ghcb_xcr0_is_valid(ghcb)) + if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd) + if (!kvm_ghcb_xcr0_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_INVD: break; case SVM_EXIT_IOIO: - if (ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_STR_MASK) { - if (!ghcb_sw_scratch_is_valid(ghcb)) + if (control->exit_info_1 & SVM_IOIO_STR_MASK) { + if (!kvm_ghcb_sw_scratch_is_valid(svm)) goto vmgexit_err; } else { - if (!(ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_TYPE_MASK)) - if (!ghcb_rax_is_valid(ghcb)) + if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK)) + if (!kvm_ghcb_rax_is_valid(svm)) goto vmgexit_err; } break; case SVM_EXIT_MSR: - if (!ghcb_rcx_is_valid(ghcb)) + if (!kvm_ghcb_rcx_is_valid(svm)) goto vmgexit_err; - if (ghcb_get_sw_exit_info_1(ghcb)) { - if (!ghcb_rax_is_valid(ghcb) || - !ghcb_rdx_is_valid(ghcb)) + if (control->exit_info_1) { + if (!kvm_ghcb_rax_is_valid(svm) || + !kvm_ghcb_rdx_is_valid(svm)) goto vmgexit_err; } break; case SVM_EXIT_VMMCALL: - if (!ghcb_rax_is_valid(ghcb) || - !ghcb_cpl_is_valid(ghcb)) + if (!kvm_ghcb_rax_is_valid(svm) || + !kvm_ghcb_cpl_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_RDTSCP: @@ -2521,19 +2545,19 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) case SVM_EXIT_WBINVD: break; case SVM_EXIT_MONITOR: - if (!ghcb_rax_is_valid(ghcb) || - !ghcb_rcx_is_valid(ghcb) || - !ghcb_rdx_is_valid(ghcb)) + if (!kvm_ghcb_rax_is_valid(svm) || + !kvm_ghcb_rcx_is_valid(svm) || + !kvm_ghcb_rdx_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_MWAIT: - if (!ghcb_rax_is_valid(ghcb) || - !ghcb_rcx_is_valid(ghcb)) + if (!kvm_ghcb_rax_is_valid(svm) || + !kvm_ghcb_rcx_is_valid(svm)) goto vmgexit_err; break; case SVM_VMGEXIT_MMIO_READ: case SVM_VMGEXIT_MMIO_WRITE: - if (!ghcb_sw_scratch_is_valid(ghcb)) + if (!kvm_ghcb_sw_scratch_is_valid(svm)) goto vmgexit_err; break; case SVM_VMGEXIT_NMI_COMPLETE: @@ -2549,11 +2573,9 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) return 0; vmgexit_err: - vcpu = &svm->vcpu; - if (reason == GHCB_ERR_INVALID_USAGE) { vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n", - ghcb->ghcb_usage); + svm->sev_es.ghcb->ghcb_usage); } else if (reason == GHCB_ERR_INVALID_EVENT) { vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n", exit_code); @@ -2563,11 +2585,8 @@ vmgexit_err: dump_ghcb(svm); } - /* Clear the valid entries fields */ - memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); - - ghcb_set_sw_exit_info_1(ghcb, 2); - ghcb_set_sw_exit_info_2(ghcb, reason); + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, reason); /* Resume the guest to "return" the error code. */ return 1; @@ -2586,7 +2605,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) */ if (svm->sev_es.ghcb_sa_sync) { kvm_write_guest(svm->vcpu.kvm, - ghcb_get_sw_scratch(svm->sev_es.ghcb), + svm->sev_es.sw_scratch, svm->sev_es.ghcb_sa, svm->sev_es.ghcb_sa_len); svm->sev_es.ghcb_sa_sync = false; @@ -2632,12 +2651,11 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) { struct vmcb_control_area *control = &svm->vmcb->control; - struct ghcb *ghcb = svm->sev_es.ghcb; u64 ghcb_scratch_beg, ghcb_scratch_end; u64 scratch_gpa_beg, scratch_gpa_end; void *scratch_va; - scratch_gpa_beg = ghcb_get_sw_scratch(ghcb); + scratch_gpa_beg = svm->sev_es.sw_scratch; if (!scratch_gpa_beg) { pr_err("vmgexit: scratch gpa not provided\n"); goto e_scratch; @@ -2708,8 +2726,8 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) return 0; e_scratch: - ghcb_set_sw_exit_info_1(ghcb, 2); - ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA); + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_SCRATCH_AREA); return 1; } @@ -2822,7 +2840,6 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; u64 ghcb_gpa, exit_code; - struct ghcb *ghcb; int ret; /* Validate the GHCB */ @@ -2847,20 +2864,18 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) } svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva; - ghcb = svm->sev_es.ghcb_map.hva; - trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb); - - exit_code = ghcb_get_sw_exit_code(ghcb); + trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb); + sev_es_sync_from_ghcb(svm); ret = sev_es_validate_vmgexit(svm); if (ret) return ret; - sev_es_sync_from_ghcb(svm); - ghcb_set_sw_exit_info_1(ghcb, 0); - ghcb_set_sw_exit_info_2(ghcb, 0); + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 0); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 0); + exit_code = kvm_ghcb_get_sw_exit_code(control); switch (exit_code) { case SVM_VMGEXIT_MMIO_READ: ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); @@ -2883,7 +2898,10 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) svm->sev_es.ghcb_sa); break; case SVM_VMGEXIT_NMI_COMPLETE: - ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET); + ++vcpu->stat.nmi_window_exits; + svm->nmi_masked = false; + kvm_make_request(KVM_REQ_EVENT, vcpu); + ret = 1; break; case SVM_VMGEXIT_AP_HLT_LOOP: ret = kvm_emulate_ap_reset_hold(vcpu); @@ -2898,13 +2916,13 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) break; case 1: /* Get AP jump table address */ - ghcb_set_sw_exit_info_2(ghcb, sev->ap_jump_table); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, sev->ap_jump_table); break; default: pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n", control->exit_info_1); - ghcb_set_sw_exit_info_1(ghcb, 2); - ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT); + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); } ret = 1; @@ -2946,6 +2964,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) static void sev_es_init_vmcb(struct vcpu_svm *svm) { + struct vmcb *vmcb = svm->vmcb01.ptr; struct kvm_vcpu *vcpu = &svm->vcpu; svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE; @@ -2954,9 +2973,12 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) /* * An SEV-ES guest requires a VMSA area that is a separate from the * VMCB page. Do not include the encryption mask on the VMSA physical - * address since hardware will access it using the guest key. + * address since hardware will access it using the guest key. Note, + * the VMSA will be NULL if this vCPU is the destination for intrahost + * migration, and will be copied later. */ - svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); + if (svm->sev_es.vmsa) + svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); /* Can't intercept CR register access, HV can't modify CR registers */ svm_clr_intercept(svm, INTERCEPT_CR0_READ); @@ -2974,8 +2996,23 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) svm_set_intercept(svm, TRAP_CR4_WRITE); svm_set_intercept(svm, TRAP_CR8_WRITE); - /* No support for enable_vmware_backdoor */ - clr_exception_intercept(svm, GP_VECTOR); + vmcb->control.intercepts[INTERCEPT_DR] = 0; + if (!sev_es_debug_swap_enabled) { + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); + recalc_intercepts(svm); + } else { + /* + * Disable #DB intercept iff DebugSwap is enabled. KVM doesn't + * allow debugging SEV-ES guests, and enables DebugSwap iff + * NO_NESTED_DATA_BP is supported, so there's no reason to + * intercept #DB when DebugSwap is enabled. For simplicity + * with respect to guest debug, intercept #DB for other VMs + * even if NO_NESTED_DATA_BP is supported, i.e. even if the + * guest can't DoS the CPU with infinite #DB vectoring. + */ + clr_exception_intercept(svm, DB_VECTOR); + } /* Can't intercept XSETBV, HV can't modify XCR0 directly */ svm_clr_intercept(svm, INTERCEPT_XSETBV); @@ -3002,6 +3039,12 @@ void sev_init_vmcb(struct vcpu_svm *svm) svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; clr_exception_intercept(svm, UD_VECTOR); + /* + * Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as + * KVM can't decrypt guest memory to decode the faulting instruction. + */ + clr_exception_intercept(svm, GP_VECTOR); + if (sev_es_guest(svm->vcpu.kvm)) sev_es_init_vmcb(svm); } @@ -3020,20 +3063,41 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa) { /* - * As an SEV-ES guest, hardware will restore the host state on VMEXIT, - * of which one step is to perform a VMLOAD. KVM performs the - * corresponding VMSAVE in svm_prepare_guest_switch for both - * traditional and SEV-ES guests. + * All host state for SEV-ES guests is categorized into three swap types + * based on how it is handled by hardware during a world switch: + * + * A: VMRUN: Host state saved in host save area + * VMEXIT: Host state loaded from host save area + * + * B: VMRUN: Host state _NOT_ saved in host save area + * VMEXIT: Host state loaded from host save area + * + * C: VMRUN: Host state _NOT_ saved in host save area + * VMEXIT: Host state initialized to default(reset) values + * + * Manually save type-B state, i.e. state that is loaded by VMEXIT but + * isn't saved by VMRUN, that isn't already saved by VMSAVE (performed + * by common SVM code). */ - - /* XCR0 is restored on VMEXIT, save the current host value */ hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - - /* PKRU is restored on VMEXIT, save the current host value */ hostsa->pkru = read_pkru(); - - /* MSR_IA32_XSS is restored on VMEXIT, save the currnet host value */ hostsa->xss = host_xss; + + /* + * If DebugSwap is enabled, debug registers are loaded but NOT saved by + * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both + * saves and loads debug registers (Type-A). + */ + if (sev_es_debug_swap_enabled) { + hostsa->dr0 = native_get_debugreg(0); + hostsa->dr1 = native_get_debugreg(1); + hostsa->dr2 = native_get_debugreg(2); + hostsa->dr3 = native_get_debugreg(3); + hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0); + hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1); + hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); + hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3); + } } void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d381ad424554..f283eb47f6ac 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -39,10 +39,9 @@ #include <asm/spec-ctrl.h> #include <asm/cpu_device_id.h> #include <asm/traps.h> +#include <asm/reboot.h> #include <asm/fpu/api.h> -#include <asm/virtext.h> - #include <trace/events/ipi.h> #include "trace.h" @@ -203,7 +202,7 @@ static int nested = true; module_param(nested, int, S_IRUGO); /* enable/disable Next RIP Save */ -static int nrips = true; +int nrips = true; module_param(nrips, int, 0444); /* enable/disable Virtual VMLOAD VMSAVE */ @@ -365,6 +364,8 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK; } +static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len); static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, bool commit_side_effects) @@ -385,6 +386,14 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, } if (!svm->next_rip) { + /* + * FIXME: Drop this when kvm_emulate_instruction() does the + * right thing and treats "can't emulate" as outright failure + * for EMULTYPE_SKIP. + */ + if (!svm_can_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0)) + return 0; + if (unlikely(!commit_side_effects)) old_rflags = svm->vmcb->save.rflags; @@ -517,14 +526,21 @@ static void svm_init_osvw(struct kvm_vcpu *vcpu) vcpu->arch.osvw.status |= 1; } -static bool kvm_is_svm_supported(void) +static bool __kvm_is_svm_supported(void) { - int cpu = raw_smp_processor_id(); - const char *msg; + int cpu = smp_processor_id(); + struct cpuinfo_x86 *c = &cpu_data(cpu); + u64 vm_cr; - if (!cpu_has_svm(&msg)) { - pr_err("SVM not supported by CPU %d, %s\n", cpu, msg); + if (c->x86_vendor != X86_VENDOR_AMD && + c->x86_vendor != X86_VENDOR_HYGON) { + pr_err("CPU %d isn't AMD or Hygon\n", cpu); + return false; + } + + if (!cpu_has(c, X86_FEATURE_SVM)) { + pr_err("SVM not supported by CPU %d\n", cpu); return false; } @@ -542,25 +558,55 @@ static bool kvm_is_svm_supported(void) return true; } +static bool kvm_is_svm_supported(void) +{ + bool supported; + + migrate_disable(); + supported = __kvm_is_svm_supported(); + migrate_enable(); + + return supported; +} + static int svm_check_processor_compat(void) { - if (!kvm_is_svm_supported()) + if (!__kvm_is_svm_supported()) return -EIO; return 0; } -void __svm_write_tsc_multiplier(u64 multiplier) +static void __svm_write_tsc_multiplier(u64 multiplier) { - preempt_disable(); - if (multiplier == __this_cpu_read(current_tsc_ratio)) - goto out; + return; wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); __this_cpu_write(current_tsc_ratio, multiplier); -out: - preempt_enable(); +} + +static inline void kvm_cpu_svm_disable(void) +{ + uint64_t efer; + + wrmsrl(MSR_VM_HSAVE_PA, 0); + rdmsrl(MSR_EFER, efer); + if (efer & EFER_SVME) { + /* + * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and + * NMI aren't blocked. + */ + stgi(); + wrmsrl(MSR_EFER, efer & ~EFER_SVME); + } +} + +static void svm_emergency_disable(void) +{ + kvm_rebooting = true; + + kvm_cpu_svm_disable(); } static void svm_hardware_disable(void) @@ -569,7 +615,7 @@ static void svm_hardware_disable(void) if (tsc_scaling) __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); - cpu_svm_disable(); + kvm_cpu_svm_disable(); amd_pmu_disable_virt(); } @@ -677,6 +723,39 @@ free_save_area: } +static void set_dr_intercepts(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb01.ptr; + + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); + + recalc_intercepts(svm); +} + +static void clr_dr_intercepts(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb01.ptr; + + vmcb->control.intercepts[INTERCEPT_DR] = 0; + + recalc_intercepts(svm); +} + static int direct_access_msr_slot(u32 msr) { u32 i; @@ -947,50 +1026,24 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu) svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb); } -static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index) +static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm) { /* - * If the LBR virtualization is disabled, the LBR msrs are always - * kept in the vmcb01 to avoid copying them on nested guest entries. - * - * If nested, and the LBR virtualization is enabled/disabled, the msrs - * are moved between the vmcb01 and vmcb02 as needed. + * If LBR virtualization is disabled, the LBR MSRs are always kept in + * vmcb01. If LBR virtualization is enabled and L1 is running VMs of + * its own, the MSRs are moved between vmcb01 and vmcb02 as needed. */ - struct vmcb *vmcb = - (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ? - svm->vmcb : svm->vmcb01.ptr; - - switch (index) { - case MSR_IA32_DEBUGCTLMSR: - return vmcb->save.dbgctl; - case MSR_IA32_LASTBRANCHFROMIP: - return vmcb->save.br_from; - case MSR_IA32_LASTBRANCHTOIP: - return vmcb->save.br_to; - case MSR_IA32_LASTINTFROMIP: - return vmcb->save.last_excp_from; - case MSR_IA32_LASTINTTOIP: - return vmcb->save.last_excp_to; - default: - KVM_BUG(false, svm->vcpu.kvm, - "%s: Unknown MSR 0x%x", __func__, index); - return 0; - } + return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb : + svm->vmcb01.ptr; } void svm_update_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - - bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) & - DEBUGCTLMSR_LBR; - - bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext & - LBR_CTL_ENABLE_MASK); - - if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled)) - if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)) - enable_lbrv = true; + bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; + bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) || + (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) && + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); if (enable_lbrv == current_enable_lbrv) return; @@ -1101,21 +1154,23 @@ static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) return svm->tsc_ratio_msr; } -static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset; - svm->vmcb->control.tsc_offset = offset; + svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu) { - __svm_write_tsc_multiplier(multiplier); + preempt_disable(); + if (to_svm(vcpu)->guest_state_loaded) + __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); + preempt_enable(); } - /* Evaluate instruction intercepts that depend on guest CPUID features. */ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, struct vcpu_svm *svm) @@ -1156,8 +1211,6 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); - - svm->v_vmload_vmsave_enabled = false; } else { /* * If hardware supports Virtual VMLOAD VMSAVE then enable it @@ -1201,10 +1254,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu) * Guest access to VMware backdoor ports could legitimately * trigger #GP because of TSS I/O permission bitmap. * We intercept those #GP and allow access to them anyway - * as VMware does. Don't intercept #GP for SEV guests as KVM can't - * decrypt guest memory to decode the faulting instruction. + * as VMware does. */ - if (enable_vmware_backdoor && !sev_guest(vcpu->kvm)) + if (enable_vmware_backdoor) set_exception_intercept(svm, GP_VECTOR); svm_set_intercept(svm, INTERCEPT_INTR); @@ -1498,7 +1550,9 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (sd->current_vmcb != svm->vmcb) { sd->current_vmcb = svm->vmcb; - indirect_branch_prediction_barrier(); + + if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT)) + indirect_branch_prediction_barrier(); } if (kvm_vcpu_apicv_active(vcpu)) avic_vcpu_load(vcpu, cpu); @@ -1786,6 +1840,11 @@ static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) } } +static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + return true; +} + void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1942,7 +2001,7 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (vcpu->arch.guest_state_protected) + if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm))) return; get_debugreg(vcpu->arch.db[0], 0); @@ -2503,12 +2562,13 @@ static int iret_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + WARN_ON_ONCE(sev_es_guest(vcpu->kvm)); + ++vcpu->stat.nmi_window_exits; svm->awaiting_iret_completion = true; svm_clr_iret_intercept(svm); - if (!sev_es_guest(vcpu->kvm)) - svm->nmi_iret_rip = kvm_rip_read(vcpu); + svm->nmi_iret_rip = kvm_rip_read(vcpu); kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; @@ -2673,6 +2733,13 @@ static int dr_interception(struct kvm_vcpu *vcpu) unsigned long val; int err = 0; + /* + * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT + * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early. + */ + if (sev_es_guest(vcpu->kvm)) + return 1; + if (vcpu->guest_debug == 0) { /* * No more DR vmexits; force a reload of the debug registers @@ -2757,7 +2824,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_AMD64_TSC_RATIO: - if (!msr_info->host_initiated && !svm->tsc_scaling_enabled) + if (!msr_info->host_initiated && + !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) return 1; msr_info->data = svm->tsc_ratio_msr; break; @@ -2795,11 +2863,19 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = svm->tsc_aux; break; case MSR_IA32_DEBUGCTLMSR: + msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl; + break; case MSR_IA32_LASTBRANCHFROMIP: + msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from; + break; case MSR_IA32_LASTBRANCHTOIP: + msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to; + break; case MSR_IA32_LASTINTFROMIP: + msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from; + break; case MSR_IA32_LASTINTTOIP: - msr_info->data = svm_get_lbr_msr(svm, msr_info->index); + msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to; break; case MSR_VM_HSAVE_PA: msr_info->data = svm->nested.hsave_msr; @@ -2899,7 +2975,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) switch (ecx) { case MSR_AMD64_TSC_RATIO: - if (!svm->tsc_scaling_enabled) { + if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) { if (!msr->host_initiated) return 1; @@ -2921,7 +2997,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->tsc_ratio_msr = data; - if (svm->tsc_scaling_enabled && is_guest_mode(vcpu)) + if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) && + is_guest_mode(vcpu)) nested_svm_update_tsc_ratio_msr(vcpu); break; @@ -3030,13 +3107,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (data & DEBUGCTL_RESERVED_BITS) return 1; - if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) - svm->vmcb->save.dbgctl = data; - else - svm->vmcb01.ptr->save.dbgctl = data; - + svm_get_lbr_vmcb(svm)->save.dbgctl = data; svm_update_lbrv(vcpu); - break; case MSR_VM_HSAVE_PA: /* @@ -3762,6 +3834,19 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion) return; /* IRET will cause a vm exit */ + /* + * SEV-ES guests are responsible for signaling when a vCPU is ready to + * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e. + * KVM can't intercept and single-step IRET to detect when NMIs are + * unblocked (architecturally speaking). See SVM_VMGEXIT_NMI_COMPLETE. + * + * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware + * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not + * supported NAEs in the GHCB protocol. + */ + if (sev_es_guest(vcpu->kvm)) + return; + if (!gif_set(svm)) { if (vgif) svm_set_intercept(svm, INTERCEPT_STGI); @@ -3911,12 +3996,11 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) svm->soft_int_injected = false; /* - * If we've made progress since setting HF_IRET_MASK, we've + * If we've made progress since setting awaiting_iret_completion, we've * executed an IRET and can allow NMI injection. */ if (svm->awaiting_iret_completion && - (sev_es_guest(vcpu->kvm) || - kvm_rip_read(vcpu) != svm->nmi_iret_rip)) { + kvm_rip_read(vcpu) != svm->nmi_iret_rip) { svm->awaiting_iret_completion = false; svm->nmi_masked = false; kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -3986,14 +4070,8 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; - - /* - * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM - * can't read guest memory (dereference memslots) to decode the WRMSR. - */ - if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 && - nrips && control->next_rip) + if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + to_svm(vcpu)->vmcb->control.exit_info_1) return handle_fastpath_set_msr_irqoff(vcpu); return EXIT_FASTPATH_NONE; @@ -4005,6 +4083,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in guest_state_enter_irqoff(); + amd_clear_divider(); + if (sev_es_guest(vcpu->kvm)) __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted); else @@ -4206,28 +4286,37 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct kvm_cpuid_entry2 *best; - vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - boot_cpu_has(X86_FEATURE_XSAVE) && - boot_cpu_has(X86_FEATURE_XSAVES); - - /* Update nrips enabled cache */ - svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && - guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); - - svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR); - svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV); - - svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); - - svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) && - guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER); + /* + * SVM doesn't provide a way to disable just XSAVES in the guest, KVM + * can only disable all variants of by disallowing CR4.OSXSAVE from + * being set. As a result, if the host has XSAVE and XSAVES, and the + * guest has XSAVE enabled, the guest can execute XSAVES without + * faulting. Treat XSAVES as enabled in this case regardless of + * whether it's advertised to the guest so that KVM context switches + * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give + * the guest read/write access to the host's XSS. + */ + if (boot_cpu_has(X86_FEATURE_XSAVE) && + boot_cpu_has(X86_FEATURE_XSAVES) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) + kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES); - svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) && - guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV); - svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF); + /* + * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that + * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing + * SVM on Intel is bonkers and extremely unlikely to work). + */ + if (!guest_cpuid_is_intel(vcpu)) + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); - svm->vnmi_enabled = vnmi && guest_cpuid_has(vcpu, X86_FEATURE_VNMI); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI); svm_recalc_instruction_intercepts(vcpu, svm); @@ -4648,16 +4737,25 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and * decode garbage. * - * Inject #UD if KVM reached this point without an instruction buffer. - * In practice, this path should never be hit by a well-behaved guest, - * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path - * is still theoretically reachable, e.g. via unaccelerated fault-like - * AVIC access, and needs to be handled by KVM to avoid putting the - * guest into an infinite loop. Injecting #UD is somewhat arbitrary, - * but its the least awful option given lack of insight into the guest. + * If KVM is NOT trying to simply skip an instruction, inject #UD if + * KVM reached this point without an instruction buffer. In practice, + * this path should never be hit by a well-behaved guest, e.g. KVM + * doesn't intercept #UD or #GP for SEV guests, but this path is still + * theoretically reachable, e.g. via unaccelerated fault-like AVIC + * access, and needs to be handled by KVM to avoid putting the guest + * into an infinite loop. Injecting #UD is somewhat arbitrary, but + * its the least awful option given lack of insight into the guest. + * + * If KVM is trying to skip an instruction, simply resume the guest. + * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM + * will attempt to re-inject the INT3/INTO and skip the instruction. + * In that scenario, retrying the INT3/INTO and hoping the guest will + * make forward progress is the only option that has a chance of + * success (and in practice it will work the vast majority of the time). */ if (unlikely(!insn)) { - kvm_queue_exception(vcpu, UD_VECTOR); + if (!(emul_type & EMULTYPE_SKIP)) + kvm_queue_exception(vcpu, UD_VECTOR); return false; } @@ -4815,6 +4913,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, .get_cs_db_l_bits = svm_get_cs_db_l_bits, + .is_valid_cr0 = svm_is_valid_cr0, .set_cr0 = svm_set_cr0, .post_set_cr3 = sev_post_set_cr3, .is_valid_cr4 = svm_is_valid_cr4, @@ -5108,9 +5207,11 @@ static __init int svm_hardware_setup(void) svm_adjust_mmio_mask(); + nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS); + /* * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which - * may be modified by svm_adjust_mmio_mask()). + * may be modified by svm_adjust_mmio_mask()), as well as nrips. */ sev_hardware_setup(); @@ -5122,11 +5223,6 @@ static __init int svm_hardware_setup(void) goto err; } - if (nrips) { - if (!boot_cpu_has(X86_FEATURE_NRIPS)) - nrips = false; - } - enable_apicv = avic = avic && avic_hardware_setup(); if (!enable_apicv) { @@ -5209,6 +5305,13 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { .pmu_ops = &amd_pmu_ops, }; +static void __svm_exit(void) +{ + kvm_x86_vendor_exit(); + + cpu_emergency_unregister_virt_callback(svm_emergency_disable); +} + static int __init svm_init(void) { int r; @@ -5222,6 +5325,8 @@ static int __init svm_init(void) if (r) return r; + cpu_emergency_register_virt_callback(svm_emergency_disable); + /* * Common KVM initialization _must_ come last, after this, /dev/kvm is * exposed to userspace! @@ -5234,14 +5339,14 @@ static int __init svm_init(void) return 0; err_kvm_init: - kvm_x86_vendor_exit(); + __svm_exit(); return r; } static void __exit svm_exit(void) { kvm_exit(); - kvm_x86_vendor_exit(); + __svm_exit(); } module_init(svm_init) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 18af7e712a5a..f41253958357 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -22,6 +22,7 @@ #include <asm/svm.h> #include <asm/sev-common.h> +#include "cpuid.h" #include "kvm_cache_regs.h" #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) @@ -33,6 +34,7 @@ #define MSRPM_OFFSETS 32 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; +extern int nrips; extern int vgif; extern bool intercept_smi; extern bool x2avic_enabled; @@ -190,10 +192,12 @@ struct vcpu_sev_es_state { /* SEV-ES support */ struct sev_es_save_area *vmsa; struct ghcb *ghcb; + u8 valid_bitmap[16]; struct kvm_host_map ghcb_map; bool received_first_sipi; /* SEV-ES scratch area support */ + u64 sw_scratch; void *ghcb_sa; u32 ghcb_sa_len; bool ghcb_sa_sync; @@ -258,16 +262,6 @@ struct vcpu_svm { unsigned long soft_int_next_rip; bool soft_int_injected; - /* optional nested SVM features that are enabled for this guest */ - bool nrips_enabled : 1; - bool tsc_scaling_enabled : 1; - bool v_vmload_vmsave_enabled : 1; - bool lbrv_enabled : 1; - bool pause_filter_enabled : 1; - bool pause_threshold_enabled : 1; - bool vgif_enabled : 1; - bool vnmi_enabled : 1; - u32 ldr_reg; u32 dfr_reg; struct page *avic_backing_page; @@ -404,48 +398,6 @@ static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u3 return test_bit(bit, (unsigned long *)&control->intercepts); } -static inline void set_dr_intercepts(struct vcpu_svm *svm) -{ - struct vmcb *vmcb = svm->vmcb01.ptr; - - if (!sev_es_guest(svm->vcpu.kvm)) { - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); - } - - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); - - recalc_intercepts(svm); -} - -static inline void clr_dr_intercepts(struct vcpu_svm *svm) -{ - struct vmcb *vmcb = svm->vmcb01.ptr; - - vmcb->control.intercepts[INTERCEPT_DR] = 0; - - /* DR7 access must remain intercepted for an SEV-ES guest */ - if (sev_es_guest(svm->vcpu.kvm)) { - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); - } - - recalc_intercepts(svm); -} - static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit) { struct vmcb *vmcb = svm->vmcb01.ptr; @@ -491,7 +443,8 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) static inline bool nested_vgif_enabled(struct vcpu_svm *svm) { - return svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK); + return guest_can_use(&svm->vcpu, X86_FEATURE_VGIF) && + (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK); } static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm) @@ -542,7 +495,7 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) static inline bool nested_vnmi_enabled(struct vcpu_svm *svm) { - return svm->vnmi_enabled && + return guest_can_use(&svm->vcpu, X86_FEATURE_VNMI) && (svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK); } @@ -658,7 +611,7 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); int nested_svm_exit_special(struct vcpu_svm *svm); void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu); -void __svm_write_tsc_multiplier(u64 multiplier); +void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu); void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, struct vmcb_control_area *control); void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, @@ -744,4 +697,28 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm); void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); +#define DEFINE_KVM_GHCB_ACCESSORS(field) \ + static __always_inline bool kvm_ghcb_##field##_is_valid(const struct vcpu_svm *svm) \ + { \ + return test_bit(GHCB_BITMAP_IDX(field), \ + (unsigned long *)&svm->sev_es.valid_bitmap); \ + } \ + \ + static __always_inline u64 kvm_ghcb_get_##field##_if_valid(struct vcpu_svm *svm, struct ghcb *ghcb) \ + { \ + return kvm_ghcb_##field##_is_valid(svm) ? ghcb->save.field : 0; \ + } \ + +DEFINE_KVM_GHCB_ACCESSORS(cpl) +DEFINE_KVM_GHCB_ACCESSORS(rax) +DEFINE_KVM_GHCB_ACCESSORS(rcx) +DEFINE_KVM_GHCB_ACCESSORS(rdx) +DEFINE_KVM_GHCB_ACCESSORS(rbx) +DEFINE_KVM_GHCB_ACCESSORS(rsi) +DEFINE_KVM_GHCB_ACCESSORS(sw_exit_code) +DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_1) +DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_2) +DEFINE_KVM_GHCB_ACCESSORS(sw_scratch) +DEFINE_KVM_GHCB_ACCESSORS(xcr0) + #endif diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 8e8295e774f0..ef2ebabb059c 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -222,7 +222,7 @@ SYM_FUNC_START(__svm_vcpu_run) * because interrupt handlers won't sanitize 'ret' if the return is * from the kernel. */ - UNTRAIN_RET + UNTRAIN_RET_VM /* * Clear all general purpose registers except RSP and RAX to prevent @@ -359,7 +359,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) * because interrupt handlers won't sanitize RET if the return is * from the kernel. */ - UNTRAIN_RET + UNTRAIN_RET_VM /* "Pop" @spec_ctrl_intercepted. */ pop %_ASM_BX diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index d0abee35d7ba..41a4533f9989 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -252,7 +252,7 @@ static inline bool cpu_has_vmx_pml(void) static inline bool cpu_has_vmx_xsaves(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_ENABLE_XSAVES; } static inline bool cpu_has_vmx_waitpkg(void) diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index 79450e1ed7cf..313b8bb5b8a7 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -78,7 +78,7 @@ SECONDARY_EXEC_DESC | \ SECONDARY_EXEC_ENABLE_RDTSCP | \ SECONDARY_EXEC_ENABLE_INVPCID | \ - SECONDARY_EXEC_XSAVES | \ + SECONDARY_EXEC_ENABLE_XSAVES | \ SECONDARY_EXEC_RDSEED_EXITING | \ SECONDARY_EXEC_RDRAND_EXITING | \ SECONDARY_EXEC_TSC_SCALING | \ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 516391cc0d64..c5ec0ef51ff7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2307,7 +2307,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_ENABLE_RDTSCP | - SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_ENABLE_XSAVES | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | @@ -6331,7 +6331,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, * If if it were, XSS would have to be checked against * the XSS exit bitmap in vmcs12. */ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); case EXIT_REASON_UMWAIT: case EXIT_REASON_TPAUSE: return nested_cpu_has2(vmcs12, @@ -6426,7 +6426,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, vmx = to_vmx(vcpu); vmcs12 = get_vmcs12(vcpu); - if (nested_vmx_allowed(vcpu) && + if (guest_can_use(vcpu, X86_FEATURE_VMX) && (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; @@ -6567,7 +6567,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) return -EINVAL; } else { - if (!nested_vmx_allowed(vcpu)) + if (!guest_can_use(vcpu, X86_FEATURE_VMX)) return -EINVAL; if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) @@ -6601,7 +6601,8 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && - (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) + (!guest_can_use(vcpu, X86_FEATURE_VMX) || + !vmx->nested.enlightened_vmcs_enabled)) return -EINVAL; vmx_leave_nested(vcpu); @@ -6874,7 +6875,7 @@ static void nested_vmx_setup_secondary_ctls(u32 ept_caps, SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_ENABLE_VMFUNC | SECONDARY_EXEC_RDSEED_EXITING | - SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_ENABLE_XSAVES | SECONDARY_EXEC_TSC_SCALING | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 96952263b029..b4b9d51438c6 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -168,7 +168,7 @@ static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) { - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); } static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 80c769c58a87..f2efa0bf7ae8 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -22,23 +22,51 @@ #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) +enum intel_pmu_architectural_events { + /* + * The order of the architectural events matters as support for each + * event is enumerated via CPUID using the index of the event. + */ + INTEL_ARCH_CPU_CYCLES, + INTEL_ARCH_INSTRUCTIONS_RETIRED, + INTEL_ARCH_REFERENCE_CYCLES, + INTEL_ARCH_LLC_REFERENCES, + INTEL_ARCH_LLC_MISSES, + INTEL_ARCH_BRANCHES_RETIRED, + INTEL_ARCH_BRANCHES_MISPREDICTED, + + NR_REAL_INTEL_ARCH_EVENTS, + + /* + * Pseudo-architectural event used to implement IA32_FIXED_CTR2, a.k.a. + * TSC reference cycles. The architectural reference cycles event may + * or may not actually use the TSC as the reference, e.g. might use the + * core crystal clock or the bus clock (yeah, "architectural"). + */ + PSEUDO_ARCH_REFERENCE_CYCLES = NR_REAL_INTEL_ARCH_EVENTS, + NR_INTEL_ARCH_EVENTS, +}; + static struct { u8 eventsel; u8 unit_mask; } const intel_arch_events[] = { - [0] = { 0x3c, 0x00 }, - [1] = { 0xc0, 0x00 }, - [2] = { 0x3c, 0x01 }, - [3] = { 0x2e, 0x4f }, - [4] = { 0x2e, 0x41 }, - [5] = { 0xc4, 0x00 }, - [6] = { 0xc5, 0x00 }, - /* The above index must match CPUID 0x0A.EBX bit vector */ - [7] = { 0x00, 0x03 }, + [INTEL_ARCH_CPU_CYCLES] = { 0x3c, 0x00 }, + [INTEL_ARCH_INSTRUCTIONS_RETIRED] = { 0xc0, 0x00 }, + [INTEL_ARCH_REFERENCE_CYCLES] = { 0x3c, 0x01 }, + [INTEL_ARCH_LLC_REFERENCES] = { 0x2e, 0x4f }, + [INTEL_ARCH_LLC_MISSES] = { 0x2e, 0x41 }, + [INTEL_ARCH_BRANCHES_RETIRED] = { 0xc4, 0x00 }, + [INTEL_ARCH_BRANCHES_MISPREDICTED] = { 0xc5, 0x00 }, + [PSEUDO_ARCH_REFERENCE_CYCLES] = { 0x00, 0x03 }, }; /* mapping between fixed pmc index and intel_arch_events array */ -static int fixed_pmc_events[] = {1, 0, 7}; +static int fixed_pmc_events[] = { + [0] = INTEL_ARCH_INSTRUCTIONS_RETIRED, + [1] = INTEL_ARCH_CPU_CYCLES, + [2] = PSEUDO_ARCH_REFERENCE_CYCLES, +}; static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { @@ -80,16 +108,18 @@ static bool intel_hw_event_available(struct kvm_pmc *pmc) u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; int i; - for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) { + BUILD_BUG_ON(ARRAY_SIZE(intel_arch_events) != NR_INTEL_ARCH_EVENTS); + + /* + * Disallow events reported as unavailable in guest CPUID. Note, this + * doesn't apply to pseudo-architectural events. + */ + for (i = 0; i < NR_REAL_INTEL_ARCH_EVENTS; i++) { if (intel_arch_events[i].eventsel != event_select || intel_arch_events[i].unit_mask != unit_mask) continue; - /* disable event that reported as not present by cpuid */ - if ((i < 7) && !(pmu->available_event_types & (1 << i))) - return false; - - break; + return pmu->available_event_types & BIT(i); } return true; @@ -438,16 +468,17 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu) { - size_t size = ARRAY_SIZE(fixed_pmc_events); - struct kvm_pmc *pmc; - u32 event; int i; + BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_events) != KVM_PMC_MAX_FIXED); + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { - pmc = &pmu->fixed_counters[i]; - event = fixed_pmc_events[array_index_nospec(i, size)]; + int index = array_index_nospec(i, KVM_PMC_MAX_FIXED); + struct kvm_pmc *pmc = &pmu->fixed_counters[index]; + u32 event = fixed_pmc_events[index]; + pmc->eventsel = (intel_arch_events[event].unit_mask << 8) | - intel_arch_events[event].eventsel; + intel_arch_events[event].eventsel; } } @@ -508,10 +539,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (pmu->version == 1) { pmu->nr_arch_fixed_counters = 0; } else { - pmu->nr_arch_fixed_counters = - min3(ARRAY_SIZE(fixed_pmc_events), - (size_t) edx.split.num_counters_fixed, - (size_t)kvm_pmu_cap.num_counters_fixed); + pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, + kvm_pmu_cap.num_counters_fixed); edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, kvm_pmu_cap.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 94c38bea60e7..af662312fd07 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -175,7 +175,7 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) * scheduled out). */ if (pi_test_on(&new)) - apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); + __apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); local_irq_restore(flags); } diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 07e927d4d099..be275a0410a8 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -303,10 +303,8 @@ SYM_FUNC_START(vmx_do_nmi_irqoff) VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx SYM_FUNC_END(vmx_do_nmi_irqoff) - -.section .text, "ax" - #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT + /** * vmread_error_trampoline - Trampoline from inline asm to vmread_error() * @field: VMCS field encoding that failed @@ -335,7 +333,7 @@ SYM_FUNC_START(vmread_error_trampoline) mov 3*WORD_SIZE(%_ASM_BP), %_ASM_ARG2 mov 2*WORD_SIZE(%_ASM_BP), %_ASM_ARG1 - call vmread_error + call vmread_error_trampoline2 /* Zero out @fault, which will be popped into the result register. */ _ASM_MOV $0, 3*WORD_SIZE(%_ASM_BP) @@ -357,6 +355,8 @@ SYM_FUNC_START(vmread_error_trampoline) SYM_FUNC_END(vmread_error_trampoline) #endif +.section .text, "ax" + SYM_FUNC_START(vmx_do_interrupt_irqoff) VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1 SYM_FUNC_END(vmx_do_interrupt_irqoff) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0ecf4be2c6af..72e3943f3693 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -41,13 +41,12 @@ #include <asm/idtentry.h> #include <asm/io.h> #include <asm/irq_remapping.h> -#include <asm/kexec.h> +#include <asm/reboot.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/mshyperv.h> #include <asm/mwait.h> #include <asm/spec-ctrl.h> -#include <asm/virtext.h> #include <asm/vmx.h> #include "capabilities.h" @@ -237,9 +236,6 @@ static const struct { #define L1D_CACHE_ORDER 4 static void *vmx_l1d_flush_pages; -/* Control for disabling CPU Fill buffer clear */ -static bool __read_mostly vmx_fb_clear_ctrl_available; - static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) { struct page *page; @@ -255,14 +251,9 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) return 0; } - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { - u64 msr; - - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); - if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { - l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; - return 0; - } + if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; } /* If set to auto use the default l1tf mitigation method */ @@ -366,22 +357,9 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) { if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) - return sprintf(s, "???\n"); - - return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); -} - -static void vmx_setup_fb_clear_ctrl(void) -{ - u64 msr; + return sysfs_emit(s, "???\n"); - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) && - !boot_cpu_has_bug(X86_BUG_MDS) && - !boot_cpu_has_bug(X86_BUG_TAA)) { - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); - if (msr & ARCH_CAP_FB_CLEAR_CTRL) - vmx_fb_clear_ctrl_available = true; - } + return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); } static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) @@ -409,7 +387,9 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { - vmx->disable_fb_clear = vmx_fb_clear_ctrl_available; + vmx->disable_fb_clear = (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && + !boot_cpu_has_bug(X86_BUG_MDS) && + !boot_cpu_has_bug(X86_BUG_TAA); /* * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS @@ -441,13 +421,23 @@ do { \ pr_warn_ratelimited(fmt); \ } while (0) -void vmread_error(unsigned long field, bool fault) +noinline void vmread_error(unsigned long field) +{ + vmx_insn_failed("vmread failed: field=%lx\n", field); +} + +#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT +noinstr void vmread_error_trampoline2(unsigned long field, bool fault) { - if (fault) + if (fault) { kvm_spurious_fault(); - else - vmx_insn_failed("vmread failed: field=%lx\n", field); + } else { + instrumentation_begin(); + vmread_error(field); + instrumentation_end(); + } } +#endif noinline void vmwrite_error(unsigned long field, unsigned long value) { @@ -744,17 +734,51 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, return ret; } -#ifdef CONFIG_KEXEC_CORE -static void crash_vmclear_local_loaded_vmcss(void) +/* + * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) + * + * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to + * atomically track post-VMXON state, e.g. this may be called in NMI context. + * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. + * faults are guaranteed to be due to the !post-VMXON check unless the CPU is + * magically in RM, VM86, compat mode, or at CPL>0. + */ +static int kvm_cpu_vmxoff(void) +{ + asm_volatile_goto("1: vmxoff\n\t" + _ASM_EXTABLE(1b, %l[fault]) + ::: "cc", "memory" : fault); + + cr4_clear_bits(X86_CR4_VMXE); + return 0; + +fault: + cr4_clear_bits(X86_CR4_VMXE); + return -EIO; +} + +static void vmx_emergency_disable(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v; + kvm_rebooting = true; + + /* + * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be + * set in task context. If this races with VMX is disabled by an NMI, + * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to + * kvm_rebooting set. + */ + if (!(__read_cr4() & X86_CR4_VMXE)) + return; + list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), loaded_vmcss_on_cpu_link) vmcs_clear(v->vmcs); + + kvm_cpu_vmxoff(); } -#endif /* CONFIG_KEXEC_CORE */ static void __loaded_vmcs_clear(void *arg) { @@ -1503,6 +1527,11 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long old_rflags; + /* + * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU + * is an unrestricted guest in order to mark L2 as needing emulation + * if L1 runs L2 as a restricted guest. + */ if (is_unrestricted_guest(vcpu)) { kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); vmx->rflags = rflags; @@ -1884,25 +1913,14 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) return kvm_caps.default_tsc_scaling_ratio; } -static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) { - vmcs_write64(TSC_OFFSET, offset); + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); } -static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) { - vmcs_write64(TSC_MULTIPLIER, multiplier); -} - -/* - * nested_vmx_allowed() checks whether a guest should be allowed to use VMX - * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for - * all guests if the "nested" module option is off, and can also be disabled - * for a single guest by disabling its VMX cpuid bit. - */ -bool nested_vmx_allowed(struct kvm_vcpu *vcpu) -{ - return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); + vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); } /* @@ -2032,7 +2050,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; break; case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: - if (!nested_vmx_allowed(vcpu)) + if (!guest_can_use(vcpu, X86_FEATURE_VMX)) return 1; if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data)) @@ -2340,7 +2358,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!msr_info->host_initiated) return 1; /* they are read-only */ - if (!nested_vmx_allowed(vcpu)) + if (!guest_can_use(vcpu, X86_FEATURE_VMX)) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_RTIT_CTL: @@ -2714,11 +2732,11 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, return 0; } -static bool kvm_is_vmx_supported(void) +static bool __kvm_is_vmx_supported(void) { - int cpu = raw_smp_processor_id(); + int cpu = smp_processor_id(); - if (!cpu_has_vmx()) { + if (!(cpuid_ecx(1) & feature_bit(VMX))) { pr_err("VMX not supported by CPU %d\n", cpu); return false; } @@ -2732,13 +2750,24 @@ static bool kvm_is_vmx_supported(void) return true; } +static bool kvm_is_vmx_supported(void) +{ + bool supported; + + migrate_disable(); + supported = __kvm_is_vmx_supported(); + migrate_enable(); + + return supported; +} + static int vmx_check_processor_compat(void) { int cpu = raw_smp_processor_id(); struct vmcs_config vmcs_conf; struct vmx_capability vmx_cap; - if (!kvm_is_vmx_supported()) + if (!__kvm_is_vmx_supported()) return -EIO; if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { @@ -2818,7 +2847,7 @@ static void vmx_hardware_disable(void) { vmclear_local_loaded_vmcss(); - if (cpu_vmxoff()) + if (kvm_cpu_vmxoff()) kvm_spurious_fault(); hv_reset_evmcs(); @@ -3037,6 +3066,15 @@ static void enter_rmode(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); + /* + * KVM should never use VM86 to virtualize Real Mode when L2 is active, + * as using VM86 is unnecessary if unrestricted guest is enabled, and + * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0 + * should VM-Fail and KVM should reject userspace attempts to stuff + * CR0.PG=0 when L2 is active. + */ + WARN_ON_ONCE(is_guest_mode(vcpu)); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); @@ -3047,13 +3085,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmx->rmode.vm86_active = 1; - /* - * Very old userspace does not call KVM_SET_TSS_ADDR before entering - * vcpu. Warn the user that an update is overdue. - */ - if (!kvm_vmx->tss_addr) - pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n"); - vmx_segment_cache_clear(vmx); vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); @@ -3226,6 +3257,17 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ CPU_BASED_CR3_STORE_EXITING) +static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (is_guest_mode(vcpu)) + return nested_guest_cr0_valid(vcpu, cr0); + + if (to_vmx(vcpu)->nested.vmxon) + return nested_host_cr0_valid(vcpu, cr0); + + return true; +} + void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3235,7 +3277,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); - if (is_unrestricted_guest(vcpu)) + if (enable_unrestricted_guest) hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; else { hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; @@ -3263,7 +3305,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif - if (enable_ept && !is_unrestricted_guest(vcpu)) { + if (enable_ept && !enable_unrestricted_guest) { /* * Ensure KVM has an up-to-date snapshot of the guest's CR3. If * the below code _enables_ CR3 exiting, vmx_cache_reg() will @@ -3315,7 +3357,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx->emulation_required = vmx_emulation_required(vcpu); } -static int vmx_get_max_tdp_level(void) +static int vmx_get_max_ept_level(void) { if (cpu_has_vmx_ept_5levels()) return 5; @@ -3394,7 +3436,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) * this bit, even if host CR4.MCE == 0. */ hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); - if (is_unrestricted_guest(vcpu)) + if (enable_unrestricted_guest) hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; else if (vmx->rmode.vm86_active) hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; @@ -3414,7 +3456,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vcpu->arch.cr4 = cr4; kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); - if (!is_unrestricted_guest(vcpu)) { + if (!enable_unrestricted_guest) { if (enable_ept) { if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; @@ -4144,7 +4186,7 @@ static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, */ if (vcpu != kvm_get_running_vcpu()) - apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); + __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); return; } #endif @@ -4518,16 +4560,19 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, * based on a single guest CPUID bit, with a dedicated feature bit. This also * verifies that the control is actually supported by KVM and hardware. */ -#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ -({ \ - bool __enabled; \ - \ - if (cpu_has_vmx_##name()) { \ - __enabled = guest_cpuid_has(&(vmx)->vcpu, \ - X86_FEATURE_##feat_name); \ - vmx_adjust_secondary_exec_control(vmx, exec_control, \ - SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \ - } \ +#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ +({ \ + struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \ + bool __enabled; \ + \ + if (cpu_has_vmx_##name()) { \ + if (kvm_is_governed_feature(X86_FEATURE_##feat_name)) \ + __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name); \ + else \ + __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name); \ + vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\ + __enabled, exiting); \ + } \ }) /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */ @@ -4587,19 +4632,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - if (cpu_has_vmx_xsaves()) { - /* Exposing XSAVES only when XSAVE is exposed */ - bool xsaves_enabled = - boot_cpu_has(X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); - - vcpu->arch.xsaves_enabled = xsaves_enabled; - - vmx_adjust_secondary_exec_control(vmx, &exec_control, - SECONDARY_EXEC_XSAVES, - xsaves_enabled, false); - } + vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES); /* * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either @@ -4618,6 +4651,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) SECONDARY_EXEC_ENABLE_RDTSCP, rdpid_or_rdtscp_enabled, false); } + vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); @@ -4651,7 +4685,8 @@ static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) if (kvm_vmx->pid_table) return 0; - pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm)); + pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, + vmx_get_pid_table_order(kvm)); if (!pages) return -ENOMEM; @@ -5364,18 +5399,11 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) val = (val & ~vmcs12->cr0_guest_host_mask) | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); - if (!nested_guest_cr0_valid(vcpu, val)) - return 1; - if (kvm_set_cr0(vcpu, val)) return 1; vmcs_writel(CR0_READ_SHADOW, orig_val); return 0; } else { - if (to_vmx(vcpu)->nested.vmxon && - !nested_host_cr0_valid(vcpu, val)) - return 1; - return kvm_set_cr0(vcpu, val); } } @@ -6767,8 +6795,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); read_unlock(&vcpu->kvm->mmu_lock); - vmx_flush_tlb_current(vcpu); - + /* + * No need for a manual TLB flush at this point, KVM has already done a + * flush if there were SPTEs pointing at the previous page. + */ out: /* * Do not pin apic access page in memory, the MMU notifier @@ -7214,13 +7244,20 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, flags); vcpu->arch.cr2 = native_read_cr2(); + vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; + + vmx->idt_vectoring_info = 0; vmx_enable_fb_clear(vmx); - if (unlikely(vmx->fail)) + if (unlikely(vmx->fail)) { vmx->exit_reason.full = 0xdead; - else - vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); + goto out; + } + + vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); + if (likely(!vmx->exit_reason.failed_vmentry)) + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && is_nmi(vmx_get_intr_info(vcpu))) { @@ -7229,6 +7266,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, kvm_after_interrupt(vcpu); } +out: guest_state_exit_irqoff(); } @@ -7350,8 +7388,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) loadsegment(es, __USER_DS); #endif - vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; - pt_guest_exit(vmx); kvm_load_host_xsave_state(vcpu); @@ -7368,17 +7404,12 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->nested.nested_run_pending = 0; } - vmx->idt_vectoring_info = 0; - if (unlikely(vmx->fail)) return EXIT_FASTPATH_NONE; if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); - if (likely(!vmx->exit_reason.failed_vmentry)) - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - trace_kvm_exit(vcpu, KVM_ISA_VMX); if (unlikely(vmx->exit_reason.failed_vmentry)) @@ -7722,8 +7753,16 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */ - vcpu->arch.xsaves_enabled = false; + /* + * XSAVES is effectively enabled if and only if XSAVE is also exposed + * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be + * set if and only if XSAVE is supported. + */ + if (boot_cpu_has(X86_FEATURE_XSAVE) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES); + + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX); vmx_setup_uret_msrs(vmx); @@ -7731,7 +7770,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmcs_set_secondary_exec_control(vmx, vmx_secondary_exec_control(vmx)); - if (nested_vmx_allowed(vcpu)) + if (guest_can_use(vcpu, X86_FEATURE_VMX)) vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; @@ -7740,7 +7779,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); - if (nested_vmx_allowed(vcpu)) + if (guest_can_use(vcpu, X86_FEATURE_VMX)) nested_vmx_cr_fixed1_bits_update(vcpu); if (boot_cpu_has(X86_FEATURE_INTEL_PT) && @@ -8203,6 +8242,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_segment = vmx_set_segment, .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .is_valid_cr0 = vmx_is_valid_cr0, .set_cr0 = vmx_set_cr0, .is_valid_cr4 = vmx_is_valid_cr4, .set_cr4 = vmx_set_cr4, @@ -8496,7 +8536,7 @@ static __init int hardware_setup(void) */ vmx_setup_me_spte_mask(); - kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(), + kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(), ept_caps_to_lpage_level(vmx_capability.ept)); /* @@ -8592,10 +8632,8 @@ static void __vmx_exit(void) { allow_smaller_maxphyaddr = false; -#ifdef CONFIG_KEXEC_CORE - RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); - synchronize_rcu(); -#endif + cpu_emergency_unregister_virt_callback(vmx_emergency_disable); + vmx_cleanup_l1d_flush(); } @@ -8636,18 +8674,14 @@ static int __init vmx_init(void) if (r) goto err_l1d_flush; - vmx_setup_fb_clear_ctrl(); - for_each_possible_cpu(cpu) { INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); pi_init_cpu(cpu); } -#ifdef CONFIG_KEXEC_CORE - rcu_assign_pointer(crash_vmclear_loaded_vmcss, - crash_vmclear_local_loaded_vmcss); -#endif + cpu_emergency_register_virt_callback(vmx_emergency_disable); + vmx_check_vmcs12_offsets(); /* diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 32384ba38499..c2130d2c8e24 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -374,7 +374,6 @@ struct kvm_vmx { u64 *pid_table; }; -bool nested_vmx_allowed(struct kvm_vcpu *vcpu); void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, struct loaded_vmcs *buddy); int allocate_vpid(void); @@ -562,7 +561,7 @@ static inline u8 vmx_get_rvi(void) SECONDARY_EXEC_APIC_REGISTER_VIRT | \ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ SECONDARY_EXEC_SHADOW_VMCS | \ - SECONDARY_EXEC_XSAVES | \ + SECONDARY_EXEC_ENABLE_XSAVES | \ SECONDARY_EXEC_RDSEED_EXITING | \ SECONDARY_EXEC_RDRAND_EXITING | \ SECONDARY_EXEC_ENABLE_PML | \ diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index ce47dc265f89..33af7b4c6eb4 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -10,7 +10,7 @@ #include "vmcs.h" #include "../x86.h" -void vmread_error(unsigned long field, bool fault); +void vmread_error(unsigned long field); void vmwrite_error(unsigned long field, unsigned long value); void vmclear_error(struct vmcs *vmcs, u64 phys_addr); void vmptrld_error(struct vmcs *vmcs, u64 phys_addr); @@ -31,6 +31,13 @@ void invept_error(unsigned long ext, u64 eptp, gpa_t gpa); * void vmread_error_trampoline(unsigned long field, bool fault); */ extern unsigned long vmread_error_trampoline; + +/* + * The second VMREAD error trampoline, called from the assembly trampoline, + * exists primarily to enable instrumentation for the VM-Fail path. + */ +void vmread_error_trampoline2(unsigned long field, bool fault); + #endif static __always_inline void vmcs_check16(unsigned long field) @@ -101,8 +108,7 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) do_fail: instrumentation_begin(); - WARN_ONCE(1, KBUILD_MODNAME ": vmread failed: field=%lx\n", field); - pr_warn_ratelimited(KBUILD_MODNAME ": vmread failed: field=%lx\n", field); + vmread_error(field); instrumentation_end(); return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a6b9bea62fb8..6c9c81e82e65 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -25,6 +25,7 @@ #include "tss.h" #include "kvm_cache_regs.h" #include "kvm_emulate.h" +#include "mmu/page_track.h" #include "x86.h" #include "cpuid.h" #include "pmu.h" @@ -237,6 +238,9 @@ EXPORT_SYMBOL_GPL(enable_apicv); u64 __read_mostly host_xss; EXPORT_SYMBOL_GPL(host_xss); +u64 __read_mostly host_arch_capabilities; +EXPORT_SYMBOL_GPL(host_arch_capabilities); + const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), STATS_DESC_COUNTER(VM, mmu_shadow_zapped), @@ -906,6 +910,22 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) } EXPORT_SYMBOL_GPL(load_pdptrs); +static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ +#ifdef CONFIG_X86_64 + if (cr0 & 0xffffffff00000000UL) + return false; +#endif + + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) + return false; + + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) + return false; + + return static_call(kvm_x86_is_valid_cr0)(vcpu, cr0); +} + void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) { /* @@ -952,20 +972,13 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); - cr0 |= X86_CR0_ET; - -#ifdef CONFIG_X86_64 - if (cr0 & 0xffffffff00000000UL) + if (!kvm_is_valid_cr0(vcpu, cr0)) return 1; -#endif - cr0 &= ~CR0_RESERVED_BITS; - - if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) - return 1; + cr0 |= X86_CR0_ET; - if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) - return 1; + /* Write to CR0 reserved bits are ignored, even on Intel. */ + cr0 &= ~CR0_RESERVED_BITS; #ifdef CONFIG_X86_64 if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) && @@ -1012,7 +1025,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); - if (vcpu->arch.xsaves_enabled && + if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && vcpu->arch.ia32_xss != host_xss) wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } @@ -1043,7 +1056,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); - if (vcpu->arch.xsaves_enabled && + if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && vcpu->arch.ia32_xss != host_xss) wrmsrl(MSR_IA32_XSS, host_xss); } @@ -1607,16 +1620,11 @@ static bool kvm_is_immutable_feature_msr(u32 msr) ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \ ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ - ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO) + ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO) static u64 kvm_get_arch_capabilities(void) { - u64 data = 0; - - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); - data &= KVM_SUPPORTED_ARCH_CAP; - } + u64 data = host_arch_capabilities & KVM_SUPPORTED_ARCH_CAP; /* * If nx_huge_pages is enabled, KVM's shadow paging will ensure that @@ -1664,6 +1672,9 @@ static u64 kvm_get_arch_capabilities(void) */ } + if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated()) + data |= ARCH_CAP_GDS_NO; + return data; } @@ -2172,6 +2183,8 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) u64 data; fastpath_t ret = EXIT_FASTPATH_NONE; + kvm_vcpu_srcu_read_lock(vcpu); + switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): data = kvm_read_edx_eax(vcpu); @@ -2194,6 +2207,8 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) if (ret != EXIT_FASTPATH_NONE) trace_kvm_msr_write(msr, data); + kvm_vcpu_srcu_read_unlock(vcpu); + return ret; } EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); @@ -2615,7 +2630,7 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) else vcpu->arch.tsc_offset = l1_offset; - static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset); + static_call(kvm_x86_write_tsc_offset)(vcpu); } static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier) @@ -2631,8 +2646,7 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli vcpu->arch.tsc_scaling_ratio = l1_multiplier; if (kvm_caps.has_tsc_control) - static_call(kvm_x86_write_tsc_multiplier)( - vcpu, vcpu->arch.tsc_scaling_ratio); + static_call(kvm_x86_write_tsc_multiplier)(vcpu); } static inline bool kvm_check_tsc_unstable(void) @@ -4649,7 +4663,6 @@ static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) return 0; default: return -ENXIO; - break; } } @@ -6516,7 +6529,7 @@ static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter) static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, struct kvm_msr_filter_range *user_range) { - unsigned long *bitmap = NULL; + unsigned long *bitmap; size_t bitmap_size; if (!user_range->nmsrs) @@ -8229,11 +8242,6 @@ static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only); } -static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt) -{ - return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM); -} - static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt) { return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE); @@ -8335,7 +8343,6 @@ static const struct x86_emulate_ops emulate_ops = { .fix_hypercall = emulator_fix_hypercall, .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, - .guest_has_long_mode = emulator_guest_has_long_mode, .guest_has_movbe = emulator_guest_has_movbe, .guest_has_fxsr = emulator_guest_has_fxsr, .guest_has_rdpid = emulator_guest_has_rdpid, @@ -9156,7 +9163,7 @@ static int kvmclock_cpu_down_prep(unsigned int cpu) static void tsc_khz_changed(void *data) { struct cpufreq_freqs *freq = data; - unsigned long khz = 0; + unsigned long khz; WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC)); @@ -9496,6 +9503,9 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_init_pmu_capability(ops->pmu_ops); + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, host_arch_capabilities); + r = ops->hardware_setup(); if (r != 0) goto out_mmu_exit; @@ -10203,9 +10213,13 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, if (r < 0) goto out; if (r) { - kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); - static_call(kvm_x86_inject_irq)(vcpu, false); - WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); + int irq = kvm_cpu_get_interrupt(vcpu); + + if (!WARN_ON_ONCE(irq == -1)) { + kvm_queue_interrupt(vcpu, irq, false); + static_call(kvm_x86_inject_irq)(vcpu, false); + WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); + } } if (kvm_cpu_has_injectable_intr(vcpu)) static_call(kvm_x86_enable_irq_window)(vcpu); @@ -11091,12 +11105,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) r = -EINTR; goto out; } + /* - * It should be impossible for the hypervisor timer to be in - * use before KVM has ever run the vCPU. + * Don't bother switching APIC timer emulation from the + * hypervisor timer to the software timer, the only way for the + * APIC timer to be active is if userspace stuffed vCPU state, + * i.e. put the vCPU into a nonsensical state. Only an INIT + * will transition the vCPU out of UNINITIALIZED (without more + * state stuffing from userspace), which will reset the local + * APIC and thus cancel the timer or drop the IRQ (if the timer + * already expired). */ - WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu)); - kvm_vcpu_srcu_read_unlock(vcpu); kvm_vcpu_block(vcpu); kvm_vcpu_srcu_read_lock(vcpu); @@ -11460,7 +11479,8 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return false; } - return kvm_is_valid_cr4(vcpu, sregs->cr4); + return kvm_is_valid_cr4(vcpu, sregs->cr4) && + kvm_is_valid_cr0(vcpu, sregs->cr0); } static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, @@ -11777,15 +11797,22 @@ static int sync_regs(struct kvm_vcpu *vcpu) __set_regs(vcpu, &vcpu->run->s.regs.regs); vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS; } + if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) { - if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs)) + struct kvm_sregs sregs = vcpu->run->s.regs.sregs; + + if (__set_sregs(vcpu, &sregs)) return -EINVAL; + vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS; } + if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) { - if (kvm_vcpu_ioctl_x86_set_vcpu_events( - vcpu, &vcpu->run->s.regs.events)) + struct kvm_vcpu_events events = vcpu->run->s.regs.events; + + if (kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events)) return -EINVAL; + vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS; } @@ -12606,6 +12633,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *new, enum kvm_mr_change change) { + /* + * KVM doesn't support moving memslots when there are external page + * trackers attached to the VM, i.e. if KVMGT is in use. + */ + if (change == KVM_MR_MOVE && kvm_page_track_has_external_user(kvm)) + return -EINVAL; + if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) { if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn()) return -EINVAL; @@ -12751,7 +12785,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * See is_writable_pte() for more details (the case involving * access-tracked SPTEs is particularly relevant). */ - kvm_arch_flush_remote_tlbs_memslot(kvm, new); + kvm_flush_remote_tlbs_memslot(kvm, new); } } @@ -12760,6 +12794,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *new, enum kvm_mr_change change) { + if (change == KVM_MR_DELETE) + kvm_page_track_delete_slot(kvm, old); + if (!kvm->arch.n_requested_mmu_pages && (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) { unsigned long nr_mmu_pages; @@ -12776,17 +12813,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvm_arch_free_memslot(kvm, old); } -void kvm_arch_flush_shadow_all(struct kvm *kvm) -{ - kvm_mmu_zap_all(kvm); -} - -void kvm_arch_flush_shadow_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ - kvm_page_track_flush_slot(kvm, slot); -} - static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { return (is_guest_mode(vcpu) && @@ -13185,7 +13211,7 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); bool kvm_arch_has_irq_bypass(void) { - return true; + return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP); } int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 82e3dafc5453..1e7be1f6ab29 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -323,6 +323,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); extern u64 host_xcr0; extern u64 host_xss; +extern u64 host_arch_capabilities; extern struct kvm_caps kvm_caps; diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 01c5de4c279b..0a81aafed7f8 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -27,7 +27,7 @@ * NOTE! The calling convention is very intentionally the same as * for 'rep movs', so that we can rewrite the function call with * just a plain 'rep movs' on machines that have FSRM. But to make - * it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely. + * it simpler for us, we can clobber rsi/rdi and rax freely. */ SYM_FUNC_START(rep_movs_alternative) cmpq $64,%rcx @@ -68,55 +68,24 @@ SYM_FUNC_START(rep_movs_alternative) _ASM_EXTABLE_UA( 3b, .Lcopy_user_tail) .Llarge: -0: ALTERNATIVE "jmp .Lunrolled", "rep movsb", X86_FEATURE_ERMS +0: ALTERNATIVE "jmp .Llarge_movsq", "rep movsb", X86_FEATURE_ERMS 1: RET - _ASM_EXTABLE_UA( 0b, 1b) + _ASM_EXTABLE_UA( 0b, 1b) - .p2align 4 -.Lunrolled: -10: movq (%rsi),%r8 -11: movq 8(%rsi),%r9 -12: movq 16(%rsi),%r10 -13: movq 24(%rsi),%r11 -14: movq %r8,(%rdi) -15: movq %r9,8(%rdi) -16: movq %r10,16(%rdi) -17: movq %r11,24(%rdi) -20: movq 32(%rsi),%r8 -21: movq 40(%rsi),%r9 -22: movq 48(%rsi),%r10 -23: movq 56(%rsi),%r11 -24: movq %r8,32(%rdi) -25: movq %r9,40(%rdi) -26: movq %r10,48(%rdi) -27: movq %r11,56(%rdi) - addq $64,%rsi - addq $64,%rdi - subq $64,%rcx - cmpq $64,%rcx - jae .Lunrolled - cmpl $8,%ecx - jae .Lword +.Llarge_movsq: + movq %rcx,%rax + shrq $3,%rcx + andl $7,%eax +0: rep movsq + movl %eax,%ecx testl %ecx,%ecx jne .Lcopy_user_tail RET - _ASM_EXTABLE_UA(10b, .Lcopy_user_tail) - _ASM_EXTABLE_UA(11b, .Lcopy_user_tail) - _ASM_EXTABLE_UA(12b, .Lcopy_user_tail) - _ASM_EXTABLE_UA(13b, .Lcopy_user_tail) - _ASM_EXTABLE_UA(14b, .Lcopy_user_tail) - _ASM_EXTABLE_UA(15b, .Lcopy_user_tail) - _ASM_EXTABLE_UA(16b, .Lcopy_user_tail) - _ASM_EXTABLE_UA(17b, .Lcopy_user_tail) - _ASM_EXTABLE_UA(20b, .Lcopy_user_tail) - _ASM_EXTABLE_UA(21b, .Lcopy_user_tail) - _ASM_EXTABLE_UA(22b, .Lcopy_user_tail) - _ASM_EXTABLE_UA(23b, .Lcopy_user_tail) - _ASM_EXTABLE_UA(24b, .Lcopy_user_tail) - _ASM_EXTABLE_UA(25b, .Lcopy_user_tail) - _ASM_EXTABLE_UA(26b, .Lcopy_user_tail) - _ASM_EXTABLE_UA(27b, .Lcopy_user_tail) +1: leaq (%rax,%rcx,8),%rcx + jmp .Lcopy_user_tail + + _ASM_EXTABLE_UA( 0b, 1b) SYM_FUNC_END(rep_movs_alternative) EXPORT_SYMBOL(rep_movs_alternative) diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 1451e0c4ae22..235bbda6fc82 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -56,7 +56,6 @@ SYM_FUNC_END(__put_user_1) EXPORT_SYMBOL(__put_user_1) SYM_FUNC_START(__put_user_nocheck_1) - ENDBR ASM_STAC 2: movb %al,(%_ASM_CX) xor %ecx,%ecx @@ -76,7 +75,6 @@ SYM_FUNC_END(__put_user_2) EXPORT_SYMBOL(__put_user_2) SYM_FUNC_START(__put_user_nocheck_2) - ENDBR ASM_STAC 4: movw %ax,(%_ASM_CX) xor %ecx,%ecx @@ -96,7 +94,6 @@ SYM_FUNC_END(__put_user_4) EXPORT_SYMBOL(__put_user_4) SYM_FUNC_START(__put_user_nocheck_4) - ENDBR ASM_STAC 6: movl %eax,(%_ASM_CX) xor %ecx,%ecx @@ -119,7 +116,6 @@ SYM_FUNC_END(__put_user_8) EXPORT_SYMBOL(__put_user_8) SYM_FUNC_START(__put_user_nocheck_8) - ENDBR ASM_STAC 9: mov %_ASM_AX,(%_ASM_CX) #ifdef CONFIG_X86_32 diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 3fd066d42ec0..cd86aeb5fdd3 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -11,8 +11,9 @@ #include <asm/unwind_hints.h> #include <asm/percpu.h> #include <asm/frame.h> +#include <asm/nops.h> - .section .text.__x86.indirect_thunk + .section .text..__x86.indirect_thunk .macro POLINE reg @@ -131,36 +132,107 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) */ #ifdef CONFIG_RETHUNK - .section .text.__x86.return_thunk +/* + * srso_alias_untrain_ret() and srso_alias_safe_ret() are placed at + * special addresses: + * + * - srso_alias_untrain_ret() is 2M aligned + * - srso_alias_safe_ret() is also in the same 2M page but bits 2, 8, 14 + * and 20 in its virtual address are set (while those bits in the + * srso_alias_untrain_ret() function are cleared). + * + * This guarantees that those two addresses will alias in the branch + * target buffer of Zen3/4 generations, leading to any potential + * poisoned entries at that BTB slot to get evicted. + * + * As a result, srso_alias_safe_ret() becomes a safe return. + */ +#ifdef CONFIG_CPU_SRSO + .section .text..__x86.rethunk_untrain + +SYM_START(srso_alias_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + ASM_NOP2 + lfence + jmp srso_alias_return_thunk +SYM_FUNC_END(srso_alias_untrain_ret) +__EXPORT_THUNK(srso_alias_untrain_ret) + + .section .text..__x86.rethunk_safe +#else +/* dummy definition for alternatives */ +SYM_START(srso_alias_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_FUNC_END(srso_alias_untrain_ret) +#endif + +SYM_START(srso_alias_safe_ret, SYM_L_GLOBAL, SYM_A_NONE) + lea 8(%_ASM_SP), %_ASM_SP + UNWIND_HINT_FUNC + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_FUNC_END(srso_alias_safe_ret) + + .section .text..__x86.return_thunk + +SYM_CODE_START(srso_alias_return_thunk) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + call srso_alias_safe_ret + ud2 +SYM_CODE_END(srso_alias_return_thunk) + +/* + * Some generic notes on the untraining sequences: + * + * They are interchangeable when it comes to flushing potentially wrong + * RET predictions from the BTB. + * + * The SRSO Zen1/2 (MOVABS) untraining sequence is longer than the + * Retbleed sequence because the return sequence done there + * (srso_safe_ret()) is longer and the return sequence must fully nest + * (end before) the untraining sequence. Therefore, the untraining + * sequence must fully overlap the return sequence. + * + * Regarding alignment - the instructions which need to be untrained, + * must all start at a cacheline boundary for Zen1/2 generations. That + * is, instruction sequences starting at srso_safe_ret() and + * the respective instruction sequences at retbleed_return_thunk() + * must start at a cacheline boundary. + */ /* * Safety details here pertain to the AMD Zen{1,2} microarchitecture: - * 1) The RET at __x86_return_thunk must be on a 64 byte boundary, for + * 1) The RET at retbleed_return_thunk must be on a 64 byte boundary, for * alignment within the BTB. - * 2) The instruction at zen_untrain_ret must contain, and not + * 2) The instruction at retbleed_untrain_ret must contain, and not * end with, the 0xc3 byte of the RET. * 3) STIBP must be enabled, or SMT disabled, to prevent the sibling thread * from re-poisioning the BTB prediction. */ .align 64 - .skip 64 - (__x86_return_thunk - zen_untrain_ret), 0xcc -SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) + .skip 64 - (retbleed_return_thunk - retbleed_untrain_ret), 0xcc +SYM_START(retbleed_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) ANNOTATE_NOENDBR /* - * As executed from zen_untrain_ret, this is: + * As executed from retbleed_untrain_ret, this is: * * TEST $0xcc, %bl * LFENCE - * JMP __x86_return_thunk + * JMP retbleed_return_thunk * * Executing the TEST instruction has a side effect of evicting any BTB * prediction (potentially attacker controlled) attached to the RET, as - * __x86_return_thunk + 1 isn't an instruction boundary at the moment. + * retbleed_return_thunk + 1 isn't an instruction boundary at the moment. */ .byte 0xf6 /* - * As executed from __x86_return_thunk, this is a plain RET. + * As executed from retbleed_return_thunk, this is a plain RET. * * As part of the TEST above, RET is the ModRM byte, and INT3 the imm8. * @@ -172,13 +244,13 @@ SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) * With SMT enabled and STIBP active, a sibling thread cannot poison * RET's prediction to a type of its choice, but can evict the * prediction due to competitive sharing. If the prediction is - * evicted, __x86_return_thunk will suffer Straight Line Speculation + * evicted, retbleed_return_thunk will suffer Straight Line Speculation * which will be contained safely by the INT3. */ -SYM_INNER_LABEL(__x86_return_thunk, SYM_L_GLOBAL) +SYM_INNER_LABEL(retbleed_return_thunk, SYM_L_GLOBAL) ret int3 -SYM_CODE_END(__x86_return_thunk) +SYM_CODE_END(retbleed_return_thunk) /* * Ensure the TEST decoding / BTB invalidation is complete. @@ -189,11 +261,67 @@ SYM_CODE_END(__x86_return_thunk) * Jump back and execute the RET in the middle of the TEST instruction. * INT3 is for SLS protection. */ - jmp __x86_return_thunk + jmp retbleed_return_thunk int3 -SYM_FUNC_END(zen_untrain_ret) -__EXPORT_THUNK(zen_untrain_ret) +SYM_FUNC_END(retbleed_untrain_ret) +__EXPORT_THUNK(retbleed_untrain_ret) +/* + * SRSO untraining sequence for Zen1/2, similar to retbleed_untrain_ret() + * above. On kernel entry, srso_untrain_ret() is executed which is a + * + * movabs $0xccccc30824648d48,%rax + * + * and when the return thunk executes the inner label srso_safe_ret() + * later, it is a stack manipulation and a RET which is mispredicted and + * thus a "safe" one to use. + */ + .align 64 + .skip 64 - (srso_safe_ret - srso_untrain_ret), 0xcc +SYM_START(srso_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) + ANNOTATE_NOENDBR + .byte 0x48, 0xb8 + +/* + * This forces the function return instruction to speculate into a trap + * (UD2 in srso_return_thunk() below). This RET will then mispredict + * and execution will continue at the return site read from the top of + * the stack. + */ +SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL) + lea 8(%_ASM_SP), %_ASM_SP + ret + int3 + int3 + /* end of movabs */ + lfence + call srso_safe_ret + ud2 +SYM_CODE_END(srso_safe_ret) +SYM_FUNC_END(srso_untrain_ret) +__EXPORT_THUNK(srso_untrain_ret) + +SYM_CODE_START(srso_return_thunk) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + call srso_safe_ret + ud2 +SYM_CODE_END(srso_return_thunk) + +SYM_FUNC_START(entry_untrain_ret) + ALTERNATIVE_2 "jmp retbleed_untrain_ret", \ + "jmp srso_untrain_ret", X86_FEATURE_SRSO, \ + "jmp srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS +SYM_FUNC_END(entry_untrain_ret) +__EXPORT_THUNK(entry_untrain_ret) + +SYM_CODE_START(__x86_return_thunk) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(__x86_return_thunk) EXPORT_SYMBOL(__x86_return_thunk) #endif /* CONFIG_RETHUNK */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e8711b2cafaf..ab778eac1952 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1112,8 +1112,22 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) (error_code & X86_PF_INSTR), foreign)) return 1; + /* + * Shadow stack accesses (PF_SHSTK=1) are only permitted to + * shadow stack VMAs. All other accesses result in an error. + */ + if (error_code & X86_PF_SHSTK) { + if (unlikely(!(vma->vm_flags & VM_SHADOW_STACK))) + return 1; + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return 1; + return 0; + } + if (error_code & X86_PF_WRITE) { /* write, present and write, not present: */ + if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) + return 1; if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; return 0; @@ -1305,6 +1319,14 @@ void do_user_addr_fault(struct pt_regs *regs, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + /* + * Read-only permissions can not be expressed in shadow stack PTEs. + * Treat all shadow stack accesses as WRITE faults. This ensures + * that the MM will prepare everything (e.g., break COW) such that + * maybe_mkwrite() can create a proper shadow stack PTE. + */ + if (error_code & X86_PF_SHSTK) + flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_INSTR) @@ -1328,7 +1350,6 @@ void do_user_addr_fault(struct pt_regs *regs, } #endif -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -1341,7 +1362,8 @@ void do_user_addr_fault(struct pt_regs *regs, goto lock_mmap; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); @@ -1358,7 +1380,6 @@ void do_user_addr_fault(struct pt_regs *regs, return; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ retry: vma = lock_mm_and_find_vma(mm, address, regs); @@ -1418,9 +1439,7 @@ retry: } mmap_read_unlock(mm); -#ifdef CONFIG_PER_VMA_LOCK done: -#endif if (likely(!(fault & VM_FAULT_ERROR))) return; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 8192452d1d2d..679893ea5e68 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -20,7 +20,6 @@ #include <asm/tlb.h> #include <asm/proto.h> #include <asm/dma.h> /* for MAX_DMA_PFN */ -#include <asm/microcode.h> #include <asm/kaslr.h> #include <asm/hypervisor.h> #include <asm/cpufeature.h> @@ -273,7 +272,7 @@ static void __init probe_page_size_mask(void) static const struct x86_cpu_id invlpg_miss_ids[] = { INTEL_MATCH(INTEL_FAM6_ALDERLAKE ), INTEL_MATCH(INTEL_FAM6_ALDERLAKE_L ), - INTEL_MATCH(INTEL_FAM6_ALDERLAKE_N ), + INTEL_MATCH(INTEL_FAM6_ATOM_GRACEMONT ), INTEL_MATCH(INTEL_FAM6_RAPTORLAKE ), INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_P), INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_S), @@ -307,15 +306,6 @@ static void setup_pcid(void) * start_secondary(). */ cr4_set_bits(X86_CR4_PCIDE); - - /* - * INVPCID's single-context modes (2/3) only work if we set - * X86_CR4_PCIDE, *and* we INVPCID support. It's unusable - * on systems that have X86_CR4_PCIDE clear, or that have - * no INVPCID support at all. - */ - if (boot_cpu_has(X86_FEATURE_INVPCID)) - setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE); } else { /* * flush_tlb_all(), as currently implemented, won't work if diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index 54bbd5163e8d..6faea41e99b6 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -288,11 +288,10 @@ static bool amd_enc_cache_flush_required(void) return !cpu_feature_enabled(X86_FEATURE_SME_COHERENT); } -static void enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) +static void enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool enc) { #ifdef CONFIG_PARAVIRT - unsigned long sz = npages << PAGE_SHIFT; - unsigned long vaddr_end = vaddr + sz; + unsigned long vaddr_end = vaddr + size; while (vaddr < vaddr_end) { int psize, pmask, level; @@ -342,7 +341,7 @@ static bool amd_enc_status_change_finish(unsigned long vaddr, int npages, bool e snp_set_memory_private(vaddr, npages); if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) - enc_dec_hypercall(vaddr, npages, enc); + enc_dec_hypercall(vaddr, npages << PAGE_SHIFT, enc); return true; } @@ -466,7 +465,7 @@ static int __init early_set_memory_enc_dec(unsigned long vaddr, ret = 0; - early_set_mem_enc_dec_hypercall(start, PAGE_ALIGN(size) >> PAGE_SHIFT, enc); + early_set_mem_enc_dec_hypercall(start, size, enc); out: __flush_tlb_all(); return ret; @@ -482,9 +481,9 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) return early_set_memory_enc_dec(vaddr, size, true); } -void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) +void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool enc) { - enc_dec_hypercall(vaddr, npages, enc); + enc_dec_hypercall(vaddr, size, enc); } void __init sme_early_init(void) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index df4182b6449f..bda9f129835e 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2074,12 +2074,12 @@ int set_memory_nx(unsigned long addr, int numpages) int set_memory_ro(unsigned long addr, int numpages) { - return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW | _PAGE_DIRTY), 0); } int set_memory_rox(unsigned long addr, int numpages) { - pgprot_t clr = __pgprot(_PAGE_RW); + pgprot_t clr = __pgprot(_PAGE_RW | _PAGE_DIRTY); if (__supported_pte_mask & _PAGE_NX) clr.pgprot |= _PAGE_NX; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 15a8009a4480..9deadf517f14 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -52,7 +52,7 @@ early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { - pgtable_pte_page_dtor(pte); + pagetable_pte_dtor(page_ptdesc(pte)); paravirt_release_pte(page_to_pfn(pte)); paravirt_tlb_remove_table(tlb, pte); } @@ -60,7 +60,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) #if CONFIG_PGTABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { - struct page *page = virt_to_page(pmd); + struct ptdesc *ptdesc = virt_to_ptdesc(pmd); paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); /* * NOTE! For PAE, any changes to the top page-directory-pointer-table @@ -69,8 +69,8 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif - pgtable_pmd_page_dtor(page); - paravirt_tlb_remove_table(tlb, page); + pagetable_pmd_dtor(ptdesc); + paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc)); } #if CONFIG_PGTABLE_LEVELS > 3 @@ -92,16 +92,16 @@ void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) static inline void pgd_list_add(pgd_t *pgd) { - struct page *page = virt_to_page(pgd); + struct ptdesc *ptdesc = virt_to_ptdesc(pgd); - list_add(&page->lru, &pgd_list); + list_add(&ptdesc->pt_list, &pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { - struct page *page = virt_to_page(pgd); + struct ptdesc *ptdesc = virt_to_ptdesc(pgd); - list_del(&page->lru); + list_del(&ptdesc->pt_list); } #define UNSHARED_PTRS_PER_PGD \ @@ -112,12 +112,12 @@ static inline void pgd_list_del(pgd_t *pgd) static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) { - virt_to_page(pgd)->pt_mm = mm; + virt_to_ptdesc(pgd)->pt_mm = mm; } struct mm_struct *pgd_page_get_mm(struct page *page) { - return page->pt_mm; + return page_ptdesc(page)->pt_mm; } static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) @@ -213,11 +213,14 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; + struct ptdesc *ptdesc; for (i = 0; i < count; i++) if (pmds[i]) { - pgtable_pmd_page_dtor(virt_to_page(pmds[i])); - free_page((unsigned long)pmds[i]); + ptdesc = virt_to_ptdesc(pmds[i]); + + pagetable_pmd_dtor(ptdesc); + pagetable_free(ptdesc); mm_dec_nr_pmds(mm); } } @@ -230,18 +233,24 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; + gfp &= ~__GFP_HIGHMEM; for (i = 0; i < count; i++) { - pmd_t *pmd = (pmd_t *)__get_free_page(gfp); - if (!pmd) + pmd_t *pmd = NULL; + struct ptdesc *ptdesc = pagetable_alloc(gfp, 0); + + if (!ptdesc) failed = true; - if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) { - free_page((unsigned long)pmd); - pmd = NULL; + if (ptdesc && !pagetable_pmd_ctor(ptdesc)) { + pagetable_free(ptdesc); + ptdesc = NULL; failed = true; } - if (pmd) + if (ptdesc) { mm_inc_nr_pmds(mm); + pmd = ptdesc_address(ptdesc); + } + pmds[i] = pmd; } @@ -830,7 +839,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) free_page((unsigned long)pmd_sv); - pgtable_pmd_page_dtor(virt_to_page(pmd)); + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); free_page((unsigned long)pmd); return 1; @@ -872,3 +881,43 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) #endif /* CONFIG_X86_64 */ #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ + +pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHADOW_STACK) + return pte_mkwrite_shstk(pte); + + pte = pte_mkwrite_novma(pte); + + return pte_clear_saveddirty(pte); +} + +pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHADOW_STACK) + return pmd_mkwrite_shstk(pmd); + + pmd = pmd_mkwrite_novma(pmd); + + return pmd_clear_saveddirty(pmd); +} + +void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) +{ + /* + * Hardware before shadow stack can (rarely) set Dirty=1 + * on a Write=0 PTE. So the below condition + * only indicates a software bug when shadow stack is + * supported by the HW. This checking is covered in + * pte_shstk(). + */ + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && + pte_shstk(pte)); +} + +void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) +{ + /* See note in arch_check_zapped_pte() */ + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && + pmd_shstk(pmd)); +} diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index dac07e4f5834..9c52a95937ad 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -40,9 +40,8 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) return; pxm = pa->proximity_domain; apic_id = pa->apic_id; - if (!apic->apic_id_valid(apic_id)) { - printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n", - pxm, apic_id); + if (!apic_id_valid(apic_id)) { + pr_info("SRAT: PXM %u -> X2APIC 0x%04x ignored\n", pxm, apic_id); return; } node = acpi_map_pxm_to_node(pxm); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 267acf27480a..453ea95b667d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -10,6 +10,7 @@ #include <linux/debugfs.h> #include <linux/sched/smt.h> #include <linux/task_work.h> +#include <linux/mmu_notifier.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -1036,6 +1037,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, put_flush_tlb_info(); put_cpu(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } @@ -1140,21 +1142,28 @@ void flush_tlb_one_kernel(unsigned long addr) */ STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr) { - u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + u32 loaded_mm_asid; + bool cpu_pcide; + /* Flush 'addr' from the kernel PCID: */ asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + /* If PTI is off there is no user PCID and nothing to flush. */ if (!static_cpu_has(X86_FEATURE_PTI)) return; + loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + cpu_pcide = this_cpu_read(cpu_tlbstate.cr4) & X86_CR4_PCIDE; + /* - * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1. - * Just use invalidate_user_asid() in case we are called early. + * invpcid_flush_one(pcid>0) will #GP if CR4.PCIDE==0. Check + * 'cpu_pcide' to ensure that *this* CPU will not trigger those + * #GP's even if called before CR4.PCIDE has been initialized. */ - if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) - invalidate_user_asid(loaded_mm_asid); - else + if (boot_cpu_has(X86_FEATURE_INVPCID) && cpu_pcide) invpcid_flush_one(user_pcid(loaded_mm_asid), addr); + else + invalidate_user_asid(loaded_mm_asid); } void flush_tlb_one_user(unsigned long addr) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 438adb695daa..a5930042139d 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -701,6 +701,38 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) *pprog = prog; } +static void emit_movsx_reg(u8 **pprog, int num_bits, bool is64, u32 dst_reg, + u32 src_reg) +{ + u8 *prog = *pprog; + + if (is64) { + /* movs[b,w,l]q dst, src */ + if (num_bits == 8) + EMIT4(add_2mod(0x48, src_reg, dst_reg), 0x0f, 0xbe, + add_2reg(0xC0, src_reg, dst_reg)); + else if (num_bits == 16) + EMIT4(add_2mod(0x48, src_reg, dst_reg), 0x0f, 0xbf, + add_2reg(0xC0, src_reg, dst_reg)); + else if (num_bits == 32) + EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x63, + add_2reg(0xC0, src_reg, dst_reg)); + } else { + /* movs[b,w]l dst, src */ + if (num_bits == 8) { + EMIT4(add_2mod(0x40, src_reg, dst_reg), 0x0f, 0xbe, + add_2reg(0xC0, src_reg, dst_reg)); + } else if (num_bits == 16) { + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT1(add_2mod(0x40, src_reg, dst_reg)); + EMIT3(add_2mod(0x0f, src_reg, dst_reg), 0xbf, + add_2reg(0xC0, src_reg, dst_reg)); + } + } + + *pprog = prog; +} + /* Emit the suffix (ModR/M etc) for addressing *(ptr_reg + off) and val_reg */ static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off) { @@ -779,6 +811,29 @@ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) *pprog = prog; } +/* LDSX: dst_reg = *(s8*)(src_reg + off) */ +static void emit_ldsx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) +{ + u8 *prog = *pprog; + + switch (size) { + case BPF_B: + /* Emit 'movsx rax, byte ptr [rax + off]' */ + EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xBE); + break; + case BPF_H: + /* Emit 'movsx rax, word ptr [rax + off]' */ + EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xBF); + break; + case BPF_W: + /* Emit 'movsx rax, dword ptr [rax+0x14]' */ + EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x63); + break; + } + emit_insn_suffix(&prog, src_reg, dst_reg, off); + *pprog = prog; +} + /* STX: *(u8*)(dst_reg + off) = src_reg */ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { @@ -1028,9 +1083,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image case BPF_ALU64 | BPF_MOV | BPF_X: case BPF_ALU | BPF_MOV | BPF_X: - emit_mov_reg(&prog, - BPF_CLASS(insn->code) == BPF_ALU64, - dst_reg, src_reg); + if (insn->off == 0) + emit_mov_reg(&prog, + BPF_CLASS(insn->code) == BPF_ALU64, + dst_reg, src_reg); + else + emit_movsx_reg(&prog, insn->off, + BPF_CLASS(insn->code) == BPF_ALU64, + dst_reg, src_reg); break; /* neg dst */ @@ -1134,15 +1194,26 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image /* mov rax, dst_reg */ emit_mov_reg(&prog, is64, BPF_REG_0, dst_reg); - /* - * xor edx, edx - * equivalent to 'xor rdx, rdx', but one byte less - */ - EMIT2(0x31, 0xd2); + if (insn->off == 0) { + /* + * xor edx, edx + * equivalent to 'xor rdx, rdx', but one byte less + */ + EMIT2(0x31, 0xd2); - /* div src_reg */ - maybe_emit_1mod(&prog, src_reg, is64); - EMIT2(0xF7, add_1reg(0xF0, src_reg)); + /* div src_reg */ + maybe_emit_1mod(&prog, src_reg, is64); + EMIT2(0xF7, add_1reg(0xF0, src_reg)); + } else { + if (BPF_CLASS(insn->code) == BPF_ALU) + EMIT1(0x99); /* cdq */ + else + EMIT2(0x48, 0x99); /* cqo */ + + /* idiv src_reg */ + maybe_emit_1mod(&prog, src_reg, is64); + EMIT2(0xF7, add_1reg(0xF8, src_reg)); + } if (BPF_OP(insn->code) == BPF_MOD && dst_reg != BPF_REG_3) @@ -1262,6 +1333,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image break; case BPF_ALU | BPF_END | BPF_FROM_BE: + case BPF_ALU64 | BPF_END | BPF_FROM_LE: switch (imm32) { case 16: /* Emit 'ror %ax, 8' to swap lower 2 bytes */ @@ -1370,9 +1442,17 @@ st: if (is_imm8(insn->off)) case BPF_LDX | BPF_PROBE_MEM | BPF_W: case BPF_LDX | BPF_MEM | BPF_DW: case BPF_LDX | BPF_PROBE_MEM | BPF_DW: + /* LDXS: dst_reg = *(s8*)(src_reg + off) */ + case BPF_LDX | BPF_MEMSX | BPF_B: + case BPF_LDX | BPF_MEMSX | BPF_H: + case BPF_LDX | BPF_MEMSX | BPF_W: + case BPF_LDX | BPF_PROBE_MEMSX | BPF_B: + case BPF_LDX | BPF_PROBE_MEMSX | BPF_H: + case BPF_LDX | BPF_PROBE_MEMSX | BPF_W: insn_off = insn->off; - if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { + if (BPF_MODE(insn->code) == BPF_PROBE_MEM || + BPF_MODE(insn->code) == BPF_PROBE_MEMSX) { /* Conservatively check that src_reg + insn->off is a kernel address: * src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE * src_reg is used as scratch for src_reg += insn->off and restored @@ -1415,8 +1495,13 @@ st: if (is_imm8(insn->off)) start_of_ldx = prog; end_of_jmp[-1] = start_of_ldx - end_of_jmp; } - emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off); - if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { + if (BPF_MODE(insn->code) == BPF_PROBE_MEMSX || + BPF_MODE(insn->code) == BPF_MEMSX) + emit_ldsx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off); + else + emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off); + if (BPF_MODE(insn->code) == BPF_PROBE_MEM || + BPF_MODE(insn->code) == BPF_PROBE_MEMSX) { struct exception_table_entry *ex; u8 *_insn = image + proglen + (start_of_ldx - temp); s64 delta; @@ -1730,16 +1815,24 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ break; case BPF_JMP | BPF_JA: - if (insn->off == -1) - /* -1 jmp instructions will always jump - * backwards two bytes. Explicitly handling - * this case avoids wasting too many passes - * when there are long sequences of replaced - * dead code. - */ - jmp_offset = -2; - else - jmp_offset = addrs[i + insn->off] - addrs[i]; + case BPF_JMP32 | BPF_JA: + if (BPF_CLASS(insn->code) == BPF_JMP) { + if (insn->off == -1) + /* -1 jmp instructions will always jump + * backwards two bytes. Explicitly handling + * this case avoids wasting too many passes + * when there are long sequences of replaced + * dead code. + */ + jmp_offset = -2; + else + jmp_offset = addrs[i + insn->off] - addrs[i]; + } else { + if (insn->imm == -1) + jmp_offset = -2; + else + jmp_offset = addrs[i + insn->imm] - addrs[i]; + } if (!jmp_offset) { /* @@ -1857,59 +1950,177 @@ emit_jmp: return proglen; } -static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_regs, - int stack_size) +static void clean_stack_garbage(const struct btf_func_model *m, + u8 **pprog, int nr_stack_slots, + int stack_size) +{ + int arg_size, off; + u8 *prog; + + /* Generally speaking, the compiler will pass the arguments + * on-stack with "push" instruction, which will take 8-byte + * on the stack. In this case, there won't be garbage values + * while we copy the arguments from origin stack frame to current + * in BPF_DW. + * + * However, sometimes the compiler will only allocate 4-byte on + * the stack for the arguments. For now, this case will only + * happen if there is only one argument on-stack and its size + * not more than 4 byte. In this case, there will be garbage + * values on the upper 4-byte where we store the argument on + * current stack frame. + * + * arguments on origin stack: + * + * stack_arg_1(4-byte) xxx(4-byte) + * + * what we copy: + * + * stack_arg_1(8-byte): stack_arg_1(origin) xxx + * + * and the xxx is the garbage values which we should clean here. + */ + if (nr_stack_slots != 1) + return; + + /* the size of the last argument */ + arg_size = m->arg_size[m->nr_args - 1]; + if (arg_size <= 4) { + off = -(stack_size - 4); + prog = *pprog; + /* mov DWORD PTR [rbp + off], 0 */ + if (!is_imm8(off)) + EMIT2_off32(0xC7, 0x85, off); + else + EMIT3(0xC7, 0x45, off); + EMIT(0, 4); + *pprog = prog; + } +} + +/* get the count of the regs that are used to pass arguments */ +static int get_nr_used_regs(const struct btf_func_model *m) { - int i, j, arg_size; - bool next_same_struct = false; + int i, arg_regs, nr_used_regs = 0; + + for (i = 0; i < min_t(int, m->nr_args, MAX_BPF_FUNC_ARGS); i++) { + arg_regs = (m->arg_size[i] + 7) / 8; + if (nr_used_regs + arg_regs <= 6) + nr_used_regs += arg_regs; + + if (nr_used_regs >= 6) + break; + } + + return nr_used_regs; +} + +static void save_args(const struct btf_func_model *m, u8 **prog, + int stack_size, bool for_call_origin) +{ + int arg_regs, first_off = 0, nr_regs = 0, nr_stack_slots = 0; + int i, j; /* Store function arguments to stack. * For a function that accepts two pointers the sequence will be: * mov QWORD PTR [rbp-0x10],rdi * mov QWORD PTR [rbp-0x8],rsi */ - for (i = 0, j = 0; i < min(nr_regs, 6); i++) { - /* The arg_size is at most 16 bytes, enforced by the verifier. */ - arg_size = m->arg_size[j]; - if (arg_size > 8) { - arg_size = 8; - next_same_struct = !next_same_struct; - } + for (i = 0; i < min_t(int, m->nr_args, MAX_BPF_FUNC_ARGS); i++) { + arg_regs = (m->arg_size[i] + 7) / 8; - emit_stx(prog, bytes_to_bpf_size(arg_size), - BPF_REG_FP, - i == 5 ? X86_REG_R9 : BPF_REG_1 + i, - -(stack_size - i * 8)); + /* According to the research of Yonghong, struct members + * should be all in register or all on the stack. + * Meanwhile, the compiler will pass the argument on regs + * if the remaining regs can hold the argument. + * + * Disorder of the args can happen. For example: + * + * struct foo_struct { + * long a; + * int b; + * }; + * int foo(char, char, char, char, char, struct foo_struct, + * char); + * + * the arg1-5,arg7 will be passed by regs, and arg6 will + * by stack. + */ + if (nr_regs + arg_regs > 6) { + /* copy function arguments from origin stack frame + * into current stack frame. + * + * The starting address of the arguments on-stack + * is: + * rbp + 8(push rbp) + + * 8(return addr of origin call) + + * 8(return addr of the caller) + * which means: rbp + 24 + */ + for (j = 0; j < arg_regs; j++) { + emit_ldx(prog, BPF_DW, BPF_REG_0, BPF_REG_FP, + nr_stack_slots * 8 + 0x18); + emit_stx(prog, BPF_DW, BPF_REG_FP, BPF_REG_0, + -stack_size); + + if (!nr_stack_slots) + first_off = stack_size; + stack_size -= 8; + nr_stack_slots++; + } + } else { + /* Only copy the arguments on-stack to current + * 'stack_size' and ignore the regs, used to + * prepare the arguments on-stack for orign call. + */ + if (for_call_origin) { + nr_regs += arg_regs; + continue; + } - j = next_same_struct ? j : j + 1; + /* copy the arguments from regs into stack */ + for (j = 0; j < arg_regs; j++) { + emit_stx(prog, BPF_DW, BPF_REG_FP, + nr_regs == 5 ? X86_REG_R9 : BPF_REG_1 + nr_regs, + -stack_size); + stack_size -= 8; + nr_regs++; + } + } } + + clean_stack_garbage(m, prog, nr_stack_slots, first_off); } -static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_regs, +static void restore_regs(const struct btf_func_model *m, u8 **prog, int stack_size) { - int i, j, arg_size; - bool next_same_struct = false; + int i, j, arg_regs, nr_regs = 0; /* Restore function arguments from stack. * For a function that accepts two pointers the sequence will be: * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10] * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8] + * + * The logic here is similar to what we do in save_args() */ - for (i = 0, j = 0; i < min(nr_regs, 6); i++) { - /* The arg_size is at most 16 bytes, enforced by the verifier. */ - arg_size = m->arg_size[j]; - if (arg_size > 8) { - arg_size = 8; - next_same_struct = !next_same_struct; + for (i = 0; i < min_t(int, m->nr_args, MAX_BPF_FUNC_ARGS); i++) { + arg_regs = (m->arg_size[i] + 7) / 8; + if (nr_regs + arg_regs <= 6) { + for (j = 0; j < arg_regs; j++) { + emit_ldx(prog, BPF_DW, + nr_regs == 5 ? X86_REG_R9 : BPF_REG_1 + nr_regs, + BPF_REG_FP, + -stack_size); + stack_size -= 8; + nr_regs++; + } + } else { + stack_size -= 8 * arg_regs; } - emit_ldx(prog, bytes_to_bpf_size(arg_size), - i == 5 ? X86_REG_R9 : BPF_REG_1 + i, - BPF_REG_FP, - -(stack_size - i * 8)); - - j = next_same_struct ? j : j + 1; + if (nr_regs >= 6) + break; } } @@ -1938,7 +2149,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); /* arg2: lea rsi, [rbp - ctx_cookie_off] */ - EMIT4(0x48, 0x8D, 0x75, -run_ctx_off); + if (!is_imm8(-run_ctx_off)) + EMIT3_off32(0x48, 0x8D, 0xB5, -run_ctx_off); + else + EMIT4(0x48, 0x8D, 0x75, -run_ctx_off); if (emit_rsb_call(&prog, bpf_trampoline_enter(p), prog)) return -EINVAL; @@ -1954,7 +2168,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, emit_nops(&prog, 2); /* arg1: lea rdi, [rbp - stack_size] */ - EMIT4(0x48, 0x8D, 0x7D, -stack_size); + if (!is_imm8(-stack_size)) + EMIT3_off32(0x48, 0x8D, 0xBD, -stack_size); + else + EMIT4(0x48, 0x8D, 0x7D, -stack_size); /* arg2: progs[i]->insnsi for interpreter */ if (!p->jited) emit_mov_imm64(&prog, BPF_REG_2, @@ -1984,7 +2201,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, /* arg2: mov rsi, rbx <- start time in nsec */ emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); /* arg3: lea rdx, [rbp - run_ctx_off] */ - EMIT4(0x48, 0x8D, 0x55, -run_ctx_off); + if (!is_imm8(-run_ctx_off)) + EMIT3_off32(0x48, 0x8D, 0x95, -run_ctx_off); + else + EMIT4(0x48, 0x8D, 0x55, -run_ctx_off); if (emit_rsb_call(&prog, bpf_trampoline_exit(p), prog)) return -EINVAL; @@ -2136,7 +2356,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i void *func_addr) { int i, ret, nr_regs = m->nr_args, stack_size = 0; - int regs_off, nregs_off, ip_off, run_ctx_off; + int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off; struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; @@ -2150,8 +2370,10 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) nr_regs += (m->arg_size[i] + 7) / 8 - 1; - /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ - if (nr_regs > 6) + /* x86-64 supports up to MAX_BPF_FUNC_ARGS arguments. 1-6 + * are passed through regs, the remains are through stack. + */ + if (nr_regs > MAX_BPF_FUNC_ARGS) return -ENOTSUPP; /* Generated trampoline stack layout: @@ -2170,7 +2392,14 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i * * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag * + * RBP - rbx_off [ rbx value ] always + * * RBP - run_ctx_off [ bpf_tramp_run_ctx ] + * + * [ stack_argN ] BPF_TRAMP_F_CALL_ORIG + * [ ... ] + * [ stack_arg2 ] + * RBP - arg_stack_off [ stack_arg1 ] */ /* room for return value of orig_call or fentry prog */ @@ -2190,9 +2419,26 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i ip_off = stack_size; + stack_size += 8; + rbx_off = stack_size; + stack_size += (sizeof(struct bpf_tramp_run_ctx) + 7) & ~0x7; run_ctx_off = stack_size; + if (nr_regs > 6 && (flags & BPF_TRAMP_F_CALL_ORIG)) { + /* the space that used to pass arguments on-stack */ + stack_size += (nr_regs - get_nr_used_regs(m)) * 8; + /* make sure the stack pointer is 16-byte aligned if we + * need pass arguments on stack, which means + * [stack_size + 8(rbp) + 8(rip) + 8(origin rip)] + * should be 16-byte aligned. Following code depend on + * that stack_size is already 8-byte aligned. + */ + stack_size += (stack_size % 16) ? 0 : 8; + } + + arg_stack_off = stack_size; + if (flags & BPF_TRAMP_F_SKIP_FRAME) { /* skip patched call instruction and point orig_call to actual * body of the kernel function. @@ -2212,8 +2458,14 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i x86_call_depth_emit_accounting(&prog, NULL); EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ - EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */ - EMIT1(0x53); /* push rbx */ + if (!is_imm8(stack_size)) + /* sub rsp, stack_size */ + EMIT3_off32(0x48, 0x81, 0xEC, stack_size); + else + /* sub rsp, stack_size */ + EMIT4(0x48, 0x83, 0xEC, stack_size); + /* mov QWORD PTR [rbp - rbx_off], rbx */ + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_6, -rbx_off); /* Store number of argument registers of the traced function: * mov rax, nr_regs @@ -2231,7 +2483,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off); } - save_regs(m, &prog, nr_regs, regs_off); + save_args(m, &prog, regs_off, false); if (flags & BPF_TRAMP_F_CALL_ORIG) { /* arg1: mov rdi, im */ @@ -2261,7 +2513,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (flags & BPF_TRAMP_F_CALL_ORIG) { - restore_regs(m, &prog, nr_regs, regs_off); + restore_regs(m, &prog, regs_off); + save_args(m, &prog, arg_stack_off, true); if (flags & BPF_TRAMP_F_ORIG_STACK) { emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8); @@ -2302,7 +2555,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (flags & BPF_TRAMP_F_RESTORE_REGS) - restore_regs(m, &prog, nr_regs, regs_off); + restore_regs(m, &prog, regs_off); /* This needs to be done regardless. If there were fmod_ret programs, * the return value is only updated on the stack and still needs to be @@ -2321,7 +2574,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i if (save_ret) emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); - EMIT1(0x5B); /* pop rbx */ + emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, -rbx_off); EMIT1(0xC9); /* leave */ if (flags & BPF_TRAMP_F_SKIP_FRAME) /* skip our return address and return to parent */ diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index dd40d3fea74e..631512f7ec85 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -51,6 +51,14 @@ static struct pci_root_info __init *find_pci_root_info(int node, int link) return NULL; } +static inline resource_size_t cap_resource(u64 val) +{ + if (val > RESOURCE_SIZE_MAX) + return RESOURCE_SIZE_MAX; + + return val; +} + /** * early_root_info_init() * called before pcibios_scan_root and pci_scan_bus diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index 2752c02e3f0e..e4a525e59eaf 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -101,7 +101,7 @@ void update_res(struct pci_root_info *info, resource_size_t start, if (start > end) return; - if (start == MAX_RESOURCE) + if (start == RESOURCE_SIZE_MAX) return; if (!merge) diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index a498b847d740..0de436316a1d 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -136,14 +136,14 @@ static inline struct irq_routing_table *pirq_convert_irt_table(u8 *addr, if (ir->signature != IRT_SIGNATURE || !ir->used || ir->size < ir->used) return NULL; - size = sizeof(*ir) + ir->used * sizeof(ir->slots[0]); + size = struct_size(ir, slots, ir->used); if (size > limit - addr) return NULL; DBG(KERN_DEBUG "PCI: $IRT Interrupt Routing Table found at 0x%lx\n", __pa(ir)); - size = sizeof(*rt) + ir->used * sizeof(rt->slots[0]); + size = struct_size(rt, slots, ir->used); rt = kzalloc(size, GFP_KERNEL); if (!rt) return NULL; diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 014c508e914d..652cd53e77f6 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -517,7 +517,7 @@ int __init pci_xen_init(void) #ifdef CONFIG_PCI_MSI static void __init xen_hvm_msi_init(void) { - if (!disable_apic) { + if (!apic_is_disabled) { /* * If hardware supports (x2)APIC virtualization (as indicated * by hypervisor's leaf 4) then we don't need to use pirqs/ diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index e06a199423c0..b2cc7b4552a1 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -140,3 +140,15 @@ void __init efi_runtime_update_mappings(void) } } } + +void arch_efi_call_virt_setup(void) +{ + efi_fpu_begin(); + firmware_restrict_branch_speculation_start(); +} + +void arch_efi_call_virt_teardown(void) +{ + firmware_restrict_branch_speculation_end(); + efi_fpu_end(); +} diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 77f7ac3668cb..91d31ac422d6 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -474,19 +474,34 @@ void __init efi_dump_pagetable(void) * can not change under us. * It should be ensured that there are no concurrent calls to this function. */ -void efi_enter_mm(void) +static void efi_enter_mm(void) { efi_prev_mm = current->active_mm; current->active_mm = &efi_mm; switch_mm(efi_prev_mm, &efi_mm, NULL); } -void efi_leave_mm(void) +static void efi_leave_mm(void) { current->active_mm = efi_prev_mm; switch_mm(&efi_mm, efi_prev_mm, NULL); } +void arch_efi_call_virt_setup(void) +{ + efi_sync_low_kernel_mappings(); + efi_fpu_begin(); + firmware_restrict_branch_speculation_start(); + efi_enter_mm(); +} + +void arch_efi_call_virt_teardown(void) +{ + efi_leave_mm(); + firmware_restrict_branch_speculation_end(); + efi_fpu_end(); +} + static DEFINE_SPINLOCK(efi_runtime_lock); /* diff --git a/arch/x86/platform/efi/memmap.c b/arch/x86/platform/efi/memmap.c index c69f8471e6d0..4ef20b49eb5e 100644 --- a/arch/x86/platform/efi/memmap.c +++ b/arch/x86/platform/efi/memmap.c @@ -82,7 +82,7 @@ int __init efi_memmap_alloc(unsigned int num_entries, /** * efi_memmap_install - Install a new EFI memory map in efi.memmap - * @ctx: map allocation parameters (address, size, flags) + * @data: efi memmap installation parameters * * Unlike efi_memmap_init_*(), this function does not allow the caller * to switch from early to late mappings. It simply uses the existing diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index ee21d6a36a80..4221259a5870 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -58,7 +58,7 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, ret = parent->chip->irq_set_affinity(parent, mask, force); if (ret >= 0) { uv_program_mmr(cfg, data->chip_data); - send_cleanup_vector(cfg); + vector_schedule_cleanup(cfg); } return ret; diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index a60af0230e27..45d0c17ce77c 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -202,21 +202,17 @@ static int param_set_action(const char *val, const struct kernel_param *kp) { int i; int n = ARRAY_SIZE(valid_acts); - char arg[ACTION_LEN], *p; + char arg[ACTION_LEN]; /* (remove possible '\n') */ - strncpy(arg, val, ACTION_LEN - 1); - arg[ACTION_LEN - 1] = '\0'; - p = strchr(arg, '\n'); - if (p) - *p = '\0'; + strscpy(arg, val, strnchrnul(val, sizeof(arg)-1, '\n') - val + 1); for (i = 0; i < n; i++) if (!strcmp(arg, valid_acts[i].action)) break; if (i < n) { - strcpy(uv_nmi_action, arg); + strscpy(uv_nmi_action, arg, sizeof(uv_nmi_action)); pr_info("UV: New NMI action:%s\n", uv_nmi_action); return 0; } @@ -601,7 +597,7 @@ static void uv_nmi_nr_cpus_ping(void) for_each_cpu(cpu, uv_nmi_cpu_mask) uv_cpu_nmi_per(cpu).pinging = 1; - apic->send_IPI_mask(uv_nmi_cpu_mask, APIC_DM_NMI); + __apic_send_IPI_mask(uv_nmi_cpu_mask, APIC_DM_NMI); } /* Clean up flags for CPU's that ignored both NMI and ping */ @@ -959,7 +955,7 @@ static int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) /* Unexpected return, revert action to "dump" */ if (master) - strncpy(uv_nmi_action, "dump", strlen(uv_nmi_action)); + strscpy(uv_nmi_action, "dump", sizeof(uv_nmi_action)); } /* Pause as all CPU's enter the NMI handler */ diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index c2a29be35c01..08aa0f25f12a 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -19,6 +19,10 @@ CFLAGS_sha256.o := -D__DISABLE_EXPORTS -D__NO_FORTIFY # optimization flags. KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%,$(KBUILD_CFLAGS)) +# When LTO is enabled, llvm emits many text sections, which is not supported +# by kexec. Remove -flto=* flags. +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS)) + # When linking purgatory.ro with -r unresolved symbols are not checked, # also link a purgatory.chk binary without -r to check for unresolved symbols. PURGATORY_LDFLAGS := -e purgatory_start -z nodefaultlib diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index 7558139920f8..aea47e793963 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -14,6 +14,7 @@ #include <crypto/sha2.h> #include <asm/purgatory.h> +#include "../boot/compressed/error.h" #include "../boot/string.h" u8 purgatory_sha256_digest[SHA256_DIGEST_SIZE] __section(".kexec-purgatory"); diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index ee89f6bb9242..8bc72a51b257 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -13,15 +13,16 @@ obj-y = bugs_$(BITS).o delay.o fault.o ldt.o \ ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \ stub_$(BITS).o stub_segv.o \ sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \ - mem_$(BITS).o subarch.o os-$(OS)/ + mem_$(BITS).o subarch.o os-Linux/ ifeq ($(CONFIG_X86_32),y) -obj-y += checksum_32.o syscalls_32.o +obj-y += syscalls_32.o obj-$(CONFIG_ELF_CORE) += elfcore.o subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o subarch-y += ../lib/cmpxchg8b_emu.o ../lib/atomic64_386_32.o +subarch-y += ../lib/checksum_32.o subarch-y += ../kernel/sys_ia32.o else diff --git a/arch/x86/um/asm/mm_context.h b/arch/x86/um/asm/mm_context.h index 4a73d63e4760..dc32dc023c2f 100644 --- a/arch/x86/um/asm/mm_context.h +++ b/arch/x86/um/asm/mm_context.h @@ -11,8 +11,6 @@ #include <linux/mutex.h> #include <asm/ldt.h> -extern void ldt_host_info(void); - #define LDT_PAGES_MAX \ ((LDT_ENTRIES * LDT_ENTRY_SIZE)/PAGE_SIZE) #define LDT_ENTRIES_PER_PAGE \ diff --git a/arch/x86/um/checksum_32.S b/arch/x86/um/checksum_32.S deleted file mode 100644 index aed782ab7721..000000000000 --- a/arch/x86/um/checksum_32.S +++ /dev/null @@ -1,214 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * IP/TCP/UDP checksumming routines - * - * Authors: Jorge Cwik, <jorge@laser.satlink.net> - * Arnt Gulbrandsen, <agulbra@nvg.unit.no> - * Tom May, <ftom@netcom.com> - * Pentium Pro/II routines: - * Alexander Kjeldaas <astor@guardian.no> - * Finn Arne Gangstad <finnag@guardian.no> - * Lots of code moved from tcp.c and ip.c; see those files - * for more names. - * - * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception - * handling. - * Andi Kleen, add zeroing on error - * converted to pure assembler - */ - -#include <asm/errno.h> -#include <asm/asm.h> -#include <asm/export.h> - -/* - * computes a partial checksum, e.g. for TCP/UDP fragments - */ - -/* -unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) - */ - -.text -.align 4 -.globl csum_partial - -#ifndef CONFIG_X86_USE_PPRO_CHECKSUM - - /* - * Experiments with Ethernet and SLIP connections show that buff - * is aligned on either a 2-byte or 4-byte boundary. We get at - * least a twofold speedup on 486 and Pentium if it is 4-byte aligned. - * Fortunately, it is easy to convert 2-byte alignment to 4-byte - * alignment for the unrolled loop. - */ -csum_partial: - pushl %esi - pushl %ebx - movl 20(%esp),%eax # Function arg: unsigned int sum - movl 16(%esp),%ecx # Function arg: int len - movl 12(%esp),%esi # Function arg: unsigned char *buff - testl $2, %esi # Check alignment. - jz 2f # Jump if alignment is ok. - subl $2, %ecx # Alignment uses up two bytes. - jae 1f # Jump if we had at least two bytes. - addl $2, %ecx # ecx was < 2. Deal with it. - jmp 4f -1: movw (%esi), %bx - addl $2, %esi - addw %bx, %ax - adcl $0, %eax -2: - movl %ecx, %edx - shrl $5, %ecx - jz 2f - testl %esi, %esi -1: movl (%esi), %ebx - adcl %ebx, %eax - movl 4(%esi), %ebx - adcl %ebx, %eax - movl 8(%esi), %ebx - adcl %ebx, %eax - movl 12(%esi), %ebx - adcl %ebx, %eax - movl 16(%esi), %ebx - adcl %ebx, %eax - movl 20(%esi), %ebx - adcl %ebx, %eax - movl 24(%esi), %ebx - adcl %ebx, %eax - movl 28(%esi), %ebx - adcl %ebx, %eax - lea 32(%esi), %esi - dec %ecx - jne 1b - adcl $0, %eax -2: movl %edx, %ecx - andl $0x1c, %edx - je 4f - shrl $2, %edx # This clears CF -3: adcl (%esi), %eax - lea 4(%esi), %esi - dec %edx - jne 3b - adcl $0, %eax -4: andl $3, %ecx - jz 7f - cmpl $2, %ecx - jb 5f - movw (%esi),%cx - leal 2(%esi),%esi - je 6f - shll $16,%ecx -5: movb (%esi),%cl -6: addl %ecx,%eax - adcl $0, %eax -7: - popl %ebx - popl %esi - RET - -#else - -/* Version for PentiumII/PPro */ - -csum_partial: - pushl %esi - pushl %ebx - movl 20(%esp),%eax # Function arg: unsigned int sum - movl 16(%esp),%ecx # Function arg: int len - movl 12(%esp),%esi # Function arg: const unsigned char *buf - - testl $2, %esi - jnz 30f -10: - movl %ecx, %edx - movl %ecx, %ebx - andl $0x7c, %ebx - shrl $7, %ecx - addl %ebx,%esi - shrl $2, %ebx - negl %ebx - lea 45f(%ebx,%ebx,2), %ebx - testl %esi, %esi - jmp *%ebx - - # Handle 2-byte-aligned regions -20: addw (%esi), %ax - lea 2(%esi), %esi - adcl $0, %eax - jmp 10b - -30: subl $2, %ecx - ja 20b - je 32f - movzbl (%esi),%ebx # csumming 1 byte, 2-aligned - addl %ebx, %eax - adcl $0, %eax - jmp 80f -32: - addw (%esi), %ax # csumming 2 bytes, 2-aligned - adcl $0, %eax - jmp 80f - -40: - addl -128(%esi), %eax - adcl -124(%esi), %eax - adcl -120(%esi), %eax - adcl -116(%esi), %eax - adcl -112(%esi), %eax - adcl -108(%esi), %eax - adcl -104(%esi), %eax - adcl -100(%esi), %eax - adcl -96(%esi), %eax - adcl -92(%esi), %eax - adcl -88(%esi), %eax - adcl -84(%esi), %eax - adcl -80(%esi), %eax - adcl -76(%esi), %eax - adcl -72(%esi), %eax - adcl -68(%esi), %eax - adcl -64(%esi), %eax - adcl -60(%esi), %eax - adcl -56(%esi), %eax - adcl -52(%esi), %eax - adcl -48(%esi), %eax - adcl -44(%esi), %eax - adcl -40(%esi), %eax - adcl -36(%esi), %eax - adcl -32(%esi), %eax - adcl -28(%esi), %eax - adcl -24(%esi), %eax - adcl -20(%esi), %eax - adcl -16(%esi), %eax - adcl -12(%esi), %eax - adcl -8(%esi), %eax - adcl -4(%esi), %eax -45: - lea 128(%esi), %esi - adcl $0, %eax - dec %ecx - jge 40b - movl %edx, %ecx -50: andl $3, %ecx - jz 80f - - # Handle the last 1-3 bytes without jumping - notl %ecx # 1->2, 2->1, 3->0, higher bits are masked - movl $0xffffff,%ebx # by the shll and shrl instructions - shll $3,%ecx - shrl %cl,%ebx - andl -128(%esi),%ebx # esi is 4-aligned so should be ok - addl %ebx,%eax - adcl $0,%eax -80: - popl %ebx - popl %esi - RET - -#endif - EXPORT_SYMBOL(csum_partial) diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 62d34b6611c5..7ad91225fdf4 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -81,6 +81,11 @@ static void xen_apic_write(u32 reg, u32 val) WARN(1,"register: %x, value: %x\n", reg, val); } +static void xen_apic_eoi(void) +{ + WARN_ON_ONCE(1); +} + static u64 xen_apic_icr_read(void) { return 0; @@ -92,11 +97,6 @@ static void xen_apic_icr_write(u32 low, u32 id) WARN_ON(1); } -static u32 xen_safe_apic_wait_icr_idle(void) -{ - return 0; -} - static int xen_apic_probe_pv(void) { if (xen_pv_domain()) @@ -110,29 +110,11 @@ static int xen_madt_oem_check(char *oem_id, char *oem_table_id) return xen_pv_domain(); } -static int xen_id_always_valid(u32 apicid) -{ - return 1; -} - -static int xen_id_always_registered(void) -{ - return 1; -} - static int xen_phys_pkg_id(int initial_apic_id, int index_msb) { return initial_apic_id >> index_msb; } -static void xen_noop(void) -{ -} - -static void xen_silent_inquire(int apicid) -{ -} - static int xen_cpu_present_to_apicid(int cpu) { if (cpu_present(cpu)) @@ -141,68 +123,41 @@ static int xen_cpu_present_to_apicid(int cpu) return BAD_APICID; } -static struct apic xen_pv_apic = { - .name = "Xen PV", - .probe = xen_apic_probe_pv, +static struct apic xen_pv_apic __ro_after_init = { + .name = "Xen PV", + .probe = xen_apic_probe_pv, .acpi_madt_oem_check = xen_madt_oem_check, - .apic_id_valid = xen_id_always_valid, - .apic_id_registered = xen_id_always_registered, /* .delivery_mode and .dest_mode_logical not used by XENPV */ .disable_esr = 0, - .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */ - .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */ - .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */ - .setup_apic_routing = NULL, .cpu_present_to_apicid = xen_cpu_present_to_apicid, - .apicid_to_cpu_present = physid_set_mask_of_physid, /* Used on 32-bit */ - .check_phys_apicid_present = default_check_phys_apicid_present, /* smp_sanity_check needs it */ .phys_pkg_id = xen_phys_pkg_id, /* detect_ht */ - .get_apic_id = xen_get_apic_id, - .set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */ + .max_apic_id = UINT_MAX, + .get_apic_id = xen_get_apic_id, + .set_apic_id = xen_set_apic_id, .calc_dest_apicid = apic_flat_calc_apicid, #ifdef CONFIG_SMP - .send_IPI_mask = xen_send_IPI_mask, - .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself, - .send_IPI_allbutself = xen_send_IPI_allbutself, - .send_IPI_all = xen_send_IPI_all, - .send_IPI_self = xen_send_IPI_self, + .send_IPI_mask = xen_send_IPI_mask, + .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself, + .send_IPI_allbutself = xen_send_IPI_allbutself, + .send_IPI_all = xen_send_IPI_all, + .send_IPI_self = xen_send_IPI_self, #endif - /* .wait_for_init_deassert- used by AP bootup - smp_callin which we don't use */ - .inquire_remote_apic = xen_silent_inquire, - .read = xen_apic_read, .write = xen_apic_write, - .eoi_write = xen_apic_write, + .eoi = xen_apic_eoi, - .icr_read = xen_apic_icr_read, - .icr_write = xen_apic_icr_write, - .wait_icr_idle = xen_noop, - .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle, + .icr_read = xen_apic_icr_read, + .icr_write = xen_apic_icr_write, }; +apic_driver(xen_pv_apic); -static void __init xen_apic_check(void) -{ - if (apic == &xen_pv_apic) - return; - - pr_info("Switched APIC routing from %s to %s.\n", apic->name, - xen_pv_apic.name); - apic = &xen_pv_apic; -} void __init xen_init_apic(void) { x86_apic_ops.io_apic_read = xen_io_apic_read; - /* On PV guests the APIC CPUID bit is disabled so none of the - * routines end up executing. */ - if (!xen_initial_domain()) - apic = &xen_pv_apic; - - x86_platform.apic_post_init = xen_apic_check; } -apic_driver(xen_pv_apic); diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index a6820ca940bf..9a192f51f1b0 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -132,7 +132,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_xen_hvm_callback) struct pt_regs *old_regs = set_irq_regs(regs); if (xen_percpu_upcall) - ack_APIC_irq(); + apic_eoi(); inc_irq_stat(irq_hv_callback_count); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 93b658248d01..49352fad7d1d 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -79,7 +79,7 @@ #ifdef CONFIG_ACPI #include <linux/acpi.h> #include <asm/acpi.h> -#include <acpi/pdc_intel.h> +#include <acpi/proc_cap_intel.h> #include <acpi/processor.h> #include <xen/interface/platform.h> #endif @@ -288,17 +288,17 @@ static bool __init xen_check_mwait(void) native_cpuid(&ax, &bx, &cx, &dx); - /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so, + /* Ask the Hypervisor whether to clear ACPI_PROC_CAP_C_C2C3_FFH. If so, * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3. */ buf[0] = ACPI_PDC_REVISION_ID; buf[1] = 1; - buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP); + buf[2] = (ACPI_PROC_CAP_C_CAPABILITY_SMP | ACPI_PROC_CAP_EST_CAPABILITY_SWSMP); set_xen_guest_handle(op.u.set_pminfo.pdc, buf); if ((HYPERVISOR_platform_op(&op) == 0) && - (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) { + (buf[2] & (ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH))) { cpuid_leaf5_ecx_val = cx; cpuid_leaf5_edx_val = dx; } @@ -523,7 +523,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) BUG_ON(size > PAGE_SIZE); BUG_ON(va & ~PAGE_MASK); - pfn = virt_to_pfn(va); + pfn = virt_to_pfn((void *)va); mfn = pfn_to_mfn(pfn); pte = pfn_pte(pfn, PAGE_KERNEL_RO); @@ -694,7 +694,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_coprocessor_error, false ), TRAP_ENTRY(exc_alignment_check, false ), TRAP_ENTRY(exc_simd_coprocessor_error, false ), -#ifdef CONFIG_X86_KERNEL_IBT +#ifdef CONFIG_X86_CET TRAP_ENTRY(exc_control_protection, false ), #endif }; @@ -1326,7 +1326,7 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) x86_init.resources.memory_setup = xen_memory_setup; x86_init.irqs.intr_mode_select = x86_init_noop; - x86_init.irqs.intr_mode_init = x86_init_noop; + x86_init.irqs.intr_mode_init = x86_64_probe_apic; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; x86_init.hyper.init_platform = xen_pv_init_platform; @@ -1366,12 +1366,10 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) xen_init_capabilities(); -#ifdef CONFIG_X86_LOCAL_APIC /* * set up the basic apic ops. */ xen_init_apic(); -#endif machine_ops = xen_machine_ops; diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index e0a975165de7..1652c39e3dfb 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -166,7 +166,7 @@ void make_lowmem_page_readwrite(void *vaddr) if (pte == NULL) return; /* vaddr missing */ - ptev = pte_mkwrite(*pte); + ptev = pte_mkwrite_novma(*pte); if (HYPERVISOR_update_va_mapping(address, ptev, 0)) BUG(); @@ -667,7 +667,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) spinlock_t *ptl = NULL; #if USE_SPLIT_PTE_PTLOCKS - ptl = ptlock_ptr(page); + ptl = ptlock_ptr(page_ptdesc(page)); spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif @@ -2202,13 +2202,13 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, mcs = __xen_mc_entry(0); if (in_frames) - in_frames[i] = virt_to_mfn(vaddr); + in_frames[i] = virt_to_mfn((void *)vaddr); MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); - __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); + __set_phys_to_machine(virt_to_pfn((void *)vaddr), INVALID_P2M_ENTRY); if (out_frames) - out_frames[i] = virt_to_pfn(vaddr); + out_frames[i] = virt_to_pfn((void *)vaddr); } xen_mc_issue(0); } @@ -2250,7 +2250,7 @@ static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, MULTI_update_va_mapping(mcs.mc, vaddr, mfn_pte(mfn, PAGE_KERNEL), flags); - set_phys_to_machine(virt_to_pfn(vaddr), mfn); + set_phys_to_machine(virt_to_pfn((void *)vaddr), mfn); } xen_mc_issue(0); @@ -2310,12 +2310,6 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, int success; unsigned long vstart = (unsigned long)phys_to_virt(pstart); - /* - * Currently an auto-translated guest will not perform I/O, nor will - * it require PAE page directories below 4GB. Therefore any calls to - * this function are redundant and can be ignored. - */ - if (unlikely(order > MAX_CONTIG_ORDER)) return -ENOMEM; @@ -2327,7 +2321,7 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, xen_zap_pfn_range(vstart, order, in_frames, NULL); /* 2. Get a new contiguous memory extent. */ - out_frame = virt_to_pfn(vstart); + out_frame = virt_to_pfn((void *)vstart); success = xen_exchange_memory(1UL << order, 0, in_frames, 1, order, &out_frame, address_bits); @@ -2360,7 +2354,7 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) spin_lock_irqsave(&xen_reservation_lock, flags); /* 1. Find start MFN of contiguous extent. */ - in_frame = virt_to_mfn(vstart); + in_frame = virt_to_mfn((void *)vstart); /* 2. Zap current PTEs. */ xen_zap_pfn_range(vstart, order, NULL, out_frames); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 8b5cf7bb1f47..b3e37961065a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -44,6 +44,9 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; /* Number of pages released from the initial allocation. */ unsigned long xen_released_pages; +/* Memory map would allow PCI passthrough. */ +bool xen_pv_pci_possible; + /* E820 map used during setting up memory. */ static struct e820_table xen_e820_table __initdata; @@ -340,7 +343,7 @@ static void __init xen_do_set_identity_and_remap_chunk( WARN_ON(size == 0); - mfn_save = virt_to_mfn(buf); + mfn_save = virt_to_mfn((void *)buf); for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn; ident_pfn_iter < ident_end_pfn; @@ -503,7 +506,7 @@ void __init xen_remap_memory(void) unsigned long pfn_s = ~0UL; unsigned long len = 0; - mfn_save = virt_to_mfn(buf); + mfn_save = virt_to_mfn((void *)buf); while (xen_remap_mfn != INVALID_P2M_ENTRY) { /* Map the remap information */ @@ -814,6 +817,9 @@ char * __init xen_memory_setup(void) chunk_size = size; type = xen_e820_table.entries[i].type; + if (type == E820_TYPE_RESERVED) + xen_pv_pci_possible = true; + if (type == E820_TYPE_RAM) { if (addr < mem_end) { chunk_size = min(size, mem_end - addr); diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index cef78b8c89f4..a0f07bbfcd6e 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -182,7 +182,8 @@ static void __init _get_smp_config(unsigned int early) if (subtract) set_nr_cpu_ids(nr_cpu_ids - subtract); #endif - + /* Pretend to be a proper enumerated system */ + smp_found_config = 1; } static void __init xen_pv_smp_prepare_boot_cpu(void) @@ -210,7 +211,7 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus) { unsigned cpu; - if (skip_ioapic_setup) { + if (ioapic_is_disabled) { char *m = (max_cpus == 0) ? "The nosmp parameter is incompatible with Xen; " \ "use Xen dom0_max_vcpus=1 parameter" : diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 08f1ceb9eb81..9e5e68008785 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -148,7 +148,7 @@ xen_pv_trap asm_exc_page_fault xen_pv_trap asm_exc_spurious_interrupt_bug xen_pv_trap asm_exc_coprocessor_error xen_pv_trap asm_exc_alignment_check -#ifdef CONFIG_X86_KERNEL_IBT +#ifdef CONFIG_X86_CET xen_pv_trap asm_exc_control_protection #endif #ifdef CONFIG_X86_MCE |