author     Linus Torvalds <torvalds@linux-foundation.org>  2022-05-23 17:51:12 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2022-05-23 17:51:12 -0700
commit     3a755ebcc2557e22b895b8976257f682c653db1d (patch)
tree       7aeda9181996705ad1c82690b85ac53fe3b41716 /arch/x86/kernel/traps.c
parent     5b828263b180c16037382e8ffddd0611a363aabe (diff)
parent     c796f02162e428b595ff70196dca161ee46b163b (diff)
Merge tag 'x86_tdx_for_v5.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull Intel TDX support from Borislav Petkov:
 "Intel Trust Domain Extensions (TDX) support.

  This is the Intel version of a confidential computing solution called
  Trust Domain Extensions (TDX). This series adds support to run the
  kernel as part of a TDX guest. It provides similar guest protections
  to AMD's SEV-SNP, like guest memory and register state encryption,
  memory integrity protection and a lot more.

  Design-wise, it differs from AMD's solution considerably: it uses a
  software module which runs in a special CPU mode called SEAM (Secure
  Arbitration Mode). As the name suggests, this module serves as a sort
  of arbiter which the confidential guest calls for services it needs
  during its lifetime.

  Just like AMD's SNP set, this series reworks and streamlines certain
  parts of x86 arch code so that this feature can be properly
  accommodated"

* tag 'x86_tdx_for_v5.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (34 commits)
  x86/tdx: Fix RETs in TDX asm
  x86/tdx: Annotate a noreturn function
  x86/mm: Fix spacing within memory encryption features message
  x86/kaslr: Fix build warning in KASLR code in boot stub
  Documentation/x86: Document TDX kernel architecture
  ACPICA: Avoid cache flush inside virtual machines
  x86/tdx/ioapic: Add shared bit for IOAPIC base address
  x86/mm: Make DMA memory shared for TD guest
  x86/mm/cpa: Add support for TDX shared memory
  x86/tdx: Make pages shared in ioremap()
  x86/topology: Disable CPU online/offline control for TDX guests
  x86/boot: Avoid #VE during boot for TDX platforms
  x86/boot: Set CR0.NE early and keep it set during the boot
  x86/acpi/x86/boot: Add multiprocessor wake-up support
  x86/boot: Add a trampoline for booting APs via firmware handoff
  x86/tdx: Wire up KVM hypercalls
  x86/tdx: Port I/O: Add early boot support
  x86/tdx: Port I/O: Add runtime hypercalls
  x86/boot: Port I/O: Add decompression-time support for TDX
  x86/boot: Port I/O: Allow to hook up alternative helpers
  ...
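As a concrete illustration of the guest side of this series: a TDX guest identifies itself through CPUID leaf 0x21, which returns the "IntelTDX    " signature spread across EBX, EDX and ECX; tdx_early_init() in this series performs exactly this check. Below is a minimal user-space C sketch of the same probe, for illustration only — the program itself is hypothetical, while the leaf number and signature string are taken from the series:

/*
 * Hypothetical user-space sketch: probe for a TDX guest the same way
 * tdx_early_init() does, via CPUID leaf 0x21. Build with gcc on x86.
 */
#include <cpuid.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned int eax, sig[3] = { 0, 0, 0 };

	/* Leaf 0x21 is only defined if the maximum basic leaf reaches it. */
	if (__get_cpuid_max(0, NULL) < 0x21) {
		puts("not a TDX guest");
		return 1;
	}

	/* The signature is assembled in EBX, EDX, ECX order. */
	__cpuid_count(0x21, 0, eax, sig[0], sig[2], sig[1]);
	(void)eax;

	puts(memcmp(sig, "IntelTDX    ", sizeof(sig)) == 0 ?
	     "TDX guest" : "not a TDX guest");
	return 0;
}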
Diffstat (limited to 'arch/x86/kernel/traps.c')
-rw-r--r--  arch/x86/kernel/traps.c  143
1 file changed, 117 insertions(+), 26 deletions(-)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1563fb995005..a4e2efde5d1f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -62,6 +62,7 @@
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/vdso.h>
+#include <asm/tdx.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -686,13 +687,40 @@ static bool try_fixup_enqcmd_gp(void)
#endif
}
+static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr,
+ unsigned long error_code, const char *str)
+{
+ if (fixup_exception(regs, trapnr, error_code, 0))
+ return true;
+
+ current->thread.error_code = error_code;
+ current->thread.trap_nr = trapnr;
+
+ /*
+ * To safely handle a potential kprobe fault and to trust the
+ * result from kprobe_running(), we have to be non-preemptible.
+ */
+ if (!preemptible() && kprobe_running() &&
+ kprobe_fault_handler(regs, trapnr))
+ return true;
+
+ return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP;
+}
+
+static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr,
+ unsigned long error_code, const char *str)
+{
+ current->thread.error_code = error_code;
+ current->thread.trap_nr = trapnr;
+ show_signal(current, SIGSEGV, "", str, regs, error_code);
+ force_sig(SIGSEGV);
+}
+
DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
enum kernel_gp_hint hint = GP_NO_HINT;
- struct task_struct *tsk;
unsigned long gp_addr;
- int ret;
if (user_mode(regs) && try_fixup_enqcmd_gp())
return;
@@ -711,40 +739,18 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
return;
}
- tsk = current;
-
if (user_mode(regs)) {
if (fixup_iopl_exception(regs))
goto exit;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = X86_TRAP_GP;
-
if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0))
goto exit;
- show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
- force_sig(SIGSEGV);
+ gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc);
goto exit;
}
- if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
- goto exit;
-
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = X86_TRAP_GP;
-
- /*
- * To be potentially processing a kprobe fault and to trust the result
- * from kprobe_running(), we have to be non-preemptible.
- */
- if (!preemptible() &&
- kprobe_running() &&
- kprobe_fault_handler(regs, X86_TRAP_GP))
- goto exit;
-
- ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV);
- if (ret == NOTIFY_STOP)
+ if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc))
goto exit;
if (error_code)
@@ -1343,6 +1349,91 @@ DEFINE_IDTENTRY(exc_device_not_available)
}
}
+#ifdef CONFIG_INTEL_TDX_GUEST
+
+#define VE_FAULT_STR "VE fault"
+
+static void ve_raise_fault(struct pt_regs *regs, long error_code)
+{
+ if (user_mode(regs)) {
+ gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR);
+ return;
+ }
+
+ if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR))
+ return;
+
+ die_addr(VE_FAULT_STR, regs, error_code, 0);
+}
+
+/*
+ * Virtualization Exceptions (#VE) are delivered to TDX guests due to
+ * specific guest actions which may happen in either user space or the
+ * kernel:
+ *
+ * * Specific instructions (WBINVD, for example)
+ * * Specific MSR accesses
+ * * Specific CPUID leaf accesses
+ * * Access to specific guest physical addresses
+ *
+ * In the settings that Linux will run in, virtualization exceptions are
+ * never generated on accesses to normal, TD-private memory that has been
+ * accepted (by BIOS or with tdx_enc_status_changed()).
+ *
+ * Syscall entry code has a critical window where the kernel stack is not
+ * yet set up. Any exception in this window leads to hard-to-debug issues
+ * and can be exploited for privilege escalation. Exceptions in the NMI
+ * entry code also cause issues: returning from the exception handler with
+ * IRET re-enables NMIs, and a nested NMI will corrupt the NMI stack.
+ *
+ * For these reasons, the kernel avoids #VEs during the syscall gap and
+ * the NMI entry code. Entry code paths do not access TD-shared memory
+ * or MMIO regions, and do not use #VE-triggering MSRs, instructions,
+ * or CPUID leaves that might generate #VE. The VMM can remove memory
+ * from the TD at any point, but access to unaccepted (or missing)
+ * private memory leads to VM termination, not to a #VE.
+ *
+ * Similarly to page faults and breakpoints, #VEs are allowed in NMI
+ * handlers once the kernel is ready to deal with nested NMIs.
+ *
+ * During #VE delivery, all interrupts, including NMIs, are blocked until
+ * TDGETVEINFO is called. This prevents #VE nesting until the kernel has
+ * read the #VE info.
+ *
+ * If a guest kernel action which would normally cause a #VE occurs in
+ * the interrupt-disabled region before TDGETVEINFO, a #DF (double
+ * fault) is delivered to the guest, which will result in an oops.
+ *
+ * The entry code has been audited carefully to ensure it follows these
+ * expectations, and changes to it must be re-audited for correctness in
+ * this respect. As with #PF, a #VE in these places would expose the
+ * kernel to privilege escalation or random crashes.
+ */
+DEFINE_IDTENTRY(exc_virtualization_exception)
+{
+ struct ve_info ve;
+
+ /*
+ * NMIs, machine checks and interrupts are disabled until the
+ * TDGETVEINFO TDCALL is executed. This ensures that the #VE
+ * info cannot be overwritten by a nested #VE.
+ */
+ tdx_get_ve_info(&ve);
+
+ cond_local_irq_enable(regs);
+
+ /*
+ * If tdx_handle_virt_exception() could not process the #VE
+ * successfully, treat it as a #GP(0) and handle it.
+ */
+ if (!tdx_handle_virt_exception(regs, &ve))
+ ve_raise_fault(regs, 0);
+
+ cond_local_irq_disable(regs);
+}
+
+#endif
+
#ifdef CONFIG_X86_32
DEFINE_IDTENTRY_SW(iret_error)
{
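For context on the #VE handler added above: tdx_get_ve_info() fills a struct ve_info from the TDX module via the TDGETVEINFO TDCALL, and tdx_handle_virt_exception() dispatches on its exit reason. The following is a simplified, non-verbatim sketch of that shape, modeled on arch/x86/coco/tdx/tdx.c from this series; the field names match struct ve_info in asm/tdx.h, but the helper name ve_sketch_handle_kernel() is hypothetical and the emulation bodies are elided:

#include <linux/types.h>	/* u32, u64 */
#include <asm/vmx.h>		/* EXIT_REASON_* */

/* Matches struct ve_info in asm/tdx.h (shown here for reference). */
struct ve_info {
	u64 exit_reason;	/* why the #VE was delivered */
	u64 exit_qual;		/* exit qualification */
	u64 gla;		/* guest linear address, if relevant */
	u64 gpa;		/* guest physical address, if relevant */
	u32 instr_len;		/* length of the faulting instruction */
	u32 instr_info;		/* instruction information */
};

/*
 * Sketch of the kernel-mode dispatch: each exit reason is emulated
 * with a hypercall to the VMM/TDX module (emulation elided here).
 * Returning false is what makes exc_virtualization_exception() above
 * raise the fault via ve_raise_fault().
 */
static bool ve_sketch_handle_kernel(struct ve_info *ve)
{
	switch (ve->exit_reason) {
	case EXIT_REASON_HLT:
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
	case EXIT_REASON_CPUID:
	case EXIT_REASON_EPT_VIOLATION:		/* MMIO via shared memory */
	case EXIT_REASON_IO_INSTRUCTION:	/* port I/O */
		return true;	/* emulated successfully (body elided) */
	default:
		return false;
	}
}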