summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Makefile13
-rw-r--r--arch/x86/boot/compressed/Makefile7
-rw-r--r--arch/x86/boot/compressed/misc.c2
-rw-r--r--arch/x86/boot/compressed/misc.h2
-rw-r--r--arch/x86/boot/compressed/sev.c (renamed from arch/x86/boot/compressed/sev-es.c)4
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl2
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl2
-rw-r--r--arch/x86/events/core.c6
-rw-r--r--arch/x86/events/intel/core.c2
-rw-r--r--arch/x86/events/intel/lbr.c26
-rw-r--r--arch/x86/events/intel/uncore_snbep.c9
-rw-r--r--arch/x86/events/perf_event.h6
-rw-r--r--arch/x86/include/asm/apic.h1
-rw-r--r--arch/x86/include/asm/disabled-features.h7
-rw-r--r--arch/x86/include/asm/fpu/api.h6
-rw-r--r--arch/x86/include/asm/fpu/internal.h7
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h16
-rw-r--r--arch/x86/include/asm/kvm_para.h10
-rw-r--r--arch/x86/include/asm/msr-index.h6
-rw-r--r--arch/x86/include/asm/processor.h2
-rw-r--r--arch/x86/include/asm/sev-common.h62
-rw-r--r--arch/x86/include/asm/sev.h (renamed from arch/x86/include/asm/sev-es.h)30
-rw-r--r--arch/x86/include/asm/thermal.h4
-rw-r--r--arch/x86/include/asm/vdso/clocksource.h2
-rw-r--r--arch/x86/include/uapi/asm/kvm.h2
-rw-r--r--arch/x86/kernel/Makefile6
-rw-r--r--arch/x86/kernel/alternative.c64
-rw-r--r--arch/x86/kernel/apic/apic.c1
-rw-r--r--arch/x86/kernel/apic/vector.c20
-rw-r--r--arch/x86/kernel/cpu/amd.c20
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c4
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/fpu/xstate.c57
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/kvm.c129
-rw-r--r--arch/x86/kernel/kvmclock.c26
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c2
-rw-r--r--arch/x86/kernel/nmi.c2
-rw-r--r--arch/x86/kernel/setup.c44
-rw-r--r--arch/x86/kernel/sev-shared.c (renamed from arch/x86/kernel/sev-es-shared.c)21
-rw-r--r--arch/x86/kernel/sev.c (renamed from arch/x86/kernel/sev-es.c)140
-rw-r--r--arch/x86/kernel/signal_compat.c9
-rw-r--r--arch/x86/kernel/smpboot.c2
-rw-r--r--arch/x86/kvm/cpuid.c20
-rw-r--r--arch/x86/kvm/emulate.c7
-rw-r--r--arch/x86/kvm/hyperv.c8
-rw-r--r--arch/x86/kvm/kvm_emulate.h4
-rw-r--r--arch/x86/kvm/lapic.c35
-rw-r--r--arch/x86/kvm/mmu/mmu.c20
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h14
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c23
-rw-r--r--arch/x86/kvm/svm/avic.c6
-rw-r--r--arch/x86/kvm/svm/nested.c23
-rw-r--r--arch/x86/kvm/svm/sev.c38
-rw-r--r--arch/x86/kvm/svm/svm.c70
-rw-r--r--arch/x86/kvm/svm/svm.h41
-rw-r--r--arch/x86/kvm/trace.h6
-rw-r--r--arch/x86/kvm/vmx/capabilities.h6
-rw-r--r--arch/x86/kvm/vmx/nested.c29
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c14
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h1
-rw-r--r--arch/x86/kvm/vmx/vmx.c226
-rw-r--r--arch/x86/kvm/vmx/vmx.h12
-rw-r--r--arch/x86/kvm/x86.c201
-rw-r--r--arch/x86/mm/extable.c2
-rw-r--r--arch/x86/mm/fault.c4
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c17
-rw-r--r--arch/x86/pci/amd_bus.c2
-rw-r--r--arch/x86/platform/efi/efi_64.c2
-rw-r--r--arch/x86/platform/efi/quirks.c12
-rw-r--r--arch/x86/realmode/init.c16
-rw-r--r--arch/x86/realmode/rm/trampoline_64.S4
-rw-r--r--arch/x86/xen/enlighten_pv.c8
75 files changed, 986 insertions, 677 deletions
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index c77c5d8a7b3e..cb5e8d39cac1 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -178,11 +178,6 @@ ifeq ($(ACCUMULATE_OUTGOING_ARGS), 1)
KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args,)
endif
-ifdef CONFIG_LTO_CLANG
-KBUILD_LDFLAGS += -plugin-opt=-code-model=kernel \
- -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8)
-endif
-
# Workaround for a gcc prelease that unfortunately was shipped in a suse release
KBUILD_CFLAGS += -Wno-sign-compare
#
@@ -202,7 +197,13 @@ ifdef CONFIG_RETPOLINE
endif
endif
-KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)
+KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE)
+
+ifdef CONFIG_LTO_CLANG
+ifeq ($(shell test $(CONFIG_LLD_VERSION) -lt 130000; echo $$?),0)
+KBUILD_LDFLAGS += -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8)
+endif
+endif
ifdef CONFIG_X86_NEED_RELOCS
LDFLAGS_vmlinux := --emit-relocs --discard-none
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 6e5522aebbbd..431bf7f846c3 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -30,6 +30,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
KBUILD_CFLAGS := -m$(BITS) -O2
KBUILD_CFLAGS += -fno-strict-aliasing -fPIE
+KBUILD_CFLAGS += -Wundef
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
cflags-$(CONFIG_X86_32) := -march=i386
cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone
@@ -48,10 +49,10 @@ KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no)
KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h
KBUILD_CFLAGS += $(CLANG_FLAGS)
-# sev-es.c indirectly inludes inat-table.h which is generated during
+# sev.c indirectly inludes inat-table.h which is generated during
# compilation and stored in $(objtree). Add the directory to the includes so
# that the compiler finds it even with out-of-tree builds (make O=/some/path).
-CFLAGS_sev-es.o += -I$(objtree)/arch/x86/lib/
+CFLAGS_sev.o += -I$(objtree)/arch/x86/lib/
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
@@ -93,7 +94,7 @@ ifdef CONFIG_X86_64
vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o
vmlinux-objs-y += $(obj)/mem_encrypt.o
vmlinux-objs-y += $(obj)/pgtable_64.o
- vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev-es.o
+ vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o
endif
vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index dde042f64cca..743f13ea25c1 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -172,7 +172,7 @@ void __puthex(unsigned long value)
}
}
-#if CONFIG_X86_NEED_RELOCS
+#ifdef CONFIG_X86_NEED_RELOCS
static void handle_relocations(void *output, unsigned long output_len,
unsigned long virt_addr)
{
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index e5612f035498..31139256859f 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -79,7 +79,7 @@ struct mem_vector {
u64 size;
};
-#if CONFIG_RANDOMIZE_BASE
+#ifdef CONFIG_RANDOMIZE_BASE
/* kaslr.c */
void choose_random_location(unsigned long input,
unsigned long input_size,
diff --git a/arch/x86/boot/compressed/sev-es.c b/arch/x86/boot/compressed/sev.c
index 82041bd380e5..670e998fe930 100644
--- a/arch/x86/boot/compressed/sev-es.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -13,7 +13,7 @@
#include "misc.h"
#include <asm/pgtable_types.h>
-#include <asm/sev-es.h>
+#include <asm/sev.h>
#include <asm/trapnr.h>
#include <asm/trap_pf.h>
#include <asm/msr-index.h>
@@ -117,7 +117,7 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
#include "../../lib/insn.c"
/* Include code for early handlers */
-#include "../../kernel/sev-es-shared.c"
+#include "../../kernel/sev-shared.c"
static bool early_setup_sev_es(void)
{
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 28a1423ce32e..4bbc267fb36b 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -447,7 +447,7 @@
440 i386 process_madvise sys_process_madvise
441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
442 i386 mount_setattr sys_mount_setattr
-443 i386 quotactl_path sys_quotactl_path
+# 443 reserved for quotactl_path
444 i386 landlock_create_ruleset sys_landlock_create_ruleset
445 i386 landlock_add_rule sys_landlock_add_rule
446 i386 landlock_restrict_self sys_landlock_restrict_self
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index ecd551b08d05..ce18119ea0d0 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -364,7 +364,7 @@
440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2
442 common mount_setattr sys_mount_setattr
-443 common quotactl_path sys_quotactl_path
+# 443 reserved for quotactl_path
444 common landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 8e509325c2c3..8f71dd72ef95 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -396,10 +396,12 @@ int x86_reserve_hardware(void)
if (!atomic_inc_not_zero(&pmc_refcount)) {
mutex_lock(&pmc_reserve_mutex);
if (atomic_read(&pmc_refcount) == 0) {
- if (!reserve_pmc_hardware())
+ if (!reserve_pmc_hardware()) {
err = -EBUSY;
- else
+ } else {
reserve_ds_buffers();
+ reserve_lbr_buffers();
+ }
}
if (!err)
atomic_inc(&pmc_refcount);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 2521d03de5e0..e28892270c58 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -6253,7 +6253,7 @@ __init int intel_pmu_init(void)
* Check all LBT MSR here.
* Disable LBR access if any LBR MSRs can not be accessed.
*/
- if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
+ if (x86_pmu.lbr_tos && !check_msr(x86_pmu.lbr_tos, 0x3UL))
x86_pmu.lbr_nr = 0;
for (i = 0; i < x86_pmu.lbr_nr; i++) {
if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 76dbab6ac9fb..4409d2cccfda 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -658,7 +658,6 @@ static inline bool branch_user_callstack(unsigned br_sel)
void intel_pmu_lbr_add(struct perf_event *event)
{
- struct kmem_cache *kmem_cache = event->pmu->task_ctx_cache;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (!x86_pmu.lbr_nr)
@@ -696,11 +695,6 @@ void intel_pmu_lbr_add(struct perf_event *event)
perf_sched_cb_inc(event->ctx->pmu);
if (!cpuc->lbr_users++ && !event->total_time_running)
intel_pmu_lbr_reset();
-
- if (static_cpu_has(X86_FEATURE_ARCH_LBR) &&
- kmem_cache && !cpuc->lbr_xsave &&
- (cpuc->lbr_users != cpuc->lbr_pebs_users))
- cpuc->lbr_xsave = kmem_cache_alloc(kmem_cache, GFP_KERNEL);
}
void release_lbr_buffers(void)
@@ -722,6 +716,26 @@ void release_lbr_buffers(void)
}
}
+void reserve_lbr_buffers(void)
+{
+ struct kmem_cache *kmem_cache;
+ struct cpu_hw_events *cpuc;
+ int cpu;
+
+ if (!static_cpu_has(X86_FEATURE_ARCH_LBR))
+ return;
+
+ for_each_possible_cpu(cpu) {
+ cpuc = per_cpu_ptr(&cpu_hw_events, cpu);
+ kmem_cache = x86_get_pmu(cpu)->task_ctx_cache;
+ if (!kmem_cache || cpuc->lbr_xsave)
+ continue;
+
+ cpuc->lbr_xsave = kmem_cache_alloc_node(kmem_cache, GFP_KERNEL,
+ cpu_to_node(cpu));
+ }
+}
+
void intel_pmu_lbr_del(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 63f097289a84..3a75a2c601c2 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1406,6 +1406,8 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
die_id = i;
else
die_id = topology_phys_to_logical_pkg(i);
+ if (die_id < 0)
+ die_id = -ENODEV;
map->pbus_to_dieid[bus] = die_id;
break;
}
@@ -1452,14 +1454,14 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
i = -1;
if (reverse) {
for (bus = 255; bus >= 0; bus--) {
- if (map->pbus_to_dieid[bus] >= 0)
+ if (map->pbus_to_dieid[bus] != -1)
i = map->pbus_to_dieid[bus];
else
map->pbus_to_dieid[bus] = i;
}
} else {
for (bus = 0; bus <= 255; bus++) {
- if (map->pbus_to_dieid[bus] >= 0)
+ if (map->pbus_to_dieid[bus] != -1)
i = map->pbus_to_dieid[bus];
else
map->pbus_to_dieid[bus] = i;
@@ -5097,9 +5099,10 @@ static struct intel_uncore_type icx_uncore_m2m = {
.perf_ctr = SNR_M2M_PCI_PMON_CTR0,
.event_ctl = SNR_M2M_PCI_PMON_CTL0,
.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SNR_M2M_PCI_PMON_UMASK_EXT,
.box_ctl = SNR_M2M_PCI_PMON_BOX_CTL,
.ops = &snr_m2m_uncore_pci_ops,
- .format_group = &skx_uncore_format_group,
+ .format_group = &snr_m2m_uncore_format_group,
};
static struct attribute *icx_upi_uncore_formats_attr[] = {
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 27fa85e7d4fd..ad87cb36f7c8 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1244,6 +1244,8 @@ void reserve_ds_buffers(void);
void release_lbr_buffers(void);
+void reserve_lbr_buffers(void);
+
extern struct event_constraint bts_constraint;
extern struct event_constraint vlbr_constraint;
@@ -1393,6 +1395,10 @@ static inline void release_lbr_buffers(void)
{
}
+static inline void reserve_lbr_buffers(void)
+{
+}
+
static inline int intel_pmu_init(void)
{
return 0;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 412b51e059c8..48067af94678 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -174,6 +174,7 @@ static inline int apic_is_clustered_box(void)
extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask);
extern void lapic_assign_system_vectors(void);
extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace);
+extern void lapic_update_legacy_vectors(void);
extern void lapic_online(void);
extern void lapic_offline(void);
extern bool apic_needs_pit(void);
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index b7dd944dc867..8f28fafa98b3 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -56,11 +56,8 @@
# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
#endif
-#ifdef CONFIG_IOMMU_SUPPORT
-# define DISABLE_ENQCMD 0
-#else
-# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31))
-#endif
+/* Force disable because it's broken beyond repair */
+#define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31))
#ifdef CONFIG_X86_SGX
# define DISABLE_SGX 0
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index ed33a14188f6..23bef08a8388 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -106,10 +106,6 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name);
*/
#define PASID_DISABLED 0
-#ifdef CONFIG_IOMMU_SUPPORT
-/* Update current's PASID MSR/state by mm's PASID. */
-void update_pasid(void);
-#else
static inline void update_pasid(void) { }
-#endif
+
#endif /* _ASM_X86_FPU_API_H */
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 8d33ad80704f..ceeba9f63172 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -584,13 +584,6 @@ static inline void switch_fpu_finish(struct fpu *new_fpu)
pkru_val = pk->pkru;
}
__write_pkru(pkru_val);
-
- /*
- * Expensive PASID MSR write will be avoided in update_pasid() because
- * TIF_NEED_FPU_LOAD was set. And the PASID state won't be updated
- * unless it's different from mm->pasid to reduce overhead.
- */
- update_pasid();
}
#endif /* _ASM_X86_FPU_INTERNAL_H */
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 323641097f63..e7bef91cee04 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -99,6 +99,7 @@ KVM_X86_OP_NULL(post_block)
KVM_X86_OP_NULL(vcpu_blocking)
KVM_X86_OP_NULL(vcpu_unblocking)
KVM_X86_OP_NULL(update_pi_irte)
+KVM_X86_OP_NULL(start_assignment)
KVM_X86_OP_NULL(apicv_post_state_restore)
KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt)
KVM_X86_OP_NULL(set_hv_timer)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index cbbcee0a84f9..9c7ced0e3171 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -113,6 +113,7 @@
#define VALID_PAGE(x) ((x) != INVALID_PAGE)
#define UNMAPPED_GVA (~(gpa_t)0)
+#define INVALID_GPA (~(gpa_t)0)
/* KVM Hugepage definitions for x86 */
#define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G
@@ -199,6 +200,7 @@ enum x86_intercept_stage;
#define KVM_NR_DB_REGS 4
+#define DR6_BUS_LOCK (1 << 11)
#define DR6_BD (1 << 13)
#define DR6_BS (1 << 14)
#define DR6_BT (1 << 15)
@@ -212,7 +214,7 @@ enum x86_intercept_stage;
* DR6_ACTIVE_LOW is also used as the init/reset value for DR6.
*/
#define DR6_ACTIVE_LOW 0xffff0ff0
-#define DR6_VOLATILE 0x0001e00f
+#define DR6_VOLATILE 0x0001e80f
#define DR6_FIXED_1 (DR6_ACTIVE_LOW & ~DR6_VOLATILE)
#define DR7_BP_EN_MASK 0x000000ff
@@ -407,7 +409,7 @@ struct kvm_mmu {
u32 pkru_mask;
u64 *pae_root;
- u64 *lm_root;
+ u64 *pml4_root;
/*
* check zero bits on shadow page table entries, these
@@ -1350,6 +1352,7 @@ struct kvm_x86_ops {
int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
+ void (*start_assignment)(struct kvm *kvm);
void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
@@ -1417,6 +1420,7 @@ struct kvm_arch_async_pf {
bool direct_map;
};
+extern u32 __read_mostly kvm_nr_uret_msrs;
extern u64 __read_mostly host_efer;
extern bool __read_mostly allow_smaller_maxphyaddr;
extern struct kvm_x86_ops kvm_x86_ops;
@@ -1775,9 +1779,15 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
unsigned long ipi_bitmap_high, u32 min,
unsigned long icr, int op_64_bit);
-void kvm_define_user_return_msr(unsigned index, u32 msr);
+int kvm_add_user_return_msr(u32 msr);
+int kvm_find_user_return_msr(u32 msr);
int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
+static inline bool kvm_is_supported_user_return_msr(u32 msr)
+{
+ return kvm_find_user_return_msr(msr) >= 0;
+}
+
u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc);
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 338119852512..69299878b200 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -7,8 +7,6 @@
#include <linux/interrupt.h>
#include <uapi/asm/kvm_para.h>
-extern void kvmclock_init(void);
-
#ifdef CONFIG_KVM_GUEST
bool kvm_check_and_clear_guest_paused(void);
#else
@@ -86,13 +84,14 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
}
#ifdef CONFIG_KVM_GUEST
+void kvmclock_init(void);
+void kvmclock_disable(void);
bool kvm_para_available(void);
unsigned int kvm_arch_para_features(void);
unsigned int kvm_arch_para_hints(void);
void kvm_async_pf_task_wait_schedule(u32 token);
void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_apf_flags(void);
-void kvm_disable_steal_time(void);
bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token);
DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
@@ -137,11 +136,6 @@ static inline u32 kvm_read_and_reset_apf_flags(void)
return 0;
}
-static inline void kvm_disable_steal_time(void)
-{
- return;
-}
-
static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
return false;
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 742d89a00721..211ba3375ee9 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -537,9 +537,9 @@
/* K8 MSRs */
#define MSR_K8_TOP_MEM1 0xc001001a
#define MSR_K8_TOP_MEM2 0xc001001d
-#define MSR_K8_SYSCFG 0xc0010010
-#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23
-#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT)
+#define MSR_AMD64_SYSCFG 0xc0010010
+#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23
+#define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT)
#define MSR_K8_INT_PENDING_MSG 0xc0010055
/* C1E active bits in int pending message */
#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 154321d29050..556b2b17c3e2 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -787,8 +787,10 @@ DECLARE_PER_CPU(u64, msr_misc_features_shadow);
#ifdef CONFIG_CPU_SUP_AMD
extern u32 amd_get_nodes_per_socket(void);
+extern u32 amd_get_highest_perf(void);
#else
static inline u32 amd_get_nodes_per_socket(void) { return 0; }
+static inline u32 amd_get_highest_perf(void) { return 0; }
#endif
static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
new file mode 100644
index 000000000000..629c3df243f0
--- /dev/null
+++ b/arch/x86/include/asm/sev-common.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AMD SEV header common between the guest and the hypervisor.
+ *
+ * Author: Brijesh Singh <brijesh.singh@amd.com>
+ */
+
+#ifndef __ASM_X86_SEV_COMMON_H
+#define __ASM_X86_SEV_COMMON_H
+
+#define GHCB_MSR_INFO_POS 0
+#define GHCB_MSR_INFO_MASK (BIT_ULL(12) - 1)
+
+#define GHCB_MSR_SEV_INFO_RESP 0x001
+#define GHCB_MSR_SEV_INFO_REQ 0x002
+#define GHCB_MSR_VER_MAX_POS 48
+#define GHCB_MSR_VER_MAX_MASK 0xffff
+#define GHCB_MSR_VER_MIN_POS 32
+#define GHCB_MSR_VER_MIN_MASK 0xffff
+#define GHCB_MSR_CBIT_POS 24
+#define GHCB_MSR_CBIT_MASK 0xff
+#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \
+ ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \
+ (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \
+ (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \
+ GHCB_MSR_SEV_INFO_RESP)
+#define GHCB_MSR_INFO(v) ((v) & 0xfffUL)
+#define GHCB_MSR_PROTO_MAX(v) (((v) >> GHCB_MSR_VER_MAX_POS) & GHCB_MSR_VER_MAX_MASK)
+#define GHCB_MSR_PROTO_MIN(v) (((v) >> GHCB_MSR_VER_MIN_POS) & GHCB_MSR_VER_MIN_MASK)
+
+#define GHCB_MSR_CPUID_REQ 0x004
+#define GHCB_MSR_CPUID_RESP 0x005
+#define GHCB_MSR_CPUID_FUNC_POS 32
+#define GHCB_MSR_CPUID_FUNC_MASK 0xffffffff
+#define GHCB_MSR_CPUID_VALUE_POS 32
+#define GHCB_MSR_CPUID_VALUE_MASK 0xffffffff
+#define GHCB_MSR_CPUID_REG_POS 30
+#define GHCB_MSR_CPUID_REG_MASK 0x3
+#define GHCB_CPUID_REQ_EAX 0
+#define GHCB_CPUID_REQ_EBX 1
+#define GHCB_CPUID_REQ_ECX 2
+#define GHCB_CPUID_REQ_EDX 3
+#define GHCB_CPUID_REQ(fn, reg) \
+ (GHCB_MSR_CPUID_REQ | \
+ (((unsigned long)reg & GHCB_MSR_CPUID_REG_MASK) << GHCB_MSR_CPUID_REG_POS) | \
+ (((unsigned long)fn) << GHCB_MSR_CPUID_FUNC_POS))
+
+#define GHCB_MSR_TERM_REQ 0x100
+#define GHCB_MSR_TERM_REASON_SET_POS 12
+#define GHCB_MSR_TERM_REASON_SET_MASK 0xf
+#define GHCB_MSR_TERM_REASON_POS 16
+#define GHCB_MSR_TERM_REASON_MASK 0xff
+#define GHCB_SEV_TERM_REASON(reason_set, reason_val) \
+ (((((u64)reason_set) & GHCB_MSR_TERM_REASON_SET_MASK) << GHCB_MSR_TERM_REASON_SET_POS) | \
+ ((((u64)reason_val) & GHCB_MSR_TERM_REASON_MASK) << GHCB_MSR_TERM_REASON_POS))
+
+#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0
+#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1
+
+#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK)
+
+#endif
diff --git a/arch/x86/include/asm/sev-es.h b/arch/x86/include/asm/sev.h
index cf1d957c7091..fa5cd05d3b5b 100644
--- a/arch/x86/include/asm/sev-es.h
+++ b/arch/x86/include/asm/sev.h
@@ -10,34 +10,12 @@
#include <linux/types.h>
#include <asm/insn.h>
+#include <asm/sev-common.h>
-#define GHCB_SEV_INFO 0x001UL
-#define GHCB_SEV_INFO_REQ 0x002UL
-#define GHCB_INFO(v) ((v) & 0xfffUL)
-#define GHCB_PROTO_MAX(v) (((v) >> 48) & 0xffffUL)
-#define GHCB_PROTO_MIN(v) (((v) >> 32) & 0xffffUL)
-#define GHCB_PROTO_OUR 0x0001UL
-#define GHCB_SEV_CPUID_REQ 0x004UL
-#define GHCB_CPUID_REQ_EAX 0
-#define GHCB_CPUID_REQ_EBX 1
-#define GHCB_CPUID_REQ_ECX 2
-#define GHCB_CPUID_REQ_EDX 3
-#define GHCB_CPUID_REQ(fn, reg) (GHCB_SEV_CPUID_REQ | \
- (((unsigned long)reg & 3) << 30) | \
- (((unsigned long)fn) << 32))
+#define GHCB_PROTO_OUR 0x0001UL
+#define GHCB_PROTOCOL_MAX 1ULL
+#define GHCB_DEFAULT_USAGE 0ULL
-#define GHCB_PROTOCOL_MAX 0x0001UL
-#define GHCB_DEFAULT_USAGE 0x0000UL
-
-#define GHCB_SEV_CPUID_RESP 0x005UL
-#define GHCB_SEV_TERMINATE 0x100UL
-#define GHCB_SEV_TERMINATE_REASON(reason_set, reason_val) \
- (((((u64)reason_set) & 0x7) << 12) | \
- ((((u64)reason_val) & 0xff) << 16))
-#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0
-#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1
-
-#define GHCB_SEV_GHCB_RESP_CODE(v) ((v) & 0xfff)
#define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); }
enum es_result {
diff --git a/arch/x86/include/asm/thermal.h b/arch/x86/include/asm/thermal.h
index ddbdefd5b94f..91a7b6687c3b 100644
--- a/arch/x86/include/asm/thermal.h
+++ b/arch/x86/include/asm/thermal.h
@@ -3,11 +3,13 @@
#define _ASM_X86_THERMAL_H
#ifdef CONFIG_X86_THERMAL_VECTOR
+void therm_lvt_init(void);
void intel_init_thermal(struct cpuinfo_x86 *c);
bool x86_thermal_enabled(void);
void intel_thermal_interrupt(void);
#else
-static inline void intel_init_thermal(struct cpuinfo_x86 *c) { }
+static inline void therm_lvt_init(void) { }
+static inline void intel_init_thermal(struct cpuinfo_x86 *c) { }
#endif
#endif /* _ASM_X86_THERMAL_H */
diff --git a/arch/x86/include/asm/vdso/clocksource.h b/arch/x86/include/asm/vdso/clocksource.h
index 119ac8612d89..136e5e57cfe1 100644
--- a/arch/x86/include/asm/vdso/clocksource.h
+++ b/arch/x86/include/asm/vdso/clocksource.h
@@ -7,4 +7,6 @@
VDSO_CLOCKMODE_PVCLOCK, \
VDSO_CLOCKMODE_HVCLOCK
+#define HAVE_VDSO_CLOCKMODE_HVCLOCK
+
#endif /* __ASM_VDSO_CLOCKSOURCE_H */
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 5a3022c8af82..0662f644aad9 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -437,6 +437,8 @@ struct kvm_vmx_nested_state_hdr {
__u16 flags;
} smm;
+ __u16 pad;
+
__u32 flags;
__u64 preemption_timer_deadline;
};
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0704c2a94272..0f66682ac02a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -20,7 +20,7 @@ CFLAGS_REMOVE_kvmclock.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
CFLAGS_REMOVE_head64.o = -pg
-CFLAGS_REMOVE_sev-es.o = -pg
+CFLAGS_REMOVE_sev.o = -pg
endif
KASAN_SANITIZE_head$(BITS).o := n
@@ -28,7 +28,7 @@ KASAN_SANITIZE_dumpstack.o := n
KASAN_SANITIZE_dumpstack_$(BITS).o := n
KASAN_SANITIZE_stacktrace.o := n
KASAN_SANITIZE_paravirt.o := n
-KASAN_SANITIZE_sev-es.o := n
+KASAN_SANITIZE_sev.o := n
# With some compiler versions the generated code results in boot hangs, caused
# by several compilation units. To be safe, disable all instrumentation.
@@ -148,7 +148,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
-obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev-es.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6974b5174495..6fe5b44fcbc9 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -183,41 +183,69 @@ done:
}
/*
+ * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
+ *
+ * @instr: instruction byte stream
+ * @instrlen: length of the above
+ * @off: offset within @instr where the first NOP has been detected
+ *
+ * Return: number of NOPs found (and replaced).
+ */
+static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
+{
+ unsigned long flags;
+ int i = off, nnops;
+
+ while (i < instrlen) {
+ if (instr[i] != 0x90)
+ break;
+
+ i++;
+ }
+
+ nnops = i - off;
+
+ if (nnops <= 1)
+ return nnops;
+
+ local_irq_save(flags);
+ add_nops(instr + off, nnops);
+ local_irq_restore(flags);
+
+ DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
+
+ return nnops;
+}
+
+/*
* "noinline" to cause control flow change and thus invalidate I$ and
* cause refetch after modification.
*/
static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
{
- unsigned long flags;
struct insn insn;
- int nop, i = 0;
+ int i = 0;
/*
- * Jump over the non-NOP insns, the remaining bytes must be single-byte
- * NOPs, optimize them.
+ * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
+ * ones.
*/
for (;;) {
if (insn_decode_kernel(&insn, &instr[i]))
return;
+ /*
+ * See if this and any potentially following NOPs can be
+ * optimized.
+ */
if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
- break;
-
- if ((i += insn.length) >= a->instrlen)
- return;
- }
+ i += optimize_nops_range(instr, a->instrlen, i);
+ else
+ i += insn.length;
- for (nop = i; i < a->instrlen; i++) {
- if (WARN_ONCE(instr[i] != 0x90, "Not a NOP at 0x%px\n", &instr[i]))
+ if (i >= a->instrlen)
return;
}
-
- local_irq_save(flags);
- add_nops(instr + nop, i - nop);
- local_irq_restore(flags);
-
- DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
- instr, nop, a->instrlen);
}
/*
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 4a39fb429f15..d262811ce14b 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2604,6 +2604,7 @@ static void __init apic_bsp_setup(bool upmode)
end_local_APIC_setup();
irq_remap_enable_fault_handling();
setup_IO_APIC();
+ lapic_update_legacy_vectors();
}
#ifdef CONFIG_UP_LATE_INIT
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 6dbdc7c22bb7..fb67ed5e7e6a 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -738,6 +738,26 @@ void lapic_assign_legacy_vector(unsigned int irq, bool replace)
irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace);
}
+void __init lapic_update_legacy_vectors(void)
+{
+ unsigned int i;
+
+ if (IS_ENABLED(CONFIG_X86_IO_APIC) && nr_ioapics > 0)
+ return;
+
+ /*
+ * If the IO/APIC is disabled via config, kernel command line or
+ * lack of enumeration then all legacy interrupts are routed
+ * through the PIC. Make sure that they are marked as legacy
+ * vectors. PIC_CASCADE_IRQ has already been marked in
+ * lapic_assign_system_vectors().
+ */
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ if (i != PIC_CASCADE_IR)
+ lapic_assign_legacy_vector(i, true);
+ }
+}
+
void __init lapic_assign_system_vectors(void)
{
unsigned int i, vector = 0;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 2d11384dc9ab..c06ac56eae4d 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -593,8 +593,8 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
*/
if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) {
/* Check if memory encryption is enabled */
- rdmsrl(MSR_K8_SYSCFG, msr);
- if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ rdmsrl(MSR_AMD64_SYSCFG, msr);
+ if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
goto clear_all;
/*
@@ -1165,3 +1165,19 @@ void set_dr_addr_mask(unsigned long mask, int dr)
break;
}
}
+
+u32 amd_get_highest_perf(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
+ (c->x86_model >= 0x70 && c->x86_model < 0x80)))
+ return 166;
+
+ if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
+ (c->x86_model >= 0x40 && c->x86_model < 0x70)))
+ return 166;
+
+ return 255;
+}
+EXPORT_SYMBOL_GPL(amd_get_highest_perf);
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 0c3b372318b7..b5f43049fa5f 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -836,7 +836,7 @@ int __init amd_special_default_mtrr(void)
if (boot_cpu_data.x86 < 0xf)
return 0;
/* In case some hypervisor doesn't pass SYSCFG through: */
- if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
+ if (rdmsr_safe(MSR_AMD64_SYSCFG, &l, &h) < 0)
return 0;
/*
* Memory between 4GB and top of mem is forced WB by this magic bit.
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index b90f3f437765..558108296f3c 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -53,13 +53,13 @@ static inline void k8_check_syscfg_dram_mod_en(void)
(boot_cpu_data.x86 >= 0x0f)))
return;
- rdmsr(MSR_K8_SYSCFG, lo, hi);
+ rdmsr(MSR_AMD64_SYSCFG, lo, hi);
if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
" not cleared by BIOS, clearing this bit\n",
smp_processor_id());
lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
- mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
+ mtrr_wrmsr(MSR_AMD64_SYSCFG, lo, hi);
}
}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 3ef5868ac588..7aecb2fc3186 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -63,7 +63,7 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
case 15:
return msr - MSR_P4_BPU_PERFCTR0;
}
- fallthrough;
+ break;
case X86_VENDOR_ZHAOXIN:
case X86_VENDOR_CENTAUR:
return msr - MSR_ARCH_PERFMON_PERFCTR0;
@@ -96,7 +96,7 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
case 15:
return msr - MSR_P4_BSU_ESCR0;
}
- fallthrough;
+ break;
case X86_VENDOR_ZHAOXIN:
case X86_VENDOR_CENTAUR:
return msr - MSR_ARCH_PERFMON_EVENTSEL0;
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index a85c64000218..d0eef963aad1 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1402,60 +1402,3 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
return 0;
}
#endif /* CONFIG_PROC_PID_ARCH_STATUS */
-
-#ifdef CONFIG_IOMMU_SUPPORT
-void update_pasid(void)
-{
- u64 pasid_state;
- u32 pasid;
-
- if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
- return;
-
- if (!current->mm)
- return;
-
- pasid = READ_ONCE(current->mm->pasid);
- /* Set the valid bit in the PASID MSR/state only for valid pasid. */
- pasid_state = pasid == PASID_DISABLED ?
- pasid : pasid | MSR_IA32_PASID_VALID;
-
- /*
- * No need to hold fregs_lock() since the task's fpstate won't
- * be changed by others (e.g. ptrace) while the task is being
- * switched to or is in IPI.
- */
- if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
- /* The MSR is active and can be directly updated. */
- wrmsrl(MSR_IA32_PASID, pasid_state);
- } else {
- struct fpu *fpu = &current->thread.fpu;
- struct ia32_pasid_state *ppasid_state;
- struct xregs_state *xsave;
-
- /*
- * The CPU's xstate registers are not currently active. Just
- * update the PASID state in the memory buffer here. The
- * PASID MSR will be loaded when returning to user mode.
- */
- xsave = &fpu->state.xsave;
- xsave->header.xfeatures |= XFEATURE_MASK_PASID;
- ppasid_state = get_xsave_addr(xsave, XFEATURE_PASID);
- /*
- * Since XFEATURE_MASK_PASID is set in xfeatures, ppasid_state
- * won't be NULL and no need to check its value.
- *
- * Only update the task's PASID state when it's different
- * from the mm's pasid.
- */
- if (ppasid_state->pasid != pasid_state) {
- /*
- * Invalid fpregs so that state restoring will pick up
- * the PASID state.
- */
- __fpu_invalidate_fpregs_state(fpu);
- ppasid_state->pasid = pasid_state;
- }
- }
-}
-#endif /* CONFIG_IOMMU_SUPPORT */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 18be44163a50..de01903c3735 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -39,7 +39,7 @@
#include <asm/realmode.h>
#include <asm/extable.h>
#include <asm/trapnr.h>
-#include <asm/sev-es.h>
+#include <asm/sev.h>
/*
* Manage page tables very early on.
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index d307c22e5c18..a26643dc6bd6 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -26,6 +26,7 @@
#include <linux/kprobes.h>
#include <linux/nmi.h>
#include <linux/swait.h>
+#include <linux/syscore_ops.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
@@ -37,6 +38,7 @@
#include <asm/tlb.h>
#include <asm/cpuidle_haltpoll.h>
#include <asm/ptrace.h>
+#include <asm/reboot.h>
#include <asm/svm.h>
DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
@@ -345,7 +347,7 @@ static void kvm_guest_cpu_init(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
__this_cpu_write(apf_reason.enabled, 1);
- pr_info("KVM setup async PF for cpu %d\n", smp_processor_id());
+ pr_info("setup async PF for cpu %d\n", smp_processor_id());
}
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
@@ -371,34 +373,17 @@ static void kvm_pv_disable_apf(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
__this_cpu_write(apf_reason.enabled, 0);
- pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id());
+ pr_info("disable async PF for cpu %d\n", smp_processor_id());
}
-static void kvm_pv_guest_cpu_reboot(void *unused)
+static void kvm_disable_steal_time(void)
{
- /*
- * We disable PV EOI before we load a new kernel by kexec,
- * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
- * New kernel can re-enable when it boots.
- */
- if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
- wrmsrl(MSR_KVM_PV_EOI_EN, 0);
- kvm_pv_disable_apf();
- kvm_disable_steal_time();
-}
+ if (!has_steal_clock)
+ return;
-static int kvm_pv_reboot_notify(struct notifier_block *nb,
- unsigned long code, void *unused)
-{
- if (code == SYS_RESTART)
- on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
- return NOTIFY_DONE;
+ wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
}
-static struct notifier_block kvm_pv_reboot_nb = {
- .notifier_call = kvm_pv_reboot_notify,
-};
-
static u64 kvm_steal_clock(int cpu)
{
u64 steal;
@@ -416,14 +401,6 @@ static u64 kvm_steal_clock(int cpu)
return steal;
}
-void kvm_disable_steal_time(void)
-{
- if (!has_steal_clock)
- return;
-
- wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
-}
-
static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
{
early_set_memory_decrypted((unsigned long) ptr, size);
@@ -451,6 +428,27 @@ static void __init sev_map_percpu_data(void)
}
}
+static void kvm_guest_cpu_offline(bool shutdown)
+{
+ kvm_disable_steal_time();
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+ wrmsrl(MSR_KVM_PV_EOI_EN, 0);
+ kvm_pv_disable_apf();
+ if (!shutdown)
+ apf_task_wake_all();
+ kvmclock_disable();
+}
+
+static int kvm_cpu_online(unsigned int cpu)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ kvm_guest_cpu_init();
+ local_irq_restore(flags);
+ return 0;
+}
+
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
@@ -635,31 +633,64 @@ static void __init kvm_smp_prepare_boot_cpu(void)
kvm_spinlock_init();
}
-static void kvm_guest_cpu_offline(void)
+static int kvm_cpu_down_prepare(unsigned int cpu)
{
- kvm_disable_steal_time();
- if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
- wrmsrl(MSR_KVM_PV_EOI_EN, 0);
- kvm_pv_disable_apf();
- apf_task_wake_all();
+ unsigned long flags;
+
+ local_irq_save(flags);
+ kvm_guest_cpu_offline(false);
+ local_irq_restore(flags);
+ return 0;
}
-static int kvm_cpu_online(unsigned int cpu)
+#endif
+
+static int kvm_suspend(void)
{
- local_irq_disable();
- kvm_guest_cpu_init();
- local_irq_enable();
+ kvm_guest_cpu_offline(false);
+
return 0;
}
-static int kvm_cpu_down_prepare(unsigned int cpu)
+static void kvm_resume(void)
{
- local_irq_disable();
- kvm_guest_cpu_offline();
- local_irq_enable();
- return 0;
+ kvm_cpu_online(raw_smp_processor_id());
+}
+
+static struct syscore_ops kvm_syscore_ops = {
+ .suspend = kvm_suspend,
+ .resume = kvm_resume,
+};
+
+static void kvm_pv_guest_cpu_reboot(void *unused)
+{
+ kvm_guest_cpu_offline(true);
+}
+
+static int kvm_pv_reboot_notify(struct notifier_block *nb,
+ unsigned long code, void *unused)
+{
+ if (code == SYS_RESTART)
+ on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
+ return NOTIFY_DONE;
}
+static struct notifier_block kvm_pv_reboot_nb = {
+ .notifier_call = kvm_pv_reboot_notify,
+};
+
+/*
+ * After a PV feature is registered, the host will keep writing to the
+ * registered memory location. If the guest happens to shutdown, this memory
+ * won't be valid. In cases like kexec, in which you install a new kernel, this
+ * means a random memory location will be kept being written.
+ */
+#ifdef CONFIG_KEXEC_CORE
+static void kvm_crash_shutdown(struct pt_regs *regs)
+{
+ kvm_guest_cpu_offline(true);
+ native_machine_crash_shutdown(regs);
+}
#endif
static void __init kvm_guest_init(void)
@@ -704,6 +735,12 @@ static void __init kvm_guest_init(void)
kvm_guest_cpu_init();
#endif
+#ifdef CONFIG_KEXEC_CORE
+ machine_ops.crash_shutdown = kvm_crash_shutdown;
+#endif
+
+ register_syscore_ops(&kvm_syscore_ops);
+
/*
* Hard lockup detection is enabled by default. Disable it, as guests
* can get false positives too easily, for example if the host is
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index d37ed4e1d033..ad273e5861c1 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -20,7 +20,6 @@
#include <asm/hypervisor.h>
#include <asm/mem_encrypt.h>
#include <asm/x86_init.h>
-#include <asm/reboot.h>
#include <asm/kvmclock.h>
static int kvmclock __initdata = 1;
@@ -203,28 +202,9 @@ static void kvm_setup_secondary_clock(void)
}
#endif
-/*
- * After the clock is registered, the host will keep writing to the
- * registered memory location. If the guest happens to shutdown, this memory
- * won't be valid. In cases like kexec, in which you install a new kernel, this
- * means a random memory location will be kept being written. So before any
- * kind of shutdown from our side, we unregister the clock by writing anything
- * that does not have the 'enable' bit set in the msr
- */
-#ifdef CONFIG_KEXEC_CORE
-static void kvm_crash_shutdown(struct pt_regs *regs)
-{
- native_write_msr(msr_kvm_system_time, 0, 0);
- kvm_disable_steal_time();
- native_machine_crash_shutdown(regs);
-}
-#endif
-
-static void kvm_shutdown(void)
+void kvmclock_disable(void)
{
native_write_msr(msr_kvm_system_time, 0, 0);
- kvm_disable_steal_time();
- native_machine_shutdown();
}
static void __init kvmclock_init_mem(void)
@@ -351,10 +331,6 @@ void __init kvmclock_init(void)
#endif
x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
- machine_ops.shutdown = kvm_shutdown;
-#ifdef CONFIG_KEXEC_CORE
- machine_ops.crash_shutdown = kvm_crash_shutdown;
-#endif
kvm_get_preset_lpj();
/*
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index b5cb49e57df8..c94dec6a1834 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -95,7 +95,7 @@ static void get_fam10h_pci_mmconf_base(void)
return;
/* SYS_CFG */
- address = MSR_K8_SYSCFG;
+ address = MSR_AMD64_SYSCFG;
rdmsrl(address, val);
/* TOP_MEM2 is not enabled? */
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 2ef961cf4cfc..4bce802d25fb 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -33,7 +33,7 @@
#include <asm/reboot.h>
#include <asm/cache.h>
#include <asm/nospec-branch.h>
-#include <asm/sev-es.h>
+#include <asm/sev.h>
#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 72920af0b3c0..1e720626069a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -44,6 +44,7 @@
#include <asm/pci-direct.h>
#include <asm/prom.h>
#include <asm/proto.h>
+#include <asm/thermal.h>
#include <asm/unwind.h>
#include <asm/vsyscall.h>
#include <linux/vmalloc.h>
@@ -637,11 +638,11 @@ static void __init trim_snb_memory(void)
* them from accessing certain memory ranges, namely anything below
* 1M and in the pages listed in bad_pages[] above.
*
- * To avoid these pages being ever accessed by SNB gfx devices
- * reserve all memory below the 1 MB mark and bad_pages that have
- * not already been reserved at boot time.
+ * To avoid these pages being ever accessed by SNB gfx devices reserve
+ * bad_pages that have not already been reserved at boot time.
+ * All memory below the 1 MB mark is anyway reserved later during
+ * setup_arch(), so there is no need to reserve it here.
*/
- memblock_reserve(0, 1<<20);
for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
if (memblock_reserve(bad_pages[i], PAGE_SIZE))
@@ -733,14 +734,14 @@ static void __init early_reserve_memory(void)
* The first 4Kb of memory is a BIOS owned area, but generally it is
* not listed as such in the E820 table.
*
- * Reserve the first memory page and typically some additional
- * memory (64KiB by default) since some BIOSes are known to corrupt
- * low memory. See the Kconfig help text for X86_RESERVE_LOW.
+ * Reserve the first 64K of memory since some BIOSes are known to
+ * corrupt low memory. After the real mode trampoline is allocated the
+ * rest of the memory below 640k is reserved.
*
* In addition, make sure page 0 is always reserved because on
* systems with L1TF its contents can be leaked to user processes.
*/
- memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
+ memblock_reserve(0, SZ_64K);
early_reserve_initrd();
@@ -751,6 +752,7 @@ static void __init early_reserve_memory(void)
reserve_ibft_region();
reserve_bios_regions();
+ trim_snb_memory();
}
/*
@@ -1081,14 +1083,20 @@ void __init setup_arch(char **cmdline_p)
(max_pfn_mapped<<PAGE_SHIFT) - 1);
#endif
- reserve_real_mode();
-
/*
- * Reserving memory causing GPU hangs on Sandy Bridge integrated
- * graphics devices should be done after we allocated memory under
- * 1M for the real mode trampoline.
+ * Find free memory for the real mode trampoline and place it
+ * there.
+ * If there is not enough free memory under 1M, on EFI-enabled
+ * systems there will be additional attempt to reclaim the memory
+ * for the real mode trampoline at efi_free_boot_services().
+ *
+ * Unconditionally reserve the entire first 1M of RAM because
+ * BIOSes are know to corrupt low memory and several
+ * hundred kilobytes are not worth complex detection what memory gets
+ * clobbered. Moreover, on machines with SandyBridge graphics or in
+ * setups that use crashkernel the entire 1M is reserved anyway.
*/
- trim_snb_memory();
+ reserve_real_mode();
init_mem_mapping();
@@ -1226,6 +1234,14 @@ void __init setup_arch(char **cmdline_p)
x86_init.timers.wallclock_init();
+ /*
+ * This needs to run before setup_local_APIC() which soft-disables the
+ * local APIC temporarily and that masks the thermal LVT interrupt,
+ * leading to softlockups on machines which have configured SMI
+ * interrupt delivery.
+ */
+ therm_lvt_init();
+
mcheck_init();
register_refined_jiffies(CLOCK_TICK_RATE);
diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-shared.c
index 0aa9f13efd57..9f90f460a28c 100644
--- a/arch/x86/kernel/sev-es-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -26,13 +26,13 @@ static bool __init sev_es_check_cpu_features(void)
static void __noreturn sev_es_terminate(unsigned int reason)
{
- u64 val = GHCB_SEV_TERMINATE;
+ u64 val = GHCB_MSR_TERM_REQ;
/*
* Tell the hypervisor what went wrong - only reason-set 0 is
* currently supported.
*/
- val |= GHCB_SEV_TERMINATE_REASON(0, reason);
+ val |= GHCB_SEV_TERM_REASON(0, reason);
/* Request Guest Termination from Hypvervisor */
sev_es_wr_ghcb_msr(val);
@@ -47,15 +47,15 @@ static bool sev_es_negotiate_protocol(void)
u64 val;
/* Do the GHCB protocol version negotiation */
- sev_es_wr_ghcb_msr(GHCB_SEV_INFO_REQ);
+ sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ);
VMGEXIT();
val = sev_es_rd_ghcb_msr();
- if (GHCB_INFO(val) != GHCB_SEV_INFO)
+ if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP)
return false;
- if (GHCB_PROTO_MAX(val) < GHCB_PROTO_OUR ||
- GHCB_PROTO_MIN(val) > GHCB_PROTO_OUR)
+ if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTO_OUR ||
+ GHCB_MSR_PROTO_MIN(val) > GHCB_PROTO_OUR)
return false;
return true;
@@ -63,6 +63,7 @@ static bool sev_es_negotiate_protocol(void)
static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb)
{
+ ghcb->save.sw_exit_code = 0;
memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}
@@ -153,28 +154,28 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX));
VMGEXIT();
val = sev_es_rd_ghcb_msr();
- if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
+ if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP)
goto fail;
regs->ax = val >> 32;
sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX));
VMGEXIT();
val = sev_es_rd_ghcb_msr();
- if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
+ if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP)
goto fail;
regs->bx = val >> 32;
sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX));
VMGEXIT();
val = sev_es_rd_ghcb_msr();
- if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
+ if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP)
goto fail;
regs->cx = val >> 32;
sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX));
VMGEXIT();
val = sev_es_rd_ghcb_msr();
- if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
+ if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP)
goto fail;
regs->dx = val >> 32;
diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev.c
index 73873b007838..651b81cd648e 100644
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev.c
@@ -22,7 +22,7 @@
#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
-#include <asm/sev-es.h>
+#include <asm/sev.h>
#include <asm/insn-eval.h>
#include <asm/fpu/internal.h>
#include <asm/processor.h>
@@ -203,8 +203,18 @@ static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state)
if (unlikely(data->ghcb_active)) {
/* GHCB is already in use - save its contents */
- if (unlikely(data->backup_ghcb_active))
- return NULL;
+ if (unlikely(data->backup_ghcb_active)) {
+ /*
+ * Backup-GHCB is also already in use. There is no way
+ * to continue here so just kill the machine. To make
+ * panic() work, mark GHCBs inactive so that messages
+ * can be printed out.
+ */
+ data->ghcb_active = false;
+ data->backup_ghcb_active = false;
+
+ panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
+ }
/* Mark backup_ghcb active before writing to it */
data->backup_ghcb_active = true;
@@ -221,24 +231,6 @@ static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state)
return ghcb;
}
-static __always_inline void sev_es_put_ghcb(struct ghcb_state *state)
-{
- struct sev_es_runtime_data *data;
- struct ghcb *ghcb;
-
- data = this_cpu_read(runtime_data);
- ghcb = &data->ghcb_page;
-
- if (state->ghcb) {
- /* Restore GHCB from Backup */
- *ghcb = *state->ghcb;
- data->backup_ghcb_active = false;
- state->ghcb = NULL;
- } else {
- data->ghcb_active = false;
- }
-}
-
/* Needed in vc_early_forward_exception */
void do_early_exception(struct pt_regs *regs, int trapnr);
@@ -323,31 +315,44 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
u16 d2;
u8 d1;
- /* If instruction ran in kernel mode and the I/O buffer is in kernel space */
- if (!user_mode(ctxt->regs) && !access_ok(target, size)) {
- memcpy(dst, buf, size);
- return ES_OK;
- }
-
+ /*
+ * This function uses __put_user() independent of whether kernel or user
+ * memory is accessed. This works fine because __put_user() does no
+ * sanity checks of the pointer being accessed. All that it does is
+ * to report when the access failed.
+ *
+ * Also, this function runs in atomic context, so __put_user() is not
+ * allowed to sleep. The page-fault handler detects that it is running
+ * in atomic context and will not try to take mmap_sem and handle the
+ * fault, so additional pagefault_enable()/disable() calls are not
+ * needed.
+ *
+ * The access can't be done via copy_to_user() here because
+ * vc_write_mem() must not use string instructions to access unsafe
+ * memory. The reason is that MOVS is emulated by the #VC handler by
+ * splitting the move up into a read and a write and taking a nested #VC
+ * exception on whatever of them is the MMIO access. Using string
+ * instructions here would cause infinite nesting.
+ */
switch (size) {
case 1:
memcpy(&d1, buf, 1);
- if (put_user(d1, target))
+ if (__put_user(d1, target))
goto fault;
break;
case 2:
memcpy(&d2, buf, 2);
- if (put_user(d2, target))
+ if (__put_user(d2, target))
goto fault;
break;
case 4:
memcpy(&d4, buf, 4);
- if (put_user(d4, target))
+ if (__put_user(d4, target))
goto fault;
break;
case 8:
memcpy(&d8, buf, 8);
- if (put_user(d8, target))
+ if (__put_user(d8, target))
goto fault;
break;
default:
@@ -378,30 +383,43 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
u16 d2;
u8 d1;
- /* If instruction ran in kernel mode and the I/O buffer is in kernel space */
- if (!user_mode(ctxt->regs) && !access_ok(s, size)) {
- memcpy(buf, src, size);
- return ES_OK;
- }
-
+ /*
+ * This function uses __get_user() independent of whether kernel or user
+ * memory is accessed. This works fine because __get_user() does no
+ * sanity checks of the pointer being accessed. All that it does is
+ * to report when the access failed.
+ *
+ * Also, this function runs in atomic context, so __get_user() is not
+ * allowed to sleep. The page-fault handler detects that it is running
+ * in atomic context and will not try to take mmap_sem and handle the
+ * fault, so additional pagefault_enable()/disable() calls are not
+ * needed.
+ *
+ * The access can't be done via copy_from_user() here because
+ * vc_read_mem() must not use string instructions to access unsafe
+ * memory. The reason is that MOVS is emulated by the #VC handler by
+ * splitting the move up into a read and a write and taking a nested #VC
+ * exception on whatever of them is the MMIO access. Using string
+ * instructions here would cause infinite nesting.
+ */
switch (size) {
case 1:
- if (get_user(d1, s))
+ if (__get_user(d1, s))
goto fault;
memcpy(buf, &d1, 1);
break;
case 2:
- if (get_user(d2, s))
+ if (__get_user(d2, s))
goto fault;
memcpy(buf, &d2, 2);
break;
case 4:
- if (get_user(d4, s))
+ if (__get_user(d4, s))
goto fault;
memcpy(buf, &d4, 4);
break;
case 8:
- if (get_user(d8, s))
+ if (__get_user(d8, s))
goto fault;
memcpy(buf, &d8, 8);
break;
@@ -459,7 +477,30 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt
}
/* Include code shared with pre-decompression boot stage */
-#include "sev-es-shared.c"
+#include "sev-shared.c"
+
+static __always_inline void sev_es_put_ghcb(struct ghcb_state *state)
+{
+ struct sev_es_runtime_data *data;
+ struct ghcb *ghcb;
+
+ data = this_cpu_read(runtime_data);
+ ghcb = &data->ghcb_page;
+
+ if (state->ghcb) {
+ /* Restore GHCB from Backup */
+ *ghcb = *state->ghcb;
+ data->backup_ghcb_active = false;
+ state->ghcb = NULL;
+ } else {
+ /*
+ * Invalidate the GHCB so a VMGEXIT instruction issued
+ * from userspace won't appear to be valid.
+ */
+ vc_ghcb_invalidate(ghcb);
+ data->ghcb_active = false;
+ }
+}
void noinstr __sev_es_nmi_complete(void)
{
@@ -1255,6 +1296,10 @@ static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
case X86_TRAP_UD:
exc_invalid_op(ctxt->regs);
break;
+ case X86_TRAP_PF:
+ write_cr2(ctxt->fi.cr2);
+ exc_page_fault(ctxt->regs, error_code);
+ break;
case X86_TRAP_AC:
exc_alignment_check(ctxt->regs, error_code);
break;
@@ -1284,7 +1329,6 @@ static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs)
*/
DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
{
- struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
irqentry_state_t irq_state;
struct ghcb_state state;
struct es_em_ctxt ctxt;
@@ -1310,16 +1354,6 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
*/
ghcb = sev_es_get_ghcb(&state);
- if (!ghcb) {
- /*
- * Mark GHCBs inactive so that panic() is able to print the
- * message.
- */
- data->ghcb_active = false;
- data->backup_ghcb_active = false;
-
- panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
- }
vc_ghcb_invalidate(ghcb);
result = vc_init_em_ctxt(&ctxt, regs, error_code);
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index 0e5d0a7e203b..06743ec054d2 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -127,6 +127,9 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(offsetof(siginfo_t, si_addr) != 0x10);
BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr) != 0x0C);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_trapno) != 0x18);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_trapno) != 0x10);
+
BUILD_BUG_ON(offsetof(siginfo_t, si_addr_lsb) != 0x18);
BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr_lsb) != 0x10);
@@ -138,8 +141,10 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20);
BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14);
- BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x18);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf) != 0x10);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x18);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x20);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_data) != 0x10);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_type) != 0x14);
CHECK_CSI_OFFSET(_sigpoll);
CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int));
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 0ad5214f598a..7770245cc7fa 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -2043,7 +2043,7 @@ static bool amd_set_max_freq_ratio(void)
return false;
}
- highest_perf = perf_caps.highest_perf;
+ highest_perf = amd_get_highest_perf();
nominal_perf = perf_caps.nominal_perf;
if (!highest_perf || !nominal_perf) {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 19606a341888..9a48f138832d 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -458,7 +458,7 @@ void kvm_set_cpu_caps(void)
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
- F(SGX_LC)
+ F(SGX_LC) | F(BUS_LOCK_DETECT)
);
/* Set LA57 based on hardware capability. */
if (cpuid_ecx(7) & F(LA57))
@@ -567,6 +567,21 @@ void kvm_set_cpu_caps(void)
F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
F(PMM) | F(PMM_EN)
);
+
+ /*
+ * Hide RDTSCP and RDPID if either feature is reported as supported but
+ * probing MSR_TSC_AUX failed. This is purely a sanity check and
+ * should never happen, but the guest will likely crash if RDTSCP or
+ * RDPID is misreported, and KVM has botched MSR_TSC_AUX emulation in
+ * the past. For example, the sanity check may fire if this instance of
+ * KVM is running as L1 on top of an older, broken KVM.
+ */
+ if (WARN_ON((kvm_cpu_cap_has(X86_FEATURE_RDTSCP) ||
+ kvm_cpu_cap_has(X86_FEATURE_RDPID)) &&
+ !kvm_is_supported_user_return_msr(MSR_TSC_AUX))) {
+ kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
+ kvm_cpu_cap_clear(X86_FEATURE_RDPID);
+ }
}
EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
@@ -637,7 +652,8 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
case 7:
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
entry->eax = 0;
- entry->ecx = F(RDPID);
+ if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
+ entry->ecx = F(RDPID);
++array->nent;
default:
break;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 77e1c89a95a7..5e5de05a8fbf 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4502,7 +4502,7 @@ static const struct opcode group8[] = {
* from the register case of group9.
*/
static const struct gprefix pfx_0f_c7_7 = {
- N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp),
+ N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdpid),
};
@@ -5111,7 +5111,7 @@ done:
return rc;
}
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type)
{
int rc = X86EMUL_CONTINUE;
int mode = ctxt->mode;
@@ -5322,7 +5322,8 @@ done_prefixes:
ctxt->execute = opcode.u.execute;
- if (unlikely(ctxt->ud) && likely(!(ctxt->d & EmulateOnUD)))
+ if (unlikely(emulation_type & EMULTYPE_TRAP_UD) &&
+ likely(!(ctxt->d & EmulateOnUD)))
return EMULATION_FAILED;
if (unlikely(ctxt->d &
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index f98370a39936..f00830e5202f 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1172,6 +1172,7 @@ void kvm_hv_invalidate_tsc_page(struct kvm *kvm)
{
struct kvm_hv *hv = to_kvm_hv(kvm);
u64 gfn;
+ int idx;
if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET ||
@@ -1190,9 +1191,16 @@ void kvm_hv_invalidate_tsc_page(struct kvm *kvm)
gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
hv->tsc_ref.tsc_sequence = 0;
+
+ /*
+ * Take the srcu lock as memslots will be accessed to check the gfn
+ * cache generation against the memslots generation.
+ */
+ idx = srcu_read_lock(&kvm->srcu);
if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
&hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN;
+ srcu_read_unlock(&kvm->srcu, idx);
out_unlock:
mutex_unlock(&hv->hv_lock);
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 0d359115429a..3e870bf9ca4d 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -314,7 +314,6 @@ struct x86_emulate_ctxt {
int interruptibility;
bool perm_ok; /* do not check permissions if true */
- bool ud; /* inject an #UD if host doesn't support insn */
bool tf; /* TF value before instruction (after for syscall/sysret) */
bool have_exception;
@@ -468,6 +467,7 @@ enum x86_intercept {
x86_intercept_clgi,
x86_intercept_skinit,
x86_intercept_rdtscp,
+ x86_intercept_rdpid,
x86_intercept_icebp,
x86_intercept_wbinvd,
x86_intercept_monitor,
@@ -490,7 +490,7 @@ enum x86_intercept {
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
#endif
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type);
bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
#define EMULATION_FAILED -1
#define EMULATION_OK 0
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 152591f9243a..6d72d8f43310 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1494,6 +1494,15 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
static void cancel_hv_timer(struct kvm_lapic *apic);
+static void cancel_apic_timer(struct kvm_lapic *apic)
+{
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ preempt_disable();
+ if (apic->lapic_timer.hv_timer_in_use)
+ cancel_hv_timer(apic);
+ preempt_enable();
+}
+
static void apic_update_lvtt(struct kvm_lapic *apic)
{
u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
@@ -1502,11 +1511,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
if (apic->lapic_timer.timer_mode != timer_mode) {
if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
APIC_LVT_TIMER_TSCDEADLINE)) {
- hrtimer_cancel(&apic->lapic_timer.timer);
- preempt_disable();
- if (apic->lapic_timer.hv_timer_in_use)
- cancel_hv_timer(apic);
- preempt_enable();
+ cancel_apic_timer(apic);
kvm_lapic_set_reg(apic, APIC_TMICT, 0);
apic->lapic_timer.period = 0;
apic->lapic_timer.tscdeadline = 0;
@@ -1598,11 +1603,19 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline;
+ if (lapic_timer_advance_dynamic) {
+ adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
+ /*
+ * If the timer fired early, reread the TSC to account for the
+ * overhead of the above adjustment to avoid waiting longer
+ * than is necessary.
+ */
+ if (guest_tsc < tsc_deadline)
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ }
+
if (guest_tsc < tsc_deadline)
__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
-
- if (lapic_timer_advance_dynamic)
- adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
}
void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
@@ -1661,7 +1674,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
}
atomic_inc(&apic->lapic_timer.pending);
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
if (from_timer_fn)
kvm_vcpu_kick(vcpu);
}
@@ -1913,8 +1926,8 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
if (!apic->lapic_timer.hv_timer_in_use)
goto out;
WARN_ON(rcuwait_active(&vcpu->wait));
- cancel_hv_timer(apic);
apic_timer_expired(apic, false);
+ cancel_hv_timer(apic);
if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
advance_periodic_target_expiration(apic);
@@ -2084,7 +2097,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
if (apic_lvtt_tscdeadline(apic))
break;
- hrtimer_cancel(&apic->lapic_timer.timer);
+ cancel_apic_timer(apic);
kvm_lapic_set_reg(apic, APIC_TMICT, val);
start_apic_timer(apic);
break;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 4b3ee244ebe0..0144c40d09c7 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3310,12 +3310,12 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
- if (WARN_ON_ONCE(!mmu->lm_root)) {
+ if (WARN_ON_ONCE(!mmu->pml4_root)) {
r = -EIO;
goto out_unlock;
}
- mmu->lm_root[0] = __pa(mmu->pae_root) | pm_mask;
+ mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
}
for (i = 0; i < 4; ++i) {
@@ -3335,7 +3335,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
}
if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
- mmu->root_hpa = __pa(mmu->lm_root);
+ mmu->root_hpa = __pa(mmu->pml4_root);
else
mmu->root_hpa = __pa(mmu->pae_root);
@@ -3350,7 +3350,7 @@ out_unlock:
static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
- u64 *lm_root, *pae_root;
+ u64 *pml4_root, *pae_root;
/*
* When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
@@ -3369,14 +3369,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
return -EIO;
- if (mmu->pae_root && mmu->lm_root)
+ if (mmu->pae_root && mmu->pml4_root)
return 0;
/*
* The special roots should always be allocated in concert. Yell and
* bail if KVM ends up in a state where only one of the roots is valid.
*/
- if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->lm_root))
+ if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root))
return -EIO;
/*
@@ -3387,14 +3387,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
if (!pae_root)
return -ENOMEM;
- lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
- if (!lm_root) {
+ pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pml4_root) {
free_page((unsigned long)pae_root);
return -ENOMEM;
}
mmu->pae_root = pae_root;
- mmu->lm_root = lm_root;
+ mmu->pml4_root = pml4_root;
return 0;
}
@@ -5261,7 +5261,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
if (!tdp_enabled && mmu->pae_root)
set_memory_encrypted((unsigned long)mmu->pae_root, 1);
free_page((unsigned long)mmu->pae_root);
- free_page((unsigned long)mmu->lm_root);
+ free_page((unsigned long)mmu->pml4_root);
}
static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 70b7e44e3035..823a5919f9fa 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -90,8 +90,8 @@ struct guest_walker {
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
bool pte_writable[PT_MAX_FULL_LEVELS];
- unsigned pt_access;
- unsigned pte_access;
+ unsigned int pt_access[PT_MAX_FULL_LEVELS];
+ unsigned int pte_access;
gfn_t gfn;
struct x86_exception fault;
};
@@ -418,13 +418,15 @@ retry_walk:
}
walker->ptes[walker->level - 1] = pte;
+
+ /* Convert to ACC_*_MASK flags for struct guest_walker. */
+ walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
} while (!is_last_gpte(mmu, walker->level, pte));
pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
/* Convert to ACC_*_MASK flags for struct guest_walker. */
- walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
if (unlikely(errcode))
@@ -463,7 +465,8 @@ retry_walk:
}
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __func__, (u64)pte, walker->pte_access, walker->pt_access);
+ __func__, (u64)pte, walker->pte_access,
+ walker->pt_access[walker->level - 1]);
return 1;
error:
@@ -643,7 +646,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
- unsigned direct_access, access = gw->pt_access;
+ unsigned int direct_access, access;
int top_level, level, req_level, ret;
gfn_t base_gfn = gw->gfn;
@@ -675,6 +678,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
sp = NULL;
if (!is_shadow_present_pte(*it.sptep)) {
table_gfn = gw->table_gfn[it.level - 2];
+ access = gw->pt_access[it.level - 2];
sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
false, access);
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 88f69a6cc492..237317b1eddd 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -388,7 +388,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
}
/**
- * handle_changed_spte - handle bookkeeping associated with an SPTE change
+ * __handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance
* @as_id: the address space of the paging structure the SPTE was a part of
* @gfn: the base GFN that was mapped by the SPTE
@@ -444,6 +444,13 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
+ if (is_large_pte(old_spte) != is_large_pte(new_spte)) {
+ if (is_large_pte(old_spte))
+ atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages);
+ else
+ atomic64_add(1, (atomic64_t*)&kvm->stat.lpages);
+ }
+
/*
* The only times a SPTE should be changed from a non-present to
* non-present state is when an MMIO entry is installed/modified/
@@ -1009,6 +1016,14 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
}
if (!is_shadow_present_pte(iter.old_spte)) {
+ /*
+ * If SPTE has been forzen by another thread, just
+ * give up and retry, avoiding unnecessary page table
+ * allocation and free.
+ */
+ if (is_removed_spte(iter.old_spte))
+ break;
+
sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
child_pt = sp->spt;
@@ -1177,9 +1192,9 @@ bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
}
/*
- * Remove write access from all the SPTEs mapping GFNs [start, end). If
- * skip_4k is set, SPTEs that map 4k pages, will not be write-protected.
- * Returns true if an SPTE has been changed and the TLBs need to be flushed.
+ * Remove write access from all SPTEs at or above min_level that map GFNs
+ * [start, end). Returns true if an SPTE has been changed and the TLBs need to
+ * be flushed.
*/
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end, int min_level)
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 712b4e0de481..0e62e6a2438c 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -28,10 +28,8 @@
#include "svm.h"
/* enable / disable AVIC */
-int avic;
-#ifdef CONFIG_X86_LOCAL_APIC
-module_param(avic, int, S_IRUGO);
-#endif
+bool avic;
+module_param(avic, bool, S_IRUGO);
#define SVM_AVIC_DOORBELL 0xc001011b
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 540d43ba2cf4..5e8d8443154e 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -764,7 +764,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
svm_switch_vmcb(svm, &svm->vmcb01);
- WARN_ON_ONCE(svm->vmcb->control.exit_code != SVM_EXIT_VMRUN);
/*
* On vmexit the GIF is set to false and
@@ -872,6 +871,15 @@ void svm_free_nested(struct vcpu_svm *svm)
__free_page(virt_to_page(svm->nested.vmcb02.ptr));
svm->nested.vmcb02.ptr = NULL;
+ /*
+ * When last_vmcb12_gpa matches the current vmcb12 gpa,
+ * some vmcb12 fields are not loaded if they are marked clean
+ * in the vmcb12, since in this case they are up to date already.
+ *
+ * When the vmcb02 is freed, this optimization becomes invalid.
+ */
+ svm->nested.last_vmcb12_gpa = INVALID_GPA;
+
svm->nested.initialized = false;
}
@@ -884,9 +892,11 @@ void svm_leave_nested(struct vcpu_svm *svm)
if (is_guest_mode(vcpu)) {
svm->nested.nested_run_pending = 0;
+ svm->nested.vmcb12_gpa = INVALID_GPA;
+
leave_guest_mode(vcpu);
- svm_switch_vmcb(svm, &svm->nested.vmcb02);
+ svm_switch_vmcb(svm, &svm->vmcb01);
nested_svm_uninit_mmu_context(vcpu);
vmcb_mark_all_dirty(svm->vmcb);
@@ -1298,12 +1308,17 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
* L2 registers if needed are moved from the current VMCB to VMCB02.
*/
+ if (is_guest_mode(vcpu))
+ svm_leave_nested(svm);
+ else
+ svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
+
+ svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
+
svm->nested.nested_run_pending =
!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
- if (svm->current_vmcb == &svm->vmcb01)
- svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
svm->vmcb01.ptr->save.es = save->es;
svm->vmcb01.ptr->save.cs = save->cs;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 1356ee095cd5..e0ce5da97fc2 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -763,7 +763,7 @@ static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
}
static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
- unsigned long __user dst_uaddr,
+ void __user *dst_uaddr,
unsigned long dst_paddr,
int size, int *err)
{
@@ -787,8 +787,7 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
if (tpage) {
offset = paddr & 15;
- if (copy_to_user((void __user *)(uintptr_t)dst_uaddr,
- page_address(tpage) + offset, size))
+ if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size))
ret = -EFAULT;
}
@@ -800,9 +799,9 @@ e_free:
}
static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
- unsigned long __user vaddr,
+ void __user *vaddr,
unsigned long dst_paddr,
- unsigned long __user dst_vaddr,
+ void __user *dst_vaddr,
int size, int *error)
{
struct page *src_tpage = NULL;
@@ -810,13 +809,12 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
int ret, len = size;
/* If source buffer is not aligned then use an intermediate buffer */
- if (!IS_ALIGNED(vaddr, 16)) {
+ if (!IS_ALIGNED((unsigned long)vaddr, 16)) {
src_tpage = alloc_page(GFP_KERNEL);
if (!src_tpage)
return -ENOMEM;
- if (copy_from_user(page_address(src_tpage),
- (void __user *)(uintptr_t)vaddr, size)) {
+ if (copy_from_user(page_address(src_tpage), vaddr, size)) {
__free_page(src_tpage);
return -EFAULT;
}
@@ -830,7 +828,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
* - copy the source buffer in an intermediate buffer
* - use the intermediate buffer as source buffer
*/
- if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
+ if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
int dst_offset;
dst_tpage = alloc_page(GFP_KERNEL);
@@ -855,7 +853,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
page_address(src_tpage), size);
else {
if (copy_from_user(page_address(dst_tpage) + dst_offset,
- (void __user *)(uintptr_t)vaddr, size)) {
+ vaddr, size)) {
ret = -EFAULT;
goto e_free;
}
@@ -935,15 +933,15 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
if (dec)
ret = __sev_dbg_decrypt_user(kvm,
__sme_page_pa(src_p[0]) + s_off,
- dst_vaddr,
+ (void __user *)dst_vaddr,
__sme_page_pa(dst_p[0]) + d_off,
len, &argp->error);
else
ret = __sev_dbg_encrypt_user(kvm,
__sme_page_pa(src_p[0]) + s_off,
- vaddr,
+ (void __user *)vaddr,
__sme_page_pa(dst_p[0]) + d_off,
- dst_vaddr,
+ (void __user *)dst_vaddr,
len, &argp->error);
sev_unpin_memory(kvm, src_p, n);
@@ -1105,10 +1103,9 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
struct sev_data_send_start data;
int ret;
+ memset(&data, 0, sizeof(data));
data.handle = sev->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
- if (ret < 0)
- return ret;
params->session_len = data.session_len;
if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
@@ -1217,10 +1214,9 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
struct sev_data_send_update_data data;
int ret;
+ memset(&data, 0, sizeof(data));
data.handle = sev->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
- if (ret < 0)
- return ret;
params->hdr_len = data.hdr_len;
params->trans_len = data.trans_len;
@@ -1764,7 +1760,8 @@ e_mirror_unlock:
e_source_unlock:
mutex_unlock(&source_kvm->lock);
e_source_put:
- fput(source_kvm_file);
+ if (source_kvm_file)
+ fput(source_kvm_file);
return ret;
}
@@ -2198,7 +2195,7 @@ vmgexit_err:
return -EINVAL;
}
-static void pre_sev_es_run(struct vcpu_svm *svm)
+void sev_es_unmap_ghcb(struct vcpu_svm *svm)
{
if (!svm->ghcb)
return;
@@ -2234,9 +2231,6 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu)
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
int asid = sev_get_asid(svm->vcpu.kvm);
- /* Perform any SEV-ES pre-run actions */
- pre_sev_es_run(svm);
-
/* Assign the asid allocated with this SEV guest */
svm->asid = asid;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index b649f92287a2..e088086f3de6 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -212,7 +212,7 @@ DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
* RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
* defer the restoration of TSC_AUX until the CPU returns to userspace.
*/
-#define TSC_AUX_URET_SLOT 0
+static int tsc_aux_uret_slot __read_mostly = -1;
static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
@@ -447,6 +447,11 @@ static int has_svm(void)
return 0;
}
+ if (pgtable_l5_enabled()) {
+ pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n");
+ return 0;
+ }
+
return 1;
}
@@ -858,8 +863,8 @@ static __init void svm_adjust_mmio_mask(void)
return;
/* If memory encryption is not enabled, use existing mask */
- rdmsrl(MSR_K8_SYSCFG, msr);
- if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ rdmsrl(MSR_AMD64_SYSCFG, msr);
+ if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
return;
enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
@@ -959,8 +964,7 @@ static __init int svm_hardware_setup(void)
kvm_tsc_scaling_ratio_frac_bits = 32;
}
- if (boot_cpu_has(X86_FEATURE_RDTSCP))
- kvm_define_user_return_msr(TSC_AUX_URET_SLOT, MSR_TSC_AUX);
+ tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
/* Check for pause filtering support */
if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
@@ -1006,9 +1010,7 @@ static __init int svm_hardware_setup(void)
}
if (avic) {
- if (!npt_enabled ||
- !boot_cpu_has(X86_FEATURE_AVIC) ||
- !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
+ if (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)) {
avic = false;
} else {
pr_info("AVIC enabled\n");
@@ -1100,7 +1102,9 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
return svm->vmcb->control.tsc_offset;
}
-static void svm_check_invpcid(struct vcpu_svm *svm)
+/* Evaluate instruction intercepts that depend on guest CPUID features. */
+static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
+ struct vcpu_svm *svm)
{
/*
* Intercept INVPCID if shadow paging is enabled to sync/free shadow
@@ -1113,6 +1117,13 @@ static void svm_check_invpcid(struct vcpu_svm *svm)
else
svm_clr_intercept(svm, INTERCEPT_INVPCID);
}
+
+ if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
+ if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
+ svm_clr_intercept(svm, INTERCEPT_RDTSCP);
+ else
+ svm_set_intercept(svm, INTERCEPT_RDTSCP);
+ }
}
static void init_vmcb(struct kvm_vcpu *vcpu)
@@ -1235,8 +1246,8 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm->current_vmcb->asid_generation = 0;
svm->asid = 0;
- svm->nested.vmcb12_gpa = 0;
- svm->nested.last_vmcb12_gpa = 0;
+ svm->nested.vmcb12_gpa = INVALID_GPA;
+ svm->nested.last_vmcb12_gpa = INVALID_GPA;
vcpu->arch.hflags = 0;
if (!kvm_pause_in_guest(vcpu->kvm)) {
@@ -1248,7 +1259,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm_clr_intercept(svm, INTERCEPT_PAUSE);
}
- svm_check_invpcid(svm);
+ svm_recalc_instruction_intercepts(vcpu, svm);
/*
* If the host supports V_SPEC_CTRL then disable the interception
@@ -1424,6 +1435,9 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_unmap_ghcb(svm);
+
if (svm->guest_state_loaded)
return;
@@ -1445,8 +1459,8 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
}
}
- if (static_cpu_has(X86_FEATURE_RDTSCP))
- kvm_set_user_return_msr(TSC_AUX_URET_SLOT, svm->tsc_aux, -1ull);
+ if (likely(tsc_aux_uret_slot >= 0))
+ kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
svm->guest_state_loaded = true;
}
@@ -2655,11 +2669,6 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
break;
case MSR_TSC_AUX:
- if (!boot_cpu_has(X86_FEATURE_RDTSCP))
- return 1;
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
- return 1;
msr_info->data = svm->tsc_aux;
break;
/*
@@ -2876,30 +2885,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
break;
case MSR_TSC_AUX:
- if (!boot_cpu_has(X86_FEATURE_RDTSCP))
- return 1;
-
- if (!msr->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
- return 1;
-
- /*
- * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
- * incomplete and conflicting architectural behavior. Current
- * AMD CPUs completely ignore bits 63:32, i.e. they aren't
- * reserved and always read as zeros. Emulate AMD CPU behavior
- * to avoid explosions if the vCPU is migrated from an AMD host
- * to an Intel host.
- */
- data = (u32)data;
-
/*
* TSC_AUX is usually changed only during boot and never read
* directly. Intercept TSC_AUX instead of exposing it to the
* guest via direct_access_msrs, and switch it via user return.
*/
preempt_disable();
- r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
+ r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
preempt_enable();
if (r)
return 1;
@@ -3084,6 +3076,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_STGI] = stgi_interception,
[SVM_EXIT_CLGI] = clgi_interception,
[SVM_EXIT_SKINIT] = skinit_interception,
+ [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op,
[SVM_EXIT_WBINVD] = kvm_emulate_wbinvd,
[SVM_EXIT_MONITOR] = kvm_emulate_monitor,
[SVM_EXIT_MWAIT] = kvm_emulate_mwait,
@@ -3972,8 +3965,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
- /* Check again if INVPCID interception if required */
- svm_check_invpcid(svm);
+ svm_recalc_instruction_intercepts(vcpu, svm);
/* For sev guests, the memory encryption bit is not reserved in CR3. */
if (sev_guest(vcpu->kvm)) {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 84b3133c2251..2908c6ab5bb4 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -20,6 +20,7 @@
#include <linux/bits.h>
#include <asm/svm.h>
+#include <asm/sev-common.h>
#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
@@ -479,7 +480,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
-extern int avic;
+extern bool avic;
static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
{
@@ -525,40 +526,9 @@ void svm_vcpu_unblocking(struct kvm_vcpu *vcpu);
/* sev.c */
-#define GHCB_VERSION_MAX 1ULL
-#define GHCB_VERSION_MIN 1ULL
-
-#define GHCB_MSR_INFO_POS 0
-#define GHCB_MSR_INFO_MASK (BIT_ULL(12) - 1)
-
-#define GHCB_MSR_SEV_INFO_RESP 0x001
-#define GHCB_MSR_SEV_INFO_REQ 0x002
-#define GHCB_MSR_VER_MAX_POS 48
-#define GHCB_MSR_VER_MAX_MASK 0xffff
-#define GHCB_MSR_VER_MIN_POS 32
-#define GHCB_MSR_VER_MIN_MASK 0xffff
-#define GHCB_MSR_CBIT_POS 24
-#define GHCB_MSR_CBIT_MASK 0xff
-#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \
- ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \
- (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \
- (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \
- GHCB_MSR_SEV_INFO_RESP)
-
-#define GHCB_MSR_CPUID_REQ 0x004
-#define GHCB_MSR_CPUID_RESP 0x005
-#define GHCB_MSR_CPUID_FUNC_POS 32
-#define GHCB_MSR_CPUID_FUNC_MASK 0xffffffff
-#define GHCB_MSR_CPUID_VALUE_POS 32
-#define GHCB_MSR_CPUID_VALUE_MASK 0xffffffff
-#define GHCB_MSR_CPUID_REG_POS 30
-#define GHCB_MSR_CPUID_REG_MASK 0x3
-
-#define GHCB_MSR_TERM_REQ 0x100
-#define GHCB_MSR_TERM_REASON_SET_POS 12
-#define GHCB_MSR_TERM_REASON_SET_MASK 0xf
-#define GHCB_MSR_TERM_REASON_POS 16
-#define GHCB_MSR_TERM_REASON_MASK 0xff
+#define GHCB_VERSION_MAX 1ULL
+#define GHCB_VERSION_MIN 1ULL
+
extern unsigned int max_sev_asid;
@@ -581,6 +551,7 @@ void sev_es_init_vmcb(struct vcpu_svm *svm);
void sev_es_create_vcpu(struct vcpu_svm *svm);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
+void sev_es_unmap_ghcb(struct vcpu_svm *svm);
/* vmenter.S */
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index a61c015870e3..4f839148948b 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1550,16 +1550,16 @@ TRACE_EVENT(kvm_nested_vmenter_failed,
TP_ARGS(msg, err),
TP_STRUCT__entry(
- __field(const char *, msg)
+ __string(msg, msg)
__field(u32, err)
),
TP_fast_assign(
- __entry->msg = msg;
+ __assign_str(msg, msg);
__entry->err = err;
),
- TP_printk("%s%s", __entry->msg, !__entry->err ? "" :
+ TP_printk("%s%s", __get_str(msg), !__entry->err ? "" :
__print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS))
);
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index d1d77985e889..aa0e7872fcc9 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -90,8 +90,7 @@ static inline bool cpu_has_vmx_preemption_timer(void)
static inline bool cpu_has_vmx_posted_intr(void)
{
- return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
- vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
+ return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
}
static inline bool cpu_has_load_ia32_efer(void)
@@ -398,6 +397,9 @@ static inline u64 vmx_supported_debugctl(void)
{
u64 debugctl = 0;
+ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+
if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)
debugctl |= DEBUGCTLMSR_LBR_MASK;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index bced76637823..6058a65a6ede 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3098,15 +3098,8 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
nested_vmx_handle_enlightened_vmptrld(vcpu, false);
if (evmptrld_status == EVMPTRLD_VMFAIL ||
- evmptrld_status == EVMPTRLD_ERROR) {
- pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
- __func__);
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
- KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ evmptrld_status == EVMPTRLD_ERROR)
return false;
- }
}
return true;
@@ -3194,8 +3187,16 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
- if (!nested_get_evmcs_page(vcpu))
+ if (!nested_get_evmcs_page(vcpu)) {
+ pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
+ __func__);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+
return false;
+ }
if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
return false;
@@ -4435,7 +4436,15 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
/* Similarly, triple faults in L2 should never escape. */
WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
- kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+ if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
+ /*
+ * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
+ * Enlightened VMCS after migration and we still need to
+ * do that when something is forcing L2->L1 exit prior to
+ * the first L2 run.
+ */
+ (void)nested_get_evmcs_page(vcpu);
+ }
/* Service the TLB flush request for L2 before switching to L1. */
if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 459748680daf..5f81ef092bd4 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -238,6 +238,20 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
/*
+ * Bail out of the block loop if the VM has an assigned
+ * device, but the blocking vCPU didn't reconfigure the
+ * PI.NV to the wakeup vector, i.e. the assigned device
+ * came along after the initial check in pi_pre_block().
+ */
+void vmx_pi_start_assignment(struct kvm *kvm)
+{
+ if (!irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
+}
+
+/*
* pi_update_irte - set IRTE for Posted-Interrupts
*
* @kvm: kvm
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 0bdc41391c5b..7f7b2326caf5 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -95,5 +95,6 @@ void __init pi_init_cpu(int cpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
bool set);
+void vmx_pi_start_assignment(struct kvm *kvm);
#endif /* __KVM_X86_VMX_POSTED_INTR_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d000cddbd734..50b42d7a8a11 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -455,21 +455,6 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
static unsigned long host_idt_base;
-/*
- * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
- * will emulate SYSCALL in legacy mode if the vendor string in guest
- * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
- * support this emulation, IA32_STAR must always be included in
- * vmx_uret_msrs_list[], even in i386 builds.
- */
-static const u32 vmx_uret_msrs_list[] = {
-#ifdef CONFIG_X86_64
- MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
-#endif
- MSR_EFER, MSR_TSC_AUX, MSR_STAR,
- MSR_IA32_TSX_CTRL,
-};
-
#if IS_ENABLED(CONFIG_HYPERV)
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
@@ -697,21 +682,11 @@ static bool is_valid_passthrough_msr(u32 msr)
return r;
}
-static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
-{
- int i;
-
- for (i = 0; i < vmx->nr_uret_msrs; ++i)
- if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr)
- return i;
- return -1;
-}
-
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
- i = __vmx_find_uret_msr(vmx, msr);
+ i = kvm_find_user_return_msr(msr);
if (i >= 0)
return &vmx->guest_uret_msrs[i];
return NULL;
@@ -720,13 +695,14 @@ struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
struct vmx_uret_msr *msr, u64 data)
{
+ unsigned int slot = msr - vmx->guest_uret_msrs;
int ret = 0;
u64 old_msr_data = msr->data;
msr->data = data;
- if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) {
+ if (msr->load_into_hardware) {
preempt_disable();
- ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask);
+ ret = kvm_set_user_return_msr(slot, msr->data, msr->mask);
preempt_enable();
if (ret)
msr->data = old_msr_data;
@@ -1078,7 +1054,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
return false;
}
- i = __vmx_find_uret_msr(vmx, MSR_EFER);
+ i = kvm_find_user_return_msr(MSR_EFER);
if (i < 0)
return false;
@@ -1240,11 +1216,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
*/
if (!vmx->guest_uret_msrs_loaded) {
vmx->guest_uret_msrs_loaded = true;
- for (i = 0; i < vmx->nr_active_uret_msrs; ++i)
- kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot,
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ if (!vmx->guest_uret_msrs[i].load_into_hardware)
+ continue;
+
+ kvm_set_user_return_msr(i,
vmx->guest_uret_msrs[i].data,
vmx->guest_uret_msrs[i].mask);
-
+ }
}
if (vmx->nested.need_vmcs12_to_shadow_sync)
@@ -1751,19 +1730,16 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
vmx_clear_hlt(vcpu);
}
-static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
+static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
+ bool load_into_hardware)
{
- struct vmx_uret_msr tmp;
- int from, to;
+ struct vmx_uret_msr *uret_msr;
- from = __vmx_find_uret_msr(vmx, msr);
- if (from < 0)
+ uret_msr = vmx_find_uret_msr(vmx, msr);
+ if (!uret_msr)
return;
- to = vmx->nr_active_uret_msrs++;
- tmp = vmx->guest_uret_msrs[to];
- vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from];
- vmx->guest_uret_msrs[from] = tmp;
+ uret_msr->load_into_hardware = load_into_hardware;
}
/*
@@ -1773,29 +1749,42 @@ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
*/
static void setup_msrs(struct vcpu_vmx *vmx)
{
- vmx->guest_uret_msrs_loaded = false;
- vmx->nr_active_uret_msrs = 0;
#ifdef CONFIG_X86_64
+ bool load_syscall_msrs;
+
/*
* The SYSCALL MSRs are only needed on long mode guests, and only
* when EFER.SCE is set.
*/
- if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
- vmx_setup_uret_msr(vmx, MSR_STAR);
- vmx_setup_uret_msr(vmx, MSR_LSTAR);
- vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK);
- }
+ load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
+ (vmx->vcpu.arch.efer & EFER_SCE);
+
+ vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
+ vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
+ vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
#endif
- if (update_transition_efer(vmx))
- vmx_setup_uret_msr(vmx, MSR_EFER);
+ vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
- if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
- vmx_setup_uret_msr(vmx, MSR_TSC_AUX);
+ vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
+ guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
- vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+ /*
+ * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
+ * kernel and old userspace. If those guests run on a tsx=off host, do
+ * allow guests to use TSX_CTRL, but don't change the value in hardware
+ * so that TSX remains always disabled.
+ */
+ vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
if (cpu_has_vmx_msr_bitmap())
vmx_update_msr_bitmap(&vmx->vcpu);
+
+ /*
+ * The set of MSRs to load may have changed, reload MSRs before the
+ * next VM-Enter.
+ */
+ vmx->guest_uret_msrs_loaded = false;
}
static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
@@ -1993,11 +1982,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
else
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
break;
- case MSR_TSC_AUX:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
- return 1;
- goto find_uret_msr;
case MSR_IA32_DEBUGCTLMSR:
msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
break;
@@ -2031,6 +2015,9 @@ static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
if (!intel_pmu_lbr_is_enabled(vcpu))
debugctl &= ~DEBUGCTLMSR_LBR_MASK;
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
+ debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
+
return debugctl;
}
@@ -2313,14 +2300,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
else
vmx->pt_desc.guest.addr_a[index / 2] = data;
break;
- case MSR_TSC_AUX:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
- return 1;
- /* Check reserved bit, higher 32 bits should be zero */
- if ((data >> 32) != 0)
- return 1;
- goto find_uret_msr;
case MSR_IA32_PERF_CAPABILITIES:
if (data && !vcpu_to_pmu(vcpu)->version)
return 1;
@@ -4369,7 +4348,23 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
xsaves_enabled, false);
}
- vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);
+ /*
+ * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
+ * feature is exposed to the guest. This creates a virtualization hole
+ * if both are supported in hardware but only one is exposed to the
+ * guest, but letting the guest execute RDTSCP or RDPID when either one
+ * is advertised is preferable to emulating the advertised instruction
+ * in KVM on #UD, and obviously better than incorrectly injecting #UD.
+ */
+ if (cpu_has_vmx_rdtscp()) {
+ bool rdpid_or_rdtscp_enabled =
+ guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
+
+ vmx_adjust_secondary_exec_control(vmx, &exec_control,
+ SECONDARY_EXEC_ENABLE_RDTSCP,
+ rdpid_or_rdtscp_enabled, false);
+ }
vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
@@ -4848,7 +4843,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_run *kvm_run = vcpu->run;
u32 intr_info, ex_no, error_code;
- unsigned long cr2, rip, dr6;
+ unsigned long cr2, dr6;
u32 vect_info;
vect_info = vmx->idt_vectoring_info;
@@ -4938,8 +4933,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
vmx->vcpu.arch.event_exit_inst_len =
vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
kvm_run->exit_reason = KVM_EXIT_DEBUG;
- rip = kvm_rip_read(vcpu);
- kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
+ kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
kvm_run->debug.arch.exception = ex_no;
break;
case AC_VECTOR:
@@ -6855,6 +6849,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
{
+ struct vmx_uret_msr *tsx_ctrl;
struct vcpu_vmx *vmx;
int i, cpu, err;
@@ -6877,43 +6872,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
goto free_vpid;
}
- BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
-
- for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
- u32 index = vmx_uret_msrs_list[i];
- u32 data_low, data_high;
- int j = vmx->nr_uret_msrs;
-
- if (rdmsr_safe(index, &data_low, &data_high) < 0)
- continue;
- if (wrmsr_safe(index, data_low, data_high) < 0)
- continue;
-
- vmx->guest_uret_msrs[j].slot = i;
- vmx->guest_uret_msrs[j].data = 0;
- switch (index) {
- case MSR_IA32_TSX_CTRL:
- /*
- * TSX_CTRL_CPUID_CLEAR is handled in the CPUID
- * interception. Keep the host value unchanged to avoid
- * changing CPUID bits under the host kernel's feet.
- *
- * hle=0, rtm=0, tsx_ctrl=1 can be found with some
- * combinations of new kernel and old userspace. If
- * those guests run on a tsx=off host, do allow guests
- * to use TSX_CTRL, but do not change the value on the
- * host so that TSX remains always disabled.
- */
- if (boot_cpu_has(X86_FEATURE_RTM))
- vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
- else
- vmx->guest_uret_msrs[j].mask = 0;
- break;
- default:
- vmx->guest_uret_msrs[j].mask = -1ull;
- break;
- }
- ++vmx->nr_uret_msrs;
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ vmx->guest_uret_msrs[i].data = 0;
+ vmx->guest_uret_msrs[i].mask = -1ull;
+ }
+ if (boot_cpu_has(X86_FEATURE_RTM)) {
+ /*
+ * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
+ * Keep the host value unchanged to avoid changing CPUID bits
+ * under the host kernel's feet.
+ */
+ tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+ if (tsx_ctrl)
+ vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
}
err = alloc_loaded_vmcs(&vmx->vmcs01);
@@ -7344,9 +7315,11 @@ static __init void vmx_set_cpu_caps(void)
if (!cpu_has_vmx_xsaves())
kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
- /* CPUID 0x80000001 */
- if (!cpu_has_vmx_rdtscp())
+ /* CPUID 0x80000001 and 0x7 (RDPID) */
+ if (!cpu_has_vmx_rdtscp()) {
kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
+ kvm_cpu_cap_clear(X86_FEATURE_RDPID);
+ }
if (cpu_has_vmx_waitpkg())
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
@@ -7402,8 +7375,9 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
/*
* RDPID causes #UD if disabled through secondary execution controls.
* Because it is marked as EmulateOnUD, we need to intercept it here.
+ * Note, RDPID is hidden behind ENABLE_RDTSCP.
*/
- case x86_intercept_rdtscp:
+ case x86_intercept_rdpid:
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
exception->vector = UD_VECTOR;
exception->error_code_valid = false;
@@ -7746,6 +7720,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.nested_ops = &vmx_nested_ops,
.update_pi_irte = pi_update_irte,
+ .start_assignment = vmx_pi_start_assignment,
#ifdef CONFIG_X86_64
.set_hv_timer = vmx_set_hv_timer,
@@ -7769,17 +7744,42 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
};
+static __init void vmx_setup_user_return_msrs(void)
+{
+
+ /*
+ * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
+ * will emulate SYSCALL in legacy mode if the vendor string in guest
+ * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
+ * support this emulation, MSR_STAR is included in the list for i386,
+ * but is never loaded into hardware. MSR_CSTAR is also never loaded
+ * into hardware and is here purely for emulation purposes.
+ */
+ const u32 vmx_uret_msrs_list[] = {
+ #ifdef CONFIG_X86_64
+ MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
+ #endif
+ MSR_EFER, MSR_TSC_AUX, MSR_STAR,
+ MSR_IA32_TSX_CTRL,
+ };
+ int i;
+
+ BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
+
+ for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
+ kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
+}
+
static __init int hardware_setup(void)
{
unsigned long host_bndcfgs;
struct desc_ptr dt;
- int r, i, ept_lpage_level;
+ int r, ept_lpage_level;
store_idt(&dt);
host_idt_base = dt.address;
- for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
- kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]);
+ vmx_setup_user_return_msrs();
if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
return -EIO;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 008cb87ff088..16e4e457ba23 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -36,7 +36,7 @@ struct vmx_msrs {
};
struct vmx_uret_msr {
- unsigned int slot; /* The MSR's slot in kvm_user_return_msrs. */
+ bool load_into_hardware;
u64 data;
u64 mask;
};
@@ -245,8 +245,16 @@ struct vcpu_vmx {
u32 idt_vectoring_info;
ulong rflags;
+ /*
+ * User return MSRs are always emulated when enabled in the guest, but
+ * only loaded into hardware when necessary, e.g. SYSCALL #UDs outside
+ * of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to
+ * be loaded into hardware if those conditions aren't met.
+ * nr_active_uret_msrs tracks the number of MSRs that need to be loaded
+ * into hardware when running the guest. guest_uret_msrs[] is resorted
+ * whenever the number of "active" uret MSRs is modified.
+ */
struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
- int nr_uret_msrs;
int nr_active_uret_msrs;
bool guest_uret_msrs_loaded;
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6eda2834fc05..6d3955a6a763 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -184,11 +184,6 @@ module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
*/
#define KVM_MAX_NR_USER_RETURN_MSRS 16
-struct kvm_user_return_msrs_global {
- int nr;
- u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
-};
-
struct kvm_user_return_msrs {
struct user_return_notifier urn;
bool registered;
@@ -198,7 +193,9 @@ struct kvm_user_return_msrs {
} values[KVM_MAX_NR_USER_RETURN_MSRS];
};
-static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
+u32 __read_mostly kvm_nr_uret_msrs;
+EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs);
+static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
static struct kvm_user_return_msrs __percpu *user_return_msrs;
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
@@ -330,23 +327,53 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
user_return_notifier_unregister(urn);
}
local_irq_restore(flags);
- for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
+ for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
values = &msrs->values[slot];
if (values->host != values->curr) {
- wrmsrl(user_return_msrs_global.msrs[slot], values->host);
+ wrmsrl(kvm_uret_msrs_list[slot], values->host);
values->curr = values->host;
}
}
}
-void kvm_define_user_return_msr(unsigned slot, u32 msr)
+static int kvm_probe_user_return_msr(u32 msr)
{
- BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
- user_return_msrs_global.msrs[slot] = msr;
- if (slot >= user_return_msrs_global.nr)
- user_return_msrs_global.nr = slot + 1;
+ u64 val;
+ int ret;
+
+ preempt_disable();
+ ret = rdmsrl_safe(msr, &val);
+ if (ret)
+ goto out;
+ ret = wrmsrl_safe(msr, val);
+out:
+ preempt_enable();
+ return ret;
}
-EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
+
+int kvm_add_user_return_msr(u32 msr)
+{
+ BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS);
+
+ if (kvm_probe_user_return_msr(msr))
+ return -1;
+
+ kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
+ return kvm_nr_uret_msrs++;
+}
+EXPORT_SYMBOL_GPL(kvm_add_user_return_msr);
+
+int kvm_find_user_return_msr(u32 msr)
+{
+ int i;
+
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ if (kvm_uret_msrs_list[i] == msr)
+ return i;
+ }
+ return -1;
+}
+EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);
static void kvm_user_return_msr_cpu_online(void)
{
@@ -355,8 +382,8 @@ static void kvm_user_return_msr_cpu_online(void)
u64 value;
int i;
- for (i = 0; i < user_return_msrs_global.nr; ++i) {
- rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ rdmsrl_safe(kvm_uret_msrs_list[i], &value);
msrs->values[i].host = value;
msrs->values[i].curr = value;
}
@@ -371,7 +398,7 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
value = (value & mask) | (msrs->values[slot].host & ~mask);
if (value == msrs->values[slot].curr)
return 0;
- err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
+ err = wrmsrl_safe(kvm_uret_msrs_list[slot], value);
if (err)
return 1;
@@ -1149,6 +1176,9 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
fixed |= DR6_RTM;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
+ fixed |= DR6_BUS_LOCK;
return fixed;
}
@@ -1615,6 +1645,30 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
* invokes 64-bit SYSENTER.
*/
data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
+ break;
+ case MSR_TSC_AUX:
+ if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
+ return 1;
+
+ if (!host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
+ return 1;
+
+ /*
+ * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
+ * incomplete and conflicting architectural behavior. Current
+ * AMD CPUs completely ignore bits 63:32, i.e. they aren't
+ * reserved and always read as zeros. Enforce Intel's reserved
+ * bits check if and only if the guest CPU is Intel, and clear
+ * the bits in all other cases. This ensures cross-vendor
+ * migration will provide consistent behavior for the guest.
+ */
+ if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
+ return 1;
+
+ data = (u32)data;
+ break;
}
msr.data = data;
@@ -1651,6 +1705,18 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
return KVM_MSR_RET_FILTERED;
+ switch (index) {
+ case MSR_TSC_AUX:
+ if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
+ return 1;
+
+ if (!host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
+ return 1;
+ break;
+ }
+
msr.index = index;
msr.host_initiated = host_initiated;
@@ -3006,6 +3072,19 @@ static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
+
+ if (!tdp_enabled) {
+ /*
+ * A TLB flush on behalf of the guest is equivalent to
+ * INVPCID(all), toggling CR4.PGE, etc., which requires
+ * a forced sync of the shadow page tables. Unload the
+ * entire MMU here and the subsequent load will sync the
+ * shadow page tables, and also flush the TLB.
+ */
+ kvm_mmu_unload(vcpu);
+ return;
+ }
+
static_call(kvm_x86_tlb_flush_guest)(vcpu);
}
@@ -3035,10 +3114,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
* expensive IPIs.
*/
if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
+ u8 st_preempted = xchg(&st->preempted, 0);
+
trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
- st->preempted & KVM_VCPU_FLUSH_TLB);
- if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
+ st_preempted & KVM_VCPU_FLUSH_TLB);
+ if (st_preempted & KVM_VCPU_FLUSH_TLB)
kvm_vcpu_flush_tlb_guest(vcpu);
+ } else {
+ st->preempted = 0;
}
vcpu->arch.st.preempted = 0;
@@ -3402,7 +3485,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_LASTBRANCHTOIP:
case MSR_IA32_LASTINTFROMIP:
case MSR_IA32_LASTINTTOIP:
- case MSR_K8_SYSCFG:
+ case MSR_AMD64_SYSCFG:
case MSR_K8_TSEG_ADDR:
case MSR_K8_TSEG_MASK:
case MSR_VM_HSAVE_PA:
@@ -5468,14 +5551,18 @@ static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
struct kvm_msr_filter_range *user_range)
{
- struct msr_bitmap_range range;
unsigned long *bitmap = NULL;
size_t bitmap_size;
- int r;
if (!user_range->nmsrs)
return 0;
+ if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE))
+ return -EINVAL;
+
+ if (!user_range->flags)
+ return -EINVAL;
+
bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
return -EINVAL;
@@ -5484,31 +5571,15 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
if (IS_ERR(bitmap))
return PTR_ERR(bitmap);
- range = (struct msr_bitmap_range) {
+ msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
.flags = user_range->flags,
.base = user_range->base,
.nmsrs = user_range->nmsrs,
.bitmap = bitmap,
};
- if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
- r = -EINVAL;
- goto err;
- }
-
- if (!range.flags) {
- r = -EINVAL;
- goto err;
- }
-
- /* Everything ok, add this range identifier. */
- msr_filter->ranges[msr_filter->count] = range;
msr_filter->count++;
-
return 0;
-err:
- kfree(bitmap);
- return r;
}
static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
@@ -5937,7 +6008,8 @@ static void kvm_init_msr_list(void)
continue;
break;
case MSR_TSC_AUX:
- if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
+ if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
+ !kvm_cpu_cap_has(X86_FEATURE_RDPID))
continue;
break;
case MSR_IA32_UMWAIT_CONTROL:
@@ -7171,6 +7243,11 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
+ ctxt->interruptibility = 0;
+ ctxt->have_exception = false;
+ ctxt->exception.vector = -1;
+ ctxt->perm_ok = false;
+
init_decode_cache(ctxt);
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
}
@@ -7506,14 +7583,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
kvm_vcpu_check_breakpoint(vcpu, &r))
return r;
- ctxt->interruptibility = 0;
- ctxt->have_exception = false;
- ctxt->exception.vector = -1;
- ctxt->perm_ok = false;
-
- ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
-
- r = x86_decode_insn(ctxt, insn, insn_len);
+ r = x86_decode_insn(ctxt, insn, insn_len, emulation_type);
trace_kvm_emulate_insn_start(vcpu);
++vcpu->stat.insn_emulation;
@@ -8040,6 +8110,18 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
/*
+ * Indirection to move queue_work() out of the tk_core.seq write held
+ * region to prevent possible deadlocks against time accessors which
+ * are invoked with work related locks held.
+ */
+static void pvclock_irq_work_fn(struct irq_work *w)
+{
+ queue_work(system_long_wq, &pvclock_gtod_work);
+}
+
+static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
+
+/*
* Notification about pvclock gtod data update.
*/
static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
@@ -8050,13 +8132,14 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
update_pvclock_gtod(tk);
- /* disable master clock if host does not trust, or does not
- * use, TSC based clocksource.
+ /*
+ * Disable master clock if host does not trust, or does not use,
+ * TSC based clocksource. Delegate queue_work() to irq_work as
+ * this is invoked with tk_core.seq write held.
*/
if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
atomic_read(&kvm_guest_has_master_clock) != 0)
- queue_work(system_long_wq, &pvclock_gtod_work);
-
+ irq_work_queue(&pvclock_irq_work);
return 0;
}
@@ -8118,6 +8201,7 @@ int kvm_arch_init(void *opaque)
printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
goto out_free_x86_emulator_cache;
}
+ kvm_nr_uret_msrs = 0;
r = kvm_mmu_module_init();
if (r)
@@ -8168,6 +8252,8 @@ void kvm_arch_exit(void)
cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
#ifdef CONFIG_X86_64
pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+ irq_work_sync(&pvclock_irq_work);
+ cancel_work_sync(&pvclock_gtod_work);
#endif
kvm_x86_ops.hardware_enable = NULL;
kvm_mmu_module_exit();
@@ -8289,6 +8375,9 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
vcpu->stat.directed_yield_attempted++;
+ if (single_task_running())
+ goto no_yield;
+
rcu_read_lock();
map = rcu_dereference(vcpu->kvm->arch.apic_map);
@@ -9425,7 +9514,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
if (r <= 0)
break;
- kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
+ kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);
@@ -10044,8 +10133,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
kvm_update_dr7(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
- get_segment_base(vcpu, VCPU_SREG_CS);
+ vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu);
/*
* Trigger an rflags update that will inject or remove the trace
@@ -11428,7 +11516,8 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
void kvm_arch_start_assignment(struct kvm *kvm)
{
- atomic_inc(&kvm->arch.assigned_device_count);
+ if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
+ static_call_cond(kvm_x86_start_assignment)(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index b93d6cd08a7f..121921b2927c 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -5,7 +5,7 @@
#include <xen/xen.h>
#include <asm/fpu/internal.h>
-#include <asm/sev-es.h>
+#include <asm/sev.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 1c548ad00752..6bda7f67d737 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -836,8 +836,8 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
if (si_code == SEGV_PKUERR)
force_sig_pkuerr((void __user *)address, pkey);
-
- force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+ else
+ force_sig_fault(SIGSEGV, si_code, (void __user *)address);
local_irq_disable();
}
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 04aba7e80a36..470b20208430 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -504,10 +504,6 @@ void __init sme_enable(struct boot_params *bp)
#define AMD_SME_BIT BIT(0)
#define AMD_SEV_BIT BIT(1)
- /* Check the SEV MSR whether SEV or SME is enabled */
- sev_status = __rdmsr(MSR_AMD64_SEV);
- feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
-
/*
* Check for the SME/SEV feature:
* CPUID Fn8000_001F[EAX]
@@ -519,17 +515,22 @@ void __init sme_enable(struct boot_params *bp)
eax = 0x8000001f;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
- if (!(eax & feature_mask))
+ /* Check whether SEV or SME is supported */
+ if (!(eax & (AMD_SEV_BIT | AMD_SME_BIT)))
return;
me_mask = 1UL << (ebx & 0x3f);
+ /* Check the SEV MSR whether SEV or SME is enabled */
+ sev_status = __rdmsr(MSR_AMD64_SEV);
+ feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
+
/* Check if memory encryption is enabled */
if (feature_mask == AMD_SME_BIT) {
/*
* No SME if Hypervisor bit is set. This check is here to
* prevent a guest from trying to enable SME. For running as a
- * KVM guest the MSR_K8_SYSCFG will be sufficient, but there
+ * KVM guest the MSR_AMD64_SYSCFG will be sufficient, but there
* might be other hypervisors which emulate that MSR as non-zero
* or even pass it through to the guest.
* A malicious hypervisor can still trick a guest into this
@@ -542,8 +543,8 @@ void __init sme_enable(struct boot_params *bp)
return;
/* For SME, check the SYSCFG MSR */
- msr = __rdmsr(MSR_K8_SYSCFG);
- if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ msr = __rdmsr(MSR_AMD64_SYSCFG);
+ if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
return;
} else {
/* SEV state cannot be controlled by a command line option */
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index ae744b6a0785..dd40d3fea74e 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -284,7 +284,7 @@ static int __init early_root_info_init(void)
/* need to take out [4G, TOM2) for RAM*/
/* SYS_CFG */
- address = MSR_K8_SYSCFG;
+ address = MSR_AMD64_SYSCFG;
rdmsrl(address, val);
/* TOP_MEM2 is enabled? */
if (val & (1<<21)) {
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index df7b5477fc4f..7515e78ef898 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -47,7 +47,7 @@
#include <asm/realmode.h>
#include <asm/time.h>
#include <asm/pgalloc.h>
-#include <asm/sev-es.h>
+#include <asm/sev.h>
/*
* We allocate runtime services regions top-down, starting from -4G, i.e.
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 7850111008a8..b15ebfe40a73 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -450,6 +450,18 @@ void __init efi_free_boot_services(void)
size -= rm_size;
}
+ /*
+ * Don't free memory under 1M for two reasons:
+ * - BIOS might clobber it
+ * - Crash kernel needs it to be reserved
+ */
+ if (start + size < SZ_1M)
+ continue;
+ if (start < SZ_1M) {
+ size -= (SZ_1M - start);
+ start = SZ_1M;
+ }
+
memblock_free_late(start, size);
}
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 1be71ef5e4c4..6534c92d0f83 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -9,7 +9,7 @@
#include <asm/realmode.h>
#include <asm/tlbflush.h>
#include <asm/crash.h>
-#include <asm/sev-es.h>
+#include <asm/sev.h>
struct real_mode_header *real_mode_header;
u32 *trampoline_cr4_features;
@@ -29,14 +29,16 @@ void __init reserve_real_mode(void)
/* Has to be under 1M so we can execute real-mode AP code. */
mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
- if (!mem) {
+ if (!mem)
pr_info("No sub-1M memory is available for the trampoline\n");
- return;
- }
+ else
+ set_real_mode_mem(mem);
- memblock_reserve(mem, size);
- set_real_mode_mem(mem);
- crash_reserve_low_1M();
+ /*
+ * Unconditionally reserve the entire fisrt 1M, see comment in
+ * setup_arch().
+ */
+ memblock_reserve(0, SZ_1M);
}
static void sme_sev_setup_real_mode(struct trampoline_header *th)
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index 84c5d1b33d10..cc8391f86cdb 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -123,9 +123,9 @@ SYM_CODE_START(startup_32)
*/
btl $TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags
jnc .Ldone
- movl $MSR_K8_SYSCFG, %ecx
+ movl $MSR_AMD64_SYSCFG, %ecx
rdmsr
- bts $MSR_K8_SYSCFG_MEM_ENCRYPT_BIT, %eax
+ bts $MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT, %eax
jc .Ldone
/*
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 17503fed2017..e87699aa2dc8 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1273,16 +1273,16 @@ asmlinkage __visible void __init xen_start_kernel(void)
/* Get mfn list */
xen_build_dynamic_phys_to_machine();
+ /* Work out if we support NX */
+ get_cpu_cap(&boot_cpu_data);
+ x86_configure_nx();
+
/*
* Set up kernel GDT and segment registers, mainly so that
* -fstack-protector code can be executed.
*/
xen_setup_gdt(0);
- /* Work out if we support NX */
- get_cpu_cap(&boot_cpu_data);
- x86_configure_nx();
-
/* Determine virtual and physical address sizes */
get_cpu_address_sizes(&boot_cpu_data);