path: root/arch/x86/kvm
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/Kconfig           |  11
-rw-r--r--  arch/x86/kvm/debugfs.c         |   3
-rw-r--r--  arch/x86/kvm/emulate.c         |  45
-rw-r--r--  arch/x86/kvm/hyperv.c          |  50
-rw-r--r--  arch/x86/kvm/hyperv.h          |   3
-rw-r--r--  arch/x86/kvm/kvm_emulate.h     |   2
-rw-r--r--  arch/x86/kvm/lapic.c           |  27
-rw-r--r--  arch/x86/kvm/mmu/mmu.c         |  37
-rw-r--r--  arch/x86/kvm/mmu/page_track.c  |  68
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.c     | 124
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.h     |   2
-rw-r--r--  arch/x86/kvm/smm.c             |  15
-rw-r--r--  arch/x86/kvm/svm/svm.c         |  25
-rw-r--r--  arch/x86/kvm/svm/svm_ops.h     |   6
-rw-r--r--  arch/x86/kvm/trace.h           |   9
-rw-r--r--  arch/x86/kvm/vmx/nested.c      |   2
-rw-r--r--  arch/x86/kvm/vmx/pmu_intel.c   |   2
-rw-r--r--  arch/x86/kvm/vmx/run_flags.h   |   7
-rw-r--r--  arch/x86/kvm/vmx/vmenter.S     |   9
-rw-r--r--  arch/x86/kvm/vmx/vmx.c         | 109
-rw-r--r--  arch/x86/kvm/vmx/vmx.h         |   2
-rw-r--r--  arch/x86/kvm/vmx/vmx_ops.h     |   6
-rw-r--r--  arch/x86/kvm/x86.c             | 174
23 files changed, 459 insertions, 279 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 87e3da7b0439..8c3032a96caf 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -7,7 +7,6 @@ source "virt/kvm/Kconfig"
menuconfig VIRTUALIZATION
bool "Virtualization"
- depends on HAVE_KVM || X86
default y
help
Say Y here to get to see options for using your Linux host to run other
@@ -20,7 +19,6 @@ if VIRTUALIZATION
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
- depends on HAVE_KVM
depends on HIGH_RES_TIMERS
depends on X86_LOCAL_APIC
select KVM_COMMON
@@ -29,9 +27,9 @@ config KVM
select HAVE_KVM_PFNCACHE
select HAVE_KVM_DIRTY_RING_TSO
select HAVE_KVM_DIRTY_RING_ACQ_REL
- select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_IRQ_ROUTING
+ select HAVE_KVM_READONLY_MEM
select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
@@ -80,9 +78,10 @@ config KVM_SW_PROTECTED_VM
depends on KVM && X86_64
select KVM_GENERIC_PRIVATE_MEM
help
- Enable support for KVM software-protected VMs. Currently "protected"
- means the VM can be backed with memory provided by
- KVM_CREATE_GUEST_MEMFD.
+ Enable support for KVM software-protected VMs. Currently, software-
+ protected VMs are purely a development and testing vehicle for
+ KVM_CREATE_GUEST_MEMFD. Attempting to run a "real" VM workload as a
+ software-protected VM will fail miserably.
If unsure, say "N".
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index 95ea1a1f7403..999227fc7c66 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -189,9 +189,8 @@ static const struct file_operations mmu_rmaps_stat_fops = {
.release = kvm_mmu_rmaps_stat_release,
};
-int kvm_arch_create_vm_debugfs(struct kvm *kvm)
+void kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm,
&mmu_rmaps_stat_fops);
- return 0;
}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e223043ef5b2..3328c7c7f2f9 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1820,22 +1820,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
return X86EMUL_CONTINUE;
}
-static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
+static int emulate_push(struct x86_emulate_ctxt *ctxt, const void *data, int len)
{
struct segmented_address addr;
- rsp_increment(ctxt, -bytes);
+ rsp_increment(ctxt, -len);
addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
addr.seg = VCPU_SREG_SS;
- return segmented_write(ctxt, addr, data, bytes);
+ return segmented_write(ctxt, addr, data, len);
}
static int em_push(struct x86_emulate_ctxt *ctxt)
{
/* Disable writeback. */
ctxt->dst.type = OP_NONE;
- return push(ctxt, &ctxt->src.val, ctxt->op_bytes);
+ return emulate_push(ctxt, &ctxt->src.val, ctxt->op_bytes);
}
static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1863,7 +1863,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
void *dest, int len)
{
int rc;
- unsigned long val, change_mask;
+ unsigned long val = 0;
+ unsigned long change_mask;
int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
int cpl = ctxt->ops->cpl(ctxt);
@@ -1920,7 +1921,7 @@ static int em_enter(struct x86_emulate_ctxt *ctxt)
return X86EMUL_UNHANDLEABLE;
rbp = reg_read(ctxt, VCPU_REGS_RBP);
- rc = push(ctxt, &rbp, stack_size(ctxt));
+ rc = emulate_push(ctxt, &rbp, stack_size(ctxt));
if (rc != X86EMUL_CONTINUE)
return rc;
assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP),
@@ -1954,7 +1955,7 @@ static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
{
int seg = ctxt->src2.val;
- unsigned long selector;
+ unsigned long selector = 0;
int rc;
rc = emulate_pop(ctxt, &selector, 2);
@@ -2000,7 +2001,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
{
int rc = X86EMUL_CONTINUE;
int reg = VCPU_REGS_RDI;
- u32 val;
+ u32 val = 0;
while (reg >= VCPU_REGS_RAX) {
if (reg == VCPU_REGS_RSP) {
@@ -2229,7 +2230,7 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
static int em_ret(struct x86_emulate_ctxt *ctxt)
{
int rc;
- unsigned long eip;
+ unsigned long eip = 0;
rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
@@ -2241,7 +2242,8 @@ static int em_ret(struct x86_emulate_ctxt *ctxt)
static int em_ret_far(struct x86_emulate_ctxt *ctxt)
{
int rc;
- unsigned long eip, cs;
+ unsigned long eip = 0;
+ unsigned long cs = 0;
int cpl = ctxt->ops->cpl(ctxt);
struct desc_struct new_desc;
@@ -3011,7 +3013,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
ret = em_push(ctxt);
}
- ops->get_dr(ctxt, 7, &dr7);
+ dr7 = ops->get_dr(ctxt, 7);
ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN));
return ret;
@@ -3184,7 +3186,7 @@ fail:
static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
{
int rc;
- unsigned long eip;
+ unsigned long eip = 0;
rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
@@ -3866,15 +3868,6 @@ static int check_cr_access(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
-{
- unsigned long dr7;
-
- ctxt->ops->get_dr(ctxt, 7, &dr7);
-
- return dr7 & DR7_GD;
-}
-
static int check_dr_read(struct x86_emulate_ctxt *ctxt)
{
int dr = ctxt->modrm_reg;
@@ -3887,10 +3880,10 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt)
if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
return emulate_ud(ctxt);
- if (check_dr7_gd(ctxt)) {
+ if (ctxt->ops->get_dr(ctxt, 7) & DR7_GD) {
ulong dr6;
- ctxt->ops->get_dr(ctxt, 6, &dr6);
+ dr6 = ctxt->ops->get_dr(ctxt, 6);
dr6 &= ~DR_TRAP_BITS;
dr6 |= DR6_BD | DR6_ACTIVE_LOW;
ctxt->ops->set_dr(ctxt, 6, dr6);
@@ -4505,11 +4498,11 @@ static const struct instr_dual instr_dual_0f_38_f1 = {
};
static const struct gprefix three_byte_0f_38_f0 = {
- ID(0, &instr_dual_0f_38_f0), N, N, N
+ ID(0, &instr_dual_0f_38_f0), ID(0, &instr_dual_0f_38_f0), N, N
};
static const struct gprefix three_byte_0f_38_f1 = {
- ID(0, &instr_dual_0f_38_f1), N, N, N
+ ID(0, &instr_dual_0f_38_f1), ID(0, &instr_dual_0f_38_f1), N, N
};
/*
@@ -5449,7 +5442,7 @@ twobyte_insn:
ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg);
break;
case 0x21: /* mov from dr to reg */
- ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
+ ctxt->dst.val = ops->get_dr(ctxt, ctxt->modrm_reg);
break;
case 0x40 ... 0x4f: /* cmov */
if (test_cc(ctxt->b, ctxt->eflags))
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 4943f6b2bbee..8a47f8541eab 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1322,6 +1322,56 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr)
return false;
}
+#define KVM_HV_WIN2016_GUEST_ID 0x1040a00003839
+#define KVM_HV_WIN2016_GUEST_ID_MASK (~GENMASK_ULL(23, 16)) /* mask out the service version */
+
+/*
+ * Hyper-V enabled Windows Server 2016 SMP VMs fail to boot in a !XSAVES &&
+ * XSAVEC configuration. Such a configuration can result from, for example,
+ * the AMD Erratum 1386 workaround.
+ *
+ * Print a notice so users aren't left wondering what's suddenly gone wrong.
+ */
+static void __kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ /* Check again under the hv_lock. */
+ if (hv->xsaves_xsavec_checked)
+ return;
+
+ if ((hv->hv_guest_os_id & KVM_HV_WIN2016_GUEST_ID_MASK) !=
+ KVM_HV_WIN2016_GUEST_ID)
+ return;
+
+ hv->xsaves_xsavec_checked = true;
+
+ /* UP configurations aren't affected */
+ if (atomic_read(&kvm->online_vcpus) < 2)
+ return;
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_XSAVES) ||
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVEC))
+ return;
+
+ pr_notice_ratelimited("Booting SMP Windows KVM VM with !XSAVES && XSAVEC. "
+ "If it fails to boot try disabling XSAVEC in the VM config.\n");
+}
+
+void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu)
+{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (!vcpu->arch.hyperv_enabled ||
+ hv->xsaves_xsavec_checked)
+ return;
+
+ mutex_lock(&hv->hv_lock);
+ __kvm_hv_xsaves_xsavec_maybe_warn(vcpu);
+ mutex_unlock(&hv->hv_lock);
+}
+
static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
bool host)
{
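As an aside, the guest-ID test above reduces to a single masked compare. A
minimal userspace sketch, with GENMASK_ULL(23, 16) open-coded and a
hypothetical HV_X64_MSR_GUEST_OS_ID value (the real code reads
hv->hv_guest_os_id under hv_lock):

#include <stdint.h>
#include <stdio.h>

#define KVM_HV_WIN2016_GUEST_ID       0x1040a00003839ULL
#define KVM_HV_WIN2016_GUEST_ID_MASK  (~0xff0000ULL)  /* GENMASK_ULL(23, 16) */

int main(void)
{
        /* Hypothetical guest OS ID reporting service version 0x12. */
        uint64_t guest_os_id = 0x1040a00123839ULL;

        if ((guest_os_id & KVM_HV_WIN2016_GUEST_ID_MASK) ==
            KVM_HV_WIN2016_GUEST_ID)
                printf("Windows Server 2016 guest detected\n");
        return 0;
}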
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 1dc0b6604526..923e64903da9 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -182,6 +182,8 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
struct pvclock_vcpu_time_info *hv_clock);
void kvm_hv_request_tsc_page_update(struct kvm *kvm);
+void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu);
+
void kvm_hv_init_vm(struct kvm *kvm);
void kvm_hv_destroy_vm(struct kvm *kvm);
int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
@@ -267,6 +269,7 @@ int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu);
static inline void kvm_hv_setup_tsc_page(struct kvm *kvm,
struct pvclock_vcpu_time_info *hv_clock) {}
static inline void kvm_hv_request_tsc_page_update(struct kvm *kvm) {}
+static inline void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu) {}
static inline void kvm_hv_init_vm(struct kvm *kvm) {}
static inline void kvm_hv_destroy_vm(struct kvm *kvm) {}
static inline int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index e6d149825169..153977c556f4 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -203,7 +203,7 @@ struct x86_emulate_ops {
ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
int (*cpl)(struct x86_emulate_ctxt *ctxt);
- void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
+ ulong (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr);
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3242f3da2457..681f6d82d015 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -124,6 +124,9 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
}
+__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
+EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
+
__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ);
__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ);
@@ -2466,8 +2469,10 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!vcpu->arch.apic)
+ if (!vcpu->arch.apic) {
+ static_branch_dec(&kvm_has_noapic_vcpu);
return;
+ }
hrtimer_cancel(&apic->lapic_timer.timer);
@@ -2809,6 +2814,11 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
ASSERT(vcpu != NULL);
+ if (!irqchip_in_kernel(vcpu->kvm)) {
+ static_branch_inc(&kvm_has_noapic_vcpu);
+ return 0;
+ }
+
apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT);
if (!apic)
goto nomem;
@@ -2844,6 +2854,21 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
+ /*
+ * Defer evaluating inhibits until the vCPU is first run, as this vCPU
+ * will not get notified of any changes until this vCPU is visible to
+ * other vCPUs (marked online and added to the set of vCPUs).
+ *
+ * Opportunistically mark APICv active as VMX in particular is highly
+ * unlikely to have inhibits. Ignore the current per-VM APICv state so
+ * that vCPU creation is guaranteed to run with a deterministic value,
+ * the request will ensure the vCPU gets the correct state before VM-Entry.
+ */
+ if (enable_apicv) {
+ apic->apicv_active = true;
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
+
return 0;
nomem_free_apic:
kfree(apic);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 2d6cdeab1f8a..e5e2af69e24d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3575,10 +3575,14 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
if (WARN_ON_ONCE(!sp))
return;
- if (is_tdp_mmu_page(sp))
+ if (is_tdp_mmu_page(sp)) {
+ lockdep_assert_held_read(&kvm->mmu_lock);
kvm_tdp_mmu_put_root(kvm, sp);
- else if (!--sp->root_count && sp->role.invalid)
- kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ } else {
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ if (!--sp->root_count && sp->role.invalid)
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ }
*root_hpa = INVALID_PAGE;
}
@@ -3587,6 +3591,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
ulong roots_to_free)
{
+ bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct;
int i;
LIST_HEAD(invalid_list);
bool free_active_root;
@@ -3609,7 +3614,10 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
return;
}
- write_lock(&kvm->mmu_lock);
+ if (is_tdp_mmu)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
@@ -3635,8 +3643,13 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
mmu->root.pgd = 0;
}
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
- write_unlock(&kvm->mmu_lock);
+ if (is_tdp_mmu) {
+ read_unlock(&kvm->mmu_lock);
+ WARN_ON_ONCE(!list_empty(&invalid_list));
+ } else {
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ write_unlock(&kvm->mmu_lock);
+ }
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
@@ -3693,15 +3706,15 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
unsigned i;
int r;
+ if (tdp_mmu_enabled)
+ return kvm_tdp_mmu_alloc_root(vcpu);
+
write_lock(&vcpu->kvm->mmu_lock);
r = make_mmu_pages_available(vcpu);
if (r < 0)
goto out_unlock;
- if (tdp_mmu_enabled) {
- root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
- mmu->root.hpa = root;
- } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
+ if (shadow_root_level >= PT64_ROOT_4LEVEL) {
root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
mmu->root.hpa = root;
} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
@@ -6997,9 +7010,7 @@ int kvm_mmu_vendor_module_init(void)
kvm_mmu_reset_all_pte_masks();
- pte_list_desc_cache = kmem_cache_create("pte_list_desc",
- sizeof(struct pte_list_desc),
- 0, SLAB_ACCOUNT, NULL);
+ pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT);
if (!pte_list_desc_cache)
goto out;
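The KMEM_CACHE() conversion above is purely cosmetic: the macro, from
<linux/slab.h>, derives the cache name, size, and alignment from the struct,
expanding to roughly the open-coded kmem_cache_create() call it replaces
(approximate expansion shown; check the running kernel's header for the exact
definition):

#define KMEM_CACHE(__struct, __flags)                                   \
        kmem_cache_create(#__struct, sizeof(struct __struct),           \
                          __alignof__(struct __struct), (__flags), NULL)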
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index c87da11f3a04..f6448284c18e 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -20,10 +20,23 @@
#include "mmu_internal.h"
#include "page_track.h"
+static bool kvm_external_write_tracking_enabled(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+ /*
+ * Read external_write_tracking_enabled before related pointers. Pairs
+ * with the smp_store_release in kvm_page_track_write_tracking_enable().
+ */
+ return smp_load_acquire(&kvm->arch.external_write_tracking_enabled);
+#else
+ return false;
+#endif
+}
+
bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
{
- return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) ||
- !tdp_enabled || kvm_shadow_root_allocated(kvm);
+ return kvm_external_write_tracking_enabled(kvm) ||
+ kvm_shadow_root_allocated(kvm) || !tdp_enabled;
}
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
@@ -153,6 +166,50 @@ int kvm_page_track_init(struct kvm *kvm)
return init_srcu_struct(&head->track_srcu);
}
+static int kvm_enable_external_write_tracking(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ int r = 0, i, bkt;
+
+ mutex_lock(&kvm->slots_arch_lock);
+
+ /*
+ * Check for *any* write tracking user (not just external users) under
+ * lock. This avoids unnecessary work, e.g. if KVM itself is using
+ * write tracking, or if two external users raced when registering.
+ */
+ if (kvm_page_track_write_tracking_enabled(kvm))
+ goto out_success;
+
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, bkt, slots) {
+ /*
+ * Intentionally do NOT free allocations on failure to
+ * avoid having to track which allocations were made
+ * now versus when the memslot was created. The
+ * metadata is guaranteed to be freed when the slot is
+ * freed, and will be kept/used if userspace retries
+ * the failed ioctl() instead of killing the VM.
+ */
+ r = kvm_page_track_write_tracking_alloc(slot);
+ if (r)
+ goto out_unlock;
+ }
+ }
+
+out_success:
+ /*
+ * Ensure that external_write_tracking_enabled becomes true strictly
+ * after all the related pointers are set.
+ */
+ smp_store_release(&kvm->arch.external_write_tracking_enabled, true);
+out_unlock:
+ mutex_unlock(&kvm->slots_arch_lock);
+ return r;
+}
+
/*
* register the notifier so that event interception for the tracked guest
* pages can be received.
@@ -161,10 +218,17 @@ int kvm_page_track_register_notifier(struct kvm *kvm,
struct kvm_page_track_notifier_node *n)
{
struct kvm_page_track_notifier_head *head;
+ int r;
if (!kvm || kvm->mm != current->mm)
return -ESRCH;
+ if (!kvm_external_write_tracking_enabled(kvm)) {
+ r = kvm_enable_external_write_tracking(kvm);
+ if (r)
+ return r;
+ }
+
kvm_get_kvm(kvm);
head = &kvm->arch.track_notifier_head;
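A minimal userspace analogue (C11 atomics, illustrative names) of the
acquire/release pairing used above: the release store publishes the tracking
metadata, so any reader whose acquire load observes the flag as set is also
guaranteed to observe the metadata pointers:

#include <stdatomic.h>
#include <stdbool.h>

struct tracking {
        void *metadata;               /* stands in for the per-slot arrays */
        _Atomic bool enabled;         /* external_write_tracking_enabled */
};

static void enable_tracking(struct tracking *t, void *meta)
{
        t->metadata = meta;           /* all writes before the release... */
        atomic_store_explicit(&t->enabled, true, memory_order_release);
}

static void *get_metadata(struct tracking *t)
{
        /* ...are visible to an acquire load that observes 'true'. */
        if (atomic_load_explicit(&t->enabled, memory_order_acquire))
                return t->metadata;
        return NULL;
}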
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 6ae19b4ee5b1..d078157e62aa 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -149,11 +149,11 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* If shared is set, this function is operating under the MMU lock in read
* mode.
*/
-#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)\
- for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \
- ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
- _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \
- if (kvm_mmu_page_as_id(_root) != _as_id) { \
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \
+ ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \
+ if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \
} else
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
@@ -171,12 +171,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* Holding mmu_lock for write obviates the need for RCU protection as the list
* is guaranteed to be stable.
*/
-#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
- list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
- if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
- kvm_mmu_page_as_id(_root) != _as_id) { \
+#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _only_valid) \
+ list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
+ if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
+ ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
+ ((_only_valid) && (_root)->role.invalid))) { \
} else
+#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
+ __for_each_tdp_mmu_root(_kvm, _root, _as_id, false)
+
+#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
+ __for_each_tdp_mmu_root(_kvm, _root, _as_id, true)
+
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *sp;
@@ -216,22 +223,41 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
+int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
{
- union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ union kvm_mmu_page_role role = mmu->root_role;
+ int as_id = kvm_mmu_role_as_id(role);
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *root;
- lockdep_assert_held_write(&kvm->mmu_lock);
+ /*
+ * Check for an existing root before acquiring the pages lock to avoid
+ * unnecessary serialization if multiple vCPUs are loading a new root.
+ * E.g. when bringing up secondary vCPUs, KVM will already have created
+ * a valid root on behalf of the primary vCPU.
+ */
+ read_lock(&kvm->mmu_lock);
+
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
+ if (root->role.word == role.word)
+ goto out_read_unlock;
+ }
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
/*
- * Check for an existing root before allocating a new one. Note, the
- * role check prevents consuming an invalid root.
+ * Recheck for an existing root after acquiring the pages lock, as another
+ * vCPU may have raced ahead and created a new usable root. Manually
+ * walk the list of roots as the standard macros assume that the pages
+ * lock is *not* held. WARN if grabbing a reference to a usable root
+ * fails, as the last reference to a root can only be put *after* the
+ * root has been invalidated, which requires holding mmu_lock for write.
*/
- for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
+ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
if (root->role.word == role.word &&
- kvm_tdp_mmu_get_root(root))
- goto out;
+ !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
+ goto out_spin_unlock;
}
root = tdp_mmu_alloc_sp(vcpu);
@@ -245,13 +271,20 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
* is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
*/
refcount_set(&root->tdp_mmu_root_count, 2);
-
- spin_lock(&kvm->arch.tdp_mmu_pages_lock);
list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
- spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
-out:
- return __pa(root->spt);
+out_spin_unlock:
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+out_read_unlock:
+ read_unlock(&kvm->mmu_lock);
+ /*
+ * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
+ * and actually consuming the root if it's invalidated after dropping
+ * mmu_lock, and the root can't be freed as this vCPU holds a reference.
+ */
+ mmu->root.hpa = __pa(root->spt);
+ mmu->root.pgd = 0;
+ return 0;
}
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
@@ -734,15 +767,26 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
/*
- * To avoid RCU stalls due to recursively removing huge swaths of SPs,
- * split the zap into two passes. On the first pass, zap at the 1gb
- * level, and then zap top-level SPs on the second pass. "1gb" is not
- * arbitrary, as KVM must be able to zap a 1gb shadow page without
- * inducing a stall to allow in-place replacement with a 1gb hugepage.
+ * Zap roots in multiple passes of decreasing granularity, i.e. zap at
+ * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
+ * preempt models) or mmu_lock contention (full or real-time models).
+ * Zapping at finer granularity marginally increases the total time of
+ * the zap, but in most cases the zap itself isn't latency sensitive.
*
- * Because zapping a SP recurses on its children, stepping down to
- * PG_LEVEL_4K in the iterator itself is unnecessary.
+ * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
+ * in order to mimic the page fault path, which can replace a 1GiB page
+ * table with an equivalent 1GiB hugepage, i.e. can get saddled with
+ * zapping a 1GiB region that's fully populated with 4KiB SPTEs. This
+ * allows verifying that KVM can safely zap 1GiB regions, e.g. without
+ * inducing RCU stalls, without relying on a relatively rare event
+ * (zapping roots is orders of magnitude more common). Note, because
+ * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
+ * in the iterator itself is unnecessary.
*/
+ if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
+ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
+ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
+ }
__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
@@ -800,7 +844,13 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
continue;
tdp_mmu_iter_set_spte(kvm, &iter, 0);
- flush = true;
+
+ /*
+ * Zapping SPTEs in invalid roots doesn't require a TLB flush,
+ * see kvm_tdp_mmu_zap_invalidated_roots() for details.
+ */
+ if (!root->role.invalid)
+ flush = true;
}
rcu_read_unlock();
@@ -813,16 +863,16 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
}
/*
- * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
- * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
- * more SPTEs were zapped since the MMU lock was last acquired.
+ * Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID* roots.
+ * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
+ * one or more SPTEs were zapped since the MMU lock was last acquired.
*/
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
struct kvm_mmu_page *root;
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root_yield_safe(kvm, root)
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
return flush;
@@ -896,7 +946,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
* the VM is being destroyed).
*
* Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
- * See kvm_tdp_mmu_get_vcpu_root_hpa().
+ * See kvm_tdp_mmu_alloc_root().
*/
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
@@ -1622,7 +1672,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
{
struct kvm_mmu_page *root;
- for_each_tdp_mmu_root(kvm, root, slot->as_id)
+ for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}
@@ -1740,7 +1790,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
bool spte_set = false;
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root(kvm, root, slot->as_id)
+ for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
return spte_set;
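The locking in kvm_tdp_mmu_alloc_root() above boils down to a generic
find-or-create pattern: an optimistic lookup under the shared lock, then a
recheck under the spinlock that serializes insertions. A self-contained
pthreads sketch, all names illustrative (the real code additionally juggles
refcounts and RCU):

#include <pthread.h>
#include <stdlib.h>

struct root { int key; struct root *next; };

static pthread_rwlock_t mmu_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t pages_lock = PTHREAD_MUTEX_INITIALIZER;
static struct root *roots;

static struct root *find(int key)
{
        for (struct root *r = roots; r; r = r->next)
                if (r->key == key)
                        return r;
        return NULL;
}

static struct root *get_or_alloc_root(int key)
{
        struct root *r;

        pthread_rwlock_rdlock(&mmu_lock);        /* read_lock(&kvm->mmu_lock) */
        r = find(key);                           /* fast path: root exists */
        if (!r) {
                pthread_mutex_lock(&pages_lock); /* tdp_mmu_pages_lock */
                r = find(key);                   /* recheck: lost the race? */
                if (!r && (r = calloc(1, sizeof(*r))) != NULL) {
                        r->key = key;
                        r->next = roots;         /* list_add_rcu() analogue */
                        roots = r;
                }
                pthread_mutex_unlock(&pages_lock);
        }
        pthread_rwlock_unlock(&mmu_lock);
        return r;
}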
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 20d97aa46c49..6e1ea04ca885 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -10,7 +10,7 @@
void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
+int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu);
__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
{
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index dc3d95fdca7d..d06d43d8d2aa 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -184,7 +184,6 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
struct kvm_smram_state_32 *smram)
{
struct desc_ptr dt;
- unsigned long val;
int i;
smram->cr0 = kvm_read_cr0(vcpu);
@@ -195,10 +194,8 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
for (i = 0; i < 8; i++)
smram->gprs[i] = kvm_register_read_raw(vcpu, i);
- kvm_get_dr(vcpu, 6, &val);
- smram->dr6 = (u32)val;
- kvm_get_dr(vcpu, 7, &val);
- smram->dr7 = (u32)val;
+ smram->dr6 = (u32)vcpu->arch.dr6;
+ smram->dr7 = (u32)vcpu->arch.dr7;
enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR);
enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR);
@@ -231,7 +228,6 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
struct kvm_smram_state_64 *smram)
{
struct desc_ptr dt;
- unsigned long val;
int i;
for (i = 0; i < 16; i++)
@@ -240,11 +236,8 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
smram->rip = kvm_rip_read(vcpu);
smram->rflags = kvm_get_rflags(vcpu);
-
- kvm_get_dr(vcpu, 6, &val);
- smram->dr6 = val;
- kvm_get_dr(vcpu, 7, &val);
- smram->dr7 = val;
+ smram->dr6 = vcpu->arch.dr6;
+ smram->dr7 = vcpu->arch.dr7;
smram->cr0 = kvm_read_cr0(vcpu);
smram->cr3 = kvm_read_cr3(vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e90b429c84f1..b9096bb79c00 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2735,7 +2735,6 @@ static int dr_interception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
int reg, dr;
- unsigned long val;
int err = 0;
/*
@@ -2763,11 +2762,9 @@ static int dr_interception(struct kvm_vcpu *vcpu)
dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
if (dr >= 16) { /* mov to DRn */
dr -= 16;
- val = kvm_register_read(vcpu, reg);
- err = kvm_set_dr(vcpu, dr, val);
+ err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
} else {
- kvm_get_dr(vcpu, dr, &val);
- kvm_register_write(vcpu, reg, val);
+ kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
}
return kvm_complete_insn_gp(vcpu, err);
@@ -4092,6 +4089,9 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
+
if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
to_svm(vcpu)->vmcb->control.exit_info_1)
return handle_fastpath_set_msr_irqoff(vcpu);
@@ -4115,12 +4115,13 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
guest_state_exit_irqoff();
}
-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
+static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
{
struct vcpu_svm *svm = to_svm(vcpu);
bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
- trace_kvm_entry(vcpu);
+ trace_kvm_entry(vcpu, force_immediate_exit);
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -4139,9 +4140,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
* is enough to force an immediate vmexit.
*/
disable_nmi_singlestep(svm);
- smp_send_reschedule(vcpu->cpu);
+ force_immediate_exit = true;
}
+ if (force_immediate_exit)
+ smp_send_reschedule(vcpu->cpu);
+
pre_svm_run(vcpu);
sync_lapic_to_cr8(vcpu);
@@ -4237,9 +4241,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
svm_complete_interrupts(vcpu);
- if (is_guest_mode(vcpu))
- return EXIT_FASTPATH_NONE;
-
return svm_exit_handlers_fastpath(vcpu);
}
@@ -4997,8 +4998,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.check_intercept = svm_check_intercept,
.handle_exit_irqoff = svm_handle_exit_irqoff,
- .request_immediate_exit = __kvm_request_immediate_exit,
-
.sched_in = svm_sched_in,
.nested_ops = &svm_nested_ops,
diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h
index 36c8af87a707..4e725854c63a 100644
--- a/arch/x86/kvm/svm/svm_ops.h
+++ b/arch/x86/kvm/svm/svm_ops.h
@@ -8,7 +8,7 @@
#define svm_asm(insn, clobber...) \
do { \
- asm_volatile_goto("1: " __stringify(insn) "\n\t" \
+ asm goto("1: " __stringify(insn) "\n\t" \
_ASM_EXTABLE(1b, %l[fault]) \
::: clobber : fault); \
return; \
@@ -18,7 +18,7 @@ fault: \
#define svm_asm1(insn, op1, clobber...) \
do { \
- asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \
+ asm goto("1: " __stringify(insn) " %0\n\t" \
_ASM_EXTABLE(1b, %l[fault]) \
:: op1 : clobber : fault); \
return; \
@@ -28,7 +28,7 @@ fault: \
#define svm_asm2(insn, op1, op2, clobber...) \
do { \
- asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \
+ asm goto("1: " __stringify(insn) " %1, %0\n\t" \
_ASM_EXTABLE(1b, %l[fault]) \
:: op1, op2 : clobber : fault); \
return; \
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 83843379813e..88659de4d2a7 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -15,20 +15,23 @@
* Tracepoint for guest mode entry.
*/
TRACE_EVENT(kvm_entry,
- TP_PROTO(struct kvm_vcpu *vcpu),
- TP_ARGS(vcpu),
+ TP_PROTO(struct kvm_vcpu *vcpu, bool force_immediate_exit),
+ TP_ARGS(vcpu, force_immediate_exit),
TP_STRUCT__entry(
__field( unsigned int, vcpu_id )
__field( unsigned long, rip )
+ __field( bool, immediate_exit )
),
TP_fast_assign(
__entry->vcpu_id = vcpu->vcpu_id;
__entry->rip = kvm_rip_read(vcpu);
+ __entry->immediate_exit = force_immediate_exit;
),
- TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip)
+ TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip,
+ __entry->immediate_exit ? "[immediate exit]" : "")
);
/*
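With the new field, a forced immediate exit is visible directly in the trace
buffer. Rendered per the TP_printk() format above, entries look like this
(rip values illustrative):

kvm_entry: vcpu 0, rip 0xffffffff81234560
kvm_entry: vcpu 0, rip 0xffffffff81234560[immediate exit]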
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 6329a306856b..0cd977f22c4b 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4433,7 +4433,7 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
- kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+ vmcs12->guest_dr7 = vcpu->arch.dr7;
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
vmcs12->guest_ia32_efer = vcpu->arch.efer;
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index a6216c874729..315c7c2ba89b 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -71,7 +71,7 @@ static int fixed_pmc_events[] = {
static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
{
struct kvm_pmc *pmc;
- u8 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl;
+ u64 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl;
int i;
pmu->fixed_ctr_ctrl = data;
diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h
index edc3f16cc189..6a9bfdfbb6e5 100644
--- a/arch/x86/kvm/vmx/run_flags.h
+++ b/arch/x86/kvm/vmx/run_flags.h
@@ -2,7 +2,10 @@
#ifndef __KVM_X86_VMX_RUN_FLAGS_H
#define __KVM_X86_VMX_RUN_FLAGS_H
-#define VMX_RUN_VMRESUME (1 << 0)
-#define VMX_RUN_SAVE_SPEC_CTRL (1 << 1)
+#define VMX_RUN_VMRESUME_SHIFT 0
+#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1
+
+#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT)
+#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT)
#endif /* __KVM_X86_VMX_RUN_FLAGS_H */
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 906ecd001511..2bfbf758d061 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -139,7 +139,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
mov (%_ASM_SP), %_ASM_AX
/* Check if vmlaunch or vmresume is needed */
- test $VMX_RUN_VMRESUME, %ebx
+ bt $VMX_RUN_VMRESUME_SHIFT, %ebx
/* Load guest registers. Don't clobber flags. */
mov VCPU_RCX(%_ASM_AX), %_ASM_CX
@@ -161,8 +161,11 @@ SYM_FUNC_START(__vmx_vcpu_run)
/* Load guest RAX. This kills the @regs pointer! */
mov VCPU_RAX(%_ASM_AX), %_ASM_AX
- /* Check EFLAGS.ZF from 'test VMX_RUN_VMRESUME' above */
- jz .Lvmlaunch
+ /* Clobbers EFLAGS.ZF */
+ CLEAR_CPU_BUFFERS
+
+ /* Check EFLAGS.CF from the VMX_RUN_VMRESUME bit test above. */
+ jnc .Lvmlaunch
/*
* After a successful VMRESUME/VMLAUNCH, control flow "magically"
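The TEST-to-BT switch matters because CLEAR_CPU_BUFFERS expands to VERW,
which clobbers ZF (the flag TEST/JZ consumed) but leaves CF (the flag BT
sets) intact, so the vmresume-vs-vmlaunch decision survives the buffer
clear. A tiny userspace illustration of BT's flag behavior using GCC's
flag-output constraints (purely illustrative):

#include <stdio.h>

int main(void)
{
        unsigned int flags = 1u << 0;   /* VMX_RUN_VMRESUME_SHIFT */
        unsigned char cf;

        /* BT copies the tested bit into CF; "=@ccc" captures CF. */
        asm("bt $0, %1" : "=@ccc"(cf) : "r"(flags));
        printf("%s\n", cf ? "vmresume" : "vmlaunch");
        return 0;
}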
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 7e7d044e0669..24d377d0a0c8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -49,6 +49,8 @@
#include <asm/spec-ctrl.h>
#include <asm/vmx.h>
+#include <trace/events/ipi.h>
+
#include "capabilities.h"
#include "cpuid.h"
#include "hyperv.h"
@@ -388,7 +390,16 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
{
- vmx->disable_fb_clear = (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
+ /*
+ * Disable VERW's behavior of clearing CPU buffers for the guest if the
+ * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled
+ * the mitigation. Disabling the clearing behavior provides a
+ * performance boost for guests that aren't aware that manually clearing
+ * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry
+ * and VM-Exit.
+ */
+ vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
+ (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
!boot_cpu_has_bug(X86_BUG_MDS) &&
!boot_cpu_has_bug(X86_BUG_TAA);
@@ -729,7 +740,7 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
*/
static int kvm_cpu_vmxoff(void)
{
- asm_volatile_goto("1: vmxoff\n\t"
+ asm goto("1: vmxoff\n\t"
_ASM_EXTABLE(1b, %l[fault])
::: "cc", "memory" : fault);
@@ -1272,8 +1283,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
u16 fs_sel, gs_sel;
int i;
- vmx->req_immediate_exit = false;
-
/*
* Note that guest MSRs to be saved/restored can also be changed
* when guest state is loaded. This happens when guest transitions
@@ -2775,7 +2784,7 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer)
cr4_set_bits(X86_CR4_VMXE);
- asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
+ asm goto("1: vmxon %[vmxon_pointer]\n\t"
_ASM_EXTABLE(1b, %l[fault])
: : [vmxon_pointer] "m"(vmxon_pointer)
: : fault);
@@ -5556,10 +5565,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
reg = DEBUG_REG_ACCESS_REG(exit_qualification);
if (exit_qualification & TYPE_MOV_FROM_DR) {
- unsigned long val;
-
- kvm_get_dr(vcpu, dr, &val);
- kvm_register_write(vcpu, reg, val);
+ kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
err = 0;
} else {
err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
@@ -5981,22 +5987,46 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
return 1;
}
-static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!vmx->req_immediate_exit &&
- !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
- kvm_lapic_expired_hv_timer(vcpu);
+ /*
+ * In the *extremely* unlikely scenario that this is a spurious VM-Exit
+ * due to the timer expiring while it was "soft" disabled, just eat the
+ * exit and re-enter the guest.
+ */
+ if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
return EXIT_FASTPATH_REENTER_GUEST;
- }
- return EXIT_FASTPATH_NONE;
+ /*
+ * If the timer expired because KVM used it to force an immediate exit,
+ * then mission accomplished.
+ */
+ if (force_immediate_exit)
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
+ /*
+ * If L2 is active, go down the slow path as emulating the guest timer
+ * expiration likely requires synthesizing a nested VM-Exit.
+ */
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
+
+ kvm_lapic_expired_hv_timer(vcpu);
+ return EXIT_FASTPATH_REENTER_GUEST;
}
static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
- handle_fastpath_preemption_timer(vcpu);
+ /*
+ * This non-fastpath handler is reached if and only if the preemption
+ * timer was being used to emulate a guest timer while L2 is active.
+ * All other scenarios are supposed to be handled in the fastpath.
+ */
+ WARN_ON_ONCE(!is_guest_mode(vcpu));
+ kvm_lapic_expired_hv_timer(vcpu);
return 1;
}
@@ -7136,13 +7166,13 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host, false);
}
-static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
+static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 tscl;
u32 delta_tsc;
- if (vmx->req_immediate_exit) {
+ if (force_immediate_exit) {
vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
vmx->loaded_vmcs->hv_timer_soft_disabled = false;
} else if (vmx->hv_deadline_tsc != -1) {
@@ -7195,13 +7225,22 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
barrier_nospec();
}
-static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
{
+ /*
+ * If L2 is active, only some VMX preemption timer exits can be handled
+ * in the fastpath; all other exits must use the slow path.
+ */
+ if (is_guest_mode(vcpu) &&
+ to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER)
+ return EXIT_FASTPATH_NONE;
+
switch (to_vmx(vcpu)->exit_reason.basic) {
case EXIT_REASON_MSR_WRITE:
return handle_fastpath_set_msr_irqoff(vcpu);
case EXIT_REASON_PREEMPTION_TIMER:
- return handle_fastpath_preemption_timer(vcpu);
+ return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
default:
return EXIT_FASTPATH_NONE;
}
@@ -7214,11 +7253,14 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
guest_state_enter_irqoff();
- /* L1D Flush includes CPU buffer clear to mitigate MDS */
+ /*
+ * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW
+ * mitigation for MDS is done late in VMentry and is still
+ * executed in spite of L1D Flush. This is because an extra VERW
+ * should not matter much after the big hammer L1D Flush.
+ */
if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu);
- else if (static_branch_unlikely(&mds_user_clear))
- mds_clear_cpu_buffers();
else if (static_branch_unlikely(&mmio_stale_data_clear) &&
kvm_arch_has_assigned_device(vcpu->kvm))
mds_clear_cpu_buffers();
@@ -7258,7 +7300,7 @@ out:
guest_state_exit_irqoff();
}
-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
@@ -7285,7 +7327,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}
- trace_kvm_entry(vcpu);
+ trace_kvm_entry(vcpu, force_immediate_exit);
if (vmx->ple_window_dirty) {
vmx->ple_window_dirty = false;
@@ -7344,7 +7386,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_passthrough_lbr_msrs(vcpu);
if (enable_preemption_timer)
- vmx_update_hv_timer(vcpu);
+ vmx_update_hv_timer(vcpu, force_immediate_exit);
+ else if (force_immediate_exit)
+ smp_send_reschedule(vcpu->cpu);
kvm_wait_lapic_expire(vcpu);
@@ -7408,10 +7452,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
- if (is_guest_mode(vcpu))
- return EXIT_FASTPATH_NONE;
-
- return vmx_exit_handlers_fastpath(vcpu);
+ return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
}
static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
@@ -7891,11 +7932,6 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
}
-static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
-{
- to_vmx(vcpu)->req_immediate_exit = true;
-}
-
static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
struct x86_instruction_info *info)
{
@@ -8348,8 +8384,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.check_intercept = vmx_check_intercept,
.handle_exit_irqoff = vmx_handle_exit_irqoff,
- .request_immediate_exit = vmx_request_immediate_exit,
-
.sched_in = vmx_sched_in,
.cpu_dirty_log_size = PML_ENTITY_NUM,
@@ -8609,7 +8643,6 @@ static __init int hardware_setup(void)
if (!enable_preemption_timer) {
vmx_x86_ops.set_hv_timer = NULL;
vmx_x86_ops.cancel_hv_timer = NULL;
- vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
kvm_caps.supported_mce_cap |= MCG_LMCE_P;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index e3b0985bb74a..65786dbe7d60 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -332,8 +332,6 @@ struct vcpu_vmx {
unsigned int ple_window;
bool ple_window_dirty;
- bool req_immediate_exit;
-
/* Support for PML */
#define PML_ENTITY_NUM 512
struct page *pml_pg;
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index f41ce3c24123..8060e5fc6dbd 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -94,7 +94,7 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
- asm_volatile_goto("1: vmread %[field], %[output]\n\t"
+ asm_goto_output("1: vmread %[field], %[output]\n\t"
"jna %l[do_fail]\n\t"
_ASM_EXTABLE(1b, %l[do_exception])
@@ -188,7 +188,7 @@ static __always_inline unsigned long vmcs_readl(unsigned long field)
#define vmx_asm1(insn, op1, error_args...) \
do { \
- asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \
+ asm goto("1: " __stringify(insn) " %0\n\t" \
".byte 0x2e\n\t" /* branch not taken hint */ \
"jna %l[error]\n\t" \
_ASM_EXTABLE(1b, %l[fault]) \
@@ -205,7 +205,7 @@ fault: \
#define vmx_asm2(insn, op1, op2, error_args...) \
do { \
- asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \
+ asm goto("1: " __stringify(insn) " %1, %0\n\t" \
".byte 0x2e\n\t" /* branch not taken hint */ \
"jna %l[error]\n\t" \
_ASM_EXTABLE(1b, %l[fault]) \
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 363b1c080205..064862d87b9e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1399,22 +1399,19 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
}
EXPORT_SYMBOL_GPL(kvm_set_dr);
-void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
{
size_t size = ARRAY_SIZE(vcpu->arch.db);
switch (dr) {
case 0 ... 3:
- *val = vcpu->arch.db[array_index_nospec(dr, size)];
- break;
+ return vcpu->arch.db[array_index_nospec(dr, size)];
case 4:
case 6:
- *val = vcpu->arch.dr6;
- break;
+ return vcpu->arch.dr6;
case 5:
default: /* 7 */
- *val = vcpu->arch.dr7;
- break;
+ return vcpu->arch.dr7;
}
}
EXPORT_SYMBOL_GPL(kvm_get_dr);
@@ -1704,22 +1701,17 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
struct kvm_msr_entry msr;
int r;
+ /* Unconditionally clear the output for simplicity */
+ msr.data = 0;
msr.index = index;
r = kvm_get_msr_feature(&msr);
- if (r == KVM_MSR_RET_INVALID) {
- /* Unconditionally clear the output for simplicity */
- *data = 0;
- if (kvm_msr_ignored_check(index, 0, false))
- r = 0;
- }
-
- if (r)
- return r;
+ if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false))
+ r = 0;
*data = msr.data;
- return 0;
+ return r;
}
static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -1782,6 +1774,10 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
+ if (!static_cpu_has(X86_FEATURE_XSAVES) &&
+ (efer & EFER_SVME))
+ kvm_hv_xsaves_xsavec_maybe_warn(vcpu);
+
return 0;
}
@@ -2507,7 +2503,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
}
#ifdef CONFIG_X86_64
-static inline int gtod_is_based_on_tsc(int mode)
+static inline bool gtod_is_based_on_tsc(int mode)
{
return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
}
@@ -4581,7 +4577,7 @@ static bool kvm_is_vm_type_supported(unsigned long type)
{
return type == KVM_X86_DEFAULT_VM ||
(type == KVM_X86_SW_PROTECTED_VM &&
- IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_enabled);
+ IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled);
}
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
@@ -5062,8 +5058,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
int idx;
if (vcpu->preempted) {
- if (!vcpu->arch.guest_state_protected)
- vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
+ vcpu->arch.preempted_in_kernel = kvm_arch_vcpu_in_kernel(vcpu);
/*
* Take the srcu lock as memslots will be accessed to check the gfn
@@ -5454,7 +5449,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
vcpu->arch.nmi_pending = 0;
atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
- kvm_make_request(KVM_REQ_NMI, vcpu);
+ if (events->nmi.pending)
+ kvm_make_request(KVM_REQ_NMI, vcpu);
}
static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
@@ -5509,18 +5505,23 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
struct kvm_debugregs *dbgregs)
{
- unsigned long val;
+ unsigned int i;
memset(dbgregs, 0, sizeof(*dbgregs));
- memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
- kvm_get_dr(vcpu, 6, &val);
- dbgregs->dr6 = val;
+
+ BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
+ dbgregs->db[i] = vcpu->arch.db[i];
+
+ dbgregs->dr6 = vcpu->arch.dr6;
dbgregs->dr7 = vcpu->arch.dr7;
}
static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
struct kvm_debugregs *dbgregs)
{
+ unsigned int i;
+
if (dbgregs->flags)
return -EINVAL;
@@ -5529,7 +5530,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
if (!kvm_dr7_valid(dbgregs->dr7))
return -EINVAL;
- memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
+ vcpu->arch.db[i] = dbgregs->db[i];
+
kvm_update_dr0123(vcpu);
vcpu->arch.dr6 = dbgregs->dr6;
vcpu->arch.dr7 = dbgregs->dr7;
@@ -7016,6 +7019,9 @@ set_identity_unlock:
r = -EEXIST;
if (kvm->arch.vpit)
goto create_pit_unlock;
+ r = -ENOENT;
+ if (!pic_in_kernel(kvm))
+ goto create_pit_unlock;
r = -ENOMEM;
kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
if (kvm->arch.vpit)
@@ -8164,10 +8170,9 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
}
-static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
- unsigned long *dest)
+static unsigned long emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr)
{
- kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
+ return kvm_get_dr(emul_to_vcpu(ctxt), dr);
}
static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
@@ -8787,31 +8792,24 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
kvm_release_pfn_clean(pfn);
- /* The instructions are well-emulated on direct mmu. */
- if (vcpu->arch.mmu->root_role.direct) {
- unsigned int indirect_shadow_pages;
-
- write_lock(&vcpu->kvm->mmu_lock);
- indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
- write_unlock(&vcpu->kvm->mmu_lock);
-
- if (indirect_shadow_pages)
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
-
- return true;
- }
-
/*
- * if emulation was due to access to shadowed page table
- * and it failed try to unshadow page and re-enter the
- * guest to let CPU execute the instruction.
+ * If emulation may have been triggered by a write to a shadowed page
+ * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
+ * guest to let the CPU re-execute the instruction in the hope that the
+ * CPU can cleanly execute the instruction that KVM failed to emulate.
*/
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+ if (vcpu->kvm->arch.indirect_shadow_pages)
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
/*
- * If the access faults on its page table, it can not
- * be fixed by unprotecting shadow page and it should
- * be reported to userspace.
+ * If the failed instruction faulted on an access to page tables that
+ * are used to translate any part of the instruction, KVM can't resolve
+ * the issue by unprotecting the gfn, as zapping the shadow page will
+ * result in the instruction taking a !PRESENT page fault and thus put
+ * the vCPU into an infinite loop of page faults. E.g. KVM will create
+ * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and
+ * then zap the SPTE to unprotect the gfn, and then do it all over
+ * again. Report the error to userspace.
*/
return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
}
@@ -9632,11 +9630,13 @@ static void kvm_x86_check_cpu_compat(void *ret)
*(int *)ret = kvm_x86_check_processor_compatibility();
}
-static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
{
u64 host_pat;
int r, cpu;
+ guard(mutex)(&vendor_module_lock);
+
if (kvm_x86_ops.hardware_enable) {
pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
return -EEXIST;
@@ -9766,17 +9766,6 @@ out_free_x86_emulator_cache:
kmem_cache_destroy(x86_emulator_cache);
return r;
}
-
-int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
-{
- int r;
-
- mutex_lock(&vendor_module_lock);
- r = __kvm_x86_vendor_init(ops);
- mutex_unlock(&vendor_module_lock);
-
- return r;
-}
EXPORT_SYMBOL_GPL(kvm_x86_vendor_init);
void kvm_x86_vendor_exit(void)
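The guard(mutex)(&vendor_module_lock) above comes from <linux/cleanup.h>: the
mutex is released automatically when the guard goes out of scope, on every
return path, which is what makes folding the __kvm_x86_vendor_init() wrapper
into its caller safe. A userspace analogue of the mechanism built on
__attribute__((cleanup)), names illustrative:

#include <pthread.h>

static void mutex_unlocker(pthread_mutex_t **m)
{
        pthread_mutex_unlock(*m);
}

#define guard_mutex(m)                                                      \
        pthread_mutex_t *__guard __attribute__((cleanup(mutex_unlocker))) = \
                (pthread_mutex_lock(m), (m))

static pthread_mutex_t vendor_module_lock = PTHREAD_MUTEX_INITIALIZER;

static int vendor_init(void)
{
        guard_mutex(&vendor_module_lock);
        /* Any early return below drops the lock automatically. */
        return 0;
}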
@@ -10673,12 +10662,6 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
}
-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
-{
- smp_send_reschedule(vcpu->cpu);
-}
-EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
-
/*
* Called within kvm->srcu read side.
* Returns 1 to let vcpu_run() continue the guest execution loop without
@@ -10928,10 +10911,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto cancel_injection;
}
- if (req_immediate_exit) {
+ if (req_immediate_exit)
kvm_make_request(KVM_REQ_EVENT, vcpu);
- static_call(kvm_x86_request_immediate_exit)(vcpu);
- }
fpregs_assert_state_consistent();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
@@ -10962,7 +10943,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
(kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
- exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
+ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
@@ -12060,27 +12041,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
if (r < 0)
return r;
- if (irqchip_in_kernel(vcpu->kvm)) {
- r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
- if (r < 0)
- goto fail_mmu_destroy;
-
- /*
- * Defer evaluating inhibits until the vCPU is first run, as
- * this vCPU will not get notified of any changes until this
- * vCPU is visible to other vCPUs (marked online and added to
- * the set of vCPUs). Opportunistically mark APICv active as
- * VMX in particularly is highly unlikely to have inhibits.
- * Ignore the current per-VM APICv state so that vCPU creation
- * is guaranteed to run with a deterministic value, the request
- * will ensure the vCPU gets the correct state before VM-Entry.
- */
- if (enable_apicv) {
- vcpu->arch.apic->apicv_active = true;
- kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
- }
- } else
- static_branch_inc(&kvm_has_noapic_vcpu);
+ r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
+ if (r < 0)
+ goto fail_mmu_destroy;
r = -ENOMEM;
@@ -12201,8 +12164,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
kvfree(vcpu->arch.cpuid_entries);
- if (!lapic_in_kernel(vcpu))
- static_branch_dec(&kvm_has_noapic_vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -12479,9 +12440,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
}
-__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
-EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
-
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -13084,11 +13042,13 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
- if (kvm_vcpu_apicv_active(vcpu) &&
- static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
- return true;
+ return kvm_vcpu_apicv_active(vcpu) &&
+ static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu);
+}
- return false;
+bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.preempted_in_kernel;
}
bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
@@ -13111,9 +13071,6 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_state_protected)
return true;
- if (vcpu != kvm_get_running_vcpu())
- return vcpu->arch.preempted_in_kernel;
-
return static_call(kvm_x86_get_cpl)(vcpu) == 0;
}
@@ -13908,9 +13865,6 @@ module_init(kvm_x86_init);
static void __exit kvm_x86_exit(void)
{
- /*
- * If module_init() is implemented, module_exit() must also be
- * implemented to allow module unload.
- */
+ WARN_ON_ONCE(static_branch_unlikely(&kvm_has_noapic_vcpu));
}
module_exit(kvm_x86_exit);