summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/x86.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--arch/x86/kvm/x86.c418
1 files changed, 289 insertions, 129 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 47e021bdcc94..cebdaa1e3cf5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -75,6 +75,7 @@
#include <asm/tlbflush.h>
#include <asm/intel_pt.h>
#include <asm/emulate_prefix.h>
+#include <asm/sgx.h>
#include <clocksource/hyperv_timer.h>
#define CREATE_TRACE_POINTS
@@ -156,9 +157,9 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
/*
* lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
- * adaptive tuning starting from default advancment of 1000ns. '0' disables
+ * adaptive tuning starting from default advancement of 1000ns. '0' disables
* advancement entirely. Any other value is used as-is and disables adaptive
- * tuning, i.e. allows priveleged userspace to set an exact advancement time.
+ * tuning, i.e. allows privileged userspace to set an exact advancement time.
*/
static int __read_mostly lapic_timer_advance_ns = -1;
module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
@@ -245,6 +246,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("l1d_flush", l1d_flush),
VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+ VCPU_STAT("nested_run", nested_run),
+ VCPU_STAT("directed_yield_attempted", directed_yield_attempted),
+ VCPU_STAT("directed_yield_successful", directed_yield_successful),
VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
VM_STAT("mmu_pte_write", mmu_pte_write),
VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
@@ -271,8 +275,7 @@ static struct kmem_cache *x86_emulator_cache;
* When called, it means the previous get/set msr reached an invalid msr.
* Return true if we want to ignore/silent this failed msr access.
*/
-static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
- u64 data, bool write)
+static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write)
{
const char *op = write ? "wrmsr" : "rdmsr";
@@ -544,8 +547,6 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
queue:
- if (has_error && !is_protmode(vcpu))
- has_error = false;
if (reinject) {
/*
* On vmentry, vcpu->arch.exception.pending is only
@@ -984,14 +985,17 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
return 0;
}
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
{
- if (static_call(kvm_x86_get_cpl)(vcpu) == 0)
- return __kvm_set_xcr(vcpu, index, xcr);
+ if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
+ __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_set_xcr);
+EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
@@ -1073,10 +1077,15 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
return 0;
}
- if (is_long_mode(vcpu) && kvm_vcpu_is_illegal_gpa(vcpu, cr3))
+ /*
+ * Do not condition the GPA check on long mode, this helper is used to
+ * stuff CR3, e.g. for RSM emulation, and there is no guarantee that
+ * the current vCPU mode is accurate.
+ */
+ if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
return 1;
- else if (is_pae_paging(vcpu) &&
- !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+
+ if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
return 1;
kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
@@ -1192,20 +1201,21 @@ void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
}
EXPORT_SYMBOL_GPL(kvm_get_dr);
-bool kvm_rdpmc(struct kvm_vcpu *vcpu)
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
{
u32 ecx = kvm_rcx_read(vcpu);
u64 data;
- int err;
- err = kvm_pmu_rdpmc(vcpu, ecx, &data);
- if (err)
- return err;
+ if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
kvm_rax_write(vcpu, (u32)data);
kvm_rdx_write(vcpu, data >> 32);
- return err;
+ return kvm_skip_emulated_instruction(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_rdpmc);
+EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
/*
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -1288,7 +1298,7 @@ static const u32 emulated_msrs_all[] = {
MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
MSR_IA32_TSC_ADJUST,
- MSR_IA32_TSCDEADLINE,
+ MSR_IA32_TSC_DEADLINE,
MSR_IA32_ARCH_CAPABILITIES,
MSR_IA32_PERF_CAPABILITIES,
MSR_IA32_MISC_ENABLE,
@@ -1373,7 +1383,7 @@ static u64 kvm_get_arch_capabilities(void)
/*
* If nx_huge_pages is enabled, KVM's shadow paging will ensure that
* the nested hypervisor runs with NX huge pages. If it is not,
- * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other
+ * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
* L1 guests, so it need not worry about its own (L2) guests.
*/
data |= ARCH_CAP_PSCHANGE_MC_NO;
@@ -1445,7 +1455,7 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
if (r == KVM_MSR_RET_INVALID) {
/* Unconditionally clear the output for simplicity */
*data = 0;
- if (kvm_msr_ignored_check(vcpu, index, 0, false))
+ if (kvm_msr_ignored_check(index, 0, false))
r = 0;
}
@@ -1526,35 +1536,44 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
{
+ struct kvm_x86_msr_filter *msr_filter;
+ struct msr_bitmap_range *ranges;
struct kvm *kvm = vcpu->kvm;
- struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
- u32 count = kvm->arch.msr_filter.count;
- u32 i;
- bool r = kvm->arch.msr_filter.default_allow;
+ bool allowed;
int idx;
+ u32 i;
- /* MSR filtering not set up or x2APIC enabled, allow everything */
- if (!count || (index >= 0x800 && index <= 0x8ff))
+ /* x2APIC MSRs do not support filtering. */
+ if (index >= 0x800 && index <= 0x8ff)
return true;
- /* Prevent collision with set_msr_filter */
idx = srcu_read_lock(&kvm->srcu);
- for (i = 0; i < count; i++) {
+ msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
+ if (!msr_filter) {
+ allowed = true;
+ goto out;
+ }
+
+ allowed = msr_filter->default_allow;
+ ranges = msr_filter->ranges;
+
+ for (i = 0; i < msr_filter->count; i++) {
u32 start = ranges[i].base;
u32 end = start + ranges[i].nmsrs;
u32 flags = ranges[i].flags;
unsigned long *bitmap = ranges[i].bitmap;
if ((index >= start) && (index < end) && (flags & type)) {
- r = !!test_bit(index - start, bitmap);
+ allowed = !!test_bit(index - start, bitmap);
break;
}
}
+out:
srcu_read_unlock(&kvm->srcu, idx);
- return r;
+ return allowed;
}
EXPORT_SYMBOL_GPL(kvm_msr_allowed);
@@ -1611,7 +1630,7 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
if (ret == KVM_MSR_RET_INVALID)
- if (kvm_msr_ignored_check(vcpu, index, data, true))
+ if (kvm_msr_ignored_check(index, data, true))
ret = 0;
return ret;
@@ -1649,7 +1668,7 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
if (ret == KVM_MSR_RET_INVALID) {
/* Unconditionally clear *data for simplicity */
*data = 0;
- if (kvm_msr_ignored_check(vcpu, index, 0, false))
+ if (kvm_msr_ignored_check(index, 0, false))
ret = 0;
}
@@ -1783,6 +1802,40 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
+{
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_as_nop);
+
+int kvm_emulate_invd(struct kvm_vcpu *vcpu)
+{
+ /* Treat an INVD instruction as a NOP and just skip it. */
+ return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_invd);
+
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
+{
+ pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n");
+ return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
+
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
+
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
+{
+ pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n");
+ return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
+
static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
{
xfer_to_guest_mode_prepare();
@@ -1841,7 +1894,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
ret = EXIT_FASTPATH_EXIT_HANDLED;
}
break;
- case MSR_IA32_TSCDEADLINE:
+ case MSR_IA32_TSC_DEADLINE:
data = kvm_read_edx_eax(vcpu);
if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
kvm_skip_emulated_instruction(vcpu);
@@ -2320,7 +2373,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
kvm_vcpu_write_tsc_offset(vcpu, offset);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
- spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
+ spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
if (!matched) {
kvm->arch.nr_vcpus_matched_tsc = 0;
} else if (!already_matched) {
@@ -2328,7 +2381,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
}
kvm_track_tsc_matching(vcpu);
- spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
+ spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
}
static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
@@ -2550,11 +2603,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
int i;
struct kvm_vcpu *vcpu;
struct kvm_arch *ka = &kvm->arch;
+ unsigned long flags;
+
+ kvm_hv_invalidate_tsc_page(kvm);
- spin_lock(&ka->pvclock_gtod_sync_lock);
kvm_make_mclock_inprogress_request(kvm);
+
/* no guest entries from this point */
+ spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
pvclock_update_vm_gtod_copy(kvm);
+ spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -2562,8 +2620,6 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
/* guest entries allowed */
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
-
- spin_unlock(&ka->pvclock_gtod_sync_lock);
#endif
}
@@ -2571,17 +2627,18 @@ u64 get_kvmclock_ns(struct kvm *kvm)
{
struct kvm_arch *ka = &kvm->arch;
struct pvclock_vcpu_time_info hv_clock;
+ unsigned long flags;
u64 ret;
- spin_lock(&ka->pvclock_gtod_sync_lock);
+ spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
if (!ka->use_master_clock) {
- spin_unlock(&ka->pvclock_gtod_sync_lock);
+ spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
return get_kvmclock_base_ns() + ka->kvmclock_offset;
}
hv_clock.tsc_timestamp = ka->master_cycle_now;
hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
- spin_unlock(&ka->pvclock_gtod_sync_lock);
+ spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
/* both __this_cpu_read() and rdtsc() should be on the same cpu */
get_cpu();
@@ -2675,13 +2732,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
* If the host uses TSC clock, then passthrough TSC as stable
* to the guest.
*/
- spin_lock(&ka->pvclock_gtod_sync_lock);
+ spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
use_master_clock = ka->use_master_clock;
if (use_master_clock) {
host_tsc = ka->master_cycle_now;
kernel_ns = ka->master_kernel_ns;
}
- spin_unlock(&ka->pvclock_gtod_sync_lock);
+ spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
@@ -3075,7 +3132,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return kvm_set_apic_base(vcpu, msr_info);
case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
return kvm_x2apic_msr_write(vcpu, msr, data);
- case MSR_IA32_TSCDEADLINE:
+ case MSR_IA32_TSC_DEADLINE:
kvm_set_lapic_tscdeadline_msr(vcpu, data);
break;
case MSR_IA32_TSC_ADJUST:
@@ -3370,6 +3427,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = 0;
break;
case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info);
+ if (!msr_info->host_initiated)
+ return 1;
+ msr_info->data = 0;
+ break;
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
@@ -3437,7 +3500,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
- case MSR_IA32_TSCDEADLINE:
+ case MSR_IA32_TSC_DEADLINE:
msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
break;
case MSR_IA32_TSC_ADJUST:
@@ -3759,8 +3822,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_X86_USER_SPACE_MSR:
case KVM_CAP_X86_MSR_FILTER:
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+#ifdef CONFIG_X86_SGX_KVM
+ case KVM_CAP_SGX_ATTRIBUTE:
+#endif
+ case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
r = 1;
break;
+ case KVM_CAP_SET_GUEST_DEBUG2:
+ return KVM_GUESTDBG_VALID_MASK;
#ifdef CONFIG_KVM_XEN
case KVM_CAP_XEN_HVM:
r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
@@ -4013,7 +4082,6 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
struct kvm_host_map map;
struct kvm_steal_time *st;
- int idx;
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
@@ -4021,15 +4089,9 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
if (vcpu->arch.st.preempted)
return;
- /*
- * Take the srcu lock as memslots will be accessed to check the gfn
- * cache generation against the memslots generation.
- */
- idx = srcu_read_lock(&vcpu->kvm->srcu);
-
if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
&vcpu->arch.st.cache, true))
- goto out;
+ return;
st = map.hva +
offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
@@ -4037,20 +4099,25 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
-
-out:
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
+ int idx;
+
if (vcpu->preempted && !vcpu->arch.guest_state_protected)
vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
+ /*
+ * Take the srcu lock as memslots will be accessed to check the gfn
+ * cache generation against the memslots generation.
+ */
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
if (kvm_xen_msr_enabled(vcpu->kvm))
kvm_xen_runstate_set_preempted(vcpu);
else
kvm_steal_time_set_preempted(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
static_call(kvm_x86_vcpu_put)(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
@@ -4663,7 +4730,6 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
kvm_update_pv_runtime(vcpu);
return 0;
-
default:
return -EINVAL;
}
@@ -5345,6 +5411,28 @@ split_irqchip_unlock:
kvm->arch.bus_lock_detection_enabled = true;
r = 0;
break;
+#ifdef CONFIG_X86_SGX_KVM
+ case KVM_CAP_SGX_ATTRIBUTE: {
+ unsigned long allowed_attributes = 0;
+
+ r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
+ if (r)
+ break;
+
+ /* KVM only supports the PROVISIONKEY privileged attribute. */
+ if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) &&
+ !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY))
+ kvm->arch.sgx_provisioning_allowed = true;
+ else
+ r = -EINVAL;
+ break;
+ }
+#endif
+ case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+ r = -EINVAL;
+ if (kvm_x86_ops.vm_copy_enc_context_from)
+ r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
+ return r;
default:
r = -EINVAL;
break;
@@ -5352,25 +5440,34 @@ split_irqchip_unlock:
return r;
}
-static void kvm_clear_msr_filter(struct kvm *kvm)
+static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
+{
+ struct kvm_x86_msr_filter *msr_filter;
+
+ msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
+ if (!msr_filter)
+ return NULL;
+
+ msr_filter->default_allow = default_allow;
+ return msr_filter;
+}
+
+static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
{
u32 i;
- u32 count = kvm->arch.msr_filter.count;
- struct msr_bitmap_range ranges[16];
- mutex_lock(&kvm->lock);
- kvm->arch.msr_filter.count = 0;
- memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0]));
- mutex_unlock(&kvm->lock);
- synchronize_srcu(&kvm->srcu);
+ if (!msr_filter)
+ return;
+
+ for (i = 0; i < msr_filter->count; i++)
+ kfree(msr_filter->ranges[i].bitmap);
- for (i = 0; i < count; i++)
- kfree(ranges[i].bitmap);
+ kfree(msr_filter);
}
-static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range)
+static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
+ struct kvm_msr_filter_range *user_range)
{
- struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
struct msr_bitmap_range range;
unsigned long *bitmap = NULL;
size_t bitmap_size;
@@ -5404,11 +5501,9 @@ static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user
goto err;
}
- /* Everything ok, add this range identifier to our global pool */
- ranges[kvm->arch.msr_filter.count] = range;
- /* Make sure we filled the array before we tell anyone to walk it */
- smp_wmb();
- kvm->arch.msr_filter.count++;
+ /* Everything ok, add this range identifier. */
+ msr_filter->ranges[msr_filter->count] = range;
+ msr_filter->count++;
return 0;
err:
@@ -5419,10 +5514,11 @@ err:
static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
{
struct kvm_msr_filter __user *user_msr_filter = argp;
+ struct kvm_x86_msr_filter *new_filter, *old_filter;
struct kvm_msr_filter filter;
bool default_allow;
- int r = 0;
bool empty = true;
+ int r = 0;
u32 i;
if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
@@ -5435,25 +5531,32 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
if (empty && !default_allow)
return -EINVAL;
- kvm_clear_msr_filter(kvm);
+ new_filter = kvm_alloc_msr_filter(default_allow);
+ if (!new_filter)
+ return -ENOMEM;
- kvm->arch.msr_filter.default_allow = default_allow;
-
- /*
- * Protect from concurrent calls to this function that could trigger
- * a TOCTOU violation on kvm->arch.msr_filter.count.
- */
- mutex_lock(&kvm->lock);
for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
- r = kvm_add_msr_filter(kvm, &filter.ranges[i]);
- if (r)
- break;
+ r = kvm_add_msr_filter(new_filter, &filter.ranges[i]);
+ if (r) {
+ kvm_free_msr_filter(new_filter);
+ return r;
+ }
}
+ mutex_lock(&kvm->lock);
+
+ /* The per-VM filter is protected by kvm->lock... */
+ old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1);
+
+ rcu_assign_pointer(kvm->arch.msr_filter, new_filter);
+ synchronize_srcu(&kvm->srcu);
+
+ kvm_free_msr_filter(old_filter);
+
kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
mutex_unlock(&kvm->lock);
- return r;
+ return 0;
}
long kvm_arch_vm_ioctl(struct file *filp,
@@ -5700,6 +5803,7 @@ set_pit2_out:
}
#endif
case KVM_SET_CLOCK: {
+ struct kvm_arch *ka = &kvm->arch;
struct kvm_clock_data user_ns;
u64 now_ns;
@@ -5718,8 +5822,22 @@ set_pit2_out:
* pvclock_update_vm_gtod_copy().
*/
kvm_gen_update_masterclock(kvm);
- now_ns = get_kvmclock_ns(kvm);
- kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
+
+ /*
+ * This pairs with kvm_guest_time_update(): when masterclock is
+ * in use, we use master_kernel_ns + kvmclock_offset to set
+ * unsigned 'system_time' so if we use get_kvmclock_ns() (which
+ * is slightly ahead) here we risk going negative on unsigned
+ * 'system_time' when 'user_ns.clock' is very small.
+ */
+ spin_lock_irq(&ka->pvclock_gtod_sync_lock);
+ if (kvm->arch.use_master_clock)
+ now_ns = ka->master_kernel_ns;
+ else
+ now_ns = get_kvmclock_base_ns();
+ ka->kvmclock_offset = user_ns.clock - now_ns;
+ spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
+
kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
break;
}
@@ -5959,6 +6077,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
@@ -5975,6 +6094,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
access |= PFERR_WRITE_MASK;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
/* uses this to access any guest's mapped memory without checking CPL */
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
@@ -6603,7 +6723,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
int cpu = get_cpu();
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
- smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
+ on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask,
wbinvd_ipi, NULL, 1);
put_cpu();
cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
@@ -6894,12 +7014,12 @@ static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
{
- return kvm_register_read(emul_to_vcpu(ctxt), reg);
+ return kvm_register_read_raw(emul_to_vcpu(ctxt), reg);
}
static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
{
- kvm_register_write(emul_to_vcpu(ctxt), reg, val);
+ kvm_register_write_raw(emul_to_vcpu(ctxt), reg, val);
}
static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
@@ -7698,6 +7818,7 @@ static void kvm_hyperv_tsc_notifier(void)
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int cpu;
+ unsigned long flags;
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
@@ -7713,17 +7834,15 @@ static void kvm_hyperv_tsc_notifier(void)
list_for_each_entry(kvm, &vm_list, vm_list) {
struct kvm_arch *ka = &kvm->arch;
- spin_lock(&ka->pvclock_gtod_sync_lock);
-
+ spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
pvclock_update_vm_gtod_copy(kvm);
+ spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
kvm_for_each_vcpu(cpu, vcpu, kvm)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
kvm_for_each_vcpu(cpu, vcpu, kvm)
kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
-
- spin_unlock(&ka->pvclock_gtod_sync_lock);
}
mutex_unlock(&kvm_lock);
}
@@ -8004,9 +8123,6 @@ int kvm_arch_init(void *opaque)
if (r)
goto out_free_percpu;
- kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
- PT_DIRTY_MASK, PT64_NX_MASK, 0,
- PT_PRESENT_MASK, 0, sme_me_mask);
kvm_timer_init();
perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -8166,21 +8282,35 @@ void kvm_apicv_init(struct kvm *kvm, bool enable)
}
EXPORT_SYMBOL_GPL(kvm_apicv_init);
-static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
+static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
{
struct kvm_vcpu *target = NULL;
struct kvm_apic_map *map;
+ vcpu->stat.directed_yield_attempted++;
+
rcu_read_lock();
- map = rcu_dereference(kvm->arch.apic_map);
+ map = rcu_dereference(vcpu->kvm->arch.apic_map);
if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
target = map->phys_map[dest_id]->vcpu;
rcu_read_unlock();
- if (target && READ_ONCE(target->ready))
- kvm_vcpu_yield_to(target);
+ if (!target || !READ_ONCE(target->ready))
+ goto no_yield;
+
+ /* Ignore requests to yield to self */
+ if (vcpu == target)
+ goto no_yield;
+
+ if (kvm_vcpu_yield_to(target) <= 0)
+ goto no_yield;
+
+ vcpu->stat.directed_yield_successful++;
+
+no_yield:
+ return;
}
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
@@ -8227,7 +8357,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
break;
kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
- kvm_sched_yield(vcpu->kvm, a1);
+ kvm_sched_yield(vcpu, a1);
ret = 0;
break;
#ifdef CONFIG_X86_64
@@ -8245,7 +8375,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
break;
- kvm_sched_yield(vcpu->kvm, a0);
+ kvm_sched_yield(vcpu, a0);
ret = 0;
break;
default:
@@ -8328,6 +8458,27 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
}
+
+int kvm_check_nested_events(struct kvm_vcpu *vcpu)
+{
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
+ return -EIO;
+
+ if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ kvm_x86_ops.nested_ops->triple_fault(vcpu);
+ return 1;
+ }
+
+ return kvm_x86_ops.nested_ops->check_events(vcpu);
+}
+
+static void kvm_inject_exception(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
+ vcpu->arch.exception.error_code = false;
+ static_call(kvm_x86_queue_exception)(vcpu);
+}
+
static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
{
int r;
@@ -8336,7 +8487,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
/* try to reinject previous events if any */
if (vcpu->arch.exception.injected) {
- static_call(kvm_x86_queue_exception)(vcpu);
+ kvm_inject_exception(vcpu);
can_inject = false;
}
/*
@@ -8373,7 +8524,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
* from L2 to L1.
*/
if (is_guest_mode(vcpu)) {
- r = kvm_x86_ops.nested_ops->check_events(vcpu);
+ r = kvm_check_nested_events(vcpu);
if (r < 0)
goto busy;
}
@@ -8399,7 +8550,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
}
}
- static_call(kvm_x86_queue_exception)(vcpu);
+ kvm_inject_exception(vcpu);
can_inject = false;
}
@@ -8548,7 +8699,7 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
for (i = 0; i < 8; i++)
- put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i));
+ put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i));
kvm_get_dr(vcpu, 6, &val);
put_smstate(u32, buf, 0x7fcc, (u32)val);
@@ -8594,7 +8745,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
int i;
for (i = 0; i < 16; i++)
- put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i));
+ put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i));
put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
@@ -8936,10 +9087,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}
if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
- vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
- vcpu->mmio_needed = 0;
- r = 0;
- goto out;
+ if (is_guest_mode(vcpu)) {
+ kvm_x86_ops.nested_ops->triple_fault(vcpu);
+ } else {
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->mmio_needed = 0;
+ r = 0;
+ goto out;
+ }
}
if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
/* Page is swapped out. Do synthetic halt */
@@ -9237,7 +9392,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu))
- kvm_x86_ops.nested_ops->check_events(vcpu);
+ kvm_check_nested_events(vcpu);
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted);
@@ -10634,8 +10789,6 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
- u32 i;
-
if (current->mm == kvm->mm) {
/*
* Free memory regions allocated on behalf of userspace,
@@ -10651,8 +10804,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
mutex_unlock(&kvm->slots_lock);
}
static_call_cond(kvm_x86_vm_destroy)(kvm);
- for (i = 0; i < kvm->arch.msr_filter.count; i++)
- kfree(kvm->arch.msr_filter.ranges[i].bitmap);
+ kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
kvm_free_vcpus(kvm);
@@ -10966,6 +11118,14 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
}
+bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
+ return true;
+
+ return false;
+}
+
bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
{
if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
@@ -10976,14 +11136,14 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
kvm_test_request(KVM_REQ_EVENT, vcpu))
return true;
- if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
- return true;
-
- return false;
+ return kvm_arch_dy_has_pending_interrupt(vcpu);
}
bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
{
+ if (vcpu->arch.guest_state_protected)
+ return true;
+
return vcpu->arch.preempted_in_kernel;
}
@@ -11254,7 +11414,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
if (!kvm_pv_async_pf_enabled(vcpu))
return true;
else
- return apf_pageready_slot_free(vcpu);
+ return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
}
void kvm_arch_start_assignment(struct kvm *kvm)
@@ -11503,7 +11663,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
fallthrough;
case INVPCID_TYPE_ALL_INCL_GLOBAL:
- kvm_mmu_unload(vcpu);
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
return kvm_skip_emulated_instruction(vcpu);
default: