author    Stephen Rothwell <sfr@canb.auug.org.au>    2011-04-12 11:33:13 +1000
committer Stephen Rothwell <sfr@canb.auug.org.au>    2011-04-12 11:33:13 +1000
commit    6f485c71e6e0dc3add099f958f9c8cd57df23aba (patch)
tree      fdfe60afe20454ed83ea41d620a5bd904923d407
parent    aa7547198f7a467ce5005add59c8c50c9e814555 (diff)
parent    d4168b8a7f2b7d92d355ec82f8f5e293ea3d81c7 (diff)
Merge remote-tracking branch 'kvm/linux-next'
-rw-r--r--  Documentation/kvm/api.txt              23
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h     1
-rw-r--r--  arch/powerpc/kvm/powerpc.c              4
-rw-r--r--  arch/powerpc/kvm/timing.c              10
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h    102
-rw-r--r--  arch/x86/include/asm/kvm_host.h        43
-rw-r--r--  arch/x86/include/asm/msr-index.h        1
-rw-r--r--  arch/x86/kvm/emulate.c                570
-rw-r--r--  arch/x86/kvm/i8254.h                    1
-rw-r--r--  arch/x86/kvm/mmu.c                     16
-rw-r--r--  arch/x86/kvm/paging_tmpl.h              4
-rw-r--r--  arch/x86/kvm/svm.c                    410
-rw-r--r--  arch/x86/kvm/vmx.c                    107
-rw-r--r--  arch/x86/kvm/x86.c                    273
-rw-r--r--  include/linux/kvm.h                     5
-rw-r--r--  include/linux/kvm_host.h               14
-rw-r--r--  virt/kvm/kvm_main.c                    17
17 files changed, 1323 insertions, 278 deletions
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 9bef4e4cec50..1b9eaa7e8856 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -1263,6 +1263,29 @@ struct kvm_assigned_msix_entry {
__u16 padding[3];
};
+4.54 KVM_SET_TSC_KHZ
+
+Capability: KVM_CAP_TSC_CONTROL
+Architectures: x86
+Type: vcpu ioctl
+Parameters: virtual tsc_khz
+Returns: 0 on success, -1 on error
+
+Specifies the tsc frequency for the virtual machine. The unit of the
+frequency is KHz.
+
+4.55 KVM_GET_TSC_KHZ
+
+Capability: KVM_CAP_GET_TSC_KHZ
+Architectures: x86
+Type: vcpu ioctl
+Parameters: none
+Returns: virtual tsc-khz on success, negative value on error
+
+Returns the tsc frequency of the guest. The unit of the return value is
+KHz. If the host has unstable tsc this ioctl returns -EIO instead as an
+error.
+
5. The kvm_run structure
Application code obtains a pointer to the kvm_run structure by
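For reference, the two ioctls documented above can be driven from userspace roughly as follows. This is only a hypothetical sketch (the helper name, messages and the example frequency are made up; fd creation and most error handling are omitted, and a <linux/kvm.h> new enough to define KVM_SET_TSC_KHZ is assumed):

	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Hypothetical helper: ask for a fixed guest TSC frequency on one vcpu. */
	static void setup_guest_tsc(int kvm_fd, int vcpu_fd, unsigned long want_khz)
	{
		if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_TSC_CONTROL) > 0 &&
		    ioctl(vcpu_fd, KVM_SET_TSC_KHZ, want_khz) < 0)
			perror("KVM_SET_TSC_KHZ");

		int khz = ioctl(vcpu_fd, KVM_GET_TSC_KHZ);	/* needs KVM_CAP_GET_TSC_KHZ */
		if (khz < 0)
			fprintf(stderr, "guest tsc_khz unavailable (unstable host TSC?)\n");
		else
			printf("guest TSC runs at %d kHz\n", khz);
	}

	/* e.g. setup_guest_tsc(kvm_fd, vcpu_fd, 2000000);  -- 2 GHz guest TSC */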
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index bba3b9b72a39..890897cee050 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -255,6 +255,7 @@ struct kvm_vcpu_arch {
u32 dbsr;
#ifdef CONFIG_KVM_EXIT_TIMING
+ struct mutex exit_timing_lock;
struct kvmppc_exit_timing timing_exit;
struct kvmppc_exit_timing timing_last_enter;
u32 last_exit_type;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 99758460efde..ec3d2e75c0a8 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -284,6 +284,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu);
vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
+#ifdef CONFIG_KVM_EXIT_TIMING
+ mutex_init(&vcpu->arch.exit_timing_lock);
+#endif
+
return 0;
}
diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c
index a021f5827a33..18f40fd3e98f 100644
--- a/arch/powerpc/kvm/timing.c
+++ b/arch/powerpc/kvm/timing.c
@@ -34,8 +34,8 @@ void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu)
{
int i;
- /* pause guest execution to avoid concurrent updates */
- mutex_lock(&vcpu->mutex);
+ /* Take a lock to avoid concurrent updates */
+ mutex_lock(&vcpu->arch.exit_timing_lock);
vcpu->arch.last_exit_type = 0xDEAD;
for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) {
@@ -49,7 +49,7 @@ void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu)
vcpu->arch.timing_exit.tv64 = 0;
vcpu->arch.timing_last_enter.tv64 = 0;
- mutex_unlock(&vcpu->mutex);
+ mutex_unlock(&vcpu->arch.exit_timing_lock);
}
static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type)
@@ -65,6 +65,8 @@ static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type)
return;
}
+ mutex_lock(&vcpu->arch.exit_timing_lock);
+
vcpu->arch.timing_count_type[type]++;
/* sum */
@@ -93,6 +95,8 @@ static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type)
vcpu->arch.timing_min_duration[type] = duration;
if (unlikely(duration > vcpu->arch.timing_max_duration[type]))
vcpu->arch.timing_max_duration[type] = duration;
+
+ mutex_unlock(&vcpu->arch.exit_timing_lock);
}
void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0f5213564326..081844860a3d 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -14,6 +14,8 @@
#include <asm/desc_defs.h>
struct x86_emulate_ctxt;
+enum x86_intercept;
+enum x86_intercept_stage;
struct x86_exception {
u8 vector;
@@ -24,6 +26,24 @@ struct x86_exception {
};
/*
+ * This struct is used to carry enough information from the instruction
+ * decoder to main KVM so that a decision can be made whether the
+ * instruction needs to be intercepted or not.
+ */
+struct x86_instruction_info {
+ u8 intercept; /* which intercept */
+ u8 rep_prefix; /* rep prefix? */
+ u8 modrm_mod; /* mod part of modrm */
+ u8 modrm_reg; /* index of register used */
+ u8 modrm_rm; /* rm part of modrm */
+ u64 src_val; /* value of source operand */
+ u8 src_bytes; /* size of source operand */
+ u8 dst_bytes; /* size of destination operand */
+ u8 ad_bytes; /* size of src/dst address */
+ u64 next_rip; /* rip following the instruction */
+};
+
+/*
* x86_emulate_ops:
*
* These operations represent the instruction emulator's interface to memory.
@@ -62,6 +82,7 @@ struct x86_exception {
#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
+#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */
struct x86_emulate_ops {
/*
@@ -158,11 +179,18 @@ struct x86_emulate_ops {
int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu);
int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
+ void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */
+ void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */
+ int (*intercept)(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage);
};
+typedef u32 __attribute__((vector_size(16))) sse128_t;
+
/* Type, address-of, and value of an instruction's operand. */
struct operand {
- enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
+ enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_NONE } type;
unsigned int bytes;
union {
unsigned long orig_val;
@@ -174,11 +202,13 @@ struct operand {
ulong ea;
unsigned seg;
} mem;
+ unsigned xmm;
} addr;
union {
unsigned long val;
u64 val64;
char valptr[sizeof(unsigned long) + 2];
+ sse128_t vec_val;
};
};
@@ -197,6 +227,7 @@ struct read_cache {
struct decode_cache {
u8 twobyte;
u8 b;
+ u8 intercept;
u8 lock_prefix;
u8 rep_prefix;
u8 op_bytes;
@@ -209,6 +240,7 @@ struct decode_cache {
u8 seg_override;
unsigned int d;
int (*execute)(struct x86_emulate_ctxt *ctxt);
+ int (*check_perm)(struct x86_emulate_ctxt *ctxt);
unsigned long regs[NR_VCPU_REGS];
unsigned long eip;
/* modrm */
@@ -238,6 +270,7 @@ struct x86_emulate_ctxt {
/* interruptibility state, as a result of execution of STI or MOV SS */
int interruptibility;
+ bool guest_mode; /* guest running a nested guest */
bool perm_ok; /* do not check permissions if true */
bool only_vendor_specific_insn;
@@ -249,8 +282,8 @@ struct x86_emulate_ctxt {
};
/* Repeat String Operation Prefix */
-#define REPE_PREFIX 1
-#define REPNE_PREFIX 2
+#define REPE_PREFIX 0xf3
+#define REPNE_PREFIX 0xf2
/* Execution mode, passed to the emulator. */
#define X86EMUL_MODE_REAL 0 /* Real mode. */
@@ -259,6 +292,68 @@ struct x86_emulate_ctxt {
#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
+/* any protected mode */
+#define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \
+ X86EMUL_MODE_PROT64)
+
+enum x86_intercept_stage {
+ X86_ICPT_PRE_EXCEPT,
+ X86_ICPT_POST_EXCEPT,
+ X86_ICPT_POST_MEMACCESS,
+};
+
+enum x86_intercept {
+ x86_intercept_none,
+ x86_intercept_cr_read,
+ x86_intercept_cr_write,
+ x86_intercept_clts,
+ x86_intercept_lmsw,
+ x86_intercept_smsw,
+ x86_intercept_dr_read,
+ x86_intercept_dr_write,
+ x86_intercept_lidt,
+ x86_intercept_sidt,
+ x86_intercept_lgdt,
+ x86_intercept_sgdt,
+ x86_intercept_lldt,
+ x86_intercept_sldt,
+ x86_intercept_ltr,
+ x86_intercept_str,
+ x86_intercept_rdtsc,
+ x86_intercept_rdpmc,
+ x86_intercept_pushf,
+ x86_intercept_popf,
+ x86_intercept_cpuid,
+ x86_intercept_rsm,
+ x86_intercept_iret,
+ x86_intercept_intn,
+ x86_intercept_invd,
+ x86_intercept_pause,
+ x86_intercept_hlt,
+ x86_intercept_invlpg,
+ x86_intercept_invlpga,
+ x86_intercept_vmrun,
+ x86_intercept_vmload,
+ x86_intercept_vmsave,
+ x86_intercept_vmmcall,
+ x86_intercept_stgi,
+ x86_intercept_clgi,
+ x86_intercept_skinit,
+ x86_intercept_rdtscp,
+ x86_intercept_icebp,
+ x86_intercept_wbinvd,
+ x86_intercept_monitor,
+ x86_intercept_mwait,
+ x86_intercept_rdmsr,
+ x86_intercept_wrmsr,
+ x86_intercept_in,
+ x86_intercept_ins,
+ x86_intercept_out,
+ x86_intercept_outs,
+
+ nr_x86_intercepts
+};
+
/* Host execution mode. */
#if defined(CONFIG_X86_32)
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
@@ -270,6 +365,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
#define EMULATION_FAILED -1
#define EMULATION_OK 0
#define EMULATION_RESTART 1
+#define EMULATION_INTERCEPTED 2
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
u16 tss_selector, int reason,
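The new intercept hook above hands the decoded-instruction summary (struct x86_instruction_info) to the backend at three points during emulation: X86_ICPT_PRE_EXCEPT before the privilege checks, X86_ICPT_POST_EXCEPT after the per-instruction permission checks, and X86_ICPT_POST_MEMACCESS just before execution (see the x86_emulate_insn() hunks further down). As a sketch of the contract only, a backend with no nested-virtualization support could satisfy it with a callback that never claims an instruction (illustrative, not part of this merge):

	static int noop_check_intercept(struct kvm_vcpu *vcpu,
					struct x86_instruction_info *info,
					enum x86_intercept_stage stage)
	{
		/* Never report X86EMUL_INTERCEPTED; emulation just continues. */
		return X86EMUL_CONTINUE;
	}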
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c8af0991fdf0..bd57639fd5db 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -30,14 +30,30 @@
#define KVM_MEMORY_SLOTS 32
/* memory slots that does not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 4
+#define KVM_MMIO_SIZE 16
#define KVM_PIO_PAGE_OFFSET 1
#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+#define CR0_RESERVED_BITS \
+ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
+ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
+ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
+
#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
0xFFFFFF0000000000ULL)
+#define CR4_RESERVED_BITS \
+ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
+ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
+ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXSAVE \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
+
+#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+
+
#define INVALID_PAGE (~(hpa_t)0)
#define VALID_PAGE(x) ((x) != INVALID_PAGE)
@@ -118,6 +134,8 @@ enum kvm_reg {
enum kvm_reg_ex {
VCPU_EXREG_PDPTR = NR_VCPU_REGS,
VCPU_EXREG_CR3,
+ VCPU_EXREG_RFLAGS,
+ VCPU_EXREG_CPL,
};
enum {
@@ -256,7 +274,7 @@ struct kvm_mmu {
struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, const void *pte, unsigned long mmu_seq);
+ u64 *spte, const void *pte);
hpa_t root_hpa;
int root_level;
int shadow_root_level;
@@ -378,7 +396,10 @@ struct kvm_vcpu_arch {
u64 last_kernel_ns;
u64 last_tsc_nsec;
u64 last_tsc_write;
+ u32 virtual_tsc_khz;
bool tsc_catchup;
+ u32 tsc_catchup_mult;
+ s8 tsc_catchup_shift;
bool nmi_pending;
bool nmi_injected;
@@ -448,9 +469,6 @@ struct kvm_arch {
u64 last_tsc_nsec;
u64 last_tsc_offset;
u64 last_tsc_write;
- u32 virtual_tsc_khz;
- u32 virtual_tsc_mult;
- s8 virtual_tsc_shift;
struct kvm_xen_hvm_config xen_hvm_config;
@@ -502,6 +520,8 @@ struct kvm_vcpu_stat {
u32 nmi_injections;
};
+struct x86_instruction_info;
+
struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void); /* __init */
int (*disabled_by_bios)(void); /* __init */
@@ -586,9 +606,17 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
+ void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+ u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
+
void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
+
+ int (*check_intercept)(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage);
+
const struct trace_print_flags *exit_reasons_str;
};
@@ -627,6 +655,13 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
extern bool tdp_enabled;
+/* control of guest tsc rate supported? */
+extern bool kvm_has_tsc_control;
+/* minimum supported tsc_khz for guests */
+extern u32 kvm_min_guest_tsc_khz;
+/* maximum supported tsc_khz for guests */
+extern u32 kvm_max_guest_tsc_khz;
+
enum emulation_result {
EMULATE_DONE, /* no further processing */
EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index fd5a1f365c95..a7b3e404683c 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -114,6 +114,7 @@
complete list. */
#define MSR_AMD64_PATCH_LEVEL 0x0000008b
+#define MSR_AMD64_TSC_RATIO 0xc0000104
#define MSR_AMD64_NB_CFG 0xc001001f
#define MSR_AMD64_PATCH_LOADER 0xc0010020
#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
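MSR_AMD64_TSC_RATIO holds an 8.32 fixed-point multiplier applied to the guest-visible TSC (1.0 is TSC_RATIO_DEFAULT, 0x0100000000). The svm.c hunks further down compute it as (guest_khz << 32) / host_khz and apply it in __scale_tsc(). A small standalone model of that arithmetic, as a hypothetical user-space program rather than kernel code (the 2 GHz/1 GHz numbers are just an example):

	#include <stdio.h>
	#include <stdint.h>

	/* Mirrors __scale_tsc(): multiply a 64-bit tsc by an 8.32 fixed-point
	 * ratio without needing a 128-bit intermediate. */
	static uint64_t scale_tsc(uint64_t ratio, uint64_t tsc)
	{
		uint64_t mult = ratio >> 32, frac = ratio & 0xffffffffULL;

		return tsc * mult
		     + (tsc >> 32) * frac
		     + (((tsc & 0xffffffffULL) * frac) >> 32);
	}

	int main(void)
	{
		uint64_t host_khz = 2000000, guest_khz = 1000000;	/* 2 GHz host, 1 GHz guest */
		uint64_t ratio = (guest_khz << 32) / host_khz;		/* 0x80000000 == 0.5 */

		/* A host TSC delta of 10,000,000 cycles appears as ~5,000,000 to the guest. */
		printf("ratio=%#llx scaled=%llu\n", (unsigned long long)ratio,
		       (unsigned long long)scale_tsc(ratio, 10000000ULL));
		return 0;
	}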
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0ad47b819a8b..50bffb98ca82 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -75,7 +75,11 @@
#define Stack (1<<13) /* Stack instruction (push/pop) */
#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
+#define Prefix (1<<16) /* Instruction varies with 66/f2/f3 prefix */
+#define Sse (1<<17) /* SSE Vector instruction */
+#define RMExt (1<<18) /* Opcode extension in ModRM r/m if mod == 3 */
/* Misc flags */
+#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */
#define VendorSpecific (1<<22) /* Vendor specific instruction */
#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
@@ -102,11 +106,14 @@
struct opcode {
u32 flags;
+ u8 intercept;
union {
int (*execute)(struct x86_emulate_ctxt *ctxt);
struct opcode *group;
struct group_dual *gdual;
+ struct gprefix *gprefix;
} u;
+ int (*check_perm)(struct x86_emulate_ctxt *ctxt);
};
struct group_dual {
@@ -114,6 +121,13 @@ struct group_dual {
struct opcode mod3[8];
};
+struct gprefix {
+ struct opcode pfx_no;
+ struct opcode pfx_66;
+ struct opcode pfx_f2;
+ struct opcode pfx_f3;
+};
+
/* EFLAGS bit definitions. */
#define EFLG_ID (1<<21)
#define EFLG_VIP (1<<20)
@@ -395,6 +409,26 @@ struct group_dual {
(_eip) += (_size); \
})
+static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
+ enum x86_intercept intercept,
+ enum x86_intercept_stage stage)
+{
+ struct x86_instruction_info info = {
+ .intercept = intercept,
+ .rep_prefix = ctxt->decode.rep_prefix,
+ .modrm_mod = ctxt->decode.modrm_mod,
+ .modrm_reg = ctxt->decode.modrm_reg,
+ .modrm_rm = ctxt->decode.modrm_rm,
+ .src_val = ctxt->decode.src.val64,
+ .src_bytes = ctxt->decode.src.bytes,
+ .dst_bytes = ctxt->decode.dst.bytes,
+ .ad_bytes = ctxt->decode.ad_bytes,
+ .next_rip = ctxt->eip,
+ };
+
+ return ctxt->ops->intercept(ctxt->vcpu, &info, stage);
+}
+
static inline unsigned long ad_mask(struct decode_cache *c)
{
return (1UL << (c->ad_bytes << 3)) - 1;
@@ -476,6 +510,11 @@ static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
return X86EMUL_PROPAGATE_FAULT;
}
+static int emulate_db(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, DB_VECTOR, 0, false);
+}
+
static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
{
return emulate_exception(ctxt, GP_VECTOR, err, true);
@@ -496,6 +535,11 @@ static int emulate_de(struct x86_emulate_ctxt *ctxt)
return emulate_exception(ctxt, DE_VECTOR, 0, false);
}
+static int emulate_nm(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, NM_VECTOR, 0, false);
+}
+
static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
unsigned long eip, u8 *dest)
@@ -623,7 +667,63 @@ static void fetch_register_operand(struct operand *op)
}
}
-static void decode_register_operand(struct operand *op,
+static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
+{
+ ctxt->ops->get_fpu(ctxt);
+ switch (reg) {
+ case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break;
+ case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break;
+ case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break;
+ case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break;
+ case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break;
+ case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break;
+ case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break;
+ case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break;
+ case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break;
+ case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break;
+ case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break;
+ case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break;
+ case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break;
+ case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break;
+ case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break;
+#endif
+ default: BUG();
+ }
+ ctxt->ops->put_fpu(ctxt);
+}
+
+static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
+ int reg)
+{
+ ctxt->ops->get_fpu(ctxt);
+ switch (reg) {
+ case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break;
+ case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break;
+ case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break;
+ case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break;
+ case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break;
+ case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break;
+ case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break;
+ case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break;
+ case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break;
+ case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break;
+ case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break;
+ case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break;
+ case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break;
+ case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break;
+ case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break;
+#endif
+ default: BUG();
+ }
+ ctxt->ops->put_fpu(ctxt);
+}
+
+static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
+ struct operand *op,
struct decode_cache *c,
int inhibit_bytereg)
{
@@ -632,6 +732,15 @@ static void decode_register_operand(struct operand *op,
if (!(c->d & ModRM))
reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
+
+ if (c->d & Sse) {
+ op->type = OP_XMM;
+ op->bytes = 16;
+ op->addr.xmm = reg;
+ read_sse_reg(ctxt, &op->vec_val, reg);
+ return;
+ }
+
op->type = OP_REG;
if ((c->d & ByteOp) && !inhibit_bytereg) {
op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
@@ -671,6 +780,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
op->addr.reg = decode_register(c->modrm_rm,
c->regs, c->d & ByteOp);
+ if (c->d & Sse) {
+ op->type = OP_XMM;
+ op->bytes = 16;
+ op->addr.xmm = c->modrm_rm;
+ read_sse_reg(ctxt, &op->vec_val, c->modrm_rm);
+ return rc;
+ }
fetch_register_operand(op);
return rc;
}
@@ -1098,6 +1214,9 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
if (rc != X86EMUL_CONTINUE)
return rc;
break;
+ case OP_XMM:
+ write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm);
+ break;
case OP_NONE:
/* no writeback */
break;
@@ -2168,7 +2287,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
ctxt->eip = c->eip;
}
- return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+ return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
}
static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
@@ -2306,12 +2425,9 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt)
static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
{
- unsigned cpl = ctxt->ops->cpl(ctxt->vcpu);
struct decode_cache *c = &ctxt->decode;
u64 tsc = 0;
- if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD))
- return emulate_gp(ctxt, 0);
ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
c->regs[VCPU_REGS_RAX] = (u32)tsc;
c->regs[VCPU_REGS_RDX] = tsc >> 32;
@@ -2325,20 +2441,256 @@ static int em_mov(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int em_movdqu(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes);
+ return X86EMUL_CONTINUE;
+}
+
+static bool valid_cr(int nr)
+{
+ switch (nr) {
+ case 0:
+ case 2 ... 4:
+ case 8:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int check_cr_read(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ if (!valid_cr(c->modrm_reg))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_cr_write(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ u64 new_val = c->src.val64;
+ int cr = c->modrm_reg;
+
+ static u64 cr_reserved_bits[] = {
+ 0xffffffff00000000ULL,
+ 0, 0, 0, /* CR3 checked later */
+ CR4_RESERVED_BITS,
+ 0, 0, 0,
+ CR8_RESERVED_BITS,
+ };
+
+ if (!valid_cr(cr))
+ return emulate_ud(ctxt);
+
+ if (new_val & cr_reserved_bits[cr])
+ return emulate_gp(ctxt, 0);
+
+ switch (cr) {
+ case 0: {
+ u64 cr4, efer;
+ if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) ||
+ ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD)))
+ return emulate_gp(ctxt, 0);
+
+ cr4 = ctxt->ops->get_cr(4, ctxt->vcpu);
+ ctxt->ops->get_msr(ctxt->vcpu, MSR_EFER, &efer);
+
+ if ((new_val & X86_CR0_PG) && (efer & EFER_LME) &&
+ !(cr4 & X86_CR4_PAE))
+ return emulate_gp(ctxt, 0);
+
+ break;
+ }
+ case 3: {
+ u64 rsvd = 0;
+
+ if (is_long_mode(ctxt->vcpu))
+ rsvd = CR3_L_MODE_RESERVED_BITS;
+ else if (is_pae(ctxt->vcpu))
+ rsvd = CR3_PAE_RESERVED_BITS;
+ else if (is_paging(ctxt->vcpu))
+ rsvd = CR3_NONPAE_RESERVED_BITS;
+
+ if (new_val & rsvd)
+ return emulate_gp(ctxt, 0);
+
+ break;
+ }
+ case 4: {
+ u64 cr4, efer;
+
+ cr4 = ctxt->ops->get_cr(4, ctxt->vcpu);
+ ctxt->ops->get_msr(ctxt->vcpu, MSR_EFER, &efer);
+
+ if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE))
+ return emulate_gp(ctxt, 0);
+
+ break;
+ }
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long dr7;
+
+ ctxt->ops->get_dr(7, &dr7, ctxt->vcpu);
+
+ /* Check if DR7.Global_Enable is set */
+ return dr7 & (1 << 13);
+}
+
+static int check_dr_read(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int dr = c->modrm_reg;
+ u64 cr4;
+
+ if (dr > 7)
+ return emulate_ud(ctxt);
+
+ cr4 = ctxt->ops->get_cr(4, ctxt->vcpu);
+ if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
+ return emulate_ud(ctxt);
+
+ if (check_dr7_gd(ctxt))
+ return emulate_db(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_dr_write(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ u64 new_val = c->src.val64;
+ int dr = c->modrm_reg;
+
+ if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
+ return emulate_gp(ctxt, 0);
+
+ return check_dr_read(ctxt);
+}
+
+static int check_svme(struct x86_emulate_ctxt *ctxt)
+{
+ u64 efer;
+
+ ctxt->ops->get_msr(ctxt->vcpu, MSR_EFER, &efer);
+
+ if (!(efer & EFER_SVME))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
+{
+ u64 rax = kvm_register_read(ctxt->vcpu, VCPU_REGS_RAX);
+
+ /* Valid physical address? */
+ if (rax & 0xffff000000000000)
+ return emulate_gp(ctxt, 0);
+
+ return check_svme(ctxt);
+}
+
+static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 cr4 = ctxt->ops->get_cr(4, ctxt->vcpu);
+
+ if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt->vcpu))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 cr4 = ctxt->ops->get_cr(4, ctxt->vcpu);
+ u64 rcx = kvm_register_read(ctxt->vcpu, VCPU_REGS_RCX);
+
+ if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt->vcpu)) ||
+ (rcx > 3))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_perm_in(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ c->dst.bytes = min(c->dst.bytes, 4u);
+ if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_perm_out(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ c->src.bytes = min(c->src.bytes, 4u);
+ if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
#define D(_y) { .flags = (_y) }
+#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i }
+#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \
+ .check_perm = (_p) }
#define N D(0)
+#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) }
#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
+#define II(_f, _e, _i) \
+ { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
+#define IIP(_f, _e, _i, _p) \
+ { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \
+ .check_perm = (_p) }
+#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) }
#define D2bv(_f) D((_f) | ByteOp), D(_f)
+#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \
D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \
D2bv(((_f) & ~Lock) | DstAcc | SrcImm)
+static struct opcode group7_rm1[] = {
+ DI(SrcNone | ModRM | Priv, monitor),
+ DI(SrcNone | ModRM | Priv, mwait),
+ N, N, N, N, N, N,
+};
+
+static struct opcode group7_rm3[] = {
+ DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa),
+ DI(SrcNone | ModRM | Prot | VendorSpecific, vmmcall),
+ DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa),
+ DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa),
+ DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme),
+ DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme),
+ DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme),
+ DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme),
+};
+static struct opcode group7_rm7[] = {
+ N,
+ DIP(SrcNone | ModRM, rdtscp, check_rdtsc),
+ N, N, N, N, N, N,
+};
static struct opcode group1[] = {
X7(D(Lock)), N
};
@@ -2366,16 +2718,26 @@ static struct opcode group5[] = {
D(SrcMem | ModRM | Stack), N,
};
+static struct opcode group6[] = {
+ DI(ModRM | Prot, sldt),
+ DI(ModRM | Prot, str),
+ DI(ModRM | Prot | Priv, lldt),
+ DI(ModRM | Prot | Priv, ltr),
+ N, N, N, N,
+};
+
static struct group_dual group7 = { {
- N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv),
- D(SrcNone | ModRM | DstMem | Mov), N,
- D(SrcMem16 | ModRM | Mov | Priv),
- D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
+ DI(ModRM | Mov | DstMem | Priv, sgdt),
+ DI(ModRM | Mov | DstMem | Priv, sidt),
+ DI(ModRM | SrcMem | Priv, lgdt), DI(ModRM | SrcMem | Priv, lidt),
+ DI(SrcNone | ModRM | DstMem | Mov, smsw), N,
+ DI(SrcMem16 | ModRM | Mov | Priv, lmsw),
+ DI(SrcMem | ModRM | ByteOp | Priv | NoAccess, invlpg),
}, {
- D(SrcNone | ModRM | Priv | VendorSpecific), N,
- N, D(SrcNone | ModRM | Priv | VendorSpecific),
- D(SrcNone | ModRM | DstMem | Mov), N,
- D(SrcMem16 | ModRM | Mov | Priv), N,
+ D(SrcNone | ModRM | Priv | VendorSpecific), EXT(0, group7_rm1),
+ N, EXT(0, group7_rm3),
+ DI(SrcNone | ModRM | DstMem | Mov, smsw), N,
+ DI(SrcMem16 | ModRM | Mov | Priv, lmsw), EXT(0, group7_rm7),
} };
static struct opcode group8[] = {
@@ -2394,6 +2756,10 @@ static struct opcode group11[] = {
I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
};
+static struct gprefix pfx_0f_6f_0f_7f = {
+ N, N, N, I(Sse, em_movdqu),
+};
+
static struct opcode opcode_table[256] = {
/* 0x00 - 0x07 */
D6ALU(Lock),
@@ -2430,8 +2796,8 @@ static struct opcode opcode_table[256] = {
I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
I(SrcImmByte | Mov | Stack, em_push),
I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
- D2bv(DstDI | Mov | String), /* insb, insw/insd */
- D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */
+ D2bvIP(DstDI | Mov | String, ins, check_perm_in), /* insb, insw/insd */
+ D2bvIP(SrcSI | ImplicitOps | String, outs, check_perm_out), /* outsb, outsw/outsd */
/* 0x70 - 0x7F */
X16(D(SrcImmByte)),
/* 0x80 - 0x87 */
@@ -2446,11 +2812,11 @@ static struct opcode opcode_table[256] = {
D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
/* 0x90 - 0x97 */
- X8(D(SrcAcc | DstReg)),
+ DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
/* 0x98 - 0x9F */
D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
I(SrcImmFAddr | No64, em_call_far), N,
- D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
+ DI(ImplicitOps | Stack, pushf), DI(ImplicitOps | Stack, popf), N, N,
/* 0xA0 - 0xA7 */
I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
@@ -2473,7 +2839,8 @@ static struct opcode opcode_table[256] = {
G(ByteOp, group11), G(0, group11),
/* 0xC8 - 0xCF */
N, N, N, D(ImplicitOps | Stack),
- D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps),
+ D(ImplicitOps), DI(SrcImmByte, intn),
+ D(ImplicitOps | No64), DI(ImplicitOps, iret),
/* 0xD0 - 0xD7 */
D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
N, N, N, N,
@@ -2481,14 +2848,17 @@ static struct opcode opcode_table[256] = {
N, N, N, N, N, N, N, N,
/* 0xE0 - 0xE7 */
X4(D(SrcImmByte)),
- D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte),
+ D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in),
+ D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),
/* 0xE8 - 0xEF */
D(SrcImm | Stack), D(SrcImm | ImplicitOps),
D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
- D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps),
+ D2bvIP(SrcNone | DstAcc, in, check_perm_in),
+ D2bvIP(SrcAcc | ImplicitOps, out, check_perm_out),
/* 0xF0 - 0xF7 */
- N, N, N, N,
- D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3),
+ N, DI(ImplicitOps, icebp), N, N,
+ DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
+ G(ByteOp, group3), G(0, group3),
/* 0xF8 - 0xFF */
D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps),
D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
@@ -2496,20 +2866,24 @@ static struct opcode opcode_table[256] = {
static struct opcode twobyte_table[256] = {
/* 0x00 - 0x0F */
- N, GD(0, &group7), N, N,
- N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N,
- D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
+ G(0, group6), GD(0, &group7), N, N,
+ N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N,
+ DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
N, D(ImplicitOps | ModRM), N, N,
/* 0x10 - 0x1F */
N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
/* 0x20 - 0x2F */
- D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264),
- D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264),
+ DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read),
+ DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read),
+ DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write),
+ DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write),
N, N, N, N,
N, N, N, N, N, N, N, N,
/* 0x30 - 0x3F */
- D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
- D(ImplicitOps | Priv), N,
+ DI(ImplicitOps | Priv, wrmsr),
+ IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
+ DI(ImplicitOps | Priv, rdmsr),
+ DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific),
N, N,
N, N, N, N, N, N, N, N,
@@ -2518,21 +2892,27 @@ static struct opcode twobyte_table[256] = {
/* 0x50 - 0x5F */
N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
/* 0x60 - 0x6F */
- N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f),
/* 0x70 - 0x7F */
- N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
/* 0x80 - 0x8F */
X16(D(SrcImm)),
/* 0x90 - 0x9F */
X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
/* 0xA0 - 0xA7 */
D(ImplicitOps | Stack), D(ImplicitOps | Stack),
- N, D(DstMem | SrcReg | ModRM | BitOp),
+ DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp),
D(DstMem | SrcReg | Src2ImmByte | ModRM),
D(DstMem | SrcReg | Src2CL | ModRM), N, N,
/* 0xA8 - 0xAF */
D(ImplicitOps | Stack), D(ImplicitOps | Stack),
- N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
+ DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock),
D(DstMem | SrcReg | Src2ImmByte | ModRM),
D(DstMem | SrcReg | Src2CL | ModRM),
D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
@@ -2564,8 +2944,11 @@ static struct opcode twobyte_table[256] = {
#undef G
#undef GD
#undef I
+#undef GP
+#undef EXT
#undef D2bv
+#undef D2bvIP
#undef I2bv
#undef D6ALU
@@ -2625,7 +3008,8 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
struct decode_cache *c = &ctxt->decode;
int rc = X86EMUL_CONTINUE;
int mode = ctxt->mode;
- int def_op_bytes, def_ad_bytes, dual, goffset;
+ int def_op_bytes, def_ad_bytes, dual, goffset, simd_prefix;
+ bool op_prefix = false;
struct opcode opcode, *g_mod012, *g_mod3;
struct operand memop = { .type = OP_NONE };
@@ -2662,6 +3046,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
for (;;) {
switch (c->b = insn_fetch(u8, 1, c->eip)) {
case 0x66: /* operand-size override */
+ op_prefix = true;
/* switch between 2/4 bytes */
c->op_bytes = def_op_bytes ^ 6;
break;
@@ -2692,10 +3077,8 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
c->lock_prefix = 1;
break;
case 0xf2: /* REPNE/REPNZ */
- c->rep_prefix = REPNE_PREFIX;
- break;
case 0xf3: /* REP/REPE/REPZ */
- c->rep_prefix = REPE_PREFIX;
+ c->rep_prefix = c->b;
break;
default:
goto done_prefixes;
@@ -2741,10 +3124,31 @@ done_prefixes:
opcode = g_mod3[goffset];
else
opcode = g_mod012[goffset];
+
+ if (opcode.flags & RMExt) {
+ goffset = c->modrm & 7;
+ opcode = opcode.u.group[goffset];
+ }
+
+ c->d |= opcode.flags;
+ }
+
+ if (c->d & Prefix) {
+ if (c->rep_prefix && op_prefix)
+ return X86EMUL_UNHANDLEABLE;
+ simd_prefix = op_prefix ? 0x66 : c->rep_prefix;
+ switch (simd_prefix) {
+ case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
+ case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
+ case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break;
+ case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break;
+ }
c->d |= opcode.flags;
}
c->execute = opcode.u.execute;
+ c->check_perm = opcode.check_perm;
+ c->intercept = opcode.intercept;
/* Unrecognised? */
if (c->d == 0 || (c->d & Undefined))
@@ -2763,6 +3167,9 @@ done_prefixes:
c->op_bytes = 4;
}
+ if (c->d & Sse)
+ c->op_bytes = 16;
+
/* ModRM and SIB bytes. */
if (c->d & ModRM) {
rc = decode_modrm(ctxt, ops, &memop);
@@ -2792,7 +3199,7 @@ done_prefixes:
case SrcNone:
break;
case SrcReg:
- decode_register_operand(&c->src, c, 0);
+ decode_register_operand(ctxt, &c->src, c, 0);
break;
case SrcMem16:
memop.bytes = 2;
@@ -2883,7 +3290,7 @@ done_prefixes:
/* Decode and fetch the destination operand: register or memory. */
switch (c->d & DstMask) {
case DstReg:
- decode_register_operand(&c->dst, c,
+ decode_register_operand(ctxt, &c->dst, c,
c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
break;
case DstImmUByte:
@@ -2926,7 +3333,7 @@ done_prefixes:
}
done:
- return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+ return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
}
static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
@@ -2979,12 +3386,51 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
+ if ((c->d & Sse)
+ && ((ops->get_cr(0, ctxt->vcpu) & X86_CR0_EM)
+ || !(ops->get_cr(4, ctxt->vcpu) & X86_CR4_OSFXSR))) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if ((c->d & Sse) && (ops->get_cr(0, ctxt->vcpu) & X86_CR0_TS)) {
+ rc = emulate_nm(ctxt);
+ goto done;
+ }
+
+ if (unlikely(ctxt->guest_mode) && c->intercept) {
+ rc = emulator_check_intercept(ctxt, c->intercept,
+ X86_ICPT_PRE_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
/* Privileged instruction can be executed only in CPL=0 */
if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
rc = emulate_gp(ctxt, 0);
goto done;
}
+ /* Instruction can only be executed in protected mode */
+ if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ /* Do instruction specific permission checks */
+ if (c->check_perm) {
+ rc = c->check_perm(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (unlikely(ctxt->guest_mode) && c->intercept) {
+ rc = emulator_check_intercept(ctxt, c->intercept,
+ X86_ICPT_POST_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
if (c->rep_prefix && (c->d & String)) {
/* All REP prefixes have the same first termination condition */
if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
@@ -3023,6 +3469,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
special_insn:
+ if (unlikely(ctxt->guest_mode) && c->intercept) {
+ rc = emulator_check_intercept(ctxt, c->intercept,
+ X86_ICPT_POST_MEMACCESS);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
if (c->execute) {
rc = c->execute(ctxt);
if (rc != X86EMUL_CONTINUE)
@@ -3304,11 +3757,6 @@ special_insn:
case 0xed: /* in (e/r)ax,dx */
c->src.val = c->regs[VCPU_REGS_RDX];
do_io_in:
- c->dst.bytes = min(c->dst.bytes, 4u);
- if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
- rc = emulate_gp(ctxt, 0);
- goto done;
- }
if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
&c->dst.val))
goto done; /* IO is needed */
@@ -3317,12 +3765,6 @@ special_insn:
case 0xef: /* out dx,(e/r)ax */
c->dst.val = c->regs[VCPU_REGS_RDX];
do_io_out:
- c->src.bytes = min(c->src.bytes, 4u);
- if (!emulator_io_permited(ctxt, ops, c->dst.val,
- c->src.bytes)) {
- rc = emulate_gp(ctxt, 0);
- goto done;
- }
ops->pio_out_emulated(c->src.bytes, c->dst.val,
&c->src.val, 1, ctxt->vcpu);
c->dst.type = OP_NONE; /* Disable writeback. */
@@ -3427,6 +3869,9 @@ writeback:
done:
if (rc == X86EMUL_PROPAGATE_FAULT)
ctxt->have_exception = true;
+ if (rc == X86EMUL_INTERCEPTED)
+ return EMULATION_INTERCEPTED;
+
return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
twobyte_insn:
@@ -3515,23 +3960,9 @@ twobyte_insn:
case 0x18: /* Grp16 (prefetch/nop) */
break;
case 0x20: /* mov cr, reg */
- switch (c->modrm_reg) {
- case 1:
- case 5 ... 7:
- case 9 ... 15:
- emulate_ud(ctxt);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- }
c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
break;
case 0x21: /* mov from dr to reg */
- if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
- (c->modrm_reg == 4 || c->modrm_reg == 5)) {
- emulate_ud(ctxt);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- }
ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
break;
case 0x22: /* mov reg, cr */
@@ -3543,13 +3974,6 @@ twobyte_insn:
c->dst.type = OP_NONE;
break;
case 0x23: /* mov from reg to dr */
- if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
- (c->modrm_reg == 4 || c->modrm_reg == 5)) {
- emulate_ud(ctxt);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- }
-
if (ops->set_dr(c->modrm_reg, c->src.val &
((ctxt->mode == X86EMUL_MODE_PROT64) ?
~0ULL : ~0U), ctxt->vcpu) < 0) {
@@ -3739,5 +4163,5 @@ twobyte_insn:
goto writeback;
cannot_emulate:
- return -1;
+ return EMULATION_FAILED;
}
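To make the new decode-table entries above a little easier to read: the DI/DIP/II/IIP helpers simply pre-fill the intercept id and the per-instruction permission callback in struct opcode. Informally (expansion written out by hand, not code from the patch), an entry such as

	DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read)

stands for

	{
		.flags      = ModRM | DstMem | Priv | Op3264,
		.intercept  = x86_intercept_cr_read,	/* passed to ->check_intercept for nested guests */
		.check_perm = check_cr_read,		/* instruction-specific #GP/#UD checks */
	}

so x86_decode_insn() can copy .intercept and .check_perm into the decode cache and x86_emulate_insn() can run them without per-instruction special casing.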
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 46d08ca0b48f..b681a9f78c5e 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -33,7 +33,6 @@ struct kvm_kpit_state {
};
struct kvm_pit {
- unsigned long base_addresss;
struct kvm_io_device dev;
struct kvm_io_device speaker_dev;
struct kvm *kvm;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 22fae7593ee7..28418054b880 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1206,7 +1206,7 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
- const void *pte, unsigned long mmu_seq)
+ const void *pte)
{
WARN_ON(1);
}
@@ -3163,9 +3163,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
}
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp,
- u64 *spte,
- const void *new, unsigned long mmu_seq)
+ struct kvm_mmu_page *sp, u64 *spte,
+ const void *new)
{
if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
++vcpu->kvm->stat.mmu_pde_zapped;
@@ -3173,7 +3172,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
}
++vcpu->kvm->stat.mmu_pte_updated;
- vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq);
+ vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
}
static bool need_remote_flush(u64 old, u64 new)
@@ -3229,7 +3228,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
struct kvm_mmu_page *sp;
struct hlist_node *node;
LIST_HEAD(invalid_list);
- unsigned long mmu_seq;
u64 entry, gentry, *spte;
unsigned pte_size, page_offset, misaligned, quadrant, offset;
int level, npte, invlpg_counter, r, flooded = 0;
@@ -3271,9 +3269,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
break;
}
- mmu_seq = vcpu->kvm->mmu_notifier_seq;
- smp_rmb();
-
spin_lock(&vcpu->kvm->mmu_lock);
if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
gentry = 0;
@@ -3345,8 +3340,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (gentry &&
!((sp->role.word ^ vcpu->arch.mmu.base_role.word)
& mask.word))
- mmu_pte_write_new_pte(vcpu, sp, spte, &gentry,
- mmu_seq);
+ mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
if (!remote_flush && need_remote_flush(entry, *spte))
remote_flush = true;
++spte;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index c6397795d865..74f8567d57ac 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -325,7 +325,7 @@ no_present:
}
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, const void *pte, unsigned long mmu_seq)
+ u64 *spte, const void *pte)
{
pt_element_t gpte;
unsigned pte_access;
@@ -342,8 +342,6 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
kvm_release_pfn_clean(pfn);
return;
}
- if (mmu_notifier_retry(vcpu, mmu_seq))
- return;
/*
* we call mmu_set_spte() with host_writable = true because that
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6bb15d583e47..a6bf2ad7429c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -63,6 +63,10 @@ MODULE_LICENSE("GPL");
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+#define TSC_RATIO_RSVD 0xffffff0000000000ULL
+#define TSC_RATIO_MIN 0x0000000000000001ULL
+#define TSC_RATIO_MAX 0x000000ffffffffffULL
+
static bool erratum_383_found __read_mostly;
static const u32 host_save_user_msrs[] = {
@@ -93,14 +97,6 @@ struct nested_state {
/* A VMEXIT is required but not yet emulated */
bool exit_required;
- /*
- * If we vmexit during an instruction emulation we need this to restore
- * the l1 guest rip after the emulation
- */
- unsigned long vmexit_rip;
- unsigned long vmexit_rsp;
- unsigned long vmexit_rax;
-
/* cache for intercepts of the guest */
u32 intercept_cr;
u32 intercept_dr;
@@ -144,8 +140,13 @@ struct vcpu_svm {
unsigned int3_injected;
unsigned long int3_rip;
u32 apf_reason;
+
+ u64 tsc_ratio;
};
+static DEFINE_PER_CPU(u64, current_tsc_ratio);
+#define TSC_RATIO_DEFAULT 0x0100000000ULL
+
#define MSR_INVALID 0xffffffffU
static struct svm_direct_access_msrs {
@@ -190,6 +191,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm);
static int nested_svm_vmexit(struct vcpu_svm *svm);
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
+static u64 __scale_tsc(u64 ratio, u64 tsc);
enum {
VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
@@ -376,7 +378,6 @@ struct svm_cpu_data {
};
static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
-static uint32_t svm_features;
struct svm_init_data {
int cpu;
@@ -569,6 +570,10 @@ static int has_svm(void)
static void svm_hardware_disable(void *garbage)
{
+ /* Make sure we clean up behind us */
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
+ wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
+
cpu_svm_disable();
}
@@ -610,6 +615,11 @@ static int svm_hardware_enable(void *garbage)
wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
+ __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
+ }
+
svm_init_erratum_383();
return 0;
@@ -791,6 +801,23 @@ static __init int svm_hardware_setup(void)
if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
kvm_enable_efer_bits(EFER_FFXSR);
+ if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ u64 max;
+
+ kvm_has_tsc_control = true;
+
+ /*
+ * Make sure the user can only configure tsc_khz values that
+ * fit into a signed integer.
+ * A min value is not calculated because it will always
+ * be 1 on all machines and a value of 0 is used to disable
+ * tsc-scaling for the vcpu.
+ */
+ max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX));
+
+ kvm_max_guest_tsc_khz = max;
+ }
+
if (nested) {
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
@@ -802,8 +829,6 @@ static __init int svm_hardware_setup(void)
goto err;
}
- svm_features = cpuid_edx(SVM_CPUID_FUNC);
-
if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
@@ -854,6 +879,64 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
seg->base = 0;
}
+static u64 __scale_tsc(u64 ratio, u64 tsc)
+{
+ u64 mult, frac, _tsc;
+
+ mult = ratio >> 32;
+ frac = ratio & ((1ULL << 32) - 1);
+
+ _tsc = tsc;
+ _tsc *= mult;
+ _tsc += (tsc >> 32) * frac;
+ _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32;
+
+ return _tsc;
+}
+
+static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 _tsc = tsc;
+
+ if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
+ _tsc = __scale_tsc(svm->tsc_ratio, tsc);
+
+ return _tsc;
+}
+
+static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 ratio;
+ u64 khz;
+
+ /* TSC scaling supported? */
+ if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR))
+ return;
+
+ /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */
+ if (user_tsc_khz == 0) {
+ vcpu->arch.virtual_tsc_khz = 0;
+ svm->tsc_ratio = TSC_RATIO_DEFAULT;
+ return;
+ }
+
+ khz = user_tsc_khz;
+
+ /* TSC scaling required - calculate ratio */
+ ratio = khz << 32;
+ do_div(ratio, tsc_khz);
+
+ if (ratio == 0 || ratio & TSC_RATIO_RSVD) {
+ WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n",
+ user_tsc_khz);
+ return;
+ }
+ vcpu->arch.virtual_tsc_khz = user_tsc_khz;
+ svm->tsc_ratio = ratio;
+}
+
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -880,6 +963,15 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
+static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+{
+ u64 tsc;
+
+ tsc = svm_scale_tsc(vcpu, native_read_tsc());
+
+ return target_tsc - tsc;
+}
+
static void init_vmcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
@@ -975,7 +1067,7 @@ static void init_vmcb(struct vcpu_svm *svm)
svm_set_efer(&svm->vcpu, 0);
save->dr6 = 0xffff0ff0;
save->dr7 = 0x400;
- save->rflags = 2;
+ kvm_set_rflags(&svm->vcpu, 2);
save->rip = 0x0000fff0;
svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
@@ -1048,6 +1140,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
goto out;
}
+ svm->tsc_ratio = TSC_RATIO_DEFAULT;
+
err = kvm_vcpu_init(&svm->vcpu, kvm, id);
if (err)
goto free_svm;
@@ -1141,6 +1235,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR) &&
+ svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) {
+ __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio;
+ wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio);
+ }
}
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1365,31 +1465,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_guest_mode(vcpu)) {
- /*
- * We are here because we run in nested mode, the host kvm
- * intercepts cr0 writes but the l1 hypervisor does not.
- * But the L1 hypervisor may intercept selective cr0 writes.
- * This needs to be checked here.
- */
- unsigned long old, new;
-
- /* Remove bits that would trigger a real cr0 write intercept */
- old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
- new = cr0 & SVM_CR0_SELECTIVE_MASK;
-
- if (old == new) {
- /* cr0 write with ts and mp unchanged */
- svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
- if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) {
- svm->nested.vmexit_rip = kvm_rip_read(vcpu);
- svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
- svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
- return;
- }
- }
- }
-
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
@@ -2127,7 +2202,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
nested_vmcb->save.cr2 = vmcb->save.cr2;
nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
- nested_vmcb->save.rflags = vmcb->save.rflags;
+ nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
nested_vmcb->save.rip = vmcb->save.rip;
nested_vmcb->save.rsp = vmcb->save.rsp;
nested_vmcb->save.rax = vmcb->save.rax;
@@ -2184,7 +2259,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vmcb->save.ds = hsave->save.ds;
svm->vmcb->save.gdtr = hsave->save.gdtr;
svm->vmcb->save.idtr = hsave->save.idtr;
- svm->vmcb->save.rflags = hsave->save.rflags;
+ kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
svm_set_efer(&svm->vcpu, hsave->save.efer);
svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
svm_set_cr4(&svm->vcpu, hsave->save.cr4);
@@ -2312,7 +2387,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
hsave->save.efer = svm->vcpu.arch.efer;
hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
hsave->save.cr4 = svm->vcpu.arch.cr4;
- hsave->save.rflags = vmcb->save.rflags;
+ hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
hsave->save.rip = kvm_rip_read(&svm->vcpu);
hsave->save.rsp = vmcb->save.rsp;
hsave->save.rax = vmcb->save.rax;
@@ -2323,7 +2398,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
copy_vmcb_control_area(hsave, vmcb);
- if (svm->vmcb->save.rflags & X86_EFLAGS_IF)
+ if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
svm->vcpu.arch.hflags |= HF_HIF_MASK;
else
svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
@@ -2341,7 +2416,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
svm->vmcb->save.ds = nested_vmcb->save.ds;
svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
svm->vmcb->save.idtr = nested_vmcb->save.idtr;
- svm->vmcb->save.rflags = nested_vmcb->save.rflags;
+ kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
@@ -2443,13 +2518,13 @@ static int vmload_interception(struct vcpu_svm *svm)
if (nested_svm_check_permissions(svm))
return 1;
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
if (!nested_vmcb)
return 1;
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+
nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
nested_svm_unmap(page);
@@ -2464,13 +2539,13 @@ static int vmsave_interception(struct vcpu_svm *svm)
if (nested_svm_check_permissions(svm))
return 1;
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
if (!nested_vmcb)
return 1;
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+
nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
nested_svm_unmap(page);
@@ -2676,6 +2751,29 @@ static int emulate_on_interception(struct vcpu_svm *svm)
return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
}
+bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val)
+{
+ unsigned long cr0 = svm->vcpu.arch.cr0;
+ bool ret = false;
+ u64 intercept;
+
+ intercept = svm->nested.intercept;
+
+ if (!is_guest_mode(&svm->vcpu) ||
+ (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
+ return false;
+
+ cr0 &= ~SVM_CR0_SELECTIVE_MASK;
+ val &= ~SVM_CR0_SELECTIVE_MASK;
+
+ if (cr0 ^ val) {
+ svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+ ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
+ }
+
+ return ret;
+}
+
#define CR_VALID (1ULL << 63)
static int cr_interception(struct vcpu_svm *svm)
@@ -2699,7 +2797,8 @@ static int cr_interception(struct vcpu_svm *svm)
val = kvm_register_read(&svm->vcpu, reg);
switch (cr) {
case 0:
- err = kvm_set_cr0(&svm->vcpu, val);
+ if (!check_selective_cr0_intercepted(svm, val))
+ err = kvm_set_cr0(&svm->vcpu, val);
break;
case 3:
err = kvm_set_cr3(&svm->vcpu, val);
@@ -2744,23 +2843,6 @@ static int cr_interception(struct vcpu_svm *svm)
return 1;
}
-static int cr0_write_interception(struct vcpu_svm *svm)
-{
- struct kvm_vcpu *vcpu = &svm->vcpu;
- int r;
-
- r = cr_interception(svm);
-
- if (svm->nested.vmexit_rip) {
- kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
- kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp);
- kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax);
- svm->nested.vmexit_rip = 0;
- }
-
- return r;
-}
-
static int dr_interception(struct vcpu_svm *svm)
{
int reg, dr;
@@ -2813,7 +2895,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
case MSR_IA32_TSC: {
struct vmcb *vmcb = get_host_vmcb(svm);
- *data = vmcb->control.tsc_offset + native_read_tsc();
+ *data = vmcb->control.tsc_offset +
+ svm_scale_tsc(vcpu, native_read_tsc());
+
break;
}
case MSR_STAR:
@@ -3048,7 +3132,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_READ_CR4] = cr_interception,
[SVM_EXIT_READ_CR8] = cr_interception,
[SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR0] = cr0_write_interception,
+ [SVM_EXIT_WRITE_CR0] = cr_interception,
[SVM_EXIT_WRITE_CR3] = cr_interception,
[SVM_EXIT_WRITE_CR4] = cr_interception,
[SVM_EXIT_WRITE_CR8] = cr8_write_interception,
@@ -3384,7 +3468,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
return 0;
- ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
+ ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
if (is_guest_mode(vcpu))
return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
@@ -3871,6 +3955,190 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
update_cr0_intercept(svm);
}
+#define PRE_EX(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_PRE_EXCEPT, \
+ .valid = true }
+#define POST_EX(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_POST_EXCEPT, \
+ .valid = true }
+#define POST_MEM(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_POST_MEMACCESS, \
+ .valid = true }
+
+static struct __x86_intercept {
+ u32 exit_code;
+ enum x86_intercept_stage stage;
+ bool valid;
+} x86_intercept_map[] = {
+ [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
+ [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
+ [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
+ [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
+ [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
+ [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
+ [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
+ [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
+ [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
+ [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
+ [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
+ [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
+ [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
+ [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
+ [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
+ [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
+ [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
+ [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
+ [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
+ [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
+ [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
+ [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
+ [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
+ [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
+ [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
+ [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
+ [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
+ [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
+ [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
+ [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
+ [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
+ [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
+ [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
+ [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
+ [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
+ [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
+ [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
+ [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
+ [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
+ [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
+};
+
+#undef PRE_EX
+#undef POST_EX
+#undef POST_MEM
+
+static int svm_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int vmexit, ret = X86EMUL_CONTINUE;
+ struct __x86_intercept icpt_info;
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
+ goto out;
+
+ icpt_info = x86_intercept_map[info->intercept];
+
+ if (!icpt_info.valid || stage != icpt_info.stage)
+ goto out;
+
+ switch (icpt_info.exit_code) {
+ case SVM_EXIT_READ_CR0:
+ if (info->intercept == x86_intercept_cr_read)
+ icpt_info.exit_code += info->modrm_reg;
+ break;
+ case SVM_EXIT_WRITE_CR0: {
+ unsigned long cr0, val;
+ u64 intercept;
+
+ if (info->intercept == x86_intercept_cr_write)
+ icpt_info.exit_code += info->modrm_reg;
+
+ if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0)
+ break;
+
+ intercept = svm->nested.intercept;
+
+ if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
+ break;
+
+ cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
+ val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
+
+ if (info->intercept == x86_intercept_lmsw) {
+ cr0 &= 0xfUL;
+ val &= 0xfUL;
+ /* lmsw can't clear PE - catch this here */
+ if (cr0 & X86_CR0_PE)
+ val |= X86_CR0_PE;
+ }
+
+ if (cr0 ^ val)
+ icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+
+ break;
+ }
+ case SVM_EXIT_READ_DR0:
+ case SVM_EXIT_WRITE_DR0:
+ icpt_info.exit_code += info->modrm_reg;
+ break;
+ case SVM_EXIT_MSR:
+ if (info->intercept == x86_intercept_wrmsr)
+ vmcb->control.exit_info_1 = 1;
+ else
+ vmcb->control.exit_info_1 = 0;
+ break;
+ case SVM_EXIT_PAUSE:
+ /*
+ * The emulator only reports this intercept for NOP, but
+ * PAUSE is REP NOP, so check for the REP prefix here.
+ */
+ if (info->rep_prefix != REPE_PREFIX)
+ goto out;
+ case SVM_EXIT_IOIO: {
+ u64 exit_info;
+ u32 bytes;
+
+ exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16;
+
+ if (info->intercept == x86_intercept_in ||
+ info->intercept == x86_intercept_ins) {
+ exit_info |= SVM_IOIO_TYPE_MASK;
+ bytes = info->src_bytes;
+ } else {
+ bytes = info->dst_bytes;
+ }
+
+ if (info->intercept == x86_intercept_outs ||
+ info->intercept == x86_intercept_ins)
+ exit_info |= SVM_IOIO_STR_MASK;
+
+ if (info->rep_prefix)
+ exit_info |= SVM_IOIO_REP_MASK;
+
+ bytes = min(bytes, 4u);
+
+ exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
+
+ exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
+
+ vmcb->control.exit_info_1 = exit_info;
+ vmcb->control.exit_info_2 = info->next_rip;
+
+ break;
+ }
+ default:
+ break;
+ }
+
+ vmcb->control.next_rip = info->next_rip;
+ vmcb->control.exit_code = icpt_info.exit_code;
+ vmexit = nested_svm_exit_handled(svm);
+
+ ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
+ : X86EMUL_CONTINUE;
+
+out:
+ return ret;
+}
+
static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -3952,10 +4220,14 @@ static struct kvm_x86_ops svm_x86_ops = {
.has_wbinvd_exit = svm_has_wbinvd_exit,
+ .set_tsc_khz = svm_set_tsc_khz,
.write_tsc_offset = svm_write_tsc_offset,
.adjust_tsc_offset = svm_adjust_tsc_offset,
+ .compute_tsc_offset = svm_compute_tsc_offset,
.set_tdp_cr3 = set_tdp_cr3,
+
+ .check_intercept = svm_check_intercept,
};
static int __init svm_init(void)
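
The selective-CR0 logic added above (check_selective_cr0_intercepted() and the
SVM_EXIT_WRITE_CR0 case of svm_check_intercept()) boils down to one comparison:
bits covered by the selective mask are ignored, and only a change in the
remaining CR0 bits is reported to the nested hypervisor as
SVM_EXIT_CR0_SEL_WRITE. A minimal standalone sketch of that decision follows;
it assumes SVM_CR0_SELECTIVE_MASK covers CR0.TS and CR0.MP as defined elsewhere
in svm.c, and the helper name is illustrative rather than part of this patch.

    #include <stdbool.h>

    #define X86_CR0_MP (1UL << 1)
    #define X86_CR0_TS (1UL << 3)
    /* assumed to match the svm.c definition */
    #define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)

    /* Hypothetical helper mirroring check_selective_cr0_intercepted():
     * returns true when a CR0 write should raise SVM_EXIT_CR0_SEL_WRITE. */
    static bool cr0_write_needs_sel_exit(unsigned long old_cr0,
                                         unsigned long new_cr0)
    {
            old_cr0 &= ~SVM_CR0_SELECTIVE_MASK;   /* ignore TS and MP      */
            new_cr0 &= ~SVM_CR0_SELECTIVE_MASK;

            return old_cr0 != new_cr0;            /* any other bit changed */
    }

A write that only toggles CR0.TS or CR0.MP therefore never triggers the
selective intercept, which is exactly what lets cr_interception() skip
kvm_set_cr0() only when the nested exit is actually taken.
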
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5b4cdcbd154c..aabe3334d064 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -128,8 +128,11 @@ struct vcpu_vmx {
unsigned long host_rsp;
int launched;
u8 fail;
+ u8 cpl;
+ bool nmi_known_unmasked;
u32 exit_intr_info;
u32 idt_vectoring_info;
+ ulong rflags;
struct shared_msr_entry *guest_msrs;
int nmsrs;
int save_nmsrs;
@@ -970,17 +973,24 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags, save_rflags;
- rflags = vmcs_readl(GUEST_RFLAGS);
- if (to_vmx(vcpu)->rmode.vm86_active) {
- rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
- save_rflags = to_vmx(vcpu)->rmode.save_rflags;
- rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
+ if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
+ __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
+ rflags = vmcs_readl(GUEST_RFLAGS);
+ if (to_vmx(vcpu)->rmode.vm86_active) {
+ rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
+ save_rflags = to_vmx(vcpu)->rmode.save_rflags;
+ rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
+ }
+ to_vmx(vcpu)->rflags = rflags;
}
- return rflags;
+ return to_vmx(vcpu)->rflags;
}
static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
+ __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
+ __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
+ to_vmx(vcpu)->rflags = rflags;
if (to_vmx(vcpu)->rmode.vm86_active) {
to_vmx(vcpu)->rmode.save_rflags = rflags;
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
@@ -1151,6 +1161,16 @@ static u64 guest_read_tsc(void)
}
/*
+ * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
+ * ioctl. In this case the call-back should update internal vmx state to make
+ * the changes effective.
+ */
+static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+{
+ /* Nothing to do here */
+}
+
+/*
* writes 'offset' into guest's timestamp counter offset register
*/
static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
@@ -1164,6 +1184,11 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
vmcs_write64(TSC_OFFSET, offset + adjustment);
}
+static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+{
+ return target_tsc - native_read_tsc();
+}
+
/*
* Reads an msr value (of 'msr_index') into 'pdata'.
* Returns 0 on success, non-0 otherwise.
@@ -1998,6 +2023,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0, hw_cr0);
vcpu->arch.cr0 = cr0;
+ __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
}
static u64 construct_eptp(unsigned long root_hpa)
@@ -2108,17 +2134,28 @@ static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
return vmcs_readl(sf->base);
}
-static int vmx_get_cpl(struct kvm_vcpu *vcpu)
+static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
{
if (!is_protmode(vcpu))
return 0;
- if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
+ if (!is_long_mode(vcpu)
+ && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
return 3;
return vmcs_read16(GUEST_CS_SELECTOR) & 3;
}
+static int vmx_get_cpl(struct kvm_vcpu *vcpu)
+{
+ if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
+ __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
+ to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu);
+ }
+ return to_vmx(vcpu)->cpl;
+}
+
+
static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
u32 ar;
@@ -2184,6 +2221,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
ar |= 0x1; /* Accessed */
vmcs_write32(sf->ar_bytes, ar);
+ __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
}
static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -2937,6 +2975,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
}
++vcpu->stat.nmi_injections;
+ vmx->nmi_known_unmasked = false;
if (vmx->rmode.vm86_active) {
if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
@@ -2961,6 +3000,8 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
{
if (!cpu_has_virtual_nmis())
return to_vmx(vcpu)->soft_vnmi_blocked;
+ if (to_vmx(vcpu)->nmi_known_unmasked)
+ return false;
return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
}
@@ -2974,6 +3015,7 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
vmx->vnmi_blocked_time = 0;
}
} else {
+ vmx->nmi_known_unmasked = !masked;
if (masked)
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
@@ -3091,7 +3133,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
enum emulation_result er;
vect_info = vmx->idt_vectoring_info;
- intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ intr_info = vmx->exit_intr_info;
if (is_machine_check(intr_info))
return handle_machine_check(vcpu);
@@ -3505,9 +3547,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
switch (type) {
case INTR_TYPE_NMI_INTR:
vcpu->arch.nmi_injected = false;
- if (cpu_has_virtual_nmis())
- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
+ vmx_set_nmi_mask(vcpu, true);
break;
case INTR_TYPE_EXT_INTR:
case INTR_TYPE_SOFT_INTR:
@@ -3867,12 +3907,17 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
{
- u32 exit_intr_info = vmx->exit_intr_info;
+ u32 exit_intr_info;
+
+ if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
+ || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI))
+ return;
+
+ vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ exit_intr_info = vmx->exit_intr_info;
/* Handle machine checks before interrupts are enabled */
- if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
- || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
- && is_machine_check(exit_intr_info)))
+ if (is_machine_check(exit_intr_info))
kvm_machine_check();
/* We need to handle NMIs before interrupts are enabled */
@@ -3886,7 +3931,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
- u32 exit_intr_info = vmx->exit_intr_info;
+ u32 exit_intr_info;
bool unblock_nmi;
u8 vector;
bool idtv_info_valid;
@@ -3894,6 +3939,13 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
if (cpu_has_virtual_nmis()) {
+ if (vmx->nmi_known_unmasked)
+ return;
+ /*
+ * Can't use vmx->exit_intr_info since we're not sure what
+ * the exit reason is.
+ */
+ exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
/*
@@ -3910,6 +3962,10 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
vector != DF_VECTOR && !idtv_info_valid)
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
+ else
+ vmx->nmi_known_unmasked =
+ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
+ & GUEST_INTR_STATE_NMI);
} else if (unlikely(vmx->soft_vnmi_blocked))
vmx->vnmi_blocked_time +=
ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
@@ -3946,8 +4002,7 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
* Clear bit "block by NMI" before VM entry if a NMI
* delivery faulted.
*/
- vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
+ vmx_set_nmi_mask(&vmx->vcpu, false);
break;
case INTR_TYPE_SOFT_EXCEPTION:
vmx->vcpu.arch.event_exit_inst_len =
@@ -4124,6 +4179,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
);
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
+ | (1 << VCPU_EXREG_RFLAGS)
+ | (1 << VCPU_EXREG_CPL)
| (1 << VCPU_EXREG_PDPTR)
| (1 << VCPU_EXREG_CR3));
vcpu->arch.regs_dirty = 0;
@@ -4134,7 +4191,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->launched = 1;
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
- vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
vmx_complete_atomic_exit(vmx);
vmx_recover_nmi_blocking(vmx);
@@ -4368,6 +4424,13 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
{
}
+static int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage)
+{
+ return X86EMUL_CONTINUE;
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -4449,10 +4512,14 @@ static struct kvm_x86_ops vmx_x86_ops = {
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+ .set_tsc_khz = vmx_set_tsc_khz,
.write_tsc_offset = vmx_write_tsc_offset,
.adjust_tsc_offset = vmx_adjust_tsc_offset,
+ .compute_tsc_offset = vmx_compute_tsc_offset,
.set_tdp_cr3 = vmx_set_cr3,
+
+ .check_intercept = vmx_check_intercept,
};
static int __init vmx_init(void)
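
The vmx.c hunks above add a small software cache for GUEST_RFLAGS and the CPL:
a VCPU_EXREG_* bit in vcpu->arch.regs_avail marks the cached value valid,
readers fill the cache on a miss with a single vmcs_readl()/vmcs_read16(),
writers (vmx_set_rflags, vmx_set_cr0, vmx_set_segment) invalidate it, and
vmx_vcpu_run() clears both bits after every exit. A minimal sketch of the
pattern in isolation, with hypothetical names; the field layout and helpers
below are illustrative, not the kernel's.

    #include <stdbool.h>

    struct rflags_cache {
            bool          valid;    /* stands in for the regs_avail bit    */
            unsigned long value;    /* stands in for to_vmx(vcpu)->rflags  */
    };

    /* Read path: at most one expensive hardware read per exit. */
    static unsigned long get_rflags(struct rflags_cache *c,
                                    unsigned long (*vmcs_read)(void))
    {
            if (!c->valid) {
                    c->value = vmcs_read();
                    c->valid = true;
            }
            return c->value;        /* later reads hit the cache */
    }

    /* Write path: update hardware and keep the cache coherent. */
    static void set_rflags(struct rflags_cache *c, unsigned long rflags,
                           void (*vmcs_write)(unsigned long))
    {
            c->value = rflags;
            c->valid = true;
            vmcs_write(rflags);
    }

The nmi_known_unmasked flag added in the same hunks follows the same idea for
the NMI-blocking bit of GUEST_INTERRUPTIBILITY_INFO: avoid a VMCS read when
the masking state is already known.
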
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 934b4c6b0bf9..1d5a7f418795 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -60,19 +60,6 @@
#include <asm/div64.h>
#define MAX_IO_MSRS 256
-#define CR0_RESERVED_BITS \
- (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
- | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
- | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
-#define CR4_RESERVED_BITS \
- (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
- | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
- | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXSAVE \
- | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
-
-#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
-
#define KVM_MAX_MCE_BANKS 32
#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
@@ -100,6 +87,11 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
int ignore_msrs = 0;
module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
+bool kvm_has_tsc_control;
+EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
+u32 kvm_max_guest_tsc_khz;
+EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
+
#define KVM_NR_SHARED_MSRS 16
struct kvm_shared_msrs_global {
@@ -361,8 +353,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
- kvm_make_request(KVM_REQ_NMI, vcpu);
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu->arch.nmi_pending = 1;
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);
@@ -982,7 +974,15 @@ static inline int kvm_tsc_changes_freq(void)
return ret;
}
-static inline u64 nsec_to_cycles(u64 nsec)
+static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.virtual_tsc_khz)
+ return vcpu->arch.virtual_tsc_khz;
+ else
+ return __this_cpu_read(cpu_tsc_khz);
+}
+
+static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
u64 ret;
@@ -990,25 +990,24 @@ static inline u64 nsec_to_cycles(u64 nsec)
if (kvm_tsc_changes_freq())
printk_once(KERN_WARNING
"kvm: unreliable cycle conversion on adjustable rate TSC\n");
- ret = nsec * __this_cpu_read(cpu_tsc_khz);
+ ret = nsec * vcpu_tsc_khz(vcpu);
do_div(ret, USEC_PER_SEC);
return ret;
}
-static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz)
+static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
{
/* Compute a scale to convert nanoseconds in TSC cycles */
kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
- &kvm->arch.virtual_tsc_shift,
- &kvm->arch.virtual_tsc_mult);
- kvm->arch.virtual_tsc_khz = this_tsc_khz;
+ &vcpu->arch.tsc_catchup_shift,
+ &vcpu->arch.tsc_catchup_mult);
}
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
{
u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
- vcpu->kvm->arch.virtual_tsc_mult,
- vcpu->kvm->arch.virtual_tsc_shift);
+ vcpu->arch.tsc_catchup_mult,
+ vcpu->arch.tsc_catchup_shift);
tsc += vcpu->arch.last_tsc_write;
return tsc;
}
@@ -1021,7 +1020,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
s64 sdiff;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
- offset = data - native_read_tsc();
+ offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
ns = get_kernel_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
sdiff = data - kvm->arch.last_tsc_write;
@@ -1037,13 +1036,13 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
* In that case, for a reliable TSC, we can match TSC offsets,
* or make a best guess using the elapsed value.
*/
- if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) &&
+ if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) &&
elapsed < 5ULL * NSEC_PER_SEC) {
if (!check_tsc_unstable()) {
offset = kvm->arch.last_tsc_offset;
pr_debug("kvm: matched tsc offset for %llu\n", data);
} else {
- u64 delta = nsec_to_cycles(elapsed);
+ u64 delta = nsec_to_cycles(vcpu, elapsed);
offset += delta;
pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
@@ -1075,8 +1074,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
local_irq_save(flags);
kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
kernel_ns = get_kernel_ns();
- this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
-
+ this_tsc_khz = vcpu_tsc_khz(v);
if (unlikely(this_tsc_khz == 0)) {
local_irq_restore(flags);
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -1993,6 +1991,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_X86_ROBUST_SINGLESTEP:
case KVM_CAP_XSAVE:
case KVM_CAP_ASYNC_PF:
+ case KVM_CAP_GET_TSC_KHZ:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2019,6 +2018,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_XCRS:
r = cpu_has_xsave;
break;
+ case KVM_CAP_TSC_CONTROL:
+ r = kvm_has_tsc_control;
+ break;
default:
r = 0;
break;
@@ -2120,8 +2122,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_x86_ops->vcpu_load(vcpu, cpu);
if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
/* Make sure TSC doesn't go backwards */
- s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
- native_read_tsc() - vcpu->arch.last_host_tsc;
+ s64 tsc_delta;
+ u64 tsc;
+
+ kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc);
+ tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
+ tsc - vcpu->arch.last_guest_tsc;
+
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
if (check_tsc_unstable()) {
@@ -2418,6 +2425,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
(1 << KVM_FEATURE_NOP_IO_DELAY) |
(1 << KVM_FEATURE_CLOCKSOURCE2) |
+ (1 << KVM_FEATURE_ASYNC_PF) |
(1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
entry->ebx = 0;
entry->ecx = 0;
@@ -3046,6 +3054,32 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
break;
}
+ case KVM_SET_TSC_KHZ: {
+ u32 user_tsc_khz;
+
+ r = -EINVAL;
+ if (!kvm_has_tsc_control)
+ break;
+
+ user_tsc_khz = (u32)arg;
+
+ if (user_tsc_khz >= kvm_max_guest_tsc_khz)
+ goto out;
+
+ kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz);
+
+ r = 0;
+ goto out;
+ }
+ case KVM_GET_TSC_KHZ: {
+ r = -EIO;
+ if (check_tsc_unstable())
+ goto out;
+
+ r = vcpu_tsc_khz(vcpu);
+
+ goto out;
+ }
default:
r = -EINVAL;
}
@@ -3595,20 +3629,43 @@ static void kvm_init_msr_list(void)
static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
const void *v)
{
- if (vcpu->arch.apic &&
- !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
- return 0;
+ int handled = 0;
+ int n;
- return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
+ do {
+ n = min(len, 8);
+ if (!(vcpu->arch.apic &&
+ !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v))
+ && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
+ break;
+ handled += n;
+ addr += n;
+ len -= n;
+ v += n;
+ } while (len);
+
+ return handled;
}
static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
{
- if (vcpu->arch.apic &&
- !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
- return 0;
+ int handled = 0;
+ int n;
- return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
+ do {
+ n = min(len, 8);
+ if (!(vcpu->arch.apic &&
+ !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v))
+ && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
+ break;
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
+ handled += n;
+ addr += n;
+ len -= n;
+ v += n;
+ } while (len);
+
+ return handled;
}
static void kvm_set_segment(struct kvm_vcpu *vcpu,
@@ -3768,6 +3825,7 @@ static int emulator_read_emulated(unsigned long addr,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
+ int handled;
if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
@@ -3794,18 +3852,24 @@ mmio:
/*
* Is this MMIO handled locally?
*/
- if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
- trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val);
+ handled = vcpu_mmio_read(vcpu, gpa, bytes, val);
+
+ if (handled == bytes)
return X86EMUL_CONTINUE;
- }
+
+ gpa += handled;
+ bytes -= handled;
+ val += handled;
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
vcpu->mmio_needed = 1;
vcpu->run->exit_reason = KVM_EXIT_MMIO;
vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
- vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+ vcpu->mmio_size = bytes;
+ vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
+ vcpu->mmio_index = 0;
return X86EMUL_IO_NEEDED;
}
@@ -3829,6 +3893,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
+ int handled;
gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
@@ -3847,15 +3912,23 @@ mmio:
/*
* Is this MMIO handled locally?
*/
- if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
+ handled = vcpu_mmio_write(vcpu, gpa, bytes, val);
+ if (handled == bytes)
return X86EMUL_CONTINUE;
+ gpa += handled;
+ bytes -= handled;
+ val += handled;
+
vcpu->mmio_needed = 1;
+ memcpy(vcpu->mmio_data, val, bytes);
vcpu->run->exit_reason = KVM_EXIT_MMIO;
vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
- vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+ vcpu->mmio_size = bytes;
+ vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
- memcpy(vcpu->run->mmio.data, val, bytes);
+ memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
+ vcpu->mmio_index = 0;
return X86EMUL_CONTINUE;
}
@@ -4241,6 +4314,29 @@ static void emulator_set_segment_selector(u16 sel, int seg,
kvm_set_segment(vcpu, &kvm_seg, seg);
}
+static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
+{
+ preempt_disable();
+ kvm_load_guest_fpu(ctxt->vcpu);
+ /*
+ * CR0.TS may reference the host fpu state, not the guest fpu state,
+ * so it may be clear at this point.
+ */
+ clts();
+}
+
+static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
+{
+ preempt_enable();
+}
+
+static int emulator_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage)
+{
+ return kvm_x86_ops->check_intercept(vcpu, info, stage);
+}
+
static struct x86_emulate_ops emulate_ops = {
.read_std = kvm_read_guest_virt_system,
.write_std = kvm_write_guest_virt_system,
@@ -4264,6 +4360,9 @@ static struct x86_emulate_ops emulate_ops = {
.set_dr = emulator_set_dr,
.set_msr = kvm_set_msr,
.get_msr = kvm_get_msr,
+ .get_fpu = emulator_get_fpu,
+ .put_fpu = emulator_put_fpu,
+ .intercept = emulator_intercept,
};
static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -4310,7 +4409,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
vcpu->arch.emulate_ctxt.vcpu = vcpu;
- vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+ vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
vcpu->arch.emulate_ctxt.mode =
(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
@@ -4318,6 +4417,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
? X86EMUL_MODE_VM86 : cs_l
? X86EMUL_MODE_PROT64 : cs_db
? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu);
memset(c, 0, sizeof(struct decode_cache));
memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
}
@@ -4340,7 +4440,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
vcpu->arch.emulate_ctxt.eip = c->eip;
memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
- kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+ kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
if (irq == NMI_VECTOR)
vcpu->arch.nmi_pending = false;
@@ -4449,6 +4549,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
restart:
r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
+ if (r == EMULATION_INTERCEPTED)
+ return EMULATE_DONE;
+
if (r == EMULATION_FAILED) {
if (reexecute_instruction(vcpu, cr2))
return EMULATE_DONE;
@@ -4463,17 +4566,15 @@ restart:
if (!vcpu->arch.pio.in)
vcpu->arch.pio.count = 0;
r = EMULATE_DO_MMIO;
- } else if (vcpu->mmio_needed) {
- if (vcpu->mmio_is_write)
- vcpu->mmio_needed = 0;
+ } else if (vcpu->mmio_needed)
r = EMULATE_DO_MMIO;
- } else if (r == EMULATION_RESTART)
+ else if (r == EMULATION_RESTART)
goto restart;
else
r = EMULATE_DONE;
toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
- kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+ kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
kvm_make_request(KVM_REQ_EVENT, vcpu);
memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
@@ -5170,6 +5271,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
+ bool nmi_pending;
bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
vcpu->run->request_interrupt_window;
@@ -5207,19 +5309,25 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
r = 1;
goto out;
}
- if (kvm_check_request(KVM_REQ_NMI, vcpu))
- vcpu->arch.nmi_pending = true;
}
r = kvm_mmu_reload(vcpu);
if (unlikely(r))
goto out;
+ /*
+ * An NMI can be injected between local nmi_pending read and
+ * vcpu->arch.nmi_pending read inside inject_pending_event().
+ * But in that case, KVM_REQ_EVENT will be set, which makes
+ * the race described above benign.
+ */
+ nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending);
+
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
inject_pending_event(vcpu);
/* enable NMI/IRQ window open exits if needed */
- if (vcpu->arch.nmi_pending)
+ if (nmi_pending)
kvm_x86_ops->enable_nmi_window(vcpu);
else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
kvm_x86_ops->enable_irq_window(vcpu);
@@ -5399,6 +5507,40 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
return r;
}
+static int complete_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+ int r;
+
+ if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
+ return 1;
+
+ if (vcpu->mmio_needed) {
+ vcpu->mmio_needed = 0;
+ if (!vcpu->mmio_is_write)
+ memcpy(vcpu->mmio_data + vcpu->mmio_index, run->mmio.data, 8);
+ vcpu->mmio_index += 8;
+ if (vcpu->mmio_index < vcpu->mmio_size) {
+ run->exit_reason = KVM_EXIT_MMIO;
+ run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index;
+ memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8);
+ run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8);
+ run->mmio.is_write = vcpu->mmio_is_write;
+ vcpu->mmio_needed = 1;
+ return 0;
+ }
+ if (vcpu->mmio_is_write)
+ return 1;
+ vcpu->mmio_read_completed = 1;
+ }
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ if (r != EMULATE_DONE)
+ return 0;
+ return 1;
+}
+
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
@@ -5425,20 +5567,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
}
- if (vcpu->arch.pio.count || vcpu->mmio_needed) {
- if (vcpu->mmio_needed) {
- memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
- vcpu->mmio_read_completed = 1;
- vcpu->mmio_needed = 0;
- }
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- if (r != EMULATE_DONE) {
- r = 0;
- goto out;
- }
- }
+ r = complete_mmio(vcpu);
+ if (r <= 0)
+ goto out;
+
if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
kvm_register_write(vcpu, VCPU_REGS_RAX,
kvm_run->hypercall.ret);
@@ -5592,7 +5724,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
- kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+ kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
kvm_make_request(KVM_REQ_EVENT, vcpu);
return EMULATE_DONE;
}
@@ -5974,8 +6106,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.pio_data = page_address(page);
- if (!kvm->arch.virtual_tsc_khz)
- kvm_arch_set_tsc_khz(kvm, max_tsc_khz);
+ kvm_init_tsc_catchup(vcpu, max_tsc_khz);
r = kvm_mmu_create(vcpu);
if (r < 0)
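
The x86.c changes above make vcpu_mmio_read()/vcpu_mmio_write() return how many
bytes were handled in-kernel and let the remainder spill to userspace in at
most 8-byte pieces: kvm_run->mmio carries min(remaining, 8) bytes per exit,
while vcpu->mmio_data/mmio_index/mmio_size track the whole access so
complete_mmio() can resume it on the next KVM_RUN. A small arithmetic sketch of
the chunking, using a hypothetical helper name:

    #include <stddef.h>

    /* Illustrative only: number of KVM_EXIT_MMIO round trips a bytes-long
     * access needs once the in-kernel devices handled `handled` bytes. */
    static size_t mmio_exits_needed(size_t bytes, size_t handled)
    {
            size_t remaining = bytes - handled;
            size_t exits = 0;

            while (remaining) {
                    size_t n = remaining < 8 ? remaining : 8; /* run->mmio.len */
                    remaining -= n;
                    exits++;
            }
            return exits;   /* e.g. a 16-byte SSE access -> two exits */
    }
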
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ea2dc1a2e13d..2f63ebeac639 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -541,6 +541,8 @@ struct kvm_ppc_pvinfo {
#define KVM_CAP_PPC_GET_PVINFO 57
#define KVM_CAP_PPC_IRQ_LEVEL 58
#define KVM_CAP_ASYNC_PF 59
+#define KVM_CAP_TSC_CONTROL 60
+#define KVM_CAP_GET_TSC_KHZ 61
#ifdef KVM_CAP_IRQ_ROUTING
@@ -677,6 +679,9 @@ struct kvm_clock_data {
#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2)
/* Available with KVM_CAP_PPC_GET_PVINFO */
#define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo)
+/* Available with KVM_CAP_TSC_CONTROL */
+#define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2)
+#define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3)
/*
* ioctls for vcpu fds
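
With the two ioctl numbers above and the KVM_SET_TSC_KHZ/KVM_GET_TSC_KHZ
handlers in x86.c, userspace passes the frequency in kHz directly as the ioctl
argument and reads it back as the return value. A hedged usage sketch, assuming
the usual /dev/kvm system fd for KVM_CHECK_EXTENSION and an already-created
vcpu fd; error handling is trimmed.

    #include <linux/kvm.h>
    #include <sys/ioctl.h>
    #include <stdio.h>

    static void tune_guest_tsc(int kvm_fd, int vcpu_fd)
    {
            int khz;

            /* Only usable when the module reports TSC scaling support. */
            if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_TSC_CONTROL) > 0)
                    ioctl(vcpu_fd, KVM_SET_TSC_KHZ, 2000000UL); /* 2 GHz guest TSC */

            /* Returns kHz on success; on error the ioctl fails with errno set
             * (e.g. EIO when the host TSC is unstable). */
            khz = ioctl(vcpu_fd, KVM_GET_TSC_KHZ, 0);
            if (khz > 0)
                    printf("guest tsc runs at %d kHz\n", khz);
    }
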
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ab428552af8e..d1f507567068 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -27,6 +27,10 @@
#include <asm/kvm_host.h>
+#ifndef KVM_MMIO_SIZE
+#define KVM_MMIO_SIZE 8
+#endif
+
/*
* vcpu->requests bit members
*/
@@ -43,7 +47,6 @@
#define KVM_REQ_DEACTIVATE_FPU 10
#define KVM_REQ_EVENT 11
#define KVM_REQ_APF_HALT 12
-#define KVM_REQ_NMI 13
#define KVM_USERSPACE_IRQ_SOURCE_ID 0
@@ -133,7 +136,8 @@ struct kvm_vcpu {
int mmio_read_completed;
int mmio_is_write;
int mmio_size;
- unsigned char mmio_data[8];
+ int mmio_index;
+ unsigned char mmio_data[KVM_MMIO_SIZE];
gpa_t mmio_phys_addr;
#endif
@@ -365,7 +369,6 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable);
pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn);
-int memslot_id(struct kvm *kvm, gfn_t gfn);
void kvm_release_pfn_dirty(pfn_t);
void kvm_release_pfn_clean(pfn_t pfn);
void kvm_set_pfn_dirty(pfn_t pfn);
@@ -597,6 +600,11 @@ static inline void kvm_guest_exit(void)
current->flags &= ~PF_VCPU;
}
+static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
+{
+ return gfn_to_memslot(kvm, gfn)->id;
+}
+
static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
gfn_t gfn)
{
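
The kvm_host.h hunk above turns memslot_id() into a one-line inline on top of
gfn_to_memslot(), which makes the open-coded slot walk removed from kvm_main.c
just below redundant. For reference, a sketch of the lookup the inline now
delegates; the helper name is hypothetical, the kernel types and slot fields
(nmemslots, base_gfn, npages, id) are assumed as in the removed code.

    /* Illustrative stand-in for gfn_to_memslot(kvm, gfn)->id; not kernel code. */
    static int find_memslot_id(struct kvm_memslots *slots, gfn_t gfn)
    {
            int i;

            for (i = 0; i < slots->nmemslots; ++i) {
                    struct kvm_memory_slot *slot = &slots->memslots[i];

                    if (gfn >= slot->base_gfn &&
                        gfn < slot->base_gfn + slot->npages)
                            return slot->id;
            }
            return -1;      /* no memslot covers this gfn */
    }
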
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6330653480e4..58146457bf97 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -996,23 +996,6 @@ out:
return size;
}
-int memslot_id(struct kvm *kvm, gfn_t gfn)
-{
- int i;
- struct kvm_memslots *slots = kvm_memslots(kvm);
- struct kvm_memory_slot *memslot = NULL;
-
- for (i = 0; i < slots->nmemslots; ++i) {
- memslot = &slots->memslots[i];
-
- if (gfn >= memslot->base_gfn
- && gfn < memslot->base_gfn + memslot->npages)
- break;
- }
-
- return memslot - slots->memslots;
-}
-
static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
gfn_t *nr_pages)
{