summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2009-06-26 11:37:43 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2009-06-26 11:37:43 +1000
commit9b415b4090cdb19ae35f347be816f8661dc7d822 (patch)
tree4a42bf3100d1cbe58002e30afbafdcb027effb25 /arch
parent72c7a6b3a5fbb89025f302e467b27d33e40891f2 (diff)
parent95ac0038a6c83aa89b77b6db53cd6269e43c301f (diff)
Merge commit 'kvm/master'
Diffstat (limited to 'arch')
-rw-r--r--arch/ia64/include/asm/kvm_host.h4
-rw-r--r--arch/ia64/kvm/Kconfig8
-rw-r--r--arch/ia64/kvm/kvm-ia64.c37
-rw-r--r--arch/ia64/kvm/kvm_lib.c6
-rw-r--r--arch/ia64/kvm/vcpu.c4
-rw-r--r--arch/powerpc/include/asm/kvm_host.h4
-rw-r--r--arch/powerpc/kvm/44x.c4
-rw-r--r--arch/powerpc/kvm/Kconfig3
-rw-r--r--arch/powerpc/kvm/booke.c2
-rw-r--r--arch/powerpc/kvm/e500.c7
-rw-r--r--arch/powerpc/kvm/e500_emulate.c3
-rw-r--r--arch/powerpc/kvm/e500_tlb.c10
-rw-r--r--arch/powerpc/kvm/e500_tlb.h6
-rw-r--r--arch/powerpc/kvm/emulate.c4
-rw-r--r--arch/powerpc/kvm/powerpc.c16
-rw-r--r--arch/s390/include/asm/kvm_host.h19
-rw-r--r--arch/s390/kvm/Kconfig6
-rw-r--r--arch/s390/kvm/gaccess.h23
-rw-r--r--arch/s390/kvm/intercept.c18
-rw-r--r--arch/s390/kvm/kvm-s390.c101
-rw-r--r--arch/s390/kvm/kvm-s390.h32
-rw-r--r--arch/s390/kvm/priv.c2
-rw-r--r--arch/s390/kvm/sigp.c60
-rw-r--r--arch/x86/include/asm/kvm.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h47
-rw-r--r--arch/x86/include/asm/msr-index.h1
-rw-r--r--arch/x86/include/asm/vmx.h8
-rw-r--r--arch/x86/kvm/Kconfig9
-rw-r--r--arch/x86/kvm/Makefile36
-rw-r--r--arch/x86/kvm/i8254.c72
-rw-r--r--arch/x86/kvm/i8254.h2
-rw-r--r--arch/x86/kvm/i8259.c28
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h9
-rw-r--r--arch/x86/kvm/kvm_svm.h51
-rw-r--r--arch/x86/kvm/kvm_timer.h2
-rw-r--r--arch/x86/kvm/lapic.c78
-rw-r--r--arch/x86/kvm/lapic.h1
-rw-r--r--arch/x86/kvm/mmu.c355
-rw-r--r--arch/x86/kvm/mmu.h4
-rw-r--r--arch/x86/kvm/paging_tmpl.h45
-rw-r--r--arch/x86/kvm/svm.c259
-rw-r--r--arch/x86/kvm/timer.c16
-rw-r--r--arch/x86/kvm/trace.h260
-rw-r--r--arch/x86/kvm/vmx.c433
-rw-r--r--arch/x86/kvm/x86.c519
-rw-r--r--arch/x86/kvm/x86_emulate.c259
46 files changed, 2100 insertions, 774 deletions
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 5f43697aed30..d9b6325a9328 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -235,7 +235,8 @@ struct kvm_vm_data {
#define KVM_REQ_PTC_G 32
#define KVM_REQ_RESUME 33
-#define KVM_PAGES_PER_HPAGE 1
+#define KVM_NR_PAGE_SIZES 1
+#define KVM_PAGES_PER_HPAGE(x) 1
struct kvm;
struct kvm_vcpu;
@@ -465,7 +466,6 @@ struct kvm_arch {
unsigned long metaphysical_rr4;
unsigned long vmm_init_rr;
- int online_vcpus;
int is_sn2;
struct kvm_ioapic *vioapic;
diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig
index 64d520937874..cbadd8a65233 100644
--- a/arch/ia64/kvm/Kconfig
+++ b/arch/ia64/kvm/Kconfig
@@ -1,12 +1,8 @@
#
# KVM configuration
#
-config HAVE_KVM
- bool
-config HAVE_KVM_IRQCHIP
- bool
- default y
+source "virt/kvm/Kconfig"
menuconfig VIRTUALIZATION
bool "Virtualization"
@@ -28,6 +24,8 @@ config KVM
depends on PCI
select PREEMPT_NOTIFIERS
select ANON_INODES
+ select HAVE_KVM_IRQCHIP
+ select KVM_APIC_ARCHITECTURE
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 80c57b0a21c4..c1c5cb6554cd 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -337,13 +337,12 @@ static struct kvm_vcpu *lid_to_vcpu(struct kvm *kvm, unsigned long id,
{
union ia64_lid lid;
int i;
+ struct kvm_vcpu *vcpu;
- for (i = 0; i < kvm->arch.online_vcpus; i++) {
- if (kvm->vcpus[i]) {
- lid.val = VCPU_LID(kvm->vcpus[i]);
- if (lid.id == id && lid.eid == eid)
- return kvm->vcpus[i];
- }
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ lid.val = VCPU_LID(vcpu);
+ if (lid.id == id && lid.eid == eid)
+ return vcpu;
}
return NULL;
@@ -409,21 +408,21 @@ static int handle_global_purge(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
struct kvm *kvm = vcpu->kvm;
struct call_data call_data;
int i;
+ struct kvm_vcpu *vcpui;
call_data.ptc_g_data = p->u.ptc_g_data;
- for (i = 0; i < kvm->arch.online_vcpus; i++) {
- if (!kvm->vcpus[i] || kvm->vcpus[i]->arch.mp_state ==
- KVM_MP_STATE_UNINITIALIZED ||
- vcpu == kvm->vcpus[i])
+ kvm_for_each_vcpu(i, vcpui, kvm) {
+ if (vcpui->arch.mp_state == KVM_MP_STATE_UNINITIALIZED ||
+ vcpu == vcpui)
continue;
- if (waitqueue_active(&kvm->vcpus[i]->wq))
- wake_up_interruptible(&kvm->vcpus[i]->wq);
+ if (waitqueue_active(&vcpui->wq))
+ wake_up_interruptible(&vcpui->wq);
- if (kvm->vcpus[i]->cpu != -1) {
- call_data.vcpu = kvm->vcpus[i];
- smp_call_function_single(kvm->vcpus[i]->cpu,
+ if (vcpui->cpu != -1) {
+ call_data.vcpu = vcpui;
+ smp_call_function_single(vcpui->cpu,
vcpu_global_purge, &call_data, 1);
} else
printk(KERN_WARNING"kvm: Uninit vcpu received ipi!\n");
@@ -852,8 +851,6 @@ struct kvm *kvm_arch_create_vm(void)
kvm_init_vm(kvm);
- kvm->arch.online_vcpus = 0;
-
return kvm;
}
@@ -1216,7 +1213,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
if (IS_ERR(vmm_vcpu))
return PTR_ERR(vmm_vcpu);
- if (vcpu->vcpu_id == 0) {
+ if (kvm_vcpu_is_bsp(vcpu)) {
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
/*Set entry address for first run.*/
@@ -1224,7 +1221,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
/*Initialize itc offset for vcpus*/
itc_offset = 0UL - kvm_get_itc(vcpu);
- for (i = 0; i < kvm->arch.online_vcpus; i++) {
+ for (i = 0; i < KVM_MAX_VCPUS; i++) {
v = (struct kvm_vcpu *)((char *)vcpu +
sizeof(struct kvm_vcpu_data) * i);
v->arch.itc_offset = itc_offset;
@@ -1356,8 +1353,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
goto fail;
}
- kvm->arch.online_vcpus++;
-
return vcpu;
fail:
return ERR_PTR(r);
diff --git a/arch/ia64/kvm/kvm_lib.c b/arch/ia64/kvm/kvm_lib.c
index a85cb611ecd7..f1268b8e6f9e 100644
--- a/arch/ia64/kvm/kvm_lib.c
+++ b/arch/ia64/kvm/kvm_lib.c
@@ -11,5 +11,11 @@
*
*/
#undef CONFIG_MODULES
+#include <linux/module.h>
+#undef CONFIG_KALLSYMS
+#undef EXPORT_SYMBOL
+#undef EXPORT_SYMBOL_GPL
+#define EXPORT_SYMBOL(sym)
+#define EXPORT_SYMBOL_GPL(sym)
#include "../../../lib/vsprintf.c"
#include "../../../lib/ctype.c"
diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c
index a2c6c15e4761..2334eac40f8e 100644
--- a/arch/ia64/kvm/vcpu.c
+++ b/arch/ia64/kvm/vcpu.c
@@ -830,8 +830,8 @@ static void vcpu_set_itc(struct kvm_vcpu *vcpu, u64 val)
kvm = (struct kvm *)KVM_VM_BASE;
- if (vcpu->vcpu_id == 0) {
- for (i = 0; i < kvm->arch.online_vcpus; i++) {
+ if (kvm_vcpu_is_bsp(vcpu)) {
+ for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) {
v = (struct kvm_vcpu *)((char *)vcpu +
sizeof(struct kvm_vcpu_data) * i);
VMX(v, itc_offset) = itc_offset;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index dfdf13c9fefd..c9c930ed11d7 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -34,7 +34,8 @@
#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
/* We don't currently support large pages. */
-#define KVM_PAGES_PER_HPAGE (1<<31)
+#define KVM_NR_PAGE_SIZES 1
+#define KVM_PAGES_PER_HPAGE(x) (1UL<<31)
struct kvm;
struct kvm_run;
@@ -153,7 +154,6 @@ struct kvm_vcpu_arch {
u32 pid;
u32 swap_pid;
- u32 pvr;
u32 ccr0;
u32 ccr1;
u32 dbcr0;
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 0cef809cec21..f4d1b55aa70b 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -138,7 +138,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
}
-static int kvmppc_44x_init(void)
+static int __init kvmppc_44x_init(void)
{
int r;
@@ -149,7 +149,7 @@ static int kvmppc_44x_init(void)
return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), THIS_MODULE);
}
-static void kvmppc_44x_exit(void)
+static void __exit kvmppc_44x_exit(void)
{
kvmppc_booke_exit();
}
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 5a152a52796f..46019dccce1c 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -2,8 +2,7 @@
# KVM configuration
#
-config HAVE_KVM_IRQCHIP
- bool
+source "virt/kvm/Kconfig"
menuconfig VIRTUALIZATION
bool "Virtualization"
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 642e4204cf25..e7bf4d029484 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -520,7 +520,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
return kvmppc_core_vcpu_translate(vcpu, tr);
}
-int kvmppc_booke_init(void)
+int __init kvmppc_booke_init(void)
{
unsigned long ivor[16];
unsigned long max_ivor = 0;
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index d8067fd81cdd..64949eef43f1 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -60,9 +60,6 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
kvmppc_e500_tlb_setup(vcpu_e500);
- /* Use the same core vertion as host's */
- vcpu->arch.pvr = mfspr(SPRN_PVR);
-
return 0;
}
@@ -132,7 +129,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
}
-static int kvmppc_e500_init(void)
+static int __init kvmppc_e500_init(void)
{
int r, i;
unsigned long ivor[3];
@@ -160,7 +157,7 @@ static int kvmppc_e500_init(void)
return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), THIS_MODULE);
}
-static void kvmppc_e500_exit(void)
+static void __init kvmppc_e500_exit(void)
{
kvmppc_booke_exit();
}
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index 3f760414b9f8..be95b8d8e3b7 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -180,6 +180,9 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
case SPRN_MMUCSR0:
vcpu->arch.gpr[rt] = 0; break;
+ case SPRN_MMUCFG:
+ vcpu->arch.gpr[rt] = mfspr(SPRN_MMUCFG); break;
+
/* extra exceptions */
case SPRN_IVOR32:
vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL];
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 0e773fc2d5e4..a2048ac095ab 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -269,7 +269,7 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
tlbsel = (vcpu_e500->mas4 >> 28) & 0x1;
victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0;
pidsel = (vcpu_e500->mas4 >> 16) & 0xf;
- tsized = (vcpu_e500->mas4 >> 8) & 0xf;
+ tsized = (vcpu_e500->mas4 >> 7) & 0x1f;
vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
| MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]);
@@ -309,7 +309,7 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
vcpu_e500->shadow_pages[tlbsel][esel] = new_page;
/* Force TS=1 IPROT=0 TSIZE=4KB for all guest mappings. */
- stlbe->mas1 = MAS1_TSIZE(BOOKE_PAGESZ_4K)
+ stlbe->mas1 = MAS1_TSIZE(BOOK3E_PAGESZ_4K)
| MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID;
stlbe->mas2 = (gvaddr & MAS2_EPN)
| e500_shadow_mas2_attrib(gtlbe->mas2,
@@ -545,7 +545,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
case 0:
/* TLB0 */
gtlbe->mas1 &= ~MAS1_TSIZE(~0);
- gtlbe->mas1 |= MAS1_TSIZE(BOOKE_PAGESZ_4K);
+ gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K);
stlbsel = 0;
sesel = kvmppc_e500_stlbe_map(vcpu_e500, 0, esel);
@@ -679,14 +679,14 @@ void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500)
/* Insert large initial mapping for guest. */
tlbe = &vcpu_e500->guest_tlb[1][0];
- tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOKE_PAGESZ_256M);
+ tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M);
tlbe->mas2 = 0;
tlbe->mas3 = E500_TLB_SUPER_PERM_MASK;
tlbe->mas7 = 0;
/* 4K map for serial output. Used by kernel wrapper. */
tlbe = &vcpu_e500->guest_tlb[1][1];
- tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOKE_PAGESZ_4K);
+ tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K);
tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G;
tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK;
tlbe->mas7 = 0;
diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h
index 45b064b76906..d28e3010a5e2 100644
--- a/arch/powerpc/kvm/e500_tlb.h
+++ b/arch/powerpc/kvm/e500_tlb.h
@@ -16,7 +16,7 @@
#define __KVM_E500_TLB_H__
#include <linux/kvm_host.h>
-#include <asm/mmu-fsl-booke.h>
+#include <asm/mmu-book3e.h>
#include <asm/tlb.h>
#include <asm/kvm_e500.h>
@@ -59,7 +59,7 @@ extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *);
/* TLB helper functions */
static inline unsigned int get_tlb_size(const struct tlbe *tlbe)
{
- return (tlbe->mas1 >> 8) & 0xf;
+ return (tlbe->mas1 >> 7) & 0x1f;
}
static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe)
@@ -70,7 +70,7 @@ static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe)
static inline u64 get_tlb_bytes(const struct tlbe *tlbe)
{
unsigned int pgsize = get_tlb_size(tlbe);
- return 1ULL << 10 << (pgsize << 1);
+ return 1ULL << 10 << pgsize;
}
static inline gva_t get_tlb_end(const struct tlbe *tlbe)
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index a561d6e8da1c..28a8237fe78b 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -187,7 +187,9 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
case SPRN_SRR1:
vcpu->arch.gpr[rt] = vcpu->arch.srr1; break;
case SPRN_PVR:
- vcpu->arch.gpr[rt] = vcpu->arch.pvr; break;
+ vcpu->arch.gpr[rt] = mfspr(SPRN_PVR); break;
+ case SPRN_PIR:
+ vcpu->arch.gpr[rt] = mfspr(SPRN_PIR); break;
/* Note: mftb and TBRL/TBWL are user-accessible, so
* the guest can always access the real TB anyways.
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 2cf915e51e7e..7ad30e0a1b9a 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -122,13 +122,17 @@ struct kvm *kvm_arch_create_vm(void)
static void kvmppc_free_vcpus(struct kvm *kvm)
{
unsigned int i;
+ struct kvm_vcpu *vcpu;
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- if (kvm->vcpus[i]) {
- kvm_arch_vcpu_free(kvm->vcpus[i]);
- kvm->vcpus[i] = NULL;
- }
- }
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_arch_vcpu_free(vcpu);
+
+ mutex_lock(&kvm->lock);
+ for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+ kvm->vcpus[i] = NULL;
+
+ atomic_set(&kvm->online_vcpus, 0);
+ mutex_unlock(&kvm->lock);
}
void kvm_arch_sync_events(struct kvm *kvm)
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index a27d0d5a6f86..78e07a622b45 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -1,7 +1,7 @@
/*
* asm-s390/kvm_host.h - definition for kernel virtual machines on s390
*
- * Copyright IBM Corp. 2008
+ * Copyright IBM Corp. 2008,2009
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License (version 2 only)
@@ -40,7 +40,11 @@ struct sca_block {
struct sca_entry cpu[64];
} __attribute__((packed));
-#define KVM_PAGES_PER_HPAGE 256
+#define KVM_NR_PAGE_SIZES 2
+#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + ((x) - 1) * 8)
+#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
+#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
+#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
#define CPUSTAT_HOST 0x80000000
#define CPUSTAT_WAIT 0x10000000
@@ -99,7 +103,9 @@ struct kvm_s390_sie_block {
__u8 reservedd0[48]; /* 0x00d0 */
__u64 gcr[16]; /* 0x0100 */
__u64 gbea; /* 0x0180 */
- __u8 reserved188[120]; /* 0x0188 */
+ __u8 reserved188[24]; /* 0x0188 */
+ __u32 fac; /* 0x01a0 */
+ __u8 reserved1a4[92]; /* 0x01a4 */
} __attribute__((packed));
struct kvm_vcpu_stat {
@@ -180,8 +186,9 @@ struct kvm_s390_interrupt_info {
};
/* for local_interrupt.action_flags */
-#define ACTION_STORE_ON_STOP 1
-#define ACTION_STOP_ON_STOP 2
+#define ACTION_STORE_ON_STOP (1<<0)
+#define ACTION_STOP_ON_STOP (1<<1)
+#define ACTION_RELOADVCPU_ON_STOP (1<<2)
struct kvm_s390_local_interrupt {
spinlock_t lock;
@@ -225,8 +232,6 @@ struct kvm_vm_stat {
};
struct kvm_arch{
- unsigned long guest_origin;
- unsigned long guest_memsize;
struct sca_block *sca;
debug_info_t *dbf;
struct kvm_s390_float_interrupt float_int;
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 3e260b7e37b2..ad75ce33be12 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -1,11 +1,7 @@
#
# KVM configuration
#
-config HAVE_KVM
- bool
-
-config HAVE_KVM_IRQCHIP
- bool
+source "virt/kvm/Kconfig"
menuconfig VIRTUALIZATION
bool "Virtualization"
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index ed60f3a74a85..03c716a0f01f 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -1,7 +1,7 @@
/*
* gaccess.h - access guest memory
*
- * Copyright IBM Corp. 2008
+ * Copyright IBM Corp. 2008,2009
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License (version 2 only)
@@ -16,13 +16,14 @@
#include <linux/compiler.h>
#include <linux/kvm_host.h>
#include <asm/uaccess.h>
+#include "kvm-s390.h"
static inline void __user *__guestaddr_to_user(struct kvm_vcpu *vcpu,
unsigned long guestaddr)
{
unsigned long prefix = vcpu->arch.sie_block->prefix;
- unsigned long origin = vcpu->kvm->arch.guest_origin;
- unsigned long memsize = vcpu->kvm->arch.guest_memsize;
+ unsigned long origin = vcpu->arch.sie_block->gmsor;
+ unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu);
if (guestaddr < 2 * PAGE_SIZE)
guestaddr += prefix;
@@ -158,8 +159,8 @@ static inline int copy_to_guest(struct kvm_vcpu *vcpu, unsigned long guestdest,
const void *from, unsigned long n)
{
unsigned long prefix = vcpu->arch.sie_block->prefix;
- unsigned long origin = vcpu->kvm->arch.guest_origin;
- unsigned long memsize = vcpu->kvm->arch.guest_memsize;
+ unsigned long origin = vcpu->arch.sie_block->gmsor;
+ unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu);
if ((guestdest < 2 * PAGE_SIZE) && (guestdest + n > 2 * PAGE_SIZE))
goto slowpath;
@@ -209,8 +210,8 @@ static inline int copy_from_guest(struct kvm_vcpu *vcpu, void *to,
unsigned long guestsrc, unsigned long n)
{
unsigned long prefix = vcpu->arch.sie_block->prefix;
- unsigned long origin = vcpu->kvm->arch.guest_origin;
- unsigned long memsize = vcpu->kvm->arch.guest_memsize;
+ unsigned long origin = vcpu->arch.sie_block->gmsor;
+ unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu);
if ((guestsrc < 2 * PAGE_SIZE) && (guestsrc + n > 2 * PAGE_SIZE))
goto slowpath;
@@ -244,8 +245,8 @@ static inline int copy_to_guest_absolute(struct kvm_vcpu *vcpu,
unsigned long guestdest,
const void *from, unsigned long n)
{
- unsigned long origin = vcpu->kvm->arch.guest_origin;
- unsigned long memsize = vcpu->kvm->arch.guest_memsize;
+ unsigned long origin = vcpu->arch.sie_block->gmsor;
+ unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu);
if (guestdest + n > memsize)
return -EFAULT;
@@ -262,8 +263,8 @@ static inline int copy_from_guest_absolute(struct kvm_vcpu *vcpu, void *to,
unsigned long guestsrc,
unsigned long n)
{
- unsigned long origin = vcpu->kvm->arch.guest_origin;
- unsigned long memsize = vcpu->kvm->arch.guest_memsize;
+ unsigned long origin = vcpu->arch.sie_block->gmsor;
+ unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu);
if (guestsrc + n > memsize)
return -EFAULT;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 98997ccba501..ba9d8a7bc1ac 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -1,7 +1,7 @@
/*
* intercept.c - in-kernel handling for sie intercepts
*
- * Copyright IBM Corp. 2008
+ * Copyright IBM Corp. 2008,2009
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License (version 2 only)
@@ -128,7 +128,7 @@ static int handle_noop(struct kvm_vcpu *vcpu)
static int handle_stop(struct kvm_vcpu *vcpu)
{
- int rc;
+ int rc = 0;
vcpu->stat.exit_stop_request++;
atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
@@ -141,12 +141,18 @@ static int handle_stop(struct kvm_vcpu *vcpu)
rc = -ENOTSUPP;
}
+ if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) {
+ vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP;
+ rc = SIE_INTERCEPT_RERUNVCPU;
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
+ }
+
if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP) {
vcpu->arch.local_int.action_bits &= ~ACTION_STOP_ON_STOP;
VCPU_EVENT(vcpu, 3, "%s", "cpu stopped");
rc = -ENOTSUPP;
- } else
- rc = 0;
+ }
+
spin_unlock_bh(&vcpu->arch.local_int.lock);
return rc;
}
@@ -158,9 +164,9 @@ static int handle_validity(struct kvm_vcpu *vcpu)
vcpu->stat.exit_validity++;
if ((viwhy == 0x37) && (vcpu->arch.sie_block->prefix
- <= vcpu->kvm->arch.guest_memsize - 2*PAGE_SIZE)){
+ <= kvm_s390_vcpu_get_memsize(vcpu) - 2*PAGE_SIZE)) {
rc = fault_in_pages_writeable((char __user *)
- vcpu->kvm->arch.guest_origin +
+ vcpu->arch.sie_block->gmsor +
vcpu->arch.sie_block->prefix,
2*PAGE_SIZE);
if (rc)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index c18b21d6991c..07ced89740d7 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1,7 +1,7 @@
/*
* s390host.c -- hosting zSeries kernel virtual machines
*
- * Copyright IBM Corp. 2008
+ * Copyright IBM Corp. 2008,2009
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License (version 2 only)
@@ -10,6 +10,7 @@
* Author(s): Carsten Otte <cotte@de.ibm.com>
* Christian Borntraeger <borntraeger@de.ibm.com>
* Heiko Carstens <heiko.carstens@de.ibm.com>
+ * Christian Ehrhardt <ehrhardt@de.ibm.com>
*/
#include <linux/compiler.h>
@@ -25,6 +26,7 @@
#include <asm/lowcore.h>
#include <asm/pgtable.h>
#include <asm/nmi.h>
+#include <asm/system.h>
#include "kvm-s390.h"
#include "gaccess.h"
@@ -69,6 +71,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ NULL }
};
+static unsigned long long *facilities;
/* Section: not file related */
void kvm_arch_hardware_enable(void *garbage)
@@ -208,13 +211,17 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
static void kvm_free_vcpus(struct kvm *kvm)
{
unsigned int i;
+ struct kvm_vcpu *vcpu;
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- if (kvm->vcpus[i]) {
- kvm_arch_vcpu_destroy(kvm->vcpus[i]);
- kvm->vcpus[i] = NULL;
- }
- }
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_arch_vcpu_destroy(vcpu);
+
+ mutex_lock(&kvm->lock);
+ for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+ kvm->vcpus[i] = NULL;
+
+ atomic_set(&kvm->online_vcpus, 0);
+ mutex_unlock(&kvm->lock);
}
void kvm_arch_sync_events(struct kvm *kvm)
@@ -276,18 +283,13 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->gbea = 1;
}
-/* The current code can have up to 256 pages for virtio */
-#define VIRTIODESCSPACE (256ul * 4096ul)
-
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH);
- vcpu->arch.sie_block->gmslm = vcpu->kvm->arch.guest_memsize +
- vcpu->kvm->arch.guest_origin +
- VIRTIODESCSPACE - 1ul;
- vcpu->arch.sie_block->gmsor = vcpu->kvm->arch.guest_origin;
+ set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests);
vcpu->arch.sie_block->ecb = 2;
vcpu->arch.sie_block->eca = 0xC1002001U;
+ vcpu->arch.sie_block->fac = (int) (long) facilities;
hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet,
(unsigned long) vcpu);
@@ -316,8 +318,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
BUG_ON(!kvm->arch.sca);
if (!kvm->arch.sca->cpu[id].sda)
kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block;
- else
- BUG_ON(!kvm->vcpus[id]); /* vcpu does already exist */
vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32);
vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
@@ -487,9 +487,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu_load(vcpu);
+rerun_vcpu:
+ if (vcpu->requests)
+ if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
+ kvm_s390_vcpu_set_mem(vcpu);
+
/* verify, that memory has been registered */
- if (!vcpu->kvm->arch.guest_memsize) {
+ if (!vcpu->arch.sie_block->gmslm) {
vcpu_put(vcpu);
+ VCPU_EVENT(vcpu, 3, "%s", "no memory registered to run vcpu");
return -EINVAL;
}
@@ -506,6 +512,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->arch.sie_block->gpsw.addr = kvm_run->s390_sieic.addr;
break;
case KVM_EXIT_UNKNOWN:
+ case KVM_EXIT_INTR:
case KVM_EXIT_S390_RESET:
break;
default:
@@ -519,8 +526,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
rc = kvm_handle_sie_intercept(vcpu);
} while (!signal_pending(current) && !rc);
- if (signal_pending(current) && !rc)
+ if (rc == SIE_INTERCEPT_RERUNVCPU)
+ goto rerun_vcpu;
+
+ if (signal_pending(current) && !rc) {
+ kvm_run->exit_reason = KVM_EXIT_INTR;
rc = -EINTR;
+ }
if (rc == -ENOTSUPP) {
/* intercept cannot be handled in-kernel, prepare kvm-run */
@@ -673,6 +685,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
int user_alloc)
{
int i;
+ struct kvm_vcpu *vcpu;
/* A few sanity checks. We can have exactly one memory slot which has
to start at guest virtual zero and which has to be located at a
@@ -681,7 +694,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
vmas. It is okay to mmap() and munmap() stuff in this slot after
doing this call at any time */
- if (mem->slot || kvm->arch.guest_memsize)
+ if (mem->slot)
return -EINVAL;
if (mem->guest_phys_addr)
@@ -696,36 +709,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
if (!user_alloc)
return -EINVAL;
- /* lock all vcpus */
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- if (!kvm->vcpus[i])
+ /* request update of sie control block for all available vcpus */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
continue;
- if (!mutex_trylock(&kvm->vcpus[i]->mutex))
- goto fail_out;
- }
-
- kvm->arch.guest_origin = mem->userspace_addr;
- kvm->arch.guest_memsize = mem->memory_size;
-
- /* update sie control blocks, and unlock all vcpus */
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- if (kvm->vcpus[i]) {
- kvm->vcpus[i]->arch.sie_block->gmsor =
- kvm->arch.guest_origin;
- kvm->vcpus[i]->arch.sie_block->gmslm =
- kvm->arch.guest_memsize +
- kvm->arch.guest_origin +
- VIRTIODESCSPACE - 1ul;
- mutex_unlock(&kvm->vcpus[i]->mutex);
- }
+ kvm_s390_inject_sigp_stop(vcpu, ACTION_RELOADVCPU_ON_STOP);
}
return 0;
-
-fail_out:
- for (; i >= 0; i--)
- mutex_unlock(&kvm->vcpus[i]->mutex);
- return -EINVAL;
}
void kvm_arch_flush_shadow(struct kvm *kvm)
@@ -739,11 +730,29 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
static int __init kvm_s390_init(void)
{
- return kvm_init(NULL, sizeof(struct kvm_vcpu), THIS_MODULE);
+ int ret;
+ ret = kvm_init(NULL, sizeof(struct kvm_vcpu), THIS_MODULE);
+ if (ret)
+ return ret;
+
+ /*
+ * guests can ask for up to 255+1 double words, we need a full page
+ * to hold the maximum amount of facilites. On the other hand, we
+ * only set facilities that are known to work in KVM.
+ */
+ facilities = (unsigned long long *) get_zeroed_page(GFP_DMA);
+ if (!facilities) {
+ kvm_exit();
+ return -ENOMEM;
+ }
+ stfle(facilities, 1);
+ facilities[0] &= 0xff00fff3f0700000ULL;
+ return 0;
}
static void __exit kvm_s390_exit(void)
{
+ free_page((unsigned long) facilities);
kvm_exit();
}
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 748fee872323..ec5eee7c25d8 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -1,7 +1,7 @@
/*
* kvm_s390.h - definition for kvm on s390
*
- * Copyright IBM Corp. 2008
+ * Copyright IBM Corp. 2008,2009
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License (version 2 only)
@@ -9,6 +9,7 @@
*
* Author(s): Carsten Otte <cotte@de.ibm.com>
* Christian Borntraeger <borntraeger@de.ibm.com>
+ * Christian Ehrhardt <ehrhardt@de.ibm.com>
*/
#ifndef ARCH_S390_KVM_S390_H
@@ -18,8 +19,13 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
+/* The current code can have up to 256 pages for virtio */
+#define VIRTIODESCSPACE (256ul * 4096ul)
+
typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
+/* negativ values are error codes, positive values for internal conditions */
+#define SIE_INTERCEPT_RERUNVCPU (1<<0)
int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu);
#define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\
@@ -50,6 +56,30 @@ int kvm_s390_inject_vm(struct kvm *kvm,
int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
struct kvm_s390_interrupt *s390int);
int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
+int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action);
+
+static inline int kvm_s390_vcpu_get_memsize(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.sie_block->gmslm
+ - vcpu->arch.sie_block->gmsor
+ - VIRTIODESCSPACE + 1ul;
+}
+
+static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu)
+{
+ struct kvm_memory_slot *mem;
+
+ down_read(&vcpu->kvm->slots_lock);
+ mem = &vcpu->kvm->memslots[0];
+
+ vcpu->arch.sie_block->gmsor = mem->userspace_addr;
+ vcpu->arch.sie_block->gmslm =
+ mem->userspace_addr +
+ (mem->npages << PAGE_SHIFT) +
+ VIRTIODESCSPACE - 1ul;
+
+ up_read(&vcpu->kvm->slots_lock);
+}
/* implemented in priv.c */
int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 93ecd06e1a74..d426aac8095d 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -158,7 +158,7 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
vcpu->stat.instruction_stfl++;
/* only pass the facility bits, which we can handle */
- facility_list &= 0xfe00fff3;
+ facility_list &= 0xff00fff3;
rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
&facility_list, sizeof(facility_list));
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 36678835034d..bdbd88ddef82 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -1,7 +1,7 @@
/*
* sigp.c - handlinge interprocessor communication
*
- * Copyright IBM Corp. 2008
+ * Copyright IBM Corp. 2008,2009
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License (version 2 only)
@@ -9,6 +9,7 @@
*
* Author(s): Carsten Otte <cotte@de.ibm.com>
* Christian Borntraeger <borntraeger@de.ibm.com>
+ * Christian Ehrhardt <ehrhardt@de.ibm.com>
*/
#include <linux/kvm.h>
@@ -107,46 +108,57 @@ unlock:
return rc;
}
-static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int store)
+static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action)
{
- struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
- struct kvm_s390_local_interrupt *li;
struct kvm_s390_interrupt_info *inti;
- int rc;
-
- if (cpu_addr >= KVM_MAX_VCPUS)
- return 3; /* not operational */
inti = kzalloc(sizeof(*inti), GFP_KERNEL);
if (!inti)
return -ENOMEM;
-
inti->type = KVM_S390_SIGP_STOP;
- spin_lock(&fi->lock);
- li = fi->local_int[cpu_addr];
- if (li == NULL) {
- rc = 3; /* not operational */
- kfree(inti);
- goto unlock;
- }
spin_lock_bh(&li->lock);
list_add_tail(&inti->list, &li->list);
atomic_set(&li->active, 1);
atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
- if (store)
- li->action_bits |= ACTION_STORE_ON_STOP;
- li->action_bits |= ACTION_STOP_ON_STOP;
+ li->action_bits |= action;
if (waitqueue_active(&li->wq))
wake_up_interruptible(&li->wq);
spin_unlock_bh(&li->lock);
- rc = 0; /* order accepted */
+
+ return 0; /* order accepted */
+}
+
+static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action)
+{
+ struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
+ struct kvm_s390_local_interrupt *li;
+ int rc;
+
+ if (cpu_addr >= KVM_MAX_VCPUS)
+ return 3; /* not operational */
+
+ spin_lock(&fi->lock);
+ li = fi->local_int[cpu_addr];
+ if (li == NULL) {
+ rc = 3; /* not operational */
+ goto unlock;
+ }
+
+ rc = __inject_sigp_stop(li, action);
+
unlock:
spin_unlock(&fi->lock);
VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", cpu_addr);
return rc;
}
+int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action)
+{
+ struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+ return __inject_sigp_stop(li, action);
+}
+
static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
{
int rc;
@@ -177,9 +189,9 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
/* make sure that the new value is valid memory */
address = address & 0x7fffe000u;
if ((copy_from_guest(vcpu, &tmp,
- (u64) (address + vcpu->kvm->arch.guest_origin) , 1)) ||
+ (u64) (address + vcpu->arch.sie_block->gmsor) , 1)) ||
(copy_from_guest(vcpu, &tmp, (u64) (address +
- vcpu->kvm->arch.guest_origin + PAGE_SIZE), 1))) {
+ vcpu->arch.sie_block->gmsor + PAGE_SIZE), 1))) {
*reg |= SIGP_STAT_INVALID_PARAMETER;
return 1; /* invalid parameter */
}
@@ -261,11 +273,11 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
break;
case SIGP_STOP:
vcpu->stat.instruction_sigp_stop++;
- rc = __sigp_stop(vcpu, cpu_addr, 0);
+ rc = __sigp_stop(vcpu, cpu_addr, ACTION_STOP_ON_STOP);
break;
case SIGP_STOP_STORE_STATUS:
vcpu->stat.instruction_sigp_stop++;
- rc = __sigp_stop(vcpu, cpu_addr, 1);
+ rc = __sigp_stop(vcpu, cpu_addr, ACTION_STORE_ON_STOP);
break;
case SIGP_SET_ARCH:
vcpu->stat.instruction_sigp_arch++;
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 125be8b19568..708b9c32a5da 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -17,6 +17,7 @@
#define __KVM_HAVE_USER_NMI
#define __KVM_HAVE_GUEST_DEBUG
#define __KVM_HAVE_MSIX
+#define __KVM_HAVE_MCE
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index eabdc1cfab5c..30b625d8e5f0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -14,6 +14,7 @@
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
+#include <linux/tracepoint.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -37,12 +38,14 @@
#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
0xFFFFFF0000000000ULL)
-#define KVM_GUEST_CR0_MASK \
- (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
- | X86_CR0_NW | X86_CR0_CD)
+#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
+ (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
+#define KVM_GUEST_CR0_MASK \
+ (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
+#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
+ (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP)
#define KVM_VM_CR0_ALWAYS_ON \
- (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
- | X86_CR0_MP)
+ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_GUEST_CR4_MASK \
(X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
@@ -51,12 +54,12 @@
#define INVALID_PAGE (~(hpa_t)0)
#define UNMAPPED_GVA (~(gpa_t)0)
-/* shadow tables are PAE even on non-PAE hosts */
-#define KVM_HPAGE_SHIFT 21
-#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT)
-#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1))
-
-#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
+/* KVM Hugepage definitions for x86 */
+#define KVM_NR_PAGE_SIZES 2
+#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9))
+#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
+#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
+#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
#define DE_VECTOR 0
#define DB_VECTOR 1
@@ -120,6 +123,10 @@ enum kvm_reg {
NR_VCPU_REGS
};
+enum kvm_reg_ex {
+ VCPU_EXREG_PDPTR = NR_VCPU_REGS,
+};
+
enum {
VCPU_SREG_ES,
VCPU_SREG_CS,
@@ -334,16 +341,6 @@ struct kvm_vcpu_arch {
u8 nr;
} interrupt;
- struct {
- int vm86_active;
- u8 save_iopl;
- struct kvm_save_segment {
- u16 selector;
- unsigned long base;
- u32 limit;
- u32 ar;
- } tr, es, ds, fs, gs;
- } rmode;
int halt_request; /* real mode on Intel only */
int cpuid_nent;
@@ -373,6 +370,11 @@ struct kvm_vcpu_arch {
unsigned long dr6;
unsigned long dr7;
unsigned long eff_db[KVM_NR_DB_REGS];
+
+ u64 mcg_cap;
+ u64 mcg_status;
+ u64 mcg_ctl;
+ u64 *mce_banks;
};
struct kvm_mem_alias {
@@ -526,6 +528,7 @@ struct kvm_x86_ops {
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+ const struct trace_print_flags *exit_reasons_str;
};
extern struct kvm_x86_ops *kvm_x86_ops;
@@ -752,8 +755,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
}
-#define MSR_IA32_TIME_STAMP_COUNTER 0x010
-
#define TSS_IOPB_BASE_OFFSET 0x66
#define TSS_BASE_SIZE 0x68
#define TSS_IOPB_SIZE (65536 / 8)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 1692fb5050e3..1723635e017a 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -378,6 +378,7 @@
/* AMD-V MSRs */
#define MSR_VM_CR 0xc0010114
+#define MSR_VM_IGNNE 0xc0010115
#define MSR_VM_HSAVE_PA 0xc0010117
#endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 11be5ad2e0e9..272514c2d456 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -55,6 +55,7 @@
#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
+#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -351,9 +352,16 @@ enum vmcs_field {
#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0
#define VMX_EPT_EXTENT_CONTEXT 1
#define VMX_EPT_EXTENT_GLOBAL 2
+
+#define VMX_EPT_EXECUTE_ONLY_BIT (1ull)
+#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6)
+#define VMX_EPTP_UC_BIT (1ull << 8)
+#define VMX_EPTP_WB_BIT (1ull << 14)
+#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
+
#define VMX_EPT_DEFAULT_GAW 3
#define VMX_EPT_MAX_GAW 0x4
#define VMX_EPT_MT_EPTE_SHIFT 3
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8600a09e0c6c..7fbedfd34d6c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -1,12 +1,8 @@
#
# KVM configuration
#
-config HAVE_KVM
- bool
-config HAVE_KVM_IRQCHIP
- bool
- default y
+source "virt/kvm/Kconfig"
menuconfig VIRTUALIZATION
bool "Virtualization"
@@ -29,6 +25,9 @@ config KVM
select PREEMPT_NOTIFIERS
select MMU_NOTIFIER
select ANON_INODES
+ select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_EVENTFD
+ select KVM_APIC_ARCHITECTURE
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b43c4efafe80..7c56850b82cb 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,22 +1,20 @@
-#
-# Makefile for Kernel-based Virtual Machine module
-#
-
-common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
- coalesced_mmio.o irq_comm.o)
-ifeq ($(CONFIG_KVM_TRACE),y)
-common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
-endif
-ifeq ($(CONFIG_IOMMU_API),y)
-common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
-endif
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
-kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
- i8254.o timer.o
-obj-$(CONFIG_KVM) += kvm.o
-kvm-intel-objs = vmx.o
-obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
-kvm-amd-objs = svm.o
-obj-$(CONFIG_KVM_AMD) += kvm-amd.o
+CFLAGS_x86.o := -I.
+CFLAGS_svm.o := -I.
+CFLAGS_vmx.o := -I.
+
+kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
+ coalesced_mmio.o irq_comm.o eventfd.o)
+kvm-$(CONFIG_KVM_TRACE) += $(addprefix ../../../virt/kvm/, kvm_trace.o)
+kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
+
+kvm-y += x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
+ i8254.o timer.o
+kvm-intel-y += vmx.o
+kvm-amd-y += svm.o
+
+obj-$(CONFIG_KVM) += kvm.o
+obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
+obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 4d6f0d293ee2..6e0a2033895a 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -228,7 +228,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
{
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
- if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
+ if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack)
return atomic_read(&pit->pit_state.pit_timer.pending);
return 0;
}
@@ -249,7 +249,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
struct hrtimer *timer;
- if (vcpu->vcpu_id != 0 || !pit)
+ if (!kvm_vcpu_is_bsp(vcpu) || !pit)
return;
timer = &pit->pit_state.pit_timer.timer;
@@ -291,7 +291,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
pt->timer.function = kvm_timer_fn;
pt->t_ops = &kpit_ops;
pt->kvm = ps->pit->kvm;
- pt->vcpu_id = 0;
+ pt->vcpu = pt->kvm->bsp_vcpu;
atomic_set(&pt->pending, 0);
ps->irq_ack = 1;
@@ -342,15 +342,23 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val)
{
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
pit_load_count(kvm, channel, val);
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+}
+
+static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_pit, dev);
+}
+
+static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_pit, speaker_dev);
}
static void pit_ioport_write(struct kvm_io_device *this,
gpa_t addr, int len, const void *data)
{
- struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_pit *pit = dev_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
struct kvm *kvm = pit->kvm;
int channel, access;
@@ -423,7 +431,7 @@ static void pit_ioport_write(struct kvm_io_device *this,
static void pit_ioport_read(struct kvm_io_device *this,
gpa_t addr, int len, void *data)
{
- struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_pit *pit = dev_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
struct kvm *kvm = pit->kvm;
int ret, count;
@@ -494,7 +502,7 @@ static int pit_in_range(struct kvm_io_device *this, gpa_t addr,
static void speaker_ioport_write(struct kvm_io_device *this,
gpa_t addr, int len, const void *data)
{
- struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_pit *pit = speaker_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
struct kvm *kvm = pit->kvm;
u32 val = *(u32 *) data;
@@ -508,7 +516,7 @@ static void speaker_ioport_write(struct kvm_io_device *this,
static void speaker_ioport_read(struct kvm_io_device *this,
gpa_t addr, int len, void *data)
{
- struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_pit *pit = speaker_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
struct kvm *kvm = pit->kvm;
unsigned int refresh_clock;
@@ -560,7 +568,19 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
}
}
-struct kvm_pit *kvm_create_pit(struct kvm *kvm)
+static const struct kvm_io_device_ops pit_dev_ops = {
+ .read = pit_ioport_read,
+ .write = pit_ioport_write,
+ .in_range = pit_in_range,
+};
+
+static const struct kvm_io_device_ops speaker_dev_ops = {
+ .read = speaker_ioport_read,
+ .write = speaker_ioport_write,
+ .in_range = speaker_in_range,
+};
+
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
{
struct kvm_pit *pit;
struct kvm_kpit_state *pit_state;
@@ -579,19 +599,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
mutex_lock(&pit->pit_state.lock);
spin_lock_init(&pit->pit_state.inject_lock);
- /* Initialize PIO device */
- pit->dev.read = pit_ioport_read;
- pit->dev.write = pit_ioport_write;
- pit->dev.in_range = pit_in_range;
- pit->dev.private = pit;
- kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
-
- pit->speaker_dev.read = speaker_ioport_read;
- pit->speaker_dev.write = speaker_ioport_write;
- pit->speaker_dev.in_range = speaker_in_range;
- pit->speaker_dev.private = pit;
- kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev);
-
kvm->arch.vpit = pit;
pit->kvm = kvm;
@@ -610,6 +617,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
pit->mask_notifier.func = pit_mask_notifer;
kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ kvm_iodevice_init(&pit->dev, &pit_dev_ops);
+ kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
+
+ if (flags & KVM_PIT_SPEAKER_DUMMY) {
+ kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
+ kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev);
+ }
+
return pit;
}
@@ -634,10 +649,10 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
struct kvm_vcpu *vcpu;
int i;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->irq_lock);
kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->irq_lock);
/*
* Provides NMI watchdog support via Virtual Wire mode.
@@ -649,11 +664,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
* VCPU0, and only if its LVT0 is in EXTINT mode.
*/
if (kvm->arch.vapics_in_nmi_mode > 0)
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = kvm->vcpus[i];
- if (vcpu)
- kvm_apic_nmi_wd_deliver(vcpu);
- }
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_apic_nmi_wd_deliver(vcpu);
}
void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index bbd863ff60b7..b2670180f225 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -50,7 +50,7 @@ struct kvm_pit {
void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
-struct kvm_pit *kvm_create_pit(struct kvm *kvm);
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
void kvm_free_pit(struct kvm *kvm);
void kvm_pit_reset(struct kvm_pit *pit);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 1ccb50c74f18..148c52a608d6 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -57,7 +57,7 @@ static void pic_unlock(struct kvm_pic *s)
}
if (wakeup) {
- vcpu = s->kvm->vcpus[0];
+ vcpu = s->kvm->bsp_vcpu;
if (vcpu)
kvm_vcpu_kick(vcpu);
}
@@ -72,8 +72,10 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
void kvm_pic_clear_isr_ack(struct kvm *kvm)
{
struct kvm_pic *s = pic_irqchip(kvm);
+ pic_lock(s);
s->pics[0].isr_ack = 0xff;
s->pics[1].isr_ack = 0xff;
+ pic_unlock(s);
}
/*
@@ -252,7 +254,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
{
int irq, irqbase, n;
struct kvm *kvm = s->pics_state->irq_request_opaque;
- struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
+ struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
if (s == &s->pics_state->pics[0])
irqbase = 0;
@@ -444,10 +446,15 @@ static int picdev_in_range(struct kvm_io_device *this, gpa_t addr,
}
}
+static inline struct kvm_pic *to_pic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_pic, dev);
+}
+
static void picdev_write(struct kvm_io_device *this,
gpa_t addr, int len, const void *val)
{
- struct kvm_pic *s = this->private;
+ struct kvm_pic *s = to_pic(this);
unsigned char data = *(unsigned char *)val;
if (len != 1) {
@@ -474,7 +481,7 @@ static void picdev_write(struct kvm_io_device *this,
static void picdev_read(struct kvm_io_device *this,
gpa_t addr, int len, void *val)
{
- struct kvm_pic *s = this->private;
+ struct kvm_pic *s = to_pic(this);
unsigned char data = 0;
if (len != 1) {
@@ -505,7 +512,7 @@ static void picdev_read(struct kvm_io_device *this,
static void pic_irq_request(void *opaque, int level)
{
struct kvm *kvm = opaque;
- struct kvm_vcpu *vcpu = kvm->vcpus[0];
+ struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
struct kvm_pic *s = pic_irqchip(kvm);
int irq = pic_get_irq(&s->pics[0]);
@@ -516,6 +523,12 @@ static void pic_irq_request(void *opaque, int level)
}
}
+static const struct kvm_io_device_ops picdev_ops = {
+ .read = picdev_read,
+ .write = picdev_write,
+ .in_range = picdev_in_range,
+};
+
struct kvm_pic *kvm_create_pic(struct kvm *kvm)
{
struct kvm_pic *s;
@@ -534,10 +547,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
/*
* Initialize PIO device
*/
- s->dev.read = picdev_read;
- s->dev.write = picdev_write;
- s->dev.in_range = picdev_in_range;
- s->dev.private = s;
+ kvm_iodevice_init(&s->dev, &picdev_ops);
kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
return s;
}
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 1ff819dce7d3..7bcc5b6a4403 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -29,4 +29,13 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
kvm_register_write(vcpu, VCPU_REGS_RIP, val);
}
+static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
+{
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
+
+ return vcpu->arch.pdptrs[index];
+}
+
#endif
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
deleted file mode 100644
index ed66e4c078dc..000000000000
--- a/arch/x86/kvm/kvm_svm.h
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef __KVM_SVM_H
-#define __KVM_SVM_H
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/list.h>
-#include <linux/kvm_host.h>
-#include <asm/msr.h>
-
-#include <asm/svm.h>
-
-static const u32 host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
- MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
- MSR_FS_BASE,
-#endif
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
-};
-
-#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-
-struct kvm_vcpu;
-
-struct vcpu_svm {
- struct kvm_vcpu vcpu;
- struct vmcb *vmcb;
- unsigned long vmcb_pa;
- struct svm_cpu_data *svm_data;
- uint64_t asid_generation;
-
- u64 next_rip;
-
- u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
- u64 host_gs_base;
- unsigned long host_cr2;
-
- u32 *msrpm;
- struct vmcb *hsave;
- u64 hsave_msr;
-
- u64 nested_vmcb;
-
- /* These are the merged vectors */
- u32 *nested_msrpm;
-
- /* gpa pointers to the real vectors */
- u64 nested_vmcb_msrpm;
-};
-
-#endif
-
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 26bd6ba74e1c..55c7524dda54 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -6,7 +6,7 @@ struct kvm_timer {
bool reinject;
struct kvm_timer_ops *t_ops;
struct kvm *kvm;
- int vcpu_id;
+ struct kvm_vcpu *vcpu;
};
struct kvm_timer_ops {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ae99d83f81a3..2e0286596387 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -34,6 +34,7 @@
#include <asm/atomic.h>
#include "kvm_cache_regs.h"
#include "irq.h"
+#include "trace.h"
#ifndef CONFIG_X86_64
#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -165,36 +166,52 @@ static int find_highest_vector(void *bitmap)
static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
{
+ apic->irr_pending = true;
return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
}
-static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
+static inline int apic_search_irr(struct kvm_lapic *apic)
{
- apic_clear_vector(vec, apic->regs + APIC_IRR);
+ return find_highest_vector(apic->regs + APIC_IRR);
}
static inline int apic_find_highest_irr(struct kvm_lapic *apic)
{
int result;
- result = find_highest_vector(apic->regs + APIC_IRR);
+ if (!apic->irr_pending)
+ return -1;
+
+ result = apic_search_irr(apic);
ASSERT(result == -1 || result >= 16);
return result;
}
+static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
+{
+ apic->irr_pending = false;
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
+ if (apic_search_irr(apic) != -1)
+ apic->irr_pending = true;
+}
+
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
int highest_irr;
+ /* This may race with setting of irr in __apic_accept_irq() and
+ * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
+ * will cause vmexit immediately and the value will be recalculated
+ * on the next vmentry.
+ */
if (!apic)
return 0;
highest_irr = apic_find_highest_irr(apic);
return highest_irr;
}
-EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
int vector, int level, int trig_mode);
@@ -425,7 +442,9 @@ static void apic_set_eoi(struct kvm_lapic *apic)
trigger_mode = IOAPIC_LEVEL_TRIG;
else
trigger_mode = IOAPIC_EDGE_TRIG;
+ mutex_lock(&apic->vcpu->kvm->irq_lock);
kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+ mutex_unlock(&apic->vcpu->kvm->irq_lock);
}
static void apic_send_ipi(struct kvm_lapic *apic)
@@ -449,7 +468,9 @@ static void apic_send_ipi(struct kvm_lapic *apic)
irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
irq.vector);
+ mutex_lock(&apic->vcpu->kvm->irq_lock);
kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
+ mutex_unlock(&apic->vcpu->kvm->irq_lock);
}
static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -495,8 +516,6 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
{
u32 val = 0;
- KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
-
if (offset >= LAPIC_MMIO_LENGTH)
return 0;
@@ -522,10 +541,15 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
return val;
}
+static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_lapic, dev);
+}
+
static void apic_mmio_read(struct kvm_io_device *this,
gpa_t address, int len, void *data)
{
- struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
+ struct kvm_lapic *apic = to_lapic(this);
unsigned int offset = address - apic->base_address;
unsigned char alignment = offset & 0xf;
u32 result;
@@ -537,6 +561,8 @@ static void apic_mmio_read(struct kvm_io_device *this,
}
result = __apic_read(apic, offset & ~0xf);
+ trace_kvm_apic_read(offset, result);
+
switch (len) {
case 1:
case 2:
@@ -606,7 +632,7 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
static void apic_mmio_write(struct kvm_io_device *this,
gpa_t address, int len, const void *data)
{
- struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
+ struct kvm_lapic *apic = to_lapic(this);
unsigned int offset = address - apic->base_address;
unsigned char alignment = offset & 0xf;
u32 val;
@@ -632,7 +658,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
offset &= 0xff0;
- KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
+ trace_kvm_apic_write(offset, val);
switch (offset) {
case APIC_ID: /* Local APIC ID */
@@ -723,7 +749,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr,
int len, int size)
{
- struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
+ struct kvm_lapic *apic = to_lapic(this);
int ret = 0;
@@ -763,7 +789,6 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
| (apic_get_reg(apic, APIC_TASKPRI) & 4));
}
-EXPORT_SYMBOL_GPL(kvm_lapic_set_tpr);
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
{
@@ -776,7 +801,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
return (tpr & 0xf0) >> 4;
}
-EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
{
@@ -787,7 +811,8 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
vcpu->arch.apic_base = value;
return;
}
- if (apic->vcpu->vcpu_id)
+
+ if (!kvm_vcpu_is_bsp(apic->vcpu))
value &= ~MSR_IA32_APICBASE_BSP;
vcpu->arch.apic_base = value;
@@ -800,12 +825,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
}
-u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.apic_base;
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
-
void kvm_lapic_reset(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic;
@@ -842,9 +861,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
+ apic->irr_pending = false;
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
- if (vcpu->vcpu_id == 0)
+ if (kvm_vcpu_is_bsp(vcpu))
vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
apic_update_ppr(apic);
@@ -855,7 +875,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
vcpu, kvm_apic_id(apic),
vcpu->arch.apic_base, apic->base_address);
}
-EXPORT_SYMBOL_GPL(kvm_lapic_reset);
bool kvm_apic_present(struct kvm_vcpu *vcpu)
{
@@ -866,7 +885,6 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
{
return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
}
-EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
/*
*----------------------------------------------------------------------
@@ -917,6 +935,12 @@ static struct kvm_timer_ops lapic_timer_ops = {
.is_periodic = lapic_is_periodic,
};
+static const struct kvm_io_device_ops apic_mmio_ops = {
+ .read = apic_mmio_read,
+ .write = apic_mmio_write,
+ .in_range = apic_mmio_range,
+};
+
int kvm_create_lapic(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic;
@@ -945,16 +969,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
apic->lapic_timer.timer.function = kvm_timer_fn;
apic->lapic_timer.t_ops = &lapic_timer_ops;
apic->lapic_timer.kvm = vcpu->kvm;
- apic->lapic_timer.vcpu_id = vcpu->vcpu_id;
+ apic->lapic_timer.vcpu = vcpu;
apic->base_address = APIC_DEFAULT_PHYS_BASE;
vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
kvm_lapic_reset(vcpu);
- apic->dev.read = apic_mmio_read;
- apic->dev.write = apic_mmio_write;
- apic->dev.in_range = apic_mmio_range;
- apic->dev.private = apic;
+ kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
return 0;
nomem_free_apic:
@@ -962,7 +983,6 @@ nomem_free_apic:
nomem:
return -ENOMEM;
}
-EXPORT_SYMBOL_GPL(kvm_create_lapic);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
@@ -985,7 +1005,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
int r = 0;
- if (vcpu->vcpu_id == 0) {
+ if (kvm_vcpu_is_bsp(vcpu)) {
if (!apic_hw_enabled(vcpu->arch.apic))
r = 1;
if ((lvt0 & APIC_LVT_MASKED) == 0 &&
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index a587f8349c46..3f3ecc6edbf5 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -12,6 +12,7 @@ struct kvm_lapic {
struct kvm_timer lapic_timer;
u32 divide_count;
struct kvm_vcpu *vcpu;
+ bool irr_pending;
struct page *regs_page;
void *regs;
gpa_t vapic_addr;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5c3d6e81a7dc..1f24d8833d61 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,6 +18,7 @@
*/
#include "mmu.h"
+#include "kvm_cache_regs.h"
#include <linux/kvm_host.h>
#include <linux/types.h>
@@ -142,7 +143,7 @@ module_param(oos_shadow, bool, 0644);
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
struct kvm_rmap_desc {
- u64 *shadow_ptes[RMAP_EXT];
+ u64 *sptes[RMAP_EXT];
struct kvm_rmap_desc *more;
};
@@ -239,16 +240,25 @@ static int is_writeble_pte(unsigned long pte)
return pte & PT_WRITABLE_MASK;
}
-static int is_dirty_pte(unsigned long pte)
+static int is_dirty_gpte(unsigned long pte)
{
- return pte & shadow_dirty_mask;
+ return pte & PT_DIRTY_MASK;
}
-static int is_rmap_pte(u64 pte)
+static int is_rmap_spte(u64 pte)
{
return is_shadow_present_pte(pte);
}
+static int is_last_spte(u64 pte, int level)
+{
+ if (level == PT_PAGE_TABLE_LEVEL)
+ return 1;
+ if (level == PT_DIRECTORY_LEVEL && is_large_pte(pte))
+ return 1;
+ return 0;
+}
+
static pfn_t spte_to_pfn(u64 pte)
{
return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -261,7 +271,7 @@ static gfn_t pse36_gfn_delta(u32 gpte)
return (gpte & PT32_DIR_PSE36_MASK) << shift;
}
-static void set_shadow_pte(u64 *sptep, u64 spte)
+static void __set_spte(u64 *sptep, u64 spte)
{
#ifdef CONFIG_X86_64
set_64bit((unsigned long *)sptep, spte);
@@ -384,9 +394,9 @@ static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
{
unsigned long idx;
- idx = (gfn / KVM_PAGES_PER_HPAGE) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE);
- return &slot->lpage_info[idx].write_count;
+ idx = (gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)) -
+ (slot->base_gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL));
+ return &slot->lpage_info[0][idx].write_count;
}
static void account_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -475,10 +485,10 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
if (!lpage)
return &slot->rmap[gfn - slot->base_gfn];
- idx = (gfn / KVM_PAGES_PER_HPAGE) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE);
+ idx = (gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)) -
+ (slot->base_gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL));
- return &slot->lpage_info[idx].rmap_pde;
+ return &slot->lpage_info[0][idx].rmap_pde;
}
/*
@@ -497,7 +507,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
unsigned long *rmapp;
int i;
- if (!is_rmap_pte(*spte))
+ if (!is_rmap_spte(*spte))
return;
gfn = unalias_gfn(vcpu->kvm, gfn);
sp = page_header(__pa(spte));
@@ -509,21 +519,21 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
} else if (!(*rmapp & 1)) {
rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
desc = mmu_alloc_rmap_desc(vcpu);
- desc->shadow_ptes[0] = (u64 *)*rmapp;
- desc->shadow_ptes[1] = spte;
+ desc->sptes[0] = (u64 *)*rmapp;
+ desc->sptes[1] = spte;
*rmapp = (unsigned long)desc | 1;
} else {
rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
- while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
+ while (desc->sptes[RMAP_EXT-1] && desc->more)
desc = desc->more;
- if (desc->shadow_ptes[RMAP_EXT-1]) {
+ if (desc->sptes[RMAP_EXT-1]) {
desc->more = mmu_alloc_rmap_desc(vcpu);
desc = desc->more;
}
- for (i = 0; desc->shadow_ptes[i]; ++i)
+ for (i = 0; desc->sptes[i]; ++i)
;
- desc->shadow_ptes[i] = spte;
+ desc->sptes[i] = spte;
}
}
@@ -534,14 +544,14 @@ static void rmap_desc_remove_entry(unsigned long *rmapp,
{
int j;
- for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
+ for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
;
- desc->shadow_ptes[i] = desc->shadow_ptes[j];
- desc->shadow_ptes[j] = NULL;
+ desc->sptes[i] = desc->sptes[j];
+ desc->sptes[j] = NULL;
if (j != 0)
return;
if (!prev_desc && !desc->more)
- *rmapp = (unsigned long)desc->shadow_ptes[0];
+ *rmapp = (unsigned long)desc->sptes[0];
else
if (prev_desc)
prev_desc->more = desc->more;
@@ -559,7 +569,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
unsigned long *rmapp;
int i;
- if (!is_rmap_pte(*spte))
+ if (!is_rmap_spte(*spte))
return;
sp = page_header(__pa(spte));
pfn = spte_to_pfn(*spte);
@@ -586,8 +596,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
prev_desc = NULL;
while (desc) {
- for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
- if (desc->shadow_ptes[i] == spte) {
+ for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
+ if (desc->sptes[i] == spte) {
rmap_desc_remove_entry(rmapp,
desc, i,
prev_desc);
@@ -618,10 +628,10 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
prev_desc = NULL;
prev_spte = NULL;
while (desc) {
- for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
+ for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
if (prev_spte == spte)
- return desc->shadow_ptes[i];
- prev_spte = desc->shadow_ptes[i];
+ return desc->sptes[i];
+ prev_spte = desc->sptes[i];
}
desc = desc->more;
}
@@ -643,7 +653,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
if (is_writeble_pte(*spte)) {
- set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
+ __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
@@ -667,7 +677,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
if (is_writeble_pte(*spte)) {
rmap_remove(kvm, spte);
--kvm->stat.lpages;
- set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ __set_spte(spte, shadow_trap_nonpresent_pte);
spte = NULL;
write_protected = 1;
}
@@ -686,7 +696,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
rmap_remove(kvm, spte);
- set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ __set_spte(spte, shadow_trap_nonpresent_pte);
need_tlb_flush = 1;
}
return need_tlb_flush;
@@ -714,11 +724,11 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
end = start + (memslot->npages << PAGE_SHIFT);
if (hva >= start && hva < end) {
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+ int idx = gfn_offset /
+ KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL);
retval |= handler(kvm, &memslot->rmap[gfn_offset]);
retval |= handler(kvm,
- &memslot->lpage_info[
- gfn_offset /
- KVM_PAGES_PER_HPAGE].rmap_pde);
+ &memslot->lpage_info[0][idx].rmap_pde);
}
}
@@ -1272,6 +1282,11 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
{
if (iterator->level < PT_PAGE_TABLE_LEVEL)
return false;
+
+ if (iterator->level == PT_PAGE_TABLE_LEVEL)
+ if (is_large_pte(*iterator->sptep))
+ return false;
+
iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
return true;
@@ -1292,25 +1307,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
pt = sp->spt;
- if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (is_shadow_present_pte(pt[i]))
- rmap_remove(kvm, &pt[i]);
- pt[i] = shadow_trap_nonpresent_pte;
- }
- return;
- }
-
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
ent = pt[i];
if (is_shadow_present_pte(ent)) {
- if (!is_large_pte(ent)) {
+ if (!is_last_spte(ent, sp->role.level)) {
ent &= PT64_BASE_ADDR_MASK;
mmu_page_remove_parent_pte(page_header(ent),
&pt[i]);
} else {
- --kvm->stat.lpages;
+ if (is_large_pte(ent))
+ --kvm->stat.lpages;
rmap_remove(kvm, &pt[i]);
}
}
@@ -1326,10 +1333,10 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
{
int i;
+ struct kvm_vcpu *vcpu;
- for (i = 0; i < KVM_MAX_VCPUS; ++i)
- if (kvm->vcpus[i])
- kvm->vcpus[i]->arch.last_pte_updated = NULL;
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ vcpu->arch.last_pte_updated = NULL;
}
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -1348,7 +1355,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
}
BUG_ON(!parent_pte);
kvm_mmu_put_page(sp, parent_pte);
- set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
+ __set_spte(parent_pte, shadow_trap_nonpresent_pte);
}
}
@@ -1495,7 +1502,7 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp)
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
if (pt[i] == shadow_notrap_nonpresent_pte)
- set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
+ __set_spte(&pt[i], shadow_trap_nonpresent_pte);
}
}
@@ -1661,7 +1668,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
return 0;
}
-static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int largepage,
gfn_t gfn, pfn_t pfn, bool speculative,
@@ -1711,7 +1718,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
* is responsibility of mmu_get_page / kvm_sync_page.
* Same reasoning can be applied to dirty page accounting.
*/
- if (!can_unsync && is_writeble_pte(*shadow_pte))
+ if (!can_unsync && is_writeble_pte(*sptep))
goto set_pte;
if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
@@ -1728,61 +1735,61 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
mark_page_dirty(vcpu->kvm, gfn);
set_pte:
- set_shadow_pte(shadow_pte, spte);
+ __set_spte(sptep, spte);
return ret;
}
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
int *ptwrite, int largepage, gfn_t gfn,
pfn_t pfn, bool speculative)
{
int was_rmapped = 0;
- int was_writeble = is_writeble_pte(*shadow_pte);
+ int was_writeble = is_writeble_pte(*sptep);
pgprintk("%s: spte %llx access %x write_fault %d"
" user_fault %d gfn %lx\n",
- __func__, *shadow_pte, pt_access,
+ __func__, *sptep, pt_access,
write_fault, user_fault, gfn);
- if (is_rmap_pte(*shadow_pte)) {
+ if (is_rmap_spte(*sptep)) {
/*
* If we overwrite a PTE page pointer with a 2MB PMD, unlink
* the parent of the now unreachable PTE.
*/
- if (largepage && !is_large_pte(*shadow_pte)) {
+ if (largepage && !is_large_pte(*sptep)) {
struct kvm_mmu_page *child;
- u64 pte = *shadow_pte;
+ u64 pte = *sptep;
child = page_header(pte & PT64_BASE_ADDR_MASK);
- mmu_page_remove_parent_pte(child, shadow_pte);
- } else if (pfn != spte_to_pfn(*shadow_pte)) {
+ mmu_page_remove_parent_pte(child, sptep);
+ } else if (pfn != spte_to_pfn(*sptep)) {
pgprintk("hfn old %lx new %lx\n",
- spte_to_pfn(*shadow_pte), pfn);
- rmap_remove(vcpu->kvm, shadow_pte);
+ spte_to_pfn(*sptep), pfn);
+ rmap_remove(vcpu->kvm, sptep);
} else
was_rmapped = 1;
}
- if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
+ if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
dirty, largepage, gfn, pfn, speculative, true)) {
if (write_fault)
*ptwrite = 1;
kvm_x86_ops->tlb_flush(vcpu);
}
- pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
+ pgprintk("%s: setting spte %llx\n", __func__, *sptep);
pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
- is_large_pte(*shadow_pte)? "2MB" : "4kB",
- is_present_pte(*shadow_pte)?"RW":"R", gfn,
- *shadow_pte, shadow_pte);
- if (!was_rmapped && is_large_pte(*shadow_pte))
+ is_large_pte(*sptep)? "2MB" : "4kB",
+ is_present_pte(*sptep)?"RW":"R", gfn,
+ *shadow_pte, sptep);
+ if (!was_rmapped && is_large_pte(*sptep))
++vcpu->kvm->stat.lpages;
- page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
+ page_header_update_slot(vcpu->kvm, sptep, gfn);
if (!was_rmapped) {
- rmap_add(vcpu, shadow_pte, gfn, largepage);
- if (!is_rmap_pte(*shadow_pte))
+ rmap_add(vcpu, sptep, gfn, largepage);
+ if (!is_rmap_spte(*sptep))
kvm_release_pfn_clean(pfn);
} else {
if (was_writeble)
@@ -1791,7 +1798,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
kvm_release_pfn_clean(pfn);
}
if (speculative) {
- vcpu->arch.last_pte_updated = shadow_pte;
+ vcpu->arch.last_pte_updated = sptep;
vcpu->arch.last_pte_gfn = gfn;
}
}
@@ -1829,10 +1836,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
return -ENOMEM;
}
- set_shadow_pte(iterator.sptep,
- __pa(sp->spt)
- | PT_PRESENT_MASK | PT_WRITABLE_MASK
- | shadow_user_mask | shadow_x_mask);
+ __set_spte(iterator.sptep,
+ __pa(sp->spt)
+ | PT_PRESENT_MASK | PT_WRITABLE_MASK
+ | shadow_user_mask | shadow_x_mask);
}
}
return pt_write;
@@ -1845,8 +1852,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
pfn_t pfn;
unsigned long mmu_seq;
- if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
- gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+ if (is_largepage_backed(vcpu, gfn &
+ ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1))) {
+ gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1);
largepage = 1;
}
@@ -1930,6 +1938,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
gfn_t root_gfn;
struct kvm_mmu_page *sp;
int direct = 0;
+ u64 pdptr;
root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
@@ -1957,11 +1966,12 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
ASSERT(!VALID_PAGE(root));
if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
- if (!is_present_pte(vcpu->arch.pdptrs[i])) {
+ pdptr = kvm_pdptr_read(vcpu, i);
+ if (!is_present_gpte(pdptr)) {
vcpu->arch.mmu.pae_root[i] = 0;
continue;
}
- root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
+ root_gfn = pdptr >> PAGE_SHIFT;
} else if (vcpu->arch.mmu.root_level == 0)
root_gfn = 0;
if (mmu_check_root(vcpu, root_gfn))
@@ -2049,8 +2059,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
if (r)
return r;
- if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
- gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+ if (is_largepage_backed(vcpu, gfn &
+ ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1))) {
+ gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1);
largepage = 1;
}
mmu_seq = vcpu->kvm->mmu_notifier_seq;
@@ -2157,7 +2168,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
else
/* 32 bits PSE 4MB page */
context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
- context->rsvd_bits_mask[1][0] = ~0ull;
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
break;
case PT32E_ROOT_LEVEL:
context->rsvd_bits_mask[0][2] =
@@ -2170,7 +2181,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 62) |
rsvd_bits(13, 20); /* large page */
- context->rsvd_bits_mask[1][0] = ~0ull;
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
break;
case PT64_ROOT_LEVEL:
context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
@@ -2186,7 +2197,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51) |
rsvd_bits(13, 20); /* large page */
- context->rsvd_bits_mask[1][0] = ~0ull;
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
break;
}
}
@@ -2354,15 +2365,14 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
pte = *spte;
if (is_shadow_present_pte(pte)) {
- if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
- is_large_pte(pte))
+ if (is_last_spte(pte, sp->role.level))
rmap_remove(vcpu->kvm, spte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
mmu_page_remove_parent_pte(child, spte);
}
}
- set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ __set_spte(spte, shadow_trap_nonpresent_pte);
if (is_large_pte(pte))
--vcpu->kvm->stat.lpages;
}
@@ -2448,12 +2458,12 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if ((bytes == 4) && (gpa % 4 == 0))
memcpy((void *)&gpte, new, 4);
}
- if (!is_present_pte(gpte))
+ if (!is_present_gpte(gpte))
return;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
- gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+ gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1);
vcpu->arch.update_pte.largepage = 1;
}
vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
@@ -2646,8 +2656,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
++vcpu->stat.mmio_exits;
return 0;
case EMULATE_FAIL:
- kvm_report_emulation_failure(vcpu, "pagetable");
- return 1;
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ return 0;
default:
BUG();
}
@@ -3005,6 +3016,24 @@ out:
return r;
}
+int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
+{
+ struct kvm_shadow_walk_iterator iterator;
+ int nr_sptes = 0;
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ for_each_shadow_entry(vcpu, addr, iterator) {
+ sptes[iterator.level-1] = *iterator.sptep;
+ nr_sptes++;
+ if (!is_shadow_present_pte(*iterator.sptep))
+ break;
+ }
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ return nr_sptes;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
+
#ifdef AUDIT
static const char *audit_msg;
@@ -3017,6 +3046,54 @@ static gva_t canonicalize(gva_t gva)
return gva;
}
+
+typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *sptep);
+
+static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
+ inspect_spte_fn fn)
+{
+ int i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ u64 ent = sp->spt[i];
+
+ if (is_shadow_present_pte(ent)) {
+ if (!is_last_spte(ent, sp->role.level)) {
+ struct kvm_mmu_page *child;
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+ __mmu_spte_walk(kvm, child, fn);
+ } else
+ fn(kvm, sp, &sp->spt[i]);
+ }
+ }
+}
+
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu->kvm, sp, fn);
+ return;
+ }
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root && VALID_PAGE(root)) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu->kvm, sp, fn);
+ }
+ }
+ return;
+}
+
static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
gva_t va, int level)
{
@@ -3031,20 +3108,19 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
continue;
va = canonicalize(va);
- if (level > 1) {
- if (ent == shadow_notrap_nonpresent_pte)
- printk(KERN_ERR "audit: (%s) nontrapping pte"
- " in nonleaf level: levels %d gva %lx"
- " level %d pte %llx\n", audit_msg,
- vcpu->arch.mmu.root_level, va, level, ent);
- else
- audit_mappings_page(vcpu, ent, va, level - 1);
- } else {
+ if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
+ audit_mappings_page(vcpu, ent, va, level - 1);
+ else {
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
gfn_t gfn = gpa >> PAGE_SHIFT;
pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ continue;
+ }
+
if (is_shadow_present_pte(ent)
&& (ent & PT64_BASE_ADDR_MASK) != hpa)
printk(KERN_ERR "xx audit error: (%s) levels %d"
@@ -3098,7 +3174,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
while (d) {
for (k = 0; k < RMAP_EXT; ++k)
- if (d->shadow_ptes[k])
+ if (d->sptes[k])
++nmaps;
else
break;
@@ -3109,9 +3185,48 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
return nmaps;
}
-static int count_writable_mappings(struct kvm_vcpu *vcpu)
+void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
+{
+ unsigned long *rmapp;
+ struct kvm_mmu_page *rev_sp;
+ gfn_t gfn;
+
+ if (*sptep & PT_WRITABLE_MASK) {
+ rev_sp = page_header(__pa(sptep));
+ gfn = rev_sp->gfns[sptep - rev_sp->spt];
+
+ if (!gfn_to_memslot(kvm, gfn)) {
+ if (!printk_ratelimit())
+ return;
+ printk(KERN_ERR "%s: no memslot for gfn %ld\n",
+ audit_msg, gfn);
+ printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
+ audit_msg, sptep - rev_sp->spt,
+ rev_sp->gfn);
+ dump_stack();
+ return;
+ }
+
+ rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
+ is_large_pte(*sptep));
+ if (!*rmapp) {
+ if (!printk_ratelimit())
+ return;
+ printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
+ audit_msg, *sptep);
+ dump_stack();
+ }
+ }
+
+}
+
+void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+{
+ mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+}
+
+static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
{
- int nmaps = 0;
struct kvm_mmu_page *sp;
int i;
@@ -3128,20 +3243,16 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu)
continue;
if (!(ent & PT_WRITABLE_MASK))
continue;
- ++nmaps;
+ inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]);
}
}
- return nmaps;
+ return;
}
static void audit_rmap(struct kvm_vcpu *vcpu)
{
- int n_rmap = count_rmaps(vcpu);
- int n_actual = count_writable_mappings(vcpu);
-
- if (n_rmap != n_actual)
- printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
- __func__, audit_msg, n_rmap, n_actual);
+ check_writable_mappings_rmap(vcpu);
+ count_rmaps(vcpu);
}
static void audit_write_protection(struct kvm_vcpu *vcpu)
@@ -3149,20 +3260,28 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
struct kvm_mmu_page *sp;
struct kvm_memory_slot *slot;
unsigned long *rmapp;
+ u64 *spte;
gfn_t gfn;
list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
if (sp->role.direct)
continue;
+ if (sp->unsync)
+ continue;
gfn = unalias_gfn(vcpu->kvm, sp->gfn);
slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
rmapp = &slot->rmap[gfn - slot->base_gfn];
- if (*rmapp)
- printk(KERN_ERR "%s: (%s) shadow page has writable"
- " mappings: gfn %lx role %x\n",
+
+ spte = rmap_next(vcpu->kvm, rmapp, NULL);
+ while (spte) {
+ if (*spte & PT_WRITABLE_MASK)
+ printk(KERN_ERR "%s: (%s) shadow page has "
+ "writable mappings: gfn %lx role %x\n",
__func__, audit_msg, sp->gfn,
sp->role.word);
+ spte = rmap_next(vcpu->kvm, rmapp, spte);
+ }
}
}
@@ -3174,7 +3293,9 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
audit_msg = msg;
audit_rmap(vcpu);
audit_write_protection(vcpu);
- audit_mappings(vcpu);
+ if (strcmp("pre pte write", audit_msg) != 0)
+ audit_mappings(vcpu);
+ audit_writable_sptes_have_rmaps(vcpu);
dbg = olddbg;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 3494a2fb136e..61a1b3884b49 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -37,6 +37,8 @@
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
+int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+
static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
@@ -75,7 +77,7 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
return vcpu->arch.cr0 & X86_CR0_PG;
}
-static inline int is_present_pte(unsigned long pte)
+static inline int is_present_gpte(unsigned long pte)
{
return pte & PT_PRESENT_MASK;
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 258e4591e1ca..53e129cec5fd 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -131,8 +131,8 @@ walk:
pte = vcpu->arch.cr3;
#if PTTYPE == 64
if (!is_long_mode(vcpu)) {
- pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
- if (!is_present_pte(pte))
+ pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
+ if (!is_present_gpte(pte))
goto not_present;
--walker->level;
}
@@ -155,7 +155,7 @@ walk:
kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
- if (!is_present_pte(pte))
+ if (!is_present_gpte(pte))
goto not_present;
rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
@@ -205,7 +205,7 @@ walk:
--walker->level;
}
- if (write_fault && !is_dirty_pte(pte)) {
+ if (write_fault && !is_dirty_gpte(pte)) {
bool ret;
mark_page_dirty(vcpu->kvm, table_gfn);
@@ -252,8 +252,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
gpte = *(const pt_element_t *)pte;
if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
- if (!is_present_pte(gpte))
- set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
+ if (!is_present_gpte(gpte))
+ __set_spte(spte, shadow_notrap_nonpresent_pte);
return;
}
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
@@ -281,7 +281,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
{
unsigned access = gw->pt_access;
struct kvm_mmu_page *shadow_page;
- u64 spte, *sptep;
+ u64 spte, *sptep = NULL;
int direct;
gfn_t table_gfn;
int r;
@@ -289,7 +289,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
pt_element_t curr_pte;
struct kvm_shadow_walk_iterator iterator;
- if (!is_present_pte(gw->ptes[gw->level - 1]))
+ if (!is_present_gpte(gw->ptes[gw->level - 1]))
return NULL;
for_each_shadow_entry(vcpu, addr, iterator) {
@@ -311,14 +311,14 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
if (is_large_pte(*sptep)) {
rmap_remove(vcpu->kvm, sptep);
- set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
}
if (level == PT_DIRECTORY_LEVEL
&& gw->level == PT_DIRECTORY_LEVEL) {
direct = 1;
- if (!is_dirty_pte(gw->ptes[level - 1]))
+ if (!is_dirty_gpte(gw->ptes[level - 1]))
access &= ~ACC_WRITE_MASK;
table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
} else {
@@ -369,7 +369,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
int user_fault = error_code & PFERR_USER_MASK;
int fetch_fault = error_code & PFERR_FETCH_MASK;
struct guest_walker walker;
- u64 *shadow_pte;
+ u64 *sptep;
int write_pt = 0;
int r;
pfn_t pfn;
@@ -401,7 +401,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
if (walker.level == PT_DIRECTORY_LEVEL) {
gfn_t large_gfn;
- large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
+ large_gfn = walker.gfn &
+ ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1);
if (is_largepage_backed(vcpu, large_gfn)) {
walker.gfn = large_gfn;
largepage = 1;
@@ -422,11 +423,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
- shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- largepage, &write_pt, pfn);
+ sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+ largepage, &write_pt, pfn);
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
- shadow_pte, *shadow_pte, write_pt);
+ sptep, *sptep, write_pt);
if (!write_pt)
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
@@ -472,7 +473,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
--vcpu->kvm->stat.lpages;
need_flush = 1;
}
- set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
break;
}
@@ -489,7 +490,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
sizeof(pt_element_t)))
return;
- if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) {
+ if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) {
if (mmu_topup_memory_caches(vcpu))
return;
kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte,
@@ -536,7 +537,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
for (j = 0; j < ARRAY_SIZE(pt); ++j)
- if (r || is_present_pte(pt[j]))
+ if (r || is_present_gpte(pt[j]))
sp->spt[i+j] = shadow_trap_nonpresent_pte;
else
sp->spt[i+j] = shadow_notrap_nonpresent_pte;
@@ -574,23 +575,23 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
sizeof(pt_element_t)))
return -EINVAL;
- if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) ||
+ if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) ||
!(gpte & PT_ACCESSED_MASK)) {
u64 nonpresent;
rmap_remove(vcpu->kvm, &sp->spt[i]);
- if (is_present_pte(gpte))
+ if (is_present_gpte(gpte))
nonpresent = shadow_trap_nonpresent_pte;
else
nonpresent = shadow_notrap_nonpresent_pte;
- set_shadow_pte(&sp->spt[i], nonpresent);
+ __set_spte(&sp->spt[i], nonpresent);
continue;
}
nr_present++;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
- is_dirty_pte(gpte), 0, gfn,
+ is_dirty_gpte(gpte), 0, gfn,
spte_to_pfn(sp->spt[i]), true, false);
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 71510e07e69e..fc14bdf60d39 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -15,7 +15,6 @@
*/
#include <linux/kvm_host.h>
-#include "kvm_svm.h"
#include "irq.h"
#include "mmu.h"
#include "kvm_cache_regs.h"
@@ -26,10 +25,12 @@
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/sched.h>
+#include <linux/ftrace_event.h>
#include <asm/desc.h>
#include <asm/virtext.h>
+#include "trace.h"
#define __ex(x) __kvm_handle_fault_on_reboot(x)
@@ -57,6 +58,46 @@ MODULE_LICENSE("GPL");
#define nsvm_printk(fmt, args...) do {} while(0)
#endif
+static const u32 host_save_user_msrs[] = {
+#ifdef CONFIG_X86_64
+ MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
+ MSR_FS_BASE,
+#endif
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+};
+
+#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
+
+struct kvm_vcpu;
+
+struct vcpu_svm {
+ struct kvm_vcpu vcpu;
+ struct vmcb *vmcb;
+ unsigned long vmcb_pa;
+ struct svm_cpu_data *svm_data;
+ uint64_t asid_generation;
+ uint64_t sysenter_cs;
+ uint64_t sysenter_esp;
+ uint64_t sysenter_eip;
+
+ u64 next_rip;
+
+ u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
+ u64 host_gs_base;
+
+ u32 *msrpm;
+ struct vmcb *hsave;
+ u64 hsave_msr;
+
+ u64 nested_vmcb;
+
+ /* These are the merged vectors */
+ u32 *nested_msrpm;
+
+ /* gpa pointers to the real vectors */
+ u64 nested_vmcb_msrpm;
+};
+
/* enable NPT for AMD64 and X86 with PAE */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
static bool npt_enabled = true;
@@ -147,19 +188,6 @@ static inline void invlpga(unsigned long addr, u32 asid)
asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
}
-static inline unsigned long kvm_read_cr2(void)
-{
- unsigned long cr2;
-
- asm volatile ("mov %%cr2, %0" : "=r" (cr2));
- return cr2;
-}
-
-static inline void kvm_write_cr2(unsigned long val)
-{
- asm volatile ("mov %0, %%cr2" :: "r" (val));
-}
-
static inline void force_new_asid(struct kvm_vcpu *vcpu)
{
to_svm(vcpu)->asid_generation--;
@@ -605,7 +633,7 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
init_vmcb(svm);
- if (vcpu->vcpu_id != 0) {
+ if (!kvm_vcpu_is_bsp(vcpu)) {
kvm_rip_write(vcpu, 0);
svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
@@ -669,7 +697,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
fx_init(&svm->vcpu);
svm->vcpu.fpu_active = 1;
svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
- if (svm->vcpu.vcpu_id == 0)
+ if (kvm_vcpu_is_bsp(&svm->vcpu))
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
return &svm->vcpu;
@@ -739,6 +767,18 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_svm(vcpu)->vmcb->save.rflags = rflags;
}
+static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+ switch (reg) {
+ case VCPU_EXREG_PDPTR:
+ BUG_ON(!npt_enabled);
+ load_pdptrs(vcpu, vcpu->arch.cr3);
+ break;
+ default:
+ BUG();
+ }
+}
+
static void svm_set_vintr(struct vcpu_svm *svm)
{
svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
@@ -1061,7 +1101,6 @@ static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
val = 0;
}
- KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
return val;
}
@@ -1070,8 +1109,6 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
{
struct vcpu_svm *svm = to_svm(vcpu);
- KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler);
-
*exception = 0;
switch (dr) {
@@ -1119,14 +1156,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
fault_address = svm->vmcb->control.exit_info_2;
error_code = svm->vmcb->control.exit_info_1;
- if (!npt_enabled)
- KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code,
- (u32)fault_address, (u32)(fault_address >> 32),
- handler);
- else
- KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code,
- (u32)fault_address, (u32)(fault_address >> 32),
- handler);
+ trace_kvm_page_fault(fault_address, error_code);
/*
* FIXME: Tis shouldn't be necessary here, but there is a flush
* missing in the MMU code. Until we find this bug, flush the
@@ -1253,14 +1283,12 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
- KVMTRACE_0D(NMI, &svm->vcpu, handler);
return 1;
}
static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
++svm->vcpu.stat.irq_exits;
- KVMTRACE_0D(INTR, &svm->vcpu, handler);
return 1;
}
@@ -1577,7 +1605,8 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
/* Kill any pending exceptions */
if (svm->vcpu.arch.exception.pending == true)
nsvm_printk("WARNING: Pending Exception\n");
- svm->vcpu.arch.exception.pending = false;
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
/* Restore selected save entries */
svm->vmcb->save.es = hsave->save.es;
@@ -1645,7 +1674,8 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
svm->nested_vmcb = svm->vmcb->save.rax;
/* Clear internal status */
- svm->vcpu.arch.exception.pending = false;
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
/* Save the old vmcb, so we don't need to pick what we save, but
can restore everything when a VMEXIT occurs */
@@ -1845,6 +1875,19 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return 1;
}
+static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ nsvm_printk("INVLPGA\n");
+
+ /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
+ kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+ return 1;
+}
+
static int invalid_op_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
@@ -1953,7 +1996,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
struct vcpu_svm *svm = to_svm(vcpu);
switch (ecx) {
- case MSR_IA32_TIME_STAMP_COUNTER: {
+ case MSR_IA32_TSC: {
u64 tsc;
rdtscll(tsc);
@@ -1978,13 +2021,13 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
break;
#endif
case MSR_IA32_SYSENTER_CS:
- *data = svm->vmcb->save.sysenter_cs;
+ *data = svm->sysenter_cs;
break;
case MSR_IA32_SYSENTER_EIP:
- *data = svm->vmcb->save.sysenter_eip;
+ *data = svm->sysenter_eip;
break;
case MSR_IA32_SYSENTER_ESP:
- *data = svm->vmcb->save.sysenter_esp;
+ *data = svm->sysenter_esp;
break;
/* Nobody will change the following 5 values in the VMCB so
we can safely return them on rdmsr. They will always be 0
@@ -2027,8 +2070,7 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
if (svm_get_msr(&svm->vcpu, ecx, &data))
kvm_inject_gp(&svm->vcpu, 0);
else {
- KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
- (u32)(data >> 32), handler);
+ trace_kvm_msr_read(ecx, data);
svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
@@ -2043,7 +2085,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
struct vcpu_svm *svm = to_svm(vcpu);
switch (ecx) {
- case MSR_IA32_TIME_STAMP_COUNTER: {
+ case MSR_IA32_TSC: {
u64 tsc;
rdtscll(tsc);
@@ -2068,13 +2110,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
break;
#endif
case MSR_IA32_SYSENTER_CS:
- svm->vmcb->save.sysenter_cs = data;
+ svm->sysenter_cs = data;
break;
case MSR_IA32_SYSENTER_EIP:
- svm->vmcb->save.sysenter_eip = data;
+ svm->sysenter_eip = data;
break;
case MSR_IA32_SYSENTER_ESP:
- svm->vmcb->save.sysenter_esp = data;
+ svm->sysenter_esp = data;
break;
case MSR_IA32_DEBUGCTLMSR:
if (!svm_has(SVM_FEATURE_LBRV)) {
@@ -2091,25 +2133,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
else
svm_disable_lbrv(svm);
break;
- case MSR_K7_EVNTSEL0:
- case MSR_K7_EVNTSEL1:
- case MSR_K7_EVNTSEL2:
- case MSR_K7_EVNTSEL3:
- case MSR_K7_PERFCTR0:
- case MSR_K7_PERFCTR1:
- case MSR_K7_PERFCTR2:
- case MSR_K7_PERFCTR3:
- /*
- * Just discard all writes to the performance counters; this
- * should keep both older linux and windows 64-bit guests
- * happy
- */
- pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data);
-
- break;
case MSR_VM_HSAVE_PA:
svm->hsave_msr = data;
break;
+ case MSR_VM_CR:
+ case MSR_VM_IGNNE:
+ pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
+ break;
default:
return kvm_set_msr_common(vcpu, ecx, data);
}
@@ -2122,8 +2152,7 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
- KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
- handler);
+ trace_kvm_msr_write(ecx, data);
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
if (svm_set_msr(&svm->vcpu, ecx, data))
@@ -2144,8 +2173,6 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
static int interrupt_window_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
- KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler);
-
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
/*
@@ -2201,7 +2228,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_INVD] = emulate_on_interception,
[SVM_EXIT_HLT] = halt_interception,
[SVM_EXIT_INVLPG] = invlpg_interception,
- [SVM_EXIT_INVLPGA] = invalid_op_interception,
+ [SVM_EXIT_INVLPGA] = invlpga_interception,
[SVM_EXIT_IOIO] = io_interception,
[SVM_EXIT_MSR] = msr_interception,
[SVM_EXIT_TASK_SWITCH] = task_switch_interception,
@@ -2224,8 +2251,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
u32 exit_code = svm->vmcb->control.exit_code;
- KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip,
- (u32)((u64)svm->vmcb->save.rip >> 32), entryexit);
+ trace_kvm_exit(exit_code, svm->vmcb->save.rip);
if (is_nested(svm)) {
nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n",
@@ -2246,12 +2272,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
}
vcpu->arch.cr0 = svm->vmcb->save.cr0;
vcpu->arch.cr3 = svm->vmcb->save.cr3;
- if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
- if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- }
if (mmu_reload) {
kvm_mmu_reset_context(vcpu);
kvm_mmu_load(vcpu);
@@ -2319,7 +2339,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
{
struct vmcb_control_area *control;
- KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
+ trace_kvm_inj_virq(irq);
++svm->vcpu.stat.irq_injections;
control = &svm->vmcb->control;
@@ -2329,21 +2349,14 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
}
-static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.event_inj = nr |
- SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
-}
-
static void svm_set_irq(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- nested_svm_intr(svm);
+ BUG_ON(!(svm->vcpu.arch.hflags & HF_GIF_MASK));
- svm_queue_irq(vcpu, vcpu->arch.interrupt.nr);
+ svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
+ SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
@@ -2371,13 +2384,25 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
struct vmcb *vmcb = svm->vmcb;
return (vmcb->save.rflags & X86_EFLAGS_IF) &&
!(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
- (svm->vcpu.arch.hflags & HF_GIF_MASK);
+ (svm->vcpu.arch.hflags & HF_GIF_MASK) &&
+ !is_nested(svm);
}
static void enable_irq_window(struct kvm_vcpu *vcpu)
{
- svm_set_vintr(to_svm(vcpu));
- svm_inject_irq(to_svm(vcpu), 0x0);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ nsvm_printk("Trying to open IRQ window\n");
+
+ nested_svm_intr(svm);
+
+ /* In case GIF=0 we can't rely on the CPU to tell us when
+ * GIF becomes 1, because that's a separate STGI/VMRUN intercept.
+ * The next time we get that intercept, this function will be
+ * called again though and we'll get the vintr intercept. */
+ if (svm->vcpu.arch.hflags & HF_GIF_MASK) {
+ svm_set_vintr(svm);
+ svm_inject_irq(svm, 0x0);
+ }
}
static void enable_nmi_window(struct kvm_vcpu *vcpu)
@@ -2456,6 +2481,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
case SVM_EXITINTINFO_TYPE_EXEPT:
/* In case of software exception do not reinject an exception
vector, but re-execute and instruction instead */
+ if (is_nested(svm))
+ break;
if (kvm_exception_is_soft(vector))
break;
if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
@@ -2498,7 +2525,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
fs_selector = kvm_read_fs();
gs_selector = kvm_read_gs();
ldt_selector = kvm_read_ldt();
- svm->host_cr2 = kvm_read_cr2();
if (!is_nested(svm))
svm->vmcb->save.cr2 = vcpu->arch.cr2;
/* required for live migration with NPT */
@@ -2585,8 +2611,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
- kvm_write_cr2(svm->host_cr2);
-
kvm_load_fs(fs_selector);
kvm_load_gs(gs_selector);
kvm_load_ldt(ldt_selector);
@@ -2602,6 +2626,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
svm->next_rip = 0;
+ if (npt_enabled) {
+ vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
+ vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
+ }
+
svm_complete_interrupts(svm);
}
@@ -2673,6 +2702,59 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
return 0;
}
+static const struct trace_print_flags svm_exit_reasons_str[] = {
+ { SVM_EXIT_READ_CR0, "read_cr0" },
+ { SVM_EXIT_READ_CR3, "read_cr3" },
+ { SVM_EXIT_READ_CR4, "read_cr4" },
+ { SVM_EXIT_READ_CR8, "read_cr8" },
+ { SVM_EXIT_WRITE_CR0, "write_cr0" },
+ { SVM_EXIT_WRITE_CR3, "write_cr3" },
+ { SVM_EXIT_WRITE_CR4, "write_cr4" },
+ { SVM_EXIT_WRITE_CR8, "write_cr8" },
+ { SVM_EXIT_READ_DR0, "read_dr0" },
+ { SVM_EXIT_READ_DR1, "read_dr1" },
+ { SVM_EXIT_READ_DR2, "read_dr2" },
+ { SVM_EXIT_READ_DR3, "read_dr3" },
+ { SVM_EXIT_WRITE_DR0, "write_dr0" },
+ { SVM_EXIT_WRITE_DR1, "write_dr1" },
+ { SVM_EXIT_WRITE_DR2, "write_dr2" },
+ { SVM_EXIT_WRITE_DR3, "write_dr3" },
+ { SVM_EXIT_WRITE_DR5, "write_dr5" },
+ { SVM_EXIT_WRITE_DR7, "write_dr7" },
+ { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" },
+ { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" },
+ { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" },
+ { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" },
+ { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" },
+ { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" },
+ { SVM_EXIT_INTR, "interrupt" },
+ { SVM_EXIT_NMI, "nmi" },
+ { SVM_EXIT_SMI, "smi" },
+ { SVM_EXIT_INIT, "init" },
+ { SVM_EXIT_VINTR, "vintr" },
+ { SVM_EXIT_CPUID, "cpuid" },
+ { SVM_EXIT_INVD, "invd" },
+ { SVM_EXIT_HLT, "hlt" },
+ { SVM_EXIT_INVLPG, "invlpg" },
+ { SVM_EXIT_INVLPGA, "invlpga" },
+ { SVM_EXIT_IOIO, "io" },
+ { SVM_EXIT_MSR, "msr" },
+ { SVM_EXIT_TASK_SWITCH, "task_switch" },
+ { SVM_EXIT_SHUTDOWN, "shutdown" },
+ { SVM_EXIT_VMRUN, "vmrun" },
+ { SVM_EXIT_VMMCALL, "hypercall" },
+ { SVM_EXIT_VMLOAD, "vmload" },
+ { SVM_EXIT_VMSAVE, "vmsave" },
+ { SVM_EXIT_STGI, "stgi" },
+ { SVM_EXIT_CLGI, "clgi" },
+ { SVM_EXIT_SKINIT, "skinit" },
+ { SVM_EXIT_WBINVD, "wbinvd" },
+ { SVM_EXIT_MONITOR, "monitor" },
+ { SVM_EXIT_MWAIT, "mwait" },
+ { SVM_EXIT_NPF, "npf" },
+ { -1, NULL }
+};
+
static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -2710,6 +2792,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_gdt = svm_set_gdt,
.get_dr = svm_get_dr,
.set_dr = svm_set_dr,
+ .cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
@@ -2733,6 +2816,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_tss_addr = svm_set_tss_addr,
.get_tdp_level = get_npt_level,
.get_mt_mask = svm_get_mt_mask,
+
+ .exit_reasons_str = svm_exit_reasons_str,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index 86dbac072d0c..eea40439066c 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -9,12 +9,16 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
int restart_timer = 0;
wait_queue_head_t *q = &vcpu->wq;
- /* FIXME: this code should not know anything about vcpus */
- if (!atomic_inc_and_test(&ktimer->pending))
+ /*
+ * There is a race window between reading and incrementing, but we do
+ * not care about potentially loosing timer events in the !reinject
+ * case anyway.
+ */
+ if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
+ atomic_inc(&ktimer->pending);
+ /* FIXME: this code should not know anything about vcpus */
set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
-
- if (!ktimer->reinject)
- atomic_set(&ktimer->pending, 1);
+ }
if (waitqueue_active(q))
wake_up_interruptible(q);
@@ -33,7 +37,7 @@ enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
struct kvm_vcpu *vcpu;
struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
- vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id];
+ vcpu = ktimer->vcpu;
if (!vcpu)
return HRTIMER_NORESTART;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
new file mode 100644
index 000000000000..4b6b6e83fc6b
--- /dev/null
+++ b/arch/x86/kvm/trace.h
@@ -0,0 +1,260 @@
+#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+#define TRACE_INCLUDE_PATH arch/x86/kvm
+#define TRACE_INCLUDE_FILE trace
+
+/*
+ * Tracepoint for guest mode entry.
+ */
+TRACE_EVENT(kvm_entry,
+ TP_PROTO(unsigned int vcpu_id),
+ TP_ARGS(vcpu_id),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ ),
+
+ TP_printk("vcpu %u\n", __entry->vcpu_id)
+);
+
+/*
+ * Tracepoint for hypercall.
+ */
+TRACE_EVENT(kvm_hypercall,
+ TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3),
+ TP_ARGS(nr, a0, a1, a2, a3),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, nr )
+ __field( unsigned long, a0 )
+ __field( unsigned long, a1 )
+ __field( unsigned long, a2 )
+ __field( unsigned long, a3 )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->a0 = a0;
+ __entry->a1 = a1;
+ __entry->a2 = a2;
+ __entry->a3 = a3;
+ ),
+
+ TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx",
+ __entry->nr, __entry->a0, __entry->a1, __entry->a2,
+ __entry->a3)
+);
+
+/*
+ * Tracepoint for PIO.
+ */
+TRACE_EVENT(kvm_pio,
+ TP_PROTO(unsigned int rw, unsigned int port, unsigned int size,
+ unsigned int count),
+ TP_ARGS(rw, port, size, count),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, port )
+ __field( unsigned int, size )
+ __field( unsigned int, count )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->port = port;
+ __entry->size = size;
+ __entry->count = count;
+ ),
+
+ TP_printk("pio_%s at 0x%x size %d count %d",
+ __entry->rw ? "write" : "read",
+ __entry->port, __entry->size, __entry->count)
+);
+
+/*
+ * Tracepoint for cpuid.
+ */
+TRACE_EVENT(kvm_cpuid,
+ TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx,
+ unsigned long rcx, unsigned long rdx),
+ TP_ARGS(function, rax, rbx, rcx, rdx),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, function )
+ __field( unsigned long, rax )
+ __field( unsigned long, rbx )
+ __field( unsigned long, rcx )
+ __field( unsigned long, rdx )
+ ),
+
+ TP_fast_assign(
+ __entry->function = function;
+ __entry->rax = rax;
+ __entry->rbx = rbx;
+ __entry->rcx = rcx;
+ __entry->rdx = rdx;
+ ),
+
+ TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx",
+ __entry->function, __entry->rax,
+ __entry->rbx, __entry->rcx, __entry->rdx)
+);
+
+/*
+ * Tracepoint for apic access.
+ */
+TRACE_EVENT(kvm_apic,
+ TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val),
+ TP_ARGS(rw, reg, val),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, reg )
+ __field( unsigned int, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->reg = reg;
+ __entry->val = val;
+ ),
+
+ TP_printk("apic_%s 0x%x = 0x%x",
+ __entry->rw ? "write" : "read",
+ __entry->reg, __entry->val)
+);
+
+#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
+#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
+
+/*
+ * Tracepoint for kvm guest exit:
+ */
+TRACE_EVENT(kvm_exit,
+ TP_PROTO(unsigned int exit_reason, unsigned long guest_rip),
+ TP_ARGS(exit_reason, guest_rip),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, exit_reason )
+ __field( unsigned long, guest_rip )
+ ),
+
+ TP_fast_assign(
+ __entry->exit_reason = exit_reason;
+ __entry->guest_rip = guest_rip;
+ ),
+
+ TP_printk("reason %s rip 0x%lx",
+ ftrace_print_symbols_seq(p, __entry->exit_reason,
+ kvm_x86_ops->exit_reasons_str),
+ __entry->guest_rip)
+);
+
+/*
+ * Tracepoint for kvm interrupt injection:
+ */
+TRACE_EVENT(kvm_inj_virq,
+ TP_PROTO(unsigned int irq),
+ TP_ARGS(irq),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ ),
+
+ TP_printk("irq %u", __entry->irq)
+);
+
+/*
+ * Tracepoint for page fault.
+ */
+TRACE_EVENT(kvm_page_fault,
+ TP_PROTO(unsigned long fault_address, unsigned int error_code),
+ TP_ARGS(fault_address, error_code),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, fault_address )
+ __field( unsigned int, error_code )
+ ),
+
+ TP_fast_assign(
+ __entry->fault_address = fault_address;
+ __entry->error_code = error_code;
+ ),
+
+ TP_printk("address %lx error_code %x",
+ __entry->fault_address, __entry->error_code)
+);
+
+/*
+ * Tracepoint for guest MSR access.
+ */
+TRACE_EVENT(kvm_msr,
+ TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data),
+ TP_ARGS(rw, ecx, data),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, ecx )
+ __field( unsigned long, data )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->ecx = ecx;
+ __entry->data = data;
+ ),
+
+ TP_printk("msr_%s %x = 0x%lx",
+ __entry->rw ? "write" : "read",
+ __entry->ecx, __entry->data)
+);
+
+#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data)
+#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data)
+
+/*
+ * Tracepoint for guest CR access.
+ */
+TRACE_EVENT(kvm_cr,
+ TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val),
+ TP_ARGS(rw, cr, val),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, cr )
+ __field( unsigned long, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->cr = cr;
+ __entry->val = val;
+ ),
+
+ TP_printk("cr_%s %x = 0x%lx",
+ __entry->rw ? "write" : "read",
+ __entry->cr, __entry->val)
+);
+
+#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val)
+#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val)
+
+#endif /* _TRACE_KVM_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e770bf349ec4..119fa470eded 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -25,6 +25,7 @@
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
+#include <linux/ftrace_event.h>
#include "kvm_cache_regs.h"
#include "x86.h"
@@ -34,6 +35,8 @@
#include <asm/virtext.h>
#include <asm/mce.h>
+#include "trace.h"
+
#define __ex(x) __kvm_handle_fault_on_reboot(x)
MODULE_AUTHOR("Qumranet");
@@ -51,6 +54,10 @@ module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
static int __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);
+static int __read_mostly enable_unrestricted_guest = 1;
+module_param_named(unrestricted_guest,
+ enable_unrestricted_guest, bool, S_IRUGO);
+
static int __read_mostly emulate_invalid_guest_state = 0;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
@@ -84,6 +91,14 @@ struct vcpu_vmx {
int guest_efer_loaded;
} host_state;
struct {
+ int vm86_active;
+ u8 save_iopl;
+ struct kvm_save_segment {
+ u16 selector;
+ unsigned long base;
+ u32 limit;
+ u32 ar;
+ } tr, es, ds, fs, gs;
struct {
bool pending;
u8 vector;
@@ -161,6 +176,8 @@ static struct kvm_vmx_segment_field {
VMX_SEGMENT_FIELD(LDTR),
};
+static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
+
/*
* Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
* away by decrementing the array size.
@@ -256,6 +273,26 @@ static inline bool cpu_has_vmx_flexpriority(void)
cpu_has_vmx_virtualize_apic_accesses();
}
+static inline bool cpu_has_vmx_ept_execute_only(void)
+{
+ return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT);
+}
+
+static inline bool cpu_has_vmx_eptp_uncacheable(void)
+{
+ return !!(vmx_capability.ept & VMX_EPTP_UC_BIT);
+}
+
+static inline bool cpu_has_vmx_eptp_writeback(void)
+{
+ return !!(vmx_capability.ept & VMX_EPTP_WB_BIT);
+}
+
+static inline bool cpu_has_vmx_ept_2m_page(void)
+{
+ return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
+}
+
static inline int cpu_has_vmx_invept_individual_addr(void)
{
return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -277,6 +314,12 @@ static inline int cpu_has_vmx_ept(void)
SECONDARY_EXEC_ENABLE_EPT;
}
+static inline int cpu_has_vmx_unrestricted_guest(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST;
+}
+
static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
return flexpriority_enabled &&
@@ -504,7 +547,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
eb |= 1u << BP_VECTOR;
}
- if (vcpu->arch.rmode.vm86_active)
+ if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0;
if (enable_ept)
eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
@@ -740,7 +783,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
- if (vcpu->arch.rmode.vm86_active)
+ if (to_vmx(vcpu)->rmode.vm86_active)
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
vmcs_writel(GUEST_RFLAGS, rflags);
}
@@ -797,12 +840,13 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
- if (vcpu->arch.rmode.vm86_active) {
+ if (vmx->rmode.vm86_active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = nr;
vmx->rmode.irq.rip = kvm_rip_read(vcpu);
- if (nr == BP_VECTOR || nr == OF_VECTOR)
- vmx->rmode.irq.rip++;
+ if (kvm_exception_is_soft(nr))
+ vmx->rmode.irq.rip +=
+ vmx->vcpu.arch.event_exit_inst_len;
intr_info |= INTR_TYPE_SOFT_INTR;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
@@ -940,7 +984,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_index, pdata);
#endif
- case MSR_IA32_TIME_STAMP_COUNTER:
+ case MSR_IA32_TSC:
data = guest_read_tsc();
break;
case MSR_IA32_SYSENTER_CS:
@@ -1000,22 +1044,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
case MSR_IA32_SYSENTER_ESP:
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
- case MSR_IA32_TIME_STAMP_COUNTER:
+ case MSR_IA32_TSC:
rdtscll(host_tsc);
guest_write_tsc(data, host_tsc);
break;
- case MSR_P6_PERFCTR0:
- case MSR_P6_PERFCTR1:
- case MSR_P6_EVNTSEL0:
- case MSR_P6_EVNTSEL1:
- /*
- * Just discard all writes to the performance counters; this
- * should keep both older linux and windows 64-bit guests
- * happy
- */
- pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
-
- break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
vmcs_write64(GUEST_IA32_PAT, data);
@@ -1046,6 +1078,10 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
case VCPU_REGS_RIP:
vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
break;
+ case VCPU_EXREG_PDPTR:
+ if (enable_ept)
+ ept_save_pdptrs(vcpu);
+ break;
default:
break;
}
@@ -1203,7 +1239,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_ENABLE_VPID |
- SECONDARY_EXEC_ENABLE_EPT;
+ SECONDARY_EXEC_ENABLE_EPT |
+ SECONDARY_EXEC_UNRESTRICTED_GUEST;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -1333,8 +1370,13 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_vpid())
enable_vpid = 0;
- if (!cpu_has_vmx_ept())
+ if (!cpu_has_vmx_ept()) {
enable_ept = 0;
+ enable_unrestricted_guest = 0;
+ }
+
+ if (!cpu_has_vmx_unrestricted_guest())
+ enable_unrestricted_guest = 0;
if (!cpu_has_vmx_flexpriority())
flexpriority_enabled = 0;
@@ -1342,6 +1384,9 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_tpr_shadow())
kvm_x86_ops->update_cr8_intercept = NULL;
+ if (enable_ept && !cpu_has_vmx_ept_2m_page())
+ kvm_disable_largepages();
+
return alloc_kvm_area();
}
@@ -1372,15 +1417,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
vmx->emulation_required = 1;
- vcpu->arch.rmode.vm86_active = 0;
+ vmx->rmode.vm86_active = 0;
- vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
- vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
- vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
+ vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
+ vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
+ vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
flags = vmcs_readl(GUEST_RFLAGS);
flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
- flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
+ flags |= (vmx->rmode.save_iopl << IOPL_SHIFT);
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@ -1391,10 +1436,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
if (emulate_invalid_guest_state)
return;
- fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
- fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
- fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
- fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
+ fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es);
+ fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds);
+ fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
+ fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
vmcs_write16(GUEST_SS_SELECTOR, 0);
vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
@@ -1433,20 +1478,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (enable_unrestricted_guest)
+ return;
+
vmx->emulation_required = 1;
- vcpu->arch.rmode.vm86_active = 1;
+ vmx->rmode.vm86_active = 1;
- vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
+ vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
- vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
+ vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
- vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
+ vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
flags = vmcs_readl(GUEST_RFLAGS);
- vcpu->arch.rmode.save_iopl
+ vmx->rmode.save_iopl
= (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
@@ -1468,10 +1516,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_CS_BASE, 0xf0000);
vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
- fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
- fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
- fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
- fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
+ fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
+ fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
+ fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
+ fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
continue_rmode:
kvm_mmu_reset_context(vcpu);
@@ -1545,11 +1593,11 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty))
+ return;
+
if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
- if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
- printk(KERN_ERR "EPT: Fail to load pdptrs!\n");
- return;
- }
vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
@@ -1557,6 +1605,21 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
}
}
+static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
+{
+ if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
+ vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
+ vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
+ vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
+ vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
+ }
+
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
@@ -1571,7 +1634,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
CPU_BASED_CR3_STORE_EXITING));
vcpu->arch.cr0 = cr0;
vmx_set_cr4(vcpu, vcpu->arch.cr4);
- *hw_cr0 |= X86_CR0_PE | X86_CR0_PG;
*hw_cr0 &= ~X86_CR0_WP;
} else if (!is_paging(vcpu)) {
/* From nonpaging to paging */
@@ -1598,15 +1660,21 @@ static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
- unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) |
- KVM_VM_CR0_ALWAYS_ON;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long hw_cr0;
+
+ if (enable_unrestricted_guest)
+ hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST)
+ | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
+ else
+ hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
vmx_fpu_deactivate(vcpu);
- if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE))
+ if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
enter_pmode(vcpu);
- if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE))
+ if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
enter_rmode(vcpu);
#ifdef CONFIG_X86_64
@@ -1650,8 +1718,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (enable_ept) {
eptp = construct_eptp(cr3);
vmcs_write64(EPT_POINTER, eptp);
- ept_sync_context(eptp);
- ept_load_pdptrs(vcpu);
guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
VMX_EPT_IDENTITY_PAGETABLE_ADDR;
}
@@ -1664,7 +1730,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ?
+ unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
vcpu->arch.cr4 = cr4;
@@ -1744,20 +1810,21 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
u32 ar;
- if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) {
- vcpu->arch.rmode.tr.selector = var->selector;
- vcpu->arch.rmode.tr.base = var->base;
- vcpu->arch.rmode.tr.limit = var->limit;
- vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
+ if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
+ vmx->rmode.tr.selector = var->selector;
+ vmx->rmode.tr.base = var->base;
+ vmx->rmode.tr.limit = var->limit;
+ vmx->rmode.tr.ar = vmx_segment_access_rights(var);
return;
}
vmcs_writel(sf->base, var->base);
vmcs_write32(sf->limit, var->limit);
vmcs_write16(sf->selector, var->selector);
- if (vcpu->arch.rmode.vm86_active && var->s) {
+ if (vmx->rmode.vm86_active && var->s) {
/*
* Hack real-mode segments into vm86 compatibility.
*/
@@ -1766,6 +1833,21 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
ar = 0xf3;
} else
ar = vmx_segment_access_rights(var);
+
+ /*
+ * Fix the "Accessed" bit in AR field of segment registers for older
+ * qemu binaries.
+ * IA32 arch specifies that at the time of processor reset the
+ * "Accessed" bit in the AR field of segment registers is 1. And qemu
+ * is setting it to 0 in the usedland code. This causes invalid guest
+ * state vmexit when "unrestricted guest" mode is turned on.
+ * Fix for this setup issue in cpu_reset is being pushed in the qemu
+ * tree. Newer qemu binaries with that qemu fix would not need this
+ * kvm hack.
+ */
+ if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
+ ar |= 0x1; /* Accessed */
+
vmcs_write32(sf->ar_bytes, ar);
}
@@ -2062,11 +2144,19 @@ out:
static void seg_setup(int seg)
{
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ unsigned int ar;
vmcs_write16(sf->selector, 0);
vmcs_writel(sf->base, 0);
vmcs_write32(sf->limit, 0xffff);
- vmcs_write32(sf->ar_bytes, 0xf3);
+ if (enable_unrestricted_guest) {
+ ar = 0x93;
+ if (seg == VCPU_SREG_CS)
+ ar |= 0x08; /* code segment */
+ } else
+ ar = 0xf3;
+
+ vmcs_write32(sf->ar_bytes, ar);
}
static int alloc_apic_access_page(struct kvm *kvm)
@@ -2209,6 +2299,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
if (!enable_ept)
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ if (!enable_unrestricted_guest)
+ exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
@@ -2326,14 +2418,14 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
goto out;
}
- vmx->vcpu.arch.rmode.vm86_active = 0;
+ vmx->rmode.vm86_active = 0;
vmx->soft_vnmi_blocked = 0;
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
kvm_set_cr8(&vmx->vcpu, 0);
msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
- if (vmx->vcpu.vcpu_id == 0)
+ if (kvm_vcpu_is_bsp(&vmx->vcpu))
msr |= MSR_IA32_APICBASE_BSP;
kvm_set_apic_base(&vmx->vcpu, msr);
@@ -2344,7 +2436,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
* GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
* insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
*/
- if (vmx->vcpu.vcpu_id == 0) {
+ if (kvm_vcpu_is_bsp(&vmx->vcpu)) {
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
vmcs_writel(GUEST_CS_BASE, 0x000f0000);
} else {
@@ -2373,7 +2465,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_SYSENTER_EIP, 0);
vmcs_writel(GUEST_RFLAGS, 0x02);
- if (vmx->vcpu.vcpu_id == 0)
+ if (kvm_vcpu_is_bsp(&vmx->vcpu))
kvm_rip_write(vcpu, 0xfff0);
else
kvm_rip_write(vcpu, 0);
@@ -2461,13 +2553,16 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
uint32_t intr;
int irq = vcpu->arch.interrupt.nr;
- KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
+ trace_kvm_inj_virq(irq);
++vcpu->stat.irq_injections;
- if (vcpu->arch.rmode.vm86_active) {
+ if (vmx->rmode.vm86_active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = irq;
vmx->rmode.irq.rip = kvm_rip_read(vcpu);
+ if (vcpu->arch.interrupt.soft)
+ vmx->rmode.irq.rip +=
+ vmx->vcpu.arch.event_exit_inst_len;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
@@ -2502,7 +2597,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
}
++vcpu->stat.nmi_injections;
- if (vcpu->arch.rmode.vm86_active) {
+ if (vmx->rmode.vm86_active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = NMI_VECTOR;
vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2659,14 +2754,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (enable_ept)
BUG();
cr2 = vmcs_readl(EXIT_QUALIFICATION);
- KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
- (u32)((u64)cr2 >> 32), handler);
+ trace_kvm_page_fault(cr2, error_code);
+
if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, cr2);
return kvm_mmu_page_fault(vcpu, cr2, error_code);
}
- if (vcpu->arch.rmode.vm86_active &&
+ if (vmx->rmode.vm86_active &&
handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
error_code)) {
if (vcpu->arch.halt_request) {
@@ -2707,7 +2802,6 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run)
{
++vcpu->stat.irq_exits;
- KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler);
return 1;
}
@@ -2755,7 +2849,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
- unsigned long exit_qualification;
+ unsigned long exit_qualification, val;
int cr;
int reg;
@@ -2764,21 +2858,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
reg = (exit_qualification >> 8) & 15;
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
- KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
- (u32)kvm_register_read(vcpu, reg),
- (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
- handler);
+ val = kvm_register_read(vcpu, reg);
+ trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
- kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
+ kvm_set_cr0(vcpu, val);
skip_emulated_instruction(vcpu);
return 1;
case 3:
- kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
+ kvm_set_cr3(vcpu, val);
skip_emulated_instruction(vcpu);
return 1;
case 4:
- kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
+ kvm_set_cr4(vcpu, val);
skip_emulated_instruction(vcpu);
return 1;
case 8: {
@@ -2800,23 +2892,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->arch.cr0 &= ~X86_CR0_TS;
vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
vmx_fpu_activate(vcpu);
- KVMTRACE_0D(CLTS, vcpu, handler);
skip_emulated_instruction(vcpu);
return 1;
case 1: /*mov from cr*/
switch (cr) {
case 3:
kvm_register_write(vcpu, reg, vcpu->arch.cr3);
- KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
- (u32)kvm_register_read(vcpu, reg),
- (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
- handler);
+ trace_kvm_cr_read(cr, vcpu->arch.cr3);
skip_emulated_instruction(vcpu);
return 1;
case 8:
- kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
- KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
- (u32)kvm_register_read(vcpu, reg), handler);
+ val = kvm_get_cr8(vcpu);
+ kvm_register_write(vcpu, cr, val);
+ trace_kvm_cr_read(cr, val);
skip_emulated_instruction(vcpu);
return 1;
}
@@ -2884,7 +2972,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
val = 0;
}
kvm_register_write(vcpu, reg, val);
- KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
} else {
val = vcpu->arch.regs[reg];
switch (dr) {
@@ -2917,7 +3004,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
break;
}
- KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler);
}
skip_emulated_instruction(vcpu);
return 1;
@@ -2939,8 +3025,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
- KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32),
- handler);
+ trace_kvm_msr_read(ecx, data);
/* FIXME: handling of bits 32:63 of rax, rdx */
vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
@@ -2955,8 +3040,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
- KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32),
- handler);
+ trace_kvm_msr_write(ecx, data);
if (vmx_set_msr(vcpu, ecx, data) != 0) {
kvm_inject_gp(vcpu, 0);
@@ -2983,7 +3067,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
- KVMTRACE_0D(PEND_INTR, vcpu, handler);
++vcpu->stat.irq_window_exits;
/*
@@ -3012,6 +3095,12 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
+static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3124,14 +3213,98 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
(long unsigned int)exit_qualification);
kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- kvm_run->hw.hardware_exit_reason = 0;
- return -ENOTSUPP;
+ kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
+ return 0;
}
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ trace_kvm_page_fault(gpa, exit_qualification);
return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
}
+static u64 ept_rsvd_mask(u64 spte, int level)
+{
+ int i;
+ u64 mask = 0;
+
+ for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
+ mask |= (1ULL << i);
+
+ if (level > 2)
+ /* bits 7:3 reserved */
+ mask |= 0xf8;
+ else if (level == 2) {
+ if (spte & (1ULL << 7))
+ /* 2MB ref, bits 20:12 reserved */
+ mask |= 0x1ff000;
+ else
+ /* bits 6:3 reserved */
+ mask |= 0x78;
+ }
+
+ return mask;
+}
+
+static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
+ int level)
+{
+ printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level);
+
+ /* 010b (write-only) */
+ WARN_ON((spte & 0x7) == 0x2);
+
+ /* 110b (write/execute) */
+ WARN_ON((spte & 0x7) == 0x6);
+
+ /* 100b (execute-only) and value not supported by logical processor */
+ if (!cpu_has_vmx_ept_execute_only())
+ WARN_ON((spte & 0x7) == 0x4);
+
+ /* not 000b */
+ if ((spte & 0x7)) {
+ u64 rsvd_bits = spte & ept_rsvd_mask(spte, level);
+
+ if (rsvd_bits != 0) {
+ printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n",
+ __func__, rsvd_bits);
+ WARN_ON(1);
+ }
+
+ if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
+ u64 ept_mem_type = (spte & 0x38) >> 3;
+
+ if (ept_mem_type == 2 || ept_mem_type == 3 ||
+ ept_mem_type == 7) {
+ printk(KERN_ERR "%s: ept_mem_type=0x%llx\n",
+ __func__, ept_mem_type);
+ WARN_ON(1);
+ }
+ }
+ }
+}
+
+static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ u64 sptes[4];
+ int nr_sptes, i;
+ gpa_t gpa;
+
+ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+
+ printk(KERN_ERR "EPT: Misconfiguration.\n");
+ printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
+
+ nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
+
+ for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
+ ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
+
+ kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+ kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
+
+ return 0;
+}
+
static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
u32 cpu_based_vm_exec_control;
@@ -3198,12 +3371,22 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
[EXIT_REASON_HLT] = handle_halt,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_VMCALL] = handle_vmcall,
+ [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
+ [EXIT_REASON_VMLAUNCH] = handle_vmx_insn,
+ [EXIT_REASON_VMPTRLD] = handle_vmx_insn,
+ [EXIT_REASON_VMPTRST] = handle_vmx_insn,
+ [EXIT_REASON_VMREAD] = handle_vmx_insn,
+ [EXIT_REASON_VMRESUME] = handle_vmx_insn,
+ [EXIT_REASON_VMWRITE] = handle_vmx_insn,
+ [EXIT_REASON_VMOFF] = handle_vmx_insn,
+ [EXIT_REASON_VMON] = handle_vmx_insn,
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_WBINVD] = handle_wbinvd,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
- [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
+ [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
+ [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
};
static const int kvm_vmx_max_exit_handlers =
@@ -3219,8 +3402,7 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
- KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
- (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
+ trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
/* If we need to emulate an MMIO from handle_invalid_guest_state
* we just return 0 */
@@ -3232,10 +3414,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
/* Access CR3 don't cause VMExit in paging mode, so we need
* to sync with guest real CR3. */
- if (enable_ept && is_paging(vcpu)) {
+ if (enable_ept && is_paging(vcpu))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
- ept_load_pdptrs(vcpu);
- }
if (unlikely(vmx->fail)) {
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -3311,10 +3491,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
/* We need to handle NMIs before interrupts are enabled */
if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
- (exit_intr_info & INTR_INFO_VALID_MASK)) {
- KVMTRACE_0D(NMI, &vmx->vcpu, handler);
+ (exit_intr_info & INTR_INFO_VALID_MASK))
asm("int $2");
- }
idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
@@ -3419,6 +3597,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (enable_ept && is_paging(vcpu)) {
+ vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
+ ept_load_pdptrs(vcpu);
+ }
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
vmx->entry_time = ktime_get();
@@ -3434,6 +3616,14 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+ /* When single-stepping over STI and MOV SS, we must clear the
+ * corresponding interruptibility bits in the guest state. Otherwise
+ * vmentry fails as it then expects bit 14 (BS) in pending debug
+ * exceptions being set, but that's not correct for the guest debugging
+ * case. */
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vmx_set_interrupt_shadow(vcpu, 0);
+
/*
* Loading guest fpu may have cleared host cr0.ts
*/
@@ -3450,11 +3640,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"mov %%"R"sp, %c[host_rsp](%0) \n\t"
__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
"1: \n\t"
+ /* Reload cr2 if changed */
+ "mov %c[cr2](%0), %%"R"ax \n\t"
+ "mov %%cr2, %%"R"dx \n\t"
+ "cmp %%"R"ax, %%"R"dx \n\t"
+ "je 2f \n\t"
+ "mov %%"R"ax, %%cr2 \n\t"
+ "2: \n\t"
/* Check if vmlaunch of vmresume is needed */
"cmpl $0, %c[launched](%0) \n\t"
/* Load guest registers. Don't clobber flags. */
- "mov %c[cr2](%0), %%"R"ax \n\t"
- "mov %%"R"ax, %%cr2 \n\t"
"mov %c[rax](%0), %%"R"ax \n\t"
"mov %c[rbx](%0), %%"R"bx \n\t"
"mov %c[rdx](%0), %%"R"dx \n\t"
@@ -3532,7 +3727,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
#endif
);
- vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
+ vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
+ | (1 << VCPU_EXREG_PDPTR));
vcpu->arch.regs_dirty = 0;
get_debugreg(vcpu->arch.dr6, 6);
@@ -3684,6 +3880,29 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
return ret;
}
+static const struct trace_print_flags vmx_exit_reasons_str[] = {
+ { EXIT_REASON_EXCEPTION_NMI, "exception" },
+ { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" },
+ { EXIT_REASON_TRIPLE_FAULT, "triple_fault" },
+ { EXIT_REASON_NMI_WINDOW, "nmi_window" },
+ { EXIT_REASON_IO_INSTRUCTION, "io_instruction" },
+ { EXIT_REASON_CR_ACCESS, "cr_access" },
+ { EXIT_REASON_DR_ACCESS, "dr_access" },
+ { EXIT_REASON_CPUID, "cpuid" },
+ { EXIT_REASON_MSR_READ, "rdmsr" },
+ { EXIT_REASON_MSR_WRITE, "wrmsr" },
+ { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" },
+ { EXIT_REASON_HLT, "halt" },
+ { EXIT_REASON_INVLPG, "invlpg" },
+ { EXIT_REASON_VMCALL, "hypercall" },
+ { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" },
+ { EXIT_REASON_APIC_ACCESS, "apic_access" },
+ { EXIT_REASON_WBINVD, "wbinvd" },
+ { EXIT_REASON_TASK_SWITCH, "task_switch" },
+ { EXIT_REASON_EPT_VIOLATION, "ept_violation" },
+ { -1, NULL }
+};
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -3743,6 +3962,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_tss_addr = vmx_set_tss_addr,
.get_tdp_level = get_ept_level,
.get_mt_mask = vmx_get_mt_mask,
+
+ .exit_reasons_str = vmx_exit_reasons_str,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 249540f98513..af53f64376cb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -37,11 +37,14 @@
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
+#define CREATE_TRACE_POINTS
+#include "trace.h"
#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
+#include <asm/mce.h>
#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
@@ -55,6 +58,10 @@
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+
+#define KVM_MAX_MCE_BANKS 32
+#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
+
/* EFER defaults:
* - enable syscall per default because its emulated by KVM
* - enable LME and LMA per default on 64 bit KVM
@@ -76,6 +83,9 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
+int ignore_msrs = 0;
+module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
+
struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "pf_fixed", VCPU_STAT(pf_fixed) },
{ "pf_guest", VCPU_STAT(pf_guest) },
@@ -176,16 +186,22 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
++vcpu->stat.pf_guest;
if (vcpu->arch.exception.pending) {
- if (vcpu->arch.exception.nr == PF_VECTOR) {
- printk(KERN_DEBUG "kvm: inject_page_fault:"
- " double fault 0x%lx\n", addr);
- vcpu->arch.exception.nr = DF_VECTOR;
- vcpu->arch.exception.error_code = 0;
- } else if (vcpu->arch.exception.nr == DF_VECTOR) {
+ switch(vcpu->arch.exception.nr) {
+ case DF_VECTOR:
/* triple fault -> shutdown */
set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ return;
+ case PF_VECTOR:
+ vcpu->arch.exception.nr = DF_VECTOR;
+ vcpu->arch.exception.error_code = 0;
+ return;
+ default:
+ /* replace previous exception with a new one in a hope
+ that instruction re-execution will regenerate lost
+ exception */
+ vcpu->arch.exception.pending = false;
+ break;
}
- return;
}
vcpu->arch.cr2 = addr;
kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
@@ -232,7 +248,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
goto out;
}
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
- if (is_present_pte(pdpte[i]) &&
+ if (is_present_gpte(pdpte[i]) &&
(pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
ret = 0;
goto out;
@@ -241,6 +257,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
ret = 1;
memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty);
out:
return ret;
@@ -256,6 +276,10 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
if (is_long_mode(vcpu) || !is_pae(vcpu))
return false;
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail))
+ return true;
+
r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
if (r < 0)
goto out;
@@ -328,9 +352,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
- KVMTRACE_1D(LMSW, vcpu,
- (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
- handler);
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
@@ -466,7 +487,7 @@ static u32 msrs_to_save[] = {
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
- MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};
@@ -644,8 +665,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
- kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
- &vcpu->hv_clock.tsc_timestamp);
+ kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
ktime_get_ts(&ts);
local_irq_restore(flags);
@@ -704,11 +724,48 @@ static bool msr_mtrr_valid(unsigned msr)
return false;
}
+static bool valid_pat_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
+}
+
+static bool valid_mtrr_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
+}
+
+static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ int i;
+
+ if (!msr_mtrr_valid(msr))
+ return false;
+
+ if (msr == MSR_IA32_CR_PAT) {
+ for (i = 0; i < 8; i++)
+ if (!valid_pat_type((data >> (i * 8)) & 0xff))
+ return false;
+ return true;
+ } else if (msr == MSR_MTRRdefType) {
+ if (data & ~0xcff)
+ return false;
+ return valid_mtrr_type(data & 0xff);
+ } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
+ for (i = 0; i < 8 ; i++)
+ if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
+ return false;
+ return true;
+ }
+
+ /* variable MTRRs */
+ return valid_mtrr_type(data & 0xff);
+}
+
static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
- if (!msr_mtrr_valid(msr))
+ if (!mtrr_valid(vcpu, msr, data))
return 1;
if (msr == MSR_MTRRdefType) {
@@ -741,23 +798,51 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return 0;
}
+static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+
+ switch (msr) {
+ case MSR_IA32_MCG_STATUS:
+ vcpu->arch.mcg_status = data;
+ break;
+ case MSR_IA32_MCG_CTL:
+ if (!(mcg_cap & MCG_CTL_P))
+ return 1;
+ if (data != 0 && data != ~(u64)0)
+ return -1;
+ vcpu->arch.mcg_ctl = data;
+ break;
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+ u32 offset = msr - MSR_IA32_MC0_CTL;
+ /* only 0 or all 1s can be written to IA32_MCi_CTL */
+ if ((offset & 0x3) == 0 &&
+ data != 0 && data != ~(u64)0)
+ return -1;
+ vcpu->arch.mce_banks[offset] = data;
+ break;
+ }
+ return 1;
+ }
+ return 0;
+}
+
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
case MSR_EFER:
set_efer(vcpu, data);
break;
- case MSR_IA32_MC0_STATUS:
- pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
- __func__, data);
- break;
- case MSR_IA32_MCG_STATUS:
- pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
- __func__, data);
- break;
- case MSR_IA32_MCG_CTL:
- pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
- __func__, data);
+ case MSR_K7_HWCR:
+ data &= ~(u64)0x40; /* ignore flush filter disable */
+ if (data != 0) {
+ pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
+ data);
+ return 1;
+ }
break;
case MSR_IA32_DEBUGCTLMSR:
if (!data) {
@@ -813,9 +898,50 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
kvm_request_guest_time_update(vcpu);
break;
}
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+ return set_msr_mce(vcpu, msr, data);
+
+ /* Performance counters are not protected by a CPUID bit,
+ * so we should check all of them in the generic path for the sake of
+ * cross vendor migration.
+ * Writing a zero into the event select MSRs disables them,
+ * which we perfectly emulate ;-). Any other value should be at least
+ * reported, some guests depend on them.
+ */
+ case MSR_P6_EVNTSEL0:
+ case MSR_P6_EVNTSEL1:
+ case MSR_K7_EVNTSEL0:
+ case MSR_K7_EVNTSEL1:
+ case MSR_K7_EVNTSEL2:
+ case MSR_K7_EVNTSEL3:
+ if (data != 0)
+ pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+ "0x%x data 0x%llx\n", msr, data);
+ break;
+ /* at least RHEL 4 unconditionally writes to the perfctr registers,
+ * so we ignore writes to make it happy.
+ */
+ case MSR_P6_PERFCTR0:
+ case MSR_P6_PERFCTR1:
+ case MSR_K7_PERFCTR0:
+ case MSR_K7_PERFCTR1:
+ case MSR_K7_PERFCTR2:
+ case MSR_K7_PERFCTR3:
+ pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+ "0x%x data 0x%llx\n", msr, data);
+ break;
default:
- pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
- return 1;
+ if (!ignore_msrs) {
+ pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
+ msr, data);
+ return 1;
+ } else {
+ pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
+ msr, data);
+ break;
+ }
}
return 0;
}
@@ -868,26 +994,47 @@ static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return 0;
}
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
u64 data;
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
switch (msr) {
- case 0xc0010010: /* SYSCFG */
- case 0xc0010015: /* HWCR */
- case MSR_IA32_PLATFORM_ID:
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
- case MSR_IA32_MC0_CTL:
- case MSR_IA32_MCG_STATUS:
+ data = 0;
+ break;
case MSR_IA32_MCG_CAP:
+ data = vcpu->arch.mcg_cap;
+ break;
case MSR_IA32_MCG_CTL:
- case MSR_IA32_MC0_MISC:
- case MSR_IA32_MC0_MISC+4:
- case MSR_IA32_MC0_MISC+8:
- case MSR_IA32_MC0_MISC+12:
- case MSR_IA32_MC0_MISC+16:
- case MSR_IA32_MC0_MISC+20:
+ if (!(mcg_cap & MCG_CTL_P))
+ return 1;
+ data = vcpu->arch.mcg_ctl;
+ break;
+ case MSR_IA32_MCG_STATUS:
+ data = vcpu->arch.mcg_status;
+ break;
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+ u32 offset = msr - MSR_IA32_MC0_CTL;
+ data = vcpu->arch.mce_banks[offset];
+ break;
+ }
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data;
+
+ switch (msr) {
+ case MSR_IA32_PLATFORM_ID:
case MSR_IA32_UCODE_REV:
case MSR_IA32_EBL_CR_POWERON:
case MSR_IA32_DEBUGCTLMSR:
@@ -895,9 +1042,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_LASTBRANCHTOIP:
case MSR_IA32_LASTINTFROMIP:
case MSR_IA32_LASTINTTOIP:
+ case MSR_K8_SYSCFG:
+ case MSR_K7_HWCR:
case MSR_VM_HSAVE_PA:
case MSR_P6_EVNTSEL0:
case MSR_P6_EVNTSEL1:
+ case MSR_K7_EVNTSEL0:
+ case MSR_K8_INT_PENDING_MSG:
data = 0;
break;
case MSR_MTRRcap:
@@ -929,9 +1080,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_KVM_SYSTEM_TIME:
data = vcpu->arch.time;
break;
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ case MSR_IA32_MCG_CAP:
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+ return get_msr_mce(vcpu, msr, pdata);
default:
- pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
- return 1;
+ if (!ignore_msrs) {
+ pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ } else {
+ pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
+ data = 0;
+ }
+ break;
}
*pdata = data;
return 0;
@@ -1030,6 +1194,8 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_ASSIGN_DEV_IRQ:
+ case KVM_CAP_IRQFD:
+ case KVM_CAP_PIT2:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -1050,6 +1216,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_IOMMU:
r = iommu_found();
break;
+ case KVM_CAP_MCE:
+ r = KVM_MAX_MCE_BANKS;
+ break;
default:
r = 0;
break;
@@ -1110,6 +1279,16 @@ long kvm_arch_dev_ioctl(struct file *filp,
r = 0;
break;
}
+ case KVM_X86_GET_MCE_CAP_SUPPORTED: {
+ u64 mce_cap;
+
+ mce_cap = KVM_MCE_CAP_SUPPORTED;
+ r = -EFAULT;
+ if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+ goto out;
+ r = 0;
+ break;
+ }
default:
r = -EINVAL;
}
@@ -1398,6 +1577,10 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
do_cpuid_ent(&cpuid_entries[nent], func, 0,
&nent, cpuid->nent);
+ r = -E2BIG;
+ if (nent >= cpuid->nent)
+ goto out_free;
+
r = -EFAULT;
if (copy_to_user(entries, cpuid_entries,
nent * sizeof(struct kvm_cpuid_entry2)))
@@ -1466,6 +1649,80 @@ static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
return 0;
}
+static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
+ u64 mcg_cap)
+{
+ int r;
+ unsigned bank_num = mcg_cap & 0xff, bank;
+
+ r = -EINVAL;
+ if (!bank_num)
+ goto out;
+ if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+ goto out;
+ r = 0;
+ vcpu->arch.mcg_cap = mcg_cap;
+ /* Init IA32_MCG_CTL to all 1s */
+ if (mcg_cap & MCG_CTL_P)
+ vcpu->arch.mcg_ctl = ~(u64)0;
+ /* Init IA32_MCi_CTL to all 1s */
+ for (bank = 0; bank < bank_num; bank++)
+ vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+out:
+ return r;
+}
+
+static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
+ struct kvm_x86_mce *mce)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+ u64 *banks = vcpu->arch.mce_banks;
+
+ if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
+ return -EINVAL;
+ /*
+ * if IA32_MCG_CTL is not all 1s, the uncorrected error
+ * reporting is disabled
+ */
+ if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
+ vcpu->arch.mcg_ctl != ~(u64)0)
+ return 0;
+ banks += 4 * mce->bank;
+ /*
+ * if IA32_MCi_CTL is not all 1s, the uncorrected error
+ * reporting is disabled for the bank
+ */
+ if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
+ return 0;
+ if (mce->status & MCI_STATUS_UC) {
+ if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
+ !(vcpu->arch.cr4 & X86_CR4_MCE)) {
+ printk(KERN_DEBUG "kvm: set_mce: "
+ "injects mce exception while "
+ "previous one is in progress!\n");
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ return 0;
+ }
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ vcpu->arch.mcg_status = mce->mcg_status;
+ banks[1] = mce->status;
+ kvm_queue_exception(vcpu, MC_VECTOR);
+ } else if (!(banks[1] & MCI_STATUS_VAL)
+ || !(banks[1] & MCI_STATUS_UC)) {
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ banks[1] = mce->status;
+ } else
+ banks[1] |= MCI_STATUS_OVER;
+ return 0;
+}
+
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -1599,6 +1856,24 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
break;
}
+ case KVM_X86_SETUP_MCE: {
+ u64 mcg_cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
+ goto out;
+ r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
+ break;
+ }
+ case KVM_X86_SET_MCE: {
+ struct kvm_x86_mce mce;
+
+ r = -EFAULT;
+ if (copy_from_user(&mce, argp, sizeof mce))
+ goto out;
+ r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+ break;
+ }
default:
r = -EINVAL;
}
@@ -1738,19 +2013,25 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
r = 0;
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
+ spin_lock(&pic_irqchip(kvm)->lock);
memcpy(&pic_irqchip(kvm)->pics[0],
&chip->chip.pic,
sizeof(struct kvm_pic_state));
+ spin_unlock(&pic_irqchip(kvm)->lock);
break;
case KVM_IRQCHIP_PIC_SLAVE:
+ spin_lock(&pic_irqchip(kvm)->lock);
memcpy(&pic_irqchip(kvm)->pics[1],
&chip->chip.pic,
sizeof(struct kvm_pic_state));
+ spin_unlock(&pic_irqchip(kvm)->lock);
break;
case KVM_IRQCHIP_IOAPIC:
+ mutex_lock(&kvm->irq_lock);
memcpy(ioapic_irqchip(kvm),
&chip->chip.ioapic,
sizeof(struct kvm_ioapic_state));
+ mutex_unlock(&kvm->irq_lock);
break;
default:
r = -EINVAL;
@@ -1764,7 +2045,9 @@ static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
int r = 0;
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return r;
}
@@ -1772,8 +2055,10 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
int r = 0;
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
kvm_pit_load_count(kvm, 0, ps->channels[0].count);
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return r;
}
@@ -1782,7 +2067,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
{
if (!kvm->arch.vpit)
return -ENXIO;
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return 0;
}
@@ -1833,6 +2120,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
union {
struct kvm_pit_state ps;
struct kvm_memory_alias alias;
+ struct kvm_pit_config pit_config;
} u;
switch (ioctl) {
@@ -1893,12 +2181,20 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
break;
case KVM_CREATE_PIT:
+ u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
+ goto create_pit;
+ case KVM_CREATE_PIT2:
+ r = -EFAULT;
+ if (copy_from_user(&u.pit_config, argp,
+ sizeof(struct kvm_pit_config)))
+ goto out;
+ create_pit:
mutex_lock(&kvm->lock);
r = -EEXIST;
if (kvm->arch.vpit)
goto create_pit_unlock;
r = -ENOMEM;
- kvm->arch.vpit = kvm_create_pit(kvm);
+ kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
if (kvm->arch.vpit)
r = 0;
create_pit_unlock:
@@ -1913,10 +2209,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
if (irqchip_in_kernel(kvm)) {
__s32 status;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->irq_lock);
status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
irq_event.irq, irq_event.level);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->irq_lock);
if (ioctl == KVM_IRQ_LINE_STATUS) {
irq_event.status = status;
if (copy_to_user(argp, &irq_event,
@@ -2049,7 +2345,7 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
if (vcpu->arch.apic) {
dev = &vcpu->arch.apic->dev;
- if (dev->in_range(dev, addr, len, is_write))
+ if (kvm_iodevice_in_range(dev, addr, len, is_write))
return dev;
}
return NULL;
@@ -2162,12 +2458,11 @@ mmio:
*/
mutex_lock(&vcpu->kvm->lock);
mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
+ mutex_unlock(&vcpu->kvm->lock);
if (mmio_dev) {
kvm_iodevice_read(mmio_dev, gpa, bytes, val);
- mutex_unlock(&vcpu->kvm->lock);
return X86EMUL_CONTINUE;
}
- mutex_unlock(&vcpu->kvm->lock);
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
@@ -2217,12 +2512,11 @@ mmio:
*/
mutex_lock(&vcpu->kvm->lock);
mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
+ mutex_unlock(&vcpu->kvm->lock);
if (mmio_dev) {
kvm_iodevice_write(mmio_dev, gpa, bytes, val);
- mutex_unlock(&vcpu->kvm->lock);
return X86EMUL_CONTINUE;
}
- mutex_unlock(&vcpu->kvm->lock);
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
@@ -2311,7 +2605,6 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
int emulate_clts(struct kvm_vcpu *vcpu)
{
- KVMTRACE_0D(CLTS, vcpu, handler);
kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
return X86EMUL_CONTINUE;
}
@@ -2412,14 +2705,33 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
- /* Reject the instructions other than VMCALL/VMMCALL when
- * try to emulate invalid opcode */
+ /* Only allow emulation of specific instructions on #UD
+ * (namely VMMCALL, sysenter, sysexit, syscall)*/
c = &vcpu->arch.emulate_ctxt.decode;
- if ((emulation_type & EMULTYPE_TRAP_UD) &&
- (!(c->twobyte && c->b == 0x01 &&
- (c->modrm_reg == 0 || c->modrm_reg == 3) &&
- c->modrm_mod == 3 && c->modrm_rm == 1)))
- return EMULATE_FAIL;
+ if (emulation_type & EMULTYPE_TRAP_UD) {
+ if (!c->twobyte)
+ return EMULATE_FAIL;
+ switch (c->b) {
+ case 0x01: /* VMMCALL */
+ if (c->modrm_mod != 3 || c->modrm_rm != 1)
+ return EMULATE_FAIL;
+ break;
+ case 0x34: /* sysenter */
+ case 0x35: /* sysexit */
+ if (c->modrm_mod != 0 || c->modrm_rm != 0)
+ return EMULATE_FAIL;
+ break;
+ case 0x05: /* syscall */
+ if (c->modrm_mod != 0 || c->modrm_rm != 0)
+ return EMULATE_FAIL;
+ break;
+ default:
+ return EMULATE_FAIL;
+ }
+
+ if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
+ return EMULATE_FAIL;
+ }
++vcpu->stat.insn_emulation;
if (r) {
@@ -2545,7 +2857,6 @@ static void kernel_pio(struct kvm_io_device *pio_dev,
{
/* TODO: String I/O for in kernel device */
- mutex_lock(&vcpu->kvm->lock);
if (vcpu->arch.pio.in)
kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
vcpu->arch.pio.size,
@@ -2554,7 +2865,6 @@ static void kernel_pio(struct kvm_io_device *pio_dev,
kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
vcpu->arch.pio.size,
pd);
- mutex_unlock(&vcpu->kvm->lock);
}
static void pio_string_write(struct kvm_io_device *pio_dev,
@@ -2564,14 +2874,12 @@ static void pio_string_write(struct kvm_io_device *pio_dev,
void *pd = vcpu->arch.pio_data;
int i;
- mutex_lock(&vcpu->kvm->lock);
for (i = 0; i < io->cur_count; i++) {
kvm_iodevice_write(pio_dev, io->port,
io->size,
pd);
pd += io->size;
}
- mutex_unlock(&vcpu->kvm->lock);
}
static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
@@ -2598,17 +2906,15 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->arch.pio.down = 0;
vcpu->arch.pio.rep = 0;
- if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
- KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
- handler);
- else
- KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
- handler);
+ trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
+ size, 1);
val = kvm_register_read(vcpu, VCPU_REGS_RAX);
memcpy(vcpu->arch.pio_data, &val, 4);
+ mutex_lock(&vcpu->kvm->lock);
pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
+ mutex_unlock(&vcpu->kvm->lock);
if (pio_dev) {
kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
complete_pio(vcpu);
@@ -2637,12 +2943,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->arch.pio.down = down;
vcpu->arch.pio.rep = rep;
- if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
- KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
- handler);
- else
- KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
- handler);
+ trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
+ size, count);
if (!count) {
kvm_x86_ops->skip_emulated_instruction(vcpu);
@@ -2672,9 +2974,12 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->arch.pio.guest_gva = address;
+ mutex_lock(&vcpu->kvm->lock);
pio_dev = vcpu_find_pio_dev(vcpu, port,
vcpu->arch.pio.cur_count,
!vcpu->arch.pio.in);
+ mutex_unlock(&vcpu->kvm->lock);
+
if (!vcpu->arch.pio.in) {
/* string PIO write */
ret = pio_copy_data(vcpu);
@@ -2724,10 +3029,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = kvm->vcpus[i];
- if (!vcpu)
- continue;
+ kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != freq->cpu)
continue;
if (!kvm_request_guest_time_update(vcpu))
@@ -2820,7 +3122,6 @@ void kvm_arch_exit(void)
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.halt_exits;
- KVMTRACE_0D(HLT, vcpu, handler);
if (irqchip_in_kernel(vcpu->kvm)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
return 1;
@@ -2851,7 +3152,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
- KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
+ trace_kvm_hypercall(nr, a0, a1, a2, a3);
if (!is_long_mode(vcpu)) {
nr &= 0xFFFFFFFF;
@@ -2951,8 +3252,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
return 0;
}
- KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
- (u32)((u64)value >> 32), handler);
return value;
}
@@ -2960,9 +3259,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
unsigned long *rflags)
{
- KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
- (u32)((u64)val >> 32), handler);
-
switch (cr) {
case 0:
kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
@@ -3072,11 +3368,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
}
kvm_x86_ops->skip_emulated_instruction(vcpu);
- KVMTRACE_5D(CPUID, vcpu, function,
- (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
- (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
- (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
- (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
+ trace_kvm_cpuid(function,
+ kvm_register_read(vcpu, VCPU_REGS_RAX),
+ kvm_register_read(vcpu, VCPU_REGS_RBX),
+ kvm_register_read(vcpu, VCPU_REGS_RCX),
+ kvm_register_read(vcpu, VCPU_REGS_RDX));
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
@@ -3157,9 +3453,6 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
-
/* try to reinject previous events if any */
if (vcpu->arch.nmi_injected) {
kvm_x86_ops->set_nmi(vcpu);
@@ -3275,7 +3568,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
set_debugreg(vcpu->arch.eff_db[3], 3);
}
- KVMTRACE_0D(VMENTRY, vcpu, entryexit);
+ trace_kvm_entry(vcpu->vcpu_id);
kvm_x86_ops->run(vcpu, kvm_run);
if (unlikely(vcpu->arch.switch_db_regs)) {
@@ -4111,7 +4404,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
/* Older userspace won't unhalt the vcpu on reset. */
- if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
+ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
!(vcpu->arch.cr0 & X86_CR0_PE))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -4382,7 +4675,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
kvm = vcpu->kvm;
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
+ if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
@@ -4404,6 +4697,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
goto fail_mmu_destroy;
}
+ vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+ GFP_KERNEL);
+ if (!vcpu->arch.mce_banks) {
+ r = -ENOMEM;
+ goto fail_mmu_destroy;
+ }
+ vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
return 0;
fail_mmu_destroy:
@@ -4451,20 +4752,22 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
static void kvm_free_vcpus(struct kvm *kvm)
{
unsigned int i;
+ struct kvm_vcpu *vcpu;
/*
* Unpin any mmu pages first.
*/
- for (i = 0; i < KVM_MAX_VCPUS; ++i)
- if (kvm->vcpus[i])
- kvm_unload_vcpu_mmu(kvm->vcpus[i]);
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- if (kvm->vcpus[i]) {
- kvm_arch_vcpu_free(kvm->vcpus[i]);
- kvm->vcpus[i] = NULL;
- }
- }
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_unload_vcpu_mmu(vcpu);
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_arch_vcpu_free(vcpu);
+ mutex_lock(&kvm->lock);
+ for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+ kvm->vcpus[i] = NULL;
+
+ atomic_set(&kvm->online_vcpus, 0);
+ mutex_unlock(&kvm->lock);
}
void kvm_arch_sync_events(struct kvm *kvm)
@@ -4580,3 +4883,9 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
{
return kvm_x86_ops->interrupt_allowed(vcpu);
}
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index c1b6c232e02b..c6663d46f328 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -32,6 +32,8 @@
#include <linux/module.h>
#include <asm/kvm_x86_emulate.h>
+#include "mmu.h" /* for is_long_mode() */
+
/*
* Opcode effective-address decode tables.
* Note that we only emulate instructions that have at least one memory
@@ -60,6 +62,7 @@
#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
#define SrcOne (7<<4) /* Implied '1' */
#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
+#define SrcImmU (9<<4) /* Immediate operand, unsigned */
#define SrcMask (0xf<<4)
/* Generic ModRM decode. */
#define ModRM (1<<8)
@@ -195,7 +198,7 @@ static u32 opcode_table[256] = {
ByteOp | SrcImmUByte, SrcImmUByte,
/* 0xE8 - 0xEF */
SrcImm | Stack, SrcImm | ImplicitOps,
- SrcImm | Src2Imm16, SrcImmByte | ImplicitOps,
+ SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps,
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
/* 0xF0 - 0xF7 */
@@ -208,7 +211,7 @@ static u32 opcode_table[256] = {
static u32 twobyte_table[256] = {
/* 0x00 - 0x0F */
- 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
+ 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0,
ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
/* 0x10 - 0x1F */
0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
@@ -216,7 +219,9 @@ static u32 twobyte_table[256] = {
ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x30 - 0x3F */
- ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ImplicitOps, 0, ImplicitOps, 0,
+ ImplicitOps, ImplicitOps, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x40 - 0x47 */
DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -319,8 +324,11 @@ static u32 group2_table[] = {
};
/* EFLAGS bit definitions. */
+#define EFLG_VM (1<<17)
+#define EFLG_RF (1<<16)
#define EFLG_OF (1<<11)
#define EFLG_DF (1<<10)
+#define EFLG_IF (1<<9)
#define EFLG_SF (1<<7)
#define EFLG_ZF (1<<6)
#define EFLG_AF (1<<4)
@@ -1027,6 +1035,7 @@ done_prefixes:
c->src.type = OP_MEM;
break;
case SrcImm:
+ case SrcImmU:
c->src.type = OP_IMM;
c->src.ptr = (unsigned long *)c->eip;
c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
@@ -1044,6 +1053,19 @@ done_prefixes:
c->src.val = insn_fetch(s32, 4, c->eip);
break;
}
+ if ((c->d & SrcMask) == SrcImmU) {
+ switch (c->src.bytes) {
+ case 1:
+ c->src.val &= 0xff;
+ break;
+ case 2:
+ c->src.val &= 0xffff;
+ break;
+ case 4:
+ c->src.val &= 0xffffffff;
+ break;
+ }
+ }
break;
case SrcImmByte:
case SrcImmUByte:
@@ -1361,7 +1383,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
return 0;
}
-void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
+static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
{
u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
/*
@@ -1375,6 +1397,217 @@ void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
ctxt->interruptibility = mask;
}
+static inline void
+setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
+ struct kvm_segment *cs, struct kvm_segment *ss)
+{
+ memset(cs, 0, sizeof(struct kvm_segment));
+ kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS);
+ memset(ss, 0, sizeof(struct kvm_segment));
+
+ cs->l = 0; /* will be adjusted later */
+ cs->base = 0; /* flat segment */
+ cs->g = 1; /* 4kb granularity */
+ cs->limit = 0xffffffff; /* 4GB limit */
+ cs->type = 0x0b; /* Read, Execute, Accessed */
+ cs->s = 1;
+ cs->dpl = 0; /* will be adjusted later */
+ cs->present = 1;
+ cs->db = 1;
+
+ ss->unusable = 0;
+ ss->base = 0; /* flat segment */
+ ss->limit = 0xffffffff; /* 4GB limit */
+ ss->g = 1; /* 4kb granularity */
+ ss->s = 1;
+ ss->type = 0x03; /* Read/Write, Accessed */
+ ss->db = 1; /* 32bit stack segment */
+ ss->dpl = 0;
+ ss->present = 1;
+}
+
+static int
+emulate_syscall(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct kvm_segment cs, ss;
+ u64 msr_data;
+
+ /* syscall is not available in real mode */
+ if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL
+ || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE))
+ return -1;
+
+ setup_syscalls_segments(ctxt, &cs, &ss);
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ msr_data >>= 32;
+ cs.selector = (u16)(msr_data & 0xfffc);
+ ss.selector = (u16)(msr_data + 8);
+
+ if (is_long_mode(ctxt->vcpu)) {
+ cs.db = 0;
+ cs.l = 1;
+ }
+ kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+ kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+
+ c->regs[VCPU_REGS_RCX] = c->eip;
+ if (is_long_mode(ctxt->vcpu)) {
+#ifdef CONFIG_X86_64
+ c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
+
+ kvm_x86_ops->get_msr(ctxt->vcpu,
+ ctxt->mode == X86EMUL_MODE_PROT64 ?
+ MSR_LSTAR : MSR_CSTAR, &msr_data);
+ c->eip = msr_data;
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
+ ctxt->eflags &= ~(msr_data | EFLG_RF);
+#endif
+ } else {
+ /* legacy mode */
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ c->eip = (u32)msr_data;
+
+ ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+ }
+
+ return 0;
+}
+
+static int
+emulate_sysenter(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct kvm_segment cs, ss;
+ u64 msr_data;
+
+ /* inject #UD if LOCK prefix is used */
+ if (c->lock_prefix)
+ return -1;
+
+ /* inject #GP if in real mode or paging is disabled */
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+
+ /* XXX sysenter/sysexit have not been tested in 64bit mode.
+ * Therefore, we inject an #UD.
+ */
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return -1;
+
+ setup_syscalls_segments(ctxt, &cs, &ss);
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ switch (ctxt->mode) {
+ case X86EMUL_MODE_PROT32:
+ if ((msr_data & 0xfffc) == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+ break;
+ case X86EMUL_MODE_PROT64:
+ if (msr_data == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+ break;
+ }
+
+ ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+ cs.selector = (u16)msr_data;
+ cs.selector &= ~SELECTOR_RPL_MASK;
+ ss.selector = cs.selector + 8;
+ ss.selector &= ~SELECTOR_RPL_MASK;
+ if (ctxt->mode == X86EMUL_MODE_PROT64
+ || is_long_mode(ctxt->vcpu)) {
+ cs.db = 0;
+ cs.l = 1;
+ }
+
+ kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+ kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
+ c->eip = msr_data;
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
+ c->regs[VCPU_REGS_RSP] = msr_data;
+
+ return 0;
+}
+
+static int
+emulate_sysexit(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct kvm_segment cs, ss;
+ u64 msr_data;
+ int usermode;
+
+ /* inject #UD if LOCK prefix is used */
+ if (c->lock_prefix)
+ return -1;
+
+ /* inject #GP if in real mode or paging is disabled */
+ if (ctxt->mode == X86EMUL_MODE_REAL
+ || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+
+ /* sysexit must be called from CPL 0 */
+ if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+
+ setup_syscalls_segments(ctxt, &cs, &ss);
+
+ if ((c->rex_prefix & 0x8) != 0x0)
+ usermode = X86EMUL_MODE_PROT64;
+ else
+ usermode = X86EMUL_MODE_PROT32;
+
+ cs.dpl = 3;
+ ss.dpl = 3;
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ switch (usermode) {
+ case X86EMUL_MODE_PROT32:
+ cs.selector = (u16)(msr_data + 16);
+ if ((msr_data & 0xfffc) == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+ ss.selector = (u16)(msr_data + 24);
+ break;
+ case X86EMUL_MODE_PROT64:
+ cs.selector = (u16)(msr_data + 32);
+ if (msr_data == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+ ss.selector = cs.selector + 8;
+ cs.db = 0;
+ cs.l = 1;
+ break;
+ }
+ cs.selector |= SELECTOR_RPL_MASK;
+ ss.selector |= SELECTOR_RPL_MASK;
+
+ kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+ kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+
+ c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
+ c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
+
+ return 0;
+}
+
int
x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
@@ -1970,6 +2203,12 @@ twobyte_insn:
goto cannot_emulate;
}
break;
+ case 0x05: /* syscall */
+ if (emulate_syscall(ctxt) == -1)
+ goto cannot_emulate;
+ else
+ goto writeback;
+ break;
case 0x06:
emulate_clts(ctxt->vcpu);
c->dst.type = OP_NONE;
@@ -2036,6 +2275,18 @@ twobyte_insn:
rc = X86EMUL_CONTINUE;
c->dst.type = OP_NONE;
break;
+ case 0x34: /* sysenter */
+ if (emulate_sysenter(ctxt) == -1)
+ goto cannot_emulate;
+ else
+ goto writeback;
+ break;
+ case 0x35: /* sysexit */
+ if (emulate_sysexit(ctxt) == -1)
+ goto cannot_emulate;
+ else
+ goto writeback;
+ break;
case 0x40 ... 0x4f: /* cmov */
c->dst.val = c->dst.orig_val = c->src.val;
if (!test_cc(c->b, ctxt->eflags))