summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2008-03-18 16:50:18 +1100
committerStephen Rothwell <sfr@canb.auug.org.au>2008-03-18 16:50:18 +1100
commite1ed0949e29188bf093eaeb974df65b481dcb860 (patch)
tree483f319a7c65303bbc924ca54a1e5e2421a4a4c2 /arch
parent30dadf6bf38f886f7e12f11746918f3c472522af (diff)
parentac1cd26007b7904043b8d4fc802b82aac076030a (diff)
Merge commit 'kvm/master'
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/Kconfig18
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/kvm.c247
-rw-r--r--arch/x86/kernel/kvmclock.c160
-rw-r--r--arch/x86/kernel/setup_32.c6
-rw-r--r--arch/x86/kernel/setup_64.c7
-rw-r--r--arch/x86/kvm/Makefile3
-rw-r--r--arch/x86/kvm/i8254.c600
-rw-r--r--arch/x86/kvm/i8254.h62
-rw-r--r--arch/x86/kvm/irq.c3
-rw-r--r--arch/x86/kvm/kvm_svm.h2
-rw-r--r--arch/x86/kvm/lapic.c8
-rw-r--r--arch/x86/kvm/mmu.c561
-rw-r--r--arch/x86/kvm/mmu.h6
-rw-r--r--arch/x86/kvm/paging_tmpl.h70
-rw-r--r--arch/x86/kvm/segment_descriptor.h29
-rw-r--r--arch/x86/kvm/svm.c229
-rw-r--r--arch/x86/kvm/vmx.c135
-rw-r--r--arch/x86/kvm/vmx.h10
-rw-r--r--arch/x86/kvm/x86.c384
-rw-r--r--arch/x86/kvm/x86_emulate.c256
21 files changed, 2381 insertions, 417 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 05ccdf209df8..a0c3567c7b6d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -370,6 +370,24 @@ config VMI
at the moment), by linking the kernel to a GPL-ed ROM module
provided by the hypervisor.
+config KVM_CLOCK
+ bool "KVM paravirtualized clock"
+ select PARAVIRT
+ help
+ Turning on this option will allow you to run a paravirtualized clock
+ when running over the KVM hypervisor. Instead of relying on a PIT
+ (or probably other) emulation by the underlying device model, the host
+ provides the guest with timing infrastructure such as time of day, and
+ system time
+
+config KVM_GUEST
+ bool "KVM Guest support"
+ select PARAVIRT
+ depends on !(X86_VISWS || X86_VOYAGER)
+ help
+ This option enables various optimizations for running under the KVM
+ hypervisor.
+
source "arch/x86/lguest/Kconfig"
config PARAVIRT
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4eb5ce841106..1cc9d42e34c4 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -77,6 +77,8 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
+obj-$(CONFIG_KVM_GUEST) += kvm.o
+obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
ifdef CONFIG_INPUT_PCSPKR
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
new file mode 100644
index 000000000000..d9121f932589
--- /dev/null
+++ b/arch/x86/kernel/kvm.c
@@ -0,0 +1,247 @@
+/*
+ * KVM paravirt_ops implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright IBM Corporation, 2007
+ * Authors: Anthony Liguori <aliguori@us.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/kvm_para.h>
+#include <linux/cpu.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/hardirq.h>
+
+#define MMU_QUEUE_SIZE 1024
+
+struct kvm_para_state {
+ u8 mmu_queue[MMU_QUEUE_SIZE];
+ int mmu_queue_len;
+ enum paravirt_lazy_mode mode;
+};
+
+static DEFINE_PER_CPU(struct kvm_para_state, para_state);
+
+static struct kvm_para_state *kvm_para_state(void)
+{
+ return &per_cpu(para_state, raw_smp_processor_id());
+}
+
+/*
+ * No need for any "IO delay" on KVM
+ */
+static void kvm_io_delay(void)
+{
+}
+
+static void kvm_mmu_op(void *buffer, unsigned len)
+{
+ int r;
+ unsigned long a1, a2;
+
+ do {
+ a1 = __pa(buffer);
+ a2 = 0; /* on i386 __pa() always returns <4G */
+ r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
+ buffer += r;
+ len -= r;
+ } while (len);
+}
+
+static void mmu_queue_flush(struct kvm_para_state *state)
+{
+ if (state->mmu_queue_len) {
+ kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
+ state->mmu_queue_len = 0;
+ }
+}
+
+static void kvm_deferred_mmu_op(void *buffer, int len)
+{
+ struct kvm_para_state *state = kvm_para_state();
+
+ if (state->mode != PARAVIRT_LAZY_MMU) {
+ kvm_mmu_op(buffer, len);
+ return;
+ }
+ if (state->mmu_queue_len + len > sizeof state->mmu_queue)
+ mmu_queue_flush(state);
+ memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
+ state->mmu_queue_len += len;
+}
+
+static void kvm_mmu_write(void *dest, u64 val)
+{
+ __u64 pte_phys;
+ struct kvm_mmu_op_write_pte wpte;
+
+#ifdef CONFIG_HIGHPTE
+ struct page *page;
+ unsigned long dst = (unsigned long) dest;
+
+ page = kmap_atomic_to_page(dest);
+ pte_phys = page_to_pfn(page);
+ pte_phys <<= PAGE_SHIFT;
+ pte_phys += (dst & ~(PAGE_MASK));
+#else
+ pte_phys = (unsigned long)__pa(dest);
+#endif
+ wpte.header.op = KVM_MMU_OP_WRITE_PTE;
+ wpte.pte_val = val;
+ wpte.pte_phys = pte_phys;
+
+ kvm_deferred_mmu_op(&wpte, sizeof wpte);
+}
+
+/*
+ * We only need to hook operations that are MMU writes. We hook these so that
+ * we can use lazy MMU mode to batch these operations. We could probably
+ * improve the performance of the host code if we used some of the information
+ * here to simplify processing of batched writes.
+ */
+static void kvm_set_pte(pte_t *ptep, pte_t pte)
+{
+ kvm_mmu_write(ptep, pte_val(pte));
+}
+
+static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ kvm_mmu_write(ptep, pte_val(pte));
+}
+
+static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ kvm_mmu_write(pmdp, pmd_val(pmd));
+}
+
+#if PAGETABLE_LEVELS >= 3
+#ifdef CONFIG_X86_PAE
+static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ kvm_mmu_write(ptep, pte_val(pte));
+}
+
+static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ kvm_mmu_write(ptep, pte_val(pte));
+}
+
+static void kvm_pte_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ kvm_mmu_write(ptep, 0);
+}
+
+static void kvm_pmd_clear(pmd_t *pmdp)
+{
+ kvm_mmu_write(pmdp, 0);
+}
+#endif
+
+static void kvm_set_pud(pud_t *pudp, pud_t pud)
+{
+ kvm_mmu_write(pudp, pud_val(pud));
+}
+
+#if PAGETABLE_LEVELS == 4
+static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ kvm_mmu_write(pgdp, pgd_val(pgd));
+}
+#endif
+#endif /* PAGETABLE_LEVELS >= 3 */
+
+static void kvm_flush_tlb(void)
+{
+ struct kvm_mmu_op_flush_tlb ftlb = {
+ .header.op = KVM_MMU_OP_FLUSH_TLB,
+ };
+
+ kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
+}
+
+static void kvm_release_pt(u32 pfn)
+{
+ struct kvm_mmu_op_release_pt rpt = {
+ .header.op = KVM_MMU_OP_RELEASE_PT,
+ .pt_phys = (u64)pfn << PAGE_SHIFT,
+ };
+
+ kvm_mmu_op(&rpt, sizeof rpt);
+}
+
+static void kvm_enter_lazy_mmu(void)
+{
+ struct kvm_para_state *state = kvm_para_state();
+
+ paravirt_enter_lazy_mmu();
+ state->mode = paravirt_get_lazy_mode();
+}
+
+static void kvm_leave_lazy_mmu(void)
+{
+ struct kvm_para_state *state = kvm_para_state();
+
+ mmu_queue_flush(state);
+ paravirt_leave_lazy(paravirt_get_lazy_mode());
+ state->mode = paravirt_get_lazy_mode();
+}
+
+static void paravirt_ops_setup(void)
+{
+ pv_info.name = "KVM";
+ pv_info.paravirt_enabled = 1;
+
+ if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
+ pv_cpu_ops.io_delay = kvm_io_delay;
+
+ if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
+ pv_mmu_ops.set_pte = kvm_set_pte;
+ pv_mmu_ops.set_pte_at = kvm_set_pte_at;
+ pv_mmu_ops.set_pmd = kvm_set_pmd;
+#if PAGETABLE_LEVELS >= 3
+#ifdef CONFIG_X86_PAE
+ pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
+ pv_mmu_ops.set_pte_present = kvm_set_pte_present;
+ pv_mmu_ops.pte_clear = kvm_pte_clear;
+ pv_mmu_ops.pmd_clear = kvm_pmd_clear;
+#endif
+ pv_mmu_ops.set_pud = kvm_set_pud;
+#if PAGETABLE_LEVELS == 4
+ pv_mmu_ops.set_pgd = kvm_set_pgd;
+#endif
+#endif
+ pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
+ pv_mmu_ops.release_pt = kvm_release_pt;
+ pv_mmu_ops.release_pd = kvm_release_pt;
+
+ pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
+ pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
+ }
+}
+
+void __init kvm_guest_init(void)
+{
+ if (!kvm_para_available())
+ return;
+
+ paravirt_ops_setup();
+}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
new file mode 100644
index 000000000000..b999f5e5b3bf
--- /dev/null
+++ b/arch/x86/kernel/kvmclock.c
@@ -0,0 +1,160 @@
+/* KVM paravirtual clock driver. A clocksource implementation
+ Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include <linux/clocksource.h>
+#include <linux/kvm_para.h>
+#include <asm/arch_hooks.h>
+#include <asm/msr.h>
+#include <asm/apic.h>
+#include <linux/percpu.h>
+
+#define KVM_SCALE 22
+
+static int kvmclock = 1;
+
+static int parse_no_kvmclock(char *arg)
+{
+ kvmclock = 0;
+ return 0;
+}
+early_param("no-kvmclock", parse_no_kvmclock);
+
+/* The hypervisor will put information about time periodically here */
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock);
+#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field
+
+static inline u64 kvm_get_delta(u64 last_tsc)
+{
+ int cpu = smp_processor_id();
+ u64 delta = native_read_tsc() - last_tsc;
+ return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
+}
+
+static struct kvm_wall_clock wall_clock;
+static cycle_t kvm_clock_read(void);
+/*
+ * The wallclock is the time of day when we booted. Since then, some time may
+ * have elapsed since the hypervisor wrote the data. So we try to account for
+ * that with system time
+ */
+unsigned long kvm_get_wallclock(void)
+{
+ u32 wc_sec, wc_nsec;
+ u64 delta;
+ struct timespec ts;
+ int version, nsec;
+ int low, high;
+
+ low = (int)__pa(&wall_clock);
+ high = ((u64)__pa(&wall_clock) >> 32);
+
+ delta = kvm_clock_read();
+
+ native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
+ do {
+ version = wall_clock.wc_version;
+ rmb();
+ wc_sec = wall_clock.wc_sec;
+ wc_nsec = wall_clock.wc_nsec;
+ rmb();
+ } while ((wall_clock.wc_version != version) || (version & 1));
+
+ delta = kvm_clock_read() - delta;
+ delta += wc_nsec;
+ nsec = do_div(delta, NSEC_PER_SEC);
+ set_normalized_timespec(&ts, wc_sec + delta, nsec);
+ /*
+ * Of all mechanisms of time adjustment I've tested, this one
+ * was the champion!
+ */
+ return ts.tv_sec + 1;
+}
+
+int kvm_set_wallclock(unsigned long now)
+{
+ return 0;
+}
+
+/*
+ * This is our read_clock function. The host puts an tsc timestamp each time
+ * it updates a new time. Without the tsc adjustment, we can have a situation
+ * in which a vcpu starts to run earlier (smaller system_time), but probes
+ * time later (compared to another vcpu), leading to backwards time
+ */
+static cycle_t kvm_clock_read(void)
+{
+ u64 last_tsc, now;
+ int cpu;
+
+ preempt_disable();
+ cpu = smp_processor_id();
+
+ last_tsc = get_clock(cpu, tsc_timestamp);
+ now = get_clock(cpu, system_time);
+
+ now += kvm_get_delta(last_tsc);
+ preempt_enable();
+
+ return now;
+}
+static struct clocksource kvm_clock = {
+ .name = "kvm-clock",
+ .read = kvm_clock_read,
+ .rating = 400,
+ .mask = CLOCKSOURCE_MASK(64),
+ .mult = 1 << KVM_SCALE,
+ .shift = KVM_SCALE,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static int kvm_register_clock(void)
+{
+ int cpu = smp_processor_id();
+ int low, high;
+ low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
+ high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
+
+ return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
+}
+
+static void kvm_setup_secondary_clock(void)
+{
+ /*
+ * Now that the first cpu already had this clocksource initialized,
+ * we shouldn't fail.
+ */
+ WARN_ON(kvm_register_clock());
+ /* ok, done with our trickery, call native */
+ setup_secondary_APIC_clock();
+}
+
+void __init kvmclock_init(void)
+{
+ if (!kvm_para_available())
+ return;
+
+ if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
+ if (kvm_register_clock())
+ return;
+ pv_time_ops.get_wallclock = kvm_get_wallclock;
+ pv_time_ops.set_wallclock = kvm_set_wallclock;
+ pv_time_ops.sched_clock = kvm_clock_read;
+ pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
+ clocksource_register(&kvm_clock);
+ }
+}
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 9f5d06cc588d..b2bbf7b8f3fb 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -46,6 +46,7 @@
#include <linux/pfn.h>
#include <linux/pci.h>
#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
#include <video/edid.h>
@@ -771,6 +772,10 @@ void __init setup_arch(char **cmdline_p)
if (mtrr_trim_uncached_memory(max_pfn))
max_low_pfn = setup_memory();
+#ifdef CONFIG_KVM_CLOCK
+ kvmclock_init();
+#endif
+
#ifdef CONFIG_VMI
/*
* Must be after max_low_pfn is determined, and before kernel
@@ -778,6 +783,7 @@ void __init setup_arch(char **cmdline_p)
*/
vmi_init();
#endif
+ kvm_guest_init();
/*
* NOTE: before this point _nobody_ is allowed to allocate
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index c99be05d7b23..06c544359ee0 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -41,6 +41,7 @@
#include <linux/ctype.h>
#include <linux/uaccess.h>
#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
#include <asm/mtrr.h>
#include <asm/uaccess.h>
@@ -350,6 +351,10 @@ void __init setup_arch(char **cmdline_p)
io_delay_init();
+#ifdef CONFIG_KVM_CLOCK
+ kvmclock_init();
+#endif
+
#ifdef CONFIG_SMP
/* setup to use the early static init tables during kernel startup */
x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
@@ -448,6 +453,8 @@ void __init setup_arch(char **cmdline_p)
init_apic_mappings();
ioapic_init_mappings();
+ kvm_guest_init();
+
/*
* We trust e820 completely. No explicit ROM probing in memory.
*/
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index ffdd0b310784..4d0c22e11f1a 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -6,7 +6,8 @@ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
-kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o
+kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
+ i8254.o
obj-$(CONFIG_KVM) += kvm.o
kvm-intel-objs = vmx.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
new file mode 100644
index 000000000000..06a241ae4b56
--- /dev/null
+++ b/arch/x86/kvm/i8254.c
@@ -0,0 +1,600 @@
+/*
+ * 8253/8254 interval timer emulation
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2006 Intel Corporation
+ * Copyright (c) 2007 Keir Fraser, XenSource Inc
+ * Copyright (c) 2008 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * Authors:
+ * Sheng Yang <sheng.yang@intel.com>
+ * Based on QEMU and Xen.
+ */
+
+#include <linux/kvm_host.h>
+
+#include "irq.h"
+#include "i8254.h"
+
+#ifndef CONFIG_X86_64
+#define mod_64(x, y) ((x) - (y) * div64_64(x, y))
+#else
+#define mod_64(x, y) ((x) % (y))
+#endif
+
+#define RW_STATE_LSB 1
+#define RW_STATE_MSB 2
+#define RW_STATE_WORD0 3
+#define RW_STATE_WORD1 4
+
+/* Compute with 96 bit intermediate result: (a*b)/c */
+static u64 muldiv64(u64 a, u32 b, u32 c)
+{
+ union {
+ u64 ll;
+ struct {
+ u32 low, high;
+ } l;
+ } u, res;
+ u64 rl, rh;
+
+ u.ll = a;
+ rl = (u64)u.l.low * (u64)b;
+ rh = (u64)u.l.high * (u64)b;
+ rh += (rl >> 32);
+ res.l.high = div64_64(rh, c);
+ res.l.low = div64_64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c);
+ return res.ll;
+}
+
+static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+
+ WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+ switch (c->mode) {
+ default:
+ case 0:
+ case 4:
+ /* XXX: just disable/enable counting */
+ break;
+ case 1:
+ case 2:
+ case 3:
+ case 5:
+ /* Restart counting on rising edge. */
+ if (c->gate < val)
+ c->count_load_time = ktime_get();
+ break;
+ }
+
+ c->gate = val;
+}
+
+int pit_get_gate(struct kvm *kvm, int channel)
+{
+ WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+ return kvm->arch.vpit->pit_state.channels[channel].gate;
+}
+
+static int pit_get_count(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+ s64 d, t;
+ int counter;
+
+ WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+ t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+ d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+
+ switch (c->mode) {
+ case 0:
+ case 1:
+ case 4:
+ case 5:
+ counter = (c->count - d) & 0xffff;
+ break;
+ case 3:
+ /* XXX: may be incorrect for odd counts */
+ counter = c->count - (mod_64((2 * d), c->count));
+ break;
+ default:
+ counter = c->count - mod_64(d, c->count);
+ break;
+ }
+ return counter;
+}
+
+static int pit_get_out(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+ s64 d, t;
+ int out;
+
+ WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+ t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+ d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+
+ switch (c->mode) {
+ default:
+ case 0:
+ out = (d >= c->count);
+ break;
+ case 1:
+ out = (d < c->count);
+ break;
+ case 2:
+ out = ((mod_64(d, c->count) == 0) && (d != 0));
+ break;
+ case 3:
+ out = (mod_64(d, c->count) < ((c->count + 1) >> 1));
+ break;
+ case 4:
+ case 5:
+ out = (d == c->count);
+ break;
+ }
+
+ return out;
+}
+
+static void pit_latch_count(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+
+ WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+ if (!c->count_latched) {
+ c->latched_count = pit_get_count(kvm, channel);
+ c->count_latched = c->rw_mode;
+ }
+}
+
+static void pit_latch_status(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+
+ WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+ if (!c->status_latched) {
+ /* TODO: Return NULL COUNT (bit 6). */
+ c->status = ((pit_get_out(kvm, channel) << 7) |
+ (c->rw_mode << 4) |
+ (c->mode << 1) |
+ c->bcd);
+ c->status_latched = 1;
+ }
+}
+
+int __pit_timer_fn(struct kvm_kpit_state *ps)
+{
+ struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
+ struct kvm_kpit_timer *pt = &ps->pit_timer;
+
+ atomic_inc(&pt->pending);
+ if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
+ vcpu0->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+ wake_up_interruptible(&vcpu0->wq);
+ }
+
+ pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
+ pt->scheduled = ktime_to_ns(pt->timer.expires);
+
+ return (pt->period == 0 ? 0 : 1);
+}
+
+static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
+{
+ struct kvm_kpit_state *ps;
+ int restart_timer = 0;
+
+ ps = container_of(data, struct kvm_kpit_state, pit_timer.timer);
+
+ restart_timer = __pit_timer_fn(ps);
+
+ if (restart_timer)
+ return HRTIMER_RESTART;
+ else
+ return HRTIMER_NORESTART;
+}
+
+static void destroy_pit_timer(struct kvm_kpit_timer *pt)
+{
+ pr_debug("pit: execute del timer!\n");
+ hrtimer_cancel(&pt->timer);
+}
+
+static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
+{
+ s64 interval;
+
+ interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
+
+ pr_debug("pit: create pit timer, interval is %llu nsec\n", interval);
+
+ /* TODO The new value only affected after the retriggered */
+ hrtimer_cancel(&pt->timer);
+ pt->period = (is_period == 0) ? 0 : interval;
+ pt->timer.function = pit_timer_fn;
+ atomic_set(&pt->pending, 0);
+
+ hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
+ HRTIMER_MODE_ABS);
+}
+
+static void pit_load_count(struct kvm *kvm, int channel, u32 val)
+{
+ struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+
+ WARN_ON(!mutex_is_locked(&ps->lock));
+
+ pr_debug("pit: load_count val is %d, channel is %d\n", val, channel);
+
+ /*
+ * Though spec said the state of 8254 is undefined after power-up,
+ * seems some tricky OS like Windows XP depends on IRQ0 interrupt
+ * when booting up.
+ * So here setting initialize rate for it, and not a specific number
+ */
+ if (val == 0)
+ val = 0x10000;
+
+ ps->channels[channel].count_load_time = ktime_get();
+ ps->channels[channel].count = val;
+
+ if (channel != 0)
+ return;
+
+ /* Two types of timer
+ * mode 1 is one shot, mode 2 is period, otherwise del timer */
+ switch (ps->channels[0].mode) {
+ case 1:
+ create_pit_timer(&ps->pit_timer, val, 0);
+ break;
+ case 2:
+ create_pit_timer(&ps->pit_timer, val, 1);
+ break;
+ default:
+ destroy_pit_timer(&ps->pit_timer);
+ }
+}
+
+void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val)
+{
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ pit_load_count(kvm, channel, val);
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+}
+
+static void pit_ioport_write(struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
+{
+ struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ int channel, access;
+ struct kvm_kpit_channel_state *s;
+ u32 val = *(u32 *) data;
+
+ val &= 0xff;
+ addr &= KVM_PIT_CHANNEL_MASK;
+
+ mutex_lock(&pit_state->lock);
+
+ if (val != 0)
+ pr_debug("pit: write addr is 0x%x, len is %d, val is 0x%x\n",
+ (unsigned int)addr, len, val);
+
+ if (addr == 3) {
+ channel = val >> 6;
+ if (channel == 3) {
+ /* Read-Back Command. */
+ for (channel = 0; channel < 3; channel++) {
+ s = &pit_state->channels[channel];
+ if (val & (2 << channel)) {
+ if (!(val & 0x20))
+ pit_latch_count(kvm, channel);
+ if (!(val & 0x10))
+ pit_latch_status(kvm, channel);
+ }
+ }
+ } else {
+ /* Select Counter <channel>. */
+ s = &pit_state->channels[channel];
+ access = (val >> 4) & KVM_PIT_CHANNEL_MASK;
+ if (access == 0) {
+ pit_latch_count(kvm, channel);
+ } else {
+ s->rw_mode = access;
+ s->read_state = access;
+ s->write_state = access;
+ s->mode = (val >> 1) & 7;
+ if (s->mode > 5)
+ s->mode -= 4;
+ s->bcd = val & 1;
+ }
+ }
+ } else {
+ /* Write Count. */
+ s = &pit_state->channels[addr];
+ switch (s->write_state) {
+ default:
+ case RW_STATE_LSB:
+ pit_load_count(kvm, addr, val);
+ break;
+ case RW_STATE_MSB:
+ pit_load_count(kvm, addr, val << 8);
+ break;
+ case RW_STATE_WORD0:
+ s->write_latch = val;
+ s->write_state = RW_STATE_WORD1;
+ break;
+ case RW_STATE_WORD1:
+ pit_load_count(kvm, addr, s->write_latch | (val << 8));
+ s->write_state = RW_STATE_WORD0;
+ break;
+ }
+ }
+
+ mutex_unlock(&pit_state->lock);
+}
+
+static void pit_ioport_read(struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
+{
+ struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ int ret, count;
+ struct kvm_kpit_channel_state *s;
+
+ addr &= KVM_PIT_CHANNEL_MASK;
+ s = &pit_state->channels[addr];
+
+ mutex_lock(&pit_state->lock);
+
+ if (s->status_latched) {
+ s->status_latched = 0;
+ ret = s->status;
+ } else if (s->count_latched) {
+ switch (s->count_latched) {
+ default:
+ case RW_STATE_LSB:
+ ret = s->latched_count & 0xff;
+ s->count_latched = 0;
+ break;
+ case RW_STATE_MSB:
+ ret = s->latched_count >> 8;
+ s->count_latched = 0;
+ break;
+ case RW_STATE_WORD0:
+ ret = s->latched_count & 0xff;
+ s->count_latched = RW_STATE_MSB;
+ break;
+ }
+ } else {
+ switch (s->read_state) {
+ default:
+ case RW_STATE_LSB:
+ count = pit_get_count(kvm, addr);
+ ret = count & 0xff;
+ break;
+ case RW_STATE_MSB:
+ count = pit_get_count(kvm, addr);
+ ret = (count >> 8) & 0xff;
+ break;
+ case RW_STATE_WORD0:
+ count = pit_get_count(kvm, addr);
+ ret = count & 0xff;
+ s->read_state = RW_STATE_WORD1;
+ break;
+ case RW_STATE_WORD1:
+ count = pit_get_count(kvm, addr);
+ ret = (count >> 8) & 0xff;
+ s->read_state = RW_STATE_WORD0;
+ break;
+ }
+ }
+
+ if (len > sizeof(ret))
+ len = sizeof(ret);
+ memcpy(data, (char *)&ret, len);
+
+ mutex_unlock(&pit_state->lock);
+}
+
+static int pit_in_range(struct kvm_io_device *this, gpa_t addr)
+{
+ return ((addr >= KVM_PIT_BASE_ADDRESS) &&
+ (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
+}
+
+static void speaker_ioport_write(struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
+{
+ struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ u32 val = *(u32 *) data;
+
+ mutex_lock(&pit_state->lock);
+ pit_state->speaker_data_on = (val >> 1) & 1;
+ pit_set_gate(kvm, 2, val & 1);
+ mutex_unlock(&pit_state->lock);
+}
+
+static void speaker_ioport_read(struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
+{
+ struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ unsigned int refresh_clock;
+ int ret;
+
+ /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */
+ refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
+
+ mutex_lock(&pit_state->lock);
+ ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) |
+ (pit_get_out(kvm, 2) << 5) | (refresh_clock << 4));
+ if (len > sizeof(ret))
+ len = sizeof(ret);
+ memcpy(data, (char *)&ret, len);
+ mutex_unlock(&pit_state->lock);
+}
+
+static int speaker_in_range(struct kvm_io_device *this, gpa_t addr)
+{
+ return (addr == KVM_SPEAKER_BASE_ADDRESS);
+}
+
+void kvm_pit_reset(struct kvm_pit *pit)
+{
+ int i;
+ struct kvm_kpit_channel_state *c;
+
+ mutex_lock(&pit->pit_state.lock);
+ for (i = 0; i < 3; i++) {
+ c = &pit->pit_state.channels[i];
+ c->mode = 0xff;
+ c->gate = (i != 2);
+ pit_load_count(pit->kvm, i, 0);
+ }
+ mutex_unlock(&pit->pit_state.lock);
+
+ atomic_set(&pit->pit_state.pit_timer.pending, 0);
+ pit->pit_state.inject_pending = 1;
+}
+
+struct kvm_pit *kvm_create_pit(struct kvm *kvm)
+{
+ struct kvm_pit *pit;
+ struct kvm_kpit_state *pit_state;
+
+ pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
+ if (!pit)
+ return NULL;
+
+ mutex_init(&pit->pit_state.lock);
+ mutex_lock(&pit->pit_state.lock);
+
+ /* Initialize PIO device */
+ pit->dev.read = pit_ioport_read;
+ pit->dev.write = pit_ioport_write;
+ pit->dev.in_range = pit_in_range;
+ pit->dev.private = pit;
+ kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
+
+ pit->speaker_dev.read = speaker_ioport_read;
+ pit->speaker_dev.write = speaker_ioport_write;
+ pit->speaker_dev.in_range = speaker_in_range;
+ pit->speaker_dev.private = pit;
+ kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev);
+
+ kvm->arch.vpit = pit;
+ pit->kvm = kvm;
+
+ pit_state = &pit->pit_state;
+ pit_state->pit = pit;
+ hrtimer_init(&pit_state->pit_timer.timer,
+ CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ mutex_unlock(&pit->pit_state.lock);
+
+ kvm_pit_reset(pit);
+
+ return pit;
+}
+
+void kvm_free_pit(struct kvm *kvm)
+{
+ struct hrtimer *timer;
+
+ if (kvm->arch.vpit) {
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
+ hrtimer_cancel(timer);
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ kfree(kvm->arch.vpit);
+ }
+}
+
+void __inject_pit_timer_intr(struct kvm *kvm)
+{
+ mutex_lock(&kvm->lock);
+ kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
+ kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0);
+ kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
+ kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
+ mutex_unlock(&kvm->lock);
+}
+
+void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pit *pit = vcpu->kvm->arch.vpit;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_kpit_state *ps;
+ static unsigned long last_injected_time;
+
+ if (vcpu && pit) {
+ ps = &pit->pit_state;
+
+ /* Try to inject pending interrupts when:
+ * 1. Pending exists
+ * 2. Last interrupt was accepted or waited for too long time*/
+ if (atomic_read(&ps->pit_timer.pending) &&
+ (ps->inject_pending ||
+ (jiffies - last_injected_time
+ >= KVM_MAX_PIT_INTR_INTERVAL))) {
+ ps->inject_pending = 0;
+ __inject_pit_timer_intr(kvm);
+ last_injected_time = jiffies;
+ }
+ }
+}
+
+void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
+{
+ struct kvm_arch *arch = &vcpu->kvm->arch;
+ struct kvm_kpit_state *ps;
+
+ if (vcpu && arch->vpit) {
+ ps = &arch->vpit->pit_state;
+ if (atomic_read(&ps->pit_timer.pending) &&
+ (((arch->vpic->pics[0].imr & 1) == 0 &&
+ arch->vpic->pics[0].irq_base == vec) ||
+ (arch->vioapic->redirtbl[0].fields.vector == vec &&
+ arch->vioapic->redirtbl[0].fields.mask != 1))) {
+ ps->inject_pending = 1;
+ atomic_dec(&ps->pit_timer.pending);
+ ps->channels[0].count_load_time = ktime_get();
+ }
+ }
+}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
new file mode 100644
index 000000000000..e63ef38ac638
--- /dev/null
+++ b/arch/x86/kvm/i8254.h
@@ -0,0 +1,62 @@
+#ifndef __I8254_H
+#define __I8254_H
+
+#include "iodev.h"
+
+struct kvm_kpit_timer {
+ struct hrtimer timer;
+ int irq;
+ s64 period; /* unit: ns */
+ s64 scheduled;
+ ktime_t last_update;
+ atomic_t pending;
+};
+
+struct kvm_kpit_channel_state {
+ u32 count; /* can be 65536 */
+ u16 latched_count;
+ u8 count_latched;
+ u8 status_latched;
+ u8 status;
+ u8 read_state;
+ u8 write_state;
+ u8 write_latch;
+ u8 rw_mode;
+ u8 mode;
+ u8 bcd; /* not supported */
+ u8 gate; /* timer start */
+ ktime_t count_load_time;
+};
+
+struct kvm_kpit_state {
+ struct kvm_kpit_channel_state channels[3];
+ struct kvm_kpit_timer pit_timer;
+ u32 speaker_data_on;
+ struct mutex lock;
+ struct kvm_pit *pit;
+ bool inject_pending; /* if inject pending interrupts */
+};
+
+struct kvm_pit {
+ unsigned long base_addresss;
+ struct kvm_io_device dev;
+ struct kvm_io_device speaker_dev;
+ struct kvm *kvm;
+ struct kvm_kpit_state pit_state;
+};
+
+#define KVM_PIT_BASE_ADDRESS 0x40
+#define KVM_SPEAKER_BASE_ADDRESS 0x61
+#define KVM_PIT_MEM_LENGTH 4
+#define KVM_PIT_FREQ 1193181
+#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
+#define KVM_PIT_CHANNEL_MASK 0x3
+
+void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
+struct kvm_pit *kvm_create_pit(struct kvm *kvm);
+void kvm_free_pit(struct kvm *kvm);
+void kvm_pit_reset(struct kvm_pit *pit);
+
+#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index e5714759e97f..dbfe21c99c48 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -23,6 +23,7 @@
#include <linux/kvm_host.h>
#include "irq.h"
+#include "i8254.h"
/*
* check if there is pending interrupt without
@@ -66,6 +67,7 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
kvm_inject_apic_timer_irqs(vcpu);
+ kvm_inject_pit_timer_irqs(vcpu);
/* TODO: PIT, RTC etc. */
}
EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
@@ -73,6 +75,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
{
kvm_apic_timer_intr_post(vcpu, vec);
+ kvm_pit_timer_intr_post(vcpu, vec);
/* TODO: PIT, RTC etc. */
}
EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index ecdfe97e4635..65ef0fc2c036 100644
--- a/arch/x86/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -39,6 +39,8 @@ struct vcpu_svm {
unsigned long host_db_regs[NUM_DB_REGS];
unsigned long host_dr6;
unsigned long host_dr7;
+
+ u32 *msrpm;
};
#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 68a6b1511934..31280df7d2e3 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -658,7 +658,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
PRIx64 ", "
"timer initial count 0x%x, period %lldns, "
- "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__,
+ "expire @ 0x%016" PRIx64 ".\n", __func__,
APIC_BUS_CYCLE_NS, ktime_to_ns(now),
apic_get_reg(apic, APIC_TMICT),
apic->timer.period,
@@ -691,7 +691,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
/* too common printing */
if (offset != APIC_EOI)
apic_debug("%s: offset 0x%x with length 0x%x, and value is "
- "0x%x\n", __FUNCTION__, offset, len, val);
+ "0x%x\n", __func__, offset, len, val);
offset &= 0xff0;
@@ -869,7 +869,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic;
int i;
- apic_debug("%s\n", __FUNCTION__);
+ apic_debug("%s\n", __func__);
ASSERT(vcpu);
apic = vcpu->arch.apic;
@@ -907,7 +907,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
apic_update_ppr(apic);
apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
- "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
+ "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
vcpu, kvm_apic_id(apic),
vcpu->arch.apic_base, apic->base_address);
}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d8172aabc660..5bd2ca7184e4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -27,11 +27,22 @@
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/swap.h>
+#include <linux/hugetlb.h>
+#include <linux/compiler.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
+/*
+ * When setting this variable to true it enables Two-Dimensional-Paging
+ * where the hardware walks 2 page tables:
+ * 1. the guest-virtual to guest-physical
+ * 2. while doing 1. it walks guest-physical to host-physical
+ * If the hardware supports that we don't need to do shadow paging.
+ */
+bool tdp_enabled = false;
+
#undef MMU_DEBUG
#undef AUDIT
@@ -101,8 +112,6 @@ static int dbg = 1;
#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52
-#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-
#define VALID_PAGE(x) ((x) != INVALID_PAGE)
#define PT64_LEVEL_BITS 9
@@ -159,6 +168,13 @@ static int dbg = 1;
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+struct kvm_pv_mmu_op_buffer {
+ void *ptr;
+ unsigned len;
+ unsigned processed;
+ char buf[512] __aligned(sizeof(long));
+};
+
struct kvm_rmap_desc {
u64 *shadow_ptes[RMAP_EXT];
struct kvm_rmap_desc *more;
@@ -200,11 +216,15 @@ static int is_present_pte(unsigned long pte)
static int is_shadow_present_pte(u64 pte)
{
- pte &= ~PT_SHADOW_IO_MARK;
return pte != shadow_trap_nonpresent_pte
&& pte != shadow_notrap_nonpresent_pte;
}
+static int is_large_pte(u64 pte)
+{
+ return pte & PT_PAGE_SIZE_MASK;
+}
+
static int is_writeble_pte(unsigned long pte)
{
return pte & PT_WRITABLE_MASK;
@@ -215,11 +235,6 @@ static int is_dirty_pte(unsigned long pte)
return pte & PT_DIRTY_MASK;
}
-static int is_io_pte(unsigned long pte)
-{
- return pte & PT_SHADOW_IO_MARK;
-}
-
static int is_rmap_pte(u64 pte)
{
return pte != shadow_trap_nonpresent_pte
@@ -350,16 +365,100 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
}
/*
+ * Return the pointer to the largepage write count for a given
+ * gfn, handling slots that are not large page aligned.
+ */
+static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
+{
+ unsigned long idx;
+
+ idx = (gfn / KVM_PAGES_PER_HPAGE) -
+ (slot->base_gfn / KVM_PAGES_PER_HPAGE);
+ return &slot->lpage_info[idx].write_count;
+}
+
+static void account_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+ int *write_count;
+
+ write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+ *write_count += 1;
+ WARN_ON(*write_count > KVM_PAGES_PER_HPAGE);
+}
+
+static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+ int *write_count;
+
+ write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+ *write_count -= 1;
+ WARN_ON(*write_count < 0);
+}
+
+static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+ int *largepage_idx;
+
+ if (slot) {
+ largepage_idx = slot_largepage_idx(gfn, slot);
+ return *largepage_idx;
+ }
+
+ return 1;
+}
+
+static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
+{
+ struct vm_area_struct *vma;
+ unsigned long addr;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return 0;
+
+ vma = find_vma(current->mm, addr);
+ if (vma && is_vm_hugetlb_page(vma))
+ return 1;
+
+ return 0;
+}
+
+static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+ struct kvm_memory_slot *slot;
+
+ if (has_wrprotected_page(vcpu->kvm, large_gfn))
+ return 0;
+
+ if (!host_largepage_backed(vcpu->kvm, large_gfn))
+ return 0;
+
+ slot = gfn_to_memslot(vcpu->kvm, large_gfn);
+ if (slot && slot->dirty_bitmap)
+ return 0;
+
+ return 1;
+}
+
+/*
* Take gfn and return the reverse mapping to it.
* Note: gfn must be unaliased before this function get called
*/
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
{
struct kvm_memory_slot *slot;
+ unsigned long idx;
slot = gfn_to_memslot(kvm, gfn);
- return &slot->rmap[gfn - slot->base_gfn];
+ if (!lpage)
+ return &slot->rmap[gfn - slot->base_gfn];
+
+ idx = (gfn / KVM_PAGES_PER_HPAGE) -
+ (slot->base_gfn / KVM_PAGES_PER_HPAGE);
+
+ return &slot->lpage_info[idx].rmap_pde;
}
/*
@@ -371,7 +470,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
* If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
* containing more mappings.
*/
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
{
struct kvm_mmu_page *sp;
struct kvm_rmap_desc *desc;
@@ -383,7 +482,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
gfn = unalias_gfn(vcpu->kvm, gfn);
sp = page_header(__pa(spte));
sp->gfns[spte - sp->spt] = gfn;
- rmapp = gfn_to_rmap(vcpu->kvm, gfn);
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
if (!*rmapp) {
rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
*rmapp = (unsigned long)spte;
@@ -449,7 +548,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
kvm_release_page_dirty(page);
else
kvm_release_page_clean(page);
- rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
+ rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
if (!*rmapp) {
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
BUG();
@@ -515,7 +614,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
int write_protected = 0;
gfn = unalias_gfn(kvm, gfn);
- rmapp = gfn_to_rmap(kvm, gfn);
+ rmapp = gfn_to_rmap(kvm, gfn, 0);
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
@@ -528,8 +627,27 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
}
spte = rmap_next(kvm, rmapp, spte);
}
+ /* check for huge page mappings */
+ rmapp = gfn_to_rmap(kvm, gfn, 1);
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ BUG_ON(!spte);
+ BUG_ON(!(*spte & PT_PRESENT_MASK));
+ BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+ pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
+ if (is_writeble_pte(*spte)) {
+ rmap_remove(kvm, spte);
+ --kvm->stat.lpages;
+ set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ write_protected = 1;
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+
if (write_protected)
kvm_flush_remote_tlbs(kvm);
+
+ account_shadowed(kvm, gfn);
}
#ifdef MMU_DEBUG
@@ -539,8 +657,8 @@ static int is_empty_shadow_page(u64 *spt)
u64 *end;
for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
- if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
- printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
+ if (*pos != shadow_trap_nonpresent_pte) {
+ printk(KERN_ERR "%s: %p %llx\n", __func__,
pos, *pos);
return 0;
}
@@ -560,7 +678,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
- return gfn;
+ return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
}
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
@@ -663,13 +781,14 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
struct kvm_mmu_page *sp;
struct hlist_node *node;
- pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
+ index = kvm_page_table_hashfn(gfn);
bucket = &kvm->arch.mmu_page_hash[index];
hlist_for_each_entry(sp, node, bucket, hash_link)
- if (sp->gfn == gfn && !sp->role.metaphysical) {
+ if (sp->gfn == gfn && !sp->role.metaphysical
+ && !sp->role.invalid) {
pgprintk("%s: found role %x\n",
- __FUNCTION__, sp->role.word);
+ __func__, sp->role.word);
return sp;
}
return NULL;
@@ -700,21 +819,21 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
- pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
+ pgprintk("%s: looking gfn %lx role %x\n", __func__,
gfn, role.word);
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry(sp, node, bucket, hash_link)
if (sp->gfn == gfn && sp->role.word == role.word) {
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
- pgprintk("%s: found\n", __FUNCTION__);
+ pgprintk("%s: found\n", __func__);
return sp;
}
++vcpu->kvm->stat.mmu_cache_miss;
sp = kvm_mmu_alloc_page(vcpu, parent_pte);
if (!sp)
return sp;
- pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
+ pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
sp->gfn = gfn;
sp->role = role;
hlist_add_head(&sp->hash_link, bucket);
@@ -746,11 +865,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
ent = pt[i];
+ if (is_shadow_present_pte(ent)) {
+ if (!is_large_pte(ent)) {
+ ent &= PT64_BASE_ADDR_MASK;
+ mmu_page_remove_parent_pte(page_header(ent),
+ &pt[i]);
+ } else {
+ --kvm->stat.lpages;
+ rmap_remove(kvm, &pt[i]);
+ }
+ }
pt[i] = shadow_trap_nonpresent_pte;
- if (!is_shadow_present_pte(ent))
- continue;
- ent &= PT64_BASE_ADDR_MASK;
- mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
}
kvm_flush_remote_tlbs(kvm);
}
@@ -790,10 +915,15 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
}
kvm_mmu_page_unlink_children(kvm, sp);
if (!sp->root_count) {
+ if (!sp->role.metaphysical)
+ unaccount_shadowed(kvm, sp->gfn);
hlist_del(&sp->hash_link);
kvm_mmu_free_page(kvm, sp);
- } else
+ } else {
list_move(&sp->link, &kvm->arch.active_mmu_pages);
+ sp->role.invalid = 1;
+ kvm_reload_remote_mmus(kvm);
+ }
kvm_mmu_reset_last_pte_updated(kvm);
}
@@ -839,13 +969,13 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
struct hlist_node *node, *n;
int r;
- pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+ pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
r = 0;
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ index = kvm_page_table_hashfn(gfn);
bucket = &kvm->arch.mmu_page_hash[index];
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
if (sp->gfn == gfn && !sp->role.metaphysical) {
- pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
+ pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
sp->role.word);
kvm_mmu_zap_page(kvm, sp);
r = 1;
@@ -858,7 +988,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
struct kvm_mmu_page *sp;
while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
- pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
+ pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word);
kvm_mmu_zap_page(kvm, sp);
}
}
@@ -890,15 +1020,39 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
- int *ptwrite, gfn_t gfn, struct page *page)
+ int *ptwrite, int largepage, gfn_t gfn,
+ struct page *page)
{
u64 spte;
- int was_rmapped = is_rmap_pte(*shadow_pte);
+ int was_rmapped = 0;
int was_writeble = is_writeble_pte(*shadow_pte);
+ hfn_t host_pfn = (*shadow_pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+
+ if (is_rmap_pte(*shadow_pte)) {
+ if (host_pfn != page_to_pfn(page))
+ rmap_remove(vcpu->kvm, shadow_pte);
+ else
+ was_rmapped = 1;
+ }
+
+ /*
+ * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+ * the parent of the now unreachable PTE.
+ */
+ if (largepage) {
+ if (was_rmapped && !is_large_pte(*shadow_pte)) {
+ struct kvm_mmu_page *child;
+ u64 pte = *shadow_pte;
+
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ mmu_page_remove_parent_pte(child, shadow_pte);
+ }
+ was_rmapped = is_large_pte(*shadow_pte);
+ }
pgprintk("%s: spte %llx access %x write_fault %d"
" user_fault %d gfn %lx\n",
- __FUNCTION__, *shadow_pte, pt_access,
+ __func__, *shadow_pte, pt_access,
write_fault, user_fault, gfn);
/*
@@ -915,13 +1069,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
spte |= PT_PRESENT_MASK;
if (pte_access & ACC_USER_MASK)
spte |= PT_USER_MASK;
-
- if (is_error_page(page)) {
- set_shadow_pte(shadow_pte,
- shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
- kvm_release_page_clean(page);
- return;
- }
+ if (largepage)
+ spte |= PT_PAGE_SIZE_MASK;
spte |= page_to_phys(page);
@@ -936,9 +1085,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
}
shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
- if (shadow) {
+ if (shadow ||
+ (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
- __FUNCTION__, gfn);
+ __func__, gfn);
pte_access &= ~ACC_WRITE_MASK;
if (is_writeble_pte(spte)) {
spte &= ~PT_WRITABLE_MASK;
@@ -954,11 +1104,18 @@ unshadowed:
if (pte_access & ACC_WRITE_MASK)
mark_page_dirty(vcpu->kvm, gfn);
- pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
+ pgprintk("%s: setting spte %llx\n", __func__, spte);
+ pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n",
+ (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
+ (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
set_shadow_pte(shadow_pte, spte);
+ if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
+ && (spte & PT_PRESENT_MASK))
+ ++vcpu->kvm->stat.lpages;
+
page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
if (!was_rmapped) {
- rmap_add(vcpu, shadow_pte, gfn);
+ rmap_add(vcpu, shadow_pte, gfn, largepage);
if (!is_rmap_pte(*shadow_pte))
kvm_release_page_clean(page);
} else {
@@ -975,10 +1132,10 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
-static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
- gfn_t gfn, struct page *page)
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
+ int largepage, gfn_t gfn, struct page *page,
+ int level)
{
- int level = PT32E_ROOT_LEVEL;
hpa_t table_addr = vcpu->arch.mmu.root_hpa;
int pt_write = 0;
@@ -991,8 +1148,14 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
if (level == 1) {
mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
- 0, write, 1, &pt_write, gfn, page);
- return pt_write || is_io_pte(table[index]);
+ 0, write, 1, &pt_write, 0, gfn, page);
+ return pt_write;
+ }
+
+ if (largepage && level == 2) {
+ mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
+ 0, write, 1, &pt_write, 1, gfn, page);
+ return pt_write;
}
if (table[index] == shadow_trap_nonpresent_pte) {
@@ -1020,18 +1183,32 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
int r;
+ int largepage = 0;
struct page *page;
down_read(&vcpu->kvm->slots_lock);
down_read(&current->mm->mmap_sem);
+ if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
+ gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+ largepage = 1;
+ }
+
page = gfn_to_page(vcpu->kvm, gfn);
up_read(&current->mm->mmap_sem);
+ /* mmio */
+ if (is_error_page(page)) {
+ kvm_release_page_clean(page);
+ up_read(&vcpu->kvm->slots_lock);
+ return 1;
+ }
+
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
- r = __nonpaging_map(vcpu, v, write, gfn, page);
+ r = __direct_map(vcpu, v, write, largepage, gfn, page,
+ PT32E_ROOT_LEVEL);
spin_unlock(&vcpu->kvm->mmu_lock);
up_read(&vcpu->kvm->slots_lock);
@@ -1063,6 +1240,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
sp = page_header(root);
--sp->root_count;
+ if (!sp->root_count && sp->role.invalid)
+ kvm_mmu_zap_page(vcpu->kvm, sp);
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
spin_unlock(&vcpu->kvm->mmu_lock);
return;
@@ -1075,6 +1254,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
root &= PT64_BASE_ADDR_MASK;
sp = page_header(root);
--sp->root_count;
+ if (!sp->root_count && sp->role.invalid)
+ kvm_mmu_zap_page(vcpu->kvm, sp);
}
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
@@ -1087,6 +1268,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
int i;
gfn_t root_gfn;
struct kvm_mmu_page *sp;
+ int metaphysical = 0;
root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
@@ -1095,14 +1277,20 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
hpa_t root = vcpu->arch.mmu.root_hpa;
ASSERT(!VALID_PAGE(root));
+ if (tdp_enabled)
+ metaphysical = 1;
sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
- PT64_ROOT_LEVEL, 0, ACC_ALL, NULL);
+ PT64_ROOT_LEVEL, metaphysical,
+ ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
vcpu->arch.mmu.root_hpa = root;
return;
}
#endif
+ metaphysical = !is_paging(vcpu);
+ if (tdp_enabled)
+ metaphysical = 1;
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
@@ -1116,7 +1304,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
} else if (vcpu->arch.mmu.root_level == 0)
root_gfn = 0;
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
- PT32_ROOT_LEVEL, !is_paging(vcpu),
+ PT32_ROOT_LEVEL, metaphysical,
ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
@@ -1136,7 +1324,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
gfn_t gfn;
int r;
- pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
+ pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
@@ -1150,6 +1338,42 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
error_code & PFERR_WRITE_MASK, gfn);
}
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
+ u32 error_code)
+{
+ struct page *page;
+ int r;
+ int largepage = 0;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+
+ ASSERT(vcpu);
+ ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ down_read(&current->mm->mmap_sem);
+ if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
+ gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+ largepage = 1;
+ }
+ page = gfn_to_page(vcpu->kvm, gfn);
+ if (is_error_page(page)) {
+ kvm_release_page_clean(page);
+ up_read(&current->mm->mmap_sem);
+ return 1;
+ }
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
+ r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
+ largepage, gfn, page, TDP_ROOT_LEVEL);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ up_read(&current->mm->mmap_sem);
+
+ return r;
+}
+
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
mmu_free_roots(vcpu);
@@ -1178,7 +1402,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
- pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3);
+ pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
mmu_free_roots(vcpu);
}
@@ -1243,7 +1467,35 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu)
return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
}
-static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ context->new_cr3 = nonpaging_new_cr3;
+ context->page_fault = tdp_page_fault;
+ context->free = nonpaging_free;
+ context->prefetch_page = nonpaging_prefetch_page;
+ context->shadow_root_level = TDP_ROOT_LEVEL;
+ context->root_hpa = INVALID_PAGE;
+
+ if (!is_paging(vcpu)) {
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ context->root_level = 0;
+ } else if (is_long_mode(vcpu)) {
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->root_level = PT64_ROOT_LEVEL;
+ } else if (is_pae(vcpu)) {
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->root_level = PT32E_ROOT_LEVEL;
+ } else {
+ context->gva_to_gpa = paging32_gva_to_gpa;
+ context->root_level = PT32_ROOT_LEVEL;
+ }
+
+ return 0;
+}
+
+static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1258,6 +1510,14 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
return paging32_init_context(vcpu);
}
+static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+{
+ if (tdp_enabled)
+ return init_kvm_tdp_mmu(vcpu);
+ else
+ return init_kvm_softmmu(vcpu);
+}
+
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
@@ -1306,7 +1566,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
pte = *spte;
if (is_shadow_present_pte(pte)) {
- if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+ if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
+ is_large_pte(pte))
rmap_remove(vcpu->kvm, spte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
@@ -1314,24 +1575,26 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
}
}
set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ if (is_large_pte(pte))
+ --vcpu->kvm->stat.lpages;
}
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *spte,
- const void *new, int bytes,
- int offset_in_pte)
+ const void *new)
{
- if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+ if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
+ && !vcpu->arch.update_pte.largepage) {
++vcpu->kvm->stat.mmu_pde_zapped;
return;
}
++vcpu->kvm->stat.mmu_pte_updated;
if (sp->role.glevels == PT32_ROOT_LEVEL)
- paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+ paging32_update_pte(vcpu, sp, spte, new);
else
- paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+ paging64_update_pte(vcpu, sp, spte, new);
}
static bool need_remote_flush(u64 old, u64 new)
@@ -1370,6 +1633,8 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
u64 gpte = 0;
struct page *page;
+ vcpu->arch.update_pte.largepage = 0;
+
if (bytes != 4 && bytes != 8)
return;
@@ -1398,11 +1663,19 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
down_read(&current->mm->mmap_sem);
+ if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
+ gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+ vcpu->arch.update_pte.largepage = 1;
+ }
page = gfn_to_page(vcpu->kvm, gfn);
up_read(&current->mm->mmap_sem);
+ if (is_error_page(page)) {
+ kvm_release_page_clean(page);
+ return;
+ }
vcpu->arch.update_pte.gfn = gfn;
- vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn);
+ vcpu->arch.update_pte.page = page;
}
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -1413,7 +1686,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
struct hlist_node *node, *n;
struct hlist_head *bucket;
unsigned index;
- u64 entry;
+ u64 entry, gentry;
u64 *spte;
unsigned offset = offset_in_page(gpa);
unsigned pte_size;
@@ -1423,8 +1696,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int level;
int flooded = 0;
int npte;
+ int r;
- pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+ pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
@@ -1440,7 +1714,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
vcpu->arch.last_pt_write_count = 1;
vcpu->arch.last_pte_updated = NULL;
}
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
if (sp->gfn != gfn || sp->role.metaphysical)
@@ -1486,11 +1760,20 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
continue;
}
spte = &sp->spt[page_offset / sizeof(*spte)];
+ if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
+ gentry = 0;
+ r = kvm_read_guest_atomic(vcpu->kvm,
+ gpa & ~(u64)(pte_size - 1),
+ &gentry, pte_size);
+ new = (const void *)&gentry;
+ if (r < 0)
+ new = NULL;
+ }
while (npte--) {
entry = *spte;
mmu_pte_write_zap_pte(vcpu, sp, spte);
- mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
- page_offset & (pte_size - 1));
+ if (new)
+ mmu_pte_write_new_pte(vcpu, sp, spte, new);
mmu_pte_write_flush_tlb(vcpu, entry, *spte);
++spte;
}
@@ -1567,6 +1850,12 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+void kvm_enable_tdp(void)
+{
+ tdp_enabled = true;
+}
+EXPORT_SYMBOL_GPL(kvm_enable_tdp);
+
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *sp;
@@ -1722,6 +2011,132 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
return nr_mmu_pages;
}
+static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
+ unsigned len)
+{
+ if (len > buffer->len)
+ return NULL;
+ return buffer->ptr;
+}
+
+static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
+ unsigned len)
+{
+ void *ret;
+
+ ret = pv_mmu_peek_buffer(buffer, len);
+ if (!ret)
+ return ret;
+ buffer->ptr += len;
+ buffer->len -= len;
+ buffer->processed += len;
+ return ret;
+}
+
+static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
+ gpa_t addr, gpa_t value)
+{
+ int bytes = 8;
+ int r;
+
+ if (!is_long_mode(vcpu) && !is_pae(vcpu))
+ bytes = 4;
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ if (!__emulator_write_phys(vcpu, addr, &value, bytes))
+ return -EFAULT;
+
+ return 1;
+}
+
+static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->tlb_flush(vcpu);
+ return 1;
+}
+
+static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+ spin_lock(&vcpu->kvm->mmu_lock);
+ mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return 1;
+}
+
+static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
+ struct kvm_pv_mmu_op_buffer *buffer)
+{
+ struct kvm_mmu_op_header *header;
+
+ header = pv_mmu_peek_buffer(buffer, sizeof *header);
+ if (!header)
+ return 0;
+ switch (header->op) {
+ case KVM_MMU_OP_WRITE_PTE: {
+ struct kvm_mmu_op_write_pte *wpte;
+
+ wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
+ if (!wpte)
+ return 0;
+ return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
+ wpte->pte_val);
+ }
+ case KVM_MMU_OP_FLUSH_TLB: {
+ struct kvm_mmu_op_flush_tlb *ftlb;
+
+ ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
+ if (!ftlb)
+ return 0;
+ return kvm_pv_mmu_flush_tlb(vcpu);
+ }
+ case KVM_MMU_OP_RELEASE_PT: {
+ struct kvm_mmu_op_release_pt *rpt;
+
+ rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
+ if (!rpt)
+ return 0;
+ return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
+ }
+ default: return 0;
+ }
+}
+
+int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
+ gpa_t addr, unsigned long *ret)
+{
+ int r;
+ struct kvm_pv_mmu_op_buffer buffer;
+
+ down_read(&vcpu->kvm->slots_lock);
+ down_read(&current->mm->mmap_sem);
+
+ buffer.ptr = buffer.buf;
+ buffer.len = min(bytes, sizeof buffer.buf);
+ buffer.processed = 0;
+
+ r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len);
+ if (r)
+ goto out;
+
+ while (buffer.len) {
+ r = kvm_pv_mmu_op_one(vcpu, &buffer);
+ if (r < 0)
+ goto out;
+ if (r == 0)
+ break;
+ }
+
+ r = 1;
+out:
+ *ret = buffer.processed;
+ up_read(&current->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
+ return r;
+}
+
#ifdef AUDIT
static const char *audit_msg;
@@ -1857,7 +2272,7 @@ static void audit_rmap(struct kvm_vcpu *vcpu)
if (n_rmap != n_actual)
printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
- __FUNCTION__, audit_msg, n_rmap, n_actual);
+ __func__, audit_msg, n_rmap, n_actual);
}
static void audit_write_protection(struct kvm_vcpu *vcpu)
@@ -1877,7 +2292,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
if (*rmapp)
printk(KERN_ERR "%s: (%s) shadow page has writable"
" mappings: gfn %lx role %x\n",
- __FUNCTION__, audit_msg, sp->gfn,
+ __func__, audit_msg, sp->gfn,
sp->role.word);
}
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 1fce19ec7a23..e64e9f56a65e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -3,6 +3,12 @@
#include <linux/kvm_host.h>
+#ifdef CONFIG_X86_64
+#define TDP_ROOT_LEVEL PT64_ROOT_LEVEL
+#else
+#define TDP_ROOT_LEVEL PT32E_ROOT_LEVEL
+#endif
+
static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ecc0856268c4..57abbd091143 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -130,7 +130,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
unsigned index, pt_access, pte_access;
gpa_t pte_gpa;
- pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
+ pgprintk("%s: addr %lx\n", __func__, addr);
walk:
walker->level = vcpu->arch.mmu.root_level;
pte = vcpu->arch.cr3;
@@ -155,7 +155,7 @@ walk:
pte_gpa += index * sizeof(pt_element_t);
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
- pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
+ pgprintk("%s: table_gfn[%d] %lx\n", __func__,
walker->level - 1, table_gfn);
kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
@@ -222,7 +222,7 @@ walk:
walker->pt_access = pt_access;
walker->pte_access = pte_access;
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __FUNCTION__, (u64)pte, pt_access, pte_access);
+ __func__, (u64)pte, pt_access, pte_access);
return 1;
not_present:
@@ -243,22 +243,20 @@ err:
}
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
- u64 *spte, const void *pte, int bytes,
- int offset_in_pte)
+ u64 *spte, const void *pte)
{
pt_element_t gpte;
unsigned pte_access;
struct page *npage;
+ int largepage = vcpu->arch.update_pte.largepage;
gpte = *(const pt_element_t *)pte;
if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
- if (!offset_in_pte && !is_present_pte(gpte))
+ if (!is_present_pte(gpte))
set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
return;
}
- if (bytes < sizeof(pt_element_t))
- return;
- pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
+ pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
return;
@@ -267,7 +265,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
return;
get_page(npage);
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
- gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
+ gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
+ npage);
}
/*
@@ -275,8 +274,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
*/
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *walker,
- int user_fault, int write_fault, int *ptwrite,
- struct page *page)
+ int user_fault, int write_fault, int largepage,
+ int *ptwrite, struct page *page)
{
hpa_t shadow_addr;
int level;
@@ -304,11 +303,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
shadow_ent = ((u64 *)__va(shadow_addr)) + index;
if (level == PT_PAGE_TABLE_LEVEL)
break;
- if (is_shadow_present_pte(*shadow_ent)) {
+
+ if (largepage && level == PT_DIRECTORY_LEVEL)
+ break;
+
+ if (is_shadow_present_pte(*shadow_ent)
+ && !is_large_pte(*shadow_ent)) {
shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
continue;
}
+ if (is_large_pte(*shadow_ent))
+ rmap_remove(vcpu->kvm, shadow_ent);
+
if (level - 1 == PT_PAGE_TABLE_LEVEL
&& walker->level == PT_DIRECTORY_LEVEL) {
metaphysical = 1;
@@ -342,7 +349,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
user_fault, write_fault,
walker->ptes[walker->level-1] & PT_DIRTY_MASK,
- ptwrite, walker->gfn, page);
+ ptwrite, largepage, walker->gfn, page);
return shadow_ent;
}
@@ -372,8 +379,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
int write_pt = 0;
int r;
struct page *page;
+ int largepage = 0;
- pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
+ pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
kvm_mmu_audit(vcpu, "pre page fault");
r = mmu_topup_memory_caches(vcpu);
@@ -391,7 +399,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
* The page is not mapped by the guest. Let the guest handle it.
*/
if (!r) {
- pgprintk("%s: guest page fault\n", __FUNCTION__);
+ pgprintk("%s: guest page fault\n", __func__);
inject_page_fault(vcpu, addr, walker.error_code);
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
up_read(&vcpu->kvm->slots_lock);
@@ -399,28 +407,36 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
}
down_read(&current->mm->mmap_sem);
+ if (walker.level == PT_DIRECTORY_LEVEL) {
+ gfn_t large_gfn;
+ large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
+ if (is_largepage_backed(vcpu, large_gfn)) {
+ walker.gfn = large_gfn;
+ largepage = 1;
+ }
+ }
page = gfn_to_page(vcpu->kvm, walker.gfn);
up_read(&current->mm->mmap_sem);
+ /* mmio */
+ if (is_error_page(page)) {
+ pgprintk("gfn %x is mmio\n", walker.gfn);
+ kvm_release_page_clean(page);
+ up_read(&vcpu->kvm->slots_lock);
+ return 1;
+ }
+
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- &write_pt, page);
- pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
+ largepage, &write_pt, page);
+
+ pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
shadow_pte, *shadow_pte, write_pt);
if (!write_pt)
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
- /*
- * mmio: emulate if accessible, otherwise its a guest fault.
- */
- if (shadow_pte && is_io_pte(*shadow_pte)) {
- spin_unlock(&vcpu->kvm->mmu_lock);
- up_read(&vcpu->kvm->slots_lock);
- return 1;
- }
-
++vcpu->stat.pf_fixed;
kvm_mmu_audit(vcpu, "post page fault (fixed)");
spin_unlock(&vcpu->kvm->mmu_lock);
diff --git a/arch/x86/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h
deleted file mode 100644
index 56fc4c873389..000000000000
--- a/arch/x86/kvm/segment_descriptor.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef __SEGMENT_DESCRIPTOR_H
-#define __SEGMENT_DESCRIPTOR_H
-
-struct segment_descriptor {
- u16 limit_low;
- u16 base_low;
- u8 base_mid;
- u8 type : 4;
- u8 system : 1;
- u8 dpl : 2;
- u8 present : 1;
- u8 limit_high : 4;
- u8 avl : 1;
- u8 long_mode : 1;
- u8 default_op : 1;
- u8 granularity : 1;
- u8 base_high;
-} __attribute__((packed));
-
-#ifdef CONFIG_X86_64
-/* LDT or TSS descriptor in the GDT. 16 bytes. */
-struct segment_descriptor_64 {
- struct segment_descriptor s;
- u32 base_higher;
- u32 pad_zero;
-};
-
-#endif
-#endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1a582f1090e8..51741f96e7fb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -47,6 +47,18 @@ MODULE_LICENSE("GPL");
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_DEATURE_SVML (1 << 2)
+#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+
+/* enable NPT for AMD64 and X86 with PAE */
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+static bool npt_enabled = true;
+#else
+static bool npt_enabled = false;
+#endif
+static int npt = 1;
+
+module_param(npt, int, S_IRUGO);
+
static void kvm_reput_irq(struct vcpu_svm *svm);
static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
@@ -54,8 +66,7 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
return container_of(vcpu, struct vcpu_svm, vcpu);
}
-unsigned long iopm_base;
-unsigned long msrpm_base;
+static unsigned long iopm_base;
struct kvm_ldttss_desc {
u16 limit0;
@@ -182,7 +193,7 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
- if (!(efer & EFER_LMA))
+ if (!npt_enabled && !(efer & EFER_LMA))
efer &= ~EFER_LME;
to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
@@ -219,12 +230,12 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
if (!svm->next_rip) {
- printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
+ printk(KERN_DEBUG "%s: NOP\n", __func__);
return;
}
if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
- __FUNCTION__,
+ __func__,
svm->vmcb->save.rip,
svm->next_rip);
@@ -279,11 +290,7 @@ static void svm_hardware_enable(void *garbage)
struct svm_cpu_data *svm_data;
uint64_t efer;
-#ifdef CONFIG_X86_64
- struct desc_ptr gdt_descr;
-#else
struct desc_ptr gdt_descr;
-#endif
struct desc_struct *gdt;
int me = raw_smp_processor_id();
@@ -302,7 +309,6 @@ static void svm_hardware_enable(void *garbage)
svm_data->asid_generation = 1;
svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
svm_data->next_asid = svm_data->max_asid + 1;
- svm_features = cpuid_edx(SVM_CPUID_FUNC);
asm volatile ("sgdt %0" : "=m"(gdt_descr));
gdt = (struct desc_struct *)gdt_descr.address;
@@ -361,12 +367,51 @@ static void set_msr_interception(u32 *msrpm, unsigned msr,
BUG();
}
+static void svm_vcpu_init_msrpm(u32 *msrpm)
+{
+ memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+
+#ifdef CONFIG_X86_64
+ set_msr_interception(msrpm, MSR_GS_BASE, 1, 1);
+ set_msr_interception(msrpm, MSR_FS_BASE, 1, 1);
+ set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1);
+ set_msr_interception(msrpm, MSR_LSTAR, 1, 1);
+ set_msr_interception(msrpm, MSR_CSTAR, 1, 1);
+ set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1);
+#endif
+ set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
+}
+
+static void svm_enable_lbrv(struct vcpu_svm *svm)
+{
+ u32 *msrpm = svm->msrpm;
+
+ svm->vmcb->control.lbr_ctl = 1;
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+}
+
+static void svm_disable_lbrv(struct vcpu_svm *svm)
+{
+ u32 *msrpm = svm->msrpm;
+
+ svm->vmcb->control.lbr_ctl = 0;
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+}
+
static __init int svm_hardware_setup(void)
{
int cpu;
struct page *iopm_pages;
- struct page *msrpm_pages;
- void *iopm_va, *msrpm_va;
+ void *iopm_va;
int r;
iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
@@ -379,41 +424,33 @@ static __init int svm_hardware_setup(void)
clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+ if (boot_cpu_has(X86_FEATURE_NX))
+ kvm_enable_efer_bits(EFER_NX);
- msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+ for_each_online_cpu(cpu) {
+ r = svm_cpu_init(cpu);
+ if (r)
+ goto err;
+ }
- r = -ENOMEM;
- if (!msrpm_pages)
- goto err_1;
+ svm_features = cpuid_edx(SVM_CPUID_FUNC);
- msrpm_va = page_address(msrpm_pages);
- memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
- msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT;
+ if (!svm_has(SVM_FEATURE_NPT))
+ npt_enabled = false;
-#ifdef CONFIG_X86_64
- set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1);
- set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1);
- set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1);
- set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1);
- set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1);
- set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1);
-#endif
- set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1);
- set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1);
- set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
- set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);
+ if (npt_enabled && !npt) {
+ printk(KERN_INFO "kvm: Nested Paging disabled\n");
+ npt_enabled = false;
+ }
- for_each_online_cpu(cpu) {
- r = svm_cpu_init(cpu);
- if (r)
- goto err_2;
+ if (npt_enabled) {
+ printk(KERN_INFO "kvm: Nested Paging enabled\n");
+ kvm_enable_tdp();
}
+
return 0;
-err_2:
- __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
- msrpm_base = 0;
-err_1:
+err:
__free_pages(iopm_pages, IOPM_ALLOC_ORDER);
iopm_base = 0;
return r;
@@ -421,9 +458,8 @@ err_1:
static __exit void svm_hardware_unsetup(void)
{
- __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER);
__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
- iopm_base = msrpm_base = 0;
+ iopm_base = 0;
}
static void init_seg(struct vmcb_seg *seg)
@@ -443,10 +479,10 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
seg->base = 0;
}
-static void init_vmcb(struct vmcb *vmcb)
+static void init_vmcb(struct vcpu_svm *svm)
{
- struct vmcb_control_area *control = &vmcb->control;
- struct vmcb_save_area *save = &vmcb->save;
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct vmcb_save_area *save = &svm->vmcb->save;
control->intercept_cr_read = INTERCEPT_CR0_MASK |
INTERCEPT_CR3_MASK |
@@ -508,7 +544,7 @@ static void init_vmcb(struct vmcb *vmcb)
(1ULL << INTERCEPT_MWAIT);
control->iopm_base_pa = iopm_base;
- control->msrpm_base_pa = msrpm_base;
+ control->msrpm_base_pa = __pa(svm->msrpm);
control->tsc_offset = 0;
control->int_ctl = V_INTR_MASKING_MASK;
@@ -550,13 +586,29 @@ static void init_vmcb(struct vmcb *vmcb)
save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
save->cr4 = X86_CR4_PAE;
/* rdx = ?? */
+
+ if (npt_enabled) {
+ /* Setup VMCB for Nested Paging */
+ control->nested_ctl = 1;
+ control->intercept_exceptions &= ~(1 << PF_VECTOR);
+ control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
+ INTERCEPT_CR3_MASK);
+ control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
+ INTERCEPT_CR3_MASK);
+ save->g_pat = 0x0007040600070406ULL;
+ /* enable caching because the QEMU Bios doesn't enable it */
+ save->cr0 = X86_CR0_ET;
+ save->cr3 = 0;
+ save->cr4 = 0;
+ }
+
}
static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- init_vmcb(svm->vmcb);
+ init_vmcb(svm);
if (vcpu->vcpu_id != 0) {
svm->vmcb->save.rip = 0;
@@ -571,6 +623,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
{
struct vcpu_svm *svm;
struct page *page;
+ struct page *msrpm_pages;
int err;
svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
@@ -589,12 +642,19 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
goto uninit;
}
+ err = -ENOMEM;
+ msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+ if (!msrpm_pages)
+ goto uninit;
+ svm->msrpm = page_address(msrpm_pages);
+ svm_vcpu_init_msrpm(svm->msrpm);
+
svm->vmcb = page_address(page);
clear_page(svm->vmcb);
svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
svm->asid_generation = 0;
memset(svm->db_regs, 0, sizeof(svm->db_regs));
- init_vmcb(svm->vmcb);
+ init_vmcb(svm);
fx_init(&svm->vcpu);
svm->vcpu.fpu_active = 1;
@@ -617,6 +677,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
__free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
+ __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, svm);
}
@@ -784,6 +845,9 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
}
#endif
+ if (npt_enabled)
+ goto set;
+
if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
vcpu->fpu_active = 1;
@@ -791,18 +855,26 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vcpu->arch.cr0 = cr0;
cr0 |= X86_CR0_PG | X86_CR0_WP;
- cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
if (!vcpu->fpu_active) {
svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
cr0 |= X86_CR0_TS;
}
+set:
+ /*
+ * re-enable caching here because the QEMU bios
+ * does not do it - this results in some delay at
+ * reboot
+ */
+ cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
}
static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
vcpu->arch.cr4 = cr4;
- to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
+ if (!npt_enabled)
+ cr4 |= X86_CR4_PAE;
+ to_svm(vcpu)->vmcb->save.cr4 = cr4;
}
static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -920,7 +992,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
}
default:
printk(KERN_DEBUG "%s: unexpected dr %u\n",
- __FUNCTION__, dr);
+ __func__, dr);
*exception = UD_VECTOR;
return;
}
@@ -969,7 +1041,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
* so reinitialize it.
*/
clear_page(svm->vmcb);
- init_vmcb(svm->vmcb);
+ init_vmcb(svm);
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
@@ -1033,7 +1105,7 @@ static int invalid_op_interception(struct vcpu_svm *svm,
static int task_switch_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
- pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__);
+ pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __func__);
kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
return 0;
}
@@ -1049,7 +1121,7 @@ static int emulate_on_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
- pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
+ pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
return 1;
}
@@ -1179,8 +1251,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
svm->vmcb->save.sysenter_esp = data;
break;
case MSR_IA32_DEBUGCTLMSR:
- pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
- __FUNCTION__, data);
+ if (!svm_has(SVM_FEATURE_LBRV)) {
+ pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
+ __func__, data);
+ break;
+ }
+ if (data & DEBUGCTL_RESERVED_BITS)
+ return 1;
+
+ svm->vmcb->save.dbgctl = data;
+ if (data & (1ULL<<0))
+ svm_enable_lbrv(svm);
+ else
+ svm_disable_lbrv(svm);
break;
case MSR_K7_EVNTSEL0:
case MSR_K7_EVNTSEL1:
@@ -1290,14 +1373,34 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_WBINVD] = emulate_on_interception,
[SVM_EXIT_MONITOR] = invalid_op_interception,
[SVM_EXIT_MWAIT] = invalid_op_interception,
+ [SVM_EXIT_NPF] = pf_interception,
};
-
static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
u32 exit_code = svm->vmcb->control.exit_code;
+ if (npt_enabled) {
+ int mmu_reload = 0;
+ if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
+ svm_set_cr0(vcpu, svm->vmcb->save.cr0);
+ mmu_reload = 1;
+ }
+ vcpu->arch.cr0 = svm->vmcb->save.cr0;
+ vcpu->arch.cr3 = svm->vmcb->save.cr3;
+ if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
+ if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ }
+ if (mmu_reload) {
+ kvm_mmu_reset_context(vcpu);
+ kvm_mmu_load(vcpu);
+ }
+ }
+
kvm_reput_irq(svm);
if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
@@ -1308,10 +1411,11 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
}
if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
- exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
+ exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
+ exit_code != SVM_EXIT_NPF)
printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
"exit_code 0x%x\n",
- __FUNCTION__, svm->vmcb->control.exit_int_info,
+ __func__, svm->vmcb->control.exit_int_info,
exit_code);
if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
@@ -1499,6 +1603,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
svm->host_dr6 = read_dr6();
svm->host_dr7 = read_dr7();
svm->vmcb->save.cr2 = vcpu->arch.cr2;
+ /* required for live migration with NPT */
+ if (npt_enabled)
+ svm->vmcb->save.cr3 = vcpu->arch.cr3;
if (svm->vmcb->save.dr7 & 0xff) {
write_dr7(0);
@@ -1642,6 +1749,12 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ if (npt_enabled) {
+ svm->vmcb->control.nested_cr3 = root;
+ force_new_asid(vcpu);
+ return;
+ }
+
svm->vmcb->save.cr3 = root;
force_new_asid(vcpu);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 94ea724638fd..9810608c961f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -17,7 +17,6 @@
#include "irq.h"
#include "vmx.h"
-#include "segment_descriptor.h"
#include "mmu.h"
#include <linux/kvm_host.h>
@@ -37,6 +36,9 @@ MODULE_LICENSE("GPL");
static int bypass_guest_pf = 1;
module_param(bypass_guest_pf, bool, 0);
+static int enable_vpid = 1;
+module_param(enable_vpid, bool, 0);
+
struct vmcs {
u32 revision_id;
u32 abort;
@@ -71,6 +73,7 @@ struct vcpu_vmx {
unsigned rip;
} irq;
} rmode;
+ int vpid;
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -86,6 +89,9 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static struct page *vmx_io_bitmap_a;
static struct page *vmx_io_bitmap_b;
+static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
+static DEFINE_SPINLOCK(vmx_vpid_lock);
+
static struct vmcs_config {
int size;
int order;
@@ -204,6 +210,12 @@ static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
(irqchip_in_kernel(kvm)));
}
+static inline int cpu_has_vmx_vpid(void)
+{
+ return (vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_VPID);
+}
+
static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
int i;
@@ -214,6 +226,20 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
return -1;
}
+static inline void __invvpid(int ext, u16 vpid, gva_t gva)
+{
+ struct {
+ u64 vpid : 16;
+ u64 rsvd : 48;
+ u64 gva;
+ } operand = { vpid, 0, gva };
+
+ asm volatile (ASM_VMX_INVVPID
+ /* CF==1 or ZF==1 --> rc = -1 */
+ "; ja 1f ; ud2 ; 1:"
+ : : "a"(&operand), "c"(ext) : "cc", "memory");
+}
+
static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
{
int i;
@@ -257,6 +283,14 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
vmx->launched = 0;
}
+static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
+{
+ if (vmx->vpid == 0)
+ return;
+
+ __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+}
+
static unsigned long vmcs_readl(unsigned long field)
{
unsigned long value;
@@ -349,19 +383,16 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
static void reload_tss(void)
{
-#ifndef CONFIG_X86_64
-
/*
* VT restores TR but not its size. Useless.
*/
struct descriptor_table gdt;
- struct segment_descriptor *descs;
+ struct desc_struct *descs;
get_gdt(&gdt);
descs = (void *)gdt.base;
descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
load_TR_desc();
-#endif
}
static void load_transition_efer(struct vcpu_vmx *vmx)
@@ -488,11 +519,12 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 phys_addr = __pa(vmx->vmcs);
- u64 tsc_this, delta;
+ u64 tsc_this, delta, new_offset;
if (vcpu->cpu != cpu) {
vcpu_clear(vmx);
kvm_migrate_apic_timer(vcpu);
+ vpid_sync_vcpu_all(vmx);
}
if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
@@ -527,8 +559,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* Make sure the time stamp counter is monotonous.
*/
rdtscll(tsc_this);
- delta = vcpu->arch.host_tsc - tsc_this;
- vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
+ if (tsc_this < vcpu->arch.host_tsc) {
+ delta = vcpu->arch.host_tsc - tsc_this;
+ new_offset = vmcs_read64(TSC_OFFSET) + delta;
+ vmcs_write64(TSC_OFFSET, new_offset);
+ }
}
}
@@ -599,7 +634,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
{
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
nr | INTR_TYPE_EXCEPTION
- | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
+ | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
| INTR_INFO_VALID_MASK);
if (has_error_code)
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
@@ -974,7 +1009,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
min = 0;
opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
- SECONDARY_EXEC_WBINVD_EXITING;
+ SECONDARY_EXEC_WBINVD_EXITING |
+ SECONDARY_EXEC_ENABLE_VPID;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
return -EIO;
@@ -1083,6 +1119,10 @@ static __init int hardware_setup(void)
{
if (setup_vmcs_config(&vmcs_config) < 0)
return -EIO;
+
+ if (boot_cpu_has(X86_FEATURE_NX))
+ kvm_enable_efer_bits(EFER_NX);
+
return alloc_kvm_area();
}
@@ -1217,7 +1257,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
- __FUNCTION__);
+ __func__);
vmcs_write32(GUEST_TR_AR_BYTES,
(guest_tr_ar & ~AR_TYPE_MASK)
| AR_TYPE_BUSY_64_TSS);
@@ -1242,6 +1282,11 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
#endif
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ vpid_sync_vcpu_all(to_vmx(vcpu));
+}
+
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
@@ -1278,6 +1323,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
+ vmx_flush_tlb(vcpu);
vmcs_writel(GUEST_CR3, cr3);
if (vcpu->arch.cr0 & X86_CR0_PE)
vmx_fpu_deactivate(vcpu);
@@ -1291,14 +1337,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vcpu->arch.cr4 = cr4;
}
-#ifdef CONFIG_X86_64
-
static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
vcpu->arch.shadow_efer = efer;
+ if (!msr)
+ return;
if (efer & EFER_LMA) {
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1315,8 +1361,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
setup_msrs(vmx);
}
-#endif
-
static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -1497,6 +1541,22 @@ out:
return r;
}
+static void allocate_vpid(struct vcpu_vmx *vmx)
+{
+ int vpid;
+
+ vmx->vpid = 0;
+ if (!enable_vpid || !cpu_has_vmx_vpid())
+ return;
+ spin_lock(&vmx_vpid_lock);
+ vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
+ if (vpid < VMX_NR_VPIDS) {
+ vmx->vpid = vpid;
+ __set_bit(vpid, vmx_vpid_bitmap);
+ }
+ spin_unlock(&vmx_vpid_lock);
+}
+
/*
* Sets up the vmcs for emulated real mode.
*/
@@ -1535,6 +1595,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
exec_control &=
~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ if (vmx->vpid == 0)
+ exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
@@ -1624,7 +1686,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->vcpu.arch.rmode.active = 0;
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
- set_cr8(&vmx->vcpu, 0);
+ kvm_set_cr8(&vmx->vcpu, 0);
msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
if (vmx->vcpu.vcpu_id == 0)
msr |= MSR_IA32_APICBASE_BSP;
@@ -1707,15 +1769,18 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_write64(APIC_ACCESS_ADDR,
page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
+ if (vmx->vpid != 0)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+
vmx->vcpu.arch.cr0 = 0x60000010;
vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
vmx_set_cr4(&vmx->vcpu, 0);
-#ifdef CONFIG_X86_64
vmx_set_efer(&vmx->vcpu, 0);
-#endif
vmx_fpu_activate(&vmx->vcpu);
update_exception_bitmap(&vmx->vcpu);
+ vpid_sync_vcpu_all(vmx);
+
return 0;
out:
@@ -1847,7 +1912,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if ((vect_info & VECTORING_INFO_VALID_MASK) &&
!is_page_fault(intr_info))
printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
- "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
+ "intr info 0x%x\n", __func__, vect_info, intr_info);
if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
@@ -1872,7 +1937,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
error_code = 0;
rip = vmcs_readl(GUEST_RIP);
- if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
+ if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
if (is_page_fault(intr_info)) {
cr2 = vmcs_readl(EXIT_QUALIFICATION);
@@ -1964,22 +2029,22 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
switch (cr) {
case 0:
vcpu_load_rsp_rip(vcpu);
- set_cr0(vcpu, vcpu->arch.regs[reg]);
+ kvm_set_cr0(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
return 1;
case 3:
vcpu_load_rsp_rip(vcpu);
- set_cr3(vcpu, vcpu->arch.regs[reg]);
+ kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
return 1;
case 4:
vcpu_load_rsp_rip(vcpu);
- set_cr4(vcpu, vcpu->arch.regs[reg]);
+ kvm_set_cr4(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
return 1;
case 8:
vcpu_load_rsp_rip(vcpu);
- set_cr8(vcpu, vcpu->arch.regs[reg]);
+ kvm_set_cr8(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
if (irqchip_in_kernel(vcpu->kvm))
return 1;
@@ -2005,14 +2070,14 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
case 8:
vcpu_load_rsp_rip(vcpu);
- vcpu->arch.regs[reg] = get_cr8(vcpu);
+ vcpu->arch.regs[reg] = kvm_get_cr8(vcpu);
vcpu_put_rsp_rip(vcpu);
skip_emulated_instruction(vcpu);
return 1;
}
break;
case 3: /* lmsw */
- lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
+ kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
skip_emulated_instruction(vcpu);
return 1;
@@ -2213,7 +2278,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
exit_reason != EXIT_REASON_EXCEPTION_NMI)
printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
- "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
+ "exit reason is 0x%x\n", __func__, exit_reason);
if (exit_reason < kvm_vmx_max_exit_handlers
&& kvm_vmx_exit_handlers[exit_reason])
return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
@@ -2224,10 +2289,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
return 0;
}
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
-{
-}
-
static void update_tpr_threshold(struct kvm_vcpu *vcpu)
{
int max_irr, tpr;
@@ -2292,7 +2353,7 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
- if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK))
+ if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK))
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
vmcs_read32(IDT_VECTORING_ERROR_CODE));
if (unlikely(has_ext_irq))
@@ -2492,6 +2553,10 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ spin_lock(&vmx_vpid_lock);
+ if (vmx->vpid != 0)
+ __clear_bit(vmx->vpid, vmx_vpid_bitmap);
+ spin_unlock(&vmx_vpid_lock);
vmx_free_vmcs(vcpu);
kfree(vmx->host_msrs);
kfree(vmx->guest_msrs);
@@ -2508,6 +2573,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (!vmx)
return ERR_PTR(-ENOMEM);
+ allocate_vpid(vmx);
+
err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
if (err)
goto free_vcpu;
@@ -2599,9 +2666,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_cr0 = vmx_set_cr0,
.set_cr3 = vmx_set_cr3,
.set_cr4 = vmx_set_cr4,
-#ifdef CONFIG_X86_64
.set_efer = vmx_set_efer,
-#endif
.get_idt = vmx_get_idt,
.set_idt = vmx_set_idt,
.get_gdt = vmx_get_gdt,
@@ -2655,6 +2720,8 @@ static int __init vmx_init(void)
memset(iova, 0xff, PAGE_SIZE);
kunmap(vmx_io_bitmap_b);
+ set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+
r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
if (r)
goto out1;
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index d52ae8d7303d..5dff4606b988 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -49,6 +49,7 @@
* Definitions of Secondary Processor-Based VM-Execution Controls.
*/
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
@@ -65,6 +66,7 @@
/* VMCS Encodings */
enum vmcs_field {
+ VIRTUAL_PROCESSOR_ID = 0x00000000,
GUEST_ES_SELECTOR = 0x00000800,
GUEST_CS_SELECTOR = 0x00000802,
GUEST_SS_SELECTOR = 0x00000804,
@@ -231,12 +233,12 @@ enum vmcs_field {
*/
#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
-#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */
+#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */
#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
-#define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK
+#define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK
#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
@@ -321,4 +323,8 @@ enum vmcs_field {
#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
+#define VMX_NR_VPIDS (1 << 16)
+#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
+#define VMX_VPID_EXTENT_ALL_CONTEXT 2
+
#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6b01552bd1f1..1ef56ad7aa80 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -15,10 +15,11 @@
*/
#include <linux/kvm_host.h>
-#include "segment_descriptor.h"
#include "irq.h"
#include "mmu.h"
+#include "i8254.h"
+#include <linux/clocksource.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
@@ -28,6 +29,7 @@
#include <asm/uaccess.h>
#include <asm/msr.h>
+#include <asm/desc.h>
#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
@@ -41,7 +43,15 @@
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
-#define EFER_RESERVED_BITS 0xfffffffffffff2fe
+/* EFER defaults:
+ * - enable syscall per default because its emulated by KVM
+ * - enable LME and LMA per default on 64 bit KVM
+ */
+#ifdef CONFIG_X86_64
+static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
+#else
+static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
+#endif
#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
@@ -63,6 +73,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "irq_window", VCPU_STAT(irq_window_exits) },
{ "halt_exits", VCPU_STAT(halt_exits) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
+ { "hypercalls", VCPU_STAT(hypercalls) },
{ "request_irq", VCPU_STAT(request_irq_exits) },
{ "irq_exits", VCPU_STAT(irq_exits) },
{ "host_state_reload", VCPU_STAT(host_state_reload) },
@@ -78,6 +89,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_recycled", VM_STAT(mmu_recycled) },
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+ { "largepages", VM_STAT(lpages) },
{ NULL }
};
@@ -85,7 +97,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
unsigned long segment_base(u16 selector)
{
struct descriptor_table gdt;
- struct segment_descriptor *d;
+ struct desc_struct *d;
unsigned long table_base;
unsigned long v;
@@ -101,13 +113,12 @@ unsigned long segment_base(u16 selector)
asm("sldt %0" : "=g"(ldt_selector));
table_base = segment_base(ldt_selector);
}
- d = (struct segment_descriptor *)(table_base + (selector & ~7));
- v = d->base_low | ((unsigned long)d->base_mid << 16) |
- ((unsigned long)d->base_high << 24);
+ d = (struct desc_struct *)(table_base + (selector & ~7));
+ v = d->base0 | ((unsigned long)d->base1 << 16) |
+ ((unsigned long)d->base2 << 24);
#ifdef CONFIG_X86_64
- if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
- v |= ((unsigned long) \
- ((struct segment_descriptor_64 *)d)->base_higher) << 32;
+ if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+ v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
#endif
return v;
}
@@ -145,11 +156,16 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
u32 error_code)
{
++vcpu->stat.pf_guest;
- if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
- printk(KERN_DEBUG "kvm: inject_page_fault:"
- " double fault 0x%lx\n", addr);
- vcpu->arch.exception.nr = DF_VECTOR;
- vcpu->arch.exception.error_code = 0;
+ if (vcpu->arch.exception.pending) {
+ if (vcpu->arch.exception.nr == PF_VECTOR) {
+ printk(KERN_DEBUG "kvm: inject_page_fault:"
+ " double fault 0x%lx\n", addr);
+ vcpu->arch.exception.nr = DF_VECTOR;
+ vcpu->arch.exception.error_code = 0;
+ } else if (vcpu->arch.exception.nr == DF_VECTOR) {
+ /* triple fault -> shutdown */
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ }
return;
}
vcpu->arch.cr2 = addr;
@@ -205,6 +221,7 @@ out:
return ret;
}
+EXPORT_SYMBOL_GPL(load_pdptrs);
static bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
@@ -226,7 +243,7 @@ out:
return changed;
}
-void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
if (cr0 & CR0_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
@@ -284,15 +301,15 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
kvm_mmu_reset_context(vcpu);
return;
}
-EXPORT_SYMBOL_GPL(set_cr0);
+EXPORT_SYMBOL_GPL(kvm_set_cr0);
-void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
+void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
- set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
+ kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
}
-EXPORT_SYMBOL_GPL(lmsw);
+EXPORT_SYMBOL_GPL(kvm_lmsw);
-void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
if (cr4 & CR4_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
@@ -323,9 +340,9 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vcpu->arch.cr4 = cr4;
kvm_mmu_reset_context(vcpu);
}
-EXPORT_SYMBOL_GPL(set_cr4);
+EXPORT_SYMBOL_GPL(kvm_set_cr4);
-void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
kvm_mmu_flush_tlb(vcpu);
@@ -377,9 +394,9 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
}
up_read(&vcpu->kvm->slots_lock);
}
-EXPORT_SYMBOL_GPL(set_cr3);
+EXPORT_SYMBOL_GPL(kvm_set_cr3);
-void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
if (cr8 & CR8_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
@@ -391,16 +408,16 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
else
vcpu->arch.cr8 = cr8;
}
-EXPORT_SYMBOL_GPL(set_cr8);
+EXPORT_SYMBOL_GPL(kvm_set_cr8);
-unsigned long get_cr8(struct kvm_vcpu *vcpu)
+unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
if (irqchip_in_kernel(vcpu->kvm))
return kvm_lapic_get_cr8(vcpu);
else
return vcpu->arch.cr8;
}
-EXPORT_SYMBOL_GPL(get_cr8);
+EXPORT_SYMBOL_GPL(kvm_get_cr8);
/*
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -415,7 +432,8 @@ static u32 msrs_to_save[] = {
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
- MSR_IA32_TIME_STAMP_COUNTER,
+ MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_IA32_PERF_STATUS,
};
static unsigned num_msrs_to_save;
@@ -424,11 +442,9 @@ static u32 emulated_msrs[] = {
MSR_IA32_MISC_ENABLE,
};
-#ifdef CONFIG_X86_64
-
static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
- if (efer & EFER_RESERVED_BITS) {
+ if (efer & efer_reserved_bits) {
printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
efer);
kvm_inject_gp(vcpu, 0);
@@ -450,7 +466,12 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
vcpu->arch.shadow_efer = efer;
}
-#endif
+void kvm_enable_efer_bits(u64 mask)
+{
+ efer_reserved_bits &= ~mask;
+}
+EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
+
/*
* Writes msr value into into the appropriate "register".
@@ -470,26 +491,88 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
return kvm_set_msr(vcpu, index, *data);
}
+static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
+{
+ static int version;
+ struct kvm_wall_clock wc;
+ struct timespec wc_ts;
+
+ if (!wall_clock)
+ return;
+
+ mutex_lock(&kvm->lock);
+
+ version++;
+ kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+
+ wc_ts = current_kernel_time();
+ wc.wc_sec = wc_ts.tv_sec;
+ wc.wc_nsec = wc_ts.tv_nsec;
+ wc.wc_version = version;
+ kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
+
+ version++;
+ kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+
+ mutex_unlock(&kvm->lock);
+}
+
+static void kvm_write_guest_time(struct kvm_vcpu *v)
+{
+ struct timespec ts;
+ unsigned long flags;
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+ void *shared_kaddr;
+
+ if ((!vcpu->time_page))
+ return;
+
+ /* Keep irq disabled to prevent changes to the clock */
+ local_irq_save(flags);
+ kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
+ &vcpu->hv_clock.tsc_timestamp);
+ ktime_get_ts(&ts);
+ local_irq_restore(flags);
+
+ /* With all the info we got, fill in the values */
+
+ vcpu->hv_clock.system_time = ts.tv_nsec +
+ (NSEC_PER_SEC * (u64)ts.tv_sec);
+ /*
+ * The interface expects us to write an even number signaling that the
+ * update is finished. Since the guest won't see the intermediate
+ * state, we just write "2" at the end
+ */
+ vcpu->hv_clock.version = 2;
+
+ shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
+
+ memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock));
+
+ kunmap_atomic(shared_kaddr, KM_USER0);
+
+ mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
+}
+
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
-#ifdef CONFIG_X86_64
case MSR_EFER:
set_efer(vcpu, data);
break;
-#endif
case MSR_IA32_MC0_STATUS:
pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
- __FUNCTION__, data);
+ __func__, data);
break;
case MSR_IA32_MCG_STATUS:
pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
- __FUNCTION__, data);
+ __func__, data);
break;
case MSR_IA32_MCG_CTL:
pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
- __FUNCTION__, data);
+ __func__, data);
break;
case MSR_IA32_UCODE_REV:
case MSR_IA32_UCODE_WRITE:
@@ -501,6 +584,42 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
+ case MSR_KVM_WALL_CLOCK:
+ vcpu->kvm->arch.wall_clock = data;
+ kvm_write_wall_clock(vcpu->kvm, data);
+ break;
+ case MSR_KVM_SYSTEM_TIME: {
+ if (vcpu->arch.time_page) {
+ kvm_release_page_dirty(vcpu->arch.time_page);
+ vcpu->arch.time_page = NULL;
+ }
+
+ vcpu->arch.time = data;
+
+ /* we verify if the enable bit is set... */
+ if (!(data & 1))
+ break;
+
+ /* ...but clean it before doing the actual write */
+ vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
+
+ vcpu->arch.hv_clock.tsc_to_system_mul =
+ clocksource_khz2mult(tsc_khz, 22);
+ vcpu->arch.hv_clock.tsc_shift = 22;
+
+ down_read(&current->mm->mmap_sem);
+ vcpu->arch.time_page =
+ gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
+ up_read(&current->mm->mmap_sem);
+
+ if (is_error_page(vcpu->arch.time_page)) {
+ kvm_release_page_clean(vcpu->arch.time_page);
+ vcpu->arch.time_page = NULL;
+ }
+
+ kvm_write_guest_time(vcpu);
+ break;
+ }
default:
pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
return 1;
@@ -540,7 +659,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MC0_MISC+12:
case MSR_IA32_MC0_MISC+16:
case MSR_IA32_UCODE_REV:
- case MSR_IA32_PERF_STATUS:
case MSR_IA32_EBL_CR_POWERON:
/* MTRR registers */
case 0xfe:
@@ -556,11 +674,21 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MISC_ENABLE:
data = vcpu->arch.ia32_misc_enable_msr;
break;
-#ifdef CONFIG_X86_64
+ case MSR_IA32_PERF_STATUS:
+ /* TSC increment by tick */
+ data = 1000ULL;
+ /* CPU multiplier */
+ data |= (((uint64_t)4ULL) << 40);
+ break;
case MSR_EFER:
data = vcpu->arch.shadow_efer;
break;
-#endif
+ case MSR_KVM_WALL_CLOCK:
+ data = vcpu->kvm->arch.wall_clock;
+ break;
+ case MSR_KVM_SYSTEM_TIME:
+ data = vcpu->arch.time;
+ break;
default:
pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
return 1;
@@ -688,11 +816,23 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_USER_MEMORY:
case KVM_CAP_SET_TSS_ADDR:
case KVM_CAP_EXT_CPUID:
+ case KVM_CAP_CLOCKSOURCE:
+ case KVM_CAP_PIT:
+ case KVM_CAP_NOP_IO_DELAY:
r = 1;
break;
case KVM_CAP_VAPIC:
r = !kvm_x86_ops->cpu_has_accelerated_tpr();
break;
+ case KVM_CAP_NR_VCPUS:
+ r = KVM_MAX_VCPUS;
+ break;
+ case KVM_CAP_NR_MEMSLOTS:
+ r = KVM_MEMORY_SLOTS;
+ break;
+ case KVM_CAP_PV_MMU:
+ r = !tdp_enabled;
+ break;
default:
r = 0;
break;
@@ -763,6 +903,7 @@ out:
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
kvm_x86_ops->vcpu_load(vcpu, cpu);
+ kvm_write_guest_time(vcpu);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -958,32 +1099,32 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
/* function 4 and 0xb have additional index. */
case 4: {
- int index, cache_type;
+ int i, cache_type;
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
/* read more entries until cache_type is zero */
- for (index = 1; *nent < maxnent; ++index) {
- cache_type = entry[index - 1].eax & 0x1f;
+ for (i = 1; *nent < maxnent; ++i) {
+ cache_type = entry[i - 1].eax & 0x1f;
if (!cache_type)
break;
- do_cpuid_1_ent(&entry[index], function, index);
- entry[index].flags |=
+ do_cpuid_1_ent(&entry[i], function, i);
+ entry[i].flags |=
KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
++*nent;
}
break;
}
case 0xb: {
- int index, level_type;
+ int i, level_type;
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
/* read more entries until level_type is zero */
- for (index = 1; *nent < maxnent; ++index) {
- level_type = entry[index - 1].ecx & 0xff;
+ for (i = 1; *nent < maxnent; ++i) {
+ level_type = entry[i - 1].ecx & 0xff;
if (!level_type)
break;
- do_cpuid_1_ent(&entry[index], function, index);
- entry[index].flags |=
+ do_cpuid_1_ent(&entry[i], function, i);
+ entry[i].flags |=
KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
++*nent;
}
@@ -1365,6 +1506,23 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
return r;
}
+static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ int r = 0;
+
+ memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
+ return r;
+}
+
+static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ int r = 0;
+
+ memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
+ kvm_pit_load_count(kvm, 0, ps->channels[0].count);
+ return r;
+}
+
/*
* Get (and clear) the dirty memory log for a memory slot.
*/
@@ -1457,6 +1615,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
} else
goto out;
break;
+ case KVM_CREATE_PIT:
+ r = -ENOMEM;
+ kvm->arch.vpit = kvm_create_pit(kvm);
+ if (kvm->arch.vpit)
+ r = 0;
+ break;
case KVM_IRQ_LINE: {
struct kvm_irq_level irq_event;
@@ -1512,6 +1676,37 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
break;
}
+ case KVM_GET_PIT: {
+ struct kvm_pit_state ps;
+ r = -EFAULT;
+ if (copy_from_user(&ps, argp, sizeof ps))
+ goto out;
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_get_pit(kvm, &ps);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &ps, sizeof ps))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_PIT: {
+ struct kvm_pit_state ps;
+ r = -EFAULT;
+ if (copy_from_user(&ps, argp, sizeof ps))
+ goto out;
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_set_pit(kvm, &ps);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
default:
;
}
@@ -1646,22 +1841,29 @@ mmio:
return X86EMUL_UNHANDLEABLE;
}
-static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
- const void *val, int bytes)
+int __emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const void *val, int bytes)
{
int ret;
- down_read(&vcpu->kvm->slots_lock);
ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
- if (ret < 0) {
- up_read(&vcpu->kvm->slots_lock);
+ if (ret < 0)
return 0;
- }
kvm_mmu_pte_write(vcpu, gpa, val, bytes);
- up_read(&vcpu->kvm->slots_lock);
return 1;
}
+static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const void *val, int bytes)
+{
+ int ret;
+
+ down_read(&vcpu->kvm->slots_lock);
+ ret =__emulator_write_phys(vcpu, gpa, val, bytes);
+ up_read(&vcpu->kvm->slots_lock);
+ return ret;
+}
+
static int emulator_write_emulated_onepage(unsigned long addr,
const void *val,
unsigned int bytes,
@@ -1802,7 +2004,7 @@ int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
*dest = kvm_x86_ops->get_dr(vcpu, dr);
return X86EMUL_CONTINUE;
default:
- pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
+ pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
return X86EMUL_UNHANDLEABLE;
}
}
@@ -1840,7 +2042,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
}
EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
-struct x86_emulate_ops emulate_ops = {
+static struct x86_emulate_ops emulate_ops = {
.read_std = emulator_read_std,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
@@ -2251,9 +2453,19 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
+ unsigned long a1)
+{
+ if (is_long_mode(vcpu))
+ return a0;
+ else
+ return a0 | ((gpa_t)a1 << 32);
+}
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
+ int r = 1;
kvm_x86_ops->cache_regs(vcpu);
@@ -2275,13 +2487,17 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
case KVM_HC_VAPIC_POLL_IRQ:
ret = 0;
break;
+ case KVM_HC_MMU_OP:
+ r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
+ break;
default:
ret = -KVM_ENOSYS;
break;
}
vcpu->arch.regs[VCPU_REGS_RAX] = ret;
kvm_x86_ops->decache_regs(vcpu);
- return 0;
+ ++vcpu->stat.hypercalls;
+ return r;
}
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
@@ -2329,7 +2545,7 @@ void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
unsigned long *rflags)
{
- lmsw(vcpu, msw);
+ kvm_lmsw(vcpu, msw);
*rflags = kvm_x86_ops->get_rflags(vcpu);
}
@@ -2346,9 +2562,9 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
case 4:
return vcpu->arch.cr4;
case 8:
- return get_cr8(vcpu);
+ return kvm_get_cr8(vcpu);
default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+ vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
return 0;
}
}
@@ -2358,23 +2574,23 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
{
switch (cr) {
case 0:
- set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
+ kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
*rflags = kvm_x86_ops->get_rflags(vcpu);
break;
case 2:
vcpu->arch.cr2 = val;
break;
case 3:
- set_cr3(vcpu, val);
+ kvm_set_cr3(vcpu, val);
break;
case 4:
- set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
+ kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
break;
case 8:
- set_cr8(vcpu, val & 0xfUL);
+ kvm_set_cr8(vcpu, val & 0xfUL);
break;
default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+ vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
}
}
@@ -2469,7 +2685,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run)
{
kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
- kvm_run->cr8 = get_cr8(vcpu);
+ kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
if (irqchip_in_kernel(vcpu->kvm))
kvm_run->ready_for_interrupt_injection = 1;
@@ -2526,6 +2742,10 @@ preempted:
kvm_x86_ops->guest_debug_pre(vcpu);
again:
+ if (vcpu->requests)
+ if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
+ kvm_mmu_unload(vcpu);
+
r = kvm_mmu_reload(vcpu);
if (unlikely(r))
goto out;
@@ -2539,6 +2759,11 @@ again:
r = 0;
goto out;
}
+ if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
+ kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+ r = 0;
+ goto out;
+ }
}
kvm_inject_pending_timer_irqs(vcpu);
@@ -2557,6 +2782,14 @@ again:
goto out;
}
+ if (vcpu->requests)
+ if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
+ local_irq_enable();
+ preempt_enable();
+ r = 1;
+ goto out;
+ }
+
if (signal_pending(current)) {
local_irq_enable();
preempt_enable();
@@ -2658,7 +2891,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
/* re-sync apic's tpr */
if (!irqchip_in_kernel(vcpu->kvm))
- set_cr8(vcpu, kvm_run->cr8);
+ kvm_set_cr8(vcpu, kvm_run->cr8);
if (vcpu->arch.pio.cur_count) {
r = complete_pio(vcpu);
@@ -2773,7 +3006,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
static void get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- return kvm_x86_ops->get_segment(vcpu, var, seg);
+ kvm_x86_ops->get_segment(vcpu, var, seg);
}
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -2816,7 +3049,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
sregs->cr2 = vcpu->arch.cr2;
sregs->cr3 = vcpu->arch.cr3;
sregs->cr4 = vcpu->arch.cr4;
- sregs->cr8 = get_cr8(vcpu);
+ sregs->cr8 = kvm_get_cr8(vcpu);
sregs->efer = vcpu->arch.shadow_efer;
sregs->apic_base = kvm_get_apic_base(vcpu);
@@ -2839,7 +3072,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
static void set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- return kvm_x86_ops->set_segment(vcpu, var, seg);
+ kvm_x86_ops->set_segment(vcpu, var, seg);
}
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
@@ -2862,12 +3095,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
- set_cr8(vcpu, sregs->cr8);
+ kvm_set_cr8(vcpu, sregs->cr8);
mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
-#ifdef CONFIG_X86_64
kvm_x86_ops->set_efer(vcpu, sregs->efer);
-#endif
kvm_set_apic_base(vcpu, sregs->apic_base);
kvm_x86_ops->decache_cr4_guest_bits(vcpu);
@@ -3219,6 +3450,7 @@ static void kvm_free_vcpus(struct kvm *kvm)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
+ kvm_free_pit(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 79586003397a..f59ed93f5d24 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -65,6 +65,14 @@
#define MemAbs (1<<9) /* Memory operand is absolute displacement */
#define String (1<<10) /* String instruction (rep capable) */
#define Stack (1<<11) /* Stack instruction (push/pop) */
+#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
+#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
+#define GroupMask 0xff /* Group number stored in bits 0:7 */
+
+enum {
+ Group1_80, Group1_81, Group1_82, Group1_83,
+ Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
+};
static u16 opcode_table[256] = {
/* 0x00 - 0x07 */
@@ -123,14 +131,14 @@ static u16 opcode_table[256] = {
ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
/* 0x80 - 0x87 */
- ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
+ Group | Group1_80, Group | Group1_81,
+ Group | Group1_82, Group | Group1_83,
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
/* 0x88 - 0x8F */
ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack,
+ 0, ModRM | DstReg, 0, Group | Group1A,
/* 0x90 - 0x9F */
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
@@ -164,16 +172,15 @@ static u16 opcode_table[256] = {
0, 0, 0, 0,
/* 0xF0 - 0xF7 */
0, 0, 0, 0,
- ImplicitOps, ImplicitOps,
- ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+ ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
/* 0xF8 - 0xFF */
ImplicitOps, 0, ImplicitOps, ImplicitOps,
- 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
+ 0, 0, Group | Group4, Group | Group5,
};
static u16 twobyte_table[256] = {
/* 0x00 - 0x0F */
- 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
+ 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
/* 0x10 - 0x1F */
0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
@@ -229,6 +236,53 @@ static u16 twobyte_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
+static u16 group_table[] = {
+ [Group1_80*8] =
+ ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ [Group1_81*8] =
+ DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+ DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+ DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+ DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+ [Group1_82*8] =
+ ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ [Group1_83*8] =
+ DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
+ DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
+ DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
+ DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
+ [Group1A*8] =
+ DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
+ [Group3_Byte*8] =
+ ByteOp | SrcImm | DstMem | ModRM, 0,
+ ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+ 0, 0, 0, 0,
+ [Group3*8] =
+ DstMem | SrcImm | ModRM | SrcImm, 0,
+ DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+ 0, 0, 0, 0,
+ [Group4*8] =
+ ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+ 0, 0, 0, 0, 0, 0,
+ [Group5*8] =
+ DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0,
+ SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0,
+ [Group7*8] =
+ 0, 0, ModRM | SrcMem, ModRM | SrcMem,
+ SrcNone | ModRM | DstMem, 0, SrcMem | ModRM, SrcMem | ModRM | ByteOp,
+};
+
+static u16 group2_table[] = {
+ [Group7*8] =
+ SrcNone | ModRM, 0, 0, 0, SrcNone | ModRM | DstMem, 0, SrcMem | ModRM, 0,
+};
+
/* EFLAGS bit definitions. */
#define EFLG_OF (1<<11)
#define EFLG_DF (1<<10)
@@ -317,7 +371,7 @@ static u16 twobyte_table[256] = {
#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
do { \
- unsigned long _tmp; \
+ unsigned long __tmp; \
switch ((_dst).bytes) { \
case 1: \
__asm__ __volatile__ ( \
@@ -325,7 +379,7 @@ static u16 twobyte_table[256] = {
_op"b %"_bx"3,%1; " \
_POST_EFLAGS("0", "4", "2") \
: "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
+ "=&r" (__tmp) \
: _by ((_src).val), "i" (EFLAGS_MASK)); \
break; \
default: \
@@ -426,29 +480,40 @@ static u16 twobyte_table[256] = {
(_type)_x; \
})
+static inline unsigned long ad_mask(struct decode_cache *c)
+{
+ return (1UL << (c->ad_bytes << 3)) - 1;
+}
+
/* Access/update address held in a register, based on addressing mode. */
-#define address_mask(reg) \
- ((c->ad_bytes == sizeof(unsigned long)) ? \
- (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
-#define register_address(base, reg) \
- ((base) + address_mask(reg))
-#define register_address_increment(reg, inc) \
- do { \
- /* signed type ensures sign extension to long */ \
- int _inc = (inc); \
- if (c->ad_bytes == sizeof(unsigned long)) \
- (reg) += _inc; \
- else \
- (reg) = ((reg) & \
- ~((1UL << (c->ad_bytes << 3)) - 1)) | \
- (((reg) + _inc) & \
- ((1UL << (c->ad_bytes << 3)) - 1)); \
- } while (0)
+static inline unsigned long
+address_mask(struct decode_cache *c, unsigned long reg)
+{
+ if (c->ad_bytes == sizeof(unsigned long))
+ return reg;
+ else
+ return reg & ad_mask(c);
+}
-#define JMP_REL(rel) \
- do { \
- register_address_increment(c->eip, rel); \
- } while (0)
+static inline unsigned long
+register_address(struct decode_cache *c, unsigned long base, unsigned long reg)
+{
+ return base + address_mask(c, reg);
+}
+
+static inline void
+register_address_increment(struct decode_cache *c, unsigned long *reg, int inc)
+{
+ if (c->ad_bytes == sizeof(unsigned long))
+ *reg += inc;
+ else
+ *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c));
+}
+
+static inline void jmp_rel(struct decode_cache *c, int rel)
+{
+ register_address_increment(c, &c->eip, rel);
+}
static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
@@ -763,7 +828,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
struct decode_cache *c = &ctxt->decode;
int rc = 0;
int mode = ctxt->mode;
- int def_op_bytes, def_ad_bytes;
+ int def_op_bytes, def_ad_bytes, group;
/* Shadow copy of register state. Committed on successful emulation. */
@@ -864,12 +929,24 @@ done_prefixes:
c->b = insn_fetch(u8, 1, c->eip);
c->d = twobyte_table[c->b];
}
+ }
- /* Unrecognised? */
- if (c->d == 0) {
- DPRINTF("Cannot emulate %02x\n", c->b);
- return -1;
- }
+ if (c->d & Group) {
+ group = c->d & GroupMask;
+ c->modrm = insn_fetch(u8, 1, c->eip);
+ --c->eip;
+
+ group = (group << 3) + ((c->modrm >> 3) & 7);
+ if ((c->d & GroupDual) && (c->modrm >> 6) == 3)
+ c->d = group2_table[group];
+ else
+ c->d = group_table[group];
+ }
+
+ /* Unrecognised? */
+ if (c->d == 0) {
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ return -1;
}
if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
@@ -984,8 +1061,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
c->dst.type = OP_MEM;
c->dst.bytes = c->op_bytes;
c->dst.val = c->src.val;
- register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
- c->dst.ptr = (void *) register_address(ctxt->ss_base,
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
+ c->dst.ptr = (void *) register_address(c, ctxt->ss_base,
c->regs[VCPU_REGS_RSP]);
}
@@ -995,13 +1072,13 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
struct decode_cache *c = &ctxt->decode;
int rc;
- rc = ops->read_std(register_address(ctxt->ss_base,
+ rc = ops->read_std(register_address(c, ctxt->ss_base,
c->regs[VCPU_REGS_RSP]),
&c->dst.val, c->dst.bytes, ctxt->vcpu);
if (rc != 0)
return rc;
- register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes);
return 0;
}
@@ -1043,26 +1120,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
switch (c->modrm_reg) {
case 0 ... 1: /* test */
- /*
- * Special case in Grp3: test has an immediate
- * source operand.
- */
- c->src.type = OP_IMM;
- c->src.ptr = (unsigned long *)c->eip;
- c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- if (c->src.bytes == 8)
- c->src.bytes = 4;
- switch (c->src.bytes) {
- case 1:
- c->src.val = insn_fetch(s8, 1, c->eip);
- break;
- case 2:
- c->src.val = insn_fetch(s16, 2, c->eip);
- break;
- case 4:
- c->src.val = insn_fetch(s32, 4, c->eip);
- break;
- }
emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
break;
case 2: /* not */
@@ -1076,7 +1133,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
rc = X86EMUL_UNHANDLEABLE;
break;
}
-done:
return rc;
}
@@ -1084,7 +1140,6 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- int rc;
switch (c->modrm_reg) {
case 0: /* inc */
@@ -1094,36 +1149,11 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
emulate_1op("dec", c->dst, ctxt->eflags);
break;
case 4: /* jmp abs */
- if (c->b == 0xff)
- c->eip = c->dst.val;
- else {
- DPRINTF("Cannot emulate %02x\n", c->b);
- return X86EMUL_UNHANDLEABLE;
- }
+ c->eip = c->src.val;
break;
case 6: /* push */
-
- /* 64-bit mode: PUSH always pushes a 64-bit operand. */
-
- if (ctxt->mode == X86EMUL_MODE_PROT64) {
- c->dst.bytes = 8;
- rc = ops->read_std((unsigned long)c->dst.ptr,
- &c->dst.val, 8, ctxt->vcpu);
- if (rc != 0)
- return rc;
- }
- register_address_increment(c->regs[VCPU_REGS_RSP],
- -c->dst.bytes);
- rc = ops->write_emulated(register_address(ctxt->ss_base,
- c->regs[VCPU_REGS_RSP]), &c->dst.val,
- c->dst.bytes, ctxt->vcpu);
- if (rc != 0)
- return rc;
- c->dst.type = OP_NONE;
+ emulate_push(ctxt);
break;
- default:
- DPRINTF("Cannot emulate %02x\n", c->b);
- return X86EMUL_UNHANDLEABLE;
}
return 0;
}
@@ -1361,19 +1391,19 @@ special_insn:
c->dst.type = OP_MEM;
c->dst.bytes = c->op_bytes;
c->dst.val = c->src.val;
- register_address_increment(c->regs[VCPU_REGS_RSP],
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP],
-c->op_bytes);
c->dst.ptr = (void *) register_address(
- ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
+ c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
break;
case 0x58 ... 0x5f: /* pop reg */
pop_instruction:
- if ((rc = ops->read_std(register_address(ctxt->ss_base,
+ if ((rc = ops->read_std(register_address(c, ctxt->ss_base,
c->regs[VCPU_REGS_RSP]), c->dst.ptr,
c->op_bytes, ctxt->vcpu)) != 0)
goto done;
- register_address_increment(c->regs[VCPU_REGS_RSP],
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP],
c->op_bytes);
c->dst.type = OP_NONE; /* Disable writeback. */
break;
@@ -1393,9 +1423,9 @@ special_insn:
1,
(c->d & ByteOp) ? 1 : c->op_bytes,
c->rep_prefix ?
- address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+ address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
(ctxt->eflags & EFLG_DF),
- register_address(ctxt->es_base,
+ register_address(c, ctxt->es_base,
c->regs[VCPU_REGS_RDI]),
c->rep_prefix,
c->regs[VCPU_REGS_RDX]) == 0) {
@@ -1409,9 +1439,9 @@ special_insn:
0,
(c->d & ByteOp) ? 1 : c->op_bytes,
c->rep_prefix ?
- address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+ address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
(ctxt->eflags & EFLG_DF),
- register_address(c->override_base ?
+ register_address(c, c->override_base ?
*c->override_base :
ctxt->ds_base,
c->regs[VCPU_REGS_RSI]),
@@ -1425,7 +1455,7 @@ special_insn:
int rel = insn_fetch(s8, 1, c->eip);
if (test_cc(c->b, ctxt->eflags))
- JMP_REL(rel);
+ jmp_rel(c, rel);
break;
}
case 0x80 ... 0x83: /* Grp1 */
@@ -1501,27 +1531,27 @@ special_insn:
case 0xa4 ... 0xa5: /* movs */
c->dst.type = OP_MEM;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)register_address(
+ c->dst.ptr = (unsigned long *)register_address(c,
ctxt->es_base,
c->regs[VCPU_REGS_RDI]);
- if ((rc = ops->read_emulated(register_address(
+ if ((rc = ops->read_emulated(register_address(c,
c->override_base ? *c->override_base :
ctxt->ds_base,
c->regs[VCPU_REGS_RSI]),
&c->dst.val,
c->dst.bytes, ctxt->vcpu)) != 0)
goto done;
- register_address_increment(c->regs[VCPU_REGS_RSI],
+ register_address_increment(c, &c->regs[VCPU_REGS_RSI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
: c->dst.bytes);
- register_address_increment(c->regs[VCPU_REGS_RDI],
+ register_address_increment(c, &c->regs[VCPU_REGS_RDI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
: c->dst.bytes);
break;
case 0xa6 ... 0xa7: /* cmps */
c->src.type = OP_NONE; /* Disable writeback. */
c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->src.ptr = (unsigned long *)register_address(
+ c->src.ptr = (unsigned long *)register_address(c,
c->override_base ? *c->override_base :
ctxt->ds_base,
c->regs[VCPU_REGS_RSI]);
@@ -1533,7 +1563,7 @@ special_insn:
c->dst.type = OP_NONE; /* Disable writeback. */
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)register_address(
+ c->dst.ptr = (unsigned long *)register_address(c,
ctxt->es_base,
c->regs[VCPU_REGS_RDI]);
if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
@@ -1546,10 +1576,10 @@ special_insn:
emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
- register_address_increment(c->regs[VCPU_REGS_RSI],
+ register_address_increment(c, &c->regs[VCPU_REGS_RSI],
(ctxt->eflags & EFLG_DF) ? -c->src.bytes
: c->src.bytes);
- register_address_increment(c->regs[VCPU_REGS_RDI],
+ register_address_increment(c, &c->regs[VCPU_REGS_RDI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
: c->dst.bytes);
@@ -1557,11 +1587,11 @@ special_insn:
case 0xaa ... 0xab: /* stos */
c->dst.type = OP_MEM;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)register_address(
+ c->dst.ptr = (unsigned long *)register_address(c,
ctxt->es_base,
c->regs[VCPU_REGS_RDI]);
c->dst.val = c->regs[VCPU_REGS_RAX];
- register_address_increment(c->regs[VCPU_REGS_RDI],
+ register_address_increment(c, &c->regs[VCPU_REGS_RDI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
: c->dst.bytes);
break;
@@ -1569,7 +1599,7 @@ special_insn:
c->dst.type = OP_REG;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
- if ((rc = ops->read_emulated(register_address(
+ if ((rc = ops->read_emulated(register_address(c,
c->override_base ? *c->override_base :
ctxt->ds_base,
c->regs[VCPU_REGS_RSI]),
@@ -1577,7 +1607,7 @@ special_insn:
c->dst.bytes,
ctxt->vcpu)) != 0)
goto done;
- register_address_increment(c->regs[VCPU_REGS_RSI],
+ register_address_increment(c, &c->regs[VCPU_REGS_RSI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
: c->dst.bytes);
break;
@@ -1616,14 +1646,14 @@ special_insn:
goto cannot_emulate;
}
c->src.val = (unsigned long) c->eip;
- JMP_REL(rel);
+ jmp_rel(c, rel);
c->op_bytes = c->ad_bytes;
emulate_push(ctxt);
break;
}
case 0xe9: /* jmp rel */
case 0xeb: /* jmp rel short */
- JMP_REL(c->src.val);
+ jmp_rel(c, c->src.val);
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xf4: /* hlt */
@@ -1823,7 +1853,7 @@ twobyte_insn:
goto cannot_emulate;
}
if (test_cc(c->b, ctxt->eflags))
- JMP_REL(rel);
+ jmp_rel(c, rel);
c->dst.type = OP_NONE;
break;
}