summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/virtual/kvm/mmu.txt79
-rw-r--r--arch/x86/include/asm/kvm_host.h11
-rw-r--r--arch/x86/kvm/lapic.c53
-rw-r--r--arch/x86/kvm/lapic.h6
-rw-r--r--arch/x86/kvm/mmu.c170
-rw-r--r--arch/x86/kvm/mmu.h17
-rw-r--r--arch/x86/kvm/mmutrace.h34
-rw-r--r--arch/x86/kvm/paging_tmpl.h10
-rw-r--r--arch/x86/kvm/svm.c10
-rw-r--r--arch/x86/kvm/trace.h21
-rw-r--r--arch/x86/kvm/vmx.c19
-rw-r--r--arch/x86/kvm/x86.c12
-rw-r--r--virt/kvm/irq_comm.c9
13 files changed, 354 insertions, 97 deletions
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index 869abcc48315..290894176142 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -210,6 +210,24 @@ Shadow pages contain the following information:
A bitmap indicating which sptes in spt point (directly or indirectly) at
pages that may be unsynchronized. Used to quickly locate all unsychronized
pages reachable from a given page.
+ mmu_valid_gen:
+ Generation number of the page. It is compared with kvm->arch.mmu_valid_gen
+ during hash table lookup, and used to skip invalidated shadow pages (see
+ "Zapping all pages" below.)
+ clear_spte_count:
+ Only present on 32-bit hosts, where a 64-bit spte cannot be written
+ atomically. The reader uses this while running out of the MMU lock
+ to detect in-progress updates and retry them until the writer has
+ finished the write.
+ write_flooding_count:
+ A guest may write to a page table many times, causing a lot of
+ emulations if the page needs to be write-protected (see "Synchronized
+ and unsynchronized pages" below). Leaf pages can be unsynchronized
+ so that they do not trigger frequent emulation, but this is not
+ possible for non-leafs. This field counts the number of emulations
+ since the last time the page table was actually used; if emulation
+ is triggered too frequently on this page, KVM will unmap the page
+ to avoid emulation in the future.
Reverse map
===========
@@ -258,14 +276,26 @@ This is the most complicated event. The cause of a page fault can be:
Handling a page fault is performed as follows:
+ - if the RSV bit of the error code is set, the page fault is caused by guest
+ accessing MMIO and cached MMIO information is available.
+ - walk shadow page table
+ - check for valid generation number in the spte (see "Fast invalidation of
+ MMIO sptes" below)
+ - cache the information to vcpu->arch.mmio_gva, vcpu->arch.access and
+ vcpu->arch.mmio_gfn, and call the emulator
+ - If both P bit and R/W bit of error code are set, this could possibly
+ be handled as a "fast page fault" (fixed without taking the MMU lock). See
+ the description in Documentation/virtual/kvm/locking.txt.
- if needed, walk the guest page tables to determine the guest translation
(gva->gpa or ngpa->gpa)
- if permissions are insufficient, reflect the fault back to the guest
- determine the host page
- - if this is an mmio request, there is no host page; call the emulator
- to emulate the instruction instead
+ - if this is an mmio request, there is no host page; cache the info to
+ vcpu->arch.mmio_gva, vcpu->arch.access and vcpu->arch.mmio_gfn
- walk the shadow page table to find the spte for the translation,
instantiating missing intermediate page tables as necessary
+ - If this is an mmio request, cache the mmio info to the spte and set some
+ reserved bit on the spte (see callers of kvm_mmu_set_mmio_spte_mask)
- try to unsynchronize the page
- if successful, we can let the guest continue and modify the gpte
- emulate the instruction
@@ -351,6 +381,51 @@ causes its write_count to be incremented, thus preventing instantiation of
a large spte. The frames at the end of an unaligned memory slot have
artificially inflated ->write_counts so they can never be instantiated.
+Zapping all pages (page generation count)
+=========================================
+
+For the large memory guests, walking and zapping all pages is really slow
+(because there are a lot of pages), and also blocks memory accesses of
+all VCPUs because it needs to hold the MMU lock.
+
+To make it be more scalable, kvm maintains a global generation number
+which is stored in kvm->arch.mmu_valid_gen. Every shadow page stores
+the current global generation-number into sp->mmu_valid_gen when it
+is created. Pages with a mismatching generation number are "obsolete".
+
+When KVM need zap all shadow pages sptes, it just simply increases the global
+generation-number then reload root shadow pages on all vcpus. As the VCPUs
+create new shadow page tables, the old pages are not used because of the
+mismatching generation number.
+
+KVM then walks through all pages and zaps obsolete pages. While the zap
+operation needs to take the MMU lock, the lock can be released periodically
+so that the VCPUs can make progress.
+
+Fast invalidation of MMIO sptes
+===============================
+
+As mentioned in "Reaction to events" above, kvm will cache MMIO
+information in leaf sptes. When a new memslot is added or an existing
+memslot is changed, this information may become stale and needs to be
+invalidated. This also needs to hold the MMU lock while walking all
+shadow pages, and is made more scalable with a similar technique.
+
+MMIO sptes have a few spare bits, which are used to store a
+generation number. The global generation number is stored in
+kvm_memslots(kvm)->generation, and increased whenever guest memory info
+changes. This generation number is distinct from the one described in
+the previous section.
+
+When KVM finds an MMIO spte, it checks the generation number of the spte.
+If the generation number of the spte does not equal the global generation
+number, it will ignore the cached MMIO information and handle the page
+fault through the slow path.
+
+Since only 19 bits are used to store generation-number on mmio spte, all
+pages are zapped when there is an overflow.
+
+
Further reading
===============
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1f98c1bb5b7a..280e3271b027 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -222,15 +222,22 @@ struct kvm_mmu_page {
int root_count; /* Currently serving as active root */
unsigned int unsync_children;
unsigned long parent_ptes; /* Reverse mapping for parent_pte */
+
+ /* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */
unsigned long mmu_valid_gen;
+
DECLARE_BITMAP(unsync_child_bitmap, 512);
#ifdef CONFIG_X86_32
+ /*
+ * Used out of the mmu-lock to avoid reading spte values while an
+ * update is in progress; see the comments in __get_spte_lockless().
+ */
int clear_spte_count;
#endif
+ /* Number of writes since the last time traversal visited this page. */
int write_flooding_count;
- bool mmio_cached;
};
struct kvm_pio_request {
@@ -773,7 +780,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask);
void kvm_mmu_zap_all(struct kvm *kvm);
-void kvm_mmu_zap_mmio_sptes(struct kvm *kvm);
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm);
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9d751931cf84..9f4bea805bed 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -405,17 +405,17 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
return highest_irr;
}
-static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
- int vector, int level, int trig_mode,
- unsigned long *dest_map);
+static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
+ int vector, int level, int trig_mode,
+ unsigned long *dest_map);
-void kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
- unsigned long *dest_map)
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
+ unsigned long *dest_map)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
- irq->level, irq->trig_mode, dest_map);
+ return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
+ irq->level, irq->trig_mode, dest_map);
}
static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
@@ -608,8 +608,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
*r = -1;
if (irq->shorthand == APIC_DEST_SELF) {
- kvm_apic_set_irq(src->vcpu, irq, dest_map);
- *r = 1;
+ *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
return true;
}
@@ -654,8 +653,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
continue;
if (*r < 0)
*r = 0;
- kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
- *r += 1;
+ *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
}
ret = true;
@@ -664,11 +662,15 @@ out:
return ret;
}
-/* Set an IRQ pending in the lapic. */
-static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
- int vector, int level, int trig_mode,
- unsigned long *dest_map)
+/*
+ * Add a pending IRQ into lapic.
+ * Return 1 if successfully added and 0 if discarded.
+ */
+static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
+ int vector, int level, int trig_mode,
+ unsigned long *dest_map)
{
+ int result = 0;
struct kvm_vcpu *vcpu = apic->vcpu;
switch (delivery_mode) {
@@ -682,10 +684,13 @@ static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
if (dest_map)
__set_bit(vcpu->vcpu_id, dest_map);
- if (kvm_x86_ops->deliver_posted_interrupt)
+ if (kvm_x86_ops->deliver_posted_interrupt) {
+ result = 1;
kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
- else {
- if (apic_test_and_set_irr(vector, apic)) {
+ } else {
+ result = !apic_test_and_set_irr(vector, apic);
+
+ if (!result) {
if (trig_mode)
apic_debug("level trig mode repeatedly "
"for vector %d", vector);
@@ -697,7 +702,7 @@ static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
}
out:
trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
- trig_mode, vector, false);
+ trig_mode, vector, !result);
break;
case APIC_DM_REMRD:
@@ -709,12 +714,14 @@ out:
break;
case APIC_DM_NMI:
+ result = 1;
kvm_inject_nmi(vcpu);
kvm_vcpu_kick(vcpu);
break;
case APIC_DM_INIT:
if (!trig_mode || level) {
+ result = 1;
/* assumes that there are only KVM_APIC_INIT/SIPI */
apic->pending_events = (1UL << KVM_APIC_INIT);
/* make sure pending_events is visible before sending
@@ -731,6 +738,7 @@ out:
case APIC_DM_STARTUP:
apic_debug("SIPI to vcpu %d vector 0x%02x\n",
vcpu->vcpu_id, vector);
+ result = 1;
apic->sipi_vector = vector;
/* make sure sipi_vector is visible for the receiver */
smp_wmb();
@@ -752,6 +760,7 @@ out:
delivery_mode);
break;
}
+ return result;
}
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
@@ -1461,7 +1470,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
return 0;
}
-void kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
{
u32 reg = kvm_apic_get_reg(apic, lvt_type);
int vector, mode, trig_mode;
@@ -1470,8 +1479,10 @@ void kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
vector = reg & APIC_VECTOR_MASK;
mode = reg & APIC_MODE_MASK;
trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
- __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL);
+ return __apic_accept_irq(apic, mode, vector, 1, trig_mode,
+ NULL);
}
+ return 0;
}
void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 61a73a01ab0b..c730ac9fe801 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -57,9 +57,9 @@ void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-void kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
- unsigned long *dest_map);
-void kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
+ unsigned long *dest_map);
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6941fa74eb35..0d094da49541 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -197,15 +197,63 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
-static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+/*
+ * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number,
+ * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation
+ * number.
+ */
+#define MMIO_SPTE_GEN_LOW_SHIFT 3
+#define MMIO_SPTE_GEN_HIGH_SHIFT 52
+
+#define MMIO_GEN_SHIFT 19
+#define MMIO_GEN_LOW_SHIFT 9
+#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1)
+#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
+#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1)
+
+static u64 generation_mmio_spte_mask(unsigned int gen)
{
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ u64 mask;
+
+ WARN_ON(gen > MMIO_MAX_GEN);
+
+ mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
+ mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
+ return mask;
+}
+
+static unsigned int get_mmio_spte_generation(u64 spte)
+{
+ unsigned int gen;
+
+ spte &= ~shadow_mmio_mask;
+
+ gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
+ gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
+ return gen;
+}
+
+static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+{
+ /*
+ * Init kvm generation close to MMIO_MAX_GEN to easily test the
+ * code of handling generation number wrap-around.
+ */
+ return (kvm_memslots(kvm)->generation +
+ MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
+}
+
+static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
+ unsigned access)
+{
+ unsigned int gen = kvm_current_mmio_generation(kvm);
+ u64 mask = generation_mmio_spte_mask(gen);
access &= ACC_WRITE_MASK | ACC_USER_MASK;
+ mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;
- sp->mmio_cached = true;
- trace_mark_mmio_spte(sptep, gfn, access);
- mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
+ trace_mark_mmio_spte(sptep, gfn, access, gen);
+ mmu_spte_set(sptep, mask);
}
static bool is_mmio_spte(u64 spte)
@@ -215,24 +263,38 @@ static bool is_mmio_spte(u64 spte)
static gfn_t get_mmio_spte_gfn(u64 spte)
{
- return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+ u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+ return (spte & ~mask) >> PAGE_SHIFT;
}
static unsigned get_mmio_spte_access(u64 spte)
{
- return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+ u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+ return (spte & ~mask) & ~PAGE_MASK;
}
-static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+ pfn_t pfn, unsigned access)
{
if (unlikely(is_noslot_pfn(pfn))) {
- mark_mmio_spte(sptep, gfn, access);
+ mark_mmio_spte(kvm, sptep, gfn, access);
return true;
}
return false;
}
+static bool check_mmio_spte(struct kvm *kvm, u64 spte)
+{
+ unsigned int kvm_gen, spte_gen;
+
+ kvm_gen = kvm_current_mmio_generation(kvm);
+ spte_gen = get_mmio_spte_generation(spte);
+
+ trace_check_mmio_spte(spte, kvm_gen, spte_gen);
+ return likely(kvm_gen == spte_gen);
+}
+
static inline u64 rsvd_bits(int s, int e)
{
return ((1ULL << (e - s + 1)) - 1) << s;
@@ -404,9 +466,20 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
/*
* The idea using the light way get the spte on x86_32 guest is from
* gup_get_pte(arch/x86/mm/gup.c).
- * The difference is we can not catch the spte tlb flush if we leave
- * guest mode, so we emulate it by increase clear_spte_count when spte
- * is cleared.
+ *
+ * An spte tlb flush may be pending, because kvm_set_pte_rmapp
+ * coalesces them and we are running out of the MMU lock. Therefore
+ * we need to protect against in-progress updates of the spte.
+ *
+ * Reading the spte while an update is in progress may get the old value
+ * for the high part of the spte. The race is fine for a present->non-present
+ * change (because the high part of the spte is ignored for non-present spte),
+ * but for a present->present change we must reread the spte.
+ *
+ * All such changes are done in two steps (present->non-present and
+ * non-present->present), hence it is enough to count the number of
+ * present->non-present updates: if it changed while reading the spte,
+ * we might have hit the race. This is done using clear_spte_count.
*/
static u64 __get_spte_lockless(u64 *sptep)
{
@@ -2364,7 +2437,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
u64 spte;
int ret = 0;
- if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+ if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access))
return 0;
spte = PT_PRESENT_MASK;
@@ -3184,17 +3257,12 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
return spte;
}
-/*
- * If it is a real mmio page fault, return 1 and emulat the instruction
- * directly, return 0 to let CPU fault again on the address, -1 is
- * returned if bug is detected.
- */
int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
u64 spte;
if (quickly_check_mmio_pf(vcpu, addr, direct))
- return 1;
+ return RET_MMIO_PF_EMULATE;
spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
@@ -3202,12 +3270,15 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
gfn_t gfn = get_mmio_spte_gfn(spte);
unsigned access = get_mmio_spte_access(spte);
+ if (!check_mmio_spte(vcpu->kvm, spte))
+ return RET_MMIO_PF_INVALID;
+
if (direct)
addr = 0;
trace_handle_mmio_page_fault(addr, gfn, access);
vcpu_cache_mmio_info(vcpu, addr, gfn, access);
- return 1;
+ return RET_MMIO_PF_EMULATE;
}
/*
@@ -3215,13 +3286,13 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
* it's a BUG if the gfn is not a mmio page.
*/
if (direct && !check_direct_spte_mmio_pf(spte))
- return -1;
+ return RET_MMIO_PF_BUG;
/*
* If the page table is zapped by other cpus, let CPU fault again on
* the address.
*/
- return 0;
+ return RET_MMIO_PF_RETRY;
}
EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
@@ -3231,7 +3302,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
int ret;
ret = handle_mmio_page_fault_common(vcpu, addr, direct);
- WARN_ON(ret < 0);
+ WARN_ON(ret == RET_MMIO_PF_BUG);
return ret;
}
@@ -3243,8 +3314,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
- if (unlikely(error_code & PFERR_RSVD_MASK))
- return handle_mmio_page_fault(vcpu, gva, error_code, true);
+ if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ r = handle_mmio_page_fault(vcpu, gva, error_code, true);
+
+ if (likely(r != RET_MMIO_PF_INVALID))
+ return r;
+ }
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3320,8 +3395,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
- if (unlikely(error_code & PFERR_RSVD_MASK))
- return handle_mmio_page_fault(vcpu, gpa, error_code, true);
+ if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
+
+ if (likely(r != RET_MMIO_PF_INVALID))
+ return r;
+ }
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3427,8 +3506,8 @@ static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
*access &= mask;
}
-static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
- int *nr_present)
+static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+ unsigned access, int *nr_present)
{
if (unlikely(is_mmio_spte(*sptep))) {
if (gfn != get_mmio_spte_gfn(*sptep)) {
@@ -3437,7 +3516,7 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
}
(*nr_present)++;
- mark_mmio_spte(sptep, gfn, access);
+ mark_mmio_spte(kvm, sptep, gfn, access);
return true;
}
@@ -4294,27 +4373,24 @@ void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
spin_unlock(&kvm->mmu_lock);
}
-void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
+static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
{
- struct kvm_mmu_page *sp, *node;
- LIST_HEAD(invalid_list);
-
- spin_lock(&kvm->mmu_lock);
-restart:
- list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
- if (!sp->mmio_cached)
- continue;
- if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
- goto restart;
- }
-
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
- spin_unlock(&kvm->mmu_lock);
+ return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
}
-static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
{
- return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+ /*
+ * The very rare case: if the generation-number is round,
+ * zap all shadow pages.
+ *
+ * The max value is MMIO_MAX_GEN - 1 since it is not called
+ * when mark memslot invalid.
+ */
+ if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) {
+ printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
+ kvm_mmu_invalidate_zap_all_pages(kvm);
+ }
}
static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 922bfae77c58..5b59c573aba7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -52,6 +52,23 @@
int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+
+/*
+ * Return values of handle_mmio_page_fault_common:
+ * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
+ * directly.
+ * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
+ * fault path update the mmio spte.
+ * RET_MMIO_PF_RETRY: let CPU fault again on the address.
+ * RET_MMIO_PF_BUG: bug is detected.
+ */
+enum {
+ RET_MMIO_PF_EMULATE = 1,
+ RET_MMIO_PF_INVALID = 2,
+ RET_MMIO_PF_RETRY = 0,
+ RET_MMIO_PF_BUG = -1
+};
+
int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index eb444dd374af..9d2e0ffcb190 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -199,23 +199,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
TRACE_EVENT(
mark_mmio_spte,
- TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
- TP_ARGS(sptep, gfn, access),
+ TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen),
+ TP_ARGS(sptep, gfn, access, gen),
TP_STRUCT__entry(
__field(void *, sptep)
__field(gfn_t, gfn)
__field(unsigned, access)
+ __field(unsigned int, gen)
),
TP_fast_assign(
__entry->sptep = sptep;
__entry->gfn = gfn;
__entry->access = access;
+ __entry->gen = gen;
),
- TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn,
- __entry->access)
+ TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep,
+ __entry->gfn, __entry->access, __entry->gen)
);
TRACE_EVENT(
@@ -296,6 +298,30 @@ TRACE_EVENT(
__entry->mmu_valid_gen, __entry->mmu_used_pages
)
);
+
+
+TRACE_EVENT(
+ check_mmio_spte,
+ TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
+ TP_ARGS(spte, kvm_gen, spte_gen),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, kvm_gen)
+ __field(unsigned int, spte_gen)
+ __field(u64, spte)
+ ),
+
+ TP_fast_assign(
+ __entry->kvm_gen = kvm_gen;
+ __entry->spte_gen = spte_gen;
+ __entry->spte = spte;
+ ),
+
+ TP_printk("spte %llx kvm_gen %x spte-gen %x valid %d", __entry->spte,
+ __entry->kvm_gen, __entry->spte_gen,
+ __entry->kvm_gen == __entry->spte_gen
+ )
+);
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index da20860b457a..7769699d48a8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -552,9 +552,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
- if (unlikely(error_code & PFERR_RSVD_MASK))
- return handle_mmio_page_fault(vcpu, addr, error_code,
+ if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ r = handle_mmio_page_fault(vcpu, addr, error_code,
mmu_is_nested(vcpu));
+ if (likely(r != RET_MMIO_PF_INVALID))
+ return r;
+ };
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -792,7 +795,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
pte_access &= gpte_access(vcpu, gpte);
protect_clean_gpte(&pte_access, gpte);
- if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
+ if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
+ &nr_present))
continue;
if (gfn != sp->gfns[i]) {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a14a6eaf871d..c0bc80391e40 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1026,7 +1026,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
g_tsc_offset = svm->vmcb->control.tsc_offset -
svm->nested.hsave->control.tsc_offset;
svm->nested.hsave->control.tsc_offset = offset;
- }
+ } else
+ trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+ svm->vmcb->control.tsc_offset,
+ offset);
svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
@@ -1044,6 +1047,11 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
svm->vmcb->control.tsc_offset += adjustment;
if (is_guest_mode(vcpu))
svm->nested.hsave->control.tsc_offset += adjustment;
+ else
+ trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+ svm->vmcb->control.tsc_offset - adjustment,
+ svm->vmcb->control.tsc_offset);
+
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index fe5e00ed7036..545245d7cc63 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -756,6 +756,27 @@ TRACE_EVENT(
__entry->gpa_match ? "GPA" : "GVA")
);
+TRACE_EVENT(kvm_write_tsc_offset,
+ TP_PROTO(unsigned int vcpu_id, __u64 previous_tsc_offset,
+ __u64 next_tsc_offset),
+ TP_ARGS(vcpu_id, previous_tsc_offset, next_tsc_offset),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( __u64, previous_tsc_offset )
+ __field( __u64, next_tsc_offset )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->previous_tsc_offset = previous_tsc_offset;
+ __entry->next_tsc_offset = next_tsc_offset;
+ ),
+
+ TP_printk("vcpu=%u prev=%llu next=%llu", __entry->vcpu_id,
+ __entry->previous_tsc_offset, __entry->next_tsc_offset)
+);
+
#ifdef CONFIG_X86_64
#define host_clocks \
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 260a91939555..036e8636f685 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2096,6 +2096,8 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
(nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
vmcs12->tsc_offset : 0));
} else {
+ trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+ vmcs_read64(TSC_OFFSET), offset);
vmcs_write64(TSC_OFFSET, offset);
}
}
@@ -2103,11 +2105,14 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
{
u64 offset = vmcs_read64(TSC_OFFSET);
+
vmcs_write64(TSC_OFFSET, offset + adjustment);
if (is_guest_mode(vcpu)) {
/* Even when running L2, the adjustment needs to apply to L1 */
to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
- }
+ } else
+ trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset,
+ offset + adjustment);
}
static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
@@ -4176,10 +4181,10 @@ static void ept_set_mmio_spte_mask(void)
/*
* EPT Misconfigurations can be generated if the value of bits 2:0
* of an EPT paging-structure entry is 110b (write/execute).
- * Also, magic bits (0xffull << 49) is set to quickly identify mmio
+ * Also, magic bits (0x3ull << 62) is set to quickly identify mmio
* spte.
*/
- kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
+ kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
}
/*
@@ -5366,10 +5371,14 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
ret = handle_mmio_page_fault_common(vcpu, gpa, true);
- if (likely(ret == 1))
+ if (likely(ret == RET_MMIO_PF_EMULATE))
return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
EMULATE_DONE;
- if (unlikely(!ret))
+
+ if (unlikely(ret == RET_MMIO_PF_INVALID))
+ return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0);
+
+ if (unlikely(ret == RET_MMIO_PF_RETRY))
return 1;
/* It is the real ept misconfig */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 737c804b310c..7d71c0fb11de 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5280,7 +5280,13 @@ static void kvm_set_mmio_spte_mask(void)
* Set the reserved bits and the present bit of an paging-structure
* entry to generate page fault with PFER.RSV = 1.
*/
- mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
+ /* Mask the reserved physical address bits. */
+ mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr;
+
+ /* Bit 62 is always reserved for 32bit host. */
+ mask |= 0x3ull << 62;
+
+ /* Set the present bit. */
mask |= 1ull;
#ifdef CONFIG_X86_64
@@ -7078,8 +7084,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
* If memory slot is created, or moved, we need to clear all
* mmio sptes.
*/
- if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE))
- kvm_mmu_zap_mmio_sptes(kvm);
+ kvm_mmu_invalidate_mmio_sptes(kvm);
}
void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -7298,3 +7303,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index ef1817b61cf4..e2e6b4473a96 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -91,8 +91,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
if (!kvm_is_dm_lowest_prio(irq)) {
if (r < 0)
r = 0;
- kvm_apic_set_irq(vcpu, irq, dest_map);
- r++;
+ r += kvm_apic_set_irq(vcpu, irq, dest_map);
} else if (kvm_lapic_enabled(vcpu)) {
if (!lowest)
lowest = vcpu;
@@ -101,10 +100,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
}
}
- if (lowest) {
- kvm_apic_set_irq(lowest, irq, dest_map);
- r = 1;
- }
+ if (lowest)
+ r = kvm_apic_set_irq(lowest, irq, dest_map);
return r;
}