From bbacc0c111c3c5d1f3192b8cc1642b9c3954f80d Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 10 Dec 2012 10:33:09 -0700
Subject: KVM: Rename KVM_MEMORY_SLOTS -> KVM_USER_MEM_SLOTS

It's easy to confuse KVM_MEMORY_SLOTS and KVM_MEM_SLOTS_NUM.  One is
the user accessible slots and the other is user + private.  Make this
more obvious.

Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 76f54461f7cb..816074757c96 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2518,7 +2518,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 		r = KVM_MAX_VCPUS;
 		break;
 	case KVM_CAP_NR_MEMSLOTS:
-		r = KVM_MEMORY_SLOTS;
+		r = KVM_USER_MEM_SLOTS;
 		break;
 	case KVM_CAP_PV_MMU:	/* obsolete */
 		r = 0;
@@ -3435,7 +3435,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 	mutex_lock(&kvm->slots_lock);
 
 	r = -EINVAL;
-	if (log->slot >= KVM_MEMORY_SLOTS)
+	if (log->slot >= KVM_USER_MEM_SLOTS)
 		goto out;
 
 	memslot = id_to_memslot(kvm->memslots, log->slot);
@@ -6845,7 +6845,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
 
 	/* Prevent internal slot pages from being moved by fork()/COW. */
-	if (memslot->id >= KVM_MEMORY_SLOTS)
+	if (memslot->id >= KVM_USER_MEM_SLOTS)
 		map_flags = MAP_SHARED | MAP_ANONYMOUS;
 
 	/*To keep backward compatibility with older userspace,
-- 
cgit v1.2.3


From f82a8cfe9354f5cdea55ebeceba3fd19051d3ee8 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 10 Dec 2012 10:33:21 -0700
Subject: KVM: struct kvm_memory_slot.user_alloc -> bool

There's no need for this to be an int, it holds a boolean.
Move to the end of the struct for alignment.

Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c   |  6 +++---
 arch/powerpc/kvm/powerpc.c |  4 ++--
 arch/s390/kvm/kvm-s390.c   |  4 ++--
 arch/x86/kvm/vmx.c         |  6 +++---
 arch/x86/kvm/x86.c         |  4 ++--
 include/linux/kvm_host.h   | 12 ++++++------
 virt/kvm/kvm_main.c        |  8 ++++----
 7 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 9bacfe207b43..ad3126a58644 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -955,7 +955,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 					kvm_mem.guest_phys_addr;
 		kvm_userspace_mem.memory_size = kvm_mem.memory_size;
 		r = kvm_vm_ioctl_set_memory_region(kvm,
-					&kvm_userspace_mem, 0);
+					&kvm_userspace_mem, false);
 		if (r)
 			goto out;
 		break;
@@ -1580,7 +1580,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 		struct kvm_memory_slot *memslot,
 		struct kvm_memory_slot old,
 		struct kvm_userspace_memory_region *mem,
-		int user_alloc)
+		bool user_alloc)
 {
 	unsigned long i;
 	unsigned long pfn;
@@ -1611,7 +1611,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 		struct kvm_userspace_memory_region *mem,
 		struct kvm_memory_slot old,
-		int user_alloc)
+		bool user_alloc)
 {
 	return;
 }
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 70739a089560..be83fca2e8fd 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -412,7 +412,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                    struct kvm_memory_slot *memslot,
                                    struct kvm_memory_slot old,
                                    struct kvm_userspace_memory_region *mem,
-                                   int user_alloc)
+                                   bool user_alloc)
 {
 	return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
 }
@@ -420,7 +420,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 void kvm_arch_commit_memory_region(struct kvm *kvm,
                struct kvm_userspace_memory_region *mem,
                struct kvm_memory_slot old,
-               int user_alloc)
+               bool user_alloc)
 {
 	kvmppc_core_commit_memory_region(kvm, mem, old);
 }
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index c9011bfaabbe..f718bc65835c 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -928,7 +928,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
 				   struct kvm_memory_slot old,
 				   struct kvm_userspace_memory_region *mem,
-				   int user_alloc)
+				   bool user_alloc)
 {
 	/* A few sanity checks. We can have exactly one memory slot which has
 	   to start at guest virtual zero and which has to be located at a
@@ -958,7 +958,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem,
 				struct kvm_memory_slot old,
-				int user_alloc)
+				bool user_alloc)
 {
 	int rc;
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9120ae1901e4..b3101e368079 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3667,7 +3667,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
 	kvm_userspace_mem.flags = 0;
 	kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
 	kvm_userspace_mem.memory_size = PAGE_SIZE;
-	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
+	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
 	if (r)
 		goto out;
 
@@ -3697,7 +3697,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
 	kvm_userspace_mem.guest_phys_addr =
 		kvm->arch.ept_identity_map_addr;
 	kvm_userspace_mem.memory_size = PAGE_SIZE;
-	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
+	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
 	if (r)
 		goto out;
 
@@ -4251,7 +4251,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
 		.flags = 0,
 	};
 
-	ret = kvm_set_memory_region(kvm, &tss_mem, 0);
+	ret = kvm_set_memory_region(kvm, &tss_mem, false);
 	if (ret)
 		return ret;
 	kvm->arch.tss_addr = addr;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 816074757c96..1c9c834b72f0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6839,7 +6839,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
 				struct kvm_memory_slot old,
 				struct kvm_userspace_memory_region *mem,
-				int user_alloc)
+				bool user_alloc)
 {
 	int npages = memslot->npages;
 	int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
@@ -6875,7 +6875,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem,
 				struct kvm_memory_slot old,
-				int user_alloc)
+				bool user_alloc)
 {
 
 	int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5a3581ceb036..d897f035749f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -270,8 +270,8 @@ struct kvm_memory_slot {
 	unsigned long *dirty_bitmap;
 	struct kvm_arch_memory_slot arch;
 	unsigned long userspace_addr;
-	int user_alloc;
 	int id;
+	bool user_alloc;
 };
 
 static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
@@ -451,10 +451,10 @@ id_to_memslot(struct kvm_memslots *slots, int id)
 
 int kvm_set_memory_region(struct kvm *kvm,
 			  struct kvm_userspace_memory_region *mem,
-			  int user_alloc);
+			  bool user_alloc);
 int __kvm_set_memory_region(struct kvm *kvm,
 			    struct kvm_userspace_memory_region *mem,
-			    int user_alloc);
+			    bool user_alloc);
 void kvm_arch_free_memslot(struct kvm_memory_slot *free,
 			   struct kvm_memory_slot *dont);
 int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages);
@@ -462,11 +462,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
 				struct kvm_memory_slot old,
 				struct kvm_userspace_memory_region *mem,
-				int user_alloc);
+				bool user_alloc);
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem,
 				struct kvm_memory_slot old,
-				int user_alloc);
+				bool user_alloc);
 bool kvm_largepages_enabled(void);
 void kvm_disable_largepages(void);
 /* flush all memory translations */
@@ -553,7 +553,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
 				   struct
 				   kvm_userspace_memory_region *mem,
-				   int user_alloc);
+				   bool user_alloc);
 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level);
 long kvm_arch_vm_ioctl(struct file *filp,
 		       unsigned int ioctl, unsigned long arg);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5f0638cb6968..42c1eb73a0e3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -709,7 +709,7 @@ static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
  */
 int __kvm_set_memory_region(struct kvm *kvm,
 			    struct kvm_userspace_memory_region *mem,
-			    int user_alloc)
+			    bool user_alloc)
 {
 	int r;
 	gfn_t base_gfn;
@@ -889,7 +889,7 @@ EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
 
 int kvm_set_memory_region(struct kvm *kvm,
 			  struct kvm_userspace_memory_region *mem,
-			  int user_alloc)
+			  bool user_alloc)
 {
 	int r;
 
@@ -903,7 +903,7 @@ EXPORT_SYMBOL_GPL(kvm_set_memory_region);
 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
 				   struct
 				   kvm_userspace_memory_region *mem,
-				   int user_alloc)
+				   bool user_alloc)
 {
 	if (mem->slot >= KVM_USER_MEM_SLOTS)
 		return -EINVAL;
@@ -2148,7 +2148,7 @@ static long kvm_vm_ioctl(struct file *filp,
 						sizeof kvm_userspace_mem))
 			goto out;
 
-		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
+		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, true);
 		break;
 	}
 	case KVM_GET_DIRTY_LOG: {
-- 
cgit v1.2.3


From c972f3b125d8818748429b94cd2e59f473943a33 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Date: Tue, 8 Jan 2013 19:43:28 +0900
Subject: KVM: Write protect the updated slot only when dirty logging is
 enabled

Calling kvm_mmu_slot_remove_write_access() for a deleted slot does
nothing but search for non-existent mmu pages which have mappings to
that deleted memory; this is safe but a waste of time.

Since we want to make the function rmap based in a later patch, in a
manner which makes it unsafe to be called for a deleted slot, we makes
the caller see if the slot is non-zero and being dirty logged.

Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/x86.c  | 8 +++++++-
 virt/kvm/kvm_main.c | 1 -
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1c9c834b72f0..add5e4801968 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6897,7 +6897,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	spin_lock(&kvm->mmu_lock);
 	if (nr_mmu_pages)
 		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
-	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+	/*
+	 * Write protect all pages for dirty logging.
+	 * Existing largepage mappings are destroyed here and new ones will
+	 * not be created until the end of the logging.
+	 */
+	if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
+		kvm_mmu_slot_remove_write_access(kvm, mem->slot);
 	spin_unlock(&kvm->mmu_lock);
 	/*
 	 * If memory slot is created, or moved, we need to clear all
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e45c20ca422a..f689a6d7f9d4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -817,7 +817,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
 		if (kvm_create_dirty_bitmap(&new) < 0)
 			goto out_free;
-		/* destroy any largepage mappings for dirty tracking */
 	}
 
 	if (!npages || base_gfn != old.base_gfn) {
-- 
cgit v1.2.3


From b34cb590fb099f7929dd78d9464b70319ee12a98 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Date: Tue, 8 Jan 2013 19:46:07 +0900
Subject: KVM: Make kvm_mmu_change_mmu_pages() take mmu_lock by itself

No reason to make callers take mmu_lock since we do not need to protect
kvm_mmu_change_mmu_pages() and kvm_mmu_slot_remove_write_access()
together by mmu_lock in kvm_arch_commit_memory_region(): the former
calls kvm_mmu_commit_zap_page() and flushes TLBs by itself.

Note: we do not need to protect kvm->arch.n_requested_mmu_pages by
mmu_lock as can be seen from the fact that it is read locklessly.

Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/mmu.c | 4 ++++
 arch/x86/kvm/x86.c | 9 ++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9c1b2d6158bf..f5572804f594 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2143,6 +2143,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 	 * change the value
 	 */
 
+	spin_lock(&kvm->mmu_lock);
+
 	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
 		while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
 			!list_empty(&kvm->arch.active_mmu_pages)) {
@@ -2157,6 +2159,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 	}
 
 	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
+
+	spin_unlock(&kvm->mmu_lock);
 }
 
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index add5e4801968..080bbdcbf2ee 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3270,12 +3270,10 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
 		return -EINVAL;
 
 	mutex_lock(&kvm->slots_lock);
-	spin_lock(&kvm->mmu_lock);
 
 	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
 	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
 
-	spin_unlock(&kvm->mmu_lock);
 	mutex_unlock(&kvm->slots_lock);
 	return 0;
 }
@@ -6894,7 +6892,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	if (!kvm->arch.n_requested_mmu_pages)
 		nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
 
-	spin_lock(&kvm->mmu_lock);
 	if (nr_mmu_pages)
 		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
 	/*
@@ -6902,9 +6899,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	 * Existing largepage mappings are destroyed here and new ones will
 	 * not be created until the end of the logging.
 	 */
-	if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
+	if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) {
+		spin_lock(&kvm->mmu_lock);
 		kvm_mmu_slot_remove_write_access(kvm, mem->slot);
-	spin_unlock(&kvm->mmu_lock);
+		spin_unlock(&kvm->mmu_lock);
+	}
 	/*
 	 * If memory slot is created, or moved, we need to clear all
 	 * mmio sptes.
-- 
cgit v1.2.3


From 9d1beefb71146bbf5f820ab17c450808b0d0b2df Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Date: Tue, 8 Jan 2013 19:46:48 +0900
Subject: KVM: Make kvm_mmu_slot_remove_write_access() take mmu_lock by itself

Better to place mmu_lock handling and TLB flushing code together since
this is a self-contained function.

Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/mmu.c | 3 +++
 arch/x86/kvm/x86.c | 5 +----
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f5572804f594..e5dcae31cebc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4173,6 +4173,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 	memslot = id_to_memslot(kvm->memslots, slot);
 	last_gfn = memslot->base_gfn + memslot->npages - 1;
 
+	spin_lock(&kvm->mmu_lock);
+
 	for (i = PT_PAGE_TABLE_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 		unsigned long *rmapp;
@@ -4188,6 +4190,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 	}
 
 	kvm_flush_remote_tlbs(kvm);
+	spin_unlock(&kvm->mmu_lock);
 }
 
 void kvm_mmu_zap_all(struct kvm *kvm)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 080bbdcbf2ee..54832280cdef 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6899,11 +6899,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	 * Existing largepage mappings are destroyed here and new ones will
 	 * not be created until the end of the logging.
 	 */
-	if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) {
-		spin_lock(&kvm->mmu_lock);
+	if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
 		kvm_mmu_slot_remove_write_access(kvm, mem->slot);
-		spin_unlock(&kvm->mmu_lock);
-	}
 	/*
 	 * If memory slot is created, or moved, we need to clear all
 	 * mmio sptes.
-- 
cgit v1.2.3


From 22368028febf81f5c6d1e9fdd737d50543219b00 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Sun, 13 Jan 2013 23:44:12 +0800
Subject: KVM: x86: clean up reexecute_instruction

Little cleanup for reexecute_instruction, also use gpa_to_gfn in
retry_instruction

Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 54832280cdef..f0288c52d5e4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4759,19 +4759,18 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
 	if (tdp_enabled)
 		return false;
 
+	gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+	if (gpa == UNMAPPED_GVA)
+		return true; /* let cpu generate fault */
+
 	/*
 	 * if emulation was due to access to shadowed page table
 	 * and it failed try to unshadow page and re-enter the
 	 * guest to let CPU execute the instruction.
 	 */
-	if (kvm_mmu_unprotect_page_virt(vcpu, gva))
+	if (kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)))
 		return true;
 
-	gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
-
-	if (gpa == UNMAPPED_GVA)
-		return true; /* let cpu generate fault */
-
 	/*
 	 * Do not retry the unhandleable instruction if it faults on the
 	 * readonly host memory, otherwise it will goto a infinite loop:
@@ -4826,7 +4825,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
 	if (!vcpu->arch.mmu.direct_map)
 		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
 
-	kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
 
 	return true;
 }
-- 
cgit v1.2.3


From 95b3cf69bdf8b27a02d878e24ca353cebb4e009e Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Sun, 13 Jan 2013 23:46:52 +0800
Subject: KVM: x86: let reexecute_instruction work for tdp

Currently, reexecute_instruction refused to retry all instructions if
tdp is enabled. If nested npt is used, the emulation may be caused by
shadow page, it can be fixed by dropping the shadow page. And the only
condition that tdp can not retry the instruction is the access fault
on error pfn

Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 61 ++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 43 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f0288c52d5e4..6f9cab071eca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4751,25 +4751,25 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
 	return r;
 }
 
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2)
 {
-	gpa_t gpa;
+	gpa_t gpa = cr2;
 	pfn_t pfn;
 
-	if (tdp_enabled)
-		return false;
-
-	gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
-	if (gpa == UNMAPPED_GVA)
-		return true; /* let cpu generate fault */
+	if (!vcpu->arch.mmu.direct_map) {
+		/*
+		 * Write permission should be allowed since only
+		 * write access need to be emulated.
+		 */
+		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
 
-	/*
-	 * if emulation was due to access to shadowed page table
-	 * and it failed try to unshadow page and re-enter the
-	 * guest to let CPU execute the instruction.
-	 */
-	if (kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)))
-		return true;
+		/*
+		 * If the mapping is invalid in guest, let cpu retry
+		 * it to generate fault.
+		 */
+		if (gpa == UNMAPPED_GVA)
+			return true;
+	}
 
 	/*
 	 * Do not retry the unhandleable instruction if it faults on the
@@ -4778,12 +4778,37 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
 	 * instruction -> ...
 	 */
 	pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
-	if (!is_error_noslot_pfn(pfn)) {
-		kvm_release_pfn_clean(pfn);
+
+	/*
+	 * If the instruction failed on the error pfn, it can not be fixed,
+	 * report the error to userspace.
+	 */
+	if (is_error_noslot_pfn(pfn))
+		return false;
+
+	kvm_release_pfn_clean(pfn);
+
+	/* The instructions are well-emulated on direct mmu. */
+	if (vcpu->arch.mmu.direct_map) {
+		unsigned int indirect_shadow_pages;
+
+		spin_lock(&vcpu->kvm->mmu_lock);
+		indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
+		spin_unlock(&vcpu->kvm->mmu_lock);
+
+		if (indirect_shadow_pages)
+			kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
 		return true;
 	}
 
-	return false;
+	/*
+	 * if emulation was due to access to shadowed page table
+	 * and it failed try to unshadow page and re-enter the
+	 * guest to let CPU execute the instruction.
+	 */
+	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+	return true;
 }
 
 static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
-- 
cgit v1.2.3


From 93c05d3ef25275829d421a255271595ac219a518 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Sun, 13 Jan 2013 23:49:07 +0800
Subject: KVM: x86: improve reexecute_instruction

The current reexecute_instruction can not well detect the failed instruction
emulation. It allows guest to retry all the instructions except it accesses
on error pfn

For example, some cases are nested-write-protect - if the page we want to
write is used as PDE but it chains to itself. Under this case, we should
stop the emulation and report the case to userspace

Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  7 +++++++
 arch/x86/kvm/paging_tmpl.h      | 27 ++++++++++++++++++++-------
 arch/x86/kvm/x86.c              | 22 ++++++++++++++++++----
 3 files changed, 45 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f75e1feb6ec5..77d56a4ba89c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -497,6 +497,13 @@ struct kvm_vcpu_arch {
 		u64 msr_val;
 		struct gfn_to_hva_cache data;
 	} pv_eoi;
+
+	/*
+	 * Indicate whether the access faults on its page table in guest
+	 * which is set when fix page fault and used to detect unhandeable
+	 * instruction.
+	 */
+	bool write_fault_to_shadow_pgtable;
 };
 
 struct kvm_lpage_info {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 3d1a35237dbf..ca69dcccbe31 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -497,26 +497,34 @@ out_gpte_changed:
  * created when kvm establishes shadow page table that stop kvm using large
  * page size. Do it early can avoid unnecessary #PF and emulation.
  *
+ * @write_fault_to_shadow_pgtable will return true if the fault gfn is
+ * currently used as its page table.
+ *
  * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
  * since the PDPT is always shadowed, that means, we can not use large page
  * size to map the gfn which is used as PDPT.
  */
 static bool
 FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
-			      struct guest_walker *walker, int user_fault)
+			      struct guest_walker *walker, int user_fault,
+			      bool *write_fault_to_shadow_pgtable)
 {
 	int level;
 	gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
+	bool self_changed = false;
 
 	if (!(walker->pte_access & ACC_WRITE_MASK ||
 	      (!is_write_protection(vcpu) && !user_fault)))
 		return false;
 
-	for (level = walker->level; level <= walker->max_level; level++)
-		if (!((walker->gfn ^ walker->table_gfn[level - 1]) & mask))
-			return true;
+	for (level = walker->level; level <= walker->max_level; level++) {
+		gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
+
+		self_changed |= !(gfn & mask);
+		*write_fault_to_shadow_pgtable |= !gfn;
+	}
 
-	return false;
+	return self_changed;
 }
 
 /*
@@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	int level = PT_PAGE_TABLE_LEVEL;
 	int force_pt_level;
 	unsigned long mmu_seq;
-	bool map_writable;
+	bool map_writable, is_self_change_mapping;
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 
@@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		return 0;
 	}
 
+	vcpu->arch.write_fault_to_shadow_pgtable = false;
+
+	is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
+	      &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
+
 	if (walker.level >= PT_DIRECTORY_LEVEL)
 		force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
-		   || FNAME(is_self_change_mapping)(vcpu, &walker, user_fault);
+		   || is_self_change_mapping;
 	else
 		force_pt_level = 1;
 	if (!force_pt_level) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6f9cab071eca..e00dd0515a84 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4751,7 +4751,8 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
 	return r;
 }
 
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2)
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
+				  bool write_fault_to_shadow_pgtable)
 {
 	gpa_t gpa = cr2;
 	pfn_t pfn;
@@ -4808,7 +4809,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2)
 	 * guest to let CPU execute the instruction.
 	 */
 	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
-	return true;
+
+	/*
+	 * If the access faults on its page table, it can not
+	 * be fixed by unprotecting shadow page and it should
+	 * be reported to userspace.
+	 */
+	return !write_fault_to_shadow_pgtable;
 }
 
 static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
@@ -4867,7 +4874,13 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	int r;
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	bool writeback = true;
+	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
 
+	/*
+	 * Clear write_fault_to_shadow_pgtable here to ensure it is
+	 * never reused.
+	 */
+	vcpu->arch.write_fault_to_shadow_pgtable = false;
 	kvm_clear_exception_queue(vcpu);
 
 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
@@ -4886,7 +4899,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 		if (r != EMULATION_OK)  {
 			if (emulation_type & EMULTYPE_TRAP_UD)
 				return EMULATE_FAIL;
-			if (reexecute_instruction(vcpu, cr2))
+			if (reexecute_instruction(vcpu, cr2,
+						  write_fault_to_spt))
 				return EMULATE_DONE;
 			if (emulation_type & EMULTYPE_SKIP)
 				return EMULATE_FAIL;
@@ -4916,7 +4930,7 @@ restart:
 		return EMULATE_DONE;
 
 	if (r == EMULATION_FAILED) {
-		if (reexecute_instruction(vcpu, cr2))
+		if (reexecute_instruction(vcpu, cr2, write_fault_to_spt))
 			return EMULATE_DONE;
 
 		return handle_emulation_failure(vcpu);
-- 
cgit v1.2.3


From 378a8b099fc207ddcb91b19a8c1457667e0af398 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 21 Jan 2013 15:36:48 +0200
Subject: KVM: x86: fix use of uninitialized memory as segment descriptor in
 emulator.

If VMX reports segment as unusable, zero descriptor passed by the emulator
before returning. Such descriptor will be considered not present by the
emulator.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e00dd0515a84..b9f55299ed7e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4489,8 +4489,10 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
 	kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
 	*selector = var.selector;
 
-	if (var.unusable)
+	if (var.unusable) {
+		memset(desc, 0, sizeof(*desc));
 		return false;
+	}
 
 	if (var.g)
 		var.limit >>= 12;
-- 
cgit v1.2.3


From c7c9c56ca26f7b9458711b2d78b60b60e0d38ba7 Mon Sep 17 00:00:00 2001
From: Yang Zhang <yang.z.zhang@Intel.com>
Date: Fri, 25 Jan 2013 10:18:51 +0800
Subject: x86, apicv: add virtual interrupt delivery support

Virtual interrupt delivery avoids KVM to inject vAPIC interrupts
manually, which is fully taken care of by the hardware. This needs
some special awareness into existing interrupr injection path:

- for pending interrupt, instead of direct injection, we may need
  update architecture specific indicators before resuming to guest.

- A pending interrupt, which is masked by ISR, should be also
  considered in above update action, since hardware will decide
  when to inject it at right time. Current has_interrupt and
  get_interrupt only returns a valid vector from injection p.o.v.

Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/ia64/kvm/lapic.h           |   6 ++
 arch/x86/include/asm/kvm_host.h |   5 ++
 arch/x86/include/asm/vmx.h      |  11 ++++
 arch/x86/kvm/irq.c              |  56 +++++++++++++++++--
 arch/x86/kvm/lapic.c            | 106 +++++++++++++++++++++++++++--------
 arch/x86/kvm/lapic.h            |  27 +++++++++
 arch/x86/kvm/svm.c              |  18 ++++++
 arch/x86/kvm/vmx.c              | 119 ++++++++++++++++++++++++++++++++++++----
 arch/x86/kvm/x86.c              |  23 +++++++-
 include/linux/kvm_host.h        |   3 +
 virt/kvm/ioapic.c               |  39 +++++++++++++
 virt/kvm/ioapic.h               |   4 ++
 virt/kvm/irq_comm.c             |  25 +++++++++
 virt/kvm/kvm_main.c             |   5 ++
 14 files changed, 407 insertions(+), 40 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index c5f92a926a9a..c3e2935b6db4 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -27,4 +27,10 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
 #define kvm_apic_present(x) (true)
 #define kvm_lapic_enabled(x) (true)
 
+static inline bool kvm_apic_vid_enabled(void)
+{
+	/* IA64 has no apicv supporting, do nothing here */
+	return false;
+}
+
 #endif
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d42c2839be98..635a74d22409 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -699,6 +699,10 @@ struct kvm_x86_ops {
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+	int (*vm_has_apicv)(struct kvm *kvm);
+	void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
+	void (*hwapic_isr_update)(struct kvm *kvm, int isr);
+	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
 	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
@@ -994,6 +998,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 0a54df0b36fc..694586ca6456 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -62,6 +62,7 @@
 #define EXIT_REASON_MCE_DURING_VMENTRY  41
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS         44
+#define EXIT_REASON_EOI_INDUCED         45
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
 #define EXIT_REASON_WBINVD              54
@@ -144,6 +145,7 @@
 #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
 #define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
 #define SECONDARY_EXEC_APIC_REGISTER_VIRT       0x00000100
+#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY    0x00000200
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING	0x00000400
 #define SECONDARY_EXEC_ENABLE_INVPCID		0x00001000
 
@@ -181,6 +183,7 @@ enum vmcs_field {
 	GUEST_GS_SELECTOR               = 0x0000080a,
 	GUEST_LDTR_SELECTOR             = 0x0000080c,
 	GUEST_TR_SELECTOR               = 0x0000080e,
+	GUEST_INTR_STATUS               = 0x00000810,
 	HOST_ES_SELECTOR                = 0x00000c00,
 	HOST_CS_SELECTOR                = 0x00000c02,
 	HOST_SS_SELECTOR                = 0x00000c04,
@@ -208,6 +211,14 @@ enum vmcs_field {
 	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
 	EPT_POINTER                     = 0x0000201a,
 	EPT_POINTER_HIGH                = 0x0000201b,
+	EOI_EXIT_BITMAP0                = 0x0000201c,
+	EOI_EXIT_BITMAP0_HIGH           = 0x0000201d,
+	EOI_EXIT_BITMAP1                = 0x0000201e,
+	EOI_EXIT_BITMAP1_HIGH           = 0x0000201f,
+	EOI_EXIT_BITMAP2                = 0x00002020,
+	EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
+	EOI_EXIT_BITMAP3                = 0x00002022,
+	EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
 	GUEST_PHYSICAL_ADDRESS          = 0x00002400,
 	GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
 	VMCS_LINK_POINTER               = 0x00002800,
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index b111aee815f8..484bc874688b 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -37,6 +37,38 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
 
+/*
+ * check if there is pending interrupt from
+ * non-APIC source without intack.
+ */
+static int kvm_cpu_has_extint(struct kvm_vcpu *v)
+{
+	if (kvm_apic_accept_pic_intr(v))
+		return pic_irqchip(v->kvm)->output;	/* PIC */
+	else
+		return 0;
+}
+
+/*
+ * check if there is injectable interrupt:
+ * when virtual interrupt delivery enabled,
+ * interrupt from apic will handled by hardware,
+ * we don't need to check it here.
+ */
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
+{
+	if (!irqchip_in_kernel(v->kvm))
+		return v->arch.interrupt.pending;
+
+	if (kvm_cpu_has_extint(v))
+		return 1;
+
+	if (kvm_apic_vid_enabled(v->kvm))
+		return 0;
+
+	return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
+}
+
 /*
  * check if there is pending interrupt without
  * intack.
@@ -46,27 +78,41 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 	if (!irqchip_in_kernel(v->kvm))
 		return v->arch.interrupt.pending;
 
-	if (kvm_apic_accept_pic_intr(v) && pic_irqchip(v->kvm)->output)
-		return pic_irqchip(v->kvm)->output;	/* PIC */
+	if (kvm_cpu_has_extint(v))
+		return 1;
 
 	return kvm_apic_has_interrupt(v) != -1;	/* LAPIC */
 }
 EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
 
+/*
+ * Read pending interrupt(from non-APIC source)
+ * vector and intack.
+ */
+static int kvm_cpu_get_extint(struct kvm_vcpu *v)
+{
+	if (kvm_cpu_has_extint(v))
+		return kvm_pic_read_irq(v->kvm); /* PIC */
+	return -1;
+}
+
 /*
  * Read pending interrupt vector and intack.
  */
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 {
+	int vector;
+
 	if (!irqchip_in_kernel(v->kvm))
 		return v->arch.interrupt.nr;
 
-	if (kvm_apic_accept_pic_intr(v) && pic_irqchip(v->kvm)->output)
-		return kvm_pic_read_irq(v->kvm);	/* PIC */
+	vector = kvm_cpu_get_extint(v);
+
+	if (kvm_apic_vid_enabled(v->kvm) || vector != -1)
+		return vector;			/* PIC */
 
 	return kvm_get_apic_interrupt(v);	/* APIC */
 }
-EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f69fc5077a89..02b51dd4e4ad 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -145,21 +145,51 @@ static inline int kvm_apic_id(struct kvm_lapic *apic)
 	return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
 }
 
-static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
+void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
+				struct kvm_lapic_irq *irq,
+				u64 *eoi_exit_bitmap)
 {
-	u16 cid;
-	ldr >>= 32 - map->ldr_bits;
-	cid = (ldr >> map->cid_shift) & map->cid_mask;
+	struct kvm_lapic **dst;
+	struct kvm_apic_map *map;
+	unsigned long bitmap = 1;
+	int i;
 
-	BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
+	rcu_read_lock();
+	map = rcu_dereference(vcpu->kvm->arch.apic_map);
 
-	return cid;
-}
+	if (unlikely(!map)) {
+		__set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap);
+		goto out;
+	}
 
-static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
-{
-	ldr >>= (32 - map->ldr_bits);
-	return ldr & map->lid_mask;
+	if (irq->dest_mode == 0) { /* physical mode */
+		if (irq->delivery_mode == APIC_DM_LOWEST ||
+				irq->dest_id == 0xff) {
+			__set_bit(irq->vector,
+				  (unsigned long *)eoi_exit_bitmap);
+			goto out;
+		}
+		dst = &map->phys_map[irq->dest_id & 0xff];
+	} else {
+		u32 mda = irq->dest_id << (32 - map->ldr_bits);
+
+		dst = map->logical_map[apic_cluster_id(map, mda)];
+
+		bitmap = apic_logical_id(map, mda);
+	}
+
+	for_each_set_bit(i, &bitmap, 16) {
+		if (!dst[i])
+			continue;
+		if (dst[i]->vcpu == vcpu) {
+			__set_bit(irq->vector,
+				  (unsigned long *)eoi_exit_bitmap);
+			break;
+		}
+	}
+
+out:
+	rcu_read_unlock();
 }
 
 static void recalculate_apic_map(struct kvm *kvm)
@@ -225,6 +255,8 @@ out:
 
 	if (old)
 		kfree_rcu(old, rcu);
+
+	kvm_ioapic_make_eoibitmap_request(kvm);
 }
 
 static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
@@ -340,6 +372,10 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
 {
 	int result;
 
+	/*
+	 * Note that irr_pending is just a hint. It will be always
+	 * true with virtual interrupt delivery enabled.
+	 */
 	if (!apic->irr_pending)
 		return -1;
 
@@ -456,6 +492,8 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
 static inline int apic_find_highest_isr(struct kvm_lapic *apic)
 {
 	int result;
+
+	/* Note that isr_count is always 1 with vid enabled */
 	if (!apic->isr_count)
 		return -1;
 	if (likely(apic->highest_isr_cache != -1))
@@ -735,6 +773,19 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
 	return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
 }
 
+static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
+{
+	if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
+	    kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
+		int trigger_mode;
+		if (apic_test_vector(vector, apic->regs + APIC_TMR))
+			trigger_mode = IOAPIC_LEVEL_TRIG;
+		else
+			trigger_mode = IOAPIC_EDGE_TRIG;
+		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+	}
+}
+
 static int apic_set_eoi(struct kvm_lapic *apic)
 {
 	int vector = apic_find_highest_isr(apic);
@@ -751,19 +802,26 @@ static int apic_set_eoi(struct kvm_lapic *apic)
 	apic_clear_isr(vector, apic);
 	apic_update_ppr(apic);
 
-	if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
-	    kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
-		int trigger_mode;
-		if (apic_test_vector(vector, apic->regs + APIC_TMR))
-			trigger_mode = IOAPIC_LEVEL_TRIG;
-		else
-			trigger_mode = IOAPIC_EDGE_TRIG;
-		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
-	}
+	kvm_ioapic_send_eoi(apic, vector);
 	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
 	return vector;
 }
 
+/*
+ * this interface assumes a trap-like exit, which has already finished
+ * desired side effect including vISR and vPPR update.
+ */
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	trace_kvm_eoi(apic, vector);
+
+	kvm_ioapic_send_eoi(apic, vector);
+	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
+
 static void apic_send_ipi(struct kvm_lapic *apic)
 {
 	u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
@@ -1375,8 +1433,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 		apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
 		apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
 	}
-	apic->irr_pending = false;
-	apic->isr_count = 0;
+	apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm);
+	apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm);
 	apic->highest_isr_cache = -1;
 	update_divide_count(apic);
 	atomic_set(&apic->lapic_timer.pending, 0);
@@ -1591,8 +1649,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
 	update_divide_count(apic);
 	start_apic_timer(apic);
 	apic->irr_pending = true;
-	apic->isr_count = count_vectors(apic->regs + APIC_ISR);
+	apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ?
+				1 : count_vectors(apic->regs + APIC_ISR);
 	apic->highest_isr_cache = -1;
+	kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 22a5397b638c..1676d34ddb4e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -65,6 +65,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
 void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
 
 void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector);
 
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
@@ -131,4 +132,30 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
 	return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
 }
 
+static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
+{
+	return kvm_x86_ops->vm_has_apicv(kvm);
+}
+
+static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
+{
+	u16 cid;
+	ldr >>= 32 - map->ldr_bits;
+	cid = (ldr >> map->cid_shift) & map->cid_mask;
+
+	BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
+
+	return cid;
+}
+
+static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
+{
+	ldr >>= (32 - map->ldr_bits);
+	return ldr & map->lid_mask;
+}
+
+void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
+				struct kvm_lapic_irq *irq,
+				u64 *eoi_bitmap);
+
 #endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 38407e9fd1bd..e1b1ce21bc00 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3576,6 +3576,21 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 	return;
 }
 
+static int svm_vm_has_apicv(struct kvm *kvm)
+{
+	return 0;
+}
+
+static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+	return;
+}
+
+static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
+{
+	return;
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -4296,6 +4311,9 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
 	.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
+	.vm_has_apicv = svm_vm_has_apicv,
+	.load_eoi_exitmap = svm_load_eoi_exitmap,
+	.hwapic_isr_update = svm_hwapic_isr_update,
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3ce8a1629330..0cf74a641dec 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -84,8 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO);
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
-static bool __read_mostly enable_apicv_reg = 1;
-module_param(enable_apicv_reg, bool, S_IRUGO);
+static bool __read_mostly enable_apicv_reg_vid = 1;
+module_param(enable_apicv_reg_vid, bool, S_IRUGO);
 
 /*
  * If nested=1, nested virtualization is supported, i.e., guests may use
@@ -781,6 +781,12 @@ static inline bool cpu_has_vmx_apic_register_virt(void)
 		SECONDARY_EXEC_APIC_REGISTER_VIRT;
 }
 
+static inline bool cpu_has_vmx_virtual_intr_delivery(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
+}
+
 static inline bool cpu_has_vmx_flexpriority(void)
 {
 	return cpu_has_vmx_tpr_shadow() &&
@@ -2571,7 +2577,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 			SECONDARY_EXEC_PAUSE_LOOP_EXITING |
 			SECONDARY_EXEC_RDTSCP |
 			SECONDARY_EXEC_ENABLE_INVPCID |
-			SECONDARY_EXEC_APIC_REGISTER_VIRT;
+			SECONDARY_EXEC_APIC_REGISTER_VIRT |
+			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 		if (adjust_vmx_controls(min2, opt2,
 					MSR_IA32_VMX_PROCBASED_CTLS2,
 					&_cpu_based_2nd_exec_control) < 0)
@@ -2586,7 +2593,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
 		_cpu_based_2nd_exec_control &= ~(
 				SECONDARY_EXEC_APIC_REGISTER_VIRT |
-				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+				SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 
 	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
 		/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
@@ -2785,8 +2793,14 @@ static __init int hardware_setup(void)
 	if (!cpu_has_vmx_ple())
 		ple_gap = 0;
 
-	if (!cpu_has_vmx_apic_register_virt())
-		enable_apicv_reg = 0;
+	if (!cpu_has_vmx_apic_register_virt() ||
+				!cpu_has_vmx_virtual_intr_delivery())
+		enable_apicv_reg_vid = 0;
+
+	if (enable_apicv_reg_vid)
+		kvm_x86_ops->update_cr8_intercept = NULL;
+	else
+		kvm_x86_ops->hwapic_irr_update = NULL;
 
 	if (nested)
 		nested_vmx_setup_ctls_msrs();
@@ -3928,6 +3942,11 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 	return exec_control;
 }
 
+static int vmx_vm_has_apicv(struct kvm *kvm)
+{
+	return enable_apicv_reg_vid && irqchip_in_kernel(kvm);
+}
+
 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 {
 	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
@@ -3945,8 +3964,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
 	if (!ple_gap)
 		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
-	if (!enable_apicv_reg || !irqchip_in_kernel(vmx->vcpu.kvm))
-		exec_control &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
+	if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
+				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 	return exec_control;
 }
@@ -3992,6 +4012,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 				vmx_secondary_exec_control(vmx));
 	}
 
+	if (enable_apicv_reg_vid) {
+		vmcs_write64(EOI_EXIT_BITMAP0, 0);
+		vmcs_write64(EOI_EXIT_BITMAP1, 0);
+		vmcs_write64(EOI_EXIT_BITMAP2, 0);
+		vmcs_write64(EOI_EXIT_BITMAP3, 0);
+
+		vmcs_write16(GUEST_INTR_STATUS, 0);
+	}
+
 	if (ple_gap) {
 		vmcs_write32(PLE_GAP, ple_gap);
 		vmcs_write32(PLE_WINDOW, ple_window);
@@ -4906,6 +4935,16 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
 	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 }
 
+static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
+{
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	int vector = exit_qualification & 0xff;
+
+	/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
+	kvm_apic_set_eoi_accelerated(vcpu, vector);
+	return 1;
+}
+
 static int handle_apic_write(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -5851,6 +5890,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
 	[EXIT_REASON_APIC_WRITE]              = handle_apic_write,
+	[EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
 	[EXIT_REASON_XSETBV]                  = handle_xsetbv,
 	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
@@ -6208,7 +6248,8 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 	 * There is not point to enable virtualize x2apic without enable
 	 * apicv
 	 */
-	if (!cpu_has_vmx_virtualize_x2apic_mode() || !enable_apicv_reg)
+	if (!cpu_has_vmx_virtualize_x2apic_mode() ||
+				!vmx_vm_has_apicv(vcpu->kvm))
 		return;
 
 	if (!vm_need_tpr_shadow(vcpu->kvm))
@@ -6228,6 +6269,56 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 	vmx_set_msr_bitmap(vcpu);
 }
 
+static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
+{
+	u16 status;
+	u8 old;
+
+	if (!vmx_vm_has_apicv(kvm))
+		return;
+
+	if (isr == -1)
+		isr = 0;
+
+	status = vmcs_read16(GUEST_INTR_STATUS);
+	old = status >> 8;
+	if (isr != old) {
+		status &= 0xff;
+		status |= isr << 8;
+		vmcs_write16(GUEST_INTR_STATUS, status);
+	}
+}
+
+static void vmx_set_rvi(int vector)
+{
+	u16 status;
+	u8 old;
+
+	status = vmcs_read16(GUEST_INTR_STATUS);
+	old = (u8)status & 0xff;
+	if ((u8)vector != old) {
+		status &= ~0xff;
+		status |= (u8)vector;
+		vmcs_write16(GUEST_INTR_STATUS, status);
+	}
+}
+
+static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+{
+	if (max_irr == -1)
+		return;
+
+	vmx_set_rvi(max_irr);
+}
+
+static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+	vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
+	vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
+	vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
+	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
+}
+
 static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 {
 	u32 exit_intr_info;
@@ -7492,6 +7583,10 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
 	.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
+	.vm_has_apicv = vmx_vm_has_apicv,
+	.load_eoi_exitmap = vmx_load_eoi_exitmap,
+	.hwapic_irr_update = vmx_hwapic_irr_update,
+	.hwapic_isr_update = vmx_hwapic_isr_update,
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
@@ -7594,7 +7689,7 @@ static int __init vmx_init(void)
 	memcpy(vmx_msr_bitmap_longmode_x2apic,
 			vmx_msr_bitmap_longmode, PAGE_SIZE);
 
-	if (enable_apicv_reg) {
+	if (enable_apicv_reg_vid) {
 		for (msr = 0x800; msr <= 0x8ff; msr++)
 			vmx_disable_intercept_msr_read_x2apic(msr);
 
@@ -7606,6 +7701,10 @@ static int __init vmx_init(void)
 		vmx_enable_intercept_msr_read_x2apic(0x839);
 		/* TPR */
 		vmx_disable_intercept_msr_write_x2apic(0x808);
+		/* EOI */
+		vmx_disable_intercept_msr_write_x2apic(0x80b);
+		/* SELF-IPI */
+		vmx_disable_intercept_msr_write_x2apic(0x83f);
 	}
 
 	if (enable_ept) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b9f55299ed7e..cf512e70c797 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5565,7 +5565,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
 			vcpu->arch.nmi_injected = true;
 			kvm_x86_ops->set_nmi(vcpu);
 		}
-	} else if (kvm_cpu_has_interrupt(vcpu)) {
+	} else if (kvm_cpu_has_injectable_intr(vcpu)) {
 		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
 			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
 					    false);
@@ -5633,6 +5633,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
+static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
+{
+	u64 eoi_exit_bitmap[4];
+
+	memset(eoi_exit_bitmap, 0, 32);
+
+	kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap);
+	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -5686,6 +5696,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_handle_pmu_event(vcpu);
 		if (kvm_check_request(KVM_REQ_PMI, vcpu))
 			kvm_deliver_pmi(vcpu);
+		if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu))
+			update_eoi_exitmap(vcpu);
 	}
 
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -5694,10 +5706,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		/* enable NMI/IRQ window open exits if needed */
 		if (vcpu->arch.nmi_pending)
 			kvm_x86_ops->enable_nmi_window(vcpu);
-		else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+		else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
 			kvm_x86_ops->enable_irq_window(vcpu);
 
 		if (kvm_lapic_enabled(vcpu)) {
+			/*
+			 * Update architecture specific hints for APIC
+			 * virtual interrupt delivery.
+			 */
+			if (kvm_x86_ops->hwapic_irr_update)
+				kvm_x86_ops->hwapic_irr_update(vcpu,
+					kvm_lapic_find_highest_irr(vcpu));
 			update_cr8_intercept(vcpu);
 			kvm_lapic_sync_to_vapic(vcpu);
 		}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4dd7d7531e69..0350e0d5e031 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -123,6 +123,7 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_MASTERCLOCK_UPDATE 19
 #define KVM_REQ_MCLOCK_INPROGRESS 20
 #define KVM_REQ_EPR_EXIT          21
+#define KVM_REQ_EOIBITMAP         22
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
@@ -538,6 +539,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_flush_remote_tlbs(struct kvm *kvm);
 void kvm_reload_remote_mmus(struct kvm *kvm);
 void kvm_make_mclock_inprogress_request(struct kvm *kvm);
+void kvm_make_update_eoibitmap_request(struct kvm *kvm);
 
 long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg);
@@ -691,6 +693,7 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
 int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level);
 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
 		int irq_source_id, int level);
+bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
 				   struct kvm_irq_ack_notifier *kian);
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index f3abbef46c42..ce82b9401958 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -35,6 +35,7 @@
 #include <linux/hrtimer.h>
 #include <linux/io.h>
 #include <linux/slab.h>
+#include <linux/export.h>
 #include <asm/processor.h>
 #include <asm/page.h>
 #include <asm/current.h>
@@ -115,6 +116,42 @@ static void update_handled_vectors(struct kvm_ioapic *ioapic)
 	smp_wmb();
 }
 
+void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
+					u64 *eoi_exit_bitmap)
+{
+	struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+	union kvm_ioapic_redirect_entry *e;
+	struct kvm_lapic_irq irqe;
+	int index;
+
+	spin_lock(&ioapic->lock);
+	/* traverse ioapic entry to set eoi exit bitmap*/
+	for (index = 0; index < IOAPIC_NUM_PINS; index++) {
+		e = &ioapic->redirtbl[index];
+		if (!e->fields.mask &&
+			(e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
+			 kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC,
+				 index))) {
+			irqe.dest_id = e->fields.dest_id;
+			irqe.vector = e->fields.vector;
+			irqe.dest_mode = e->fields.dest_mode;
+			irqe.delivery_mode = e->fields.delivery_mode << 8;
+			kvm_calculate_eoi_exitmap(vcpu, &irqe, eoi_exit_bitmap);
+		}
+	}
+	spin_unlock(&ioapic->lock);
+}
+EXPORT_SYMBOL_GPL(kvm_ioapic_calculate_eoi_exitmap);
+
+void kvm_ioapic_make_eoibitmap_request(struct kvm *kvm)
+{
+	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+	if (!kvm_apic_vid_enabled(kvm) || !ioapic)
+		return;
+	kvm_make_update_eoibitmap_request(kvm);
+}
+
 static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 {
 	unsigned index;
@@ -156,6 +193,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 		if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
 		    && ioapic->irr & (1 << index))
 			ioapic_service(ioapic, index);
+		kvm_ioapic_make_eoibitmap_request(ioapic->kvm);
 		break;
 	}
 }
@@ -455,6 +493,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
 	spin_lock(&ioapic->lock);
 	memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
 	update_handled_vectors(ioapic);
+	kvm_ioapic_make_eoibitmap_request(kvm);
 	spin_unlock(&ioapic->lock);
 	return 0;
 }
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index a30abfe6ed16..0400a466c50c 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -82,5 +82,9 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 		struct kvm_lapic_irq *irq);
 int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
 int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+void kvm_ioapic_make_eoibitmap_request(struct kvm *kvm);
+void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
+					u64 *eoi_exit_bitmap);
+
 
 #endif
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 656fa455e154..ff6d40e2c06d 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -22,6 +22,7 @@
 
 #include <linux/kvm_host.h>
 #include <linux/slab.h>
+#include <linux/export.h>
 #include <trace/events/kvm.h>
 
 #include <asm/msidef.h>
@@ -237,6 +238,28 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
 	return ret;
 }
 
+bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
+{
+	struct kvm_irq_ack_notifier *kian;
+	struct hlist_node *n;
+	int gsi;
+
+	rcu_read_lock();
+	gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+	if (gsi != -1)
+		hlist_for_each_entry_rcu(kian, n, &kvm->irq_ack_notifier_list,
+					 link)
+			if (kian->gsi == gsi) {
+				rcu_read_unlock();
+				return true;
+			}
+
+	rcu_read_unlock();
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
+
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
 {
 	struct kvm_irq_ack_notifier *kian;
@@ -261,6 +284,7 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm,
 	mutex_lock(&kvm->irq_lock);
 	hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
 	mutex_unlock(&kvm->irq_lock);
+	kvm_ioapic_make_eoibitmap_request(kvm);
 }
 
 void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
@@ -270,6 +294,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 	hlist_del_init_rcu(&kian->link);
 	mutex_unlock(&kvm->irq_lock);
 	synchronize_rcu();
+	kvm_ioapic_make_eoibitmap_request(kvm);
 }
 
 int kvm_request_irq_source_id(struct kvm *kvm)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3fec2cdd951b..abc23e27173d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -217,6 +217,11 @@ void kvm_make_mclock_inprogress_request(struct kvm *kvm)
 	make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
 }
 
+void kvm_make_update_eoibitmap_request(struct kvm *kvm)
+{
+	make_all_cpus_request(kvm, KVM_REQ_EOIBITMAP);
+}
+
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 {
 	struct page *page;
-- 
cgit v1.2.3


From 2c9afa52ef081334925905d6370d36b6602c328c Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 30 Jan 2013 16:45:02 +0200
Subject: KVM: MMU: set base_role.nxe during mmu initialization.

Move base_role.nxe initialisation to where all other roles are initialized.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 1 +
 arch/x86/kvm/x86.c | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 40737b38da19..8028ac65db18 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3687,6 +3687,7 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 	else
 		r = paging32_init_context(vcpu, context);
 
+	vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
 	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
 	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
 	vcpu->arch.mmu.base_role.smep_andnot_wp
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cf512e70c797..373e17a0d398 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -870,8 +870,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
 	kvm_x86_ops->set_efer(vcpu, efer);
 
-	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
-
 	/* Update reserved bits */
 	if ((efer ^ old_efer) & EFER_NX)
 		kvm_mmu_reset_context(vcpu);
-- 
cgit v1.2.3


From 7a905b1485adf863607b5fc9e32a3fa3838bcc23 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Date: Thu, 7 Feb 2013 18:55:57 +0900
Subject: KVM: Remove user_alloc from struct kvm_memory_slot

This field was needed to differentiate memory slots created by the new
API, KVM_SET_USER_MEMORY_REGION, from those by the old equivalent,
KVM_SET_MEMORY_REGION, whose support was dropped long before:

  commit b74a07beed0e64bfba413dcb70dd6749c57f43dc
  KVM: Remove kernel-allocated memory regions

Although we also have private memory slots to which KVM allocates
memory with vm_mmap(), !user_alloc slots in other words, the slot id
should be enough for differentiating them.

Note: corresponding function parameters will be removed later.

Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/x86.c       | 37 ++++++++++++++++---------------------
 include/linux/kvm_host.h |  1 -
 virt/kvm/kvm_main.c      |  1 -
 3 files changed, 16 insertions(+), 23 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 373e17a0d398..3c5bb6fe5280 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6897,33 +6897,28 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				bool user_alloc)
 {
 	int npages = memslot->npages;
-	int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
 
-	/* Prevent internal slot pages from being moved by fork()/COW. */
-	if (memslot->id >= KVM_USER_MEM_SLOTS)
-		map_flags = MAP_SHARED | MAP_ANONYMOUS;
-
-	/*To keep backward compatibility with older userspace,
-	 *x86 needs to handle !user_alloc case.
+	/*
+	 * Only private memory slots need to be mapped here since
+	 * KVM_SET_MEMORY_REGION ioctl is no longer supported.
 	 */
-	if (!user_alloc) {
-		if (npages && !old.npages) {
-			unsigned long userspace_addr;
+	if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) {
+		unsigned long userspace_addr;
 
-			userspace_addr = vm_mmap(NULL, 0,
-						 npages * PAGE_SIZE,
-						 PROT_READ | PROT_WRITE,
-						 map_flags,
-						 0);
+		/*
+		 * MAP_SHARED to prevent internal slot pages from being moved
+		 * by fork()/COW.
+		 */
+		userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE,
+					 PROT_READ | PROT_WRITE,
+					 MAP_SHARED | MAP_ANONYMOUS, 0);
 
-			if (IS_ERR((void *)userspace_addr))
-				return PTR_ERR((void *)userspace_addr);
+		if (IS_ERR((void *)userspace_addr))
+			return PTR_ERR((void *)userspace_addr);
 
-			memslot->userspace_addr = userspace_addr;
-		}
+		memslot->userspace_addr = userspace_addr;
 	}
 
-
 	return 0;
 }
 
@@ -6935,7 +6930,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 
 	int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
 
-	if (!user_alloc && !old.user_alloc && old.npages && !npages) {
+	if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) {
 		int ret;
 
 		ret = vm_munmap(old.userspace_addr,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0350e0d5e031..722cae78bbc4 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -273,7 +273,6 @@ struct kvm_memory_slot {
 	unsigned long userspace_addr;
 	u32 flags;
 	short id;
-	bool user_alloc;
 };
 
 static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2e93630b4add..adc68feb5c5a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -839,7 +839,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
 	r = -ENOMEM;
 	if (change == KVM_MR_CREATE) {
-		new.user_alloc = user_alloc;
 		new.userspace_addr = mem->userspace_addr;
 
 		if (kvm_arch_create_memslot(&new, npages))
-- 
cgit v1.2.3


From 2e32b7190641a184b8510d3e342400473ff1ab60 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Tue, 19 Feb 2013 19:33:13 +0100
Subject: x86, kvm: Add MSR_AMD64_BU_CFG2 to the list of ignored MSRs

The "x86, AMD: Enable WC+ memory type on family 10 processors" patch
currently in -tip added a workaround for AMD F10h CPUs which #GPs my
guest when booted in kvm. This is because it accesses MSR_AMD64_BU_CFG2
which is not currently ignored by kvm. Do that because this MSR is only
baremetal-relevant anyway. While at it, move the ignored MSRs at the
beginning of kvm_set_msr_common so that we exit then and there.

Acked-by: Gleb Natapov <gleb@redhat.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Andre Przywara <andre@andrep.de>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: http://lkml.kernel.org/r/1361298793-31834-2-git-send-email-bp@alien8.de
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kvm/x86.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c243b81e3c74..37040079cd6b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1881,6 +1881,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	u64 data = msr_info->data;
 
 	switch (msr) {
+	case MSR_AMD64_NB_CFG:
+	case MSR_IA32_UCODE_REV:
+	case MSR_IA32_UCODE_WRITE:
+	case MSR_VM_HSAVE_PA:
+	case MSR_AMD64_PATCH_LOADER:
+	case MSR_AMD64_BU_CFG2:
+		break;
+
 	case MSR_EFER:
 		return set_efer(vcpu, data);
 	case MSR_K7_HWCR:
@@ -1900,8 +1908,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		}
 		break;
-	case MSR_AMD64_NB_CFG:
-		break;
 	case MSR_IA32_DEBUGCTLMSR:
 		if (!data) {
 			/* We support the non-activated case already */
@@ -1914,11 +1920,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
 			    __func__, data);
 		break;
-	case MSR_IA32_UCODE_REV:
-	case MSR_IA32_UCODE_WRITE:
-	case MSR_VM_HSAVE_PA:
-	case MSR_AMD64_PATCH_LOADER:
-		break;
 	case 0x200 ... 0x2ff:
 		return set_msr_mtrr(vcpu, msr, data);
 	case MSR_IA32_APICBASE:
@@ -2253,6 +2254,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_K8_INT_PENDING_MSG:
 	case MSR_AMD64_NB_CFG:
 	case MSR_FAM10H_MMIO_CONF_BASE:
+	case MSR_AMD64_BU_CFG2:
 		data = 0;
 		break;
 	case MSR_P6_PERFCTR0:
-- 
cgit v1.2.3