From 5dfd486c4750c9278c63fa96e6e85bdd2fb58e9d Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Jan 2013 13:24:35 -0800 Subject: x86, kvm: Fix kvm's use of __pa() on percpu areas In short, it is illegal to call __pa() on an address holding a percpu variable. This replaces those __pa() calls with slow_virt_to_phys(). All of the cases in this patch are in boot time (or CPU hotplug time at worst) code, so the slow pagetable walking in slow_virt_to_phys() is not expected to have a performance impact. The times when this actually matters are pretty obscure (certain 32-bit NUMA systems), but it _does_ happen. It is important to keep KVM guests working on these systems because the real hardware is getting harder and harder to find. This bug manifested first by me seeing a plain hang at boot after this message: CPU 0 irqstacks, hard=f3018000 soft=f301a000 or, sometimes, it would actually make it out to the console: [ 0.000000] BUG: unable to handle kernel paging request at ffffffff I eventually traced it down to the KVM async pagefault code. This can be worked around by disabling that code either at compile-time, or on the kernel command-line. The kvm async pagefault code was injecting page faults in to the guest which the guest misinterpreted because its "reason" was not being properly sent from the host. The guest passes a physical address of an per-cpu async page fault structure via an MSR to the host. Since __pa() is broken on percpu data, the physical address it sent was bascially bogus and the host went scribbling on random data. The guest never saw the real reason for the page fault (it was injected by the host), assumed that the kernel had taken a _real_ page fault, and panic()'d. The behavior varied, though, depending on what got corrupted by the bad write. Signed-off-by: Dave Hansen Link: http://lkml.kernel.org/r/20130122212435.4905663F@kernel.stglabs.ibm.com Acked-by: Rik van Riel Reviewed-by: Marcelo Tosatti Signed-off-by: H. Peter Anvin --- arch/x86/kernel/kvmclock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/kvmclock.c') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 220a360010f8..9f966dc0b9e4 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -162,8 +162,8 @@ int kvm_register_clock(char *txt) int low, high, ret; struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti; - low = (int)__pa(src) | 1; - high = ((u64)__pa(src) >> 32); + low = (int)slow_virt_to_phys(src) | 1; + high = ((u64)slow_virt_to_phys(src) >> 32); ret = native_write_msr_safe(msr_kvm_system_time, low, high); printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", cpu, high, low, txt); -- cgit v1.2.3 From ed55705dd5008b408c48a8459b8b34b01f3de985 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 18 Feb 2013 22:58:14 -0300 Subject: x86: pvclock kvm: align allocation size to page size To match whats mapped via vsyscalls to userspace. Reported-by: Peter Hurley Signed-off-by: Marcelo Tosatti --- arch/x86/kernel/kvmclock.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel/kvmclock.c') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 220a360010f8..5bedbdddf1f2 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -218,6 +218,9 @@ static void kvm_shutdown(void) void __init kvmclock_init(void) { unsigned long mem; + int size; + + size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); if (!kvm_para_available()) return; @@ -231,16 +234,14 @@ void __init kvmclock_init(void) printk(KERN_INFO "kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); - mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS, - PAGE_SIZE); + mem = memblock_alloc(size, PAGE_SIZE); if (!mem) return; hv_clock = __va(mem); if (kvm_register_clock("boot clock")) { hv_clock = NULL; - memblock_free(mem, - sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); + memblock_free(mem, size); return; } pv_time_ops.sched_clock = kvm_clock_read; @@ -275,7 +276,7 @@ int __init kvm_setup_vsyscall_timeinfo(void) struct pvclock_vcpu_time_info *vcpu_time; unsigned int size; - size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS; + size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); preempt_disable(); cpu = smp_processor_id(); -- cgit v1.2.3