diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2013-11-05 15:11:05 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2013-11-05 15:11:05 +1100 |
commit | 2633dfbcb4803a0e39c6364caf37f2b055ba83eb (patch) | |
tree | bb6f7e3115decd24f93ab99773741b8a563da6c2 /virt | |
parent | 457dc90b2aea757818bdfa8d20fddbb3389329fc (diff) | |
parent | 81e87e26796782e014fd1f2bb9cd8fb6ce4021a8 (diff) |
Merge remote-tracking branch 'kvm/linux-next'
Diffstat (limited to 'virt')
-rw-r--r-- | virt/kvm/Kconfig | 3 | ||||
-rw-r--r-- | virt/kvm/async_pf.c | 22 | ||||
-rw-r--r-- | virt/kvm/iommu.c | 34 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 88 | ||||
-rw-r--r-- | virt/kvm/vfio.c | 264 |
5 files changed, 335 insertions, 76 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 779262f59e25..fbe1a48bd629 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -27,3 +27,6 @@ config HAVE_KVM_MSI config HAVE_KVM_CPU_RELAX_INTERCEPT bool + +config KVM_VFIO + bool diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 8a39dda7a325..8631d9c14320 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -56,7 +56,6 @@ void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu) static void async_pf_execute(struct work_struct *work) { - struct page *page = NULL; struct kvm_async_pf *apf = container_of(work, struct kvm_async_pf, work); struct mm_struct *mm = apf->mm; @@ -68,14 +67,12 @@ static void async_pf_execute(struct work_struct *work) use_mm(mm); down_read(&mm->mmap_sem); - get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL); + get_user_pages(current, mm, addr, 1, 1, 0, NULL, NULL); up_read(&mm->mmap_sem); unuse_mm(mm); spin_lock(&vcpu->async_pf.lock); list_add_tail(&apf->link, &vcpu->async_pf.done); - apf->page = page; - apf->done = true; spin_unlock(&vcpu->async_pf.lock); /* @@ -83,7 +80,7 @@ static void async_pf_execute(struct work_struct *work) * this point */ - trace_kvm_async_pf_completed(addr, page, gva); + trace_kvm_async_pf_completed(addr, gva); if (waitqueue_active(&vcpu->wq)) wake_up_interruptible(&vcpu->wq); @@ -99,9 +96,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) struct kvm_async_pf *work = list_entry(vcpu->async_pf.queue.next, typeof(*work), queue); - cancel_work_sync(&work->work); list_del(&work->queue); - if (!work->done) { /* work was canceled */ + if (cancel_work_sync(&work->work)) { mmdrop(work->mm); kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */ kmem_cache_free(async_pf_cache, work); @@ -114,8 +110,6 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) list_entry(vcpu->async_pf.done.next, typeof(*work), link); list_del(&work->link); - if (!is_error_page(work->page)) - kvm_release_page_clean(work->page); kmem_cache_free(async_pf_cache, work); } spin_unlock(&vcpu->async_pf.lock); @@ -135,14 +129,11 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) list_del(&work->link); spin_unlock(&vcpu->async_pf.lock); - if (work->page) - kvm_arch_async_page_ready(vcpu, work); + kvm_arch_async_page_ready(vcpu, work); kvm_arch_async_page_present(vcpu, work); list_del(&work->queue); vcpu->async_pf.queued--; - if (!is_error_page(work->page)) - kvm_release_page_clean(work->page); kmem_cache_free(async_pf_cache, work); } } @@ -165,8 +156,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, if (!work) return 0; - work->page = NULL; - work->done = false; + work->wakeup_all = false; work->vcpu = vcpu; work->gva = gva; work->addr = gfn_to_hva(vcpu->kvm, gfn); @@ -206,7 +196,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) if (!work) return -ENOMEM; - work->page = KVM_ERR_PTR_BAD_PAGE; + work->wakeup_all = true; INIT_LIST_HEAD(&work->queue); /* for list_del to work */ spin_lock(&vcpu->async_pf.lock); diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index 72a130bc448a..c7d9ce122529 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -79,7 +79,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) flags = IOMMU_READ; if (!(slot->flags & KVM_MEM_READONLY)) flags |= IOMMU_WRITE; - if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY) + if (!kvm->arch.iommu_noncoherent) flags |= IOMMU_CACHE; @@ -140,6 +140,9 @@ static int kvm_iommu_map_memslots(struct kvm *kvm) struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + if (kvm->arch.iommu_noncoherent) + kvm_arch_register_noncoherent_dma(kvm); + idx = srcu_read_lock(&kvm->srcu); slots = kvm_memslots(kvm); @@ -158,7 +161,8 @@ int kvm_assign_device(struct kvm *kvm, { struct pci_dev *pdev = NULL; struct iommu_domain *domain = kvm->arch.iommu_domain; - int r, last_flags; + int r; + bool noncoherent; /* check if iommu exists and in use */ if (!domain) @@ -174,15 +178,13 @@ int kvm_assign_device(struct kvm *kvm, return r; } - last_flags = kvm->arch.iommu_flags; - if (iommu_domain_has_cap(kvm->arch.iommu_domain, - IOMMU_CAP_CACHE_COHERENCY)) - kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY; + noncoherent = !iommu_domain_has_cap(kvm->arch.iommu_domain, + IOMMU_CAP_CACHE_COHERENCY); /* Check if need to update IOMMU page table for guest memory */ - if ((last_flags ^ kvm->arch.iommu_flags) == - KVM_IOMMU_CACHE_COHERENCY) { + if (noncoherent != kvm->arch.iommu_noncoherent) { kvm_iommu_unmap_memslots(kvm); + kvm->arch.iommu_noncoherent = noncoherent; r = kvm_iommu_map_memslots(kvm); if (r) goto out_unmap; @@ -190,11 +192,7 @@ int kvm_assign_device(struct kvm *kvm, pdev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED; - printk(KERN_DEBUG "assign device %x:%x:%x.%x\n", - assigned_dev->host_segnr, - assigned_dev->host_busnr, - PCI_SLOT(assigned_dev->host_devfn), - PCI_FUNC(assigned_dev->host_devfn)); + dev_info(&pdev->dev, "kvm assign device\n"); return 0; out_unmap: @@ -220,11 +218,7 @@ int kvm_deassign_device(struct kvm *kvm, pdev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED; - printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n", - assigned_dev->host_segnr, - assigned_dev->host_busnr, - PCI_SLOT(assigned_dev->host_devfn), - PCI_FUNC(assigned_dev->host_devfn)); + dev_info(&pdev->dev, "kvm deassign device\n"); return 0; } @@ -336,6 +330,9 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, idx); + if (kvm->arch.iommu_noncoherent) + kvm_arch_unregister_noncoherent_dma(kvm); + return 0; } @@ -350,6 +347,7 @@ int kvm_iommu_unmap_guest(struct kvm *kvm) mutex_lock(&kvm->slots_lock); kvm_iommu_unmap_memslots(kvm); kvm->arch.iommu_domain = NULL; + kvm->arch.iommu_noncoherent = false; mutex_unlock(&kvm->slots_lock); iommu_domain_free(domain); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1cf9ccb01013..564e464b2f4a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -70,7 +70,8 @@ MODULE_LICENSE("GPL"); * kvm->lock --> kvm->slots_lock --> kvm->irq_lock */ -DEFINE_RAW_SPINLOCK(kvm_lock); +DEFINE_SPINLOCK(kvm_lock); +static DEFINE_RAW_SPINLOCK(kvm_count_lock); LIST_HEAD(vm_list); static cpumask_var_t cpus_hardware_enabled; @@ -490,9 +491,9 @@ static struct kvm *kvm_create_vm(unsigned long type) if (r) goto out_err; - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); return kvm; @@ -581,9 +582,9 @@ static void kvm_destroy_vm(struct kvm *kvm) struct mm_struct *mm = kvm->mm; kvm_arch_sync_events(kvm); - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_del(&kvm->vm_list); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) kvm_io_bus_destroy(kvm->buses[i]); @@ -872,21 +873,6 @@ int __kvm_set_memory_region(struct kvm *kvm, goto out_free; } - /* - * IOMMU mapping: New slots need to be mapped. Old slots need to be - * un-mapped and re-mapped if their base changes. Since base change - * unmapping is handled above with slot deletion, mapping alone is - * needed here. Anything else the iommu might care about for existing - * slots (size changes, userspace addr changes and read-only flag - * changes) is disallowed above, so any other attribute changes getting - * here can be skipped. - */ - if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { - r = kvm_iommu_map_pages(kvm, &new); - if (r) - goto out_slots; - } - /* actual memory is freed via old in kvm_free_physmem_slot below */ if (change == KVM_MR_DELETE) { new.dirty_bitmap = NULL; @@ -900,6 +886,20 @@ int __kvm_set_memory_region(struct kvm *kvm, kvm_free_physmem_slot(&old, &new); kfree(old_memslots); + /* + * IOMMU mapping: New slots need to be mapped. Old slots need to be + * un-mapped and re-mapped if their base changes. Since base change + * unmapping is handled above with slot deletion, mapping alone is + * needed here. Anything else the iommu might care about for existing + * slots (size changes, userspace addr changes and read-only flag + * changes) is disallowed above, so any other attribute changes getting + * here can be skipped. + */ + if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { + r = kvm_iommu_map_pages(kvm, &new); + return r; + } + return 0; out_slots: @@ -2271,6 +2271,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm, ops = &kvm_xics_ops; break; #endif +#ifdef CONFIG_KVM_VFIO + case KVM_DEV_TYPE_VFIO: + ops = &kvm_vfio_ops; + break; +#endif default: return -ENODEV; } @@ -2683,11 +2688,12 @@ static void hardware_enable_nolock(void *junk) } } -static void hardware_enable(void *junk) +static void hardware_enable(void) { - raw_spin_lock(&kvm_lock); - hardware_enable_nolock(junk); - raw_spin_unlock(&kvm_lock); + raw_spin_lock(&kvm_count_lock); + if (kvm_usage_count) + hardware_enable_nolock(NULL); + raw_spin_unlock(&kvm_count_lock); } static void hardware_disable_nolock(void *junk) @@ -2700,11 +2706,12 @@ static void hardware_disable_nolock(void *junk) kvm_arch_hardware_disable(NULL); } -static void hardware_disable(void *junk) +static void hardware_disable(void) { - raw_spin_lock(&kvm_lock); - hardware_disable_nolock(junk); - raw_spin_unlock(&kvm_lock); + raw_spin_lock(&kvm_count_lock); + if (kvm_usage_count) + hardware_disable_nolock(NULL); + raw_spin_unlock(&kvm_count_lock); } static void hardware_disable_all_nolock(void) @@ -2718,16 +2725,16 @@ static void hardware_disable_all_nolock(void) static void hardware_disable_all(void) { - raw_spin_lock(&kvm_lock); + raw_spin_lock(&kvm_count_lock); hardware_disable_all_nolock(); - raw_spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_count_lock); } static int hardware_enable_all(void) { int r = 0; - raw_spin_lock(&kvm_lock); + raw_spin_lock(&kvm_count_lock); kvm_usage_count++; if (kvm_usage_count == 1) { @@ -2740,7 +2747,7 @@ static int hardware_enable_all(void) } } - raw_spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_count_lock); return r; } @@ -2750,20 +2757,17 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, { int cpu = (long)v; - if (!kvm_usage_count) - return NOTIFY_OK; - val &= ~CPU_TASKS_FROZEN; switch (val) { case CPU_DYING: printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", cpu); - hardware_disable(NULL); + hardware_disable(); break; case CPU_STARTING: printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", cpu); - hardware_enable(NULL); + hardware_enable(); break; } return NOTIFY_OK; @@ -3056,10 +3060,10 @@ static int vm_stat_get(void *_offset, u64 *val) struct kvm *kvm; *val = 0; - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) *val += *(u32 *)((void *)kvm + offset); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); return 0; } @@ -3073,12 +3077,12 @@ static int vcpu_stat_get(void *_offset, u64 *val) int i; *val = 0; - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_for_each_vcpu(i, vcpu, kvm) *val += *(u32 *)((void *)vcpu + offset); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); return 0; } @@ -3133,7 +3137,7 @@ static int kvm_suspend(void) static void kvm_resume(void) { if (kvm_usage_count) { - WARN_ON(raw_spin_is_locked(&kvm_lock)); + WARN_ON(raw_spin_is_locked(&kvm_count_lock)); hardware_enable_nolock(NULL); } } diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c new file mode 100644 index 000000000000..ca4260e35037 --- /dev/null +++ b/virt/kvm/vfio.c @@ -0,0 +1,264 @@ +/* + * VFIO-KVM bridge pseudo device + * + * Copyright (C) 2013 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <alex.williamson@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/kvm_host.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> + +struct kvm_vfio_group { + struct list_head node; + struct vfio_group *vfio_group; +}; + +struct kvm_vfio { + struct list_head group_list; + struct mutex lock; + bool noncoherent; +}; + +static struct vfio_group *kvm_vfio_group_get_external_user(struct file *filep) +{ + struct vfio_group *vfio_group; + struct vfio_group *(*fn)(struct file *); + + fn = symbol_get(vfio_group_get_external_user); + if (!fn) + return ERR_PTR(-EINVAL); + + vfio_group = fn(filep); + + symbol_put(vfio_group_get_external_user); + + return vfio_group; +} + +static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group) +{ + void (*fn)(struct vfio_group *); + + fn = symbol_get(vfio_group_put_external_user); + if (!fn) + return; + + fn(vfio_group); + + symbol_put(vfio_group_put_external_user); +} + +/* + * Groups can use the same or different IOMMU domains. If the same then + * adding a new group may change the coherency of groups we've previously + * been told about. We don't want to care about any of that so we retest + * each group and bail as soon as we find one that's noncoherent. This + * means we only ever [un]register_noncoherent_dma once for the whole device. + */ +static void kvm_vfio_update_coherency(struct kvm_device *dev) +{ + struct kvm_vfio *kv = dev->private; + bool noncoherent = false; + struct kvm_vfio_group *kvg; + + mutex_lock(&kv->lock); + + list_for_each_entry(kvg, &kv->group_list, node) { + /* + * TODO: We need an interface to check the coherency of + * the IOMMU domain this group is using. For now, assume + * it's always noncoherent. + */ + noncoherent = true; + break; + } + + if (noncoherent != kv->noncoherent) { + kv->noncoherent = noncoherent; + + if (kv->noncoherent) + kvm_arch_register_noncoherent_dma(dev->kvm); + else + kvm_arch_unregister_noncoherent_dma(dev->kvm); + } + + mutex_unlock(&kv->lock); +} + +static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) +{ + struct kvm_vfio *kv = dev->private; + struct vfio_group *vfio_group; + struct kvm_vfio_group *kvg; + void __user *argp = (void __user *)arg; + struct fd f; + int32_t fd; + int ret; + + switch (attr) { + case KVM_DEV_VFIO_GROUP_ADD: + if (get_user(fd, (int32_t __user *)argp)) + return -EFAULT; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + vfio_group = kvm_vfio_group_get_external_user(f.file); + fdput(f); + + if (IS_ERR(vfio_group)) + return PTR_ERR(vfio_group); + + mutex_lock(&kv->lock); + + list_for_each_entry(kvg, &kv->group_list, node) { + if (kvg->vfio_group == vfio_group) { + mutex_unlock(&kv->lock); + kvm_vfio_group_put_external_user(vfio_group); + return -EEXIST; + } + } + + kvg = kzalloc(sizeof(*kvg), GFP_KERNEL); + if (!kvg) { + mutex_unlock(&kv->lock); + kvm_vfio_group_put_external_user(vfio_group); + return -ENOMEM; + } + + list_add_tail(&kvg->node, &kv->group_list); + kvg->vfio_group = vfio_group; + + mutex_unlock(&kv->lock); + + kvm_vfio_update_coherency(dev); + + return 0; + + case KVM_DEV_VFIO_GROUP_DEL: + if (get_user(fd, (int32_t __user *)argp)) + return -EFAULT; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + vfio_group = kvm_vfio_group_get_external_user(f.file); + fdput(f); + + if (IS_ERR(vfio_group)) + return PTR_ERR(vfio_group); + + ret = -ENOENT; + + mutex_lock(&kv->lock); + + list_for_each_entry(kvg, &kv->group_list, node) { + if (kvg->vfio_group != vfio_group) + continue; + + list_del(&kvg->node); + kvm_vfio_group_put_external_user(kvg->vfio_group); + kfree(kvg); + ret = 0; + break; + } + + mutex_unlock(&kv->lock); + + kvm_vfio_group_put_external_user(vfio_group); + + kvm_vfio_update_coherency(dev); + + return ret; + } + + return -ENXIO; +} + +static int kvm_vfio_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_VFIO_GROUP: + return kvm_vfio_set_group(dev, attr->attr, attr->addr); + } + + return -ENXIO; +} + +static int kvm_vfio_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_VFIO_GROUP: + switch (attr->attr) { + case KVM_DEV_VFIO_GROUP_ADD: + case KVM_DEV_VFIO_GROUP_DEL: + return 0; + } + + break; + } + + return -ENXIO; +} + +static void kvm_vfio_destroy(struct kvm_device *dev) +{ + struct kvm_vfio *kv = dev->private; + struct kvm_vfio_group *kvg, *tmp; + + list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) { + kvm_vfio_group_put_external_user(kvg->vfio_group); + list_del(&kvg->node); + kfree(kvg); + } + + kvm_vfio_update_coherency(dev); + + kfree(kv); + kfree(dev); /* alloc by kvm_ioctl_create_device, free by .destroy */ +} + +static int kvm_vfio_create(struct kvm_device *dev, u32 type) +{ + struct kvm_device *tmp; + struct kvm_vfio *kv; + + /* Only one VFIO "device" per VM */ + list_for_each_entry(tmp, &dev->kvm->devices, vm_node) + if (tmp->ops == &kvm_vfio_ops) + return -EBUSY; + + kv = kzalloc(sizeof(*kv), GFP_KERNEL); + if (!kv) + return -ENOMEM; + + INIT_LIST_HEAD(&kv->group_list); + mutex_init(&kv->lock); + + dev->private = kv; + + return 0; +} + +struct kvm_device_ops kvm_vfio_ops = { + .name = "kvm-vfio", + .create = kvm_vfio_create, + .destroy = kvm_vfio_destroy, + .set_attr = kvm_vfio_set_attr, + .has_attr = kvm_vfio_has_attr, +}; |