summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/vfio/pci/Kconfig12
-rw-r--r--drivers/vfio/pci/Makefile2
-rw-r--r--drivers/vfio/pci/vfio_pci.c12
-rw-r--r--drivers/vfio/pci/vfio_pci_igd.c10
-rw-r--r--drivers/vfio/pci/vfio_pci_private.h2
-rw-r--r--drivers/vfio/pci/vfio_pci_zdev.c24
-rw-r--r--drivers/vfio/vfio.c5
-rw-r--r--drivers/vfio/vfio_iommu_type1.c334
-rw-r--r--include/linux/vfio.h7
-rw-r--r--include/uapi/linux/vfio.h27
10 files changed, 307 insertions, 128 deletions
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 40a223381ab6..ac3c1dd3edef 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -45,15 +45,3 @@ config VFIO_PCI_NVLINK2
depends on VFIO_PCI && PPC_POWERNV
help
VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs
-
-config VFIO_PCI_ZDEV
- bool "VFIO PCI ZPCI device CLP support"
- depends on VFIO_PCI && S390
- default y
- help
- Enabling this option exposes VFIO capabilities containing hardware
- configuration for zPCI devices. This enables userspace (e.g. QEMU)
- to supply proper configuration values instead of hard-coded defaults
- for zPCI devices passed through via VFIO on s390.
-
- Say Y here.
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 781e0809d6ee..eff97a7cd9f1 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -3,6 +3,6 @@
vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o
-vfio-pci-$(CONFIG_VFIO_PCI_ZDEV) += vfio_pci_zdev.o
+vfio-pci-$(CONFIG_S390) += vfio_pci_zdev.o
obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 706de3ef94bb..65e7e6b44578 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -807,6 +807,7 @@ static long vfio_pci_ioctl(void *device_data,
struct vfio_device_info info;
struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
unsigned long capsz;
+ int ret;
minsz = offsetofend(struct vfio_device_info, num_irqs);
@@ -832,13 +833,10 @@ static long vfio_pci_ioctl(void *device_data,
info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
info.num_irqs = VFIO_PCI_NUM_IRQS;
- if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV)) {
- int ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
-
- if (ret && ret != -ENODEV) {
- pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n");
- return ret;
- }
+ ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
+ if (ret && ret != -ENODEV) {
+ pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n");
+ return ret;
}
if (caps.size) {
diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c
index 53d97f459252..e66dfb0178ed 100644
--- a/drivers/vfio/pci/vfio_pci_igd.c
+++ b/drivers/vfio/pci/vfio_pci_igd.c
@@ -127,7 +127,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev,
ret = pci_user_read_config_byte(pdev, pos, &val);
if (ret)
- return pcibios_err_to_errno(ret);
+ return ret;
if (copy_to_user(buf + count - size, &val, 1))
return -EFAULT;
@@ -141,7 +141,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev,
ret = pci_user_read_config_word(pdev, pos, &val);
if (ret)
- return pcibios_err_to_errno(ret);
+ return ret;
val = cpu_to_le16(val);
if (copy_to_user(buf + count - size, &val, 2))
@@ -156,7 +156,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev,
ret = pci_user_read_config_dword(pdev, pos, &val);
if (ret)
- return pcibios_err_to_errno(ret);
+ return ret;
val = cpu_to_le32(val);
if (copy_to_user(buf + count - size, &val, 4))
@@ -171,7 +171,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev,
ret = pci_user_read_config_word(pdev, pos, &val);
if (ret)
- return pcibios_err_to_errno(ret);
+ return ret;
val = cpu_to_le16(val);
if (copy_to_user(buf + count - size, &val, 2))
@@ -186,7 +186,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev,
ret = pci_user_read_config_byte(pdev, pos, &val);
if (ret)
- return pcibios_err_to_errno(ret);
+ return ret;
if (copy_to_user(buf + count - size, &val, 1))
return -EFAULT;
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 5c90e560c5c7..9cd1882a05af 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -214,7 +214,7 @@ static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev)
}
#endif
-#ifdef CONFIG_VFIO_PCI_ZDEV
+#ifdef CONFIG_S390
extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev,
struct vfio_info_cap *caps);
#else
diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c
index 229685634031..7b011b62c766 100644
--- a/drivers/vfio/pci/vfio_pci_zdev.c
+++ b/drivers/vfio/pci/vfio_pci_zdev.c
@@ -24,8 +24,7 @@
/*
* Add the Base PCI Function information to the device info region.
*/
-static int zpci_base_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
- struct vfio_info_cap *caps)
+static int zpci_base_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps)
{
struct vfio_device_info_cap_zpci_base cap = {
.header.id = VFIO_DEVICE_INFO_CAP_ZPCI_BASE,
@@ -45,8 +44,7 @@ static int zpci_base_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
/*
* Add the Base PCI Function Group information to the device info region.
*/
-static int zpci_group_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
- struct vfio_info_cap *caps)
+static int zpci_group_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps)
{
struct vfio_device_info_cap_zpci_group cap = {
.header.id = VFIO_DEVICE_INFO_CAP_ZPCI_GROUP,
@@ -66,14 +64,15 @@ static int zpci_group_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
/*
* Add the device utility string to the device info region.
*/
-static int zpci_util_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
- struct vfio_info_cap *caps)
+static int zpci_util_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps)
{
struct vfio_device_info_cap_zpci_util *cap;
int cap_size = sizeof(*cap) + CLP_UTIL_STR_LEN;
int ret;
cap = kmalloc(cap_size, GFP_KERNEL);
+ if (!cap)
+ return -ENOMEM;
cap->header.id = VFIO_DEVICE_INFO_CAP_ZPCI_UTIL;
cap->header.version = 1;
@@ -90,14 +89,15 @@ static int zpci_util_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
/*
* Add the function path string to the device info region.
*/
-static int zpci_pfip_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
- struct vfio_info_cap *caps)
+static int zpci_pfip_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps)
{
struct vfio_device_info_cap_zpci_pfip *cap;
int cap_size = sizeof(*cap) + CLP_PFIP_NR_SEGMENTS;
int ret;
cap = kmalloc(cap_size, GFP_KERNEL);
+ if (!cap)
+ return -ENOMEM;
cap->header.id = VFIO_DEVICE_INFO_CAP_ZPCI_PFIP;
cap->header.version = 1;
@@ -123,21 +123,21 @@ int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev,
if (!zdev)
return -ENODEV;
- ret = zpci_base_cap(zdev, vdev, caps);
+ ret = zpci_base_cap(zdev, caps);
if (ret)
return ret;
- ret = zpci_group_cap(zdev, vdev, caps);
+ ret = zpci_group_cap(zdev, caps);
if (ret)
return ret;
if (zdev->util_str_avail) {
- ret = zpci_util_cap(zdev, vdev, caps);
+ ret = zpci_util_cap(zdev, caps);
if (ret)
return ret;
}
- ret = zpci_pfip_cap(zdev, vdev, caps);
+ ret = zpci_pfip_cap(zdev, caps);
return ret;
}
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 4ad8a35667a7..38779e6fd80c 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1220,6 +1220,11 @@ static int vfio_fops_open(struct inode *inode, struct file *filep)
static int vfio_fops_release(struct inode *inode, struct file *filep)
{
struct vfio_container *container = filep->private_data;
+ struct vfio_iommu_driver *driver = container->iommu_driver;
+
+ if (driver && driver->ops->notify)
+ driver->ops->notify(container->iommu_data,
+ VFIO_IOMMU_CONTAINER_CLOSE);
filep->private_data = NULL;
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 0b4dedaa9128..b3df383d7028 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -69,11 +69,15 @@ struct vfio_iommu {
struct rb_root dma_list;
struct blocking_notifier_head notifier;
unsigned int dma_avail;
+ unsigned int vaddr_invalid_count;
uint64_t pgsize_bitmap;
+ uint64_t num_non_pinned_groups;
+ wait_queue_head_t vaddr_wait;
bool v2;
bool nesting;
bool dirty_page_tracking;
bool pinned_page_dirty_scope;
+ bool container_open;
};
struct vfio_domain {
@@ -92,6 +96,7 @@ struct vfio_dma {
int prot; /* IOMMU_READ/WRITE */
bool iommu_mapped;
bool lock_cap; /* capable(CAP_IPC_LOCK) */
+ bool vaddr_invalid;
struct task_struct *task;
struct rb_root pfn_list; /* Ex-user pinned pfn list */
unsigned long *bitmap;
@@ -143,12 +148,13 @@ struct vfio_regions {
#define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX)
#define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+#define WAITED 1
+
static int put_pfn(unsigned long pfn, int prot);
static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
struct iommu_group *iommu_group);
-static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu);
/*
* This code handles mapping and unmapping of user data buffers
* into DMA'ble space using the IOMMU
@@ -173,6 +179,31 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
return NULL;
}
+static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
+ dma_addr_t start, size_t size)
+{
+ struct rb_node *res = NULL;
+ struct rb_node *node = iommu->dma_list.rb_node;
+ struct vfio_dma *dma_res = NULL;
+
+ while (node) {
+ struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
+
+ if (start < dma->iova + dma->size) {
+ res = node;
+ dma_res = dma;
+ if (start >= dma->iova)
+ break;
+ node = node->rb_left;
+ } else {
+ node = node->rb_right;
+ }
+ }
+ if (res && size && dma_res->iova >= start + size)
+ res = NULL;
+ return res;
+}
+
static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
{
struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
@@ -236,6 +267,18 @@ static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
}
}
+static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
+{
+ struct rb_node *n;
+ unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
+
+ for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+ struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+ bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
+ }
+}
+
static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
{
struct rb_node *n;
@@ -479,6 +522,61 @@ done:
return ret;
}
+static int vfio_wait(struct vfio_iommu *iommu)
+{
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
+ mutex_unlock(&iommu->lock);
+ schedule();
+ mutex_lock(&iommu->lock);
+ finish_wait(&iommu->vaddr_wait, &wait);
+ if (kthread_should_stop() || !iommu->container_open ||
+ fatal_signal_pending(current)) {
+ return -EFAULT;
+ }
+ return WAITED;
+}
+
+/*
+ * Find dma struct and wait for its vaddr to be valid. iommu lock is dropped
+ * if the task waits, but is re-locked on return. Return result in *dma_p.
+ * Return 0 on success with no waiting, WAITED on success if waited, and -errno
+ * on error.
+ */
+static int vfio_find_dma_valid(struct vfio_iommu *iommu, dma_addr_t start,
+ size_t size, struct vfio_dma **dma_p)
+{
+ int ret;
+
+ do {
+ *dma_p = vfio_find_dma(iommu, start, size);
+ if (!*dma_p)
+ ret = -EINVAL;
+ else if (!(*dma_p)->vaddr_invalid)
+ ret = 0;
+ else
+ ret = vfio_wait(iommu);
+ } while (ret > 0);
+
+ return ret;
+}
+
+/*
+ * Wait for all vaddr in the dma_list to become valid. iommu lock is dropped
+ * if the task waits, but is re-locked on return. Return 0 on success with no
+ * waiting, WAITED on success if waited, and -errno on error.
+ */
+static int vfio_wait_all_valid(struct vfio_iommu *iommu)
+{
+ int ret = 0;
+
+ while (iommu->vaddr_invalid_count && ret >= 0)
+ ret = vfio_wait(iommu);
+
+ return ret;
+}
+
/*
* Attempt to pin pages. We really don't want to track all the pfns and
* the iommu can only map chunks of consecutive pfns anyway, so get the
@@ -640,6 +738,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
unsigned long remote_vaddr;
struct vfio_dma *dma;
bool do_accounting;
+ dma_addr_t iova;
if (!iommu || !user_pfn || !phys_pfn)
return -EINVAL;
@@ -650,6 +749,22 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
mutex_lock(&iommu->lock);
+ /*
+ * Wait for all necessary vaddr's to be valid so they can be used in
+ * the main loop without dropping the lock, to avoid racing vs unmap.
+ */
+again:
+ if (iommu->vaddr_invalid_count) {
+ for (i = 0; i < npage; i++) {
+ iova = user_pfn[i] << PAGE_SHIFT;
+ ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma);
+ if (ret < 0)
+ goto pin_done;
+ if (ret == WAITED)
+ goto again;
+ }
+ }
+
/* Fail if notifier list is empty */
if (!iommu->notifier.head) {
ret = -EINVAL;
@@ -664,7 +779,6 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
for (i = 0; i < npage; i++) {
- dma_addr_t iova;
struct vfio_pfn *vpfn;
iova = user_pfn[i] << PAGE_SHIFT;
@@ -714,7 +828,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
group = vfio_iommu_find_iommu_group(iommu, iommu_group);
if (!group->pinned_page_dirty_scope) {
group->pinned_page_dirty_scope = true;
- update_pinned_page_dirty_scope(iommu);
+ iommu->num_non_pinned_groups--;
}
goto pin_done;
@@ -945,10 +1059,15 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
+ WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
vfio_unmap_unpin(iommu, dma, true);
vfio_unlink_dma(iommu, dma);
put_task_struct(dma->task);
vfio_dma_bitmap_free(dma);
+ if (dma->vaddr_invalid) {
+ iommu->vaddr_invalid_count--;
+ wake_up_all(&iommu->vaddr_wait);
+ }
kfree(dma);
iommu->dma_avail++;
}
@@ -991,7 +1110,7 @@ static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
* mark all pages dirty if any IOMMU capable device is not able
* to report dirty pages and all pages are pinned and mapped.
*/
- if (!iommu->pinned_page_dirty_scope && dma->iommu_mapped)
+ if (iommu->num_non_pinned_groups && dma->iommu_mapped)
bitmap_set(dma->bitmap, 0, nbits);
if (shift) {
@@ -1074,34 +1193,36 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
{
struct vfio_dma *dma, *dma_last = NULL;
size_t unmapped = 0, pgsize;
- int ret = 0, retries = 0;
+ int ret = -EINVAL, retries = 0;
unsigned long pgshift;
+ dma_addr_t iova = unmap->iova;
+ unsigned long size = unmap->size;
+ bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL;
+ bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR;
+ struct rb_node *n, *first_n;
mutex_lock(&iommu->lock);
pgshift = __ffs(iommu->pgsize_bitmap);
pgsize = (size_t)1 << pgshift;
- if (unmap->iova & (pgsize - 1)) {
- ret = -EINVAL;
+ if (iova & (pgsize - 1))
goto unlock;
- }
- if (!unmap->size || unmap->size & (pgsize - 1)) {
- ret = -EINVAL;
+ if (unmap_all) {
+ if (iova || size)
+ goto unlock;
+ size = SIZE_MAX;
+ } else if (!size || size & (pgsize - 1)) {
goto unlock;
}
- if (unmap->iova + unmap->size - 1 < unmap->iova ||
- unmap->size > SIZE_MAX) {
- ret = -EINVAL;
+ if (iova + size - 1 < iova || size > SIZE_MAX)
goto unlock;
- }
/* When dirty tracking is enabled, allow only min supported pgsize */
if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
(!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
- ret = -EINVAL;
goto unlock;
}
@@ -1138,21 +1259,25 @@ again:
* will only return success and a size of zero if there were no
* mappings within the range.
*/
- if (iommu->v2) {
- dma = vfio_find_dma(iommu, unmap->iova, 1);
- if (dma && dma->iova != unmap->iova) {
- ret = -EINVAL;
+ if (iommu->v2 && !unmap_all) {
+ dma = vfio_find_dma(iommu, iova, 1);
+ if (dma && dma->iova != iova)
goto unlock;
- }
- dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
- if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
- ret = -EINVAL;
+
+ dma = vfio_find_dma(iommu, iova + size - 1, 0);
+ if (dma && dma->iova + dma->size != iova + size)
goto unlock;
- }
}
- while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
- if (!iommu->v2 && unmap->iova > dma->iova)
+ ret = 0;
+ n = first_n = vfio_find_dma_first_node(iommu, iova, size);
+
+ while (n) {
+ dma = rb_entry(n, struct vfio_dma, node);
+ if (dma->iova >= iova + size)
+ break;
+
+ if (!iommu->v2 && iova > dma->iova)
break;
/*
* Task with same address space who mapped this iova range is
@@ -1161,6 +1286,27 @@ again:
if (dma->task->mm != current->mm)
break;
+ if (invalidate_vaddr) {
+ if (dma->vaddr_invalid) {
+ struct rb_node *last_n = n;
+
+ for (n = first_n; n != last_n; n = rb_next(n)) {
+ dma = rb_entry(n,
+ struct vfio_dma, node);
+ dma->vaddr_invalid = false;
+ iommu->vaddr_invalid_count--;
+ }
+ ret = -EINVAL;
+ unmapped = 0;
+ break;
+ }
+ dma->vaddr_invalid = true;
+ iommu->vaddr_invalid_count++;
+ unmapped += dma->size;
+ n = rb_next(n);
+ continue;
+ }
+
if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
struct vfio_iommu_type1_dma_unmap nb_unmap;
@@ -1190,12 +1336,13 @@ again:
if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
ret = update_user_bitmap(bitmap->data, iommu, dma,
- unmap->iova, pgsize);
+ iova, pgsize);
if (ret)
break;
}
unmapped += dma->size;
+ n = rb_next(n);
vfio_remove_dma(iommu, dma);
}
@@ -1299,6 +1446,7 @@ static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
static int vfio_dma_do_map(struct vfio_iommu *iommu,
struct vfio_iommu_type1_dma_map *map)
{
+ bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR;
dma_addr_t iova = map->iova;
unsigned long vaddr = map->vaddr;
size_t size = map->size;
@@ -1316,13 +1464,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
if (map->flags & VFIO_DMA_MAP_FLAG_READ)
prot |= IOMMU_READ;
+ if ((prot && set_vaddr) || (!prot && !set_vaddr))
+ return -EINVAL;
+
mutex_lock(&iommu->lock);
pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
WARN_ON((pgsize - 1) & PAGE_MASK);
- if (!prot || !size || (size | iova | vaddr) & (pgsize - 1)) {
+ if (!size || (size | iova | vaddr) & (pgsize - 1)) {
ret = -EINVAL;
goto out_unlock;
}
@@ -1333,7 +1484,21 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
goto out_unlock;
}
- if (vfio_find_dma(iommu, iova, size)) {
+ dma = vfio_find_dma(iommu, iova, size);
+ if (set_vaddr) {
+ if (!dma) {
+ ret = -ENOENT;
+ } else if (!dma->vaddr_invalid || dma->iova != iova ||
+ dma->size != size) {
+ ret = -EINVAL;
+ } else {
+ dma->vaddr = vaddr;
+ dma->vaddr_invalid = false;
+ iommu->vaddr_invalid_count--;
+ wake_up_all(&iommu->vaddr_wait);
+ }
+ goto out_unlock;
+ } else if (dma) {
ret = -EEXIST;
goto out_unlock;
}
@@ -1430,6 +1595,10 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
int ret;
+ ret = vfio_wait_all_valid(iommu);
+ if (ret < 0)
+ return ret;
+
/* Arbitrarily pick the first domain in the list for lookups */
if (!list_empty(&iommu->domain_list))
d = list_first_entry(&iommu->domain_list,
@@ -1622,33 +1791,6 @@ static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
return group;
}
-static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu)
-{
- struct vfio_domain *domain;
- struct vfio_group *group;
-
- list_for_each_entry(domain, &iommu->domain_list, next) {
- list_for_each_entry(group, &domain->group_list, next) {
- if (!group->pinned_page_dirty_scope) {
- iommu->pinned_page_dirty_scope = false;
- return;
- }
- }
- }
-
- if (iommu->external_domain) {
- domain = iommu->external_domain;
- list_for_each_entry(group, &domain->group_list, next) {
- if (!group->pinned_page_dirty_scope) {
- iommu->pinned_page_dirty_scope = false;
- return;
- }
- }
- }
-
- iommu->pinned_page_dirty_scope = true;
-}
-
static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
phys_addr_t *base)
{
@@ -2057,8 +2199,6 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
* addition of a dirty tracking group.
*/
group->pinned_page_dirty_scope = true;
- if (!iommu->pinned_page_dirty_scope)
- update_pinned_page_dirty_scope(iommu);
mutex_unlock(&iommu->lock);
return 0;
@@ -2188,7 +2328,7 @@ done:
* demotes the iommu scope until it declares itself dirty tracking
* capable via the page pinning interface.
*/
- iommu->pinned_page_dirty_scope = false;
+ iommu->num_non_pinned_groups++;
mutex_unlock(&iommu->lock);
vfio_iommu_resv_free(&group_resv_regions);
@@ -2238,23 +2378,6 @@ static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
}
}
-static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu)
-{
- struct rb_node *n;
-
- n = rb_first(&iommu->dma_list);
- for (; n; n = rb_next(n)) {
- struct vfio_dma *dma;
-
- dma = rb_entry(n, struct vfio_dma, node);
-
- if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
- break;
- }
- /* mdev vendor driver must unregister notifier */
- WARN_ON(iommu->notifier.head);
-}
-
/*
* Called when a domain is removed in detach. It is possible that
* the removed domain decided the iova aperture window. Modify the
@@ -2354,10 +2477,10 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
kfree(group);
if (list_empty(&iommu->external_domain->group_list)) {
- vfio_sanity_check_pfn_list(iommu);
-
- if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+ if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
+ WARN_ON(iommu->notifier.head);
vfio_iommu_unmap_unpin_all(iommu);
+ }
kfree(iommu->external_domain);
iommu->external_domain = NULL;
@@ -2391,10 +2514,12 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
*/
if (list_empty(&domain->group_list)) {
if (list_is_singular(&iommu->domain_list)) {
- if (!iommu->external_domain)
+ if (!iommu->external_domain) {
+ WARN_ON(iommu->notifier.head);
vfio_iommu_unmap_unpin_all(iommu);
- else
+ } else {
vfio_iommu_unmap_unpin_reaccount(iommu);
+ }
}
iommu_domain_free(domain->domain);
list_del(&domain->next);
@@ -2415,8 +2540,11 @@ detach_group_done:
* Removal of a group without dirty tracking may allow the iommu scope
* to be promoted.
*/
- if (update_dirty_scope)
- update_pinned_page_dirty_scope(iommu);
+ if (update_dirty_scope) {
+ iommu->num_non_pinned_groups--;
+ if (iommu->dirty_page_tracking)
+ vfio_iommu_populate_bitmap_full(iommu);
+ }
mutex_unlock(&iommu->lock);
}
@@ -2446,8 +2574,10 @@ static void *vfio_iommu_type1_open(unsigned long arg)
INIT_LIST_HEAD(&iommu->iova_list);
iommu->dma_list = RB_ROOT;
iommu->dma_avail = dma_entry_limit;
+ iommu->container_open = true;
mutex_init(&iommu->lock);
BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
+ init_waitqueue_head(&iommu->vaddr_wait);
return iommu;
}
@@ -2475,7 +2605,6 @@ static void vfio_iommu_type1_release(void *iommu_data)
if (iommu->external_domain) {
vfio_release_domain(iommu->external_domain, true);
- vfio_sanity_check_pfn_list(iommu);
kfree(iommu->external_domain);
}
@@ -2517,6 +2646,8 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
case VFIO_TYPE1_IOMMU:
case VFIO_TYPE1v2_IOMMU:
case VFIO_TYPE1_NESTING_IOMMU:
+ case VFIO_UNMAP_ALL:
+ case VFIO_UPDATE_VADDR:
return 1;
case VFIO_DMA_CC_IOMMU:
if (!iommu)
@@ -2688,7 +2819,8 @@ static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
{
struct vfio_iommu_type1_dma_map map;
unsigned long minsz;
- uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+ uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE |
+ VFIO_DMA_MAP_FLAG_VADDR;
minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
@@ -2706,6 +2838,9 @@ static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
{
struct vfio_iommu_type1_dma_unmap unmap;
struct vfio_bitmap bitmap = { 0 };
+ uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP |
+ VFIO_DMA_UNMAP_FLAG_VADDR |
+ VFIO_DMA_UNMAP_FLAG_ALL;
unsigned long minsz;
int ret;
@@ -2714,8 +2849,12 @@ static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
if (copy_from_user(&unmap, (void __user *)arg, minsz))
return -EFAULT;
- if (unmap.argsz < minsz ||
- unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
+ if (unmap.argsz < minsz || unmap.flags & ~mask)
+ return -EINVAL;
+
+ if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+ (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL |
+ VFIO_DMA_UNMAP_FLAG_VADDR)))
return -EINVAL;
if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
@@ -2906,12 +3045,13 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
struct vfio_dma *dma;
bool kthread = current->mm == NULL;
size_t offset;
+ int ret;
*copied = 0;
- dma = vfio_find_dma(iommu, user_iova, 1);
- if (!dma)
- return -EINVAL;
+ ret = vfio_find_dma_valid(iommu, user_iova, 1, &dma);
+ if (ret < 0)
+ return ret;
if ((write && !(dma->prot & IOMMU_WRITE)) ||
!(dma->prot & IOMMU_READ))
@@ -3003,6 +3143,19 @@ vfio_iommu_type1_group_iommu_domain(void *iommu_data,
return domain;
}
+static void vfio_iommu_type1_notify(void *iommu_data,
+ enum vfio_iommu_notify_type event)
+{
+ struct vfio_iommu *iommu = iommu_data;
+
+ if (event != VFIO_IOMMU_CONTAINER_CLOSE)
+ return;
+ mutex_lock(&iommu->lock);
+ iommu->container_open = false;
+ mutex_unlock(&iommu->lock);
+ wake_up_all(&iommu->vaddr_wait);
+}
+
static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
.name = "vfio-iommu-type1",
.owner = THIS_MODULE,
@@ -3017,6 +3170,7 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
.unregister_notifier = vfio_iommu_type1_unregister_notifier,
.dma_rw = vfio_iommu_type1_dma_rw,
.group_iommu_domain = vfio_iommu_type1_group_iommu_domain,
+ .notify = vfio_iommu_type1_notify,
};
static int __init vfio_iommu_type1_init(void)
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index f45940b38a02..b7e18bde5aa8 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -57,6 +57,11 @@ extern struct vfio_device *vfio_device_get_from_dev(struct device *dev);
extern void vfio_device_put(struct vfio_device *device);
extern void *vfio_device_data(struct vfio_device *device);
+/* events for the backend driver notify callback */
+enum vfio_iommu_notify_type {
+ VFIO_IOMMU_CONTAINER_CLOSE = 0,
+};
+
/**
* struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
*/
@@ -92,6 +97,8 @@ struct vfio_iommu_driver_ops {
void *data, size_t count, bool write);
struct iommu_domain *(*group_iommu_domain)(void *iommu_data,
struct iommu_group *group);
+ void (*notify)(void *iommu_data,
+ enum vfio_iommu_notify_type event);
};
extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index d1812777139f..8ce36c1d53ca 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -46,6 +46,12 @@
*/
#define VFIO_NOIOMMU_IOMMU 8
+/* Supports VFIO_DMA_UNMAP_FLAG_ALL */
+#define VFIO_UNMAP_ALL 9
+
+/* Supports the vaddr flag for DMA map and unmap */
+#define VFIO_UPDATE_VADDR 10
+
/*
* The IOCTL interface is designed for extensibility by embedding the
* structure length (argsz) and flags into structures passed between
@@ -1074,12 +1080,22 @@ struct vfio_iommu_type1_info_dma_avail {
*
* Map process virtual addresses to IO virtual addresses using the
* provided struct vfio_dma_map. Caller sets argsz. READ &/ WRITE required.
+ *
+ * If flags & VFIO_DMA_MAP_FLAG_VADDR, update the base vaddr for iova, and
+ * unblock translation of host virtual addresses in the iova range. The vaddr
+ * must have previously been invalidated with VFIO_DMA_UNMAP_FLAG_VADDR. To
+ * maintain memory consistency within the user application, the updated vaddr
+ * must address the same memory object as originally mapped. Failure to do so
+ * will result in user memory corruption and/or device misbehavior. iova and
+ * size must match those in the original MAP_DMA call. Protection is not
+ * changed, and the READ & WRITE flags must be 0.
*/
struct vfio_iommu_type1_dma_map {
__u32 argsz;
__u32 flags;
#define VFIO_DMA_MAP_FLAG_READ (1 << 0) /* readable from device */
#define VFIO_DMA_MAP_FLAG_WRITE (1 << 1) /* writable from device */
+#define VFIO_DMA_MAP_FLAG_VADDR (1 << 2)
__u64 vaddr; /* Process virtual address */
__u64 iova; /* IO virtual address */
__u64 size; /* Size of mapping (bytes) */
@@ -1102,6 +1118,7 @@ struct vfio_bitmap {
* field. No guarantee is made to the user that arbitrary unmaps of iova
* or size different from those used in the original mapping call will
* succeed.
+ *
* VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap
* before unmapping IO virtual addresses. When this flag is set, the user must
* provide a struct vfio_bitmap in data[]. User must provide zero-allocated
@@ -1111,11 +1128,21 @@ struct vfio_bitmap {
* indicates that the page at that offset from iova is dirty. A Bitmap of the
* pages in the range of unmapped size is returned in the user-provided
* vfio_bitmap.data.
+ *
+ * If flags & VFIO_DMA_UNMAP_FLAG_ALL, unmap all addresses. iova and size
+ * must be 0. This cannot be combined with the get-dirty-bitmap flag.
+ *
+ * If flags & VFIO_DMA_UNMAP_FLAG_VADDR, do not unmap, but invalidate host
+ * virtual addresses in the iova range. Tasks that attempt to translate an
+ * iova's vaddr will block. DMA to already-mapped pages continues. This
+ * cannot be combined with the get-dirty-bitmap flag.
*/
struct vfio_iommu_type1_dma_unmap {
__u32 argsz;
__u32 flags;
#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
+#define VFIO_DMA_UNMAP_FLAG_ALL (1 << 1)
+#define VFIO_DMA_UNMAP_FLAG_VADDR (1 << 2)
__u64 iova; /* IO virtual address */
__u64 size; /* Size of mapping (bytes) */
__u8 data[];