summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-12-14 09:15:43 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2022-12-14 09:15:43 -0800
commit08cdc2157966c07d3f986a097ddaa74cee312751 (patch)
treedad2562768b49876c642c2505813e90a467ae40a /include
parentaa5ad10f6cca6d42f3fef6cb862e03b220ea19a6 (diff)
parentd6c55c0a20e5059abdde81713ddf6324a946eb3c (diff)
Merge tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd
Pull iommufd implementation from Jason Gunthorpe: "iommufd is the user API to control the IOMMU subsystem as it relates to managing IO page tables that point at user space memory. It takes over from drivers/vfio/vfio_iommu_type1.c (aka the VFIO container) which is the VFIO specific interface for a similar idea. We see a broad need for extended features, some being highly IOMMU device specific: - Binding iommu_domain's to PASID/SSID - Userspace IO page tables, for ARM, x86 and S390 - Kernel bypassed invalidation of user page tables - Re-use of the KVM page table in the IOMMU - Dirty page tracking in the IOMMU - Runtime Increase/Decrease of IOPTE size - PRI support with faults resolved in userspace Many of these HW features exist to support VM use cases - for instance the combination of PASID, PRI and Userspace IO Page Tables allows an implementation of DMA Shared Virtual Addressing (vSVA) within a guest. Dirty tracking enables VM live migration with SRIOV devices and PASID support allow creating "scalable IOV" devices, among other things. As these features are fundamental to a VM platform they need to be uniformly exposed to all the driver families that do DMA into VMs, which is currently VFIO and VDPA" For more background, see the extended explanations in Jason's pull request: https://lore.kernel.org/lkml/Y5dzTU8dlmXTbzoJ@nvidia.com/ * tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd: (62 commits) iommufd: Change the order of MSI setup iommufd: Improve a few unclear bits of code iommufd: Fix comment typos vfio: Move vfio group specific code into group.c vfio: Refactor dma APIs for emulated devices vfio: Wrap vfio group module init/clean code into helpers vfio: Refactor vfio_device open and close vfio: Make vfio_device_open() truly device specific vfio: Swap order of vfio_device_container_register() and open_device() vfio: Set device->group in helper function vfio: Create wrappers for group register/unregister vfio: Move the sanity check of the group to vfio_create_group() vfio: Simplify vfio_create_group() iommufd: Allow iommufd to supply /dev/vfio/vfio vfio: Make vfio_container optionally compiled vfio: Move container related MODULE_ALIAS statements into container.c vfio-iommufd: Support iommufd for emulated VFIO devices vfio-iommufd: Support iommufd for physical VFIO devices vfio-iommufd: Allow iommufd to be used in place of a container fd vfio: Use IOMMU_CAP_ENFORCE_CACHE_COHERENCY for vfio_file_enforced_coherent() ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/intel-svm.h13
-rw-r--r--include/linux/interval_tree.h58
-rw-r--r--include/linux/iommu.h145
-rw-r--r--include/linux/iommufd.h98
-rw-r--r--include/linux/sched/user.h2
-rw-r--r--include/linux/vfio.h39
-rw-r--r--include/uapi/linux/iommufd.h347
7 files changed, 657 insertions, 45 deletions
diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h
index 207ef06ba3e1..f9a0d44f6fdb 100644
--- a/include/linux/intel-svm.h
+++ b/include/linux/intel-svm.h
@@ -13,17 +13,4 @@
#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20)
#define PRQ_DEPTH ((0x1000 << PRQ_ORDER) >> 5)
-/*
- * The SVM_FLAG_SUPERVISOR_MODE flag requests a PASID which can be used only
- * for access to kernel addresses. No IOTLB flushes are automatically done
- * for kernel mappings; it is valid only for access to the kernel's static
- * 1:1 mapping of physical memory — not to vmalloc or even module mappings.
- * A future API addition may permit the use of such ranges, by means of an
- * explicit IOTLB flush call (akin to the DMA API's unmap method).
- *
- * It is unlikely that we will ever hook into flush_tlb_kernel_range() to
- * do such IOTLB flushes automatically.
- */
-#define SVM_FLAG_SUPERVISOR_MODE BIT(0)
-
#endif /* __INTEL_SVM_H__ */
diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h
index 288c26f50732..2b8026a39906 100644
--- a/include/linux/interval_tree.h
+++ b/include/linux/interval_tree.h
@@ -27,4 +27,62 @@ extern struct interval_tree_node *
interval_tree_iter_next(struct interval_tree_node *node,
unsigned long start, unsigned long last);
+/**
+ * struct interval_tree_span_iter - Find used and unused spans.
+ * @start_hole: Start of an interval for a hole when is_hole == 1
+ * @last_hole: Inclusive end of an interval for a hole when is_hole == 1
+ * @start_used: Start of a used interval when is_hole == 0
+ * @last_used: Inclusive end of a used interval when is_hole == 0
+ * @is_hole: 0 == used, 1 == is_hole, -1 == done iteration
+ *
+ * This iterator travels over spans in an interval tree. It does not return
+ * nodes but classifies each span as either a hole, where no nodes intersect, or
+ * a used, which is fully covered by nodes. Each iteration step toggles between
+ * hole and used until the entire range is covered. The returned spans always
+ * fully cover the requested range.
+ *
+ * The iterator is greedy, it always returns the largest hole or used possible,
+ * consolidating all consecutive nodes.
+ *
+ * Use interval_tree_span_iter_done() to detect end of iteration.
+ */
+struct interval_tree_span_iter {
+ /* private: not for use by the caller */
+ struct interval_tree_node *nodes[2];
+ unsigned long first_index;
+ unsigned long last_index;
+
+ /* public: */
+ union {
+ unsigned long start_hole;
+ unsigned long start_used;
+ };
+ union {
+ unsigned long last_hole;
+ unsigned long last_used;
+ };
+ int is_hole;
+};
+
+void interval_tree_span_iter_first(struct interval_tree_span_iter *state,
+ struct rb_root_cached *itree,
+ unsigned long first_index,
+ unsigned long last_index);
+void interval_tree_span_iter_advance(struct interval_tree_span_iter *iter,
+ struct rb_root_cached *itree,
+ unsigned long new_index);
+void interval_tree_span_iter_next(struct interval_tree_span_iter *state);
+
+static inline bool
+interval_tree_span_iter_done(struct interval_tree_span_iter *state)
+{
+ return state->is_hole == -1;
+}
+
+#define interval_tree_for_each_span(span, itree, first_index, last_index) \
+ for (interval_tree_span_iter_first(span, itree, \
+ first_index, last_index); \
+ !interval_tree_span_iter_done(span); \
+ interval_tree_span_iter_next(span))
+
#endif /* _LINUX_INTERVAL_TREE_H */
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 6f53ad74fa0d..46e1347bfa22 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -64,6 +64,8 @@ struct iommu_domain_geometry {
#define __IOMMU_DOMAIN_PT (1U << 2) /* Domain is identity mapped */
#define __IOMMU_DOMAIN_DMA_FQ (1U << 3) /* DMA-API uses flush queue */
+#define __IOMMU_DOMAIN_SVA (1U << 4) /* Shared process address space */
+
/*
* This are the possible domain-types
*
@@ -77,6 +79,8 @@ struct iommu_domain_geometry {
* certain optimizations for these domains
* IOMMU_DOMAIN_DMA_FQ - As above, but definitely using batched TLB
* invalidation.
+ * IOMMU_DOMAIN_SVA - DMA addresses are shared process addresses
+ * represented by mm_struct's.
*/
#define IOMMU_DOMAIN_BLOCKED (0U)
#define IOMMU_DOMAIN_IDENTITY (__IOMMU_DOMAIN_PT)
@@ -86,15 +90,27 @@ struct iommu_domain_geometry {
#define IOMMU_DOMAIN_DMA_FQ (__IOMMU_DOMAIN_PAGING | \
__IOMMU_DOMAIN_DMA_API | \
__IOMMU_DOMAIN_DMA_FQ)
+#define IOMMU_DOMAIN_SVA (__IOMMU_DOMAIN_SVA)
struct iommu_domain {
unsigned type;
const struct iommu_domain_ops *ops;
unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */
- iommu_fault_handler_t handler;
- void *handler_token;
struct iommu_domain_geometry geometry;
struct iommu_dma_cookie *iova_cookie;
+ enum iommu_page_response_code (*iopf_handler)(struct iommu_fault *fault,
+ void *data);
+ void *fault_data;
+ union {
+ struct {
+ iommu_fault_handler_t handler;
+ void *handler_token;
+ };
+ struct { /* IOMMU_DOMAIN_SVA */
+ struct mm_struct *mm;
+ int users;
+ };
+ };
};
static inline bool iommu_is_dma_domain(struct iommu_domain *domain)
@@ -108,6 +124,11 @@ enum iommu_cap {
IOMMU_CAP_NOEXEC, /* IOMMU_NOEXEC flag */
IOMMU_CAP_PRE_BOOT_PROTECTION, /* Firmware says it used the IOMMU for
DMA protection and we should too */
+ /*
+ * Per-device flag indicating if enforce_cache_coherency() will work on
+ * this device.
+ */
+ IOMMU_CAP_ENFORCE_CACHE_COHERENCY,
};
/* These are the possible reserved region types */
@@ -214,15 +235,15 @@ struct iommu_iotlb_gather {
* driver init to device driver init (default no)
* @dev_enable/disable_feat: per device entries to enable/disable
* iommu specific features.
- * @sva_bind: Bind process address space to device
- * @sva_unbind: Unbind process address space from device
- * @sva_get_pasid: Get PASID associated to a SVA handle
* @page_response: handle page request response
* @def_domain_type: device default domain type, return value:
* - IOMMU_DOMAIN_IDENTITY: must use an identity domain
* - IOMMU_DOMAIN_DMA: must use a dma domain
* - 0: use the default setting
* @default_domain_ops: the default ops for domains
+ * @remove_dev_pasid: Remove any translation configurations of a specific
+ * pasid, so that any DMA transactions with this pasid
+ * will be blocked by the hardware.
* @pgsize_bitmap: bitmap of all possible supported page sizes
* @owner: Driver module providing these ops
*/
@@ -247,16 +268,12 @@ struct iommu_ops {
int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f);
int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f);
- struct iommu_sva *(*sva_bind)(struct device *dev, struct mm_struct *mm,
- void *drvdata);
- void (*sva_unbind)(struct iommu_sva *handle);
- u32 (*sva_get_pasid)(struct iommu_sva *handle);
-
int (*page_response)(struct device *dev,
struct iommu_fault_event *evt,
struct iommu_page_response *msg);
int (*def_domain_type)(struct device *dev);
+ void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid);
const struct iommu_domain_ops *default_domain_ops;
unsigned long pgsize_bitmap;
@@ -266,7 +283,20 @@ struct iommu_ops {
/**
* struct iommu_domain_ops - domain specific operations
* @attach_dev: attach an iommu domain to a device
+ * Return:
+ * * 0 - success
+ * * EINVAL - can indicate that device and domain are incompatible due to
+ * some previous configuration of the domain, in which case the
+ * driver shouldn't log an error, since it is legitimate for a
+ * caller to test reuse of existing domains. Otherwise, it may
+ * still represent some other fundamental problem
+ * * ENOMEM - out of memory
+ * * ENOSPC - non-ENOMEM type of resource allocation failures
+ * * EBUSY - device is attached to a domain and cannot be changed
+ * * ENODEV - device specific errors, not able to be attached
+ * * <others> - treated as ENODEV by the caller. Use is discouraged
* @detach_dev: detach an iommu domain from a device
+ * @set_dev_pasid: set an iommu domain to a pasid of device
* @map: map a physically contiguous memory region to an iommu domain
* @map_pages: map a physically contiguous set of pages of the same size to
* an iommu domain.
@@ -287,6 +317,8 @@ struct iommu_ops {
struct iommu_domain_ops {
int (*attach_dev)(struct iommu_domain *domain, struct device *dev);
void (*detach_dev)(struct iommu_domain *domain, struct device *dev);
+ int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev,
+ ioasid_t pasid);
int (*map)(struct iommu_domain *domain, unsigned long iova,
phys_addr_t paddr, size_t size, int prot, gfp_t gfp);
@@ -322,12 +354,14 @@ struct iommu_domain_ops {
* @list: Used by the iommu-core to keep a list of registered iommus
* @ops: iommu-ops for talking to this iommu
* @dev: struct device for sysfs handling
+ * @max_pasids: number of supported PASIDs
*/
struct iommu_device {
struct list_head list;
const struct iommu_ops *ops;
struct fwnode_handle *fwnode;
struct device *dev;
+ u32 max_pasids;
};
/**
@@ -366,6 +400,7 @@ struct iommu_fault_param {
* @fwspec: IOMMU fwspec data
* @iommu_dev: IOMMU device this device is linked to
* @priv: IOMMU Driver private data
+ * @max_pasids: number of PASIDs this device can consume
*
* TODO: migrate other per device data pointers under iommu_dev_data, e.g.
* struct iommu_group *iommu_group;
@@ -377,6 +412,7 @@ struct dev_iommu {
struct iommu_fwspec *fwspec;
struct iommu_device *iommu_dev;
void *priv;
+ u32 max_pasids;
};
int iommu_device_register(struct iommu_device *iommu,
@@ -626,6 +662,7 @@ struct iommu_fwspec {
*/
struct iommu_sva {
struct device *dev;
+ struct iommu_domain *domain;
};
int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
@@ -667,12 +704,6 @@ void iommu_release_device(struct device *dev);
int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f);
int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features f);
-struct iommu_sva *iommu_sva_bind_device(struct device *dev,
- struct mm_struct *mm,
- void *drvdata);
-void iommu_sva_unbind_device(struct iommu_sva *handle);
-u32 iommu_sva_get_pasid(struct iommu_sva *handle);
-
int iommu_device_use_default_domain(struct device *dev);
void iommu_device_unuse_default_domain(struct device *dev);
@@ -680,6 +711,18 @@ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner);
void iommu_group_release_dma_owner(struct iommu_group *group);
bool iommu_group_dma_owner_claimed(struct iommu_group *group);
+int iommu_device_claim_dma_owner(struct device *dev, void *owner);
+void iommu_device_release_dma_owner(struct device *dev);
+
+struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
+ struct mm_struct *mm);
+int iommu_attach_device_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid);
+void iommu_detach_device_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid);
+struct iommu_domain *
+iommu_get_domain_for_dev_pasid(struct device *dev, ioasid_t pasid,
+ unsigned int type);
#else /* CONFIG_IOMMU_API */
struct iommu_ops {};
@@ -999,21 +1042,6 @@ iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat)
return -ENODEV;
}
-static inline struct iommu_sva *
-iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, void *drvdata)
-{
- return NULL;
-}
-
-static inline void iommu_sva_unbind_device(struct iommu_sva *handle)
-{
-}
-
-static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle)
-{
- return IOMMU_PASID_INVALID;
-}
-
static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev)
{
return NULL;
@@ -1042,6 +1070,39 @@ static inline bool iommu_group_dma_owner_claimed(struct iommu_group *group)
{
return false;
}
+
+static inline void iommu_device_release_dma_owner(struct device *dev)
+{
+}
+
+static inline int iommu_device_claim_dma_owner(struct device *dev, void *owner)
+{
+ return -ENODEV;
+}
+
+static inline struct iommu_domain *
+iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm)
+{
+ return NULL;
+}
+
+static inline int iommu_attach_device_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid)
+{
+ return -ENODEV;
+}
+
+static inline void iommu_detach_device_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid)
+{
+}
+
+static inline struct iommu_domain *
+iommu_get_domain_for_dev_pasid(struct device *dev, ioasid_t pasid,
+ unsigned int type)
+{
+ return NULL;
+}
#endif /* CONFIG_IOMMU_API */
/**
@@ -1124,4 +1185,26 @@ static inline bool tegra_dev_iommu_get_stream_id(struct device *dev, u32 *stream
return false;
}
+#ifdef CONFIG_IOMMU_SVA
+struct iommu_sva *iommu_sva_bind_device(struct device *dev,
+ struct mm_struct *mm);
+void iommu_sva_unbind_device(struct iommu_sva *handle);
+u32 iommu_sva_get_pasid(struct iommu_sva *handle);
+#else
+static inline struct iommu_sva *
+iommu_sva_bind_device(struct device *dev, struct mm_struct *mm)
+{
+ return NULL;
+}
+
+static inline void iommu_sva_unbind_device(struct iommu_sva *handle)
+{
+}
+
+static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle)
+{
+ return IOMMU_PASID_INVALID;
+}
+#endif /* CONFIG_IOMMU_SVA */
+
#endif /* __LINUX_IOMMU_H */
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
new file mode 100644
index 000000000000..650d45629647
--- /dev/null
+++ b/include/linux/iommufd.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Intel Corporation
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#ifndef __LINUX_IOMMUFD_H
+#define __LINUX_IOMMUFD_H
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+
+struct device;
+struct iommufd_device;
+struct page;
+struct iommufd_ctx;
+struct iommufd_access;
+struct file;
+
+struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
+ struct device *dev, u32 *id);
+void iommufd_device_unbind(struct iommufd_device *idev);
+
+int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id);
+void iommufd_device_detach(struct iommufd_device *idev);
+
+struct iommufd_access_ops {
+ u8 needs_pin_pages : 1;
+ void (*unmap)(void *data, unsigned long iova, unsigned long length);
+};
+
+enum {
+ IOMMUFD_ACCESS_RW_READ = 0,
+ IOMMUFD_ACCESS_RW_WRITE = 1 << 0,
+ /* Set if the caller is in a kthread then rw will use kthread_use_mm() */
+ IOMMUFD_ACCESS_RW_KTHREAD = 1 << 1,
+
+ /* Only for use by selftest */
+ __IOMMUFD_ACCESS_RW_SLOW_PATH = 1 << 2,
+};
+
+struct iommufd_access *
+iommufd_access_create(struct iommufd_ctx *ictx, u32 ioas_id,
+ const struct iommufd_access_ops *ops, void *data);
+void iommufd_access_destroy(struct iommufd_access *access);
+
+void iommufd_ctx_get(struct iommufd_ctx *ictx);
+
+#if IS_ENABLED(CONFIG_IOMMUFD)
+struct iommufd_ctx *iommufd_ctx_from_file(struct file *file);
+void iommufd_ctx_put(struct iommufd_ctx *ictx);
+
+int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova,
+ unsigned long length, struct page **out_pages,
+ unsigned int flags);
+void iommufd_access_unpin_pages(struct iommufd_access *access,
+ unsigned long iova, unsigned long length);
+int iommufd_access_rw(struct iommufd_access *access, unsigned long iova,
+ void *data, size_t len, unsigned int flags);
+int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id);
+#else /* !CONFIG_IOMMUFD */
+static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void iommufd_ctx_put(struct iommufd_ctx *ictx)
+{
+}
+
+static inline int iommufd_access_pin_pages(struct iommufd_access *access,
+ unsigned long iova,
+ unsigned long length,
+ struct page **out_pages,
+ unsigned int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void iommufd_access_unpin_pages(struct iommufd_access *access,
+ unsigned long iova,
+ unsigned long length)
+{
+}
+
+static inline int iommufd_access_rw(struct iommufd_access *access, unsigned long iova,
+ void *data, size_t len, unsigned int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx,
+ u32 *out_ioas_id)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_IOMMUFD */
+#endif
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index f054d0360a75..4cc52698e214 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -25,7 +25,7 @@ struct user_struct {
#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \
defined(CONFIG_NET) || defined(CONFIG_IO_URING) || \
- defined(CONFIG_VFIO_PCI_ZDEV_KVM)
+ defined(CONFIG_VFIO_PCI_ZDEV_KVM) || IS_ENABLED(CONFIG_IOMMUFD)
atomic_long_t locked_vm;
#endif
#ifdef CONFIG_WATCH_QUEUE
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index fdd393f70b19..a615542df1e0 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -17,6 +17,9 @@
#include <linux/iova_bitmap.h>
struct kvm;
+struct iommufd_ctx;
+struct iommufd_device;
+struct iommufd_access;
/*
* VFIO devices can be placed in a set, this allows all devices to share this
@@ -54,6 +57,12 @@ struct vfio_device {
struct completion comp;
struct list_head group_next;
struct list_head iommu_entry;
+ struct iommufd_access *iommufd_access;
+#if IS_ENABLED(CONFIG_IOMMUFD)
+ struct iommufd_device *iommufd_device;
+ struct iommufd_ctx *iommufd_ictx;
+ bool iommufd_attached;
+#endif
};
/**
@@ -80,6 +89,10 @@ struct vfio_device_ops {
char *name;
int (*init)(struct vfio_device *vdev);
void (*release)(struct vfio_device *vdev);
+ int (*bind_iommufd)(struct vfio_device *vdev,
+ struct iommufd_ctx *ictx, u32 *out_device_id);
+ void (*unbind_iommufd)(struct vfio_device *vdev);
+ int (*attach_ioas)(struct vfio_device *vdev, u32 *pt_id);
int (*open_device)(struct vfio_device *vdev);
void (*close_device)(struct vfio_device *vdev);
ssize_t (*read)(struct vfio_device *vdev, char __user *buf,
@@ -96,6 +109,32 @@ struct vfio_device_ops {
void __user *arg, size_t argsz);
};
+#if IS_ENABLED(CONFIG_IOMMUFD)
+int vfio_iommufd_physical_bind(struct vfio_device *vdev,
+ struct iommufd_ctx *ictx, u32 *out_device_id);
+void vfio_iommufd_physical_unbind(struct vfio_device *vdev);
+int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id);
+int vfio_iommufd_emulated_bind(struct vfio_device *vdev,
+ struct iommufd_ctx *ictx, u32 *out_device_id);
+void vfio_iommufd_emulated_unbind(struct vfio_device *vdev);
+int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id);
+#else
+#define vfio_iommufd_physical_bind \
+ ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \
+ u32 *out_device_id)) NULL)
+#define vfio_iommufd_physical_unbind \
+ ((void (*)(struct vfio_device *vdev)) NULL)
+#define vfio_iommufd_physical_attach_ioas \
+ ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL)
+#define vfio_iommufd_emulated_bind \
+ ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \
+ u32 *out_device_id)) NULL)
+#define vfio_iommufd_emulated_unbind \
+ ((void (*)(struct vfio_device *vdev)) NULL)
+#define vfio_iommufd_emulated_attach_ioas \
+ ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL)
+#endif
+
/**
* @migration_set_state: Optional callback to change the migration state for
* devices that support migration. It's mandatory for
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
new file mode 100644
index 000000000000..98ebba80cfa1
--- /dev/null
+++ b/include/uapi/linux/iommufd.h
@@ -0,0 +1,347 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
+ */
+#ifndef _UAPI_IOMMUFD_H
+#define _UAPI_IOMMUFD_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define IOMMUFD_TYPE (';')
+
+/**
+ * DOC: General ioctl format
+ *
+ * The ioctl interface follows a general format to allow for extensibility. Each
+ * ioctl is passed in a structure pointer as the argument providing the size of
+ * the structure in the first u32. The kernel checks that any structure space
+ * beyond what it understands is 0. This allows userspace to use the backward
+ * compatible portion while consistently using the newer, larger, structures.
+ *
+ * ioctls use a standard meaning for common errnos:
+ *
+ * - ENOTTY: The IOCTL number itself is not supported at all
+ * - E2BIG: The IOCTL number is supported, but the provided structure has
+ * non-zero in a part the kernel does not understand.
+ * - EOPNOTSUPP: The IOCTL number is supported, and the structure is
+ * understood, however a known field has a value the kernel does not
+ * understand or support.
+ * - EINVAL: Everything about the IOCTL was understood, but a field is not
+ * correct.
+ * - ENOENT: An ID or IOVA provided does not exist.
+ * - ENOMEM: Out of memory.
+ * - EOVERFLOW: Mathematics overflowed.
+ *
+ * As well as additional errnos, within specific ioctls.
+ */
+enum {
+ IOMMUFD_CMD_BASE = 0x80,
+ IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE,
+ IOMMUFD_CMD_IOAS_ALLOC,
+ IOMMUFD_CMD_IOAS_ALLOW_IOVAS,
+ IOMMUFD_CMD_IOAS_COPY,
+ IOMMUFD_CMD_IOAS_IOVA_RANGES,
+ IOMMUFD_CMD_IOAS_MAP,
+ IOMMUFD_CMD_IOAS_UNMAP,
+ IOMMUFD_CMD_OPTION,
+ IOMMUFD_CMD_VFIO_IOAS,
+};
+
+/**
+ * struct iommu_destroy - ioctl(IOMMU_DESTROY)
+ * @size: sizeof(struct iommu_destroy)
+ * @id: iommufd object ID to destroy. Can be any destroyable object type.
+ *
+ * Destroy any object held within iommufd.
+ */
+struct iommu_destroy {
+ __u32 size;
+ __u32 id;
+};
+#define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY)
+
+/**
+ * struct iommu_ioas_alloc - ioctl(IOMMU_IOAS_ALLOC)
+ * @size: sizeof(struct iommu_ioas_alloc)
+ * @flags: Must be 0
+ * @out_ioas_id: Output IOAS ID for the allocated object
+ *
+ * Allocate an IO Address Space (IOAS) which holds an IO Virtual Address (IOVA)
+ * to memory mapping.
+ */
+struct iommu_ioas_alloc {
+ __u32 size;
+ __u32 flags;
+ __u32 out_ioas_id;
+};
+#define IOMMU_IOAS_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOC)
+
+/**
+ * struct iommu_iova_range - ioctl(IOMMU_IOVA_RANGE)
+ * @start: First IOVA
+ * @last: Inclusive last IOVA
+ *
+ * An interval in IOVA space.
+ */
+struct iommu_iova_range {
+ __aligned_u64 start;
+ __aligned_u64 last;
+};
+
+/**
+ * struct iommu_ioas_iova_ranges - ioctl(IOMMU_IOAS_IOVA_RANGES)
+ * @size: sizeof(struct iommu_ioas_iova_ranges)
+ * @ioas_id: IOAS ID to read ranges from
+ * @num_iovas: Input/Output total number of ranges in the IOAS
+ * @__reserved: Must be 0
+ * @allowed_iovas: Pointer to the output array of struct iommu_iova_range
+ * @out_iova_alignment: Minimum alignment required for mapping IOVA
+ *
+ * Query an IOAS for ranges of allowed IOVAs. Mapping IOVA outside these ranges
+ * is not allowed. num_iovas will be set to the total number of iovas and
+ * the allowed_iovas[] will be filled in as space permits.
+ *
+ * The allowed ranges are dependent on the HW path the DMA operation takes, and
+ * can change during the lifetime of the IOAS. A fresh empty IOAS will have a
+ * full range, and each attached device will narrow the ranges based on that
+ * device's HW restrictions. Detaching a device can widen the ranges. Userspace
+ * should query ranges after every attach/detach to know what IOVAs are valid
+ * for mapping.
+ *
+ * On input num_iovas is the length of the allowed_iovas array. On output it is
+ * the total number of iovas filled in. The ioctl will return -EMSGSIZE and set
+ * num_iovas to the required value if num_iovas is too small. In this case the
+ * caller should allocate a larger output array and re-issue the ioctl.
+ *
+ * out_iova_alignment returns the minimum IOVA alignment that can be given
+ * to IOMMU_IOAS_MAP/COPY. IOVA's must satisfy::
+ *
+ * starting_iova % out_iova_alignment == 0
+ * (starting_iova + length) % out_iova_alignment == 0
+ *
+ * out_iova_alignment can be 1 indicating any IOVA is allowed. It cannot
+ * be higher than the system PAGE_SIZE.
+ */
+struct iommu_ioas_iova_ranges {
+ __u32 size;
+ __u32 ioas_id;
+ __u32 num_iovas;
+ __u32 __reserved;
+ __aligned_u64 allowed_iovas;
+ __aligned_u64 out_iova_alignment;
+};
+#define IOMMU_IOAS_IOVA_RANGES _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_IOVA_RANGES)
+
+/**
+ * struct iommu_ioas_allow_iovas - ioctl(IOMMU_IOAS_ALLOW_IOVAS)
+ * @size: sizeof(struct iommu_ioas_allow_iovas)
+ * @ioas_id: IOAS ID to allow IOVAs from
+ * @num_iovas: Input/Output total number of ranges in the IOAS
+ * @__reserved: Must be 0
+ * @allowed_iovas: Pointer to array of struct iommu_iova_range
+ *
+ * Ensure a range of IOVAs are always available for allocation. If this call
+ * succeeds then IOMMU_IOAS_IOVA_RANGES will never return a list of IOVA ranges
+ * that are narrower than the ranges provided here. This call will fail if
+ * IOMMU_IOAS_IOVA_RANGES is currently narrower than the given ranges.
+ *
+ * When an IOAS is first created the IOVA_RANGES will be maximally sized, and as
+ * devices are attached the IOVA will narrow based on the device restrictions.
+ * When an allowed range is specified any narrowing will be refused, ie device
+ * attachment can fail if the device requires limiting within the allowed range.
+ *
+ * Automatic IOVA allocation is also impacted by this call. MAP will only
+ * allocate within the allowed IOVAs if they are present.
+ *
+ * This call replaces the entire allowed list with the given list.
+ */
+struct iommu_ioas_allow_iovas {
+ __u32 size;
+ __u32 ioas_id;
+ __u32 num_iovas;
+ __u32 __reserved;
+ __aligned_u64 allowed_iovas;
+};
+#define IOMMU_IOAS_ALLOW_IOVAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOW_IOVAS)
+
+/**
+ * enum iommufd_ioas_map_flags - Flags for map and copy
+ * @IOMMU_IOAS_MAP_FIXED_IOVA: If clear the kernel will compute an appropriate
+ * IOVA to place the mapping at
+ * @IOMMU_IOAS_MAP_WRITEABLE: DMA is allowed to write to this mapping
+ * @IOMMU_IOAS_MAP_READABLE: DMA is allowed to read from this mapping
+ */
+enum iommufd_ioas_map_flags {
+ IOMMU_IOAS_MAP_FIXED_IOVA = 1 << 0,
+ IOMMU_IOAS_MAP_WRITEABLE = 1 << 1,
+ IOMMU_IOAS_MAP_READABLE = 1 << 2,
+};
+
+/**
+ * struct iommu_ioas_map - ioctl(IOMMU_IOAS_MAP)
+ * @size: sizeof(struct iommu_ioas_map)
+ * @flags: Combination of enum iommufd_ioas_map_flags
+ * @ioas_id: IOAS ID to change the mapping of
+ * @__reserved: Must be 0
+ * @user_va: Userspace pointer to start mapping from
+ * @length: Number of bytes to map
+ * @iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is set
+ * then this must be provided as input.
+ *
+ * Set an IOVA mapping from a user pointer. If FIXED_IOVA is specified then the
+ * mapping will be established at iova, otherwise a suitable location based on
+ * the reserved and allowed lists will be automatically selected and returned in
+ * iova.
+ *
+ * If IOMMU_IOAS_MAP_FIXED_IOVA is specified then the iova range must currently
+ * be unused, existing IOVA cannot be replaced.
+ */
+struct iommu_ioas_map {
+ __u32 size;
+ __u32 flags;
+ __u32 ioas_id;
+ __u32 __reserved;
+ __aligned_u64 user_va;
+ __aligned_u64 length;
+ __aligned_u64 iova;
+};
+#define IOMMU_IOAS_MAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP)
+
+/**
+ * struct iommu_ioas_copy - ioctl(IOMMU_IOAS_COPY)
+ * @size: sizeof(struct iommu_ioas_copy)
+ * @flags: Combination of enum iommufd_ioas_map_flags
+ * @dst_ioas_id: IOAS ID to change the mapping of
+ * @src_ioas_id: IOAS ID to copy from
+ * @length: Number of bytes to copy and map
+ * @dst_iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is
+ * set then this must be provided as input.
+ * @src_iova: IOVA to start the copy
+ *
+ * Copy an already existing mapping from src_ioas_id and establish it in
+ * dst_ioas_id. The src iova/length must exactly match a range used with
+ * IOMMU_IOAS_MAP.
+ *
+ * This may be used to efficiently clone a subset of an IOAS to another, or as a
+ * kind of 'cache' to speed up mapping. Copy has an efficiency advantage over
+ * establishing equivalent new mappings, as internal resources are shared, and
+ * the kernel will pin the user memory only once.
+ */
+struct iommu_ioas_copy {
+ __u32 size;
+ __u32 flags;
+ __u32 dst_ioas_id;
+ __u32 src_ioas_id;
+ __aligned_u64 length;
+ __aligned_u64 dst_iova;
+ __aligned_u64 src_iova;
+};
+#define IOMMU_IOAS_COPY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_COPY)
+
+/**
+ * struct iommu_ioas_unmap - ioctl(IOMMU_IOAS_UNMAP)
+ * @size: sizeof(struct iommu_ioas_unmap)
+ * @ioas_id: IOAS ID to change the mapping of
+ * @iova: IOVA to start the unmapping at
+ * @length: Number of bytes to unmap, and return back the bytes unmapped
+ *
+ * Unmap an IOVA range. The iova/length must be a superset of a previously
+ * mapped range used with IOMMU_IOAS_MAP or IOMMU_IOAS_COPY. Splitting or
+ * truncating ranges is not allowed. The values 0 to U64_MAX will unmap
+ * everything.
+ */
+struct iommu_ioas_unmap {
+ __u32 size;
+ __u32 ioas_id;
+ __aligned_u64 iova;
+ __aligned_u64 length;
+};
+#define IOMMU_IOAS_UNMAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_UNMAP)
+
+/**
+ * enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and
+ * ioctl(IOMMU_OPTION_HUGE_PAGES)
+ * @IOMMU_OPTION_RLIMIT_MODE:
+ * Change how RLIMIT_MEMLOCK accounting works. The caller must have privilege
+ * to invoke this. Value 0 (default) is user based accouting, 1 uses process
+ * based accounting. Global option, object_id must be 0
+ * @IOMMU_OPTION_HUGE_PAGES:
+ * Value 1 (default) allows contiguous pages to be combined when generating
+ * iommu mappings. Value 0 disables combining, everything is mapped to
+ * PAGE_SIZE. This can be useful for benchmarking. This is a per-IOAS
+ * option, the object_id must be the IOAS ID.
+ */
+enum iommufd_option {
+ IOMMU_OPTION_RLIMIT_MODE = 0,
+ IOMMU_OPTION_HUGE_PAGES = 1,
+};
+
+/**
+ * enum iommufd_option_ops - ioctl(IOMMU_OPTION_OP_SET) and
+ * ioctl(IOMMU_OPTION_OP_GET)
+ * @IOMMU_OPTION_OP_SET: Set the option's value
+ * @IOMMU_OPTION_OP_GET: Get the option's value
+ */
+enum iommufd_option_ops {
+ IOMMU_OPTION_OP_SET = 0,
+ IOMMU_OPTION_OP_GET = 1,
+};
+
+/**
+ * struct iommu_option - iommu option multiplexer
+ * @size: sizeof(struct iommu_option)
+ * @option_id: One of enum iommufd_option
+ * @op: One of enum iommufd_option_ops
+ * @__reserved: Must be 0
+ * @object_id: ID of the object if required
+ * @val64: Option value to set or value returned on get
+ *
+ * Change a simple option value. This multiplexor allows controlling options
+ * on objects. IOMMU_OPTION_OP_SET will load an option and IOMMU_OPTION_OP_GET
+ * will return the current value.
+ */
+struct iommu_option {
+ __u32 size;
+ __u32 option_id;
+ __u16 op;
+ __u16 __reserved;
+ __u32 object_id;
+ __aligned_u64 val64;
+};
+#define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
+
+/**
+ * enum iommufd_vfio_ioas_op - IOMMU_VFIO_IOAS_* ioctls
+ * @IOMMU_VFIO_IOAS_GET: Get the current compatibility IOAS
+ * @IOMMU_VFIO_IOAS_SET: Change the current compatibility IOAS
+ * @IOMMU_VFIO_IOAS_CLEAR: Disable VFIO compatibility
+ */
+enum iommufd_vfio_ioas_op {
+ IOMMU_VFIO_IOAS_GET = 0,
+ IOMMU_VFIO_IOAS_SET = 1,
+ IOMMU_VFIO_IOAS_CLEAR = 2,
+};
+
+/**
+ * struct iommu_vfio_ioas - ioctl(IOMMU_VFIO_IOAS)
+ * @size: sizeof(struct iommu_vfio_ioas)
+ * @ioas_id: For IOMMU_VFIO_IOAS_SET the input IOAS ID to set
+ * For IOMMU_VFIO_IOAS_GET will output the IOAS ID
+ * @op: One of enum iommufd_vfio_ioas_op
+ * @__reserved: Must be 0
+ *
+ * The VFIO compatibility support uses a single ioas because VFIO APIs do not
+ * support the ID field. Set or Get the IOAS that VFIO compatibility will use.
+ * When VFIO_GROUP_SET_CONTAINER is used on an iommufd it will get the
+ * compatibility ioas, either by taking what is already set, or auto creating
+ * one. From then on VFIO will continue to use that ioas and is not effected by
+ * this ioctl. SET or CLEAR does not destroy any auto-created IOAS.
+ */
+struct iommu_vfio_ioas {
+ __u32 size;
+ __u32 ioas_id;
+ __u16 op;
+ __u16 __reserved;
+};
+#define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS)
+#endif