summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.clang-format3
-rw-r--r--Documentation/userspace-api/index.rst1
-rw-r--r--Documentation/userspace-api/ioctl/ioctl-number.rst1
-rw-r--r--Documentation/userspace-api/iommufd.rst223
-rw-r--r--MAINTAINERS12
-rw-r--r--drivers/iommu/Kconfig1
-rw-r--r--drivers/iommu/Makefile2
-rw-r--r--drivers/iommu/amd/iommu.c2
-rw-r--r--drivers/iommu/intel/iommu.c16
-rw-r--r--drivers/iommu/iommu.c121
-rw-r--r--drivers/iommu/iommufd/Kconfig24
-rw-r--r--drivers/iommu/iommufd/Makefile13
-rw-r--r--drivers/iommu/iommufd/device.c778
-rw-r--r--drivers/iommu/iommufd/double_span.h53
-rw-r--r--drivers/iommu/iommufd/hw_pagetable.c57
-rw-r--r--drivers/iommu/iommufd/io_pagetable.c1212
-rw-r--r--drivers/iommu/iommufd/io_pagetable.h241
-rw-r--r--drivers/iommu/iommufd/ioas.c398
-rw-r--r--drivers/iommu/iommufd/iommufd_private.h307
-rw-r--r--drivers/iommu/iommufd/iommufd_test.h93
-rw-r--r--drivers/iommu/iommufd/main.c424
-rw-r--r--drivers/iommu/iommufd/pages.c1981
-rw-r--r--drivers/iommu/iommufd/selftest.c853
-rw-r--r--drivers/iommu/iommufd/vfio_compat.c472
-rw-r--r--include/linux/interval_tree.h58
-rw-r--r--include/linux/iommu.h17
-rw-r--r--include/linux/iommufd.h98
-rw-r--r--include/linux/sched/user.h2
-rw-r--r--include/uapi/linux/iommufd.h347
-rw-r--r--kernel/user.c1
-rw-r--r--lib/Kconfig4
-rw-r--r--lib/interval_tree.c132
-rwxr-xr-xscripts/kernel-doc12
-rw-r--r--tools/testing/selftests/Makefile1
-rw-r--r--tools/testing/selftests/iommu/.gitignore3
-rw-r--r--tools/testing/selftests/iommu/Makefile12
-rw-r--r--tools/testing/selftests/iommu/config2
-rw-r--r--tools/testing/selftests/iommu/iommufd.c1654
-rw-r--r--tools/testing/selftests/iommu/iommufd_fail_nth.c580
-rw-r--r--tools/testing/selftests/iommu/iommufd_utils.h278
40 files changed, 10452 insertions, 37 deletions
diff --git a/.clang-format b/.clang-format
index 1247d54f9e49..78aba4a10b1b 100644
--- a/.clang-format
+++ b/.clang-format
@@ -440,8 +440,11 @@ ForEachMacros:
- 'inet_lhash2_for_each_icsk'
- 'inet_lhash2_for_each_icsk_continue'
- 'inet_lhash2_for_each_icsk_rcu'
+ - 'interval_tree_for_each_double_span'
+ - 'interval_tree_for_each_span'
- 'intlist__for_each_entry'
- 'intlist__for_each_entry_safe'
+ - 'iopt_for_each_contig_area'
- 'kcore_copy__for_each_phdr'
- 'key_for_each'
- 'key_for_each_safe'
diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst
index c78da9ce0ec4..f16337bdb852 100644
--- a/Documentation/userspace-api/index.rst
+++ b/Documentation/userspace-api/index.rst
@@ -25,6 +25,7 @@ place where this information is gathered.
ebpf/index
ioctl/index
iommu
+ iommufd
media/index
netlink/index
sysfs-platform_profile
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 5f81e2a24a5c..eb045fc495a4 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -105,6 +105,7 @@ Code Seq# Include File Comments
'8' all SNP8023 advanced NIC card
<mailto:mcr@solidum.com>
';' 64-7F linux/vfio.h
+';' 80-FF linux/iommufd.h
'=' 00-3f uapi/linux/ptp_clock.h <mailto:richardcochran@gmail.com>
'@' 00-0F linux/radeonfb.h conflict!
'@' 00-0F drivers/video/aty/aty128fb.c conflict!
diff --git a/Documentation/userspace-api/iommufd.rst b/Documentation/userspace-api/iommufd.rst
new file mode 100644
index 000000000000..79dd9eb51587
--- /dev/null
+++ b/Documentation/userspace-api/iommufd.rst
@@ -0,0 +1,223 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+=======
+IOMMUFD
+=======
+
+:Author: Jason Gunthorpe
+:Author: Kevin Tian
+
+Overview
+========
+
+IOMMUFD is the user API to control the IOMMU subsystem as it relates to managing
+IO page tables from userspace using file descriptors. It intends to be general
+and consumable by any driver that wants to expose DMA to userspace. These
+drivers are eventually expected to deprecate any internal IOMMU logic
+they may already/historically implement (e.g. vfio_iommu_type1.c).
+
+At minimum iommufd provides universal support of managing I/O address spaces and
+I/O page tables for all IOMMUs, with room in the design to add non-generic
+features to cater to specific hardware functionality.
+
+In this context the capital letter (IOMMUFD) refers to the subsystem while the
+small letter (iommufd) refers to the file descriptors created via /dev/iommu for
+use by userspace.
+
+Key Concepts
+============
+
+User Visible Objects
+--------------------
+
+Following IOMMUFD objects are exposed to userspace:
+
+- IOMMUFD_OBJ_IOAS, representing an I/O address space (IOAS), allowing map/unmap
+ of user space memory into ranges of I/O Virtual Address (IOVA).
+
+ The IOAS is a functional replacement for the VFIO container, and like the VFIO
+ container it copies an IOVA map to a list of iommu_domains held within it.
+
+- IOMMUFD_OBJ_DEVICE, representing a device that is bound to iommufd by an
+ external driver.
+
+- IOMMUFD_OBJ_HW_PAGETABLE, representing an actual hardware I/O page table
+ (i.e. a single struct iommu_domain) managed by the iommu driver.
+
+ The IOAS has a list of HW_PAGETABLES that share the same IOVA mapping and
+ it will synchronize its mapping with each member HW_PAGETABLE.
+
+All user-visible objects are destroyed via the IOMMU_DESTROY uAPI.
+
+The diagram below shows relationship between user-visible objects and kernel
+datastructures (external to iommufd), with numbers referred to operations
+creating the objects and links::
+
+ _________________________________________________________
+ | iommufd |
+ | [1] |
+ | _________________ |
+ | | | |
+ | | | |
+ | | | |
+ | | | |
+ | | | |
+ | | | |
+ | | | [3] [2] |
+ | | | ____________ __________ |
+ | | IOAS |<--| |<------| | |
+ | | | |HW_PAGETABLE| | DEVICE | |
+ | | | |____________| |__________| |
+ | | | | | |
+ | | | | | |
+ | | | | | |
+ | | | | | |
+ | | | | | |
+ | |_________________| | | |
+ | | | | |
+ |_________|___________________|___________________|_______|
+ | | |
+ | _____v______ _______v_____
+ | PFN storage | | | |
+ |------------>|iommu_domain| |struct device|
+ |____________| |_____________|
+
+1. IOMMUFD_OBJ_IOAS is created via the IOMMU_IOAS_ALLOC uAPI. An iommufd can
+ hold multiple IOAS objects. IOAS is the most generic object and does not
+ expose interfaces that are specific to single IOMMU drivers. All operations
+ on the IOAS must operate equally on each of the iommu_domains inside of it.
+
+2. IOMMUFD_OBJ_DEVICE is created when an external driver calls the IOMMUFD kAPI
+ to bind a device to an iommufd. The driver is expected to implement a set of
+ ioctls to allow userspace to initiate the binding operation. Successful
+ completion of this operation establishes the desired DMA ownership over the
+ device. The driver must also set the driver_managed_dma flag and must not
+ touch the device until this operation succeeds.
+
+3. IOMMUFD_OBJ_HW_PAGETABLE is created when an external driver calls the IOMMUFD
+ kAPI to attach a bound device to an IOAS. Similarly the external driver uAPI
+ allows userspace to initiate the attaching operation. If a compatible
+ pagetable already exists then it is reused for the attachment. Otherwise a
+ new pagetable object and iommu_domain is created. Successful completion of
+ this operation sets up the linkages among IOAS, device and iommu_domain. Once
+ this completes the device could do DMA.
+
+ Every iommu_domain inside the IOAS is also represented to userspace as a
+ HW_PAGETABLE object.
+
+ .. note::
+
+ Future IOMMUFD updates will provide an API to create and manipulate the
+ HW_PAGETABLE directly.
+
+A device can only bind to an iommufd due to DMA ownership claim and attach to at
+most one IOAS object (no support of PASID yet).
+
+Kernel Datastructure
+--------------------
+
+User visible objects are backed by following datastructures:
+
+- iommufd_ioas for IOMMUFD_OBJ_IOAS.
+- iommufd_device for IOMMUFD_OBJ_DEVICE.
+- iommufd_hw_pagetable for IOMMUFD_OBJ_HW_PAGETABLE.
+
+Several terminologies when looking at these datastructures:
+
+- Automatic domain - refers to an iommu domain created automatically when
+ attaching a device to an IOAS object. This is compatible to the semantics of
+ VFIO type1.
+
+- Manual domain - refers to an iommu domain designated by the user as the
+ target pagetable to be attached to by a device. Though currently there are
+ no uAPIs to directly create such domain, the datastructure and algorithms
+ are ready for handling that use case.
+
+- In-kernel user - refers to something like a VFIO mdev that is using the
+ IOMMUFD access interface to access the IOAS. This starts by creating an
+ iommufd_access object that is similar to the domain binding a physical device
+ would do. The access object will then allow converting IOVA ranges into struct
+ page * lists, or doing direct read/write to an IOVA.
+
+iommufd_ioas serves as the metadata datastructure to manage how IOVA ranges are
+mapped to memory pages, composed of:
+
+- struct io_pagetable holding the IOVA map
+- struct iopt_area's representing populated portions of IOVA
+- struct iopt_pages representing the storage of PFNs
+- struct iommu_domain representing the IO page table in the IOMMU
+- struct iopt_pages_access representing in-kernel users of PFNs
+- struct xarray pinned_pfns holding a list of pages pinned by in-kernel users
+
+Each iopt_pages represents a logical linear array of full PFNs. The PFNs are
+ultimately derived from userspace VAs via an mm_struct. Once they have been
+pinned the PFNs are stored in IOPTEs of an iommu_domain or inside the pinned_pfns
+xarray if they have been pinned through an iommufd_access.
+
+PFN have to be copied between all combinations of storage locations, depending
+on what domains are present and what kinds of in-kernel "software access" users
+exist. The mechanism ensures that a page is pinned only once.
+
+An io_pagetable is composed of iopt_areas pointing at iopt_pages, along with a
+list of iommu_domains that mirror the IOVA to PFN map.
+
+Multiple io_pagetable-s, through their iopt_area-s, can share a single
+iopt_pages which avoids multi-pinning and double accounting of page
+consumption.
+
+iommufd_ioas is sharable between subsystems, e.g. VFIO and VDPA, as long as
+devices managed by different subsystems are bound to a same iommufd.
+
+IOMMUFD User API
+================
+
+.. kernel-doc:: include/uapi/linux/iommufd.h
+
+IOMMUFD Kernel API
+==================
+
+The IOMMUFD kAPI is device-centric with group-related tricks managed behind the
+scene. This allows the external drivers calling such kAPI to implement a simple
+device-centric uAPI for connecting its device to an iommufd, instead of
+explicitly imposing the group semantics in its uAPI as VFIO does.
+
+.. kernel-doc:: drivers/iommu/iommufd/device.c
+ :export:
+
+.. kernel-doc:: drivers/iommu/iommufd/main.c
+ :export:
+
+VFIO and IOMMUFD
+----------------
+
+Connecting a VFIO device to iommufd can be done in two ways.
+
+First is a VFIO compatible way by directly implementing the /dev/vfio/vfio
+container IOCTLs by mapping them into io_pagetable operations. Doing so allows
+the use of iommufd in legacy VFIO applications by symlinking /dev/vfio/vfio to
+/dev/iommufd or extending VFIO to SET_CONTAINER using an iommufd instead of a
+container fd.
+
+The second approach directly extends VFIO to support a new set of device-centric
+user API based on aforementioned IOMMUFD kernel API. It requires userspace
+change but better matches the IOMMUFD API semantics and easier to support new
+iommufd features when comparing it to the first approach.
+
+Currently both approaches are still work-in-progress.
+
+There are still a few gaps to be resolved to catch up with VFIO type1, as
+documented in iommufd_vfio_check_extension().
+
+Future TODOs
+============
+
+Currently IOMMUFD supports only kernel-managed I/O page table, similar to VFIO
+type1. New features on the radar include:
+
+ - Binding iommu_domain's to PASID/SSID
+ - Userspace page tables, for ARM, x86 and S390
+ - Kernel bypass'd invalidation of user page tables
+ - Re-use of the KVM page table in the IOMMU
+ - Dirty page tracking in the IOMMU
+ - Runtime Increase/Decrease of IOPTE size
+ - PRI support with faults resolved in userspace
diff --git a/MAINTAINERS b/MAINTAINERS
index 379945f82a64..c0a93779731d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10717,6 +10717,18 @@ F: drivers/iommu/dma-iommu.h
F: drivers/iommu/iova.c
F: include/linux/iova.h
+IOMMUFD
+M: Jason Gunthorpe <jgg@nvidia.com>
+M: Kevin Tian <kevin.tian@intel.com>
+L: iommu@lists.linux.dev
+S: Maintained
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd.git
+F: Documentation/userspace-api/iommufd.rst
+F: drivers/iommu/iommufd/
+F: include/linux/iommufd.h
+F: include/uapi/linux/iommufd.h
+F: tools/testing/selftests/iommu/
+
IOMMU SUBSYSTEM
M: Joerg Roedel <joro@8bytes.org>
M: Will Deacon <will@kernel.org>
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index dc5f7a156ff5..319966cde5cf 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -188,6 +188,7 @@ config MSM_IOMMU
source "drivers/iommu/amd/Kconfig"
source "drivers/iommu/intel/Kconfig"
+source "drivers/iommu/iommufd/Kconfig"
config IRQ_REMAP
bool "Support for Interrupt Remapping"
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 7fbf6a337662..f461d0651385 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-obj-y += amd/ intel/ arm/
+obj-y += amd/ intel/ arm/ iommufd/
obj-$(CONFIG_IOMMU_API) += iommu.o
obj-$(CONFIG_IOMMU_API) += iommu-traces.o
obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 45299eb7e8e3..240c535e317c 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2278,6 +2278,8 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
return false;
case IOMMU_CAP_PRE_BOOT_PROTECTION:
return amdr_ivrs_remap_support;
+ case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
+ return true;
default:
break;
}
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index f298e51d5aa6..157c97274110 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4450,14 +4450,20 @@ static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
{
- if (cap == IOMMU_CAP_CACHE_COHERENCY)
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+
+ switch (cap) {
+ case IOMMU_CAP_CACHE_COHERENCY:
return true;
- if (cap == IOMMU_CAP_INTR_REMAP)
+ case IOMMU_CAP_INTR_REMAP:
return irq_remapping_enabled == 1;
- if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
+ case IOMMU_CAP_PRE_BOOT_PROTECTION:
return dmar_platform_optin();
-
- return false;
+ case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
+ return ecap_sc_support(info->iommu->ecap);
+ default:
+ return false;
+ }
}
static struct iommu_device *intel_iommu_probe_device(struct device *dev)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 6ca377f4fbf9..d69ebba81beb 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -3108,41 +3108,49 @@ static int __iommu_group_alloc_blocking_domain(struct iommu_group *group)
return 0;
}
+static int __iommu_take_dma_ownership(struct iommu_group *group, void *owner)
+{
+ int ret;
+
+ if ((group->domain && group->domain != group->default_domain) ||
+ !xa_empty(&group->pasid_array))
+ return -EBUSY;
+
+ ret = __iommu_group_alloc_blocking_domain(group);
+ if (ret)
+ return ret;
+ ret = __iommu_group_set_domain(group, group->blocking_domain);
+ if (ret)
+ return ret;
+
+ group->owner = owner;
+ group->owner_cnt++;
+ return 0;
+}
+
/**
* iommu_group_claim_dma_owner() - Set DMA ownership of a group
* @group: The group.
* @owner: Caller specified pointer. Used for exclusive ownership.
*
- * This is to support backward compatibility for vfio which manages
- * the dma ownership in iommu_group level. New invocations on this
- * interface should be prohibited.
+ * This is to support backward compatibility for vfio which manages the dma
+ * ownership in iommu_group level. New invocations on this interface should be
+ * prohibited. Only a single owner may exist for a group.
*/
int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner)
{
int ret = 0;
+ if (WARN_ON(!owner))
+ return -EINVAL;
+
mutex_lock(&group->mutex);
if (group->owner_cnt) {
ret = -EPERM;
goto unlock_out;
- } else {
- if ((group->domain && group->domain != group->default_domain) ||
- !xa_empty(&group->pasid_array)) {
- ret = -EBUSY;
- goto unlock_out;
- }
-
- ret = __iommu_group_alloc_blocking_domain(group);
- if (ret)
- goto unlock_out;
-
- ret = __iommu_group_set_domain(group, group->blocking_domain);
- if (ret)
- goto unlock_out;
- group->owner = owner;
}
- group->owner_cnt++;
+ ret = __iommu_take_dma_ownership(group, owner);
unlock_out:
mutex_unlock(&group->mutex);
@@ -3151,31 +3159,92 @@ unlock_out:
EXPORT_SYMBOL_GPL(iommu_group_claim_dma_owner);
/**
- * iommu_group_release_dma_owner() - Release DMA ownership of a group
- * @group: The group.
+ * iommu_device_claim_dma_owner() - Set DMA ownership of a device
+ * @dev: The device.
+ * @owner: Caller specified pointer. Used for exclusive ownership.
*
- * Release the DMA ownership claimed by iommu_group_claim_dma_owner().
+ * Claim the DMA ownership of a device. Multiple devices in the same group may
+ * concurrently claim ownership if they present the same owner value. Returns 0
+ * on success and error code on failure
*/
-void iommu_group_release_dma_owner(struct iommu_group *group)
+int iommu_device_claim_dma_owner(struct device *dev, void *owner)
{
- int ret;
+ struct iommu_group *group = iommu_group_get(dev);
+ int ret = 0;
+
+ if (!group)
+ return -ENODEV;
+ if (WARN_ON(!owner))
+ return -EINVAL;
mutex_lock(&group->mutex);
+ if (group->owner_cnt) {
+ if (group->owner != owner) {
+ ret = -EPERM;
+ goto unlock_out;
+ }
+ group->owner_cnt++;
+ goto unlock_out;
+ }
+
+ ret = __iommu_take_dma_ownership(group, owner);
+unlock_out:
+ mutex_unlock(&group->mutex);
+ iommu_group_put(group);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_device_claim_dma_owner);
+
+static void __iommu_release_dma_ownership(struct iommu_group *group)
+{
+ int ret;
+
if (WARN_ON(!group->owner_cnt || !group->owner ||
!xa_empty(&group->pasid_array)))
- goto unlock_out;
+ return;
group->owner_cnt = 0;
group->owner = NULL;
ret = __iommu_group_set_domain(group, group->default_domain);
WARN(ret, "iommu driver failed to attach the default domain");
+}
-unlock_out:
+/**
+ * iommu_group_release_dma_owner() - Release DMA ownership of a group
+ * @dev: The device
+ *
+ * Release the DMA ownership claimed by iommu_group_claim_dma_owner().
+ */
+void iommu_group_release_dma_owner(struct iommu_group *group)
+{
+ mutex_lock(&group->mutex);
+ __iommu_release_dma_ownership(group);
mutex_unlock(&group->mutex);
}
EXPORT_SYMBOL_GPL(iommu_group_release_dma_owner);
/**
+ * iommu_device_release_dma_owner() - Release DMA ownership of a device
+ * @group: The device.
+ *
+ * Release the DMA ownership claimed by iommu_device_claim_dma_owner().
+ */
+void iommu_device_release_dma_owner(struct device *dev)
+{
+ struct iommu_group *group = iommu_group_get(dev);
+
+ mutex_lock(&group->mutex);
+ if (group->owner_cnt > 1)
+ group->owner_cnt--;
+ else
+ __iommu_release_dma_ownership(group);
+ mutex_unlock(&group->mutex);
+ iommu_group_put(group);
+}
+EXPORT_SYMBOL_GPL(iommu_device_release_dma_owner);
+
+/**
* iommu_group_dma_owner_claimed() - Query group dma ownership status
* @group: The group.
*
diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig
new file mode 100644
index 000000000000..871244f2443f
--- /dev/null
+++ b/drivers/iommu/iommufd/Kconfig
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config IOMMUFD
+ tristate "IOMMU Userspace API"
+ select INTERVAL_TREE
+ select INTERVAL_TREE_SPAN_ITER
+ select IOMMU_API
+ default n
+ help
+ Provides /dev/iommu, the user API to control the IOMMU subsystem as
+ it relates to managing IO page tables that point at user space memory.
+
+ If you don't know what to do here, say N.
+
+if IOMMUFD
+config IOMMUFD_TEST
+ bool "IOMMU Userspace API Test support"
+ depends on DEBUG_KERNEL
+ depends on FAULT_INJECTION
+ depends on RUNTIME_TESTING_MENU
+ default n
+ help
+ This is dangerous, do not enable unless running
+ tools/testing/selftests/iommu
+endif
diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
new file mode 100644
index 000000000000..8aeba81800c5
--- /dev/null
+++ b/drivers/iommu/iommufd/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
+iommufd-y := \
+ device.o \
+ hw_pagetable.o \
+ io_pagetable.o \
+ ioas.o \
+ main.o \
+ pages.o \
+ vfio_compat.o
+
+iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
+
+obj-$(CONFIG_IOMMUFD) += iommufd.o
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
new file mode 100644
index 000000000000..dd2a415b603e
--- /dev/null
+++ b/drivers/iommu/iommufd/device.c
@@ -0,0 +1,778 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#include <linux/iommufd.h>
+#include <linux/slab.h>
+#include <linux/iommu.h>
+#include <linux/irqdomain.h>
+
+#include "io_pagetable.h"
+#include "iommufd_private.h"
+
+static bool allow_unsafe_interrupts;
+module_param(allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(
+ allow_unsafe_interrupts,
+ "Allow IOMMUFD to bind to devices even if the platform cannot isolate "
+ "the MSI interrupt window. Enabling this is a security weakness.");
+
+/*
+ * A iommufd_device object represents the binding relationship between a
+ * consuming driver and the iommufd. These objects are created/destroyed by
+ * external drivers, not by userspace.
+ */
+struct iommufd_device {
+ struct iommufd_object obj;
+ struct iommufd_ctx *ictx;
+ struct iommufd_hw_pagetable *hwpt;
+ /* Head at iommufd_hw_pagetable::devices */
+ struct list_head devices_item;
+ /* always the physical device */
+ struct device *dev;
+ struct iommu_group *group;
+ bool enforce_cache_coherency;
+};
+
+void iommufd_device_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_device *idev =
+ container_of(obj, struct iommufd_device, obj);
+
+ iommu_device_release_dma_owner(idev->dev);
+ iommu_group_put(idev->group);
+ iommufd_ctx_put(idev->ictx);
+}
+
+/**
+ * iommufd_device_bind - Bind a physical device to an iommu fd
+ * @ictx: iommufd file descriptor
+ * @dev: Pointer to a physical device struct
+ * @id: Output ID number to return to userspace for this device
+ *
+ * A successful bind establishes an ownership over the device and returns
+ * struct iommufd_device pointer, otherwise returns error pointer.
+ *
+ * A driver using this API must set driver_managed_dma and must not touch
+ * the device until this routine succeeds and establishes ownership.
+ *
+ * Binding a PCI device places the entire RID under iommufd control.
+ *
+ * The caller must undo this with iommufd_device_unbind()
+ */
+struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
+ struct device *dev, u32 *id)
+{
+ struct iommufd_device *idev;
+ struct iommu_group *group;
+ int rc;
+
+ /*
+ * iommufd always sets IOMMU_CACHE because we offer no way for userspace
+ * to restore cache coherency.
+ */
+ if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY))
+ return ERR_PTR(-EINVAL);
+
+ group = iommu_group_get(dev);
+ if (!group)
+ return ERR_PTR(-ENODEV);
+
+ rc = iommu_device_claim_dma_owner(dev, ictx);
+ if (rc)
+ goto out_group_put;
+
+ idev = iommufd_object_alloc(ictx, idev, IOMMUFD_OBJ_DEVICE);
+ if (IS_ERR(idev)) {
+ rc = PTR_ERR(idev);
+ goto out_release_owner;
+ }
+ idev->ictx = ictx;
+ iommufd_ctx_get(ictx);
+ idev->dev = dev;
+ idev->enforce_cache_coherency =
+ device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
+ /* The calling driver is a user until iommufd_device_unbind() */
+ refcount_inc(&idev->obj.users);
+ /* group refcount moves into iommufd_device */
+ idev->group = group;
+
+ /*
+ * If the caller fails after this success it must call
+ * iommufd_unbind_device() which is safe since we hold this refcount.
+ * This also means the device is a leaf in the graph and no other object
+ * can take a reference on it.
+ */
+ iommufd_object_finalize(ictx, &idev->obj);
+ *id = idev->obj.id;
+ return idev;
+
+out_release_owner:
+ iommu_device_release_dma_owner(dev);
+out_group_put:
+ iommu_group_put(group);
+ return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD);
+
+/**
+ * iommufd_device_unbind - Undo iommufd_device_bind()
+ * @idev: Device returned by iommufd_device_bind()
+ *
+ * Release the device from iommufd control. The DMA ownership will return back
+ * to unowned with DMA controlled by the DMA API. This invalidates the
+ * iommufd_device pointer, other APIs that consume it must not be called
+ * concurrently.
+ */
+void iommufd_device_unbind(struct iommufd_device *idev)
+{
+ bool was_destroyed;
+
+ was_destroyed = iommufd_object_destroy_user(idev->ictx, &idev->obj);
+ WARN_ON(!was_destroyed);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD);
+
+static int iommufd_device_setup_msi(struct iommufd_device *idev,
+ struct iommufd_hw_pagetable *hwpt,
+ phys_addr_t sw_msi_start)
+{
+ int rc;
+
+ /*
+ * IOMMU_CAP_INTR_REMAP means that the platform is isolating MSI, and it
+ * creates the MSI window by default in the iommu domain. Nothing
+ * further to do.
+ */
+ if (device_iommu_capable(idev->dev, IOMMU_CAP_INTR_REMAP))
+ return 0;
+
+ /*
+ * On ARM systems that set the global IRQ_DOMAIN_FLAG_MSI_REMAP every
+ * allocated iommu_domain will block interrupts by default and this
+ * special flow is needed to turn them back on. iommu_dma_prepare_msi()
+ * will install pages into our domain after request_irq() to make this
+ * work.
+ *
+ * FIXME: This is conceptually broken for iommufd since we want to allow
+ * userspace to change the domains, eg switch from an identity IOAS to a
+ * DMA IOAS. There is currently no way to create a MSI window that
+ * matches what the IRQ layer actually expects in a newly created
+ * domain.
+ */
+ if (irq_domain_check_msi_remap()) {
+ if (WARN_ON(!sw_msi_start))
+ return -EPERM;
+ /*
+ * iommu_get_msi_cookie() can only be called once per domain,
+ * it returns -EBUSY on later calls.
+ */
+ if (hwpt->msi_cookie)
+ return 0;
+ rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start);
+ if (rc)
+ return rc;
+ hwpt->msi_cookie = true;
+ return 0;
+ }
+
+ /*
+ * Otherwise the platform has a MSI window that is not isolated. For
+ * historical compat with VFIO allow a module parameter to ignore the
+ * insecurity.
+ */
+ if (!allow_unsafe_interrupts)
+ return -EPERM;
+
+ dev_warn(
+ idev->dev,
+ "MSI interrupt window cannot be isolated by the IOMMU, this platform is insecure. Use the \"allow_unsafe_interrupts\" module parameter to override\n");
+ return 0;
+}
+
+static bool iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt,
+ struct iommu_group *group)
+{
+ struct iommufd_device *cur_dev;
+
+ list_for_each_entry(cur_dev, &hwpt->devices, devices_item)
+ if (cur_dev->group == group)
+ return true;
+ return false;
+}
+
+static int iommufd_device_do_attach(struct iommufd_device *idev,
+ struct iommufd_hw_pagetable *hwpt)
+{
+ phys_addr_t sw_msi_start = 0;
+ int rc;
+
+ mutex_lock(&hwpt->devices_lock);
+
+ /*
+ * Try to upgrade the domain we have, it is an iommu driver bug to
+ * report IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail
+ * enforce_cache_coherency when there are no devices attached to the
+ * domain.
+ */
+ if (idev->enforce_cache_coherency && !hwpt->enforce_cache_coherency) {
+ if (hwpt->domain->ops->enforce_cache_coherency)
+ hwpt->enforce_cache_coherency =
+ hwpt->domain->ops->enforce_cache_coherency(
+ hwpt->domain);
+ if (!hwpt->enforce_cache_coherency) {
+ WARN_ON(list_empty(&hwpt->devices));
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+ }
+
+ rc = iopt_table_enforce_group_resv_regions(&hwpt->ioas->iopt, idev->dev,
+ idev->group, &sw_msi_start);
+ if (rc)
+ goto out_unlock;
+
+ rc = iommufd_device_setup_msi(idev, hwpt, sw_msi_start);
+ if (rc)
+ goto out_iova;
+
+ /*
+ * FIXME: Hack around missing a device-centric iommu api, only attach to
+ * the group once for the first device that is in the group.
+ */
+ if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
+ rc = iommu_attach_group(hwpt->domain, idev->group);
+ if (rc)
+ goto out_iova;
+
+ if (list_empty(&hwpt->devices)) {
+ rc = iopt_table_add_domain(&hwpt->ioas->iopt,
+ hwpt->domain);
+ if (rc)
+ goto out_detach;
+ }
+ }
+
+ idev->hwpt = hwpt;
+ refcount_inc(&hwpt->obj.users);
+ list_add(&idev->devices_item, &hwpt->devices);
+ mutex_unlock(&hwpt->devices_lock);
+ return 0;
+
+out_detach:
+ iommu_detach_group(hwpt->domain, idev->group);
+out_iova:
+ iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+out_unlock:
+ mutex_unlock(&hwpt->devices_lock);
+ return rc;
+}
+
+/*
+ * When automatically managing the domains we search for a compatible domain in
+ * the iopt and if one is found use it, otherwise create a new domain.
+ * Automatic domain selection will never pick a manually created domain.
+ */
+static int iommufd_device_auto_get_domain(struct iommufd_device *idev,
+ struct iommufd_ioas *ioas)
+{
+ struct iommufd_hw_pagetable *hwpt;
+ int rc;
+
+ /*
+ * There is no differentiation when domains are allocated, so any domain
+ * that is willing to attach to the device is interchangeable with any
+ * other.
+ */
+ mutex_lock(&ioas->mutex);
+ list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
+ if (!hwpt->auto_domain)
+ continue;
+
+ rc = iommufd_device_do_attach(idev, hwpt);
+
+ /*
+ * -EINVAL means the domain is incompatible with the device.
+ * Other error codes should propagate to userspace as failure.
+ * Success means the domain is attached.
+ */
+ if (rc == -EINVAL)
+ continue;
+ goto out_unlock;
+ }
+
+ hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev->dev);
+ if (IS_ERR(hwpt)) {
+ rc = PTR_ERR(hwpt);
+ goto out_unlock;
+ }
+ hwpt->auto_domain = true;
+
+ rc = iommufd_device_do_attach(idev, hwpt);
+ if (rc)
+ goto out_abort;
+ list_add_tail(&hwpt->hwpt_item, &ioas->hwpt_list);
+
+ mutex_unlock(&ioas->mutex);
+ iommufd_object_finalize(idev->ictx, &hwpt->obj);
+ return 0;
+
+out_abort:
+ iommufd_object_abort_and_destroy(idev->ictx, &hwpt->obj);
+out_unlock:
+ mutex_unlock(&ioas->mutex);
+ return rc;
+}
+
+/**
+ * iommufd_device_attach - Connect a device from an iommu_domain
+ * @idev: device to attach
+ * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE
+ * Output the IOMMUFD_OBJ_HW_PAGETABLE ID
+ *
+ * This connects the device to an iommu_domain, either automatically or manually
+ * selected. Once this completes the device could do DMA.
+ *
+ * The caller should return the resulting pt_id back to userspace.
+ * This function is undone by calling iommufd_device_detach().
+ */
+int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id)
+{
+ struct iommufd_object *pt_obj;
+ int rc;
+
+ pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY);
+ if (IS_ERR(pt_obj))
+ return PTR_ERR(pt_obj);
+
+ switch (pt_obj->type) {
+ case IOMMUFD_OBJ_HW_PAGETABLE: {
+ struct iommufd_hw_pagetable *hwpt =
+ container_of(pt_obj, struct iommufd_hw_pagetable, obj);
+
+ rc = iommufd_device_do_attach(idev, hwpt);
+ if (rc)
+ goto out_put_pt_obj;
+
+ mutex_lock(&hwpt->ioas->mutex);
+ list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list);
+ mutex_unlock(&hwpt->ioas->mutex);
+ break;
+ }
+ case IOMMUFD_OBJ_IOAS: {
+ struct iommufd_ioas *ioas =
+ container_of(pt_obj, struct iommufd_ioas, obj);
+
+ rc = iommufd_device_auto_get_domain(idev, ioas);
+ if (rc)
+ goto out_put_pt_obj;
+ break;
+ }
+ default:
+ rc = -EINVAL;
+ goto out_put_pt_obj;
+ }
+
+ refcount_inc(&idev->obj.users);
+ *pt_id = idev->hwpt->obj.id;
+ rc = 0;
+
+out_put_pt_obj:
+ iommufd_put_object(pt_obj);
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD);
+
+/**
+ * iommufd_device_detach - Disconnect a device to an iommu_domain
+ * @idev: device to detach
+ *
+ * Undo iommufd_device_attach(). This disconnects the idev from the previously
+ * attached pt_id. The device returns back to a blocked DMA translation.
+ */
+void iommufd_device_detach(struct iommufd_device *idev)
+{
+ struct iommufd_hw_pagetable *hwpt = idev->hwpt;
+
+ mutex_lock(&hwpt->ioas->mutex);
+ mutex_lock(&hwpt->devices_lock);
+ list_del(&idev->devices_item);
+ if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
+ if (list_empty(&hwpt->devices)) {
+ iopt_table_remove_domain(&hwpt->ioas->iopt,
+ hwpt->domain);
+ list_del(&hwpt->hwpt_item);
+ }
+ iommu_detach_group(hwpt->domain, idev->group);
+ }
+ iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+ mutex_unlock(&hwpt->devices_lock);
+ mutex_unlock(&hwpt->ioas->mutex);
+
+ if (hwpt->auto_domain)
+ iommufd_object_destroy_user(idev->ictx, &hwpt->obj);
+ else
+ refcount_dec(&hwpt->obj.users);
+
+ idev->hwpt = NULL;
+
+ refcount_dec(&idev->obj.users);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD);
+
+void iommufd_access_destroy_object(struct iommufd_object *obj)
+{
+ struct iommufd_access *access =
+ container_of(obj, struct iommufd_access, obj);
+
+ iopt_remove_access(&access->ioas->iopt, access);
+ iommufd_ctx_put(access->ictx);
+ refcount_dec(&access->ioas->obj.users);
+}
+
+/**
+ * iommufd_access_create - Create an iommufd_access
+ * @ictx: iommufd file descriptor
+ * @ioas_id: ID for a IOMMUFD_OBJ_IOAS
+ * @ops: Driver's ops to associate with the access
+ * @data: Opaque data to pass into ops functions
+ *
+ * An iommufd_access allows a driver to read/write to the IOAS without using
+ * DMA. The underlying CPU memory can be accessed using the
+ * iommufd_access_pin_pages() or iommufd_access_rw() functions.
+ *
+ * The provided ops are required to use iommufd_access_pin_pages().
+ */
+struct iommufd_access *
+iommufd_access_create(struct iommufd_ctx *ictx, u32 ioas_id,
+ const struct iommufd_access_ops *ops, void *data)
+{
+ struct iommufd_access *access;
+ struct iommufd_object *obj;
+ int rc;
+
+ /*
+ * There is no uAPI for the access object, but to keep things symmetric
+ * use the object infrastructure anyhow.
+ */
+ access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS);
+ if (IS_ERR(access))
+ return access;
+
+ access->data = data;
+ access->ops = ops;
+
+ obj = iommufd_get_object(ictx, ioas_id, IOMMUFD_OBJ_IOAS);
+ if (IS_ERR(obj)) {
+ rc = PTR_ERR(obj);
+ goto out_abort;
+ }
+ access->ioas = container_of(obj, struct iommufd_ioas, obj);
+ iommufd_ref_to_users(obj);
+
+ if (ops->needs_pin_pages)
+ access->iova_alignment = PAGE_SIZE;
+ else
+ access->iova_alignment = 1;
+ rc = iopt_add_access(&access->ioas->iopt, access);
+ if (rc)
+ goto out_put_ioas;
+
+ /* The calling driver is a user until iommufd_access_destroy() */
+ refcount_inc(&access->obj.users);
+ access->ictx = ictx;
+ iommufd_ctx_get(ictx);
+ iommufd_object_finalize(ictx, &access->obj);
+ return access;
+out_put_ioas:
+ refcount_dec(&access->ioas->obj.users);
+out_abort:
+ iommufd_object_abort(ictx, &access->obj);
+ return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_access_create, IOMMUFD);
+
+/**
+ * iommufd_access_destroy - Destroy an iommufd_access
+ * @access: The access to destroy
+ *
+ * The caller must stop using the access before destroying it.
+ */
+void iommufd_access_destroy(struct iommufd_access *access)
+{
+ bool was_destroyed;
+
+ was_destroyed = iommufd_object_destroy_user(access->ictx, &access->obj);
+ WARN_ON(!was_destroyed);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, IOMMUFD);
+
+/**
+ * iommufd_access_notify_unmap - Notify users of an iopt to stop using it
+ * @iopt: iopt to work on
+ * @iova: Starting iova in the iopt
+ * @length: Number of bytes
+ *
+ * After this function returns there should be no users attached to the pages
+ * linked to this iopt that intersect with iova,length. Anyone that has attached
+ * a user through iopt_access_pages() needs to detach it through
+ * iommufd_access_unpin_pages() before this function returns.
+ *
+ * iommufd_access_destroy() will wait for any outstanding unmap callback to
+ * complete. Once iommufd_access_destroy() no unmap ops are running or will
+ * run in the future. Due to this a driver must not create locking that prevents
+ * unmap to complete while iommufd_access_destroy() is running.
+ */
+void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
+ unsigned long length)
+{
+ struct iommufd_ioas *ioas =
+ container_of(iopt, struct iommufd_ioas, iopt);
+ struct iommufd_access *access;
+ unsigned long index;
+
+ xa_lock(&ioas->iopt.access_list);
+ xa_for_each(&ioas->iopt.access_list, index, access) {
+ if (!iommufd_lock_obj(&access->obj))
+ continue;
+ xa_unlock(&ioas->iopt.access_list);
+
+ access->ops->unmap(access->data, iova, length);
+
+ iommufd_put_object(&access->obj);
+ xa_lock(&ioas->iopt.access_list);
+ }
+ xa_unlock(&ioas->iopt.access_list);
+}
+
+/**
+ * iommufd_access_unpin_pages() - Undo iommufd_access_pin_pages
+ * @access: IOAS access to act on
+ * @iova: Starting IOVA
+ * @length: Number of bytes to access
+ *
+ * Return the struct page's. The caller must stop accessing them before calling
+ * this. The iova/length must exactly match the one provided to access_pages.
+ */
+void iommufd_access_unpin_pages(struct iommufd_access *access,
+ unsigned long iova, unsigned long length)
+{
+ struct io_pagetable *iopt = &access->ioas->iopt;
+ struct iopt_area_contig_iter iter;
+ unsigned long last_iova;
+ struct iopt_area *area;
+
+ if (WARN_ON(!length) ||
+ WARN_ON(check_add_overflow(iova, length - 1, &last_iova)))
+ return;
+
+ down_read(&iopt->iova_rwsem);
+ iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova)
+ iopt_area_remove_access(
+ area, iopt_area_iova_to_index(area, iter.cur_iova),
+ iopt_area_iova_to_index(
+ area,
+ min(last_iova, iopt_area_last_iova(area))));
+ up_read(&iopt->iova_rwsem);
+ WARN_ON(!iopt_area_contig_done(&iter));
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, IOMMUFD);
+
+static bool iopt_area_contig_is_aligned(struct iopt_area_contig_iter *iter)
+{
+ if (iopt_area_start_byte(iter->area, iter->cur_iova) % PAGE_SIZE)
+ return false;
+
+ if (!iopt_area_contig_done(iter) &&
+ (iopt_area_start_byte(iter->area, iopt_area_last_iova(iter->area)) %
+ PAGE_SIZE) != (PAGE_SIZE - 1))
+ return false;
+ return true;
+}
+
+static bool check_area_prot(struct iopt_area *area, unsigned int flags)
+{
+ if (flags & IOMMUFD_ACCESS_RW_WRITE)
+ return area->iommu_prot & IOMMU_WRITE;
+ return area->iommu_prot & IOMMU_READ;
+}
+
+/**
+ * iommufd_access_pin_pages() - Return a list of pages under the iova
+ * @access: IOAS access to act on
+ * @iova: Starting IOVA
+ * @length: Number of bytes to access
+ * @out_pages: Output page list
+ * @flags: IOPMMUFD_ACCESS_RW_* flags
+ *
+ * Reads @length bytes starting at iova and returns the struct page * pointers.
+ * These can be kmap'd by the caller for CPU access.
+ *
+ * The caller must perform iommufd_access_unpin_pages() when done to balance
+ * this.
+ *
+ * This API always requires a page aligned iova. This happens naturally if the
+ * ioas alignment is >= PAGE_SIZE and the iova is PAGE_SIZE aligned. However
+ * smaller alignments have corner cases where this API can fail on otherwise
+ * aligned iova.
+ */
+int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova,
+ unsigned long length, struct page **out_pages,
+ unsigned int flags)
+{
+ struct io_pagetable *iopt = &access->ioas->iopt;
+ struct iopt_area_contig_iter iter;
+ unsigned long last_iova;
+ struct iopt_area *area;
+ int rc;
+
+ /* Driver's ops don't support pin_pages */
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap))
+ return -EINVAL;
+
+ if (!length)
+ return -EINVAL;
+ if (check_add_overflow(iova, length - 1, &last_iova))
+ return -EOVERFLOW;
+
+ down_read(&iopt->iova_rwsem);
+ iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
+ unsigned long last = min(last_iova, iopt_area_last_iova(area));
+ unsigned long last_index = iopt_area_iova_to_index(area, last);
+ unsigned long index =
+ iopt_area_iova_to_index(area, iter.cur_iova);
+
+ if (area->prevent_access ||
+ !iopt_area_contig_is_aligned(&iter)) {
+ rc = -EINVAL;
+ goto err_remove;
+ }
+
+ if (!check_area_prot(area, flags)) {
+ rc = -EPERM;
+ goto err_remove;
+ }
+
+ rc = iopt_area_add_access(area, index, last_index, out_pages,
+ flags);
+ if (rc)
+ goto err_remove;
+ out_pages += last_index - index + 1;
+ }
+ if (!iopt_area_contig_done(&iter)) {
+ rc = -ENOENT;
+ goto err_remove;
+ }
+
+ up_read(&iopt->iova_rwsem);
+ return 0;
+
+err_remove:
+ if (iova < iter.cur_iova) {
+ last_iova = iter.cur_iova - 1;
+ iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova)
+ iopt_area_remove_access(
+ area,
+ iopt_area_iova_to_index(area, iter.cur_iova),
+ iopt_area_iova_to_index(
+ area, min(last_iova,
+ iopt_area_last_iova(area))));
+ }
+ up_read(&iopt->iova_rwsem);
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, IOMMUFD);
+
+/**
+ * iommufd_access_rw - Read or write data under the iova
+ * @access: IOAS access to act on
+ * @iova: Starting IOVA
+ * @data: Kernel buffer to copy to/from
+ * @length: Number of bytes to access
+ * @flags: IOMMUFD_ACCESS_RW_* flags
+ *
+ * Copy kernel to/from data into the range given by IOVA/length. If flags
+ * indicates IOMMUFD_ACCESS_RW_KTHREAD then a large copy can be optimized
+ * by changing it into copy_to/from_user().
+ */
+int iommufd_access_rw(struct iommufd_access *access, unsigned long iova,
+ void *data, size_t length, unsigned int flags)
+{
+ struct io_pagetable *iopt = &access->ioas->iopt;
+ struct iopt_area_contig_iter iter;
+ struct iopt_area *area;
+ unsigned long last_iova;
+ int rc;
+
+ if (!length)
+ return -EINVAL;
+ if (check_add_overflow(iova, length - 1, &last_iova))
+ return -EOVERFLOW;
+
+ down_read(&iopt->iova_rwsem);
+ iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
+ unsigned long last = min(last_iova, iopt_area_last_iova(area));
+ unsigned long bytes = (last - iter.cur_iova) + 1;
+
+ if (area->prevent_access) {
+ rc = -EINVAL;
+ goto err_out;
+ }
+
+ if (!check_area_prot(area, flags)) {
+ rc = -EPERM;
+ goto err_out;
+ }
+
+ rc = iopt_pages_rw_access(
+ area->pages, iopt_area_start_byte(area, iter.cur_iova),
+ data, bytes, flags);
+ if (rc)
+ goto err_out;
+ data += bytes;
+ }
+ if (!iopt_area_contig_done(&iter))
+ rc = -ENOENT;
+err_out:
+ up_read(&iopt->iova_rwsem);
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, IOMMUFD);
+
+#ifdef CONFIG_IOMMUFD_TEST
+/*
+ * Creating a real iommufd_device is too hard, bypass creating a iommufd_device
+ * and go directly to attaching a domain.
+ */
+struct iommufd_hw_pagetable *
+iommufd_device_selftest_attach(struct iommufd_ctx *ictx,
+ struct iommufd_ioas *ioas,
+ struct device *mock_dev)
+{
+ struct iommufd_hw_pagetable *hwpt;
+ int rc;
+
+ hwpt = iommufd_hw_pagetable_alloc(ictx, ioas, mock_dev);
+ if (IS_ERR(hwpt))
+ return hwpt;
+
+ rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain);
+ if (rc)
+ goto out_hwpt;
+
+ refcount_inc(&hwpt->obj.users);
+ iommufd_object_finalize(ictx, &hwpt->obj);
+ return hwpt;
+
+out_hwpt:
+ iommufd_object_abort_and_destroy(ictx, &hwpt->obj);
+ return ERR_PTR(rc);
+}
+
+void iommufd_device_selftest_detach(struct iommufd_ctx *ictx,
+ struct iommufd_hw_pagetable *hwpt)
+{
+ iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain);
+ refcount_dec(&hwpt->obj.users);
+}
+#endif
diff --git a/drivers/iommu/iommufd/double_span.h b/drivers/iommu/iommufd/double_span.h
new file mode 100644
index 000000000000..b37aab7488c0
--- /dev/null
+++ b/drivers/iommu/iommufd/double_span.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES.
+ */
+#ifndef __IOMMUFD_DOUBLE_SPAN_H
+#define __IOMMUFD_DOUBLE_SPAN_H
+
+#include <linux/interval_tree.h>
+
+/*
+ * This is a variation of the general interval_tree_span_iter that computes the
+ * spans over the union of two different interval trees. Used ranges are broken
+ * up and reported based on the tree that provides the interval. The first span
+ * always takes priority. Like interval_tree_span_iter it is greedy and the same
+ * value of is_used will not repeat on two iteration cycles.
+ */
+struct interval_tree_double_span_iter {
+ struct rb_root_cached *itrees[2];
+ struct interval_tree_span_iter spans[2];
+ union {
+ unsigned long start_hole;
+ unsigned long start_used;
+ };
+ union {
+ unsigned long last_hole;
+ unsigned long last_used;
+ };
+ /* 0 = hole, 1 = used span[0], 2 = used span[1], -1 done iteration */
+ int is_used;
+};
+
+void interval_tree_double_span_iter_update(
+ struct interval_tree_double_span_iter *iter);
+void interval_tree_double_span_iter_first(
+ struct interval_tree_double_span_iter *iter,
+ struct rb_root_cached *itree1, struct rb_root_cached *itree2,
+ unsigned long first_index, unsigned long last_index);
+void interval_tree_double_span_iter_next(
+ struct interval_tree_double_span_iter *iter);
+
+static inline bool
+interval_tree_double_span_iter_done(struct interval_tree_double_span_iter *state)
+{
+ return state->is_used == -1;
+}
+
+#define interval_tree_for_each_double_span(span, itree1, itree2, first_index, \
+ last_index) \
+ for (interval_tree_double_span_iter_first(span, itree1, itree2, \
+ first_index, last_index); \
+ !interval_tree_double_span_iter_done(span); \
+ interval_tree_double_span_iter_next(span))
+
+#endif
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
new file mode 100644
index 000000000000..43d473989a06
--- /dev/null
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#include <linux/iommu.h>
+
+#include "iommufd_private.h"
+
+void iommufd_hw_pagetable_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_hw_pagetable *hwpt =
+ container_of(obj, struct iommufd_hw_pagetable, obj);
+
+ WARN_ON(!list_empty(&hwpt->devices));
+
+ iommu_domain_free(hwpt->domain);
+ refcount_dec(&hwpt->ioas->obj.users);
+ mutex_destroy(&hwpt->devices_lock);
+}
+
+/**
+ * iommufd_hw_pagetable_alloc() - Get an iommu_domain for a device
+ * @ictx: iommufd context
+ * @ioas: IOAS to associate the domain with
+ * @dev: Device to get an iommu_domain for
+ *
+ * Allocate a new iommu_domain and return it as a hw_pagetable.
+ */
+struct iommufd_hw_pagetable *
+iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
+ struct device *dev)
+{
+ struct iommufd_hw_pagetable *hwpt;
+ int rc;
+
+ hwpt = iommufd_object_alloc(ictx, hwpt, IOMMUFD_OBJ_HW_PAGETABLE);
+ if (IS_ERR(hwpt))
+ return hwpt;
+
+ hwpt->domain = iommu_domain_alloc(dev->bus);
+ if (!hwpt->domain) {
+ rc = -ENOMEM;
+ goto out_abort;
+ }
+
+ INIT_LIST_HEAD(&hwpt->devices);
+ INIT_LIST_HEAD(&hwpt->hwpt_item);
+ mutex_init(&hwpt->devices_lock);
+ /* Pairs with iommufd_hw_pagetable_destroy() */
+ refcount_inc(&ioas->obj.users);
+ hwpt->ioas = ioas;
+ return hwpt;
+
+out_abort:
+ iommufd_object_abort(ictx, &hwpt->obj);
+ return ERR_PTR(rc);
+}
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
new file mode 100644
index 000000000000..3467cea79568
--- /dev/null
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -0,0 +1,1212 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
+ *
+ * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The
+ * PFNs can be placed into an iommu_domain, or returned to the caller as a page
+ * list for access by an in-kernel user.
+ *
+ * The datastructure uses the iopt_pages to optimize the storage of the PFNs
+ * between the domains and xarray.
+ */
+#include <linux/iommufd.h>
+#include <linux/lockdep.h>
+#include <linux/iommu.h>
+#include <linux/sched/mm.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+
+#include "io_pagetable.h"
+#include "double_span.h"
+
+struct iopt_pages_list {
+ struct iopt_pages *pages;
+ struct iopt_area *area;
+ struct list_head next;
+ unsigned long start_byte;
+ unsigned long length;
+};
+
+struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
+ struct io_pagetable *iopt,
+ unsigned long iova,
+ unsigned long last_iova)
+{
+ lockdep_assert_held(&iopt->iova_rwsem);
+
+ iter->cur_iova = iova;
+ iter->last_iova = last_iova;
+ iter->area = iopt_area_iter_first(iopt, iova, iova);
+ if (!iter->area)
+ return NULL;
+ if (!iter->area->pages) {
+ iter->area = NULL;
+ return NULL;
+ }
+ return iter->area;
+}
+
+struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
+{
+ unsigned long last_iova;
+
+ if (!iter->area)
+ return NULL;
+ last_iova = iopt_area_last_iova(iter->area);
+ if (iter->last_iova <= last_iova)
+ return NULL;
+
+ iter->cur_iova = last_iova + 1;
+ iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
+ iter->last_iova);
+ if (!iter->area)
+ return NULL;
+ if (iter->cur_iova != iopt_area_iova(iter->area) ||
+ !iter->area->pages) {
+ iter->area = NULL;
+ return NULL;
+ }
+ return iter->area;
+}
+
+static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
+ unsigned long length,
+ unsigned long iova_alignment,
+ unsigned long page_offset)
+{
+ if (span->is_used || span->last_hole - span->start_hole < length - 1)
+ return false;
+
+ span->start_hole = ALIGN(span->start_hole, iova_alignment) |
+ page_offset;
+ if (span->start_hole > span->last_hole ||
+ span->last_hole - span->start_hole < length - 1)
+ return false;
+ return true;
+}
+
+static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
+ unsigned long length,
+ unsigned long iova_alignment,
+ unsigned long page_offset)
+{
+ if (span->is_hole || span->last_used - span->start_used < length - 1)
+ return false;
+
+ span->start_used = ALIGN(span->start_used, iova_alignment) |
+ page_offset;
+ if (span->start_used > span->last_used ||
+ span->last_used - span->start_used < length - 1)
+ return false;
+ return true;
+}
+
+/*
+ * Automatically find a block of IOVA that is not being used and not reserved.
+ * Does not return a 0 IOVA even if it is valid.
+ */
+static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
+ unsigned long uptr, unsigned long length)
+{
+ unsigned long page_offset = uptr % PAGE_SIZE;
+ struct interval_tree_double_span_iter used_span;
+ struct interval_tree_span_iter allowed_span;
+ unsigned long iova_alignment;
+
+ lockdep_assert_held(&iopt->iova_rwsem);
+
+ /* Protect roundup_pow-of_two() from overflow */
+ if (length == 0 || length >= ULONG_MAX / 2)
+ return -EOVERFLOW;
+
+ /*
+ * Keep alignment present in the uptr when building the IOVA, this
+ * increases the chance we can map a THP.
+ */
+ if (!uptr)
+ iova_alignment = roundup_pow_of_two(length);
+ else
+ iova_alignment = min_t(unsigned long,
+ roundup_pow_of_two(length),
+ 1UL << __ffs64(uptr));
+
+ if (iova_alignment < iopt->iova_alignment)
+ return -EINVAL;
+
+ interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
+ PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
+ if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
+ allowed_span.start_used = PAGE_SIZE;
+ allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
+ allowed_span.is_hole = false;
+ }
+
+ if (!__alloc_iova_check_used(&allowed_span, length,
+ iova_alignment, page_offset))
+ continue;
+
+ interval_tree_for_each_double_span(
+ &used_span, &iopt->reserved_itree, &iopt->area_itree,
+ allowed_span.start_used, allowed_span.last_used) {
+ if (!__alloc_iova_check_hole(&used_span, length,
+ iova_alignment,
+ page_offset))
+ continue;
+
+ *iova = used_span.start_hole;
+ return 0;
+ }
+ }
+ return -ENOSPC;
+}
+
+static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
+ unsigned long length)
+{
+ unsigned long last;
+
+ lockdep_assert_held(&iopt->iova_rwsem);
+
+ if ((iova & (iopt->iova_alignment - 1)))
+ return -EINVAL;
+
+ if (check_add_overflow(iova, length - 1, &last))
+ return -EOVERFLOW;
+
+ /* No reserved IOVA intersects the range */
+ if (iopt_reserved_iter_first(iopt, iova, last))
+ return -EINVAL;
+
+ /* Check that there is not already a mapping in the range */
+ if (iopt_area_iter_first(iopt, iova, last))
+ return -EEXIST;
+ return 0;
+}
+
+/*
+ * The area takes a slice of the pages from start_bytes to start_byte + length
+ */
+static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
+ struct iopt_pages *pages, unsigned long iova,
+ unsigned long start_byte, unsigned long length,
+ int iommu_prot)
+{
+ lockdep_assert_held_write(&iopt->iova_rwsem);
+
+ if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
+ return -EPERM;
+
+ area->iommu_prot = iommu_prot;
+ area->page_offset = start_byte % PAGE_SIZE;
+ if (area->page_offset & (iopt->iova_alignment - 1))
+ return -EINVAL;
+
+ area->node.start = iova;
+ if (check_add_overflow(iova, length - 1, &area->node.last))
+ return -EOVERFLOW;
+
+ area->pages_node.start = start_byte / PAGE_SIZE;
+ if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
+ return -EOVERFLOW;
+ area->pages_node.last = area->pages_node.last / PAGE_SIZE;
+ if (WARN_ON(area->pages_node.last >= pages->npages))
+ return -EOVERFLOW;
+
+ /*
+ * The area is inserted with a NULL pages indicating it is not fully
+ * initialized yet.
+ */
+ area->iopt = iopt;
+ interval_tree_insert(&area->node, &iopt->area_itree);
+ return 0;
+}
+
+static int iopt_alloc_area_pages(struct io_pagetable *iopt,
+ struct list_head *pages_list,
+ unsigned long length, unsigned long *dst_iova,
+ int iommu_prot, unsigned int flags)
+{
+ struct iopt_pages_list *elm;
+ unsigned long iova;
+ int rc = 0;
+
+ list_for_each_entry(elm, pages_list, next) {
+ elm->area = kzalloc(sizeof(*elm->area), GFP_KERNEL_ACCOUNT);
+ if (!elm->area)
+ return -ENOMEM;
+ }
+
+ down_write(&iopt->iova_rwsem);
+ if ((length & (iopt->iova_alignment - 1)) || !length) {
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (flags & IOPT_ALLOC_IOVA) {
+ /* Use the first entry to guess the ideal IOVA alignment */
+ elm = list_first_entry(pages_list, struct iopt_pages_list,
+ next);
+ rc = iopt_alloc_iova(
+ iopt, dst_iova,
+ (uintptr_t)elm->pages->uptr + elm->start_byte, length);
+ if (rc)
+ goto out_unlock;
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+ } else {
+ rc = iopt_check_iova(iopt, *dst_iova, length);
+ if (rc)
+ goto out_unlock;
+ }
+
+ /*
+ * Areas are created with a NULL pages so that the IOVA space is
+ * reserved and we can unlock the iova_rwsem.
+ */
+ iova = *dst_iova;
+ list_for_each_entry(elm, pages_list, next) {
+ rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
+ elm->start_byte, elm->length, iommu_prot);
+ if (rc)
+ goto out_unlock;
+ iova += elm->length;
+ }
+
+out_unlock:
+ up_write(&iopt->iova_rwsem);
+ return rc;
+}
+
+static void iopt_abort_area(struct iopt_area *area)
+{
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(area->pages);
+ if (area->iopt) {
+ down_write(&area->iopt->iova_rwsem);
+ interval_tree_remove(&area->node, &area->iopt->area_itree);
+ up_write(&area->iopt->iova_rwsem);
+ }
+ kfree(area);
+}
+
+void iopt_free_pages_list(struct list_head *pages_list)
+{
+ struct iopt_pages_list *elm;
+
+ while ((elm = list_first_entry_or_null(pages_list,
+ struct iopt_pages_list, next))) {
+ if (elm->area)
+ iopt_abort_area(elm->area);
+ if (elm->pages)
+ iopt_put_pages(elm->pages);
+ list_del(&elm->next);
+ kfree(elm);
+ }
+}
+
+static int iopt_fill_domains_pages(struct list_head *pages_list)
+{
+ struct iopt_pages_list *undo_elm;
+ struct iopt_pages_list *elm;
+ int rc;
+
+ list_for_each_entry(elm, pages_list, next) {
+ rc = iopt_area_fill_domains(elm->area, elm->pages);
+ if (rc)
+ goto err_undo;
+ }
+ return 0;
+
+err_undo:
+ list_for_each_entry(undo_elm, pages_list, next) {
+ if (undo_elm == elm)
+ break;
+ iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
+ }
+ return rc;
+}
+
+int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
+ unsigned long length, unsigned long *dst_iova,
+ int iommu_prot, unsigned int flags)
+{
+ struct iopt_pages_list *elm;
+ int rc;
+
+ rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
+ iommu_prot, flags);
+ if (rc)
+ return rc;
+
+ down_read(&iopt->domains_rwsem);
+ rc = iopt_fill_domains_pages(pages_list);
+ if (rc)
+ goto out_unlock_domains;
+
+ down_write(&iopt->iova_rwsem);
+ list_for_each_entry(elm, pages_list, next) {
+ /*
+ * area->pages must be set inside the domains_rwsem to ensure
+ * any newly added domains will get filled. Moves the reference
+ * in from the list.
+ */
+ elm->area->pages = elm->pages;
+ elm->pages = NULL;
+ elm->area = NULL;
+ }
+ up_write(&iopt->iova_rwsem);
+out_unlock_domains:
+ up_read(&iopt->domains_rwsem);
+ return rc;
+}
+
+/**
+ * iopt_map_user_pages() - Map a user VA to an iova in the io page table
+ * @ictx: iommufd_ctx the iopt is part of
+ * @iopt: io_pagetable to act on
+ * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
+ * the chosen iova on output. Otherwise is the iova to map to on input
+ * @uptr: User VA to map
+ * @length: Number of bytes to map
+ * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
+ * @flags: IOPT_ALLOC_IOVA or zero
+ *
+ * iova, uptr, and length must be aligned to iova_alignment. For domain backed
+ * page tables this will pin the pages and load them into the domain at iova.
+ * For non-domain page tables this will only setup a lazy reference and the
+ * caller must use iopt_access_pages() to touch them.
+ *
+ * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
+ * destroyed.
+ */
+int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
+ unsigned long *iova, void __user *uptr,
+ unsigned long length, int iommu_prot,
+ unsigned int flags)
+{
+ struct iopt_pages_list elm = {};
+ LIST_HEAD(pages_list);
+ int rc;
+
+ elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
+ if (IS_ERR(elm.pages))
+ return PTR_ERR(elm.pages);
+ if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
+ elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
+ elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
+ elm.start_byte = uptr - elm.pages->uptr;
+ elm.length = length;
+ list_add(&elm.next, &pages_list);
+
+ rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
+ if (rc) {
+ if (elm.area)
+ iopt_abort_area(elm.area);
+ if (elm.pages)
+ iopt_put_pages(elm.pages);
+ return rc;
+ }
+ return 0;
+}
+
+int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
+ unsigned long length, struct list_head *pages_list)
+{
+ struct iopt_area_contig_iter iter;
+ unsigned long last_iova;
+ struct iopt_area *area;
+ int rc;
+
+ if (!length)
+ return -EINVAL;
+ if (check_add_overflow(iova, length - 1, &last_iova))
+ return -EOVERFLOW;
+
+ down_read(&iopt->iova_rwsem);
+ iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
+ struct iopt_pages_list *elm;
+ unsigned long last = min(last_iova, iopt_area_last_iova(area));
+
+ elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
+ if (!elm) {
+ rc = -ENOMEM;
+ goto err_free;
+ }
+ elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
+ elm->pages = area->pages;
+ elm->length = (last - iter.cur_iova) + 1;
+ kref_get(&elm->pages->kref);
+ list_add_tail(&elm->next, pages_list);
+ }
+ if (!iopt_area_contig_done(&iter)) {
+ rc = -ENOENT;
+ goto err_free;
+ }
+ up_read(&iopt->iova_rwsem);
+ return 0;
+err_free:
+ up_read(&iopt->iova_rwsem);
+ iopt_free_pages_list(pages_list);
+ return rc;
+}
+
+static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
+ unsigned long last, unsigned long *unmapped)
+{
+ struct iopt_area *area;
+ unsigned long unmapped_bytes = 0;
+ int rc = -ENOENT;
+
+ /*
+ * The domains_rwsem must be held in read mode any time any area->pages
+ * is NULL. This prevents domain attach/detatch from running
+ * concurrently with cleaning up the area.
+ */
+again:
+ down_read(&iopt->domains_rwsem);
+ down_write(&iopt->iova_rwsem);
+ while ((area = iopt_area_iter_first(iopt, start, last))) {
+ unsigned long area_last = iopt_area_last_iova(area);
+ unsigned long area_first = iopt_area_iova(area);
+ struct iopt_pages *pages;
+
+ /* Userspace should not race map/unmap's of the same area */
+ if (!area->pages) {
+ rc = -EBUSY;
+ goto out_unlock_iova;
+ }
+
+ if (area_first < start || area_last > last) {
+ rc = -ENOENT;
+ goto out_unlock_iova;
+ }
+
+ /*
+ * num_accesses writers must hold the iova_rwsem too, so we can
+ * safely read it under the write side of the iovam_rwsem
+ * without the pages->mutex.
+ */
+ if (area->num_accesses) {
+ start = area_first;
+ area->prevent_access = true;
+ up_write(&iopt->iova_rwsem);
+ up_read(&iopt->domains_rwsem);
+ iommufd_access_notify_unmap(iopt, area_first,
+ iopt_area_length(area));
+ if (WARN_ON(READ_ONCE(area->num_accesses)))
+ return -EDEADLOCK;
+ goto again;
+ }
+
+ pages = area->pages;
+ area->pages = NULL;
+ up_write(&iopt->iova_rwsem);
+
+ iopt_area_unfill_domains(area, pages);
+ iopt_abort_area(area);
+ iopt_put_pages(pages);
+
+ unmapped_bytes += area_last - area_first + 1;
+
+ down_write(&iopt->iova_rwsem);
+ }
+ if (unmapped_bytes)
+ rc = 0;
+
+out_unlock_iova:
+ up_write(&iopt->iova_rwsem);
+ up_read(&iopt->domains_rwsem);
+ if (unmapped)
+ *unmapped = unmapped_bytes;
+ return rc;
+}
+
+/**
+ * iopt_unmap_iova() - Remove a range of iova
+ * @iopt: io_pagetable to act on
+ * @iova: Starting iova to unmap
+ * @length: Number of bytes to unmap
+ * @unmapped: Return number of bytes unmapped
+ *
+ * The requested range must be a superset of existing ranges.
+ * Splitting/truncating IOVA mappings is not allowed.
+ */
+int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
+ unsigned long length, unsigned long *unmapped)
+{
+ unsigned long iova_last;
+
+ if (!length)
+ return -EINVAL;
+
+ if (check_add_overflow(iova, length - 1, &iova_last))
+ return -EOVERFLOW;
+
+ return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
+}
+
+int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
+{
+ int rc;
+
+ rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
+ /* If the IOVAs are empty then unmap all succeeds */
+ if (rc == -ENOENT)
+ return 0;
+ return rc;
+}
+
+/* The caller must always free all the nodes in the allowed_iova rb_root. */
+int iopt_set_allow_iova(struct io_pagetable *iopt,
+ struct rb_root_cached *allowed_iova)
+{
+ struct iopt_allowed *allowed;
+
+ down_write(&iopt->iova_rwsem);
+ swap(*allowed_iova, iopt->allowed_itree);
+
+ for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
+ allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
+ if (iopt_reserved_iter_first(iopt, allowed->node.start,
+ allowed->node.last)) {
+ swap(*allowed_iova, iopt->allowed_itree);
+ up_write(&iopt->iova_rwsem);
+ return -EADDRINUSE;
+ }
+ }
+ up_write(&iopt->iova_rwsem);
+ return 0;
+}
+
+int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
+ unsigned long last, void *owner)
+{
+ struct iopt_reserved *reserved;
+
+ lockdep_assert_held_write(&iopt->iova_rwsem);
+
+ if (iopt_area_iter_first(iopt, start, last) ||
+ iopt_allowed_iter_first(iopt, start, last))
+ return -EADDRINUSE;
+
+ reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
+ if (!reserved)
+ return -ENOMEM;
+ reserved->node.start = start;
+ reserved->node.last = last;
+ reserved->owner = owner;
+ interval_tree_insert(&reserved->node, &iopt->reserved_itree);
+ return 0;
+}
+
+static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
+{
+ struct iopt_reserved *reserved, *next;
+
+ lockdep_assert_held_write(&iopt->iova_rwsem);
+
+ for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
+ reserved = next) {
+ next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
+
+ if (reserved->owner == owner) {
+ interval_tree_remove(&reserved->node,
+ &iopt->reserved_itree);
+ kfree(reserved);
+ }
+ }
+}
+
+void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
+{
+ down_write(&iopt->iova_rwsem);
+ __iopt_remove_reserved_iova(iopt, owner);
+ up_write(&iopt->iova_rwsem);
+}
+
+void iopt_init_table(struct io_pagetable *iopt)
+{
+ init_rwsem(&iopt->iova_rwsem);
+ init_rwsem(&iopt->domains_rwsem);
+ iopt->area_itree = RB_ROOT_CACHED;
+ iopt->allowed_itree = RB_ROOT_CACHED;
+ iopt->reserved_itree = RB_ROOT_CACHED;
+ xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
+ xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);
+
+ /*
+ * iopt's start as SW tables that can use the entire size_t IOVA space
+ * due to the use of size_t in the APIs. They have no alignment
+ * restriction.
+ */
+ iopt->iova_alignment = 1;
+}
+
+void iopt_destroy_table(struct io_pagetable *iopt)
+{
+ struct interval_tree_node *node;
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ iopt_remove_reserved_iova(iopt, NULL);
+
+ while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
+ ULONG_MAX))) {
+ interval_tree_remove(node, &iopt->allowed_itree);
+ kfree(container_of(node, struct iopt_allowed, node));
+ }
+
+ WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
+ WARN_ON(!xa_empty(&iopt->domains));
+ WARN_ON(!xa_empty(&iopt->access_list));
+ WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
+}
+
+/**
+ * iopt_unfill_domain() - Unfill a domain with PFNs
+ * @iopt: io_pagetable to act on
+ * @domain: domain to unfill
+ *
+ * This is used when removing a domain from the iopt. Every area in the iopt
+ * will be unmapped from the domain. The domain must already be removed from the
+ * domains xarray.
+ */
+static void iopt_unfill_domain(struct io_pagetable *iopt,
+ struct iommu_domain *domain)
+{
+ struct iopt_area *area;
+
+ lockdep_assert_held(&iopt->iova_rwsem);
+ lockdep_assert_held_write(&iopt->domains_rwsem);
+
+ /*
+ * Some other domain is holding all the pfns still, rapidly unmap this
+ * domain.
+ */
+ if (iopt->next_domain_id != 0) {
+ /* Pick an arbitrary remaining domain to act as storage */
+ struct iommu_domain *storage_domain =
+ xa_load(&iopt->domains, 0);
+
+ for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+ area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+ struct iopt_pages *pages = area->pages;
+
+ if (!pages)
+ continue;
+
+ mutex_lock(&pages->mutex);
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(!area->storage_domain);
+ if (area->storage_domain == domain)
+ area->storage_domain = storage_domain;
+ mutex_unlock(&pages->mutex);
+
+ iopt_area_unmap_domain(area, domain);
+ }
+ return;
+ }
+
+ for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+ area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+ struct iopt_pages *pages = area->pages;
+
+ if (!pages)
+ continue;
+
+ mutex_lock(&pages->mutex);
+ interval_tree_remove(&area->pages_node, &pages->domains_itree);
+ WARN_ON(area->storage_domain != domain);
+ area->storage_domain = NULL;
+ iopt_area_unfill_domain(area, pages, domain);
+ mutex_unlock(&pages->mutex);
+ }
+}
+
+/**
+ * iopt_fill_domain() - Fill a domain with PFNs
+ * @iopt: io_pagetable to act on
+ * @domain: domain to fill
+ *
+ * Fill the domain with PFNs from every area in the iopt. On failure the domain
+ * is left unchanged.
+ */
+static int iopt_fill_domain(struct io_pagetable *iopt,
+ struct iommu_domain *domain)
+{
+ struct iopt_area *end_area;
+ struct iopt_area *area;
+ int rc;
+
+ lockdep_assert_held(&iopt->iova_rwsem);
+ lockdep_assert_held_write(&iopt->domains_rwsem);
+
+ for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+ area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+ struct iopt_pages *pages = area->pages;
+
+ if (!pages)
+ continue;
+
+ mutex_lock(&pages->mutex);
+ rc = iopt_area_fill_domain(area, domain);
+ if (rc) {
+ mutex_unlock(&pages->mutex);
+ goto out_unfill;
+ }
+ if (!area->storage_domain) {
+ WARN_ON(iopt->next_domain_id != 0);
+ area->storage_domain = domain;
+ interval_tree_insert(&area->pages_node,
+ &pages->domains_itree);
+ }
+ mutex_unlock(&pages->mutex);
+ }
+ return 0;
+
+out_unfill:
+ end_area = area;
+ for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+ area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+ struct iopt_pages *pages = area->pages;
+
+ if (area == end_area)
+ break;
+ if (!pages)
+ continue;
+ mutex_lock(&pages->mutex);
+ if (iopt->next_domain_id == 0) {
+ interval_tree_remove(&area->pages_node,
+ &pages->domains_itree);
+ area->storage_domain = NULL;
+ }
+ iopt_area_unfill_domain(area, pages, domain);
+ mutex_unlock(&pages->mutex);
+ }
+ return rc;
+}
+
+/* All existing area's conform to an increased page size */
+static int iopt_check_iova_alignment(struct io_pagetable *iopt,
+ unsigned long new_iova_alignment)
+{
+ unsigned long align_mask = new_iova_alignment - 1;
+ struct iopt_area *area;
+
+ lockdep_assert_held(&iopt->iova_rwsem);
+ lockdep_assert_held(&iopt->domains_rwsem);
+
+ for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+ area = iopt_area_iter_next(area, 0, ULONG_MAX))
+ if ((iopt_area_iova(area) & align_mask) ||
+ (iopt_area_length(area) & align_mask) ||
+ (area->page_offset & align_mask))
+ return -EADDRINUSE;
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
+ struct iommufd_access *access;
+ unsigned long index;
+
+ xa_for_each(&iopt->access_list, index, access)
+ if (WARN_ON(access->iova_alignment >
+ new_iova_alignment))
+ return -EADDRINUSE;
+ }
+ return 0;
+}
+
+int iopt_table_add_domain(struct io_pagetable *iopt,
+ struct iommu_domain *domain)
+{
+ const struct iommu_domain_geometry *geometry = &domain->geometry;
+ struct iommu_domain *iter_domain;
+ unsigned int new_iova_alignment;
+ unsigned long index;
+ int rc;
+
+ down_write(&iopt->domains_rwsem);
+ down_write(&iopt->iova_rwsem);
+
+ xa_for_each(&iopt->domains, index, iter_domain) {
+ if (WARN_ON(iter_domain == domain)) {
+ rc = -EEXIST;
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * The io page size drives the iova_alignment. Internally the iopt_pages
+ * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
+ * objects into the iommu_domain.
+ *
+ * A iommu_domain must always be able to accept PAGE_SIZE to be
+ * compatible as we can't guarantee higher contiguity.
+ */
+ new_iova_alignment = max_t(unsigned long,
+ 1UL << __ffs(domain->pgsize_bitmap),
+ iopt->iova_alignment);
+ if (new_iova_alignment > PAGE_SIZE) {
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+ if (new_iova_alignment != iopt->iova_alignment) {
+ rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
+ if (rc)
+ goto out_unlock;
+ }
+
+ /* No area exists that is outside the allowed domain aperture */
+ if (geometry->aperture_start != 0) {
+ rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
+ domain);
+ if (rc)
+ goto out_reserved;
+ }
+ if (geometry->aperture_end != ULONG_MAX) {
+ rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
+ ULONG_MAX, domain);
+ if (rc)
+ goto out_reserved;
+ }
+
+ rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
+ if (rc)
+ goto out_reserved;
+
+ rc = iopt_fill_domain(iopt, domain);
+ if (rc)
+ goto out_release;
+
+ iopt->iova_alignment = new_iova_alignment;
+ xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
+ iopt->next_domain_id++;
+ up_write(&iopt->iova_rwsem);
+ up_write(&iopt->domains_rwsem);
+ return 0;
+out_release:
+ xa_release(&iopt->domains, iopt->next_domain_id);
+out_reserved:
+ __iopt_remove_reserved_iova(iopt, domain);
+out_unlock:
+ up_write(&iopt->iova_rwsem);
+ up_write(&iopt->domains_rwsem);
+ return rc;
+}
+
+static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
+{
+ unsigned long new_iova_alignment;
+ struct iommufd_access *access;
+ struct iommu_domain *domain;
+ unsigned long index;
+
+ lockdep_assert_held_write(&iopt->iova_rwsem);
+ lockdep_assert_held(&iopt->domains_rwsem);
+
+ /* See batch_iommu_map_small() */
+ if (iopt->disable_large_pages)
+ new_iova_alignment = PAGE_SIZE;
+ else
+ new_iova_alignment = 1;
+
+ xa_for_each(&iopt->domains, index, domain)
+ new_iova_alignment = max_t(unsigned long,
+ 1UL << __ffs(domain->pgsize_bitmap),
+ new_iova_alignment);
+ xa_for_each(&iopt->access_list, index, access)
+ new_iova_alignment = max_t(unsigned long,
+ access->iova_alignment,
+ new_iova_alignment);
+
+ if (new_iova_alignment > iopt->iova_alignment) {
+ int rc;
+
+ rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
+ if (rc)
+ return rc;
+ }
+ iopt->iova_alignment = new_iova_alignment;
+ return 0;
+}
+
+void iopt_table_remove_domain(struct io_pagetable *iopt,
+ struct iommu_domain *domain)
+{
+ struct iommu_domain *iter_domain = NULL;
+ unsigned long index;
+
+ down_write(&iopt->domains_rwsem);
+ down_write(&iopt->iova_rwsem);
+
+ xa_for_each(&iopt->domains, index, iter_domain)
+ if (iter_domain == domain)
+ break;
+ if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
+ goto out_unlock;
+
+ /*
+ * Compress the xarray to keep it linear by swapping the entry to erase
+ * with the tail entry and shrinking the tail.
+ */
+ iopt->next_domain_id--;
+ iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
+ if (index != iopt->next_domain_id)
+ xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
+
+ iopt_unfill_domain(iopt, domain);
+ __iopt_remove_reserved_iova(iopt, domain);
+
+ WARN_ON(iopt_calculate_iova_alignment(iopt));
+out_unlock:
+ up_write(&iopt->iova_rwsem);
+ up_write(&iopt->domains_rwsem);
+}
+
+/**
+ * iopt_area_split - Split an area into two parts at iova
+ * @area: The area to split
+ * @iova: Becomes the last of a new area
+ *
+ * This splits an area into two. It is part of the VFIO compatibility to allow
+ * poking a hole in the mapping. The two areas continue to point at the same
+ * iopt_pages, just with different starting bytes.
+ */
+static int iopt_area_split(struct iopt_area *area, unsigned long iova)
+{
+ unsigned long alignment = area->iopt->iova_alignment;
+ unsigned long last_iova = iopt_area_last_iova(area);
+ unsigned long start_iova = iopt_area_iova(area);
+ unsigned long new_start = iova + 1;
+ struct io_pagetable *iopt = area->iopt;
+ struct iopt_pages *pages = area->pages;
+ struct iopt_area *lhs;
+ struct iopt_area *rhs;
+ int rc;
+
+ lockdep_assert_held_write(&iopt->iova_rwsem);
+
+ if (iova == start_iova || iova == last_iova)
+ return 0;
+
+ if (!pages || area->prevent_access)
+ return -EBUSY;
+
+ if (new_start & (alignment - 1) ||
+ iopt_area_start_byte(area, new_start) & (alignment - 1))
+ return -EINVAL;
+
+ lhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
+ if (!lhs)
+ return -ENOMEM;
+
+ rhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
+ if (!rhs) {
+ rc = -ENOMEM;
+ goto err_free_lhs;
+ }
+
+ mutex_lock(&pages->mutex);
+ /*
+ * Splitting is not permitted if an access exists, we don't track enough
+ * information to split existing accesses.
+ */
+ if (area->num_accesses) {
+ rc = -EINVAL;
+ goto err_unlock;
+ }
+
+ /*
+ * Splitting is not permitted if a domain could have been mapped with
+ * huge pages.
+ */
+ if (area->storage_domain && !iopt->disable_large_pages) {
+ rc = -EINVAL;
+ goto err_unlock;
+ }
+
+ interval_tree_remove(&area->node, &iopt->area_itree);
+ rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
+ iopt_area_start_byte(area, start_iova),
+ (new_start - 1) - start_iova + 1,
+ area->iommu_prot);
+ if (WARN_ON(rc))
+ goto err_insert;
+
+ rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
+ iopt_area_start_byte(area, new_start),
+ last_iova - new_start + 1, area->iommu_prot);
+ if (WARN_ON(rc))
+ goto err_remove_lhs;
+
+ lhs->storage_domain = area->storage_domain;
+ lhs->pages = area->pages;
+ rhs->storage_domain = area->storage_domain;
+ rhs->pages = area->pages;
+ kref_get(&rhs->pages->kref);
+ kfree(area);
+ mutex_unlock(&pages->mutex);
+
+ /*
+ * No change to domains or accesses because the pages hasn't been
+ * changed
+ */
+ return 0;
+
+err_remove_lhs:
+ interval_tree_remove(&lhs->node, &iopt->area_itree);
+err_insert:
+ interval_tree_insert(&area->node, &iopt->area_itree);
+err_unlock:
+ mutex_unlock(&pages->mutex);
+ kfree(rhs);
+err_free_lhs:
+ kfree(lhs);
+ return rc;
+}
+
+int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
+ size_t num_iovas)
+{
+ int rc = 0;
+ int i;
+
+ down_write(&iopt->iova_rwsem);
+ for (i = 0; i < num_iovas; i++) {
+ struct iopt_area *area;
+
+ area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
+ if (!area)
+ continue;
+ rc = iopt_area_split(area, iovas[i]);
+ if (rc)
+ break;
+ }
+ up_write(&iopt->iova_rwsem);
+ return rc;
+}
+
+void iopt_enable_large_pages(struct io_pagetable *iopt)
+{
+ int rc;
+
+ down_write(&iopt->domains_rwsem);
+ down_write(&iopt->iova_rwsem);
+ WRITE_ONCE(iopt->disable_large_pages, false);
+ rc = iopt_calculate_iova_alignment(iopt);
+ WARN_ON(rc);
+ up_write(&iopt->iova_rwsem);
+ up_write(&iopt->domains_rwsem);
+}
+
+int iopt_disable_large_pages(struct io_pagetable *iopt)
+{
+ int rc = 0;
+
+ down_write(&iopt->domains_rwsem);
+ down_write(&iopt->iova_rwsem);
+ if (iopt->disable_large_pages)
+ goto out_unlock;
+
+ /* Won't do it if domains already have pages mapped in them */
+ if (!xa_empty(&iopt->domains) &&
+ !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+
+ WRITE_ONCE(iopt->disable_large_pages, true);
+ rc = iopt_calculate_iova_alignment(iopt);
+ if (rc)
+ WRITE_ONCE(iopt->disable_large_pages, false);
+out_unlock:
+ up_write(&iopt->iova_rwsem);
+ up_write(&iopt->domains_rwsem);
+ return rc;
+}
+
+int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
+{
+ int rc;
+
+ down_write(&iopt->domains_rwsem);
+ down_write(&iopt->iova_rwsem);
+ rc = xa_alloc(&iopt->access_list, &access->iopt_access_list_id, access,
+ xa_limit_16b, GFP_KERNEL_ACCOUNT);
+ if (rc)
+ goto out_unlock;
+
+ rc = iopt_calculate_iova_alignment(iopt);
+ if (rc) {
+ xa_erase(&iopt->access_list, access->iopt_access_list_id);
+ goto out_unlock;
+ }
+
+out_unlock:
+ up_write(&iopt->iova_rwsem);
+ up_write(&iopt->domains_rwsem);
+ return rc;
+}
+
+void iopt_remove_access(struct io_pagetable *iopt,
+ struct iommufd_access *access)
+{
+ down_write(&iopt->domains_rwsem);
+ down_write(&iopt->iova_rwsem);
+ WARN_ON(xa_erase(&iopt->access_list, access->iopt_access_list_id) !=
+ access);
+ WARN_ON(iopt_calculate_iova_alignment(iopt));
+ up_write(&iopt->iova_rwsem);
+ up_write(&iopt->domains_rwsem);
+}
+
+/* Narrow the valid_iova_itree to include reserved ranges from a group. */
+int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt,
+ struct device *device,
+ struct iommu_group *group,
+ phys_addr_t *sw_msi_start)
+{
+ struct iommu_resv_region *resv;
+ struct iommu_resv_region *tmp;
+ LIST_HEAD(group_resv_regions);
+ int rc;
+
+ down_write(&iopt->iova_rwsem);
+ rc = iommu_get_group_resv_regions(group, &group_resv_regions);
+ if (rc)
+ goto out_unlock;
+
+ list_for_each_entry(resv, &group_resv_regions, list) {
+ if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
+ continue;
+
+ /*
+ * The presence of any 'real' MSI regions should take precedence
+ * over the software-managed one if the IOMMU driver happens to
+ * advertise both types.
+ */
+ if (sw_msi_start && resv->type == IOMMU_RESV_MSI) {
+ *sw_msi_start = 0;
+ sw_msi_start = NULL;
+ }
+ if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI)
+ *sw_msi_start = resv->start;
+
+ rc = iopt_reserve_iova(iopt, resv->start,
+ resv->length - 1 + resv->start, device);
+ if (rc)
+ goto out_reserved;
+ }
+ rc = 0;
+ goto out_free_resv;
+
+out_reserved:
+ __iopt_remove_reserved_iova(iopt, device);
+out_free_resv:
+ list_for_each_entry_safe(resv, tmp, &group_resv_regions, list)
+ kfree(resv);
+out_unlock:
+ up_write(&iopt->iova_rwsem);
+ return rc;
+}
diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
new file mode 100644
index 000000000000..83e7c175f2a2
--- /dev/null
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
+ *
+ */
+#ifndef __IO_PAGETABLE_H
+#define __IO_PAGETABLE_H
+
+#include <linux/interval_tree.h>
+#include <linux/mutex.h>
+#include <linux/kref.h>
+#include <linux/xarray.h>
+
+#include "iommufd_private.h"
+
+struct iommu_domain;
+
+/*
+ * Each io_pagetable is composed of intervals of areas which cover regions of
+ * the iova that are backed by something. iova not covered by areas is not
+ * populated in the page table. Each area is fully populated with pages.
+ *
+ * iovas are in byte units, but must be iopt->iova_alignment aligned.
+ *
+ * pages can be NULL, this means some other thread is still working on setting
+ * up or tearing down the area. When observed under the write side of the
+ * domain_rwsem a NULL pages must mean the area is still being setup and no
+ * domains are filled.
+ *
+ * storage_domain points at an arbitrary iommu_domain that is holding the PFNs
+ * for this area. It is locked by the pages->mutex. This simplifies the locking
+ * as the pages code can rely on the storage_domain without having to get the
+ * iopt->domains_rwsem.
+ *
+ * The io_pagetable::iova_rwsem protects node
+ * The iopt_pages::mutex protects pages_node
+ * iopt and immu_prot are immutable
+ * The pages::mutex protects num_accesses
+ */
+struct iopt_area {
+ struct interval_tree_node node;
+ struct interval_tree_node pages_node;
+ struct io_pagetable *iopt;
+ struct iopt_pages *pages;
+ struct iommu_domain *storage_domain;
+ /* How many bytes into the first page the area starts */
+ unsigned int page_offset;
+ /* IOMMU_READ, IOMMU_WRITE, etc */
+ int iommu_prot;
+ bool prevent_access : 1;
+ unsigned int num_accesses;
+};
+
+struct iopt_allowed {
+ struct interval_tree_node node;
+};
+
+struct iopt_reserved {
+ struct interval_tree_node node;
+ void *owner;
+};
+
+int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages);
+void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages);
+
+int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain);
+void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages,
+ struct iommu_domain *domain);
+void iopt_area_unmap_domain(struct iopt_area *area,
+ struct iommu_domain *domain);
+
+static inline unsigned long iopt_area_index(struct iopt_area *area)
+{
+ return area->pages_node.start;
+}
+
+static inline unsigned long iopt_area_last_index(struct iopt_area *area)
+{
+ return area->pages_node.last;
+}
+
+static inline unsigned long iopt_area_iova(struct iopt_area *area)
+{
+ return area->node.start;
+}
+
+static inline unsigned long iopt_area_last_iova(struct iopt_area *area)
+{
+ return area->node.last;
+}
+
+static inline size_t iopt_area_length(struct iopt_area *area)
+{
+ return (area->node.last - area->node.start) + 1;
+}
+
+/*
+ * Number of bytes from the start of the iopt_pages that the iova begins.
+ * iopt_area_start_byte() / PAGE_SIZE encodes the starting page index
+ * iopt_area_start_byte() % PAGE_SIZE encodes the offset within that page
+ */
+static inline unsigned long iopt_area_start_byte(struct iopt_area *area,
+ unsigned long iova)
+{
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(iova < iopt_area_iova(area) ||
+ iova > iopt_area_last_iova(area));
+ return (iova - iopt_area_iova(area)) + area->page_offset +
+ iopt_area_index(area) * PAGE_SIZE;
+}
+
+static inline unsigned long iopt_area_iova_to_index(struct iopt_area *area,
+ unsigned long iova)
+{
+ return iopt_area_start_byte(area, iova) / PAGE_SIZE;
+}
+
+#define __make_iopt_iter(name) \
+ static inline struct iopt_##name *iopt_##name##_iter_first( \
+ struct io_pagetable *iopt, unsigned long start, \
+ unsigned long last) \
+ { \
+ struct interval_tree_node *node; \
+ \
+ lockdep_assert_held(&iopt->iova_rwsem); \
+ node = interval_tree_iter_first(&iopt->name##_itree, start, \
+ last); \
+ if (!node) \
+ return NULL; \
+ return container_of(node, struct iopt_##name, node); \
+ } \
+ static inline struct iopt_##name *iopt_##name##_iter_next( \
+ struct iopt_##name *last_node, unsigned long start, \
+ unsigned long last) \
+ { \
+ struct interval_tree_node *node; \
+ \
+ node = interval_tree_iter_next(&last_node->node, start, last); \
+ if (!node) \
+ return NULL; \
+ return container_of(node, struct iopt_##name, node); \
+ }
+
+__make_iopt_iter(area)
+__make_iopt_iter(allowed)
+__make_iopt_iter(reserved)
+
+struct iopt_area_contig_iter {
+ unsigned long cur_iova;
+ unsigned long last_iova;
+ struct iopt_area *area;
+};
+struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
+ struct io_pagetable *iopt,
+ unsigned long iova,
+ unsigned long last_iova);
+struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter);
+
+static inline bool iopt_area_contig_done(struct iopt_area_contig_iter *iter)
+{
+ return iter->area && iter->last_iova <= iopt_area_last_iova(iter->area);
+}
+
+/*
+ * Iterate over a contiguous list of areas that span the iova,last_iova range.
+ * The caller must check iopt_area_contig_done() after the loop to see if
+ * contiguous areas existed.
+ */
+#define iopt_for_each_contig_area(iter, area, iopt, iova, last_iova) \
+ for (area = iopt_area_contig_init(iter, iopt, iova, last_iova); area; \
+ area = iopt_area_contig_next(iter))
+
+enum {
+ IOPT_PAGES_ACCOUNT_NONE = 0,
+ IOPT_PAGES_ACCOUNT_USER = 1,
+ IOPT_PAGES_ACCOUNT_MM = 2,
+};
+
+/*
+ * This holds a pinned page list for multiple areas of IO address space. The
+ * pages always originate from a linear chunk of userspace VA. Multiple
+ * io_pagetable's, through their iopt_area's, can share a single iopt_pages
+ * which avoids multi-pinning and double accounting of page consumption.
+ *
+ * indexes in this structure are measured in PAGE_SIZE units, are 0 based from
+ * the start of the uptr and extend to npages. pages are pinned dynamically
+ * according to the intervals in the access_itree and domains_itree, npinned
+ * records the current number of pages pinned.
+ */
+struct iopt_pages {
+ struct kref kref;
+ struct mutex mutex;
+ size_t npages;
+ size_t npinned;
+ size_t last_npinned;
+ struct task_struct *source_task;
+ struct mm_struct *source_mm;
+ struct user_struct *source_user;
+ void __user *uptr;
+ bool writable:1;
+ u8 account_mode;
+
+ struct xarray pinned_pfns;
+ /* Of iopt_pages_access::node */
+ struct rb_root_cached access_itree;
+ /* Of iopt_area::pages_node */
+ struct rb_root_cached domains_itree;
+};
+
+struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
+ bool writable);
+void iopt_release_pages(struct kref *kref);
+static inline void iopt_put_pages(struct iopt_pages *pages)
+{
+ kref_put(&pages->kref, iopt_release_pages);
+}
+
+void iopt_pages_fill_from_xarray(struct iopt_pages *pages, unsigned long start,
+ unsigned long last, struct page **out_pages);
+int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start,
+ unsigned long last, struct page **out_pages);
+void iopt_pages_unfill_xarray(struct iopt_pages *pages, unsigned long start,
+ unsigned long last);
+
+int iopt_area_add_access(struct iopt_area *area, unsigned long start,
+ unsigned long last, struct page **out_pages,
+ unsigned int flags);
+void iopt_area_remove_access(struct iopt_area *area, unsigned long start,
+ unsigned long last);
+int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte,
+ void *data, unsigned long length, unsigned int flags);
+
+/*
+ * Each interval represents an active iopt_access_pages(), it acts as an
+ * interval lock that keeps the PFNs pinned and stored in the xarray.
+ */
+struct iopt_pages_access {
+ struct interval_tree_node node;
+ unsigned int users;
+};
+
+#endif
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
new file mode 100644
index 000000000000..31577e9d434f
--- /dev/null
+++ b/drivers/iommu/iommufd/ioas.c
@@ -0,0 +1,398 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#include <linux/interval_tree.h>
+#include <linux/iommufd.h>
+#include <linux/iommu.h>
+#include <uapi/linux/iommufd.h>
+
+#include "io_pagetable.h"
+
+void iommufd_ioas_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_ioas *ioas = container_of(obj, struct iommufd_ioas, obj);
+ int rc;
+
+ rc = iopt_unmap_all(&ioas->iopt, NULL);
+ WARN_ON(rc && rc != -ENOENT);
+ iopt_destroy_table(&ioas->iopt);
+ mutex_destroy(&ioas->mutex);
+}
+
+struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx)
+{
+ struct iommufd_ioas *ioas;
+
+ ioas = iommufd_object_alloc(ictx, ioas, IOMMUFD_OBJ_IOAS);
+ if (IS_ERR(ioas))
+ return ioas;
+
+ iopt_init_table(&ioas->iopt);
+ INIT_LIST_HEAD(&ioas->hwpt_list);
+ mutex_init(&ioas->mutex);
+ return ioas;
+}
+
+int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_alloc *cmd = ucmd->cmd;
+ struct iommufd_ioas *ioas;
+ int rc;
+
+ if (cmd->flags)
+ return -EOPNOTSUPP;
+
+ ioas = iommufd_ioas_alloc(ucmd->ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ cmd->out_ioas_id = ioas->obj.id;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_table;
+ iommufd_object_finalize(ucmd->ictx, &ioas->obj);
+ return 0;
+
+out_table:
+ iommufd_object_abort_and_destroy(ucmd->ictx, &ioas->obj);
+ return rc;
+}
+
+int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_iova_range __user *ranges;
+ struct iommu_ioas_iova_ranges *cmd = ucmd->cmd;
+ struct iommufd_ioas *ioas;
+ struct interval_tree_span_iter span;
+ u32 max_iovas;
+ int rc;
+
+ if (cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ down_read(&ioas->iopt.iova_rwsem);
+ max_iovas = cmd->num_iovas;
+ ranges = u64_to_user_ptr(cmd->allowed_iovas);
+ cmd->num_iovas = 0;
+ cmd->out_iova_alignment = ioas->iopt.iova_alignment;
+ interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
+ ULONG_MAX) {
+ if (!span.is_hole)
+ continue;
+ if (cmd->num_iovas < max_iovas) {
+ struct iommu_iova_range elm = {
+ .start = span.start_hole,
+ .last = span.last_hole,
+ };
+
+ if (copy_to_user(&ranges[cmd->num_iovas], &elm,
+ sizeof(elm))) {
+ rc = -EFAULT;
+ goto out_put;
+ }
+ }
+ cmd->num_iovas++;
+ }
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_put;
+ if (cmd->num_iovas > max_iovas)
+ rc = -EMSGSIZE;
+out_put:
+ up_read(&ioas->iopt.iova_rwsem);
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+static int iommufd_ioas_load_iovas(struct rb_root_cached *itree,
+ struct iommu_iova_range __user *ranges,
+ u32 num)
+{
+ u32 i;
+
+ for (i = 0; i != num; i++) {
+ struct iommu_iova_range range;
+ struct iopt_allowed *allowed;
+
+ if (copy_from_user(&range, ranges + i, sizeof(range)))
+ return -EFAULT;
+
+ if (range.start >= range.last)
+ return -EINVAL;
+
+ if (interval_tree_iter_first(itree, range.start, range.last))
+ return -EINVAL;
+
+ allowed = kzalloc(sizeof(*allowed), GFP_KERNEL_ACCOUNT);
+ if (!allowed)
+ return -ENOMEM;
+ allowed->node.start = range.start;
+ allowed->node.last = range.last;
+
+ interval_tree_insert(&allowed->node, itree);
+ }
+ return 0;
+}
+
+int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_allow_iovas *cmd = ucmd->cmd;
+ struct rb_root_cached allowed_iova = RB_ROOT_CACHED;
+ struct interval_tree_node *node;
+ struct iommufd_ioas *ioas;
+ struct io_pagetable *iopt;
+ int rc = 0;
+
+ if (cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+ iopt = &ioas->iopt;
+
+ rc = iommufd_ioas_load_iovas(&allowed_iova,
+ u64_to_user_ptr(cmd->allowed_iovas),
+ cmd->num_iovas);
+ if (rc)
+ goto out_free;
+
+ /*
+ * We want the allowed tree update to be atomic, so we have to keep the
+ * original nodes around, and keep track of the new nodes as we allocate
+ * memory for them. The simplest solution is to have a new/old tree and
+ * then swap new for old. On success we free the old tree, on failure we
+ * free the new tree.
+ */
+ rc = iopt_set_allow_iova(iopt, &allowed_iova);
+out_free:
+ while ((node = interval_tree_iter_first(&allowed_iova, 0, ULONG_MAX))) {
+ interval_tree_remove(node, &allowed_iova);
+ kfree(container_of(node, struct iopt_allowed, node));
+ }
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+static int conv_iommu_prot(u32 map_flags)
+{
+ /*
+ * We provide no manual cache coherency ioctls to userspace and most
+ * architectures make the CPU ops for cache flushing privileged.
+ * Therefore we require the underlying IOMMU to support CPU coherent
+ * operation. Support for IOMMU_CACHE is enforced by the
+ * IOMMU_CAP_CACHE_COHERENCY test during bind.
+ */
+ int iommu_prot = IOMMU_CACHE;
+
+ if (map_flags & IOMMU_IOAS_MAP_WRITEABLE)
+ iommu_prot |= IOMMU_WRITE;
+ if (map_flags & IOMMU_IOAS_MAP_READABLE)
+ iommu_prot |= IOMMU_READ;
+ return iommu_prot;
+}
+
+int iommufd_ioas_map(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_map *cmd = ucmd->cmd;
+ unsigned long iova = cmd->iova;
+ struct iommufd_ioas *ioas;
+ unsigned int flags = 0;
+ int rc;
+
+ if ((cmd->flags &
+ ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_READABLE)) ||
+ cmd->__reserved)
+ return -EOPNOTSUPP;
+ if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX)
+ return -EOVERFLOW;
+
+ ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
+ flags = IOPT_ALLOC_IOVA;
+ rc = iopt_map_user_pages(ucmd->ictx, &ioas->iopt, &iova,
+ u64_to_user_ptr(cmd->user_va), cmd->length,
+ conv_iommu_prot(cmd->flags), flags);
+ if (rc)
+ goto out_put;
+
+ cmd->iova = iova;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_put:
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+int iommufd_ioas_copy(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_copy *cmd = ucmd->cmd;
+ struct iommufd_ioas *src_ioas;
+ struct iommufd_ioas *dst_ioas;
+ unsigned int flags = 0;
+ LIST_HEAD(pages_list);
+ unsigned long iova;
+ int rc;
+
+ iommufd_test_syz_conv_iova_id(ucmd, cmd->src_ioas_id, &cmd->src_iova,
+ &cmd->flags);
+
+ if ((cmd->flags &
+ ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_READABLE)))
+ return -EOPNOTSUPP;
+ if (cmd->length >= ULONG_MAX || cmd->src_iova >= ULONG_MAX ||
+ cmd->dst_iova >= ULONG_MAX)
+ return -EOVERFLOW;
+
+ src_ioas = iommufd_get_ioas(ucmd, cmd->src_ioas_id);
+ if (IS_ERR(src_ioas))
+ return PTR_ERR(src_ioas);
+ rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length,
+ &pages_list);
+ iommufd_put_object(&src_ioas->obj);
+ if (rc)
+ return rc;
+
+ dst_ioas = iommufd_get_ioas(ucmd, cmd->dst_ioas_id);
+ if (IS_ERR(dst_ioas)) {
+ rc = PTR_ERR(dst_ioas);
+ goto out_pages;
+ }
+
+ if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
+ flags = IOPT_ALLOC_IOVA;
+ iova = cmd->dst_iova;
+ rc = iopt_map_pages(&dst_ioas->iopt, &pages_list, cmd->length, &iova,
+ conv_iommu_prot(cmd->flags), flags);
+ if (rc)
+ goto out_put_dst;
+
+ cmd->dst_iova = iova;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_put_dst:
+ iommufd_put_object(&dst_ioas->obj);
+out_pages:
+ iopt_free_pages_list(&pages_list);
+ return rc;
+}
+
+int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_unmap *cmd = ucmd->cmd;
+ struct iommufd_ioas *ioas;
+ unsigned long unmapped = 0;
+ int rc;
+
+ ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ if (cmd->iova == 0 && cmd->length == U64_MAX) {
+ rc = iopt_unmap_all(&ioas->iopt, &unmapped);
+ if (rc)
+ goto out_put;
+ } else {
+ if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) {
+ rc = -EOVERFLOW;
+ goto out_put;
+ }
+ rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length,
+ &unmapped);
+ if (rc)
+ goto out_put;
+ }
+
+ cmd->length = unmapped;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+
+out_put:
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+int iommufd_option_rlimit_mode(struct iommu_option *cmd,
+ struct iommufd_ctx *ictx)
+{
+ if (cmd->object_id)
+ return -EOPNOTSUPP;
+
+ if (cmd->op == IOMMU_OPTION_OP_GET) {
+ cmd->val64 = ictx->account_mode == IOPT_PAGES_ACCOUNT_MM;
+ return 0;
+ }
+ if (cmd->op == IOMMU_OPTION_OP_SET) {
+ int rc = 0;
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ xa_lock(&ictx->objects);
+ if (!xa_empty(&ictx->objects)) {
+ rc = -EBUSY;
+ } else {
+ if (cmd->val64 == 0)
+ ictx->account_mode = IOPT_PAGES_ACCOUNT_USER;
+ else if (cmd->val64 == 1)
+ ictx->account_mode = IOPT_PAGES_ACCOUNT_MM;
+ else
+ rc = -EINVAL;
+ }
+ xa_unlock(&ictx->objects);
+
+ return rc;
+ }
+ return -EOPNOTSUPP;
+}
+
+static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd,
+ struct iommufd_ioas *ioas)
+{
+ if (cmd->op == IOMMU_OPTION_OP_GET) {
+ cmd->val64 = !ioas->iopt.disable_large_pages;
+ return 0;
+ }
+ if (cmd->op == IOMMU_OPTION_OP_SET) {
+ if (cmd->val64 == 0)
+ return iopt_disable_large_pages(&ioas->iopt);
+ if (cmd->val64 == 1) {
+ iopt_enable_large_pages(&ioas->iopt);
+ return 0;
+ }
+ return -EINVAL;
+ }
+ return -EOPNOTSUPP;
+}
+
+int iommufd_ioas_option(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_option *cmd = ucmd->cmd;
+ struct iommufd_ioas *ioas;
+ int rc = 0;
+
+ if (cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ ioas = iommufd_get_ioas(ucmd, cmd->object_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ switch (cmd->option_id) {
+ case IOMMU_OPTION_HUGE_PAGES:
+ rc = iommufd_ioas_option_huge_pages(cmd, ioas);
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ }
+
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
new file mode 100644
index 000000000000..222e86591f8a
--- /dev/null
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -0,0 +1,307 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#ifndef __IOMMUFD_PRIVATE_H
+#define __IOMMUFD_PRIVATE_H
+
+#include <linux/rwsem.h>
+#include <linux/xarray.h>
+#include <linux/refcount.h>
+#include <linux/uaccess.h>
+
+struct iommu_domain;
+struct iommu_group;
+struct iommu_option;
+
+struct iommufd_ctx {
+ struct file *file;
+ struct xarray objects;
+
+ u8 account_mode;
+ struct iommufd_ioas *vfio_ioas;
+};
+
+/*
+ * The IOVA to PFN map. The map automatically copies the PFNs into multiple
+ * domains and permits sharing of PFNs between io_pagetable instances. This
+ * supports both a design where IOAS's are 1:1 with a domain (eg because the
+ * domain is HW customized), or where the IOAS is 1:N with multiple generic
+ * domains. The io_pagetable holds an interval tree of iopt_areas which point
+ * to shared iopt_pages which hold the pfns mapped to the page table.
+ *
+ * The locking order is domains_rwsem -> iova_rwsem -> pages::mutex
+ */
+struct io_pagetable {
+ struct rw_semaphore domains_rwsem;
+ struct xarray domains;
+ struct xarray access_list;
+ unsigned int next_domain_id;
+
+ struct rw_semaphore iova_rwsem;
+ struct rb_root_cached area_itree;
+ /* IOVA that cannot become reserved, struct iopt_allowed */
+ struct rb_root_cached allowed_itree;
+ /* IOVA that cannot be allocated, struct iopt_reserved */
+ struct rb_root_cached reserved_itree;
+ u8 disable_large_pages;
+ unsigned long iova_alignment;
+};
+
+void iopt_init_table(struct io_pagetable *iopt);
+void iopt_destroy_table(struct io_pagetable *iopt);
+int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
+ unsigned long length, struct list_head *pages_list);
+void iopt_free_pages_list(struct list_head *pages_list);
+enum {
+ IOPT_ALLOC_IOVA = 1 << 0,
+};
+int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
+ unsigned long *iova, void __user *uptr,
+ unsigned long length, int iommu_prot,
+ unsigned int flags);
+int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
+ unsigned long length, unsigned long *dst_iova,
+ int iommu_prot, unsigned int flags);
+int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
+ unsigned long length, unsigned long *unmapped);
+int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped);
+
+void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
+ unsigned long length);
+int iopt_table_add_domain(struct io_pagetable *iopt,
+ struct iommu_domain *domain);
+void iopt_table_remove_domain(struct io_pagetable *iopt,
+ struct iommu_domain *domain);
+int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt,
+ struct device *device,
+ struct iommu_group *group,
+ phys_addr_t *sw_msi_start);
+int iopt_set_allow_iova(struct io_pagetable *iopt,
+ struct rb_root_cached *allowed_iova);
+int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
+ unsigned long last, void *owner);
+void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner);
+int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
+ size_t num_iovas);
+void iopt_enable_large_pages(struct io_pagetable *iopt);
+int iopt_disable_large_pages(struct io_pagetable *iopt);
+
+struct iommufd_ucmd {
+ struct iommufd_ctx *ictx;
+ void __user *ubuffer;
+ u32 user_size;
+ void *cmd;
+};
+
+int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
+ unsigned long arg);
+
+/* Copy the response in ucmd->cmd back to userspace. */
+static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd,
+ size_t cmd_len)
+{
+ if (copy_to_user(ucmd->ubuffer, ucmd->cmd,
+ min_t(size_t, ucmd->user_size, cmd_len)))
+ return -EFAULT;
+ return 0;
+}
+
+enum iommufd_object_type {
+ IOMMUFD_OBJ_NONE,
+ IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
+ IOMMUFD_OBJ_DEVICE,
+ IOMMUFD_OBJ_HW_PAGETABLE,
+ IOMMUFD_OBJ_IOAS,
+ IOMMUFD_OBJ_ACCESS,
+#ifdef CONFIG_IOMMUFD_TEST
+ IOMMUFD_OBJ_SELFTEST,
+#endif
+};
+
+/* Base struct for all objects with a userspace ID handle. */
+struct iommufd_object {
+ struct rw_semaphore destroy_rwsem;
+ refcount_t users;
+ enum iommufd_object_type type;
+ unsigned int id;
+};
+
+static inline bool iommufd_lock_obj(struct iommufd_object *obj)
+{
+ if (!down_read_trylock(&obj->destroy_rwsem))
+ return false;
+ if (!refcount_inc_not_zero(&obj->users)) {
+ up_read(&obj->destroy_rwsem);
+ return false;
+ }
+ return true;
+}
+
+struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id,
+ enum iommufd_object_type type);
+static inline void iommufd_put_object(struct iommufd_object *obj)
+{
+ refcount_dec(&obj->users);
+ up_read(&obj->destroy_rwsem);
+}
+
+/**
+ * iommufd_ref_to_users() - Switch from destroy_rwsem to users refcount
+ * protection
+ * @obj - Object to release
+ *
+ * Objects have two refcount protections (destroy_rwsem and the refcount_t
+ * users). Holding either of these will prevent the object from being destroyed.
+ *
+ * Depending on the use case, one protection or the other is appropriate. In
+ * most cases references are being protected by the destroy_rwsem. This allows
+ * orderly destruction of the object because iommufd_object_destroy_user() will
+ * wait for it to become unlocked. However, as a rwsem, it cannot be held across
+ * a system call return. So cases that have longer term needs must switch
+ * to the weaker users refcount_t.
+ *
+ * With users protection iommufd_object_destroy_user() will return false,
+ * refusing to destroy the object, causing -EBUSY to userspace.
+ */
+static inline void iommufd_ref_to_users(struct iommufd_object *obj)
+{
+ up_read(&obj->destroy_rwsem);
+ /* iommufd_lock_obj() obtains users as well */
+}
+void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj);
+void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx,
+ struct iommufd_object *obj);
+void iommufd_object_finalize(struct iommufd_ctx *ictx,
+ struct iommufd_object *obj);
+bool iommufd_object_destroy_user(struct iommufd_ctx *ictx,
+ struct iommufd_object *obj);
+struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
+ size_t size,
+ enum iommufd_object_type type);
+
+#define iommufd_object_alloc(ictx, ptr, type) \
+ container_of(_iommufd_object_alloc( \
+ ictx, \
+ sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \
+ offsetof(typeof(*(ptr)), \
+ obj) != 0), \
+ type), \
+ typeof(*(ptr)), obj)
+
+/*
+ * The IO Address Space (IOAS) pagetable is a virtual page table backed by the
+ * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The
+ * mapping is copied into all of the associated domains and made available to
+ * in-kernel users.
+ *
+ * Every iommu_domain that is created is wrapped in a iommufd_hw_pagetable
+ * object. When we go to attach a device to an IOAS we need to get an
+ * iommu_domain and wrapping iommufd_hw_pagetable for it.
+ *
+ * An iommu_domain & iommfd_hw_pagetable will be automatically selected
+ * for a device based on the hwpt_list. If no suitable iommu_domain
+ * is found a new iommu_domain will be created.
+ */
+struct iommufd_ioas {
+ struct iommufd_object obj;
+ struct io_pagetable iopt;
+ struct mutex mutex;
+ struct list_head hwpt_list;
+};
+
+static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ucmd *ucmd,
+ u32 id)
+{
+ return container_of(iommufd_get_object(ucmd->ictx, id,
+ IOMMUFD_OBJ_IOAS),
+ struct iommufd_ioas, obj);
+}
+
+struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx);
+int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_ioas_destroy(struct iommufd_object *obj);
+int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_map(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_copy(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
+int iommufd_option_rlimit_mode(struct iommu_option *cmd,
+ struct iommufd_ctx *ictx);
+
+int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
+
+/*
+ * A HW pagetable is called an iommu_domain inside the kernel. This user object
+ * allows directly creating and inspecting the domains. Domains that have kernel
+ * owned page tables will be associated with an iommufd_ioas that provides the
+ * IOVA to PFN map.
+ */
+struct iommufd_hw_pagetable {
+ struct iommufd_object obj;
+ struct iommufd_ioas *ioas;
+ struct iommu_domain *domain;
+ bool auto_domain : 1;
+ bool enforce_cache_coherency : 1;
+ bool msi_cookie : 1;
+ /* Head at iommufd_ioas::hwpt_list */
+ struct list_head hwpt_item;
+ struct mutex devices_lock;
+ struct list_head devices;
+};
+
+struct iommufd_hw_pagetable *
+iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
+ struct device *dev);
+void iommufd_hw_pagetable_destroy(struct iommufd_object *obj);
+
+void iommufd_device_destroy(struct iommufd_object *obj);
+
+struct iommufd_access {
+ struct iommufd_object obj;
+ struct iommufd_ctx *ictx;
+ struct iommufd_ioas *ioas;
+ const struct iommufd_access_ops *ops;
+ void *data;
+ unsigned long iova_alignment;
+ u32 iopt_access_list_id;
+};
+
+int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access);
+void iopt_remove_access(struct io_pagetable *iopt,
+ struct iommufd_access *access);
+void iommufd_access_destroy_object(struct iommufd_object *obj);
+
+#ifdef CONFIG_IOMMUFD_TEST
+struct iommufd_hw_pagetable *
+iommufd_device_selftest_attach(struct iommufd_ctx *ictx,
+ struct iommufd_ioas *ioas,
+ struct device *mock_dev);
+void iommufd_device_selftest_detach(struct iommufd_ctx *ictx,
+ struct iommufd_hw_pagetable *hwpt);
+int iommufd_test(struct iommufd_ucmd *ucmd);
+void iommufd_selftest_destroy(struct iommufd_object *obj);
+extern size_t iommufd_test_memory_limit;
+void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
+ unsigned int ioas_id, u64 *iova, u32 *flags);
+bool iommufd_should_fail(void);
+void __init iommufd_test_init(void);
+void iommufd_test_exit(void);
+#else
+static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
+ unsigned int ioas_id,
+ u64 *iova, u32 *flags)
+{
+}
+static inline bool iommufd_should_fail(void)
+{
+ return false;
+}
+static inline void __init iommufd_test_init(void)
+{
+}
+static inline void iommufd_test_exit(void)
+{
+}
+#endif
+#endif
diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h
new file mode 100644
index 000000000000..1d96a8f466fd
--- /dev/null
+++ b/drivers/iommu/iommufd/iommufd_test.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
+ */
+#ifndef _UAPI_IOMMUFD_TEST_H
+#define _UAPI_IOMMUFD_TEST_H
+
+#include <linux/types.h>
+#include <linux/iommufd.h>
+
+enum {
+ IOMMU_TEST_OP_ADD_RESERVED = 1,
+ IOMMU_TEST_OP_MOCK_DOMAIN,
+ IOMMU_TEST_OP_MD_CHECK_MAP,
+ IOMMU_TEST_OP_MD_CHECK_REFS,
+ IOMMU_TEST_OP_CREATE_ACCESS,
+ IOMMU_TEST_OP_DESTROY_ACCESS_PAGES,
+ IOMMU_TEST_OP_ACCESS_PAGES,
+ IOMMU_TEST_OP_ACCESS_RW,
+ IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT,
+};
+
+enum {
+ MOCK_APERTURE_START = 1UL << 24,
+ MOCK_APERTURE_LAST = (1UL << 31) - 1,
+};
+
+enum {
+ MOCK_FLAGS_ACCESS_WRITE = 1 << 0,
+ MOCK_FLAGS_ACCESS_SYZ = 1 << 16,
+};
+
+enum {
+ MOCK_ACCESS_RW_WRITE = 1 << 0,
+ MOCK_ACCESS_RW_SLOW_PATH = 1 << 2,
+};
+
+enum {
+ MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES = 1 << 0,
+};
+
+struct iommu_test_cmd {
+ __u32 size;
+ __u32 op;
+ __u32 id;
+ __u32 __reserved;
+ union {
+ struct {
+ __aligned_u64 start;
+ __aligned_u64 length;
+ } add_reserved;
+ struct {
+ __u32 out_device_id;
+ __u32 out_hwpt_id;
+ } mock_domain;
+ struct {
+ __aligned_u64 iova;
+ __aligned_u64 length;
+ __aligned_u64 uptr;
+ } check_map;
+ struct {
+ __aligned_u64 length;
+ __aligned_u64 uptr;
+ __u32 refs;
+ } check_refs;
+ struct {
+ __u32 out_access_fd;
+ __u32 flags;
+ } create_access;
+ struct {
+ __u32 access_pages_id;
+ } destroy_access_pages;
+ struct {
+ __u32 flags;
+ __u32 out_access_pages_id;
+ __aligned_u64 iova;
+ __aligned_u64 length;
+ __aligned_u64 uptr;
+ } access_pages;
+ struct {
+ __aligned_u64 iova;
+ __aligned_u64 length;
+ __aligned_u64 uptr;
+ __u32 flags;
+ } access_rw;
+ struct {
+ __u32 limit;
+ } memory_limit;
+ };
+ __u32 last;
+};
+#define IOMMU_TEST_CMD _IO(IOMMUFD_TYPE, IOMMUFD_CMD_BASE + 32)
+
+#endif
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
new file mode 100644
index 000000000000..bcb463e58100
--- /dev/null
+++ b/drivers/iommu/iommufd/main.c
@@ -0,0 +1,424 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2021 Intel Corporation
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ *
+ * iommufd provides control over the IOMMU HW objects created by IOMMU kernel
+ * drivers. IOMMU HW objects revolve around IO page tables that map incoming DMA
+ * addresses (IOVA) to CPU addresses.
+ */
+#define pr_fmt(fmt) "iommufd: " fmt
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/bug.h>
+#include <uapi/linux/iommufd.h>
+#include <linux/iommufd.h>
+
+#include "iommufd_private.h"
+#include "iommufd_test.h"
+
+struct iommufd_object_ops {
+ void (*destroy)(struct iommufd_object *obj);
+};
+static const struct iommufd_object_ops iommufd_object_ops[];
+
+struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
+ size_t size,
+ enum iommufd_object_type type)
+{
+ struct iommufd_object *obj;
+ int rc;
+
+ obj = kzalloc(size, GFP_KERNEL_ACCOUNT);
+ if (!obj)
+ return ERR_PTR(-ENOMEM);
+ obj->type = type;
+ init_rwsem(&obj->destroy_rwsem);
+ refcount_set(&obj->users, 1);
+
+ /*
+ * Reserve an ID in the xarray but do not publish the pointer yet since
+ * the caller hasn't initialized it yet. Once the pointer is published
+ * in the xarray and visible to other threads we can't reliably destroy
+ * it anymore, so the caller must complete all errorable operations
+ * before calling iommufd_object_finalize().
+ */
+ rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY,
+ xa_limit_32b, GFP_KERNEL_ACCOUNT);
+ if (rc)
+ goto out_free;
+ return obj;
+out_free:
+ kfree(obj);
+ return ERR_PTR(rc);
+}
+
+/*
+ * Allow concurrent access to the object.
+ *
+ * Once another thread can see the object pointer it can prevent object
+ * destruction. Expect for special kernel-only objects there is no in-kernel way
+ * to reliably destroy a single object. Thus all APIs that are creating objects
+ * must use iommufd_object_abort() to handle their errors and only call
+ * iommufd_object_finalize() once object creation cannot fail.
+ */
+void iommufd_object_finalize(struct iommufd_ctx *ictx,
+ struct iommufd_object *obj)
+{
+ void *old;
+
+ old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL);
+ /* obj->id was returned from xa_alloc() so the xa_store() cannot fail */
+ WARN_ON(old);
+}
+
+/* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */
+void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj)
+{
+ void *old;
+
+ old = xa_erase(&ictx->objects, obj->id);
+ WARN_ON(old);
+ kfree(obj);
+}
+
+/*
+ * Abort an object that has been fully initialized and needs destroy, but has
+ * not been finalized.
+ */
+void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx,
+ struct iommufd_object *obj)
+{
+ iommufd_object_ops[obj->type].destroy(obj);
+ iommufd_object_abort(ictx, obj);
+}
+
+struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id,
+ enum iommufd_object_type type)
+{
+ struct iommufd_object *obj;
+
+ if (iommufd_should_fail())
+ return ERR_PTR(-ENOENT);
+
+ xa_lock(&ictx->objects);
+ obj = xa_load(&ictx->objects, id);
+ if (!obj || (type != IOMMUFD_OBJ_ANY && obj->type != type) ||
+ !iommufd_lock_obj(obj))
+ obj = ERR_PTR(-ENOENT);
+ xa_unlock(&ictx->objects);
+ return obj;
+}
+
+/*
+ * The caller holds a users refcount and wants to destroy the object. Returns
+ * true if the object was destroyed. In all cases the caller no longer has a
+ * reference on obj.
+ */
+bool iommufd_object_destroy_user(struct iommufd_ctx *ictx,
+ struct iommufd_object *obj)
+{
+ /*
+ * The purpose of the destroy_rwsem is to ensure deterministic
+ * destruction of objects used by external drivers and destroyed by this
+ * function. Any temporary increment of the refcount must hold the read
+ * side of this, such as during ioctl execution.
+ */
+ down_write(&obj->destroy_rwsem);
+ xa_lock(&ictx->objects);
+ refcount_dec(&obj->users);
+ if (!refcount_dec_if_one(&obj->users)) {
+ xa_unlock(&ictx->objects);
+ up_write(&obj->destroy_rwsem);
+ return false;
+ }
+ __xa_erase(&ictx->objects, obj->id);
+ if (ictx->vfio_ioas && &ictx->vfio_ioas->obj == obj)
+ ictx->vfio_ioas = NULL;
+ xa_unlock(&ictx->objects);
+ up_write(&obj->destroy_rwsem);
+
+ iommufd_object_ops[obj->type].destroy(obj);
+ kfree(obj);
+ return true;
+}
+
+static int iommufd_destroy(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_destroy *cmd = ucmd->cmd;
+ struct iommufd_object *obj;
+
+ obj = iommufd_get_object(ucmd->ictx, cmd->id, IOMMUFD_OBJ_ANY);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+ iommufd_ref_to_users(obj);
+ /* See iommufd_ref_to_users() */
+ if (!iommufd_object_destroy_user(ucmd->ictx, obj))
+ return -EBUSY;
+ return 0;
+}
+
+static int iommufd_fops_open(struct inode *inode, struct file *filp)
+{
+ struct iommufd_ctx *ictx;
+
+ ictx = kzalloc(sizeof(*ictx), GFP_KERNEL_ACCOUNT);
+ if (!ictx)
+ return -ENOMEM;
+
+ xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT);
+ ictx->file = filp;
+ filp->private_data = ictx;
+ return 0;
+}
+
+static int iommufd_fops_release(struct inode *inode, struct file *filp)
+{
+ struct iommufd_ctx *ictx = filp->private_data;
+ struct iommufd_object *obj;
+
+ /*
+ * The objects in the xarray form a graph of "users" counts, and we have
+ * to destroy them in a depth first manner. Leaf objects will reduce the
+ * users count of interior objects when they are destroyed.
+ *
+ * Repeatedly destroying all the "1 users" leaf objects will progress
+ * until the entire list is destroyed. If this can't progress then there
+ * is some bug related to object refcounting.
+ */
+ while (!xa_empty(&ictx->objects)) {
+ unsigned int destroyed = 0;
+ unsigned long index;
+
+ xa_for_each(&ictx->objects, index, obj) {
+ if (!refcount_dec_if_one(&obj->users))
+ continue;
+ destroyed++;
+ xa_erase(&ictx->objects, index);
+ iommufd_object_ops[obj->type].destroy(obj);
+ kfree(obj);
+ }
+ /* Bug related to users refcount */
+ if (WARN_ON(!destroyed))
+ break;
+ }
+ kfree(ictx);
+ return 0;
+}
+
+static int iommufd_option(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_option *cmd = ucmd->cmd;
+ int rc;
+
+ if (cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ switch (cmd->option_id) {
+ case IOMMU_OPTION_RLIMIT_MODE:
+ rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
+ break;
+ case IOMMU_OPTION_HUGE_PAGES:
+ rc = iommufd_ioas_option(ucmd);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ if (rc)
+ return rc;
+ if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64,
+ &cmd->val64, sizeof(cmd->val64)))
+ return -EFAULT;
+ return 0;
+}
+
+union ucmd_buffer {
+ struct iommu_destroy destroy;
+ struct iommu_ioas_alloc alloc;
+ struct iommu_ioas_allow_iovas allow_iovas;
+ struct iommu_ioas_iova_ranges iova_ranges;
+ struct iommu_ioas_map map;
+ struct iommu_ioas_unmap unmap;
+#ifdef CONFIG_IOMMUFD_TEST
+ struct iommu_test_cmd test;
+#endif
+};
+
+struct iommufd_ioctl_op {
+ unsigned int size;
+ unsigned int min_size;
+ unsigned int ioctl_num;
+ int (*execute)(struct iommufd_ucmd *ucmd);
+};
+
+#define IOCTL_OP(_ioctl, _fn, _struct, _last) \
+ [_IOC_NR(_ioctl) - IOMMUFD_CMD_BASE] = { \
+ .size = sizeof(_struct) + \
+ BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \
+ sizeof(_struct)), \
+ .min_size = offsetofend(_struct, _last), \
+ .ioctl_num = _ioctl, \
+ .execute = _fn, \
+ }
+static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
+ IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id),
+ IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
+ struct iommu_ioas_alloc, out_ioas_id),
+ IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
+ struct iommu_ioas_allow_iovas, allowed_iovas),
+ IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy,
+ src_iova),
+ IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
+ struct iommu_ioas_iova_ranges, out_iova_alignment),
+ IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map,
+ iova),
+ IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap,
+ length),
+ IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
+ val64),
+ IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
+ __reserved),
+#ifdef CONFIG_IOMMUFD_TEST
+ IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
+#endif
+};
+
+static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct iommufd_ctx *ictx = filp->private_data;
+ const struct iommufd_ioctl_op *op;
+ struct iommufd_ucmd ucmd = {};
+ union ucmd_buffer buf;
+ unsigned int nr;
+ int ret;
+
+ nr = _IOC_NR(cmd);
+ if (nr < IOMMUFD_CMD_BASE ||
+ (nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops))
+ return iommufd_vfio_ioctl(ictx, cmd, arg);
+
+ ucmd.ictx = ictx;
+ ucmd.ubuffer = (void __user *)arg;
+ ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
+ if (ret)
+ return ret;
+
+ op = &iommufd_ioctl_ops[nr - IOMMUFD_CMD_BASE];
+ if (op->ioctl_num != cmd)
+ return -ENOIOCTLCMD;
+ if (ucmd.user_size < op->min_size)
+ return -EINVAL;
+
+ ucmd.cmd = &buf;
+ ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
+ ucmd.user_size);
+ if (ret)
+ return ret;
+ ret = op->execute(&ucmd);
+ return ret;
+}
+
+static const struct file_operations iommufd_fops = {
+ .owner = THIS_MODULE,
+ .open = iommufd_fops_open,
+ .release = iommufd_fops_release,
+ .unlocked_ioctl = iommufd_fops_ioctl,
+};
+
+/**
+ * iommufd_ctx_get - Get a context reference
+ * @ictx: Context to get
+ *
+ * The caller must already hold a valid reference to ictx.
+ */
+void iommufd_ctx_get(struct iommufd_ctx *ictx)
+{
+ get_file(ictx->file);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, IOMMUFD);
+
+/**
+ * iommufd_ctx_from_file - Acquires a reference to the iommufd context
+ * @file: File to obtain the reference from
+ *
+ * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. The struct file
+ * remains owned by the caller and the caller must still do fput. On success
+ * the caller is responsible to call iommufd_ctx_put().
+ */
+struct iommufd_ctx *iommufd_ctx_from_file(struct file *file)
+{
+ struct iommufd_ctx *ictx;
+
+ if (file->f_op != &iommufd_fops)
+ return ERR_PTR(-EBADFD);
+ ictx = file->private_data;
+ iommufd_ctx_get(ictx);
+ return ictx;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, IOMMUFD);
+
+/**
+ * iommufd_ctx_put - Put back a reference
+ * @ictx: Context to put back
+ */
+void iommufd_ctx_put(struct iommufd_ctx *ictx)
+{
+ fput(ictx->file);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD);
+
+static const struct iommufd_object_ops iommufd_object_ops[] = {
+ [IOMMUFD_OBJ_ACCESS] = {
+ .destroy = iommufd_access_destroy_object,
+ },
+ [IOMMUFD_OBJ_DEVICE] = {
+ .destroy = iommufd_device_destroy,
+ },
+ [IOMMUFD_OBJ_IOAS] = {
+ .destroy = iommufd_ioas_destroy,
+ },
+ [IOMMUFD_OBJ_HW_PAGETABLE] = {
+ .destroy = iommufd_hw_pagetable_destroy,
+ },
+#ifdef CONFIG_IOMMUFD_TEST
+ [IOMMUFD_OBJ_SELFTEST] = {
+ .destroy = iommufd_selftest_destroy,
+ },
+#endif
+};
+
+static struct miscdevice iommu_misc_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "iommu",
+ .fops = &iommufd_fops,
+ .nodename = "iommu",
+ .mode = 0660,
+};
+
+static int __init iommufd_init(void)
+{
+ int ret;
+
+ ret = misc_register(&iommu_misc_dev);
+ if (ret)
+ return ret;
+ iommufd_test_init();
+ return 0;
+}
+
+static void __exit iommufd_exit(void)
+{
+ iommufd_test_exit();
+ misc_deregister(&iommu_misc_dev);
+}
+
+module_init(iommufd_init);
+module_exit(iommufd_exit);
+
+MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices");
+MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
new file mode 100644
index 000000000000..429fa3b0a239
--- /dev/null
+++ b/drivers/iommu/iommufd/pages.c
@@ -0,0 +1,1981 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
+ *
+ * The iopt_pages is the center of the storage and motion of PFNs. Each
+ * iopt_pages represents a logical linear array of full PFNs. The array is 0
+ * based and has npages in it. Accessors use 'index' to refer to the entry in
+ * this logical array, regardless of its storage location.
+ *
+ * PFNs are stored in a tiered scheme:
+ * 1) iopt_pages::pinned_pfns xarray
+ * 2) An iommu_domain
+ * 3) The origin of the PFNs, i.e. the userspace pointer
+ *
+ * PFN have to be copied between all combinations of tiers, depending on the
+ * configuration.
+ *
+ * When a PFN is taken out of the userspace pointer it is pinned exactly once.
+ * The storage locations of the PFN's index are tracked in the two interval
+ * trees. If no interval includes the index then it is not pinned.
+ *
+ * If access_itree includes the PFN's index then an in-kernel access has
+ * requested the page. The PFN is stored in the xarray so other requestors can
+ * continue to find it.
+ *
+ * If the domains_itree includes the PFN's index then an iommu_domain is storing
+ * the PFN and it can be read back using iommu_iova_to_phys(). To avoid
+ * duplicating storage the xarray is not used if only iommu_domains are using
+ * the PFN's index.
+ *
+ * As a general principle this is designed so that destroy never fails. This
+ * means removing an iommu_domain or releasing a in-kernel access will not fail
+ * due to insufficient memory. In practice this means some cases have to hold
+ * PFNs in the xarray even though they are also being stored in an iommu_domain.
+ *
+ * While the iopt_pages can use an iommu_domain as storage, it does not have an
+ * IOVA itself. Instead the iopt_area represents a range of IOVA and uses the
+ * iopt_pages as the PFN provider. Multiple iopt_areas can share the iopt_pages
+ * and reference their own slice of the PFN array, with sub page granularity.
+ *
+ * In this file the term 'last' indicates an inclusive and closed interval, eg
+ * [0,0] refers to a single PFN. 'end' means an open range, eg [0,0) refers to
+ * no PFNs.
+ *
+ * Be cautious of overflow. An IOVA can go all the way up to U64_MAX, so
+ * last_iova + 1 can overflow. An iopt_pages index will always be much less than
+ * ULONG_MAX so last_index + 1 cannot overflow.
+ */
+#include <linux/overflow.h>
+#include <linux/slab.h>
+#include <linux/iommu.h>
+#include <linux/sched/mm.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+#include <linux/iommufd.h>
+
+#include "io_pagetable.h"
+#include "double_span.h"
+
+#ifndef CONFIG_IOMMUFD_TEST
+#define TEMP_MEMORY_LIMIT 65536
+#else
+#define TEMP_MEMORY_LIMIT iommufd_test_memory_limit
+#endif
+#define BATCH_BACKUP_SIZE 32
+
+/*
+ * More memory makes pin_user_pages() and the batching more efficient, but as
+ * this is only a performance optimization don't try too hard to get it. A 64k
+ * allocation can hold about 26M of 4k pages and 13G of 2M pages in an
+ * pfn_batch. Various destroy paths cannot fail and provide a small amount of
+ * stack memory as a backup contingency. If backup_len is given this cannot
+ * fail.
+ */
+static void *temp_kmalloc(size_t *size, void *backup, size_t backup_len)
+{
+ void *res;
+
+ if (WARN_ON(*size == 0))
+ return NULL;
+
+ if (*size < backup_len)
+ return backup;
+
+ if (!backup && iommufd_should_fail())
+ return NULL;
+
+ *size = min_t(size_t, *size, TEMP_MEMORY_LIMIT);
+ res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
+ if (res)
+ return res;
+ *size = PAGE_SIZE;
+ if (backup_len) {
+ res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
+ if (res)
+ return res;
+ *size = backup_len;
+ return backup;
+ }
+ return kmalloc(*size, GFP_KERNEL);
+}
+
+void interval_tree_double_span_iter_update(
+ struct interval_tree_double_span_iter *iter)
+{
+ unsigned long last_hole = ULONG_MAX;
+ unsigned int i;
+
+ for (i = 0; i != ARRAY_SIZE(iter->spans); i++) {
+ if (interval_tree_span_iter_done(&iter->spans[i])) {
+ iter->is_used = -1;
+ return;
+ }
+
+ if (iter->spans[i].is_hole) {
+ last_hole = min(last_hole, iter->spans[i].last_hole);
+ continue;
+ }
+
+ iter->is_used = i + 1;
+ iter->start_used = iter->spans[i].start_used;
+ iter->last_used = min(iter->spans[i].last_used, last_hole);
+ return;
+ }
+
+ iter->is_used = 0;
+ iter->start_hole = iter->spans[0].start_hole;
+ iter->last_hole =
+ min(iter->spans[0].last_hole, iter->spans[1].last_hole);
+}
+
+void interval_tree_double_span_iter_first(
+ struct interval_tree_double_span_iter *iter,
+ struct rb_root_cached *itree1, struct rb_root_cached *itree2,
+ unsigned long first_index, unsigned long last_index)
+{
+ unsigned int i;
+
+ iter->itrees[0] = itree1;
+ iter->itrees[1] = itree2;
+ for (i = 0; i != ARRAY_SIZE(iter->spans); i++)
+ interval_tree_span_iter_first(&iter->spans[i], iter->itrees[i],
+ first_index, last_index);
+ interval_tree_double_span_iter_update(iter);
+}
+
+void interval_tree_double_span_iter_next(
+ struct interval_tree_double_span_iter *iter)
+{
+ unsigned int i;
+
+ if (iter->is_used == -1 ||
+ iter->last_hole == iter->spans[0].last_index) {
+ iter->is_used = -1;
+ return;
+ }
+
+ for (i = 0; i != ARRAY_SIZE(iter->spans); i++)
+ interval_tree_span_iter_advance(
+ &iter->spans[i], iter->itrees[i], iter->last_hole + 1);
+ interval_tree_double_span_iter_update(iter);
+}
+
+static void iopt_pages_add_npinned(struct iopt_pages *pages, size_t npages)
+{
+ int rc;
+
+ rc = check_add_overflow(pages->npinned, npages, &pages->npinned);
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(rc || pages->npinned > pages->npages);
+}
+
+static void iopt_pages_sub_npinned(struct iopt_pages *pages, size_t npages)
+{
+ int rc;
+
+ rc = check_sub_overflow(pages->npinned, npages, &pages->npinned);
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(rc || pages->npinned > pages->npages);
+}
+
+static void iopt_pages_err_unpin(struct iopt_pages *pages,
+ unsigned long start_index,
+ unsigned long last_index,
+ struct page **page_list)
+{
+ unsigned long npages = last_index - start_index + 1;
+
+ unpin_user_pages(page_list, npages);
+ iopt_pages_sub_npinned(pages, npages);
+}
+
+/*
+ * index is the number of PAGE_SIZE units from the start of the area's
+ * iopt_pages. If the iova is sub page-size then the area has an iova that
+ * covers a portion of the first and last pages in the range.
+ */
+static unsigned long iopt_area_index_to_iova(struct iopt_area *area,
+ unsigned long index)
+{
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(index < iopt_area_index(area) ||
+ index > iopt_area_last_index(area));
+ index -= iopt_area_index(area);
+ if (index == 0)
+ return iopt_area_iova(area);
+ return iopt_area_iova(area) - area->page_offset + index * PAGE_SIZE;
+}
+
+static unsigned long iopt_area_index_to_iova_last(struct iopt_area *area,
+ unsigned long index)
+{
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(index < iopt_area_index(area) ||
+ index > iopt_area_last_index(area));
+ if (index == iopt_area_last_index(area))
+ return iopt_area_last_iova(area);
+ return iopt_area_iova(area) - area->page_offset +
+ (index - iopt_area_index(area) + 1) * PAGE_SIZE - 1;
+}
+
+static void iommu_unmap_nofail(struct iommu_domain *domain, unsigned long iova,
+ size_t size)
+{
+ size_t ret;
+
+ ret = iommu_unmap(domain, iova, size);
+ /*
+ * It is a logic error in this code or a driver bug if the IOMMU unmaps
+ * something other than exactly as requested. This implies that the
+ * iommu driver may not fail unmap for reasons beyond bad agruments.
+ * Particularly, the iommu driver may not do a memory allocation on the
+ * unmap path.
+ */
+ WARN_ON(ret != size);
+}
+
+static void iopt_area_unmap_domain_range(struct iopt_area *area,
+ struct iommu_domain *domain,
+ unsigned long start_index,
+ unsigned long last_index)
+{
+ unsigned long start_iova = iopt_area_index_to_iova(area, start_index);
+
+ iommu_unmap_nofail(domain, start_iova,
+ iopt_area_index_to_iova_last(area, last_index) -
+ start_iova + 1);
+}
+
+static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages,
+ unsigned long index)
+{
+ struct interval_tree_node *node;
+
+ node = interval_tree_iter_first(&pages->domains_itree, index, index);
+ if (!node)
+ return NULL;
+ return container_of(node, struct iopt_area, pages_node);
+}
+
+/*
+ * A simple datastructure to hold a vector of PFNs, optimized for contiguous
+ * PFNs. This is used as a temporary holding memory for shuttling pfns from one
+ * place to another. Generally everything is made more efficient if operations
+ * work on the largest possible grouping of pfns. eg fewer lock/unlock cycles,
+ * better cache locality, etc
+ */
+struct pfn_batch {
+ unsigned long *pfns;
+ u32 *npfns;
+ unsigned int array_size;
+ unsigned int end;
+ unsigned int total_pfns;
+};
+
+static void batch_clear(struct pfn_batch *batch)
+{
+ batch->total_pfns = 0;
+ batch->end = 0;
+ batch->pfns[0] = 0;
+ batch->npfns[0] = 0;
+}
+
+/*
+ * Carry means we carry a portion of the final hugepage over to the front of the
+ * batch
+ */
+static void batch_clear_carry(struct pfn_batch *batch, unsigned int keep_pfns)
+{
+ if (!keep_pfns)
+ return batch_clear(batch);
+
+ batch->total_pfns = keep_pfns;
+ batch->npfns[0] = keep_pfns;
+ batch->pfns[0] = batch->pfns[batch->end - 1] +
+ (batch->npfns[batch->end - 1] - keep_pfns);
+ batch->end = 0;
+}
+
+static void batch_skip_carry(struct pfn_batch *batch, unsigned int skip_pfns)
+{
+ if (!batch->total_pfns)
+ return;
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(batch->total_pfns != batch->npfns[0]);
+ skip_pfns = min(batch->total_pfns, skip_pfns);
+ batch->pfns[0] += skip_pfns;
+ batch->npfns[0] -= skip_pfns;
+ batch->total_pfns -= skip_pfns;
+}
+
+static int __batch_init(struct pfn_batch *batch, size_t max_pages, void *backup,
+ size_t backup_len)
+{
+ const size_t elmsz = sizeof(*batch->pfns) + sizeof(*batch->npfns);
+ size_t size = max_pages * elmsz;
+
+ batch->pfns = temp_kmalloc(&size, backup, backup_len);
+ if (!batch->pfns)
+ return -ENOMEM;
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && WARN_ON(size < elmsz))
+ return -EINVAL;
+ batch->array_size = size / elmsz;
+ batch->npfns = (u32 *)(batch->pfns + batch->array_size);
+ batch_clear(batch);
+ return 0;
+}
+
+static int batch_init(struct pfn_batch *batch, size_t max_pages)
+{
+ return __batch_init(batch, max_pages, NULL, 0);
+}
+
+static void batch_init_backup(struct pfn_batch *batch, size_t max_pages,
+ void *backup, size_t backup_len)
+{
+ __batch_init(batch, max_pages, backup, backup_len);
+}
+
+static void batch_destroy(struct pfn_batch *batch, void *backup)
+{
+ if (batch->pfns != backup)
+ kfree(batch->pfns);
+}
+
+/* true if the pfn could be added, false otherwise */
+static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn)
+{
+ const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns));
+
+ if (batch->end &&
+ pfn == batch->pfns[batch->end - 1] + batch->npfns[batch->end - 1] &&
+ batch->npfns[batch->end - 1] != MAX_NPFNS) {
+ batch->npfns[batch->end - 1]++;
+ batch->total_pfns++;
+ return true;
+ }
+ if (batch->end == batch->array_size)
+ return false;
+ batch->total_pfns++;
+ batch->pfns[batch->end] = pfn;
+ batch->npfns[batch->end] = 1;
+ batch->end++;
+ return true;
+}
+
+/*
+ * Fill the batch with pfns from the domain. When the batch is full, or it
+ * reaches last_index, the function will return. The caller should use
+ * batch->total_pfns to determine the starting point for the next iteration.
+ */
+static void batch_from_domain(struct pfn_batch *batch,
+ struct iommu_domain *domain,
+ struct iopt_area *area, unsigned long start_index,
+ unsigned long last_index)
+{
+ unsigned int page_offset = 0;
+ unsigned long iova;
+ phys_addr_t phys;
+
+ iova = iopt_area_index_to_iova(area, start_index);
+ if (start_index == iopt_area_index(area))
+ page_offset = area->page_offset;
+ while (start_index <= last_index) {
+ /*
+ * This is pretty slow, it would be nice to get the page size
+ * back from the driver, or have the driver directly fill the
+ * batch.
+ */
+ phys = iommu_iova_to_phys(domain, iova) - page_offset;
+ if (!batch_add_pfn(batch, PHYS_PFN(phys)))
+ return;
+ iova += PAGE_SIZE - page_offset;
+ page_offset = 0;
+ start_index++;
+ }
+}
+
+static struct page **raw_pages_from_domain(struct iommu_domain *domain,
+ struct iopt_area *area,
+ unsigned long start_index,
+ unsigned long last_index,
+ struct page **out_pages)
+{
+ unsigned int page_offset = 0;
+ unsigned long iova;
+ phys_addr_t phys;
+
+ iova = iopt_area_index_to_iova(area, start_index);
+ if (start_index == iopt_area_index(area))
+ page_offset = area->page_offset;
+ while (start_index <= last_index) {
+ phys = iommu_iova_to_phys(domain, iova) - page_offset;
+ *(out_pages++) = pfn_to_page(PHYS_PFN(phys));
+ iova += PAGE_SIZE - page_offset;
+ page_offset = 0;
+ start_index++;
+ }
+ return out_pages;
+}
+
+/* Continues reading a domain until we reach a discontiguity in the pfns. */
+static void batch_from_domain_continue(struct pfn_batch *batch,
+ struct iommu_domain *domain,
+ struct iopt_area *area,
+ unsigned long start_index,
+ unsigned long last_index)
+{
+ unsigned int array_size = batch->array_size;
+
+ batch->array_size = batch->end;
+ batch_from_domain(batch, domain, area, start_index, last_index);
+ batch->array_size = array_size;
+}
+
+/*
+ * This is part of the VFIO compatibility support for VFIO_TYPE1_IOMMU. That
+ * mode permits splitting a mapped area up, and then one of the splits is
+ * unmapped. Doing this normally would cause us to violate our invariant of
+ * pairing map/unmap. Thus, to support old VFIO compatibility disable support
+ * for batching consecutive PFNs. All PFNs mapped into the iommu are done in
+ * PAGE_SIZE units, not larger or smaller.
+ */
+static int batch_iommu_map_small(struct iommu_domain *domain,
+ unsigned long iova, phys_addr_t paddr,
+ size_t size, int prot)
+{
+ unsigned long start_iova = iova;
+ int rc;
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(paddr % PAGE_SIZE || iova % PAGE_SIZE ||
+ size % PAGE_SIZE);
+
+ while (size) {
+ rc = iommu_map(domain, iova, paddr, PAGE_SIZE, prot);
+ if (rc)
+ goto err_unmap;
+ iova += PAGE_SIZE;
+ paddr += PAGE_SIZE;
+ size -= PAGE_SIZE;
+ }
+ return 0;
+
+err_unmap:
+ if (start_iova != iova)
+ iommu_unmap_nofail(domain, start_iova, iova - start_iova);
+ return rc;
+}
+
+static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain,
+ struct iopt_area *area, unsigned long start_index)
+{
+ bool disable_large_pages = area->iopt->disable_large_pages;
+ unsigned long last_iova = iopt_area_last_iova(area);
+ unsigned int page_offset = 0;
+ unsigned long start_iova;
+ unsigned long next_iova;
+ unsigned int cur = 0;
+ unsigned long iova;
+ int rc;
+
+ /* The first index might be a partial page */
+ if (start_index == iopt_area_index(area))
+ page_offset = area->page_offset;
+ next_iova = iova = start_iova =
+ iopt_area_index_to_iova(area, start_index);
+ while (cur < batch->end) {
+ next_iova = min(last_iova + 1,
+ next_iova + batch->npfns[cur] * PAGE_SIZE -
+ page_offset);
+ if (disable_large_pages)
+ rc = batch_iommu_map_small(
+ domain, iova,
+ PFN_PHYS(batch->pfns[cur]) + page_offset,
+ next_iova - iova, area->iommu_prot);
+ else
+ rc = iommu_map(domain, iova,
+ PFN_PHYS(batch->pfns[cur]) + page_offset,
+ next_iova - iova, area->iommu_prot);
+ if (rc)
+ goto err_unmap;
+ iova = next_iova;
+ page_offset = 0;
+ cur++;
+ }
+ return 0;
+err_unmap:
+ if (start_iova != iova)
+ iommu_unmap_nofail(domain, start_iova, iova - start_iova);
+ return rc;
+}
+
+static void batch_from_xarray(struct pfn_batch *batch, struct xarray *xa,
+ unsigned long start_index,
+ unsigned long last_index)
+{
+ XA_STATE(xas, xa, start_index);
+ void *entry;
+
+ rcu_read_lock();
+ while (true) {
+ entry = xas_next(&xas);
+ if (xas_retry(&xas, entry))
+ continue;
+ WARN_ON(!xa_is_value(entry));
+ if (!batch_add_pfn(batch, xa_to_value(entry)) ||
+ start_index == last_index)
+ break;
+ start_index++;
+ }
+ rcu_read_unlock();
+}
+
+static void batch_from_xarray_clear(struct pfn_batch *batch, struct xarray *xa,
+ unsigned long start_index,
+ unsigned long last_index)
+{
+ XA_STATE(xas, xa, start_index);
+ void *entry;
+
+ xas_lock(&xas);
+ while (true) {
+ entry = xas_next(&xas);
+ if (xas_retry(&xas, entry))
+ continue;
+ WARN_ON(!xa_is_value(entry));
+ if (!batch_add_pfn(batch, xa_to_value(entry)))
+ break;
+ xas_store(&xas, NULL);
+ if (start_index == last_index)
+ break;
+ start_index++;
+ }
+ xas_unlock(&xas);
+}
+
+static void clear_xarray(struct xarray *xa, unsigned long start_index,
+ unsigned long last_index)
+{
+ XA_STATE(xas, xa, start_index);
+ void *entry;
+
+ xas_lock(&xas);
+ xas_for_each(&xas, entry, last_index)
+ xas_store(&xas, NULL);
+ xas_unlock(&xas);
+}
+
+static int pages_to_xarray(struct xarray *xa, unsigned long start_index,
+ unsigned long last_index, struct page **pages)
+{
+ struct page **end_pages = pages + (last_index - start_index) + 1;
+ struct page **half_pages = pages + (end_pages - pages) / 2;
+ XA_STATE(xas, xa, start_index);
+
+ do {
+ void *old;
+
+ xas_lock(&xas);
+ while (pages != end_pages) {
+ /* xarray does not participate in fault injection */
+ if (pages == half_pages && iommufd_should_fail()) {
+ xas_set_err(&xas, -EINVAL);
+ xas_unlock(&xas);
+ /* aka xas_destroy() */
+ xas_nomem(&xas, GFP_KERNEL);
+ goto err_clear;
+ }
+
+ old = xas_store(&xas, xa_mk_value(page_to_pfn(*pages)));
+ if (xas_error(&xas))
+ break;
+ WARN_ON(old);
+ pages++;
+ xas_next(&xas);
+ }
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
+
+err_clear:
+ if (xas_error(&xas)) {
+ if (xas.xa_index != start_index)
+ clear_xarray(xa, start_index, xas.xa_index - 1);
+ return xas_error(&xas);
+ }
+ return 0;
+}
+
+static void batch_from_pages(struct pfn_batch *batch, struct page **pages,
+ size_t npages)
+{
+ struct page **end = pages + npages;
+
+ for (; pages != end; pages++)
+ if (!batch_add_pfn(batch, page_to_pfn(*pages)))
+ break;
+}
+
+static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages,
+ unsigned int first_page_off, size_t npages)
+{
+ unsigned int cur = 0;
+
+ while (first_page_off) {
+ if (batch->npfns[cur] > first_page_off)
+ break;
+ first_page_off -= batch->npfns[cur];
+ cur++;
+ }
+
+ while (npages) {
+ size_t to_unpin = min_t(size_t, npages,
+ batch->npfns[cur] - first_page_off);
+
+ unpin_user_page_range_dirty_lock(
+ pfn_to_page(batch->pfns[cur] + first_page_off),
+ to_unpin, pages->writable);
+ iopt_pages_sub_npinned(pages, to_unpin);
+ cur++;
+ first_page_off = 0;
+ npages -= to_unpin;
+ }
+}
+
+static void copy_data_page(struct page *page, void *data, unsigned long offset,
+ size_t length, unsigned int flags)
+{
+ void *mem;
+
+ mem = kmap_local_page(page);
+ if (flags & IOMMUFD_ACCESS_RW_WRITE) {
+ memcpy(mem + offset, data, length);
+ set_page_dirty_lock(page);
+ } else {
+ memcpy(data, mem + offset, length);
+ }
+ kunmap_local(mem);
+}
+
+static unsigned long batch_rw(struct pfn_batch *batch, void *data,
+ unsigned long offset, unsigned long length,
+ unsigned int flags)
+{
+ unsigned long copied = 0;
+ unsigned int npage = 0;
+ unsigned int cur = 0;
+
+ while (cur < batch->end) {
+ unsigned long bytes = min(length, PAGE_SIZE - offset);
+
+ copy_data_page(pfn_to_page(batch->pfns[cur] + npage), data,
+ offset, bytes, flags);
+ offset = 0;
+ length -= bytes;
+ data += bytes;
+ copied += bytes;
+ npage++;
+ if (npage == batch->npfns[cur]) {
+ npage = 0;
+ cur++;
+ }
+ if (!length)
+ break;
+ }
+ return copied;
+}
+
+/* pfn_reader_user is just the pin_user_pages() path */
+struct pfn_reader_user {
+ struct page **upages;
+ size_t upages_len;
+ unsigned long upages_start;
+ unsigned long upages_end;
+ unsigned int gup_flags;
+ /*
+ * 1 means mmget() and mmap_read_lock(), 0 means only mmget(), -1 is
+ * neither
+ */
+ int locked;
+};
+
+static void pfn_reader_user_init(struct pfn_reader_user *user,
+ struct iopt_pages *pages)
+{
+ user->upages = NULL;
+ user->upages_start = 0;
+ user->upages_end = 0;
+ user->locked = -1;
+
+ if (pages->writable) {
+ user->gup_flags = FOLL_LONGTERM | FOLL_WRITE;
+ } else {
+ /* Still need to break COWs on read */
+ user->gup_flags = FOLL_LONGTERM | FOLL_FORCE | FOLL_WRITE;
+ }
+}
+
+static void pfn_reader_user_destroy(struct pfn_reader_user *user,
+ struct iopt_pages *pages)
+{
+ if (user->locked != -1) {
+ if (user->locked)
+ mmap_read_unlock(pages->source_mm);
+ if (pages->source_mm != current->mm)
+ mmput(pages->source_mm);
+ user->locked = 0;
+ }
+
+ kfree(user->upages);
+ user->upages = NULL;
+}
+
+static int pfn_reader_user_pin(struct pfn_reader_user *user,
+ struct iopt_pages *pages,
+ unsigned long start_index,
+ unsigned long last_index)
+{
+ bool remote_mm = pages->source_mm != current->mm;
+ unsigned long npages;
+ uintptr_t uptr;
+ long rc;
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ WARN_ON(last_index < start_index))
+ return -EINVAL;
+
+ if (!user->upages) {
+ /* All undone in pfn_reader_destroy() */
+ user->upages_len =
+ (last_index - start_index + 1) * sizeof(*user->upages);
+ user->upages = temp_kmalloc(&user->upages_len, NULL, 0);
+ if (!user->upages)
+ return -ENOMEM;
+ }
+
+ if (user->locked == -1) {
+ /*
+ * The majority of usages will run the map task within the mm
+ * providing the pages, so we can optimize into
+ * get_user_pages_fast()
+ */
+ if (remote_mm) {
+ if (!mmget_not_zero(pages->source_mm))
+ return -EFAULT;
+ }
+ user->locked = 0;
+ }
+
+ npages = min_t(unsigned long, last_index - start_index + 1,
+ user->upages_len / sizeof(*user->upages));
+
+
+ if (iommufd_should_fail())
+ return -EFAULT;
+
+ uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE);
+ if (!remote_mm)
+ rc = pin_user_pages_fast(uptr, npages, user->gup_flags,
+ user->upages);
+ else {
+ if (!user->locked) {
+ mmap_read_lock(pages->source_mm);
+ user->locked = 1;
+ }
+ /*
+ * FIXME: last NULL can be &pfns->locked once the GUP patch
+ * is merged.
+ */
+ rc = pin_user_pages_remote(pages->source_mm, uptr, npages,
+ user->gup_flags, user->upages, NULL,
+ NULL);
+ }
+ if (rc <= 0) {
+ if (WARN_ON(!rc))
+ return -EFAULT;
+ return rc;
+ }
+ iopt_pages_add_npinned(pages, rc);
+ user->upages_start = start_index;
+ user->upages_end = start_index + rc;
+ return 0;
+}
+
+/* This is the "modern" and faster accounting method used by io_uring */
+static int incr_user_locked_vm(struct iopt_pages *pages, unsigned long npages)
+{
+ unsigned long lock_limit;
+ unsigned long cur_pages;
+ unsigned long new_pages;
+
+ lock_limit = task_rlimit(pages->source_task, RLIMIT_MEMLOCK) >>
+ PAGE_SHIFT;
+ npages = pages->npinned - pages->last_npinned;
+ do {
+ cur_pages = atomic_long_read(&pages->source_user->locked_vm);
+ new_pages = cur_pages + npages;
+ if (new_pages > lock_limit)
+ return -ENOMEM;
+ } while (atomic_long_cmpxchg(&pages->source_user->locked_vm, cur_pages,
+ new_pages) != cur_pages);
+ return 0;
+}
+
+static void decr_user_locked_vm(struct iopt_pages *pages, unsigned long npages)
+{
+ if (WARN_ON(atomic_long_read(&pages->source_user->locked_vm) < npages))
+ return;
+ atomic_long_sub(npages, &pages->source_user->locked_vm);
+}
+
+/* This is the accounting method used for compatibility with VFIO */
+static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages,
+ bool inc, struct pfn_reader_user *user)
+{
+ bool do_put = false;
+ int rc;
+
+ if (user && user->locked) {
+ mmap_read_unlock(pages->source_mm);
+ user->locked = 0;
+ /* If we had the lock then we also have a get */
+ } else if ((!user || !user->upages) &&
+ pages->source_mm != current->mm) {
+ if (!mmget_not_zero(pages->source_mm))
+ return -EINVAL;
+ do_put = true;
+ }
+
+ mmap_write_lock(pages->source_mm);
+ rc = __account_locked_vm(pages->source_mm, npages, inc,
+ pages->source_task, false);
+ mmap_write_unlock(pages->source_mm);
+
+ if (do_put)
+ mmput(pages->source_mm);
+ return rc;
+}
+
+static int do_update_pinned(struct iopt_pages *pages, unsigned long npages,
+ bool inc, struct pfn_reader_user *user)
+{
+ int rc = 0;
+
+ switch (pages->account_mode) {
+ case IOPT_PAGES_ACCOUNT_NONE:
+ break;
+ case IOPT_PAGES_ACCOUNT_USER:
+ if (inc)
+ rc = incr_user_locked_vm(pages, npages);
+ else
+ decr_user_locked_vm(pages, npages);
+ break;
+ case IOPT_PAGES_ACCOUNT_MM:
+ rc = update_mm_locked_vm(pages, npages, inc, user);
+ break;
+ }
+ if (rc)
+ return rc;
+
+ pages->last_npinned = pages->npinned;
+ if (inc)
+ atomic64_add(npages, &pages->source_mm->pinned_vm);
+ else
+ atomic64_sub(npages, &pages->source_mm->pinned_vm);
+ return 0;
+}
+
+static void update_unpinned(struct iopt_pages *pages)
+{
+ if (WARN_ON(pages->npinned > pages->last_npinned))
+ return;
+ if (pages->npinned == pages->last_npinned)
+ return;
+ do_update_pinned(pages, pages->last_npinned - pages->npinned, false,
+ NULL);
+}
+
+/*
+ * Changes in the number of pages pinned is done after the pages have been read
+ * and processed. If the user lacked the limit then the error unwind will unpin
+ * everything that was just pinned. This is because it is expensive to calculate
+ * how many pages we have already pinned within a range to generate an accurate
+ * prediction in advance of doing the work to actually pin them.
+ */
+static int pfn_reader_user_update_pinned(struct pfn_reader_user *user,
+ struct iopt_pages *pages)
+{
+ unsigned long npages;
+ bool inc;
+
+ lockdep_assert_held(&pages->mutex);
+
+ if (pages->npinned == pages->last_npinned)
+ return 0;
+
+ if (pages->npinned < pages->last_npinned) {
+ npages = pages->last_npinned - pages->npinned;
+ inc = false;
+ } else {
+ if (iommufd_should_fail())
+ return -ENOMEM;
+ npages = pages->npinned - pages->last_npinned;
+ inc = true;
+ }
+ return do_update_pinned(pages, npages, inc, user);
+}
+
+/*
+ * PFNs are stored in three places, in order of preference:
+ * - The iopt_pages xarray. This is only populated if there is a
+ * iopt_pages_access
+ * - The iommu_domain under an area
+ * - The original PFN source, ie pages->source_mm
+ *
+ * This iterator reads the pfns optimizing to load according to the
+ * above order.
+ */
+struct pfn_reader {
+ struct iopt_pages *pages;
+ struct interval_tree_double_span_iter span;
+ struct pfn_batch batch;
+ unsigned long batch_start_index;
+ unsigned long batch_end_index;
+ unsigned long last_index;
+
+ struct pfn_reader_user user;
+};
+
+static int pfn_reader_update_pinned(struct pfn_reader *pfns)
+{
+ return pfn_reader_user_update_pinned(&pfns->user, pfns->pages);
+}
+
+/*
+ * The batch can contain a mixture of pages that are still in use and pages that
+ * need to be unpinned. Unpin only pages that are not held anywhere else.
+ */
+static void pfn_reader_unpin(struct pfn_reader *pfns)
+{
+ unsigned long last = pfns->batch_end_index - 1;
+ unsigned long start = pfns->batch_start_index;
+ struct interval_tree_double_span_iter span;
+ struct iopt_pages *pages = pfns->pages;
+
+ lockdep_assert_held(&pages->mutex);
+
+ interval_tree_for_each_double_span(&span, &pages->access_itree,
+ &pages->domains_itree, start, last) {
+ if (span.is_used)
+ continue;
+
+ batch_unpin(&pfns->batch, pages, span.start_hole - start,
+ span.last_hole - span.start_hole + 1);
+ }
+}
+
+/* Process a single span to load it from the proper storage */
+static int pfn_reader_fill_span(struct pfn_reader *pfns)
+{
+ struct interval_tree_double_span_iter *span = &pfns->span;
+ unsigned long start_index = pfns->batch_end_index;
+ struct iopt_area *area;
+ int rc;
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ WARN_ON(span->last_used < start_index))
+ return -EINVAL;
+
+ if (span->is_used == 1) {
+ batch_from_xarray(&pfns->batch, &pfns->pages->pinned_pfns,
+ start_index, span->last_used);
+ return 0;
+ }
+
+ if (span->is_used == 2) {
+ /*
+ * Pull as many pages from the first domain we find in the
+ * target span. If it is too small then we will be called again
+ * and we'll find another area.
+ */
+ area = iopt_pages_find_domain_area(pfns->pages, start_index);
+ if (WARN_ON(!area))
+ return -EINVAL;
+
+ /* The storage_domain cannot change without the pages mutex */
+ batch_from_domain(
+ &pfns->batch, area->storage_domain, area, start_index,
+ min(iopt_area_last_index(area), span->last_used));
+ return 0;
+ }
+
+ if (start_index >= pfns->user.upages_end) {
+ rc = pfn_reader_user_pin(&pfns->user, pfns->pages, start_index,
+ span->last_hole);
+ if (rc)
+ return rc;
+ }
+
+ batch_from_pages(&pfns->batch,
+ pfns->user.upages +
+ (start_index - pfns->user.upages_start),
+ pfns->user.upages_end - start_index);
+ return 0;
+}
+
+static bool pfn_reader_done(struct pfn_reader *pfns)
+{
+ return pfns->batch_start_index == pfns->last_index + 1;
+}
+
+static int pfn_reader_next(struct pfn_reader *pfns)
+{
+ int rc;
+
+ batch_clear(&pfns->batch);
+ pfns->batch_start_index = pfns->batch_end_index;
+
+ while (pfns->batch_end_index != pfns->last_index + 1) {
+ unsigned int npfns = pfns->batch.total_pfns;
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ WARN_ON(interval_tree_double_span_iter_done(&pfns->span)))
+ return -EINVAL;
+
+ rc = pfn_reader_fill_span(pfns);
+ if (rc)
+ return rc;
+
+ if (WARN_ON(!pfns->batch.total_pfns))
+ return -EINVAL;
+
+ pfns->batch_end_index =
+ pfns->batch_start_index + pfns->batch.total_pfns;
+ if (pfns->batch_end_index == pfns->span.last_used + 1)
+ interval_tree_double_span_iter_next(&pfns->span);
+
+ /* Batch is full */
+ if (npfns == pfns->batch.total_pfns)
+ return 0;
+ }
+ return 0;
+}
+
+static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages,
+ unsigned long start_index, unsigned long last_index)
+{
+ int rc;
+
+ lockdep_assert_held(&pages->mutex);
+
+ pfns->pages = pages;
+ pfns->batch_start_index = start_index;
+ pfns->batch_end_index = start_index;
+ pfns->last_index = last_index;
+ pfn_reader_user_init(&pfns->user, pages);
+ rc = batch_init(&pfns->batch, last_index - start_index + 1);
+ if (rc)
+ return rc;
+ interval_tree_double_span_iter_first(&pfns->span, &pages->access_itree,
+ &pages->domains_itree, start_index,
+ last_index);
+ return 0;
+}
+
+/*
+ * There are many assertions regarding the state of pages->npinned vs
+ * pages->last_pinned, for instance something like unmapping a domain must only
+ * decrement the npinned, and pfn_reader_destroy() must be called only after all
+ * the pins are updated. This is fine for success flows, but error flows
+ * sometimes need to release the pins held inside the pfn_reader before going on
+ * to complete unmapping and releasing pins held in domains.
+ */
+static void pfn_reader_release_pins(struct pfn_reader *pfns)
+{
+ struct iopt_pages *pages = pfns->pages;
+
+ if (pfns->user.upages_end > pfns->batch_end_index) {
+ size_t npages = pfns->user.upages_end - pfns->batch_end_index;
+
+ /* Any pages not transferred to the batch are just unpinned */
+ unpin_user_pages(pfns->user.upages + (pfns->batch_end_index -
+ pfns->user.upages_start),
+ npages);
+ iopt_pages_sub_npinned(pages, npages);
+ pfns->user.upages_end = pfns->batch_end_index;
+ }
+ if (pfns->batch_start_index != pfns->batch_end_index) {
+ pfn_reader_unpin(pfns);
+ pfns->batch_start_index = pfns->batch_end_index;
+ }
+}
+
+static void pfn_reader_destroy(struct pfn_reader *pfns)
+{
+ struct iopt_pages *pages = pfns->pages;
+
+ pfn_reader_release_pins(pfns);
+ pfn_reader_user_destroy(&pfns->user, pfns->pages);
+ batch_destroy(&pfns->batch, NULL);
+ WARN_ON(pages->last_npinned != pages->npinned);
+}
+
+static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages,
+ unsigned long start_index, unsigned long last_index)
+{
+ int rc;
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ WARN_ON(last_index < start_index))
+ return -EINVAL;
+
+ rc = pfn_reader_init(pfns, pages, start_index, last_index);
+ if (rc)
+ return rc;
+ rc = pfn_reader_next(pfns);
+ if (rc) {
+ pfn_reader_destroy(pfns);
+ return rc;
+ }
+ return 0;
+}
+
+struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
+ bool writable)
+{
+ struct iopt_pages *pages;
+
+ /*
+ * The iommu API uses size_t as the length, and protect the DIV_ROUND_UP
+ * below from overflow
+ */
+ if (length > SIZE_MAX - PAGE_SIZE || length == 0)
+ return ERR_PTR(-EINVAL);
+
+ pages = kzalloc(sizeof(*pages), GFP_KERNEL_ACCOUNT);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ kref_init(&pages->kref);
+ xa_init_flags(&pages->pinned_pfns, XA_FLAGS_ACCOUNT);
+ mutex_init(&pages->mutex);
+ pages->source_mm = current->mm;
+ mmgrab(pages->source_mm);
+ pages->uptr = (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE);
+ pages->npages = DIV_ROUND_UP(length + (uptr - pages->uptr), PAGE_SIZE);
+ pages->access_itree = RB_ROOT_CACHED;
+ pages->domains_itree = RB_ROOT_CACHED;
+ pages->writable = writable;
+ if (capable(CAP_IPC_LOCK))
+ pages->account_mode = IOPT_PAGES_ACCOUNT_NONE;
+ else
+ pages->account_mode = IOPT_PAGES_ACCOUNT_USER;
+ pages->source_task = current->group_leader;
+ get_task_struct(current->group_leader);
+ pages->source_user = get_uid(current_user());
+ return pages;
+}
+
+void iopt_release_pages(struct kref *kref)
+{
+ struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref);
+
+ WARN_ON(!RB_EMPTY_ROOT(&pages->access_itree.rb_root));
+ WARN_ON(!RB_EMPTY_ROOT(&pages->domains_itree.rb_root));
+ WARN_ON(pages->npinned);
+ WARN_ON(!xa_empty(&pages->pinned_pfns));
+ mmdrop(pages->source_mm);
+ mutex_destroy(&pages->mutex);
+ put_task_struct(pages->source_task);
+ free_uid(pages->source_user);
+ kfree(pages);
+}
+
+static void
+iopt_area_unpin_domain(struct pfn_batch *batch, struct iopt_area *area,
+ struct iopt_pages *pages, struct iommu_domain *domain,
+ unsigned long start_index, unsigned long last_index,
+ unsigned long *unmapped_end_index,
+ unsigned long real_last_index)
+{
+ while (start_index <= last_index) {
+ unsigned long batch_last_index;
+
+ if (*unmapped_end_index <= last_index) {
+ unsigned long start =
+ max(start_index, *unmapped_end_index);
+
+ batch_from_domain(batch, domain, area, start,
+ last_index);
+ batch_last_index = start + batch->total_pfns - 1;
+ } else {
+ batch_last_index = last_index;
+ }
+
+ /*
+ * unmaps must always 'cut' at a place where the pfns are not
+ * contiguous to pair with the maps that always install
+ * contiguous pages. Thus, if we have to stop unpinning in the
+ * middle of the domains we need to keep reading pfns until we
+ * find a cut point to do the unmap. The pfns we read are
+ * carried over and either skipped or integrated into the next
+ * batch.
+ */
+ if (batch_last_index == last_index &&
+ last_index != real_last_index)
+ batch_from_domain_continue(batch, domain, area,
+ last_index + 1,
+ real_last_index);
+
+ if (*unmapped_end_index <= batch_last_index) {
+ iopt_area_unmap_domain_range(
+ area, domain, *unmapped_end_index,
+ start_index + batch->total_pfns - 1);
+ *unmapped_end_index = start_index + batch->total_pfns;
+ }
+
+ /* unpin must follow unmap */
+ batch_unpin(batch, pages, 0,
+ batch_last_index - start_index + 1);
+ start_index = batch_last_index + 1;
+
+ batch_clear_carry(batch,
+ *unmapped_end_index - batch_last_index - 1);
+ }
+}
+
+static void __iopt_area_unfill_domain(struct iopt_area *area,
+ struct iopt_pages *pages,
+ struct iommu_domain *domain,
+ unsigned long last_index)
+{
+ struct interval_tree_double_span_iter span;
+ unsigned long start_index = iopt_area_index(area);
+ unsigned long unmapped_end_index = start_index;
+ u64 backup[BATCH_BACKUP_SIZE];
+ struct pfn_batch batch;
+
+ lockdep_assert_held(&pages->mutex);
+
+ /*
+ * For security we must not unpin something that is still DMA mapped,
+ * so this must unmap any IOVA before we go ahead and unpin the pages.
+ * This creates a complexity where we need to skip over unpinning pages
+ * held in the xarray, but continue to unmap from the domain.
+ *
+ * The domain unmap cannot stop in the middle of a contiguous range of
+ * PFNs. To solve this problem the unpinning step will read ahead to the
+ * end of any contiguous span, unmap that whole span, and then only
+ * unpin the leading part that does not have any accesses. The residual
+ * PFNs that were unmapped but not unpinned are called a "carry" in the
+ * batch as they are moved to the front of the PFN list and continue on
+ * to the next iteration(s).
+ */
+ batch_init_backup(&batch, last_index + 1, backup, sizeof(backup));
+ interval_tree_for_each_double_span(&span, &pages->domains_itree,
+ &pages->access_itree, start_index,
+ last_index) {
+ if (span.is_used) {
+ batch_skip_carry(&batch,
+ span.last_used - span.start_used + 1);
+ continue;
+ }
+ iopt_area_unpin_domain(&batch, area, pages, domain,
+ span.start_hole, span.last_hole,
+ &unmapped_end_index, last_index);
+ }
+ /*
+ * If the range ends in a access then we do the residual unmap without
+ * any unpins.
+ */
+ if (unmapped_end_index != last_index + 1)
+ iopt_area_unmap_domain_range(area, domain, unmapped_end_index,
+ last_index);
+ WARN_ON(batch.total_pfns);
+ batch_destroy(&batch, backup);
+ update_unpinned(pages);
+}
+
+static void iopt_area_unfill_partial_domain(struct iopt_area *area,
+ struct iopt_pages *pages,
+ struct iommu_domain *domain,
+ unsigned long end_index)
+{
+ if (end_index != iopt_area_index(area))
+ __iopt_area_unfill_domain(area, pages, domain, end_index - 1);
+}
+
+/**
+ * iopt_area_unmap_domain() - Unmap without unpinning PFNs in a domain
+ * @area: The IOVA range to unmap
+ * @domain: The domain to unmap
+ *
+ * The caller must know that unpinning is not required, usually because there
+ * are other domains in the iopt.
+ */
+void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain)
+{
+ iommu_unmap_nofail(domain, iopt_area_iova(area),
+ iopt_area_length(area));
+}
+
+/**
+ * iopt_area_unfill_domain() - Unmap and unpin PFNs in a domain
+ * @area: IOVA area to use
+ * @pages: page supplier for the area (area->pages is NULL)
+ * @domain: Domain to unmap from
+ *
+ * The domain should be removed from the domains_itree before calling. The
+ * domain will always be unmapped, but the PFNs may not be unpinned if there are
+ * still accesses.
+ */
+void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages,
+ struct iommu_domain *domain)
+{
+ __iopt_area_unfill_domain(area, pages, domain,
+ iopt_area_last_index(area));
+}
+
+/**
+ * iopt_area_fill_domain() - Map PFNs from the area into a domain
+ * @area: IOVA area to use
+ * @domain: Domain to load PFNs into
+ *
+ * Read the pfns from the area's underlying iopt_pages and map them into the
+ * given domain. Called when attaching a new domain to an io_pagetable.
+ */
+int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
+{
+ unsigned long done_end_index;
+ struct pfn_reader pfns;
+ int rc;
+
+ lockdep_assert_held(&area->pages->mutex);
+
+ rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area),
+ iopt_area_last_index(area));
+ if (rc)
+ return rc;
+
+ while (!pfn_reader_done(&pfns)) {
+ done_end_index = pfns.batch_start_index;
+ rc = batch_to_domain(&pfns.batch, domain, area,
+ pfns.batch_start_index);
+ if (rc)
+ goto out_unmap;
+ done_end_index = pfns.batch_end_index;
+
+ rc = pfn_reader_next(&pfns);
+ if (rc)
+ goto out_unmap;
+ }
+
+ rc = pfn_reader_update_pinned(&pfns);
+ if (rc)
+ goto out_unmap;
+ goto out_destroy;
+
+out_unmap:
+ pfn_reader_release_pins(&pfns);
+ iopt_area_unfill_partial_domain(area, area->pages, domain,
+ done_end_index);
+out_destroy:
+ pfn_reader_destroy(&pfns);
+ return rc;
+}
+
+/**
+ * iopt_area_fill_domains() - Install PFNs into the area's domains
+ * @area: The area to act on
+ * @pages: The pages associated with the area (area->pages is NULL)
+ *
+ * Called during area creation. The area is freshly created and not inserted in
+ * the domains_itree yet. PFNs are read and loaded into every domain held in the
+ * area's io_pagetable and the area is installed in the domains_itree.
+ *
+ * On failure all domains are left unchanged.
+ */
+int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages)
+{
+ unsigned long done_first_end_index;
+ unsigned long done_all_end_index;
+ struct iommu_domain *domain;
+ unsigned long unmap_index;
+ struct pfn_reader pfns;
+ unsigned long index;
+ int rc;
+
+ lockdep_assert_held(&area->iopt->domains_rwsem);
+
+ if (xa_empty(&area->iopt->domains))
+ return 0;
+
+ mutex_lock(&pages->mutex);
+ rc = pfn_reader_first(&pfns, pages, iopt_area_index(area),
+ iopt_area_last_index(area));
+ if (rc)
+ goto out_unlock;
+
+ while (!pfn_reader_done(&pfns)) {
+ done_first_end_index = pfns.batch_end_index;
+ done_all_end_index = pfns.batch_start_index;
+ xa_for_each(&area->iopt->domains, index, domain) {
+ rc = batch_to_domain(&pfns.batch, domain, area,
+ pfns.batch_start_index);
+ if (rc)
+ goto out_unmap;
+ }
+ done_all_end_index = done_first_end_index;
+
+ rc = pfn_reader_next(&pfns);
+ if (rc)
+ goto out_unmap;
+ }
+ rc = pfn_reader_update_pinned(&pfns);
+ if (rc)
+ goto out_unmap;
+
+ area->storage_domain = xa_load(&area->iopt->domains, 0);
+ interval_tree_insert(&area->pages_node, &pages->domains_itree);
+ goto out_destroy;
+
+out_unmap:
+ pfn_reader_release_pins(&pfns);
+ xa_for_each(&area->iopt->domains, unmap_index, domain) {
+ unsigned long end_index;
+
+ if (unmap_index < index)
+ end_index = done_first_end_index;
+ else
+ end_index = done_all_end_index;
+
+ /*
+ * The area is not yet part of the domains_itree so we have to
+ * manage the unpinning specially. The last domain does the
+ * unpin, every other domain is just unmapped.
+ */
+ if (unmap_index != area->iopt->next_domain_id - 1) {
+ if (end_index != iopt_area_index(area))
+ iopt_area_unmap_domain_range(
+ area, domain, iopt_area_index(area),
+ end_index - 1);
+ } else {
+ iopt_area_unfill_partial_domain(area, pages, domain,
+ end_index);
+ }
+ }
+out_destroy:
+ pfn_reader_destroy(&pfns);
+out_unlock:
+ mutex_unlock(&pages->mutex);
+ return rc;
+}
+
+/**
+ * iopt_area_unfill_domains() - unmap PFNs from the area's domains
+ * @area: The area to act on
+ * @pages: The pages associated with the area (area->pages is NULL)
+ *
+ * Called during area destruction. This unmaps the iova's covered by all the
+ * area's domains and releases the PFNs.
+ */
+void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages)
+{
+ struct io_pagetable *iopt = area->iopt;
+ struct iommu_domain *domain;
+ unsigned long index;
+
+ lockdep_assert_held(&iopt->domains_rwsem);
+
+ mutex_lock(&pages->mutex);
+ if (!area->storage_domain)
+ goto out_unlock;
+
+ xa_for_each(&iopt->domains, index, domain)
+ if (domain != area->storage_domain)
+ iopt_area_unmap_domain_range(
+ area, domain, iopt_area_index(area),
+ iopt_area_last_index(area));
+
+ interval_tree_remove(&area->pages_node, &pages->domains_itree);
+ iopt_area_unfill_domain(area, pages, area->storage_domain);
+ area->storage_domain = NULL;
+out_unlock:
+ mutex_unlock(&pages->mutex);
+}
+
+static void iopt_pages_unpin_xarray(struct pfn_batch *batch,
+ struct iopt_pages *pages,
+ unsigned long start_index,
+ unsigned long end_index)
+{
+ while (start_index <= end_index) {
+ batch_from_xarray_clear(batch, &pages->pinned_pfns, start_index,
+ end_index);
+ batch_unpin(batch, pages, 0, batch->total_pfns);
+ start_index += batch->total_pfns;
+ batch_clear(batch);
+ }
+}
+
+/**
+ * iopt_pages_unfill_xarray() - Update the xarry after removing an access
+ * @pages: The pages to act on
+ * @start_index: Starting PFN index
+ * @last_index: Last PFN index
+ *
+ * Called when an iopt_pages_access is removed, removes pages from the itree.
+ * The access should already be removed from the access_itree.
+ */
+void iopt_pages_unfill_xarray(struct iopt_pages *pages,
+ unsigned long start_index,
+ unsigned long last_index)
+{
+ struct interval_tree_double_span_iter span;
+ u64 backup[BATCH_BACKUP_SIZE];
+ struct pfn_batch batch;
+ bool batch_inited = false;
+
+ lockdep_assert_held(&pages->mutex);
+
+ interval_tree_for_each_double_span(&span, &pages->access_itree,
+ &pages->domains_itree, start_index,
+ last_index) {
+ if (!span.is_used) {
+ if (!batch_inited) {
+ batch_init_backup(&batch,
+ last_index - start_index + 1,
+ backup, sizeof(backup));
+ batch_inited = true;
+ }
+ iopt_pages_unpin_xarray(&batch, pages, span.start_hole,
+ span.last_hole);
+ } else if (span.is_used == 2) {
+ /* Covered by a domain */
+ clear_xarray(&pages->pinned_pfns, span.start_used,
+ span.last_used);
+ }
+ /* Otherwise covered by an existing access */
+ }
+ if (batch_inited)
+ batch_destroy(&batch, backup);
+ update_unpinned(pages);
+}
+
+/**
+ * iopt_pages_fill_from_xarray() - Fast path for reading PFNs
+ * @pages: The pages to act on
+ * @start_index: The first page index in the range
+ * @last_index: The last page index in the range
+ * @out_pages: The output array to return the pages
+ *
+ * This can be called if the caller is holding a refcount on an
+ * iopt_pages_access that is known to have already been filled. It quickly reads
+ * the pages directly from the xarray.
+ *
+ * This is part of the SW iommu interface to read pages for in-kernel use.
+ */
+void iopt_pages_fill_from_xarray(struct iopt_pages *pages,
+ unsigned long start_index,
+ unsigned long last_index,
+ struct page **out_pages)
+{
+ XA_STATE(xas, &pages->pinned_pfns, start_index);
+ void *entry;
+
+ rcu_read_lock();
+ while (start_index <= last_index) {
+ entry = xas_next(&xas);
+ if (xas_retry(&xas, entry))
+ continue;
+ WARN_ON(!xa_is_value(entry));
+ *(out_pages++) = pfn_to_page(xa_to_value(entry));
+ start_index++;
+ }
+ rcu_read_unlock();
+}
+
+static int iopt_pages_fill_from_domain(struct iopt_pages *pages,
+ unsigned long start_index,
+ unsigned long last_index,
+ struct page **out_pages)
+{
+ while (start_index != last_index + 1) {
+ unsigned long domain_last;
+ struct iopt_area *area;
+
+ area = iopt_pages_find_domain_area(pages, start_index);
+ if (WARN_ON(!area))
+ return -EINVAL;
+
+ domain_last = min(iopt_area_last_index(area), last_index);
+ out_pages = raw_pages_from_domain(area->storage_domain, area,
+ start_index, domain_last,
+ out_pages);
+ start_index = domain_last + 1;
+ }
+ return 0;
+}
+
+static int iopt_pages_fill_from_mm(struct iopt_pages *pages,
+ struct pfn_reader_user *user,
+ unsigned long start_index,
+ unsigned long last_index,
+ struct page **out_pages)
+{
+ unsigned long cur_index = start_index;
+ int rc;
+
+ while (cur_index != last_index + 1) {
+ user->upages = out_pages + (cur_index - start_index);
+ rc = pfn_reader_user_pin(user, pages, cur_index, last_index);
+ if (rc)
+ goto out_unpin;
+ cur_index = user->upages_end;
+ }
+ return 0;
+
+out_unpin:
+ if (start_index != cur_index)
+ iopt_pages_err_unpin(pages, start_index, cur_index - 1,
+ out_pages);
+ return rc;
+}
+
+/**
+ * iopt_pages_fill_xarray() - Read PFNs
+ * @pages: The pages to act on
+ * @start_index: The first page index in the range
+ * @last_index: The last page index in the range
+ * @out_pages: The output array to return the pages, may be NULL
+ *
+ * This populates the xarray and returns the pages in out_pages. As the slow
+ * path this is able to copy pages from other storage tiers into the xarray.
+ *
+ * On failure the xarray is left unchanged.
+ *
+ * This is part of the SW iommu interface to read pages for in-kernel use.
+ */
+int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start_index,
+ unsigned long last_index, struct page **out_pages)
+{
+ struct interval_tree_double_span_iter span;
+ unsigned long xa_end = start_index;
+ struct pfn_reader_user user;
+ int rc;
+
+ lockdep_assert_held(&pages->mutex);
+
+ pfn_reader_user_init(&user, pages);
+ user.upages_len = (last_index - start_index + 1) * sizeof(*out_pages);
+ interval_tree_for_each_double_span(&span, &pages->access_itree,
+ &pages->domains_itree, start_index,
+ last_index) {
+ struct page **cur_pages;
+
+ if (span.is_used == 1) {
+ cur_pages = out_pages + (span.start_used - start_index);
+ iopt_pages_fill_from_xarray(pages, span.start_used,
+ span.last_used, cur_pages);
+ continue;
+ }
+
+ if (span.is_used == 2) {
+ cur_pages = out_pages + (span.start_used - start_index);
+ iopt_pages_fill_from_domain(pages, span.start_used,
+ span.last_used, cur_pages);
+ rc = pages_to_xarray(&pages->pinned_pfns,
+ span.start_used, span.last_used,
+ cur_pages);
+ if (rc)
+ goto out_clean_xa;
+ xa_end = span.last_used + 1;
+ continue;
+ }
+
+ /* hole */
+ cur_pages = out_pages + (span.start_hole - start_index);
+ rc = iopt_pages_fill_from_mm(pages, &user, span.start_hole,
+ span.last_hole, cur_pages);
+ if (rc)
+ goto out_clean_xa;
+ rc = pages_to_xarray(&pages->pinned_pfns, span.start_hole,
+ span.last_hole, cur_pages);
+ if (rc) {
+ iopt_pages_err_unpin(pages, span.start_hole,
+ span.last_hole, cur_pages);
+ goto out_clean_xa;
+ }
+ xa_end = span.last_hole + 1;
+ }
+ rc = pfn_reader_user_update_pinned(&user, pages);
+ if (rc)
+ goto out_clean_xa;
+ user.upages = NULL;
+ pfn_reader_user_destroy(&user, pages);
+ return 0;
+
+out_clean_xa:
+ if (start_index != xa_end)
+ iopt_pages_unfill_xarray(pages, start_index, xa_end - 1);
+ user.upages = NULL;
+ pfn_reader_user_destroy(&user, pages);
+ return rc;
+}
+
+/*
+ * This uses the pfn_reader instead of taking a shortcut by using the mm. It can
+ * do every scenario and is fully consistent with what an iommu_domain would
+ * see.
+ */
+static int iopt_pages_rw_slow(struct iopt_pages *pages,
+ unsigned long start_index,
+ unsigned long last_index, unsigned long offset,
+ void *data, unsigned long length,
+ unsigned int flags)
+{
+ struct pfn_reader pfns;
+ int rc;
+
+ mutex_lock(&pages->mutex);
+
+ rc = pfn_reader_first(&pfns, pages, start_index, last_index);
+ if (rc)
+ goto out_unlock;
+
+ while (!pfn_reader_done(&pfns)) {
+ unsigned long done;
+
+ done = batch_rw(&pfns.batch, data, offset, length, flags);
+ data += done;
+ length -= done;
+ offset = 0;
+ pfn_reader_unpin(&pfns);
+
+ rc = pfn_reader_next(&pfns);
+ if (rc)
+ goto out_destroy;
+ }
+ if (WARN_ON(length != 0))
+ rc = -EINVAL;
+out_destroy:
+ pfn_reader_destroy(&pfns);
+out_unlock:
+ mutex_unlock(&pages->mutex);
+ return rc;
+}
+
+/*
+ * A medium speed path that still allows DMA inconsistencies, but doesn't do any
+ * memory allocations or interval tree searches.
+ */
+static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index,
+ unsigned long offset, void *data,
+ unsigned long length, unsigned int flags)
+{
+ struct page *page = NULL;
+ int rc;
+
+ if (!mmget_not_zero(pages->source_mm))
+ return iopt_pages_rw_slow(pages, index, index, offset, data,
+ length, flags);
+
+ if (iommufd_should_fail()) {
+ rc = -EINVAL;
+ goto out_mmput;
+ }
+
+ mmap_read_lock(pages->source_mm);
+ rc = pin_user_pages_remote(
+ pages->source_mm, (uintptr_t)(pages->uptr + index * PAGE_SIZE),
+ 1, (flags & IOMMUFD_ACCESS_RW_WRITE) ? FOLL_WRITE : 0, &page,
+ NULL, NULL);
+ mmap_read_unlock(pages->source_mm);
+ if (rc != 1) {
+ if (WARN_ON(rc >= 0))
+ rc = -EINVAL;
+ goto out_mmput;
+ }
+ copy_data_page(page, data, offset, length, flags);
+ unpin_user_page(page);
+ rc = 0;
+
+out_mmput:
+ mmput(pages->source_mm);
+ return rc;
+}
+
+/**
+ * iopt_pages_rw_access - Copy to/from a linear slice of the pages
+ * @pages: pages to act on
+ * @start_byte: First byte of pages to copy to/from
+ * @data: Kernel buffer to get/put the data
+ * @length: Number of bytes to copy
+ * @flags: IOMMUFD_ACCESS_RW_* flags
+ *
+ * This will find each page in the range, kmap it and then memcpy to/from
+ * the given kernel buffer.
+ */
+int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte,
+ void *data, unsigned long length, unsigned int flags)
+{
+ unsigned long start_index = start_byte / PAGE_SIZE;
+ unsigned long last_index = (start_byte + length - 1) / PAGE_SIZE;
+ bool change_mm = current->mm != pages->source_mm;
+ int rc = 0;
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ (flags & __IOMMUFD_ACCESS_RW_SLOW_PATH))
+ change_mm = true;
+
+ if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable)
+ return -EPERM;
+
+ if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) {
+ if (start_index == last_index)
+ return iopt_pages_rw_page(pages, start_index,
+ start_byte % PAGE_SIZE, data,
+ length, flags);
+ return iopt_pages_rw_slow(pages, start_index, last_index,
+ start_byte % PAGE_SIZE, data, length,
+ flags);
+ }
+
+ /*
+ * Try to copy using copy_to_user(). We do this as a fast path and
+ * ignore any pinning inconsistencies, unlike a real DMA path.
+ */
+ if (change_mm) {
+ if (!mmget_not_zero(pages->source_mm))
+ return iopt_pages_rw_slow(pages, start_index,
+ last_index,
+ start_byte % PAGE_SIZE, data,
+ length, flags);
+ kthread_use_mm(pages->source_mm);
+ }
+
+ if (flags & IOMMUFD_ACCESS_RW_WRITE) {
+ if (copy_to_user(pages->uptr + start_byte, data, length))
+ rc = -EFAULT;
+ } else {
+ if (copy_from_user(data, pages->uptr + start_byte, length))
+ rc = -EFAULT;
+ }
+
+ if (change_mm) {
+ kthread_unuse_mm(pages->source_mm);
+ mmput(pages->source_mm);
+ }
+
+ return rc;
+}
+
+static struct iopt_pages_access *
+iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index,
+ unsigned long last)
+{
+ struct interval_tree_node *node;
+
+ lockdep_assert_held(&pages->mutex);
+
+ /* There can be overlapping ranges in this interval tree */
+ for (node = interval_tree_iter_first(&pages->access_itree, index, last);
+ node; node = interval_tree_iter_next(node, index, last))
+ if (node->start == index && node->last == last)
+ return container_of(node, struct iopt_pages_access,
+ node);
+ return NULL;
+}
+
+/**
+ * iopt_area_add_access() - Record an in-knerel access for PFNs
+ * @area: The source of PFNs
+ * @start_index: First page index
+ * @last_index: Inclusive last page index
+ * @out_pages: Output list of struct page's representing the PFNs
+ * @flags: IOMMUFD_ACCESS_RW_* flags
+ *
+ * Record that an in-kernel access will be accessing the pages, ensure they are
+ * pinned, and return the PFNs as a simple list of 'struct page *'.
+ *
+ * This should be undone through a matching call to iopt_area_remove_access()
+ */
+int iopt_area_add_access(struct iopt_area *area, unsigned long start_index,
+ unsigned long last_index, struct page **out_pages,
+ unsigned int flags)
+{
+ struct iopt_pages *pages = area->pages;
+ struct iopt_pages_access *access;
+ int rc;
+
+ if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable)
+ return -EPERM;
+
+ mutex_lock(&pages->mutex);
+ access = iopt_pages_get_exact_access(pages, start_index, last_index);
+ if (access) {
+ area->num_accesses++;
+ access->users++;
+ iopt_pages_fill_from_xarray(pages, start_index, last_index,
+ out_pages);
+ mutex_unlock(&pages->mutex);
+ return 0;
+ }
+
+ access = kzalloc(sizeof(*access), GFP_KERNEL_ACCOUNT);
+ if (!access) {
+ rc = -ENOMEM;
+ goto err_unlock;
+ }
+
+ rc = iopt_pages_fill_xarray(pages, start_index, last_index, out_pages);
+ if (rc)
+ goto err_free;
+
+ access->node.start = start_index;
+ access->node.last = last_index;
+ access->users = 1;
+ area->num_accesses++;
+ interval_tree_insert(&access->node, &pages->access_itree);
+ mutex_unlock(&pages->mutex);
+ return 0;
+
+err_free:
+ kfree(access);
+err_unlock:
+ mutex_unlock(&pages->mutex);
+ return rc;
+}
+
+/**
+ * iopt_area_remove_access() - Release an in-kernel access for PFNs
+ * @area: The source of PFNs
+ * @start_index: First page index
+ * @last_index: Inclusive last page index
+ *
+ * Undo iopt_area_add_access() and unpin the pages if necessary. The caller
+ * must stop using the PFNs before calling this.
+ */
+void iopt_area_remove_access(struct iopt_area *area, unsigned long start_index,
+ unsigned long last_index)
+{
+ struct iopt_pages *pages = area->pages;
+ struct iopt_pages_access *access;
+
+ mutex_lock(&pages->mutex);
+ access = iopt_pages_get_exact_access(pages, start_index, last_index);
+ if (WARN_ON(!access))
+ goto out_unlock;
+
+ WARN_ON(area->num_accesses == 0 || access->users == 0);
+ area->num_accesses--;
+ access->users--;
+ if (access->users)
+ goto out_unlock;
+
+ interval_tree_remove(&access->node, &pages->access_itree);
+ iopt_pages_unfill_xarray(pages, start_index, last_index);
+ kfree(access);
+out_unlock:
+ mutex_unlock(&pages->mutex);
+}
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
new file mode 100644
index 000000000000..cfb5fe9a5e0e
--- /dev/null
+++ b/drivers/iommu/iommufd/selftest.c
@@ -0,0 +1,853 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
+ *
+ * Kernel side components to support tools/testing/selftests/iommu
+ */
+#include <linux/slab.h>
+#include <linux/iommu.h>
+#include <linux/xarray.h>
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+#include <linux/fault-inject.h>
+#include <uapi/linux/iommufd.h>
+
+#include "io_pagetable.h"
+#include "iommufd_private.h"
+#include "iommufd_test.h"
+
+static DECLARE_FAULT_ATTR(fail_iommufd);
+static struct dentry *dbgfs_root;
+
+size_t iommufd_test_memory_limit = 65536;
+
+enum {
+ MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2,
+
+ /*
+ * Like a real page table alignment requires the low bits of the address
+ * to be zero. xarray also requires the high bit to be zero, so we store
+ * the pfns shifted. The upper bits are used for metadata.
+ */
+ MOCK_PFN_MASK = ULONG_MAX / MOCK_IO_PAGE_SIZE,
+
+ _MOCK_PFN_START = MOCK_PFN_MASK + 1,
+ MOCK_PFN_START_IOVA = _MOCK_PFN_START,
+ MOCK_PFN_LAST_IOVA = _MOCK_PFN_START,
+};
+
+/*
+ * Syzkaller has trouble randomizing the correct iova to use since it is linked
+ * to the map ioctl's output, and it has no ide about that. So, simplify things.
+ * In syzkaller mode the 64 bit IOVA is converted into an nth area and offset
+ * value. This has a much smaller randomization space and syzkaller can hit it.
+ */
+static unsigned long iommufd_test_syz_conv_iova(struct io_pagetable *iopt,
+ u64 *iova)
+{
+ struct syz_layout {
+ __u32 nth_area;
+ __u32 offset;
+ };
+ struct syz_layout *syz = (void *)iova;
+ unsigned int nth = syz->nth_area;
+ struct iopt_area *area;
+
+ down_read(&iopt->iova_rwsem);
+ for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+ area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+ if (nth == 0) {
+ up_read(&iopt->iova_rwsem);
+ return iopt_area_iova(area) + syz->offset;
+ }
+ nth--;
+ }
+ up_read(&iopt->iova_rwsem);
+
+ return 0;
+}
+
+void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
+ unsigned int ioas_id, u64 *iova, u32 *flags)
+{
+ struct iommufd_ioas *ioas;
+
+ if (!(*flags & MOCK_FLAGS_ACCESS_SYZ))
+ return;
+ *flags &= ~(u32)MOCK_FLAGS_ACCESS_SYZ;
+
+ ioas = iommufd_get_ioas(ucmd, ioas_id);
+ if (IS_ERR(ioas))
+ return;
+ *iova = iommufd_test_syz_conv_iova(&ioas->iopt, iova);
+ iommufd_put_object(&ioas->obj);
+}
+
+struct mock_iommu_domain {
+ struct iommu_domain domain;
+ struct xarray pfns;
+};
+
+enum selftest_obj_type {
+ TYPE_IDEV,
+};
+
+struct selftest_obj {
+ struct iommufd_object obj;
+ enum selftest_obj_type type;
+
+ union {
+ struct {
+ struct iommufd_hw_pagetable *hwpt;
+ struct iommufd_ctx *ictx;
+ struct device mock_dev;
+ } idev;
+ };
+};
+
+static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type)
+{
+ struct mock_iommu_domain *mock;
+
+ if (WARN_ON(iommu_domain_type != IOMMU_DOMAIN_UNMANAGED))
+ return NULL;
+
+ mock = kzalloc(sizeof(*mock), GFP_KERNEL);
+ if (!mock)
+ return NULL;
+ mock->domain.geometry.aperture_start = MOCK_APERTURE_START;
+ mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST;
+ mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE;
+ xa_init(&mock->pfns);
+ return &mock->domain;
+}
+
+static void mock_domain_free(struct iommu_domain *domain)
+{
+ struct mock_iommu_domain *mock =
+ container_of(domain, struct mock_iommu_domain, domain);
+
+ WARN_ON(!xa_empty(&mock->pfns));
+ kfree(mock);
+}
+
+static int mock_domain_map_pages(struct iommu_domain *domain,
+ unsigned long iova, phys_addr_t paddr,
+ size_t pgsize, size_t pgcount, int prot,
+ gfp_t gfp, size_t *mapped)
+{
+ struct mock_iommu_domain *mock =
+ container_of(domain, struct mock_iommu_domain, domain);
+ unsigned long flags = MOCK_PFN_START_IOVA;
+ unsigned long start_iova = iova;
+
+ /*
+ * xarray does not reliably work with fault injection because it does a
+ * retry allocation, so put our own failure point.
+ */
+ if (iommufd_should_fail())
+ return -ENOENT;
+
+ WARN_ON(iova % MOCK_IO_PAGE_SIZE);
+ WARN_ON(pgsize % MOCK_IO_PAGE_SIZE);
+ for (; pgcount; pgcount--) {
+ size_t cur;
+
+ for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
+ void *old;
+
+ if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize)
+ flags = MOCK_PFN_LAST_IOVA;
+ old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE,
+ xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) |
+ flags),
+ gfp);
+ if (xa_is_err(old)) {
+ for (; start_iova != iova;
+ start_iova += MOCK_IO_PAGE_SIZE)
+ xa_erase(&mock->pfns,
+ start_iova /
+ MOCK_IO_PAGE_SIZE);
+ return xa_err(old);
+ }
+ WARN_ON(old);
+ iova += MOCK_IO_PAGE_SIZE;
+ paddr += MOCK_IO_PAGE_SIZE;
+ *mapped += MOCK_IO_PAGE_SIZE;
+ flags = 0;
+ }
+ }
+ return 0;
+}
+
+static size_t mock_domain_unmap_pages(struct iommu_domain *domain,
+ unsigned long iova, size_t pgsize,
+ size_t pgcount,
+ struct iommu_iotlb_gather *iotlb_gather)
+{
+ struct mock_iommu_domain *mock =
+ container_of(domain, struct mock_iommu_domain, domain);
+ bool first = true;
+ size_t ret = 0;
+ void *ent;
+
+ WARN_ON(iova % MOCK_IO_PAGE_SIZE);
+ WARN_ON(pgsize % MOCK_IO_PAGE_SIZE);
+
+ for (; pgcount; pgcount--) {
+ size_t cur;
+
+ for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
+ ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
+ WARN_ON(!ent);
+ /*
+ * iommufd generates unmaps that must be a strict
+ * superset of the map's performend So every starting
+ * IOVA should have been an iova passed to map, and the
+ *
+ * First IOVA must be present and have been a first IOVA
+ * passed to map_pages
+ */
+ if (first) {
+ WARN_ON(!(xa_to_value(ent) &
+ MOCK_PFN_START_IOVA));
+ first = false;
+ }
+ if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize)
+ WARN_ON(!(xa_to_value(ent) &
+ MOCK_PFN_LAST_IOVA));
+
+ iova += MOCK_IO_PAGE_SIZE;
+ ret += MOCK_IO_PAGE_SIZE;
+ }
+ }
+ return ret;
+}
+
+static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct mock_iommu_domain *mock =
+ container_of(domain, struct mock_iommu_domain, domain);
+ void *ent;
+
+ WARN_ON(iova % MOCK_IO_PAGE_SIZE);
+ ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
+ WARN_ON(!ent);
+ return (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE;
+}
+
+static const struct iommu_ops mock_ops = {
+ .owner = THIS_MODULE,
+ .pgsize_bitmap = MOCK_IO_PAGE_SIZE,
+ .domain_alloc = mock_domain_alloc,
+ .default_domain_ops =
+ &(struct iommu_domain_ops){
+ .free = mock_domain_free,
+ .map_pages = mock_domain_map_pages,
+ .unmap_pages = mock_domain_unmap_pages,
+ .iova_to_phys = mock_domain_iova_to_phys,
+ },
+};
+
+static inline struct iommufd_hw_pagetable *
+get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
+ struct mock_iommu_domain **mock)
+{
+ struct iommufd_hw_pagetable *hwpt;
+ struct iommufd_object *obj;
+
+ obj = iommufd_get_object(ucmd->ictx, mockpt_id,
+ IOMMUFD_OBJ_HW_PAGETABLE);
+ if (IS_ERR(obj))
+ return ERR_CAST(obj);
+ hwpt = container_of(obj, struct iommufd_hw_pagetable, obj);
+ if (hwpt->domain->ops != mock_ops.default_domain_ops) {
+ iommufd_put_object(&hwpt->obj);
+ return ERR_PTR(-EINVAL);
+ }
+ *mock = container_of(hwpt->domain, struct mock_iommu_domain, domain);
+ return hwpt;
+}
+
+/* Create an hw_pagetable with the mock domain so we can test the domain ops */
+static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd,
+ struct iommu_test_cmd *cmd)
+{
+ static struct bus_type mock_bus = { .iommu_ops = &mock_ops };
+ struct iommufd_hw_pagetable *hwpt;
+ struct selftest_obj *sobj;
+ struct iommufd_ioas *ioas;
+ int rc;
+
+ ioas = iommufd_get_ioas(ucmd, cmd->id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ sobj = iommufd_object_alloc(ucmd->ictx, sobj, IOMMUFD_OBJ_SELFTEST);
+ if (IS_ERR(sobj)) {
+ rc = PTR_ERR(sobj);
+ goto out_ioas;
+ }
+ sobj->idev.ictx = ucmd->ictx;
+ sobj->type = TYPE_IDEV;
+ sobj->idev.mock_dev.bus = &mock_bus;
+
+ hwpt = iommufd_device_selftest_attach(ucmd->ictx, ioas,
+ &sobj->idev.mock_dev);
+ if (IS_ERR(hwpt)) {
+ rc = PTR_ERR(hwpt);
+ goto out_sobj;
+ }
+ sobj->idev.hwpt = hwpt;
+
+ /* Userspace must destroy both of these IDs to destroy the object */
+ cmd->mock_domain.out_hwpt_id = hwpt->obj.id;
+ cmd->mock_domain.out_device_id = sobj->obj.id;
+ iommufd_object_finalize(ucmd->ictx, &sobj->obj);
+ iommufd_put_object(&ioas->obj);
+ return iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+
+out_sobj:
+ iommufd_object_abort(ucmd->ictx, &sobj->obj);
+out_ioas:
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+/* Add an additional reserved IOVA to the IOAS */
+static int iommufd_test_add_reserved(struct iommufd_ucmd *ucmd,
+ unsigned int mockpt_id,
+ unsigned long start, size_t length)
+{
+ struct iommufd_ioas *ioas;
+ int rc;
+
+ ioas = iommufd_get_ioas(ucmd, mockpt_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+ down_write(&ioas->iopt.iova_rwsem);
+ rc = iopt_reserve_iova(&ioas->iopt, start, start + length - 1, NULL);
+ up_write(&ioas->iopt.iova_rwsem);
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+/* Check that every pfn under each iova matches the pfn under a user VA */
+static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd,
+ unsigned int mockpt_id, unsigned long iova,
+ size_t length, void __user *uptr)
+{
+ struct iommufd_hw_pagetable *hwpt;
+ struct mock_iommu_domain *mock;
+ int rc;
+
+ if (iova % MOCK_IO_PAGE_SIZE || length % MOCK_IO_PAGE_SIZE ||
+ (uintptr_t)uptr % MOCK_IO_PAGE_SIZE)
+ return -EINVAL;
+
+ hwpt = get_md_pagetable(ucmd, mockpt_id, &mock);
+ if (IS_ERR(hwpt))
+ return PTR_ERR(hwpt);
+
+ for (; length; length -= MOCK_IO_PAGE_SIZE) {
+ struct page *pages[1];
+ unsigned long pfn;
+ long npages;
+ void *ent;
+
+ npages = get_user_pages_fast((uintptr_t)uptr & PAGE_MASK, 1, 0,
+ pages);
+ if (npages < 0) {
+ rc = npages;
+ goto out_put;
+ }
+ if (WARN_ON(npages != 1)) {
+ rc = -EFAULT;
+ goto out_put;
+ }
+ pfn = page_to_pfn(pages[0]);
+ put_page(pages[0]);
+
+ ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
+ if (!ent ||
+ (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE !=
+ pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) {
+ rc = -EINVAL;
+ goto out_put;
+ }
+ iova += MOCK_IO_PAGE_SIZE;
+ uptr += MOCK_IO_PAGE_SIZE;
+ }
+ rc = 0;
+
+out_put:
+ iommufd_put_object(&hwpt->obj);
+ return rc;
+}
+
+/* Check that the page ref count matches, to look for missing pin/unpins */
+static int iommufd_test_md_check_refs(struct iommufd_ucmd *ucmd,
+ void __user *uptr, size_t length,
+ unsigned int refs)
+{
+ if (length % PAGE_SIZE || (uintptr_t)uptr % PAGE_SIZE)
+ return -EINVAL;
+
+ for (; length; length -= PAGE_SIZE) {
+ struct page *pages[1];
+ long npages;
+
+ npages = get_user_pages_fast((uintptr_t)uptr, 1, 0, pages);
+ if (npages < 0)
+ return npages;
+ if (WARN_ON(npages != 1))
+ return -EFAULT;
+ if (!PageCompound(pages[0])) {
+ unsigned int count;
+
+ count = page_ref_count(pages[0]);
+ if (count / GUP_PIN_COUNTING_BIAS != refs) {
+ put_page(pages[0]);
+ return -EIO;
+ }
+ }
+ put_page(pages[0]);
+ uptr += PAGE_SIZE;
+ }
+ return 0;
+}
+
+struct selftest_access {
+ struct iommufd_access *access;
+ struct file *file;
+ struct mutex lock;
+ struct list_head items;
+ unsigned int next_id;
+ bool destroying;
+};
+
+struct selftest_access_item {
+ struct list_head items_elm;
+ unsigned long iova;
+ size_t length;
+ unsigned int id;
+};
+
+static const struct file_operations iommfd_test_staccess_fops;
+
+static struct selftest_access *iommufd_access_get(int fd)
+{
+ struct file *file;
+
+ file = fget(fd);
+ if (!file)
+ return ERR_PTR(-EBADFD);
+
+ if (file->f_op != &iommfd_test_staccess_fops) {
+ fput(file);
+ return ERR_PTR(-EBADFD);
+ }
+ return file->private_data;
+}
+
+static void iommufd_test_access_unmap(void *data, unsigned long iova,
+ unsigned long length)
+{
+ unsigned long iova_last = iova + length - 1;
+ struct selftest_access *staccess = data;
+ struct selftest_access_item *item;
+ struct selftest_access_item *tmp;
+
+ mutex_lock(&staccess->lock);
+ list_for_each_entry_safe(item, tmp, &staccess->items, items_elm) {
+ if (iova > item->iova + item->length - 1 ||
+ iova_last < item->iova)
+ continue;
+ list_del(&item->items_elm);
+ iommufd_access_unpin_pages(staccess->access, item->iova,
+ item->length);
+ kfree(item);
+ }
+ mutex_unlock(&staccess->lock);
+}
+
+static int iommufd_test_access_item_destroy(struct iommufd_ucmd *ucmd,
+ unsigned int access_id,
+ unsigned int item_id)
+{
+ struct selftest_access_item *item;
+ struct selftest_access *staccess;
+
+ staccess = iommufd_access_get(access_id);
+ if (IS_ERR(staccess))
+ return PTR_ERR(staccess);
+
+ mutex_lock(&staccess->lock);
+ list_for_each_entry(item, &staccess->items, items_elm) {
+ if (item->id == item_id) {
+ list_del(&item->items_elm);
+ iommufd_access_unpin_pages(staccess->access, item->iova,
+ item->length);
+ mutex_unlock(&staccess->lock);
+ kfree(item);
+ fput(staccess->file);
+ return 0;
+ }
+ }
+ mutex_unlock(&staccess->lock);
+ fput(staccess->file);
+ return -ENOENT;
+}
+
+static int iommufd_test_staccess_release(struct inode *inode,
+ struct file *filep)
+{
+ struct selftest_access *staccess = filep->private_data;
+
+ if (staccess->access) {
+ iommufd_test_access_unmap(staccess, 0, ULONG_MAX);
+ iommufd_access_destroy(staccess->access);
+ }
+ mutex_destroy(&staccess->lock);
+ kfree(staccess);
+ return 0;
+}
+
+static const struct iommufd_access_ops selftest_access_ops_pin = {
+ .needs_pin_pages = 1,
+ .unmap = iommufd_test_access_unmap,
+};
+
+static const struct iommufd_access_ops selftest_access_ops = {
+ .unmap = iommufd_test_access_unmap,
+};
+
+static const struct file_operations iommfd_test_staccess_fops = {
+ .release = iommufd_test_staccess_release,
+};
+
+static struct selftest_access *iommufd_test_alloc_access(void)
+{
+ struct selftest_access *staccess;
+ struct file *filep;
+
+ staccess = kzalloc(sizeof(*staccess), GFP_KERNEL_ACCOUNT);
+ if (!staccess)
+ return ERR_PTR(-ENOMEM);
+ INIT_LIST_HEAD(&staccess->items);
+ mutex_init(&staccess->lock);
+
+ filep = anon_inode_getfile("[iommufd_test_staccess]",
+ &iommfd_test_staccess_fops, staccess,
+ O_RDWR);
+ if (IS_ERR(filep)) {
+ kfree(staccess);
+ return ERR_CAST(filep);
+ }
+ staccess->file = filep;
+ return staccess;
+}
+
+static int iommufd_test_create_access(struct iommufd_ucmd *ucmd,
+ unsigned int ioas_id, unsigned int flags)
+{
+ struct iommu_test_cmd *cmd = ucmd->cmd;
+ struct selftest_access *staccess;
+ struct iommufd_access *access;
+ int fdno;
+ int rc;
+
+ if (flags & ~MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES)
+ return -EOPNOTSUPP;
+
+ staccess = iommufd_test_alloc_access();
+ if (IS_ERR(staccess))
+ return PTR_ERR(staccess);
+
+ fdno = get_unused_fd_flags(O_CLOEXEC);
+ if (fdno < 0) {
+ rc = -ENOMEM;
+ goto out_free_staccess;
+ }
+
+ access = iommufd_access_create(
+ ucmd->ictx, ioas_id,
+ (flags & MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES) ?
+ &selftest_access_ops_pin :
+ &selftest_access_ops,
+ staccess);
+ if (IS_ERR(access)) {
+ rc = PTR_ERR(access);
+ goto out_put_fdno;
+ }
+ cmd->create_access.out_access_fd = fdno;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_destroy;
+
+ staccess->access = access;
+ fd_install(fdno, staccess->file);
+ return 0;
+
+out_destroy:
+ iommufd_access_destroy(access);
+out_put_fdno:
+ put_unused_fd(fdno);
+out_free_staccess:
+ fput(staccess->file);
+ return rc;
+}
+
+/* Check that the pages in a page array match the pages in the user VA */
+static int iommufd_test_check_pages(void __user *uptr, struct page **pages,
+ size_t npages)
+{
+ for (; npages; npages--) {
+ struct page *tmp_pages[1];
+ long rc;
+
+ rc = get_user_pages_fast((uintptr_t)uptr, 1, 0, tmp_pages);
+ if (rc < 0)
+ return rc;
+ if (WARN_ON(rc != 1))
+ return -EFAULT;
+ put_page(tmp_pages[0]);
+ if (tmp_pages[0] != *pages)
+ return -EBADE;
+ pages++;
+ uptr += PAGE_SIZE;
+ }
+ return 0;
+}
+
+static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd,
+ unsigned int access_id, unsigned long iova,
+ size_t length, void __user *uptr,
+ u32 flags)
+{
+ struct iommu_test_cmd *cmd = ucmd->cmd;
+ struct selftest_access_item *item;
+ struct selftest_access *staccess;
+ struct page **pages;
+ size_t npages;
+ int rc;
+
+ /* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */
+ if (length > 16*1024*1024)
+ return -ENOMEM;
+
+ if (flags & ~(MOCK_FLAGS_ACCESS_WRITE | MOCK_FLAGS_ACCESS_SYZ))
+ return -EOPNOTSUPP;
+
+ staccess = iommufd_access_get(access_id);
+ if (IS_ERR(staccess))
+ return PTR_ERR(staccess);
+
+ if (staccess->access->ops != &selftest_access_ops_pin) {
+ rc = -EOPNOTSUPP;
+ goto out_put;
+ }
+
+ if (flags & MOCK_FLAGS_ACCESS_SYZ)
+ iova = iommufd_test_syz_conv_iova(&staccess->access->ioas->iopt,
+ &cmd->access_pages.iova);
+
+ npages = (ALIGN(iova + length, PAGE_SIZE) -
+ ALIGN_DOWN(iova, PAGE_SIZE)) /
+ PAGE_SIZE;
+ pages = kvcalloc(npages, sizeof(*pages), GFP_KERNEL_ACCOUNT);
+ if (!pages) {
+ rc = -ENOMEM;
+ goto out_put;
+ }
+
+ /*
+ * Drivers will need to think very carefully about this locking. The
+ * core code can do multiple unmaps instantaneously after
+ * iommufd_access_pin_pages() and *all* the unmaps must not return until
+ * the range is unpinned. This simple implementation puts a global lock
+ * around the pin, which may not suit drivers that want this to be a
+ * performance path. drivers that get this wrong will trigger WARN_ON
+ * races and cause EDEADLOCK failures to userspace.
+ */
+ mutex_lock(&staccess->lock);
+ rc = iommufd_access_pin_pages(staccess->access, iova, length, pages,
+ flags & MOCK_FLAGS_ACCESS_WRITE);
+ if (rc)
+ goto out_unlock;
+
+ /* For syzkaller allow uptr to be NULL to skip this check */
+ if (uptr) {
+ rc = iommufd_test_check_pages(
+ uptr - (iova - ALIGN_DOWN(iova, PAGE_SIZE)), pages,
+ npages);
+ if (rc)
+ goto out_unaccess;
+ }
+
+ item = kzalloc(sizeof(*item), GFP_KERNEL_ACCOUNT);
+ if (!item) {
+ rc = -ENOMEM;
+ goto out_unaccess;
+ }
+
+ item->iova = iova;
+ item->length = length;
+ item->id = staccess->next_id++;
+ list_add_tail(&item->items_elm, &staccess->items);
+
+ cmd->access_pages.out_access_pages_id = item->id;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_free_item;
+ goto out_unlock;
+
+out_free_item:
+ list_del(&item->items_elm);
+ kfree(item);
+out_unaccess:
+ iommufd_access_unpin_pages(staccess->access, iova, length);
+out_unlock:
+ mutex_unlock(&staccess->lock);
+ kvfree(pages);
+out_put:
+ fput(staccess->file);
+ return rc;
+}
+
+static int iommufd_test_access_rw(struct iommufd_ucmd *ucmd,
+ unsigned int access_id, unsigned long iova,
+ size_t length, void __user *ubuf,
+ unsigned int flags)
+{
+ struct iommu_test_cmd *cmd = ucmd->cmd;
+ struct selftest_access *staccess;
+ void *tmp;
+ int rc;
+
+ /* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */
+ if (length > 16*1024*1024)
+ return -ENOMEM;
+
+ if (flags & ~(MOCK_ACCESS_RW_WRITE | MOCK_ACCESS_RW_SLOW_PATH |
+ MOCK_FLAGS_ACCESS_SYZ))
+ return -EOPNOTSUPP;
+
+ staccess = iommufd_access_get(access_id);
+ if (IS_ERR(staccess))
+ return PTR_ERR(staccess);
+
+ tmp = kvzalloc(length, GFP_KERNEL_ACCOUNT);
+ if (!tmp) {
+ rc = -ENOMEM;
+ goto out_put;
+ }
+
+ if (flags & MOCK_ACCESS_RW_WRITE) {
+ if (copy_from_user(tmp, ubuf, length)) {
+ rc = -EFAULT;
+ goto out_free;
+ }
+ }
+
+ if (flags & MOCK_FLAGS_ACCESS_SYZ)
+ iova = iommufd_test_syz_conv_iova(&staccess->access->ioas->iopt,
+ &cmd->access_rw.iova);
+
+ rc = iommufd_access_rw(staccess->access, iova, tmp, length, flags);
+ if (rc)
+ goto out_free;
+ if (!(flags & MOCK_ACCESS_RW_WRITE)) {
+ if (copy_to_user(ubuf, tmp, length)) {
+ rc = -EFAULT;
+ goto out_free;
+ }
+ }
+
+out_free:
+ kvfree(tmp);
+out_put:
+ fput(staccess->file);
+ return rc;
+}
+static_assert((unsigned int)MOCK_ACCESS_RW_WRITE == IOMMUFD_ACCESS_RW_WRITE);
+static_assert((unsigned int)MOCK_ACCESS_RW_SLOW_PATH ==
+ __IOMMUFD_ACCESS_RW_SLOW_PATH);
+
+void iommufd_selftest_destroy(struct iommufd_object *obj)
+{
+ struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj);
+
+ switch (sobj->type) {
+ case TYPE_IDEV:
+ iommufd_device_selftest_detach(sobj->idev.ictx,
+ sobj->idev.hwpt);
+ break;
+ }
+}
+
+int iommufd_test(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_test_cmd *cmd = ucmd->cmd;
+
+ switch (cmd->op) {
+ case IOMMU_TEST_OP_ADD_RESERVED:
+ return iommufd_test_add_reserved(ucmd, cmd->id,
+ cmd->add_reserved.start,
+ cmd->add_reserved.length);
+ case IOMMU_TEST_OP_MOCK_DOMAIN:
+ return iommufd_test_mock_domain(ucmd, cmd);
+ case IOMMU_TEST_OP_MD_CHECK_MAP:
+ return iommufd_test_md_check_pa(
+ ucmd, cmd->id, cmd->check_map.iova,
+ cmd->check_map.length,
+ u64_to_user_ptr(cmd->check_map.uptr));
+ case IOMMU_TEST_OP_MD_CHECK_REFS:
+ return iommufd_test_md_check_refs(
+ ucmd, u64_to_user_ptr(cmd->check_refs.uptr),
+ cmd->check_refs.length, cmd->check_refs.refs);
+ case IOMMU_TEST_OP_CREATE_ACCESS:
+ return iommufd_test_create_access(ucmd, cmd->id,
+ cmd->create_access.flags);
+ case IOMMU_TEST_OP_ACCESS_PAGES:
+ return iommufd_test_access_pages(
+ ucmd, cmd->id, cmd->access_pages.iova,
+ cmd->access_pages.length,
+ u64_to_user_ptr(cmd->access_pages.uptr),
+ cmd->access_pages.flags);
+ case IOMMU_TEST_OP_ACCESS_RW:
+ return iommufd_test_access_rw(
+ ucmd, cmd->id, cmd->access_rw.iova,
+ cmd->access_rw.length,
+ u64_to_user_ptr(cmd->access_rw.uptr),
+ cmd->access_rw.flags);
+ case IOMMU_TEST_OP_DESTROY_ACCESS_PAGES:
+ return iommufd_test_access_item_destroy(
+ ucmd, cmd->id, cmd->destroy_access_pages.access_pages_id);
+ case IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT:
+ /* Protect _batch_init(), can not be less than elmsz */
+ if (cmd->memory_limit.limit <
+ sizeof(unsigned long) + sizeof(u32))
+ return -EINVAL;
+ iommufd_test_memory_limit = cmd->memory_limit.limit;
+ return 0;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+bool iommufd_should_fail(void)
+{
+ return should_fail(&fail_iommufd, 1);
+}
+
+void __init iommufd_test_init(void)
+{
+ dbgfs_root =
+ fault_create_debugfs_attr("fail_iommufd", NULL, &fail_iommufd);
+}
+
+void iommufd_test_exit(void)
+{
+ debugfs_remove_recursive(dbgfs_root);
+}
diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c
new file mode 100644
index 000000000000..3ceca0e8311c
--- /dev/null
+++ b/drivers/iommu/iommufd/vfio_compat.c
@@ -0,0 +1,472 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#include <linux/file.h>
+#include <linux/interval_tree.h>
+#include <linux/iommu.h>
+#include <linux/iommufd.h>
+#include <linux/slab.h>
+#include <linux/vfio.h>
+#include <uapi/linux/vfio.h>
+#include <uapi/linux/iommufd.h>
+
+#include "iommufd_private.h"
+
+static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx)
+{
+ struct iommufd_ioas *ioas = ERR_PTR(-ENODEV);
+
+ xa_lock(&ictx->objects);
+ if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj))
+ goto out_unlock;
+ ioas = ictx->vfio_ioas;
+out_unlock:
+ xa_unlock(&ictx->objects);
+ return ioas;
+}
+
+/**
+ * iommufd_vfio_compat_ioas_id - Return the IOAS ID that vfio should use
+ * @ictx: Context to operate on
+ * @out_ioas_id: The ioas_id the caller should use
+ *
+ * The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate
+ * on since they do not have an IOAS ID input in their ABI. Only attaching a
+ * group should cause a default creation of the internal ioas, this returns the
+ * existing ioas if it has already been assigned somehow.
+ */
+int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
+{
+ struct iommufd_ioas *ioas = NULL;
+ struct iommufd_ioas *out_ioas;
+
+ ioas = iommufd_ioas_alloc(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ xa_lock(&ictx->objects);
+ if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj))
+ out_ioas = ictx->vfio_ioas;
+ else {
+ out_ioas = ioas;
+ ictx->vfio_ioas = ioas;
+ }
+ xa_unlock(&ictx->objects);
+
+ *out_ioas_id = out_ioas->obj.id;
+ if (out_ioas != ioas) {
+ iommufd_put_object(&out_ioas->obj);
+ iommufd_object_abort(ictx, &ioas->obj);
+ return 0;
+ }
+ /*
+ * An automatically created compat IOAS is treated as a userspace
+ * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET,
+ * and if not manually destroyed it will be destroyed automatically
+ * at iommufd release.
+ */
+ iommufd_object_finalize(ictx, &ioas->obj);
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_id, IOMMUFD_VFIO);
+
+int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_vfio_ioas *cmd = ucmd->cmd;
+ struct iommufd_ioas *ioas;
+
+ if (cmd->__reserved)
+ return -EOPNOTSUPP;
+ switch (cmd->op) {
+ case IOMMU_VFIO_IOAS_GET:
+ ioas = get_compat_ioas(ucmd->ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+ cmd->ioas_id = ioas->obj.id;
+ iommufd_put_object(&ioas->obj);
+ return iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+
+ case IOMMU_VFIO_IOAS_SET:
+ ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+ xa_lock(&ucmd->ictx->objects);
+ ucmd->ictx->vfio_ioas = ioas;
+ xa_unlock(&ucmd->ictx->objects);
+ iommufd_put_object(&ioas->obj);
+ return 0;
+
+ case IOMMU_VFIO_IOAS_CLEAR:
+ xa_lock(&ucmd->ictx->objects);
+ ucmd->ictx->vfio_ioas = NULL;
+ xa_unlock(&ucmd->ictx->objects);
+ return 0;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd,
+ void __user *arg)
+{
+ u32 supported_flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+ size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
+ struct vfio_iommu_type1_dma_map map;
+ int iommu_prot = IOMMU_CACHE;
+ struct iommufd_ioas *ioas;
+ unsigned long iova;
+ int rc;
+
+ if (copy_from_user(&map, arg, minsz))
+ return -EFAULT;
+
+ if (map.argsz < minsz || map.flags & ~supported_flags)
+ return -EINVAL;
+
+ if (map.flags & VFIO_DMA_MAP_FLAG_READ)
+ iommu_prot |= IOMMU_READ;
+ if (map.flags & VFIO_DMA_MAP_FLAG_WRITE)
+ iommu_prot |= IOMMU_WRITE;
+
+ ioas = get_compat_ioas(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ /*
+ * Maps created through the legacy interface always use VFIO compatible
+ * rlimit accounting. If the user wishes to use the faster user based
+ * rlimit accounting then they must use the new interface.
+ */
+ iova = map.iova;
+ rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr),
+ map.size, iommu_prot, 0);
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd,
+ void __user *arg)
+{
+ size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
+ /*
+ * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new
+ * dirty tracking direction:
+ * https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/
+ * https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/
+ */
+ u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL;
+ struct vfio_iommu_type1_dma_unmap unmap;
+ unsigned long unmapped = 0;
+ struct iommufd_ioas *ioas;
+ int rc;
+
+ if (copy_from_user(&unmap, arg, minsz))
+ return -EFAULT;
+
+ if (unmap.argsz < minsz || unmap.flags & ~supported_flags)
+ return -EINVAL;
+
+ ioas = get_compat_ioas(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) {
+ if (unmap.iova != 0 || unmap.size != 0) {
+ rc = -EINVAL;
+ goto err_put;
+ }
+ rc = iopt_unmap_all(&ioas->iopt, &unmapped);
+ } else {
+ if (READ_ONCE(ioas->iopt.disable_large_pages)) {
+ /*
+ * Create cuts at the start and last of the requested
+ * range. If the start IOVA is 0 then it doesn't need to
+ * be cut.
+ */
+ unsigned long iovas[] = { unmap.iova + unmap.size - 1,
+ unmap.iova - 1 };
+
+ rc = iopt_cut_iova(&ioas->iopt, iovas,
+ unmap.iova ? 2 : 1);
+ if (rc)
+ goto err_put;
+ }
+ rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size,
+ &unmapped);
+ }
+ unmap.size = unmapped;
+ if (copy_to_user(arg, &unmap, minsz))
+ rc = -EFAULT;
+
+err_put:
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
+{
+ struct iommufd_hw_pagetable *hwpt;
+ struct iommufd_ioas *ioas;
+ int rc = 1;
+
+ ioas = get_compat_ioas(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ mutex_lock(&ioas->mutex);
+ list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
+ if (!hwpt->enforce_cache_coherency) {
+ rc = 0;
+ break;
+ }
+ }
+ mutex_unlock(&ioas->mutex);
+
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
+ unsigned long type)
+{
+ switch (type) {
+ case VFIO_TYPE1_IOMMU:
+ case VFIO_TYPE1v2_IOMMU:
+ case VFIO_UNMAP_ALL:
+ return 1;
+
+ case VFIO_DMA_CC_IOMMU:
+ return iommufd_vfio_cc_iommu(ictx);
+
+ /*
+ * This is obsolete, and to be removed from VFIO. It was an incomplete
+ * idea that got merged.
+ * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/
+ */
+ case VFIO_TYPE1_NESTING_IOMMU:
+ return 0;
+
+ /*
+ * VFIO_DMA_MAP_FLAG_VADDR
+ * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/
+ * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/
+ *
+ * It is hard to see how this could be implemented safely.
+ */
+ case VFIO_UPDATE_VADDR:
+ default:
+ return 0;
+ }
+}
+
+static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type)
+{
+ struct iommufd_ioas *ioas = NULL;
+ int rc = 0;
+
+ if (type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU)
+ return -EINVAL;
+
+ /* VFIO fails the set_iommu if there is no group */
+ ioas = get_compat_ioas(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ /*
+ * The difference between TYPE1 and TYPE1v2 is the ability to unmap in
+ * the middle of mapped ranges. This is complicated by huge page support
+ * which creates single large IOPTEs that cannot be split by the iommu
+ * driver. TYPE1 is very old at this point and likely nothing uses it,
+ * however it is simple enough to emulate by simply disabling the
+ * problematic large IOPTEs. Then we can safely unmap within any range.
+ */
+ if (type == VFIO_TYPE1_IOMMU)
+ rc = iopt_disable_large_pages(&ioas->iopt);
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas)
+{
+ struct io_pagetable *iopt = &ioas->iopt;
+ unsigned long pgsize_bitmap = ULONG_MAX;
+ struct iommu_domain *domain;
+ unsigned long index;
+
+ down_read(&iopt->domains_rwsem);
+ xa_for_each(&iopt->domains, index, domain)
+ pgsize_bitmap &= domain->pgsize_bitmap;
+
+ /* See vfio_update_pgsize_bitmap() */
+ if (pgsize_bitmap & ~PAGE_MASK) {
+ pgsize_bitmap &= PAGE_MASK;
+ pgsize_bitmap |= PAGE_SIZE;
+ }
+ pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment);
+ up_read(&iopt->domains_rwsem);
+ return pgsize_bitmap;
+}
+
+static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas,
+ struct vfio_info_cap_header __user *cur,
+ size_t avail)
+{
+ struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas =
+ container_of(cur,
+ struct vfio_iommu_type1_info_cap_iova_range __user,
+ header);
+ struct vfio_iommu_type1_info_cap_iova_range cap_iovas = {
+ .header = {
+ .id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE,
+ .version = 1,
+ },
+ };
+ struct interval_tree_span_iter span;
+
+ interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
+ ULONG_MAX) {
+ struct vfio_iova_range range;
+
+ if (!span.is_hole)
+ continue;
+ range.start = span.start_hole;
+ range.end = span.last_hole;
+ if (avail >= struct_size(&cap_iovas, iova_ranges,
+ cap_iovas.nr_iovas + 1) &&
+ copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas],
+ &range, sizeof(range)))
+ return -EFAULT;
+ cap_iovas.nr_iovas++;
+ }
+ if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) &&
+ copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas)))
+ return -EFAULT;
+ return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas);
+}
+
+static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas,
+ struct vfio_info_cap_header __user *cur,
+ size_t avail)
+{
+ struct vfio_iommu_type1_info_dma_avail cap_dma = {
+ .header = {
+ .id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL,
+ .version = 1,
+ },
+ /*
+ * iommufd's limit is based on the cgroup's memory limit.
+ * Normally vfio would return U16_MAX here, and provide a module
+ * parameter to adjust it. Since S390 qemu userspace actually
+ * pays attention and needs a value bigger than U16_MAX return
+ * U32_MAX.
+ */
+ .avail = U32_MAX,
+ };
+
+ if (avail >= sizeof(cap_dma) &&
+ copy_to_user(cur, &cap_dma, sizeof(cap_dma)))
+ return -EFAULT;
+ return sizeof(cap_dma);
+}
+
+static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx,
+ void __user *arg)
+{
+ typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas,
+ struct vfio_info_cap_header __user *cur,
+ size_t avail);
+ static const fill_cap_fn fill_fns[] = {
+ iommufd_fill_cap_dma_avail,
+ iommufd_fill_cap_iova,
+ };
+ size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
+ struct vfio_info_cap_header __user *last_cap = NULL;
+ struct vfio_iommu_type1_info info;
+ struct iommufd_ioas *ioas;
+ size_t total_cap_size;
+ int rc;
+ int i;
+
+ if (copy_from_user(&info, arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+ minsz = min_t(size_t, info.argsz, sizeof(info));
+
+ ioas = get_compat_ioas(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ info.flags = VFIO_IOMMU_INFO_PGSIZES;
+ info.iova_pgsizes = iommufd_get_pagesizes(ioas);
+ info.cap_offset = 0;
+
+ down_read(&ioas->iopt.iova_rwsem);
+ total_cap_size = sizeof(info);
+ for (i = 0; i != ARRAY_SIZE(fill_fns); i++) {
+ int cap_size;
+
+ if (info.argsz > total_cap_size)
+ cap_size = fill_fns[i](ioas, arg + total_cap_size,
+ info.argsz - total_cap_size);
+ else
+ cap_size = fill_fns[i](ioas, NULL, 0);
+ if (cap_size < 0) {
+ rc = cap_size;
+ goto out_put;
+ }
+ if (last_cap && info.argsz >= total_cap_size &&
+ put_user(total_cap_size, &last_cap->next)) {
+ rc = -EFAULT;
+ goto out_put;
+ }
+ last_cap = arg + total_cap_size;
+ total_cap_size += cap_size;
+ }
+
+ /*
+ * If the user did not provide enough space then only some caps are
+ * returned and the argsz will be updated to the correct amount to get
+ * all caps.
+ */
+ if (info.argsz >= total_cap_size)
+ info.cap_offset = sizeof(info);
+ info.argsz = total_cap_size;
+ info.flags |= VFIO_IOMMU_INFO_CAPS;
+ if (copy_to_user(arg, &info, minsz)) {
+ rc = -EFAULT;
+ goto out_put;
+ }
+ rc = 0;
+
+out_put:
+ up_read(&ioas->iopt.iova_rwsem);
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
+ unsigned long arg)
+{
+ void __user *uarg = (void __user *)arg;
+
+ switch (cmd) {
+ case VFIO_GET_API_VERSION:
+ return VFIO_API_VERSION;
+ case VFIO_SET_IOMMU:
+ return iommufd_vfio_set_iommu(ictx, arg);
+ case VFIO_CHECK_EXTENSION:
+ return iommufd_vfio_check_extension(ictx, arg);
+ case VFIO_IOMMU_GET_INFO:
+ return iommufd_vfio_iommu_get_info(ictx, uarg);
+ case VFIO_IOMMU_MAP_DMA:
+ return iommufd_vfio_map_dma(ictx, cmd, uarg);
+ case VFIO_IOMMU_UNMAP_DMA:
+ return iommufd_vfio_unmap_dma(ictx, cmd, uarg);
+ case VFIO_IOMMU_DIRTY_PAGES:
+ default:
+ return -ENOIOCTLCMD;
+ }
+ return -ENOIOCTLCMD;
+}
diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h
index 288c26f50732..2b8026a39906 100644
--- a/include/linux/interval_tree.h
+++ b/include/linux/interval_tree.h
@@ -27,4 +27,62 @@ extern struct interval_tree_node *
interval_tree_iter_next(struct interval_tree_node *node,
unsigned long start, unsigned long last);
+/**
+ * struct interval_tree_span_iter - Find used and unused spans.
+ * @start_hole: Start of an interval for a hole when is_hole == 1
+ * @last_hole: Inclusive end of an interval for a hole when is_hole == 1
+ * @start_used: Start of a used interval when is_hole == 0
+ * @last_used: Inclusive end of a used interval when is_hole == 0
+ * @is_hole: 0 == used, 1 == is_hole, -1 == done iteration
+ *
+ * This iterator travels over spans in an interval tree. It does not return
+ * nodes but classifies each span as either a hole, where no nodes intersect, or
+ * a used, which is fully covered by nodes. Each iteration step toggles between
+ * hole and used until the entire range is covered. The returned spans always
+ * fully cover the requested range.
+ *
+ * The iterator is greedy, it always returns the largest hole or used possible,
+ * consolidating all consecutive nodes.
+ *
+ * Use interval_tree_span_iter_done() to detect end of iteration.
+ */
+struct interval_tree_span_iter {
+ /* private: not for use by the caller */
+ struct interval_tree_node *nodes[2];
+ unsigned long first_index;
+ unsigned long last_index;
+
+ /* public: */
+ union {
+ unsigned long start_hole;
+ unsigned long start_used;
+ };
+ union {
+ unsigned long last_hole;
+ unsigned long last_used;
+ };
+ int is_hole;
+};
+
+void interval_tree_span_iter_first(struct interval_tree_span_iter *state,
+ struct rb_root_cached *itree,
+ unsigned long first_index,
+ unsigned long last_index);
+void interval_tree_span_iter_advance(struct interval_tree_span_iter *iter,
+ struct rb_root_cached *itree,
+ unsigned long new_index);
+void interval_tree_span_iter_next(struct interval_tree_span_iter *state);
+
+static inline bool
+interval_tree_span_iter_done(struct interval_tree_span_iter *state)
+{
+ return state->is_hole == -1;
+}
+
+#define interval_tree_for_each_span(span, itree, first_index, last_index) \
+ for (interval_tree_span_iter_first(span, itree, \
+ first_index, last_index); \
+ !interval_tree_span_iter_done(span); \
+ interval_tree_span_iter_next(span))
+
#endif /* _LINUX_INTERVAL_TREE_H */
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 68d7d304cdb7..1690c334e516 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -124,6 +124,11 @@ enum iommu_cap {
IOMMU_CAP_NOEXEC, /* IOMMU_NOEXEC flag */
IOMMU_CAP_PRE_BOOT_PROTECTION, /* Firmware says it used the IOMMU for
DMA protection and we should too */
+ /*
+ * Per-device flag indicating if enforce_cache_coherency() will work on
+ * this device.
+ */
+ IOMMU_CAP_ENFORCE_CACHE_COHERENCY,
};
/* These are the possible reserved region types */
@@ -702,6 +707,9 @@ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner);
void iommu_group_release_dma_owner(struct iommu_group *group);
bool iommu_group_dma_owner_claimed(struct iommu_group *group);
+int iommu_device_claim_dma_owner(struct device *dev, void *owner);
+void iommu_device_release_dma_owner(struct device *dev);
+
struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
struct mm_struct *mm);
int iommu_attach_device_pasid(struct iommu_domain *domain,
@@ -1059,6 +1067,15 @@ static inline bool iommu_group_dma_owner_claimed(struct iommu_group *group)
return false;
}
+static inline void iommu_device_release_dma_owner(struct device *dev)
+{
+}
+
+static inline int iommu_device_claim_dma_owner(struct device *dev, void *owner)
+{
+ return -ENODEV;
+}
+
static inline struct iommu_domain *
iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm)
{
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
new file mode 100644
index 000000000000..650d45629647
--- /dev/null
+++ b/include/linux/iommufd.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Intel Corporation
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#ifndef __LINUX_IOMMUFD_H
+#define __LINUX_IOMMUFD_H
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+
+struct device;
+struct iommufd_device;
+struct page;
+struct iommufd_ctx;
+struct iommufd_access;
+struct file;
+
+struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
+ struct device *dev, u32 *id);
+void iommufd_device_unbind(struct iommufd_device *idev);
+
+int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id);
+void iommufd_device_detach(struct iommufd_device *idev);
+
+struct iommufd_access_ops {
+ u8 needs_pin_pages : 1;
+ void (*unmap)(void *data, unsigned long iova, unsigned long length);
+};
+
+enum {
+ IOMMUFD_ACCESS_RW_READ = 0,
+ IOMMUFD_ACCESS_RW_WRITE = 1 << 0,
+ /* Set if the caller is in a kthread then rw will use kthread_use_mm() */
+ IOMMUFD_ACCESS_RW_KTHREAD = 1 << 1,
+
+ /* Only for use by selftest */
+ __IOMMUFD_ACCESS_RW_SLOW_PATH = 1 << 2,
+};
+
+struct iommufd_access *
+iommufd_access_create(struct iommufd_ctx *ictx, u32 ioas_id,
+ const struct iommufd_access_ops *ops, void *data);
+void iommufd_access_destroy(struct iommufd_access *access);
+
+void iommufd_ctx_get(struct iommufd_ctx *ictx);
+
+#if IS_ENABLED(CONFIG_IOMMUFD)
+struct iommufd_ctx *iommufd_ctx_from_file(struct file *file);
+void iommufd_ctx_put(struct iommufd_ctx *ictx);
+
+int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova,
+ unsigned long length, struct page **out_pages,
+ unsigned int flags);
+void iommufd_access_unpin_pages(struct iommufd_access *access,
+ unsigned long iova, unsigned long length);
+int iommufd_access_rw(struct iommufd_access *access, unsigned long iova,
+ void *data, size_t len, unsigned int flags);
+int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id);
+#else /* !CONFIG_IOMMUFD */
+static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void iommufd_ctx_put(struct iommufd_ctx *ictx)
+{
+}
+
+static inline int iommufd_access_pin_pages(struct iommufd_access *access,
+ unsigned long iova,
+ unsigned long length,
+ struct page **out_pages,
+ unsigned int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void iommufd_access_unpin_pages(struct iommufd_access *access,
+ unsigned long iova,
+ unsigned long length)
+{
+}
+
+static inline int iommufd_access_rw(struct iommufd_access *access, unsigned long iova,
+ void *data, size_t len, unsigned int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx,
+ u32 *out_ioas_id)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_IOMMUFD */
+#endif
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index f054d0360a75..4cc52698e214 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -25,7 +25,7 @@ struct user_struct {
#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \
defined(CONFIG_NET) || defined(CONFIG_IO_URING) || \
- defined(CONFIG_VFIO_PCI_ZDEV_KVM)
+ defined(CONFIG_VFIO_PCI_ZDEV_KVM) || IS_ENABLED(CONFIG_IOMMUFD)
atomic_long_t locked_vm;
#endif
#ifdef CONFIG_WATCH_QUEUE
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
new file mode 100644
index 000000000000..98ebba80cfa1
--- /dev/null
+++ b/include/uapi/linux/iommufd.h
@@ -0,0 +1,347 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
+ */
+#ifndef _UAPI_IOMMUFD_H
+#define _UAPI_IOMMUFD_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define IOMMUFD_TYPE (';')
+
+/**
+ * DOC: General ioctl format
+ *
+ * The ioctl interface follows a general format to allow for extensibility. Each
+ * ioctl is passed in a structure pointer as the argument providing the size of
+ * the structure in the first u32. The kernel checks that any structure space
+ * beyond what it understands is 0. This allows userspace to use the backward
+ * compatible portion while consistently using the newer, larger, structures.
+ *
+ * ioctls use a standard meaning for common errnos:
+ *
+ * - ENOTTY: The IOCTL number itself is not supported at all
+ * - E2BIG: The IOCTL number is supported, but the provided structure has
+ * non-zero in a part the kernel does not understand.
+ * - EOPNOTSUPP: The IOCTL number is supported, and the structure is
+ * understood, however a known field has a value the kernel does not
+ * understand or support.
+ * - EINVAL: Everything about the IOCTL was understood, but a field is not
+ * correct.
+ * - ENOENT: An ID or IOVA provided does not exist.
+ * - ENOMEM: Out of memory.
+ * - EOVERFLOW: Mathematics overflowed.
+ *
+ * As well as additional errnos, within specific ioctls.
+ */
+enum {
+ IOMMUFD_CMD_BASE = 0x80,
+ IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE,
+ IOMMUFD_CMD_IOAS_ALLOC,
+ IOMMUFD_CMD_IOAS_ALLOW_IOVAS,
+ IOMMUFD_CMD_IOAS_COPY,
+ IOMMUFD_CMD_IOAS_IOVA_RANGES,
+ IOMMUFD_CMD_IOAS_MAP,
+ IOMMUFD_CMD_IOAS_UNMAP,
+ IOMMUFD_CMD_OPTION,
+ IOMMUFD_CMD_VFIO_IOAS,
+};
+
+/**
+ * struct iommu_destroy - ioctl(IOMMU_DESTROY)
+ * @size: sizeof(struct iommu_destroy)
+ * @id: iommufd object ID to destroy. Can be any destroyable object type.
+ *
+ * Destroy any object held within iommufd.
+ */
+struct iommu_destroy {
+ __u32 size;
+ __u32 id;
+};
+#define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY)
+
+/**
+ * struct iommu_ioas_alloc - ioctl(IOMMU_IOAS_ALLOC)
+ * @size: sizeof(struct iommu_ioas_alloc)
+ * @flags: Must be 0
+ * @out_ioas_id: Output IOAS ID for the allocated object
+ *
+ * Allocate an IO Address Space (IOAS) which holds an IO Virtual Address (IOVA)
+ * to memory mapping.
+ */
+struct iommu_ioas_alloc {
+ __u32 size;
+ __u32 flags;
+ __u32 out_ioas_id;
+};
+#define IOMMU_IOAS_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOC)
+
+/**
+ * struct iommu_iova_range - ioctl(IOMMU_IOVA_RANGE)
+ * @start: First IOVA
+ * @last: Inclusive last IOVA
+ *
+ * An interval in IOVA space.
+ */
+struct iommu_iova_range {
+ __aligned_u64 start;
+ __aligned_u64 last;
+};
+
+/**
+ * struct iommu_ioas_iova_ranges - ioctl(IOMMU_IOAS_IOVA_RANGES)
+ * @size: sizeof(struct iommu_ioas_iova_ranges)
+ * @ioas_id: IOAS ID to read ranges from
+ * @num_iovas: Input/Output total number of ranges in the IOAS
+ * @__reserved: Must be 0
+ * @allowed_iovas: Pointer to the output array of struct iommu_iova_range
+ * @out_iova_alignment: Minimum alignment required for mapping IOVA
+ *
+ * Query an IOAS for ranges of allowed IOVAs. Mapping IOVA outside these ranges
+ * is not allowed. num_iovas will be set to the total number of iovas and
+ * the allowed_iovas[] will be filled in as space permits.
+ *
+ * The allowed ranges are dependent on the HW path the DMA operation takes, and
+ * can change during the lifetime of the IOAS. A fresh empty IOAS will have a
+ * full range, and each attached device will narrow the ranges based on that
+ * device's HW restrictions. Detaching a device can widen the ranges. Userspace
+ * should query ranges after every attach/detach to know what IOVAs are valid
+ * for mapping.
+ *
+ * On input num_iovas is the length of the allowed_iovas array. On output it is
+ * the total number of iovas filled in. The ioctl will return -EMSGSIZE and set
+ * num_iovas to the required value if num_iovas is too small. In this case the
+ * caller should allocate a larger output array and re-issue the ioctl.
+ *
+ * out_iova_alignment returns the minimum IOVA alignment that can be given
+ * to IOMMU_IOAS_MAP/COPY. IOVA's must satisfy::
+ *
+ * starting_iova % out_iova_alignment == 0
+ * (starting_iova + length) % out_iova_alignment == 0
+ *
+ * out_iova_alignment can be 1 indicating any IOVA is allowed. It cannot
+ * be higher than the system PAGE_SIZE.
+ */
+struct iommu_ioas_iova_ranges {
+ __u32 size;
+ __u32 ioas_id;
+ __u32 num_iovas;
+ __u32 __reserved;
+ __aligned_u64 allowed_iovas;
+ __aligned_u64 out_iova_alignment;
+};
+#define IOMMU_IOAS_IOVA_RANGES _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_IOVA_RANGES)
+
+/**
+ * struct iommu_ioas_allow_iovas - ioctl(IOMMU_IOAS_ALLOW_IOVAS)
+ * @size: sizeof(struct iommu_ioas_allow_iovas)
+ * @ioas_id: IOAS ID to allow IOVAs from
+ * @num_iovas: Input/Output total number of ranges in the IOAS
+ * @__reserved: Must be 0
+ * @allowed_iovas: Pointer to array of struct iommu_iova_range
+ *
+ * Ensure a range of IOVAs are always available for allocation. If this call
+ * succeeds then IOMMU_IOAS_IOVA_RANGES will never return a list of IOVA ranges
+ * that are narrower than the ranges provided here. This call will fail if
+ * IOMMU_IOAS_IOVA_RANGES is currently narrower than the given ranges.
+ *
+ * When an IOAS is first created the IOVA_RANGES will be maximally sized, and as
+ * devices are attached the IOVA will narrow based on the device restrictions.
+ * When an allowed range is specified any narrowing will be refused, ie device
+ * attachment can fail if the device requires limiting within the allowed range.
+ *
+ * Automatic IOVA allocation is also impacted by this call. MAP will only
+ * allocate within the allowed IOVAs if they are present.
+ *
+ * This call replaces the entire allowed list with the given list.
+ */
+struct iommu_ioas_allow_iovas {
+ __u32 size;
+ __u32 ioas_id;
+ __u32 num_iovas;
+ __u32 __reserved;
+ __aligned_u64 allowed_iovas;
+};
+#define IOMMU_IOAS_ALLOW_IOVAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOW_IOVAS)
+
+/**
+ * enum iommufd_ioas_map_flags - Flags for map and copy
+ * @IOMMU_IOAS_MAP_FIXED_IOVA: If clear the kernel will compute an appropriate
+ * IOVA to place the mapping at
+ * @IOMMU_IOAS_MAP_WRITEABLE: DMA is allowed to write to this mapping
+ * @IOMMU_IOAS_MAP_READABLE: DMA is allowed to read from this mapping
+ */
+enum iommufd_ioas_map_flags {
+ IOMMU_IOAS_MAP_FIXED_IOVA = 1 << 0,
+ IOMMU_IOAS_MAP_WRITEABLE = 1 << 1,
+ IOMMU_IOAS_MAP_READABLE = 1 << 2,
+};
+
+/**
+ * struct iommu_ioas_map - ioctl(IOMMU_IOAS_MAP)
+ * @size: sizeof(struct iommu_ioas_map)
+ * @flags: Combination of enum iommufd_ioas_map_flags
+ * @ioas_id: IOAS ID to change the mapping of
+ * @__reserved: Must be 0
+ * @user_va: Userspace pointer to start mapping from
+ * @length: Number of bytes to map
+ * @iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is set
+ * then this must be provided as input.
+ *
+ * Set an IOVA mapping from a user pointer. If FIXED_IOVA is specified then the
+ * mapping will be established at iova, otherwise a suitable location based on
+ * the reserved and allowed lists will be automatically selected and returned in
+ * iova.
+ *
+ * If IOMMU_IOAS_MAP_FIXED_IOVA is specified then the iova range must currently
+ * be unused, existing IOVA cannot be replaced.
+ */
+struct iommu_ioas_map {
+ __u32 size;
+ __u32 flags;
+ __u32 ioas_id;
+ __u32 __reserved;
+ __aligned_u64 user_va;
+ __aligned_u64 length;
+ __aligned_u64 iova;
+};
+#define IOMMU_IOAS_MAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP)
+
+/**
+ * struct iommu_ioas_copy - ioctl(IOMMU_IOAS_COPY)
+ * @size: sizeof(struct iommu_ioas_copy)
+ * @flags: Combination of enum iommufd_ioas_map_flags
+ * @dst_ioas_id: IOAS ID to change the mapping of
+ * @src_ioas_id: IOAS ID to copy from
+ * @length: Number of bytes to copy and map
+ * @dst_iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is
+ * set then this must be provided as input.
+ * @src_iova: IOVA to start the copy
+ *
+ * Copy an already existing mapping from src_ioas_id and establish it in
+ * dst_ioas_id. The src iova/length must exactly match a range used with
+ * IOMMU_IOAS_MAP.
+ *
+ * This may be used to efficiently clone a subset of an IOAS to another, or as a
+ * kind of 'cache' to speed up mapping. Copy has an efficiency advantage over
+ * establishing equivalent new mappings, as internal resources are shared, and
+ * the kernel will pin the user memory only once.
+ */
+struct iommu_ioas_copy {
+ __u32 size;
+ __u32 flags;
+ __u32 dst_ioas_id;
+ __u32 src_ioas_id;
+ __aligned_u64 length;
+ __aligned_u64 dst_iova;
+ __aligned_u64 src_iova;
+};
+#define IOMMU_IOAS_COPY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_COPY)
+
+/**
+ * struct iommu_ioas_unmap - ioctl(IOMMU_IOAS_UNMAP)
+ * @size: sizeof(struct iommu_ioas_unmap)
+ * @ioas_id: IOAS ID to change the mapping of
+ * @iova: IOVA to start the unmapping at
+ * @length: Number of bytes to unmap, and return back the bytes unmapped
+ *
+ * Unmap an IOVA range. The iova/length must be a superset of a previously
+ * mapped range used with IOMMU_IOAS_MAP or IOMMU_IOAS_COPY. Splitting or
+ * truncating ranges is not allowed. The values 0 to U64_MAX will unmap
+ * everything.
+ */
+struct iommu_ioas_unmap {
+ __u32 size;
+ __u32 ioas_id;
+ __aligned_u64 iova;
+ __aligned_u64 length;
+};
+#define IOMMU_IOAS_UNMAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_UNMAP)
+
+/**
+ * enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and
+ * ioctl(IOMMU_OPTION_HUGE_PAGES)
+ * @IOMMU_OPTION_RLIMIT_MODE:
+ * Change how RLIMIT_MEMLOCK accounting works. The caller must have privilege
+ * to invoke this. Value 0 (default) is user based accouting, 1 uses process
+ * based accounting. Global option, object_id must be 0
+ * @IOMMU_OPTION_HUGE_PAGES:
+ * Value 1 (default) allows contiguous pages to be combined when generating
+ * iommu mappings. Value 0 disables combining, everything is mapped to
+ * PAGE_SIZE. This can be useful for benchmarking. This is a per-IOAS
+ * option, the object_id must be the IOAS ID.
+ */
+enum iommufd_option {
+ IOMMU_OPTION_RLIMIT_MODE = 0,
+ IOMMU_OPTION_HUGE_PAGES = 1,
+};
+
+/**
+ * enum iommufd_option_ops - ioctl(IOMMU_OPTION_OP_SET) and
+ * ioctl(IOMMU_OPTION_OP_GET)
+ * @IOMMU_OPTION_OP_SET: Set the option's value
+ * @IOMMU_OPTION_OP_GET: Get the option's value
+ */
+enum iommufd_option_ops {
+ IOMMU_OPTION_OP_SET = 0,
+ IOMMU_OPTION_OP_GET = 1,
+};
+
+/**
+ * struct iommu_option - iommu option multiplexer
+ * @size: sizeof(struct iommu_option)
+ * @option_id: One of enum iommufd_option
+ * @op: One of enum iommufd_option_ops
+ * @__reserved: Must be 0
+ * @object_id: ID of the object if required
+ * @val64: Option value to set or value returned on get
+ *
+ * Change a simple option value. This multiplexor allows controlling options
+ * on objects. IOMMU_OPTION_OP_SET will load an option and IOMMU_OPTION_OP_GET
+ * will return the current value.
+ */
+struct iommu_option {
+ __u32 size;
+ __u32 option_id;
+ __u16 op;
+ __u16 __reserved;
+ __u32 object_id;
+ __aligned_u64 val64;
+};
+#define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
+
+/**
+ * enum iommufd_vfio_ioas_op - IOMMU_VFIO_IOAS_* ioctls
+ * @IOMMU_VFIO_IOAS_GET: Get the current compatibility IOAS
+ * @IOMMU_VFIO_IOAS_SET: Change the current compatibility IOAS
+ * @IOMMU_VFIO_IOAS_CLEAR: Disable VFIO compatibility
+ */
+enum iommufd_vfio_ioas_op {
+ IOMMU_VFIO_IOAS_GET = 0,
+ IOMMU_VFIO_IOAS_SET = 1,
+ IOMMU_VFIO_IOAS_CLEAR = 2,
+};
+
+/**
+ * struct iommu_vfio_ioas - ioctl(IOMMU_VFIO_IOAS)
+ * @size: sizeof(struct iommu_vfio_ioas)
+ * @ioas_id: For IOMMU_VFIO_IOAS_SET the input IOAS ID to set
+ * For IOMMU_VFIO_IOAS_GET will output the IOAS ID
+ * @op: One of enum iommufd_vfio_ioas_op
+ * @__reserved: Must be 0
+ *
+ * The VFIO compatibility support uses a single ioas because VFIO APIs do not
+ * support the ID field. Set or Get the IOAS that VFIO compatibility will use.
+ * When VFIO_GROUP_SET_CONTAINER is used on an iommufd it will get the
+ * compatibility ioas, either by taking what is already set, or auto creating
+ * one. From then on VFIO will continue to use that ioas and is not effected by
+ * this ioctl. SET or CLEAR does not destroy any auto-created IOAS.
+ */
+struct iommu_vfio_ioas {
+ __u32 size;
+ __u32 ioas_id;
+ __u16 op;
+ __u16 __reserved;
+};
+#define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS)
+#endif
diff --git a/kernel/user.c b/kernel/user.c
index e2cf8c22b539..d667debeafd6 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -185,6 +185,7 @@ void free_uid(struct user_struct *up)
if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags))
free_user(up, flags);
}
+EXPORT_SYMBOL_GPL(free_uid);
struct user_struct *alloc_uid(kuid_t uid)
{
diff --git a/lib/Kconfig b/lib/Kconfig
index 9bbf8a4b2108..c6c323fd2517 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -479,6 +479,10 @@ config INTERVAL_TREE
for more information.
+config INTERVAL_TREE_SPAN_ITER
+ bool
+ depends on INTERVAL_TREE
+
config XARRAY_MULTI
bool
help
diff --git a/lib/interval_tree.c b/lib/interval_tree.c
index 593ce56ece50..3412737ff365 100644
--- a/lib/interval_tree.c
+++ b/lib/interval_tree.c
@@ -15,3 +15,135 @@ EXPORT_SYMBOL_GPL(interval_tree_insert);
EXPORT_SYMBOL_GPL(interval_tree_remove);
EXPORT_SYMBOL_GPL(interval_tree_iter_first);
EXPORT_SYMBOL_GPL(interval_tree_iter_next);
+
+#ifdef CONFIG_INTERVAL_TREE_SPAN_ITER
+/*
+ * Roll nodes[1] into nodes[0] by advancing nodes[1] to the end of a contiguous
+ * span of nodes. This makes nodes[0]->last the end of that contiguous used span
+ * indexes that started at the original nodes[1]->start. nodes[1] is now the
+ * first node starting the next used span. A hole span is between nodes[0]->last
+ * and nodes[1]->start. nodes[1] must be !NULL.
+ */
+static void
+interval_tree_span_iter_next_gap(struct interval_tree_span_iter *state)
+{
+ struct interval_tree_node *cur = state->nodes[1];
+
+ state->nodes[0] = cur;
+ do {
+ if (cur->last > state->nodes[0]->last)
+ state->nodes[0] = cur;
+ cur = interval_tree_iter_next(cur, state->first_index,
+ state->last_index);
+ } while (cur && (state->nodes[0]->last >= cur->start ||
+ state->nodes[0]->last + 1 == cur->start));
+ state->nodes[1] = cur;
+}
+
+void interval_tree_span_iter_first(struct interval_tree_span_iter *iter,
+ struct rb_root_cached *itree,
+ unsigned long first_index,
+ unsigned long last_index)
+{
+ iter->first_index = first_index;
+ iter->last_index = last_index;
+ iter->nodes[0] = NULL;
+ iter->nodes[1] =
+ interval_tree_iter_first(itree, first_index, last_index);
+ if (!iter->nodes[1]) {
+ /* No nodes intersect the span, whole span is hole */
+ iter->start_hole = first_index;
+ iter->last_hole = last_index;
+ iter->is_hole = 1;
+ return;
+ }
+ if (iter->nodes[1]->start > first_index) {
+ /* Leading hole on first iteration */
+ iter->start_hole = first_index;
+ iter->last_hole = iter->nodes[1]->start - 1;
+ iter->is_hole = 1;
+ interval_tree_span_iter_next_gap(iter);
+ return;
+ }
+
+ /* Starting inside a used */
+ iter->start_used = first_index;
+ iter->is_hole = 0;
+ interval_tree_span_iter_next_gap(iter);
+ iter->last_used = iter->nodes[0]->last;
+ if (iter->last_used >= last_index) {
+ iter->last_used = last_index;
+ iter->nodes[0] = NULL;
+ iter->nodes[1] = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(interval_tree_span_iter_first);
+
+void interval_tree_span_iter_next(struct interval_tree_span_iter *iter)
+{
+ if (!iter->nodes[0] && !iter->nodes[1]) {
+ iter->is_hole = -1;
+ return;
+ }
+
+ if (iter->is_hole) {
+ iter->start_used = iter->last_hole + 1;
+ iter->last_used = iter->nodes[0]->last;
+ if (iter->last_used >= iter->last_index) {
+ iter->last_used = iter->last_index;
+ iter->nodes[0] = NULL;
+ iter->nodes[1] = NULL;
+ }
+ iter->is_hole = 0;
+ return;
+ }
+
+ if (!iter->nodes[1]) {
+ /* Trailing hole */
+ iter->start_hole = iter->nodes[0]->last + 1;
+ iter->last_hole = iter->last_index;
+ iter->nodes[0] = NULL;
+ iter->is_hole = 1;
+ return;
+ }
+
+ /* must have both nodes[0] and [1], interior hole */
+ iter->start_hole = iter->nodes[0]->last + 1;
+ iter->last_hole = iter->nodes[1]->start - 1;
+ iter->is_hole = 1;
+ interval_tree_span_iter_next_gap(iter);
+}
+EXPORT_SYMBOL_GPL(interval_tree_span_iter_next);
+
+/*
+ * Advance the iterator index to a specific position. The returned used/hole is
+ * updated to start at new_index. This is faster than calling
+ * interval_tree_span_iter_first() as it can avoid full searches in several
+ * cases where the iterator is already set.
+ */
+void interval_tree_span_iter_advance(struct interval_tree_span_iter *iter,
+ struct rb_root_cached *itree,
+ unsigned long new_index)
+{
+ if (iter->is_hole == -1)
+ return;
+
+ iter->first_index = new_index;
+ if (new_index > iter->last_index) {
+ iter->is_hole = -1;
+ return;
+ }
+
+ /* Rely on the union aliasing hole/used */
+ if (iter->start_hole <= new_index && new_index <= iter->last_hole) {
+ iter->start_hole = new_index;
+ return;
+ }
+ if (new_index == iter->last_hole + 1)
+ interval_tree_span_iter_next(iter);
+ else
+ interval_tree_span_iter_first(iter, itree, new_index,
+ iter->last_index);
+}
+EXPORT_SYMBOL_GPL(interval_tree_span_iter_advance);
+#endif
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index aea04365bc69..48e3feca3170 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -256,6 +256,7 @@ my $doc_inline_sect = '\s*\*\s*(@\s*[\w][\w\.]*\s*):(.*)';
my $doc_inline_end = '^\s*\*/\s*$';
my $doc_inline_oneline = '^\s*/\*\*\s*(@[\w\s]+):\s*(.*)\s*\*/\s*$';
my $export_symbol = '^\s*EXPORT_SYMBOL(_GPL)?\s*\(\s*(\w+)\s*\)\s*;';
+my $export_symbol_ns = '^\s*EXPORT_SYMBOL_NS(_GPL)?\s*\(\s*(\w+)\s*,\s*\w+\)\s*;';
my $function_pointer = qr{([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)};
my $attribute = qr{__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)}i;
@@ -1948,6 +1949,10 @@ sub process_export_file($) {
next if (defined($nosymbol_table{$2}));
$function_table{$2} = 1;
}
+ if (/$export_symbol_ns/) {
+ next if (defined($nosymbol_table{$2}));
+ $function_table{$2} = 1;
+ }
}
close(IN);
@@ -2419,12 +2424,12 @@ found on PATH.
=item -export
Only output documentation for the symbols that have been exported using
-EXPORT_SYMBOL() or EXPORT_SYMBOL_GPL() in any input FILE or -export-file FILE.
+EXPORT_SYMBOL() and related macros in any input FILE or -export-file FILE.
=item -internal
Only output documentation for the symbols that have NOT been exported using
-EXPORT_SYMBOL() or EXPORT_SYMBOL_GPL() in any input FILE or -export-file FILE.
+EXPORT_SYMBOL() and related macros in any input FILE or -export-file FILE.
=item -function NAME
@@ -2451,8 +2456,7 @@ Do not output DOC: sections.
=item -export-file FILE
-Specify an additional FILE in which to look for EXPORT_SYMBOL() and
-EXPORT_SYMBOL_GPL().
+Specify an additional FILE in which to look for EXPORT_SYMBOL information.
To be used with -export or -internal.
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index f07aef7c592c..d6680af7b295 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -27,6 +27,7 @@ TARGETS += ftrace
TARGETS += futex
TARGETS += gpio
TARGETS += intel_pstate
+TARGETS += iommu
TARGETS += ipc
TARGETS += ir
TARGETS += kcmp
diff --git a/tools/testing/selftests/iommu/.gitignore b/tools/testing/selftests/iommu/.gitignore
new file mode 100644
index 000000000000..7d0703049eba
--- /dev/null
+++ b/tools/testing/selftests/iommu/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/iommufd
+/iommufd_fail_nth
diff --git a/tools/testing/selftests/iommu/Makefile b/tools/testing/selftests/iommu/Makefile
new file mode 100644
index 000000000000..7cb74d26f141
--- /dev/null
+++ b/tools/testing/selftests/iommu/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS += -Wall -O2 -Wno-unused-function
+CFLAGS += -I../../../../include/uapi/
+CFLAGS += -I../../../../include/
+
+CFLAGS += -D_GNU_SOURCE
+
+TEST_GEN_PROGS :=
+TEST_GEN_PROGS += iommufd
+TEST_GEN_PROGS += iommufd_fail_nth
+
+include ../lib.mk
diff --git a/tools/testing/selftests/iommu/config b/tools/testing/selftests/iommu/config
new file mode 100644
index 000000000000..6c4f901d6fed
--- /dev/null
+++ b/tools/testing/selftests/iommu/config
@@ -0,0 +1,2 @@
+CONFIG_IOMMUFD
+CONFIG_IOMMUFD_TEST
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
new file mode 100644
index 000000000000..8aa8a346cf22
--- /dev/null
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -0,0 +1,1654 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/eventfd.h>
+
+#define __EXPORTED_HEADERS__
+#include <linux/vfio.h>
+
+#include "iommufd_utils.h"
+
+static void *buffer;
+
+static unsigned long PAGE_SIZE;
+static unsigned long HUGEPAGE_SIZE;
+
+#define MOCK_PAGE_SIZE (PAGE_SIZE / 2)
+
+static unsigned long get_huge_page_size(void)
+{
+ char buf[80];
+ int ret;
+ int fd;
+
+ fd = open("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size",
+ O_RDONLY);
+ if (fd < 0)
+ return 2 * 1024 * 1024;
+
+ ret = read(fd, buf, sizeof(buf));
+ close(fd);
+ if (ret <= 0 || ret == sizeof(buf))
+ return 2 * 1024 * 1024;
+ buf[ret] = 0;
+ return strtoul(buf, NULL, 10);
+}
+
+static __attribute__((constructor)) void setup_sizes(void)
+{
+ void *vrc;
+ int rc;
+
+ PAGE_SIZE = sysconf(_SC_PAGE_SIZE);
+ HUGEPAGE_SIZE = get_huge_page_size();
+
+ BUFFER_SIZE = PAGE_SIZE * 16;
+ rc = posix_memalign(&buffer, HUGEPAGE_SIZE, BUFFER_SIZE);
+ assert(!rc);
+ assert(buffer);
+ assert((uintptr_t)buffer % HUGEPAGE_SIZE == 0);
+ vrc = mmap(buffer, BUFFER_SIZE, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+ assert(vrc == buffer);
+}
+
+FIXTURE(iommufd)
+{
+ int fd;
+};
+
+FIXTURE_SETUP(iommufd)
+{
+ self->fd = open("/dev/iommu", O_RDWR);
+ ASSERT_NE(-1, self->fd);
+}
+
+FIXTURE_TEARDOWN(iommufd)
+{
+ teardown_iommufd(self->fd, _metadata);
+}
+
+TEST_F(iommufd, simple_close)
+{
+}
+
+TEST_F(iommufd, cmd_fail)
+{
+ struct iommu_destroy cmd = { .size = sizeof(cmd), .id = 0 };
+
+ /* object id is invalid */
+ EXPECT_ERRNO(ENOENT, _test_ioctl_destroy(self->fd, 0));
+ /* Bad pointer */
+ EXPECT_ERRNO(EFAULT, ioctl(self->fd, IOMMU_DESTROY, NULL));
+ /* Unknown ioctl */
+ EXPECT_ERRNO(ENOTTY,
+ ioctl(self->fd, _IO(IOMMUFD_TYPE, IOMMUFD_CMD_BASE - 1),
+ &cmd));
+}
+
+TEST_F(iommufd, cmd_length)
+{
+#define TEST_LENGTH(_struct, _ioctl) \
+ { \
+ struct { \
+ struct _struct cmd; \
+ uint8_t extra; \
+ } cmd = { .cmd = { .size = sizeof(struct _struct) - 1 }, \
+ .extra = UINT8_MAX }; \
+ int old_errno; \
+ int rc; \
+ \
+ EXPECT_ERRNO(EINVAL, ioctl(self->fd, _ioctl, &cmd)); \
+ cmd.cmd.size = sizeof(struct _struct) + 1; \
+ EXPECT_ERRNO(E2BIG, ioctl(self->fd, _ioctl, &cmd)); \
+ cmd.cmd.size = sizeof(struct _struct); \
+ rc = ioctl(self->fd, _ioctl, &cmd); \
+ old_errno = errno; \
+ cmd.cmd.size = sizeof(struct _struct) + 1; \
+ cmd.extra = 0; \
+ if (rc) { \
+ EXPECT_ERRNO(old_errno, \
+ ioctl(self->fd, _ioctl, &cmd)); \
+ } else { \
+ ASSERT_EQ(0, ioctl(self->fd, _ioctl, &cmd)); \
+ } \
+ }
+
+ TEST_LENGTH(iommu_destroy, IOMMU_DESTROY);
+ TEST_LENGTH(iommu_ioas_alloc, IOMMU_IOAS_ALLOC);
+ TEST_LENGTH(iommu_ioas_iova_ranges, IOMMU_IOAS_IOVA_RANGES);
+ TEST_LENGTH(iommu_ioas_allow_iovas, IOMMU_IOAS_ALLOW_IOVAS);
+ TEST_LENGTH(iommu_ioas_map, IOMMU_IOAS_MAP);
+ TEST_LENGTH(iommu_ioas_copy, IOMMU_IOAS_COPY);
+ TEST_LENGTH(iommu_ioas_unmap, IOMMU_IOAS_UNMAP);
+ TEST_LENGTH(iommu_option, IOMMU_OPTION);
+ TEST_LENGTH(iommu_vfio_ioas, IOMMU_VFIO_IOAS);
+#undef TEST_LENGTH
+}
+
+TEST_F(iommufd, cmd_ex_fail)
+{
+ struct {
+ struct iommu_destroy cmd;
+ __u64 future;
+ } cmd = { .cmd = { .size = sizeof(cmd), .id = 0 } };
+
+ /* object id is invalid and command is longer */
+ EXPECT_ERRNO(ENOENT, ioctl(self->fd, IOMMU_DESTROY, &cmd));
+ /* future area is non-zero */
+ cmd.future = 1;
+ EXPECT_ERRNO(E2BIG, ioctl(self->fd, IOMMU_DESTROY, &cmd));
+ /* Original command "works" */
+ cmd.cmd.size = sizeof(cmd.cmd);
+ EXPECT_ERRNO(ENOENT, ioctl(self->fd, IOMMU_DESTROY, &cmd));
+ /* Short command fails */
+ cmd.cmd.size = sizeof(cmd.cmd) - 1;
+ EXPECT_ERRNO(EINVAL, ioctl(self->fd, IOMMU_DESTROY, &cmd));
+}
+
+TEST_F(iommufd, global_options)
+{
+ struct iommu_option cmd = {
+ .size = sizeof(cmd),
+ .option_id = IOMMU_OPTION_RLIMIT_MODE,
+ .op = IOMMU_OPTION_OP_GET,
+ .val64 = 1,
+ };
+
+ cmd.option_id = IOMMU_OPTION_RLIMIT_MODE;
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd));
+ ASSERT_EQ(0, cmd.val64);
+
+ /* This requires root */
+ cmd.op = IOMMU_OPTION_OP_SET;
+ cmd.val64 = 1;
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd));
+ cmd.val64 = 2;
+ EXPECT_ERRNO(EINVAL, ioctl(self->fd, IOMMU_OPTION, &cmd));
+
+ cmd.op = IOMMU_OPTION_OP_GET;
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd));
+ ASSERT_EQ(1, cmd.val64);
+
+ cmd.op = IOMMU_OPTION_OP_SET;
+ cmd.val64 = 0;
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd));
+
+ cmd.op = IOMMU_OPTION_OP_GET;
+ cmd.option_id = IOMMU_OPTION_HUGE_PAGES;
+ EXPECT_ERRNO(ENOENT, ioctl(self->fd, IOMMU_OPTION, &cmd));
+ cmd.op = IOMMU_OPTION_OP_SET;
+ EXPECT_ERRNO(ENOENT, ioctl(self->fd, IOMMU_OPTION, &cmd));
+}
+
+FIXTURE(iommufd_ioas)
+{
+ int fd;
+ uint32_t ioas_id;
+ uint32_t domain_id;
+ uint64_t base_iova;
+};
+
+FIXTURE_VARIANT(iommufd_ioas)
+{
+ unsigned int mock_domains;
+ unsigned int memory_limit;
+};
+
+FIXTURE_SETUP(iommufd_ioas)
+{
+ unsigned int i;
+
+
+ self->fd = open("/dev/iommu", O_RDWR);
+ ASSERT_NE(-1, self->fd);
+ test_ioctl_ioas_alloc(&self->ioas_id);
+
+ if (!variant->memory_limit) {
+ test_ioctl_set_default_memory_limit();
+ } else {
+ test_ioctl_set_temp_memory_limit(variant->memory_limit);
+ }
+
+ for (i = 0; i != variant->mock_domains; i++) {
+ test_cmd_mock_domain(self->ioas_id, NULL, &self->domain_id);
+ self->base_iova = MOCK_APERTURE_START;
+ }
+}
+
+FIXTURE_TEARDOWN(iommufd_ioas)
+{
+ test_ioctl_set_default_memory_limit();
+ teardown_iommufd(self->fd, _metadata);
+}
+
+FIXTURE_VARIANT_ADD(iommufd_ioas, no_domain)
+{
+};
+
+FIXTURE_VARIANT_ADD(iommufd_ioas, mock_domain)
+{
+ .mock_domains = 1,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_ioas, two_mock_domain)
+{
+ .mock_domains = 2,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_ioas, mock_domain_limit)
+{
+ .mock_domains = 1,
+ .memory_limit = 16,
+};
+
+TEST_F(iommufd_ioas, ioas_auto_destroy)
+{
+}
+
+TEST_F(iommufd_ioas, ioas_destroy)
+{
+ if (self->domain_id) {
+ /* IOAS cannot be freed while a domain is on it */
+ EXPECT_ERRNO(EBUSY,
+ _test_ioctl_destroy(self->fd, self->ioas_id));
+ } else {
+ /* Can allocate and manually free an IOAS table */
+ test_ioctl_destroy(self->ioas_id);
+ }
+}
+
+TEST_F(iommufd_ioas, ioas_area_destroy)
+{
+ /* Adding an area does not change ability to destroy */
+ test_ioctl_ioas_map_fixed(buffer, PAGE_SIZE, self->base_iova);
+ if (self->domain_id)
+ EXPECT_ERRNO(EBUSY,
+ _test_ioctl_destroy(self->fd, self->ioas_id));
+ else
+ test_ioctl_destroy(self->ioas_id);
+}
+
+TEST_F(iommufd_ioas, ioas_area_auto_destroy)
+{
+ int i;
+
+ /* Can allocate and automatically free an IOAS table with many areas */
+ for (i = 0; i != 10; i++) {
+ test_ioctl_ioas_map_fixed(buffer, PAGE_SIZE,
+ self->base_iova + i * PAGE_SIZE);
+ }
+}
+
+TEST_F(iommufd_ioas, area)
+{
+ int i;
+
+ /* Unmap fails if nothing is mapped */
+ for (i = 0; i != 10; i++)
+ test_err_ioctl_ioas_unmap(ENOENT, i * PAGE_SIZE, PAGE_SIZE);
+
+ /* Unmap works */
+ for (i = 0; i != 10; i++)
+ test_ioctl_ioas_map_fixed(buffer, PAGE_SIZE,
+ self->base_iova + i * PAGE_SIZE);
+ for (i = 0; i != 10; i++)
+ test_ioctl_ioas_unmap(self->base_iova + i * PAGE_SIZE,
+ PAGE_SIZE);
+
+ /* Split fails */
+ test_ioctl_ioas_map_fixed(buffer, PAGE_SIZE * 2,
+ self->base_iova + 16 * PAGE_SIZE);
+ test_err_ioctl_ioas_unmap(ENOENT, self->base_iova + 16 * PAGE_SIZE,
+ PAGE_SIZE);
+ test_err_ioctl_ioas_unmap(ENOENT, self->base_iova + 17 * PAGE_SIZE,
+ PAGE_SIZE);
+
+ /* Over map fails */
+ test_err_ioctl_ioas_map_fixed(EEXIST, buffer, PAGE_SIZE * 2,
+ self->base_iova + 16 * PAGE_SIZE);
+ test_err_ioctl_ioas_map_fixed(EEXIST, buffer, PAGE_SIZE,
+ self->base_iova + 16 * PAGE_SIZE);
+ test_err_ioctl_ioas_map_fixed(EEXIST, buffer, PAGE_SIZE,
+ self->base_iova + 17 * PAGE_SIZE);
+ test_err_ioctl_ioas_map_fixed(EEXIST, buffer, PAGE_SIZE * 2,
+ self->base_iova + 15 * PAGE_SIZE);
+ test_err_ioctl_ioas_map_fixed(EEXIST, buffer, PAGE_SIZE * 3,
+ self->base_iova + 15 * PAGE_SIZE);
+
+ /* unmap all works */
+ test_ioctl_ioas_unmap(0, UINT64_MAX);
+
+ /* Unmap all succeeds on an empty IOAS */
+ test_ioctl_ioas_unmap(0, UINT64_MAX);
+}
+
+TEST_F(iommufd_ioas, unmap_fully_contained_areas)
+{
+ uint64_t unmap_len;
+ int i;
+
+ /* Give no_domain some space to rewind base_iova */
+ self->base_iova += 4 * PAGE_SIZE;
+
+ for (i = 0; i != 4; i++)
+ test_ioctl_ioas_map_fixed(buffer, 8 * PAGE_SIZE,
+ self->base_iova + i * 16 * PAGE_SIZE);
+
+ /* Unmap not fully contained area doesn't work */
+ test_err_ioctl_ioas_unmap(ENOENT, self->base_iova - 4 * PAGE_SIZE,
+ 8 * PAGE_SIZE);
+ test_err_ioctl_ioas_unmap(ENOENT,
+ self->base_iova + 3 * 16 * PAGE_SIZE +
+ 8 * PAGE_SIZE - 4 * PAGE_SIZE,
+ 8 * PAGE_SIZE);
+
+ /* Unmap fully contained areas works */
+ ASSERT_EQ(0, _test_ioctl_ioas_unmap(self->fd, self->ioas_id,
+ self->base_iova - 4 * PAGE_SIZE,
+ 3 * 16 * PAGE_SIZE + 8 * PAGE_SIZE +
+ 4 * PAGE_SIZE,
+ &unmap_len));
+ ASSERT_EQ(32 * PAGE_SIZE, unmap_len);
+}
+
+TEST_F(iommufd_ioas, area_auto_iova)
+{
+ struct iommu_test_cmd test_cmd = {
+ .size = sizeof(test_cmd),
+ .op = IOMMU_TEST_OP_ADD_RESERVED,
+ .id = self->ioas_id,
+ .add_reserved = { .start = PAGE_SIZE * 4,
+ .length = PAGE_SIZE * 100 },
+ };
+ struct iommu_iova_range ranges[1] = {};
+ struct iommu_ioas_allow_iovas allow_cmd = {
+ .size = sizeof(allow_cmd),
+ .ioas_id = self->ioas_id,
+ .num_iovas = 1,
+ .allowed_iovas = (uintptr_t)ranges,
+ };
+ __u64 iovas[10];
+ int i;
+
+ /* Simple 4k pages */
+ for (i = 0; i != 10; i++)
+ test_ioctl_ioas_map(buffer, PAGE_SIZE, &iovas[i]);
+ for (i = 0; i != 10; i++)
+ test_ioctl_ioas_unmap(iovas[i], PAGE_SIZE);
+
+ /* Kernel automatically aligns IOVAs properly */
+ for (i = 0; i != 10; i++) {
+ size_t length = PAGE_SIZE * (i + 1);
+
+ if (self->domain_id) {
+ test_ioctl_ioas_map(buffer, length, &iovas[i]);
+ } else {
+ test_ioctl_ioas_map((void *)(1UL << 31), length,
+ &iovas[i]);
+ }
+ EXPECT_EQ(0, iovas[i] % (1UL << (ffs(length) - 1)));
+ }
+ for (i = 0; i != 10; i++)
+ test_ioctl_ioas_unmap(iovas[i], PAGE_SIZE * (i + 1));
+
+ /* Avoids a reserved region */
+ ASSERT_EQ(0,
+ ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ADD_RESERVED),
+ &test_cmd));
+ for (i = 0; i != 10; i++) {
+ size_t length = PAGE_SIZE * (i + 1);
+
+ test_ioctl_ioas_map(buffer, length, &iovas[i]);
+ EXPECT_EQ(0, iovas[i] % (1UL << (ffs(length) - 1)));
+ EXPECT_EQ(false,
+ iovas[i] > test_cmd.add_reserved.start &&
+ iovas[i] <
+ test_cmd.add_reserved.start +
+ test_cmd.add_reserved.length);
+ }
+ for (i = 0; i != 10; i++)
+ test_ioctl_ioas_unmap(iovas[i], PAGE_SIZE * (i + 1));
+
+ /* Allowed region intersects with a reserved region */
+ ranges[0].start = PAGE_SIZE;
+ ranges[0].last = PAGE_SIZE * 600;
+ EXPECT_ERRNO(EADDRINUSE,
+ ioctl(self->fd, IOMMU_IOAS_ALLOW_IOVAS, &allow_cmd));
+
+ /* Allocate from an allowed region */
+ if (self->domain_id) {
+ ranges[0].start = MOCK_APERTURE_START + PAGE_SIZE;
+ ranges[0].last = MOCK_APERTURE_START + PAGE_SIZE * 600 - 1;
+ } else {
+ ranges[0].start = PAGE_SIZE * 200;
+ ranges[0].last = PAGE_SIZE * 600 - 1;
+ }
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_ALLOW_IOVAS, &allow_cmd));
+ for (i = 0; i != 10; i++) {
+ size_t length = PAGE_SIZE * (i + 1);
+
+ test_ioctl_ioas_map(buffer, length, &iovas[i]);
+ EXPECT_EQ(0, iovas[i] % (1UL << (ffs(length) - 1)));
+ EXPECT_EQ(true, iovas[i] >= ranges[0].start);
+ EXPECT_EQ(true, iovas[i] <= ranges[0].last);
+ EXPECT_EQ(true, iovas[i] + length > ranges[0].start);
+ EXPECT_EQ(true, iovas[i] + length <= ranges[0].last + 1);
+ }
+ for (i = 0; i != 10; i++)
+ test_ioctl_ioas_unmap(iovas[i], PAGE_SIZE * (i + 1));
+}
+
+TEST_F(iommufd_ioas, area_allowed)
+{
+ struct iommu_test_cmd test_cmd = {
+ .size = sizeof(test_cmd),
+ .op = IOMMU_TEST_OP_ADD_RESERVED,
+ .id = self->ioas_id,
+ .add_reserved = { .start = PAGE_SIZE * 4,
+ .length = PAGE_SIZE * 100 },
+ };
+ struct iommu_iova_range ranges[1] = {};
+ struct iommu_ioas_allow_iovas allow_cmd = {
+ .size = sizeof(allow_cmd),
+ .ioas_id = self->ioas_id,
+ .num_iovas = 1,
+ .allowed_iovas = (uintptr_t)ranges,
+ };
+
+ /* Reserved intersects an allowed */
+ allow_cmd.num_iovas = 1;
+ ranges[0].start = self->base_iova;
+ ranges[0].last = ranges[0].start + PAGE_SIZE * 600;
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_ALLOW_IOVAS, &allow_cmd));
+ test_cmd.add_reserved.start = ranges[0].start + PAGE_SIZE;
+ test_cmd.add_reserved.length = PAGE_SIZE;
+ EXPECT_ERRNO(EADDRINUSE,
+ ioctl(self->fd,
+ _IOMMU_TEST_CMD(IOMMU_TEST_OP_ADD_RESERVED),
+ &test_cmd));
+ allow_cmd.num_iovas = 0;
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_ALLOW_IOVAS, &allow_cmd));
+
+ /* Allowed intersects a reserved */
+ ASSERT_EQ(0,
+ ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ADD_RESERVED),
+ &test_cmd));
+ allow_cmd.num_iovas = 1;
+ ranges[0].start = self->base_iova;
+ ranges[0].last = ranges[0].start + PAGE_SIZE * 600;
+ EXPECT_ERRNO(EADDRINUSE,
+ ioctl(self->fd, IOMMU_IOAS_ALLOW_IOVAS, &allow_cmd));
+}
+
+TEST_F(iommufd_ioas, copy_area)
+{
+ struct iommu_ioas_copy copy_cmd = {
+ .size = sizeof(copy_cmd),
+ .flags = IOMMU_IOAS_MAP_FIXED_IOVA,
+ .dst_ioas_id = self->ioas_id,
+ .src_ioas_id = self->ioas_id,
+ .length = PAGE_SIZE,
+ };
+
+ test_ioctl_ioas_map_fixed(buffer, PAGE_SIZE, self->base_iova);
+
+ /* Copy inside a single IOAS */
+ copy_cmd.src_iova = self->base_iova;
+ copy_cmd.dst_iova = self->base_iova + PAGE_SIZE;
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_COPY, &copy_cmd));
+
+ /* Copy between IOAS's */
+ copy_cmd.src_iova = self->base_iova;
+ copy_cmd.dst_iova = 0;
+ test_ioctl_ioas_alloc(&copy_cmd.dst_ioas_id);
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_COPY, &copy_cmd));
+}
+
+TEST_F(iommufd_ioas, iova_ranges)
+{
+ struct iommu_test_cmd test_cmd = {
+ .size = sizeof(test_cmd),
+ .op = IOMMU_TEST_OP_ADD_RESERVED,
+ .id = self->ioas_id,
+ .add_reserved = { .start = PAGE_SIZE, .length = PAGE_SIZE },
+ };
+ struct iommu_iova_range *ranges = buffer;
+ struct iommu_ioas_iova_ranges ranges_cmd = {
+ .size = sizeof(ranges_cmd),
+ .ioas_id = self->ioas_id,
+ .num_iovas = BUFFER_SIZE / sizeof(*ranges),
+ .allowed_iovas = (uintptr_t)ranges,
+ };
+
+ /* Range can be read */
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_IOVA_RANGES, &ranges_cmd));
+ EXPECT_EQ(1, ranges_cmd.num_iovas);
+ if (!self->domain_id) {
+ EXPECT_EQ(0, ranges[0].start);
+ EXPECT_EQ(SIZE_MAX, ranges[0].last);
+ EXPECT_EQ(1, ranges_cmd.out_iova_alignment);
+ } else {
+ EXPECT_EQ(MOCK_APERTURE_START, ranges[0].start);
+ EXPECT_EQ(MOCK_APERTURE_LAST, ranges[0].last);
+ EXPECT_EQ(MOCK_PAGE_SIZE, ranges_cmd.out_iova_alignment);
+ }
+
+ /* Buffer too small */
+ memset(ranges, 0, BUFFER_SIZE);
+ ranges_cmd.num_iovas = 0;
+ EXPECT_ERRNO(EMSGSIZE,
+ ioctl(self->fd, IOMMU_IOAS_IOVA_RANGES, &ranges_cmd));
+ EXPECT_EQ(1, ranges_cmd.num_iovas);
+ EXPECT_EQ(0, ranges[0].start);
+ EXPECT_EQ(0, ranges[0].last);
+
+ /* 2 ranges */
+ ASSERT_EQ(0,
+ ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ADD_RESERVED),
+ &test_cmd));
+ ranges_cmd.num_iovas = BUFFER_SIZE / sizeof(*ranges);
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_IOVA_RANGES, &ranges_cmd));
+ if (!self->domain_id) {
+ EXPECT_EQ(2, ranges_cmd.num_iovas);
+ EXPECT_EQ(0, ranges[0].start);
+ EXPECT_EQ(PAGE_SIZE - 1, ranges[0].last);
+ EXPECT_EQ(PAGE_SIZE * 2, ranges[1].start);
+ EXPECT_EQ(SIZE_MAX, ranges[1].last);
+ } else {
+ EXPECT_EQ(1, ranges_cmd.num_iovas);
+ EXPECT_EQ(MOCK_APERTURE_START, ranges[0].start);
+ EXPECT_EQ(MOCK_APERTURE_LAST, ranges[0].last);
+ }
+
+ /* Buffer too small */
+ memset(ranges, 0, BUFFER_SIZE);
+ ranges_cmd.num_iovas = 1;
+ if (!self->domain_id) {
+ EXPECT_ERRNO(EMSGSIZE, ioctl(self->fd, IOMMU_IOAS_IOVA_RANGES,
+ &ranges_cmd));
+ EXPECT_EQ(2, ranges_cmd.num_iovas);
+ EXPECT_EQ(0, ranges[0].start);
+ EXPECT_EQ(PAGE_SIZE - 1, ranges[0].last);
+ } else {
+ ASSERT_EQ(0,
+ ioctl(self->fd, IOMMU_IOAS_IOVA_RANGES, &ranges_cmd));
+ EXPECT_EQ(1, ranges_cmd.num_iovas);
+ EXPECT_EQ(MOCK_APERTURE_START, ranges[0].start);
+ EXPECT_EQ(MOCK_APERTURE_LAST, ranges[0].last);
+ }
+ EXPECT_EQ(0, ranges[1].start);
+ EXPECT_EQ(0, ranges[1].last);
+}
+
+TEST_F(iommufd_ioas, access_pin)
+{
+ struct iommu_test_cmd access_cmd = {
+ .size = sizeof(access_cmd),
+ .op = IOMMU_TEST_OP_ACCESS_PAGES,
+ .access_pages = { .iova = MOCK_APERTURE_START,
+ .length = BUFFER_SIZE,
+ .uptr = (uintptr_t)buffer },
+ };
+ struct iommu_test_cmd check_map_cmd = {
+ .size = sizeof(check_map_cmd),
+ .op = IOMMU_TEST_OP_MD_CHECK_MAP,
+ .check_map = { .iova = MOCK_APERTURE_START,
+ .length = BUFFER_SIZE,
+ .uptr = (uintptr_t)buffer },
+ };
+ uint32_t access_pages_id;
+ unsigned int npages;
+
+ test_cmd_create_access(self->ioas_id, &access_cmd.id,
+ MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES);
+
+ for (npages = 1; npages < BUFFER_SIZE / PAGE_SIZE; npages++) {
+ uint32_t mock_device_id;
+ uint32_t mock_hwpt_id;
+
+ access_cmd.access_pages.length = npages * PAGE_SIZE;
+
+ /* Single map/unmap */
+ test_ioctl_ioas_map_fixed(buffer, BUFFER_SIZE,
+ MOCK_APERTURE_START);
+ ASSERT_EQ(0, ioctl(self->fd,
+ _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_PAGES),
+ &access_cmd));
+ test_cmd_destroy_access_pages(
+ access_cmd.id,
+ access_cmd.access_pages.out_access_pages_id);
+
+ /* Double user */
+ ASSERT_EQ(0, ioctl(self->fd,
+ _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_PAGES),
+ &access_cmd));
+ access_pages_id = access_cmd.access_pages.out_access_pages_id;
+ ASSERT_EQ(0, ioctl(self->fd,
+ _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_PAGES),
+ &access_cmd));
+ test_cmd_destroy_access_pages(
+ access_cmd.id,
+ access_cmd.access_pages.out_access_pages_id);
+ test_cmd_destroy_access_pages(access_cmd.id, access_pages_id);
+
+ /* Add/remove a domain with a user */
+ ASSERT_EQ(0, ioctl(self->fd,
+ _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_PAGES),
+ &access_cmd));
+ test_cmd_mock_domain(self->ioas_id, &mock_device_id,
+ &mock_hwpt_id);
+ check_map_cmd.id = mock_hwpt_id;
+ ASSERT_EQ(0, ioctl(self->fd,
+ _IOMMU_TEST_CMD(IOMMU_TEST_OP_MD_CHECK_MAP),
+ &check_map_cmd));
+
+ test_ioctl_destroy(mock_device_id);
+ test_ioctl_destroy(mock_hwpt_id);
+ test_cmd_destroy_access_pages(
+ access_cmd.id,
+ access_cmd.access_pages.out_access_pages_id);
+
+ test_ioctl_ioas_unmap(MOCK_APERTURE_START, BUFFER_SIZE);
+ }
+ test_cmd_destroy_access(access_cmd.id);
+}
+
+TEST_F(iommufd_ioas, access_pin_unmap)
+{
+ struct iommu_test_cmd access_pages_cmd = {
+ .size = sizeof(access_pages_cmd),
+ .op = IOMMU_TEST_OP_ACCESS_PAGES,
+ .access_pages = { .iova = MOCK_APERTURE_START,
+ .length = BUFFER_SIZE,
+ .uptr = (uintptr_t)buffer },
+ };
+
+ test_cmd_create_access(self->ioas_id, &access_pages_cmd.id,
+ MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES);
+ test_ioctl_ioas_map_fixed(buffer, BUFFER_SIZE, MOCK_APERTURE_START);
+ ASSERT_EQ(0,
+ ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_PAGES),
+ &access_pages_cmd));
+
+ /* Trigger the unmap op */
+ test_ioctl_ioas_unmap(MOCK_APERTURE_START, BUFFER_SIZE);
+
+ /* kernel removed the item for us */
+ test_err_destroy_access_pages(
+ ENOENT, access_pages_cmd.id,
+ access_pages_cmd.access_pages.out_access_pages_id);
+}
+
+static void check_access_rw(struct __test_metadata *_metadata, int fd,
+ unsigned int access_id, uint64_t iova,
+ unsigned int def_flags)
+{
+ uint16_t tmp[32];
+ struct iommu_test_cmd access_cmd = {
+ .size = sizeof(access_cmd),
+ .op = IOMMU_TEST_OP_ACCESS_RW,
+ .id = access_id,
+ .access_rw = { .uptr = (uintptr_t)tmp },
+ };
+ uint16_t *buffer16 = buffer;
+ unsigned int i;
+ void *tmp2;
+
+ for (i = 0; i != BUFFER_SIZE / sizeof(*buffer16); i++)
+ buffer16[i] = rand();
+
+ for (access_cmd.access_rw.iova = iova + PAGE_SIZE - 50;
+ access_cmd.access_rw.iova < iova + PAGE_SIZE + 50;
+ access_cmd.access_rw.iova++) {
+ for (access_cmd.access_rw.length = 1;
+ access_cmd.access_rw.length < sizeof(tmp);
+ access_cmd.access_rw.length++) {
+ access_cmd.access_rw.flags = def_flags;
+ ASSERT_EQ(0, ioctl(fd,
+ _IOMMU_TEST_CMD(
+ IOMMU_TEST_OP_ACCESS_RW),
+ &access_cmd));
+ ASSERT_EQ(0,
+ memcmp(buffer + (access_cmd.access_rw.iova -
+ iova),
+ tmp, access_cmd.access_rw.length));
+
+ for (i = 0; i != ARRAY_SIZE(tmp); i++)
+ tmp[i] = rand();
+ access_cmd.access_rw.flags = def_flags |
+ MOCK_ACCESS_RW_WRITE;
+ ASSERT_EQ(0, ioctl(fd,
+ _IOMMU_TEST_CMD(
+ IOMMU_TEST_OP_ACCESS_RW),
+ &access_cmd));
+ ASSERT_EQ(0,
+ memcmp(buffer + (access_cmd.access_rw.iova -
+ iova),
+ tmp, access_cmd.access_rw.length));
+ }
+ }
+
+ /* Multi-page test */
+ tmp2 = malloc(BUFFER_SIZE);
+ ASSERT_NE(NULL, tmp2);
+ access_cmd.access_rw.iova = iova;
+ access_cmd.access_rw.length = BUFFER_SIZE;
+ access_cmd.access_rw.flags = def_flags;
+ access_cmd.access_rw.uptr = (uintptr_t)tmp2;
+ ASSERT_EQ(0, ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_RW),
+ &access_cmd));
+ ASSERT_EQ(0, memcmp(buffer, tmp2, access_cmd.access_rw.length));
+ free(tmp2);
+}
+
+TEST_F(iommufd_ioas, access_rw)
+{
+ __u32 access_id;
+ __u64 iova;
+
+ test_cmd_create_access(self->ioas_id, &access_id, 0);
+ test_ioctl_ioas_map(buffer, BUFFER_SIZE, &iova);
+ check_access_rw(_metadata, self->fd, access_id, iova, 0);
+ check_access_rw(_metadata, self->fd, access_id, iova,
+ MOCK_ACCESS_RW_SLOW_PATH);
+ test_ioctl_ioas_unmap(iova, BUFFER_SIZE);
+ test_cmd_destroy_access(access_id);
+}
+
+TEST_F(iommufd_ioas, access_rw_unaligned)
+{
+ __u32 access_id;
+ __u64 iova;
+
+ test_cmd_create_access(self->ioas_id, &access_id, 0);
+
+ /* Unaligned pages */
+ iova = self->base_iova + MOCK_PAGE_SIZE;
+ test_ioctl_ioas_map_fixed(buffer, BUFFER_SIZE, iova);
+ check_access_rw(_metadata, self->fd, access_id, iova, 0);
+ test_ioctl_ioas_unmap(iova, BUFFER_SIZE);
+ test_cmd_destroy_access(access_id);
+}
+
+TEST_F(iommufd_ioas, fork_gone)
+{
+ __u32 access_id;
+ pid_t child;
+
+ test_cmd_create_access(self->ioas_id, &access_id, 0);
+
+ /* Create a mapping with a different mm */
+ child = fork();
+ if (!child) {
+ test_ioctl_ioas_map_fixed(buffer, BUFFER_SIZE,
+ MOCK_APERTURE_START);
+ exit(0);
+ }
+ ASSERT_NE(-1, child);
+ ASSERT_EQ(child, waitpid(child, NULL, 0));
+
+ if (self->domain_id) {
+ /*
+ * If a domain already existed then everything was pinned within
+ * the fork, so this copies from one domain to another.
+ */
+ test_cmd_mock_domain(self->ioas_id, NULL, NULL);
+ check_access_rw(_metadata, self->fd, access_id,
+ MOCK_APERTURE_START, 0);
+
+ } else {
+ /*
+ * Otherwise we need to actually pin pages which can't happen
+ * since the fork is gone.
+ */
+ test_err_mock_domain(EFAULT, self->ioas_id, NULL, NULL);
+ }
+
+ test_cmd_destroy_access(access_id);
+}
+
+TEST_F(iommufd_ioas, fork_present)
+{
+ __u32 access_id;
+ int pipefds[2];
+ uint64_t tmp;
+ pid_t child;
+ int efd;
+
+ test_cmd_create_access(self->ioas_id, &access_id, 0);
+
+ ASSERT_EQ(0, pipe2(pipefds, O_CLOEXEC));
+ efd = eventfd(0, EFD_CLOEXEC);
+ ASSERT_NE(-1, efd);
+
+ /* Create a mapping with a different mm */
+ child = fork();
+ if (!child) {
+ __u64 iova;
+ uint64_t one = 1;
+
+ close(pipefds[1]);
+ test_ioctl_ioas_map_fixed(buffer, BUFFER_SIZE,
+ MOCK_APERTURE_START);
+ if (write(efd, &one, sizeof(one)) != sizeof(one))
+ exit(100);
+ if (read(pipefds[0], &iova, 1) != 1)
+ exit(100);
+ exit(0);
+ }
+ close(pipefds[0]);
+ ASSERT_NE(-1, child);
+ ASSERT_EQ(8, read(efd, &tmp, sizeof(tmp)));
+
+ /* Read pages from the remote process */
+ test_cmd_mock_domain(self->ioas_id, NULL, NULL);
+ check_access_rw(_metadata, self->fd, access_id, MOCK_APERTURE_START, 0);
+
+ ASSERT_EQ(0, close(pipefds[1]));
+ ASSERT_EQ(child, waitpid(child, NULL, 0));
+
+ test_cmd_destroy_access(access_id);
+}
+
+TEST_F(iommufd_ioas, ioas_option_huge_pages)
+{
+ struct iommu_option cmd = {
+ .size = sizeof(cmd),
+ .option_id = IOMMU_OPTION_HUGE_PAGES,
+ .op = IOMMU_OPTION_OP_GET,
+ .val64 = 3,
+ .object_id = self->ioas_id,
+ };
+
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd));
+ ASSERT_EQ(1, cmd.val64);
+
+ cmd.op = IOMMU_OPTION_OP_SET;
+ cmd.val64 = 0;
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd));
+
+ cmd.op = IOMMU_OPTION_OP_GET;
+ cmd.val64 = 3;
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd));
+ ASSERT_EQ(0, cmd.val64);
+
+ cmd.op = IOMMU_OPTION_OP_SET;
+ cmd.val64 = 2;
+ EXPECT_ERRNO(EINVAL, ioctl(self->fd, IOMMU_OPTION, &cmd));
+
+ cmd.op = IOMMU_OPTION_OP_SET;
+ cmd.val64 = 1;
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd));
+}
+
+TEST_F(iommufd_ioas, ioas_iova_alloc)
+{
+ unsigned int length;
+ __u64 iova;
+
+ for (length = 1; length != PAGE_SIZE * 2; length++) {
+ if (variant->mock_domains && (length % MOCK_PAGE_SIZE)) {
+ test_err_ioctl_ioas_map(EINVAL, buffer, length, &iova);
+ } else {
+ test_ioctl_ioas_map(buffer, length, &iova);
+ test_ioctl_ioas_unmap(iova, length);
+ }
+ }
+}
+
+TEST_F(iommufd_ioas, ioas_align_change)
+{
+ struct iommu_option cmd = {
+ .size = sizeof(cmd),
+ .option_id = IOMMU_OPTION_HUGE_PAGES,
+ .op = IOMMU_OPTION_OP_SET,
+ .object_id = self->ioas_id,
+ /* 0 means everything must be aligned to PAGE_SIZE */
+ .val64 = 0,
+ };
+
+ /*
+ * We cannot upgrade the alignment using OPTION_HUGE_PAGES when a domain
+ * and map are present.
+ */
+ if (variant->mock_domains)
+ return;
+
+ /*
+ * We can upgrade to PAGE_SIZE alignment when things are aligned right
+ */
+ test_ioctl_ioas_map_fixed(buffer, PAGE_SIZE, MOCK_APERTURE_START);
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd));
+
+ /* Misalignment is rejected at map time */
+ test_err_ioctl_ioas_map_fixed(EINVAL, buffer + MOCK_PAGE_SIZE,
+ PAGE_SIZE,
+ MOCK_APERTURE_START + PAGE_SIZE);
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd));
+
+ /* Reduce alignment */
+ cmd.val64 = 1;
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd));
+
+ /* Confirm misalignment is rejected during alignment upgrade */
+ test_ioctl_ioas_map_fixed(buffer + MOCK_PAGE_SIZE, PAGE_SIZE,
+ MOCK_APERTURE_START + PAGE_SIZE);
+ cmd.val64 = 0;
+ EXPECT_ERRNO(EADDRINUSE, ioctl(self->fd, IOMMU_OPTION, &cmd));
+
+ test_ioctl_ioas_unmap(MOCK_APERTURE_START + PAGE_SIZE, PAGE_SIZE);
+ test_ioctl_ioas_unmap(MOCK_APERTURE_START, PAGE_SIZE);
+}
+
+TEST_F(iommufd_ioas, copy_sweep)
+{
+ struct iommu_ioas_copy copy_cmd = {
+ .size = sizeof(copy_cmd),
+ .flags = IOMMU_IOAS_MAP_FIXED_IOVA,
+ .src_ioas_id = self->ioas_id,
+ .dst_iova = MOCK_APERTURE_START,
+ .length = MOCK_PAGE_SIZE,
+ };
+ unsigned int dst_ioas_id;
+ uint64_t last_iova;
+ uint64_t iova;
+
+ test_ioctl_ioas_alloc(&dst_ioas_id);
+ copy_cmd.dst_ioas_id = dst_ioas_id;
+
+ if (variant->mock_domains)
+ last_iova = MOCK_APERTURE_START + BUFFER_SIZE - 1;
+ else
+ last_iova = MOCK_APERTURE_START + BUFFER_SIZE - 2;
+
+ test_ioctl_ioas_map_fixed(buffer, last_iova - MOCK_APERTURE_START + 1,
+ MOCK_APERTURE_START);
+
+ for (iova = MOCK_APERTURE_START - PAGE_SIZE; iova <= last_iova;
+ iova += 511) {
+ copy_cmd.src_iova = iova;
+ if (iova < MOCK_APERTURE_START ||
+ iova + copy_cmd.length - 1 > last_iova) {
+ EXPECT_ERRNO(ENOENT, ioctl(self->fd, IOMMU_IOAS_COPY,
+ &copy_cmd));
+ } else {
+ ASSERT_EQ(0,
+ ioctl(self->fd, IOMMU_IOAS_COPY, &copy_cmd));
+ test_ioctl_ioas_unmap_id(dst_ioas_id, copy_cmd.dst_iova,
+ copy_cmd.length);
+ }
+ }
+
+ test_ioctl_destroy(dst_ioas_id);
+}
+
+FIXTURE(iommufd_mock_domain)
+{
+ int fd;
+ uint32_t ioas_id;
+ uint32_t domain_id;
+ uint32_t domain_ids[2];
+ int mmap_flags;
+ size_t mmap_buf_size;
+};
+
+FIXTURE_VARIANT(iommufd_mock_domain)
+{
+ unsigned int mock_domains;
+ bool hugepages;
+};
+
+FIXTURE_SETUP(iommufd_mock_domain)
+{
+ unsigned int i;
+
+ self->fd = open("/dev/iommu", O_RDWR);
+ ASSERT_NE(-1, self->fd);
+ test_ioctl_ioas_alloc(&self->ioas_id);
+
+ ASSERT_GE(ARRAY_SIZE(self->domain_ids), variant->mock_domains);
+
+ for (i = 0; i != variant->mock_domains; i++)
+ test_cmd_mock_domain(self->ioas_id, NULL, &self->domain_ids[i]);
+ self->domain_id = self->domain_ids[0];
+
+ self->mmap_flags = MAP_SHARED | MAP_ANONYMOUS;
+ self->mmap_buf_size = PAGE_SIZE * 8;
+ if (variant->hugepages) {
+ /*
+ * MAP_POPULATE will cause the kernel to fail mmap if THPs are
+ * not available.
+ */
+ self->mmap_flags |= MAP_HUGETLB | MAP_POPULATE;
+ self->mmap_buf_size = HUGEPAGE_SIZE * 2;
+ }
+}
+
+FIXTURE_TEARDOWN(iommufd_mock_domain)
+{
+ teardown_iommufd(self->fd, _metadata);
+}
+
+FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain)
+{
+ .mock_domains = 1,
+ .hugepages = false,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_mock_domain, two_domains)
+{
+ .mock_domains = 2,
+ .hugepages = false,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain_hugepage)
+{
+ .mock_domains = 1,
+ .hugepages = true,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_mock_domain, two_domains_hugepage)
+{
+ .mock_domains = 2,
+ .hugepages = true,
+};
+
+/* Have the kernel check that the user pages made it to the iommu_domain */
+#define check_mock_iova(_ptr, _iova, _length) \
+ ({ \
+ struct iommu_test_cmd check_map_cmd = { \
+ .size = sizeof(check_map_cmd), \
+ .op = IOMMU_TEST_OP_MD_CHECK_MAP, \
+ .id = self->domain_id, \
+ .check_map = { .iova = _iova, \
+ .length = _length, \
+ .uptr = (uintptr_t)(_ptr) }, \
+ }; \
+ ASSERT_EQ(0, \
+ ioctl(self->fd, \
+ _IOMMU_TEST_CMD(IOMMU_TEST_OP_MD_CHECK_MAP), \
+ &check_map_cmd)); \
+ if (self->domain_ids[1]) { \
+ check_map_cmd.id = self->domain_ids[1]; \
+ ASSERT_EQ(0, \
+ ioctl(self->fd, \
+ _IOMMU_TEST_CMD( \
+ IOMMU_TEST_OP_MD_CHECK_MAP), \
+ &check_map_cmd)); \
+ } \
+ })
+
+TEST_F(iommufd_mock_domain, basic)
+{
+ size_t buf_size = self->mmap_buf_size;
+ uint8_t *buf;
+ __u64 iova;
+
+ /* Simple one page map */
+ test_ioctl_ioas_map(buffer, PAGE_SIZE, &iova);
+ check_mock_iova(buffer, iova, PAGE_SIZE);
+
+ buf = mmap(0, buf_size, PROT_READ | PROT_WRITE, self->mmap_flags, -1,
+ 0);
+ ASSERT_NE(MAP_FAILED, buf);
+
+ /* EFAULT half way through mapping */
+ ASSERT_EQ(0, munmap(buf + buf_size / 2, buf_size / 2));
+ test_err_ioctl_ioas_map(EFAULT, buf, buf_size, &iova);
+
+ /* EFAULT on first page */
+ ASSERT_EQ(0, munmap(buf, buf_size / 2));
+ test_err_ioctl_ioas_map(EFAULT, buf, buf_size, &iova);
+}
+
+TEST_F(iommufd_mock_domain, ro_unshare)
+{
+ uint8_t *buf;
+ __u64 iova;
+ int fd;
+
+ fd = open("/proc/self/exe", O_RDONLY);
+ ASSERT_NE(-1, fd);
+
+ buf = mmap(0, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+ ASSERT_NE(MAP_FAILED, buf);
+ close(fd);
+
+ /*
+ * There have been lots of changes to the "unshare" mechanism in
+ * get_user_pages(), make sure it works right. The write to the page
+ * after we map it for reading should not change the assigned PFN.
+ */
+ ASSERT_EQ(0,
+ _test_ioctl_ioas_map(self->fd, self->ioas_id, buf, PAGE_SIZE,
+ &iova, IOMMU_IOAS_MAP_READABLE));
+ check_mock_iova(buf, iova, PAGE_SIZE);
+ memset(buf, 1, PAGE_SIZE);
+ check_mock_iova(buf, iova, PAGE_SIZE);
+ ASSERT_EQ(0, munmap(buf, PAGE_SIZE));
+}
+
+TEST_F(iommufd_mock_domain, all_aligns)
+{
+ size_t test_step = variant->hugepages ? (self->mmap_buf_size / 16) :
+ MOCK_PAGE_SIZE;
+ size_t buf_size = self->mmap_buf_size;
+ unsigned int start;
+ unsigned int end;
+ uint8_t *buf;
+
+ buf = mmap(0, buf_size, PROT_READ | PROT_WRITE, self->mmap_flags, -1,
+ 0);
+ ASSERT_NE(MAP_FAILED, buf);
+ check_refs(buf, buf_size, 0);
+
+ /*
+ * Map every combination of page size and alignment within a big region,
+ * less for hugepage case as it takes so long to finish.
+ */
+ for (start = 0; start < buf_size; start += test_step) {
+ if (variant->hugepages)
+ end = buf_size;
+ else
+ end = start + MOCK_PAGE_SIZE;
+ for (; end < buf_size; end += MOCK_PAGE_SIZE) {
+ size_t length = end - start;
+ __u64 iova;
+
+ test_ioctl_ioas_map(buf + start, length, &iova);
+ check_mock_iova(buf + start, iova, length);
+ check_refs(buf + start / PAGE_SIZE * PAGE_SIZE,
+ end / PAGE_SIZE * PAGE_SIZE -
+ start / PAGE_SIZE * PAGE_SIZE,
+ 1);
+
+ test_ioctl_ioas_unmap(iova, length);
+ }
+ }
+ check_refs(buf, buf_size, 0);
+ ASSERT_EQ(0, munmap(buf, buf_size));
+}
+
+TEST_F(iommufd_mock_domain, all_aligns_copy)
+{
+ size_t test_step = variant->hugepages ? self->mmap_buf_size / 16 :
+ MOCK_PAGE_SIZE;
+ size_t buf_size = self->mmap_buf_size;
+ unsigned int start;
+ unsigned int end;
+ uint8_t *buf;
+
+ buf = mmap(0, buf_size, PROT_READ | PROT_WRITE, self->mmap_flags, -1,
+ 0);
+ ASSERT_NE(MAP_FAILED, buf);
+ check_refs(buf, buf_size, 0);
+
+ /*
+ * Map every combination of page size and alignment within a big region,
+ * less for hugepage case as it takes so long to finish.
+ */
+ for (start = 0; start < buf_size; start += test_step) {
+ if (variant->hugepages)
+ end = buf_size;
+ else
+ end = start + MOCK_PAGE_SIZE;
+ for (; end < buf_size; end += MOCK_PAGE_SIZE) {
+ size_t length = end - start;
+ unsigned int old_id;
+ uint32_t mock_device_id;
+ __u64 iova;
+
+ test_ioctl_ioas_map(buf + start, length, &iova);
+
+ /* Add and destroy a domain while the area exists */
+ old_id = self->domain_ids[1];
+ test_cmd_mock_domain(self->ioas_id, &mock_device_id,
+ &self->domain_ids[1]);
+
+ check_mock_iova(buf + start, iova, length);
+ check_refs(buf + start / PAGE_SIZE * PAGE_SIZE,
+ end / PAGE_SIZE * PAGE_SIZE -
+ start / PAGE_SIZE * PAGE_SIZE,
+ 1);
+
+ test_ioctl_destroy(mock_device_id);
+ test_ioctl_destroy(self->domain_ids[1]);
+ self->domain_ids[1] = old_id;
+
+ test_ioctl_ioas_unmap(iova, length);
+ }
+ }
+ check_refs(buf, buf_size, 0);
+ ASSERT_EQ(0, munmap(buf, buf_size));
+}
+
+TEST_F(iommufd_mock_domain, user_copy)
+{
+ struct iommu_test_cmd access_cmd = {
+ .size = sizeof(access_cmd),
+ .op = IOMMU_TEST_OP_ACCESS_PAGES,
+ .access_pages = { .length = BUFFER_SIZE,
+ .uptr = (uintptr_t)buffer },
+ };
+ struct iommu_ioas_copy copy_cmd = {
+ .size = sizeof(copy_cmd),
+ .flags = IOMMU_IOAS_MAP_FIXED_IOVA,
+ .dst_ioas_id = self->ioas_id,
+ .dst_iova = MOCK_APERTURE_START,
+ .length = BUFFER_SIZE,
+ };
+ unsigned int ioas_id;
+
+ /* Pin the pages in an IOAS with no domains then copy to an IOAS with domains */
+ test_ioctl_ioas_alloc(&ioas_id);
+ test_ioctl_ioas_map_id(ioas_id, buffer, BUFFER_SIZE,
+ &copy_cmd.src_iova);
+
+ test_cmd_create_access(ioas_id, &access_cmd.id,
+ MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES);
+
+ access_cmd.access_pages.iova = copy_cmd.src_iova;
+ ASSERT_EQ(0,
+ ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_PAGES),
+ &access_cmd));
+ copy_cmd.src_ioas_id = ioas_id;
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_COPY, &copy_cmd));
+ check_mock_iova(buffer, MOCK_APERTURE_START, BUFFER_SIZE);
+
+ test_cmd_destroy_access_pages(
+ access_cmd.id, access_cmd.access_pages.out_access_pages_id);
+ test_cmd_destroy_access(access_cmd.id) test_ioctl_destroy(ioas_id);
+
+ test_ioctl_destroy(ioas_id);
+}
+
+/* VFIO compatibility IOCTLs */
+
+TEST_F(iommufd, simple_ioctls)
+{
+ ASSERT_EQ(VFIO_API_VERSION, ioctl(self->fd, VFIO_GET_API_VERSION));
+ ASSERT_EQ(1, ioctl(self->fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU));
+}
+
+TEST_F(iommufd, unmap_cmd)
+{
+ struct vfio_iommu_type1_dma_unmap unmap_cmd = {
+ .iova = MOCK_APERTURE_START,
+ .size = PAGE_SIZE,
+ };
+
+ unmap_cmd.argsz = 1;
+ EXPECT_ERRNO(EINVAL, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd));
+
+ unmap_cmd.argsz = sizeof(unmap_cmd);
+ unmap_cmd.flags = 1 << 31;
+ EXPECT_ERRNO(EINVAL, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd));
+
+ unmap_cmd.flags = 0;
+ EXPECT_ERRNO(ENODEV, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd));
+}
+
+TEST_F(iommufd, map_cmd)
+{
+ struct vfio_iommu_type1_dma_map map_cmd = {
+ .iova = MOCK_APERTURE_START,
+ .size = PAGE_SIZE,
+ .vaddr = (__u64)buffer,
+ };
+
+ map_cmd.argsz = 1;
+ EXPECT_ERRNO(EINVAL, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd));
+
+ map_cmd.argsz = sizeof(map_cmd);
+ map_cmd.flags = 1 << 31;
+ EXPECT_ERRNO(EINVAL, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd));
+
+ /* Requires a domain to be attached */
+ map_cmd.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+ EXPECT_ERRNO(ENODEV, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd));
+}
+
+TEST_F(iommufd, info_cmd)
+{
+ struct vfio_iommu_type1_info info_cmd = {};
+
+ /* Invalid argsz */
+ info_cmd.argsz = 1;
+ EXPECT_ERRNO(EINVAL, ioctl(self->fd, VFIO_IOMMU_GET_INFO, &info_cmd));
+
+ info_cmd.argsz = sizeof(info_cmd);
+ EXPECT_ERRNO(ENODEV, ioctl(self->fd, VFIO_IOMMU_GET_INFO, &info_cmd));
+}
+
+TEST_F(iommufd, set_iommu_cmd)
+{
+ /* Requires a domain to be attached */
+ EXPECT_ERRNO(ENODEV,
+ ioctl(self->fd, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU));
+ EXPECT_ERRNO(ENODEV, ioctl(self->fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU));
+}
+
+TEST_F(iommufd, vfio_ioas)
+{
+ struct iommu_vfio_ioas vfio_ioas_cmd = {
+ .size = sizeof(vfio_ioas_cmd),
+ .op = IOMMU_VFIO_IOAS_GET,
+ };
+ __u32 ioas_id;
+
+ /* ENODEV if there is no compat ioas */
+ EXPECT_ERRNO(ENODEV, ioctl(self->fd, IOMMU_VFIO_IOAS, &vfio_ioas_cmd));
+
+ /* Invalid id for set */
+ vfio_ioas_cmd.op = IOMMU_VFIO_IOAS_SET;
+ EXPECT_ERRNO(ENOENT, ioctl(self->fd, IOMMU_VFIO_IOAS, &vfio_ioas_cmd));
+
+ /* Valid id for set*/
+ test_ioctl_ioas_alloc(&ioas_id);
+ vfio_ioas_cmd.ioas_id = ioas_id;
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_VFIO_IOAS, &vfio_ioas_cmd));
+
+ /* Same id comes back from get */
+ vfio_ioas_cmd.op = IOMMU_VFIO_IOAS_GET;
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_VFIO_IOAS, &vfio_ioas_cmd));
+ ASSERT_EQ(ioas_id, vfio_ioas_cmd.ioas_id);
+
+ /* Clear works */
+ vfio_ioas_cmd.op = IOMMU_VFIO_IOAS_CLEAR;
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_VFIO_IOAS, &vfio_ioas_cmd));
+ vfio_ioas_cmd.op = IOMMU_VFIO_IOAS_GET;
+ EXPECT_ERRNO(ENODEV, ioctl(self->fd, IOMMU_VFIO_IOAS, &vfio_ioas_cmd));
+}
+
+FIXTURE(vfio_compat_mock_domain)
+{
+ int fd;
+ uint32_t ioas_id;
+};
+
+FIXTURE_VARIANT(vfio_compat_mock_domain)
+{
+ unsigned int version;
+};
+
+FIXTURE_SETUP(vfio_compat_mock_domain)
+{
+ struct iommu_vfio_ioas vfio_ioas_cmd = {
+ .size = sizeof(vfio_ioas_cmd),
+ .op = IOMMU_VFIO_IOAS_SET,
+ };
+
+ self->fd = open("/dev/iommu", O_RDWR);
+ ASSERT_NE(-1, self->fd);
+
+ /* Create what VFIO would consider a group */
+ test_ioctl_ioas_alloc(&self->ioas_id);
+ test_cmd_mock_domain(self->ioas_id, NULL, NULL);
+
+ /* Attach it to the vfio compat */
+ vfio_ioas_cmd.ioas_id = self->ioas_id;
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_VFIO_IOAS, &vfio_ioas_cmd));
+ ASSERT_EQ(0, ioctl(self->fd, VFIO_SET_IOMMU, variant->version));
+}
+
+FIXTURE_TEARDOWN(vfio_compat_mock_domain)
+{
+ teardown_iommufd(self->fd, _metadata);
+}
+
+FIXTURE_VARIANT_ADD(vfio_compat_mock_domain, Ver1v2)
+{
+ .version = VFIO_TYPE1v2_IOMMU,
+};
+
+FIXTURE_VARIANT_ADD(vfio_compat_mock_domain, Ver1v0)
+{
+ .version = VFIO_TYPE1_IOMMU,
+};
+
+TEST_F(vfio_compat_mock_domain, simple_close)
+{
+}
+
+TEST_F(vfio_compat_mock_domain, option_huge_pages)
+{
+ struct iommu_option cmd = {
+ .size = sizeof(cmd),
+ .option_id = IOMMU_OPTION_HUGE_PAGES,
+ .op = IOMMU_OPTION_OP_GET,
+ .val64 = 3,
+ .object_id = self->ioas_id,
+ };
+
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd));
+ if (variant->version == VFIO_TYPE1_IOMMU) {
+ ASSERT_EQ(0, cmd.val64);
+ } else {
+ ASSERT_EQ(1, cmd.val64);
+ }
+}
+
+/*
+ * Execute an ioctl command stored in buffer and check that the result does not
+ * overflow memory.
+ */
+static bool is_filled(const void *buf, uint8_t c, size_t len)
+{
+ const uint8_t *cbuf = buf;
+
+ for (; len; cbuf++, len--)
+ if (*cbuf != c)
+ return false;
+ return true;
+}
+
+#define ioctl_check_buf(fd, cmd) \
+ ({ \
+ size_t _cmd_len = *(__u32 *)buffer; \
+ \
+ memset(buffer + _cmd_len, 0xAA, BUFFER_SIZE - _cmd_len); \
+ ASSERT_EQ(0, ioctl(fd, cmd, buffer)); \
+ ASSERT_EQ(true, is_filled(buffer + _cmd_len, 0xAA, \
+ BUFFER_SIZE - _cmd_len)); \
+ })
+
+static void check_vfio_info_cap_chain(struct __test_metadata *_metadata,
+ struct vfio_iommu_type1_info *info_cmd)
+{
+ const struct vfio_info_cap_header *cap;
+
+ ASSERT_GE(info_cmd->argsz, info_cmd->cap_offset + sizeof(*cap));
+ cap = buffer + info_cmd->cap_offset;
+ while (true) {
+ size_t cap_size;
+
+ if (cap->next)
+ cap_size = (buffer + cap->next) - (void *)cap;
+ else
+ cap_size = (buffer + info_cmd->argsz) - (void *)cap;
+
+ switch (cap->id) {
+ case VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE: {
+ struct vfio_iommu_type1_info_cap_iova_range *data =
+ (void *)cap;
+
+ ASSERT_EQ(1, data->header.version);
+ ASSERT_EQ(1, data->nr_iovas);
+ EXPECT_EQ(MOCK_APERTURE_START,
+ data->iova_ranges[0].start);
+ EXPECT_EQ(MOCK_APERTURE_LAST, data->iova_ranges[0].end);
+ break;
+ }
+ case VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL: {
+ struct vfio_iommu_type1_info_dma_avail *data =
+ (void *)cap;
+
+ ASSERT_EQ(1, data->header.version);
+ ASSERT_EQ(sizeof(*data), cap_size);
+ break;
+ }
+ default:
+ ASSERT_EQ(false, true);
+ break;
+ }
+ if (!cap->next)
+ break;
+
+ ASSERT_GE(info_cmd->argsz, cap->next + sizeof(*cap));
+ ASSERT_GE(buffer + cap->next, (void *)cap);
+ cap = buffer + cap->next;
+ }
+}
+
+TEST_F(vfio_compat_mock_domain, get_info)
+{
+ struct vfio_iommu_type1_info *info_cmd = buffer;
+ unsigned int i;
+ size_t caplen;
+
+ /* Pre-cap ABI */
+ *info_cmd = (struct vfio_iommu_type1_info){
+ .argsz = offsetof(struct vfio_iommu_type1_info, cap_offset),
+ };
+ ioctl_check_buf(self->fd, VFIO_IOMMU_GET_INFO);
+ ASSERT_NE(0, info_cmd->iova_pgsizes);
+ ASSERT_EQ(VFIO_IOMMU_INFO_PGSIZES | VFIO_IOMMU_INFO_CAPS,
+ info_cmd->flags);
+
+ /* Read the cap chain size */
+ *info_cmd = (struct vfio_iommu_type1_info){
+ .argsz = sizeof(*info_cmd),
+ };
+ ioctl_check_buf(self->fd, VFIO_IOMMU_GET_INFO);
+ ASSERT_NE(0, info_cmd->iova_pgsizes);
+ ASSERT_EQ(VFIO_IOMMU_INFO_PGSIZES | VFIO_IOMMU_INFO_CAPS,
+ info_cmd->flags);
+ ASSERT_EQ(0, info_cmd->cap_offset);
+ ASSERT_LT(sizeof(*info_cmd), info_cmd->argsz);
+
+ /* Read the caps, kernel should never create a corrupted caps */
+ caplen = info_cmd->argsz;
+ for (i = sizeof(*info_cmd); i < caplen; i++) {
+ *info_cmd = (struct vfio_iommu_type1_info){
+ .argsz = i,
+ };
+ ioctl_check_buf(self->fd, VFIO_IOMMU_GET_INFO);
+ ASSERT_EQ(VFIO_IOMMU_INFO_PGSIZES | VFIO_IOMMU_INFO_CAPS,
+ info_cmd->flags);
+ if (!info_cmd->cap_offset)
+ continue;
+ check_vfio_info_cap_chain(_metadata, info_cmd);
+ }
+}
+
+static void shuffle_array(unsigned long *array, size_t nelms)
+{
+ unsigned int i;
+
+ /* Shuffle */
+ for (i = 0; i != nelms; i++) {
+ unsigned long tmp = array[i];
+ unsigned int other = rand() % (nelms - i);
+
+ array[i] = array[other];
+ array[other] = tmp;
+ }
+}
+
+TEST_F(vfio_compat_mock_domain, map)
+{
+ struct vfio_iommu_type1_dma_map map_cmd = {
+ .argsz = sizeof(map_cmd),
+ .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+ .vaddr = (uintptr_t)buffer,
+ .size = BUFFER_SIZE,
+ .iova = MOCK_APERTURE_START,
+ };
+ struct vfio_iommu_type1_dma_unmap unmap_cmd = {
+ .argsz = sizeof(unmap_cmd),
+ .size = BUFFER_SIZE,
+ .iova = MOCK_APERTURE_START,
+ };
+ unsigned long pages_iova[BUFFER_SIZE / PAGE_SIZE];
+ unsigned int i;
+
+ /* Simple map/unmap */
+ ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd));
+ ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd));
+ ASSERT_EQ(BUFFER_SIZE, unmap_cmd.size);
+
+ /* UNMAP_FLAG_ALL requres 0 iova/size */
+ ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd));
+ unmap_cmd.flags = VFIO_DMA_UNMAP_FLAG_ALL;
+ EXPECT_ERRNO(EINVAL, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd));
+
+ unmap_cmd.iova = 0;
+ unmap_cmd.size = 0;
+ ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd));
+ ASSERT_EQ(BUFFER_SIZE, unmap_cmd.size);
+
+ /* Small pages */
+ for (i = 0; i != ARRAY_SIZE(pages_iova); i++) {
+ map_cmd.iova = pages_iova[i] =
+ MOCK_APERTURE_START + i * PAGE_SIZE;
+ map_cmd.vaddr = (uintptr_t)buffer + i * PAGE_SIZE;
+ map_cmd.size = PAGE_SIZE;
+ ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd));
+ }
+ shuffle_array(pages_iova, ARRAY_SIZE(pages_iova));
+
+ unmap_cmd.flags = 0;
+ unmap_cmd.size = PAGE_SIZE;
+ for (i = 0; i != ARRAY_SIZE(pages_iova); i++) {
+ unmap_cmd.iova = pages_iova[i];
+ ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd));
+ }
+}
+
+TEST_F(vfio_compat_mock_domain, huge_map)
+{
+ size_t buf_size = HUGEPAGE_SIZE * 2;
+ struct vfio_iommu_type1_dma_map map_cmd = {
+ .argsz = sizeof(map_cmd),
+ .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+ .size = buf_size,
+ .iova = MOCK_APERTURE_START,
+ };
+ struct vfio_iommu_type1_dma_unmap unmap_cmd = {
+ .argsz = sizeof(unmap_cmd),
+ };
+ unsigned long pages_iova[16];
+ unsigned int i;
+ void *buf;
+
+ /* Test huge pages and splitting */
+ buf = mmap(0, buf_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1,
+ 0);
+ ASSERT_NE(MAP_FAILED, buf);
+ map_cmd.vaddr = (uintptr_t)buf;
+ ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd));
+
+ unmap_cmd.size = buf_size / ARRAY_SIZE(pages_iova);
+ for (i = 0; i != ARRAY_SIZE(pages_iova); i++)
+ pages_iova[i] = MOCK_APERTURE_START + (i * unmap_cmd.size);
+ shuffle_array(pages_iova, ARRAY_SIZE(pages_iova));
+
+ /* type1 mode can cut up larger mappings, type1v2 always fails */
+ for (i = 0; i != ARRAY_SIZE(pages_iova); i++) {
+ unmap_cmd.iova = pages_iova[i];
+ unmap_cmd.size = buf_size / ARRAY_SIZE(pages_iova);
+ if (variant->version == VFIO_TYPE1_IOMMU) {
+ ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA,
+ &unmap_cmd));
+ } else {
+ EXPECT_ERRNO(ENOENT,
+ ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA,
+ &unmap_cmd));
+ }
+ }
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c
new file mode 100644
index 000000000000..9713111b820d
--- /dev/null
+++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c
@@ -0,0 +1,580 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ *
+ * These tests are "kernel integrity" tests. They are looking for kernel
+ * WARN/OOPS/kasn/etc splats triggered by kernel sanitizers & debugging
+ * features. It does not attempt to verify that the system calls are doing what
+ * they are supposed to do.
+ *
+ * The basic philosophy is to run a sequence of calls that will succeed and then
+ * sweep every failure injection point on that call chain to look for
+ * interesting things in error handling.
+ *
+ * This test is best run with:
+ * echo 1 > /proc/sys/kernel/panic_on_warn
+ * If something is actually going wrong.
+ */
+#include <fcntl.h>
+#include <dirent.h>
+
+#define __EXPORTED_HEADERS__
+#include <linux/vfio.h>
+
+#include "iommufd_utils.h"
+
+static bool have_fault_injection;
+
+static int writeat(int dfd, const char *fn, const char *val)
+{
+ size_t val_len = strlen(val);
+ ssize_t res;
+ int fd;
+
+ fd = openat(dfd, fn, O_WRONLY);
+ if (fd == -1)
+ return -1;
+ res = write(fd, val, val_len);
+ assert(res == val_len);
+ close(fd);
+ return 0;
+}
+
+static __attribute__((constructor)) void setup_buffer(void)
+{
+ BUFFER_SIZE = 2*1024*1024;
+
+ buffer = mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+}
+
+/*
+ * This sets up fail_injection in a way that is useful for this test.
+ * It does not attempt to restore things back to how they were.
+ */
+static __attribute__((constructor)) void setup_fault_injection(void)
+{
+ DIR *debugfs = opendir("/sys/kernel/debug/");
+ struct dirent *dent;
+
+ if (!debugfs)
+ return;
+
+ /* Allow any allocation call to be fault injected */
+ if (writeat(dirfd(debugfs), "failslab/ignore-gfp-wait", "N"))
+ return;
+ writeat(dirfd(debugfs), "fail_page_alloc/ignore-gfp-wait", "N");
+ writeat(dirfd(debugfs), "fail_page_alloc/ignore-gfp-highmem", "N");
+
+ while ((dent = readdir(debugfs))) {
+ char fn[300];
+
+ if (strncmp(dent->d_name, "fail", 4) != 0)
+ continue;
+
+ /* We are looking for kernel splats, quiet down the log */
+ snprintf(fn, sizeof(fn), "%s/verbose", dent->d_name);
+ writeat(dirfd(debugfs), fn, "0");
+ }
+ closedir(debugfs);
+ have_fault_injection = true;
+}
+
+struct fail_nth_state {
+ int proc_fd;
+ unsigned int iteration;
+};
+
+static void fail_nth_first(struct __test_metadata *_metadata,
+ struct fail_nth_state *nth_state)
+{
+ char buf[300];
+
+ snprintf(buf, sizeof(buf), "/proc/self/task/%u/fail-nth", getpid());
+ nth_state->proc_fd = open(buf, O_RDWR);
+ ASSERT_NE(-1, nth_state->proc_fd);
+}
+
+static bool fail_nth_next(struct __test_metadata *_metadata,
+ struct fail_nth_state *nth_state,
+ int test_result)
+{
+ static const char disable_nth[] = "0";
+ char buf[300];
+
+ /*
+ * This is just an arbitrary limit based on the current kernel
+ * situation. Changes in the kernel can dramtically change the number of
+ * required fault injection sites, so if this hits it doesn't
+ * necessarily mean a test failure, just that the limit has to be made
+ * bigger.
+ */
+ ASSERT_GT(400, nth_state->iteration);
+ if (nth_state->iteration != 0) {
+ ssize_t res;
+ ssize_t res2;
+
+ buf[0] = 0;
+ /*
+ * Annoyingly disabling the nth can also fail. This means
+ * the test passed without triggering failure
+ */
+ res = pread(nth_state->proc_fd, buf, sizeof(buf), 0);
+ if (res == -1 && errno == EFAULT) {
+ buf[0] = '1';
+ buf[1] = '\n';
+ res = 2;
+ }
+
+ res2 = pwrite(nth_state->proc_fd, disable_nth,
+ ARRAY_SIZE(disable_nth) - 1, 0);
+ if (res2 == -1 && errno == EFAULT) {
+ res2 = pwrite(nth_state->proc_fd, disable_nth,
+ ARRAY_SIZE(disable_nth) - 1, 0);
+ buf[0] = '1';
+ buf[1] = '\n';
+ }
+ ASSERT_EQ(ARRAY_SIZE(disable_nth) - 1, res2);
+
+ /* printf(" nth %u result=%d nth=%u\n", nth_state->iteration,
+ test_result, atoi(buf)); */
+ fflush(stdout);
+ ASSERT_LT(1, res);
+ if (res != 2 || buf[0] != '0' || buf[1] != '\n')
+ return false;
+ } else {
+ /* printf(" nth %u result=%d\n", nth_state->iteration,
+ test_result); */
+ }
+ nth_state->iteration++;
+ return true;
+}
+
+/*
+ * This is called during the test to start failure injection. It allows the test
+ * to do some setup that has already been swept and thus reduce the required
+ * iterations.
+ */
+void __fail_nth_enable(struct __test_metadata *_metadata,
+ struct fail_nth_state *nth_state)
+{
+ char buf[300];
+ size_t len;
+
+ if (!nth_state->iteration)
+ return;
+
+ len = snprintf(buf, sizeof(buf), "%u", nth_state->iteration);
+ ASSERT_EQ(len, pwrite(nth_state->proc_fd, buf, len, 0));
+}
+#define fail_nth_enable() __fail_nth_enable(_metadata, _nth_state)
+
+#define TEST_FAIL_NTH(fixture_name, name) \
+ static int test_nth_##name(struct __test_metadata *_metadata, \
+ FIXTURE_DATA(fixture_name) *self, \
+ const FIXTURE_VARIANT(fixture_name) \
+ *variant, \
+ struct fail_nth_state *_nth_state); \
+ TEST_F(fixture_name, name) \
+ { \
+ struct fail_nth_state nth_state = {}; \
+ int test_result = 0; \
+ \
+ if (!have_fault_injection) \
+ SKIP(return, \
+ "fault injection is not enabled in the kernel"); \
+ fail_nth_first(_metadata, &nth_state); \
+ ASSERT_EQ(0, test_nth_##name(_metadata, self, variant, \
+ &nth_state)); \
+ while (fail_nth_next(_metadata, &nth_state, test_result)) { \
+ fixture_name##_teardown(_metadata, self, variant); \
+ fixture_name##_setup(_metadata, self, variant); \
+ test_result = test_nth_##name(_metadata, self, \
+ variant, &nth_state); \
+ }; \
+ ASSERT_EQ(0, test_result); \
+ } \
+ static int test_nth_##name( \
+ struct __test_metadata __attribute__((unused)) *_metadata, \
+ FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \
+ const FIXTURE_VARIANT(fixture_name) __attribute__((unused)) \
+ *variant, \
+ struct fail_nth_state *_nth_state)
+
+FIXTURE(basic_fail_nth)
+{
+ int fd;
+ uint32_t access_id;
+};
+
+FIXTURE_SETUP(basic_fail_nth)
+{
+ self->fd = -1;
+ self->access_id = 0;
+}
+
+FIXTURE_TEARDOWN(basic_fail_nth)
+{
+ int rc;
+
+ if (self->access_id) {
+ /* The access FD holds the iommufd open until it closes */
+ rc = _test_cmd_destroy_access(self->access_id);
+ assert(rc == 0);
+ }
+ teardown_iommufd(self->fd, _metadata);
+}
+
+/* Cover ioas.c */
+TEST_FAIL_NTH(basic_fail_nth, basic)
+{
+ struct iommu_iova_range ranges[10];
+ uint32_t ioas_id;
+ __u64 iova;
+
+ fail_nth_enable();
+
+ self->fd = open("/dev/iommu", O_RDWR);
+ if (self->fd == -1)
+ return -1;
+
+ if (_test_ioctl_ioas_alloc(self->fd, &ioas_id))
+ return -1;
+
+ {
+ struct iommu_ioas_iova_ranges ranges_cmd = {
+ .size = sizeof(ranges_cmd),
+ .num_iovas = ARRAY_SIZE(ranges),
+ .ioas_id = ioas_id,
+ .allowed_iovas = (uintptr_t)ranges,
+ };
+ if (ioctl(self->fd, IOMMU_IOAS_IOVA_RANGES, &ranges_cmd))
+ return -1;
+ }
+
+ {
+ struct iommu_ioas_allow_iovas allow_cmd = {
+ .size = sizeof(allow_cmd),
+ .ioas_id = ioas_id,
+ .num_iovas = 1,
+ .allowed_iovas = (uintptr_t)ranges,
+ };
+
+ ranges[0].start = 16*1024;
+ ranges[0].last = BUFFER_SIZE + 16 * 1024 * 600 - 1;
+ if (ioctl(self->fd, IOMMU_IOAS_ALLOW_IOVAS, &allow_cmd))
+ return -1;
+ }
+
+ if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, BUFFER_SIZE, &iova,
+ IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_READABLE))
+ return -1;
+
+ {
+ struct iommu_ioas_copy copy_cmd = {
+ .size = sizeof(copy_cmd),
+ .flags = IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_READABLE,
+ .dst_ioas_id = ioas_id,
+ .src_ioas_id = ioas_id,
+ .src_iova = iova,
+ .length = sizeof(ranges),
+ };
+
+ if (ioctl(self->fd, IOMMU_IOAS_COPY, &copy_cmd))
+ return -1;
+ }
+
+ if (_test_ioctl_ioas_unmap(self->fd, ioas_id, iova, BUFFER_SIZE,
+ NULL))
+ return -1;
+ /* Failure path of no IOVA to unmap */
+ _test_ioctl_ioas_unmap(self->fd, ioas_id, iova, BUFFER_SIZE, NULL);
+ return 0;
+}
+
+/* iopt_area_fill_domains() and iopt_area_fill_domain() */
+TEST_FAIL_NTH(basic_fail_nth, map_domain)
+{
+ uint32_t ioas_id;
+ __u32 device_id;
+ __u32 hwpt_id;
+ __u64 iova;
+
+ self->fd = open("/dev/iommu", O_RDWR);
+ if (self->fd == -1)
+ return -1;
+
+ if (_test_ioctl_ioas_alloc(self->fd, &ioas_id))
+ return -1;
+
+ if (_test_ioctl_set_temp_memory_limit(self->fd, 32))
+ return -1;
+
+ fail_nth_enable();
+
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &device_id, &hwpt_id))
+ return -1;
+
+ if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, 262144, &iova,
+ IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_READABLE))
+ return -1;
+
+ if (_test_ioctl_destroy(self->fd, device_id))
+ return -1;
+ if (_test_ioctl_destroy(self->fd, hwpt_id))
+ return -1;
+
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &device_id, &hwpt_id))
+ return -1;
+ return 0;
+}
+
+TEST_FAIL_NTH(basic_fail_nth, map_two_domains)
+{
+ uint32_t ioas_id;
+ __u32 device_id2;
+ __u32 device_id;
+ __u32 hwpt_id2;
+ __u32 hwpt_id;
+ __u64 iova;
+
+ self->fd = open("/dev/iommu", O_RDWR);
+ if (self->fd == -1)
+ return -1;
+
+ if (_test_ioctl_ioas_alloc(self->fd, &ioas_id))
+ return -1;
+
+ if (_test_ioctl_set_temp_memory_limit(self->fd, 32))
+ return -1;
+
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &device_id, &hwpt_id))
+ return -1;
+
+ fail_nth_enable();
+
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &device_id2, &hwpt_id2))
+ return -1;
+
+ if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, 262144, &iova,
+ IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_READABLE))
+ return -1;
+
+ if (_test_ioctl_destroy(self->fd, device_id))
+ return -1;
+ if (_test_ioctl_destroy(self->fd, hwpt_id))
+ return -1;
+
+ if (_test_ioctl_destroy(self->fd, device_id2))
+ return -1;
+ if (_test_ioctl_destroy(self->fd, hwpt_id2))
+ return -1;
+
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &device_id, &hwpt_id))
+ return -1;
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &device_id2, &hwpt_id2))
+ return -1;
+ return 0;
+}
+
+TEST_FAIL_NTH(basic_fail_nth, access_rw)
+{
+ uint64_t tmp_big[4096];
+ uint32_t ioas_id;
+ uint16_t tmp[32];
+ __u64 iova;
+
+ self->fd = open("/dev/iommu", O_RDWR);
+ if (self->fd == -1)
+ return -1;
+
+ if (_test_ioctl_ioas_alloc(self->fd, &ioas_id))
+ return -1;
+
+ if (_test_ioctl_set_temp_memory_limit(self->fd, 32))
+ return -1;
+
+ if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, 262144, &iova,
+ IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_READABLE))
+ return -1;
+
+ fail_nth_enable();
+
+ if (_test_cmd_create_access(self->fd, ioas_id, &self->access_id, 0))
+ return -1;
+
+ {
+ struct iommu_test_cmd access_cmd = {
+ .size = sizeof(access_cmd),
+ .op = IOMMU_TEST_OP_ACCESS_RW,
+ .id = self->access_id,
+ .access_rw = { .iova = iova,
+ .length = sizeof(tmp),
+ .uptr = (uintptr_t)tmp },
+ };
+
+ // READ
+ if (ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_RW),
+ &access_cmd))
+ return -1;
+
+ access_cmd.access_rw.flags = MOCK_ACCESS_RW_WRITE;
+ if (ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_RW),
+ &access_cmd))
+ return -1;
+
+ access_cmd.access_rw.flags = MOCK_ACCESS_RW_SLOW_PATH;
+ if (ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_RW),
+ &access_cmd))
+ return -1;
+ access_cmd.access_rw.flags = MOCK_ACCESS_RW_SLOW_PATH |
+ MOCK_ACCESS_RW_WRITE;
+ if (ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_RW),
+ &access_cmd))
+ return -1;
+ }
+
+ {
+ struct iommu_test_cmd access_cmd = {
+ .size = sizeof(access_cmd),
+ .op = IOMMU_TEST_OP_ACCESS_RW,
+ .id = self->access_id,
+ .access_rw = { .iova = iova,
+ .flags = MOCK_ACCESS_RW_SLOW_PATH,
+ .length = sizeof(tmp_big),
+ .uptr = (uintptr_t)tmp_big },
+ };
+
+ if (ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_RW),
+ &access_cmd))
+ return -1;
+ }
+ if (_test_cmd_destroy_access(self->access_id))
+ return -1;
+ self->access_id = 0;
+ return 0;
+}
+
+/* pages.c access functions */
+TEST_FAIL_NTH(basic_fail_nth, access_pin)
+{
+ uint32_t access_pages_id;
+ uint32_t ioas_id;
+ __u64 iova;
+
+ self->fd = open("/dev/iommu", O_RDWR);
+ if (self->fd == -1)
+ return -1;
+
+ if (_test_ioctl_ioas_alloc(self->fd, &ioas_id))
+ return -1;
+
+ if (_test_ioctl_set_temp_memory_limit(self->fd, 32))
+ return -1;
+
+ if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, BUFFER_SIZE, &iova,
+ IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_READABLE))
+ return -1;
+
+ if (_test_cmd_create_access(self->fd, ioas_id, &self->access_id,
+ MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES))
+ return -1;
+
+ fail_nth_enable();
+
+ {
+ struct iommu_test_cmd access_cmd = {
+ .size = sizeof(access_cmd),
+ .op = IOMMU_TEST_OP_ACCESS_PAGES,
+ .id = self->access_id,
+ .access_pages = { .iova = iova,
+ .length = BUFFER_SIZE,
+ .uptr = (uintptr_t)buffer },
+ };
+
+ if (ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_RW),
+ &access_cmd))
+ return -1;
+ access_pages_id = access_cmd.access_pages.out_access_pages_id;
+ }
+
+ if (_test_cmd_destroy_access_pages(self->fd, self->access_id,
+ access_pages_id))
+ return -1;
+
+ if (_test_cmd_destroy_access(self->access_id))
+ return -1;
+ self->access_id = 0;
+ return 0;
+}
+
+/* iopt_pages_fill_xarray() */
+TEST_FAIL_NTH(basic_fail_nth, access_pin_domain)
+{
+ uint32_t access_pages_id;
+ uint32_t ioas_id;
+ __u32 device_id;
+ __u32 hwpt_id;
+ __u64 iova;
+
+ self->fd = open("/dev/iommu", O_RDWR);
+ if (self->fd == -1)
+ return -1;
+
+ if (_test_ioctl_ioas_alloc(self->fd, &ioas_id))
+ return -1;
+
+ if (_test_ioctl_set_temp_memory_limit(self->fd, 32))
+ return -1;
+
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &device_id, &hwpt_id))
+ return -1;
+
+ if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, BUFFER_SIZE, &iova,
+ IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_READABLE))
+ return -1;
+
+ if (_test_cmd_create_access(self->fd, ioas_id, &self->access_id,
+ MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES))
+ return -1;
+
+ fail_nth_enable();
+
+ {
+ struct iommu_test_cmd access_cmd = {
+ .size = sizeof(access_cmd),
+ .op = IOMMU_TEST_OP_ACCESS_PAGES,
+ .id = self->access_id,
+ .access_pages = { .iova = iova,
+ .length = BUFFER_SIZE,
+ .uptr = (uintptr_t)buffer },
+ };
+
+ if (ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_RW),
+ &access_cmd))
+ return -1;
+ access_pages_id = access_cmd.access_pages.out_access_pages_id;
+ }
+
+ if (_test_cmd_destroy_access_pages(self->fd, self->access_id,
+ access_pages_id))
+ return -1;
+
+ if (_test_cmd_destroy_access(self->access_id))
+ return -1;
+ self->access_id = 0;
+
+ if (_test_ioctl_destroy(self->fd, device_id))
+ return -1;
+ if (_test_ioctl_destroy(self->fd, hwpt_id))
+ return -1;
+ return 0;
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h
new file mode 100644
index 000000000000..0d1f46369c2a
--- /dev/null
+++ b/tools/testing/selftests/iommu/iommufd_utils.h
@@ -0,0 +1,278 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */
+#ifndef __SELFTEST_IOMMUFD_UTILS
+#define __SELFTEST_IOMMUFD_UTILS
+
+#include <unistd.h>
+#include <stddef.h>
+#include <sys/fcntl.h>
+#include <sys/ioctl.h>
+#include <stdint.h>
+#include <assert.h>
+
+#include "../kselftest_harness.h"
+#include "../../../../drivers/iommu/iommufd/iommufd_test.h"
+
+/* Hack to make assertions more readable */
+#define _IOMMU_TEST_CMD(x) IOMMU_TEST_CMD
+
+static void *buffer;
+static unsigned long BUFFER_SIZE;
+
+/*
+ * Have the kernel check the refcount on pages. I don't know why a freshly
+ * mmap'd anon non-compound page starts out with a ref of 3
+ */
+#define check_refs(_ptr, _length, _refs) \
+ ({ \
+ struct iommu_test_cmd test_cmd = { \
+ .size = sizeof(test_cmd), \
+ .op = IOMMU_TEST_OP_MD_CHECK_REFS, \
+ .check_refs = { .length = _length, \
+ .uptr = (uintptr_t)(_ptr), \
+ .refs = _refs }, \
+ }; \
+ ASSERT_EQ(0, \
+ ioctl(self->fd, \
+ _IOMMU_TEST_CMD(IOMMU_TEST_OP_MD_CHECK_REFS), \
+ &test_cmd)); \
+ })
+
+static int _test_cmd_mock_domain(int fd, unsigned int ioas_id, __u32 *device_id,
+ __u32 *hwpt_id)
+{
+ struct iommu_test_cmd cmd = {
+ .size = sizeof(cmd),
+ .op = IOMMU_TEST_OP_MOCK_DOMAIN,
+ .id = ioas_id,
+ .mock_domain = {},
+ };
+ int ret;
+
+ ret = ioctl(fd, IOMMU_TEST_CMD, &cmd);
+ if (ret)
+ return ret;
+ if (device_id)
+ *device_id = cmd.mock_domain.out_device_id;
+ assert(cmd.id != 0);
+ if (hwpt_id)
+ *hwpt_id = cmd.mock_domain.out_hwpt_id;
+ return 0;
+}
+#define test_cmd_mock_domain(ioas_id, device_id, hwpt_id) \
+ ASSERT_EQ(0, _test_cmd_mock_domain(self->fd, ioas_id, device_id, \
+ hwpt_id))
+#define test_err_mock_domain(_errno, ioas_id, device_id, hwpt_id) \
+ EXPECT_ERRNO(_errno, _test_cmd_mock_domain(self->fd, ioas_id, \
+ device_id, hwpt_id))
+
+static int _test_cmd_create_access(int fd, unsigned int ioas_id,
+ __u32 *access_id, unsigned int flags)
+{
+ struct iommu_test_cmd cmd = {
+ .size = sizeof(cmd),
+ .op = IOMMU_TEST_OP_CREATE_ACCESS,
+ .id = ioas_id,
+ .create_access = { .flags = flags },
+ };
+ int ret;
+
+ ret = ioctl(fd, IOMMU_TEST_CMD, &cmd);
+ if (ret)
+ return ret;
+ *access_id = cmd.create_access.out_access_fd;
+ return 0;
+}
+#define test_cmd_create_access(ioas_id, access_id, flags) \
+ ASSERT_EQ(0, _test_cmd_create_access(self->fd, ioas_id, access_id, \
+ flags))
+
+static int _test_cmd_destroy_access(unsigned int access_id)
+{
+ return close(access_id);
+}
+#define test_cmd_destroy_access(access_id) \
+ ASSERT_EQ(0, _test_cmd_destroy_access(access_id))
+
+static int _test_cmd_destroy_access_pages(int fd, unsigned int access_id,
+ unsigned int access_pages_id)
+{
+ struct iommu_test_cmd cmd = {
+ .size = sizeof(cmd),
+ .op = IOMMU_TEST_OP_DESTROY_ACCESS_PAGES,
+ .id = access_id,
+ .destroy_access_pages = { .access_pages_id = access_pages_id },
+ };
+ return ioctl(fd, IOMMU_TEST_CMD, &cmd);
+}
+#define test_cmd_destroy_access_pages(access_id, access_pages_id) \
+ ASSERT_EQ(0, _test_cmd_destroy_access_pages(self->fd, access_id, \
+ access_pages_id))
+#define test_err_destroy_access_pages(_errno, access_id, access_pages_id) \
+ EXPECT_ERRNO(_errno, _test_cmd_destroy_access_pages( \
+ self->fd, access_id, access_pages_id))
+
+static int _test_ioctl_destroy(int fd, unsigned int id)
+{
+ struct iommu_destroy cmd = {
+ .size = sizeof(cmd),
+ .id = id,
+ };
+ return ioctl(fd, IOMMU_DESTROY, &cmd);
+}
+#define test_ioctl_destroy(id) ASSERT_EQ(0, _test_ioctl_destroy(self->fd, id))
+
+static int _test_ioctl_ioas_alloc(int fd, __u32 *id)
+{
+ struct iommu_ioas_alloc cmd = {
+ .size = sizeof(cmd),
+ };
+ int ret;
+
+ ret = ioctl(fd, IOMMU_IOAS_ALLOC, &cmd);
+ if (ret)
+ return ret;
+ *id = cmd.out_ioas_id;
+ return 0;
+}
+#define test_ioctl_ioas_alloc(id) \
+ ({ \
+ ASSERT_EQ(0, _test_ioctl_ioas_alloc(self->fd, id)); \
+ ASSERT_NE(0, *(id)); \
+ })
+
+static int _test_ioctl_ioas_map(int fd, unsigned int ioas_id, void *buffer,
+ size_t length, __u64 *iova, unsigned int flags)
+{
+ struct iommu_ioas_map cmd = {
+ .size = sizeof(cmd),
+ .flags = flags,
+ .ioas_id = ioas_id,
+ .user_va = (uintptr_t)buffer,
+ .length = length,
+ };
+ int ret;
+
+ if (flags & IOMMU_IOAS_MAP_FIXED_IOVA)
+ cmd.iova = *iova;
+
+ ret = ioctl(fd, IOMMU_IOAS_MAP, &cmd);
+ *iova = cmd.iova;
+ return ret;
+}
+#define test_ioctl_ioas_map(buffer, length, iova_p) \
+ ASSERT_EQ(0, _test_ioctl_ioas_map(self->fd, self->ioas_id, buffer, \
+ length, iova_p, \
+ IOMMU_IOAS_MAP_WRITEABLE | \
+ IOMMU_IOAS_MAP_READABLE))
+
+#define test_err_ioctl_ioas_map(_errno, buffer, length, iova_p) \
+ EXPECT_ERRNO(_errno, \
+ _test_ioctl_ioas_map(self->fd, self->ioas_id, buffer, \
+ length, iova_p, \
+ IOMMU_IOAS_MAP_WRITEABLE | \
+ IOMMU_IOAS_MAP_READABLE))
+
+#define test_ioctl_ioas_map_id(ioas_id, buffer, length, iova_p) \
+ ASSERT_EQ(0, _test_ioctl_ioas_map(self->fd, ioas_id, buffer, length, \
+ iova_p, \
+ IOMMU_IOAS_MAP_WRITEABLE | \
+ IOMMU_IOAS_MAP_READABLE))
+
+#define test_ioctl_ioas_map_fixed(buffer, length, iova) \
+ ({ \
+ __u64 __iova = iova; \
+ ASSERT_EQ(0, _test_ioctl_ioas_map( \
+ self->fd, self->ioas_id, buffer, length, \
+ &__iova, \
+ IOMMU_IOAS_MAP_FIXED_IOVA | \
+ IOMMU_IOAS_MAP_WRITEABLE | \
+ IOMMU_IOAS_MAP_READABLE)); \
+ })
+
+#define test_err_ioctl_ioas_map_fixed(_errno, buffer, length, iova) \
+ ({ \
+ __u64 __iova = iova; \
+ EXPECT_ERRNO(_errno, \
+ _test_ioctl_ioas_map( \
+ self->fd, self->ioas_id, buffer, length, \
+ &__iova, \
+ IOMMU_IOAS_MAP_FIXED_IOVA | \
+ IOMMU_IOAS_MAP_WRITEABLE | \
+ IOMMU_IOAS_MAP_READABLE)); \
+ })
+
+static int _test_ioctl_ioas_unmap(int fd, unsigned int ioas_id, uint64_t iova,
+ size_t length, uint64_t *out_len)
+{
+ struct iommu_ioas_unmap cmd = {
+ .size = sizeof(cmd),
+ .ioas_id = ioas_id,
+ .iova = iova,
+ .length = length,
+ };
+ int ret;
+
+ ret = ioctl(fd, IOMMU_IOAS_UNMAP, &cmd);
+ if (out_len)
+ *out_len = cmd.length;
+ return ret;
+}
+#define test_ioctl_ioas_unmap(iova, length) \
+ ASSERT_EQ(0, _test_ioctl_ioas_unmap(self->fd, self->ioas_id, iova, \
+ length, NULL))
+
+#define test_ioctl_ioas_unmap_id(ioas_id, iova, length) \
+ ASSERT_EQ(0, _test_ioctl_ioas_unmap(self->fd, ioas_id, iova, length, \
+ NULL))
+
+#define test_err_ioctl_ioas_unmap(_errno, iova, length) \
+ EXPECT_ERRNO(_errno, _test_ioctl_ioas_unmap(self->fd, self->ioas_id, \
+ iova, length, NULL))
+
+static int _test_ioctl_set_temp_memory_limit(int fd, unsigned int limit)
+{
+ struct iommu_test_cmd memlimit_cmd = {
+ .size = sizeof(memlimit_cmd),
+ .op = IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT,
+ .memory_limit = { .limit = limit },
+ };
+
+ return ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT),
+ &memlimit_cmd);
+}
+
+#define test_ioctl_set_temp_memory_limit(limit) \
+ ASSERT_EQ(0, _test_ioctl_set_temp_memory_limit(self->fd, limit))
+
+#define test_ioctl_set_default_memory_limit() \
+ test_ioctl_set_temp_memory_limit(65536)
+
+static void teardown_iommufd(int fd, struct __test_metadata *_metadata)
+{
+ struct iommu_test_cmd test_cmd = {
+ .size = sizeof(test_cmd),
+ .op = IOMMU_TEST_OP_MD_CHECK_REFS,
+ .check_refs = { .length = BUFFER_SIZE,
+ .uptr = (uintptr_t)buffer },
+ };
+
+ if (fd == -1)
+ return;
+
+ EXPECT_EQ(0, close(fd));
+
+ fd = open("/dev/iommu", O_RDWR);
+ EXPECT_NE(-1, fd);
+ EXPECT_EQ(0, ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_MD_CHECK_REFS),
+ &test_cmd));
+ EXPECT_EQ(0, close(fd));
+}
+
+#define EXPECT_ERRNO(expected_errno, cmd) \
+ ({ \
+ ASSERT_EQ(-1, cmd); \
+ EXPECT_EQ(expected_errno, errno); \
+ })
+
+#endif