summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/devicetree/bindings/iommu/arm,smmu.yaml5
-rw-r--r--Documentation/devicetree/bindings/iommu/riscv,iommu.yaml147
-rw-r--r--MAINTAINERS9
-rw-r--r--arch/s390/include/asm/pci.h4
-rw-r--r--arch/s390/pci/pci.c3
-rw-r--r--arch/s390/pci/pci_debug.c10
-rw-r--r--drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c4
-rw-r--r--drivers/iommu/Kconfig1
-rw-r--r--drivers/iommu/Makefile2
-rw-r--r--drivers/iommu/amd/amd_iommu.h8
-rw-r--r--drivers/iommu/amd/init.c18
-rw-r--r--drivers/iommu/amd/iommu.c141
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c6
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c7
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu.c11
-rw-r--r--drivers/iommu/io-pgtable-arm-v7s.c149
-rw-r--r--drivers/iommu/io-pgtable-arm.c114
-rw-r--r--drivers/iommu/iommu-sysfs.c2
-rw-r--r--drivers/iommu/iommu.c250
-rw-r--r--drivers/iommu/iova.c2
-rw-r--r--drivers/iommu/mtk_iommu.c2
-rw-r--r--drivers/iommu/omap-iommu.c26
-rw-r--r--drivers/iommu/riscv/Kconfig20
-rw-r--r--drivers/iommu/riscv/Makefile3
-rw-r--r--drivers/iommu/riscv/iommu-bits.h784
-rw-r--r--drivers/iommu/riscv/iommu-pci.c120
-rw-r--r--drivers/iommu/riscv/iommu-platform.c92
-rw-r--r--drivers/iommu/riscv/iommu.c1661
-rw-r--r--drivers/iommu/riscv/iommu.h88
-rw-r--r--drivers/iommu/s390-iommu.c73
-rw-r--r--drivers/media/platform/nvidia/tegra-vde/iommu.c7
-rw-r--r--drivers/remoteproc/remoteproc_core.c6
-rw-r--r--include/linux/iommu.h20
-rw-r--r--include/uapi/linux/iommufd.h8
34 files changed, 3333 insertions, 470 deletions
diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml
index 92d350b8e01a..c1e11bc6b7a0 100644
--- a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml
+++ b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml
@@ -36,10 +36,12 @@ properties:
items:
- enum:
- qcom,qcm2290-smmu-500
+ - qcom,qcs615-smmu-500
- qcom,qcs8300-smmu-500
- qcom,qdu1000-smmu-500
- qcom,sa8255p-smmu-500
- qcom,sa8775p-smmu-500
+ - qcom,sar2130p-smmu-500
- qcom,sc7180-smmu-500
- qcom,sc7280-smmu-500
- qcom,sc8180x-smmu-500
@@ -88,6 +90,7 @@ properties:
- qcom,qcm2290-smmu-500
- qcom,sa8255p-smmu-500
- qcom,sa8775p-smmu-500
+ - qcom,sar2130p-smmu-500
- qcom,sc7280-smmu-500
- qcom,sc8180x-smmu-500
- qcom,sc8280xp-smmu-500
@@ -524,6 +527,7 @@ allOf:
compatible:
items:
- enum:
+ - qcom,sar2130p-smmu-500
- qcom,sm8550-smmu-500
- qcom,sm8650-smmu-500
- qcom,x1e80100-smmu-500
@@ -555,6 +559,7 @@ allOf:
- cavium,smmu-v2
- marvell,ap806-smmu-500
- nvidia,smmu-500
+ - qcom,qcs615-smmu-500
- qcom,qcs8300-smmu-500
- qcom,qdu1000-smmu-500
- qcom,sa8255p-smmu-500
diff --git a/Documentation/devicetree/bindings/iommu/riscv,iommu.yaml b/Documentation/devicetree/bindings/iommu/riscv,iommu.yaml
new file mode 100644
index 000000000000..5d015eeb06d0
--- /dev/null
+++ b/Documentation/devicetree/bindings/iommu/riscv,iommu.yaml
@@ -0,0 +1,147 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iommu/riscv,iommu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: RISC-V IOMMU Architecture Implementation
+
+maintainers:
+ - Tomasz Jeznach <tjeznach@rivosinc.com>
+
+description: |
+ The RISC-V IOMMU provides memory address translation and isolation for
+ input and output devices, supporting per-device translation context,
+ shared process address spaces including the ATS and PRI components of
+ the PCIe specification, two stage address translation and MSI remapping.
+ It supports identical translation table format to the RISC-V address
+ translation tables with page level access and protection attributes.
+ Hardware uses in-memory command and fault reporting queues with wired
+ interrupt or MSI notifications.
+
+ Visit https://github.com/riscv-non-isa/riscv-iommu for more details.
+
+ For information on assigning RISC-V IOMMU to its peripheral devices,
+ see generic IOMMU bindings.
+
+properties:
+ # For PCIe IOMMU hardware compatible property should contain the vendor
+ # and device ID according to the PCI Bus Binding specification.
+ # Since PCI provides built-in identification methods, compatible is not
+ # actually required. For non-PCIe hardware implementations 'riscv,iommu'
+ # should be specified along with 'reg' property providing MMIO location.
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - qemu,riscv-iommu
+ - const: riscv,iommu
+ - items:
+ - enum:
+ - pci1efd,edf1
+ - const: riscv,pci-iommu
+
+ reg:
+ maxItems: 1
+ description:
+ For non-PCI devices this represents base address and size of the
+ IOMMU memory mapped registers interface.
+ For PCI IOMMU hardware implementation this should represent an address
+ of the IOMMU, as defined in the PCI Bus Binding reference.
+
+ '#iommu-cells':
+ const: 1
+ description:
+ The single cell describes the requester id emitted by a master to the
+ IOMMU.
+
+ interrupts:
+ minItems: 1
+ maxItems: 4
+ description:
+ Wired interrupt vectors available for RISC-V IOMMU to notify the
+ RISC-V HARTS. The cause-to-interrupt-vector mapping is software
+ defined using the IVEC IOMMU register.
+
+ msi-parent: true
+
+ power-domains:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - '#iommu-cells'
+
+additionalProperties: false
+
+examples:
+ - |+
+ /* Example 1 (IOMMU device with wired interrupts) */
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ iommu1: iommu@1bccd000 {
+ compatible = "qemu,riscv-iommu", "riscv,iommu";
+ reg = <0x1bccd000 0x1000>;
+ interrupt-parent = <&aplic_smode>;
+ interrupts = <32 IRQ_TYPE_LEVEL_HIGH>,
+ <33 IRQ_TYPE_LEVEL_HIGH>,
+ <34 IRQ_TYPE_LEVEL_HIGH>,
+ <35 IRQ_TYPE_LEVEL_HIGH>;
+ #iommu-cells = <1>;
+ };
+
+ /* Device with two IOMMU device IDs, 0 and 7 */
+ master1 {
+ iommus = <&iommu1 0>, <&iommu1 7>;
+ };
+
+ - |+
+ /* Example 2 (IOMMU device with shared wired interrupt) */
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ iommu2: iommu@1bccd000 {
+ compatible = "qemu,riscv-iommu", "riscv,iommu";
+ reg = <0x1bccd000 0x1000>;
+ interrupt-parent = <&aplic_smode>;
+ interrupts = <32 IRQ_TYPE_LEVEL_HIGH>;
+ #iommu-cells = <1>;
+ };
+
+ - |+
+ /* Example 3 (IOMMU device with MSIs) */
+ iommu3: iommu@1bccd000 {
+ compatible = "qemu,riscv-iommu", "riscv,iommu";
+ reg = <0x1bccd000 0x1000>;
+ msi-parent = <&imsics_smode>;
+ #iommu-cells = <1>;
+ };
+
+ - |+
+ /* Example 4 (IOMMU PCIe device with MSIs) */
+ bus {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ pcie@30000000 {
+ device_type = "pci";
+ #address-cells = <3>;
+ #size-cells = <2>;
+ reg = <0x0 0x30000000 0x0 0x1000000>;
+ ranges = <0x02000000 0x0 0x41000000 0x0 0x41000000 0x0 0x0f000000>;
+
+ /*
+ * The IOMMU manages all functions in this PCI domain except
+ * itself. Omit BDF 00:01.0.
+ */
+ iommu-map = <0x0 &iommu0 0x0 0x8>,
+ <0x9 &iommu0 0x9 0xfff7>;
+
+ /* The IOMMU programming interface uses slot 00:01.0 */
+ iommu0: iommu@1,0 {
+ compatible = "pci1efd,edf1", "riscv,pci-iommu";
+ reg = <0x800 0 0 0 0>;
+ #iommu-cells = <1>;
+ };
+ };
+ };
diff --git a/MAINTAINERS b/MAINTAINERS
index 21fdaa19229a..24a45561ccb8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19810,6 +19810,15 @@ F: arch/riscv/
N: riscv
K: riscv
+RISC-V IOMMU
+M: Tomasz Jeznach <tjeznach@rivosinc.com>
+L: iommu@lists.linux.dev
+L: linux-riscv@lists.infradead.org
+S: Maintained
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/iommu/linux.git
+F: Documentation/devicetree/bindings/iommu/riscv,iommu.yaml
+F: drivers/iommu/riscv/
+
RISC-V MICROCHIP FPGA SUPPORT
M: Conor Dooley <conor.dooley@microchip.com>
M: Daire McNamara <daire.mcnamara@microchip.com>
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index 9d920ced6047..30b20ce9a700 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -96,7 +96,6 @@ struct zpci_bar_struct {
u8 size; /* order 2 exponent */
};
-struct s390_domain;
struct kvm_zdev;
#define ZPCI_FUNCTIONS_PER_BUS 256
@@ -181,9 +180,10 @@ struct zpci_dev {
struct dentry *debugfs_dev;
/* IOMMU and passthrough */
- struct s390_domain *s390_domain; /* s390 IOMMU domain data */
+ struct iommu_domain *s390_domain; /* attached IOMMU domain */
struct kvm_zdev *kzdev;
struct mutex kzdev_lock;
+ spinlock_t dom_lock; /* protect s390_domain change */
};
static inline bool zdev_enabled(struct zpci_dev *zdev)
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index bd9624c20b80..be3299609f9b 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -160,6 +160,7 @@ int zpci_fmb_enable_device(struct zpci_dev *zdev)
u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_SET_MEASURE);
struct zpci_iommu_ctrs *ctrs;
struct zpci_fib fib = {0};
+ unsigned long flags;
u8 cc, status;
if (zdev->fmb || sizeof(*zdev->fmb) < zdev->fmb_length)
@@ -171,6 +172,7 @@ int zpci_fmb_enable_device(struct zpci_dev *zdev)
WARN_ON((u64) zdev->fmb & 0xf);
/* reset software counters */
+ spin_lock_irqsave(&zdev->dom_lock, flags);
ctrs = zpci_get_iommu_ctrs(zdev);
if (ctrs) {
atomic64_set(&ctrs->mapped_pages, 0);
@@ -179,6 +181,7 @@ int zpci_fmb_enable_device(struct zpci_dev *zdev)
atomic64_set(&ctrs->sync_map_rpcits, 0);
atomic64_set(&ctrs->sync_rpcits, 0);
}
+ spin_unlock_irqrestore(&zdev->dom_lock, flags);
fib.fmb_addr = virt_to_phys(zdev->fmb);
diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c
index 2cb5043a997d..38014206c16b 100644
--- a/arch/s390/pci/pci_debug.c
+++ b/arch/s390/pci/pci_debug.c
@@ -71,17 +71,23 @@ static void pci_fmb_show(struct seq_file *m, char *name[], int length,
static void pci_sw_counter_show(struct seq_file *m)
{
- struct zpci_iommu_ctrs *ctrs = zpci_get_iommu_ctrs(m->private);
+ struct zpci_dev *zdev = m->private;
+ struct zpci_iommu_ctrs *ctrs;
atomic64_t *counter;
+ unsigned long flags;
int i;
+ spin_lock_irqsave(&zdev->dom_lock, flags);
+ ctrs = zpci_get_iommu_ctrs(m->private);
if (!ctrs)
- return;
+ goto unlock;
counter = &ctrs->mapped_pages;
for (i = 0; i < ARRAY_SIZE(pci_sw_names); i++, counter++)
seq_printf(m, "%26s:\t%llu\n", pci_sw_names[i],
atomic64_read(counter));
+unlock:
+ spin_unlock_irqrestore(&zdev->dom_lock, flags);
}
static int pci_perf_show(struct seq_file *m, void *v)
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c
index d1c294f00665..78a83f904bbd 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c
@@ -120,8 +120,8 @@ nvkm_device_tegra_probe_iommu(struct nvkm_device_tegra *tdev)
mutex_init(&tdev->iommu.mutex);
if (device_iommu_mapped(dev)) {
- tdev->iommu.domain = iommu_domain_alloc(&platform_bus_type);
- if (!tdev->iommu.domain)
+ tdev->iommu.domain = iommu_paging_domain_alloc(dev);
+ if (IS_ERR(tdev->iommu.domain))
goto error;
/*
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index b3aa1f5d5321..ade4e8cf2a3e 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -195,6 +195,7 @@ config MSM_IOMMU
source "drivers/iommu/amd/Kconfig"
source "drivers/iommu/intel/Kconfig"
source "drivers/iommu/iommufd/Kconfig"
+source "drivers/iommu/riscv/Kconfig"
config IRQ_REMAP
bool "Support for Interrupt Remapping"
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 542760d963ec..5e5a83c6c2aa 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-obj-y += amd/ intel/ arm/ iommufd/
+obj-y += amd/ intel/ arm/ iommufd/ riscv/
obj-$(CONFIG_IOMMU_API) += iommu.o
obj-$(CONFIG_IOMMU_API) += iommu-traces.o
obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index 6386fa4556d9..38509e1019e9 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -46,6 +46,7 @@ extern int amd_iommu_gpt_level;
extern unsigned long amd_iommu_pgsize_bitmap;
/* Protection domain ops */
+void amd_iommu_init_identity_domain(void);
struct protection_domain *protection_domain_alloc(unsigned int type, int nid);
void protection_domain_free(struct protection_domain *domain);
struct iommu_domain *amd_iommu_domain_alloc_sva(struct device *dev,
@@ -118,9 +119,14 @@ static inline bool check_feature2(u64 mask)
return (amd_iommu_efr2 & mask);
}
+static inline bool amd_iommu_v2_pgtbl_supported(void)
+{
+ return (check_feature(FEATURE_GIOSUP) && check_feature(FEATURE_GT));
+}
+
static inline bool amd_iommu_gt_ppr_supported(void)
{
- return (check_feature(FEATURE_GT) &&
+ return (amd_iommu_v2_pgtbl_supported() &&
check_feature(FEATURE_PPR) &&
check_feature(FEATURE_EPHSUP));
}
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 43131c3a2172..45fdba48d54d 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -2070,14 +2070,6 @@ static int __init iommu_init_pci(struct amd_iommu *iommu)
init_iommu_perf_ctr(iommu);
- if (amd_iommu_pgtable == AMD_IOMMU_V2) {
- if (!check_feature(FEATURE_GIOSUP) ||
- !check_feature(FEATURE_GT)) {
- pr_warn("Cannot enable v2 page table for DMA-API. Fallback to v1.\n");
- amd_iommu_pgtable = AMD_IOMMU_V1;
- }
- }
-
if (is_rd890_iommu(iommu->dev)) {
int i, j;
@@ -2172,6 +2164,9 @@ static int __init amd_iommu_init_pci(void)
struct amd_iommu_pci_seg *pci_seg;
int ret;
+ /* Init global identity domain before registering IOMMU */
+ amd_iommu_init_identity_domain();
+
for_each_iommu(iommu) {
ret = iommu_init_pci(iommu);
if (ret) {
@@ -3091,6 +3086,13 @@ static int __init early_amd_iommu_init(void)
FIELD_GET(FEATURE_GATS, amd_iommu_efr) == GUEST_PGTABLE_5_LEVEL)
amd_iommu_gpt_level = PAGE_MODE_5_LEVEL;
+ if (amd_iommu_pgtable == AMD_IOMMU_V2) {
+ if (!amd_iommu_v2_pgtbl_supported()) {
+ pr_warn("Cannot enable v2 page table for DMA-API. Fallback to v1.\n");
+ amd_iommu_pgtable = AMD_IOMMU_V1;
+ }
+ }
+
/* Disable any previously enabled IOMMUs */
if (!is_kdump_kernel() || amd_iommu_disabled)
disable_iommus();
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 8364cd6fa47d..477aaf76b7ad 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -74,6 +74,9 @@ struct kmem_cache *amd_iommu_irq_cache;
static void detach_device(struct device *dev);
+static int amd_iommu_attach_device(struct iommu_domain *dom,
+ struct device *dev);
+
static void set_dte_entry(struct amd_iommu *iommu,
struct iommu_dev_data *dev_data);
@@ -2263,43 +2266,41 @@ void protection_domain_free(struct protection_domain *domain)
kfree(domain);
}
+static void protection_domain_init(struct protection_domain *domain, int nid)
+{
+ spin_lock_init(&domain->lock);
+ INIT_LIST_HEAD(&domain->dev_list);
+ INIT_LIST_HEAD(&domain->dev_data_list);
+ domain->iop.pgtbl.cfg.amd.nid = nid;
+}
+
struct protection_domain *protection_domain_alloc(unsigned int type, int nid)
{
- struct io_pgtable_ops *pgtbl_ops;
struct protection_domain *domain;
- int pgtable;
domain = kzalloc(sizeof(*domain), GFP_KERNEL);
if (!domain)
return NULL;
domain->id = domain_id_alloc();
- if (!domain->id)
- goto err_free;
+ if (!domain->id) {
+ kfree(domain);
+ return NULL;
+ }
- spin_lock_init(&domain->lock);
- INIT_LIST_HEAD(&domain->dev_list);
- INIT_LIST_HEAD(&domain->dev_data_list);
- domain->iop.pgtbl.cfg.amd.nid = nid;
+ protection_domain_init(domain, nid);
+
+ return domain;
+}
+
+static int pdom_setup_pgtable(struct protection_domain *domain,
+ unsigned int type, int pgtable)
+{
+ struct io_pgtable_ops *pgtbl_ops;
- switch (type) {
/* No need to allocate io pgtable ops in passthrough mode */
- case IOMMU_DOMAIN_IDENTITY:
- case IOMMU_DOMAIN_SVA:
- return domain;
- case IOMMU_DOMAIN_DMA:
- pgtable = amd_iommu_pgtable;
- break;
- /*
- * Force IOMMU v1 page table when allocating
- * domain for pass-through devices.
- */
- case IOMMU_DOMAIN_UNMANAGED:
- pgtable = AMD_IOMMU_V1;
- break;
- default:
- goto err_id;
- }
+ if (!(type & __IOMMU_DOMAIN_PAGING))
+ return 0;
switch (pgtable) {
case AMD_IOMMU_V1:
@@ -2309,25 +2310,20 @@ struct protection_domain *protection_domain_alloc(unsigned int type, int nid)
domain->pd_mode = PD_MODE_V2;
break;
default:
- goto err_id;
+ return -EINVAL;
}
pgtbl_ops =
alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl.cfg, domain);
if (!pgtbl_ops)
- goto err_id;
+ return -ENOMEM;
- return domain;
-err_id:
- domain_id_free(domain->id);
-err_free:
- kfree(domain);
- return NULL;
+ return 0;
}
-static inline u64 dma_max_address(void)
+static inline u64 dma_max_address(int pgtable)
{
- if (amd_iommu_pgtable == AMD_IOMMU_V1)
+ if (pgtable == AMD_IOMMU_V1)
return ~0ULL;
/* V2 with 4/5 level page table */
@@ -2340,11 +2336,13 @@ static bool amd_iommu_hd_support(struct amd_iommu *iommu)
}
static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
- struct device *dev, u32 flags)
+ struct device *dev,
+ u32 flags, int pgtable)
{
bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
struct protection_domain *domain;
struct amd_iommu *iommu = NULL;
+ int ret;
if (dev)
iommu = get_amd_iommu_from_dev(dev);
@@ -2356,16 +2354,20 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY))
return ERR_PTR(-EINVAL);
- if (dirty_tracking && !amd_iommu_hd_support(iommu))
- return ERR_PTR(-EOPNOTSUPP);
-
domain = protection_domain_alloc(type,
dev ? dev_to_node(dev) : NUMA_NO_NODE);
if (!domain)
return ERR_PTR(-ENOMEM);
+ ret = pdom_setup_pgtable(domain, type, pgtable);
+ if (ret) {
+ domain_id_free(domain->id);
+ kfree(domain);
+ return ERR_PTR(ret);
+ }
+
domain->domain.geometry.aperture_start = 0;
- domain->domain.geometry.aperture_end = dma_max_address();
+ domain->domain.geometry.aperture_end = dma_max_address(pgtable);
domain->domain.geometry.force_aperture = true;
domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap;
@@ -2383,8 +2385,16 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type)
{
struct iommu_domain *domain;
+ int pgtable = amd_iommu_pgtable;
- domain = do_iommu_domain_alloc(type, NULL, 0);
+ /*
+ * Force IOMMU v1 page table when allocating
+ * domain for pass-through devices.
+ */
+ if (type == IOMMU_DOMAIN_UNMANAGED)
+ pgtable = AMD_IOMMU_V1;
+
+ domain = do_iommu_domain_alloc(type, NULL, 0, pgtable);
if (IS_ERR(domain))
return NULL;
@@ -2398,11 +2408,36 @@ amd_iommu_domain_alloc_user(struct device *dev, u32 flags,
{
unsigned int type = IOMMU_DOMAIN_UNMANAGED;
+ struct amd_iommu *iommu = NULL;
+ const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
+ IOMMU_HWPT_ALLOC_PASID;
+
+ if (dev)
+ iommu = get_amd_iommu_from_dev(dev);
- if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data)
+ if ((flags & ~supported_flags) || parent || user_data)
return ERR_PTR(-EOPNOTSUPP);
- return do_iommu_domain_alloc(type, dev, flags);
+ /* Allocate domain with v2 page table if IOMMU supports PASID. */
+ if (flags & IOMMU_HWPT_ALLOC_PASID) {
+ if (!amd_iommu_pasid_supported())
+ return ERR_PTR(-EOPNOTSUPP);
+
+ return do_iommu_domain_alloc(type, dev, flags, AMD_IOMMU_V2);
+ }
+
+ /* Allocate domain with v1 page table for dirty tracking */
+ if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) {
+ if (iommu && amd_iommu_hd_support(iommu)) {
+ return do_iommu_domain_alloc(type, dev,
+ flags, AMD_IOMMU_V1);
+ }
+
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ /* If nothing specific is required use the kernel commandline default */
+ return do_iommu_domain_alloc(type, dev, 0, amd_iommu_pgtable);
}
void amd_iommu_domain_free(struct iommu_domain *dom)
@@ -2444,6 +2479,25 @@ static struct iommu_domain blocked_domain = {
}
};
+static struct protection_domain identity_domain;
+
+static const struct iommu_domain_ops identity_domain_ops = {
+ .attach_dev = amd_iommu_attach_device,
+};
+
+void amd_iommu_init_identity_domain(void)
+{
+ struct iommu_domain *domain = &identity_domain.domain;
+
+ domain->type = IOMMU_DOMAIN_IDENTITY;
+ domain->ops = &identity_domain_ops;
+ domain->owner = &amd_iommu_ops;
+
+ identity_domain.id = domain_id_alloc();
+
+ protection_domain_init(&identity_domain, NUMA_NO_NODE);
+}
+
static int amd_iommu_attach_device(struct iommu_domain *dom,
struct device *dev)
{
@@ -2842,6 +2896,7 @@ static int amd_iommu_dev_disable_feature(struct device *dev,
const struct iommu_ops amd_iommu_ops = {
.capable = amd_iommu_capable,
.blocked_domain = &blocked_domain,
+ .identity_domain = &identity_domain.domain,
.domain_alloc = amd_iommu_domain_alloc,
.domain_alloc_user = amd_iommu_domain_alloc_user,
.domain_alloc_sva = amd_iommu_domain_alloc_sva,
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 353fea58cd31..826db8894fb7 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3084,7 +3084,8 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags,
const struct iommu_user_data *user_data)
{
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
- const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
+ IOMMU_HWPT_ALLOC_PASID;
struct arm_smmu_domain *smmu_domain;
int ret;
@@ -3093,6 +3094,9 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags,
if (parent || user_data)
return ERR_PTR(-EOPNOTSUPP);
+ if (flags & IOMMU_HWPT_ALLOC_PASID)
+ return arm_smmu_domain_alloc_paging(dev);
+
smmu_domain = arm_smmu_domain_alloc();
if (IS_ERR(smmu_domain))
return ERR_CAST(smmu_domain);
diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
index fcd13d301fff..c8ec74f089f3 100644
--- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
+++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
@@ -509,7 +509,8 @@ static int tegra241_vcmdq_alloc_smmu_cmdq(struct tegra241_vcmdq *vcmdq)
snprintf(name, 16, "vcmdq%u", vcmdq->idx);
- q->llq.max_n_shift = VCMDQ_LOG2SIZE_MAX;
+ /* Queue size, capped to ensure natural alignment */
+ q->llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT, VCMDQ_LOG2SIZE_MAX);
/* Use the common helper to init the VCMDQ, and then... */
ret = arm_smmu_init_one_queue(smmu, q, vcmdq->page0,
@@ -800,7 +801,9 @@ out_fallback:
return 0;
}
-struct dentry *cmdqv_debugfs_dir;
+#ifdef CONFIG_IOMMU_DEBUGFS
+static struct dentry *cmdqv_debugfs_dir;
+#endif
static struct arm_smmu_device *
__tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res,
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 8321962b3714..14618772a3d6 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -1437,6 +1437,17 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev)
goto out_free;
} else {
smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode);
+
+ /*
+ * Defer probe if the relevant SMMU instance hasn't finished
+ * probing yet. This is a fragile hack and we'd ideally
+ * avoid this race in the core code. Until that's ironed
+ * out, however, this is the most pragmatic option on the
+ * table.
+ */
+ if (!smmu)
+ return ERR_PTR(dev_err_probe(dev, -EPROBE_DEFER,
+ "smmu dev has not bound yet\n"));
}
ret = -EINVAL;
diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c
index 06ffc683b28f..523355e91a2c 100644
--- a/drivers/iommu/io-pgtable-arm-v7s.c
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -166,7 +166,6 @@ struct arm_v7s_io_pgtable {
arm_v7s_iopte *pgd;
struct kmem_cache *l2_tables;
- spinlock_t split_lock;
};
static bool arm_v7s_pte_is_cont(arm_v7s_iopte pte, int lvl);
@@ -363,25 +362,6 @@ static arm_v7s_iopte arm_v7s_prot_to_pte(int prot, int lvl,
return pte;
}
-static int arm_v7s_pte_to_prot(arm_v7s_iopte pte, int lvl)
-{
- int prot = IOMMU_READ;
- arm_v7s_iopte attr = pte >> ARM_V7S_ATTR_SHIFT(lvl);
-
- if (!(attr & ARM_V7S_PTE_AP_RDONLY))
- prot |= IOMMU_WRITE;
- if (!(attr & ARM_V7S_PTE_AP_UNPRIV))
- prot |= IOMMU_PRIV;
- if ((attr & (ARM_V7S_TEX_MASK << ARM_V7S_TEX_SHIFT)) == 0)
- prot |= IOMMU_MMIO;
- else if (pte & ARM_V7S_ATTR_C)
- prot |= IOMMU_CACHE;
- if (pte & ARM_V7S_ATTR_XN(lvl))
- prot |= IOMMU_NOEXEC;
-
- return prot;
-}
-
static arm_v7s_iopte arm_v7s_pte_to_cont(arm_v7s_iopte pte, int lvl)
{
if (lvl == 1) {
@@ -398,23 +378,6 @@ static arm_v7s_iopte arm_v7s_pte_to_cont(arm_v7s_iopte pte, int lvl)
return pte;
}
-static arm_v7s_iopte arm_v7s_cont_to_pte(arm_v7s_iopte pte, int lvl)
-{
- if (lvl == 1) {
- pte &= ~ARM_V7S_CONT_SECTION;
- } else if (lvl == 2) {
- arm_v7s_iopte xn = pte & BIT(ARM_V7S_CONT_PAGE_XN_SHIFT);
- arm_v7s_iopte tex = pte & (ARM_V7S_CONT_PAGE_TEX_MASK <<
- ARM_V7S_CONT_PAGE_TEX_SHIFT);
-
- pte ^= xn | tex | ARM_V7S_PTE_TYPE_CONT_PAGE;
- pte |= (xn >> ARM_V7S_CONT_PAGE_XN_SHIFT) |
- (tex >> ARM_V7S_CONT_PAGE_TEX_SHIFT) |
- ARM_V7S_PTE_TYPE_PAGE;
- }
- return pte;
-}
-
static bool arm_v7s_pte_is_cont(arm_v7s_iopte pte, int lvl)
{
if (lvl == 1 && !ARM_V7S_PTE_IS_TABLE(pte, lvl))
@@ -591,77 +554,6 @@ static void arm_v7s_free_pgtable(struct io_pgtable *iop)
kfree(data);
}
-static arm_v7s_iopte arm_v7s_split_cont(struct arm_v7s_io_pgtable *data,
- unsigned long iova, int idx, int lvl,
- arm_v7s_iopte *ptep)
-{
- struct io_pgtable *iop = &data->iop;
- arm_v7s_iopte pte;
- size_t size = ARM_V7S_BLOCK_SIZE(lvl);
- int i;
-
- /* Check that we didn't lose a race to get the lock */
- pte = *ptep;
- if (!arm_v7s_pte_is_cont(pte, lvl))
- return pte;
-
- ptep -= idx & (ARM_V7S_CONT_PAGES - 1);
- pte = arm_v7s_cont_to_pte(pte, lvl);
- for (i = 0; i < ARM_V7S_CONT_PAGES; i++)
- ptep[i] = pte + i * size;
-
- __arm_v7s_pte_sync(ptep, ARM_V7S_CONT_PAGES, &iop->cfg);
-
- size *= ARM_V7S_CONT_PAGES;
- io_pgtable_tlb_flush_walk(iop, iova, size, size);
- return pte;
-}
-
-static size_t arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data,
- struct iommu_iotlb_gather *gather,
- unsigned long iova, size_t size,
- arm_v7s_iopte blk_pte,
- arm_v7s_iopte *ptep)
-{
- struct io_pgtable_cfg *cfg = &data->iop.cfg;
- arm_v7s_iopte pte, *tablep;
- int i, unmap_idx, num_entries, num_ptes;
-
- tablep = __arm_v7s_alloc_table(2, GFP_ATOMIC, data);
- if (!tablep)
- return 0; /* Bytes unmapped */
-
- num_ptes = ARM_V7S_PTES_PER_LVL(2, cfg);
- num_entries = size >> ARM_V7S_LVL_SHIFT(2);
- unmap_idx = ARM_V7S_LVL_IDX(iova, 2, cfg);
-
- pte = arm_v7s_prot_to_pte(arm_v7s_pte_to_prot(blk_pte, 1), 2, cfg);
- if (num_entries > 1)
- pte = arm_v7s_pte_to_cont(pte, 2);
-
- for (i = 0; i < num_ptes; i += num_entries, pte += size) {
- /* Unmap! */
- if (i == unmap_idx)
- continue;
-
- __arm_v7s_set_pte(&tablep[i], pte, num_entries, cfg);
- }
-
- pte = arm_v7s_install_table(tablep, ptep, blk_pte, cfg);
- if (pte != blk_pte) {
- __arm_v7s_free_table(tablep, 2, data);
-
- if (!ARM_V7S_PTE_IS_TABLE(pte, 1))
- return 0;
-
- tablep = iopte_deref(pte, 1, data);
- return __arm_v7s_unmap(data, gather, iova, size, 2, tablep);
- }
-
- io_pgtable_tlb_add_page(&data->iop, gather, iova, size);
- return size;
-}
-
static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data,
struct iommu_iotlb_gather *gather,
unsigned long iova, size_t size, int lvl,
@@ -694,11 +586,8 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data,
* case in a lock for the sake of correctness and be done with it.
*/
if (num_entries <= 1 && arm_v7s_pte_is_cont(pte[0], lvl)) {
- unsigned long flags;
-
- spin_lock_irqsave(&data->split_lock, flags);
- pte[0] = arm_v7s_split_cont(data, iova, idx, lvl, ptep);
- spin_unlock_irqrestore(&data->split_lock, flags);
+ WARN_ONCE(true, "Unmap of a partial large IOPTE is not allowed");
+ return 0;
}
/* If the size matches this level, we're in the right place */
@@ -721,12 +610,8 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data,
}
return size;
} else if (lvl == 1 && !ARM_V7S_PTE_IS_TABLE(pte[0], lvl)) {
- /*
- * Insert a table at the next level to map the old region,
- * minus the part we want to unmap
- */
- return arm_v7s_split_blk_unmap(data, gather, iova, size, pte[0],
- ptep);
+ WARN_ONCE(true, "Unmap of a partial large IOPTE is not allowed");
+ return 0;
}
/* Keep on walkin' */
@@ -811,8 +696,6 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
if (!data)
return NULL;
- spin_lock_init(&data->split_lock);
-
/*
* ARM_MTK_TTBR_EXT extend the translation table base support larger
* memory address.
@@ -936,8 +819,8 @@ static int __init arm_v7s_do_selftests(void)
.quirks = IO_PGTABLE_QUIRK_ARM_NS,
.pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M,
};
- unsigned int iova, size, iova_start;
- unsigned int i, loopnr = 0;
+ unsigned int iova, size;
+ unsigned int i;
size_t mapped;
selftest_running = true;
@@ -985,26 +868,6 @@ static int __init arm_v7s_do_selftests(void)
return __FAIL(ops);
iova += SZ_16M;
- loopnr++;
- }
-
- /* Partial unmap */
- i = 1;
- size = 1UL << __ffs(cfg.pgsize_bitmap);
- while (i < loopnr) {
- iova_start = i * SZ_16M;
- if (ops->unmap_pages(ops, iova_start + size, size, 1, NULL) != size)
- return __FAIL(ops);
-
- /* Remap of partial unmap */
- if (ops->map_pages(ops, iova_start + size, size, size, 1,
- IOMMU_READ, GFP_KERNEL, &mapped))
- return __FAIL(ops);
-
- if (ops->iova_to_phys(ops, iova_start + size + 42)
- != (size + 42))
- return __FAIL(ops);
- i++;
}
/* Full unmap */
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 0e67f1721a3d..d045a76dbfdc 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -199,6 +199,18 @@ static phys_addr_t iopte_to_paddr(arm_lpae_iopte pte,
return (paddr | (paddr << (48 - 12))) & (ARM_LPAE_PTE_ADDR_MASK << 4);
}
+/*
+ * Convert an index returned by ARM_LPAE_PGD_IDX(), which can point into
+ * a concatenated PGD, into the maximum number of entries that can be
+ * mapped in the same table page.
+ */
+static inline int arm_lpae_max_entries(int i, struct arm_lpae_io_pgtable *data)
+{
+ int ptes_per_table = ARM_LPAE_PTES_PER_TABLE(data);
+
+ return ptes_per_table - (i & (ptes_per_table - 1));
+}
+
static bool selftest_running = false;
static dma_addr_t __arm_lpae_dma_addr(void *pages)
@@ -390,7 +402,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
/* If we can install a leaf entry at this level, then do so */
if (size == block_size) {
- max_entries = ARM_LPAE_PTES_PER_TABLE(data) - map_idx_start;
+ max_entries = arm_lpae_max_entries(map_idx_start, data);
num_entries = min_t(int, pgcount, max_entries);
ret = arm_lpae_init_pte(data, iova, paddr, prot, lvl, num_entries, ptep);
if (!ret)
@@ -569,66 +581,6 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop)
kfree(data);
}
-static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
- struct iommu_iotlb_gather *gather,
- unsigned long iova, size_t size,
- arm_lpae_iopte blk_pte, int lvl,
- arm_lpae_iopte *ptep, size_t pgcount)
-{
- struct io_pgtable_cfg *cfg = &data->iop.cfg;
- arm_lpae_iopte pte, *tablep;
- phys_addr_t blk_paddr;
- size_t tablesz = ARM_LPAE_GRANULE(data);
- size_t split_sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
- int ptes_per_table = ARM_LPAE_PTES_PER_TABLE(data);
- int i, unmap_idx_start = -1, num_entries = 0, max_entries;
-
- if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
- return 0;
-
- tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg, data->iop.cookie);
- if (!tablep)
- return 0; /* Bytes unmapped */
-
- if (size == split_sz) {
- unmap_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data);
- max_entries = ptes_per_table - unmap_idx_start;
- num_entries = min_t(int, pgcount, max_entries);
- }
-
- blk_paddr = iopte_to_paddr(blk_pte, data);
- pte = iopte_prot(blk_pte);
-
- for (i = 0; i < ptes_per_table; i++, blk_paddr += split_sz) {
- /* Unmap! */
- if (i >= unmap_idx_start && i < (unmap_idx_start + num_entries))
- continue;
-
- __arm_lpae_init_pte(data, blk_paddr, pte, lvl, 1, &tablep[i]);
- }
-
- pte = arm_lpae_install_table(tablep, ptep, blk_pte, data);
- if (pte != blk_pte) {
- __arm_lpae_free_pages(tablep, tablesz, cfg, data->iop.cookie);
- /*
- * We may race against someone unmapping another part of this
- * block, but anything else is invalid. We can't misinterpret
- * a page entry here since we're never at the last level.
- */
- if (iopte_type(pte) != ARM_LPAE_PTE_TYPE_TABLE)
- return 0;
-
- tablep = iopte_deref(pte, data);
- } else if (unmap_idx_start >= 0) {
- for (i = 0; i < num_entries; i++)
- io_pgtable_tlb_add_page(&data->iop, gather, iova + i * size, size);
-
- return num_entries * size;
- }
-
- return __arm_lpae_unmap(data, gather, iova, size, pgcount, lvl, tablep);
-}
-
static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
struct iommu_iotlb_gather *gather,
unsigned long iova, size_t size, size_t pgcount,
@@ -650,7 +602,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
/* If the size matches this level, we're in the right place */
if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) {
- max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start;
+ max_entries = arm_lpae_max_entries(unmap_idx_start, data);
num_entries = min_t(int, pgcount, max_entries);
/* Find and handle non-leaf entries */
@@ -678,12 +630,8 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
return i * size;
} else if (iopte_leaf(pte, lvl, iop->fmt)) {
- /*
- * Insert a table at the next level to map the old region,
- * minus the part we want to unmap
- */
- return arm_lpae_split_blk_unmap(data, gather, iova, size, pte,
- lvl + 1, ptep, pgcount);
+ WARN_ONCE(true, "Unmap of a partial large IOPTE is not allowed");
+ return 0;
}
/* Keep on walkin' */
@@ -1347,19 +1295,6 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg)
iova += SZ_1G;
}
- /* Partial unmap */
- size = 1UL << __ffs(cfg->pgsize_bitmap);
- if (ops->unmap_pages(ops, SZ_1G + size, size, 1, NULL) != size)
- return __FAIL(ops, i);
-
- /* Remap of partial unmap */
- if (ops->map_pages(ops, SZ_1G + size, size, size, 1,
- IOMMU_READ, GFP_KERNEL, &mapped))
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, SZ_1G + size + 42) != (size + 42))
- return __FAIL(ops, i);
-
/* Full unmap */
iova = 0;
for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
@@ -1382,6 +1317,23 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg)
iova += SZ_1G;
}
+ /*
+ * Map/unmap the last largest supported page of the IAS; this can
+ * trigger corner cases in the concatenated page tables.
+ */
+ mapped = 0;
+ size = 1UL << __fls(cfg->pgsize_bitmap);
+ iova = (1UL << cfg->ias) - size;
+ if (ops->map_pages(ops, iova, iova, size, 1,
+ IOMMU_READ | IOMMU_WRITE |
+ IOMMU_NOEXEC | IOMMU_CACHE,
+ GFP_KERNEL, &mapped))
+ return __FAIL(ops, i);
+ if (mapped != size)
+ return __FAIL(ops, i);
+ if (ops->unmap_pages(ops, iova, size, 1, NULL) != size)
+ return __FAIL(ops, i);
+
free_io_pgtable_ops(ops);
}
diff --git a/drivers/iommu/iommu-sysfs.c b/drivers/iommu/iommu-sysfs.c
index cbe378c34ba3..170022c09536 100644
--- a/drivers/iommu/iommu-sysfs.c
+++ b/drivers/iommu/iommu-sysfs.c
@@ -34,7 +34,7 @@ static void release_device(struct device *dev)
kfree(dev);
}
-static struct class iommu_class = {
+static const struct class iommu_class = {
.name = "iommu",
.dev_release = release_device,
.dev_groups = dev_groups,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 83c8e617a2c5..87a0721c7b6d 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -32,6 +32,7 @@
#include <trace/events/iommu.h>
#include <linux/sched/mm.h>
#include <linux/msi.h>
+#include <uapi/linux/iommufd.h>
#include "dma-iommu.h"
#include "iommu-priv.h"
@@ -90,15 +91,17 @@ static const char * const iommu_group_resv_type_string[] = {
#define IOMMU_CMD_LINE_DMA_API BIT(0)
#define IOMMU_CMD_LINE_STRICT BIT(1)
+static int bus_iommu_probe(const struct bus_type *bus);
static int iommu_bus_notifier(struct notifier_block *nb,
unsigned long action, void *data);
static void iommu_release_device(struct device *dev);
-static struct iommu_domain *
-__iommu_group_domain_alloc(struct iommu_group *group, unsigned int type);
static int __iommu_attach_device(struct iommu_domain *domain,
struct device *dev);
static int __iommu_attach_group(struct iommu_domain *domain,
struct iommu_group *group);
+static struct iommu_domain *__iommu_paging_domain_alloc_flags(struct device *dev,
+ unsigned int type,
+ unsigned int flags);
enum {
IOMMU_SET_DOMAIN_MUST_SUCCEED = 1 << 0,
@@ -133,6 +136,8 @@ static struct group_device *iommu_group_alloc_device(struct iommu_group *group,
struct device *dev);
static void __iommu_group_free_device(struct iommu_group *group,
struct group_device *grp_dev);
+static void iommu_domain_init(struct iommu_domain *domain, unsigned int type,
+ const struct iommu_ops *ops);
#define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \
struct iommu_group_attribute iommu_group_attr_##_name = \
@@ -1141,10 +1146,6 @@ map_end:
}
}
-
- if (!list_empty(&mappings) && iommu_is_dma_domain(domain))
- iommu_flush_iotlb_all(domain);
-
out:
iommu_put_resv_regions(dev, &mappings);
@@ -1586,12 +1587,59 @@ struct iommu_group *fsl_mc_device_group(struct device *dev)
}
EXPORT_SYMBOL_GPL(fsl_mc_device_group);
+static struct iommu_domain *__iommu_alloc_identity_domain(struct device *dev)
+{
+ const struct iommu_ops *ops = dev_iommu_ops(dev);
+ struct iommu_domain *domain;
+
+ if (ops->identity_domain)
+ return ops->identity_domain;
+
+ /* Older drivers create the identity domain via ops->domain_alloc() */
+ if (!ops->domain_alloc)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ domain = ops->domain_alloc(IOMMU_DOMAIN_IDENTITY);
+ if (IS_ERR(domain))
+ return domain;
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+
+ iommu_domain_init(domain, IOMMU_DOMAIN_IDENTITY, ops);
+ return domain;
+}
+
static struct iommu_domain *
__iommu_group_alloc_default_domain(struct iommu_group *group, int req_type)
{
+ struct device *dev = iommu_group_first_dev(group);
+ struct iommu_domain *dom;
+
if (group->default_domain && group->default_domain->type == req_type)
return group->default_domain;
- return __iommu_group_domain_alloc(group, req_type);
+
+ /*
+ * When allocating the DMA API domain assume that the driver is going to
+ * use PASID and make sure the RID's domain is PASID compatible.
+ */
+ if (req_type & __IOMMU_DOMAIN_PAGING) {
+ dom = __iommu_paging_domain_alloc_flags(dev, req_type,
+ dev->iommu->max_pasids ? IOMMU_HWPT_ALLOC_PASID : 0);
+
+ /*
+ * If driver does not support PASID feature then
+ * try to allocate non-PASID domain
+ */
+ if (PTR_ERR(dom) == -EOPNOTSUPP)
+ dom = __iommu_paging_domain_alloc_flags(dev, req_type, 0);
+
+ return dom;
+ }
+
+ if (req_type == IOMMU_DOMAIN_IDENTITY)
+ return __iommu_alloc_identity_domain(dev);
+
+ return ERR_PTR(-EINVAL);
}
/*
@@ -1795,7 +1843,7 @@ static void iommu_group_do_probe_finalize(struct device *dev)
ops->probe_finalize(dev);
}
-int bus_iommu_probe(const struct bus_type *bus)
+static int bus_iommu_probe(const struct bus_type *bus)
{
struct iommu_group *group, *next;
LIST_HEAD(group_list);
@@ -1841,31 +1889,6 @@ int bus_iommu_probe(const struct bus_type *bus)
}
/**
- * iommu_present() - make platform-specific assumptions about an IOMMU
- * @bus: bus to check
- *
- * Do not use this function. You want device_iommu_mapped() instead.
- *
- * Return: true if some IOMMU is present and aware of devices on the given bus;
- * in general it may not be the only IOMMU, and it may not have anything to do
- * with whatever device you are ultimately interested in.
- */
-bool iommu_present(const struct bus_type *bus)
-{
- bool ret = false;
-
- for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) {
- if (iommu_buses[i] == bus) {
- spin_lock(&iommu_device_lock);
- ret = !list_empty(&iommu_device_list);
- spin_unlock(&iommu_device_lock);
- }
- }
- return ret;
-}
-EXPORT_SYMBOL_GPL(iommu_present);
-
-/**
* device_iommu_capable() - check for a general IOMMU capability
* @dev: device to which the capability would be relevant, if available
* @cap: IOMMU capability
@@ -1934,117 +1957,67 @@ void iommu_set_fault_handler(struct iommu_domain *domain,
}
EXPORT_SYMBOL_GPL(iommu_set_fault_handler);
-static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops,
- struct device *dev,
- unsigned int type)
+static void iommu_domain_init(struct iommu_domain *domain, unsigned int type,
+ const struct iommu_ops *ops)
{
- struct iommu_domain *domain;
- unsigned int alloc_type = type & IOMMU_DOMAIN_ALLOC_FLAGS;
-
- if (alloc_type == IOMMU_DOMAIN_IDENTITY && ops->identity_domain)
- return ops->identity_domain;
- else if (alloc_type == IOMMU_DOMAIN_BLOCKED && ops->blocked_domain)
- return ops->blocked_domain;
- else if (type & __IOMMU_DOMAIN_PAGING && ops->domain_alloc_paging)
- domain = ops->domain_alloc_paging(dev);
- else if (ops->domain_alloc)
- domain = ops->domain_alloc(alloc_type);
- else
- return ERR_PTR(-EOPNOTSUPP);
-
- /*
- * Many domain_alloc ops now return ERR_PTR, make things easier for the
- * driver by accepting ERR_PTR from all domain_alloc ops instead of
- * having two rules.
- */
- if (IS_ERR(domain))
- return domain;
- if (!domain)
- return ERR_PTR(-ENOMEM);
-
domain->type = type;
domain->owner = ops;
+ if (!domain->ops)
+ domain->ops = ops->default_domain_ops;
+
/*
* If not already set, assume all sizes by default; the driver
* may override this later
*/
if (!domain->pgsize_bitmap)
domain->pgsize_bitmap = ops->pgsize_bitmap;
-
- if (!domain->ops)
- domain->ops = ops->default_domain_ops;
-
- if (iommu_is_dma_domain(domain)) {
- int rc;
-
- rc = iommu_get_dma_cookie(domain);
- if (rc) {
- iommu_domain_free(domain);
- return ERR_PTR(rc);
- }
- }
- return domain;
}
static struct iommu_domain *
-__iommu_group_domain_alloc(struct iommu_group *group, unsigned int type)
-{
- struct device *dev = iommu_group_first_dev(group);
-
- return __iommu_domain_alloc(dev_iommu_ops(dev), dev, type);
-}
-
-static int __iommu_domain_alloc_dev(struct device *dev, void *data)
+__iommu_paging_domain_alloc_flags(struct device *dev, unsigned int type,
+ unsigned int flags)
{
- const struct iommu_ops **ops = data;
+ const struct iommu_ops *ops;
+ struct iommu_domain *domain;
if (!dev_has_iommu(dev))
- return 0;
-
- if (WARN_ONCE(*ops && *ops != dev_iommu_ops(dev),
- "Multiple IOMMU drivers present for bus %s, which the public IOMMU API can't fully support yet. You will still need to disable one or more for this to work, sorry!\n",
- dev_bus_name(dev)))
- return -EBUSY;
-
- *ops = dev_iommu_ops(dev);
- return 0;
-}
+ return ERR_PTR(-ENODEV);
-/*
- * The iommu ops in bus has been retired. Do not use this interface in
- * new drivers.
- */
-struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus)
-{
- const struct iommu_ops *ops = NULL;
- int err = bus_for_each_dev(bus, NULL, &ops, __iommu_domain_alloc_dev);
- struct iommu_domain *domain;
+ ops = dev_iommu_ops(dev);
- if (err || !ops)
- return NULL;
+ if (ops->domain_alloc_paging && !flags)
+ domain = ops->domain_alloc_paging(dev);
+ else if (ops->domain_alloc_user)
+ domain = ops->domain_alloc_user(dev, flags, NULL, NULL);
+ else if (ops->domain_alloc && !flags)
+ domain = ops->domain_alloc(IOMMU_DOMAIN_UNMANAGED);
+ else
+ return ERR_PTR(-EOPNOTSUPP);
- domain = __iommu_domain_alloc(ops, NULL, IOMMU_DOMAIN_UNMANAGED);
if (IS_ERR(domain))
- return NULL;
+ return domain;
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+
+ iommu_domain_init(domain, type, ops);
return domain;
}
-EXPORT_SYMBOL_GPL(iommu_domain_alloc);
/**
- * iommu_paging_domain_alloc() - Allocate a paging domain
+ * iommu_paging_domain_alloc_flags() - Allocate a paging domain
* @dev: device for which the domain is allocated
+ * @flags: Bitmap of iommufd_hwpt_alloc_flags
*
* Allocate a paging domain which will be managed by a kernel driver. Return
- * allocated domain if successful, or a ERR pointer for failure.
+ * allocated domain if successful, or an ERR pointer for failure.
*/
-struct iommu_domain *iommu_paging_domain_alloc(struct device *dev)
+struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev,
+ unsigned int flags)
{
- if (!dev_has_iommu(dev))
- return ERR_PTR(-ENODEV);
-
- return __iommu_domain_alloc(dev_iommu_ops(dev), dev, IOMMU_DOMAIN_UNMANAGED);
+ return __iommu_paging_domain_alloc_flags(dev,
+ IOMMU_DOMAIN_UNMANAGED, flags);
}
-EXPORT_SYMBOL_GPL(iommu_paging_domain_alloc);
+EXPORT_SYMBOL_GPL(iommu_paging_domain_alloc_flags);
void iommu_domain_free(struct iommu_domain *domain)
{
@@ -2216,8 +2189,8 @@ EXPORT_SYMBOL_GPL(iommu_attach_group);
/**
* iommu_group_replace_domain - replace the domain that a group is attached to
- * @new_domain: new IOMMU domain to replace with
* @group: IOMMU group that will be attached to the new domain
+ * @new_domain: new IOMMU domain to replace with
*
* This API allows the group to switch domains without being forced to go to
* the blocking domain in-between.
@@ -2586,6 +2559,20 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
return unmapped;
}
+/**
+ * iommu_unmap() - Remove mappings from a range of IOVA
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @size: Length of the range starting from @iova
+ *
+ * iommu_unmap() will remove a translation created by iommu_map(). It cannot
+ * subdivide a mapping created by iommu_map(), so it should be called with IOVA
+ * ranges that match what was passed to iommu_map(). The range can aggregate
+ * contiguous iommu_map() calls so long as no individual range is split.
+ *
+ * Returns: Number of bytes of IOVA unmapped. @iova plus the returned value
+ * is the point where unmapping stopped.
+ */
size_t iommu_unmap(struct iommu_domain *domain,
unsigned long iova, size_t size)
{
@@ -2965,6 +2952,14 @@ static int iommu_setup_default_domain(struct iommu_group *group,
if (group->default_domain == dom)
return 0;
+ if (iommu_is_dma_domain(dom)) {
+ ret = iommu_get_dma_cookie(dom);
+ if (ret) {
+ iommu_domain_free(dom);
+ return ret;
+ }
+ }
+
/*
* IOMMU_RESV_DIRECT and IOMMU_RESV_DIRECT_RELAXABLE regions must be
* mapped before their device is attached, in order to guarantee
@@ -3152,22 +3147,25 @@ void iommu_device_unuse_default_domain(struct device *dev)
static int __iommu_group_alloc_blocking_domain(struct iommu_group *group)
{
+ struct device *dev = iommu_group_first_dev(group);
+ const struct iommu_ops *ops = dev_iommu_ops(dev);
struct iommu_domain *domain;
if (group->blocking_domain)
return 0;
- domain = __iommu_group_domain_alloc(group, IOMMU_DOMAIN_BLOCKED);
- if (IS_ERR(domain)) {
- /*
- * For drivers that do not yet understand IOMMU_DOMAIN_BLOCKED
- * create an empty domain instead.
- */
- domain = __iommu_group_domain_alloc(group,
- IOMMU_DOMAIN_UNMANAGED);
- if (IS_ERR(domain))
- return PTR_ERR(domain);
+ if (ops->blocked_domain) {
+ group->blocking_domain = ops->blocked_domain;
+ return 0;
}
+
+ /*
+ * For drivers that do not yet understand IOMMU_DOMAIN_BLOCKED create an
+ * empty PAGING domain instead.
+ */
+ domain = iommu_paging_domain_alloc(dev);
+ if (IS_ERR(domain))
+ return PTR_ERR(domain);
group->blocking_domain = domain;
return 0;
}
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 16c6adff3eb7..a28197b88c92 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -506,7 +506,7 @@ __adjust_overlap_range(struct iova *iova,
* reserve_iova - reserves an iova in the given range
* @iovad: - iova domain pointer
* @pfn_lo: - lower page frame address
- * @pfn_hi:- higher pfn adderss
+ * @pfn_hi:- higher pfn address
* This function allocates reserves the address range from pfn_lo to pfn_hi so
* that this address is not dished out as part of alloc_iova.
*/
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index 6a2707fe7a78..c45313c43b9e 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -1599,7 +1599,7 @@ static const unsigned int mt8186_larb_region_msk[MT8192_MULTI_REGION_NR_MAX][MTK
static const struct mtk_iommu_plat_data mt8186_data_mm = {
.m4u_plat = M4U_MT8186,
.flags = HAS_BCLK | HAS_SUB_COMM_2BITS | OUT_ORDER_WR_EN |
- WR_THROT_EN | IOVA_34_EN | MTK_IOMMU_TYPE_MM,
+ WR_THROT_EN | IOVA_34_EN | MTK_IOMMU_TYPE_MM | PGTABLE_PA_35_EN,
.larbid_remap = {{0}, {1, MTK_INVALID_LARBID, 8}, {4}, {7}, {2}, {9, 11, 19, 20},
{MTK_INVALID_LARBID, 14, 16},
{MTK_INVALID_LARBID, 13, MTK_INVALID_LARBID, 17}},
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index c9528065a59a..3f72aef8bd5b 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -1230,25 +1230,24 @@ static int omap_iommu_probe(struct platform_device *pdev)
if (err)
return err;
- err = iommu_device_register(&obj->iommu, &omap_iommu_ops, &pdev->dev);
- if (err)
- goto out_sysfs;
obj->has_iommu_driver = true;
}
+ err = iommu_device_register(&obj->iommu, &omap_iommu_ops, &pdev->dev);
+ if (err)
+ goto out_sysfs;
+
pm_runtime_enable(obj->dev);
omap_iommu_debugfs_add(obj);
dev_info(&pdev->dev, "%s registered\n", obj->name);
- /* Re-probe bus to probe device attached to this IOMMU */
- bus_iommu_probe(&platform_bus_type);
-
return 0;
out_sysfs:
- iommu_device_sysfs_remove(&obj->iommu);
+ if (obj->has_iommu_driver)
+ iommu_device_sysfs_remove(&obj->iommu);
return err;
}
@@ -1256,10 +1255,10 @@ static void omap_iommu_remove(struct platform_device *pdev)
{
struct omap_iommu *obj = platform_get_drvdata(pdev);
- if (obj->has_iommu_driver) {
+ if (obj->has_iommu_driver)
iommu_device_sysfs_remove(&obj->iommu);
- iommu_device_unregister(&obj->iommu);
- }
+
+ iommu_device_unregister(&obj->iommu);
omap_iommu_debugfs_remove(obj);
@@ -1723,12 +1722,19 @@ static void omap_iommu_release_device(struct device *dev)
}
+static int omap_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args)
+{
+ /* TODO: collect args->np to save re-parsing in probe above */
+ return 0;
+}
+
static const struct iommu_ops omap_iommu_ops = {
.identity_domain = &omap_iommu_identity_domain,
.domain_alloc_paging = omap_iommu_domain_alloc_paging,
.probe_device = omap_iommu_probe_device,
.release_device = omap_iommu_release_device,
.device_group = generic_single_device_group,
+ .of_xlate = omap_iommu_of_xlate,
.pgsize_bitmap = OMAP_IOMMU_PGSIZES,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = omap_iommu_attach_dev,
diff --git a/drivers/iommu/riscv/Kconfig b/drivers/iommu/riscv/Kconfig
new file mode 100644
index 000000000000..c071816f59a6
--- /dev/null
+++ b/drivers/iommu/riscv/Kconfig
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# RISC-V IOMMU support
+
+config RISCV_IOMMU
+ bool "RISC-V IOMMU Support"
+ depends on RISCV && 64BIT
+ default y
+ select IOMMU_API
+ help
+ Support for implementations of the RISC-V IOMMU architecture that
+ complements the RISC-V MMU capabilities, providing similar address
+ translation and protection functions for accesses from I/O devices.
+
+ Say Y here if your SoC includes an IOMMU device implementing
+ the RISC-V IOMMU architecture.
+
+config RISCV_IOMMU_PCI
+ def_bool y if RISCV_IOMMU && PCI_MSI
+ help
+ Support for the PCIe implementation of RISC-V IOMMU architecture.
diff --git a/drivers/iommu/riscv/Makefile b/drivers/iommu/riscv/Makefile
new file mode 100644
index 000000000000..f54c9ed17d41
--- /dev/null
+++ b/drivers/iommu/riscv/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-platform.o
+obj-$(CONFIG_RISCV_IOMMU_PCI) += iommu-pci.o
diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h
new file mode 100644
index 000000000000..98daf0e1a306
--- /dev/null
+++ b/drivers/iommu/riscv/iommu-bits.h
@@ -0,0 +1,784 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright © 2022-2024 Rivos Inc.
+ * Copyright © 2023 FORTH-ICS/CARV
+ * Copyright © 2023 RISC-V IOMMU Task Group
+ *
+ * RISC-V IOMMU - Register Layout and Data Structures.
+ *
+ * Based on the 'RISC-V IOMMU Architecture Specification', Version 1.0
+ * Published at https://github.com/riscv-non-isa/riscv-iommu
+ *
+ */
+
+#ifndef _RISCV_IOMMU_BITS_H_
+#define _RISCV_IOMMU_BITS_H_
+
+#include <linux/types.h>
+#include <linux/bitfield.h>
+#include <linux/bits.h>
+
+/*
+ * Chapter 5: Memory Mapped register interface
+ */
+
+/* Common field positions */
+#define RISCV_IOMMU_PPN_FIELD GENMASK_ULL(53, 10)
+#define RISCV_IOMMU_QUEUE_LOG2SZ_FIELD GENMASK_ULL(4, 0)
+#define RISCV_IOMMU_QUEUE_INDEX_FIELD GENMASK_ULL(31, 0)
+#define RISCV_IOMMU_QUEUE_ENABLE BIT(0)
+#define RISCV_IOMMU_QUEUE_INTR_ENABLE BIT(1)
+#define RISCV_IOMMU_QUEUE_MEM_FAULT BIT(8)
+#define RISCV_IOMMU_QUEUE_OVERFLOW BIT(9)
+#define RISCV_IOMMU_QUEUE_ACTIVE BIT(16)
+#define RISCV_IOMMU_QUEUE_BUSY BIT(17)
+
+#define RISCV_IOMMU_ATP_PPN_FIELD GENMASK_ULL(43, 0)
+#define RISCV_IOMMU_ATP_MODE_FIELD GENMASK_ULL(63, 60)
+
+/* 5.3 IOMMU Capabilities (64bits) */
+#define RISCV_IOMMU_REG_CAPABILITIES 0x0000
+#define RISCV_IOMMU_CAPABILITIES_VERSION GENMASK_ULL(7, 0)
+#define RISCV_IOMMU_CAPABILITIES_SV32 BIT_ULL(8)
+#define RISCV_IOMMU_CAPABILITIES_SV39 BIT_ULL(9)
+#define RISCV_IOMMU_CAPABILITIES_SV48 BIT_ULL(10)
+#define RISCV_IOMMU_CAPABILITIES_SV57 BIT_ULL(11)
+#define RISCV_IOMMU_CAPABILITIES_SVPBMT BIT_ULL(15)
+#define RISCV_IOMMU_CAPABILITIES_SV32X4 BIT_ULL(16)
+#define RISCV_IOMMU_CAPABILITIES_SV39X4 BIT_ULL(17)
+#define RISCV_IOMMU_CAPABILITIES_SV48X4 BIT_ULL(18)
+#define RISCV_IOMMU_CAPABILITIES_SV57X4 BIT_ULL(19)
+#define RISCV_IOMMU_CAPABILITIES_AMO_MRIF BIT_ULL(21)
+#define RISCV_IOMMU_CAPABILITIES_MSI_FLAT BIT_ULL(22)
+#define RISCV_IOMMU_CAPABILITIES_MSI_MRIF BIT_ULL(23)
+#define RISCV_IOMMU_CAPABILITIES_AMO_HWAD BIT_ULL(24)
+#define RISCV_IOMMU_CAPABILITIES_ATS BIT_ULL(25)
+#define RISCV_IOMMU_CAPABILITIES_T2GPA BIT_ULL(26)
+#define RISCV_IOMMU_CAPABILITIES_END BIT_ULL(27)
+#define RISCV_IOMMU_CAPABILITIES_IGS GENMASK_ULL(29, 28)
+#define RISCV_IOMMU_CAPABILITIES_HPM BIT_ULL(30)
+#define RISCV_IOMMU_CAPABILITIES_DBG BIT_ULL(31)
+#define RISCV_IOMMU_CAPABILITIES_PAS GENMASK_ULL(37, 32)
+#define RISCV_IOMMU_CAPABILITIES_PD8 BIT_ULL(38)
+#define RISCV_IOMMU_CAPABILITIES_PD17 BIT_ULL(39)
+#define RISCV_IOMMU_CAPABILITIES_PD20 BIT_ULL(40)
+
+/**
+ * enum riscv_iommu_igs_settings - Interrupt Generation Support Settings
+ * @RISCV_IOMMU_CAPABILITIES_IGS_MSI: IOMMU supports only MSI generation
+ * @RISCV_IOMMU_CAPABILITIES_IGS_WSI: IOMMU supports only Wired-Signaled interrupt
+ * @RISCV_IOMMU_CAPABILITIES_IGS_BOTH: IOMMU supports both MSI and WSI generation
+ * @RISCV_IOMMU_CAPABILITIES_IGS_RSRV: Reserved for standard use
+ */
+enum riscv_iommu_igs_settings {
+ RISCV_IOMMU_CAPABILITIES_IGS_MSI = 0,
+ RISCV_IOMMU_CAPABILITIES_IGS_WSI = 1,
+ RISCV_IOMMU_CAPABILITIES_IGS_BOTH = 2,
+ RISCV_IOMMU_CAPABILITIES_IGS_RSRV = 3
+};
+
+/* 5.4 Features control register (32bits) */
+#define RISCV_IOMMU_REG_FCTL 0x0008
+#define RISCV_IOMMU_FCTL_BE BIT(0)
+#define RISCV_IOMMU_FCTL_WSI BIT(1)
+#define RISCV_IOMMU_FCTL_GXL BIT(2)
+
+/* 5.5 Device-directory-table pointer (64bits) */
+#define RISCV_IOMMU_REG_DDTP 0x0010
+#define RISCV_IOMMU_DDTP_IOMMU_MODE GENMASK_ULL(3, 0)
+#define RISCV_IOMMU_DDTP_BUSY BIT_ULL(4)
+#define RISCV_IOMMU_DDTP_PPN RISCV_IOMMU_PPN_FIELD
+
+/**
+ * enum riscv_iommu_ddtp_modes - IOMMU translation modes
+ * @RISCV_IOMMU_DDTP_IOMMU_MODE_OFF: No inbound transactions allowed
+ * @RISCV_IOMMU_DDTP_IOMMU_MODE_BARE: Pass-through mode
+ * @RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL: One-level DDT
+ * @RISCV_IOMMU_DDTP_IOMMU_MODE_2LVL: Two-level DDT
+ * @RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL: Three-level DDT
+ * @RISCV_IOMMU_DDTP_IOMMU_MODE_MAX: Max value allowed by specification
+ */
+enum riscv_iommu_ddtp_modes {
+ RISCV_IOMMU_DDTP_IOMMU_MODE_OFF = 0,
+ RISCV_IOMMU_DDTP_IOMMU_MODE_BARE = 1,
+ RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL = 2,
+ RISCV_IOMMU_DDTP_IOMMU_MODE_2LVL = 3,
+ RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL = 4,
+ RISCV_IOMMU_DDTP_IOMMU_MODE_MAX = 4
+};
+
+/* 5.6 Command Queue Base (64bits) */
+#define RISCV_IOMMU_REG_CQB 0x0018
+#define RISCV_IOMMU_CQB_ENTRIES RISCV_IOMMU_QUEUE_LOG2SZ_FIELD
+#define RISCV_IOMMU_CQB_PPN RISCV_IOMMU_PPN_FIELD
+
+/* 5.7 Command Queue head (32bits) */
+#define RISCV_IOMMU_REG_CQH 0x0020
+#define RISCV_IOMMU_CQH_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD
+
+/* 5.8 Command Queue tail (32bits) */
+#define RISCV_IOMMU_REG_CQT 0x0024
+#define RISCV_IOMMU_CQT_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD
+
+/* 5.9 Fault Queue Base (64bits) */
+#define RISCV_IOMMU_REG_FQB 0x0028
+#define RISCV_IOMMU_FQB_ENTRIES RISCV_IOMMU_QUEUE_LOG2SZ_FIELD
+#define RISCV_IOMMU_FQB_PPN RISCV_IOMMU_PPN_FIELD
+
+/* 5.10 Fault Queue Head (32bits) */
+#define RISCV_IOMMU_REG_FQH 0x0030
+#define RISCV_IOMMU_FQH_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD
+
+/* 5.11 Fault Queue tail (32bits) */
+#define RISCV_IOMMU_REG_FQT 0x0034
+#define RISCV_IOMMU_FQT_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD
+
+/* 5.12 Page Request Queue base (64bits) */
+#define RISCV_IOMMU_REG_PQB 0x0038
+#define RISCV_IOMMU_PQB_ENTRIES RISCV_IOMMU_QUEUE_LOG2SZ_FIELD
+#define RISCV_IOMMU_PQB_PPN RISCV_IOMMU_PPN_FIELD
+
+/* 5.13 Page Request Queue head (32bits) */
+#define RISCV_IOMMU_REG_PQH 0x0040
+#define RISCV_IOMMU_PQH_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD
+
+/* 5.14 Page Request Queue tail (32bits) */
+#define RISCV_IOMMU_REG_PQT 0x0044
+#define RISCV_IOMMU_PQT_INDEX_MASK RISCV_IOMMU_QUEUE_INDEX_FIELD
+
+/* 5.15 Command Queue CSR (32bits) */
+#define RISCV_IOMMU_REG_CQCSR 0x0048
+#define RISCV_IOMMU_CQCSR_CQEN RISCV_IOMMU_QUEUE_ENABLE
+#define RISCV_IOMMU_CQCSR_CIE RISCV_IOMMU_QUEUE_INTR_ENABLE
+#define RISCV_IOMMU_CQCSR_CQMF RISCV_IOMMU_QUEUE_MEM_FAULT
+#define RISCV_IOMMU_CQCSR_CMD_TO BIT(9)
+#define RISCV_IOMMU_CQCSR_CMD_ILL BIT(10)
+#define RISCV_IOMMU_CQCSR_FENCE_W_IP BIT(11)
+#define RISCV_IOMMU_CQCSR_CQON RISCV_IOMMU_QUEUE_ACTIVE
+#define RISCV_IOMMU_CQCSR_BUSY RISCV_IOMMU_QUEUE_BUSY
+
+/* 5.16 Fault Queue CSR (32bits) */
+#define RISCV_IOMMU_REG_FQCSR 0x004C
+#define RISCV_IOMMU_FQCSR_FQEN RISCV_IOMMU_QUEUE_ENABLE
+#define RISCV_IOMMU_FQCSR_FIE RISCV_IOMMU_QUEUE_INTR_ENABLE
+#define RISCV_IOMMU_FQCSR_FQMF RISCV_IOMMU_QUEUE_MEM_FAULT
+#define RISCV_IOMMU_FQCSR_FQOF RISCV_IOMMU_QUEUE_OVERFLOW
+#define RISCV_IOMMU_FQCSR_FQON RISCV_IOMMU_QUEUE_ACTIVE
+#define RISCV_IOMMU_FQCSR_BUSY RISCV_IOMMU_QUEUE_BUSY
+
+/* 5.17 Page Request Queue CSR (32bits) */
+#define RISCV_IOMMU_REG_PQCSR 0x0050
+#define RISCV_IOMMU_PQCSR_PQEN RISCV_IOMMU_QUEUE_ENABLE
+#define RISCV_IOMMU_PQCSR_PIE RISCV_IOMMU_QUEUE_INTR_ENABLE
+#define RISCV_IOMMU_PQCSR_PQMF RISCV_IOMMU_QUEUE_MEM_FAULT
+#define RISCV_IOMMU_PQCSR_PQOF RISCV_IOMMU_QUEUE_OVERFLOW
+#define RISCV_IOMMU_PQCSR_PQON RISCV_IOMMU_QUEUE_ACTIVE
+#define RISCV_IOMMU_PQCSR_BUSY RISCV_IOMMU_QUEUE_BUSY
+
+/* 5.18 Interrupt Pending Status (32bits) */
+#define RISCV_IOMMU_REG_IPSR 0x0054
+
+#define RISCV_IOMMU_INTR_CQ 0
+#define RISCV_IOMMU_INTR_FQ 1
+#define RISCV_IOMMU_INTR_PM 2
+#define RISCV_IOMMU_INTR_PQ 3
+#define RISCV_IOMMU_INTR_COUNT 4
+
+#define RISCV_IOMMU_IPSR_CIP BIT(RISCV_IOMMU_INTR_CQ)
+#define RISCV_IOMMU_IPSR_FIP BIT(RISCV_IOMMU_INTR_FQ)
+#define RISCV_IOMMU_IPSR_PMIP BIT(RISCV_IOMMU_INTR_PM)
+#define RISCV_IOMMU_IPSR_PIP BIT(RISCV_IOMMU_INTR_PQ)
+
+/* 5.19 Performance monitoring counter overflow status (32bits) */
+#define RISCV_IOMMU_REG_IOCOUNTOVF 0x0058
+#define RISCV_IOMMU_IOCOUNTOVF_CY BIT(0)
+#define RISCV_IOMMU_IOCOUNTOVF_HPM GENMASK_ULL(31, 1)
+
+/* 5.20 Performance monitoring counter inhibits (32bits) */
+#define RISCV_IOMMU_REG_IOCOUNTINH 0x005C
+#define RISCV_IOMMU_IOCOUNTINH_CY BIT(0)
+#define RISCV_IOMMU_IOCOUNTINH_HPM GENMASK(31, 1)
+
+/* 5.21 Performance monitoring cycles counter (64bits) */
+#define RISCV_IOMMU_REG_IOHPMCYCLES 0x0060
+#define RISCV_IOMMU_IOHPMCYCLES_COUNTER GENMASK_ULL(62, 0)
+#define RISCV_IOMMU_IOHPMCYCLES_OF BIT_ULL(63)
+
+/* 5.22 Performance monitoring event counters (31 * 64bits) */
+#define RISCV_IOMMU_REG_IOHPMCTR_BASE 0x0068
+#define RISCV_IOMMU_REG_IOHPMCTR(_n) (RISCV_IOMMU_REG_IOHPMCTR_BASE + ((_n) * 0x8))
+
+/* 5.23 Performance monitoring event selectors (31 * 64bits) */
+#define RISCV_IOMMU_REG_IOHPMEVT_BASE 0x0160
+#define RISCV_IOMMU_REG_IOHPMEVT(_n) (RISCV_IOMMU_REG_IOHPMEVT_BASE + ((_n) * 0x8))
+#define RISCV_IOMMU_IOHPMEVT_EVENTID GENMASK_ULL(14, 0)
+#define RISCV_IOMMU_IOHPMEVT_DMASK BIT_ULL(15)
+#define RISCV_IOMMU_IOHPMEVT_PID_PSCID GENMASK_ULL(35, 16)
+#define RISCV_IOMMU_IOHPMEVT_DID_GSCID GENMASK_ULL(59, 36)
+#define RISCV_IOMMU_IOHPMEVT_PV_PSCV BIT_ULL(60)
+#define RISCV_IOMMU_IOHPMEVT_DV_GSCV BIT_ULL(61)
+#define RISCV_IOMMU_IOHPMEVT_IDT BIT_ULL(62)
+#define RISCV_IOMMU_IOHPMEVT_OF BIT_ULL(63)
+
+/* Number of defined performance-monitoring event selectors */
+#define RISCV_IOMMU_IOHPMEVT_CNT 31
+
+/**
+ * enum riscv_iommu_hpmevent_id - Performance-monitoring event identifier
+ *
+ * @RISCV_IOMMU_HPMEVENT_INVALID: Invalid event, do not count
+ * @RISCV_IOMMU_HPMEVENT_URQ: Untranslated requests
+ * @RISCV_IOMMU_HPMEVENT_TRQ: Translated requests
+ * @RISCV_IOMMU_HPMEVENT_ATS_RQ: ATS translation requests
+ * @RISCV_IOMMU_HPMEVENT_TLB_MISS: TLB misses
+ * @RISCV_IOMMU_HPMEVENT_DD_WALK: Device directory walks
+ * @RISCV_IOMMU_HPMEVENT_PD_WALK: Process directory walks
+ * @RISCV_IOMMU_HPMEVENT_S_VS_WALKS: First-stage page table walks
+ * @RISCV_IOMMU_HPMEVENT_G_WALKS: Second-stage page table walks
+ * @RISCV_IOMMU_HPMEVENT_MAX: Value to denote maximum Event IDs
+ */
+enum riscv_iommu_hpmevent_id {
+ RISCV_IOMMU_HPMEVENT_INVALID = 0,
+ RISCV_IOMMU_HPMEVENT_URQ = 1,
+ RISCV_IOMMU_HPMEVENT_TRQ = 2,
+ RISCV_IOMMU_HPMEVENT_ATS_RQ = 3,
+ RISCV_IOMMU_HPMEVENT_TLB_MISS = 4,
+ RISCV_IOMMU_HPMEVENT_DD_WALK = 5,
+ RISCV_IOMMU_HPMEVENT_PD_WALK = 6,
+ RISCV_IOMMU_HPMEVENT_S_VS_WALKS = 7,
+ RISCV_IOMMU_HPMEVENT_G_WALKS = 8,
+ RISCV_IOMMU_HPMEVENT_MAX = 9
+};
+
+/* 5.24 Translation request IOVA (64bits) */
+#define RISCV_IOMMU_REG_TR_REQ_IOVA 0x0258
+#define RISCV_IOMMU_TR_REQ_IOVA_VPN GENMASK_ULL(63, 12)
+
+/* 5.25 Translation request control (64bits) */
+#define RISCV_IOMMU_REG_TR_REQ_CTL 0x0260
+#define RISCV_IOMMU_TR_REQ_CTL_GO_BUSY BIT_ULL(0)
+#define RISCV_IOMMU_TR_REQ_CTL_PRIV BIT_ULL(1)
+#define RISCV_IOMMU_TR_REQ_CTL_EXE BIT_ULL(2)
+#define RISCV_IOMMU_TR_REQ_CTL_NW BIT_ULL(3)
+#define RISCV_IOMMU_TR_REQ_CTL_PID GENMASK_ULL(31, 12)
+#define RISCV_IOMMU_TR_REQ_CTL_PV BIT_ULL(32)
+#define RISCV_IOMMU_TR_REQ_CTL_DID GENMASK_ULL(63, 40)
+
+/* 5.26 Translation request response (64bits) */
+#define RISCV_IOMMU_REG_TR_RESPONSE 0x0268
+#define RISCV_IOMMU_TR_RESPONSE_FAULT BIT_ULL(0)
+#define RISCV_IOMMU_TR_RESPONSE_PBMT GENMASK_ULL(8, 7)
+#define RISCV_IOMMU_TR_RESPONSE_SZ BIT_ULL(9)
+#define RISCV_IOMMU_TR_RESPONSE_PPN RISCV_IOMMU_PPN_FIELD
+
+/* 5.27 Interrupt cause to vector (64bits) */
+#define RISCV_IOMMU_REG_ICVEC 0x02F8
+#define RISCV_IOMMU_ICVEC_CIV GENMASK_ULL(3, 0)
+#define RISCV_IOMMU_ICVEC_FIV GENMASK_ULL(7, 4)
+#define RISCV_IOMMU_ICVEC_PMIV GENMASK_ULL(11, 8)
+#define RISCV_IOMMU_ICVEC_PIV GENMASK_ULL(15, 12)
+
+/* 5.28 MSI Configuration table (32 * 64bits) */
+#define RISCV_IOMMU_REG_MSI_CFG_TBL 0x0300
+#define RISCV_IOMMU_REG_MSI_CFG_TBL_ADDR(_n) \
+ (RISCV_IOMMU_REG_MSI_CFG_TBL + ((_n) * 0x10))
+#define RISCV_IOMMU_MSI_CFG_TBL_ADDR GENMASK_ULL(55, 2)
+#define RISCV_IOMMU_REG_MSI_CFG_TBL_DATA(_n) \
+ (RISCV_IOMMU_REG_MSI_CFG_TBL + ((_n) * 0x10) + 0x08)
+#define RISCV_IOMMU_MSI_CFG_TBL_DATA GENMASK_ULL(31, 0)
+#define RISCV_IOMMU_REG_MSI_CFG_TBL_CTRL(_n) \
+ (RISCV_IOMMU_REG_MSI_CFG_TBL + ((_n) * 0x10) + 0x0C)
+#define RISCV_IOMMU_MSI_CFG_TBL_CTRL_M BIT_ULL(0)
+
+#define RISCV_IOMMU_REG_SIZE 0x1000
+
+/*
+ * Chapter 2: Data structures
+ */
+
+/*
+ * Device Directory Table macros for non-leaf nodes
+ */
+#define RISCV_IOMMU_DDTE_V BIT_ULL(0)
+#define RISCV_IOMMU_DDTE_PPN RISCV_IOMMU_PPN_FIELD
+
+/**
+ * struct riscv_iommu_dc - Device Context
+ * @tc: Translation Control
+ * @iohgatp: I/O Hypervisor guest address translation and protection
+ * (Second stage context)
+ * @ta: Translation Attributes
+ * @fsc: First stage context
+ * @msiptp: MSI page table pointer
+ * @msi_addr_mask: MSI address mask
+ * @msi_addr_pattern: MSI address pattern
+ * @_reserved: Reserved for future use, padding
+ *
+ * This structure is used for leaf nodes of the Device Directory Table.
+ * When RISCV_IOMMU_CAPABILITIES_MSI_FLAT is not set, the bottom 4 fields
+ * are not present and are skipped with pointer arithmetic to avoid
+ * casting; see riscv_iommu_get_dc().
+ * See section 2.1 for more details
+ */
+struct riscv_iommu_dc {
+ u64 tc;
+ u64 iohgatp;
+ u64 ta;
+ u64 fsc;
+ u64 msiptp;
+ u64 msi_addr_mask;
+ u64 msi_addr_pattern;
+ u64 _reserved;
+};
+
+/* Translation control fields */
+#define RISCV_IOMMU_DC_TC_V BIT_ULL(0)
+#define RISCV_IOMMU_DC_TC_EN_ATS BIT_ULL(1)
+#define RISCV_IOMMU_DC_TC_EN_PRI BIT_ULL(2)
+#define RISCV_IOMMU_DC_TC_T2GPA BIT_ULL(3)
+#define RISCV_IOMMU_DC_TC_DTF BIT_ULL(4)
+#define RISCV_IOMMU_DC_TC_PDTV BIT_ULL(5)
+#define RISCV_IOMMU_DC_TC_PRPR BIT_ULL(6)
+#define RISCV_IOMMU_DC_TC_GADE BIT_ULL(7)
+#define RISCV_IOMMU_DC_TC_SADE BIT_ULL(8)
+#define RISCV_IOMMU_DC_TC_DPE BIT_ULL(9)
+#define RISCV_IOMMU_DC_TC_SBE BIT_ULL(10)
+#define RISCV_IOMMU_DC_TC_SXL BIT_ULL(11)
+
+/* Second-stage (aka G-stage) context fields */
+#define RISCV_IOMMU_DC_IOHGATP_PPN RISCV_IOMMU_ATP_PPN_FIELD
+#define RISCV_IOMMU_DC_IOHGATP_GSCID GENMASK_ULL(59, 44)
+#define RISCV_IOMMU_DC_IOHGATP_MODE RISCV_IOMMU_ATP_MODE_FIELD
+
+/**
+ * enum riscv_iommu_dc_iohgatp_modes - Guest address translation/protection modes
+ * @RISCV_IOMMU_DC_IOHGATP_MODE_BARE: No translation/protection
+ * @RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4: Sv32x4 (2-bit extension of Sv32), when fctl.GXL == 1
+ * @RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4: Sv39x4 (2-bit extension of Sv39), when fctl.GXL == 0
+ * @RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4: Sv48x4 (2-bit extension of Sv48), when fctl.GXL == 0
+ * @RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4: Sv57x4 (2-bit extension of Sv57), when fctl.GXL == 0
+ */
+enum riscv_iommu_dc_iohgatp_modes {
+ RISCV_IOMMU_DC_IOHGATP_MODE_BARE = 0,
+ RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4 = 8,
+ RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4 = 8,
+ RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4 = 9,
+ RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4 = 10
+};
+
+/* Translation attributes fields */
+#define RISCV_IOMMU_DC_TA_PSCID GENMASK_ULL(31, 12)
+
+/* First-stage context fields */
+#define RISCV_IOMMU_DC_FSC_PPN RISCV_IOMMU_ATP_PPN_FIELD
+#define RISCV_IOMMU_DC_FSC_MODE RISCV_IOMMU_ATP_MODE_FIELD
+
+/**
+ * enum riscv_iommu_dc_fsc_atp_modes - First stage address translation/protection modes
+ * @RISCV_IOMMU_DC_FSC_MODE_BARE: No translation/protection
+ * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32: Sv32, when dc.tc.SXL == 1
+ * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39: Sv39, when dc.tc.SXL == 0
+ * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48: Sv48, when dc.tc.SXL == 0
+ * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57: Sv57, when dc.tc.SXL == 0
+ * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8: 1lvl PDT, 8bit process ids
+ * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17: 2lvl PDT, 17bit process ids
+ * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20: 3lvl PDT, 20bit process ids
+ *
+ * FSC holds IOSATP when RISCV_IOMMU_DC_TC_PDTV is 0 and PDTP otherwise.
+ * IOSATP controls the first stage address translation (same as the satp register on
+ * the RISC-V MMU), and PDTP holds the process directory table, used to select a
+ * first stage page table based on a process id (for devices that support multiple
+ * process ids).
+ */
+enum riscv_iommu_dc_fsc_atp_modes {
+ RISCV_IOMMU_DC_FSC_MODE_BARE = 0,
+ RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32 = 8,
+ RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 = 8,
+ RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48 = 9,
+ RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57 = 10,
+ RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8 = 1,
+ RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17 = 2,
+ RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20 = 3
+};
+
+/* MSI page table pointer */
+#define RISCV_IOMMU_DC_MSIPTP_PPN RISCV_IOMMU_ATP_PPN_FIELD
+#define RISCV_IOMMU_DC_MSIPTP_MODE RISCV_IOMMU_ATP_MODE_FIELD
+#define RISCV_IOMMU_DC_MSIPTP_MODE_OFF 0
+#define RISCV_IOMMU_DC_MSIPTP_MODE_FLAT 1
+
+/* MSI address mask */
+#define RISCV_IOMMU_DC_MSI_ADDR_MASK GENMASK_ULL(51, 0)
+
+/* MSI address pattern */
+#define RISCV_IOMMU_DC_MSI_PATTERN GENMASK_ULL(51, 0)
+
+/**
+ * struct riscv_iommu_pc - Process Context
+ * @ta: Translation Attributes
+ * @fsc: First stage context
+ *
+ * This structure is used for leaf nodes on the Process Directory Table
+ * See section 2.3 for more details
+ */
+struct riscv_iommu_pc {
+ u64 ta;
+ u64 fsc;
+};
+
+/* Translation attributes fields */
+#define RISCV_IOMMU_PC_TA_V BIT_ULL(0)
+#define RISCV_IOMMU_PC_TA_ENS BIT_ULL(1)
+#define RISCV_IOMMU_PC_TA_SUM BIT_ULL(2)
+#define RISCV_IOMMU_PC_TA_PSCID GENMASK_ULL(31, 12)
+
+/* First stage context fields */
+#define RISCV_IOMMU_PC_FSC_PPN RISCV_IOMMU_ATP_PPN_FIELD
+#define RISCV_IOMMU_PC_FSC_MODE RISCV_IOMMU_ATP_MODE_FIELD
+
+/*
+ * Chapter 3: In-memory queue interface
+ */
+
+/**
+ * struct riscv_iommu_command - Generic IOMMU command structure
+ * @dword0: Includes the opcode and the function identifier
+ * @dword1: Opcode specific data
+ *
+ * The commands are interpreted as two 64bit fields, where the first
+ * 7bits of the first field are the opcode which also defines the
+ * command's format, followed by a 3bit field that specifies the
+ * function invoked by that command, and the rest is opcode-specific.
+ * This is a generic struct which will be populated differently
+ * according to each command. For more infos on the commands and
+ * the command queue check section 3.1.
+ */
+struct riscv_iommu_command {
+ u64 dword0;
+ u64 dword1;
+};
+
+/* Fields on dword0, common for all commands */
+#define RISCV_IOMMU_CMD_OPCODE GENMASK_ULL(6, 0)
+#define RISCV_IOMMU_CMD_FUNC GENMASK_ULL(9, 7)
+
+/* 3.1.1 IOMMU Page-table cache invalidation */
+/* Fields on dword0 */
+#define RISCV_IOMMU_CMD_IOTINVAL_OPCODE 1
+#define RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA 0
+#define RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA 1
+#define RISCV_IOMMU_CMD_IOTINVAL_AV BIT_ULL(10)
+#define RISCV_IOMMU_CMD_IOTINVAL_PSCID GENMASK_ULL(31, 12)
+#define RISCV_IOMMU_CMD_IOTINVAL_PSCV BIT_ULL(32)
+#define RISCV_IOMMU_CMD_IOTINVAL_GV BIT_ULL(33)
+#define RISCV_IOMMU_CMD_IOTINVAL_GSCID GENMASK_ULL(59, 44)
+/* dword1[61:10] is the 4K-aligned page address */
+#define RISCV_IOMMU_CMD_IOTINVAL_ADDR GENMASK_ULL(61, 10)
+
+/* 3.1.2 IOMMU Command Queue Fences */
+/* Fields on dword0 */
+#define RISCV_IOMMU_CMD_IOFENCE_OPCODE 2
+#define RISCV_IOMMU_CMD_IOFENCE_FUNC_C 0
+#define RISCV_IOMMU_CMD_IOFENCE_AV BIT_ULL(10)
+#define RISCV_IOMMU_CMD_IOFENCE_WSI BIT_ULL(11)
+#define RISCV_IOMMU_CMD_IOFENCE_PR BIT_ULL(12)
+#define RISCV_IOMMU_CMD_IOFENCE_PW BIT_ULL(13)
+#define RISCV_IOMMU_CMD_IOFENCE_DATA GENMASK_ULL(63, 32)
+/* dword1 is the address, word-size aligned and shifted to the right by two bits. */
+
+/* 3.1.3 IOMMU Directory cache invalidation */
+/* Fields on dword0 */
+#define RISCV_IOMMU_CMD_IODIR_OPCODE 3
+#define RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT 0
+#define RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT 1
+#define RISCV_IOMMU_CMD_IODIR_PID GENMASK_ULL(31, 12)
+#define RISCV_IOMMU_CMD_IODIR_DV BIT_ULL(33)
+#define RISCV_IOMMU_CMD_IODIR_DID GENMASK_ULL(63, 40)
+/* dword1 is reserved for standard use */
+
+/* 3.1.4 IOMMU PCIe ATS */
+/* Fields on dword0 */
+#define RISCV_IOMMU_CMD_ATS_OPCODE 4
+#define RISCV_IOMMU_CMD_ATS_FUNC_INVAL 0
+#define RISCV_IOMMU_CMD_ATS_FUNC_PRGR 1
+#define RISCV_IOMMU_CMD_ATS_PID GENMASK_ULL(31, 12)
+#define RISCV_IOMMU_CMD_ATS_PV BIT_ULL(32)
+#define RISCV_IOMMU_CMD_ATS_DSV BIT_ULL(33)
+#define RISCV_IOMMU_CMD_ATS_RID GENMASK_ULL(55, 40)
+#define RISCV_IOMMU_CMD_ATS_DSEG GENMASK_ULL(63, 56)
+/* dword1 is the ATS payload, two different payload types for INVAL and PRGR */
+
+/* ATS.INVAL payload */
+#define RISCV_IOMMU_CMD_ATS_INVAL_G BIT_ULL(0)
+/* Bits 1 - 10 are zeroed */
+#define RISCV_IOMMU_CMD_ATS_INVAL_S BIT_ULL(11)
+#define RISCV_IOMMU_CMD_ATS_INVAL_UADDR GENMASK_ULL(63, 12)
+
+/* ATS.PRGR payload */
+/* Bits 0 - 31 are zeroed */
+#define RISCV_IOMMU_CMD_ATS_PRGR_PRG_INDEX GENMASK_ULL(40, 32)
+/* Bits 41 - 43 are zeroed */
+#define RISCV_IOMMU_CMD_ATS_PRGR_RESP_CODE GENMASK_ULL(47, 44)
+#define RISCV_IOMMU_CMD_ATS_PRGR_DST_ID GENMASK_ULL(63, 48)
+
+/**
+ * struct riscv_iommu_fq_record - Fault/Event Queue Record
+ * @hdr: Header, includes fault/event cause, PID/DID, transaction type etc
+ * @_reserved: Low 32bits for custom use, high 32bits for standard use
+ * @iotval: Transaction-type/cause specific format
+ * @iotval2: Cause specific format
+ *
+ * The fault/event queue reports events and failures raised when
+ * processing transactions. Each record is a 32byte structure where
+ * the first dword has a fixed format for providing generic infos
+ * regarding the fault/event, and two more dwords are there for
+ * fault/event-specific information. For more details see section
+ * 3.2.
+ */
+struct riscv_iommu_fq_record {
+ u64 hdr;
+ u64 _reserved;
+ u64 iotval;
+ u64 iotval2;
+};
+
+/* Fields on header */
+#define RISCV_IOMMU_FQ_HDR_CAUSE GENMASK_ULL(11, 0)
+#define RISCV_IOMMU_FQ_HDR_PID GENMASK_ULL(31, 12)
+#define RISCV_IOMMU_FQ_HDR_PV BIT_ULL(32)
+#define RISCV_IOMMU_FQ_HDR_PRIV BIT_ULL(33)
+#define RISCV_IOMMU_FQ_HDR_TTYP GENMASK_ULL(39, 34)
+#define RISCV_IOMMU_FQ_HDR_DID GENMASK_ULL(63, 40)
+
+/**
+ * enum riscv_iommu_fq_causes - Fault/event cause values
+ * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT: Instruction access fault
+ * @RISCV_IOMMU_FQ_CAUSE_RD_ADDR_MISALIGNED: Read address misaligned
+ * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT: Read access fault
+ * @RISCV_IOMMU_FQ_CAUSE_WR_ADDR_MISALIGNED: Write/AMO address misaligned
+ * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT: Write/AMO access fault
+ * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT_S: Instruction page fault
+ * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S: Read page fault
+ * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S: Write/AMO page fault
+ * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT_VS: Instruction guest page fault
+ * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS: Read guest page fault
+ * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS: Write/AMO guest page fault
+ * @RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED: All inbound transactions disallowed
+ * @RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT: DDT entry load access fault
+ * @RISCV_IOMMU_FQ_CAUSE_DDT_INVALID: DDT entry invalid
+ * @RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED: DDT entry misconfigured
+ * @RISCV_IOMMU_FQ_CAUSE_TTYP_BLOCKED: Transaction type disallowed
+ * @RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT: MSI PTE load access fault
+ * @RISCV_IOMMU_FQ_CAUSE_MSI_INVALID: MSI PTE invalid
+ * @RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED: MSI PTE misconfigured
+ * @RISCV_IOMMU_FQ_CAUSE_MRIF_FAULT: MRIF access fault
+ * @RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT: PDT entry load access fault
+ * @RISCV_IOMMU_FQ_CAUSE_PDT_INVALID: PDT entry invalid
+ * @RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED: PDT entry misconfigured
+ * @RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED: DDT data corruption
+ * @RISCV_IOMMU_FQ_CAUSE_PDT_CORRUPTED: PDT data corruption
+ * @RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED: MSI page table data corruption
+ * @RISCV_IOMMU_FQ_CAUSE_MRIF_CORRUIPTED: MRIF data corruption
+ * @RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR: Internal data path error
+ * @RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT: IOMMU MSI write access fault
+ * @RISCV_IOMMU_FQ_CAUSE_PT_CORRUPTED: First/second stage page table data corruption
+ *
+ * Values are on table 11 of the spec, encodings 275 - 2047 are reserved for standard
+ * use, and 2048 - 4095 for custom use.
+ */
+enum riscv_iommu_fq_causes {
+ RISCV_IOMMU_FQ_CAUSE_INST_FAULT = 1,
+ RISCV_IOMMU_FQ_CAUSE_RD_ADDR_MISALIGNED = 4,
+ RISCV_IOMMU_FQ_CAUSE_RD_FAULT = 5,
+ RISCV_IOMMU_FQ_CAUSE_WR_ADDR_MISALIGNED = 6,
+ RISCV_IOMMU_FQ_CAUSE_WR_FAULT = 7,
+ RISCV_IOMMU_FQ_CAUSE_INST_FAULT_S = 12,
+ RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S = 13,
+ RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S = 15,
+ RISCV_IOMMU_FQ_CAUSE_INST_FAULT_VS = 20,
+ RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS = 21,
+ RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS = 23,
+ RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED = 256,
+ RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT = 257,
+ RISCV_IOMMU_FQ_CAUSE_DDT_INVALID = 258,
+ RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED = 259,
+ RISCV_IOMMU_FQ_CAUSE_TTYP_BLOCKED = 260,
+ RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT = 261,
+ RISCV_IOMMU_FQ_CAUSE_MSI_INVALID = 262,
+ RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED = 263,
+ RISCV_IOMMU_FQ_CAUSE_MRIF_FAULT = 264,
+ RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT = 265,
+ RISCV_IOMMU_FQ_CAUSE_PDT_INVALID = 266,
+ RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED = 267,
+ RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED = 268,
+ RISCV_IOMMU_FQ_CAUSE_PDT_CORRUPTED = 269,
+ RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED = 270,
+ RISCV_IOMMU_FQ_CAUSE_MRIF_CORRUIPTED = 271,
+ RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR = 272,
+ RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT = 273,
+ RISCV_IOMMU_FQ_CAUSE_PT_CORRUPTED = 274
+};
+
+/**
+ * enum riscv_iommu_fq_ttypes - Fault/event transaction types
+ * @RISCV_IOMMU_FQ_TTYP_NONE: None. Fault not caused by an inbound transaction.
+ * @RISCV_IOMMU_FQ_TTYP_UADDR_INST_FETCH: Instruction fetch from untranslated address
+ * @RISCV_IOMMU_FQ_TTYP_UADDR_RD: Read from untranslated address
+ * @RISCV_IOMMU_FQ_TTYP_UADDR_WR: Write/AMO to untranslated address
+ * @RISCV_IOMMU_FQ_TTYP_TADDR_INST_FETCH: Instruction fetch from translated address
+ * @RISCV_IOMMU_FQ_TTYP_TADDR_RD: Read from translated address
+ * @RISCV_IOMMU_FQ_TTYP_TADDR_WR: Write/AMO to translated address
+ * @RISCV_IOMMU_FQ_TTYP_PCIE_ATS_REQ: PCIe ATS translation request
+ * @RISCV_IOMMU_FQ_TTYP_PCIE_MSG_REQ: PCIe message request
+ *
+ * Values are on table 12 of the spec, type 4 and 10 - 31 are reserved for standard use
+ * and 32 - 63 for custom use.
+ */
+enum riscv_iommu_fq_ttypes {
+ RISCV_IOMMU_FQ_TTYP_NONE = 0,
+ RISCV_IOMMU_FQ_TTYP_UADDR_INST_FETCH = 1,
+ RISCV_IOMMU_FQ_TTYP_UADDR_RD = 2,
+ RISCV_IOMMU_FQ_TTYP_UADDR_WR = 3,
+ RISCV_IOMMU_FQ_TTYP_TADDR_INST_FETCH = 5,
+ RISCV_IOMMU_FQ_TTYP_TADDR_RD = 6,
+ RISCV_IOMMU_FQ_TTYP_TADDR_WR = 7,
+ RISCV_IOMMU_FQ_TTYP_PCIE_ATS_REQ = 8,
+ RISCV_IOMMU_FQ_TTYP_PCIE_MSG_REQ = 9,
+};
+
+/**
+ * struct riscv_iommu_pq_record - PCIe Page Request record
+ * @hdr: Header, includes PID, DID etc
+ * @payload: Holds the page address, request group and permission bits
+ *
+ * For more infos on the PCIe Page Request queue see chapter 3.3.
+ */
+struct riscv_iommu_pq_record {
+ u64 hdr;
+ u64 payload;
+};
+
+/* Header fields */
+#define RISCV_IOMMU_PQ_HDR_PID GENMASK_ULL(31, 12)
+#define RISCV_IOMMU_PQ_HDR_PV BIT_ULL(32)
+#define RISCV_IOMMU_PQ_HDR_PRIV BIT_ULL(33)
+#define RISCV_IOMMU_PQ_HDR_EXEC BIT_ULL(34)
+#define RISCV_IOMMU_PQ_HDR_DID GENMASK_ULL(63, 40)
+
+/* Payload fields */
+#define RISCV_IOMMU_PQ_PAYLOAD_R BIT_ULL(0)
+#define RISCV_IOMMU_PQ_PAYLOAD_W BIT_ULL(1)
+#define RISCV_IOMMU_PQ_PAYLOAD_L BIT_ULL(2)
+#define RISCV_IOMMU_PQ_PAYLOAD_RWL_MASK GENMASK_ULL(2, 0)
+#define RISCV_IOMMU_PQ_PAYLOAD_PRGI GENMASK_ULL(11, 3) /* Page Request Group Index */
+#define RISCV_IOMMU_PQ_PAYLOAD_ADDR GENMASK_ULL(63, 12)
+
+/**
+ * struct riscv_iommu_msipte - MSI Page Table Entry
+ * @pte: MSI PTE
+ * @mrif_info: Memory-resident interrupt file info
+ *
+ * The MSI Page Table is used for virtualizing MSIs, so that when
+ * a device sends an MSI to a guest, the IOMMU can reroute it
+ * by translating the MSI address, either to a guest interrupt file
+ * or a memory resident interrupt file (MRIF). Note that this page table
+ * is an array of MSI PTEs, not a multi-level pt, each entry
+ * is a leaf entry. For more infos check out the AIA spec, chapter 9.5.
+ *
+ * Also in basic mode the mrif_info field is ignored by the IOMMU and can
+ * be used by software, any other reserved fields on pte must be zeroed-out
+ * by software.
+ */
+struct riscv_iommu_msipte {
+ u64 pte;
+ u64 mrif_info;
+};
+
+/* Fields on pte */
+#define RISCV_IOMMU_MSIPTE_V BIT_ULL(0)
+#define RISCV_IOMMU_MSIPTE_M GENMASK_ULL(2, 1)
+#define RISCV_IOMMU_MSIPTE_MRIF_ADDR GENMASK_ULL(53, 7) /* When M == 1 (MRIF mode) */
+#define RISCV_IOMMU_MSIPTE_PPN RISCV_IOMMU_PPN_FIELD /* When M == 3 (basic mode) */
+#define RISCV_IOMMU_MSIPTE_C BIT_ULL(63)
+
+/* Fields on mrif_info */
+#define RISCV_IOMMU_MSIPTE_MRIF_NID GENMASK_ULL(9, 0)
+#define RISCV_IOMMU_MSIPTE_MRIF_NPPN RISCV_IOMMU_PPN_FIELD
+#define RISCV_IOMMU_MSIPTE_MRIF_NID_MSB BIT_ULL(60)
+
+/* Helper functions: command structure builders. */
+
+/* Start an IOTINVAL.VMA command: opcode and function only, no operands yet. */
+static inline void riscv_iommu_cmd_inval_vma(struct riscv_iommu_command *cmd)
+{
+	const u64 opcode = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE,
+				      RISCV_IOMMU_CMD_IOTINVAL_OPCODE);
+	const u64 func = FIELD_PREP(RISCV_IOMMU_CMD_FUNC,
+				    RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA);
+
+	cmd->dword1 = 0;
+	cmd->dword0 = opcode | func;
+}
+
+/*
+ * Attach a target address to an IOTINVAL command: the page frame number
+ * goes into dword1 and the address-valid (AV) flag is raised in dword0.
+ */
+static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
+						  u64 addr)
+{
+	cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
+	cmd->dword1 = FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_ADDR,
+				 phys_to_pfn(addr));
+}
+
+/* Scope an IOTINVAL command to @pscid and mark the PSCID field valid (PSCV). */
+static inline void riscv_iommu_cmd_inval_set_pscid(struct riscv_iommu_command *cmd,
+						   int pscid)
+{
+	u64 bits = RISCV_IOMMU_CMD_IOTINVAL_PSCV;
+
+	bits |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_PSCID, pscid);
+	cmd->dword0 |= bits;
+}
+
+/* Scope an IOTINVAL command to @gscid and mark the GSCID field valid (GV). */
+static inline void riscv_iommu_cmd_inval_set_gscid(struct riscv_iommu_command *cmd,
+						   int gscid)
+{
+	u64 bits = RISCV_IOMMU_CMD_IOTINVAL_GV;
+
+	bits |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_GSCID, gscid);
+	cmd->dword0 |= bits;
+}
+
+/* Build an IOFENCE.C command with both the PR and PW bits set and no payload. */
+static inline void riscv_iommu_cmd_iofence(struct riscv_iommu_command *cmd)
+{
+	u64 dw0;
+
+	dw0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE);
+	dw0 |= FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C);
+	dw0 |= RISCV_IOMMU_CMD_IOFENCE_PR;
+	dw0 |= RISCV_IOMMU_CMD_IOFENCE_PW;
+
+	cmd->dword1 = 0;
+	cmd->dword0 = dw0;
+}
+
+/*
+ * Build an IOFENCE.C command carrying @data, with the AV flag set and
+ * @addr (shifted right by two bits) in dword1.
+ *
+ * Note: dword0 is written from scratch, so the PR/PW bits set by
+ * riscv_iommu_cmd_iofence() are not carried over.
+ */
+static inline void riscv_iommu_cmd_iofence_set_av(struct riscv_iommu_command *cmd,
+						  u64 addr, u32 data)
+{
+	u64 dw0;
+
+	dw0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE);
+	dw0 |= FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C);
+	dw0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOFENCE_DATA, data);
+	dw0 |= RISCV_IOMMU_CMD_IOFENCE_AV;
+
+	cmd->dword1 = addr >> 2;
+	cmd->dword0 = dw0;
+}
+
+/* Start an IODIR.INVAL_DDT command: opcode and function only. */
+static inline void riscv_iommu_cmd_iodir_inval_ddt(struct riscv_iommu_command *cmd)
+{
+	const u64 opcode = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE,
+				      RISCV_IOMMU_CMD_IODIR_OPCODE);
+	const u64 func = FIELD_PREP(RISCV_IOMMU_CMD_FUNC,
+				    RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT);
+
+	cmd->dword1 = 0;
+	cmd->dword0 = opcode | func;
+}
+
+/* Start an IODIR.INVAL_PDT command: opcode and function only. */
+static inline void riscv_iommu_cmd_iodir_inval_pdt(struct riscv_iommu_command *cmd)
+{
+	const u64 opcode = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE,
+				      RISCV_IOMMU_CMD_IODIR_OPCODE);
+	const u64 func = FIELD_PREP(RISCV_IOMMU_CMD_FUNC,
+				    RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT);
+
+	cmd->dword1 = 0;
+	cmd->dword0 = opcode | func;
+}
+
+/* Add a device id to an IODIR command and mark it valid (DV). */
+static inline void riscv_iommu_cmd_iodir_set_did(struct riscv_iommu_command *cmd,
+						 unsigned int devid)
+{
+	u64 bits = RISCV_IOMMU_CMD_IODIR_DV;
+
+	bits |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_DID, devid);
+	cmd->dword0 |= bits;
+}
+
+/* Add a process id to an IODIR command; no separate valid flag is set here. */
+static inline void riscv_iommu_cmd_iodir_set_pid(struct riscv_iommu_command *cmd,
+						 unsigned int pasid)
+{
+	const u64 pid = FIELD_PREP(RISCV_IOMMU_CMD_IODIR_PID, pasid);
+
+	cmd->dword0 |= pid;
+}
+
+#endif /* _RISCV_IOMMU_BITS_H_ */
diff --git a/drivers/iommu/riscv/iommu-pci.c b/drivers/iommu/riscv/iommu-pci.c
new file mode 100644
index 000000000000..c7a89143014c
--- /dev/null
+++ b/drivers/iommu/riscv/iommu-pci.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright © 2022-2024 Rivos Inc.
+ * Copyright © 2023 FORTH-ICS/CARV
+ *
+ * RISCV IOMMU as a PCIe device
+ *
+ * Authors
+ * Tomasz Jeznach <tjeznach@rivosinc.com>
+ * Nick Kossifidis <mick@ics.forth.gr>
+ */
+
+#include <linux/compiler.h>
+#include <linux/init.h>
+#include <linux/iommu.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+
+#include "iommu-bits.h"
+#include "iommu.h"
+
+/* QEMU RISC-V IOMMU implementation */
+#define PCI_DEVICE_ID_REDHAT_RISCV_IOMMU 0x0014
+
+/* Rivos Inc. assigned PCI Vendor and Device IDs */
+#ifndef PCI_VENDOR_ID_RIVOS
+#define PCI_VENDOR_ID_RIVOS 0x1efd
+#endif
+
+#define PCI_DEVICE_ID_RIVOS_RISCV_IOMMU_GA 0x0008
+
+/*
+ * Probe a RISC-V IOMMU exposed as a PCIe function.
+ *
+ * Enables the device, maps BAR 0, caches the capabilities and fctl
+ * registers, requires MSI support (IGS is MSI or BOTH), allocates the
+ * interrupt vectors, clears fctl.WSI if it is set and hands over to
+ * riscv_iommu_init().
+ *
+ * Returns the result of riscv_iommu_init(), or a negative errno if one
+ * of the earlier steps fails.
+ */
+static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+ struct device *dev = &pdev->dev;
+ struct riscv_iommu_device *iommu;
+ int rc, vec;
+
+ rc = pcim_enable_device(pdev);
+ if (rc)
+ return rc;
+
+ /* BAR 0 must be a memory resource large enough for the register file. */
+ if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM))
+ return -ENODEV;
+
+ if (pci_resource_len(pdev, 0) < RISCV_IOMMU_REG_SIZE)
+ return -ENODEV;
+
+ rc = pcim_iomap_regions(pdev, BIT(0), pci_name(pdev));
+ if (rc)
+ return dev_err_probe(dev, rc, "pcim_iomap_regions failed\n");
+
+ iommu = devm_kzalloc(dev, sizeof(*iommu), GFP_KERNEL);
+ if (!iommu)
+ return -ENOMEM;
+
+ iommu->dev = dev;
+ iommu->reg = pcim_iomap_table(pdev)[0];
+
+ pci_set_master(pdev);
+ dev_set_drvdata(dev, iommu);
+
+ /* Check device reported capabilities / features. */
+ iommu->caps = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAPABILITIES);
+ iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
+
+ /* The PCI driver only uses MSIs, make sure the IOMMU supports this */
+ switch (FIELD_GET(RISCV_IOMMU_CAPABILITIES_IGS, iommu->caps)) {
+ case RISCV_IOMMU_CAPABILITIES_IGS_MSI:
+ case RISCV_IOMMU_CAPABILITIES_IGS_BOTH:
+ break;
+ default:
+ return dev_err_probe(dev, -ENODEV,
+ "unable to use message-signaled interrupts\n");
+ }
+
+ /* Allocate and assign IRQ vectors for the various events */
+ rc = pci_alloc_irq_vectors(pdev, 1, RISCV_IOMMU_INTR_COUNT,
+ PCI_IRQ_MSIX | PCI_IRQ_MSI);
+ /* NOTE(review): any failure here is reported as -ENODEV, not as rc. */
+ if (rc <= 0)
+ return dev_err_probe(dev, -ENODEV,
+ "unable to allocate irq vectors\n");
+
+ /* rc is the number of vectors granted; record the Linux IRQ of each. */
+ iommu->irqs_count = rc;
+ for (vec = 0; vec < iommu->irqs_count; vec++)
+ iommu->irqs[vec] = msi_get_virq(dev, vec);
+
+ /* Enable message-signaled interrupts, fctl.WSI */
+ if (iommu->fctl & RISCV_IOMMU_FCTL_WSI) {
+ /* WSI is known to be set here, so the XOR clears it. */
+ iommu->fctl ^= RISCV_IOMMU_FCTL_WSI;
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, iommu->fctl);
+ }
+
+ return riscv_iommu_init(iommu);
+}
+
+/* Tear down the IOMMU instance stored as driver data by the probe routine. */
+static void riscv_iommu_pci_remove(struct pci_dev *pdev)
+{
+	riscv_iommu_remove(dev_get_drvdata(&pdev->dev));
+}
+
+/* PCI IDs handled by this driver: the QEMU (Red Hat) model and the Rivos device. */
+static const struct pci_device_id riscv_iommu_pci_tbl[] = {
+ {PCI_VDEVICE(REDHAT, PCI_DEVICE_ID_REDHAT_RISCV_IOMMU), 0},
+ {PCI_VDEVICE(RIVOS, PCI_DEVICE_ID_RIVOS_RISCV_IOMMU_GA), 0},
+ {0,}
+};
+
+/* PCI driver glue; bind/unbind attributes are suppressed for this driver. */
+static struct pci_driver riscv_iommu_pci_driver = {
+ .name = KBUILD_MODNAME,
+ .id_table = riscv_iommu_pci_tbl,
+ .probe = riscv_iommu_pci_probe,
+ .remove = riscv_iommu_pci_remove,
+ .driver = {
+ .suppress_bind_attrs = true,
+ },
+};
+
+/* Registered as a built-in driver only (no module init/exit). */
+builtin_pci_driver(riscv_iommu_pci_driver);
diff --git a/drivers/iommu/riscv/iommu-platform.c b/drivers/iommu/riscv/iommu-platform.c
new file mode 100644
index 000000000000..da336863f152
--- /dev/null
+++ b/drivers/iommu/riscv/iommu-platform.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * RISC-V IOMMU as a platform device
+ *
+ * Copyright © 2023 FORTH-ICS/CARV
+ * Copyright © 2023-2024 Rivos Inc.
+ *
+ * Authors
+ * Nick Kossifidis <mick@ics.forth.gr>
+ * Tomasz Jeznach <tjeznach@rivosinc.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+
+#include "iommu-bits.h"
+#include "iommu.h"
+
+/**
+ * riscv_iommu_platform_probe - bind a RISC-V IOMMU platform device
+ * @pdev: platform device being probed
+ *
+ * Maps the register region, caches the capabilities and fctl registers,
+ * requires wire-signaled interrupt support (IGS is WSI or BOTH), collects
+ * up to RISCV_IOMMU_INTR_COUNT interrupt lines, sets fctl.WSI if it is not
+ * already set and hands over to riscv_iommu_init().
+ *
+ * Return: the result of riscv_iommu_init(), or a negative errno if an
+ * earlier step fails. Errors from platform_irq_count() and
+ * platform_get_irq() (e.g. -EPROBE_DEFER) are propagated unchanged.
+ */
+static int riscv_iommu_platform_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct riscv_iommu_device *iommu = NULL;
+	struct resource *res = NULL;
+	int count, irq, vec;
+
+	iommu = devm_kzalloc(dev, sizeof(*iommu), GFP_KERNEL);
+	if (!iommu)
+		return -ENOMEM;
+
+	iommu->dev = dev;
+	iommu->reg = devm_platform_get_and_ioremap_resource(pdev, 0, &res);
+	if (IS_ERR(iommu->reg))
+		return dev_err_probe(dev, PTR_ERR(iommu->reg),
+				     "could not map register region\n");
+
+	dev_set_drvdata(dev, iommu);
+
+	/* Check device reported capabilities / features. */
+	iommu->caps = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAPABILITIES);
+	iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
+
+	/* For now we only support WSI */
+	switch (FIELD_GET(RISCV_IOMMU_CAPABILITIES_IGS, iommu->caps)) {
+	case RISCV_IOMMU_CAPABILITIES_IGS_WSI:
+	case RISCV_IOMMU_CAPABILITIES_IGS_BOTH:
+		break;
+	default:
+		return dev_err_probe(dev, -ENODEV,
+				     "unable to use wire-signaled interrupts\n");
+	}
+
+	/*
+	 * Keep the count in a local signed int so that a negative error code
+	 * is detected whatever the type of the irqs_count field is.
+	 */
+	count = platform_irq_count(pdev);
+	if (count < 0)
+		return dev_err_probe(dev, count,
+				     "could not count IRQ resources\n");
+	if (count == 0)
+		return dev_err_probe(dev, -ENODEV,
+				     "no IRQ resources provided\n");
+	if (count > RISCV_IOMMU_INTR_COUNT)
+		count = RISCV_IOMMU_INTR_COUNT;
+
+	for (vec = 0; vec < count; vec++) {
+		/* Never store a negative errno as an interrupt number. */
+		irq = platform_get_irq(pdev, vec);
+		if (irq < 0)
+			return irq;
+		iommu->irqs[vec] = irq;
+	}
+	iommu->irqs_count = count;
+
+	/* Enable wire-signaled interrupts, fctl.WSI */
+	if (!(iommu->fctl & RISCV_IOMMU_FCTL_WSI)) {
+		iommu->fctl |= RISCV_IOMMU_FCTL_WSI;
+		riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, iommu->fctl);
+	}
+
+	return riscv_iommu_init(iommu);
+}
+
+/* Tear down the IOMMU instance stored as driver data by the probe routine. */
+static void riscv_iommu_platform_remove(struct platform_device *pdev)
+{
+	struct riscv_iommu_device *iommu = dev_get_drvdata(&pdev->dev);
+
+	riscv_iommu_remove(iommu);
+}
+
+/* Device-tree match table: binds to nodes compatible with "riscv,iommu". */
+static const struct of_device_id riscv_iommu_of_match[] = {
+ {.compatible = "riscv,iommu",},
+ {},
+};
+
+/* Platform driver glue; bind/unbind attributes are suppressed for this driver. */
+static struct platform_driver riscv_iommu_platform_driver = {
+ .probe = riscv_iommu_platform_probe,
+ .remove_new = riscv_iommu_platform_remove,
+ .driver = {
+ .name = "riscv,iommu",
+ .of_match_table = riscv_iommu_of_match,
+ .suppress_bind_attrs = true,
+ },
+};
+
+/* Registered as a built-in driver only (no module init/exit). */
+builtin_platform_driver(riscv_iommu_platform_driver);
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
new file mode 100644
index 000000000000..8a05def774bd
--- /dev/null
+++ b/drivers/iommu/riscv/iommu.c
@@ -0,0 +1,1661 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IOMMU API for RISC-V IOMMU implementations.
+ *
+ * Copyright © 2022-2024 Rivos Inc.
+ * Copyright © 2023 FORTH-ICS/CARV
+ *
+ * Authors
+ * Tomasz Jeznach <tjeznach@rivosinc.com>
+ * Nick Kossifidis <mick@ics.forth.gr>
+ */
+
+#define pr_fmt(fmt) "riscv-iommu: " fmt
+
+#include <linux/compiler.h>
+#include <linux/crash_dump.h>
+#include <linux/init.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+
+#include "../iommu-pages.h"
+#include "iommu-bits.h"
+#include "iommu.h"
+
+/* Timeouts in [us] */
+#define RISCV_IOMMU_QCSR_TIMEOUT 150000
+#define RISCV_IOMMU_QUEUE_TIMEOUT 150000
+#define RISCV_IOMMU_DDTP_TIMEOUT 10000000
+#define RISCV_IOMMU_IOTINVAL_TIMEOUT 90000000
+
+/* Number of entries per CMD/FLT queue, should be <= INT_MAX */
+#define RISCV_IOMMU_DEF_CQ_COUNT 8192
+#define RISCV_IOMMU_DEF_FQ_COUNT 4096
+
+/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
+#define phys_to_ppn(pa) (((pa) >> 2) & (((1ULL << 44) - 1) << 10))
+#define ppn_to_phys(pn) (((pn) << 2) & (((1ULL << 44) - 1) << 12))
+
+#define dev_to_iommu(dev) \
+ iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu)
+
+/* IOMMU PSCID allocation namespace. */
+static DEFINE_IDA(riscv_iommu_pscids);
+#define RISCV_IOMMU_MAX_PSCID (BIT(20) - 1)
+
+/*
+ * Device resource-managed allocations.
+ *
+ * One record per page allocation made by riscv_iommu_get_pages(); the
+ * release callback uses it to hand the pages back to iommu_free_pages().
+ */
+struct riscv_iommu_devres {
+ void *addr; /* address returned by iommu_alloc_pages_node() */
+ int order; /* allocation order, needed again when freeing */
+};
+
+/* devres release callback: free the pages described by the record @res. */
+static void riscv_iommu_devres_pages_release(struct device *dev, void *res)
+{
+	const struct riscv_iommu_devres *entry = res;
+
+	iommu_free_pages(entry->addr, entry->order);
+}
+
+/* devres match callback: non-zero when record @res tracks the address in key @p. */
+static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p)
+{
+	const struct riscv_iommu_devres *entry = res;
+	const struct riscv_iommu_devres *key = p;
+
+	return key->addr == entry->addr;
+}
+
+/*
+ * Allocate pages of the given @order on the device's node and register them
+ * as a device resource with riscv_iommu_devres_pages_release() as release
+ * callback. Returns the page address, or NULL if either allocation fails.
+ */
+static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu, int order)
+{
+	struct riscv_iommu_devres *entry;
+	void *pages;
+
+	pages = iommu_alloc_pages_node(dev_to_node(iommu->dev),
+				       GFP_KERNEL_ACCOUNT, order);
+	if (unlikely(!pages))
+		return NULL;
+
+	entry = devres_alloc(riscv_iommu_devres_pages_release,
+			     sizeof(*entry), GFP_KERNEL);
+	if (unlikely(!entry)) {
+		/* No tracking record available: give the pages back. */
+		iommu_free_pages(pages, order);
+		return NULL;
+	}
+
+	entry->order = order;
+	entry->addr = pages;
+	devres_add(iommu->dev, entry);
+
+	return pages;
+}
+
+/*
+ * Release the device resource tracking @addr; the lookup matches on the
+ * address only and the release callback frees the pages.
+ */
+static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
+{
+	struct riscv_iommu_devres key = {
+		.addr = addr,
+	};
+
+	devres_release(iommu->dev, riscv_iommu_devres_pages_release,
+		       riscv_iommu_devres_pages_match, &key);
+}
+
+/*
+ * Hardware queue allocation and management.
+ */
+
+/*
+ * Setup queue base, control registers and default queue length.
+ *
+ * @name is pasted into the RISCV_IOMMU_INTR_<name>, RISCV_IOMMU_REG_<name>B,
+ * RISCV_IOMMU_REG_<name>CSR and RISCV_IOMMU_DEF_<name>_COUNT identifiers.
+ * A non-zero mask already stored in the queue is kept; otherwise the mask
+ * becomes the default entry count minus one.
+ */
+#define RISCV_IOMMU_QUEUE_INIT(q, name) do { \
+ struct riscv_iommu_queue *_q = q; \
+ _q->qid = RISCV_IOMMU_INTR_ ## name; \
+ _q->qbr = RISCV_IOMMU_REG_ ## name ## B; \
+ _q->qcr = RISCV_IOMMU_REG_ ## name ## CSR; \
+ _q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
+} while (0)
+
+/*
+ * Note: offsets are the same for all queues, so the head/tail register
+ * offsets are derived from the queue's base register offset using the
+ * command queue register layout.
+ */
+#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
+#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
+/* Wrap an index into the ring using the queue mask. */
+#define Q_ITEM(q, index) ((q)->mask & (index))
+/* Bit for this queue's interrupt source, as tested against the IPSR register. */
+#define Q_IPSR(q) BIT((q)->qid)
+
+/*
+ * Discover queue ring buffer hardware configuration, allocate in-memory
+ * ring buffer or use fixed I/O memory location, configure queue base register.
+ * Must be called before hardware queue is enabled.
+ *
+ * @iommu - device owning the queue registers
+ * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
+ * @entry_size - queue single element size in bytes.
+ *
+ * Returns 0 on success, -ENOMEM when no ring buffer could be mapped or
+ * allocated, -ENODEV when the base register did not read back as written.
+ */
+static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
+				   struct riscv_iommu_queue *queue,
+				   size_t entry_size)
+{
+	unsigned int logsz;
+	u64 qb, rb;
+
+	/*
+	 * Use WARL base register property to discover maximum allowed
+	 * number of entries and optional fixed IO address for queue location.
+	 */
+	riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD);
+	qb = riscv_iommu_readq(iommu, queue->qbr);
+
+	/*
+	 * Calculate and verify hardware supported queue length, as reported
+	 * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1).
+	 * Update queue size based on hardware supported value.
+	 */
+	logsz = ilog2(queue->mask);
+	if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb))
+		logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb);
+
+	/*
+	 * Use WARL base register property to discover an optional fixed IO
+	 * address for queue ring buffer location. Otherwise allocate contiguous
+	 * system memory.
+	 */
+	if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) {
+		const size_t queue_size = entry_size << (logsz + 1);
+
+		queue->phys = pfn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb));
+		queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size);
+	} else {
+		/* Halve the ring size on each failed attempt, down to 2 entries. */
+		do {
+			const size_t queue_size = entry_size << (logsz + 1);
+			const int order = get_order(queue_size);
+
+			queue->base = riscv_iommu_get_pages(iommu, order);
+		} while (!queue->base && logsz-- > 0);
+
+		/* Translate only a real buffer; never feed NULL to __pa(). */
+		if (queue->base)
+			queue->phys = __pa(queue->base);
+	}
+
+	if (!queue->base)
+		return -ENOMEM;
+
+	qb = phys_to_ppn(queue->phys) |
+	     FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz);
+
+	/* Update base register and read back to verify hw accepted our write */
+	riscv_iommu_writeq(iommu, queue->qbr, qb);
+	rb = riscv_iommu_readq(iommu, queue->qbr);
+	if (rb != qb) {
+		dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid);
+		return -ENODEV;
+	}
+
+	/* Update actual queue mask */
+	queue->mask = (2U << logsz) - 1;
+
+	dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries\n",
+		queue->qid, logsz + 1);
+
+	return 0;
+}
+
+/*
+ * Primary interrupt handler: wake the threaded handler only when this
+ * queue's bit is set in the IPSR register, otherwise report IRQ_NONE.
+ */
+static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data)
+{
+	struct riscv_iommu_queue *q = data;
+	const u32 pending = riscv_iommu_readl(q->iommu, RISCV_IOMMU_REG_IPSR);
+
+	return (pending & Q_IPSR(q)) ? IRQ_WAKE_THREAD : IRQ_NONE;
+}
+
+/* Extract the vector number for interrupt source @n from the stored icvec value. */
+static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n)
+{
+	/* Each ICVEC field is 4 bits wide, so the CIV mask is reused for all of them. */
+	const int shift = n * 4;
+
+	return (iommu->icvec >> shift) & RISCV_IOMMU_ICVEC_CIV;
+}
+
+/*
+ * Enable queue processing in the hardware, register interrupt handler.
+ *
+ * @iommu - device owning the queue
+ * @queue - data structure, already allocated with riscv_iommu_queue_alloc()
+ * @irq_handler - threaded interrupt handler.
+ *
+ * Returns 0 on success, -EBUSY if the queue is already bound to a device or
+ * the hardware did not start it, -ENODEV if no interrupt is available, or
+ * the error returned by request_threaded_irq().
+ */
+static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu,
+ struct riscv_iommu_queue *queue,
+ irq_handler_t irq_handler)
+{
+ const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)];
+ u32 csr;
+ int rc;
+
+ /* A non-NULL owner means the queue has already been enabled. */
+ if (queue->iommu)
+ return -EBUSY;
+
+ /* Polling not implemented */
+ if (!irq)
+ return -ENODEV;
+
+ /* Set the owner first: the primary handler dereferences queue->iommu. */
+ queue->iommu = iommu;
+ rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler,
+ IRQF_ONESHOT | IRQF_SHARED,
+ dev_name(iommu->dev), queue);
+ if (rc) {
+ queue->iommu = NULL;
+ return rc;
+ }
+
+ /*
+ * Enable queue with interrupts, clear any memory fault if any.
+ * Wait for the hardware to acknowledge request and activate queue
+ * processing.
+ * Note: All CSR bitfields are in the same offsets for all queues.
+ */
+ riscv_iommu_writel(iommu, queue->qcr,
+ RISCV_IOMMU_QUEUE_ENABLE |
+ RISCV_IOMMU_QUEUE_INTR_ENABLE |
+ RISCV_IOMMU_QUEUE_MEM_FAULT);
+
+ /* The poll result is not checked directly; csr is validated below. */
+ riscv_iommu_readl_timeout(iommu, queue->qcr,
+ csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
+ 10, RISCV_IOMMU_QCSR_TIMEOUT);
+
+ /* Success requires ACTIVE set with both BUSY and MEM_FAULT clear. */
+ if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE |
+ RISCV_IOMMU_QUEUE_BUSY |
+ RISCV_IOMMU_QUEUE_MEM_FAULT))) {
+ /* Best effort to stop and disable failing hardware queue. */
+ riscv_iommu_writel(iommu, queue->qcr, 0);
+ free_irq(irq, queue);
+ queue->iommu = NULL;
+ dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid);
+ return -EBUSY;
+ }
+
+ /* Clear any pending interrupt flag. */
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
+
+ return 0;
+}
+
+/*
+ * Disable queue. Wait for the hardware to acknowledge request and
+ * stop processing enqueued requests. Report errors but continue.
+ *
+ * @queue - queue previously enabled with riscv_iommu_queue_enable(); a queue
+ *          with queue->iommu == NULL is left untouched.
+ *
+ * The interrupt handler is released first, then the queue control register
+ * is cleared and polled until BUSY drops or RISCV_IOMMU_QCSR_TIMEOUT expires.
+ * queue->iommu is reset to NULL even if the hardware did not stop.
+ */
+static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue)
+{
+ struct riscv_iommu_device *iommu = queue->iommu;
+ u32 csr;
+
+ if (!iommu)
+ return;
+
+ free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue);
+ riscv_iommu_writel(iommu, queue->qcr, 0);
+ riscv_iommu_readl_timeout(iommu, queue->qcr,
+ csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
+ 10, RISCV_IOMMU_QCSR_TIMEOUT);
+
+ if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
+ dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n",
+ queue->qid, csr);
+
+ queue->iommu = NULL;
+}
+
+/*
+ * Returns number of available valid queue entries and the first item index.
+ * Update shadow producer index if necessary.
+ *
+ * @queue - queue to inspect
+ * @index - output; always set to the current shadow consumer index (head)
+ *
+ * If the shadow indices already show pending entries, their count is
+ * returned without touching the hardware. Otherwise the hardware producer
+ * index is read and the shadow tail is advanced by the number of new
+ * entries. Returns 0 when nothing new is available or when the register
+ * read times out.
+ */
+static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
+ unsigned int *index)
+{
+ unsigned int head = atomic_read(&queue->head);
+ unsigned int tail = atomic_read(&queue->tail);
+ unsigned int last = Q_ITEM(queue, tail);
+ int available = (int)(tail - head);
+
+ *index = head;
+
+ if (available > 0)
+ return available;
+
+ /* read hardware producer index, check reserved register bits are not set. */
+ if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue),
+ tail, (tail & ~queue->mask) == 0,
+ 0, RISCV_IOMMU_QUEUE_TIMEOUT)) {
+ dev_err_once(queue->iommu->dev,
+ "Hardware error: queue access timeout\n");
+ return 0;
+ }
+
+ /* Hardware producer index still matches the shadow tail: no new entries. */
+ if (tail == last)
+ return 0;
+
+ /* update shadow producer index */
+ return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head);
+}
+
+/*
+ * Give @count processed entries back to the hardware.
+ *
+ * Advances the shadow consumer index and writes the resulting ring position
+ * to the queue head register. Meant to be paired with
+ * riscv_iommu_queue_consume().
+ */
+static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count)
+{
+	const unsigned int new_head = atomic_add_return(count, &queue->head);
+
+	riscv_iommu_writel(queue->iommu, Q_HEAD(queue),
+			   Q_ITEM(queue, new_head));
+}
+
+/*
+ * Compute the current consumer index from the hardware queue head register.
+ *
+ * The shadow head is extended by the distance (modulo ring size) between the
+ * hardware head and the shadow head's ring position. If the register cannot
+ * be read with all reserved bits clear before the timeout, the unmodified
+ * shadow head is returned.
+ */
+static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
+{
+	const unsigned int shadow = atomic_read(&queue->head);
+	const unsigned int shadow_pos = Q_ITEM(queue, shadow);
+	unsigned int hw_head;
+	int timed_out;
+
+	timed_out = riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), hw_head,
+					      !(hw_head & ~queue->mask),
+					      0, RISCV_IOMMU_QUEUE_TIMEOUT);
+	if (timed_out)
+		return shadow;
+
+	return shadow + ((hw_head - shadow_pos) & queue->mask);
+}
+
+/*
+ * Wait until the entry submitted at @index has been consumed.
+ *
+ * Returns 0 as soon as the consumer index has moved past @index, otherwise
+ * the result of polling riscv_iommu_queue_cons() for up to @timeout_us.
+ */
+static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue,
+				  unsigned int index,
+				  unsigned int timeout_us)
+{
+	unsigned int consumed = atomic_read(&queue->head);
+
+	/* Fast path: the shadow consumer index is already past @index. */
+	if ((int)(consumed - index) > 0)
+		return 0;
+
+	/* Slow path: track the hardware-derived consumer index. */
+	return readx_poll_timeout(riscv_iommu_queue_cons, queue, consumed,
+				  (int)(consumed - index) > 0, 0, timeout_us);
+}
+
+/*
+ * Enqueue an entry in the queue ring buffer and ring the tail doorbell.
+ *
+ * @queue - queue to submit to
+ * @entry - entry to copy into the ring
+ * @entry_size - size of @entry in bytes, also used as the ring slot stride
+ *
+ * Returns the producer index allocated for the entry. The same index is
+ * returned when the submission times out (err_busy path), so the return
+ * value alone does not tell the caller whether the entry was submitted.
+ * This function does not wait for the entry to be processed; use
+ * riscv_iommu_queue_wait() for that.
+ *
+ * Error handling for IOMMU hardware not responding in reasonable time
+ * will be added as separate patch series along with other RAS features.
+ * For now, only report hardware failure and continue.
+ */
+static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
+ void *entry, size_t entry_size)
+{
+ unsigned int prod;
+ unsigned int head;
+ unsigned int tail;
+ unsigned long flags;
+
+ /* Do not preempt submission flow. */
+ local_irq_save(flags);
+
+ /* 1. Allocate some space in the queue */
+ prod = atomic_inc_return(&queue->prod) - 1;
+ head = atomic_read(&queue->head);
+
+ /* 2. Wait for space availability. */
+ if ((prod - head) > queue->mask) {
+ /* Ring is over-committed: wait for the shadow head to advance. */
+ if (readx_poll_timeout(atomic_read, &queue->head,
+ head, (prod - head) < queue->mask,
+ 0, RISCV_IOMMU_QUEUE_TIMEOUT))
+ goto err_busy;
+ } else if ((prod - head) == queue->mask) {
+ /*
+ * Ring looks full according to the shadow head: read the hardware
+ * head until it moves and fold the progress into the shadow head.
+ */
+ const unsigned int last = Q_ITEM(queue, head);
+
+ if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
+ !(head & ~queue->mask) && head != last,
+ 0, RISCV_IOMMU_QUEUE_TIMEOUT))
+ goto err_busy;
+ atomic_add((head - last) & queue->mask, &queue->head);
+ }
+
+ /* 3. Store entry in the ring buffer */
+ memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);
+
+ /* 4. Wait for all previous entries to be ready */
+ if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
+ 0, RISCV_IOMMU_QUEUE_TIMEOUT))
+ goto err_busy;
+
+ /*
+ * 5. Make sure the ring buffer update (whether in normal or I/O memory) is
+ * completed and visible before signaling the tail doorbell to fetch
+ * the next command. 'fence ow, ow'
+ */
+ dma_wmb();
+ riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));
+
+ /*
+ * 6. Make sure the doorbell write to the device has finished before updating
+ * the shadow tail index in normal memory. 'fence o, w'
+ */
+ mmiowb();
+ atomic_inc(&queue->tail);
+
+ /* 7. Complete submission and restore local interrupts */
+ local_irq_restore(flags);
+
+ return prod;
+
+err_busy:
+ local_irq_restore(flags);
+ dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n");
+
+ return prod;
+}
+
+/*
+ * IOMMU Command queue chapter 3.1
+ */
+
+/*
+ * Threaded interrupt handler of the command queue.
+ *
+ * Reports and acknowledges command queue error conditions (memory fault,
+ * command timeout, illegal command, fence interrupt) by writing the read
+ * value back to the queue control register, then clears the queue's pending
+ * bit in IPSR. Full error recovery is not implemented.
+ */
+static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
+{
+	const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
+	const unsigned int error_bits = RISCV_IOMMU_CQCSR_CQMF |
+					RISCV_IOMMU_CQCSR_CMD_TO |
+					RISCV_IOMMU_CQCSR_CMD_ILL |
+					RISCV_IOMMU_CQCSR_FENCE_W_IP;
+	unsigned int csr;
+
+	csr = riscv_iommu_readl(queue->iommu, queue->qcr);
+	if (csr & error_bits) {
+		/* Write the value back to acknowledge the reported conditions. */
+		riscv_iommu_writel(queue->iommu, queue->qcr, csr);
+		dev_warn(queue->iommu->dev,
+			 "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n",
+			 queue->qid,
+			 !!(csr & RISCV_IOMMU_CQCSR_CQMF),
+			 !!(csr & RISCV_IOMMU_CQCSR_CMD_TO),
+			 !!(csr & RISCV_IOMMU_CQCSR_CMD_ILL),
+			 !!(csr & RISCV_IOMMU_CQCSR_FENCE_W_IP));
+	}
+
+	/* Placeholder for command queue interrupt notifiers */
+
+	/* Acknowledge the command queue interrupt in IPSR. */
+	riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Submit @cmd to the command queue of @iommu without waiting for it to be
+ * executed. The producer index returned by the queue layer is discarded.
+ */
+static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
+				 struct riscv_iommu_command *cmd)
+{
+	struct riscv_iommu_queue *cmdq = &iommu->cmdq;
+
+	riscv_iommu_queue_send(cmdq, cmd, sizeof(*cmd));
+}
+
+/*
+ * Queue an IOFENCE.C command and, when @timeout_us is non-zero, wait up to
+ * @timeout_us for the command queue to consume it. A timeout is only
+ * reported (once); no error is returned to the caller.
+ */
+static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu,
+				 unsigned int timeout_us)
+{
+	struct riscv_iommu_command fence;
+	unsigned int fence_idx;
+
+	riscv_iommu_cmd_iofence(&fence);
+	fence_idx = riscv_iommu_queue_send(&iommu->cmdq, &fence, sizeof(fence));
+
+	/* A zero timeout means fire-and-forget. */
+	if (timeout_us &&
+	    riscv_iommu_queue_wait(&iommu->cmdq, fence_idx, timeout_us))
+		dev_err_once(iommu->dev,
+			     "Hardware error: command execution timeout\n");
+}
+
+/*
+ * IOMMU Fault/Event queue chapter 3.2
+ */
+
+/*
+ * Report a single fault queue record.
+ *
+ * Records with a zero cause field are ignored; everything else is logged
+ * (rate limited) with the device id and both iotval fields. Actual fault
+ * handling is not implemented yet.
+ */
+static void riscv_iommu_fault(struct riscv_iommu_device *iommu,
+			      struct riscv_iommu_fq_record *event)
+{
+	const unsigned int cause = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
+	const unsigned int did = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
+
+	if (!cause)
+		return;
+
+	dev_warn_ratelimited(iommu->dev,
+			     "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n",
+			     cause, did, event->iotval, event->iotval2);
+}
+
+/*
+ * Fault queue interrupt handler thread function
+ *
+ * Acknowledges the fault queue interrupt in IPSR, then drains the queue:
+ * each batch reported by riscv_iommu_queue_consume() is passed record by
+ * record to riscv_iommu_fault() and released back to the hardware. Finally
+ * memory-fault / overflow conditions in the queue control register are
+ * reported and acknowledged by writing the read value back.
+ */
+static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
+{
+ struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
+ struct riscv_iommu_device *iommu = queue->iommu;
+ struct riscv_iommu_fq_record *events;
+ unsigned int ctrl, idx;
+ int cnt, len;
+
+ events = (struct riscv_iommu_fq_record *)queue->base;
+
+ /* Clear fault interrupt pending and process all received fault events. */
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
+
+ /*
+ * The release call also runs for the final, empty batch (cnt == 0), in
+ * which case it just rewrites the current head position.
+ */
+ do {
+ cnt = riscv_iommu_queue_consume(queue, &idx);
+ for (len = 0; len < cnt; idx++, len++)
+ riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]);
+ riscv_iommu_queue_release(queue, cnt);
+ } while (cnt > 0);
+
+ /* Clear MF/OF errors, complete error recovery to be implemented. */
+ ctrl = riscv_iommu_readl(iommu, queue->qcr);
+ if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
+ riscv_iommu_writel(iommu, queue->qcr, ctrl);
+ dev_warn(iommu->dev,
+ "Queue #%u error; memory fault:%d overflow:%d\n",
+ queue->qid,
+ !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
+ !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
+ }
+
+ return IRQ_HANDLED;
+}
+
+/*
+ * Lookup and initialize device context info structure.
+ *
+ * @iommu - IOMMU whose device directory (iommu->ddt_root) is walked
+ * @devid - device identifier to look up
+ *
+ * Walks the device directory tree from the root, allocating and installing
+ * missing intermediate levels on the way. Returns a pointer to the device
+ * context slot for @devid, or NULL if iommu->ddt_mode is not one of the
+ * 1LVL..3LVL modes, @devid does not fit the configured number of levels, or
+ * a directory page cannot be allocated.
+ */
+static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
+ unsigned int devid)
+{
+ const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT);
+ unsigned int depth;
+ unsigned long ddt, old, new;
+ void *ptr;
+ u8 ddi_bits[3] = { 0 };
+ u64 *ddtp = NULL;
+
+ /* Make sure the mode is valid */
+ if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL ||
+ iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL)
+ return NULL;
+
+ /*
+ * Device id partitioning for base format:
+ * DDI[0]: bits 0 - 6 (1st level) (7 bits)
+ * DDI[1]: bits 7 - 15 (2nd level) (9 bits)
+ * DDI[2]: bits 16 - 23 (3rd level) (8 bits)
+ *
+ * For extended format:
+ * DDI[0]: bits 0 - 5 (1st level) (6 bits)
+ * DDI[1]: bits 6 - 14 (2nd level) (9 bits)
+ * DDI[2]: bits 15 - 23 (3rd level) (9 bits)
+ *
+ * ddi_bits[n] holds the cumulative bit count up to and including
+ * DDI[n], i.e. the number of device id bits covered by n + 1 levels.
+ */
+ if (base_format) {
+ ddi_bits[0] = 7;
+ ddi_bits[1] = 7 + 9;
+ ddi_bits[2] = 7 + 9 + 8;
+ } else {
+ ddi_bits[0] = 6;
+ ddi_bits[1] = 6 + 9;
+ ddi_bits[2] = 6 + 9 + 9;
+ }
+
+ /* Make sure device id is within range */
+ depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL;
+ if (devid >= (1 << ddi_bits[depth]))
+ return NULL;
+
+ /* Get to the level of the non-leaf node that holds the device context */
+ for (ddtp = iommu->ddt_root; depth-- > 0;) {
+ const int split = ddi_bits[depth];
+ /*
+ * Each non-leaf node is 64bits wide and on each level
+ * nodes are indexed by DDI[depth].
+ */
+ ddtp += (devid >> split) & 0x1FF;
+
+ /*
+ * Check if this node has been populated and if not
+ * allocate a new level and populate it.
+ */
+ do {
+ ddt = READ_ONCE(*(unsigned long *)ddtp);
+ if (ddt & RISCV_IOMMU_DDTE_V) {
+ ddtp = __va(ppn_to_phys(ddt));
+ break;
+ }
+
+ ptr = riscv_iommu_get_pages(iommu, 0);
+ if (!ptr)
+ return NULL;
+
+ /* Install the new level only if the entry is still unchanged. */
+ new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V;
+ old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new);
+
+ if (old == ddt) {
+ ddtp = (u64 *)ptr;
+ break;
+ }
+
+ /* Race setting DDT detected, re-read and retry. */
+ riscv_iommu_free_pages(iommu, ptr);
+ } while (1);
+ }
+
+ /*
+ * Grab the node that matches DDI[depth], note that when using base
+ * format the device context is 4 * 64bits, and the extended format
+ * is 8 * 64bits, hence the (3 - base_format) below.
+ */
+ ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format);
+
+ return (struct riscv_iommu_dc *)ddtp;
+}
+
+/*
+ * This is best effort IOMMU translation shutdown flow.
+ * Disable IOMMU without waiting for hardware response.
+ *
+ * Writes zero to DDTP first, then to the control registers of the command,
+ * fault and page-request queues. None of the writes is read back or polled
+ * for completion.
+ */
+static void riscv_iommu_disable(struct riscv_iommu_device *iommu)
+{
+ riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, 0);
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0);
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0);
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0);
+}
+
+/*
+ * Poll the DDTP register until its BUSY bit clears or
+ * RISCV_IOMMU_DDTP_TIMEOUT expires, and evaluate to the last value read.
+ * The poll status itself is dropped, so callers have to test
+ * RISCV_IOMMU_DDTP_BUSY in the result to detect a timeout.
+ */
+#define riscv_iommu_read_ddtp(iommu) ({ \
+	u64 ddtp_val; \
+	riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp_val, \
+				  !(ddtp_val & RISCV_IOMMU_DDTP_BUSY), 10, \
+				  RISCV_IOMMU_DDTP_TIMEOUT); \
+	ddtp_val; })
+
+/*
+ * Allocate the root page of the device directory table.
+ *
+ * If the hardware is in OFF or BARE mode, the DDTP register is probed (WARL)
+ * for a hardware-fixed root page address; when one is reported it is mapped
+ * and zeroed. Otherwise, or if mapping fails, a page is allocated with
+ * riscv_iommu_get_pages().
+ *
+ * Returns 0 on success with iommu->ddt_root / iommu->ddt_phys set, -EBUSY if
+ * DDTP stays busy, or -ENOMEM if no root page could be obtained.
+ */
+static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu)
+{
+	u64 ddtp;
+	unsigned int mode;
+
+	ddtp = riscv_iommu_read_ddtp(iommu);
+	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
+		return -EBUSY;
+
+	/*
+	 * It is optional for the hardware to report a fixed address for device
+	 * directory root page when DDT.MODE is OFF or BARE.
+	 */
+	mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
+	if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE ||
+	    mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) {
+		/* Use WARL to discover hardware fixed DDT PPN */
+		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
+				   FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode));
+		ddtp = riscv_iommu_read_ddtp(iommu);
+		if (ddtp & RISCV_IOMMU_DDTP_BUSY)
+			return -EBUSY;
+
+		iommu->ddt_phys = ppn_to_phys(ddtp);
+		if (iommu->ddt_phys)
+			iommu->ddt_root = devm_ioremap(iommu->dev,
+						       iommu->ddt_phys, PAGE_SIZE);
+		if (iommu->ddt_root)
+			memset(iommu->ddt_root, 0, PAGE_SIZE);
+	}
+
+	if (!iommu->ddt_root) {
+		iommu->ddt_root = riscv_iommu_get_pages(iommu, 0);
+		/* Check the allocation before translating it with __pa(). */
+		if (!iommu->ddt_root)
+			return -ENOMEM;
+		iommu->ddt_phys = __pa(iommu->ddt_root);
+	}
+
+	return 0;
+}
+
+/*
+ * Discover supported DDT modes starting from requested value,
+ * configure DDTP register with accepted mode and root DDT address.
+ * Accepted iommu->ddt_mode is updated on success.
+ *
+ * @iommu - IOMMU to configure; iommu->ddt_phys supplies the root address
+ * @ddtp_mode - requested RISCV_IOMMU_DDTP_IOMMU_MODE_* value
+ *
+ * Returns 0 on success, -EBUSY if DDTP stays busy, or -EINVAL if the
+ * requested transition is not allowed or the hardware rejects every
+ * attempted mode. On success the device context cache and the address
+ * translation cache are invalidated and an IOFENCE.C is issued.
+ */
+static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
+ unsigned int ddtp_mode)
+{
+ struct device *dev = iommu->dev;
+ u64 ddtp, rq_ddtp;
+ unsigned int mode, rq_mode = ddtp_mode;
+ struct riscv_iommu_command cmd;
+
+ ddtp = riscv_iommu_read_ddtp(iommu);
+ if (ddtp & RISCV_IOMMU_DDTP_BUSY)
+ return -EBUSY;
+
+ /* Disallow state transition from xLVL to xLVL. */
+ mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
+ if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
+ mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF &&
+ rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
+ rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF)
+ return -EINVAL;
+
+ do {
+ /* The root PPN is only programmed for modes above BARE. */
+ rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode);
+ if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
+ rq_ddtp |= phys_to_ppn(iommu->ddt_phys);
+
+ riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp);
+ ddtp = riscv_iommu_read_ddtp(iommu);
+ if (ddtp & RISCV_IOMMU_DDTP_BUSY) {
+ dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n",
+ rq_mode, ddtp);
+ return -EBUSY;
+ }
+
+ /* Verify IOMMU hardware accepts new DDTP config. */
+ mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
+
+ if (rq_mode == mode)
+ break;
+
+ /* Hardware mandatory DDTP mode has not been accepted. */
+ if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) {
+ dev_err(dev, "DDTP update failed hw: %llx vs %llx\n",
+ ddtp, rq_ddtp);
+ return -EINVAL;
+ }
+
+ /*
+ * Mode field is WARL, an IOMMU may support a subset of
+ * directory table levels in which case if we tried to set
+ * an unsupported number of levels we'll readback either
+ * a valid xLVL or off/bare. If we got off/bare, try again
+ * with a smaller xLVL.
+ */
+ if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL &&
+ rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) {
+ dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
+ rq_mode--;
+ continue;
+ }
+
+ /*
+ * We tried all supported modes and IOMMU hardware failed to
+ * accept new settings, something went very wrong since off/bare
+ * and at least one xLVL must be supported.
+ */
+ dev_err(dev, "DDTP hw mode %u, failed to set %u\n",
+ mode, ddtp_mode);
+ return -EINVAL;
+ } while (1);
+
+ iommu->ddt_mode = mode;
+ if (mode != ddtp_mode)
+ dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);
+
+ /* Invalidate device context cache */
+ riscv_iommu_cmd_iodir_inval_ddt(&cmd);
+ riscv_iommu_cmd_send(iommu, &cmd);
+
+ /* Invalidate address translation cache */
+ riscv_iommu_cmd_inval_vma(&cmd);
+ riscv_iommu_cmd_send(iommu, &cmd);
+
+ /* IOFENCE.C */
+ riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
+
+ return 0;
+}
+
+/* This struct contains protection domain specific IOMMU driver data. */
+struct riscv_iommu_domain {
+ struct iommu_domain domain; /* embedded generic domain, see iommu_domain_to_riscv() */
+ struct list_head bonds; /* list of struct riscv_iommu_bond, one per attached device */
+ spinlock_t lock; /* protect bonds list updates. */
+ int pscid; /* id from riscv_iommu_pscids, tags IOTLB invalidations */
+ bool amo_enabled; /* set from RISCV_IOMMU_CAPABILITIES_AMO_HWAD at allocation */
+ int numa_node; /* node used when allocating page table pages */
+ unsigned int pgd_mode; /* RISCV_IOMMU_DC_FSC_IOSATP_MODE_* of the page table */
+ unsigned long *pgd_root; /* root page of the translation tree */
+};
+
+/* Convert a pointer to the embedded struct iommu_domain back to its container. */
+#define iommu_domain_to_riscv(iommu_domain) \
+ container_of(iommu_domain, struct riscv_iommu_domain, domain)
+
+/*
+ * Private IOMMU data for managed devices, dev_iommu_priv_*
+ *
+ * @domain is the paging domain the device is currently attached to, as
+ * recorded by riscv_iommu_attach_paging_domain().
+ */
+struct riscv_iommu_info {
+ struct riscv_iommu_domain *domain;
+};
+
+/*
+ * Linkage between an iommu_domain and attached devices.
+ *
+ * Protection domain requiring IOATC and DevATC translation cache invalidations,
+ * should be linked to attached devices using a riscv_iommu_bond structure.
+ * Devices should be linked to the domain before first use and unlinked after
+ * the translations from the referenced protection domain can no longer be used.
+ * Blocking and identity domains are not tracked here, as the IOMMU hardware
+ * does not cache negative and/or identity (BARE mode) translations, and DevATC
+ * is disabled for those protection domains.
+ *
+ * The device pointer and IOMMU data remain stable in the bond struct after
+ * _probe_device() where it's attached to the managed IOMMU, up to the
+ * completion of the _release_device() call. The release of the bond structure
+ * is synchronized with the device release.
+ */
+struct riscv_iommu_bond {
+ struct list_head list; /* entry in riscv_iommu_domain.bonds */
+ struct rcu_head rcu; /* used by kfree_rcu() when the bond is unlinked */
+ struct device *dev; /* the attached device */
+};
+
+/*
+ * Record that @dev is attached to @domain.
+ *
+ * Allocates a bond for @dev and inserts it into domain->bonds right after
+ * the first existing bond that belongs to the same IOMMU, or at the front of
+ * the list when there is none, so bonds of one IOMMU stay adjacent.
+ *
+ * Returns 0 on success or -ENOMEM if the bond cannot be allocated.
+ */
+static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain,
+ struct device *dev)
+{
+ struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+ struct riscv_iommu_bond *bond;
+ struct list_head *bonds;
+
+ bond = kzalloc(sizeof(*bond), GFP_KERNEL);
+ if (!bond)
+ return -ENOMEM;
+ bond->dev = dev;
+
+ /*
+ * List of devices attached to the domain is arranged based on
+ * managed IOMMU device.
+ */
+
+ spin_lock(&domain->lock);
+ list_for_each(bonds, &domain->bonds)
+ if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu)
+ break;
+ list_add_rcu(&bond->list, bonds);
+ spin_unlock(&domain->lock);
+
+ /* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. */
+ smp_mb();
+
+ return 0;
+}
+
+/*
+ * Remove the bond between @domain and @dev.
+ *
+ * A NULL @domain is accepted and ignored. The bond for @dev, if present, is
+ * removed from domain->bonds and freed after an RCU grace period. If no
+ * other bond of the same IOMMU was seen in the list, all cached translations
+ * for the domain's PSCID are invalidated on that IOMMU.
+ */
+static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain,
+ struct device *dev)
+{
+ struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+ struct riscv_iommu_bond *bond, *found = NULL;
+ struct riscv_iommu_command cmd;
+ int count = 0;
+
+ if (!domain)
+ return;
+
+ /*
+ * Look for the bond of @dev and count other bonds sharing its IOMMU.
+ * The scan stops early once both the bond and at least one sibling
+ * have been seen.
+ */
+ spin_lock(&domain->lock);
+ list_for_each_entry(bond, &domain->bonds, list) {
+ if (found && count)
+ break;
+ else if (bond->dev == dev)
+ found = bond;
+ else if (dev_to_iommu(bond->dev) == iommu)
+ count++;
+ }
+ if (found)
+ list_del_rcu(&found->list);
+ spin_unlock(&domain->lock);
+ /* NOTE(review): reached with found == NULL when @dev has no bond - confirm kfree_rcu() accepts NULL. */
+ kfree_rcu(found, rcu);
+
+ /*
+ * If this was the last bond between this domain and the IOMMU
+ * invalidate all cached entries for domain's PSCID.
+ */
+ if (!count) {
+ riscv_iommu_cmd_inval_vma(&cmd);
+ riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
+ riscv_iommu_cmd_send(iommu, &cmd);
+
+ riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
+ }
+}
+
+/*
+ * Send IOTLB.INVAL for the whole address space for ranges of 2MB and larger.
+ * This limit will be replaced with range invalidations, if supported by
+ * the hardware, once an update to the RISC-V IOMMU architecture
+ * specification covering range invalidations is available.
+ */
+#define RISCV_IOMMU_IOTLB_INVAL_LIMIT (2 << 20)
+
+/*
+ * Invalidate cached translations of @domain for the range [@start, @end]
+ * (@end inclusive) on every IOMMU the domain is bonded to.
+ *
+ * Ranges shorter than RISCV_IOMMU_IOTLB_INVAL_LIMIT are invalidated one
+ * PAGE_SIZE step at a time; anything else, including the full range
+ * 0..ULONG_MAX whose length wraps to zero, is handled with a single
+ * PSCID-wide invalidation. All commands are sent first, then each IOMMU is
+ * synchronized.
+ */
+static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
+ unsigned long start, unsigned long end)
+{
+ struct riscv_iommu_bond *bond;
+ struct riscv_iommu_device *iommu, *prev;
+ struct riscv_iommu_command cmd;
+ unsigned long len = end - start + 1;
+ unsigned long iova;
+
+ /*
+ * For each IOMMU linked with this protection domain (via bonds->dev),
+ * an IOTLB invalidation command will be submitted and executed.
+ *
+ * Possible race with domain attach flow is handled by sequencing
+ * bond creation - riscv_iommu_bond_link(), and device directory
+ * update - riscv_iommu_iodir_update().
+ *
+ * PTE Update / IOTLB Inval Device attach & directory update
+ * -------------------------- --------------------------
+ * update page table entries add dev to the bond list
+ * FENCE RW,RW FENCE RW,RW
+ * For all IOMMUs: (can be empty) Update FSC/PSCID
+ * FENCE IOW,IOW FENCE IOW,IOW
+ * IOTLB.INVAL IODIR.INVAL
+ * IOFENCE.C
+ *
+ * If bond list is not updated with new device, directory context will
+ * be configured with already valid page table content. If an IOMMU is
+ * linked to the protection domain it will receive invalidation
+ * requests for updated page table entries.
+ */
+ smp_mb();
+
+ rcu_read_lock();
+
+ prev = NULL;
+ list_for_each_entry_rcu(bond, &domain->bonds, list) {
+ iommu = dev_to_iommu(bond->dev);
+
+ /*
+ * IOTLB invalidation request can be safely omitted if already sent
+ * to the IOMMU for the same PSCID, and with domain->bonds list
+ * arranged based on the device's IOMMU, it's sufficient to check
+ * last device the invalidation was sent to.
+ */
+ if (iommu == prev)
+ continue;
+
+ riscv_iommu_cmd_inval_vma(&cmd);
+ riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
+ if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
+ for (iova = start; iova < end; iova += PAGE_SIZE) {
+ riscv_iommu_cmd_inval_set_addr(&cmd, iova);
+ riscv_iommu_cmd_send(iommu, &cmd);
+ }
+ } else {
+ riscv_iommu_cmd_send(iommu, &cmd);
+ }
+ prev = iommu;
+ }
+
+ /* Second pass: wait for the invalidations, once per IOMMU. */
+ prev = NULL;
+ list_for_each_entry_rcu(bond, &domain->bonds, list) {
+ iommu = dev_to_iommu(bond->dev);
+ if (iommu == prev)
+ continue;
+
+ riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
+ prev = iommu;
+ }
+ rcu_read_unlock();
+}
+
+#define RISCV_IOMMU_FSC_BARE 0
+
+/*
+ * Update IODIR for the device.
+ *
+ * During the execution of riscv_iommu_probe_device(), IODIR entries are
+ * allocated for the device's identifiers. Device context invalidation
+ * becomes necessary only if one of the updated entries was previously
+ * marked as valid, given that invalid device context entries are not
+ * cached by the IOMMU hardware.
+ * In this implementation, updating a valid device context while the
+ * device is not quiesced might be disruptive, potentially causing
+ * interim translation faults.
+ *
+ * @iommu - IOMMU managing @dev
+ * @dev - device whose fwspec ids select the device contexts to update
+ * @fsc - new first-stage context value written to dc->fsc
+ * @ta - translation attributes; the PSCID field is written to dc->ta and
+ *       bit RISCV_IOMMU_DC_TC_V is merged into dc->tc
+ *
+ * NOTE(review): the result of riscv_iommu_get_dc() is dereferenced without
+ * a NULL check in both loops; this relies on the contexts having been set up
+ * earlier (see above) - confirm against the probe path.
+ */
+static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
+ struct device *dev, u64 fsc, u64 ta)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct riscv_iommu_dc *dc;
+ struct riscv_iommu_command cmd;
+ bool sync_required = false;
+ u64 tc;
+ int i;
+
+ /* Pass 1: mark currently valid contexts invalid and flush them. */
+ for (i = 0; i < fwspec->num_ids; i++) {
+ dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
+ tc = READ_ONCE(dc->tc);
+ if (!(tc & RISCV_IOMMU_DC_TC_V))
+ continue;
+
+ WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V);
+
+ /* Invalidate device context cached values */
+ riscv_iommu_cmd_iodir_inval_ddt(&cmd);
+ riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
+ riscv_iommu_cmd_send(iommu, &cmd);
+ sync_required = true;
+ }
+
+ if (sync_required)
+ riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
+
+ /*
+ * For device context with DC_TC_PDTV = 0, translation attributes valid bit
+ * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
+ */
+ /* Pass 2: write the new context fields, then the valid bit. */
+ for (i = 0; i < fwspec->num_ids; i++) {
+ dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
+ tc = READ_ONCE(dc->tc);
+ tc |= ta & RISCV_IOMMU_DC_TC_V;
+
+ WRITE_ONCE(dc->fsc, fsc);
+ WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
+ /* Update device context, write TC.V as the last step. */
+ dma_wmb();
+ WRITE_ONCE(dc->tc, tc);
+
+ /* Invalidate device context after update */
+ riscv_iommu_cmd_iodir_inval_ddt(&cmd);
+ riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
+ riscv_iommu_cmd_send(iommu, &cmd);
+ }
+
+ riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
+}
+
+/*
+ * IOVA page translation tree management.
+ */
+
+/*
+ * iommu_domain_ops.flush_iotlb_all callback: invalidate every cached
+ * translation of the domain by requesting the full 0..ULONG_MAX range.
+ */
+static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain)
+{
+	riscv_iommu_iotlb_inval(iommu_domain_to_riscv(iommu_domain),
+				0, ULONG_MAX);
+}
+
+/*
+ * iommu_domain_ops.iotlb_sync callback: invalidate the IOVA range collected
+ * in @gather (gather->start .. gather->end) for the domain.
+ */
+static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain,
+				   struct iommu_iotlb_gather *gather)
+{
+	const unsigned long first = gather->start;
+	const unsigned long last = gather->end;
+
+	riscv_iommu_iotlb_inval(iommu_domain_to_riscv(iommu_domain), first, last);
+}
+
+/* Number of IOVA bits resolved per page table level: log2(entries per page). */
+#define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t)))
+
+/* Entry has _PAGE_PRESENT or _PAGE_PROT_NONE set. */
+#define _io_pte_present(pte) ((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE))
+/* Entry has at least one of the _PAGE_LEAF bits set. */
+#define _io_pte_leaf(pte) ((pte) & _PAGE_LEAF)
+/* Entry is completely empty. */
+#define _io_pte_none(pte) ((pte) == 0)
+/* Build an entry from page frame number @pn and protection bits @prot. */
+#define _io_pte_entry(pn, prot) ((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot))
+
+/*
+ * Release the page table page referenced by non-leaf entry @pte together
+ * with everything below it.
+ *
+ * Entries that are not present or that are leaves reference no table and
+ * are ignored. Each child entry is atomically cleared before descending into
+ * it. The table page itself is either queued on @freelist (when given) for
+ * deferred release or freed immediately.
+ */
+static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
+				 unsigned long pte, struct list_head *freelist)
+{
+	unsigned long *table, *slot;
+	unsigned long child;
+
+	if (!_io_pte_present(pte) || _io_pte_leaf(pte))
+		return;
+
+	table = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
+
+	/* Depth-first teardown of every populated slot. */
+	for (slot = table; slot < table + PTRS_PER_PTE; slot++) {
+		child = READ_ONCE(*slot);
+		if (_io_pte_none(child))
+			continue;
+		/* Somebody else changed the slot: leave it to them. */
+		if (cmpxchg_relaxed(slot, child, 0) != child)
+			continue;
+		riscv_iommu_pte_free(domain, child, freelist);
+	}
+
+	if (!freelist) {
+		iommu_free_page(table);
+		return;
+	}
+
+	list_add_tail(&virt_to_page(table)->lru, freelist);
+}
+
+/*
+ * Find, allocating intermediate tables as needed, the page table slot that
+ * maps @iova with page size @pgsize.
+ *
+ * @domain - domain whose translation tree (pgd_root / pgd_mode) is walked
+ * @iova - I/O virtual address to map
+ * @pgsize - requested mapping size; must equal the span of one table level
+ * @gfp - allocation flags for new page table pages
+ *
+ * Returns a pointer to the slot, or NULL if a page table page cannot be
+ * allocated, a leaf mapping already exists at a higher level, or @pgsize
+ * does not match any level.
+ */
+static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
+ unsigned long iova, size_t pgsize,
+ gfp_t gfp)
+{
+ unsigned long *ptr = domain->pgd_root;
+ unsigned long pte, old;
+ int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
+ void *addr;
+
+ do {
+ const int shift = PAGE_SHIFT + PT_SHIFT * level;
+
+ ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
+ /*
+ * Note: returned entry might be a non-leaf if there was
+ * existing mapping with smaller granularity. Up to the caller
+ * to replace and invalidate.
+ */
+ if (((size_t)1 << shift) == pgsize)
+ return ptr;
+pte_retry:
+ pte = READ_ONCE(*ptr);
+ /*
+ * This is very likely incorrect as we should not be adding
+ * new mapping with smaller granularity on top
+ * of existing 2M/1G mapping. Fail.
+ */
+ if (_io_pte_present(pte) && _io_pte_leaf(pte))
+ return NULL;
+ /*
+ * Non-leaf entry is missing, allocate and try to add to the
+ * page table. This might race with other mappings, retry.
+ */
+ if (_io_pte_none(pte)) {
+ addr = iommu_alloc_page_node(domain->numa_node, gfp);
+ if (!addr)
+ return NULL;
+ old = pte;
+ pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE);
+ if (cmpxchg_relaxed(ptr, old, pte) != old) {
+ iommu_free_page(addr);
+ goto pte_retry;
+ }
+ }
+ /* Descend into the next-level table referenced by the entry. */
+ ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
+ } while (level-- > 0);
+
+ return NULL;
+}
+
+/*
+ * Look up the leaf entry that translates @iova.
+ *
+ * Walks the translation tree of @domain from the root. On success returns a
+ * pointer to the leaf entry and stores the size of the region it maps in
+ * @pte_pgsize. Returns NULL, leaving @pte_pgsize untouched, when an empty
+ * entry is met or no leaf is found down to the last level.
+ */
+static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain,
+					    unsigned long iova, size_t *pte_pgsize)
+{
+	unsigned long *slot = domain->pgd_root;
+	unsigned long entry;
+	int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
+
+	for (;;) {
+		const int shift = PAGE_SHIFT + PT_SHIFT * level;
+
+		slot += (iova >> shift) & (PTRS_PER_PTE - 1);
+		entry = READ_ONCE(*slot);
+
+		if (_io_pte_none(entry))
+			return NULL;
+
+		if (_io_pte_present(entry) && _io_pte_leaf(entry)) {
+			*pte_pgsize = (size_t)1 << shift;
+			return slot;
+		}
+
+		/* Non-leaf entry: continue in the table it points to. */
+		slot = (unsigned long *)pfn_to_virt(__page_val_to_pfn(entry));
+		if (level-- <= 0)
+			break;
+	}
+
+	return NULL;
+}
+
+/*
+ * iommu_domain_ops.map_pages callback: map @pgcount pages of size @pgsize
+ * starting at @iova to the physical range starting at @phys.
+ *
+ * @prot - IOMMU_* protection flags; only IOMMU_WRITE is examined here
+ * @gfp - allocation flags for page table pages
+ * @mapped - output; number of bytes mapped, also set on failure
+ *
+ * Returns 0 on success or -ENOMEM if no page table slot could be obtained.
+ * Page table pages displaced by a new mapping are freed after a domain-wide
+ * IOTLB invalidation.
+ */
+static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
+ unsigned long iova, phys_addr_t phys,
+ size_t pgsize, size_t pgcount, int prot,
+ gfp_t gfp, size_t *mapped)
+{
+ struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+ size_t size = 0;
+ unsigned long *ptr;
+ unsigned long pte, old, pte_prot;
+ int rc = 0;
+ LIST_HEAD(freelist);
+
+ /*
+ * Writable mappings get _PAGE_DIRTY preset unless domain->amo_enabled
+ * is set, in which case the bit is left clear.
+ */
+ if (!(prot & IOMMU_WRITE))
+ pte_prot = _PAGE_BASE | _PAGE_READ;
+ else if (domain->amo_enabled)
+ pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE;
+ else
+ pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY;
+
+ while (pgcount) {
+ ptr = riscv_iommu_pte_alloc(domain, iova, pgsize, gfp);
+ if (!ptr) {
+ rc = -ENOMEM;
+ break;
+ }
+
+ /* Lost a race on this slot: look it up again and retry. */
+ old = READ_ONCE(*ptr);
+ pte = _io_pte_entry(phys_to_pfn(phys), pte_prot);
+ if (cmpxchg_relaxed(ptr, old, pte) != old)
+ continue;
+
+ /* Collect any page table subtree the previous entry pointed to. */
+ riscv_iommu_pte_free(domain, old, &freelist);
+
+ size += pgsize;
+ iova += pgsize;
+ phys += pgsize;
+ --pgcount;
+ }
+
+ *mapped = size;
+
+ if (!list_empty(&freelist)) {
+ /*
+ * In 1.0 spec version, the smallest scope we can use to
+ * invalidate all levels of page table (i.e. leaf and non-leaf)
+ * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0.
+ * This will be updated with hardware support for
+ * capability.NL (non-leaf) IOTINVAL command.
+ */
+ riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
+ iommu_put_pages_list(&freelist);
+ }
+
+ return rc;
+}
+
+/*
+ * iommu_domain_ops.unmap_pages callback: remove the mappings covering
+ * @pgcount pages of size @pgsize starting at @iova.
+ *
+ * Leaf entries are cleared one at a time, whatever their actual size, and
+ * each cleared range is added to @gather for later invalidation.
+ *
+ * Returns the number of bytes unmapped. This is less than requested when an
+ * unmapped address is reached or @iova is not aligned to the size of the
+ * leaf entry covering it.
+ */
+static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
+ unsigned long iova, size_t pgsize,
+ size_t pgcount,
+ struct iommu_iotlb_gather *gather)
+{
+ struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+ size_t size = pgcount << __ffs(pgsize);
+ unsigned long *ptr, old;
+ size_t unmapped = 0;
+ size_t pte_size;
+
+ while (unmapped < size) {
+ ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
+ if (!ptr)
+ return unmapped;
+
+ /* partial unmap is not allowed, fail. */
+ if (iova & (pte_size - 1))
+ return unmapped;
+
+ /* Entry changed under us: look it up again. */
+ old = READ_ONCE(*ptr);
+ if (cmpxchg_relaxed(ptr, old, 0) != old)
+ continue;
+
+ iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
+ pte_size);
+
+ iova += pte_size;
+ unmapped += pte_size;
+ }
+
+ return unmapped;
+}
+
+/*
+ * iommu_domain_ops.iova_to_phys callback: translate @iova through the page
+ * table of the domain.
+ *
+ * Returns the physical address @iova maps to, or 0 when there is no present
+ * leaf mapping for it. riscv_iommu_pte_fetch() returns NULL for unmapped
+ * addresses, so its result is checked before use. The entry is read once so
+ * the validity test and the address computation see the same value.
+ */
+static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
+					    dma_addr_t iova)
+{
+	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+	size_t pte_size;
+	unsigned long *ptr;
+	unsigned long pte;
+
+	ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
+	if (!ptr)
+		return 0;
+
+	pte = READ_ONCE(*ptr);
+	if (_io_pte_none(pte) || !_io_pte_present(pte))
+		return 0;
+
+	return pfn_to_phys(__page_val_to_pfn(pte)) | (iova & (pte_size - 1));
+}
+
+/*
+ * iommu_domain_ops.free callback: release a paging domain.
+ *
+ * Warns if devices are still bonded to the domain, returns a positive PSCID
+ * to riscv_iommu_pscids, frees the whole translation tree starting at the
+ * root page and finally the domain structure itself.
+ */
+static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
+{
+	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+	const unsigned long root_pfn = virt_to_pfn(domain->pgd_root);
+	const unsigned long root_pte = _io_pte_entry(root_pfn, _PAGE_TABLE);
+
+	WARN_ON(!list_empty(&domain->bonds));
+
+	if ((int)domain->pscid > 0)
+		ida_free(&riscv_iommu_pscids, domain->pscid);
+
+	/* The root page is described as a table entry so the walker frees it too. */
+	riscv_iommu_pte_free(domain, root_pte, NULL);
+	kfree(domain);
+}
+
+/*
+ * Tell whether @iommu advertises, in its capabilities, support for the page
+ * table format selected by @pgd_mode (Sv39, Sv48 or Sv57). Any other mode is
+ * reported as unsupported.
+ */
+static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode)
+{
+	if (pgd_mode == RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39)
+		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39;
+
+	if (pgd_mode == RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48)
+		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48;
+
+	if (pgd_mode == RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57)
+		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57;
+
+	return false;
+}
+
+/*
+ * iommu_domain_ops.attach_dev callback for paging domains.
+ *
+ * Links @dev to the new domain first, then points the device contexts at
+ * the domain's page table and only afterwards drops the link to the domain
+ * the device was attached to before.
+ *
+ * Returns 0 on success, -ENODEV if the IOMMU of @dev does not support the
+ * domain's page table mode, or -ENOMEM if the bond cannot be created.
+ */
+static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
+					    struct device *dev)
+{
+	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
+	u64 fsc, ta;
+
+	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
+		return -ENODEV;
+
+	/* First-stage context: root page table PPN plus translation mode. */
+	fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root));
+	fsc |= FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode);
+
+	/* Translation attributes: valid bit plus the domain's PSCID. */
+	ta = RISCV_IOMMU_PC_TA_V;
+	ta |= FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid);
+
+	if (riscv_iommu_bond_link(domain, dev) != 0)
+		return -ENOMEM;
+
+	riscv_iommu_iodir_update(iommu, dev, fsc, ta);
+	riscv_iommu_bond_unlink(info->domain, dev);
+	info->domain = domain;
+
+	return 0;
+}
+
+/* Domain operations installed for paging (translated) domains. */
+static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = {
+	/* lifetime and attachment */
+	.attach_dev	 = riscv_iommu_attach_paging_domain,
+	.free		 = riscv_iommu_free_paging_domain,
+	/* page table updates and lookup */
+	.map_pages	 = riscv_iommu_map_pages,
+	.unmap_pages	 = riscv_iommu_unmap_pages,
+	.iova_to_phys	 = riscv_iommu_iova_to_phys,
+	/* translation cache maintenance */
+	.flush_iotlb_all = riscv_iommu_iotlb_flush_all,
+	.iotlb_sync	 = riscv_iommu_iotlb_sync,
+};
+
+/*
+ * Allocate a paging domain for @dev.
+ *
+ * The widest page table mode advertised by the device's IOMMU is chosen
+ * (Sv57, then Sv48, then Sv39). Returns the embedded iommu_domain on
+ * success, ERR_PTR(-ENODEV) when none of those modes is advertised, or
+ * ERR_PTR(-ENOMEM) when the domain, its root table or a PSCID cannot be
+ * allocated.
+ */
+static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
+{
+	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+	const u64 caps = iommu->caps;
+	struct riscv_iommu_domain *rdom;
+	unsigned int pgd_mode;
+	dma_addr_t va_mask;
+	int va_bits;
+
+	if (caps & RISCV_IOMMU_CAPABILITIES_SV57) {
+		pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57;
+		va_bits = 57;
+	} else if (caps & RISCV_IOMMU_CAPABILITIES_SV48) {
+		pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48;
+		va_bits = 48;
+	} else if (caps & RISCV_IOMMU_CAPABILITIES_SV39) {
+		pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39;
+		va_bits = 39;
+	} else {
+		dev_err(dev, "cannot find supported page table mode\n");
+		return ERR_PTR(-ENODEV);
+	}
+
+	rdom = kzalloc(sizeof(*rdom), GFP_KERNEL);
+	if (!rdom)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD_RCU(&rdom->bonds);
+	spin_lock_init(&rdom->lock);
+	rdom->numa_node = dev_to_node(iommu->dev);
+	rdom->amo_enabled = !!(caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD);
+	rdom->pgd_mode = pgd_mode;
+
+	rdom->pgd_root = iommu_alloc_page_node(rdom->numa_node,
+					       GFP_KERNEL_ACCOUNT);
+	if (!rdom->pgd_root)
+		goto err_free_domain;
+
+	rdom->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
+				      RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
+	if (rdom->pscid < 0)
+		goto err_free_root;
+
+	/*
+	 * The RISC-V Privileged spec requires virtual addresses to be
+	 * sign-extended: when bit (VA_BITS - 1) is set, every bit at or
+	 * above VA_BITS must be set as well, otherwise the access page
+	 * faults. The code creating mappings above us (e.g.
+	 * iommu_dma_alloc_iova()) does not do that for now, which would
+	 * leave us with invalid virtual addresses to map. Until that is
+	 * sorted out, work around it by limiting the usable virtual
+	 * address space to VA_BITS - 1 bits.
+	 */
+	va_mask = DMA_BIT_MASK(va_bits - 1);
+
+	rdom->domain.ops = &riscv_iommu_paging_domain_ops;
+	rdom->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G);
+	rdom->domain.geometry.aperture_start = 0;
+	rdom->domain.geometry.aperture_end = va_mask;
+	rdom->domain.geometry.force_aperture = true;
+
+	return &rdom->domain;
+
+err_free_root:
+	iommu_free_page(rdom->pgd_root);
+err_free_domain:
+	kfree(rdom);
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Attach @dev to the blocking domain: the device directory entry is
+ * rewritten with a bare FSC and a zero TA (valid bit clear), after which
+ * the link to the previously attached domain, if any, is dropped.
+ * Always returns 0.
+ */
+static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
+ struct device *dev)
+{
+ struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+ struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
+
+ /* Make device context invalid, translation requests will fault w/ #258 */
+ riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0);
+ riscv_iommu_bond_unlink(info->domain, dev);
+ /* No riscv_iommu_domain is tracked for the device from here on */
+ info->domain = NULL;
+
+ return 0;
+}
+
+/* The blocking domain only implements attach. */
+static const struct iommu_domain_ops riscv_iommu_blocking_domain_ops = {
+	.attach_dev = riscv_iommu_attach_blocking_domain,
+};
+
+/* Statically allocated IOMMU_DOMAIN_BLOCKED domain. */
+static struct iommu_domain riscv_iommu_blocking_domain = {
+	.ops = &riscv_iommu_blocking_domain_ops,
+	.type = IOMMU_DOMAIN_BLOCKED,
+};
+
+/*
+ * Attach @dev to the identity domain: the device directory entry is
+ * rewritten with a bare FSC and a TA that carries only the valid bit,
+ * after which the link to the previously attached domain, if any, is
+ * dropped. Always returns 0.
+ */
+static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
+ struct device *dev)
+{
+ struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+ struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
+
+ /* Differs from the blocking attach only by RISCV_IOMMU_PC_TA_V in the TA */
+ riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V);
+ riscv_iommu_bond_unlink(info->domain, dev);
+ info->domain = NULL;
+
+ return 0;
+}
+
+/* The identity domain only implements attach. */
+static const struct iommu_domain_ops riscv_iommu_identity_domain_ops = {
+	.attach_dev = riscv_iommu_attach_identity_domain,
+};
+
+/* Statically allocated IOMMU_DOMAIN_IDENTITY domain. */
+static struct iommu_domain riscv_iommu_identity_domain = {
+	.ops = &riscv_iommu_identity_domain_ops,
+	.type = IOMMU_DOMAIN_IDENTITY,
+};
+
+/*
+ * Pick the iommu group for @dev: PCI devices go through
+ * pci_device_group(), everything else through generic_device_group().
+ */
+static struct iommu_group *riscv_iommu_device_group(struct device *dev)
+{
+	return dev_is_pci(dev) ? pci_device_group(dev)
+			       : generic_device_group(dev);
+}
+
+/*
+ * Record the first cell of the phandle arguments as a device id in the
+ * fwspec of @dev. Returns the result of iommu_fwspec_add_ids().
+ */
+static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args)
+{
+	const u32 *first_id = args->args;
+
+	return iommu_fwspec_add_ids(dev, first_id, 1);
+}
+
+/*
+ * Set up per-device state for @dev and pre-configure its device contexts.
+ *
+ * Returns the iommu_device serving @dev, ERR_PTR(-ENODEV) when the device
+ * has no usable fwspec, the IOMMU has no driver data, the IOMMU runs in
+ * BARE mode (or below), or a device context cannot be obtained, and
+ * ERR_PTR(-ENOMEM) when the per-device info cannot be allocated.
+ */
+static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
+{
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+	struct riscv_iommu_device *iommu;
+	struct riscv_iommu_info *info;
+	struct riscv_iommu_dc *dc;
+	u64 tc = 0;
+	int idx;
+
+	if (!fwspec)
+		return ERR_PTR(-ENODEV);
+	if (!fwspec->iommu_fwnode->dev || !fwspec->num_ids)
+		return ERR_PTR(-ENODEV);
+
+	iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev);
+	if (!iommu)
+		return ERR_PTR(-ENODEV);
+
+	/*
+	 * IOMMU hardware operating in fail-over BARE mode will provide
+	 * identity translation for all connected devices anyway...
+	 */
+	if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
+		return ERR_PTR(-ENODEV);
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * Allocate and pre-configure device context entries in
+	 * the device directory. Do not mark the context valid yet.
+	 */
+	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD)
+		tc |= RISCV_IOMMU_DC_TC_SADE;
+
+	for (idx = 0; idx < fwspec->num_ids; idx++) {
+		dc = riscv_iommu_get_dc(iommu, fwspec->ids[idx]);
+		if (!dc)
+			goto err_free_info;
+
+		if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V)
+			dev_warn(dev, "already attached to IOMMU device directory\n");
+		WRITE_ONCE(dc->tc, tc);
+	}
+
+	dev_iommu_priv_set(dev, info);
+
+	return &iommu->iommu;
+
+err_free_info:
+	kfree(info);
+	return ERR_PTR(-ENODEV);
+}
+
+/*
+ * Free the per-device info stored as the iommu private data of @dev,
+ * using kfree_rcu_mightsleep().
+ */
+static void riscv_iommu_release_device(struct device *dev)
+{
+	struct riscv_iommu_info *priv = dev_iommu_priv_get(dev);
+
+	kfree_rcu_mightsleep(priv);
+}
+
+/* IOMMU operations registered through iommu_device_register(). */
+static const struct iommu_ops riscv_iommu_ops = {
+	/* device discovery and lifetime */
+	.of_xlate = riscv_iommu_of_xlate,
+	.probe_device = riscv_iommu_probe_device,
+	.release_device = riscv_iommu_release_device,
+	.device_group = riscv_iommu_device_group,
+	/* static domains */
+	.identity_domain = &riscv_iommu_identity_domain,
+	.blocked_domain = &riscv_iommu_blocking_domain,
+	.release_domain = &riscv_iommu_blocking_domain,
+	/* paging domains */
+	.domain_alloc_paging = riscv_iommu_alloc_paging_domain,
+	.pgsize_bitmap = SZ_4K,
+};
+
+/*
+ * Check and prepare the hardware state before the driver takes over.
+ *
+ * Returns 0 on success, -EBUSY when DDTP reports busy or translation is
+ * already enabled outside of a kdump kernel, and -EINVAL when the required
+ * endianness cannot be selected, no interrupt is available, or the
+ * interrupt vector assignment read back from the hardware is out of range.
+ */
+static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
+{
+ u64 ddtp;
+
+ /*
+ * Make sure the IOMMU is switched off or in pass-through mode during
+ * regular boot flow and disable translation when we boot into a kexec
+ * kernel and the previous kernel left them enabled.
+ */
+ ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP);
+ if (ddtp & RISCV_IOMMU_DDTP_BUSY)
+ return -EBUSY;
+
+ if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) >
+ RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) {
+ if (!is_kdump_kernel())
+ return -EBUSY;
+ riscv_iommu_disable(iommu);
+ }
+
+ /* Configure accesses to in-memory data structures for CPU-native byte order. */
+ if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
+ !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) {
+ if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END))
+ return -EINVAL;
+ /* Flip FCTL.BE, then read it back to verify the hardware took it */
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL,
+ iommu->fctl ^ RISCV_IOMMU_FCTL_BE);
+ iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
+ if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
+ !!(iommu->fctl & RISCV_IOMMU_FCTL_BE))
+ return -EINVAL;
+ }
+
+ /*
+ * Distribute interrupt vectors, always use first vector for CIV.
+ * At least one interrupt is required. Read back and verify.
+ */
+ if (!iommu->irqs_count)
+ return -EINVAL;
+
+ /* The modulo folds the vectors onto the interrupts actually available */
+ iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) |
+ FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) |
+ FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count);
+ riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec);
+ iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC);
+ /* Every vector read back must index one of the available interrupts */
+ if (max(max(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec),
+ FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec)),
+ max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec),
+ FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count)
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Undo riscv_iommu_init(): unregister the iommu device and its sysfs
+ * entry, switch the device directory mode to OFF and disable the command
+ * and fault queues.
+ */
+void riscv_iommu_remove(struct riscv_iommu_device *iommu)
+{
+ iommu_device_unregister(&iommu->iommu);
+ iommu_device_sysfs_remove(&iommu->iommu);
+ riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
+ riscv_iommu_queue_disable(&iommu->cmdq);
+ riscv_iommu_queue_disable(&iommu->fltq);
+}
+
+/*
+ * Bring up an IOMMU instance: check the hardware state, allocate the
+ * device directory and both queues, enable the queues, select the device
+ * directory mode and register with sysfs and the iommu core.
+ *
+ * Returns 0 on success or a negative errno. Steps performed after the
+ * command queue was enabled are unwound in reverse order on failure.
+ */
+int riscv_iommu_init(struct riscv_iommu_device *iommu)
+{
+ int rc;
+
+ RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ);
+ RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ);
+
+ rc = riscv_iommu_init_check(iommu);
+ if (rc)
+ return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
+
+ /*
+ * NOTE(review): the failures below return without explicit cleanup;
+ * presumably these allocations are device-managed - confirm.
+ */
+ rc = riscv_iommu_iodir_alloc(iommu);
+ if (rc)
+ return rc;
+
+ rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq,
+ sizeof(struct riscv_iommu_command));
+ if (rc)
+ return rc;
+
+ rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq,
+ sizeof(struct riscv_iommu_fq_record));
+ if (rc)
+ return rc;
+
+ rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
+ if (rc)
+ return rc;
+
+ rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process);
+ if (rc)
+ goto err_queue_disable;
+
+ rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX);
+ if (rc)
+ goto err_queue_disable;
+
+ rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
+ dev_name(iommu->dev));
+ if (rc) {
+ dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n");
+ goto err_iodir_off;
+ }
+
+ rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev);
+ if (rc) {
+ dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n");
+ goto err_remove_sysfs;
+ }
+
+ return 0;
+
+ /* Error unwinding: each label undoes one step and falls through */
+err_remove_sysfs:
+ iommu_device_sysfs_remove(&iommu->iommu);
+err_iodir_off:
+ riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
+err_queue_disable:
+ riscv_iommu_queue_disable(&iommu->fltq);
+ riscv_iommu_queue_disable(&iommu->cmdq);
+ return rc;
+}
diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
new file mode 100644
index 000000000000..b1c4664542b4
--- /dev/null
+++ b/drivers/iommu/riscv/iommu.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright © 2022-2024 Rivos Inc.
+ * Copyright © 2023 FORTH-ICS/CARV
+ *
+ * Authors
+ * Tomasz Jeznach <tjeznach@rivosinc.com>
+ * Nick Kossifidis <mick@ics.forth.gr>
+ */
+
+#ifndef _RISCV_IOMMU_H_
+#define _RISCV_IOMMU_H_
+
+#include <linux/iommu.h>
+#include <linux/types.h>
+#include <linux/iopoll.h>
+
+#include "iommu-bits.h"
+
+struct riscv_iommu_device;
+
+/*
+ * Software state of one hardware queue. struct riscv_iommu_device embeds
+ * one instance for the command queue and one for the fault queue.
+ */
+struct riscv_iommu_queue {
+ atomic_t prod; /* unbounded producer allocation index */
+ atomic_t head; /* unbounded shadow ring buffer consumer index */
+ atomic_t tail; /* unbounded shadow ring buffer producer index */
+ unsigned int mask; /* index mask, queue length - 1 */
+ unsigned int irq; /* allocated interrupt number */
+ struct riscv_iommu_device *iommu; /* iommu device handling the queue when active */
+ void *base; /* ring buffer kernel pointer */
+ dma_addr_t phys; /* ring buffer physical address */
+ u16 qbr; /* base register offset, head and tail reference */
+ u16 qcr; /* control and status register offset */
+ u8 qid; /* queue identifier, same as RISCV_IOMMU_INTR_XX */
+};
+
+/*
+ * Per-instance driver state of one IOMMU. It is the argument of
+ * riscv_iommu_init() and riscv_iommu_remove() and of the register
+ * accessor macros in this header.
+ */
+struct riscv_iommu_device {
+ /* iommu core interface */
+ struct iommu_device iommu;
+
+ /* iommu hardware */
+ struct device *dev;
+
+ /* hardware control register space */
+ void __iomem *reg;
+
+ /* supported and enabled hardware capabilities */
+ u64 caps;
+ u32 fctl;
+
+ /* available interrupt numbers, MSI or WSI */
+ unsigned int irqs[RISCV_IOMMU_INTR_COUNT];
+ unsigned int irqs_count;
+ unsigned int icvec;
+
+ /* hardware queues */
+ struct riscv_iommu_queue cmdq;
+ struct riscv_iommu_queue fltq;
+
+ /* device directory */
+ unsigned int ddt_mode;
+ dma_addr_t ddt_phys;
+ u64 *ddt_root;
+};
+
+/*
+ * Common bring-up and teardown entry points. riscv_iommu_init() returns 0
+ * on success or a negative errno; riscv_iommu_remove() undoes it.
+ */
+int riscv_iommu_init(struct riscv_iommu_device *iommu);
+void riscv_iommu_remove(struct riscv_iommu_device *iommu);
+
+/*
+ * Register accessors. @addr is an offset added to the mapped register
+ * base (iommu)->reg; all of them use the relaxed MMIO accessors.
+ */
+#define riscv_iommu_readl(iommu, addr) \
+ readl_relaxed((iommu)->reg + (addr))
+
+#define riscv_iommu_readq(iommu, addr) \
+ readq_relaxed((iommu)->reg + (addr))
+
+#define riscv_iommu_writel(iommu, addr, val) \
+ writel_relaxed((val), (iommu)->reg + (addr))
+
+#define riscv_iommu_writeq(iommu, addr, val) \
+ writeq_relaxed((val), (iommu)->reg + (addr))
+
+/*
+ * Polling variants built on readx_poll_timeout(): the register is read
+ * into @val and @cond, @delay_us and @timeout_us are passed through
+ * unchanged.
+ */
+#define riscv_iommu_readq_timeout(iommu, addr, val, cond, delay_us, timeout_us) \
+ readx_poll_timeout(readq_relaxed, (iommu)->reg + (addr), val, cond, \
+ delay_us, timeout_us)
+
+#define riscv_iommu_readl_timeout(iommu, addr, val, cond, delay_us, timeout_us) \
+ readx_poll_timeout(readl_relaxed, (iommu)->reg + (addr), val, cond, \
+ delay_us, timeout_us)
+
+#endif
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index d8eaa7ea380b..fbdeded3d48b 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -33,6 +33,8 @@ struct s390_domain {
struct rcu_head rcu;
};
+static struct iommu_domain blocking_domain;
+
static inline unsigned int calc_rtx(dma_addr_t ptr)
{
return ((unsigned long)ptr >> ZPCI_RT_SHIFT) & ZPCI_INDEX_MASK;
@@ -369,20 +371,36 @@ static void s390_domain_free(struct iommu_domain *domain)
call_rcu(&s390_domain->rcu, s390_iommu_rcu_free_domain);
}
-static void s390_iommu_detach_device(struct iommu_domain *domain,
- struct device *dev)
+/* Record @domain as the domain of @zdev, under zdev->dom_lock. */
+static void zdev_s390_domain_update(struct zpci_dev *zdev,
+ struct iommu_domain *domain)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&zdev->dom_lock, flags);
+ zdev->s390_domain = domain;
+ spin_unlock_irqrestore(&zdev->dom_lock, flags);
+}
+
+static int blocking_domain_attach_device(struct iommu_domain *domain,
+ struct device *dev)
{
- struct s390_domain *s390_domain = to_s390_domain(domain);
struct zpci_dev *zdev = to_zpci_dev(dev);
+ struct s390_domain *s390_domain;
unsigned long flags;
+ if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED)
+ return 0;
+
+ s390_domain = to_s390_domain(zdev->s390_domain);
spin_lock_irqsave(&s390_domain->list_lock, flags);
list_del_rcu(&zdev->iommu_list);
spin_unlock_irqrestore(&s390_domain->list_lock, flags);
zpci_unregister_ioat(zdev, 0);
- zdev->s390_domain = NULL;
zdev->dma_table = NULL;
+ zdev_s390_domain_update(zdev, domain);
+
+ return 0;
}
static int s390_iommu_attach_device(struct iommu_domain *domain,
@@ -401,20 +419,15 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
domain->geometry.aperture_end < zdev->start_dma))
return -EINVAL;
- if (zdev->s390_domain)
- s390_iommu_detach_device(&zdev->s390_domain->domain, dev);
+ blocking_domain_attach_device(&blocking_domain, dev);
+ /* If we fail now DMA remains blocked via blocking domain */
cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
virt_to_phys(s390_domain->dma_table), &status);
- /*
- * If the device is undergoing error recovery the reset code
- * will re-establish the new domain.
- */
if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL)
return -EIO;
-
zdev->dma_table = s390_domain->dma_table;
- zdev->s390_domain = s390_domain;
+ zdev_s390_domain_update(zdev, domain);
spin_lock_irqsave(&s390_domain->list_lock, flags);
list_add_rcu(&zdev->iommu_list, &s390_domain->devices);
@@ -466,19 +479,11 @@ static struct iommu_device *s390_iommu_probe_device(struct device *dev)
if (zdev->tlb_refresh)
dev->iommu->shadow_on_flush = 1;
- return &zdev->iommu_dev;
-}
+ /* Start with DMA blocked */
+ spin_lock_init(&zdev->dom_lock);
+ zdev_s390_domain_update(zdev, &blocking_domain);
-static void s390_iommu_release_device(struct device *dev)
-{
- struct zpci_dev *zdev = to_zpci_dev(dev);
-
- /*
- * release_device is expected to detach any domain currently attached
- * to the device, but keep it attached to other devices in the group.
- */
- if (zdev)
- s390_iommu_detach_device(&zdev->s390_domain->domain, dev);
+ return &zdev->iommu_dev;
}
static int zpci_refresh_all(struct zpci_dev *zdev)
@@ -697,9 +702,15 @@ static size_t s390_iommu_unmap_pages(struct iommu_domain *domain,
struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev)
{
- if (!zdev || !zdev->s390_domain)
+ struct s390_domain *s390_domain;
+
+ lockdep_assert_held(&zdev->dom_lock);
+
+ if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED)
return NULL;
- return &zdev->s390_domain->ctrs;
+
+ s390_domain = to_s390_domain(zdev->s390_domain);
+ return &s390_domain->ctrs;
}
int zpci_init_iommu(struct zpci_dev *zdev)
@@ -776,11 +787,19 @@ static int __init s390_iommu_init(void)
}
subsys_initcall(s390_iommu_init);
+/*
+ * Static IOMMU_DOMAIN_BLOCKED domain; attaching a device to it goes
+ * through blocking_domain_attach_device().
+ */
+static struct iommu_domain blocking_domain = {
+ .type = IOMMU_DOMAIN_BLOCKED,
+ .ops = &(const struct iommu_domain_ops) {
+ .attach_dev = blocking_domain_attach_device,
+ }
+};
+
static const struct iommu_ops s390_iommu_ops = {
+ .blocked_domain = &blocking_domain,
+ .release_domain = &blocking_domain,
.capable = s390_iommu_capable,
.domain_alloc_paging = s390_domain_alloc_paging,
.probe_device = s390_iommu_probe_device,
- .release_device = s390_iommu_release_device,
.device_group = generic_device_group,
.pgsize_bitmap = SZ_4K,
.get_resv_regions = s390_iommu_get_resv_regions,
diff --git a/drivers/media/platform/nvidia/tegra-vde/iommu.c b/drivers/media/platform/nvidia/tegra-vde/iommu.c
index 5521ed3e465f..b1d9d841d944 100644
--- a/drivers/media/platform/nvidia/tegra-vde/iommu.c
+++ b/drivers/media/platform/nvidia/tegra-vde/iommu.c
@@ -78,9 +78,10 @@ int tegra_vde_iommu_init(struct tegra_vde *vde)
arm_iommu_release_mapping(mapping);
}
#endif
- vde->domain = iommu_domain_alloc(&platform_bus_type);
- if (!vde->domain) {
- err = -ENOMEM;
+ vde->domain = iommu_paging_domain_alloc(dev);
+ if (IS_ERR(vde->domain)) {
+ err = PTR_ERR(vde->domain);
+ vde->domain = NULL;
goto put_group;
}
diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index f276956f2c5c..eb66f78ec8b7 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -109,10 +109,10 @@ static int rproc_enable_iommu(struct rproc *rproc)
return 0;
}
- domain = iommu_domain_alloc(dev->bus);
- if (!domain) {
+ domain = iommu_paging_domain_alloc(dev);
+ if (IS_ERR(domain)) {
dev_err(dev, "can't alloc iommu domain\n");
- return -ENOMEM;
+ return PTR_ERR(domain);
}
iommu_set_fault_handler(domain, rproc_iommu_fault, rproc);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index bd722f473635..9a3215b5c1e5 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -511,8 +511,6 @@ static inline int __iommu_copy_struct_from_user_array(
* the caller iommu_domain_alloc() returns.
* @domain_alloc_user: Allocate an iommu domain corresponding to the input
* parameters as defined in include/uapi/linux/iommufd.h.
- * Unlike @domain_alloc, it is called only by IOMMUFD and
- * must fully initialize the new domain before return.
* Upon success, if the @user_data is valid and the @parent
* points to a kernel-managed domain, the new domain must be
* IOMMU_DOMAIN_NESTED type; otherwise, the @parent must be
@@ -784,12 +782,14 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather)
};
}
-extern int bus_iommu_probe(const struct bus_type *bus);
extern bool iommu_present(const struct bus_type *bus);
extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap);
extern bool iommu_group_has_isolated_msi(struct iommu_group *group);
-extern struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus);
-struct iommu_domain *iommu_paging_domain_alloc(struct device *dev);
+struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, unsigned int flags);
+/* Convenience wrapper: allocate a paging domain for @dev with no flags set. */
+static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev)
+{
+ return iommu_paging_domain_alloc_flags(dev, 0);
+}
extern void iommu_domain_free(struct iommu_domain *domain);
extern int iommu_attach_device(struct iommu_domain *domain,
struct device *dev);
@@ -1081,19 +1081,15 @@ struct iommu_iotlb_gather {};
struct iommu_dirty_bitmap {};
struct iommu_dirty_ops {};
-static inline bool iommu_present(const struct bus_type *bus)
-{
- return false;
-}
-
static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap)
{
return false;
}
-static inline struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus)
+static inline struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev,
+ unsigned int flags)
{
- return NULL;
+ return ERR_PTR(-ENODEV);
}
static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev)
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 72010f71c5e4..0c0ed28ee113 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -359,11 +359,19 @@ struct iommu_vfio_ioas {
* enforced on device attachment
* @IOMMU_HWPT_FAULT_ID_VALID: The fault_id field of hwpt allocation data is
* valid.
+ * @IOMMU_HWPT_ALLOC_PASID: Requests a domain that can be used with PASID. The
+ * domain can be attached to any PASID on the device.
+ * Any domain attached to the non-PASID part of the
+ * device must also be flagged, otherwise attaching a
+ * PASID will be blocked.
+ * If IOMMU does not support PASID it will return
+ * error (-EOPNOTSUPP).
*/
enum iommufd_hwpt_alloc_flags {
IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0,
IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1,
IOMMU_HWPT_FAULT_ID_VALID = 1 << 2,
+ IOMMU_HWPT_ALLOC_PASID = 1 << 3,
};
/**