-rw-r--r--  Documentation/gpu/rfc/gpusvm.rst | 107
-rw-r--r--  Documentation/gpu/rfc/index.rst | 4
-rw-r--r--  drivers/base/component.c | 3
-rw-r--r--  drivers/base/devres.c | 12
-rw-r--r--  drivers/gpu/drm/Kconfig | 9
-rw-r--r--  drivers/gpu/drm/Makefile | 1
-rw-r--r--  drivers/gpu/drm/drm_gpusvm.c | 2236
-rw-r--r--  drivers/gpu/drm/xe/Kconfig | 10
-rw-r--r--  drivers/gpu/drm/xe/Makefile | 3
-rw-r--r--  drivers/gpu/drm/xe/abi/guc_actions_abi.h | 1
-rw-r--r--  drivers/gpu/drm/xe/abi/guc_actions_slpc_abi.h | 3
-rw-r--r--  drivers/gpu/drm/xe/display/xe_display.c | 13
-rw-r--r--  drivers/gpu/drm/xe/display/xe_display.h | 1
-rw-r--r--  drivers/gpu/drm/xe/regs/xe_engine_regs.h | 1
-rw-r--r--  drivers/gpu/drm/xe/regs/xe_eu_stall_regs.h | 29
-rw-r--r--  drivers/gpu/drm/xe/regs/xe_gt_regs.h | 7
-rw-r--r--  drivers/gpu/drm/xe/regs/xe_regs.h | 4
-rw-r--r--  drivers/gpu/drm/xe/tests/xe_pci.c | 26
-rw-r--r--  drivers/gpu/drm/xe/xe_bo.c | 54
-rw-r--r--  drivers/gpu/drm/xe/xe_bo.h | 20
-rw-r--r--  drivers/gpu/drm/xe/xe_bo_types.h | 4
-rw-r--r--  drivers/gpu/drm/xe/xe_devcoredump.c | 8
-rw-r--r--  drivers/gpu/drm/xe/xe_device.c | 101
-rw-r--r--  drivers/gpu/drm/xe/xe_device.h | 3
-rw-r--r--  drivers/gpu/drm/xe/xe_device_sysfs.c | 6
-rw-r--r--  drivers/gpu/drm/xe/xe_device_types.h | 36
-rw-r--r--  drivers/gpu/drm/xe/xe_eu_stall.c | 960
-rw-r--r--  drivers/gpu/drm/xe/xe_eu_stall.h | 24
-rw-r--r--  drivers/gpu/drm/xe/xe_exec_queue.c | 11
-rw-r--r--  drivers/gpu/drm/xe/xe_exec_queue_types.h | 2
-rw-r--r--  drivers/gpu/drm/xe/xe_gen_wa_oob.c | 6
-rw-r--r--  drivers/gpu/drm/xe/xe_gsc_proxy.c | 2
-rw-r--r--  drivers/gpu/drm/xe/xe_gt.c | 13
-rw-r--r--  drivers/gpu/drm/xe/xe_gt_clock.c | 57
-rw-r--r--  drivers/gpu/drm/xe/xe_gt_debugfs.c | 11
-rw-r--r--  drivers/gpu/drm/xe/xe_gt_pagefault.c | 20
-rw-r--r--  drivers/gpu/drm/xe/xe_gt_sriov_pf_service.c | 5
-rw-r--r--  drivers/gpu/drm/xe/xe_gt_sriov_vf.c | 9
-rw-r--r--  drivers/gpu/drm/xe/xe_gt_stats.c | 8
-rw-r--r--  drivers/gpu/drm/xe/xe_gt_stats_types.h | 2
-rw-r--r--  drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 22
-rw-r--r--  drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h | 2
-rw-r--r--  drivers/gpu/drm/xe/xe_gt_topology.h | 13
-rw-r--r--  drivers/gpu/drm/xe/xe_gt_types.h | 15
-rw-r--r--  drivers/gpu/drm/xe/xe_guc.c | 5
-rw-r--r--  drivers/gpu/drm/xe/xe_guc_ads.c | 2
-rw-r--r--  drivers/gpu/drm/xe/xe_guc_engine_activity.c | 373
-rw-r--r--  drivers/gpu/drm/xe/xe_guc_engine_activity.h | 19
-rw-r--r--  drivers/gpu/drm/xe/xe_guc_engine_activity_types.h | 92
-rw-r--r--  drivers/gpu/drm/xe/xe_guc_fwif.h | 19
-rw-r--r--  drivers/gpu/drm/xe/xe_guc_pc.c | 16
-rw-r--r--  drivers/gpu/drm/xe/xe_guc_submit.c | 10
-rw-r--r--  drivers/gpu/drm/xe/xe_guc_types.h | 4
-rw-r--r--  drivers/gpu/drm/xe/xe_heci_gsc.c | 39
-rw-r--r--  drivers/gpu/drm/xe/xe_heci_gsc.h | 3
-rw-r--r--  drivers/gpu/drm/xe/xe_hmm.c | 188
-rw-r--r--  drivers/gpu/drm/xe/xe_hmm.h | 7
-rw-r--r--  drivers/gpu/drm/xe/xe_hw_engine_group.c | 1
-rw-r--r--  drivers/gpu/drm/xe/xe_migrate.c | 175
-rw-r--r--  drivers/gpu/drm/xe/xe_migrate.h | 10
-rw-r--r--  drivers/gpu/drm/xe/xe_module.c | 7
-rw-r--r--  drivers/gpu/drm/xe/xe_module.h | 2
-rw-r--r--  drivers/gpu/drm/xe/xe_oa.c | 35
-rw-r--r--  drivers/gpu/drm/xe/xe_observation.c | 14
-rw-r--r--  drivers/gpu/drm/xe/xe_pci.c | 245
-rw-r--r--  drivers/gpu/drm/xe/xe_pci_sriov.c | 51
-rw-r--r--  drivers/gpu/drm/xe/xe_pci_types.h | 15
-rw-r--r--  drivers/gpu/drm/xe/xe_pmu.c | 175
-rw-r--r--  drivers/gpu/drm/xe/xe_pt.c | 493
-rw-r--r--  drivers/gpu/drm/xe/xe_pt.h | 5
-rw-r--r--  drivers/gpu/drm/xe/xe_pt_types.h | 2
-rw-r--r--  drivers/gpu/drm/xe/xe_pt_walk.c | 3
-rw-r--r--  drivers/gpu/drm/xe/xe_pt_walk.h | 4
-rw-r--r--  drivers/gpu/drm/xe/xe_pxp.c | 78
-rw-r--r--  drivers/gpu/drm/xe/xe_query.c | 50
-rw-r--r--  drivers/gpu/drm/xe/xe_res_cursor.h | 123
-rw-r--r--  drivers/gpu/drm/xe/xe_ring_ops.c | 4
-rw-r--r--  drivers/gpu/drm/xe/xe_survivability_mode.c | 77
-rw-r--r--  drivers/gpu/drm/xe/xe_survivability_mode.h | 5
-rw-r--r--  drivers/gpu/drm/xe/xe_svm.c | 946
-rw-r--r--  drivers/gpu/drm/xe/xe_svm.h | 150
-rw-r--r--  drivers/gpu/drm/xe/xe_tile.c | 5
-rw-r--r--  drivers/gpu/drm/xe/xe_trace.h | 30
-rw-r--r--  drivers/gpu/drm/xe/xe_trace_guc.h | 49
-rw-r--r--  drivers/gpu/drm/xe/xe_tuning.c | 72
-rw-r--r--  drivers/gpu/drm/xe/xe_tuning.h | 3
-rw-r--r--  drivers/gpu/drm/xe/xe_uc.c | 3
-rw-r--r--  drivers/gpu/drm/xe/xe_vm.c | 521
-rw-r--r--  drivers/gpu/drm/xe/xe_vm.h | 25
-rw-r--r--  drivers/gpu/drm/xe/xe_vm_types.h | 65
-rw-r--r--  drivers/gpu/drm/xe/xe_wa.c | 19
-rw-r--r--  drivers/gpu/drm/xe/xe_wa_oob.rules | 10
-rw-r--r--  include/drm/drm_gpusvm.h | 509
-rw-r--r--  include/drm/drm_gpuvm.h | 5
-rw-r--r--  include/drm/drm_pagemap.h | 107
-rw-r--r--  include/linux/migrate.h | 1
-rw-r--r--  include/uapi/drm/xe_drm.h | 117
-rw-r--r--  mm/memory.c | 13
-rw-r--r--  mm/migrate_device.c | 116
99 files changed, 8210 insertions, 791 deletions
diff --git a/Documentation/gpu/rfc/gpusvm.rst b/Documentation/gpu/rfc/gpusvm.rst
new file mode 100644
index 000000000000..073e46065d9c
--- /dev/null
+++ b/Documentation/gpu/rfc/gpusvm.rst
@@ -0,0 +1,107 @@
+.. SPDX-License-Identifier: (GPL-2.0+ OR MIT)
+
+===============
+GPU SVM Section
+===============
+
+Agreed upon design principles
+=============================
+
+* migrate_to_ram path
+ * Rely only on core MM concepts (migration PTEs, page references, and
+ page locking).
+ * No driver-specific locks other than locks for hardware interaction in
+ this path. These are not required, and it is generally a bad idea to
+ invent driver-defined locks to seal core MM races.
+ * An example of a driver-specific lock causing issues occurred before
+ fixing do_swap_page to lock the faulting page. A driver-exclusive lock
+ in migrate_to_ram produced a stable livelock if enough threads read
+ the faulting page.
+ * Partial migration is supported (i.e., a subset of pages attempting to
+ migrate can actually migrate, with only the faulting page guaranteed
+ to migrate).
+ * Driver handles mixed migrations via retry loops rather than locking.
+* Eviction
+ * Eviction is defined as migrating data from the GPU back to the
+ CPU without a virtual address to free up GPU memory.
+ * Only looking at physical memory data structures and locks as opposed to
+ looking at virtual memory data structures and locks.
+ * No looking at mm/vma structs or relying on those being locked.
+ * The rationale for the above two points is that CPU virtual addresses
+ can change at any moment, while the physical pages remain stable.
+ * GPU page table invalidation, which requires a GPU virtual address, is
+ handled via the notifier that has access to the GPU virtual address.
+* GPU fault side
+ * The mmap_read lock is only used around core MM functions which require
+ it, and the driver should strive to take the mmap_read lock only in the
+ GPU SVM layer.
+ * Big retry loop to handle all races with the mmu notifier under the gpu
+ pagetable locks/mmu notifier range lock/whatever we end up calling
+ those.
+ * Races (especially against concurrent eviction or migrate_to_ram)
+ should not be handled on the fault side by trying to hold locks;
+ rather, they should be handled using retry loops. One possible
+ exception is holding a BO's dma-resv lock during the initial migration
+ to VRAM, as this is a well-defined lock that can be taken underneath
+ the mmap_read lock.
+ * One possible issue with the above approach is if a driver has a strict
+ migration policy requiring GPU access to occur in GPU memory.
+ Concurrent CPU access could cause a livelock due to endless retries.
+ While no current user (Xe) of GPU SVM has such a policy, it is likely
+ to be added in the future. Ideally, this should be resolved on the
+ core-MM side rather than through a driver-side lock.
+* Physical memory to virtual backpointer
+ * This does not work, as no pointers from physical memory to virtual
+ memory should exist. mremap() is an example of the core MM updating
+ the virtual address without notifying the driver of the address
+ change; the driver only receives the invalidation notifier.
+ * The physical memory backpointer (page->zone_device_data) should remain
+ stable from allocation to page free. Safely updating this against a
+ concurrent user would be very difficult unless the page is free.
+* GPU pagetable locking
+ * The notifier lock only protects the range tree, the pages-valid state
+ for a range (rather than a seqno, due to wider notifiers), pagetable
+ entries, and mmu notifier seqno tracking; it is not a global lock to
+ protect against races.
+ * All races handled with big retry as mentioned above.
+
+Overview of baseline design
+===========================
+
+The baseline design is as simple as possible in order to get a working
+baseline which can be built upon.
+
+.. kernel-doc:: drivers/gpu/drm/drm_gpusvm.c
+ :doc: Overview
+ :doc: Locking
+ :doc: Migration
+ :doc: Partial Unmapping of Ranges
+ :doc: Examples
+
+Possible future design features
+===============================
+
+* Concurrent GPU faults
+ * CPU faults are concurrent, so it makes sense to have concurrent GPU
+ faults.
+ * Should be possible with fine-grained locking in the driver GPU
+ fault handler.
+ * No expected GPU SVM changes required.
+* Ranges with mixed system and device pages
+ * Can be added if required to drm_gpusvm_get_pages fairly easily.
+* Multi-GPU support
+ * Work in progress, with patches expected after GPU SVM initially
+ lands.
+ * Ideally can be done with little to no changes to GPU SVM.
+* Drop ranges in favor of radix tree
+ * May be desirable for faster notifiers.
+* Compound device pages
+ * Nvidia, AMD, and Intel have all agreed that the expensive core MM
+ functions in the migrate device layer are a performance bottleneck;
+ compound device pages should help increase performance by reducing the
+ number of these expensive calls.
+* Higher order dma mapping for migration
+ * 4k dma mapping adversely affects migration performance on Intel
+ hardware; higher-order (2M) dma mapping should help here.
+* Build common userptr implementation on top of GPU SVM
+* Driver side madvise implementation and migration policies
+* Pull in pending dma-mapping API changes from Leon / Nvidia when these land
diff --git a/Documentation/gpu/rfc/index.rst b/Documentation/gpu/rfc/index.rst
index 476719771eef..396e535377fb 100644
--- a/Documentation/gpu/rfc/index.rst
+++ b/Documentation/gpu/rfc/index.rst
@@ -18,6 +18,10 @@ host such documentation:
.. toctree::
+ gpusvm.rst
+
+.. toctree::
+
i915_gem_lmem.rst
.. toctree::
diff --git a/drivers/base/component.c b/drivers/base/component.c
index d63e01f4851d..a482708566bc 100644
--- a/drivers/base/component.c
+++ b/drivers/base/component.c
@@ -588,6 +588,9 @@ static void component_unbind(struct component *component,
{
WARN_ON(!component->bound);
+ dev_dbg(adev->parent, "unbinding %s component %p (ops %ps)\n",
+ dev_name(component->dev), component, component->ops);
+
if (component->ops && component->ops->unbind)
component->ops->unbind(component->dev, adev->parent, data);
component->bound = false;
diff --git a/drivers/base/devres.c b/drivers/base/devres.c
index 93e7779ef21e..d8a733ea5e1a 100644
--- a/drivers/base/devres.c
+++ b/drivers/base/devres.c
@@ -576,7 +576,10 @@ void *devres_open_group(struct device *dev, void *id, gfp_t gfp)
}
EXPORT_SYMBOL_GPL(devres_open_group);
-/* Find devres group with ID @id. If @id is NULL, look for the latest. */
+/*
+ * Find devres group with ID @id. If @id is NULL, look for the latest open
+ * group.
+ */
static struct devres_group *find_group(struct device *dev, void *id)
{
struct devres_node *node;
@@ -687,6 +690,13 @@ int devres_release_group(struct device *dev, void *id)
spin_unlock_irqrestore(&dev->devres_lock, flags);
release_nodes(dev, &todo);
+ } else if (list_empty(&dev->devres_head)) {
+ /*
+ * dev is probably dying via devres_release_all(): groups
+ * have already been removed and are in the process of
+ * being released - don't touch and don't warn.
+ */
+ spin_unlock_irqrestore(&dev->devres_lock, flags);
} else {
WARN_ON(1);
spin_unlock_irqrestore(&dev->devres_lock, flags);
diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index e5b59de28216..1be14d8634f4 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -278,6 +278,15 @@ config DRM_GPUVM
GPU-VM representation providing helpers to manage a GPUs virtual
address space
+config DRM_GPUSVM
+ tristate
+ depends on DRM && DEVICE_PRIVATE
+ select HMM_MIRROR
+ select MMU_NOTIFIER
+ help
+ GPU-SVM representation providing helpers to manage a GPU's shared
+ virtual memory
+
config DRM_BUDDY
tristate
depends on DRM
diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index 4cd054188faf..ed54a546bbe2 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -104,6 +104,7 @@ obj-$(CONFIG_DRM_PANEL_BACKLIGHT_QUIRKS) += drm_panel_backlight_quirks.o
#
obj-$(CONFIG_DRM_EXEC) += drm_exec.o
obj-$(CONFIG_DRM_GPUVM) += drm_gpuvm.o
+obj-$(CONFIG_DRM_GPUSVM) += drm_gpusvm.o
obj-$(CONFIG_DRM_BUDDY) += drm_buddy.o
diff --git a/drivers/gpu/drm/drm_gpusvm.c b/drivers/gpu/drm/drm_gpusvm.c
new file mode 100644
index 000000000000..f314f5c4af0f
--- /dev/null
+++ b/drivers/gpu/drm/drm_gpusvm.c
@@ -0,0 +1,2236 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+/*
+ * Copyright © 2024 Intel Corporation
+ *
+ * Authors:
+ * Matthew Brost <matthew.brost@intel.com>
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/hmm.h>
+#include <linux/memremap.h>
+#include <linux/migrate.h>
+#include <linux/mm_types.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+
+#include <drm/drm_device.h>
+#include <drm/drm_gpusvm.h>
+#include <drm/drm_pagemap.h>
+#include <drm/drm_print.h>
+
+/**
+ * DOC: Overview
+ *
+ * GPU Shared Virtual Memory (GPU SVM) layer for the Direct Rendering Manager (DRM)
+ *
+ * The GPU SVM layer is a component of the DRM framework designed to manage shared
+ * virtual memory between the CPU and GPU. It enables efficient data exchange and
+ * processing for GPU-accelerated applications by allowing memory sharing and
+ * synchronization between the CPU's and GPU's virtual address spaces.
+ *
+ * Key GPU SVM Components:
+ * - Notifiers: Used for tracking memory intervals and notifying the
+ * GPU of changes, notifiers are sized based on a GPU SVM
+ * initialization parameter, with a recommendation of 512M or
+ * larger. They maintain a Red-Black tree and a list of ranges that
+ * fall within the notifier interval. Notifiers are tracked within
+ * a GPU SVM Red-Black tree and list and are dynamically inserted
+ * or removed as ranges within the interval are created or
+ * destroyed.
+ * - Ranges: Represent memory ranges mapped in a DRM device and managed
+ * by GPU SVM. They are sized based on an array of chunk sizes, which
+ * is a GPU SVM initialization parameter, and the CPU address space.
+ * Upon GPU fault, the largest aligned chunk that fits within the
+ * faulting CPU address space is chosen for the range size. Ranges are
+ * expected to be dynamically allocated on GPU fault and removed on an
+ * MMU notifier UNMAP event. As mentioned above, ranges are tracked in
+ * a notifier's Red-Black tree.
+ * - Operations: Define the interface for driver-specific GPU SVM operations
+ * such as range allocation, notifier allocation, and
+ * invalidations.
+ * - Device Memory Allocations: Embedded structure containing enough information
+ * for GPU SVM to migrate to / from device memory.
+ * - Device Memory Operations: Define the interface for driver-specific device
+ * memory operations, such as releasing memory, populating pfns,
+ * and copying to / from device memory.
+ *
+ * This layer provides interfaces for allocating, mapping, migrating, and
+ * releasing memory ranges between the CPU and GPU. It handles all core memory
+ * management interactions (DMA mapping, HMM, and migration) and provides
+ * driver-specific virtual functions (vfuncs). This infrastructure is sufficient
+ * to build the expected driver components for an SVM implementation as detailed
+ * below.
+ *
+ * Expected Driver Components:
+ * - GPU page fault handler: Used to create ranges and notifiers based on the
+ * fault address, optionally migrate the range to
+ * device memory, and create GPU bindings.
+ * - Garbage collector: Used to unmap and destroy GPU bindings for ranges.
+ * Ranges are expected to be added to the garbage collector
+ * upon a MMU_NOTIFY_UNMAP event in notifier callback.
+ * - Notifier callback: Used to invalidate and DMA unmap GPU bindings for
+ * ranges.
+ */
+
+/**
+ * DOC: Locking
+ *
+ * GPU SVM handles locking for core MM interactions, i.e., it locks/unlocks the
+ * mmap lock as needed.
+ *
+ * GPU SVM introduces a global notifier lock, which safeguards the notifier's
+ * range RB tree and list, as well as the range's DMA mappings and sequence
+ * number. GPU SVM manages all necessary locking and unlocking operations,
+ * except for rechecking that a range's pages are valid
+ * (drm_gpusvm_range_pages_valid) when the driver is committing GPU bindings. This
+ * lock corresponds to the 'driver->update' lock mentioned in the HMM
+ * documentation (TODO: Link). Future revisions may transition from a GPU SVM
+ * global lock to a per-notifier lock if finer-grained locking is deemed
+ * necessary.
+ *
+ * In addition to the locking mentioned above, the driver should implement a
+ * lock to safeguard core GPU SVM function calls that modify state, such as
+ * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove. This lock is
+ * denoted as 'driver_svm_lock' in code examples. Finer-grained driver-side
+ * locking should also be possible for concurrent GPU fault processing within a
+ * single GPU SVM. The 'driver_svm_lock' can be passed to
+ * drm_gpusvm_driver_set_lock() to add lockdep annotations to GPU SVM; a
+ * minimal usage sketch follows this comment.
+ */
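+
+/*
+ * A minimal sketch of the locking setup described above. This is not taken
+ * from any driver; 'my_svm', 'my_driver_svm_lock' and the surrounding code are
+ * hypothetical, and it assumes drm_gpusvm_driver_set_lock() takes the gpusvm
+ * and the driver lock as described in the Locking section:
+ *
+ *	struct mutex my_driver_svm_lock;
+ *
+ *	mutex_init(&my_driver_svm_lock);
+ *	drm_gpusvm_driver_set_lock(&my_svm, &my_driver_svm_lock);
+ *
+ *	// Calls that modify GPU SVM state are wrapped by the driver lock
+ *	mutex_lock(&my_driver_svm_lock);
+ *	range = drm_gpusvm_range_find_or_insert(&my_svm, fault_addr,
+ *						gpuva_start, gpuva_end, &ctx);
+ *	...
+ *	mutex_unlock(&my_driver_svm_lock);
+ */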
+
+/**
+ * DOC: Migration
+ *
+ * The migration support is quite simple, allowing migration between RAM and
+ * device memory at the range granularity. Notably, GPU SVM currently does not
+ * support mixing RAM and device memory pages within a range. This means that upon GPU
+ * fault, the entire range can be migrated to device memory, and upon CPU fault, the
+ * entire range is migrated to RAM. Mixed RAM and device memory storage within a range
+ * could be added in the future if required.
+ *
+ * The reasoning for only supporting range granularity is as follows: it
+ * simplifies the implementation, and range sizes are driver-defined and should
+ * be relatively small.
+ */
+
+/**
+ * DOC: Partial Unmapping of Ranges
+ *
+ * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by CPU resulting
+ * in MMU_NOTIFY_UNMAP event) presents several challenges, with the main one
+ * being that a subset of the range still has CPU and GPU mappings. If the
+ * backing store for the range is in device memory, a subset of the backing store has
+ * references. One option would be to split the range and device memory backing store,
+ * but the implementation for this would be quite complicated. Given that
+ * partial unmappings are rare and driver-defined range sizes are relatively
+ * small, GPU SVM does not support splitting of ranges.
+ *
+ * With no support for range splitting, upon partial unmapping of a range, the
+ * driver is expected to invalidate and destroy the entire range. If the range
+ * has device memory as its backing, the driver is also expected to migrate any
+ * remaining pages back to RAM.
+ */
+
+/**
+ * DOC: Examples
+ *
+ * This section provides three examples of how to build the expected driver
+ * components: the GPU page fault handler, the garbage collector, and the
+ * notifier callback.
+ *
+ * The generic code provided does not include logic for complex migration
+ * policies, optimized invalidations, fine-grained driver locking, or other
+ * potentially required driver locking (e.g., DMA-resv locks).
+ *
+ * 1) GPU page fault handler
+ *
+ * int driver_bind_range(struct drm_gpusvm *gpusvm, struct drm_gpusvm_range *range)
+ * {
+ * int err = 0;
+ *
+ * driver_alloc_and_setup_memory_for_bind(gpusvm, range);
+ *
+ * drm_gpusvm_notifier_lock(gpusvm);
+ * if (drm_gpusvm_range_pages_valid(range))
+ * driver_commit_bind(gpusvm, range);
+ * else
+ * err = -EAGAIN;
+ * drm_gpusvm_notifier_unlock(gpusvm);
+ *
+ * return err;
+ * }
+ *
+ * int driver_gpu_fault(struct drm_gpusvm *gpusvm, unsigned long fault_addr,
+ * unsigned long gpuva_start, unsigned long gpuva_end)
+ * {
+ * struct drm_gpusvm_ctx ctx = {};
+ * int err;
+ *
+ * driver_svm_lock();
+ * retry:
+ * // Always process UNMAPs first so view of GPU SVM ranges is current
+ * driver_garbage_collector(gpusvm);
+ *
+ * range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
+ * gpuva_start, gpuva_end,
+ * &ctx);
+ * if (IS_ERR(range)) {
+ * err = PTR_ERR(range);
+ * goto unlock;
+ * }
+ *
+ * if (driver_migration_policy(range)) {
+ * mmap_read_lock(mm);
+ * devmem = driver_alloc_devmem();
+ * err = drm_gpusvm_migrate_to_devmem(gpusvm, range,
+ * devmem_allocation,
+ * &ctx);
+ * mmap_read_unlock(mm);
+ * if (err) // CPU mappings may have changed
+ * goto retry;
+ * }
+ *
+ * err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
+ * if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) { // CPU mappings changed
+ * if (err == -EOPNOTSUPP)
+ * drm_gpusvm_range_evict(gpusvm, range);
+ * goto retry;
+ * } else if (err) {
+ * goto unlock;
+ * }
+ *
+ * err = driver_bind_range(gpusvm, range);
+ * if (err == -EAGAIN) // CPU mappings changed
+ * goto retry;
+ *
+ * unlock:
+ * driver_svm_unlock();
+ * return err;
+ * }
+ *
+ * 2) Garbage Collector.
+ *
+ * void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
+ * struct drm_gpusvm_range *range)
+ * {
+ * assert_driver_svm_locked(gpusvm);
+ *
+ * // Partial unmap, migrate any remaining device memory pages back to RAM
+ * if (range->flags.partial_unmap)
+ * drm_gpusvm_range_evict(gpusvm, range);
+ *
+ * driver_unbind_range(range);
+ * drm_gpusvm_range_remove(gpusvm, range);
+ * }
+ *
+ * void driver_garbage_collector(struct drm_gpusvm *gpusvm)
+ * {
+ * assert_driver_svm_locked(gpusvm);
+ *
+ * for_each_range_in_garbage_collector(gpusvm, range)
+ * __driver_garbage_collector(gpusvm, range);
+ * }
+ *
+ * 3) Notifier callback.
+ *
+ * void driver_invalidation(struct drm_gpusvm *gpusvm,
+ * struct drm_gpusvm_notifier *notifier,
+ * const struct mmu_notifier_range *mmu_range)
+ * {
+ * struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
+ * struct drm_gpusvm_range *range = NULL;
+ *
+ * driver_invalidate_device_pages(gpusvm, mmu_range->start, mmu_range->end);
+ *
+ * drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
+ * mmu_range->end) {
+ * drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
+ *
+ * if (mmu_range->event != MMU_NOTIFY_UNMAP)
+ * continue;
+ *
+ * drm_gpusvm_range_set_unmapped(range, mmu_range);
+ * driver_garbage_collector_add(gpusvm, range);
+ * }
+ * }
+ */
+
+/**
+ * npages_in_range() - Calculate the number of pages in a given range
+ * @start: The start address of the range
+ * @end: The end address of the range
+ *
+ * This function calculates the number of pages in a given memory range,
+ * specified by the start and end addresses. It divides the difference
+ * between the end and start addresses by the page size (PAGE_SIZE) to
+ * determine the number of pages in the range.
+ *
+ * Return: The number of pages in the specified range.
+ */
+static unsigned long
+npages_in_range(unsigned long start, unsigned long end)
+{
+ return (end - start) >> PAGE_SHIFT;
+}
+
+/**
+ * struct drm_gpusvm_zdd - GPU SVM zone device data
+ *
+ * @refcount: Reference count for the zdd
+ * @devmem_allocation: device memory allocation
+ * @device_private_page_owner: Device private pages owner
+ *
+ * This structure serves as a generic wrapper installed in
+ * page->zone_device_data. It provides infrastructure for looking up a device
+ * memory allocation upon CPU page fault and asynchronously releasing device
+ * memory once the CPU has no page references. Asynchronous release is useful
+ * because CPU page references can be dropped in IRQ contexts, while releasing
+ * device memory likely requires sleeping locks.
+ */
+struct drm_gpusvm_zdd {
+ struct kref refcount;
+ struct drm_gpusvm_devmem *devmem_allocation;
+ void *device_private_page_owner;
+};
+
+/**
+ * drm_gpusvm_zdd_alloc() - Allocate a zdd structure.
+ * @device_private_page_owner: Device private pages owner
+ *
+ * This function allocates and initializes a new zdd structure. It sets up the
+ * reference count and stores the device private page owner.
+ *
+ * Return: Pointer to the allocated zdd on success, NULL on failure.
+ */
+static struct drm_gpusvm_zdd *
+drm_gpusvm_zdd_alloc(void *device_private_page_owner)
+{
+ struct drm_gpusvm_zdd *zdd;
+
+ zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
+ if (!zdd)
+ return NULL;
+
+ kref_init(&zdd->refcount);
+ zdd->devmem_allocation = NULL;
+ zdd->device_private_page_owner = device_private_page_owner;
+
+ return zdd;
+}
+
+/**
+ * drm_gpusvm_zdd_get() - Get a reference to a zdd structure.
+ * @zdd: Pointer to the zdd structure.
+ *
+ * This function increments the reference count of the provided zdd structure.
+ *
+ * Return: Pointer to the zdd structure.
+ */
+static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct drm_gpusvm_zdd *zdd)
+{
+ kref_get(&zdd->refcount);
+ return zdd;
+}
+
+/**
+ * drm_gpusvm_zdd_destroy() - Destroy a zdd structure.
+ * @ref: Pointer to the reference count structure.
+ *
+ * This function releases the zdd's device memory allocation, if any, and frees the zdd.
+ */
+static void drm_gpusvm_zdd_destroy(struct kref *ref)
+{
+ struct drm_gpusvm_zdd *zdd =
+ container_of(ref, struct drm_gpusvm_zdd, refcount);
+ struct drm_gpusvm_devmem *devmem = zdd->devmem_allocation;
+
+ if (devmem) {
+ complete_all(&devmem->detached);
+ if (devmem->ops->devmem_release)
+ devmem->ops->devmem_release(devmem);
+ }
+ kfree(zdd);
+}
+
+/**
+ * drm_gpusvm_zdd_put() - Put a zdd reference.
+ * @zdd: Pointer to the zdd structure.
+ *
+ * This function decrements the reference count of the provided zdd structure
+ * and schedules its destruction if the count drops to zero.
+ */
+static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
+{
+ kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
+}
+
+/**
+ * drm_gpusvm_range_find() - Find GPU SVM range from GPU SVM notifier
+ * @notifier: Pointer to the GPU SVM notifier structure.
+ * @start: Start address of the range
+ * @end: End address of the range
+ *
+ * Return: A pointer to the drm_gpusvm_range if found or NULL
+ */
+struct drm_gpusvm_range *
+drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, unsigned long start,
+ unsigned long end)
+{
+ struct interval_tree_node *itree;
+
+ itree = interval_tree_iter_first(&notifier->root, start, end - 1);
+
+ if (itree)
+ return container_of(itree, struct drm_gpusvm_range, itree);
+ else
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(drm_gpusvm_range_find);
+
+/**
+ * drm_gpusvm_for_each_range_safe() - Safely iterate over GPU SVM ranges in a notifier
+ * @range__: Iterator variable for the ranges
+ * @next__: Iterator variable for the ranges temporary storage
+ * @notifier__: Pointer to the GPU SVM notifier
+ * @start__: Start address of the range
+ * @end__: End address of the range
+ *
+ * This macro is used to iterate over GPU SVM ranges in a notifier while
+ * removing ranges from it.
+ */
+#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__, start__, end__) \
+ for ((range__) = drm_gpusvm_range_find((notifier__), (start__), (end__)), \
+ (next__) = __drm_gpusvm_range_next(range__); \
+ (range__) && (drm_gpusvm_range_start(range__) < (end__)); \
+ (range__) = (next__), (next__) = __drm_gpusvm_range_next(range__))
+
+/**
+ * __drm_gpusvm_notifier_next() - get the next drm_gpusvm_notifier in the list
+ * @notifier: a pointer to the current drm_gpusvm_notifier
+ *
+ * Return: A pointer to the next drm_gpusvm_notifier if available, or NULL if
+ * the current notifier is the last one or if the input notifier is
+ * NULL.
+ */
+static struct drm_gpusvm_notifier *
+__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
+{
+ if (notifier && !list_is_last(&notifier->entry,
+ &notifier->gpusvm->notifier_list))
+ return list_next_entry(notifier, entry);
+
+ return NULL;
+}
+
+static struct drm_gpusvm_notifier *
+notifier_iter_first(struct rb_root_cached *root, unsigned long start,
+ unsigned long last)
+{
+ struct interval_tree_node *itree;
+
+ itree = interval_tree_iter_first(root, start, last);
+
+ if (itree)
+ return container_of(itree, struct drm_gpusvm_notifier, itree);
+ else
+ return NULL;
+}
+
+/**
+ * drm_gpusvm_for_each_notifier() - Iterate over GPU SVM notifiers in a gpusvm
+ * @notifier__: Iterator variable for the notifiers
+ * @gpusvm__: Pointer to the GPU SVM structure
+ * @start__: Start address of the notifier
+ * @end__: End address of the notifier
+ *
+ * This macro is used to iterate over GPU SVM notifiers in a gpusvm.
+ */
+#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__, end__) \
+ for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1); \
+ (notifier__) && (drm_gpusvm_notifier_start(notifier__) < (end__)); \
+ (notifier__) = __drm_gpusvm_notifier_next(notifier__))
+
+/**
+ * drm_gpusvm_for_each_notifier_safe() - Safely iterate over GPU SVM notifiers in a gpusvm
+ * @notifier__: Iterator variable for the notifiers
+ * @next__: Iterator variable for the notifiers temporary storage
+ * @gpusvm__: Pointer to the GPU SVM structure
+ * @start__: Start address of the notifier
+ * @end__: End address of the notifier
+ *
+ * This macro is used to iterate over GPU SVM notifiers in a gpusvm while
+ * removing notifiers from it.
+ */
+#define drm_gpusvm_for_each_notifier_safe(notifier__, next__, gpusvm__, start__, end__) \
+ for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1), \
+ (next__) = __drm_gpusvm_notifier_next(notifier__); \
+ (notifier__) && (drm_gpusvm_notifier_start(notifier__) < (end__)); \
+ (notifier__) = (next__), (next__) = __drm_gpusvm_notifier_next(notifier__))
+
+/**
+ * drm_gpusvm_notifier_invalidate() - Invalidate a GPU SVM notifier.
+ * @mni: Pointer to the mmu_interval_notifier structure.
+ * @mmu_range: Pointer to the mmu_notifier_range structure.
+ * @cur_seq: Current sequence number.
+ *
+ * This function serves as a generic MMU notifier for GPU SVM. It sets the MMU
+ * notifier sequence number and calls the driver invalidate vfunc under
+ * gpusvm->notifier_lock.
+ *
+ * Return: true if the operation succeeds, false otherwise.
+ */
+static bool
+drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *mmu_range,
+ unsigned long cur_seq)
+{
+ struct drm_gpusvm_notifier *notifier =
+ container_of(mni, typeof(*notifier), notifier);
+ struct drm_gpusvm *gpusvm = notifier->gpusvm;
+
+ if (!mmu_notifier_range_blockable(mmu_range))
+ return false;
+
+ down_write(&gpusvm->notifier_lock);
+ mmu_interval_set_seq(mni, cur_seq);
+ gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
+ up_write(&gpusvm->notifier_lock);
+
+ return true;
+}
+
+/**
+ * drm_gpusvm_notifier_ops - MMU interval notifier operations for GPU SVM
+ */
+static const struct mmu_interval_notifier_ops drm_gpusvm_notifier_ops = {
+ .invalidate = drm_gpusvm_notifier_invalidate,
+};
+
+/**
+ * drm_gpusvm_init() - Initialize the GPU SVM.
+ * @gpusvm: Pointer to the GPU SVM structure.
+ * @name: Name of the GPU SVM.
+ * @drm: Pointer to the DRM device structure.
+ * @mm: Pointer to the mm_struct for the address space.
+ * @device_private_page_owner: Device private pages owner.
+ * @mm_start: Start address of GPU SVM.
+ * @mm_range: Range of the GPU SVM.
+ * @notifier_size: Size of individual notifiers.
+ * @ops: Pointer to the operations structure for GPU SVM.
+ * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
+ * Entries should be powers of 2 in descending order with last
+ * entry being SZ_4K.
+ * @num_chunks: Number of chunks.
+ *
+ * This function initializes the GPU SVM.
+ *
+ * Return: 0 on success, a negative error code on failure.
+ */
+int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
+ const char *name, struct drm_device *drm,
+ struct mm_struct *mm, void *device_private_page_owner,
+ unsigned long mm_start, unsigned long mm_range,
+ unsigned long notifier_size,
+ const struct drm_gpusvm_ops *ops,
+ const unsigned long *chunk_sizes, int num_chunks)
+{
+ if (!ops->invalidate || !num_chunks)
+ return -EINVAL;
+
+ gpusvm->name = name;
+ gpusvm->drm = drm;
+ gpusvm->mm = mm;
+ gpusvm->device_private_page_owner = device_private_page_owner;
+ gpusvm->mm_start = mm_start;
+ gpusvm->mm_range = mm_range;
+ gpusvm->notifier_size = notifier_size;
+ gpusvm->ops = ops;
+ gpusvm->chunk_sizes = chunk_sizes;
+ gpusvm->num_chunks = num_chunks;
+
+ mmgrab(mm);
+ gpusvm->root = RB_ROOT_CACHED;
+ INIT_LIST_HEAD(&gpusvm->notifier_list);
+
+ init_rwsem(&gpusvm->notifier_lock);
+
+ fs_reclaim_acquire(GFP_KERNEL);
+ might_lock(&gpusvm->notifier_lock);
+ fs_reclaim_release(GFP_KERNEL);
+
+#ifdef CONFIG_LOCKDEP
+ gpusvm->lock_dep_map = NULL;
+#endif
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(drm_gpusvm_init);
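+
+/*
+ * An illustrative initialization sequence for the function above. The
+ * embedding object 'my_vm', its fields, 'my_driver_invalidation' and
+ * 'my_devmem_owner' are hypothetical placeholders, not part of this API:
+ *
+ *	static const unsigned long my_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };
+ *
+ *	static const struct drm_gpusvm_ops my_gpusvm_ops = {
+ *		.invalidate = my_driver_invalidation,
+ *	};
+ *
+ *	err = drm_gpusvm_init(&my_vm->svm, "my-gpusvm", my_vm->drm,
+ *			      current->mm, my_devmem_owner,
+ *			      my_vm->start, my_vm->size, SZ_512M,
+ *			      &my_gpusvm_ops, my_chunk_sizes,
+ *			      ARRAY_SIZE(my_chunk_sizes));
+ *	if (err)
+ *		return err;
+ *
+ *	// Teardown mirrors this with drm_gpusvm_fini(&my_vm->svm).
+ */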
+
+/**
+ * drm_gpusvm_notifier_find() - Find GPU SVM notifier
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @fault_addr: Fault address
+ *
+ * This function finds the GPU SVM notifier associated with the fault address.
+ *
+ * Return: Pointer to the GPU SVM notifier on success, NULL otherwise.
+ */
+static struct drm_gpusvm_notifier *
+drm_gpusvm_notifier_find(struct drm_gpusvm *gpusvm,
+ unsigned long fault_addr)
+{
+ return notifier_iter_first(&gpusvm->root, fault_addr, fault_addr + 1);
+}
+
+/**
+ * to_drm_gpusvm_notifier() - retrieve the container struct for a given rbtree node
+ * @node: a pointer to the rbtree node embedded within a drm_gpusvm_notifier struct
+ *
+ * Return: A pointer to the containing drm_gpusvm_notifier structure.
+ */
+static struct drm_gpusvm_notifier *to_drm_gpusvm_notifier(struct rb_node *node)
+{
+ return container_of(node, struct drm_gpusvm_notifier, itree.rb);
+}
+
+/**
+ * drm_gpusvm_notifier_insert() - Insert GPU SVM notifier
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @notifier: Pointer to the GPU SVM notifier structure
+ *
+ * This function inserts the GPU SVM notifier into the GPU SVM RB tree and list.
+ */
+static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_notifier *notifier)
+{
+ struct rb_node *node;
+ struct list_head *head;
+
+ interval_tree_insert(&notifier->itree, &gpusvm->root);
+
+ node = rb_prev(&notifier->itree.rb);
+ if (node)
+ head = &(to_drm_gpusvm_notifier(node))->entry;
+ else
+ head = &gpusvm->notifier_list;
+
+ list_add(&notifier->entry, head);
+}
+
+/**
+ * drm_gpusvm_notifier_remove() - Remove GPU SVM notifier
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @notifier: Pointer to the GPU SVM notifier structure
+ *
+ * This function removes the GPU SVM notifier from the GPU SVM RB tree and list.
+ */
+static void drm_gpusvm_notifier_remove(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_notifier *notifier)
+{
+ interval_tree_remove(&notifier->itree, &gpusvm->root);
+ list_del(&notifier->entry);
+}
+
+/**
+ * drm_gpusvm_fini() - Finalize the GPU SVM.
+ * @gpusvm: Pointer to the GPU SVM structure.
+ *
+ * This function finalizes the GPU SVM by cleaning up any remaining ranges and
+ * notifiers, and dropping a reference to struct MM.
+ */
+void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
+{
+ struct drm_gpusvm_notifier *notifier, *next;
+
+ drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0, LONG_MAX) {
+ struct drm_gpusvm_range *range, *__next;
+
+ /*
+ * Remove notifier first to avoid racing with any invalidation
+ */
+ mmu_interval_notifier_remove(&notifier->notifier);
+ notifier->flags.removed = true;
+
+ drm_gpusvm_for_each_range_safe(range, __next, notifier, 0,
+ LONG_MAX)
+ drm_gpusvm_range_remove(gpusvm, range);
+ }
+
+ mmdrop(gpusvm->mm);
+ WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
+}
+EXPORT_SYMBOL_GPL(drm_gpusvm_fini);
+
+/**
+ * drm_gpusvm_notifier_alloc() - Allocate GPU SVM notifier
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @fault_addr: Fault address
+ *
+ * This function allocates and initializes the GPU SVM notifier structure.
+ *
+ * Return: Pointer to the allocated GPU SVM notifier on success, ERR_PTR() on failure.
+ */
+static struct drm_gpusvm_notifier *
+drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, unsigned long fault_addr)
+{
+ struct drm_gpusvm_notifier *notifier;
+
+ if (gpusvm->ops->notifier_alloc)
+ notifier = gpusvm->ops->notifier_alloc();
+ else
+ notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
+
+ if (!notifier)
+ return ERR_PTR(-ENOMEM);
+
+ notifier->gpusvm = gpusvm;
+ notifier->itree.start = ALIGN_DOWN(fault_addr, gpusvm->notifier_size);
+ notifier->itree.last = ALIGN(fault_addr + 1, gpusvm->notifier_size) - 1;
+ INIT_LIST_HEAD(&notifier->entry);
+ notifier->root = RB_ROOT_CACHED;
+ INIT_LIST_HEAD(&notifier->range_list);
+
+ return notifier;
+}
+
+/**
+ * drm_gpusvm_notifier_free() - Free GPU SVM notifier
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @notifier: Pointer to the GPU SVM notifier structure
+ *
+ * This function frees the GPU SVM notifier structure.
+ */
+static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_notifier *notifier)
+{
+ WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
+
+ if (gpusvm->ops->notifier_free)
+ gpusvm->ops->notifier_free(notifier);
+ else
+ kfree(notifier);
+}
+
+/**
+ * to_drm_gpusvm_range() - retrieve the container struct for a given rbtree node
+ * @node: a pointer to the rbtree node embedded within a drm_gpusvm_range struct
+ *
+ * Return: A pointer to the containing drm_gpusvm_range structure.
+ */
+static struct drm_gpusvm_range *to_drm_gpusvm_range(struct rb_node *node)
+{
+ return container_of(node, struct drm_gpusvm_range, itree.rb);
+}
+
+/**
+ * drm_gpusvm_range_insert() - Insert GPU SVM range
+ * @notifier: Pointer to the GPU SVM notifier structure
+ * @range: Pointer to the GPU SVM range structure
+ *
+ * This function inserts the GPU SVM range into the notifier RB tree and list.
+ */
+static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier *notifier,
+ struct drm_gpusvm_range *range)
+{
+ struct rb_node *node;
+ struct list_head *head;
+
+ drm_gpusvm_notifier_lock(notifier->gpusvm);
+ interval_tree_insert(&range->itree, &notifier->root);
+
+ node = rb_prev(&range->itree.rb);
+ if (node)
+ head = &(to_drm_gpusvm_range(node))->entry;
+ else
+ head = &notifier->range_list;
+
+ list_add(&range->entry, head);
+ drm_gpusvm_notifier_unlock(notifier->gpusvm);
+}
+
+/**
+ * __drm_gpusvm_range_remove() - Remove GPU SVM range
+ * @notifier: Pointer to the GPU SVM notifier structure
+ * @range: Pointer to the GPU SVM range structure
+ *
+ * This function removes the GPU SVM range from the notifier RB tree and list.
+ */
+static void __drm_gpusvm_range_remove(struct drm_gpusvm_notifier *notifier,
+ struct drm_gpusvm_range *range)
+{
+ interval_tree_remove(&range->itree, &notifier->root);
+ list_del(&range->entry);
+}
+
+/**
+ * drm_gpusvm_range_alloc() - Allocate GPU SVM range
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @notifier: Pointer to the GPU SVM notifier structure
+ * @fault_addr: Fault address
+ * @chunk_size: Chunk size
+ * @migrate_devmem: Flag indicating whether to migrate device memory
+ *
+ * This function allocates and initializes the GPU SVM range structure.
+ *
+ * Return: Pointer to the allocated GPU SVM range on success, ERR_PTR() on failure.
+ */
+static struct drm_gpusvm_range *
+drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_notifier *notifier,
+ unsigned long fault_addr, unsigned long chunk_size,
+ bool migrate_devmem)
+{
+ struct drm_gpusvm_range *range;
+
+ if (gpusvm->ops->range_alloc)
+ range = gpusvm->ops->range_alloc(gpusvm);
+ else
+ range = kzalloc(sizeof(*range), GFP_KERNEL);
+
+ if (!range)
+ return ERR_PTR(-ENOMEM);
+
+ kref_init(&range->refcount);
+ range->gpusvm = gpusvm;
+ range->notifier = notifier;
+ range->itree.start = ALIGN_DOWN(fault_addr, chunk_size);
+ range->itree.last = ALIGN(fault_addr + 1, chunk_size) - 1;
+ INIT_LIST_HEAD(&range->entry);
+ range->notifier_seq = LONG_MAX;
+ range->flags.migrate_devmem = migrate_devmem ? 1 : 0;
+
+ return range;
+}
+
+/**
+ * drm_gpusvm_check_pages() - Check pages
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @notifier: Pointer to the GPU SVM notifier structure
+ * @start: Start address
+ * @end: End address
+ *
+ * Check if pages between start and end have been faulted in on the CPU. Used to
+ * prevent migration of pages without CPU backing store.
+ *
+ * Return: True if pages have been faulted into CPU, False otherwise
+ */
+static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_notifier *notifier,
+ unsigned long start, unsigned long end)
+{
+ struct hmm_range hmm_range = {
+ .default_flags = 0,
+ .notifier = &notifier->notifier,
+ .start = start,
+ .end = end,
+ .dev_private_owner = gpusvm->device_private_page_owner,
+ };
+ unsigned long timeout =
+ jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
+ unsigned long *pfns;
+ unsigned long npages = npages_in_range(start, end);
+ int err, i;
+
+ mmap_assert_locked(gpusvm->mm);
+
+ pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
+ if (!pfns)
+ return false;
+
+ hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
+ hmm_range.hmm_pfns = pfns;
+
+ while (true) {
+ err = hmm_range_fault(&hmm_range);
+ if (err == -EBUSY) {
+ if (time_after(jiffies, timeout))
+ break;
+
+ hmm_range.notifier_seq =
+ mmu_interval_read_begin(&notifier->notifier);
+ continue;
+ }
+ break;
+ }
+ if (err)
+ goto err_free;
+
+ for (i = 0; i < npages;) {
+ if (!(pfns[i] & HMM_PFN_VALID)) {
+ err = -EFAULT;
+ goto err_free;
+ }
+ i += 0x1 << hmm_pfn_to_map_order(pfns[i]);
+ }
+
+err_free:
+ kvfree(pfns);
+ return err ? false : true;
+}
+
+/**
+ * drm_gpusvm_range_chunk_size() - Determine chunk size for GPU SVM range
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @notifier: Pointer to the GPU SVM notifier structure
+ * @vas: Pointer to the virtual memory area structure
+ * @fault_addr: Fault address
+ * @gpuva_start: Start address of GPUVA which mirrors CPU
+ * @gpuva_end: End address of GPUVA which mirrors CPU
+ * @check_pages_threshold: Check CPU pages for present threshold
+ *
+ * This function determines the chunk size for the GPU SVM range based on the
+ * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and the virtual
+ * memory area boundaries.
+ *
+ * Return: Chunk size on success, LONG_MAX on failure.
+ */
+static unsigned long
+drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_notifier *notifier,
+ struct vm_area_struct *vas,
+ unsigned long fault_addr,
+ unsigned long gpuva_start,
+ unsigned long gpuva_end,
+ unsigned long check_pages_threshold)
+{
+ unsigned long start, end;
+ int i = 0;
+
+retry:
+ for (; i < gpusvm->num_chunks; ++i) {
+ start = ALIGN_DOWN(fault_addr, gpusvm->chunk_sizes[i]);
+ end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
+
+ if (start >= vas->vm_start && end <= vas->vm_end &&
+ start >= drm_gpusvm_notifier_start(notifier) &&
+ end <= drm_gpusvm_notifier_end(notifier) &&
+ start >= gpuva_start && end <= gpuva_end)
+ break;
+ }
+
+ if (i == gpusvm->num_chunks)
+ return LONG_MAX;
+
+ /*
+ * If allocating more than a page, ensure the range does not overlap with
+ * existing ranges.
+ */
+ if (end - start != SZ_4K) {
+ struct drm_gpusvm_range *range;
+
+ range = drm_gpusvm_range_find(notifier, start, end);
+ if (range) {
+ ++i;
+ goto retry;
+ }
+
+ /*
+ * XXX: Only create range on pages CPU has faulted in. Without
+ * this check, or prefault, on BMG 'xe_exec_system_allocator --r
+ * process-many-malloc' fails. In the failure case, each process
+ * mallocs 16k but the CPU VMA is ~128k which results in 64k SVM
+ * ranges. When migrating the SVM ranges, some processes fail in
+ * drm_gpusvm_migrate_to_devmem with 'migrate.cpages != npages'
+ * and then upon drm_gpusvm_range_get_pages device pages from
+ * other processes are collected + faulted in which creates all
+ * sorts of problems. Unsure exactly how this is happening; the
+ * problem also goes away if 'xe_exec_system_allocator --r
+ * process-many-malloc' mallocs at least 64k at a time.
+ */
+ if (end - start <= check_pages_threshold &&
+ !drm_gpusvm_check_pages(gpusvm, notifier, start, end)) {
+ ++i;
+ goto retry;
+ }
+ }
+
+ return end - start;
+}
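+
+/*
+ * Worked example of the selection above, with illustrative values only and
+ * assuming chunk_sizes = { SZ_2M, SZ_64K, SZ_4K } and that the notifier and
+ * GPUVA bounds cover the VMA:
+ *
+ *	fault_addr = 0x201000, VMA = [0x200000, 0x240000)
+ *
+ *	SZ_2M:  [0x200000, 0x400000) -> rejected, extends past vm_end
+ *	SZ_64K: [0x200000, 0x210000) -> fits, so 64K is returned (subject to
+ *		the overlap and check_pages checks above)
+ */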
+
+#ifdef CONFIG_LOCKDEP
+/**
+ * drm_gpusvm_driver_lock_held() - Assert GPU SVM driver lock is held
+ * @gpusvm: Pointer to the GPU SVM structure.
+ *
+ * Ensure driver lock is held.
+ */
+static void drm_gpusvm_driver_lock_held(struct drm_gpusvm *gpusvm)
+{
+ if ((gpusvm)->lock_dep_map)
+ lockdep_assert(lock_is_held_type((gpusvm)->lock_dep_map, 0));
+}
+#else
+static void drm_gpusvm_driver_lock_held(struct drm_gpusvm *gpusvm)
+{
+}
+#endif
+
+/**
+ * drm_gpusvm_range_find_or_insert() - Find or insert GPU SVM range
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @fault_addr: Fault address
+ * @gpuva_start: Start address of GPUVA which mirrors CPU
+ * @gpuva_end: End address of GPUVA which mirrors CPU
+ * @ctx: GPU SVM context
+ *
+ * This function finds or inserts a newly allocated GPU SVM range based on the
+ * fault address. Caller must hold a lock to protect range lookup and insertion.
+ *
+ * Return: Pointer to the GPU SVM range on success, ERR_PTR() on failure.
+ */
+struct drm_gpusvm_range *
+drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm,
+ unsigned long fault_addr,
+ unsigned long gpuva_start,
+ unsigned long gpuva_end,
+ const struct drm_gpusvm_ctx *ctx)
+{
+ struct drm_gpusvm_notifier *notifier;
+ struct drm_gpusvm_range *range;
+ struct mm_struct *mm = gpusvm->mm;
+ struct vm_area_struct *vas;
+ bool notifier_alloc = false;
+ unsigned long chunk_size;
+ int err;
+ bool migrate_devmem;
+
+ drm_gpusvm_driver_lock_held(gpusvm);
+
+ if (fault_addr < gpusvm->mm_start ||
+ fault_addr > gpusvm->mm_start + gpusvm->mm_range)
+ return ERR_PTR(-EINVAL);
+
+ if (!mmget_not_zero(mm))
+ return ERR_PTR(-EFAULT);
+
+ notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
+ if (!notifier) {
+ notifier = drm_gpusvm_notifier_alloc(gpusvm, fault_addr);
+ if (IS_ERR(notifier)) {
+ err = PTR_ERR(notifier);
+ goto err_mmunlock;
+ }
+ notifier_alloc = true;
+ err = mmu_interval_notifier_insert(&notifier->notifier,
+ mm,
+ drm_gpusvm_notifier_start(notifier),
+ drm_gpusvm_notifier_size(notifier),
+ &drm_gpusvm_notifier_ops);
+ if (err)
+ goto err_notifier;
+ }
+
+ mmap_read_lock(mm);
+
+ vas = vma_lookup(mm, fault_addr);
+ if (!vas) {
+ err = -ENOENT;
+ goto err_notifier_remove;
+ }
+
+ if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
+ err = -EPERM;
+ goto err_notifier_remove;
+ }
+
+ range = drm_gpusvm_range_find(notifier, fault_addr, fault_addr + 1);
+ if (range)
+ goto out_mmunlock;
+ /*
+ * XXX: Short-circuiting migration based on migrate_vma_* current
+ * limitations. If/when migrate_vma_* add more support, this logic will
+ * have to change.
+ */
+ migrate_devmem = ctx->devmem_possible &&
+ vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
+
+ chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier, vas,
+ fault_addr, gpuva_start,
+ gpuva_end,
+ ctx->check_pages_threshold);
+ if (chunk_size == LONG_MAX) {
+ err = -EINVAL;
+ goto err_notifier_remove;
+ }
+
+ range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr, chunk_size,
+ migrate_devmem);
+ if (IS_ERR(range)) {
+ err = PTR_ERR(range);
+ goto err_notifier_remove;
+ }
+
+ drm_gpusvm_range_insert(notifier, range);
+ if (notifier_alloc)
+ drm_gpusvm_notifier_insert(gpusvm, notifier);
+
+out_mmunlock:
+ mmap_read_unlock(mm);
+ mmput(mm);
+
+ return range;
+
+err_notifier_remove:
+ mmap_read_unlock(mm);
+ if (notifier_alloc)
+ mmu_interval_notifier_remove(&notifier->notifier);
+err_notifier:
+ if (notifier_alloc)
+ drm_gpusvm_notifier_free(gpusvm, notifier);
+err_mmunlock:
+ mmput(mm);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(drm_gpusvm_range_find_or_insert);
+
+/**
+ * __drm_gpusvm_range_unmap_pages() - Unmap pages associated with a GPU SVM range (internal)
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range structure
+ * @npages: Number of pages to unmap
+ *
+ * This function unmaps pages associated with a GPU SVM range. Assumes and
+ * asserts correct locking is in place when called.
+ */
+static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_range *range,
+ unsigned long npages)
+{
+ unsigned long i, j;
+ struct drm_pagemap *dpagemap = range->dpagemap;
+ struct device *dev = gpusvm->drm->dev;
+
+ lockdep_assert_held(&gpusvm->notifier_lock);
+
+ if (range->flags.has_dma_mapping) {
+ for (i = 0, j = 0; i < npages; j++) {
+ struct drm_pagemap_device_addr *addr = &range->dma_addr[j];
+
+ if (addr->proto == DRM_INTERCONNECT_SYSTEM)
+ dma_unmap_page(dev,
+ addr->addr,
+ PAGE_SIZE << addr->order,
+ addr->dir);
+ else if (dpagemap && dpagemap->ops->device_unmap)
+ dpagemap->ops->device_unmap(dpagemap,
+ dev, *addr);
+ i += 1 << addr->order;
+ }
+ range->flags.has_devmem_pages = false;
+ range->flags.has_dma_mapping = false;
+ range->dpagemap = NULL;
+ }
+}
+
+/**
+ * drm_gpusvm_range_free_pages() - Free pages associated with a GPU SVM range
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range structure
+ *
+ * This function frees the dma address array associated with a GPU SVM range.
+ */
+static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_range *range)
+{
+ lockdep_assert_held(&gpusvm->notifier_lock);
+
+ if (range->dma_addr) {
+ kvfree(range->dma_addr);
+ range->dma_addr = NULL;
+ }
+}
+
+/**
+ * drm_gpusvm_range_remove() - Remove GPU SVM range
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range to be removed
+ *
+ * This function removes the specified GPU SVM range and also removes the parent
+ * GPU SVM notifier if no more ranges remain in the notifier. The caller must
+ * hold a lock to protect range and notifier removal.
+ */
+void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_range *range)
+{
+ unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
+ drm_gpusvm_range_end(range));
+ struct drm_gpusvm_notifier *notifier;
+
+ drm_gpusvm_driver_lock_held(gpusvm);
+
+ notifier = drm_gpusvm_notifier_find(gpusvm,
+ drm_gpusvm_range_start(range));
+ if (WARN_ON_ONCE(!notifier))
+ return;
+
+ drm_gpusvm_notifier_lock(gpusvm);
+ __drm_gpusvm_range_unmap_pages(gpusvm, range, npages);
+ drm_gpusvm_range_free_pages(gpusvm, range);
+ __drm_gpusvm_range_remove(notifier, range);
+ drm_gpusvm_notifier_unlock(gpusvm);
+
+ drm_gpusvm_range_put(range);
+
+ if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
+ if (!notifier->flags.removed)
+ mmu_interval_notifier_remove(&notifier->notifier);
+ drm_gpusvm_notifier_remove(gpusvm, notifier);
+ drm_gpusvm_notifier_free(gpusvm, notifier);
+ }
+}
+EXPORT_SYMBOL_GPL(drm_gpusvm_range_remove);
+
+/**
+ * drm_gpusvm_range_get() - Get a reference to GPU SVM range
+ * @range: Pointer to the GPU SVM range
+ *
+ * This function increments the reference count of the specified GPU SVM range.
+ *
+ * Return: Pointer to the GPU SVM range.
+ */
+struct drm_gpusvm_range *
+drm_gpusvm_range_get(struct drm_gpusvm_range *range)
+{
+ kref_get(&range->refcount);
+
+ return range;
+}
+EXPORT_SYMBOL_GPL(drm_gpusvm_range_get);
+
+/**
+ * drm_gpusvm_range_destroy() - Destroy GPU SVM range
+ * @refcount: Pointer to the reference counter embedded in the GPU SVM range
+ *
+ * This function destroys the specified GPU SVM range when its reference count
+ * reaches zero. If a custom range-free function is provided, it is invoked to
+ * free the range; otherwise, the range is deallocated using kfree().
+ */
+static void drm_gpusvm_range_destroy(struct kref *refcount)
+{
+ struct drm_gpusvm_range *range =
+ container_of(refcount, struct drm_gpusvm_range, refcount);
+ struct drm_gpusvm *gpusvm = range->gpusvm;
+
+ if (gpusvm->ops->range_free)
+ gpusvm->ops->range_free(range);
+ else
+ kfree(range);
+}
+
+/**
+ * drm_gpusvm_range_put() - Put a reference to GPU SVM range
+ * @range: Pointer to the GPU SVM range
+ *
+ * This function decrements the reference count of the specified GPU SVM range
+ * and frees it when the count reaches zero.
+ */
+void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
+{
+ kref_put(&range->refcount, drm_gpusvm_range_destroy);
+}
+EXPORT_SYMBOL_GPL(drm_gpusvm_range_put);
+
+/**
+ * drm_gpusvm_range_pages_valid() - GPU SVM range pages valid
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range structure
+ *
+ * This function determines if a GPU SVM range's pages are valid. It is
+ * expected to be called while holding gpusvm->notifier_lock and as the last
+ * step before committing a
+ * GPU binding. This is akin to a notifier seqno check in the HMM documentation
+ * but due to wider notifiers (i.e., notifiers which span multiple ranges) this
+ * function is required for finer grained checking (i.e., per range) if pages
+ * are valid.
+ *
+ * Return: True if GPU SVM range has valid pages, False otherwise
+ */
+bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_range *range)
+{
+ lockdep_assert_held(&gpusvm->notifier_lock);
+
+ return range->flags.has_devmem_pages || range->flags.has_dma_mapping;
+}
+EXPORT_SYMBOL_GPL(drm_gpusvm_range_pages_valid);
+
+/**
+ * drm_gpusvm_range_pages_valid_unlocked() - GPU SVM range pages valid unlocked
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range structure
+ *
+ * This function determines if a GPU SVM range's pages are valid. It is
+ * expected to be called without holding gpusvm->notifier_lock.
+ *
+ * Return: True if GPU SVM range has valid pages, False otherwise
+ */
+static bool
+drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_range *range)
+{
+ bool pages_valid;
+
+ if (!range->dma_addr)
+ return false;
+
+ drm_gpusvm_notifier_lock(gpusvm);
+ pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
+ if (!pages_valid)
+ drm_gpusvm_range_free_pages(gpusvm, range);
+ drm_gpusvm_notifier_unlock(gpusvm);
+
+ return pages_valid;
+}
+
+/**
+ * drm_gpusvm_range_get_pages() - Get pages for a GPU SVM range
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range structure
+ * @ctx: GPU SVM context
+ *
+ * This function gets pages for a GPU SVM range and ensures they are mapped for
+ * DMA access.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_range *range,
+ const struct drm_gpusvm_ctx *ctx)
+{
+ struct mmu_interval_notifier *notifier = &range->notifier->notifier;
+ struct hmm_range hmm_range = {
+ .default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only ? 0 :
+ HMM_PFN_REQ_WRITE),
+ .notifier = notifier,
+ .start = drm_gpusvm_range_start(range),
+ .end = drm_gpusvm_range_end(range),
+ .dev_private_owner = gpusvm->device_private_page_owner,
+ };
+ struct mm_struct *mm = gpusvm->mm;
+ struct drm_gpusvm_zdd *zdd;
+ unsigned long timeout =
+ jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
+ unsigned long i, j;
+ unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
+ drm_gpusvm_range_end(range));
+ unsigned long num_dma_mapped;
+ unsigned int order = 0;
+ unsigned long *pfns;
+ struct page **pages;
+ int err = 0;
+ struct dev_pagemap *pagemap = NULL;
+ struct drm_pagemap *dpagemap = NULL;
+
+retry:
+ hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
+ if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
+ goto set_seqno;
+
+ pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
+ if (!pfns)
+ return -ENOMEM;
+
+ if (!mmget_not_zero(mm)) {
+ err = -EFAULT;
+ goto err_free;
+ }
+
+ hmm_range.hmm_pfns = pfns;
+ while (true) {
+ mmap_read_lock(mm);
+ err = hmm_range_fault(&hmm_range);
+ mmap_read_unlock(mm);
+
+ if (err == -EBUSY) {
+ if (time_after(jiffies, timeout))
+ break;
+
+ hmm_range.notifier_seq =
+ mmu_interval_read_begin(notifier);
+ continue;
+ }
+ break;
+ }
+ mmput(mm);
+ if (err)
+ goto err_free;
+
+ pages = (struct page **)pfns;
+map_pages:
+ /*
+ * Perform all dma mappings under the notifier lock to not
+ * access freed pages. A notifier will either block on
+ * the notifier lock or unmap dma.
+ */
+ drm_gpusvm_notifier_lock(gpusvm);
+
+ if (range->flags.unmapped) {
+ drm_gpusvm_notifier_unlock(gpusvm);
+ err = -EFAULT;
+ goto err_free;
+ }
+
+ if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
+ drm_gpusvm_notifier_unlock(gpusvm);
+ kvfree(pfns);
+ goto retry;
+ }
+
+ if (!range->dma_addr) {
+ /* Unlock and restart mapping to allocate memory. */
+ drm_gpusvm_notifier_unlock(gpusvm);
+ range->dma_addr = kvmalloc_array(npages,
+ sizeof(*range->dma_addr),
+ GFP_KERNEL);
+ if (!range->dma_addr) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+ goto map_pages;
+ }
+
+ zdd = NULL;
+ num_dma_mapped = 0;
+ for (i = 0, j = 0; i < npages; ++j) {
+ struct page *page = hmm_pfn_to_page(pfns[i]);
+
+ order = hmm_pfn_to_map_order(pfns[i]);
+ if (is_device_private_page(page) ||
+ is_device_coherent_page(page)) {
+ if (zdd != page->zone_device_data && i > 0) {
+ err = -EOPNOTSUPP;
+ goto err_unmap;
+ }
+ zdd = page->zone_device_data;
+ if (pagemap != page->pgmap) {
+ if (i > 0) {
+ err = -EOPNOTSUPP;
+ goto err_unmap;
+ }
+
+ pagemap = page->pgmap;
+ dpagemap = zdd->devmem_allocation->dpagemap;
+ if (drm_WARN_ON(gpusvm->drm, !dpagemap)) {
+ /*
+ * Raced. This is not supposed to happen
+ * since hmm_range_fault() should've migrated
+ * this page to system.
+ */
+ err = -EAGAIN;
+ goto err_unmap;
+ }
+ }
+ range->dma_addr[j] =
+ dpagemap->ops->device_map(dpagemap,
+ gpusvm->drm->dev,
+ page, order,
+ DMA_BIDIRECTIONAL);
+ if (dma_mapping_error(gpusvm->drm->dev,
+ range->dma_addr[j].addr)) {
+ err = -EFAULT;
+ goto err_unmap;
+ }
+
+ pages[i] = page;
+ } else {
+ dma_addr_t addr;
+
+ if (is_zone_device_page(page) || zdd) {
+ err = -EOPNOTSUPP;
+ goto err_unmap;
+ }
+
+ addr = dma_map_page(gpusvm->drm->dev,
+ page, 0,
+ PAGE_SIZE << order,
+ DMA_BIDIRECTIONAL);
+ if (dma_mapping_error(gpusvm->drm->dev, addr)) {
+ err = -EFAULT;
+ goto err_unmap;
+ }
+
+ range->dma_addr[j] = drm_pagemap_device_addr_encode
+ (addr, DRM_INTERCONNECT_SYSTEM, order,
+ DMA_BIDIRECTIONAL);
+ }
+ i += 1 << order;
+ num_dma_mapped = i;
+ }
+
+ range->flags.has_dma_mapping = true;
+ if (zdd) {
+ range->flags.has_devmem_pages = true;
+ range->dpagemap = dpagemap;
+ }
+
+ drm_gpusvm_notifier_unlock(gpusvm);
+ kvfree(pfns);
+set_seqno:
+ range->notifier_seq = hmm_range.notifier_seq;
+
+ return 0;
+
+err_unmap:
+ __drm_gpusvm_range_unmap_pages(gpusvm, range, num_dma_mapped);
+ drm_gpusvm_notifier_unlock(gpusvm);
+err_free:
+ kvfree(pfns);
+ if (err == -EAGAIN)
+ goto retry;
+ return err;
+}
+EXPORT_SYMBOL_GPL(drm_gpusvm_range_get_pages);
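+
+/*
+ * An illustrative sketch of how a driver GPU fault handler might use this:
+ * fault the pages in, then re-check that they are still valid under the
+ * notifier lock before committing the GPU page table bind.
+ * driver_commit_bind() is a hypothetical driver helper.
+ *
+ *	err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
+ *	if (err)
+ *		return err;	// or retry the fault on -EFAULT
+ *
+ *	drm_gpusvm_notifier_lock(gpusvm);
+ *	if (drm_gpusvm_range_pages_valid(gpusvm, range))
+ *		err = driver_commit_bind(gpusvm, range);
+ *	else
+ *		err = -EAGAIN;	// raced with an invalidation, refault
+ *	drm_gpusvm_notifier_unlock(gpusvm);
+ */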
+
+/**
+ * drm_gpusvm_range_unmap_pages() - Unmap pages associated with a GPU SVM range
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range structure
+ * @ctx: GPU SVM context
+ *
+ * This function unmaps pages associated with a GPU SVM range. If
+ * @ctx->in_notifier is set, it is assumed that gpusvm->notifier_lock is held
+ * in write mode; if it is clear, it acquires gpusvm->notifier_lock in read
+ * mode. Must be called on each GPU SVM range attached to the notifier in
+ * gpusvm->ops->invalidate for the IOMMU security model.
+ */
+void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_range *range,
+ const struct drm_gpusvm_ctx *ctx)
+{
+ unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
+ drm_gpusvm_range_end(range));
+
+ if (ctx->in_notifier)
+ lockdep_assert_held_write(&gpusvm->notifier_lock);
+ else
+ drm_gpusvm_notifier_lock(gpusvm);
+
+ __drm_gpusvm_range_unmap_pages(gpusvm, range, npages);
+
+ if (!ctx->in_notifier)
+ drm_gpusvm_notifier_unlock(gpusvm);
+}
+EXPORT_SYMBOL_GPL(drm_gpusvm_range_unmap_pages);
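+
+/*
+ * An illustrative sketch of a driver invalidate callback: since
+ * gpusvm->notifier_lock is already held in write mode there, each affected
+ * range is unmapped with ctx.in_notifier set and marked unmapped on
+ * MMU_NOTIFY_UNMAP events. driver_zap_gpu_ptes() is a hypothetical helper.
+ *
+ *	static void driver_invalidate(struct drm_gpusvm *gpusvm,
+ *				      struct drm_gpusvm_notifier *notifier,
+ *				      const struct mmu_notifier_range *mmu_range)
+ *	{
+ *		struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
+ *		struct drm_gpusvm_range *range = NULL;
+ *
+ *		driver_zap_gpu_ptes(gpusvm, mmu_range->start, mmu_range->end);
+ *
+ *		drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
+ *					  mmu_range->end) {
+ *			drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
+ *
+ *			if (mmu_range->event == MMU_NOTIFY_UNMAP)
+ *				drm_gpusvm_range_set_unmapped(range, mmu_range);
+ *		}
+ *	}
+ */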
+
+/**
+ * drm_gpusvm_migration_unlock_put_page() - Put a migration page
+ * @page: Pointer to the page to put
+ *
+ * This function unlocks and puts a page.
+ */
+static void drm_gpusvm_migration_unlock_put_page(struct page *page)
+{
+ unlock_page(page);
+ put_page(page);
+}
+
+/**
+ * drm_gpusvm_migration_unlock_put_pages() - Put migration pages
+ * @npages: Number of pages
+ * @migrate_pfn: Array of migrate page frame numbers
+ *
+ * This function unlocks and puts an array of pages.
+ */
+static void drm_gpusvm_migration_unlock_put_pages(unsigned long npages,
+ unsigned long *migrate_pfn)
+{
+ unsigned long i;
+
+ for (i = 0; i < npages; ++i) {
+ struct page *page;
+
+ if (!migrate_pfn[i])
+ continue;
+
+ page = migrate_pfn_to_page(migrate_pfn[i]);
+ drm_gpusvm_migration_unlock_put_page(page);
+ migrate_pfn[i] = 0;
+ }
+}
+
+/**
+ * drm_gpusvm_get_devmem_page() - Get a reference to a device memory page
+ * @page: Pointer to the page
+ * @zdd: Pointer to the GPU SVM zone device data
+ *
+ * This function associates the given page with the specified GPU SVM zone
+ * device data and initializes it for zone device usage.
+ */
+static void drm_gpusvm_get_devmem_page(struct page *page,
+ struct drm_gpusvm_zdd *zdd)
+{
+ page->zone_device_data = drm_gpusvm_zdd_get(zdd);
+ zone_device_page_init(page);
+}
+
+/**
+ * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
+ * @dev: The device for which the pages are being mapped
+ * @dma_addr: Array to store DMA addresses corresponding to mapped pages
+ * @migrate_pfn: Array of migrate page frame numbers to map
+ * @npages: Number of pages to map
+ * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
+ *
+ * This function maps pages of memory for migration usage in GPU SVM. It
+ * iterates over each page frame number provided in @migrate_pfn, maps the
+ * corresponding page, and stores the DMA address in the provided @dma_addr
+ * array.
+ *
+ * Return: 0 on success, -EFAULT if an error occurs during mapping.
+ */
+static int drm_gpusvm_migrate_map_pages(struct device *dev,
+ dma_addr_t *dma_addr,
+ unsigned long *migrate_pfn,
+ unsigned long npages,
+ enum dma_data_direction dir)
+{
+ unsigned long i;
+
+ for (i = 0; i < npages; ++i) {
+ struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
+
+ if (!page)
+ continue;
+
+ if (WARN_ON_ONCE(is_zone_device_page(page)))
+ return -EFAULT;
+
+ dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
+ if (dma_mapping_error(dev, dma_addr[i]))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/**
+ * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
+ * @dev: The device for which the pages were mapped
+ * @dma_addr: Array of DMA addresses corresponding to mapped pages
+ * @npages: Number of pages to unmap
+ * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
+ *
+ * This function unmaps previously mapped pages of memory for GPU Shared Virtual
+ * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
+ * if it's valid and not already unmapped, and unmaps the corresponding page.
+ */
+static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
+ dma_addr_t *dma_addr,
+ unsigned long npages,
+ enum dma_data_direction dir)
+{
+ unsigned long i;
+
+ for (i = 0; i < npages; ++i) {
+ if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
+ continue;
+
+ dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
+ }
+}
+
+/**
+ * drm_gpusvm_migrate_to_devmem() - Migrate GPU SVM range to device memory
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range structure
+ * @devmem_allocation: Pointer to the device memory allocation. The caller
+ * should hold a reference to the device memory allocation,
+ * which should be dropped via ops->devmem_release or upon
+ * the failure of this function.
+ * @ctx: GPU SVM context
+ *
+ * This function migrates the specified GPU SVM range to device memory. It
+ * performs the necessary setup and invokes the driver-specific operations for
+ * migration to device memory. Upon successful return, @devmem_allocation can
+ * safely reference @range until ops->devmem_release is called, which only
+ * happens upon successful return. Expected to be called while holding the
+ * mmap lock in read mode.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int drm_gpusvm_migrate_to_devmem(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_range *range,
+ struct drm_gpusvm_devmem *devmem_allocation,
+ const struct drm_gpusvm_ctx *ctx)
+{
+ const struct drm_gpusvm_devmem_ops *ops = devmem_allocation->ops;
+ unsigned long start = drm_gpusvm_range_start(range),
+ end = drm_gpusvm_range_end(range);
+ struct migrate_vma migrate = {
+ .start = start,
+ .end = end,
+ .pgmap_owner = gpusvm->device_private_page_owner,
+ .flags = MIGRATE_VMA_SELECT_SYSTEM,
+ };
+ struct mm_struct *mm = gpusvm->mm;
+ unsigned long i, npages = npages_in_range(start, end);
+ struct vm_area_struct *vas;
+ struct drm_gpusvm_zdd *zdd = NULL;
+ struct page **pages;
+ dma_addr_t *dma_addr;
+ void *buf;
+ int err;
+
+ mmap_assert_locked(gpusvm->mm);
+
+ if (!range->flags.migrate_devmem)
+ return -EINVAL;
+
+ if (!ops->populate_devmem_pfn || !ops->copy_to_devmem ||
+ !ops->copy_to_ram)
+ return -EOPNOTSUPP;
+
+ vas = vma_lookup(mm, start);
+ if (!vas) {
+ err = -ENOENT;
+ goto err_out;
+ }
+
+ if (end > vas->vm_end || start < vas->vm_start) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ if (!vma_is_anonymous(vas)) {
+ err = -EBUSY;
+ goto err_out;
+ }
+
+ buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
+ sizeof(*pages), GFP_KERNEL);
+ if (!buf) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
+ pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
+
+ zdd = drm_gpusvm_zdd_alloc(gpusvm->device_private_page_owner);
+ if (!zdd) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+
+ migrate.vma = vas;
+ migrate.src = buf;
+ migrate.dst = migrate.src + npages;
+
+ err = migrate_vma_setup(&migrate);
+ if (err)
+ goto err_free;
+
+ if (!migrate.cpages) {
+ err = -EFAULT;
+ goto err_free;
+ }
+
+ if (migrate.cpages != npages) {
+ err = -EBUSY;
+ goto err_finalize;
+ }
+
+ err = ops->populate_devmem_pfn(devmem_allocation, npages, migrate.dst);
+ if (err)
+ goto err_finalize;
+
+ err = drm_gpusvm_migrate_map_pages(devmem_allocation->dev, dma_addr,
+ migrate.src, npages, DMA_TO_DEVICE);
+ if (err)
+ goto err_finalize;
+
+ for (i = 0; i < npages; ++i) {
+ struct page *page = pfn_to_page(migrate.dst[i]);
+
+ pages[i] = page;
+ migrate.dst[i] = migrate_pfn(migrate.dst[i]);
+ drm_gpusvm_get_devmem_page(page, zdd);
+ }
+
+ err = ops->copy_to_devmem(pages, dma_addr, npages);
+ if (err)
+ goto err_finalize;
+
+ /* Upon success, bind the devmem allocation to the range and zdd */
+ zdd->devmem_allocation = devmem_allocation; /* Owns ref */
+
+err_finalize:
+ if (err)
+ drm_gpusvm_migration_unlock_put_pages(npages, migrate.dst);
+ migrate_vma_pages(&migrate);
+ migrate_vma_finalize(&migrate);
+ drm_gpusvm_migrate_unmap_pages(devmem_allocation->dev, dma_addr, npages,
+ DMA_TO_DEVICE);
+err_free:
+ if (zdd)
+ drm_gpusvm_zdd_put(zdd);
+ kvfree(buf);
+err_out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(drm_gpusvm_migrate_to_devmem);
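+
+/*
+ * An illustrative sketch of a fault handler that prefers device memory: try
+ * the migration first and fall back to system pages when it fails, e.g. when
+ * only part of the range could be collected (-EBUSY) or the CPU mappings
+ * changed underneath it. The devmem allocation setup is driver specific.
+ *
+ *	err = drm_gpusvm_migrate_to_devmem(gpusvm, range, devmem_allocation,
+ *					   &ctx);
+ *	if (err)
+ *		// fall back to system pages; the reference the caller holds
+ *		// on devmem_allocation must be dropped on this path
+ *		err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
+ */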
+
+/**
+ * drm_gpusvm_migrate_populate_ram_pfn() - Populate RAM PFNs for a VM area
+ * @vas: Pointer to the VM area structure, can be NULL
+ * @fault_page: Fault page
+ * @npages: Number of pages to populate
+ * @mpages: Pointer to the number of pages successfully populated (output)
+ * @src_mpfn: Source array of migrate PFNs
+ * @mpfn: Array of migrate PFNs to populate
+ * @addr: Start address for PFN allocation
+ *
+ * This function populates the RAM migrate page frame numbers (PFNs) for the
+ * specified VM area structure. It allocates and locks pages in the VM area for
+ * RAM usage. If @vas is non-NULL, pages are allocated with alloc_page_vma();
+ * otherwise alloc_page() is used.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int drm_gpusvm_migrate_populate_ram_pfn(struct vm_area_struct *vas,
+ struct page *fault_page,
+ unsigned long npages,
+ unsigned long *mpages,
+ unsigned long *src_mpfn,
+ unsigned long *mpfn,
+ unsigned long addr)
+{
+ unsigned long i;
+
+ for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
+ struct page *page, *src_page;
+
+ if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ src_page = migrate_pfn_to_page(src_mpfn[i]);
+ if (!src_page)
+ continue;
+
+ if (fault_page) {
+ if (src_page->zone_device_data !=
+ fault_page->zone_device_data)
+ continue;
+ }
+
+ if (vas)
+ page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
+ else
+ page = alloc_page(GFP_HIGHUSER);
+
+ if (!page)
+ goto free_pages;
+
+ mpfn[i] = migrate_pfn(page_to_pfn(page));
+ }
+
+ for (i = 0; i < npages; ++i) {
+ struct page *page = migrate_pfn_to_page(mpfn[i]);
+
+ if (!page)
+ continue;
+
+ WARN_ON_ONCE(!trylock_page(page));
+ ++*mpages;
+ }
+
+ return 0;
+
+free_pages:
+ for (i = 0; i < npages; ++i) {
+ struct page *page = migrate_pfn_to_page(mpfn[i]);
+
+ if (!page)
+ continue;
+
+ put_page(page);
+ mpfn[i] = 0;
+ }
+ return -ENOMEM;
+}
+
+/**
+ * drm_gpusvm_evict_to_ram() - Evict GPU SVM range to RAM
+ * @devmem_allocation: Pointer to the device memory allocation
+ *
+ * Similar to __drm_gpusvm_migrate_to_ram() but does not require the mmap lock;
+ * migration is done via the migrate_device_* functions.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int drm_gpusvm_evict_to_ram(struct drm_gpusvm_devmem *devmem_allocation)
+{
+ const struct drm_gpusvm_devmem_ops *ops = devmem_allocation->ops;
+ unsigned long npages, mpages = 0;
+ struct page **pages;
+ unsigned long *src, *dst;
+ dma_addr_t *dma_addr;
+ void *buf;
+ int i, err = 0;
+ unsigned int retry_count = 2;
+
+ npages = devmem_allocation->size >> PAGE_SHIFT;
+
+retry:
+ if (!mmget_not_zero(devmem_allocation->mm))
+ return -EFAULT;
+
+ buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
+ sizeof(*pages), GFP_KERNEL);
+ if (!buf) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ src = buf;
+ dst = buf + (sizeof(*src) * npages);
+ dma_addr = buf + (2 * sizeof(*src) * npages);
+ pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
+
+ err = ops->populate_devmem_pfn(devmem_allocation, npages, src);
+ if (err)
+ goto err_free;
+
+ err = migrate_device_pfns(src, npages);
+ if (err)
+ goto err_free;
+
+ err = drm_gpusvm_migrate_populate_ram_pfn(NULL, NULL, npages, &mpages,
+ src, dst, 0);
+ if (err || !mpages)
+ goto err_finalize;
+
+ err = drm_gpusvm_migrate_map_pages(devmem_allocation->dev, dma_addr,
+ dst, npages, DMA_FROM_DEVICE);
+ if (err)
+ goto err_finalize;
+
+ for (i = 0; i < npages; ++i)
+ pages[i] = migrate_pfn_to_page(src[i]);
+
+ err = ops->copy_to_ram(pages, dma_addr, npages);
+ if (err)
+ goto err_finalize;
+
+err_finalize:
+ if (err)
+ drm_gpusvm_migration_unlock_put_pages(npages, dst);
+ migrate_device_pages(src, dst, npages);
+ migrate_device_finalize(src, dst, npages);
+ drm_gpusvm_migrate_unmap_pages(devmem_allocation->dev, dma_addr, npages,
+ DMA_FROM_DEVICE);
+err_free:
+ kvfree(buf);
+err_out:
+ mmput_async(devmem_allocation->mm);
+
+ if (completion_done(&devmem_allocation->detached))
+ return 0;
+
+ if (retry_count--) {
+ cond_resched();
+ goto retry;
+ }
+
+ return err ?: -EBUSY;
+}
+EXPORT_SYMBOL_GPL(drm_gpusvm_evict_to_ram);
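+
+/*
+ * An illustrative sketch of a driver eviction path: no mmap lock or VMA is
+ * needed, so this can be called when the memory manager (e.g. TTM) asks the
+ * driver to move a backing object out of device memory. The xe hunks below
+ * call xe_svm_bo_evict() from their TTM move hook for this purpose.
+ *
+ *	static int driver_devmem_evict(struct driver_bo *bo)	// hypothetical
+ *	{
+ *		// bo->devmem_allocation was set up with drm_gpusvm_devmem_init()
+ *		return drm_gpusvm_evict_to_ram(&bo->devmem_allocation);
+ *	}
+ */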
+
+/**
+ * __drm_gpusvm_migrate_to_ram() - Migrate GPU SVM range to RAM (internal)
+ * @vas: Pointer to the VM area structure
+ * @device_private_page_owner: Device private pages owner
+ * @page: Pointer to the page for fault handling (can be NULL)
+ * @fault_addr: Fault address
+ * @size: Size of migration
+ *
+ * This internal function performs the migration of the specified GPU SVM
+ * range to RAM. It sets up the migration, populates and DMA-maps the RAM
+ * PFNs, and invokes the driver-specific operations for migration to RAM.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int __drm_gpusvm_migrate_to_ram(struct vm_area_struct *vas,
+ void *device_private_page_owner,
+ struct page *page,
+ unsigned long fault_addr,
+ unsigned long size)
+{
+ struct migrate_vma migrate = {
+ .vma = vas,
+ .pgmap_owner = device_private_page_owner,
+ .flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE |
+ MIGRATE_VMA_SELECT_DEVICE_COHERENT,
+ .fault_page = page,
+ };
+ struct drm_gpusvm_zdd *zdd;
+ const struct drm_gpusvm_devmem_ops *ops;
+ struct device *dev = NULL;
+ unsigned long npages, mpages = 0;
+ struct page **pages;
+ dma_addr_t *dma_addr;
+ unsigned long start, end;
+ void *buf;
+ int i, err = 0;
+
+ start = ALIGN_DOWN(fault_addr, size);
+ end = ALIGN(fault_addr + 1, size);
+
+ /* Corner case where the VMA has been partially unmapped */
+ if (start < vas->vm_start)
+ start = vas->vm_start;
+ if (end > vas->vm_end)
+ end = vas->vm_end;
+
+ migrate.start = start;
+ migrate.end = end;
+ npages = npages_in_range(start, end);
+
+ buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
+ sizeof(*pages), GFP_KERNEL);
+ if (!buf) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
+ pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
+
+ migrate.vma = vas;
+ migrate.src = buf;
+ migrate.dst = migrate.src + npages;
+
+ err = migrate_vma_setup(&migrate);
+ if (err)
+ goto err_free;
+
+ /* Raced with another CPU fault, nothing to do */
+ if (!migrate.cpages)
+ goto err_free;
+
+ if (!page) {
+ for (i = 0; i < npages; ++i) {
+ if (!(migrate.src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ page = migrate_pfn_to_page(migrate.src[i]);
+ break;
+ }
+
+ if (!page)
+ goto err_finalize;
+ }
+ zdd = page->zone_device_data;
+ ops = zdd->devmem_allocation->ops;
+ dev = zdd->devmem_allocation->dev;
+
+ err = drm_gpusvm_migrate_populate_ram_pfn(vas, page, npages, &mpages,
+ migrate.src, migrate.dst,
+ start);
+ if (err)
+ goto err_finalize;
+
+ err = drm_gpusvm_migrate_map_pages(dev, dma_addr, migrate.dst, npages,
+ DMA_FROM_DEVICE);
+ if (err)
+ goto err_finalize;
+
+ for (i = 0; i < npages; ++i)
+ pages[i] = migrate_pfn_to_page(migrate.src[i]);
+
+ err = ops->copy_to_ram(pages, dma_addr, npages);
+ if (err)
+ goto err_finalize;
+
+err_finalize:
+ if (err)
+ drm_gpusvm_migration_unlock_put_pages(npages, migrate.dst);
+ migrate_vma_pages(&migrate);
+ migrate_vma_finalize(&migrate);
+ if (dev)
+ drm_gpusvm_migrate_unmap_pages(dev, dma_addr, npages,
+ DMA_FROM_DEVICE);
+err_free:
+ kvfree(buf);
+err_out:
+
+ return err;
+}
+
+/**
+ * drm_gpusvm_range_evict() - Evict GPU SVM range
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range to be evicted
+ *
+ * This function evicts the specified GPU SVM range by faulting its pages back
+ * to RAM. Device-coherent pages are not evicted.
+ *
+ * Return: 0 on success, a negative error code on failure.
+ */
+int drm_gpusvm_range_evict(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_range *range)
+{
+ struct mmu_interval_notifier *notifier = &range->notifier->notifier;
+ struct hmm_range hmm_range = {
+ .default_flags = HMM_PFN_REQ_FAULT,
+ .notifier = notifier,
+ .start = drm_gpusvm_range_start(range),
+ .end = drm_gpusvm_range_end(range),
+ .dev_private_owner = NULL,
+ };
+ unsigned long timeout =
+ jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
+ unsigned long *pfns;
+ unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
+ drm_gpusvm_range_end(range));
+ int err = 0;
+ struct mm_struct *mm = gpusvm->mm;
+
+ if (!mmget_not_zero(mm))
+ return -EFAULT;
+
+ pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
+ if (!pfns)
+ return -ENOMEM;
+
+ hmm_range.hmm_pfns = pfns;
+ while (!time_after(jiffies, timeout)) {
+ hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
+ if (time_after(jiffies, timeout)) {
+ err = -ETIME;
+ break;
+ }
+
+ mmap_read_lock(mm);
+ err = hmm_range_fault(&hmm_range);
+ mmap_read_unlock(mm);
+ if (err != -EBUSY)
+ break;
+ }
+
+ kvfree(pfns);
+ mmput(mm);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(drm_gpusvm_range_evict);
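+
+/*
+ * An illustrative sketch: a driver that must guarantee a range is backed by
+ * RAM (for example before tearing down the device memory it may reside in)
+ * can force the migration with this helper and retry on timeout.
+ *
+ *	err = drm_gpusvm_range_evict(gpusvm, range);
+ *	if (err == -ETIME)
+ *		// kept racing with invalidations; caller may retry
+ */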
+
+/**
+ * drm_gpusvm_page_free() - Put GPU SVM zone device data associated with a page
+ * @page: Pointer to the page
+ *
+ * This function is a callback used to put the GPU SVM zone device data
+ * associated with a page when it is being released.
+ */
+static void drm_gpusvm_page_free(struct page *page)
+{
+ drm_gpusvm_zdd_put(page->zone_device_data);
+}
+
+/**
+ * drm_gpusvm_migrate_to_ram() - Migrate GPU SVM range to RAM (page fault handler)
+ * @vmf: Pointer to the fault information structure
+ *
+ * This function is a page fault handler used to migrate a GPU SVM range to RAM.
+ * It retrieves the GPU SVM range information from the faulting page and invokes
+ * the internal migration function to migrate the range back to RAM.
+ *
+ * Return: VM_FAULT_SIGBUS on failure, 0 on success.
+ */
+static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
+{
+ struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
+ int err;
+
+ err = __drm_gpusvm_migrate_to_ram(vmf->vma,
+ zdd->device_private_page_owner,
+ vmf->page, vmf->address,
+ zdd->devmem_allocation->size);
+
+ return err ? VM_FAULT_SIGBUS : 0;
+}
+
+/**
+ * drm_gpusvm_pagemap_ops() - Device page map operations for GPU SVM
+ */
+static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
+ .page_free = drm_gpusvm_page_free,
+ .migrate_to_ram = drm_gpusvm_migrate_to_ram,
+};
+
+/**
+ * drm_gpusvm_pagemap_ops_get() - Retrieve GPU SVM device page map operations
+ *
+ * Return: Pointer to the GPU SVM device page map operations structure.
+ */
+const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
+{
+ return &drm_gpusvm_pagemap_ops;
+}
+EXPORT_SYMBOL_GPL(drm_gpusvm_pagemap_ops_get);
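+
+/*
+ * An illustrative sketch of plugging these ops in at device probe time, when
+ * remapping device memory as ZONE_DEVICE. The owner pointer here is
+ * hypothetical; it must match the device_private_page_owner used for the
+ * GPU SVM instance.
+ *
+ *	pagemap->type = MEMORY_DEVICE_PRIVATE;
+ *	pagemap->range.start = res->start;	// from request_free_mem_region()
+ *	pagemap->range.end = res->end;
+ *	pagemap->nr_range = 1;
+ *	pagemap->ops = drm_gpusvm_pagemap_ops_get();
+ *	pagemap->owner = driver_private_page_owner;
+ *
+ *	addr = devm_memremap_pages(dev, pagemap);
+ *	if (IS_ERR(addr))
+ *		return PTR_ERR(addr);
+ */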
+
+/**
+ * drm_gpusvm_has_mapping() - Check if GPU SVM has mapping for the given address range
+ * @gpusvm: Pointer to the GPU SVM structure.
+ * @start: Start address
+ * @end: End address
+ *
+ * Return: True if GPU SVM has mapping, False otherwise
+ */
+bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, unsigned long start,
+ unsigned long end)
+{
+ struct drm_gpusvm_notifier *notifier;
+
+ drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
+ struct drm_gpusvm_range *range = NULL;
+
+ drm_gpusvm_for_each_range(range, notifier, start, end)
+ return true;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(drm_gpusvm_has_mapping);
+
+/**
+ * drm_gpusvm_range_set_unmapped() - Mark a GPU SVM range as unmapped
+ * @range: Pointer to the GPU SVM range structure.
+ * @mmu_range: Pointer to the MMU notifier range structure.
+ *
+ * This function marks a GPU SVM range as unmapped and sets the partial_unmap
+ * flag if only part of the range lies within the provided MMU notifier range.
+ */
+void drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
+ const struct mmu_notifier_range *mmu_range)
+{
+ lockdep_assert_held_write(&range->gpusvm->notifier_lock);
+
+ range->flags.unmapped = true;
+ if (drm_gpusvm_range_start(range) < mmu_range->start ||
+ drm_gpusvm_range_end(range) > mmu_range->end)
+ range->flags.partial_unmap = true;
+}
+EXPORT_SYMBOL_GPL(drm_gpusvm_range_set_unmapped);
+
+/**
+ * drm_gpusvm_devmem_init() - Initialize a GPU SVM device memory allocation
+ * @devmem_allocation: Pointer to the device memory allocation to initialize
+ * @dev: Pointer to the device structure which device memory allocation belongs to
+ * @mm: Pointer to the mm_struct for the address space
+ * @ops: Pointer to the operations structure for GPU SVM device memory
+ * @dpagemap: The struct drm_pagemap we're allocating from.
+ * @size: Size of device memory allocation
+ */
+void drm_gpusvm_devmem_init(struct drm_gpusvm_devmem *devmem_allocation,
+ struct device *dev, struct mm_struct *mm,
+ const struct drm_gpusvm_devmem_ops *ops,
+ struct drm_pagemap *dpagemap, size_t size)
+{
+ init_completion(&devmem_allocation->detached);
+ devmem_allocation->dev = dev;
+ devmem_allocation->mm = mm;
+ devmem_allocation->ops = ops;
+ devmem_allocation->dpagemap = dpagemap;
+ devmem_allocation->size = size;
+}
+EXPORT_SYMBOL_GPL(drm_gpusvm_devmem_init);
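+
+/*
+ * An illustrative sketch: drivers are expected to embed struct
+ * drm_gpusvm_devmem in their device memory object (as xe does with the
+ * devmem_allocation member added to struct xe_bo in this series) and
+ * initialize it once the backing memory is allocated.
+ *
+ *	struct driver_devmem {			// hypothetical driver object
+ *		struct drm_gpusvm_devmem base;
+ *		struct driver_vram_block *block;
+ *	};
+ *
+ *	drm_gpusvm_devmem_init(&devmem->base, drm->dev, gpusvm->mm,
+ *			       &driver_devmem_ops,	// populate_devmem_pfn,
+ *							// copy_to_devmem,
+ *							// copy_to_ram,
+ *							// devmem_release
+ *			       &vram->dpagemap, size);
+ */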
+
+MODULE_DESCRIPTION("DRM GPUSVM");
+MODULE_LICENSE("GPL");
diff --git a/drivers/gpu/drm/xe/Kconfig b/drivers/gpu/drm/xe/Kconfig
index 99219c16e8aa..7d7995196702 100644
--- a/drivers/gpu/drm/xe/Kconfig
+++ b/drivers/gpu/drm/xe/Kconfig
@@ -39,6 +39,7 @@ config DRM_XE
select DRM_TTM_HELPER
select DRM_EXEC
select DRM_GPUVM
+ select DRM_GPUSVM if !UML && DEVICE_PRIVATE
select DRM_SCHED
select MMU_NOTIFIER
select WANT_DEV_COREDUMP
@@ -73,6 +74,15 @@ config DRM_XE_DP_TUNNEL
If in doubt say "Y".
+config DRM_XE_DEVMEM_MIRROR
+ bool "Enable device memory mirror"
+ depends on DRM_XE
+ select GET_FREE_REGION
+ default y
+ help
+ Disable this option only if you want to compile out device memory
+ mirror support. Disabling it reduces the KMD memory footprint.
+
config DRM_XE_FORCE_PROBE
string "Force probe xe for selected Intel hardware IDs"
depends on DRM_XE
diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index 81b8914b877c..9699b08585f7 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -33,6 +33,7 @@ xe-y += xe_bb.o \
xe_device_sysfs.o \
xe_dma_buf.o \
xe_drm_client.o \
+ xe_eu_stall.o \
xe_exec.o \
xe_exec_queue.o \
xe_execlist.o \
@@ -60,6 +61,7 @@ xe-y += xe_bb.o \
xe_guc_capture.o \
xe_guc_ct.o \
xe_guc_db_mgr.o \
+ xe_guc_engine_activity.o \
xe_guc_hwconfig.o \
xe_guc_id_mgr.o \
xe_guc_klv_helpers.o \
@@ -123,6 +125,7 @@ xe-y += xe_bb.o \
xe_wopcm.o
xe-$(CONFIG_HMM_MIRROR) += xe_hmm.o
+xe-$(CONFIG_DRM_GPUSVM) += xe_svm.o
# graphics hardware monitoring (HWMON) support
xe-$(CONFIG_HWMON) += xe_hwmon.o
diff --git a/drivers/gpu/drm/xe/abi/guc_actions_abi.h b/drivers/gpu/drm/xe/abi/guc_actions_abi.h
index fee385532fb0..ec516e838ee8 100644
--- a/drivers/gpu/drm/xe/abi/guc_actions_abi.h
+++ b/drivers/gpu/drm/xe/abi/guc_actions_abi.h
@@ -140,6 +140,7 @@ enum xe_guc_action {
XE_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC = 0x4601,
XE_GUC_ACTION_CLIENT_SOFT_RESET = 0x5507,
XE_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
+ XE_GUC_ACTION_SET_DEVICE_ENGINE_ACTIVITY_BUFFER = 0x550C,
XE_GUC_ACTION_NOTIFY_MEMORY_CAT_ERROR = 0x6000,
XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC = 0x6002,
XE_GUC_ACTION_PAGE_FAULT_RES_DESC = 0x6003,
diff --git a/drivers/gpu/drm/xe/abi/guc_actions_slpc_abi.h b/drivers/gpu/drm/xe/abi/guc_actions_slpc_abi.h
index 85abe4f09ae2..b28c8fa061f7 100644
--- a/drivers/gpu/drm/xe/abi/guc_actions_slpc_abi.h
+++ b/drivers/gpu/drm/xe/abi/guc_actions_slpc_abi.h
@@ -174,6 +174,9 @@ struct slpc_task_state_data {
};
} __packed;
+#define SLPC_CTX_FREQ_REQ_IS_COMPUTE REG_BIT(28)
+#define SLPC_OPTIMIZED_STRATEGY_COMPUTE REG_BIT(0)
+
struct slpc_shared_data_header {
/* Total size in bytes of this shared buffer. */
u32 size;
diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c
index 02a413a07382..d5d453dc927a 100644
--- a/drivers/gpu/drm/xe/display/xe_display.c
+++ b/drivers/gpu/drm/xe/display/xe_display.c
@@ -170,6 +170,7 @@ static void xe_display_fini(void *arg)
intel_hpd_poll_fini(xe);
intel_hdcp_component_fini(display);
intel_audio_deinit(display);
+ intel_display_driver_remove(display);
}
int xe_display_init(struct xe_device *xe)
@@ -184,7 +185,7 @@ int xe_display_init(struct xe_device *xe)
if (err)
return err;
- return xe_device_add_action_or_reset(xe, xe_display_fini, xe);
+ return devm_add_action_or_reset(xe->drm.dev, xe_display_fini, xe);
}
void xe_display_register(struct xe_device *xe)
@@ -209,16 +210,6 @@ void xe_display_unregister(struct xe_device *xe)
intel_display_driver_unregister(display);
}
-void xe_display_driver_remove(struct xe_device *xe)
-{
- struct intel_display *display = &xe->display;
-
- if (!xe->info.probe_display)
- return;
-
- intel_display_driver_remove(display);
-}
-
/* IRQ-related functions */
void xe_display_irq_handler(struct xe_device *xe, u32 master_ctl)
diff --git a/drivers/gpu/drm/xe/display/xe_display.h b/drivers/gpu/drm/xe/display/xe_display.h
index 685dc74402fb..46e14f8dee28 100644
--- a/drivers/gpu/drm/xe/display/xe_display.h
+++ b/drivers/gpu/drm/xe/display/xe_display.h
@@ -14,7 +14,6 @@ struct drm_driver;
bool xe_display_driver_probe_defer(struct pci_dev *pdev);
void xe_display_driver_set_hooks(struct drm_driver *driver);
-void xe_display_driver_remove(struct xe_device *xe);
int xe_display_create(struct xe_device *xe);
diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
index c8fd3d5ca502..4f372dc2cb89 100644
--- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
@@ -53,7 +53,6 @@
#define RING_CTL(base) XE_REG((base) + 0x3c)
#define RING_CTL_SIZE(size) ((size) - PAGE_SIZE) /* in bytes -> pages */
-#define RING_CTL_SIZE(size) ((size) - PAGE_SIZE) /* in bytes -> pages */
#define RING_START_UDW(base) XE_REG((base) + 0x48)
diff --git a/drivers/gpu/drm/xe/regs/xe_eu_stall_regs.h b/drivers/gpu/drm/xe/regs/xe_eu_stall_regs.h
new file mode 100644
index 000000000000..c53f57fdde65
--- /dev/null
+++ b/drivers/gpu/drm/xe/regs/xe_eu_stall_regs.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#ifndef _XE_EU_STALL_REGS_H_
+#define _XE_EU_STALL_REGS_H_
+
+#include "regs/xe_reg_defs.h"
+
+#define XEHPC_EUSTALL_BASE XE_REG_MCR(0xe520)
+#define XEHPC_EUSTALL_BASE_BUF_ADDR REG_GENMASK(31, 6)
+#define XEHPC_EUSTALL_BASE_XECORE_BUF_SZ REG_GENMASK(5, 3)
+#define XEHPC_EUSTALL_BASE_ENABLE_SAMPLING REG_BIT(1)
+
+#define XEHPC_EUSTALL_BASE_UPPER XE_REG_MCR(0xe524)
+
+#define XEHPC_EUSTALL_REPORT XE_REG_MCR(0xe528, XE_REG_OPTION_MASKED)
+#define XEHPC_EUSTALL_REPORT_WRITE_PTR_MASK REG_GENMASK(15, 2)
+#define XEHPC_EUSTALL_REPORT_OVERFLOW_DROP REG_BIT(1)
+
+#define XEHPC_EUSTALL_REPORT1 XE_REG_MCR(0xe52c, XE_REG_OPTION_MASKED)
+#define XEHPC_EUSTALL_REPORT1_READ_PTR_MASK REG_GENMASK(15, 2)
+
+#define XEHPC_EUSTALL_CTRL XE_REG_MCR(0xe53c, XE_REG_OPTION_MASKED)
+#define EUSTALL_MOCS REG_GENMASK(9, 3)
+#define EUSTALL_SAMPLE_RATE REG_GENMASK(2, 0)
+
+#endif
diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
index 096859072396..da1f198ac107 100644
--- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
@@ -358,14 +358,18 @@
#define RENDER_AWAKE_STATUS REG_BIT(1)
#define MEDIA_SLICE0_AWAKE_STATUS REG_BIT(0)
+#define MISC_STATUS_0 XE_REG(0xa500)
+
#define FORCEWAKE_MEDIA_VDBOX(n) XE_REG(0xa540 + (n) * 4)
#define FORCEWAKE_MEDIA_VEBOX(n) XE_REG(0xa560 + (n) * 4)
#define FORCEWAKE_GSC XE_REG(0xa618)
+#define XELP_GARBCNTL XE_REG(0xb004)
+#define XELP_BUS_HASH_CTL_BIT_EXC REG_BIT(7)
+
#define XEHPC_LNCFMISCCFGREG0 XE_REG_MCR(0xb01c, XE_REG_OPTION_MASKED)
#define XEHPC_OVRLSCCC REG_BIT(0)
-/* L3 Cache Control */
#define LNCFCMOCS_REG_COUNT 32
#define XELP_LNCFCMOCS(i) XE_REG(0xb020 + (i) * 4)
#define XEHP_LNCFCMOCS(i) XE_REG_MCR(0xb020 + (i) * 4)
@@ -478,6 +482,7 @@
#define TDL_TSL_CHICKEN XE_REG_MCR(0xe4c4, XE_REG_OPTION_MASKED)
#define STK_ID_RESTRICT REG_BIT(12)
#define SLM_WMTP_RESTORE REG_BIT(11)
+#define RES_CHK_SPR_DIS REG_BIT(6)
#define ROW_CHICKEN XE_REG_MCR(0xe4f0, XE_REG_OPTION_MASKED)
#define UGM_BACKUP_MODE REG_BIT(13)
diff --git a/drivers/gpu/drm/xe/regs/xe_regs.h b/drivers/gpu/drm/xe/regs/xe_regs.h
index 6cf282618836..3abb17d2ca33 100644
--- a/drivers/gpu/drm/xe/regs/xe_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_regs.h
@@ -7,10 +7,6 @@
#include "regs/xe_reg_defs.h"
-#define TIMESTAMP_OVERRIDE XE_REG(0x44074)
-#define TIMESTAMP_OVERRIDE_US_COUNTER_DENOMINATOR_MASK REG_GENMASK(15, 12)
-#define TIMESTAMP_OVERRIDE_US_COUNTER_DIVIDER_MASK REG_GENMASK(9, 0)
-
#define GU_CNTL_PROTECTED XE_REG(0x10100C)
#define DRIVERINT_FLR_DIS REG_BIT(31)
diff --git a/drivers/gpu/drm/xe/tests/xe_pci.c b/drivers/gpu/drm/xe/tests/xe_pci.c
index 67404863087e..1d3e2e50c355 100644
--- a/drivers/gpu/drm/xe/tests/xe_pci.c
+++ b/drivers/gpu/drm/xe/tests/xe_pci.c
@@ -21,15 +21,15 @@
*/
void xe_call_for_each_graphics_ip(xe_graphics_fn xe_fn)
{
- const struct xe_graphics_desc *ip, *last = NULL;
+ const struct xe_graphics_desc *desc, *last = NULL;
- for (int i = 0; i < ARRAY_SIZE(graphics_ip_map); i++) {
- ip = graphics_ip_map[i].ip;
- if (ip == last)
+ for (int i = 0; i < ARRAY_SIZE(graphics_ips); i++) {
+ desc = graphics_ips[i].desc;
+ if (desc == last)
continue;
- xe_fn(ip);
- last = ip;
+ xe_fn(desc);
+ last = desc;
}
}
EXPORT_SYMBOL_IF_KUNIT(xe_call_for_each_graphics_ip);
@@ -43,15 +43,15 @@ EXPORT_SYMBOL_IF_KUNIT(xe_call_for_each_graphics_ip);
*/
void xe_call_for_each_media_ip(xe_media_fn xe_fn)
{
- const struct xe_media_desc *ip, *last = NULL;
+ const struct xe_media_desc *desc, *last = NULL;
- for (int i = 0; i < ARRAY_SIZE(media_ip_map); i++) {
- ip = media_ip_map[i].ip;
- if (ip == last)
+ for (int i = 0; i < ARRAY_SIZE(media_ips); i++) {
+ desc = media_ips[i].desc;
+ if (desc == last)
continue;
- xe_fn(ip);
- last = ip;
+ xe_fn(desc);
+ last = desc;
}
}
EXPORT_SYMBOL_IF_KUNIT(xe_call_for_each_media_ip);
@@ -110,7 +110,7 @@ done:
kunit_activate_static_stub(test, read_gmdid, fake_read_gmdid);
xe_info_init_early(xe, desc, subplatform_desc);
- xe_info_init(xe, desc->graphics, desc->media);
+ xe_info_init(xe, desc);
return 0;
}
diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index c21e6bca3141..64f9c936eea0 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -281,6 +281,8 @@ int xe_bo_placement_for_flags(struct xe_device *xe, struct xe_bo *bo,
static void xe_evict_flags(struct ttm_buffer_object *tbo,
struct ttm_placement *placement)
{
+ struct xe_bo *bo;
+
if (!xe_bo_is_xe_bo(tbo)) {
/* Don't handle scatter gather BOs */
if (tbo->type == ttm_bo_type_sg) {
@@ -292,6 +294,12 @@ static void xe_evict_flags(struct ttm_buffer_object *tbo,
return;
}
+ bo = ttm_to_xe_bo(tbo);
+ if (bo->flags & XE_BO_FLAG_CPU_ADDR_MIRROR) {
+ *placement = sys_placement;
+ return;
+ }
+
/*
* For xe, sg bos that are evicted to system just triggers a
* rebind of the sg list upon subsequent validation to XE_PL_TT.
@@ -789,6 +797,20 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
goto out;
}
+ if (!move_lacks_source && (bo->flags & XE_BO_FLAG_CPU_ADDR_MIRROR) &&
+ new_mem->mem_type == XE_PL_SYSTEM) {
+ ret = xe_svm_bo_evict(bo);
+ if (!ret) {
+ drm_dbg(&xe->drm, "Evict system allocator BO success\n");
+ ttm_bo_move_null(ttm_bo, new_mem);
+ } else {
+ drm_dbg(&xe->drm, "Evict system allocator BO failed=%pe\n",
+ ERR_PTR(ret));
+ }
+
+ goto out;
+ }
+
if (old_mem_type == XE_PL_SYSTEM && new_mem->mem_type == XE_PL_TT && !handle_system_ccs) {
ttm_bo_move_null(ttm_bo, new_mem);
goto out;
@@ -2441,6 +2463,7 @@ int xe_gem_create_ioctl(struct drm_device *dev, void *data,
struct xe_file *xef = to_xe_file(file);
struct drm_xe_gem_create *args = data;
struct xe_vm *vm = NULL;
+ ktime_t end = 0;
struct xe_bo *bo;
unsigned int bo_flags;
u32 handle;
@@ -2512,6 +2535,10 @@ int xe_gem_create_ioctl(struct drm_device *dev, void *data,
vm = xe_vm_lookup(xef, args->vm_id);
if (XE_IOCTL_DBG(xe, !vm))
return -ENOENT;
+ }
+
+retry:
+ if (vm) {
err = xe_vm_lock(vm, true);
if (err)
goto out_vm;
@@ -2525,6 +2552,8 @@ int xe_gem_create_ioctl(struct drm_device *dev, void *data,
if (IS_ERR(bo)) {
err = PTR_ERR(bo);
+ if (xe_vm_validate_should_retry(NULL, err, &end))
+ goto retry;
goto out_vm;
}
@@ -2821,6 +2850,31 @@ void xe_bo_put_commit(struct llist_head *deferred)
drm_gem_object_free(&bo->ttm.base.refcount);
}
+static void xe_bo_dev_work_func(struct work_struct *work)
+{
+ struct xe_bo_dev *bo_dev = container_of(work, typeof(*bo_dev), async_free);
+
+ xe_bo_put_commit(&bo_dev->async_list);
+}
+
+/**
+ * xe_bo_dev_init() - Initialize BO dev to manage async BO freeing
+ * @bo_dev: The BO dev structure
+ */
+void xe_bo_dev_init(struct xe_bo_dev *bo_dev)
+{
+ INIT_WORK(&bo_dev->async_free, xe_bo_dev_work_func);
+}
+
+/**
+ * xe_bo_dev_fini() - Finalize BO dev managing async BO freeing
+ * @bo_dev: The BO dev structure
+ */
+void xe_bo_dev_fini(struct xe_bo_dev *bo_dev)
+{
+ flush_work(&bo_dev->async_free);
+}
+
void xe_bo_put(struct xe_bo *bo)
{
struct xe_tile *tile;
diff --git a/drivers/gpu/drm/xe/xe_bo.h b/drivers/gpu/drm/xe/xe_bo.h
index 2a7240c45511..bda3fdd408da 100644
--- a/drivers/gpu/drm/xe/xe_bo.h
+++ b/drivers/gpu/drm/xe/xe_bo.h
@@ -47,6 +47,7 @@
XE_BO_FLAG_GGTT1 | \
XE_BO_FLAG_GGTT2 | \
XE_BO_FLAG_GGTT3)
+#define XE_BO_FLAG_CPU_ADDR_MIRROR BIT(22)
/* this one is trigger internally only */
#define XE_BO_FLAG_INTERNAL_TEST BIT(30)
@@ -345,6 +346,25 @@ xe_bo_put_deferred(struct xe_bo *bo, struct llist_head *deferred)
void xe_bo_put_commit(struct llist_head *deferred);
+/**
+ * xe_bo_put_async() - Put BO async
+ * @bo: The bo to put.
+ *
+ * Put BO asynchronously; the final put is deferred to a worker so it can run
+ * outside of IRQ context.
+ */
+static inline void
+xe_bo_put_async(struct xe_bo *bo)
+{
+ struct xe_bo_dev *bo_device = &xe_bo_device(bo)->bo_device;
+
+ if (xe_bo_put_deferred(bo, &bo_device->async_list))
+ schedule_work(&bo_device->async_free);
+}
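+
+/*
+ * An illustrative sketch: callers that may drop the last reference from IRQ
+ * or other atomic context (for example a dma-fence callback) use the async
+ * variant instead of xe_bo_put(). driver_job is a hypothetical structure.
+ *
+ *	static void driver_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+ *	{
+ *		struct driver_job *job = container_of(cb, typeof(*job), cb);
+ *
+ *		xe_bo_put_async(job->bo);	// final free runs in the worker
+ *	}
+ */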
+
+void xe_bo_dev_init(struct xe_bo_dev *bo_device);
+
+void xe_bo_dev_fini(struct xe_bo_dev *bo_device);
+
struct sg_table *xe_bo_sg(struct xe_bo *bo);
/*
diff --git a/drivers/gpu/drm/xe/xe_bo_types.h b/drivers/gpu/drm/xe/xe_bo_types.h
index 60c522866500..15a92e3d4898 100644
--- a/drivers/gpu/drm/xe/xe_bo_types.h
+++ b/drivers/gpu/drm/xe/xe_bo_types.h
@@ -8,6 +8,7 @@
#include <linux/iosys-map.h>
+#include <drm/drm_gpusvm.h>
#include <drm/ttm/ttm_bo.h>
#include <drm/ttm/ttm_device.h>
#include <drm/ttm/ttm_placement.h>
@@ -80,6 +81,9 @@ struct xe_bo {
*/
u16 cpu_caching;
+ /** @devmem_allocation: SVM device memory allocation */
+ struct drm_gpusvm_devmem devmem_allocation;
+
/** @vram_userfault_link: Link into @mem_access.vram_userfault.list */
struct list_head vram_userfault_link;
diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
index 39fe485d2085..81b9d9bb3f57 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -237,7 +237,7 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
/*
* NB: Despite passing a GFP_ flags parameter here, more allocations are done
- * internally using GFP_KERNEL expliictly. Hence this call must be in the worker
+ * internally using GFP_KERNEL explicitly. Hence this call must be in the worker
* thread and not in the initial capture call.
*/
dev_coredumpm_timeout(gt_to_xe(ss->gt)->drm.dev, THIS_MODULE, coredump, 0, GFP_KERNEL,
@@ -423,11 +423,11 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi
if (size & 3)
drm_printf(p, "Size not word aligned: %zu", size);
if (offset & 3)
- drm_printf(p, "Offset not word aligned: %zu", size);
+ drm_printf(p, "Offset not word aligned: %zu", offset);
line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL);
- if (IS_ERR_OR_NULL(line_buff)) {
- drm_printf(p, "Failed to allocate line buffer: %pe", line_buff);
+ if (!line_buff) {
+ drm_printf(p, "Failed to allocate line buffer\n");
return;
}
diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 68ef12b57344..5d79b439dd62 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -54,7 +54,6 @@
#include "xe_query.h"
#include "xe_shrinker.h"
#include "xe_sriov.h"
-#include "xe_survivability_mode.h"
#include "xe_tile.h"
#include "xe_ttm_stolen_mgr.h"
#include "xe_ttm_sys_mgr.h"
@@ -66,12 +65,6 @@
#include <generated/xe_wa_oob.h>
-struct xe_device_remove_action {
- struct list_head node;
- void (*action)(void *);
- void *data;
-};
-
static int xe_file_open(struct drm_device *dev, struct drm_file *file)
{
struct xe_device *xe = to_xe_device(dev);
@@ -395,6 +388,8 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy)
{
struct xe_device *xe = to_xe_device(dev);
+ xe_bo_dev_fini(&xe->bo_device);
+
if (xe->preempt_fence_wq)
destroy_workqueue(xe->preempt_fence_wq);
@@ -435,6 +430,7 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
if (WARN_ON(err))
goto err;
+ xe_bo_dev_init(&xe->bo_device);
err = drmm_add_action_or_reset(&xe->drm, xe_device_destroy, NULL);
if (err)
goto err;
@@ -675,7 +671,7 @@ static int wait_for_lmem_ready(struct xe_device *xe)
}
ALLOW_ERROR_INJECTION(wait_for_lmem_ready, ERRNO); /* See xe_pci_probe() */
-static void update_device_info(struct xe_device *xe)
+static void sriov_update_device_info(struct xe_device *xe)
{
/* disable features that are not available/applicable to VFs */
if (IS_SRIOV_VF(xe)) {
@@ -706,15 +702,11 @@ int xe_device_probe_early(struct xe_device *xe)
xe_sriov_probe_early(xe);
- update_device_info(xe);
+ sriov_update_device_info(xe);
err = xe_pcode_probe_early(xe);
- if (err) {
- if (xe_survivability_mode_required(xe))
- xe_survivability_mode_init(xe);
-
+ if (err)
return err;
- }
err = wait_for_lmem_ready(xe);
if (err)
@@ -760,9 +752,6 @@ int xe_device_probe(struct xe_device *xe)
int err;
u8 id;
- xe->probing = true;
- INIT_LIST_HEAD(&xe->remove_action_list);
-
xe_pat_init_early(xe);
err = xe_sriov_init(xe);
@@ -770,6 +759,7 @@ int xe_device_probe(struct xe_device *xe)
return err;
xe->info.mem_region_mask = 1;
+
err = xe_set_dma_info(xe);
if (err)
return err;
@@ -778,7 +768,9 @@ int xe_device_probe(struct xe_device *xe)
if (err)
return err;
- xe_ttm_sys_mgr_init(xe);
+ err = xe_ttm_sys_mgr_init(xe);
+ if (err)
+ return err;
for_each_gt(gt, xe, id) {
err = xe_gt_init_early(gt);
@@ -873,7 +865,9 @@ int xe_device_probe(struct xe_device *xe)
return err;
}
- xe_heci_gsc_init(xe);
+ err = xe_heci_gsc_init(xe);
+ if (err)
+ return err;
err = xe_oa_init(xe);
if (err)
@@ -885,11 +879,11 @@ int xe_device_probe(struct xe_device *xe)
err = xe_pxp_init(xe);
if (err)
- goto err_remove_display;
+ return err;
err = drm_dev_register(&xe->drm, 0);
if (err)
- goto err_remove_display;
+ return err;
xe_display_register(xe);
@@ -912,84 +906,19 @@ int xe_device_probe(struct xe_device *xe)
xe_vsec_init(xe);
- xe->probing = false;
-
return devm_add_action_or_reset(xe->drm.dev, xe_device_sanitize, xe);
err_unregister_display:
xe_display_unregister(xe);
-err_remove_display:
- xe_display_driver_remove(xe);
return err;
}
-/**
- * xe_device_call_remove_actions - Call the remove actions
- * @xe: xe device instance
- *
- * This is only to be used by xe_pci and xe_device to call the remove actions
- * while removing the driver or handling probe failures.
- */
-void xe_device_call_remove_actions(struct xe_device *xe)
-{
- struct xe_device_remove_action *ra, *tmp;
-
- list_for_each_entry_safe(ra, tmp, &xe->remove_action_list, node) {
- ra->action(ra->data);
- list_del(&ra->node);
- kfree(ra);
- }
-
- xe->probing = false;
-}
-
-/**
- * xe_device_add_action_or_reset - Add an action to run on driver removal
- * @xe: xe device instance
- * @action: Function that should be called on device remove
- * @data: Pointer to data passed to @action implementation
- *
- * This adds a custom action to the list of remove callbacks executed on device
- * remove, before any dev or drm managed resources are removed. This is only
- * needed if the action leads to component_del()/component_master_del() since
- * that is not compatible with devres cleanup.
- *
- * Returns: 0 on success or a negative error code on failure, in which case
- * @action is already called.
- */
-int xe_device_add_action_or_reset(struct xe_device *xe,
- void (*action)(void *), void *data)
-{
- struct xe_device_remove_action *ra;
-
- drm_WARN_ON(&xe->drm, !xe->probing);
-
- ra = kmalloc(sizeof(*ra), GFP_KERNEL);
- if (!ra) {
- action(data);
- return -ENOMEM;
- }
-
- INIT_LIST_HEAD(&ra->node);
- ra->action = action;
- ra->data = data;
- list_add(&ra->node, &xe->remove_action_list);
-
- return 0;
-}
-
void xe_device_remove(struct xe_device *xe)
{
xe_display_unregister(xe);
drm_dev_unplug(&xe->drm);
-
- xe_display_driver_remove(xe);
-
- xe_heci_gsc_fini(xe);
-
- xe_device_call_remove_actions(xe);
}
void xe_device_shutdown(struct xe_device *xe)
diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
index 079dad32a6f5..0bc3bc8e6803 100644
--- a/drivers/gpu/drm/xe/xe_device.h
+++ b/drivers/gpu/drm/xe/xe_device.h
@@ -45,9 +45,6 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
const struct pci_device_id *ent);
int xe_device_probe_early(struct xe_device *xe);
int xe_device_probe(struct xe_device *xe);
-int xe_device_add_action_or_reset(struct xe_device *xe,
- void (*action)(void *), void *data);
-void xe_device_call_remove_actions(struct xe_device *xe);
void xe_device_remove(struct xe_device *xe);
void xe_device_shutdown(struct xe_device *xe);
diff --git a/drivers/gpu/drm/xe/xe_device_sysfs.c b/drivers/gpu/drm/xe/xe_device_sysfs.c
index 7375937934fa..7efbd4c52791 100644
--- a/drivers/gpu/drm/xe/xe_device_sysfs.c
+++ b/drivers/gpu/drm/xe/xe_device_sysfs.c
@@ -32,9 +32,6 @@ vram_d3cold_threshold_show(struct device *dev,
struct xe_device *xe = pdev_to_xe_device(pdev);
int ret;
- if (!xe)
- return -EINVAL;
-
xe_pm_runtime_get(xe);
ret = sysfs_emit(buf, "%d\n", xe->d3cold.vram_threshold);
xe_pm_runtime_put(xe);
@@ -51,9 +48,6 @@ vram_d3cold_threshold_store(struct device *dev, struct device_attribute *attr,
u32 vram_d3cold_threshold;
int ret;
- if (!xe)
- return -EINVAL;
-
ret = kstrtou32(buff, 0, &vram_d3cold_threshold);
if (ret)
return ret;
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index e312595dda7e..72ef0b6fc425 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -10,6 +10,7 @@
#include <drm/drm_device.h>
#include <drm/drm_file.h>
+#include <drm/drm_pagemap.h>
#include <drm/ttm/ttm_device.h>
#include "xe_devcoredump_types.h"
@@ -106,6 +107,19 @@ struct xe_vram_region {
resource_size_t actual_physical_size;
/** @mapping: pointer to VRAM mappable space */
void __iomem *mapping;
+ /** @pagemap: Used to remap device memory as ZONE_DEVICE */
+ struct dev_pagemap pagemap;
+ /**
+ * @dpagemap: The struct drm_pagemap of the ZONE_DEVICE memory
+ * pages of this tile.
+ */
+ struct drm_pagemap dpagemap;
+ /**
+ * @hpa_base: base host physical address
+ *
+ * This is generated when remapping device memory as ZONE_DEVICE.
+ */
+ resource_size_t hpa_base;
/** @ttm: VRAM TTM manager */
struct xe_ttm_vram_mgr ttm;
};
@@ -431,20 +445,6 @@ struct xe_device {
struct xe_tile tiles[XE_MAX_TILES_PER_DEVICE];
/**
- * @remove_action_list: list of actions to execute on device remove.
- * Use xe_device_add_remove_action() for that. Actions can only be added
- * during probe and are executed during the call from PCI subsystem to
- * remove the driver from the device.
- */
- struct list_head remove_action_list;
-
- /**
- * @probing: cover the section in which @remove_action_list can be used
- * to post cleaning actions
- */
- bool probing;
-
- /**
* @mem_access: keep track of memory access in the device, possibly
* triggering additional actions when they occur.
*/
@@ -541,6 +541,14 @@ struct xe_device {
int mode;
} wedged;
+ /** @bo_device: Struct to control async free of BOs */
+ struct xe_bo_dev {
+ /** @bo_device.async_free: Free worker */
+ struct work_struct async_free;
+ /** @bo_device.async_list: List of BOs to be freed */
+ struct llist_head async_list;
+ } bo_device;
+
/** @pmu: performance monitoring unit */
struct xe_pmu pmu;
diff --git a/drivers/gpu/drm/xe/xe_eu_stall.c b/drivers/gpu/drm/xe/xe_eu_stall.c
new file mode 100644
index 000000000000..88a92baf5c95
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_eu_stall.c
@@ -0,0 +1,960 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#include <linux/anon_inodes.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/types.h>
+
+#include <drm/drm_drv.h>
+#include <generated/xe_wa_oob.h>
+#include <uapi/drm/xe_drm.h>
+
+#include "xe_bo.h"
+#include "xe_device.h"
+#include "xe_eu_stall.h"
+#include "xe_force_wake.h"
+#include "xe_gt_mcr.h"
+#include "xe_gt_printk.h"
+#include "xe_gt_topology.h"
+#include "xe_macros.h"
+#include "xe_observation.h"
+#include "xe_pm.h"
+#include "xe_trace.h"
+#include "xe_wa.h"
+
+#include "regs/xe_eu_stall_regs.h"
+#include "regs/xe_gt_regs.h"
+
+#define POLL_PERIOD_MS 5
+
+static size_t per_xecore_buf_size = SZ_512K;
+
+struct per_xecore_buf {
+ /* Buffer vaddr */
+ u8 *vaddr;
+ /* Write pointer */
+ u32 write;
+ /* Read pointer */
+ u32 read;
+};
+
+struct xe_eu_stall_data_stream {
+ bool pollin;
+ bool enabled;
+ int wait_num_reports;
+ int sampling_rate_mult;
+ wait_queue_head_t poll_wq;
+ size_t data_record_size;
+ size_t per_xecore_buf_size;
+
+ struct xe_gt *gt;
+ struct xe_bo *bo;
+ struct per_xecore_buf *xecore_buf;
+ struct {
+ bool reported_to_user;
+ xe_dss_mask_t mask;
+ } data_drop;
+ struct delayed_work buf_poll_work;
+};
+
+struct xe_eu_stall_gt {
+ /* Lock to protect stream */
+ struct mutex stream_lock;
+ /* EU stall data stream */
+ struct xe_eu_stall_data_stream *stream;
+ /* Workqueue to schedule buffer pointers polling work */
+ struct workqueue_struct *buf_ptr_poll_wq;
+};
+
+/**
+ * struct eu_stall_open_properties - EU stall sampling properties received
+ * from user space at open.
+ * @sampling_rate_mult: EU stall sampling rate multiplier.
+ * HW will sample every (sampling_rate_mult x 251) cycles.
+ * @wait_num_reports: Minimum number of EU stall data reports to unblock poll().
+ * @gt: GT on which EU stall data will be captured.
+ */
+struct eu_stall_open_properties {
+ int sampling_rate_mult;
+ int wait_num_reports;
+ struct xe_gt *gt;
+};
+
+/*
+ * EU stall data format for PVC
+ */
+struct xe_eu_stall_data_pvc {
+ __u64 ip_addr:29; /* Bits 0 to 28 */
+ __u64 active_count:8; /* Bits 29 to 36 */
+ __u64 other_count:8; /* Bits 37 to 44 */
+ __u64 control_count:8; /* Bits 45 to 52 */
+ __u64 pipestall_count:8; /* Bits 53 to 60 */
+ __u64 send_count:8; /* Bits 61 to 68 */
+ __u64 dist_acc_count:8; /* Bits 69 to 76 */
+ __u64 sbid_count:8; /* Bits 77 to 84 */
+ __u64 sync_count:8; /* Bits 85 to 92 */
+ __u64 inst_fetch_count:8; /* Bits 93 to 100 */
+ __u64 unused_bits:27;
+ __u64 unused[6];
+} __packed;
+
+/*
+ * EU stall data format for Xe2 arch GPUs (LNL, BMG).
+ */
+struct xe_eu_stall_data_xe2 {
+ __u64 ip_addr:29; /* Bits 0 to 28 */
+ __u64 tdr_count:8; /* Bits 29 to 36 */
+ __u64 other_count:8; /* Bits 37 to 44 */
+ __u64 control_count:8; /* Bits 45 to 52 */
+ __u64 pipestall_count:8; /* Bits 53 to 60 */
+ __u64 send_count:8; /* Bits 61 to 68 */
+ __u64 dist_acc_count:8; /* Bits 69 to 76 */
+ __u64 sbid_count:8; /* Bits 77 to 84 */
+ __u64 sync_count:8; /* Bits 85 to 92 */
+ __u64 inst_fetch_count:8; /* Bits 93 to 100 */
+ __u64 active_count:8; /* Bits 101 to 108 */
+ __u64 ex_id:3; /* Bits 109 to 111 */
+ __u64 end_flag:1; /* Bit 112 */
+ __u64 unused_bits:15;
+ __u64 unused[6];
+} __packed;
+
+const u64 eu_stall_sampling_rates[] = {251, 251 * 2, 251 * 3, 251 * 4, 251 * 5, 251 * 6, 251 * 7};
+
+/**
+ * xe_eu_stall_get_sampling_rates - get EU stall sampling rates information.
+ *
+ * @num_rates: Pointer to a u32 to return the number of sampling rates.
+ * @rates: Double u64 pointer used to return the array of sampling rates.
+ *
+ * Stores the number of sampling rates and a pointer to the array of
+ * sampling rates in the output arguments.
+ *
+ * Returns: Size of the EU stall sampling rates array.
+ */
+size_t xe_eu_stall_get_sampling_rates(u32 *num_rates, const u64 **rates)
+{
+ *num_rates = ARRAY_SIZE(eu_stall_sampling_rates);
+ *rates = eu_stall_sampling_rates;
+
+ return sizeof(eu_stall_sampling_rates);
+}
+
+/**
+ * xe_eu_stall_get_per_xecore_buf_size - get per XeCore buffer size.
+ *
+ * Returns: The per XeCore buffer size used to allocate the per GT
+ * EU stall data buffer.
+ */
+size_t xe_eu_stall_get_per_xecore_buf_size(void)
+{
+ return per_xecore_buf_size;
+}
+
+/**
+ * xe_eu_stall_data_record_size - get EU stall data record size.
+ *
+ * @xe: Pointer to a Xe device.
+ *
+ * Returns: EU stall data record size.
+ */
+size_t xe_eu_stall_data_record_size(struct xe_device *xe)
+{
+ size_t record_size = 0;
+
+ if (xe->info.platform == XE_PVC)
+ record_size = sizeof(struct xe_eu_stall_data_pvc);
+ else if (GRAPHICS_VER(xe) >= 20)
+ record_size = sizeof(struct xe_eu_stall_data_xe2);
+
+ xe_assert(xe, is_power_of_2(record_size));
+
+ return record_size;
+}
+
+/**
+ * num_data_rows - Return the number of EU stall data rows of 64B each
+ * for a given data size.
+ *
+ * @data_size: EU stall data size
+ */
+static u32 num_data_rows(u32 data_size)
+{
+ return data_size >> 6;
+}
+
+static void xe_eu_stall_fini(void *arg)
+{
+ struct xe_gt *gt = arg;
+
+ destroy_workqueue(gt->eu_stall->buf_ptr_poll_wq);
+ mutex_destroy(&gt->eu_stall->stream_lock);
+ kfree(gt->eu_stall);
+}
+
+/**
+ * xe_eu_stall_init() - Allocate and initialize GT level EU stall data
+ * structure xe_eu_stall_gt within struct xe_gt.
+ *
+ * @gt: GT being initialized.
+ *
+ * Returns: zero on success or a negative error code.
+ */
+int xe_eu_stall_init(struct xe_gt *gt)
+{
+ struct xe_device *xe = gt_to_xe(gt);
+ int ret;
+
+ gt->eu_stall = kzalloc(sizeof(*gt->eu_stall), GFP_KERNEL);
+ if (!gt->eu_stall) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ mutex_init(&gt->eu_stall->stream_lock);
+
+ gt->eu_stall->buf_ptr_poll_wq = alloc_ordered_workqueue("xe_eu_stall", 0);
+ if (!gt->eu_stall->buf_ptr_poll_wq) {
+ ret = -ENOMEM;
+ goto exit_free;
+ }
+
+ ret = devm_add_action_or_reset(xe->drm.dev, xe_eu_stall_fini, gt);
+ if (ret)
+ goto exit_destroy;
+
+ return 0;
+exit_destroy:
+ destroy_workqueue(gt->eu_stall->buf_ptr_poll_wq);
+exit_free:
+ mutex_destroy(&gt->eu_stall->stream_lock);
+ kfree(gt->eu_stall);
+exit:
+ return ret;
+}
+
+static int set_prop_eu_stall_sampling_rate(struct xe_device *xe, u64 value,
+ struct eu_stall_open_properties *props)
+{
+ value = div_u64(value, 251);
+ if (value == 0 || value > 7) {
+ drm_dbg(&xe->drm, "Invalid EU stall sampling rate %llu\n", value);
+ return -EINVAL;
+ }
+ props->sampling_rate_mult = value;
+ return 0;
+}
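+
+/*
+ * For example, a user space value of 753 cycles gives 753 / 251 = 3, so the
+ * hardware samples every 3 * 251 = 753 GPU cycles. Values below 251
+ * (multiplier 0) and values of 2008 or more (multiplier above 7) are rejected.
+ */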
+
+static int set_prop_eu_stall_wait_num_reports(struct xe_device *xe, u64 value,
+ struct eu_stall_open_properties *props)
+{
+ props->wait_num_reports = value;
+
+ return 0;
+}
+
+static int set_prop_eu_stall_gt_id(struct xe_device *xe, u64 value,
+ struct eu_stall_open_properties *props)
+{
+ if (value >= xe->info.gt_count) {
+ drm_dbg(&xe->drm, "Invalid GT ID %llu for EU stall sampling\n", value);
+ return -EINVAL;
+ }
+ props->gt = xe_device_get_gt(xe, value);
+ return 0;
+}
+
+typedef int (*set_eu_stall_property_fn)(struct xe_device *xe, u64 value,
+ struct eu_stall_open_properties *props);
+
+static const set_eu_stall_property_fn xe_set_eu_stall_property_funcs[] = {
+ [DRM_XE_EU_STALL_PROP_SAMPLE_RATE] = set_prop_eu_stall_sampling_rate,
+ [DRM_XE_EU_STALL_PROP_WAIT_NUM_REPORTS] = set_prop_eu_stall_wait_num_reports,
+ [DRM_XE_EU_STALL_PROP_GT_ID] = set_prop_eu_stall_gt_id,
+};
+
+static int xe_eu_stall_user_ext_set_property(struct xe_device *xe, u64 extension,
+ struct eu_stall_open_properties *props)
+{
+ u64 __user *address = u64_to_user_ptr(extension);
+ struct drm_xe_ext_set_property ext;
+ int err;
+ u32 idx;
+
+ err = __copy_from_user(&ext, address, sizeof(ext));
+ if (XE_IOCTL_DBG(xe, err))
+ return -EFAULT;
+
+ if (XE_IOCTL_DBG(xe, ext.property >= ARRAY_SIZE(xe_set_eu_stall_property_funcs)) ||
+ XE_IOCTL_DBG(xe, ext.pad))
+ return -EINVAL;
+
+ idx = array_index_nospec(ext.property, ARRAY_SIZE(xe_set_eu_stall_property_funcs));
+ return xe_set_eu_stall_property_funcs[idx](xe, ext.value, props);
+}
+
+typedef int (*xe_eu_stall_user_extension_fn)(struct xe_device *xe, u64 extension,
+ struct eu_stall_open_properties *props);
+static const xe_eu_stall_user_extension_fn xe_eu_stall_user_extension_funcs[] = {
+ [DRM_XE_EU_STALL_EXTENSION_SET_PROPERTY] = xe_eu_stall_user_ext_set_property,
+};
+
+#define MAX_USER_EXTENSIONS 5
+static int xe_eu_stall_user_extensions(struct xe_device *xe, u64 extension,
+ int ext_number, struct eu_stall_open_properties *props)
+{
+ u64 __user *address = u64_to_user_ptr(extension);
+ struct drm_xe_user_extension ext;
+ int err;
+ u32 idx;
+
+ if (XE_IOCTL_DBG(xe, ext_number >= MAX_USER_EXTENSIONS))
+ return -E2BIG;
+
+ err = __copy_from_user(&ext, address, sizeof(ext));
+ if (XE_IOCTL_DBG(xe, err))
+ return -EFAULT;
+
+ if (XE_IOCTL_DBG(xe, ext.pad) ||
+ XE_IOCTL_DBG(xe, ext.name >= ARRAY_SIZE(xe_eu_stall_user_extension_funcs)))
+ return -EINVAL;
+
+ idx = array_index_nospec(ext.name, ARRAY_SIZE(xe_eu_stall_user_extension_funcs));
+ err = xe_eu_stall_user_extension_funcs[idx](xe, extension, props);
+ if (XE_IOCTL_DBG(xe, err))
+ return err;
+
+ if (ext.next_extension)
+ return xe_eu_stall_user_extensions(xe, ext.next_extension, ++ext_number, props);
+
+ return 0;
+}
+
+/**
+ * buf_data_size - Calculate the number of bytes in a circular buffer
+ * given the read and write pointers and the size of
+ * the buffer.
+ *
+ * @buf_size: Size of the circular buffer
+ * @read_ptr: Read pointer with an additional overflow bit
+ * @write_ptr: Write pointer with an additional overflow bit
+ *
+ * Since the read and write pointers have an additional overflow bit,
+ * this function calculates the offsets from the pointers and uses the
+ * offsets to calculate the data size in the buffer.
+ *
+ * Returns: number of bytes of data in the buffer
+ */
+static u32 buf_data_size(size_t buf_size, u32 read_ptr, u32 write_ptr)
+{
+ u32 read_offset, write_offset, size = 0;
+
+ if (read_ptr == write_ptr)
+ goto exit;
+
+ read_offset = read_ptr & (buf_size - 1);
+ write_offset = write_ptr & (buf_size - 1);
+
+ if (write_offset > read_offset)
+ size = write_offset - read_offset;
+ else
+ size = buf_size - read_offset + write_offset;
+exit:
+ return size;
+}
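+
+/*
+ * Worked example: with a 256 KiB buffer, offsets use bits [17:0] and bit 18
+ * is the extra wrap bit. For read_ptr = 0x3ff00 and write_ptr = 0x40100 (the
+ * writer has wrapped once), read_offset = 0x3ff00, write_offset = 0x100 and
+ * the data size is 0x40000 - 0x3ff00 + 0x100 = 0x200 bytes.
+ */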
+
+/**
+ * eu_stall_data_buf_poll - Poll for EU stall data in the buffer.
+ *
+ * @stream: xe EU stall data stream instance
+ *
+ * Returns: true if the EU stall buffer contains minimum stall data as
+ * specified by the event report count, else false.
+ */
+static bool eu_stall_data_buf_poll(struct xe_eu_stall_data_stream *stream)
+{
+ u32 read_ptr, write_ptr_reg, write_ptr, total_data = 0;
+ u32 buf_size = stream->per_xecore_buf_size;
+ struct per_xecore_buf *xecore_buf;
+ struct xe_gt *gt = stream->gt;
+ bool min_data_present = false;
+ u16 group, instance;
+ unsigned int xecore;
+
+ mutex_lock(&gt->eu_stall->stream_lock);
+ for_each_dss_steering(xecore, gt, group, instance) {
+ xecore_buf = &stream->xecore_buf[xecore];
+ read_ptr = xecore_buf->read;
+ write_ptr_reg = xe_gt_mcr_unicast_read(gt, XEHPC_EUSTALL_REPORT,
+ group, instance);
+ write_ptr = REG_FIELD_GET(XEHPC_EUSTALL_REPORT_WRITE_PTR_MASK, write_ptr_reg);
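+ /*
+ * The register reports the write pointer in 64-byte units (hence the
+ * shift by 6 below to get a byte offset); the extra overflow bit is
+ * preserved by masking against twice the buffer size.
+ */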
+ write_ptr <<= 6;
+ write_ptr &= ((buf_size << 1) - 1);
+ if (!min_data_present) {
+ total_data += buf_data_size(buf_size, read_ptr, write_ptr);
+ if (num_data_rows(total_data) >= stream->wait_num_reports)
+ min_data_present = true;
+ }
+ if (write_ptr_reg & XEHPC_EUSTALL_REPORT_OVERFLOW_DROP)
+ set_bit(xecore, stream->data_drop.mask);
+ xecore_buf->write = write_ptr;
+ }
+ mutex_unlock(&gt->eu_stall->stream_lock);
+
+ return min_data_present;
+}
+
+static void clear_dropped_eviction_line_bit(struct xe_gt *gt, u16 group, u16 instance)
+{
+ struct xe_device *xe = gt_to_xe(gt);
+ u32 write_ptr_reg;
+
+ /*
+ * On PVC, the overflow bit has to be cleared by writing 1 to it.
+ * On Xe2 and later GPUs, the bit has to be cleared by writing 0 to it.
+ */
+ if (GRAPHICS_VER(xe) >= 20)
+ write_ptr_reg = _MASKED_BIT_DISABLE(XEHPC_EUSTALL_REPORT_OVERFLOW_DROP);
+ else
+ write_ptr_reg = _MASKED_BIT_ENABLE(XEHPC_EUSTALL_REPORT_OVERFLOW_DROP);
+
+ xe_gt_mcr_unicast_write(gt, XEHPC_EUSTALL_REPORT, write_ptr_reg, group, instance);
+}
+
+static int xe_eu_stall_data_buf_read(struct xe_eu_stall_data_stream *stream,
+ char __user *buf, size_t count,
+ size_t *total_data_size, struct xe_gt *gt,
+ u16 group, u16 instance, unsigned int xecore)
+{
+ size_t read_data_size, copy_size, buf_size;
+ u32 read_ptr_reg, read_ptr, write_ptr;
+ u8 *xecore_start_vaddr, *read_vaddr;
+ struct per_xecore_buf *xecore_buf;
+ u32 read_offset, write_offset;
+
+ /*
+ * Hardware increments the read and write pointers such that they can
+ * overflow into one additional bit. For example, the offset into a
+ * 256KB buffer needs 18 bits, but the HW uses 19 bits for the read and
+ * write pointers. This technique avoids wasting a slot in the buffer.
+ * Read and write offsets are calculated from the pointers in order to
+ * check whether the write pointer has wrapped around the buffer.
+ */
+ xecore_buf = &stream->xecore_buf[xecore];
+ xecore_start_vaddr = xecore_buf->vaddr;
+ read_ptr = xecore_buf->read;
+ write_ptr = xecore_buf->write;
+ buf_size = stream->per_xecore_buf_size;
+
+ read_data_size = buf_data_size(buf_size, read_ptr, write_ptr);
+ /* Read only the data that the user space buffer can accommodate */
+ read_data_size = min_t(size_t, count - *total_data_size, read_data_size);
+ if (read_data_size == 0)
+ goto exit_drop;
+
+ read_offset = read_ptr & (buf_size - 1);
+ write_offset = write_ptr & (buf_size - 1);
+ read_vaddr = xecore_start_vaddr + read_offset;
+
+ if (write_offset > read_offset) {
+ if (copy_to_user(buf + *total_data_size, read_vaddr, read_data_size))
+ return -EFAULT;
+ } else {
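+ /*
+ * The data wraps past the end of the circular buffer: copy the
+ * tail of the buffer first, then the remainder from the start
+ * of the buffer.
+ */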
+ if (read_data_size >= buf_size - read_offset)
+ copy_size = buf_size - read_offset;
+ else
+ copy_size = read_data_size;
+ if (copy_to_user(buf + *total_data_size, read_vaddr, copy_size))
+ return -EFAULT;
+ if (copy_to_user(buf + *total_data_size + copy_size,
+ xecore_start_vaddr, read_data_size - copy_size))
+ return -EFAULT;
+ }
+
+ *total_data_size += read_data_size;
+ read_ptr += read_data_size;
+
+ /* Read pointer can overflow into one additional bit */
+ read_ptr &= (buf_size << 1) - 1;
+ read_ptr_reg = REG_FIELD_PREP(XEHPC_EUSTALL_REPORT1_READ_PTR_MASK, (read_ptr >> 6));
+ read_ptr_reg = _MASKED_FIELD(XEHPC_EUSTALL_REPORT1_READ_PTR_MASK, read_ptr_reg);
+ xe_gt_mcr_unicast_write(gt, XEHPC_EUSTALL_REPORT1, read_ptr_reg, group, instance);
+ xecore_buf->read = read_ptr;
+ trace_xe_eu_stall_data_read(group, instance, read_ptr, write_ptr,
+ read_data_size, *total_data_size);
+exit_drop:
+ /*
+ * Clear the drop bit (if set) after any data has been read, or if the
+ * buffer was empty. The drop bit can be set even when the buffer is
+ * empty, since the buffer may have been drained by a previous read()
+ * during which the drop bit was set.
+ */
+ if (test_bit(xecore, stream->data_drop.mask)) {
+ clear_dropped_eviction_line_bit(gt, group, instance);
+ clear_bit(xecore, stream->data_drop.mask);
+ }
+ return 0;
+}
+
+/**
+ * xe_eu_stall_stream_read_locked - copy EU stall counter data from the
+ * per-xecore buffers to the userspace buffer
+ * @stream: A stream opened for EU stall count metrics
+ * @file: An xe EU stall data stream file
+ * @buf: destination buffer given by userspace
+ * @count: the number of bytes userspace wants to read
+ *
+ * Returns: Number of bytes copied or a negative error code.
+ * If we've successfully copied any data, reporting that takes
+ * precedence over any internal error status, so the data isn't lost.
+ */
+static ssize_t xe_eu_stall_stream_read_locked(struct xe_eu_stall_data_stream *stream,
+ struct file *file, char __user *buf,
+ size_t count)
+{
+ struct xe_gt *gt = stream->gt;
+ size_t total_size = 0;
+ u16 group, instance;
+ unsigned int xecore;
+ int ret = 0;
+
+ if (bitmap_weight(stream->data_drop.mask, XE_MAX_DSS_FUSE_BITS)) {
+ if (!stream->data_drop.reported_to_user) {
+ stream->data_drop.reported_to_user = true;
+ xe_gt_dbg(gt, "EU stall data dropped in XeCores: %*pb\n",
+ XE_MAX_DSS_FUSE_BITS, stream->data_drop.mask);
+ return -EIO;
+ }
+ stream->data_drop.reported_to_user = false;
+ }
+
+ for_each_dss_steering(xecore, gt, group, instance) {
+ ret = xe_eu_stall_data_buf_read(stream, buf, count, &total_size,
+ gt, group, instance, xecore);
+ if (ret || count == total_size)
+ break;
+ }
+ return total_size ?: (ret ?: -EAGAIN);
+}
+
+/*
+ * Userspace must enable the EU stall stream with DRM_XE_OBSERVATION_IOCTL_ENABLE
+ * before calling read().
+ *
+ * Returns: The number of bytes copied or a negative error code on failure.
+ * -EIO if HW drops any EU stall data when the buffer is full.
+ */
+static ssize_t xe_eu_stall_stream_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct xe_eu_stall_data_stream *stream = file->private_data;
+ struct xe_gt *gt = stream->gt;
+ ssize_t ret, aligned_count;
+
+ aligned_count = ALIGN_DOWN(count, stream->data_record_size);
+ if (aligned_count == 0)
+ return -EINVAL;
+
+ if (!stream->enabled) {
+ xe_gt_dbg(gt, "EU stall data stream not enabled to read\n");
+ return -EINVAL;
+ }
+
+ if (!(file->f_flags & O_NONBLOCK)) {
+ do {
+ ret = wait_event_interruptible(stream->poll_wq, stream->pollin);
+ if (ret)
+ return -EINTR;
+
+ mutex_lock(&gt->eu_stall->stream_lock);
+ ret = xe_eu_stall_stream_read_locked(stream, file, buf, aligned_count);
+ mutex_unlock(&gt->eu_stall->stream_lock);
+ } while (ret == -EAGAIN);
+ } else {
+ mutex_lock(&gt->eu_stall->stream_lock);
+ ret = xe_eu_stall_stream_read_locked(stream, file, buf, aligned_count);
+ mutex_unlock(&gt->eu_stall->stream_lock);
+ }
+
+ /*
+ * Clearing pollin here may not be correct if the user buffer is very
+ * small: data that did not fit into it may still be left in the HW
+ * buffers, and we don't want the next read() to block even though that
+ * data is already available.
+ */
+ stream->pollin = false;
+
+ return ret;
+}
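+
+/*
+ * Hypothetical userspace read loop for the stream fd (buf/len names are
+ * examples; len should be a multiple of the record size):
+ *
+ *	ioctl(fd, DRM_XE_OBSERVATION_IOCTL_ENABLE, 0);
+ *	while (running) {
+ *		struct pollfd pfd = { .fd = fd, .events = POLLIN };
+ *
+ *		poll(&pfd, 1, timeout_ms);
+ *		if (pfd.revents & POLLIN)
+ *			n = read(fd, buf, len);
+ *	}
+ *	ioctl(fd, DRM_XE_OBSERVATION_IOCTL_DISABLE, 0);
+ *
+ * A read() that fails with EIO indicates the HW dropped data; the data
+ * still in the buffers can be retrieved by reading again.
+ */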
+
+static void xe_eu_stall_stream_free(struct xe_eu_stall_data_stream *stream)
+{
+ struct xe_gt *gt = stream->gt;
+
+ gt->eu_stall->stream = NULL;
+ kfree(stream);
+}
+
+static void xe_eu_stall_data_buf_destroy(struct xe_eu_stall_data_stream *stream)
+{
+ xe_bo_unpin_map_no_vm(stream->bo);
+ kfree(stream->xecore_buf);
+}
+
+static int xe_eu_stall_data_buf_alloc(struct xe_eu_stall_data_stream *stream,
+ u16 last_xecore)
+{
+ struct xe_tile *tile = stream->gt->tile;
+ struct xe_bo *bo;
+ u32 size;
+
+ stream->xecore_buf = kcalloc(last_xecore, sizeof(*stream->xecore_buf), GFP_KERNEL);
+ if (!stream->xecore_buf)
+ return -ENOMEM;
+
+ size = stream->per_xecore_buf_size * last_xecore;
+
+ bo = xe_bo_create_pin_map_at_aligned(tile->xe, tile, NULL,
+ size, ~0ull, ttm_bo_type_kernel,
+ XE_BO_FLAG_SYSTEM | XE_BO_FLAG_GGTT, SZ_64);
+ if (IS_ERR(bo)) {
+ kfree(stream->xecore_buf);
+ return PTR_ERR(bo);
+ }
+
+ XE_WARN_ON(!IS_ALIGNED(xe_bo_ggtt_addr(bo), SZ_64));
+ stream->bo = bo;
+
+ return 0;
+}
+
+static int xe_eu_stall_stream_enable(struct xe_eu_stall_data_stream *stream)
+{
+ u32 write_ptr_reg, write_ptr, read_ptr_reg, reg_value;
+ struct per_xecore_buf *xecore_buf;
+ struct xe_gt *gt = stream->gt;
+ u16 group, instance;
+ unsigned int fw_ref;
+ int xecore;
+
+ /* Take runtime pm ref and forcewake to disable RC6 */
+ xe_pm_runtime_get(gt_to_xe(gt));
+ fw_ref = xe_force_wake_get(gt_to_fw(gt), XE_FW_RENDER);
+ if (!xe_force_wake_ref_has_domain(fw_ref, XE_FW_RENDER)) {
+ xe_gt_err(gt, "Failed to get RENDER forcewake\n");
+ xe_pm_runtime_put(gt_to_xe(gt));
+ return -ETIMEDOUT;
+ }
+
+ if (XE_WA(gt, 22016596838))
+ xe_gt_mcr_multicast_write(gt, ROW_CHICKEN2,
+ _MASKED_BIT_ENABLE(DISABLE_DOP_GATING));
+
+ for_each_dss_steering(xecore, gt, group, instance) {
+ write_ptr_reg = xe_gt_mcr_unicast_read(gt, XEHPC_EUSTALL_REPORT, group, instance);
+ /* Clear any drop bits set and not cleared in the previous session. */
+ if (write_ptr_reg & XEHPC_EUSTALL_REPORT_OVERFLOW_DROP)
+ clear_dropped_eviction_line_bit(gt, group, instance);
+ write_ptr = REG_FIELD_GET(XEHPC_EUSTALL_REPORT_WRITE_PTR_MASK, write_ptr_reg);
+ read_ptr_reg = REG_FIELD_PREP(XEHPC_EUSTALL_REPORT1_READ_PTR_MASK, write_ptr);
+ read_ptr_reg = _MASKED_FIELD(XEHPC_EUSTALL_REPORT1_READ_PTR_MASK, read_ptr_reg);
+ /* Initialize the read pointer to the write pointer */
+ xe_gt_mcr_unicast_write(gt, XEHPC_EUSTALL_REPORT1, read_ptr_reg, group, instance);
+ write_ptr <<= 6;
+ write_ptr &= (stream->per_xecore_buf_size << 1) - 1;
+ xecore_buf = &stream->xecore_buf[xecore];
+ xecore_buf->write = write_ptr;
+ xecore_buf->read = write_ptr;
+ }
+ stream->data_drop.reported_to_user = false;
+ bitmap_zero(stream->data_drop.mask, XE_MAX_DSS_FUSE_BITS);
+
+ reg_value = _MASKED_FIELD(EUSTALL_MOCS | EUSTALL_SAMPLE_RATE,
+ REG_FIELD_PREP(EUSTALL_MOCS, gt->mocs.uc_index << 1) |
+ REG_FIELD_PREP(EUSTALL_SAMPLE_RATE,
+ stream->sampling_rate_mult));
+ xe_gt_mcr_multicast_write(gt, XEHPC_EUSTALL_CTRL, reg_value);
+ /* GGTT addresses can never be > 32 bits */
+ xe_gt_mcr_multicast_write(gt, XEHPC_EUSTALL_BASE_UPPER, 0);
+ reg_value = xe_bo_ggtt_addr(stream->bo);
+ reg_value |= REG_FIELD_PREP(XEHPC_EUSTALL_BASE_XECORE_BUF_SZ,
+ stream->per_xecore_buf_size / SZ_256K);
+ reg_value |= XEHPC_EUSTALL_BASE_ENABLE_SAMPLING;
+ xe_gt_mcr_multicast_write(gt, XEHPC_EUSTALL_BASE, reg_value);
+
+ return 0;
+}
+
+static void eu_stall_data_buf_poll_work_fn(struct work_struct *work)
+{
+ struct xe_eu_stall_data_stream *stream =
+ container_of(work, typeof(*stream), buf_poll_work.work);
+ struct xe_gt *gt = stream->gt;
+
+ if (eu_stall_data_buf_poll(stream)) {
+ stream->pollin = true;
+ wake_up(&stream->poll_wq);
+ }
+ queue_delayed_work(gt->eu_stall->buf_ptr_poll_wq,
+ &stream->buf_poll_work,
+ msecs_to_jiffies(POLL_PERIOD_MS));
+}
+
+static int xe_eu_stall_stream_init(struct xe_eu_stall_data_stream *stream,
+ struct eu_stall_open_properties *props)
+{
+ unsigned int max_wait_num_reports, xecore, last_xecore, num_xecores;
+ struct per_xecore_buf *xecore_buf;
+ struct xe_gt *gt = stream->gt;
+ xe_dss_mask_t all_xecores;
+ u16 group, instance;
+ u32 vaddr_offset;
+ int ret;
+
+ bitmap_or(all_xecores, gt->fuse_topo.g_dss_mask, gt->fuse_topo.c_dss_mask,
+ XE_MAX_DSS_FUSE_BITS);
+ num_xecores = bitmap_weight(all_xecores, XE_MAX_DSS_FUSE_BITS);
+ last_xecore = xe_gt_topology_mask_last_dss(all_xecores) + 1;
+
+ max_wait_num_reports = num_data_rows(per_xecore_buf_size * num_xecores);
+ if (props->wait_num_reports == 0 || props->wait_num_reports > max_wait_num_reports) {
+ xe_gt_dbg(gt, "Invalid EU stall event report count %u\n",
+ props->wait_num_reports);
+ xe_gt_dbg(gt, "Minimum event report count is 1, maximum is %u\n",
+ max_wait_num_reports);
+ return -EINVAL;
+ }
+
+ init_waitqueue_head(&stream->poll_wq);
+ INIT_DELAYED_WORK(&stream->buf_poll_work, eu_stall_data_buf_poll_work_fn);
+ stream->per_xecore_buf_size = per_xecore_buf_size;
+ stream->sampling_rate_mult = props->sampling_rate_mult;
+ stream->wait_num_reports = props->wait_num_reports;
+ stream->data_record_size = xe_eu_stall_data_record_size(gt_to_xe(gt));
+
+ ret = xe_eu_stall_data_buf_alloc(stream, last_xecore);
+ if (ret)
+ return ret;
+
+ for_each_dss_steering(xecore, gt, group, instance) {
+ xecore_buf = &stream->xecore_buf[xecore];
+ vaddr_offset = xecore * stream->per_xecore_buf_size;
+ xecore_buf->vaddr = stream->bo->vmap.vaddr + vaddr_offset;
+ }
+ return 0;
+}
+
+static __poll_t xe_eu_stall_stream_poll_locked(struct xe_eu_stall_data_stream *stream,
+ struct file *file, poll_table *wait)
+{
+ __poll_t events = 0;
+
+ poll_wait(file, &stream->poll_wq, wait);
+
+ if (stream->pollin)
+ events |= EPOLLIN;
+
+ return events;
+}
+
+static __poll_t xe_eu_stall_stream_poll(struct file *file, poll_table *wait)
+{
+ struct xe_eu_stall_data_stream *stream = file->private_data;
+ struct xe_gt *gt = stream->gt;
+ __poll_t ret;
+
+ mutex_lock(&gt->eu_stall->stream_lock);
+ ret = xe_eu_stall_stream_poll_locked(stream, file, wait);
+ mutex_unlock(&gt->eu_stall->stream_lock);
+
+ return ret;
+}
+
+static int xe_eu_stall_enable_locked(struct xe_eu_stall_data_stream *stream)
+{
+ struct xe_gt *gt = stream->gt;
+ int ret = 0;
+
+ if (stream->enabled)
+ return ret;
+
+ stream->enabled = true;
+
+ ret = xe_eu_stall_stream_enable(stream);
+
+ queue_delayed_work(gt->eu_stall->buf_ptr_poll_wq,
+ &stream->buf_poll_work,
+ msecs_to_jiffies(POLL_PERIOD_MS));
+ return ret;
+}
+
+static int xe_eu_stall_disable_locked(struct xe_eu_stall_data_stream *stream)
+{
+ struct xe_gt *gt = stream->gt;
+
+ if (!stream->enabled)
+ return 0;
+
+ stream->enabled = false;
+
+ xe_gt_mcr_multicast_write(gt, XEHPC_EUSTALL_BASE, 0);
+
+ cancel_delayed_work_sync(&stream->buf_poll_work);
+
+ if (XE_WA(gt, 22016596838))
+ xe_gt_mcr_multicast_write(gt, ROW_CHICKEN2,
+ _MASKED_BIT_DISABLE(DISABLE_DOP_GATING));
+
+ xe_force_wake_put(gt_to_fw(gt), XE_FW_RENDER);
+ xe_pm_runtime_put(gt_to_xe(gt));
+
+ return 0;
+}
+
+static long xe_eu_stall_stream_ioctl_locked(struct xe_eu_stall_data_stream *stream,
+ unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case DRM_XE_OBSERVATION_IOCTL_ENABLE:
+ return xe_eu_stall_enable_locked(stream);
+ case DRM_XE_OBSERVATION_IOCTL_DISABLE:
+ return xe_eu_stall_disable_locked(stream);
+ }
+
+ return -EINVAL;
+}
+
+static long xe_eu_stall_stream_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct xe_eu_stall_data_stream *stream = file->private_data;
+ struct xe_gt *gt = stream->gt;
+ long ret;
+
+ mutex_lock(&gt->eu_stall->stream_lock);
+ ret = xe_eu_stall_stream_ioctl_locked(stream, cmd, arg);
+ mutex_unlock(&gt->eu_stall->stream_lock);
+
+ return ret;
+}
+
+static int xe_eu_stall_stream_close(struct inode *inode, struct file *file)
+{
+ struct xe_eu_stall_data_stream *stream = file->private_data;
+ struct xe_gt *gt = stream->gt;
+
+ drm_dev_put(&gt->tile->xe->drm);
+
+ mutex_lock(&gt->eu_stall->stream_lock);
+ xe_eu_stall_disable_locked(stream);
+ xe_eu_stall_data_buf_destroy(stream);
+ xe_eu_stall_stream_free(stream);
+ mutex_unlock(&gt->eu_stall->stream_lock);
+
+ return 0;
+}
+
+static const struct file_operations fops_eu_stall = {
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+ .release = xe_eu_stall_stream_close,
+ .poll = xe_eu_stall_stream_poll,
+ .read = xe_eu_stall_stream_read,
+ .unlocked_ioctl = xe_eu_stall_stream_ioctl,
+ .compat_ioctl = xe_eu_stall_stream_ioctl,
+};
+
+static int xe_eu_stall_stream_open_locked(struct drm_device *dev,
+ struct eu_stall_open_properties *props,
+ struct drm_file *file)
+{
+ struct xe_eu_stall_data_stream *stream;
+ struct xe_gt *gt = props->gt;
+ unsigned long f_flags = 0;
+ int ret, stream_fd;
+
+ /* Only one session can be active at any time */
+ if (gt->eu_stall->stream) {
+ xe_gt_dbg(gt, "EU stall sampling session already active\n");
+ return -EBUSY;
+ }
+
+ stream = kzalloc(sizeof(*stream), GFP_KERNEL);
+ if (!stream)
+ return -ENOMEM;
+
+ gt->eu_stall->stream = stream;
+ stream->gt = gt;
+
+ ret = xe_eu_stall_stream_init(stream, props);
+ if (ret) {
+ xe_gt_dbg(gt, "EU stall stream init failed : %d\n", ret);
+ goto err_free;
+ }
+
+ stream_fd = anon_inode_getfd("[xe_eu_stall]", &fops_eu_stall, stream, f_flags);
+ if (stream_fd < 0) {
+ ret = stream_fd;
+ xe_gt_dbg(gt, "EU stall inode get fd failed : %d\n", ret);
+ goto err_destroy;
+ }
+
+ /*
+ * Take a reference on the driver that will be held until the
+ * stream_fd is released.
+ */
+ drm_dev_get(&gt->tile->xe->drm);
+
+ return stream_fd;
+
+err_destroy:
+ xe_eu_stall_data_buf_destroy(stream);
+err_free:
+ xe_eu_stall_stream_free(stream);
+ return ret;
+}
+
+/**
+ * xe_eu_stall_stream_open - Open a xe EU stall data stream fd
+ *
+ * @dev: DRM device pointer
+ * @data: pointer to the first struct drm_xe_ext_set_property in
+ * the chain of input properties from userspace.
+ * @file: DRM file pointer
+ *
+ * This function opens an EU stall data stream with input properties from
+ * userspace.
+ *
+ * Returns: EU stall data stream fd on success or a negative error code.
+ */
+int xe_eu_stall_stream_open(struct drm_device *dev, u64 data, struct drm_file *file)
+{
+ struct xe_device *xe = to_xe_device(dev);
+ struct eu_stall_open_properties props = {};
+ int ret;
+
+ if (!xe_eu_stall_supported_on_platform(xe)) {
+ drm_dbg(&xe->drm, "EU stall monitoring is not supported on this platform\n");
+ return -ENODEV;
+ }
+
+ if (xe_observation_paranoid && !perfmon_capable()) {
+ drm_dbg(&xe->drm, "Insufficient privileges for EU stall monitoring\n");
+ return -EACCES;
+ }
+
+ /* Initialize and set default values */
+ props.wait_num_reports = 1;
+ props.sampling_rate_mult = 4;
+
+ ret = xe_eu_stall_user_extensions(xe, data, 0, &props);
+ if (ret)
+ return ret;
+
+ if (!props.gt) {
+ drm_dbg(&xe->drm, "GT ID not provided for EU stall sampling\n");
+ return -EINVAL;
+ }
+
+ mutex_lock(&props.gt->eu_stall->stream_lock);
+ ret = xe_eu_stall_stream_open_locked(dev, &props, file);
+ mutex_unlock(&props.gt->eu_stall->stream_lock);
+
+ return ret;
+}
diff --git a/drivers/gpu/drm/xe/xe_eu_stall.h b/drivers/gpu/drm/xe/xe_eu_stall.h
new file mode 100644
index 000000000000..ed9d0f233566
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_eu_stall.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#ifndef __XE_EU_STALL_H__
+#define __XE_EU_STALL_H__
+
+#include "xe_gt_types.h"
+
+size_t xe_eu_stall_get_per_xecore_buf_size(void);
+size_t xe_eu_stall_data_record_size(struct xe_device *xe);
+size_t xe_eu_stall_get_sampling_rates(u32 *num_rates, const u64 **rates);
+
+int xe_eu_stall_init(struct xe_gt *gt);
+int xe_eu_stall_stream_open(struct drm_device *dev,
+ u64 data,
+ struct drm_file *file);
+
+static inline bool xe_eu_stall_supported_on_platform(struct xe_device *xe)
+{
+ return xe->info.platform == XE_PVC || GRAPHICS_VER(xe) >= 20;
+}
+#endif
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
index 23a9f519ce1c..606922d9dd73 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -203,6 +203,7 @@ err_post_alloc:
__xe_exec_queue_free(q);
return ERR_PTR(err);
}
+ALLOW_ERROR_INJECTION(xe_exec_queue_create, ERRNO);
struct xe_exec_queue *xe_exec_queue_create_class(struct xe_device *xe, struct xe_gt *gt,
struct xe_vm *vm,
@@ -604,11 +605,12 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
struct xe_tile *tile;
struct xe_exec_queue *q = NULL;
u32 logical_mask;
+ u32 flags = 0;
u32 id;
u32 len;
int err;
- if (XE_IOCTL_DBG(xe, args->flags) ||
+ if (XE_IOCTL_DBG(xe, args->flags & ~DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT) ||
XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
return -EINVAL;
@@ -625,6 +627,9 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
if (XE_IOCTL_DBG(xe, eci[0].gt_id >= xe->info.gt_count))
return -EINVAL;
+ if (args->flags & DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT)
+ flags |= EXEC_QUEUE_FLAG_LOW_LATENCY;
+
if (eci[0].engine_class == DRM_XE_ENGINE_CLASS_VM_BIND) {
if (XE_IOCTL_DBG(xe, args->width != 1) ||
XE_IOCTL_DBG(xe, args->num_placements != 1) ||
@@ -633,8 +638,8 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
for_each_tile(tile, xe, id) {
struct xe_exec_queue *new;
- u32 flags = EXEC_QUEUE_FLAG_VM;
+ flags |= EXEC_QUEUE_FLAG_VM;
if (id)
flags |= EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD;
@@ -680,7 +685,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
}
q = xe_exec_queue_create(xe, vm, logical_mask,
- args->width, hwe, 0,
+ args->width, hwe, flags,
args->extensions);
up_read(&vm->lock);
xe_vm_put(vm);
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
index 6eb7ff091534..cc1cffb5c87f 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
@@ -85,6 +85,8 @@ struct xe_exec_queue {
#define EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD BIT(3)
/* kernel exec_queue only, set priority to highest level */
#define EXEC_QUEUE_FLAG_HIGH_PRIORITY BIT(4)
+/* flag to indicate low latency hint to guc */
+#define EXEC_QUEUE_FLAG_LOW_LATENCY BIT(5)
/**
* @flags: flags for this exec queue, should statically setup aside from ban
diff --git a/drivers/gpu/drm/xe/xe_gen_wa_oob.c b/drivers/gpu/drm/xe/xe_gen_wa_oob.c
index 904cf47925aa..ed9183599e31 100644
--- a/drivers/gpu/drm/xe/xe_gen_wa_oob.c
+++ b/drivers/gpu/drm/xe/xe_gen_wa_oob.c
@@ -28,10 +28,10 @@
"\n" \
"#endif\n"
-static void print_usage(FILE *f)
+static void print_usage(FILE *f, const char *progname)
{
fprintf(f, "usage: %s <input-rule-file> <generated-c-source-file> <generated-c-header-file>\n",
- program_invocation_short_name);
+ progname);
}
static void print_parse_error(const char *err_msg, const char *line,
@@ -144,7 +144,7 @@ int main(int argc, const char *argv[])
if (argc < 3) {
fprintf(stderr, "ERROR: wrong arguments\n");
- print_usage(stderr);
+ print_usage(stderr, argv[0]);
return 1;
}
diff --git a/drivers/gpu/drm/xe/xe_gsc_proxy.c b/drivers/gpu/drm/xe/xe_gsc_proxy.c
index 31c90577faf0..8cf70b228ff3 100644
--- a/drivers/gpu/drm/xe/xe_gsc_proxy.c
+++ b/drivers/gpu/drm/xe/xe_gsc_proxy.c
@@ -490,7 +490,7 @@ int xe_gsc_proxy_init(struct xe_gsc *gsc)
gsc->proxy.component_added = true;
- return xe_device_add_action_or_reset(xe, xe_gsc_proxy_remove, gsc);
+ return devm_add_action_or_reset(xe->drm.dev, xe_gsc_proxy_remove, gsc);
}
/**
diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
index 650a0ee56e97..10a9e3c72b36 100644
--- a/drivers/gpu/drm/xe/xe_gt.c
+++ b/drivers/gpu/drm/xe/xe_gt.c
@@ -19,6 +19,7 @@
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_device.h"
+#include "xe_eu_stall.h"
#include "xe_exec_queue.h"
#include "xe_execlist.h"
#include "xe_force_wake.h"
@@ -361,9 +362,11 @@ int xe_gt_init_early(struct xe_gt *gt)
if (err)
return err;
- xe_wa_process_gt(gt);
+ err = xe_tuning_init(gt);
+ if (err)
+ return err;
+
xe_wa_process_oob(gt);
- xe_tuning_process_gt(gt);
xe_force_wake_init_gt(gt, gt_to_fw(gt));
spin_lock_init(&gt->global_invl_lock);
@@ -450,6 +453,8 @@ static int all_fw_domain_init(struct xe_gt *gt)
}
xe_gt_mcr_set_implicit_defaults(gt);
+ xe_wa_process_gt(gt);
+ xe_tuning_process_gt(gt);
xe_reg_sr_apply_mmio(&gt->reg_sr, gt);
err = xe_gt_clock_init(gt);
@@ -613,6 +618,10 @@ int xe_gt_init(struct xe_gt *gt)
xe_gt_record_user_engines(gt);
+ err = xe_eu_stall_init(gt);
+ if (err)
+ return err;
+
return 0;
}
diff --git a/drivers/gpu/drm/xe/xe_gt_clock.c b/drivers/gpu/drm/xe/xe_gt_clock.c
index cc2ae159298e..2a958c92d8ea 100644
--- a/drivers/gpu/drm/xe/xe_gt_clock.c
+++ b/drivers/gpu/drm/xe/xe_gt_clock.c
@@ -12,25 +12,10 @@
#include "xe_assert.h"
#include "xe_device.h"
#include "xe_gt.h"
+#include "xe_gt_printk.h"
#include "xe_macros.h"
#include "xe_mmio.h"
-static u32 read_reference_ts_freq(struct xe_gt *gt)
-{
- u32 ts_override = xe_mmio_read32(&gt->mmio, TIMESTAMP_OVERRIDE);
- u32 base_freq, frac_freq;
-
- base_freq = REG_FIELD_GET(TIMESTAMP_OVERRIDE_US_COUNTER_DIVIDER_MASK,
- ts_override) + 1;
- base_freq *= 1000000;
-
- frac_freq = REG_FIELD_GET(TIMESTAMP_OVERRIDE_US_COUNTER_DENOMINATOR_MASK,
- ts_override);
- frac_freq = 1000000 / (frac_freq + 1);
-
- return base_freq + frac_freq;
-}
-
static u32 get_crystal_clock_freq(u32 rpm_config_reg)
{
const u32 f19_2_mhz = 19200000;
@@ -57,26 +42,30 @@ static u32 get_crystal_clock_freq(u32 rpm_config_reg)
int xe_gt_clock_init(struct xe_gt *gt)
{
- u32 ctc_reg = xe_mmio_read32(&gt->mmio, CTC_MODE);
+ u32 c0 = xe_mmio_read32(&gt->mmio, RPM_CONFIG0);
u32 freq = 0;
- /* Assuming gen11+ so assert this assumption is correct */
- xe_gt_assert(gt, GRAPHICS_VER(gt_to_xe(gt)) >= 11);
-
- if (ctc_reg & CTC_SOURCE_DIVIDE_LOGIC) {
- freq = read_reference_ts_freq(gt);
- } else {
- u32 c0 = xe_mmio_read32(&gt->mmio, RPM_CONFIG0);
-
- freq = get_crystal_clock_freq(c0);
-
- /*
- * Now figure out how the command stream's timestamp
- * register increments from this frequency (it might
- * increment only every few clock cycle).
- */
- freq >>= 3 - REG_FIELD_GET(RPM_CONFIG0_CTC_SHIFT_PARAMETER_MASK, c0);
- }
+ /*
+ * CTC_MODE[0] = 1 is definitely not supported for Xe2 and later
+ * platforms. In theory it could be a valid setting for pre-Xe2
+ * platforms, but there's no documentation on how to properly handle
+ * this case. Reading TIMESTAMP_OVERRIDE, as the driver attempted in
+ * the past has been confirmed as incorrect by the hardware architects.
+ *
+ * For now just warn if we ever encounter hardware in the wild that
+ * has this setting and move on as if it hadn't been set.
+ */
+ if (xe_mmio_read32(&gt->mmio, CTC_MODE) & CTC_SOURCE_DIVIDE_LOGIC)
+ xe_gt_warn(gt, "CTC_MODE[0] is set; this is unexpected and undocumented\n");
+
+ freq = get_crystal_clock_freq(c0);
+
+ /*
+ * Now figure out how the command stream's timestamp
+ * register increments from this frequency (it might
+ * increment only every few clock cycle).
+ */
+ freq >>= 3 - REG_FIELD_GET(RPM_CONFIG0_CTC_SHIFT_PARAMETER_MASK, c0);
gt->info.reference_clock = freq;
return 0;
diff --git a/drivers/gpu/drm/xe/xe_gt_debugfs.c b/drivers/gpu/drm/xe/xe_gt_debugfs.c
index e7792858b1e4..2d63a69cbfa3 100644
--- a/drivers/gpu/drm/xe/xe_gt_debugfs.c
+++ b/drivers/gpu/drm/xe/xe_gt_debugfs.c
@@ -30,6 +30,7 @@
#include "xe_reg_sr.h"
#include "xe_reg_whitelist.h"
#include "xe_sriov.h"
+#include "xe_tuning.h"
#include "xe_uc_debugfs.h"
#include "xe_wa.h"
@@ -217,6 +218,15 @@ static int workarounds(struct xe_gt *gt, struct drm_printer *p)
return 0;
}
+static int tunings(struct xe_gt *gt, struct drm_printer *p)
+{
+ xe_pm_runtime_get(gt_to_xe(gt));
+ xe_tuning_dump(gt, p);
+ xe_pm_runtime_put(gt_to_xe(gt));
+
+ return 0;
+}
+
static int pat(struct xe_gt *gt, struct drm_printer *p)
{
xe_pm_runtime_get(gt_to_xe(gt));
@@ -300,6 +310,7 @@ static const struct drm_info_list debugfs_list[] = {
{"powergate_info", .show = xe_gt_debugfs_simple_show, .data = powergate_info},
{"register-save-restore", .show = xe_gt_debugfs_simple_show, .data = register_save_restore},
{"workarounds", .show = xe_gt_debugfs_simple_show, .data = workarounds},
+ {"tunings", .show = xe_gt_debugfs_simple_show, .data = tunings},
{"pat", .show = xe_gt_debugfs_simple_show, .data = pat},
{"mocs", .show = xe_gt_debugfs_simple_show, .data = mocs},
{"default_lrc_rcs", .show = xe_gt_debugfs_simple_show, .data = rcs_default_lrc},
diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
index 46701ca11ce0..c5ad9a0a89c2 100644
--- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
+++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
@@ -19,6 +19,7 @@
#include "xe_guc.h"
#include "xe_guc_ct.h"
#include "xe_migrate.h"
+#include "xe_svm.h"
#include "xe_trace_bo.h"
#include "xe_vm.h"
@@ -125,8 +126,8 @@ static int xe_pf_begin(struct drm_exec *exec, struct xe_vma *vma,
return 0;
}
-static int handle_vma_pagefault(struct xe_gt *gt, struct pagefault *pf,
- struct xe_vma *vma)
+static int handle_vma_pagefault(struct xe_gt *gt, struct xe_vma *vma,
+ bool atomic)
{
struct xe_vm *vm = xe_vma_vm(vma);
struct xe_tile *tile = gt_to_tile(gt);
@@ -134,13 +135,13 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct pagefault *pf,
struct dma_fence *fence;
ktime_t end = 0;
int err;
- bool atomic;
+
+ lockdep_assert_held_write(&vm->lock);
xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_COUNT, 1);
- xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_BYTES, xe_vma_size(vma));
+ xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_KB, xe_vma_size(vma) / 1024);
trace_xe_vma_pagefault(vma);
- atomic = access_is_atomic(pf->access_type);
/* Check if VMA is valid */
if (vma_is_valid(tile, vma) && !atomic)
@@ -210,6 +211,7 @@ static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf)
struct xe_vm *vm;
struct xe_vma *vma = NULL;
int err;
+ bool atomic;
/* SW isn't expected to handle TRTT faults */
if (pf->trva_fault)
@@ -235,7 +237,13 @@ static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf)
goto unlock_vm;
}
- err = handle_vma_pagefault(gt, pf, vma);
+ atomic = access_is_atomic(pf->access_type);
+
+ if (xe_vma_is_cpu_addr_mirror(vma))
+ err = xe_svm_handle_pagefault(vm, vma, gt_to_tile(gt),
+ pf->page_addr, atomic);
+ else
+ err = handle_vma_pagefault(gt, vma, atomic);
unlock_vm:
if (!err)
diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_service.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_service.c
index 6b5f849a0722..4efde5f46b43 100644
--- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_service.c
+++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_service.c
@@ -114,7 +114,6 @@ static const struct xe_reg tgl_runtime_regs[] = {
GT_VEBOX_VDBOX_DISABLE, /* _MMIO(0x9140) */
CTC_MODE, /* _MMIO(0xa26c) */
HUC_KERNEL_LOAD_INFO, /* _MMIO(0xc1dc) */
- TIMESTAMP_OVERRIDE, /* _MMIO(0x44074) */
};
static const struct xe_reg ats_m_runtime_regs[] = {
@@ -127,7 +126,6 @@ static const struct xe_reg ats_m_runtime_regs[] = {
XEHP_GT_COMPUTE_DSS_ENABLE, /* _MMIO(0x9144) */
CTC_MODE, /* _MMIO(0xa26c) */
HUC_KERNEL_LOAD_INFO, /* _MMIO(0xc1dc) */
- TIMESTAMP_OVERRIDE, /* _MMIO(0x44074) */
};
static const struct xe_reg pvc_runtime_regs[] = {
@@ -140,7 +138,6 @@ static const struct xe_reg pvc_runtime_regs[] = {
XEHPC_GT_COMPUTE_DSS_ENABLE_EXT,/* _MMIO(0x9148) */
CTC_MODE, /* _MMIO(0xA26C) */
HUC_KERNEL_LOAD_INFO, /* _MMIO(0xc1dc) */
- TIMESTAMP_OVERRIDE, /* _MMIO(0x44074) */
};
static const struct xe_reg ver_1270_runtime_regs[] = {
@@ -155,7 +152,6 @@ static const struct xe_reg ver_1270_runtime_regs[] = {
XEHPC_GT_COMPUTE_DSS_ENABLE_EXT,/* _MMIO(0x9148) */
CTC_MODE, /* _MMIO(0xa26c) */
HUC_KERNEL_LOAD_INFO, /* _MMIO(0xc1dc) */
- TIMESTAMP_OVERRIDE, /* _MMIO(0x44074) */
};
static const struct xe_reg ver_2000_runtime_regs[] = {
@@ -173,7 +169,6 @@ static const struct xe_reg ver_2000_runtime_regs[] = {
XE2_GT_GEOMETRY_DSS_2, /* _MMIO(0x9154) */
CTC_MODE, /* _MMIO(0xa26c) */
HUC_KERNEL_LOAD_INFO, /* _MMIO(0xc1dc) */
- TIMESTAMP_OVERRIDE, /* _MMIO(0x44074) */
};
static const struct xe_reg ver_3000_runtime_regs[] = {
diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
index 4831549da319..a439261bf4d7 100644
--- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
+++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
@@ -47,12 +47,19 @@ static int guc_action_vf_reset(struct xe_guc *guc)
return ret > 0 ? -EPROTO : ret;
}
+#define GUC_RESET_VF_STATE_RETRY_MAX 10
static int vf_reset_guc_state(struct xe_gt *gt)
{
+ unsigned int retry = GUC_RESET_VF_STATE_RETRY_MAX;
struct xe_guc *guc = &gt->uc.guc;
int err;
- err = guc_action_vf_reset(guc);
+ do {
+ err = guc_action_vf_reset(guc);
+ if (!err || err != -ETIMEDOUT)
+ break;
+ } while (--retry);
+
if (unlikely(err))
xe_gt_sriov_err(gt, "Failed to reset GuC state (%pe)\n", ERR_PTR(err));
return err;
diff --git a/drivers/gpu/drm/xe/xe_gt_stats.c b/drivers/gpu/drm/xe/xe_gt_stats.c
index 2e9879ea4674..6155ea354432 100644
--- a/drivers/gpu/drm/xe/xe_gt_stats.c
+++ b/drivers/gpu/drm/xe/xe_gt_stats.c
@@ -23,13 +23,13 @@ void xe_gt_stats_incr(struct xe_gt *gt, const enum xe_gt_stats_id id, int incr)
if (id >= __XE_GT_STATS_NUM_IDS)
return;
- atomic_add(incr, &gt->stats.counters[id]);
+ atomic64_add(incr, &gt->stats.counters[id]);
}
static const char *const stat_description[__XE_GT_STATS_NUM_IDS] = {
"tlb_inval_count",
"vma_pagefault_count",
- "vma_pagefault_bytes",
+ "vma_pagefault_kb",
};
/**
@@ -44,8 +44,8 @@ int xe_gt_stats_print_info(struct xe_gt *gt, struct drm_printer *p)
enum xe_gt_stats_id id;
for (id = 0; id < __XE_GT_STATS_NUM_IDS; ++id)
- drm_printf(p, "%s: %d\n", stat_description[id],
- atomic_read(&gt->stats.counters[id]));
+ drm_printf(p, "%s: %lld\n", stat_description[id],
+ atomic64_read(&gt->stats.counters[id]));
return 0;
}
diff --git a/drivers/gpu/drm/xe/xe_gt_stats_types.h b/drivers/gpu/drm/xe/xe_gt_stats_types.h
index b072bd80c4b9..d556771f99d6 100644
--- a/drivers/gpu/drm/xe/xe_gt_stats_types.h
+++ b/drivers/gpu/drm/xe/xe_gt_stats_types.h
@@ -9,7 +9,7 @@
enum xe_gt_stats_id {
XE_GT_STATS_ID_TLB_INVAL,
XE_GT_STATS_ID_VMA_PAGEFAULT_COUNT,
- XE_GT_STATS_ID_VMA_PAGEFAULT_BYTES,
+ XE_GT_STATS_ID_VMA_PAGEFAULT_KB,
/* must be the last entry */
__XE_GT_STATS_NUM_IDS,
};
diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
index 0a93831c0a02..03072e094991 100644
--- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
+++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
@@ -411,6 +411,28 @@ int xe_gt_tlb_invalidation_range(struct xe_gt *gt,
}
/**
+ * xe_gt_tlb_invalidation_vm - Issue a TLB invalidation on this GT for a VM
+ * @gt: graphics tile
+ * @vm: VM to invalidate
+ *
+ * Invalidate entire VM's address space
+ */
+void xe_gt_tlb_invalidation_vm(struct xe_gt *gt, struct xe_vm *vm)
+{
+ struct xe_gt_tlb_invalidation_fence fence;
+ u64 range = 1ull << vm->xe->info.va_bits;
+ int ret;
+
+ xe_gt_tlb_invalidation_fence_init(gt, &fence, true);
+
+ ret = xe_gt_tlb_invalidation_range(gt, &fence, 0, range, vm->usm.asid);
+ if (ret < 0)
+ return;
+
+ xe_gt_tlb_invalidation_fence_wait(&fence);
+}
+
+/**
* xe_gt_tlb_invalidation_vma - Issue a TLB invalidation on this GT for a VMA
* @gt: GT structure
* @fence: invalidation fence which will be signal on TLB invalidation
diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
index 672acfcdf0d7..abe9b03d543e 100644
--- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
+++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
@@ -12,6 +12,7 @@
struct xe_gt;
struct xe_guc;
+struct xe_vm;
struct xe_vma;
int xe_gt_tlb_invalidation_init_early(struct xe_gt *gt);
@@ -21,6 +22,7 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt);
int xe_gt_tlb_invalidation_vma(struct xe_gt *gt,
struct xe_gt_tlb_invalidation_fence *fence,
struct xe_vma *vma);
+void xe_gt_tlb_invalidation_vm(struct xe_gt *gt, struct xe_vm *vm);
int xe_gt_tlb_invalidation_range(struct xe_gt *gt,
struct xe_gt_tlb_invalidation_fence *fence,
u64 start, u64 end, u32 asid);
diff --git a/drivers/gpu/drm/xe/xe_gt_topology.h b/drivers/gpu/drm/xe/xe_gt_topology.h
index 746b325bbf6e..a72d26ba0653 100644
--- a/drivers/gpu/drm/xe/xe_gt_topology.h
+++ b/drivers/gpu/drm/xe/xe_gt_topology.h
@@ -25,6 +25,19 @@ void xe_gt_topology_init(struct xe_gt *gt);
void xe_gt_topology_dump(struct xe_gt *gt, struct drm_printer *p);
+/**
+ * xe_gt_topology_mask_last_dss() - Returns the index of the last DSS in a mask.
+ * @mask: Input DSS mask
+ *
+ * Return: Index of the last DSS in the input DSS mask,
+ * XE_MAX_DSS_FUSE_BITS if DSS mask is empty.
+ */
+static inline unsigned int
+xe_gt_topology_mask_last_dss(const xe_dss_mask_t mask)
+{
+ return find_last_bit(mask, XE_MAX_DSS_FUSE_BITS);
+}
+
unsigned int
xe_dss_mask_group_ffs(const xe_dss_mask_t mask, int groupsize, int groupnum);
diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
index 6e66bf0e8b3f..e3cfb026ac88 100644
--- a/drivers/gpu/drm/xe/xe_gt_types.h
+++ b/drivers/gpu/drm/xe/xe_gt_types.h
@@ -139,7 +139,7 @@ struct xe_gt {
/** @stats: GT stats */
struct {
/** @stats.counters: counters for various GT stats */
- atomic_t counters[__XE_GT_STATS_NUM_IDS];
+ atomic64_t counters[__XE_GT_STATS_NUM_IDS];
} stats;
#endif
@@ -413,6 +413,16 @@ struct xe_gt {
bool oob_initialized;
} wa_active;
+ /** @tuning_active: keep track of active tunings */
+ struct {
+ /** @tuning_active.gt: bitmap with active GT tunings */
+ unsigned long *gt;
+ /** @tuning_active.engine: bitmap with active engine tunings */
+ unsigned long *engine;
+ /** @tuning_active.lrc: bitmap with active LRC tunings */
+ unsigned long *lrc;
+ } tuning_active;
+
/** @user_engines: engines present in GT and available to userspace */
struct {
/**
@@ -430,6 +440,9 @@ struct xe_gt {
/** @oa: oa observation subsystem per gt info */
struct xe_oa_gt oa;
+
+ /** @eu_stall: EU stall counters subsystem per gt info */
+ struct xe_eu_stall_gt *eu_stall;
};
#endif
diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
index 1619c0a52db9..bc1ff0a4e1e7 100644
--- a/drivers/gpu/drm/xe/xe_guc.c
+++ b/drivers/gpu/drm/xe/xe_guc.c
@@ -27,6 +27,7 @@
#include "xe_guc_capture.h"
#include "xe_guc_ct.h"
#include "xe_guc_db_mgr.h"
+#include "xe_guc_engine_activity.h"
#include "xe_guc_hwconfig.h"
#include "xe_guc_log.h"
#include "xe_guc_pc.h"
@@ -744,6 +745,10 @@ int xe_guc_init_post_hwconfig(struct xe_guc *guc)
if (ret)
return ret;
+ ret = xe_guc_engine_activity_init(guc);
+ if (ret)
+ return ret;
+
ret = xe_guc_buf_cache_init(&guc->buf);
if (ret)
return ret;
diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
index fab259adc380..e7c9e095a19f 100644
--- a/drivers/gpu/drm/xe/xe_guc_ads.c
+++ b/drivers/gpu/drm/xe/xe_guc_ads.c
@@ -342,7 +342,7 @@ static void guc_waklv_init(struct xe_guc_ads *ads)
offset = guc_ads_waklv_offset(ads);
remain = guc_ads_waklv_size(ads);
- if (XE_WA(gt, 14019882105))
+ if (XE_WA(gt, 14019882105) || XE_WA(gt, 16021333562))
guc_waklv_enable_simple(ads,
GUC_WORKAROUND_KLV_BLOCK_INTERRUPTS_WHEN_MGSR_BLOCKED,
&offset, &remain);
diff --git a/drivers/gpu/drm/xe/xe_guc_engine_activity.c b/drivers/gpu/drm/xe/xe_guc_engine_activity.c
new file mode 100644
index 000000000000..2a457dcf31d5
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_guc_engine_activity.c
@@ -0,0 +1,373 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#include <drm/drm_managed.h>
+
+#include "abi/guc_actions_abi.h"
+#include "regs/xe_gt_regs.h"
+
+#include "xe_bo.h"
+#include "xe_force_wake.h"
+#include "xe_gt_printk.h"
+#include "xe_guc.h"
+#include "xe_guc_engine_activity.h"
+#include "xe_guc_ct.h"
+#include "xe_hw_engine.h"
+#include "xe_map.h"
+#include "xe_mmio.h"
+#include "xe_trace_guc.h"
+
+#define TOTAL_QUANTA 0x8000
+
+static struct iosys_map engine_activity_map(struct xe_guc *guc, struct xe_hw_engine *hwe)
+{
+ struct xe_guc_engine_activity *engine_activity = &guc->engine_activity;
+ struct engine_activity_buffer *buffer = &engine_activity->device_buffer;
+ u16 guc_class = xe_engine_class_to_guc_class(hwe->class);
+ size_t offset;
+
+ offset = offsetof(struct guc_engine_activity_data,
+ engine_activity[guc_class][hwe->logical_instance]);
+
+ return IOSYS_MAP_INIT_OFFSET(&buffer->activity_bo->vmap, offset);
+}
+
+static struct iosys_map engine_metadata_map(struct xe_guc *guc)
+{
+ struct xe_guc_engine_activity *engine_activity = &guc->engine_activity;
+ struct engine_activity_buffer *buffer = &engine_activity->device_buffer;
+
+ return buffer->metadata_bo->vmap;
+}
+
+static int allocate_engine_activity_group(struct xe_guc *guc)
+{
+ struct xe_guc_engine_activity *engine_activity = &guc->engine_activity;
+ struct xe_device *xe = guc_to_xe(guc);
+ u32 num_activity_group = 1; /* Will be modified for VF */
+
+ engine_activity->eag = drmm_kcalloc(&xe->drm, num_activity_group,
+ sizeof(struct engine_activity_group), GFP_KERNEL);
+
+ if (!engine_activity->eag)
+ return -ENOMEM;
+
+ engine_activity->num_activity_group = num_activity_group;
+
+ return 0;
+}
+
+static int allocate_engine_activity_buffers(struct xe_guc *guc,
+ struct engine_activity_buffer *buffer)
+{
+ u32 metadata_size = sizeof(struct guc_engine_activity_metadata);
+ u32 size = sizeof(struct guc_engine_activity_data);
+ struct xe_gt *gt = guc_to_gt(guc);
+ struct xe_tile *tile = gt_to_tile(gt);
+ struct xe_bo *bo, *metadata_bo;
+
+ metadata_bo = xe_bo_create_pin_map(gt_to_xe(gt), tile, NULL, PAGE_ALIGN(metadata_size),
+ ttm_bo_type_kernel, XE_BO_FLAG_SYSTEM |
+ XE_BO_FLAG_GGTT | XE_BO_FLAG_GGTT_INVALIDATE);
+
+ if (IS_ERR(metadata_bo))
+ return PTR_ERR(metadata_bo);
+
+ bo = xe_bo_create_pin_map(gt_to_xe(gt), tile, NULL, PAGE_ALIGN(size),
+ ttm_bo_type_kernel, XE_BO_FLAG_VRAM_IF_DGFX(tile) |
+ XE_BO_FLAG_GGTT | XE_BO_FLAG_GGTT_INVALIDATE);
+
+ if (IS_ERR(bo)) {
+ xe_bo_unpin_map_no_vm(metadata_bo);
+ return PTR_ERR(bo);
+ }
+
+ buffer->metadata_bo = metadata_bo;
+ buffer->activity_bo = bo;
+ return 0;
+}
+
+static void free_engine_activity_buffers(struct engine_activity_buffer *buffer)
+{
+ xe_bo_unpin_map_no_vm(buffer->metadata_bo);
+ xe_bo_unpin_map_no_vm(buffer->activity_bo);
+}
+
+static bool is_engine_activity_supported(struct xe_guc *guc)
+{
+ struct xe_uc_fw_version *version = &guc->fw.versions.found[XE_UC_FW_VER_COMPATIBILITY];
+ struct xe_uc_fw_version required = { 1, 14, 1 };
+ struct xe_gt *gt = guc_to_gt(guc);
+
+ if (IS_SRIOV_VF(gt_to_xe(gt))) {
+ xe_gt_info(gt, "engine activity stats not supported on VFs\n");
+ return false;
+ }
+
+ /* Engine activity stats are supported from GuC interface version 1.14.1 onwards */
+ if (GUC_SUBMIT_VER(guc) < MAKE_GUC_VER_STRUCT(required)) {
+ xe_gt_info(gt,
+ "engine activity stats unsupported in GuC interface v%u.%u.%u, need v%u.%u.%u or higher\n",
+ version->major, version->minor, version->patch, required.major,
+ required.minor, required.patch);
+ return false;
+ }
+
+ return true;
+}
+
+static struct engine_activity *hw_engine_to_engine_activity(struct xe_hw_engine *hwe)
+{
+ struct xe_guc *guc = &hwe->gt->uc.guc;
+ struct engine_activity_group *eag = &guc->engine_activity.eag[0];
+ u16 guc_class = xe_engine_class_to_guc_class(hwe->class);
+
+ return &eag->engine[guc_class][hwe->logical_instance];
+}
+
+static u64 cpu_ns_to_guc_tsc_tick(ktime_t ns, u32 freq)
+{
+ return mul_u64_u32_div(ns, freq, NSEC_PER_SEC);
+}
+
+#define read_engine_activity_record(xe_, map_, field_) \
+ xe_map_rd_field(xe_, map_, 0, struct guc_engine_activity, field_)
+
+#define read_metadata_record(xe_, map_, field_) \
+ xe_map_rd_field(xe_, map_, 0, struct guc_engine_activity_metadata, field_)
+
+static u64 get_engine_active_ticks(struct xe_guc *guc, struct xe_hw_engine *hwe)
+{
+ struct engine_activity *ea = hw_engine_to_engine_activity(hwe);
+ struct guc_engine_activity *cached_activity = &ea->activity;
+ struct guc_engine_activity_metadata *cached_metadata = &ea->metadata;
+ struct xe_guc_engine_activity *engine_activity = &guc->engine_activity;
+ struct iosys_map activity_map, metadata_map;
+ struct xe_device *xe = guc_to_xe(guc);
+ struct xe_gt *gt = guc_to_gt(guc);
+ u32 last_update_tick, global_change_num;
+ u64 active_ticks, gpm_ts;
+ u16 change_num;
+
+ activity_map = engine_activity_map(guc, hwe);
+ metadata_map = engine_metadata_map(guc);
+ global_change_num = read_metadata_record(xe, &metadata_map, global_change_num);
+
+ /* GuC has not initialized activity data yet, return 0 */
+ if (!global_change_num)
+ goto update;
+
+ if (global_change_num == cached_metadata->global_change_num)
+ goto update;
+
+ cached_metadata->global_change_num = global_change_num;
+ change_num = read_engine_activity_record(xe, &activity_map, change_num);
+
+ if (!change_num || change_num == cached_activity->change_num)
+ goto update;
+
+ /* read engine activity values */
+ last_update_tick = read_engine_activity_record(xe, &activity_map, last_update_tick);
+ active_ticks = read_engine_activity_record(xe, &activity_map, active_ticks);
+
+ /* activity calculations */
+ ea->running = !!last_update_tick;
+ ea->total += active_ticks - cached_activity->active_ticks;
+ ea->active = 0;
+
+ /* cache the counter */
+ cached_activity->change_num = change_num;
+ cached_activity->last_update_tick = last_update_tick;
+ cached_activity->active_ticks = active_ticks;
+
+update:
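+ /*
+ * If the engine was reported as running at the last GuC update,
+ * estimate the in-progress busy time from the current GPM timestamp
+ * relative to the tick of that last update.
+ */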
+ if (ea->running) {
+ gpm_ts = xe_mmio_read64_2x32(&gt->mmio, MISC_STATUS_0) >>
+ engine_activity->gpm_timestamp_shift;
+ ea->active = lower_32_bits(gpm_ts) - cached_activity->last_update_tick;
+ }
+
+ trace_xe_guc_engine_activity(xe, ea, hwe->name, hwe->instance);
+
+ return ea->total + ea->active;
+}
+
+static u64 get_engine_total_ticks(struct xe_guc *guc, struct xe_hw_engine *hwe)
+{
+ struct engine_activity *ea = hw_engine_to_engine_activity(hwe);
+ struct guc_engine_activity_metadata *cached_metadata = &ea->metadata;
+ struct guc_engine_activity *cached_activity = &ea->activity;
+ struct iosys_map activity_map, metadata_map;
+ struct xe_device *xe = guc_to_xe(guc);
+ ktime_t now, cpu_delta;
+ u64 numerator;
+ u16 quanta_ratio;
+
+ activity_map = engine_activity_map(guc, hwe);
+ metadata_map = engine_metadata_map(guc);
+
+ if (!cached_metadata->guc_tsc_frequency_hz)
+ cached_metadata->guc_tsc_frequency_hz = read_metadata_record(xe, &metadata_map,
+ guc_tsc_frequency_hz);
+
+ quanta_ratio = read_engine_activity_record(xe, &activity_map, quanta_ratio);
+ cached_activity->quanta_ratio = quanta_ratio;
+
+ /*
+ * Total ticks calculation: scale the CPU time elapsed since the last
+ * sample by quanta_ratio (expressed as a fraction of TOTAL_QUANTA),
+ * accumulate it in nanoseconds (carrying the remainder) and convert
+ * the running total to GuC TSC ticks.
+ */
+ now = ktime_get();
+ cpu_delta = now - ea->last_cpu_ts;
+ ea->last_cpu_ts = now;
+ numerator = (ea->quanta_remainder_ns + cpu_delta) * cached_activity->quanta_ratio;
+ ea->quanta_ns += numerator / TOTAL_QUANTA;
+ ea->quanta_remainder_ns = numerator % TOTAL_QUANTA;
+ ea->quanta = cpu_ns_to_guc_tsc_tick(ea->quanta_ns, cached_metadata->guc_tsc_frequency_hz);
+
+ trace_xe_guc_engine_activity(xe, ea, hwe->name, hwe->instance);
+
+ return ea->quanta;
+}
+
+static int enable_engine_activity_stats(struct xe_guc *guc)
+{
+ struct xe_guc_engine_activity *engine_activity = &guc->engine_activity;
+ struct engine_activity_buffer *buffer = &engine_activity->device_buffer;
+ u32 action[] = {
+ XE_GUC_ACTION_SET_DEVICE_ENGINE_ACTIVITY_BUFFER,
+ xe_bo_ggtt_addr(buffer->metadata_bo),
+ 0,
+ xe_bo_ggtt_addr(buffer->activity_bo),
+ 0,
+ };
+
+ /* Blocking here to ensure the buffers are ready before reading them */
+ return xe_guc_ct_send_block(&guc->ct, action, ARRAY_SIZE(action));
+}
+
+static void engine_activity_set_cpu_ts(struct xe_guc *guc)
+{
+ struct xe_guc_engine_activity *engine_activity = &guc->engine_activity;
+ struct engine_activity_group *eag = &engine_activity->eag[0];
+ int i, j;
+
+ for (i = 0; i < GUC_MAX_ENGINE_CLASSES; i++)
+ for (j = 0; j < GUC_MAX_INSTANCES_PER_CLASS; j++)
+ eag->engine[i][j].last_cpu_ts = ktime_get();
+}
+
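+/*
+ * Note: this is the same RPM_CONFIG0 CTC shift parameter used when deriving
+ * the reference clock in xe_gt_clock_init(); it is applied to the GPM
+ * timestamp so that, presumably, the timestamp ticks at the same rate as the
+ * activity counters reported by GuC.
+ */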
+static u32 gpm_timestamp_shift(struct xe_gt *gt)
+{
+ u32 reg;
+
+ reg = xe_mmio_read32(&gt->mmio, RPM_CONFIG0);
+
+ return 3 - REG_FIELD_GET(RPM_CONFIG0_CTC_SHIFT_PARAMETER_MASK, reg);
+}
+
+/**
+ * xe_guc_engine_activity_active_ticks - Get engine active ticks
+ * @guc: The GuC object
+ * @hwe: The hw_engine object
+ *
+ * Return: accumulated ticks @hwe was active since engine activity stats were enabled.
+ */
+u64 xe_guc_engine_activity_active_ticks(struct xe_guc *guc, struct xe_hw_engine *hwe)
+{
+ if (!xe_guc_engine_activity_supported(guc))
+ return 0;
+
+ return get_engine_active_ticks(guc, hwe);
+}
+
+/**
+ * xe_guc_engine_activity_total_ticks - Get engine total ticks
+ * @guc: The GuC object
+ * @hwe: The hw_engine object
+ *
+ * Return: accumulated quanta of ticks allocated for the engine
+ */
+u64 xe_guc_engine_activity_total_ticks(struct xe_guc *guc, struct xe_hw_engine *hwe)
+{
+ if (!xe_guc_engine_activity_supported(guc))
+ return 0;
+
+ return get_engine_total_ticks(guc, hwe);
+}
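+
+/*
+ * Usage sketch (illustrative): a consumer such as a PMU can derive engine
+ * utilization over an interval as the ratio of deltas of the two counters
+ * above, e.g. busyness ~= (active_t1 - active_t0) / (total_t1 - total_t0),
+ * sampling both counters at the same two points in time.
+ */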
+
+/**
+ * xe_guc_engine_activity_supported - Check support for engine activity stats
+ * @guc: The GuC object
+ *
+ * Engine activity stats are supported from GuC interface version 1.14.1
+ *
+ * Return: true if engine activity stats supported, false otherwise
+ */
+bool xe_guc_engine_activity_supported(struct xe_guc *guc)
+{
+ struct xe_guc_engine_activity *engine_activity = &guc->engine_activity;
+
+ return engine_activity->supported;
+}
+
+/**
+ * xe_guc_engine_activity_enable_stats - Enable engine activity stats
+ * @guc: The GuC object
+ *
+ * Enable engine activity stats and set initial timestamps
+ */
+void xe_guc_engine_activity_enable_stats(struct xe_guc *guc)
+{
+ int ret;
+
+ if (!xe_guc_engine_activity_supported(guc))
+ return;
+
+ ret = enable_engine_activity_stats(guc);
+ if (ret)
+ xe_gt_err(guc_to_gt(guc), "failed to enable activity stats%d\n", ret);
+ else
+ engine_activity_set_cpu_ts(guc);
+}
+
+static void engine_activity_fini(void *arg)
+{
+ struct xe_guc_engine_activity *engine_activity = arg;
+ struct engine_activity_buffer *buffer = &engine_activity->device_buffer;
+
+ free_engine_activity_buffers(buffer);
+}
+
+/**
+ * xe_guc_engine_activity_init - Initialize the engine activity data
+ * @guc: The GuC object
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int xe_guc_engine_activity_init(struct xe_guc *guc)
+{
+ struct xe_guc_engine_activity *engine_activity = &guc->engine_activity;
+ struct xe_gt *gt = guc_to_gt(guc);
+ int ret;
+
+ engine_activity->supported = is_engine_activity_supported(guc);
+ if (!engine_activity->supported)
+ return 0;
+
+ ret = allocate_engine_activity_group(guc);
+ if (ret) {
+ xe_gt_err(gt, "failed to allocate engine activity group (%pe)\n", ERR_PTR(ret));
+ return ret;
+ }
+
+ ret = allocate_engine_activity_buffers(guc, &engine_activity->device_buffer);
+ if (ret) {
+ xe_gt_err(gt, "failed to allocate engine activity buffers (%pe)\n", ERR_PTR(ret));
+ return ret;
+ }
+
+ engine_activity->gpm_timestamp_shift = gpm_timestamp_shift(gt);
+
+ return devm_add_action_or_reset(gt_to_xe(gt)->drm.dev, engine_activity_fini,
+ engine_activity);
+}
diff --git a/drivers/gpu/drm/xe/xe_guc_engine_activity.h b/drivers/gpu/drm/xe/xe_guc_engine_activity.h
new file mode 100644
index 000000000000..a042d4cb404c
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_guc_engine_activity.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#ifndef _XE_GUC_ENGINE_ACTIVITY_H_
+#define _XE_GUC_ENGINE_ACTIVITY_H_
+
+#include <linux/types.h>
+
+struct xe_hw_engine;
+struct xe_guc;
+
+int xe_guc_engine_activity_init(struct xe_guc *guc);
+bool xe_guc_engine_activity_supported(struct xe_guc *guc);
+void xe_guc_engine_activity_enable_stats(struct xe_guc *guc);
+u64 xe_guc_engine_activity_active_ticks(struct xe_guc *guc, struct xe_hw_engine *hwe);
+u64 xe_guc_engine_activity_total_ticks(struct xe_guc *guc, struct xe_hw_engine *hwe);
+#endif
diff --git a/drivers/gpu/drm/xe/xe_guc_engine_activity_types.h b/drivers/gpu/drm/xe/xe_guc_engine_activity_types.h
new file mode 100644
index 000000000000..5cdd034b6b70
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_guc_engine_activity_types.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#ifndef _XE_GUC_ENGINE_ACTIVITY_TYPES_H_
+#define _XE_GUC_ENGINE_ACTIVITY_TYPES_H_
+
+#include <linux/types.h>
+
+#include "xe_guc_fwif.h"
+/**
+ * struct engine_activity - Engine specific activity data
+ *
+ * Contains engine specific activity data and snapshot of the
+ * structures from GuC
+ */
+struct engine_activity {
+ /** @active: current activity */
+ u64 active;
+
+ /** @last_cpu_ts: cpu timestamp in nsec of previous sample */
+ u64 last_cpu_ts;
+
+ /** @quanta: total quanta used on HW */
+ u64 quanta;
+
+ /** @quanta_ns: total quanta_ns used on HW */
+ u64 quanta_ns;
+
+ /**
+ * @quanta_remainder_ns: remainder when the CPU time is scaled as
+ * per the quanta_ratio. This remainder is used in subsequent
+ * quanta calculations.
+ */
+ u64 quanta_remainder_ns;
+
+ /** @total: total engine activity */
+ u64 total;
+
+ /** @running: true if engine is running some work */
+ bool running;
+
+ /** @metadata: snapshot of engine activity metadata */
+ struct guc_engine_activity_metadata metadata;
+
+ /** @activity: snapshot of engine activity counter */
+ struct guc_engine_activity activity;
+};
+
+/**
+ * struct engine_activity_group - Activity data for all engines
+ */
+struct engine_activity_group {
+ /** @engine: engine specific activity data */
+ struct engine_activity engine[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS];
+};
+
+/**
+ * struct engine_activity_buffer - engine activity buffers
+ *
+ * This contains the buffers allocated for metadata and activity data
+ */
+struct engine_activity_buffer {
+ /** @activity_bo: object allocated to hold activity data */
+ struct xe_bo *activity_bo;
+
+ /** @metadata_bo: object allocated to hold activity metadata */
+ struct xe_bo *metadata_bo;
+};
+
+/**
+ * struct xe_guc_engine_activity - Data used by engine activity implementation
+ */
+struct xe_guc_engine_activity {
+ /** @gpm_timestamp_shift: Right shift value for the gpm timestamp */
+ u32 gpm_timestamp_shift;
+
+ /** @num_activity_group: number of activity groups */
+ u32 num_activity_group;
+
+ /** @supported: indicates support for engine activity stats */
+ bool supported;
+
+ /** @eag: holds the device level engine activity data */
+ struct engine_activity_group *eag;
+
+ /** @device_buffer: buffer object for global engine activity */
+ struct engine_activity_buffer device_buffer;
+};
+#endif
+
diff --git a/drivers/gpu/drm/xe/xe_guc_fwif.h b/drivers/gpu/drm/xe/xe_guc_fwif.h
index 057153f89b30..6f57578b07cb 100644
--- a/drivers/gpu/drm/xe/xe_guc_fwif.h
+++ b/drivers/gpu/drm/xe/xe_guc_fwif.h
@@ -208,6 +208,25 @@ struct guc_engine_usage {
struct guc_engine_usage_record engines[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS];
} __packed;
+/* Engine Activity stats */
+struct guc_engine_activity {
+ u16 change_num;
+ u16 quanta_ratio;
+ u32 last_update_tick;
+ u64 active_ticks;
+} __packed;
+
+struct guc_engine_activity_data {
+ struct guc_engine_activity engine_activity[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS];
+} __packed;
+
+struct guc_engine_activity_metadata {
+ u32 guc_tsc_frequency_hz;
+ u32 lag_latency_usec;
+ u32 global_change_num;
+ u32 reserved;
+} __packed;
+
/* This action will be programmed in C1BC - SOFT_SCRATCH_15_REG */
enum xe_guc_recv_message {
XE_GUC_RECV_MSG_CRASH_DUMP_POSTED = BIT(1),
diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c b/drivers/gpu/drm/xe/xe_guc_pc.c
index 02409eedb914..25040efa043f 100644
--- a/drivers/gpu/drm/xe/xe_guc_pc.c
+++ b/drivers/gpu/drm/xe/xe_guc_pc.c
@@ -995,6 +995,17 @@ out:
return ret;
}
+static int pc_action_set_strategy(struct xe_guc_pc *pc, u32 val)
+{
+ int ret = 0;
+
+ ret = pc_action_set_param(pc,
+ SLPC_PARAM_STRATEGIES,
+ val);
+
+ return ret;
+}
+
/**
* xe_guc_pc_start - Start GuC's Power Conservation component
* @pc: Xe_GuC_PC instance
@@ -1054,6 +1065,11 @@ int xe_guc_pc_start(struct xe_guc_pc *pc)
}
ret = pc_action_setup_gucrc(pc, GUCRC_FIRMWARE_CONTROL);
+ if (ret)
+ goto out;
+
+ /* Enable SLPC Optimized Strategy for compute */
+ ret = pc_action_set_strategy(pc, SLPC_OPTIMIZED_STRATEGY_COMPUTE);
out:
xe_force_wake_put(gt_to_fw(gt), fw_ref);
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 913c74d6e2ae..b95934055f72 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -15,6 +15,7 @@
#include <drm/drm_managed.h>
#include "abi/guc_actions_abi.h"
+#include "abi/guc_actions_slpc_abi.h"
#include "abi/guc_klvs_abi.h"
#include "regs/xe_lrc_layout.h"
#include "xe_assert.h"
@@ -400,6 +401,7 @@ static void __guc_exec_queue_policy_add_##func(struct exec_queue_policy *policy,
MAKE_EXEC_QUEUE_POLICY_ADD(execution_quantum, EXECUTION_QUANTUM)
MAKE_EXEC_QUEUE_POLICY_ADD(preemption_timeout, PREEMPTION_TIMEOUT)
MAKE_EXEC_QUEUE_POLICY_ADD(priority, SCHEDULING_PRIORITY)
+MAKE_EXEC_QUEUE_POLICY_ADD(slpc_exec_queue_freq_req, SLPM_GT_FREQUENCY)
#undef MAKE_EXEC_QUEUE_POLICY_ADD
static const int xe_exec_queue_prio_to_guc[] = {
@@ -414,14 +416,20 @@ static void init_policies(struct xe_guc *guc, struct xe_exec_queue *q)
struct exec_queue_policy policy;
enum xe_exec_queue_priority prio = q->sched_props.priority;
u32 timeslice_us = q->sched_props.timeslice_us;
+ u32 slpc_exec_queue_freq_req = 0;
u32 preempt_timeout_us = q->sched_props.preempt_timeout_us;
xe_gt_assert(guc_to_gt(guc), exec_queue_registered(q));
+ if (q->flags & EXEC_QUEUE_FLAG_LOW_LATENCY)
+ slpc_exec_queue_freq_req |= SLPC_CTX_FREQ_REQ_IS_COMPUTE;
+
__guc_exec_queue_policy_start_klv(&policy, q->guc->id);
__guc_exec_queue_policy_add_priority(&policy, xe_exec_queue_prio_to_guc[prio]);
__guc_exec_queue_policy_add_execution_quantum(&policy, timeslice_us);
__guc_exec_queue_policy_add_preemption_timeout(&policy, preempt_timeout_us);
+ __guc_exec_queue_policy_add_slpc_exec_queue_freq_req(&policy,
+ slpc_exec_queue_freq_req);
xe_guc_ct_send(&guc->ct, (u32 *)&policy.h2g,
__guc_exec_queue_policy_action_size(&policy), 0, 0);
@@ -1248,6 +1256,8 @@ static void __guc_exec_queue_fini_async(struct work_struct *w)
if (xe_exec_queue_is_lr(q))
cancel_work_sync(&ge->lr_tdr);
+ /* Confirm no work left behind accessing device structures */
+ cancel_delayed_work_sync(&ge->sched.base.work_tdr);
release_guc_id(guc, q);
xe_sched_entity_fini(&ge->entity);
xe_sched_fini(&ge->sched);
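
The added cancel_delayed_work_sync() call above follows the usual teardown rule: any delayed work that dereferences an object must be cancelled synchronously before that object is freed. A generic sketch of the ordering with made-up names (not the xe structures):

    #include <linux/workqueue.h>
    #include <linux/slab.h>

    struct demo_ctx {
            struct delayed_work tdr;  /* timeout handler that touches this struct */
            /* ... state used by the work function ... */
    };

    static void demo_ctx_fini(struct demo_ctx *ctx)
    {
            /*
             * Wait for a possibly running timeout handler and make sure it
             * cannot be re-queued before freeing the memory it would touch.
             */
            cancel_delayed_work_sync(&ctx->tdr);
            kfree(ctx);
    }
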
diff --git a/drivers/gpu/drm/xe/xe_guc_types.h b/drivers/gpu/drm/xe/xe_guc_types.h
index 573aa6308380..63bac64429a5 100644
--- a/drivers/gpu/drm/xe/xe_guc_types.h
+++ b/drivers/gpu/drm/xe/xe_guc_types.h
@@ -13,6 +13,7 @@
#include "xe_guc_ads_types.h"
#include "xe_guc_buf_types.h"
#include "xe_guc_ct_types.h"
+#include "xe_guc_engine_activity_types.h"
#include "xe_guc_fwif.h"
#include "xe_guc_log_types.h"
#include "xe_guc_pc_types.h"
@@ -103,6 +104,9 @@ struct xe_guc {
/** @relay: GuC Relay Communication used in SR-IOV */
struct xe_guc_relay relay;
+ /** @engine_activity: Device specific engine activity */
+ struct xe_guc_engine_activity engine_activity;
+
/**
* @notify_reg: Register which is written to notify GuC of H2G messages
*/
diff --git a/drivers/gpu/drm/xe/xe_heci_gsc.c b/drivers/gpu/drm/xe/xe_heci_gsc.c
index 06dc78d3a812..27d11e06a82b 100644
--- a/drivers/gpu/drm/xe/xe_heci_gsc.c
+++ b/drivers/gpu/drm/xe/xe_heci_gsc.c
@@ -89,12 +89,9 @@ static void heci_gsc_release_dev(struct device *dev)
kfree(adev);
}
-void xe_heci_gsc_fini(struct xe_device *xe)
+static void xe_heci_gsc_fini(void *arg)
{
- struct xe_heci_gsc *heci_gsc = &xe->heci_gsc;
-
- if (!xe->info.has_heci_gscfi && !xe->info.has_heci_cscfi)
- return;
+ struct xe_heci_gsc *heci_gsc = arg;
if (heci_gsc->adev) {
struct auxiliary_device *aux_dev = &heci_gsc->adev->aux_dev;
@@ -106,6 +103,7 @@ void xe_heci_gsc_fini(struct xe_device *xe)
if (heci_gsc->irq >= 0)
irq_free_desc(heci_gsc->irq);
+
heci_gsc->irq = -1;
}
@@ -172,14 +170,14 @@ static int heci_gsc_add_device(struct xe_device *xe, const struct heci_gsc_def *
return ret;
}
-void xe_heci_gsc_init(struct xe_device *xe)
+int xe_heci_gsc_init(struct xe_device *xe)
{
struct xe_heci_gsc *heci_gsc = &xe->heci_gsc;
- const struct heci_gsc_def *def;
+ const struct heci_gsc_def *def = NULL;
int ret;
if (!xe->info.has_heci_gscfi && !xe->info.has_heci_cscfi)
- return;
+ return 0;
heci_gsc->irq = -1;
@@ -191,29 +189,24 @@ void xe_heci_gsc_init(struct xe_device *xe)
def = &heci_gsc_def_dg2;
} else if (xe->info.platform == XE_DG1) {
def = &heci_gsc_def_dg1;
- } else {
- drm_warn_once(&xe->drm, "Unknown platform\n");
- return;
}
- if (!def->name) {
- drm_warn_once(&xe->drm, "HECI is not implemented!\n");
- return;
+ if (!def || !def->name) {
+ drm_warn(&xe->drm, "HECI is not implemented!\n");
+ return 0;
}
- if (!def->use_polling && !xe_survivability_mode_enabled(xe)) {
+ ret = devm_add_action_or_reset(xe->drm.dev, xe_heci_gsc_fini, heci_gsc);
+ if (ret)
+ return ret;
+
+ if (!def->use_polling && !xe_survivability_mode_is_enabled(xe)) {
ret = heci_gsc_irq_setup(xe);
if (ret)
- goto fail;
+ return ret;
}
- ret = heci_gsc_add_device(xe, def);
- if (ret)
- goto fail;
-
- return;
-fail:
- xe_heci_gsc_fini(xe);
+ return heci_gsc_add_device(xe, def);
}
void xe_heci_gsc_irq_handler(struct xe_device *xe, u32 iir)
diff --git a/drivers/gpu/drm/xe/xe_heci_gsc.h b/drivers/gpu/drm/xe/xe_heci_gsc.h
index 48b3b1838045..745eb6783942 100644
--- a/drivers/gpu/drm/xe/xe_heci_gsc.h
+++ b/drivers/gpu/drm/xe/xe_heci_gsc.h
@@ -33,8 +33,7 @@ struct xe_heci_gsc {
int irq;
};
-void xe_heci_gsc_init(struct xe_device *xe);
-void xe_heci_gsc_fini(struct xe_device *xe);
+int xe_heci_gsc_init(struct xe_device *xe);
void xe_heci_gsc_irq_handler(struct xe_device *xe, u32 iir);
void xe_heci_csc_irq_handler(struct xe_device *xe, u32 iir);
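
xe_heci_gsc_init() now registers its teardown through devm_add_action_or_reset() instead of exporting a separate fini entry point, so cleanup runs automatically on unbind and also if the registration itself fails. A minimal generic sketch of that pattern (names are illustrative only):

    #include <linux/device.h>

    struct demo_state {
            int irq;  /* stand-in for a resource released at teardown */
    };

    static void demo_fini(void *arg)
    {
            struct demo_state *state = arg;

            /* Undo whatever demo_init() set up. */
            state->irq = -1;
    }

    static int demo_init(struct device *dev, struct demo_state *state)
    {
            int ret;

            /* If registration fails, demo_fini() has already been called. */
            ret = devm_add_action_or_reset(dev, demo_fini, state);
            if (ret)
                    return ret;

            /* ... acquire the resources that demo_fini() releases ... */
            return 0;
    }
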
diff --git a/drivers/gpu/drm/xe/xe_hmm.c b/drivers/gpu/drm/xe/xe_hmm.c
index 089834467880..392102515f3d 100644
--- a/drivers/gpu/drm/xe/xe_hmm.c
+++ b/drivers/gpu/drm/xe/xe_hmm.c
@@ -19,11 +19,10 @@ static u64 xe_npages_in_range(unsigned long start, unsigned long end)
return (end - start) >> PAGE_SHIFT;
}
-/*
+/**
* xe_mark_range_accessed() - mark a range as accessed, so the core mm
* has that information for memory eviction or write back to
* hard disk
- *
* @range: the range to mark
* @write: if write to this range, we mark pages in this range
* as dirty
@@ -43,15 +42,51 @@ static void xe_mark_range_accessed(struct hmm_range *range, bool write)
}
}
-/*
+static int xe_alloc_sg(struct xe_device *xe, struct sg_table *st,
+ struct hmm_range *range, struct rw_semaphore *notifier_sem)
+{
+ unsigned long i, npages, hmm_pfn;
+ unsigned long num_chunks = 0;
+ int ret;
+
+ /* HMM docs say this is needed. */
+ ret = down_read_interruptible(notifier_sem);
+ if (ret)
+ return ret;
+
+ if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) {
+ up_read(notifier_sem);
+ return -EAGAIN;
+ }
+
+ npages = xe_npages_in_range(range->start, range->end);
+ for (i = 0; i < npages;) {
+ unsigned long len;
+
+ hmm_pfn = range->hmm_pfns[i];
+ xe_assert(xe, hmm_pfn & HMM_PFN_VALID);
+
+ len = 1UL << hmm_pfn_to_map_order(hmm_pfn);
+
+ /* If order > 0 the page may extend beyond range->start */
+ len -= (hmm_pfn & ~HMM_PFN_FLAGS) & (len - 1);
+ i += len;
+ num_chunks++;
+ }
+ up_read(notifier_sem);
+
+ return sg_alloc_table(st, num_chunks, GFP_KERNEL);
+}
+
+/**
* xe_build_sg() - build a scatter gather table for all the physical pages/pfn
* in a hmm_range. dma-map pages if necessary. dma-address is saved in sg table
* and will be used to program the GPU page table later.
- *
* @xe: the xe device who will access the dma-address in sg table
* @range: the hmm range that we build the sg table from. range->hmm_pfns[]
* has the pfn numbers of pages that back up this hmm address range.
* @st: pointer to the sg table.
+ * @notifier_sem: The xe notifier lock.
* @write: whether we write to this range. This decides dma map direction
* for system pages. If write we map it bi-directional; otherwise
* DMA_TO_DEVICE
@@ -78,43 +113,84 @@ static void xe_mark_range_accessed(struct hmm_range *range, bool write)
* Returns 0 if successful; -ENOMEM if fails to allocate memory
*/
static int xe_build_sg(struct xe_device *xe, struct hmm_range *range,
- struct sg_table *st, bool write)
+ struct sg_table *st,
+ struct rw_semaphore *notifier_sem,
+ bool write)
{
+ unsigned long npages = xe_npages_in_range(range->start, range->end);
struct device *dev = xe->drm.dev;
- struct page **pages;
- u64 i, npages;
- int ret;
+ struct scatterlist *sgl;
+ struct page *page;
+ unsigned long i, j;
- npages = xe_npages_in_range(range->start, range->end);
- pages = kvmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
- if (!pages)
- return -ENOMEM;
+ lockdep_assert_held(notifier_sem);
- for (i = 0; i < npages; i++) {
- pages[i] = hmm_pfn_to_page(range->hmm_pfns[i]);
- xe_assert(xe, !is_device_private_page(pages[i]));
+ i = 0;
+ for_each_sg(st->sgl, sgl, st->nents, j) {
+ unsigned long hmm_pfn, size;
+
+ hmm_pfn = range->hmm_pfns[i];
+ page = hmm_pfn_to_page(hmm_pfn);
+ xe_assert(xe, !is_device_private_page(page));
+
+ size = 1UL << hmm_pfn_to_map_order(hmm_pfn);
+ size -= page_to_pfn(page) & (size - 1);
+ i += size;
+
+ if (unlikely(j == st->nents - 1)) {
+ if (i > npages)
+ size -= (i - npages);
+ sg_mark_end(sgl);
+ }
+ sg_set_page(sgl, page, size << PAGE_SHIFT, 0);
}
+ xe_assert(xe, i == npages);
- ret = sg_alloc_table_from_pages_segment(st, pages, npages, 0, npages << PAGE_SHIFT,
- xe_sg_segment_size(dev), GFP_KERNEL);
- if (ret)
- goto free_pages;
+ return dma_map_sgtable(dev, st, write ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING);
+}
+
+static void xe_hmm_userptr_set_mapped(struct xe_userptr_vma *uvma)
+{
+ struct xe_userptr *userptr = &uvma->userptr;
+ struct xe_vm *vm = xe_vma_vm(&uvma->vma);
+
+ lockdep_assert_held_write(&vm->lock);
+ lockdep_assert_held(&vm->userptr.notifier_lock);
+
+ mutex_lock(&userptr->unmap_mutex);
+ xe_assert(vm->xe, !userptr->mapped);
+ userptr->mapped = true;
+ mutex_unlock(&userptr->unmap_mutex);
+}
+
+void xe_hmm_userptr_unmap(struct xe_userptr_vma *uvma)
+{
+ struct xe_userptr *userptr = &uvma->userptr;
+ struct xe_vma *vma = &uvma->vma;
+ bool write = !xe_vma_read_only(vma);
+ struct xe_vm *vm = xe_vma_vm(vma);
+ struct xe_device *xe = vm->xe;
- ret = dma_map_sgtable(dev, st, write ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE,
- DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING);
- if (ret) {
- sg_free_table(st);
- st = NULL;
+ if (!lockdep_is_held_type(&vm->userptr.notifier_lock, 0) &&
+ !lockdep_is_held_type(&vm->lock, 0) &&
+ !(vma->gpuva.flags & XE_VMA_DESTROYED)) {
+ /* Don't unmap in exec critical section. */
+ xe_vm_assert_held(vm);
+ /* Don't unmap while mapping the sg. */
+ lockdep_assert_held(&vm->lock);
}
-free_pages:
- kvfree(pages);
- return ret;
+ mutex_lock(&userptr->unmap_mutex);
+ if (userptr->sg && userptr->mapped)
+ dma_unmap_sgtable(xe->drm.dev, userptr->sg,
+ write ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE, 0);
+ userptr->mapped = false;
+ mutex_unlock(&userptr->unmap_mutex);
}
-/*
+/**
* xe_hmm_userptr_free_sg() - Free the scatter gather table of userptr
- *
* @uvma: the userptr vma which hold the scatter gather table
*
* With function xe_userptr_populate_range, we allocate storage of
@@ -124,16 +200,9 @@ free_pages:
void xe_hmm_userptr_free_sg(struct xe_userptr_vma *uvma)
{
struct xe_userptr *userptr = &uvma->userptr;
- struct xe_vma *vma = &uvma->vma;
- bool write = !xe_vma_read_only(vma);
- struct xe_vm *vm = xe_vma_vm(vma);
- struct xe_device *xe = vm->xe;
- struct device *dev = xe->drm.dev;
-
- xe_assert(xe, userptr->sg);
- dma_unmap_sgtable(dev, userptr->sg,
- write ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE, 0);
+ xe_assert(xe_vma_vm(&uvma->vma)->xe, userptr->sg);
+ xe_hmm_userptr_unmap(uvma);
sg_free_table(userptr->sg);
userptr->sg = NULL;
}
@@ -166,13 +235,20 @@ int xe_hmm_userptr_populate_range(struct xe_userptr_vma *uvma,
{
unsigned long timeout =
jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
- unsigned long *pfns, flags = HMM_PFN_REQ_FAULT;
+ unsigned long *pfns;
struct xe_userptr *userptr;
struct xe_vma *vma = &uvma->vma;
u64 userptr_start = xe_vma_userptr(vma);
u64 userptr_end = userptr_start + xe_vma_size(vma);
struct xe_vm *vm = xe_vma_vm(vma);
- struct hmm_range hmm_range;
+ struct hmm_range hmm_range = {
+ .pfn_flags_mask = 0, /* ignore pfns */
+ .default_flags = HMM_PFN_REQ_FAULT,
+ .start = userptr_start,
+ .end = userptr_end,
+ .notifier = &uvma->userptr.notifier,
+ .dev_private_owner = vm->xe,
+ };
bool write = !xe_vma_read_only(vma);
unsigned long notifier_seq;
u64 npages;
@@ -199,19 +275,14 @@ int xe_hmm_userptr_populate_range(struct xe_userptr_vma *uvma,
return -ENOMEM;
if (write)
- flags |= HMM_PFN_REQ_WRITE;
+ hmm_range.default_flags |= HMM_PFN_REQ_WRITE;
if (!mmget_not_zero(userptr->notifier.mm)) {
ret = -EFAULT;
goto free_pfns;
}
- hmm_range.default_flags = flags;
hmm_range.hmm_pfns = pfns;
- hmm_range.notifier = &userptr->notifier;
- hmm_range.start = userptr_start;
- hmm_range.end = userptr_end;
- hmm_range.dev_private_owner = vm->xe;
while (true) {
hmm_range.notifier_seq = mmu_interval_read_begin(&userptr->notifier);
@@ -238,16 +309,37 @@ int xe_hmm_userptr_populate_range(struct xe_userptr_vma *uvma,
if (ret)
goto free_pfns;
- ret = xe_build_sg(vm->xe, &hmm_range, &userptr->sgt, write);
+ ret = xe_alloc_sg(vm->xe, &userptr->sgt, &hmm_range, &vm->userptr.notifier_lock);
if (ret)
goto free_pfns;
+ ret = down_read_interruptible(&vm->userptr.notifier_lock);
+ if (ret)
+ goto free_st;
+
+ if (mmu_interval_read_retry(hmm_range.notifier, hmm_range.notifier_seq)) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
+ ret = xe_build_sg(vm->xe, &hmm_range, &userptr->sgt,
+ &vm->userptr.notifier_lock, write);
+ if (ret)
+ goto out_unlock;
+
xe_mark_range_accessed(&hmm_range, write);
userptr->sg = &userptr->sgt;
+ xe_hmm_userptr_set_mapped(uvma);
userptr->notifier_seq = hmm_range.notifier_seq;
+ up_read(&vm->userptr.notifier_lock);
+ kvfree(pfns);
+ return 0;
+out_unlock:
+ up_read(&vm->userptr.notifier_lock);
+free_st:
+ sg_free_table(&userptr->sgt);
free_pfns:
kvfree(pfns);
return ret;
}
-
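
xe_alloc_sg() above sizes the scatter-gather table by collapsing pfns that belong to the same higher-order (huge) page into a single chunk, trimming the first chunk when the mapping starts in the middle of a large page. The counting logic in isolation, as a self-contained sketch over a made-up (pfn, order) view of the range:

    /* One entry per PAGE_SIZE pfn; 'order' is the map order reported for it. */
    struct pfn_view {
            unsigned long pfn;
            unsigned int order;
    };

    /* Count how many sg entries the pfn array collapses into. */
    static unsigned long count_sg_chunks(const struct pfn_view *pfns,
                                         unsigned long npages)
    {
            unsigned long i = 0, chunks = 0;

            while (i < npages) {
                    unsigned long len = 1UL << pfns[i].order;

                    /* The range may start mid huge-page: trim to its end. */
                    len -= pfns[i].pfn & (len - 1);
                    i += len;
                    chunks++;
            }

            return chunks;
    }
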
diff --git a/drivers/gpu/drm/xe/xe_hmm.h b/drivers/gpu/drm/xe/xe_hmm.h
index 909dc2bdcd97..0ea98d8e7bbc 100644
--- a/drivers/gpu/drm/xe/xe_hmm.h
+++ b/drivers/gpu/drm/xe/xe_hmm.h
@@ -3,9 +3,16 @@
* Copyright © 2024 Intel Corporation
*/
+#ifndef _XE_HMM_H_
+#define _XE_HMM_H_
+
#include <linux/types.h>
struct xe_userptr_vma;
int xe_hmm_userptr_populate_range(struct xe_userptr_vma *uvma, bool is_mm_mmap_locked);
+
void xe_hmm_userptr_free_sg(struct xe_userptr_vma *uvma);
+
+void xe_hmm_userptr_unmap(struct xe_userptr_vma *uvma);
+#endif
diff --git a/drivers/gpu/drm/xe/xe_hw_engine_group.c b/drivers/gpu/drm/xe/xe_hw_engine_group.c
index 82750520a90a..2d68c5b5262a 100644
--- a/drivers/gpu/drm/xe/xe_hw_engine_group.c
+++ b/drivers/gpu/drm/xe/xe_hw_engine_group.c
@@ -178,6 +178,7 @@ err_suspend:
up_write(&group->mode_sem);
return err;
}
+ALLOW_ERROR_INJECTION(xe_hw_engine_group_add_exec_queue, ERRNO);
/**
* xe_hw_engine_group_del_exec_queue() - Delete an exec queue from a hw engine group
diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index 278bc96cf593..df4282c71bf0 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -1544,6 +1544,181 @@ void xe_migrate_wait(struct xe_migrate *m)
dma_fence_wait(m->fence, false);
}
+static u32 pte_update_cmd_size(u64 size)
+{
+ u32 num_dword;
+ u64 entries = DIV_ROUND_UP(size, XE_PAGE_SIZE);
+
+ XE_WARN_ON(size > MAX_PREEMPTDISABLE_TRANSFER);
+ /*
+ * MI_STORE_DATA_IMM command is used to update page table. Each
+ * instruction can update maximumly 0x1ff pte entries. To update
+ * n (n <= 0x1ff) pte entries, we need:
+ * 1 dword for the MI_STORE_DATA_IMM command header (opcode etc)
+ * 2 dword for the page table's physical location
+ * 2*n dword for value of pte to fill (each pte entry is 2 dwords)
+ */
+ num_dword = (1 + 2) * DIV_ROUND_UP(entries, 0x1ff);
+ num_dword += entries * 2;
+
+ return num_dword;
+}
+
+static void build_pt_update_batch_sram(struct xe_migrate *m,
+ struct xe_bb *bb, u32 pt_offset,
+ dma_addr_t *sram_addr, u32 size)
+{
+ u16 pat_index = tile_to_xe(m->tile)->pat.idx[XE_CACHE_WB];
+ u32 ptes;
+ int i = 0;
+
+ ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);
+ while (ptes) {
+ u32 chunk = min(0x1ffU, ptes);
+
+ bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
+ bb->cs[bb->len++] = pt_offset;
+ bb->cs[bb->len++] = 0;
+
+ pt_offset += chunk * 8;
+ ptes -= chunk;
+
+ while (chunk--) {
+ u64 addr = sram_addr[i++] & PAGE_MASK;
+
+ xe_tile_assert(m->tile, addr);
+ addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe,
+ addr, pat_index,
+ 0, false, 0);
+ bb->cs[bb->len++] = lower_32_bits(addr);
+ bb->cs[bb->len++] = upper_32_bits(addr);
+ }
+ }
+}
+
+enum xe_migrate_copy_dir {
+ XE_MIGRATE_COPY_TO_VRAM,
+ XE_MIGRATE_COPY_TO_SRAM,
+};
+
+static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
+ unsigned long npages,
+ dma_addr_t *sram_addr, u64 vram_addr,
+ const enum xe_migrate_copy_dir dir)
+{
+ struct xe_gt *gt = m->tile->primary_gt;
+ struct xe_device *xe = gt_to_xe(gt);
+ struct dma_fence *fence = NULL;
+ u32 batch_size = 2;
+ u64 src_L0_ofs, dst_L0_ofs;
+ u64 round_update_size;
+ struct xe_sched_job *job;
+ struct xe_bb *bb;
+ u32 update_idx, pt_slot = 0;
+ int err;
+
+ if (npages * PAGE_SIZE > MAX_PREEMPTDISABLE_TRANSFER)
+ return ERR_PTR(-EINVAL);
+
+ round_update_size = npages * PAGE_SIZE;
+ batch_size += pte_update_cmd_size(round_update_size);
+ batch_size += EMIT_COPY_DW;
+
+ bb = xe_bb_new(gt, batch_size, true);
+ if (IS_ERR(bb)) {
+ err = PTR_ERR(bb);
+ return ERR_PTR(err);
+ }
+
+ build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
+ sram_addr, round_update_size);
+
+ if (dir == XE_MIGRATE_COPY_TO_VRAM) {
+ src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
+ dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
+
+ } else {
+ src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
+ dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
+ }
+
+ bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
+ update_idx = bb->len;
+
+ emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, round_update_size,
+ XE_PAGE_SIZE);
+
+ job = xe_bb_create_migration_job(m->q, bb,
+ xe_migrate_batch_base(m, true),
+ update_idx);
+ if (IS_ERR(job)) {
+ err = PTR_ERR(job);
+ goto err;
+ }
+
+ xe_sched_job_add_migrate_flush(job, 0);
+
+ mutex_lock(&m->job_mutex);
+ xe_sched_job_arm(job);
+ fence = dma_fence_get(&job->drm.s_fence->finished);
+ xe_sched_job_push(job);
+
+ dma_fence_put(m->fence);
+ m->fence = dma_fence_get(fence);
+ mutex_unlock(&m->job_mutex);
+
+ xe_bb_free(bb, fence);
+
+ return fence;
+
+err:
+ xe_bb_free(bb, NULL);
+
+ return ERR_PTR(err);
+}
+
+/**
+ * xe_migrate_to_vram() - Migrate to VRAM
+ * @m: The migration context.
+ * @npages: Number of pages to migrate.
+ * @src_addr: Array of dma addresses (source of migrate)
+ * @dst_addr: Device physical address of VRAM (destination of migrate)
+ *
+ * Copy from an array of dma addresses to a VRAM device physical address
+ *
+ * Return: dma fence for migrate to signal completion on success, ERR_PTR on
+ * failure
+ */
+struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
+ unsigned long npages,
+ dma_addr_t *src_addr,
+ u64 dst_addr)
+{
+ return xe_migrate_vram(m, npages, src_addr, dst_addr,
+ XE_MIGRATE_COPY_TO_VRAM);
+}
+
+/**
+ * xe_migrate_from_vram() - Migrate from VRAM
+ * @m: The migration context.
+ * @npages: Number of pages to migrate.
+ * @src_addr: Device physical address of VRAM (source of migrate)
+ * @dst_addr: Array of dma addresses (destination of migrate)
+ *
+ * Copy from a VRAM device physical address to an array of dma addresses
+ *
+ * Return: dma fence for migrate to signal completion on success, ERR_PTR on
+ * failure
+ */
+struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m,
+ unsigned long npages,
+ u64 src_addr,
+ dma_addr_t *dst_addr)
+{
+ return xe_migrate_vram(m, npages, dst_addr, src_addr,
+ XE_MIGRATE_COPY_TO_SRAM);
+}
+
#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
#include "tests/xe_migrate.c"
#endif
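
pte_update_cmd_size() above budgets batch-buffer space for the MI_STORE_DATA_IMM page-table writes: each command costs 3 dwords of header/address plus 2 dwords per PTE, and at most 0x1ff PTEs fit in one command. A worked example of that arithmetic for a 2 MiB transfer, assuming XE_PAGE_SIZE is 4 KiB:

    #include <linux/kernel.h>
    #include <linux/sizes.h>
    #include <linux/types.h>

    /* Worked example: dword budget for a 2 MiB SRAM<->VRAM transfer. */
    static u32 demo_pte_update_dwords_2m(void)
    {
            u32 entries = SZ_2M / SZ_4K;                 /* 512 PTEs */
            u32 cmds = DIV_ROUND_UP(entries, 0x1ff);     /* 2 MI_STORE_DATA_IMM commands */

            /* 3 header/address dwords per command + 2 dwords per PTE. */
            return cmds * (1 + 2) + entries * 2;         /* 6 + 1024 = 1030 dwords */
    }
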
diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
index 0109866e398a..6ff9a963425c 100644
--- a/drivers/gpu/drm/xe/xe_migrate.h
+++ b/drivers/gpu/drm/xe/xe_migrate.h
@@ -95,6 +95,16 @@ struct xe_migrate_pt_update {
struct xe_migrate *xe_migrate_init(struct xe_tile *tile);
+struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
+ unsigned long npages,
+ dma_addr_t *src_addr,
+ u64 dst_addr);
+
+struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m,
+ unsigned long npages,
+ u64 src_addr,
+ dma_addr_t *dst_addr);
+
struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
struct xe_bo *src_bo,
struct xe_bo *dst_bo,
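
The new helpers return a dma_fence that signals when the copy job completes, so a caller is expected to wait on (or chain) the fence and drop its reference. A hedged sketch of the expected calling pattern, with error handling trimmed and the wrapper name assumed:

    #include <linux/dma-fence.h>
    #include "xe_migrate.h"

    static int demo_copy_pages_to_vram(struct xe_migrate *m, unsigned long npages,
                                       dma_addr_t *dma_addrs, u64 vram_addr)
    {
            struct dma_fence *fence;

            fence = xe_migrate_to_vram(m, npages, dma_addrs, vram_addr);
            if (IS_ERR(fence))
                    return PTR_ERR(fence);

            /* Synchronous wait for illustration; real users may chain the fence. */
            dma_fence_wait(fence, false);
            dma_fence_put(fence);

            return 0;
    }
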
diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
index 7185a2cdf6e3..9f4632e39a1a 100644
--- a/drivers/gpu/drm/xe/xe_module.c
+++ b/drivers/gpu/drm/xe/xe_module.c
@@ -22,9 +22,16 @@ struct xe_modparam xe_modparam = {
.guc_log_level = 3,
.force_probe = CONFIG_DRM_XE_FORCE_PROBE,
.wedged_mode = 1,
+ .svm_notifier_size = 512,
/* the rest are 0 by default */
};
+module_param_named(svm_notifier_size, xe_modparam.svm_notifier_size, uint, 0600);
+MODULE_PARM_DESC(svm_notifier_size, "Set the svm notifier size (in MiB), must be a power of 2");
+
+module_param_named(always_migrate_to_vram, xe_modparam.always_migrate_to_vram, bool, 0444);
+MODULE_PARM_DESC(always_migrate_to_vram, "Always migrate to VRAM on GPU fault");
+
module_param_named_unsafe(force_execlist, xe_modparam.force_execlist, bool, 0444);
MODULE_PARM_DESC(force_execlist, "Force Execlist submission");
diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
index 161a5e6f717f..84339e509c80 100644
--- a/drivers/gpu/drm/xe/xe_module.h
+++ b/drivers/gpu/drm/xe/xe_module.h
@@ -12,6 +12,7 @@
struct xe_modparam {
bool force_execlist;
bool probe_display;
+ bool always_migrate_to_vram;
u32 force_vram_bar_size;
int guc_log_level;
char *guc_firmware_path;
@@ -22,6 +23,7 @@ struct xe_modparam {
unsigned int max_vfs;
#endif
int wedged_mode;
+ u32 svm_notifier_size;
};
extern struct xe_modparam xe_modparam;
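
The new svm_notifier_size knob follows the standard module_param_named()/MODULE_PARM_DESC() pattern, with the power-of-two requirement left to be enforced (or defaulted) at probe time rather than by the parameter core. A generic sketch of that pattern with hypothetical names:

    #include <linux/module.h>
    #include <linux/log2.h>

    static unsigned int demo_notifier_size = 512; /* MiB */
    module_param_named(notifier_size, demo_notifier_size, uint, 0600);
    MODULE_PARM_DESC(notifier_size,
                     "Notifier granularity in MiB, must be a power of 2");

    /* Called from probe: fall back to the default on a bogus value. */
    static void demo_clamp_notifier_size(void)
    {
            if (!is_power_of_2(demo_notifier_size))
                    demo_notifier_size = 512;
    }
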
diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c
index 2c5a24a13e87..6f185632da14 100644
--- a/drivers/gpu/drm/xe/xe_oa.c
+++ b/drivers/gpu/drm/xe/xe_oa.c
@@ -12,6 +12,8 @@
#include <drm/drm_managed.h>
#include <uapi/drm/xe_drm.h>
+#include <generated/xe_wa_oob.h>
+
#include "abi/guc_actions_slpc_abi.h"
#include "instructions/xe_mi_commands.h"
#include "regs/xe_engine_regs.h"
@@ -35,6 +37,7 @@
#include "xe_sched_job.h"
#include "xe_sriov.h"
#include "xe_sync.h"
+#include "xe_wa.h"
#define DEFAULT_POLL_FREQUENCY_HZ 200
#define DEFAULT_POLL_PERIOD_NS (NSEC_PER_SEC / DEFAULT_POLL_FREQUENCY_HZ)
@@ -812,11 +815,8 @@ static void xe_oa_disable_metric_set(struct xe_oa_stream *stream)
struct xe_mmio *mmio = &stream->gt->mmio;
u32 sqcnt1;
- /*
- * Wa_1508761755:xehpsdv, dg2
- * Enable thread stall DOP gating and EU DOP gating.
- */
- if (stream->oa->xe->info.platform == XE_DG2) {
+ /* Enable thread stall DOP gating and EU DOP gating. */
+ if (XE_WA(stream->gt, 1508761755)) {
xe_gt_mcr_multicast_write(stream->gt, ROW_CHICKEN,
_MASKED_BIT_DISABLE(STALL_DOP_GATING_DISABLE));
xe_gt_mcr_multicast_write(stream->gt, ROW_CHICKEN2,
@@ -1065,11 +1065,10 @@ static int xe_oa_enable_metric_set(struct xe_oa_stream *stream)
int ret;
/*
- * Wa_1508761755:xehpsdv, dg2
* EU NOA signals behave incorrectly if EU clock gating is enabled.
* Disable thread stall DOP gating and EU DOP gating.
*/
- if (stream->oa->xe->info.platform == XE_DG2) {
+ if (XE_WA(stream->gt, 1508761755)) {
xe_gt_mcr_multicast_write(stream->gt, ROW_CHICKEN,
_MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE));
xe_gt_mcr_multicast_write(stream->gt, ROW_CHICKEN2,
@@ -1690,7 +1689,7 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream,
stream->oa_buffer.format = &stream->oa->oa_formats[param->oa_format];
stream->sample = param->sample;
- stream->periodic = param->period_exponent > 0;
+ stream->periodic = param->period_exponent >= 0;
stream->period_exponent = param->period_exponent;
stream->no_preempt = param->no_preempt;
stream->wait_num_reports = param->wait_num_reports;
@@ -1720,12 +1719,10 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream,
}
/*
- * Wa_1509372804:pvc
- *
* GuC reset of engines causes OA to lose configuration
* state. Prevent this by overriding GUCRC mode.
*/
- if (stream->oa->xe->info.platform == XE_PVC) {
+ if (XE_WA(stream->gt, 1509372804)) {
ret = xe_guc_pc_override_gucrc_mode(&gt->uc.guc.pc,
SLPC_GUCRC_MODE_GUCRC_NO_RC6);
if (ret)
@@ -1857,23 +1854,14 @@ u32 xe_oa_timestamp_frequency(struct xe_gt *gt)
{
u32 reg, shift;
- /*
- * Wa_18013179988:dg2
- * Wa_14015568240:pvc
- * Wa_14015846243:mtl
- */
- switch (gt_to_xe(gt)->info.platform) {
- case XE_DG2:
- case XE_PVC:
- case XE_METEORLAKE:
+ if (XE_WA(gt, 18013179988) || XE_WA(gt, 14015568240)) {
xe_pm_runtime_get(gt_to_xe(gt));
reg = xe_mmio_read32(&gt->mmio, RPM_CONFIG0);
xe_pm_runtime_put(gt_to_xe(gt));
shift = REG_FIELD_GET(RPM_CONFIG0_CTC_SHIFT_PARAMETER_MASK, reg);
return gt->info.reference_clock << (3 - shift);
-
- default:
+ } else {
return gt->info.reference_clock;
}
}
@@ -1971,6 +1959,7 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, u64 data, struct drm_file *f
}
param.xef = xef;
+ param.period_exponent = -1;
ret = xe_oa_user_extensions(oa, XE_OA_USER_EXTN_FROM_OPEN, data, 0, &param);
if (ret)
return ret;
@@ -2025,7 +2014,7 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, u64 data, struct drm_file *f
goto err_exec_q;
}
- if (param.period_exponent > 0) {
+ if (param.period_exponent >= 0) {
u64 oa_period, oa_freq_hz;
/* Requesting samples from OAG buffer is a privileged operation */
diff --git a/drivers/gpu/drm/xe/xe_observation.c b/drivers/gpu/drm/xe/xe_observation.c
index 57cf01efc07f..e3f9b546207e 100644
--- a/drivers/gpu/drm/xe/xe_observation.c
+++ b/drivers/gpu/drm/xe/xe_observation.c
@@ -8,6 +8,7 @@
#include <uapi/drm/xe_drm.h>
+#include "xe_eu_stall.h"
#include "xe_oa.h"
#include "xe_observation.h"
@@ -29,6 +30,17 @@ static int xe_oa_ioctl(struct drm_device *dev, struct drm_xe_observation_param *
}
}
+static int xe_eu_stall_ioctl(struct drm_device *dev, struct drm_xe_observation_param *arg,
+ struct drm_file *file)
+{
+ switch (arg->observation_op) {
+ case DRM_XE_OBSERVATION_OP_STREAM_OPEN:
+ return xe_eu_stall_stream_open(dev, arg->param, file);
+ default:
+ return -EINVAL;
+ }
+}
+
/**
* xe_observation_ioctl - The top level observation layer ioctl
* @dev: @drm_device
@@ -51,6 +63,8 @@ int xe_observation_ioctl(struct drm_device *dev, void *data, struct drm_file *fi
switch (arg->observation_type) {
case DRM_XE_OBSERVATION_TYPE_OA:
return xe_oa_ioctl(dev, arg, file);
+ case DRM_XE_OBSERVATION_TYPE_EU_STALL:
+ return xe_eu_stall_ioctl(dev, arg, file);
default:
return -EINVAL;
}
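
xe_observation_ioctl() dispatches first on observation_type and then on observation_op, so opening an EU stall stream from user space comes down to filling struct drm_xe_observation_param and passing a pointer to the stream-open properties in 'param'. A rough user-space sketch; the ioctl name and struct layout are taken from the xe uAPI as I understand it, and the properties blob itself is assumed/omitted:

    #include <sys/ioctl.h>
    #include <drm/xe_drm.h>

    /* Returns the stream fd on success, -1 (with errno set) on failure. */
    static int open_eu_stall_stream(int drm_fd, __u64 properties_ptr)
    {
            struct drm_xe_observation_param arg = {
                    .observation_type = DRM_XE_OBSERVATION_TYPE_EU_STALL,
                    .observation_op = DRM_XE_OBSERVATION_OP_STREAM_OPEN,
                    .param = properties_ptr, /* pointer to the property/extension chain */
            };

            return ioctl(drm_fd, DRM_IOCTL_XE_OBSERVATION, &arg);
    }
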
diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
index f8417f4d8ce6..da9679c8cf26 100644
--- a/drivers/gpu/drm/xe/xe_pci.c
+++ b/drivers/gpu/drm/xe/xe_pci.c
@@ -46,9 +46,9 @@ struct xe_subplatform_desc {
struct xe_device_desc {
/* Should only ever be set for platforms without GMD_ID */
- const struct xe_graphics_desc *graphics;
+ const struct xe_ip *pre_gmdid_graphics_ip;
/* Should only ever be set for platforms without GMD_ID */
- const struct xe_media_desc *media;
+ const struct xe_ip *pre_gmdid_media_ip;
const char *platform_name;
const struct xe_subplatform_desc *subplatforms;
@@ -82,21 +82,6 @@ __diag_ignore_all("-Woverride-init", "Allow field overrides in table");
#define NOP(x) x
static const struct xe_graphics_desc graphics_xelp = {
- .name = "Xe_LP",
- .ver = 12,
- .rel = 0,
-
- .hw_engine_mask = BIT(XE_HW_ENGINE_RCS0) | BIT(XE_HW_ENGINE_BCS0),
-
- .va_bits = 48,
- .vm_max_level = 3,
-};
-
-static const struct xe_graphics_desc graphics_xelpp = {
- .name = "Xe_LP+",
- .ver = 12,
- .rel = 10,
-
.hw_engine_mask = BIT(XE_HW_ENGINE_RCS0) | BIT(XE_HW_ENGINE_BCS0),
.va_bits = 48,
@@ -109,10 +94,6 @@ static const struct xe_graphics_desc graphics_xelpp = {
.vm_max_level = 3
static const struct xe_graphics_desc graphics_xehpg = {
- .name = "Xe_HPG",
- .ver = 12,
- .rel = 55,
-
.hw_engine_mask =
BIT(XE_HW_ENGINE_RCS0) | BIT(XE_HW_ENGINE_BCS0) |
BIT(XE_HW_ENGINE_CCS0) | BIT(XE_HW_ENGINE_CCS1) |
@@ -125,10 +106,6 @@ static const struct xe_graphics_desc graphics_xehpg = {
};
static const struct xe_graphics_desc graphics_xehpc = {
- .name = "Xe_HPC",
- .ver = 12,
- .rel = 60,
-
.hw_engine_mask =
BIT(XE_HW_ENGINE_BCS0) | BIT(XE_HW_ENGINE_BCS1) |
BIT(XE_HW_ENGINE_BCS2) | BIT(XE_HW_ENGINE_BCS3) |
@@ -149,7 +126,6 @@ static const struct xe_graphics_desc graphics_xehpc = {
};
static const struct xe_graphics_desc graphics_xelpg = {
- .name = "Xe_LPG",
.hw_engine_mask =
BIT(XE_HW_ENGINE_RCS0) | BIT(XE_HW_ENGINE_BCS0) |
BIT(XE_HW_ENGINE_CCS0),
@@ -172,50 +148,54 @@ static const struct xe_graphics_desc graphics_xelpg = {
GENMASK(XE_HW_ENGINE_CCS3, XE_HW_ENGINE_CCS0)
static const struct xe_graphics_desc graphics_xe2 = {
- .name = "Xe2_LPG / Xe2_HPG / Xe3_LPG",
-
XE2_GFX_FEATURES,
};
static const struct xe_media_desc media_xem = {
- .name = "Xe_M",
- .ver = 12,
- .rel = 0,
-
- .hw_engine_mask =
- GENMASK(XE_HW_ENGINE_VCS7, XE_HW_ENGINE_VCS0) |
- GENMASK(XE_HW_ENGINE_VECS3, XE_HW_ENGINE_VECS0),
-};
-
-static const struct xe_media_desc media_xehpm = {
- .name = "Xe_HPM",
- .ver = 12,
- .rel = 55,
-
.hw_engine_mask =
GENMASK(XE_HW_ENGINE_VCS7, XE_HW_ENGINE_VCS0) |
GENMASK(XE_HW_ENGINE_VECS3, XE_HW_ENGINE_VECS0),
};
static const struct xe_media_desc media_xelpmp = {
- .name = "Xe_LPM+",
.hw_engine_mask =
GENMASK(XE_HW_ENGINE_VCS7, XE_HW_ENGINE_VCS0) |
GENMASK(XE_HW_ENGINE_VECS3, XE_HW_ENGINE_VECS0) |
BIT(XE_HW_ENGINE_GSCCS0)
};
-static const struct xe_media_desc media_xe2 = {
- .name = "Xe2_LPM / Xe2_HPM / Xe3_LPM",
- .hw_engine_mask =
- GENMASK(XE_HW_ENGINE_VCS7, XE_HW_ENGINE_VCS0) |
- GENMASK(XE_HW_ENGINE_VECS3, XE_HW_ENGINE_VECS0) |
- BIT(XE_HW_ENGINE_GSCCS0)
+/* Pre-GMDID Graphics IPs */
+static const struct xe_ip graphics_ip_xelp = { 1200, "Xe_LP", &graphics_xelp };
+static const struct xe_ip graphics_ip_xelpp = { 1210, "Xe_LP+", &graphics_xelp };
+static const struct xe_ip graphics_ip_xehpg = { 1255, "Xe_HPG", &graphics_xehpg };
+static const struct xe_ip graphics_ip_xehpc = { 1260, "Xe_HPC", &graphics_xehpc };
+
+/* GMDID-based Graphics IPs */
+static const struct xe_ip graphics_ips[] = {
+ { 1270, "Xe_LPG", &graphics_xelpg },
+ { 1271, "Xe_LPG", &graphics_xelpg },
+ { 1274, "Xe_LPG+", &graphics_xelpg },
+ { 2001, "Xe2_HPG", &graphics_xe2 },
+ { 2004, "Xe2_LPG", &graphics_xe2 },
+ { 3000, "Xe3_LPG", &graphics_xe2 },
+ { 3001, "Xe3_LPG", &graphics_xe2 },
+};
+
+/* Pre-GMDID Media IPs */
+static const struct xe_ip media_ip_xem = { 1200, "Xe_M", &media_xem };
+static const struct xe_ip media_ip_xehpm = { 1255, "Xe_HPM", &media_xem };
+
+/* GMDID-based Media IPs */
+static const struct xe_ip media_ips[] = {
+ { 1300, "Xe_LPM+", &media_xelpmp },
+ { 1301, "Xe2_HPM", &media_xelpmp },
+ { 2000, "Xe2_LPM", &media_xelpmp },
+ { 3000, "Xe3_LPM", &media_xelpmp },
};
static const struct xe_device_desc tgl_desc = {
- .graphics = &graphics_xelp,
- .media = &media_xem,
+ .pre_gmdid_graphics_ip = &graphics_ip_xelp,
+ .pre_gmdid_media_ip = &media_ip_xem,
PLATFORM(TIGERLAKE),
.dma_mask_size = 39,
.has_display = true,
@@ -224,8 +204,8 @@ static const struct xe_device_desc tgl_desc = {
};
static const struct xe_device_desc rkl_desc = {
- .graphics = &graphics_xelp,
- .media = &media_xem,
+ .pre_gmdid_graphics_ip = &graphics_ip_xelp,
+ .pre_gmdid_media_ip = &media_ip_xem,
PLATFORM(ROCKETLAKE),
.dma_mask_size = 39,
.has_display = true,
@@ -236,8 +216,8 @@ static const struct xe_device_desc rkl_desc = {
static const u16 adls_rpls_ids[] = { INTEL_RPLS_IDS(NOP), 0 };
static const struct xe_device_desc adl_s_desc = {
- .graphics = &graphics_xelp,
- .media = &media_xem,
+ .pre_gmdid_graphics_ip = &graphics_ip_xelp,
+ .pre_gmdid_media_ip = &media_ip_xem,
PLATFORM(ALDERLAKE_S),
.dma_mask_size = 39,
.has_display = true,
@@ -252,8 +232,8 @@ static const struct xe_device_desc adl_s_desc = {
static const u16 adlp_rplu_ids[] = { INTEL_RPLU_IDS(NOP), 0 };
static const struct xe_device_desc adl_p_desc = {
- .graphics = &graphics_xelp,
- .media = &media_xem,
+ .pre_gmdid_graphics_ip = &graphics_ip_xelp,
+ .pre_gmdid_media_ip = &media_ip_xem,
PLATFORM(ALDERLAKE_P),
.dma_mask_size = 39,
.has_display = true,
@@ -266,8 +246,8 @@ static const struct xe_device_desc adl_p_desc = {
};
static const struct xe_device_desc adl_n_desc = {
- .graphics = &graphics_xelp,
- .media = &media_xem,
+ .pre_gmdid_graphics_ip = &graphics_ip_xelp,
+ .pre_gmdid_media_ip = &media_ip_xem,
PLATFORM(ALDERLAKE_N),
.dma_mask_size = 39,
.has_display = true,
@@ -279,8 +259,8 @@ static const struct xe_device_desc adl_n_desc = {
.is_dgfx = 1
static const struct xe_device_desc dg1_desc = {
- .graphics = &graphics_xelpp,
- .media = &media_xem,
+ .pre_gmdid_graphics_ip = &graphics_ip_xelpp,
+ .pre_gmdid_media_ip = &media_ip_xem,
DGFX_FEATURES,
PLATFORM(DG1),
.dma_mask_size = 39,
@@ -305,8 +285,8 @@ static const u16 dg2_g12_ids[] = { INTEL_DG2_G12_IDS(NOP), 0 };
}
static const struct xe_device_desc ats_m_desc = {
- .graphics = &graphics_xehpg,
- .media = &media_xehpm,
+ .pre_gmdid_graphics_ip = &graphics_ip_xehpg,
+ .pre_gmdid_media_ip = &media_ip_xehpm,
.dma_mask_size = 46,
.require_force_probe = true,
@@ -315,8 +295,8 @@ static const struct xe_device_desc ats_m_desc = {
};
static const struct xe_device_desc dg2_desc = {
- .graphics = &graphics_xehpg,
- .media = &media_xehpm,
+ .pre_gmdid_graphics_ip = &graphics_ip_xehpg,
+ .pre_gmdid_media_ip = &media_ip_xehpm,
.dma_mask_size = 46,
.require_force_probe = true,
@@ -325,7 +305,7 @@ static const struct xe_device_desc dg2_desc = {
};
static const __maybe_unused struct xe_device_desc pvc_desc = {
- .graphics = &graphics_xehpc,
+ .pre_gmdid_graphics_ip = &graphics_ip_xehpc,
DGFX_FEATURES,
PLATFORM(PVC),
.dma_mask_size = 52,
@@ -370,25 +350,6 @@ static const struct xe_device_desc ptl_desc = {
#undef PLATFORM
__diag_pop();
-/* Map of GMD_ID values to graphics IP */
-static const struct gmdid_map graphics_ip_map[] = {
- { 1270, &graphics_xelpg },
- { 1271, &graphics_xelpg },
- { 1274, &graphics_xelpg }, /* Xe_LPG+ */
- { 2001, &graphics_xe2 },
- { 2004, &graphics_xe2 },
- { 3000, &graphics_xe2 },
- { 3001, &graphics_xe2 },
-};
-
-/* Map of GMD_ID values to media IP */
-static const struct gmdid_map media_ip_map[] = {
- { 1300, &media_xelpmp },
- { 1301, &media_xe2 },
- { 2000, &media_xe2 },
- { 3000, &media_xe2 },
-};
-
/*
* Make sure any device matches here are from most specific to most
* general. For example, since the Quanta match is based on the subsystem
@@ -549,66 +510,49 @@ static void read_gmdid(struct xe_device *xe, enum xe_gmdid_type type, u32 *ver,
}
/*
- * Pre-GMD_ID platform: device descriptor already points to the appropriate
- * graphics descriptor. Simply forward the description and calculate the version
- * appropriately. "graphics" should be present in all such platforms, while
- * media is optional.
- */
-static void handle_pre_gmdid(struct xe_device *xe,
- const struct xe_graphics_desc *graphics,
- const struct xe_media_desc *media)
-{
- xe->info.graphics_verx100 = graphics->ver * 100 + graphics->rel;
-
- if (media)
- xe->info.media_verx100 = media->ver * 100 + media->rel;
-
-}
-
-/*
- * GMD_ID platform: read IP version from hardware and select graphics descriptor
+ * Read IP version from hardware and select graphics/media IP descriptors
* based on the result.
*/
static void handle_gmdid(struct xe_device *xe,
- const struct xe_graphics_desc **graphics,
- const struct xe_media_desc **media,
+ const struct xe_ip **graphics_ip,
+ const struct xe_ip **media_ip,
u32 *graphics_revid,
u32 *media_revid)
{
u32 ver;
+ *graphics_ip = NULL;
+ *media_ip = NULL;
+
read_gmdid(xe, GMDID_GRAPHICS, &ver, graphics_revid);
- for (int i = 0; i < ARRAY_SIZE(graphics_ip_map); i++) {
- if (ver == graphics_ip_map[i].ver) {
- xe->info.graphics_verx100 = ver;
- *graphics = graphics_ip_map[i].ip;
+ for (int i = 0; i < ARRAY_SIZE(graphics_ips); i++) {
+ if (ver == graphics_ips[i].verx100) {
+ *graphics_ip = &graphics_ips[i];
break;
}
}
- if (!xe->info.graphics_verx100) {
+ if (!*graphics_ip) {
drm_err(&xe->drm, "Hardware reports unknown graphics version %u.%02u\n",
ver / 100, ver % 100);
}
read_gmdid(xe, GMDID_MEDIA, &ver, media_revid);
-
/* Media may legitimately be fused off / not present */
if (ver == 0)
return;
- for (int i = 0; i < ARRAY_SIZE(media_ip_map); i++) {
- if (ver == media_ip_map[i].ver) {
- xe->info.media_verx100 = ver;
- *media = media_ip_map[i].ip;
+ for (int i = 0; i < ARRAY_SIZE(media_ips); i++) {
+ if (ver == media_ips[i].verx100) {
+ *media_ip = &media_ips[i];
break;
}
}
- if (!xe->info.media_verx100) {
+ if (!*media_ip) {
drm_err(&xe->drm, "Hardware reports unknown media version %u.%02u\n",
ver / 100, ver % 100);
}
@@ -659,26 +603,31 @@ static int xe_info_init_early(struct xe_device *xe,
* present in device info.
*/
static int xe_info_init(struct xe_device *xe,
- const struct xe_graphics_desc *graphics_desc,
- const struct xe_media_desc *media_desc)
+ const struct xe_device_desc *desc)
{
u32 graphics_gmdid_revid = 0, media_gmdid_revid = 0;
+ const struct xe_ip *graphics_ip;
+ const struct xe_ip *media_ip;
+ const struct xe_graphics_desc *graphics_desc;
+ const struct xe_media_desc *media_desc;
struct xe_tile *tile;
struct xe_gt *gt;
u8 id;
/*
* If this platform supports GMD_ID, we'll detect the proper IP
- * descriptor to use from hardware registers. desc->graphics will only
- * ever be set at this point for platforms before GMD_ID. In that case
- * the IP descriptions and versions are simply derived from that.
+ * descriptor to use from hardware registers.
+ * desc->pre_gmdid_graphics_ip will only ever be set at this point for
+ * platforms before GMD_ID. In that case the IP descriptions and
+ * versions are simply derived from that.
*/
- if (graphics_desc) {
- handle_pre_gmdid(xe, graphics_desc, media_desc);
+ if (desc->pre_gmdid_graphics_ip) {
+ graphics_ip = desc->pre_gmdid_graphics_ip;
+ media_ip = desc->pre_gmdid_media_ip;
xe->info.step = xe_step_pre_gmdid_get(xe);
} else {
- xe_assert(xe, !media_desc);
- handle_gmdid(xe, &graphics_desc, &media_desc,
+ xe_assert(xe, !desc->pre_gmdid_media_ip);
+ handle_gmdid(xe, &graphics_ip, &media_ip,
&graphics_gmdid_revid, &media_gmdid_revid);
xe->info.step = xe_step_gmdid_get(xe,
graphics_gmdid_revid,
@@ -690,11 +639,21 @@ static int xe_info_init(struct xe_device *xe,
* error and we should abort driver load. Failing to detect media
* IP is non-fatal; we'll just proceed without enabling media support.
*/
- if (!graphics_desc)
+ if (!graphics_ip)
return -ENODEV;
- xe->info.graphics_name = graphics_desc->name;
- xe->info.media_name = media_desc ? media_desc->name : "none";
+ xe->info.graphics_verx100 = graphics_ip->verx100;
+ xe->info.graphics_name = graphics_ip->name;
+ graphics_desc = graphics_ip->desc;
+
+ if (media_ip) {
+ xe->info.media_verx100 = media_ip->verx100;
+ xe->info.media_name = media_ip->name;
+ media_desc = media_ip->desc;
+ } else {
+ xe->info.media_name = "none";
+ media_desc = NULL;
+ }
xe->info.vram_flags = graphics_desc->vram_flags;
xe->info.va_bits = graphics_desc->va_bits;
@@ -765,21 +724,16 @@ static int xe_info_init(struct xe_device *xe,
static void xe_pci_remove(struct pci_dev *pdev)
{
- struct xe_device *xe;
-
- xe = pdev_to_xe_device(pdev);
- if (!xe) /* driver load aborted, nothing to cleanup */
- return;
+ struct xe_device *xe = pdev_to_xe_device(pdev);
if (IS_SRIOV_PF(xe))
xe_pci_sriov_configure(pdev, 0);
- if (xe_survivability_mode_enabled(xe))
- return xe_survivability_mode_remove(xe);
+ if (xe_survivability_mode_is_enabled(xe))
+ return;
xe_device_remove(xe);
xe_pm_runtime_fini(xe);
- pci_set_drvdata(pdev, NULL);
}
/*
@@ -851,19 +805,20 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
err = xe_device_probe_early(xe);
/*
- * In Boot Survivability mode, no drm card is exposed
- * and driver is loaded with bare minimum to allow
- * for firmware to be flashed through mei. Return
- * success if survivability mode is enabled.
+ * In Boot Survivability mode, no drm card is exposed and driver is
+ * loaded with bare minimum to allow for firmware to be flashed through
+ * mei. If early probe fails, check if survivability mode is flagged by
+ * HW to be enabled. In that case enable it and return success.
*/
if (err) {
- if (xe_survivability_mode_enabled(xe))
+ if (xe_survivability_mode_required(xe) &&
+ xe_survivability_mode_enable(xe))
return 0;
return err;
}
- err = xe_info_init(xe, desc->graphics, desc->media);
+ err = xe_info_init(xe, desc);
if (err)
return err;
@@ -900,10 +855,8 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
return err;
err = xe_device_probe(xe);
- if (err) {
- xe_device_call_remove_actions(xe);
+ if (err)
return err;
- }
err = xe_pm_init(xe);
if (err)
@@ -953,7 +906,7 @@ static int xe_pci_suspend(struct device *dev)
struct xe_device *xe = pdev_to_xe_device(pdev);
int err;
- if (xe_survivability_mode_enabled(xe))
+ if (xe_survivability_mode_is_enabled(xe))
return -EBUSY;
err = xe_pm_suspend(xe);
diff --git a/drivers/gpu/drm/xe/xe_pci_sriov.c b/drivers/gpu/drm/xe/xe_pci_sriov.c
index aaceee748287..09ee8a06fe2e 100644
--- a/drivers/gpu/drm/xe/xe_pci_sriov.c
+++ b/drivers/gpu/drm/xe/xe_pci_sriov.c
@@ -62,6 +62,55 @@ static void pf_reset_vfs(struct xe_device *xe, unsigned int num_vfs)
xe_gt_sriov_pf_control_trigger_flr(gt, n);
}
+static struct pci_dev *xe_pci_pf_get_vf_dev(struct xe_device *xe, unsigned int vf_id)
+{
+ struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
+
+ xe_assert(xe, IS_SRIOV_PF(xe));
+
+ /* caller must use pci_dev_put() */
+ return pci_get_domain_bus_and_slot(pci_domain_nr(pdev->bus),
+ pdev->bus->number,
+ pci_iov_virtfn_devfn(pdev, vf_id));
+}
+
+static void pf_link_vfs(struct xe_device *xe, int num_vfs)
+{
+ struct pci_dev *pdev_pf = to_pci_dev(xe->drm.dev);
+ struct device_link *link;
+ struct pci_dev *pdev_vf;
+ unsigned int n;
+
+ /*
+ * When both PF and VF devices are enabled on the host, they resume in
+ * parallel during system resume.
+ *
+ * But the PF has to finish provisioning the VFs before any VF can
+ * resume successfully.
+ *
+ * Create a parent-child device link between the PF and VF devices that
+ * will enforce the correct resume order.
+ */
+ for (n = 1; n <= num_vfs; n++) {
+ pdev_vf = xe_pci_pf_get_vf_dev(xe, n - 1);
+
+ /* unlikely, something weird is happening, abort */
+ if (!pdev_vf) {
+ xe_sriov_err(xe, "Cannot find VF%u device, aborting link%s creation!\n",
+ n, str_plural(num_vfs));
+ break;
+ }
+
+ link = device_link_add(&pdev_vf->dev, &pdev_pf->dev,
+ DL_FLAG_AUTOREMOVE_CONSUMER);
+ /* unlikely and harmless, continue with other VFs */
+ if (!link)
+ xe_sriov_notice(xe, "Failed linking VF%u\n", n);
+
+ pci_dev_put(pdev_vf);
+ }
+}
+
static int pf_enable_vfs(struct xe_device *xe, int num_vfs)
{
struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
@@ -92,6 +141,8 @@ static int pf_enable_vfs(struct xe_device *xe, int num_vfs)
if (err < 0)
goto failed;
+ pf_link_vfs(xe, num_vfs);
+
xe_sriov_info(xe, "Enabled %u of %u VF%s\n",
num_vfs, total_vfs, str_plural(total_vfs));
return num_vfs;
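
pf_link_vfs() uses the driver core's device-link machinery to express the PF-before-VF resume dependency; DL_FLAG_AUTOREMOVE_CONSUMER makes the link disappear automatically when the consumer (VF) driver unbinds. A generic sketch of the same pattern with illustrative names:

    #include <linux/device.h>

    /*
     * Make 'consumer' resume only after 'supplier' has resumed; the link is
     * dropped automatically when the consumer's driver unbinds.
     */
    static int demo_order_resume(struct device *consumer, struct device *supplier)
    {
            struct device_link *link;

            link = device_link_add(consumer, supplier, DL_FLAG_AUTOREMOVE_CONSUMER);

            /* The driver above merely warns on failure; failing hard is a policy choice. */
            return link ? 0 : -ENODEV;
    }
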
diff --git a/drivers/gpu/drm/xe/xe_pci_types.h b/drivers/gpu/drm/xe/xe_pci_types.h
index b96423844952..e9b9bbc138d3 100644
--- a/drivers/gpu/drm/xe/xe_pci_types.h
+++ b/drivers/gpu/drm/xe/xe_pci_types.h
@@ -9,10 +9,6 @@
#include <linux/types.h>
struct xe_graphics_desc {
- const char *name;
- u8 ver;
- u8 rel;
-
u8 va_bits;
u8 vm_max_level;
u8 vram_flags;
@@ -28,18 +24,15 @@ struct xe_graphics_desc {
};
struct xe_media_desc {
- const char *name;
- u8 ver;
- u8 rel;
-
u64 hw_engine_mask; /* hardware engines provided by media IP */
u8 has_indirect_ring_state:1;
};
-struct gmdid_map {
- unsigned int ver;
- const void *ip;
+struct xe_ip {
+ unsigned int verx100;
+ const char *name;
+ const void *desc;
};
#endif
diff --git a/drivers/gpu/drm/xe/xe_pmu.c b/drivers/gpu/drm/xe/xe_pmu.c
index 3910a82328ee..4f62a6e515d6 100644
--- a/drivers/gpu/drm/xe/xe_pmu.c
+++ b/drivers/gpu/drm/xe/xe_pmu.c
@@ -7,16 +7,18 @@
#include <linux/device.h>
#include "xe_device.h"
+#include "xe_force_wake.h"
#include "xe_gt_idle.h"
+#include "xe_guc_engine_activity.h"
+#include "xe_hw_engine.h"
#include "xe_pm.h"
#include "xe_pmu.h"
/**
* DOC: Xe PMU (Performance Monitoring Unit)
*
- * Expose events/counters like GT-C6 residency and GT frequency to user land via
- * the perf interface. Events are per device. The GT can be selected with an
- * extra config sub-field (bits 60-63).
+ * Expose events/counters like GT-C6 residency, GT frequency and per-class-engine
+ * activity to user land via the perf interface. Events are per device.
*
* All events are listed in sysfs:
*
@@ -24,7 +26,18 @@
* $ ls /sys/bus/event_source/devices/xe_0000_00_02.0/events/
* $ ls /sys/bus/event_source/devices/xe_0000_00_02.0/format/
*
- * The format directory has info regarding the configs that can be used.
+ * The following format parameters are available to read events,
+ * but only a few are valid with each event:
+ *
+ * gt[60:63] Selects gt for the event
+ * engine_class[20:27] Selects engine-class for event
+ * engine_instance[12:19] Selects the engine-instance for the event
+ *
+ * For engine specific events (engine-*), gt, engine_class and engine_instance parameters must be
+ * set as populated by DRM_XE_DEVICE_QUERY_ENGINES.
+ *
+ * For gt specific events (gt-*), the gt parameter must be passed. All other parameters will be 0.
+ *
* The standard perf tool can be used to grep for a certain event as well.
* Example:
*
@@ -35,20 +48,34 @@
* $ perf stat -e <event_name,gt=> -I <interval>
*/
-#define XE_PMU_EVENT_GT_MASK GENMASK_ULL(63, 60)
-#define XE_PMU_EVENT_ID_MASK GENMASK_ULL(11, 0)
+#define XE_PMU_EVENT_GT_MASK GENMASK_ULL(63, 60)
+#define XE_PMU_EVENT_ENGINE_CLASS_MASK GENMASK_ULL(27, 20)
+#define XE_PMU_EVENT_ENGINE_INSTANCE_MASK GENMASK_ULL(19, 12)
+#define XE_PMU_EVENT_ID_MASK GENMASK_ULL(11, 0)
static unsigned int config_to_event_id(u64 config)
{
return FIELD_GET(XE_PMU_EVENT_ID_MASK, config);
}
+static unsigned int config_to_engine_class(u64 config)
+{
+ return FIELD_GET(XE_PMU_EVENT_ENGINE_CLASS_MASK, config);
+}
+
+static unsigned int config_to_engine_instance(u64 config)
+{
+ return FIELD_GET(XE_PMU_EVENT_ENGINE_INSTANCE_MASK, config);
+}
+
static unsigned int config_to_gt_id(u64 config)
{
return FIELD_GET(XE_PMU_EVENT_GT_MASK, config);
}
-#define XE_PMU_EVENT_GT_C6_RESIDENCY 0x01
+#define XE_PMU_EVENT_GT_C6_RESIDENCY 0x01
+#define XE_PMU_EVENT_ENGINE_ACTIVE_TICKS 0x02
+#define XE_PMU_EVENT_ENGINE_TOTAL_TICKS 0x03
static struct xe_gt *event_to_gt(struct perf_event *event)
{
@@ -58,6 +85,59 @@ static struct xe_gt *event_to_gt(struct perf_event *event)
return xe_device_get_gt(xe, gt);
}
+static struct xe_hw_engine *event_to_hwe(struct perf_event *event)
+{
+ struct xe_device *xe = container_of(event->pmu, typeof(*xe), pmu.base);
+ struct drm_xe_engine_class_instance eci;
+ u64 config = event->attr.config;
+ struct xe_hw_engine *hwe;
+
+ eci.engine_class = config_to_engine_class(config);
+ eci.engine_instance = config_to_engine_instance(config);
+ eci.gt_id = config_to_gt_id(config);
+
+ hwe = xe_hw_engine_lookup(xe, eci);
+ if (!hwe || xe_hw_engine_is_reserved(hwe))
+ return NULL;
+
+ return hwe;
+}
+
+static bool is_engine_event(u64 config)
+{
+ unsigned int event_id = config_to_event_id(config);
+
+ return (event_id == XE_PMU_EVENT_ENGINE_TOTAL_TICKS ||
+ event_id == XE_PMU_EVENT_ENGINE_ACTIVE_TICKS);
+}
+
+static bool event_gt_forcewake(struct perf_event *event)
+{
+ struct xe_device *xe = container_of(event->pmu, typeof(*xe), pmu.base);
+ u64 config = event->attr.config;
+ struct xe_gt *gt;
+ unsigned int *fw_ref;
+
+ if (!is_engine_event(config))
+ return true;
+
+ gt = xe_device_get_gt(xe, config_to_gt_id(config));
+
+ fw_ref = kzalloc(sizeof(*fw_ref), GFP_KERNEL);
+ if (!fw_ref)
+ return false;
+
+ *fw_ref = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
+ if (!*fw_ref) {
+ kfree(fw_ref);
+ return false;
+ }
+
+ event->pmu_private = fw_ref;
+
+ return true;
+}
+
static bool event_supported(struct xe_pmu *pmu, unsigned int gt,
unsigned int id)
{
@@ -68,9 +148,47 @@ static bool event_supported(struct xe_pmu *pmu, unsigned int gt,
pmu->supported_events & BIT_ULL(id);
}
+static bool event_param_valid(struct perf_event *event)
+{
+ struct xe_device *xe = container_of(event->pmu, typeof(*xe), pmu.base);
+ unsigned int engine_class, engine_instance;
+ u64 config = event->attr.config;
+ struct xe_gt *gt;
+
+ gt = xe_device_get_gt(xe, config_to_gt_id(config));
+ if (!gt)
+ return false;
+
+ engine_class = config_to_engine_class(config);
+ engine_instance = config_to_engine_instance(config);
+
+ switch (config_to_event_id(config)) {
+ case XE_PMU_EVENT_GT_C6_RESIDENCY:
+ if (engine_class || engine_instance)
+ return false;
+ break;
+ case XE_PMU_EVENT_ENGINE_ACTIVE_TICKS:
+ case XE_PMU_EVENT_ENGINE_TOTAL_TICKS:
+ if (!event_to_hwe(event))
+ return false;
+ break;
+ }
+
+ return true;
+}
+
static void xe_pmu_event_destroy(struct perf_event *event)
{
struct xe_device *xe = container_of(event->pmu, typeof(*xe), pmu.base);
+ struct xe_gt *gt;
+ unsigned int *fw_ref = event->pmu_private;
+
+ if (fw_ref) {
+ gt = xe_device_get_gt(xe, config_to_gt_id(event->attr.config));
+ xe_force_wake_put(gt_to_fw(gt), *fw_ref);
+ kfree(fw_ref);
+ event->pmu_private = NULL;
+ }
drm_WARN_ON(&xe->drm, event->parent);
xe_pm_runtime_put(xe);
@@ -104,15 +222,37 @@ static int xe_pmu_event_init(struct perf_event *event)
if (has_branch_stack(event))
return -EOPNOTSUPP;
+ if (!event_param_valid(event))
+ return -ENOENT;
+
if (!event->parent) {
drm_dev_get(&xe->drm);
xe_pm_runtime_get(xe);
+ if (!event_gt_forcewake(event)) {
+ xe_pm_runtime_put(xe);
+ drm_dev_put(&xe->drm);
+ return -EINVAL;
+ }
event->destroy = xe_pmu_event_destroy;
}
return 0;
}
+static u64 read_engine_events(struct xe_gt *gt, struct perf_event *event)
+{
+ struct xe_hw_engine *hwe;
+ u64 val = 0;
+
+ hwe = event_to_hwe(event);
+ if (config_to_event_id(event->attr.config) == XE_PMU_EVENT_ENGINE_ACTIVE_TICKS)
+ val = xe_guc_engine_activity_active_ticks(&gt->uc.guc, hwe);
+ else
+ val = xe_guc_engine_activity_total_ticks(&gt->uc.guc, hwe);
+
+ return val;
+}
+
static u64 __xe_pmu_event_read(struct perf_event *event)
{
struct xe_gt *gt = event_to_gt(event);
@@ -123,6 +263,9 @@ static u64 __xe_pmu_event_read(struct perf_event *event)
switch (config_to_event_id(event->attr.config)) {
case XE_PMU_EVENT_GT_C6_RESIDENCY:
return xe_gt_idle_residency_msec(&gt->gtidle);
+ case XE_PMU_EVENT_ENGINE_ACTIVE_TICKS:
+ case XE_PMU_EVENT_ENGINE_TOTAL_TICKS:
+ return read_engine_events(gt, event);
}
return 0;
@@ -207,11 +350,15 @@ static void xe_pmu_event_del(struct perf_event *event, int flags)
xe_pmu_event_stop(event, PERF_EF_UPDATE);
}
-PMU_FORMAT_ATTR(gt, "config:60-63");
-PMU_FORMAT_ATTR(event, "config:0-11");
+PMU_FORMAT_ATTR(gt, "config:60-63");
+PMU_FORMAT_ATTR(engine_class, "config:20-27");
+PMU_FORMAT_ATTR(engine_instance, "config:12-19");
+PMU_FORMAT_ATTR(event, "config:0-11");
static struct attribute *pmu_format_attrs[] = {
&format_attr_event.attr,
+ &format_attr_engine_class.attr,
+ &format_attr_engine_instance.attr,
&format_attr_gt.attr,
NULL,
};
@@ -270,6 +417,8 @@ static ssize_t event_attr_show(struct device *dev,
XE_EVENT_ATTR_GROUP(v_, id_, &pmu_event_ ##v_.attr.attr)
XE_EVENT_ATTR_SIMPLE(gt-c6-residency, gt_c6_residency, XE_PMU_EVENT_GT_C6_RESIDENCY, "ms");
+XE_EVENT_ATTR_NOUNIT(engine-active-ticks, engine_active_ticks, XE_PMU_EVENT_ENGINE_ACTIVE_TICKS);
+XE_EVENT_ATTR_NOUNIT(engine-total-ticks, engine_total_ticks, XE_PMU_EVENT_ENGINE_TOTAL_TICKS);
static struct attribute *pmu_empty_event_attrs[] = {
/* Empty - all events are added as groups with .attr_update() */
@@ -283,15 +432,23 @@ static const struct attribute_group pmu_events_attr_group = {
static const struct attribute_group *pmu_events_attr_update[] = {
&pmu_group_gt_c6_residency,
+ &pmu_group_engine_active_ticks,
+ &pmu_group_engine_total_ticks,
NULL,
};
static void set_supported_events(struct xe_pmu *pmu)
{
struct xe_device *xe = container_of(pmu, typeof(*xe), pmu);
+ struct xe_gt *gt = xe_device_get_gt(xe, 0);
if (!xe->info.skip_guc_pc)
pmu->supported_events |= BIT_ULL(XE_PMU_EVENT_GT_C6_RESIDENCY);
+
+ if (xe_guc_engine_activity_supported(&gt->uc.guc)) {
+ pmu->supported_events |= BIT_ULL(XE_PMU_EVENT_ENGINE_ACTIVE_TICKS);
+ pmu->supported_events |= BIT_ULL(XE_PMU_EVENT_ENGINE_TOTAL_TICKS);
+ }
}
/**
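
With the new format fields, a perf config value for an engine event packs the event id, engine instance, engine class and gt id into the bit ranges advertised in sysfs. A hedged sketch of assembling such a config the same way the driver decodes it, FIELD_PREP over the masks defined in the xe_pmu.c hunk above (the demo function name is made up):

    #include <linux/bitfield.h>
    #include <linux/bits.h>
    #include <linux/types.h>

    #define XE_PMU_EVENT_GT_MASK                 GENMASK_ULL(63, 60)
    #define XE_PMU_EVENT_ENGINE_CLASS_MASK       GENMASK_ULL(27, 20)
    #define XE_PMU_EVENT_ENGINE_INSTANCE_MASK    GENMASK_ULL(19, 12)
    #define XE_PMU_EVENT_ID_MASK                 GENMASK_ULL(11, 0)

    /* e.g. engine-active-ticks (0x02) for gt 0, class 0, instance 0 */
    static u64 demo_engine_event_config(u8 gt, u8 class, u8 instance, u16 event_id)
    {
            return FIELD_PREP(XE_PMU_EVENT_GT_MASK, gt) |
                   FIELD_PREP(XE_PMU_EVENT_ENGINE_CLASS_MASK, class) |
                   FIELD_PREP(XE_PMU_EVENT_ENGINE_INSTANCE_MASK, instance) |
                   FIELD_PREP(XE_PMU_EVENT_ID_MASK, event_id);
    }

On the perf command line the same selection would presumably look like
$ perf stat -e xe_<bdf>/engine-active-ticks,gt=0,engine_class=0,engine_instance=0/ -I <interval>
(syntax inferred from the sysfs format attributes above).
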
diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
index 1ddcc7e79a93..ffaf0d02dc7d 100644
--- a/drivers/gpu/drm/xe/xe_pt.c
+++ b/drivers/gpu/drm/xe/xe_pt.c
@@ -20,6 +20,7 @@
#include "xe_res_cursor.h"
#include "xe_sched_job.h"
#include "xe_sync.h"
+#include "xe_svm.h"
#include "xe_trace.h"
#include "xe_ttm_stolen_mgr.h"
#include "xe_vm.h"
@@ -28,6 +29,8 @@ struct xe_pt_dir {
struct xe_pt pt;
/** @children: Array of page-table child nodes */
struct xe_ptw *children[XE_PDES];
+ /** @staging: Array of page-table staging nodes */
+ struct xe_ptw *staging[XE_PDES];
};
#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
@@ -48,9 +51,10 @@ static struct xe_pt_dir *as_xe_pt_dir(struct xe_pt *pt)
return container_of(pt, struct xe_pt_dir, pt);
}
-static struct xe_pt *xe_pt_entry(struct xe_pt_dir *pt_dir, unsigned int index)
+static struct xe_pt *
+xe_pt_entry_staging(struct xe_pt_dir *pt_dir, unsigned int index)
{
- return container_of(pt_dir->children[index], struct xe_pt, base);
+ return container_of(pt_dir->staging[index], struct xe_pt, base);
}
static u64 __xe_pt_empty_pte(struct xe_tile *tile, struct xe_vm *vm,
@@ -125,6 +129,7 @@ struct xe_pt *xe_pt_create(struct xe_vm *vm, struct xe_tile *tile,
}
pt->bo = bo;
pt->base.children = level ? as_xe_pt_dir(pt)->children : NULL;
+ pt->base.staging = level ? as_xe_pt_dir(pt)->staging : NULL;
if (vm->xef)
xe_drm_client_add_bo(vm->xef->client, pt->bo);
@@ -206,8 +211,8 @@ void xe_pt_destroy(struct xe_pt *pt, u32 flags, struct llist_head *deferred)
struct xe_pt_dir *pt_dir = as_xe_pt_dir(pt);
for (i = 0; i < XE_PDES; i++) {
- if (xe_pt_entry(pt_dir, i))
- xe_pt_destroy(xe_pt_entry(pt_dir, i), flags,
+ if (xe_pt_entry_staging(pt_dir, i))
+ xe_pt_destroy(xe_pt_entry_staging(pt_dir, i), flags,
deferred);
}
}
@@ -215,6 +220,20 @@ void xe_pt_destroy(struct xe_pt *pt, u32 flags, struct llist_head *deferred)
}
/**
+ * xe_pt_clear() - Clear a page-table.
+ * @xe: xe device.
+ * @pt: The page-table.
+ *
+ * Clears page-table by setting to zero.
+ */
+void xe_pt_clear(struct xe_device *xe, struct xe_pt *pt)
+{
+ struct iosys_map *map = &pt->bo->vmap;
+
+ xe_map_memset(xe, map, 0, 0, SZ_4K);
+}
+
+/**
* DOC: Pagetable building
*
* Below we use the term "page-table" for both page-directories, containing
@@ -376,8 +395,10 @@ xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent,
/* Continue building a non-connected subtree. */
struct iosys_map *map = &parent->bo->vmap;
- if (unlikely(xe_child))
+ if (unlikely(xe_child)) {
parent->base.children[offset] = &xe_child->base;
+ parent->base.staging[offset] = &xe_child->base;
+ }
xe_pt_write(xe_walk->vm->xe, map, offset, pte);
parent->num_live++;
@@ -587,6 +608,7 @@ static const struct xe_pt_walk_ops xe_pt_stage_bind_ops = {
* range.
* @tile: The tile we're building for.
* @vma: The vma indicating the address range.
+ * @range: The range indicating the address range.
* @entries: Storage for the update entries used for connecting the tree to
* the main tree at commit time.
* @num_entries: On output contains the number of @entries used.
@@ -602,6 +624,7 @@ static const struct xe_pt_walk_ops xe_pt_stage_bind_ops = {
*/
static int
xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
+ struct xe_svm_range *range,
struct xe_vm_pgtable_update *entries, u32 *num_entries)
{
struct xe_device *xe = tile_to_xe(tile);
@@ -614,18 +637,48 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
.ops = &xe_pt_stage_bind_ops,
.shifts = xe_normal_pt_shifts,
.max_level = XE_PT_HIGHEST_LEVEL,
+ .staging = true,
},
.vm = xe_vma_vm(vma),
.tile = tile,
.curs = &curs,
- .va_curs_start = xe_vma_start(vma),
+ .va_curs_start = range ? range->base.itree.start :
+ xe_vma_start(vma),
.vma = vma,
.wupd.entries = entries,
- .needs_64K = (xe_vma_vm(vma)->flags & XE_VM_FLAG_64K) && is_devmem,
};
struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
int ret;
+ if (range) {
+ /* Move this entire thing to xe_svm.c? */
+ xe_svm_notifier_lock(xe_vma_vm(vma));
+ if (!xe_svm_range_pages_valid(range)) {
+ xe_svm_range_debug(range, "BIND PREPARE - RETRY");
+ xe_svm_notifier_unlock(xe_vma_vm(vma));
+ return -EAGAIN;
+ }
+ if (xe_svm_range_has_dma_mapping(range)) {
+ xe_res_first_dma(range->base.dma_addr, 0,
+ range->base.itree.last + 1 - range->base.itree.start,
+ &curs);
+ is_devmem = xe_res_is_vram(&curs);
+ if (is_devmem)
+ xe_svm_range_debug(range, "BIND PREPARE - DMA VRAM");
+ else
+ xe_svm_range_debug(range, "BIND PREPARE - DMA");
+ } else {
+ xe_assert(xe, false);
+ }
+ /*
+ * Note that the dma addresses in the resource cursor may become stale
+ * once we unlock, but the bind will be aborted anyway at commit time.
+ */
+ xe_svm_notifier_unlock(xe_vma_vm(vma));
+ }
+
+ xe_walk.needs_64K = (xe_vma_vm(vma)->flags & XE_VM_FLAG_64K) && is_devmem;
+
/**
* Default atomic expectations for different allocation scenarios are as follows:
*
@@ -647,7 +700,7 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
* gets migrated to LMEM, bind such allocations with
* device atomics enabled.
*/
- else if (is_devmem && !xe_bo_has_single_placement(bo))
+ else if (is_devmem)
xe_walk.default_pte |= XE_USM_PPGTT_PTE_AE;
} else {
xe_walk.default_pte |= XE_USM_PPGTT_PTE_AE;
@@ -663,15 +716,16 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
if (is_devmem) {
xe_walk.default_pte |= XE_PPGTT_PTE_DM;
- xe_walk.dma_offset = vram_region_gpu_offset(bo->ttm.resource);
+ xe_walk.dma_offset = bo ? vram_region_gpu_offset(bo->ttm.resource) : 0;
}
if (!xe_vma_has_no_bo(vma) && xe_bo_is_stolen(bo))
xe_walk.dma_offset = xe_ttm_stolen_gpu_offset(xe_bo_device(bo));
- xe_bo_assert_held(bo);
+ if (!range)
+ xe_bo_assert_held(bo);
- if (!xe_vma_is_null(vma)) {
+ if (!xe_vma_is_null(vma) && !range) {
if (xe_vma_is_userptr(vma))
xe_res_first_sg(to_userptr_vma(vma)->userptr.sg, 0,
xe_vma_size(vma), &curs);
@@ -681,12 +735,14 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
else
xe_res_first_sg(xe_bo_sg(bo), xe_vma_bo_offset(vma),
xe_vma_size(vma), &curs);
- } else {
+ } else if (!range) {
curs.size = xe_vma_size(vma);
}
- ret = xe_pt_walk_range(&pt->base, pt->level, xe_vma_start(vma),
- xe_vma_end(vma), &xe_walk.base);
+ ret = xe_pt_walk_range(&pt->base, pt->level,
+ range ? range->base.itree.start : xe_vma_start(vma),
+ range ? range->base.itree.last + 1 : xe_vma_end(vma),
+ &xe_walk.base);
*num_entries = xe_walk.wupd.num_used_entries;
return ret;
@@ -830,6 +886,46 @@ bool xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma)
return xe_walk.needs_invalidate;
}
+/**
+ * xe_pt_zap_ptes_range() - Zap (zero) gpu ptes of a SVM range
+ * @tile: The tile we're zapping for.
+ * @vm: The VM we're zapping for.
+ * @range: The SVM range we're zapping for.
+ *
+ * SVM invalidation needs to be able to zap the gpu ptes of a given address
+ * range. In order to be able to do that, the function needs access to the
+ * shared page-table entries so it can either clear the leaf PTEs or
+ * clear the pointers to lower-level page-tables. The caller is required
+ * to hold the SVM notifier lock.
+ *
+ * Return: Whether ptes were actually updated and a TLB invalidation is
+ * required.
+ */
+bool xe_pt_zap_ptes_range(struct xe_tile *tile, struct xe_vm *vm,
+ struct xe_svm_range *range)
+{
+ struct xe_pt_zap_ptes_walk xe_walk = {
+ .base = {
+ .ops = &xe_pt_zap_ptes_ops,
+ .shifts = xe_normal_pt_shifts,
+ .max_level = XE_PT_HIGHEST_LEVEL,
+ },
+ .tile = tile,
+ };
+ struct xe_pt *pt = vm->pt_root[tile->id];
+ u8 pt_mask = (range->tile_present & ~range->tile_invalidated);
+
+ xe_svm_assert_in_notifier(vm);
+
+ if (!(pt_mask & BIT(tile->id)))
+ return false;
+
+ (void)xe_pt_walk_shared(&pt->base, pt->level, range->base.itree.start,
+ range->base.itree.last + 1, &xe_walk.base);
+
+ return xe_walk.needs_invalidate;
+}
+
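To make the contract above concrete, here is a minimal, illustrative-only sketch of how the notifier path is expected to drive this helper; the hypothetical name zap_range_sketch() is not part of the patch, and the real caller is xe_svm_range_notifier_event_begin()/xe_svm_invalidate() in xe_svm.c later in this diff:

/* Sketch only: assumes the usual xe headers and that the caller holds
 * the SVM notifier lock; returns a mask of tiles needing TLB invalidation.
 */
static u8 zap_range_sketch(struct xe_vm *vm, struct xe_svm_range *range)
{
	struct xe_device *xe = vm->xe;
	struct xe_tile *tile;
	u8 tile_mask = 0;
	u8 id;

	xe_svm_assert_in_notifier(vm);

	/* Zap GPU PTEs on every tile; remember which tiles actually changed */
	for_each_tile(tile, xe, id)
		if (xe_pt_zap_ptes_range(tile, vm, range))
			tile_mask |= BIT(id);

	/* Order the PTE writes before the TLB invalidation that must follow */
	if (tile_mask)
		xe_device_wmb(xe);

	return tile_mask;
}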
static void
xe_vm_populate_pgtable(struct xe_migrate_pt_update *pt_update, struct xe_tile *tile,
struct iosys_map *map, void *data,
@@ -873,18 +969,38 @@ static void xe_pt_cancel_bind(struct xe_vma *vma,
}
}
-static void xe_pt_commit_locks_assert(struct xe_vma *vma)
+#define XE_INVALID_VMA ((struct xe_vma *)(0xdeaddeadull))
+
+static void xe_pt_commit_prepare_locks_assert(struct xe_vma *vma)
{
- struct xe_vm *vm = xe_vma_vm(vma);
+ struct xe_vm *vm;
+ if (vma == XE_INVALID_VMA)
+ return;
+
+ vm = xe_vma_vm(vma);
lockdep_assert_held(&vm->lock);
- if (!xe_vma_is_userptr(vma) && !xe_vma_is_null(vma))
+ if (!xe_vma_has_no_bo(vma))
dma_resv_assert_held(xe_vma_bo(vma)->ttm.base.resv);
xe_vm_assert_held(vm);
}
+static void xe_pt_commit_locks_assert(struct xe_vma *vma)
+{
+ struct xe_vm *vm;
+
+ if (vma == XE_INVALID_VMA)
+ return;
+
+ vm = xe_vma_vm(vma);
+ xe_pt_commit_prepare_locks_assert(vma);
+
+ if (xe_vma_is_userptr(vma))
+ lockdep_assert_held_read(&vm->userptr.notifier_lock);
+}
+
static void xe_pt_commit(struct xe_vma *vma,
struct xe_vm_pgtable_update *entries,
u32 num_entries, struct llist_head *deferred)
@@ -895,14 +1011,19 @@ static void xe_pt_commit(struct xe_vma *vma,
for (i = 0; i < num_entries; i++) {
struct xe_pt *pt = entries[i].pt;
+ struct xe_pt_dir *pt_dir;
if (!pt->level)
continue;
+ pt_dir = as_xe_pt_dir(pt);
for (j = 0; j < entries[i].qwords; j++) {
struct xe_pt *oldpte = entries[i].pt_entries[j].pt;
+ int j_ = j + entries[i].ofs;
- xe_pt_destroy(oldpte, xe_vma_vm(vma)->flags, deferred);
+ pt_dir->children[j_] = pt_dir->staging[j_];
+ xe_pt_destroy(oldpte, (vma == XE_INVALID_VMA) ? 0 :
+ xe_vma_vm(vma)->flags, deferred);
}
}
}
@@ -913,7 +1034,7 @@ static void xe_pt_abort_bind(struct xe_vma *vma,
{
int i, j;
- xe_pt_commit_locks_assert(vma);
+ xe_pt_commit_prepare_locks_assert(vma);
for (i = num_entries - 1; i >= 0; --i) {
struct xe_pt *pt = entries[i].pt;
@@ -928,10 +1049,10 @@ static void xe_pt_abort_bind(struct xe_vma *vma,
pt_dir = as_xe_pt_dir(pt);
for (j = 0; j < entries[i].qwords; j++) {
u32 j_ = j + entries[i].ofs;
- struct xe_pt *newpte = xe_pt_entry(pt_dir, j_);
+ struct xe_pt *newpte = xe_pt_entry_staging(pt_dir, j_);
struct xe_pt *oldpte = entries[i].pt_entries[j].pt;
- pt_dir->children[j_] = oldpte ? &oldpte->base : 0;
+ pt_dir->staging[j_] = oldpte ? &oldpte->base : 0;
xe_pt_destroy(newpte, xe_vma_vm(vma)->flags, NULL);
}
}
@@ -943,7 +1064,7 @@ static void xe_pt_commit_prepare_bind(struct xe_vma *vma,
{
u32 i, j;
- xe_pt_commit_locks_assert(vma);
+ xe_pt_commit_prepare_locks_assert(vma);
for (i = 0; i < num_entries; i++) {
struct xe_pt *pt = entries[i].pt;
@@ -961,10 +1082,10 @@ static void xe_pt_commit_prepare_bind(struct xe_vma *vma,
struct xe_pt *newpte = entries[i].pt_entries[j].pt;
struct xe_pt *oldpte = NULL;
- if (xe_pt_entry(pt_dir, j_))
- oldpte = xe_pt_entry(pt_dir, j_);
+ if (xe_pt_entry_staging(pt_dir, j_))
+ oldpte = xe_pt_entry_staging(pt_dir, j_);
- pt_dir->children[j_] = &newpte->base;
+ pt_dir->staging[j_] = &newpte->base;
entries[i].pt_entries[j].pt = oldpte;
}
}
@@ -981,12 +1102,13 @@ static void xe_pt_free_bind(struct xe_vm_pgtable_update *entries,
static int
xe_pt_prepare_bind(struct xe_tile *tile, struct xe_vma *vma,
+ struct xe_svm_range *range,
struct xe_vm_pgtable_update *entries, u32 *num_entries)
{
int err;
*num_entries = 0;
- err = xe_pt_stage_bind(tile, vma, entries, num_entries);
+ err = xe_pt_stage_bind(tile, vma, range, entries, num_entries);
if (!err)
xe_tile_assert(tile, *num_entries);
@@ -1069,6 +1191,11 @@ static int op_add_deps(struct xe_vm *vm, struct xe_vma_op *op,
{
int err = 0;
+ /*
+ * No need to check for is_cpu_addr_mirror here as vma_add_deps is a
+ * NOP if the VMA is a CPU address mirror.
+ */
+
switch (op->base.op) {
case DRM_GPUVA_OP_MAP:
if (!op->map.immediate && xe_vm_in_fault_mode(vm))
@@ -1087,6 +1214,8 @@ static int op_add_deps(struct xe_vm *vm, struct xe_vma_op *op,
case DRM_GPUVA_OP_PREFETCH:
err = vma_add_deps(gpuva_to_vma(op->base.prefetch.va), job);
break;
+ case DRM_GPUVA_OP_DRIVER:
+ break;
default:
drm_warn(&vm->xe->drm, "NOT POSSIBLE");
}
@@ -1213,42 +1342,22 @@ static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
return 0;
uvma = to_userptr_vma(vma);
- notifier_seq = uvma->userptr.notifier_seq;
+ if (xe_pt_userptr_inject_eagain(uvma))
+ xe_vma_userptr_force_invalidate(uvma);
- if (uvma->userptr.initial_bind && !xe_vm_in_fault_mode(vm))
- return 0;
+ notifier_seq = uvma->userptr.notifier_seq;
if (!mmu_interval_read_retry(&uvma->userptr.notifier,
- notifier_seq) &&
- !xe_pt_userptr_inject_eagain(uvma))
+ notifier_seq))
return 0;
- if (xe_vm_in_fault_mode(vm)) {
+ if (xe_vm_in_fault_mode(vm))
return -EAGAIN;
- } else {
- spin_lock(&vm->userptr.invalidated_lock);
- list_move_tail(&uvma->userptr.invalidate_link,
- &vm->userptr.invalidated);
- spin_unlock(&vm->userptr.invalidated_lock);
-
- if (xe_vm_in_preempt_fence_mode(vm)) {
- struct dma_resv_iter cursor;
- struct dma_fence *fence;
- long err;
-
- dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
- DMA_RESV_USAGE_BOOKKEEP);
- dma_resv_for_each_fence_unlocked(&cursor, fence)
- dma_fence_enable_sw_signaling(fence);
- dma_resv_iter_end(&cursor);
-
- err = dma_resv_wait_timeout(xe_vm_resv(vm),
- DMA_RESV_USAGE_BOOKKEEP,
- false, MAX_SCHEDULE_TIMEOUT);
- XE_WARN_ON(err <= 0);
- }
- }
+ /*
+ * Just continue the operation since exec or rebind worker
+ * will take care of rebinding.
+ */
return 0;
}
@@ -1311,6 +1420,40 @@ static int xe_pt_userptr_pre_commit(struct xe_migrate_pt_update *pt_update)
return err;
}
+static int xe_pt_svm_pre_commit(struct xe_migrate_pt_update *pt_update)
+{
+ struct xe_vm *vm = pt_update->vops->vm;
+ struct xe_vma_ops *vops = pt_update->vops;
+ struct xe_vma_op *op;
+ int err;
+
+ err = xe_pt_pre_commit(pt_update);
+ if (err)
+ return err;
+
+ xe_svm_notifier_lock(vm);
+
+ list_for_each_entry(op, &vops->list, link) {
+ struct xe_svm_range *range = op->map_range.range;
+
+ if (op->subop == XE_VMA_SUBOP_UNMAP_RANGE)
+ continue;
+
+ xe_svm_range_debug(range, "PRE-COMMIT");
+
+ xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(op->map_range.vma));
+ xe_assert(vm->xe, op->subop == XE_VMA_SUBOP_MAP_RANGE);
+
+ if (!xe_svm_range_pages_valid(range)) {
+ xe_svm_range_debug(range, "PRE-COMMIT - RETRY");
+ xe_svm_notifier_unlock(vm);
+ return -EAGAIN;
+ }
+ }
+
+ return 0;
+}
+
struct invalidation_fence {
struct xe_gt_tlb_invalidation_fence base;
struct xe_gt *gt;
@@ -1496,7 +1639,9 @@ static const struct xe_pt_walk_ops xe_pt_stage_unbind_ops = {
* xe_pt_stage_unbind() - Build page-table update structures for an unbind
* operation
* @tile: The tile we're unbinding for.
+ * @vm: The VM we're unbinding for.
* @vma: The vma we're unbinding.
+ * @range: The SVM range we're unbinding, or NULL.
* @entries: Caller-provided storage for the update structures.
*
* Builds page-table update structures for an unbind operation. The function
@@ -1506,24 +1651,30 @@ static const struct xe_pt_walk_ops xe_pt_stage_unbind_ops = {
*
* Return: The number of entries used.
*/
-static unsigned int xe_pt_stage_unbind(struct xe_tile *tile, struct xe_vma *vma,
+static unsigned int xe_pt_stage_unbind(struct xe_tile *tile,
+ struct xe_vm *vm,
+ struct xe_vma *vma,
+ struct xe_svm_range *range,
struct xe_vm_pgtable_update *entries)
{
+ u64 start = range ? range->base.itree.start : xe_vma_start(vma);
+ u64 end = range ? range->base.itree.last + 1 : xe_vma_end(vma);
struct xe_pt_stage_unbind_walk xe_walk = {
.base = {
.ops = &xe_pt_stage_unbind_ops,
.shifts = xe_normal_pt_shifts,
.max_level = XE_PT_HIGHEST_LEVEL,
+ .staging = true,
},
.tile = tile,
- .modified_start = xe_vma_start(vma),
- .modified_end = xe_vma_end(vma),
+ .modified_start = start,
+ .modified_end = end,
.wupd.entries = entries,
};
- struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
+ struct xe_pt *pt = vm->pt_root[tile->id];
- (void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
- xe_vma_end(vma), &xe_walk.base);
+ (void)xe_pt_walk_shared(&pt->base, pt->level, start, end,
+ &xe_walk.base);
return xe_walk.wupd.num_used_entries;
}
@@ -1555,7 +1706,7 @@ static void xe_pt_abort_unbind(struct xe_vma *vma,
{
int i, j;
- xe_pt_commit_locks_assert(vma);
+ xe_pt_commit_prepare_locks_assert(vma);
for (i = num_entries - 1; i >= 0; --i) {
struct xe_vm_pgtable_update *entry = &entries[i];
@@ -1568,7 +1719,7 @@ static void xe_pt_abort_unbind(struct xe_vma *vma,
continue;
for (j = entry->ofs; j < entry->ofs + entry->qwords; j++)
- pt_dir->children[j] =
+ pt_dir->staging[j] =
entries[i].pt_entries[j - entry->ofs].pt ?
&entries[i].pt_entries[j - entry->ofs].pt->base : NULL;
}
@@ -1581,7 +1732,7 @@ xe_pt_commit_prepare_unbind(struct xe_vma *vma,
{
int i, j;
- xe_pt_commit_locks_assert(vma);
+ xe_pt_commit_prepare_locks_assert(vma);
for (i = 0; i < num_entries; ++i) {
struct xe_vm_pgtable_update *entry = &entries[i];
@@ -1595,20 +1746,20 @@ xe_pt_commit_prepare_unbind(struct xe_vma *vma,
pt_dir = as_xe_pt_dir(pt);
for (j = entry->ofs; j < entry->ofs + entry->qwords; j++) {
entry->pt_entries[j - entry->ofs].pt =
- xe_pt_entry(pt_dir, j);
- pt_dir->children[j] = NULL;
+ xe_pt_entry_staging(pt_dir, j);
+ pt_dir->staging[j] = NULL;
}
}
}
static void
xe_pt_update_ops_rfence_interval(struct xe_vm_pgtable_update_ops *pt_update_ops,
- struct xe_vma *vma)
+ u64 start, u64 end)
{
+ u64 last;
u32 current_op = pt_update_ops->current_op;
struct xe_vm_pgtable_update_op *pt_op = &pt_update_ops->ops[current_op];
int i, level = 0;
- u64 start, last;
for (i = 0; i < pt_op->num_entries; i++) {
const struct xe_vm_pgtable_update *entry = &pt_op->entries[i];
@@ -1618,8 +1769,8 @@ xe_pt_update_ops_rfence_interval(struct xe_vm_pgtable_update_ops *pt_update_ops,
}
/* Greedy (non-optimal) calculation but simple */
- start = ALIGN_DOWN(xe_vma_start(vma), 0x1ull << xe_pt_shift(level));
- last = ALIGN(xe_vma_end(vma), 0x1ull << xe_pt_shift(level)) - 1;
+ start = ALIGN_DOWN(start, 0x1ull << xe_pt_shift(level));
+ last = ALIGN(end, 0x1ull << xe_pt_shift(level)) - 1;
if (start < pt_update_ops->start)
pt_update_ops->start = start;
@@ -1646,6 +1797,7 @@ static int bind_op_prepare(struct xe_vm *vm, struct xe_tile *tile,
struct xe_vm_pgtable_update_op *pt_op = &pt_update_ops->ops[current_op];
int err;
+ xe_tile_assert(tile, !xe_vma_is_cpu_addr_mirror(vma));
xe_bo_assert_held(xe_vma_bo(vma));
vm_dbg(&xe_vma_vm(vma)->xe->drm,
@@ -1660,7 +1812,7 @@ static int bind_op_prepare(struct xe_vm *vm, struct xe_tile *tile,
if (err)
return err;
- err = xe_pt_prepare_bind(tile, vma, pt_op->entries,
+ err = xe_pt_prepare_bind(tile, vma, NULL, pt_op->entries,
&pt_op->num_entries);
if (!err) {
xe_tile_assert(tile, pt_op->num_entries <=
@@ -1668,7 +1820,9 @@ static int bind_op_prepare(struct xe_vm *vm, struct xe_tile *tile,
xe_vm_dbg_print_entries(tile_to_xe(tile), pt_op->entries,
pt_op->num_entries, true);
- xe_pt_update_ops_rfence_interval(pt_update_ops, vma);
+ xe_pt_update_ops_rfence_interval(pt_update_ops,
+ xe_vma_start(vma),
+ xe_vma_end(vma));
++pt_update_ops->current_op;
pt_update_ops->needs_userptr_lock |= xe_vma_is_userptr(vma);
@@ -1702,6 +1856,48 @@ static int bind_op_prepare(struct xe_vm *vm, struct xe_tile *tile,
return err;
}
+static int bind_range_prepare(struct xe_vm *vm, struct xe_tile *tile,
+ struct xe_vm_pgtable_update_ops *pt_update_ops,
+ struct xe_vma *vma, struct xe_svm_range *range)
+{
+ u32 current_op = pt_update_ops->current_op;
+ struct xe_vm_pgtable_update_op *pt_op = &pt_update_ops->ops[current_op];
+ int err;
+
+ xe_tile_assert(tile, xe_vma_is_cpu_addr_mirror(vma));
+
+ vm_dbg(&xe_vma_vm(vma)->xe->drm,
+ "Preparing bind, with range [%lx...%lx)\n",
+ range->base.itree.start, range->base.itree.last);
+
+ pt_op->vma = NULL;
+ pt_op->bind = true;
+ pt_op->rebind = BIT(tile->id) & range->tile_present;
+
+ err = xe_pt_prepare_bind(tile, vma, range, pt_op->entries,
+ &pt_op->num_entries);
+ if (!err) {
+ xe_tile_assert(tile, pt_op->num_entries <=
+ ARRAY_SIZE(pt_op->entries));
+ xe_vm_dbg_print_entries(tile_to_xe(tile), pt_op->entries,
+ pt_op->num_entries, true);
+
+ xe_pt_update_ops_rfence_interval(pt_update_ops,
+ range->base.itree.start,
+ range->base.itree.last + 1);
+ ++pt_update_ops->current_op;
+ pt_update_ops->needs_svm_lock = true;
+
+ pt_op->vma = vma;
+ xe_pt_commit_prepare_bind(vma, pt_op->entries,
+ pt_op->num_entries, pt_op->rebind);
+ } else {
+ xe_pt_cancel_bind(vma, pt_op->entries, pt_op->num_entries);
+ }
+
+ return err;
+}
+
static int unbind_op_prepare(struct xe_tile *tile,
struct xe_vm_pgtable_update_ops *pt_update_ops,
struct xe_vma *vma)
@@ -1713,19 +1909,13 @@ static int unbind_op_prepare(struct xe_tile *tile,
if (!((vma->tile_present | vma->tile_staged) & BIT(tile->id)))
return 0;
+ xe_tile_assert(tile, !xe_vma_is_cpu_addr_mirror(vma));
xe_bo_assert_held(xe_vma_bo(vma));
vm_dbg(&xe_vma_vm(vma)->xe->drm,
"Preparing unbind, with range [%llx...%llx)\n",
xe_vma_start(vma), xe_vma_end(vma) - 1);
- /*
- * Wait for invalidation to complete. Can corrupt internal page table
- * state if an invalidation is running while preparing an unbind.
- */
- if (xe_vma_is_userptr(vma) && xe_vm_in_fault_mode(xe_vma_vm(vma)))
- mmu_interval_read_begin(&to_userptr_vma(vma)->userptr.notifier);
-
pt_op->vma = vma;
pt_op->bind = false;
pt_op->rebind = false;
@@ -1734,11 +1924,13 @@ static int unbind_op_prepare(struct xe_tile *tile,
if (err)
return err;
- pt_op->num_entries = xe_pt_stage_unbind(tile, vma, pt_op->entries);
+ pt_op->num_entries = xe_pt_stage_unbind(tile, xe_vma_vm(vma),
+ vma, NULL, pt_op->entries);
xe_vm_dbg_print_entries(tile_to_xe(tile), pt_op->entries,
pt_op->num_entries, false);
- xe_pt_update_ops_rfence_interval(pt_update_ops, vma);
+ xe_pt_update_ops_rfence_interval(pt_update_ops, xe_vma_start(vma),
+ xe_vma_end(vma));
++pt_update_ops->current_op;
pt_update_ops->needs_userptr_lock |= xe_vma_is_userptr(vma);
pt_update_ops->needs_invalidation = true;
@@ -1748,6 +1940,42 @@ static int unbind_op_prepare(struct xe_tile *tile,
return 0;
}
+static int unbind_range_prepare(struct xe_vm *vm,
+ struct xe_tile *tile,
+ struct xe_vm_pgtable_update_ops *pt_update_ops,
+ struct xe_svm_range *range)
+{
+ u32 current_op = pt_update_ops->current_op;
+ struct xe_vm_pgtable_update_op *pt_op = &pt_update_ops->ops[current_op];
+
+ if (!(range->tile_present & BIT(tile->id)))
+ return 0;
+
+ vm_dbg(&vm->xe->drm,
+ "Preparing unbind, with range [%lx...%lx)\n",
+ range->base.itree.start, range->base.itree.last);
+
+ pt_op->vma = XE_INVALID_VMA;
+ pt_op->bind = false;
+ pt_op->rebind = false;
+
+ pt_op->num_entries = xe_pt_stage_unbind(tile, vm, NULL, range,
+ pt_op->entries);
+
+ xe_vm_dbg_print_entries(tile_to_xe(tile), pt_op->entries,
+ pt_op->num_entries, false);
+ xe_pt_update_ops_rfence_interval(pt_update_ops, range->base.itree.start,
+ range->base.itree.last + 1);
+ ++pt_update_ops->current_op;
+ pt_update_ops->needs_svm_lock = true;
+ pt_update_ops->needs_invalidation = true;
+
+ xe_pt_commit_prepare_unbind(XE_INVALID_VMA, pt_op->entries,
+ pt_op->num_entries);
+
+ return 0;
+}
+
static int op_prepare(struct xe_vm *vm,
struct xe_tile *tile,
struct xe_vm_pgtable_update_ops *pt_update_ops,
@@ -1759,15 +1987,21 @@ static int op_prepare(struct xe_vm *vm,
switch (op->base.op) {
case DRM_GPUVA_OP_MAP:
- if (!op->map.immediate && xe_vm_in_fault_mode(vm))
+ if ((!op->map.immediate && xe_vm_in_fault_mode(vm)) ||
+ op->map.is_cpu_addr_mirror)
break;
err = bind_op_prepare(vm, tile, pt_update_ops, op->map.vma);
pt_update_ops->wait_vm_kernel = true;
break;
case DRM_GPUVA_OP_REMAP:
- err = unbind_op_prepare(tile, pt_update_ops,
- gpuva_to_vma(op->base.remap.unmap->va));
+ {
+ struct xe_vma *old = gpuva_to_vma(op->base.remap.unmap->va);
+
+ if (xe_vma_is_cpu_addr_mirror(old))
+ break;
+
+ err = unbind_op_prepare(tile, pt_update_ops, old);
if (!err && op->remap.prev) {
err = bind_op_prepare(vm, tile, pt_update_ops,
@@ -1780,15 +2014,40 @@ static int op_prepare(struct xe_vm *vm,
pt_update_ops->wait_vm_bookkeep = true;
}
break;
+ }
case DRM_GPUVA_OP_UNMAP:
- err = unbind_op_prepare(tile, pt_update_ops,
- gpuva_to_vma(op->base.unmap.va));
+ {
+ struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
+
+ if (xe_vma_is_cpu_addr_mirror(vma))
+ break;
+
+ err = unbind_op_prepare(tile, pt_update_ops, vma);
break;
+ }
case DRM_GPUVA_OP_PREFETCH:
- err = bind_op_prepare(vm, tile, pt_update_ops,
- gpuva_to_vma(op->base.prefetch.va));
+ {
+ struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
+
+ if (xe_vma_is_cpu_addr_mirror(vma))
+ break;
+
+ err = bind_op_prepare(vm, tile, pt_update_ops, vma);
pt_update_ops->wait_vm_kernel = true;
break;
+ }
+ case DRM_GPUVA_OP_DRIVER:
+ if (op->subop == XE_VMA_SUBOP_MAP_RANGE) {
+ xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(op->map_range.vma));
+
+ err = bind_range_prepare(vm, tile, pt_update_ops,
+ op->map_range.vma,
+ op->map_range.range);
+ } else if (op->subop == XE_VMA_SUBOP_UNMAP_RANGE) {
+ err = unbind_range_prepare(vm, tile, pt_update_ops,
+ op->unmap_range.range);
+ }
+ break;
default:
drm_warn(&vm->xe->drm, "NOT POSSIBLE");
}
@@ -1858,6 +2117,8 @@ static void bind_op_commit(struct xe_vm *vm, struct xe_tile *tile,
struct xe_vma *vma, struct dma_fence *fence,
struct dma_fence *fence2)
{
+ xe_tile_assert(tile, !xe_vma_is_cpu_addr_mirror(vma));
+
if (!xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm) {
dma_resv_add_fence(xe_vma_bo(vma)->ttm.base.resv, fence,
pt_update_ops->wait_vm_bookkeep ?
@@ -1891,6 +2152,8 @@ static void unbind_op_commit(struct xe_vm *vm, struct xe_tile *tile,
struct xe_vma *vma, struct dma_fence *fence,
struct dma_fence *fence2)
{
+ xe_tile_assert(tile, !xe_vma_is_cpu_addr_mirror(vma));
+
if (!xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm) {
dma_resv_add_fence(xe_vma_bo(vma)->ttm.base.resv, fence,
pt_update_ops->wait_vm_bookkeep ?
@@ -1925,16 +2188,21 @@ static void op_commit(struct xe_vm *vm,
switch (op->base.op) {
case DRM_GPUVA_OP_MAP:
- if (!op->map.immediate && xe_vm_in_fault_mode(vm))
+ if ((!op->map.immediate && xe_vm_in_fault_mode(vm)) ||
+ op->map.is_cpu_addr_mirror)
break;
bind_op_commit(vm, tile, pt_update_ops, op->map.vma, fence,
fence2);
break;
case DRM_GPUVA_OP_REMAP:
- unbind_op_commit(vm, tile, pt_update_ops,
- gpuva_to_vma(op->base.remap.unmap->va), fence,
- fence2);
+ {
+ struct xe_vma *old = gpuva_to_vma(op->base.remap.unmap->va);
+
+ if (xe_vma_is_cpu_addr_mirror(old))
+ break;
+
+ unbind_op_commit(vm, tile, pt_update_ops, old, fence, fence2);
if (op->remap.prev)
bind_op_commit(vm, tile, pt_update_ops, op->remap.prev,
@@ -1943,14 +2211,35 @@ static void op_commit(struct xe_vm *vm,
bind_op_commit(vm, tile, pt_update_ops, op->remap.next,
fence, fence2);
break;
+ }
case DRM_GPUVA_OP_UNMAP:
- unbind_op_commit(vm, tile, pt_update_ops,
- gpuva_to_vma(op->base.unmap.va), fence, fence2);
+ {
+ struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
+
+ if (!xe_vma_is_cpu_addr_mirror(vma))
+ unbind_op_commit(vm, tile, pt_update_ops, vma, fence,
+ fence2);
break;
+ }
case DRM_GPUVA_OP_PREFETCH:
- bind_op_commit(vm, tile, pt_update_ops,
- gpuva_to_vma(op->base.prefetch.va), fence, fence2);
+ {
+ struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
+
+ if (!xe_vma_is_cpu_addr_mirror(vma))
+ bind_op_commit(vm, tile, pt_update_ops, vma, fence,
+ fence2);
+ break;
+ }
+ case DRM_GPUVA_OP_DRIVER:
+ {
+ if (op->subop == XE_VMA_SUBOP_MAP_RANGE) {
+ op->map_range.range->tile_present |= BIT(tile->id);
+ op->map_range.range->tile_invalidated &= ~BIT(tile->id);
+ } else if (op->subop == XE_VMA_SUBOP_UNMAP_RANGE) {
+ op->unmap_range.range->tile_present &= ~BIT(tile->id);
+ }
break;
+ }
default:
drm_warn(&vm->xe->drm, "NOT POSSIBLE");
}
@@ -1968,6 +2257,12 @@ static const struct xe_migrate_pt_update_ops userptr_migrate_ops = {
.pre_commit = xe_pt_userptr_pre_commit,
};
+static const struct xe_migrate_pt_update_ops svm_migrate_ops = {
+ .populate = xe_vm_populate_pgtable,
+ .clear = xe_migrate_clear_pgtable_callback,
+ .pre_commit = xe_pt_svm_pre_commit,
+};
+
/**
* xe_pt_update_ops_run() - Run PT update operations
* @tile: Tile of PT update operations
@@ -1993,7 +2288,9 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
struct xe_vma_op *op;
int err = 0, i;
struct xe_migrate_pt_update update = {
- .ops = pt_update_ops->needs_userptr_lock ?
+ .ops = pt_update_ops->needs_svm_lock ?
+ &svm_migrate_ops :
+ pt_update_ops->needs_userptr_lock ?
&userptr_migrate_ops :
&migrate_ops,
.vops = vops,
@@ -2114,6 +2411,8 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
&ifence->base.base, &mfence->base.base);
}
+ if (pt_update_ops->needs_svm_lock)
+ xe_svm_notifier_unlock(vm);
if (pt_update_ops->needs_userptr_lock)
up_read(&vm->userptr.notifier_lock);
diff --git a/drivers/gpu/drm/xe/xe_pt.h b/drivers/gpu/drm/xe/xe_pt.h
index 9ab386431cad..5ecf003d513c 100644
--- a/drivers/gpu/drm/xe/xe_pt.h
+++ b/drivers/gpu/drm/xe/xe_pt.h
@@ -13,6 +13,7 @@ struct dma_fence;
struct xe_bo;
struct xe_device;
struct xe_exec_queue;
+struct xe_svm_range;
struct xe_sync_entry;
struct xe_tile;
struct xe_vm;
@@ -35,6 +36,8 @@ void xe_pt_populate_empty(struct xe_tile *tile, struct xe_vm *vm,
void xe_pt_destroy(struct xe_pt *pt, u32 flags, struct llist_head *deferred);
+void xe_pt_clear(struct xe_device *xe, struct xe_pt *pt);
+
int xe_pt_update_ops_prepare(struct xe_tile *tile, struct xe_vma_ops *vops);
struct dma_fence *xe_pt_update_ops_run(struct xe_tile *tile,
struct xe_vma_ops *vops);
@@ -42,5 +45,7 @@ void xe_pt_update_ops_fini(struct xe_tile *tile, struct xe_vma_ops *vops);
void xe_pt_update_ops_abort(struct xe_tile *tile, struct xe_vma_ops *vops);
bool xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma);
+bool xe_pt_zap_ptes_range(struct xe_tile *tile, struct xe_vm *vm,
+ struct xe_svm_range *range);
#endif
diff --git a/drivers/gpu/drm/xe/xe_pt_types.h b/drivers/gpu/drm/xe/xe_pt_types.h
index 384cc04de719..69eab6f37cfe 100644
--- a/drivers/gpu/drm/xe/xe_pt_types.h
+++ b/drivers/gpu/drm/xe/xe_pt_types.h
@@ -104,6 +104,8 @@ struct xe_vm_pgtable_update_ops {
u32 num_ops;
/** @current_op: current operations */
u32 current_op;
+ /** @needs_svm_lock: Needs SVM lock */
+ bool needs_svm_lock;
/** @needs_userptr_lock: Needs userptr lock */
bool needs_userptr_lock;
/** @needs_invalidation: Needs invalidation */
diff --git a/drivers/gpu/drm/xe/xe_pt_walk.c b/drivers/gpu/drm/xe/xe_pt_walk.c
index b8b3d2aea492..be602a763ff3 100644
--- a/drivers/gpu/drm/xe/xe_pt_walk.c
+++ b/drivers/gpu/drm/xe/xe_pt_walk.c
@@ -74,7 +74,8 @@ int xe_pt_walk_range(struct xe_ptw *parent, unsigned int level,
u64 addr, u64 end, struct xe_pt_walk *walk)
{
pgoff_t offset = xe_pt_offset(addr, level, walk);
- struct xe_ptw **entries = parent->children ? parent->children : NULL;
+ struct xe_ptw **entries = walk->staging ? (parent->staging ?: NULL) :
+ (parent->children ?: NULL);
const struct xe_pt_walk_ops *ops = walk->ops;
enum page_walk_action action;
struct xe_ptw *child;
diff --git a/drivers/gpu/drm/xe/xe_pt_walk.h b/drivers/gpu/drm/xe/xe_pt_walk.h
index 5ecc4d2f0f65..5c02c244f7de 100644
--- a/drivers/gpu/drm/xe/xe_pt_walk.h
+++ b/drivers/gpu/drm/xe/xe_pt_walk.h
@@ -11,12 +11,14 @@
/**
* struct xe_ptw - base class for driver pagetable subclassing.
* @children: Pointer to an array of children if any.
+ * @staging: Pointer to an array of staging entries, if any.
*
* Drivers could subclass this, and if it's a page-directory, typically
* embed an array of xe_ptw pointers.
*/
struct xe_ptw {
struct xe_ptw **children;
+ struct xe_ptw **staging;
};
/**
@@ -41,6 +43,8 @@ struct xe_pt_walk {
* as shared pagetables.
*/
bool shared_pt_mode;
+ /** @staging: Walk staging PT structure */
+ bool staging;
};
/**
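For clarity on the new @staging pointer: as far as this diff shows, updates are prepared against the staging array (walks with .staging = true) while the hardware-visible tree keeps using @children, and commit publishes one into the other (see xe_pt_commit() earlier in this patch). A minimal, illustrative-only sketch of that relationship, with a made-up helper name:

/* Sketch only: the directory subclass (xe_pt_dir in xe_pt.c) backs both
 * arrays; prepare-time changes touch only staging, and commit copies a
 * staged slot into the committed, hardware-mirrored children array.
 */
static void example_publish_slot(struct xe_ptw *parent, unsigned int idx)
{
	parent->children[idx] = parent->staging[idx];
}

This keeps in-flight binds invisible to walks of the committed tree (such as the zap walks run from the MMU notifier), and an aborted bind only has to restore the staging slots.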
diff --git a/drivers/gpu/drm/xe/xe_pxp.c b/drivers/gpu/drm/xe/xe_pxp.c
index 3cd3f83e86b0..47499ca02693 100644
--- a/drivers/gpu/drm/xe/xe_pxp.c
+++ b/drivers/gpu/drm/xe/xe_pxp.c
@@ -132,14 +132,6 @@ static int pxp_wait_for_session_state(struct xe_pxp *pxp, u32 id, bool in_play)
static void pxp_invalidate_queues(struct xe_pxp *pxp);
-static void pxp_invalidate_state(struct xe_pxp *pxp)
-{
- pxp_invalidate_queues(pxp);
-
- if (pxp->status == XE_PXP_ACTIVE)
- pxp->key_instance++;
-}
-
static int pxp_terminate_hw(struct xe_pxp *pxp)
{
struct xe_gt *gt = pxp->gt;
@@ -193,7 +185,8 @@ static void pxp_terminate(struct xe_pxp *pxp)
mutex_lock(&pxp->mutex);
- pxp_invalidate_state(pxp);
+ if (pxp->status == XE_PXP_ACTIVE)
+ pxp->key_instance++;
/*
* we'll mark the status as needing termination on resume, so no need to
@@ -220,6 +213,8 @@ static void pxp_terminate(struct xe_pxp *pxp)
mutex_unlock(&pxp->mutex);
+ pxp_invalidate_queues(pxp);
+
ret = pxp_terminate_hw(pxp);
if (ret) {
drm_err(&xe->drm, "PXP termination failed: %pe\n", ERR_PTR(ret));
@@ -665,23 +660,15 @@ out:
return ret;
}
-/**
- * xe_pxp_exec_queue_remove - remove a queue from the PXP list
- * @pxp: the xe->pxp pointer (it will be NULL if PXP is disabled)
- * @q: the queue to remove from the list
- *
- * If PXP is enabled and the exec_queue is in the list, the queue will be
- * removed from the list and its PM reference will be released. It is safe to
- * call this function multiple times for the same queue.
- */
-void xe_pxp_exec_queue_remove(struct xe_pxp *pxp, struct xe_exec_queue *q)
+static void __pxp_exec_queue_remove(struct xe_pxp *pxp, struct xe_exec_queue *q, bool lock)
{
bool need_pm_put = false;
if (!xe_pxp_is_enabled(pxp))
return;
- spin_lock_irq(&pxp->queues.lock);
+ if (lock)
+ spin_lock_irq(&pxp->queues.lock);
if (!list_empty(&q->pxp.link)) {
list_del_init(&q->pxp.link);
@@ -690,36 +677,54 @@ void xe_pxp_exec_queue_remove(struct xe_pxp *pxp, struct xe_exec_queue *q)
q->pxp.type = DRM_XE_PXP_TYPE_NONE;
- spin_unlock_irq(&pxp->queues.lock);
+ if (lock)
+ spin_unlock_irq(&pxp->queues.lock);
if (need_pm_put)
xe_pm_runtime_put(pxp->xe);
}
+/**
+ * xe_pxp_exec_queue_remove - remove a queue from the PXP list
+ * @pxp: the xe->pxp pointer (it will be NULL if PXP is disabled)
+ * @q: the queue to remove from the list
+ *
+ * If PXP is enabled and the exec_queue is in the list, the queue will be
+ * removed from the list and its PM reference will be released. It is safe to
+ * call this function multiple times for the same queue.
+ */
+void xe_pxp_exec_queue_remove(struct xe_pxp *pxp, struct xe_exec_queue *q)
+{
+ __pxp_exec_queue_remove(pxp, q, true);
+}
+
static void pxp_invalidate_queues(struct xe_pxp *pxp)
{
struct xe_exec_queue *tmp, *q;
+ LIST_HEAD(to_clean);
spin_lock_irq(&pxp->queues.lock);
- /*
- * Removing a queue from the PXP list requires a put of the RPM ref that
- * the queue holds to keep the PXP session alive, which can't be done
- * under spinlock. Since it is safe to kill a queue multiple times, we
- * can leave the invalid queue in the list for now and postpone the
- * removal and associated RPM put to when the queue is destroyed.
- */
- list_for_each_entry(tmp, &pxp->queues.list, pxp.link) {
- q = xe_exec_queue_get_unless_zero(tmp);
-
+ list_for_each_entry_safe(q, tmp, &pxp->queues.list, pxp.link) {
+ q = xe_exec_queue_get_unless_zero(q);
if (!q)
continue;
+ list_move_tail(&q->pxp.link, &to_clean);
+ }
+ spin_unlock_irq(&pxp->queues.lock);
+
+ list_for_each_entry_safe(q, tmp, &to_clean, pxp.link) {
xe_exec_queue_kill(q);
+
+ /*
+ * We hold a ref to the queue so there is no risk of racing with
+ * the calls to exec_queue_remove coming from exec_queue_destroy.
+ */
+ __pxp_exec_queue_remove(pxp, q, false);
+
xe_exec_queue_put(q);
}
-
- spin_unlock_irq(&pxp->queues.lock);
}
/**
@@ -816,6 +821,7 @@ int xe_pxp_obj_key_check(struct xe_pxp *pxp, struct drm_gem_object *obj)
*/
int xe_pxp_pm_suspend(struct xe_pxp *pxp)
{
+ bool needs_queue_inval = false;
int ret = 0;
if (!xe_pxp_is_enabled(pxp))
@@ -848,7 +854,8 @@ wait_for_activation:
break;
fallthrough;
case XE_PXP_ACTIVE:
- pxp_invalidate_state(pxp);
+ pxp->key_instance++;
+ needs_queue_inval = true;
break;
default:
drm_err(&pxp->xe->drm, "unexpected state during PXP suspend: %u",
@@ -865,6 +872,9 @@ wait_for_activation:
mutex_unlock(&pxp->mutex);
+ if (needs_queue_inval)
+ pxp_invalidate_queues(pxp);
+
/*
* if there is a termination in progress, wait for it.
* We need to wait outside the lock because the completion is done from
diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c
index ebfae746f861..5e65830dad25 100644
--- a/drivers/gpu/drm/xe/xe_query.c
+++ b/drivers/gpu/drm/xe/xe_query.c
@@ -16,6 +16,7 @@
#include "regs/xe_gt_regs.h"
#include "xe_bo.h"
#include "xe_device.h"
+#include "xe_eu_stall.h"
#include "xe_exec_queue.h"
#include "xe_force_wake.h"
#include "xe_ggtt.h"
@@ -337,8 +338,13 @@ static int query_config(struct xe_device *xe, struct drm_xe_device_query *query)
config->info[DRM_XE_QUERY_CONFIG_REV_AND_DEVICE_ID] =
xe->info.devid | (xe->info.revid << 16);
if (xe_device_get_root_tile(xe)->mem.vram.usable_size)
- config->info[DRM_XE_QUERY_CONFIG_FLAGS] =
+ config->info[DRM_XE_QUERY_CONFIG_FLAGS] |=
DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM;
+ if (xe->info.has_usm && IS_ENABLED(CONFIG_DRM_GPUSVM))
+ config->info[DRM_XE_QUERY_CONFIG_FLAGS] |=
+ DRM_XE_QUERY_CONFIG_FLAG_HAS_CPU_ADDR_MIRROR;
+ config->info[DRM_XE_QUERY_CONFIG_FLAGS] |=
+ DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY;
config->info[DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT] =
xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K ? SZ_64K : SZ_4K;
config->info[DRM_XE_QUERY_CONFIG_VA_BITS] = xe->info.va_bits;
@@ -729,6 +735,47 @@ static int query_pxp_status(struct xe_device *xe, struct drm_xe_device_query *qu
return 0;
}
+static int query_eu_stall(struct xe_device *xe,
+ struct drm_xe_device_query *query)
+{
+ void __user *query_ptr = u64_to_user_ptr(query->data);
+ struct drm_xe_query_eu_stall *info;
+ size_t size, array_size;
+ const u64 *rates;
+ u32 num_rates;
+ int ret;
+
+ if (!xe_eu_stall_supported_on_platform(xe)) {
+ drm_dbg(&xe->drm, "EU stall monitoring is not supported on this platform\n");
+ return -ENODEV;
+ }
+
+ array_size = xe_eu_stall_get_sampling_rates(&num_rates, &rates);
+ size = sizeof(struct drm_xe_query_eu_stall) + array_size;
+
+ if (query->size == 0) {
+ query->size = size;
+ return 0;
+ } else if (XE_IOCTL_DBG(xe, query->size != size)) {
+ return -EINVAL;
+ }
+
+ info = kzalloc(size, GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ info->num_sampling_rates = num_rates;
+ info->capabilities = DRM_XE_EU_STALL_CAPS_BASE;
+ info->record_size = xe_eu_stall_data_record_size(xe);
+ info->per_xecore_buf_size = xe_eu_stall_get_per_xecore_buf_size();
+ memcpy(info->sampling_rates, rates, array_size);
+
+ ret = copy_to_user(query_ptr, info, size);
+ kfree(info);
+
+ return ret ? -EFAULT : 0;
+}
+
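For reference, a hedged userspace sketch of the two-call pattern the handler above implements (first call with size == 0 to learn the payload size, second call to fetch it). DRM_XE_DEVICE_QUERY_EU_STALL is assumed to be the uAPI query id added alongside this series, and the header paths are illustrative:

#include <stdint.h>
#include <stdlib.h>
#include <xf86drm.h>
#include <drm/xe_drm.h>	/* uAPI header location may differ */

static struct drm_xe_query_eu_stall *query_eu_stall_info(int fd)
{
	struct drm_xe_device_query q = {
		.query = DRM_XE_DEVICE_QUERY_EU_STALL,	/* assumed name */
	};
	struct drm_xe_query_eu_stall *info;

	/* First call: kernel reports the required size in q.size */
	if (drmIoctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &q) || !q.size)
		return NULL;

	info = calloc(1, q.size);
	if (!info)
		return NULL;

	/* Second call: size must match exactly, payload lands behind q.data */
	q.data = (uintptr_t)info;
	if (drmIoctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &q)) {
		free(info);
		return NULL;
	}

	return info;	/* sampling_rates[] trails the fixed-size struct */
}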
static int (* const xe_query_funcs[])(struct xe_device *xe,
struct drm_xe_device_query *query) = {
query_engines,
@@ -741,6 +788,7 @@ static int (* const xe_query_funcs[])(struct xe_device *xe,
query_uc_fw_version,
query_oa_units,
query_pxp_status,
+ query_eu_stall,
};
int xe_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
diff --git a/drivers/gpu/drm/xe/xe_res_cursor.h b/drivers/gpu/drm/xe/xe_res_cursor.h
index dca374b6521c..d1a403cfb628 100644
--- a/drivers/gpu/drm/xe/xe_res_cursor.h
+++ b/drivers/gpu/drm/xe/xe_res_cursor.h
@@ -26,6 +26,7 @@
#include <linux/scatterlist.h>
+#include <drm/drm_pagemap.h>
#include <drm/ttm/ttm_placement.h>
#include <drm/ttm/ttm_range_manager.h>
#include <drm/ttm/ttm_resource.h>
@@ -34,17 +35,38 @@
#include "xe_bo.h"
#include "xe_device.h"
#include "xe_macros.h"
+#include "xe_svm.h"
#include "xe_ttm_vram_mgr.h"
-/* state back for walking over vram_mgr, stolen_mgr, and gtt_mgr allocations */
+/**
+ * struct xe_res_cursor - state for walking over dma mappings, vram_mgr,
+ * stolen_mgr, and gtt_mgr allocations
+ */
struct xe_res_cursor {
+ /** @start: Start of cursor */
u64 start;
+ /** @size: Size of the current segment. */
u64 size;
+ /** @remaining: Remaining bytes in cursor */
u64 remaining;
+ /** @node: Opaque pointer to the current node */
void *node;
+ /** @mem_type: Memory type */
u32 mem_type;
+ /** @sgl: Scatterlist for cursor */
struct scatterlist *sgl;
+ /** @dma_addr: Current element in a struct drm_pagemap_device_addr array */
+ const struct drm_pagemap_device_addr *dma_addr;
+ /** @mm: Buddy allocator for VRAM cursor */
struct drm_buddy *mm;
+ /**
+ * @dma_start: DMA start address for the current segment.
+ * This may be different to @dma_addr.addr since elements in
+ * the array may be coalesced to a single segment.
+ */
+ u64 dma_start;
+ /** @dma_seg_size: Size of the current DMA segment. */
+ u64 dma_seg_size;
};
static struct drm_buddy *xe_res_get_buddy(struct ttm_resource *res)
@@ -70,6 +92,7 @@ static inline void xe_res_first(struct ttm_resource *res,
struct xe_res_cursor *cur)
{
cur->sgl = NULL;
+ cur->dma_addr = NULL;
if (!res)
goto fallback;
@@ -142,6 +165,36 @@ static inline void __xe_res_sg_next(struct xe_res_cursor *cur)
}
/**
+ * __xe_res_dma_next() - Advance the cursor when end-of-segment is reached
+ * @cur: The cursor
+ */
+static inline void __xe_res_dma_next(struct xe_res_cursor *cur)
+{
+ const struct drm_pagemap_device_addr *addr = cur->dma_addr;
+ u64 start = cur->start;
+
+ while (start >= cur->dma_seg_size) {
+ start -= cur->dma_seg_size;
+ addr++;
+ cur->dma_seg_size = PAGE_SIZE << addr->order;
+ }
+ cur->dma_start = addr->addr;
+
+ /* Coalesce array elements */
+ while (cur->dma_seg_size - start < cur->remaining) {
+ if (cur->dma_start + cur->dma_seg_size != addr[1].addr ||
+ addr->proto != addr[1].proto)
+ break;
+ addr++;
+ cur->dma_seg_size += PAGE_SIZE << addr->order;
+ }
+
+ cur->dma_addr = addr;
+ cur->start = start;
+ cur->size = cur->dma_seg_size - start;
+}
+
+/**
* xe_res_first_sg - initialize a xe_res_cursor with a scatter gather table
*
* @sg: scatter gather table to walk
@@ -160,12 +213,43 @@ static inline void xe_res_first_sg(const struct sg_table *sg,
cur->start = start;
cur->remaining = size;
cur->size = 0;
+ cur->dma_addr = NULL;
cur->sgl = sg->sgl;
cur->mem_type = XE_PL_TT;
__xe_res_sg_next(cur);
}
/**
+ * xe_res_first_dma - initialize a xe_res_cursor with dma_addr array
+ *
+ * @dma_addr: struct drm_pagemap_device_addr array to walk
+ * @start: Start of the range
+ * @size: Size of the range
+ * @cur: cursor object to initialize
+ *
+ * Start walking over the range of allocations between @start and @size.
+ */
+static inline void xe_res_first_dma(const struct drm_pagemap_device_addr *dma_addr,
+ u64 start, u64 size,
+ struct xe_res_cursor *cur)
+{
+ XE_WARN_ON(!dma_addr);
+ XE_WARN_ON(!IS_ALIGNED(start, PAGE_SIZE) ||
+ !IS_ALIGNED(size, PAGE_SIZE));
+
+ cur->node = NULL;
+ cur->start = start;
+ cur->remaining = size;
+ cur->dma_seg_size = PAGE_SIZE << dma_addr->order;
+ cur->dma_start = 0;
+ cur->size = 0;
+ cur->dma_addr = dma_addr;
+ __xe_res_dma_next(cur);
+ cur->sgl = NULL;
+ cur->mem_type = XE_PL_TT;
+}
+
+/**
* xe_res_next - advance the cursor
*
* @cur: the cursor to advance
@@ -191,6 +275,12 @@ static inline void xe_res_next(struct xe_res_cursor *cur, u64 size)
return;
}
+ if (cur->dma_addr) {
+ cur->start += size;
+ __xe_res_dma_next(cur);
+ return;
+ }
+
if (cur->sgl) {
cur->start += size;
__xe_res_sg_next(cur);
@@ -232,6 +322,35 @@ static inline void xe_res_next(struct xe_res_cursor *cur, u64 size)
*/
static inline u64 xe_res_dma(const struct xe_res_cursor *cur)
{
- return cur->sgl ? sg_dma_address(cur->sgl) + cur->start : cur->start;
+ if (cur->dma_addr)
+ return cur->dma_start + cur->start;
+ else if (cur->sgl)
+ return sg_dma_address(cur->sgl) + cur->start;
+ else
+ return cur->start;
+}
+
+/**
+ * xe_res_is_vram() - Whether the cursor's current dma address points to
+ * same-device VRAM
+ * @cur: The cursor.
+ *
+ * Return: true iff the address returned by xe_res_dma() points to internal vram.
+ */
+static inline bool xe_res_is_vram(const struct xe_res_cursor *cur)
+{
+ if (cur->dma_addr)
+ return cur->dma_addr->proto == XE_INTERCONNECT_VRAM;
+
+ switch (cur->mem_type) {
+ case XE_PL_STOLEN:
+ case XE_PL_VRAM0:
+ case XE_PL_VRAM1:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
}
#endif
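A minimal, illustrative-only usage sketch of the DMA-address cursor path added above (walk_dma_sketch() is a made-up caller; the real user is the SVM branch of xe_pt_stage_bind() earlier in this patch):

/* Sketch only: iterate a drm_pagemap_device_addr array segment by segment. */
static void walk_dma_sketch(const struct drm_pagemap_device_addr *addrs, u64 size)
{
	struct xe_res_cursor cur;

	xe_res_first_dma(addrs, 0, size, &cur);
	while (cur.remaining) {
		u64 addr = xe_res_dma(&cur);	/* DMA/device address of segment */
		u64 seg = cur.size;		/* coalesced segment size */
		bool vram = xe_res_is_vram(&cur);

		/* ... program PTEs for [addr, addr + seg), flag VRAM vs system ... */
		(void)addr;
		(void)vram;

		xe_res_next(&cur, seg);
	}
}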
diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c
index 0c230ee53bba..d2f604aa96fa 100644
--- a/drivers/gpu/drm/xe/xe_ring_ops.c
+++ b/drivers/gpu/drm/xe/xe_ring_ops.c
@@ -177,6 +177,10 @@ static int emit_render_cache_flush(struct xe_sched_job *job, u32 *dw, int i)
bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
u32 flags;
+ if (XE_WA(gt, 14016712196))
+ i = emit_pipe_control(dw, i, 0, PIPE_CONTROL_DEPTH_CACHE_FLUSH,
+ LRC_PPHWSP_FLUSH_INVAL_SCRATCH_ADDR, 0);
+
flags = (PIPE_CONTROL_CS_STALL |
PIPE_CONTROL_TILE_CACHE_FLUSH |
PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c
index 02b4eadf8407..d939ce70e6fa 100644
--- a/drivers/gpu/drm/xe/xe_survivability_mode.c
+++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
@@ -127,40 +127,55 @@ static ssize_t survivability_mode_show(struct device *dev,
static DEVICE_ATTR_ADMIN_RO(survivability_mode);
-static void enable_survivability_mode(struct pci_dev *pdev)
+static void xe_survivability_mode_fini(void *arg)
+{
+ struct xe_device *xe = arg;
+ struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
+ struct device *dev = &pdev->dev;
+
+ sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
+}
+
+static int enable_survivability_mode(struct pci_dev *pdev)
{
struct device *dev = &pdev->dev;
struct xe_device *xe = pdev_to_xe_device(pdev);
struct xe_survivability *survivability = &xe->survivability;
int ret = 0;
- /* set survivability mode */
- survivability->mode = true;
- dev_info(dev, "In Survivability Mode\n");
-
/* create survivability mode sysfs */
ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr);
if (ret) {
dev_warn(dev, "Failed to create survivability sysfs files\n");
- return;
+ return ret;
}
- xe_heci_gsc_init(xe);
+ ret = devm_add_action_or_reset(xe->drm.dev,
+ xe_survivability_mode_fini, xe);
+ if (ret)
+ return ret;
+
+ ret = xe_heci_gsc_init(xe);
+ if (ret)
+ return ret;
xe_vsec_init(xe);
+
+ survivability->mode = true;
+ dev_err(dev, "In Survivability Mode\n");
+
+ return 0;
}
/**
- * xe_survivability_mode_enabled - check if survivability mode is enabled
+ * xe_survivability_mode_is_enabled - check if survivability mode is enabled
* @xe: xe device instance
*
* Returns true if in survivability mode, false otherwise
*/
-bool xe_survivability_mode_enabled(struct xe_device *xe)
+bool xe_survivability_mode_is_enabled(struct xe_device *xe)
{
- struct xe_survivability *survivability = &xe->survivability;
-
- return survivability->mode;
+ return xe->survivability.mode;
}
/**
@@ -183,35 +198,19 @@ bool xe_survivability_mode_required(struct xe_device *xe)
data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
- return (survivability->boot_status == NON_CRITICAL_FAILURE ||
- survivability->boot_status == CRITICAL_FAILURE);
+ return survivability->boot_status == NON_CRITICAL_FAILURE ||
+ survivability->boot_status == CRITICAL_FAILURE;
}
/**
- * xe_survivability_mode_remove - remove survivability mode
+ * xe_survivability_mode_enable - Initialize and enable the survivability mode
* @xe: xe device instance
*
- * clean up sysfs entries of survivability mode
- */
-void xe_survivability_mode_remove(struct xe_device *xe)
-{
- struct xe_survivability *survivability = &xe->survivability;
- struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
- struct device *dev = &pdev->dev;
-
- sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
- xe_heci_gsc_fini(xe);
- kfree(survivability->info);
- pci_set_drvdata(pdev, NULL);
-}
-
-/**
- * xe_survivability_mode_init - Initialize the survivability mode
- * @xe: xe device instance
+ * Initialize survivability information and enable survivability mode
*
- * Initializes survivability information and enables survivability mode
+ * Return: 0 for success, negative error code otherwise.
*/
-void xe_survivability_mode_init(struct xe_device *xe)
+int xe_survivability_mode_enable(struct xe_device *xe)
{
struct xe_survivability *survivability = &xe->survivability;
struct xe_survivability_info *info;
@@ -219,9 +218,10 @@ void xe_survivability_mode_init(struct xe_device *xe)
survivability->size = MAX_SCRATCH_MMIO;
- info = kcalloc(survivability->size, sizeof(*info), GFP_KERNEL);
+ info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info),
+ GFP_KERNEL);
if (!info)
- return;
+ return -ENOMEM;
survivability->info = info;
@@ -230,9 +230,8 @@ void xe_survivability_mode_init(struct xe_device *xe)
/* Only log debug information and exit if it is a critical failure */
if (survivability->boot_status == CRITICAL_FAILURE) {
log_survivability_info(pdev);
- kfree(survivability->info);
- return;
+ return -ENXIO;
}
- enable_survivability_mode(pdev);
+ return enable_survivability_mode(pdev);
}
diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.h b/drivers/gpu/drm/xe/xe_survivability_mode.h
index f530507a22c6..f4df5f9025ce 100644
--- a/drivers/gpu/drm/xe/xe_survivability_mode.h
+++ b/drivers/gpu/drm/xe/xe_survivability_mode.h
@@ -10,9 +10,8 @@
struct xe_device;
-void xe_survivability_mode_init(struct xe_device *xe);
-void xe_survivability_mode_remove(struct xe_device *xe);
-bool xe_survivability_mode_enabled(struct xe_device *xe);
+int xe_survivability_mode_enable(struct xe_device *xe);
+bool xe_survivability_mode_is_enabled(struct xe_device *xe);
bool xe_survivability_mode_required(struct xe_device *xe);
#endif /* _XE_SURVIVABILITY_MODE_H_ */
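A hedged sketch of how the reworked API above is expected to be consumed at probe time; the real call site lives in xe_pci.c (not part of this excerpt) and the helper name below is made up:

/* Sketch only: enter survivability mode instead of a normal driver load. */
static int probe_handle_survivability(struct xe_device *xe)
{
	if (!xe_survivability_mode_required(xe))
		return 0;	/* proceed with the normal probe path */

	return xe_survivability_mode_enable(xe);
}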
diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
new file mode 100644
index 000000000000..516898e99b26
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_svm.c
@@ -0,0 +1,946 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2024 Intel Corporation
+ */
+
+#include "xe_bo.h"
+#include "xe_gt_tlb_invalidation.h"
+#include "xe_migrate.h"
+#include "xe_module.h"
+#include "xe_pt.h"
+#include "xe_svm.h"
+#include "xe_ttm_vram_mgr.h"
+#include "xe_vm.h"
+#include "xe_vm_types.h"
+
+static bool xe_svm_range_in_vram(struct xe_svm_range *range)
+{
+ /* Not reliable without notifier lock */
+ return range->base.flags.has_devmem_pages;
+}
+
+static bool xe_svm_range_has_vram_binding(struct xe_svm_range *range)
+{
+ /* Not reliable without notifier lock */
+ return xe_svm_range_in_vram(range) && range->tile_present;
+}
+
+static struct xe_vm *gpusvm_to_vm(struct drm_gpusvm *gpusvm)
+{
+ return container_of(gpusvm, struct xe_vm, svm.gpusvm);
+}
+
+static struct xe_vm *range_to_vm(struct drm_gpusvm_range *r)
+{
+ return gpusvm_to_vm(r->gpusvm);
+}
+
+static unsigned long xe_svm_range_start(struct xe_svm_range *range)
+{
+ return drm_gpusvm_range_start(&range->base);
+}
+
+static unsigned long xe_svm_range_end(struct xe_svm_range *range)
+{
+ return drm_gpusvm_range_end(&range->base);
+}
+
+static unsigned long xe_svm_range_size(struct xe_svm_range *range)
+{
+ return drm_gpusvm_range_size(&range->base);
+}
+
+#define range_debug(r__, operation__) \
+ vm_dbg(&range_to_vm(&(r__)->base)->xe->drm, \
+ "%s: asid=%u, gpusvm=%p, vram=%d,%d, seqno=%lu, " \
+ "start=0x%014lx, end=0x%014lx, size=%lu", \
+ (operation__), range_to_vm(&(r__)->base)->usm.asid, \
+ (r__)->base.gpusvm, \
+ xe_svm_range_in_vram((r__)) ? 1 : 0, \
+ xe_svm_range_has_vram_binding((r__)) ? 1 : 0, \
+ (r__)->base.notifier_seq, \
+ xe_svm_range_start((r__)), xe_svm_range_end((r__)), \
+ xe_svm_range_size((r__)))
+
+void xe_svm_range_debug(struct xe_svm_range *range, const char *operation)
+{
+ range_debug(range, operation);
+}
+
+static void *xe_svm_devm_owner(struct xe_device *xe)
+{
+ return xe;
+}
+
+static struct drm_gpusvm_range *
+xe_svm_range_alloc(struct drm_gpusvm *gpusvm)
+{
+ struct xe_svm_range *range;
+
+ range = kzalloc(sizeof(*range), GFP_KERNEL);
+ if (!range)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&range->garbage_collector_link);
+ xe_vm_get(gpusvm_to_vm(gpusvm));
+
+ return &range->base;
+}
+
+static void xe_svm_range_free(struct drm_gpusvm_range *range)
+{
+ xe_vm_put(range_to_vm(range));
+ kfree(range);
+}
+
+static struct xe_svm_range *to_xe_range(struct drm_gpusvm_range *r)
+{
+ return container_of(r, struct xe_svm_range, base);
+}
+
+static void
+xe_svm_garbage_collector_add_range(struct xe_vm *vm, struct xe_svm_range *range,
+ const struct mmu_notifier_range *mmu_range)
+{
+ struct xe_device *xe = vm->xe;
+
+ range_debug(range, "GARBAGE COLLECTOR ADD");
+
+ drm_gpusvm_range_set_unmapped(&range->base, mmu_range);
+
+ spin_lock(&vm->svm.garbage_collector.lock);
+ if (list_empty(&range->garbage_collector_link))
+ list_add_tail(&range->garbage_collector_link,
+ &vm->svm.garbage_collector.range_list);
+ spin_unlock(&vm->svm.garbage_collector.lock);
+
+ queue_work(xe_device_get_root_tile(xe)->primary_gt->usm.pf_wq,
+ &vm->svm.garbage_collector.work);
+}
+
+static u8
+xe_svm_range_notifier_event_begin(struct xe_vm *vm, struct drm_gpusvm_range *r,
+ const struct mmu_notifier_range *mmu_range,
+ u64 *adj_start, u64 *adj_end)
+{
+ struct xe_svm_range *range = to_xe_range(r);
+ struct xe_device *xe = vm->xe;
+ struct xe_tile *tile;
+ u8 tile_mask = 0;
+ u8 id;
+
+ xe_svm_assert_in_notifier(vm);
+
+ range_debug(range, "NOTIFIER");
+
+ /* Skip if already unmapped or if no binding exists */
+ if (range->base.flags.unmapped || !range->tile_present)
+ return 0;
+
+ range_debug(range, "NOTIFIER - EXECUTE");
+
+ /* Adjust invalidation to range boundaries */
+ *adj_start = min(xe_svm_range_start(range), mmu_range->start);
+ *adj_end = max(xe_svm_range_end(range), mmu_range->end);
+
+ /*
+ * XXX: Ideally would zap PTEs in one shot in xe_svm_invalidate but the
+ * invalidation code can't correctly cope with sparse ranges or
+ * invalidations spanning multiple ranges.
+ */
+ for_each_tile(tile, xe, id)
+ if (xe_pt_zap_ptes_range(tile, vm, range)) {
+ tile_mask |= BIT(id);
+ range->tile_invalidated |= BIT(id);
+ }
+
+ return tile_mask;
+}
+
+static void
+xe_svm_range_notifier_event_end(struct xe_vm *vm, struct drm_gpusvm_range *r,
+ const struct mmu_notifier_range *mmu_range)
+{
+ struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
+
+ xe_svm_assert_in_notifier(vm);
+
+ drm_gpusvm_range_unmap_pages(&vm->svm.gpusvm, r, &ctx);
+ if (!xe_vm_is_closed(vm) && mmu_range->event == MMU_NOTIFY_UNMAP)
+ xe_svm_garbage_collector_add_range(vm, to_xe_range(r),
+ mmu_range);
+}
+
+static void xe_svm_invalidate(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_notifier *notifier,
+ const struct mmu_notifier_range *mmu_range)
+{
+ struct xe_vm *vm = gpusvm_to_vm(gpusvm);
+ struct xe_device *xe = vm->xe;
+ struct xe_tile *tile;
+ struct drm_gpusvm_range *r, *first;
+ struct xe_gt_tlb_invalidation_fence
+ fence[XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE];
+ u64 adj_start = mmu_range->start, adj_end = mmu_range->end;
+ u8 tile_mask = 0;
+ u8 id;
+ u32 fence_id = 0;
+ long err;
+
+ xe_svm_assert_in_notifier(vm);
+
+ vm_dbg(&gpusvm_to_vm(gpusvm)->xe->drm,
+ "INVALIDATE: asid=%u, gpusvm=%p, seqno=%lu, start=0x%016lx, end=0x%016lx, event=%d",
+ vm->usm.asid, gpusvm, notifier->notifier.invalidate_seq,
+ mmu_range->start, mmu_range->end, mmu_range->event);
+
+ /* Adjust invalidation to notifier boundaries */
+ adj_start = max(drm_gpusvm_notifier_start(notifier), adj_start);
+ adj_end = min(drm_gpusvm_notifier_end(notifier), adj_end);
+
+ first = drm_gpusvm_range_find(notifier, adj_start, adj_end);
+ if (!first)
+ return;
+
+ /*
+ * PTs may be getting destroyed so it is not safe to touch these, but the
+ * PTs should be invalidated at this point in time. Regardless, we still
+ * need to ensure any dma mappings are unmapped here.
+ */
+ if (xe_vm_is_closed(vm))
+ goto range_notifier_event_end;
+
+ /*
+ * XXX: Less than ideal to always wait on VM's resv slots if an
+ * invalidation is not required. Could walk the range list twice to figure
+ * out if an invalidation is needed, but that is also not ideal.
+ */
+ err = dma_resv_wait_timeout(xe_vm_resv(vm),
+ DMA_RESV_USAGE_BOOKKEEP,
+ false, MAX_SCHEDULE_TIMEOUT);
+ XE_WARN_ON(err <= 0);
+
+ r = first;
+ drm_gpusvm_for_each_range(r, notifier, adj_start, adj_end)
+ tile_mask |= xe_svm_range_notifier_event_begin(vm, r, mmu_range,
+ &adj_start,
+ &adj_end);
+ if (!tile_mask)
+ goto range_notifier_event_end;
+
+ xe_device_wmb(xe);
+
+ for_each_tile(tile, xe, id) {
+ if (tile_mask & BIT(id)) {
+ int err;
+
+ xe_gt_tlb_invalidation_fence_init(tile->primary_gt,
+ &fence[fence_id], true);
+
+ err = xe_gt_tlb_invalidation_range(tile->primary_gt,
+ &fence[fence_id],
+ adj_start,
+ adj_end,
+ vm->usm.asid);
+ if (WARN_ON_ONCE(err < 0))
+ goto wait;
+ ++fence_id;
+
+ if (!tile->media_gt)
+ continue;
+
+ xe_gt_tlb_invalidation_fence_init(tile->media_gt,
+ &fence[fence_id], true);
+
+ err = xe_gt_tlb_invalidation_range(tile->media_gt,
+ &fence[fence_id],
+ adj_start,
+ adj_end,
+ vm->usm.asid);
+ if (WARN_ON_ONCE(err < 0))
+ goto wait;
+ ++fence_id;
+ }
+ }
+
+wait:
+ for (id = 0; id < fence_id; ++id)
+ xe_gt_tlb_invalidation_fence_wait(&fence[id]);
+
+range_notifier_event_end:
+ r = first;
+ drm_gpusvm_for_each_range(r, notifier, adj_start, adj_end)
+ xe_svm_range_notifier_event_end(vm, r, mmu_range);
+}
+
+static int __xe_svm_garbage_collector(struct xe_vm *vm,
+ struct xe_svm_range *range)
+{
+ struct dma_fence *fence;
+
+ range_debug(range, "GARBAGE COLLECTOR");
+
+ xe_vm_lock(vm, false);
+ fence = xe_vm_range_unbind(vm, range);
+ xe_vm_unlock(vm);
+ if (IS_ERR(fence))
+ return PTR_ERR(fence);
+ dma_fence_put(fence);
+
+ drm_gpusvm_range_remove(&vm->svm.gpusvm, &range->base);
+
+ return 0;
+}
+
+static int xe_svm_garbage_collector(struct xe_vm *vm)
+{
+ struct xe_svm_range *range;
+ int err;
+
+ lockdep_assert_held_write(&vm->lock);
+
+ if (xe_vm_is_closed_or_banned(vm))
+ return -ENOENT;
+
+ spin_lock(&vm->svm.garbage_collector.lock);
+ for (;;) {
+ range = list_first_entry_or_null(&vm->svm.garbage_collector.range_list,
+ typeof(*range),
+ garbage_collector_link);
+ if (!range)
+ break;
+
+ list_del(&range->garbage_collector_link);
+ spin_unlock(&vm->svm.garbage_collector.lock);
+
+ err = __xe_svm_garbage_collector(vm, range);
+ if (err) {
+ drm_warn(&vm->xe->drm,
+ "Garbage collection failed: %pe\n",
+ ERR_PTR(err));
+ xe_vm_kill(vm, true);
+ return err;
+ }
+
+ spin_lock(&vm->svm.garbage_collector.lock);
+ }
+ spin_unlock(&vm->svm.garbage_collector.lock);
+
+ return 0;
+}
+
+static void xe_svm_garbage_collector_work_func(struct work_struct *w)
+{
+ struct xe_vm *vm = container_of(w, struct xe_vm,
+ svm.garbage_collector.work);
+
+ down_write(&vm->lock);
+ xe_svm_garbage_collector(vm);
+ up_write(&vm->lock);
+}
+
+static struct xe_vram_region *page_to_vr(struct page *page)
+{
+ return container_of(page->pgmap, struct xe_vram_region, pagemap);
+}
+
+static struct xe_tile *vr_to_tile(struct xe_vram_region *vr)
+{
+ return container_of(vr, struct xe_tile, mem.vram);
+}
+
+static u64 xe_vram_region_page_to_dpa(struct xe_vram_region *vr,
+ struct page *page)
+{
+ u64 dpa;
+ struct xe_tile *tile = vr_to_tile(vr);
+ u64 pfn = page_to_pfn(page);
+ u64 offset;
+
+ xe_tile_assert(tile, is_device_private_page(page));
+ xe_tile_assert(tile, (pfn << PAGE_SHIFT) >= vr->hpa_base);
+
+ offset = (pfn << PAGE_SHIFT) - vr->hpa_base;
+ dpa = vr->dpa_base + offset;
+
+ return dpa;
+}
+
+enum xe_svm_copy_dir {
+ XE_SVM_COPY_TO_VRAM,
+ XE_SVM_COPY_TO_SRAM,
+};
+
+static int xe_svm_copy(struct page **pages, dma_addr_t *dma_addr,
+ unsigned long npages, const enum xe_svm_copy_dir dir)
+{
+ struct xe_vram_region *vr = NULL;
+ struct xe_tile *tile;
+ struct dma_fence *fence = NULL;
+ unsigned long i;
+#define XE_VRAM_ADDR_INVALID ~0x0ull
+ u64 vram_addr = XE_VRAM_ADDR_INVALID;
+ int err = 0, pos = 0;
+ bool sram = dir == XE_SVM_COPY_TO_SRAM;
+
+ /*
+ * This flow is complex: it locates physically contiguous device pages,
+ * derives the starting physical address, and performs a single GPU copy
+ * for every 8M chunk in a DMA address array. Both device pages and
+ * DMA addresses may be sparsely populated. If either is NULL, a copy is
+ * triggered based on the current search state. The last GPU copy is
+ * waited on to ensure all copies are complete.
+ */
+
+ for (i = 0; i < npages; ++i) {
+ struct page *spage = pages[i];
+ struct dma_fence *__fence;
+ u64 __vram_addr;
+ bool match = false, chunk, last;
+
+#define XE_MIGRATE_CHUNK_SIZE SZ_8M
+ chunk = (i - pos) == (XE_MIGRATE_CHUNK_SIZE / PAGE_SIZE);
+ last = (i + 1) == npages;
+
+ /* No CPU page and no device pages queued to copy */
+ if (!dma_addr[i] && vram_addr == XE_VRAM_ADDR_INVALID)
+ continue;
+
+ if (!vr && spage) {
+ vr = page_to_vr(spage);
+ tile = vr_to_tile(vr);
+ }
+ XE_WARN_ON(spage && page_to_vr(spage) != vr);
+
+ /*
+ * CPU page and device page valid, capture physical address on
+ * first device page, check if physical contiguous on subsequent
+ * device pages.
+ */
+ if (dma_addr[i] && spage) {
+ __vram_addr = xe_vram_region_page_to_dpa(vr, spage);
+ if (vram_addr == XE_VRAM_ADDR_INVALID) {
+ vram_addr = __vram_addr;
+ pos = i;
+ }
+
+ match = vram_addr + PAGE_SIZE * (i - pos) == __vram_addr;
+ }
+
+ /*
+ * Mismatched physical address, 8M copy chunk, or last page -
+ * trigger a copy.
+ */
+ if (!match || chunk || last) {
+ /*
+ * Extra page for first copy if last page and matching
+ * physical address.
+ */
+ int incr = (match && last) ? 1 : 0;
+
+ if (vram_addr != XE_VRAM_ADDR_INVALID) {
+ if (sram) {
+ vm_dbg(&tile->xe->drm,
+ "COPY TO SRAM - 0x%016llx -> 0x%016llx, NPAGES=%ld",
+ vram_addr, (u64)dma_addr[pos], i - pos + incr);
+ __fence = xe_migrate_from_vram(tile->migrate,
+ i - pos + incr,
+ vram_addr,
+ dma_addr + pos);
+ } else {
+ vm_dbg(&tile->xe->drm,
+ "COPY TO VRAM - 0x%016llx -> 0x%016llx, NPAGES=%ld",
+ (u64)dma_addr[pos], vram_addr, i - pos + incr);
+ __fence = xe_migrate_to_vram(tile->migrate,
+ i - pos + incr,
+ dma_addr + pos,
+ vram_addr);
+ }
+ if (IS_ERR(__fence)) {
+ err = PTR_ERR(__fence);
+ goto err_out;
+ }
+
+ dma_fence_put(fence);
+ fence = __fence;
+ }
+
+ /* Set up physical address of the next device page */
+ if (dma_addr[i] && spage) {
+ vram_addr = __vram_addr;
+ pos = i;
+ } else {
+ vram_addr = XE_VRAM_ADDR_INVALID;
+ }
+
+ /* Extra mismatched device page, copy it */
+ if (!match && last && vram_addr != XE_VRAM_ADDR_INVALID) {
+ if (sram) {
+ vm_dbg(&tile->xe->drm,
+ "COPY TO SRAM - 0x%016llx -> 0x%016llx, NPAGES=%d",
+ vram_addr, (u64)dma_addr[pos], 1);
+ __fence = xe_migrate_from_vram(tile->migrate, 1,
+ vram_addr,
+ dma_addr + pos);
+ } else {
+ vm_dbg(&tile->xe->drm,
+ "COPY TO VRAM - 0x%016llx -> 0x%016llx, NPAGES=%d",
+ (u64)dma_addr[pos], vram_addr, 1);
+ __fence = xe_migrate_to_vram(tile->migrate, 1,
+ dma_addr + pos,
+ vram_addr);
+ }
+ if (IS_ERR(__fence)) {
+ err = PTR_ERR(__fence);
+ goto err_out;
+ }
+
+ dma_fence_put(fence);
+ fence = __fence;
+ }
+ }
+ }
+
+err_out:
+ /* Wait for all copies to complete */
+ if (fence) {
+ dma_fence_wait(fence, false);
+ dma_fence_put(fence);
+ }
+
+ return err;
+#undef XE_MIGRATE_CHUNK_SIZE
+#undef XE_VRAM_ADDR_INVALID
+}
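
The search state above reduces to one rule: flush a GPU copy whenever the run of device pages stops being physically contiguous, hits the 8M chunk boundary, or reaches the last page. A minimal host-side C sketch of that rule, using a hypothetical pfns[] array in place of real device pages and a small CHUNK_PAGES standing in for SZ_8M / PAGE_SIZE:

#include <stdbool.h>
#include <stdio.h>

#define CHUNK_PAGES 4	/* stands in for SZ_8M / PAGE_SIZE */

static void flush(unsigned long start_pfn, unsigned long npages)
{
	printf("copy %lu page(s) starting at pfn %lu\n", npages, start_pfn);
}

int main(void)
{
	/* two contiguous runs (10..14 and 20..21) plus a lone page */
	unsigned long pfns[] = { 10, 11, 12, 13, 14, 20, 21, 40 };
	unsigned long n = sizeof(pfns) / sizeof(pfns[0]);
	unsigned long i, pos = 0;

	for (i = 1; i <= n; ++i) {
		bool contig = i < n && pfns[i] == pfns[pos] + (i - pos);
		bool chunk = (i - pos) == CHUNK_PAGES;

		if (!contig || chunk) {	/* mismatch, chunk boundary or end */
			flush(pfns[pos], i - pos);
			pos = i;
		}
	}
	return 0;
}

With the sample input this prints copies of 4, 1, 2 and 1 pages, mirroring how xe_svm_copy() batches contiguous VRAM into as few xe_migrate_to_vram()/xe_migrate_from_vram() calls as possible.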
+
+static int xe_svm_copy_to_devmem(struct page **pages, dma_addr_t *dma_addr,
+ unsigned long npages)
+{
+ return xe_svm_copy(pages, dma_addr, npages, XE_SVM_COPY_TO_VRAM);
+}
+
+static int xe_svm_copy_to_ram(struct page **pages, dma_addr_t *dma_addr,
+ unsigned long npages)
+{
+ return xe_svm_copy(pages, dma_addr, npages, XE_SVM_COPY_TO_SRAM);
+}
+
+static struct xe_bo *to_xe_bo(struct drm_gpusvm_devmem *devmem_allocation)
+{
+ return container_of(devmem_allocation, struct xe_bo, devmem_allocation);
+}
+
+static void xe_svm_devmem_release(struct drm_gpusvm_devmem *devmem_allocation)
+{
+ struct xe_bo *bo = to_xe_bo(devmem_allocation);
+
+ xe_bo_put_async(bo);
+}
+
+static u64 block_offset_to_pfn(struct xe_vram_region *vr, u64 offset)
+{
+ return PHYS_PFN(offset + vr->hpa_base);
+}
+
+static struct drm_buddy *tile_to_buddy(struct xe_tile *tile)
+{
+ return &tile->mem.vram.ttm.mm;
+}
+
+static int xe_svm_populate_devmem_pfn(struct drm_gpusvm_devmem *devmem_allocation,
+ unsigned long npages, unsigned long *pfn)
+{
+ struct xe_bo *bo = to_xe_bo(devmem_allocation);
+ struct ttm_resource *res = bo->ttm.resource;
+ struct list_head *blocks = &to_xe_ttm_vram_mgr_resource(res)->blocks;
+ struct drm_buddy_block *block;
+ int j = 0;
+
+ list_for_each_entry(block, blocks, link) {
+ struct xe_vram_region *vr = block->private;
+ struct xe_tile *tile = vr_to_tile(vr);
+ struct drm_buddy *buddy = tile_to_buddy(tile);
+ u64 block_pfn = block_offset_to_pfn(vr, drm_buddy_block_offset(block));
+ int i;
+
+ for (i = 0; i < drm_buddy_block_size(buddy, block) >> PAGE_SHIFT; ++i)
+ pfn[j++] = block_pfn + i;
+ }
+
+ return 0;
+}
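
Each buddy block backing the BO contributes a dense run of PFNs starting at PHYS_PFN(block_offset + vr->hpa_base), and the callback simply concatenates those runs. A host-side illustration with hypothetical block offsets, block sizes and hpa_base, assuming 4K pages:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long long hpa_base = 0x100000000ull;
	/* { block offset within VRAM, block size in bytes } */
	unsigned long long blocks[][2] = {
		{ 0x000000, 0x4000 },	/* 4 pages */
		{ 0x200000, 0x2000 },	/* 2 pages */
	};
	unsigned long long pfn[8];
	unsigned long long i;
	int b, j = 0;

	for (b = 0; b < 2; b++) {
		unsigned long long block_pfn = (blocks[b][0] + hpa_base) >> PAGE_SHIFT;

		for (i = 0; i < blocks[b][1] >> PAGE_SHIFT; i++)
			pfn[j++] = block_pfn + i;
	}

	for (b = 0; b < j; b++)
		printf("pfn[%d] = 0x%llx\n", b, pfn[b]);
	return 0;
}

The result is a flat array (0x100000..0x100003, then 0x100200..0x100201) in block order, which is the shape the populate_devmem_pfn() hook is asked to fill.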
+
+static const struct drm_gpusvm_devmem_ops gpusvm_devmem_ops = {
+ .devmem_release = xe_svm_devmem_release,
+ .populate_devmem_pfn = xe_svm_populate_devmem_pfn,
+ .copy_to_devmem = xe_svm_copy_to_devmem,
+ .copy_to_ram = xe_svm_copy_to_ram,
+};
+
+static const struct drm_gpusvm_ops gpusvm_ops = {
+ .range_alloc = xe_svm_range_alloc,
+ .range_free = xe_svm_range_free,
+ .invalidate = xe_svm_invalidate,
+};
+
+static const unsigned long fault_chunk_sizes[] = {
+ SZ_2M,
+ SZ_64K,
+ SZ_4K,
+};
+
+/**
+ * xe_svm_init() - SVM initialize
+ * @vm: The VM.
+ *
+ * Initialize SVM state which is embedded within the VM.
+ *
+ * Return: 0 on success, negative error code on error.
+ */
+int xe_svm_init(struct xe_vm *vm)
+{
+ int err;
+
+ spin_lock_init(&vm->svm.garbage_collector.lock);
+ INIT_LIST_HEAD(&vm->svm.garbage_collector.range_list);
+ INIT_WORK(&vm->svm.garbage_collector.work,
+ xe_svm_garbage_collector_work_func);
+
+ err = drm_gpusvm_init(&vm->svm.gpusvm, "Xe SVM", &vm->xe->drm,
+ current->mm, xe_svm_devm_owner(vm->xe), 0,
+ vm->size, xe_modparam.svm_notifier_size * SZ_1M,
+ &gpusvm_ops, fault_chunk_sizes,
+ ARRAY_SIZE(fault_chunk_sizes));
+ if (err)
+ return err;
+
+ drm_gpusvm_driver_set_lock(&vm->svm.gpusvm, &vm->lock);
+
+ return 0;
+}
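
A simplified lifetime sketch, mirroring the xe_vm.c hunks later in this patch: SVM state exists only for fault-mode VMs and is torn down in two steps, close to stop and flush activity, then fini to release the gpusvm core. The helper names below are hypothetical.

static int example_vm_svm_setup(struct xe_vm *vm, u32 flags)
{
	if (flags & XE_VM_FLAG_FAULT_MODE)
		return xe_svm_init(vm);	/* sized from vm->size and the notifier modparam */
	return 0;
}

static void example_vm_svm_teardown(struct xe_vm *vm)
{
	if (xe_vm_in_fault_mode(vm)) {
		xe_svm_close(vm);	/* flush the garbage collector */
		xe_svm_fini(vm);	/* drop drm_gpusvm state */
	}
}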
+
+/**
+ * xe_svm_close() - SVM close
+ * @vm: The VM.
+ *
+ * Close SVM state (i.e., stop and flush all SVM actions).
+ */
+void xe_svm_close(struct xe_vm *vm)
+{
+ xe_assert(vm->xe, xe_vm_is_closed(vm));
+ flush_work(&vm->svm.garbage_collector.work);
+}
+
+/**
+ * xe_svm_fini() - SVM finalize
+ * @vm: The VM.
+ *
+ * Finalize SVM state which is embedded within the VM.
+ */
+void xe_svm_fini(struct xe_vm *vm)
+{
+ xe_assert(vm->xe, xe_vm_is_closed(vm));
+
+ drm_gpusvm_fini(&vm->svm.gpusvm);
+}
+
+static bool xe_svm_range_is_valid(struct xe_svm_range *range,
+ struct xe_tile *tile)
+{
+ return (range->tile_present & ~range->tile_invalidated) & BIT(tile->id);
+}
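
A binding is usable on a tile only if that tile's bit is set in tile_present and clear in tile_invalidated. A worked example of the mask arithmetic, with hypothetical values:

/*
 * tile_present     = 0b11  -> bound on tile 0 and tile 1
 * tile_invalidated = 0b10  -> tile 1's binding has been invalidated
 *
 * (0b11 & ~0b10) & BIT(0) = 0b01 & 0b01 -> non-zero, valid on tile 0
 * (0b11 & ~0b10) & BIT(1) = 0b01 & 0b10 -> zero, tile 1 must fault and rebind
 */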
+
+static struct xe_vram_region *tile_to_vr(struct xe_tile *tile)
+{
+ return &tile->mem.vram;
+}
+
+static int xe_svm_alloc_vram(struct xe_vm *vm, struct xe_tile *tile,
+ struct xe_svm_range *range,
+ const struct drm_gpusvm_ctx *ctx)
+{
+ struct mm_struct *mm = vm->svm.gpusvm.mm;
+ struct xe_vram_region *vr = tile_to_vr(tile);
+ struct drm_buddy_block *block;
+ struct list_head *blocks;
+ struct xe_bo *bo;
+ ktime_t end = 0;
+ int err;
+
+ range_debug(range, "ALLOCATE VRAM");
+
+ if (!mmget_not_zero(mm))
+ return -EFAULT;
+ mmap_read_lock(mm);
+
+retry:
+ bo = xe_bo_create_locked(tile_to_xe(tile), NULL, NULL,
+ xe_svm_range_size(range),
+ ttm_bo_type_device,
+ XE_BO_FLAG_VRAM_IF_DGFX(tile) |
+ XE_BO_FLAG_CPU_ADDR_MIRROR);
+ if (IS_ERR(bo)) {
+ err = PTR_ERR(bo);
+ if (xe_vm_validate_should_retry(NULL, err, &end))
+ goto retry;
+ goto unlock;
+ }
+
+ drm_gpusvm_devmem_init(&bo->devmem_allocation,
+ vm->xe->drm.dev, mm,
+ &gpusvm_devmem_ops,
+ &tile->mem.vram.dpagemap,
+ xe_svm_range_size(range));
+
+ blocks = &to_xe_ttm_vram_mgr_resource(bo->ttm.resource)->blocks;
+ list_for_each_entry(block, blocks, link)
+ block->private = vr;
+
+ err = drm_gpusvm_migrate_to_devmem(&vm->svm.gpusvm, &range->base,
+ &bo->devmem_allocation, ctx);
+ xe_bo_unlock(bo);
+ if (err)
+ xe_bo_put(bo); /* Creation ref */
+
+unlock:
+ mmap_read_unlock(mm);
+ mmput(mm);
+
+ return err;
+}
+
+/**
+ * xe_svm_handle_pagefault() - SVM handle page fault
+ * @vm: The VM.
+ * @vma: The CPU address mirror VMA.
+ * @tile: The tile upon which the fault occurred.
+ * @fault_addr: The GPU fault address.
+ * @atomic: The fault atomic access bit.
+ *
+ * Create GPU bindings for a SVM page fault. Optionally migrate to device
+ * memory.
+ *
+ * Return: 0 on success, negative error code on error.
+ */
+int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
+ struct xe_tile *tile, u64 fault_addr,
+ bool atomic)
+{
+ struct drm_gpusvm_ctx ctx = {
+ .read_only = xe_vma_read_only(vma),
+ .devmem_possible = IS_DGFX(vm->xe) &&
+ IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR),
+ .check_pages_threshold = IS_DGFX(vm->xe) &&
+ IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ? SZ_64K : 0,
+ };
+ struct xe_svm_range *range;
+ struct drm_gpusvm_range *r;
+ struct drm_exec exec;
+ struct dma_fence *fence;
+ ktime_t end = 0;
+ int err;
+
+ lockdep_assert_held_write(&vm->lock);
+ xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(vma));
+
+retry:
+ /* Always process UNMAPs first so the view of SVM ranges is current */
+ err = xe_svm_garbage_collector(vm);
+ if (err)
+ return err;
+
+ r = drm_gpusvm_range_find_or_insert(&vm->svm.gpusvm, fault_addr,
+ xe_vma_start(vma), xe_vma_end(vma),
+ &ctx);
+ if (IS_ERR(r))
+ return PTR_ERR(r);
+
+ range = to_xe_range(r);
+ if (xe_svm_range_is_valid(range, tile))
+ return 0;
+
+ range_debug(range, "PAGE FAULT");
+
+ /* XXX: Add migration policy, for now migrate range once */
+ if (!range->skip_migrate && range->base.flags.migrate_devmem &&
+ xe_svm_range_size(range) >= SZ_64K) {
+ range->skip_migrate = true;
+
+ err = xe_svm_alloc_vram(vm, tile, range, &ctx);
+ if (err) {
+ drm_dbg(&vm->xe->drm,
+ "VRAM allocation failed, falling back to "
+ "retrying fault, asid=%u, errno=%pe\n",
+ vm->usm.asid, ERR_PTR(err));
+ goto retry;
+ }
+ }
+
+ range_debug(range, "GET PAGES");
+ err = drm_gpusvm_range_get_pages(&vm->svm.gpusvm, r, &ctx);
+ /* Corner case where CPU mappings have changed */
+ if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) {
+ if (err == -EOPNOTSUPP) {
+ range_debug(range, "PAGE FAULT - EVICT PAGES");
+ drm_gpusvm_range_evict(&vm->svm.gpusvm, &range->base);
+ }
+ drm_dbg(&vm->xe->drm,
+ "Get pages failed, falling back to retrying, asid=%u, gpusvm=%p, errno=%pe\n",
+ vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
+ range_debug(range, "PAGE FAULT - RETRY PAGES");
+ goto retry;
+ }
+ if (err) {
+ range_debug(range, "PAGE FAULT - FAIL PAGE COLLECT");
+ goto err_out;
+ }
+
+ range_debug(range, "PAGE FAULT - BIND");
+
+retry_bind:
+ drm_exec_init(&exec, 0, 0);
+ drm_exec_until_all_locked(&exec) {
+ err = drm_exec_lock_obj(&exec, vm->gpuvm.r_obj);
+ drm_exec_retry_on_contention(&exec);
+ if (err) {
+ drm_exec_fini(&exec);
+ goto err_out;
+ }
+
+ fence = xe_vm_range_rebind(vm, vma, range, BIT(tile->id));
+ if (IS_ERR(fence)) {
+ drm_exec_fini(&exec);
+ err = PTR_ERR(fence);
+ if (err == -EAGAIN) {
+ range_debug(range, "PAGE FAULT - RETRY BIND");
+ goto retry;
+ }
+ if (xe_vm_validate_should_retry(&exec, err, &end))
+ goto retry_bind;
+ goto err_out;
+ }
+ }
+ drm_exec_fini(&exec);
+
+ if (xe_modparam.always_migrate_to_vram)
+ range->skip_migrate = false;
+
+ dma_fence_wait(fence, false);
+ dma_fence_put(fence);
+
+err_out:
+
+ return err;
+}
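
A hypothetical caller outline (the real caller is the GT page-fault handler, which is outside this hunk): the function requires vm->lock held in write mode and a CPU address mirror VMA, and a non-zero return means the fault could not be serviced.

static int example_service_fault(struct xe_vm *vm, struct xe_vma *vma,
				 struct xe_tile *tile, u64 addr, bool atomic)
{
	int err;

	down_write(&vm->lock);
	err = xe_svm_handle_pagefault(vm, vma, tile, addr, atomic);
	up_write(&vm->lock);

	return err;
}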
+
+/**
+ * xe_svm_has_mapping() - SVM has mappings
+ * @vm: The VM.
+ * @start: Start address.
+ * @end: End address.
+ *
+ * Check if an address range has SVM mappings.
+ *
+ * Return: True if the address range has an SVM mapping, False otherwise
+ */
+bool xe_svm_has_mapping(struct xe_vm *vm, u64 start, u64 end)
+{
+ return drm_gpusvm_has_mapping(&vm->svm.gpusvm, start, end);
+}
+
+/**
+ * xe_svm_bo_evict() - SVM evict BO to system memory
+ * @bo: BO to evict
+ *
+ * Evict a BO to system memory. The GPU SVM layer ensures all device pages
+ * are evicted before returning.
+ *
+ * Return: 0 on success, negative error code otherwise
+ */
+int xe_svm_bo_evict(struct xe_bo *bo)
+{
+ return drm_gpusvm_evict_to_ram(&bo->devmem_allocation);
+}
+
+#if IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR)
+static struct drm_pagemap_device_addr
+xe_drm_pagemap_device_map(struct drm_pagemap *dpagemap,
+ struct device *dev,
+ struct page *page,
+ unsigned int order,
+ enum dma_data_direction dir)
+{
+ struct device *pgmap_dev = dpagemap->dev;
+ enum drm_interconnect_protocol prot;
+ dma_addr_t addr;
+
+ if (pgmap_dev == dev) {
+ addr = xe_vram_region_page_to_dpa(page_to_vr(page), page);
+ prot = XE_INTERCONNECT_VRAM;
+ } else {
+ addr = DMA_MAPPING_ERROR;
+ prot = 0;
+ }
+
+ return drm_pagemap_device_addr_encode(addr, prot, order, dir);
+}
+
+static const struct drm_pagemap_ops xe_drm_pagemap_ops = {
+ .device_map = xe_drm_pagemap_device_map,
+};
+
+/**
+ * xe_devm_add() - Remap and provide memmap backing for device memory
+ * @tile: tile that the memory region belongs to
+ * @vr: vram memory region to remap
+ *
+ * Remap device memory into the host physical address space and create
+ * struct pages to back it.
+ *
+ * Return: 0 on success, negative error code otherwise
+ */
+int xe_devm_add(struct xe_tile *tile, struct xe_vram_region *vr)
+{
+ struct xe_device *xe = tile_to_xe(tile);
+ struct device *dev = &to_pci_dev(xe->drm.dev)->dev;
+ struct resource *res;
+ void *addr;
+ int ret;
+
+ res = devm_request_free_mem_region(dev, &iomem_resource,
+ vr->usable_size);
+ if (IS_ERR(res)) {
+ ret = PTR_ERR(res);
+ return ret;
+ }
+
+ vr->pagemap.type = MEMORY_DEVICE_PRIVATE;
+ vr->pagemap.range.start = res->start;
+ vr->pagemap.range.end = res->end;
+ vr->pagemap.nr_range = 1;
+ vr->pagemap.ops = drm_gpusvm_pagemap_ops_get();
+ vr->pagemap.owner = xe_svm_devm_owner(xe);
+ addr = devm_memremap_pages(dev, &vr->pagemap);
+
+ vr->dpagemap.dev = dev;
+ vr->dpagemap.ops = &xe_drm_pagemap_ops;
+
+ if (IS_ERR(addr)) {
+ devm_release_mem_region(dev, res->start, resource_size(res));
+ ret = PTR_ERR(addr);
+ drm_err(&xe->drm, "Failed to remap tile %d memory, errno %pe\n",
+ tile->id, ERR_PTR(ret));
+ return ret;
+ }
+ vr->hpa_base = res->start;
+
+ drm_dbg(&xe->drm, "Added tile %d memory [%llx-%llx] to devm, remapped to %pr\n",
+ tile->id, vr->io_start, vr->io_start + vr->usable_size, res);
+ return 0;
+}
+#else
+int xe_devm_add(struct xe_tile *tile, struct xe_vram_region *vr)
+{
+ return 0;
+}
+#endif
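
The res->start value stored in vr->hpa_base is what ties the remapped struct pages back to VRAM: block_offset_to_pfn() above adds it when handing out PFNs, and the DPA conversion used by xe_svm_copy() is assumed here to be its mirror image, offset by a hypothetical dpa_base. A host-side round-trip illustration with made-up addresses:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long long hpa_base = 0x380000000ull;	/* res->start */
	unsigned long long dpa_base = 0x0ull;		/* hypothetical VRAM DPA origin */
	unsigned long long offset = 0x3000;		/* page offset within VRAM */

	unsigned long long pfn = (offset + hpa_base) >> PAGE_SHIFT;
	unsigned long long dpa = dpa_base + (pfn << PAGE_SHIFT) - hpa_base;

	printf("pfn=0x%llx dpa=0x%llx\n", pfn, dpa);	/* dpa recovers 0x3000 */
	return 0;
}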
diff --git a/drivers/gpu/drm/xe/xe_svm.h b/drivers/gpu/drm/xe/xe_svm.h
new file mode 100644
index 000000000000..e059590e5076
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_svm.h
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2024 Intel Corporation
+ */
+
+#ifndef _XE_SVM_H_
+#define _XE_SVM_H_
+
+#include <drm/drm_pagemap.h>
+#include <drm/drm_gpusvm.h>
+
+#define XE_INTERCONNECT_VRAM DRM_INTERCONNECT_DRIVER
+
+struct xe_bo;
+struct xe_vram_region;
+struct xe_tile;
+struct xe_vm;
+struct xe_vma;
+
+/** struct xe_svm_range - SVM range */
+struct xe_svm_range {
+ /** @base: base drm_gpusvm_range */
+ struct drm_gpusvm_range base;
+ /**
+ * @garbage_collector_link: Link into VM's garbage collect SVM range
+ * list. Protected by VM's garbage collect lock.
+ */
+ struct list_head garbage_collector_link;
+ /**
+ * @tile_present: Tile mask of tiles on which a binding is present for
+ * this range. Protected by the GPU SVM notifier lock.
+ */
+ u8 tile_present;
+ /**
+ * @tile_invalidated: Tile mask of tiles on which the binding has been
+ * invalidated for this range. Protected by the GPU SVM notifier lock.
+ */
+ u8 tile_invalidated;
+ /**
+ * @skip_migrate: Skip migration to VRAM, protected by GPU fault handler
+ * locking.
+ */
+ u8 skip_migrate :1;
+};
+
+#if IS_ENABLED(CONFIG_DRM_GPUSVM)
+/**
+ * xe_svm_range_pages_valid() - SVM range pages valid
+ * @range: SVM range
+ *
+ * Return: True if SVM range pages are valid, False otherwise
+ */
+static inline bool xe_svm_range_pages_valid(struct xe_svm_range *range)
+{
+ return drm_gpusvm_range_pages_valid(range->base.gpusvm, &range->base);
+}
+
+int xe_devm_add(struct xe_tile *tile, struct xe_vram_region *vr);
+
+int xe_svm_init(struct xe_vm *vm);
+
+void xe_svm_fini(struct xe_vm *vm);
+
+void xe_svm_close(struct xe_vm *vm);
+
+int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
+ struct xe_tile *tile, u64 fault_addr,
+ bool atomic);
+
+bool xe_svm_has_mapping(struct xe_vm *vm, u64 start, u64 end);
+
+int xe_svm_bo_evict(struct xe_bo *bo);
+
+void xe_svm_range_debug(struct xe_svm_range *range, const char *operation);
+#else
+static inline bool xe_svm_range_pages_valid(struct xe_svm_range *range)
+{
+ return false;
+}
+
+static inline
+int xe_devm_add(struct xe_tile *tile, struct xe_vram_region *vr)
+{
+ return 0;
+}
+
+static inline
+int xe_svm_init(struct xe_vm *vm)
+{
+ return 0;
+}
+
+static inline
+void xe_svm_fini(struct xe_vm *vm)
+{
+}
+
+static inline
+void xe_svm_close(struct xe_vm *vm)
+{
+}
+
+static inline
+int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
+ struct xe_tile *tile, u64 fault_addr,
+ bool atomic)
+{
+ return 0;
+}
+
+static inline
+bool xe_svm_has_mapping(struct xe_vm *vm, u64 start, u64 end)
+{
+ return false;
+}
+
+static inline
+int xe_svm_bo_evict(struct xe_bo *bo)
+{
+ return 0;
+}
+
+static inline
+void xe_svm_range_debug(struct xe_svm_range *range, const char *operation)
+{
+}
+#endif
+
+/**
+ * xe_svm_range_has_dma_mapping() - SVM range has DMA mapping
+ * @range: SVM range
+ *
+ * Return: True if SVM range has a DMA mapping, False otherwise
+ */
+static inline bool xe_svm_range_has_dma_mapping(struct xe_svm_range *range)
+{
+ lockdep_assert_held(&range->base.gpusvm->notifier_lock);
+ return range->base.flags.has_dma_mapping;
+}
+
+#define xe_svm_assert_in_notifier(vm__) \
+ lockdep_assert_held_write(&(vm__)->svm.gpusvm.notifier_lock)
+
+#define xe_svm_notifier_lock(vm__) \
+ drm_gpusvm_notifier_lock(&(vm__)->svm.gpusvm)
+
+#define xe_svm_notifier_unlock(vm__) \
+ drm_gpusvm_notifier_unlock(&(vm__)->svm.gpusvm)
+
+#endif
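
A minimal usage sketch for the helpers above, with a hypothetical caller name: any check of range->base.flags.has_dma_mapping must be made under the GPU SVM notifier lock, which the macros wrap.

static bool example_range_ready(struct xe_vm *vm, struct xe_svm_range *range)
{
	bool ready;

	xe_svm_notifier_lock(vm);
	ready = xe_svm_range_has_dma_mapping(range);
	xe_svm_notifier_unlock(vm);

	return ready;
}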
diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
index d29658ff4dd4..0771acbbf367 100644
--- a/drivers/gpu/drm/xe/xe_tile.c
+++ b/drivers/gpu/drm/xe/xe_tile.c
@@ -13,6 +13,7 @@
#include "xe_migrate.h"
#include "xe_pcode.h"
#include "xe_sa.h"
+#include "xe_svm.h"
#include "xe_tile.h"
#include "xe_tile_sysfs.h"
#include "xe_ttm_vram_mgr.h"
@@ -160,6 +161,7 @@ static int tile_ttm_mgr_init(struct xe_tile *tile)
*/
int xe_tile_init_noalloc(struct xe_tile *tile)
{
+ struct xe_device *xe = tile_to_xe(tile);
int err;
err = tile_ttm_mgr_init(tile);
@@ -168,6 +170,9 @@ int xe_tile_init_noalloc(struct xe_tile *tile)
xe_wa_apply_tile_workarounds(tile);
+ if (xe->info.has_usm && IS_DGFX(xe))
+ xe_devm_add(tile, &tile->mem.vram);
+
return xe_tile_sysfs_init(tile);
}
diff --git a/drivers/gpu/drm/xe/xe_trace.h b/drivers/gpu/drm/xe/xe_trace.h
index d5281de04d54..b4a3577df70c 100644
--- a/drivers/gpu/drm/xe/xe_trace.h
+++ b/drivers/gpu/drm/xe/xe_trace.h
@@ -427,6 +427,36 @@ DEFINE_EVENT(xe_pm_runtime, xe_pm_runtime_get_ioctl,
TP_ARGS(xe, caller)
);
+TRACE_EVENT(xe_eu_stall_data_read,
+ TP_PROTO(u8 slice, u8 subslice,
+ u32 read_ptr, u32 write_ptr,
+ size_t read_size, size_t total_size),
+ TP_ARGS(slice, subslice,
+ read_ptr, write_ptr,
+ read_size, total_size),
+
+ TP_STRUCT__entry(__field(u8, slice)
+ __field(u8, subslice)
+ __field(u32, read_ptr)
+ __field(u32, write_ptr)
+ __field(size_t, read_size)
+ __field(size_t, total_size)
+ ),
+
+ TP_fast_assign(__entry->slice = slice;
+ __entry->subslice = subslice;
+ __entry->read_ptr = read_ptr;
+ __entry->write_ptr = write_ptr;
+ __entry->read_size = read_size;
+ __entry->total_size = total_size;
+ ),
+
+ TP_printk("slice: %u subslice: %u read ptr: 0x%x write ptr: 0x%x read size: %zu total read size: %zu",
+ __entry->slice, __entry->subslice,
+ __entry->read_ptr, __entry->write_ptr,
+ __entry->read_size, __entry->total_size)
+);
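
TRACE_EVENT() above generates a trace_xe_eu_stall_data_read() helper matching TP_PROTO; a hypothetical emission point in the EU stall read path might look like:

static void example_report_read(u8 slice, u8 subslice, u32 read_ptr,
				u32 write_ptr, size_t read_size,
				size_t total_size)
{
	trace_xe_eu_stall_data_read(slice, subslice, read_ptr, write_ptr,
				    read_size, total_size);
}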
+
#endif
/* This part must be outside protection */
diff --git a/drivers/gpu/drm/xe/xe_trace_guc.h b/drivers/gpu/drm/xe/xe_trace_guc.h
index 23abdd55dc62..78949db9cfce 100644
--- a/drivers/gpu/drm/xe/xe_trace_guc.h
+++ b/drivers/gpu/drm/xe/xe_trace_guc.h
@@ -14,6 +14,7 @@
#include "xe_device_types.h"
#include "xe_guc_exec_queue_types.h"
+#include "xe_guc_engine_activity_types.h"
#define __dev_name_xe(xe) dev_name((xe)->drm.dev)
@@ -100,6 +101,54 @@ DEFINE_EVENT_PRINT(xe_guc_ctb, xe_guc_ctb_g2h,
);
+TRACE_EVENT(xe_guc_engine_activity,
+ TP_PROTO(struct xe_device *xe, struct engine_activity *ea, const char *name,
+ u16 instance),
+ TP_ARGS(xe, ea, name, instance),
+
+ TP_STRUCT__entry(
+ __string(dev, __dev_name_xe(xe))
+ __string(name, name)
+ __field(u32, global_change_num)
+ __field(u32, guc_tsc_frequency_hz)
+ __field(u32, lag_latency_usec)
+ __field(u16, instance)
+ __field(u16, change_num)
+ __field(u16, quanta_ratio)
+ __field(u32, last_update_tick)
+ __field(u64, active_ticks)
+ __field(u64, active)
+ __field(u64, total)
+ __field(u64, quanta)
+ __field(u64, last_cpu_ts)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ __assign_str(name);
+ __entry->global_change_num = ea->metadata.global_change_num;
+ __entry->guc_tsc_frequency_hz = ea->metadata.guc_tsc_frequency_hz;
+ __entry->lag_latency_usec = ea->metadata.lag_latency_usec;
+ __entry->instance = instance;
+ __entry->change_num = ea->activity.change_num;
+ __entry->quanta_ratio = ea->activity.quanta_ratio;
+ __entry->last_update_tick = ea->activity.last_update_tick;
+ __entry->active_ticks = ea->activity.active_ticks;
+ __entry->active = ea->active;
+ __entry->total = ea->total;
+ __entry->quanta = ea->quanta;
+ __entry->last_cpu_ts = ea->last_cpu_ts;
+ ),
+
+ TP_printk("dev=%s engine %s:%d Active=%llu, quanta=%llu, last_cpu_ts=%llu\n"
+ "Activity metadata: global_change_num=%u, guc_tsc_frequency_hz=%u lag_latency_usec=%u\n"
+ "Activity data: change_num=%u, quanta_ratio=0x%x, last_update_tick=%u, active_ticks=%llu\n",
+ __get_str(dev), __get_str(name), __entry->instance,
+ (__entry->active + __entry->total), __entry->quanta, __entry->last_cpu_ts,
+ __entry->global_change_num, __entry->guc_tsc_frequency_hz,
+ __entry->lag_latency_usec, __entry->change_num, __entry->quanta_ratio,
+ __entry->last_update_tick, __entry->active_ticks)
+);
#endif
/* This part must be outside protection */
diff --git a/drivers/gpu/drm/xe/xe_tuning.c b/drivers/gpu/drm/xe/xe_tuning.c
index d449de0fb6ec..77bc958f5a42 100644
--- a/drivers/gpu/drm/xe/xe_tuning.c
+++ b/drivers/gpu/drm/xe/xe_tuning.c
@@ -7,6 +7,8 @@
#include <kunit/visibility.h>
+#include <drm/drm_managed.h>
+
#include "regs/xe_gt_regs.h"
#include "xe_gt_types.h"
#include "xe_platform_types.h"
@@ -88,6 +90,11 @@ static const struct xe_rtp_entry_sr gt_tunings[] = {
};
static const struct xe_rtp_entry_sr engine_tunings[] = {
+ { XE_RTP_NAME("Tuning: L3 Hashing Mask"),
+ XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1210),
+ FUNC(xe_rtp_match_first_render_or_compute)),
+ XE_RTP_ACTIONS(CLR(XELP_GARBCNTL, XELP_BUS_HASH_CTL_BIT_EXC))
+ },
{ XE_RTP_NAME("Tuning: Set Indirect State Override"),
XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1274),
ENGINE_CLASS(RENDER)),
@@ -97,14 +104,6 @@ static const struct xe_rtp_entry_sr engine_tunings[] = {
};
static const struct xe_rtp_entry_sr lrc_tunings[] = {
- { XE_RTP_NAME("Tuning: ganged timer, also known as 16011163337"),
- XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1210), ENGINE_CLASS(RENDER)),
- /* read verification is ignored due to 1608008084. */
- XE_RTP_ACTIONS(FIELD_SET_NO_READ_MASK(FF_MODE2,
- FF_MODE2_GS_TIMER_MASK,
- FF_MODE2_GS_TIMER_224))
- },
-
/* DG2 */
{ XE_RTP_NAME("Tuning: L3 cache"),
@@ -143,10 +142,44 @@ static const struct xe_rtp_entry_sr lrc_tunings[] = {
{}
};
+/**
+ * xe_tuning_init - initialize tuning bookkeeping for a GT
+ * @gt: GT instance to initialize
+ *
+ * Returns 0 for success, negative error code otherwise.
+ */
+int xe_tuning_init(struct xe_gt *gt)
+{
+ struct xe_device *xe = gt_to_xe(gt);
+ size_t n_lrc, n_engine, n_gt, total;
+ unsigned long *p;
+
+ n_gt = BITS_TO_LONGS(ARRAY_SIZE(gt_tunings));
+ n_engine = BITS_TO_LONGS(ARRAY_SIZE(engine_tunings));
+ n_lrc = BITS_TO_LONGS(ARRAY_SIZE(lrc_tunings));
+ total = n_gt + n_engine + n_lrc;
+
+ p = drmm_kzalloc(&xe->drm, sizeof(*p) * total, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ gt->tuning_active.gt = p;
+ p += n_gt;
+ gt->tuning_active.engine = p;
+ p += n_engine;
+ gt->tuning_active.lrc = p;
+
+ return 0;
+}
+ALLOW_ERROR_INJECTION(xe_tuning_init, ERRNO); /* See xe_pci_probe() */
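
The single drmm_kzalloc() above is carved into three bitmaps, each rounded up to whole longs. A host-side sizing illustration, assuming 64-bit longs and hypothetical table sizes:

#include <stddef.h>
#include <stdio.h>

#define BITS_PER_LONG 64
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
	size_t n_gt = BITS_TO_LONGS(9);		/* e.g. 9 gt_tunings */
	size_t n_engine = BITS_TO_LONGS(2);	/* e.g. 2 engine_tunings */
	size_t n_lrc = BITS_TO_LONGS(70);	/* e.g. 70 lrc_tunings */

	printf("longs: gt=%zu engine=%zu lrc=%zu total=%zu\n",
	       n_gt, n_engine, n_lrc, n_gt + n_engine + n_lrc);
	return 0;
}

With these sizes the allocation is four longs: one each for the GT and engine bitmaps and two for the LRC bitmap, handed out back to back from the same block.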
+
void xe_tuning_process_gt(struct xe_gt *gt)
{
struct xe_rtp_process_ctx ctx = XE_RTP_PROCESS_CTX_INITIALIZER(gt);
+ xe_rtp_process_ctx_enable_active_tracking(&ctx,
+ gt->tuning_active.gt,
+ ARRAY_SIZE(gt_tunings));
xe_rtp_process_to_sr(&ctx, gt_tunings, &gt->reg_sr);
}
EXPORT_SYMBOL_IF_KUNIT(xe_tuning_process_gt);
@@ -155,6 +188,9 @@ void xe_tuning_process_engine(struct xe_hw_engine *hwe)
{
struct xe_rtp_process_ctx ctx = XE_RTP_PROCESS_CTX_INITIALIZER(hwe);
+ xe_rtp_process_ctx_enable_active_tracking(&ctx,
+ hwe->gt->tuning_active.engine,
+ ARRAY_SIZE(engine_tunings));
xe_rtp_process_to_sr(&ctx, engine_tunings, &hwe->reg_sr);
}
EXPORT_SYMBOL_IF_KUNIT(xe_tuning_process_engine);
@@ -171,5 +207,25 @@ void xe_tuning_process_lrc(struct xe_hw_engine *hwe)
{
struct xe_rtp_process_ctx ctx = XE_RTP_PROCESS_CTX_INITIALIZER(hwe);
+ xe_rtp_process_ctx_enable_active_tracking(&ctx,
+ hwe->gt->tuning_active.lrc,
+ ARRAY_SIZE(lrc_tunings));
xe_rtp_process_to_sr(&ctx, lrc_tunings, &hwe->reg_lrc);
}
+
+void xe_tuning_dump(struct xe_gt *gt, struct drm_printer *p)
+{
+ size_t idx;
+
+ drm_printf(p, "GT Tunings\n");
+ for_each_set_bit(idx, gt->tuning_active.gt, ARRAY_SIZE(gt_tunings))
+ drm_printf_indent(p, 1, "%s\n", gt_tunings[idx].name);
+
+ drm_printf(p, "\nEngine Tunings\n");
+ for_each_set_bit(idx, gt->tuning_active.engine, ARRAY_SIZE(engine_tunings))
+ drm_printf_indent(p, 1, "%s\n", engine_tunings[idx].name);
+
+ drm_printf(p, "\nLRC Tunings\n");
+ for_each_set_bit(idx, gt->tuning_active.lrc, ARRAY_SIZE(lrc_tunings))
+ drm_printf_indent(p, 1, "%s\n", lrc_tunings[idx].name);
+}
diff --git a/drivers/gpu/drm/xe/xe_tuning.h b/drivers/gpu/drm/xe/xe_tuning.h
index 4f9c3ac3b516..dd0d3ccc9c65 100644
--- a/drivers/gpu/drm/xe/xe_tuning.h
+++ b/drivers/gpu/drm/xe/xe_tuning.h
@@ -6,11 +6,14 @@
#ifndef _XE_TUNING_
#define _XE_TUNING_
+struct drm_printer;
struct xe_gt;
struct xe_hw_engine;
+int xe_tuning_init(struct xe_gt *gt);
void xe_tuning_process_gt(struct xe_gt *gt);
void xe_tuning_process_engine(struct xe_hw_engine *hwe);
void xe_tuning_process_lrc(struct xe_hw_engine *hwe);
+void xe_tuning_dump(struct xe_gt *gt, struct drm_printer *p);
#endif
diff --git a/drivers/gpu/drm/xe/xe_uc.c b/drivers/gpu/drm/xe/xe_uc.c
index d8167e818280..c14bd2282044 100644
--- a/drivers/gpu/drm/xe/xe_uc.c
+++ b/drivers/gpu/drm/xe/xe_uc.c
@@ -14,6 +14,7 @@
#include "xe_gt_sriov_vf.h"
#include "xe_guc.h"
#include "xe_guc_pc.h"
+#include "xe_guc_engine_activity.h"
#include "xe_huc.h"
#include "xe_sriov.h"
#include "xe_uc_fw.h"
@@ -210,6 +211,8 @@ int xe_uc_init_hw(struct xe_uc *uc)
if (ret)
return ret;
+ xe_guc_engine_activity_enable_stats(&uc->guc);
+
/* We don't fail the driver load if HuC fails to auth, but let's warn */
ret = xe_huc_auth(&uc->huc, XE_HUC_AUTH_VIA_GUC);
xe_gt_assert(uc_to_gt(uc), !ret);
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index d664f2e418b2..22a26aff3a6e 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -8,6 +8,7 @@
#include <linux/dma-fence-array.h>
#include <linux/nospec.h>
+#include <drm/drm_drv.h>
#include <drm/drm_exec.h>
#include <drm/drm_print.h>
#include <drm/ttm/ttm_tt.h>
@@ -35,6 +36,7 @@
#include "xe_pt.h"
#include "xe_pxp.h"
#include "xe_res_cursor.h"
+#include "xe_svm.h"
#include "xe_sync.h"
#include "xe_trace_bo.h"
#include "xe_wa.h"
@@ -270,6 +272,7 @@ out_up_write:
return err;
}
+ALLOW_ERROR_INJECTION(xe_vm_add_compute_exec_queue, ERRNO);
/**
* xe_vm_remove_compute_exec_queue() - Remove compute exec queue from VM
@@ -580,51 +583,26 @@ out_unlock_outer:
trace_xe_vm_rebind_worker_exit(vm);
}
-static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
- const struct mmu_notifier_range *range,
- unsigned long cur_seq)
+static void __vma_userptr_invalidate(struct xe_vm *vm, struct xe_userptr_vma *uvma)
{
- struct xe_userptr *userptr = container_of(mni, typeof(*userptr), notifier);
- struct xe_userptr_vma *uvma = container_of(userptr, typeof(*uvma), userptr);
+ struct xe_userptr *userptr = &uvma->userptr;
struct xe_vma *vma = &uvma->vma;
- struct xe_vm *vm = xe_vma_vm(vma);
struct dma_resv_iter cursor;
struct dma_fence *fence;
long err;
- xe_assert(vm->xe, xe_vma_is_userptr(vma));
- trace_xe_vma_userptr_invalidate(vma);
-
- if (!mmu_notifier_range_blockable(range))
- return false;
-
- vm_dbg(&xe_vma_vm(vma)->xe->drm,
- "NOTIFIER: addr=0x%016llx, range=0x%016llx",
- xe_vma_start(vma), xe_vma_size(vma));
-
- down_write(&vm->userptr.notifier_lock);
- mmu_interval_set_seq(mni, cur_seq);
-
- /* No need to stop gpu access if the userptr is not yet bound. */
- if (!userptr->initial_bind) {
- up_write(&vm->userptr.notifier_lock);
- return true;
- }
-
/*
* Tell exec and rebind worker they need to repin and rebind this
* userptr.
*/
if (!xe_vm_in_fault_mode(vm) &&
- !(vma->gpuva.flags & XE_VMA_DESTROYED) && vma->tile_present) {
+ !(vma->gpuva.flags & XE_VMA_DESTROYED)) {
spin_lock(&vm->userptr.invalidated_lock);
list_move_tail(&userptr->invalidate_link,
&vm->userptr.invalidated);
spin_unlock(&vm->userptr.invalidated_lock);
}
- up_write(&vm->userptr.notifier_lock);
-
/*
* Preempt fences turn into schedule disables, pipeline these.
* Note that even in fault mode, we need to wait for binds and
@@ -642,11 +620,37 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
false, MAX_SCHEDULE_TIMEOUT);
XE_WARN_ON(err <= 0);
- if (xe_vm_in_fault_mode(vm)) {
+ if (xe_vm_in_fault_mode(vm) && userptr->initial_bind) {
err = xe_vm_invalidate_vma(vma);
XE_WARN_ON(err);
}
+ xe_hmm_userptr_unmap(uvma);
+}
+
+static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq)
+{
+ struct xe_userptr_vma *uvma = container_of(mni, typeof(*uvma), userptr.notifier);
+ struct xe_vma *vma = &uvma->vma;
+ struct xe_vm *vm = xe_vma_vm(vma);
+
+ xe_assert(vm->xe, xe_vma_is_userptr(vma));
+ trace_xe_vma_userptr_invalidate(vma);
+
+ if (!mmu_notifier_range_blockable(range))
+ return false;
+
+ vm_dbg(&xe_vma_vm(vma)->xe->drm,
+ "NOTIFIER: addr=0x%016llx, range=0x%016llx",
+ xe_vma_start(vma), xe_vma_size(vma));
+
+ down_write(&vm->userptr.notifier_lock);
+ mmu_interval_set_seq(mni, cur_seq);
+
+ __vma_userptr_invalidate(vm, uvma);
+ up_write(&vm->userptr.notifier_lock);
trace_xe_vma_userptr_invalidate_complete(vma);
return true;
@@ -656,31 +660,71 @@ static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = {
.invalidate = vma_userptr_invalidate,
};
+#if IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT)
+/**
+ * xe_vma_userptr_force_invalidate() - force invalidate a userptr
+ * @uvma: The userptr vma to invalidate
+ *
+ * Perform a forced userptr invalidation for testing purposes.
+ */
+void xe_vma_userptr_force_invalidate(struct xe_userptr_vma *uvma)
+{
+ struct xe_vm *vm = xe_vma_vm(&uvma->vma);
+
+ /* Protect against concurrent userptr pinning */
+ lockdep_assert_held(&vm->lock);
+ /* Protect against concurrent notifiers */
+ lockdep_assert_held(&vm->userptr.notifier_lock);
+ /*
+ * Protect against concurrent instances of this function and
+ * the critical exec sections
+ */
+ xe_vm_assert_held(vm);
+
+ if (!mmu_interval_read_retry(&uvma->userptr.notifier,
+ uvma->userptr.notifier_seq))
+ uvma->userptr.notifier_seq -= 2;
+ __vma_userptr_invalidate(vm, uvma);
+}
+#endif
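
The notifier sequence is only nudged backwards when it is still current, so a later mmu_interval_read_retry() reports the mapping as stale and forces a repin. A hypothetical test hook, assuming the caller already holds the locks named in the asserts above:

static void example_force_inval(struct xe_vm *vm, struct xe_userptr_vma *uvma)
{
	/* vm->lock, the VM's dma-resv and the notifier lock must be held */
	lockdep_assert_held(&vm->lock);
	lockdep_assert_held(&vm->userptr.notifier_lock);
	xe_vm_assert_held(vm);

	xe_vma_userptr_force_invalidate(uvma);
}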
+
int xe_vm_userptr_pin(struct xe_vm *vm)
{
struct xe_userptr_vma *uvma, *next;
int err = 0;
- LIST_HEAD(tmp_evict);
xe_assert(vm->xe, !xe_vm_in_fault_mode(vm));
lockdep_assert_held_write(&vm->lock);
/* Collect invalidated userptrs */
spin_lock(&vm->userptr.invalidated_lock);
+ xe_assert(vm->xe, list_empty(&vm->userptr.repin_list));
list_for_each_entry_safe(uvma, next, &vm->userptr.invalidated,
userptr.invalidate_link) {
list_del_init(&uvma->userptr.invalidate_link);
- list_move_tail(&uvma->userptr.repin_link,
- &vm->userptr.repin_list);
+ list_add_tail(&uvma->userptr.repin_link,
+ &vm->userptr.repin_list);
}
spin_unlock(&vm->userptr.invalidated_lock);
- /* Pin and move to temporary list */
+ /* Pin and move to bind list */
list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list,
userptr.repin_link) {
err = xe_vma_userptr_pin_pages(uvma);
if (err == -EFAULT) {
list_del_init(&uvma->userptr.repin_link);
+ /*
+ * We might have already done the pin once, but then had to
+ * retry before the re-bind happened due to some other
+ * condition in the caller. In the meantime the userptr got
+ * invalidated by the notifier, so we need to revalidate here,
+ * and this time we hit the EFAULT. In that case make sure we
+ * remove ourselves from the rebind list to avoid going down
+ * in flames.
+ */
+ if (!list_empty(&uvma->vma.combined_links.rebind))
+ list_del_init(&uvma->vma.combined_links.rebind);
/* Wait for pending binds */
xe_vm_lock(vm, false);
@@ -691,10 +735,10 @@ int xe_vm_userptr_pin(struct xe_vm *vm)
err = xe_vm_invalidate_vma(&uvma->vma);
xe_vm_unlock(vm);
if (err)
- return err;
+ break;
} else {
- if (err < 0)
- return err;
+ if (err)
+ break;
list_del_init(&uvma->userptr.repin_link);
list_move_tail(&uvma->vma.combined_links.rebind,
@@ -702,7 +746,19 @@ int xe_vm_userptr_pin(struct xe_vm *vm)
}
}
- return 0;
+ if (err) {
+ down_write(&vm->userptr.notifier_lock);
+ spin_lock(&vm->userptr.invalidated_lock);
+ list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list,
+ userptr.repin_link) {
+ list_del_init(&uvma->userptr.repin_link);
+ list_move_tail(&uvma->userptr.invalidate_link,
+ &vm->userptr.invalidated);
+ }
+ spin_unlock(&vm->userptr.invalidated_lock);
+ up_write(&vm->userptr.notifier_lock);
+ }
+ return err;
}
/**
@@ -894,6 +950,179 @@ free_ops:
return fence;
}
+static void xe_vm_populate_range_rebind(struct xe_vma_op *op,
+ struct xe_vma *vma,
+ struct xe_svm_range *range,
+ u8 tile_mask)
+{
+ INIT_LIST_HEAD(&op->link);
+ op->tile_mask = tile_mask;
+ op->base.op = DRM_GPUVA_OP_DRIVER;
+ op->subop = XE_VMA_SUBOP_MAP_RANGE;
+ op->map_range.vma = vma;
+ op->map_range.range = range;
+}
+
+static int
+xe_vm_ops_add_range_rebind(struct xe_vma_ops *vops,
+ struct xe_vma *vma,
+ struct xe_svm_range *range,
+ u8 tile_mask)
+{
+ struct xe_vma_op *op;
+
+ op = kzalloc(sizeof(*op), GFP_KERNEL);
+ if (!op)
+ return -ENOMEM;
+
+ xe_vm_populate_range_rebind(op, vma, range, tile_mask);
+ list_add_tail(&op->link, &vops->list);
+ xe_vma_ops_incr_pt_update_ops(vops, tile_mask);
+
+ return 0;
+}
+
+/**
+ * xe_vm_range_rebind() - VM range (re)bind
+ * @vm: The VM which the range belongs to.
+ * @vma: The VMA which the range belongs to.
+ * @range: SVM range to rebind.
+ * @tile_mask: Tile mask to bind the range to.
+ *
+ * (Re)bind an SVM range, setting up GPU page tables for the range.
+ *
+ * Return: dma fence for the rebind to signal completion on success, ERR_PTR
+ * on failure
+ */
+struct dma_fence *xe_vm_range_rebind(struct xe_vm *vm,
+ struct xe_vma *vma,
+ struct xe_svm_range *range,
+ u8 tile_mask)
+{
+ struct dma_fence *fence = NULL;
+ struct xe_vma_ops vops;
+ struct xe_vma_op *op, *next_op;
+ struct xe_tile *tile;
+ u8 id;
+ int err;
+
+ lockdep_assert_held(&vm->lock);
+ xe_vm_assert_held(vm);
+ xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
+ xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(vma));
+
+ xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
+ for_each_tile(tile, vm->xe, id) {
+ vops.pt_update_ops[id].wait_vm_bookkeep = true;
+ vops.pt_update_ops[tile->id].q =
+ xe_tile_migrate_exec_queue(tile);
+ }
+
+ err = xe_vm_ops_add_range_rebind(&vops, vma, range, tile_mask);
+ if (err)
+ return ERR_PTR(err);
+
+ err = xe_vma_ops_alloc(&vops, false);
+ if (err) {
+ fence = ERR_PTR(err);
+ goto free_ops;
+ }
+
+ fence = ops_execute(vm, &vops);
+
+free_ops:
+ list_for_each_entry_safe(op, next_op, &vops.list, link) {
+ list_del(&op->link);
+ kfree(op);
+ }
+ xe_vma_ops_fini(&vops);
+
+ return fence;
+}
+
+static void xe_vm_populate_range_unbind(struct xe_vma_op *op,
+ struct xe_svm_range *range)
+{
+ INIT_LIST_HEAD(&op->link);
+ op->tile_mask = range->tile_present;
+ op->base.op = DRM_GPUVA_OP_DRIVER;
+ op->subop = XE_VMA_SUBOP_UNMAP_RANGE;
+ op->unmap_range.range = range;
+}
+
+static int
+xe_vm_ops_add_range_unbind(struct xe_vma_ops *vops,
+ struct xe_svm_range *range)
+{
+ struct xe_vma_op *op;
+
+ op = kzalloc(sizeof(*op), GFP_KERNEL);
+ if (!op)
+ return -ENOMEM;
+
+ xe_vm_populate_range_unbind(op, range);
+ list_add_tail(&op->link, &vops->list);
+ xe_vma_ops_incr_pt_update_ops(vops, range->tile_present);
+
+ return 0;
+}
+
+/**
+ * xe_vm_range_unbind() - VM range unbind
+ * @vm: The VM which the range belongs to.
+ * @range: SVM range to unbind.
+ *
+ * Unbind an SVM range, removing the GPU page tables for the range.
+ *
+ * Return: dma fence for the unbind to signal completion on success, ERR_PTR
+ * on failure
+ */
+struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm,
+ struct xe_svm_range *range)
+{
+ struct dma_fence *fence = NULL;
+ struct xe_vma_ops vops;
+ struct xe_vma_op *op, *next_op;
+ struct xe_tile *tile;
+ u8 id;
+ int err;
+
+ lockdep_assert_held(&vm->lock);
+ xe_vm_assert_held(vm);
+ xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
+
+ if (!range->tile_present)
+ return dma_fence_get_stub();
+
+ xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
+ for_each_tile(tile, vm->xe, id) {
+ vops.pt_update_ops[id].wait_vm_bookkeep = true;
+ vops.pt_update_ops[tile->id].q =
+ xe_tile_migrate_exec_queue(tile);
+ }
+
+ err = xe_vm_ops_add_range_unbind(&vops, range);
+ if (err)
+ return ERR_PTR(err);
+
+ err = xe_vma_ops_alloc(&vops, false);
+ if (err) {
+ fence = ERR_PTR(err);
+ goto free_ops;
+ }
+
+ fence = ops_execute(vm, &vops);
+
+free_ops:
+ list_for_each_entry_safe(op, next_op, &vops.list, link) {
+ list_del(&op->link);
+ kfree(op);
+ }
+ xe_vma_ops_fini(&vops);
+
+ return fence;
+}
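
A hypothetical consumption pattern for the returned fence: whether the range was bound (a real unbind fence) or not (the stub fence), the caller waits and drops its reference the same way.

static int example_unbind_range(struct xe_vm *vm, struct xe_svm_range *range)
{
	struct dma_fence *fence;

	fence = xe_vm_range_unbind(vm, range);
	if (IS_ERR(fence))
		return PTR_ERR(fence);

	dma_fence_wait(fence, false);
	dma_fence_put(fence);

	return 0;
}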
+
static void xe_vma_free(struct xe_vma *vma)
{
if (xe_vma_is_userptr(vma))
@@ -902,9 +1131,10 @@ static void xe_vma_free(struct xe_vma *vma)
kfree(vma);
}
-#define VMA_CREATE_FLAG_READ_ONLY BIT(0)
-#define VMA_CREATE_FLAG_IS_NULL BIT(1)
-#define VMA_CREATE_FLAG_DUMPABLE BIT(2)
+#define VMA_CREATE_FLAG_READ_ONLY BIT(0)
+#define VMA_CREATE_FLAG_IS_NULL BIT(1)
+#define VMA_CREATE_FLAG_DUMPABLE BIT(2)
+#define VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR BIT(3)
static struct xe_vma *xe_vma_create(struct xe_vm *vm,
struct xe_bo *bo,
@@ -918,6 +1148,8 @@ static struct xe_vma *xe_vma_create(struct xe_vm *vm,
bool read_only = (flags & VMA_CREATE_FLAG_READ_ONLY);
bool is_null = (flags & VMA_CREATE_FLAG_IS_NULL);
bool dumpable = (flags & VMA_CREATE_FLAG_DUMPABLE);
+ bool is_cpu_addr_mirror =
+ (flags & VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR);
xe_assert(vm->xe, start < end);
xe_assert(vm->xe, end < vm->size);
@@ -926,7 +1158,7 @@ static struct xe_vma *xe_vma_create(struct xe_vm *vm,
* Allocate and ensure that the xe_vma_is_userptr() return
* matches what was allocated.
*/
- if (!bo && !is_null) {
+ if (!bo && !is_null && !is_cpu_addr_mirror) {
struct xe_userptr_vma *uvma = kzalloc(sizeof(*uvma), GFP_KERNEL);
if (!uvma)
@@ -938,6 +1170,8 @@ static struct xe_vma *xe_vma_create(struct xe_vm *vm,
if (!vma)
return ERR_PTR(-ENOMEM);
+ if (is_cpu_addr_mirror)
+ vma->gpuva.flags |= XE_VMA_SYSTEM_ALLOCATOR;
if (is_null)
vma->gpuva.flags |= DRM_GPUVA_SPARSE;
if (bo)
@@ -980,7 +1214,7 @@ static struct xe_vma *xe_vma_create(struct xe_vm *vm,
drm_gpuva_link(&vma->gpuva, vm_bo);
drm_gpuvm_bo_put(vm_bo);
} else /* userptr or null */ {
- if (!is_null) {
+ if (!is_null && !is_cpu_addr_mirror) {
struct xe_userptr *userptr = &to_userptr_vma(vma)->userptr;
u64 size = end - start + 1;
int err;
@@ -988,6 +1222,7 @@ static struct xe_vma *xe_vma_create(struct xe_vm *vm,
INIT_LIST_HEAD(&userptr->invalidate_link);
INIT_LIST_HEAD(&userptr->repin_link);
vma->gpuva.gem.offset = bo_offset_or_userptr;
+ mutex_init(&userptr->unmap_mutex);
err = mmu_interval_notifier_insert(&userptr->notifier,
current->mm,
@@ -1029,8 +1264,9 @@ static void xe_vma_destroy_late(struct xe_vma *vma)
* them anymore
*/
mmu_interval_notifier_remove(&userptr->notifier);
+ mutex_destroy(&userptr->unmap_mutex);
xe_vm_put(vm);
- } else if (xe_vma_is_null(vma)) {
+ } else if (xe_vma_is_null(vma) || xe_vma_is_cpu_addr_mirror(vma)) {
xe_vm_put(vm);
} else {
xe_bo_put(xe_vma_bo(vma));
@@ -1067,9 +1303,10 @@ static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED);
spin_lock(&vm->userptr.invalidated_lock);
+ xe_assert(vm->xe, list_empty(&to_userptr_vma(vma)->userptr.repin_link));
list_del(&to_userptr_vma(vma)->userptr.invalidate_link);
spin_unlock(&vm->userptr.invalidated_lock);
- } else if (!xe_vma_is_null(vma)) {
+ } else if (!xe_vma_is_null(vma) && !xe_vma_is_cpu_addr_mirror(vma)) {
xe_bo_assert_held(xe_vma_bo(vma));
drm_gpuva_unlink(&vma->gpuva);
@@ -1520,6 +1757,12 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
}
}
+ if (flags & XE_VM_FLAG_FAULT_MODE) {
+ err = xe_svm_init(vm);
+ if (err)
+ goto err_close;
+ }
+
if (number_tiles > 1)
vm->composite_fence_ctx = dma_fence_context_alloc(1);
@@ -1546,9 +1789,44 @@ err_no_resv:
static void xe_vm_close(struct xe_vm *vm)
{
+ struct xe_device *xe = vm->xe;
+ bool bound;
+ int idx;
+
+ bound = drm_dev_enter(&xe->drm, &idx);
+
down_write(&vm->lock);
+ if (xe_vm_in_fault_mode(vm))
+ xe_svm_notifier_lock(vm);
+
vm->size = 0;
+
+ if (!(vm->flags & XE_VM_FLAG_MIGRATION)) {
+ struct xe_tile *tile;
+ struct xe_gt *gt;
+ u8 id;
+
+ /* Wait for pending binds */
+ dma_resv_wait_timeout(xe_vm_resv(vm),
+ DMA_RESV_USAGE_BOOKKEEP,
+ false, MAX_SCHEDULE_TIMEOUT);
+
+ if (bound) {
+ for_each_tile(tile, xe, id)
+ if (vm->pt_root[id])
+ xe_pt_clear(xe, vm->pt_root[id]);
+
+ for_each_gt(gt, xe, id)
+ xe_gt_tlb_invalidation_vm(gt, vm);
+ }
+ }
+
+ if (xe_vm_in_fault_mode(vm))
+ xe_svm_notifier_unlock(vm);
up_write(&vm->lock);
+
+ if (bound)
+ drm_dev_exit(idx);
}
void xe_vm_close_and_put(struct xe_vm *vm)
@@ -1565,6 +1843,8 @@ void xe_vm_close_and_put(struct xe_vm *vm)
xe_vm_close(vm);
if (xe_vm_in_preempt_fence_mode(vm))
flush_work(&vm->preempt.rebind_work);
+ if (xe_vm_in_fault_mode(vm))
+ xe_svm_close(vm);
down_write(&vm->lock);
for_each_tile(tile, xe, id) {
@@ -1633,6 +1913,9 @@ void xe_vm_close_and_put(struct xe_vm *vm)
xe_vma_destroy_unlocked(vma);
}
+ if (xe_vm_in_fault_mode(vm))
+ xe_svm_fini(vm);
+
up_write(&vm->lock);
down_write(&xe->usm.lock);
@@ -1989,6 +2272,8 @@ vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
op->map.read_only =
flags & DRM_XE_VM_BIND_FLAG_READONLY;
op->map.is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
+ op->map.is_cpu_addr_mirror = flags &
+ DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR;
op->map.dumpable = flags & DRM_XE_VM_BIND_FLAG_DUMPABLE;
op->map.pat_index = pat_index;
} else if (__op->op == DRM_GPUVA_OP_PREFETCH) {
@@ -2181,6 +2466,8 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
VMA_CREATE_FLAG_IS_NULL : 0;
flags |= op->map.dumpable ?
VMA_CREATE_FLAG_DUMPABLE : 0;
+ flags |= op->map.is_cpu_addr_mirror ?
+ VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR : 0;
vma = new_vma(vm, &op->base.map, op->map.pat_index,
flags);
@@ -2188,7 +2475,8 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
return PTR_ERR(vma);
op->map.vma = vma;
- if (op->map.immediate || !xe_vm_in_fault_mode(vm))
+ if ((op->map.immediate || !xe_vm_in_fault_mode(vm)) &&
+ !op->map.is_cpu_addr_mirror)
xe_vma_ops_incr_pt_update_ops(vops,
op->tile_mask);
break;
@@ -2197,21 +2485,35 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
{
struct xe_vma *old =
gpuva_to_vma(op->base.remap.unmap->va);
+ bool skip = xe_vma_is_cpu_addr_mirror(old);
+ u64 start = xe_vma_start(old), end = xe_vma_end(old);
+
+ if (op->base.remap.prev)
+ start = op->base.remap.prev->va.addr +
+ op->base.remap.prev->va.range;
+ if (op->base.remap.next)
+ end = op->base.remap.next->va.addr;
+
+ if (xe_vma_is_cpu_addr_mirror(old) &&
+ xe_svm_has_mapping(vm, start, end))
+ return -EBUSY;
op->remap.start = xe_vma_start(old);
op->remap.range = xe_vma_size(old);
- if (op->base.remap.prev) {
- flags |= op->base.remap.unmap->va->flags &
- XE_VMA_READ_ONLY ?
- VMA_CREATE_FLAG_READ_ONLY : 0;
- flags |= op->base.remap.unmap->va->flags &
- DRM_GPUVA_SPARSE ?
- VMA_CREATE_FLAG_IS_NULL : 0;
- flags |= op->base.remap.unmap->va->flags &
- XE_VMA_DUMPABLE ?
- VMA_CREATE_FLAG_DUMPABLE : 0;
+ flags |= op->base.remap.unmap->va->flags &
+ XE_VMA_READ_ONLY ?
+ VMA_CREATE_FLAG_READ_ONLY : 0;
+ flags |= op->base.remap.unmap->va->flags &
+ DRM_GPUVA_SPARSE ?
+ VMA_CREATE_FLAG_IS_NULL : 0;
+ flags |= op->base.remap.unmap->va->flags &
+ XE_VMA_DUMPABLE ?
+ VMA_CREATE_FLAG_DUMPABLE : 0;
+ flags |= xe_vma_is_cpu_addr_mirror(old) ?
+ VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR : 0;
+ if (op->base.remap.prev) {
vma = new_vma(vm, op->base.remap.prev,
old->pat_index, flags);
if (IS_ERR(vma))
@@ -2223,9 +2525,10 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
* Userptr creates a new SG mapping so
* we must also rebind.
*/
- op->remap.skip_prev = !xe_vma_is_userptr(old) &&
+ op->remap.skip_prev = skip ||
+ (!xe_vma_is_userptr(old) &&
IS_ALIGNED(xe_vma_end(vma),
- xe_vma_max_pte_size(old));
+ xe_vma_max_pte_size(old)));
if (op->remap.skip_prev) {
xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
op->remap.range -=
@@ -2241,16 +2544,6 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
}
if (op->base.remap.next) {
- flags |= op->base.remap.unmap->va->flags &
- XE_VMA_READ_ONLY ?
- VMA_CREATE_FLAG_READ_ONLY : 0;
- flags |= op->base.remap.unmap->va->flags &
- DRM_GPUVA_SPARSE ?
- VMA_CREATE_FLAG_IS_NULL : 0;
- flags |= op->base.remap.unmap->va->flags &
- XE_VMA_DUMPABLE ?
- VMA_CREATE_FLAG_DUMPABLE : 0;
-
vma = new_vma(vm, op->base.remap.next,
old->pat_index, flags);
if (IS_ERR(vma))
@@ -2262,9 +2555,10 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
* Userptr creates a new SG mapping so
* we must also rebind.
*/
- op->remap.skip_next = !xe_vma_is_userptr(old) &&
+ op->remap.skip_next = skip ||
+ (!xe_vma_is_userptr(old) &&
IS_ALIGNED(xe_vma_start(vma),
- xe_vma_max_pte_size(old));
+ xe_vma_max_pte_size(old)));
if (op->remap.skip_next) {
xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
op->remap.range -=
@@ -2277,13 +2571,32 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask);
}
}
- xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask);
+ if (!skip)
+ xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask);
break;
}
case DRM_GPUVA_OP_UNMAP:
+ vma = gpuva_to_vma(op->base.unmap.va);
+
+ if (xe_vma_is_cpu_addr_mirror(vma) &&
+ xe_svm_has_mapping(vm, xe_vma_start(vma),
+ xe_vma_end(vma)))
+ return -EBUSY;
+
+ if (!xe_vma_is_cpu_addr_mirror(vma))
+ xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask);
+ break;
case DRM_GPUVA_OP_PREFETCH:
- /* FIXME: Need to skip some prefetch ops */
- xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask);
+ vma = gpuva_to_vma(op->base.prefetch.va);
+
+ if (xe_vma_is_userptr(vma)) {
+ err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
+ if (err)
+ return err;
+ }
+
+ if (!xe_vma_is_cpu_addr_mirror(vma))
+ xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask);
break;
default:
drm_warn(&vm->xe->drm, "NOT POSSIBLE");
@@ -2509,6 +2822,8 @@ static void op_trace(struct xe_vma_op *op)
case DRM_GPUVA_OP_PREFETCH:
trace_xe_vma_bind(gpuva_to_vma(op->base.prefetch.va));
break;
+ case DRM_GPUVA_OP_DRIVER:
+ break;
default:
XE_WARN_ON("NOT POSSIBLE");
}
@@ -2686,9 +3001,11 @@ static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
}
if (ufence)
xe_sync_ufence_put(ufence);
- for (i = 0; i < vops->num_syncs; i++)
- xe_sync_entry_signal(vops->syncs + i, fence);
- xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
+ if (fence) {
+ for (i = 0; i < vops->num_syncs; i++)
+ xe_sync_entry_signal(vops->syncs + i, fence);
+ xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
+ }
}
static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
@@ -2711,8 +3028,11 @@ static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
}
fence = ops_execute(vm, vops);
- if (IS_ERR(fence))
+ if (IS_ERR(fence)) {
+ if (PTR_ERR(fence) == -ENODATA)
+ vm_bind_ioctl_ops_fini(vm, vops, NULL);
goto unlock;
+ }
vm_bind_ioctl_ops_fini(vm, vops, fence);
}
@@ -2728,7 +3048,8 @@ ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
DRM_XE_VM_BIND_FLAG_NULL | \
DRM_XE_VM_BIND_FLAG_DUMPABLE | \
- DRM_XE_VM_BIND_FLAG_CHECK_PXP)
+ DRM_XE_VM_BIND_FLAG_CHECK_PXP | \
+ DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR)
#ifdef TEST_VM_OPS_ERROR
#define SUPPORTED_FLAGS (SUPPORTED_FLAGS_STUB | FORCE_OP_ERROR)
@@ -2739,7 +3060,7 @@ ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
#define XE_64K_PAGE_MASK 0xffffull
#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
-static int vm_bind_ioctl_check_args(struct xe_device *xe,
+static int vm_bind_ioctl_check_args(struct xe_device *xe, struct xe_vm *vm,
struct drm_xe_vm_bind *args,
struct drm_xe_vm_bind_op **bind_ops)
{
@@ -2784,9 +3105,18 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
u64 obj_offset = (*bind_ops)[i].obj_offset;
u32 prefetch_region = (*bind_ops)[i].prefetch_mem_region_instance;
bool is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
+ bool is_cpu_addr_mirror = flags &
+ DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR;
u16 pat_index = (*bind_ops)[i].pat_index;
u16 coh_mode;
+ if (XE_IOCTL_DBG(xe, is_cpu_addr_mirror &&
+ (!xe_vm_in_fault_mode(vm) ||
+ !IS_ENABLED(CONFIG_DRM_GPUSVM)))) {
+ err = -EINVAL;
+ goto free_bind_ops;
+ }
+
if (XE_IOCTL_DBG(xe, pat_index >= xe->pat.n_entries)) {
err = -EINVAL;
goto free_bind_ops;
@@ -2807,13 +3137,14 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
- XE_IOCTL_DBG(xe, obj && is_null) ||
- XE_IOCTL_DBG(xe, obj_offset && is_null) ||
+ XE_IOCTL_DBG(xe, obj && (is_null || is_cpu_addr_mirror)) ||
+ XE_IOCTL_DBG(xe, obj_offset && (is_null ||
+ is_cpu_addr_mirror)) ||
XE_IOCTL_DBG(xe, op != DRM_XE_VM_BIND_OP_MAP &&
- is_null) ||
+ (is_null || is_cpu_addr_mirror)) ||
XE_IOCTL_DBG(xe, !obj &&
op == DRM_XE_VM_BIND_OP_MAP &&
- !is_null) ||
+ !is_null && !is_cpu_addr_mirror) ||
XE_IOCTL_DBG(xe, !obj &&
op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
XE_IOCTL_DBG(xe, addr &&
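
Taken together, the checks above constrain the new flag for userspace: DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR is only accepted on fault-mode VMs with CONFIG_DRM_GPUSVM enabled, only with DRM_XE_VM_BIND_OP_MAP, and never with a GEM object or object offset. A userspace-side sketch of one bind op, with hypothetical mirror_start/mirror_size values and field names taken from the drm_xe uAPI as used here:

struct drm_xe_vm_bind_op bind_op = {
	.op = DRM_XE_VM_BIND_OP_MAP,
	.flags = DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR,
	.obj = 0,		/* must be zero for mirror binds */
	.obj_offset = 0,	/* likewise */
	.addr = mirror_start,	/* CPU VA range to mirror (hypothetical) */
	.range = mirror_size,
};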
@@ -2962,15 +3293,19 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
int err;
int i;
- err = vm_bind_ioctl_check_args(xe, args, &bind_ops);
+ vm = xe_vm_lookup(xef, args->vm_id);
+ if (XE_IOCTL_DBG(xe, !vm))
+ return -EINVAL;
+
+ err = vm_bind_ioctl_check_args(xe, vm, args, &bind_ops);
if (err)
- return err;
+ goto put_vm;
if (args->exec_queue_id) {
q = xe_exec_queue_lookup(xef, args->exec_queue_id);
if (XE_IOCTL_DBG(xe, !q)) {
err = -ENOENT;
- goto free_objs;
+ goto put_vm;
}
if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
@@ -2979,15 +3314,13 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
}
}
- vm = xe_vm_lookup(xef, args->vm_id);
- if (XE_IOCTL_DBG(xe, !vm)) {
- err = -EINVAL;
- goto put_exec_queue;
- }
+ /* Ensure all UNMAPs visible */
+ if (xe_vm_in_fault_mode(vm))
+ flush_work(&vm->svm.garbage_collector.work);
err = down_write_killable(&vm->lock);
if (err)
- goto put_vm;
+ goto put_exec_queue;
if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
err = -ENOENT;
@@ -3151,12 +3484,11 @@ put_obj:
xe_bo_put(bos[i]);
release_vm_lock:
up_write(&vm->lock);
-put_vm:
- xe_vm_put(vm);
put_exec_queue:
if (q)
xe_exec_queue_put(q);
-free_objs:
+put_vm:
+ xe_vm_put(vm);
kvfree(bos);
kvfree(ops);
if (args->num_binds > 1)
@@ -3288,6 +3620,7 @@ int xe_vm_invalidate_vma(struct xe_vma *vma)
int ret = 0;
xe_assert(xe, !xe_vma_is_null(vma));
+ xe_assert(xe, !xe_vma_is_cpu_addr_mirror(vma));
trace_xe_vma_invalidate(vma);
vm_dbg(&xe_vma_vm(vma)->xe->drm,
diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
index f66075f8a6fe..0ef811fc2bde 100644
--- a/drivers/gpu/drm/xe/xe_vm.h
+++ b/drivers/gpu/drm/xe/xe_vm.h
@@ -23,6 +23,7 @@ struct dma_fence;
struct xe_exec_queue;
struct xe_file;
struct xe_sync_entry;
+struct xe_svm_range;
struct drm_exec;
struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags);
@@ -152,6 +153,11 @@ static inline bool xe_vma_is_null(struct xe_vma *vma)
return vma->gpuva.flags & DRM_GPUVA_SPARSE;
}
+static inline bool xe_vma_is_cpu_addr_mirror(struct xe_vma *vma)
+{
+ return vma->gpuva.flags & XE_VMA_SYSTEM_ALLOCATOR;
+}
+
static inline bool xe_vma_has_no_bo(struct xe_vma *vma)
{
return !xe_vma_bo(vma);
@@ -159,7 +165,8 @@ static inline bool xe_vma_has_no_bo(struct xe_vma *vma)
static inline bool xe_vma_is_userptr(struct xe_vma *vma)
{
- return xe_vma_has_no_bo(vma) && !xe_vma_is_null(vma);
+ return xe_vma_has_no_bo(vma) && !xe_vma_is_null(vma) &&
+ !xe_vma_is_cpu_addr_mirror(vma);
}
/**
@@ -212,6 +219,12 @@ int xe_vm_userptr_check_repin(struct xe_vm *vm);
int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker);
struct dma_fence *xe_vma_rebind(struct xe_vm *vm, struct xe_vma *vma,
u8 tile_mask);
+struct dma_fence *xe_vm_range_rebind(struct xe_vm *vm,
+ struct xe_vma *vma,
+ struct xe_svm_range *range,
+ u8 tile_mask);
+struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm,
+ struct xe_svm_range *range);
int xe_vm_invalidate_vma(struct xe_vma *vma);
@@ -282,9 +295,17 @@ static inline void vm_dbg(const struct drm_device *dev,
const char *format, ...)
{ /* noop */ }
#endif
-#endif
struct xe_vm_snapshot *xe_vm_snapshot_capture(struct xe_vm *vm);
void xe_vm_snapshot_capture_delayed(struct xe_vm_snapshot *snap);
void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p);
void xe_vm_snapshot_free(struct xe_vm_snapshot *snap);
+
+#if IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT)
+void xe_vma_userptr_force_invalidate(struct xe_userptr_vma *uvma);
+#else
+static inline void xe_vma_userptr_force_invalidate(struct xe_userptr_vma *uvma)
+{
+}
+#endif
+#endif
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index 52467b9b5348..84fa41b9fa20 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -6,6 +6,7 @@
#ifndef _XE_VM_TYPES_H_
#define _XE_VM_TYPES_H_
+#include <drm/drm_gpusvm.h>
#include <drm/drm_gpuvm.h>
#include <linux/dma-resv.h>
@@ -18,6 +19,7 @@
#include "xe_range_fence.h"
struct xe_bo;
+struct xe_svm_range;
struct xe_sync_entry;
struct xe_user_fence;
struct xe_vm;
@@ -42,6 +44,7 @@ struct xe_vm_pgtable_update_op;
#define XE_VMA_PTE_64K (DRM_GPUVA_USERBITS << 6)
#define XE_VMA_PTE_COMPACT (DRM_GPUVA_USERBITS << 7)
#define XE_VMA_DUMPABLE (DRM_GPUVA_USERBITS << 8)
+#define XE_VMA_SYSTEM_ALLOCATOR (DRM_GPUVA_USERBITS << 9)
/** struct xe_userptr - User pointer */
struct xe_userptr {
@@ -59,12 +62,16 @@ struct xe_userptr {
struct sg_table *sg;
/** @notifier_seq: notifier sequence number */
unsigned long notifier_seq;
+ /** @unmap_mutex: Mutex protecting dma-unmapping */
+ struct mutex unmap_mutex;
/**
* @initial_bind: user pointer has been bound at least once.
* write: vm->userptr.notifier_lock in read mode and vm->resv held.
* read: vm->userptr.notifier_lock in write mode or vm->resv held.
*/
bool initial_bind;
+ /** @mapped: Whether the @sg sg-table is dma-mapped. Protected by @unmap_mutex. */
+ bool mapped;
#if IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT)
u32 divisor;
#endif
@@ -139,6 +146,30 @@ struct xe_vm {
/** @gpuvm: base GPUVM used to track VMAs */
struct drm_gpuvm gpuvm;
+ /** @svm: Shared virtual memory state */
+ struct {
+ /** @svm.gpusvm: base GPUSVM used to track fault allocations */
+ struct drm_gpusvm gpusvm;
+ /**
+ * @svm.garbage_collector: Garbage collector which is used to unmap
+ * SVM ranges' GPU bindings and destroy the ranges.
+ */
+ struct {
+ /** @svm.garbage_collector.lock: Protects the range list */
+ spinlock_t lock;
+ /**
+ * @svm.garbage_collector.range_list: List of SVM ranges
+ * in the garbage collector.
+ */
+ struct list_head range_list;
+ /**
+ * @svm.garbage_collector.work: Worker which the
+ * garbage collector runs on.
+ */
+ struct work_struct work;
+ } garbage_collector;
+ } svm;
+
struct xe_device *xe;
/* exec queue used for (un)binding vma's */
@@ -228,8 +259,8 @@ struct xe_vm {
* up for revalidation. Protected from access with the
* @invalidated_lock. Removing items from the list
* additionally requires @lock in write mode, and adding
- * items to the list requires the @userptr.notifer_lock in
- * write mode.
+ * items to the list requires either the @userptr.notifier_lock in
+ * write mode, OR @lock in write mode.
*/
struct list_head invalidated;
} userptr;
@@ -295,6 +326,8 @@ struct xe_vma_op_map {
bool read_only;
/** @is_null: is NULL binding */
bool is_null;
+ /** @is_cpu_addr_mirror: is CPU address mirror binding */
+ bool is_cpu_addr_mirror;
/** @dumpable: whether BO is dumped on GPU hang */
bool dumpable;
/** @pat_index: The pat index to use for this operation. */
@@ -325,6 +358,20 @@ struct xe_vma_op_prefetch {
u32 region;
};
+/** struct xe_vma_op_map_range - VMA map range operation */
+struct xe_vma_op_map_range {
+ /** @vma: VMA to map (system allocator VMA) */
+ struct xe_vma *vma;
+ /** @range: SVM range to map */
+ struct xe_svm_range *range;
+};
+
+/** struct xe_vma_op_unmap_range - VMA unmap range operation */
+struct xe_vma_op_unmap_range {
+ /** @range: SVM range to unmap */
+ struct xe_svm_range *range;
+};
+
/** enum xe_vma_op_flags - flags for VMA operation */
enum xe_vma_op_flags {
/** @XE_VMA_OP_COMMITTED: VMA operation committed */
@@ -335,6 +382,14 @@ enum xe_vma_op_flags {
XE_VMA_OP_NEXT_COMMITTED = BIT(2),
};
+/** enum xe_vma_subop - VMA sub-operation */
+enum xe_vma_subop {
+ /** @XE_VMA_SUBOP_MAP_RANGE: Map range */
+ XE_VMA_SUBOP_MAP_RANGE,
+ /** @XE_VMA_SUBOP_UNMAP_RANGE: Unmap range */
+ XE_VMA_SUBOP_UNMAP_RANGE,
+};
+
/** struct xe_vma_op - VMA operation */
struct xe_vma_op {
/** @base: GPUVA base operation */
@@ -343,6 +398,8 @@ struct xe_vma_op {
struct list_head link;
/** @flags: operation flags */
enum xe_vma_op_flags flags;
+ /** @subop: user defined sub-operation */
+ enum xe_vma_subop subop;
/** @tile_mask: Tile mask for operation */
u8 tile_mask;
@@ -353,6 +410,10 @@ struct xe_vma_op {
struct xe_vma_op_remap remap;
/** @prefetch: VMA prefetch operation specific data */
struct xe_vma_op_prefetch prefetch;
+ /** @map_range: VMA map range operation specific data */
+ struct xe_vma_op_map_range map_range;
+ /** @unmap_range: VMA unmap range operation specific data */
+ struct xe_vma_op_unmap_range unmap_range;
};
};
diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
index d4982799383c..55eb453f4b1f 100644
--- a/drivers/gpu/drm/xe/xe_wa.c
+++ b/drivers/gpu/drm/xe/xe_wa.c
@@ -619,11 +619,30 @@ static const struct xe_rtp_entry_sr engine_was[] = {
FUNC(xe_rtp_match_first_render_or_compute)),
XE_RTP_ACTIONS(SET(TDL_CHICKEN, QID_WAIT_FOR_THREAD_NOT_RUN_DISABLE))
},
+ { XE_RTP_NAME("13012615864"),
+ XE_RTP_RULES(GRAPHICS_VERSION_RANGE(3000, 3001),
+ FUNC(xe_rtp_match_first_render_or_compute)),
+ XE_RTP_ACTIONS(SET(TDL_TSL_CHICKEN, RES_CHK_SPR_DIS))
+ },
{}
};
static const struct xe_rtp_entry_sr lrc_was[] = {
+ { XE_RTP_NAME("16011163337"),
+ XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1210), ENGINE_CLASS(RENDER)),
+ /* read verification is ignored due to 1608008084. */
+ XE_RTP_ACTIONS(FIELD_SET_NO_READ_MASK(FF_MODE2,
+ FF_MODE2_GS_TIMER_MASK,
+ FF_MODE2_GS_TIMER_224))
+ },
+ { XE_RTP_NAME("1604555607"),
+ XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1210), ENGINE_CLASS(RENDER)),
+ /* read verification is ignored due to 1608008084. */
+ XE_RTP_ACTIONS(FIELD_SET_NO_READ_MASK(FF_MODE2,
+ FF_MODE2_TDS_TIMER_MASK,
+ FF_MODE2_TDS_TIMER_128))
+ },
{ XE_RTP_NAME("1409342910, 14010698770, 14010443199, 1408979724, 1409178076, 1409207793, 1409217633, 1409252684, 1409347922, 1409142259"),
XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1210)),
XE_RTP_ACTIONS(SET(COMMON_SLICE_CHICKEN3,
diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules b/drivers/gpu/drm/xe/xe_wa_oob.rules
index 228436532282..e0c5fa460487 100644
--- a/drivers/gpu/drm/xe/xe_wa_oob.rules
+++ b/drivers/gpu/drm/xe/xe_wa_oob.rules
@@ -5,6 +5,7 @@
22011391025 PLATFORM(DG2)
22012727170 SUBPLATFORM(DG2, G11)
22012727685 SUBPLATFORM(DG2, G11)
+22016596838 PLATFORM(PVC)
18020744125 PLATFORM(PVC)
1509372804 PLATFORM(PVC), GRAPHICS_STEP(A0, C0)
1409600907 GRAPHICS_VERSION_RANGE(1200, 1250)
@@ -43,3 +44,12 @@
no_media_l3 MEDIA_VERSION(3000)
14022866841 GRAPHICS_VERSION(3000), GRAPHICS_STEP(A0, B0)
MEDIA_VERSION(3000), MEDIA_STEP(A0, B0)
+16021333562 GRAPHICS_VERSION_RANGE(1200, 1274)
+ MEDIA_VERSION(1300)
+14016712196 GRAPHICS_VERSION(1255)
+ GRAPHICS_VERSION_RANGE(1270, 1274)
+14015568240 GRAPHICS_VERSION_RANGE(1255, 1260)
+18013179988 GRAPHICS_VERSION(1255)
+ GRAPHICS_VERSION_RANGE(1270, 1274)
+1508761755 GRAPHICS_VERSION(1255)
+ GRAPHICS_VERSION(1260), GRAPHICS_STEP(A0, B0)
diff --git a/include/drm/drm_gpusvm.h b/include/drm/drm_gpusvm.h
new file mode 100644
index 000000000000..df120b4d1f83
--- /dev/null
+++ b/include/drm/drm_gpusvm.h
@@ -0,0 +1,509 @@
+/* SPDX-License-Identifier: GPL-2.0-only OR MIT */
+/*
+ * Copyright © 2024 Intel Corporation
+ */
+
+#ifndef __DRM_GPUSVM_H__
+#define __DRM_GPUSVM_H__
+
+#include <linux/kref.h>
+#include <linux/interval_tree.h>
+#include <linux/mmu_notifier.h>
+
+struct dev_pagemap_ops;
+struct drm_device;
+struct drm_gpusvm;
+struct drm_gpusvm_notifier;
+struct drm_gpusvm_ops;
+struct drm_gpusvm_range;
+struct drm_gpusvm_devmem;
+struct drm_pagemap;
+struct drm_pagemap_device_addr;
+
+/**
+ * struct drm_gpusvm_devmem_ops - Operations structure for GPU SVM device memory
+ *
+ * This structure defines the operations for GPU Shared Virtual Memory (SVM)
+ * device memory. These operations are provided by the GPU driver to manage device memory
+ * allocations and perform operations such as migration between device memory and system
+ * RAM.
+ */
+struct drm_gpusvm_devmem_ops {
+ /**
+ * @devmem_release: Release device memory allocation (optional)
+ * @devmem_allocation: device memory allocation
+ *
+ * Release device memory allocation and drop a reference to device
+ * memory allocation.
+ */
+ void (*devmem_release)(struct drm_gpusvm_devmem *devmem_allocation);
+
+ /**
+ * @populate_devmem_pfn: Populate device memory PFN (required for migration)
+ * @devmem_allocation: device memory allocation
+ * @npages: Number of pages to populate
+ * @pfn: Array of page frame numbers to populate
+ *
+ * Populate device memory page frame numbers (PFN).
+ *
+ * Return: 0 on success, a negative error code on failure.
+ */
+ int (*populate_devmem_pfn)(struct drm_gpusvm_devmem *devmem_allocation,
+ unsigned long npages, unsigned long *pfn);
+
+ /**
+ * @copy_to_devmem: Copy to device memory (required for migration)
+ * @pages: Pointer to array of device memory pages (destination)
+ * @dma_addr: Pointer to array of DMA addresses (source)
+ * @npages: Number of pages to copy
+ *
+ * Copy pages to device memory.
+ *
+ * Return: 0 on success, a negative error code on failure.
+ */
+ int (*copy_to_devmem)(struct page **pages,
+ dma_addr_t *dma_addr,
+ unsigned long npages);
+
+ /**
+ * @copy_to_ram: Copy to system RAM (required for migration)
+ * @pages: Pointer to array of device memory pages (source)
+ * @dma_addr: Pointer to array of DMA addresses (destination)
+ * @npages: Number of pages to copy
+ *
+ * Copy pages to system RAM.
+ *
+ * Return: 0 on success, a negative error code on failure.
+ */
+ int (*copy_to_ram)(struct page **pages,
+ dma_addr_t *dma_addr,
+ unsigned long npages);
+};
+
+/**
+ * struct drm_gpusvm_devmem - Structure representing a GPU SVM device memory allocation
+ *
+ * @dev: Pointer to the device structure to which the device memory allocation belongs
+ * @mm: Pointer to the mm_struct for the address space
+ * @detached: Completion signaled once the device memory allocation is detached from device pages
+ * @ops: Pointer to the operations structure for GPU SVM device memory
+ * @dpagemap: The struct drm_pagemap of the pages this allocation belongs to.
+ * @size: Size of device memory allocation
+ */
+struct drm_gpusvm_devmem {
+ struct device *dev;
+ struct mm_struct *mm;
+ struct completion detached;
+ const struct drm_gpusvm_devmem_ops *ops;
+ struct drm_pagemap *dpagemap;
+ size_t size;
+};
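
To make the devmem ops concrete, here is a minimal, hedged sketch of how a driver might embed struct drm_gpusvm_devmem and wire up the ops; every my_* identifier (including my_dpagemap and the copy/populate callbacks that are only referenced) is hypothetical and not part of this patch, and error handling is omitted.

/* Hypothetical driver-side sketch; my_* names are not part of this patch. */
struct my_devmem {
	struct drm_gpusvm_devmem base;	/* embedded allocation */
	struct my_bo *bo;		/* backing device buffer */
};

static void my_devmem_release(struct drm_gpusvm_devmem *devmem_allocation)
{
	struct my_devmem *mine =
		container_of(devmem_allocation, struct my_devmem, base);

	my_bo_put(mine->bo);		/* drop the backing allocation */
	kfree(mine);
}

static const struct drm_gpusvm_devmem_ops my_devmem_ops = {
	.devmem_release	     = my_devmem_release,
	.populate_devmem_pfn = my_populate_devmem_pfn,	/* not shown */
	.copy_to_devmem	     = my_copy_to_devmem,	/* not shown */
	.copy_to_ram	     = my_copy_to_ram,		/* not shown */
};

/* In the driver's allocation path, before drm_gpusvm_migrate_to_devmem(): */
static struct my_devmem *my_devmem_alloc(struct drm_gpusvm *gpusvm,
					 struct drm_device *drm, size_t size)
{
	struct my_devmem *mine = kzalloc(sizeof(*mine), GFP_KERNEL);

	mine->bo = my_bo_alloc(drm, size);	/* hypothetical */
	drm_gpusvm_devmem_init(&mine->base, drm->dev, gpusvm->mm,
			       &my_devmem_ops, &my_dpagemap, size);
	return mine;
}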
+
+/**
+ * struct drm_gpusvm_ops - Operations structure for GPU SVM
+ *
+ * This structure defines the operations for GPU Shared Virtual Memory (SVM).
+ * These operations are provided by the GPU driver to manage SVM ranges and
+ * notifiers.
+ */
+struct drm_gpusvm_ops {
+ /**
+ * @notifier_alloc: Allocate a GPU SVM notifier (optional)
+ *
+ * Allocate a GPU SVM notifier.
+ *
+ * Return: Pointer to the allocated GPU SVM notifier on success, NULL on failure.
+ */
+ struct drm_gpusvm_notifier *(*notifier_alloc)(void);
+
+ /**
+ * @notifier_free: Free a GPU SVM notifier (optional)
+ * @notifier: Pointer to the GPU SVM notifier to be freed
+ *
+ * Free a GPU SVM notifier.
+ */
+ void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
+
+ /**
+ * @range_alloc: Allocate a GPU SVM range (optional)
+ * @gpusvm: Pointer to the GPU SVM
+ *
+ * Allocate a GPU SVM range.
+ *
+ * Return: Pointer to the allocated GPU SVM range on success, NULL on failure.
+ */
+ struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm *gpusvm);
+
+ /**
+ * @range_free: Free a GPU SVM range (optional)
+ * @range: Pointer to the GPU SVM range to be freed
+ *
+ * Free a GPU SVM range.
+ */
+ void (*range_free)(struct drm_gpusvm_range *range);
+
+ /**
+ * @invalidate: Invalidate GPU SVM notifier (required)
+ * @gpusvm: Pointer to the GPU SVM
+ * @notifier: Pointer to the GPU SVM notifier
+ * @mmu_range: Pointer to the mmu_notifier_range structure
+ *
+ * Invalidate the GPU page tables. It can safely walk the notifier range
+ * RB tree/list in this function. Called while holding the notifier lock.
+ */
+ void (*invalidate)(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_notifier *notifier,
+ const struct mmu_notifier_range *mmu_range);
+};
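
As a rough illustration, the one required hook, @invalidate, typically zaps the GPU PTEs covering the invalidated span and then marks the affected ranges unmapped. The hedged sketch below assumes a hypothetical driver helper my_zap_ptes(); the iteration macro and helpers it uses are declared later in this header.

/* Hypothetical driver sketch; called with the notifier lock held. */
static void my_invalidate(struct drm_gpusvm *gpusvm,
			  struct drm_gpusvm_notifier *notifier,
			  const struct mmu_notifier_range *mmu_range)
{
	struct drm_gpusvm_ctx ctx = { .in_notifier = true };
	struct drm_gpusvm_range *range = NULL;

	my_zap_ptes(gpusvm, mmu_range->start, mmu_range->end);	/* hypothetical */

	drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
				  mmu_range->end) {
		drm_gpusvm_range_set_unmapped(range, mmu_range);
		drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
	}
}

static const struct drm_gpusvm_ops my_gpusvm_ops = {
	.invalidate = my_invalidate,
};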
+
+/**
+ * struct drm_gpusvm_notifier - Structure representing a GPU SVM notifier
+ *
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @notifier: MMU interval notifier
+ * @itree: Interval tree node for the notifier (inserted in GPU SVM)
+ * @entry: List entry for fast interval tree traversal
+ * @root: Cached root node of the RB tree containing ranges
+ * @range_list: List head of ranges in the same order they appear in the
+ *              interval tree. This is useful to keep iterating over ranges
+ *              while modifying the RB tree.
+ * @flags: Flags for notifier
+ * @flags.removed: Flag indicating whether the MMU interval notifier has been
+ * removed
+ *
+ * This structure represents a GPU SVM notifier.
+ */
+struct drm_gpusvm_notifier {
+ struct drm_gpusvm *gpusvm;
+ struct mmu_interval_notifier notifier;
+ struct interval_tree_node itree;
+ struct list_head entry;
+ struct rb_root_cached root;
+ struct list_head range_list;
+ struct {
+ u32 removed : 1;
+ } flags;
+};
+
+/**
+ * struct drm_gpusvm_range - Structure representing a GPU SVM range
+ *
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @notifier: Pointer to the GPU SVM notifier
+ * @refcount: Reference count for the range
+ * @itree: Interval tree node for the range (inserted in GPU SVM notifier)
+ * @entry: List entry for fast interval tree traversal
+ * @notifier_seq: Notifier sequence number of the range's pages
+ * @dma_addr: Device address array
+ * @dpagemap: The struct drm_pagemap of the device pages we're dma-mapping.
+ *            Note that this assumes only one drm_pagemap per range is allowed.
+ * @flags: Flags for range
+ * @flags.migrate_devmem: Flag indicating whether the range can be migrated to device memory
+ * @flags.unmapped: Flag indicating if the range has been unmapped
+ * @flags.partial_unmap: Flag indicating if the range has been partially unmapped
+ * @flags.has_devmem_pages: Flag indicating if the range has devmem pages
+ * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping
+ *
+ * This structure represents a GPU SVM range used for tracking memory ranges
+ * mapped in a DRM device.
+ */
+struct drm_gpusvm_range {
+ struct drm_gpusvm *gpusvm;
+ struct drm_gpusvm_notifier *notifier;
+ struct kref refcount;
+ struct interval_tree_node itree;
+ struct list_head entry;
+ unsigned long notifier_seq;
+ struct drm_pagemap_device_addr *dma_addr;
+ struct drm_pagemap *dpagemap;
+ struct {
+ /* All flags below must be set upon creation */
+ u16 migrate_devmem : 1;
+ /* All flags below must be set / cleared under notifier lock */
+ u16 unmapped : 1;
+ u16 partial_unmap : 1;
+ u16 has_devmem_pages : 1;
+ u16 has_dma_mapping : 1;
+ } flags;
+};
+
+/**
+ * struct drm_gpusvm - GPU SVM structure
+ *
+ * @name: Name of the GPU SVM
+ * @drm: Pointer to the DRM device structure
+ * @mm: Pointer to the mm_struct for the address space
+ * @device_private_page_owner: Device private pages owner
+ * @mm_start: Start address of GPU SVM
+ * @mm_range: Range of the GPU SVM
+ * @notifier_size: Size of individual notifiers
+ * @ops: Pointer to the operations structure for GPU SVM
+ * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
+ * Entries should be powers of 2 in descending order.
+ * @num_chunks: Number of chunks
+ * @notifier_lock: Read-write semaphore for protecting notifier operations
+ * @root: Cached root node of the Red-Black tree containing GPU SVM notifiers
+ * @notifier_list: List head of notifiers in the same order they appear in
+ *                 the interval tree. This is useful to keep iterating over
+ *                 notifiers while modifying the RB tree.
+ *
+ * This structure represents a GPU SVM (Shared Virtual Memory) used for tracking
+ * memory ranges mapped in a DRM (Direct Rendering Manager) device.
+ *
+ * No reference counting is provided, as this is expected to be embedded in the
+ * driver VM structure along with the struct drm_gpuvm, which handles reference
+ * counting.
+ */
+struct drm_gpusvm {
+ const char *name;
+ struct drm_device *drm;
+ struct mm_struct *mm;
+ void *device_private_page_owner;
+ unsigned long mm_start;
+ unsigned long mm_range;
+ unsigned long notifier_size;
+ const struct drm_gpusvm_ops *ops;
+ const unsigned long *chunk_sizes;
+ int num_chunks;
+ struct rw_semaphore notifier_lock;
+ struct rb_root_cached root;
+ struct list_head notifier_list;
+#ifdef CONFIG_LOCKDEP
+ /**
+ * @lock_dep_map: Annotates drm_gpusvm_range_find_or_insert and
+ * drm_gpusvm_range_remove with a driver provided lock.
+ */
+ struct lockdep_map *lock_dep_map;
+#endif
+};
+
+/**
+ * struct drm_gpusvm_ctx - DRM GPU SVM context
+ *
+ * @check_pages_threshold: Check whether CPU pages are present if the chunk
+ *                         size is less than or equal to this threshold. If
+ *                         they are not present, reduce the chunk size.
+ * @in_notifier: entering from an MMU notifier
+ * @read_only: operating on read-only memory
+ * @devmem_possible: possible to use device memory
+ *
+ * Context that DRM GPU SVM is operating in (i.e. user arguments).
+ */
+struct drm_gpusvm_ctx {
+ unsigned long check_pages_threshold;
+ unsigned int in_notifier :1;
+ unsigned int read_only :1;
+ unsigned int devmem_possible :1;
+};
+
+int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
+ const char *name, struct drm_device *drm,
+ struct mm_struct *mm, void *device_private_page_owner,
+ unsigned long mm_start, unsigned long mm_range,
+ unsigned long notifier_size,
+ const struct drm_gpusvm_ops *ops,
+ const unsigned long *chunk_sizes, int num_chunks);
+
+void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
+
+void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
+
+struct drm_gpusvm_range *
+drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm,
+ unsigned long fault_addr,
+ unsigned long gpuva_start,
+ unsigned long gpuva_end,
+ const struct drm_gpusvm_ctx *ctx);
+
+void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_range *range);
+
+int drm_gpusvm_range_evict(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_range *range);
+
+struct drm_gpusvm_range *
+drm_gpusvm_range_get(struct drm_gpusvm_range *range);
+
+void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
+
+bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_range *range);
+
+int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_range *range,
+ const struct drm_gpusvm_ctx *ctx);
+
+void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_range *range,
+ const struct drm_gpusvm_ctx *ctx);
+
+int drm_gpusvm_migrate_to_devmem(struct drm_gpusvm *gpusvm,
+ struct drm_gpusvm_range *range,
+ struct drm_gpusvm_devmem *devmem_allocation,
+ const struct drm_gpusvm_ctx *ctx);
+
+int drm_gpusvm_evict_to_ram(struct drm_gpusvm_devmem *devmem_allocation);
+
+const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
+
+bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, unsigned long start,
+ unsigned long end);
+
+struct drm_gpusvm_range *
+drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, unsigned long start,
+ unsigned long end);
+
+void drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
+ const struct mmu_notifier_range *mmu_range);
+
+void drm_gpusvm_devmem_init(struct drm_gpusvm_devmem *devmem_allocation,
+ struct device *dev, struct mm_struct *mm,
+ const struct drm_gpusvm_devmem_ops *ops,
+ struct drm_pagemap *dpagemap, size_t size);
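
Putting the entry points above together, a GPU page-fault handler built on this API would look roughly like the following hedged sketch; my_bind_range() is a hypothetical driver helper and error handling is trimmed.

/* Hypothetical fault-handler sketch built on the API declared above. */
static int my_handle_fault(struct drm_gpusvm *gpusvm, unsigned long fault_addr,
			   unsigned long gpuva_start, unsigned long gpuva_end)
{
	struct drm_gpusvm_ctx ctx = { .devmem_possible = true };
	struct drm_gpusvm_range *range;
	int err;

	range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
						gpuva_start, gpuva_end, &ctx);
	if (IS_ERR(range))
		return PTR_ERR(range);

	err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
	if (err)
		return err;

	/* Bind under the notifier lock so the pages cannot be invalidated. */
	drm_gpusvm_notifier_lock(gpusvm);
	if (drm_gpusvm_range_pages_valid(gpusvm, range))
		err = my_bind_range(gpusvm, range);	/* hypothetical */
	else
		err = -EAGAIN;	/* raced with an invalidation; retry */
	drm_gpusvm_notifier_unlock(gpusvm);

	return err;
}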
+
+#ifdef CONFIG_LOCKDEP
+/**
+ * drm_gpusvm_driver_set_lock() - Set the lock protecting accesses to GPU SVM
+ * @gpusvm: Pointer to the GPU SVM structure.
+ * @lock: the lock used to protect the gpuva list. The locking primitive
+ * must contain a dep_map field.
+ *
+ * Call this to annotate drm_gpusvm_range_find_or_insert and
+ * drm_gpusvm_range_remove.
+ */
+#define drm_gpusvm_driver_set_lock(gpusvm, lock) \
+ do { \
+ if (!WARN((gpusvm)->lock_dep_map, \
+ "GPUSVM range lock should be set only once."))\
+ (gpusvm)->lock_dep_map = &(lock)->dep_map; \
+ } while (0)
+#else
+#define drm_gpusvm_driver_set_lock(gpusvm, lock) do {} while (0)
+#endif
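
For instance, a driver that embeds a struct drm_gpusvm in its VM object might initialize it and annotate its own bind lock right away; the my_vm fields, chunk sizes, and address-space bounds below are assumptions rather than part of this patch.

/* Hypothetical initialization sketch; my_vm->lock is an rw_semaphore. */
static const unsigned long chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

static int my_vm_svm_init(struct my_vm *my_vm, struct drm_device *drm)
{
	int err;

	err = drm_gpusvm_init(&my_vm->gpusvm, "my-svm", drm, current->mm,
			      NULL, 0, 1ull << 47, SZ_512M, &my_gpusvm_ops,
			      chunk_sizes, ARRAY_SIZE(chunk_sizes));
	if (err)
		return err;

	drm_gpusvm_driver_set_lock(&my_vm->gpusvm, &my_vm->lock);
	return 0;
}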
+
+/**
+ * drm_gpusvm_notifier_lock() - Lock GPU SVM notifier
+ * @gpusvm__: Pointer to the GPU SVM structure.
+ *
+ * Abstracts client usage of the GPU SVM notifier lock; takes the lock.
+ */
+#define drm_gpusvm_notifier_lock(gpusvm__) \
+ down_read(&(gpusvm__)->notifier_lock)
+
+/**
+ * drm_gpusvm_notifier_unlock() - Unlock GPU SVM notifier
+ * @gpusvm__: Pointer to the GPU SVM structure.
+ *
+ * Abstracts client usage of the GPU SVM notifier lock; drops the lock.
+ */
+#define drm_gpusvm_notifier_unlock(gpusvm__) \
+ up_read(&(gpusvm__)->notifier_lock)
+
+/**
+ * drm_gpusvm_range_start() - GPU SVM range start address
+ * @range: Pointer to the GPU SVM range
+ *
+ * Return: GPU SVM range start address
+ */
+static inline unsigned long
+drm_gpusvm_range_start(struct drm_gpusvm_range *range)
+{
+ return range->itree.start;
+}
+
+/**
+ * drm_gpusvm_range_end() - GPU SVM range end address
+ * @range: Pointer to the GPU SVM range
+ *
+ * Return: GPU SVM range end address
+ */
+static inline unsigned long
+drm_gpusvm_range_end(struct drm_gpusvm_range *range)
+{
+ return range->itree.last + 1;
+}
+
+/**
+ * drm_gpusvm_range_size() - GPU SVM range size
+ * @range: Pointer to the GPU SVM range
+ *
+ * Return: GPU SVM range size
+ */
+static inline unsigned long
+drm_gpusvm_range_size(struct drm_gpusvm_range *range)
+{
+ return drm_gpusvm_range_end(range) - drm_gpusvm_range_start(range);
+}
+
+/**
+ * drm_gpusvm_notifier_start() - GPU SVM notifier start address
+ * @notifier: Pointer to the GPU SVM notifier
+ *
+ * Return: GPU SVM notifier start address
+ */
+static inline unsigned long
+drm_gpusvm_notifier_start(struct drm_gpusvm_notifier *notifier)
+{
+ return notifier->itree.start;
+}
+
+/**
+ * drm_gpusvm_notifier_end() - GPU SVM notifier end address
+ * @notifier: Pointer to the GPU SVM notifier
+ *
+ * Return: GPU SVM notifier end address
+ */
+static inline unsigned long
+drm_gpusvm_notifier_end(struct drm_gpusvm_notifier *notifier)
+{
+ return notifier->itree.last + 1;
+}
+
+/**
+ * drm_gpusvm_notifier_size() - GPU SVM notifier size
+ * @notifier: Pointer to the GPU SVM notifier
+ *
+ * Return: GPU SVM notifier size
+ */
+static inline unsigned long
+drm_gpusvm_notifier_size(struct drm_gpusvm_notifier *notifier)
+{
+ return drm_gpusvm_notifier_end(notifier) -
+ drm_gpusvm_notifier_start(notifier);
+}
+
+/**
+ * __drm_gpusvm_range_next() - Get the next GPU SVM range in the list
+ * @range: a pointer to the current GPU SVM range
+ *
+ * Return: A pointer to the next drm_gpusvm_range if available, or NULL if the
+ * current range is the last one or if the input range is NULL.
+ */
+static inline struct drm_gpusvm_range *
+__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
+{
+ if (range && !list_is_last(&range->entry,
+ &range->notifier->range_list))
+ return list_next_entry(range, entry);
+
+ return NULL;
+}
+
+/**
+ * drm_gpusvm_for_each_range() - Iterate over GPU SVM ranges in a notifier
+ * @range__: Iterator variable for the ranges. If set, it indicates the start of
+ * the iterator. If NULL, call drm_gpusvm_range_find() to get the range.
+ * @notifier__: Pointer to the GPU SVM notifier
+ * @start__: Start address of the range
+ * @end__: End address of the range
+ *
+ * This macro is used to iterate over GPU SVM ranges in a notifier. It is safe
+ * to use while holding the driver SVM lock or the notifier lock.
+ */
+#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__) \
+ for ((range__) = (range__) ?: \
+ drm_gpusvm_range_find((notifier__), (start__), (end__)); \
+ (range__) && (drm_gpusvm_range_start(range__) < (end__)); \
+ (range__) = __drm_gpusvm_range_next(range__))
+
+#endif /* __DRM_GPUSVM_H__ */
diff --git a/include/drm/drm_gpuvm.h b/include/drm/drm_gpuvm.h
index 00d4e43b76b6..2a9629377633 100644
--- a/include/drm/drm_gpuvm.h
+++ b/include/drm/drm_gpuvm.h
@@ -812,6 +812,11 @@ enum drm_gpuva_op_type {
* @DRM_GPUVA_OP_PREFETCH: the prefetch op type
*/
DRM_GPUVA_OP_PREFETCH,
+
+ /**
+ * @DRM_GPUVA_OP_DRIVER: the driver defined op type
+ */
+ DRM_GPUVA_OP_DRIVER,
};
/**
diff --git a/include/drm/drm_pagemap.h b/include/drm/drm_pagemap.h
new file mode 100644
index 000000000000..202c157ff4d7
--- /dev/null
+++ b/include/drm/drm_pagemap.h
@@ -0,0 +1,107 @@
+/* SPDX-License-Identifier: MIT */
+#ifndef _DRM_PAGEMAP_H_
+#define _DRM_PAGEMAP_H_
+
+#include <linux/dma-direction.h>
+#include <linux/hmm.h>
+#include <linux/types.h>
+
+struct drm_pagemap;
+struct device;
+
+/**
+ * enum drm_interconnect_protocol - Used to identify an interconnect protocol.
+ *
+ * @DRM_INTERCONNECT_SYSTEM: DMA map is system pages
+ * @DRM_INTERCONNECT_DRIVER: DMA map is driver defined
+ */
+enum drm_interconnect_protocol {
+ DRM_INTERCONNECT_SYSTEM,
+ DRM_INTERCONNECT_DRIVER,
+ /* A driver can add private values beyond DRM_INTERCONNECT_DRIVER */
+};
+
+/**
+ * struct drm_pagemap_device_addr - Device address representation.
+ * @addr: The dma address or driver-defined address for driver private interconnects.
+ * @proto: The interconnect protocol.
+ * @order: The page order of the device mapping. (Size is PAGE_SIZE << order).
+ * @dir: The DMA direction.
+ *
+ * Note: There is room for improvement here. We should be able to pack into
+ * 64 bits.
+ */
+struct drm_pagemap_device_addr {
+ dma_addr_t addr;
+ u64 proto : 54;
+ u64 order : 8;
+ u64 dir : 2;
+};
+
+/**
+ * drm_pagemap_device_addr_encode() - Encode a dma address with metadata
+ * @addr: The dma address or driver-defined address for driver private interconnects.
+ * @proto: The interconnect protocol.
+ * @order: The page order of the dma mapping. (Size is PAGE_SIZE << order).
+ * @dir: The DMA direction.
+ *
+ * Return: A struct drm_pagemap_device_addr encoding the above information.
+ */
+static inline struct drm_pagemap_device_addr
+drm_pagemap_device_addr_encode(dma_addr_t addr,
+ enum drm_interconnect_protocol proto,
+ unsigned int order,
+ enum dma_data_direction dir)
+{
+ return (struct drm_pagemap_device_addr) {
+ .addr = addr,
+ .proto = proto,
+ .order = order,
+ .dir = dir,
+ };
+}
+
+/**
+ * struct drm_pagemap_ops - Ops for a drm-pagemap.
+ */
+struct drm_pagemap_ops {
+ /**
+	 * @device_map: Map a page for device access or provide an address suitable for a driver-private interconnect.
+ *
+ * @dpagemap: The struct drm_pagemap for the page.
+ * @dev: The device mapper.
+ * @page: The page to map.
+ * @order: The page order of the device mapping. (Size is PAGE_SIZE << order).
+ * @dir: The transfer direction.
+ */
+ struct drm_pagemap_device_addr (*device_map)(struct drm_pagemap *dpagemap,
+ struct device *dev,
+ struct page *page,
+ unsigned int order,
+ enum dma_data_direction dir);
+
+ /**
+ * @device_unmap: Unmap a device address previously obtained using @device_map.
+ *
+ * @dpagemap: The struct drm_pagemap for the mapping.
+ * @dev: The device unmapper.
+ * @addr: The device address obtained when mapping.
+ */
+ void (*device_unmap)(struct drm_pagemap *dpagemap,
+ struct device *dev,
+ struct drm_pagemap_device_addr addr);
+
+};
+
+/**
+ * struct drm_pagemap - Additional information for a struct dev_pagemap
+ * used for device p2p handshaking.
+ * @ops: The struct drm_pagemap_ops.
+ * @dev: The struct device owning the device-private memory.
+ */
+struct drm_pagemap {
+ const struct drm_pagemap_ops *ops;
+ struct device *dev;
+};
+
+#endif
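
For reference, a device_map implementation for the plain system interconnect could be little more than a dma_map_page() wrapper around drm_pagemap_device_addr_encode(); the sketch below is hypothetical and omits dma_mapping_error() handling.

/* Hypothetical drm_pagemap_ops backed by regular DMA mapping. */
static struct drm_pagemap_device_addr
my_device_map(struct drm_pagemap *dpagemap, struct device *dev,
	      struct page *page, unsigned int order,
	      enum dma_data_direction dir)
{
	dma_addr_t addr = dma_map_page(dev, page, 0, PAGE_SIZE << order, dir);

	return drm_pagemap_device_addr_encode(addr, DRM_INTERCONNECT_SYSTEM,
					      order, dir);
}

static void my_device_unmap(struct drm_pagemap *dpagemap, struct device *dev,
			    struct drm_pagemap_device_addr addr)
{
	dma_unmap_page(dev, addr.addr, PAGE_SIZE << addr.order, addr.dir);
}

static const struct drm_pagemap_ops my_pagemap_ops = {
	.device_map   = my_device_map,
	.device_unmap = my_device_unmap,
};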
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 29919faea2f1..80891120cca9 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -227,6 +227,7 @@ void migrate_vma_pages(struct migrate_vma *migrate);
void migrate_vma_finalize(struct migrate_vma *migrate);
int migrate_device_range(unsigned long *src_pfns, unsigned long start,
unsigned long npages);
+int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages);
void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
unsigned long npages);
void migrate_device_finalize(unsigned long *src_pfns,
diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index 892f54d3aa09..616916985e3f 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -393,6 +393,10 @@ struct drm_xe_query_mem_regions {
*
* - %DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM - Flag is set if the device
* has usable VRAM
+ * - %DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY - Flag is set if the device
+ * has low latency hint support
+ * - %DRM_XE_QUERY_CONFIG_FLAG_HAS_CPU_ADDR_MIRROR - Flag is set if the
+ * device has CPU address mirroring support
* - %DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT - Minimal memory alignment
* required by this device, typically SZ_4K or SZ_64K
* - %DRM_XE_QUERY_CONFIG_VA_BITS - Maximum bits of a virtual address
@@ -409,6 +413,8 @@ struct drm_xe_query_config {
#define DRM_XE_QUERY_CONFIG_REV_AND_DEVICE_ID 0
#define DRM_XE_QUERY_CONFIG_FLAGS 1
#define DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM (1 << 0)
+ #define DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY (1 << 1)
+ #define DRM_XE_QUERY_CONFIG_FLAG_HAS_CPU_ADDR_MIRROR (1 << 2)
#define DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT 2
#define DRM_XE_QUERY_CONFIG_VA_BITS 3
#define DRM_XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY 4
@@ -735,6 +741,7 @@ struct drm_xe_device_query {
#define DRM_XE_DEVICE_QUERY_UC_FW_VERSION 7
#define DRM_XE_DEVICE_QUERY_OA_UNITS 8
#define DRM_XE_DEVICE_QUERY_PXP_STATUS 9
+#define DRM_XE_DEVICE_QUERY_EU_STALL 10
/** @query: The type of data to query */
__u32 query;
@@ -986,6 +993,12 @@ struct drm_xe_vm_destroy {
* - %DRM_XE_VM_BIND_FLAG_CHECK_PXP - If the object is encrypted via PXP,
* reject the binding if the encryption key is no longer valid. This
* flag has no effect on BOs that are not marked as using PXP.
+ * - %DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR - When the CPU address mirror flag is
+ * set, no mappings are created; rather, the range is reserved for CPU address
+ * mirroring and will be populated on GPU page faults or prefetches. Only
+ * valid on VMs with DRM_XE_VM_CREATE_FLAG_FAULT_MODE set. The CPU address
+ * mirror flag is only valid for DRM_XE_VM_BIND_OP_MAP operations; the BO
+ * handle and the BO offset must be zero (MBZ). A usage sketch follows the
+ * flag definitions below.
*/
struct drm_xe_vm_bind_op {
/** @extensions: Pointer to the first extension struct, if any */
@@ -1038,7 +1051,9 @@ struct drm_xe_vm_bind_op {
* on the @pat_index. For such mappings there is no actual memory being
* mapped (the address in the PTE is invalid), so the various PAT memory
* attributes likely do not apply. Simply leaving as zero is one
- * option (still a valid pat_index).
+	 * option (still a valid pat_index). The same applies to
+	 * DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR bindings, since for such mappings
+	 * there is also no actual memory being mapped.
*/
__u16 pat_index;
@@ -1054,6 +1069,14 @@ struct drm_xe_vm_bind_op {
/** @userptr: user pointer to bind on */
__u64 userptr;
+
+ /**
+	 * @cpu_addr_mirror_offset: Offset from GPU @addr at which to create
+	 * CPU address mirror mappings. Must be zero (MBZ) at the current
+	 * level of support (i.e. only a 1:1 mapping between GPU and CPU
+	 * addresses is supported).
+ */
+ __s64 cpu_addr_mirror_offset;
};
/**
@@ -1077,6 +1100,7 @@ struct drm_xe_vm_bind_op {
#define DRM_XE_VM_BIND_FLAG_NULL (1 << 2)
#define DRM_XE_VM_BIND_FLAG_DUMPABLE (1 << 3)
#define DRM_XE_VM_BIND_FLAG_CHECK_PXP (1 << 4)
+#define DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR (1 << 5)
/** @flags: Bind flags */
__u32 flags;
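
Tying the %DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR documentation above to usage, a user-space bind could look like the hedged sketch below; fd, vm, cpu_va and size are assumptions.

/* Hypothetical user-space sketch: reserve a CPU address mirror range. */
struct drm_xe_vm_bind_op bind_op = {
	.op = DRM_XE_VM_BIND_OP_MAP,
	.flags = DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR,
	.obj = 0,			/* BO handle MBZ */
	.cpu_addr_mirror_offset = 0,	/* only 1:1 mirroring supported */
	.addr = cpu_va,			/* matches the CPU mapping */
	.range = size,
};
struct drm_xe_vm_bind bind = {
	.vm_id = vm,
	.num_binds = 1,
	.bind = bind_op,
};
ioctl(fd, DRM_IOCTL_XE_VM_BIND, &bind);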
@@ -1204,6 +1228,21 @@ struct drm_xe_vm_bind {
* };
* ioctl(fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &exec_queue_create);
*
+ * Users may provide a hint to the kernel for cases demanding a low latency
+ * profile. Please note this will have an impact on power consumption. The
+ * low latency hint can be indicated with a flag while creating the exec
+ * queue, as shown below:
+ *
+ * struct drm_xe_exec_queue_create exec_queue_create = {
+ * .flags = DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT,
+ * .extensions = 0,
+ * .vm_id = vm,
+ * .num_bb_per_exec = 1,
+ * .num_eng_per_bb = 1,
+ * .instances = to_user_pointer(&instance),
+ * };
+ * ioctl(fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &exec_queue_create);
+ *
*/
struct drm_xe_exec_queue_create {
#define DRM_XE_EXEC_QUEUE_EXTENSION_SET_PROPERTY 0
@@ -1222,7 +1261,8 @@ struct drm_xe_exec_queue_create {
/** @vm_id: VM to use for this exec queue */
__u32 vm_id;
- /** @flags: MBZ */
+#define DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT (1 << 0)
+ /** @flags: flags to use for this exec queue */
__u32 flags;
/** @exec_queue_id: Returned exec queue ID */
@@ -1496,6 +1536,8 @@ struct drm_xe_wait_user_fence {
enum drm_xe_observation_type {
/** @DRM_XE_OBSERVATION_TYPE_OA: OA observation stream type */
DRM_XE_OBSERVATION_TYPE_OA,
+ /** @DRM_XE_OBSERVATION_TYPE_EU_STALL: EU stall sampling observation stream type */
+ DRM_XE_OBSERVATION_TYPE_EU_STALL,
};
/**
@@ -1848,6 +1890,77 @@ enum drm_xe_pxp_session_type {
/* ID of the protected content session managed by Xe when PXP is active */
#define DRM_XE_PXP_HWDRM_DEFAULT_SESSION 0xf
+/**
+ * enum drm_xe_eu_stall_property_id - EU stall sampling input property ids.
+ *
+ * These properties are passed to the driver at open as a chain of
+ * @drm_xe_ext_set_property structures with @property set to these
+ * properties' enums and @value set to the corresponding values of these
+ * properties. @drm_xe_user_extension base.name should be set to
+ * @DRM_XE_EU_STALL_EXTENSION_SET_PROPERTY.
+ *
+ * With the file descriptor obtained from open, user space must enable
+ * the EU stall stream fd with @DRM_XE_OBSERVATION_IOCTL_ENABLE before
+ * calling read(); a usage sketch follows the property enum below. An EIO
+ * errno from read() indicates that the HW dropped data due to a full buffer.
+ */
+enum drm_xe_eu_stall_property_id {
+#define DRM_XE_EU_STALL_EXTENSION_SET_PROPERTY 0
+ /**
+ * @DRM_XE_EU_STALL_PROP_GT_ID: @gt_id of the GT on which
+ * EU stall data will be captured.
+ */
+ DRM_XE_EU_STALL_PROP_GT_ID = 1,
+
+ /**
+ * @DRM_XE_EU_STALL_PROP_SAMPLE_RATE: Sampling rate in
+ * GPU cycles from @sampling_rates in struct @drm_xe_query_eu_stall
+ */
+ DRM_XE_EU_STALL_PROP_SAMPLE_RATE,
+
+ /**
+ * @DRM_XE_EU_STALL_PROP_WAIT_NUM_REPORTS: Minimum number of
+ * EU stall data reports to be present in the kernel buffer
+ * before unblocking a blocked poll or read.
+ */
+ DRM_XE_EU_STALL_PROP_WAIT_NUM_REPORTS,
+};
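
As referenced above, the sketch below shows one plausible way for user space to chain these properties, open an EU stall stream through the observation ioctl, and enable it; gt_id, rate and fd are assumptions and error handling is omitted.

/* Hypothetical user-space sketch: open and enable an EU stall stream. */
struct drm_xe_ext_set_property props[2] = {
	{ .base.name = DRM_XE_EU_STALL_EXTENSION_SET_PROPERTY,
	  .property = DRM_XE_EU_STALL_PROP_GT_ID, .value = gt_id },
	{ .base.name = DRM_XE_EU_STALL_EXTENSION_SET_PROPERTY,
	  .property = DRM_XE_EU_STALL_PROP_SAMPLE_RATE, .value = rate },
};
struct drm_xe_observation_param param = {
	.observation_type = DRM_XE_OBSERVATION_TYPE_EU_STALL,
	.observation_op = DRM_XE_OBSERVATION_OP_STREAM_OPEN,
	.param = (uintptr_t)props,
};
int stream_fd;

props[0].base.next_extension = (uintptr_t)&props[1];	/* chain properties */
stream_fd = ioctl(fd, DRM_IOCTL_XE_OBSERVATION, &param);

ioctl(stream_fd, DRM_XE_OBSERVATION_IOCTL_ENABLE, 0);
/* read(stream_fd, ...) now returns records; EIO means HW dropped data. */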
+
+/**
+ * struct drm_xe_query_eu_stall - Information about EU stall sampling.
+ *
+ * If a query is made with a struct @drm_xe_device_query where .query
+ * is equal to @DRM_XE_DEVICE_QUERY_EU_STALL, then the reply uses
+ * struct @drm_xe_query_eu_stall in .data.
+ */
+struct drm_xe_query_eu_stall {
+ /** @extensions: Pointer to the first extension struct, if any */
+ __u64 extensions;
+
+ /** @capabilities: EU stall capabilities bit-mask */
+ __u64 capabilities;
+#define DRM_XE_EU_STALL_CAPS_BASE (1 << 0)
+
+ /** @record_size: size of each EU stall data record */
+ __u64 record_size;
+
+ /** @per_xecore_buf_size: internal per XeCore buffer size */
+ __u64 per_xecore_buf_size;
+
+ /** @reserved: Reserved */
+ __u64 reserved[5];
+
+ /** @num_sampling_rates: Number of sampling rates in @sampling_rates array */
+ __u64 num_sampling_rates;
+
+ /**
+ * @sampling_rates: Flexible array of sampling rates
+ * sorted in the fastest to slowest order.
+ * Sampling rates are specified in GPU clock cycles.
+ */
+ __u64 sampling_rates[];
+};
+
#if defined(__cplusplus)
}
#endif
diff --git a/mm/memory.c b/mm/memory.c
index b4d3d4893267..59b804f4bf3f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4348,10 +4348,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* Get a page reference while we know the page can't be
* freed.
*/
- get_page(vmf->page);
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
- put_page(vmf->page);
+ if (trylock_page(vmf->page)) {
+ get_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ } else {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ }
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else if (is_pte_marker_entry(entry)) {
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 5bd888223cc8..a351497ced4a 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
struct mm_walk *walk)
{
struct migrate_vma *migrate = walk->private;
+ struct folio *fault_folio = migrate->fault_page ?
+ page_folio(migrate->fault_page) : NULL;
struct vm_area_struct *vma = walk->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = start, unmapped = 0;
@@ -88,11 +90,16 @@ again:
folio_get(folio);
spin_unlock(ptl);
+ /* FIXME: we don't expect THP for fault_folio */
+ if (WARN_ON_ONCE(fault_folio == folio))
+ return migrate_vma_collect_skip(start, end,
+ walk);
if (unlikely(!folio_trylock(folio)))
return migrate_vma_collect_skip(start, end,
walk);
ret = split_folio(folio);
- folio_unlock(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
folio_put(folio);
if (ret)
return migrate_vma_collect_skip(start, end,
@@ -192,7 +199,7 @@ again:
* optimisation to avoid walking the rmap later with
* try_to_migrate().
*/
- if (folio_trylock(folio)) {
+ if (fault_folio == folio || folio_trylock(folio)) {
bool anon_exclusive;
pte_t swp_pte;
@@ -204,7 +211,8 @@ again:
if (folio_try_share_anon_rmap_pte(folio, page)) {
set_pte_at(mm, addr, ptep, pte);
- folio_unlock(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
folio_put(folio);
mpfn = 0;
goto next;
@@ -363,6 +371,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
unsigned long npages,
struct page *fault_page)
{
+ struct folio *fault_folio = fault_page ?
+ page_folio(fault_page) : NULL;
unsigned long i, restore = 0;
bool allow_drain = true;
unsigned long unmapped = 0;
@@ -427,7 +437,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
remove_migration_ptes(folio, folio, 0);
src_pfns[i] = 0;
- folio_unlock(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
folio_put(folio);
restore--;
}
@@ -536,6 +547,8 @@ int migrate_vma_setup(struct migrate_vma *args)
return -EINVAL;
if (args->fault_page && !is_device_private_page(args->fault_page))
return -EINVAL;
+ if (args->fault_page && !PageLocked(args->fault_page))
+ return -EINVAL;
memset(args->src, 0, sizeof(*args->src) * nr_pages);
args->cpages = 0;
@@ -799,19 +812,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
}
EXPORT_SYMBOL(migrate_vma_pages);
-/*
- * migrate_device_finalize() - complete page migration
- * @src_pfns: src_pfns returned from migrate_device_range()
- * @dst_pfns: array of pfns allocated by the driver to migrate memory to
- * @npages: number of pages in the range
- *
- * Completes migration of the page by removing special migration entries.
- * Drivers must ensure copying of page data is complete and visible to the CPU
- * before calling this.
- */
-void migrate_device_finalize(unsigned long *src_pfns,
- unsigned long *dst_pfns, unsigned long npages)
+static void __migrate_device_finalize(unsigned long *src_pfns,
+ unsigned long *dst_pfns,
+ unsigned long npages,
+ struct page *fault_page)
{
+ struct folio *fault_folio = fault_page ?
+ page_folio(fault_page) : NULL;
unsigned long i;
for (i = 0; i < npages; i++) {
@@ -824,6 +831,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
if (!page) {
if (dst) {
+ WARN_ON_ONCE(fault_folio == dst);
folio_unlock(dst);
folio_put(dst);
}
@@ -834,6 +842,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
if (dst) {
+ WARN_ON_ONCE(fault_folio == dst);
folio_unlock(dst);
folio_put(dst);
}
@@ -843,15 +852,33 @@ void migrate_device_finalize(unsigned long *src_pfns,
if (!folio_is_zone_device(dst))
folio_add_lru(dst);
remove_migration_ptes(src, dst, 0);
- folio_unlock(src);
+ if (fault_folio != src)
+ folio_unlock(src);
folio_put(src);
if (dst != src) {
+ WARN_ON_ONCE(fault_folio == dst);
folio_unlock(dst);
folio_put(dst);
}
}
}
+
+/*
+ * migrate_device_finalize() - complete page migration
+ * @src_pfns: src_pfns returned from migrate_device_range()
+ * @dst_pfns: array of pfns allocated by the driver to migrate memory to
+ * @npages: number of pages in the range
+ *
+ * Completes migration of the page by removing special migration entries.
+ * Drivers must ensure copying of page data is complete and visible to the CPU
+ * before calling this.
+ */
+void migrate_device_finalize(unsigned long *src_pfns,
+ unsigned long *dst_pfns, unsigned long npages)
+{
+ return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
+}
EXPORT_SYMBOL(migrate_device_finalize);
/**
@@ -867,10 +894,27 @@ EXPORT_SYMBOL(migrate_device_finalize);
*/
void migrate_vma_finalize(struct migrate_vma *migrate)
{
- migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
+ __migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
+ migrate->fault_page);
}
EXPORT_SYMBOL(migrate_vma_finalize);
+static unsigned long migrate_device_pfn_lock(unsigned long pfn)
+{
+ struct folio *folio;
+
+ folio = folio_get_nontail_page(pfn_to_page(pfn));
+ if (!folio)
+ return 0;
+
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
+ return 0;
+ }
+
+ return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+}
+
/**
* migrate_device_range() - migrate device private pfns to normal memory.
* @src_pfns: array large enough to hold migrating source device private pfns.
@@ -895,29 +939,35 @@ int migrate_device_range(unsigned long *src_pfns, unsigned long start,
{
unsigned long i, pfn;
- for (pfn = start, i = 0; i < npages; pfn++, i++) {
- struct folio *folio;
+ for (pfn = start, i = 0; i < npages; pfn++, i++)
+ src_pfns[i] = migrate_device_pfn_lock(pfn);
- folio = folio_get_nontail_page(pfn_to_page(pfn));
- if (!folio) {
- src_pfns[i] = 0;
- continue;
- }
+ migrate_device_unmap(src_pfns, npages, NULL);
- if (!folio_trylock(folio)) {
- src_pfns[i] = 0;
- folio_put(folio);
- continue;
- }
+ return 0;
+}
+EXPORT_SYMBOL(migrate_device_range);
- src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
- }
+/**
+ * migrate_device_pfns() - migrate device private pfns to normal memory.
+ * @src_pfns: pre-populated array of source device private pfns to migrate.
+ * @npages: number of pages to migrate.
+ *
+ * Similar to migrate_device_range() but supports a non-contiguous pre-populated
+ * array of device pages to migrate.
+ */
+int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages)
+{
+ unsigned long i;
+
+ for (i = 0; i < npages; i++)
+ src_pfns[i] = migrate_device_pfn_lock(src_pfns[i]);
migrate_device_unmap(src_pfns, npages, NULL);
return 0;
}
-EXPORT_SYMBOL(migrate_device_range);
+EXPORT_SYMBOL(migrate_device_pfns);
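
For context, a driver evicting a scattered set of its own device-private pages might use the new helper roughly as in the hedged sketch below; my_copy_devmem_to_ram() is a hypothetical placeholder for the device copy step and error handling is omitted.

/* Hypothetical driver-side sketch: evict non-contiguous device pages. */
static int my_evict_pages(struct page **devmem_pages, unsigned long npages)
{
	unsigned long *src = kcalloc(npages, sizeof(*src), GFP_KERNEL);
	unsigned long *dst = kcalloc(npages, sizeof(*dst), GFP_KERNEL);
	unsigned long i;

	for (i = 0; i < npages; i++)
		src[i] = page_to_pfn(devmem_pages[i]);	/* need not be contiguous */

	migrate_device_pfns(src, npages);		/* lock + unmap sources */

	for (i = 0; i < npages; i++) {
		struct page *dpage;

		if (!(src[i] & MIGRATE_PFN_MIGRATE))
			continue;
		dpage = alloc_page(GFP_HIGHUSER);
		lock_page(dpage);			/* finalize unlocks it */
		dst[i] = migrate_pfn(page_to_pfn(dpage));
	}

	my_copy_devmem_to_ram(src, dst, npages);	/* hypothetical copy step */

	migrate_device_pages(src, dst, npages);
	migrate_device_finalize(src, dst, npages);
	kfree(src);
	kfree(dst);
	return 0;
}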
/*
* Migrate a device coherent folio back to normal memory. The caller should have