Merge tag 'drm-next-2019-05-09' of git://anongit.freedesktop.org/drm/drm

Pull drm updates from Dave Airlie: "This has two exciting community drivers for ARM Mali accelerators. Since ARM has never been open source friendly on the GPU side of the house, the community has had to create open source drivers for the Mali GPUs. Lima covers the older t4xx and panfrost the newer 6xx/7xx series. Well done to all involved and hopefully this will help ARM head in the right direction. There is also now the ability if you don't have any of the legacy drivers enabled (pre-KMS) to remove all the pre-KMS support code from the core drm, this saves 10% or so in codesize on my machine. i915 also enable Icelake/Elkhart Lake Gen11 GPUs by default, vboxvideo moves out of staging. There are also some rcar-du patches which crossover with media tree but all should be acked by Mauro. Summary: uapi changes: - Colorspace connector property - fourcc - new YUV formts - timeline sync objects initially merged - expose FB_DAMAGE_CLIPS to atomic userspace new drivers: - vboxvideo: moved out of staging - aspeed: ASPEED SoC BMC chip display support - lima: ARM Mali4xx GPU acceleration driver support - panfrost: ARM Mali6xx/7xx Midgard/Bitfrost acceleration driver support core: - component helper docs - unplugging fixes - devm device init - MIPI/DSI rate control - shmem backed gem objects - connector, display_info, edid_quirks cleanups - dma_buf fence chain support - 64-bit dma-fence seqno comparison fixes - move initial fb config code to core - gem fence array helpers for Lima - ability to remove legacy support code if no drivers requires it (removes 10% of drm.ko size) - lease fixes ttm: - unified DRM_FILE_PAGE_OFFSET handling - Account for kernel allocations in kernel zone only panel: - OSD070T1718-19TS panel support - panel-tpo-td028ttec1 backlight support - Ronbo RB070D30 MIPI/DSI - Feiyang FY07024DI26A30-D MIPI-DSI panel - Rocktech jh057n00900 MIPI-DSI panel i915: - Comet Lake (Gen9) PCI IDs - Updated Icelake PCI IDs - Elkhartlake (Gen11) support - DP MST property addtions - plane and watermark fixes - Icelake port sync and VEBOX disable fixes - struct_mutex usage reduction - Icelake gamma fix - GuC reset fixes - make mmap more asynchronous - sound display power well race fixes - DDI/MIPI-DSI clocks for Icelake - Icelake RPS frequency changing support - Icelake workarounds amdgpu: - Use HMM for userptr - vega20 experimental smu11 support - RAS support for vega20 - BACO support for vega12 + fixes for vega20 - reworked IH interrupt handling - amdkfd RAS support - Freesync improvements - initial timeline sync object support - DC Z ordering fixes - NV12 planes support - colorspace properties for planes= - eDP opts if eDP already initialized nouveau: - misc fixes etnaviv: - misc fixes msm: - GPU zap shader support expansion - robustness ABI addition exynos: - Logging cleanups tegra: - Shared reset fix - CPU cache maintenance fix cirrus: - driver rewritten using simple helpers meson: - G12A support vmwgfx: - Resource dirtying management improvements - Userspace logging improvements virtio: - PRIME fixes rockchip: - rk3066 hdmi support sun4i: - DSI burst mode support vc4: - load tracker to detect underflow v3d: - v3d v4.2 support malidp: - initial Mali D71 support in komeda driver tfp410: - omap related improvement omapdrm: - drm bridge/panel support - drop some omap specific panels rcar-du: - Display writeback support" * tag 'drm-next-2019-05-09' of git://anongit.freedesktop.org/drm/drm: (1507 commits) drm/msm/a6xx: No zap shader is not an error drm/cma-helper: Fix drm_gem_cma_free_object() drm: Fix timestamp docs for variable refresh properties. drm/komeda: Mark the local functions as static drm/komeda: Fixed warning: Function parameter or member not described drm/komeda: Expose bus_width to Komeda-CORE drm/komeda: Add sysfs attribute: core_id and config_id drm: add non-desktop quirk for Valve HMDs drm/panfrost: Show stored feature registers drm/panfrost: Don't scream about deferred probe drm/panfrost: Disable PM on probe failure drm/panfrost: Set DMA masks earlier drm/panfrost: Add sanity checks to submit IOCTL drm/etnaviv: initialize idle mask before querying the HW db drm: introduce a capability flag for syncobj timeline support drm: report consistent errors when checking syncobj capibility drm/nouveau/nouveau: forward error generated while resuming objects tree drm/nouveau/fb/ramgk104: fix spelling mistake "sucessfully" -> "successfully" drm/nouveau/i2c: Disable i2c bus access after ->fini() drm/nouveau: Remove duplicate ACPI_VIDEO_NOTIFY_PROBE definition ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2019-05-08 21:35:19 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2019-05-08 21:35:19 -0700
commit: a2d635decbfa9c1e4ae15cb05b68b2559f7f827c (patch)
tree: 1c3766c35215450ff9e4228efed578d5e6ba65d1 /drivers/gpu/drm/i915/i915_request.c
parent: 89c3b37af87ec183b666d83428cb28cc421671a6 (diff)
parent: eb85d03e01c3e9f3b0ba7282b2e3515a635decb2 (diff)
1 files changed, 389 insertions, 109 deletions
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index c2a5c48c7541..b836721d3b13 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -22,15 +22,31 @@
  *
  */
 
-#include <linux/prefetch.h>
 #include <linux/dma-fence-array.h>
+#include <linux/irq_work.h>
+#include <linux/prefetch.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/signal.h>
 
-#include "i915_drv.h"
 #include "i915_active.h"
+#include "i915_drv.h"
+#include "i915_globals.h"
 #include "i915_reset.h"
+#include "intel_pm.h"
+
+struct execute_cb {
+	struct list_head link;
+	struct irq_work work;
+	struct i915_sw_fence *fence;
+};
+
+static struct i915_global_request {
+	struct i915_global base;
+	struct kmem_cache *slab_requests;
+	struct kmem_cache *slab_dependencies;
+	struct kmem_cache *slab_execute_cbs;
+} global;
 
 static const char *i915_fence_get_driver_name(struct dma_fence *fence)
 {
@@ -51,7 +67,7 @@ static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
 	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
 		return "signaled";
 
-	return to_request(fence)->timeline->name;
+	return to_request(fence)->gem_context->name ?: "[i915]";
 }
 
 static bool i915_fence_signaled(struct dma_fence *fence)
@@ -68,7 +84,9 @@ static signed long i915_fence_wait(struct dma_fence *fence,
 				   bool interruptible,
 				   signed long timeout)
 {
-	return i915_request_wait(to_request(fence), interruptible, timeout);
+	return i915_request_wait(to_request(fence),
+				 interruptible | I915_WAIT_PRIORITY,
+				 timeout);
 }
 
 static void i915_fence_release(struct dma_fence *fence)
@@ -83,8 +101,9 @@ static void i915_fence_release(struct dma_fence *fence)
 	 * caught trying to reuse dead objects.
 	 */
 	i915_sw_fence_fini(&rq->submit);
+	i915_sw_fence_fini(&rq->semaphore);
 
-	kmem_cache_free(rq->i915->requests, rq);
+	kmem_cache_free(global.slab_requests, rq);
 }
 
 const struct dma_fence_ops i915_fence_ops = {
@@ -150,7 +169,6 @@ static void advance_ring(struct i915_request *request)
 		 * is just about to be. Either works, if we miss the last two
 		 * noops - they are safe to be replayed on a reset.
 		 */
-		GEM_TRACE("marking %s as inactive\n", ring->timeline->name);
 		tail = READ_ONCE(request->tail);
 		list_del(&ring->active_link);
 	} else {
@@ -177,12 +195,10 @@ static void free_capture_list(struct i915_request *request)
 static void __retire_engine_request(struct intel_engine_cs *engine,
 				    struct i915_request *rq)
 {
-	GEM_TRACE("%s(%s) fence %llx:%lld, global=%d, current %d:%d\n",
+	GEM_TRACE("%s(%s) fence %llx:%lld, current %d\n",
 		  __func__, engine->name,
 		  rq->fence.context, rq->fence.seqno,
-		  rq->global_seqno,
-		  hwsp_seqno(rq),
-		  intel_engine_get_seqno(engine));
+		  hwsp_seqno(rq));
 
 	GEM_BUG_ON(!i915_request_completed(rq));
 
@@ -241,12 +257,10 @@ static void i915_request_retire(struct i915_request *request)
 {
 	struct i915_active_request *active, *next;
 
-	GEM_TRACE("%s fence %llx:%lld, global=%d, current %d:%d\n",
+	GEM_TRACE("%s fence %llx:%lld, current %d\n",
 		  request->engine->name,
 		  request->fence.context, request->fence.seqno,
-		  request->global_seqno,
-		  hwsp_seqno(request),
-		  intel_engine_get_seqno(request->engine));
+		  hwsp_seqno(request));
 
 	lockdep_assert_held(&request->i915->drm.struct_mutex);
 	GEM_BUG_ON(!i915_sw_fence_signaled(&request->submit));
@@ -288,15 +302,13 @@ static void i915_request_retire(struct i915_request *request)
 
 	i915_request_remove_from_client(request);
 
-	/* Retirement decays the ban score as it is a sign of ctx progress */
-	atomic_dec_if_positive(&request->gem_context->ban_score);
 	intel_context_unpin(request->hw_context);
 
 	__retire_engine_upto(request->engine, request);
 
 	unreserve_gt(request->i915);
 
-	i915_sched_node_fini(request->i915, &request->sched);
+	i915_sched_node_fini(&request->sched);
 	i915_request_put(request);
 }
 
@@ -305,12 +317,10 @@ void i915_request_retire_upto(struct i915_request *rq)
 	struct intel_ring *ring = rq->ring;
 	struct i915_request *tmp;
 
-	GEM_TRACE("%s fence %llx:%lld, global=%d, current %d:%d\n",
+	GEM_TRACE("%s fence %llx:%lld, current %d\n",
 		  rq->engine->name,
 		  rq->fence.context, rq->fence.seqno,
-		  rq->global_seqno,
-		  hwsp_seqno(rq),
-		  intel_engine_get_seqno(rq->engine));
+		  hwsp_seqno(rq));
 
 	lockdep_assert_held(&rq->i915->drm.struct_mutex);
 	GEM_BUG_ON(!i915_request_completed(rq));
@@ -326,9 +336,67 @@ void i915_request_retire_upto(struct i915_request *rq)
 	} while (tmp != rq);
 }
 
-static u32 timeline_get_seqno(struct i915_timeline *tl)
+static void irq_execute_cb(struct irq_work *wrk)
+{
+	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);
+
+	i915_sw_fence_complete(cb->fence);
+	kmem_cache_free(global.slab_execute_cbs, cb);
+}
+
+static void __notify_execute_cb(struct i915_request *rq)
+{
+	struct execute_cb *cb;
+
+	lockdep_assert_held(&rq->lock);
+
+	if (list_empty(&rq->execute_cb))
+		return;
+
+	list_for_each_entry(cb, &rq->execute_cb, link)
+		irq_work_queue(&cb->work);
+
+	/*
+	 * XXX Rollback on __i915_request_unsubmit()
+	 *
+	 * In the future, perhaps when we have an active time-slicing scheduler,
+	 * it will be interesting to unsubmit parallel execution and remove
+	 * busywaits from the GPU until their master is restarted. This is
+	 * quite hairy, we have to carefully rollback the fence and do a
+	 * preempt-to-idle cycle on the target engine, all the while the
+	 * master execute_cb may refire.
+	 */
+	INIT_LIST_HEAD(&rq->execute_cb);
+}
+
+static int
+i915_request_await_execution(struct i915_request *rq,
+			     struct i915_request *signal,
+			     gfp_t gfp)
 {
-	return tl->seqno += 1 + tl->has_initial_breadcrumb;
+	struct execute_cb *cb;
+
+	if (i915_request_is_active(signal))
+		return 0;
+
+	cb = kmem_cache_alloc(global.slab_execute_cbs, gfp);
+	if (!cb)
+		return -ENOMEM;
+
+	cb->fence = &rq->submit;
+	i915_sw_fence_await(cb->fence);
+	init_irq_work(&cb->work, irq_execute_cb);
+
+	spin_lock_irq(&signal->lock);
+	if (i915_request_is_active(signal)) {
+		i915_sw_fence_complete(cb->fence);
+		kmem_cache_free(global.slab_execute_cbs, cb);
+	} else {
+		list_add_tail(&cb->link, &signal->execute_cb);
+	}
+	spin_unlock_irq(&signal->lock);
+
+	return 0;
 }
 
 static void move_to_timeline(struct i915_request *request,
@@ -342,42 +410,33 @@ static void move_to_timeline(struct i915_request *request,
 	spin_unlock(&request->timeline->lock);
 }
 
-static u32 next_global_seqno(struct i915_timeline *tl)
-{
-	if (!++tl->seqno)
-		++tl->seqno;
-	return tl->seqno;
-}
-
 void __i915_request_submit(struct i915_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
-	u32 seqno;
 
-	GEM_TRACE("%s fence %llx:%lld -> global=%d, current %d:%d\n",
+	GEM_TRACE("%s fence %llx:%lld -> current %d\n",
 		  engine->name,
 		  request->fence.context, request->fence.seqno,
-		  engine->timeline.seqno + 1,
-		  hwsp_seqno(request),
-		  intel_engine_get_seqno(engine));
+		  hwsp_seqno(request));
 
 	GEM_BUG_ON(!irqs_disabled());
 	lockdep_assert_held(&engine->timeline.lock);
 
-	GEM_BUG_ON(request->global_seqno);
-
-	seqno = next_global_seqno(&engine->timeline);
-	GEM_BUG_ON(!seqno);
-	GEM_BUG_ON(intel_engine_signaled(engine, seqno));
+	if (i915_gem_context_is_banned(request->gem_context))
+		i915_request_skip(request, -EIO);
 
 	/* We may be recursing from the signal callback of another i915 fence */
 	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
+
 	GEM_BUG_ON(test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
 	set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);
-	request->global_seqno = seqno;
+
 	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
 	    !i915_request_enable_breadcrumb(request))
 		intel_engine_queue_breadcrumbs(engine);
+
+	__notify_execute_cb(request);
+
 	spin_unlock(&request->lock);
 
 	engine->emit_fini_breadcrumb(request,
@@ -406,12 +465,10 @@ void __i915_request_unsubmit(struct i915_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
 
-	GEM_TRACE("%s fence %llx:%lld <- global=%d, current %d:%d\n",
+	GEM_TRACE("%s fence %llx:%lld, current %d\n",
 		  engine->name,
 		  request->fence.context, request->fence.seqno,
-		  request->global_seqno,
-		  hwsp_seqno(request),
-		  intel_engine_get_seqno(engine));
+		  hwsp_seqno(request));
 
 	GEM_BUG_ON(!irqs_disabled());
 	lockdep_assert_held(&engine->timeline.lock);
@@ -420,18 +477,25 @@ void __i915_request_unsubmit(struct i915_request *request)
 	 * Only unwind in reverse order, required so that the per-context list
 	 * is kept in seqno/ring order.
 	 */
-	GEM_BUG_ON(!request->global_seqno);
-	GEM_BUG_ON(request->global_seqno != engine->timeline.seqno);
-	GEM_BUG_ON(intel_engine_has_completed(engine, request->global_seqno));
-	engine->timeline.seqno--;
 
 	/* We may be recursing from the signal callback of another i915 fence */
 	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
-	request->global_seqno = 0;
+
+	/*
+	 * As we do not allow WAIT to preempt inflight requests,
+	 * once we have executed a request, along with triggering
+	 * any execution callbacks, we must preserve its ordering
+	 * within the non-preemptible FIFO.
+	 */
+	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
+	request->sched.attr.priority |= __NO_PREEMPTION;
+
 	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
 		i915_request_cancel_breadcrumb(request);
+
 	GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
 	clear_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);
+
 	spin_unlock(&request->lock);
 
 	/* Transfer back from the global per-engine timeline to per-context */
@@ -489,6 +553,36 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 	return NOTIFY_DONE;
 }
 
+static int __i915_sw_fence_call
+semaphore_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+{
+	struct i915_request *request =
+		container_of(fence, typeof(*request), semaphore);
+
+	switch (state) {
+	case FENCE_COMPLETE:
+		/*
+		 * We only check a small portion of our dependencies
+		 * and so cannot guarantee that there remains no
+		 * semaphore chain across all. Instead of opting
+		 * for the full NOSEMAPHORE boost, we go for the
+		 * smaller (but still preempting) boost of
+		 * NEWCLIENT. This will be enough to boost over
+		 * a busywaiting request (as that cannot be
+		 * NEWCLIENT) without accidentally boosting
+		 * a busywait over real work elsewhere.
+		 */
+		i915_schedule_bump_priority(request, I915_PRIORITY_NEWCLIENT);
+		break;
+
+	case FENCE_FREE:
+		i915_request_put(request);
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
 static void ring_retire_requests(struct intel_ring *ring)
 {
 	struct i915_request *rq, *rn;
@@ -518,12 +612,7 @@ i915_request_alloc_slow(struct intel_context *ce)
 	ring_retire_requests(ring);
 
 out:
-	return kmem_cache_alloc(ce->gem_context->i915->requests, GFP_KERNEL);
-}
-
-static int add_timeline_barrier(struct i915_request *rq)
-{
-	return i915_request_await_active_request(rq, &rq->timeline->barrier);
+	return kmem_cache_alloc(global.slab_requests, GFP_KERNEL);
 }
 
 /**
@@ -539,8 +628,10 @@ struct i915_request *
 i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 {
 	struct drm_i915_private *i915 = engine->i915;
-	struct i915_request *rq;
 	struct intel_context *ce;
+	struct i915_timeline *tl;
+	struct i915_request *rq;
+	u32 seqno;
 	int ret;
 
 	lockdep_assert_held(&i915->drm.struct_mutex);
@@ -556,8 +647,9 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	 * ABI: Before userspace accesses the GPU (e.g. execbuffer), report
 	 * EIO if the GPU is already wedged.
 	 */
-	if (i915_terminally_wedged(&i915->gpu_error))
-		return ERR_PTR(-EIO);
+	ret = i915_terminally_wedged(i915);
+	if (ret)
+		return ERR_PTR(ret);
 
 	/*
 	 * Pinning the contexts may generate requests in order to acquire
@@ -569,6 +661,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 		return ERR_CAST(ce);
 
 	reserve_gt(i915);
+	mutex_lock(&ce->ring->timeline->mutex);
 
 	/* Move our oldest request to the slab-cache (if not in use!) */
 	rq = list_first_entry(&ce->ring->request_list, typeof(*rq), ring_link);
@@ -605,7 +698,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	 *
 	 * Do not use kmem_cache_zalloc() here!
 	 */
-	rq = kmem_cache_alloc(i915->requests,
+	rq = kmem_cache_alloc(global.slab_requests,
 			      GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
 	if (unlikely(!rq)) {
 		rq = i915_request_alloc_slow(ce);
@@ -615,32 +708,36 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 		}
 	}
 
-	rq->rcustate = get_state_synchronize_rcu();
-
 	INIT_LIST_HEAD(&rq->active_list);
+	INIT_LIST_HEAD(&rq->execute_cb);
+
+	tl = ce->ring->timeline;
+	ret = i915_timeline_get_seqno(tl, rq, &seqno);
+	if (ret)
+		goto err_free;
+
 	rq->i915 = i915;
 	rq->engine = engine;
 	rq->gem_context = ctx;
 	rq->hw_context = ce;
 	rq->ring = ce->ring;
-	rq->timeline = ce->ring->timeline;
+	rq->timeline = tl;
 	GEM_BUG_ON(rq->timeline == &engine->timeline);
-	rq->hwsp_seqno = rq->timeline->hwsp_seqno;
+	rq->hwsp_seqno = tl->hwsp_seqno;
+	rq->hwsp_cacheline = tl->hwsp_cacheline;
+	rq->rcustate = get_state_synchronize_rcu(); /* acts as smp_mb() */
 
 	spin_lock_init(&rq->lock);
-	dma_fence_init(&rq->fence,
-		       &i915_fence_ops,
-		       &rq->lock,
-		       rq->timeline->fence_context,
-		       timeline_get_seqno(rq->timeline));
+	dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock,
+		       tl->fence_context, seqno);
 
 	/* We bump the ref for the fence chain */
 	i915_sw_fence_init(&i915_request_get(rq)->submit, submit_notify);
+	i915_sw_fence_init(&i915_request_get(rq)->semaphore, semaphore_notify);
 
 	i915_sched_node_init(&rq->sched);
 
 	/* No zalloc, must clear what we need by hand */
-	rq->global_seqno = 0;
 	rq->file_priv = NULL;
 	rq->batch = NULL;
 	rq->capture_list = NULL;
@@ -668,10 +765,6 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	 */
 	rq->head = rq->ring->emit;
 
-	ret = add_timeline_barrier(rq);
-	if (ret)
-		goto err_unwind;
-
 	ret = engine->request_alloc(rq);
 	if (ret)
 		goto err_unwind;
@@ -682,7 +775,10 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	rq->infix = rq->ring->emit; /* end of header; start of user payload */
 
 	/* Check that we didn't interrupt ourselves with a new request */
+	lockdep_assert_held(&rq->timeline->mutex);
 	GEM_BUG_ON(rq->timeline->seqno != rq->fence.seqno);
+	rq->cookie = lockdep_pin_lock(&rq->timeline->mutex);
+
 	return rq;
 
 err_unwind:
@@ -693,14 +789,76 @@ err_unwind:
 	GEM_BUG_ON(!list_empty(&rq->sched.signalers_list));
 	GEM_BUG_ON(!list_empty(&rq->sched.waiters_list));
 
-	kmem_cache_free(i915->requests, rq);
+err_free:
+	kmem_cache_free(global.slab_requests, rq);
 err_unreserve:
+	mutex_unlock(&ce->ring->timeline->mutex);
 	unreserve_gt(i915);
 	intel_context_unpin(ce);
 	return ERR_PTR(ret);
 }
 
 static int
+emit_semaphore_wait(struct i915_request *to,
+		    struct i915_request *from,
+		    gfp_t gfp)
+{
+	u32 hwsp_offset;
+	u32 *cs;
+	int err;
+
+	GEM_BUG_ON(!from->timeline->has_initial_breadcrumb);
+	GEM_BUG_ON(INTEL_GEN(to->i915) < 8);
+
+	/* Just emit the first semaphore we see as request space is limited. */
+	if (to->sched.semaphores & from->engine->mask)
+		return i915_sw_fence_await_dma_fence(&to->submit,
+						     &from->fence, 0,
+						     I915_FENCE_GFP);
+
+	err = i915_sw_fence_await_dma_fence(&to->semaphore,
+					    &from->fence, 0,
+					    I915_FENCE_GFP);
+	if (err < 0)
+		return err;
+
+	/* We need to pin the signaler's HWSP until we are finished reading. */
+	err = i915_timeline_read_hwsp(from, to, &hwsp_offset);
+	if (err)
+		return err;
+
+	/* Only submit our spinner after the signaler is running! */
+	err = i915_request_await_execution(to, from, gfp);
+	if (err)
+		return err;
+
+	cs = intel_ring_begin(to, 4);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	/*
+	 * Using greater-than-or-equal here means we have to worry
+	 * about seqno wraparound. To side step that issue, we swap
+	 * the timeline HWSP upon wrapping, so that everyone listening
+	 * for the old (pre-wrap) values do not see the much smaller
+	 * (post-wrap) values than they were expecting (and so wait
+	 * forever).
+	 */
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		MI_SEMAPHORE_SAD_GTE_SDD;
+	*cs++ = from->fence.seqno;
+	*cs++ = hwsp_offset;
+	*cs++ = 0;
+
+	intel_ring_advance(to, cs);
+	to->sched.semaphores |= from->engine->mask;
+	to->sched.flags |= I915_SCHED_HAS_SEMAPHORE_CHAIN;
+	return 0;
+}
+
+static int
 i915_request_await_request(struct i915_request *to, struct i915_request *from)
 {
 	int ret;
@@ -712,9 +870,7 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
 		return 0;
 
 	if (to->engine->schedule) {
-		ret = i915_sched_node_add_dependency(to->i915,
-						     &to->sched,
-						     &from->sched);
+		ret = i915_sched_node_add_dependency(&to->sched, &from->sched);
 		if (ret < 0)
 			return ret;
 	}
@@ -723,6 +879,9 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
 		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
 						       &from->submit,
 						       I915_FENCE_GFP);
+	} else if (intel_engine_has_semaphores(to->engine) &&
+		   to->gem_context->sched.priority >= I915_PRIORITY_NORMAL) {
+		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
 	} else {
 		ret = i915_sw_fence_await_dma_fence(&to->submit,
 						    &from->fence, 0,
@@ -873,6 +1032,60 @@ void i915_request_skip(struct i915_request *rq, int error)
 	memset(vaddr + head, 0, rq->postfix - head);
 }
 
+static struct i915_request *
+__i915_request_add_to_timeline(struct i915_request *rq)
+{
+	struct i915_timeline *timeline = rq->timeline;
+	struct i915_request *prev;
+
+	/*
+	 * Dependency tracking and request ordering along the timeline
+	 * is special cased so that we can eliminate redundant ordering
+	 * operations while building the request (we know that the timeline
+	 * itself is ordered, and here we guarantee it).
+	 *
+	 * As we know we will need to emit tracking along the timeline,
+	 * we embed the hooks into our request struct -- at the cost of
+	 * having to have specialised no-allocation interfaces (which will
+	 * be beneficial elsewhere).
+	 *
+	 * A second benefit to open-coding i915_request_await_request is
+	 * that we can apply a slight variant of the rules specialised
+	 * for timelines that jump between engines (such as virtual engines).
+	 * If we consider the case of virtual engine, we must emit a dma-fence
+	 * to prevent scheduling of the second request until the first is
+	 * complete (to maximise our greedy late load balancing) and this
+	 * precludes optimising to use semaphores serialisation of a single
+	 * timeline across engines.
+	 */
+	prev = i915_active_request_raw(&timeline->last_request,
+				       &rq->i915->drm.struct_mutex);
+	if (prev && !i915_request_completed(prev)) {
+		if (is_power_of_2(prev->engine->mask | rq->engine->mask))
+			i915_sw_fence_await_sw_fence(&rq->submit,
+						     &prev->submit,
+						     &rq->submitq);
+		else
+			__i915_sw_fence_await_dma_fence(&rq->submit,
+							&prev->fence,
+							&rq->dmaq);
+		if (rq->engine->schedule)
+			__i915_sched_node_add_dependency(&rq->sched,
+							 &prev->sched,
+							 &rq->dep,
+							 0);
+	}
+
+	spin_lock_irq(&timeline->lock);
+	list_add_tail(&rq->link, &timeline->requests);
+	spin_unlock_irq(&timeline->lock);
+
+	GEM_BUG_ON(timeline->seqno != rq->fence.seqno);
+	__i915_active_request_set(&timeline->last_request, rq);
+
+	return prev;
+}
+
 /*
  * NB: This function is not allowed to fail. Doing so would mean the the
  * request is not being tracked for completion but the work itself is
@@ -889,7 +1102,9 @@ void i915_request_add(struct i915_request *request)
 	GEM_TRACE("%s fence %llx:%lld\n",
 		  engine->name, request->fence.context, request->fence.seqno);
 
-	lockdep_assert_held(&request->i915->drm.struct_mutex);
+	lockdep_assert_held(&request->timeline->mutex);
+	lockdep_unpin_lock(&request->timeline->mutex, request->cookie);
+
 	trace_i915_request_add(request);
 
 	/*
@@ -917,37 +1132,12 @@ void i915_request_add(struct i915_request *request)
 	GEM_BUG_ON(IS_ERR(cs));
 	request->postfix = intel_ring_offset(request, cs);
 
-	/*
-	 * Seal the request and mark it as pending execution. Note that
-	 * we may inspect this state, without holding any locks, during
-	 * hangcheck. Hence we apply the barrier to ensure that we do not
-	 * see a more recent value in the hws than we are tracking.
-	 */
-
-	prev = i915_active_request_raw(&timeline->last_request,
-				       &request->i915->drm.struct_mutex);
-	if (prev && !i915_request_completed(prev)) {
-		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
-					     &request->submitq);
-		if (engine->schedule)
-			__i915_sched_node_add_dependency(&request->sched,
-							 &prev->sched,
-							 &request->dep,
-							 0);
-	}
-
-	spin_lock_irq(&timeline->lock);
-	list_add_tail(&request->link, &timeline->requests);
-	spin_unlock_irq(&timeline->lock);
-
-	GEM_BUG_ON(timeline->seqno != request->fence.seqno);
-	__i915_active_request_set(&timeline->last_request, request);
+	prev = __i915_request_add_to_timeline(request);
 
 	list_add_tail(&request->ring_link, &ring->request_list);
-	if (list_is_first(&request->ring_link, &ring->request_list)) {
-		GEM_TRACE("marking %s as active\n", ring->timeline->name);
+	if (list_is_first(&request->ring_link, &ring->request_list))
 		list_add(&ring->active_link, &request->i915->gt.active_rings);
-	}
+	request->i915->gt.active_engines |= request->engine->mask;
 	request->emitted_jiffies = jiffies;
 
 	/*
@@ -962,11 +1152,27 @@ void i915_request_add(struct i915_request *request)
 	 * run at the earliest possible convenience.
 	 */
 	local_bh_disable();
+	i915_sw_fence_commit(&request->semaphore);
 	rcu_read_lock(); /* RCU serialisation for set-wedged protection */
 	if (engine->schedule) {
 		struct i915_sched_attr attr = request->gem_context->sched;
 
 		/*
+		 * Boost actual workloads past semaphores!
+		 *
+		 * With semaphores we spin on one engine waiting for another,
+		 * simply to reduce the latency of starting our work when
+		 * the signaler completes. However, if there is any other
+		 * work that we could be doing on this engine instead, that
+		 * is better utilisation and will reduce the overall duration
+		 * of the current work. To avoid PI boosting a semaphore
+		 * far in the distance past over useful work, we keep a history
+		 * of any semaphore use along our dependency chain.
+		 */
+		if (!(request->sched.flags & I915_SCHED_HAS_SEMAPHORE_CHAIN))
+			attr.priority |= I915_PRIORITY_NOSEMAPHORE;
+
+		/*
 		 * Boost priorities to new clients (new request flows).
 		 *
 		 * Allow interactive/synchronous clients to jump ahead of
@@ -1000,6 +1206,8 @@ void i915_request_add(struct i915_request *request)
 	 */
 	if (prev && i915_request_completed(prev))
 		i915_request_retire_upto(prev);
+
+	mutex_unlock(&request->timeline->mutex);
 }
 
 static unsigned long local_clock_us(unsigned int *cpu)
@@ -1136,8 +1344,25 @@ long i915_request_wait(struct i915_request *rq,
 	if (__i915_spin_request(rq, state, 5))
 		goto out;
 
-	if (flags & I915_WAIT_PRIORITY)
+	/*
+	 * This client is about to stall waiting for the GPU. In many cases
+	 * this is undesirable and limits the throughput of the system, as
+	 * many clients cannot continue processing user input/output whilst
+	 * blocked. RPS autotuning may take tens of milliseconds to respond
+	 * to the GPU load and thus incurs additional latency for the client.
+	 * We can circumvent that by promoting the GPU frequency to maximum
+	 * before we sleep. This makes the GPU throttle up much more quickly
+	 * (good for benchmarks and user experience, e.g. window animations),
+	 * but at a cost of spending more power processing the workload
+	 * (bad for battery).
+	 */
+	if (flags & I915_WAIT_PRIORITY) {
+		if (!i915_request_started(rq) && INTEL_GEN(rq->i915) >= 6)
+			gen6_rps_boost(rq);
+		local_bh_disable(); /* suspend tasklets for reprioritisation */
 		i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
+		local_bh_enable(); /* kick tasklets en masse */
+	}
 
 	wait.tsk = current;
 	if (dma_fence_add_callback(&rq->fence, &wait.cb, request_wait_wake))
@@ -1179,11 +1404,66 @@ void i915_retire_requests(struct drm_i915_private *i915)
 	if (!i915->gt.active_requests)
 		return;
 
-	list_for_each_entry_safe(ring, tmp, &i915->gt.active_rings, active_link)
+	list_for_each_entry_safe(ring, tmp,
+				 &i915->gt.active_rings, active_link) {
+		intel_ring_get(ring); /* last rq holds reference! */
 		ring_retire_requests(ring);
+		intel_ring_put(ring);
+	}
 }
 
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
 #include "selftests/mock_request.c"
 #include "selftests/i915_request.c"
 #endif
+
+static void i915_global_request_shrink(void)
+{
+	kmem_cache_shrink(global.slab_dependencies);
+	kmem_cache_shrink(global.slab_execute_cbs);
+	kmem_cache_shrink(global.slab_requests);
+}
+
+static void i915_global_request_exit(void)
+{
+	kmem_cache_destroy(global.slab_dependencies);
+	kmem_cache_destroy(global.slab_execute_cbs);
+	kmem_cache_destroy(global.slab_requests);
+}
+
+static struct i915_global_request global = { {
+	.shrink = i915_global_request_shrink,
+	.exit = i915_global_request_exit,
+} };
+
+int __init i915_global_request_init(void)
+{
+	global.slab_requests = KMEM_CACHE(i915_request,
+					  SLAB_HWCACHE_ALIGN |
+					  SLAB_RECLAIM_ACCOUNT |
+					  SLAB_TYPESAFE_BY_RCU);
+	if (!global.slab_requests)
+		return -ENOMEM;
+
+	global.slab_execute_cbs = KMEM_CACHE(execute_cb,
+					     SLAB_HWCACHE_ALIGN |
+					     SLAB_RECLAIM_ACCOUNT |
+					     SLAB_TYPESAFE_BY_RCU);
+	if (!global.slab_execute_cbs)
+		goto err_requests;
+
+	global.slab_dependencies = KMEM_CACHE(i915_dependency,
+					      SLAB_HWCACHE_ALIGN |
+					      SLAB_RECLAIM_ACCOUNT);
+	if (!global.slab_dependencies)
+		goto err_execute_cbs;
+
+	i915_global_register(&global.base);
+	return 0;
+
+err_execute_cbs:
+	kmem_cache_destroy(global.slab_execute_cbs);
+err_requests:
+	kmem_cache_destroy(global.slab_requests);
+	return -ENOMEM;
+}
author	Linus Torvalds <torvalds@linux-foundation.org>	2019-05-08 21:35:19 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-05-08 21:35:19 -0700
commit	a2d635decbfa9c1e4ae15cb05b68b2559f7f827c (patch)
tree	1c3766c35215450ff9e4228efed578d5e6ba65d1 /drivers/gpu/drm/i915/i915_request.c
parent	89c3b37af87ec183b666d83428cb28cc421671a6 (diff)
parent	eb85d03e01c3e9f3b0ba7282b2e3515a635decb2 (diff)