From f4e9af4f5af5dab9aee632f3aff0bd8040f1b2c5 Mon Sep 17 00:00:00 2001 From: Akash Goel Date: Wed, 12 Oct 2016 21:54:30 +0530 Subject: drm/i915: Add low level set of routines for programming PM IER/IIR/IMR register set So far the PM IER/IIR/IMR registers were used only for Turbo-related interrupts, but interrupts coming from GuC also use the same set. As a precursor to supporting GuC interrupts, add new low-level routines to allow sharing the programming of the PM IER/IIR/IMR registers between Turbo & GuC. Also, similar to PM IMR, maintain a bitmask for the PM IER register, to allow easy sharing of it between Turbo & GuC without involving a read-modify-write (rmw) operation. v2: - For appropriateness & to avoid any ambiguity, rename old functions enable/disable pm_irq to mask/unmask pm_irq and rename new functions enable/disable pm_interrupts to enable/disable pm_irq. (Tvrtko) - Use u32 in place of uint32_t. (Tvrtko) v3: - Rename the fields pm_irq_mask & pm_ier_mask and do some cleanup. (Chris) - Rebase. v4: Fix the inadvertent disabling of the User interrupt for the VECS ring, which caused failures for certain IGTs. v5: Use dev_priv with HAS_VEBOX macro. (Tvrtko) Suggested-by: Chris Wilson Signed-off-by: Akash Goel Reviewed-by: Tvrtko Ursulin Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/intel_ringbuffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c') diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 32786ba199b9..67a70c5e6453 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1608,7 +1608,7 @@ hsw_vebox_irq_enable(struct intel_engine_cs *engine) struct drm_i915_private *dev_priv = engine->i915; I915_WRITE_IMR(engine, ~engine->irq_enable_mask); - gen6_enable_pm_irq(dev_priv, engine->irq_enable_mask); + gen6_unmask_pm_irq(dev_priv, engine->irq_enable_mask); } static void @@ -1617,7 +1617,7 @@ hsw_vebox_irq_disable(struct intel_engine_cs *engine) struct drm_i915_private *dev_priv = engine->i915; I915_WRITE_IMR(engine, ~0); - gen6_disable_pm_irq(dev_priv, engine->irq_enable_mask); + gen6_mask_pm_irq(dev_priv, engine->irq_enable_mask); } static void -- cgit v1.2.3 From e95433c73a11759203af1cae5958f998c9673370 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 28 Oct 2016 13:58:27 +0100 Subject: drm/i915: Rearrange i915_wait_request() accounting with callers Our low-level wait routine has evolved from our generic wait interface that handled unlocked, RPS-boosting waits with time tracking. If we push our GEM fence tracking to use reservation_objects (required for handling multiple timelines), we lose the ability to pass the required information down to i915_wait_request(). However, if we push the extra functionality from i915_wait_request() to the individual callsites (i915_gem_object_wait_rendering and i915_gem_wait_ioctl) that make use of those extras, we can both simplify our low-level wait and prepare for extending the GEM interface for use of reservation_objects.
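As an illustrative sketch (editor's addition, not part of the patch): after this change, a callsite that previously relied on i915_wait_request() for the extras spells out the wait mode, timeout and RPS client explicitly, along the lines of the hunks below.

	/* Locked wait for all rendering (reads and writes) to complete. */
	ret = i915_gem_object_wait(obj,
				   I915_WAIT_INTERRUPTIBLE |
				   I915_WAIT_LOCKED |
				   I915_WAIT_ALL,
				   MAX_SCHEDULE_TIMEOUT,
				   NULL); /* no RPS client to charge for waitboosting */
	if (ret)
		return ret;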
v2: Rewrite i915_wait_request() kerneldocs Signed-off-by: Chris Wilson Cc: Matthew Auld Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-4-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gvt/scheduler.c | 9 +- drivers/gpu/drm/i915/i915_drv.h | 7 +- drivers/gpu/drm/i915/i915_gem.c | 309 ++++++++++++++++++++++++-------- drivers/gpu/drm/i915/i915_gem_request.c | 146 ++++----------- drivers/gpu/drm/i915/i915_gem_request.h | 32 ++-- drivers/gpu/drm/i915/i915_gem_userptr.c | 12 +- drivers/gpu/drm/i915/intel_display.c | 27 +-- drivers/gpu/drm/i915/intel_ringbuffer.c | 14 +- drivers/gpu/drm/i915/intel_ringbuffer.h | 3 +- 9 files changed, 316 insertions(+), 243 deletions(-) (limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c') diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c b/drivers/gpu/drm/i915/gvt/scheduler.c index f7e320b9b17a..18acb45dd14d 100644 --- a/drivers/gpu/drm/i915/gvt/scheduler.c +++ b/drivers/gpu/drm/i915/gvt/scheduler.c @@ -400,6 +400,7 @@ static int workload_thread(void *priv) int ring_id = p->ring_id; struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler; struct intel_vgpu_workload *workload = NULL; + long lret; int ret; bool need_force_wake = IS_SKYLAKE(gvt->dev_priv); DEFINE_WAIT_FUNC(wait, woken_wake_function); @@ -449,10 +450,12 @@ static int workload_thread(void *priv) gvt_dbg_sched("ring id %d wait workload %p\n", workload->ring_id, workload); - workload->status = i915_wait_request(workload->req, - 0, NULL, NULL); - if (workload->status != 0) + lret = i915_wait_request(workload->req, + 0, MAX_SCHEDULE_TIMEOUT); + if (lret < 0) { + workload->status = lret; gvt_err("fail to wait workload, skip\n"); + } complete: gvt_dbg_sched("will complete workload %p\n, status: %d\n", diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e95352cc5ac2..cf4b2427aff3 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3319,9 +3319,10 @@ int __must_check i915_gem_wait_for_idle(struct drm_i915_private *dev_priv, int __must_check i915_gem_suspend(struct drm_device *dev); void i915_gem_resume(struct drm_device *dev); int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); -int __must_check -i915_gem_object_wait_rendering(struct drm_i915_gem_object *obj, - bool readonly); +int i915_gem_object_wait(struct drm_i915_gem_object *obj, + unsigned int flags, + long timeout, + struct intel_rps_client *rps); int __must_check i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 1254143ab121..537f502123ea 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -292,7 +292,12 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj) * must wait for all rendering to complete to the object (as unbinding * must anyway), and retire the requests. */ - ret = i915_gem_object_wait_rendering(obj, false); + ret = i915_gem_object_wait(obj, + I915_WAIT_INTERRUPTIBLE | + I915_WAIT_LOCKED | + I915_WAIT_ALL, + MAX_SCHEDULE_TIMEOUT, + NULL); if (ret) return ret; @@ -311,88 +316,172 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj) return ret; } -/** - * Ensures that all rendering to the object has completed and the object is - * safe to unbind from the GTT or access from the CPU. 
- * @obj: i915 gem object - * @readonly: waiting for just read access or read-write access - */ -int -i915_gem_object_wait_rendering(struct drm_i915_gem_object *obj, - bool readonly) +static long +i915_gem_object_wait_fence(struct dma_fence *fence, + unsigned int flags, + long timeout, + struct intel_rps_client *rps) { - struct reservation_object *resv; - struct i915_gem_active *active; - unsigned long active_mask; - int idx; + struct drm_i915_gem_request *rq; - lockdep_assert_held(&obj->base.dev->struct_mutex); + BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1); - if (!readonly) { - active = obj->last_read; - active_mask = i915_gem_object_get_active(obj); - } else { - active_mask = 1; - active = &obj->last_write; + if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) + return timeout; + + if (!dma_fence_is_i915(fence)) + return dma_fence_wait_timeout(fence, + flags & I915_WAIT_INTERRUPTIBLE, + timeout); + + rq = to_request(fence); + if (i915_gem_request_completed(rq)) + goto out; + + /* This client is about to stall waiting for the GPU. In many cases + * this is undesirable and limits the throughput of the system, as + * many clients cannot continue processing user input/output whilst + * blocked. RPS autotuning may take tens of milliseconds to respond + * to the GPU load and thus incurs additional latency for the client. + * We can circumvent that by promoting the GPU frequency to maximum + * before we wait. This makes the GPU throttle up much more quickly + * (good for benchmarks and user experience, e.g. window animations), + * but at a cost of spending more power processing the workload + * (bad for battery). Not all clients even want their results + * immediately and for them we should just let the GPU select its own + * frequency to maximise efficiency. To prevent a single client from + * forcing the clocks too high for the whole system, we only allow + * each client to waitboost once in a busy period. + */ + if (rps) { + if (INTEL_GEN(rq->i915) >= 6) + gen6_rps_boost(rq->i915, rps, rq->emitted_jiffies); + else + rps = NULL; } - for_each_active(active_mask, idx) { + timeout = i915_wait_request(rq, flags, timeout); + +out: + if (flags & I915_WAIT_LOCKED && i915_gem_request_completed(rq)) + i915_gem_request_retire_upto(rq); + + if (rps && rq->fence.seqno == rq->engine->last_submitted_seqno) { + /* The GPU is now idle and this client has stalled. + * Since no other client has submitted a request in the + * meantime, assume that this client is the only one + * supplying work to the GPU but is unable to keep that + * work supplied because it is waiting. Since the GPU is + * then never kept fully busy, RPS autoclocking will + * keep the clocks relatively low, causing further delays. + * Compensate by giving the synchronous client credit for + * a waitboost next time. 
+ */ + spin_lock(&rq->i915->rps.client_lock); + list_del_init(&rps->link); + spin_unlock(&rq->i915->rps.client_lock); + } + + return timeout; +} + +static long +i915_gem_object_wait_reservation(struct reservation_object *resv, + unsigned int flags, + long timeout, + struct intel_rps_client *rps) +{ + struct dma_fence *excl; + + if (flags & I915_WAIT_ALL) { + struct dma_fence **shared; + unsigned int count, i; int ret; - ret = i915_gem_active_wait(&active[idx], - &obj->base.dev->struct_mutex); + ret = reservation_object_get_fences_rcu(resv, + &excl, &count, &shared); if (ret) return ret; - } - resv = i915_gem_object_get_dmabuf_resv(obj); - if (resv) { - long err; + for (i = 0; i < count; i++) { + timeout = i915_gem_object_wait_fence(shared[i], + flags, timeout, + rps); + if (timeout <= 0) + break; + + dma_fence_put(shared[i]); + } - err = reservation_object_wait_timeout_rcu(resv, !readonly, true, - MAX_SCHEDULE_TIMEOUT); - if (err < 0) - return err; + for (; i < count; i++) + dma_fence_put(shared[i]); + kfree(shared); + } else { + excl = reservation_object_get_excl_rcu(resv); } - return 0; + if (excl && timeout > 0) + timeout = i915_gem_object_wait_fence(excl, flags, timeout, rps); + + dma_fence_put(excl); + + return timeout; } -/* A nonblocking variant of the above wait. Must be called prior to - * acquiring the mutex for the object, as the object state may change - * during this call. A reference must be held by the caller for the object. +/** + * Waits for rendering to the object to be completed + * @obj: i915 gem object + * @flags: how to wait (under a lock, for all rendering or just for writes etc) + * @timeout: how long to wait + * @rps: client (user process) to charge for any waitboosting */ -static __must_check int -__unsafe_wait_rendering(struct drm_i915_gem_object *obj, - struct intel_rps_client *rps, - bool readonly) +int +i915_gem_object_wait(struct drm_i915_gem_object *obj, + unsigned int flags, + long timeout, + struct intel_rps_client *rps) { + struct reservation_object *resv; struct i915_gem_active *active; unsigned long active_mask; int idx; - active_mask = __I915_BO_ACTIVE(obj); - if (!active_mask) - return 0; + might_sleep(); +#if IS_ENABLED(CONFIG_LOCKDEP) + GEM_BUG_ON(debug_locks && + !!lockdep_is_held(&obj->base.dev->struct_mutex) != + !!(flags & I915_WAIT_LOCKED)); +#endif + GEM_BUG_ON(timeout < 0); - if (!readonly) { + if (flags & I915_WAIT_ALL) { active = obj->last_read; + active_mask = i915_gem_object_get_active(obj); } else { active_mask = 1; active = &obj->last_write; } for_each_active(active_mask, idx) { - int ret; - - ret = i915_gem_active_wait_unlocked(&active[idx], - I915_WAIT_INTERRUPTIBLE, - NULL, rps); - if (ret) - return ret; + struct drm_i915_gem_request *request; + + request = i915_gem_active_get_unlocked(&active[idx]); + if (request) { + timeout = i915_gem_object_wait_fence(&request->fence, + flags, timeout, + rps); + i915_gem_request_put(request); + } + if (timeout < 0) + return timeout; } - return 0; + resv = i915_gem_object_get_dmabuf_resv(obj); + if (resv) + timeout = i915_gem_object_wait_reservation(resv, + flags, timeout, + rps); + return timeout < 0 ? 
timeout : 0; } static struct intel_rps_client *to_rps_client(struct drm_file *file) @@ -449,12 +538,18 @@ i915_gem_phys_pwrite(struct drm_i915_gem_object *obj, struct drm_device *dev = obj->base.dev; void *vaddr = obj->phys_handle->vaddr + args->offset; char __user *user_data = u64_to_user_ptr(args->data_ptr); - int ret = 0; + int ret; /* We manually control the domain here and pretend that it * remains coherent i.e. in the GTT domain, like shmem_pwrite. */ - ret = i915_gem_object_wait_rendering(obj, false); + lockdep_assert_held(&obj->base.dev->struct_mutex); + ret = i915_gem_object_wait(obj, + I915_WAIT_INTERRUPTIBLE | + I915_WAIT_LOCKED | + I915_WAIT_ALL, + MAX_SCHEDULE_TIMEOUT, + to_rps_client(file_priv)); if (ret) return ret; @@ -614,12 +709,17 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj, { int ret; - *needs_clflush = 0; + lockdep_assert_held(&obj->base.dev->struct_mutex); + *needs_clflush = 0; if (!i915_gem_object_has_struct_page(obj)) return -ENODEV; - ret = i915_gem_object_wait_rendering(obj, true); + ret = i915_gem_object_wait(obj, + I915_WAIT_INTERRUPTIBLE | + I915_WAIT_LOCKED, + MAX_SCHEDULE_TIMEOUT, + NULL); if (ret) return ret; @@ -661,11 +761,18 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj, { int ret; + lockdep_assert_held(&obj->base.dev->struct_mutex); + *needs_clflush = 0; if (!i915_gem_object_has_struct_page(obj)) return -ENODEV; - ret = i915_gem_object_wait_rendering(obj, false); + ret = i915_gem_object_wait(obj, + I915_WAIT_INTERRUPTIBLE | + I915_WAIT_LOCKED | + I915_WAIT_ALL, + MAX_SCHEDULE_TIMEOUT, + NULL); if (ret) return ret; @@ -1051,7 +1158,10 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data, trace_i915_gem_object_pread(obj, args->offset, args->size); - ret = __unsafe_wait_rendering(obj, to_rps_client(file), true); + ret = i915_gem_object_wait(obj, + I915_WAIT_INTERRUPTIBLE, + MAX_SCHEDULE_TIMEOUT, + to_rps_client(file)); if (ret) goto err; @@ -1449,7 +1559,11 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data, trace_i915_gem_object_pwrite(obj, args->offset, args->size); - ret = __unsafe_wait_rendering(obj, to_rps_client(file), false); + ret = i915_gem_object_wait(obj, + I915_WAIT_INTERRUPTIBLE | + I915_WAIT_ALL, + MAX_SCHEDULE_TIMEOUT, + to_rps_client(file)); if (ret) goto err; @@ -1536,7 +1650,11 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data, * We will repeat the flush holding the lock in the normal manner * to catch cases where we are gazumped. */ - ret = __unsafe_wait_rendering(obj, to_rps_client(file), !write_domain); + ret = i915_gem_object_wait(obj, + I915_WAIT_INTERRUPTIBLE | + (write_domain ? I915_WAIT_ALL : 0), + MAX_SCHEDULE_TIMEOUT, + to_rps_client(file)); if (ret) goto err; @@ -1772,7 +1890,10 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) * repeat the flush holding the lock in the normal manner to catch cases * where we are gazumped. 
*/ - ret = __unsafe_wait_rendering(obj, NULL, !write); + ret = i915_gem_object_wait(obj, + I915_WAIT_INTERRUPTIBLE, + MAX_SCHEDULE_TIMEOUT, + NULL); if (ret) goto err; @@ -2817,6 +2938,17 @@ void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file) mutex_unlock(&obj->base.dev->struct_mutex); } +static unsigned long to_wait_timeout(s64 timeout_ns) +{ + if (timeout_ns < 0) + return MAX_SCHEDULE_TIMEOUT; + + if (timeout_ns == 0) + return 0; + + return nsecs_to_jiffies_timeout(timeout_ns); +} + /** * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT * @dev: drm device pointer @@ -2845,10 +2977,9 @@ int i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_wait *args = data; - struct intel_rps_client *rps = to_rps_client(file); struct drm_i915_gem_object *obj; - unsigned long active; - int idx, ret = 0; + ktime_t start; + long ret; if (args->flags != 0) return -EINVAL; @@ -2857,14 +2988,17 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file) if (!obj) return -ENOENT; - active = __I915_BO_ACTIVE(obj); - for_each_active(active, idx) { - s64 *timeout = args->timeout_ns >= 0 ? &args->timeout_ns : NULL; - ret = i915_gem_active_wait_unlocked(&obj->last_read[idx], - I915_WAIT_INTERRUPTIBLE, - timeout, rps); - if (ret) - break; + start = ktime_get(); + + ret = i915_gem_object_wait(obj, + I915_WAIT_INTERRUPTIBLE | I915_WAIT_ALL, + to_wait_timeout(args->timeout_ns), + to_rps_client(file)); + + if (args->timeout_ns > 0) { + args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start)); + if (args->timeout_ns < 0) + args->timeout_ns = 0; } i915_gem_object_put_unlocked(obj); @@ -3283,7 +3417,13 @@ i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write) uint32_t old_write_domain, old_read_domains; int ret; - ret = i915_gem_object_wait_rendering(obj, !write); + lockdep_assert_held(&obj->base.dev->struct_mutex); + ret = i915_gem_object_wait(obj, + I915_WAIT_INTERRUPTIBLE | + I915_WAIT_LOCKED | + (write ? I915_WAIT_ALL : 0), + MAX_SCHEDULE_TIMEOUT, + NULL); if (ret) return ret; @@ -3400,7 +3540,12 @@ restart: * If we wait upon the object, we know that all the bound * VMA are no longer active. */ - ret = i915_gem_object_wait_rendering(obj, false); + ret = i915_gem_object_wait(obj, + I915_WAIT_INTERRUPTIBLE | + I915_WAIT_LOCKED | + I915_WAIT_ALL, + MAX_SCHEDULE_TIMEOUT, + NULL); if (ret) return ret; @@ -3647,7 +3792,13 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write) uint32_t old_write_domain, old_read_domains; int ret; - ret = i915_gem_object_wait_rendering(obj, !write); + lockdep_assert_held(&obj->base.dev->struct_mutex); + ret = i915_gem_object_wait(obj, + I915_WAIT_INTERRUPTIBLE | + I915_WAIT_LOCKED | + (write ? 
I915_WAIT_ALL : 0), + MAX_SCHEDULE_TIMEOUT, + NULL); if (ret) return ret; @@ -3703,7 +3854,7 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file) struct drm_i915_file_private *file_priv = file->driver_priv; unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES; struct drm_i915_gem_request *request, *target = NULL; - int ret; + long ret; /* ABI: return -EIO if already wedged */ if (i915_terminally_wedged(&dev_priv->gpu_error)) @@ -3730,10 +3881,12 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file) if (target == NULL) return 0; - ret = i915_wait_request(target, I915_WAIT_INTERRUPTIBLE, NULL, NULL); + ret = i915_wait_request(target, + I915_WAIT_INTERRUPTIBLE, + MAX_SCHEDULE_TIMEOUT); i915_gem_request_put(target); - return ret; + return ret < 0 ? ret : 0; } static bool diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index 5e38bc04a4f0..fbe0923fe0bc 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -59,31 +59,9 @@ static bool i915_fence_enable_signaling(struct dma_fence *fence) static signed long i915_fence_wait(struct dma_fence *fence, bool interruptible, - signed long timeout_jiffies) + signed long timeout) { - s64 timeout_ns, *timeout; - int ret; - - if (timeout_jiffies != MAX_SCHEDULE_TIMEOUT) { - timeout_ns = jiffies_to_nsecs(timeout_jiffies); - timeout = &timeout_ns; - } else { - timeout = NULL; - } - - ret = i915_wait_request(to_request(fence), - interruptible, timeout, - NO_WAITBOOST); - if (ret == -ETIME) - return 0; - - if (ret < 0) - return ret; - - if (timeout_jiffies != MAX_SCHEDULE_TIMEOUT) - timeout_jiffies = nsecs_to_jiffies(timeout_ns); - - return timeout_jiffies; + return i915_wait_request(to_request(fence), interruptible, timeout); } static void i915_fence_value_str(struct dma_fence *fence, char *str, int size) @@ -166,7 +144,7 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request) struct i915_gem_active *active, *next; trace_i915_gem_request_retire(request); - list_del(&request->link); + list_del_init(&request->link); /* We know the GPU must have read the request to have * sent us the seqno + interrupt, so use the position @@ -224,7 +202,8 @@ void i915_gem_request_retire_upto(struct drm_i915_gem_request *req) struct drm_i915_gem_request *tmp; lockdep_assert_held(&req->i915->drm.struct_mutex); - GEM_BUG_ON(list_empty(&req->link)); + if (list_empty(&req->link)) + return; do { tmp = list_first_entry(&engine->request_list, @@ -780,75 +759,48 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req, /** * i915_wait_request - wait until execution of request has finished - * @req: duh! + * @req: the request to wait upon * @flags: how to wait - * @timeout: in - how long to wait (NULL forever); out - how much time remaining - * @rps: client to charge for RPS boosting + * @timeout: how long to wait in jiffies + * + * i915_wait_request() waits for the request to be completed, for a + * maximum of @timeout jiffies (with MAX_SCHEDULE_TIMEOUT implying an + * unbounded wait). * - * Note: It is of utmost importance that the passed in seqno and reset_counter - * values have been read by the caller in an smp safe manner. Where read-side - * locks are involved, it is sufficient to read the reset_counter before - * unlocking the lock that protects the seqno. For lockless tricks, the - * reset_counter _must_ be read before, and an appropriate smp_rmb must be - * inserted. 
+ * If the caller holds the struct_mutex, the caller must pass I915_WAIT_LOCKED + * in via the flags; conversely, if the struct_mutex is not held, the caller + * must not specify that the wait is locked. * - * Returns 0 if the request was found within the alloted time. Else returns the - * errno with remaining time filled in timeout argument. + * Returns the remaining time (in jiffies) if the request completed, which may + * be zero or -ETIME if the request is unfinished after the timeout expires. + * May return -EINTR if called with I915_WAIT_INTERRUPTIBLE and a signal is + * pending before the request completes. */ -int i915_wait_request(struct drm_i915_gem_request *req, - unsigned int flags, - s64 *timeout, - struct intel_rps_client *rps) +long i915_wait_request(struct drm_i915_gem_request *req, + unsigned int flags, + long timeout) { const int state = flags & I915_WAIT_INTERRUPTIBLE ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE; DEFINE_WAIT(reset); struct intel_wait wait; - unsigned long timeout_remain; - int ret = 0; might_sleep(); #if IS_ENABLED(CONFIG_LOCKDEP) - GEM_BUG_ON(!!lockdep_is_held(&req->i915->drm.struct_mutex) != + GEM_BUG_ON(debug_locks && + !!lockdep_is_held(&req->i915->drm.struct_mutex) != !!(flags & I915_WAIT_LOCKED)); #endif + GEM_BUG_ON(timeout < 0); if (i915_gem_request_completed(req)) - return 0; + return timeout; - timeout_remain = MAX_SCHEDULE_TIMEOUT; - if (timeout) { - if (WARN_ON(*timeout < 0)) - return -EINVAL; - - if (*timeout == 0) - return -ETIME; - - /* Record current time in case interrupted, or wedged */ - timeout_remain = nsecs_to_jiffies_timeout(*timeout); - *timeout += ktime_get_raw_ns(); - } + if (!timeout) + return -ETIME; trace_i915_gem_request_wait_begin(req); - /* This client is about to stall waiting for the GPU. In many cases - * this is undesirable and limits the throughput of the system, as - * many clients cannot continue processing user input/output whilst - * blocked. RPS autotuning may take tens of milliseconds to respond - * to the GPU load and thus incurs additional latency for the client. - * We can circumvent that by promoting the GPU frequency to maximum - * before we wait. This makes the GPU throttle up much more quickly - * (good for benchmarks and user experience, e.g. window animations), - * but at a cost of spending more power processing the workload - * (bad for battery). Not all clients even want their results - * immediately and for them we should just let the GPU select its own - * frequency to maximise efficiency. To prevent a single client from - * forcing the clocks too high for the whole system, we only allow - * each client to waitboost once in a busy period. 
- */ - if (IS_RPS_CLIENT(rps) && INTEL_GEN(req->i915) >= 6) - gen6_rps_boost(req->i915, rps, req->emitted_jiffies); - /* Optimistic short spin before touching IRQs */ if (i915_spin_request(req, state, 5)) goto complete; @@ -867,16 +819,17 @@ int i915_wait_request(struct drm_i915_gem_request *req, for (;;) { if (signal_pending_state(state, current)) { - ret = -ERESTARTSYS; + timeout = -ERESTARTSYS; break; } - timeout_remain = io_schedule_timeout(timeout_remain); - if (timeout_remain == 0) { - ret = -ETIME; + if (!timeout) { + timeout = -ETIME; break; } + timeout = io_schedule_timeout(timeout); + if (intel_wait_complete(&wait)) break; @@ -923,40 +876,7 @@ wakeup: complete: trace_i915_gem_request_wait_end(req); - if (timeout) { - *timeout -= ktime_get_raw_ns(); - if (*timeout < 0) - *timeout = 0; - - /* - * Apparently ktime isn't accurate enough and occasionally has a - * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch - * things up to make the test happy. We allow up to 1 jiffy. - * - * This is a regrssion from the timespec->ktime conversion. - */ - if (ret == -ETIME && *timeout < jiffies_to_usecs(1)*1000) - *timeout = 0; - } - - if (IS_RPS_USER(rps) && - req->fence.seqno == req->engine->last_submitted_seqno) { - /* The GPU is now idle and this client has stalled. - * Since no other client has submitted a request in the - * meantime, assume that this client is the only one - * supplying work to the GPU but is unable to keep that - * work supplied because it is waiting. Since the GPU is - * then never kept fully busy, RPS autoclocking will - * keep the clocks relatively low, causing further delays. - * Compensate by giving the synchronous client credit for - * a waitboost next time. - */ - spin_lock(&req->i915->rps.client_lock); - list_del_init(&rps->link); - spin_unlock(&req->i915->rps.client_lock); - } - - return ret; + return timeout; } static bool engine_retire_requests(struct intel_engine_cs *engine) diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index 4e6d038cc9de..ae0913adfec6 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -228,13 +228,13 @@ struct intel_rps_client; #define IS_RPS_CLIENT(p) (!IS_ERR(p)) #define IS_RPS_USER(p) (!IS_ERR_OR_NULL(p)) -int i915_wait_request(struct drm_i915_gem_request *req, - unsigned int flags, - s64 *timeout, - struct intel_rps_client *rps) +long i915_wait_request(struct drm_i915_gem_request *req, + unsigned int flags, + long timeout) __attribute__((nonnull(1))); #define I915_WAIT_INTERRUPTIBLE BIT(0) #define I915_WAIT_LOCKED BIT(1) /* struct_mutex held, handle GPU reset */ +#define I915_WAIT_ALL BIT(2) /* used by i915_gem_object_wait() */ static inline u32 intel_engine_get_seqno(struct intel_engine_cs *engine); @@ -583,14 +583,16 @@ static inline int __must_check i915_gem_active_wait(const struct i915_gem_active *active, struct mutex *mutex) { struct drm_i915_gem_request *request; + long ret; request = i915_gem_active_peek(active, mutex); if (!request) return 0; - return i915_wait_request(request, - I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED, - NULL, NULL); + ret = i915_wait_request(request, + I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED, + MAX_SCHEDULE_TIMEOUT); + return ret < 0 ? 
ret : 0; } /** @@ -617,20 +619,18 @@ i915_gem_active_wait(const struct i915_gem_active *active, struct mutex *mutex) */ static inline int i915_gem_active_wait_unlocked(const struct i915_gem_active *active, - unsigned int flags, - s64 *timeout, - struct intel_rps_client *rps) + unsigned int flags) { struct drm_i915_gem_request *request; - int ret = 0; + long ret = 0; request = i915_gem_active_get_unlocked(active); if (request) { - ret = i915_wait_request(request, flags, timeout, rps); + ret = i915_wait_request(request, flags, MAX_SCHEDULE_TIMEOUT); i915_gem_request_put(request); } - return ret; + return ret < 0 ? ret : 0; } /** @@ -647,7 +647,7 @@ i915_gem_active_retire(struct i915_gem_active *active, struct mutex *mutex) { struct drm_i915_gem_request *request; - int ret; + long ret; request = i915_gem_active_raw(active, mutex); if (!request) @@ -655,8 +655,8 @@ i915_gem_active_retire(struct i915_gem_active *active, ret = i915_wait_request(request, I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED, - NULL, NULL); - if (ret) + MAX_SCHEDULE_TIMEOUT); + if (ret < 0) return ret; list_del_init(&active->link); diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index c6f780f5abc9..c49dd95413bd 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -61,23 +61,13 @@ struct i915_mmu_object { bool attached; }; -static void wait_rendering(struct drm_i915_gem_object *obj) -{ - unsigned long active = __I915_BO_ACTIVE(obj); - int idx; - - for_each_active(active, idx) - i915_gem_active_wait_unlocked(&obj->last_read[idx], - 0, NULL, NULL); -} - static void cancel_userptr(struct work_struct *work) { struct i915_mmu_object *mo = container_of(work, typeof(*mo), work); struct drm_i915_gem_object *obj = mo->obj; struct drm_device *dev = obj->base.dev; - wait_rendering(obj); + i915_gem_object_wait(obj, I915_WAIT_ALL, MAX_SCHEDULE_TIMEOUT, NULL); mutex_lock(&dev->struct_mutex); /* Cancel any active worker and force us to re-evaluate gup */ diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index cb7dd11277fd..e4800b81c59e 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -12072,7 +12072,7 @@ static void intel_mmio_flip_work_func(struct work_struct *w) if (work->flip_queued_req) WARN_ON(i915_wait_request(work->flip_queued_req, - 0, NULL, NO_WAITBOOST)); + 0, MAX_SCHEDULE_TIMEOUT) < 0); /* For framebuffer backed by dmabuf, wait for fence */ resv = i915_gem_object_get_dmabuf_resv(obj); @@ -14187,19 +14187,21 @@ static int intel_atomic_prepare_commit(struct drm_device *dev, for_each_plane_in_state(state, plane, plane_state, i) { struct intel_plane_state *intel_plane_state = to_intel_plane_state(plane_state); + long timeout; if (!intel_plane_state->wait_req) continue; - ret = i915_wait_request(intel_plane_state->wait_req, - I915_WAIT_INTERRUPTIBLE, - NULL, NULL); - if (ret) { + timeout = i915_wait_request(intel_plane_state->wait_req, + I915_WAIT_INTERRUPTIBLE, + MAX_SCHEDULE_TIMEOUT); + if (timeout < 0) { /* Any hang should be swallowed by the wait */ - WARN_ON(ret == -EIO); + WARN_ON(timeout == -EIO); mutex_lock(&dev->struct_mutex); drm_atomic_helper_cleanup_planes(dev, state); mutex_unlock(&dev->struct_mutex); + ret = timeout; break; } } @@ -14403,7 +14405,7 @@ static void intel_atomic_commit_tail(struct drm_atomic_state *state) bool hw_check = intel_state->modeset; unsigned long put_domains[I915_MAX_PIPES] = {}; unsigned crtc_vblank_mask = 0; - int i, 
ret; + int i; for_each_plane_in_state(state, plane, plane_state, i) { struct intel_plane_state *intel_plane_state = @@ -14412,11 +14414,10 @@ static void intel_atomic_commit_tail(struct drm_atomic_state *state) if (!intel_plane_state->wait_req) continue; - ret = i915_wait_request(intel_plane_state->wait_req, - 0, NULL, NULL); /* EIO should be eaten, and we can't get interrupted in the * worker, and blocking commits have waited already. */ - WARN_ON(ret); + WARN_ON(i915_wait_request(intel_plane_state->wait_req, + 0, MAX_SCHEDULE_TIMEOUT) < 0); } drm_atomic_helper_wait_for_dependencies(state); @@ -14780,7 +14781,11 @@ intel_prepare_plane_fb(struct drm_plane *plane, * can safely continue. */ if (needs_modeset(crtc_state)) - ret = i915_gem_object_wait_rendering(old_obj, true); + ret = i915_gem_object_wait(old_obj, + I915_WAIT_INTERRUPTIBLE | + I915_WAIT_LOCKED, + MAX_SCHEDULE_TIMEOUT, + NULL); if (ret) { /* GPU hangs should have been swallowed by the wait */ WARN_ON(ret == -EIO); diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 67a70c5e6453..8ef735faa603 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -2155,7 +2155,9 @@ static int wait_for_space(struct drm_i915_gem_request *req, int bytes) { struct intel_ring *ring = req->ring; struct drm_i915_gem_request *target; - int ret; + long timeout; + + lockdep_assert_held(&req->i915->drm.struct_mutex); intel_ring_update_space(ring); if (ring->space >= bytes) @@ -2185,11 +2187,11 @@ static int wait_for_space(struct drm_i915_gem_request *req, int bytes) if (WARN_ON(&target->ring_link == &ring->request_list)) return -ENOSPC; - ret = i915_wait_request(target, - I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED, - NULL, NO_WAITBOOST); - if (ret) - return ret; + timeout = i915_wait_request(target, + I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED, + MAX_SCHEDULE_TIMEOUT); + if (timeout < 0) + return timeout; i915_gem_request_retire_upto(target); diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 32b2e6332ccf..884a5ae2225d 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -524,8 +524,7 @@ static inline int intel_engine_idle(struct intel_engine_cs *engine, unsigned int flags) { /* Wait upon the last request to be completed */ - return i915_gem_active_wait_unlocked(&engine->last_request, - flags, NULL, NULL); + return i915_gem_active_wait_unlocked(&engine->last_request, flags); } int intel_init_render_ring_buffer(struct intel_engine_cs *engine); -- cgit v1.2.3 From f8a7fde4561067a8ebc956b27afeb530ac97cb9d Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 28 Oct 2016 13:58:29 +0100 Subject: drm/i915: Defer active reference until required We only need the active reference to keep the object alive after the handle has been deleted (so as to prevent a synchronous gem_close). Why then pay the price of a kref on every execbuf when we can insert that final active ref just in time for the handle deletion? 
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-6-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_drv.h | 28 ++++++++++++++++++++++++++++ drivers/gpu/drm/i915/i915_gem.c | 22 +++++++++++++++++++++- drivers/gpu/drm/i915/i915_gem_batch_pool.c | 2 +- drivers/gpu/drm/i915/i915_gem_context.c | 2 +- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 2 -- drivers/gpu/drm/i915/i915_gem_gtt.c | 7 ++++++- drivers/gpu/drm/i915/i915_gem_render_state.c | 3 ++- drivers/gpu/drm/i915/intel_ringbuffer.c | 15 ++++++++++++--- 8 files changed, 71 insertions(+), 10 deletions(-) (limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index cf4b2427aff3..edc59d08d017 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2246,6 +2246,12 @@ struct drm_i915_gem_object { #define __I915_BO_ACTIVE(bo) \ ((READ_ONCE((bo)->flags) >> I915_BO_ACTIVE_SHIFT) & I915_BO_ACTIVE_MASK) + /** + * Have we taken a reference for the object for incomplete GPU + * activity? + */ +#define I915_BO_ACTIVE_REF (I915_BO_ACTIVE_SHIFT + I915_NUM_ENGINES) + /** * This is set if the object has been written to since last bound * to the GTT @@ -2407,6 +2413,28 @@ i915_gem_object_has_active_engine(const struct drm_i915_gem_object *obj, return obj->flags & BIT(engine + I915_BO_ACTIVE_SHIFT); } +static inline bool +i915_gem_object_has_active_reference(const struct drm_i915_gem_object *obj) +{ + return test_bit(I915_BO_ACTIVE_REF, &obj->flags); +} + +static inline void +i915_gem_object_set_active_reference(struct drm_i915_gem_object *obj) +{ + lockdep_assert_held(&obj->base.dev->struct_mutex); + __set_bit(I915_BO_ACTIVE_REF, &obj->flags); +} + +static inline void +i915_gem_object_clear_active_reference(struct drm_i915_gem_object *obj) +{ + lockdep_assert_held(&obj->base.dev->struct_mutex); + __clear_bit(I915_BO_ACTIVE_REF, &obj->flags); +} + +void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj); + static inline unsigned int i915_gem_object_get_tiling(struct drm_i915_gem_object *obj) { diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 537f502123ea..c0103044dede 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2661,7 +2661,10 @@ i915_gem_object_retire__read(struct i915_gem_active *active, list_move_tail(&obj->global_list, &request->i915->mm.bound_list); - i915_gem_object_put(obj); + if (i915_gem_object_has_active_reference(obj)) { + i915_gem_object_clear_active_reference(obj); + i915_gem_object_put(obj); + } } static bool i915_context_is_banned(const struct i915_gem_context *ctx) @@ -2935,6 +2938,12 @@ void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file) list_for_each_entry_safe(vma, vn, &obj->vma_list, obj_link) if (vma->vm->file == fpriv) i915_vma_close(vma); + + if (i915_gem_object_is_active(obj) && + !i915_gem_object_has_active_reference(obj)) { + i915_gem_object_set_active_reference(obj); + i915_gem_object_get(obj); + } mutex_unlock(&obj->base.dev->struct_mutex); } @@ -4475,6 +4484,17 @@ void i915_gem_free_object(struct drm_gem_object *gem_obj) intel_runtime_pm_put(dev_priv); } +void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj) +{ + lockdep_assert_held(&obj->base.dev->struct_mutex); + + GEM_BUG_ON(i915_gem_object_has_active_reference(obj)); + if (i915_gem_object_is_active(obj)) + 
i915_gem_object_set_active_reference(obj); + else + i915_gem_object_put(obj); +} + int i915_gem_suspend(struct drm_device *dev) { struct drm_i915_private *dev_priv = to_i915(dev); diff --git a/drivers/gpu/drm/i915/i915_gem_batch_pool.c b/drivers/gpu/drm/i915/i915_gem_batch_pool.c index ed989596d9a3..cb25cad3318c 100644 --- a/drivers/gpu/drm/i915/i915_gem_batch_pool.c +++ b/drivers/gpu/drm/i915/i915_gem_batch_pool.c @@ -73,7 +73,7 @@ void i915_gem_batch_pool_fini(struct i915_gem_batch_pool *pool) list_for_each_entry_safe(obj, next, &pool->cache_list[n], batch_pool_link) - i915_gem_object_put(obj); + __i915_gem_object_release_unless_active(obj); INIT_LIST_HEAD(&pool->cache_list[n]); } diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 5dca32ac1c67..47e888cc721f 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -155,7 +155,7 @@ void i915_gem_context_free(struct kref *ctx_ref) if (ce->ring) intel_ring_free(ce->ring); - i915_vma_put(ce->state); + __i915_gem_object_release_unless_active(ce->state->obj); } put_pid(ctx->pid); diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 61365ae22b53..4cafce97998a 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1290,8 +1290,6 @@ void i915_vma_move_to_active(struct i915_vma *vma, * add the active reference first and queue for it to be dropped * *last*. */ - if (!i915_gem_object_is_active(obj)) - i915_gem_object_get(obj); i915_gem_object_set_active(obj, idx); i915_gem_active_set(&obj->last_read[idx], req); diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index 947d5ad51fb7..a3a364478a89 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -3734,11 +3734,16 @@ void __iomem *i915_vma_pin_iomap(struct i915_vma *vma) void i915_vma_unpin_and_release(struct i915_vma **p_vma) { struct i915_vma *vma; + struct drm_i915_gem_object *obj; vma = fetch_and_zero(p_vma); if (!vma) return; + obj = vma->obj; + i915_vma_unpin(vma); - i915_vma_put(vma); + i915_vma_close(vma); + + __i915_gem_object_release_unless_active(obj); } diff --git a/drivers/gpu/drm/i915/i915_gem_render_state.c b/drivers/gpu/drm/i915/i915_gem_render_state.c index a98c0f42badd..e7c3dbcc6c81 100644 --- a/drivers/gpu/drm/i915/i915_gem_render_state.c +++ b/drivers/gpu/drm/i915/i915_gem_render_state.c @@ -224,7 +224,8 @@ int i915_gem_render_state_init(struct drm_i915_gem_request *req) i915_vma_move_to_active(so.vma, req, 0); err_unpin: i915_vma_unpin(so.vma); + i915_vma_close(so.vma); err_obj: - i915_gem_object_put(obj); + __i915_gem_object_release_unless_active(obj); return ret; } diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 8ef735faa603..8eee9675d3bf 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1762,14 +1762,19 @@ static void cleanup_phys_status_page(struct intel_engine_cs *engine) static void cleanup_status_page(struct intel_engine_cs *engine) { struct i915_vma *vma; + struct drm_i915_gem_object *obj; vma = fetch_and_zero(&engine->status_page.vma); if (!vma) return; + obj = vma->obj; + i915_vma_unpin(vma); - i915_gem_object_unpin_map(vma->obj); - i915_vma_put(vma); + i915_vma_close(vma); + + i915_gem_object_unpin_map(obj); + __i915_gem_object_release_unless_active(obj); } static int init_status_page(struct 
intel_engine_cs *engine) @@ -1967,7 +1972,11 @@ intel_engine_create_ring(struct intel_engine_cs *engine, int size) void intel_ring_free(struct intel_ring *ring) { - i915_vma_put(ring->vma); + struct drm_i915_gem_object *obj = ring->vma->obj; + + i915_vma_close(ring->vma); + __i915_gem_object_release_unless_active(obj); + kfree(ring); } -- cgit v1.2.3 From 920cf4194954ec6f971506013c7fe3b7def178b6 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 28 Oct 2016 13:58:30 +0100 Subject: drm/i915: Introduce an internal allocator for disposable private objects Quite a few of our objects used for internal hardware programming do not benefit from being swappable or from being zero-initialised. As such they do not benefit from using a shmemfs backing storage, and since they are internal and never directly exposed to the user, we do not need to worry about providing a filp. For these we can use a drm_i915_gem_object wrapper around an sg_table of plain struct page. They are not swap backed and not automatically pinned. If they are reaped by the shrinker, the pages are released and the contents discarded. For the internal use case, this is fine as, for example, ringbuffers are pinned from the time they are written by a request until they are read by the hardware. Once they are idle, they can be discarded entirely. As such they are a good match for execlist ringbuffers and a small variety of other internal objects. In the first iteration, this is limited to the scratch batch buffers we use (for command parsing and state initialisation). v2: Allocate physically contiguous pages, where possible. v3: Reduce maximum order on subsequent requests following an allocation failure. v4: Fix up mismatch between swiotlb segment size and page count (it counts in 2k units, not 4k pages) Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-7-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/Makefile | 1 + drivers/gpu/drm/i915/i915_drv.h | 5 + drivers/gpu/drm/i915/i915_gem_batch_pool.c | 27 ++--- drivers/gpu/drm/i915/i915_gem_internal.c | 170 +++++++++++++++++++++++++++ drivers/gpu/drm/i915/i915_gem_render_state.c | 2 +- drivers/gpu/drm/i915/intel_engine_cs.c | 2 +- drivers/gpu/drm/i915/intel_ringbuffer.c | 14 ++- 7 files changed, 197 insertions(+), 24 deletions(-) create mode 100644 drivers/gpu/drm/i915/i915_gem_internal.c (limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c') diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index 612340097f4b..7faa04c91e1a 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -35,6 +35,7 @@ i915-y += i915_cmd_parser.o \ i915_gem_execbuffer.o \ i915_gem_fence.o \ i915_gem_gtt.o \ + i915_gem_internal.o \ i915_gem.o \ i915_gem_render_state.o \ i915_gem_request.o \ diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index edc59d08d017..96995d644da2 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3556,6 +3556,11 @@ i915_gem_object_create_stolen_for_preallocated(struct drm_device *dev, u32 gtt_offset, u32 size); +/* i915_gem_internal.c */ +struct drm_i915_gem_object * +i915_gem_object_create_internal(struct drm_i915_private *dev_priv, + unsigned int size); + /* i915_gem_shrinker.c */ unsigned long i915_gem_shrink(struct drm_i915_private *dev_priv, unsigned long target, diff --git a/drivers/gpu/drm/i915/i915_gem_batch_pool.c b/drivers/gpu/drm/i915/i915_gem_batch_pool.c index 
cb25cad3318c..aa4e1e043b4e 100644 --- a/drivers/gpu/drm/i915/i915_gem_batch_pool.c +++ b/drivers/gpu/drm/i915/i915_gem_batch_pool.c @@ -97,9 +97,9 @@ i915_gem_batch_pool_get(struct i915_gem_batch_pool *pool, size_t size) { struct drm_i915_gem_object *obj = NULL; - struct drm_i915_gem_object *tmp, *next; + struct drm_i915_gem_object *tmp; struct list_head *list; - int n; + int n, ret; lockdep_assert_held(&pool->engine->i915->drm.struct_mutex); @@ -112,19 +112,12 @@ i915_gem_batch_pool_get(struct i915_gem_batch_pool *pool, n = ARRAY_SIZE(pool->cache_list) - 1; list = &pool->cache_list[n]; - list_for_each_entry_safe(tmp, next, list, batch_pool_link) { + list_for_each_entry(tmp, list, batch_pool_link) { /* The batches are strictly LRU ordered */ if (!i915_gem_active_is_idle(&tmp->last_read[pool->engine->id], &tmp->base.dev->struct_mutex)) break; - /* While we're looping, do some clean up */ - if (tmp->madv == __I915_MADV_PURGED) { - list_del(&tmp->batch_pool_link); - i915_gem_object_put(tmp); - continue; - } - if (tmp->base.size >= size) { obj = tmp; break; @@ -132,19 +125,15 @@ i915_gem_batch_pool_get(struct i915_gem_batch_pool *pool, } if (obj == NULL) { - int ret; - - obj = i915_gem_object_create(&pool->engine->i915->drm, size); + obj = i915_gem_object_create_internal(pool->engine->i915, size); if (IS_ERR(obj)) return obj; - - ret = i915_gem_object_get_pages(obj); - if (ret) - return ERR_PTR(ret); - - obj->madv = I915_MADV_DONTNEED; } + ret = i915_gem_object_get_pages(obj); + if (ret) + return ERR_PTR(ret); + list_move_tail(&obj->batch_pool_link, list); i915_gem_object_pin_pages(obj); return obj; diff --git a/drivers/gpu/drm/i915/i915_gem_internal.c b/drivers/gpu/drm/i915/i915_gem_internal.c new file mode 100644 index 000000000000..02e66fa170b0 --- /dev/null +++ b/drivers/gpu/drm/i915/i915_gem_internal.c @@ -0,0 +1,170 @@ +/* + * Copyright © 2014-2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include <drm/drmP.h> +#include <drm/i915_drm.h> +#include "i915_drv.h" + +#define QUIET (__GFP_NORETRY | __GFP_NOWARN) + +/* convert swiotlb segment size into sensible units (pages)! 
*/ +#define IO_TLB_SEGPAGES (IO_TLB_SEGSIZE << IO_TLB_SHIFT >> PAGE_SHIFT) + +static void internal_free_pages(struct sg_table *st) +{ + struct scatterlist *sg; + + for (sg = st->sgl; sg; sg = __sg_next(sg)) + __free_pages(sg_page(sg), get_order(sg->length)); + + sg_free_table(st); + kfree(st); +} + +static int i915_gem_object_get_pages_internal(struct drm_i915_gem_object *obj) +{ + struct drm_i915_private *i915 = to_i915(obj->base.dev); + unsigned int npages = obj->base.size / PAGE_SIZE; + struct sg_table *st; + struct scatterlist *sg; + int max_order; + gfp_t gfp; + + st = kmalloc(sizeof(*st), GFP_KERNEL); + if (!st) + return -ENOMEM; + + if (sg_alloc_table(st, npages, GFP_KERNEL)) { + kfree(st); + return -ENOMEM; + } + + sg = st->sgl; + st->nents = 0; + + max_order = MAX_ORDER; +#ifdef CONFIG_SWIOTLB + if (swiotlb_nr_tbl()) /* minimum max swiotlb size is IO_TLB_SEGSIZE */ + max_order = min(max_order, ilog2(IO_TLB_SEGPAGES)); +#endif + + gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_RECLAIMABLE; + if (IS_CRESTLINE(i915) || IS_BROADWATER(i915)) { + /* 965gm cannot relocate objects above 4GiB. */ + gfp &= ~__GFP_HIGHMEM; + gfp |= __GFP_DMA32; + } + + do { + int order = min(fls(npages) - 1, max_order); + struct page *page; + + do { + page = alloc_pages(gfp | (order ? QUIET : 0), order); + if (page) + break; + if (!order--) + goto err; + + /* Limit subsequent allocations as well */ + max_order = order; + } while (1); + + sg_set_page(sg, page, PAGE_SIZE << order, 0); + st->nents++; + + npages -= 1 << order; + if (!npages) { + sg_mark_end(sg); + break; + } + + sg = __sg_next(sg); + } while (1); + obj->pages = st; + + if (i915_gem_gtt_prepare_object(obj)) { + obj->pages = NULL; + goto err; + } + + /* Mark the pages as dontneed whilst they are still pinned. As soon + * as they are unpinned they are allowed to be reaped by the shrinker, + * and the caller is expected to repopulate - the contents of this + * object are only valid whilst active and pinned. + */ + obj->madv = I915_MADV_DONTNEED; + return 0; + +err: + sg_mark_end(sg); + internal_free_pages(st); + return -ENOMEM; +} + +static void i915_gem_object_put_pages_internal(struct drm_i915_gem_object *obj) +{ + i915_gem_gtt_finish_object(obj); + internal_free_pages(obj->pages); + + obj->dirty = 0; + obj->madv = I915_MADV_WILLNEED; +} + +static const struct drm_i915_gem_object_ops i915_gem_object_internal_ops = { + .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE, + .get_pages = i915_gem_object_get_pages_internal, + .put_pages = i915_gem_object_put_pages_internal, +}; + +/** + * Creates a new object that wraps some internal memory for private use. + * This object is not backed by swappable storage, and as such its contents + * are volatile and only valid whilst pinned. If the object is reaped by the + * shrinker, its pages and data will be discarded. Equally, it is not a full + * GEM object and so not valid for access from userspace. This makes it useful + * for hardware interfaces like ringbuffers (which are pinned from the time + * the request is written to the time the hardware stops accessing it), but + * not for contexts (which need to be preserved when not active for later + * reuse). Note that it is not cleared upon allocation. 
+ */ +struct drm_i915_gem_object * +i915_gem_object_create_internal(struct drm_i915_private *i915, + unsigned int size) +{ + struct drm_i915_gem_object *obj; + + obj = i915_gem_object_alloc(&i915->drm); + if (!obj) + return ERR_PTR(-ENOMEM); + + drm_gem_private_object_init(&i915->drm, &obj->base, size); + i915_gem_object_init(obj, &i915_gem_object_internal_ops); + + obj->base.write_domain = I915_GEM_DOMAIN_CPU; + obj->base.read_domains = I915_GEM_DOMAIN_CPU; + obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE; + + return obj; +} diff --git a/drivers/gpu/drm/i915/i915_gem_render_state.c b/drivers/gpu/drm/i915/i915_gem_render_state.c index e7c3dbcc6c81..217e0b58b930 100644 --- a/drivers/gpu/drm/i915/i915_gem_render_state.c +++ b/drivers/gpu/drm/i915/i915_gem_render_state.c @@ -187,7 +187,7 @@ int i915_gem_render_state_init(struct drm_i915_gem_request *req) if (so.rodata->batch_items * 4 > 4096) return -EINVAL; - obj = i915_gem_object_create(&req->i915->drm, 4096); + obj = i915_gem_object_create_internal(req->i915, 4096); if (IS_ERR(obj)) return PTR_ERR(obj); diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index 8cceb345aa0f..b2de371d2bf5 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -264,7 +264,7 @@ int intel_engine_create_scratch(struct intel_engine_cs *engine, int size) obj = i915_gem_object_create_stolen(&engine->i915->drm, size); if (!obj) - obj = i915_gem_object_create(&engine->i915->drm, size); + obj = i915_gem_object_create_internal(engine->i915, size); if (IS_ERR(obj)) { DRM_ERROR("Failed to allocate scratch page\n"); return PTR_ERR(obj); } diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 8eee9675d3bf..a15b9b5f2924 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1782,9 +1782,10 @@ static int init_status_page(struct intel_engine_cs *engine) struct drm_i915_gem_object *obj; struct i915_vma *vma; unsigned int flags; + void *vaddr; int ret; - obj = i915_gem_object_create(&engine->i915->drm, 4096); + obj = i915_gem_object_create_internal(engine->i915, 4096); if (IS_ERR(obj)) { DRM_ERROR("Failed to allocate status page\n"); return PTR_ERR(obj); } @@ -1817,15 +1818,22 @@ static int init_status_page(struct intel_engine_cs *engine) if (ret) goto err; + vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB); + if (IS_ERR(vaddr)) { + ret = PTR_ERR(vaddr); + goto err_unpin; + } + engine->status_page.vma = vma; engine->status_page.ggtt_offset = i915_ggtt_offset(vma); - engine->status_page.page_addr = - i915_gem_object_pin_map(obj, I915_MAP_WB); + engine->status_page.page_addr = memset(vaddr, 0, 4096); DRM_DEBUG_DRIVER("%s hws offset: 0x%08x\n", engine->name, i915_ggtt_offset(vma)); return 0; +err_unpin: + i915_vma_unpin(vma); err: i915_gem_object_put(obj); return ret; -- cgit v1.2.3 From 4e50f082ac51c95046a8315612ce1d9acb2b3d63 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 28 Oct 2016 13:58:31 +0100 Subject: drm/i915: Reuse the active golden render state batch The golden render state is constant, but we recreate the batch that sets it up for every new context. If we keep that batch in a volatile cache we can safely reuse it whenever we need to initialise a new context. We mark the pages as purgeable and use the shrinker to recover pages from the batch whenever we face memory pressure, recreating that batch afresh on the next new context. 
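A condensed sketch of the resulting reuse check (editor's illustration, mirroring the i915_gem_render_state_emit() hunks below): the cached batch offset doubles as a validity marker that is invalidated once the shrinker has reaped the pages.

	/* Recreate the page after shrinking */
	if (!so->vma->obj->pages)
		so->batch_offset = -1;

	ret = i915_vma_pin(so->vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
	if (ret)
		return ret;

	/* Rewrite the golden state only if the batch moved or was purged. */
	if (so->vma->node.start != so->batch_offset) {
		ret = render_state_setup(so, req->i915);
		if (ret)
			goto err_unpin;
	}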
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-8-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_render_state.c | 184 +++++++++++++++++---------- drivers/gpu/drm/i915/i915_gem_render_state.h | 4 +- drivers/gpu/drm/i915/intel_engine_cs.c | 5 + drivers/gpu/drm/i915/intel_lrc.c | 2 +- drivers/gpu/drm/i915/intel_ringbuffer.c | 2 +- drivers/gpu/drm/i915/intel_ringbuffer.h | 3 + 6 files changed, 129 insertions(+), 71 deletions(-) (limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_gem_render_state.c b/drivers/gpu/drm/i915/i915_gem_render_state.c index 217e0b58b930..9625e1a662ed 100644 --- a/drivers/gpu/drm/i915/i915_gem_render_state.c +++ b/drivers/gpu/drm/i915/i915_gem_render_state.c @@ -28,17 +28,19 @@ #include "i915_drv.h" #include "intel_renderstate.h" -struct render_state { +struct intel_render_state { const struct intel_renderstate_rodata *rodata; struct i915_vma *vma; - u32 aux_batch_size; - u32 aux_batch_offset; + u32 batch_offset; + u32 batch_size; + u32 aux_offset; + u32 aux_size; }; static const struct intel_renderstate_rodata * -render_state_get_rodata(const struct drm_i915_gem_request *req) +render_state_get_rodata(const struct intel_engine_cs *engine) { - switch (INTEL_GEN(req->i915)) { + switch (INTEL_GEN(engine->i915)) { case 6: return &gen6_null_state; case 7: @@ -63,29 +65,27 @@ render_state_get_rodata(const struct drm_i915_gem_request *req) */ #define OUT_BATCH(batch, i, val) \ do { \ - if (WARN_ON((i) >= PAGE_SIZE / sizeof(u32))) { \ - ret = -ENOSPC; \ - goto err_out; \ - } \ + if ((i) >= PAGE_SIZE / sizeof(u32)) \ + goto err; \ (batch)[(i)++] = (val); \ } while(0) -static int render_state_setup(struct render_state *so) +static int render_state_setup(struct intel_render_state *so, + struct drm_i915_private *i915) { - struct drm_i915_private *dev_priv = to_i915(so->vma->vm->dev); const struct intel_renderstate_rodata *rodata = so->rodata; - const bool has_64bit_reloc = INTEL_GEN(dev_priv) >= 8; + const bool has_64bit_reloc = INTEL_GEN(i915) >= 8; + struct drm_i915_gem_object *obj = so->vma->obj; unsigned int i = 0, reloc_index = 0; - struct page *page; + unsigned int needs_clflush; u32 *d; int ret; - ret = i915_gem_object_set_to_cpu_domain(so->vma->obj, true); + ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush); if (ret) return ret; - page = i915_gem_object_get_dirty_page(so->vma->obj, 0); - d = kmap(page); + d = kmap_atomic(i915_gem_object_get_dirty_page(obj, 0)); while (i < rodata->batch_items) { u32 s = rodata->batch[i]; @@ -95,10 +95,8 @@ static int render_state_setup(struct render_state *so) s = lower_32_bits(r); if (has_64bit_reloc) { if (i + 1 >= rodata->batch_items || - rodata->batch[i + 1] != 0) { - ret = -EINVAL; - goto err_out; - } + rodata->batch[i + 1] != 0) + goto err; d[i++] = s; s = upper_32_bits(r); @@ -110,12 +108,20 @@ static int render_state_setup(struct render_state *so) d[i++] = s; } + if (rodata->reloc[reloc_index] != -1) { + DRM_ERROR("only %d relocs resolved\n", reloc_index); + goto err; + } + + so->batch_offset = so->vma->node.start; + so->batch_size = rodata->batch_items * sizeof(u32); + while (i % CACHELINE_DWORDS) OUT_BATCH(d, i, MI_NOOP); - so->aux_batch_offset = i * sizeof(u32); + so->aux_offset = i * sizeof(u32); - if (HAS_POOLED_EU(dev_priv)) { + if (HAS_POOLED_EU(i915)) { /* * We always program 3x6 pool config but depending upon which * subslice is disabled HW drops down to appropriate config @@ 
-143,89 +149,131 @@ static int render_state_setup(struct render_state *so) } OUT_BATCH(d, i, MI_BATCH_BUFFER_END); - so->aux_batch_size = (i * sizeof(u32)) - so->aux_batch_offset; - + so->aux_size = i * sizeof(u32) - so->aux_offset; + so->aux_offset += so->batch_offset; /* * Since we are sending length, we need to strictly conform to * all requirements. For Gen2 this must be a multiple of 8. */ - so->aux_batch_size = ALIGN(so->aux_batch_size, 8); - - kunmap(page); - - ret = i915_gem_object_set_to_gtt_domain(so->vma->obj, false); - if (ret) - return ret; - - if (rodata->reloc[reloc_index] != -1) { - DRM_ERROR("only %d relocs resolved\n", reloc_index); - return -EINVAL; - } + so->aux_size = ALIGN(so->aux_size, 8); - return 0; + if (needs_clflush) + drm_clflush_virt_range(d, i * sizeof(u32)); + kunmap_atomic(d); -err_out: - kunmap(page); + ret = i915_gem_object_set_to_gtt_domain(obj, false); +out: + i915_gem_obj_finish_shmem_access(obj); return ret; + +err: + kunmap_atomic(d); + ret = -EINVAL; + goto out; } #undef OUT_BATCH -int i915_gem_render_state_init(struct drm_i915_gem_request *req) +int i915_gem_render_state_init(struct intel_engine_cs *engine) { - struct render_state so; + struct intel_render_state *so; + const struct intel_renderstate_rodata *rodata; struct drm_i915_gem_object *obj; int ret; - if (WARN_ON(req->engine->id != RCS)) - return -ENOENT; + if (engine->id != RCS) + return 0; - so.rodata = render_state_get_rodata(req); - if (!so.rodata) + rodata = render_state_get_rodata(engine); + if (!rodata) return 0; - if (so.rodata->batch_items * 4 > 4096) + if (rodata->batch_items * 4 > 4096) return -EINVAL; - obj = i915_gem_object_create_internal(req->i915, 4096); - if (IS_ERR(obj)) - return PTR_ERR(obj); + so = kmalloc(sizeof(*so), GFP_KERNEL); + if (!so) + return -ENOMEM; - so.vma = i915_vma_create(obj, &req->i915->ggtt.base, NULL); - if (IS_ERR(so.vma)) { - ret = PTR_ERR(so.vma); - goto err_obj; + obj = i915_gem_object_create_internal(engine->i915, 4096); + if (IS_ERR(obj)) { + ret = PTR_ERR(obj); + goto err_free; } - ret = i915_vma_pin(so.vma, 0, 0, PIN_GLOBAL); - if (ret) + so->vma = i915_vma_create(obj, &engine->i915->ggtt.base, NULL); + if (IS_ERR(so->vma)) { + ret = PTR_ERR(so->vma); goto err_obj; + } + + so->rodata = rodata; + engine->render_state = so; + return 0; - ret = render_state_setup(&so); +err_obj: + i915_gem_object_put(obj); +err_free: + kfree(so); + return ret; +} + +int i915_gem_render_state_emit(struct drm_i915_gem_request *req) +{ + struct intel_render_state *so; + int ret; + + so = req->engine->render_state; + if (!so) + return 0; + + /* Recreate the page after shrinking */ + if (!so->vma->obj->pages) + so->batch_offset = -1; + + ret = i915_vma_pin(so->vma, 0, 0, PIN_GLOBAL | PIN_HIGH); if (ret) - goto err_unpin; + return ret; - ret = req->engine->emit_bb_start(req, so.vma->node.start, - so.rodata->batch_items * 4, + if (so->vma->node.start != so->batch_offset) { + ret = render_state_setup(so, req->i915); + if (ret) + goto err_unpin; + } + + ret = req->engine->emit_bb_start(req, + so->batch_offset, so->batch_size, I915_DISPATCH_SECURE); if (ret) goto err_unpin; - if (so.aux_batch_size > 8) { + if (so->aux_size > 8) { ret = req->engine->emit_bb_start(req, - (so.vma->node.start + - so.aux_batch_offset), - so.aux_batch_size, + so->aux_offset, so->aux_size, I915_DISPATCH_SECURE); if (ret) goto err_unpin; } - i915_vma_move_to_active(so.vma, req, 0); + i915_vma_move_to_active(so->vma, req, 0); err_unpin: - i915_vma_unpin(so.vma); - i915_vma_close(so.vma); -err_obj: - 
__i915_gem_object_release_unless_active(obj); + i915_vma_unpin(so->vma); return ret; } + +void i915_gem_render_state_fini(struct intel_engine_cs *engine) +{ + struct intel_render_state *so; + struct drm_i915_gem_object *obj; + + so = fetch_and_zero(&engine->render_state); + if (!so) + return; + + obj = so->vma->obj; + + i915_vma_close(so->vma); + __i915_gem_object_release_unless_active(obj); + + kfree(so); +} diff --git a/drivers/gpu/drm/i915/i915_gem_render_state.h b/drivers/gpu/drm/i915/i915_gem_render_state.h index 18cce3f06e9c..87481845799d 100644 --- a/drivers/gpu/drm/i915/i915_gem_render_state.h +++ b/drivers/gpu/drm/i915/i915_gem_render_state.h @@ -26,6 +26,8 @@ struct drm_i915_gem_request; -int i915_gem_render_state_init(struct drm_i915_gem_request *req); +int i915_gem_render_state_init(struct intel_engine_cs *engine); +int i915_gem_render_state_emit(struct drm_i915_gem_request *req); +void i915_gem_render_state_fini(struct intel_engine_cs *engine); #endif /* _I915_GEM_RENDER_STATE_H_ */ diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index b2de371d2bf5..fd551824adf9 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -314,6 +314,10 @@ int intel_engine_init_common(struct intel_engine_cs *engine) if (ret) return ret; + ret = i915_gem_render_state_init(engine); + if (ret) + return ret; + return 0; } @@ -328,6 +332,7 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine) { intel_engine_cleanup_scratch(engine); + i915_gem_render_state_fini(engine); intel_engine_fini_breadcrumbs(engine); intel_engine_cleanup_cmd_parser(engine); i915_gem_batch_pool_fini(&engine->batch_pool); diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index bc86585b9fbb..1c1bd30e8b2d 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -1637,7 +1637,7 @@ static int gen8_init_rcs_context(struct drm_i915_gem_request *req) if (ret) DRM_ERROR("MOCS failed to program: expect performance issues.\n"); - return i915_gem_render_state_init(req); + return i915_gem_render_state_emit(req); } /** diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index a15b9b5f2924..aaa46d9ffbc1 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -648,7 +648,7 @@ static int intel_rcs_ctx_init(struct drm_i915_gem_request *req) if (ret != 0) return ret; - ret = i915_gem_render_state_init(req); + ret = i915_gem_render_state_emit(req); if (ret) return ret; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 09bb89cfb7c3..cb6e96c6cd47 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -157,6 +157,7 @@ struct i915_ctx_workarounds { }; struct drm_i915_gem_request; +struct intel_render_state; struct intel_engine_cs { struct drm_i915_private *i915; @@ -184,6 +185,8 @@ struct intel_engine_cs { unsigned int irq_shift; struct intel_ring *buffer; + struct intel_render_state *render_state; + /* Rather than have every client wait upon all user interrupts, * with the herd waking after every interrupt and each doing the * heavyweight seqno dance, we delegate the task (of being the -- cgit v1.2.3 From 65e4760e3920c21073a9d737929dc36df561380f Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 28 Oct 2016 13:58:49 +0100 Subject: drm/i915: Introduce a global_seqno for each request Though we will have 
multiple timelines, we still have a single timeline of execution. This we can use to provide an execution and retirement order of requests. This keeps tracking execution of requests simple, and is vital for preserving a single waiter (i.e. so that we can order the waiters such that only the earliest need be woken). To accomplish this we distinguish the seqno used to order requests per-context (external) and that used internally for execution. Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-26-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_debugfs.c | 2 +- drivers/gpu/drm/i915/i915_drv.h | 4 ++-- drivers/gpu/drm/i915/i915_gem.c | 2 +- drivers/gpu/drm/i915/i915_gem_request.c | 19 +++++++++++++----- drivers/gpu/drm/i915/i915_gem_request.h | 32 +++++++++++++++++++++++++----- drivers/gpu/drm/i915/i915_gpu_error.c | 2 +- drivers/gpu/drm/i915/i915_guc_submission.c | 4 ++-- drivers/gpu/drm/i915/i915_trace.h | 8 ++++---- drivers/gpu/drm/i915/intel_breadcrumbs.c | 8 +++++--- drivers/gpu/drm/i915/intel_lrc.c | 4 ++-- drivers/gpu/drm/i915/intel_ringbuffer.c | 14 ++++++------- 11 files changed, 66 insertions(+), 33 deletions(-) (limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 3a0ea5eace37..90bc4a89e0d5 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -637,7 +637,7 @@ static void print_request(struct seq_file *m, rcu_read_lock(); task = pid ? pid_task(pid, PIDTYPE_PID) : NULL; seq_printf(m, "%s%x [%x:%x] @ %d: %s [%d]\n", prefix, - rq->fence.seqno, rq->ctx->hw_id, rq->fence.seqno, + rq->global_seqno, rq->ctx->hw_id, rq->fence.seqno, jiffies_to_msecs(jiffies - rq->emitted_jiffies), task ? task->comm : "", task ? task->pid : -1); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index f0f68f64d09c..217674bb1495 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -4050,7 +4050,7 @@ __i915_request_irq_complete(struct drm_i915_gem_request *req) /* Before we do the heavier coherent read of the seqno, * check the value (hopefully) in the CPU cacheline. */ - if (i915_gem_request_completed(req)) + if (__i915_gem_request_completed(req)) return true; /* Ensure our read of the seqno is coherent so that we @@ -4101,7 +4101,7 @@ __i915_request_irq_complete(struct drm_i915_gem_request *req) wake_up_process(tsk); rcu_read_unlock(); - if (i915_gem_request_completed(req)) + if (__i915_gem_request_completed(req)) return true; } diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 5ea46a7d991f..f4cfb88bd804 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2615,7 +2615,7 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine) return; DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n", - engine->name, request->fence.seqno); + engine->name, request->global_seqno); /* Setup the CS to resume from the breadcrumb of the hung request */ engine->reset_hw(engine, request); diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index 03ae85a1eefb..311cf3fac2e0 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -376,7 +376,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine, * of being read by __i915_gem_active_get_rcu().
As such, * we have to be very careful when overwriting the contents. During * the RCU lookup, we change chase the request->engine pointer, - * read the request->fence.seqno and increment the reference count. + * read the request->global_seqno and increment the reference count. * * The reference count is incremented atomically. If it is zero, * the lookup knows the request is unallocated and complete. Otherwise, @@ -418,6 +418,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine, INIT_LIST_HEAD(&req->active_list); req->i915 = dev_priv; req->engine = engine; + req->global_seqno = seqno; req->ctx = i915_gem_context_get(ctx); /* No zalloc, must clear what we need by hand */ @@ -475,8 +476,15 @@ i915_gem_request_await_request(struct drm_i915_gem_request *to, return ret < 0 ? ret : 0; } + if (!from->global_seqno) { + ret = i915_sw_fence_await_dma_fence(&to->submit, + &from->fence, 0, + GFP_KERNEL); + return ret < 0 ? ret : 0; + } + idx = intel_engine_sync_index(from->engine, to->engine); - if (from->fence.seqno <= from->engine->semaphore.sync_seqno[idx]) + if (from->global_seqno <= from->engine->semaphore.sync_seqno[idx]) return 0; trace_i915_gem_ring_sync_to(to, from); @@ -494,7 +502,7 @@ i915_gem_request_await_request(struct drm_i915_gem_request *to, return ret; } - from->engine->semaphore.sync_seqno[idx] = from->fence.seqno; + from->engine->semaphore.sync_seqno[idx] = from->global_seqno; return 0; } @@ -774,7 +782,7 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req, timeout_us += local_clock_us(&cpu); do { - if (i915_gem_request_completed(req)) + if (__i915_gem_request_completed(req)) return true; if (signal_pending_state(state, current)) @@ -883,6 +891,7 @@ long i915_wait_request(struct drm_i915_gem_request *req, GEM_BUG_ON(!i915_sw_fence_done(&req->submit)); } + GEM_BUG_ON(!req->global_seqno); /* Optimistic short spin before touching IRQs */ if (i915_spin_request(req, state, 5)) @@ -892,7 +901,7 @@ long i915_wait_request(struct drm_i915_gem_request *req, if (flags & I915_WAIT_LOCKED) add_wait_queue(&req->i915->gpu_error.wait_queue, &reset); - intel_wait_init(&wait, req->fence.seqno); + intel_wait_init(&wait, req->global_seqno); if (intel_engine_add_wait(req->engine, &wait)) /* In order to check that we haven't missed the interrupt * as we enabled it, we need to kick ourselves to do a diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index 4ac30ae93e49..75f8360b3421 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -87,6 +87,8 @@ struct drm_i915_gem_request { struct i915_sw_fence submit; wait_queue_t submitq; + u32 global_seqno; + /** GEM sequence number associated with the previous request, * when the HWS breadcrumb is equal to this the GPU is processing * this request. @@ -163,7 +165,7 @@ void i915_gem_request_retire_upto(struct drm_i915_gem_request *req); static inline u32 i915_gem_request_get_seqno(struct drm_i915_gem_request *req) { - return req ? req->fence.seqno : 0; + return req ? 
req->global_seqno : 0; } static inline struct intel_engine_cs * @@ -248,17 +250,37 @@ static inline bool i915_seqno_passed(u32 seq1, u32 seq2) } static inline bool -i915_gem_request_started(const struct drm_i915_gem_request *req) +__i915_gem_request_started(const struct drm_i915_gem_request *req) { + GEM_BUG_ON(!req->global_seqno); return i915_seqno_passed(intel_engine_get_seqno(req->engine), req->previous_seqno); } static inline bool -i915_gem_request_completed(const struct drm_i915_gem_request *req) +i915_gem_request_started(const struct drm_i915_gem_request *req) { + if (!req->global_seqno) + return false; + + return __i915_gem_request_started(req); +} + +static inline bool +__i915_gem_request_completed(const struct drm_i915_gem_request *req) +{ + GEM_BUG_ON(!req->global_seqno); return i915_seqno_passed(intel_engine_get_seqno(req->engine), - req->fence.seqno); + req->global_seqno); +} + +static inline bool +i915_gem_request_completed(const struct drm_i915_gem_request *req) +{ + if (!req->global_seqno) + return false; + + return __i915_gem_request_completed(req); } bool __i915_spin_request(const struct drm_i915_gem_request *request, @@ -266,7 +288,7 @@ bool __i915_spin_request(const struct drm_i915_gem_request *request, static inline bool i915_spin_request(const struct drm_i915_gem_request *request, int state, unsigned long timeout_us) { - return (i915_gem_request_started(request) && + return (__i915_gem_request_started(request) && __i915_spin_request(request, state, timeout_us)); } diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 12fea57d41fb..9aa197ca6210 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1176,7 +1176,7 @@ static void record_request(struct drm_i915_gem_request *request, struct drm_i915_error_request *erq) { erq->context = request->ctx->hw_id; - erq->seqno = request->fence.seqno; + erq->seqno = request->global_seqno; erq->jiffies = request->emitted_jiffies; erq->head = request->head; erq->tail = request->tail; diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c index cca250e90845..857ef914cae7 100644 --- a/drivers/gpu/drm/i915/i915_guc_submission.c +++ b/drivers/gpu/drm/i915/i915_guc_submission.c @@ -554,7 +554,7 @@ static void guc_wq_item_append(struct i915_guc_client *gc, wqi->context_desc = (u32)intel_lr_context_descriptor(rq->ctx, engine); wqi->ring_tail = tail << WQ_RING_TAIL_SHIFT; - wqi->fence_id = rq->fence.seqno; + wqi->fence_id = rq->global_seqno; kunmap_atomic(base); } @@ -655,7 +655,7 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq) client->b_fail += 1; guc->submissions[engine_id] += 1; - guc->last_seqno[engine_id] = rq->fence.seqno; + guc->last_seqno[engine_id] = rq->global_seqno; spin_unlock(&client->wq_lock); } diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h index 5c912c25f7d3..c5d210ebaa9a 100644 --- a/drivers/gpu/drm/i915/i915_trace.h +++ b/drivers/gpu/drm/i915/i915_trace.h @@ -466,7 +466,7 @@ TRACE_EVENT(i915_gem_ring_sync_to, __entry->dev = from->i915->drm.primary->index; __entry->sync_from = from->engine->id; __entry->sync_to = to->engine->id; - __entry->seqno = from->fence.seqno; + __entry->seqno = from->global_seqno; ), TP_printk("dev=%u, sync-from=%u, sync-to=%u, seqno=%u", @@ -489,7 +489,7 @@ TRACE_EVENT(i915_gem_ring_dispatch, TP_fast_assign( __entry->dev = req->i915->drm.primary->index; __entry->ring = req->engine->id; - __entry->seqno = 
req->fence.seqno; + __entry->seqno = req->global_seqno; __entry->flags = flags; dma_fence_enable_sw_signaling(&req->fence); ), @@ -534,7 +534,7 @@ DECLARE_EVENT_CLASS(i915_gem_request, TP_fast_assign( __entry->dev = req->i915->drm.primary->index; __entry->ring = req->engine->id; - __entry->seqno = req->fence.seqno; + __entry->seqno = req->global_seqno; ), TP_printk("dev=%u, ring=%u, seqno=%u", @@ -596,7 +596,7 @@ TRACE_EVENT(i915_gem_request_wait_begin, TP_fast_assign( __entry->dev = req->i915->drm.primary->index; __entry->ring = req->engine->id; - __entry->seqno = req->fence.seqno; + __entry->seqno = req->global_seqno; __entry->blocking = mutex_is_locked(&req->i915->drm.struct_mutex); ), diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c index 56efcc507ea2..0d5def0d2dfe 100644 --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c @@ -504,9 +504,11 @@ void intel_engine_enable_signaling(struct drm_i915_gem_request *request) /* locked by dma_fence_enable_sw_signaling() */ assert_spin_locked(&request->lock); + if (!request->global_seqno) + return; request->signaling.wait.tsk = b->signaler; - request->signaling.wait.seqno = request->fence.seqno; + request->signaling.wait.seqno = request->global_seqno; i915_gem_request_get(request); spin_lock(&b->lock); @@ -530,8 +532,8 @@ void intel_engine_enable_signaling(struct drm_i915_gem_request *request) p = &b->signals.rb_node; while (*p) { parent = *p; - if (i915_seqno_passed(request->fence.seqno, - to_signaler(parent)->fence.seqno)) { + if (i915_seqno_passed(request->global_seqno, + to_signaler(parent)->global_seqno)) { p = &parent->rb_right; first = false; } else { diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index cb30549dfd40..e0a9bf81774b 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -1584,7 +1584,7 @@ static int gen8_emit_request(struct drm_i915_gem_request *request) intel_hws_seqno_address(request->engine) | MI_FLUSH_DW_USE_GTT); intel_ring_emit(ring, 0); - intel_ring_emit(ring, request->fence.seqno); + intel_ring_emit(ring, request->global_seqno); intel_ring_emit(ring, MI_USER_INTERRUPT); intel_ring_emit(ring, MI_NOOP); return intel_logical_ring_advance(request); @@ -1613,7 +1613,7 @@ static int gen8_emit_request_render(struct drm_i915_gem_request *request) PIPE_CONTROL_QW_WRITE)); intel_ring_emit(ring, intel_hws_seqno_address(request->engine)); intel_ring_emit(ring, 0); - intel_ring_emit(ring, i915_gem_request_get_seqno(request)); + intel_ring_emit(ring, request->global_seqno); /* We're thrashing one dword of HWS. 
*/ intel_ring_emit(ring, 0); intel_ring_emit(ring, MI_USER_INTERRUPT); diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index aaa46d9ffbc1..76c6b70303fb 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1238,7 +1238,7 @@ static int gen8_rcs_signal(struct drm_i915_gem_request *req) PIPE_CONTROL_CS_STALL); intel_ring_emit(ring, lower_32_bits(gtt_offset)); intel_ring_emit(ring, upper_32_bits(gtt_offset)); - intel_ring_emit(ring, req->fence.seqno); + intel_ring_emit(ring, req->global_seqno); intel_ring_emit(ring, 0); intel_ring_emit(ring, MI_SEMAPHORE_SIGNAL | @@ -1274,7 +1274,7 @@ static int gen8_xcs_signal(struct drm_i915_gem_request *req) lower_32_bits(gtt_offset) | MI_FLUSH_DW_USE_GTT); intel_ring_emit(ring, upper_32_bits(gtt_offset)); - intel_ring_emit(ring, req->fence.seqno); + intel_ring_emit(ring, req->global_seqno); intel_ring_emit(ring, MI_SEMAPHORE_SIGNAL | MI_SEMAPHORE_TARGET(waiter->hw_id)); @@ -1308,7 +1308,7 @@ static int gen6_signal(struct drm_i915_gem_request *req) if (i915_mmio_reg_valid(mbox_reg)) { intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1)); intel_ring_emit_reg(ring, mbox_reg); - intel_ring_emit(ring, req->fence.seqno); + intel_ring_emit(ring, req->global_seqno); } } @@ -1339,7 +1339,7 @@ static int i9xx_emit_request(struct drm_i915_gem_request *req) intel_ring_emit(ring, MI_STORE_DWORD_INDEX); intel_ring_emit(ring, I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT); - intel_ring_emit(ring, req->fence.seqno); + intel_ring_emit(ring, req->global_seqno); intel_ring_emit(ring, MI_USER_INTERRUPT); intel_ring_advance(ring); @@ -1389,7 +1389,7 @@ static int gen8_render_emit_request(struct drm_i915_gem_request *req) PIPE_CONTROL_QW_WRITE)); intel_ring_emit(ring, intel_hws_seqno_address(engine)); intel_ring_emit(ring, 0); - intel_ring_emit(ring, i915_gem_request_get_seqno(req)); + intel_ring_emit(ring, req->global_seqno); /* We're thrashing one dword of HWS. */ intel_ring_emit(ring, 0); intel_ring_emit(ring, MI_USER_INTERRUPT); @@ -1427,7 +1427,7 @@ gen8_ring_sync_to(struct drm_i915_gem_request *req, MI_SEMAPHORE_WAIT | MI_SEMAPHORE_GLOBAL_GTT | MI_SEMAPHORE_SAD_GTE_SDD); - intel_ring_emit(ring, signal->fence.seqno); + intel_ring_emit(ring, signal->global_seqno); intel_ring_emit(ring, lower_32_bits(offset)); intel_ring_emit(ring, upper_32_bits(offset)); intel_ring_advance(ring); @@ -1465,7 +1465,7 @@ gen6_ring_sync_to(struct drm_i915_gem_request *req, * seqno is >= the last seqno executed. However for hardware the * comparison is strictly greater than. */ - intel_ring_emit(ring, signal->fence.seqno - 1); + intel_ring_emit(ring, signal->global_seqno - 1); intel_ring_emit(ring, 0); intel_ring_emit(ring, MI_NOOP); intel_ring_advance(ring); -- cgit v1.2.3 From 9b81d556b11fe58827dcd87bc5deaf8da2f716ae Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 28 Oct 2016 13:58:50 +0100 Subject: drm/i915: Rename ->emit_request to ->emit_breadcrumb Now that the emission of the request tail and its submission to hardware are two separate steps, engine->emit_request() is confusing. Since engine->emit_request() is called to emit the breadcrumb commands for the request into the ring, rename it to match (engine->emit_breadcrumb).
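At the call sites only the name changes; as a minimal sketch of the two now-separate steps (mirroring the __i915_add_request() and vfunc hunks below):

    /* Step 1: emit the breadcrumb commands into the ring... */
    ret = engine->emit_breadcrumb(request);

    /* Step 2: ...and, separately, pass the request to the hardware queue. */
    engine->submit_request(request);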
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-27-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_request.c | 4 ++-- drivers/gpu/drm/i915/intel_lrc.c | 10 +++++----- drivers/gpu/drm/i915/intel_ringbuffer.c | 16 ++++++++-------- drivers/gpu/drm/i915/intel_ringbuffer.h | 2 +- 4 files changed, 16 insertions(+), 16 deletions(-) (limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index 311cf3fac2e0..a626b2638722 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -685,8 +685,8 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches) request->postfix = ring->tail; /* Not allowed to fail! */ - ret = engine->emit_request(request); - WARN(ret, "(%s)->emit_request failed: %d!\n", engine->name, ret); + ret = engine->emit_breadcrumb(request); + WARN(ret, "(%s)->emit_breadcrumb failed: %d!\n", engine->name, ret); /* Sanity check that the reserved size was large enough. */ ret = ring->tail - request_start; diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index e0a9bf81774b..57dba458f185 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -440,7 +440,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) if (last) /* WaIdleLiteRestore:bdw,skl * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL - * as we resubmit the request. See gen8_emit_request() + * as we resubmit the request. See gen8_emit_breadcrumb() * for where we prepare the padding after the end of the * request. */ @@ -1567,7 +1567,7 @@ static void bxt_a_seqno_barrier(struct intel_engine_cs *engine) * restore with HEAD==TAIL (WaIdleLiteRestore). 
*/ -static int gen8_emit_request(struct drm_i915_gem_request *request) +static int gen8_emit_breadcrumb(struct drm_i915_gem_request *request) { struct intel_ring *ring = request->ring; int ret; @@ -1590,7 +1590,7 @@ static int gen8_emit_request(struct drm_i915_gem_request *request) return intel_logical_ring_advance(request); } -static int gen8_emit_request_render(struct drm_i915_gem_request *request) +static int gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request) { struct intel_ring *ring = request->ring; int ret; @@ -1694,7 +1694,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine) engine->init_hw = gen8_init_common_ring; engine->reset_hw = reset_common_ring; engine->emit_flush = gen8_emit_flush; - engine->emit_request = gen8_emit_request; + engine->emit_breadcrumb = gen8_emit_breadcrumb; engine->submit_request = execlists_submit_request; engine->irq_enable = gen8_logical_ring_enable_irq; @@ -1816,7 +1816,7 @@ int logical_render_ring_init(struct intel_engine_cs *engine) engine->init_hw = gen8_init_render_ring; engine->init_context = gen8_init_rcs_context; engine->emit_flush = gen8_emit_flush_render; - engine->emit_request = gen8_emit_request_render; + engine->emit_breadcrumb = gen8_emit_breadcrumb_render; ret = intel_engine_create_scratch(engine, 4096); if (ret) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 76c6b70303fb..54c3981cf716 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1328,7 +1328,7 @@ static void i9xx_submit_request(struct drm_i915_gem_request *request) intel_ring_offset(request->ring, request->tail)); } -static int i9xx_emit_request(struct drm_i915_gem_request *req) +static int i9xx_emit_breadcrumb(struct drm_i915_gem_request *req) { struct intel_ring *ring = req->ring; int ret; @@ -1349,14 +1349,14 @@ static int i9xx_emit_request(struct drm_i915_gem_request *req) } /** - * gen6_sema_emit_request - Update the semaphore mailbox registers + * gen6_sema_emit_breadcrumb - Update the semaphore mailbox registers * * @request - request to write to the ring * * Update the mailbox registers in the *other* rings with the current seqno. * This acts like a signal in the canonical semaphore. 
*/ -static int gen6_sema_emit_request(struct drm_i915_gem_request *req) +static int gen6_sema_emit_breadcrumb(struct drm_i915_gem_request *req) { int ret; @@ -1364,10 +1364,10 @@ static int gen6_sema_emit_request(struct drm_i915_gem_request *req) if (ret) return ret; - return i9xx_emit_request(req); + return i9xx_emit_breadcrumb(req); } -static int gen8_render_emit_request(struct drm_i915_gem_request *req) +static int gen8_render_emit_breadcrumb(struct drm_i915_gem_request *req) { struct intel_engine_cs *engine = req->engine; struct intel_ring *ring = req->ring; @@ -2637,9 +2637,9 @@ static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv, engine->init_hw = init_ring_common; engine->reset_hw = reset_ring_common; - engine->emit_request = i9xx_emit_request; + engine->emit_breadcrumb = i9xx_emit_breadcrumb; if (i915.semaphores) - engine->emit_request = gen6_sema_emit_request; + engine->emit_breadcrumb = gen6_sema_emit_breadcrumb; engine->submit_request = i9xx_submit_request; if (INTEL_GEN(dev_priv) >= 8) @@ -2666,7 +2666,7 @@ int intel_init_render_ring_buffer(struct intel_engine_cs *engine) if (INTEL_GEN(dev_priv) >= 8) { engine->init_context = intel_rcs_ctx_init; - engine->emit_request = gen8_render_emit_request; + engine->emit_breadcrumb = gen8_render_emit_breadcrumb; engine->emit_flush = gen8_render_ring_flush; if (i915.semaphores) engine->semaphore.signal = gen8_rcs_signal; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index a62e396c8863..a5ced1649ecd 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -255,7 +255,7 @@ struct intel_engine_cs { #define I915_DISPATCH_SECURE BIT(0) #define I915_DISPATCH_PINNED BIT(1) #define I915_DISPATCH_RS BIT(2) - int (*emit_request)(struct drm_i915_gem_request *req); + int (*emit_breadcrumb)(struct drm_i915_gem_request *req); /* Pass the request to the hardware queue (e.g. directly into * the legacy ringbuffer or to the end of an execlist). -- cgit v1.2.3 From 98f29e8d908f2b9e3d966f6f7d63cd69b4aaf0a2 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 28 Oct 2016 13:58:51 +0100 Subject: drm/i915: Record space required for breadcrumb emission In the next patch, we will use deferred breadcrumb emission. That requires reserving sufficient space in the ringbuffer to emit the breadcrumb, which first requires us to know how large the breadcrumb is. Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-28-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_request.c | 1 + drivers/gpu/drm/i915/intel_lrc.c | 6 ++++++ drivers/gpu/drm/i915/intel_ringbuffer.c | 29 +++++++++++++++++++++++++++-- drivers/gpu/drm/i915/intel_ringbuffer.h | 1 + 4 files changed, 35 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index a626b2638722..be9e23b32e4a 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -434,6 +434,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine, * away, e.g. because a GPU scheduler has deferred it. 
*/ req->reserved_space = MIN_SPACE_FOR_ADD_REQUEST; + GEM_BUG_ON(req->reserved_space < engine->emit_breadcrumb_sz); if (i915.enable_execlists) ret = intel_logical_ring_alloc_request_extras(req); diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 57dba458f185..8229baebb2b3 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -1590,6 +1590,8 @@ static int gen8_emit_breadcrumb(struct drm_i915_gem_request *request) return intel_logical_ring_advance(request); } +static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS; + static int gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request) { struct intel_ring *ring = request->ring; @@ -1621,6 +1623,8 @@ static int gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request) return intel_logical_ring_advance(request); } +static const int gen8_emit_breadcrumb_render_sz = 8 + WA_TAIL_DWORDS; + static int gen8_init_rcs_context(struct drm_i915_gem_request *req) { int ret; @@ -1695,6 +1699,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine) engine->reset_hw = reset_common_ring; engine->emit_flush = gen8_emit_flush; engine->emit_breadcrumb = gen8_emit_breadcrumb; + engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz; engine->submit_request = execlists_submit_request; engine->irq_enable = gen8_logical_ring_enable_irq; @@ -1817,6 +1822,7 @@ int logical_render_ring_init(struct intel_engine_cs *engine) engine->init_context = gen8_init_rcs_context; engine->emit_flush = gen8_emit_flush_render; engine->emit_breadcrumb = gen8_emit_breadcrumb_render; + engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_render_sz; ret = intel_engine_create_scratch(engine, 4096); if (ret) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 54c3981cf716..ae9cf6bb4def 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1348,6 +1348,8 @@ static int i9xx_emit_breadcrumb(struct drm_i915_gem_request *req) return 0; } +static const int i9xx_emit_breadcrumb_sz = 4; + /** * gen6_sema_emit_breadcrumb - Update the semaphore mailbox registers * @@ -1401,6 +1403,8 @@ static int gen8_render_emit_breadcrumb(struct drm_i915_gem_request *req) return 0; } +static const int gen8_render_emit_breadcrumb_sz = 8; + /** * intel_ring_sync - sync the waiter to the signaller on seqno * @@ -2638,8 +2642,21 @@ static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv, engine->reset_hw = reset_ring_common; engine->emit_breadcrumb = i9xx_emit_breadcrumb; - if (i915.semaphores) + engine->emit_breadcrumb_sz = i9xx_emit_breadcrumb_sz; + if (i915.semaphores) { + int num_rings; + engine->emit_breadcrumb = gen6_sema_emit_breadcrumb; + + num_rings = hweight32(INTEL_INFO(dev_priv)->ring_mask) - 1; + if (INTEL_GEN(dev_priv) >= 8) { + engine->emit_breadcrumb_sz += num_rings * 6; + } else { + engine->emit_breadcrumb_sz += num_rings * 3; + if (num_rings & 1) + engine->emit_breadcrumb_sz++; + } + } engine->submit_request = i9xx_submit_request; if (INTEL_GEN(dev_priv) >= 8) @@ -2667,9 +2684,17 @@ int intel_init_render_ring_buffer(struct intel_engine_cs *engine) if (INTEL_GEN(dev_priv) >= 8) { engine->init_context = intel_rcs_ctx_init; engine->emit_breadcrumb = gen8_render_emit_breadcrumb; + engine->emit_breadcrumb_sz = gen8_render_emit_breadcrumb_sz; engine->emit_flush = gen8_render_ring_flush; - if (i915.semaphores) + if (i915.semaphores) { + int num_rings; + engine->semaphore.signal = gen8_rcs_signal; + + 
num_rings = + hweight32(INTEL_INFO(dev_priv)->ring_mask) - 1; + engine->emit_breadcrumb_sz += num_rings * 6; + } } else if (INTEL_GEN(dev_priv) >= 6) { engine->init_context = intel_rcs_ctx_init; engine->emit_flush = gen7_render_ring_flush; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index a5ced1649ecd..7b7aaafac0da 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -256,6 +256,7 @@ struct intel_engine_cs { #define I915_DISPATCH_PINNED BIT(1) #define I915_DISPATCH_RS BIT(2) int (*emit_breadcrumb)(struct drm_i915_gem_request *req); + int emit_breadcrumb_sz; /* Pass the request to the hardware queue (e.g. directly into * the legacy ringbuffer or to the end of an execlist). -- cgit v1.2.3 From caddfe7192f5e74d65ebcfdae614f99e8fd87222 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 28 Oct 2016 13:58:52 +0100 Subject: drm/i915: Defer breadcrumb emission Move the actual emission of the breadcrumb for closing the request from i915_add_request() to the submit callback. (It can be moved later when required.) This allows us to defer the allocation of the global_seqno from request construction to actual submission, allowing us to emit the requests out of order (with respect to the order of their construction; they will still only be executed once all of their dependencies are resolved, including that all earlier requests on their timeline have been submitted.) We have to specialise how we then emit the request in order to write into the preallocated space, rather than at the tail of the ringbuffer (which will have been advanced by the addition of new requests). Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-29-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_request.c | 41 +++----- drivers/gpu/drm/i915/intel_lrc.c | 120 ++++++++--------------- drivers/gpu/drm/i915/intel_ringbuffer.c | 169 +++++++++++--------------------- drivers/gpu/drm/i915/intel_ringbuffer.h | 10 +- 4 files changed, 118 insertions(+), 222 deletions(-) (limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index be9e23b32e4a..06daa4d203a7 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -318,17 +318,16 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state) container_of(fence, typeof(*request), submit); struct intel_engine_cs *engine = request->engine; + if (state != FENCE_COMPLETE) + return NOTIFY_DONE; + /* Will be called from irq-context when using foreign DMA fences */ - switch (state) { - case FENCE_COMPLETE: - engine->timeline->last_submitted_seqno = request->fence.seqno; - engine->submit_request(request); - break; + engine->timeline->last_submitted_seqno = request->fence.seqno; - case FENCE_FREE: - break; - } + engine->emit_breadcrumb(request, + request->ring->vaddr + request->postfix); + engine->submit_request(request); return NOTIFY_DONE; } @@ -648,9 +647,7 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches) struct intel_ring *ring = request->ring; struct intel_timeline *timeline = request->timeline; struct drm_i915_gem_request *prev; - u32 request_start; - u32 reserved_tail; - int ret; + int err; lockdep_assert_held(&request->i915->drm.struct_mutex); trace_i915_gem_request_add(request); @@ -660,8 +657,6 @@ void __i915_add_request(struct
drm_i915_gem_request *request, bool flush_caches) * should already have been reserved in the ring buffer. Let the ring * know that it is time to use that space up. */ - request_start = ring->tail; - reserved_tail = request->reserved_space; request->reserved_space = 0; /* @@ -672,10 +667,10 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches) * what. */ if (flush_caches) { - ret = engine->emit_flush(request, EMIT_FLUSH); + err = engine->emit_flush(request, EMIT_FLUSH); /* Not allowed to fail! */ - WARN(ret, "engine->emit_flush() failed: %d!\n", ret); + WARN(err, "engine->emit_flush() failed: %d!\n", err); } /* Record the position of the start of the breadcrumb so that @@ -683,20 +678,10 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches) * GPU processing the request, we never over-estimate the * position of the ring's HEAD. */ + err = intel_ring_begin(request, engine->emit_breadcrumb_sz); + GEM_BUG_ON(err); request->postfix = ring->tail; - - /* Not allowed to fail! */ - ret = engine->emit_breadcrumb(request); - WARN(ret, "(%s)->emit_breadcrumb failed: %d!\n", engine->name, ret); - - /* Sanity check that the reserved size was large enough. */ - ret = ring->tail - request_start; - if (ret < 0) - ret += ring->size; - WARN_ONCE(ret > reserved_tail, - "Not enough space reserved (%d bytes) " - "for adding the request (%d bytes)\n", - reserved_tail, ret); + ring->tail += engine->emit_breadcrumb_sz * sizeof(u32); /* Seal the request and mark it as pending execution. Note that * we may inspect this state, without holding any locks, during diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 8229baebb2b3..fa3012c342cc 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -365,7 +365,7 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq) struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt; u32 *reg_state = ce->lrc_reg_state; - reg_state[CTX_RING_TAIL+1] = intel_ring_offset(rq->ring, rq->tail); + reg_state[CTX_RING_TAIL+1] = rq->tail; /* True 32b PPGTT with dynamic page allocation: update PDP * registers and point the unallocated PDPs to scratch page. @@ -599,6 +599,15 @@ static void execlists_submit_request(struct drm_i915_gem_request *request) spin_lock_irqsave(&engine->execlist_lock, flags); + /* We keep the previous context alive until we retire the following + * request. This ensures that any the context object is still pinned + * for any residual writes the HW makes into it on the context switch + * into the next object following the breadcrumb. Otherwise, we may + * retire the context too early. + */ + request->previous_context = engine->last_context; + engine->last_context = request->ctx; + list_add_tail(&request->execlist_link, &engine->execlist_queue); if (execlists_elsp_idle(engine)) tasklet_hi_schedule(&engine->irq_tasklet); @@ -671,46 +680,6 @@ err_unpin: return ret; } -/* - * intel_logical_ring_advance() - advance the tail and prepare for submission - * @request: Request to advance the logical ringbuffer of. - * - * The tail is updated in our logical ringbuffer struct, not in the actual context. What - * really happens during submission is that the context and current tail will be placed - * on a queue waiting for the ELSP to be ready to accept a new context submission. At that - * point, the tail *inside* the context is updated and the ELSP written to. 
- */ -static int -intel_logical_ring_advance(struct drm_i915_gem_request *request) -{ - struct intel_ring *ring = request->ring; - struct intel_engine_cs *engine = request->engine; - - intel_ring_advance(ring); - request->tail = ring->tail; - - /* - * Here we add two extra NOOPs as padding to avoid - * lite restore of a context with HEAD==TAIL. - * - * Caller must reserve WA_TAIL_DWORDS for us! - */ - intel_ring_emit(ring, MI_NOOP); - intel_ring_emit(ring, MI_NOOP); - intel_ring_advance(ring); - request->wa_tail = ring->tail; - - /* We keep the previous context alive until we retire the following - * request. This ensures that any the context object is still pinned - * for any residual writes the HW makes into it on the context switch - * into the next object following the breadcrumb. Otherwise, we may - * retire the context too early. - */ - request->previous_context = engine->last_context; - engine->last_context = request->ctx; - return 0; -} - static int intel_lr_context_pin(struct i915_gem_context *ctx, struct intel_engine_cs *engine) { @@ -1566,41 +1535,35 @@ static void bxt_a_seqno_barrier(struct intel_engine_cs *engine) * used as a workaround for not being allowed to do lite * restore with HEAD==TAIL (WaIdleLiteRestore). */ - -static int gen8_emit_breadcrumb(struct drm_i915_gem_request *request) +static void gen8_emit_wa_tail(struct drm_i915_gem_request *request, u32 *out) { - struct intel_ring *ring = request->ring; - int ret; - - ret = intel_ring_begin(request, 6 + WA_TAIL_DWORDS); - if (ret) - return ret; + *out++ = MI_NOOP; + *out++ = MI_NOOP; + request->wa_tail = intel_ring_offset(request->ring, out); +} +static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request, + u32 *out) +{ /* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */ BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5)); - intel_ring_emit(ring, (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW); - intel_ring_emit(ring, - intel_hws_seqno_address(request->engine) | - MI_FLUSH_DW_USE_GTT); - intel_ring_emit(ring, 0); - intel_ring_emit(ring, request->global_seqno); - intel_ring_emit(ring, MI_USER_INTERRUPT); - intel_ring_emit(ring, MI_NOOP); - return intel_logical_ring_advance(request); + *out++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW; + *out++ = intel_hws_seqno_address(request->engine) | MI_FLUSH_DW_USE_GTT; + *out++ = 0; + *out++ = request->global_seqno; + *out++ = MI_USER_INTERRUPT; + *out++ = MI_NOOP; + request->tail = intel_ring_offset(request->ring, out); + + gen8_emit_wa_tail(request, out); } static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS; -static int gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request) +static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request, + u32 *out) { - struct intel_ring *ring = request->ring; - int ret; - - ret = intel_ring_begin(request, 8 + WA_TAIL_DWORDS); - if (ret) - return ret; - /* We're using qword write, seqno should be aligned to 8 bytes. */ BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1); @@ -1608,19 +1571,20 @@ static int gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request) * need a prior CS_STALL, which is emitted by the flush * following the batch. 
*/ - intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6)); - intel_ring_emit(ring, - (PIPE_CONTROL_GLOBAL_GTT_IVB | - PIPE_CONTROL_CS_STALL | - PIPE_CONTROL_QW_WRITE)); - intel_ring_emit(ring, intel_hws_seqno_address(request->engine)); - intel_ring_emit(ring, 0); - intel_ring_emit(ring, request->global_seqno); + *out++ = GFX_OP_PIPE_CONTROL(6); + *out++ = (PIPE_CONTROL_GLOBAL_GTT_IVB | + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_QW_WRITE); + *out++ = intel_hws_seqno_address(request->engine); + *out++ = 0; + *out++ = request->global_seqno; /* We're thrashing one dword of HWS. */ - intel_ring_emit(ring, 0); - intel_ring_emit(ring, MI_USER_INTERRUPT); - intel_ring_emit(ring, MI_NOOP); - return intel_logical_ring_advance(request); + *out++ = 0; + *out++ = MI_USER_INTERRUPT; + *out++ = MI_NOOP; + request->tail = intel_ring_offset(request->ring, out); + + gen8_emit_wa_tail(request, out); } static const int gen8_emit_breadcrumb_render_sz = 8 + WA_TAIL_DWORDS; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index ae9cf6bb4def..16244775b9d1 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1213,90 +1213,62 @@ static void render_ring_cleanup(struct intel_engine_cs *engine) i915_vma_unpin_and_release(&dev_priv->semaphore); } -static int gen8_rcs_signal(struct drm_i915_gem_request *req) +static u32 *gen8_rcs_signal(struct drm_i915_gem_request *req, u32 *out) { - struct intel_ring *ring = req->ring; struct drm_i915_private *dev_priv = req->i915; struct intel_engine_cs *waiter; enum intel_engine_id id; - int ret, num_rings; - - num_rings = INTEL_INFO(dev_priv)->num_rings; - ret = intel_ring_begin(req, (num_rings-1) * 8); - if (ret) - return ret; for_each_engine(waiter, dev_priv, id) { u64 gtt_offset = req->engine->semaphore.signal_ggtt[id]; if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID) continue; - intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6)); - intel_ring_emit(ring, - PIPE_CONTROL_GLOBAL_GTT_IVB | - PIPE_CONTROL_QW_WRITE | - PIPE_CONTROL_CS_STALL); - intel_ring_emit(ring, lower_32_bits(gtt_offset)); - intel_ring_emit(ring, upper_32_bits(gtt_offset)); - intel_ring_emit(ring, req->global_seqno); - intel_ring_emit(ring, 0); - intel_ring_emit(ring, - MI_SEMAPHORE_SIGNAL | - MI_SEMAPHORE_TARGET(waiter->hw_id)); - intel_ring_emit(ring, 0); + *out++ = GFX_OP_PIPE_CONTROL(6); + *out++ = (PIPE_CONTROL_GLOBAL_GTT_IVB | + PIPE_CONTROL_QW_WRITE | + PIPE_CONTROL_CS_STALL); + *out++ = lower_32_bits(gtt_offset); + *out++ = upper_32_bits(gtt_offset); + *out++ = req->global_seqno; + *out++ = 0; + *out++ = (MI_SEMAPHORE_SIGNAL | + MI_SEMAPHORE_TARGET(waiter->hw_id)); + *out++ = 0; } - intel_ring_advance(ring); - return 0; + return out; } -static int gen8_xcs_signal(struct drm_i915_gem_request *req) +static u32 *gen8_xcs_signal(struct drm_i915_gem_request *req, u32 *out) { - struct intel_ring *ring = req->ring; struct drm_i915_private *dev_priv = req->i915; struct intel_engine_cs *waiter; enum intel_engine_id id; - int ret, num_rings; - - num_rings = INTEL_INFO(dev_priv)->num_rings; - ret = intel_ring_begin(req, (num_rings-1) * 6); - if (ret) - return ret; for_each_engine(waiter, dev_priv, id) { u64 gtt_offset = req->engine->semaphore.signal_ggtt[id]; if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID) continue; - intel_ring_emit(ring, - (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW); - intel_ring_emit(ring, - lower_32_bits(gtt_offset) | - MI_FLUSH_DW_USE_GTT); - intel_ring_emit(ring, upper_32_bits(gtt_offset)); - intel_ring_emit(ring, 
req->global_seqno); - intel_ring_emit(ring, - MI_SEMAPHORE_SIGNAL | - MI_SEMAPHORE_TARGET(waiter->hw_id)); - intel_ring_emit(ring, 0); + *out++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW; + *out++ = lower_32_bits(gtt_offset) | MI_FLUSH_DW_USE_GTT; + *out++ = upper_32_bits(gtt_offset); + *out++ = req->global_seqno; + *out++ = (MI_SEMAPHORE_SIGNAL | + MI_SEMAPHORE_TARGET(waiter->hw_id)); + *out++ = 0; } - intel_ring_advance(ring); - return 0; + return out; } -static int gen6_signal(struct drm_i915_gem_request *req) +static u32 *gen6_signal(struct drm_i915_gem_request *req, u32 *out) { - struct intel_ring *ring = req->ring; struct drm_i915_private *dev_priv = req->i915; struct intel_engine_cs *engine; enum intel_engine_id id; - int ret, num_rings; - - num_rings = INTEL_INFO(dev_priv)->num_rings; - ret = intel_ring_begin(req, round_up((num_rings-1) * 3, 2)); - if (ret) - return ret; + int num_rings = 0; for_each_engine(engine, dev_priv, id) { i915_reg_t mbox_reg; @@ -1306,46 +1278,34 @@ static int gen6_signal(struct drm_i915_gem_request *req) mbox_reg = req->engine->semaphore.mbox.signal[engine->hw_id]; if (i915_mmio_reg_valid(mbox_reg)) { - intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1)); - intel_ring_emit_reg(ring, mbox_reg); - intel_ring_emit(ring, req->global_seqno); + *out++ = MI_LOAD_REGISTER_IMM(1); + *out++ = i915_mmio_reg_offset(mbox_reg); + *out++ = req->global_seqno; + num_rings++; } } + if (num_rings & 1) + *out++ = MI_NOOP; - /* If num_dwords was rounded, make sure the tail pointer is correct */ - if (num_rings % 2 == 0) - intel_ring_emit(ring, MI_NOOP); - intel_ring_advance(ring); - - return 0; + return out; } static void i9xx_submit_request(struct drm_i915_gem_request *request) { struct drm_i915_private *dev_priv = request->i915; - I915_WRITE_TAIL(request->engine, - intel_ring_offset(request->ring, request->tail)); + I915_WRITE_TAIL(request->engine, request->tail); } -static int i9xx_emit_breadcrumb(struct drm_i915_gem_request *req) +static void i9xx_emit_breadcrumb(struct drm_i915_gem_request *req, + u32 *out) { - struct intel_ring *ring = req->ring; - int ret; - - ret = intel_ring_begin(req, 4); - if (ret) - return ret; - - intel_ring_emit(ring, MI_STORE_DWORD_INDEX); - intel_ring_emit(ring, I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT); - intel_ring_emit(ring, req->global_seqno); - intel_ring_emit(ring, MI_USER_INTERRUPT); - intel_ring_advance(ring); - - req->tail = ring->tail; + *out++ = MI_STORE_DWORD_INDEX; + *out++ = I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT; + *out++ = req->global_seqno; + *out++ = MI_USER_INTERRUPT; - return 0; + req->tail = intel_ring_offset(req->ring, out); } static const int i9xx_emit_breadcrumb_sz = 4; @@ -1358,49 +1318,34 @@ static const int i9xx_emit_breadcrumb_sz = 4; * Update the mailbox registers in the *other* rings with the current seqno. * This acts like a signal in the canonical semaphore. 
*/ -static int gen6_sema_emit_breadcrumb(struct drm_i915_gem_request *req) +static void gen6_sema_emit_breadcrumb(struct drm_i915_gem_request *req, + u32 *out) { - int ret; - - ret = req->engine->semaphore.signal(req); - if (ret) - return ret; - - return i9xx_emit_breadcrumb(req); + return i9xx_emit_breadcrumb(req, + req->engine->semaphore.signal(req, out)); } -static int gen8_render_emit_breadcrumb(struct drm_i915_gem_request *req) +static void gen8_render_emit_breadcrumb(struct drm_i915_gem_request *req, + u32 *out) { struct intel_engine_cs *engine = req->engine; - struct intel_ring *ring = req->ring; - int ret; - if (engine->semaphore.signal) { - ret = engine->semaphore.signal(req); - if (ret) - return ret; - } - - ret = intel_ring_begin(req, 8); - if (ret) - return ret; + if (engine->semaphore.signal) + out = engine->semaphore.signal(req, out); - intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6)); - intel_ring_emit(ring, (PIPE_CONTROL_GLOBAL_GTT_IVB | + *out++ = GFX_OP_PIPE_CONTROL(6); + *out++ = (PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_CS_STALL | - PIPE_CONTROL_QW_WRITE)); - intel_ring_emit(ring, intel_hws_seqno_address(engine)); - intel_ring_emit(ring, 0); - intel_ring_emit(ring, req->global_seqno); + PIPE_CONTROL_QW_WRITE); + *out++ = intel_hws_seqno_address(engine); + *out++ = 0; + *out++ = req->global_seqno; /* We're thrashing one dword of HWS. */ - intel_ring_emit(ring, 0); - intel_ring_emit(ring, MI_USER_INTERRUPT); - intel_ring_emit(ring, MI_NOOP); - intel_ring_advance(ring); - - req->tail = ring->tail; + *out++ = 0; + *out++ = MI_USER_INTERRUPT; + *out++ = MI_NOOP; - return 0; + req->tail = intel_ring_offset(req->ring, out); } static const int gen8_render_emit_breadcrumb_sz = 8; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 7b7aaafac0da..9d228bee3511 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -255,7 +255,8 @@ struct intel_engine_cs { #define I915_DISPATCH_SECURE BIT(0) #define I915_DISPATCH_PINNED BIT(1) #define I915_DISPATCH_RS BIT(2) - int (*emit_breadcrumb)(struct drm_i915_gem_request *req); + void (*emit_breadcrumb)(struct drm_i915_gem_request *req, + u32 *out); int emit_breadcrumb_sz; /* Pass the request to the hardware queue (e.g. directly into @@ -331,7 +332,7 @@ struct intel_engine_cs { /* AKA wait() */ int (*sync_to)(struct drm_i915_gem_request *req, struct drm_i915_gem_request *signal); - int (*signal)(struct drm_i915_gem_request *req); + u32 *(*signal)(struct drm_i915_gem_request *req, u32 *out); } semaphore; /* Execlists */ @@ -487,10 +488,11 @@ static inline void intel_ring_advance(struct intel_ring *ring) */ } -static inline u32 intel_ring_offset(struct intel_ring *ring, u32 value) +static inline u32 intel_ring_offset(struct intel_ring *ring, void *addr) { /* Don't write ring->size (equivalent to 0) as that hangs some GPUs. */ - return value & (ring->size - 1); + u32 offset = addr - ring->vaddr; + return offset & (ring->size - 1); } int __intel_ring_space(int head, int tail, int size); -- cgit v1.2.3 From 85e17f5974b357bc4a127be09de71b430be265e0 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 28 Oct 2016 13:58:53 +0100 Subject: drm/i915: Move the global sync optimisation to the timeline Currently we try to reduce the number of synchronisations (now the number of requests we need to wait upon) by noting that if we have earlier waited upon a request, all subsequent requests in the timeline will be after the wait. 
This only applies to requests in this timeline, as other timelines will
not be ordered by that waiter.

Signed-off-by: Chris Wilson
Reviewed-by: Joonas Lahtinen
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-30-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_debugfs.c      |  9 ------
 drivers/gpu/drm/i915/i915_drv.h          |  1 -
 drivers/gpu/drm/i915/i915_gem_request.c  | 29 +++++++++++--------
 drivers/gpu/drm/i915/i915_gem_timeline.h |  1 +
 drivers/gpu/drm/i915/i915_gpu_error.c    | 48 +++++++++++++++++++-------------
 drivers/gpu/drm/i915/intel_engine_cs.c   |  2 --
 drivers/gpu/drm/i915/intel_ringbuffer.c  |  3 --
 drivers/gpu/drm/i915/intel_ringbuffer.h  | 23 ---------------
 8 files changed, 47 insertions(+), 69 deletions(-)

(limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c')

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 90bc4a89e0d5..4655227eb9d9 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3347,15 +3347,6 @@ static int i915_semaphore_status(struct seq_file *m, void *unused)
 		seq_putc(m, '\n');
 	}
 
-	seq_puts(m, "\nSync seqno:\n");
-	for_each_engine(engine, dev_priv, id) {
-		for (j = 0; j < num_rings; j++)
-			seq_printf(m, " 0x%08x ",
-				   engine->semaphore.sync_seqno[j]);
-		seq_putc(m, '\n');
-	}
-	seq_putc(m, '\n');
-
 	intel_runtime_pm_put(dev_priv);
 	mutex_unlock(&dev->struct_mutex);
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 217674bb1495..6bf40276ace6 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -802,7 +802,6 @@ struct drm_i915_error_state {
 		u32 cpu_ring_tail;
 
 		u32 last_seqno;
-		u32 semaphore_seqno[I915_NUM_ENGINES - 1];
 
 		/* Register state */
 		u32 start;
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 06daa4d203a7..9c34a4c540b5 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -238,35 +238,41 @@ static int i915_gem_check_wedge(struct drm_i915_private *dev_priv)
 	return 0;
 }
 
-static int i915_gem_init_global_seqno(struct drm_i915_private *dev_priv,
-				      u32 seqno)
+static int i915_gem_init_global_seqno(struct drm_i915_private *i915, u32 seqno)
 {
-	struct i915_gem_timeline *timeline = &dev_priv->gt.global_timeline;
+	struct i915_gem_timeline *timeline = &i915->gt.global_timeline;
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 	int ret;
 
 	/* Carefully retire all requests without writing to the rings */
-	ret = i915_gem_wait_for_idle(dev_priv,
+	ret = i915_gem_wait_for_idle(i915,
 				     I915_WAIT_INTERRUPTIBLE |
 				     I915_WAIT_LOCKED);
 	if (ret)
 		return ret;
 
-	i915_gem_retire_requests(dev_priv);
+	i915_gem_retire_requests(i915);
 
 	/* If the seqno wraps around, we need to clear the breadcrumb rbtree */
 	if (!i915_seqno_passed(seqno, timeline->next_seqno)) {
-		while (intel_kick_waiters(dev_priv) ||
-		       intel_kick_signalers(dev_priv))
+		while (intel_kick_waiters(i915) || intel_kick_signalers(i915))
 			yield();
 		yield();
 	}
 
 	/* Finally reset hw state */
-	for_each_engine(engine, dev_priv, id)
+	for_each_engine(engine, i915, id)
 		intel_engine_init_global_seqno(engine, seqno);
 
+	list_for_each_entry(timeline, &i915->gt.timelines, link) {
+		for_each_engine(engine, i915, id) {
+			struct intel_timeline *tl = &timeline->engine[id];
+
+			memset(tl->sync_seqno, 0, sizeof(tl->sync_seqno));
+		}
+	}
+
 	return 0;
 }
 
@@ -462,7 +468,7 @@ static int
 i915_gem_request_await_request(struct drm_i915_gem_request *to,
 			       struct drm_i915_gem_request *from)
 {
-	int idx, ret;
+	int ret;
 
 	GEM_BUG_ON(to == from);
 
@@ -483,8 +489,7 @@ i915_gem_request_await_request(struct drm_i915_gem_request *to,
 		return ret < 0 ? ret : 0;
 	}
 
-	idx = intel_engine_sync_index(from->engine, to->engine);
-	if (from->global_seqno <= from->engine->semaphore.sync_seqno[idx])
+	if (from->global_seqno <= to->timeline->sync_seqno[from->engine->id])
 		return 0;
 
 	trace_i915_gem_ring_sync_to(to, from);
@@ -502,7 +507,7 @@ i915_gem_request_await_request(struct drm_i915_gem_request *to,
 			return ret;
 	}
 
-	from->engine->semaphore.sync_seqno[idx] = from->global_seqno;
+	to->timeline->sync_seqno[from->engine->id] = from->global_seqno;
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.h b/drivers/gpu/drm/i915/i915_gem_timeline.h
index bfdf0331cc50..767b23914ec5 100644
--- a/drivers/gpu/drm/i915/i915_gem_timeline.h
+++ b/drivers/gpu/drm/i915/i915_gem_timeline.h
@@ -48,6 +48,7 @@ struct intel_timeline {
 	 * struct_mutex.
 	 */
 	struct i915_gem_active last_request;
+	u32 sync_seqno[I915_NUM_ENGINES];
 
 	struct i915_gem_timeline *common;
 };
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 9aa197ca6210..ef3698120d92 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -415,17 +415,13 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 	if (INTEL_GEN(m->i915) >= 6) {
 		err_printf(m, " RC PSMI: 0x%08x\n", ee->rc_psmi);
 		err_printf(m, " FAULT_REG: 0x%08x\n", ee->fault_reg);
-		err_printf(m, " SYNC_0: 0x%08x [last synced 0x%08x]\n",
-			   ee->semaphore_mboxes[0],
-			   ee->semaphore_seqno[0]);
-		err_printf(m, " SYNC_1: 0x%08x [last synced 0x%08x]\n",
-			   ee->semaphore_mboxes[1],
-			   ee->semaphore_seqno[1]);
-		if (HAS_VEBOX(m->i915)) {
-			err_printf(m, " SYNC_2: 0x%08x [last synced 0x%08x]\n",
-				   ee->semaphore_mboxes[2],
-				   ee->semaphore_seqno[2]);
-		}
+		err_printf(m, " SYNC_0: 0x%08x\n",
+			   ee->semaphore_mboxes[0]);
+		err_printf(m, " SYNC_1: 0x%08x\n",
+			   ee->semaphore_mboxes[1]);
+		if (HAS_VEBOX(m->i915))
+			err_printf(m, " SYNC_2: 0x%08x\n",
+				   ee->semaphore_mboxes[2]);
 	}
 	if (USES_PPGTT(m->i915)) {
 		err_printf(m, " GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);
@@ -972,6 +968,26 @@ static void i915_gem_record_fences(struct drm_i915_private *dev_priv,
 	}
 }
 
+static inline u32
+gen8_engine_sync_index(struct intel_engine_cs *engine,
+		       struct intel_engine_cs *other)
+{
+	int idx;
+
+	/*
+	 * rcs -> 0 = vcs, 1 = bcs, 2 = vecs, 3 = vcs2;
+	 * vcs -> 0 = bcs, 1 = vecs, 2 = vcs2, 3 = rcs;
+	 * bcs -> 0 = vecs, 1 = vcs2, 2 = rcs, 3 = vcs;
+	 * vecs -> 0 = vcs2, 1 = rcs, 2 = vcs, 3 = bcs;
+	 * vcs2 -> 0 = rcs, 1 = vcs, 2 = bcs, 3 = vecs;
+	 */
+
+	idx = (other - engine) - 1;
+	if (idx < 0)
+		idx += I915_NUM_ENGINES;
+
+	return idx;
+}
 
 static void gen8_record_semaphore_state(struct drm_i915_error_state *error,
 					struct intel_engine_cs *engine,
@@ -995,10 +1011,9 @@ static void gen8_record_semaphore_state(struct drm_i915_error_state *error,
 		signal_offset = (GEN8_SIGNAL_OFFSET(engine, id) &
 				 (PAGE_SIZE - 1)) / 4;
 		tmp = error->semaphore->pages[0];
-		idx = intel_engine_sync_index(engine, to);
+		idx = gen8_engine_sync_index(engine, to);
 
 		ee->semaphore_mboxes[idx] = tmp[signal_offset];
-		ee->semaphore_seqno[idx] = engine->semaphore.sync_seqno[idx];
 	}
 }
 
@@ -1009,14 +1024,9 @@ static void gen6_record_semaphore_state(struct intel_engine_cs *engine,
 
 	ee->semaphore_mboxes[0] = I915_READ(RING_SYNC_0(engine->mmio_base));
 	ee->semaphore_mboxes[1] = I915_READ(RING_SYNC_1(engine->mmio_base));
-	ee->semaphore_seqno[0] = engine->semaphore.sync_seqno[0];
-	ee->semaphore_seqno[1] = engine->semaphore.sync_seqno[1];
-
-	if (HAS_VEBOX(dev_priv)) {
+	if (HAS_VEBOX(dev_priv))
 		ee->semaphore_mboxes[2] =
 			I915_READ(RING_SYNC_2(engine->mmio_base));
-		ee->semaphore_seqno[2] = engine->semaphore.sync_seqno[2];
-	}
 }
 
 static void error_record_engine_waiters(struct intel_engine_cs *engine,
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 6a3105512d18..94de3d66733d 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -204,8 +204,6 @@ void intel_engine_init_global_seqno(struct intel_engine_cs *engine, u32 seqno)
 		       I915_NUM_ENGINES * gen8_semaphore_seqno_size);
 		kunmap(page);
 	}
-	memset(engine->semaphore.sync_seqno, 0,
-	       sizeof(engine->semaphore.sync_seqno));
 
 	intel_write_status_page(engine, I915_GEM_HWS_INDEX, seqno);
 	if (engine->irq_seqno_barrier)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 16244775b9d1..188fdec5fa6b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2003,9 +2003,6 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
 
 	intel_engine_setup_common(engine);
 
-	memset(engine->semaphore.sync_seqno, 0,
-	       sizeof(engine->semaphore.sync_seqno));
-
 	ret = intel_engine_init_common(engine);
 	if (ret)
 		goto error;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 9d228bee3511..891629caab6c 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -314,8 +314,6 @@ struct intel_engine_cs {
 	 *  ie. transpose of f(x, y)
 	 */
 	struct {
-		u32	sync_seqno[I915_NUM_ENGINES-1];
-
 		union {
#define GEN6_SEMAPHORE_LAST	VECS_HW
#define GEN6_NUM_SEMAPHORES	(GEN6_SEMAPHORE_LAST + 1)
@@ -385,27 +383,6 @@ intel_engine_flag(const struct intel_engine_cs *engine)
 	return 1 << engine->id;
 }
 
-static inline u32
-intel_engine_sync_index(struct intel_engine_cs *engine,
-			struct intel_engine_cs *other)
-{
-	int idx;
-
-	/*
-	 * rcs -> 0 = vcs, 1 = bcs, 2 = vecs, 3 = vcs2;
-	 * vcs -> 0 = bcs, 1 = vecs, 2 = vcs2, 3 = rcs;
-	 * bcs -> 0 = vecs, 1 = vcs2, 2 = rcs, 3 = vcs;
-	 * vecs -> 0 = vcs2, 1 = rcs, 2 = vcs, 3 = bcs;
-	 * vcs2 -> 0 = rcs, 1 = vcs, 2 = bcs, 3 = vecs;
-	 */
-
-	idx = (other->id - engine->id) - 1;
-	if (idx < 0)
-		idx += I915_NUM_ENGINES;
-
-	return idx;
-}
-
 static inline void
 intel_flush_status_page(struct intel_engine_cs *engine, int reg)
 {
--
cgit v1.2.3
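A note on the mechanism of the patch above: the await path now reduces to a
per-timeline watermark check. Each timeline records, per source engine, the
latest global seqno it has already been ordered against, so an await on an
older or equal request returns immediately without emitting another semaphore
wait. Below is a minimal standalone sketch in plain C, not the kernel code;
the names (struct timeline, timeline_needs_sync, NUM_ENGINES) are invented
for illustration, and the wraparound-safe comparison mirrors what
i915_seqno_passed() does:

#include <stdbool.h>
#include <stdint.h>

#define NUM_ENGINES 5

struct timeline {
	/* last global seqno already synced against, per source engine */
	uint32_t sync_seqno[NUM_ENGINES];
};

/* Returns true if a real wait must still be emitted for (engine, seqno). */
static bool timeline_needs_sync(struct timeline *tl,
				unsigned int engine, uint32_t seqno)
{
	/* signed difference tolerates u32 wraparound, as i915_seqno_passed() */
	if ((int32_t)(tl->sync_seqno[engine] - seqno) >= 0)
		return false; /* already ordered after that request */

	tl->sync_seqno[engine] = seqno; /* remember for later awaits */
	return true;
}

Because the watermark lives in the timeline rather than the engine, two
timelines never share a watermark, which is why the ordering guarantee only
covers requests in the waiter's own timeline.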
From 07c9a21a0d594c3acc0983e5e0a25d2364188d51 Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Sun, 30 Oct 2016 13:28:20 +0000
Subject: drm/i915: Export a function to flush the context upon pinning

For legacy contexts we employ an optimisation to only flush the context
when binding into the global GTT. This avoids stalling on the GPU when
reloading an active context. Wrap this detail up into a helper and
export it for a potential third user.

(Longer term, context pinning needs to be reworked as the current
handling of switch context pins too late and so risks eviction and
corrupting the request. Plans, plans, plans.)

v2: Expand the comment explaining the optimisation for avoiding the
stall on active contexts.

Signed-off-by: Chris Wilson
Link: http://patchwork.freedesktop.org/patch/msgid/20161030132820.32163-1-chris@chris-wilson.co.uk
Reviewed-by: Matthew Auld
---
 drivers/gpu/drm/i915/i915_drv.h         |  3 +++
 drivers/gpu/drm/i915/i915_gem_context.c | 39 +++++++++++++++++++++++----------
 drivers/gpu/drm/i915/intel_ringbuffer.c | 11 +++++-----
 3 files changed, 36 insertions(+), 17 deletions(-)

(limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c')

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 086396e16de7..b5bfc4c67aa0 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3529,6 +3529,9 @@ int i915_gem_context_open(struct drm_device *dev, struct drm_file *file);
 void i915_gem_context_close(struct drm_device *dev, struct drm_file *file);
 int i915_switch_context(struct drm_i915_gem_request *req);
 int i915_gem_switch_to_kernel_context(struct drm_i915_private *dev_priv);
+struct i915_vma *
+i915_gem_context_pin_legacy(struct i915_gem_context *ctx,
+			    unsigned int flags);
 void i915_gem_context_free(struct kref *ctx_ref);
 struct drm_i915_gem_object *
 i915_gem_alloc_context_obj(struct drm_device *dev, size_t size);
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 461aece6c5bd..6dd475735f0a 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -764,12 +764,36 @@ needs_pd_load_post(struct i915_hw_ppgtt *ppgtt,
 	return false;
 }
 
+struct i915_vma *
+i915_gem_context_pin_legacy(struct i915_gem_context *ctx,
+			    unsigned int flags)
+{
+	struct i915_vma *vma = ctx->engine[RCS].state;
+	int ret;
+
+	/* Clear this page out of any CPU caches for coherent swap-in/out.
+	 * We only want to do this on the first bind so that we do not stall
+	 * on an active context (which by nature is already on the GPU).
+	 */
+	if (!(vma->flags & I915_VMA_GLOBAL_BIND)) {
+		ret = i915_gem_object_set_to_gtt_domain(vma->obj, false);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
+	ret = i915_vma_pin(vma, 0, ctx->ggtt_alignment, PIN_GLOBAL | flags);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return vma;
+}
+
 static int do_rcs_switch(struct drm_i915_gem_request *req)
 {
 	struct i915_gem_context *to = req->ctx;
 	struct intel_engine_cs *engine = req->engine;
 	struct i915_hw_ppgtt *ppgtt = to->ppgtt ?: req->i915->mm.aliasing_ppgtt;
-	struct i915_vma *vma = to->engine[RCS].state;
+	struct i915_vma *vma;
 	struct i915_gem_context *from;
 	u32 hw_flags;
 	int ret, i;
@@ -777,17 +801,10 @@ static int do_rcs_switch(struct drm_i915_gem_request *req)
 	if (skip_rcs_switch(ppgtt, engine, to))
 		return 0;
 
-	/* Clear this page out of any CPU caches for coherent swap-in/out. */
-	if (!(vma->flags & I915_VMA_GLOBAL_BIND)) {
-		ret = i915_gem_object_set_to_gtt_domain(vma->obj, false);
-		if (ret)
-			return ret;
-	}
-
 	/* Trying to pin first makes error handling easier. */
-	ret = i915_vma_pin(vma, 0, to->ggtt_alignment, PIN_GLOBAL);
-	if (ret)
-		return ret;
+	vma = i915_gem_context_pin_legacy(to, 0);
+	if (IS_ERR(vma))
+		return PTR_ERR(vma);
 
 	/*
 	 * Pin can switch back to the default context if we end up calling into
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 188fdec5fa6b..700e93d80616 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1949,14 +1949,13 @@ static int intel_ring_context_pin(struct i915_gem_context *ctx,
 		return 0;
 
 	if (ce->state) {
-		ret = i915_gem_object_set_to_gtt_domain(ce->state->obj, false);
-		if (ret)
-			goto error;
+		struct i915_vma *vma;
 
-		ret = i915_vma_pin(ce->state, 0, ctx->ggtt_alignment,
-				   PIN_GLOBAL | PIN_HIGH);
-		if (ret)
+		vma = i915_gem_context_pin_legacy(ctx, PIN_HIGH);
+		if (IS_ERR(vma)) {
+			ret = PTR_ERR(vma);
 			goto error;
+		}
 	}
 
 	/* The kernel context is only used as a placeholder for flushing the
--
cgit v1.2.3
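The exported helper above captures a reusable pattern: perform the expensive
CPU-cache flush only on the first global-GTT bind, because a context image
that already has a global binding is by nature live on the GPU, and flushing
it again would stall. A compact standalone sketch of that pattern in plain C
follows; everything in it (struct vma, VMA_GLOBAL_BIND, flush_cpu_caches,
pin_global) is a made-up stand-in for the i915 internals, not the driver's
API:

struct vma {
	unsigned int flags;
};

#define VMA_GLOBAL_BIND 0x1u

/* stand-in for clflushing the context image (set_to_gtt_domain in i915) */
static void flush_cpu_caches(struct vma *vma) { (void)vma; }

/* stand-in for binding the vma into the global GTT */
static void pin_global(struct vma *vma) { vma->flags |= VMA_GLOBAL_BIND; }

static void pin_context(struct vma *vma)
{
	/* Flush only before the image first becomes GPU-visible; an
	 * already-bound context is active on the GPU, so a second flush
	 * would stall on it for no benefit.
	 */
	if (!(vma->flags & VMA_GLOBAL_BIND))
		flush_cpu_caches(vma);

	pin_global(vma);
}

The design choice is the same one the commit message defends: correctness
requires coherence only before the GPU first sees the pages, so the flush is
keyed off the absence of a prior global binding rather than done on every pin.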