diff options
Diffstat (limited to 'drivers/gpu/drm/i915/selftests')
-rw-r--r-- | drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/selftests/i915_live_selftests.h | 4 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/selftests/i915_mock_selftests.h | 4 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/selftests/i915_perf.c | 135 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/selftests/i915_perf_selftests.h | 2 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/selftests/i915_request.c | 873 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/selftests/igt_spinner.c | 18 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/selftests/mock_gem_device.c | 13 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/selftests/mock_gtt.c | 12 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/selftests/mock_region.c | 1 |
10 files changed, 1018 insertions, 45 deletions
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c index 2e471500a646..0016ffc7d914 100644 --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c @@ -97,6 +97,7 @@ static void fake_put_pages(struct drm_i915_gem_object *obj, } static const struct drm_i915_gem_object_ops fake_ops = { + .name = "fake-gem", .flags = I915_GEM_OBJECT_IS_SHRINKABLE, .get_pages = fake_get_pages, .put_pages = fake_put_pages, diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h index 5dd5d81646c4..a92c0e9b7e6b 100644 --- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h +++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h @@ -11,9 +11,9 @@ * a module parameter. It must be unique and legal for a C identifier. * * The function should be of type int function(void). It may be conditionally - * compiled using #if IS_ENABLED(DRM_I915_SELFTEST). + * compiled using #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST). * - * Tests are executed in order by igt/drv_selftest + * Tests are executed in order by igt/i915_selftest */ selftest(sanitycheck, i915_live_sanitycheck) /* keep first (igt selfcheck) */ selftest(uncore, intel_uncore_live_selftests) diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h index 6090ce35226b..3db34d3eea58 100644 --- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h +++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h @@ -11,9 +11,9 @@ * a module parameter. It must be unique and legal for a C identifier. * * The function should be of type int function(void). It may be conditionally - * compiled using #if IS_ENABLED(DRM_I915_SELFTEST). + * compiled using #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST). * - * Tests are executed in order by igt/drv_selftest + * Tests are executed in order by igt/i915_selftest */ selftest(sanitycheck, i915_mock_sanitycheck) /* keep first (igt selfcheck) */ selftest(shmem, shmem_utils_mock_selftests) diff --git a/drivers/gpu/drm/i915/selftests/i915_perf.c b/drivers/gpu/drm/i915/selftests/i915_perf.c index 8eb3108f1767..c2d001d9c0ec 100644 --- a/drivers/gpu/drm/i915/selftests/i915_perf.c +++ b/drivers/gpu/drm/i915/selftests/i915_perf.c @@ -162,7 +162,7 @@ static int write_timestamp(struct i915_request *rq, int slot) return PTR_ERR(cs); len = 5; - if (INTEL_GEN(rq->i915) >= 8) + if (INTEL_GEN(rq->engine->i915) >= 8) len++; *cs++ = GFX_OP_PIPE_CONTROL(len); @@ -280,11 +280,144 @@ out: return err; } +static int live_noa_gpr(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct i915_perf_stream *stream; + struct intel_context *ce; + struct i915_request *rq; + u32 *cs, *store; + void *scratch; + u32 gpr0; + int err; + int i; + + /* Check that the delay does not clobber user context state (GPR) */ + + stream = test_stream(&i915->perf); + if (!stream) + return -ENOMEM; + + gpr0 = i915_mmio_reg_offset(GEN8_RING_CS_GPR(stream->engine->mmio_base, 0)); + + ce = intel_context_create(stream->engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto out; + } + + /* Poison the ce->vm so we detect writes not to the GGTT gt->scratch */ + scratch = kmap(ce->vm->scratch[0].base.page); + memset(scratch, POISON_FREE, PAGE_SIZE); + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out_ce; + } + i915_request_get(rq); + + if (rq->engine->emit_init_breadcrumb) { + err = rq->engine->emit_init_breadcrumb(rq); + if (err) { + i915_request_add(rq); + goto out_rq; + } + } + + /* Fill the 16 qword [32 dword] GPR with a known unlikely value */ + cs = intel_ring_begin(rq, 2 * 32 + 2); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + i915_request_add(rq); + goto out_rq; + } + + *cs++ = MI_LOAD_REGISTER_IMM(32); + for (i = 0; i < 32; i++) { + *cs++ = gpr0 + i * sizeof(u32); + *cs++ = STACK_MAGIC; + } + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + + /* Execute the GPU delay */ + err = rq->engine->emit_bb_start(rq, + i915_ggtt_offset(stream->noa_wait), 0, + I915_DISPATCH_SECURE); + if (err) { + i915_request_add(rq); + goto out_rq; + } + + /* Read the GPR back, using the pinned global HWSP for convenience */ + store = memset32(rq->engine->status_page.addr + 512, 0, 32); + for (i = 0; i < 32; i++) { + u32 cmd; + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + i915_request_add(rq); + goto out_rq; + } + + cmd = MI_STORE_REGISTER_MEM; + if (INTEL_GEN(i915) >= 8) + cmd++; + cmd |= MI_USE_GGTT; + + *cs++ = cmd; + *cs++ = gpr0 + i * sizeof(u32); + *cs++ = i915_ggtt_offset(rq->engine->status_page.vma) + + offset_in_page(store) + + i * sizeof(u32); + *cs++ = 0; + intel_ring_advance(rq, cs); + } + + i915_request_add(rq); + + if (i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, HZ / 2) < 0) { + pr_err("noa_wait timed out\n"); + intel_gt_set_wedged(stream->engine->gt); + err = -EIO; + goto out_rq; + } + + /* Verify that the GPR contain our expected values */ + for (i = 0; i < 32; i++) { + if (store[i] == STACK_MAGIC) + continue; + + pr_err("GPR[%d] lost, found:%08x, expected:%08x!\n", + i, store[i], STACK_MAGIC); + err = -EINVAL; + } + + /* Verify that the user's scratch page was not used for GPR storage */ + if (memchr_inv(scratch, POISON_FREE, PAGE_SIZE)) { + pr_err("Scratch page overwritten!\n"); + igt_hexdump(scratch, 4096); + err = -EINVAL; + } + +out_rq: + i915_request_put(rq); +out_ce: + kunmap(ce->vm->scratch[0].base.page); + intel_context_put(ce); +out: + stream_destroy(stream); + return err; +} + int i915_perf_live_selftests(struct drm_i915_private *i915) { static const struct i915_subtest tests[] = { SUBTEST(live_sanitycheck), SUBTEST(live_noa_delay), + SUBTEST(live_noa_gpr), }; struct i915_perf *perf = &i915->perf; int err; diff --git a/drivers/gpu/drm/i915/selftests/i915_perf_selftests.h b/drivers/gpu/drm/i915/selftests/i915_perf_selftests.h index d8da142985eb..c2389f8a257d 100644 --- a/drivers/gpu/drm/i915/selftests/i915_perf_selftests.h +++ b/drivers/gpu/drm/i915/selftests/i915_perf_selftests.h @@ -11,7 +11,7 @@ * a module parameter. It must be unique and legal for a C identifier. * * The function should be of type int function(void). It may be conditionally - * compiled using #if IS_ENABLED(DRM_I915_SELFTEST). + * compiled using #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST). * * Tests are executed in order by igt/i915_selftest */ diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c index 6014e8dfcbb1..57dd6f5122ee 100644 --- a/drivers/gpu/drm/i915/selftests/i915_request.c +++ b/drivers/gpu/drm/i915/selftests/i915_request.c @@ -24,16 +24,21 @@ #include <linux/prime_numbers.h> #include <linux/pm_qos.h> +#include <linux/sort.h> #include "gem/i915_gem_pm.h" #include "gem/selftests/mock_context.h" +#include "gt/intel_engine_heartbeat.h" #include "gt/intel_engine_pm.h" #include "gt/intel_engine_user.h" #include "gt/intel_gt.h" +#include "gt/intel_gt_requests.h" +#include "gt/selftest_engine_heartbeat.h" #include "i915_random.h" #include "i915_selftest.h" +#include "igt_flush_test.h" #include "igt_live_test.h" #include "igt_spinner.h" #include "lib_sw_fence.h" @@ -1449,7 +1454,7 @@ out_flush: idx++; } pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n", - num_waits, num_fences, RUNTIME_INFO(i915)->num_engines, ncpus); + num_waits, num_fences, idx, ncpus); ret = igt_live_test_end(&live) ?: ret; out_contexts: @@ -1524,6 +1529,808 @@ struct perf_series { struct intel_context *ce[]; }; +static int cmp_u32(const void *A, const void *B) +{ + const u32 *a = A, *b = B; + + return *a - *b; +} + +static u32 trifilter(u32 *a) +{ + u64 sum; + +#define TF_COUNT 5 + sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL); + + sum = mul_u32_u32(a[2], 2); + sum += a[1]; + sum += a[3]; + + GEM_BUG_ON(sum > U32_MAX); + return sum; +#define TF_BIAS 2 +} + +static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles) +{ + u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles); + + return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS); +} + +static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset) +{ + *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; + *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base))); + *cs++ = offset; + *cs++ = 0; + + return cs; +} + +static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value) +{ + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = offset; + *cs++ = 0; + *cs++ = value; + + return cs; +} + +static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset) +{ + *cs++ = MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + mode; + *cs++ = value; + *cs++ = offset; + *cs++ = 0; + + return cs; +} + +static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value) +{ + return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset); +} + +static void semaphore_set(u32 *sema, u32 value) +{ + WRITE_ONCE(*sema, value); + wmb(); /* flush the update to the cache, and beyond */ +} + +static u32 *hwsp_scratch(const struct intel_context *ce) +{ + return memset32(ce->engine->status_page.addr + 1000, 0, 21); +} + +static u32 hwsp_offset(const struct intel_context *ce, u32 *dw) +{ + return (i915_ggtt_offset(ce->engine->status_page.vma) + + offset_in_page(dw)); +} + +static int measure_semaphore_response(struct intel_context *ce) +{ + u32 *sema = hwsp_scratch(ce); + const u32 offset = hwsp_offset(ce, sema); + u32 elapsed[TF_COUNT], cycles; + struct i915_request *rq; + u32 *cs; + int err; + int i; + + /* + * Measure how many cycles it takes for the HW to detect the change + * in a semaphore value. + * + * A: read CS_TIMESTAMP from CPU + * poke semaphore + * B: read CS_TIMESTAMP on GPU + * + * Semaphore latency: B - A + */ + + semaphore_set(sema, -1); + + rq = i915_request_create(ce); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed)); + if (IS_ERR(cs)) { + i915_request_add(rq); + err = PTR_ERR(cs); + goto err; + } + + cs = emit_store_dw(cs, offset, 0); + for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { + cs = emit_semaphore_poll_until(cs, offset, i); + cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); + cs = emit_store_dw(cs, offset, 0); + } + + intel_ring_advance(rq, cs); + i915_request_add(rq); + + if (wait_for(READ_ONCE(*sema) == 0, 50)) { + err = -EIO; + goto err; + } + + for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { + preempt_disable(); + cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); + semaphore_set(sema, i); + preempt_enable(); + + if (wait_for(READ_ONCE(*sema) == 0, 50)) { + err = -EIO; + goto err; + } + + elapsed[i - 1] = sema[i] - cycles; + } + + cycles = trifilter(elapsed); + pr_info("%s: semaphore response %d cycles, %lluns\n", + ce->engine->name, cycles >> TF_BIAS, + cycles_to_ns(ce->engine, cycles)); + + return intel_gt_wait_for_idle(ce->engine->gt, HZ); + +err: + intel_gt_set_wedged(ce->engine->gt); + return err; +} + +static int measure_idle_dispatch(struct intel_context *ce) +{ + u32 *sema = hwsp_scratch(ce); + const u32 offset = hwsp_offset(ce, sema); + u32 elapsed[TF_COUNT], cycles; + u32 *cs; + int err; + int i; + + /* + * Measure how long it takes for us to submit a request while the + * engine is idle, but is resting in our context. + * + * A: read CS_TIMESTAMP from CPU + * submit request + * B: read CS_TIMESTAMP on GPU + * + * Submission latency: B - A + */ + + for (i = 0; i < ARRAY_SIZE(elapsed); i++) { + struct i915_request *rq; + + err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); + if (err) + return err; + + rq = i915_request_create(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err; + } + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) { + i915_request_add(rq); + err = PTR_ERR(cs); + goto err; + } + + cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); + + intel_ring_advance(rq, cs); + + preempt_disable(); + local_bh_disable(); + elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); + i915_request_add(rq); + local_bh_enable(); + preempt_enable(); + } + + err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); + if (err) + goto err; + + for (i = 0; i < ARRAY_SIZE(elapsed); i++) + elapsed[i] = sema[i] - elapsed[i]; + + cycles = trifilter(elapsed); + pr_info("%s: idle dispatch latency %d cycles, %lluns\n", + ce->engine->name, cycles >> TF_BIAS, + cycles_to_ns(ce->engine, cycles)); + + return intel_gt_wait_for_idle(ce->engine->gt, HZ); + +err: + intel_gt_set_wedged(ce->engine->gt); + return err; +} + +static int measure_busy_dispatch(struct intel_context *ce) +{ + u32 *sema = hwsp_scratch(ce); + const u32 offset = hwsp_offset(ce, sema); + u32 elapsed[TF_COUNT + 1], cycles; + u32 *cs; + int err; + int i; + + /* + * Measure how long it takes for us to submit a request while the + * engine is busy, polling on a semaphore in our context. With + * direct submission, this will include the cost of a lite restore. + * + * A: read CS_TIMESTAMP from CPU + * submit request + * B: read CS_TIMESTAMP on GPU + * + * Submission latency: B - A + */ + + for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { + struct i915_request *rq; + + rq = i915_request_create(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err; + } + + cs = intel_ring_begin(rq, 12); + if (IS_ERR(cs)) { + i915_request_add(rq); + err = PTR_ERR(cs); + goto err; + } + + cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); + cs = emit_semaphore_poll_until(cs, offset, i); + cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); + + intel_ring_advance(rq, cs); + + if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) { + err = -EIO; + goto err; + } + + preempt_disable(); + local_bh_disable(); + elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); + i915_request_add(rq); + local_bh_enable(); + semaphore_set(sema, i - 1); + preempt_enable(); + } + + wait_for(READ_ONCE(sema[i - 1]), 500); + semaphore_set(sema, i - 1); + + for (i = 1; i <= TF_COUNT; i++) { + GEM_BUG_ON(sema[i] == -1); + elapsed[i - 1] = sema[i] - elapsed[i]; + } + + cycles = trifilter(elapsed); + pr_info("%s: busy dispatch latency %d cycles, %lluns\n", + ce->engine->name, cycles >> TF_BIAS, + cycles_to_ns(ce->engine, cycles)); + + return intel_gt_wait_for_idle(ce->engine->gt, HZ); + +err: + intel_gt_set_wedged(ce->engine->gt); + return err; +} + +static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value) +{ + const u32 offset = + i915_ggtt_offset(engine->status_page.vma) + + offset_in_page(sema); + struct i915_request *rq; + u32 *cs; + + rq = i915_request_create(engine->kernel_context); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) { + i915_request_add(rq); + return PTR_ERR(cs); + } + + cs = emit_semaphore_poll(cs, mode, value, offset); + + intel_ring_advance(rq, cs); + i915_request_add(rq); + + return 0; +} + +static int measure_inter_request(struct intel_context *ce) +{ + u32 *sema = hwsp_scratch(ce); + const u32 offset = hwsp_offset(ce, sema); + u32 elapsed[TF_COUNT + 1], cycles; + struct i915_sw_fence *submit; + int i, err; + + /* + * Measure how long it takes to advance from one request into the + * next. Between each request we flush the GPU caches to memory, + * update the breadcrumbs, and then invalidate those caches. + * We queue up all the requests to be submitted in one batch so + * it should be one set of contiguous measurements. + * + * A: read CS_TIMESTAMP on GPU + * advance request + * B: read CS_TIMESTAMP on GPU + * + * Request latency: B - A + */ + + err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0); + if (err) + return err; + + submit = heap_fence_create(GFP_KERNEL); + if (!submit) { + semaphore_set(sema, 1); + return -ENOMEM; + } + + intel_engine_flush_submission(ce->engine); + for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { + struct i915_request *rq; + u32 *cs; + + rq = i915_request_create(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_submit; + } + + err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, + submit, + GFP_KERNEL); + if (err < 0) { + i915_request_add(rq); + goto err_submit; + } + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) { + i915_request_add(rq); + err = PTR_ERR(cs); + goto err_submit; + } + + cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); + + intel_ring_advance(rq, cs); + i915_request_add(rq); + } + local_bh_disable(); + i915_sw_fence_commit(submit); + local_bh_enable(); + intel_engine_flush_submission(ce->engine); + heap_fence_put(submit); + + semaphore_set(sema, 1); + err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); + if (err) + goto err; + + for (i = 1; i <= TF_COUNT; i++) + elapsed[i - 1] = sema[i + 1] - sema[i]; + + cycles = trifilter(elapsed); + pr_info("%s: inter-request latency %d cycles, %lluns\n", + ce->engine->name, cycles >> TF_BIAS, + cycles_to_ns(ce->engine, cycles)); + + return intel_gt_wait_for_idle(ce->engine->gt, HZ); + +err_submit: + i915_sw_fence_commit(submit); + heap_fence_put(submit); + semaphore_set(sema, 1); +err: + intel_gt_set_wedged(ce->engine->gt); + return err; +} + +static int measure_context_switch(struct intel_context *ce) +{ + u32 *sema = hwsp_scratch(ce); + const u32 offset = hwsp_offset(ce, sema); + struct i915_request *fence = NULL; + u32 elapsed[TF_COUNT + 1], cycles; + int i, j, err; + u32 *cs; + + /* + * Measure how long it takes to advance from one request in one + * context to a request in another context. This allows us to + * measure how long the context save/restore take, along with all + * the inter-context setup we require. + * + * A: read CS_TIMESTAMP on GPU + * switch context + * B: read CS_TIMESTAMP on GPU + * + * Context switch latency: B - A + */ + + err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0); + if (err) + return err; + + for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { + struct intel_context *arr[] = { + ce, ce->engine->kernel_context + }; + u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32); + + for (j = 0; j < ARRAY_SIZE(arr); j++) { + struct i915_request *rq; + + rq = i915_request_create(arr[j]); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_fence; + } + + if (fence) { + err = i915_request_await_dma_fence(rq, + &fence->fence); + if (err) { + i915_request_add(rq); + goto err_fence; + } + } + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) { + i915_request_add(rq); + err = PTR_ERR(cs); + goto err_fence; + } + + cs = emit_timestamp_store(cs, ce, addr); + addr += sizeof(u32); + + intel_ring_advance(rq, cs); + + i915_request_put(fence); + fence = i915_request_get(rq); + + i915_request_add(rq); + } + } + i915_request_put(fence); + intel_engine_flush_submission(ce->engine); + + semaphore_set(sema, 1); + err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); + if (err) + goto err; + + for (i = 1; i <= TF_COUNT; i++) + elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1]; + + cycles = trifilter(elapsed); + pr_info("%s: context switch latency %d cycles, %lluns\n", + ce->engine->name, cycles >> TF_BIAS, + cycles_to_ns(ce->engine, cycles)); + + return intel_gt_wait_for_idle(ce->engine->gt, HZ); + +err_fence: + i915_request_put(fence); + semaphore_set(sema, 1); +err: + intel_gt_set_wedged(ce->engine->gt); + return err; +} + +static int measure_preemption(struct intel_context *ce) +{ + u32 *sema = hwsp_scratch(ce); + const u32 offset = hwsp_offset(ce, sema); + u32 elapsed[TF_COUNT], cycles; + u32 *cs; + int err; + int i; + + /* + * We measure two latencies while triggering preemption. The first + * latency is how long it takes for us to submit a preempting request. + * The second latency is how it takes for us to return from the + * preemption back to the original context. + * + * A: read CS_TIMESTAMP from CPU + * submit preemption + * B: read CS_TIMESTAMP on GPU (in preempting context) + * context switch + * C: read CS_TIMESTAMP on GPU (in original context) + * + * Preemption dispatch latency: B - A + * Preemption switch latency: C - B + */ + + if (!intel_engine_has_preemption(ce->engine)) + return 0; + + for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { + u32 addr = offset + 2 * i * sizeof(u32); + struct i915_request *rq; + + rq = i915_request_create(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err; + } + + cs = intel_ring_begin(rq, 12); + if (IS_ERR(cs)) { + i915_request_add(rq); + err = PTR_ERR(cs); + goto err; + } + + cs = emit_store_dw(cs, addr, -1); + cs = emit_semaphore_poll_until(cs, offset, i); + cs = emit_timestamp_store(cs, ce, addr + sizeof(u32)); + + intel_ring_advance(rq, cs); + i915_request_add(rq); + + if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) { + err = -EIO; + goto err; + } + + rq = i915_request_create(ce->engine->kernel_context); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err; + } + + cs = intel_ring_begin(rq, 8); + if (IS_ERR(cs)) { + i915_request_add(rq); + err = PTR_ERR(cs); + goto err; + } + + cs = emit_timestamp_store(cs, ce, addr); + cs = emit_store_dw(cs, offset, i); + + intel_ring_advance(rq, cs); + rq->sched.attr.priority = I915_PRIORITY_BARRIER; + + elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); + i915_request_add(rq); + } + + if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) { + err = -EIO; + goto err; + } + + for (i = 1; i <= TF_COUNT; i++) + elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1]; + + cycles = trifilter(elapsed); + pr_info("%s: preemption dispatch latency %d cycles, %lluns\n", + ce->engine->name, cycles >> TF_BIAS, + cycles_to_ns(ce->engine, cycles)); + + for (i = 1; i <= TF_COUNT; i++) + elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0]; + + cycles = trifilter(elapsed); + pr_info("%s: preemption switch latency %d cycles, %lluns\n", + ce->engine->name, cycles >> TF_BIAS, + cycles_to_ns(ce->engine, cycles)); + + return intel_gt_wait_for_idle(ce->engine->gt, HZ); + +err: + intel_gt_set_wedged(ce->engine->gt); + return err; +} + +struct signal_cb { + struct dma_fence_cb base; + bool seen; +}; + +static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb) +{ + struct signal_cb *s = container_of(cb, typeof(*s), base); + + smp_store_mb(s->seen, true); /* be safe, be strong */ +} + +static int measure_completion(struct intel_context *ce) +{ + u32 *sema = hwsp_scratch(ce); + const u32 offset = hwsp_offset(ce, sema); + u32 elapsed[TF_COUNT], cycles; + u32 *cs; + int err; + int i; + + /* + * Measure how long it takes for the signal (interrupt) to be + * sent from the GPU to be processed by the CPU. + * + * A: read CS_TIMESTAMP on GPU + * signal + * B: read CS_TIMESTAMP from CPU + * + * Completion latency: B - A + */ + + for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { + struct signal_cb cb = { .seen = false }; + struct i915_request *rq; + + rq = i915_request_create(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err; + } + + cs = intel_ring_begin(rq, 12); + if (IS_ERR(cs)) { + i915_request_add(rq); + err = PTR_ERR(cs); + goto err; + } + + cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); + cs = emit_semaphore_poll_until(cs, offset, i); + cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); + + intel_ring_advance(rq, cs); + + dma_fence_add_callback(&rq->fence, &cb.base, signal_cb); + + local_bh_disable(); + i915_request_add(rq); + local_bh_enable(); + + if (wait_for(READ_ONCE(sema[i]) == -1, 50)) { + err = -EIO; + goto err; + } + + preempt_disable(); + semaphore_set(sema, i); + while (!READ_ONCE(cb.seen)) + cpu_relax(); + + elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); + preempt_enable(); + } + + err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); + if (err) + goto err; + + for (i = 0; i < ARRAY_SIZE(elapsed); i++) { + GEM_BUG_ON(sema[i + 1] == -1); + elapsed[i] = elapsed[i] - sema[i + 1]; + } + + cycles = trifilter(elapsed); + pr_info("%s: completion latency %d cycles, %lluns\n", + ce->engine->name, cycles >> TF_BIAS, + cycles_to_ns(ce->engine, cycles)); + + return intel_gt_wait_for_idle(ce->engine->gt, HZ); + +err: + intel_gt_set_wedged(ce->engine->gt); + return err; +} + +static void rps_pin(struct intel_gt *gt) +{ + /* Pin the frequency to max */ + atomic_inc(>->rps.num_waiters); + intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); + + mutex_lock(>->rps.lock); + intel_rps_set(>->rps, gt->rps.max_freq); + mutex_unlock(>->rps.lock); +} + +static void rps_unpin(struct intel_gt *gt) +{ + intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); + atomic_dec(>->rps.num_waiters); +} + +static int perf_request_latency(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *engine; + struct pm_qos_request qos; + int err = 0; + + if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */ + return 0; + + cpu_latency_qos_add_request(&qos, 0); /* disable cstates */ + + for_each_uabi_engine(engine, i915) { + struct intel_context *ce; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + goto out; + + err = intel_context_pin(ce); + if (err) { + intel_context_put(ce); + goto out; + } + + st_engine_heartbeat_disable(engine); + rps_pin(engine->gt); + + if (err == 0) + err = measure_semaphore_response(ce); + if (err == 0) + err = measure_idle_dispatch(ce); + if (err == 0) + err = measure_busy_dispatch(ce); + if (err == 0) + err = measure_inter_request(ce); + if (err == 0) + err = measure_context_switch(ce); + if (err == 0) + err = measure_preemption(ce); + if (err == 0) + err = measure_completion(ce); + + rps_unpin(engine->gt); + st_engine_heartbeat_enable(engine); + + intel_context_unpin(ce); + intel_context_put(ce); + if (err) + goto out; + } + +out: + if (igt_flush_test(i915)) + err = -EIO; + + cpu_latency_qos_remove_request(&qos); + return err; +} + static int s_sync0(void *arg) { struct perf_series *ps = arg; @@ -1685,9 +2492,11 @@ static int perf_series_engines(void *arg) intel_engine_pm_get(p->engine); if (intel_engine_supports_stats(p->engine)) - p->busy = intel_engine_get_busy_time(p->engine) + 1; + p->busy = intel_engine_get_busy_time(p->engine, + &p->time) + 1; + else + p->time = ktime_get(); p->runtime = -intel_context_get_total_runtime_ns(ce); - p->time = ktime_get(); } err = (*fn)(ps); @@ -1698,13 +2507,15 @@ static int perf_series_engines(void *arg) struct perf_stats *p = &stats[idx]; struct intel_context *ce = ps->ce[idx]; int integer, decimal; - u64 busy, dt; + u64 busy, dt, now; - p->time = ktime_sub(ktime_get(), p->time); - if (p->busy) { - p->busy = ktime_sub(intel_engine_get_busy_time(p->engine), + if (p->busy) + p->busy = ktime_sub(intel_engine_get_busy_time(p->engine, + &now), p->busy - 1); - } + else + now = ktime_get(); + p->time = ktime_sub(now, p->time); err = switch_to_kernel_sync(ce, err); p->runtime += intel_context_get_total_runtime_ns(ce); @@ -1764,13 +2575,14 @@ static int p_sync0(void *arg) return err; } - busy = false; if (intel_engine_supports_stats(engine)) { - p->busy = intel_engine_get_busy_time(engine); + p->busy = intel_engine_get_busy_time(engine, &p->time); busy = true; + } else { + p->time = ktime_get(); + busy = false; } - p->time = ktime_get(); count = 0; do { struct i915_request *rq; @@ -1793,11 +2605,15 @@ static int p_sync0(void *arg) count++; } while (!__igt_timeout(end_time, NULL)); - p->time = ktime_sub(ktime_get(), p->time); if (busy) { - p->busy = ktime_sub(intel_engine_get_busy_time(engine), + ktime_t now; + + p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now), p->busy); + p->time = ktime_sub(now, p->time); + } else { + p->time = ktime_sub(ktime_get(), p->time); } err = switch_to_kernel_sync(ce, err); @@ -1830,13 +2646,14 @@ static int p_sync1(void *arg) return err; } - busy = false; if (intel_engine_supports_stats(engine)) { - p->busy = intel_engine_get_busy_time(engine); + p->busy = intel_engine_get_busy_time(engine, &p->time); busy = true; + } else { + p->time = ktime_get(); + busy = false; } - p->time = ktime_get(); count = 0; do { struct i915_request *rq; @@ -1861,11 +2678,15 @@ static int p_sync1(void *arg) count++; } while (!__igt_timeout(end_time, NULL)); i915_request_put(prev); - p->time = ktime_sub(ktime_get(), p->time); if (busy) { - p->busy = ktime_sub(intel_engine_get_busy_time(engine), + ktime_t now; + + p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now), p->busy); + p->time = ktime_sub(now, p->time); + } else { + p->time = ktime_sub(ktime_get(), p->time); } err = switch_to_kernel_sync(ce, err); @@ -1897,14 +2718,15 @@ static int p_many(void *arg) return err; } - busy = false; if (intel_engine_supports_stats(engine)) { - p->busy = intel_engine_get_busy_time(engine); + p->busy = intel_engine_get_busy_time(engine, &p->time); busy = true; + } else { + p->time = ktime_get(); + busy = false; } count = 0; - p->time = ktime_get(); do { struct i915_request *rq; @@ -1917,11 +2739,15 @@ static int p_many(void *arg) i915_request_add(rq); count++; } while (!__igt_timeout(end_time, NULL)); - p->time = ktime_sub(ktime_get(), p->time); if (busy) { - p->busy = ktime_sub(intel_engine_get_busy_time(engine), + ktime_t now; + + p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now), p->busy); + p->time = ktime_sub(now, p->time); + } else { + p->time = ktime_sub(ktime_get(), p->time); } err = switch_to_kernel_sync(ce, err); @@ -2042,6 +2868,7 @@ static int perf_parallel_engines(void *arg) int i915_request_perf_selftests(struct drm_i915_private *i915) { static const struct i915_subtest tests[] = { + SUBTEST(perf_request_latency), SUBTEST(perf_series_engines), SUBTEST(perf_parallel_engines), }; diff --git a/drivers/gpu/drm/i915/selftests/igt_spinner.c b/drivers/gpu/drm/i915/selftests/igt_spinner.c index e35ba5f9e73f..ec0ecb4e4ca6 100644 --- a/drivers/gpu/drm/i915/selftests/igt_spinner.c +++ b/drivers/gpu/drm/i915/selftests/igt_spinner.c @@ -134,15 +134,15 @@ igt_spinner_create_request(struct igt_spinner *spin, batch = spin->batch; - if (INTEL_GEN(rq->i915) >= 8) { + if (INTEL_GEN(rq->engine->i915) >= 8) { *batch++ = MI_STORE_DWORD_IMM_GEN4; *batch++ = lower_32_bits(hws_address(hws, rq)); *batch++ = upper_32_bits(hws_address(hws, rq)); - } else if (INTEL_GEN(rq->i915) >= 6) { + } else if (INTEL_GEN(rq->engine->i915) >= 6) { *batch++ = MI_STORE_DWORD_IMM_GEN4; *batch++ = 0; *batch++ = hws_address(hws, rq); - } else if (INTEL_GEN(rq->i915) >= 4) { + } else if (INTEL_GEN(rq->engine->i915) >= 4) { *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; *batch++ = 0; *batch++ = hws_address(hws, rq); @@ -154,11 +154,11 @@ igt_spinner_create_request(struct igt_spinner *spin, *batch++ = arbitration_command; - if (INTEL_GEN(rq->i915) >= 8) + if (INTEL_GEN(rq->engine->i915) >= 8) *batch++ = MI_BATCH_BUFFER_START | BIT(8) | 1; - else if (IS_HASWELL(rq->i915)) + else if (IS_HASWELL(rq->engine->i915)) *batch++ = MI_BATCH_BUFFER_START | MI_BATCH_PPGTT_HSW; - else if (INTEL_GEN(rq->i915) >= 6) + else if (INTEL_GEN(rq->engine->i915) >= 6) *batch++ = MI_BATCH_BUFFER_START; else *batch++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; @@ -176,7 +176,7 @@ igt_spinner_create_request(struct igt_spinner *spin, } flags = 0; - if (INTEL_GEN(rq->i915) <= 5) + if (INTEL_GEN(rq->engine->i915) <= 5) flags |= I915_DISPATCH_SECURE; err = engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags); @@ -221,8 +221,8 @@ bool igt_wait_for_spinner(struct igt_spinner *spin, struct i915_request *rq) { return !(wait_for_us(i915_seqno_passed(hws_seqno(spin, rq), rq->fence.seqno), - 10) && + 100) && wait_for(i915_seqno_passed(hws_seqno(spin, rq), rq->fence.seqno), - 1000)); + 50)); } diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c index 9b105b811f1f..b9810bf156c3 100644 --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c @@ -24,6 +24,7 @@ #include <linux/pm_domain.h> #include <linux/pm_runtime.h> +#include <linux/iommu.h> #include <drm/drm_managed.h> @@ -118,6 +119,9 @@ struct drm_i915_private *mock_gem_device(void) { struct drm_i915_private *i915; struct pci_dev *pdev; +#if IS_ENABLED(CONFIG_IOMMU_API) && defined(CONFIG_INTEL_IOMMU) + struct dev_iommu iommu; +#endif int err; pdev = kzalloc(sizeof(*pdev), GFP_KERNEL); @@ -136,8 +140,10 @@ struct drm_i915_private *mock_gem_device(void) dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); #if IS_ENABLED(CONFIG_IOMMU_API) && defined(CONFIG_INTEL_IOMMU) - /* hack to disable iommu for the fake device; force identity mapping */ - pdev->dev.archdata.iommu = (void *)-1; + /* HACK HACK HACK to disable iommu for the fake device; force identity mapping */ + memset(&iommu, 0, sizeof(iommu)); + iommu.priv = (void *)-1; + pdev->dev.iommu = &iommu; #endif pci_set_drvdata(pdev, i915); @@ -190,7 +196,8 @@ struct drm_i915_private *mock_gem_device(void) mock_init_ggtt(i915, &i915->ggtt); i915->gt.vm = i915_vm_get(&i915->ggtt.vm); - mkwrite_device_info(i915)->engine_mask = BIT(0); + mkwrite_device_info(i915)->platform_engine_mask = BIT(0); + i915->gt.info.engine_mask = BIT(0); i915->gt.engine[RCS0] = mock_engine(i915, "mock", RCS0); if (!i915->gt.engine[RCS0]) diff --git a/drivers/gpu/drm/i915/selftests/mock_gtt.c b/drivers/gpu/drm/i915/selftests/mock_gtt.c index edc5e3dda8ca..b173086411ef 100644 --- a/drivers/gpu/drm/i915/selftests/mock_gtt.c +++ b/drivers/gpu/drm/i915/selftests/mock_gtt.c @@ -38,7 +38,8 @@ static void mock_insert_entries(struct i915_address_space *vm, { } -static int mock_bind_ppgtt(struct i915_vma *vma, +static int mock_bind_ppgtt(struct i915_address_space *vm, + struct i915_vma *vma, enum i915_cache_level cache_level, u32 flags) { @@ -47,7 +48,8 @@ static int mock_bind_ppgtt(struct i915_vma *vma, return 0; } -static void mock_unbind_ppgtt(struct i915_vma *vma) +static void mock_unbind_ppgtt(struct i915_address_space *vm, + struct i915_vma *vma) { } @@ -88,7 +90,8 @@ struct i915_ppgtt *mock_ppgtt(struct drm_i915_private *i915, const char *name) return ppgtt; } -static int mock_bind_ggtt(struct i915_vma *vma, +static int mock_bind_ggtt(struct i915_address_space *vm, + struct i915_vma *vma, enum i915_cache_level cache_level, u32 flags) { @@ -96,7 +99,8 @@ static int mock_bind_ggtt(struct i915_vma *vma, return 0; } -static void mock_unbind_ggtt(struct i915_vma *vma) +static void mock_unbind_ggtt(struct i915_address_space *vm, + struct i915_vma *vma) { } diff --git a/drivers/gpu/drm/i915/selftests/mock_region.c b/drivers/gpu/drm/i915/selftests/mock_region.c index b2ad41c27e67..09660f5a0a4c 100644 --- a/drivers/gpu/drm/i915/selftests/mock_region.c +++ b/drivers/gpu/drm/i915/selftests/mock_region.c @@ -9,6 +9,7 @@ #include "mock_region.h" static const struct drm_i915_gem_object_ops mock_region_obj_ops = { + .name = "mock-region", .get_pages = i915_gem_object_get_pages_buddy, .put_pages = i915_gem_object_put_pages_buddy, .release = i915_gem_object_release_memory_region, |