Diffstat (limited to 'drivers/gpu/drm/xe/xe_guc_submit.c')
 drivers/gpu/drm/xe/xe_guc_submit.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 82 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 77b0f0d8f729..fbbe6a487bbb 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -1071,7 +1071,9 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
struct xe_exec_queue *q = job->q;
struct xe_gpu_scheduler *sched = &q->guc->sched;
struct xe_guc *guc = exec_queue_to_guc(q);
+ const char *process_name = "no process";
int err = -ETIME;
+ pid_t pid = -1;
int i = 0;
bool wedged, skip_timeout_check;
@@ -1168,9 +1170,14 @@ trigger_reset:
goto sched_enable;
}
- xe_gt_notice(guc_to_gt(guc), "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
+ if (q->vm && q->vm->xef) {
+ process_name = q->vm->xef->process_name;
+ pid = q->vm->xef->pid;
+ }
+ xe_gt_notice(guc_to_gt(guc), "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx in %s [%d]",
xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
- q->guc->id, q->flags);
+ q->guc->id, q->flags, process_name, pid);
+
trace_xe_sched_job_timedout(job);
if (!exec_queue_killed(q))
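With the hunk above, the timeout notice now names the offending process and PID, falling back to "no process" / -1 for queues without an associated xe_file (e.g. kernel-internal queues). A hypothetical example of the resulting message (device prefix and values invented for illustration):

    xe 0000:03:00.0: [drm] GT0: Timedout job: seqno=4096, lrc_seqno=4096, guc_id=7, flags=0x0 in myapp [1234]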
@@ -1312,6 +1319,15 @@ static void __guc_exec_queue_process_msg_set_sched_props(struct xe_sched_msg *ms
kfree(msg);
}
+static void __suspend_fence_signal(struct xe_exec_queue *q)
+{
+ if (!q->guc->suspend_pending)
+ return;
+
+ WRITE_ONCE(q->guc->suspend_pending, false);
+ wake_up(&q->guc->suspend_wait);
+}
+
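The WRITE_ONCE() in __suspend_fence_signal() pairs with a READ_ONCE() of suspend_pending in the reworked guc_exec_queue_suspend_wait() further down, replacing the old explicit smp_wmb(); the wait_event()/wake_up() machinery supplies the barriers needed between the store and waking the waiter. A minimal sketch of the two sides, using the fields from this patch:

    /* signaler */
    WRITE_ONCE(q->guc->suspend_pending, false);
    wake_up(&q->guc->suspend_wait);

    /* waiter */
    wait_event_interruptible_timeout(q->guc->suspend_wait,
            !READ_ONCE(q->guc->suspend_pending) || ..., HZ * 5);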
static void suspend_fence_signal(struct xe_exec_queue *q)
{
struct xe_guc *guc = exec_queue_to_guc(q);
@@ -1321,9 +1337,7 @@ static void suspend_fence_signal(struct xe_exec_queue *q)
guc_read_stopped(guc));
xe_assert(xe, q->guc->suspend_pending);
- q->guc->suspend_pending = false;
- smp_wmb();
- wake_up(&q->guc->suspend_wait);
+ __suspend_fence_signal(q);
}
static void __guc_exec_queue_process_msg_suspend(struct xe_sched_msg *msg)
@@ -1360,9 +1374,11 @@ static void __guc_exec_queue_process_msg_resume(struct xe_sched_msg *msg)
struct xe_exec_queue *q = msg->private_data;
if (guc_exec_queue_allowed_to_change_state(q)) {
- q->guc->resume_time = RESUME_PENDING;
clear_exec_queue_suspended(q);
- enable_scheduling(q);
+ if (!exec_queue_enabled(q)) {
+ q->guc->resume_time = RESUME_PENDING;
+ enable_scheduling(q);
+ }
} else {
clear_exec_queue_suspended(q);
}
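The resume handler is now idempotent with respect to scheduling state: RESUME_PENDING is stamped and enable_scheduling() issued only when the queue is not already enabled, presumably to avoid sending a redundant scheduling-enable to the GuC for a queue that never actually stopped.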
@@ -1372,9 +1388,13 @@ static void __guc_exec_queue_process_msg_resume(struct xe_sched_msg *msg)
#define SET_SCHED_PROPS 2
#define SUSPEND 3
#define RESUME 4
+#define OPCODE_MASK 0xf
+#define MSG_LOCKED BIT(8)
static void guc_exec_queue_process_msg(struct xe_sched_msg *msg)
{
+ struct xe_device *xe = guc_to_xe(exec_queue_to_guc(msg->private_data));
+
trace_xe_sched_msg_recv(msg);
switch (msg->opcode) {
@@ -1394,7 +1414,7 @@ static void guc_exec_queue_process_msg(struct xe_sched_msg *msg)
XE_WARN_ON("Unknown message type");
}
- xe_pm_runtime_put(guc_to_xe(exec_queue_to_guc(msg->private_data)));
+ xe_pm_runtime_put(xe);
}
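The opcode argument to guc_exec_queue_add_msg() now doubles as a flag word: the low bits select the message type and MSG_LOCKED (bit 8) tells the helper that the caller already holds the scheduler's message lock. An illustrative expansion using the defines above:

    u32 arg = SUSPEND | MSG_LOCKED;      /* 0x103 */
    msg->opcode = arg & OPCODE_MASK;     /* stores 3 (SUSPEND) */
    if (arg & MSG_LOCKED)                /* caller holds the msg lock */
        xe_sched_add_msg_locked(&q->guc->sched, msg);
    else
        xe_sched_add_msg(&q->guc->sched, msg);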
static const struct drm_sched_backend_ops drm_sched_ops = {
@@ -1414,7 +1434,7 @@ static int guc_exec_queue_init(struct xe_exec_queue *q)
struct xe_device *xe = guc_to_xe(guc);
struct xe_guc_exec_queue *ge;
long timeout;
- int err;
+ int err, i;
xe_assert(xe, xe_device_uc_enabled(guc_to_xe(guc)));
@@ -1426,6 +1446,9 @@ static int guc_exec_queue_init(struct xe_exec_queue *q)
ge->q = q;
init_waitqueue_head(&ge->suspend_wait);
+ for (i = 0; i < MAX_STATIC_MSG_TYPE; ++i)
+ INIT_LIST_HEAD(&ge->static_msgs[i].link);
+
timeout = (q->vm && xe_vm_in_lr_mode(q->vm)) ? MAX_SCHEDULE_TIMEOUT :
msecs_to_jiffies(q->sched_props.job_timeout_ms);
err = xe_sched_init(&ge->sched, &drm_sched_ops, &xe_sched_ops,
@@ -1478,6 +1501,7 @@ static void guc_exec_queue_kill(struct xe_exec_queue *q)
{
trace_xe_exec_queue_kill(q);
set_exec_queue_killed(q);
+ __suspend_fence_signal(q);
xe_guc_exec_queue_trigger_cleanup(q);
}
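Calling __suspend_fence_signal() on kill closes a window where a waiter could otherwise sleep out the full timeout: if the queue is killed after guc_exec_queue_suspend() set suspend_pending but before the SUSPEND message is processed, the kill path now clears the flag and wakes guc_exec_queue_suspend_wait() immediately, rather than leaving it to the exec_queue_killed() check or the 5 second timeout below.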
@@ -1487,11 +1511,26 @@ static void guc_exec_queue_add_msg(struct xe_exec_queue *q, struct xe_sched_msg
xe_pm_runtime_get_noresume(guc_to_xe(exec_queue_to_guc(q)));
INIT_LIST_HEAD(&msg->link);
- msg->opcode = opcode;
+ msg->opcode = opcode & OPCODE_MASK;
msg->private_data = q;
trace_xe_sched_msg_add(msg);
- xe_sched_add_msg(&q->guc->sched, msg);
+ if (opcode & MSG_LOCKED)
+ xe_sched_add_msg_locked(&q->guc->sched, msg);
+ else
+ xe_sched_add_msg(&q->guc->sched, msg);
+}
+
+static bool guc_exec_queue_try_add_msg(struct xe_exec_queue *q,
+ struct xe_sched_msg *msg,
+ u32 opcode)
+{
+ if (!list_empty(&msg->link))
+ return false;
+
+ guc_exec_queue_add_msg(q, msg, opcode | MSG_LOCKED);
+
+ return true;
}
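guc_exec_queue_try_add_msg() uses list_empty(&msg->link) as the "not already queued" test, which is why guc_exec_queue_init() above now runs INIT_LIST_HEAD() on every static message: a zeroed, uninitialized link would read as non-empty and the message could never be submitted. Callers are expected to hold the message lock, as the suspend path below does:

    xe_sched_msg_lock(sched);
    if (guc_exec_queue_try_add_msg(q, msg, SUSPEND))
        q->guc->suspend_pending = true;    /* only on first queueing */
    xe_sched_msg_unlock(sched);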
#define STATIC_MSG_CLEANUP 0
@@ -1565,34 +1604,59 @@ static int guc_exec_queue_set_preempt_timeout(struct xe_exec_queue *q,
static int guc_exec_queue_suspend(struct xe_exec_queue *q)
{
+ struct xe_gpu_scheduler *sched = &q->guc->sched;
struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_SUSPEND;
- if (exec_queue_killed_or_banned_or_wedged(q) || q->guc->suspend_pending)
+ if (exec_queue_killed_or_banned_or_wedged(q))
return -EINVAL;
- q->guc->suspend_pending = true;
- guc_exec_queue_add_msg(q, msg, SUSPEND);
+ xe_sched_msg_lock(sched);
+ if (guc_exec_queue_try_add_msg(q, msg, SUSPEND))
+ q->guc->suspend_pending = true;
+ xe_sched_msg_unlock(sched);
return 0;
}
-static void guc_exec_queue_suspend_wait(struct xe_exec_queue *q)
+static int guc_exec_queue_suspend_wait(struct xe_exec_queue *q)
{
struct xe_guc *guc = exec_queue_to_guc(q);
+ int ret;
+
+ /*
+ * Likely don't need to check exec_queue_killed() as we clear
+ * suspend_pending upon kill, but to be paranoid about races in which
+ * suspend_pending is set after kill, also check kill here.
+ */
+ ret = wait_event_interruptible_timeout(q->guc->suspend_wait,
+ !READ_ONCE(q->guc->suspend_pending) ||
+ exec_queue_killed(q) ||
+ guc_read_stopped(guc),
+ HZ * 5);
- wait_event(q->guc->suspend_wait, !q->guc->suspend_pending ||
- guc_read_stopped(guc));
+ if (!ret) {
+ xe_gt_warn(guc_to_gt(guc),
+ "Suspend fence, guc_id=%d, failed to respond",
+ q->guc->id);
+ /* XXX: Trigger GT reset? */
+ return -ETIME;
+ }
+
+ return ret < 0 ? ret : 0;
}
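The tail of guc_exec_queue_suspend_wait() follows the wait_event_interruptible_timeout() return convention: 0 means the timeout elapsed, a negative value means the wait was interrupted by a signal, and a positive value (the remaining jiffies) means the condition became true:

    if (!ret)
        return -ETIME;             /* timed out: fence never signalled */
    return ret < 0 ? ret : 0;      /* -ERESTARTSYS on signal, 0 on success */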
static void guc_exec_queue_resume(struct xe_exec_queue *q)
{
+ struct xe_gpu_scheduler *sched = &q->guc->sched;
struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_RESUME;
struct xe_guc *guc = exec_queue_to_guc(q);
struct xe_device *xe = guc_to_xe(guc);
xe_assert(xe, !q->guc->suspend_pending);
- guc_exec_queue_add_msg(q, msg, RESUME);
+ xe_sched_msg_lock(sched);
+ guc_exec_queue_try_add_msg(q, msg, RESUME);
+ xe_sched_msg_unlock(sched);
}
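Here the return value of guc_exec_queue_try_add_msg() can be ignored: if a RESUME message is already queued, a second one would be redundant, and the assert above guarantees no suspend is pending at this point.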
static bool guc_exec_queue_reset_status(struct xe_exec_queue *q)