drm/nouveau/fifo: add common channel recovery

That sure was fun to untangle. - handled per-runlist, rather than globally - more straight-forward process in general - various potential SW/HW races have been fixed - fixes lockdep issues that were present in >=gk104's prior implementation - volta recovery now actually stands a chance of working - volta/turing waiting for PBDMA idle before engine reset - turing using hw-provided TSG info for CTXSW_TIMEOUT Signed-off-by: Ben Skeggs <bskeggs@redhat.com> Reviewed-by: Lyude Paul <lyude@redhat.com>
author: Ben Skeggs <bskeggs@redhat.com> 2022-06-01 20:47:34 +1000
committer: Ben Skeggs <bskeggs@redhat.com> 2022-11-09 10:44:49 +1000
commit: 4d60100a23ec5b98e43277d82e5de53c359cf02c (patch)
tree: bad1fa3de47ec94ae17a7b39291d484a83310d1c /drivers/gpu/drm/nouveau/nvkm/engine/fifo/gk104.c
parent: 0b1bb1296f288bb7164d143ca82dc958f87cbff6 (diff)
1 files changed, 71 insertions, 270 deletions
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gk104.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gk104.c
index 41b265b683cd..d8cb2626b188 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gk104.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gk104.c
@@ -33,7 +33,6 @@
 #include <core/gpuobj.h>
 #include <subdev/bar.h>
 #include <subdev/mc.h>
-#include <subdev/timer.h>
 #include <subdev/top.h>
 
 #include <nvif/class.h>
@@ -89,14 +88,23 @@ gk104_chan = {
 	.preempt = gf100_chan_preempt,
 };
 
-void
-gk104_fifo_engine_status(struct gk104_fifo *fifo, int engn,
-			 struct gk104_fifo_engine_status *status)
+/*TODO: clean this up */
+struct gk104_engn_status {
+	bool busy;
+	bool faulted;
+	bool chsw;
+	bool save;
+	bool load;
+	struct {
+		bool tsg;
+		u32 id;
+	} prev, next, *chan;
+};
+
+static void
+gk104_engn_status(struct nvkm_engn *engn, struct gk104_engn_status *status)
 {
-	struct nvkm_engine *engine = fifo->engine[engn].engine;
-	struct nvkm_subdev *subdev = &fifo->base.engine.subdev;
-	struct nvkm_device *device = subdev->device;
-	u32 stat = nvkm_rd32(device, 0x002640 + (engn * 0x08));
+	u32 stat = nvkm_rd32(engn->runl->fifo->engine.subdev.device, 0x002640 + (engn->id * 0x08));
 
 	status->busy     = !!(stat & 0x80000000);
 	status->faulted  = !!(stat & 0x40000000);
@@ -111,7 +119,7 @@ gk104_fifo_engine_status(struct gk104_fifo *fifo, int engn,
 
 	if (status->busy && status->chsw) {
 		if (status->load && status->save) {
-			if (engine && nvkm_engine_chsw_load(engine))
+			if (nvkm_engine_chsw_load(engn->engine))
 				status->chan = &status->next;
 			else
 				status->chan = &status->prev;
@@ -126,24 +134,64 @@ gk104_fifo_engine_status(struct gk104_fifo *fifo, int engn,
 		status->chan = &status->prev;
 	}
 
-	nvkm_debug(subdev, "engine %02d: busy %d faulted %d chsw %d "
-			   "save %d load %d %sid %d%s-> %sid %d%s\n",
-		   engn, status->busy, status->faulted,
-		   status->chsw, status->save, status->load,
+	ENGN_DEBUG(engn, "%08x: busy %d faulted %d chsw %d save %d load %d %sid %d%s-> %sid %d%s",
+		   stat, status->busy, status->faulted, status->chsw, status->save, status->load,
 		   status->prev.tsg ? "tsg" : "ch", status->prev.id,
 		   status->chan == &status->prev ? "*" : " ",
 		   status->next.tsg ? "tsg" : "ch", status->next.id,
 		   status->chan == &status->next ? "*" : " ");
 }
 
+int
+gk104_engn_cxid(struct nvkm_engn *engn, bool *cgid)
+{
+	struct gk104_engn_status status;
+
+	gk104_engn_status(engn, &status);
+	if (status.chan) {
+		*cgid = status.chan->tsg;
+		return status.chan->id;
+	}
+
+	return -ENODEV;
+}
+
+bool
+gk104_engn_chsw(struct nvkm_engn *engn)
+{
+	struct gk104_engn_status status;
+
+	gk104_engn_status(engn, &status);
+	if (status.busy && status.chsw)
+		return true;
+
+	return false;
+}
+
 const struct nvkm_engn_func
 gk104_engn = {
+	.chsw = gk104_engn_chsw,
+	.cxid = gk104_engn_cxid,
+	.mmu_fault_trigger = gf100_engn_mmu_fault_trigger,
+	.mmu_fault_triggered = gf100_engn_mmu_fault_triggered,
 };
 
 const struct nvkm_engn_func
 gk104_engn_ce = {
+	.chsw = gk104_engn_chsw,
+	.cxid = gk104_engn_cxid,
+	.mmu_fault_trigger = gf100_engn_mmu_fault_trigger,
+	.mmu_fault_triggered = gf100_engn_mmu_fault_triggered,
 };
 
+bool
+gk104_runq_idle(struct nvkm_runq *runq)
+{
+	struct nvkm_device *device = runq->fifo->engine.subdev.device;
+
+	return !(nvkm_rd32(device, 0x003080 + (runq->id * 4)) & 0x0000e000);
+}
+
 static const struct nvkm_bitfield
 gk104_runq_intr_1_names[] = {
 	{ 0x00000001, "HCE_RE_ILLEGAL_OP" },
@@ -248,9 +296,16 @@ gk104_runq = {
 	.init = gk104_runq_init,
 	.intr = gk104_runq_intr,
 	.intr_0_names = gk104_runq_intr_0_names,
+	.idle = gk104_runq_idle,
 };
 
 void
+gk104_runl_fault_clear(struct nvkm_runl *runl)
+{
+	nvkm_wr32(runl->fifo->engine.subdev.device, 0x00262c, BIT(runl->id));
+}
+
+void
 gk104_runl_allow(struct nvkm_runl *runl, u32 engm)
 {
 	nvkm_mask(runl->fifo->engine.subdev.device, 0x002630, BIT(runl->id), 0x00000000);
@@ -373,6 +428,7 @@ gk104_runl = {
 	.pending = gk104_runl_pending,
 	.block = gk104_runl_block,
 	.allow = gk104_runl_allow,
+	.fault_clear = gk104_runl_fault_clear,
 	.preempt_pending = gf100_runl_preempt_pending,
 };
 
@@ -394,193 +450,6 @@ gk104_fifo_engine_id(struct nvkm_fifo *base, struct nvkm_engine *engine)
 	return -1;
 }
 
-static void
-gk104_fifo_recover_work(struct work_struct *w)
-{
-	struct gk104_fifo *fifo = container_of(w, typeof(*fifo), recover.work);
-	struct nvkm_device *device = fifo->base.engine.subdev.device;
-	struct nvkm_engine *engine;
-	unsigned long flags;
-	u32 engm, runm, todo;
-	int engn, runl;
-
-	spin_lock_irqsave(&fifo->base.lock, flags);
-	runm = fifo->recover.runm;
-	engm = fifo->recover.engm;
-	fifo->recover.engm = 0;
-	fifo->recover.runm = 0;
-	spin_unlock_irqrestore(&fifo->base.lock, flags);
-
-	nvkm_mask(device, 0x002630, runm, runm);
-
-	for (todo = engm; engn = __ffs(todo), todo; todo &= ~BIT(engn)) {
-		if ((engine = fifo->engine[engn].engine)) {
-			nvkm_subdev_fini(&engine->subdev, false);
-			WARN_ON(nvkm_subdev_init(&engine->subdev));
-		}
-	}
-
-	for (todo = runm; runl = __ffs(todo), todo; todo &= ~BIT(runl))
-		gk104_fifo_runlist_update(fifo, runl);
-
-	nvkm_wr32(device, 0x00262c, runm);
-	nvkm_mask(device, 0x002630, runm, 0x00000000);
-}
-
-static void gk104_fifo_recover_engn(struct gk104_fifo *fifo, int engn);
-
-static void
-gk104_fifo_recover_runl(struct gk104_fifo *fifo, int runl)
-{
-	struct nvkm_subdev *subdev = &fifo->base.engine.subdev;
-	struct nvkm_device *device = subdev->device;
-	const u32 runm = BIT(runl);
-
-	assert_spin_locked(&fifo->base.lock);
-	if (fifo->recover.runm & runm)
-		return;
-	fifo->recover.runm |= runm;
-
-	/* Block runlist to prevent channel assignment(s) from changing. */
-	nvkm_mask(device, 0x002630, runm, runm);
-
-	/* Schedule recovery. */
-	nvkm_warn(subdev, "runlist %d: scheduled for recovery\n", runl);
-	schedule_work(&fifo->recover.work);
-}
-
-static struct gk104_fifo_chan *
-gk104_fifo_recover_chid(struct gk104_fifo *fifo, int runl, int chid)
-{
-	struct gk104_fifo_chan *chan;
-	struct nvkm_fifo_cgrp *cgrp;
-
-	list_for_each_entry(chan, &fifo->runlist[runl].chan, head) {
-		if (chan->base.chid == chid) {
-			list_del_init(&chan->head);
-			return chan;
-		}
-	}
-
-	list_for_each_entry(cgrp, &fifo->runlist[runl].cgrp, head) {
-		if (cgrp->id == chid) {
-			chan = list_first_entry(&cgrp->chan, typeof(*chan), head);
-			list_del_init(&chan->head);
-			if (!--cgrp->chan_nr)
-				list_del_init(&cgrp->head);
-			return chan;
-		}
-	}
-
-	return NULL;
-}
-
-void
-gk104_fifo_recover_chan(struct nvkm_fifo *base, int chid)
-{
-	struct gk104_fifo *fifo = gk104_fifo(base);
-	struct nvkm_subdev *subdev = &fifo->base.engine.subdev;
-	struct nvkm_device *device = subdev->device;
-	const u32  stat = nvkm_rd32(device, 0x800004 + (chid * 0x08));
-	const u32  runl = (stat & 0x000f0000) >> 16;
-	const bool used = (stat & 0x00000001);
-	unsigned long engn, engm = fifo->runlist[runl].engm;
-	struct gk104_fifo_chan *chan;
-
-	assert_spin_locked(&fifo->base.lock);
-	if (!used)
-		return;
-
-	/* Lookup SW state for channel, and mark it as dead. */
-	chan = gk104_fifo_recover_chid(fifo, runl, chid);
-	if (chan) {
-		chan->killed = true;
-		nvkm_chan_error(&chan->base, false);
-	}
-
-	/* Block channel assignments from changing during recovery. */
-	gk104_fifo_recover_runl(fifo, runl);
-
-	/* Schedule recovery for any engines the channel is on. */
-	for_each_set_bit(engn, &engm, fifo->engine_nr) {
-		struct gk104_fifo_engine_status status;
-		gk104_fifo_engine_status(fifo, engn, &status);
-		if (!status.chan || status.chan->id != chid)
-			continue;
-		gk104_fifo_recover_engn(fifo, engn);
-	}
-}
-
-static void
-gk104_fifo_recover_engn(struct gk104_fifo *fifo, int engn)
-{
-	struct nvkm_engine *engine = fifo->engine[engn].engine;
-	struct nvkm_subdev *subdev = &fifo->base.engine.subdev;
-	struct nvkm_device *device = subdev->device;
-	const u32 runl = fifo->engine[engn].runl;
-	const u32 engm = BIT(engn);
-	struct gk104_fifo_engine_status status;
-	int mmui = -1;
-
-	assert_spin_locked(&fifo->base.lock);
-	if (fifo->recover.engm & engm)
-		return;
-	fifo->recover.engm |= engm;
-
-	/* Block channel assignments from changing during recovery. */
-	gk104_fifo_recover_runl(fifo, runl);
-
-	/* Determine which channel (if any) is currently on the engine. */
-	gk104_fifo_engine_status(fifo, engn, &status);
-	if (status.chan) {
-		/* The channel is not longer viable, kill it. */
-		gk104_fifo_recover_chan(&fifo->base, status.chan->id);
-	}
-
-	/* Determine MMU fault ID for the engine, if we're not being
-	 * called from the fault handler already.
-	 */
-	if (!status.faulted && engine) {
-		mmui = nvkm_top_fault_id(device, engine->subdev.type, engine->subdev.inst);
-		if (mmui < 0) {
-			const struct nvkm_enum *en = fifo->func->mmu_fault->engine;
-			for (; en && en->name; en++) {
-				if (en->data2 == engine->subdev.type &&
-				    en->inst  == engine->subdev.inst) {
-					mmui = en->value;
-					break;
-				}
-			}
-		}
-		WARN_ON(mmui < 0);
-	}
-
-	/* Trigger a MMU fault for the engine.
-	 *
-	 * No good idea why this is needed, but nvgpu does something similar,
-	 * and it makes recovery from CTXSW_TIMEOUT a lot more reliable.
-	 */
-	if (mmui >= 0) {
-		nvkm_wr32(device, 0x002a30 + (engn * 0x04), 0x00000100 | mmui);
-
-		/* Wait for fault to trigger. */
-		nvkm_msec(device, 2000,
-			gk104_fifo_engine_status(fifo, engn, &status);
-			if (status.faulted)
-				break;
-		);
-
-		/* Release MMU fault trigger, and ACK the fault. */
-		nvkm_wr32(device, 0x002a30 + (engn * 0x04), 0x00000000);
-		nvkm_wr32(device, 0x00259c, BIT(mmui));
-		nvkm_wr32(device, 0x002100, 0x10000000);
-	}
-
-	/* Schedule recovery. */
-	nvkm_warn(subdev, "engine %d: scheduled for recovery\n", engn);
-	schedule_work(&fifo->recover.work);
-}
-
 static const struct nvkm_enum
 gk104_fifo_mmu_fault_engine[] = {
 	{ 0x00, "GR", NULL, NVKM_ENGINE_GR },
@@ -728,64 +597,6 @@ gk104_fifo_intr_bind(struct nvkm_fifo *fifo)
 	nvkm_error(subdev, "BIND_ERROR %02x [%s]\n", code, en ? en->name : "");
 }
 
-static const struct nvkm_enum
-gk104_fifo_sched_reason[] = {
-	{ 0x0a, "CTXSW_TIMEOUT" },
-	{}
-};
-
-static void
-gk104_fifo_intr_sched_ctxsw(struct gk104_fifo *fifo)
-{
-	struct nvkm_device *device = fifo->base.engine.subdev.device;
-	unsigned long flags, engm = 0;
-	u32 engn;
-
-	/* We need to ACK the SCHED_ERROR here, and prevent it reasserting,
-	 * as MMU_FAULT cannot be triggered while it's pending.
-	 */
-	spin_lock_irqsave(&fifo->base.lock, flags);
-	nvkm_mask(device, 0x002140, 0x00000100, 0x00000000);
-	nvkm_wr32(device, 0x002100, 0x00000100);
-
-	for (engn = 0; engn < fifo->engine_nr; engn++) {
-		struct gk104_fifo_engine_status status;
-
-		gk104_fifo_engine_status(fifo, engn, &status);
-		if (!status.busy || !status.chsw)
-			continue;
-
-		engm |= BIT(engn);
-	}
-
-	for_each_set_bit(engn, &engm, fifo->engine_nr)
-		gk104_fifo_recover_engn(fifo, engn);
-
-	nvkm_mask(device, 0x002140, 0x00000100, 0x00000100);
-	spin_unlock_irqrestore(&fifo->base.lock, flags);
-}
-
-static void
-gk104_fifo_intr_sched(struct gk104_fifo *fifo)
-{
-	struct nvkm_subdev *subdev = &fifo->base.engine.subdev;
-	struct nvkm_device *device = subdev->device;
-	u32 intr = nvkm_rd32(device, 0x00254c);
-	u32 code = intr & 0x000000ff;
-	const struct nvkm_enum *en =
-		nvkm_enum_find(gk104_fifo_sched_reason, code);
-
-	nvkm_error(subdev, "SCHED_ERROR %02x [%s]\n", code, en ? en->name : "");
-
-	switch (code) {
-	case 0x0a:
-		gk104_fifo_intr_sched_ctxsw(fifo);
-		break;
-	default:
-		break;
-	}
-}
-
 void
 gk104_fifo_intr_chsw(struct nvkm_fifo *fifo)
 {
@@ -840,7 +651,7 @@ gk104_fifo_intr(struct nvkm_inth *inth)
 	}
 
 	if (stat & 0x00000100) {
-		gk104_fifo_intr_sched(gk104_fifo(fifo));
+		gf100_fifo_intr_sched(fifo);
 		nvkm_wr32(device, 0x002100, 0x00000100);
 		stat &= ~0x00000100;
 	}
@@ -902,13 +713,6 @@ gk104_fifo_intr(struct nvkm_inth *inth)
 }
 
 void
-gk104_fifo_fini(struct nvkm_fifo *base)
-{
-	struct gk104_fifo *fifo = gk104_fifo(base);
-	flush_work(&fifo->recover.work);
-}
-
-void
 gk104_fifo_init_pbdmas(struct nvkm_fifo *fifo, u32 mask)
 {
 	struct nvkm_device *device = fifo->engine.subdev.device;
@@ -999,7 +803,6 @@ gk104_fifo_oneinit(struct nvkm_fifo *base)
 			continue;
 
 		fifo->engine[engn].engine = nvkm_device_engine(device, tdev->type, tdev->inst);
-		fifo->engine[engn].runl = tdev->runlist;
 		fifo->engine_nr = max(fifo->engine_nr, engn + 1);
 		fifo->runlist[tdev->runlist].engm |= BIT(engn);
 		fifo->runlist[tdev->runlist].engm_sw |= BIT(engn);
@@ -1064,7 +867,6 @@ gk104_fifo_new_(const struct gk104_fifo_func *func, struct nvkm_device *device,
 	if (!(fifo = kzalloc(sizeof(*fifo), GFP_KERNEL)))
 		return -ENOMEM;
 	fifo->func = func;
-	INIT_WORK(&fifo->recover.work, gk104_fifo_recover_work);
 	*pfifo = &fifo->base;
 
 	return nvkm_fifo_ctor(func, device, type, inst, &fifo->base);
@@ -1080,12 +882,11 @@ gk104_fifo = {
 	.runl_ctor = gk104_fifo_runl_ctor,
 	.init = gk104_fifo_init,
 	.init_pbdmas = gk104_fifo_init_pbdmas,
-	.fini = gk104_fifo_fini,
 	.intr = gk104_fifo_intr,
 	.intr_mmu_fault_unit = gf100_fifo_intr_mmu_fault_unit,
+	.intr_ctxsw_timeout = gf100_fifo_intr_ctxsw_timeout,
 	.mmu_fault = &gk104_fifo_mmu_fault,
 	.engine_id = gk104_fifo_engine_id,
-	.recover_chan = gk104_fifo_recover_chan,
 	.runlist = &gk104_fifo_runlist,
 	.nonstall = &gf100_fifo_nonstall,
 	.runl = &gk104_runl,
author	Ben Skeggs <bskeggs@redhat.com>	2022-06-01 20:47:34 +1000
committer	Ben Skeggs <bskeggs@redhat.com>	2022-11-09 10:44:49 +1000
commit	4d60100a23ec5b98e43277d82e5de53c359cf02c (patch)
tree	bad1fa3de47ec94ae17a7b39291d484a83310d1c /drivers/gpu/drm/nouveau/nvkm/engine/fifo/gk104.c
parent	0b1bb1296f288bb7164d143ca82dc958f87cbff6 (diff)