-rw-r--r--   kernel/sched/ext.c             | 130
-rw-r--r--   tools/sched_ext/scx_qmap.bpf.c |  22
2 files changed, 80 insertions, 72 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5e862c7c0e6b..be86dbfa75a8 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -630,11 +630,8 @@ enum scx_enq_flags {
	 * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
	 * %SCX_ENQ_LAST flag set.
	 *
-	 * If the BPF scheduler wants to continue executing the task,
-	 * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately.
-	 * If the task gets queued on a different dsq or the BPF side, the BPF
-	 * scheduler is responsible for triggering a follow-up scheduling event.
-	 * Otherwise, Execution may stall.
+	 * The BPF scheduler is responsible for triggering a follow-up
+	 * scheduling event. Otherwise, Execution may stall.
	 */
	SCX_ENQ_LAST		= 1LLU << 41,

@@ -1852,12 +1849,8 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
	if (!scx_rq_online(rq))
		goto local;

-	if (scx_ops_bypassing()) {
-		if (enq_flags & SCX_ENQ_LAST)
-			goto local;
-		else
-			goto global;
-	}
+	if (scx_ops_bypassing())
+		goto global;

	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
		goto direct;
@@ -1867,11 +1860,6 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
	    unlikely(p->flags & PF_EXITING))
		goto local;

-	/* see %SCX_OPS_ENQ_LAST */
-	if (!static_branch_unlikely(&scx_ops_enq_last) &&
-	    (enq_flags & SCX_ENQ_LAST))
-		goto local;
-
	if (!SCX_HAS_OP(enqueue))
		goto global;

@@ -2517,7 +2505,6 @@ static int balance_one(struct rq *rq, struct task_struct *prev, bool local)
	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
	bool prev_on_scx = prev->sched_class == &ext_sched_class;
	int nr_loops = SCX_DSP_MAX_LOOPS;
-	bool has_tasks = false;

	lockdep_assert_rq_held(rq);
	rq->scx.flags |= SCX_RQ_IN_BALANCE;
@@ -2542,9 +2529,9 @@ static int balance_one(struct rq *rq, struct task_struct *prev, bool local)
		/*
		 * If @prev is runnable & has slice left, it has priority and
		 * fetching more just increases latency for the fetched tasks.
-		 * Tell put_prev_task_scx() to put @prev on local_dsq. If the
-		 * BPF scheduler wants to handle this explicitly, it should
-		 * implement ->cpu_released().
+		 * Tell pick_next_task_scx() to keep running @prev. If the BPF
+		 * scheduler wants to handle this explicitly, it should
+		 * implement ->cpu_release().
		 *
		 * See scx_ops_disable_workfn() for the explanation on the
		 * bypassing test.
@@ -2570,7 +2557,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev, bool local)
		goto has_tasks;

	if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq))
-		goto out;
+		goto no_tasks;

	dspc->rq = rq;

@@ -2609,13 +2596,23 @@ static int balance_one(struct rq *rq, struct task_struct *prev, bool local)
		}
	} while (dspc->nr_tasks);

-	goto out;
+no_tasks:
+	/*
+	 * Didn't find another task to run. Keep running @prev unless
+	 * %SCX_OPS_ENQ_LAST is in effect.
+	 */
+	if ((prev->scx.flags & SCX_TASK_QUEUED) &&
+	    (!static_branch_unlikely(&scx_ops_enq_last) || scx_ops_bypassing())) {
+		if (local)
+			prev->scx.flags |= SCX_TASK_BAL_KEEP;
+		goto has_tasks;
+	}
+	rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
+	return false;

has_tasks:
-	has_tasks = true;
-out:
	rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
-	return has_tasks;
+	return true;
}

static int balance_scx(struct rq *rq, struct task_struct *prev,
@@ -2728,25 +2725,16 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
	if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED))
		SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true);

-	/*
-	 * If we're being called from put_prev_task_balance(), balance_scx() may
-	 * have decided that @p should keep running.
-	 */
-	if (p->scx.flags & SCX_TASK_BAL_KEEP) {
+	if (p->scx.flags & SCX_TASK_QUEUED) {
		p->scx.flags &= ~SCX_TASK_BAL_KEEP;
-		set_task_runnable(rq, p);
-		dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
-		return;
-	}
-
-	if (p->scx.flags & SCX_TASK_QUEUED) {
		set_task_runnable(rq, p);

		/*
-		 * If @p has slice left and balance_scx() didn't tag it for
-		 * keeping, @p is getting preempted by a higher priority
-		 * scheduler class or core-sched forcing a different task. Leave
-		 * it at the head of the local DSQ.
+		 * If @p has slice left and is being put, @p is getting
+		 * preempted by a higher priority scheduler class or core-sched
+		 * forcing a different task. Leave it at the head of the local
+		 * DSQ.
		 */
		if (p->scx.slice && !scx_ops_bypassing()) {
			dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
@@ -2754,18 +2742,17 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
		}

		/*
-		 * If we're in the pick_next_task path, balance_scx() should
-		 * have already populated the local DSQ if there are any other
-		 * available tasks. If empty, tell ops.enqueue() that @p is the
-		 * only one available for this cpu. ops.enqueue() should put it
-		 * on the local DSQ so that the subsequent pick_next_task_scx()
-		 * can find the task unless it wants to trigger a separate
-		 * follow-up scheduling event.
+		 * If @p is runnable but we're about to enter a lower
+		 * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell
+		 * ops.enqueue() that @p is the only one available for this cpu,
+		 * which should trigger an explicit follow-up scheduling event.
		 */
-		if (list_empty(&rq->scx.local_dsq.list))
+		if (sched_class_above(&ext_sched_class, next->sched_class)) {
+			WARN_ON_ONCE(!static_branch_unlikely(&scx_ops_enq_last));
			do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
-		else
+		} else {
			do_enqueue_task(rq, p, 0, -1);
+		}
	}
}

@@ -2780,27 +2767,33 @@ static struct task_struct *pick_next_task_scx(struct rq *rq,
{
	struct task_struct *p;

-	if (prev->sched_class == &ext_sched_class)
-		put_prev_task_scx(rq, prev, NULL);
-
-	p = first_local_task(rq);
-	if (!p)
-		return NULL;
-
-	if (prev->sched_class != &ext_sched_class)
-		prev->sched_class->put_prev_task(rq, prev, p);
-
-	set_next_task_scx(rq, p, true);
+	/*
+	 * If balance_scx() is telling us to keep running @prev, replenish slice
+	 * if necessary and keep running @prev. Otherwise, pop the first one
+	 * from the local DSQ.
+	 */
+	if (prev->scx.flags & SCX_TASK_BAL_KEEP) {
+		prev->scx.flags &= ~SCX_TASK_BAL_KEEP;
+		p = prev;
+		if (!p->scx.slice)
+			p->scx.slice = SCX_SLICE_DFL;
+	} else {
+		p = first_local_task(rq);
+		if (!p)
+			return NULL;

-	if (unlikely(!p->scx.slice)) {
-		if (!scx_ops_bypassing() && !scx_warned_zero_slice) {
-			printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
-					p->comm, p->pid);
-			scx_warned_zero_slice = true;
+		if (unlikely(!p->scx.slice)) {
+			if (!scx_ops_bypassing() && !scx_warned_zero_slice) {
+				printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
+						p->comm, p->pid);
+				scx_warned_zero_slice = true;
+			}
+			p->scx.slice = SCX_SLICE_DFL;
		}
-		p->scx.slice = SCX_SLICE_DFL;
	}

+	put_prev_set_next_task(rq, prev, p);
+
	return p;
}

@@ -3875,12 +3868,13 @@ bool task_should_scx(struct task_struct *p)
 * to force global FIFO scheduling.
 *
 * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
+ *    %SCX_OPS_ENQ_LAST is also ignored.
 *
 * b. ops.dispatch() is ignored.
 *
- * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be
- *    trusted. Whenever a tick triggers, the running task is rotated to the tail
- *    of the queue with core_sched_at touched.
+ * c. balance_scx() does not set %SCX_TASK_BAL_KEEP on non-zero slice as slice
+ *    can't be trusted. Whenever a tick triggers, the running task is rotated to
+ *    the tail of the queue with core_sched_at touched.
 *
 * d. pick_next_task() suppresses zero slice warning.
 *
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 892278f12dce..7e7f28f4117e 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -205,8 +205,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)

	/*
	 * All enqueued tasks must have their core_sched_seq updated for correct
-	 * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in
-	 * qmap_ops.flags.
+	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
	 */
	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

@@ -214,7 +213,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
	 * If qmap_select_cpu() is telling us to or this is the last runnable
	 * task on the CPU, enqueue locally.
	 */
-	if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) {
+	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
@@ -285,6 +284,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
+	struct task_ctx *tctx;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;
@@ -349,6 +349,21 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
		cpuc->dsp_cnt = 0;
	}
+
+	/*
+	 * No other tasks. @prev will keep running. Update its core_sched_seq as
+	 * if the task were enqueued and dispatched immediately.
+	 */
+	if (prev) {
+		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
+		if (!tctx) {
+			scx_bpf_error("task_ctx lookup failed");
+			return;
+		}
+
+		tctx->core_sched_seq =
+			core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
+	}
}

void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
@@ -701,6 +716,5 @@ SCX_OPS_DEFINE(qmap_ops,
	       .cpu_offline		= (void *)qmap_cpu_offline,
	       .init			= (void *)qmap_init,
	       .exit			= (void *)qmap_exit,
-	       .flags			= SCX_OPS_ENQ_LAST,
	       .timeout_ms		= 5000U,
	       .name			= "qmap");
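For illustration, the following is a minimal sketch of what the relaxed %SCX_ENQ_LAST contract means for a BPF scheduler that does not set %SCX_OPS_ENQ_LAST. It is modeled on tools/sched_ext/scx_simple.bpf.c rather than taken from this patch; the minimal_* names and the SHARED_DSQ id are hypothetical, while the scx_bpf_*() kfuncs are used with the same signatures as in scx_qmap.bpf.c above.

#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

#define SHARED_DSQ 0

s32 BPF_STRUCT_OPS_SLEEPABLE(minimal_init)
{
	/* One shared DSQ that every CPU dispatches to and consumes from. */
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

void BPF_STRUCT_OPS(minimal_enqueue, struct task_struct *p, u64 enq_flags)
{
	/*
	 * Without %SCX_OPS_ENQ_LAST in ops.flags, this callback never sees
	 * %SCX_ENQ_LAST and needs no local-dispatch special case for the
	 * last runnable task on a CPU: a plain shared-DSQ dispatch suffices.
	 */
	scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
}

void BPF_STRUCT_OPS(minimal_dispatch, s32 cpu, struct task_struct *prev)
{
	/*
	 * If this consumes nothing and @prev is still runnable, the new
	 * no_tasks: path in balance_one() sets %SCX_TASK_BAL_KEEP and
	 * pick_next_task_scx() keeps running @prev with a replenished
	 * slice, instead of the CPU stalling.
	 */
	scx_bpf_consume(SHARED_DSQ);
}

SCX_OPS_DEFINE(minimal_ops,
	       .enqueue			= (void *)minimal_enqueue,
	       .dispatch		= (void *)minimal_dispatch,
	       .init			= (void *)minimal_init,
	       .name			= "minimal");

Schedulers that do set %SCX_OPS_ENQ_LAST keep the old responsibility: ops.enqueue() is called with %SCX_ENQ_LAST for the final runnable task and must trigger a follow-up scheduling event, as the updated comment on %SCX_ENQ_LAST spells out.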