-rw-r--r--  Documentation/accounting/delay-accounting.rst  12
-rw-r--r--  Documentation/admin-guide/hw-vuln/core-scheduling.rst  223
-rw-r--r--  Documentation/admin-guide/hw-vuln/index.rst  1
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt  2
-rw-r--r--  Documentation/admin-guide/sysctl/kernel.rst  7
-rw-r--r--  arch/alpha/kernel/smp.c  1
-rw-r--r--  arch/arc/kernel/smp.c  1
-rw-r--r--  arch/arm/kernel/smp.c  1
-rw-r--r--  arch/arm64/include/asm/preempt.h  2
-rw-r--r--  arch/arm64/kernel/smp.c  1
-rw-r--r--  arch/arm64/kvm/Kconfig  5
-rw-r--r--  arch/csky/kernel/smp.c  1
-rw-r--r--  arch/ia64/kernel/smpboot.c  1
-rw-r--r--  arch/mips/kernel/smp.c  1
-rw-r--r--  arch/openrisc/kernel/smp.c  2
-rw-r--r--  arch/parisc/kernel/smp.c  1
-rw-r--r--  arch/powerpc/kernel/smp.c  1
-rw-r--r--  arch/riscv/kernel/smpboot.c  1
-rw-r--r--  arch/s390/include/asm/preempt.h  4
-rw-r--r--  arch/s390/kernel/smp.c  1
-rw-r--r--  arch/sh/kernel/smp.c  2
-rw-r--r--  arch/sparc/kernel/smp_32.c  1
-rw-r--r--  arch/sparc/kernel/smp_64.c  3
-rw-r--r--  arch/x86/include/asm/preempt.h  2
-rw-r--r--  arch/x86/kernel/smpboot.c  1
-rw-r--r--  arch/x86/kvm/Kconfig  5
-rw-r--r--  arch/xtensa/kernel/smp.c  1
-rw-r--r--  drivers/cpuidle/governors/menu.c  6
-rw-r--r--  drivers/thermal/cpufreq_cooling.c  2
-rw-r--r--  fs/proc/loadavg.c  2
-rw-r--r--  fs/proc/stat.c  4
-rw-r--r--  include/asm-generic/preempt.h  2
-rw-r--r--  include/linux/delayacct.h  22
-rw-r--r--  include/linux/energy_model.h  16
-rw-r--r--  include/linux/kthread.h  2
-rw-r--r--  include/linux/sched.h  19
-rw-r--r--  include/linux/sched/cpufreq.h  2
-rw-r--r--  include/linux/sched/stat.h  16
-rw-r--r--  include/linux/sched_clock.h  2
-rw-r--r--  include/uapi/linux/prctl.h  8
-rw-r--r--  init/main.c  17
-rw-r--r--  kernel/Kconfig.preempt  20
-rw-r--r--  kernel/delayacct.c  71
-rw-r--r--  kernel/fork.c  8
-rw-r--r--  kernel/kthread.c  30
-rw-r--r--  kernel/sched/Makefile  1
-rw-r--r--  kernel/sched/core.c  990
-rw-r--r--  kernel/sched/core_sched.c  229
-rw-r--r--  kernel/sched/cpuacct.c  12
-rw-r--r--  kernel/sched/cpufreq_schedutil.c  1
-rw-r--r--  kernel/sched/deadline.c  38
-rw-r--r--  kernel/sched/debug.c  4
-rw-r--r--  kernel/sched/fair.c  415
-rw-r--r--  kernel/sched/idle.c  13
-rw-r--r--  kernel/sched/isolation.c  4
-rw-r--r--  kernel/sched/loadavg.c  2
-rw-r--r--  kernel/sched/pelt.h  2
-rw-r--r--  kernel/sched/rt.c  31
-rw-r--r--  kernel/sched/sched.h  436
-rw-r--r--  kernel/sched/stats.h  66
-rw-r--r--  kernel/sched/stop_task.c  14
-rw-r--r--  kernel/sched/topology.c  4
-rw-r--r--  kernel/smpboot.c  1
-rw-r--r--  kernel/sys.c  5
-rw-r--r--  kernel/sysctl.c  12
-rw-r--r--  lib/smp_processor_id.c  6
-rw-r--r--  tools/include/uapi/linux/prctl.h  8
-rw-r--r--  tools/testing/selftests/sched/.gitignore  1
-rw-r--r--  tools/testing/selftests/sched/Makefile  14
-rw-r--r--  tools/testing/selftests/sched/config  1
-rw-r--r--  tools/testing/selftests/sched/cs_prctl_test.c  338
71 files changed, 2700 insertions, 481 deletions
diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst
index 7cc7f5852da0..1b8b46deeb29 100644
--- a/Documentation/accounting/delay-accounting.rst
+++ b/Documentation/accounting/delay-accounting.rst
@@ -69,13 +69,15 @@ Compile the kernel with::
CONFIG_TASK_DELAY_ACCT=y
CONFIG_TASKSTATS=y
-Delay accounting is enabled by default at boot up.
-To disable, add::
+Delay accounting is disabled by default at boot up.
+To enable, add::
- nodelayacct
+ delayacct
-to the kernel boot options. The rest of the instructions
-below assume this has not been done.
+to the kernel boot options. The rest of the instructions below assume this has
+been done. Alternatively, use sysctl kernel.task_delayacct to switch the state
+at runtime. Note however that only tasks started after enabling it will have
+delayacct information.
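+
+As a hypothetical illustration (assuming the standard /proc/sys mapping of the
+``kernel.task_delayacct`` sysctl), the runtime switch could also be flipped
+from C; this is a minimal, untested sketch::
+
+  #include <stdio.h>
+
+  int main(void)
+  {
+          /* Writing 1 enables delay accounting, 0 disables it. */
+          FILE *f = fopen("/proc/sys/kernel/task_delayacct", "w");
+
+          if (!f || fputs("1\n", f) == EOF) {
+                  perror("task_delayacct");
+                  return 1;
+          }
+          return fclose(f) ? 1 : 0;
+  }
+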
After the system has booted up, use a utility
similar to getdelays.c to access the delays
diff --git a/Documentation/admin-guide/hw-vuln/core-scheduling.rst b/Documentation/admin-guide/hw-vuln/core-scheduling.rst
new file mode 100644
index 000000000000..7b410aef9c5c
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/core-scheduling.rst
@@ -0,0 +1,223 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Core Scheduling
+===============
+Core scheduling support allows userspace to define groups of tasks that can
+share a core. These groups can be specified either for security use cases (one
+group of tasks doesn't trust another), or for performance use cases (some
+workloads may benefit from running on the same core because they don't compete
+for the shared core's hardware resources, or may prefer different cores if
+they do). This document only describes the security use case.
+
+Security usecase
+----------------
+A cross-HT attack involves the attacker and victim running on different
+Hyper Threads of the same core. MDS and L1TF are examples of such attacks. The
+only full mitigation of cross-HT attacks is to disable Hyper Threading (HT).
+Core scheduling is a scheduler feature that can mitigate some (not all)
+cross-HT attacks. It allows HT to be turned on safely by ensuring that only
+tasks in a user-designated trusted group can share a core. This increase in
+core sharing can also improve performance. Performance is not guaranteed to
+improve, though it has been seen to with a number of real-world workloads. In
+theory, core scheduling aims to perform at least as well as when Hyper
+Threading is disabled. In practice, this is mostly the case, though not
+always: synchronizing scheduling decisions across two or more CPUs in a core
+involves additional overhead, especially when the system is lightly loaded.
+When ``total_threads <= N_CPUS/2`` (where N_CPUS is the total number of CPUs),
+the extra overhead may cause core scheduling to perform worse than with SMT
+disabled. Always measure the performance of your workloads.
+
+Usage
+-----
+Core scheduling support is enabled via the ``CONFIG_SCHED_CORE`` config option.
+Using this feature, userspace defines groups of tasks that can be co-scheduled
+on the same core. The core scheduler uses this information to make sure that
+tasks that are not in the same group never run simultaneously on a core, while
+doing its best to satisfy the system's scheduling requirements.
+
+Core scheduling can be enabled via the ``PR_SCHED_CORE`` prctl interface.
+This interface provides support for the creation of core scheduling groups, as
+well as admission and removal of tasks from created groups::
+
+ #include <sys/prctl.h>
+
+ int prctl(int option, unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5);
+
+option:
+ ``PR_SCHED_CORE``
+
+arg2:
+  Command for the operation; must be one of:
+
+ - ``PR_SCHED_CORE_GET`` -- get core_sched cookie of ``pid``.
+ - ``PR_SCHED_CORE_CREATE`` -- create a new unique cookie for ``pid``.
+ - ``PR_SCHED_CORE_SHARE_TO`` -- push core_sched cookie to ``pid``.
+ - ``PR_SCHED_CORE_SHARE_FROM`` -- pull core_sched cookie from ``pid``.
+
+arg3:
+ ``pid`` of the task for which the operation applies.
+
+arg4:
+ ``pid_type`` for which the operation applies. It is of type ``enum pid_type``.
+ For example, if arg4 is ``PIDTYPE_TGID``, then the operation of this command
+ will be performed for all tasks in the task group of ``pid``.
+
+arg5:
+ userspace pointer to an unsigned long for storing the cookie returned by
+ ``PR_SCHED_CORE_GET`` command. Should be 0 for all other commands.
+
+In order to push a cookie to, or pull a cookie from, another process, the
+calling process must have ptrace access (``PTRACE_MODE_READ_REALCREDS``) to
+that process.
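+
+As a minimal (untested) sketch, the following program creates a cookie for the
+calling process and reads it back. The fallback defines mirror
+``include/uapi/linux/prctl.h``; the ``PIDTYPE_PID`` value (0) mirrors the
+kernel's internal ``enum pid_type`` and is an assumption here::
+
+  #include <stdio.h>
+  #include <unistd.h>
+  #include <sys/prctl.h>
+
+  #ifndef PR_SCHED_CORE
+  # define PR_SCHED_CORE            62
+  # define PR_SCHED_CORE_GET        0
+  # define PR_SCHED_CORE_CREATE     1
+  # define PR_SCHED_CORE_SHARE_TO   2
+  # define PR_SCHED_CORE_SHARE_FROM 3
+  #endif
+  #define PIDTYPE_PID 0 /* assumption: mirrors kernel enum pid_type */
+
+  int main(void)
+  {
+          unsigned long cookie = 0;
+
+          /* Create a new unique cookie for this task. */
+          if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, getpid(),
+                    PIDTYPE_PID, 0))
+                  perror("PR_SCHED_CORE_CREATE");
+
+          /* For _GET, arg5 points to an unsigned long for the cookie. */
+          if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET, getpid(),
+                    PIDTYPE_PID, (unsigned long)&cookie))
+                  perror("PR_SCHED_CORE_GET");
+
+          printf("core sched cookie: %#lx\n", cookie);
+          return 0;
+  }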
+
+Building hierarchies of tasks
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The simplest way to build hierarchies of threads/processes which share a
+cookie and thus a core is to rely on the fact that the core-sched cookie is
+inherited across forks/clones and execs: setting a cookie for the 'initial'
+script/executable/daemon will place every spawned child in the same
+core-sched group.
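+
+As a hypothetical sketch (reusing the defines from the sketch above, and
+assuming that a ``pid`` of 0 refers to the calling task), a wrapper that tags
+itself before exec'ing the real workload places everything it spawns into one
+group::
+
+  #include <unistd.h>
+  #include <sys/prctl.h>
+
+  int main(int argc, char **argv)
+  {
+          if (argc < 2)
+                  return 1;
+
+          /* Tag ourselves; the cookie survives the exec below. */
+          prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, PIDTYPE_PID, 0);
+
+          /* Every child of the new image inherits the cookie. */
+          return execv(argv[1], &argv[1]);
+  }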
+
+Cookie Transferral
+~~~~~~~~~~~~~~~~~~
+Transferring a cookie between the current and other tasks is possible using
+``PR_SCHED_CORE_SHARE_FROM`` and ``PR_SCHED_CORE_SHARE_TO`` to inherit a
+cookie from a specified task or to share a cookie with a task. In combination,
+this allows a simple helper program to pull a cookie from a task in an
+existing core scheduling group and share it with already running tasks, as
+sketched below.
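+
+Such a helper could be sketched as follows (hypothetical; it reuses the
+defines and headers of the first sketch, and trims error reporting)::
+
+  /* Join @target_pid to the core scheduling group of @donor_pid. */
+  static int share_cookie(pid_t donor_pid, pid_t target_pid)
+  {
+          /* Pull the donor's cookie onto the calling helper... */
+          if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM, donor_pid,
+                    PIDTYPE_PID, 0))
+                  return -1;
+
+          /* ...then push it to the already running target. */
+          return prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO, target_pid,
+                       PIDTYPE_PID, 0);
+  }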
+
+Design/Implementation
+---------------------
+Each task that is tagged is assigned a cookie internally in the kernel. As
+mentioned in `Usage`_, tasks with the same cookie value are assumed to trust
+each other and share a core.
+
+The basic idea is that every schedule event tries to select tasks for all the
+siblings of a core such that all the selected tasks running on a core are
+trusted (same cookie) at any point in time. Kernel threads are assumed trusted.
+The idle task is considered special, as it trusts everything and everything
+trusts it.
+
+During a schedule() event on any sibling of a core, the highest priority task
+on the sibling's core is picked and assigned to the sibling calling
+schedule(), if the sibling has the task enqueued. For the rest of the siblings
+in the core, the highest priority task with the same cookie is selected if
+there is one runnable in their individual run queues. If a task with the same
+cookie is not available, the idle task is selected; the idle task is globally
+trusted.
+
+Once a task has been selected for all the siblings in the core, an IPI is sent
+to the siblings for whom a new task was selected. On receiving the IPI, the
+siblings switch to the new task immediately. If an idle task is selected for a
+sibling, then the sibling is considered to be in a `forced idle` state, i.e.,
+it may have tasks on its own runqueue to run, yet it still has to run idle.
+More on this in the next section.
+
+Forced-idling of hyperthreads
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The scheduler tries its best to find tasks that trust each other such that all
+tasks selected to be scheduled are of the highest priority in a core. However,
+it is possible that some runqueues have tasks that are incompatible with the
+highest priority ones in the core. Favoring security over fairness, one or
+more siblings could be forced to select a lower priority task if the highest
+priority task is not trusted with respect to the core-wide highest priority
+task. If a sibling does not have a trusted task to run, it will be forced idle
+by the scheduler (the idle thread is scheduled to run).
+
+When the highest priority task is selected to run, a reschedule-IPI is sent to
+the sibling to force it into idle. This results in 4 cases which need to be
+considered depending on whether a VM or a regular usermode process was running
+on either HT::
+
+ HT1 (attack) HT2 (victim)
+ A idle -> user space user space -> idle
+ B idle -> user space guest -> idle
+ C idle -> guest user space -> idle
+ D idle -> guest guest -> idle
+
+Note that for better performance, we do not wait for the destination CPU
+(victim) to enter idle mode. This is because sending the IPI brings the
+destination CPU immediately into kernel mode from user space, or causes a
+VMEXIT in the case of guests. At worst, this would only leak some scheduler
+metadata, which may not be worth protecting. It is also possible that the IPI
+is received too late on some architectures, but this has not been observed in
+the case of x86.
+
+Trust model
+~~~~~~~~~~~
+Core scheduling maintains trust relationships amongst groups of tasks by
+assigning them a tag that is the same cookie value.
+When a system with core scheduling boots, all tasks are considered to trust
+each other. This is because the core scheduler does not have information about
+trust relationships until userspace uses the above-mentioned interfaces to
+communicate them. In other words, all tasks have a default cookie value of 0
+and are considered system-wide trusted. The forced idling of siblings running
+cookie-0 tasks is also avoided.
+
+Once userspace uses the above mentioned interfaces to group sets of tasks, tasks
+within such groups are considered to trust each other, but do not trust those
+outside. Tasks outside the group also don't trust tasks within.
+
+Limitations of core-scheduling
+------------------------------
+Core scheduling tries to guarantee that only trusted tasks run concurrently on
+a core. But there could be a small window of time during which untrusted tasks
+run concurrently, or during which the kernel runs concurrently with a task it
+does not trust.
+
+IPI processing delays
+~~~~~~~~~~~~~~~~~~~~~
+Core scheduling selects only trusted tasks to run together. An IPI is used to
+notify the siblings to switch to the new task. But there could be hardware
+delays in receiving the IPI on some architectures (on x86, this has not been
+observed). This may cause an attacker task to start running on a CPU before
+its siblings receive the IPI. Even though the cache is flushed on entry to
+user mode, victim tasks on siblings may populate data in the cache and
+microarchitectural buffers after the attacker starts to run, creating a
+possibility of data leakage.
+
+Open cross-HT issues that core scheduling does not solve
+--------------------------------------------------------
+1. For MDS
+~~~~~~~~~~
+Core scheduling cannot protect against MDS attacks between an HT running in
+user mode and another running in kernel mode. Even though both HTs run tasks
+which trust each other, kernel memory is still considered untrusted. Such
+attacks are possible for any combination of sibling CPU modes (host or guest mode).
+
+2. For L1TF
+~~~~~~~~~~~
+Core scheduling cannot protect against an L1TF guest attacker exploiting a
+guest or host victim. This is because the guest attacker can craft invalid
+PTEs which are not inverted due to a vulnerable guest kernel. The only
+solution is to disable EPT (Extended Page Tables).
+
+For both MDS and L1TF, if the guest's vCPUs are configured not to trust each
+other (by tagging them separately), then guest-to-guest attacks would go away.
+Alternatively, a system admin policy could consider guest-to-guest attacks a
+guest problem.
+
+Another approach to resolving these would be to make every untrusted task on
+the system distrust every other untrusted task. While this could reduce the
+parallelism of the untrusted tasks, it would still solve the above issues
+while allowing system processes (trusted tasks) to share a core.
+
+3. Protecting the kernel (IRQ, syscall, VMEXIT)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Unfortunately, core scheduling does not protect kernel contexts running on
+sibling hyperthreads from one another. Prototypes of mitigations have been
+posted to LKML to solve this, but it is debatable whether such windows are
+practically exploitable, and whether the performance overhead of the
+prototypes is worth it (not to mention the added code complexity).
+
+Other Use cases
+---------------
+The main use case for core scheduling is mitigating cross-HT vulnerabilities
+with SMT enabled. There are other use cases where this feature could be used:
+
+- Isolating tasks that need a whole core: examples include realtime tasks and
+  tasks that use SIMD instructions.
+- Gang scheduling: requirements for a group of tasks that need to be scheduled
+  together could also be realized using core scheduling. One example is the
+  vCPUs of a VM.
diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst
index ca4dbdd9016d..f12cda55538b 100644
--- a/Documentation/admin-guide/hw-vuln/index.rst
+++ b/Documentation/admin-guide/hw-vuln/index.rst
@@ -15,3 +15,4 @@ are configurable at compile, boot or run time.
tsx_async_abort
multihit.rst
special-register-buffer-data-sampling.rst
+ core-scheduling.rst
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index cb89dbdedc46..ef5048c127a3 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3244,7 +3244,7 @@
noclflush [BUGS=X86] Don't use the CLFLUSH instruction
- nodelayacct [KNL] Disable per-task delay accounting
+ delayacct [KNL] Enable per-task delay accounting
nodsp [SH] Disable hardware DSP at boot time.
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 68b21395a743..0ef05750dadc 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -1088,6 +1088,13 @@ Model available). If your platform happens to meet the
requirements for EAS but you do not want to use it, change
this value to 0.
+task_delayacct
+===============
+
+Enables/disables task delay accounting (see
+:doc:`accounting/delay-accounting.rst`). Enabling this feature incurs
+a small amount of overhead in the scheduler but is useful for debugging
+and performance tuning. It is required by some tools such as iotop.
sched_schedstats
================
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index f4dd9f3f3001..4b2575f936d4 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -166,7 +166,6 @@ smp_callin(void)
DBGS(("smp_callin: commencing CPU %d current %p active_mm %p\n",
cpuid, current, current->active_mm));
- preempt_disable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c
index 52906d314537..db0e104d6835 100644
--- a/arch/arc/kernel/smp.c
+++ b/arch/arc/kernel/smp.c
@@ -189,7 +189,6 @@ void start_kernel_secondary(void)
pr_info("## CPU%u LIVE ##: Executing Code...\n", cpu);
local_irq_enable();
- preempt_disable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 74679240a9d8..c7bb168b0d97 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -432,7 +432,6 @@ asmlinkage void secondary_start_kernel(void)
#endif
pr_debug("CPU%u: Booted secondary processor\n", cpu);
- preempt_disable();
trace_hardirqs_off();
/*
diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h
index 80e946b2abee..e83f0982b99c 100644
--- a/arch/arm64/include/asm/preempt.h
+++ b/arch/arm64/include/asm/preempt.h
@@ -23,7 +23,7 @@ static inline void preempt_count_set(u64 pc)
} while (0)
#define init_idle_preempt_count(p, cpu) do { \
- task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \
+ task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
} while (0)
static inline void set_preempt_need_resched(void)
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index dcd7041b2b07..6671000a8b7d 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -224,7 +224,6 @@ asmlinkage notrace void secondary_start_kernel(void)
init_gic_priority_masking();
rcu_cpu_starting(cpu);
- preempt_disable();
trace_hardirqs_off();
/*
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 3964acf5451e..a4eba0908bfa 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -20,8 +20,6 @@ if VIRTUALIZATION
menuconfig KVM
bool "Kernel-based Virtual Machine (KVM) support"
depends on OF
- # for TASKSTATS/TASK_DELAY_ACCT:
- depends on NET && MULTIUSER
select MMU_NOTIFIER
select PREEMPT_NOTIFIERS
select HAVE_KVM_CPU_RELAX_INTERCEPT
@@ -38,8 +36,7 @@ menuconfig KVM
select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_VCPU_RUN_PID_CHANGE
- select TASKSTATS
- select TASK_DELAY_ACCT
+ select SCHED_INFO
help
Support hosting virtualized guest machines.
diff --git a/arch/csky/kernel/smp.c b/arch/csky/kernel/smp.c
index 0f9f5eef9338..e2993539af8e 100644
--- a/arch/csky/kernel/smp.c
+++ b/arch/csky/kernel/smp.c
@@ -281,7 +281,6 @@ void csky_start_secondary(void)
pr_info("CPU%u Online: %s...\n", cpu, __func__);
local_irq_enable();
- preempt_disable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 49b488580939..d10f780c13b9 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -441,7 +441,6 @@ start_secondary (void *unused)
#endif
efi_map_pal_code();
cpu_init();
- preempt_disable();
smp_callin();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index ef86fbad8546..d542fb7af3ba 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -348,7 +348,6 @@ asmlinkage void start_secondary(void)
*/
calibrate_delay();
- preempt_disable();
cpu = smp_processor_id();
cpu_data[cpu].udelay_val = loops_per_jiffy;
diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c
index 48e1092a64de..415e209732a3 100644
--- a/arch/openrisc/kernel/smp.c
+++ b/arch/openrisc/kernel/smp.c
@@ -145,8 +145,6 @@ asmlinkage __init void secondary_start_kernel(void)
set_cpu_online(cpu, true);
local_irq_enable();
-
- preempt_disable();
/*
* OK, it's off to the idle thread for us
*/
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 10227f667c8a..1405b603b91b 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -302,7 +302,6 @@ void __init smp_callin(unsigned long pdce_proc)
#endif
smp_cpu_init(slave_id);
- preempt_disable();
flush_cache_all_local(); /* start with known state */
flush_tlb_all_local(NULL);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 2e05c783440a..6c6e4d934d86 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1547,7 +1547,6 @@ void start_secondary(void *unused)
smp_store_cpu_info(cpu);
set_dec(tb_ticks_per_jiffy);
rcu_cpu_starting(cpu);
- preempt_disable();
cpu_callin_map[cpu] = 1;
if (smp_ops->setup_cpu)
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index 9a408e2942ac..bd82375db51a 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -180,7 +180,6 @@ asmlinkage __visible void smp_callin(void)
* Disable preemption before enabling interrupts, so we don't try to
* schedule a CPU that hasn't actually started yet.
*/
- preempt_disable();
local_irq_enable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h
index b49e0492842c..23ff51be7e29 100644
--- a/arch/s390/include/asm/preempt.h
+++ b/arch/s390/include/asm/preempt.h
@@ -32,7 +32,7 @@ static inline void preempt_count_set(int pc)
#define init_task_preempt_count(p) do { } while (0)
#define init_idle_preempt_count(p, cpu) do { \
- S390_lowcore.preempt_count = PREEMPT_ENABLED; \
+ S390_lowcore.preempt_count = PREEMPT_DISABLED; \
} while (0)
static inline void set_preempt_need_resched(void)
@@ -91,7 +91,7 @@ static inline void preempt_count_set(int pc)
#define init_task_preempt_count(p) do { } while (0)
#define init_idle_preempt_count(p, cpu) do { \
- S390_lowcore.preempt_count = PREEMPT_ENABLED; \
+ S390_lowcore.preempt_count = PREEMPT_DISABLED; \
} while (0)
static inline void set_preempt_need_resched(void)
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 2fec2b80d35d..111909aeb8d2 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -878,7 +878,6 @@ static void smp_init_secondary(void)
restore_access_regs(S390_lowcore.access_regs_save_area);
cpu_init();
rcu_cpu_starting(cpu);
- preempt_disable();
init_cpu_timer();
vtime_init();
vdso_getcpu_init();
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index 372acdc9033e..65924d9ec245 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -186,8 +186,6 @@ asmlinkage void start_secondary(void)
per_cpu_trap_init();
- preempt_disable();
-
notify_cpu_starting(cpu);
local_irq_enable();
diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index 50c127ab46d5..22b148e5a5f8 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -348,7 +348,6 @@ static void sparc_start_secondary(void *arg)
*/
arch_cpu_pre_starting(arg);
- preempt_disable();
cpu = smp_processor_id();
notify_cpu_starting(cpu);
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index e38d8bf454e8..ae5faa1d989d 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -138,9 +138,6 @@ void smp_callin(void)
set_cpu_online(cpuid, true);
- /* idle thread is expected to have preempt disabled */
- preempt_disable();
-
local_irq_enable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index f8cb8af4de5c..fe5efbcba824 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -44,7 +44,7 @@ static __always_inline void preempt_count_set(int pc)
#define init_task_preempt_count(p) do { } while (0)
#define init_idle_preempt_count(p, cpu) do { \
- per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
+ per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \
} while (0)
/*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7770245cc7fa..ec2d64aa2163 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -236,7 +236,6 @@ static void notrace start_secondary(void *unused)
cpu_init();
rcu_cpu_starting(raw_smp_processor_id());
x86_cpuinit.early_percpu_clock_init();
- preempt_disable();
smp_callin();
enable_start_cpu0 = 0;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index f6b93a35ce14..fb8efb387aff 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -22,8 +22,6 @@ config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
depends on HIGH_RES_TIMERS
- # for TASKSTATS/TASK_DELAY_ACCT:
- depends on NET && MULTIUSER
depends on X86_LOCAL_APIC
select PREEMPT_NOTIFIERS
select MMU_NOTIFIER
@@ -36,8 +34,7 @@ config KVM
select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
- select TASKSTATS
- select TASK_DELAY_ACCT
+ select SCHED_INFO
select PERF_EVENTS
select HAVE_KVM_MSI
select HAVE_KVM_CPU_RELAX_INTERCEPT
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index cd85a7a2722b..1254da07ead1 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -145,7 +145,6 @@ void secondary_start_kernel(void)
cpumask_set_cpu(cpu, mm_cpumask(mm));
enter_lazy_tlb(mm, current);
- preempt_disable();
trace_hardirqs_off();
calibrate_delay();
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index c3aa8d6ccee3..2e5670446991 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -117,7 +117,7 @@ struct menu_device {
int interval_ptr;
};
-static inline int which_bucket(u64 duration_ns, unsigned long nr_iowaiters)
+static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
{
int bucket = 0;
@@ -150,7 +150,7 @@ static inline int which_bucket(u64 duration_ns, unsigned long nr_iowaiters)
* to be, the higher this multiplier, and thus the higher
* the barrier to go to an expensive C state.
*/
-static inline int performance_multiplier(unsigned long nr_iowaiters)
+static inline int performance_multiplier(unsigned int nr_iowaiters)
{
/* for IO wait tasks (per cpu!) we add 10x each */
return 1 + 10 * nr_iowaiters;
@@ -270,7 +270,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
unsigned int predicted_us;
u64 predicted_ns;
u64 interactivity_req;
- unsigned long nr_iowaiters;
+ unsigned int nr_iowaiters;
ktime_t delta, delta_tick;
int i, idx;
diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index eeb4e4b76c0b..43b1ae8a7789 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -478,7 +478,7 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
ret = freq_qos_update_request(&cpufreq_cdev->qos_req, frequency);
if (ret >= 0) {
cpufreq_cdev->cpufreq_state = state;
- cpus = cpufreq_cdev->policy->cpus;
+ cpus = cpufreq_cdev->policy->related_cpus;
max_capacity = arch_scale_cpu_capacity(cpumask_first(cpus));
capacity = frequency * max_capacity;
capacity /= cpufreq_cdev->policy->cpuinfo.max_freq;
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 8468baee951d..f32878d9a39f 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -16,7 +16,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
get_avenrun(avnrun, FIXED_1/200, 0);
- seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+ seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %u/%d %d\n",
LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index f25e8531fd27..6561a06ef905 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -200,8 +200,8 @@ static int show_stat(struct seq_file *p, void *v)
"\nctxt %llu\n"
"btime %llu\n"
"processes %lu\n"
- "procs_running %lu\n"
- "procs_blocked %lu\n",
+ "procs_running %u\n"
+ "procs_blocked %u\n",
nr_context_switches(),
(unsigned long long)boottime.tv_sec,
total_forks,
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index d683f5e6d791..b4d43a4af5f7 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -29,7 +29,7 @@ static __always_inline void preempt_count_set(int pc)
} while (0)
#define init_idle_preempt_count(p, cpu) do { \
- task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \
+ task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
} while (0)
static __always_inline void set_preempt_need_resched(void)
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 21651f946751..af7e6eb50283 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -58,16 +58,22 @@ struct task_delay_info {
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/jump_label.h>
#ifdef CONFIG_TASK_DELAY_ACCT
+DECLARE_STATIC_KEY_FALSE(delayacct_key);
extern int delayacct_on; /* Delay accounting turned on/off */
extern struct kmem_cache *delayacct_cache;
extern void delayacct_init(void);
+
+extern int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
+
extern void __delayacct_tsk_init(struct task_struct *);
extern void __delayacct_tsk_exit(struct task_struct *);
extern void __delayacct_blkio_start(void);
extern void __delayacct_blkio_end(struct task_struct *);
-extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *);
+extern int delayacct_add_tsk(struct taskstats *, struct task_struct *);
extern __u64 __delayacct_blkio_ticks(struct task_struct *);
extern void __delayacct_freepages_start(void);
extern void __delayacct_freepages_end(void);
@@ -114,6 +120,9 @@ static inline void delayacct_tsk_free(struct task_struct *tsk)
static inline void delayacct_blkio_start(void)
{
+ if (!static_branch_unlikely(&delayacct_key))
+ return;
+
delayacct_set_flag(current, DELAYACCT_PF_BLKIO);
if (current->delays)
__delayacct_blkio_start();
@@ -121,19 +130,14 @@ static inline void delayacct_blkio_start(void)
static inline void delayacct_blkio_end(struct task_struct *p)
{
+ if (!static_branch_unlikely(&delayacct_key))
+ return;
+
if (p->delays)
__delayacct_blkio_end(p);
delayacct_clear_flag(p, DELAYACCT_PF_BLKIO);
}
-static inline int delayacct_add_tsk(struct taskstats *d,
- struct task_struct *tsk)
-{
- if (!delayacct_on || !tsk->delays)
- return 0;
- return __delayacct_add_tsk(d, tsk);
-}
-
static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
{
if (tsk->delays)
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 757fc60658fa..3f221dbf5f95 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -91,6 +91,8 @@ void em_dev_unregister_perf_domain(struct device *dev);
* @pd : performance domain for which energy has to be estimated
* @max_util : highest utilization among CPUs of the domain
* @sum_util : sum of the utilization of all CPUs in the domain
+ * @allowed_cpu_cap : maximum allowed CPU capacity for the @pd, which
+ *                    might reflect reduced frequency (due to thermal)
*
* This function must be used only for CPU devices. There is no validation,
* i.e. if the EM is a CPU type and has cpumask allocated. It is called from
@@ -100,7 +102,8 @@ void em_dev_unregister_perf_domain(struct device *dev);
* a capacity state satisfying the max utilization of the domain.
*/
static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
- unsigned long max_util, unsigned long sum_util)
+ unsigned long max_util, unsigned long sum_util,
+ unsigned long allowed_cpu_cap)
{
unsigned long freq, scale_cpu;
struct em_perf_state *ps;
@@ -112,11 +115,17 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
/*
* In order to predict the performance state, map the utilization of
* the most utilized CPU of the performance domain to a requested
- * frequency, like schedutil.
+ * frequency, like schedutil. Take also into account that the real
+ * frequency might be set lower (due to thermal capping). Thus, clamp
+ * max utilization to the allowed CPU capacity before calculating
+ * effective frequency.
*/
cpu = cpumask_first(to_cpumask(pd->cpus));
scale_cpu = arch_scale_cpu_capacity(cpu);
ps = &pd->table[pd->nr_perf_states - 1];
+
+ max_util = map_util_perf(max_util);
+ max_util = min(max_util, allowed_cpu_cap);
freq = map_util_freq(max_util, ps->frequency, scale_cpu);
/*
@@ -209,7 +218,8 @@ static inline struct em_perf_domain *em_pd_get(struct device *dev)
return NULL;
}
static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
- unsigned long max_util, unsigned long sum_util)
+ unsigned long max_util, unsigned long sum_util,
+ unsigned long allowed_cpu_cap)
{
return 0;
}
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 2484ed97e72f..d9133d6db308 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -33,6 +33,8 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
unsigned int cpu,
const char *namefmt);
+void set_kthread_struct(struct task_struct *p);
+
void kthread_set_per_cpu(struct task_struct *k, int cpu);
bool kthread_is_per_cpu(struct task_struct *k);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 28a98fc4ded4..ac5a7d29fd4f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -708,10 +708,17 @@ struct task_struct {
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
+ struct sched_dl_entity dl;
+
+#ifdef CONFIG_SCHED_CORE
+ struct rb_node core_node;
+ unsigned long core_cookie;
+ unsigned int core_occupation;
+#endif
+
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
- struct sched_dl_entity dl;
#ifdef CONFIG_UCLAMP_TASK
/*
@@ -2180,4 +2187,14 @@ int sched_trace_rq_nr_running(struct rq *rq);
const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
+#ifdef CONFIG_SCHED_CORE
+extern void sched_core_free(struct task_struct *tsk);
+extern void sched_core_fork(struct task_struct *p);
+extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
+ unsigned long uaddr);
+#else
+static inline void sched_core_free(struct task_struct *tsk) { }
+static inline void sched_core_fork(struct task_struct *p) { }
+#endif
+
#endif
diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index 6205578ab6ee..bdd31ab93bc5 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -26,7 +26,7 @@ bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy);
static inline unsigned long map_util_freq(unsigned long util,
unsigned long freq, unsigned long cap)
{
- return (freq + (freq >> 2)) * util / cap;
+ return freq * util / cap;
}
static inline unsigned long map_util_perf(unsigned long util)
diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 568286411b43..0108a38bb64d 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -3,6 +3,7 @@
#define _LINUX_SCHED_STAT_H
#include <linux/percpu.h>
+#include <linux/kconfig.h>
/*
* Various counters maintained by the scheduler and fork(),
@@ -16,21 +17,14 @@ extern unsigned long total_forks;
extern int nr_threads;
DECLARE_PER_CPU(unsigned long, process_counts);
extern int nr_processes(void);
-extern unsigned long nr_running(void);
+extern unsigned int nr_running(void);
extern bool single_task_running(void);
-extern unsigned long nr_iowait(void);
-extern unsigned long nr_iowait_cpu(int cpu);
+extern unsigned int nr_iowait(void);
+extern unsigned int nr_iowait_cpu(int cpu);
static inline int sched_info_on(void)
{
-#ifdef CONFIG_SCHEDSTATS
- return 1;
-#elif defined(CONFIG_TASK_DELAY_ACCT)
- extern int delayacct_on;
- return delayacct_on;
-#else
- return 0;
-#endif
+ return IS_ENABLED(CONFIG_SCHED_INFO);
}
#ifdef CONFIG_SCHEDSTATS
diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h
index 528718e4ed52..835ee87ed792 100644
--- a/include/linux/sched_clock.h
+++ b/include/linux/sched_clock.h
@@ -14,7 +14,7 @@
* @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit
* clocks.
* @read_sched_clock: Current clock source (or dummy source when suspended).
- * @mult: Multipler for scaled math conversion.
+ * @mult: Multiplier for scaled math conversion.
* @shift: Shift value for scaled math conversion.
*
* Care must be taken when updating this structure; it is read by
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 18a9f59dc067..967d9c55323d 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -259,4 +259,12 @@ struct prctl_mm_map {
#define PR_PAC_SET_ENABLED_KEYS 60
#define PR_PAC_GET_ENABLED_KEYS 61
+/* Request the scheduler to share a core */
+#define PR_SCHED_CORE 62
+# define PR_SCHED_CORE_GET 0
+# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */
+# define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */
+# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie from pid */
+# define PR_SCHED_CORE_MAX 4
+
#endif /* _LINUX_PRCTL_H */
diff --git a/init/main.c b/init/main.c
index e9c42a183e33..359358500e54 100644
--- a/init/main.c
+++ b/init/main.c
@@ -692,6 +692,7 @@ noinline void __ref rest_init(void)
*/
rcu_read_lock();
tsk = find_task_by_pid_ns(pid, &init_pid_ns);
+ tsk->flags |= PF_NO_SETAFFINITY;
set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
rcu_read_unlock();
@@ -941,11 +942,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
* time - but meanwhile we still have a functioning scheduler.
*/
sched_init();
- /*
- * Disable preemption - early bootup scheduling is extremely
- * fragile until we cpu_idle() for the first time.
- */
- preempt_disable();
+
if (WARN(!irqs_disabled(),
"Interrupts were enabled *very* early, fixing it\n"))
local_irq_disable();
@@ -1444,6 +1441,11 @@ static int __ref kernel_init(void *unused)
{
int ret;
+ /*
+ * Wait until kthreadd is all set-up.
+ */
+ wait_for_completion(&kthreadd_done);
+
kernel_init_freeable();
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
@@ -1524,11 +1526,6 @@ void __init console_on_rootfs(void)
static noinline void __init kernel_init_freeable(void)
{
- /*
- * Wait until kthreadd is all set-up.
- */
- wait_for_completion(&kthreadd_done);
-
/* Now the scheduler is fully set up and can do blocking allocations */
gfp_allowed_mask = __GFP_BITS_MASK;
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 416017301660..bd7c4147b9a8 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -99,3 +99,23 @@ config PREEMPT_DYNAMIC
Interesting if you want the same pre-built kernel should be used for
both Server and Desktop workloads.
+
+config SCHED_CORE
+ bool "Core Scheduling for SMT"
+ default y
+ depends on SCHED_SMT
+ help
+ This option permits Core Scheduling, a means of coordinated task
+ selection across SMT siblings. When enabled -- see
+ prctl(PR_SCHED_CORE) -- task selection ensures that all SMT siblings
+ will execute a task from the same 'core group', forcing idle when no
+ matching task is found.
+
+ Uses of this feature include:
+ - mitigation of some (not all) SMT side channels;
+ - limiting SMT interference to improve determinism and/or performance.
+
+ SCHED_CORE is enabled by default when SCHED_SMT is enabled -- when
+ unused there should be no impact on performance.
+
+
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 27725754ac99..51530d5b15a8 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -7,30 +7,64 @@
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/clock.h>
#include <linux/slab.h>
#include <linux/taskstats.h>
-#include <linux/time.h>
#include <linux/sysctl.h>
#include <linux/delayacct.h>
#include <linux/module.h>
-int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
-EXPORT_SYMBOL_GPL(delayacct_on);
+DEFINE_STATIC_KEY_FALSE(delayacct_key);
+int delayacct_on __read_mostly; /* Delay accounting turned on/off */
struct kmem_cache *delayacct_cache;
-static int __init delayacct_setup_disable(char *str)
+static void set_delayacct(bool enabled)
{
- delayacct_on = 0;
+ if (enabled) {
+ static_branch_enable(&delayacct_key);
+ delayacct_on = 1;
+ } else {
+ delayacct_on = 0;
+ static_branch_disable(&delayacct_key);
+ }
+}
+
+static int __init delayacct_setup_enable(char *str)
+{
+ delayacct_on = 1;
return 1;
}
-__setup("nodelayacct", delayacct_setup_disable);
+__setup("delayacct", delayacct_setup_enable);
void delayacct_init(void)
{
delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
delayacct_tsk_init(&init_task);
+ set_delayacct(delayacct_on);
}
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int state = delayacct_on;
+ struct ctl_table t;
+ int err;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ set_delayacct(state);
+ return err;
+}
+#endif
+
void __delayacct_tsk_init(struct task_struct *tsk)
{
tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
@@ -42,10 +76,9 @@ void __delayacct_tsk_init(struct task_struct *tsk)
* Finish delay accounting for a statistic using its timestamps (@start),
* accumulator (@total) and @count
*/
-static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total,
- u32 *count)
+static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count)
{
- s64 ns = ktime_get_ns() - *start;
+ s64 ns = local_clock() - *start;
unsigned long flags;
if (ns > 0) {
@@ -58,7 +91,7 @@ static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total,
void __delayacct_blkio_start(void)
{
- current->delays->blkio_start = ktime_get_ns();
+ current->delays->blkio_start = local_clock();
}
/*
@@ -82,7 +115,7 @@ void __delayacct_blkio_end(struct task_struct *p)
delayacct_end(&delays->lock, &delays->blkio_start, total, count);
}
-int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
+int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
{
u64 utime, stime, stimescaled, utimescaled;
unsigned long long t2, t3;
@@ -117,6 +150,9 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->cpu_run_virtual_total =
(tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
+ if (!tsk->delays)
+ return 0;
+
/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
raw_spin_lock_irqsave(&tsk->delays->lock, flags);
@@ -151,21 +187,20 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
void __delayacct_freepages_start(void)
{
- current->delays->freepages_start = ktime_get_ns();
+ current->delays->freepages_start = local_clock();
}
void __delayacct_freepages_end(void)
{
- delayacct_end(
- &current->delays->lock,
- &current->delays->freepages_start,
- &current->delays->freepages_delay,
- &current->delays->freepages_count);
+ delayacct_end(&current->delays->lock,
+ &current->delays->freepages_start,
+ &current->delays->freepages_delay,
+ &current->delays->freepages_count);
}
void __delayacct_thrashing_start(void)
{
- current->delays->thrashing_start = ktime_get_ns();
+ current->delays->thrashing_start = local_clock();
}
void __delayacct_thrashing_end(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index dc06afd725cb..e595e77913eb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -742,6 +742,7 @@ void __put_task_struct(struct task_struct *tsk)
exit_creds(tsk);
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
+ sched_core_free(tsk);
if (!profile_handoff_task(tsk))
free_task(tsk);
@@ -1999,7 +2000,7 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cleanup_count;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
- p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
+ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
@@ -2250,6 +2251,8 @@ static __latent_entropy struct task_struct *copy_process(
klp_copy_process(p);
+ sched_core_fork(p);
+
spin_lock(&current->sighand->siglock);
/*
@@ -2337,6 +2340,7 @@ static __latent_entropy struct task_struct *copy_process(
return p;
bad_fork_cancel_cgroup:
+ sched_core_free(p);
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
cgroup_cancel_fork(p, args);
@@ -2408,7 +2412,7 @@ static inline void init_idle_pids(struct task_struct *idle)
}
}
-struct task_struct *fork_idle(int cpu)
+struct task_struct * __init fork_idle(int cpu)
{
struct task_struct *task;
struct kernel_clone_args args = {
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fe3f2a40d61e..3d326833092b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -68,16 +68,6 @@ enum KTHREAD_BITS {
KTHREAD_SHOULD_PARK,
};
-static inline void set_kthread_struct(void *kthread)
-{
- /*
- * We abuse ->set_child_tid to avoid the new member and because it
- * can't be wrongly copied by copy_process(). We also rely on fact
- * that the caller can't exec, so PF_KTHREAD can't be cleared.
- */
- current->set_child_tid = (__force void __user *)kthread;
-}
-
static inline struct kthread *to_kthread(struct task_struct *k)
{
WARN_ON(!(k->flags & PF_KTHREAD));
@@ -103,6 +93,22 @@ static inline struct kthread *__to_kthread(struct task_struct *p)
return kthread;
}
+void set_kthread_struct(struct task_struct *p)
+{
+ struct kthread *kthread;
+
+ if (__to_kthread(p))
+ return;
+
+ kthread = kzalloc(sizeof(*kthread), GFP_KERNEL);
+ /*
+ * We abuse ->set_child_tid to avoid the new member and because it
+ * can't be wrongly copied by copy_process(). We also rely on fact
+ * that the caller can't exec, so PF_KTHREAD can't be cleared.
+ */
+ p->set_child_tid = (__force void __user *)kthread;
+}
+
void free_kthread_struct(struct task_struct *k)
{
struct kthread *kthread;
@@ -272,8 +278,8 @@ static int kthread(void *_create)
struct kthread *self;
int ret;
- self = kzalloc(sizeof(*self), GFP_KERNEL);
- set_kthread_struct(self);
+ set_kthread_struct(current);
+ self = to_kthread(current);
/* If user was SIGKILLed, I release the structure. */
done = xchg(&create->done, NULL);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 5fc9c9b70862..978fcfca5871 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -36,3 +36,4 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_CPU_ISOLATION) += isolation.o
obj-$(CONFIG_PSI) += psi.o
+obj-$(CONFIG_SCHED_CORE) += core_sched.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5226cc26a095..75655cdee3bb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -84,6 +84,272 @@ unsigned int sysctl_sched_rt_period = 1000000;
__read_mostly int scheduler_running;
+#ifdef CONFIG_SCHED_CORE
+
+DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+/* kernel prio, less is more */
+static inline int __task_prio(struct task_struct *p)
+{
+ if (p->sched_class == &stop_sched_class) /* trumps deadline */
+ return -2;
+
+ if (rt_prio(p->prio)) /* includes deadline */
+ return p->prio; /* [-1, 99] */
+
+ if (p->sched_class == &idle_sched_class)
+ return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
+
+ return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
+}
+
+/*
+ * l(a,b)
+ * le(a,b) := !l(b,a)
+ * g(a,b) := l(b,a)
+ * ge(a,b) := !l(a,b)
+ */
+
+/* real prio, less is less */
+static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
+{
+
+ int pa = __task_prio(a), pb = __task_prio(b);
+
+ if (-pa < -pb)
+ return true;
+
+ if (-pb < -pa)
+ return false;
+
+ if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
+ return !dl_time_before(a->dl.deadline, b->dl.deadline);
+
+ if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
+ return cfs_prio_less(a, b, in_fi);
+
+ return false;
+}
+
+static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
+{
+ if (a->core_cookie < b->core_cookie)
+ return true;
+
+ if (a->core_cookie > b->core_cookie)
+ return false;
+
+ /* flip prio, so high prio is leftmost */
+ if (prio_less(b, a, task_rq(a)->core->core_forceidle))
+ return true;
+
+ return false;
+}
+
+#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)
+
+static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
+{
+ return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
+}
+
+static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
+{
+ const struct task_struct *p = __node_2_sc(node);
+ unsigned long cookie = (unsigned long)key;
+
+ if (cookie < p->core_cookie)
+ return -1;
+
+ if (cookie > p->core_cookie)
+ return 1;
+
+ return 0;
+}
+
+void sched_core_enqueue(struct rq *rq, struct task_struct *p)
+{
+ rq->core->core_task_seq++;
+
+ if (!p->core_cookie)
+ return;
+
+ rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
+}
+
+void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+{
+ rq->core->core_task_seq++;
+
+ if (!sched_core_enqueued(p))
+ return;
+
+ rb_erase(&p->core_node, &rq->core_tree);
+ RB_CLEAR_NODE(&p->core_node);
+}
+
+/*
+ * Find left-most (aka, highest priority) task matching @cookie.
+ */
+static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+{
+ struct rb_node *node;
+
+ node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
+ /*
+ * The idle task always matches any cookie!
+ */
+ if (!node)
+ return idle_sched_class.pick_task(rq);
+
+ return __node_2_sc(node);
+}
+
+static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
+{
+ struct rb_node *node = &p->core_node;
+
+ node = rb_next(node);
+ if (!node)
+ return NULL;
+
+ p = container_of(node, struct task_struct, core_node);
+ if (p->core_cookie != cookie)
+ return NULL;
+
+ return p;
+}
+
+/*
+ * Magic required such that:
+ *
+ * raw_spin_rq_lock(rq);
+ * ...
+ * raw_spin_rq_unlock(rq);
+ *
+ * ends up locking and unlocking the _same_ lock, and all CPUs
+ * always agree on what rq has what lock.
+ *
+ * XXX entirely possible to selectively enable cores, don't bother for now.
+ */
+
+static DEFINE_MUTEX(sched_core_mutex);
+static atomic_t sched_core_count;
+static struct cpumask sched_core_mask;
+
+static void __sched_core_flip(bool enabled)
+{
+ int cpu, t, i;
+
+ cpus_read_lock();
+
+ /*
+ * Toggle the online cores, one by one.
+ */
+ cpumask_copy(&sched_core_mask, cpu_online_mask);
+ for_each_cpu(cpu, &sched_core_mask) {
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+
+ i = 0;
+ local_irq_disable();
+ for_each_cpu(t, smt_mask) {
+ /* supports up to SMT8 */
+ raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
+ }
+
+ for_each_cpu(t, smt_mask)
+ cpu_rq(t)->core_enabled = enabled;
+
+ for_each_cpu(t, smt_mask)
+ raw_spin_unlock(&cpu_rq(t)->__lock);
+ local_irq_enable();
+
+ cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
+ }
+
+ /*
+ * Toggle the offline CPUs.
+ */
+ cpumask_copy(&sched_core_mask, cpu_possible_mask);
+ cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
+
+ for_each_cpu(cpu, &sched_core_mask)
+ cpu_rq(cpu)->core_enabled = enabled;
+
+ cpus_read_unlock();
+}
+
+static void sched_core_assert_empty(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
+}
+
+static void __sched_core_enable(void)
+{
+ static_branch_enable(&__sched_core_enabled);
+ /*
+ * Ensure all previous instances of raw_spin_rq_*lock() have finished
+ * and future ones will observe !sched_core_disabled().
+ */
+ synchronize_rcu();
+ __sched_core_flip(true);
+ sched_core_assert_empty();
+}
+
+static void __sched_core_disable(void)
+{
+ sched_core_assert_empty();
+ __sched_core_flip(false);
+ static_branch_disable(&__sched_core_enabled);
+}
+
+void sched_core_get(void)
+{
+ if (atomic_inc_not_zero(&sched_core_count))
+ return;
+
+ mutex_lock(&sched_core_mutex);
+ if (!atomic_read(&sched_core_count))
+ __sched_core_enable();
+
+ smp_mb__before_atomic();
+ atomic_inc(&sched_core_count);
+ mutex_unlock(&sched_core_mutex);
+}
+
+static void __sched_core_put(struct work_struct *work)
+{
+ if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
+ __sched_core_disable();
+ mutex_unlock(&sched_core_mutex);
+ }
+}
+
+void sched_core_put(void)
+{
+ static DECLARE_WORK(_work, __sched_core_put);
+
+ /*
+ * "There can be only one"
+ *
+ * Either this is the last one, or we don't actually need to do any
+ * 'work'. If it is the last *again*, we rely on
+ * WORK_STRUCT_PENDING_BIT.
+ */
+ if (!atomic_add_unless(&sched_core_count, -1, 1))
+ schedule_work(&_work);
+}
+
+#else /* !CONFIG_SCHED_CORE */
+
+static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
+static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+
+#endif /* CONFIG_SCHED_CORE */
+
/*
* part of the period that we allow rt tasks to run in us.
* default: 0.95s
@@ -184,6 +450,79 @@ int sysctl_sched_rt_runtime = 950000;
*
*/
+void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
+{
+ raw_spinlock_t *lock;
+
+ /* Matches synchronize_rcu() in __sched_core_enable() */
+ preempt_disable();
+ if (sched_core_disabled()) {
+ raw_spin_lock_nested(&rq->__lock, subclass);
+ /* preempt_count *MUST* be > 1 */
+ preempt_enable_no_resched();
+ return;
+ }
+
+ for (;;) {
+ lock = __rq_lockp(rq);
+ raw_spin_lock_nested(lock, subclass);
+ if (likely(lock == __rq_lockp(rq))) {
+ /* preempt_count *MUST* be > 1 */
+ preempt_enable_no_resched();
+ return;
+ }
+ raw_spin_unlock(lock);
+ }
+}
+
+bool raw_spin_rq_trylock(struct rq *rq)
+{
+ raw_spinlock_t *lock;
+ bool ret;
+
+ /* Matches synchronize_rcu() in __sched_core_enable() */
+ preempt_disable();
+ if (sched_core_disabled()) {
+ ret = raw_spin_trylock(&rq->__lock);
+ preempt_enable();
+ return ret;
+ }
+
+ for (;;) {
+ lock = __rq_lockp(rq);
+ ret = raw_spin_trylock(lock);
+ if (!ret || (likely(lock == __rq_lockp(rq)))) {
+ preempt_enable();
+ return ret;
+ }
+ raw_spin_unlock(lock);
+ }
+}
+
+void raw_spin_rq_unlock(struct rq *rq)
+{
+ raw_spin_unlock(rq_lockp(rq));
+}
+
+#ifdef CONFIG_SMP
+/*
+ * double_rq_lock - safely lock two runqueues
+ */
+void double_rq_lock(struct rq *rq1, struct rq *rq2)
+{
+ lockdep_assert_irqs_disabled();
+
+ if (rq_order_less(rq2, rq1))
+ swap(rq1, rq2);
+
+ raw_spin_rq_lock(rq1);
+ if (__rq_lockp(rq1) == __rq_lockp(rq2))
+ return;
+
+ raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+}
+#endif
+
/*
* __task_rq_lock - lock the rq @p resides on.
*/
@@ -196,12 +535,12 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
for (;;) {
rq = task_rq(p);
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf);
return rq;
}
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
while (unlikely(task_on_rq_migrating(p)))
cpu_relax();
@@ -220,7 +559,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
for (;;) {
raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
rq = task_rq(p);
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
/*
* move_queued_task() task_rq_lock()
*
@@ -242,7 +581,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
rq_pin_lock(rq, rf);
return rq;
}
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
while (unlikely(task_on_rq_migrating(p)))
@@ -312,7 +651,7 @@ void update_rq_clock(struct rq *rq)
{
s64 delta;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
if (rq->clock_update_flags & RQCF_ACT_SKIP)
return;
@@ -585,7 +924,6 @@ void wake_up_q(struct wake_q_head *head)
struct task_struct *task;
task = container_of(node, struct task_struct, wake_q);
- BUG_ON(!task);
/* Task can safely be re-inserted now: */
node = node->next;
task->wake_q.next = NULL;
@@ -611,7 +949,7 @@ void resched_curr(struct rq *rq)
struct task_struct *curr = rq->curr;
int cpu;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
if (test_tsk_need_resched(curr))
return;
@@ -635,10 +973,10 @@ void resched_cpu(int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_rq_lock_irqsave(rq, flags);
if (cpu_online(cpu) || cpu == smp_processor_id())
resched_curr(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
}
#ifdef CONFIG_SMP
@@ -1067,7 +1405,6 @@ uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_req = p->uclamp_req[clamp_id];
#ifdef CONFIG_UCLAMP_TASK_GROUP
- struct uclamp_se uc_max;
/*
* Tasks in autogroups or root task group will be
@@ -1078,9 +1415,23 @@ uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
if (task_group(p) == &root_task_group)
return uc_req;
- uc_max = task_group(p)->uclamp[clamp_id];
- if (uc_req.value > uc_max.value || !uc_req.user_defined)
- return uc_max;
+ switch (clamp_id) {
+ case UCLAMP_MIN: {
+ struct uclamp_se uc_min = task_group(p)->uclamp[clamp_id];
+ if (uc_req.value < uc_min.value)
+ return uc_min;
+ break;
+ }
+ case UCLAMP_MAX: {
+ struct uclamp_se uc_max = task_group(p)->uclamp[clamp_id];
+ if (uc_req.value > uc_max.value)
+ return uc_max;
+ break;
+ }
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
#endif
return uc_req;
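Condensed, the switch above makes the group's uclamp.min act as a protection (floor) on the task's request and its uclamp.max as a limit (ceiling). A hypothetical helper expressing just the value arithmetic; the real code returns the whole uclamp_se so the group's user_defined flag travels with the value:

/* Hypothetical condensation of the switch above. */
static inline unsigned int uclamp_tg_restrict_value(unsigned int req,
						    unsigned int tg_min,
						    unsigned int tg_max,
						    bool is_min)
{
	if (is_min)
		return req < tg_min ? tg_min : req;	/* floor */

	return req > tg_max ? tg_max : req;		/* ceiling */
}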
@@ -1137,7 +1488,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
struct uclamp_bucket *bucket;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
/* Update task effective clamp */
p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
@@ -1177,7 +1528,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
unsigned int bkt_clamp;
unsigned int rq_clamp;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
/*
* If sched_uclamp_used was enabled after task @p was enqueued,
@@ -1596,21 +1947,27 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
update_rq_clock(rq);
if (!(flags & ENQUEUE_RESTORE)) {
- sched_info_queued(rq, p);
+ sched_info_enqueue(rq, p);
psi_enqueue(p, flags & ENQUEUE_WAKEUP);
}
uclamp_rq_inc(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
+
+ if (sched_core_enabled(rq))
+ sched_core_enqueue(rq, p);
}
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
+ if (sched_core_enabled(rq))
+ sched_core_dequeue(rq, p);
+
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
if (!(flags & DEQUEUE_SAVE)) {
- sched_info_dequeued(rq, p);
+ sched_info_dequeue(rq, p);
psi_dequeue(p, flags & DEQUEUE_SLEEP);
}
@@ -1850,7 +2207,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int new_cpu)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
deactivate_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
@@ -1916,7 +2273,6 @@ static int migration_cpu_stop(void *data)
struct migration_arg *arg = data;
struct set_affinity_pending *pending = arg->pending;
struct task_struct *p = arg->task;
- int dest_cpu = arg->dest_cpu;
struct rq *rq = this_rq();
bool complete = false;
struct rq_flags rf;
@@ -1954,19 +2310,15 @@ static int migration_cpu_stop(void *data)
if (pending) {
p->migration_pending = NULL;
complete = true;
- }
- if (dest_cpu < 0) {
if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
goto out;
-
- dest_cpu = cpumask_any_distribute(&p->cpus_mask);
}
if (task_on_rq_queued(p))
- rq = __migrate_task(rq, &rf, p, dest_cpu);
+ rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
else
- p->wake_cpu = dest_cpu;
+ p->wake_cpu = arg->dest_cpu;
/*
* XXX __migrate_task() can fail, at which point we might end
@@ -2024,7 +2376,7 @@ int push_cpu_stop(void *arg)
struct task_struct *p = arg;
raw_spin_lock_irq(&p->pi_lock);
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
if (task_rq(p) != rq)
goto out_unlock;
@@ -2054,7 +2406,7 @@ int push_cpu_stop(void *arg)
out_unlock:
rq->push_busy = false;
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
raw_spin_unlock_irq(&p->pi_lock);
put_task_struct(p);
@@ -2107,7 +2459,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
* Because __kthread_bind() calls this on blocked tasks without
* holding rq->lock.
*/
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
}
if (running)
@@ -2249,7 +2601,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
init_completion(&my_pending.done);
my_pending.arg = (struct migration_arg) {
.task = p,
- .dest_cpu = -1, /* any */
+ .dest_cpu = dest_cpu,
.pending = &my_pending,
};
@@ -2257,6 +2609,15 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
} else {
pending = p->migration_pending;
refcount_inc(&pending->refs);
+ /*
+ * Affinity has changed, but we've already installed a
+ * pending. migration_cpu_stop() *must* see this, else
+ * we risk a completion of the pending despite having a
+ * task on a disallowed CPU.
+ *
+ * Serialized by p->pi_lock, so this is safe.
+ */
+ pending->arg.dest_cpu = dest_cpu;
}
}
pending = p->migration_pending;
@@ -2448,7 +2809,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* task_rq_lock().
*/
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&task_rq(p)->lock)));
+ lockdep_is_held(__rq_lockp(task_rq(p)))));
#endif
/*
* Clearly, migrating tasks to offline CPUs is a fairly daft thing.
@@ -2979,6 +3340,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
if (rq->avg_idle > max)
rq->avg_idle = max;
+ rq->wake_stamp = jiffies;
+ rq->wake_avg_idle = rq->avg_idle / 2;
+
rq->idle_stamp = 0;
}
#endif
@@ -2990,7 +3354,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
{
int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
@@ -3648,7 +4012,6 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
#ifdef CONFIG_SCHEDSTATS
DEFINE_STATIC_KEY_FALSE(sched_schedstats);
-static bool __initdata __sched_schedstats = false;
static void set_schedstats(bool enabled)
{
@@ -3672,16 +4035,11 @@ static int __init setup_schedstats(char *str)
if (!str)
goto out;
- /*
- * This code is called before jump labels have been set up, so we can't
- * change the static branch directly just yet. Instead set a temporary
- * variable so init_schedstats() can do it later.
- */
if (!strcmp(str, "enable")) {
- __sched_schedstats = true;
+ set_schedstats(true);
ret = 1;
} else if (!strcmp(str, "disable")) {
- __sched_schedstats = false;
+ set_schedstats(false);
ret = 1;
}
out:
@@ -3692,11 +4050,6 @@ out:
}
__setup("schedstats=", setup_schedstats);
-static void __init init_schedstats(void)
-{
- set_schedstats(__sched_schedstats);
-}
-
#ifdef CONFIG_PROC_SYSCTL
int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
@@ -3718,8 +4071,6 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
return err;
}
#endif /* CONFIG_PROC_SYSCTL */
-#else /* !CONFIG_SCHEDSTATS */
-static inline void init_schedstats(void) {}
#endif /* CONFIG_SCHEDSTATS */
/*
@@ -4001,7 +4352,7 @@ static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
void (*func)(struct rq *rq);
struct callback_head *next;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
while (head) {
func = (void (*)(struct rq *))head->func;
@@ -4024,7 +4375,7 @@ static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
{
struct callback_head *head = rq->balance_callback;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
if (head)
rq->balance_callback = NULL;
@@ -4041,9 +4392,9 @@ static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
unsigned long flags;
if (unlikely(head)) {
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_rq_lock_irqsave(rq, flags);
do_balance_callbacks(rq, head);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
}
}
@@ -4074,10 +4425,10 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf
* do an early lockdep release here:
*/
rq_unpin_lock(rq, rf);
- spin_release(&rq->lock.dep_map, _THIS_IP_);
+ spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_);
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
- rq->lock.owner = next;
+ rq_lockp(rq)->owner = next;
#endif
}
@@ -4088,9 +4439,9 @@ static inline void finish_lock_switch(struct rq *rq)
* fix up the runqueue lock - which gets 'carried over' from
* prev into current:
*/
- spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+ spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
__balance_callbacks(rq);
- raw_spin_unlock_irq(&rq->lock);
+ raw_spin_rq_unlock_irq(rq);
}
/*
@@ -4348,9 +4699,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
* externally visible scheduler statistics: current number of runnable
* threads, total number of context switches performed since bootup.
*/
-unsigned long nr_running(void)
+unsigned int nr_running(void)
{
- unsigned long i, sum = 0;
+ unsigned int i, sum = 0;
for_each_online_cpu(i)
sum += cpu_rq(i)->nr_running;
@@ -4395,7 +4746,7 @@ unsigned long long nr_context_switches(void)
* it does become runnable.
*/
-unsigned long nr_iowait_cpu(int cpu)
+unsigned int nr_iowait_cpu(int cpu)
{
return atomic_read(&cpu_rq(cpu)->nr_iowait);
}
@@ -4430,9 +4781,9 @@ unsigned long nr_iowait_cpu(int cpu)
* Task CPU affinities can make all that even more 'interesting'.
*/
-unsigned long nr_iowait(void)
+unsigned int nr_iowait(void)
{
- unsigned long i, sum = 0;
+ unsigned int i, sum = 0;
for_each_possible_cpu(i)
sum += nr_iowait_cpu(i);
@@ -4943,7 +5294,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
@@ -4961,7 +5312,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (unlikely(p == RETRY_TASK))
goto restart;
- /* Assumes fair_sched_class->next == idle_sched_class */
+ /* Assume the next prioritized class is idle_sched_class */
if (!p) {
put_prev_task(rq, prev);
p = pick_next_task_idle(rq);
@@ -4983,6 +5334,455 @@ restart:
BUG();
}
+#ifdef CONFIG_SCHED_CORE
+static inline bool is_task_rq_idle(struct task_struct *t)
+{
+ return (task_rq(t)->idle == t);
+}
+
+static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
+{
+ return is_task_rq_idle(a) || (a->core_cookie == cookie);
+}
+
+static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
+{
+ if (is_task_rq_idle(a) || is_task_rq_idle(b))
+ return true;
+
+ return a->core_cookie == b->core_cookie;
+}
+
+// XXX fairness/fwd progress conditions
+/*
+ * Returns
+ * - NULL if there is no runnable task for this class.
+ * - the highest priority task for this runqueue if it matches
+ * rq->core->core_cookie or its priority is greater than max.
+ * - the idle task otherwise.
+ */
+static struct task_struct *
+pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max, bool in_fi)
+{
+ struct task_struct *class_pick, *cookie_pick;
+ unsigned long cookie = rq->core->core_cookie;
+
+ class_pick = class->pick_task(rq);
+ if (!class_pick)
+ return NULL;
+
+ if (!cookie) {
+ /*
+ * If class_pick is tagged, return it only if it has
+ * higher priority than max.
+ */
+ if (max && class_pick->core_cookie &&
+ prio_less(class_pick, max, in_fi))
+ return idle_sched_class.pick_task(rq);
+
+ return class_pick;
+ }
+
+ /*
+ * If class_pick is idle or matches cookie, return early.
+ */
+ if (cookie_equals(class_pick, cookie))
+ return class_pick;
+
+ cookie_pick = sched_core_find(rq, cookie);
+
+ /*
+ * If class > max && class > cookie, it is the highest priority task on
+ * the core (so far) and it must be selected, otherwise we must go with
+ * the cookie pick in order to satisfy the constraint.
+ */
+ if (prio_less(cookie_pick, class_pick, in_fi) &&
+ (!max || prio_less(max, class_pick, in_fi)))
+ return class_pick;
+
+ return cookie_pick;
+}
+
+extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct task_struct *next, *max = NULL;
+ const struct sched_class *class;
+ const struct cpumask *smt_mask;
+ bool fi_before = false;
+ int i, j, cpu, occ = 0;
+ bool need_sync;
+
+ if (!sched_core_enabled(rq))
+ return __pick_next_task(rq, prev, rf);
+
+ cpu = cpu_of(rq);
+
+	/* Stopper task is switching into idle, no need for core-wide selection. */
+ if (cpu_is_offline(cpu)) {
+ /*
+ * Reset core_pick so that we don't enter the fastpath when
+		 * coming online. core_pick would already have been migrated to
+		 * another CPU while this one was offline.
+ */
+ rq->core_pick = NULL;
+ return __pick_next_task(rq, prev, rf);
+ }
+
+ /*
+ * If there were no {en,de}queues since we picked (IOW, the task
+ * pointers are all still valid), and we haven't scheduled the last
+ * pick yet, do so now.
+ *
+ * rq->core_pick can be NULL if no selection was made for a CPU because
+ * it was either offline or went offline during a sibling's core-wide
+ * selection. In this case, do a core-wide selection.
+ */
+ if (rq->core->core_pick_seq == rq->core->core_task_seq &&
+ rq->core->core_pick_seq != rq->core_sched_seq &&
+ rq->core_pick) {
+ WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
+
+ next = rq->core_pick;
+ if (next != prev) {
+ put_prev_task(rq, prev);
+ set_next_task(rq, next);
+ }
+
+ rq->core_pick = NULL;
+ return next;
+ }
+
+ put_prev_task_balance(rq, prev, rf);
+
+ smt_mask = cpu_smt_mask(cpu);
+ need_sync = !!rq->core->core_cookie;
+
+ /* reset state */
+ rq->core->core_cookie = 0UL;
+ if (rq->core->core_forceidle) {
+ need_sync = true;
+ fi_before = true;
+ rq->core->core_forceidle = false;
+ }
+
+ /*
+ * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq
+ *
+ * @task_seq guards the task state ({en,de}queues)
+ * @pick_seq is the @task_seq we did a selection on
+ * @sched_seq is the @pick_seq we scheduled
+ *
+ * However, preemptions can cause multiple picks on the same task set.
+ * 'Fix' this by also increasing @task_seq for every pick.
+ */
+ rq->core->core_task_seq++;
+
+ /*
+	 * Optimize for the common case where this CPU has no cookies
+	 * and no cookied tasks are running on its siblings.
+ */
+ if (!need_sync) {
+ for_each_class(class) {
+ next = class->pick_task(rq);
+ if (next)
+ break;
+ }
+
+ if (!next->core_cookie) {
+ rq->core_pick = NULL;
+ /*
+ * For robustness, update the min_vruntime_fi for
+ * unconstrained picks as well.
+ */
+ WARN_ON_ONCE(fi_before);
+ task_vruntime_update(rq, next, false);
+ goto done;
+ }
+ }
+
+ for_each_cpu(i, smt_mask) {
+ struct rq *rq_i = cpu_rq(i);
+
+ rq_i->core_pick = NULL;
+
+ if (i != cpu)
+ update_rq_clock(rq_i);
+ }
+
+ /*
+ * Try and select tasks for each sibling in descending sched_class
+ * order.
+ */
+ for_each_class(class) {
+again:
+ for_each_cpu_wrap(i, smt_mask, cpu) {
+ struct rq *rq_i = cpu_rq(i);
+ struct task_struct *p;
+
+ if (rq_i->core_pick)
+ continue;
+
+ /*
+ * If this sibling doesn't yet have a suitable task to
+			 * run, ask for the most eligible task, given the
+			 * highest-priority task already selected for this
+ * core.
+ */
+ p = pick_task(rq_i, class, max, fi_before);
+ if (!p)
+ continue;
+
+ if (!is_task_rq_idle(p))
+ occ++;
+
+ rq_i->core_pick = p;
+ if (rq_i->idle == p && rq_i->nr_running) {
+ rq->core->core_forceidle = true;
+ if (!fi_before)
+ rq->core->core_forceidle_seq++;
+ }
+
+ /*
+ * If this new candidate is of higher priority than the
+			 * previous one and they're incompatible, we need to wipe
+			 * the slate and start over. pick_task() makes sure that
+			 * p's priority is higher than max's if it doesn't match
+			 * max's cookie.
+ *
+ * NOTE: this is a linear max-filter and is thus bounded
+ * in execution time.
+ */
+ if (!max || !cookie_match(max, p)) {
+ struct task_struct *old_max = max;
+
+ rq->core->core_cookie = p->core_cookie;
+ max = p;
+
+ if (old_max) {
+ rq->core->core_forceidle = false;
+ for_each_cpu(j, smt_mask) {
+ if (j == i)
+ continue;
+
+ cpu_rq(j)->core_pick = NULL;
+ }
+ occ = 1;
+ goto again;
+ }
+ }
+ }
+ }
+
+ rq->core->core_pick_seq = rq->core->core_task_seq;
+ next = rq->core_pick;
+ rq->core_sched_seq = rq->core->core_pick_seq;
+
+	/* Something should have been selected for the current CPU */
+ WARN_ON_ONCE(!next);
+
+ /*
+ * Reschedule siblings
+ *
+ * NOTE: L1TF -- at this point we're no longer running the old task and
+ * sending an IPI (below) ensures the sibling will no longer be running
+ * their task. This ensures there is no inter-sibling overlap between
+ * non-matching user state.
+ */
+ for_each_cpu(i, smt_mask) {
+ struct rq *rq_i = cpu_rq(i);
+
+ /*
+ * An online sibling might have gone offline before a task
+ * could be picked for it, or it might be offline but later
+		 * happen to come online, but it's too late and nothing was
+		 * picked for it. That's OK - it will pick tasks for itself,
+ * so ignore it.
+ */
+ if (!rq_i->core_pick)
+ continue;
+
+ /*
+ * Update for new !FI->FI transitions, or if continuing to be in !FI:
+ * fi_before fi update?
+ * 0 0 1
+ * 0 1 1
+ * 1 0 1
+ * 1 1 0
+ */
+ if (!(fi_before && rq->core->core_forceidle))
+ task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
+
+ rq_i->core_pick->core_occupation = occ;
+
+ if (i == cpu) {
+ rq_i->core_pick = NULL;
+ continue;
+ }
+
+ /* Did we break L1TF mitigation requirements? */
+ WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
+
+ if (rq_i->curr == rq_i->core_pick) {
+ rq_i->core_pick = NULL;
+ continue;
+ }
+
+ resched_curr(rq_i);
+ }
+
+done:
+ set_next_task(rq, next);
+ return next;
+}
+
+static bool try_steal_cookie(int this, int that)
+{
+ struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
+ struct task_struct *p;
+ unsigned long cookie;
+ bool success = false;
+
+ local_irq_disable();
+ double_rq_lock(dst, src);
+
+ cookie = dst->core->core_cookie;
+ if (!cookie)
+ goto unlock;
+
+ if (dst->curr != dst->idle)
+ goto unlock;
+
+ p = sched_core_find(src, cookie);
+ if (p == src->idle)
+ goto unlock;
+
+ do {
+ if (p == src->core_pick || p == src->curr)
+ goto next;
+
+ if (!cpumask_test_cpu(this, &p->cpus_mask))
+ goto next;
+
+ if (p->core_occupation > dst->idle->core_occupation)
+ goto next;
+
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ deactivate_task(src, p, 0);
+ set_task_cpu(p, this);
+ activate_task(dst, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+
+ resched_curr(dst);
+
+ success = true;
+ break;
+
+next:
+ p = sched_core_next(p, cookie);
+ } while (p);
+
+unlock:
+ double_rq_unlock(dst, src);
+ local_irq_enable();
+
+ return success;
+}
+
+static bool steal_cookie_task(int cpu, struct sched_domain *sd)
+{
+ int i;
+
+ for_each_cpu_wrap(i, sched_domain_span(sd), cpu) {
+ if (i == cpu)
+ continue;
+
+ if (need_resched())
+ break;
+
+ if (try_steal_cookie(cpu, i))
+ return true;
+ }
+
+ return false;
+}
+
+static void sched_core_balance(struct rq *rq)
+{
+ struct sched_domain *sd;
+ int cpu = cpu_of(rq);
+
+ preempt_disable();
+ rcu_read_lock();
+ raw_spin_rq_unlock_irq(rq);
+ for_each_domain(cpu, sd) {
+ if (need_resched())
+ break;
+
+ if (steal_cookie_task(cpu, sd))
+ break;
+ }
+ raw_spin_rq_lock_irq(rq);
+ rcu_read_unlock();
+ preempt_enable();
+}
+
+static DEFINE_PER_CPU(struct callback_head, core_balance_head);
+
+void queue_core_balance(struct rq *rq)
+{
+ if (!sched_core_enabled(rq))
+ return;
+
+ if (!rq->core->core_cookie)
+ return;
+
+ if (!rq->nr_running) /* not forced idle */
+ return;
+
+ queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
+}
+
+static inline void sched_core_cpu_starting(unsigned int cpu)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ struct rq *rq, *core_rq = NULL;
+ int i;
+
+ core_rq = cpu_rq(cpu)->core;
+
+ if (!core_rq) {
+ for_each_cpu(i, smt_mask) {
+ rq = cpu_rq(i);
+ if (rq->core && rq->core == rq)
+ core_rq = rq;
+ }
+
+ if (!core_rq)
+ core_rq = cpu_rq(cpu);
+
+ for_each_cpu(i, smt_mask) {
+ rq = cpu_rq(i);
+
+ WARN_ON_ONCE(rq->core && rq->core != core_rq);
+ rq->core = core_rq;
+ }
+ }
+}
+#else /* !CONFIG_SCHED_CORE */
+
+static inline void sched_core_cpu_starting(unsigned int cpu) {}
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ return __pick_next_task(rq, prev, rf);
+}
+
+#endif /* CONFIG_SCHED_CORE */
+
/*
* __schedule() is the main scheduler function.
*
@@ -5150,7 +5950,7 @@ static void __sched notrace __schedule(bool preempt)
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
- raw_spin_unlock_irq(&rq->lock);
+ raw_spin_rq_unlock_irq(rq);
}
}
@@ -5692,7 +6492,7 @@ out_unlock:
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
preempt_enable();
}
@@ -7434,19 +8234,32 @@ void show_state_filter(unsigned long state_filter)
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
*/
-void init_idle(struct task_struct *idle, int cpu)
+void __init init_idle(struct task_struct *idle, int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
__sched_fork(0, idle);
+ /*
+ * The idle task doesn't need the kthread struct to function, but it
+ * is dressed up as a per-CPU kthread and thus needs to play the part
+ * if we want to avoid special-casing it in code that deals with per-CPU
+ * kthreads.
+ */
+ set_kthread_struct(idle);
+
raw_spin_lock_irqsave(&idle->pi_lock, flags);
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
- idle->flags |= PF_IDLE;
+ /*
+ * PF_KTHREAD should already be set at this point; regardless, make it
+ * look like a proper per-CPU kthread.
+ */
+ idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
+ kthread_set_per_cpu(idle, cpu);
scs_task_reset(idle);
kasan_unpoison_task_stack(idle);
@@ -7480,7 +8293,7 @@ void init_idle(struct task_struct *idle, int cpu)
#ifdef CONFIG_SMP
idle->on_cpu = 1;
#endif
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
@@ -7646,7 +8459,7 @@ static void balance_push(struct rq *rq)
{
struct task_struct *push_task = rq->curr;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
SCHED_WARN_ON(rq->cpu != smp_processor_id());
/*
@@ -7663,12 +8476,8 @@ static void balance_push(struct rq *rq)
/*
* Both the cpu-hotplug and stop task are in this case and are
* required to complete the hotplug process.
- *
- * XXX: the idle task does not match kthread_is_per_cpu() due to
- * histerical raisins.
*/
- if (rq->idle == push_task ||
- kthread_is_per_cpu(push_task) ||
+ if (kthread_is_per_cpu(push_task) ||
is_migration_disabled(push_task)) {
/*
@@ -7684,9 +8493,9 @@ static void balance_push(struct rq *rq)
*/
if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
rcuwait_active(&rq->hotplug_wait)) {
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
rcuwait_wake_up(&rq->hotplug_wait);
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
}
return;
}
@@ -7696,7 +8505,7 @@ static void balance_push(struct rq *rq)
* Temporarily drop rq->lock such that we can wake-up the stop task.
* Both preemption and IRQs are still disabled.
*/
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
this_cpu_ptr(&push_work));
/*
@@ -7704,7 +8513,7 @@ static void balance_push(struct rq *rq)
* schedule(). The next pick is obviously going to be the stop task
* which kthread_is_per_cpu() and will push this task away.
*/
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
}
static void balance_push_set(int cpu, bool on)
@@ -7948,6 +8757,7 @@ static void sched_rq_cpu_starting(unsigned int cpu)
int sched_cpu_starting(unsigned int cpu)
{
+ sched_core_cpu_starting(cpu);
sched_rq_cpu_starting(cpu);
sched_tick_start(cpu);
return 0;
@@ -7994,7 +8804,7 @@ static void dump_rq_tasks(struct rq *rq, const char *loglvl)
struct task_struct *g, *p;
int cpu = cpu_of(rq);
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
for_each_process_thread(g, p) {
@@ -8046,6 +8856,7 @@ void __init sched_init_smp(void)
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
BUG();
+ current->flags &= ~PF_NO_SETAFFINITY;
sched_init_granularity();
init_sched_rt_class();
@@ -8167,7 +8978,7 @@ void __init sched_init(void)
struct rq *rq;
rq = cpu_rq(i);
- raw_spin_lock_init(&rq->lock);
+ raw_spin_lock_init(&rq->__lock);
rq->nr_running = 0;
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
@@ -8215,6 +9026,8 @@ void __init sched_init(void)
rq->online = 0;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
+ rq->wake_stamp = jiffies;
+ rq->wake_avg_idle = rq->avg_idle;
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -8232,6 +9045,16 @@ void __init sched_init(void)
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
+
+#ifdef CONFIG_SCHED_CORE
+ rq->core = NULL;
+ rq->core_pick = NULL;
+ rq->core_enabled = 0;
+ rq->core_tree = RB_ROOT;
+ rq->core_forceidle = false;
+
+ rq->core_cookie = 0UL;
+#endif
}
set_load_weight(&init_task, false);
@@ -8258,8 +9081,6 @@ void __init sched_init(void)
#endif
init_sched_fair_class();
- init_schedstats();
-
psi_init();
init_uclamp();
@@ -8681,7 +9502,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
#ifdef CONFIG_UCLAMP_TASK_GROUP
/* Propagate the effective uclamp value for the new group */
+ mutex_lock(&uclamp_mutex);
+ rcu_read_lock();
cpu_util_update_eff(css);
+ rcu_read_unlock();
+ mutex_unlock(&uclamp_mutex);
#endif
return 0;
@@ -8771,6 +9596,9 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
enum uclamp_id clamp_id;
unsigned int clamps;
+ lockdep_assert_held(&uclamp_mutex);
+ SCHED_WARN_ON(!rcu_read_lock_held());
+
css_for_each_descendant_pre(css, top_css) {
uc_parent = css_tg(css)->parent
? css_tg(css)->parent->uclamp : NULL;
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
new file mode 100644
index 000000000000..9a80e9a474c0
--- /dev/null
+++ b/kernel/sched/core_sched.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/prctl.h>
+#include "sched.h"
+
+/*
+ * A simple wrapper around refcount. An allocated sched_core_cookie's
+ * address is used to compute the cookie of the task.
+ */
+struct sched_core_cookie {
+ refcount_t refcnt;
+};
+
+unsigned long sched_core_alloc_cookie(void)
+{
+ struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL);
+ if (!ck)
+ return 0;
+
+ refcount_set(&ck->refcnt, 1);
+ sched_core_get();
+
+ return (unsigned long)ck;
+}
+
+void sched_core_put_cookie(unsigned long cookie)
+{
+ struct sched_core_cookie *ptr = (void *)cookie;
+
+ if (ptr && refcount_dec_and_test(&ptr->refcnt)) {
+ kfree(ptr);
+ sched_core_put();
+ }
+}
+
+unsigned long sched_core_get_cookie(unsigned long cookie)
+{
+ struct sched_core_cookie *ptr = (void *)cookie;
+
+ if (ptr)
+ refcount_inc(&ptr->refcnt);
+
+ return cookie;
+}
+
+/*
+ * sched_core_update_cookie - replace the cookie on a task
+ * @p: the task to update
+ * @cookie: the new cookie
+ *
+ * Effectively exchange the task cookie; caller is responsible for lifetimes on
+ * both ends.
+ *
+ * Returns: the old cookie
+ */
+unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie)
+{
+ unsigned long old_cookie;
+ struct rq_flags rf;
+ struct rq *rq;
+ bool enqueued;
+
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * Since creating a cookie implies sched_core_get(), and we cannot set
+	 * Since creating a cookie implies sched_core_get(), and since we
+	 * cannot set a cookie until after we've created it (nor destroy one
+	 * until after we've removed it), core scheduling must be enabled
+	 * here.
+ SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq));
+
+ enqueued = sched_core_enqueued(p);
+ if (enqueued)
+ sched_core_dequeue(rq, p);
+
+ old_cookie = p->core_cookie;
+ p->core_cookie = cookie;
+
+ if (enqueued)
+ sched_core_enqueue(rq, p);
+
+ /*
+	 * If the task is currently running, it may no longer be compatible after
+ * the cookie change, so enter the scheduler on its CPU to schedule it
+ * away.
+ */
+ if (task_running(rq, p))
+ resched_curr(rq);
+
+ task_rq_unlock(rq, p, &rf);
+
+ return old_cookie;
+}
+
+static unsigned long sched_core_clone_cookie(struct task_struct *p)
+{
+ unsigned long cookie, flags;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ cookie = sched_core_get_cookie(p->core_cookie);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ return cookie;
+}
+
+void sched_core_fork(struct task_struct *p)
+{
+ RB_CLEAR_NODE(&p->core_node);
+ p->core_cookie = sched_core_clone_cookie(current);
+}
+
+void sched_core_free(struct task_struct *p)
+{
+ sched_core_put_cookie(p->core_cookie);
+}
+
+static void __sched_core_set(struct task_struct *p, unsigned long cookie)
+{
+ cookie = sched_core_get_cookie(cookie);
+ cookie = sched_core_update_cookie(p, cookie);
+ sched_core_put_cookie(cookie);
+}
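__sched_core_set() is the usual refcounted-value exchange: take a reference on the new value, swap it in, and drop the reference that comes back out. A self-contained userspace sketch of the same dance, with hypothetical names:

#include <stdatomic.h>
#include <stdlib.h>

struct cookie {
	atomic_int refcnt;
};

static struct cookie *cookie_get(struct cookie *c)
{
	if (c)
		atomic_fetch_add(&c->refcnt, 1);
	return c;
}

static void cookie_put(struct cookie *c)
{
	if (c && atomic_fetch_sub(&c->refcnt, 1) == 1)
		free(c);
}

/* Replace *slot, transferring references so neither value leaks. */
static void cookie_set(struct cookie **slot, struct cookie *new)
{
	struct cookie *old = *slot;

	*slot = cookie_get(new);
	cookie_put(old);
}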
+
+/* Called from prctl interface: PR_SCHED_CORE */
+int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
+ unsigned long uaddr)
+{
+ unsigned long cookie = 0, id = 0;
+ struct task_struct *task, *p;
+ struct pid *grp;
+ int err = 0;
+
+ if (!static_branch_likely(&sched_smt_present))
+ return -ENODEV;
+
+ if (type > PIDTYPE_PGID || cmd >= PR_SCHED_CORE_MAX || pid < 0 ||
+ (cmd != PR_SCHED_CORE_GET && uaddr))
+ return -EINVAL;
+
+ rcu_read_lock();
+ if (pid == 0) {
+ task = current;
+ } else {
+ task = find_task_by_vpid(pid);
+ if (!task) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ }
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ /*
+ * Check if this process has the right to modify the specified
+ * process. Use the regular "ptrace_may_access()" checks.
+ */
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ switch (cmd) {
+ case PR_SCHED_CORE_GET:
+ if (type != PIDTYPE_PID || uaddr & 7) {
+ err = -EINVAL;
+ goto out;
+ }
+ cookie = sched_core_clone_cookie(task);
+ if (cookie) {
+ /* XXX improve ? */
+ ptr_to_hashval((void *)cookie, &id);
+ }
+ err = put_user(id, (u64 __user *)uaddr);
+ goto out;
+
+ case PR_SCHED_CORE_CREATE:
+ cookie = sched_core_alloc_cookie();
+ if (!cookie) {
+ err = -ENOMEM;
+ goto out;
+ }
+ break;
+
+ case PR_SCHED_CORE_SHARE_TO:
+ cookie = sched_core_clone_cookie(current);
+ break;
+
+ case PR_SCHED_CORE_SHARE_FROM:
+ if (type != PIDTYPE_PID) {
+ err = -EINVAL;
+ goto out;
+ }
+ cookie = sched_core_clone_cookie(task);
+ __sched_core_set(current, cookie);
+ goto out;
+
+ default:
+ err = -EINVAL;
+ goto out;
+ };
+
+ if (type == PIDTYPE_PID) {
+ __sched_core_set(task, cookie);
+ goto out;
+ }
+
+ read_lock(&tasklist_lock);
+ grp = task_pid_type(task, type);
+
+ do_each_pid_thread(grp, type, p) {
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) {
+ err = -EPERM;
+ goto out_tasklist;
+ }
+ } while_each_pid_thread(grp, type, p);
+
+ do_each_pid_thread(grp, type, p) {
+ __sched_core_set(p, cookie);
+ } while_each_pid_thread(grp, type, p);
+out_tasklist:
+ read_unlock(&tasklist_lock);
+
+out:
+ sched_core_put_cookie(cookie);
+ put_task_struct(task);
+ return err;
+}
+
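A userspace sketch of driving this interface, assuming a kernel carrying this patch: the fallback defines mirror the uapi header added by the series, and the pid-type argument 0 corresponds to PIDTYPE_PID (single task). Note the interface returns -ENODEV on machines without SMT.

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SCHED_CORE
#define PR_SCHED_CORE		62
#define PR_SCHED_CORE_GET	0
#define PR_SCHED_CORE_CREATE	1
#endif

int main(void)
{
	unsigned long long id = 0;

	/* Create a fresh cookie for the calling task (pid 0 == current). */
	if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, 0, 0))
		perror("PR_SCHED_CORE_CREATE");

	/* Read back the hashed cookie id; uaddr must be 8-byte aligned. */
	if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET, 0, 0,
		  (unsigned long)&id))
		perror("PR_SCHED_CORE_GET");

	printf("cookie id: %#llx\n", id);
	return 0;
}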
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 104a1bade14f..893eece65bfd 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -112,7 +112,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
/*
* Take rq->lock to make 64-bit read safe on 32-bit platforms.
*/
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
if (index == CPUACCT_STAT_NSTATS) {
@@ -126,7 +126,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
}
#ifndef CONFIG_64BIT
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_unlock_irq(cpu_rq(cpu));
#endif
return data;
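The lock here guards against torn reads: on a 32-bit target a 64-bit counter is loaded as two 32-bit halves, so a concurrent writer can be observed half-updated. An illustrative userspace sketch of the hazard and the fix, with hypothetical names:

#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t counter;

static uint64_t counter_read(void)
{
	uint64_t val;

	pthread_mutex_lock(&lock);	/* both halves observed consistently */
	val = counter;
	pthread_mutex_unlock(&lock);
	return val;
}

static void counter_add(uint64_t delta)
{
	pthread_mutex_lock(&lock);
	counter += delta;
	pthread_mutex_unlock(&lock);
}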
@@ -141,14 +141,14 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
/*
* Take rq->lock to make 64-bit write safe on 32-bit platforms.
*/
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
cpuusage->usages[i] = val;
#ifndef CONFIG_64BIT
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_unlock_irq(cpu_rq(cpu));
#endif
}
@@ -253,13 +253,13 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
* Take rq->lock to make 64-bit read safe on 32-bit
* platforms.
*/
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
seq_printf(m, " %llu", cpuusage->usages[index]);
#ifndef CONFIG_64BIT
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_unlock_irq(cpu_rq(cpu));
#endif
}
seq_puts(m, "\n");
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 4f09afd2f321..57124614363d 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -151,6 +151,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
unsigned int freq = arch_scale_freq_invariant() ?
policy->cpuinfo.max_freq : policy->cur;
+ util = map_util_perf(util);
freq = map_util_freq(util, freq, max);
if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
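Worked through, assuming map_util_perf() keeps the historical headroom definition of util + (util >> 2) from <linux/sched/cpufreq.h>: a utilization of 512 out of 1024 is boosted to 640 before being mapped to a frequency, i.e. the governor targets roughly 25% above current demand so there is slack to absorb load increases.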
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9a2989749b8d..3829c5a1b936 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -157,7 +157,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->running_bw += dl_bw;
SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
@@ -170,7 +170,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->running_bw -= dl_bw;
SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
if (dl_rq->running_bw > old)
@@ -184,7 +184,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->this_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->this_bw += dl_bw;
SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */
}
@@ -194,7 +194,7 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->this_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->this_bw -= dl_bw;
SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */
if (dl_rq->this_bw > old)
@@ -987,7 +987,7 @@ static int start_dl_timer(struct task_struct *p)
ktime_t now, act;
s64 delta;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
/*
* We want the timer to fire at the deadline, but considering
@@ -1097,9 +1097,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* If the runqueue is no longer available, migrate the
* task elsewhere. This necessarily changes rq.
*/
- lockdep_unpin_lock(&rq->lock, rf.cookie);
+ lockdep_unpin_lock(__rq_lockp(rq), rf.cookie);
rq = dl_task_offline_migration(rq, p);
- rf.cookie = lockdep_pin_lock(&rq->lock);
+ rf.cookie = lockdep_pin_lock(__rq_lockp(rq));
update_rq_clock(rq);
/*
@@ -1731,7 +1731,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
* from try_to_wake_up(). Hence, p->pi_lock is locked, but
* rq->lock is not... So, lock it
*/
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
if (p->dl.dl_non_contending) {
sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0;
@@ -1746,7 +1746,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
put_task_struct(p);
}
sub_rq_bw(&p->dl, &rq->dl);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
}
static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
@@ -1852,7 +1852,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}
-static struct task_struct *pick_next_task_dl(struct rq *rq)
+static struct task_struct *pick_task_dl(struct rq *rq)
{
struct sched_dl_entity *dl_se;
struct dl_rq *dl_rq = &rq->dl;
@@ -1864,7 +1864,18 @@ static struct task_struct *pick_next_task_dl(struct rq *rq)
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
p = dl_task_of(dl_se);
- set_next_task_dl(rq, p, true);
+
+ return p;
+}
+
+static struct task_struct *pick_next_task_dl(struct rq *rq)
+{
+ struct task_struct *p;
+
+ p = pick_task_dl(rq);
+ if (p)
+ set_next_task_dl(rq, p, true);
+
return p;
}
@@ -2291,10 +2302,10 @@ skip:
double_unlock_balance(this_rq, src_rq);
if (push_task) {
- raw_spin_unlock(&this_rq->lock);
+ raw_spin_rq_unlock(this_rq);
stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
push_task, &src_rq->push_work);
- raw_spin_lock(&this_rq->lock);
+ raw_spin_rq_lock(this_rq);
}
}
@@ -2539,6 +2550,7 @@ DEFINE_SCHED_CLASS(dl) = {
#ifdef CONFIG_SMP
.balance = balance_dl,
+ .pick_task = pick_task_dl,
.select_task_rq = select_task_rq_dl,
.migrate_task_rq = migrate_task_rq_dl,
.set_cpus_allowed = set_cpus_allowed_dl,
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c5aacbd492a1..0c5ec2776ddf 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -576,7 +576,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
SPLIT_NS(cfs_rq->exec_clock));
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_rq_lock_irqsave(rq, flags);
if (rb_first_cached(&cfs_rq->tasks_timeline))
MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
last = __pick_last_entity(cfs_rq);
@@ -584,7 +584,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
max_vruntime = last->vruntime;
min_vruntime = cfs_rq->min_vruntime;
rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
SPLIT_NS(MIN_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bfaa6e1f6067..5d1a6aace138 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -268,33 +268,11 @@ const struct sched_class fair_sched_class;
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
- SCHED_WARN_ON(!entity_is_task(se));
- return container_of(se, struct task_struct, se);
-}
/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
for (; se; se = se->parent)
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
- return p->se.cfs_rq;
-}
-
-/* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
- return se->cfs_rq;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
- return grp->my_q;
-}
-
static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
{
if (!path)
@@ -455,33 +433,9 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
#else /* !CONFIG_FAIR_GROUP_SCHED */
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
- return container_of(se, struct task_struct, se);
-}
-
#define for_each_sched_entity(se) \
for (; se; se = NULL)
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
- return &task_rq(p)->cfs;
-}
-
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
- struct task_struct *p = task_of(se);
- struct rq *rq = task_rq(p);
-
- return &rq->cfs;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
- return NULL;
-}
-
static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
{
if (path)
@@ -1107,7 +1061,7 @@ struct numa_group {
static struct numa_group *deref_task_numa_group(struct task_struct *p)
{
return rcu_dereference_check(p->numa_group, p == current ||
- (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
+ (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
}
static struct numa_group *deref_curr_numa_group(struct task_struct *p)
@@ -3139,7 +3093,7 @@ void reweight_task(struct task_struct *p, int prio)
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- (1)
- * \Sum grq->load.weight
+ * \Sum grq->load.weight
*
* Now, because computing that sum is prohibitively expensive to compute (been
* there, done that) we approximate it with this average stuff. The average
@@ -3153,7 +3107,7 @@ void reweight_task(struct task_struct *p, int prio)
*
* tg->weight * grq->avg.load_avg
* ge->load.weight = ------------------------------ (3)
- * tg->load_avg
+ * tg->load_avg
*
* Where: tg->load_avg ~= \Sum grq->avg.load_avg
*
@@ -3169,7 +3123,7 @@ void reweight_task(struct task_struct *p, int prio)
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- = tg->weight (4)
- * grp->load.weight
+ * grp->load.weight
*
* That is, the sum collapses because all other CPUs are idle; the UP scenario.
*
@@ -3188,7 +3142,7 @@ void reweight_task(struct task_struct *p, int prio)
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- (6)
- * tg_load_avg'
+ * tg_load_avg'
*
* Where:
*
@@ -3313,6 +3267,15 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
if (cfs_rq->avg.runnable_sum)
return false;
+ /*
+	 * _avg must be zero when _sum is zero, because _avg = _sum / divider.
+ * Make sure that rounding and/or propagation of PELT values never
+ * break this.
+ */
+ SCHED_WARN_ON(cfs_rq->avg.load_avg ||
+ cfs_rq->avg.util_avg ||
+ cfs_rq->avg.runnable_avg);
+
return true;
}
@@ -3566,9 +3529,12 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
load_sum = (s64)se_weight(se) * runnable_sum;
load_avg = div_s64(load_sum, divider);
+ se->avg.load_sum = runnable_sum;
+
delta = load_avg - se->avg.load_avg;
+ if (!delta)
+ return;
- se->avg.load_sum = runnable_sum;
se->avg.load_avg = load_avg;
add_positive(&cfs_rq->avg.load_avg, delta);
@@ -4448,6 +4414,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ clear_buddies(cfs_rq, se);
+
/* 'current' is not kept within the tree. */
if (se->on_rq) {
/*
@@ -4507,7 +4475,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
* Avoid running the skip buddy, if running something else can
* be done without getting too unfair.
*/
- if (cfs_rq->skip == se) {
+ if (cfs_rq->skip && cfs_rq->skip == se) {
struct sched_entity *second;
if (se == curr) {
@@ -4534,8 +4502,6 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
se = cfs_rq->last;
}
- clear_buddies(cfs_rq, se);
-
return se;
}
@@ -5357,7 +5323,7 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq)
{
struct task_group *tg;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -5376,7 +5342,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
struct task_group *tg;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -5964,11 +5930,15 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
+ struct rq *rq = cpu_rq(i);
+
+ if (!sched_core_cookie_match(rq, p))
+ continue;
+
if (sched_idle_cpu(i))
return i;
if (available_idle_cpu(i)) {
- struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
/*
@@ -6054,9 +6024,10 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
return new_cpu;
}
-static inline int __select_idle_cpu(int cpu)
+static inline int __select_idle_cpu(int cpu, struct task_struct *p)
{
- if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+ if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
+ sched_cpu_cookie_match(cpu_rq(cpu), p))
return cpu;
return -1;
@@ -6126,7 +6097,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
int cpu;
if (!static_branch_likely(&sched_smt_present))
- return __select_idle_cpu(core);
+ return __select_idle_cpu(core, p);
for_each_cpu(cpu, cpu_smt_mask(core)) {
if (!available_idle_cpu(cpu)) {
@@ -6182,7 +6153,7 @@ static inline bool test_idle_cores(int cpu, bool def)
static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
{
- return __select_idle_cpu(core);
+ return __select_idle_cpu(core, p);
}
static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
@@ -6201,9 +6172,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
+ struct rq *this_rq = this_rq();
int this = smp_processor_id();
struct sched_domain *this_sd;
- u64 time;
+ u64 time = 0;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -6213,12 +6185,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
if (sched_feat(SIS_PROP) && !has_idle_core) {
u64 avg_cost, avg_idle, span_avg;
+ unsigned long now = jiffies;
/*
- * Due to large variance we need a large fuzz factor;
- * hackbench in particularly is sensitive here.
+ * If we're busy, the assumption that the last idle period
+ * predicts the future is flawed; age away the remaining
+ * predicted idle time.
*/
- avg_idle = this_rq()->avg_idle / 512;
+ if (unlikely(this_rq->wake_stamp < now)) {
+ while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
+ this_rq->wake_stamp++;
+ this_rq->wake_avg_idle >>= 1;
+ }
+ }
+
+ avg_idle = this_rq->wake_avg_idle;
avg_cost = this_sd->avg_scan_cost + 1;
span_avg = sd->span_weight * avg_idle;
@@ -6239,7 +6220,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
} else {
if (!--nr)
return -1;
- idle_cpu = __select_idle_cpu(cpu);
+ idle_cpu = __select_idle_cpu(cpu, p);
if ((unsigned int)idle_cpu < nr_cpumask_bits)
break;
}
@@ -6250,6 +6231,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
if (sched_feat(SIS_PROP) && !has_idle_core) {
time = cpu_clock(this) - time;
+
+ /*
+ * Account for the scan cost of wakeups against the average
+ * idle time.
+ */
+ this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
+
update_avg(&this_sd->avg_scan_cost, time);
}
@@ -6317,6 +6305,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
task_util = uclamp_task_util(p);
}
+ /*
+ * per-cpu select_idle_mask usage
+ */
+ lockdep_assert_irqs_disabled();
+
if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
asym_fits_capacity(task_util, target))
return target;
@@ -6592,8 +6585,11 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
struct cpumask *pd_mask = perf_domain_span(pd);
unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
unsigned long max_util = 0, sum_util = 0;
+ unsigned long _cpu_cap = cpu_cap;
int cpu;
+ _cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
+
/*
* The capacity state of CPUs of the current rd can be driven by CPUs
* of another rd if they belong to the same pd. So, account for the
@@ -6629,8 +6625,10 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* is already enough to scale the EM reported power
* consumption at the (eventually clamped) cpu_capacity.
*/
- sum_util += effective_cpu_util(cpu, util_running, cpu_cap,
- ENERGY_UTIL, NULL);
+ cpu_util = effective_cpu_util(cpu, util_running, cpu_cap,
+ ENERGY_UTIL, NULL);
+
+ sum_util += min(cpu_util, _cpu_cap);
/*
* Performance domain frequency: utilization clamping
@@ -6641,10 +6639,10 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
*/
cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
FREQUENCY_UTIL, tsk);
- max_util = max(max_util, cpu_util);
+ max_util = max(max_util, min(cpu_util, _cpu_cap));
}
- return em_cpu_energy(pd->em_pd, max_util, sum_util);
+ return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
}
/*
@@ -6690,15 +6688,15 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ int cpu, best_energy_cpu = prev_cpu, target = -1;
unsigned long cpu_cap, util, base_energy = 0;
- int cpu, best_energy_cpu = prev_cpu;
struct sched_domain *sd;
struct perf_domain *pd;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
if (!pd || READ_ONCE(rd->overutilized))
- goto fail;
+ goto unlock;
/*
* Energy-aware wake-up happens on the lowest sched_domain starting
@@ -6708,7 +6706,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
sd = sd->parent;
if (!sd)
- goto fail;
+ goto unlock;
+
+ target = prev_cpu;
sync_entity_load_avg(&p->se);
if (!task_util_est(p))
@@ -6716,13 +6716,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
for (; pd; pd = pd->next) {
unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+ bool compute_prev_delta = false;
unsigned long base_energy_pd;
int max_spare_cap_cpu = -1;
- /* Compute the 'base' energy of the pd, without @p */
- base_energy_pd = compute_energy(p, -1, pd);
- base_energy += base_energy_pd;
-
for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
@@ -6743,26 +6740,40 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!fits_capacity(util, cpu_cap))
continue;
- /* Always use prev_cpu as a candidate. */
if (cpu == prev_cpu) {
- prev_delta = compute_energy(p, prev_cpu, pd);
- prev_delta -= base_energy_pd;
- best_delta = min(best_delta, prev_delta);
- }
-
- /*
- * Find the CPU with the maximum spare capacity in
- * the performance domain
- */
- if (spare_cap > max_spare_cap) {
+ /* Always use prev_cpu as a candidate. */
+ compute_prev_delta = true;
+ } else if (spare_cap > max_spare_cap) {
+ /*
+ * Find the CPU with the maximum spare capacity
+ * in the performance domain.
+ */
max_spare_cap = spare_cap;
max_spare_cap_cpu = cpu;
}
}
- /* Evaluate the energy impact of using this CPU. */
- if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
+ if (max_spare_cap_cpu < 0 && !compute_prev_delta)
+ continue;
+
+ /* Compute the 'base' energy of the pd, without @p */
+ base_energy_pd = compute_energy(p, -1, pd);
+ base_energy += base_energy_pd;
+
+ /* Evaluate the energy impact of using prev_cpu. */
+ if (compute_prev_delta) {
+ prev_delta = compute_energy(p, prev_cpu, pd);
+ if (prev_delta < base_energy_pd)
+ goto unlock;
+ prev_delta -= base_energy_pd;
+ best_delta = min(best_delta, prev_delta);
+ }
+
+ /* Evaluate the energy impact of using max_spare_cap_cpu. */
+ if (max_spare_cap_cpu >= 0) {
cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
+ if (cur_delta < base_energy_pd)
+ goto unlock;
cur_delta -= base_energy_pd;
if (cur_delta < best_delta) {
best_delta = cur_delta;
@@ -6770,25 +6781,22 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
}
}
}
-unlock:
rcu_read_unlock();
/*
* Pick the best CPU if prev_cpu cannot be used, or if it saves at
* least 6% of the energy used by prev_cpu.
*/
- if (prev_delta == ULONG_MAX)
- return best_energy_cpu;
-
- if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
- return best_energy_cpu;
+ if ((prev_delta == ULONG_MAX) ||
+ (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
+ target = best_energy_cpu;
- return prev_cpu;
+ return target;
-fail:
+unlock:
rcu_read_unlock();
- return -1;
+ return target;
}
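As a worked example of the (unchanged) threshold above: with base_energy = 800 and prev_delta = 200 in abstract energy units, best_energy_cpu is chosen over prev_cpu only if prev_delta - best_delta > (200 + 800) >> 4 = 62, i.e. only when moving the task saves more than roughly 6% of the total energy; otherwise target stays prev_cpu.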
/*
@@ -6800,8 +6808,6 @@ fail:
* certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
*
* Returns the target CPU number.
- *
- * preempt must be disabled.
*/
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
@@ -6814,6 +6820,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
/* SD_flags and WF_flags share the first nibble */
int sd_flag = wake_flags & 0xF;
+ /*
+ * required for stable ->cpus_allowed
+ */
+ lockdep_assert_held(&p->pi_lock);
if (wake_flags & WF_TTWU) {
record_wakee(p);
@@ -6903,7 +6913,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
* In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
* rq->lock and can modify state directly.
*/
- lockdep_assert_held(&task_rq(p)->lock);
+ lockdep_assert_rq_held(task_rq(p));
detach_entity_cfs_rq(&p->se);
} else {
@@ -7107,6 +7117,39 @@ preempt:
set_last_buddy(se);
}
+#ifdef CONFIG_SMP
+static struct task_struct *pick_task_fair(struct rq *rq)
+{
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+
+again:
+ cfs_rq = &rq->cfs;
+ if (!cfs_rq->nr_running)
+ return NULL;
+
+ do {
+ struct sched_entity *curr = cfs_rq->curr;
+
+ /* When we pick for a remote RQ, we'll not have done put_prev_entity() */
+ if (curr) {
+ if (curr->on_rq)
+ update_curr(cfs_rq);
+ else
+ curr = NULL;
+
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ goto again;
+ }
+
+ se = pick_next_entity(cfs_rq, curr);
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
+
+ return task_of(se);
+}
+#endif
+
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
@@ -7530,7 +7573,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
{
s64 delta;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
if (p->sched_class != &fair_sched_class)
return 0;
@@ -7552,6 +7595,14 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
if (sysctl_sched_migration_cost == -1)
return 1;
+
+ /*
+	 * Don't migrate the task if its cookie does not match
+	 * the destination CPU's core cookie.
+ */
+ if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
+ return 1;
+
if (sysctl_sched_migration_cost == 0)
return 0;
@@ -7628,7 +7679,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
/*
* We do not migrate tasks that are:
@@ -7717,7 +7768,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
*/
static void detach_task(struct task_struct *p, struct lb_env *env)
{
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
@@ -7733,7 +7784,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
{
struct task_struct *p;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
list_for_each_entry_reverse(p,
&env->src_rq->cfs_tasks, se.group_node) {
@@ -7769,7 +7820,7 @@ static int detach_tasks(struct lb_env *env)
struct task_struct *p;
int detached = 0;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
/*
* Source run queue has been emptied by another CPU, clear
@@ -7899,7 +7950,7 @@ next:
*/
static void attach_task(struct rq *rq, struct task_struct *p)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
BUG_ON(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
@@ -8865,6 +8916,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
p->cpus_ptr))
continue;
+ /* Skip over this group if no cookie matched */
+ if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
+ continue;
+
local_group = cpumask_test_cpu(this_cpu,
sched_group_span(group));
@@ -9793,7 +9848,7 @@ more_balance:
if (need_active_balance(&env)) {
unsigned long flags;
- raw_spin_lock_irqsave(&busiest->lock, flags);
+ raw_spin_rq_lock_irqsave(busiest, flags);
/*
* Don't kick the active_load_balance_cpu_stop,
@@ -9801,8 +9856,7 @@ more_balance:
* moved to this_cpu:
*/
if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
- raw_spin_unlock_irqrestore(&busiest->lock,
- flags);
+ raw_spin_rq_unlock_irqrestore(busiest, flags);
goto out_one_pinned;
}
@@ -9819,7 +9873,7 @@ more_balance:
busiest->push_cpu = this_cpu;
active_balance = 1;
}
- raw_spin_unlock_irqrestore(&busiest->lock, flags);
+ raw_spin_rq_unlock_irqrestore(busiest, flags);
if (active_balance) {
stop_one_cpu_nowait(cpu_of(busiest),
@@ -10604,6 +10658,14 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
u64 curr_cost = 0;
update_misfit_status(NULL, this_rq);
+
+ /*
+ * There is a task waiting to run. No need to search for one.
+ * Return 0; the task will be enqueued when switching to idle.
+ */
+ if (this_rq->ttwu_pending)
+ return 0;
+
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
@@ -10636,7 +10698,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
goto out;
}
- raw_spin_unlock(&this_rq->lock);
+ raw_spin_rq_unlock(this_rq);
update_blocked_averages(this_cpu);
rcu_read_lock();
@@ -10669,12 +10731,13 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
* Stop searching for tasks to pull if there are
* now runnable tasks on this rq.
*/
- if (pulled_task || this_rq->nr_running > 0)
+ if (pulled_task || this_rq->nr_running > 0 ||
+ this_rq->ttwu_pending)
break;
}
rcu_read_unlock();
- raw_spin_lock(&this_rq->lock);
+ raw_spin_rq_lock(this_rq);
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
@@ -10767,6 +10830,119 @@ static void rq_offline_fair(struct rq *rq)
#endif /* CONFIG_SMP */
+#ifdef CONFIG_SCHED_CORE
+static inline bool
+__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
+{
+ u64 slice = sched_slice(cfs_rq_of(se), se);
+ u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+
+ return (rtime * min_nr_tasks > slice);
+}
+
+#define MIN_NR_TASKS_DURING_FORCEIDLE 2
+static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
+{
+ if (!sched_core_enabled(rq))
+ return;
+
+ /*
+ * If runqueue has only one task which used up its slice and
+ * if the sibling is forced idle, then trigger schedule to
+ * give forced idle task a chance.
+ *
+ * sched_slice() considers only this active rq and it gets the
+ * whole slice. But during force idle, we have siblings acting
+ * like a single runqueue and hence we need to consider runnable
+ * tasks on this CPU and the forced idle CPU. Ideally, we should
+ * go through the forced idle rq, but that would be a perf hit.
+ * We can assume that the forced idle CPU has at least
+ * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
+ * if we need to give up the CPU: with the default of 2, that means
+ * rescheduling once the current task has used over half its slice.
+ */
+ if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
+ __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
+ resched_curr(rq);
+}
+
+/*
+ * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
+ */
+static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (forceidle) {
+ if (cfs_rq->forceidle_seq == fi_seq)
+ break;
+ cfs_rq->forceidle_seq = fi_seq;
+ }
+
+ cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
+ }
+}
+
+void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
+{
+ struct sched_entity *se = &p->se;
+
+ if (p->sched_class != &fair_sched_class)
+ return;
+
+ se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
+}
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
+{
+ struct rq *rq = task_rq(a);
+ struct sched_entity *sea = &a->se;
+ struct sched_entity *seb = &b->se;
+ struct cfs_rq *cfs_rqa;
+ struct cfs_rq *cfs_rqb;
+ s64 delta;
+
+ SCHED_WARN_ON(task_rq(b)->core != rq->core);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Find an se in the hierarchy for tasks a and b, such that the se's
+ * are immediate siblings.
+ */
+ while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
+ int sea_depth = sea->depth;
+ int seb_depth = seb->depth;
+
+ if (sea_depth >= seb_depth)
+ sea = parent_entity(sea);
+ if (sea_depth <= seb_depth)
+ seb = parent_entity(seb);
+ }
+
+ se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
+ se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
+
+ cfs_rqa = sea->cfs_rq;
+ cfs_rqb = seb->cfs_rq;
+#else
+ cfs_rqa = &task_rq(a)->cfs;
+ cfs_rqb = &task_rq(b)->cfs;
+#endif
+
+ /*
+ * Find delta after normalizing se's vruntime with its cfs_rq's
+ * min_vruntime_fi, which would have been updated in prior calls
+ * to se_fi_update().
+ */
+ delta = (s64)(sea->vruntime - seb->vruntime) +
+ (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+
+ return delta > 0;
+}
+#else
+static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
+#endif
+
/*
* scheduler tick hitting a task of our scheduling class.
*
@@ -10790,6 +10966,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
update_misfit_status(curr, rq);
update_overutilized_status(task_rq(curr));
+
+ task_tick_core(rq, curr);
}
/*
@@ -11161,9 +11339,9 @@ void unregister_fair_sched_group(struct task_group *tg)
rq = cpu_rq(cpu);
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_rq_lock_irqsave(rq, flags);
list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
}
}
@@ -11285,6 +11463,7 @@ DEFINE_SCHED_CLASS(fair) = {
#ifdef CONFIG_SMP
.balance = balance_fair,
+ .pick_task = pick_task_fair,
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,
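
For illustration, the cross-runqueue comparison implemented by cfs_prio_less() above reduces to normalizing each entity's vruntime against its own cfs_rq's min_vruntime_fi snapshot and comparing the results. A minimal userspace sketch with made-up values (the toy_* names are illustrative stand-ins, not kernel API):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Illustrative stand-ins for se->vruntime and cfs_rq->min_vruntime_fi. */
	struct toy_se { int64_t vruntime; };
	struct toy_cfs_rq { int64_t min_vruntime_fi; };

	/*
	 * Mirrors the delta computation in cfs_prio_less(): normalize each
	 * vruntime against its own runqueue's forced-idle snapshot, then
	 * compare. Returns true when a's normalized vruntime is larger,
	 * i.e. b should run first.
	 */
	static bool toy_cfs_prio_less(struct toy_se a, struct toy_cfs_rq rqa,
				      struct toy_se b, struct toy_cfs_rq rqb)
	{
		int64_t delta = (a.vruntime - b.vruntime) +
				(rqb.min_vruntime_fi - rqa.min_vruntime_fi);
		return delta > 0;
	}

	int main(void)
	{
		struct toy_se a = { 1000 }, b = { 900 };
		struct toy_cfs_rq rqa = { 800 }, rqb = { 850 };

		/* a ran 200 beyond its snapshot, b only 50, so b is preferred. */
		printf("%d\n", toy_cfs_prio_less(a, rqa, b, rqb));	/* prints 1 */
		return 0;
	}
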
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7ca3d3d86c2a..912b47aa99d8 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -437,8 +437,16 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir
{
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
+ queue_core_balance(rq);
}
+#ifdef CONFIG_SMP
+static struct task_struct *pick_task_idle(struct rq *rq)
+{
+ return rq->idle;
+}
+#endif
+
struct task_struct *pick_next_task_idle(struct rq *rq)
{
struct task_struct *next = rq->idle;
@@ -455,10 +463,10 @@ struct task_struct *pick_next_task_idle(struct rq *rq)
static void
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
- raw_spin_unlock_irq(&rq->lock);
+ raw_spin_rq_unlock_irq(rq);
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
dump_stack();
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_rq_lock_irq(rq);
}
/*
@@ -506,6 +514,7 @@ DEFINE_SCHED_CLASS(idle) = {
#ifdef CONFIG_SMP
.balance = balance_idle,
+ .pick_task = pick_task_idle,
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 5a6ea03f9882..7f06eaf12818 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -81,11 +81,9 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
{
cpumask_var_t non_housekeeping_mask;
cpumask_var_t tmp;
- int err;
alloc_bootmem_cpumask_var(&non_housekeeping_mask);
- err = cpulist_parse(str, non_housekeeping_mask);
- if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) {
+ if (cpulist_parse(str, non_housekeeping_mask) < 0) {
pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
free_bootmem_cpumask_var(non_housekeeping_mask);
return 0;
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 1c79896f1bc0..954b229868d9 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -81,7 +81,7 @@ long calc_load_fold_active(struct rq *this_rq, long adjust)
long nr_active, delta = 0;
nr_active = this_rq->nr_running - adjust;
- nr_active += (long)this_rq->nr_uninterruptible;
+ nr_active += (int)this_rq->nr_uninterruptible;
if (nr_active != this_rq->calc_load_active) {
delta = nr_active - this_rq->calc_load_active;
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index cfe94ffd2b38..e06071bf3472 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -132,7 +132,7 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
static inline u64 rq_clock_pelt(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
assert_clock_updated(rq);
return rq->clock_pelt - rq->lost_idle_time;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c286e5ba3c94..a5254471371c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -888,7 +888,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (skip)
continue;
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
update_rq_clock(rq);
if (rt_rq->rt_time) {
@@ -926,7 +926,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (enqueue)
sched_rt_rq_enqueue(rt_rq);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
}
if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
@@ -1626,7 +1626,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
return rt_task_of(rt_se);
}
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *pick_task_rt(struct rq *rq)
{
struct task_struct *p;
@@ -1634,7 +1634,17 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
return NULL;
p = _pick_next_task_rt(rq);
- set_next_task_rt(rq, p, true);
+
+ return p;
+}
+
+static struct task_struct *pick_next_task_rt(struct rq *rq)
+{
+ struct task_struct *p = pick_task_rt(rq);
+
+ if (p)
+ set_next_task_rt(rq, p, true);
+
return p;
}
@@ -1894,10 +1904,10 @@ retry:
*/
push_task = get_push_task(rq);
if (push_task) {
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
push_task, &rq->push_work);
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
}
return 0;
@@ -2122,10 +2132,10 @@ void rto_push_irq_work_func(struct irq_work *work)
* When it gets updated, a check is made if a push is possible.
*/
if (has_pushable_tasks(rq)) {
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
while (push_rt_task(rq, true))
;
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
}
raw_spin_lock(&rd->rto_lock);
@@ -2243,10 +2253,10 @@ skip:
double_unlock_balance(this_rq, src_rq);
if (push_task) {
- raw_spin_unlock(&this_rq->lock);
+ raw_spin_rq_unlock(this_rq);
stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
push_task, &src_rq->push_work);
- raw_spin_lock(&this_rq->lock);
+ raw_spin_rq_lock(this_rq);
}
}
@@ -2483,6 +2493,7 @@ DEFINE_SCHED_CLASS(rt) = {
#ifdef CONFIG_SMP
.balance = balance_rt,
+ .pick_task = pick_task_rt,
.select_task_rq = select_task_rq_rt,
.set_cpus_allowed = set_cpus_allowed_common,
.rq_online = rq_online_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a189bec13729..01e48f682d54 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -526,6 +526,11 @@ struct cfs_rq {
u64 exec_clock;
u64 min_vruntime;
+#ifdef CONFIG_SCHED_CORE
+ unsigned int forceidle_seq;
+ u64 min_vruntime_fi;
+#endif
+
#ifndef CONFIG_64BIT
u64 min_vruntime_copy;
#endif
@@ -631,8 +636,8 @@ struct rt_rq {
} highest_prio;
#endif
#ifdef CONFIG_SMP
- unsigned long rt_nr_migratory;
- unsigned long rt_nr_total;
+ unsigned int rt_nr_migratory;
+ unsigned int rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
@@ -646,7 +651,7 @@ struct rt_rq {
raw_spinlock_t rt_runtime_lock;
#ifdef CONFIG_RT_GROUP_SCHED
- unsigned long rt_nr_boosted;
+ unsigned int rt_nr_boosted;
struct rq *rq;
struct task_group *tg;
@@ -663,7 +668,7 @@ struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
struct rb_root_cached root;
- unsigned long dl_nr_running;
+ unsigned int dl_nr_running;
#ifdef CONFIG_SMP
/*
@@ -677,7 +682,7 @@ struct dl_rq {
u64 next;
} earliest_dl;
- unsigned long dl_nr_migratory;
+ unsigned int dl_nr_migratory;
int overloaded;
/*
@@ -905,7 +910,7 @@ DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
*/
struct rq {
/* runqueue lock: */
- raw_spinlock_t lock;
+ raw_spinlock_t __lock;
/*
* nr_running and cpu_load should be in the same cacheline because
@@ -955,7 +960,7 @@ struct rq {
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
- unsigned long nr_uninterruptible;
+ unsigned int nr_uninterruptible;
struct task_struct __rcu *curr;
struct task_struct *idle;
@@ -1017,6 +1022,9 @@ struct rq {
u64 idle_stamp;
u64 avg_idle;
+ unsigned long wake_stamp;
+ u64 wake_avg_idle;
+
/* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost;
@@ -1075,6 +1083,22 @@ struct rq {
#endif
unsigned int push_busy;
struct cpu_stop_work push_work;
+
+#ifdef CONFIG_SCHED_CORE
+ /* per rq */
+ struct rq *core;
+ struct task_struct *core_pick;
+ unsigned int core_enabled;
+ unsigned int core_sched_seq;
+ struct rb_root core_tree;
+
+ /* shared state */
+ unsigned int core_task_seq;
+ unsigned int core_pick_seq;
+ unsigned long core_cookie;
+ unsigned char core_forceidle;
+ unsigned int core_forceidle_seq;
+#endif
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1113,6 +1137,206 @@ static inline bool is_migration_disabled(struct task_struct *p)
#endif
}
+struct sched_group;
+#ifdef CONFIG_SCHED_CORE
+static inline struct cpumask *sched_group_span(struct sched_group *sg);
+
+DECLARE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+static inline bool sched_core_enabled(struct rq *rq)
+{
+ return static_branch_unlikely(&__sched_core_enabled) && rq->core_enabled;
+}
+
+static inline bool sched_core_disabled(void)
+{
+ return !static_branch_unlikely(&__sched_core_enabled);
+}
+
+/*
+ * Be careful with this function; not for general use. The return value isn't
+ * stable unless you actually hold a relevant rq->__lock.
+ */
+static inline raw_spinlock_t *rq_lockp(struct rq *rq)
+{
+ if (sched_core_enabled(rq))
+ return &rq->core->__lock;
+
+ return &rq->__lock;
+}
+
+static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
+{
+ if (rq->core_enabled)
+ return &rq->core->__lock;
+
+ return &rq->__lock;
+}
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi);
+
+/*
+ * Helpers to check if the CPU's core cookie matches with the task's cookie
+ * when core scheduling is enabled.
+ * A special case is that the task's cookie always matches with CPU's core
+ * cookie if the CPU is in an idle core.
+ */
+static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ /* Ignore cookie match if core scheduler is not enabled on the CPU. */
+ if (!sched_core_enabled(rq))
+ return true;
+
+ return rq->core->core_cookie == p->core_cookie;
+}
+
+static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ bool idle_core = true;
+ int cpu;
+
+ /* Ignore cookie match if core scheduler is not enabled on the CPU. */
+ if (!sched_core_enabled(rq))
+ return true;
+
+ for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) {
+ if (!available_idle_cpu(cpu)) {
+ idle_core = false;
+ break;
+ }
+ }
+
+ /*
+ * A CPU in an idle core is always the best choice for tasks with
+ * cookies.
+ */
+ return idle_core || rq->core->core_cookie == p->core_cookie;
+}
+
+static inline bool sched_group_cookie_match(struct rq *rq,
+ struct task_struct *p,
+ struct sched_group *group)
+{
+ int cpu;
+
+ /* Ignore cookie match if core scheduler is not enabled on the CPU. */
+ if (!sched_core_enabled(rq))
+ return true;
+
+ for_each_cpu_and(cpu, sched_group_span(group), p->cpus_ptr) {
+ if (sched_core_cookie_match(rq, p))
+ return true;
+ }
+ return false;
+}
+
+extern void queue_core_balance(struct rq *rq);
+
+static inline bool sched_core_enqueued(struct task_struct *p)
+{
+ return !RB_EMPTY_NODE(&p->core_node);
+}
+
+extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
+extern void sched_core_dequeue(struct rq *rq, struct task_struct *p);
+
+extern void sched_core_get(void);
+extern void sched_core_put(void);
+
+extern unsigned long sched_core_alloc_cookie(void);
+extern void sched_core_put_cookie(unsigned long cookie);
+extern unsigned long sched_core_get_cookie(unsigned long cookie);
+extern unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie);
+
+#else /* !CONFIG_SCHED_CORE */
+
+static inline bool sched_core_enabled(struct rq *rq)
+{
+ return false;
+}
+
+static inline bool sched_core_disabled(void)
+{
+ return true;
+}
+
+static inline raw_spinlock_t *rq_lockp(struct rq *rq)
+{
+ return &rq->__lock;
+}
+
+static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
+{
+ return &rq->__lock;
+}
+
+static inline void queue_core_balance(struct rq *rq)
+{
+}
+
+static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ return true;
+}
+
+static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ return true;
+}
+
+static inline bool sched_group_cookie_match(struct rq *rq,
+ struct task_struct *p,
+ struct sched_group *group)
+{
+ return true;
+}
+#endif /* CONFIG_SCHED_CORE */
+
+static inline void lockdep_assert_rq_held(struct rq *rq)
+{
+ lockdep_assert_held(__rq_lockp(rq));
+}
+
+extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass);
+extern bool raw_spin_rq_trylock(struct rq *rq);
+extern void raw_spin_rq_unlock(struct rq *rq);
+
+static inline void raw_spin_rq_lock(struct rq *rq)
+{
+ raw_spin_rq_lock_nested(rq, 0);
+}
+
+static inline void raw_spin_rq_lock_irq(struct rq *rq)
+{
+ local_irq_disable();
+ raw_spin_rq_lock(rq);
+}
+
+static inline void raw_spin_rq_unlock_irq(struct rq *rq)
+{
+ raw_spin_rq_unlock(rq);
+ local_irq_enable();
+}
+
+static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq)
+{
+ unsigned long flags;
+ local_irq_save(flags);
+ raw_spin_rq_lock(rq);
+ return flags;
+}
+
+static inline void raw_spin_rq_unlock_irqrestore(struct rq *rq, unsigned long flags)
+{
+ raw_spin_rq_unlock(rq);
+ local_irq_restore(flags);
+}
+
+#define raw_spin_rq_lock_irqsave(rq, flags) \
+do { \
+ flags = _raw_spin_rq_lock_irqsave(rq); \
+} while (0)
+
#ifdef CONFIG_SCHED_SMT
extern void __update_idle_core(struct rq *rq);
@@ -1134,6 +1358,57 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+ SCHED_WARN_ON(!entity_is_task(se));
+ return container_of(se, struct task_struct, se);
+}
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return p->se.cfs_rq;
+}
+
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+ return se->cfs_rq;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return grp->my_q;
+}
+
+#else
+
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+ return container_of(se, struct task_struct, se);
+}
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return &task_rq(p)->cfs;
+}
+
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return NULL;
+}
+#endif
+
extern void update_rq_clock(struct rq *rq);
static inline u64 __rq_clock_broken(struct rq *rq)
@@ -1179,7 +1454,7 @@ static inline void assert_clock_updated(struct rq *rq)
static inline u64 rq_clock(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
assert_clock_updated(rq);
return rq->clock;
@@ -1187,7 +1462,7 @@ static inline u64 rq_clock(struct rq *rq)
static inline u64 rq_clock_task(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
assert_clock_updated(rq);
return rq->clock_task;
@@ -1213,7 +1488,7 @@ static inline u64 rq_clock_thermal(struct rq *rq)
static inline void rq_clock_skip_update(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
rq->clock_update_flags |= RQCF_REQ_SKIP;
}
@@ -1223,7 +1498,7 @@ static inline void rq_clock_skip_update(struct rq *rq)
*/
static inline void rq_clock_cancel_skipupdate(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
rq->clock_update_flags &= ~RQCF_REQ_SKIP;
}
@@ -1254,7 +1529,7 @@ extern struct callback_head balance_push_callback;
*/
static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
{
- rf->cookie = lockdep_pin_lock(&rq->lock);
+ rf->cookie = lockdep_pin_lock(__rq_lockp(rq));
#ifdef CONFIG_SCHED_DEBUG
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
@@ -1272,12 +1547,12 @@ static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
rf->clock_update_flags = RQCF_UPDATED;
#endif
- lockdep_unpin_lock(&rq->lock, rf->cookie);
+ lockdep_unpin_lock(__rq_lockp(rq), rf->cookie);
}
static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
{
- lockdep_repin_lock(&rq->lock, rf->cookie);
+ lockdep_repin_lock(__rq_lockp(rq), rf->cookie);
#ifdef CONFIG_SCHED_DEBUG
/*
@@ -1298,7 +1573,7 @@ static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
}
static inline void
@@ -1307,7 +1582,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(p->pi_lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
@@ -1315,7 +1590,7 @@ static inline void
rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock_irqsave(&rq->lock, rf->flags);
+ raw_spin_rq_lock_irqsave(rq, rf->flags);
rq_pin_lock(rq, rf);
}
@@ -1323,7 +1598,7 @@ static inline void
rq_lock_irq(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_rq_lock_irq(rq);
rq_pin_lock(rq, rf);
}
@@ -1331,7 +1606,7 @@ static inline void
rq_lock(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
rq_pin_lock(rq, rf);
}
@@ -1339,7 +1614,7 @@ static inline void
rq_relock(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
rq_repin_lock(rq, rf);
}
@@ -1348,7 +1623,7 @@ rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
+ raw_spin_rq_unlock_irqrestore(rq, rf->flags);
}
static inline void
@@ -1356,7 +1631,7 @@ rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock_irq(&rq->lock);
+ raw_spin_rq_unlock_irq(rq);
}
static inline void
@@ -1364,7 +1639,7 @@ rq_unlock(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
}
static inline struct rq *
@@ -1429,7 +1704,7 @@ queue_balance_callback(struct rq *rq,
struct callback_head *head,
void (*func)(struct rq *rq))
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
if (unlikely(head->next || rq->balance_callback == &balance_push_callback))
return;
@@ -1844,6 +2119,9 @@ struct sched_class {
#ifdef CONFIG_SMP
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
+
+ struct task_struct * (*pick_task)(struct rq *rq);
+
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
@@ -1893,7 +2171,6 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
static inline void set_next_task(struct rq *rq, struct task_struct *next)
{
- WARN_ON_ONCE(rq->curr != next);
next->sched_class->set_next_task(rq, next, false);
}
@@ -1969,7 +2246,7 @@ static inline struct task_struct *get_push_task(struct rq *rq)
{
struct task_struct *p = rq->curr;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
if (rq->push_busy)
return NULL;
@@ -2181,10 +2458,38 @@ unsigned long arch_scale_freq_capacity(int cpu)
}
#endif
+
#ifdef CONFIG_SMP
-#ifdef CONFIG_PREEMPTION
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
+static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
+{
+#ifdef CONFIG_SCHED_CORE
+ /*
+ * In order to not have {0,2},{1,3} turn into an AB-BA,
+ * order by core-id first and cpu-id second.
+ *
+ * Notably:
+ *
+ * double_rq_lock(0,3); will take core-0, core-1 lock
+ * double_rq_lock(1,2); will take core-1, core-0 lock
+ *
+ * when only cpu-id is considered.
+ */
+ if (rq1->core->cpu < rq2->core->cpu)
+ return true;
+ if (rq1->core->cpu > rq2->core->cpu)
+ return false;
+
+ /*
+ * __sched_core_flip() relies on SMT having cpu-id lock order.
+ */
+#endif
+ return rq1->cpu < rq2->cpu;
+}
+
+extern void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
+#ifdef CONFIG_PREEMPTION
/*
* fair double_lock_balance: Safely acquires both rq->locks in a fair
@@ -2199,7 +2504,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
- raw_spin_unlock(&this_rq->lock);
+ raw_spin_rq_unlock(this_rq);
double_rq_lock(this_rq, busiest);
return 1;
@@ -2218,20 +2523,21 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
- int ret = 0;
-
- if (unlikely(!raw_spin_trylock(&busiest->lock))) {
- if (busiest < this_rq) {
- raw_spin_unlock(&this_rq->lock);
- raw_spin_lock(&busiest->lock);
- raw_spin_lock_nested(&this_rq->lock,
- SINGLE_DEPTH_NESTING);
- ret = 1;
- } else
- raw_spin_lock_nested(&busiest->lock,
- SINGLE_DEPTH_NESTING);
+ if (__rq_lockp(this_rq) == __rq_lockp(busiest))
+ return 0;
+
+ if (likely(raw_spin_rq_trylock(busiest)))
+ return 0;
+
+ if (rq_order_less(this_rq, busiest)) {
+ raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
+ return 0;
}
- return ret;
+
+ raw_spin_rq_unlock(this_rq);
+ double_rq_lock(this_rq, busiest);
+
+ return 1;
}
#endif /* CONFIG_PREEMPTION */
@@ -2241,11 +2547,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
*/
static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
- if (unlikely(!irqs_disabled())) {
- /* printk() doesn't work well under rq->lock */
- raw_spin_unlock(&this_rq->lock);
- BUG_ON(1);
- }
+ lockdep_assert_irqs_disabled();
return _double_lock_balance(this_rq, busiest);
}
@@ -2253,8 +2555,9 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
- raw_spin_unlock(&busiest->lock);
- lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+ if (__rq_lockp(this_rq) != __rq_lockp(busiest))
+ raw_spin_rq_unlock(busiest);
+ lock_set_subclass(&__rq_lockp(this_rq)->dep_map, 0, _RET_IP_);
}
static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
@@ -2285,31 +2588,6 @@ static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
}
/*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
- __acquires(rq1->lock)
- __acquires(rq2->lock)
-{
- BUG_ON(!irqs_disabled());
- if (rq1 == rq2) {
- raw_spin_lock(&rq1->lock);
- __acquire(rq2->lock); /* Fake it out ;) */
- } else {
- if (rq1 < rq2) {
- raw_spin_lock(&rq1->lock);
- raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
- } else {
- raw_spin_lock(&rq2->lock);
- raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
- }
- }
-}
-
-/*
* double_rq_unlock - safely unlock two runqueues
*
* Note this does not restore interrupts like task_rq_unlock,
@@ -2319,11 +2597,11 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__releases(rq1->lock)
__releases(rq2->lock)
{
- raw_spin_unlock(&rq1->lock);
- if (rq1 != rq2)
- raw_spin_unlock(&rq2->lock);
+ if (__rq_lockp(rq1) != __rq_lockp(rq2))
+ raw_spin_rq_unlock(rq2);
else
__release(rq2->lock);
+ raw_spin_rq_unlock(rq1);
}
extern void set_rq_online (struct rq *rq);
@@ -2344,7 +2622,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
{
BUG_ON(!irqs_disabled());
BUG_ON(rq1 != rq2);
- raw_spin_lock(&rq1->lock);
+ raw_spin_rq_lock(rq1);
__acquire(rq2->lock); /* Fake it out ;) */
}
@@ -2359,7 +2637,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__releases(rq2->lock)
{
BUG_ON(rq1 != rq2);
- raw_spin_unlock(&rq1->lock);
+ raw_spin_rq_unlock(rq1);
__release(rq2->lock);
}
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index dc218e9f4558..111072ee9663 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -25,7 +25,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
}
static inline void
-rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+rq_sched_info_dequeue(struct rq *rq, unsigned long long delta)
{
if (rq)
rq->rq_sched_info.run_delay += delta;
@@ -42,7 +42,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
#else /* !CONFIG_SCHEDSTATS: */
static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
-static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { }
+static inline void rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { }
static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
# define schedstat_enabled() 0
# define __schedstat_inc(var) do { } while (0)
@@ -150,29 +150,24 @@ static inline void psi_sched_switch(struct task_struct *prev,
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
-static inline void sched_info_reset_dequeued(struct task_struct *t)
-{
- t->sched_info.last_queued = 0;
-}
-
/*
* We are interested in knowing how long it was from the *first* time a
* task was queued to the time that it finally hit a CPU, we call this routine
* from dequeue_task() to account for possible rq->clock skew across CPUs. The
* delta taken on each CPU would annul the skew.
*/
-static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
+static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
{
- unsigned long long now = rq_clock(rq), delta = 0;
+ unsigned long long delta = 0;
- if (sched_info_on()) {
- if (t->sched_info.last_queued)
- delta = now - t->sched_info.last_queued;
- }
- sched_info_reset_dequeued(t);
+ if (!t->sched_info.last_queued)
+ return;
+
+ delta = rq_clock(rq) - t->sched_info.last_queued;
+ t->sched_info.last_queued = 0;
t->sched_info.run_delay += delta;
- rq_sched_info_dequeued(rq, delta);
+ rq_sched_info_dequeue(rq, delta);
}
/*
@@ -182,11 +177,14 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
*/
static void sched_info_arrive(struct rq *rq, struct task_struct *t)
{
- unsigned long long now = rq_clock(rq), delta = 0;
+ unsigned long long now, delta = 0;
+
+ if (!t->sched_info.last_queued)
+ return;
- if (t->sched_info.last_queued)
- delta = now - t->sched_info.last_queued;
- sched_info_reset_dequeued(t);
+ now = rq_clock(rq);
+ delta = now - t->sched_info.last_queued;
+ t->sched_info.last_queued = 0;
t->sched_info.run_delay += delta;
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
@@ -197,14 +195,12 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
/*
* This function is only called from enqueue_task(), but also only updates
* the timestamp if it is already not set. It's assumed that
- * sched_info_dequeued() will clear that stamp when appropriate.
+ * sched_info_dequeue() will clear that stamp when appropriate.
*/
-static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
+static inline void sched_info_enqueue(struct rq *rq, struct task_struct *t)
{
- if (sched_info_on()) {
- if (!t->sched_info.last_queued)
- t->sched_info.last_queued = rq_clock(rq);
- }
+ if (!t->sched_info.last_queued)
+ t->sched_info.last_queued = rq_clock(rq);
}
/*
@@ -212,7 +208,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
* due, typically, to expiring its time slice (this may also be called when
* switching to the idle task). Now we can calculate how long we ran.
* Also, if the process is still in the TASK_RUNNING state, call
- * sched_info_queued() to mark that it has now again started waiting on
+ * sched_info_enqueue() to mark that it has now again started waiting on
* the runqueue.
*/
static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
@@ -222,7 +218,7 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
rq_sched_info_depart(rq, delta);
if (t->state == TASK_RUNNING)
- sched_info_queued(rq, t);
+ sched_info_enqueue(rq, t);
}
/*
@@ -231,7 +227,7 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
* the idle task.) We are only called when prev != next.
*/
static inline void
-__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
+sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
/*
* prev now departs the CPU. It's not interesting to record
@@ -245,18 +241,8 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct
sched_info_arrive(rq, next);
}
-static inline void
-sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
-{
- if (sched_info_on())
- __sched_info_switch(rq, prev, next);
-}
-
#else /* !CONFIG_SCHED_INFO: */
-# define sched_info_queued(rq, t) do { } while (0)
-# define sched_info_reset_dequeued(t) do { } while (0)
-# define sched_info_dequeued(rq, t) do { } while (0)
-# define sched_info_depart(rq, t) do { } while (0)
-# define sched_info_arrive(rq, next) do { } while (0)
+# define sched_info_enqueue(rq, t) do { } while (0)
+# define sched_info_dequeue(rq, t) do { } while (0)
# define sched_info_switch(rq, t, next) do { } while (0)
#endif /* CONFIG_SCHED_INFO */
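
The reworked sched_info flow above reduces to: enqueue stamps last_queued once (re-queues are ignored), and arrival or dequeue converts the stamp into accumulated run_delay before clearing it. A self-contained sketch with illustrative clock values (the toy_* types are stand-ins, not the kernel's struct sched_info):

	#include <stdint.h>
	#include <stdio.h>

	struct toy_sched_info {
		uint64_t last_queued;	/* 0 means "no stamp outstanding" */
		uint64_t run_delay;	/* total time spent waiting to run */
	};

	static void toy_enqueue(struct toy_sched_info *si, uint64_t now)
	{
		if (!si->last_queued)	/* only the first queueing counts */
			si->last_queued = now;
	}

	static void toy_arrive(struct toy_sched_info *si, uint64_t now)
	{
		if (!si->last_queued)
			return;
		si->run_delay += now - si->last_queued;
		si->last_queued = 0;
	}

	int main(void)
	{
		struct toy_sched_info si = { 0, 0 };

		toy_enqueue(&si, 100);	/* queued at t=100 */
		toy_enqueue(&si, 150);	/* ignored; the stamp stays at 100 */
		toy_arrive(&si, 400);	/* waited 400 - 100 = 300 */
		printf("run_delay = %llu\n", (unsigned long long)si.run_delay);
		return 0;
	}
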
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 55f39125c0e1..f988ebe3febb 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -34,15 +34,24 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir
stop->se.exec_start = rq_clock_task(rq);
}
-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *pick_task_stop(struct rq *rq)
{
if (!sched_stop_runnable(rq))
return NULL;
- set_next_task_stop(rq, rq->stop, true);
return rq->stop;
}
+static struct task_struct *pick_next_task_stop(struct rq *rq)
+{
+ struct task_struct *p = pick_task_stop(rq);
+
+ if (p)
+ set_next_task_stop(rq, p, true);
+
+ return p;
+}
+
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
@@ -123,6 +132,7 @@ DEFINE_SCHED_CLASS(stop) = {
#ifdef CONFIG_SMP
.balance = balance_stop,
+ .pick_task = pick_task_stop,
.select_task_rq = select_task_rq_stop,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 55a0a243e871..053115b55f89 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -467,7 +467,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
struct root_domain *old_rd = NULL;
unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_rq_lock_irqsave(rq, flags);
if (rq->rd) {
old_rd = rq->rd;
@@ -493,7 +493,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
if (old_rd)
call_rcu(&old_rd->rcu, free_rootdomain);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f25208e8df83..e4163042c4d6 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -33,7 +33,6 @@ struct task_struct *idle_thread_get(unsigned int cpu)
if (!tsk)
return ERR_PTR(-ENOMEM);
- init_idle(tsk, cpu);
return tsk;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 3a583a29815f..9de46a4bf492 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2550,6 +2550,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = set_syscall_user_dispatch(arg2, arg3, arg4,
(char __user *) arg5);
break;
+#ifdef CONFIG_SCHED_CORE
+ case PR_SCHED_CORE:
+ error = sched_core_share_pid(arg2, arg3, arg4, arg5);
+ break;
+#endif
default:
error = -EINVAL;
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d4a78e08f6d8..8c8c220637ce 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -71,6 +71,7 @@
#include <linux/coredump.h>
#include <linux/latencytop.h>
#include <linux/pid.h>
+#include <linux/delayacct.h>
#include "../lib/kstrtox.h"
@@ -1747,6 +1748,17 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_TASK_DELAY_ACCT
+ {
+ .procname = "task_delayacct",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_delayacct,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_TASK_DELAY_ACCT */
#ifdef CONFIG_NUMA_BALANCING
{
.procname = "numa_balancing",
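
Assuming the usual /proc/sys mapping of kern_table entries, the knob added above should surface as /proc/sys/kernel/task_delayacct; a minimal sketch for enabling it at run time (the path is inferred from the procname, not stated in this patch):

	#include <stdio.h>

	int main(void)
	{
		/* extra1/extra2 above bound the value to 0 (off) or 1 (on). */
		FILE *f = fopen("/proc/sys/kernel/task_delayacct", "w");

		if (!f) {
			perror("task_delayacct");
			return 1;
		}
		fputs("1\n", f);
		return fclose(f) ? 1 : 0;
	}
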
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 1c1dbd300325..046ac6297c78 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -19,11 +19,7 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2)
if (irqs_disabled())
goto out;
- /*
- * Kernel threads bound to a single CPU can safely use
- * smp_processor_id():
- */
- if (current->nr_cpus_allowed == 1)
+ if (is_percpu_thread())
goto out;
#ifdef CONFIG_SMP
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index 18a9f59dc067..967d9c55323d 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -259,4 +259,12 @@ struct prctl_mm_map {
#define PR_PAC_SET_ENABLED_KEYS 60
#define PR_PAC_GET_ENABLED_KEYS 61
+/* Request the scheduler to share a core */
+#define PR_SCHED_CORE 62
+# define PR_SCHED_CORE_GET 0
+# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */
+# define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */
+# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */
+# define PR_SCHED_CORE_MAX 4
+
#endif /* _LINUX_PRCTL_H */
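
A minimal userspace sketch of the new interface, mirroring the selftest below (error handling trimmed; 0 for the pid means the calling task, and 0 for the pid type is PIDTYPE_PID as in the selftest's pid_type enum):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_SCHED_CORE
	#define PR_SCHED_CORE		62
	#define PR_SCHED_CORE_GET	0
	#define PR_SCHED_CORE_CREATE	1
	#endif

	int main(void)
	{
		unsigned long long cookie = 0;

		/* Tag the calling task with a freshly allocated cookie. */
		if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, 0, 0))
			perror("PR_SCHED_CORE_CREATE");

		/* Read it back; only tasks sharing a cookie may share an SMT core. */
		if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET, 0, 0,
			  (unsigned long)&cookie))
			perror("PR_SCHED_CORE_GET");

		printf("core_sched cookie: %llx\n", cookie);
		return 0;
	}
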
diff --git a/tools/testing/selftests/sched/.gitignore b/tools/testing/selftests/sched/.gitignore
new file mode 100644
index 000000000000..6996d4654d92
--- /dev/null
+++ b/tools/testing/selftests/sched/.gitignore
@@ -0,0 +1 @@
+cs_prctl_test
diff --git a/tools/testing/selftests/sched/Makefile b/tools/testing/selftests/sched/Makefile
new file mode 100644
index 000000000000..10c72f14fea9
--- /dev/null
+++ b/tools/testing/selftests/sched/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0+
+
+ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),)
+CLANG_FLAGS += -no-integrated-as
+endif
+
+CFLAGS += -O2 -Wall -g -I./ -I../../../../usr/include/ -Wl,-rpath=./ \
+ $(CLANG_FLAGS)
+LDLIBS += -lpthread
+
+TEST_GEN_FILES := cs_prctl_test
+TEST_PROGS := cs_prctl_test
+
+include ../lib.mk
diff --git a/tools/testing/selftests/sched/config b/tools/testing/selftests/sched/config
new file mode 100644
index 000000000000..e8b09aa7c0c4
--- /dev/null
+++ b/tools/testing/selftests/sched/config
@@ -0,0 +1 @@
+CONFIG_SCHED_DEBUG=y
diff --git a/tools/testing/selftests/sched/cs_prctl_test.c b/tools/testing/selftests/sched/cs_prctl_test.c
new file mode 100644
index 000000000000..63fe6521c56d
--- /dev/null
+++ b/tools/testing/selftests/sched/cs_prctl_test.c
@@ -0,0 +1,338 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Use the core scheduling prctl() to test core scheduling cookie control.
+ *
+ * Copyright (c) 2021 Oracle and/or its affiliates.
+ * Author: Chris Hyser <chris.hyser@oracle.com>
+ *
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, see <http://www.gnu.org/licenses>.
+ */
+
+#define _GNU_SOURCE
+#include <sys/eventfd.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sched.h>
+#include <sys/prctl.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if __GLIBC_PREREQ(2, 30) == 0
+#include <sys/syscall.h>
+static pid_t gettid(void)
+{
+ return syscall(SYS_gettid);
+}
+#endif
+
+#ifndef PR_SCHED_CORE
+#define PR_SCHED_CORE 62
+# define PR_SCHED_CORE_GET 0
+# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */
+# define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */
+# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */
+# define PR_SCHED_CORE_MAX 4
+#endif
+
+#define MAX_PROCESSES 128
+#define MAX_THREADS 128
+
+static const char USAGE[] = "cs_prctl_test [options]\n"
+" options:\n"
+" -P : number of processes to create.\n"
+" -T : number of threads per process to create.\n"
+" -d : delay time to keep tasks alive.\n"
+" -k : keep tasks alive until keypress.\n";
+
+enum pid_type {PIDTYPE_PID = 0, PIDTYPE_TGID, PIDTYPE_PGID};
+
+const int THREAD_CLONE_FLAGS = CLONE_THREAD | CLONE_SIGHAND | CLONE_FS | CLONE_VM | CLONE_FILES;
+
+static int _prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+ unsigned long arg5)
+{
+ int res;
+
+ res = prctl(option, arg2, arg3, arg4, arg5);
+ printf("%d = prctl(%d, %ld, %ld, %ld, %lx)\n", res, option, (long)arg2, (long)arg3,
+ (long)arg4, arg5);
+ return res;
+}
+
+#define STACK_SIZE (1024 * 1024)
+
+#define handle_error(msg) __handle_error(__FILE__, __LINE__, msg)
+static void __handle_error(char *fn, int ln, char *msg)
+{
+ printf("(%s:%d) - ", fn, ln);
+ perror(msg);
+ exit(EXIT_FAILURE);
+}
+
+static void handle_usage(int rc, char *msg)
+{
+ puts(USAGE);
+ puts(msg);
+ putchar('\n');
+ exit(rc);
+}
+
+static unsigned long get_cs_cookie(int pid)
+{
+ unsigned long long cookie;
+ int ret;
+
+ ret = prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET, pid, PIDTYPE_PID,
+ (unsigned long)&cookie);
+ if (ret) {
+ printf("Not a core sched system\n");
+ return -1UL;
+ }
+
+ return cookie;
+}
+
+struct child_args {
+ int num_threads;
+ int pfd[2];
+ int cpid;
+ int thr_tids[MAX_THREADS];
+};
+
+static int child_func_thread(void __attribute__((unused))*arg)
+{
+ while (1)
+ usleep(20000);
+ return 0;
+}
+
+static void create_threads(int num_threads, int thr_tids[])
+{
+ void *child_stack;
+ pid_t tid;
+ int i;
+
+ for (i = 0; i < num_threads; ++i) {
+ child_stack = malloc(STACK_SIZE);
+ if (!child_stack)
+ handle_error("child stack allocate");
+
+ tid = clone(child_func_thread, child_stack + STACK_SIZE, THREAD_CLONE_FLAGS, NULL);
+ if (tid == -1)
+ handle_error("clone thread");
+ thr_tids[i] = tid;
+ }
+}
+
+static int child_func_process(void *arg)
+{
+ struct child_args *ca = (struct child_args *)arg;
+
+ close(ca->pfd[0]);
+
+ create_threads(ca->num_threads, ca->thr_tids);
+
+ write(ca->pfd[1], &ca->thr_tids, sizeof(int) * ca->num_threads);
+ close(ca->pfd[1]);
+
+ while (1)
+ usleep(20000);
+ return 0;
+}
+
+static unsigned char child_func_process_stack[STACK_SIZE];
+
+void create_processes(int num_processes, int num_threads, struct child_args proc[])
+{
+ pid_t cpid;
+ int i;
+
+ for (i = 0; i < num_processes; ++i) {
+ proc[i].num_threads = num_threads;
+
+ if (pipe(proc[i].pfd) == -1)
+ handle_error("pipe() failed");
+
+ cpid = clone(child_func_process, child_func_process_stack + STACK_SIZE,
+ SIGCHLD, &proc[i]);
+ proc[i].cpid = cpid;
+ close(proc[i].pfd[1]);
+ }
+
+ for (i = 0; i < num_processes; ++i) {
+ read(proc[i].pfd[0], &proc[i].thr_tids, sizeof(int) * proc[i].num_threads);
+ close(proc[i].pfd[0]);
+ }
+}
+
+void disp_processes(int num_processes, struct child_args proc[])
+{
+ int i, j;
+
+ printf("tid=%d, / tgid=%d / pgid=%d: %lx\n", gettid(), getpid(), getpgid(0),
+ get_cs_cookie(getpid()));
+
+ for (i = 0; i < num_processes; ++i) {
+ printf(" tid=%d, / tgid=%d / pgid=%d: %lx\n", proc[i].cpid, proc[i].cpid,
+ getpgid(proc[i].cpid), get_cs_cookie(proc[i].cpid));
+ for (j = 0; j < proc[i].num_threads; ++j) {
+ printf(" tid=%d, / tgid=%d / pgid=%d: %lx\n", proc[i].thr_tids[j],
+ proc[i].cpid, getpgid(0), get_cs_cookie(proc[i].thr_tids[j]));
+ }
+ }
+ puts("\n");
+}
+
+static int errors;
+
+#define validate(v) _validate(__LINE__, v, #v)
+void _validate(int line, int val, char *msg)
+{
+ if (!val) {
+ ++errors;
+ printf("(%d) FAILED: %s\n", line, msg);
+ } else {
+ printf("(%d) PASSED: %s\n", line, msg);
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ struct child_args procs[MAX_PROCESSES];
+
+ int keypress = 0;
+ int num_processes = 2;
+ int num_threads = 3;
+ int delay = 0;
+ int res = 0;
+ int pidx;
+ int pid;
+ int opt;
+
+ while ((opt = getopt(argc, argv, ":hkT:P:d:")) != -1) {
+ switch (opt) {
+ case 'P':
+ num_processes = (int)strtol(optarg, NULL, 10);
+ break;
+ case 'T':
+ num_threads = (int)strtoul(optarg, NULL, 10);
+ break;
+ case 'd':
+ delay = (int)strtol(optarg, NULL, 10);
+ break;
+ case 'k':
+ keypress = 1;
+ break;
+ case 'h':
+ printf(USAGE);
+ exit(EXIT_SUCCESS);
+ default:
+ handle_usage(20, "unknown option");
+ }
+ }
+
+ if (num_processes < 1 || num_processes > MAX_PROCESSES)
+ handle_usage(1, "Bad processes value");
+
+ if (num_threads < 1 || num_threads > MAX_THREADS)
+ handle_usage(2, "Bad thread value");
+
+ if (keypress)
+ delay = -1;
+
+ srand(time(NULL));
+
+ /* put into separate process group */
+ if (setpgid(0, 0) != 0)
+ handle_error("process group");
+
+ printf("\n## Create a thread/process/process group hiearchy\n");
+ create_processes(num_processes, num_threads, procs);
+ disp_processes(num_processes, procs);
+ validate(get_cs_cookie(0) == 0);
+
+ printf("\n## Set a cookie on entire process group\n");
+ if (_prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, PIDTYPE_PGID, 0) < 0)
+ handle_error("core_sched create failed -- PGID");
+ disp_processes(num_processes, procs);
+
+ validate(get_cs_cookie(0) != 0);
+
+ /* get a random process pid */
+ pidx = rand() % num_processes;
+ pid = procs[pidx].cpid;
+
+ validate(get_cs_cookie(0) == get_cs_cookie(pid));
+ validate(get_cs_cookie(0) == get_cs_cookie(procs[pidx].thr_tids[0]));
+
+ printf("\n## Set a new cookie on entire process/TGID [%d]\n", pid);
+ if (_prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, pid, PIDTYPE_TGID, 0) < 0)
+ handle_error("core_sched create failed -- TGID");
+ disp_processes(num_processes, procs);
+
+ validate(get_cs_cookie(0) != get_cs_cookie(pid));
+ validate(get_cs_cookie(pid) != 0);
+ validate(get_cs_cookie(pid) == get_cs_cookie(procs[pidx].thr_tids[0]));
+
+ printf("\n## Copy the cookie of current/PGID[%d], to pid [%d] as PIDTYPE_PID\n",
+ getpid(), pid);
+ if (_prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO, pid, PIDTYPE_PID, 0) < 0)
+ handle_error("core_sched share to itself failed -- PID");
+ disp_processes(num_processes, procs);
+
+ validate(get_cs_cookie(0) == get_cs_cookie(pid));
+ validate(get_cs_cookie(pid) != 0);
+ validate(get_cs_cookie(pid) != get_cs_cookie(procs[pidx].thr_tids[0]));
+
+ printf("\n## Copy cookie from a thread [%d] to current/PGID [%d] as PIDTYPE_PID\n",
+ procs[pidx].thr_tids[0], getpid());
+ if (_prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM, procs[pidx].thr_tids[0],
+ PIDTYPE_PID, 0) < 0)
+ handle_error("core_sched share from thread failed -- PID");
+ disp_processes(num_processes, procs);
+
+ validate(get_cs_cookie(0) == get_cs_cookie(procs[pidx].thr_tids[0]));
+ validate(get_cs_cookie(pid) != get_cs_cookie(procs[pidx].thr_tids[0]));
+
+ printf("\n## Copy cookie from current [%d] to current as pidtype PGID\n", getpid());
+ if (_prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO, 0, PIDTYPE_PGID, 0) < 0)
+ handle_error("core_sched share to self failed -- PGID");
+ disp_processes(num_processes, procs);
+
+ validate(get_cs_cookie(0) == get_cs_cookie(pid));
+ validate(get_cs_cookie(pid) != 0);
+ validate(get_cs_cookie(pid) == get_cs_cookie(procs[pidx].thr_tids[0]));
+
+ if (errors) {
+ printf("TESTS FAILED. errors: %d\n", errors);
+ res = 10;
+ } else {
+ printf("SUCCESS !!!\n");
+ }
+
+ if (keypress)
+ getchar();
+ else
+ sleep(delay);
+
+ for (pidx = 0; pidx < num_processes; ++pidx)
+ kill(procs[pidx].cpid, 15);
+
+ return res;
+}