summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRafael J. Wysocki <rafael.j.wysocki@intel.com>2025-05-26 21:18:34 +0200
committerRafael J. Wysocki <rafael.j.wysocki@intel.com>2025-05-26 21:18:34 +0200
commitaf86d7e88e3069f19138cabb3d32dd1e39bf3f9e (patch)
tree9edb54f721c8ca8229c364f9958dab189a19b83d
parentf34dc2834347301d4652464867f096048df546ff (diff)
parent5836ebeb4a2b7e25305fab9170060d5f3de1ff30 (diff)
Merge branch 'pm-cpuidle'
Merge cpuidle updates for 6.16-rc1: - Optimize bucket assignment when next_timer_ns equals KTIME_MAX in the menu cpuidle governor (Zhongqiu Han). - Convert the cpuidle PSCI driver to a faux device one (Sudeep Holla). - Add C1 demotion on/off sysfs knob to the intel_idle driver (Artem Bityutskiy). - Fix typos in two comments in the teo cpuidle governor (Atul Kumar Pant). * pm-cpuidle: cpuidle: psci: Avoid initializing faux device if no DT idle states are present Documentation: ABI: testing: document the new cpuidle sysfs file Documentation: admin-guide: pm: Document intel_idle C1 demotion intel_idle: Add C1 demotion on/off sysfs knob cpuidle: psci: Transition to the faux device interface cpuidle: menu: Optimize bucket assignment when next_timer_ns equals KTIME_MAX cpuidle: teo: Fix typos in two comments
-rw-r--r--Documentation/ABI/testing/sysfs-devices-system-cpu7
-rw-r--r--Documentation/admin-guide/pm/intel_idle.rst21
-rw-r--r--drivers/cpuidle/cpuidle-psci.c43
-rw-r--r--drivers/cpuidle/governors/menu.c2
-rw-r--r--drivers/cpuidle/governors/teo.c4
-rw-r--r--drivers/idle/intel_idle.c102
6 files changed, 159 insertions, 20 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 03f43fb667a3..8d6c7b57dc14 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -111,6 +111,7 @@ What: /sys/devices/system/cpu/cpuidle/available_governors
/sys/devices/system/cpu/cpuidle/current_driver
/sys/devices/system/cpu/cpuidle/current_governor
/sys/devices/system/cpu/cpuidle/current_governer_ro
+ /sys/devices/system/cpu/cpuidle/intel_c1_demotion
Date: September 2007
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Description: Discover cpuidle policy and mechanism
@@ -132,7 +133,11 @@ Description: Discover cpuidle policy and mechanism
current_governor_ro: (RO) displays current idle policy.
- See Documentation/admin-guide/pm/cpuidle.rst and
+ intel_c1_demotion: (RW) enables/disables the C1 demotion
+ feature on Intel CPUs.
+
+ See Documentation/admin-guide/pm/cpuidle.rst,
+ Documentation/admin-guide/pm/intel_idle.rst, and
Documentation/driver-api/pm/cpuidle.rst for more information.
diff --git a/Documentation/admin-guide/pm/intel_idle.rst b/Documentation/admin-guide/pm/intel_idle.rst
index 5940528146eb..ed6f055d4b14 100644
--- a/Documentation/admin-guide/pm/intel_idle.rst
+++ b/Documentation/admin-guide/pm/intel_idle.rst
@@ -38,6 +38,27 @@ instruction at all.
only way to pass early-configuration-time parameters to it is via the kernel
command line.
+Sysfs Interface
+===============
+
+The ``intel_idle`` driver exposes the following ``sysfs`` attributes in
+``/sys/devices/system/cpu/cpuidle/``:
+
+``intel_c1_demotion``
+ Enable or disable C1 demotion for all CPUs in the system. This file is
+ only exposed on platforms that support the C1 demotion feature and where
+ it was tested. Value 0 means that C1 demotion is disabled, value 1 means
+ that it is enabled. Write 0 or 1 to disable or enable C1 demotion for
+ all CPUs.
+
+ The C1 demotion feature involves the platform firmware demoting deep
+ C-state requests from the OS (e.g., C6 requests) to C1. The idea is that
+ firmware monitors CPU wake-up rate, and if it is higher than a
+ platform-specific threshold, the firmware demotes deep C-state requests
+ to C1. For example, Linux requests C6, but firmware noticed too many
+ wake-ups per second, and it keeps the CPU in C1. When the CPU stays in
+ C1 long enough, the platform promotes it back to C6. This may improve
+ some workloads' performance, but it may also increase power consumption.
.. _intel-idle-enumeration-of-states:
diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c
index b46a83f5ffe4..40f378c1dc9f 100644
--- a/drivers/cpuidle/cpuidle-psci.c
+++ b/drivers/cpuidle/cpuidle-psci.c
@@ -16,7 +16,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/of.h>
-#include <linux/platform_device.h>
+#include <linux/device/faux.h>
#include <linux/psci.h>
#include <linux/pm_domain.h>
#include <linux/pm_runtime.h>
@@ -407,14 +407,14 @@ deinit:
* to register cpuidle driver then rollback to cancel all CPUs
* registration.
*/
-static int psci_cpuidle_probe(struct platform_device *pdev)
+static int psci_cpuidle_probe(struct faux_device *fdev)
{
int cpu, ret;
struct cpuidle_driver *drv;
struct cpuidle_device *dev;
for_each_present_cpu(cpu) {
- ret = psci_idle_init_cpu(&pdev->dev, cpu);
+ ret = psci_idle_init_cpu(&fdev->dev, cpu);
if (ret)
goto out_fail;
}
@@ -434,26 +434,37 @@ out_fail:
return ret;
}
-static struct platform_driver psci_cpuidle_driver = {
+static struct faux_device_ops psci_cpuidle_ops = {
.probe = psci_cpuidle_probe,
- .driver = {
- .name = "psci-cpuidle",
- },
};
+static bool __init dt_idle_state_present(void)
+{
+ struct device_node *cpu_node __free(device_node);
+ struct device_node *state_node __free(device_node);
+
+ cpu_node = of_cpu_device_node_get(cpumask_first(cpu_possible_mask));
+ if (!cpu_node)
+ return false;
+
+ state_node = of_get_cpu_state_node(cpu_node, 0);
+ if (!state_node)
+ return false;
+
+ return !!of_match_node(psci_idle_state_match, state_node);
+}
+
static int __init psci_idle_init(void)
{
- struct platform_device *pdev;
- int ret;
+ struct faux_device *fdev;
- ret = platform_driver_register(&psci_cpuidle_driver);
- if (ret)
- return ret;
+ if (!dt_idle_state_present())
+ return 0;
- pdev = platform_device_register_simple("psci-cpuidle", -1, NULL, 0);
- if (IS_ERR(pdev)) {
- platform_driver_unregister(&psci_cpuidle_driver);
- return PTR_ERR(pdev);
+ fdev = faux_device_create("psci-cpuidle", NULL, &psci_cpuidle_ops);
+ if (!fdev) {
+ pr_err("Failed to create psci-cpuidle device\n");
+ return -ENODEV;
}
return 0;
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 39aa0aea61c6..52d5d26fc7c6 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -255,7 +255,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
*/
data->next_timer_ns = KTIME_MAX;
delta_tick = TICK_NSEC / 2;
- data->bucket = which_bucket(KTIME_MAX);
+ data->bucket = BUCKETS - 1;
}
if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c
index 8fe5e1b47ef9..bfa55c1eab5b 100644
--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c
@@ -19,7 +19,7 @@
*
* Of course, non-timer wakeup sources are more important in some use cases,
* but even then it is generally unnecessary to consider idle duration values
- * greater than the time time till the next timer event, referred as the sleep
+ * greater than the time till the next timer event, referred as the sleep
* length in what follows, because the closest timer will ultimately wake up the
* CPU anyway unless it is woken up earlier.
*
@@ -311,7 +311,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
struct cpuidle_state *s = &drv->states[i];
/*
- * Update the sums of idle state mertics for all of the states
+ * Update the sums of idle state metrics for all of the states
* shallower than the current one.
*/
intercept_sum += prev_bin->intercepts;
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 976f5be54e36..3292bf74e3c2 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -48,9 +48,11 @@
#include <trace/events/power.h>
#include <linux/sched.h>
#include <linux/sched/smt.h>
+#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/moduleparam.h>
+#include <linux/sysfs.h>
#include <asm/cpuid.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
@@ -92,9 +94,15 @@ struct idle_cpu {
*/
unsigned long auto_demotion_disable_flags;
bool disable_promotion_to_c1e;
+ bool c1_demotion_supported;
bool use_acpi;
};
+static bool c1_demotion_supported;
+static DEFINE_MUTEX(c1_demotion_mutex);
+
+static struct device *sysfs_root __initdata;
+
static const struct idle_cpu *icpu __initdata;
static struct cpuidle_state *cpuidle_state_table __initdata;
@@ -1549,18 +1557,21 @@ static const struct idle_cpu idle_cpu_gmt __initconst = {
static const struct idle_cpu idle_cpu_spr __initconst = {
.state_table = spr_cstates,
.disable_promotion_to_c1e = true,
+ .c1_demotion_supported = true,
.use_acpi = true,
};
static const struct idle_cpu idle_cpu_gnr __initconst = {
.state_table = gnr_cstates,
.disable_promotion_to_c1e = true,
+ .c1_demotion_supported = true,
.use_acpi = true,
};
static const struct idle_cpu idle_cpu_gnrd __initconst = {
.state_table = gnrd_cstates,
.disable_promotion_to_c1e = true,
+ .c1_demotion_supported = true,
.use_acpi = true,
};
@@ -1599,12 +1610,14 @@ static const struct idle_cpu idle_cpu_snr __initconst = {
static const struct idle_cpu idle_cpu_grr __initconst = {
.state_table = grr_cstates,
.disable_promotion_to_c1e = true,
+ .c1_demotion_supported = true,
.use_acpi = true,
};
static const struct idle_cpu idle_cpu_srf __initconst = {
.state_table = srf_cstates,
.disable_promotion_to_c1e = true,
+ .c1_demotion_supported = true,
.use_acpi = true,
};
@@ -2324,6 +2337,88 @@ static void __init intel_idle_cpuidle_devices_uninit(void)
cpuidle_unregister_device(per_cpu_ptr(intel_idle_cpuidle_devices, i));
}
+static void intel_c1_demotion_toggle(void *enable)
+{
+ unsigned long long msr_val;
+
+ rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_val);
+ /*
+ * Enable/disable C1 undemotion along with C1 demotion, as this is the
+ * most sensible configuration in general.
+ */
+ if (enable)
+ msr_val |= NHM_C1_AUTO_DEMOTE | SNB_C1_AUTO_UNDEMOTE;
+ else
+ msr_val &= ~(NHM_C1_AUTO_DEMOTE | SNB_C1_AUTO_UNDEMOTE);
+ wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_val);
+}
+
+static ssize_t intel_c1_demotion_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ bool enable;
+ int err;
+
+ err = kstrtobool(buf, &enable);
+ if (err)
+ return err;
+
+ mutex_lock(&c1_demotion_mutex);
+ /* Enable/disable C1 demotion on all CPUs */
+ on_each_cpu(intel_c1_demotion_toggle, (void *)enable, 1);
+ mutex_unlock(&c1_demotion_mutex);
+
+ return count;
+}
+
+static ssize_t intel_c1_demotion_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ unsigned long long msr_val;
+
+ /*
+ * Read the MSR value for a CPU and assume it is the same for all CPUs. Any other
+ * configuration would be a BIOS bug.
+ */
+ rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_val);
+ return sysfs_emit(buf, "%d\n", !!(msr_val & NHM_C1_AUTO_DEMOTE));
+}
+static DEVICE_ATTR_RW(intel_c1_demotion);
+
+static int __init intel_idle_sysfs_init(void)
+{
+ int err;
+
+ if (!c1_demotion_supported)
+ return 0;
+
+ sysfs_root = bus_get_dev_root(&cpu_subsys);
+ if (!sysfs_root)
+ return 0;
+
+ err = sysfs_add_file_to_group(&sysfs_root->kobj,
+ &dev_attr_intel_c1_demotion.attr,
+ "cpuidle");
+ if (err) {
+ put_device(sysfs_root);
+ return err;
+ }
+
+ return 0;
+}
+
+static void __init intel_idle_sysfs_uninit(void)
+{
+ if (!sysfs_root)
+ return;
+
+ sysfs_remove_file_from_group(&sysfs_root->kobj,
+ &dev_attr_intel_c1_demotion.attr,
+ "cpuidle");
+ put_device(sysfs_root);
+}
+
static int __init intel_idle_init(void)
{
const struct x86_cpu_id *id;
@@ -2374,6 +2469,8 @@ static int __init intel_idle_init(void)
auto_demotion_disable_flags = icpu->auto_demotion_disable_flags;
if (icpu->disable_promotion_to_c1e)
c1e_promotion = C1E_PROMOTION_DISABLE;
+ if (icpu->c1_demotion_supported)
+ c1_demotion_supported = true;
if (icpu->use_acpi || force_use_acpi)
intel_idle_acpi_cst_extract();
} else if (!intel_idle_acpi_cst_extract()) {
@@ -2387,6 +2484,10 @@ static int __init intel_idle_init(void)
if (!intel_idle_cpuidle_devices)
return -ENOMEM;
+ retval = intel_idle_sysfs_init();
+ if (retval)
+ pr_warn("failed to initialized sysfs");
+
intel_idle_cpuidle_driver_init(&intel_idle_driver);
retval = cpuidle_register_driver(&intel_idle_driver);
@@ -2411,6 +2512,7 @@ hp_setup_fail:
intel_idle_cpuidle_devices_uninit();
cpuidle_unregister_driver(&intel_idle_driver);
init_driver_fail:
+ intel_idle_sysfs_uninit();
free_percpu(intel_idle_cpuidle_devices);
return retval;