diff options
-rw-r--r-- | Documentation/ABI/testing/sysfs-devices-system-cpu | 7 | ||||
-rw-r--r-- | Documentation/admin-guide/pm/intel_idle.rst | 21 | ||||
-rw-r--r-- | drivers/cpuidle/cpuidle-psci.c | 43 | ||||
-rw-r--r-- | drivers/cpuidle/governors/menu.c | 2 | ||||
-rw-r--r-- | drivers/cpuidle/governors/teo.c | 4 | ||||
-rw-r--r-- | drivers/idle/intel_idle.c | 102 |
6 files changed, 159 insertions, 20 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 03f43fb667a3..8d6c7b57dc14 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -111,6 +111,7 @@ What: /sys/devices/system/cpu/cpuidle/available_governors /sys/devices/system/cpu/cpuidle/current_driver /sys/devices/system/cpu/cpuidle/current_governor /sys/devices/system/cpu/cpuidle/current_governer_ro + /sys/devices/system/cpu/cpuidle/intel_c1_demotion Date: September 2007 Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> Description: Discover cpuidle policy and mechanism @@ -132,7 +133,11 @@ Description: Discover cpuidle policy and mechanism current_governor_ro: (RO) displays current idle policy. - See Documentation/admin-guide/pm/cpuidle.rst and + intel_c1_demotion: (RW) enables/disables the C1 demotion + feature on Intel CPUs. + + See Documentation/admin-guide/pm/cpuidle.rst, + Documentation/admin-guide/pm/intel_idle.rst, and Documentation/driver-api/pm/cpuidle.rst for more information. diff --git a/Documentation/admin-guide/pm/intel_idle.rst b/Documentation/admin-guide/pm/intel_idle.rst index 5940528146eb..ed6f055d4b14 100644 --- a/Documentation/admin-guide/pm/intel_idle.rst +++ b/Documentation/admin-guide/pm/intel_idle.rst @@ -38,6 +38,27 @@ instruction at all. only way to pass early-configuration-time parameters to it is via the kernel command line. +Sysfs Interface +=============== + +The ``intel_idle`` driver exposes the following ``sysfs`` attributes in +``/sys/devices/system/cpu/cpuidle/``: + +``intel_c1_demotion`` + Enable or disable C1 demotion for all CPUs in the system. This file is + only exposed on platforms that support the C1 demotion feature and where + it was tested. Value 0 means that C1 demotion is disabled, value 1 means + that it is enabled. Write 0 or 1 to disable or enable C1 demotion for + all CPUs. + + The C1 demotion feature involves the platform firmware demoting deep + C-state requests from the OS (e.g., C6 requests) to C1. The idea is that + firmware monitors CPU wake-up rate, and if it is higher than a + platform-specific threshold, the firmware demotes deep C-state requests + to C1. For example, Linux requests C6, but firmware noticed too many + wake-ups per second, and it keeps the CPU in C1. When the CPU stays in + C1 long enough, the platform promotes it back to C6. This may improve + some workloads' performance, but it may also increase power consumption. .. _intel-idle-enumeration-of-states: diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c index b46a83f5ffe4..40f378c1dc9f 100644 --- a/drivers/cpuidle/cpuidle-psci.c +++ b/drivers/cpuidle/cpuidle-psci.c @@ -16,7 +16,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/of.h> -#include <linux/platform_device.h> +#include <linux/device/faux.h> #include <linux/psci.h> #include <linux/pm_domain.h> #include <linux/pm_runtime.h> @@ -407,14 +407,14 @@ deinit: * to register cpuidle driver then rollback to cancel all CPUs * registration. */ -static int psci_cpuidle_probe(struct platform_device *pdev) +static int psci_cpuidle_probe(struct faux_device *fdev) { int cpu, ret; struct cpuidle_driver *drv; struct cpuidle_device *dev; for_each_present_cpu(cpu) { - ret = psci_idle_init_cpu(&pdev->dev, cpu); + ret = psci_idle_init_cpu(&fdev->dev, cpu); if (ret) goto out_fail; } @@ -434,26 +434,37 @@ out_fail: return ret; } -static struct platform_driver psci_cpuidle_driver = { +static struct faux_device_ops psci_cpuidle_ops = { .probe = psci_cpuidle_probe, - .driver = { - .name = "psci-cpuidle", - }, }; +static bool __init dt_idle_state_present(void) +{ + struct device_node *cpu_node __free(device_node); + struct device_node *state_node __free(device_node); + + cpu_node = of_cpu_device_node_get(cpumask_first(cpu_possible_mask)); + if (!cpu_node) + return false; + + state_node = of_get_cpu_state_node(cpu_node, 0); + if (!state_node) + return false; + + return !!of_match_node(psci_idle_state_match, state_node); +} + static int __init psci_idle_init(void) { - struct platform_device *pdev; - int ret; + struct faux_device *fdev; - ret = platform_driver_register(&psci_cpuidle_driver); - if (ret) - return ret; + if (!dt_idle_state_present()) + return 0; - pdev = platform_device_register_simple("psci-cpuidle", -1, NULL, 0); - if (IS_ERR(pdev)) { - platform_driver_unregister(&psci_cpuidle_driver); - return PTR_ERR(pdev); + fdev = faux_device_create("psci-cpuidle", NULL, &psci_cpuidle_ops); + if (!fdev) { + pr_err("Failed to create psci-cpuidle device\n"); + return -ENODEV; } return 0; diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 39aa0aea61c6..52d5d26fc7c6 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -255,7 +255,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ data->next_timer_ns = KTIME_MAX; delta_tick = TICK_NSEC / 2; - data->bucket = which_bucket(KTIME_MAX); + data->bucket = BUCKETS - 1; } if (unlikely(drv->state_count <= 1 || latency_req == 0) || diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 8fe5e1b47ef9..bfa55c1eab5b 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -19,7 +19,7 @@ * * Of course, non-timer wakeup sources are more important in some use cases, * but even then it is generally unnecessary to consider idle duration values - * greater than the time time till the next timer event, referred as the sleep + * greater than the time till the next timer event, referred as the sleep * length in what follows, because the closest timer will ultimately wake up the * CPU anyway unless it is woken up earlier. * @@ -311,7 +311,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, struct cpuidle_state *s = &drv->states[i]; /* - * Update the sums of idle state mertics for all of the states + * Update the sums of idle state metrics for all of the states * shallower than the current one. */ intercept_sum += prev_bin->intercepts; diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 976f5be54e36..3292bf74e3c2 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -48,9 +48,11 @@ #include <trace/events/power.h> #include <linux/sched.h> #include <linux/sched/smt.h> +#include <linux/mutex.h> #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/moduleparam.h> +#include <linux/sysfs.h> #include <asm/cpuid.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> @@ -92,9 +94,15 @@ struct idle_cpu { */ unsigned long auto_demotion_disable_flags; bool disable_promotion_to_c1e; + bool c1_demotion_supported; bool use_acpi; }; +static bool c1_demotion_supported; +static DEFINE_MUTEX(c1_demotion_mutex); + +static struct device *sysfs_root __initdata; + static const struct idle_cpu *icpu __initdata; static struct cpuidle_state *cpuidle_state_table __initdata; @@ -1549,18 +1557,21 @@ static const struct idle_cpu idle_cpu_gmt __initconst = { static const struct idle_cpu idle_cpu_spr __initconst = { .state_table = spr_cstates, .disable_promotion_to_c1e = true, + .c1_demotion_supported = true, .use_acpi = true, }; static const struct idle_cpu idle_cpu_gnr __initconst = { .state_table = gnr_cstates, .disable_promotion_to_c1e = true, + .c1_demotion_supported = true, .use_acpi = true, }; static const struct idle_cpu idle_cpu_gnrd __initconst = { .state_table = gnrd_cstates, .disable_promotion_to_c1e = true, + .c1_demotion_supported = true, .use_acpi = true, }; @@ -1599,12 +1610,14 @@ static const struct idle_cpu idle_cpu_snr __initconst = { static const struct idle_cpu idle_cpu_grr __initconst = { .state_table = grr_cstates, .disable_promotion_to_c1e = true, + .c1_demotion_supported = true, .use_acpi = true, }; static const struct idle_cpu idle_cpu_srf __initconst = { .state_table = srf_cstates, .disable_promotion_to_c1e = true, + .c1_demotion_supported = true, .use_acpi = true, }; @@ -2324,6 +2337,88 @@ static void __init intel_idle_cpuidle_devices_uninit(void) cpuidle_unregister_device(per_cpu_ptr(intel_idle_cpuidle_devices, i)); } +static void intel_c1_demotion_toggle(void *enable) +{ + unsigned long long msr_val; + + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_val); + /* + * Enable/disable C1 undemotion along with C1 demotion, as this is the + * most sensible configuration in general. + */ + if (enable) + msr_val |= NHM_C1_AUTO_DEMOTE | SNB_C1_AUTO_UNDEMOTE; + else + msr_val &= ~(NHM_C1_AUTO_DEMOTE | SNB_C1_AUTO_UNDEMOTE); + wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_val); +} + +static ssize_t intel_c1_demotion_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + bool enable; + int err; + + err = kstrtobool(buf, &enable); + if (err) + return err; + + mutex_lock(&c1_demotion_mutex); + /* Enable/disable C1 demotion on all CPUs */ + on_each_cpu(intel_c1_demotion_toggle, (void *)enable, 1); + mutex_unlock(&c1_demotion_mutex); + + return count; +} + +static ssize_t intel_c1_demotion_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + unsigned long long msr_val; + + /* + * Read the MSR value for a CPU and assume it is the same for all CPUs. Any other + * configuration would be a BIOS bug. + */ + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_val); + return sysfs_emit(buf, "%d\n", !!(msr_val & NHM_C1_AUTO_DEMOTE)); +} +static DEVICE_ATTR_RW(intel_c1_demotion); + +static int __init intel_idle_sysfs_init(void) +{ + int err; + + if (!c1_demotion_supported) + return 0; + + sysfs_root = bus_get_dev_root(&cpu_subsys); + if (!sysfs_root) + return 0; + + err = sysfs_add_file_to_group(&sysfs_root->kobj, + &dev_attr_intel_c1_demotion.attr, + "cpuidle"); + if (err) { + put_device(sysfs_root); + return err; + } + + return 0; +} + +static void __init intel_idle_sysfs_uninit(void) +{ + if (!sysfs_root) + return; + + sysfs_remove_file_from_group(&sysfs_root->kobj, + &dev_attr_intel_c1_demotion.attr, + "cpuidle"); + put_device(sysfs_root); +} + static int __init intel_idle_init(void) { const struct x86_cpu_id *id; @@ -2374,6 +2469,8 @@ static int __init intel_idle_init(void) auto_demotion_disable_flags = icpu->auto_demotion_disable_flags; if (icpu->disable_promotion_to_c1e) c1e_promotion = C1E_PROMOTION_DISABLE; + if (icpu->c1_demotion_supported) + c1_demotion_supported = true; if (icpu->use_acpi || force_use_acpi) intel_idle_acpi_cst_extract(); } else if (!intel_idle_acpi_cst_extract()) { @@ -2387,6 +2484,10 @@ static int __init intel_idle_init(void) if (!intel_idle_cpuidle_devices) return -ENOMEM; + retval = intel_idle_sysfs_init(); + if (retval) + pr_warn("failed to initialized sysfs"); + intel_idle_cpuidle_driver_init(&intel_idle_driver); retval = cpuidle_register_driver(&intel_idle_driver); @@ -2411,6 +2512,7 @@ hp_setup_fail: intel_idle_cpuidle_devices_uninit(); cpuidle_unregister_driver(&intel_idle_driver); init_driver_fail: + intel_idle_sysfs_uninit(); free_percpu(intel_idle_cpuidle_devices); return retval; |