From 3cedbc5a6d7f7c5539e139f89ec9f6e1ed668418 Mon Sep 17 00:00:00 2001
From: Len Brown
Date: Mon, 1 May 2017 23:06:08 -0400
Subject: intel_pstate: use updated msr-index.h HWP.EPP values

intel_pstate exports sysfs attributes for setting and observing HWP.EPP.
These attributes use strings to describe 4 operating states, and inside
the driver, these strings are mapped to numerical register values.

The authoritative mapping between the strings and numerical HWP.EPP
values is now globally defined in msr-index.h, replacing the out-dated
mapping that was open-coded into intel_pstate.c:

	new	old	string
	---	---	------
	0	0	performance
	128	64	balance_performance
	192	128	balance_power
	255	192	power

Note that the HW and BIOS default value on most systems is 128, which
intel_pstate will now call "balance_performance", while it used to call
it "balance_power".

Signed-off-by: Len Brown
---
 drivers/cpufreq/intel_pstate.c | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

(limited to 'drivers/cpufreq')

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 50bd6d987fc3..ab8ebaeb3621 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -716,6 +716,12 @@ static const char * const energy_perf_strings[] = {
 	"power",
 	NULL
 };
+static const unsigned int epp_values[] = {
+	HWP_EPP_PERFORMANCE,
+	HWP_EPP_BALANCE_PERFORMANCE,
+	HWP_EPP_BALANCE_POWERSAVE,
+	HWP_EPP_POWERSAVE
+};
 
 static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data)
 {
@@ -727,17 +733,14 @@ static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data)
 		return epp;
 
 	if (static_cpu_has(X86_FEATURE_HWP_EPP)) {
-		/*
-		 * Range:
-		 *	0x00-0x3F	:	Performance
-		 *	0x40-0x7F	:	Balance performance
-		 *	0x80-0xBF	:	Balance power
-		 *	0xC0-0xFF	:	Power
-		 * The EPP is a 8 bit value, but our ranges restrict the
-		 * value which can be set. Here only using top two bits
-		 * effectively.
-		 */
-		index = (epp >> 6) + 1;
+		if (epp == HWP_EPP_PERFORMANCE)
+			return 1;
+		if (epp <= HWP_EPP_BALANCE_PERFORMANCE)
+			return 2;
+		if (epp <= HWP_EPP_BALANCE_POWERSAVE)
+			return 3;
+		else
+			return 4;
 	} else if (static_cpu_has(X86_FEATURE_EPB)) {
 		/*
 		 * Range:
@@ -775,15 +778,8 @@ static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data,
 
 		value &= ~GENMASK_ULL(31, 24);
 
-		/*
-		 * If epp is not default, convert from index into
-		 * energy_perf_strings to epp value, by shifting 6
-		 * bits left to use only top two bits in epp.
-		 * The resultant epp need to shifted by 24 bits to
-		 * epp position in MSR_HWP_REQUEST.
-		 */
 		if (epp == -EINVAL)
-			epp = (pref_index - 1) << 6;
+			epp = epp_values[pref_index - 1];
 
 		value |= (u64)epp << 24;
 		ret = wrmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, value);
--
cgit v1.2.3
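As a quick illustration of the new table-driven mapping, here is a minimal
user-space sketch of the same bucketing; the HWP_EPP_* values are the
msr-index.h numbers from the "new" column above, and the standalone wrapper
is added only for illustration:

	#include <stdio.h>

	/* HWP.EPP register values, matching the "new" column above */
	#define HWP_EPP_PERFORMANCE		0x00
	#define HWP_EPP_BALANCE_PERFORMANCE	0x80
	#define HWP_EPP_BALANCE_POWERSAVE	0xC0
	#define HWP_EPP_POWERSAVE		0xFF

	static const char * const energy_perf_strings[] = {
		"performance", "balance_performance", "balance_power", "power"
	};

	/* same bucketing as the new intel_pstate_get_energy_pref_index() */
	static int epp_to_index(unsigned int epp)
	{
		if (epp == HWP_EPP_PERFORMANCE)
			return 0;
		if (epp <= HWP_EPP_BALANCE_PERFORMANCE)
			return 1;
		if (epp <= HWP_EPP_BALANCE_POWERSAVE)
			return 2;
		return 3;
	}

	int main(void)
	{
		/* 128 is the common HW/BIOS default noted above */
		printf("EPP 128 -> %s\n", energy_perf_strings[epp_to_index(128)]);
		return 0;
	}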
From a0df77348ab37687c9513bc2dd772cbc6eadbe63 Mon Sep 17 00:00:00 2001
From: Tao Wang
Date: Tue, 23 May 2017 16:13:18 +0800
Subject: cpufreq: dt: Add support for hi3660

Add the compatible string for supporting the generic device tree
cpufreq-dt driver on HiSilicon's Hi3660 SoC.

Signed-off-by: Tao Wang
Acked-by: Viresh Kumar
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/cpufreq-dt-platdev.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/cpufreq')

diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c
index 921b4a6c3d16..1c262923fe58 100644
--- a/drivers/cpufreq/cpufreq-dt-platdev.c
+++ b/drivers/cpufreq/cpufreq-dt-platdev.c
@@ -31,6 +31,7 @@ static const struct of_device_id machines[] __initconst = {
 	{ .compatible = "arm,integrator-ap", },
 	{ .compatible = "arm,integrator-cp", },
 
+	{ .compatible = "hisilicon,hi3660", },
 	{ .compatible = "hisilicon,hi6220", },
 
 	{ .compatible = "fsl,imx27", },
--
cgit v1.2.3

From 3fafb4e77279ed375077e95f54ee687b481c24d2 Mon Sep 17 00:00:00 2001
From: Octavian Purdila
Date: Tue, 30 May 2017 18:57:18 +0300
Subject: cpufreq: imx6q: imx6ull should use the same flow as imx6ul

This fixes an issue with imx6ull where setting the frequency to 528MHz
would actually set the ARM clock to 324MHz.

Signed-off-by: Octavian Purdila
Signed-off-by: Leonard Crestez
Acked-by: Viresh Kumar
Reviewed-by: Fabio Estevam
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/imx6q-cpufreq.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'drivers/cpufreq')

diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c
index 9c13f097fd8c..b6edd3ccaa55 100644
--- a/drivers/cpufreq/imx6q-cpufreq.c
+++ b/drivers/cpufreq/imx6q-cpufreq.c
@@ -101,7 +101,8 @@ static int imx6q_set_target(struct cpufreq_policy *policy, unsigned int index)
 	 *  - Reprogram pll1_sys_clk and reparent pll1_sw_clk back to it
 	 *  - Disable pll2_pfd2_396m_clk
 	 */
-	if (of_machine_is_compatible("fsl,imx6ul")) {
+	if (of_machine_is_compatible("fsl,imx6ul") ||
+	    of_machine_is_compatible("fsl,imx6ull")) {
 		/*
 		 * When changing pll1_sw_clk's parent to pll1_sys_clk,
 		 * CPU may run at higher than 528MHz, this will lead to
@@ -215,7 +216,8 @@ static int imx6q_cpufreq_probe(struct platform_device *pdev)
 		goto put_clk;
 	}
 
-	if (of_machine_is_compatible("fsl,imx6ul")) {
+	if (of_machine_is_compatible("fsl,imx6ul") ||
+	    of_machine_is_compatible("fsl,imx6ull")) {
 		pll2_bus_clk = clk_get(cpu_dev, "pll2_bus");
 		secondary_sel_clk = clk_get(cpu_dev, "secondary_sel");
 		if (IS_ERR(pll2_bus_clk) || IS_ERR(secondary_sel_clk)) {
--
cgit v1.2.3
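Both hunks now repeat the same two-compatible test; if more i.MX6 variants
ever need this flow, the check could be factored out. A minimal sketch under
that assumption (the helper name is invented here, not part of the patch):

	/*
	 * Hypothetical helper, not in the patch: i.MX6 variants that need
	 * the pll2_pfd2_396m_clk intermediate-parent switching flow.
	 */
	static bool imx6q_needs_secondary_sel(void)
	{
		return of_machine_is_compatible("fsl,imx6ul") ||
		       of_machine_is_compatible("fsl,imx6ull");
	}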
From e773f5c7e8c6279b2560ae1ca2a1d37cc12fe7c3 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski
Date: Wed, 7 Jun 2017 20:13:24 +0200
Subject: cpufreq: exynos5440: Fix inconsistent indenting

Fix inconsistent indentation and unneeded white space in an assignment.

Signed-off-by: Krzysztof Kozlowski
Acked-by: Viresh Kumar
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/exynos5440-cpufreq.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers/cpufreq')

diff --git a/drivers/cpufreq/exynos5440-cpufreq.c b/drivers/cpufreq/exynos5440-cpufreq.c
index 9180d34cc9fc..b6b369c22272 100644
--- a/drivers/cpufreq/exynos5440-cpufreq.c
+++ b/drivers/cpufreq/exynos5440-cpufreq.c
@@ -173,12 +173,12 @@ static void exynos_enable_dvfs(unsigned int cur_frequency)
 	/* Enable PSTATE Change Event */
 	tmp = __raw_readl(dvfs_info->base + XMU_PMUEVTEN);
 	tmp |= (1 << PSTATE_CHANGED_EVTEN_SHIFT);
-	 __raw_writel(tmp, dvfs_info->base + XMU_PMUEVTEN);
+	__raw_writel(tmp, dvfs_info->base + XMU_PMUEVTEN);
 
 	/* Enable PSTATE Change IRQ */
 	tmp = __raw_readl(dvfs_info->base + XMU_PMUIRQEN);
 	tmp |= (1 << PSTATE_CHANGED_IRQEN_SHIFT);
-	 __raw_writel(tmp, dvfs_info->base + XMU_PMUIRQEN);
+	__raw_writel(tmp, dvfs_info->base + XMU_PMUIRQEN);
 
 	/* Set initial performance index */
 	cpufreq_for_each_entry(pos, freq_table)
@@ -330,7 +330,7 @@ static int exynos_cpufreq_probe(struct platform_device *pdev)
 	struct resource res;
 	unsigned int cur_frequency;
 
-	np =  pdev->dev.of_node;
+	np = pdev->dev.of_node;
 	if (!np)
 		return -ENODEV;
 
--
cgit v1.2.3

From 0370f0f975e5561c658aa86e2c372079e6a425eb Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Thu, 15 Jun 2017 10:55:59 +0100
Subject: cpufreq: sfi: make freq_table static pointer

freq_table can be made static as it does not need to be in global scope.

Cleans up the sparse warning: "symbol 'freq_table' was not declared.
Should it be static?"

Signed-off-by: Colin Ian King
Acked-by: Viresh Kumar
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/sfi-cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/cpufreq')

diff --git a/drivers/cpufreq/sfi-cpufreq.c b/drivers/cpufreq/sfi-cpufreq.c
index 992ce6f9abec..3779742f86e3 100644
--- a/drivers/cpufreq/sfi-cpufreq.c
+++ b/drivers/cpufreq/sfi-cpufreq.c
@@ -24,7 +24,7 @@
 
 #include <asm/msr.h>
 
-struct cpufreq_frequency_table *freq_table;
+static struct cpufreq_frequency_table *freq_table;
 static struct sfi_freq_table_entry *sfi_cpufreq_array;
 static int num_freq_table_entries;
 
--
cgit v1.2.3
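For background, `static` at file scope gives a symbol internal linkage,
which is exactly what sparse asks for when nothing outside the file
references it; a minimal contrast:

	/*
	 * External linkage: visible to every translation unit, so sparse
	 * warns unless a header declares it:
	 *
	 *	struct cpufreq_frequency_table *freq_table;
	 */

	/* Internal linkage: private to this file; the warning goes away. */
	static struct cpufreq_frequency_table *freq_table;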
From 1a4fe38add8be45eb3873ad756561b9baf6bcaef Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada
Date: Mon, 12 Jun 2017 16:30:27 -0700
Subject: cpufreq: intel_pstate: Remove max/min fractions to limit performance

In the current model, the max/min perf limits are stored as fractions of
the allowed max_freq (for user space limits) or of 100% (for global
limits). This results in incorrect ratio limits because of rounding
issues for some user space limits. Initially we tried to solve this by
adding more shift bits to increase precision, but there are still
isolated cases with errors. This can be avoided by using ratios
altogether. Since cpuinfo.max_freq is obtained by multiplying the
scaling factor by the max ratio, we can easily keep the max/min limits
in terms of ratios rather than fractions.

For example, if the max ratio = 36:

	cpuinfo.max_freq = 36 * 100000 = 3600000

Suppose user space sets a limit of 1200000; then the max ratio limit is

	36 * 1200000 / 3600000 = 12

This is correct for any user limit.

The other advantage is that we don't need to do any calculation in the
fast path, as the ratio limit is already calculated via the set_policy()
callback.

Signed-off-by: Srinivas Pandruvada
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/intel_pstate.c | 114 +++++++++++++++++++++++------------
 1 file changed, 63 insertions(+), 51 deletions(-)

(limited to 'drivers/cpufreq')

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index eb1158532de3..be0290a1a5e2 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -231,10 +231,8 @@ struct global_params {
 * @prev_cummulative_iowait: IO Wait time difference from last and
 *			current sample
 * @sample:		Storage for storing last Sample data
- * @min_perf:		Minimum capacity limit as a fraction of the maximum
- *			turbo P-state capacity.
- * @max_perf:		Maximum capacity limit as a fraction of the maximum
- *			turbo P-state capacity.
+ * @min_perf_ratio:	Minimum capacity in terms of PERF or HWP ratios
+ * @max_perf_ratio:	Maximum capacity in terms of PERF or HWP ratios
 * @acpi_perf_data:	Stores ACPI perf information read from _PSS
 * @valid_pss_table:	Set to true for valid ACPI _PSS entries found
 * @epp_powersave:	Last saved HWP energy performance preference
@@ -266,8 +264,8 @@ struct cpudata {
 	u64	prev_tsc;
 	u64	prev_cummulative_iowait;
 	struct sample sample;
-	int32_t	min_perf;
-	int32_t	max_perf;
+	int32_t	min_perf_ratio;
+	int32_t	max_perf_ratio;
 #ifdef CONFIG_ACPI
 	struct acpi_processor_performance acpi_perf_data;
 	bool valid_pss_table;
@@ -794,25 +792,32 @@ static struct freq_attr *hwp_cpufreq_attrs[] = {
 	NULL,
 };
 
-static void intel_pstate_hwp_set(unsigned int cpu)
+static void intel_pstate_get_hwp_max(unsigned int cpu, int *phy_max,
+				     int *current_max)
 {
-	struct cpudata *cpu_data = all_cpu_data[cpu];
-	int min, hw_min, max, hw_max;
-	u64 value, cap;
-	s16 epp;
+	u64 cap;
 
 	rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap);
-	hw_min = HWP_LOWEST_PERF(cap);
 	if (global.no_turbo)
-		hw_max = HWP_GUARANTEED_PERF(cap);
+		*current_max = HWP_GUARANTEED_PERF(cap);
 	else
-		hw_max = HWP_HIGHEST_PERF(cap);
+		*current_max = HWP_HIGHEST_PERF(cap);
+
+	*phy_max = HWP_HIGHEST_PERF(cap);
+}
+
+static void intel_pstate_hwp_set(unsigned int cpu)
+{
+	struct cpudata *cpu_data = all_cpu_data[cpu];
+	int max, min;
+	u64 value;
+	s16 epp;
+
+	max = cpu_data->max_perf_ratio;
+	min = cpu_data->min_perf_ratio;
 
-	max = fp_ext_toint(hw_max * cpu_data->max_perf);
 	if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE)
 		min = max;
-	else
-		min = fp_ext_toint(hw_max * cpu_data->min_perf);
 
 	rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
@@ -1528,8 +1533,7 @@ static void intel_pstate_max_within_limits(struct cpudata *cpu)
 
 	update_turbo_state();
 	pstate = intel_pstate_get_base_pstate(cpu);
-	pstate = max(cpu->pstate.min_pstate,
-		     fp_ext_toint(pstate * cpu->max_perf));
+	pstate = max(cpu->pstate.min_pstate, cpu->max_perf_ratio);
 	intel_pstate_set_pstate(cpu, pstate);
 }
@@ -1695,9 +1699,8 @@ static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate)
 	int max_pstate = intel_pstate_get_base_pstate(cpu);
 	int min_pstate;
 
-	min_pstate = max(cpu->pstate.min_pstate,
-			 fp_ext_toint(max_pstate * cpu->min_perf));
-	max_pstate = max(min_pstate, fp_ext_toint(max_pstate * cpu->max_perf));
+	min_pstate = max(cpu->pstate.min_pstate, cpu->min_perf_ratio);
+	max_pstate = max(min_pstate, cpu->max_perf_ratio);
 
 	return clamp_t(int, pstate, min_pstate, max_pstate);
 }
@@ -1967,52 +1970,61 @@ static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy,
 {
 	int max_freq = intel_pstate_get_max_freq(cpu);
 	int32_t max_policy_perf, min_policy_perf;
+	int max_state, turbo_max;
 
-	max_policy_perf = div_ext_fp(policy->max, max_freq);
-	max_policy_perf = clamp_t(int32_t, max_policy_perf, 0, int_ext_tofp(1));
+	/*
+	 * HWP needs some special consideration, because on BDX the
+	 * HWP_REQUEST uses abstract value to represent performance
+	 * rather than pure ratios.
+	 */
+	if (hwp_active) {
+		intel_pstate_get_hwp_max(cpu->cpu, &turbo_max, &max_state);
+	} else {
+		max_state = intel_pstate_get_base_pstate(cpu);
+		turbo_max = cpu->pstate.turbo_pstate;
+	}
+
+	max_policy_perf = max_state * policy->max / max_freq;
 	if (policy->max == policy->min) {
 		min_policy_perf = max_policy_perf;
 	} else {
-		min_policy_perf = div_ext_fp(policy->min, max_freq);
+		min_policy_perf = max_state * policy->min / max_freq;
 		min_policy_perf = clamp_t(int32_t, min_policy_perf,
 					  0, max_policy_perf);
 	}
 
+	pr_debug("cpu:%d max_state %d min_policy_perf:%d max_policy_perf:%d\n",
+		 policy->cpu, max_state,
+		 min_policy_perf, max_policy_perf);
+
 	/* Normalize user input to [min_perf, max_perf] */
 	if (per_cpu_limits) {
-		cpu->min_perf = min_policy_perf;
-		cpu->max_perf = max_policy_perf;
+		cpu->min_perf_ratio = min_policy_perf;
+		cpu->max_perf_ratio = max_policy_perf;
 	} else {
 		int32_t global_min, global_max;
 
 		/* Global limits are in percent of the maximum turbo P-state. */
-		global_max = percent_ext_fp(global.max_perf_pct);
-		global_min = percent_ext_fp(global.min_perf_pct);
-		if (max_freq != cpu->pstate.turbo_freq) {
-			int32_t turbo_factor;
-
-			turbo_factor = div_ext_fp(cpu->pstate.turbo_pstate,
-						  cpu->pstate.max_pstate);
-			global_min = mul_ext_fp(global_min, turbo_factor);
-			global_max = mul_ext_fp(global_max, turbo_factor);
-		}
+		global_max = DIV_ROUND_UP(turbo_max * global.max_perf_pct, 100);
+		global_min = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100);
 		global_min = clamp_t(int32_t, global_min, 0, global_max);
 
-		cpu->min_perf = max(min_policy_perf, global_min);
-		cpu->min_perf = min(cpu->min_perf, max_policy_perf);
-		cpu->max_perf = min(max_policy_perf, global_max);
-		cpu->max_perf = max(min_policy_perf, cpu->max_perf);
+		pr_debug("cpu:%d global_min:%d global_max:%d\n", policy->cpu,
+			 global_min, global_max);
 
-		/* Make sure min_perf <= max_perf */
-		cpu->min_perf = min(cpu->min_perf, cpu->max_perf);
-	}
+		cpu->min_perf_ratio = max(min_policy_perf, global_min);
+		cpu->min_perf_ratio = min(cpu->min_perf_ratio, max_policy_perf);
+		cpu->max_perf_ratio = min(max_policy_perf, global_max);
+		cpu->max_perf_ratio = max(min_policy_perf, cpu->max_perf_ratio);
 
-	cpu->max_perf = round_up(cpu->max_perf, EXT_FRAC_BITS);
-	cpu->min_perf = round_up(cpu->min_perf, EXT_FRAC_BITS);
+		/* Make sure min_perf <= max_perf */
+		cpu->min_perf_ratio = min(cpu->min_perf_ratio,
+					  cpu->max_perf_ratio);
 
-	pr_debug("cpu:%d max_perf_pct:%d min_perf_pct:%d\n", policy->cpu,
-		 fp_ext_toint(cpu->max_perf * 100),
-		 fp_ext_toint(cpu->min_perf * 100));
+	}
+	pr_debug("cpu:%d max_perf_ratio:%d min_perf_ratio:%d\n", policy->cpu,
+		 cpu->max_perf_ratio,
+		 cpu->min_perf_ratio);
 }
 
 static int intel_pstate_set_policy(struct cpufreq_policy *policy)
@@ -2115,8 +2127,8 @@ static int __intel_pstate_cpu_init(struct cpufreq_policy *policy)
 
 	cpu = all_cpu_data[policy->cpu];
 
-	cpu->max_perf = int_ext_tofp(1);
-	cpu->min_perf = 0;
+	cpu->max_perf_ratio = 0xFF;
+	cpu->min_perf_ratio = 0;
 
 	policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling;
 	policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
--
cgit v1.2.3
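The changelog arithmetic is easy to verify; a stand-alone sketch of the
ratio computation (the function name is ours, the formula is the one
intel_pstate_update_perf_limits() now uses):

	#include <stdio.h>

	/* ratio = max_state * policy_freq / max_freq, per the patch above */
	static int freq_to_ratio(int max_state, int policy_khz, int max_khz)
	{
		return max_state * policy_khz / max_khz;
	}

	int main(void)
	{
		int max_state = 36;		/* max ratio from the changelog */
		int max_khz = 36 * 100000;	/* cpuinfo.max_freq = 3600000 */

		/* a user space limit of 1200000 kHz maps to ratio 12 */
		printf("ratio limit = %d\n",
		       freq_to_ratio(max_state, 1200000, max_khz));
		return 0;
	}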
From f8475cef90082bf0902ddab106112de130d90395 Mon Sep 17 00:00:00 2001
From: Len Brown
Date: Fri, 23 Jun 2017 22:11:52 -0700
Subject: x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF

The goal of this change is to give users a uniform and meaningful
result when they read /sys/...cpufreq/scaling_cur_freq on modern x86
hardware, as compared to what they get today.

Modern x86 processors include the hardware needed to accurately
calculate frequency over an interval -- APERF, MPERF, and the TSC.

Here we provide an x86 routine to make this calculation on supported
hardware, and use it in preference to any driver-specific
cpufreq_driver.get() routine.

MHz is computed like so:

	MHz = base_MHz * delta_APERF / delta_MPERF

MHz is the average frequency of the busy processor over a measurement
interval. The interval is defined to be the time between successive
invocations of aperfmperf_khz_on_cpu(), which are expected to happen
on demand when users read the sysfs attribute
cpufreq/scaling_cur_freq.

As with previous methods of calculating MHz, idle time is excluded.

base_MHz above is from TSC calibration global "cpu_khz".

This x86 native method to calculate MHz returns a meaningful result
no matter whether P-states are controlled by hardware or firmware,
and whether or not the Linux cpufreq sub-system is installed.

When this routine is invoked more frequently, the measurement interval
becomes shorter. However, the code limits re-computation to 10ms
intervals so that the average frequency remains meaningful.

Discerning users are encouraged to take advantage of the turbostat(8)
utility, which can gracefully handle concurrent measurement intervals
of arbitrary length.

Signed-off-by: Len Brown
Reviewed-by: Thomas Gleixner
Signed-off-by: Rafael J. Wysocki
---
 arch/x86/kernel/cpu/Makefile     |  1 +
 arch/x86/kernel/cpu/aperfmperf.c | 79 ++++++++++++++++++++++++++++++++++++++++
 drivers/cpufreq/cpufreq.c        | 12 +++++-
 include/linux/cpufreq.h          |  2 +
 4 files changed, 93 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/cpu/aperfmperf.c

(limited to 'drivers/cpufreq')
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 52000010c62e..cdf82492b770 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -21,6 +21,7 @@ obj-y			+= common.o
 obj-y			+= rdrand.o
 obj-y			+= match.o
 obj-y			+= bugs.o
+obj-$(CONFIG_CPU_FREQ)	+= aperfmperf.o
 
 obj-$(CONFIG_PROC_FS)	+= proc.o
 obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
new file mode 100644
index 000000000000..d869c8671e36
--- /dev/null
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -0,0 +1,79 @@
+/*
+ * x86 APERF/MPERF KHz calculation for
+ * /sys/.../cpufreq/scaling_cur_freq
+ *
+ * Copyright (C) 2017 Intel Corp.
+ * Author: Len Brown
+ *
+ * This file is licensed under GPLv2.
+ */
+
+#include <linux/jiffies.h>
+#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+
+struct aperfmperf_sample {
+	unsigned int	khz;
+	unsigned long	jiffies;
+	u64		aperf;
+	u64		mperf;
+};
+
+static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
+
+/*
+ * aperfmperf_snapshot_khz()
+ * On the current CPU, snapshot APERF, MPERF, and jiffies
+ * unless we already did it within 10ms
+ * calculate kHz, save snapshot
+ */
+static void aperfmperf_snapshot_khz(void *dummy)
+{
+	u64 aperf, aperf_delta;
+	u64 mperf, mperf_delta;
+	struct aperfmperf_sample *s = this_cpu_ptr(&samples);
+
+	/* Don't bother re-computing within 10 ms */
+	if (time_before(jiffies, s->jiffies + HZ/100))
+		return;
+
+	rdmsrl(MSR_IA32_APERF, aperf);
+	rdmsrl(MSR_IA32_MPERF, mperf);
+
+	aperf_delta = aperf - s->aperf;
+	mperf_delta = mperf - s->mperf;
+
+	/*
+	 * There is no architectural guarantee that MPERF
+	 * increments faster than we can read it.
+	 */
+	if (mperf_delta == 0)
+		return;
+
+	/*
+	 * if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then
+	 *	khz = (cpu_khz * aperf_delta) / mperf_delta
+	 */
+	if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta)
+		s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
+	else	/* khz = aperf_delta / (mperf_delta / cpu_khz) */
+		s->khz = div64_u64(aperf_delta,
+			div64_u64(mperf_delta, cpu_khz));
+	s->jiffies = jiffies;
+	s->aperf = aperf;
+	s->mperf = mperf;
+}
+
+unsigned int arch_freq_get_on_cpu(int cpu)
+{
+	if (!cpu_khz)
+		return 0;
+
+	if (!static_cpu_has(X86_FEATURE_APERFMPERF))
+		return 0;
+
+	smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
+
+	return per_cpu(samples.khz, cpu);
+}
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 26b643d57847..6e7424d12de9 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -632,11 +632,21 @@ show_one(cpuinfo_transition_latency, cpuinfo.transition_latency);
 show_one(scaling_min_freq, min);
 show_one(scaling_max_freq, max);
 
+__weak unsigned int arch_freq_get_on_cpu(int cpu)
+{
+	return 0;
+}
+
 static ssize_t show_scaling_cur_freq(struct cpufreq_policy *policy, char *buf)
 {
 	ssize_t ret;
+	unsigned int freq;
 
-	if (cpufreq_driver && cpufreq_driver->setpolicy && cpufreq_driver->get)
+	freq = arch_freq_get_on_cpu(policy->cpu);
+	if (freq)
+		ret = sprintf(buf, "%u\n", freq);
+	else if (cpufreq_driver && cpufreq_driver->setpolicy &&
+		 cpufreq_driver->get)
 		ret = sprintf(buf, "%u\n", cpufreq_driver->get(policy->cpu));
 	else
 		ret = sprintf(buf, "%u\n", policy->cur);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index a5ce0bbeadb5..905117bd5012 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -883,6 +883,8 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy)
 }
 #endif
 
+extern unsigned int arch_freq_get_on_cpu(int cpu);
+
 /* the following are really really optional */
 extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs;
 extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs;
--
cgit v1.2.3
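To make the formula concrete, here is a user-space sketch of the same
computation on counter deltas (the sample numbers are made up; the kernel
code above additionally rate-limits snapshots and guards the 64-bit
multiply against overflow):

	#include <stdio.h>
	#include <stdint.h>

	/* khz = cpu_khz * delta_APERF / delta_MPERF, per aperfmperf_snapshot_khz() */
	static uint64_t aperfmperf_khz(uint64_t cpu_khz, uint64_t aperf_delta,
				       uint64_t mperf_delta)
	{
		/* no architectural guarantee that MPERF advanced */
		if (mperf_delta == 0)
			return 0;
		return cpu_khz * aperf_delta / mperf_delta;
	}

	int main(void)
	{
		/* hypothetical: 2.4 GHz base, CPU averaged 25% above base while busy */
		uint64_t khz = aperfmperf_khz(2400000, 1250000, 1000000);

		printf("%llu kHz\n", (unsigned long long)khz);	/* prints 3000000 */
		return 0;
	}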
From 62611cb912f7cb08ef7815780bcc20a09abedd20 Mon Sep 17 00:00:00 2001
From: Len Brown
Date: Fri, 23 Jun 2017 22:11:53 -0700
Subject: intel_pstate: delete scheduler hook in HWP mode

The cpufreq/scaling_cur_freq sysfs attribute is now provided by shared
x86 cpufreq code on modern x86 systems, including all systems supported
by the intel_pstate driver.

In HWP mode, maintaining that value was the sole purpose of the
scheduler hook, intel_pstate_update_util_hwp(), so it can now be
removed.

Signed-off-by: Len Brown
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/intel_pstate.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

(limited to 'drivers/cpufreq')

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index be0290a1a5e2..b00a45595c89 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1736,16 +1736,6 @@ static void intel_pstate_adjust_pstate(struct cpudata *cpu, int target_pstate)
 		fp_toint(cpu->iowait_boost * 100));
 }
 
-static void intel_pstate_update_util_hwp(struct update_util_data *data,
-					 u64 time, unsigned int flags)
-{
-	struct cpudata *cpu = container_of(data, struct cpudata, update_util);
-	u64 delta_ns = time - cpu->sample.time;
-
-	if ((s64)delta_ns >= INTEL_PSTATE_HWP_SAMPLING_INTERVAL)
-		intel_pstate_sample(cpu, time);
-}
-
 static void intel_pstate_update_util_pid(struct update_util_data *data,
 					 u64 time, unsigned int flags)
 {
@@ -1937,6 +1927,9 @@ static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
 {
 	struct cpudata *cpu = all_cpu_data[cpu_num];
 
+	if (hwp_active)
+		return;
+
 	if (cpu->update_util_set)
 		return;
 
@@ -2570,7 +2563,6 @@ static int __init intel_pstate_init(void)
 		} else {
 			hwp_active++;
 			intel_pstate.attr = hwp_cpufreq_attrs;
-			pstate_funcs.update_util = intel_pstate_update_util_hwp;
 			goto hwp_cpu_matched;
 		}
 	} else {
--
cgit v1.2.3
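With arch_freq_get_on_cpu() wired into show_scaling_cur_freq(), the value
is sampled on demand from user space; a minimal reader, assuming the usual
sysfs layout (the cpu0 path is just an example):

	#include <stdio.h>

	int main(void)
	{
		/* each read triggers a fresh APERF/MPERF snapshot, rate-limited to 10 ms */
		FILE *f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq", "r");
		unsigned int khz;

		if (!f || fscanf(f, "%u", &khz) != 1)
			return 1;
		fclose(f);
		printf("cpu0: %u kHz\n", khz);
		return 0;
	}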
From 82b4e03e01bc8def546b52707962809f04ae5c7a Mon Sep 17 00:00:00 2001
From: Len Brown
Date: Fri, 23 Jun 2017 22:11:54 -0700
Subject: intel_pstate: skip scheduler hook when in "performance" mode

When the governor is set to "performance", intel_pstate does not need
the scheduler hook for doing any calculations. Under these conditions,
its only purpose is to continue to maintain
cpufreq/scaling_cur_freq.

The cpufreq/scaling_cur_freq sysfs attribute is now provided by shared
x86 cpufreq code on modern x86 systems, including all systems supported
by the intel_pstate driver.

So in "performance" governor mode, the scheduler hook can be skipped.
This applies to both software and hardware P-state control modes.

Suggested-by: Srinivas Pandruvada
Signed-off-by: Len Brown
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/intel_pstate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/cpufreq')

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index b00a45595c89..1772d309bea9 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -2044,10 +2044,10 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
 		 */
 		intel_pstate_clear_update_util_hook(policy->cpu);
 		intel_pstate_max_within_limits(cpu);
+	} else {
+		intel_pstate_set_update_util_hook(policy->cpu);
 	}
 
-	intel_pstate_set_update_util_hook(policy->cpu);
-
 	if (hwp_active)
 		intel_pstate_hwp_set(policy->cpu);
--
cgit v1.2.3
From 73808d0fd26b3d3f0f44cc7c469ad1d3c1b570b8 Mon Sep 17 00:00:00 2001
From: "Prakash, Prashanth"
Date: Thu, 11 May 2017 16:39:44 -0600
Subject: cpufreq / CPPC: Initialize policy->min to lowest nonlinear performance

The description of Lowest Performance in the ACPI 6.1 specification
states:

"Lowest Performance is the absolute lowest performance level of the
platform. Selecting a performance level lower than the lowest nonlinear
performance level may actually cause an efficiency penalty, but should
reduce the instantaneous power consumption of the processor. In
traditional terms, this represents the T-state range of performance
levels."

Set the default value of policy->min to Lowest Nonlinear Performance
to avoid any potential efficiency penalty.

Signed-off-by: Prashanth Prakash
Acked-by: Viresh Kumar
Acked-by: Alexey Klimov
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/cppc_cpufreq.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'drivers/cpufreq')

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index e82bb3c30b92..10be285c9055 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -144,10 +144,23 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 	cppc_dmi_max_khz = cppc_get_dmi_max_khz();
 
-	policy->min = cpu->perf_caps.lowest_perf * cppc_dmi_max_khz / cpu->perf_caps.highest_perf;
+	/*
+	 * Set min to lowest nonlinear perf to avoid any efficiency penalty (see
+	 * Section 8.4.7.1.1.5 of ACPI 6.1 spec)
+	 */
+	policy->min = cpu->perf_caps.lowest_nonlinear_perf * cppc_dmi_max_khz /
+		cpu->perf_caps.highest_perf;
 	policy->max = cppc_dmi_max_khz;
-	policy->cpuinfo.min_freq = policy->min;
-	policy->cpuinfo.max_freq = policy->max;
+
+	/*
+	 * Set cpuinfo.min_freq to Lowest to make the full range of performance
+	 * available if userspace wants to use any perf between lowest & lowest
+	 * nonlinear perf
+	 */
+	policy->cpuinfo.min_freq = cpu->perf_caps.lowest_perf * cppc_dmi_max_khz /
+		cpu->perf_caps.highest_perf;
+	policy->cpuinfo.max_freq = cppc_dmi_max_khz;
+
 	policy->cpuinfo.transition_latency = cppc_get_transition_latency(cpu_num);
 	policy->shared_type = cpu->shared_type;
--
cgit v1.2.3
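The two frequency values are simple proportions of the DMI max frequency;
a stand-alone sketch with made-up CPPC perf caps (the field names mirror
struct cppc_perf_caps, the numbers are hypothetical):

	#include <stdio.h>

	/* khz = perf * cppc_dmi_max_khz / highest_perf, per cppc_cpufreq_cpu_init() */
	static unsigned int perf_to_khz(unsigned long long perf,
					unsigned long long highest_perf,
					unsigned long long dmi_max_khz)
	{
		return (unsigned int)(perf * dmi_max_khz / highest_perf);
	}

	int main(void)
	{
		/* hypothetical caps: lowest 100, lowest nonlinear 200, highest 400 */
		unsigned int dmi_max_khz = 2800000;

		/* policy->min now uses the nonlinear floor... */
		printf("policy->min      = %u kHz\n", perf_to_khz(200, 400, dmi_max_khz));
		/* ...while cpuinfo.min_freq still exposes the absolute lowest level */
		printf("cpuinfo.min_freq = %u kHz\n", perf_to_khz(100, 400, dmi_max_khz));
		return 0;
	}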
From fab24dcc395637557a7988a867e7b3a5823917a9 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Thu, 29 Jun 2017 01:47:56 +0200
Subject: cpufreq: intel_pstate: Clean up after performance governor changes

After commit 82b4e03e01bc (intel_pstate: skip scheduler hook when in
"performance" mode), get_target_pstate_use_performance() and
get_target_pstate_use_cpu_load() are never called if scaling_governor
is "performance", so drop the CPUFREQ_POLICY_PERFORMANCE checks from
them as they will never trigger anyway.

Moreover, the documentation needs to be updated to reflect the change
made by the above commit, so do that too.

Signed-off-by: Rafael J. Wysocki
Acked-by: Srinivas Pandruvada
---
 Documentation/admin-guide/pm/intel_pstate.rst | 6 ++----
 drivers/cpufreq/intel_pstate.c                | 6 ------
 2 files changed, 2 insertions(+), 10 deletions(-)

(limited to 'drivers/cpufreq')

diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst
index 33d703989ea8..1d6249825efc 100644
--- a/Documentation/admin-guide/pm/intel_pstate.rst
+++ b/Documentation/admin-guide/pm/intel_pstate.rst
@@ -157,10 +157,8 @@ Without HWP, this P-state selection algorithm is always the same regardless of
 the processor model and platform configuration.
 
 It selects the maximum P-state it is allowed to use, subject to limits set via
-``sysfs``, every time the P-state selection computations are carried out by the
-driver's utilization update callback for the given CPU (that does not happen
-more often than every 10 ms), but the hardware configuration will not be changed
-if the new P-state is the same as the current one.
+``sysfs``, every time the driver configuration for the given CPU is updated
+(e.g. via ``sysfs``).
 
 This is the default P-state selection algorithm if the
 :c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 1772d309bea9..756483251832 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1620,9 +1620,6 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
 	int32_t busy_frac, boost;
 	int target, avg_pstate;
 
-	if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE)
-		return cpu->pstate.turbo_pstate;
-
 	busy_frac = div_fp(sample->mperf, sample->tsc);
 
 	boost = cpu->iowait_boost;
@@ -1659,9 +1656,6 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
 	int32_t perf_scaled, max_pstate, current_pstate, sample_ratio;
 	u64 duration_ns;
 
-	if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE)
-		return cpu->pstate.turbo_pstate;
-
 	/*
 	 * perf_scaled is the ratio of the average P-state during the last
 	 * sampling period to the P-state requested last time (in percent).
--
cgit v1.2.3