From 86686b8f7ad3abe5aca17643efcee2bbce31a8f7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 22 Jan 2024 12:22:38 +0100 Subject: PM: sleep: Simplify dpm_suspended_list walk in dpm_resume() Notice that devices can be moved to dpm_prepared_list before running their resume callbacks, in analogy with dpm_noirq_resume_devices() and dpm_resume_early(), because doing so will not affect the final ordering of that list. Namely, if a device is the first dpm_suspended_list entry while dpm_list_mtx is held, it has not been removed so far and it cannot be removed until dpm_list_mtx is released, so moving it to dpm_prepared_list at that point is valid. If it is removed later, while its resume callback is running, it will be deleted from dpm_prepared_list without changing the ordering of the other devices in that list. Accordingly, rearrange the while () loop in dpm_resume() to move devices to dpm_prepared_list before running their resume callbacks and simplify the locking and device reference counting in it. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Stanislaw Gruszka --- drivers/base/power/main.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index fadcd0379dc2..10984aa5192b 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1017,25 +1017,19 @@ void dpm_resume(pm_message_t state) while (!list_empty(&dpm_suspended_list)) { dev = to_device(dpm_suspended_list.next); - - get_device(dev); + list_move_tail(&dev->power.entry, &dpm_prepared_list); if (!dev->power.async_in_progress) { + get_device(dev); + mutex_unlock(&dpm_list_mtx); device_resume(dev, state, false); + put_device(dev); + mutex_lock(&dpm_list_mtx); } - - if (!list_empty(&dev->power.entry)) - list_move_tail(&dev->power.entry, &dpm_prepared_list); - - mutex_unlock(&dpm_list_mtx); - - put_device(dev); - - mutex_lock(&dpm_list_mtx); } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); -- cgit v1.2.3 From 9cb1c9820f960a4b100e7760cb3773f344e7ae35 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 22 Jan 2024 12:24:21 +0100 Subject: PM: sleep: Relocate two device PM core functions Move is_async() and dpm_async_fn() in the PM core to a more suitable place. No functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Stanislaw Gruszka --- drivers/base/power/main.c | 58 +++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 10984aa5192b..a2cdef95d8c4 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -578,6 +578,35 @@ bool dev_pm_skip_resume(struct device *dev) return !dev->power.must_resume; } +static bool is_async(struct device *dev) +{ + return dev->power.async_suspend && pm_async_enabled + && !pm_trace_is_enabled(); +} + +static bool dpm_async_fn(struct device *dev, async_func_t func) +{ + reinit_completion(&dev->power.completion); + + if (is_async(dev)) { + dev->power.async_in_progress = true; + + get_device(dev); + + if (async_schedule_dev_nocall(func, dev)) + return true; + + put_device(dev); + } + /* + * Because async_schedule_dev_nocall() above has returned false or it + * has not been called at all, func() is not running and it is safe to + * update the async_in_progress flag without extra synchronization.
+ */ + dev->power.async_in_progress = false; + return false; +} + /** * device_resume_noirq - Execute a "noirq resume" callback for given device. * @dev: Device to handle. @@ -664,35 +693,6 @@ Out: } } -static bool is_async(struct device *dev) -{ - return dev->power.async_suspend && pm_async_enabled - && !pm_trace_is_enabled(); -} - -static bool dpm_async_fn(struct device *dev, async_func_t func) -{ - reinit_completion(&dev->power.completion); - - if (is_async(dev)) { - dev->power.async_in_progress = true; - - get_device(dev); - - if (async_schedule_dev_nocall(func, dev)) - return true; - - put_device(dev); - } - /* - * Because async_schedule_dev_nocall() above has returned false or it - * has not been called at all, func() is not running and it is safe to - * update the async_in_progress flag without extra synchronization. - */ - dev->power.async_in_progress = false; - return false; -} - static void async_resume_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; -- cgit v1.2.3 From 3a480d4bb5b1e1f09426223e68acaa90da32e384 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 5 Jan 2024 11:26:48 +0100 Subject: driver core: cpu: make cpu_subsys const Now that the driver core can properly handle constant struct bus_type, move the cpu_subsys variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/2024010548-crane-snooze-a871@gregkh Signed-off-by: Greg Kroah-Hartman --- drivers/base/cpu.c | 2 +- include/linux/cpu.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 47de0f140ba6..ac84854c85d7 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -366,7 +366,7 @@ static int cpu_uevent(const struct device *dev, struct kobj_uevent_env *env) } #endif -struct bus_type cpu_subsys = { +const struct bus_type cpu_subsys = { .name = "cpu", .dev_name = "cpu", .match = cpu_subsys_match, diff --git a/include/linux/cpu.h b/include/linux/cpu.h index dcb89c987164..0b993a140946 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -128,7 +128,7 @@ static inline void cpu_maps_update_done(void) static inline int add_cpu(unsigned int cpu) { return 0;} #endif /* CONFIG_SMP */ -extern struct bus_type cpu_subsys; +extern const struct bus_type cpu_subsys; extern int lockdep_is_cpus_held(void); -- cgit v1.2.3 From f297a3844aa059c53be3f69be85ebc071b8a6d16 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 6 Jan 2024 21:57:50 -0800 Subject: driver core: component: fix spellos Correct spelling mistakes reported by codespell. Signed-off-by: Randy Dunlap Cc: "Rafael J. Wysocki" Cc: dri-devel@lists.freedesktop.org Link: https://lore.kernel.org/r/20240107055750.22441-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- drivers/base/component.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/component.c b/drivers/base/component.c index 7dbf14a1d915..741497324d78 100644 --- a/drivers/base/component.c +++ b/drivers/base/component.c @@ -751,7 +751,7 @@ static int __component_add(struct device *dev, const struct component_ops *ops, * component_bind_all(). See also &struct component_ops. * * @subcomponent must be nonzero and is used to differentiate between multiple - * components registerd on the same device @dev. These components are match + * components registered on the same device @dev. 
These components are match * using component_match_add_typed(). * * The component needs to be unregistered at driver unload/disconnect by @@ -781,7 +781,7 @@ EXPORT_SYMBOL_GPL(component_add_typed); * The component needs to be unregistered at driver unload/disconnect by * calling component_del(). * - * See also component_add_typed() for a variant that allows multipled different + * See also component_add_typed() for a variant that allows multiple different * components on the same device. */ int component_add(struct device *dev, const struct component_ops *ops) -- cgit v1.2.3 From 98323e9d70172f1b46d1cadb20d6c54abf62870d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 17 Jan 2024 20:05:45 +0100 Subject: topology: Set capacity_freq_ref in all cases If "capacity-dmips-mhz" is not set, raw_capacity is null and we skip the normalization step which includes setting per_cpu capacity_freq_ref. Always register the notifier but skip the capacity normalization if raw_capacity is null. Fixes: 9942cb22ea45 ("sched/topology: Add a new arch_scale_freq_ref() method") Signed-off-by: Vincent Guittot Acked-by: Sudeep Holla Tested-by: Pierre Gondois Tested-by: Mark Brown Tested-by: Paul Barker Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20240117190545.596057-1-vincent.guittot@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/base/arch_topology.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 018ac202de34..024b78a0cfc1 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -431,9 +431,6 @@ init_cpu_capacity_callback(struct notifier_block *nb, struct cpufreq_policy *policy = data; int cpu; - if (!raw_capacity) - return 0; - if (val != CPUFREQ_CREATE_POLICY) return 0; @@ -450,9 +447,11 @@ init_cpu_capacity_callback(struct notifier_block *nb, } if (cpumask_empty(cpus_to_visit)) { - topology_normalize_cpu_scale(); - schedule_work(&update_topology_flags_work); - free_raw_capacity(); + if (raw_capacity) { + topology_normalize_cpu_scale(); + schedule_work(&update_topology_flags_work); + free_raw_capacity(); + } pr_debug("cpu_capacity: parsing done\n"); schedule_work(&parsing_done_work); } @@ -472,7 +471,7 @@ static int __init register_cpufreq_notifier(void) * On ACPI-based systems skip registering cpufreq notifier as cpufreq * information is not needed for cpu capacity initialization. */ - if (!acpi_disabled || !raw_capacity) + if (!acpi_disabled) return -EINVAL; if (!alloc_cpumask_var(&cpus_to_visit, GFP_KERNEL)) -- cgit v1.2.3 From 7fddac12c38237252431d5b8af7b6d5771b6d125 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 2 Feb 2024 01:56:33 -0800 Subject: driver core: Fix device_link_flag_is_sync_state_only() device_link_flag_is_sync_state_only() correctly returns true on the flags of an existing device link that only implements sync_state() functionality. However, it incorrectly and confusingly returns false if it's called with DL_FLAG_SYNC_STATE_ONLY. This bug doesn't manifest in any of the existing calls to this function, but fix this confusing behavior to avoid future bugs. 
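For illustration, here is a sketch of the predicate before and after this change (using the same DL_FLAG_* constants that appear in the hunk below):

    /* Before: DL_FLAG_MANAGED must be set in @flags for the check to
     * pass, so a bare DL_FLAG_SYNC_STATE_ONLY argument yields false. */
    return (flags & ~(DL_FLAG_INFERRED | DL_FLAG_CYCLE)) ==
           (DL_FLAG_SYNC_STATE_ONLY | DL_FLAG_MANAGED);

    /* After: DL_FLAG_MANAGED is masked off as a marker flag, so
     * device_link_flag_is_sync_state_only(DL_FLAG_SYNC_STATE_ONLY)
     * returns true, as one would expect. */
    return (flags & ~DL_MARKER_FLAGS) == DL_FLAG_SYNC_STATE_ONLY;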
Fixes: 67cad5c67019 ("driver core: fw_devlink: Add DL_FLAG_CYCLE support to device links") Signed-off-by: Saravana Kannan Tested-by: Xu Yang Link: https://lore.kernel.org/r/20240202095636.868578-2-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/core.c b/drivers/base/core.c index 14d46af40f9a..52215c4c7209 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -284,10 +284,12 @@ static bool device_is_ancestor(struct device *dev, struct device *target) return false; } +#define DL_MARKER_FLAGS (DL_FLAG_INFERRED | \ + DL_FLAG_CYCLE | \ + DL_FLAG_MANAGED) static inline bool device_link_flag_is_sync_state_only(u32 flags) { - return (flags & ~(DL_FLAG_INFERRED | DL_FLAG_CYCLE)) == - (DL_FLAG_SYNC_STATE_ONLY | DL_FLAG_MANAGED); + return (flags & ~DL_MARKER_FLAGS) == DL_FLAG_SYNC_STATE_ONLY; } /** -- cgit v1.2.3 From 6442d79d880cf7a2fff18779265d657fef0cce4c Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 2 Feb 2024 01:56:34 -0800 Subject: driver core: fw_devlink: Improve detection of overlapping cycles fw_devlink can detect most overlapping/intersecting cycles. However it was missing a few corner cases because of an incorrect optimization logic that tries to avoid repeating cycle detection for devices that are already marked as part of a cycle. Here's an example provided by Xu Yang (edited for clarity): usb +-----+ tcpc | | +-----+ | +--| | |----------->|EP| |--+ | | +--| |EP|<-----------| | |--+ | | B | | | +-----+ | A | | +-----+ | ^ +-----+ | | | | | +-----| C |<--+ | | +-----+ usb-phy Node A (tcpc) will be populated as device 1-0050. Node B (usb) will be populated as device 38100000.usb. Node C (usb-phy) will be populated as device 381f0040.usb-phy. The description below uses the notation: consumer --> supplier child ==> parent 1. Node C is populated as device C. No cycles detected because cycle detection is only run when a fwnode link is converted to a device link. 2. Node B is populated as device B. As we convert B --> C into a device link we run cycle detection and find and mark the device link/fwnode link cycle: C--> A --> B.EP ==> B --> C 3. Node A is populated as device A. As we convert C --> A into a device link, we see it's already part of a cycle (from step 2) and don't run cycle detection. Thus we miss detecting the cycle: A --> B.EP ==> B --> A.EP ==> A Looking at it another way, A depends on B in one way: A --> B.EP ==> B But B depends on A in two ways and we only detect the first: B --> C --> A B --> A.EP ==> A To detect both of these, we remove the incorrect optimization attempt in step 3 and run cycle detection even if the fwnode link from which the device link is being created has already been marked as part of a cycle. 
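In other words, the check in fw_devlink_create_devlink() (shown in full in the hunk below) now amounts to the following sketch:

    /* Run cycle detection unless this is a plain SYNC_STATE_ONLY link
     * that has not already been marked as part of a cycle. */
    if (!device_link_flag_is_sync_state_only(flags) ||
        flags & DL_FLAG_CYCLE)
            __fw_devlink_relax_cycles(con, sup_handle);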
Reported-by: Xu Yang Closes: https://lore.kernel.org/lkml/DU2PR04MB8822693748725F85DC0CB86C8C792@DU2PR04MB8822.eurprd04.prod.outlook.com/ Fixes: 3fb16866b51d ("driver core: fw_devlink: Make cycle detection more robust") Signed-off-by: Saravana Kannan Tested-by: Xu Yang Link: https://lore.kernel.org/r/20240202095636.868578-3-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/core.c b/drivers/base/core.c index 52215c4c7209..e3d666461835 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2060,9 +2060,14 @@ static int fw_devlink_create_devlink(struct device *con, /* * SYNC_STATE_ONLY device links don't block probing and supports cycles. - * So cycle detection isn't necessary and shouldn't be done. + * So, one might expect that cycle detection isn't necessary for them. + * However, if the device link was marked as SYNC_STATE_ONLY because + * it's part of a cycle, then we still need to do cycle detection. This + * is because the consumer and supplier might be part of multiple cycles + * and we need to detect all those cycles. */ - if (!(flags & DL_FLAG_SYNC_STATE_ONLY)) { + if (!device_link_flag_is_sync_state_only(flags) || + flags & DL_FLAG_CYCLE) { device_links_write_lock(); if (__fw_devlink_relax_cycles(con, sup_handle)) { __fwnode_link_cycle(link); -- cgit v1.2.3 From 6e7ad1aebb4fc9fed0217dd50ef6e58a53f17d81 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 2 Feb 2024 01:56:35 -0800 Subject: driver core: fw_devlink: Improve logs for cycle detection The links in a cycle are not all logged in a consistent manner or not logged at all. Make them consistent by adding a "cycle:" string and log all the links in the cycles (even the child ==> parent dependency) so that it's easier to debug cycle detection code. Also, mark the start and end of a cycle so it's easy to tell when multiple cycles are logged back to back. Signed-off-by: Saravana Kannan Tested-by: Xu Yang Link: https://lore.kernel.org/r/20240202095636.868578-4-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/core.c b/drivers/base/core.c index e3d666461835..9828da9b933c 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -125,7 +125,7 @@ static void __fwnode_link_del(struct fwnode_link *link) */ static void __fwnode_link_cycle(struct fwnode_link *link) { - pr_debug("%pfwf: Relaxing link with %pfwf\n", + pr_debug("%pfwf: cycle: depends on %pfwf\n", link->consumer, link->supplier); link->flags |= FWLINK_FLAG_CYCLE; } @@ -1945,6 +1945,7 @@ static bool __fw_devlink_relax_cycles(struct device *con, /* Termination condition.
*/ if (sup_dev == con) { + pr_debug("----- cycle: start -----\n"); ret = true; goto out; } @@ -1976,8 +1977,11 @@ static bool __fw_devlink_relax_cycles(struct device *con, else par_dev = fwnode_get_next_parent_dev(sup_handle); - if (par_dev && __fw_devlink_relax_cycles(con, par_dev->fwnode)) + if (par_dev && __fw_devlink_relax_cycles(con, par_dev->fwnode)) { + pr_debug("%pfwf: cycle: child of %pfwf\n", sup_handle, + par_dev->fwnode); ret = true; + } if (!sup_dev) goto out; @@ -1993,6 +1997,8 @@ static bool __fw_devlink_relax_cycles(struct device *con, if (__fw_devlink_relax_cycles(con, dev_link->supplier->fwnode)) { + pr_debug("%pfwf: cycle: depends on %pfwf\n", sup_handle, + dev_link->supplier->fwnode); fw_devlink_relax_link(dev_link); dev_link->flags |= DL_FLAG_CYCLE; ret = true; @@ -2072,6 +2078,7 @@ static int fw_devlink_create_devlink(struct device *con, if (__fw_devlink_relax_cycles(con, sup_handle)) { __fwnode_link_cycle(link); flags = fw_devlink_get_flags(link->flags); + pr_debug("----- cycle: end -----\n"); dev_info(con, "Fixed dependency cycle(s) with %pfwf\n", sup_handle); } -- cgit v1.2.3 From b730bab0b9c4204d7dda3f5bc8adf4292497fc39 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Jan 2024 17:11:57 +0100 Subject: PM: sleep: stats: Use an array of step failure counters Instead of using a set of individual struct suspend_stats fields representing suspend step failure counters, use an array of counters indexed by enum suspend_stat_step for this purpose, which allows dpm_save_failed_step() to increment the appropriate counter automatically, so that its callers don't need to do that directly. It also allows suspend_stats_show() to carry out a loop over the counters array to print their values. Because the counters cannot become negative, use unsigned int for representing them. The only user-observable impact of this change is a different ordering of entries in the suspend_stats debugfs file which is not expected to matter. Signed-off-by: Rafael J. Wysocki Reviewed-by: Stanislaw Gruszka Reviewed-by: Ulf Hansson --- drivers/base/power/main.c | 22 +++++++++----------- include/linux/suspend.h | 12 ++++------- kernel/power/main.c | 51 +++++++++++++++++++++++++---------------------- kernel/power/suspend.c | 1 - 4 files changed, 40 insertions(+), 46 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index a2cdef95d8c4..896450503d0d 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -686,7 +686,6 @@ Out: TRACE_RESUME(error); if (error) { - suspend_stats.failed_resume_noirq++; dpm_save_failed_step(SUSPEND_RESUME_NOIRQ); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); @@ -817,7 +816,6 @@ Out: complete_all(&dev->power.completion); if (error) { - suspend_stats.failed_resume_early++; dpm_save_failed_step(SUSPEND_RESUME_EARLY); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async early" : " early", error); @@ -974,7 +972,6 @@ static void device_resume(struct device *dev, pm_message_t state, bool async) TRACE_RESUME(error); if (error) { - suspend_stats.failed_resume++; dpm_save_failed_step(SUSPEND_RESUME); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? 
" async" : "", error); @@ -1323,10 +1320,9 @@ static int dpm_noirq_suspend_devices(pm_message_t state) if (!error) error = async_error; - if (error) { - suspend_stats.failed_suspend_noirq++; + if (error) dpm_save_failed_step(SUSPEND_SUSPEND_NOIRQ); - } + dpm_show_time(starttime, state, error, "noirq"); trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, false); return error; @@ -1509,8 +1505,8 @@ int dpm_suspend_late(pm_message_t state) async_synchronize_full(); if (!error) error = async_error; + if (error) { - suspend_stats.failed_suspend_late++; dpm_save_failed_step(SUSPEND_SUSPEND_LATE); dpm_resume_early(resume_event(state)); } @@ -1789,10 +1785,10 @@ int dpm_suspend(pm_message_t state) async_synchronize_full(); if (!error) error = async_error; - if (error) { - suspend_stats.failed_suspend++; + + if (error) dpm_save_failed_step(SUSPEND_SUSPEND); - } + dpm_show_time(starttime, state, error, NULL); trace_suspend_resume(TPS("dpm_suspend"), state.event, false); return error; @@ -1943,11 +1939,11 @@ int dpm_suspend_start(pm_message_t state) int error; error = dpm_prepare(state); - if (error) { - suspend_stats.failed_prepare++; + if (error) dpm_save_failed_step(SUSPEND_PREPARE); - } else + else error = dpm_suspend(state); + dpm_show_time(starttime, state, error, "start"); return error; } diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 58f7352af205..5e4c4d4aed95 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -52,17 +52,12 @@ enum suspend_stat_step { SUSPEND_RESUME }; +#define SUSPEND_NR_STEPS SUSPEND_RESUME + struct suspend_stats { + unsigned int step_failures[SUSPEND_NR_STEPS]; int success; int fail; - int failed_freeze; - int failed_prepare; - int failed_suspend; - int failed_suspend_late; - int failed_suspend_noirq; - int failed_resume; - int failed_resume_early; - int failed_resume_noirq; #define REC_FAILED_NUM 2 int last_failed_dev; char failed_devs[REC_FAILED_NUM][40]; @@ -95,6 +90,7 @@ static inline void dpm_save_failed_errno(int err) static inline void dpm_save_failed_step(enum suspend_stat_step step) { + suspend_stats.step_failures[step-1]++; suspend_stats.failed_steps[suspend_stats.last_failed_step] = step; suspend_stats.last_failed_step++; suspend_stats.last_failed_step %= REC_FAILED_NUM; diff --git a/kernel/power/main.c b/kernel/power/main.c index dca14543dfed..d7a02105b183 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -341,18 +341,28 @@ static struct kobj_attribute _name = __ATTR_RO(_name) suspend_attr(success, "%d\n"); suspend_attr(fail, "%d\n"); -suspend_attr(failed_freeze, "%d\n"); -suspend_attr(failed_prepare, "%d\n"); -suspend_attr(failed_suspend, "%d\n"); -suspend_attr(failed_suspend_late, "%d\n"); -suspend_attr(failed_suspend_noirq, "%d\n"); -suspend_attr(failed_resume, "%d\n"); -suspend_attr(failed_resume_early, "%d\n"); -suspend_attr(failed_resume_noirq, "%d\n"); suspend_attr(last_hw_sleep, "%llu\n"); suspend_attr(total_hw_sleep, "%llu\n"); suspend_attr(max_hw_sleep, "%llu\n"); +#define suspend_step_attr(_name, step) \ +static ssize_t _name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ +{ \ + return sprintf(buf, "%u\n", \ + suspend_stats.step_failures[step-1]); \ +} \ +static struct kobj_attribute _name = __ATTR_RO(_name) + +suspend_step_attr(failed_freeze, SUSPEND_FREEZE); +suspend_step_attr(failed_prepare, SUSPEND_PREPARE); +suspend_step_attr(failed_suspend, SUSPEND_SUSPEND); +suspend_step_attr(failed_suspend_late, SUSPEND_SUSPEND_LATE); 
+suspend_step_attr(failed_suspend_noirq, SUSPEND_SUSPEND_NOIRQ); +suspend_step_attr(failed_resume, SUSPEND_RESUME); +suspend_step_attr(failed_resume_early, SUSPEND_RESUME_EARLY); +suspend_step_attr(failed_resume_noirq, SUSPEND_RESUME_NOIRQ); + static ssize_t last_failed_dev_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -439,6 +449,7 @@ static const struct attribute_group suspend_attr_group = { static int suspend_stats_show(struct seq_file *s, void *unused) { int i, index, last_dev, last_errno, last_step; + enum suspend_stat_step step; last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; last_dev %= REC_FAILED_NUM; @@ -446,22 +457,14 @@ static int suspend_stats_show(struct seq_file *s, void *unused) last_errno %= REC_FAILED_NUM; last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; last_step %= REC_FAILED_NUM; - seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n" - "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n", - "success", suspend_stats.success, - "fail", suspend_stats.fail, - "failed_freeze", suspend_stats.failed_freeze, - "failed_prepare", suspend_stats.failed_prepare, - "failed_suspend", suspend_stats.failed_suspend, - "failed_suspend_late", - suspend_stats.failed_suspend_late, - "failed_suspend_noirq", - suspend_stats.failed_suspend_noirq, - "failed_resume", suspend_stats.failed_resume, - "failed_resume_early", - suspend_stats.failed_resume_early, - "failed_resume_noirq", - suspend_stats.failed_resume_noirq); + + seq_printf(s, "success: %d\nfail: %d\n", + suspend_stats.success, suspend_stats.fail); + + for (step = SUSPEND_FREEZE; step <= SUSPEND_NR_STEPS; step++) + seq_printf(s, "failed_%s: %u\n", suspend_step_names[step], + suspend_stats.step_failures[step-1]); + seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", suspend_stats.failed_devs[last_dev]); for (i = 1; i < REC_FAILED_NUM; i++) { diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index fa3bf161d13f..07bde5bba49e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -367,7 +367,6 @@ static int suspend_prepare(suspend_state_t state) if (!error) return 0; - suspend_stats.failed_freeze++; dpm_save_failed_step(SUSPEND_FREEZE); pm_notifier_call_chain(PM_POST_SUSPEND); Restore: -- cgit v1.2.3 From 9ff544fa5f94fe07f99a36d2138075b322067546 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Jan 2024 17:30:44 +0100 Subject: PM: sleep: stats: Define suspend_stats next to the code using it It is not necessary to define struct suspend_stats in a header file and the suspend_stats variable in the core device system-wide PM code. They both can be defined in kernel/power/main.c, next to the sysfs and debugfs code accessing suspend_stats, which can be static. Modify the code in question in accordance with the above observation and replace the static inline functions manipulating suspend_stats with regular ones defined in kernel/power/main.c. While at it, move the enum suspend_stat_step to the end of suspend.h which is a more suitable place for it. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson --- drivers/base/power/main.c | 1 - include/linux/suspend.h | 71 ++++++++++--------------------------------- kernel/power/main.c | 76 +++++++++++++++++++++++++++++++++++++++-------- kernel/power/power.h | 2 ++ kernel/power/suspend.c | 7 +---- 5 files changed, 81 insertions(+), 76 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 896450503d0d..761b03b4edb1 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -60,7 +60,6 @@ static LIST_HEAD(dpm_suspended_list); static LIST_HEAD(dpm_late_early_list); static LIST_HEAD(dpm_noirq_list); -struct suspend_stats suspend_stats; static DEFINE_MUTEX(dpm_list_mtx); static pm_message_t pm_transition; diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 216bae989535..da6ebca3ff77 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -40,62 +40,6 @@ typedef int __bitwise suspend_state_t; #define PM_SUSPEND_MIN PM_SUSPEND_TO_IDLE #define PM_SUSPEND_MAX ((__force suspend_state_t) 4) -enum suspend_stat_step { - SUSPEND_WORKING = 0, - SUSPEND_FREEZE, - SUSPEND_PREPARE, - SUSPEND_SUSPEND, - SUSPEND_SUSPEND_LATE, - SUSPEND_SUSPEND_NOIRQ, - SUSPEND_RESUME_NOIRQ, - SUSPEND_RESUME_EARLY, - SUSPEND_RESUME -}; - -#define SUSPEND_NR_STEPS SUSPEND_RESUME - -struct suspend_stats { - unsigned int step_failures[SUSPEND_NR_STEPS]; - unsigned int success; - unsigned int fail; -#define REC_FAILED_NUM 2 - int last_failed_dev; - char failed_devs[REC_FAILED_NUM][40]; - int last_failed_errno; - int errno[REC_FAILED_NUM]; - int last_failed_step; - u64 last_hw_sleep; - u64 total_hw_sleep; - u64 max_hw_sleep; - enum suspend_stat_step failed_steps[REC_FAILED_NUM]; -}; - -extern struct suspend_stats suspend_stats; - -static inline void dpm_save_failed_dev(const char *name) -{ - strscpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev], - name, - sizeof(suspend_stats.failed_devs[0])); - suspend_stats.last_failed_dev++; - suspend_stats.last_failed_dev %= REC_FAILED_NUM; -} - -static inline void dpm_save_failed_errno(int err) -{ - suspend_stats.errno[suspend_stats.last_failed_errno] = err; - suspend_stats.last_failed_errno++; - suspend_stats.last_failed_errno %= REC_FAILED_NUM; -} - -static inline void dpm_save_failed_step(enum suspend_stat_step step) -{ - suspend_stats.step_failures[step-1]++; - suspend_stats.failed_steps[suspend_stats.last_failed_step] = step; - suspend_stats.last_failed_step++; - suspend_stats.last_failed_step %= REC_FAILED_NUM; -} - /** * struct platform_suspend_ops - Callbacks for managing platform dependent * system sleep states. 
@@ -623,4 +567,19 @@ static inline void queue_up_suspend_work(void) {} #endif /* !CONFIG_PM_AUTOSLEEP */ +enum suspend_stat_step { + SUSPEND_WORKING = 0, + SUSPEND_FREEZE, + SUSPEND_PREPARE, + SUSPEND_SUSPEND, + SUSPEND_SUSPEND_LATE, + SUSPEND_SUSPEND_NOIRQ, + SUSPEND_RESUME_NOIRQ, + SUSPEND_RESUME_EARLY, + SUSPEND_RESUME +}; + +void dpm_save_failed_dev(const char *name); +void dpm_save_failed_step(enum suspend_stat_step step); + #endif /* _LINUX_SUSPEND_H */ diff --git a/kernel/power/main.c b/kernel/power/main.c index d6b4a9258288..8c4bf5a54805 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -95,19 +95,6 @@ int unregister_pm_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_pm_notifier); -void pm_report_hw_sleep_time(u64 t) -{ - suspend_stats.last_hw_sleep = t; - suspend_stats.total_hw_sleep += t; -} -EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time); - -void pm_report_max_hw_sleep(u64 t) -{ - suspend_stats.max_hw_sleep = t; -} -EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep); - int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down) { int ret; @@ -319,6 +306,69 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(pm_test); #endif /* CONFIG_PM_SLEEP_DEBUG */ +#define SUSPEND_NR_STEPS SUSPEND_RESUME +#define REC_FAILED_NUM 2 + +struct suspend_stats { + unsigned int step_failures[SUSPEND_NR_STEPS]; + unsigned int success; + unsigned int fail; + int last_failed_dev; + char failed_devs[REC_FAILED_NUM][40]; + int last_failed_errno; + int errno[REC_FAILED_NUM]; + int last_failed_step; + u64 last_hw_sleep; + u64 total_hw_sleep; + u64 max_hw_sleep; + enum suspend_stat_step failed_steps[REC_FAILED_NUM]; +}; + +static struct suspend_stats suspend_stats; + +void dpm_save_failed_dev(const char *name) +{ + strscpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev], + name, sizeof(suspend_stats.failed_devs[0])); + suspend_stats.last_failed_dev++; + suspend_stats.last_failed_dev %= REC_FAILED_NUM; +} + +void dpm_save_failed_step(enum suspend_stat_step step) +{ + suspend_stats.step_failures[step-1]++; + suspend_stats.failed_steps[suspend_stats.last_failed_step] = step; + suspend_stats.last_failed_step++; + suspend_stats.last_failed_step %= REC_FAILED_NUM; +} + +void dpm_save_errno(int err) +{ + if (!err) { + suspend_stats.success++; + return; + } + + suspend_stats.fail++; + + suspend_stats.errno[suspend_stats.last_failed_errno] = err; + suspend_stats.last_failed_errno++; + suspend_stats.last_failed_errno %= REC_FAILED_NUM; +} + +void pm_report_hw_sleep_time(u64 t) +{ + suspend_stats.last_hw_sleep = t; + suspend_stats.total_hw_sleep += t; +} +EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time); + +void pm_report_max_hw_sleep(u64 t) +{ + suspend_stats.max_hw_sleep = t; +} +EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep); + static const char * const suspend_step_names[] = { [SUSPEND_WORKING] = "", [SUSPEND_FREEZE] = "freeze", diff --git a/kernel/power/power.h b/kernel/power/power.h index 8499a39c62f4..4e03046b9c4d 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -327,3 +327,5 @@ static inline void pm_sleep_enable_secondary_cpus(void) suspend_enable_secondary_cpus(); cpuidle_resume(); } + +void dpm_save_errno(int err); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 07bde5bba49e..742eb26618cc 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -616,12 +616,7 @@ int pm_suspend(suspend_state_t state) pr_info("suspend entry (%s)\n", mem_sleep_labels[state]); error = enter_state(state); - 
if (error) { - suspend_stats.fail++; - dpm_save_failed_errno(error); - } else { - suspend_stats.success++; - } + dpm_save_errno(error); pr_info("suspend exit\n"); return error; } -- cgit v1.2.3 From 4add3e72f0fca0c66a8c9de8f58997eb9a3e3320 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Jan 2024 17:24:04 +0100 Subject: PM: sleep: stats: Call dpm_save_failed_step() at most once per phase If the handling of two or more devices fails in one suspend-resume phase, it should be counted once in the statistics which is not guaranteed to happen during system-wide resume of devices due to the possible asynchronous execution of device callbacks. Address this by using the async_error static variable during system-wide device resume to indicate that there has been a device resume error and the given suspend-resume phase should be counted as failing. Signed-off-by: Rafael J. Wysocki Reviewed-by: Stanislaw Gruszka Reviewed-by: Ulf Hansson --- drivers/base/power/main.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 761b03b4edb1..e75769af5e3e 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -685,7 +685,7 @@ Out: TRACE_RESUME(error); if (error) { - dpm_save_failed_step(SUSPEND_RESUME_NOIRQ); + async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); } @@ -705,6 +705,9 @@ static void dpm_noirq_resume_devices(pm_message_t state) ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, true); + + async_error = 0; + mutex_lock(&dpm_list_mtx); pm_transition = state; @@ -734,6 +737,9 @@ static void dpm_noirq_resume_devices(pm_message_t state) mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "noirq"); + if (async_error) + dpm_save_failed_step(SUSPEND_RESUME_NOIRQ); + trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false); } @@ -815,7 +821,7 @@ Out: complete_all(&dev->power.completion); if (error) { - dpm_save_failed_step(SUSPEND_RESUME_EARLY); + async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async early" : " early", error); } @@ -839,6 +845,9 @@ void dpm_resume_early(pm_message_t state) ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume_early"), state.event, true); + + async_error = 0; + mutex_lock(&dpm_list_mtx); pm_transition = state; @@ -868,6 +877,9 @@ void dpm_resume_early(pm_message_t state) mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "early"); + if (async_error) + dpm_save_failed_step(SUSPEND_RESUME_EARLY); + trace_suspend_resume(TPS("dpm_resume_early"), state.event, false); } @@ -971,7 +983,7 @@ static void device_resume(struct device *dev, pm_message_t state, bool async) TRACE_RESUME(error); if (error) { - dpm_save_failed_step(SUSPEND_RESUME); + async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async" : "", error); } @@ -1030,6 +1042,8 @@ void dpm_resume(pm_message_t state) mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, NULL); + if (async_error) + dpm_save_failed_step(SUSPEND_RESUME); cpufreq_resume(); devfreq_resume(); -- cgit v1.2.3 From ac6f87aaa26fa88680db23051a5d2b13a08d13b8 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Mon, 29 Jan 2024 17:25:48 +0100 Subject: PM: sleep: stats: Log errors right after running suspend callbacks The error logging and failure statistics updates are carried out in two places in each system-wide device suspend phase, which is unnecessary code duplication, so do that in one place in each phase, right after invoking device suspend callbacks. While at it, add "noirq" or "late" to the "async" string printed when the failing device callback in the "noirq" or "late" suspend phase, respectively, was run asynchronously. Signed-off-by: Rafael J. Wysocki Reviewed-by: Stanislaw Gruszka Reviewed-by: Ulf Hansson --- drivers/base/power/main.c | 49 +++++++++++++---------------------------------- 1 file changed, 13 insertions(+), 36 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index e75769af5e3e..892d706af353 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1244,6 +1244,8 @@ Run: error = dpm_run_callback(callback, dev, state, info); if (error) { async_error = error; + dpm_save_failed_dev(dev_name(dev)); + pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); goto Complete; } @@ -1273,14 +1275,8 @@ Complete: static void async_suspend_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; - int error; - - error = __device_suspend_noirq(dev, pm_transition, true); - if (error) { - dpm_save_failed_dev(dev_name(dev)); - pm_dev_err(dev, pm_transition, " async", error); - } + __device_suspend_noirq(dev, pm_transition, true); put_device(dev); } @@ -1312,12 +1308,8 @@ static int dpm_noirq_suspend_devices(pm_message_t state) mutex_lock(&dpm_list_mtx); - if (error) { - pm_dev_err(dev, state, " noirq", error); - dpm_save_failed_dev(dev_name(dev)); - } else if (!list_empty(&dev->power.entry)) { + if (!error && !list_empty(&dev->power.entry)) list_move(&dev->power.entry, &dpm_noirq_list); - } mutex_unlock(&dpm_list_mtx); @@ -1437,6 +1429,8 @@ Run: error = dpm_run_callback(callback, dev, state, info); if (error) { async_error = error; + dpm_save_failed_dev(dev_name(dev)); + pm_dev_err(dev, state, async ? " async late" : " late", error); goto Complete; } dpm_propagate_wakeup_to_parent(dev); @@ -1453,13 +1447,8 @@ Complete: static void async_suspend_late(void *data, async_cookie_t cookie) { struct device *dev = data; - int error; - error = __device_suspend_late(dev, pm_transition, true); - if (error) { - dpm_save_failed_dev(dev_name(dev)); - pm_dev_err(dev, pm_transition, " async", error); - } + __device_suspend_late(dev, pm_transition, true); put_device(dev); } @@ -1500,11 +1489,6 @@ int dpm_suspend_late(pm_message_t state) if (!list_empty(&dev->power.entry)) list_move(&dev->power.entry, &dpm_late_early_list); - if (error) { - pm_dev_err(dev, state, " late", error); - dpm_save_failed_dev(dev_name(dev)); - } - mutex_unlock(&dpm_list_mtx); put_device(dev); @@ -1719,8 +1703,11 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) dpm_watchdog_clear(&wd); Complete: - if (error) + if (error) { async_error = error; + dpm_save_failed_dev(dev_name(dev)); + pm_dev_err(dev, state, async ? 
" async" : "", error); + } complete_all(&dev->power.completion); TRACE_SUSPEND(error); @@ -1730,14 +1717,8 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) static void async_suspend(void *data, async_cookie_t cookie) { struct device *dev = data; - int error; - - error = __device_suspend(dev, pm_transition, true); - if (error) { - dpm_save_failed_dev(dev_name(dev)); - pm_dev_err(dev, pm_transition, " async", error); - } + __device_suspend(dev, pm_transition, true); put_device(dev); } @@ -1778,12 +1759,8 @@ int dpm_suspend(pm_message_t state) mutex_lock(&dpm_list_mtx); - if (error) { - pm_dev_err(dev, state, "", error); - dpm_save_failed_dev(dev_name(dev)); - } else if (!list_empty(&dev->power.entry)) { + if (!error && !list_empty(&dev->power.entry)) list_move(&dev->power.entry, &dpm_suspended_list); - } mutex_unlock(&dpm_list_mtx); -- cgit v1.2.3 From a4b64b893428734fc872a162ca7d92aa1ca1c429 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Jan 2024 17:27:33 +0100 Subject: PM: sleep: Move some assignments from under a lock The async_error and pm_transition variables are set under dpm_list_mtx in multiple places in the system-wide device PM core code, which is unnecessary and confusing, so rearrange the code so that the variables in question are set before acquiring the lock. While at it, add some empty code lines around locking to improve the consistency of the code. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Stanislaw Gruszka Reviewed-by: Ulf Hansson --- drivers/base/power/main.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 892d706af353..89a3ddd642c9 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -707,9 +707,9 @@ static void dpm_noirq_resume_devices(pm_message_t state) trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, true); async_error = 0; + pm_transition = state; mutex_lock(&dpm_list_mtx); - pm_transition = state; /* * Trigger the resume of "async" devices upfront so they don't have to @@ -847,9 +847,9 @@ void dpm_resume_early(pm_message_t state) trace_suspend_resume(TPS("dpm_resume_early"), state.event, true); async_error = 0; + pm_transition = state; mutex_lock(&dpm_list_mtx); - pm_transition = state; /* * Trigger the resume of "async" devices upfront so they don't have to @@ -1012,10 +1012,11 @@ void dpm_resume(pm_message_t state) trace_suspend_resume(TPS("dpm_resume"), state.event, true); might_sleep(); - mutex_lock(&dpm_list_mtx); pm_transition = state; async_error = 0; + mutex_lock(&dpm_list_mtx); + /* * Trigger the resume of "async" devices upfront so they don't have to * wait for the "non-async" ones they don't depend on. 
@@ -1294,10 +1295,12 @@ static int dpm_noirq_suspend_devices(pm_message_t state) int error = 0; trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, true); - mutex_lock(&dpm_list_mtx); + pm_transition = state; async_error = 0; + mutex_lock(&dpm_list_mtx); + while (!list_empty(&dpm_late_early_list)) { struct device *dev = to_device(dpm_late_early_list.prev); @@ -1320,7 +1323,9 @@ static int dpm_noirq_suspend_devices(pm_message_t state) if (error || async_error) break; } + mutex_unlock(&dpm_list_mtx); + async_synchronize_full(); if (!error) error = async_error; @@ -1470,11 +1475,14 @@ int dpm_suspend_late(pm_message_t state) int error = 0; trace_suspend_resume(TPS("dpm_suspend_late"), state.event, true); - wake_up_all_idle_cpus(); - mutex_lock(&dpm_list_mtx); + pm_transition = state; async_error = 0; + wake_up_all_idle_cpus(); + + mutex_lock(&dpm_list_mtx); + while (!list_empty(&dpm_suspended_list)) { struct device *dev = to_device(dpm_suspended_list.prev); @@ -1498,7 +1506,9 @@ int dpm_suspend_late(pm_message_t state) if (error || async_error) break; } + mutex_unlock(&dpm_list_mtx); + async_synchronize_full(); if (!error) error = async_error; @@ -1745,9 +1755,11 @@ int dpm_suspend(pm_message_t state) devfreq_suspend(); cpufreq_suspend(); - mutex_lock(&dpm_list_mtx); pm_transition = state; async_error = 0; + + mutex_lock(&dpm_list_mtx); + while (!list_empty(&dpm_prepared_list)) { struct device *dev = to_device(dpm_prepared_list.prev); @@ -1771,7 +1783,9 @@ int dpm_suspend(pm_message_t state) if (error || async_error) break; } + mutex_unlock(&dpm_list_mtx); + async_synchronize_full(); if (!error) error = async_error; -- cgit v1.2.3 From 96db0f947a14df754586ac38bc0d50ab8dae013c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Jan 2024 17:28:37 +0100 Subject: PM: sleep: Move devices to new lists earlier in each suspend phase During a system-wide suspend of devices, dpm_noirq_suspend_devices(), dpm_suspend_late() and dpm_suspend() move devices from one list to another. They do it with each device after its PM callback in the given suspend phase has run or has been scheduled for asynchronous execution, in case it is deleted from the current list in the meantime. However, devices can be moved to a new list before invoking their PM callbacks (which usually is the case for the devices whose callbacks are executed asynchronously anyway), because doing so does not affect the ordering of that list. In either case, each device is moved to the new list after the previous device has been moved to it or gone away, and if a device is removed, it does not matter which list it is in at that point, because deleting an entry from a list does not change the ordering of the other entries in it. Accordingly, modify the functions mentioned above to move devices to new lists without waiting for their PM callbacks to run regardless of whether or not they run asynchronously. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Stanislaw Gruszka Reviewed-by: Ulf Hansson --- drivers/base/power/main.c | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 89a3ddd642c9..bcd043c1d385 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1304,18 +1304,12 @@ static int dpm_noirq_suspend_devices(pm_message_t state) while (!list_empty(&dpm_late_early_list)) { struct device *dev = to_device(dpm_late_early_list.prev); + list_move(&dev->power.entry, &dpm_noirq_list); get_device(dev); mutex_unlock(&dpm_list_mtx); error = device_suspend_noirq(dev); - mutex_lock(&dpm_list_mtx); - - if (!error && !list_empty(&dev->power.entry)) - list_move(&dev->power.entry, &dpm_noirq_list); - - mutex_unlock(&dpm_list_mtx); - put_device(dev); mutex_lock(&dpm_list_mtx); @@ -1486,19 +1480,13 @@ int dpm_suspend_late(pm_message_t state) while (!list_empty(&dpm_suspended_list)) { struct device *dev = to_device(dpm_suspended_list.prev); + list_move(&dev->power.entry, &dpm_late_early_list); get_device(dev); mutex_unlock(&dpm_list_mtx); error = device_suspend_late(dev); - mutex_lock(&dpm_list_mtx); - - if (!list_empty(&dev->power.entry)) - list_move(&dev->power.entry, &dpm_late_early_list); - - mutex_unlock(&dpm_list_mtx); - put_device(dev); mutex_lock(&dpm_list_mtx); @@ -1763,19 +1751,13 @@ int dpm_suspend(pm_message_t state) while (!list_empty(&dpm_prepared_list)) { struct device *dev = to_device(dpm_prepared_list.prev); + list_move(&dev->power.entry, &dpm_suspended_list); get_device(dev); mutex_unlock(&dpm_list_mtx); error = device_suspend(dev); - mutex_lock(&dpm_list_mtx); - - if (!error && !list_empty(&dev->power.entry)) - list_move(&dev->power.entry, &dpm_suspended_list); - - mutex_unlock(&dpm_list_mtx); - put_device(dev); mutex_lock(&dpm_list_mtx); -- cgit v1.2.3 From 86205785443bd6eff65152a24501ca58f301ab41 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Jan 2024 17:29:41 +0100 Subject: PM: sleep: Call dpm_async_fn() directly in each suspend phase Simplify the system-wide suspend of devices by invoking dpm_async_fn() directly from the main loop in each suspend phase instead of using an additional wrapper function for running it. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Stanislaw Gruszka Reviewed-by: Ulf Hansson --- drivers/base/power/main.c | 61 +++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 36 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index bcd043c1d385..5679f966f676 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1192,7 +1192,7 @@ static void dpm_superior_set_must_resume(struct device *dev) } /** - * __device_suspend_noirq - Execute a "noirq suspend" callback for given device. + * device_suspend_noirq - Execute a "noirq suspend" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. @@ -1200,7 +1200,7 @@ static void dpm_superior_set_must_resume(struct device *dev) * The driver of @dev will not receive interrupts while this function is being * executed. 
*/ -static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool async) +static int device_suspend_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; @@ -1277,18 +1277,10 @@ static void async_suspend_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; - __device_suspend_noirq(dev, pm_transition, true); + device_suspend_noirq(dev, pm_transition, true); put_device(dev); } -static int device_suspend_noirq(struct device *dev) -{ - if (dpm_async_fn(dev, async_suspend_noirq)) - return 0; - - return __device_suspend_noirq(dev, pm_transition, false); -} - static int dpm_noirq_suspend_devices(pm_message_t state) { ktime_t starttime = ktime_get(); @@ -1305,10 +1297,15 @@ static int dpm_noirq_suspend_devices(pm_message_t state) struct device *dev = to_device(dpm_late_early_list.prev); list_move(&dev->power.entry, &dpm_noirq_list); + + if (dpm_async_fn(dev, async_suspend_noirq)) + continue; + get_device(dev); + mutex_unlock(&dpm_list_mtx); - error = device_suspend_noirq(dev); + error = device_suspend_noirq(dev, state, false); put_device(dev); @@ -1369,14 +1366,14 @@ static void dpm_propagate_wakeup_to_parent(struct device *dev) } /** - * __device_suspend_late - Execute a "late suspend" callback for given device. + * device_suspend_late - Execute a "late suspend" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. * * Runtime PM is disabled for @dev while this function is being executed. */ -static int __device_suspend_late(struct device *dev, pm_message_t state, bool async) +static int device_suspend_late(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; @@ -1447,18 +1444,10 @@ static void async_suspend_late(void *data, async_cookie_t cookie) { struct device *dev = data; - __device_suspend_late(dev, pm_transition, true); + device_suspend_late(dev, pm_transition, true); put_device(dev); } -static int device_suspend_late(struct device *dev) -{ - if (dpm_async_fn(dev, async_suspend_late)) - return 0; - - return __device_suspend_late(dev, pm_transition, false); -} - /** * dpm_suspend_late - Execute "late suspend" callbacks for all devices. * @state: PM transition of the system being carried out. @@ -1481,11 +1470,15 @@ int dpm_suspend_late(pm_message_t state) struct device *dev = to_device(dpm_suspended_list.prev); list_move(&dev->power.entry, &dpm_late_early_list); + + if (dpm_async_fn(dev, async_suspend_late)) + continue; + get_device(dev); mutex_unlock(&dpm_list_mtx); - error = device_suspend_late(dev); + error = device_suspend_late(dev, state, false); put_device(dev); @@ -1582,12 +1575,12 @@ static void dpm_clear_superiors_direct_complete(struct device *dev) } /** - * __device_suspend - Execute "suspend" callbacks for given device. + * device_suspend - Execute "suspend" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. 
*/ -static int __device_suspend(struct device *dev, pm_message_t state, bool async) +static int device_suspend(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; @@ -1716,18 +1709,10 @@ static void async_suspend(void *data, async_cookie_t cookie) { struct device *dev = data; - __device_suspend(dev, pm_transition, true); + device_suspend(dev, pm_transition, true); put_device(dev); } -static int device_suspend(struct device *dev) -{ - if (dpm_async_fn(dev, async_suspend)) - return 0; - - return __device_suspend(dev, pm_transition, false); -} - /** * dpm_suspend - Execute "suspend" callbacks for all non-sysdev devices. * @state: PM transition of the system being carried out. */ @@ -1752,11 +1737,15 @@ int dpm_suspend(pm_message_t state) struct device *dev = to_device(dpm_prepared_list.prev); list_move(&dev->power.entry, &dpm_suspended_list); + + if (dpm_async_fn(dev, async_suspend)) + continue; + get_device(dev); mutex_unlock(&dpm_list_mtx); - error = device_suspend(dev); + error = device_suspend(dev, state, false); put_device(dev); -- cgit v1.2.3 From 0ec74ad3c157bd4bcbcc8b294777733687e8cd2a Mon Sep 17 00:00:00 2001 From: Jan Dakinevich Date: Fri, 26 Jan 2024 23:08:36 +0300 Subject: regmap: rework ->max_register handling When a regmap consists of a single register, the regmap subsystem is unable to tell whether ->max_register is set, because it is equal to zero in both cases. As a result, the logic based on the value of ->max_register doesn't work; for example, using REGCACHE_FLAT fails. This patch introduces an extra regmap config parameter indicating that a zero value in ->max_register is authentic. Signed-off-by: Jan Dakinevich Link: https://lore.kernel.org/r/20240126200836.1829995-1-jan.dakinevich@salutedevices.com Signed-off-by: Mark Brown --- drivers/base/regmap/internal.h | 1 + drivers/base/regmap/regcache-flat.c | 2 +- drivers/base/regmap/regcache.c | 4 +++- drivers/base/regmap/regmap.c | 10 ++++++---- include/linux/regmap.h | 5 +++++ 5 files changed, 16 insertions(+), 6 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h index 583dd5d7d46b..bcdb25bec77c 100644 --- a/drivers/base/regmap/internal.h +++ b/drivers/base/regmap/internal.h @@ -93,6 +93,7 @@ struct regmap { #endif unsigned int max_register; + bool max_register_is_set; bool (*writeable_reg)(struct device *dev, unsigned int reg); bool (*readable_reg)(struct device *dev, unsigned int reg); bool (*volatile_reg)(struct device *dev, unsigned int reg); diff --git a/drivers/base/regmap/regcache-flat.c b/drivers/base/regmap/regcache-flat.c index b7e4b2464102..9b17c77dec9d 100644 --- a/drivers/base/regmap/regcache-flat.c +++ b/drivers/base/regmap/regcache-flat.c @@ -23,7 +23,7 @@ static int regcache_flat_init(struct regmap *map) int i; unsigned int *cache; - if (!map || map->reg_stride_order < 0 || !map->max_register) + if (!map || map->reg_stride_order < 0 || !map->max_register_is_set) return -EINVAL; map->cache = kcalloc(regcache_flat_get_index(map, map->max_register) diff --git a/drivers/base/regmap/regcache.c b/drivers/base/regmap/regcache.c index ac63a73ccdaa..2e41cb12b8e2 100644 --- a/drivers/base/regmap/regcache.c +++ b/drivers/base/regmap/regcache.c @@ -187,8 +187,10 @@ int regcache_init(struct regmap *map, const struct regmap_config *config) return 0; } - if (!map->max_register && map->num_reg_defaults_raw) + if (!map->max_register_is_set && map->num_reg_defaults_raw) { map->max_register =
(map->num_reg_defaults_raw - 1) * map->reg_stride; + map->max_register_is_set = true; + } if (map->cache_ops->init) { dev_dbg(map->dev, "Initializing %s cache\n", diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 6db77d8e45f9..5cb425f6f02d 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(regmap_check_range_table); bool regmap_writeable(struct regmap *map, unsigned int reg) { - if (map->max_register && reg > map->max_register) + if (map->max_register_is_set && reg > map->max_register) return false; if (map->writeable_reg) @@ -112,7 +112,7 @@ bool regmap_cached(struct regmap *map, unsigned int reg) if (!map->cache_ops) return false; - if (map->max_register && reg > map->max_register) + if (map->max_register_is_set && reg > map->max_register) return false; map->lock(map->lock_arg); @@ -129,7 +129,7 @@ bool regmap_readable(struct regmap *map, unsigned int reg) if (!map->reg_read) return false; - if (map->max_register && reg > map->max_register) + if (map->max_register_is_set && reg > map->max_register) return false; if (map->format.format_write) @@ -787,6 +787,7 @@ struct regmap *__regmap_init(struct device *dev, map->bus = bus; map->bus_context = bus_context; map->max_register = config->max_register; + map->max_register_is_set = map->max_register ?: config->max_register_is_0; map->wr_table = config->wr_table; map->rd_table = config->rd_table; map->volatile_table = config->volatile_table; @@ -1412,6 +1413,7 @@ int regmap_reinit_cache(struct regmap *map, const struct regmap_config *config) regmap_debugfs_exit(map); map->max_register = config->max_register; + map->max_register_is_set = map->max_register ?: config->max_register_is_0; map->writeable_reg = config->writeable_reg; map->readable_reg = config->readable_reg; map->volatile_reg = config->volatile_reg; @@ -3383,7 +3385,7 @@ EXPORT_SYMBOL_GPL(regmap_get_val_bytes); */ int regmap_get_max_register(struct regmap *map) { - return map->max_register ? map->max_register : -EINVAL; + return map->max_register_is_set ? map->max_register : -EINVAL; } EXPORT_SYMBOL_GPL(regmap_get_max_register); diff --git a/include/linux/regmap.h b/include/linux/regmap.h index c9182a47736e..b743241cfb7c 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -332,6 +332,10 @@ typedef void (*regmap_unlock)(void *); * @io_port: Support IO port accessors. Makes sense only when MMIO vs. IO port * access can be distinguished. * @max_register: Optional, specifies the maximum valid register address. + * @max_register_is_0: Optional, specifies that zero value in @max_register + * should be taken into account. This is a workaround to + * apply handling of @max_register for regmap that contains + * only one register. * @wr_table: Optional, points to a struct regmap_access_table specifying * valid ranges for write access. * @rd_table: As above, for read access. @@ -422,6 +426,7 @@ struct regmap_config { bool io_port; unsigned int max_register; + bool max_register_is_0; const struct regmap_access_table *wr_table; const struct regmap_access_table *rd_table; const struct regmap_access_table *volatile_table; -- cgit v1.2.3 From 7011b51f13b391ee06708bb1f82653a2953f8cfc Mon Sep 17 00:00:00 2001 From: Ben Wolsieffer Date: Tue, 6 Feb 2024 10:10:05 -0500 Subject: regmap: kunit: fix raw noinc write test wrapping The raw noinc write test places a known value in the register following the noinc register to verify that it is not disturbed by the noinc write. 
From 7011b51f13b391ee06708bb1f82653a2953f8cfc Mon Sep 17 00:00:00 2001
From: Ben Wolsieffer
Date: Tue, 6 Feb 2024 10:10:05 -0500
Subject: regmap: kunit: fix raw noinc write test wrapping

The raw noinc write test places a known value in the register following
the noinc register to verify that it is not disturbed by the noinc
write. This test ensures this value is distinct by adding 100 to the
second element of the noinc write data.

The regmap registers are 16-bit, while the test value is stored in an
unsigned int. Therefore, adding 100 may cause the register to wrap
while the test value does not, causing the test to fail. Fix this by
changing val_test and val_last from unsigned int to u16.

Signed-off-by: Ben Wolsieffer
Reported-by: Guenter Roeck
Closes: https://lore.kernel.org/linux-kernel/745d3a11-15bc-48b6-84c8-c8761c943bed@roeck-us.net/T/
Tested-by: Guenter Roeck
Link: https://lore.kernel.org/r/20240206151004.1636761-2-ben.wolsieffer@hefring.com
Signed-off-by: Mark Brown
---
 drivers/base/regmap/regmap-kunit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/base')

diff --git a/drivers/base/regmap/regmap-kunit.c b/drivers/base/regmap/regmap-kunit.c
index 026bdcb45127..4eb18f5d3265 100644
--- a/drivers/base/regmap/regmap-kunit.c
+++ b/drivers/base/regmap/regmap-kunit.c
@@ -1202,7 +1202,8 @@ static void raw_noinc_write(struct kunit *test)
 	struct regmap *map;
 	struct regmap_config config;
 	struct regmap_ram_data *data;
-	unsigned int val, val_test, val_last;
+	unsigned int val;
+	u16 val_test, val_last;
 	u16 val_array[BLOCK_TEST_SIZE];

 	config = raw_regmap_config;
-- cgit v1.2.3

From 2f0dbb24f78a333433a2b875c0b76bf55c119cd4 Mon Sep 17 00:00:00 2001
From: Mark Brown
Date: Sun, 11 Feb 2024 16:58:17 +0000
Subject: regmap: kunit: Ensure that changed bytes are actually different

During the cache sync test we verify that values we expect to have been
written only to the cache do not appear in the hardware. This works most
of the time, but since we randomly generate both the original and new
values there is a low probability that these values may actually be the
same.

Wrap get_random_bytes() to ensure that the values are different; there
are other tests which should have similar verification that we actually
changed something.

While we're at it, refactor the test to use three changed values rather
than attempting to use one of them twice, which just complicates
checking that our new values are actually new.

We use random generation to try to avoid data dependencies in the tests.

Reported-by: Guenter Roeck
Reviewed-by: Guenter Roeck
Tested-by: Guenter Roeck
Signed-off-by: Mark Brown
Link: https://msgid.link/r/20240211-regmap-kunit-random-change-v3-1-e387a9ea4468@kernel.org
Signed-off-by: Mark Brown
---
 drivers/base/regmap/regmap-kunit.c | 54 +++++++++++++++++++++++++++-----------
 1 file changed, 38 insertions(+), 16 deletions(-)

(limited to 'drivers/base')

diff --git a/drivers/base/regmap/regmap-kunit.c b/drivers/base/regmap/regmap-kunit.c
index 4eb18f5d3265..0d957c5f1bcc 100644
--- a/drivers/base/regmap/regmap-kunit.c
+++ b/drivers/base/regmap/regmap-kunit.c
@@ -9,6 +9,23 @@

 #define BLOCK_TEST_SIZE 12

+static void get_changed_bytes(void *orig, void *new, size_t size)
+{
+	char *o = orig;
+	char *n = new;
+	int i;
+
+	get_random_bytes(new, size);
+
+	/*
+	 * This could be nicer and more efficient but we shouldn't
+	 * super care.
+	 */
+	for (i = 0; i < size; i++)
+		while (n[i] == o[i])
+			get_random_bytes(&n[i], 1);
+}
+
 static const struct regmap_config test_regmap_config = {
 	.max_register = BLOCK_TEST_SIZE,
 	.reg_stride = 1,
@@ -1252,7 +1269,7 @@ static void raw_sync(struct kunit *test)
 	struct regmap *map;
 	struct regmap_config config;
 	struct regmap_ram_data *data;
-	u16 val[2];
+	u16 val[3];
 	u16 *hw_buf;
 	unsigned int rval;
 	int i;
@@ -1266,17 +1283,13 @@ static void raw_sync(struct kunit *test)

 	hw_buf = (u16 *)data->vals;

-	get_random_bytes(&val, sizeof(val));
+	get_changed_bytes(&hw_buf[2], &val[0], sizeof(val));

 	/* Do a regular write and a raw write in cache only mode */
 	regcache_cache_only(map, true);
-	KUNIT_EXPECT_EQ(test, 0, regmap_raw_write(map, 2, val, sizeof(val)));
-	if (config.val_format_endian == REGMAP_ENDIAN_BIG)
-		KUNIT_EXPECT_EQ(test, 0, regmap_write(map, 6,
-						      be16_to_cpu(val[0])));
-	else
-		KUNIT_EXPECT_EQ(test, 0, regmap_write(map, 6,
-						      le16_to_cpu(val[0])));
+	KUNIT_EXPECT_EQ(test, 0, regmap_raw_write(map, 2, val,
						  sizeof(u16) * 2));
+	KUNIT_EXPECT_EQ(test, 0, regmap_write(map, 4, val[2]));

 	/* We should read back the new values, and defaults for the rest */
 	for (i = 0; i < config.max_register + 1; i++) {
@@ -1285,24 +1298,34 @@ static void raw_sync(struct kunit *test)
 		switch (i) {
 		case 2:
 		case 3:
-		case 6:
 			if (config.val_format_endian == REGMAP_ENDIAN_BIG) {
 				KUNIT_EXPECT_EQ(test, rval,
-						be16_to_cpu(val[i % 2]));
+						be16_to_cpu(val[i - 2]));
 			} else {
 				KUNIT_EXPECT_EQ(test, rval,
-						le16_to_cpu(val[i % 2]));
+						le16_to_cpu(val[i - 2]));
 			}
 			break;
+		case 4:
+			KUNIT_EXPECT_EQ(test, rval, val[i - 2]);
+			break;
 		default:
 			KUNIT_EXPECT_EQ(test, config.reg_defaults[i].def, rval);
 			break;
 		}
 	}
+
+	/*
+	 * The value written via _write() was translated by the core,
+	 * translate the original copy for comparison purposes.
+	 */
+	if (config.val_format_endian == REGMAP_ENDIAN_BIG)
+		val[2] = cpu_to_be16(val[2]);
+	else
+		val[2] = cpu_to_le16(val[2]);

 	/* The values should not appear in the "hardware" */
-	KUNIT_EXPECT_MEMNEQ(test, &hw_buf[2], val, sizeof(val));
-	KUNIT_EXPECT_MEMNEQ(test, &hw_buf[6], val, sizeof(u16));
+	KUNIT_EXPECT_MEMNEQ(test, &hw_buf[2], &val[0], sizeof(val));

 	for (i = 0; i < config.max_register + 1; i++)
 		data->written[i] = false;
@@ -1313,8 +1336,7 @@ static void raw_sync(struct kunit *test)
 	KUNIT_EXPECT_EQ(test, 0, regcache_sync(map));

 	/* The values should now appear in the "hardware" */
-	KUNIT_EXPECT_MEMEQ(test, &hw_buf[2], val, sizeof(val));
-	KUNIT_EXPECT_MEMEQ(test, &hw_buf[6], val, sizeof(u16));
+	KUNIT_EXPECT_MEMEQ(test, &hw_buf[2], &val[0], sizeof(val));

 	regmap_exit(map);
 }
-- cgit v1.2.3

From c0ef3df8dbaef51ee4cfd58a471adf2eaee6f6b3 Mon Sep 17 00:00:00 2001
From: Sakari Ailus
Date: Tue, 30 Jan 2024 13:28:05 +0200
Subject: PM: runtime: Simplify pm_runtime_get_if_active() usage

There are two ways to opportunistically increment a device's runtime PM
usage count, calling either pm_runtime_get_if_active() or
pm_runtime_get_if_in_use(). The former has an argument to tell whether
to ignore the usage count or not, and the latter simply calls the former
with ign_usage_count set to false. The other users that want to ignore
the usage_count will have to explicitly set that argument to true, which
is a bit cumbersome.

To make this function more practical to use, remove the ign_usage_count
argument from the function. The main implementation is in a static
function called pm_runtime_get_conditional() and implementations of
pm_runtime_get_if_active() and pm_runtime_get_if_in_use() are moved to
runtime.c.
Signed-off-by: Sakari Ailus Reviewed-by: Alex Elder Reviewed-by: Laurent Pinchart Acked-by: Takashi Iwai # sound/ Reviewed-by: Jacek Lawrynowicz # drivers/accel/ivpu/ Acked-by: Rodrigo Vivi # drivers/gpu/drm/i915/ Reviewed-by: Rodrigo Vivi Acked-by: Bjorn Helgaas # drivers/pci/ Signed-off-by: Rafael J. Wysocki --- Documentation/power/runtime_pm.rst | 5 ++--- drivers/accel/ivpu/ivpu_pm.c | 2 +- drivers/base/power/runtime.c | 35 +++++++++++++++++++++++++++++++-- drivers/gpu/drm/i915/intel_runtime_pm.c | 5 ++++- drivers/gpu/drm/xe/xe_pm.c | 2 +- drivers/media/i2c/ccs/ccs-core.c | 2 +- drivers/media/i2c/ov64a40.c | 2 +- drivers/media/i2c/thp7312.c | 2 +- drivers/net/ipa/ipa_smp2p.c | 2 +- drivers/pci/pci.c | 2 +- include/linux/pm_runtime.h | 18 +++-------------- sound/hda/hdac_device.c | 2 +- 12 files changed, 50 insertions(+), 29 deletions(-) (limited to 'drivers/base') diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst index 65b86e487afe..da99379071a4 100644 --- a/Documentation/power/runtime_pm.rst +++ b/Documentation/power/runtime_pm.rst @@ -396,10 +396,9 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: nonzero, increment the counter and return 1; otherwise return 0 without changing the counter - `int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count);` + `int pm_runtime_get_if_active(struct device *dev);` - return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the - runtime PM status is RPM_ACTIVE, and either ign_usage_count is true - or the device's usage_count is non-zero, increment the counter and + runtime PM status is RPM_ACTIVE, increment the counter and return 1; otherwise return 0 without changing the counter `void pm_runtime_put_noidle(struct device *dev);` diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index f501f27ebafd..981777c93cf5 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -304,7 +304,7 @@ int ivpu_rpm_get_if_active(struct ivpu_device *vdev) { int ret; - ret = pm_runtime_get_if_active(vdev->drm.dev, false); + ret = pm_runtime_get_if_in_use(vdev->drm.dev); drm_WARN_ON(&vdev->drm, ret < 0); return ret; diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 05793c9fbb84..5275a6b2e980 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1176,7 +1176,7 @@ int __pm_runtime_resume(struct device *dev, int rpmflags) EXPORT_SYMBOL_GPL(__pm_runtime_resume); /** - * pm_runtime_get_if_active - Conditionally bump up device usage counter. + * pm_runtime_get_conditional - Conditionally bump up device usage counter. * @dev: Device to handle. * @ign_usage_count: Whether or not to look at the current usage counter value. * @@ -1197,7 +1197,7 @@ EXPORT_SYMBOL_GPL(__pm_runtime_resume); * The caller is responsible for decrementing the runtime PM usage counter of * @dev after this function has returned a positive value for it. */ -int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count) +static int pm_runtime_get_conditional(struct device *dev, bool ign_usage_count) { unsigned long flags; int retval; @@ -1218,8 +1218,39 @@ int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count) return retval; } + +/** + * pm_runtime_get_if_active - Bump up runtime PM usage counter if the device is + * in active state + * @dev: Target device. + * + * Increment the runtime PM usage counter of @dev if its runtime PM status is + * %RPM_ACTIVE, in which case it returns 1. 
If the device is in a different + * state, 0 is returned. -EINVAL is returned if runtime PM is disabled for the + * device, in which case also the usage_count will remain unmodified. + */ +int pm_runtime_get_if_active(struct device *dev) +{ + return pm_runtime_get_conditional(dev, true); +} EXPORT_SYMBOL_GPL(pm_runtime_get_if_active); +/** + * pm_runtime_get_if_in_use - Conditionally bump up runtime PM usage counter. + * @dev: Target device. + * + * Increment the runtime PM usage counter of @dev if its runtime PM status is + * %RPM_ACTIVE and its runtime PM usage counter is greater than 0, in which case + * it returns 1. If the device is in a different state or its usage_count is 0, + * 0 is returned. -EINVAL is returned if runtime PM is disabled for the device, + * in which case also the usage_count will remain unmodified. + */ +int pm_runtime_get_if_in_use(struct device *dev) +{ + return pm_runtime_get_conditional(dev, false); +} +EXPORT_SYMBOL_GPL(pm_runtime_get_if_in_use); + /** * __pm_runtime_set_status - Set runtime PM status of a device. * @dev: Device to handle. diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c index 860b51b56a92..d4e844128826 100644 --- a/drivers/gpu/drm/i915/intel_runtime_pm.c +++ b/drivers/gpu/drm/i915/intel_runtime_pm.c @@ -246,7 +246,10 @@ static intel_wakeref_t __intel_runtime_pm_get_if_active(struct intel_runtime_pm * function, since the power state is undefined. This applies * atm to the late/early system suspend/resume handlers. */ - if (pm_runtime_get_if_active(rpm->kdev, ignore_usecount) <= 0) + if ((ignore_usecount && + pm_runtime_get_if_active(rpm->kdev) <= 0) || + (!ignore_usecount && + pm_runtime_get_if_in_use(rpm->kdev) <= 0)) return 0; } diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c index b429c2876a76..dd110058bf74 100644 --- a/drivers/gpu/drm/xe/xe_pm.c +++ b/drivers/gpu/drm/xe/xe_pm.c @@ -330,7 +330,7 @@ int xe_pm_runtime_put(struct xe_device *xe) int xe_pm_runtime_get_if_active(struct xe_device *xe) { - return pm_runtime_get_if_active(xe->drm.dev, true); + return pm_runtime_get_if_active(xe->drm.dev); } void xe_pm_assert_unbounded_bridge(struct xe_device *xe) diff --git a/drivers/media/i2c/ccs/ccs-core.c b/drivers/media/i2c/ccs/ccs-core.c index e21287d50c15..e1ae0f9fad43 100644 --- a/drivers/media/i2c/ccs/ccs-core.c +++ b/drivers/media/i2c/ccs/ccs-core.c @@ -674,7 +674,7 @@ static int ccs_set_ctrl(struct v4l2_ctrl *ctrl) break; } - pm_status = pm_runtime_get_if_active(&client->dev, true); + pm_status = pm_runtime_get_if_active(&client->dev); if (!pm_status) return 0; diff --git a/drivers/media/i2c/ov64a40.c b/drivers/media/i2c/ov64a40.c index 4fba4c2cb064..541bf74581d2 100644 --- a/drivers/media/i2c/ov64a40.c +++ b/drivers/media/i2c/ov64a40.c @@ -3287,7 +3287,7 @@ static int ov64a40_set_ctrl(struct v4l2_ctrl *ctrl) exp_max, 1, exp_val); } - pm_status = pm_runtime_get_if_active(ov64a40->dev, true); + pm_status = pm_runtime_get_if_active(ov64a40->dev); if (!pm_status) return 0; diff --git a/drivers/media/i2c/thp7312.c b/drivers/media/i2c/thp7312.c index 2806887514dc..19bd923a7315 100644 --- a/drivers/media/i2c/thp7312.c +++ b/drivers/media/i2c/thp7312.c @@ -1052,7 +1052,7 @@ static int thp7312_s_ctrl(struct v4l2_ctrl *ctrl) if (ctrl->flags & V4L2_CTRL_FLAG_INACTIVE) return -EINVAL; - if (!pm_runtime_get_if_active(thp7312->dev, true)) + if (!pm_runtime_get_if_active(thp7312->dev)) return 0; switch (ctrl->id) { diff --git a/drivers/net/ipa/ipa_smp2p.c b/drivers/net/ipa/ipa_smp2p.c 
index 5620dc271fac..cbf3d4761ce3 100644
--- a/drivers/net/ipa/ipa_smp2p.c
+++ b/drivers/net/ipa/ipa_smp2p.c
@@ -92,7 +92,7 @@ static void ipa_smp2p_notify(struct ipa_smp2p *smp2p)
 		return;

 	dev = &smp2p->ipa->pdev->dev;
-	smp2p->power_on = pm_runtime_get_if_active(dev, true) > 0;
+	smp2p->power_on = pm_runtime_get_if_active(dev) > 0;

 	/* Signal whether the IPA power is enabled */
 	mask = BIT(smp2p->enabled_bit);
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 9ab9b1008d8b..cb51c4079013 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2536,7 +2536,7 @@ static void pci_pme_list_scan(struct work_struct *work)
 			 * If the device is in a low power state it
 			 * should not be polled either.
 			 */
-			pm_status = pm_runtime_get_if_active(dev, true);
+			pm_status = pm_runtime_get_if_active(dev);
 			if (!pm_status)
 				continue;

diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 7c9b35448563..436baa167498 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -72,7 +72,8 @@ extern int pm_runtime_force_resume(struct device *dev);
 extern int __pm_runtime_idle(struct device *dev, int rpmflags);
 extern int __pm_runtime_suspend(struct device *dev, int rpmflags);
 extern int __pm_runtime_resume(struct device *dev, int rpmflags);
-extern int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count);
+extern int pm_runtime_get_if_active(struct device *dev);
+extern int pm_runtime_get_if_in_use(struct device *dev);
 extern int pm_schedule_suspend(struct device *dev, unsigned int delay);
 extern int __pm_runtime_set_status(struct device *dev, unsigned int status);
 extern int pm_runtime_barrier(struct device *dev);
@@ -94,18 +95,6 @@ extern void pm_runtime_release_supplier(struct device_link *link);

 extern int devm_pm_runtime_enable(struct device *dev);

-/**
- * pm_runtime_get_if_in_use - Conditionally bump up runtime PM usage counter.
- * @dev: Target device.
- *
- * Increment the runtime PM usage counter of @dev if its runtime PM status is
- * %RPM_ACTIVE and its runtime PM usage counter is greater than 0.
- */
-static inline int pm_runtime_get_if_in_use(struct device *dev)
-{
-	return pm_runtime_get_if_active(dev, false);
-}
-
 /**
 * pm_suspend_ignore_children - Set runtime PM behavior regarding children.
 * @dev: Target device.
@@ -275,8 +264,7 @@ static inline int pm_runtime_get_if_in_use(struct device *dev)
 {
 	return -EINVAL;
 }
-static inline int pm_runtime_get_if_active(struct device *dev,
-					   bool ign_usage_count)
+static inline int pm_runtime_get_if_active(struct device *dev)
 {
 	return -EINVAL;
 }
diff --git a/sound/hda/hdac_device.c b/sound/hda/hdac_device.c
index 7f7b67fe1b65..068c16e52dff 100644
--- a/sound/hda/hdac_device.c
+++ b/sound/hda/hdac_device.c
@@ -612,7 +612,7 @@ EXPORT_SYMBOL_GPL(snd_hdac_power_up_pm);
 int snd_hdac_keep_power_up(struct hdac_device *codec)
 {
 	if (!atomic_inc_not_zero(&codec->in_pm)) {
-		int ret = pm_runtime_get_if_active(&codec->dev, true);
+		int ret = pm_runtime_get_if_active(&codec->dev);
 		if (!ret)
 			return -1;
 		if (ret < 0)
-- cgit v1.2.3
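The simplified calling convention is worth a short usage sketch. pm_runtime_get_if_active() and pm_runtime_put() are the real interfaces; the surrounding driver code is invented for illustration:

	#include <linux/pm_runtime.h>

	static int example_poll_registers(struct device *dev)
	{
		int ret;

		/* Touch the hardware only if it is already RPM_ACTIVE. */
		ret = pm_runtime_get_if_active(dev);
		if (ret <= 0)
			return ret;	/* 0: not active; -EINVAL: runtime PM disabled */

		/* ... safely access the device here ... */

		pm_runtime_put(dev);	/* drop the reference taken above */
		return 0;
	}

pm_runtime_get_if_in_use() is called the same way when the caller additionally requires the usage count to be nonzero.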
From 161e16a5e50a82d219b3df3ce32286b0a2ae08bd Mon Sep 17 00:00:00 2001
From: Ulf Hansson
Date: Tue, 30 Jan 2024 13:39:47 +0100
Subject: PM: domains: Add helper functions to attach/detach multiple PM domains

Attaching/detaching of a device to multiple PM domains has started to
become a common operation for many drivers, typically during ->probe()
and ->remove(). In most cases, this has led to lots of boilerplate code
in the drivers.

To fix up the situation, let's introduce a pair of helper functions,
dev_pm_domain_attach|detach_list(), that drivers can use instead of the
open-coding.

Note that it seems reasonable to limit the support for these helpers to
DT based platforms, as it's the only valid use case for now.

Suggested-by: Daniel Baluta
Tested-by: Bryan O'Donoghue
Tested-by: Iuliana Prodan
Signed-off-by: Ulf Hansson
Link: https://lore.kernel.org/r/20240130123951.236243-2-ulf.hansson@linaro.org
---
 drivers/base/power/common.c | 134 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pm_domain.h   |  38 +++++++++++++
 2 files changed, 172 insertions(+)

(limited to 'drivers/base')

diff --git a/drivers/base/power/common.c b/drivers/base/power/common.c
index 44ec20918a4d..327d168dd37a 100644
--- a/drivers/base/power/common.c
+++ b/drivers/base/power/common.c
@@ -167,6 +167,115 @@ struct device *dev_pm_domain_attach_by_name(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(dev_pm_domain_attach_by_name);

+/**
+ * dev_pm_domain_attach_list - Associate a device with its PM domains.
+ * @dev: The device used to lookup the PM domains for.
+ * @data: The data used for attaching to the PM domains.
+ * @list: An out-parameter with an allocated list of attached PM domains.
+ *
+ * This function helps to attach a device to its multiple PM domains. The
+ * caller, which is typically a driver's probe function, may provide a list of
+ * names for the PM domains that we should try to attach the device to, but it
+ * may also provide an empty list, in case the attach should be done for all of
+ * the available PM domains.
+ *
+ * Callers must ensure proper synchronization of this function with power
+ * management callbacks.
+ *
+ * Returns the number of attached PM domains or a negative error code in case of
+ * a failure. Note that, to detach the list of PM domains, the driver shall call
+ * dev_pm_domain_detach_list(), typically during the remove phase.
+ */
+int dev_pm_domain_attach_list(struct device *dev,
+			      const struct dev_pm_domain_attach_data *data,
+			      struct dev_pm_domain_list **list)
+{
+	struct device_node *np = dev->of_node;
+	struct dev_pm_domain_list *pds;
+	struct device *pd_dev = NULL;
+	int ret, i, num_pds = 0;
+	bool by_id = true;
+	u32 pd_flags = data ? data->pd_flags : 0;
+	u32 link_flags = pd_flags & PD_FLAG_NO_DEV_LINK ? 0 :
+			DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME;
+
+	if (dev->pm_domain)
+		return -EEXIST;
+
+	/* For now this is limited to OF based platforms. */
+	if (!np)
+		return 0;
+
+	if (data && data->pd_names) {
+		num_pds = data->num_pd_names;
+		by_id = false;
+	} else {
+		num_pds = of_count_phandle_with_args(np, "power-domains",
+						     "#power-domain-cells");
+	}
+
+	if (num_pds <= 0)
+		return 0;
+
+	pds = devm_kzalloc(dev, sizeof(*pds), GFP_KERNEL);
+	if (!pds)
+		return -ENOMEM;
+
+	pds->pd_devs = devm_kcalloc(dev, num_pds, sizeof(*pds->pd_devs),
+				    GFP_KERNEL);
+	if (!pds->pd_devs)
+		return -ENOMEM;
+
+	pds->pd_links = devm_kcalloc(dev, num_pds, sizeof(*pds->pd_links),
+				     GFP_KERNEL);
+	if (!pds->pd_links)
+		return -ENOMEM;
+
+	if (link_flags && pd_flags & PD_FLAG_DEV_LINK_ON)
+		link_flags |= DL_FLAG_RPM_ACTIVE;
+
+	for (i = 0; i < num_pds; i++) {
+		if (by_id)
+			pd_dev = dev_pm_domain_attach_by_id(dev, i);
+		else
+			pd_dev = dev_pm_domain_attach_by_name(dev,
+							data->pd_names[i]);
+		if (IS_ERR_OR_NULL(pd_dev)) {
+			ret = pd_dev ? PTR_ERR(pd_dev) : -ENODEV;
+			goto err_attach;
+		}
+
+		if (link_flags) {
+			struct device_link *link;
+
+			link = device_link_add(dev, pd_dev, link_flags);
+			if (!link) {
+				ret = -ENODEV;
+				goto err_link;
+			}
+
+			pds->pd_links[i] = link;
+		}
+
+		pds->pd_devs[i] = pd_dev;
+	}
+
+	pds->num_pds = num_pds;
+	*list = pds;
+	return num_pds;
+
+err_link:
+	dev_pm_domain_detach(pd_dev, true);
+err_attach:
+	while (--i >= 0) {
+		if (pds->pd_links[i])
+			device_link_del(pds->pd_links[i]);
+		dev_pm_domain_detach(pds->pd_devs[i], true);
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_domain_attach_list);
+
 /**
 * dev_pm_domain_detach - Detach a device from its PM domain.
 * @dev: Device to detach.
@@ -187,6 +296,31 @@ void dev_pm_domain_detach(struct device *dev, bool power_off)
 }
 EXPORT_SYMBOL_GPL(dev_pm_domain_detach);

+/**
+ * dev_pm_domain_detach_list - Detach a list of PM domains.
+ * @list: The list of PM domains to detach.
+ *
+ * This function reverses the actions from dev_pm_domain_attach_list().
+ * Typically it should be invoked during the remove phase from drivers.
+ *
+ * Callers must ensure proper synchronization of this function with power
+ * management callbacks.
+ */
+void dev_pm_domain_detach_list(struct dev_pm_domain_list *list)
+{
+	int i;
+
+	if (!list)
+		return;
+
+	for (i = 0; i < list->num_pds; i++) {
+		if (list->pd_links[i])
+			device_link_del(list->pd_links[i]);
+		dev_pm_domain_detach(list->pd_devs[i], true);
+	}
+}
+EXPORT_SYMBOL_GPL(dev_pm_domain_detach_list);
+
 /**
 * dev_pm_domain_start - Start the device through its PM domain.
 * @dev: Device to start.
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index b97c5e9820f9..fb91770ba4ba 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -19,6 +19,33 @@
 #include
 #include

+/*
+ * Flags to control the behaviour when attaching a device to its PM domains.
+ *
+ * PD_FLAG_NO_DEV_LINK:		As the default behaviour creates a device-link
+ *				for every PM domain that gets attached, this
+ *				flag can be used to skip that.
+ *
+ * PD_FLAG_DEV_LINK_ON:		Add the DL_FLAG_RPM_ACTIVE to power-on the
+ *				supplier and its PM domain when creating the
+ *				device-links.
+ *
+ */
+#define PD_FLAG_NO_DEV_LINK	BIT(0)
+#define PD_FLAG_DEV_LINK_ON	BIT(1)
+
+struct dev_pm_domain_attach_data {
+	const char * const *pd_names;
+	const u32 num_pd_names;
+	const u32 pd_flags;
+};
+
+struct dev_pm_domain_list {
+	struct device **pd_devs;
+	struct device_link **pd_links;
+	u32 num_pds;
+};
+
 /*
 * Flags to control the behaviour of a genpd.
 *
@@ -420,7 +447,11 @@ struct device *dev_pm_domain_attach_by_id(struct device *dev,
					  unsigned int index);
 struct device *dev_pm_domain_attach_by_name(struct device *dev,
					    const char *name);
+int dev_pm_domain_attach_list(struct device *dev,
+			      const struct dev_pm_domain_attach_data *data,
+			      struct dev_pm_domain_list **list);
 void dev_pm_domain_detach(struct device *dev, bool power_off);
+void dev_pm_domain_detach_list(struct dev_pm_domain_list *list);
 int dev_pm_domain_start(struct device *dev);
 void dev_pm_domain_set(struct device *dev, struct dev_pm_domain *pd);
 int dev_pm_domain_set_performance_state(struct device *dev, unsigned int state);
@@ -439,7 +470,14 @@ static inline struct device *dev_pm_domain_attach_by_name(struct device *dev,
 {
 	return NULL;
 }
+static inline int dev_pm_domain_attach_list(struct device *dev,
+					    const struct dev_pm_domain_attach_data *data,
+					    struct dev_pm_domain_list **list)
+{
+	return 0;
+}
 static inline void dev_pm_domain_detach(struct device *dev, bool power_off) {}
+static inline void dev_pm_domain_detach_list(struct dev_pm_domain_list *list) {}
 static inline int dev_pm_domain_start(struct device *dev)
 {
 	return 0;
-- cgit v1.2.3
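In a driver, the pair of helpers replaces the usual open-coded attach loop. A minimal probe/remove sketch follows; the domain names and the driver scaffolding are invented for illustration, assuming <linux/pm_domain.h> and <linux/platform_device.h>:

	/* Hypothetical power domain names from the device's DT node. */
	static const char * const example_pd_names[] = { "perf", "mem" };

	struct example_drvdata {
		struct dev_pm_domain_list *pd_list;
	};

	static int example_probe(struct platform_device *pdev)
	{
		struct dev_pm_domain_attach_data pd_data = {
			.pd_names = example_pd_names,
			.num_pd_names = ARRAY_SIZE(example_pd_names),
			.pd_flags = PD_FLAG_DEV_LINK_ON,
		};
		struct example_drvdata *ddata;
		int ret;

		ddata = devm_kzalloc(&pdev->dev, sizeof(*ddata), GFP_KERNEL);
		if (!ddata)
			return -ENOMEM;
		platform_set_drvdata(pdev, ddata);

		/* Attach to the named domains; device-links get DL_FLAG_RPM_ACTIVE. */
		ret = dev_pm_domain_attach_list(&pdev->dev, &pd_data, &ddata->pd_list);
		if (ret < 0)
			return ret;

		/* ret is the number of attached PM domains (0 on non-DT platforms). */
		return 0;
	}

	static void example_remove(struct platform_device *pdev)
	{
		struct example_drvdata *ddata = platform_get_drvdata(pdev);

		dev_pm_domain_detach_list(ddata->pd_list);
	}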
From c88f9110bfbca5975a8dee4c9792ba12684c7bca Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sat, 27 Jan 2024 21:47:33 +0530
Subject: platform-msi: Prepare for real per device domains

Provide functions to create and remove per-device MSI domains, which
replace the platform-MSI domains. The new model is that each device
which utilizes platform-MSI now gets its own private MSI domain, which
is "customized" in size and has a device-specific function to write the
MSI message into the device.

This provides the same functionality as platform-MSI but avoids all the
downsides of platform-MSI, i.e. the extra ID bookkeeping and the special
data structure in the MSI descriptor. Further, the domains are only
created when the devices are really in use, so the burden is on the
usage and not on the infrastructure.

Fill in the domain template and provide two functions to init/allocate
and remove a per-device MSI domain.

Until all users and parent domain providers are converted, the
init/alloc function invokes the original platform-MSI code when the
irqdomain which is associated to the device does not provide MSI parent
functionality yet.

Signed-off-by: Thomas Gleixner
Signed-off-by: Anup Patel
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20240127161753.114685-6-apatel@ventanamicro.com
---
 drivers/base/platform-msi.c | 103 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/msi.h         |   4 ++
 2 files changed, 107 insertions(+)

(limited to 'drivers/base')

diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c
index f37ad34c80ec..b56e919acabb 100644
--- a/drivers/base/platform-msi.c
+++ b/drivers/base/platform-msi.c
@@ -13,6 +13,8 @@
 #include
 #include

+/* Begin of removal area. Once everything is converted over. Cleanup the includes too! */
+
 #define DEV_ID_SHIFT	21
 #define MAX_DEV_MSIS	(1 << (32 - DEV_ID_SHIFT))

@@ -350,3 +352,104 @@ int platform_msi_device_domain_alloc(struct irq_domain *domain, unsigned int vir
 	return msi_domain_populate_irqs(domain->parent, dev, virq, nr_irqs, &data->arg);
 }
+
+/* End of removal area */
+
+/* Real per device domain interfaces */
+
+/*
+ * This indirection can go when platform_device_msi_init_and_alloc_irqs()
+ * is switched to a proper irq_chip::irq_write_msi_msg() callback. Keep it
+ * simple for now.
+ */ +static void platform_msi_write_msi_msg(struct irq_data *d, struct msi_msg *msg) +{ + irq_write_msi_msg_t cb = d->chip_data; + + cb(irq_data_get_msi_desc(d), msg); +} + +static void platform_msi_set_desc_byindex(msi_alloc_info_t *arg, struct msi_desc *desc) +{ + arg->desc = desc; + arg->hwirq = desc->msi_index; +} + +static const struct msi_domain_template platform_msi_template = { + .chip = { + .name = "pMSI", + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_write_msi_msg = platform_msi_write_msi_msg, + /* The rest is filled in by the platform MSI parent */ + }, + + .ops = { + .set_desc = platform_msi_set_desc_byindex, + }, + + .info = { + .bus_token = DOMAIN_BUS_DEVICE_MSI, + }, +}; + +/** + * platform_device_msi_init_and_alloc_irqs - Initialize platform device MSI + * and allocate interrupts for @dev + * @dev: The device for which to allocate interrupts + * @nvec: The number of interrupts to allocate + * @write_msi_msg: Callback to write an interrupt message for @dev + * + * Returns: + * Zero for success, or an error code in case of failure + * + * This creates a MSI domain on @dev which has @dev->msi.domain as + * parent. The parent domain sets up the new domain. The domain has + * a fixed size of @nvec. The domain is managed by devres and will + * be removed when the device is removed. + * + * Note: For migration purposes this falls back to the original platform_msi code + * up to the point where all platforms have been converted to the MSI + * parent model. + */ +int platform_device_msi_init_and_alloc_irqs(struct device *dev, unsigned int nvec, + irq_write_msi_msg_t write_msi_msg) +{ + struct irq_domain *domain = dev->msi.domain; + + if (!domain || !write_msi_msg) + return -EINVAL; + + /* Migration support. Will go away once everything is converted */ + if (!irq_domain_is_msi_parent(domain)) + return platform_msi_domain_alloc_irqs(dev, nvec, write_msi_msg); + + /* + * @write_msi_msg is stored in the resulting msi_domain_info::data. + * The underlying domain creation mechanism will assign that + * callback to the resulting irq chip. + */ + if (!msi_create_device_irq_domain(dev, MSI_DEFAULT_DOMAIN, + &platform_msi_template, + nvec, NULL, write_msi_msg)) + return -ENODEV; + + return msi_domain_alloc_irqs_range(dev, MSI_DEFAULT_DOMAIN, 0, nvec - 1); +} +EXPORT_SYMBOL_GPL(platform_device_msi_init_and_alloc_irqs); + +/** + * platform_device_msi_free_irqs_all - Free all interrupts for @dev + * @dev: The device for which to free interrupts + */ +void platform_device_msi_free_irqs_all(struct device *dev) +{ + struct irq_domain *domain = dev->msi.domain; + + msi_domain_free_irqs_all(dev, MSI_DEFAULT_DOMAIN); + + /* Migration support. 
Will go away once everything is converted */ + if (!irq_domain_is_msi_parent(domain)) + platform_msi_free_priv_data(dev); +} +EXPORT_SYMBOL_GPL(platform_device_msi_free_irqs_all); diff --git a/include/linux/msi.h b/include/linux/msi.h index d5d1513ef4d6..ef167961c782 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -664,6 +664,10 @@ int platform_msi_device_domain_alloc(struct irq_domain *domain, unsigned int vir void platform_msi_device_domain_free(struct irq_domain *domain, unsigned int virq, unsigned int nvec); void *platform_msi_get_host_data(struct irq_domain *domain); +/* Per device platform MSI */ +int platform_device_msi_init_and_alloc_irqs(struct device *dev, unsigned int nvec, + irq_write_msi_msg_t write_msi_msg); +void platform_device_msi_free_irqs_all(struct device *dev); bool msi_device_has_isolated_msi(struct device *dev); #else /* CONFIG_GENERIC_MSI_IRQ */ -- cgit v1.2.3 From 1a4671ff7a903e87e4e76213e200bb8bcfa942e4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Feb 2024 16:35:43 +0100 Subject: platform-msi: Remove unused interfaces Signed-off-by: Thomas Gleixner --- drivers/base/platform-msi.c | 16 ++-------------- include/linux/msi.h | 3 --- 2 files changed, 2 insertions(+), 17 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c index b56e919acabb..0d01890160f3 100644 --- a/drivers/base/platform-msi.c +++ b/drivers/base/platform-msi.c @@ -206,8 +206,8 @@ static void platform_msi_free_priv_data(struct device *dev) * Returns: * Zero for success, or an error code in case of failure */ -int platform_msi_domain_alloc_irqs(struct device *dev, unsigned int nvec, - irq_write_msi_msg_t write_msi_msg) +static int platform_msi_domain_alloc_irqs(struct device *dev, unsigned int nvec, + irq_write_msi_msg_t write_msi_msg) { int err; @@ -221,18 +221,6 @@ int platform_msi_domain_alloc_irqs(struct device *dev, unsigned int nvec, return err; } -EXPORT_SYMBOL_GPL(platform_msi_domain_alloc_irqs); - -/** - * platform_msi_domain_free_irqs - Free MSI interrupts for @dev - * @dev: The device for which to free interrupts - */ -void platform_msi_domain_free_irqs(struct device *dev) -{ - msi_domain_free_irqs_all(dev, MSI_DEFAULT_DOMAIN); - platform_msi_free_priv_data(dev); -} -EXPORT_SYMBOL_GPL(platform_msi_domain_free_irqs); /** * platform_msi_get_host_data - Query the private data associated with diff --git a/include/linux/msi.h b/include/linux/msi.h index ef167961c782..b0842ea55bde 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -635,9 +635,6 @@ struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain); struct irq_domain *platform_msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent); -int platform_msi_domain_alloc_irqs(struct device *dev, unsigned int nvec, - irq_write_msi_msg_t write_msi_msg); -void platform_msi_domain_free_irqs(struct device *dev); /* When an MSI domain is used as an intermediate domain */ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, -- cgit v1.2.3 From c5f1e2d1890935a734c302b9b8579748222b8e1e Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 8 Jan 2024 14:27:43 +0100 Subject: mm/memory_hotplug: introduce MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers Patch series "implement "memmap on memory" feature on s390". This series provides "memmap on memory" support on s390 platform. 
"memmap on memory" allows struct pages array to be allocated from the hotplugged memory range instead of allocating it from main system memory. s390 currently preallocates struct pages array for all potentially possible memory, which ensures memory onlining always succeeds, but with the cost of significant memory consumption from the available system memory during boottime. In certain extreme configuration, this could lead to ipl failure. "memmap on memory" ensures struct pages array are populated from self contained hotplugged memory range instead of depleting the available system memory and this could eliminate ipl failure on s390 platform. On other platforms, system might go OOM when the physically hotplugged memory depletes the available memory before it is onlined. Hence, "memmap on memory" feature was introduced as described in commit a08a2ae34613 ("mm,memory_hotplug: allocate memmap from the added memory range"). Unlike other architectures, s390 memory blocks are not physically accessible until it is online. To make it physically accessible two new memory notifiers MEM_PREPARE_ONLINE / MEM_FINISH_OFFLINE are added and this notifier lets the hypervisor inform that the memory should be made physically accessible. This allows for "memmap on memory" initialization during memory hotplug onlining phase, which is performed before calling MEM_GOING_ONLINE notifier. Patch 1 introduces MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers to prepare the transition of memory to and from a physically accessible state. New mhp_flag MHP_OFFLINE_INACCESSIBLE is introduced to ensure altmap cannot be written when adding memory - before it is set online. This enhancement is crucial for implementing the "memmap on memory" feature for s390 in a subsequent patch. Patches 2 allocates vmemmap pages from self-contained memory range for s390. It allocates memory map (struct pages array) from the hotplugged memory range, rather than using system memory by passing altmap to vmemmap functions. Patch 3 removes unhandled memory notifier types on s390. Patch 4 implements MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers on s390. MEM_PREPARE_ONLINE memory notifier makes memory block physical accessible via sclp assign command. The notifier ensures self-contained memory maps are accessible and hence enabling the "memmap on memory" on s390. MEM_FINISH_OFFLINE memory notifier shifts the memory block to an inaccessible state via sclp unassign command. Patch 5 finally enables MHP_MEMMAP_ON_MEMORY on s390. This patch (of 5): Introduce MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers to prepare the transition of memory to and from a physically accessible state. This enhancement is crucial for implementing the "memmap on memory" feature for s390 in a subsequent patch. Platforms such as x86 can support physical memory hotplug via ACPI. When there is physical memory hotplug, ACPI event leads to the memory addition with the following callchain: acpi_memory_device_add() -> acpi_memory_enable_device() -> __add_memory() After this, the hotplugged memory is physically accessible, and altmap support prepared, before the "memmap on memory" initialization in memory_block_online() is called. On s390, memory hotplug works in a different way. The available hotplug memory has to be defined upfront in the hypervisor, but it is made physically accessible only when the user sets it online via sysfs, currently in the MEM_GOING_ONLINE notifier. 
This is too late, because the "memmap on memory" initialization is
performed before the MEM_GOING_ONLINE notifier is called.

During the memory hotplug addition phase, altmap support is prepared;
during the memory onlining phase, s390 requires the memory to be
physically accessible so that it can subsequently initiate the "memmap
on memory" initialization process.

The memory provider will handle the new MEM_PREPARE_ONLINE /
MEM_FINISH_OFFLINE notifications and make the memory accessible.

The mhp_flag MHP_OFFLINE_INACCESSIBLE is introduced and is relevant
when used along with MHP_MEMMAP_ON_MEMORY, because the altmap cannot be
written (e.g., poisoned) when adding memory -- before it is set online.
This allows for adding memory with an altmap that is not currently made
available by a hypervisor. When onlining that memory, the hypervisor
can be instructed to make that memory accessible via the new notifiers,
and the onlining phase will not require any memory allocations, which
is helpful in low-memory situations.

All architectures ignore unknown memory notifiers. Therefore, the
introduction of these new notifiers does not result in any functional
modifications across architectures.

Link: https://lkml.kernel.org/r/20240108132747.3238763-1-sumanthk@linux.ibm.com
Link: https://lkml.kernel.org/r/20240108132747.3238763-2-sumanthk@linux.ibm.com
Signed-off-by: Sumanth Korikkar
Suggested-by: Gerald Schaefer
Suggested-by: David Hildenbrand
Acked-by: David Hildenbrand
Cc: Alexander Gordeev
Cc: Aneesh Kumar K.V
Cc: Anshuman Khandual
Cc: Heiko Carstens
Cc: Michal Hocko
Cc: Oscar Salvador
Cc: Vasily Gorbik
Signed-off-by: Andrew Morton
---
 drivers/base/memory.c          | 23 ++++++++++++++++++++++-
 include/linux/memory.h         |  9 +++++++++
 include/linux/memory_hotplug.h | 18 +++++++++++++++++-
 include/linux/memremap.h       |  1 +
 mm/memory_hotplug.c            | 17 ++++++++++++++---
 mm/sparse.c                    |  3 ++-
 6 files changed, 65 insertions(+), 6 deletions(-)

(limited to 'drivers/base')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 14f964a7719b..c0436f46cfb7 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -188,6 +188,7 @@ static int memory_block_online(struct memory_block *mem)
 	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 	unsigned long nr_vmemmap_pages = 0;
+	struct memory_notify arg;
 	struct zone *zone;
 	int ret;
@@ -207,9 +208,19 @@ static int memory_block_online(struct memory_block *mem)
 	if (mem->altmap)
 		nr_vmemmap_pages = mem->altmap->free;

+	arg.altmap_start_pfn = start_pfn;
+	arg.altmap_nr_pages = nr_vmemmap_pages;
+	arg.start_pfn = start_pfn + nr_vmemmap_pages;
+	arg.nr_pages = nr_pages - nr_vmemmap_pages;
 	mem_hotplug_begin();
+	ret = memory_notify(MEM_PREPARE_ONLINE, &arg);
+	ret = notifier_to_errno(ret);
+	if (ret)
+		goto out_notifier;
+
 	if (nr_vmemmap_pages) {
-		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
+		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages,
+						zone, mem->altmap->inaccessible);
 		if (ret)
 			goto out;
 	}
@@ -231,7 +242,11 @@ static int memory_block_online(struct memory_block *mem)
 					  nr_vmemmap_pages);

 	mem->zone = zone;
+	mem_hotplug_done();
+	return ret;
 out:
+	memory_notify(MEM_FINISH_OFFLINE, &arg);
+out_notifier:
 	mem_hotplug_done();
 	return ret;
 }
@@ -244,6 +259,7 @@ static int memory_block_offline(struct memory_block *mem)
 	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 	unsigned long nr_vmemmap_pages = 0;
+	struct
memory_notify arg; int ret; if (!mem->zone) @@ -275,6 +291,11 @@ static int memory_block_offline(struct memory_block *mem) mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); mem->zone = NULL; + arg.altmap_start_pfn = start_pfn; + arg.altmap_nr_pages = nr_vmemmap_pages; + arg.start_pfn = start_pfn + nr_vmemmap_pages; + arg.nr_pages = nr_pages - nr_vmemmap_pages; + memory_notify(MEM_FINISH_OFFLINE, &arg); out: mem_hotplug_done(); return ret; diff --git a/include/linux/memory.h b/include/linux/memory.h index f53cfdaaaa41..939a16bd5cea 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -96,8 +96,17 @@ int set_memory_block_size_order(unsigned int order); #define MEM_GOING_ONLINE (1<<3) #define MEM_CANCEL_ONLINE (1<<4) #define MEM_CANCEL_OFFLINE (1<<5) +#define MEM_PREPARE_ONLINE (1<<6) +#define MEM_FINISH_OFFLINE (1<<7) struct memory_notify { + /* + * The altmap_start_pfn and altmap_nr_pages fields are designated for + * specifying the altmap range and are exclusively intended for use in + * MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. + */ + unsigned long altmap_start_pfn; + unsigned long altmap_nr_pages; unsigned long start_pfn; unsigned long nr_pages; int status_change_nid_normal; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7d2076583494..ee00015575aa 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -106,6 +106,22 @@ typedef int __bitwise mhp_t; * implies the node id (nid). */ #define MHP_NID_IS_MGID ((__force mhp_t)BIT(2)) +/* + * The hotplugged memory is completely inaccessible while the memory is + * offline. The memory provider will handle MEM_PREPARE_ONLINE / + * MEM_FINISH_OFFLINE notifications and make the memory accessible. + * + * This flag is only relevant when used along with MHP_MEMMAP_ON_MEMORY, + * because the altmap cannot be written (e.g., poisoned) when adding + * memory -- before it is set online. + * + * This allows for adding memory with an altmap that is not currently + * made available by a hypervisor. When onlining that memory, the + * hypervisor can be instructed to make that memory available, and + * the onlining phase will not require any memory allocations, which is + * helpful in low-memory situations. 
+ */ +#define MHP_OFFLINE_INACCESSIBLE ((__force mhp_t)BIT(3)) /* * Extended parameters for memory hotplug: @@ -154,7 +170,7 @@ extern void adjust_present_page_count(struct page *page, long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone); + struct zone *zone, bool mhp_off_inaccessible); extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 744c830f4b13..9837f3e6fb95 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -25,6 +25,7 @@ struct vmem_altmap { unsigned long free; unsigned long align; unsigned long alloc; + bool inaccessible; }; /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 21890994c1d3..707027f69150 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1087,7 +1087,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group, } int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone) + struct zone *zone, bool mhp_off_inaccessible) { unsigned long end_pfn = pfn + nr_pages; int ret, i; @@ -1096,6 +1096,15 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, if (ret) return ret; + /* + * Memory block is accessible at this stage and hence poison the struct + * pages now. If the memory block is accessible during memory hotplug + * addition phase, then page poisining is already performed in + * sparse_add_section(). + */ + if (mhp_off_inaccessible) + page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); for (i = 0; i < nr_pages; i++) @@ -1415,7 +1424,7 @@ static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size) } static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, - u64 start, u64 size) + u64 start, u64 size, mhp_t mhp_flags) { unsigned long memblock_size = memory_block_size_bytes(); u64 cur_start; @@ -1431,6 +1440,8 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, }; mhp_altmap.free = memory_block_memmap_on_memory_pages(); + if (mhp_flags & MHP_OFFLINE_INACCESSIBLE) + mhp_altmap.inaccessible = true; params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap), GFP_KERNEL); if (!params.altmap) { @@ -1516,7 +1527,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) */ if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) && mhp_supports_memmap_on_memory(memory_block_size_bytes())) { - ret = create_altmaps_and_memory_blocks(nid, group, start, size); + ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags); if (ret) goto error; } else { diff --git a/mm/sparse.c b/mm/sparse.c index 338cf946dee8..aed0951b87fa 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -908,7 +908,8 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, * Poison uninitialized struct pages in order to catch invalid flags * combinations. 
 */
-	page_init_poison(memmap, sizeof(struct page) * nr_pages);
+	if (!altmap || !altmap->inaccessible)
+		page_init_poison(memmap, sizeof(struct page) * nr_pages);

 	ms = __nr_to_section(section_nr);
 	set_section_nid(section_nr, nid);
-- cgit v1.2.3
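The new notifications are delivered through the ordinary memory notifier chain. A hypothetical memory provider could hook them as sketched below; the handler bodies are invented for illustration, and on s390 the real implementation issues sclp assign/unassign commands:

	#include <linux/memory.h>
	#include <linux/notifier.h>

	static int example_memory_notifier(struct notifier_block *nb,
					   unsigned long action, void *data)
	{
		struct memory_notify *arg = data;

		switch (action) {
		case MEM_PREPARE_ONLINE:
			/* Make the block, including the altmap range starting at
			 * arg->altmap_start_pfn, physically accessible; on failure
			 * return notifier_from_errno(-Exxx) instead. */
			break;
		case MEM_FINISH_OFFLINE:
			/* The block is offline (or onlining failed); make it
			 * inaccessible again. */
			break;
		}
		return NOTIFY_OK;
	}

	static struct notifier_block example_memory_nb = {
		.notifier_call = example_memory_notifier,
	};

	/* During initialization: register_memory_notifier(&example_memory_nb); */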
From 5cec4eb7fad6fb1e9a3dd8403b558d1eff7490ff Mon Sep 17 00:00:00 2001
From: Huang Ying
Date: Fri, 26 Jan 2024 16:19:44 +0800
Subject: mm and cache_info: remove unnecessary CPU cache info update

For each CPU hotplug event, we will update per-CPU data slice size and
corresponding PCP configuration for every online CPU to make the
implementation simple. But Kyle reported that this takes tens of
seconds during boot on a machine with 34 zones and 3840 CPUs.

So, in this patch, for each CPU hotplug event, we only update the
per-CPU data slice size and the corresponding PCP configuration for the
CPUs that share caches with the hotplugged CPU. With the patch, the
system boot time on that machine is reduced by 67 seconds.

Link: https://lkml.kernel.org/r/20240126081944.414520-1-ying.huang@intel.com
Fixes: 362d37a106dd ("mm, pcp: reduce lock contention for draining high-order pages")
Signed-off-by: "Huang, Ying"
Originally-by: Kyle Meyer
Reported-and-tested-by: Kyle Meyer
Cc: Sudeep Holla
Cc: Mel Gorman
Signed-off-by: Andrew Morton
---
 drivers/base/cacheinfo.c | 50 ++++++++++++++++++++++++++++++++++++++++++------
 include/linux/gfp.h      |  2 +-
 mm/page_alloc.c          | 39 +++++++++++++++++--------------------
 3 files changed, 63 insertions(+), 28 deletions(-)

(limited to 'drivers/base')

diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index f1e79263fe61..23b8cba4a2a3 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -898,6 +898,37 @@ err:
 	return rc;
 }

+static unsigned int cpu_map_shared_cache(bool online, unsigned int cpu,
+					 cpumask_t **map)
+{
+	struct cacheinfo *llc, *sib_llc;
+	unsigned int sibling;
+
+	if (!last_level_cache_is_valid(cpu))
+		return 0;
+
+	llc = per_cpu_cacheinfo_idx(cpu, cache_leaves(cpu) - 1);
+
+	if (llc->type != CACHE_TYPE_DATA && llc->type != CACHE_TYPE_UNIFIED)
+		return 0;
+
+	if (online) {
+		*map = &llc->shared_cpu_map;
+		return cpumask_weight(*map);
+	}
+
+	/* shared_cpu_map of offlined CPU will be cleared, so use sibling map */
+	for_each_cpu(sibling, &llc->shared_cpu_map) {
+		if (sibling == cpu || !last_level_cache_is_valid(sibling))
+			continue;
+		sib_llc = per_cpu_cacheinfo_idx(sibling, cache_leaves(sibling) - 1);
+		*map = &sib_llc->shared_cpu_map;
+		return cpumask_weight(*map);
+	}
+
+	return 0;
+}
+
 /*
 * Calculate the size of the per-CPU data cache slice.  This can be
 * used to estimate the size of the data cache slice that can be used
@@ -929,28 +960,31 @@ static void update_per_cpu_data_slice_size_cpu(unsigned int cpu)
 	ci->per_cpu_data_slice_size = llc->size / nr_shared;
 }

-static void update_per_cpu_data_slice_size(bool cpu_online, unsigned int cpu)
+static void update_per_cpu_data_slice_size(bool cpu_online, unsigned int cpu,
+					   cpumask_t *cpu_map)
 {
 	unsigned int icpu;

-	for_each_online_cpu(icpu) {
+	for_each_cpu(icpu, cpu_map) {
 		if (!cpu_online && icpu == cpu)
 			continue;
 		update_per_cpu_data_slice_size_cpu(icpu);
+		setup_pcp_cacheinfo(icpu);
 	}
 }

 static int cacheinfo_cpu_online(unsigned int cpu)
 {
 	int rc = detect_cache_attributes(cpu);
+	cpumask_t *cpu_map;

 	if (rc)
 		return rc;
 	rc = cache_add_dev(cpu);
 	if (rc)
 		goto err;
-	update_per_cpu_data_slice_size(true, cpu);
-	setup_pcp_cacheinfo();
+	if (cpu_map_shared_cache(true, cpu, &cpu_map))
+		update_per_cpu_data_slice_size(true, cpu, cpu_map);
 	return 0;
 err:
 	free_cache_attributes(cpu);
@@ -959,12 +993,16 @@ err:

 static int cacheinfo_cpu_pre_down(unsigned int cpu)
 {
+	cpumask_t *cpu_map;
+	unsigned int nr_shared;
+
+	nr_shared = cpu_map_shared_cache(false, cpu, &cpu_map);
 	if (cpumask_test_and_clear_cpu(cpu, &cache_dev_map))
 		cpu_cache_sysfs_exit(cpu);

 	free_cache_attributes(cpu);
-	update_per_cpu_data_slice_size(false, cpu);
-	setup_pcp_cacheinfo();
+	if (nr_shared > 1)
+		update_per_cpu_data_slice_size(false, cpu, cpu_map);
 	return 0;
 }

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index de292a007138..09e22091f1b0 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -334,7 +334,7 @@ void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);

 void page_alloc_init_late(void);
-void setup_pcp_cacheinfo(void);
+void setup_pcp_cacheinfo(unsigned int cpu);

 /*
 * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 150d4f23b010..9faca05d124e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5572,37 +5572,34 @@ static void zone_pcp_update(struct zone *zone, int cpu_online)
 	mutex_unlock(&pcp_batch_high_lock);
 }

-static void zone_pcp_update_cacheinfo(struct zone *zone)
+static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
 {
-	int cpu;
 	struct per_cpu_pages *pcp;
 	struct cpu_cacheinfo *cci;

-	for_each_online_cpu(cpu) {
-		pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
-		cci = get_cpu_cacheinfo(cpu);
-		/*
-		 * If data cache slice of CPU is large enough, "pcp->batch"
-		 * pages can be preserved in PCP before draining PCP for
-		 * consecutive high-order pages freeing without allocation.
-		 * This can reduce zone lock contention without hurting
-		 * cache-hot pages sharing.
-		 */
-		spin_lock(&pcp->lock);
-		if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
-			pcp->flags |= PCPF_FREE_HIGH_BATCH;
-		else
-			pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
-		spin_unlock(&pcp->lock);
-	}
+	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+	cci = get_cpu_cacheinfo(cpu);
+	/*
+	 * If data cache slice of CPU is large enough, "pcp->batch"
+	 * pages can be preserved in PCP before draining PCP for
+	 * consecutive high-order pages freeing without allocation.
+	 * This can reduce zone lock contention without hurting
+	 * cache-hot pages sharing.
+	 */
+	spin_lock(&pcp->lock);
+	if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
+		pcp->flags |= PCPF_FREE_HIGH_BATCH;
+	else
+		pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
+	spin_unlock(&pcp->lock);
 }

-void setup_pcp_cacheinfo(void)
+void setup_pcp_cacheinfo(unsigned int cpu)
 {
 	struct zone *zone;

 	for_each_populated_zone(zone)
-		zone_pcp_update_cacheinfo(zone);
+		zone_pcp_update_cacheinfo(zone, cpu);
 }

 /*
-- cgit v1.2.3

From 015abee404760249a5c968b9ce29216b94b8ced1 Mon Sep 17 00:00:00 2001
From: Vilas Bhat
Date: Wed, 21 Feb 2024 19:47:53 +0000
Subject: PM: runtime: add tracepoint for runtime_status changes

Existing runtime PM ftrace events (`rpm_suspend`, `rpm_resume`,
`rpm_return_int`) offer limited visibility into the exact timing of
device runtime power state transitions, particularly when asynchronous
operations are involved. When the `rpm_suspend` or `rpm_resume`
functions are invoked with the `RPM_ASYNC` flag, a return value of 0
(i.e., success) merely indicates that the device power state request
has been queued, not that the device has yet transitioned.

A new ftrace event, `rpm_status`, is introduced. This event directly
logs the `power.runtime_status` value of a device whenever it changes,
providing granular tracking of runtime power state transitions
regardless of synchronous or asynchronous `rpm_suspend` / `rpm_resume`
usage.

Signed-off-by: Vilas Bhat
Signed-off-by: Rafael J. Wysocki
---
 drivers/base/power/runtime.c |  1 +
 include/trace/events/rpm.h   | 42 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

(limited to 'drivers/base')

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 5275a6b2e980..2ee45841486b 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -94,6 +94,7 @@ static void update_pm_runtime_accounting(struct device *dev)
 static void __update_runtime_status(struct device *dev, enum rpm_status status)
 {
 	update_pm_runtime_accounting(dev);
+	trace_rpm_status(dev, status);
 	dev->power.runtime_status = status;
 }

diff --git a/include/trace/events/rpm.h b/include/trace/events/rpm.h
index 3c716214dab1..bd120e23ce12 100644
--- a/include/trace/events/rpm.h
+++ b/include/trace/events/rpm.h
@@ -101,6 +101,48 @@ TRACE_EVENT(rpm_return_int,
 		__entry->ret)
 );

+#define RPM_STATUS_STRINGS \
+	EM(RPM_INVALID, "RPM_INVALID") \
+	EM(RPM_ACTIVE, "RPM_ACTIVE") \
+	EM(RPM_RESUMING, "RPM_RESUMING") \
+	EM(RPM_SUSPENDED, "RPM_SUSPENDED") \
+	EMe(RPM_SUSPENDING, "RPM_SUSPENDING")
+
+/* Enums require being exported to userspace, for user tool parsing. */
+#undef EM
+#undef EMe
+#define EM(a, b) TRACE_DEFINE_ENUM(a);
+#define EMe(a, b) TRACE_DEFINE_ENUM(a);
+
+RPM_STATUS_STRINGS
+
+/*
+ * Now redefine the EM() and EMe() macros to map the enums to the strings that
+ * will be printed in the output.
+ */
+#undef EM
+#undef EMe
+#define EM(a, b)	{ a, b },
+#define EMe(a, b)	{ a, b }
+
+TRACE_EVENT(rpm_status,
+	TP_PROTO(struct device *dev, enum rpm_status status),
+	TP_ARGS(dev, status),
+
+	TP_STRUCT__entry(
+		__string(name, dev_name(dev))
+		__field(int, status)
+	),
+
+	TP_fast_assign(
+		__assign_str(name, dev_name(dev));
+		__entry->status = status;
+	),
+
+	TP_printk("%s status=%s", __get_str(name),
+		  __print_symbolic(__entry->status, RPM_STATUS_STRINGS))
+);
+
 #endif /* _TRACE_RUNTIME_POWER_H */

 /* This part must be outside protection */
-- cgit v1.2.3

From 02aff8480533817a29e820729360866441d7403d Mon Sep 17 00:00:00 2001
From: Baoquan He
Date: Wed, 24 Jan 2024 13:12:44 +0800
Subject: crash: split crash dumping code out from kexec_core.c

Currently, KEXEC_CORE selects CRASH_CORE automatically because the
crash code needs to be built in to avoid compiling errors when building
the kexec code, even though the crash dumping functionality is not
enabled. E.g.:
--------------------
CONFIG_CRASH_CORE=y
CONFIG_KEXEC_CORE=y
CONFIG_KEXEC=y
CONFIG_KEXEC_FILE=y
---------------------

After splitting out the crashkernel reservation code and the vmcoreinfo
exporting code, only crash-related code is left in kernel/crash_core.c.
Now move the crash-related code from kexec_core.c to crash_core.c and
only build it in when CONFIG_CRASH_DUMP=y.

Also wrap the crash code inside CONFIG_CRASH_DUMP ifdeffery scope, or
replace inappropriate CONFIG_KEXEC_CORE ifdefs with CONFIG_CRASH_DUMP
ifdefs, in generic kernel files.

With these changes, the crash_core code is abstracted from the kexec
code and can be disabled entirely if only the kexec reboot feature is
wanted.

Link: https://lkml.kernel.org/r/20240124051254.67105-5-bhe@redhat.com
Signed-off-by: Baoquan He
Cc: Al Viro
Cc: Eric W. Biederman
Cc: Hari Bathini
Cc: Pingfan Liu
Cc: Klara Modin
Cc: Michael Kelley
Cc: Nathan Chancellor
Cc: Stephen Rothwell
Cc: Yang Li
Signed-off-by: Andrew Morton
---
 drivers/base/cpu.c         |   6 +-
 include/linux/crash_core.h |  61 +++++++++++
 include/linux/kexec.h      |  45 +-------
 init/initramfs.c           |   2 +-
 kernel/Makefile            |   3 +-
 kernel/crash_core.c        | 256 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/kexec.c             |  11 +-
 kernel/kexec_core.c        | 250 +++----------------------------------------
 kernel/kexec_file.c        |  13 ++-
 kernel/ksysfs.c            |   4 +
 10 files changed, 359 insertions(+), 292 deletions(-)

(limited to 'drivers/base')

diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 47de0f140ba6..b621a0fc75e1 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -144,7 +144,7 @@ static DEVICE_ATTR(release, S_IWUSR, NULL, cpu_release_store);
 #endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
 #endif /* CONFIG_HOTPLUG_CPU */

-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
 #include

 static ssize_t crash_notes_show(struct device *dev,
@@ -189,14 +189,14 @@ static const struct attribute_group crash_note_cpu_attr_group = {
 #endif

 static const struct attribute_group *common_cpu_attr_groups[] = {
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
 	&crash_note_cpu_attr_group,
 #endif
 	NULL
 };

 static const struct attribute_group *hotplugable_cpu_attr_groups[] = {
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
 	&crash_note_cpu_attr_group,
 #endif
 	NULL
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 7f19f62018ef..23270b16e1db 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -6,6 +6,48 @@
 #include
 #include

+struct kimage;
+
+#ifdef CONFIG_CRASH_DUMP
+
+int crash_shrink_memory(unsigned long new_size);
+ssize_t crash_get_memory_size(void);
+ +#ifndef arch_kexec_protect_crashkres +/* + * Protection mechanism for crashkernel reserved memory after + * the kdump kernel is loaded. + * + * Provide an empty default implementation here -- architecture + * code may override this + */ +static inline void arch_kexec_protect_crashkres(void) { } +#endif + +#ifndef arch_kexec_unprotect_crashkres +static inline void arch_kexec_unprotect_crashkres(void) { } +#endif + + + +#ifndef arch_crash_handle_hotplug_event +static inline void arch_crash_handle_hotplug_event(struct kimage *image) { } +#endif + +int crash_check_update_elfcorehdr(void); + +#ifndef crash_hotplug_cpu_support +static inline int crash_hotplug_cpu_support(void) { return 0; } +#endif + +#ifndef crash_hotplug_memory_support +static inline int crash_hotplug_memory_support(void) { return 0; } +#endif + +#ifndef crash_get_elfcorehdr_size +static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; } +#endif + /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 @@ -31,4 +73,23 @@ struct kexec_segment; #define KEXEC_CRASH_HP_REMOVE_MEMORY 4 #define KEXEC_CRASH_HP_INVALID_CPU -1U +extern void __crash_kexec(struct pt_regs *regs); +extern void crash_kexec(struct pt_regs *regs); +int kexec_should_crash(struct task_struct *p); +int kexec_crash_loaded(void); +void crash_save_cpu(struct pt_regs *regs, int cpu); +extern int kimage_crash_copy_vmcoreinfo(struct kimage *image); + +#else /* !CONFIG_CRASH_DUMP*/ +struct pt_regs; +struct task_struct; +struct kimage; +static inline void __crash_kexec(struct pt_regs *regs) { } +static inline void crash_kexec(struct pt_regs *regs) { } +static inline int kexec_should_crash(struct task_struct *p) { return 0; } +static inline int kexec_crash_loaded(void) { return 0; } +static inline void crash_save_cpu(struct pt_regs *regs, int cpu) {}; +static inline int kimage_crash_copy_vmcoreinfo(struct kimage *image) { return 0; }; +#endif /* CONFIG_CRASH_DUMP*/ + #endif /* LINUX_CRASH_CORE_H */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 9c7bb8b56ed6..060835bb82d5 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -15,7 +15,6 @@ #if !defined(__ASSEMBLY__) -#include #include #include #include @@ -33,6 +32,7 @@ extern note_buf_t __percpu *crash_notes; #include #include #include +#include /* Verify architecture specific macros are defined */ @@ -380,13 +380,6 @@ extern struct page *kimage_alloc_control_pages(struct kimage *image, static inline int machine_kexec_post_load(struct kimage *image) { return 0; } #endif -extern void __crash_kexec(struct pt_regs *); -extern void crash_kexec(struct pt_regs *); -int kexec_should_crash(struct task_struct *); -int kexec_crash_loaded(void); -void crash_save_cpu(struct pt_regs *regs, int cpu); -extern int kimage_crash_copy_vmcoreinfo(struct kimage *image); - extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; @@ -410,24 +403,6 @@ bool kexec_load_permitted(int kexec_image_type); /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; -int crash_shrink_memory(unsigned long new_size); -ssize_t crash_get_memory_size(void); - -#ifndef arch_kexec_protect_crashkres -/* - * Protection mechanism for crashkernel reserved memory after - * the kdump kernel is loaded. 
- * - * Provide an empty default implementation here -- architecture - * code may override this - */ -static inline void arch_kexec_protect_crashkres(void) { } -#endif - -#ifndef arch_kexec_unprotect_crashkres -static inline void arch_kexec_unprotect_crashkres(void) { } -#endif - #ifndef page_to_boot_pfn static inline unsigned long page_to_boot_pfn(struct page *page) { @@ -484,24 +459,6 @@ static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, g static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { } #endif -#ifndef arch_crash_handle_hotplug_event -static inline void arch_crash_handle_hotplug_event(struct kimage *image) { } -#endif - -int crash_check_update_elfcorehdr(void); - -#ifndef crash_hotplug_cpu_support -static inline int crash_hotplug_cpu_support(void) { return 0; } -#endif - -#ifndef crash_hotplug_memory_support -static inline int crash_hotplug_memory_support(void) { return 0; } -#endif - -#ifndef crash_get_elfcorehdr_size -static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; } -#endif - extern bool kexec_file_dbg_print; #define kexec_dprintk(fmt, ...) \ diff --git a/init/initramfs.c b/init/initramfs.c index 76deb48c38cb..6f095f54eec9 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -642,7 +642,7 @@ void __weak __init free_initrd_mem(unsigned long start, unsigned long end) "initrd"); } -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_RESERVE static bool __init kexec_free_initrd(void) { unsigned long crashk_start = (unsigned long)__va(crashk_res.start); diff --git a/kernel/Makefile b/kernel/Makefile index 35abc65e1f1a..3c13240dfc9f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -70,7 +70,8 @@ obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o -obj-$(CONFIG_KEXEC_CORE) += kexec_core.o crash_core.o +obj-$(CONFIG_KEXEC_CORE) += kexec_core.o +obj-$(CONFIG_CRASH_DUMP) += crash_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 2f4df1fe6f7a..78b5dc7cee3a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -11,9 +11,14 @@ #include #include #include +#include #include #include #include +#include +#include +#include +#include #include #include @@ -26,6 +31,131 @@ /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; +#ifdef CONFIG_CRASH_DUMP + +int kimage_crash_copy_vmcoreinfo(struct kimage *image) +{ + struct page *vmcoreinfo_page; + void *safecopy; + + if (!IS_ENABLED(CONFIG_CRASH_DUMP)) + return 0; + if (image->type != KEXEC_TYPE_CRASH) + return 0; + + /* + * For kdump, allocate one vmcoreinfo safe copy from the + * crash memory. as we have arch_kexec_protect_crashkres() + * after kexec syscall, we naturally protect it from write + * (even read) access under kernel direct mapping. But on + * the other hand, we still need to operate it when crash + * happens to generate vmcoreinfo note, hereby we rely on + * vmap for this purpose. 
+ */ + vmcoreinfo_page = kimage_alloc_control_pages(image, 0); + if (!vmcoreinfo_page) { + pr_warn("Could not allocate vmcoreinfo buffer\n"); + return -ENOMEM; + } + safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); + if (!safecopy) { + pr_warn("Could not vmap vmcoreinfo buffer\n"); + return -ENOMEM; + } + + image->vmcoreinfo_data_copy = safecopy; + crash_update_vmcoreinfo_safecopy(safecopy); + + return 0; +} + + + +int kexec_should_crash(struct task_struct *p) +{ + /* + * If crash_kexec_post_notifiers is enabled, don't run + * crash_kexec() here yet, which must be run after panic + * notifiers in panic(). + */ + if (crash_kexec_post_notifiers) + return 0; + /* + * There are 4 panic() calls in make_task_dead() path, each of which + * corresponds to each of these 4 conditions. + */ + if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) + return 1; + return 0; +} + +int kexec_crash_loaded(void) +{ + return !!kexec_crash_image; +} +EXPORT_SYMBOL_GPL(kexec_crash_loaded); + +/* + * No panic_cpu check version of crash_kexec(). This function is called + * only when panic_cpu holds the current CPU number; this is the only CPU + * which processes crash_kexec routines. + */ +void __noclone __crash_kexec(struct pt_regs *regs) +{ + /* Take the kexec_lock here to prevent sys_kexec_load + * running on one cpu from replacing the crash kernel + * we are using after a panic on a different cpu. + * + * If the crash kernel was not located in a fixed area + * of memory the xchg(&kexec_crash_image) would be + * sufficient. But since I reuse the memory... + */ + if (kexec_trylock()) { + if (kexec_crash_image) { + struct pt_regs fixed_regs; + + crash_setup_regs(&fixed_regs, regs); + crash_save_vmcoreinfo(); + machine_crash_shutdown(&fixed_regs); + machine_kexec(kexec_crash_image); + } + kexec_unlock(); + } +} +STACK_FRAME_NON_STANDARD(__crash_kexec); + +__bpf_kfunc void crash_kexec(struct pt_regs *regs) +{ + int old_cpu, this_cpu; + + /* + * Only one CPU is allowed to execute the crash_kexec() code as with + * panic(). Otherwise parallel calls of panic() and crash_kexec() + * may stop each other. To exclude them, we use panic_cpu here too. + */ + old_cpu = PANIC_CPU_INVALID; + this_cpu = raw_smp_processor_id(); + + if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + /* This is the 1st CPU which comes here, so go ahead. */ + __crash_kexec(regs); + + /* + * Reset panic_cpu to allow another panic()/crash_kexec() + * call. + */ + atomic_set(&panic_cpu, PANIC_CPU_INVALID); + } +} + +static inline resource_size_t crash_resource_size(const struct resource *res) +{ + return !res->end ? 
0 : resource_size(res); +} + + + + int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz) { @@ -187,6 +317,130 @@ int crash_exclude_mem_range(struct crash_mem *mem, return 0; } +ssize_t crash_get_memory_size(void) +{ + ssize_t size = 0; + + if (!kexec_trylock()) + return -EBUSY; + + size += crash_resource_size(&crashk_res); + size += crash_resource_size(&crashk_low_res); + + kexec_unlock(); + return size; +} + +static int __crash_shrink_memory(struct resource *old_res, + unsigned long new_size) +{ + struct resource *ram_res; + + ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); + if (!ram_res) + return -ENOMEM; + + ram_res->start = old_res->start + new_size; + ram_res->end = old_res->end; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; + ram_res->name = "System RAM"; + + if (!new_size) { + release_resource(old_res); + old_res->start = 0; + old_res->end = 0; + } else { + crashk_res.end = ram_res->start - 1; + } + + crash_free_reserved_phys_range(ram_res->start, ram_res->end); + insert_resource(&iomem_resource, ram_res); + + return 0; +} + +int crash_shrink_memory(unsigned long new_size) +{ + int ret = 0; + unsigned long old_size, low_size; + + if (!kexec_trylock()) + return -EBUSY; + + if (kexec_crash_image) { + ret = -ENOENT; + goto unlock; + } + + low_size = crash_resource_size(&crashk_low_res); + old_size = crash_resource_size(&crashk_res) + low_size; + new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN); + if (new_size >= old_size) { + ret = (new_size == old_size) ? 0 : -EINVAL; + goto unlock; + } + + /* + * (low_size > new_size) implies that low_size is greater than zero. + * This also means that if low_size is zero, the else branch is taken. + * + * If low_size is greater than 0, (low_size > new_size) indicates that + * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res + * needs to be shrunken. + */ + if (low_size > new_size) { + ret = __crash_shrink_memory(&crashk_res, 0); + if (ret) + goto unlock; + + ret = __crash_shrink_memory(&crashk_low_res, new_size); + } else { + ret = __crash_shrink_memory(&crashk_res, new_size - low_size); + } + + /* Swap crashk_res and crashk_low_res if needed */ + if (!crashk_res.end && crashk_low_res.end) { + crashk_res.start = crashk_low_res.start; + crashk_res.end = crashk_low_res.end; + release_resource(&crashk_low_res); + crashk_low_res.start = 0; + crashk_low_res.end = 0; + insert_resource(&iomem_resource, &crashk_res); + } + +unlock: + kexec_unlock(); + return ret; +} + +void crash_save_cpu(struct pt_regs *regs, int cpu) +{ + struct elf_prstatus prstatus; + u32 *buf; + + if ((cpu < 0) || (cpu >= nr_cpu_ids)) + return; + + /* Using ELF notes here is opportunistic. + * I need a well defined structure format + * for the data I pass, and I need tags + * on the data to indicate what information I have + * squirrelled away. ELF notes happen to provide + * all of that, so there is no need to invent something new. + */ + buf = (u32 *)per_cpu_ptr(crash_notes, cpu); + if (!buf) + return; + memset(&prstatus, 0, sizeof(prstatus)); + prstatus.common.pr_pid = current->pid; + elf_core_copy_regs(&prstatus.pr_reg, regs); + buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + final_note(buf); +} + + + static int __init crash_notes_memory_init(void) { /* Allocate memory for saving cpu registers. 
*/ @@ -220,6 +474,8 @@ static int __init crash_notes_memory_init(void) } subsys_initcall(crash_notes_memory_init); +#endif /*CONFIG_CRASH_DUMP*/ + #ifdef CONFIG_CRASH_HOTPLUG #undef pr_fmt #define pr_fmt(fmt) "crash hp: " fmt diff --git a/kernel/kexec.c b/kernel/kexec.c index 8f35a5a42af8..bab542fc1463 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -28,12 +28,14 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, struct kimage *image; bool kexec_on_panic = flags & KEXEC_ON_CRASH; +#ifdef CONFIG_CRASH_DUMP if (kexec_on_panic) { /* Verify we have a valid entry point */ if ((entry < phys_to_boot_phys(crashk_res.start)) || (entry > phys_to_boot_phys(crashk_res.end))) return -EADDRNOTAVAIL; } +#endif /* Allocate and initialize a controlling structure */ image = do_kimage_alloc_init(); @@ -44,11 +46,13 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, image->nr_segments = nr_segments; memcpy(image->segment, segments, nr_segments * sizeof(*segments)); +#ifdef CONFIG_CRASH_DUMP if (kexec_on_panic) { /* Enable special crash kernel control page alloc policy. */ image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; } +#endif ret = sanity_check_segment_list(image); if (ret) @@ -99,13 +103,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (!kexec_trylock()) return -EBUSY; +#ifdef CONFIG_CRASH_DUMP if (flags & KEXEC_ON_CRASH) { dest_image = &kexec_crash_image; if (kexec_crash_image) arch_kexec_unprotect_crashkres(); - } else { + } else +#endif dest_image = &kexec_image; - } if (nr_segments == 0) { /* Uninstall image */ @@ -162,8 +167,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, image = xchg(dest_image, image); out: +#ifdef CONFIG_CRASH_DUMP if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); +#endif kimage_free(image); out_unlock: diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d08fc7b5db97..ce3429e7972c 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -54,30 +54,6 @@ bool kexec_in_progress = false; bool kexec_file_dbg_print; -int kexec_should_crash(struct task_struct *p) -{ - /* - * If crash_kexec_post_notifiers is enabled, don't run - * crash_kexec() here yet, which must be run after panic - * notifiers in panic(). - */ - if (crash_kexec_post_notifiers) - return 0; - /* - * There are 4 panic() calls in make_task_dead() path, each of which - * corresponds to each of these 4 conditions. - */ - if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) - return 1; - return 0; -} - -int kexec_crash_loaded(void) -{ - return !!kexec_crash_image; -} -EXPORT_SYMBOL_GPL(kexec_crash_loaded); - /* * When kexec transitions to the new kernel there is a one-to-one * mapping between physical and virtual addresses. On processors @@ -209,6 +185,7 @@ int sanity_check_segment_list(struct kimage *image) if (total_pages > nr_pages / 2) return -EINVAL; +#ifdef CONFIG_CRASH_DUMP /* * Verify we have good destination addresses. 
Normally * the caller is responsible for making certain we don't @@ -231,6 +208,7 @@ int sanity_check_segment_list(struct kimage *image) return -EADDRNOTAVAIL; } } +#endif return 0; } @@ -403,6 +381,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, return pages; } +#ifdef CONFIG_CRASH_DUMP static struct page *kimage_alloc_crash_control_pages(struct kimage *image, unsigned int order) { @@ -468,6 +447,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, return pages; } +#endif struct page *kimage_alloc_control_pages(struct kimage *image, @@ -479,48 +459,16 @@ struct page *kimage_alloc_control_pages(struct kimage *image, case KEXEC_TYPE_DEFAULT: pages = kimage_alloc_normal_control_pages(image, order); break; +#ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: pages = kimage_alloc_crash_control_pages(image, order); break; +#endif } return pages; } -int kimage_crash_copy_vmcoreinfo(struct kimage *image) -{ - struct page *vmcoreinfo_page; - void *safecopy; - - if (image->type != KEXEC_TYPE_CRASH) - return 0; - - /* - * For kdump, allocate one vmcoreinfo safe copy from the - * crash memory. as we have arch_kexec_protect_crashkres() - * after kexec syscall, we naturally protect it from write - * (even read) access under kernel direct mapping. But on - * the other hand, we still need to operate it when crash - * happens to generate vmcoreinfo note, hereby we rely on - * vmap for this purpose. - */ - vmcoreinfo_page = kimage_alloc_control_pages(image, 0); - if (!vmcoreinfo_page) { - pr_warn("Could not allocate vmcoreinfo buffer\n"); - return -ENOMEM; - } - safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); - if (!safecopy) { - pr_warn("Could not vmap vmcoreinfo buffer\n"); - return -ENOMEM; - } - - image->vmcoreinfo_data_copy = safecopy; - crash_update_vmcoreinfo_safecopy(safecopy); - - return 0; -} - static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { if (*image->entry != 0) @@ -603,10 +551,12 @@ void kimage_free(struct kimage *image) if (!image) return; +#ifdef CONFIG_CRASH_DUMP if (image->vmcoreinfo_data_copy) { crash_update_vmcoreinfo_safecopy(NULL); vunmap(image->vmcoreinfo_data_copy); } +#endif kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { @@ -824,6 +774,7 @@ out: return result; } +#ifdef CONFIG_CRASH_DUMP static int kimage_load_crash_segment(struct kimage *image, struct kexec_segment *segment) { @@ -891,6 +842,7 @@ static int kimage_load_crash_segment(struct kimage *image, out: return result; } +#endif int kimage_load_segment(struct kimage *image, struct kexec_segment *segment) @@ -901,9 +853,11 @@ int kimage_load_segment(struct kimage *image, case KEXEC_TYPE_DEFAULT: result = kimage_load_normal_segment(image, segment); break; +#ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: result = kimage_load_crash_segment(image, segment); break; +#endif } return result; @@ -1027,186 +981,6 @@ bool kexec_load_permitted(int kexec_image_type) return true; } -/* - * No panic_cpu check version of crash_kexec(). This function is called - * only when panic_cpu holds the current CPU number; this is the only CPU - * which processes crash_kexec routines. - */ -void __noclone __crash_kexec(struct pt_regs *regs) -{ - /* Take the kexec_lock here to prevent sys_kexec_load - * running on one cpu from replacing the crash kernel - * we are using after a panic on a different cpu. - * - * If the crash kernel was not located in a fixed area - * of memory the xchg(&kexec_crash_image) would be - * sufficient. 
But since I reuse the memory... - */ - if (kexec_trylock()) { - if (kexec_crash_image) { - struct pt_regs fixed_regs; - - crash_setup_regs(&fixed_regs, regs); - crash_save_vmcoreinfo(); - machine_crash_shutdown(&fixed_regs); - machine_kexec(kexec_crash_image); - } - kexec_unlock(); - } -} -STACK_FRAME_NON_STANDARD(__crash_kexec); - -__bpf_kfunc void crash_kexec(struct pt_regs *regs) -{ - int old_cpu, this_cpu; - - /* - * Only one CPU is allowed to execute the crash_kexec() code as with - * panic(). Otherwise parallel calls of panic() and crash_kexec() - * may stop each other. To exclude them, we use panic_cpu here too. - */ - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { - /* This is the 1st CPU which comes here, so go ahead. */ - __crash_kexec(regs); - - /* - * Reset panic_cpu to allow another panic()/crash_kexec() - * call. - */ - atomic_set(&panic_cpu, PANIC_CPU_INVALID); - } -} - -static inline resource_size_t crash_resource_size(const struct resource *res) -{ - return !res->end ? 0 : resource_size(res); -} - -ssize_t crash_get_memory_size(void) -{ - ssize_t size = 0; - - if (!kexec_trylock()) - return -EBUSY; - - size += crash_resource_size(&crashk_res); - size += crash_resource_size(&crashk_low_res); - - kexec_unlock(); - return size; -} - -static int __crash_shrink_memory(struct resource *old_res, - unsigned long new_size) -{ - struct resource *ram_res; - - ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); - if (!ram_res) - return -ENOMEM; - - ram_res->start = old_res->start + new_size; - ram_res->end = old_res->end; - ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; - ram_res->name = "System RAM"; - - if (!new_size) { - release_resource(old_res); - old_res->start = 0; - old_res->end = 0; - } else { - crashk_res.end = ram_res->start - 1; - } - - crash_free_reserved_phys_range(ram_res->start, ram_res->end); - insert_resource(&iomem_resource, ram_res); - - return 0; -} - -int crash_shrink_memory(unsigned long new_size) -{ - int ret = 0; - unsigned long old_size, low_size; - - if (!kexec_trylock()) - return -EBUSY; - - if (kexec_crash_image) { - ret = -ENOENT; - goto unlock; - } - - low_size = crash_resource_size(&crashk_low_res); - old_size = crash_resource_size(&crashk_res) + low_size; - new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN); - if (new_size >= old_size) { - ret = (new_size == old_size) ? 0 : -EINVAL; - goto unlock; - } - - /* - * (low_size > new_size) implies that low_size is greater than zero. - * This also means that if low_size is zero, the else branch is taken. - * - * If low_size is greater than 0, (low_size > new_size) indicates that - * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res - * needs to be shrunken. 
- */ - if (low_size > new_size) { - ret = __crash_shrink_memory(&crashk_res, 0); - if (ret) - goto unlock; - - ret = __crash_shrink_memory(&crashk_low_res, new_size); - } else { - ret = __crash_shrink_memory(&crashk_res, new_size - low_size); - } - - /* Swap crashk_res and crashk_low_res if needed */ - if (!crashk_res.end && crashk_low_res.end) { - crashk_res.start = crashk_low_res.start; - crashk_res.end = crashk_low_res.end; - release_resource(&crashk_low_res); - crashk_low_res.start = 0; - crashk_low_res.end = 0; - insert_resource(&iomem_resource, &crashk_res); - } - -unlock: - kexec_unlock(); - return ret; -} - -void crash_save_cpu(struct pt_regs *regs, int cpu) -{ - struct elf_prstatus prstatus; - u32 *buf; - - if ((cpu < 0) || (cpu >= nr_cpu_ids)) - return; - - /* Using ELF notes here is opportunistic. - * I need a well defined structure format - * for the data I pass, and I need tags - * on the data to indicate what information I have - * squirrelled away. ELF notes happen to provide - * all of that, so there is no need to invent something new. - */ - buf = (u32 *)per_cpu_ptr(crash_notes, cpu); - if (!buf) - return; - memset(&prstatus, 0, sizeof(prstatus)); - prstatus.common.pr_pid = current->pid; - elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - &prstatus, sizeof(prstatus)); - final_note(buf); -} - /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index bef2f6f2571b..ce7ce2ae27cd 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -285,11 +285,13 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG); image->file_mode = 1; +#ifdef CONFIG_CRASH_DUMP if (kexec_on_panic) { /* Enable special crash kernel control page alloc policy. 
*/ image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; } +#endif ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, cmdline_ptr, cmdline_len, flags); @@ -349,13 +351,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (!kexec_trylock()) return -EBUSY; +#ifdef CONFIG_CRASH_DUMP if (image_type == KEXEC_TYPE_CRASH) { dest_image = &kexec_crash_image; if (kexec_crash_image) arch_kexec_unprotect_crashkres(); - } else { + } else +#endif dest_image = &kexec_image; - } if (flags & KEXEC_FILE_UNLOAD) goto exchange; @@ -419,8 +422,10 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, exchange: image = xchg(dest_image, image); out: +#ifdef CONFIG_CRASH_DUMP if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); +#endif kexec_unlock(); kimage_free(image); @@ -595,12 +600,14 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, static int kexec_walk_resources(struct kexec_buf *kbuf, int (*func)(struct resource *, void *)) { +#ifdef CONFIG_CRASH_DUMP if (kbuf->image->type == KEXEC_TYPE_CRASH) return walk_iomem_res_desc(crashk_res.desc, IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, crashk_res.start, crashk_res.end, kbuf, func); - else if (kbuf->top_down) +#endif + if (kbuf->top_down) return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func); else return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 11526fc42bc2..fe7a517fc4ab 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -120,6 +120,7 @@ static ssize_t kexec_loaded_show(struct kobject *kobj, } KERNEL_ATTR_RO(kexec_loaded); +#ifdef CONFIG_CRASH_DUMP static ssize_t kexec_crash_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -152,6 +153,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, } KERNEL_ATTR_RW(kexec_crash_size); +#endif /* CONFIG_CRASH_DUMP*/ #endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_VMCORE_INFO @@ -262,9 +264,11 @@ static struct attribute * kernel_attrs[] = { #endif #ifdef CONFIG_KEXEC_CORE &kexec_loaded_attr.attr, +#ifdef CONFIG_CRASH_DUMP &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, #endif +#endif #ifdef CONFIG_VMCORE_INFO &vmcoreinfo_attr.attr, #ifdef CONFIG_CRASH_HOTPLUG
-- cgit v1.2.3
From bb92804ba2b6636e28db05f589a9a8ef62a07917 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sun, 25 Feb 2024 19:49:25 +0000 Subject: regmap: kunit: Add a test for ranges in combination with windows
In preparation for taking advantage of the SPI support for pre-cooked messages, add a test case covering the use of windows on a raw regmap. Unfortunately the parameterisation prevents direct reuse, and we will want to add some raw-specific coverage anyway; a hedged sketch of the kind of windowed configuration being exercised is shown below.
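This is a minimal sketch of the sort of windowed range the new test exercises, not code from the patch itself; the register numbers and the example_range name are invented for illustration, while the regmap_range_cfg fields are the real regmap API:

static const struct regmap_range_cfg example_range = {
	.name = "window",		/* diagnostic label only */
	.range_min = 0x40,		/* first register of the virtual range */
	.range_max = 0x7f,		/* last register of the virtual range */
	.selector_reg = 0x00,		/* page selector register */
	.selector_mask = 0x0f,		/* selector bits that pick the page */
	.selector_shift = 0,
	.window_start = 0x10,		/* base of the physical window */
	.window_len = 0x10,		/* registers visible per page */
};

With such a configuration, an access inside [range_min, range_max] is expected to first write the page number to selector_reg and then touch only the window registers; this is what the raw_ranges() test below asserts through the data->written[] and data->read[] bookkeeping.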
Signed-off-by: Mark Brown Link: https://msgid.link/r/20240225-regmap-test-format-v1-1-41e4fdfb1c1f@kernel.org Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-kunit.c | 66 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'drivers/base') diff --git a/drivers/base/regmap/regmap-kunit.c b/drivers/base/regmap/regmap-kunit.c index 026bdcb45127..4ef401aca7ea 100644 --- a/drivers/base/regmap/regmap-kunit.c +++ b/drivers/base/regmap/regmap-kunit.c @@ -1318,6 +1318,71 @@ static void raw_sync(struct kunit *test) regmap_exit(map); } +static void raw_ranges(struct kunit *test) +{ + struct raw_test_types *t = (struct raw_test_types *)test->param_value; + struct regmap *map; + struct regmap_config config; + struct regmap_ram_data *data; + unsigned int val; + int i; + + config = raw_regmap_config; + config.volatile_reg = test_range_all_volatile; + config.ranges = &test_range; + config.num_ranges = 1; + config.max_register = test_range.range_max; + + map = gen_raw_regmap(&config, t, &data); + KUNIT_ASSERT_FALSE(test, IS_ERR(map)); + if (IS_ERR(map)) + return; + + /* Reset the page to a non-zero value to trigger a change */ + KUNIT_EXPECT_EQ(test, 0, regmap_write(map, test_range.selector_reg, + test_range.range_max)); + + /* Check we set the page and use the window for writes */ + data->written[test_range.selector_reg] = false; + data->written[test_range.window_start] = false; + KUNIT_EXPECT_EQ(test, 0, regmap_write(map, test_range.range_min, 0)); + KUNIT_EXPECT_TRUE(test, data->written[test_range.selector_reg]); + KUNIT_EXPECT_TRUE(test, data->written[test_range.window_start]); + + data->written[test_range.selector_reg] = false; + data->written[test_range.window_start] = false; + KUNIT_EXPECT_EQ(test, 0, regmap_write(map, + test_range.range_min + + test_range.window_len, + 0)); + KUNIT_EXPECT_TRUE(test, data->written[test_range.selector_reg]); + KUNIT_EXPECT_TRUE(test, data->written[test_range.window_start]); + + /* Same for reads */ + data->written[test_range.selector_reg] = false; + data->read[test_range.window_start] = false; + KUNIT_EXPECT_EQ(test, 0, regmap_read(map, test_range.range_min, &val)); + KUNIT_EXPECT_TRUE(test, data->written[test_range.selector_reg]); + KUNIT_EXPECT_TRUE(test, data->read[test_range.window_start]); + + data->written[test_range.selector_reg] = false; + data->read[test_range.window_start] = false; + KUNIT_EXPECT_EQ(test, 0, regmap_read(map, + test_range.range_min + + test_range.window_len, + &val)); + KUNIT_EXPECT_TRUE(test, data->written[test_range.selector_reg]); + KUNIT_EXPECT_TRUE(test, data->read[test_range.window_start]); + + /* No physical access triggered in the virtual range */ + for (i = test_range.range_min; i < test_range.range_max; i++) { + KUNIT_EXPECT_FALSE(test, data->read[i]); + KUNIT_EXPECT_FALSE(test, data->written[i]); + } + + regmap_exit(map); +} + static struct kunit_case regmap_test_cases[] = { KUNIT_CASE_PARAM(basic_read_write, regcache_types_gen_params), KUNIT_CASE_PARAM(bulk_write, regcache_types_gen_params), @@ -1345,6 +1410,7 @@ static struct kunit_case regmap_test_cases[] = { KUNIT_CASE_PARAM(raw_write, raw_test_types_gen_params), KUNIT_CASE_PARAM(raw_noinc_write, raw_test_types_gen_params), KUNIT_CASE_PARAM(raw_sync, raw_test_cache_types_gen_params), + KUNIT_CASE_PARAM(raw_ranges, raw_test_cache_types_gen_params), {} }; -- cgit v1.2.3 From e7a7681c859643f3f2476b2a28a494877fd89442 Mon Sep 17 00:00:00 2001 From: Qingliang Li Date: Fri, 1 Mar 2024 17:26:57 +0800 Subject: PM: sleep: wakeirq: fix wake irq 
warning in system suspend
When a driver uses pm_runtime_force_suspend() as the system suspend callback function and registers the wake irq with reverse enable ordering, the wake irq will be re-enabled when entering system suspend, triggering an 'Unbalanced enable for IRQ xxx' warning. In this scenario, the call sequence during system suspend is as follows: suspend_devices_and_enter() -> dpm_suspend_start() -> dpm_run_callback() -> pm_runtime_force_suspend() -> dev_pm_enable_wake_irq_check() -> dev_pm_enable_wake_irq_complete() -> suspend_enter() -> dpm_suspend_noirq() -> device_wakeup_arm_wake_irqs() -> dev_pm_arm_wake_irq()
To fix this issue, complete the setting of the WAKE_IRQ_DEDICATED_ENABLED flag in dev_pm_enable_wake_irq_complete() to avoid the redundant irq enablement.
Fixes: 8527beb12087 ("PM: sleep: wakeirq: fix wake irq arming") Reviewed-by: Dhruva Gole Signed-off-by: Qingliang Li Reviewed-by: Johan Hovold Cc: 5.16+ # 5.16+ Signed-off-by: Rafael J. Wysocki
--- drivers/base/power/wakeirq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/base')
diff --git a/drivers/base/power/wakeirq.c b/drivers/base/power/wakeirq.c index 42171f766dcb..5a5a9e978e85 100644 --- a/drivers/base/power/wakeirq.c +++ b/drivers/base/power/wakeirq.c @@ -313,8 +313,10 @@ void dev_pm_enable_wake_irq_complete(struct device *dev) return; if (wirq->status & WAKE_IRQ_DEDICATED_MANAGED && - wirq->status & WAKE_IRQ_DEDICATED_REVERSE) + wirq->status & WAKE_IRQ_DEDICATED_REVERSE) { enable_irq(wirq->irq); + wirq->status |= WAKE_IRQ_DEDICATED_ENABLED; + } } /**
-- cgit v1.2.3
From f8c7511db009d42e2c24e48eeb04e3f1b67ab209 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 5 Mar 2024 16:32:16 -0300 Subject: block: make block_class constant
Since commit 43a7206b0963 ("driver core: class: make class_register() take a const *"), the driver core allows for struct class to be in read-only memory, so move the block_class structure to be declared at build time, placing it into read-only memory, instead of having to be dynamically allocated at boot time.
Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. Marliere
Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240305-class_cleanup-block-v1-1-130bb27b9c72@marliere.net Signed-off-by: Jens Axboe
--- block/genhd.c | 2 +- drivers/base/base.h | 2 +- include/linux/blkdev.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/base')
diff --git a/block/genhd.c b/block/genhd.c index 84c822d989da..a214f9cf3a35 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1201,7 +1201,7 @@ static int block_uevent(const struct device *dev, struct kobj_uevent_env *env) return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq); } -struct class block_class = { +const struct class block_class = { .name = "block", .dev_uevent = block_uevent, };
diff --git a/drivers/base/base.h b/drivers/base/base.h index eb4c0ace9242..0738ccad08b2 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -207,7 +207,7 @@ static inline int devtmpfs_init(void) { return 0; } #endif #ifdef CONFIG_BLOCK -extern struct class block_class; +extern const struct class block_class; static inline bool is_blockdev(struct device *dev) { return dev->class == &block_class; }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 285e82723d64..19c7596f4ebf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -42,7 +42,7 @@ struct blk_crypto_profile; extern const struct device_type disk_type; extern const struct device_type part_type; -extern struct class block_class; +extern const struct class block_class; /* * Maximum number of blkcg policies allowed to be registered concurrently.
-- cgit v1.2.3
From 822d66c45e793240a9888463127059558bbe9c0d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 20 Jan 2024 07:58:46 +0100 Subject: platform-msi: Remove usage of the deprecated ida_simple_xx() API
ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). Note that the upper limit of ida_simple_get() is exclusive, while that of ida_alloc_max() is inclusive. So a -1 has been added when needed.
Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/fd87836efa894aee0ae43e767369c85a2ee7e1ff.1705733916.git.christophe.jaillet@wanadoo.fr Signed-off-by: Greg Kroah-Hartman
--- drivers/base/platform-msi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/base')
diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c index f37ad34c80ec..ca48d1f60865 100644 --- a/drivers/base/platform-msi.c +++ b/drivers/base/platform-msi.c @@ -172,8 +172,8 @@ static int platform_msi_alloc_priv_data(struct device *dev, unsigned int nvec, if (!datap) return -ENOMEM; - datap->devid = ida_simple_get(&platform_msi_devid_ida, - 0, 1 << DEV_ID_SHIFT, GFP_KERNEL); + datap->devid = ida_alloc_max(&platform_msi_devid_ida, + (1 << DEV_ID_SHIFT) - 1, GFP_KERNEL); if (datap->devid < 0) { err = datap->devid; kfree(datap); @@ -191,7 +191,7 @@ static void platform_msi_free_priv_data(struct device *dev) struct platform_msi_priv_data *data = dev->msi.data->platform_data; dev->msi.data->platform_data = NULL; - ida_simple_remove(&platform_msi_devid_ida, data->devid); + ida_free(&platform_msi_devid_ida, data->devid); kfree(data); }
-- cgit v1.2.3
From 1fe6e4f0b0c47e70735066e889f97c3c6e1e79b2 Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Mon, 19 Feb 2024 22:09:54 +0530 Subject: firmware_loader: Suppress warning on FW_OPT_NO_WARN flag
Some of the warnings are still printed even if FW_OPT_NO_WARN is passed, for some of the functions, e.g. firmware_request_nowarn(); a hedged sketch of such a caller is shown below.
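A minimal sketch, assuming a driver for which the blob is optional; the firmware path and the surrounding logic are invented for illustration, while firmware_request_nowarn() and release_firmware() are the real APIs involved:

	const struct firmware *fw;
	int ret;

	/* Optional firmware: the caller asked the core not to warn. */
	ret = firmware_request_nowarn(&fw, "example/optional-cal.bin", dev);
	if (ret)
		return 0;	/* absence is fine here, and should be silent */

	/* ... consume fw->data and fw->size ... */
	release_firmware(fw);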
Fix it by adding a check for FW_OPT_NO_WARN before printing the warning.
Signed-off-by: Mukesh Ojha Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20240219163954.7719-1-quic_mojha@quicinc.com Signed-off-by: Greg Kroah-Hartman
--- drivers/base/firmware_loader/main.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'drivers/base')
diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index ea28102d421e..da8ca01d011c 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -551,12 +551,16 @@ fw_get_filesystem_firmware(struct device *device, struct fw_priv *fw_priv, file_size_ptr, READING_FIRMWARE); if (rc < 0) { - if (rc != -ENOENT) - dev_warn(device, "loading %s failed with error %d\n", - path, rc); - else - dev_dbg(device, "loading %s failed for no such file or directory.\n", - path); + if (!(fw_priv->opt_flags & FW_OPT_NO_WARN)) { + if (rc != -ENOENT) + dev_warn(device, + "loading %s failed with error %d\n", + path, rc); + else + dev_dbg(device, + "loading %s failed for no such file or directory.\n", + path); + } continue; } size = rc;
-- cgit v1.2.3
From 1c4002aeab3c81afa8a00ae76b1ea38d066e9978 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Mar 2024 20:00:06 +0200 Subject: driver core: Move fw_devlink stuff to where it belongs
A few APIs, i.e. fwnode_is_ancestor_of(), fwnode_get_next_parent_dev(), and get_dev_from_fwnode(), belong specifically to the fw_devlink code and could be static, but they are not. Resolve this by moving them to drivers/base/core.c, where all of their users reside, and making them static. No functional changes intended.
Reviewed-by: Sakari Ailus Acked-by: "Rafael J. Wysocki" Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240301180138.271590-3-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman
--- drivers/base/core.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/base/property.c | 56 ---------------------------------------------- include/linux/fwnode.h | 1 - include/linux/property.h | 2 -- 4 files changed, 58 insertions(+), 59 deletions(-) (limited to 'drivers/base')
diff --git a/drivers/base/core.c b/drivers/base/core.c index 9828da9b933c..35ccd8bb2c9b 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1871,6 +1871,7 @@ static void fw_devlink_unblock_consumers(struct device *dev) device_links_write_unlock(); } +#define get_dev_from_fwnode(fwnode) get_device((fwnode)->dev) static bool fwnode_init_without_drv(struct fwnode_handle *fwnode) { @@ -1901,6 +1902,63 @@ static bool fwnode_ancestor_init_without_drv(struct fwnode_handle *fwnode) return false; } +/** + * fwnode_is_ancestor_of - Test if @ancestor is ancestor of @child + * @ancestor: Firmware which is tested for being an ancestor + * @child: Firmware which is tested for being the child + * + * A node is considered an ancestor of itself too. + * + * Return: true if @ancestor is an ancestor of @child. Otherwise, returns false.
+ */ +static bool fwnode_is_ancestor_of(const struct fwnode_handle *ancestor, + const struct fwnode_handle *child) +{ + struct fwnode_handle *parent; + + if (IS_ERR_OR_NULL(ancestor)) + return false; + + if (child == ancestor) + return true; + + fwnode_for_each_parent_node(child, parent) { + if (parent == ancestor) { + fwnode_handle_put(parent); + return true; + } + } + return false; +} + +/** + * fwnode_get_next_parent_dev - Find device of closest ancestor fwnode + * @fwnode: firmware node + * + * Given a firmware node (@fwnode), this function finds its closest ancestor + * firmware node that has a corresponding struct device and returns that struct + * device. + * + * The caller is responsible for calling put_device() on the returned device + * pointer. + * + * Return: a pointer to the device of the @fwnode's closest ancestor. + */ +static struct device *fwnode_get_next_parent_dev(const struct fwnode_handle *fwnode) +{ + struct fwnode_handle *parent; + struct device *dev; + + fwnode_for_each_parent_node(fwnode, parent) { + dev = get_dev_from_fwnode(parent); + if (dev) { + fwnode_handle_put(parent); + return dev; + } + } + return NULL; +} + /** * __fw_devlink_relax_cycles - Relax and mark dependency cycles. * @con: Potential consumer device. diff --git a/drivers/base/property.c b/drivers/base/property.c index a1b01ab42052..afa1bf2b3c5a 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -699,34 +699,6 @@ struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode) } EXPORT_SYMBOL_GPL(fwnode_get_next_parent); -/** - * fwnode_get_next_parent_dev - Find device of closest ancestor fwnode - * @fwnode: firmware node - * - * Given a firmware node (@fwnode), this function finds its closest ancestor - * firmware node that has a corresponding struct device and returns that struct - * device. - * - * The caller is responsible for calling put_device() on the returned device - * pointer. - * - * Return: a pointer to the device of the @fwnode's closest ancestor. - */ -struct device *fwnode_get_next_parent_dev(const struct fwnode_handle *fwnode) -{ - struct fwnode_handle *parent; - struct device *dev; - - fwnode_for_each_parent_node(fwnode, parent) { - dev = get_dev_from_fwnode(parent); - if (dev) { - fwnode_handle_put(parent); - return dev; - } - } - return NULL; -} - /** * fwnode_count_parents - Return the number of parents a node has * @fwnode: The node the parents of which are to be counted @@ -773,34 +745,6 @@ struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwnode, } EXPORT_SYMBOL_GPL(fwnode_get_nth_parent); -/** - * fwnode_is_ancestor_of - Test if @ancestor is ancestor of @child - * @ancestor: Firmware which is tested for being an ancestor - * @child: Firmware which is tested for being the child - * - * A node is considered an ancestor of itself too. - * - * Return: true if @ancestor is an ancestor of @child. Otherwise, returns false. - */ -bool fwnode_is_ancestor_of(const struct fwnode_handle *ancestor, const struct fwnode_handle *child) -{ - struct fwnode_handle *parent; - - if (IS_ERR_OR_NULL(ancestor)) - return false; - - if (child == ancestor) - return true; - - fwnode_for_each_parent_node(child, parent) { - if (parent == ancestor) { - fwnode_handle_put(parent); - return true; - } - } - return false; -} - /** * fwnode_get_next_child_node - Return the next child node handle for a node * @fwnode: Firmware node to find the next child node for. 
diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 2d23a14857c7..416cbe72f0c7 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -187,7 +187,6 @@ struct fwnode_operations { if (fwnode_has_op(fwnode, op)) \ (fwnode)->ops->op(fwnode, ## __VA_ARGS__); \ } while (false) -#define get_dev_from_fwnode(fwnode) get_device((fwnode)->dev) static inline void fwnode_init(struct fwnode_handle *fwnode, const struct fwnode_operations *ops) diff --git a/include/linux/property.h b/include/linux/property.h index e6516d0b7d52..284ff79ebf03 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -156,11 +156,9 @@ struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode); for (parent = fwnode_get_parent(fwnode); parent; \ parent = fwnode_get_next_parent(parent)) -struct device *fwnode_get_next_parent_dev(const struct fwnode_handle *fwnode); unsigned int fwnode_count_parents(const struct fwnode_handle *fwn); struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwn, unsigned int depth); -bool fwnode_is_ancestor_of(const struct fwnode_handle *ancestor, const struct fwnode_handle *child); struct fwnode_handle *fwnode_get_next_child_node( const struct fwnode_handle *fwnode, struct fwnode_handle *child); struct fwnode_handle *fwnode_get_next_available_child_node( -- cgit v1.2.3 From 4dc3d612ee5c3be2a4d1a73ab31bcfaaa850aa19 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Mar 2024 20:00:08 +0200 Subject: device property: Don't use "proxy" headers Update header inclusions to follow IWYU (Include What You Use) principle. Reviewed-by: Sakari Ailus Acked-by: "Rafael J. Wysocki" Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240301180138.271590-5-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/property.c | 11 ++++++----- drivers/base/swnode.c | 13 ++++++++++++- include/linux/fwnode.h | 4 ++-- include/linux/property.h | 1 + 4 files changed, 21 insertions(+), 8 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/property.c b/drivers/base/property.c index afa1bf2b3c5a..7324a704a9a1 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -7,15 +7,16 @@ * Mika Westerberg */ -#include +#include +#include #include -#include +#include #include -#include -#include -#include #include #include +#include +#include +#include struct fwnode_handle *__dev_fwnode(struct device *dev) { diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index 36512fb75a20..eb6eb25b343b 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -6,10 +6,21 @@ * Author: Heikki Krogerus */ +#include #include -#include +#include +#include +#include +#include +#include +#include +#include #include #include +#include +#include +#include +#include #include "base.h" diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 4228c45d5ccc..80f3cd91b471 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -9,10 +9,10 @@ #ifndef _LINUX_FWNODE_H_ #define _LINUX_FWNODE_H_ -#include -#include #include #include +#include +#include enum dev_dma_attr { DEV_DMA_NOT_SUPPORTED, diff --git a/include/linux/property.h b/include/linux/property.h index 1f0135e24d00..3a1045eb786c 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -11,6 +11,7 @@ #define _LINUX_PROPERTY_H_ #include +#include #include #include #include -- cgit v1.2.3 From 75cde56a5b504d07a64ce0e3f8c7410df70308a3 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Mon, 4 Mar 2024 21:04:54 
-0800 Subject: driver core: Adds flags param to fwnode_link_add()
Allow the callers to set fwnode link flags when adding fwnode links.
Signed-off-by: Saravana Kannan Acked-by: "Rafael J. Wysocki" Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20240305050458.1400667-2-saravanak@google.com Signed-off-by: Greg Kroah-Hartman
--- drivers/base/core.c | 5 +++-- drivers/firmware/efi/sysfb_efi.c | 2 +- drivers/of/property.c | 2 +- include/linux/fwnode.h | 3 ++- 4 files changed, 7 insertions(+), 5 deletions(-) (limited to 'drivers/base')
diff --git a/drivers/base/core.c b/drivers/base/core.c index 35ccd8bb2c9b..83a6f429bddb 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -92,12 +92,13 @@ static int __fwnode_link_add(struct fwnode_handle *con, return 0; } -int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup) +int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup, + u8 flags) { int ret; mutex_lock(&fwnode_link_lock); - ret = __fwnode_link_add(con, sup, 0); + ret = __fwnode_link_add(con, sup, flags); mutex_unlock(&fwnode_link_lock); return ret; }
diff --git a/drivers/firmware/efi/sysfb_efi.c b/drivers/firmware/efi/sysfb_efi.c index 456d0e5eaf78..cc807ed35aed 100644 --- a/drivers/firmware/efi/sysfb_efi.c +++ b/drivers/firmware/efi/sysfb_efi.c @@ -336,7 +336,7 @@ static int efifb_add_links(struct fwnode_handle *fwnode) if (!sup_np) return 0; - fwnode_link_add(fwnode, of_fwnode_handle(sup_np)); + fwnode_link_add(fwnode, of_fwnode_handle(sup_np), 0); of_node_put(sup_np); return 0;
diff --git a/drivers/of/property.c b/drivers/of/property.c index b71267c6667c..bce849f21ae2 100644 --- a/drivers/of/property.c +++ b/drivers/of/property.c @@ -1085,7 +1085,7 @@ static void of_link_to_phandle(struct device_node *con_np, tmp_np = of_get_next_parent(tmp_np); } - fwnode_link_add(of_fwnode_handle(con_np), of_fwnode_handle(sup_np)); + fwnode_link_add(of_fwnode_handle(con_np), of_fwnode_handle(sup_np), 0); } /**
diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 80f3cd91b471..70d9c40269b9 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -214,7 +214,8 @@ static inline void fwnode_dev_initialized(struct fwnode_handle *fwnode, fwnode->flags &= ~FWNODE_FLAG_INITIALIZED; } -int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup); +int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup, + u8 flags); void fwnode_links_purge(struct fwnode_handle *fwnode); void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode); bool fw_devlink_is_strict(void);
-- cgit v1.2.3
From b7e1241d8f77ed64404a5e4450f43a319310fc91 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Mon, 4 Mar 2024 21:04:55 -0800 Subject: driver core: Add FWLINK_FLAG_IGNORE to completely ignore a fwnode link
A fwnode link between specific supplier-consumer fwnodes can be added multiple times for multiple reasons. If that dependency doesn't exist, deleting the fwnode link once doesn't guarantee that it won't get created again. So, add an FWLINK_FLAG_IGNORE flag to mark a fwnode link as one that needs to be completely ignored. Since a fwnode link's flags field is the OR of all the flags passed to all the fwnode_link_add() calls that create that specific fwnode link, the FWLINK_FLAG_IGNORE flag is preserved and can be used to mark a fwnode link as one that needs to be completely ignored until it is deleted.
Signed-off-by: Saravana Kannan Acked-by: "Rafael J. 
Wysocki" Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20240305050458.1400667-3-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 9 ++++++++- include/linux/fwnode.h | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'drivers/base') diff --git a/drivers/base/core.c b/drivers/base/core.c index 83a6f429bddb..b93f3c5716ae 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1012,7 +1012,8 @@ static struct fwnode_handle *fwnode_links_check_suppliers( return NULL; list_for_each_entry(link, &fwnode->suppliers, c_hook) - if (!(link->flags & FWLINK_FLAG_CYCLE)) + if (!(link->flags & + (FWLINK_FLAG_CYCLE | FWLINK_FLAG_IGNORE))) return link->supplier; return NULL; @@ -2021,6 +2022,9 @@ static bool __fw_devlink_relax_cycles(struct device *con, } list_for_each_entry(link, &sup_handle->suppliers, c_hook) { + if (link->flags & FWLINK_FLAG_IGNORE) + continue; + if (__fw_devlink_relax_cycles(con, link->supplier)) { __fwnode_link_cycle(link); ret = true; @@ -2099,6 +2103,9 @@ static int fw_devlink_create_devlink(struct device *con, int ret = 0; u32 flags; + if (link->flags & FWLINK_FLAG_IGNORE) + return 0; + if (con->fwnode == link->consumer) flags = fw_devlink_get_flags(link->flags); else diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 70d9c40269b9..0d79070c5a70 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -59,8 +59,10 @@ struct fwnode_handle { * fwnode link flags * * CYCLE: The fwnode link is part of a cycle. Don't defer probe. + * IGNORE: Completely ignore this link, even during cycle detection. */ #define FWLINK_FLAG_CYCLE BIT(0) +#define FWLINK_FLAG_IGNORE BIT(1) struct fwnode_link { struct fwnode_handle *supplier; -- cgit v1.2.3 From 32de4b4f9dfa67917d2cc824a195498513ec8e8d Mon Sep 17 00:00:00 2001 From: "Nícolas F. R. A. Prado" Date: Tue, 5 Mar 2024 17:21:36 -0500 Subject: driver: core: Log probe failure as error and with device metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drivers can return -ENODEV or -ENXIO from their probe to reject a device match, and return -EPROBE_DEFER if probe should be retried. Any other error code is not expected during normal behavior and indicates an issue occurred, so it should be logged at the error level. Also make use of the device variant, dev_err(), so that the device metadata is attached to the log message. Signed-off-by: "Nícolas F. R. A. Prado" Link: https://lore.kernel.org/r/20240305-device-probe-error-v1-1-a06d8722bf19@collabora.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 85152537dbf1..0b7cf4516796 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -592,8 +592,8 @@ static int call_driver_probe(struct device *dev, struct device_driver *drv) break; default: /* driver matched but the probe failed */ - pr_warn("%s: probe of %s failed with error %d\n", - drv->name, dev_name(dev), ret); + dev_err(dev, "probe with driver %s failed with error %d\n", + drv->name, ret); break; } -- cgit v1.2.3 From 448af2d28899a2b4b1b07944b4910dfd5841bf55 Mon Sep 17 00:00:00 2001 From: "Nícolas F. R. A. 
Prado" Date: Tue, 5 Mar 2024 17:21:37 -0500 Subject: driver: core: Use dev_* instead of pr_* so device metadata is added MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the dev_* instead of the pr_* functions to log the status of device probe so that the log message gets the device metadata attached to it. Signed-off-by: "Nícolas F. R. A. Prado" Link: https://lore.kernel.org/r/20240305-device-probe-error-v1-2-a06d8722bf19@collabora.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 0b7cf4516796..d6e7933e2521 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -397,13 +397,12 @@ bool device_is_bound(struct device *dev) static void driver_bound(struct device *dev) { if (device_is_bound(dev)) { - pr_warn("%s: device %s already bound\n", - __func__, kobject_name(&dev->kobj)); + dev_warn(dev, "%s: device already bound\n", __func__); return; } - pr_debug("driver: '%s': %s: bound to device '%s'\n", dev->driver->name, - __func__, dev_name(dev)); + dev_dbg(dev, "driver: '%s': %s: bound to device\n", dev->driver->name, + __func__); klist_add_tail(&dev->p->knode_driver, &dev->driver->p->klist_devices); device_links_driver_bound(dev); @@ -587,8 +586,8 @@ static int call_driver_probe(struct device *dev, struct device_driver *drv) break; case -ENODEV: case -ENXIO: - pr_debug("%s: probe of %s rejects match %d\n", - drv->name, dev_name(dev), ret); + dev_dbg(dev, "probe with driver %s rejects match %d\n", + drv->name, ret); break; default: /* driver matched but the probe failed */ @@ -620,8 +619,8 @@ static int really_probe(struct device *dev, struct device_driver *drv) if (link_ret == -EPROBE_DEFER) return link_ret; - pr_debug("bus: '%s': %s: probing driver %s with device %s\n", - drv->bus->name, __func__, drv->name, dev_name(dev)); + dev_dbg(dev, "bus: '%s': %s: probing driver %s with device\n", + drv->bus->name, __func__, drv->name); if (!list_empty(&dev->devres_head)) { dev_crit(dev, "Resources present before probing\n"); ret = -EBUSY; @@ -644,8 +643,7 @@ re_probe: ret = driver_sysfs_add(dev); if (ret) { - pr_err("%s: driver_sysfs_add(%s) failed\n", - __func__, dev_name(dev)); + dev_err(dev, "%s: driver_sysfs_add failed\n", __func__); goto sysfs_failed; } @@ -706,8 +704,8 @@ re_probe: dev->pm_domain->sync(dev); driver_bound(dev); - pr_debug("bus: '%s': %s: bound device %s to driver %s\n", - drv->bus->name, __func__, dev_name(dev), drv->name); + dev_dbg(dev, "bus: '%s': %s: bound device to driver %s\n", + drv->bus->name, __func__, drv->name); goto done; dev_sysfs_state_synced_failed: @@ -786,8 +784,8 @@ static int __driver_probe_device(struct device_driver *drv, struct device *dev) return -EBUSY; dev->can_match = true; - pr_debug("bus: '%s': %s: matched device %s with driver %s\n", - drv->bus->name, __func__, dev_name(dev), drv->name); + dev_dbg(dev, "bus: '%s': %s: matched device with driver %s\n", + drv->bus->name, __func__, drv->name); pm_runtime_get_suppliers(dev); if (dev->parent) -- cgit v1.2.3 From 6aeb8850e0f39869d43768603a75c0431562a429 Mon Sep 17 00:00:00 2001 From: "Nícolas F. R. A. 
Prado" Date: Tue, 5 Mar 2024 17:21:38 -0500 Subject: device: core: Log warning for devices pending deferred probe on timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Once the deferred probe timeout has elapsed it is very likely that the devices that are still deferring probe won't ever be probed. Therefore log the defer probe pending reason at the warning level instead to bring attention to the issue. Signed-off-by: "Nícolas F. R. A. Prado" Link: https://lore.kernel.org/r/20240305-device-probe-error-v1-3-a06d8722bf19@collabora.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/base') diff --git a/drivers/base/dd.c b/drivers/base/dd.c index d6e7933e2521..83d352394fdf 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -313,7 +313,7 @@ static void deferred_probe_timeout_work_func(struct work_struct *work) mutex_lock(&deferred_probe_mutex); list_for_each_entry(p, &deferred_probe_pending_list, deferred_probe) - dev_info(p->device, "deferred probe pending: %s", p->deferred_probe_reason ?: "(reason unknown)\n"); + dev_warn(p->device, "deferred probe pending: %s", p->deferred_probe_reason ?: "(reason unknown)\n"); mutex_unlock(&deferred_probe_mutex); fw_devlink_probing_done(); -- cgit v1.2.3 From 8076fcde016c9c0e0660543e67bff86cb48a7c9c Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 11 Mar 2024 12:29:43 -0700 Subject: x86/rfds: Mitigate Register File Data Sampling (RFDS) RFDS is a CPU vulnerability that may allow userspace to infer kernel stale data previously used in floating point registers, vector registers and integer registers. RFDS only affects certain Intel Atom processors. Intel released a microcode update that uses VERW instruction to clear the affected CPU buffers. Unlike MDS, none of the affected cores support SMT. Add RFDS bug infrastructure and enable the VERW based mitigation by default, that clears the affected buffers just before exiting to userspace. Also add sysfs reporting and cmdline parameter "reg_file_data_sampling" to control the mitigation. 
For details see: Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Acked-by: Josh Poimboeuf --- Documentation/ABI/testing/sysfs-devices-system-cpu | 1 + Documentation/admin-guide/kernel-parameters.txt | 21 ++++++ arch/x86/Kconfig | 11 +++ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 8 +++ arch/x86/kernel/cpu/bugs.c | 78 +++++++++++++++++++++- arch/x86/kernel/cpu/common.c | 38 ++++++++++- drivers/base/cpu.c | 3 + include/linux/cpu.h | 2 + 9 files changed, 157 insertions(+), 6 deletions(-) (limited to 'drivers/base') diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index a1db6db47505..710d47be11e0 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -516,6 +516,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/mds /sys/devices/system/cpu/vulnerabilities/meltdown /sys/devices/system/cpu/vulnerabilities/mmio_stale_data + /sys/devices/system/cpu/vulnerabilities/reg_file_data_sampling /sys/devices/system/cpu/vulnerabilities/retbleed /sys/devices/system/cpu/vulnerabilities/spec_store_bypass /sys/devices/system/cpu/vulnerabilities/spectre_v1 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 31b3a25680d0..73062d47a462 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1150,6 +1150,26 @@ The filter can be disabled or changed to another driver later using sysfs. + reg_file_data_sampling= + [X86] Controls mitigation for Register File Data + Sampling (RFDS) vulnerability. RFDS is a CPU + vulnerability which may allow userspace to infer + kernel data values previously stored in floating point + registers, vector registers, or integer registers. + RFDS only affects Intel Atom processors. + + on: Turns ON the mitigation. + off: Turns OFF the mitigation. + + This parameter overrides the compile time default set + by CONFIG_MITIGATION_RFDS. Mitigation cannot be + disabled when other VERW based mitigations (like MDS) + are enabled. In order to disable RFDS mitigation all + VERW based mitigations need to be disabled. + + For details see: + Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst + driver_async_probe= [KNL] List of driver names to be probed asynchronously. * matches with all driver names. If * is specified, the @@ -3398,6 +3418,7 @@ nospectre_bhb [ARM64] nospectre_v1 [X86,PPC] nospectre_v2 [X86,PPC,S390,ARM64] + reg_file_data_sampling=off [X86] retbleed=off [X86] spec_store_bypass_disable=off [X86,PPC] spectre_v2_user=off [X86] diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5edec175b9bf..637e337c332e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2614,6 +2614,17 @@ config GDS_FORCE_MITIGATION If in doubt, say N. +config MITIGATION_RFDS + bool "RFDS Mitigation" + depends on CPU_SUP_INTEL + default y + help + Enable mitigation for Register File Data Sampling (RFDS) by default. + RFDS is a hardware vulnerability which affects Intel Atom CPUs. It + allows unprivileged speculative access to stale data previously + stored in floating point, vector and integer registers. 
+ See also Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst. + endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2b62cdd8dd12..8511aad59581 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -503,4 +503,5 @@ /* BUG word 2 */ #define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ #define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */ +#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f1bd7b91b3c6..d1b5edaf6c34 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -165,6 +165,14 @@ * CPU is not vulnerable to Gather * Data Sampling (GDS). */ +#define ARCH_CAP_RFDS_NO BIT(27) /* + * Not susceptible to Register + * File Data Sampling. + */ +#define ARCH_CAP_RFDS_CLEAR BIT(28) /* + * VERW clears CPU Register + * File. + */ #define ARCH_CAP_XAPIC_DISABLE BIT(21) /* * IA32_XAPIC_DISABLE_STATUS MSR diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index cd6ac89c1a0d..01ac18f56147 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -480,6 +480,57 @@ static int __init mmio_stale_data_parse_cmdline(char *str) } early_param("mmio_stale_data", mmio_stale_data_parse_cmdline); +#undef pr_fmt +#define pr_fmt(fmt) "Register File Data Sampling: " fmt + +enum rfds_mitigations { + RFDS_MITIGATION_OFF, + RFDS_MITIGATION_VERW, + RFDS_MITIGATION_UCODE_NEEDED, +}; + +/* Default mitigation for Register File Data Sampling */ +static enum rfds_mitigations rfds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_VERW : RFDS_MITIGATION_OFF; + +static const char * const rfds_strings[] = { + [RFDS_MITIGATION_OFF] = "Vulnerable", + [RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File", + [RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", +}; + +static void __init rfds_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) { + rfds_mitigation = RFDS_MITIGATION_OFF; + return; + } + if (rfds_mitigation == RFDS_MITIGATION_OFF) + return; + + if (x86_read_arch_cap_msr() & ARCH_CAP_RFDS_CLEAR) + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + else + rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; +} + +static __init int rfds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_RFDS)) + return 0; + + if (!strcmp(str, "off")) + rfds_mitigation = RFDS_MITIGATION_OFF; + else if (!strcmp(str, "on")) + rfds_mitigation = RFDS_MITIGATION_VERW; + + return 0; +} +early_param("reg_file_data_sampling", rfds_parse_cmdline); + #undef pr_fmt +#define pr_fmt(fmt) "" fmt @@ -513,6 +564,11 @@ static void __init md_clear_update_mitigation(void) mmio_mitigation = MMIO_MITIGATION_VERW; mmio_select_mitigation(); } + if (rfds_mitigation == RFDS_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_RFDS)) { + rfds_mitigation = RFDS_MITIGATION_VERW; + rfds_select_mitigation(); + } out: if (boot_cpu_has_bug(X86_BUG_MDS)) pr_info("MDS: %s\n", mds_strings[mds_mitigation]); @@ -522,6 +578,8 @@ out: pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) pr_info("MMIO Stale Data: Unknown: No mitigations\n"); + if (boot_cpu_has_bug(X86_BUG_RFDS)) + pr_info("Register File Data Sampling: %s\n", rfds_strings[rfds_mitigation]); } static void __init
md_clear_select_mitigation(void) @@ -529,11 +587,12 @@ static void __init md_clear_select_mitigation(void) mds_select_mitigation(); taa_select_mitigation(); mmio_select_mitigation(); + rfds_select_mitigation(); /* - * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update - * and print their mitigation after MDS, TAA and MMIO Stale Data - * mitigation selection is done. + * As these mitigations are inter-related and rely on VERW instruction + * to clear the microarchitural buffers, update and print their status + * after mitigation selection is done for each of these vulnerabilities. */ md_clear_update_mitigation(); } @@ -2622,6 +2681,11 @@ static ssize_t mmio_stale_data_show_state(char *buf) sched_smt_active() ? "vulnerable" : "disabled"); } +static ssize_t rfds_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]); +} + static char *stibp_state(void) { if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && @@ -2781,6 +2845,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_GDS: return gds_show_state(buf); + case X86_BUG_RFDS: + return rfds_show_state(buf); + default: break; } @@ -2855,4 +2922,9 @@ ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *bu { return cpu_show_common(dev, attr, buf, X86_BUG_GDS); } + +ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_RFDS); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index fbc4e60d027c..40d8c110bb32 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1267,6 +1267,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define SRSO BIT(5) /* CPU is affected by GDS */ #define GDS BIT(6) +/* CPU is affected by Register File Data Sampling */ +#define RFDS BIT(7) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), @@ -1294,9 +1296,18 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS), VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPPINGS(ALDERLAKE, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(ALDERLAKE_L, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(RAPTORLAKE, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(RAPTORLAKE_P, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(RAPTORLAKE_S, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS), + VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS), VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), @@ -1330,6 +1341,24 @@ static bool arch_cap_mmio_immune(u64 ia32_cap) ia32_cap 
& ARCH_CAP_SBDR_SSDP_NO); } +static bool __init vulnerable_to_rfds(u64 ia32_cap) +{ + /* The "immunity" bit trumps everything else: */ + if (ia32_cap & ARCH_CAP_RFDS_NO) + return false; + + /* + * VMMs set ARCH_CAP_RFDS_CLEAR for processors not in the blacklist to + * indicate that mitigation is needed because guest is running on a + * vulnerable hardware or may migrate to such hardware: + */ + if (ia32_cap & ARCH_CAP_RFDS_CLEAR) + return true; + + /* Only consult the blacklist when there is no enumeration: */ + return cpu_matches(cpu_vuln_blacklist, RFDS); +} + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 ia32_cap = x86_read_arch_cap_msr(); @@ -1441,6 +1470,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) boot_cpu_has(X86_FEATURE_AVX)) setup_force_cpu_bug(X86_BUG_GDS); + if (vulnerable_to_rfds(ia32_cap)) + setup_force_cpu_bug(X86_BUG_RFDS); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 47de0f140ba6..0b33e81f9c9b 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -588,6 +588,7 @@ CPU_SHOW_VULN_FALLBACK(mmio_stale_data); CPU_SHOW_VULN_FALLBACK(retbleed); CPU_SHOW_VULN_FALLBACK(spec_rstack_overflow); CPU_SHOW_VULN_FALLBACK(gds); +CPU_SHOW_VULN_FALLBACK(reg_file_data_sampling); static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); @@ -602,6 +603,7 @@ static DEVICE_ATTR(mmio_stale_data, 0444, cpu_show_mmio_stale_data, NULL); static DEVICE_ATTR(retbleed, 0444, cpu_show_retbleed, NULL); static DEVICE_ATTR(spec_rstack_overflow, 0444, cpu_show_spec_rstack_overflow, NULL); static DEVICE_ATTR(gather_data_sampling, 0444, cpu_show_gds, NULL); +static DEVICE_ATTR(reg_file_data_sampling, 0444, cpu_show_reg_file_data_sampling, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -617,6 +619,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_retbleed.attr, &dev_attr_spec_rstack_overflow.attr, &dev_attr_gather_data_sampling.attr, + &dev_attr_reg_file_data_sampling.attr, NULL }; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index dcb89c987164..8654714421a0 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -75,6 +75,8 @@ extern ssize_t cpu_show_spec_rstack_overflow(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_reg_file_data_sampling(struct device *dev, + struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, -- cgit v1.2.3 From 11270e526276ffad4c4237acb393da82a3287487 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 8 Mar 2024 14:59:21 -0700 Subject: base/node / ACPI: Enumerate node access class for 'struct access_coordinate' Both generic node and HMAT handling code have been using magic numbers to indicate access classes for 'struct access_coordinate'. Introduce enums to enumerate the access0 and access1 classes shared by the two subsystems. Update the function parameters and callers as appropriate to utilize the new enum. Access0 is named to ACCESS_COORDINATE_LOCAL in order to indicate that the access class is for 'struct access_coordinate' between a target node and the nearest initiator node. 
Access1 is named to ACCESS_COORDINATE_CPU in order to indicate that the access class is for 'struct access_coordinate' between a target node and the nearest CPU node. Cc: Greg Kroah-Hartman Cc: Rafael J. Wysocki Reviewed-by: Jonathan Cameron Tested-by: Jonathan Cameron Acked-by: Greg Kroah-Hartman Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/20240308220055.2172956-3-dave.jiang@intel.com Signed-off-by: Dan Williams --- drivers/acpi/numa/hmat.c | 26 ++++++++++++++------------ drivers/base/node.c | 6 +++--- include/linux/node.h | 18 +++++++++++++++--- 3 files changed, 32 insertions(+), 18 deletions(-) (limited to 'drivers/base') diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index a26e7793ec4e..e0144cfbf1f3 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -59,9 +59,7 @@ struct target_cache { }; enum { - NODE_ACCESS_CLASS_0 = 0, - NODE_ACCESS_CLASS_1, - NODE_ACCESS_CLASS_GENPORT_SINK, + NODE_ACCESS_CLASS_GENPORT_SINK = ACCESS_COORDINATE_MAX, NODE_ACCESS_CLASS_MAX, }; @@ -374,11 +372,11 @@ static __init void hmat_update_target(unsigned int tgt_pxm, unsigned int init_px if (target && target->processor_pxm == init_pxm) { hmat_update_target_access(target, type, value, - NODE_ACCESS_CLASS_0); + ACCESS_COORDINATE_LOCAL); /* If the node has a CPU, update access 1 */ if (node_state(pxm_to_node(init_pxm), N_CPU)) hmat_update_target_access(target, type, value, - NODE_ACCESS_CLASS_1); + ACCESS_COORDINATE_CPU); } } @@ -709,7 +707,8 @@ static void hmat_update_target_attrs(struct memory_target *target, */ if (target->processor_pxm != PXM_INVAL) { cpu_nid = pxm_to_node(target->processor_pxm); - if (access == 0 || node_state(cpu_nid, N_CPU)) { + if (access == ACCESS_COORDINATE_LOCAL || + node_state(cpu_nid, N_CPU)) { set_bit(target->processor_pxm, p_nodes); return; } @@ -737,7 +736,8 @@ static void hmat_update_target_attrs(struct memory_target *target, list_for_each_entry(initiator, &initiators, node) { u32 value; - if (access == 1 && !initiator->has_cpu) { + if (access == ACCESS_COORDINATE_CPU && + !initiator->has_cpu) { clear_bit(initiator->processor_pxm, p_nodes); continue; } @@ -782,8 +782,10 @@ static void hmat_register_target_initiators(struct memory_target *target) { static DECLARE_BITMAP(p_nodes, MAX_NUMNODES); - __hmat_register_target_initiators(target, p_nodes, 0); - __hmat_register_target_initiators(target, p_nodes, 1); + __hmat_register_target_initiators(target, p_nodes, + ACCESS_COORDINATE_LOCAL); + __hmat_register_target_initiators(target, p_nodes, + ACCESS_COORDINATE_CPU); } static void hmat_register_target_cache(struct memory_target *target) @@ -854,8 +856,8 @@ static void hmat_register_target(struct memory_target *target) if (!target->registered) { hmat_register_target_initiators(target); hmat_register_target_cache(target); - hmat_register_target_perf(target, NODE_ACCESS_CLASS_0); - hmat_register_target_perf(target, NODE_ACCESS_CLASS_1); + hmat_register_target_perf(target, ACCESS_COORDINATE_LOCAL); + hmat_register_target_perf(target, ACCESS_COORDINATE_CPU); target->registered = true; } mutex_unlock(&target_lock); @@ -927,7 +929,7 @@ static int hmat_calculate_adistance(struct notifier_block *self, return NOTIFY_OK; mutex_lock(&target_lock); - hmat_update_target_attrs(target, p_nodes, 1); + hmat_update_target_attrs(target, p_nodes, ACCESS_COORDINATE_CPU); mutex_unlock(&target_lock); perf = &target->coord[1]; diff --git a/drivers/base/node.c b/drivers/base/node.c index 1c05640461dd..a73b0c9a401a 100644 --- a/drivers/base/node.c +++ 
b/drivers/base/node.c @@ -126,7 +126,7 @@ static void node_access_release(struct device *dev) } static struct node_access_nodes *node_init_node_access(struct node *node, - unsigned int access) + enum access_coordinate_class access) { struct node_access_nodes *access_node; struct device *dev; @@ -191,7 +191,7 @@ static struct attribute *access_attrs[] = { * @access: The access class the for the given attributes */ void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord, - unsigned int access) + enum access_coordinate_class access) { struct node_access_nodes *c; struct node *node; @@ -689,7 +689,7 @@ int register_cpu_under_node(unsigned int cpu, unsigned int nid) */ int register_memory_node_under_compute_node(unsigned int mem_nid, unsigned int cpu_nid, - unsigned int access) + enum access_coordinate_class access) { struct node *init_node, *targ_node; struct node_access_nodes *initiator, *target; diff --git a/include/linux/node.h b/include/linux/node.h index 25b66d705ee2..dfc004e4bee7 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -34,6 +34,18 @@ struct access_coordinate { unsigned int write_latency; }; +/* + * ACCESS_COORDINATE_LOCAL correlates to ACCESS CLASS 0 + * - access_coordinate between target node and nearest initiator node + * ACCESS_COORDINATE_CPU correlates to ACCESS CLASS 1 + * - access_coordinate between target node and nearest CPU node + */ +enum access_coordinate_class { + ACCESS_COORDINATE_LOCAL, + ACCESS_COORDINATE_CPU, + ACCESS_COORDINATE_MAX +}; + enum cache_indexing { NODE_CACHE_DIRECT_MAP, NODE_CACHE_INDEXED, @@ -66,7 +78,7 @@ struct node_cache_attrs { #ifdef CONFIG_HMEM_REPORTING void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs); void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord, - unsigned access); + enum access_coordinate_class access); #else static inline void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs) @@ -75,7 +87,7 @@ static inline void node_add_cache(unsigned int nid, static inline void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord, - unsigned access) + enum access_coordinate_class access) { } #endif @@ -137,7 +149,7 @@ extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk); extern int register_memory_node_under_compute_node(unsigned int mem_nid, unsigned int cpu_nid, - unsigned access); + enum access_coordinate_class access); #else static inline void node_dev_init(void) { -- cgit v1.2.3 From debdce20c4f28b7e5aa48512e7abf270a00e9051 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 8 Mar 2024 14:59:31 -0700 Subject: cxl/region: Deal with numa nodes not enumerated by SRAT For the numa nodes that are not created by SRAT, no memory_target is allocated and is not managed by the HMAT_REPORTING code. Therefore hmat_callback() memory hotplug notifier will exit early on those NUMA nodes. The CXL memory hotplug notifier will need to call node_set_perf_attrs() directly in order to setup the access sysfs attributes. In acpi_numa_init(), the last proximity domain (pxm) id created by SRAT is stored. Add a helper function acpi_node_backed_by_real_pxm() in order to check if a NUMA node id is defined by SRAT or created by CFMWS. node_set_perf_attrs() symbol is exported to allow update of perf attribs for a node. The sysfs path of /sys/devices/system/node/nodeX/access0/initiators/* is created by node_set_perf_attrs() for the various attributes where nodeX is matched to the NUMA node of the CXL region. Cc: Rafael J. 
Wysocki Reviewed-by: Alison Schofield Reviewed-by: Jonathan Cameron Tested-by: Jonathan Cameron Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/20240308220055.2172956-13-dave.jiang@intel.com Signed-off-by: Dan Williams --- drivers/acpi/numa/srat.c | 11 +++++++++++ drivers/base/node.c | 1 + drivers/cxl/core/cdat.c | 5 +++++ drivers/cxl/core/core.h | 1 + drivers/cxl/core/region.c | 7 ++++++- include/linux/acpi.h | 9 +++++++++ 6 files changed, 33 insertions(+), 1 deletion(-) (limited to 'drivers/base') diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 0214518fc582..e45e64993c50 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -29,6 +29,8 @@ static int node_to_pxm_map[MAX_NUMNODES] unsigned char acpi_srat_revision __initdata; static int acpi_numa __initdata; +static int last_real_pxm; + void __init disable_srat(void) { acpi_numa = -1; @@ -536,6 +538,7 @@ int __init acpi_numa_init(void) if (node_to_pxm_map[i] > fake_pxm) fake_pxm = node_to_pxm_map[i]; } + last_real_pxm = fake_pxm; fake_pxm++; acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, acpi_parse_cfmws, &fake_pxm); @@ -547,6 +550,14 @@ int __init acpi_numa_init(void) return 0; } +bool acpi_node_backed_by_real_pxm(int nid) +{ + int pxm = node_to_pxm(nid); + + return pxm <= last_real_pxm; +} +EXPORT_SYMBOL_GPL(acpi_node_backed_by_real_pxm); + static int acpi_get_pxm(acpi_handle h) { unsigned long long pxm; diff --git a/drivers/base/node.c b/drivers/base/node.c index a73b0c9a401a..eb72580288e6 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -215,6 +215,7 @@ void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord, } } } +EXPORT_SYMBOL_GPL(node_set_perf_attrs); /** * struct node_cache_info - Internal tracking for memory node caches diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index ee1bc8fa396b..10d4f1d7dad1 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -586,3 +586,8 @@ int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr, { return hmat_update_target_coordinates(nid, &cxlr->coord[access], access); } + +bool cxl_need_node_perf_attrs_update(int nid) +{ + return !acpi_node_backed_by_real_pxm(nid); +} diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index e19800a7ce06..bc5a95665aa0 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -92,5 +92,6 @@ long cxl_pci_get_latency(struct pci_dev *pdev); int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr, enum access_coordinate_class access); +bool cxl_need_node_perf_attrs_update(int nid); #endif /* __CXL_CORE_H__ */ diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 535492ec8529..5c186e0a39b9 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2279,7 +2279,12 @@ static bool cxl_region_update_coordinates(struct cxl_region *cxlr, int nid) for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { if (cxlr->coord[i].read_bandwidth) { - rc = cxl_update_hmat_access_coordinates(nid, cxlr, i); + rc = 0; + if (cxl_need_node_perf_attrs_update(nid)) + node_set_perf_attrs(nid, &cxlr->coord[i], i); + else + rc = cxl_update_hmat_access_coordinates(nid, cxlr, i); + if (rc == 0) cset++; } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index c84c2f34b8ee..2a7c4b90d589 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1559,4 +1559,13 @@ static inline int hmat_update_target_coordinates(int nid, } #endif +#ifdef CONFIG_ACPI_NUMA +bool acpi_node_backed_by_real_pxm(int nid); 
+#else +static inline bool acpi_node_backed_by_real_pxm(int nid) +{ + return false; +} +#endif + #endif /*_LINUX_ACPI_H*/ -- cgit v1.2.3
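Taken together, the last two patches allow a caller outside of the HMAT code to publish access coordinates for a CFMWS-created node. A minimal sketch of such a caller follows, using only the interfaces visible in the diffs above; the function name and the decision to skip SRAT-backed nodes mirror cxl_region_update_coordinates(), and the coordinate values would come from the caller:

	/*
	 * Illustrative sketch: publish access0 and access1 coordinates
	 * for a node that SRAT did not enumerate. SRAT-backed nodes are
	 * already handled through the HMAT_REPORTING code.
	 */
	static void example_publish_coords(int nid, struct access_coordinate *coord)
	{
		if (acpi_node_backed_by_real_pxm(nid))
			return;

		node_set_perf_attrs(nid, coord, ACCESS_COORDINATE_LOCAL);
		node_set_perf_attrs(nid, coord, ACCESS_COORDINATE_CPU);
	}

This works because node_set_perf_attrs() is now exported and typed with enum access_coordinate_class, while acpi_node_backed_by_real_pxm() distinguishes SRAT-defined proximity domains from the fake ones created for CFMWS ranges.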